PyPI - chunkr-ai - Versions diffs - 0.0.50__tar.gz → 0.3.0__tar.gz - Mend

chunkr-ai 0.0.50.tar.gz → 0.3.0.tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

{chunkr_ai-0.0.50/src/chunkr_ai.egg-info → chunkr_ai-0.3.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: chunkr-ai
-Version: 0.0.50
+Version: 0.3.0
 Summary: Python client for Chunkr: open source document intelligence
 Author-email: Ishaan Kapoor <ishaan@lumina.sh>
 License: MIT License
@@ -28,6 +28,7 @@ Project-URL: Homepage, https://chunkr.ai
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: httpx>=0.25.0
+Requires-Dist: matplotlib>=3.10.3
 Requires-Dist: nest-asyncio>=1.6.0
 Requires-Dist: pillow>=10.0.0
 Requires-Dist: pydantic>=2.0.0

{chunkr_ai-0.0.50 → chunkr_ai-0.3.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "chunkr-ai"
-version = "0.0.50"
+version = "0.3.0"
 authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
 description = "Python client for Chunkr: open source document intelligence"
 readme = "README.md"
@@ -12,6 +12,7 @@ license = {"file" = "LICENSE"}
 urls = {Homepage = "https://chunkr.ai"}
 dependencies = [
     "httpx>=0.25.0",
+    "matplotlib>=3.10.3",
     "nest-asyncio>=1.6.0",
     "pillow>=10.0.0",
     "pydantic>=2.0.0",

{chunkr_ai-0.0.50 → chunkr_ai-0.3.0}/src/chunkr_ai/api/configuration.py RENAMED Viewed

@@ -3,27 +3,34 @@ from enum import Enum
 from typing import Any, List, Optional, Union
 from pydantic import field_validator, field_serializer
-class GenerationStrategy(str, Enum):
-    LLM = "LLM"
-    AUTO = "Auto"
 class CroppingStrategy(str, Enum):
     ALL = "All"
     AUTO = "Auto"
-class EmbedSource(str, Enum):
-    HTML = "HTML"
+class SegmentFormat(str, Enum):
+    HTML = "Html"
     MARKDOWN = "Markdown"
-    LLM = "LLM"
+class EmbedSource(str, Enum):
     CONTENT = "Content"
+    HTML = "HTML"  # Deprecated
+    MARKDOWN = "Markdown"  # Deprecated
+    LLM = "LLM"
+class GenerationStrategy(str, Enum):
+    LLM = "LLM"
+    AUTO = "Auto"
 class GenerationConfig(BaseModel):
-    html: Optional[GenerationStrategy] = None
+    format: Optional[SegmentFormat] = None
+    strategy: Optional[GenerationStrategy] = None
     llm: Optional[str] = None
-    markdown: Optional[GenerationStrategy] = None
     crop_image: Optional[CroppingStrategy] = None
-    embed_sources: Optional[List[EmbedSource]] = Field(default_factory=lambda: [EmbedSource.MARKDOWN])
+    embed_sources: Optional[List[EmbedSource]] = None
     extended_context: Optional[bool] = None
+    # Deprecated fields for backwards compatibility
+    html: Optional[GenerationStrategy] = None  # Deprecated: Use format=SegmentFormat.HTML and strategy instead
+    markdown: Optional[GenerationStrategy] = None  # Deprecated: Use format=SegmentFormat.MARKDOWN and strategy instead
 class SegmentProcessing(BaseModel):
     model_config = ConfigDict(populate_by_name=True, alias_generator=str.title)
@@ -244,9 +251,45 @@ class SegmentType(str, Enum):
     TEXT = "Text"
     TITLE = "Title"
+class Alignment(str, Enum):
+    LEFT = "Left"
+    CENTER = "Center"
+    RIGHT = "Right"
+    JUSTIFY = "Justify"
+class VerticalAlignment(str, Enum):
+    TOP = "Top"
+    MIDDLE = "Middle"
+    BOTTOM = "Bottom"
+    BASELINE = "Baseline"
+class CellStyle(BaseModel):
+    bg_color: Optional[str] = None
+    text_color: Optional[str] = None
+    font_face: Optional[str] = None
+    is_bold: Optional[bool] = None
+    align: Optional[Alignment] = None
+    valign: Optional[VerticalAlignment] = None
+class Cell(BaseModel):
+    cell_id: str
+    text: str
+    range: str
+    formula: Optional[str] = None
+    value: Optional[str] = None
+    hyperlink: Optional[str] = None
+    style: Optional[CellStyle] = None
+class Page(BaseModel):
+    image: str
+    page_number: int
+    page_height: float
+    page_width: float
+    ss_sheet_name: Optional[str] = None
 class Segment(BaseModel):
     bbox: BoundingBox
-    content: str
+    content: str = ""
     page_height: float
     llm: Optional[str] = None
     html: Optional[str] = None
@@ -258,6 +301,16 @@ class Segment(BaseModel):
     segment_id: str
     segment_type: SegmentType
     confidence: Optional[float]
+    text: str = ""
+    segment_length: Optional[int] = None
+    # Spreadsheet-specific fields
+    ss_cells: Optional[List[Cell]] = None
+    ss_header_bbox: Optional[BoundingBox] = None
+    ss_header_ocr: Optional[List[OCRResult]] = None
+    ss_header_text: Optional[str] = None
+    ss_header_range: Optional[str] = None
+    ss_range: Optional[str] = None
+    ss_sheet_name: Optional[str] = None
 class Chunk(BaseModel):
     chunk_id: str
@@ -268,6 +321,8 @@ class Chunk(BaseModel):
 class OutputResponse(BaseModel):
     chunks: List[Chunk]
     file_name: Optional[str]
+    mime_type: Optional[str] = None
+    pages: Optional[List[Page]] = None
     page_count: Optional[int]
     pdf_url: Optional[str]

{chunkr_ai-0.0.50 → chunkr_ai-0.3.0}/src/chunkr_ai/models.py RENAMED Viewed

@@ -1,5 +1,8 @@
 from .api.configuration import (
+    Alignment,
     BoundingBox,
+    Cell,
+    CellStyle,
     Chunk,
     ChunkProcessing,
     Configuration,
@@ -14,7 +17,9 @@ from .api.configuration import (
     OCRResult,
     OcrStrategy,
     OutputResponse,
+    Page,
     Segment,
+    SegmentFormat,
     SegmentProcessing,
     SegmentType,
     SegmentationStrategy,
@@ -22,11 +27,15 @@ from .api.configuration import (
     Pipeline,
     Tokenizer,
     TokenizerType,
+    VerticalAlignment,
 )
 from .api.task_response import TaskResponse
 __all__ = [
+    "Alignment",
     "BoundingBox",
+    "Cell",
+    "CellStyle",
     "Chunk",
     "ChunkProcessing",
     "Configuration",
@@ -41,7 +50,9 @@ __all__ = [
     "OCRResult",
     "OcrStrategy",
     "OutputResponse",
+    "Page",
     "Segment",
+    "SegmentFormat",
     "SegmentProcessing",
     "SegmentType",
     "SegmentationStrategy",
@@ -50,4 +61,5 @@ __all__ = [
     "Pipeline",
     "Tokenizer",
     "TokenizerType",
+    "VerticalAlignment",
 ]

{chunkr_ai-0.0.50 → chunkr_ai-0.3.0/src/chunkr_ai.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: chunkr-ai
-Version: 0.0.50
+Version: 0.3.0
 Summary: Python client for Chunkr: open source document intelligence
 Author-email: Ishaan Kapoor <ishaan@lumina.sh>
 License: MIT License
@@ -28,6 +28,7 @@ Project-URL: Homepage, https://chunkr.ai
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: httpx>=0.25.0
+Requires-Dist: matplotlib>=3.10.3
 Requires-Dist: nest-asyncio>=1.6.0
 Requires-Dist: pillow>=10.0.0
 Requires-Dist: pydantic>=2.0.0

{chunkr_ai-0.0.50 → chunkr_ai-0.3.0}/src/chunkr_ai.egg-info/SOURCES.txt RENAMED Viewed

@@ -18,4 +18,6 @@ src/chunkr_ai/api/misc.py
 src/chunkr_ai/api/protocol.py
 src/chunkr_ai/api/task_response.py
 tests/test_chunkr.py
-tests/test_file_handling.py
+tests/test_excel.py
+tests/test_file_handling.py
+tests/test_pages.py

{chunkr_ai-0.0.50 → chunkr_ai-0.3.0}/src/chunkr_ai.egg-info/requires.txt RENAMED Viewed

@@ -1,4 +1,5 @@
 httpx>=0.25.0
+matplotlib>=3.10.3
 nest-asyncio>=1.6.0
 pillow>=10.0.0
 pydantic>=2.0.0

{chunkr_ai-0.0.50 → chunkr_ai-0.3.0}/tests/test_chunkr.py RENAMED Viewed

@@ -2,7 +2,6 @@ import pytest
 from pathlib import Path
 from PIL import Image
 import asyncio
-from typing import Awaitable
 from chunkr_ai import Chunkr
 from chunkr_ai.models import (
@@ -21,6 +20,7 @@ from chunkr_ai.models import (
     Status,
     TaskResponse,
     Tokenizer,
+    SegmentFormat,
 )
 @pytest.fixture
@@ -48,14 +48,21 @@ def client():
     client = Chunkr()
     yield client
+@pytest.fixture(params=[
+    pytest.param(None, id="none_pipeline"),
+    pytest.param(Pipeline.AZURE, id="azure_pipeline"),
+])
+def pipeline_type(request):
+    return request.param
 @pytest.fixture
 def markdown_embed_config():
     return Configuration(
         segment_processing=SegmentProcessing(
             Page=GenerationConfig(
-                html=GenerationStrategy.LLM,
-                markdown=GenerationStrategy.LLM,
-                embed_sources=[EmbedSource.MARKDOWN]
+                format=SegmentFormat.MARKDOWN,
+                strategy=GenerationStrategy.LLM,
+                embed_sources=[EmbedSource.CONTENT]
             )
         ),
     )
@@ -65,9 +72,9 @@ def html_embed_config():
     return Configuration(
         segment_processing=SegmentProcessing(
             Page=GenerationConfig(
-                html=GenerationStrategy.LLM,
-                markdown=GenerationStrategy.LLM,
-                embed_sources=[EmbedSource.HTML]
+                format=SegmentFormat.HTML,
+                strategy=GenerationStrategy.LLM,
+                embed_sources=[EmbedSource.HTML]  # Keep this for backwards compatibility testing
             )
         ),
     )
@@ -77,10 +84,10 @@ def multiple_embed_config():
     return Configuration(
         segment_processing=SegmentProcessing(
             Page=GenerationConfig(
-                html=GenerationStrategy.LLM,
-                markdown=GenerationStrategy.LLM,
+                format=SegmentFormat.MARKDOWN,
+                strategy=GenerationStrategy.LLM,
                 llm="Generate a summary of this content",
-                embed_sources=[EmbedSource.MARKDOWN, EmbedSource.LLM, EmbedSource.HTML]
+                embed_sources=[EmbedSource.CONTENT, EmbedSource.LLM, EmbedSource.HTML]
             )
         ),
     )
@@ -169,13 +176,15 @@ def model_fallback_config():
 def extended_context_config():
     return Configuration(
         segment_processing=SegmentProcessing(
-            picture=GenerationConfig(
+            Picture=GenerationConfig(
                 extended_context=True,
-                html=GenerationStrategy.LLM,
+                format=SegmentFormat.HTML,
+                strategy=GenerationStrategy.LLM,
             ),
-            table=GenerationConfig(
+            Table=GenerationConfig(
                 extended_context=True,
-                html=GenerationStrategy.LLM,
+                format=SegmentFormat.HTML,
+                strategy=GenerationStrategy.LLM,
             )
         ),
     )
@@ -471,12 +480,12 @@ async def test_tokenizer_custom_string(client, sample_path, custom_tokenizer_con
     assert response.status == "Succeeded"
     assert response.output is not None
-@pytest.mark.asyncio
-async def test_embed_sources_with_different_tokenizer(client, sample_path, xlm_roberta_with_html_content_config):
-    response = await client.upload(sample_path, xlm_roberta_with_html_content_config)
-    assert response.task_id is not None
-    assert response.status == "Succeeded"
-    assert response.output is not None
+# @pytest.mark.asyncio
+# async def test_embed_sources_with_different_tokenizer(client, sample_path, xlm_roberta_with_html_content_config):
+#     response = await client.upload(sample_path, xlm_roberta_with_html_content_config)
+#     assert response.task_id is not None
+#     assert response.status == "Succeeded"
+#     assert response.output is not None
 @pytest.mark.asyncio
 async def test_error_handling_continue(client, sample_path):
@@ -582,4 +591,129 @@ async def test_extended_context(client, sample_path, extended_context_config):
     except Exception as e:
         print(f"Error during extended context test: {e}")
-        raise # Re-raise the exception to fail the test explicitly
+        raise # Re-raise the exception to fail the test explicitly
+# Tests for new fields added in recent updates
+class TestNewFields:
+    """Test the newly added fields in the models"""
+    @pytest.mark.asyncio
+    async def test_output_has_mime_type_field(self, client, sample_path):
+        """Test that OutputResponse includes mime_type field"""
+        response = await client.upload(sample_path)
+        assert response.task_id is not None
+        assert response.status == "Succeeded"
+        assert response.output is not None
+        # mime_type should be accessible (might be None for some file types)
+        assert hasattr(response.output, 'mime_type')
+        # For PDF files, mime_type should be present
+        if response.output.mime_type:
+            assert "pdf" in response.output.mime_type.lower()
+    @pytest.mark.asyncio
+    async def test_output_has_pages_field(self, client, sample_path):
+        """Test that OutputResponse includes pages field"""
+        response = await client.upload(sample_path)
+        assert response.task_id is not None
+        assert response.status == "Succeeded"
+        assert response.output is not None
+        # pages should be accessible (might be None for some configurations)
+        assert hasattr(response.output, 'pages')
+        # If pages exist, validate structure
+        if response.output.pages:
+            assert len(response.output.pages) > 0
+            page = response.output.pages[0]
+            assert hasattr(page, 'image')
+            assert hasattr(page, 'page_number')
+            assert hasattr(page, 'page_height')
+            assert hasattr(page, 'page_width')
+            assert hasattr(page, 'ss_sheet_name')
+    @pytest.mark.asyncio
+    async def test_segments_have_spreadsheet_fields(self, client, sample_path):
+        """Test that Segment objects include new spreadsheet fields"""
+        response = await client.upload(sample_path)
+        assert response.task_id is not None
+        assert response.status == "Succeeded"
+        assert response.output is not None
+        assert len(response.output.chunks) > 0
+        segment = response.output.chunks[0].segments[0]
+        # All new spreadsheet fields should be accessible
+        assert hasattr(segment, 'segment_length')
+        assert hasattr(segment, 'ss_cells')
+        assert hasattr(segment, 'ss_header_bbox')
+        assert hasattr(segment, 'ss_header_ocr')
+        assert hasattr(segment, 'ss_header_text')
+        assert hasattr(segment, 'ss_header_range')
+        assert hasattr(segment, 'ss_range')
+        assert hasattr(segment, 'ss_sheet_name')
+        # For PDF files, spreadsheet fields should be None
+        assert segment.ss_cells is None
+        assert segment.ss_range is None
+        assert segment.ss_sheet_name is None
+    @pytest.mark.asyncio
+    async def test_segment_length_field(self, client, sample_path):
+        """Test that segments can have length calculations"""
+        response = await client.upload(sample_path)
+        assert response.task_id is not None
+        assert response.status == "Succeeded"
+        assert response.output is not None
+        # Check if any segments have length calculations
+        segments_with_length = []
+        for chunk in response.output.chunks:
+            for segment in chunk.segments:
+                if segment.segment_length is not None:
+                    segments_with_length.append(segment)
+        # segment_length might be None depending on configuration
+        # but if present, should be positive
+        for segment in segments_with_length:
+            assert segment.segment_length > 0
+    @pytest.mark.asyncio
+    async def test_backwards_compatibility_preserved(self, client, sample_path):
+        """Test that all existing fields still work after adding new ones"""
+        response = await client.upload(sample_path)
+        assert response.task_id is not None
+        assert response.status == "Succeeded"
+        assert response.output is not None
+        # All existing fields should still work
+        assert response.output.chunks is not None
+        assert response.output.file_name is not None
+        assert response.output.page_count is not None
+        assert response.output.pdf_url is not None
+        # Chunk structure should be unchanged
+        chunk = response.output.chunks[0]
+        assert chunk.chunk_id is not None
+        assert chunk.chunk_length is not None
+        assert chunk.segments is not None
+        assert chunk.embed is not None or chunk.embed is None  # embed can be None
+        # Segment structure should include all original fields
+        segment = chunk.segments[0]
+        assert segment.bbox is not None
+        assert segment.content is not None or segment.content == ""
+        assert segment.page_height is not None
+        assert segment.llm is not None or segment.llm is None
+        assert segment.html is not None or segment.html == ""
+        assert segment.image is not None or segment.image is None
+        assert segment.markdown is not None or segment.markdown == ""
+        assert segment.ocr is not None or segment.ocr == []
+        assert segment.page_number is not None
+        assert segment.page_width is not None
+        assert segment.segment_id is not None
+        assert segment.segment_type is not None
+        assert segment.confidence is not None or segment.confidence is None
+        assert segment.text is not None or segment.text == ""

chunkr_ai-0.3.0/tests/test_excel.py ADDED Viewed

@@ -0,0 +1,417 @@
+import pytest
+import json
+from pathlib import Path
+from chunkr_ai import Chunkr
+from chunkr_ai.models import (
+    Configuration,
+    ChunkProcessing,
+    SegmentProcessing,
+    GenerationConfig,
+    SegmentFormat,
+    GenerationStrategy,
+    EmbedSource,
+    Tokenizer,
+    OcrStrategy,
+    SegmentationStrategy,
+    Cell,
+    CellStyle,
+    Alignment,
+    VerticalAlignment,
+    Page,
+    Segment,
+    SegmentType,
+)
+@pytest.fixture
+def excel_sample_path():
+    """Path to the Excel test file"""
+    return Path("tests/files/excel/test.xlsx")
+@pytest.fixture
+def excel_expected_output():
+    """Expected output for Excel test file"""
+    with open("tests/files/excel/test.json", "r") as f:
+        return json.load(f)
+@pytest.fixture
+def client():
+    """Chunkr client instance"""
+    client = Chunkr()
+    yield client
+@pytest.fixture
+def excel_config():
+    """Configuration optimized for Excel processing"""
+    return Configuration(
+        high_resolution=True,
+        ocr_strategy=OcrStrategy.ALL,
+        segmentation_strategy=SegmentationStrategy.LAYOUT_ANALYSIS,
+        chunk_processing=ChunkProcessing(
+            target_length=512,
+            tokenizer=Tokenizer.WORD,
+        ),
+        segment_processing=SegmentProcessing(
+            Table=GenerationConfig(
+                format=SegmentFormat.MARKDOWN,
+                strategy=GenerationStrategy.AUTO,
+                embed_sources=[EmbedSource.MARKDOWN],
+            ),
+            Text=GenerationConfig(
+                format=SegmentFormat.MARKDOWN,
+                strategy=GenerationStrategy.AUTO,
+                embed_sources=[EmbedSource.MARKDOWN],
+            ),
+        ),
+    )
+class TestExcelBasicFunctionality:
+    """Test basic Excel file processing"""
+    @pytest.mark.asyncio
+    async def test_excel_upload_and_process(self, client, excel_sample_path, excel_config):
+        """Test that Excel file can be uploaded and processed successfully"""
+        response = await client.upload(excel_sample_path, excel_config)
+        assert response.task_id is not None
+        assert response.status == "Succeeded"
+        assert response.output is not None
+        assert response.output.chunks is not None
+        assert len(response.output.chunks) > 0
+    @pytest.mark.asyncio
+    async def test_excel_mime_type(self, client, excel_sample_path, excel_config):
+        """Test that Excel files have correct MIME type"""
+        response = await client.upload(excel_sample_path, excel_config)
+        assert response.output.mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+    @pytest.mark.asyncio
+    async def test_excel_pages_exist(self, client, excel_sample_path, excel_config):
+        """Test that Excel processing generates pages information"""
+        response = await client.upload(excel_sample_path, excel_config)
+        assert response.output.pages is not None
+        assert len(response.output.pages) > 0
+        assert response.output.page_count is not None
+        assert response.output.page_count > 0
+    @pytest.mark.asyncio
+    async def test_excel_chunks_have_segments(self, client, excel_sample_path, excel_config):
+        """Test that Excel chunks contain segments with data"""
+        response = await client.upload(excel_sample_path, excel_config)
+        assert len(response.output.chunks) > 0
+        chunk = response.output.chunks[0]
+        assert len(chunk.segments) > 0
+        assert chunk.chunk_length > 0
+class TestExcelSpreadsheetFields:
+    """Test Excel-specific spreadsheet fields"""
+    @pytest.mark.asyncio
+    async def test_segments_have_spreadsheet_fields(self, client, excel_sample_path, excel_config):
+        """Test that segments contain spreadsheet-specific fields"""
+        response = await client.upload(excel_sample_path, excel_config)
+        # Find a segment with spreadsheet data
+        spreadsheet_segment = None
+        for chunk in response.output.chunks:
+            for segment in chunk.segments:
+                if segment.ss_cells and len(segment.ss_cells) > 0:
+                    spreadsheet_segment = segment
+                    break
+            if spreadsheet_segment:
+                break
+        assert spreadsheet_segment is not None, "No segment with spreadsheet data found"
+        # Test spreadsheet-specific fields
+        assert spreadsheet_segment.ss_cells is not None
+        assert len(spreadsheet_segment.ss_cells) > 0
+        assert spreadsheet_segment.ss_sheet_name is not None
+        assert spreadsheet_segment.ss_range is not None
+    @pytest.mark.asyncio
+    async def test_cells_have_required_fields(self, client, excel_sample_path, excel_config):
+        """Test that cells contain all required fields"""
+        response = await client.upload(excel_sample_path, excel_config)
+        # Find a segment with cells
+        test_cell = None
+        for chunk in response.output.chunks:
+            for segment in chunk.segments:
+                if segment.ss_cells and len(segment.ss_cells) > 0:
+                    test_cell = segment.ss_cells[0]
+                    break
+            if test_cell:
+                break
+        assert test_cell is not None, "No cell found in any segment"
+        # Test required cell fields
+        assert test_cell.cell_id is not None
+        assert test_cell.text is not None
+        assert test_cell.range is not None
+        # Optional fields should be accessible
+        assert hasattr(test_cell, 'formula')
+        assert hasattr(test_cell, 'value')
+        assert hasattr(test_cell, 'hyperlink')
+        assert hasattr(test_cell, 'style')
+    @pytest.mark.asyncio
+    async def test_cell_styling_fields(self, client, excel_sample_path, excel_config):
+        """Test that cells with styling contain CellStyle information"""
+        response = await client.upload(excel_sample_path, excel_config)
+        # Find a cell with styling
+        styled_cell = None
+        for chunk in response.output.chunks:
+            for segment in chunk.segments:
+                if segment.ss_cells:
+                    for cell in segment.ss_cells:
+                        if cell.style is not None:
+                            styled_cell = cell
+                            break
+                if styled_cell:
+                    break
+            if styled_cell:
+                break
+        assert styled_cell is not None, "No styled cell found"
+        assert styled_cell.style is not None
+        # Test CellStyle fields
+        style = styled_cell.style
+        assert hasattr(style, 'bg_color')
+        assert hasattr(style, 'text_color')
+        assert hasattr(style, 'font_face')
+        assert hasattr(style, 'is_bold')
+        assert hasattr(style, 'align')
+        assert hasattr(style, 'valign')
+    @pytest.mark.asyncio
+    async def test_excel_sheet_names(self, client, excel_sample_path, excel_config):
+        """Test that sheet names are properly captured"""
+        response = await client.upload(excel_sample_path, excel_config)
+        # Check segments for sheet names
+        sheet_names = set()
+        for chunk in response.output.chunks:
+            for segment in chunk.segments:
+                if segment.ss_sheet_name:
+                    sheet_names.add(segment.ss_sheet_name)
+        assert len(sheet_names) > 0, "No sheet names found in segments"
+        # Check pages for sheet names
+        page_sheet_names = set()
+        if response.output.pages:
+            for page in response.output.pages:
+                if page.ss_sheet_name:
+                    page_sheet_names.add(page.ss_sheet_name)
+        # At least one source should have sheet names
+        assert len(sheet_names) > 0 or len(page_sheet_names) > 0
+class TestExcelPages:
+    """Test Excel pages functionality"""
+    @pytest.mark.asyncio
+    async def test_pages_structure(self, client, excel_sample_path, excel_config):
+        """Test that pages have correct structure and fields"""
+        response = await client.upload(excel_sample_path, excel_config)
+        assert response.output.pages is not None
+        assert len(response.output.pages) > 0
+        page = response.output.pages[0]
+        assert page.image is not None
+        assert page.page_number is not None
+        assert page.page_height is not None
+        assert page.page_width is not None
+        # ss_sheet_name is optional for pages
+        assert hasattr(page, 'ss_sheet_name')
+    @pytest.mark.asyncio
+    async def test_page_count_consistency(self, client, excel_sample_path, excel_config):
+        """Test that page_count matches the actual number of pages"""
+        response = await client.upload(excel_sample_path, excel_config)
+        assert response.output.page_count is not None
+        if response.output.pages:
+            assert response.output.page_count == len(response.output.pages)
+    @pytest.mark.asyncio
+    async def test_page_numbers_sequential(self, client, excel_sample_path, excel_config):
+        """Test that page numbers are sequential and start from 1"""
+        response = await client.upload(excel_sample_path, excel_config)
+        if response.output.pages and len(response.output.pages) > 1:
+            page_numbers = [page.page_number for page in response.output.pages]
+            page_numbers.sort()
+            # Should start from 1 and be sequential
+            for i, page_num in enumerate(page_numbers):
+                assert page_num == i + 1, f"Page numbers not sequential: {page_numbers}"
+class TestExcelSegmentTypes:
+    """Test Excel segment types and their properties"""
+    @pytest.mark.asyncio
+    async def test_segment_types_present(self, client, excel_sample_path, excel_config):
+        """Test that appropriate segment types are detected in Excel files"""
+        response = await client.upload(excel_sample_path, excel_config)
+        segment_types = set()
+        for chunk in response.output.chunks:
+            for segment in chunk.segments:
+                segment_types.add(segment.segment_type)
+        # Excel files should contain at least Table or Text segments
+        expected_types = {SegmentType.TABLE, SegmentType.TEXT}
+        assert len(segment_types.intersection(expected_types)) > 0, f"No expected segment types found. Got: {segment_types}"
+    @pytest.mark.asyncio
+    async def test_table_segments_have_cells(self, client, excel_sample_path, excel_config):
+        """Test that TABLE segments contain cell data"""
+        response = await client.upload(excel_sample_path, excel_config)
+        table_segments = []
+        for chunk in response.output.chunks:
+            for segment in chunk.segments:
+                if segment.segment_type == SegmentType.TABLE:
+                    table_segments.append(segment)
+        if table_segments:  # If we have table segments, they should have cells
+            found_cells = False
+            for segment in table_segments:
+                if segment.ss_cells and len(segment.ss_cells) > 0:
+                    found_cells = True
+                    break
+            assert found_cells, "TABLE segments should contain cell data"
+class TestExcelEmbedding:
+    """Test Excel embedding functionality"""
+    @pytest.mark.asyncio
+    async def test_chunks_have_embed_content(self, client, excel_sample_path, excel_config):
+        """Test that chunks generate embed content for Excel data"""
+        response = await client.upload(excel_sample_path, excel_config)
+        # At least some chunks should have embed content
+        chunks_with_embed = [chunk for chunk in response.output.chunks if chunk.embed]
+        assert len(chunks_with_embed) > 0, "No chunks with embed content found"
+        # Embed content should not be empty
+        for chunk in chunks_with_embed:
+            assert len(chunk.embed.strip()) > 0, "Empty embed content found"
+    @pytest.mark.asyncio
+    async def test_segment_length_calculation(self, client, excel_sample_path, excel_config):
+        """Test that segments have length calculations"""
+        response = await client.upload(excel_sample_path, excel_config)
+        segments_with_length = []
+        for chunk in response.output.chunks:
+            for segment in chunk.segments:
+                if segment.segment_length is not None:
+                    segments_with_length.append(segment)
+        # At least some segments should have length calculations
+        assert len(segments_with_length) > 0, "No segments with length calculations found"
+        # Lengths should be positive
+        for segment in segments_with_length:
+            assert segment.segment_length > 0, f"Invalid segment length: {segment.segment_length}"
+class TestExcelEdgeCases:
+    """Test edge cases and error handling for Excel processing"""
+    @pytest.mark.asyncio
+    async def test_empty_cells_handling(self, client, excel_sample_path, excel_config):
+        """Test that empty cells are handled properly"""
+        response = await client.upload(excel_sample_path, excel_config)
+        # Look for cells that might be empty
+        all_cells = []
+        for chunk in response.output.chunks:
+            for segment in chunk.segments:
+                if segment.ss_cells:
+                    all_cells.extend(segment.ss_cells)
+        assert len(all_cells) > 0, "No cells found to test"
+        # All cells should have a text field, even if empty
+        for cell in all_cells:
+            assert hasattr(cell, 'text'), "Cell missing text field"
+            assert cell.text is not None, "Cell text is None"
+    @pytest.mark.asyncio
+    async def test_range_format_validity(self, client, excel_sample_path, excel_config):
+        """Test that Excel ranges follow expected format"""
+        response = await client.upload(excel_sample_path, excel_config)
+        ranges = []
+        for chunk in response.output.chunks:
+            for segment in chunk.segments:
+                if segment.ss_range:
+                    ranges.append(segment.ss_range)
+                if segment.ss_cells:
+                    for cell in segment.ss_cells:
+                        ranges.append(cell.range)
+        assert len(ranges) > 0, "No ranges found to test"
+        # Basic range format validation (e.g., "A1", "A1:B2")
+        import re
+        range_pattern = re.compile(r'^[A-Z]+\d+(:[A-Z]+\d+)?$')
+        valid_ranges = [r for r in ranges if range_pattern.match(r)]
+        # Most ranges should follow the expected format
+        assert len(valid_ranges) > 0, f"No valid ranges found. Ranges: {ranges[:10]}..."
+# Integration test using the expected output fixture
+class TestExcelIntegration:
+    """Integration tests comparing against expected output"""
+    @pytest.mark.asyncio
+    async def test_compare_with_expected_structure(self, client, excel_sample_path, excel_config, excel_expected_output):
+        """Test that the output structure matches expected format"""
+        response = await client.upload(excel_sample_path, excel_config)
+        expected = excel_expected_output["output"]
+        actual = response.output
+        # Compare high-level structure
+        assert actual.mime_type == expected["mime_type"]
+        assert actual.page_count == expected["page_count"]
+        assert len(actual.chunks) > 0
+        assert len(actual.pages) > 0
+        # Verify that we have similar data structure
+        expected_has_cells = any(
+            segment.get("ss_cells")
+            for chunk in expected["chunks"]
+            for segment in chunk["segments"]
+        )
+        actual_has_cells = any(
+            segment.ss_cells
+            for chunk in actual.chunks
+            for segment in chunk.segments
+            if segment.ss_cells
+        )
+        if expected_has_cells:
+            assert actual_has_cells, "Expected cells in output but none found"

chunkr_ai-0.3.0/tests/test_pages.py ADDED Viewed

@@ -0,0 +1,261 @@
+import pytest
+from pathlib import Path
+from chunkr_ai import Chunkr
+from chunkr_ai.models import (
+    Configuration,
+    ChunkProcessing,
+    SegmentProcessing,
+    GenerationConfig,
+    SegmentFormat,
+    GenerationStrategy,
+    EmbedSource,
+    Tokenizer,
+    OcrStrategy,
+    SegmentationStrategy,
+    Page,
+)
+@pytest.fixture
+def client():
+    """Chunkr client instance"""
+    client = Chunkr()
+    yield client
+@pytest.fixture
+def sample_pdf_path():
+    """Path to the PDF test file"""
+    return Path("tests/files/test.pdf")
+@pytest.fixture
+def excel_sample_path():
+    """Path to the Excel test file"""
+    return Path("tests/files/excel/test.xlsx")
+@pytest.fixture
+def basic_config():
+    """Basic configuration for testing pages"""
+    return Configuration(
+        high_resolution=True,
+        ocr_strategy=OcrStrategy.ALL,
+        segmentation_strategy=SegmentationStrategy.LAYOUT_ANALYSIS,
+    )
+class TestPagesBasicFunctionality:
+    """Test basic pages functionality across different file types"""
+    @pytest.mark.asyncio
+    async def test_pdf_generates_pages(self, client, sample_pdf_path, basic_config):
+        """Test that PDF files generate pages information"""
+        response = await client.upload(sample_pdf_path, basic_config)
+        assert response.task_id is not None
+        assert response.status == "Succeeded"
+        assert response.output is not None
+        # Test pages structure
+        if response.output.pages:  # Pages might be optional for some file types
+            assert len(response.output.pages) > 0
+            page = response.output.pages[0]
+            assert page.image is not None
+            assert page.page_number is not None
+            assert page.page_height is not None
+            assert page.page_width is not None
+    @pytest.mark.asyncio
+    async def test_excel_generates_pages_with_sheet_info(self, client, excel_sample_path, basic_config):
+        """Test that Excel files generate pages with sheet information"""
+        response = await client.upload(excel_sample_path, basic_config)
+        assert response.task_id is not None
+        assert response.status == "Succeeded"
+        assert response.output is not None
+        # Excel should definitely have pages
+        assert response.output.pages is not None
+        assert len(response.output.pages) > 0
+        page = response.output.pages[0]
+        assert page.image is not None
+        assert page.page_number is not None
+        assert page.page_height is not None
+        assert page.page_width is not None
+        # Excel pages should have sheet names
+        assert page.ss_sheet_name is not None
+    @pytest.mark.asyncio
+    async def test_page_count_consistency(self, client, sample_pdf_path, basic_config):
+        """Test that page_count matches the actual number of pages"""
+        response = await client.upload(sample_pdf_path, basic_config)
+        assert response.output.page_count is not None
+        if response.output.pages:
+            assert response.output.page_count == len(response.output.pages)
+        else:
+            # If no pages array, page_count should still be meaningful
+            assert response.output.page_count > 0
+class TestPageStructure:
+    """Test the Page model structure and validation"""
+    @pytest.mark.asyncio
+    async def test_page_required_fields(self, client, sample_pdf_path, basic_config):
+        """Test that Page objects have all required fields"""
+        response = await client.upload(sample_pdf_path, basic_config)
+        if response.output.pages and len(response.output.pages) > 0:
+            page = response.output.pages[0]
+            # Required fields
+            assert page.image is not None
+            assert isinstance(page.page_number, int)
+            assert isinstance(page.page_height, (int, float))
+            assert isinstance(page.page_width, (int, float))
+            # Optional fields should be accessible
+            assert hasattr(page, 'ss_sheet_name')
+    @pytest.mark.asyncio
+    async def test_page_numbers_start_from_one(self, client, sample_pdf_path, basic_config):
+        """Test that page numbers start from 1 and are sequential"""
+        response = await client.upload(sample_pdf_path, basic_config)
+        if response.output.pages and len(response.output.pages) > 0:
+            page_numbers = [page.page_number for page in response.output.pages]
+            page_numbers.sort()
+            # Should start from 1
+            assert page_numbers[0] == 1, f"Page numbers should start from 1, got: {page_numbers[0]}"
+            # Should be sequential if multiple pages
+            if len(page_numbers) > 1:
+                for i in range(1, len(page_numbers)):
+                    assert page_numbers[i] == page_numbers[i-1] + 1, f"Page numbers not sequential: {page_numbers}"
+    @pytest.mark.asyncio
+    async def test_page_dimensions_positive(self, client, sample_pdf_path, basic_config):
+        """Test that page dimensions are positive values"""
+        response = await client.upload(sample_pdf_path, basic_config)
+        if response.output.pages:
+            for page in response.output.pages:
+                assert page.page_height > 0, f"Invalid page height: {page.page_height}"
+                assert page.page_width > 0, f"Invalid page width: {page.page_width}"
+    @pytest.mark.asyncio
+    async def test_page_images_are_urls(self, client, sample_pdf_path, basic_config):
+        """Test that page images are valid URLs"""
+        response = await client.upload(sample_pdf_path, basic_config)
+        if response.output.pages:
+            for page in response.output.pages:
+                assert page.image.startswith(('http://', 'https://')), f"Invalid page image URL: {page.image}"
+class TestMimeTypeHandling:
+    """Test MIME type handling for different file types"""
+    @pytest.mark.asyncio
+    async def test_pdf_mime_type(self, client, sample_pdf_path, basic_config):
+        """Test that PDF files have correct MIME type"""
+        response = await client.upload(sample_pdf_path, basic_config)
+        assert response.output.mime_type is not None
+        # Should be PDF MIME type
+        assert response.output.mime_type in [
+            "application/pdf",
+            "application/x-pdf"
+        ], f"Unexpected PDF MIME type: {response.output.mime_type}"
+    @pytest.mark.asyncio
+    async def test_excel_mime_type(self, client, excel_sample_path, basic_config):
+        """Test that Excel files have correct MIME type"""
+        response = await client.upload(excel_sample_path, basic_config)
+        assert response.output.mime_type is not None
+        # Should be Excel MIME type
+        expected_excel_types = [
+            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+            "application/vnd.ms-excel"
+        ]
+        assert response.output.mime_type in expected_excel_types, f"Unexpected Excel MIME type: {response.output.mime_type}"
+class TestBackwardsCompatibility:
+    """Test that new fields don't break existing functionality"""
+    @pytest.mark.asyncio
+    async def test_existing_fields_still_work(self, client, sample_pdf_path, basic_config):
+        """Test that all existing fields still work with new page functionality"""
+        response = await client.upload(sample_pdf_path, basic_config)
+        # Test that all traditional fields still work
+        assert response.task_id is not None
+        assert response.status == "Succeeded"
+        assert response.output is not None
+        assert response.output.chunks is not None
+        assert len(response.output.chunks) > 0
+        assert response.output.file_name is not None
+        assert response.output.page_count is not None
+        assert response.output.pdf_url is not None
+        # Test chunk structure
+        chunk = response.output.chunks[0]
+        assert chunk.chunk_id is not None
+        assert chunk.chunk_length is not None
+        assert chunk.segments is not None
+        assert len(chunk.segments) > 0
+        # Test segment structure
+        segment = chunk.segments[0]
+        assert segment.segment_id is not None
+        assert segment.segment_type is not None
+        assert segment.bbox is not None
+    @pytest.mark.asyncio
+    async def test_optional_new_fields(self, client, sample_pdf_path, basic_config):
+        """Test that new optional fields are properly handled"""
+        response = await client.upload(sample_pdf_path, basic_config)
+        # New fields should be accessible but might be None
+        assert hasattr(response.output, 'mime_type')
+        assert hasattr(response.output, 'pages')
+        # For segments, spreadsheet fields should be accessible but None for PDFs
+        for chunk in response.output.chunks:
+            for segment in chunk.segments:
+                assert hasattr(segment, 'ss_cells')
+                assert hasattr(segment, 'ss_range')
+                assert hasattr(segment, 'ss_sheet_name')
+                assert hasattr(segment, 'segment_length')
+                # For non-Excel files, these should be None
+                if response.output.mime_type != "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
+                    assert segment.ss_cells is None
+                    assert segment.ss_range is None
+                    assert segment.ss_sheet_name is None
+class TestErrorHandling:
+    """Test error handling for pages functionality"""
+    @pytest.mark.asyncio
+    async def test_missing_pages_handled_gracefully(self, client, sample_pdf_path, basic_config):
+        """Test that missing pages are handled gracefully"""
+        response = await client.upload(sample_pdf_path, basic_config)
+        # Even if pages is None, the response should be valid
+        if response.output.pages is None:
+            # page_count should still be available
+            assert response.output.page_count is not None
+            assert response.output.page_count > 0
+        else:
+            # If pages exist, they should be valid
+            assert len(response.output.pages) > 0
+            assert response.output.page_count == len(response.output.pages)