chunkr-ai 0.1.0__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25) hide show
  1. {chunkr_ai-0.1.0/src/chunkr_ai.egg-info → chunkr_ai-0.3.1}/PKG-INFO +2 -1
  2. {chunkr_ai-0.1.0 → chunkr_ai-0.3.1}/pyproject.toml +2 -1
  3. {chunkr_ai-0.1.0 → chunkr_ai-0.3.1}/src/chunkr_ai/api/configuration.py +47 -0
  4. {chunkr_ai-0.1.0 → chunkr_ai-0.3.1}/src/chunkr_ai/models.py +10 -0
  5. {chunkr_ai-0.1.0 → chunkr_ai-0.3.1/src/chunkr_ai.egg-info}/PKG-INFO +2 -1
  6. {chunkr_ai-0.1.0 → chunkr_ai-0.3.1}/src/chunkr_ai.egg-info/SOURCES.txt +3 -1
  7. {chunkr_ai-0.1.0 → chunkr_ai-0.3.1}/src/chunkr_ai.egg-info/requires.txt +1 -0
  8. {chunkr_ai-0.1.0 → chunkr_ai-0.3.1}/tests/test_chunkr.py +133 -1
  9. chunkr_ai-0.3.1/tests/test_excel.py +417 -0
  10. chunkr_ai-0.3.1/tests/test_pages.py +261 -0
  11. {chunkr_ai-0.1.0 → chunkr_ai-0.3.1}/LICENSE +0 -0
  12. {chunkr_ai-0.1.0 → chunkr_ai-0.3.1}/README.md +0 -0
  13. {chunkr_ai-0.1.0 → chunkr_ai-0.3.1}/setup.cfg +0 -0
  14. {chunkr_ai-0.1.0 → chunkr_ai-0.3.1}/src/chunkr_ai/__init__.py +0 -0
  15. {chunkr_ai-0.1.0 → chunkr_ai-0.3.1}/src/chunkr_ai/api/__init__.py +0 -0
  16. {chunkr_ai-0.1.0 → chunkr_ai-0.3.1}/src/chunkr_ai/api/auth.py +0 -0
  17. {chunkr_ai-0.1.0 → chunkr_ai-0.3.1}/src/chunkr_ai/api/chunkr.py +0 -0
  18. {chunkr_ai-0.1.0 → chunkr_ai-0.3.1}/src/chunkr_ai/api/chunkr_base.py +0 -0
  19. {chunkr_ai-0.1.0 → chunkr_ai-0.3.1}/src/chunkr_ai/api/decorators.py +0 -0
  20. {chunkr_ai-0.1.0 → chunkr_ai-0.3.1}/src/chunkr_ai/api/misc.py +0 -0
  21. {chunkr_ai-0.1.0 → chunkr_ai-0.3.1}/src/chunkr_ai/api/protocol.py +0 -0
  22. {chunkr_ai-0.1.0 → chunkr_ai-0.3.1}/src/chunkr_ai/api/task_response.py +0 -0
  23. {chunkr_ai-0.1.0 → chunkr_ai-0.3.1}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
  24. {chunkr_ai-0.1.0 → chunkr_ai-0.3.1}/src/chunkr_ai.egg-info/top_level.txt +0 -0
  25. {chunkr_ai-0.1.0 → chunkr_ai-0.3.1}/tests/test_file_handling.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chunkr-ai
3
- Version: 0.1.0
3
+ Version: 0.3.1
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  License: MIT License
@@ -28,6 +28,7 @@ Project-URL: Homepage, https://chunkr.ai
28
28
  Description-Content-Type: text/markdown
29
29
  License-File: LICENSE
30
30
  Requires-Dist: httpx>=0.25.0
31
+ Requires-Dist: matplotlib>=3.10.3
31
32
  Requires-Dist: nest-asyncio>=1.6.0
32
33
  Requires-Dist: pillow>=10.0.0
33
34
  Requires-Dist: pydantic>=2.0.0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "chunkr-ai"
7
- version = "0.1.0"
7
+ version = "0.3.1"
8
8
  authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
9
9
  description = "Python client for Chunkr: open source document intelligence"
10
10
  readme = "README.md"
@@ -12,6 +12,7 @@ license = {"file" = "LICENSE"}
12
12
  urls = {Homepage = "https://chunkr.ai"}
13
13
  dependencies = [
14
14
  "httpx>=0.25.0",
15
+ "matplotlib>=3.10.3",
15
16
  "nest-asyncio>=1.6.0",
16
17
  "pillow>=10.0.0",
17
18
  "pydantic>=2.0.0",
@@ -251,6 +251,42 @@ class SegmentType(str, Enum):
251
251
  TEXT = "Text"
252
252
  TITLE = "Title"
253
253
 
254
+ class Alignment(str, Enum):
255
+ LEFT = "Left"
256
+ CENTER = "Center"
257
+ RIGHT = "Right"
258
+ JUSTIFY = "Justify"
259
+
260
+ class VerticalAlignment(str, Enum):
261
+ TOP = "Top"
262
+ MIDDLE = "Middle"
263
+ BOTTOM = "Bottom"
264
+ BASELINE = "Baseline"
265
+
266
+ class CellStyle(BaseModel):
267
+ bg_color: Optional[str] = None
268
+ text_color: Optional[str] = None
269
+ font_face: Optional[str] = None
270
+ is_bold: Optional[bool] = None
271
+ align: Optional[Alignment] = None
272
+ valign: Optional[VerticalAlignment] = None
273
+
274
+ class Cell(BaseModel):
275
+ cell_id: str
276
+ text: str
277
+ range: str
278
+ formula: Optional[str] = None
279
+ value: Optional[str] = None
280
+ hyperlink: Optional[str] = None
281
+ style: Optional[CellStyle] = None
282
+
283
+ class Page(BaseModel):
284
+ image: str
285
+ page_number: int
286
+ page_height: float
287
+ page_width: float
288
+ ss_sheet_name: Optional[str] = None
289
+
254
290
  class Segment(BaseModel):
255
291
  bbox: BoundingBox
256
292
  content: str = ""
@@ -266,6 +302,15 @@ class Segment(BaseModel):
266
302
  segment_type: SegmentType
267
303
  confidence: Optional[float]
268
304
  text: str = ""
305
+ segment_length: Optional[int] = None
306
+ # Spreadsheet-specific fields
307
+ ss_cells: Optional[List[Cell]] = None
308
+ ss_header_bbox: Optional[BoundingBox] = None
309
+ ss_header_ocr: Optional[List[OCRResult]] = None
310
+ ss_header_text: Optional[str] = None
311
+ ss_header_range: Optional[str] = None
312
+ ss_range: Optional[str] = None
313
+ ss_sheet_name: Optional[str] = None
269
314
 
270
315
  class Chunk(BaseModel):
271
316
  chunk_id: str
@@ -276,6 +321,8 @@ class Chunk(BaseModel):
276
321
  class OutputResponse(BaseModel):
277
322
  chunks: List[Chunk]
278
323
  file_name: Optional[str]
324
+ mime_type: Optional[str] = None
325
+ pages: Optional[List[Page]] = None
279
326
  page_count: Optional[int]
280
327
  pdf_url: Optional[str]
281
328
 
@@ -1,5 +1,8 @@
1
1
  from .api.configuration import (
2
+ Alignment,
2
3
  BoundingBox,
4
+ Cell,
5
+ CellStyle,
3
6
  Chunk,
4
7
  ChunkProcessing,
5
8
  Configuration,
@@ -14,6 +17,7 @@ from .api.configuration import (
14
17
  OCRResult,
15
18
  OcrStrategy,
16
19
  OutputResponse,
20
+ Page,
17
21
  Segment,
18
22
  SegmentFormat,
19
23
  SegmentProcessing,
@@ -23,11 +27,15 @@ from .api.configuration import (
23
27
  Pipeline,
24
28
  Tokenizer,
25
29
  TokenizerType,
30
+ VerticalAlignment,
26
31
  )
27
32
  from .api.task_response import TaskResponse
28
33
 
29
34
  __all__ = [
35
+ "Alignment",
30
36
  "BoundingBox",
37
+ "Cell",
38
+ "CellStyle",
31
39
  "Chunk",
32
40
  "ChunkProcessing",
33
41
  "Configuration",
@@ -42,6 +50,7 @@ __all__ = [
42
50
  "OCRResult",
43
51
  "OcrStrategy",
44
52
  "OutputResponse",
53
+ "Page",
45
54
  "Segment",
46
55
  "SegmentFormat",
47
56
  "SegmentProcessing",
@@ -52,4 +61,5 @@ __all__ = [
52
61
  "Pipeline",
53
62
  "Tokenizer",
54
63
  "TokenizerType",
64
+ "VerticalAlignment",
55
65
  ]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chunkr-ai
3
- Version: 0.1.0
3
+ Version: 0.3.1
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  License: MIT License
@@ -28,6 +28,7 @@ Project-URL: Homepage, https://chunkr.ai
28
28
  Description-Content-Type: text/markdown
29
29
  License-File: LICENSE
30
30
  Requires-Dist: httpx>=0.25.0
31
+ Requires-Dist: matplotlib>=3.10.3
31
32
  Requires-Dist: nest-asyncio>=1.6.0
32
33
  Requires-Dist: pillow>=10.0.0
33
34
  Requires-Dist: pydantic>=2.0.0
@@ -18,4 +18,6 @@ src/chunkr_ai/api/misc.py
18
18
  src/chunkr_ai/api/protocol.py
19
19
  src/chunkr_ai/api/task_response.py
20
20
  tests/test_chunkr.py
21
- tests/test_file_handling.py
21
+ tests/test_excel.py
22
+ tests/test_file_handling.py
23
+ tests/test_pages.py
@@ -1,4 +1,5 @@
1
1
  httpx>=0.25.0
2
+ matplotlib>=3.10.3
2
3
  nest-asyncio>=1.6.0
3
4
  pillow>=10.0.0
4
5
  pydantic>=2.0.0
@@ -48,6 +48,13 @@ def client():
48
48
  client = Chunkr()
49
49
  yield client
50
50
 
51
+ @pytest.fixture(params=[
52
+ pytest.param(None, id="none_pipeline"),
53
+ pytest.param(Pipeline.AZURE, id="azure_pipeline"),
54
+ ])
55
+ def pipeline_type(request):
56
+ return request.param
57
+
51
58
  @pytest.fixture
52
59
  def markdown_embed_config():
53
60
  return Configuration(
@@ -584,4 +591,129 @@ async def test_extended_context(client, sample_path, extended_context_config):
584
591
 
585
592
  except Exception as e:
586
593
  print(f"Error during extended context test: {e}")
587
- raise # Re-raise the exception to fail the test explicitly
594
+ raise # Re-raise the exception to fail the test explicitly
595
+
596
+
597
+ # Tests for new fields added in recent updates
598
+ class TestNewFields:
599
+ """Test the newly added fields in the models"""
600
+
601
+ @pytest.mark.asyncio
602
+ async def test_output_has_mime_type_field(self, client, sample_path):
603
+ """Test that OutputResponse includes mime_type field"""
604
+ response = await client.upload(sample_path)
605
+ assert response.task_id is not None
606
+ assert response.status == "Succeeded"
607
+ assert response.output is not None
608
+
609
+ # mime_type should be accessible (might be None for some file types)
610
+ assert hasattr(response.output, 'mime_type')
611
+
612
+ # For PDF files, mime_type should be present
613
+ if response.output.mime_type:
614
+ assert "pdf" in response.output.mime_type.lower()
615
+
616
+ @pytest.mark.asyncio
617
+ async def test_output_has_pages_field(self, client, sample_path):
618
+ """Test that OutputResponse includes pages field"""
619
+ response = await client.upload(sample_path)
620
+ assert response.task_id is not None
621
+ assert response.status == "Succeeded"
622
+ assert response.output is not None
623
+
624
+ # pages should be accessible (might be None for some configurations)
625
+ assert hasattr(response.output, 'pages')
626
+
627
+ # If pages exist, validate structure
628
+ if response.output.pages:
629
+ assert len(response.output.pages) > 0
630
+ page = response.output.pages[0]
631
+ assert hasattr(page, 'image')
632
+ assert hasattr(page, 'page_number')
633
+ assert hasattr(page, 'page_height')
634
+ assert hasattr(page, 'page_width')
635
+ assert hasattr(page, 'ss_sheet_name')
636
+
637
+ @pytest.mark.asyncio
638
+ async def test_segments_have_spreadsheet_fields(self, client, sample_path):
639
+ """Test that Segment objects include new spreadsheet fields"""
640
+ response = await client.upload(sample_path)
641
+ assert response.task_id is not None
642
+ assert response.status == "Succeeded"
643
+ assert response.output is not None
644
+ assert len(response.output.chunks) > 0
645
+
646
+ segment = response.output.chunks[0].segments[0]
647
+
648
+ # All new spreadsheet fields should be accessible
649
+ assert hasattr(segment, 'segment_length')
650
+ assert hasattr(segment, 'ss_cells')
651
+ assert hasattr(segment, 'ss_header_bbox')
652
+ assert hasattr(segment, 'ss_header_ocr')
653
+ assert hasattr(segment, 'ss_header_text')
654
+ assert hasattr(segment, 'ss_header_range')
655
+ assert hasattr(segment, 'ss_range')
656
+ assert hasattr(segment, 'ss_sheet_name')
657
+
658
+ # For PDF files, spreadsheet fields should be None
659
+ assert segment.ss_cells is None
660
+ assert segment.ss_range is None
661
+ assert segment.ss_sheet_name is None
662
+
663
+ @pytest.mark.asyncio
664
+ async def test_segment_length_field(self, client, sample_path):
665
+ """Test that segments can have length calculations"""
666
+ response = await client.upload(sample_path)
667
+ assert response.task_id is not None
668
+ assert response.status == "Succeeded"
669
+ assert response.output is not None
670
+
671
+ # Check if any segments have length calculations
672
+ segments_with_length = []
673
+ for chunk in response.output.chunks:
674
+ for segment in chunk.segments:
675
+ if segment.segment_length is not None:
676
+ segments_with_length.append(segment)
677
+
678
+ # segment_length might be None depending on configuration
679
+ # but if present, should be positive
680
+ for segment in segments_with_length:
681
+ assert segment.segment_length > 0
682
+
683
+ @pytest.mark.asyncio
684
+ async def test_backwards_compatibility_preserved(self, client, sample_path):
685
+ """Test that all existing fields still work after adding new ones"""
686
+ response = await client.upload(sample_path)
687
+ assert response.task_id is not None
688
+ assert response.status == "Succeeded"
689
+ assert response.output is not None
690
+
691
+ # All existing fields should still work
692
+ assert response.output.chunks is not None
693
+ assert response.output.file_name is not None
694
+ assert response.output.page_count is not None
695
+ assert response.output.pdf_url is not None
696
+
697
+ # Chunk structure should be unchanged
698
+ chunk = response.output.chunks[0]
699
+ assert chunk.chunk_id is not None
700
+ assert chunk.chunk_length is not None
701
+ assert chunk.segments is not None
702
+ assert chunk.embed is not None or chunk.embed is None # embed can be None
703
+
704
+ # Segment structure should include all original fields
705
+ segment = chunk.segments[0]
706
+ assert segment.bbox is not None
707
+ assert segment.content is not None or segment.content == ""
708
+ assert segment.page_height is not None
709
+ assert segment.llm is not None or segment.llm is None
710
+ assert segment.html is not None or segment.html == ""
711
+ assert segment.image is not None or segment.image is None
712
+ assert segment.markdown is not None or segment.markdown == ""
713
+ assert segment.ocr is not None or segment.ocr == []
714
+ assert segment.page_number is not None
715
+ assert segment.page_width is not None
716
+ assert segment.segment_id is not None
717
+ assert segment.segment_type is not None
718
+ assert segment.confidence is not None or segment.confidence is None
719
+ assert segment.text is not None or segment.text == ""
@@ -0,0 +1,417 @@
1
+ import pytest
2
+ import json
3
+ from pathlib import Path
4
+
5
+ from chunkr_ai import Chunkr
6
+ from chunkr_ai.models import (
7
+ Configuration,
8
+ ChunkProcessing,
9
+ SegmentProcessing,
10
+ GenerationConfig,
11
+ SegmentFormat,
12
+ GenerationStrategy,
13
+ EmbedSource,
14
+ Tokenizer,
15
+ OcrStrategy,
16
+ SegmentationStrategy,
17
+ Cell,
18
+ CellStyle,
19
+ Alignment,
20
+ VerticalAlignment,
21
+ Page,
22
+ Segment,
23
+ SegmentType,
24
+ )
25
+
26
+
27
+ @pytest.fixture
28
+ def excel_sample_path():
29
+ """Path to the Excel test file"""
30
+ return Path("tests/files/excel/test.xlsx")
31
+
32
+
33
+ @pytest.fixture
34
+ def excel_expected_output():
35
+ """Expected output for Excel test file"""
36
+ with open("tests/files/excel/test.json", "r") as f:
37
+ return json.load(f)
38
+
39
+
40
+ @pytest.fixture
41
+ def client():
42
+ """Chunkr client instance"""
43
+ client = Chunkr()
44
+ yield client
45
+
46
+
47
+ @pytest.fixture
48
+ def excel_config():
49
+ """Configuration optimized for Excel processing"""
50
+ return Configuration(
51
+ high_resolution=True,
52
+ ocr_strategy=OcrStrategy.ALL,
53
+ segmentation_strategy=SegmentationStrategy.LAYOUT_ANALYSIS,
54
+ chunk_processing=ChunkProcessing(
55
+ target_length=512,
56
+ tokenizer=Tokenizer.WORD,
57
+ ),
58
+ segment_processing=SegmentProcessing(
59
+ Table=GenerationConfig(
60
+ format=SegmentFormat.MARKDOWN,
61
+ strategy=GenerationStrategy.AUTO,
62
+ embed_sources=[EmbedSource.MARKDOWN],
63
+ ),
64
+ Text=GenerationConfig(
65
+ format=SegmentFormat.MARKDOWN,
66
+ strategy=GenerationStrategy.AUTO,
67
+ embed_sources=[EmbedSource.MARKDOWN],
68
+ ),
69
+ ),
70
+ )
71
+
72
+
73
+ class TestExcelBasicFunctionality:
74
+ """Test basic Excel file processing"""
75
+
76
+ @pytest.mark.asyncio
77
+ async def test_excel_upload_and_process(self, client, excel_sample_path, excel_config):
78
+ """Test that Excel file can be uploaded and processed successfully"""
79
+ response = await client.upload(excel_sample_path, excel_config)
80
+
81
+ assert response.task_id is not None
82
+ assert response.status == "Succeeded"
83
+ assert response.output is not None
84
+ assert response.output.chunks is not None
85
+ assert len(response.output.chunks) > 0
86
+
87
+ @pytest.mark.asyncio
88
+ async def test_excel_mime_type(self, client, excel_sample_path, excel_config):
89
+ """Test that Excel files have correct MIME type"""
90
+ response = await client.upload(excel_sample_path, excel_config)
91
+
92
+ assert response.output.mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
93
+
94
+ @pytest.mark.asyncio
95
+ async def test_excel_pages_exist(self, client, excel_sample_path, excel_config):
96
+ """Test that Excel processing generates pages information"""
97
+ response = await client.upload(excel_sample_path, excel_config)
98
+
99
+ assert response.output.pages is not None
100
+ assert len(response.output.pages) > 0
101
+ assert response.output.page_count is not None
102
+ assert response.output.page_count > 0
103
+
104
+ @pytest.mark.asyncio
105
+ async def test_excel_chunks_have_segments(self, client, excel_sample_path, excel_config):
106
+ """Test that Excel chunks contain segments with data"""
107
+ response = await client.upload(excel_sample_path, excel_config)
108
+
109
+ assert len(response.output.chunks) > 0
110
+ chunk = response.output.chunks[0]
111
+ assert len(chunk.segments) > 0
112
+ assert chunk.chunk_length > 0
113
+
114
+
115
+ class TestExcelSpreadsheetFields:
116
+ """Test Excel-specific spreadsheet fields"""
117
+
118
+ @pytest.mark.asyncio
119
+ async def test_segments_have_spreadsheet_fields(self, client, excel_sample_path, excel_config):
120
+ """Test that segments contain spreadsheet-specific fields"""
121
+ response = await client.upload(excel_sample_path, excel_config)
122
+
123
+ # Find a segment with spreadsheet data
124
+ spreadsheet_segment = None
125
+ for chunk in response.output.chunks:
126
+ for segment in chunk.segments:
127
+ if segment.ss_cells and len(segment.ss_cells) > 0:
128
+ spreadsheet_segment = segment
129
+ break
130
+ if spreadsheet_segment:
131
+ break
132
+
133
+ assert spreadsheet_segment is not None, "No segment with spreadsheet data found"
134
+
135
+ # Test spreadsheet-specific fields
136
+ assert spreadsheet_segment.ss_cells is not None
137
+ assert len(spreadsheet_segment.ss_cells) > 0
138
+ assert spreadsheet_segment.ss_sheet_name is not None
139
+ assert spreadsheet_segment.ss_range is not None
140
+
141
+ @pytest.mark.asyncio
142
+ async def test_cells_have_required_fields(self, client, excel_sample_path, excel_config):
143
+ """Test that cells contain all required fields"""
144
+ response = await client.upload(excel_sample_path, excel_config)
145
+
146
+ # Find a segment with cells
147
+ test_cell = None
148
+ for chunk in response.output.chunks:
149
+ for segment in chunk.segments:
150
+ if segment.ss_cells and len(segment.ss_cells) > 0:
151
+ test_cell = segment.ss_cells[0]
152
+ break
153
+ if test_cell:
154
+ break
155
+
156
+ assert test_cell is not None, "No cell found in any segment"
157
+
158
+ # Test required cell fields
159
+ assert test_cell.cell_id is not None
160
+ assert test_cell.text is not None
161
+ assert test_cell.range is not None
162
+ # Optional fields should be accessible
163
+ assert hasattr(test_cell, 'formula')
164
+ assert hasattr(test_cell, 'value')
165
+ assert hasattr(test_cell, 'hyperlink')
166
+ assert hasattr(test_cell, 'style')
167
+
168
+ @pytest.mark.asyncio
169
+ async def test_cell_styling_fields(self, client, excel_sample_path, excel_config):
170
+ """Test that cells with styling contain CellStyle information"""
171
+ response = await client.upload(excel_sample_path, excel_config)
172
+
173
+ # Find a cell with styling
174
+ styled_cell = None
175
+ for chunk in response.output.chunks:
176
+ for segment in chunk.segments:
177
+ if segment.ss_cells:
178
+ for cell in segment.ss_cells:
179
+ if cell.style is not None:
180
+ styled_cell = cell
181
+ break
182
+ if styled_cell:
183
+ break
184
+ if styled_cell:
185
+ break
186
+
187
+ assert styled_cell is not None, "No styled cell found"
188
+ assert styled_cell.style is not None
189
+
190
+ # Test CellStyle fields
191
+ style = styled_cell.style
192
+ assert hasattr(style, 'bg_color')
193
+ assert hasattr(style, 'text_color')
194
+ assert hasattr(style, 'font_face')
195
+ assert hasattr(style, 'is_bold')
196
+ assert hasattr(style, 'align')
197
+ assert hasattr(style, 'valign')
198
+
199
+ @pytest.mark.asyncio
200
+ async def test_excel_sheet_names(self, client, excel_sample_path, excel_config):
201
+ """Test that sheet names are properly captured"""
202
+ response = await client.upload(excel_sample_path, excel_config)
203
+
204
+ # Check segments for sheet names
205
+ sheet_names = set()
206
+ for chunk in response.output.chunks:
207
+ for segment in chunk.segments:
208
+ if segment.ss_sheet_name:
209
+ sheet_names.add(segment.ss_sheet_name)
210
+
211
+ assert len(sheet_names) > 0, "No sheet names found in segments"
212
+
213
+ # Check pages for sheet names
214
+ page_sheet_names = set()
215
+ if response.output.pages:
216
+ for page in response.output.pages:
217
+ if page.ss_sheet_name:
218
+ page_sheet_names.add(page.ss_sheet_name)
219
+
220
+ # At least one source should have sheet names
221
+ assert len(sheet_names) > 0 or len(page_sheet_names) > 0
222
+
223
+
224
+ class TestExcelPages:
225
+ """Test Excel pages functionality"""
226
+
227
+ @pytest.mark.asyncio
228
+ async def test_pages_structure(self, client, excel_sample_path, excel_config):
229
+ """Test that pages have correct structure and fields"""
230
+ response = await client.upload(excel_sample_path, excel_config)
231
+
232
+ assert response.output.pages is not None
233
+ assert len(response.output.pages) > 0
234
+
235
+ page = response.output.pages[0]
236
+ assert page.image is not None
237
+ assert page.page_number is not None
238
+ assert page.page_height is not None
239
+ assert page.page_width is not None
240
+ # ss_sheet_name is optional for pages
241
+ assert hasattr(page, 'ss_sheet_name')
242
+
243
+ @pytest.mark.asyncio
244
+ async def test_page_count_consistency(self, client, excel_sample_path, excel_config):
245
+ """Test that page_count matches the actual number of pages"""
246
+ response = await client.upload(excel_sample_path, excel_config)
247
+
248
+ assert response.output.page_count is not None
249
+ if response.output.pages:
250
+ assert response.output.page_count == len(response.output.pages)
251
+
252
+ @pytest.mark.asyncio
253
+ async def test_page_numbers_sequential(self, client, excel_sample_path, excel_config):
254
+ """Test that page numbers are sequential and start from 1"""
255
+ response = await client.upload(excel_sample_path, excel_config)
256
+
257
+ if response.output.pages and len(response.output.pages) > 1:
258
+ page_numbers = [page.page_number for page in response.output.pages]
259
+ page_numbers.sort()
260
+
261
+ # Should start from 1 and be sequential
262
+ for i, page_num in enumerate(page_numbers):
263
+ assert page_num == i + 1, f"Page numbers not sequential: {page_numbers}"
264
+
265
+
266
+ class TestExcelSegmentTypes:
267
+ """Test Excel segment types and their properties"""
268
+
269
+ @pytest.mark.asyncio
270
+ async def test_segment_types_present(self, client, excel_sample_path, excel_config):
271
+ """Test that appropriate segment types are detected in Excel files"""
272
+ response = await client.upload(excel_sample_path, excel_config)
273
+
274
+ segment_types = set()
275
+ for chunk in response.output.chunks:
276
+ for segment in chunk.segments:
277
+ segment_types.add(segment.segment_type)
278
+
279
+ # Excel files should contain at least Table or Text segments
280
+ expected_types = {SegmentType.TABLE, SegmentType.TEXT}
281
+ assert len(segment_types.intersection(expected_types)) > 0, f"No expected segment types found. Got: {segment_types}"
282
+
283
+ @pytest.mark.asyncio
284
+ async def test_table_segments_have_cells(self, client, excel_sample_path, excel_config):
285
+ """Test that TABLE segments contain cell data"""
286
+ response = await client.upload(excel_sample_path, excel_config)
287
+
288
+ table_segments = []
289
+ for chunk in response.output.chunks:
290
+ for segment in chunk.segments:
291
+ if segment.segment_type == SegmentType.TABLE:
292
+ table_segments.append(segment)
293
+
294
+ if table_segments: # If we have table segments, they should have cells
295
+ found_cells = False
296
+ for segment in table_segments:
297
+ if segment.ss_cells and len(segment.ss_cells) > 0:
298
+ found_cells = True
299
+ break
300
+ assert found_cells, "TABLE segments should contain cell data"
301
+
302
+
303
+ class TestExcelEmbedding:
304
+ """Test Excel embedding functionality"""
305
+
306
+ @pytest.mark.asyncio
307
+ async def test_chunks_have_embed_content(self, client, excel_sample_path, excel_config):
308
+ """Test that chunks generate embed content for Excel data"""
309
+ response = await client.upload(excel_sample_path, excel_config)
310
+
311
+ # At least some chunks should have embed content
312
+ chunks_with_embed = [chunk for chunk in response.output.chunks if chunk.embed]
313
+ assert len(chunks_with_embed) > 0, "No chunks with embed content found"
314
+
315
+ # Embed content should not be empty
316
+ for chunk in chunks_with_embed:
317
+ assert len(chunk.embed.strip()) > 0, "Empty embed content found"
318
+
319
+ @pytest.mark.asyncio
320
+ async def test_segment_length_calculation(self, client, excel_sample_path, excel_config):
321
+ """Test that segments have length calculations"""
322
+ response = await client.upload(excel_sample_path, excel_config)
323
+
324
+ segments_with_length = []
325
+ for chunk in response.output.chunks:
326
+ for segment in chunk.segments:
327
+ if segment.segment_length is not None:
328
+ segments_with_length.append(segment)
329
+
330
+ # At least some segments should have length calculations
331
+ assert len(segments_with_length) > 0, "No segments with length calculations found"
332
+
333
+ # Lengths should be positive
334
+ for segment in segments_with_length:
335
+ assert segment.segment_length > 0, f"Invalid segment length: {segment.segment_length}"
336
+
337
+
338
+ class TestExcelEdgeCases:
339
+ """Test edge cases and error handling for Excel processing"""
340
+
341
+ @pytest.mark.asyncio
342
+ async def test_empty_cells_handling(self, client, excel_sample_path, excel_config):
343
+ """Test that empty cells are handled properly"""
344
+ response = await client.upload(excel_sample_path, excel_config)
345
+
346
+ # Look for cells that might be empty
347
+ all_cells = []
348
+ for chunk in response.output.chunks:
349
+ for segment in chunk.segments:
350
+ if segment.ss_cells:
351
+ all_cells.extend(segment.ss_cells)
352
+
353
+ assert len(all_cells) > 0, "No cells found to test"
354
+
355
+ # All cells should have a text field, even if empty
356
+ for cell in all_cells:
357
+ assert hasattr(cell, 'text'), "Cell missing text field"
358
+ assert cell.text is not None, "Cell text is None"
359
+
360
+ @pytest.mark.asyncio
361
+ async def test_range_format_validity(self, client, excel_sample_path, excel_config):
362
+ """Test that Excel ranges follow expected format"""
363
+ response = await client.upload(excel_sample_path, excel_config)
364
+
365
+ ranges = []
366
+ for chunk in response.output.chunks:
367
+ for segment in chunk.segments:
368
+ if segment.ss_range:
369
+ ranges.append(segment.ss_range)
370
+ if segment.ss_cells:
371
+ for cell in segment.ss_cells:
372
+ ranges.append(cell.range)
373
+
374
+ assert len(ranges) > 0, "No ranges found to test"
375
+
376
+ # Basic range format validation (e.g., "A1", "A1:B2")
377
+ import re
378
+ range_pattern = re.compile(r'^[A-Z]+\d+(:[A-Z]+\d+)?$')
379
+ valid_ranges = [r for r in ranges if range_pattern.match(r)]
380
+
381
+ # Most ranges should follow the expected format
382
+ assert len(valid_ranges) > 0, f"No valid ranges found. Ranges: {ranges[:10]}..."
383
+
384
+
385
+ # Integration test using the expected output fixture
386
+ class TestExcelIntegration:
387
+ """Integration tests comparing against expected output"""
388
+
389
+ @pytest.mark.asyncio
390
+ async def test_compare_with_expected_structure(self, client, excel_sample_path, excel_config, excel_expected_output):
391
+ """Test that the output structure matches expected format"""
392
+ response = await client.upload(excel_sample_path, excel_config)
393
+
394
+ expected = excel_expected_output["output"]
395
+ actual = response.output
396
+
397
+ # Compare high-level structure
398
+ assert actual.mime_type == expected["mime_type"]
399
+ assert actual.page_count == expected["page_count"]
400
+ assert len(actual.chunks) > 0
401
+ assert len(actual.pages) > 0
402
+
403
+ # Verify that we have similar data structure
404
+ expected_has_cells = any(
405
+ segment.get("ss_cells")
406
+ for chunk in expected["chunks"]
407
+ for segment in chunk["segments"]
408
+ )
409
+ actual_has_cells = any(
410
+ segment.ss_cells
411
+ for chunk in actual.chunks
412
+ for segment in chunk.segments
413
+ if segment.ss_cells
414
+ )
415
+
416
+ if expected_has_cells:
417
+ assert actual_has_cells, "Expected cells in output but none found"
@@ -0,0 +1,261 @@
1
+ import pytest
2
+ from pathlib import Path
3
+
4
+ from chunkr_ai import Chunkr
5
+ from chunkr_ai.models import (
6
+ Configuration,
7
+ ChunkProcessing,
8
+ SegmentProcessing,
9
+ GenerationConfig,
10
+ SegmentFormat,
11
+ GenerationStrategy,
12
+ EmbedSource,
13
+ Tokenizer,
14
+ OcrStrategy,
15
+ SegmentationStrategy,
16
+ Page,
17
+ )
18
+
19
+
20
@pytest.fixture
def client():
    """Yield a default-constructed ``Chunkr`` client to the requesting test."""
    # No teardown follows the yield; the instance is simply handed to the test.
    yield Chunkr()
25
+
26
+
27
@pytest.fixture
def sample_pdf_path():
    """Relative path of the sample PDF used by the page tests."""
    # Relative path: resolved against the current working directory at run time.
    return Path("tests") / "files" / "test.pdf"
31
+
32
+
33
@pytest.fixture
def excel_sample_path():
    """Relative path of the sample Excel workbook used by the page tests."""
    # Relative path: resolved against the current working directory at run time.
    return Path("tests") / "files" / "excel" / "test.xlsx"
37
+
38
+
39
@pytest.fixture
def basic_config():
    """Build the ``Configuration`` shared by every page test in this module.

    High resolution is enabled, OCR uses ``OcrStrategy.ALL`` and segmentation
    uses ``SegmentationStrategy.LAYOUT_ANALYSIS``.
    """
    settings = {
        "high_resolution": True,
        "ocr_strategy": OcrStrategy.ALL,
        "segmentation_strategy": SegmentationStrategy.LAYOUT_ANALYSIS,
    }
    return Configuration(**settings)
47
+
48
+
49
class TestPagesBasicFunctionality:
    """Checks that uploads of different file types report page information."""

    @pytest.mark.asyncio
    async def test_pdf_generates_pages(self, client, sample_pdf_path, basic_config):
        """A PDF upload succeeds and, when pages are returned, the first one is populated."""
        response = await client.upload(sample_pdf_path, basic_config)

        assert response.task_id is not None
        assert response.status == "Succeeded"
        assert response.output is not None

        pages = response.output.pages
        if not pages:
            # Pages are treated as optional here, so their absence is not a failure.
            return

        assert len(pages) > 0
        first_page = pages[0]
        assert first_page.image is not None
        assert first_page.page_number is not None
        assert first_page.page_height is not None
        assert first_page.page_width is not None

    @pytest.mark.asyncio
    async def test_excel_generates_pages_with_sheet_info(self, client, excel_sample_path, basic_config):
        """An Excel upload must return pages, and the first page must name its sheet."""
        response = await client.upload(excel_sample_path, basic_config)

        assert response.task_id is not None
        assert response.status == "Succeeded"
        assert response.output is not None

        # Unlike the PDF case, pages are mandatory for a workbook.
        pages = response.output.pages
        assert pages is not None
        assert len(pages) > 0

        first_page = pages[0]
        assert first_page.image is not None
        assert first_page.page_number is not None
        assert first_page.page_height is not None
        assert first_page.page_width is not None
        # Spreadsheet pages are expected to carry the originating sheet name.
        assert first_page.ss_sheet_name is not None

    @pytest.mark.asyncio
    async def test_page_count_consistency(self, client, sample_pdf_path, basic_config):
        """``page_count`` equals the number of returned pages, or is positive when none are returned."""
        response = await client.upload(sample_pdf_path, basic_config)

        output = response.output
        assert output.page_count is not None

        pages = output.pages
        if not pages:
            # Without a pages array the count must still be a meaningful number.
            assert output.page_count > 0
            return

        assert output.page_count == len(pages)
102
+
103
+
104
class TestPageStructure:
    """Checks on the fields of individual ``Page`` objects returned for a PDF upload.

    Every test here passes without asserting anything when the response
    contains no pages.
    """

    @pytest.mark.asyncio
    async def test_page_required_fields(self, client, sample_pdf_path, basic_config):
        """The first page has an image, an integer number and numeric dimensions."""
        response = await client.upload(sample_pdf_path, basic_config)

        pages = response.output.pages
        if not pages:
            return

        first_page = pages[0]

        # Required fields
        assert first_page.image is not None
        assert isinstance(first_page.page_number, int)
        assert isinstance(first_page.page_height, (int, float))
        assert isinstance(first_page.page_width, (int, float))

        # The optional attribute must exist, whatever its value.
        assert hasattr(first_page, 'ss_sheet_name')

    @pytest.mark.asyncio
    async def test_page_numbers_start_from_one(self, client, sample_pdf_path, basic_config):
        """Sorted page numbers begin at 1 and increase by exactly one each step."""
        response = await client.upload(sample_pdf_path, basic_config)

        pages = response.output.pages
        if not pages:
            return

        page_numbers = sorted(page.page_number for page in pages)

        # Should start from 1
        assert page_numbers[0] == 1, f"Page numbers should start from 1, got: {page_numbers[0]}"

        # Each number must be its predecessor plus one; a single page yields no pairs.
        for previous, current in zip(page_numbers, page_numbers[1:]):
            assert current == previous + 1, f"Page numbers not sequential: {page_numbers}"

    @pytest.mark.asyncio
    async def test_page_dimensions_positive(self, client, sample_pdf_path, basic_config):
        """Every returned page has a strictly positive height and width."""
        response = await client.upload(sample_pdf_path, basic_config)

        for page in response.output.pages or ():
            assert page.page_height > 0, f"Invalid page height: {page.page_height}"
            assert page.page_width > 0, f"Invalid page width: {page.page_width}"

    @pytest.mark.asyncio
    async def test_page_images_are_urls(self, client, sample_pdf_path, basic_config):
        """Every returned page image is a string beginning with an http(s) scheme."""
        response = await client.upload(sample_pdf_path, basic_config)

        url_schemes = ('http://', 'https://')
        for page in response.output.pages or ():
            assert page.image.startswith(url_schemes), f"Invalid page image URL: {page.image}"
159
+
160
+
161
class TestMimeTypeHandling:
    """Checks that the reported MIME type matches the kind of file uploaded."""

    @pytest.mark.asyncio
    async def test_pdf_mime_type(self, client, sample_pdf_path, basic_config):
        """A PDF upload reports one of the accepted PDF MIME types."""
        response = await client.upload(sample_pdf_path, basic_config)

        reported = response.output.mime_type
        assert reported is not None

        accepted_pdf_types = (
            "application/pdf",
            "application/x-pdf",
        )
        assert reported in accepted_pdf_types, f"Unexpected PDF MIME type: {response.output.mime_type}"

    @pytest.mark.asyncio
    async def test_excel_mime_type(self, client, excel_sample_path, basic_config):
        """An Excel upload reports one of the accepted spreadsheet MIME types."""
        response = await client.upload(excel_sample_path, basic_config)

        reported = response.output.mime_type
        assert reported is not None

        accepted_excel_types = (
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "application/vnd.ms-excel",
        )
        assert reported in accepted_excel_types, f"Unexpected Excel MIME type: {response.output.mime_type}"
188
+
189
+
190
class TestBackwardsCompatibility:
    """Checks that the page/spreadsheet additions leave the older response fields usable."""

    @pytest.mark.asyncio
    async def test_existing_fields_still_work(self, client, sample_pdf_path, basic_config):
        """Task, output, chunk and segment fields are all still populated for a PDF upload."""
        response = await client.upload(sample_pdf_path, basic_config)

        # Task- and output-level fields
        assert response.task_id is not None
        assert response.status == "Succeeded"
        output = response.output
        assert output is not None
        assert output.chunks is not None
        assert len(output.chunks) > 0
        assert output.file_name is not None
        assert output.page_count is not None
        assert output.pdf_url is not None

        # Chunk-level fields
        first_chunk = output.chunks[0]
        assert first_chunk.chunk_id is not None
        assert first_chunk.chunk_length is not None
        assert first_chunk.segments is not None
        assert len(first_chunk.segments) > 0

        # Segment-level fields
        first_segment = first_chunk.segments[0]
        assert first_segment.segment_id is not None
        assert first_segment.segment_type is not None
        assert first_segment.bbox is not None

    @pytest.mark.asyncio
    async def test_optional_new_fields(self, client, sample_pdf_path, basic_config):
        """New attributes exist on the output and on every segment; spreadsheet ones are None for non-xlsx output."""
        response = await client.upload(sample_pdf_path, basic_config)

        output = response.output

        # The attributes must exist, though their values may be None.
        assert hasattr(output, 'mime_type')
        assert hasattr(output, 'pages')

        # Only output reported as xlsx is allowed to populate the spreadsheet fields.
        is_xlsx = output.mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"

        for chunk in output.chunks:
            for segment in chunk.segments:
                for attribute in ('ss_cells', 'ss_range', 'ss_sheet_name', 'segment_length'):
                    assert hasattr(segment, attribute)

                if is_xlsx:
                    continue

                assert segment.ss_cells is None
                assert segment.ss_range is None
                assert segment.ss_sheet_name is None
243
+
244
+
245
class TestErrorHandling:
    """Checks that a response stays coherent whether or not it carries a pages array."""

    @pytest.mark.asyncio
    async def test_missing_pages_handled_gracefully(self, client, sample_pdf_path, basic_config):
        """``page_count`` is positive without pages, and equals their number when they are present."""
        response = await client.upload(sample_pdf_path, basic_config)

        output = response.output
        pages = output.pages

        if pages is not None:
            # A pages array, when present, must be non-empty and match the count.
            assert len(pages) > 0
            assert output.page_count == len(pages)
            return

        # No pages array: the count alone must still be usable.
        assert output.page_count is not None
        assert output.page_count > 0
File without changes
File without changes
File without changes