chunkr-ai 0.0.43__tar.gz → 0.0.45__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22)
  1. {chunkr_ai-0.0.43/src/chunkr_ai.egg-info → chunkr_ai-0.0.45}/PKG-INFO +1 -1
  2. {chunkr_ai-0.0.43 → chunkr_ai-0.0.45}/pyproject.toml +1 -1
  3. {chunkr_ai-0.0.43 → chunkr_ai-0.0.45}/src/chunkr_ai/api/configuration.py +89 -2
  4. {chunkr_ai-0.0.43 → chunkr_ai-0.0.45}/src/chunkr_ai/models.py +8 -1
  5. {chunkr_ai-0.0.43 → chunkr_ai-0.0.45/src/chunkr_ai.egg-info}/PKG-INFO +1 -1
  6. {chunkr_ai-0.0.43 → chunkr_ai-0.0.45}/tests/test_chunkr.py +162 -1
  7. {chunkr_ai-0.0.43 → chunkr_ai-0.0.45}/LICENSE +0 -0
  8. {chunkr_ai-0.0.43 → chunkr_ai-0.0.45}/README.md +0 -0
  9. {chunkr_ai-0.0.43 → chunkr_ai-0.0.45}/setup.cfg +0 -0
  10. {chunkr_ai-0.0.43 → chunkr_ai-0.0.45}/src/chunkr_ai/__init__.py +0 -0
  11. {chunkr_ai-0.0.43 → chunkr_ai-0.0.45}/src/chunkr_ai/api/__init__.py +0 -0
  12. {chunkr_ai-0.0.43 → chunkr_ai-0.0.45}/src/chunkr_ai/api/auth.py +0 -0
  13. {chunkr_ai-0.0.43 → chunkr_ai-0.0.45}/src/chunkr_ai/api/chunkr.py +0 -0
  14. {chunkr_ai-0.0.43 → chunkr_ai-0.0.45}/src/chunkr_ai/api/chunkr_base.py +0 -0
  15. {chunkr_ai-0.0.43 → chunkr_ai-0.0.45}/src/chunkr_ai/api/decorators.py +0 -0
  16. {chunkr_ai-0.0.43 → chunkr_ai-0.0.45}/src/chunkr_ai/api/misc.py +0 -0
  17. {chunkr_ai-0.0.43 → chunkr_ai-0.0.45}/src/chunkr_ai/api/protocol.py +0 -0
  18. {chunkr_ai-0.0.43 → chunkr_ai-0.0.45}/src/chunkr_ai/api/task_response.py +0 -0
  19. {chunkr_ai-0.0.43 → chunkr_ai-0.0.45}/src/chunkr_ai.egg-info/SOURCES.txt +0 -0
  20. {chunkr_ai-0.0.43 → chunkr_ai-0.0.45}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
  21. {chunkr_ai-0.0.43 → chunkr_ai-0.0.45}/src/chunkr_ai.egg-info/requires.txt +0 -0
  22. {chunkr_ai-0.0.43 → chunkr_ai-0.0.45}/src/chunkr_ai.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: chunkr-ai
-Version: 0.0.43
+Version: 0.0.45
 Summary: Python client for Chunkr: open source document intelligence
 Author-email: Ishaan Kapoor <ishaan@lumina.sh>
 License: MIT License
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "chunkr-ai"
-version = "0.0.43"
+version = "0.0.45"
 authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
 description = "Python client for Chunkr: open source document intelligence"
 readme = "README.md"
@@ -1,6 +1,7 @@
 from pydantic import BaseModel, Field, ConfigDict
 from enum import Enum
-from typing import Any, List, Optional
+from typing import Any, List, Optional, Union
+from pydantic import field_validator
 
 class GenerationStrategy(str, Enum):
     LLM = "LLM"
@@ -10,11 +11,18 @@ class CroppingStrategy(str, Enum):
     ALL = "All"
     AUTO = "Auto"
 
+class EmbedSource(str, Enum):
+    HTML = "HTML"
+    MARKDOWN = "Markdown"
+    LLM = "LLM"
+    CONTENT = "Content"
+
 class GenerationConfig(BaseModel):
     html: Optional[GenerationStrategy] = None
     llm: Optional[str] = None
     markdown: Optional[GenerationStrategy] = None
     crop_image: Optional[CroppingStrategy] = None
+    embed_sources: Optional[List[EmbedSource]] = Field(default_factory=lambda: [EmbedSource.MARKDOWN])
 
 class SegmentProcessing(BaseModel):
     model_config = ConfigDict(populate_by_name=True, alias_generator=str.title)
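Note: a minimal usage sketch of the new embed_sources field, using only names that appear in this diff (the tests below exercise it the same way via Chunkr().upload). Markdown remains the default embed source, so setting the field only matters when you want a different one:

    from chunkr_ai.models import (
        Configuration, EmbedSource, GenerationConfig,
        GenerationStrategy, SegmentProcessing,
    )

    # Embed the HTML rendering (instead of the default Markdown) for Page segments
    config = Configuration(
        segment_processing=SegmentProcessing(
            page=GenerationConfig(
                html=GenerationStrategy.LLM,
                embed_sources=[EmbedSource.HTML],
            )
        ),
    )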
@@ -32,9 +40,83 @@ class SegmentProcessing(BaseModel):
     text: Optional[GenerationConfig] = Field(default=None, alias="Text")
     title: Optional[GenerationConfig] = Field(default=None, alias="Title")
 
+class Tokenizer(str, Enum):
+    WORD = "Word"
+    CL100K_BASE = "Cl100kBase"
+    XLM_ROBERTA_BASE = "XlmRobertaBase"
+    BERT_BASE_UNCASED = "BertBaseUncased"
+
+class TokenizerType(BaseModel):
+    enum_value: Optional[Tokenizer] = None
+    string_value: Optional[str] = None
+
+    @classmethod
+    def from_enum(cls, enum_value: Tokenizer) -> "TokenizerType":
+        return cls(enum_value=enum_value)
+
+    @classmethod
+    def from_string(cls, string_value: str) -> "TokenizerType":
+        return cls(string_value=string_value)
+
+    def __str__(self) -> str:
+        if self.enum_value is not None:
+            return f"enum:{self.enum_value.value}"
+        elif self.string_value is not None:
+            return f"string:{self.string_value}"
+        return ""
+
+    model_config = ConfigDict(
+        json_encoders={
+            'TokenizerType': lambda v: v.model_dump()
+        }
+    )
+
+    def model_dump(self, **kwargs):
+        if self.enum_value is not None:
+            return {"Enum": self.enum_value.value}
+        elif self.string_value is not None:
+            return {"String": self.string_value}
+        return {}
+
 class ChunkProcessing(BaseModel):
-    ignore_headers_and_footers: Optional[bool] = None
+    ignore_headers_and_footers: Optional[bool] = True
     target_length: Optional[int] = None
+    tokenizer: Optional[Union[TokenizerType, Tokenizer, str]] = None
+
+    model_config = ConfigDict(
+        arbitrary_types_allowed=True,
+        json_encoders={
+            TokenizerType: lambda v: v.model_dump()
+        }
+    )
+
+    @field_validator('tokenizer', mode='before')
+    def validate_tokenizer(cls, v):
+        if v is None:
+            return None
+
+        if isinstance(v, TokenizerType):
+            return v
+
+        if isinstance(v, Tokenizer):
+            return TokenizerType(enum_value=v)
+
+        if isinstance(v, dict):
+            if "Enum" in v:
+                try:
+                    return TokenizerType(enum_value=Tokenizer(v["Enum"]))
+                except ValueError:
+                    return TokenizerType(string_value=v["Enum"])
+            elif "String" in v:
+                return TokenizerType(string_value=v["String"])
+
+        if isinstance(v, str):
+            try:
+                return TokenizerType(enum_value=Tokenizer(v))
+            except ValueError:
+                return TokenizerType(string_value=v)
+
+        raise ValueError(f"Cannot convert {v} to TokenizerType")
 
 class OcrStrategy(str, Enum):
     ALL = "All"
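Note: a small local sketch of what the tokenizer validator and the overridden model_dump above produce (no API calls; import path as exported from chunkr_ai.models in this release). A value matching the Tokenizer enum is normalized to the Enum variant; anything else, such as a Hugging Face model id, falls through to the String variant:

    from chunkr_ai.models import ChunkProcessing

    # "Word" matches Tokenizer.WORD, so it serializes as an Enum variant
    print(ChunkProcessing(tokenizer="Word").tokenizer.model_dump())
    # {'Enum': 'Word'}

    # An arbitrary identifier is kept as a String variant
    print(ChunkProcessing(tokenizer="Qwen/Qwen-tokenizer").tokenizer.model_dump())
    # {'String': 'Qwen/Qwen-tokenizer'}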
@@ -44,6 +126,10 @@ class SegmentationStrategy(str, Enum):
     LAYOUT_ANALYSIS = "LayoutAnalysis"
     PAGE = "Page"
 
+class ErrorHandlingStrategy(str, Enum):
+    FAIL = "Fail"
+    CONTINUE = "Continue"
+
 class BoundingBox(BaseModel):
     left: float
     top: float
@@ -107,6 +193,7 @@ class Pipeline(str, Enum):
 class Configuration(BaseModel):
     chunk_processing: Optional[ChunkProcessing] = None
     expires_in: Optional[int] = None
+    error_handling: Optional[ErrorHandlingStrategy] = None
     high_resolution: Optional[bool] = None
     ocr_strategy: Optional[OcrStrategy] = None
     segment_processing: Optional[SegmentProcessing] = None
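Note: a minimal sketch combining the new top-level error_handling field with chunk processing. Field names come from this diff; the Fail/Continue semantics are inferred from the enum names and the test_error_handling_continue test below:

    from chunkr_ai.models import (
        ChunkProcessing, Configuration, ErrorHandlingStrategy, Tokenizer,
    )

    config = Configuration(
        # presumably: keep processing past per-segment failures instead of failing the task
        error_handling=ErrorHandlingStrategy.CONTINUE,
        chunk_processing=ChunkProcessing(
            target_length=512,                # illustrative value
            tokenizer=Tokenizer.CL100K_BASE,  # count chunk length in cl100k_base tokens
        ),
    )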
@@ -4,6 +4,8 @@ from .api.configuration import (
     ChunkProcessing,
     Configuration,
     CroppingStrategy,
+    EmbedSource,
+    ErrorHandlingStrategy,
     GenerationStrategy,
     GenerationConfig,
     Model,
@@ -16,6 +18,8 @@ from .api.configuration import (
     SegmentationStrategy,
     Status,
     Pipeline,
+    Tokenizer,
+    TokenizerType,
 )
 from .api.task_response import TaskResponse
 
@@ -25,6 +29,8 @@ __all__ = [
     "ChunkProcessing",
     "Configuration",
     "CroppingStrategy",
+    "EmbedSource",
+    "ErrorHandlingStrategy",
     "GenerationConfig",
     "GenerationStrategy",
     "Model",
@@ -38,5 +44,6 @@ __all__ = [
     "Status",
     "TaskResponse",
     "Pipeline",
+    "Tokenizer",
+    "TokenizerType",
 ]
-
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: chunkr-ai
-Version: 0.0.43
+Version: 0.0.45
 Summary: Python client for Chunkr: open source document intelligence
 Author-email: Ishaan Kapoor <ishaan@lumina.sh>
 License: MIT License
@@ -15,6 +15,9 @@ from chunkr_ai.models import (
     SegmentProcessing,
     ChunkProcessing,
     TaskResponse,
+    EmbedSource,
+    ErrorHandlingStrategy,
+    Tokenizer,
 )
 
 @pytest.fixture
@@ -34,6 +37,90 @@ def client():
     client = Chunkr()
     yield client
 
+@pytest.fixture
+def markdown_embed_config():
+    return Configuration(
+        segment_processing=SegmentProcessing(
+            page=GenerationConfig(
+                html=GenerationStrategy.LLM,
+                markdown=GenerationStrategy.LLM,
+                embed_sources=[EmbedSource.MARKDOWN]
+            )
+        ),
+    )
+
+@pytest.fixture
+def html_embed_config():
+    return Configuration(
+        segment_processing=SegmentProcessing(
+            page=GenerationConfig(
+                html=GenerationStrategy.LLM,
+                markdown=GenerationStrategy.LLM,
+                embed_sources=[EmbedSource.HTML]
+            )
+        ),
+    )
+
+@pytest.fixture
+def multiple_embed_config():
+    return Configuration(
+        segment_processing=SegmentProcessing(
+            page=GenerationConfig(
+                html=GenerationStrategy.LLM,
+                markdown=GenerationStrategy.LLM,
+                llm="Generate a summary of this content",
+                embed_sources=[EmbedSource.MARKDOWN, EmbedSource.LLM, EmbedSource.HTML]
+            )
+        ),
+    )
+
+@pytest.fixture
+def word_tokenizer_string_config():
+    return Configuration(
+        chunk_processing=ChunkProcessing(
+            tokenizer="Word"
+        ),
+    )
+
+@pytest.fixture
+def word_tokenizer_config():
+    return Configuration(
+        chunk_processing=ChunkProcessing(
+            tokenizer=Tokenizer.WORD
+        ),
+    )
+
+@pytest.fixture
+def cl100k_tokenizer_config():
+    return Configuration(
+        chunk_processing=ChunkProcessing(
+            tokenizer=Tokenizer.CL100K_BASE
+        ),
+    )
+
+@pytest.fixture
+def custom_tokenizer_config():
+    return Configuration(
+        chunk_processing=ChunkProcessing(
+            tokenizer="Qwen/Qwen-tokenizer"
+        ),
+    )
+
+@pytest.fixture
+def xlm_roberta_with_html_content_config():
+    return Configuration(
+        chunk_processing=ChunkProcessing(
+            tokenizer=Tokenizer.XLM_ROBERTA_BASE
+        ),
+        segment_processing=SegmentProcessing(
+            page=GenerationConfig(
+                html=GenerationStrategy.LLM,
+                markdown=GenerationStrategy.LLM,
+                embed_sources=[EmbedSource.HTML, EmbedSource.CONTENT]
+            )
+        ),
+    )
+
 @pytest.mark.asyncio
 async def test_send_file_path(client, sample_path):
     response = await client.upload(sample_path)
@@ -241,6 +328,15 @@ async def test_send_base64_file(client, sample_path):
     assert response.status == "Succeeded"
     assert response.output is not None
 
+@pytest.mark.asyncio
+async def test_send_base64_file_with_data_url(client, sample_path):
+    with open(sample_path, "rb") as f:
+        base64_content = base64.b64encode(f.read()).decode('utf-8')
+    response = await client.upload(f"data:application/pdf;base64,{base64_content}")
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+
 @pytest.mark.asyncio
 async def test_send_base64_file_with_filename(client, sample_path):
     # Read file and convert to base64
@@ -289,4 +385,69 @@ async def test_output_files_with_dirs(client, sample_path, tmp_path):
     assert html_file.exists()
     assert md_file.exists()
     assert content_file.exists()
-    assert json_file.exists()
+    assert json_file.exists()
+
+@pytest.mark.asyncio
+async def test_embed_sources_markdown_only(client, sample_path, markdown_embed_config):
+    response = await client.upload(sample_path, markdown_embed_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+    # Check the first chunk to verify embed exists
+    if response.output.chunks:
+        chunk = response.output.chunks[0]
+        assert chunk.embed is not None
+
+@pytest.mark.asyncio
+async def test_embed_sources_html_only(client, sample_path, html_embed_config):
+    response = await client.upload(sample_path, html_embed_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+
+@pytest.mark.asyncio
+async def test_embed_sources_multiple(client, sample_path, multiple_embed_config):
+    response = await client.upload(sample_path, multiple_embed_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+
+@pytest.mark.asyncio
+async def test_tokenizer_word(client, sample_path, word_tokenizer_config):
+    response = await client.upload(sample_path, word_tokenizer_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+    if response.output.chunks:
+        for chunk in response.output.chunks:
+            # Word tokenizer should result in chunks with length close to target
+            assert chunk.chunk_length > 0
+            assert chunk.chunk_length <= 600  # Allow some flexibility
+
+@pytest.mark.asyncio
+async def test_tokenizer_cl100k(client, sample_path, cl100k_tokenizer_config):
+    response = await client.upload(sample_path, cl100k_tokenizer_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+
+@pytest.mark.asyncio
+async def test_tokenizer_custom_string(client, sample_path, custom_tokenizer_config):
+    response = await client.upload(sample_path, custom_tokenizer_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+
+@pytest.mark.asyncio
+async def test_embed_sources_with_different_tokenizer(client, sample_path, xlm_roberta_with_html_content_config):
+    response = await client.upload(sample_path, xlm_roberta_with_html_content_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+
+@pytest.mark.asyncio
+async def test_error_handling_continue(client, sample_path):
+    response = await client.upload(sample_path, Configuration(error_handling=ErrorHandlingStrategy.CONTINUE))
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None