chunkr-ai 0.0.41__tar.gz → 0.0.44__tar.gz

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (22)
  1. {chunkr_ai-0.0.41/src/chunkr_ai.egg-info → chunkr_ai-0.0.44}/PKG-INFO +3 -2
  2. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/pyproject.toml +1 -1
  3. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/api/configuration.py +84 -2
  4. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/api/task_response.py +7 -2
  5. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/models.py +6 -1
  6. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44/src/chunkr_ai.egg-info}/PKG-INFO +3 -2
  7. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/tests/test_chunkr.py +154 -1
  8. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/LICENSE +0 -0
  9. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/README.md +0 -0
  10. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/setup.cfg +0 -0
  11. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/__init__.py +0 -0
  12. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/api/__init__.py +0 -0
  13. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/api/auth.py +0 -0
  14. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/api/chunkr.py +0 -0
  15. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/api/chunkr_base.py +0 -0
  16. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/api/decorators.py +0 -0
  17. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/api/misc.py +0 -0
  18. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/api/protocol.py +0 -0
  19. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai.egg-info/SOURCES.txt +0 -0
  20. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
  21. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai.egg-info/requires.txt +0 -0
  22. {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai.egg-info/top_level.txt +0 -0
{chunkr_ai-0.0.41/src/chunkr_ai.egg-info → chunkr_ai-0.0.44}/PKG-INFO
@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: chunkr-ai
-Version: 0.0.41
+Version: 0.0.44
 Summary: Python client for Chunkr: open source document intelligence
 Author-email: Ishaan Kapoor <ishaan@lumina.sh>
 License: MIT License
@@ -37,6 +37,7 @@ Requires-Dist: pytest>=7.0.0; extra == "test"
 Requires-Dist: pytest-xdist>=3.0.0; extra == "test"
 Requires-Dist: pytest-asyncio>=0.21.0; extra == "test"
 Requires-Dist: ruff>=0.9.3; extra == "test"
+Dynamic: license-file
 
 # Chunkr Python Client
 
{chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "chunkr-ai"
-version = "0.0.41"
+version = "0.0.44"
 authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
 description = "Python client for Chunkr: open source document intelligence"
 readme = "README.md"
{chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/api/configuration.py
@@ -1,6 +1,7 @@
 from pydantic import BaseModel, Field, ConfigDict
 from enum import Enum
-from typing import Any, List, Optional
+from typing import Any, List, Optional, Union
+from pydantic import field_validator
 
 class GenerationStrategy(str, Enum):
     LLM = "LLM"
@@ -10,11 +11,18 @@ class CroppingStrategy(str, Enum):
     ALL = "All"
     AUTO = "Auto"
 
+class EmbedSource(str, Enum):
+    HTML = "HTML"
+    MARKDOWN = "Markdown"
+    LLM = "LLM"
+    CONTENT = "Content"
+
 class GenerationConfig(BaseModel):
     html: Optional[GenerationStrategy] = None
     llm: Optional[str] = None
     markdown: Optional[GenerationStrategy] = None
     crop_image: Optional[CroppingStrategy] = None
+    embed_sources: Optional[List[EmbedSource]] = Field(default_factory=lambda: [EmbedSource.MARKDOWN])
 
 class SegmentProcessing(BaseModel):
     model_config = ConfigDict(populate_by_name=True, alias_generator=str.title)
@@ -32,9 +40,83 @@ class SegmentProcessing(BaseModel):
     text: Optional[GenerationConfig] = Field(default=None, alias="Text")
     title: Optional[GenerationConfig] = Field(default=None, alias="Title")
 
+class Tokenizer(str, Enum):
+    WORD = "Word"
+    CL100K_BASE = "Cl100kBase"
+    XLM_ROBERTA_BASE = "XlmRobertaBase"
+    BERT_BASE_UNCASED = "BertBaseUncased"
+
+class TokenizerType(BaseModel):
+    enum_value: Optional[Tokenizer] = None
+    string_value: Optional[str] = None
+
+    @classmethod
+    def from_enum(cls, enum_value: Tokenizer) -> "TokenizerType":
+        return cls(enum_value=enum_value)
+
+    @classmethod
+    def from_string(cls, string_value: str) -> "TokenizerType":
+        return cls(string_value=string_value)
+
+    def __str__(self) -> str:
+        if self.enum_value is not None:
+            return f"enum:{self.enum_value.value}"
+        elif self.string_value is not None:
+            return f"string:{self.string_value}"
+        return ""
+
+    model_config = ConfigDict(
+        json_encoders={
+            'TokenizerType': lambda v: v.model_dump()
+        }
+    )
+
+    def model_dump(self, **kwargs):
+        if self.enum_value is not None:
+            return {"Enum": self.enum_value.value}
+        elif self.string_value is not None:
+            return {"String": self.string_value}
+        return {}
+
 class ChunkProcessing(BaseModel):
-    ignore_headers_and_footers: Optional[bool] = None
+    ignore_headers_and_footers: Optional[bool] = True
     target_length: Optional[int] = None
+    tokenizer: Optional[Union[TokenizerType, Tokenizer, str]] = None
+
+    model_config = ConfigDict(
+        arbitrary_types_allowed=True,
+        json_encoders={
+            TokenizerType: lambda v: v.model_dump()
+        }
+    )
+
+    @field_validator('tokenizer', mode='before')
+    def validate_tokenizer(cls, v):
+        if v is None:
+            return None
+
+        if isinstance(v, TokenizerType):
+            return v
+
+        if isinstance(v, Tokenizer):
+            return TokenizerType(enum_value=v)
+
+        if isinstance(v, dict):
+            if "Enum" in v:
+                try:
+                    return TokenizerType(enum_value=Tokenizer(v["Enum"]))
+                except ValueError:
+                    return TokenizerType(string_value=v["Enum"])
+            elif "String" in v:
+                return TokenizerType(string_value=v["String"])
+
+        if isinstance(v, str):
+            try:
+                return TokenizerType(enum_value=Tokenizer(v))
+            except ValueError:
+                return TokenizerType(string_value=v)
+
+        raise ValueError(f"Cannot convert {v} to TokenizerType")
 
 class OcrStrategy(str, Enum):
     ALL = "All"
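In practice, the validator above means ChunkProcessing.tokenizer accepts a Tokenizer enum member, a plain string (for example a Hugging Face tokenizer id), an existing TokenizerType, or the already-serialized dict form, and normalizes all of them to a TokenizerType whose model_dump() produces the {"Enum": ...} / {"String": ...} shape sent to the API. A minimal sketch of that behavior, based only on the code in this diff:

from chunkr_ai.models import ChunkProcessing, Tokenizer

# An enum member is wrapped as TokenizerType(enum_value=...)
cp = ChunkProcessing(tokenizer=Tokenizer.CL100K_BASE)
print(cp.tokenizer.model_dump())  # {'Enum': 'Cl100kBase'}

# A string matching a Tokenizer value is coerced to the enum form
cp = ChunkProcessing(tokenizer="Word")
print(cp.tokenizer.model_dump())  # {'Enum': 'Word'}

# Any other string (e.g. a Hugging Face tokenizer id) is kept as a custom string tokenizer
cp = ChunkProcessing(tokenizer="Qwen/Qwen-tokenizer")
print(cp.tokenizer.model_dump())  # {'String': 'Qwen/Qwen-tokenizer'}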
{chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/api/task_response.py
@@ -4,6 +4,7 @@ from pydantic import BaseModel, PrivateAttr
 import asyncio
 import json
 import os
+import httpx
 
 from .configuration import Configuration, OutputConfiguration, OutputResponse, Status
 from .protocol import ChunkrClientProtocol
@@ -51,8 +52,12 @@ class TaskResponse(BaseModel, Generic[T]):
             )
             r.raise_for_status()
             return r.json()
-        except (ConnectionError, TimeoutError, OSError) as e:
-            print(f"Connection error while polling the task: {str(e)}, retrying...")
+        except (ConnectionError, TimeoutError, OSError,
+                httpx.ReadTimeout, httpx.ConnectTimeout,
+                httpx.WriteTimeout, httpx.PoolTimeout,
+                httpx.ConnectError, httpx.ReadError,
+                httpx.NetworkError) as e:
+            print(f"Connection error while polling the task: {str(e)}\nretrying...")
             await asyncio.sleep(0.5)
             return await self._poll_request()
         except Exception as e:
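The polling loop now retries on httpx's transport-level errors as well as the built-in ConnectionError/TimeoutError/OSError. A simplified, standalone sketch of the same retry pattern (the real client retries by re-calling _poll_request; poll_once here is a hypothetical stand-in for one GET of the task URL):

import asyncio
import httpx

# Exceptions treated as transient, mirroring the tuple in the diff above
RETRYABLE = (
    ConnectionError, TimeoutError, OSError,
    httpx.ReadTimeout, httpx.ConnectTimeout,
    httpx.WriteTimeout, httpx.PoolTimeout,
    httpx.ConnectError, httpx.ReadError,
    httpx.NetworkError,
)

async def poll_with_retry(poll_once):
    """poll_once: hypothetical coroutine that performs one GET and returns the parsed JSON."""
    while True:
        try:
            return await poll_once()
        except RETRYABLE as e:
            # Transient network failure: wait briefly, then poll again
            print(f"Connection error while polling the task: {e}\nretrying...")
            await asyncio.sleep(0.5)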
{chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/models.py
@@ -4,6 +4,7 @@ from .api.configuration import (
     ChunkProcessing,
     Configuration,
     CroppingStrategy,
+    EmbedSource,
     GenerationStrategy,
     GenerationConfig,
     Model,
@@ -16,6 +17,8 @@ from .api.configuration import (
     SegmentationStrategy,
     Status,
     Pipeline,
+    Tokenizer,
+    TokenizerType,
 )
 from .api.task_response import TaskResponse
 
@@ -25,6 +28,7 @@ __all__ = [
     "ChunkProcessing",
     "Configuration",
     "CroppingStrategy",
+    "EmbedSource",
     "GenerationConfig",
     "GenerationStrategy",
     "Model",
@@ -38,5 +42,6 @@ __all__ = [
     "Status",
     "TaskResponse",
     "Pipeline",
+    "Tokenizer",
+    "TokenizerType",
 ]
-
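The added names are re-exported through chunkr_ai.models, so (assuming a normal install of 0.0.44) they can be imported alongside the existing configuration classes:

from chunkr_ai.models import ChunkProcessing, EmbedSource, GenerationConfig, Tokenizer, TokenizerType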
{chunkr_ai-0.0.41 → chunkr_ai-0.0.44/src/chunkr_ai.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: chunkr-ai
-Version: 0.0.41
+Version: 0.0.44
 Summary: Python client for Chunkr: open source document intelligence
 Author-email: Ishaan Kapoor <ishaan@lumina.sh>
 License: MIT License
@@ -37,6 +37,7 @@ Requires-Dist: pytest>=7.0.0; extra == "test"
 Requires-Dist: pytest-xdist>=3.0.0; extra == "test"
 Requires-Dist: pytest-asyncio>=0.21.0; extra == "test"
 Requires-Dist: ruff>=0.9.3; extra == "test"
+Dynamic: license-file
 
 # Chunkr Python Client
 
{chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/tests/test_chunkr.py
@@ -15,6 +15,8 @@ from chunkr_ai.models import (
     SegmentProcessing,
     ChunkProcessing,
     TaskResponse,
+    EmbedSource,
+    Tokenizer,
 )
 
 @pytest.fixture
@@ -34,6 +36,90 @@ def client():
     client = Chunkr()
     yield client
 
+@pytest.fixture
+def markdown_embed_config():
+    return Configuration(
+        segment_processing=SegmentProcessing(
+            page=GenerationConfig(
+                html=GenerationStrategy.LLM,
+                markdown=GenerationStrategy.LLM,
+                embed_sources=[EmbedSource.MARKDOWN]
+            )
+        ),
+    )
+
+@pytest.fixture
+def html_embed_config():
+    return Configuration(
+        segment_processing=SegmentProcessing(
+            page=GenerationConfig(
+                html=GenerationStrategy.LLM,
+                markdown=GenerationStrategy.LLM,
+                embed_sources=[EmbedSource.HTML]
+            )
+        ),
+    )
+
+@pytest.fixture
+def multiple_embed_config():
+    return Configuration(
+        segment_processing=SegmentProcessing(
+            page=GenerationConfig(
+                html=GenerationStrategy.LLM,
+                markdown=GenerationStrategy.LLM,
+                llm="Generate a summary of this content",
+                embed_sources=[EmbedSource.MARKDOWN, EmbedSource.LLM, EmbedSource.HTML]
+            )
+        ),
+    )
+
+@pytest.fixture
+def word_tokenizer_string_config():
+    return Configuration(
+        chunk_processing=ChunkProcessing(
+            tokenizer="Word"
+        ),
+    )
+
+@pytest.fixture
+def word_tokenizer_config():
+    return Configuration(
+        chunk_processing=ChunkProcessing(
+            tokenizer=Tokenizer.WORD
+        ),
+    )
+
+@pytest.fixture
+def cl100k_tokenizer_config():
+    return Configuration(
+        chunk_processing=ChunkProcessing(
+            tokenizer=Tokenizer.CL100K_BASE
+        ),
+    )
+
+@pytest.fixture
+def custom_tokenizer_config():
+    return Configuration(
+        chunk_processing=ChunkProcessing(
+            tokenizer="Qwen/Qwen-tokenizer"
+        ),
+    )
+
+@pytest.fixture
+def xlm_roberta_with_html_content_config():
+    return Configuration(
+        chunk_processing=ChunkProcessing(
+            tokenizer=Tokenizer.XLM_ROBERTA_BASE
+        ),
+        segment_processing=SegmentProcessing(
+            page=GenerationConfig(
+                html=GenerationStrategy.LLM,
+                markdown=GenerationStrategy.LLM,
+                embed_sources=[EmbedSource.HTML, EmbedSource.CONTENT]
+            )
+        ),
+    )
+
 @pytest.mark.asyncio
 async def test_send_file_path(client, sample_path):
     response = await client.upload(sample_path)
@@ -241,6 +327,15 @@ async def test_send_base64_file(client, sample_path):
     assert response.status == "Succeeded"
     assert response.output is not None
 
+@pytest.mark.asyncio
+async def test_send_base64_file_with_data_url(client, sample_path):
+    with open(sample_path, "rb") as f:
+        base64_content = base64.b64encode(f.read()).decode('utf-8')
+    response = await client.upload(f"data:application/pdf;base64,{base64_content}")
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+
 @pytest.mark.asyncio
 async def test_send_base64_file_with_filename(client, sample_path):
     # Read file and convert to base64
@@ -289,4 +384,62 @@ async def test_output_files_with_dirs(client, sample_path, tmp_path):
     assert html_file.exists()
     assert md_file.exists()
     assert content_file.exists()
-    assert json_file.exists()
+    assert json_file.exists()
+
+@pytest.mark.asyncio
+async def test_embed_sources_markdown_only(client, sample_path, markdown_embed_config):
+    response = await client.upload(sample_path, markdown_embed_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+    # Check the first chunk to verify embed exists
+    if response.output.chunks:
+        chunk = response.output.chunks[0]
+        assert chunk.embed is not None
+
+@pytest.mark.asyncio
+async def test_embed_sources_html_only(client, sample_path, html_embed_config):
+    response = await client.upload(sample_path, html_embed_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+
+@pytest.mark.asyncio
+async def test_embed_sources_multiple(client, sample_path, multiple_embed_config):
+    response = await client.upload(sample_path, multiple_embed_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+
+@pytest.mark.asyncio
+async def test_tokenizer_word(client, sample_path, word_tokenizer_config):
+    response = await client.upload(sample_path, word_tokenizer_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+    if response.output.chunks:
+        for chunk in response.output.chunks:
+            # Word tokenizer should result in chunks with length close to target
+            assert chunk.chunk_length > 0
+            assert chunk.chunk_length <= 600  # Allow some flexibility
+
+@pytest.mark.asyncio
+async def test_tokenizer_cl100k(client, sample_path, cl100k_tokenizer_config):
+    response = await client.upload(sample_path, cl100k_tokenizer_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+
+@pytest.mark.asyncio
+async def test_tokenizer_custom_string(client, sample_path, custom_tokenizer_config):
+    response = await client.upload(sample_path, custom_tokenizer_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+
+@pytest.mark.asyncio
+async def test_embed_sources_with_different_tokenizer(client, sample_path, xlm_roberta_with_html_content_config):
+    response = await client.upload(sample_path, xlm_roberta_with_html_content_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
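Taken together, the new fixtures above suggest the following end-to-end usage. This is a hedged sketch mirroring the test configurations, assuming the top-level Chunkr client import used elsewhere in the test suite; the PDF path is a placeholder:

import asyncio
from chunkr_ai import Chunkr  # assumed top-level client import, as used by the test suite
from chunkr_ai.models import (
    ChunkProcessing,
    Configuration,
    EmbedSource,
    GenerationConfig,
    GenerationStrategy,
    SegmentProcessing,
    Tokenizer,
)

async def main():
    chunkr = Chunkr()  # constructed with no arguments, like the client fixture above
    config = Configuration(
        chunk_processing=ChunkProcessing(tokenizer=Tokenizer.XLM_ROBERTA_BASE),
        segment_processing=SegmentProcessing(
            page=GenerationConfig(
                html=GenerationStrategy.LLM,
                markdown=GenerationStrategy.LLM,
                embed_sources=[EmbedSource.HTML, EmbedSource.CONTENT],
            )
        ),
    )
    task = await chunkr.upload("path/to/document.pdf", config)  # placeholder path
    print(task.status)

asyncio.run(main())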