chunkr-ai 0.0.41__tar.gz → 0.0.44__tar.gz
This diff shows the changes between two package versions as published to a supported public registry. It is provided for informational purposes only and reflects the packages as they appear in that registry.
- {chunkr_ai-0.0.41/src/chunkr_ai.egg-info → chunkr_ai-0.0.44}/PKG-INFO +3 -2
- {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/pyproject.toml +1 -1
- {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/api/configuration.py +84 -2
- {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/api/task_response.py +7 -2
- {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/models.py +6 -1
- {chunkr_ai-0.0.41 → chunkr_ai-0.0.44/src/chunkr_ai.egg-info}/PKG-INFO +3 -2
- {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/tests/test_chunkr.py +154 -1
- {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/LICENSE +0 -0
- {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/README.md +0 -0
- {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/setup.cfg +0 -0
- {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/__init__.py +0 -0
- {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/api/__init__.py +0 -0
- {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/api/auth.py +0 -0
- {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/api/chunkr.py +0 -0
- {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/api/chunkr_base.py +0 -0
- {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/api/decorators.py +0 -0
- {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/api/misc.py +0 -0
- {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai/api/protocol.py +0 -0
- {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai.egg-info/SOURCES.txt +0 -0
- {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
- {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai.egg-info/requires.txt +0 -0
- {chunkr_ai-0.0.41 → chunkr_ai-0.0.44}/src/chunkr_ai.egg-info/top_level.txt +0 -0
```diff
--- chunkr_ai-0.0.41/src/chunkr_ai.egg-info/PKG-INFO
+++ chunkr_ai-0.0.44/PKG-INFO
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: chunkr-ai
-Version: 0.0.41
+Version: 0.0.44
 Summary: Python client for Chunkr: open source document intelligence
 Author-email: Ishaan Kapoor <ishaan@lumina.sh>
 License: MIT License
@@ -37,6 +37,7 @@ Requires-Dist: pytest>=7.0.0; extra == "test"
 Requires-Dist: pytest-xdist>=3.0.0; extra == "test"
 Requires-Dist: pytest-asyncio>=0.21.0; extra == "test"
 Requires-Dist: ruff>=0.9.3; extra == "test"
+Dynamic: license-file
 
 # Chunkr Python Client
 
```
```diff
--- chunkr_ai-0.0.41/pyproject.toml
+++ chunkr_ai-0.0.44/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "chunkr-ai"
-version = "0.0.41"
+version = "0.0.44"
 authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
 description = "Python client for Chunkr: open source document intelligence"
 readme = "README.md"
```
```diff
--- chunkr_ai-0.0.41/src/chunkr_ai/api/configuration.py
+++ chunkr_ai-0.0.44/src/chunkr_ai/api/configuration.py
@@ -1,6 +1,7 @@
 from pydantic import BaseModel, Field, ConfigDict
 from enum import Enum
-from typing import Any, List, Optional
+from typing import Any, List, Optional, Union
+from pydantic import field_validator
 
 class GenerationStrategy(str, Enum):
     LLM = "LLM"
@@ -10,11 +11,18 @@ class CroppingStrategy(str, Enum):
     ALL = "All"
     AUTO = "Auto"
 
+class EmbedSource(str, Enum):
+    HTML = "HTML"
+    MARKDOWN = "Markdown"
+    LLM = "LLM"
+    CONTENT = "Content"
+
 class GenerationConfig(BaseModel):
     html: Optional[GenerationStrategy] = None
     llm: Optional[str] = None
     markdown: Optional[GenerationStrategy] = None
     crop_image: Optional[CroppingStrategy] = None
+    embed_sources: Optional[List[EmbedSource]] = Field(default_factory=lambda: [EmbedSource.MARKDOWN])
 
 class SegmentProcessing(BaseModel):
     model_config = ConfigDict(populate_by_name=True, alias_generator=str.title)
@@ -32,9 +40,83 @@ class SegmentProcessing(BaseModel):
     text: Optional[GenerationConfig] = Field(default=None, alias="Text")
     title: Optional[GenerationConfig] = Field(default=None, alias="Title")
 
+class Tokenizer(str, Enum):
+    WORD = "Word"
+    CL100K_BASE = "Cl100kBase"
+    XLM_ROBERTA_BASE = "XlmRobertaBase"
+    BERT_BASE_UNCASED = "BertBaseUncased"
+
+class TokenizerType(BaseModel):
+    enum_value: Optional[Tokenizer] = None
+    string_value: Optional[str] = None
+
+    @classmethod
+    def from_enum(cls, enum_value: Tokenizer) -> "TokenizerType":
+        return cls(enum_value=enum_value)
+
+    @classmethod
+    def from_string(cls, string_value: str) -> "TokenizerType":
+        return cls(string_value=string_value)
+
+    def __str__(self) -> str:
+        if self.enum_value is not None:
+            return f"enum:{self.enum_value.value}"
+        elif self.string_value is not None:
+            return f"string:{self.string_value}"
+        return ""
+
+    model_config = ConfigDict(
+        json_encoders={
+            'TokenizerType': lambda v: v.model_dump()
+        }
+    )
+
+    def model_dump(self, **kwargs):
+        if self.enum_value is not None:
+            return {"Enum": self.enum_value.value}
+        elif self.string_value is not None:
+            return {"String": self.string_value}
+        return {}
+
 class ChunkProcessing(BaseModel):
-    ignore_headers_and_footers: Optional[bool] =
+    ignore_headers_and_footers: Optional[bool] = True
     target_length: Optional[int] = None
+    tokenizer: Optional[Union[TokenizerType, Tokenizer, str]] = None
+
+    model_config = ConfigDict(
+        arbitrary_types_allowed=True,
+        json_encoders={
+            TokenizerType: lambda v: v.model_dump()
+        }
+    )
+
+    @field_validator('tokenizer', mode='before')
+    def validate_tokenizer(cls, v):
+        if v is None:
+            return None
+
+        if isinstance(v, TokenizerType):
+            return v
+
+        if isinstance(v, Tokenizer):
+            return TokenizerType(enum_value=v)
+
+        if isinstance(v, dict):
+            if "Enum" in v:
+                try:
+                    return TokenizerType(enum_value=Tokenizer(v["Enum"]))
+                except ValueError:
+                    return TokenizerType(string_value=v["Enum"])
+            elif "String" in v:
+                return TokenizerType(string_value=v["String"])
+
+        if isinstance(v, str):
+            try:
+                return TokenizerType(enum_value=Tokenizer(v))
+            except ValueError:
+                return TokenizerType(string_value=v)
+
+        raise ValueError(f"Cannot convert {v} to TokenizerType")
 
 class OcrStrategy(str, Enum):
     ALL = "All"
```
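Taken together, `ChunkProcessing.tokenizer` now accepts a `Tokenizer` enum member, a plain string, or an already-serialized dict, and the `mode='before'` validator normalizes all of them to a `TokenizerType` whose overridden `model_dump` emits the tagged `{"Enum": ...}` / `{"String": ...}` shape (embed_sources likewise defaults to `[EmbedSource.MARKDOWN]`). A minimal sketch of that round trip, assuming chunkr-ai 0.0.44 is installed:

```python
# Sketch: how ChunkProcessing.tokenizer normalizes its inputs
# (assumes chunkr-ai >= 0.0.44; all names come from the diff above).
from chunkr_ai.models import ChunkProcessing, Tokenizer, TokenizerType

a = ChunkProcessing(tokenizer=Tokenizer.CL100K_BASE)   # enum member
b = ChunkProcessing(tokenizer="Word")                  # string matching an enum value
c = ChunkProcessing(tokenizer="Qwen/Qwen-tokenizer")   # any other string falls back to string_value

# All three are coerced to TokenizerType by the mode="before" validator.
assert isinstance(a.tokenizer, TokenizerType)
print(a.tokenizer.model_dump())  # {'Enum': 'Cl100kBase'}
print(b.tokenizer.model_dump())  # {'Enum': 'Word'}
print(c.tokenizer.model_dump())  # {'String': 'Qwen/Qwen-tokenizer'}
```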
```diff
--- chunkr_ai-0.0.41/src/chunkr_ai/api/task_response.py
+++ chunkr_ai-0.0.44/src/chunkr_ai/api/task_response.py
@@ -4,6 +4,7 @@ from pydantic import BaseModel, PrivateAttr
 import asyncio
 import json
 import os
+import httpx
 
 from .configuration import Configuration, OutputConfiguration, OutputResponse, Status
 from .protocol import ChunkrClientProtocol
@@ -51,8 +52,12 @@ class TaskResponse(BaseModel, Generic[T]):
             )
             r.raise_for_status()
             return r.json()
-        except (ConnectionError, TimeoutError, OSError
-
+        except (ConnectionError, TimeoutError, OSError,
+                httpx.ReadTimeout, httpx.ConnectTimeout,
+                httpx.WriteTimeout, httpx.PoolTimeout,
+                httpx.ConnectError, httpx.ReadError,
+                httpx.NetworkError) as e:
+            print(f"Connection error while polling the task: {str(e)}\nretrying...")
             await asyncio.sleep(0.5)
             return await self._poll_request()
         except Exception as e:
```
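For context, the httpx exceptions added here sit in a small hierarchy: the four timeout classes subclass `httpx.TimeoutException`, and `httpx.ConnectError`/`httpx.ReadError` subclass `httpx.NetworkError`. A condensed sketch of an equivalent catch tuple (an illustration, not the package's code):

```python
# Illustration: httpx.TimeoutException covers Connect/Read/Write/PoolTimeout,
# and httpx.NetworkError covers ConnectError/ReadError (among others), so this
# tuple catches at least everything the diff enumerates explicitly.
import httpx

TRANSIENT_ERRORS = (
    ConnectionError,
    TimeoutError,
    OSError,
    httpx.TimeoutException,
    httpx.NetworkError,
)

try:
    raise httpx.ReadTimeout("simulated slow response")
except TRANSIENT_ERRORS as e:
    print(f"would retry after transient error: {e!r}")
```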
```diff
--- chunkr_ai-0.0.41/src/chunkr_ai/models.py
+++ chunkr_ai-0.0.44/src/chunkr_ai/models.py
@@ -4,6 +4,7 @@ from .api.configuration import (
     ChunkProcessing,
     Configuration,
     CroppingStrategy,
+    EmbedSource,
     GenerationStrategy,
     GenerationConfig,
     Model,
@@ -16,6 +17,8 @@ from .api.configuration import (
     SegmentationStrategy,
     Status,
     Pipeline,
+    Tokenizer,
+    TokenizerType,
 )
 from .api.task_response import TaskResponse
 
@@ -25,6 +28,7 @@ __all__ = [
     "ChunkProcessing",
     "Configuration",
     "CroppingStrategy",
+    "EmbedSource",
     "GenerationConfig",
     "GenerationStrategy",
     "Model",
@@ -38,5 +42,6 @@ __all__ = [
     "Status",
     "TaskResponse",
     "Pipeline",
+    "Tokenizer",
+    "TokenizerType",
 ]
-
```
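With these re-exports, the new types are importable from `chunkr_ai.models` directly; a quick sanity check, assuming the 0.0.44 package:

```python
# Quick check that the new names round-trip through chunkr_ai.models.
from chunkr_ai.models import EmbedSource, Tokenizer, TokenizerType

print([s.value for s in EmbedSource])  # ['HTML', 'Markdown', 'LLM', 'Content']
print([t.value for t in Tokenizer])    # ['Word', 'Cl100kBase', 'XlmRobertaBase', 'BertBaseUncased']
```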
```diff
--- chunkr_ai-0.0.41/PKG-INFO
+++ chunkr_ai-0.0.44/src/chunkr_ai.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: chunkr-ai
-Version: 0.0.41
+Version: 0.0.44
 Summary: Python client for Chunkr: open source document intelligence
 Author-email: Ishaan Kapoor <ishaan@lumina.sh>
 License: MIT License
@@ -37,6 +37,7 @@ Requires-Dist: pytest>=7.0.0; extra == "test"
 Requires-Dist: pytest-xdist>=3.0.0; extra == "test"
 Requires-Dist: pytest-asyncio>=0.21.0; extra == "test"
 Requires-Dist: ruff>=0.9.3; extra == "test"
+Dynamic: license-file
 
 # Chunkr Python Client
 
```
```diff
--- chunkr_ai-0.0.41/tests/test_chunkr.py
+++ chunkr_ai-0.0.44/tests/test_chunkr.py
@@ -15,6 +15,8 @@ from chunkr_ai.models import (
     SegmentProcessing,
     ChunkProcessing,
     TaskResponse,
+    EmbedSource,
+    Tokenizer,
 )
 
 @pytest.fixture
@@ -34,6 +36,90 @@ def client():
     client = Chunkr()
     yield client
 
+@pytest.fixture
+def markdown_embed_config():
+    return Configuration(
+        segment_processing=SegmentProcessing(
+            page=GenerationConfig(
+                html=GenerationStrategy.LLM,
+                markdown=GenerationStrategy.LLM,
+                embed_sources=[EmbedSource.MARKDOWN]
+            )
+        ),
+    )
+
+@pytest.fixture
+def html_embed_config():
+    return Configuration(
+        segment_processing=SegmentProcessing(
+            page=GenerationConfig(
+                html=GenerationStrategy.LLM,
+                markdown=GenerationStrategy.LLM,
+                embed_sources=[EmbedSource.HTML]
+            )
+        ),
+    )
+
+@pytest.fixture
+def multiple_embed_config():
+    return Configuration(
+        segment_processing=SegmentProcessing(
+            page=GenerationConfig(
+                html=GenerationStrategy.LLM,
+                markdown=GenerationStrategy.LLM,
+                llm="Generate a summary of this content",
+                embed_sources=[EmbedSource.MARKDOWN, EmbedSource.LLM, EmbedSource.HTML]
+            )
+        ),
+    )
+
+@pytest.fixture
+def word_tokenizer_string_config():
+    return Configuration(
+        chunk_processing=ChunkProcessing(
+            tokenizer="Word"
+        ),
+    )
+
+@pytest.fixture
+def word_tokenizer_config():
+    return Configuration(
+        chunk_processing=ChunkProcessing(
+            tokenizer=Tokenizer.WORD
+        ),
+    )
+
+@pytest.fixture
+def cl100k_tokenizer_config():
+    return Configuration(
+        chunk_processing=ChunkProcessing(
+            tokenizer=Tokenizer.CL100K_BASE
+        ),
+    )
+
+@pytest.fixture
+def custom_tokenizer_config():
+    return Configuration(
+        chunk_processing=ChunkProcessing(
+            tokenizer="Qwen/Qwen-tokenizer"
+        ),
+    )
+
+@pytest.fixture
+def xlm_roberta_with_html_content_config():
+    return Configuration(
+        chunk_processing=ChunkProcessing(
+            tokenizer=Tokenizer.XLM_ROBERTA_BASE
+        ),
+        segment_processing=SegmentProcessing(
+            page=GenerationConfig(
+                html=GenerationStrategy.LLM,
+                markdown=GenerationStrategy.LLM,
+                embed_sources=[EmbedSource.HTML, EmbedSource.CONTENT]
+            )
+        ),
+    )
+
 @pytest.mark.asyncio
 async def test_send_file_path(client, sample_path):
     response = await client.upload(sample_path)
@@ -241,6 +327,15 @@ async def test_send_base64_file(client, sample_path):
     assert response.status == "Succeeded"
     assert response.output is not None
 
+@pytest.mark.asyncio
+async def test_send_base64_file_with_data_url(client, sample_path):
+    with open(sample_path, "rb") as f:
+        base64_content = base64.b64encode(f.read()).decode('utf-8')
+    response = await client.upload(f"data:application/pdf;base64,{base64_content}")
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+
 @pytest.mark.asyncio
 async def test_send_base64_file_with_filename(client, sample_path):
     # Read file and convert to base64
@@ -289,4 +384,62 @@ async def test_output_files_with_dirs(client, sample_path, tmp_path):
     assert html_file.exists()
     assert md_file.exists()
     assert content_file.exists()
-    assert json_file.exists()
+    assert json_file.exists()
+
+@pytest.mark.asyncio
+async def test_embed_sources_markdown_only(client, sample_path, markdown_embed_config):
+    response = await client.upload(sample_path, markdown_embed_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+    # Check the first chunk to verify embed exists
+    if response.output.chunks:
+        chunk = response.output.chunks[0]
+        assert chunk.embed is not None
+
+@pytest.mark.asyncio
+async def test_embed_sources_html_only(client, sample_path, html_embed_config):
+    response = await client.upload(sample_path, html_embed_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+
+@pytest.mark.asyncio
+async def test_embed_sources_multiple(client, sample_path, multiple_embed_config):
+    response = await client.upload(sample_path, multiple_embed_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+
+@pytest.mark.asyncio
+async def test_tokenizer_word(client, sample_path, word_tokenizer_config):
+    response = await client.upload(sample_path, word_tokenizer_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+    if response.output.chunks:
+        for chunk in response.output.chunks:
+            # Word tokenizer should result in chunks with length close to target
+            assert chunk.chunk_length > 0
+            assert chunk.chunk_length <= 600  # Allow some flexibility
+
+@pytest.mark.asyncio
+async def test_tokenizer_cl100k(client, sample_path, cl100k_tokenizer_config):
+    response = await client.upload(sample_path, cl100k_tokenizer_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+
+@pytest.mark.asyncio
+async def test_tokenizer_custom_string(client, sample_path, custom_tokenizer_config):
+    response = await client.upload(sample_path, custom_tokenizer_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
+
+@pytest.mark.asyncio
+async def test_embed_sources_with_different_tokenizer(client, sample_path, xlm_roberta_with_html_content_config):
+    response = await client.upload(sample_path, xlm_roberta_with_html_content_config)
+    assert response.task_id is not None
+    assert response.status == "Succeeded"
+    assert response.output is not None
```
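The new `test_send_base64_file_with_data_url` covers uploads passed as a `data:` URL rather than a path or file handle. A standalone sketch of that call path, assuming a local `sample.pdf` (a hypothetical file) and an API key configured the way the test suite's `Chunkr()` fixture expects (typically the `CHUNKR_API_KEY` environment variable):

```python
# Sketch mirroring test_send_base64_file_with_data_url: upload a PDF as a
# base64 data URL. Assumes chunkr_ai.Chunkr is the async client used by the
# tests and that the API key is supplied via the environment.
import asyncio
import base64

from chunkr_ai import Chunkr

async def main() -> None:
    with open("sample.pdf", "rb") as f:  # hypothetical local file
        encoded = base64.b64encode(f.read()).decode("utf-8")
    chunkr = Chunkr()
    task = await chunkr.upload(f"data:application/pdf;base64,{encoded}")
    print(task.task_id, task.status)

asyncio.run(main())
```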
|