chunkr-ai 0.0.49__tar.gz → 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {chunkr_ai-0.0.49/src/chunkr_ai.egg-info → chunkr_ai-0.1.0}/PKG-INFO +1 -1
- {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/pyproject.toml +1 -1
- {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/api/configuration.py +19 -11
- {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/models.py +2 -0
- {chunkr_ai-0.0.49 → chunkr_ai-0.1.0/src/chunkr_ai.egg-info}/PKG-INFO +1 -1
- {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/tests/test_chunkr.py +24 -21
- {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/LICENSE +0 -0
- {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/README.md +0 -0
- {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/setup.cfg +0 -0
- {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/__init__.py +0 -0
- {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/api/__init__.py +0 -0
- {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/api/auth.py +0 -0
- {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/api/chunkr.py +0 -0
- {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/api/chunkr_base.py +0 -0
- {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/api/decorators.py +0 -0
- {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/api/misc.py +0 -0
- {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/api/protocol.py +0 -0
- {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/api/task_response.py +0 -0
- {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai.egg-info/SOURCES.txt +0 -0
- {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
- {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai.egg-info/requires.txt +0 -0
- {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai.egg-info/top_level.txt +0 -0
- {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/tests/test_file_handling.py +0 -0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "chunkr-ai"
|
7
|
-
version = "0.0.49"
|
7
|
+
version = "0.1.0"
|
8
8
|
authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
|
9
9
|
description = "Python client for Chunkr: open source document intelligence"
|
10
10
|
readme = "README.md"
|
@@ -3,27 +3,34 @@ from enum import Enum
|
|
3
3
|
from typing import Any, List, Optional, Union
|
4
4
|
from pydantic import field_validator, field_serializer
|
5
5
|
|
6
|
-
class GenerationStrategy(str, Enum):
|
7
|
-
LLM = "LLM"
|
8
|
-
AUTO = "Auto"
|
9
|
-
|
10
6
|
class CroppingStrategy(str, Enum):
|
11
7
|
ALL = "All"
|
12
8
|
AUTO = "Auto"
|
13
9
|
|
14
|
-
class EmbedSource(str, Enum):
|
15
|
-
HTML = "HTML"
|
10
|
+
class SegmentFormat(str, Enum):
|
11
|
+
HTML = "Html"
|
16
12
|
MARKDOWN = "Markdown"
|
17
|
-
|
13
|
+
|
14
|
+
class EmbedSource(str, Enum):
|
18
15
|
CONTENT = "Content"
|
16
|
+
HTML = "HTML" # Deprecated
|
17
|
+
MARKDOWN = "Markdown" # Deprecated
|
18
|
+
LLM = "LLM"
|
19
|
+
|
20
|
+
class GenerationStrategy(str, Enum):
|
21
|
+
LLM = "LLM"
|
22
|
+
AUTO = "Auto"
|
19
23
|
|
20
24
|
class GenerationConfig(BaseModel):
|
21
|
-
|
25
|
+
format: Optional[SegmentFormat] = None
|
26
|
+
strategy: Optional[GenerationStrategy] = None
|
22
27
|
llm: Optional[str] = None
|
23
|
-
markdown: Optional[GenerationStrategy] = None
|
24
28
|
crop_image: Optional[CroppingStrategy] = None
|
25
|
-
embed_sources: Optional[List[EmbedSource]] =
|
29
|
+
embed_sources: Optional[List[EmbedSource]] = None
|
26
30
|
extended_context: Optional[bool] = None
|
31
|
+
# Deprecated fields for backwards compatibility
|
32
|
+
html: Optional[GenerationStrategy] = None # Deprecated: Use format=SegmentFormat.HTML and strategy instead
|
33
|
+
markdown: Optional[GenerationStrategy] = None # Deprecated: Use format=SegmentFormat.MARKDOWN and strategy instead
|
27
34
|
|
28
35
|
class SegmentProcessing(BaseModel):
|
29
36
|
model_config = ConfigDict(populate_by_name=True, alias_generator=str.title)
|
@@ -246,7 +253,7 @@ class SegmentType(str, Enum):
|
|
246
253
|
|
247
254
|
class Segment(BaseModel):
|
248
255
|
bbox: BoundingBox
|
249
|
-
content: str
|
256
|
+
content: str = ""
|
250
257
|
page_height: float
|
251
258
|
llm: Optional[str] = None
|
252
259
|
html: Optional[str] = None
|
@@ -258,6 +265,7 @@ class Segment(BaseModel):
|
|
258
265
|
segment_id: str
|
259
266
|
segment_type: SegmentType
|
260
267
|
confidence: Optional[float]
|
268
|
+
text: str = ""
|
261
269
|
|
262
270
|
class Chunk(BaseModel):
|
263
271
|
chunk_id: str
|
@@ -15,6 +15,7 @@ from .api.configuration import (
|
|
15
15
|
OcrStrategy,
|
16
16
|
OutputResponse,
|
17
17
|
Segment,
|
18
|
+
SegmentFormat,
|
18
19
|
SegmentProcessing,
|
19
20
|
SegmentType,
|
20
21
|
SegmentationStrategy,
|
@@ -42,6 +43,7 @@ __all__ = [
|
|
42
43
|
"OcrStrategy",
|
43
44
|
"OutputResponse",
|
44
45
|
"Segment",
|
46
|
+
"SegmentFormat",
|
45
47
|
"SegmentProcessing",
|
46
48
|
"SegmentType",
|
47
49
|
"SegmentationStrategy",
|
@@ -2,7 +2,6 @@ import pytest
|
|
2
2
|
from pathlib import Path
|
3
3
|
from PIL import Image
|
4
4
|
import asyncio
|
5
|
-
from typing import Awaitable
|
6
5
|
|
7
6
|
from chunkr_ai import Chunkr
|
8
7
|
from chunkr_ai.models import (
|
@@ -21,6 +20,7 @@ from chunkr_ai.models import (
|
|
21
20
|
Status,
|
22
21
|
TaskResponse,
|
23
22
|
Tokenizer,
|
23
|
+
SegmentFormat,
|
24
24
|
)
|
25
25
|
|
26
26
|
@pytest.fixture
|
@@ -53,9 +53,9 @@ def markdown_embed_config():
|
|
53
53
|
return Configuration(
|
54
54
|
segment_processing=SegmentProcessing(
|
55
55
|
Page=GenerationConfig(
|
56
|
-
|
57
|
-
|
58
|
-
embed_sources=[EmbedSource.
|
56
|
+
format=SegmentFormat.MARKDOWN,
|
57
|
+
strategy=GenerationStrategy.LLM,
|
58
|
+
embed_sources=[EmbedSource.CONTENT]
|
59
59
|
)
|
60
60
|
),
|
61
61
|
)
|
@@ -65,9 +65,9 @@ def html_embed_config():
|
|
65
65
|
return Configuration(
|
66
66
|
segment_processing=SegmentProcessing(
|
67
67
|
Page=GenerationConfig(
|
68
|
-
|
69
|
-
|
70
|
-
embed_sources=[EmbedSource.HTML]
|
68
|
+
format=SegmentFormat.HTML,
|
69
|
+
strategy=GenerationStrategy.LLM,
|
70
|
+
embed_sources=[EmbedSource.HTML] # Keep this for backwards compatibility testing
|
71
71
|
)
|
72
72
|
),
|
73
73
|
)
|
@@ -77,10 +77,10 @@ def multiple_embed_config():
|
|
77
77
|
return Configuration(
|
78
78
|
segment_processing=SegmentProcessing(
|
79
79
|
Page=GenerationConfig(
|
80
|
-
|
81
|
-
|
80
|
+
format=SegmentFormat.MARKDOWN,
|
81
|
+
strategy=GenerationStrategy.LLM,
|
82
82
|
llm="Generate a summary of this content",
|
83
|
-
embed_sources=[EmbedSource.
|
83
|
+
embed_sources=[EmbedSource.CONTENT, EmbedSource.LLM, EmbedSource.HTML]
|
84
84
|
)
|
85
85
|
),
|
86
86
|
)
|
@@ -169,13 +169,15 @@ def model_fallback_config():
|
|
169
169
|
def extended_context_config():
|
170
170
|
return Configuration(
|
171
171
|
segment_processing=SegmentProcessing(
|
172
|
-
|
172
|
+
Picture=GenerationConfig(
|
173
173
|
extended_context=True,
|
174
|
-
|
174
|
+
format=SegmentFormat.HTML,
|
175
|
+
strategy=GenerationStrategy.LLM,
|
175
176
|
),
|
176
|
-
|
177
|
+
Table=GenerationConfig(
|
177
178
|
extended_context=True,
|
178
|
-
|
179
|
+
format=SegmentFormat.HTML,
|
180
|
+
strategy=GenerationStrategy.LLM,
|
179
181
|
)
|
180
182
|
),
|
181
183
|
)
|
@@ -315,7 +317,8 @@ async def test_update_task_direct(client, sample_path):
|
|
315
317
|
segmentation_strategy=SegmentationStrategy.PAGE,
|
316
318
|
)
|
317
319
|
task = await client.upload(sample_path, original_config)
|
318
|
-
task = await task.update(new_config)
|
320
|
+
task = await (await task.update(new_config))
|
321
|
+
assert isinstance(task, TaskResponse)
|
319
322
|
assert task.status == "Succeeded"
|
320
323
|
assert task.output is not None
|
321
324
|
assert task.configuration.segmentation_strategy == SegmentationStrategy.PAGE
|
@@ -470,12 +473,12 @@ async def test_tokenizer_custom_string(client, sample_path, custom_tokenizer_con
|
|
470
473
|
assert response.status == "Succeeded"
|
471
474
|
assert response.output is not None
|
472
475
|
|
473
|
-
@pytest.mark.asyncio
|
474
|
-
async def test_embed_sources_with_different_tokenizer(client, sample_path, xlm_roberta_with_html_content_config):
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
476
|
+
# @pytest.mark.asyncio
|
477
|
+
# async def test_embed_sources_with_different_tokenizer(client, sample_path, xlm_roberta_with_html_content_config):
|
478
|
+
# response = await client.upload(sample_path, xlm_roberta_with_html_content_config)
|
479
|
+
# assert response.task_id is not None
|
480
|
+
# assert response.status == "Succeeded"
|
481
|
+
# assert response.output is not None
|
479
482
|
|
480
483
|
@pytest.mark.asyncio
|
481
484
|
async def test_error_handling_continue(client, sample_path):
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|