chunkr-ai 0.0.50__tar.gz → 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {chunkr_ai-0.0.50/src/chunkr_ai.egg-info → chunkr_ai-0.1.0}/PKG-INFO +1 -1
- {chunkr_ai-0.0.50 → chunkr_ai-0.1.0}/pyproject.toml +1 -1
- {chunkr_ai-0.0.50 → chunkr_ai-0.1.0}/src/chunkr_ai/api/configuration.py +19 -11
- {chunkr_ai-0.0.50 → chunkr_ai-0.1.0}/src/chunkr_ai/models.py +2 -0
- {chunkr_ai-0.0.50 → chunkr_ai-0.1.0/src/chunkr_ai.egg-info}/PKG-INFO +1 -1
- {chunkr_ai-0.0.50 → chunkr_ai-0.1.0}/tests/test_chunkr.py +22 -20
- {chunkr_ai-0.0.50 → chunkr_ai-0.1.0}/LICENSE +0 -0
- {chunkr_ai-0.0.50 → chunkr_ai-0.1.0}/README.md +0 -0
- {chunkr_ai-0.0.50 → chunkr_ai-0.1.0}/setup.cfg +0 -0
- {chunkr_ai-0.0.50 → chunkr_ai-0.1.0}/src/chunkr_ai/__init__.py +0 -0
- {chunkr_ai-0.0.50 → chunkr_ai-0.1.0}/src/chunkr_ai/api/__init__.py +0 -0
- {chunkr_ai-0.0.50 → chunkr_ai-0.1.0}/src/chunkr_ai/api/auth.py +0 -0
- {chunkr_ai-0.0.50 → chunkr_ai-0.1.0}/src/chunkr_ai/api/chunkr.py +0 -0
- {chunkr_ai-0.0.50 → chunkr_ai-0.1.0}/src/chunkr_ai/api/chunkr_base.py +0 -0
- {chunkr_ai-0.0.50 → chunkr_ai-0.1.0}/src/chunkr_ai/api/decorators.py +0 -0
- {chunkr_ai-0.0.50 → chunkr_ai-0.1.0}/src/chunkr_ai/api/misc.py +0 -0
- {chunkr_ai-0.0.50 → chunkr_ai-0.1.0}/src/chunkr_ai/api/protocol.py +0 -0
- {chunkr_ai-0.0.50 → chunkr_ai-0.1.0}/src/chunkr_ai/api/task_response.py +0 -0
- {chunkr_ai-0.0.50 → chunkr_ai-0.1.0}/src/chunkr_ai.egg-info/SOURCES.txt +0 -0
- {chunkr_ai-0.0.50 → chunkr_ai-0.1.0}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
- {chunkr_ai-0.0.50 → chunkr_ai-0.1.0}/src/chunkr_ai.egg-info/requires.txt +0 -0
- {chunkr_ai-0.0.50 → chunkr_ai-0.1.0}/src/chunkr_ai.egg-info/top_level.txt +0 -0
- {chunkr_ai-0.0.50 → chunkr_ai-0.1.0}/tests/test_file_handling.py +0 -0
{chunkr_ai-0.0.50 → chunkr_ai-0.1.0}/pyproject.toml

```diff
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "chunkr-ai"
-version = "0.0.50"
+version = "0.1.0"
 authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
 description = "Python client for Chunkr: open source document intelligence"
 readme = "README.md"
```
{chunkr_ai-0.0.50 → chunkr_ai-0.1.0}/src/chunkr_ai/api/configuration.py

```diff
@@ -3,27 +3,34 @@ from enum import Enum
 from typing import Any, List, Optional, Union
 from pydantic import field_validator, field_serializer
 
-class GenerationStrategy(str, Enum):
-    LLM = "LLM"
-    AUTO = "Auto"
-
 class CroppingStrategy(str, Enum):
     ALL = "All"
     AUTO = "Auto"
 
-class EmbedSource(str, Enum):
-    HTML = "HTML"
+class SegmentFormat(str, Enum):
+    HTML = "Html"
     MARKDOWN = "Markdown"
-    LLM = "LLM"
+
+class EmbedSource(str, Enum):
     CONTENT = "Content"
+    HTML = "HTML"  # Deprecated
+    MARKDOWN = "Markdown"  # Deprecated
+    LLM = "LLM"
+
+class GenerationStrategy(str, Enum):
+    LLM = "LLM"
+    AUTO = "Auto"
 
 class GenerationConfig(BaseModel):
-    html: Optional[GenerationStrategy] = None
+    format: Optional[SegmentFormat] = None
+    strategy: Optional[GenerationStrategy] = None
     llm: Optional[str] = None
-    markdown: Optional[GenerationStrategy] = None
     crop_image: Optional[CroppingStrategy] = None
-    embed_sources: Optional[List[EmbedSource]] =
+    embed_sources: Optional[List[EmbedSource]] = None
    extended_context: Optional[bool] = None
+    # Deprecated fields for backwards compatibility
+    html: Optional[GenerationStrategy] = None  # Deprecated: Use format=SegmentFormat.HTML and strategy instead
+    markdown: Optional[GenerationStrategy] = None  # Deprecated: Use format=SegmentFormat.MARKDOWN and strategy instead
 
 class SegmentProcessing(BaseModel):
     model_config = ConfigDict(populate_by_name=True, alias_generator=str.title)
```
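In practice, the new configuration surface from this hunk can be exercised as below. This is a minimal sketch based only on the classes visible in this diff; it assumes `Configuration`, `SegmentProcessing`, and the enums are all importable from `chunkr_ai.models`, as the test changes further down suggest.

```python
from chunkr_ai.models import (
    Configuration,
    EmbedSource,
    GenerationConfig,
    GenerationStrategy,
    SegmentFormat,
    SegmentProcessing,
)

# New style (0.1.0): choose the output format and generation strategy explicitly.
new_style = Configuration(
    segment_processing=SegmentProcessing(
        Page=GenerationConfig(
            format=SegmentFormat.MARKDOWN,
            strategy=GenerationStrategy.LLM,
            embed_sources=[EmbedSource.CONTENT],
        )
    )
)

# Old style (0.0.x): the deprecated `markdown`/`html` fields are kept for
# backwards compatibility, per the comments added in this hunk.
old_style = Configuration(
    segment_processing=SegmentProcessing(
        Page=GenerationConfig(
            markdown=GenerationStrategy.LLM,
        )
    )
)
```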
```diff
@@ -246,7 +253,7 @@ class SegmentType(str, Enum):
 
 class Segment(BaseModel):
     bbox: BoundingBox
-    content: str
+    content: str = ""
     page_height: float
     llm: Optional[str] = None
     html: Optional[str] = None
```
```diff
@@ -258,6 +265,7 @@ class Segment(BaseModel):
     segment_id: str
     segment_type: SegmentType
     confidence: Optional[float]
+    text: str = ""
 
 class Chunk(BaseModel):
     chunk_id: str
```
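Since `content` and `text` now default to empty strings, callers should not assume either field is populated for every segment. A small helper along these lines (the function name is hypothetical, not part of the package) can pick whichever representation is available:

```python
from chunkr_ai.models import Segment

def segment_text(segment: Segment) -> str:
    """Return the best available textual representation of a segment.

    Prefers the plain `content` string, then the new `text` field, then the
    optional `html` rendering; falls back to an empty string, which is now a
    legal value for both `content` and `text`.
    """
    return segment.content or segment.text or (segment.html or "")
```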
{chunkr_ai-0.0.50 → chunkr_ai-0.1.0}/src/chunkr_ai/models.py

```diff
@@ -15,6 +15,7 @@ from .api.configuration import (
     OcrStrategy,
     OutputResponse,
     Segment,
+    SegmentFormat,
     SegmentProcessing,
     SegmentType,
     SegmentationStrategy,
```
```diff
@@ -42,6 +43,7 @@ __all__ = [
     "OcrStrategy",
     "OutputResponse",
     "Segment",
+    "SegmentFormat",
     "SegmentProcessing",
     "SegmentType",
     "SegmentationStrategy",
```
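With the name added to `__all__`, `SegmentFormat` can be imported directly from `chunkr_ai.models` alongside the existing configuration types, for example:

```python
from chunkr_ai.models import GenerationConfig, GenerationStrategy, SegmentFormat

config = GenerationConfig(format=SegmentFormat.HTML, strategy=GenerationStrategy.AUTO)
print(config.format.value, config.strategy.value)  # prints: Html Auto
```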
{chunkr_ai-0.0.50 → chunkr_ai-0.1.0}/tests/test_chunkr.py

```diff
@@ -2,7 +2,6 @@ import pytest
 from pathlib import Path
 from PIL import Image
 import asyncio
-from typing import Awaitable
 
 from chunkr_ai import Chunkr
 from chunkr_ai.models import (
```
```diff
@@ -21,6 +20,7 @@ from chunkr_ai.models import (
     Status,
     TaskResponse,
     Tokenizer,
+    SegmentFormat,
 )
 
 @pytest.fixture
```
```diff
@@ -53,9 +53,9 @@ def markdown_embed_config():
     return Configuration(
         segment_processing=SegmentProcessing(
             Page=GenerationConfig(
-
-
-                embed_sources=[EmbedSource.
+                format=SegmentFormat.MARKDOWN,
+                strategy=GenerationStrategy.LLM,
+                embed_sources=[EmbedSource.CONTENT]
             )
         ),
     )
```
```diff
@@ -65,9 +65,9 @@ def html_embed_config():
     return Configuration(
         segment_processing=SegmentProcessing(
             Page=GenerationConfig(
-
-
-                embed_sources=[EmbedSource.HTML]
+                format=SegmentFormat.HTML,
+                strategy=GenerationStrategy.LLM,
+                embed_sources=[EmbedSource.HTML]  # Keep this for backwards compatibility testing
             )
         ),
     )
```
```diff
@@ -77,10 +77,10 @@ def multiple_embed_config():
     return Configuration(
         segment_processing=SegmentProcessing(
             Page=GenerationConfig(
-
-
+                format=SegmentFormat.MARKDOWN,
+                strategy=GenerationStrategy.LLM,
                 llm="Generate a summary of this content",
-                embed_sources=[EmbedSource.
+                embed_sources=[EmbedSource.CONTENT, EmbedSource.LLM, EmbedSource.HTML]
             )
         ),
     )
```
```diff
@@ -169,13 +169,15 @@ def model_fallback_config():
 def extended_context_config():
     return Configuration(
         segment_processing=SegmentProcessing(
-
+            Picture=GenerationConfig(
                 extended_context=True,
-
+                format=SegmentFormat.HTML,
+                strategy=GenerationStrategy.LLM,
             ),
-
+            Table=GenerationConfig(
                 extended_context=True,
-
+                format=SegmentFormat.HTML,
+                strategy=GenerationStrategy.LLM,
             )
         ),
     )
```
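For context, here is a hedged end-to-end sketch of how an extended-context configuration like the updated fixture might be used with the async client; the no-argument `Chunkr()` construction and the `document.pdf` path are assumptions for illustration, not part of this diff.

```python
import asyncio

from chunkr_ai import Chunkr
from chunkr_ai.models import (
    Configuration,
    GenerationConfig,
    GenerationStrategy,
    SegmentFormat,
    SegmentProcessing,
)

async def main() -> None:
    # Assumes Chunkr() picks up its API key from the environment, as the test
    # suite's `client` fixture appears to do.
    client = Chunkr()
    config = Configuration(
        segment_processing=SegmentProcessing(
            Picture=GenerationConfig(
                extended_context=True,
                format=SegmentFormat.HTML,
                strategy=GenerationStrategy.LLM,
            ),
        )
    )
    task = await client.upload("document.pdf", config)  # hypothetical local file
    print(task.status)

if __name__ == "__main__":
    asyncio.run(main())
```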
```diff
@@ -471,12 +473,12 @@ async def test_tokenizer_custom_string(client, sample_path, custom_tokenizer_con
     assert response.status == "Succeeded"
     assert response.output is not None
 
-@pytest.mark.asyncio
-async def test_embed_sources_with_different_tokenizer(client, sample_path, xlm_roberta_with_html_content_config):
-    response = await client.upload(sample_path, xlm_roberta_with_html_content_config)
-    assert response.task_id is not None
-    assert response.status == "Succeeded"
-    assert response.output is not None
+# @pytest.mark.asyncio
+# async def test_embed_sources_with_different_tokenizer(client, sample_path, xlm_roberta_with_html_content_config):
+#     response = await client.upload(sample_path, xlm_roberta_with_html_content_config)
+#     assert response.task_id is not None
+#     assert response.status == "Succeeded"
+#     assert response.output is not None
 
 @pytest.mark.asyncio
 async def test_error_handling_continue(client, sample_path):
```