chunkr-ai 0.0.49__tar.gz → 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23)
  1. {chunkr_ai-0.0.49/src/chunkr_ai.egg-info → chunkr_ai-0.1.0}/PKG-INFO +1 -1
  2. {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/pyproject.toml +1 -1
  3. {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/api/configuration.py +19 -11
  4. {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/models.py +2 -0
  5. {chunkr_ai-0.0.49 → chunkr_ai-0.1.0/src/chunkr_ai.egg-info}/PKG-INFO +1 -1
  6. {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/tests/test_chunkr.py +24 -21
  7. {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/LICENSE +0 -0
  8. {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/README.md +0 -0
  9. {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/setup.cfg +0 -0
  10. {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/__init__.py +0 -0
  11. {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/api/__init__.py +0 -0
  12. {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/api/auth.py +0 -0
  13. {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/api/chunkr.py +0 -0
  14. {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/api/chunkr_base.py +0 -0
  15. {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/api/decorators.py +0 -0
  16. {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/api/misc.py +0 -0
  17. {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/api/protocol.py +0 -0
  18. {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/api/task_response.py +0 -0
  19. {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai.egg-info/SOURCES.txt +0 -0
  20. {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
  21. {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai.egg-info/requires.txt +0 -0
  22. {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai.egg-info/top_level.txt +0 -0
  23. {chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/tests/test_file_handling.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: chunkr-ai
-Version: 0.0.49
+Version: 0.1.0
 Summary: Python client for Chunkr: open source document intelligence
 Author-email: Ishaan Kapoor <ishaan@lumina.sh>
 License: MIT License
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "chunkr-ai"
-version = "0.0.49"
+version = "0.1.0"
 authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
 description = "Python client for Chunkr: open source document intelligence"
 readme = "README.md"
src/chunkr_ai/api/configuration.py
@@ -3,27 +3,34 @@ from enum import Enum
 from typing import Any, List, Optional, Union
 from pydantic import field_validator, field_serializer
 
-class GenerationStrategy(str, Enum):
-    LLM = "LLM"
-    AUTO = "Auto"
-
 class CroppingStrategy(str, Enum):
     ALL = "All"
     AUTO = "Auto"
 
-class EmbedSource(str, Enum):
-    HTML = "HTML"
+class SegmentFormat(str, Enum):
+    HTML = "Html"
     MARKDOWN = "Markdown"
-    LLM = "LLM"
+
+class EmbedSource(str, Enum):
     CONTENT = "Content"
+    HTML = "HTML" # Deprecated
+    MARKDOWN = "Markdown" # Deprecated
+    LLM = "LLM"
+
+class GenerationStrategy(str, Enum):
+    LLM = "LLM"
+    AUTO = "Auto"
 
 class GenerationConfig(BaseModel):
-    html: Optional[GenerationStrategy] = None
+    format: Optional[SegmentFormat] = None
+    strategy: Optional[GenerationStrategy] = None
     llm: Optional[str] = None
-    markdown: Optional[GenerationStrategy] = None
     crop_image: Optional[CroppingStrategy] = None
-    embed_sources: Optional[List[EmbedSource]] = Field(default_factory=lambda: [EmbedSource.MARKDOWN])
+    embed_sources: Optional[List[EmbedSource]] = None
     extended_context: Optional[bool] = None
+    # Deprecated fields for backwards compatibility
+    html: Optional[GenerationStrategy] = None # Deprecated: Use format=SegmentFormat.HTML and strategy instead
+    markdown: Optional[GenerationStrategy] = None # Deprecated: Use format=SegmentFormat.MARKDOWN and strategy instead
 
 class SegmentProcessing(BaseModel):
     model_config = ConfigDict(populate_by_name=True, alias_generator=str.title)
@@ -246,7 +253,7 @@ class SegmentType(str, Enum):
 
 class Segment(BaseModel):
     bbox: BoundingBox
-    content: str
+    content: str = ""
     page_height: float
     llm: Optional[str] = None
     html: Optional[str] = None
@@ -258,6 +265,7 @@ class Segment(BaseModel):
     segment_id: str
     segment_type: SegmentType
     confidence: Optional[float]
+    text: str = ""
 
 class Chunk(BaseModel):
     chunk_id: str
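Taken together, the configuration.py changes replace the per-format `html`/`markdown` switches on `GenerationConfig` with an explicit `format` + `strategy` pair, drop the old `embed_sources=[EmbedSource.MARKDOWN]` default in favour of `None`, and deprecate the `HTML`/`MARKDOWN` members of `EmbedSource` in favour of `CONTENT`. A minimal sketch of a 0.1.0-style configuration, assuming the same public imports the updated test suite uses (the old keywords still validate but are deprecated):

    from chunkr_ai.models import (
        Configuration,
        EmbedSource,
        GenerationConfig,
        GenerationStrategy,
        SegmentFormat,
        SegmentProcessing,
    )

    # 0.1.0 style: pick the output format and the generation strategy explicitly.
    config = Configuration(
        segment_processing=SegmentProcessing(
            Page=GenerationConfig(
                format=SegmentFormat.MARKDOWN,
                strategy=GenerationStrategy.LLM,
                embed_sources=[EmbedSource.CONTENT],
            )
        ),
    )

    # Pre-0.1.0 equivalent, still accepted via the deprecated fields:
    legacy = GenerationConfig(html=GenerationStrategy.LLM, markdown=GenerationStrategy.LLM)

The new `Segment.text` field and the `content` default of `""` only affect parsed output models, so callers should not need any changes for those.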
src/chunkr_ai/models.py
@@ -15,6 +15,7 @@ from .api.configuration import (
     OcrStrategy,
     OutputResponse,
     Segment,
+    SegmentFormat,
     SegmentProcessing,
     SegmentType,
     SegmentationStrategy,
@@ -42,6 +43,7 @@ __all__ = [
     "OcrStrategy",
     "OutputResponse",
     "Segment",
+    "SegmentFormat",
     "SegmentProcessing",
     "SegmentType",
     "SegmentationStrategy",
src/chunkr_ai.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: chunkr-ai
-Version: 0.0.49
+Version: 0.1.0
 Summary: Python client for Chunkr: open source document intelligence
 Author-email: Ishaan Kapoor <ishaan@lumina.sh>
 License: MIT License
tests/test_chunkr.py
@@ -2,7 +2,6 @@ import pytest
 from pathlib import Path
 from PIL import Image
 import asyncio
-from typing import Awaitable
 
 from chunkr_ai import Chunkr
 from chunkr_ai.models import (
@@ -21,6 +20,7 @@ from chunkr_ai.models import (
     Status,
     TaskResponse,
     Tokenizer,
+    SegmentFormat,
 )
 
 @pytest.fixture
@@ -53,9 +53,9 @@ def markdown_embed_config():
     return Configuration(
         segment_processing=SegmentProcessing(
             Page=GenerationConfig(
-                html=GenerationStrategy.LLM,
-                markdown=GenerationStrategy.LLM,
-                embed_sources=[EmbedSource.MARKDOWN]
+                format=SegmentFormat.MARKDOWN,
+                strategy=GenerationStrategy.LLM,
+                embed_sources=[EmbedSource.CONTENT]
             )
         ),
     )
@@ -65,9 +65,9 @@ def html_embed_config():
     return Configuration(
         segment_processing=SegmentProcessing(
             Page=GenerationConfig(
-                html=GenerationStrategy.LLM,
-                markdown=GenerationStrategy.LLM,
-                embed_sources=[EmbedSource.HTML]
+                format=SegmentFormat.HTML,
+                strategy=GenerationStrategy.LLM,
+                embed_sources=[EmbedSource.HTML] # Keep this for backwards compatibility testing
             )
         ),
     )
@@ -77,10 +77,10 @@ def multiple_embed_config():
     return Configuration(
         segment_processing=SegmentProcessing(
             Page=GenerationConfig(
-                html=GenerationStrategy.LLM,
-                markdown=GenerationStrategy.LLM,
+                format=SegmentFormat.MARKDOWN,
+                strategy=GenerationStrategy.LLM,
                 llm="Generate a summary of this content",
-                embed_sources=[EmbedSource.MARKDOWN, EmbedSource.LLM, EmbedSource.HTML]
+                embed_sources=[EmbedSource.CONTENT, EmbedSource.LLM, EmbedSource.HTML]
             )
         ),
     )
@@ -169,13 +169,15 @@ def model_fallback_config():
 def extended_context_config():
     return Configuration(
         segment_processing=SegmentProcessing(
-            picture=GenerationConfig(
+            Picture=GenerationConfig(
                 extended_context=True,
-                html=GenerationStrategy.LLM,
+                format=SegmentFormat.HTML,
+                strategy=GenerationStrategy.LLM,
             ),
-            table=GenerationConfig(
+            Table=GenerationConfig(
                 extended_context=True,
-                html=GenerationStrategy.LLM,
+                format=SegmentFormat.HTML,
+                strategy=GenerationStrategy.LLM,
             )
         ),
     )
@@ -315,7 +317,8 @@ async def test_update_task_direct(client, sample_path):
         segmentation_strategy=SegmentationStrategy.PAGE,
     )
     task = await client.upload(sample_path, original_config)
-    task = await task.update(new_config)
+    task = await (await task.update(new_config))
+    assert isinstance(task, TaskResponse)
     assert task.status == "Succeeded"
     assert task.output is not None
     assert task.configuration.segmentation_strategy == SegmentationStrategy.PAGE
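Note the updated call pattern in this test: `task.update(...)` is awaited and its result is awaited again before the assertions run. A short sketch of the pattern exactly as the test exercises it (the diff does not show why the extra await is required, so treat the inner awaitable as an implementation detail of 0.1.0):

    task = await client.upload(sample_path, original_config)
    task = await (await task.update(new_config))  # second await, per the updated test
    assert isinstance(task, TaskResponse)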
@@ -470,12 +473,12 @@ async def test_tokenizer_custom_string(client, sample_path, custom_tokenizer_con
     assert response.status == "Succeeded"
     assert response.output is not None
 
-@pytest.mark.asyncio
-async def test_embed_sources_with_different_tokenizer(client, sample_path, xlm_roberta_with_html_content_config):
-    response = await client.upload(sample_path, xlm_roberta_with_html_content_config)
-    assert response.task_id is not None
-    assert response.status == "Succeeded"
-    assert response.output is not None
+# @pytest.mark.asyncio
+# async def test_embed_sources_with_different_tokenizer(client, sample_path, xlm_roberta_with_html_content_config):
+#     response = await client.upload(sample_path, xlm_roberta_with_html_content_config)
+#     assert response.task_id is not None
+#     assert response.status == "Succeeded"
+#     assert response.output is not None
 
 @pytest.mark.asyncio
 async def test_error_handling_continue(client, sample_path):