PyPI - chunkr-ai - Versions diffs - 0.0.49__tar.gz → 0.1.0__tar.gz - Mend

chunkr-ai 0.0.49.tar.gz → 0.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) [hide / show]

{chunkr_ai-0.0.49/src/chunkr_ai.egg-info → chunkr_ai-0.1.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: chunkr-ai
-Version: 0.0.49
+Version: 0.1.0
 Summary: Python client for Chunkr: open source document intelligence
 Author-email: Ishaan Kapoor <ishaan@lumina.sh>
 License: MIT License

{chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "chunkr-ai"
-version = "0.0.49"
+version = "0.1.0"
 authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
 description = "Python client for Chunkr: open source document intelligence"
 readme = "README.md"

{chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/api/configuration.py RENAMED Viewed

@@ -3,27 +3,34 @@ from enum import Enum
 from typing import Any, List, Optional, Union
 from pydantic import field_validator, field_serializer
-class GenerationStrategy(str, Enum):
-    LLM = "LLM"
-    AUTO = "Auto"
 class CroppingStrategy(str, Enum):
     ALL = "All"
     AUTO = "Auto"
-class EmbedSource(str, Enum):
-    HTML = "HTML"
+class SegmentFormat(str, Enum):
+    HTML = "Html"
     MARKDOWN = "Markdown"
-    LLM = "LLM"
+class EmbedSource(str, Enum):
     CONTENT = "Content"
+    HTML = "HTML"  # Deprecated
+    MARKDOWN = "Markdown"  # Deprecated
+    LLM = "LLM"
+class GenerationStrategy(str, Enum):
+    LLM = "LLM"
+    AUTO = "Auto"
 class GenerationConfig(BaseModel):
-    html: Optional[GenerationStrategy] = None
+    format: Optional[SegmentFormat] = None
+    strategy: Optional[GenerationStrategy] = None
     llm: Optional[str] = None
-    markdown: Optional[GenerationStrategy] = None
     crop_image: Optional[CroppingStrategy] = None
-    embed_sources: Optional[List[EmbedSource]] = Field(default_factory=lambda: [EmbedSource.MARKDOWN])
+    embed_sources: Optional[List[EmbedSource]] = None
     extended_context: Optional[bool] = None
+    # Deprecated fields for backwards compatibility
+    html: Optional[GenerationStrategy] = None  # Deprecated: Use format=SegmentFormat.HTML and strategy instead
+    markdown: Optional[GenerationStrategy] = None  # Deprecated: Use format=SegmentFormat.MARKDOWN and strategy instead
 class SegmentProcessing(BaseModel):
     model_config = ConfigDict(populate_by_name=True, alias_generator=str.title)
@@ -246,7 +253,7 @@ class SegmentType(str, Enum):
 class Segment(BaseModel):
     bbox: BoundingBox
-    content: str
+    content: str = ""
     page_height: float
     llm: Optional[str] = None
     html: Optional[str] = None
@@ -258,6 +265,7 @@ class Segment(BaseModel):
     segment_id: str
     segment_type: SegmentType
     confidence: Optional[float]
+    text: str = ""
 class Chunk(BaseModel):
     chunk_id: str

{chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/models.py RENAMED Viewed

@@ -15,6 +15,7 @@ from .api.configuration import (
     OcrStrategy,
     OutputResponse,
     Segment,
+    SegmentFormat,
     SegmentProcessing,
     SegmentType,
     SegmentationStrategy,
@@ -42,6 +43,7 @@ __all__ = [
     "OcrStrategy",
     "OutputResponse",
     "Segment",
+    "SegmentFormat",
     "SegmentProcessing",
     "SegmentType",
     "SegmentationStrategy",

{chunkr_ai-0.0.49 → chunkr_ai-0.1.0/src/chunkr_ai.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: chunkr-ai
-Version: 0.0.49
+Version: 0.1.0
 Summary: Python client for Chunkr: open source document intelligence
 Author-email: Ishaan Kapoor <ishaan@lumina.sh>
 License: MIT License

{chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/tests/test_chunkr.py RENAMED Viewed

@@ -2,7 +2,6 @@ import pytest
 from pathlib import Path
 from PIL import Image
 import asyncio
-from typing import Awaitable
 from chunkr_ai import Chunkr
 from chunkr_ai.models import (
@@ -21,6 +20,7 @@ from chunkr_ai.models import (
     Status,
     TaskResponse,
     Tokenizer,
+    SegmentFormat,
 )
 @pytest.fixture
@@ -53,9 +53,9 @@ def markdown_embed_config():
     return Configuration(
         segment_processing=SegmentProcessing(
             Page=GenerationConfig(
-                html=GenerationStrategy.LLM,
-                markdown=GenerationStrategy.LLM,
-                embed_sources=[EmbedSource.MARKDOWN]
+                format=SegmentFormat.MARKDOWN,
+                strategy=GenerationStrategy.LLM,
+                embed_sources=[EmbedSource.CONTENT]
             )
         ),
     )
@@ -65,9 +65,9 @@ def html_embed_config():
     return Configuration(
         segment_processing=SegmentProcessing(
             Page=GenerationConfig(
-                html=GenerationStrategy.LLM,
-                markdown=GenerationStrategy.LLM,
-                embed_sources=[EmbedSource.HTML]
+                format=SegmentFormat.HTML,
+                strategy=GenerationStrategy.LLM,
+                embed_sources=[EmbedSource.HTML]  # Keep this for backwards compatibility testing
             )
         ),
     )
@@ -77,10 +77,10 @@ def multiple_embed_config():
     return Configuration(
         segment_processing=SegmentProcessing(
             Page=GenerationConfig(
-                html=GenerationStrategy.LLM,
-                markdown=GenerationStrategy.LLM,
+                format=SegmentFormat.MARKDOWN,
+                strategy=GenerationStrategy.LLM,
                 llm="Generate a summary of this content",
-                embed_sources=[EmbedSource.MARKDOWN, EmbedSource.LLM, EmbedSource.HTML]
+                embed_sources=[EmbedSource.CONTENT, EmbedSource.LLM, EmbedSource.HTML]
             )
         ),
     )
@@ -169,13 +169,15 @@ def model_fallback_config():
 def extended_context_config():
     return Configuration(
         segment_processing=SegmentProcessing(
-            picture=GenerationConfig(
+            Picture=GenerationConfig(
                 extended_context=True,
-                html=GenerationStrategy.LLM,
+                format=SegmentFormat.HTML,
+                strategy=GenerationStrategy.LLM,
             ),
-            table=GenerationConfig(
+            Table=GenerationConfig(
                 extended_context=True,
-                html=GenerationStrategy.LLM,
+                format=SegmentFormat.HTML,
+                strategy=GenerationStrategy.LLM,
             )
         ),
     )
@@ -315,7 +317,8 @@ async def test_update_task_direct(client, sample_path):
         segmentation_strategy=SegmentationStrategy.PAGE,
     )
     task = await client.upload(sample_path, original_config)
-    task = await task.update(new_config)
+    task = await (await task.update(new_config))
+    assert isinstance(task, TaskResponse)
     assert task.status == "Succeeded"
     assert task.output is not None
     assert task.configuration.segmentation_strategy == SegmentationStrategy.PAGE
@@ -470,12 +473,12 @@ async def test_tokenizer_custom_string(client, sample_path, custom_tokenizer_con
     assert response.status == "Succeeded"
     assert response.output is not None
-@pytest.mark.asyncio
-async def test_embed_sources_with_different_tokenizer(client, sample_path, xlm_roberta_with_html_content_config):
-    response = await client.upload(sample_path, xlm_roberta_with_html_content_config)
-    assert response.task_id is not None
-    assert response.status == "Succeeded"
-    assert response.output is not None
+# @pytest.mark.asyncio
+# async def test_embed_sources_with_different_tokenizer(client, sample_path, xlm_roberta_with_html_content_config):
+#     response = await client.upload(sample_path, xlm_roberta_with_html_content_config)
+#     assert response.task_id is not None
+#     assert response.status == "Succeeded"
+#     assert response.output is not None
 @pytest.mark.asyncio
 async def test_error_handling_continue(client, sample_path):

{chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/LICENSE RENAMED Viewed

File without changes

{chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/README.md RENAMED Viewed

File without changes

{chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/setup.cfg RENAMED Viewed

File without changes

{chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/__init__.py RENAMED Viewed

File without changes

{chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/api/__init__.py RENAMED Viewed

File without changes

{chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/api/auth.py RENAMED Viewed

File without changes

{chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/api/chunkr.py RENAMED Viewed

File without changes

{chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/api/chunkr_base.py RENAMED Viewed

File without changes

{chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/api/decorators.py RENAMED Viewed

File without changes

{chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/api/misc.py RENAMED Viewed

File without changes

{chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/api/protocol.py RENAMED Viewed

File without changes

{chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai/api/task_response.py RENAMED Viewed

File without changes

{chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai.egg-info/SOURCES.txt RENAMED Viewed

File without changes

{chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai.egg-info/requires.txt RENAMED Viewed

File without changes

{chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/src/chunkr_ai.egg-info/top_level.txt RENAMED Viewed

File without changes

{chunkr_ai-0.0.49 → chunkr_ai-0.1.0}/tests/test_file_handling.py RENAMED Viewed

File without changes

chunkr-ai 0.0.49__tar.gz → 0.1.0__tar.gz

chunkr-ai 0.0.49.tar.gz → 0.1.0.tar.gz