chunkr-ai 0.3.1__tar.gz → 0.3.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {chunkr_ai-0.3.1/src/chunkr_ai.egg-info → chunkr_ai-0.3.3}/PKG-INFO +1 -1
- {chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/pyproject.toml +6 -4
- chunkr_ai-0.3.3/src/chunkr_ai/__init__.py +14 -0
- chunkr_ai-0.3.3/src/chunkr_ai/api/auth.py +42 -0
- {chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/src/chunkr_ai/api/chunkr.py +5 -5
- {chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/src/chunkr_ai/api/configuration.py +10 -8
- {chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/src/chunkr_ai/api/decorators.py +5 -3
- {chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/src/chunkr_ai/api/misc.py +6 -6
- {chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/src/chunkr_ai/api/task_response.py +1 -1
- {chunkr_ai-0.3.1 → chunkr_ai-0.3.3/src/chunkr_ai.egg-info}/PKG-INFO +1 -1
- {chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/tests/test_chunkr.py +75 -1
- {chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/tests/test_excel.py +11 -6
- {chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/tests/test_pages.py +0 -8
- chunkr_ai-0.3.1/src/chunkr_ai/__init__.py +0 -3
- chunkr_ai-0.3.1/src/chunkr_ai/api/auth.py +0 -13
- {chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/LICENSE +0 -0
- {chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/README.md +0 -0
- {chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/setup.cfg +0 -0
- {chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/src/chunkr_ai/api/__init__.py +0 -0
- {chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/src/chunkr_ai/api/chunkr_base.py +0 -0
- {chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/src/chunkr_ai/api/protocol.py +0 -0
- {chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/src/chunkr_ai/models.py +0 -0
- {chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/src/chunkr_ai.egg-info/SOURCES.txt +0 -0
- {chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
- {chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/src/chunkr_ai.egg-info/requires.txt +0 -0
- {chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/src/chunkr_ai.egg-info/top_level.txt +0 -0
- {chunkr_ai-0.3.1 → chunkr_ai-0.3.3}/tests/test_file_handling.py +0 -0
@@ -4,12 +4,12 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "chunkr-ai"
|
7
|
-
version = "0.3.1"
|
8
|
-
authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
|
7
|
+
version = "0.3.3"
|
8
|
+
authors = [{ "name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh" }]
|
9
9
|
description = "Python client for Chunkr: open source document intelligence"
|
10
10
|
readme = "README.md"
|
11
|
-
license = {"file" = "LICENSE"}
|
12
|
-
urls = {Homepage = "https://chunkr.ai"}
|
11
|
+
license = { "file" = "LICENSE" }
|
12
|
+
urls = { Homepage = "https://chunkr.ai" }
|
13
13
|
dependencies = [
|
14
14
|
"httpx>=0.25.0",
|
15
15
|
"matplotlib>=3.10.3",
|
@@ -27,3 +27,5 @@ test = [
|
|
27
27
|
"ruff>=0.9.3",
|
28
28
|
]
|
29
29
|
|
30
|
+
[dependency-groups]
|
31
|
+
dev = ["mypy>=1.17.1"]
|
@@ -0,0 +1,14 @@
|
|
1
|
+
from .api.chunkr import Chunkr
|
2
|
+
import tomllib
|
3
|
+
from pathlib import Path
|
4
|
+
|
5
|
+
# Read version from pyproject.toml
|
6
|
+
try:
|
7
|
+
pyproject_path = Path(__file__).parent.parent.parent / "pyproject.toml"
|
8
|
+
with open(pyproject_path, "rb") as f:
|
9
|
+
pyproject_data = tomllib.load(f)
|
10
|
+
__version__ = pyproject_data["project"]["version"]
|
11
|
+
except Exception:
|
12
|
+
__version__ = "unknown"
|
13
|
+
|
14
|
+
__all__ = ["Chunkr", "__version__"]
|
@@ -0,0 +1,42 @@
|
|
1
|
+
import platform
|
2
|
+
import sys
|
3
|
+
import tomllib
|
4
|
+
from pathlib import Path
|
5
|
+
|
6
|
+
def _find_pyproject_toml(start_path: Path) -> Path | None:
|
7
|
+
"""Search for pyproject.toml in current and parent directories."""
|
8
|
+
for parent in [start_path, *start_path.parents]:
|
9
|
+
candidate = parent / "pyproject.toml"
|
10
|
+
if candidate.is_file():
|
11
|
+
return candidate
|
12
|
+
return None
|
13
|
+
|
14
|
+
# Read version from pyproject.toml
|
15
|
+
try:
|
16
|
+
pyproject_path = _find_pyproject_toml(Path(__file__).resolve().parent)
|
17
|
+
if pyproject_path is not None:
|
18
|
+
with open(pyproject_path, "rb") as f:
|
19
|
+
pyproject_data = tomllib.load(f)
|
20
|
+
__version__ = pyproject_data["project"]["version"]
|
21
|
+
else:
|
22
|
+
__version__ = "unknown"
|
23
|
+
except Exception:
|
24
|
+
__version__ = "unknown"
|
25
|
+
|
26
|
+
class HeadersMixin:
|
27
|
+
"""Mixin class for handling authorization headers"""
|
28
|
+
_api_key: str = ""
|
29
|
+
|
30
|
+
def get_api_key(self) -> str:
|
31
|
+
"""Get the API key"""
|
32
|
+
if not hasattr(self, "_api_key") or not self._api_key:
|
33
|
+
raise ValueError("API key not set")
|
34
|
+
return self._api_key
|
35
|
+
|
36
|
+
def _headers(self) -> dict:
|
37
|
+
"""Generate authorization headers and version information"""
|
38
|
+
user_agent = f"chunkr-ai/{__version__} (Python/{sys.version.split()[0]}; {platform.system()}/{platform.release()})"
|
39
|
+
return {
|
40
|
+
"Authorization": self.get_api_key(),
|
41
|
+
"User-Agent": user_agent
|
42
|
+
}
|
@@ -42,7 +42,7 @@ class Chunkr(ChunkrBase):
|
|
42
42
|
data = await prepare_upload_data(file, filename, config)
|
43
43
|
assert self._client is not None
|
44
44
|
r = await self._client.post(
|
45
|
-
f"{self.url}/
|
45
|
+
f"{self.url}/task/parse", json=data, headers=self._headers()
|
46
46
|
)
|
47
47
|
r.raise_for_status()
|
48
48
|
return TaskResponse(**r.json()).with_client(cast(ChunkrClientProtocol, self), True, False)
|
@@ -55,7 +55,7 @@ class Chunkr(ChunkrBase):
|
|
55
55
|
data = await prepare_upload_data(None, None, config)
|
56
56
|
assert self._client is not None
|
57
57
|
r = await self._client.patch(
|
58
|
-
f"{self.url}/
|
58
|
+
f"{self.url}/task/{task_id}/parse",
|
59
59
|
json=data,
|
60
60
|
headers=self._headers(),
|
61
61
|
)
|
@@ -71,7 +71,7 @@ class Chunkr(ChunkrBase):
|
|
71
71
|
}
|
72
72
|
assert self._client is not None
|
73
73
|
r = await self._client.get(
|
74
|
-
f"{self.url}/
|
74
|
+
f"{self.url}/task/{task_id}",
|
75
75
|
params=params,
|
76
76
|
headers=self._headers()
|
77
77
|
)
|
@@ -83,7 +83,7 @@ class Chunkr(ChunkrBase):
|
|
83
83
|
async def delete_task(self, task_id: str) -> None:
|
84
84
|
assert self._client is not None
|
85
85
|
r = await self._client.delete(
|
86
|
-
f"{self.url}/
|
86
|
+
f"{self.url}/task/{task_id}", headers=self._headers()
|
87
87
|
)
|
88
88
|
r.raise_for_status()
|
89
89
|
|
@@ -92,7 +92,7 @@ class Chunkr(ChunkrBase):
|
|
92
92
|
async def cancel_task(self, task_id: str) -> None:
|
93
93
|
assert self._client is not None
|
94
94
|
r = await self._client.get(
|
95
|
-
f"{self.url}/
|
95
|
+
f"{self.url}/task/{task_id}/cancel", headers=self._headers()
|
96
96
|
)
|
97
97
|
r.raise_for_status()
|
98
98
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
from pydantic import BaseModel, Field, ConfigDict
|
2
2
|
from enum import Enum
|
3
|
-
from typing import Any, List, Optional, Union
|
3
|
+
from typing import List, Optional, Union
|
4
4
|
from pydantic import field_validator, field_serializer
|
5
5
|
|
6
6
|
class CroppingStrategy(str, Enum):
|
@@ -20,15 +20,17 @@ class EmbedSource(str, Enum):
|
|
20
20
|
class GenerationStrategy(str, Enum):
|
21
21
|
LLM = "LLM"
|
22
22
|
AUTO = "Auto"
|
23
|
+
IGNORE = "Ignore"
|
23
24
|
|
24
25
|
class GenerationConfig(BaseModel):
|
25
26
|
format: Optional[SegmentFormat] = None
|
26
27
|
strategy: Optional[GenerationStrategy] = None
|
27
|
-
llm: Optional[str] = None
|
28
28
|
crop_image: Optional[CroppingStrategy] = None
|
29
|
-
embed_sources: Optional[List[EmbedSource]] = None
|
30
29
|
extended_context: Optional[bool] = None
|
30
|
+
description: Optional[bool] = None
|
31
31
|
# Deprecated fields for backwards compatibility
|
32
|
+
llm: Optional[str] = None # Deprecated
|
33
|
+
embed_sources: Optional[List[EmbedSource]] = None # Deprecated
|
32
34
|
html: Optional[GenerationStrategy] = None # Deprecated: Use format=SegmentFormat.HTML and strategy instead
|
33
35
|
markdown: Optional[GenerationStrategy] = None # Deprecated: Use format=SegmentFormat.MARKDOWN and strategy instead
|
34
36
|
|
@@ -83,7 +85,7 @@ class TokenizerType(BaseModel):
|
|
83
85
|
return {}
|
84
86
|
|
85
87
|
class ChunkProcessing(BaseModel):
|
86
|
-
ignore_headers_and_footers: Optional[bool] =
|
88
|
+
ignore_headers_and_footers: Optional[bool] = None # Deprecated
|
87
89
|
target_length: Optional[int] = None
|
88
90
|
tokenizer: Optional[Union[TokenizerType, Tokenizer, str]] = None
|
89
91
|
|
@@ -286,6 +288,7 @@ class Page(BaseModel):
|
|
286
288
|
page_height: float
|
287
289
|
page_width: float
|
288
290
|
ss_sheet_name: Optional[str] = None
|
291
|
+
dpi: Optional[float] = None
|
289
292
|
|
290
293
|
class Segment(BaseModel):
|
291
294
|
bbox: BoundingBox
|
@@ -303,6 +306,8 @@ class Segment(BaseModel):
|
|
303
306
|
confidence: Optional[float]
|
304
307
|
text: str = ""
|
305
308
|
segment_length: Optional[int] = None
|
309
|
+
embed: Optional[str] = None
|
310
|
+
description: Optional[str] = None
|
306
311
|
# Spreadsheet-specific fields
|
307
312
|
ss_cells: Optional[List[Cell]] = None
|
308
313
|
ss_header_bbox: Optional[BoundingBox] = None
|
@@ -317,6 +322,7 @@ class Chunk(BaseModel):
|
|
317
322
|
chunk_length: int
|
318
323
|
segments: List[Segment]
|
319
324
|
embed: Optional[str] = None
|
325
|
+
content: Optional[str] = None
|
320
326
|
|
321
327
|
class OutputResponse(BaseModel):
|
322
328
|
chunks: List[Chunk]
|
@@ -347,10 +353,6 @@ class Configuration(BaseModel):
|
|
347
353
|
|
348
354
|
class OutputConfiguration(Configuration):
|
349
355
|
input_file_url: Optional[str] = None
|
350
|
-
# Deprecated
|
351
|
-
json_schema: Optional[Any] = None
|
352
|
-
model: Optional[Model] = None
|
353
|
-
target_chunk_length: Optional[int] = None
|
354
356
|
|
355
357
|
class Status(str, Enum):
|
356
358
|
STARTING = "Starting"
|
@@ -2,10 +2,12 @@ import asyncio
|
|
2
2
|
import functools
|
3
3
|
import httpx
|
4
4
|
import nest_asyncio
|
5
|
-
from typing import Callable, Any, TypeVar, Awaitable, Union
|
6
|
-
|
5
|
+
from typing import Callable, Any, TypeVar, Awaitable, Union
|
6
|
+
import sys
|
7
|
+
|
8
|
+
if sys.version_info >= (3, 10):
|
7
9
|
from typing import ParamSpec
|
8
|
-
|
10
|
+
else:
|
9
11
|
from typing_extensions import ParamSpec
|
10
12
|
|
11
13
|
T = TypeVar('T')
|
@@ -3,7 +3,7 @@ import base64
|
|
3
3
|
import io
|
4
4
|
from pathlib import Path
|
5
5
|
from PIL import Image
|
6
|
-
from typing import Union, Tuple, BinaryIO, Optional
|
6
|
+
from typing import Union, Tuple, BinaryIO, Optional
|
7
7
|
|
8
8
|
async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image, bytes, bytearray, memoryview]) -> Tuple[Optional[str], str]:
|
9
9
|
"""Convert various file types into a tuple of (filename, file content).
|
@@ -39,7 +39,7 @@ async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image, bytes, byte
|
|
39
39
|
base64.b64decode(potential_base64)
|
40
40
|
# If we get here, it was a valid base64 string in bytes form
|
41
41
|
return None, potential_base64
|
42
|
-
except:
|
42
|
+
except Exception:
|
43
43
|
# Not a base64 string in bytes form, encode it as base64
|
44
44
|
base64_str = base64.b64encode(file_bytes).decode()
|
45
45
|
return None, base64_str
|
@@ -66,14 +66,14 @@ async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image, bytes, byte
|
|
66
66
|
# Just test if it's valid base64, don't store the result
|
67
67
|
base64.b64decode(file)
|
68
68
|
return None, file
|
69
|
-
except:
|
69
|
+
except Exception:
|
70
70
|
raise ValueError(f"File not found: {file} and it's not a valid base64 string")
|
71
71
|
except Exception as e:
|
72
72
|
# If string can't be converted to Path or decoded as base64, it might still be a base64 string
|
73
73
|
try:
|
74
|
-
base64.b64decode(file)
|
75
|
-
return None, file
|
76
|
-
except:
|
74
|
+
base64.b64decode(str(file))
|
75
|
+
return None, str(file)
|
76
|
+
except Exception:
|
77
77
|
raise ValueError(f"Unable to process file: {e}")
|
78
78
|
|
79
79
|
# Handle file paths - convert to base64
|
@@ -680,6 +680,74 @@ class TestNewFields:
|
|
680
680
|
for segment in segments_with_length:
|
681
681
|
assert segment.segment_length > 0
|
682
682
|
|
683
|
+
@pytest.mark.asyncio
|
684
|
+
async def test_segment_embed_field(self, client, sample_path):
|
685
|
+
"""Test that Segment objects include the embed field"""
|
686
|
+
response = await client.upload(sample_path)
|
687
|
+
assert response.task_id is not None
|
688
|
+
assert response.status == "Succeeded"
|
689
|
+
assert response.output is not None
|
690
|
+
assert len(response.output.chunks) > 0
|
691
|
+
|
692
|
+
segment = response.output.chunks[0].segments[0]
|
693
|
+
|
694
|
+
# embed field should be accessible
|
695
|
+
assert hasattr(segment, 'embed')
|
696
|
+
# embed can be None or a string
|
697
|
+
assert segment.embed is None or isinstance(segment.embed, str)
|
698
|
+
|
699
|
+
@pytest.mark.asyncio
|
700
|
+
async def test_segment_description_field(self, client, sample_path):
|
701
|
+
"""Test that Segment objects include the description field"""
|
702
|
+
response = await client.upload(sample_path)
|
703
|
+
assert response.task_id is not None
|
704
|
+
assert response.status == "Succeeded"
|
705
|
+
assert response.output is not None
|
706
|
+
assert len(response.output.chunks) > 0
|
707
|
+
|
708
|
+
segment = response.output.chunks[0].segments[0]
|
709
|
+
|
710
|
+
# description field should be accessible
|
711
|
+
assert hasattr(segment, 'description')
|
712
|
+
# description can be None or a string
|
713
|
+
assert segment.description is None or isinstance(segment.description, str)
|
714
|
+
|
715
|
+
@pytest.mark.asyncio
|
716
|
+
async def test_chunk_content_field(self, client, sample_path):
|
717
|
+
"""Test that Chunk objects include the content field"""
|
718
|
+
response = await client.upload(sample_path)
|
719
|
+
assert response.task_id is not None
|
720
|
+
assert response.status == "Succeeded"
|
721
|
+
assert response.output is not None
|
722
|
+
assert len(response.output.chunks) > 0
|
723
|
+
|
724
|
+
chunk = response.output.chunks[0]
|
725
|
+
|
726
|
+
# content field should be accessible
|
727
|
+
assert hasattr(chunk, 'content')
|
728
|
+
# content can be None or a string
|
729
|
+
assert chunk.content is None or isinstance(chunk.content, str)
|
730
|
+
|
731
|
+
@pytest.mark.asyncio
|
732
|
+
async def test_page_dpi_field_type(self, client, sample_path):
|
733
|
+
"""Test that Page objects have dpi field as float type"""
|
734
|
+
response = await client.upload(sample_path)
|
735
|
+
assert response.task_id is not None
|
736
|
+
assert response.status == "Succeeded"
|
737
|
+
assert response.output is not None
|
738
|
+
|
739
|
+
# pages should be accessible (might be None for some configurations)
|
740
|
+
assert hasattr(response.output, 'pages')
|
741
|
+
|
742
|
+
# If pages exist, validate dpi type
|
743
|
+
if response.output.pages:
|
744
|
+
assert len(response.output.pages) > 0
|
745
|
+
page = response.output.pages[0]
|
746
|
+
assert hasattr(page, 'dpi')
|
747
|
+
# dpi can be None or a float/int (numbers)
|
748
|
+
if page.dpi is not None:
|
749
|
+
assert isinstance(page.dpi, (int, float))
|
750
|
+
|
683
751
|
@pytest.mark.asyncio
|
684
752
|
async def test_backwards_compatibility_preserved(self, client, sample_path):
|
685
753
|
"""Test that all existing fields still work after adding new ones"""
|
@@ -716,4 +784,10 @@ class TestNewFields:
|
|
716
784
|
assert segment.segment_id is not None
|
717
785
|
assert segment.segment_type is not None
|
718
786
|
assert segment.confidence is not None or segment.confidence is None
|
719
|
-
assert segment.text is not None or segment.text == ""
|
787
|
+
assert segment.text is not None or segment.text == ""
|
788
|
+
# New fields added for server compatibility
|
789
|
+
assert hasattr(segment, 'embed')
|
790
|
+
assert hasattr(segment, 'description')
|
791
|
+
|
792
|
+
# Chunk should include new content field
|
793
|
+
assert hasattr(chunk, 'content')
|
@@ -14,12 +14,6 @@ from chunkr_ai.models import (
|
|
14
14
|
Tokenizer,
|
15
15
|
OcrStrategy,
|
16
16
|
SegmentationStrategy,
|
17
|
-
Cell,
|
18
|
-
CellStyle,
|
19
|
-
Alignment,
|
20
|
-
VerticalAlignment,
|
21
|
-
Page,
|
22
|
-
Segment,
|
23
17
|
SegmentType,
|
24
18
|
)
|
25
19
|
|
@@ -316,6 +310,17 @@ class TestExcelEmbedding:
|
|
316
310
|
for chunk in chunks_with_embed:
|
317
311
|
assert len(chunk.embed.strip()) > 0, "Empty embed content found"
|
318
312
|
|
313
|
+
@pytest.mark.asyncio
|
314
|
+
async def test_chunks_have_content_field(self, client, excel_sample_path, excel_config):
|
315
|
+
"""Test that chunks include the new content field for Excel data"""
|
316
|
+
response = await client.upload(excel_sample_path, excel_config)
|
317
|
+
|
318
|
+
# All chunks should have the content field accessible
|
319
|
+
for chunk in response.output.chunks:
|
320
|
+
assert hasattr(chunk, 'content'), "Chunk missing content field"
|
321
|
+
# content can be None or a string
|
322
|
+
assert chunk.content is None or isinstance(chunk.content, str)
|
323
|
+
|
319
324
|
@pytest.mark.asyncio
|
320
325
|
async def test_segment_length_calculation(self, client, excel_sample_path, excel_config):
|
321
326
|
"""Test that segments have length calculations"""
|
@@ -4,16 +4,8 @@ from pathlib import Path
|
|
4
4
|
from chunkr_ai import Chunkr
|
5
5
|
from chunkr_ai.models import (
|
6
6
|
Configuration,
|
7
|
-
ChunkProcessing,
|
8
|
-
SegmentProcessing,
|
9
|
-
GenerationConfig,
|
10
|
-
SegmentFormat,
|
11
|
-
GenerationStrategy,
|
12
|
-
EmbedSource,
|
13
|
-
Tokenizer,
|
14
7
|
OcrStrategy,
|
15
8
|
SegmentationStrategy,
|
16
|
-
Page,
|
17
9
|
)
|
18
10
|
|
19
11
|
|
@@ -1,13 +0,0 @@
|
|
1
|
-
class HeadersMixin:
|
2
|
-
"""Mixin class for handling authorization headers"""
|
3
|
-
_api_key: str = ""
|
4
|
-
|
5
|
-
def get_api_key(self) -> str:
|
6
|
-
"""Get the API key"""
|
7
|
-
if not hasattr(self, "_api_key") or not self._api_key:
|
8
|
-
raise ValueError("API key not set")
|
9
|
-
return self._api_key
|
10
|
-
|
11
|
-
def _headers(self) -> dict:
|
12
|
-
"""Generate authorization headers"""
|
13
|
-
return {"Authorization": self.get_api_key()}
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|