chunkr-ai 0.0.50__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunkr_ai/api/configuration.py +66 -11
- chunkr_ai/models.py +12 -0
- {chunkr_ai-0.0.50.dist-info → chunkr_ai-0.3.0.dist-info}/METADATA +2 -1
- {chunkr_ai-0.0.50.dist-info → chunkr_ai-0.3.0.dist-info}/RECORD +7 -7
- {chunkr_ai-0.0.50.dist-info → chunkr_ai-0.3.0.dist-info}/WHEEL +1 -1
- {chunkr_ai-0.0.50.dist-info → chunkr_ai-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {chunkr_ai-0.0.50.dist-info → chunkr_ai-0.3.0.dist-info}/top_level.txt +0 -0
chunkr_ai/api/configuration.py
CHANGED
@@ -3,27 +3,34 @@ from enum import Enum
|
|
3
3
|
from typing import Any, List, Optional, Union
|
4
4
|
from pydantic import field_validator, field_serializer
|
5
5
|
|
6
|
-
class GenerationStrategy(str, Enum):
|
7
|
-
LLM = "LLM"
|
8
|
-
AUTO = "Auto"
|
9
|
-
|
10
6
|
class CroppingStrategy(str, Enum):
|
11
7
|
ALL = "All"
|
12
8
|
AUTO = "Auto"
|
13
9
|
|
14
|
-
class
|
15
|
-
HTML = "
|
10
|
+
class SegmentFormat(str, Enum):
|
11
|
+
HTML = "Html"
|
16
12
|
MARKDOWN = "Markdown"
|
17
|
-
|
13
|
+
|
14
|
+
class EmbedSource(str, Enum):
|
18
15
|
CONTENT = "Content"
|
16
|
+
HTML = "HTML" # Deprecated
|
17
|
+
MARKDOWN = "Markdown" # Deprecated
|
18
|
+
LLM = "LLM"
|
19
|
+
|
20
|
+
class GenerationStrategy(str, Enum):
|
21
|
+
LLM = "LLM"
|
22
|
+
AUTO = "Auto"
|
19
23
|
|
20
24
|
class GenerationConfig(BaseModel):
|
21
|
-
|
25
|
+
format: Optional[SegmentFormat] = None
|
26
|
+
strategy: Optional[GenerationStrategy] = None
|
22
27
|
llm: Optional[str] = None
|
23
|
-
markdown: Optional[GenerationStrategy] = None
|
24
28
|
crop_image: Optional[CroppingStrategy] = None
|
25
|
-
embed_sources: Optional[List[EmbedSource]] =
|
29
|
+
embed_sources: Optional[List[EmbedSource]] = None
|
26
30
|
extended_context: Optional[bool] = None
|
31
|
+
# Deprecated fields for backwards compatibility
|
32
|
+
html: Optional[GenerationStrategy] = None # Deprecated: Use format=SegmentFormat.HTML and strategy instead
|
33
|
+
markdown: Optional[GenerationStrategy] = None # Deprecated: Use format=SegmentFormat.MARKDOWN and strategy instead
|
27
34
|
|
28
35
|
class SegmentProcessing(BaseModel):
|
29
36
|
model_config = ConfigDict(populate_by_name=True, alias_generator=str.title)
|
@@ -244,9 +251,45 @@ class SegmentType(str, Enum):
|
|
244
251
|
TEXT = "Text"
|
245
252
|
TITLE = "Title"
|
246
253
|
|
254
|
+
class Alignment(str, Enum):
|
255
|
+
LEFT = "Left"
|
256
|
+
CENTER = "Center"
|
257
|
+
RIGHT = "Right"
|
258
|
+
JUSTIFY = "Justify"
|
259
|
+
|
260
|
+
class VerticalAlignment(str, Enum):
|
261
|
+
TOP = "Top"
|
262
|
+
MIDDLE = "Middle"
|
263
|
+
BOTTOM = "Bottom"
|
264
|
+
BASELINE = "Baseline"
|
265
|
+
|
266
|
+
class CellStyle(BaseModel):
|
267
|
+
bg_color: Optional[str] = None
|
268
|
+
text_color: Optional[str] = None
|
269
|
+
font_face: Optional[str] = None
|
270
|
+
is_bold: Optional[bool] = None
|
271
|
+
align: Optional[Alignment] = None
|
272
|
+
valign: Optional[VerticalAlignment] = None
|
273
|
+
|
274
|
+
class Cell(BaseModel):
|
275
|
+
cell_id: str
|
276
|
+
text: str
|
277
|
+
range: str
|
278
|
+
formula: Optional[str] = None
|
279
|
+
value: Optional[str] = None
|
280
|
+
hyperlink: Optional[str] = None
|
281
|
+
style: Optional[CellStyle] = None
|
282
|
+
|
283
|
+
class Page(BaseModel):
|
284
|
+
image: str
|
285
|
+
page_number: int
|
286
|
+
page_height: float
|
287
|
+
page_width: float
|
288
|
+
ss_sheet_name: Optional[str] = None
|
289
|
+
|
247
290
|
class Segment(BaseModel):
|
248
291
|
bbox: BoundingBox
|
249
|
-
content: str
|
292
|
+
content: str = ""
|
250
293
|
page_height: float
|
251
294
|
llm: Optional[str] = None
|
252
295
|
html: Optional[str] = None
|
@@ -258,6 +301,16 @@ class Segment(BaseModel):
|
|
258
301
|
segment_id: str
|
259
302
|
segment_type: SegmentType
|
260
303
|
confidence: Optional[float]
|
304
|
+
text: str = ""
|
305
|
+
segment_length: Optional[int] = None
|
306
|
+
# Spreadsheet-specific fields
|
307
|
+
ss_cells: Optional[List[Cell]] = None
|
308
|
+
ss_header_bbox: Optional[BoundingBox] = None
|
309
|
+
ss_header_ocr: Optional[List[OCRResult]] = None
|
310
|
+
ss_header_text: Optional[str] = None
|
311
|
+
ss_header_range: Optional[str] = None
|
312
|
+
ss_range: Optional[str] = None
|
313
|
+
ss_sheet_name: Optional[str] = None
|
261
314
|
|
262
315
|
class Chunk(BaseModel):
|
263
316
|
chunk_id: str
|
@@ -268,6 +321,8 @@ class Chunk(BaseModel):
|
|
268
321
|
class OutputResponse(BaseModel):
|
269
322
|
chunks: List[Chunk]
|
270
323
|
file_name: Optional[str]
|
324
|
+
mime_type: Optional[str] = None
|
325
|
+
pages: Optional[List[Page]] = None
|
271
326
|
page_count: Optional[int]
|
272
327
|
pdf_url: Optional[str]
|
273
328
|
|
chunkr_ai/models.py
CHANGED
@@ -1,5 +1,8 @@
|
|
1
1
|
from .api.configuration import (
|
2
|
+
Alignment,
|
2
3
|
BoundingBox,
|
4
|
+
Cell,
|
5
|
+
CellStyle,
|
3
6
|
Chunk,
|
4
7
|
ChunkProcessing,
|
5
8
|
Configuration,
|
@@ -14,7 +17,9 @@ from .api.configuration import (
|
|
14
17
|
OCRResult,
|
15
18
|
OcrStrategy,
|
16
19
|
OutputResponse,
|
20
|
+
Page,
|
17
21
|
Segment,
|
22
|
+
SegmentFormat,
|
18
23
|
SegmentProcessing,
|
19
24
|
SegmentType,
|
20
25
|
SegmentationStrategy,
|
@@ -22,11 +27,15 @@ from .api.configuration import (
|
|
22
27
|
Pipeline,
|
23
28
|
Tokenizer,
|
24
29
|
TokenizerType,
|
30
|
+
VerticalAlignment,
|
25
31
|
)
|
26
32
|
from .api.task_response import TaskResponse
|
27
33
|
|
28
34
|
__all__ = [
|
35
|
+
"Alignment",
|
29
36
|
"BoundingBox",
|
37
|
+
"Cell",
|
38
|
+
"CellStyle",
|
30
39
|
"Chunk",
|
31
40
|
"ChunkProcessing",
|
32
41
|
"Configuration",
|
@@ -41,7 +50,9 @@ __all__ = [
|
|
41
50
|
"OCRResult",
|
42
51
|
"OcrStrategy",
|
43
52
|
"OutputResponse",
|
53
|
+
"Page",
|
44
54
|
"Segment",
|
55
|
+
"SegmentFormat",
|
45
56
|
"SegmentProcessing",
|
46
57
|
"SegmentType",
|
47
58
|
"SegmentationStrategy",
|
@@ -50,4 +61,5 @@ __all__ = [
|
|
50
61
|
"Pipeline",
|
51
62
|
"Tokenizer",
|
52
63
|
"TokenizerType",
|
64
|
+
"VerticalAlignment",
|
53
65
|
]
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: chunkr-ai
|
3
|
-
Version: 0.0.50
|
3
|
+
Version: 0.3.0
|
4
4
|
Summary: Python client for Chunkr: open source document intelligence
|
5
5
|
Author-email: Ishaan Kapoor <ishaan@lumina.sh>
|
6
6
|
License: MIT License
|
@@ -28,6 +28,7 @@ Project-URL: Homepage, https://chunkr.ai
|
|
28
28
|
Description-Content-Type: text/markdown
|
29
29
|
License-File: LICENSE
|
30
30
|
Requires-Dist: httpx>=0.25.0
|
31
|
+
Requires-Dist: matplotlib>=3.10.3
|
31
32
|
Requires-Dist: nest-asyncio>=1.6.0
|
32
33
|
Requires-Dist: pillow>=10.0.0
|
33
34
|
Requires-Dist: pydantic>=2.0.0
|
@@ -1,16 +1,16 @@
|
|
1
1
|
chunkr_ai/__init__.py,sha256=6KpYv2lmD6S5z2kc9pqwuLP5VDHmOuu2qDZArUIhb1s,53
|
2
|
-
chunkr_ai/models.py,sha256=
|
2
|
+
chunkr_ai/models.py,sha256=NvFJOpsgzEyYHhE-flp7Yr9tpTDvFmF4T87jttFRquU,1202
|
3
3
|
chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
4
|
chunkr_ai/api/auth.py,sha256=0RSNFPvHt4Nrg8qtP2xvA2KbR0J_KUe1B_tKynbq9Fc,436
|
5
5
|
chunkr_ai/api/chunkr.py,sha256=uSNYtB_mcs4-QRKsX7wZb8yv6ayXgRrJSDNZ-EbAyvc,3857
|
6
6
|
chunkr_ai/api/chunkr_base.py,sha256=8roSPoCADmaXM2r7zz2iHfZzIcY9NopOfa4j-dfk8RA,6310
|
7
|
-
chunkr_ai/api/configuration.py,sha256=
|
7
|
+
chunkr_ai/api/configuration.py,sha256=y_jd3K5GB-P8N3uym4wqHDVq-Rq-VT_bhqJqgKs0PVg,11586
|
8
8
|
chunkr_ai/api/decorators.py,sha256=w1l_ZEkl99C-BO3qRTbi74sYwHDFspB1Bjt1Arv9lPc,4384
|
9
9
|
chunkr_ai/api/misc.py,sha256=AaGLxZlMzNgVPwErskDRKc2UVGkC0JwxLXU-enPwzA0,5354
|
10
10
|
chunkr_ai/api/protocol.py,sha256=LjPrYSq52m1afIlAo0yVGXlGZxPRh8J6g7S4PAit3Zo,388
|
11
11
|
chunkr_ai/api/task_response.py,sha256=VYa62E08VlZUyjn2YslnY4cohdK9e53HbEzsaYIXKXM,8028
|
12
|
-
chunkr_ai-0.0.
|
13
|
-
chunkr_ai-0.0.
|
14
|
-
chunkr_ai-0.0.
|
15
|
-
chunkr_ai-0.0.
|
16
|
-
chunkr_ai-0.0.
|
12
|
+
chunkr_ai-0.3.0.dist-info/licenses/LICENSE,sha256=w3R12yNDyZpMiy2lxy_hvNbsldC75ww79sF0u11rkho,1069
|
13
|
+
chunkr_ai-0.3.0.dist-info/METADATA,sha256=RCgp4cjj3CduWuM1ycLczz7iBHohtaEXLv8CZseephw,7086
|
14
|
+
chunkr_ai-0.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
15
|
+
chunkr_ai-0.3.0.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
|
16
|
+
chunkr_ai-0.3.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|