chunkr-ai 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunkr_ai/api/chunkr.py +5 -5
- chunkr_ai/api/configuration.py +10 -8
- chunkr_ai/api/decorators.py +5 -3
- chunkr_ai/api/misc.py +6 -6
- chunkr_ai/api/task_response.py +1 -1
- {chunkr_ai-0.3.1.dist-info → chunkr_ai-0.3.2.dist-info}/METADATA +1 -1
- chunkr_ai-0.3.2.dist-info/RECORD +16 -0
- chunkr_ai-0.3.1.dist-info/RECORD +0 -16
- {chunkr_ai-0.3.1.dist-info → chunkr_ai-0.3.2.dist-info}/WHEEL +0 -0
- {chunkr_ai-0.3.1.dist-info → chunkr_ai-0.3.2.dist-info}/licenses/LICENSE +0 -0
- {chunkr_ai-0.3.1.dist-info → chunkr_ai-0.3.2.dist-info}/top_level.txt +0 -0
chunkr_ai/api/chunkr.py
CHANGED
@@ -42,7 +42,7 @@ class Chunkr(ChunkrBase):
|
|
42
42
|
data = await prepare_upload_data(file, filename, config)
|
43
43
|
assert self._client is not None
|
44
44
|
r = await self._client.post(
|
45
|
-
f"{self.url}/
|
45
|
+
f"{self.url}/task/parse", json=data, headers=self._headers()
|
46
46
|
)
|
47
47
|
r.raise_for_status()
|
48
48
|
return TaskResponse(**r.json()).with_client(cast(ChunkrClientProtocol, self), True, False)
|
@@ -55,7 +55,7 @@ class Chunkr(ChunkrBase):
|
|
55
55
|
data = await prepare_upload_data(None, None, config)
|
56
56
|
assert self._client is not None
|
57
57
|
r = await self._client.patch(
|
58
|
-
f"{self.url}/
|
58
|
+
f"{self.url}/task/{task_id}/parse",
|
59
59
|
json=data,
|
60
60
|
headers=self._headers(),
|
61
61
|
)
|
@@ -71,7 +71,7 @@ class Chunkr(ChunkrBase):
|
|
71
71
|
}
|
72
72
|
assert self._client is not None
|
73
73
|
r = await self._client.get(
|
74
|
-
f"{self.url}/
|
74
|
+
f"{self.url}/task/{task_id}",
|
75
75
|
params=params,
|
76
76
|
headers=self._headers()
|
77
77
|
)
|
@@ -83,7 +83,7 @@ class Chunkr(ChunkrBase):
|
|
83
83
|
async def delete_task(self, task_id: str) -> None:
|
84
84
|
assert self._client is not None
|
85
85
|
r = await self._client.delete(
|
86
|
-
f"{self.url}/
|
86
|
+
f"{self.url}/task/{task_id}", headers=self._headers()
|
87
87
|
)
|
88
88
|
r.raise_for_status()
|
89
89
|
|
@@ -92,7 +92,7 @@ class Chunkr(ChunkrBase):
|
|
92
92
|
async def cancel_task(self, task_id: str) -> None:
|
93
93
|
assert self._client is not None
|
94
94
|
r = await self._client.get(
|
95
|
-
f"{self.url}/
|
95
|
+
f"{self.url}/task/{task_id}/cancel", headers=self._headers()
|
96
96
|
)
|
97
97
|
r.raise_for_status()
|
98
98
|
|
chunkr_ai/api/configuration.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
from pydantic import BaseModel, Field, ConfigDict
|
2
2
|
from enum import Enum
|
3
|
-
from typing import
|
3
|
+
from typing import List, Optional, Union
|
4
4
|
from pydantic import field_validator, field_serializer
|
5
5
|
|
6
6
|
class CroppingStrategy(str, Enum):
|
@@ -20,15 +20,17 @@ class EmbedSource(str, Enum):
|
|
20
20
|
class GenerationStrategy(str, Enum):
|
21
21
|
LLM = "LLM"
|
22
22
|
AUTO = "Auto"
|
23
|
+
IGNORE = "Ignore"
|
23
24
|
|
24
25
|
class GenerationConfig(BaseModel):
|
25
26
|
format: Optional[SegmentFormat] = None
|
26
27
|
strategy: Optional[GenerationStrategy] = None
|
27
|
-
llm: Optional[str] = None
|
28
28
|
crop_image: Optional[CroppingStrategy] = None
|
29
|
-
embed_sources: Optional[List[EmbedSource]] = None
|
30
29
|
extended_context: Optional[bool] = None
|
30
|
+
description: Optional[bool] = None
|
31
31
|
# Deprecated fields for backwards compatibility
|
32
|
+
llm: Optional[str] = None # Deprecated
|
33
|
+
embed_sources: Optional[List[EmbedSource]] = None # Deprecated
|
32
34
|
html: Optional[GenerationStrategy] = None # Deprecated: Use format=SegmentFormat.HTML and strategy instead
|
33
35
|
markdown: Optional[GenerationStrategy] = None # Deprecated: Use format=SegmentFormat.MARKDOWN and strategy instead
|
34
36
|
|
@@ -83,7 +85,7 @@ class TokenizerType(BaseModel):
|
|
83
85
|
return {}
|
84
86
|
|
85
87
|
class ChunkProcessing(BaseModel):
|
86
|
-
ignore_headers_and_footers: Optional[bool] = True
|
88
|
+
ignore_headers_and_footers: Optional[bool] = True # Deprecated
|
87
89
|
target_length: Optional[int] = None
|
88
90
|
tokenizer: Optional[Union[TokenizerType, Tokenizer, str]] = None
|
89
91
|
|
@@ -286,6 +288,7 @@ class Page(BaseModel):
|
|
286
288
|
page_height: float
|
287
289
|
page_width: float
|
288
290
|
ss_sheet_name: Optional[str] = None
|
291
|
+
dpi: Optional[float] = None
|
289
292
|
|
290
293
|
class Segment(BaseModel):
|
291
294
|
bbox: BoundingBox
|
@@ -303,6 +306,8 @@ class Segment(BaseModel):
|
|
303
306
|
confidence: Optional[float]
|
304
307
|
text: str = ""
|
305
308
|
segment_length: Optional[int] = None
|
309
|
+
embed: Optional[str] = None
|
310
|
+
description: Optional[str] = None
|
306
311
|
# Spreadsheet-specific fields
|
307
312
|
ss_cells: Optional[List[Cell]] = None
|
308
313
|
ss_header_bbox: Optional[BoundingBox] = None
|
@@ -317,6 +322,7 @@ class Chunk(BaseModel):
|
|
317
322
|
chunk_length: int
|
318
323
|
segments: List[Segment]
|
319
324
|
embed: Optional[str] = None
|
325
|
+
content: Optional[str] = None
|
320
326
|
|
321
327
|
class OutputResponse(BaseModel):
|
322
328
|
chunks: List[Chunk]
|
@@ -347,10 +353,6 @@ class Configuration(BaseModel):
|
|
347
353
|
|
348
354
|
class OutputConfiguration(Configuration):
|
349
355
|
input_file_url: Optional[str] = None
|
350
|
-
# Deprecated
|
351
|
-
json_schema: Optional[Any] = None
|
352
|
-
model: Optional[Model] = None
|
353
|
-
target_chunk_length: Optional[int] = None
|
354
356
|
|
355
357
|
class Status(str, Enum):
|
356
358
|
STARTING = "Starting"
|
chunkr_ai/api/decorators.py
CHANGED
@@ -2,10 +2,12 @@ import asyncio
|
|
2
2
|
import functools
|
3
3
|
import httpx
|
4
4
|
import nest_asyncio
|
5
|
-
from typing import Callable, Any, TypeVar, Awaitable, Union
|
6
|
-
|
5
|
+
from typing import Callable, Any, TypeVar, Awaitable, Union
|
6
|
+
import sys
|
7
|
+
|
8
|
+
if sys.version_info >= (3, 10):
|
7
9
|
from typing import ParamSpec
|
8
|
-
|
10
|
+
else:
|
9
11
|
from typing_extensions import ParamSpec
|
10
12
|
|
11
13
|
T = TypeVar('T')
|
chunkr_ai/api/misc.py
CHANGED
@@ -3,7 +3,7 @@ import base64
|
|
3
3
|
import io
|
4
4
|
from pathlib import Path
|
5
5
|
from PIL import Image
|
6
|
-
from typing import Union, Tuple, BinaryIO, Optional
|
6
|
+
from typing import Union, Tuple, BinaryIO, Optional
|
7
7
|
|
8
8
|
async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image, bytes, bytearray, memoryview]) -> Tuple[Optional[str], str]:
|
9
9
|
"""Convert various file types into a tuple of (filename, file content).
|
@@ -39,7 +39,7 @@ async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image, bytes, byte
|
|
39
39
|
base64.b64decode(potential_base64)
|
40
40
|
# If we get here, it was a valid base64 string in bytes form
|
41
41
|
return None, potential_base64
|
42
|
-
except:
|
42
|
+
except Exception:
|
43
43
|
# Not a base64 string in bytes form, encode it as base64
|
44
44
|
base64_str = base64.b64encode(file_bytes).decode()
|
45
45
|
return None, base64_str
|
@@ -66,14 +66,14 @@ async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image, bytes, byte
|
|
66
66
|
# Just test if it's valid base64, don't store the result
|
67
67
|
base64.b64decode(file)
|
68
68
|
return None, file
|
69
|
-
except:
|
69
|
+
except Exception:
|
70
70
|
raise ValueError(f"File not found: {file} and it's not a valid base64 string")
|
71
71
|
except Exception as e:
|
72
72
|
# If string can't be converted to Path or decoded as base64, it might still be a base64 string
|
73
73
|
try:
|
74
|
-
base64.b64decode(file)
|
75
|
-
return None, file
|
76
|
-
except:
|
74
|
+
base64.b64decode(str(file))
|
75
|
+
return None, str(file)
|
76
|
+
except Exception:
|
77
77
|
raise ValueError(f"Unable to process file: {e}")
|
78
78
|
|
79
79
|
# Handle file paths - convert to base64
|
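chunkr_ai/api/task_response.py
CHANGED
[hunk (+1 -1) not captured in this extract]
{chunkr_ai-0.3.1.dist-info → chunkr_ai-0.3.2.dist-info}/METADATA
CHANGED
[hunk (+1 -1) not captured in this extract]
chunkr_ai-0.3.2.dist-info/RECORD
ADDED
@@ -0,0 +1,16 @@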
|
|
1
|
+
chunkr_ai/__init__.py,sha256=6KpYv2lmD6S5z2kc9pqwuLP5VDHmOuu2qDZArUIhb1s,53
|
2
|
+
chunkr_ai/models.py,sha256=NvFJOpsgzEyYHhE-flp7Yr9tpTDvFmF4T87jttFRquU,1202
|
3
|
+
chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
|
+
chunkr_ai/api/auth.py,sha256=0RSNFPvHt4Nrg8qtP2xvA2KbR0J_KUe1B_tKynbq9Fc,436
|
5
|
+
chunkr_ai/api/chunkr.py,sha256=3QAlZeq8zbiHp1HxgWpBBUAmvjabD9iBZxIkuGVsKJk,3822
|
6
|
+
chunkr_ai/api/chunkr_base.py,sha256=8roSPoCADmaXM2r7zz2iHfZzIcY9NopOfa4j-dfk8RA,6310
|
7
|
+
chunkr_ai/api/configuration.py,sha256=PkoSdzEE4v1LdeQ_ziJHk02RBrcTxaGHDA5o49taeAo,11682
|
8
|
+
chunkr_ai/api/decorators.py,sha256=B-neL5d4N-skq2rjnOfaCVSTz6HEye6udcykacbv7G4,4399
|
9
|
+
chunkr_ai/api/misc.py,sha256=pNjbiD5reMdDSkjNTWHn0VgTVsGYn0fl751WuRtSkL8,5389
|
10
|
+
chunkr_ai/api/protocol.py,sha256=LjPrYSq52m1afIlAo0yVGXlGZxPRh8J6g7S4PAit3Zo,388
|
11
|
+
chunkr_ai/api/task_response.py,sha256=omnKkACjN3ijnbG6UncHLI_IDbsaJ1wHu1g4X9JbijU,8017
|
12
|
+
chunkr_ai-0.3.2.dist-info/licenses/LICENSE,sha256=w3R12yNDyZpMiy2lxy_hvNbsldC75ww79sF0u11rkho,1069
|
13
|
+
chunkr_ai-0.3.2.dist-info/METADATA,sha256=MXEHbelRC7E0jrdI-y9jHQ93i-1B0_Hs1ALO0Mmqx1o,7086
|
14
|
+
chunkr_ai-0.3.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
15
|
+
chunkr_ai-0.3.2.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
|
16
|
+
chunkr_ai-0.3.2.dist-info/RECORD,,
|
chunkr_ai-0.3.1.dist-info/RECORD
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
chunkr_ai/__init__.py,sha256=6KpYv2lmD6S5z2kc9pqwuLP5VDHmOuu2qDZArUIhb1s,53
|
2
|
-
chunkr_ai/models.py,sha256=NvFJOpsgzEyYHhE-flp7Yr9tpTDvFmF4T87jttFRquU,1202
|
3
|
-
chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
|
-
chunkr_ai/api/auth.py,sha256=0RSNFPvHt4Nrg8qtP2xvA2KbR0J_KUe1B_tKynbq9Fc,436
|
5
|
-
chunkr_ai/api/chunkr.py,sha256=uSNYtB_mcs4-QRKsX7wZb8yv6ayXgRrJSDNZ-EbAyvc,3857
|
6
|
-
chunkr_ai/api/chunkr_base.py,sha256=8roSPoCADmaXM2r7zz2iHfZzIcY9NopOfa4j-dfk8RA,6310
|
7
|
-
chunkr_ai/api/configuration.py,sha256=y_jd3K5GB-P8N3uym4wqHDVq-Rq-VT_bhqJqgKs0PVg,11586
|
8
|
-
chunkr_ai/api/decorators.py,sha256=w1l_ZEkl99C-BO3qRTbi74sYwHDFspB1Bjt1Arv9lPc,4384
|
9
|
-
chunkr_ai/api/misc.py,sha256=AaGLxZlMzNgVPwErskDRKc2UVGkC0JwxLXU-enPwzA0,5354
|
10
|
-
chunkr_ai/api/protocol.py,sha256=LjPrYSq52m1afIlAo0yVGXlGZxPRh8J6g7S4PAit3Zo,388
|
11
|
-
chunkr_ai/api/task_response.py,sha256=VYa62E08VlZUyjn2YslnY4cohdK9e53HbEzsaYIXKXM,8028
|
12
|
-
chunkr_ai-0.3.1.dist-info/licenses/LICENSE,sha256=w3R12yNDyZpMiy2lxy_hvNbsldC75ww79sF0u11rkho,1069
|
13
|
-
chunkr_ai-0.3.1.dist-info/METADATA,sha256=_Lg59OcvE1hpsbc3zg20yQFGQ2bpAqOXSx_o6_1UlzY,7086
|
14
|
-
chunkr_ai-0.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
15
|
-
chunkr_ai-0.3.1.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
|
16
|
-
chunkr_ai-0.3.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|