chunkr-ai 0.0.24__py3-none-any.whl → 0.0.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
chunkr_ai/api/chunkr.py CHANGED
@@ -2,8 +2,8 @@ from pathlib import Path
  from PIL import Image
  from typing import Union, BinaryIO

- from .config import Configuration
- from .decorators import anywhere, ensure_client
+ from .configuration import Configuration
+ from .decorators import anywhere, ensure_client, retry_on_429
  from .misc import prepare_upload_data
  from .task_response import TaskResponse
  from .chunkr_base import ChunkrBase
@@ -29,6 +29,7 @@ class Chunkr(ChunkrBase):

      @anywhere()
      @ensure_client()
+     @retry_on_429()
      async def create_task(
          self,
          file: Union[str, Path, BinaryIO, Image.Image],
@@ -39,10 +40,11 @@ class Chunkr(ChunkrBase):
              f"{self.url}/api/v1/task", files=files, headers=self._headers()
          )
          r.raise_for_status()
-         return TaskResponse(**r.json()).with_client(self)
+         return TaskResponse(**r.json()).with_client(self, True, False)

      @anywhere()
      @ensure_client()
+     @retry_on_429()
      async def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
          files = await prepare_upload_data(None, config, self._client)
          r = await self._client.patch(
@@ -51,16 +53,22 @@
              headers=self._headers(),
          )
          r.raise_for_status()
-         return TaskResponse(**r.json()).with_client(self)
+         return TaskResponse(**r.json()).with_client(self, True, False)

      @anywhere()
      @ensure_client()
-     async def get_task(self, task_id: str) -> TaskResponse:
+     async def get_task(self, task_id: str, include_chunks: bool = True, base64_urls: bool = False) -> TaskResponse:
+         params = {
+             "base64_urls": str(base64_urls).lower(),
+             "include_chunks": str(include_chunks).lower()
+         }
          r = await self._client.get(
-             f"{self.url}/api/v1/task/{task_id}", headers=self._headers()
+             f"{self.url}/api/v1/task/{task_id}",
+             params=params,
+             headers=self._headers()
          )
          r.raise_for_status()
-         return TaskResponse(**r.json()).with_client(self)
+         return TaskResponse(**r.json()).with_client(self, include_chunks, base64_urls)

      @anywhere()
      @ensure_client()
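
The chunkr.py changes above wrap create_task and update_task with @retry_on_429(), and get_task now forwards include_chunks and base64_urls as query parameters and records them on the returned TaskResponse. A minimal usage sketch against this 0.0.26 surface (file name and task handling are illustrative; assumes credentials are configured the same way as in earlier releases and that the synchronous call style provided by @anywhere() still applies):

from chunkr_ai import Chunkr

chunkr = Chunkr()  # assumption: API key resolved from the usual environment/config

# create_task/update_task now retry transparently on HTTP 429 before raising.
task = chunkr.create_task("document.pdf")  # illustrative input file

# get_task accepts include_chunks/base64_urls; the defaults shown in the diff are
# include_chunks=True and base64_urls=False (i.e. presigned URLs).
slim = chunkr.get_task(task.task_id, include_chunks=False)
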
chunkr_ai/api/chunkr_base.py CHANGED
@@ -1,4 +1,4 @@
- from .config import Configuration
+ from .configuration import Configuration
  from .task_response import TaskResponse
  from .auth import HeadersMixin
  from abc import abstractmethod
@@ -139,11 +139,13 @@ class ChunkrBase(HeadersMixin):
          pass

      @abstractmethod
-     def get_task(self, task_id: str) -> TaskResponse:
+     def get_task(self, task_id: str, include_chunks: bool = True, base64_urls: bool = False) -> TaskResponse:
          """Get a task response by its ID.

          Args:
              task_id: The ID of the task to get
+             include_chunks: Whether to include chunks in the output response. Defaults to True.
+             base64_urls: Whether to return base64 encoded URLs. If false, the URLs will be returned as presigned URLs. Defaults to False.

          Returns:
              TaskResponse: The task response
chunkr_ai/api/config.py → chunkr_ai/api/configuration.py RENAMED
@@ -1,6 +1,6 @@
- from pydantic import BaseModel, Field, model_validator, ConfigDict
+ from pydantic import BaseModel, Field, ConfigDict
  from enum import Enum
- from typing import Optional, List, Dict, Union, Type
+ from typing import Any, List, Optional

  class GenerationStrategy(str, Enum):
      LLM = "LLM"
@@ -37,16 +37,6 @@ class SegmentProcessing(BaseModel):
  class ChunkProcessing(BaseModel):
      target_length: Optional[int] = None

- class Property(BaseModel):
-     name: str
-     prop_type: str
-     description: Optional[str] = None
-     default: Optional[str] = None
-
- class JsonSchema(BaseModel):
-     title: str
-     properties: List[Property]
-
  class OcrStrategy(str, Enum):
      ALL = "All"
      AUTO = "Auto"
@@ -98,9 +88,6 @@ class Chunk(BaseModel):
      chunk_length: int
      segments: List[Segment]

- class ExtractedJson(BaseModel):
-     data: Dict
-
  class OutputResponse(BaseModel):
      chunks: List[Chunk]
      file_name: Optional[str]
@@ -118,7 +105,6 @@ class Configuration(BaseModel):
      chunk_processing: Optional[ChunkProcessing] = None
      expires_in: Optional[int] = None
      high_resolution: Optional[bool] = None
-     model: Optional[Model] = None
      ocr_strategy: Optional[OcrStrategy] = None
      segment_processing: Optional[SegmentProcessing] = None
      segmentation_strategy: Optional[SegmentationStrategy] = None
@@ -126,16 +112,10 @@

  class OutputConfiguration(Configuration):
      input_file_url: Optional[str] = None
-     json_schema: Optional[Union[JsonSchema, Type[BaseModel], BaseModel]] = None
-
-     @model_validator(mode="before")
-     def map_deprecated_fields(cls, values: Dict) -> Dict:
-         if isinstance(values, dict) and "target_chunk_length" in values:
-             target_length = values.pop("target_chunk_length")
-             if target_length is not None:
-                 values["chunk_processing"] = values.get("chunk_processing", {}) or {}
-                 values["chunk_processing"]["target_length"] = target_length
-         return values
+     # Deprecated
+     json_schema: Optional[Any] = None
+     model: Optional[Model] = None
+     target_chunk_length: Optional[int] = None

  class Status(str, Enum):
      STARTING = "Starting"
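
The configuration module above drops the JsonSchema, Property and ExtractedJson models along with the target_chunk_length validator; json_schema, model and target_chunk_length survive only as deprecated pass-through fields on OutputConfiguration. A hedged sketch of building a task configuration with the fields that remain on Configuration (assumes these names are still re-exported by chunkr_ai.models, as the models.py diff below suggests; values are illustrative):

from chunkr_ai.models import ChunkProcessing, Configuration, OcrStrategy

config = Configuration(
    chunk_processing=ChunkProcessing(target_length=512),  # replaces the old target_chunk_length
    ocr_strategy=OcrStrategy.AUTO,
    high_resolution=True,
    expires_in=3600,  # seconds, illustrative value
)
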
chunkr_ai/api/decorators.py CHANGED
@@ -59,4 +59,34 @@ def require_task() -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]:
              self._client._client = httpx.AsyncClient()
              return await async_func(self, *args, **kwargs)
          return wrapper
+     return decorator
+
+ def retry_on_429(max_retries: int = 10, initial_delay: float = 0.5) -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]:
+     """Decorator that retries the request when encountering 429 Too Many Requests errors.
+
+     Args:
+         max_retries: Maximum number of retry attempts (default: 10)
+         initial_delay: Delay in seconds between retries when no Retry-After header is sent (default: 0.5)
+     """
+     def decorator(async_func: Callable[P, Awaitable[T]]) -> Callable[P, Awaitable[T]]:
+         @functools.wraps(async_func)
+         async def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
+             retries = 0
+             while True:
+                 try:
+                     return await async_func(*args, **kwargs)
+                 except httpx.HTTPStatusError as e:
+                     if e.response.status_code != 429 or retries >= max_retries:
+                         raise
+                     retries += 1
+                     delay = initial_delay
+                     # Use Retry-After header if available
+                     retry_after = e.response.headers.get('Retry-After')
+                     if retry_after:
+                         try:
+                             delay = float(retry_after)
+                         except (ValueError, TypeError):
+                             pass
+                     await asyncio.sleep(delay)
+         return wrapper
      return decorator
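
The new retry_on_429 decorator only reacts to httpx.HTTPStatusError with status 429, so the wrapped coroutine must call raise_for_status(); it then sleeps for the server's Retry-After value when present, otherwise for the fixed initial_delay, up to max_retries attempts. A standalone sketch of applying it outside the client (function name and URL are illustrative, not part of the package):

import asyncio
import httpx
from chunkr_ai.api.decorators import retry_on_429

@retry_on_429(max_retries=5, initial_delay=0.5)
async def fetch(client: httpx.AsyncClient, url: str) -> str:
    r = await client.get(url)
    r.raise_for_status()  # a 429 surfaces as HTTPStatusError here, triggering the retry loop
    return r.text

async def main() -> None:
    async with httpx.AsyncClient() as client:
        body = await fetch(client, "https://example.com/resource")  # illustrative URL
        print(len(body))

asyncio.run(main())
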
chunkr_ai/api/misc.py CHANGED
@@ -1,4 +1,4 @@
- from .config import Configuration
+ from .configuration import Configuration
  import io
  import json
  from pathlib import Path
chunkr_ai/api/task_response.py CHANGED
@@ -2,11 +2,12 @@ from datetime import datetime
  from typing import TypeVar, Optional, Generic
  from pydantic import BaseModel, PrivateAttr
  import asyncio
+ import json

- from .config import Configuration, OutputConfiguration, OutputResponse, Status
+ from .configuration import Configuration, OutputConfiguration, OutputResponse, Status
  from .protocol import ChunkrClientProtocol
  from .misc import prepare_upload_data
- from .decorators import anywhere, require_task
+ from .decorators import anywhere, require_task, retry_on_429

  T = TypeVar("T", bound="TaskResponse")

@@ -21,10 +22,14 @@ class TaskResponse(BaseModel, Generic[T]):
      status: Status
      task_id: str
      task_url: Optional[str] = None
+     include_chunks: bool = False
+     _base64_urls: bool = False
      _client: Optional[ChunkrClientProtocol] = PrivateAttr(default=None)

-     def with_client(self, client: ChunkrClientProtocol) -> T:
+     def with_client(self, client: ChunkrClientProtocol, include_chunks: bool = False, base64_urls: bool = False) -> T:
          self._client = client
+         self.include_chunks = include_chunks
+         self._base64_urls = base64_urls
          return self

      def _check_status(self) -> Optional[T]:
@@ -45,11 +50,12 @@ class TaskResponse(BaseModel, Generic[T]):
                  )
                  r.raise_for_status()
                  return r.json()
-             except (ConnectionError, TimeoutError) as _:
-                 print("Connection error while polling the task, retrying...")
+             except (ConnectionError, TimeoutError, OSError) as e:
+                 print(f"Connection error while polling the task: {str(e)}, retrying...")
                  await asyncio.sleep(0.5)
-             except Exception:
-                 raise
+                 return await self._poll_request()
+             except Exception as e:
+                 raise e

      @anywhere()
      async def poll(self) -> T:
@@ -64,6 +70,7 @@ class TaskResponse(BaseModel, Generic[T]):

      @anywhere()
      @require_task()
+     @retry_on_429()
      async def update(self, config: Configuration) -> T:
          """Update the task configuration."""
          f = await prepare_upload_data(None, config, self._client._client)
@@ -95,17 +102,59 @@ class TaskResponse(BaseModel, Generic[T]):
          r.raise_for_status()
          return await self.poll()

-     def html(self) -> str:
-         """Get the full HTML of the task"""
-         return self._get_content("html")
+     def html(self, output_file: str = None) -> str:
+         """Get the full HTML of the task
+
+         Args:
+             output_file (str, optional): Path to save the HTML content. Defaults to None.
+         """
+         content = self._get_content("html")
+         if output_file:
+             with open(output_file, "w", encoding="utf-8") as f:
+                 f.write(content)
+         return content

-     def markdown(self) -> str:
-         """Get the full markdown of the task"""
-         return self._get_content("markdown")
+     def markdown(self, output_file: str = None) -> str:
+         """Get the full markdown of the task
+
+         Args:
+             output_file (str, optional): Path to save the markdown content. Defaults to None.
+         """
+         content = self._get_content("markdown")
+         if output_file:
+             with open(output_file, "w", encoding="utf-8") as f:
+                 f.write(content)
+         return content

-     def content(self) -> str:
-         """Get the full content of the task"""
-         return self._get_content("content")
+     def content(self, output_file: str = None) -> str:
+         """Get the full content of the task
+
+         Args:
+             output_file (str, optional): Path to save the content. Defaults to None.
+         """
+         content = self._get_content("content")
+         if output_file:
+             with open(output_file, "w", encoding="utf-8") as f:
+                 f.write(content)
+         return content
+
+     def json(self, output_file: str = None) -> dict:
+         """Get the full task data as JSON
+
+         Args:
+             output_file (str, optional): Path to save the task data as JSON. Defaults to None.
+         """
+         class DateTimeEncoder(json.JSONEncoder):
+             def default(self, obj):
+                 if isinstance(obj, datetime):
+                     return obj.isoformat()
+                 return super().default(obj)
+
+         data = self.model_dump()
+         if output_file:
+             with open(output_file, "w", encoding="utf-8") as f:
+                 json.dump(data, f, cls=DateTimeEncoder, indent=2)
+         return data

      def _get_content(self, t: str) -> str:
          if not self.output:
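
With the task_response.py changes above, html(), markdown() and content() can optionally write their result to a file, and the new json() method returns the full task as a dict (datetimes ISO-formatted) and can likewise persist it. A hedged sketch of the call pattern, continuing from a client and completed task as in the earlier sketch (file names are illustrative):

task = chunkr.get_task(task_id)             # task_id from an earlier create_task call

task.html(output_file="output.html")        # writes the HTML and also returns it
md = task.markdown()                        # no output_file: just returns the markdown string
data = task.json(output_file="task.json")   # dict of the whole task, datetimes ISO-formatted
print(data["status"])
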
chunkr_ai/models.py CHANGED
@@ -1,4 +1,4 @@
- from .api.config import (
+ from .api.configuration import (
      BoundingBox,
      Chunk,
      ChunkProcessing,
chunkr_ai-0.0.24.dist-info/METADATA → chunkr_ai-0.0.26.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: chunkr-ai
- Version: 0.0.24
+ Version: 0.0.26
  Summary: Python client for Chunkr: open source document intelligence
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
  License: MIT License
chunkr_ai-0.0.26.dist-info/RECORD ADDED
@@ -0,0 +1,16 @@
+ chunkr_ai/__init__.py,sha256=6KpYv2lmD6S5z2kc9pqwuLP5VDHmOuu2qDZArUIhb1s,53
+ chunkr_ai/models.py,sha256=tOI7ylkhyeFfCLMisk96EPsH4UEcjBx1Mcisxc_AYXI,757
+ chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ chunkr_ai/api/auth.py,sha256=hlv0GiUmlsbFO1wLL9sslqOnsBSoBqkL_6Mk2SDvxgE,413
+ chunkr_ai/api/chunkr.py,sha256=VnbuAPlWLqyf8xCCU_kpdybgjVPTwZLarDQoD3uozY0,3065
+ chunkr_ai/api/chunkr_base.py,sha256=giW56fL7xxJphdOTpIH52dXxpNt7OdP8pNiPSqbNjGM,5835
+ chunkr_ai/api/configuration.py,sha256=0wnrKlUIO7opvV963Gr_S8tlAjpo_IkNmbTi1_FwEug,3751
+ chunkr_ai/api/decorators.py,sha256=HSq3vcxOeUJkaWaf7HOvCyg9dWkVo8cG5BrU-jhbhmc,4053
+ chunkr_ai/api/misc.py,sha256=5PBI6pvOXr0x-3WieSKLrC8MA0iGPa-IG-5FEZ3vnr0,5724
+ chunkr_ai/api/protocol.py,sha256=LjPrYSq52m1afIlAo0yVGXlGZxPRh8J6g7S4PAit3Zo,388
+ chunkr_ai/api/task_response.py,sha256=lYzR3Oa6HwLmW5Plo5AF4Ky3UMXHU9zcUMRYOHb7Gwg,5805
+ chunkr_ai-0.0.26.dist-info/LICENSE,sha256=w3R12yNDyZpMiy2lxy_hvNbsldC75ww79sF0u11rkho,1069
+ chunkr_ai-0.0.26.dist-info/METADATA,sha256=LcIn-LIE_RsPawnkh26NyU2EGicKOQ1Qf1KsAu0dPuw,6996
+ chunkr_ai-0.0.26.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ chunkr_ai-0.0.26.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
+ chunkr_ai-0.0.26.dist-info/RECORD,,
chunkr_ai/api/api.py DELETED
File without changes
chunkr_ai-0.0.24.dist-info/RECORD DELETED
@@ -1,17 +0,0 @@
- chunkr_ai/__init__.py,sha256=6KpYv2lmD6S5z2kc9pqwuLP5VDHmOuu2qDZArUIhb1s,53
- chunkr_ai/models.py,sha256=MK8FPbWDj1ynvSHaYuslKCPybxLuAlrsVIM3Eym3kKI,750
- chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- chunkr_ai/api/api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- chunkr_ai/api/auth.py,sha256=hlv0GiUmlsbFO1wLL9sslqOnsBSoBqkL_6Mk2SDvxgE,413
- chunkr_ai/api/chunkr.py,sha256=XTXJFs0xjYY3w3N4fSQcxtJFBtNfzFYYkh6nDlFz4cY,2714
- chunkr_ai/api/chunkr_base.py,sha256=4SXA-gdZd1w2zZeeMdy4xog0NKOrKjmo6IMvSl9KSBg,5538
- chunkr_ai/api/config.py,sha256=NmPTsDvcjkvNx0gNzDTz-oFG5rQC7jm-H70O_crJCw8,4478
- chunkr_ai/api/decorators.py,sha256=y_Z9z0O2XXiX9z6jWDwdbCPdQyMLnjE0pGkJjHQEv_Q,2652
- chunkr_ai/api/misc.py,sha256=5Q2K713VPwf3S2519KTzjT9PKhTEBgBMk1d8NNnmpZ0,5717
- chunkr_ai/api/protocol.py,sha256=LjPrYSq52m1afIlAo0yVGXlGZxPRh8J6g7S4PAit3Zo,388
- chunkr_ai/api/task_response.py,sha256=hcHsBgX-2C5Px5Bu0IKk33K_AkqHSEM1Wu2zkcPh9to,3935
- chunkr_ai-0.0.24.dist-info/LICENSE,sha256=w3R12yNDyZpMiy2lxy_hvNbsldC75ww79sF0u11rkho,1069
- chunkr_ai-0.0.24.dist-info/METADATA,sha256=JyDI8EkFaJQQ7vIo2osHxXmeuNqhQ0UWjgUMHSFIYow,6996
- chunkr_ai-0.0.24.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- chunkr_ai-0.0.24.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
- chunkr_ai-0.0.24.dist-info/RECORD,,