chunkr-ai 0.0.23__py3-none-any.whl → 0.0.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
chunkr_ai/api/chunkr.py CHANGED
@@ -2,8 +2,8 @@ from pathlib import Path
2
2
  from PIL import Image
3
3
  from typing import Union, BinaryIO
4
4
 
5
- from .config import Configuration
6
- from .decorators import anywhere, ensure_client
5
+ from .configuration import Configuration
6
+ from .decorators import anywhere, ensure_client, retry_on_429
7
7
  from .misc import prepare_upload_data
8
8
  from .task_response import TaskResponse
9
9
  from .chunkr_base import ChunkrBase
@@ -29,6 +29,7 @@ class Chunkr(ChunkrBase):
29
29
 
30
30
  @anywhere()
31
31
  @ensure_client()
32
+ @retry_on_429()
32
33
  async def create_task(
33
34
  self,
34
35
  file: Union[str, Path, BinaryIO, Image.Image],
@@ -39,10 +40,11 @@ class Chunkr(ChunkrBase):
39
40
  f"{self.url}/api/v1/task", files=files, headers=self._headers()
40
41
  )
41
42
  r.raise_for_status()
42
- return TaskResponse(**r.json()).with_client(self)
43
+ return TaskResponse(**r.json()).with_client(self, True, False)
43
44
 
44
45
  @anywhere()
45
46
  @ensure_client()
47
+ @retry_on_429()
46
48
  async def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
47
49
  files = await prepare_upload_data(None, config, self._client)
48
50
  r = await self._client.patch(
@@ -51,16 +53,22 @@ class Chunkr(ChunkrBase):
51
53
  headers=self._headers(),
52
54
  )
53
55
  r.raise_for_status()
54
- return TaskResponse(**r.json()).with_client(self)
56
+ return TaskResponse(**r.json()).with_client(self, True, False)
55
57
 
56
58
  @anywhere()
57
59
  @ensure_client()
58
- async def get_task(self, task_id: str) -> TaskResponse:
60
+ async def get_task(self, task_id: str, include_chunks: bool = True, base64_urls: bool = False) -> TaskResponse:
61
+ params = {
62
+ "base64_urls": str(base64_urls).lower(),
63
+ "include_chunks": str(include_chunks).lower()
64
+ }
59
65
  r = await self._client.get(
60
- f"{self.url}/api/v1/task/{task_id}", headers=self._headers()
66
+ f"{self.url}/api/v1/task/{task_id}",
67
+ params=params,
68
+ headers=self._headers()
61
69
  )
62
70
  r.raise_for_status()
63
- return TaskResponse(**r.json()).with_client(self)
71
+ return TaskResponse(**r.json()).with_client(self, include_chunks, base64_urls)
64
72
 
65
73
  @anywhere()
66
74
  @ensure_client()
@@ -1,4 +1,4 @@
1
- from .config import Configuration
1
+ from .configuration import Configuration
2
2
  from .task_response import TaskResponse
3
3
  from .auth import HeadersMixin
4
4
  from abc import abstractmethod
@@ -11,12 +11,20 @@ from typing import BinaryIO, Union
11
11
 
12
12
 
13
13
  class ChunkrBase(HeadersMixin):
14
- """Base class with shared functionality for Chunkr API clients."""
15
-
16
- def __init__(self, url: str = None, api_key: str = None):
14
+ """Base class with shared functionality for Chunkr API clients.
15
+
16
+ Args:
17
+ url: The base URL of the Chunkr API. Defaults to the value of the CHUNKR_URL environment variable, or "https://api.chunkr.ai" if not set.
18
+ api_key: The API key to use for authentication. Defaults to the value of the CHUNKR_API_KEY environment variable, or None if not set.
19
+ raise_on_failure: Whether to raise an exception if the task fails. Defaults to False.
20
+ """
21
+
22
+ def __init__(self, url: str = None, api_key: str = None, raise_on_failure: bool = False):
17
23
  load_dotenv()
18
24
  self.url = url or os.getenv("CHUNKR_URL") or "https://api.chunkr.ai"
19
25
  self._api_key = api_key or os.getenv("CHUNKR_API_KEY")
26
+ self.raise_on_failure = raise_on_failure
27
+
20
28
  if not self._api_key:
21
29
  raise ValueError(
22
30
  "API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai"
@@ -131,11 +139,13 @@ class ChunkrBase(HeadersMixin):
131
139
  pass
132
140
 
133
141
  @abstractmethod
134
- def get_task(self, task_id: str) -> TaskResponse:
142
+ def get_task(self, task_id: str, include_chunks: bool = True, base64_urls: bool = False) -> TaskResponse:
135
143
  """Get a task response by its ID.
136
144
 
137
145
  Args:
138
146
  task_id: The ID of the task to get
147
+ include_chunks: Whether to include chunks in the output response. Defaults to True.
148
+ base64_urls: Whether to return base64 encoded URLs. If false, the URLs will be returned as presigned URLs. Defaults to False.
139
149
 
140
150
  Returns:
141
151
  TaskResponse: The task response
@@ -1,6 +1,6 @@
1
- from pydantic import BaseModel, Field, model_validator, ConfigDict
1
+ from pydantic import BaseModel, Field, ConfigDict
2
2
  from enum import Enum
3
- from typing import Optional, List, Dict, Union, Type
3
+ from typing import Any, List, Optional
4
4
 
5
5
  class GenerationStrategy(str, Enum):
6
6
  LLM = "LLM"
@@ -37,16 +37,6 @@ class SegmentProcessing(BaseModel):
37
37
  class ChunkProcessing(BaseModel):
38
38
  target_length: Optional[int] = None
39
39
 
40
- class Property(BaseModel):
41
- name: str
42
- prop_type: str
43
- description: Optional[str] = None
44
- default: Optional[str] = None
45
-
46
- class JsonSchema(BaseModel):
47
- title: str
48
- properties: List[Property]
49
-
50
40
  class OcrStrategy(str, Enum):
51
41
  ALL = "All"
52
42
  AUTO = "Auto"
@@ -98,9 +88,6 @@ class Chunk(BaseModel):
98
88
  chunk_length: int
99
89
  segments: List[Segment]
100
90
 
101
- class ExtractedJson(BaseModel):
102
- data: Dict
103
-
104
91
  class OutputResponse(BaseModel):
105
92
  chunks: List[Chunk]
106
93
  file_name: Optional[str]
@@ -118,7 +105,6 @@ class Configuration(BaseModel):
118
105
  chunk_processing: Optional[ChunkProcessing] = None
119
106
  expires_in: Optional[int] = None
120
107
  high_resolution: Optional[bool] = None
121
- model: Optional[Model] = None
122
108
  ocr_strategy: Optional[OcrStrategy] = None
123
109
  segment_processing: Optional[SegmentProcessing] = None
124
110
  segmentation_strategy: Optional[SegmentationStrategy] = None
@@ -126,16 +112,10 @@ class Configuration(BaseModel):
126
112
 
127
113
  class OutputConfiguration(Configuration):
128
114
  input_file_url: Optional[str] = None
129
- json_schema: Optional[Union[JsonSchema, Type[BaseModel], BaseModel]] = None
130
-
131
- @model_validator(mode="before")
132
- def map_deprecated_fields(cls, values: Dict) -> Dict:
133
- if isinstance(values, dict) and "target_chunk_length" in values:
134
- target_length = values.pop("target_chunk_length")
135
- if target_length is not None:
136
- values["chunk_processing"] = values.get("chunk_processing", {}) or {}
137
- values["chunk_processing"]["target_length"] = target_length
138
- return values
115
+ # Deprecated
116
+ json_schema: Optional[Any] = None
117
+ model: Optional[Model] = None
118
+ target_chunk_length: Optional[int] = None
139
119
 
140
120
  class Status(str, Enum):
141
121
  STARTING = "Starting"
@@ -59,4 +59,34 @@ def require_task() -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitabl
59
59
  self._client._client = httpx.AsyncClient()
60
60
  return await async_func(self, *args, **kwargs)
61
61
  return wrapper
62
+ return decorator
63
+
64
+ def retry_on_429(max_retries: int = 10, initial_delay: float = 0.5) -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]:
65
+ """Decorator that retries the request when encountering 429 Too Many Requests errors.
66
+
67
+ Args:
68
+ max_retries: Maximum number of retry attempts (default: 10)
69
+ initial_delay: Delay in seconds before each retry; overridden by the response's Retry-After header when it holds a numeric value (default: 0.5)
70
+ """
71
+ def decorator(async_func: Callable[P, Awaitable[T]]) -> Callable[P, Awaitable[T]]:
72
+ @functools.wraps(async_func)
73
+ async def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
74
+ retries = 0
75
+ while True:
76
+ try:
77
+ return await async_func(*args, **kwargs)
78
+ except httpx.HTTPStatusError as e:
79
+ if e.response.status_code != 429 or retries >= max_retries:
80
+ raise
81
+ retries += 1
82
+ delay = initial_delay
83
+ # Use Retry-After header if available
84
+ retry_after = e.response.headers.get('Retry-After')
85
+ if retry_after:
86
+ try:
87
+ delay = float(retry_after)
88
+ except (ValueError, TypeError):
89
+ pass
90
+ await asyncio.sleep(delay)
91
+ return wrapper
62
92
  return decorator
chunkr_ai/api/misc.py CHANGED
@@ -1,4 +1,4 @@
1
- from .config import Configuration
1
+ from .configuration import Configuration
2
2
  import io
3
3
  import json
4
4
  from pathlib import Path
chunkr_ai/api/protocol.py CHANGED
@@ -5,15 +5,10 @@ from httpx import AsyncClient
5
5
  @runtime_checkable
6
6
  class ChunkrClientProtocol(Protocol):
7
7
  """Protocol defining the interface for Chunkr clients"""
8
-
9
- url: str
10
- _api_key: str
8
+
9
+ raise_on_failure: bool = True
11
10
  _client: Optional[AsyncClient] = None
12
11
 
13
- def get_api_key(self) -> str:
14
- """Get the API key"""
15
- ...
16
-
17
12
  def _headers(self) -> dict:
18
13
  """Return headers required for API requests"""
19
14
  ...
@@ -3,10 +3,10 @@ from typing import TypeVar, Optional, Generic
3
3
  from pydantic import BaseModel, PrivateAttr
4
4
  import asyncio
5
5
 
6
- from .config import Configuration, OutputConfiguration, OutputResponse, Status
6
+ from .configuration import Configuration, OutputConfiguration, OutputResponse, Status
7
7
  from .protocol import ChunkrClientProtocol
8
8
  from .misc import prepare_upload_data
9
- from .decorators import anywhere, require_task
9
+ from .decorators import anywhere, require_task, retry_on_429
10
10
 
11
11
  T = TypeVar("T", bound="TaskResponse")
12
12
 
@@ -21,16 +21,22 @@ class TaskResponse(BaseModel, Generic[T]):
21
21
  status: Status
22
22
  task_id: str
23
23
  task_url: Optional[str] = None
24
+ include_chunks: bool = False
25
+ _base64_urls: bool = False
24
26
  _client: Optional[ChunkrClientProtocol] = PrivateAttr(default=None)
25
27
 
26
- def with_client(self, client: ChunkrClientProtocol) -> T:
28
+ def with_client(self, client: ChunkrClientProtocol, include_chunks: bool = False, base64_urls: bool = False) -> T:
27
29
  self._client = client
30
+ self.include_chunks = include_chunks
31
+ self._base64_urls = base64_urls
28
32
  return self
29
33
 
30
34
  def _check_status(self) -> Optional[T]:
31
35
  """Helper method to check task status and handle completion/failure"""
32
36
  if self.status == "Failed":
33
- raise ValueError(self.message)
37
+ if getattr(self._client, 'raise_on_failure', True):
38
+ raise ValueError(self.message)
39
+ return self
34
40
  if self.status not in ("Starting", "Processing"):
35
41
  return self
36
42
  return None
@@ -43,11 +49,12 @@ class TaskResponse(BaseModel, Generic[T]):
43
49
  )
44
50
  r.raise_for_status()
45
51
  return r.json()
46
- except (ConnectionError, TimeoutError) as _:
47
- print("Connection error while polling the task, retrying...")
52
+ except (ConnectionError, TimeoutError, OSError) as e:
53
+ print(f"Connection error while polling the task: {str(e)}, retrying...")
48
54
  await asyncio.sleep(0.5)
49
- except Exception:
50
- raise
55
+ return await self._poll_request()
56
+ except Exception as e:
57
+ raise e
51
58
 
52
59
  @anywhere()
53
60
  async def poll(self) -> T:
@@ -62,6 +69,7 @@ class TaskResponse(BaseModel, Generic[T]):
62
69
 
63
70
  @anywhere()
64
71
  @require_task()
72
+ @retry_on_429()
65
73
  async def update(self, config: Configuration) -> T:
66
74
  """Update the task configuration."""
67
75
  f = await prepare_upload_data(None, config, self._client._client)
chunkr_ai/models.py CHANGED
@@ -1,4 +1,4 @@
1
- from .api.config import (
1
+ from .api.configuration import (
2
2
  BoundingBox,
3
3
  Chunk,
4
4
  ChunkProcessing,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: chunkr-ai
3
- Version: 0.0.23
3
+ Version: 0.0.25
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  License: MIT License
@@ -35,6 +35,7 @@ Provides-Extra: test
35
35
  Requires-Dist: pytest>=7.0.0; extra == "test"
36
36
  Requires-Dist: pytest-xdist>=3.0.0; extra == "test"
37
37
  Requires-Dist: pytest-asyncio>=0.21.0; extra == "test"
38
+ Requires-Dist: ruff>=0.9.3; extra == "test"
38
39
 
39
40
  # Chunkr Python Client
40
41
 
@@ -0,0 +1,16 @@
1
+ chunkr_ai/__init__.py,sha256=6KpYv2lmD6S5z2kc9pqwuLP5VDHmOuu2qDZArUIhb1s,53
2
+ chunkr_ai/models.py,sha256=tOI7ylkhyeFfCLMisk96EPsH4UEcjBx1Mcisxc_AYXI,757
3
+ chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ chunkr_ai/api/auth.py,sha256=hlv0GiUmlsbFO1wLL9sslqOnsBSoBqkL_6Mk2SDvxgE,413
5
+ chunkr_ai/api/chunkr.py,sha256=VnbuAPlWLqyf8xCCU_kpdybgjVPTwZLarDQoD3uozY0,3065
6
+ chunkr_ai/api/chunkr_base.py,sha256=giW56fL7xxJphdOTpIH52dXxpNt7OdP8pNiPSqbNjGM,5835
7
+ chunkr_ai/api/configuration.py,sha256=0wnrKlUIO7opvV963Gr_S8tlAjpo_IkNmbTi1_FwEug,3751
8
+ chunkr_ai/api/decorators.py,sha256=HSq3vcxOeUJkaWaf7HOvCyg9dWkVo8cG5BrU-jhbhmc,4053
9
+ chunkr_ai/api/misc.py,sha256=5PBI6pvOXr0x-3WieSKLrC8MA0iGPa-IG-5FEZ3vnr0,5724
10
+ chunkr_ai/api/protocol.py,sha256=LjPrYSq52m1afIlAo0yVGXlGZxPRh8J6g7S4PAit3Zo,388
11
+ chunkr_ai/api/task_response.py,sha256=CZIa3w5qPvSZDbDJ-LAtg7OOY91LsruemaXNyO2PymI,4256
12
+ chunkr_ai-0.0.25.dist-info/LICENSE,sha256=w3R12yNDyZpMiy2lxy_hvNbsldC75ww79sF0u11rkho,1069
13
+ chunkr_ai-0.0.25.dist-info/METADATA,sha256=CG1cO9YX7TpAHwBXgqLDgF9nwmVv30WLsWzfULx06W4,6996
14
+ chunkr_ai-0.0.25.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
15
+ chunkr_ai-0.0.25.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
16
+ chunkr_ai-0.0.25.dist-info/RECORD,,
chunkr_ai/api/api.py DELETED
File without changes
@@ -1,17 +0,0 @@
1
- chunkr_ai/__init__.py,sha256=6KpYv2lmD6S5z2kc9pqwuLP5VDHmOuu2qDZArUIhb1s,53
2
- chunkr_ai/models.py,sha256=MK8FPbWDj1ynvSHaYuslKCPybxLuAlrsVIM3Eym3kKI,750
3
- chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- chunkr_ai/api/api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- chunkr_ai/api/auth.py,sha256=hlv0GiUmlsbFO1wLL9sslqOnsBSoBqkL_6Mk2SDvxgE,413
6
- chunkr_ai/api/chunkr.py,sha256=XTXJFs0xjYY3w3N4fSQcxtJFBtNfzFYYkh6nDlFz4cY,2714
7
- chunkr_ai/api/chunkr_base.py,sha256=OkycHDHkdGX953_ab0XdYBnPDzSXYE30L3j52hBb8D0,5046
8
- chunkr_ai/api/config.py,sha256=NmPTsDvcjkvNx0gNzDTz-oFG5rQC7jm-H70O_crJCw8,4478
9
- chunkr_ai/api/decorators.py,sha256=y_Z9z0O2XXiX9z6jWDwdbCPdQyMLnjE0pGkJjHQEv_Q,2652
10
- chunkr_ai/api/misc.py,sha256=5Q2K713VPwf3S2519KTzjT9PKhTEBgBMk1d8NNnmpZ0,5717
11
- chunkr_ai/api/protocol.py,sha256=Nt8aWr4ouVwCvoLqVI5vnXJhT2cvxt0sQC-svUk2G5w,458
12
- chunkr_ai/api/task_response.py,sha256=aAx7otuvsp-A0U5EaHRkbnRJMoLI8N4lOMo8bS8emJc,3843
13
- chunkr_ai-0.0.23.dist-info/LICENSE,sha256=w3R12yNDyZpMiy2lxy_hvNbsldC75ww79sF0u11rkho,1069
14
- chunkr_ai-0.0.23.dist-info/METADATA,sha256=QsO__q1V9SJYz2uugyzj_CZpOuPie6AhLfY39hTNaOM,6952
15
- chunkr_ai-0.0.23.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
16
- chunkr_ai-0.0.23.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
17
- chunkr_ai-0.0.23.dist-info/RECORD,,