chunkr-ai 0.0.17__py3-none-any.whl → 0.0.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
chunkr_ai/__init__.py CHANGED
@@ -1,4 +1,3 @@
1
1
  from .api.chunkr import Chunkr
2
- from .api.chunkr_async import ChunkrAsync
3
2
 
4
- __all__ = ["Chunkr", "ChunkrAsync"]
3
+ __all__ = ["Chunkr"]
chunkr_ai/api/chunkr.py CHANGED
@@ -1,78 +1,85 @@
1
- from .chunkr_base import ChunkrBase
2
- from .config import Configuration
3
- from .task import TaskResponse
4
1
  from pathlib import Path
5
2
  from PIL import Image
6
- import requests
7
3
  from typing import Union, BinaryIO
8
- from .misc import prepare_upload_data
9
4
 
5
+ from .config import Configuration
6
+ from .decorators import anywhere, ensure_client
7
+ from .misc import prepare_upload_data
8
+ from .task_response import TaskResponse
9
+ from .chunkr_base import ChunkrBase
10
10
 
11
11
  class Chunkr(ChunkrBase):
12
- """Chunkr API client"""
13
-
14
- def __init__(self, url: str = None, api_key: str = None):
15
- super().__init__(url, api_key)
16
- self._session = requests.Session()
17
-
18
- def upload(
12
+ """Chunkr API client that works in both sync and async contexts"""
13
+
14
+ @anywhere()
15
+ @ensure_client()
16
+ async def upload(
19
17
  self,
20
18
  file: Union[str, Path, BinaryIO, Image.Image],
21
19
  config: Configuration = None,
22
20
  ) -> TaskResponse:
23
- task = self.create_task(file, config)
24
- return task.poll()
21
+ task = await self.create_task(file, config)
22
+ return await task.poll()
25
23
 
26
- def update(self, task_id: str, config: Configuration) -> TaskResponse:
27
- task = self.update_task(task_id, config)
28
- return task.poll()
24
+ @anywhere()
25
+ @ensure_client()
26
+ async def update(self, task_id: str, config: Configuration) -> TaskResponse:
27
+ task = await self.update_task(task_id, config)
28
+ return await task.poll()
29
29
 
30
- def create_task(
30
+ @anywhere()
31
+ @ensure_client()
32
+ async def create_task(
31
33
  self,
32
34
  file: Union[str, Path, BinaryIO, Image.Image],
33
35
  config: Configuration = None,
34
36
  ) -> TaskResponse:
35
37
  files = prepare_upload_data(file, config)
36
- if not self._session:
37
- raise ValueError("Session not found")
38
- r = self._session.post(
38
+ r = await self._client.post(
39
39
  f"{self.url}/api/v1/task", files=files, headers=self._headers()
40
40
  )
41
41
  r.raise_for_status()
42
42
  return TaskResponse(**r.json()).with_client(self)
43
43
 
44
- def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
44
+ @anywhere()
45
+ @ensure_client()
46
+ async def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
45
47
  files = prepare_upload_data(None, config)
46
- if not self._session:
47
- raise ValueError("Session not found")
48
- r = self._session.patch(
49
- f"{self.url}/api/v1/task/{task_id}", files=files, headers=self._headers()
48
+ r = await self._client.patch(
49
+ f"{self.url}/api/v1/task/{task_id}",
50
+ files=files,
51
+ headers=self._headers(),
50
52
  )
51
-
52
53
  r.raise_for_status()
53
54
  return TaskResponse(**r.json()).with_client(self)
54
55
 
55
- def get_task(self, task_id: str) -> TaskResponse:
56
- if not self._session:
57
- raise ValueError("Session not found")
58
- r = self._session.get(
56
+ @anywhere()
57
+ @ensure_client()
58
+ async def get_task(self, task_id: str) -> TaskResponse:
59
+ r = await self._client.get(
59
60
  f"{self.url}/api/v1/task/{task_id}", headers=self._headers()
60
61
  )
61
62
  r.raise_for_status()
62
63
  return TaskResponse(**r.json()).with_client(self)
63
64
 
64
- def delete_task(self, task_id: str) -> None:
65
- if not self._session:
66
- raise ValueError("Session not found")
67
- r = self._session.delete(
65
+ @anywhere()
66
+ @ensure_client()
67
+ async def delete_task(self, task_id: str) -> None:
68
+ r = await self._client.delete(
68
69
  f"{self.url}/api/v1/task/{task_id}", headers=self._headers()
69
70
  )
70
71
  r.raise_for_status()
71
72
 
72
- def cancel_task(self, task_id: str) -> None:
73
- if not self._session:
74
- raise ValueError("Session not found")
75
- r = self._session.get(
73
+ @ensure_client()
74
+ @anywhere()
75
+ async def cancel_task(self, task_id: str) -> None:
76
+ r = await self._client.get(
76
77
  f"{self.url}/api/v1/task/{task_id}/cancel", headers=self._headers()
77
78
  )
78
79
  r.raise_for_status()
80
+
81
+ @anywhere()
82
+ async def close(self):
83
+ if self._client:
84
+ await self._client.aclose()
85
+ self._client = None
@@ -1,13 +1,16 @@
1
1
  from .config import Configuration
2
- from .task import TaskResponse
3
- from .task_async import TaskResponseAsync
2
+ from .task_response import TaskResponse
4
3
  from .auth import HeadersMixin
5
4
  from abc import abstractmethod
6
5
  from dotenv import load_dotenv
6
+ import httpx
7
+ import io
8
+ import json
7
9
  import os
8
10
  from pathlib import Path
9
11
  from PIL import Image
10
- from typing import BinaryIO, Union
12
+ import requests
13
+ from typing import BinaryIO, Tuple, Union
11
14
 
12
15
 
13
16
  class ChunkrBase(HeadersMixin):
@@ -23,13 +26,138 @@ class ChunkrBase(HeadersMixin):
23
26
  )
24
27
 
25
28
  self.url = self.url.rstrip("/")
29
+ self._client = httpx.AsyncClient()
30
+
31
+ def _prepare_file(
32
+ self, file: Union[str, Path, BinaryIO, Image.Image]
33
+ ) -> Tuple[str, BinaryIO]:
34
+ """Convert various file types into a tuple of (filename, file-like object).
35
+
36
+ Args:
37
+ file: Input file, can be:
38
+ - String or Path to a file
39
+ - URL string starting with http:// or https://
40
+ - Base64 string
41
+ - Opened binary file (mode='rb')
42
+ - PIL/Pillow Image object
43
+
44
+ Returns:
45
+ Tuple[str, BinaryIO]: (filename, file-like object) ready for upload
46
+
47
+ Raises:
48
+ FileNotFoundError: If the file path doesn't exist
49
+ TypeError: If the file type is not supported
50
+ ValueError: If the URL is invalid or unreachable
51
+ ValueError: If the MIME type is unsupported
52
+ """
53
+ # Handle URLs
54
+ if isinstance(file, str) and (
55
+ file.startswith("http://") or file.startswith("https://")
56
+ ):
57
+ response = requests.get(file)
58
+ response.raise_for_status()
59
+ file_obj = io.BytesIO(response.content)
60
+ filename = Path(file.split("/")[-1]).name or "downloaded_file"
61
+ return filename, file_obj
62
+
63
+ # Handle base64 strings
64
+ if isinstance(file, str) and "," in file and ";base64," in file:
65
+ try:
66
+ # Split header and data
67
+ header, base64_data = file.split(",", 1)
68
+ import base64
69
+
70
+ file_bytes = base64.b64decode(base64_data)
71
+ file_obj = io.BytesIO(file_bytes)
72
+
73
+ # Try to determine format from header
74
+ format = "bin"
75
+ mime_type = header.split(":")[-1].split(";")[0].lower()
76
+
77
+ # Map MIME types to file extensions
78
+ mime_to_ext = {
79
+ "application/pdf": "pdf",
80
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
81
+ "application/msword": "doc",
82
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
83
+ "application/vnd.ms-powerpoint": "ppt",
84
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
85
+ "application/vnd.ms-excel": "xls",
86
+ "image/jpeg": "jpg",
87
+ "image/png": "png",
88
+ "image/jpg": "jpg",
89
+ }
90
+
91
+ if mime_type in mime_to_ext:
92
+ format = mime_to_ext[mime_type]
93
+ else:
94
+ raise ValueError(f"Unsupported MIME type: {mime_type}")
95
+
96
+ return f"file.{format}", file_obj
97
+ except Exception as e:
98
+ raise ValueError(f"Invalid base64 string: {str(e)}")
99
+
100
+ # Handle file paths
101
+ if isinstance(file, (str, Path)):
102
+ path = Path(file).resolve()
103
+ if not path.exists():
104
+ raise FileNotFoundError(f"File not found: {file}")
105
+ return path.name, open(path, "rb")
106
+
107
+ # Handle PIL Images
108
+ if isinstance(file, Image.Image):
109
+ img_byte_arr = io.BytesIO()
110
+ format = file.format or "PNG"
111
+ file.save(img_byte_arr, format=format)
112
+ img_byte_arr.seek(0)
113
+ return f"image.{format.lower()}", img_byte_arr
114
+
115
+ # Handle file-like objects
116
+ if hasattr(file, "read") and hasattr(file, "seek"):
117
+ # Try to get the filename from the file object if possible
118
+ name = (
119
+ getattr(file, "name", "document")
120
+ if hasattr(file, "name")
121
+ else "document"
122
+ )
123
+ return Path(name).name, file
124
+
125
+ raise TypeError(f"Unsupported file type: {type(file)}")
126
+
127
+ def _prepare_upload_data(
128
+ self,
129
+ file: Union[str, Path, BinaryIO, Image.Image],
130
+ config: Configuration = None,
131
+ ) -> Tuple[dict, dict]:
132
+ """Prepare files and data dictionaries for upload.
133
+
134
+ Args:
135
+ file: The file to upload
136
+ config: Optional configuration settings
137
+
138
+ Returns:
139
+ Tuple[dict, dict]: (files dict, data dict) ready for upload
140
+ """
141
+ filename, file_obj = self._prepare_file(file)
142
+ files = {"file": (filename, file_obj)}
143
+ data = {}
144
+
145
+ if config:
146
+ config_dict = config.model_dump(mode="json", exclude_none=True)
147
+ for key, value in config_dict.items():
148
+ if isinstance(value, dict):
149
+ files[key] = (None, json.dumps(value), "application/json")
150
+ else:
151
+ data[key] = value
152
+
153
+ return files, data
26
154
 
27
155
  @abstractmethod
28
156
  def upload(
29
157
  self,
30
158
  file: Union[str, Path, BinaryIO, Image.Image],
31
159
  config: Configuration = None,
32
- ) -> Union[TaskResponse, TaskResponseAsync]:
160
+ ) -> TaskResponse:
33
161
  """Upload a file and wait for processing to complete.
34
162
 
35
163
  Args:
@@ -64,7 +192,7 @@ class ChunkrBase(HeadersMixin):
64
192
  @abstractmethod
65
193
  def update(
66
194
  self, task_id: str, config: Configuration
67
- ) -> Union[TaskResponse, TaskResponseAsync]:
195
+ ) -> TaskResponse:
68
196
  """Update a task by its ID and wait for processing to complete.
69
197
 
70
198
  Args:
@@ -81,7 +209,7 @@ class ChunkrBase(HeadersMixin):
81
209
  self,
82
210
  file: Union[str, Path, BinaryIO, Image.Image],
83
211
  config: Configuration = None,
84
- ) -> Union[TaskResponse, TaskResponseAsync]:
212
+ ) -> TaskResponse:
85
213
  """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`.
86
214
 
87
215
  Args:
@@ -117,7 +245,7 @@ class ChunkrBase(HeadersMixin):
117
245
  @abstractmethod
118
246
  def update_task(
119
247
  self, task_id: str, config: Configuration
120
- ) -> Union[TaskResponse, TaskResponseAsync]:
248
+ ) -> TaskResponse:
121
249
  """Update a task by its ID and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`.
122
250
 
123
251
  Args:
@@ -130,7 +258,7 @@ class ChunkrBase(HeadersMixin):
130
258
  pass
131
259
 
132
260
  @abstractmethod
133
- def get_task(self, task_id: str) -> Union[TaskResponse, TaskResponseAsync]:
261
+ def get_task(self, task_id: str) -> TaskResponse:
134
262
  """Get a task response by its ID.
135
263
 
136
264
  Args:
@@ -158,3 +286,9 @@ class ChunkrBase(HeadersMixin):
158
286
  task_id: The ID of the task to cancel
159
287
  """
160
288
  pass
289
+
290
+ @abstractmethod
291
+ def close(self) -> None:
292
+ """Close the client connection.
293
+ This should be called when you're done using the client to properly clean up resources."""
294
+ pass
chunkr_ai/api/config.py CHANGED
@@ -1,26 +1,21 @@
1
1
  from pydantic import BaseModel, Field, model_validator, ConfigDict
2
2
  from enum import Enum
3
3
  from typing import Optional, List, Dict, Union, Type
4
- from .schema import from_pydantic
5
-
6
4
 
7
5
  class GenerationStrategy(str, Enum):
8
6
  LLM = "LLM"
9
7
  AUTO = "Auto"
10
8
 
11
-
12
9
  class CroppingStrategy(str, Enum):
13
10
  ALL = "All"
14
11
  AUTO = "Auto"
15
12
 
16
-
17
13
  class GenerationConfig(BaseModel):
18
14
  html: Optional[GenerationStrategy] = None
19
15
  llm: Optional[str] = None
20
16
  markdown: Optional[GenerationStrategy] = None
21
17
  crop_image: Optional[CroppingStrategy] = None
22
18
 
23
-
24
19
  class SegmentProcessing(BaseModel):
25
20
  model_config = ConfigDict(populate_by_name=True, alias_generator=str.title)
26
21
 
@@ -39,46 +34,38 @@ class SegmentProcessing(BaseModel):
39
34
  page_footer: Optional[GenerationConfig] = Field(default=None, alias="PageFooter")
40
35
  page: Optional[GenerationConfig] = Field(default=None, alias="Page")
41
36
 
42
-
43
37
  class ChunkProcessing(BaseModel):
44
38
  target_length: Optional[int] = None
45
39
 
46
-
47
40
  class Property(BaseModel):
48
41
  name: str
49
42
  prop_type: str
50
43
  description: Optional[str] = None
51
44
  default: Optional[str] = None
52
45
 
53
-
54
46
  class JsonSchema(BaseModel):
55
47
  title: str
56
48
  properties: List[Property]
57
49
 
58
-
59
50
  class OcrStrategy(str, Enum):
60
51
  ALL = "All"
61
52
  AUTO = "Auto"
62
53
 
63
-
64
54
  class SegmentationStrategy(str, Enum):
65
55
  LAYOUT_ANALYSIS = "LayoutAnalysis"
66
56
  PAGE = "Page"
67
57
 
68
-
69
58
  class BoundingBox(BaseModel):
70
59
  left: float
71
60
  top: float
72
61
  width: float
73
62
  height: float
74
63
 
75
-
76
64
  class OCRResult(BaseModel):
77
65
  bbox: BoundingBox
78
66
  text: str
79
67
  confidence: Optional[float]
80
68
 
81
-
82
69
  class SegmentType(str, Enum):
83
70
  CAPTION = "Caption"
84
71
  FOOTNOTE = "Footnote"
@@ -93,7 +80,6 @@ class SegmentType(str, Enum):
93
80
  TEXT = "Text"
94
81
  TITLE = "Title"
95
82
 
96
-
97
83
  class Segment(BaseModel):
98
84
  bbox: BoundingBox
99
85
  content: str
@@ -107,42 +93,41 @@ class Segment(BaseModel):
107
93
  segment_id: str
108
94
  segment_type: SegmentType
109
95
 
110
-
111
96
  class Chunk(BaseModel):
112
97
  chunk_id: str
113
98
  chunk_length: int
114
99
  segments: List[Segment]
115
100
 
116
-
117
101
  class ExtractedJson(BaseModel):
118
102
  data: Dict
119
103
 
120
-
121
104
  class OutputResponse(BaseModel):
122
105
  chunks: List[Chunk]
123
- extracted_json: Optional[ExtractedJson] = Field(default=None)
124
-
106
+ file_name: Optional[str]
107
+ page_count: Optional[int]
108
+ pdf_url: Optional[str]
125
109
 
126
110
  class Model(str, Enum):
127
111
  FAST = "Fast"
128
112
  HIGH_QUALITY = "HighQuality"
129
113
 
130
- class PipelineType(str, Enum):
114
+ class Pipeline(str, Enum):
131
115
  AZURE = "Azure"
132
116
 
133
117
  class Configuration(BaseModel):
134
- chunk_processing: Optional[ChunkProcessing] = Field(default=None)
135
- expires_in: Optional[int] = Field(default=None)
136
- high_resolution: Optional[bool] = Field(default=None)
137
- json_schema: Optional[Union[JsonSchema, Type[BaseModel], BaseModel]] = Field(
138
- default=None
139
- )
140
- model: Optional[Model] = Field(default=None)
141
- ocr_strategy: Optional[OcrStrategy] = Field(default=None)
142
- segment_processing: Optional[SegmentProcessing] = Field(default=None)
143
- segmentation_strategy: Optional[SegmentationStrategy] = Field(default=None)
144
- pipeline: Optional[PipelineType] = Field(default=None)
145
-
118
+ chunk_processing: Optional[ChunkProcessing] = None
119
+ expires_in: Optional[int] = None
120
+ high_resolution: Optional[bool] = None
121
+ model: Optional[Model] = None
122
+ ocr_strategy: Optional[OcrStrategy] = None
123
+ segment_processing: Optional[SegmentProcessing] = None
124
+ segmentation_strategy: Optional[SegmentationStrategy] = None
125
+ pipeline: Optional[Pipeline] = None
126
+
127
+ class OutputConfiguration(Configuration):
128
+ input_file_url: Optional[str] = None
129
+ json_schema: Optional[Union[JsonSchema, Type[BaseModel], BaseModel]] = None
130
+
146
131
  @model_validator(mode="before")
147
132
  def map_deprecated_fields(cls, values: Dict) -> Dict:
148
133
  if isinstance(values, dict) and "target_chunk_length" in values:
@@ -151,19 +136,7 @@ class Configuration(BaseModel):
151
136
  values["chunk_processing"] = values.get("chunk_processing", {}) or {}
152
137
  values["chunk_processing"]["target_length"] = target_length
153
138
  return values
154
-
155
- @model_validator(mode="after")
156
- def convert_json_schema(self) -> "Configuration":
157
- if self.json_schema is not None and not isinstance(
158
- self.json_schema, JsonSchema
159
- ):
160
- if isinstance(self.json_schema, (BaseModel, type)) and issubclass(
161
- getattr(self.json_schema, "__class__", type), BaseModel
162
- ):
163
- self.json_schema = JsonSchema(**from_pydantic(self.json_schema))
164
- return self
165
-
166
-
139
+
167
140
  class Status(str, Enum):
168
141
  STARTING = "Starting"
169
142
  PROCESSING = "Processing"
@@ -0,0 +1,58 @@
1
+ import functools
2
+ import asyncio
3
+ import httpx
4
+ from typing import Callable, Any, TypeVar, Awaitable, ParamSpec, Union, overload
5
+
6
+ T = TypeVar('T')
7
+ P = ParamSpec('P')
8
+
9
+ _sync_loop = None
10
+
11
+ @overload
12
+ def anywhere() -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Union[Awaitable[T], T]]]: ...
13
+
14
+ def anywhere():
15
+ """Decorator that allows an async function to run anywhere - sync or async context."""
16
+ def decorator(async_func: Callable[P, Awaitable[T]]) -> Callable[P, Union[Awaitable[T], T]]:
17
+ @functools.wraps(async_func)
18
+ def wrapper(*args: P.args, **kwargs: P.kwargs) -> Union[Awaitable[T], T]:
19
+ global _sync_loop
20
+ try:
21
+ loop = asyncio.get_running_loop()
22
+ return async_func(*args, **kwargs)
23
+ except RuntimeError:
24
+ if _sync_loop is None:
25
+ _sync_loop = asyncio.new_event_loop()
26
+ asyncio.set_event_loop(_sync_loop)
27
+ try:
28
+ return _sync_loop.run_until_complete(async_func(*args, **kwargs))
29
+ finally:
30
+ asyncio.set_event_loop(None)
31
+ return wrapper
32
+ return decorator
33
+
34
+ def ensure_client() -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]:
35
+ """Decorator that ensures a valid httpx.AsyncClient exists before executing the method"""
36
+ def decorator(async_func: Callable[P, Awaitable[T]]) -> Callable[P, Awaitable[T]]:
37
+ @functools.wraps(async_func)
38
+ async def wrapper(self: Any, *args: P.args, **kwargs: P.kwargs) -> T:
39
+ if not self._client or self._client.is_closed:
40
+ self._client = httpx.AsyncClient()
41
+ return await async_func(self, *args, **kwargs)
42
+ return wrapper
43
+ return decorator
44
+
45
+ def require_task() -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]:
46
+ """Decorator that ensures task has required attributes and valid client before execution"""
47
+ def decorator(async_func: Callable[P, Awaitable[T]]) -> Callable[P, Awaitable[T]]:
48
+ @functools.wraps(async_func)
49
+ async def wrapper(self: Any, *args: P.args, **kwargs: P.kwargs) -> T:
50
+ if not self.task_url:
51
+ raise ValueError("Task URL not found")
52
+ if not self._client:
53
+ raise ValueError("Client not found")
54
+ if not self._client._client or self._client._client.is_closed:
55
+ self._client._client = httpx.AsyncClient()
56
+ return await async_func(self, *args, **kwargs)
57
+ return wrapper
58
+ return decorator
chunkr_ai/api/misc.py CHANGED
@@ -3,11 +3,9 @@ import io
3
3
  import json
4
4
  from pathlib import Path
5
5
  from PIL import Image
6
- from pydantic import BaseModel
7
6
  import requests
8
7
  from typing import Union, Tuple, BinaryIO, Optional
9
8
 
10
-
11
9
  def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[str, BinaryIO]:
12
10
  """Convert various file types into a tuple of (filename, file-like object)."""
13
11
  # Handle URLs
chunkr_ai/api/protocol.py CHANGED
@@ -1,5 +1,4 @@
1
1
  from typing import Optional, runtime_checkable, Protocol
2
- from requests import Session
3
2
  from httpx import AsyncClient
4
3
 
5
4
 
@@ -9,7 +8,6 @@ class ChunkrClientProtocol(Protocol):
9
8
 
10
9
  url: str
11
10
  _api_key: str
12
- _session: Optional[Session] = None
13
11
  _client: Optional[AsyncClient] = None
14
12
 
15
13
  def get_api_key(self) -> str: