chunkr-ai 0.0.17__py3-none-any.whl → 0.0.18__py3-none-any.whl

chunkr_ai/__init__.py CHANGED
@@ -1,4 +1,3 @@
 from .api.chunkr import Chunkr
-from .api.chunkr_async import ChunkrAsync
 
-__all__ = ["Chunkr", "ChunkrAsync"]
+__all__ = ["Chunkr"]
chunkr_ai/api/chunkr.py CHANGED
@@ -1,78 +1,85 @@
-from .chunkr_base import ChunkrBase
-from .config import Configuration
-from .task import TaskResponse
 from pathlib import Path
 from PIL import Image
-import requests
 from typing import Union, BinaryIO
-from .misc import prepare_upload_data
 
+from .config import Configuration
+from .decorators import anywhere, ensure_client
+from .misc import prepare_upload_data
+from .task_response import TaskResponse
+from .chunkr_base import ChunkrBase
 
 class Chunkr(ChunkrBase):
-    """Chunkr API client"""
-
-    def __init__(self, url: str = None, api_key: str = None):
-        super().__init__(url, api_key)
-        self._session = requests.Session()
-
-    def upload(
+    """Chunkr API client that works in both sync and async contexts"""
+
+    @anywhere()
+    @ensure_client()
+    async def upload(
         self,
         file: Union[str, Path, BinaryIO, Image.Image],
         config: Configuration = None,
     ) -> TaskResponse:
-        task = self.create_task(file, config)
-        return task.poll()
+        task = await self.create_task(file, config)
+        return await task.poll()
 
-    def update(self, task_id: str, config: Configuration) -> TaskResponse:
-        task = self.update_task(task_id, config)
-        return task.poll()
+    @anywhere()
+    @ensure_client()
+    async def update(self, task_id: str, config: Configuration) -> TaskResponse:
+        task = await self.update_task(task_id, config)
+        return await task.poll()
 
-    def create_task(
+    @anywhere()
+    @ensure_client()
+    async def create_task(
         self,
         file: Union[str, Path, BinaryIO, Image.Image],
         config: Configuration = None,
     ) -> TaskResponse:
         files = prepare_upload_data(file, config)
-        if not self._session:
-            raise ValueError("Session not found")
-        r = self._session.post(
+        r = await self._client.post(
            f"{self.url}/api/v1/task", files=files, headers=self._headers()
         )
         r.raise_for_status()
         return TaskResponse(**r.json()).with_client(self)
 
-    def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
+    @anywhere()
+    @ensure_client()
+    async def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
         files = prepare_upload_data(None, config)
-        if not self._session:
-            raise ValueError("Session not found")
-        r = self._session.patch(
-            f"{self.url}/api/v1/task/{task_id}", files=files, headers=self._headers()
+        r = await self._client.patch(
+            f"{self.url}/api/v1/task/{task_id}",
+            files=files,
+            headers=self._headers(),
         )
-
         r.raise_for_status()
         return TaskResponse(**r.json()).with_client(self)
 
-    def get_task(self, task_id: str) -> TaskResponse:
-        if not self._session:
-            raise ValueError("Session not found")
-        r = self._session.get(
+    @anywhere()
+    @ensure_client()
+    async def get_task(self, task_id: str) -> TaskResponse:
+        r = await self._client.get(
            f"{self.url}/api/v1/task/{task_id}", headers=self._headers()
         )
         r.raise_for_status()
         return TaskResponse(**r.json()).with_client(self)
 
-    def delete_task(self, task_id: str) -> None:
-        if not self._session:
-            raise ValueError("Session not found")
-        r = self._session.delete(
+    @anywhere()
+    @ensure_client()
+    async def delete_task(self, task_id: str) -> None:
+        r = await self._client.delete(
            f"{self.url}/api/v1/task/{task_id}", headers=self._headers()
         )
         r.raise_for_status()
 
-    def cancel_task(self, task_id: str) -> None:
-        if not self._session:
-            raise ValueError("Session not found")
-        r = self._session.get(
+    @ensure_client()
+    @anywhere()
+    async def cancel_task(self, task_id: str) -> None:
+        r = await self._client.get(
            f"{self.url}/api/v1/task/{task_id}/cancel", headers=self._headers()
         )
         r.raise_for_status()
+
+    @anywhere()
+    async def close(self):
+        if self._client:
+            await self._client.aclose()
+        self._client = None
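Every public method on Chunkr is now an @anywhere()-decorated coroutine, so one client serves plain scripts and event-loop code alike. A minimal usage sketch, not part of the diff: the file name is made up, and it assumes the base class resolves the URL and API key from the environment.

import asyncio
from chunkr_ai import Chunkr

chunkr = Chunkr()  # assumption: url/api key come from env vars via ChunkrBase

# Sync context: @anywhere() finds no running loop and drives one internally.
task = chunkr.upload("report.pdf")  # hypothetical local file
chunkr.close()

# Async context: the same call returns an awaitable because a loop is running.
async def main():
    client = Chunkr()
    task = await client.upload("report.pdf")
    await client.close()

asyncio.run(main())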
chunkr_ai/api/chunkr_base.py CHANGED
@@ -1,13 +1,16 @@
 from .config import Configuration
-from .task import TaskResponse
-from .task_async import TaskResponseAsync
+from .task_response import TaskResponse
 from .auth import HeadersMixin
 from abc import abstractmethod
 from dotenv import load_dotenv
+import httpx
+import io
+import json
 import os
 from pathlib import Path
 from PIL import Image
-from typing import BinaryIO, Union
+import requests
+from typing import BinaryIO, Tuple, Union
 
 
 class ChunkrBase(HeadersMixin):
@@ -23,13 +26,138 @@ class ChunkrBase(HeadersMixin):
             )
 
         self.url = self.url.rstrip("/")
+        self._client = httpx.AsyncClient()
+
+    def _prepare_file(
+        self, file: Union[str, Path, BinaryIO, Image.Image]
+    ) -> Tuple[str, BinaryIO]:
+        """Convert various file types into a tuple of (filename, file-like object).
+
+        Args:
+            file: Input file, can be:
+                - String or Path to a file
+                - URL string starting with http:// or https://
+                - Base64 string
+                - Opened binary file (mode='rb')
+                - PIL/Pillow Image object
+
+        Returns:
+            Tuple[str, BinaryIO]: (filename, file-like object) ready for upload
+
+        Raises:
+            FileNotFoundError: If the file path doesn't exist
+            TypeError: If the file type is not supported
+            ValueError: If the URL is invalid or unreachable
+            ValueError: If the MIME type is unsupported
+        """
+        # Handle URLs
+        if isinstance(file, str) and (
+            file.startswith("http://") or file.startswith("https://")
+        ):
+            response = requests.get(file)
+            response.raise_for_status()
+            file_obj = io.BytesIO(response.content)
+            filename = Path(file.split("/")[-1]).name or "downloaded_file"
+            return filename, file_obj
+
+        # Handle base64 strings
+        if isinstance(file, str) and "," in file and ";base64," in file:
+            try:
+                # Split header and data
+                header, base64_data = file.split(",", 1)
+                import base64
+
+                file_bytes = base64.b64decode(base64_data)
+                file_obj = io.BytesIO(file_bytes)
+
+                # Try to determine format from header
+                format = "bin"
+                mime_type = header.split(":")[-1].split(";")[0].lower()
+
+                # Map MIME types to file extensions
+                mime_to_ext = {
+                    "application/pdf": "pdf",
+                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
+                    "application/msword": "doc",
+                    "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
+                    "application/vnd.ms-powerpoint": "ppt",
+                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
+                    "application/vnd.ms-excel": "xls",
+                    "image/jpeg": "jpg",
+                    "image/png": "png",
+                    "image/jpg": "jpg",
+                }
+
+                if mime_type in mime_to_ext:
+                    format = mime_to_ext[mime_type]
+                else:
+                    raise ValueError(f"Unsupported MIME type: {mime_type}")
+
+                return f"file.{format}", file_obj
+            except Exception as e:
+                raise ValueError(f"Invalid base64 string: {str(e)}")
+
+        # Handle file paths
+        if isinstance(file, (str, Path)):
+            path = Path(file).resolve()
+            if not path.exists():
+                raise FileNotFoundError(f"File not found: {file}")
+            return path.name, open(path, "rb")
+
+        # Handle PIL Images
+        if isinstance(file, Image.Image):
+            img_byte_arr = io.BytesIO()
+            format = file.format or "PNG"
+            file.save(img_byte_arr, format=format)
+            img_byte_arr.seek(0)
+            return f"image.{format.lower()}", img_byte_arr
+
+        # Handle file-like objects
+        if hasattr(file, "read") and hasattr(file, "seek"):
+            # Try to get the filename from the file object if possible
+            name = (
+                getattr(file, "name", "document")
+                if hasattr(file, "name")
+                else "document"
+            )
+            return Path(name).name, file
+
+        raise TypeError(f"Unsupported file type: {type(file)}")
+
+    def _prepare_upload_data(
+        self,
+        file: Union[str, Path, BinaryIO, Image.Image],
+        config: Configuration = None,
+    ) -> Tuple[dict, dict]:
+        """Prepare files and data dictionaries for upload.
+
+        Args:
+            file: The file to upload
+            config: Optional configuration settings
+
+        Returns:
+            Tuple[dict, dict]: (files dict, data dict) ready for upload
+        """
+        filename, file_obj = self._prepare_file(file)
+        files = {"file": (filename, file_obj)}
+        data = {}
+
+        if config:
+            config_dict = config.model_dump(mode="json", exclude_none=True)
+            for key, value in config_dict.items():
+                if isinstance(value, dict):
+                    files[key] = (None, json.dumps(value), "application/json")
+                else:
+                    data[key] = value
+
+        return files, data
 
     @abstractmethod
     def upload(
         self,
         file: Union[str, Path, BinaryIO, Image.Image],
         config: Configuration = None,
-    ) -> Union[TaskResponse, TaskResponseAsync]:
+    ) -> TaskResponse:
         """Upload a file and wait for processing to complete.
 
         Args:
@@ -64,7 +192,7 @@ class ChunkrBase(HeadersMixin):
     @abstractmethod
     def update(
         self, task_id: str, config: Configuration
-    ) -> Union[TaskResponse, TaskResponseAsync]:
+    ) -> TaskResponse:
         """Update a task by its ID and wait for processing to complete.
 
         Args:
@@ -81,7 +209,7 @@ class ChunkrBase(HeadersMixin):
         self,
         file: Union[str, Path, BinaryIO, Image.Image],
         config: Configuration = None,
-    ) -> Union[TaskResponse, TaskResponseAsync]:
+    ) -> TaskResponse:
         """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`.
 
         Args:
@@ -117,7 +245,7 @@ class ChunkrBase(HeadersMixin):
     @abstractmethod
     def update_task(
         self, task_id: str, config: Configuration
-    ) -> Union[TaskResponse, TaskResponseAsync]:
+    ) -> TaskResponse:
         """Update a task by its ID and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`.
 
         Args:
@@ -130,7 +258,7 @@ class ChunkrBase(HeadersMixin):
         pass
 
     @abstractmethod
-    def get_task(self, task_id: str) -> Union[TaskResponse, TaskResponseAsync]:
+    def get_task(self, task_id: str) -> TaskResponse:
         """Get a task response by its ID.
 
         Args:
@@ -158,3 +286,9 @@ class ChunkrBase(HeadersMixin):
             task_id: The ID of the task to cancel
         """
         pass
+
+    @abstractmethod
+    def close(self) -> None:
+        """Close the client connection.
+        This should be called when you're done using the client to properly clean up resources."""
+        pass
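For context, a sketch of how the new _prepare_upload_data helper splits a Configuration into multipart parts. The splitting loop mirrors the method above; the concrete field values are made-up examples.

import json
from chunkr_ai.api.config import ChunkProcessing, Configuration, OcrStrategy

config = Configuration(
    chunk_processing=ChunkProcessing(target_length=512),  # example values
    ocr_strategy=OcrStrategy.AUTO,
)

files, data = {}, {}
for key, value in config.model_dump(mode="json", exclude_none=True).items():
    if isinstance(value, dict):
        # Nested models travel as JSON-encoded multipart parts...
        files[key] = (None, json.dumps(value), "application/json")
    else:
        # ...while scalar fields go into the plain form-data dict.
        data[key] = value

# files -> {"chunk_processing": (None, '{"target_length": 512}', "application/json")}
# data  -> {"ocr_strategy": "Auto"}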
chunkr_ai/api/config.py CHANGED
@@ -1,26 +1,21 @@
 from pydantic import BaseModel, Field, model_validator, ConfigDict
 from enum import Enum
 from typing import Optional, List, Dict, Union, Type
-from .schema import from_pydantic
-
 
 class GenerationStrategy(str, Enum):
     LLM = "LLM"
     AUTO = "Auto"
 
-
 class CroppingStrategy(str, Enum):
     ALL = "All"
     AUTO = "Auto"
 
-
 class GenerationConfig(BaseModel):
     html: Optional[GenerationStrategy] = None
     llm: Optional[str] = None
     markdown: Optional[GenerationStrategy] = None
     crop_image: Optional[CroppingStrategy] = None
 
-
 class SegmentProcessing(BaseModel):
     model_config = ConfigDict(populate_by_name=True, alias_generator=str.title)
 
@@ -39,46 +34,38 @@ class SegmentProcessing(BaseModel):
     page_footer: Optional[GenerationConfig] = Field(default=None, alias="PageFooter")
     page: Optional[GenerationConfig] = Field(default=None, alias="Page")
 
-
 class ChunkProcessing(BaseModel):
     target_length: Optional[int] = None
 
-
 class Property(BaseModel):
     name: str
     prop_type: str
     description: Optional[str] = None
     default: Optional[str] = None
 
-
 class JsonSchema(BaseModel):
     title: str
     properties: List[Property]
 
-
 class OcrStrategy(str, Enum):
     ALL = "All"
     AUTO = "Auto"
 
-
 class SegmentationStrategy(str, Enum):
     LAYOUT_ANALYSIS = "LayoutAnalysis"
     PAGE = "Page"
 
-
 class BoundingBox(BaseModel):
     left: float
     top: float
     width: float
     height: float
 
-
 class OCRResult(BaseModel):
     bbox: BoundingBox
     text: str
     confidence: Optional[float]
 
-
 class SegmentType(str, Enum):
     CAPTION = "Caption"
     FOOTNOTE = "Footnote"
@@ -93,7 +80,6 @@ class SegmentType(str, Enum):
     TEXT = "Text"
     TITLE = "Title"
 
-
 class Segment(BaseModel):
     bbox: BoundingBox
     content: str
@@ -107,42 +93,41 @@ class Segment(BaseModel):
     segment_id: str
     segment_type: SegmentType
 
-
 class Chunk(BaseModel):
     chunk_id: str
     chunk_length: int
     segments: List[Segment]
 
-
 class ExtractedJson(BaseModel):
     data: Dict
 
-
 class OutputResponse(BaseModel):
     chunks: List[Chunk]
-    extracted_json: Optional[ExtractedJson] = Field(default=None)
-
+    file_name: Optional[str]
+    page_count: Optional[int]
+    pdf_url: Optional[str]
 
 class Model(str, Enum):
     FAST = "Fast"
     HIGH_QUALITY = "HighQuality"
 
-class PipelineType(str, Enum):
+class Pipeline(str, Enum):
     AZURE = "Azure"
 
 class Configuration(BaseModel):
-    chunk_processing: Optional[ChunkProcessing] = Field(default=None)
-    expires_in: Optional[int] = Field(default=None)
-    high_resolution: Optional[bool] = Field(default=None)
-    json_schema: Optional[Union[JsonSchema, Type[BaseModel], BaseModel]] = Field(
-        default=None
-    )
-    model: Optional[Model] = Field(default=None)
-    ocr_strategy: Optional[OcrStrategy] = Field(default=None)
-    segment_processing: Optional[SegmentProcessing] = Field(default=None)
-    segmentation_strategy: Optional[SegmentationStrategy] = Field(default=None)
-    pipeline: Optional[PipelineType] = Field(default=None)
-
+    chunk_processing: Optional[ChunkProcessing] = None
+    expires_in: Optional[int] = None
+    high_resolution: Optional[bool] = None
+    model: Optional[Model] = None
+    ocr_strategy: Optional[OcrStrategy] = None
+    segment_processing: Optional[SegmentProcessing] = None
+    segmentation_strategy: Optional[SegmentationStrategy] = None
+    pipeline: Optional[Pipeline] = None
+
+class OutputConfiguration(Configuration):
+    input_file_url: Optional[str] = None
+    json_schema: Optional[Union[JsonSchema, Type[BaseModel], BaseModel]] = None
+
     @model_validator(mode="before")
     def map_deprecated_fields(cls, values: Dict) -> Dict:
         if isinstance(values, dict) and "target_chunk_length" in values:
@@ -151,19 +136,7 @@ class Configuration(BaseModel):
             values["chunk_processing"] = values.get("chunk_processing", {}) or {}
             values["chunk_processing"]["target_length"] = target_length
             return values
-
-    @model_validator(mode="after")
-    def convert_json_schema(self) -> "Configuration":
-        if self.json_schema is not None and not isinstance(
-            self.json_schema, JsonSchema
-        ):
-            if isinstance(self.json_schema, (BaseModel, type)) and issubclass(
-                getattr(self.json_schema, "__class__", type), BaseModel
-            ):
-                self.json_schema = JsonSchema(**from_pydantic(self.json_schema))
-        return self
-
-
+
 class Status(str, Enum):
     STARTING = "Starting"
     PROCESSING = "Processing"
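The slimmed-down Configuration model is built the same way as before, just without the Field(default=None) boilerplate and with Pipeline replacing PipelineType; json_schema and input_file_url now live on the new OutputConfiguration subclass. An illustrative construction based on the definitions above (the chosen values are examples, not recommendations):

from chunkr_ai.api.config import (
    Configuration,
    GenerationConfig,
    GenerationStrategy,
    Pipeline,
    SegmentProcessing,
    SegmentationStrategy,
)

config = Configuration(
    high_resolution=True,
    segmentation_strategy=SegmentationStrategy.LAYOUT_ANALYSIS,
    segment_processing=SegmentProcessing(
        # field aliases are Title-cased, per alias_generator=str.title
        Page=GenerationConfig(html=GenerationStrategy.LLM),
    ),
    pipeline=Pipeline.AZURE,
)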
chunkr_ai/api/decorators.py ADDED
@@ -0,0 +1,58 @@
+import functools
+import asyncio
+import httpx
+from typing import Callable, Any, TypeVar, Awaitable, ParamSpec, Union, overload
+
+T = TypeVar('T')
+P = ParamSpec('P')
+
+_sync_loop = None
+
+@overload
+def anywhere() -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Union[Awaitable[T], T]]]: ...
+
+def anywhere():
+    """Decorator that allows an async function to run anywhere - sync or async context."""
+    def decorator(async_func: Callable[P, Awaitable[T]]) -> Callable[P, Union[Awaitable[T], T]]:
+        @functools.wraps(async_func)
+        def wrapper(*args: P.args, **kwargs: P.kwargs) -> Union[Awaitable[T], T]:
+            global _sync_loop
+            try:
+                loop = asyncio.get_running_loop()
+                return async_func(*args, **kwargs)
+            except RuntimeError:
+                if _sync_loop is None:
+                    _sync_loop = asyncio.new_event_loop()
+                asyncio.set_event_loop(_sync_loop)
+                try:
+                    return _sync_loop.run_until_complete(async_func(*args, **kwargs))
+                finally:
+                    asyncio.set_event_loop(None)
+        return wrapper
+    return decorator
+
+def ensure_client() -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]:
+    """Decorator that ensures a valid httpx.AsyncClient exists before executing the method"""
+    def decorator(async_func: Callable[P, Awaitable[T]]) -> Callable[P, Awaitable[T]]:
+        @functools.wraps(async_func)
+        async def wrapper(self: Any, *args: P.args, **kwargs: P.kwargs) -> T:
+            if not self._client or self._client.is_closed:
+                self._client = httpx.AsyncClient()
+            return await async_func(self, *args, **kwargs)
+        return wrapper
+    return decorator
+
+def require_task() -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]:
+    """Decorator that ensures task has required attributes and valid client before execution"""
+    def decorator(async_func: Callable[P, Awaitable[T]]) -> Callable[P, Awaitable[T]]:
+        @functools.wraps(async_func)
+        async def wrapper(self: Any, *args: P.args, **kwargs: P.kwargs) -> T:
+            if not self.task_url:
+                raise ValueError("Task URL not found")
+            if not self._client:
+                raise ValueError("Client not found")
+            if not self._client._client or self._client._client.is_closed:
+                self._client._client = httpx.AsyncClient()
+            return await async_func(self, *args, **kwargs)
+        return wrapper
+    return decorator
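The anywhere() decorator is what lets the same coroutine be called with or without await: if a loop is already running it hands back the coroutine, otherwise it runs it to completion on a cached private loop. A toy sketch of that behaviour; the double function is made up for illustration, and the import path assumes the module lands at chunkr_ai/api/decorators.py as the import in chunkr.py suggests.

import asyncio
from chunkr_ai.api.decorators import anywhere

@anywhere()
async def double(x: int) -> int:
    await asyncio.sleep(0)  # stand-in for real async work
    return 2 * x

print(double(21))  # no running loop: executed on the cached loop -> 42

async def main():
    print(await double(21))  # running loop: the call returns an awaitable -> 42

asyncio.run(main())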
chunkr_ai/api/misc.py CHANGED
@@ -3,11 +3,9 @@ import io
 import json
 from pathlib import Path
 from PIL import Image
-from pydantic import BaseModel
 import requests
 from typing import Union, Tuple, BinaryIO, Optional
 
-
 def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[str, BinaryIO]:
     """Convert various file types into a tuple of (filename, file-like object)."""
     # Handle URLs
chunkr_ai/api/protocol.py CHANGED
@@ -1,5 +1,4 @@
 from typing import Optional, runtime_checkable, Protocol
-from requests import Session
 from httpx import AsyncClient
 
 
@@ -9,7 +8,6 @@ class ChunkrClientProtocol(Protocol):
 
     url: str
     _api_key: str
-    _session: Optional[Session] = None
     _client: Optional[AsyncClient] = None
 
     def get_api_key(self) -> str: