chunkr-ai 0.0.22__py3-none-any.whl → 0.0.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
chunkr_ai/api/chunkr.py CHANGED
@@ -34,7 +34,7 @@ class Chunkr(ChunkrBase):
34
34
  file: Union[str, Path, BinaryIO, Image.Image],
35
35
  config: Configuration = None,
36
36
  ) -> TaskResponse:
37
- files = prepare_upload_data(file, config)
37
+ files = await prepare_upload_data(file, config, self._client)
38
38
  r = await self._client.post(
39
39
  f"{self.url}/api/v1/task", files=files, headers=self._headers()
40
40
  )
@@ -44,7 +44,7 @@ class Chunkr(ChunkrBase):
44
44
  @anywhere()
45
45
  @ensure_client()
46
46
  async def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
47
- files = prepare_upload_data(None, config)
47
+ files = await prepare_upload_data(None, config, self._client)
48
48
  r = await self._client.patch(
49
49
  f"{self.url}/api/v1/task/{task_id}",
50
50
  files=files,
chunkr_ai/api/chunkr_base.py CHANGED
@@ -4,22 +4,27 @@ from .auth import HeadersMixin
4
4
  from abc import abstractmethod
5
5
  from dotenv import load_dotenv
6
6
  import httpx
7
- import io
8
- import json
9
7
  import os
10
8
  from pathlib import Path
11
9
  from PIL import Image
12
- import requests
13
- from typing import BinaryIO, Tuple, Union
10
+ from typing import BinaryIO, Union
14
11
 
15
12
 
16
13
  class ChunkrBase(HeadersMixin):
17
- """Base class with shared functionality for Chunkr API clients."""
18
-
19
- def __init__(self, url: str = None, api_key: str = None):
14
+ """Base class with shared functionality for Chunkr API clients.
15
+
16
+ Args:
17
+ url: The base URL of the Chunkr API. Defaults to the value of the CHUNKR_URL environment variable, or "https://api.chunkr.ai" if not set.
18
+ api_key: The API key to use for authentication. Defaults to the value of the CHUNKR_API_KEY environment variable, or None if not set.
19
+ raise_on_failure: Whether to raise an exception if the task fails. Defaults to False.
20
+ """
21
+
22
+ def __init__(self, url: str = None, api_key: str = None, raise_on_failure: bool = False):
20
23
  load_dotenv()
21
24
  self.url = url or os.getenv("CHUNKR_URL") or "https://api.chunkr.ai"
22
25
  self._api_key = api_key or os.getenv("CHUNKR_API_KEY")
26
+ self.raise_on_failure = raise_on_failure
27
+
23
28
  if not self._api_key:
24
29
  raise ValueError(
25
30
  "API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai"
@@ -28,130 +33,6 @@ class ChunkrBase(HeadersMixin):
28
33
  self.url = self.url.rstrip("/")
29
34
  self._client = httpx.AsyncClient()
30
35
 
31
- def _prepare_file(
32
- self, file: Union[str, Path, BinaryIO, Image.Image]
33
- ) -> Tuple[str, BinaryIO]:
34
- """Convert various file types into a tuple of (filename, file-like object).
35
-
36
- Args:
37
- file: Input file, can be:
38
- - String or Path to a file
39
- - URL string starting with http:// or https://
40
- - Base64 string
41
- - Opened binary file (mode='rb')
42
- - PIL/Pillow Image object
43
-
44
- Returns:
45
- Tuple[str, BinaryIO]: (filename, file-like object) ready for upload
46
-
47
- Raises:
48
- FileNotFoundError: If the file path doesn't exist
49
- TypeError: If the file type is not supported
50
- ValueError: If the URL is invalid or unreachable
51
- ValueError: If the MIME type is unsupported
52
- """
53
- # Handle URLs
54
- if isinstance(file, str) and (
55
- file.startswith("http://") or file.startswith("https://")
56
- ):
57
- response = requests.get(file)
58
- response.raise_for_status()
59
- file_obj = io.BytesIO(response.content)
60
- filename = Path(file.split("/")[-1]).name or "downloaded_file"
61
- return filename, file_obj
62
-
63
- # Handle base64 strings
64
- if isinstance(file, str) and "," in file and ";base64," in file:
65
- try:
66
- # Split header and data
67
- header, base64_data = file.split(",", 1)
68
- import base64
69
-
70
- file_bytes = base64.b64decode(base64_data)
71
- file_obj = io.BytesIO(file_bytes)
72
-
73
- # Try to determine format from header
74
- format = "bin"
75
- mime_type = header.split(":")[-1].split(";")[0].lower()
76
-
77
- # Map MIME types to file extensions
78
- mime_to_ext = {
79
- "application/pdf": "pdf",
80
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
81
- "application/msword": "doc",
82
- "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
83
- "application/vnd.ms-powerpoint": "ppt",
84
- "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
85
- "application/vnd.ms-excel": "xls",
86
- "image/jpeg": "jpg",
87
- "image/png": "png",
88
- "image/jpg": "jpg",
89
- }
90
-
91
- if mime_type in mime_to_ext:
92
- format = mime_to_ext[mime_type]
93
- else:
94
- raise ValueError(f"Unsupported MIME type: {mime_type}")
95
-
96
- return f"file.{format}", file_obj
97
- except Exception as e:
98
- raise ValueError(f"Invalid base64 string: {str(e)}")
99
-
100
- # Handle file paths
101
- if isinstance(file, (str, Path)):
102
- path = Path(file).resolve()
103
- if not path.exists():
104
- raise FileNotFoundError(f"File not found: {file}")
105
- return path.name, open(path, "rb")
106
-
107
- # Handle PIL Images
108
- if isinstance(file, Image.Image):
109
- img_byte_arr = io.BytesIO()
110
- format = file.format or "PNG"
111
- file.save(img_byte_arr, format=format)
112
- img_byte_arr.seek(0)
113
- return f"image.{format.lower()}", img_byte_arr
114
-
115
- # Handle file-like objects
116
- if hasattr(file, "read") and hasattr(file, "seek"):
117
- # Try to get the filename from the file object if possible
118
- name = (
119
- getattr(file, "name", "document")
120
- if hasattr(file, "name")
121
- else "document"
122
- )
123
- return Path(name).name, file
124
-
125
- raise TypeError(f"Unsupported file type: {type(file)}")
126
-
127
- def _prepare_upload_data(
128
- self,
129
- file: Union[str, Path, BinaryIO, Image.Image],
130
- config: Configuration = None,
131
- ) -> Tuple[dict, dict]:
132
- """Prepare files and data dictionaries for upload.
133
-
134
- Args:
135
- file: The file to upload
136
- config: Optional configuration settings
137
-
138
- Returns:
139
- Tuple[dict, dict]: (files dict, data dict) ready for upload
140
- """
141
- filename, file_obj = self._prepare_file(file)
142
- files = {"file": (filename, file_obj)}
143
- data = {}
144
-
145
- if config:
146
- config_dict = config.model_dump(mode="json", exclude_none=True)
147
- for key, value in config_dict.items():
148
- if isinstance(value, dict):
149
- files[key] = (None, json.dumps(value), "application/json")
150
- else:
151
- data[key] = value
152
-
153
- return files, data
154
-
155
36
  @abstractmethod
156
37
  def upload(
157
38
  self,
chunkr_ai/api/misc.py CHANGED
@@ -3,16 +3,36 @@ import io
3
3
  import json
4
4
  from pathlib import Path
5
5
  from PIL import Image
6
- import requests
6
+ import httpx
7
7
  from typing import Union, Tuple, BinaryIO, Optional
8
8
 
9
- def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[str, BinaryIO]:
10
- """Convert various file types into a tuple of (filename, file-like object)."""
9
+ async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image], client: httpx.AsyncClient = None) -> Tuple[str, BinaryIO]:
10
+ """Convert various file types into a tuple of (filename, file-like object).
11
+
12
+ Args:
13
+ file: Input file, can be:
14
+ - String or Path to a file
15
+ - URL string starting with http:// or https://
16
+ - Base64 string
17
+ - Opened binary file (mode='rb')
18
+ - PIL/Pillow Image object
19
+
20
+ Returns:
21
+ Tuple[str, BinaryIO]: (filename, file-like object) ready for upload
22
+
23
+ Raises:
24
+ FileNotFoundError: If the file path doesn't exist
25
+ TypeError: If the file type is not supported
26
+ ValueError: If the URL is invalid or unreachable
27
+ ValueError: If the MIME type is unsupported
28
+ """
11
29
  # Handle URLs
12
30
  if isinstance(file, str) and (
13
31
  file.startswith("http://") or file.startswith("https://")
14
32
  ):
15
- response = requests.get(file)
33
+ if not client:
34
+ raise ValueError("Client must be provided to download files from URLs")
35
+ response = client.get(file)
16
36
  response.raise_for_status()
17
37
 
18
38
  # Try to get filename from Content-Disposition header first
@@ -108,9 +128,10 @@ def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[str, Bi
108
128
  raise TypeError(f"Unsupported file type: {type(file)}")
109
129
 
110
130
 
111
- def prepare_upload_data(
131
+ async def prepare_upload_data(
112
132
  file: Optional[Union[str, Path, BinaryIO, Image.Image]] = None,
113
133
  config: Optional[Configuration] = None,
134
+ client: httpx.AsyncClient = None,
114
135
  ) -> dict:
115
136
  """Prepare files and data dictionaries for upload.
116
137
 
@@ -123,7 +144,7 @@ def prepare_upload_data(
123
144
  """
124
145
  files = {}
125
146
  if file:
126
- filename, file_obj = prepare_file(file)
147
+ filename, file_obj = await prepare_file(file, client)
127
148
  files = {"file": (filename, file_obj)}
128
149
 
129
150
  if config:
chunkr_ai/api/protocol.py CHANGED
@@ -5,15 +5,10 @@ from httpx import AsyncClient
5
5
  @runtime_checkable
6
6
  class ChunkrClientProtocol(Protocol):
7
7
  """Protocol defining the interface for Chunkr clients"""
8
-
9
- url: str
10
- _api_key: str
8
+
9
+ raise_on_failure: bool = True
11
10
  _client: Optional[AsyncClient] = None
12
11
 
13
- def get_api_key(self) -> str:
14
- """Get the API key"""
15
- ...
16
-
17
12
  def _headers(self) -> dict:
18
13
  """Return headers required for API requests"""
19
14
  ...
chunkr_ai/api/task_response.py CHANGED
@@ -30,15 +30,16 @@ class TaskResponse(BaseModel, Generic[T]):
30
30
  def _check_status(self) -> Optional[T]:
31
31
  """Helper method to check task status and handle completion/failure"""
32
32
  if self.status == "Failed":
33
- raise ValueError(self.message)
33
+ if getattr(self._client, 'raise_on_failure', True):
34
+ raise ValueError(self.message)
35
+ return self
34
36
  if self.status not in ("Starting", "Processing"):
35
37
  return self
36
38
  return None
37
39
 
40
+ @require_task()
38
41
  async def _poll_request(self) -> dict:
39
42
  try:
40
- if not self._client._client:
41
- raise ValueError("Client not found")
42
43
  r = await self._client._client.get(
43
44
  self.task_url, headers=self._client._headers()
44
45
  )
@@ -51,7 +52,6 @@ class TaskResponse(BaseModel, Generic[T]):
51
52
  raise
52
53
 
53
54
  @anywhere()
54
- @require_task()
55
55
  async def poll(self) -> T:
56
56
  """Poll the task for completion."""
57
57
  while True:
@@ -66,7 +66,7 @@ class TaskResponse(BaseModel, Generic[T]):
66
66
  @require_task()
67
67
  async def update(self, config: Configuration) -> T:
68
68
  """Update the task configuration."""
69
- f = prepare_upload_data(None, config)
69
+ f = await prepare_upload_data(None, config, self._client._client)
70
70
  r = await self._client._client.patch(
71
71
  self.task_url, files=f, headers=self._client._headers()
72
72
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: chunkr-ai
3
- Version: 0.0.22
3
+ Version: 0.0.24
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  License: MIT License
@@ -35,6 +35,7 @@ Provides-Extra: test
35
35
  Requires-Dist: pytest>=7.0.0; extra == "test"
36
36
  Requires-Dist: pytest-xdist>=3.0.0; extra == "test"
37
37
  Requires-Dist: pytest-asyncio>=0.21.0; extra == "test"
38
+ Requires-Dist: ruff>=0.9.3; extra == "test"
38
39
 
39
40
  # Chunkr Python Client
40
41
 
@@ -0,0 +1,17 @@
1
+ chunkr_ai/__init__.py,sha256=6KpYv2lmD6S5z2kc9pqwuLP5VDHmOuu2qDZArUIhb1s,53
2
+ chunkr_ai/models.py,sha256=MK8FPbWDj1ynvSHaYuslKCPybxLuAlrsVIM3Eym3kKI,750
3
+ chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ chunkr_ai/api/api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ chunkr_ai/api/auth.py,sha256=hlv0GiUmlsbFO1wLL9sslqOnsBSoBqkL_6Mk2SDvxgE,413
6
+ chunkr_ai/api/chunkr.py,sha256=XTXJFs0xjYY3w3N4fSQcxtJFBtNfzFYYkh6nDlFz4cY,2714
7
+ chunkr_ai/api/chunkr_base.py,sha256=4SXA-gdZd1w2zZeeMdy4xog0NKOrKjmo6IMvSl9KSBg,5538
8
+ chunkr_ai/api/config.py,sha256=NmPTsDvcjkvNx0gNzDTz-oFG5rQC7jm-H70O_crJCw8,4478
9
+ chunkr_ai/api/decorators.py,sha256=y_Z9z0O2XXiX9z6jWDwdbCPdQyMLnjE0pGkJjHQEv_Q,2652
10
+ chunkr_ai/api/misc.py,sha256=5Q2K713VPwf3S2519KTzjT9PKhTEBgBMk1d8NNnmpZ0,5717
11
+ chunkr_ai/api/protocol.py,sha256=LjPrYSq52m1afIlAo0yVGXlGZxPRh8J6g7S4PAit3Zo,388
12
+ chunkr_ai/api/task_response.py,sha256=hcHsBgX-2C5Px5Bu0IKk33K_AkqHSEM1Wu2zkcPh9to,3935
13
+ chunkr_ai-0.0.24.dist-info/LICENSE,sha256=w3R12yNDyZpMiy2lxy_hvNbsldC75ww79sF0u11rkho,1069
14
+ chunkr_ai-0.0.24.dist-info/METADATA,sha256=JyDI8EkFaJQQ7vIo2osHxXmeuNqhQ0UWjgUMHSFIYow,6996
15
+ chunkr_ai-0.0.24.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
16
+ chunkr_ai-0.0.24.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
17
+ chunkr_ai-0.0.24.dist-info/RECORD,,
@@ -1,17 +0,0 @@
1
- chunkr_ai/__init__.py,sha256=6KpYv2lmD6S5z2kc9pqwuLP5VDHmOuu2qDZArUIhb1s,53
2
- chunkr_ai/models.py,sha256=MK8FPbWDj1ynvSHaYuslKCPybxLuAlrsVIM3Eym3kKI,750
3
- chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- chunkr_ai/api/api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- chunkr_ai/api/auth.py,sha256=hlv0GiUmlsbFO1wLL9sslqOnsBSoBqkL_6Mk2SDvxgE,413
6
- chunkr_ai/api/chunkr.py,sha256=ZjDVUeR1KPiLLHbj7AJipt-prBTw7a_GvjJ0nd7JV7c,2674
7
- chunkr_ai/api/chunkr_base.py,sha256=TDqEwCCfgshggi_Mzv76FhPj5z21QP8EVj7siczvfao,9826
8
- chunkr_ai/api/config.py,sha256=NmPTsDvcjkvNx0gNzDTz-oFG5rQC7jm-H70O_crJCw8,4478
9
- chunkr_ai/api/decorators.py,sha256=y_Z9z0O2XXiX9z6jWDwdbCPdQyMLnjE0pGkJjHQEv_Q,2652
10
- chunkr_ai/api/misc.py,sha256=wUG4SpfEEo7NcVK47gmw42dRy9zT5F9S2DtVC4T4ERs,4877
11
- chunkr_ai/api/protocol.py,sha256=Nt8aWr4ouVwCvoLqVI5vnXJhT2cvxt0sQC-svUk2G5w,458
12
- chunkr_ai/api/task_response.py,sha256=yjlUOADqf0O9X-yF7lbZuv39ttvitZHQpbGy3DtrJ80,3909
13
- chunkr_ai-0.0.22.dist-info/LICENSE,sha256=w3R12yNDyZpMiy2lxy_hvNbsldC75ww79sF0u11rkho,1069
14
- chunkr_ai-0.0.22.dist-info/METADATA,sha256=ro8wcqBnNGLkIhh7XrbSxdl4zaWXhA0d5SUgBQteAec,6952
15
- chunkr_ai-0.0.22.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
16
- chunkr_ai-0.0.22.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
17
- chunkr_ai-0.0.22.dist-info/RECORD,,