chunkr-ai 0.0.12__py3-none-any.whl → 0.0.15__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
chunkr_ai/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
1
  from .api.chunkr import Chunkr
2
2
  from .api.chunkr_async import ChunkrAsync
3
3
 
4
- __all__ = ['Chunkr', 'ChunkrAsync']
4
+ __all__ = ["Chunkr", "ChunkrAsync"]
chunkr_ai/api/api.py ADDED
File without changes
chunkr_ai/api/auth.py CHANGED
@@ -1,12 +1,12 @@
1
1
  class HeadersMixin:
2
2
  """Mixin class for handling authorization headers"""
3
-
3
+
4
4
  def get_api_key(self) -> str:
5
5
  """Get the API key"""
6
- if not hasattr(self, '_api_key') or not self._api_key:
6
+ if not hasattr(self, "_api_key") or not self._api_key:
7
7
  raise ValueError("API key not set")
8
8
  return self._api_key
9
-
9
+
10
10
  def _headers(self) -> dict:
11
11
  """Generate authorization headers"""
12
- return {"Authorization": self.get_api_key()}
12
+ return {"Authorization": self.get_api_key()}
chunkr_ai/api/base.py ADDED
@@ -0,0 +1,183 @@
1
+ from .config import Configuration
2
+ from .task import TaskResponse
3
+ from .auth import HeadersMixin
4
+ from abc import abstractmethod
5
+ from dotenv import load_dotenv
6
+ import io
7
+ import json
8
+ import os
9
+ from pathlib import Path
10
+ from PIL import Image
11
+ import requests
12
+ from typing import BinaryIO, Tuple, Union
13
+
14
+
15
+ class ChunkrBase(HeadersMixin):
16
+ """Base class with shared functionality for Chunkr API clients."""
17
+
18
+ def __init__(self, url: str = None, api_key: str = None):
19
+ load_dotenv()
20
+ self.url = url or os.getenv("CHUNKR_URL") or "https://api.chunkr.ai"
21
+ self._api_key = api_key or os.getenv("CHUNKR_API_KEY")
22
+ if not self._api_key:
23
+ raise ValueError(
24
+ "API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai"
25
+ )
26
+
27
+ self.url = self.url.rstrip("/")
28
+
29
+ def _prepare_file(
30
+ self, file: Union[str, Path, BinaryIO, Image.Image]
31
+ ) -> Tuple[str, BinaryIO]:
32
+ """Convert various file types into a tuple of (filename, file-like object).
33
+
34
+ Args:
35
+ file: Input file, can be:
36
+ - String or Path to a file
37
+ - URL string starting with http:// or https://
38
+ - Base64 string
39
+ - Opened binary file (mode='rb')
40
+ - PIL/Pillow Image object
41
+
42
+ Returns:
43
+ Tuple[str, BinaryIO]: (filename, file-like object) ready for upload
44
+
45
+ Raises:
46
+ FileNotFoundError: If the file path doesn't exist
47
+ TypeError: If the file type is not supported
48
+ ValueError: If the URL is invalid or unreachable
49
+ ValueError: If the MIME type is unsupported
50
+ """
51
+ # Handle URLs
52
+ if isinstance(file, str) and (
53
+ file.startswith("http://") or file.startswith("https://")
54
+ ):
55
+ response = requests.get(file)
56
+ response.raise_for_status()
57
+ file_obj = io.BytesIO(response.content)
58
+ filename = Path(file.split("/")[-1]).name or "downloaded_file"
59
+ return filename, file_obj
60
+
61
+ # Handle base64 strings
62
+ if isinstance(file, str) and "," in file and ";base64," in file:
63
+ try:
64
+ # Split header and data
65
+ header, base64_data = file.split(",", 1)
66
+ import base64
67
+
68
+ file_bytes = base64.b64decode(base64_data)
69
+ file_obj = io.BytesIO(file_bytes)
70
+
71
+ # Try to determine format from header
72
+ format = "bin"
73
+ mime_type = header.split(":")[-1].split(";")[0].lower()
74
+
75
+ # Map MIME types to file extensions
76
+ mime_to_ext = {
77
+ "application/pdf": "pdf",
78
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
79
+ "application/msword": "doc",
80
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
81
+ "application/vnd.ms-powerpoint": "ppt",
82
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
83
+ "application/vnd.ms-excel": "xls",
84
+ "image/jpeg": "jpg",
85
+ "image/png": "png",
86
+ "image/jpg": "jpg",
87
+ }
88
+
89
+ if mime_type in mime_to_ext:
90
+ format = mime_to_ext[mime_type]
91
+ else:
92
+ raise ValueError(f"Unsupported MIME type: {mime_type}")
93
+
94
+ return f"file.{format}", file_obj
95
+ except Exception as e:
96
+ raise ValueError(f"Invalid base64 string: {str(e)}")
97
+
98
+ # Handle file paths
99
+ if isinstance(file, (str, Path)):
100
+ path = Path(file).resolve()
101
+ if not path.exists():
102
+ raise FileNotFoundError(f"File not found: {file}")
103
+ return path.name, open(path, "rb")
104
+
105
+ # Handle PIL Images
106
+ if isinstance(file, Image.Image):
107
+ img_byte_arr = io.BytesIO()
108
+ format = file.format or "PNG"
109
+ file.save(img_byte_arr, format=format)
110
+ img_byte_arr.seek(0)
111
+ return f"image.{format.lower()}", img_byte_arr
112
+
113
+ # Handle file-like objects
114
+ if hasattr(file, "read") and hasattr(file, "seek"):
115
+ # Try to get the filename from the file object if possible
116
+ name = (
117
+ getattr(file, "name", "document")
118
+ if hasattr(file, "name")
119
+ else "document"
120
+ )
121
+ return Path(name).name, file
122
+
123
+ raise TypeError(f"Unsupported file type: {type(file)}")
124
+
125
+ def _prepare_upload_data(
126
+ self,
127
+ file: Union[str, Path, BinaryIO, Image.Image],
128
+ config: Configuration = None,
129
+ ) -> Tuple[dict, dict]:
130
+ """Prepare files and data dictionaries for upload.
131
+
132
+ Args:
133
+ file: The file to upload
134
+ config: Optional configuration settings
135
+
136
+ Returns:
137
+ Tuple[dict, dict]: (files dict, data dict) ready for upload
138
+ """
139
+ filename, file_obj = self._prepare_file(file)
140
+ files = {"file": (filename, file_obj)}
141
+ data = {}
142
+
143
+ if config:
144
+ config_dict = config.model_dump(mode="json", exclude_none=True)
145
+ for key, value in config_dict.items():
146
+ if isinstance(value, dict):
147
+ files[key] = (None, json.dumps(value), "application/json")
148
+ else:
149
+ data[key] = value
150
+
151
+ return files, data
152
+
153
+ @abstractmethod
154
+ def upload(
155
+ self,
156
+ file: Union[str, Path, BinaryIO, Image.Image],
157
+ config: Configuration = None,
158
+ ) -> TaskResponse:
159
+ """Upload a file and wait for processing to complete.
160
+
161
+ Must be implemented by subclasses.
162
+ """
163
+ pass
164
+
165
+ @abstractmethod
166
+ def start_upload(
167
+ self,
168
+ file: Union[str, Path, BinaryIO, Image.Image],
169
+ config: Configuration = None,
170
+ ) -> TaskResponse:
171
+ """Upload a file for processing and immediately return the task response.
172
+
173
+ Must be implemented by subclasses.
174
+ """
175
+ pass
176
+
177
+ @abstractmethod
178
+ def get_task(self, task_id: str) -> TaskResponse:
179
+ """Get a task response by its ID.
180
+
181
+ Must be implemented by subclasses.
182
+ """
183
+ pass
chunkr_ai/api/chunkr.py CHANGED
@@ -7,6 +7,7 @@ import requests
7
7
  from typing import Union, BinaryIO
8
8
  from .misc import prepare_upload_data
9
9
 
10
+
10
11
  class Chunkr(ChunkrBase):
11
12
  """Chunkr API client"""
12
13
 
@@ -14,152 +15,64 @@ class Chunkr(ChunkrBase):
14
15
  super().__init__(url, api_key)
15
16
  self._session = requests.Session()
16
17
 
17
- def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
18
- """Upload a file and wait for processing to complete.
19
-
20
- Args:
21
- file: The file to upload.
22
- config: Configuration options for processing. Optional.
23
-
24
- Examples:
25
- ```
26
- # Upload from file path
27
- chunkr.upload("document.pdf")
28
-
29
- # Upload from URL
30
- chunkr.upload("https://example.com/document.pdf")
31
-
32
- # Upload from base64 string (must include MIME type header)
33
- chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
34
-
35
- # Upload from opened file
36
- with open("document.pdf", "rb") as f:
37
- chunkr.upload(f)
38
-
39
- # Upload an image
40
- from PIL import Image
41
- img = Image.open("photo.jpg")
42
- chunkr.upload(img)
43
- ```
44
- Returns:
45
- TaskResponse: The completed task response
46
- """
18
+ def upload(
19
+ self,
20
+ file: Union[str, Path, BinaryIO, Image.Image],
21
+ config: Configuration = None,
22
+ ) -> TaskResponse:
47
23
  task = self.create_task(file, config)
48
24
  return task.poll()
49
-
50
- def update(self, task_id: str, config: Configuration) -> TaskResponse:
51
- """Update a task by its ID and wait for processing to complete.
52
-
53
- Args:
54
- task_id: The ID of the task to update
55
- config: Configuration options for processing. Optional.
56
25
 
57
- Returns:
58
- TaskResponse: The updated task response
59
- """
26
+ def update(self, task_id: str, config: Configuration) -> TaskResponse:
60
27
  task = self.update_task(task_id, config)
61
28
  return task.poll()
62
29
 
63
- def create_task(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
64
- """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`
65
-
66
- Args:
67
- file: The file to upload.
68
- config: Configuration options for processing. Optional.
69
-
70
- Examples:
71
- ```
72
- # Upload from file path
73
- task = chunkr.start_upload("document.pdf")
74
-
75
- # Upload from opened file
76
- with open("document.pdf", "rb") as f:
77
- task = chunkr.start_upload(f)
78
-
79
- # Upload from URL
80
- task = chunkr.start_upload("https://example.com/document.pdf")
81
-
82
- # Upload from base64 string (must include MIME type header)
83
- task = chunkr.start_upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
84
-
85
- # Upload an image
86
- from PIL import Image
87
- img = Image.open("photo.jpg")
88
- task = chunkr.start_upload(img)
89
-
90
- # Wait for the task to complete - this can be done when needed
91
- task.poll()
92
- ```
93
-
94
- Returns:
95
- TaskResponse: The initial task response
96
- """
97
- files= prepare_upload_data(file, config)
30
+ def create_task(
31
+ self,
32
+ file: Union[str, Path, BinaryIO, Image.Image],
33
+ config: Configuration = None,
34
+ ) -> TaskResponse:
35
+ files = prepare_upload_data(file, config)
36
+ if not self._session:
37
+ raise ValueError("Session not found")
98
38
  r = self._session.post(
99
- f"{self.url}/api/v1/task",
100
- files=files,
101
- headers=self._headers()
39
+ f"{self.url}/api/v1/task", files=files, headers=self._headers()
102
40
  )
103
41
  r.raise_for_status()
104
42
  return TaskResponse(**r.json()).with_client(self)
105
-
106
- def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
107
- """Update a task by its ID.
108
-
109
- Args:
110
- task_id: The ID of the task to update
111
- config: The new configuration to use
112
43
 
113
- Returns:
114
- TaskResponse: The updated task response
115
- """
44
+ def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
116
45
  files = prepare_upload_data(None, config)
46
+ if not self._session:
47
+ raise ValueError("Session not found")
117
48
  r = self._session.patch(
118
- f"{self.url}/api/v1/task/{task_id}",
119
- files=files,
120
- headers=self._headers()
49
+ f"{self.url}/api/v1/task/{task_id}", files=files, headers=self._headers()
121
50
  )
122
-
51
+
123
52
  r.raise_for_status()
124
53
  return TaskResponse(**r.json()).with_client(self)
125
-
126
- def get_task(self, task_id: str) -> TaskResponse:
127
- """Get a task response by its ID.
128
-
129
- Args:
130
- task_id: The ID of the task to get
131
54
 
132
- Returns:
133
- TaskResponse: The task response
134
- """
55
+ def get_task(self, task_id: str) -> TaskResponse:
56
+ if not self._session:
57
+ raise ValueError("Session not found")
135
58
  r = self._session.get(
136
- f"{self.url}/api/v1/task/{task_id}",
137
- headers=self._headers()
59
+ f"{self.url}/api/v1/task/{task_id}", headers=self._headers()
138
60
  )
139
61
  r.raise_for_status()
140
62
  return TaskResponse(**r.json()).with_client(self)
141
63
 
142
-
143
64
  def delete_task(self, task_id: str) -> None:
144
- """Delete a task by its ID.
145
-
146
- Args:
147
- task_id: The ID of the task to delete
148
- """
65
+ if not self._session:
66
+ raise ValueError("Session not found")
149
67
  r = self._session.delete(
150
- f"{self.url}/api/v1/task/{task_id}",
151
- headers=self._headers()
68
+ f"{self.url}/api/v1/task/{task_id}", headers=self._headers()
152
69
  )
153
70
  r.raise_for_status()
154
71
 
155
72
  def cancel_task(self, task_id: str) -> None:
156
- """Cancel a task by its ID.
157
-
158
- Args:
159
- task_id: The ID of the task to cancel
160
- """
73
+ if not self._session:
74
+ raise ValueError("Session not found")
161
75
  r = self._session.get(
162
- f"{self.url}/api/v1/task/{task_id}/cancel",
163
- headers=self._headers()
76
+ f"{self.url}/api/v1/task/{task_id}/cancel", headers=self._headers()
164
77
  )
165
78
  r.raise_for_status()
chunkr_ai/api/chunkr_async.py CHANGED
@@ -1,144 +1,120 @@
1
1
  from .chunkr_base import ChunkrBase
2
- from .task import TaskResponse
3
2
  from .config import Configuration
3
+ from .misc import prepare_upload_data
4
+ from .task_async import TaskResponseAsync
4
5
  import httpx
5
6
  from pathlib import Path
6
7
  from PIL import Image
7
8
  from typing import Union, BinaryIO
8
- from .misc import prepare_upload_data
9
+
9
10
 
10
11
  class ChunkrAsync(ChunkrBase):
11
12
  """Asynchronous Chunkr API client"""
12
-
13
+
13
14
  def __init__(self, url: str = None, api_key: str = None):
14
15
  super().__init__(url, api_key)
15
16
  self._client = httpx.AsyncClient()
16
17
 
17
- async def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
18
- """Upload a file and wait for processing to complete.
19
-
20
- Args:
21
- file: The file to upload.
22
- config: Configuration options for processing. Optional.
23
-
24
- Examples:
25
- ```python
26
- # Upload from file path
27
- await chunkr.upload("document.pdf")
28
-
29
- # Upload from opened file
30
- with open("document.pdf", "rb") as f:
31
- await chunkr.upload(f)
32
-
33
- # Upload from URL
34
- await chunkr.upload("https://example.com/document.pdf")
35
-
36
- # Upload from base64 string (must include MIME type header)
37
- await chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
38
-
39
- # Upload an image
40
- from PIL import Image
41
- img = Image.open("photo.jpg")
42
- await chunkr.upload(img)
43
- ```
44
- Returns:
45
- TaskResponse: The completed task response
46
- """
47
- task = await self.create_task(file, config)
48
- return await task.poll_async()
49
-
50
- async def update(self, task_id: str, config: Configuration) -> TaskResponse:
51
- """Update a task by its ID and wait for processing to complete.
52
-
53
- Args:
54
- task_id: The ID of the task to update
55
- config: Configuration options for processing. Optional.
56
-
57
- Returns:
58
- TaskResponse: The updated task response
59
- """
60
- task = await self.update_task(task_id, config)
61
- return await task.poll_async()
18
+ async def upload(
19
+ self,
20
+ file: Union[str, Path, BinaryIO, Image.Image],
21
+ config: Configuration = None,
22
+ ) -> TaskResponseAsync:
23
+ if not self._client or self._client.is_closed:
24
+ self._client = httpx.AsyncClient()
25
+ try:
26
+ task = await self.create_task(file, config)
27
+ return await task.poll()
28
+ except Exception as e:
29
+ await self._client.aclose()
30
+ raise e
31
+
32
+ async def update(self, task_id: str, config: Configuration) -> TaskResponseAsync:
33
+ if not self._client or self._client.is_closed:
34
+ self._client = httpx.AsyncClient()
35
+ try:
36
+ task = await self.update_task(task_id, config)
37
+ return await task.poll()
38
+ except Exception as e:
39
+ await self._client.aclose()
40
+ raise e
41
+
42
+ async def create_task(
43
+ self,
44
+ file: Union[str, Path, BinaryIO, Image.Image],
45
+ config: Configuration = None,
46
+ ) -> TaskResponseAsync:
47
+ if not self._client or self._client.is_closed:
48
+ self._client = httpx.AsyncClient()
49
+ try:
50
+ files = prepare_upload_data(file, config)
51
+ r = await self._client.post(
52
+ f"{self.url}/api/v1/task", files=files, headers=self._headers()
53
+ )
54
+ r.raise_for_status()
55
+ return TaskResponseAsync(**r.json()).with_client(self)
56
+ except Exception as e:
57
+ await self._client.aclose()
58
+ raise e
59
+
60
+ async def update_task(
61
+ self, task_id: str, config: Configuration
62
+ ) -> TaskResponseAsync:
63
+ if not self._client or self._client.is_closed:
64
+ self._client = httpx.AsyncClient()
65
+ try:
66
+ files = prepare_upload_data(None, config)
67
+ r = await self._client.patch(
68
+ f"{self.url}/api/v1/task/{task_id}",
69
+ files=files,
70
+ headers=self._headers(),
71
+ )
72
+
73
+ r.raise_for_status()
74
+ return TaskResponseAsync(**r.json()).with_client(self)
75
+ except Exception as e:
76
+ await self._client.aclose()
77
+ raise e
78
+
79
+ async def get_task(self, task_id: str) -> TaskResponseAsync:
80
+ if not self._client or self._client.is_closed:
81
+ self._client = httpx.AsyncClient()
82
+ try:
83
+ r = await self._client.get(
84
+ f"{self.url}/api/v1/task/{task_id}", headers=self._headers()
85
+ )
86
+ r.raise_for_status()
87
+ return TaskResponseAsync(**r.json()).with_client(self)
88
+ except Exception as e:
89
+ await self._client.aclose()
90
+ raise e
62
91
 
63
- async def create_task(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
64
- """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll_async()`.
65
-
66
- Args:
67
- file: The file to upload.
68
- config: Configuration options for processing. Optional.
69
-
70
- Examples:
71
- ```
72
- # Upload from file path
73
- task = await chunkr.start_upload("document.pdf")
74
-
75
- # Upload from opened file
76
- with open("document.pdf", "rb") as f:
77
- task = await chunkr.start_upload(f)
78
-
79
- # Upload from URL
80
- task = await chunkr.start_upload("https://example.com/document.pdf")
81
-
82
- # Upload from base64 string (must include MIME type header)
83
- task = await chunkr.start_upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
84
-
85
- # Upload an image
86
- from PIL import Image
87
- img = Image.open("photo.jpg")
88
- task = await chunkr.start_upload(img)
89
-
90
- # Wait for the task to complete - this can be done when needed
91
- await task.poll_async()
92
- ```
93
-
94
- Returns:
95
- TaskResponse: The initial task response
96
- """
97
- files = prepare_upload_data(file, config)
98
- r = await self._client.post(
99
- f"{self.url}/api/v1/task",
100
- files=files,
101
- headers=self._headers()
102
- )
103
- r.raise_for_status()
104
- return TaskResponse(**r.json()).with_client(self)
105
-
106
- async def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
107
- files = prepare_upload_data(None, config)
108
- r = await self._client.patch(
109
- f"{self.url}/api/v1/task/{task_id}",
110
- files=files,
111
- headers=self._headers()
112
- )
113
-
114
- r.raise_for_status()
115
- return TaskResponse(**r.json()).with_client(self)
116
-
117
- async def get_task(self, task_id: str) -> TaskResponse:
118
- r = await self._client.get(
119
- f"{self.url}/api/v1/task/{task_id}",
120
- headers=self._headers()
121
- )
122
- r.raise_for_status()
123
- return TaskResponse(**r.json()).with_client(self)
124
-
125
92
  async def delete_task(self, task_id: str) -> None:
126
- r = await self._client.delete(
127
- f"{self.url}/api/v1/task/{task_id}",
128
- headers=self._headers()
129
- )
130
- r.raise_for_status()
131
-
93
+ if not self._client or self._client.is_closed:
94
+ self._client = httpx.AsyncClient()
95
+ try:
96
+ r = await self._client.delete(
97
+ f"{self.url}/api/v1/task/{task_id}", headers=self._headers()
98
+ )
99
+ r.raise_for_status()
100
+ except Exception as e:
101
+ await self._client.aclose()
102
+ raise e
103
+
132
104
  async def cancel_task(self, task_id: str) -> None:
133
- r = await self._client.get(
134
- f"{self.url}/api/v1/task/{task_id}/cancel",
135
- headers=self._headers()
136
- )
137
- r.raise_for_status()
105
+ if not self._client or self._client.is_closed:
106
+ self._client = httpx.AsyncClient()
107
+ try:
108
+ r = await self._client.get(
109
+ f"{self.url}/api/v1/task/{task_id}/cancel", headers=self._headers()
110
+ )
111
+ r.raise_for_status()
112
+ except Exception as e:
113
+ await self._client.aclose()
114
+ raise e
138
115
 
139
-
140
116
  async def __aenter__(self):
141
117
  return self
142
118
 
143
119
  async def __aexit__(self, exc_type, exc_val, exc_tb):
144
- await self._client.aclose()
120
+ await self._client.aclose()