chunkr-ai 0.0.14__py3-none-any.whl → 0.0.15__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
chunkr_ai/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
1
  from .api.chunkr import Chunkr
2
2
  from .api.chunkr_async import ChunkrAsync
3
3
 
4
- __all__ = ['Chunkr', 'ChunkrAsync']
4
+ __all__ = ["Chunkr", "ChunkrAsync"]
chunkr_ai/api/auth.py CHANGED
@@ -1,12 +1,12 @@
1
1
  class HeadersMixin:
2
2
  """Mixin class for handling authorization headers"""
3
-
3
+
4
4
  def get_api_key(self) -> str:
5
5
  """Get the API key"""
6
- if not hasattr(self, '_api_key') or not self._api_key:
6
+ if not hasattr(self, "_api_key") or not self._api_key:
7
7
  raise ValueError("API key not set")
8
8
  return self._api_key
9
-
9
+
10
10
  def _headers(self) -> dict:
11
11
  """Generate authorization headers"""
12
- return {"Authorization": self.get_api_key()}
12
+ return {"Authorization": self.get_api_key()}
chunkr_ai/api/base.py CHANGED
@@ -11,28 +11,23 @@ from PIL import Image
11
11
  import requests
12
12
  from typing import BinaryIO, Tuple, Union
13
13
 
14
+
14
15
  class ChunkrBase(HeadersMixin):
15
16
  """Base class with shared functionality for Chunkr API clients."""
16
17
 
17
18
  def __init__(self, url: str = None, api_key: str = None):
18
19
  load_dotenv()
19
- self.url = (
20
- url or
21
- os.getenv('CHUNKR_URL') or
22
- 'https://api.chunkr.ai'
23
- )
24
- self._api_key = (
25
- api_key or
26
- os.getenv('CHUNKR_API_KEY')
27
- )
20
+ self.url = url or os.getenv("CHUNKR_URL") or "https://api.chunkr.ai"
21
+ self._api_key = api_key or os.getenv("CHUNKR_API_KEY")
28
22
  if not self._api_key:
29
- raise ValueError("API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai")
30
-
23
+ raise ValueError(
24
+ "API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai"
25
+ )
26
+
31
27
  self.url = self.url.rstrip("/")
32
28
 
33
29
  def _prepare_file(
34
- self,
35
- file: Union[str, Path, BinaryIO, Image.Image]
30
+ self, file: Union[str, Path, BinaryIO, Image.Image]
36
31
  ) -> Tuple[str, BinaryIO]:
37
32
  """Convert various file types into a tuple of (filename, file-like object).
38
33
 
@@ -54,40 +49,43 @@ class ChunkrBase(HeadersMixin):
54
49
  ValueError: If the MIME type is unsupported
55
50
  """
56
51
  # Handle URLs
57
- if isinstance(file, str) and (file.startswith('http://') or file.startswith('https://')):
52
+ if isinstance(file, str) and (
53
+ file.startswith("http://") or file.startswith("https://")
54
+ ):
58
55
  response = requests.get(file)
59
56
  response.raise_for_status()
60
57
  file_obj = io.BytesIO(response.content)
61
- filename = Path(file.split('/')[-1]).name or 'downloaded_file'
58
+ filename = Path(file.split("/")[-1]).name or "downloaded_file"
62
59
  return filename, file_obj
63
60
 
64
61
  # Handle base64 strings
65
- if isinstance(file, str) and ',' in file and ';base64,' in file:
62
+ if isinstance(file, str) and "," in file and ";base64," in file:
66
63
  try:
67
64
  # Split header and data
68
- header, base64_data = file.split(',', 1)
65
+ header, base64_data = file.split(",", 1)
69
66
  import base64
67
+
70
68
  file_bytes = base64.b64decode(base64_data)
71
69
  file_obj = io.BytesIO(file_bytes)
72
-
70
+
73
71
  # Try to determine format from header
74
- format = 'bin'
75
- mime_type = header.split(':')[-1].split(';')[0].lower()
76
-
72
+ format = "bin"
73
+ mime_type = header.split(":")[-1].split(";")[0].lower()
74
+
77
75
  # Map MIME types to file extensions
78
76
  mime_to_ext = {
79
- 'application/pdf': 'pdf',
80
- 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
81
- 'application/msword': 'doc',
82
- 'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
83
- 'application/vnd.ms-powerpoint': 'ppt',
84
- 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
85
- 'application/vnd.ms-excel': 'xls',
86
- 'image/jpeg': 'jpg',
87
- 'image/png': 'png',
88
- 'image/jpg': 'jpg'
77
+ "application/pdf": "pdf",
78
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
79
+ "application/msword": "doc",
80
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
81
+ "application/vnd.ms-powerpoint": "ppt",
82
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
83
+ "application/vnd.ms-excel": "xls",
84
+ "image/jpeg": "jpg",
85
+ "image/png": "png",
86
+ "image/jpg": "jpg",
89
87
  }
90
-
88
+
91
89
  if mime_type in mime_to_ext:
92
90
  format = mime_to_ext[mime_type]
93
91
  else:
@@ -102,20 +100,24 @@ class ChunkrBase(HeadersMixin):
102
100
  path = Path(file).resolve()
103
101
  if not path.exists():
104
102
  raise FileNotFoundError(f"File not found: {file}")
105
- return path.name, open(path, 'rb')
103
+ return path.name, open(path, "rb")
106
104
 
107
105
  # Handle PIL Images
108
106
  if isinstance(file, Image.Image):
109
107
  img_byte_arr = io.BytesIO()
110
- format = file.format or 'PNG'
108
+ format = file.format or "PNG"
111
109
  file.save(img_byte_arr, format=format)
112
110
  img_byte_arr.seek(0)
113
111
  return f"image.{format.lower()}", img_byte_arr
114
112
 
115
113
  # Handle file-like objects
116
- if hasattr(file, 'read') and hasattr(file, 'seek'):
114
+ if hasattr(file, "read") and hasattr(file, "seek"):
117
115
  # Try to get the filename from the file object if possible
118
- name = getattr(file, 'name', 'document') if hasattr(file, 'name') else 'document'
116
+ name = (
117
+ getattr(file, "name", "document")
118
+ if hasattr(file, "name")
119
+ else "document"
120
+ )
119
121
  return Path(name).name, file
120
122
 
121
123
  raise TypeError(f"Unsupported file type: {type(file)}")
@@ -123,43 +125,51 @@ class ChunkrBase(HeadersMixin):
123
125
  def _prepare_upload_data(
124
126
  self,
125
127
  file: Union[str, Path, BinaryIO, Image.Image],
126
- config: Configuration = None
128
+ config: Configuration = None,
127
129
  ) -> Tuple[dict, dict]:
128
130
  """Prepare files and data dictionaries for upload.
129
-
131
+
130
132
  Args:
131
133
  file: The file to upload
132
134
  config: Optional configuration settings
133
-
135
+
134
136
  Returns:
135
137
  Tuple[dict, dict]: (files dict, data dict) ready for upload
136
138
  """
137
139
  filename, file_obj = self._prepare_file(file)
138
140
  files = {"file": (filename, file_obj)}
139
141
  data = {}
140
-
142
+
141
143
  if config:
142
144
  config_dict = config.model_dump(mode="json", exclude_none=True)
143
145
  for key, value in config_dict.items():
144
146
  if isinstance(value, dict):
145
- files[key] = (None, json.dumps(value), 'application/json')
147
+ files[key] = (None, json.dumps(value), "application/json")
146
148
  else:
147
149
  data[key] = value
148
-
150
+
149
151
  return files, data
150
-
152
+
151
153
  @abstractmethod
152
- def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
154
+ def upload(
155
+ self,
156
+ file: Union[str, Path, BinaryIO, Image.Image],
157
+ config: Configuration = None,
158
+ ) -> TaskResponse:
153
159
  """Upload a file and wait for processing to complete.
154
-
160
+
155
161
  Must be implemented by subclasses.
156
162
  """
157
163
  pass
158
164
 
159
165
  @abstractmethod
160
- def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
166
+ def start_upload(
167
+ self,
168
+ file: Union[str, Path, BinaryIO, Image.Image],
169
+ config: Configuration = None,
170
+ ) -> TaskResponse:
161
171
  """Upload a file for processing and immediately return the task response.
162
-
172
+
163
173
  Must be implemented by subclasses.
164
174
  """
165
175
  pass
@@ -167,7 +177,7 @@ class ChunkrBase(HeadersMixin):
167
177
  @abstractmethod
168
178
  def get_task(self, task_id: str) -> TaskResponse:
169
179
  """Get a task response by its ID.
170
-
180
+
171
181
  Must be implemented by subclasses.
172
182
  """
173
183
  pass
chunkr_ai/api/chunkr.py CHANGED
@@ -7,6 +7,7 @@ import requests
7
7
  from typing import Union, BinaryIO
8
8
  from .misc import prepare_upload_data
9
9
 
10
+
10
11
  class Chunkr(ChunkrBase):
11
12
  """Chunkr API client"""
12
13
 
@@ -14,56 +15,57 @@ class Chunkr(ChunkrBase):
14
15
  super().__init__(url, api_key)
15
16
  self._session = requests.Session()
16
17
 
17
- def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
18
+ def upload(
19
+ self,
20
+ file: Union[str, Path, BinaryIO, Image.Image],
21
+ config: Configuration = None,
22
+ ) -> TaskResponse:
18
23
  task = self.create_task(file, config)
19
24
  return task.poll()
20
-
25
+
21
26
  def update(self, task_id: str, config: Configuration) -> TaskResponse:
22
27
  task = self.update_task(task_id, config)
23
28
  return task.poll()
24
29
 
25
- def create_task(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
26
- files= prepare_upload_data(file, config)
30
+ def create_task(
31
+ self,
32
+ file: Union[str, Path, BinaryIO, Image.Image],
33
+ config: Configuration = None,
34
+ ) -> TaskResponse:
35
+ files = prepare_upload_data(file, config)
27
36
  if not self._session:
28
37
  raise ValueError("Session not found")
29
38
  r = self._session.post(
30
- f"{self.url}/api/v1/task",
31
- files=files,
32
- headers=self._headers()
39
+ f"{self.url}/api/v1/task", files=files, headers=self._headers()
33
40
  )
34
41
  r.raise_for_status()
35
42
  return TaskResponse(**r.json()).with_client(self)
36
-
43
+
37
44
  def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
38
45
  files = prepare_upload_data(None, config)
39
46
  if not self._session:
40
47
  raise ValueError("Session not found")
41
48
  r = self._session.patch(
42
- f"{self.url}/api/v1/task/{task_id}",
43
- files=files,
44
- headers=self._headers()
49
+ f"{self.url}/api/v1/task/{task_id}", files=files, headers=self._headers()
45
50
  )
46
-
51
+
47
52
  r.raise_for_status()
48
53
  return TaskResponse(**r.json()).with_client(self)
49
-
54
+
50
55
  def get_task(self, task_id: str) -> TaskResponse:
51
56
  if not self._session:
52
57
  raise ValueError("Session not found")
53
58
  r = self._session.get(
54
- f"{self.url}/api/v1/task/{task_id}",
55
- headers=self._headers()
59
+ f"{self.url}/api/v1/task/{task_id}", headers=self._headers()
56
60
  )
57
61
  r.raise_for_status()
58
62
  return TaskResponse(**r.json()).with_client(self)
59
63
 
60
-
61
64
  def delete_task(self, task_id: str) -> None:
62
65
  if not self._session:
63
66
  raise ValueError("Session not found")
64
67
  r = self._session.delete(
65
- f"{self.url}/api/v1/task/{task_id}",
66
- headers=self._headers()
68
+ f"{self.url}/api/v1/task/{task_id}", headers=self._headers()
67
69
  )
68
70
  r.raise_for_status()
69
71
 
@@ -71,7 +73,6 @@ class Chunkr(ChunkrBase):
71
73
  if not self._session:
72
74
  raise ValueError("Session not found")
73
75
  r = self._session.get(
74
- f"{self.url}/api/v1/task/{task_id}/cancel",
75
- headers=self._headers()
76
+ f"{self.url}/api/v1/task/{task_id}/cancel", headers=self._headers()
76
77
  )
77
78
  r.raise_for_status()
@@ -7,14 +7,19 @@ from pathlib import Path
7
7
  from PIL import Image
8
8
  from typing import Union, BinaryIO
9
9
 
10
+
10
11
  class ChunkrAsync(ChunkrBase):
11
12
  """Asynchronous Chunkr API client"""
12
-
13
+
13
14
  def __init__(self, url: str = None, api_key: str = None):
14
15
  super().__init__(url, api_key)
15
16
  self._client = httpx.AsyncClient()
16
17
 
17
- async def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponseAsync:
18
+ async def upload(
19
+ self,
20
+ file: Union[str, Path, BinaryIO, Image.Image],
21
+ config: Configuration = None,
22
+ ) -> TaskResponseAsync:
18
23
  if not self._client or self._client.is_closed:
19
24
  self._client = httpx.AsyncClient()
20
25
  try:
@@ -23,7 +28,7 @@ class ChunkrAsync(ChunkrBase):
23
28
  except Exception as e:
24
29
  await self._client.aclose()
25
30
  raise e
26
-
31
+
27
32
  async def update(self, task_id: str, config: Configuration) -> TaskResponseAsync:
28
33
  if not self._client or self._client.is_closed:
29
34
  self._client = httpx.AsyncClient()
@@ -34,15 +39,17 @@ class ChunkrAsync(ChunkrBase):
34
39
  await self._client.aclose()
35
40
  raise e
36
41
 
37
- async def create_task(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponseAsync:
42
+ async def create_task(
43
+ self,
44
+ file: Union[str, Path, BinaryIO, Image.Image],
45
+ config: Configuration = None,
46
+ ) -> TaskResponseAsync:
38
47
  if not self._client or self._client.is_closed:
39
48
  self._client = httpx.AsyncClient()
40
49
  try:
41
50
  files = prepare_upload_data(file, config)
42
51
  r = await self._client.post(
43
- f"{self.url}/api/v1/task",
44
- files=files,
45
- headers=self._headers()
52
+ f"{self.url}/api/v1/task", files=files, headers=self._headers()
46
53
  )
47
54
  r.raise_for_status()
48
55
  return TaskResponseAsync(**r.json()).with_client(self)
@@ -50,7 +57,9 @@ class ChunkrAsync(ChunkrBase):
50
57
  await self._client.aclose()
51
58
  raise e
52
59
 
53
- async def update_task(self, task_id: str, config: Configuration) -> TaskResponseAsync:
60
+ async def update_task(
61
+ self, task_id: str, config: Configuration
62
+ ) -> TaskResponseAsync:
54
63
  if not self._client or self._client.is_closed:
55
64
  self._client = httpx.AsyncClient()
56
65
  try:
@@ -58,49 +67,46 @@ class ChunkrAsync(ChunkrBase):
58
67
  r = await self._client.patch(
59
68
  f"{self.url}/api/v1/task/{task_id}",
60
69
  files=files,
61
- headers=self._headers()
70
+ headers=self._headers(),
62
71
  )
63
-
72
+
64
73
  r.raise_for_status()
65
74
  return TaskResponseAsync(**r.json()).with_client(self)
66
75
  except Exception as e:
67
76
  await self._client.aclose()
68
77
  raise e
69
-
78
+
70
79
  async def get_task(self, task_id: str) -> TaskResponseAsync:
71
80
  if not self._client or self._client.is_closed:
72
81
  self._client = httpx.AsyncClient()
73
82
  try:
74
83
  r = await self._client.get(
75
- f"{self.url}/api/v1/task/{task_id}",
76
- headers=self._headers()
84
+ f"{self.url}/api/v1/task/{task_id}", headers=self._headers()
77
85
  )
78
86
  r.raise_for_status()
79
87
  return TaskResponseAsync(**r.json()).with_client(self)
80
88
  except Exception as e:
81
89
  await self._client.aclose()
82
90
  raise e
83
-
91
+
84
92
  async def delete_task(self, task_id: str) -> None:
85
93
  if not self._client or self._client.is_closed:
86
94
  self._client = httpx.AsyncClient()
87
95
  try:
88
96
  r = await self._client.delete(
89
- f"{self.url}/api/v1/task/{task_id}",
90
- headers=self._headers()
97
+ f"{self.url}/api/v1/task/{task_id}", headers=self._headers()
91
98
  )
92
99
  r.raise_for_status()
93
100
  except Exception as e:
94
101
  await self._client.aclose()
95
102
  raise e
96
-
103
+
97
104
  async def cancel_task(self, task_id: str) -> None:
98
105
  if not self._client or self._client.is_closed:
99
106
  self._client = httpx.AsyncClient()
100
107
  try:
101
108
  r = await self._client.get(
102
- f"{self.url}/api/v1/task/{task_id}/cancel",
103
- headers=self._headers()
109
+ f"{self.url}/api/v1/task/{task_id}/cancel", headers=self._headers()
104
110
  )
105
111
  r.raise_for_status()
106
112
  except Exception as e:
@@ -111,4 +117,4 @@ class ChunkrAsync(ChunkrBase):
111
117
  return self
112
118
 
113
119
  async def __aexit__(self, exc_type, exc_val, exc_tb):
114
- await self._client.aclose()
120
+ await self._client.aclose()
@@ -9,31 +9,31 @@ from pathlib import Path
9
9
  from PIL import Image
10
10
  from typing import BinaryIO, Union
11
11
 
12
+
12
13
  class ChunkrBase(HeadersMixin):
13
14
  """Base class with shared functionality for Chunkr API clients."""
14
15
 
15
16
  def __init__(self, url: str = None, api_key: str = None):
16
17
  load_dotenv()
17
- self.url = (
18
- url or
19
- os.getenv('CHUNKR_URL') or
20
- 'https://api.chunkr.ai'
21
- )
22
- self._api_key = (
23
- api_key or
24
- os.getenv('CHUNKR_API_KEY')
25
- )
18
+ self.url = url or os.getenv("CHUNKR_URL") or "https://api.chunkr.ai"
19
+ self._api_key = api_key or os.getenv("CHUNKR_API_KEY")
26
20
  if not self._api_key:
27
- raise ValueError("API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai")
28
-
21
+ raise ValueError(
22
+ "API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai"
23
+ )
24
+
29
25
  self.url = self.url.rstrip("/")
30
26
 
31
27
  @abstractmethod
32
- def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> Union[TaskResponse, TaskResponseAsync]:
28
+ def upload(
29
+ self,
30
+ file: Union[str, Path, BinaryIO, Image.Image],
31
+ config: Configuration = None,
32
+ ) -> Union[TaskResponse, TaskResponseAsync]:
33
33
  """Upload a file and wait for processing to complete.
34
34
 
35
35
  Args:
36
- file: The file to upload.
36
+ file: The file to upload.
37
37
  config: Configuration options for processing. Optional.
38
38
 
39
39
  Examples:
@@ -44,7 +44,7 @@ class ChunkrBase(HeadersMixin):
44
44
  # Upload from opened file
45
45
  with open("document.pdf", "rb") as f:
46
46
  await chunkr.upload(f)
47
-
47
+
48
48
  # Upload from URL
49
49
  await chunkr.upload("https://example.com/document.pdf")
50
50
 
@@ -60,11 +60,13 @@ class ChunkrBase(HeadersMixin):
60
60
  TaskResponse: The completed task response
61
61
  """
62
62
  pass
63
-
63
+
64
64
  @abstractmethod
65
- def update(self, task_id: str, config: Configuration) -> Union[TaskResponse, TaskResponseAsync]:
65
+ def update(
66
+ self, task_id: str, config: Configuration
67
+ ) -> Union[TaskResponse, TaskResponseAsync]:
66
68
  """Update a task by its ID and wait for processing to complete.
67
-
69
+
68
70
  Args:
69
71
  task_id: The ID of the task to update
70
72
  config: Configuration options for processing. Optional.
@@ -75,7 +77,11 @@ class ChunkrBase(HeadersMixin):
75
77
  pass
76
78
 
77
79
  @abstractmethod
78
- def create_task(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> Union[TaskResponse, TaskResponseAsync]:
80
+ def create_task(
81
+ self,
82
+ file: Union[str, Path, BinaryIO, Image.Image],
83
+ config: Configuration = None,
84
+ ) -> Union[TaskResponse, TaskResponseAsync]:
79
85
  """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`.
80
86
 
81
87
  Args:
@@ -90,7 +96,7 @@ class ChunkrBase(HeadersMixin):
90
96
  # Upload from opened file
91
97
  with open("document.pdf", "rb") as f:
92
98
  task = await chunkr.create_task(f)
93
-
99
+
94
100
  # Upload from URL
95
101
  task = await chunkr.create_task("https://example.com/document.pdf")
96
102
 
@@ -109,9 +115,11 @@ class ChunkrBase(HeadersMixin):
109
115
  pass
110
116
 
111
117
  @abstractmethod
112
- def update_task(self, task_id: str, config: Configuration) -> Union[TaskResponse, TaskResponseAsync]:
118
+ def update_task(
119
+ self, task_id: str, config: Configuration
120
+ ) -> Union[TaskResponse, TaskResponseAsync]:
113
121
  """Update a task by its ID and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`.
114
-
122
+
115
123
  Args:
116
124
  task_id: The ID of the task to update
117
125
  config: Configuration options for processing. Optional.
@@ -120,11 +128,11 @@ class ChunkrBase(HeadersMixin):
120
128
  TaskResponse: The updated task response
121
129
  """
122
130
  pass
123
-
131
+
124
132
  @abstractmethod
125
133
  def get_task(self, task_id: str) -> Union[TaskResponse, TaskResponseAsync]:
126
134
  """Get a task response by its ID.
127
-
135
+
128
136
  Args:
129
137
  task_id: The ID of the task to get
130
138
 
@@ -136,18 +144,17 @@ class ChunkrBase(HeadersMixin):
136
144
  @abstractmethod
137
145
  def delete_task(self, task_id: str) -> None:
138
146
  """Delete a task by its ID.
139
-
147
+
140
148
  Args:
141
149
  task_id: The ID of the task to delete
142
150
  """
143
151
  pass
144
-
152
+
145
153
  @abstractmethod
146
154
  def cancel_task(self, task_id: str) -> None:
147
155
  """Cancel a task by its ID.
148
-
156
+
149
157
  Args:
150
158
  task_id: The ID of the task to cancel
151
159
  """
152
160
  pass
153
-