chunkr-ai 0.0.36__tar.gz → 0.0.38__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23) hide show
  1. {chunkr_ai-0.0.36/src/chunkr_ai.egg-info → chunkr_ai-0.0.38}/PKG-INFO +1 -1
  2. {chunkr_ai-0.0.36 → chunkr_ai-0.0.38}/pyproject.toml +1 -1
  3. {chunkr_ai-0.0.36 → chunkr_ai-0.0.38}/src/chunkr_ai/api/chunkr.py +11 -7
  4. {chunkr_ai-0.0.36 → chunkr_ai-0.0.38}/src/chunkr_ai/api/chunkr_base.py +9 -7
  5. {chunkr_ai-0.0.36 → chunkr_ai-0.0.38}/src/chunkr_ai/api/configuration.py +11 -11
  6. chunkr_ai-0.0.38/src/chunkr_ai/api/misc.py +103 -0
  7. {chunkr_ai-0.0.36 → chunkr_ai-0.0.38}/src/chunkr_ai/api/task_response.py +7 -5
  8. {chunkr_ai-0.0.36 → chunkr_ai-0.0.38/src/chunkr_ai.egg-info}/PKG-INFO +1 -1
  9. {chunkr_ai-0.0.36 → chunkr_ai-0.0.38}/tests/test_chunkr.py +35 -2
  10. chunkr_ai-0.0.36/src/chunkr_ai/api/misc.py +0 -155
  11. {chunkr_ai-0.0.36 → chunkr_ai-0.0.38}/LICENSE +0 -0
  12. {chunkr_ai-0.0.36 → chunkr_ai-0.0.38}/README.md +0 -0
  13. {chunkr_ai-0.0.36 → chunkr_ai-0.0.38}/setup.cfg +0 -0
  14. {chunkr_ai-0.0.36 → chunkr_ai-0.0.38}/src/chunkr_ai/__init__.py +0 -0
  15. {chunkr_ai-0.0.36 → chunkr_ai-0.0.38}/src/chunkr_ai/api/__init__.py +0 -0
  16. {chunkr_ai-0.0.36 → chunkr_ai-0.0.38}/src/chunkr_ai/api/auth.py +0 -0
  17. {chunkr_ai-0.0.36 → chunkr_ai-0.0.38}/src/chunkr_ai/api/decorators.py +0 -0
  18. {chunkr_ai-0.0.36 → chunkr_ai-0.0.38}/src/chunkr_ai/api/protocol.py +0 -0
  19. {chunkr_ai-0.0.36 → chunkr_ai-0.0.38}/src/chunkr_ai/models.py +0 -0
  20. {chunkr_ai-0.0.36 → chunkr_ai-0.0.38}/src/chunkr_ai.egg-info/SOURCES.txt +0 -0
  21. {chunkr_ai-0.0.36 → chunkr_ai-0.0.38}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
  22. {chunkr_ai-0.0.36 → chunkr_ai-0.0.38}/src/chunkr_ai.egg-info/requires.txt +0 -0
  23. {chunkr_ai-0.0.36 → chunkr_ai-0.0.38}/src/chunkr_ai.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: chunkr-ai
3
- Version: 0.0.36
3
+ Version: 0.0.38
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  License: MIT License
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "chunkr-ai"
7
- version = "0.0.36"
7
+ version = "0.0.38"
8
8
  authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
9
9
  description = "Python client for Chunkr: open source document intelligence"
10
10
  readme = "README.md"
@@ -1,6 +1,6 @@
1
1
  from pathlib import Path
2
2
  from PIL import Image
3
- from typing import Union, BinaryIO
3
+ from typing import Union, BinaryIO, Optional
4
4
 
5
5
  from .configuration import Configuration
6
6
  from .decorators import anywhere, ensure_client, retry_on_429
@@ -17,8 +17,9 @@ class Chunkr(ChunkrBase):
17
17
  self,
18
18
  file: Union[str, Path, BinaryIO, Image.Image],
19
19
  config: Configuration = None,
20
+ filename: Optional[str] = None,
20
21
  ) -> TaskResponse:
21
- task = await self.create_task(file, config)
22
+ task = await self.create_task(file, config, filename)
22
23
  return await task.poll()
23
24
 
24
25
  @anywhere()
@@ -34,10 +35,12 @@ class Chunkr(ChunkrBase):
34
35
  self,
35
36
  file: Union[str, Path, BinaryIO, Image.Image],
36
37
  config: Configuration = None,
38
+ filename: Optional[str] = None,
37
39
  ) -> TaskResponse:
38
- files = await prepare_upload_data(file, config, self._client)
40
+ """Create a new task with the given file and configuration."""
41
+ data = await prepare_upload_data(file, filename, config)
39
42
  r = await self._client.post(
40
- f"{self.url}/api/v1/task", files=files, headers=self._headers()
43
+ f"{self.url}/api/v1/task/parse", json=data, headers=self._headers()
41
44
  )
42
45
  r.raise_for_status()
43
46
  return TaskResponse(**r.json()).with_client(self, True, False)
@@ -46,10 +49,11 @@ class Chunkr(ChunkrBase):
46
49
  @ensure_client()
47
50
  @retry_on_429()
48
51
  async def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
49
- files = await prepare_upload_data(None, config, self._client)
52
+ """Update an existing task with new configuration."""
53
+ data = await prepare_upload_data(None, None, config)
50
54
  r = await self._client.patch(
51
- f"{self.url}/api/v1/task/{task_id}",
52
- files=files,
55
+ f"{self.url}/api/v1/task/{task_id}/parse",
56
+ json=data,
53
57
  headers=self._headers(),
54
58
  )
55
59
  r.raise_for_status()
@@ -7,8 +7,7 @@ import httpx
7
7
  import os
8
8
  from pathlib import Path
9
9
  from PIL import Image
10
- from typing import BinaryIO, Union
11
-
10
+ from typing import BinaryIO, Union, Optional
12
11
 
13
12
  class ChunkrBase(HeadersMixin):
14
13
  """Base class with shared functionality for Chunkr API clients.
@@ -20,7 +19,7 @@ class ChunkrBase(HeadersMixin):
20
19
  """
21
20
 
22
21
  def __init__(self, url: str = None, api_key: str = None, raise_on_failure: bool = False):
23
- load_dotenv()
22
+ load_dotenv(override=True)
24
23
  self.url = url or os.getenv("CHUNKR_URL") or "https://api.chunkr.ai"
25
24
  self._api_key = api_key or os.getenv("CHUNKR_API_KEY")
26
25
  self.raise_on_failure = raise_on_failure
@@ -38,13 +37,15 @@ class ChunkrBase(HeadersMixin):
38
37
  self,
39
38
  file: Union[str, Path, BinaryIO, Image.Image],
40
39
  config: Configuration = None,
40
+ filename: Optional[str] = None,
41
41
  ) -> TaskResponse:
42
42
  """Upload a file and wait for processing to complete.
43
43
 
44
44
  Args:
45
45
  file: The file to upload.
46
46
  config: Configuration options for processing. Optional.
47
-
47
+ filename: The filename to use for the file. Optional.
48
+
48
49
  Examples:
49
50
  ```python
50
51
  # Upload from file path
@@ -58,7 +59,7 @@ class ChunkrBase(HeadersMixin):
58
59
  await chunkr.upload("https://example.com/document.pdf")
59
60
 
60
61
  # Upload from base64 string (must include MIME type header)
61
- await chunkr.upload("data:application/pdf;base64,JVBERi0...")
62
+ await chunkr.upload("data:application/pdf;base64,JVBERi0...", filename="document.pdf")
62
63
 
63
64
  # Upload an image
64
65
  from PIL import Image
@@ -90,13 +91,14 @@ class ChunkrBase(HeadersMixin):
90
91
  self,
91
92
  file: Union[str, Path, BinaryIO, Image.Image],
92
93
  config: Configuration = None,
94
+ filename: Optional[str] = None,
93
95
  ) -> TaskResponse:
94
96
  """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`.
95
97
 
96
98
  Args:
97
99
  file: The file to upload.
98
100
  config: Configuration options for processing. Optional.
99
-
101
+ filename: The filename to use for the file. Optional.
100
102
  Examples:
101
103
  ```
102
104
  # Upload from file path
@@ -110,7 +112,7 @@ class ChunkrBase(HeadersMixin):
110
112
  task = await chunkr.create_task("https://example.com/document.pdf")
111
113
 
112
114
  # Upload from base64 string (must include MIME type header)
113
- task = await chunkr.create_task("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
115
+ task = await chunkr.create_task("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...", filename="document.pdf")
114
116
 
115
117
  # Upload an image
116
118
  from PIL import Image
@@ -19,20 +19,18 @@ class GenerationConfig(BaseModel):
19
19
  class SegmentProcessing(BaseModel):
20
20
  model_config = ConfigDict(populate_by_name=True, alias_generator=str.title)
21
21
 
22
- title: Optional[GenerationConfig] = Field(default=None, alias="Title")
23
- section_header: Optional[GenerationConfig] = Field(
24
- default=None, alias="SectionHeader"
25
- )
26
- text: Optional[GenerationConfig] = Field(default=None, alias="Text")
27
- list_item: Optional[GenerationConfig] = Field(default=None, alias="ListItem")
28
- table: Optional[GenerationConfig] = Field(default=None, alias="Table")
29
- picture: Optional[GenerationConfig] = Field(default=None, alias="Picture")
30
22
  caption: Optional[GenerationConfig] = Field(default=None, alias="Caption")
31
- formula: Optional[GenerationConfig] = Field(default=None, alias="Formula")
32
23
  footnote: Optional[GenerationConfig] = Field(default=None, alias="Footnote")
33
- page_header: Optional[GenerationConfig] = Field(default=None, alias="PageHeader")
34
- page_footer: Optional[GenerationConfig] = Field(default=None, alias="PageFooter")
24
+ formula: Optional[GenerationConfig] = Field(default=None, alias="Formula")
25
+ list_item: Optional[GenerationConfig] = Field(default=None, alias="ListItem")
35
26
  page: Optional[GenerationConfig] = Field(default=None, alias="Page")
27
+ page_footer: Optional[GenerationConfig] = Field(default=None, alias="PageFooter")
28
+ page_header: Optional[GenerationConfig] = Field(default=None, alias="PageHeader")
29
+ picture: Optional[GenerationConfig] = Field(default=None, alias="Picture")
30
+ section_header: Optional[GenerationConfig] = Field(default=None, alias="SectionHeader")
31
+ table: Optional[GenerationConfig] = Field(default=None, alias="Table")
32
+ text: Optional[GenerationConfig] = Field(default=None, alias="Text")
33
+ title: Optional[GenerationConfig] = Field(default=None, alias="Title")
36
34
 
37
35
  class ChunkProcessing(BaseModel):
38
36
  ignore_headers_and_footers: Optional[bool] = None
@@ -84,11 +82,13 @@ class Segment(BaseModel):
84
82
  page_width: float
85
83
  segment_id: str
86
84
  segment_type: SegmentType
85
+ confidence: Optional[float]
87
86
 
88
87
  class Chunk(BaseModel):
89
88
  chunk_id: str
90
89
  chunk_length: int
91
90
  segments: List[Segment]
91
+ embed: Optional[str] = None
92
92
 
93
93
  class OutputResponse(BaseModel):
94
94
  chunks: List[Chunk]
@@ -0,0 +1,103 @@
1
+ from .configuration import Configuration
2
+ import base64
3
+ import io
4
+ from pathlib import Path
5
+ from PIL import Image
6
+ from typing import Union, Tuple, BinaryIO, Optional
7
+
8
def _is_base64(text: str) -> bool:
    """Return True when *text* is a non-empty, strictly valid base64 string."""
    # Line-wrapped base64 is common, so whitespace is ignored for the check.
    compact = "".join(text.split())
    if not compact:
        return False
    try:
        # validate=True is essential: the lenient default silently discards
        # characters outside the alphabet (".", ":", ...), which lets ordinary
        # file names such as "file1.pdf" pass as base64.
        base64.b64decode(compact, validate=True)
    except ValueError:  # binascii.Error is a ValueError subclass
        return False
    return True


async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[Optional[str], str]:
    """Convert various file inputs into a tuple of (filename, file content).

    Args:
        file: Input file, can be:
            - URL string starting with http:// or https:// (returned as-is)
            - Base64 string, optionally prefixed with a
              ``data:<mime>;base64,`` header (returned as-is)
            - Local file path as ``str`` or ``Path`` (read and base64-encoded)
            - Opened binary file (read from the start and base64-encoded)
            - PIL/Pillow Image object (serialized and base64-encoded)

    Returns:
        Tuple[Optional[str], str]: (filename, content) where content is either
        a URL or a base64 string. The filename is None for URLs, base64
        strings, and PIL Images.

    Raises:
        FileNotFoundError: If the file path doesn't exist.
        TypeError: If the file type is not supported.
        ValueError: If a string is empty or carries an invalid base64 payload,
            or if a path / file object has no file extension.
    """
    # Handle strings: URL, then base64, then fall back to a filesystem path.
    if isinstance(file, str):
        if file.startswith(("http://", "https://")):
            return None, file

        if file.startswith("data:") and ";base64," in file:
            # Only the payload after the header is base64; the string itself
            # is forwarded unchanged.
            # NOTE(review): assumes the API accepts the data-URL header, as the
            # client docstrings advertise - confirm against the server.
            if not _is_base64(file.split(";base64,", 1)[1]):
                raise ValueError("File must be a valid path, URL, or base64 string")
            return None, file

        if _is_base64(file):
            return None, file

        if not file.strip():
            raise ValueError("File must be a valid path, URL, or base64 string")

        file = Path(file)

    # Handle file paths - convert to base64
    if isinstance(file, Path):
        path = file.resolve()
        if not path.exists():
            raise FileNotFoundError(f"File not found: {file}")
        # Check the extension before reading so a rejected file is never loaded.
        if not path.suffix.lstrip("."):
            raise ValueError("File must have an extension")
        return path.name, base64.b64encode(path.read_bytes()).decode()

    # Handle PIL Images - convert to base64
    if isinstance(file, Image.Image):
        buffer = io.BytesIO()
        # Images created in memory have no format; fall back to PNG.
        image_format = file.format or "PNG"
        file.save(buffer, format=image_format)
        return None, base64.b64encode(buffer.getvalue()).decode()

    # Handle file-like objects - convert to base64
    if hasattr(file, "read") and hasattr(file, "seek"):
        file.seek(0)  # read from the beginning regardless of the current position
        file_content = file.read()
        name = getattr(file, "name", "document")
        if not Path(name).suffix.lstrip("."):
            raise ValueError("File must have an extension")
        return Path(name).name, base64.b64encode(file_content).decode()

    raise TypeError(f"Unsupported file type: {type(file)}")
77
+
78
+
79
async def prepare_upload_data(
    file: Optional[Union[str, Path, BinaryIO, Image.Image]] = None,
    filename: Optional[str] = None,
    config: Optional[Configuration] = None,
) -> dict:
    """Build the JSON body sent to the task endpoints.

    Args:
        file: The file to upload. When falsy, no file entries are added.
        filename: Name to report for the file; takes precedence over the name
            derived from the file itself.
        config: Optional configuration settings, merged into the body with
            unset (None) fields left out.

    Returns:
        dict: JSON-serializable data dictionary ready for upload
    """
    payload = {}

    if file:
        inferred_name, content = await prepare_file(file)
        # An explicit filename wins; otherwise use whatever the file provided.
        payload.update(file=content, file_name=filename or inferred_name)

    if config:
        payload.update(config.model_dump(mode="json", exclude_none=True))

    return payload
@@ -74,9 +74,11 @@ class TaskResponse(BaseModel, Generic[T]):
74
74
  @retry_on_429()
75
75
  async def update(self, config: Configuration) -> T:
76
76
  """Update the task configuration."""
77
- f = await prepare_upload_data(None, config, self._client._client)
77
+ data = await prepare_upload_data(None, None, config)
78
78
  r = await self._client._client.patch(
79
- self.task_url, files=f, headers=self._client._headers()
79
+ f"{self.task_url}/parse",
80
+ json=data,
81
+ headers=self._client._headers()
80
82
  )
81
83
  r.raise_for_status()
82
84
  updated = TaskResponse(**r.json()).with_client(self._client)
@@ -142,7 +144,7 @@ class TaskResponse(BaseModel, Generic[T]):
142
144
  Args:
143
145
  output_file (str, optional): Path to save the markdown content. Defaults to None.
144
146
  """
145
- content = self._get_content("markdown")
147
+ content = self._get_content("markdown", separator="\n\n")
146
148
  self._write_to_file(content, output_file)
147
149
  return content
148
150
 
@@ -166,7 +168,7 @@ class TaskResponse(BaseModel, Generic[T]):
166
168
  self._write_to_file(data, output_file, is_json=True)
167
169
  return data
168
170
 
169
- def _get_content(self, t: str) -> str:
171
+ def _get_content(self, t: str, separator: str = "\n") -> str:
170
172
  if not self.output:
171
173
  return ""
172
174
  parts = []
@@ -175,4 +177,4 @@ class TaskResponse(BaseModel, Generic[T]):
175
177
  v = getattr(s, t)
176
178
  if v:
177
179
  parts.append(v)
178
- return "\n".join(parts)
180
+ return separator.join(parts)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: chunkr-ai
3
- Version: 0.0.36
3
+ Version: 0.0.38
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  License: MIT License
@@ -2,6 +2,7 @@ import pytest
2
2
  from pathlib import Path
3
3
  from PIL import Image
4
4
  import asyncio
5
+ import base64
5
6
 
6
7
  from chunkr_ai import Chunkr
7
8
  from chunkr_ai.models import (
@@ -68,6 +69,7 @@ async def test_send_pil_image(client, sample_image):
68
69
  assert response.task_id is not None
69
70
  assert response.status == "Succeeded"
70
71
  assert response.output is not None
72
+ assert response.output is not None
71
73
 
72
74
  @pytest.mark.asyncio
73
75
  async def test_ocr_auto(client, sample_path):
@@ -222,15 +224,41 @@ async def test_task_operations_after_client_close(client, sample_path):
222
224
  result = await task.poll()
223
225
  assert result.status == "Succeeded"
224
226
 
227
@pytest.mark.asyncio
async def test_send_base64_file(client, sample_path):
    """Upload the sample document as a bare base64 string, without a filename."""
    with open(sample_path, "rb") as source:
        raw_bytes = source.read()
    encoded = base64.b64encode(raw_bytes).decode("utf-8")

    response = await client.upload(encoded)

    assert response.task_id is not None
    assert response.status == "Succeeded"
    assert response.output is not None
236
+
237
@pytest.mark.asyncio
async def test_send_base64_file_with_filename(client, sample_path):
    """Upload the sample document as base64 together with an explicit filename."""
    with open(sample_path, "rb") as source:
        raw_bytes = source.read()
    encoded = base64.b64encode(raw_bytes).decode("utf-8")

    response = await client.upload(encoded, filename="test.pdf")

    assert response.task_id is not None
    assert response.status == "Succeeded"
    assert response.output is not None
247
+
225
248
  @pytest.mark.asyncio
226
249
  async def test_output_files_no_dir(client, sample_path, tmp_path):
227
- await client.upload(sample_path)
250
+ task = await client.upload(sample_path)
228
251
 
229
252
  html_file = tmp_path / "output.html"
230
253
  md_file = tmp_path / "output.md"
231
254
  content_file = tmp_path / "output.txt"
232
255
  json_file = tmp_path / "output.json"
233
256
 
257
+ task.html(html_file)
258
+ task.markdown(md_file)
259
+ task.content(content_file)
260
+ task.json(json_file)
261
+
234
262
  assert html_file.exists()
235
263
  assert md_file.exists()
236
264
  assert content_file.exists()
@@ -238,13 +266,18 @@ async def test_output_files_no_dir(client, sample_path, tmp_path):
238
266
 
239
267
  @pytest.mark.asyncio
240
268
  async def test_output_files_with_dirs(client, sample_path, tmp_path):
241
- await client.upload(sample_path)
269
+ task = await client.upload(sample_path)
242
270
 
243
271
  nested_dir = tmp_path / "nested" / "output" / "dir"
244
272
  html_file = nested_dir / "output.html"
245
273
  md_file = nested_dir / "output.md"
246
274
  content_file = nested_dir / "output.txt"
247
275
  json_file = nested_dir / "output.json"
276
+
277
+ task.html(html_file)
278
+ task.markdown(md_file)
279
+ task.content(content_file)
280
+ task.json(json_file)
248
281
 
249
282
  assert html_file.exists()
250
283
  assert md_file.exists()
@@ -1,155 +0,0 @@
1
- from .configuration import Configuration
2
- import io
3
- import json
4
- from pathlib import Path
5
- from PIL import Image
6
- import httpx
7
- from typing import Union, Tuple, BinaryIO, Optional
8
-
9
- async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image], client: httpx.AsyncClient = None) -> Tuple[str, BinaryIO]:
10
- """Convert various file types into a tuple of (filename, file-like object).
11
-
12
- Args:
13
- file: Input file, can be:
14
- - String or Path to a file
15
- - URL string starting with http:// or https://
16
- - Base64 string
17
- - Opened binary file (mode='rb')
18
- - PIL/Pillow Image object
19
-
20
- Returns:
21
- Tuple[str, BinaryIO]: (filename, file-like object) ready for upload
22
-
23
- Raises:
24
- FileNotFoundError: If the file path doesn't exist
25
- TypeError: If the file type is not supported
26
- ValueError: If the URL is invalid or unreachable
27
- ValueError: If the MIME type is unsupported
28
- """
29
- # Handle URLs
30
- if isinstance(file, str) and (
31
- file.startswith("http://") or file.startswith("https://")
32
- ):
33
- if not client:
34
- raise ValueError("Client must be provided to download files from URLs")
35
- response = await client.get(file)
36
- response.raise_for_status()
37
-
38
- # Try to get filename from Content-Disposition header first
39
- filename = None
40
- content_disposition = response.headers.get("Content-Disposition")
41
- if content_disposition and "filename=" in content_disposition:
42
- filename = content_disposition.split("filename=")[-1].strip("\"'")
43
-
44
- # If no Content-Disposition, try to get clean filename from URL path
45
- if not filename:
46
- from urllib.parse import urlparse, unquote
47
-
48
- parsed_url = urlparse(file)
49
- path = unquote(parsed_url.path)
50
- filename = Path(path).name if path else None
51
-
52
- # Fallback to default name if we couldn't extract one
53
- filename = filename or "downloaded_file"
54
-
55
- # Sanitize filename: remove invalid characters and limit length
56
- import re
57
-
58
- filename = re.sub(
59
- r'[<>:"/\\|?*%]', "_", filename
60
- ) # Replace invalid chars with underscore
61
- filename = re.sub(r"\s+", "_", filename) # Replace whitespace with underscore
62
- filename = filename.strip("._") # Remove leading/trailing dots and underscores
63
- filename = filename[:255] # Limit length to 255 characters
64
-
65
- file_obj = io.BytesIO(response.content)
66
- return filename, file_obj
67
-
68
- # Handle base64 strings
69
- if isinstance(file, str) and "," in file and ";base64," in file:
70
- try:
71
- # Split header and data
72
- header, base64_data = file.split(",", 1)
73
- import base64
74
-
75
- file_bytes = base64.b64decode(base64_data)
76
- file_obj = io.BytesIO(file_bytes)
77
-
78
- # Try to determine format from header
79
- format = "bin"
80
- mime_type = header.split(":")[-1].split(";")[0].lower()
81
-
82
- # Map MIME types to file extensions
83
- mime_to_ext = {
84
- "application/pdf": "pdf",
85
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
86
- "application/msword": "doc",
87
- "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
88
- "application/vnd.ms-powerpoint": "ppt",
89
- "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
90
- "application/vnd.ms-excel": "xls",
91
- "image/jpeg": "jpg",
92
- "image/png": "png",
93
- "image/jpg": "jpg",
94
- }
95
-
96
- if mime_type in mime_to_ext:
97
- format = mime_to_ext[mime_type]
98
- else:
99
- raise ValueError(f"Unsupported MIME type: {mime_type}")
100
-
101
- return f"file.{format}", file_obj
102
- except Exception as e:
103
- raise ValueError(f"Invalid base64 string: {str(e)}")
104
-
105
- # Handle file paths
106
- if isinstance(file, (str, Path)):
107
- path = Path(file).resolve()
108
- if not path.exists():
109
- raise FileNotFoundError(f"File not found: {file}")
110
- return path.name, open(path, "rb")
111
-
112
- # Handle PIL Images
113
- if isinstance(file, Image.Image):
114
- img_byte_arr = io.BytesIO()
115
- format = file.format or "PNG"
116
- file.save(img_byte_arr, format=format)
117
- img_byte_arr.seek(0)
118
- return f"image.{format.lower()}", img_byte_arr
119
-
120
- # Handle file-like objects
121
- if hasattr(file, "read") and hasattr(file, "seek"):
122
- # Try to get the filename from the file object if possible
123
- name = (
124
- getattr(file, "name", "document") if hasattr(file, "name") else "document"
125
- )
126
- return Path(name).name, file
127
-
128
- raise TypeError(f"Unsupported file type: {type(file)}")
129
-
130
-
131
- async def prepare_upload_data(
132
- file: Optional[Union[str, Path, BinaryIO, Image.Image]] = None,
133
- config: Optional[Configuration] = None,
134
- client: httpx.AsyncClient = None,
135
- ) -> dict:
136
- """Prepare files and data dictionaries for upload.
137
-
138
- Args:
139
- file: The file to upload
140
- config: Optional configuration settings
141
-
142
- Returns:
143
- dict: (files dict) ready for upload
144
- """
145
- files = {}
146
- if file:
147
- filename, file_obj = await prepare_file(file, client)
148
- files = {"file": (filename, file_obj)}
149
-
150
- if config:
151
- config_dict = config.model_dump(mode="json", exclude_none=True)
152
- for key, value in config_dict.items():
153
- files[key] = (None, json.dumps(value), "application/json")
154
-
155
- return files
File without changes
File without changes
File without changes