chunkr-ai 0.0.36__py3-none-any.whl → 0.0.38__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
chunkr_ai/api/chunkr.py CHANGED
@@ -1,6 +1,6 @@
1
1
  from pathlib import Path
2
2
  from PIL import Image
3
- from typing import Union, BinaryIO
3
+ from typing import Union, BinaryIO, Optional
4
4
 
5
5
  from .configuration import Configuration
6
6
  from .decorators import anywhere, ensure_client, retry_on_429
@@ -17,8 +17,9 @@ class Chunkr(ChunkrBase):
17
17
  self,
18
18
  file: Union[str, Path, BinaryIO, Image.Image],
19
19
  config: Configuration = None,
20
+ filename: Optional[str] = None,
20
21
  ) -> TaskResponse:
21
- task = await self.create_task(file, config)
22
+ task = await self.create_task(file, config, filename)
22
23
  return await task.poll()
23
24
 
24
25
  @anywhere()
@@ -34,10 +35,12 @@ class Chunkr(ChunkrBase):
34
35
  self,
35
36
  file: Union[str, Path, BinaryIO, Image.Image],
36
37
  config: Configuration = None,
38
+ filename: Optional[str] = None,
37
39
  ) -> TaskResponse:
38
- files = await prepare_upload_data(file, config, self._client)
40
+ """Create a new task with the given file and configuration."""
41
+ data = await prepare_upload_data(file, filename, config)
39
42
  r = await self._client.post(
40
- f"{self.url}/api/v1/task", files=files, headers=self._headers()
43
+ f"{self.url}/api/v1/task/parse", json=data, headers=self._headers()
41
44
  )
42
45
  r.raise_for_status()
43
46
  return TaskResponse(**r.json()).with_client(self, True, False)
@@ -46,10 +49,11 @@ class Chunkr(ChunkrBase):
46
49
  @ensure_client()
47
50
  @retry_on_429()
48
51
  async def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
49
- files = await prepare_upload_data(None, config, self._client)
52
+ """Update an existing task with new configuration."""
53
+ data = await prepare_upload_data(None, None, config)
50
54
  r = await self._client.patch(
51
- f"{self.url}/api/v1/task/{task_id}",
52
- files=files,
55
+ f"{self.url}/api/v1/task/{task_id}/parse",
56
+ json=data,
53
57
  headers=self._headers(),
54
58
  )
55
59
  r.raise_for_status()
@@ -7,8 +7,7 @@ import httpx
7
7
  import os
8
8
  from pathlib import Path
9
9
  from PIL import Image
10
- from typing import BinaryIO, Union
11
-
10
+ from typing import BinaryIO, Union, Optional
12
11
 
13
12
  class ChunkrBase(HeadersMixin):
14
13
  """Base class with shared functionality for Chunkr API clients.
@@ -20,7 +19,7 @@ class ChunkrBase(HeadersMixin):
20
19
  """
21
20
 
22
21
  def __init__(self, url: str = None, api_key: str = None, raise_on_failure: bool = False):
23
- load_dotenv()
22
+ load_dotenv(override=True)
24
23
  self.url = url or os.getenv("CHUNKR_URL") or "https://api.chunkr.ai"
25
24
  self._api_key = api_key or os.getenv("CHUNKR_API_KEY")
26
25
  self.raise_on_failure = raise_on_failure
@@ -38,13 +37,15 @@ class ChunkrBase(HeadersMixin):
38
37
  self,
39
38
  file: Union[str, Path, BinaryIO, Image.Image],
40
39
  config: Configuration = None,
40
+ filename: Optional[str] = None,
41
41
  ) -> TaskResponse:
42
42
  """Upload a file and wait for processing to complete.
43
43
 
44
44
  Args:
45
45
  file: The file to upload.
46
46
  config: Configuration options for processing. Optional.
47
-
47
+ filename: The filename to use for the file. Optional.
48
+
48
49
  Examples:
49
50
  ```python
50
51
  # Upload from file path
@@ -58,7 +59,7 @@ class ChunkrBase(HeadersMixin):
58
59
  await chunkr.upload("https://example.com/document.pdf")
59
60
 
60
61
  # Upload from base64 string (must include MIME type header)
61
- await chunkr.upload("data:application/pdf;base64,JVBERi0...")
62
+ await chunkr.upload("data:application/pdf;base64,JVBERi0...", filename="document.pdf")
62
63
 
63
64
  # Upload an image
64
65
  from PIL import Image
@@ -90,13 +91,14 @@ class ChunkrBase(HeadersMixin):
90
91
  self,
91
92
  file: Union[str, Path, BinaryIO, Image.Image],
92
93
  config: Configuration = None,
94
+ filename: Optional[str] = None,
93
95
  ) -> TaskResponse:
94
96
  """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`.
95
97
 
96
98
  Args:
97
99
  file: The file to upload.
98
100
  config: Configuration options for processing. Optional.
99
-
101
+ filename: The filename to use for the file. Optional.
100
102
  Examples:
101
103
  ```
102
104
  # Upload from file path
@@ -110,7 +112,7 @@ class ChunkrBase(HeadersMixin):
110
112
  task = await chunkr.create_task("https://example.com/document.pdf")
111
113
 
112
114
  # Upload from base64 string (must include MIME type header)
113
- task = await chunkr.create_task("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
115
+ task = await chunkr.create_task("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...", filename="document.pdf")
114
116
 
115
117
  # Upload an image
116
118
  from PIL import Image
@@ -19,20 +19,18 @@ class GenerationConfig(BaseModel):
19
19
  class SegmentProcessing(BaseModel):
20
20
  model_config = ConfigDict(populate_by_name=True, alias_generator=str.title)
21
21
 
22
- title: Optional[GenerationConfig] = Field(default=None, alias="Title")
23
- section_header: Optional[GenerationConfig] = Field(
24
- default=None, alias="SectionHeader"
25
- )
26
- text: Optional[GenerationConfig] = Field(default=None, alias="Text")
27
- list_item: Optional[GenerationConfig] = Field(default=None, alias="ListItem")
28
- table: Optional[GenerationConfig] = Field(default=None, alias="Table")
29
- picture: Optional[GenerationConfig] = Field(default=None, alias="Picture")
30
22
  caption: Optional[GenerationConfig] = Field(default=None, alias="Caption")
31
- formula: Optional[GenerationConfig] = Field(default=None, alias="Formula")
32
23
  footnote: Optional[GenerationConfig] = Field(default=None, alias="Footnote")
33
- page_header: Optional[GenerationConfig] = Field(default=None, alias="PageHeader")
34
- page_footer: Optional[GenerationConfig] = Field(default=None, alias="PageFooter")
24
+ formula: Optional[GenerationConfig] = Field(default=None, alias="Formula")
25
+ list_item: Optional[GenerationConfig] = Field(default=None, alias="ListItem")
35
26
  page: Optional[GenerationConfig] = Field(default=None, alias="Page")
27
+ page_footer: Optional[GenerationConfig] = Field(default=None, alias="PageFooter")
28
+ page_header: Optional[GenerationConfig] = Field(default=None, alias="PageHeader")
29
+ picture: Optional[GenerationConfig] = Field(default=None, alias="Picture")
30
+ section_header: Optional[GenerationConfig] = Field(default=None, alias="SectionHeader")
31
+ table: Optional[GenerationConfig] = Field(default=None, alias="Table")
32
+ text: Optional[GenerationConfig] = Field(default=None, alias="Text")
33
+ title: Optional[GenerationConfig] = Field(default=None, alias="Title")
36
34
 
37
35
  class ChunkProcessing(BaseModel):
38
36
  ignore_headers_and_footers: Optional[bool] = None
@@ -84,11 +82,13 @@ class Segment(BaseModel):
84
82
  page_width: float
85
83
  segment_id: str
86
84
  segment_type: SegmentType
85
+ confidence: Optional[float]
87
86
 
88
87
  class Chunk(BaseModel):
89
88
  chunk_id: str
90
89
  chunk_length: int
91
90
  segments: List[Segment]
91
+ embed: Optional[str] = None
92
92
 
93
93
  class OutputResponse(BaseModel):
94
94
  chunks: List[Chunk]
chunkr_ai/api/misc.py CHANGED
@@ -1,155 +1,103 @@
1
1
  from .configuration import Configuration
2
+ import base64
2
3
  import io
3
- import json
4
4
  from pathlib import Path
5
5
  from PIL import Image
6
- import httpx
7
6
  from typing import Union, Tuple, BinaryIO, Optional
8
7
 
9
- async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image], client: httpx.AsyncClient = None) -> Tuple[str, BinaryIO]:
10
- """Convert various file types into a tuple of (filename, file-like object).
8
+ async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[Optional[str], str]:
9
+ """Convert various file types into a tuple of (filename, file content).
11
10
 
12
- Args:
13
- file: Input file, can be:
14
- - String or Path to a file
15
- - URL string starting with http:// or https://
16
- - Base64 string
17
- - Opened binary file (mode='rb')
18
- - PIL/Pillow Image object
19
-
20
- Returns:
21
- Tuple[str, BinaryIO]: (filename, file-like object) ready for upload
22
-
23
- Raises:
24
- FileNotFoundError: If the file path doesn't exist
25
- TypeError: If the file type is not supported
26
- ValueError: If the URL is invalid or unreachable
27
- ValueError: If the MIME type is unsupported
28
- """
29
- # Handle URLs
30
- if isinstance(file, str) and (
31
- file.startswith("http://") or file.startswith("https://")
32
- ):
33
- if not client:
34
- raise ValueError("Client must be provided to download files from URLs")
35
- response = await client.get(file)
36
- response.raise_for_status()
37
-
38
- # Try to get filename from Content-Disposition header first
39
- filename = None
40
- content_disposition = response.headers.get("Content-Disposition")
41
- if content_disposition and "filename=" in content_disposition:
42
- filename = content_disposition.split("filename=")[-1].strip("\"'")
43
-
44
- # If no Content-Disposition, try to get clean filename from URL path
45
- if not filename:
46
- from urllib.parse import urlparse, unquote
47
-
48
- parsed_url = urlparse(file)
49
- path = unquote(parsed_url.path)
50
- filename = Path(path).name if path else None
51
-
52
- # Fallback to default name if we couldn't extract one
53
- filename = filename or "downloaded_file"
54
-
55
- # Sanitize filename: remove invalid characters and limit length
56
- import re
57
-
58
- filename = re.sub(
59
- r'[<>:"/\\|?*%]', "_", filename
60
- ) # Replace invalid chars with underscore
61
- filename = re.sub(r"\s+", "_", filename) # Replace whitespace with underscore
62
- filename = filename.strip("._") # Remove leading/trailing dots and underscores
63
- filename = filename[:255] # Limit length to 255 characters
64
-
65
- file_obj = io.BytesIO(response.content)
66
- return filename, file_obj
11
+ Args:
12
+ file: Input file, can be:
13
+ - URL string starting with http:// or https://
14
+ - Base64 string
15
+ - Local file path (will be converted to base64)
16
+ - Opened binary file (will be converted to base64)
17
+ - PIL/Pillow Image object (will be converted to base64)
67
18
 
68
- # Handle base64 strings
69
- if isinstance(file, str) and "," in file and ";base64," in file:
19
+ Returns:
20
+ Tuple[Optional[str], str]: (filename, content) where content is either a URL or base64 string
21
+ The filename may be None for URLs, base64 strings, and PIL Images
22
+
23
+ Raises:
24
+ FileNotFoundError: If the file path doesn't exist
25
+ TypeError: If the file type is not supported
26
+ ValueError: If the URL is invalid or unreachable
27
+ ValueError: If the MIME type is unsupported
28
+ """
29
+ # Handle strings
30
+ if isinstance(file, str):
31
+ if file.startswith(('http://', 'https://')):
32
+ return None, file
70
33
  try:
71
- # Split header and data
72
- header, base64_data = file.split(",", 1)
73
- import base64
74
-
75
- file_bytes = base64.b64decode(base64_data)
76
- file_obj = io.BytesIO(file_bytes)
77
-
78
- # Try to determine format from header
79
- format = "bin"
80
- mime_type = header.split(":")[-1].split(";")[0].lower()
81
-
82
- # Map MIME types to file extensions
83
- mime_to_ext = {
84
- "application/pdf": "pdf",
85
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
86
- "application/msword": "doc",
87
- "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
88
- "application/vnd.ms-powerpoint": "ppt",
89
- "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
90
- "application/vnd.ms-excel": "xls",
91
- "image/jpeg": "jpg",
92
- "image/png": "png",
93
- "image/jpg": "jpg",
94
- }
95
-
96
- if mime_type in mime_to_ext:
97
- format = mime_to_ext[mime_type]
98
- else:
99
- raise ValueError(f"Unsupported MIME type: {mime_type}")
100
-
101
- return f"file.{format}", file_obj
102
- except Exception as e:
103
- raise ValueError(f"Invalid base64 string: {str(e)}")
104
-
105
- # Handle file paths
106
- if isinstance(file, (str, Path)):
34
+ base64.b64decode(file)
35
+ return None, file
36
+ except:
37
+ try:
38
+ file = Path(file)
39
+ except:
40
+ raise ValueError("File must be a valid path, URL, or base64 string")
41
+
42
+ # Handle file paths - convert to base64
43
+ if isinstance(file, Path):
107
44
  path = Path(file).resolve()
108
45
  if not path.exists():
109
46
  raise FileNotFoundError(f"File not found: {file}")
110
- return path.name, open(path, "rb")
111
-
112
- # Handle PIL Images
47
+
48
+ with open(path, "rb") as f:
49
+ file_content = f.read()
50
+ file_ext = path.suffix.lower().lstrip('.')
51
+ if not file_ext:
52
+ raise ValueError("File must have an extension")
53
+ base64_str = base64.b64encode(file_content).decode()
54
+ return path.name, base64_str
55
+
56
+ # Handle PIL Images - convert to base64
113
57
  if isinstance(file, Image.Image):
114
58
  img_byte_arr = io.BytesIO()
115
59
  format = file.format or "PNG"
116
60
  file.save(img_byte_arr, format=format)
117
61
  img_byte_arr.seek(0)
118
- return f"image.{format.lower()}", img_byte_arr
62
+ base64_str = base64.b64encode(img_byte_arr.getvalue()).decode()
63
+ return None, base64_str
119
64
 
120
- # Handle file-like objects
65
+ # Handle file-like objects - convert to base64
121
66
  if hasattr(file, "read") and hasattr(file, "seek"):
122
- # Try to get the filename from the file object if possible
123
- name = (
124
- getattr(file, "name", "document") if hasattr(file, "name") else "document"
125
- )
126
- return Path(name).name, file
67
+ file.seek(0)
68
+ file_content = file.read()
69
+ name = getattr(file, "name", "document")
70
+ file_ext = Path(name).suffix.lower().lstrip('.')
71
+ if not file_ext:
72
+ raise ValueError("File must have an extension")
73
+ base64_str = base64.b64encode(file_content).decode()
74
+ return Path(name).name, base64_str
127
75
 
128
76
  raise TypeError(f"Unsupported file type: {type(file)}")
129
77
 
130
78
 
131
79
  async def prepare_upload_data(
132
80
  file: Optional[Union[str, Path, BinaryIO, Image.Image]] = None,
81
+ filename: Optional[str] = None,
133
82
  config: Optional[Configuration] = None,
134
- client: httpx.AsyncClient = None,
135
83
  ) -> dict:
136
- """Prepare files and data dictionaries for upload.
84
+ """Prepare data dictionary for upload.
137
85
 
138
86
  Args:
139
87
  file: The file to upload
140
88
  config: Optional configuration settings
89
+ client: HTTP client for downloading remote files
141
90
 
142
91
  Returns:
143
- dict: (files dict) ready for upload
92
+ dict: JSON-serializable data dictionary ready for upload
144
93
  """
145
- files = {}
94
+ data = {}
146
95
  if file:
147
- filename, file_obj = await prepare_file(file, client)
148
- files = {"file": (filename, file_obj)}
96
+ processed_filename, processed_file = await prepare_file(file)
97
+ data["file"] = processed_file
98
+ data["file_name"] = filename or processed_filename
149
99
 
150
100
  if config:
151
- config_dict = config.model_dump(mode="json", exclude_none=True)
152
- for key, value in config_dict.items():
153
- files[key] = (None, json.dumps(value), "application/json")
154
-
155
- return files
101
+ data.update(config.model_dump(mode="json", exclude_none=True))
102
+
103
+ return data
@@ -74,9 +74,11 @@ class TaskResponse(BaseModel, Generic[T]):
74
74
  @retry_on_429()
75
75
  async def update(self, config: Configuration) -> T:
76
76
  """Update the task configuration."""
77
- f = await prepare_upload_data(None, config, self._client._client)
77
+ data = await prepare_upload_data(None, None, config)
78
78
  r = await self._client._client.patch(
79
- self.task_url, files=f, headers=self._client._headers()
79
+ f"{self.task_url}/parse",
80
+ json=data,
81
+ headers=self._client._headers()
80
82
  )
81
83
  r.raise_for_status()
82
84
  updated = TaskResponse(**r.json()).with_client(self._client)
@@ -142,7 +144,7 @@ class TaskResponse(BaseModel, Generic[T]):
142
144
  Args:
143
145
  output_file (str, optional): Path to save the markdown content. Defaults to None.
144
146
  """
145
- content = self._get_content("markdown")
147
+ content = self._get_content("markdown", separator="\n\n")
146
148
  self._write_to_file(content, output_file)
147
149
  return content
148
150
 
@@ -166,7 +168,7 @@ class TaskResponse(BaseModel, Generic[T]):
166
168
  self._write_to_file(data, output_file, is_json=True)
167
169
  return data
168
170
 
169
- def _get_content(self, t: str) -> str:
171
+ def _get_content(self, t: str, separator: str = "\n") -> str:
170
172
  if not self.output:
171
173
  return ""
172
174
  parts = []
@@ -175,4 +177,4 @@ class TaskResponse(BaseModel, Generic[T]):
175
177
  v = getattr(s, t)
176
178
  if v:
177
179
  parts.append(v)
178
- return "\n".join(parts)
180
+ return separator.join(parts)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: chunkr-ai
3
- Version: 0.0.36
3
+ Version: 0.0.38
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  License: MIT License
@@ -0,0 +1,16 @@
1
+ chunkr_ai/__init__.py,sha256=6KpYv2lmD6S5z2kc9pqwuLP5VDHmOuu2qDZArUIhb1s,53
2
+ chunkr_ai/models.py,sha256=tOI7ylkhyeFfCLMisk96EPsH4UEcjBx1Mcisxc_AYXI,757
3
+ chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ chunkr_ai/api/auth.py,sha256=hlv0GiUmlsbFO1wLL9sslqOnsBSoBqkL_6Mk2SDvxgE,413
5
+ chunkr_ai/api/chunkr.py,sha256=BzwcKNCuLfVR-HzgY8tKStsW4pIDVVjBgnEqPLyUUMM,3292
6
+ chunkr_ai/api/chunkr_base.py,sha256=FDl0Ew8eOY4hur5FFqPENZiq9YQy0G3XWEqcKPeCO-U,6130
7
+ chunkr_ai/api/configuration.py,sha256=KrXKcC1Yd7wfK8JMfihlWjNxlDyzKydr1Pe1_r1DTZw,3885
8
+ chunkr_ai/api/decorators.py,sha256=VJX4qGBIL00K2zY8bh5KAMWv7SltJ38TvPJH06FnFss,4415
9
+ chunkr_ai/api/misc.py,sha256=QN-2YWQ8e3VvvK63Ua-e8jsx6gxVxkO88Z96yWOofu0,3653
10
+ chunkr_ai/api/protocol.py,sha256=LjPrYSq52m1afIlAo0yVGXlGZxPRh8J6g7S4PAit3Zo,388
11
+ chunkr_ai/api/task_response.py,sha256=ti_2VTYtYS9FWyW-QIm16rp6qhs8RVy4vvgCZUkI2wA,6328
12
+ chunkr_ai-0.0.38.dist-info/LICENSE,sha256=w3R12yNDyZpMiy2lxy_hvNbsldC75ww79sF0u11rkho,1069
13
+ chunkr_ai-0.0.38.dist-info/METADATA,sha256=R_ZY3lS_hw4velP2QW-YKoK9UW9GZxr_y_qpblbZCYI,7031
14
+ chunkr_ai-0.0.38.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
15
+ chunkr_ai-0.0.38.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
16
+ chunkr_ai-0.0.38.dist-info/RECORD,,
@@ -1,16 +0,0 @@
1
- chunkr_ai/__init__.py,sha256=6KpYv2lmD6S5z2kc9pqwuLP5VDHmOuu2qDZArUIhb1s,53
2
- chunkr_ai/models.py,sha256=tOI7ylkhyeFfCLMisk96EPsH4UEcjBx1Mcisxc_AYXI,757
3
- chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- chunkr_ai/api/auth.py,sha256=hlv0GiUmlsbFO1wLL9sslqOnsBSoBqkL_6Mk2SDvxgE,413
5
- chunkr_ai/api/chunkr.py,sha256=VnbuAPlWLqyf8xCCU_kpdybgjVPTwZLarDQoD3uozY0,3065
6
- chunkr_ai/api/chunkr_base.py,sha256=giW56fL7xxJphdOTpIH52dXxpNt7OdP8pNiPSqbNjGM,5835
7
- chunkr_ai/api/configuration.py,sha256=2Bfw_c8eQVijb0EvsexiuRbF1pZUspYFBMuZ-ErJHvs,3835
8
- chunkr_ai/api/decorators.py,sha256=VJX4qGBIL00K2zY8bh5KAMWv7SltJ38TvPJH06FnFss,4415
9
- chunkr_ai/api/misc.py,sha256=gTL8UG_R6bunQdKSXwm_SpyIyTmLprzdX3re_X-mMto,5730
10
- chunkr_ai/api/protocol.py,sha256=LjPrYSq52m1afIlAo0yVGXlGZxPRh8J6g7S4PAit3Zo,388
11
- chunkr_ai/api/task_response.py,sha256=FC4OQUv4fltUij5OtFRlWRE9LxzRJGgBhh0olfHJBBg,6258
12
- chunkr_ai-0.0.36.dist-info/LICENSE,sha256=w3R12yNDyZpMiy2lxy_hvNbsldC75ww79sF0u11rkho,1069
13
- chunkr_ai-0.0.36.dist-info/METADATA,sha256=1hamwWrDvj0DirX84MAAbZs_yqSxmzGVR7mK7521HK0,7031
14
- chunkr_ai-0.0.36.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
15
- chunkr_ai-0.0.36.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
16
- chunkr_ai-0.0.36.dist-info/RECORD,,