chunkr-ai 0.0.8__py3-none-any.whl → 0.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
chunkr_ai/api/chunkr.py CHANGED
@@ -1,4 +1,4 @@
1
- from .base import ChunkrBase
1
+ from .chunkr_base import ChunkrBase
2
2
  from .config import Configuration
3
3
  from .task import TaskResponse
4
4
  from pathlib import Path
@@ -163,5 +163,3 @@ class Chunkr(ChunkrBase):
163
163
  headers=self._headers()
164
164
  )
165
165
  r.raise_for_status()
166
-
167
-
@@ -1,4 +1,4 @@
1
- from .base import ChunkrBase
1
+ from .chunkr_base import ChunkrBase
2
2
  from .task import TaskResponse
3
3
  from .config import Configuration
4
4
  import httpx
chunkr_ai/api/config.py CHANGED
@@ -40,7 +40,6 @@ class ChunkProcessing(BaseModel):
40
40
 
41
41
  class Property(BaseModel):
42
42
  name: str
43
- title: Optional[str] = None
44
43
  prop_type: str
45
44
  description: Optional[str] = None
46
45
  default: Optional[str] = None
chunkr_ai/api/misc.py CHANGED
@@ -1,11 +1,11 @@
1
+ from .config import Configuration, Property, JsonSchema
1
2
  import io
2
3
  import json
3
4
  from pathlib import Path
4
5
  from PIL import Image
5
6
  import requests
6
7
  from typing import Union, Tuple, BinaryIO, Optional
7
- from .config import Configuration
8
-
8
+ from pydantic import BaseModel
9
9
 
10
10
  def prepare_file(
11
11
  file: Union[str, Path, BinaryIO, Image.Image]
@@ -15,8 +15,31 @@ def prepare_file(
15
15
  if isinstance(file, str) and (file.startswith('http://') or file.startswith('https://')):
16
16
  response = requests.get(file)
17
17
  response.raise_for_status()
18
+
19
+ # Try to get filename from Content-Disposition header first
20
+ filename = None
21
+ content_disposition = response.headers.get('Content-Disposition')
22
+ if content_disposition and 'filename=' in content_disposition:
23
+ filename = content_disposition.split('filename=')[-1].strip('"\'')
24
+
25
+ # If no Content-Disposition, try to get clean filename from URL path
26
+ if not filename:
27
+ from urllib.parse import urlparse, unquote
28
+ parsed_url = urlparse(file)
29
+ path = unquote(parsed_url.path)
30
+ filename = Path(path).name if path else None
31
+
32
+ # Fallback to default name if we couldn't extract one
33
+ filename = filename or 'downloaded_file'
34
+
35
+ # Sanitize filename: remove invalid characters and limit length
36
+ import re
37
+ filename = re.sub(r'[<>:"/\\|?*%]', '_', filename) # Replace invalid chars with underscore
38
+ filename = re.sub(r'\s+', '_', filename) # Replace whitespace with underscore
39
+ filename = filename.strip('._') # Remove leading/trailing dots and underscores
40
+ filename = filename[:255] # Limit length to 255 characters
41
+
18
42
  file_obj = io.BytesIO(response.content)
19
- filename = Path(file.split('/')[-1]).name or 'downloaded_file'
20
43
  return filename, file_obj
21
44
 
22
45
  # Handle base64 strings
@@ -104,3 +127,33 @@ def prepare_upload_data(
104
127
  files[key] = (None, json.dumps(value), 'application/json')
105
128
 
106
129
  return files
130
+
131
+ def from_pydantic(pydantic: BaseModel) -> dict:
132
+ """Convert a Pydantic model to a Chunk json schema.
133
+
134
+ Args:
135
+ pydantic: A Pydantic BaseModel class or instance
136
+
137
+ Returns:
138
+ dict: A JSON schema compatible with Chunk's format
139
+ """
140
+ model = pydantic if isinstance(pydantic, type) else pydantic.__class__
141
+ schema = model.model_json_schema()
142
+ print(schema)
143
+ properties = []
144
+ for name, details in schema.get('properties', {}).items():
145
+ prop = Property(
146
+ name=name,
147
+ title=details.get('title'),
148
+ prop_type=details.get('type', 'string'),
149
+ description=details.get('description'),
150
+ default=str(details.get('default')) if details.get('default') is not None else None
151
+ )
152
+ properties.append(prop)
153
+
154
+ json_schema = JsonSchema(
155
+ title=schema.get('title', model.__name__),
156
+ properties=properties
157
+ )
158
+
159
+ return json_schema.model_dump(mode="json", exclude_none=True)
@@ -0,0 +1,111 @@
1
+ import asyncio
2
+ from pydantic import BaseModel, PrivateAttr
3
+ from datetime import datetime
4
+ from enum import Enum
5
+ from typing import Optional, Union
6
+ from .task_base import TaskBase
7
+ from .protocol import ChunkrClientProtocol
8
+ from .config import Configuration, OutputResponse
9
+ from .misc import prepare_upload_data
10
+
11
+ class Status(str, Enum):
12
+ STARTING = "Starting"
13
+ PROCESSING = "Processing"
14
+ SUCCEEDED = "Succeeded"
15
+ FAILED = "Failed"
16
+ CANCELLED = "Cancelled"
17
+
18
+ class TaskResponseAsync(BaseModel, TaskBase):
19
+ configuration: Configuration
20
+ created_at: datetime
21
+ expires_at: Optional[datetime]
22
+ file_name: Optional[str]
23
+ finished_at: Optional[datetime]
24
+ input_file_url: Optional[str]
25
+ message: str
26
+ output: Optional[OutputResponse]
27
+ page_count: Optional[int]
28
+ pdf_url: Optional[str]
29
+ started_at: Optional[datetime]
30
+ status: Status
31
+ task_id: str
32
+ task_url: Optional[str]
33
+ _client: Optional[Union[ChunkrClientProtocol]] = PrivateAttr(default=None)
34
+
35
+ def with_client(self, client: Union[ChunkrClientProtocol]) -> 'TaskResponseAsync':
36
+ self._client = client
37
+ return self
38
+
39
+ async def poll(self) -> 'TaskResponseAsync':
40
+ while True:
41
+ j = await self._poll_request()
42
+ updated = TaskResponseAsync(**j).with_client(self._client)
43
+ self.__dict__.update(updated.__dict__)
44
+ if res := self._check_status():
45
+ return res
46
+ await asyncio.sleep(0.5)
47
+
48
+ async def _poll_request(self) -> dict:
49
+ if not self.task_url:
50
+ raise ValueError("Task URL not found")
51
+ while True:
52
+ try:
53
+ r = await self._client._client.get(self.task_url, headers=self._client._headers())
54
+ r.raise_for_status()
55
+ return r.json()
56
+ except Exception as e:
57
+ if self.status == Status.FAILED:
58
+ raise ValueError(self.message) from e
59
+ await asyncio.sleep(0.5)
60
+
61
+ def _check_status(self) -> Optional['TaskResponseAsync']:
62
+ if self.status == Status.FAILED:
63
+ raise ValueError(f"Task failed: {self.message}")
64
+ if self.status == Status.CANCELLED:
65
+ return self
66
+ if self.status not in [Status.STARTING, Status.PROCESSING]:
67
+ return self
68
+ return None
69
+
70
+ async def update(self, config: Configuration) -> 'TaskResponseAsync':
71
+ if not self.task_url:
72
+ raise ValueError("Task URL not found")
73
+ f = prepare_upload_data(None, config)
74
+ r = await self._client._client.patch(self.task_url, files=f, headers=self._client._headers())
75
+ r.raise_for_status()
76
+ updated = TaskResponseAsync(**r.json()).with_client(self._client)
77
+ self.__dict__.update(updated.__dict__)
78
+ return await self.poll()
79
+
80
+ async def cancel(self):
81
+ if not self.task_url:
82
+ raise ValueError("Task URL not found")
83
+ r = await self._client._client.get(f"{self.task_url}/cancel", headers=self._client._headers())
84
+ r.raise_for_status()
85
+ return await self.poll()
86
+
87
+ async def delete(self):
88
+ r = await self._client._client.delete(self.task_url, headers=self._client._headers())
89
+ r.raise_for_status()
90
+
91
+ def html(self) -> str:
92
+ return self._get_content("html")
93
+
94
+ def markdown(self) -> str:
95
+ return self._get_content("markdown")
96
+
97
+ def content(self) -> str:
98
+ return self._get_content("content")
99
+
100
+ def _get_content(self, t: str) -> str:
101
+ if not self.output:
102
+ return ""
103
+ parts = []
104
+ for c in self.output.chunks:
105
+ for s in c.segments:
106
+ v = getattr(s, t)
107
+ if v:
108
+ parts.append(v)
109
+ return "\n".join(parts)
110
+
111
+ # Satisfying TaskBase abstract methods with stubs
@@ -0,0 +1,31 @@
1
+ from abc import ABC, abstractmethod
2
+ from .config import Configuration
3
+
4
+ class TaskBase(ABC):
5
+ @abstractmethod
6
+ def poll(self):
7
+ pass
8
+
9
+ @abstractmethod
10
+ def update(self, config: Configuration):
11
+ pass
12
+
13
+ @abstractmethod
14
+ def cancel(self):
15
+ pass
16
+
17
+ @abstractmethod
18
+ def delete(self):
19
+ pass
20
+
21
+ @abstractmethod
22
+ def html(self) -> str:
23
+ pass
24
+
25
+ @abstractmethod
26
+ def markdown(self) -> str:
27
+ pass
28
+
29
+ @abstractmethod
30
+ def content(self) -> str:
31
+ pass
@@ -1,20 +1,20 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: chunkr-ai
3
- Version: 0.0.8
3
+ Version: 0.0.10
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  Project-URL: Homepage, https://chunkr.ai
7
7
  Description-Content-Type: text/markdown
8
8
  License-File: LICENSE
9
- Requires-Dist: httpx>=0.28.1
10
- Requires-Dist: pillow>=11.1.0
11
- Requires-Dist: pydantic>=2.10.4
12
- Requires-Dist: pytest-asyncio>=0.25.2
13
- Requires-Dist: python-dotenv>=1.0.1
14
- Requires-Dist: requests>=2.32.3
9
+ Requires-Dist: httpx>=0.25.0
10
+ Requires-Dist: pillow>=10.0.0
11
+ Requires-Dist: pydantic>=2.0.0
12
+ Requires-Dist: pytest-asyncio>=0.21.0
13
+ Requires-Dist: python-dotenv>=0.19.0
14
+ Requires-Dist: requests>=2.28.0
15
15
  Provides-Extra: test
16
- Requires-Dist: pytest>=8.3.4; extra == "test"
17
- Requires-Dist: pytest-xdist>=3.6.1; extra == "test"
16
+ Requires-Dist: pytest>=7.0.0; extra == "test"
17
+ Requires-Dist: pytest-xdist>=3.0.0; extra == "test"
18
18
 
19
19
  # Chunkr Python Client
20
20
 
@@ -0,0 +1,19 @@
1
+ chunkr_ai/__init__.py,sha256=eXygrEhGxxIHXNYIlHF2eied8rGsx2RphgR8Wo4lRyo,110
2
+ chunkr_ai/main.py,sha256=_MT1lcnNiXjVW9ZkZYl28SB_f6M9g_IOgZxvhodTzAo,394
3
+ chunkr_ai/models.py,sha256=T8_F-Y1US21ZJVzLIaroqp-Hd0_ZFbdkbEOxr63-PNE,827
4
+ chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ chunkr_ai/api/auth.py,sha256=iSd5Jek2BFaHGw9HY-RrqgwP56BHFU0xbSuJS4fU6AA,425
6
+ chunkr_ai/api/chunkr.py,sha256=0qpV9b1hOpDhA9EuKkXW9X_laUmw5NY3ZYq0cUOTbww,5190
7
+ chunkr_ai/api/chunkr_async.py,sha256=ZkLBrn4cqzu3sqMfS8cfZZgSvpdyQuWZP95lfGxuHx0,4900
8
+ chunkr_ai/api/chunkr_base.py,sha256=IYO0pmoL02GchIggj6_Q5nvtAUoOvYAAvT7VLFU6scY,2506
9
+ chunkr_ai/api/config.py,sha256=eu7a28UjlNaM3QRrzElRTQXqMPBynAvlusVSIAMNXUY,4203
10
+ chunkr_ai/api/misc.py,sha256=DiY-BV5nPMDVKAiHTcND8w-8mSB-dENxrOhxnkyEoRA,6034
11
+ chunkr_ai/api/protocol.py,sha256=XKS9RmtvBpJItYhPg18qlOCKpaSHdOuQTRSUxAdUz2g,479
12
+ chunkr_ai/api/task.py,sha256=EB6RK8ms7EaNj57tNJZoNgNMHGWKXFhkQ1WC7gk5ht4,6059
13
+ chunkr_ai/api/task_async.py,sha256=Dd-Fenie0Q6GxXce7OlXvuQ14NQ58F_0b9P7AGKWyYA,3833
14
+ chunkr_ai/api/task_base.py,sha256=Tkk7dhIeB3ic5M9g_b-MVRdNv4XQTvajpaUy8JylQ8A,526
15
+ chunkr_ai-0.0.10.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
+ chunkr_ai-0.0.10.dist-info/METADATA,sha256=W8PCDpT4hN5tpn_9fyVrjEbd0abG0ReP5reG4_9Glp8,4845
17
+ chunkr_ai-0.0.10.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
18
+ chunkr_ai-0.0.10.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
19
+ chunkr_ai-0.0.10.dist-info/RECORD,,
chunkr_ai/api/api.py DELETED
File without changes
@@ -1,18 +0,0 @@
1
- chunkr_ai/__init__.py,sha256=eXygrEhGxxIHXNYIlHF2eied8rGsx2RphgR8Wo4lRyo,110
2
- chunkr_ai/main.py,sha256=_MT1lcnNiXjVW9ZkZYl28SB_f6M9g_IOgZxvhodTzAo,394
3
- chunkr_ai/models.py,sha256=T8_F-Y1US21ZJVzLIaroqp-Hd0_ZFbdkbEOxr63-PNE,827
4
- chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- chunkr_ai/api/api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- chunkr_ai/api/auth.py,sha256=iSd5Jek2BFaHGw9HY-RrqgwP56BHFU0xbSuJS4fU6AA,425
7
- chunkr_ai/api/base.py,sha256=IYO0pmoL02GchIggj6_Q5nvtAUoOvYAAvT7VLFU6scY,2506
8
- chunkr_ai/api/chunkr.py,sha256=PmrK37HbK2T1KUPitKnt4wZqIujL61Jo12qW9DEpNMI,5186
9
- chunkr_ai/api/chunkr_async.py,sha256=2yYyAO9-j2xKQYH0fJb2S6gL26hgbtL4QyqlG9l0QBY,4893
10
- chunkr_ai/api/config.py,sha256=XIqXZ_8q7U_BEmY5wyIC9mbQGZBw1956EN9yhC4svD0,4235
11
- chunkr_ai/api/misc.py,sha256=tScsUUcrqeVh_bZv1YlbmjGkQSTDQN8NyKxoNwAG6XA,3792
12
- chunkr_ai/api/protocol.py,sha256=XKS9RmtvBpJItYhPg18qlOCKpaSHdOuQTRSUxAdUz2g,479
13
- chunkr_ai/api/task.py,sha256=EB6RK8ms7EaNj57tNJZoNgNMHGWKXFhkQ1WC7gk5ht4,6059
14
- chunkr_ai-0.0.8.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
- chunkr_ai-0.0.8.dist-info/METADATA,sha256=tL3OZfFIRsgfIKoDYWAS89bZw48_0C8cdqHJ6_GrT7A,4844
16
- chunkr_ai-0.0.8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
17
- chunkr_ai-0.0.8.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
18
- chunkr_ai-0.0.8.dist-info/RECORD,,
File without changes