chunkr-ai 0.0.8__py3-none-any.whl → 0.0.10__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
chunkr_ai/api/chunkr.py CHANGED
@@ -1,4 +1,4 @@
1
- from .base import ChunkrBase
1
+ from .chunkr_base import ChunkrBase
2
2
  from .config import Configuration
3
3
  from .task import TaskResponse
4
4
  from pathlib import Path
@@ -163,5 +163,3 @@ class Chunkr(ChunkrBase):
163
163
  headers=self._headers()
164
164
  )
165
165
  r.raise_for_status()
166
-
167
-
@@ -1,4 +1,4 @@
1
- from .base import ChunkrBase
1
+ from .chunkr_base import ChunkrBase
2
2
  from .task import TaskResponse
3
3
  from .config import Configuration
4
4
  import httpx
chunkr_ai/api/config.py CHANGED
@@ -40,7 +40,6 @@ class ChunkProcessing(BaseModel):
40
40
 
41
41
  class Property(BaseModel):
42
42
  name: str
43
- title: Optional[str] = None
44
43
  prop_type: str
45
44
  description: Optional[str] = None
46
45
  default: Optional[str] = None
chunkr_ai/api/misc.py CHANGED
@@ -1,11 +1,11 @@
1
+ from .config import Configuration, Property, JsonSchema
1
2
  import io
2
3
  import json
3
4
  from pathlib import Path
4
5
  from PIL import Image
5
6
  import requests
6
7
  from typing import Union, Tuple, BinaryIO, Optional
7
- from .config import Configuration
8
-
8
+ from pydantic import BaseModel
9
9
 
10
10
  def prepare_file(
11
11
  file: Union[str, Path, BinaryIO, Image.Image]
@@ -15,8 +15,31 @@ def prepare_file(
15
15
  if isinstance(file, str) and (file.startswith('http://') or file.startswith('https://')):
16
16
  response = requests.get(file)
17
17
  response.raise_for_status()
18
+
19
+ # Try to get filename from Content-Disposition header first
20
+ filename = None
21
+ content_disposition = response.headers.get('Content-Disposition')
22
+ if content_disposition and 'filename=' in content_disposition:
23
+ filename = content_disposition.split('filename=')[-1].strip('"\'')
24
+
25
+ # If no Content-Disposition, try to get clean filename from URL path
26
+ if not filename:
27
+ from urllib.parse import urlparse, unquote
28
+ parsed_url = urlparse(file)
29
+ path = unquote(parsed_url.path)
30
+ filename = Path(path).name if path else None
31
+
32
+ # Fallback to default name if we couldn't extract one
33
+ filename = filename or 'downloaded_file'
34
+
35
+ # Sanitize filename: remove invalid characters and limit length
36
+ import re
37
+ filename = re.sub(r'[<>:"/\\|?*%]', '_', filename) # Replace invalid chars with underscore
38
+ filename = re.sub(r'\s+', '_', filename) # Replace whitespace with underscore
39
+ filename = filename.strip('._') # Remove leading/trailing dots and underscores
40
+ filename = filename[:255] # Limit length to 255 characters
41
+
18
42
  file_obj = io.BytesIO(response.content)
19
- filename = Path(file.split('/')[-1]).name or 'downloaded_file'
20
43
  return filename, file_obj
21
44
 
22
45
  # Handle base64 strings
@@ -104,3 +127,33 @@ def prepare_upload_data(
104
127
  files[key] = (None, json.dumps(value), 'application/json')
105
128
 
106
129
  return files
130
+
131
+ def from_pydantic(pydantic: BaseModel) -> dict:
132
+ """Convert a Pydantic model to a Chunk json schema.
133
+
134
+ Args:
135
+ pydantic: A Pydantic BaseModel class or instance
136
+
137
+ Returns:
138
+ dict: A JSON schema compatible with Chunk's format
139
+ """
140
+ model = pydantic if isinstance(pydantic, type) else pydantic.__class__
141
+ schema = model.model_json_schema()
142
+ print(schema)
143
+ properties = []
144
+ for name, details in schema.get('properties', {}).items():
145
+ prop = Property(
146
+ name=name,
147
+ title=details.get('title'),
148
+ prop_type=details.get('type', 'string'),
149
+ description=details.get('description'),
150
+ default=str(details.get('default')) if details.get('default') is not None else None
151
+ )
152
+ properties.append(prop)
153
+
154
+ json_schema = JsonSchema(
155
+ title=schema.get('title', model.__name__),
156
+ properties=properties
157
+ )
158
+
159
+ return json_schema.model_dump(mode="json", exclude_none=True)
@@ -0,0 +1,111 @@
1
+ import asyncio
2
+ from pydantic import BaseModel, PrivateAttr
3
+ from datetime import datetime
4
+ from enum import Enum
5
+ from typing import Optional, Union
6
+ from .task_base import TaskBase
7
+ from .protocol import ChunkrClientProtocol
8
+ from .config import Configuration, OutputResponse
9
+ from .misc import prepare_upload_data
10
+
11
+ class Status(str, Enum):
12
+ STARTING = "Starting"
13
+ PROCESSING = "Processing"
14
+ SUCCEEDED = "Succeeded"
15
+ FAILED = "Failed"
16
+ CANCELLED = "Cancelled"
17
+
18
+ class TaskResponseAsync(BaseModel, TaskBase):
19
+ configuration: Configuration
20
+ created_at: datetime
21
+ expires_at: Optional[datetime]
22
+ file_name: Optional[str]
23
+ finished_at: Optional[datetime]
24
+ input_file_url: Optional[str]
25
+ message: str
26
+ output: Optional[OutputResponse]
27
+ page_count: Optional[int]
28
+ pdf_url: Optional[str]
29
+ started_at: Optional[datetime]
30
+ status: Status
31
+ task_id: str
32
+ task_url: Optional[str]
33
+ _client: Optional[Union[ChunkrClientProtocol]] = PrivateAttr(default=None)
34
+
35
+ def with_client(self, client: Union[ChunkrClientProtocol]) -> 'TaskResponseAsync':
36
+ self._client = client
37
+ return self
38
+
39
+ async def poll(self) -> 'TaskResponseAsync':
40
+ while True:
41
+ j = await self._poll_request()
42
+ updated = TaskResponseAsync(**j).with_client(self._client)
43
+ self.__dict__.update(updated.__dict__)
44
+ if res := self._check_status():
45
+ return res
46
+ await asyncio.sleep(0.5)
47
+
48
+ async def _poll_request(self) -> dict:
49
+ if not self.task_url:
50
+ raise ValueError("Task URL not found")
51
+ while True:
52
+ try:
53
+ r = await self._client._client.get(self.task_url, headers=self._client._headers())
54
+ r.raise_for_status()
55
+ return r.json()
56
+ except Exception as e:
57
+ if self.status == Status.FAILED:
58
+ raise ValueError(self.message) from e
59
+ await asyncio.sleep(0.5)
60
+
61
+ def _check_status(self) -> Optional['TaskResponseAsync']:
62
+ if self.status == Status.FAILED:
63
+ raise ValueError(f"Task failed: {self.message}")
64
+ if self.status == Status.CANCELLED:
65
+ return self
66
+ if self.status not in [Status.STARTING, Status.PROCESSING]:
67
+ return self
68
+ return None
69
+
70
+ async def update(self, config: Configuration) -> 'TaskResponseAsync':
71
+ if not self.task_url:
72
+ raise ValueError("Task URL not found")
73
+ f = prepare_upload_data(None, config)
74
+ r = await self._client._client.patch(self.task_url, files=f, headers=self._client._headers())
75
+ r.raise_for_status()
76
+ updated = TaskResponseAsync(**r.json()).with_client(self._client)
77
+ self.__dict__.update(updated.__dict__)
78
+ return await self.poll()
79
+
80
+ async def cancel(self):
81
+ if not self.task_url:
82
+ raise ValueError("Task URL not found")
83
+ r = await self._client._client.get(f"{self.task_url}/cancel", headers=self._client._headers())
84
+ r.raise_for_status()
85
+ return await self.poll()
86
+
87
+ async def delete(self):
88
+ r = await self._client._client.delete(self.task_url, headers=self._client._headers())
89
+ r.raise_for_status()
90
+
91
+ def html(self) -> str:
92
+ return self._get_content("html")
93
+
94
+ def markdown(self) -> str:
95
+ return self._get_content("markdown")
96
+
97
+ def content(self) -> str:
98
+ return self._get_content("content")
99
+
100
+ def _get_content(self, t: str) -> str:
101
+ if not self.output:
102
+ return ""
103
+ parts = []
104
+ for c in self.output.chunks:
105
+ for s in c.segments:
106
+ v = getattr(s, t)
107
+ if v:
108
+ parts.append(v)
109
+ return "\n".join(parts)
110
+
111
+ # Satisfying TaskBase abstract methods with stubs
@@ -0,0 +1,31 @@
1
+ from abc import ABC, abstractmethod
2
+ from .config import Configuration
3
+
4
+ class TaskBase(ABC):
5
+ @abstractmethod
6
+ def poll(self):
7
+ pass
8
+
9
+ @abstractmethod
10
+ def update(self, config: Configuration):
11
+ pass
12
+
13
+ @abstractmethod
14
+ def cancel(self):
15
+ pass
16
+
17
+ @abstractmethod
18
+ def delete(self):
19
+ pass
20
+
21
+ @abstractmethod
22
+ def html(self) -> str:
23
+ pass
24
+
25
+ @abstractmethod
26
+ def markdown(self) -> str:
27
+ pass
28
+
29
+ @abstractmethod
30
+ def content(self) -> str:
31
+ pass
@@ -1,20 +1,20 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: chunkr-ai
3
- Version: 0.0.8
3
+ Version: 0.0.10
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  Project-URL: Homepage, https://chunkr.ai
7
7
  Description-Content-Type: text/markdown
8
8
  License-File: LICENSE
9
- Requires-Dist: httpx>=0.28.1
10
- Requires-Dist: pillow>=11.1.0
11
- Requires-Dist: pydantic>=2.10.4
12
- Requires-Dist: pytest-asyncio>=0.25.2
13
- Requires-Dist: python-dotenv>=1.0.1
14
- Requires-Dist: requests>=2.32.3
9
+ Requires-Dist: httpx>=0.25.0
10
+ Requires-Dist: pillow>=10.0.0
11
+ Requires-Dist: pydantic>=2.0.0
12
+ Requires-Dist: pytest-asyncio>=0.21.0
13
+ Requires-Dist: python-dotenv>=0.19.0
14
+ Requires-Dist: requests>=2.28.0
15
15
  Provides-Extra: test
16
- Requires-Dist: pytest>=8.3.4; extra == "test"
17
- Requires-Dist: pytest-xdist>=3.6.1; extra == "test"
16
+ Requires-Dist: pytest>=7.0.0; extra == "test"
17
+ Requires-Dist: pytest-xdist>=3.0.0; extra == "test"
18
18
 
19
19
  # Chunkr Python Client
20
20
 
@@ -0,0 +1,19 @@
1
+ chunkr_ai/__init__.py,sha256=eXygrEhGxxIHXNYIlHF2eied8rGsx2RphgR8Wo4lRyo,110
2
+ chunkr_ai/main.py,sha256=_MT1lcnNiXjVW9ZkZYl28SB_f6M9g_IOgZxvhodTzAo,394
3
+ chunkr_ai/models.py,sha256=T8_F-Y1US21ZJVzLIaroqp-Hd0_ZFbdkbEOxr63-PNE,827
4
+ chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ chunkr_ai/api/auth.py,sha256=iSd5Jek2BFaHGw9HY-RrqgwP56BHFU0xbSuJS4fU6AA,425
6
+ chunkr_ai/api/chunkr.py,sha256=0qpV9b1hOpDhA9EuKkXW9X_laUmw5NY3ZYq0cUOTbww,5190
7
+ chunkr_ai/api/chunkr_async.py,sha256=ZkLBrn4cqzu3sqMfS8cfZZgSvpdyQuWZP95lfGxuHx0,4900
8
+ chunkr_ai/api/chunkr_base.py,sha256=IYO0pmoL02GchIggj6_Q5nvtAUoOvYAAvT7VLFU6scY,2506
9
+ chunkr_ai/api/config.py,sha256=eu7a28UjlNaM3QRrzElRTQXqMPBynAvlusVSIAMNXUY,4203
10
+ chunkr_ai/api/misc.py,sha256=DiY-BV5nPMDVKAiHTcND8w-8mSB-dENxrOhxnkyEoRA,6034
11
+ chunkr_ai/api/protocol.py,sha256=XKS9RmtvBpJItYhPg18qlOCKpaSHdOuQTRSUxAdUz2g,479
12
+ chunkr_ai/api/task.py,sha256=EB6RK8ms7EaNj57tNJZoNgNMHGWKXFhkQ1WC7gk5ht4,6059
13
+ chunkr_ai/api/task_async.py,sha256=Dd-Fenie0Q6GxXce7OlXvuQ14NQ58F_0b9P7AGKWyYA,3833
14
+ chunkr_ai/api/task_base.py,sha256=Tkk7dhIeB3ic5M9g_b-MVRdNv4XQTvajpaUy8JylQ8A,526
15
+ chunkr_ai-0.0.10.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
+ chunkr_ai-0.0.10.dist-info/METADATA,sha256=W8PCDpT4hN5tpn_9fyVrjEbd0abG0ReP5reG4_9Glp8,4845
17
+ chunkr_ai-0.0.10.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
18
+ chunkr_ai-0.0.10.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
19
+ chunkr_ai-0.0.10.dist-info/RECORD,,
chunkr_ai/api/api.py DELETED
File without changes
@@ -1,18 +0,0 @@
1
- chunkr_ai/__init__.py,sha256=eXygrEhGxxIHXNYIlHF2eied8rGsx2RphgR8Wo4lRyo,110
2
- chunkr_ai/main.py,sha256=_MT1lcnNiXjVW9ZkZYl28SB_f6M9g_IOgZxvhodTzAo,394
3
- chunkr_ai/models.py,sha256=T8_F-Y1US21ZJVzLIaroqp-Hd0_ZFbdkbEOxr63-PNE,827
4
- chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- chunkr_ai/api/api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- chunkr_ai/api/auth.py,sha256=iSd5Jek2BFaHGw9HY-RrqgwP56BHFU0xbSuJS4fU6AA,425
7
- chunkr_ai/api/base.py,sha256=IYO0pmoL02GchIggj6_Q5nvtAUoOvYAAvT7VLFU6scY,2506
8
- chunkr_ai/api/chunkr.py,sha256=PmrK37HbK2T1KUPitKnt4wZqIujL61Jo12qW9DEpNMI,5186
9
- chunkr_ai/api/chunkr_async.py,sha256=2yYyAO9-j2xKQYH0fJb2S6gL26hgbtL4QyqlG9l0QBY,4893
10
- chunkr_ai/api/config.py,sha256=XIqXZ_8q7U_BEmY5wyIC9mbQGZBw1956EN9yhC4svD0,4235
11
- chunkr_ai/api/misc.py,sha256=tScsUUcrqeVh_bZv1YlbmjGkQSTDQN8NyKxoNwAG6XA,3792
12
- chunkr_ai/api/protocol.py,sha256=XKS9RmtvBpJItYhPg18qlOCKpaSHdOuQTRSUxAdUz2g,479
13
- chunkr_ai/api/task.py,sha256=EB6RK8ms7EaNj57tNJZoNgNMHGWKXFhkQ1WC7gk5ht4,6059
14
- chunkr_ai-0.0.8.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
- chunkr_ai-0.0.8.dist-info/METADATA,sha256=tL3OZfFIRsgfIKoDYWAS89bZw48_0C8cdqHJ6_GrT7A,4844
16
- chunkr_ai-0.0.8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
17
- chunkr_ai-0.0.8.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
18
- chunkr_ai-0.0.8.dist-info/RECORD,,
File without changes