chunkr-ai 0.0.17__py3-none-any.whl → 0.0.19__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- chunkr_ai/__init__.py +1 -2
- chunkr_ai/api/chunkr.py +46 -39
- chunkr_ai/api/chunkr_base.py +142 -8
- chunkr_ai/api/config.py +18 -45
- chunkr_ai/api/decorators.py +58 -0
- chunkr_ai/api/misc.py +0 -2
- chunkr_ai/api/protocol.py +0 -2
- chunkr_ai/api/task_response.py +119 -0
- chunkr_ai/models.py +3 -12
- {chunkr_ai-0.0.17.dist-info → chunkr_ai-0.0.19.dist-info}/METADATA +89 -40
- chunkr_ai-0.0.19.dist-info/RECORD +17 -0
- chunkr_ai/api/base.py +0 -183
- chunkr_ai/api/chunkr_async.py +0 -120
- chunkr_ai/api/schema.py +0 -136
- chunkr_ai/api/task.py +0 -66
- chunkr_ai/api/task_async.py +0 -69
- chunkr_ai/api/task_base.py +0 -85
- chunkr_ai-0.0.17.dist-info/RECORD +0 -21
- {chunkr_ai-0.0.17.dist-info → chunkr_ai-0.0.19.dist-info}/LICENSE +0 -0
- {chunkr_ai-0.0.17.dist-info → chunkr_ai-0.0.19.dist-info}/WHEEL +0 -0
- {chunkr_ai-0.0.17.dist-info → chunkr_ai-0.0.19.dist-info}/top_level.txt +0 -0
chunkr_ai/api/schema.py
DELETED
@@ -1,136 +0,0 @@
|
|
1
|
-
from pydantic import BaseModel
|
2
|
-
from typing import Optional, List, Union, Type
|
3
|
-
import json
|
4
|
-
|
5
|
-
|
6
|
-
class Property(BaseModel):
    """One entry of a ``JsonSchema``: a named, typed field with optional metadata."""

    # Field name as it appears in the source model's JSON schema.
    name: str
    # Type label for the field (e.g. "list", "object", or a JSON-schema
    # primitive type name).
    prop_type: str
    # Human-readable description; omitted from dumps that exclude None.
    description: Optional[str] = None
    # Default value rendered as a string, or None when there is no default.
    default: Optional[str] = None
|
11
|
-
|
12
|
-
|
13
|
-
class JsonSchema(BaseModel):
    """A titled, flat list of ``Property`` entries describing a model."""

    # Schema title.
    title: str
    # Top-level properties, in declaration order.
    properties: List[Property]
|
16
|
-
|
17
|
-
|
18
|
-
def from_pydantic(
    pydantic: Union[BaseModel, Type[BaseModel]], current_depth: int = 0
) -> dict:
    """Convert a Pydantic model to a Chunkr JSON schema.

    The model's own JSON schema (``model_json_schema()``) is flattened into a
    ``JsonSchema`` holding one ``Property`` per top-level field:

    * array fields become ``prop_type="list"``; when the item type is an
      object, its nested schema is appended to the description as JSON;
    * object / ``$ref`` fields become ``prop_type="object"``; their nested
      schema is likewise appended to the description as JSON;
    * everything else keeps its JSON-schema type (``"string"`` when the
      schema gives none) and carries its default, rendered with ``str()``.

    Enum values, whether inline or behind a ``$ref``, are listed in the
    description under "Allowed values:".

    Args:
        pydantic: A Pydantic model class or an instance of one.
        current_depth: Starting depth for nested-object expansion; expansion
            stops once the depth reaches 5.

    Returns:
        The ``JsonSchema`` dumped with ``mode="json"`` and ``None`` values
        excluded.
    """
    MAX_DEPTH = 5
    model = pydantic if isinstance(pydantic, type) else pydantic.__class__
    schema = model.model_json_schema()
    definitions = schema.get("$defs", {})

    def resolve_ref(ref: str) -> dict:
        """Resolve a local ``#/$defs/<name>`` reference; ``{}`` if unknown."""
        prefix = "#/$defs/"
        if not ref.startswith(prefix):
            return {}
        return definitions.get(ref[len(prefix):], {})

    def resolve(details: dict) -> dict:
        """Return the referenced schema if *details* is a ``$ref``, else *details*."""
        if "$ref" in details:
            return resolve_ref(details["$ref"])
        return details

    def get_enum_description(details: dict) -> str:
        """Return the description, plus the allowed enum values if any."""
        description = details.get("description", "")

        # A direct enum wins; otherwise look behind a $ref for one.
        if "enum" in details:
            enum_values = details["enum"]
        elif "$ref" in details and "enum" in resolve_ref(details["$ref"]):
            enum_values = resolve_ref(details["$ref"])["enum"]
        else:
            return description

        enum_str = "\nAllowed values:\n" + "\n".join(
            f"- {val}" for val in enum_values
        )
        return f"{description}{enum_str}"

    def get_nested_schema(field_schema: dict, depth: int) -> dict:
        """Expand an object schema into ``{name: {type, description, ...}}``.

        Returns ``{}`` for non-object schemas and once *depth* reaches
        ``MAX_DEPTH`` (this bounds recursion on self-referencing models).
        """
        if depth >= MAX_DEPTH:
            return {}

        field_schema = resolve(field_schema)
        if field_schema.get("type") != "object":
            return {}

        nested_props = {}
        for name, details in field_schema.get("properties", {}).items():
            if details.get("type") == "object" or "$ref" in details:
                nested_props[name] = {
                    "type": "object",
                    "description": get_enum_description(details),
                    "properties": get_nested_schema(resolve(details), depth + 1),
                }
            else:
                nested_props[name] = {
                    "type": details.get("type", "string"),
                    "description": get_enum_description(details),
                }
        return nested_props

    properties = []
    for name, details in schema.get("properties", {}).items():
        description = get_enum_description(details)

        if details.get("type") == "array":
            # Arrays: describe the item schema inside the description text.
            item_schema = get_nested_schema(
                resolve(details.get("items", {})), current_depth
            )
            if item_schema:
                description = f"{description}\nList items schema:\n{json.dumps(item_schema, indent=2)}"
            prop = Property(name=name, prop_type="list", description=description)

        elif details.get("type") == "object" or "$ref" in details:
            # Objects and references. Property has no `properties` field, so
            # passing the nested schema as a keyword argument would be
            # silently dropped; carry it in the description instead, the same
            # way array item schemas are carried.
            # NOTE(review): a $ref that points at an enum also lands here and
            # is typed "object" -- confirm that is what the API expects.
            nested_schema = get_nested_schema(resolve(details), current_depth)
            if nested_schema:
                description = f"{description}\nObject schema:\n{json.dumps(nested_schema, indent=2)}"
            prop = Property(name=name, prop_type="object", description=description)

        else:
            # Primitive types.
            default = details.get("default")
            prop = Property(
                name=name,
                prop_type=details.get("type", "string"),
                description=description,
                default=str(default) if default is not None else None,
            )

        properties.append(prop)

    json_schema = JsonSchema(
        title=schema.get("title", model.__name__), properties=properties
    )

    return json_schema.model_dump(mode="json", exclude_none=True)
|
chunkr_ai/api/task.py
DELETED
@@ -1,66 +0,0 @@
|
|
1
|
-
from .config import Configuration
|
2
|
-
from .misc import prepare_upload_data
|
3
|
-
from .task_base import TaskBase
|
4
|
-
import time
|
5
|
-
|
6
|
-
|
7
|
-
class TaskResponse(TaskBase):
    """Synchronous task handle: poll, update, cancel and delete over HTTP.

    Every request is sent through ``self._client._session`` with the headers
    returned by ``self._client._headers()``.
    """

    def _poll_request(self) -> dict:
        """Fetch the task's current state and return the decoded JSON body.

        Retries every 0.5 s for as long as the request raises the built-in
        ``ConnectionError`` or ``TimeoutError``; any other exception
        (including HTTP errors from ``raise_for_status``) propagates.

        Raises:
            ValueError: If ``task_url`` or the client session is missing.
        """
        while True:
            try:
                if not self.task_url:
                    raise ValueError("Task URL not found in response")
                if not self._client._session:
                    raise ValueError("Client session not found")
                r = self._client._session.get(
                    self.task_url, headers=self._client._headers()
                )
                r.raise_for_status()
                return r.json()
            except (ConnectionError, TimeoutError):
                # NOTE(review): these are the built-in exception classes;
                # confirm the session's HTTP library actually raises them
                # (or subclasses of them) on network failures.
                print("Connection error while polling the task, retrying...")
                time.sleep(0.5)

    def poll(self) -> "TaskResponse":
        """Poll every 0.5 s until the task leaves its in-progress states.

        Each response replaces this instance's fields in place.

        Returns:
            This instance, once ``_check_status`` reports completion.
        """
        while True:
            response = self._poll_request()
            updated_task = TaskResponse(**response).with_client(self._client)
            # Refresh in place so existing references see the new state.
            self.__dict__.update(updated_task.__dict__)
            if result := self._check_status():
                return result
            time.sleep(0.5)

    def update(self, config: Configuration) -> "TaskResponse":
        """PATCH the task with a new configuration, then poll to completion.

        Raises:
            ValueError: If ``task_url`` or the client session is missing.
        """
        if not self.task_url:
            raise ValueError("Task URL not found")
        if not self._client._session:
            raise ValueError("Client session not found")
        files = prepare_upload_data(None, config)
        r = self._client._session.patch(
            self.task_url, files=files, headers=self._client._headers()
        )
        r.raise_for_status()
        updated = TaskResponse(**r.json()).with_client(self._client)
        self.__dict__.update(updated.__dict__)
        return self.poll()

    def cancel(self) -> "TaskResponse":
        """Request cancellation, then poll until the task settles.

        Returns:
            This instance after polling (matches ``TaskResponseAsync.cancel``;
            the result of ``poll()`` used to be discarded here).

        Raises:
            ValueError: If ``task_url`` or the client session is missing.
        """
        if not self.task_url:
            raise ValueError("Task URL not found")
        if not self._client._session:
            raise ValueError("Client session not found")
        r = self._client._session.get(
            f"{self.task_url}/cancel", headers=self._client._headers()
        )
        r.raise_for_status()
        return self.poll()

    def delete(self):
        """Send a DELETE request for the task.

        Raises:
            ValueError: If ``task_url`` or the client session is missing.
        """
        if not self.task_url:
            raise ValueError("Task URL not found")
        if not self._client._session:
            raise ValueError("Client session not found")
        r = self._client._session.delete(self.task_url, headers=self._client._headers())
        r.raise_for_status()
|
chunkr_ai/api/task_async.py
DELETED
@@ -1,69 +0,0 @@
|
|
1
|
-
from .config import Configuration
|
2
|
-
from .misc import prepare_upload_data
|
3
|
-
from .task_base import TaskBase
|
4
|
-
import asyncio
|
5
|
-
|
6
|
-
|
7
|
-
class TaskResponseAsync(TaskBase):
    """Asynchronous task handle: poll, update, cancel and delete over HTTP.

    Every request is awaited on ``self._client._client`` with the headers
    returned by ``self._client._headers()``.
    """

    async def _poll_request(self) -> dict:
        """Fetch the task's current state and return the decoded JSON body.

        Retries every 0.5 s for as long as the request raises the built-in
        ``ConnectionError`` or ``TimeoutError``; any other exception
        (including HTTP errors from ``raise_for_status``) propagates.

        Raises:
            ValueError: If the underlying client is missing.
        """
        # The loop is what makes "retrying..." true: without it a connection
        # error fell through and returned None, which then crashed poll()
        # when it unpacked the result with ``**``.
        while True:
            try:
                if not self._client._client:
                    raise ValueError("Client not found")
                r = await self._client._client.get(
                    self.task_url, headers=self._client._headers()
                )
                r.raise_for_status()
                return r.json()
            except (ConnectionError, TimeoutError):
                # NOTE(review): these are the built-in exception classes;
                # confirm the HTTP client actually raises them (or
                # subclasses of them) on network failures.
                print("Connection error while polling the task, retrying...")
                await asyncio.sleep(0.5)

    async def poll(self) -> "TaskResponseAsync":
        """Poll every 0.5 s until the task leaves its in-progress states.

        Each response replaces this instance's fields in place.

        Returns:
            This instance, once ``_check_status`` reports completion.

        Raises:
            ValueError: If ``task_url`` or the underlying client is missing.
        """
        if not self.task_url:
            raise ValueError("Task URL not found")
        if not self._client._client:
            raise ValueError("Client not found")
        while True:
            j = await self._poll_request()
            updated = TaskResponseAsync(**j).with_client(self._client)
            # Refresh in place so existing references see the new state.
            self.__dict__.update(updated.__dict__)
            if res := self._check_status():
                return res
            await asyncio.sleep(0.5)

    async def update(self, config: Configuration) -> "TaskResponseAsync":
        """PATCH the task with a new configuration, then poll to completion.

        Raises:
            ValueError: If ``task_url`` or the underlying client is missing.
        """
        if not self.task_url:
            raise ValueError("Task URL not found")
        if not self._client._client:
            raise ValueError("Client not found")
        f = prepare_upload_data(None, config)
        r = await self._client._client.patch(
            self.task_url, files=f, headers=self._client._headers()
        )
        r.raise_for_status()
        updated = TaskResponseAsync(**r.json()).with_client(self._client)
        self.__dict__.update(updated.__dict__)
        return await self.poll()

    async def cancel(self):
        """Request cancellation, then poll until the task settles.

        Returns:
            This instance after polling.

        Raises:
            ValueError: If ``task_url`` or the underlying client is missing.
        """
        if not self.task_url:
            raise ValueError("Task URL not found")
        if not self._client._client:
            raise ValueError("Client not found")
        r = await self._client._client.get(
            f"{self.task_url}/cancel", headers=self._client._headers()
        )
        r.raise_for_status()
        return await self.poll()

    async def delete(self):
        """Send a DELETE request for the task.

        Raises:
            ValueError: If ``task_url`` or the underlying client is missing.
        """
        if not self.task_url:
            raise ValueError("Task URL not found")
        if not self._client._client:
            raise ValueError("Client not found")
        r = await self._client._client.delete(
            self.task_url, headers=self._client._headers()
        )
        r.raise_for_status()
|
chunkr_ai/api/task_base.py
DELETED
@@ -1,85 +0,0 @@
|
|
1
|
-
from .config import Configuration, Status, OutputResponse
|
2
|
-
from .protocol import ChunkrClientProtocol
|
3
|
-
from abc import ABC, abstractmethod
|
4
|
-
from typing import TypeVar, Optional, Generic
|
5
|
-
from pydantic import BaseModel, PrivateAttr
|
6
|
-
from datetime import datetime
|
7
|
-
|
8
|
-
# Type variable bound to TaskBase; used below to annotate methods that return
# a TaskBase (sub)class instance.
T = TypeVar("T", bound="TaskBase")
|
9
|
-
|
10
|
-
class TaskBase(BaseModel, ABC, Generic[T]):
    """Abstract task model shared by the concrete task-response classes.

    Holds the task's data fields plus a private client reference, declares the
    network operations subclasses must implement, and provides helpers for
    status checking and for concatenating segment content.
    """

    # Task data fields (presumably mirroring the task JSON returned by the
    # API -- verify against the server schema).
    configuration: Configuration
    created_at: datetime
    expires_at: Optional[datetime]
    file_name: Optional[str]
    finished_at: Optional[datetime]
    input_file_url: Optional[str]
    message: str
    output: Optional[OutputResponse]
    page_count: Optional[int]
    pdf_url: Optional[str]
    started_at: Optional[datetime]
    status: Status
    task_id: str
    task_url: Optional[str]
    # Client used by subclasses to issue requests; attached via with_client().
    _client: Optional[ChunkrClientProtocol] = PrivateAttr(default=None)

    @abstractmethod
    def _poll_request(self) -> dict:
        """Helper method to make polling request with retry logic (synchronous)"""

    @abstractmethod
    def poll(self) -> T:
        """Poll the task for completion."""

    @abstractmethod
    def update(self, config: Configuration) -> T:
        """Update the task configuration."""

    @abstractmethod
    def cancel(self) -> T:
        """Cancel the task."""

    @abstractmethod
    def delete(self) -> T:
        """Delete the task."""

    def with_client(self, client: ChunkrClientProtocol) -> T:
        """Attach *client* to this task and return the task itself."""
        self._client = client
        return self

    def _check_status(self) -> Optional[T]:
        """Return self when the task is finished, None while it is running.

        Raises:
            ValueError: With the task's message when the status is "Failed".
        """
        if self.status == "Failed":
            raise ValueError(self.message)
        still_running = self.status in ("Starting", "Processing")
        return None if still_running else self

    def html(self) -> str:
        """Return the HTML of every segment, joined by newlines."""
        return self._get_content("html")

    def markdown(self) -> str:
        """Return the markdown of every segment, joined by newlines."""
        return self._get_content("markdown")

    def content(self) -> str:
        """Return the content of every segment, joined by newlines."""
        return self._get_content("content")

    def _get_content(self, t: str) -> str:
        """Join attribute *t* of all segments in all output chunks.

        Segments whose attribute is falsy are skipped; returns "" when the
        task has no output.
        """
        if not self.output:
            return ""
        values = (
            getattr(segment, t)
            for chunk in self.output.chunks
            for segment in chunk.segments
        )
        return "\n".join(value for value in values if value)
|
@@ -1,21 +0,0 @@
|
|
1
|
-
chunkr_ai/__init__.py,sha256=q5YosvCNXPNGjV10pZY1gcvdosqUh38nVQTQA9g8EuM,110
|
2
|
-
chunkr_ai/models.py,sha256=hahbtxtTyzE_ygFgmlZwbfM6Vj2k5uSDEP02psxDOSQ,924
|
3
|
-
chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
|
-
chunkr_ai/api/api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
-
chunkr_ai/api/auth.py,sha256=hlv0GiUmlsbFO1wLL9sslqOnsBSoBqkL_6Mk2SDvxgE,413
|
6
|
-
chunkr_ai/api/base.py,sha256=QvHl8FInKHYKPLWDeEPpCchB1uktzOwTW7iPnyXccUc,6449
|
7
|
-
chunkr_ai/api/chunkr.py,sha256=0extAWVeZtI7B-g14smTfFZD_csdJNCcVNXx2_L69OQ,2617
|
8
|
-
chunkr_ai/api/chunkr_async.py,sha256=aa0s_tnYoujHBsfe8uLiPpVEnb2l9A3CXwPP34w9Mk8,4127
|
9
|
-
chunkr_ai/api/chunkr_base.py,sha256=k34Dyt1f21NBWZvZJ3w6Svvpg4SKnzr2ldGQ4ib96Wc,4951
|
10
|
-
chunkr_ai/api/config.py,sha256=TWl0Az6acKQCS1LIpKD4qr_lQ_63wqQ5M6calpLOlDM,5040
|
11
|
-
chunkr_ai/api/misc.py,sha256=bQpURc7soT5GL2ZpY7EiYyvPYWEzDM9qaX-UHa-oFeI,4909
|
12
|
-
chunkr_ai/api/protocol.py,sha256=lxIR_qoCA2a1OXjpq3LrWMdS0jRHct1bEmBlUzV8gvE,526
|
13
|
-
chunkr_ai/api/schema.py,sha256=yYesvueGgtmRa7Fi_Tpdv8A2bzHlx-B-5DxRAPlaDHo,4926
|
14
|
-
chunkr_ai/api/task.py,sha256=28J4dR8BDjvtkh3CQjW_YUEkgPXhCHBGu0wH6AQKKuE,2474
|
15
|
-
chunkr_ai/api/task_async.py,sha256=K5hTEOnmD42snPZg_JtJsVWg6QBUFZ1aBz1Abwv58-A,2529
|
16
|
-
chunkr_ai/api/task_base.py,sha256=KLiMhvvbCgcilguQKrtEPMlNs8oaatfQUtn8pYt9t6g,2467
|
17
|
-
chunkr_ai-0.0.17.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
|
-
chunkr_ai-0.0.17.dist-info/METADATA,sha256=giy1xeKYXk18W5U-baNoAAlvXciJldhA_EBi87NqKpA,4839
|
19
|
-
chunkr_ai-0.0.17.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
20
|
-
chunkr_ai-0.0.17.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
|
21
|
-
chunkr_ai-0.0.17.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|