chunkr-ai 0.0.10__tar.gz → 0.0.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {chunkr_ai-0.0.10/src/chunkr_ai.egg-info → chunkr_ai-0.0.12}/PKG-INFO +2 -1
- {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/pyproject.toml +2 -1
- {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai/api/config.py +17 -2
- {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai/api/misc.py +1 -32
- chunkr_ai-0.0.12/src/chunkr_ai/api/schema.py +128 -0
- chunkr_ai-0.0.12/src/chunkr_ai/api/task.py +61 -0
- chunkr_ai-0.0.12/src/chunkr_ai/api/task_async.py +50 -0
- chunkr_ai-0.0.12/src/chunkr_ai/api/task_base.py +83 -0
- {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai/models.py +5 -2
- {chunkr_ai-0.0.10 → chunkr_ai-0.0.12/src/chunkr_ai.egg-info}/PKG-INFO +2 -1
- {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai.egg-info/SOURCES.txt +1 -1
- {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai.egg-info/requires.txt +1 -0
- chunkr_ai-0.0.10/src/chunkr_ai/api/task.py +0 -176
- chunkr_ai-0.0.10/src/chunkr_ai/api/task_async.py +0 -111
- chunkr_ai-0.0.10/src/chunkr_ai/api/task_base.py +0 -31
- chunkr_ai-0.0.10/src/chunkr_ai/main.py +0 -12
- {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/LICENSE +0 -0
- {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/README.md +0 -0
- {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/setup.cfg +0 -0
- {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai/__init__.py +0 -0
- {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai/api/__init__.py +0 -0
- {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai/api/auth.py +0 -0
- {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai/api/chunkr.py +0 -0
- {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai/api/chunkr_async.py +0 -0
- {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai/api/chunkr_base.py +0 -0
- {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai/api/protocol.py +0 -0
- {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
- {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai.egg-info/top_level.txt +0 -0
- {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/tests/test_chunkr.py +0 -0

{chunkr_ai-0.0.10/src/chunkr_ai.egg-info → chunkr_ai-0.0.12}/PKG-INFO
@@ -1,12 +1,13 @@
 Metadata-Version: 2.2
 Name: chunkr-ai
-Version: 0.0.10
+Version: 0.0.12
 Summary: Python client for Chunkr: open source document intelligence
 Author-email: Ishaan Kapoor <ishaan@lumina.sh>
 Project-URL: Homepage, https://chunkr.ai
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: httpx>=0.25.0
+Requires-Dist: httpx>=0.25.0
 Requires-Dist: pillow>=10.0.0
 Requires-Dist: pydantic>=2.0.0
 Requires-Dist: pytest-asyncio>=0.21.0

{chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/pyproject.toml
@@ -4,13 +4,14 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "chunkr-ai"
-version = "0.0.10"
+version = "0.0.12"
 authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
 description = "Python client for Chunkr: open source document intelligence"
 readme = "README.md"
 license = {"file" = "LICENSE"}
 urls = {Homepage = "https://chunkr.ai"}
 dependencies = [
+    "httpx>=0.25.0",
     "httpx>=0.25.0",
     "pillow>=10.0.0",
     "pydantic>=2.0.0",

{chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai/api/config.py
@@ -1,6 +1,7 @@
 from pydantic import BaseModel, Field, model_validator, ConfigDict
 from enum import Enum
-from typing import Optional, List, Dict
+from typing import Optional, List, Dict, Union, Type
+from .schema import from_pydantic
 
 class GenerationStrategy(str, Enum):
     LLM = "LLM"
@@ -114,7 +115,7 @@ class Configuration(BaseModel):
     chunk_processing: Optional[ChunkProcessing] = Field(default=None)
     expires_in: Optional[int] = Field(default=None)
     high_resolution: Optional[bool] = Field(default=None)
-    json_schema: Optional[JsonSchema] = Field(default=None)
+    json_schema: Optional[Union[JsonSchema, Type[BaseModel], BaseModel]] = Field(default=None)
     model: Optional[Model] = Field(default=None)
     ocr_strategy: Optional[OcrStrategy] = Field(default=None)
     segment_processing: Optional[SegmentProcessing] = Field(default=None)
@@ -128,3 +129,17 @@ class Configuration(BaseModel):
         values["chunk_processing"] = values.get("chunk_processing", {}) or {}
         values["chunk_processing"]["target_length"] = target_length
         return values
+
+    @model_validator(mode='after')
+    def convert_json_schema(self) -> 'Configuration':
+        if self.json_schema is not None and not isinstance(self.json_schema, JsonSchema):
+            if isinstance(self.json_schema, (BaseModel, type)) and issubclass(getattr(self.json_schema, '__class__', type), BaseModel):
+                self.json_schema = JsonSchema(**from_pydantic(self.json_schema))
+        return self
+
+class Status(str, Enum):
+    STARTING = "Starting"
+    PROCESSING = "Processing"
+    SUCCEEDED = "Succeeded"
+    FAILED = "Failed"
+    CANCELLED = "Cancelled"
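
For context, a hedged usage sketch (not part of the diff): with the widened `json_schema` annotation and the new `convert_json_schema` validator, a plain Pydantic model can be handed to `Configuration` and is rewritten as the internal `JsonSchema` via `from_pydantic`. The `Invoice` model below is hypothetical; the annotation also admits passing the model class itself.

```python
from pydantic import BaseModel
from chunkr_ai.models import Configuration

class Invoice(BaseModel):
    """Hypothetical extraction target."""
    vendor: str
    total: float

# Passing a model instance; the mode='after' validator shown above should
# convert it into a JsonSchema built by from_pydantic.
config = Configuration(json_schema=Invoice(vendor="Acme", total=0.0))
```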

{chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai/api/misc.py
@@ -1,11 +1,10 @@
-from .config import Configuration
+from .config import Configuration
 import io
 import json
 from pathlib import Path
 from PIL import Image
 import requests
 from typing import Union, Tuple, BinaryIO, Optional
-from pydantic import BaseModel
 
 def prepare_file(
     file: Union[str, Path, BinaryIO, Image.Image]
@@ -127,33 +126,3 @@ def prepare_upload_data(
             files[key] = (None, json.dumps(value), 'application/json')
 
     return files
-
-def from_pydantic(pydantic: BaseModel) -> dict:
-    """Convert a Pydantic model to a Chunk json schema.
-
-    Args:
-        pydantic: A Pydantic BaseModel class or instance
-
-    Returns:
-        dict: A JSON schema compatible with Chunk's format
-    """
-    model = pydantic if isinstance(pydantic, type) else pydantic.__class__
-    schema = model.model_json_schema()
-    print(schema)
-    properties = []
-    for name, details in schema.get('properties', {}).items():
-        prop = Property(
-            name=name,
-            title=details.get('title'),
-            prop_type=details.get('type', 'string'),
-            description=details.get('description'),
-            default=str(details.get('default')) if details.get('default') is not None else None
-        )
-        properties.append(prop)
-
-    json_schema = JsonSchema(
-        title=schema.get('title', model.__name__),
-        properties=properties
-    )
-
-    return json_schema.model_dump(mode="json", exclude_none=True)

chunkr_ai-0.0.12/src/chunkr_ai/api/schema.py
@@ -0,0 +1,128 @@
+from pydantic import BaseModel
+from typing import Optional, List, Union, Type
+import json
+
+class Property(BaseModel):
+    name: str
+    prop_type: str
+    description: Optional[str] = None
+    default: Optional[str] = None
+
+class JsonSchema(BaseModel):
+    title: str
+    properties: List[Property]
+
+def from_pydantic(pydantic: Union[BaseModel, Type[BaseModel]], current_depth: int = 0) -> dict:
+    """Convert a Pydantic model to a Chunk json schema."""
+    MAX_DEPTH = 5
+    model = pydantic if isinstance(pydantic, type) else pydantic.__class__
+    schema = model.model_json_schema()
+    properties = []
+
+    def get_enum_description(details: dict) -> str:
+        """Get description including enum values if they exist"""
+        description = details.get('description', '')
+
+        # First check if this is a direct enum
+        if 'enum' in details:
+            enum_values = details['enum']
+            enum_str = '\nAllowed values:\n' + '\n'.join(f'- {val}' for val in enum_values)
+            return f"{description}{enum_str}"
+
+        # Then check if it's a reference to an enum
+        if '$ref' in details:
+            ref_schema = resolve_ref(details['$ref'], schema.get('$defs', {}))
+            if 'enum' in ref_schema:
+                enum_values = ref_schema['enum']
+                enum_str = '\nAllowed values:\n' + '\n'.join(f'- {val}' for val in enum_values)
+                return f"{description}{enum_str}"
+
+        return description
+
+    def resolve_ref(ref: str, definitions: dict) -> dict:
+        """Resolve a $ref reference to its actual schema"""
+        if not ref.startswith('#/$defs/'):
+            return {}
+        ref_name = ref[len('#/$defs/'):]
+        return definitions.get(ref_name, {})
+
+    def get_nested_schema(field_schema: dict, depth: int) -> dict:
+        if depth >= MAX_DEPTH:
+            return {}
+
+        # If there's a $ref, resolve it first
+        if '$ref' in field_schema:
+            field_schema = resolve_ref(field_schema['$ref'], schema.get('$defs', {}))
+
+        nested_props = {}
+        if field_schema.get('type') == 'object':
+            for name, details in field_schema.get('properties', {}).items():
+                if details.get('type') == 'object' or '$ref' in details:
+                    ref_schema = details
+                    if '$ref' in details:
+                        ref_schema = resolve_ref(details['$ref'], schema.get('$defs', {}))
+                    nested_schema = get_nested_schema(ref_schema, depth + 1)
+                    nested_props[name] = {
+                        'type': 'object',
+                        'description': get_enum_description(details),
+                        'properties': nested_schema
+                    }
+                else:
+                    nested_props[name] = {
+                        'type': details.get('type', 'string'),
+                        'description': get_enum_description(details)
+                    }
+        return nested_props
+
+    for name, details in schema.get('properties', {}).items():
+        # Handle arrays
+        if details.get('type') == 'array':
+            items = details.get('items', {})
+            if '$ref' in items:
+                items = resolve_ref(items['$ref'], schema.get('$defs', {}))
+
+            # Get nested schema for array items
+            item_schema = get_nested_schema(items, current_depth)
+            description = get_enum_description(details)
+
+            if item_schema:
+                description = f"{description}\nList items schema:\n{json.dumps(item_schema, indent=2)}"
+
+            prop = Property(
+                name=name,
+                prop_type='list',
+                description=description
+            )
+        # Handle objects and references
+        elif details.get('type') == 'object' or '$ref' in details:
+            prop_type = 'object'
+            ref_schema = details
+            if '$ref' in details:
+                ref_schema = resolve_ref(details['$ref'], schema.get('$defs', {}))
+
+            nested_schema = get_nested_schema(ref_schema, current_depth)
+
+            prop = Property(
+                name=name,
+                prop_type=prop_type,
+                description=get_enum_description(details),
+                properties=nested_schema
+            )
+
+        # Handle primitive types
+        else:
+            prop = Property(
+                name=name,
+                prop_type=details.get('type', 'string'),
+                description=get_enum_description(details),
+                default=str(details.get('default')) if details.get('default') is not None else None
+            )
+
+        properties.append(prop)
+
+    json_schema = JsonSchema(
+        title=schema.get('title', model.__name__),
+        properties=properties
+    )
+
+    return json_schema.model_dump(mode="json", exclude_none=True)
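
For context, a hedged sketch (not part of the diff) of calling the new `from_pydantic` helper directly. The models below are hypothetical; the helper returns a dict shaped like `JsonSchema`, with enum values appended to descriptions and nested/list item schemas flattened into the description text (recursion capped at `MAX_DEPTH = 5`).

```python
from enum import Enum
from typing import List
from pydantic import BaseModel, Field
from chunkr_ai.api.schema import from_pydantic

class Currency(str, Enum):
    USD = "USD"
    EUR = "EUR"

class LineItem(BaseModel):
    description: str
    amount: float

class Invoice(BaseModel):
    vendor: str = Field(description="Name of the issuing vendor")
    currency: Currency          # enum -> allowed values appended to the description
    items: List[LineItem]       # list of objects -> item schema embedded in the description

schema = from_pydantic(Invoice)
# e.g. {'title': 'Invoice', 'properties': [{'name': 'vendor', 'prop_type': 'string', ...}, ...]}
print(schema["title"], [p["name"] for p in schema["properties"]])
```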

chunkr_ai-0.0.12/src/chunkr_ai/api/task.py
@@ -0,0 +1,61 @@
+from .config import Configuration
+from .misc import prepare_upload_data
+from .task_base import TaskBase
+import time
+
+class TaskResponse(TaskBase):
+    def _poll_request(self) -> dict:
+        while True:
+            try:
+                r = self._client._session.get(self.task_url, headers=self._client._headers())
+                r.raise_for_status()
+                return r.json()
+            except (ConnectionError, TimeoutError) as _:
+                print("Connection error while polling the task, retrying...")
+                time.sleep(0.5)
+            except Exception as e:
+                raise
+
+    def poll(self) -> 'TaskResponse':
+        if not self.task_url:
+            raise ValueError("Task URL not found in response")
+        while True:
+            response = self._poll_request_sync()
+            updated_task = TaskResponse(**response).with_client(self._client)
+            self.__dict__.update(updated_task.__dict__)
+            if result := self._check_status():
+                return result
+            time.sleep(0.5)
+
+    def update(self, config: Configuration) -> 'TaskResponse':
+        if not self.task_url:
+            raise ValueError("Task URL not found")
+        files = prepare_upload_data(None, config)
+        r = self._client._session.patch(
+            f"{self.task_url}",
+            files=files,
+            headers=self._client._headers()
+        )
+        r.raise_for_status()
+        updated = TaskResponse(**r.json()).with_client(self._client)
+        self.__dict__.update(updated.__dict__)
+        return self.poll()
+
+    def cancel(self):
+        if not self.task_url:
+            raise ValueError("Task URL not found")
+        r = self._client._session.get(
+            f"{self.task_url}/cancel",
+            headers=self._client._headers()
+        )
+        r.raise_for_status()
+        self.poll()
+
+    def delete(self):
+        if not self.task_url:
+            raise ValueError("Task URL not found")
+        r = self._client._session.delete(
+            self.task_url,
+            headers=self._client._headers()
+        )
+        r.raise_for_status()

chunkr_ai-0.0.12/src/chunkr_ai/api/task_async.py
@@ -0,0 +1,50 @@
+from .config import Configuration
+from .misc import prepare_upload_data
+from .task_base import TaskBase
+import asyncio
+
+class TaskResponseAsync(TaskBase):
+    async def _poll_request(self) -> dict:
+        try:
+            r = await self._client._client.get(self.task_url, headers=self._client._headers())
+            r.raise_for_status()
+            return r.json()
+        except (ConnectionError, TimeoutError) as _:
+            print("Connection error while polling the task, retrying...")
+            await asyncio.sleep(0.5)
+        except Exception as e:
+            raise
+
+    async def poll(self) -> 'TaskResponseAsync':
+        if not self.task_url:
+            raise ValueError("Task URL not found")
+        while True:
+            j = await self._poll_request()
+            updated = TaskResponseAsync(**j).with_client(self._client)
+            self.__dict__.update(updated.__dict__)
+            if res := self._check_status():
+                return res
+            await asyncio.sleep(0.5)
+
+    async def update(self, config: Configuration) -> 'TaskResponseAsync':
+        if not self.task_url:
+            raise ValueError("Task URL not found")
+        f = prepare_upload_data(None, config)
+        r = await self._client._client.patch(self.task_url, files=f, headers=self._client._headers())
+        r.raise_for_status()
+        updated = TaskResponseAsync(**r.json()).with_client(self._client)
+        self.__dict__.update(updated.__dict__)
+        return await self.poll()
+
+    async def cancel(self):
+        if not self.task_url:
+            raise ValueError("Task URL not found")
+        r = await self._client._client.get(f"{self.task_url}/cancel", headers=self._client._headers())
+        r.raise_for_status()
+        return await self.poll()
+
+    async def delete(self):
+        if not self.task_url:
+            raise ValueError("Task URL not found")
+        r = await self._client._client.delete(self.task_url, headers=self._client._headers())
+        r.raise_for_status()

chunkr_ai-0.0.12/src/chunkr_ai/api/task_base.py
@@ -0,0 +1,83 @@
+from .config import Configuration
+from .protocol import ChunkrClientProtocol
+from ..models import Status, OutputResponse
+from abc import ABC, abstractmethod
+from typing import TypeVar, Optional, Generic, Union
+from pydantic import BaseModel, PrivateAttr
+from datetime import datetime
+
+T = TypeVar('T', bound='TaskBase')
+
+class TaskBase(BaseModel, ABC, Generic[T]):
+    configuration: Configuration
+    created_at: datetime
+    expires_at: Optional[datetime]
+    file_name: Optional[str]
+    finished_at: Optional[datetime]
+    input_file_url: Optional[str]
+    message: str
+    output: Optional[OutputResponse]
+    page_count: Optional[int]
+    pdf_url: Optional[str]
+    started_at: Optional[datetime]
+    status: Status
+    task_id: str
+    task_url: Optional[str]
+    _client: Optional[Union[ChunkrClientProtocol]] = PrivateAttr(default=None)
+
+    @abstractmethod
+    def _poll_request(self) -> dict:
+        """Helper method to make polling request with retry logic (synchronous)"""
+        pass
+
+    @abstractmethod
+    def poll(self) -> T:
+        """Poll the task for completion."""
+        pass
+
+    @abstractmethod
+    def update(self, config: Configuration) -> T:
+        """Update the task configuration."""
+        pass
+
+    @abstractmethod
+    def cancel(self) -> T:
+        """Cancel the task."""
+        pass
+
+    @abstractmethod
+    def delete(self) -> T:
+        """Delete the task."""
+        pass
+
+    def with_client(self, client: Union[ChunkrClientProtocol]) -> T:
+        self._client = client
+        return self
+
+    def _check_status(self) -> Optional[T]:
+        """Helper method to check task status and handle completion/failure"""
+        if self.status == "Failed":
+            raise ValueError(self.message)
+        if self.status not in ("Starting", "Processing"):
+            return self
+        return None
+
+    def html(self) -> str:
+        return self._get_content("html")
+
+    def markdown(self) -> str:
+        return self._get_content("markdown")
+
+    def content(self) -> str:
+        return self._get_content("content")
+
+    def _get_content(self, t: str) -> str:
+        if not self.output:
+            return ""
+        parts = []
+        for c in self.output.chunks:
+            for s in c.segments:
+                v = getattr(s, t)
+                if v:
+                    parts.append(v)
+        return "\n".join(parts)
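
For context, a hedged sketch (not part of the diff) of driving the new task classes. It mirrors the removed main.py; `update_task` appears there, but the Chunkr client itself is unchanged and not shown in this diff, so whether the returned task is already bound and polled is an assumption, and the explicit `poll()` is defensive.

```python
from chunkr_ai.api.chunkr import Chunkr
from chunkr_ai.models import Configuration
from chunkr_ai.api.config import ChunkProcessing

chunkr = Chunkr()
# Assumed: update_task returns a TaskResponse already attached via with_client().
task = chunkr.update_task("<task-id>", Configuration(chunk_processing=ChunkProcessing(target_length=1000)))
task.poll()              # blocks until status leaves Starting/Processing; raises on Failed
print(task.markdown())   # markdown aggregated across all chunks and segments by _get_content()
```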

{chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai/models.py
@@ -17,9 +17,11 @@ from .api.config import (
     SegmentProcessing,
     SegmentType,
     SegmentationStrategy,
+    Status,
 )
 
-from .api.task import TaskResponse
+from .api.task import TaskResponse
+from .api.task_async import TaskResponseAsync
 
 __all__ = [
     'BoundingBox',
@@ -42,5 +44,6 @@ __all__ = [
     'SegmentType',
     'SegmentationStrategy',
     'Status',
-    'TaskResponse'
+    'TaskResponse',
+    'TaskResponseAsync',
 ]

{chunkr_ai-0.0.10 → chunkr_ai-0.0.12/src/chunkr_ai.egg-info}/PKG-INFO
@@ -1,12 +1,13 @@
 Metadata-Version: 2.2
 Name: chunkr-ai
-Version: 0.0.10
+Version: 0.0.12
 Summary: Python client for Chunkr: open source document intelligence
 Author-email: Ishaan Kapoor <ishaan@lumina.sh>
 Project-URL: Homepage, https://chunkr.ai
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: httpx>=0.25.0
+Requires-Dist: httpx>=0.25.0
 Requires-Dist: pillow>=10.0.0
 Requires-Dist: pydantic>=2.0.0
 Requires-Dist: pytest-asyncio>=0.21.0

{chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai.egg-info/SOURCES.txt
@@ -2,7 +2,6 @@ LICENSE
 README.md
 pyproject.toml
 src/chunkr_ai/__init__.py
-src/chunkr_ai/main.py
 src/chunkr_ai/models.py
 src/chunkr_ai.egg-info/PKG-INFO
 src/chunkr_ai.egg-info/SOURCES.txt
@@ -17,6 +16,7 @@ src/chunkr_ai/api/chunkr_base.py
 src/chunkr_ai/api/config.py
 src/chunkr_ai/api/misc.py
 src/chunkr_ai/api/protocol.py
+src/chunkr_ai/api/schema.py
 src/chunkr_ai/api/task.py
 src/chunkr_ai/api/task_async.py
 src/chunkr_ai/api/task_base.py

chunkr_ai-0.0.10/src/chunkr_ai/api/task.py
@@ -1,176 +0,0 @@
-from .protocol import ChunkrClientProtocol
-from .config import Configuration, OutputResponse
-from .misc import prepare_upload_data
-import asyncio
-from datetime import datetime
-from enum import Enum
-from pydantic import BaseModel, PrivateAttr
-import time
-from typing import Optional, Union
-
-class Status(str, Enum):
-    STARTING = "Starting"
-    PROCESSING = "Processing"
-    SUCCEEDED = "Succeeded"
-    FAILED = "Failed"
-    CANCELLED = "Cancelled"
-
-class TaskResponse(BaseModel):
-    configuration: Configuration
-    created_at: datetime
-    expires_at: Optional[datetime] = None
-    file_name: Optional[str] = None
-    finished_at: Optional[datetime] = None
-    input_file_url: Optional[str] = None
-    message: str
-    output: Optional[OutputResponse] = None
-    page_count: Optional[int] = None
-    pdf_url: Optional[str] = None
-    started_at: Optional[datetime] = None
-    status: Status
-    task_id: str
-    task_url: Optional[str] = None
-    _client: Optional[Union[ChunkrClientProtocol]] = PrivateAttr(default=None)
-
-    def with_client(self, client: Union[ChunkrClientProtocol]) -> 'TaskResponse':
-        self._client = client
-        return self
-
-    def _poll_request_sync(self) -> dict:
-        """Helper method to make polling request with retry logic (synchronous)"""
-        if not self.task_url:
-            raise ValueError("Task URL not found in response")
-
-        while True:
-            try:
-                r = self._client._session.get(self.task_url, headers=self._client._headers())
-                r.raise_for_status()
-                return r.json()
-            except (ConnectionError, TimeoutError) as _:
-                print("Connection error while polling the task, retrying...")
-                time.sleep(0.5)
-            except Exception as e:
-                raise
-
-    async def _poll_request_async(self) -> dict:
-        """Helper method to make polling request with retry logic (asynchronous)"""
-        if not self.task_url:
-            raise ValueError("Task URL not found in response")
-
-        while True:
-            try:
-                r = await self._client._client.get(self.task_url, headers=self._client._headers())
-                r.raise_for_status()
-                response = r.json()
-                return response
-            except (ConnectionError, TimeoutError) as _:
-                print("Connection error while polling the task, retrying...")
-                await asyncio.sleep(0.5)
-            except Exception as e:
-                raise
-
-    def _check_status(self) -> Optional['TaskResponse']:
-        """Helper method to check task status and handle completion/failure"""
-        if self.status == "Failed":
-            raise ValueError(self.message)
-        if self.status not in ("Starting", "Processing"):
-            return self
-        return None
-
-    def poll(self) -> 'TaskResponse':
-        """Poll the task for completion."""
-        while True:
-            response = self._poll_request_sync()
-            updated_task = TaskResponse(**response).with_client(self._client)
-            self.__dict__.update(updated_task.__dict__)
-
-            if result := self._check_status():
-                return result
-
-            time.sleep(0.5)
-
-    async def poll_async(self) -> 'TaskResponse':
-        """Poll the task for completion asynchronously."""
-        while True:
-            response = await self._poll_request_async()
-            updated_task = TaskResponse(**response).with_client(self._client)
-            self.__dict__.update(updated_task.__dict__)
-
-            if result := self._check_status():
-                return result
-
-            await asyncio.sleep(0.5)
-
-    def _get_content(self, content_type: str) -> str:
-        """Helper method to get either HTML, Markdown, or raw content."""
-        if not self.output:
-            return ""
-        parts = []
-        for c in self.output.chunks:
-            for s in c.segments:
-                content = getattr(s, content_type)
-                if content:
-                    parts.append(content)
-        return "\n".join(parts)
-
-    def update(self, config: Configuration) -> 'TaskResponse':
-        files = prepare_upload_data(None, config)
-        r = self._client._session.patch(
-            f"{self.task_url}",
-            files=files,
-            headers=self._client._headers()
-        )
-        r.raise_for_status()
-        return TaskResponse(**r.json()).with_client(self._client)
-
-    async def update_async(self, config: Configuration) -> 'TaskResponse':
-        files = prepare_upload_data(None, config)
-        r = await self._client._client.patch(
-            f"{self.task_url}",
-            files=files,
-            headers=self._client._headers()
-        )
-        r.raise_for_status()
-        return TaskResponse(**r.json()).with_client(self._client)
-
-    def cancel(self):
-        r = self._client._session.get(
-            f"{self.task_url}/cancel",
-            headers=self._client._headers()
-        )
-        r.raise_for_status()
-        self.poll()
-
-    async def cancel_async(self):
-        r = await self._client._client.get(
-            f"{self.task_url}/cancel",
-            headers=self._client._headers()
-        )
-        r.raise_for_status()
-        await self.poll_async()
-
-    def delete(self):
-        r = self._client._session.delete(
-            f"{self.task_url}",
-            headers=self._client._headers()
-        )
-        r.raise_for_status()
-
-    async def delete_async(self):
-        r = await self._client._client.delete(
-            f"{self.task_url}",
-            headers=self._client._headers()
-        )
-        r.raise_for_status()
-
-    def html(self) -> str:
-        """Get full HTML for the task"""
-        return self._get_content("html")
-
-    def markdown(self) -> str:
-        """Get full markdown for the task"""
-        return self._get_content("markdown")
-
-    def content(self) -> str:
-        """Get full text for the task"""
-        return self._get_content("content")

chunkr_ai-0.0.10/src/chunkr_ai/api/task_async.py
@@ -1,111 +0,0 @@
-import asyncio
-from pydantic import BaseModel, PrivateAttr
-from datetime import datetime
-from enum import Enum
-from typing import Optional, Union
-from .task_base import TaskBase
-from .protocol import ChunkrClientProtocol
-from .config import Configuration, OutputResponse
-from .misc import prepare_upload_data
-
-class Status(str, Enum):
-    STARTING = "Starting"
-    PROCESSING = "Processing"
-    SUCCEEDED = "Succeeded"
-    FAILED = "Failed"
-    CANCELLED = "Cancelled"
-
-class TaskResponseAsync(BaseModel, TaskBase):
-    configuration: Configuration
-    created_at: datetime
-    expires_at: Optional[datetime]
-    file_name: Optional[str]
-    finished_at: Optional[datetime]
-    input_file_url: Optional[str]
-    message: str
-    output: Optional[OutputResponse]
-    page_count: Optional[int]
-    pdf_url: Optional[str]
-    started_at: Optional[datetime]
-    status: Status
-    task_id: str
-    task_url: Optional[str]
-    _client: Optional[Union[ChunkrClientProtocol]] = PrivateAttr(default=None)
-
-    def with_client(self, client: Union[ChunkrClientProtocol]) -> 'TaskResponseAsync':
-        self._client = client
-        return self
-
-    async def poll(self) -> 'TaskResponseAsync':
-        while True:
-            j = await self._poll_request()
-            updated = TaskResponseAsync(**j).with_client(self._client)
-            self.__dict__.update(updated.__dict__)
-            if res := self._check_status():
-                return res
-            await asyncio.sleep(0.5)
-
-    async def _poll_request(self) -> dict:
-        if not self.task_url:
-            raise ValueError("Task URL not found")
-        while True:
-            try:
-                r = await self._client._client.get(self.task_url, headers=self._client._headers())
-                r.raise_for_status()
-                return r.json()
-            except Exception as e:
-                if self.status == Status.FAILED:
-                    raise ValueError(self.message) from e
-                await asyncio.sleep(0.5)
-
-    def _check_status(self) -> Optional['TaskResponseAsync']:
-        if self.status == Status.FAILED:
-            raise ValueError(f"Task failed: {self.message}")
-        if self.status == Status.CANCELLED:
-            return self
-        if self.status not in [Status.STARTING, Status.PROCESSING]:
-            return self
-        return None
-
-    async def update(self, config: Configuration) -> 'TaskResponseAsync':
-        if not self.task_url:
-            raise ValueError("Task URL not found")
-        f = prepare_upload_data(None, config)
-        r = await self._client._client.patch(self.task_url, files=f, headers=self._client._headers())
-        r.raise_for_status()
-        updated = TaskResponseAsync(**r.json()).with_client(self._client)
-        self.__dict__.update(updated.__dict__)
-        return await self.poll()
-
-    async def cancel(self):
-        if not self.task_url:
-            raise ValueError("Task URL not found")
-        r = await self._client._client.get(f"{self.task_url}/cancel", headers=self._client._headers())
-        r.raise_for_status()
-        return await self.poll()
-
-    async def delete(self):
-        r = await self._client._client.delete(self.task_url, headers=self._client._headers())
-        r.raise_for_status()
-
-    def html(self) -> str:
-        return self._get_content("html")
-
-    def markdown(self) -> str:
-        return self._get_content("markdown")
-
-    def content(self) -> str:
-        return self._get_content("content")
-
-    def _get_content(self, t: str) -> str:
-        if not self.output:
-            return ""
-        parts = []
-        for c in self.output.chunks:
-            for s in c.segments:
-                v = getattr(s, t)
-                if v:
-                    parts.append(v)
-        return "\n".join(parts)
-
-    # Satisfying TaskBase abstract methods with stubs

chunkr_ai-0.0.10/src/chunkr_ai/api/task_base.py
@@ -1,31 +0,0 @@
-from abc import ABC, abstractmethod
-from .config import Configuration
-
-class TaskBase(ABC):
-    @abstractmethod
-    def poll(self):
-        pass
-
-    @abstractmethod
-    def update(self, config: Configuration):
-        pass
-
-    @abstractmethod
-    def cancel(self):
-        pass
-
-    @abstractmethod
-    def delete(self):
-        pass
-
-    @abstractmethod
-    def html(self) -> str:
-        pass
-
-    @abstractmethod
-    def markdown(self) -> str:
-        pass
-
-    @abstractmethod
-    def content(self) -> str:
-        pass

chunkr_ai-0.0.10/src/chunkr_ai/main.py
@@ -1,12 +0,0 @@
-from chunkr_ai.api.chunkr import Chunkr
-from chunkr_ai.models import Configuration
-from chunkr_ai.api.config import SegmentationStrategy, ChunkProcessing
-
-if __name__ == "__main__":
-    chunkr = Chunkr()
-    task = chunkr.update_task("556b4fe5-e3f7-48dc-9f56-0fb7fbacdb87", Configuration(
-        chunk_processing=ChunkProcessing(
-            target_length=1000
-        )
-    ))
-    print(task)