chunkr-ai 0.0.17__py3-none-any.whl → 0.0.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
chunkr_ai/api/schema.py DELETED
@@ -1,136 +0,0 @@
1
- from pydantic import BaseModel
2
- from typing import Optional, List, Union, Type
3
- import json
4
-
5
-
6
class Property(BaseModel):
    # One entry of the flattened schema built by from_pydantic().
    # (Kept as `#` comments: a class docstring on a pydantic model would be
    # emitted as the model's JSON-schema description.)

    # Field name as it appears in the source model's JSON schema.
    name: str
    # Type label: "list", "object", or the JSON-schema "type" string.
    prop_type: str
    # Free-text description; may carry allowed enum values / nested schema.
    description: Optional[str] = None
    # Stringified default value, when the source field declares one.
    default: Optional[str] = None
11
-
12
-
13
class JsonSchema(BaseModel):
    # Top-level container returned (as a dict) by from_pydantic().
    # (Kept as `#` comments: a class docstring on a pydantic model would be
    # emitted as the model's JSON-schema description.)

    # Schema title; from_pydantic() uses the model's schema title or name.
    title: str
    # Flattened list of the model's top-level fields.
    properties: List[Property]
16
-
17
-
18
def from_pydantic(
    pydantic: Union[BaseModel, Type[BaseModel]], current_depth: int = 0
) -> dict:
    """Flatten a Pydantic model (class or instance) into a ``JsonSchema`` dict.

    The model's ``model_json_schema()`` output is walked one top-level field
    at a time and each field becomes a ``Property``:

    * ``type == "array"``  -> ``prop_type="list"``; when the item schema is an
      object, its nested layout is appended to the description as JSON.
    * ``type == "object"`` or a ``$ref`` -> ``prop_type="object"``.
    * anything else -> the JSON-schema ``type`` (``"string"`` when absent),
      with the default stringified if one is declared.

    Args:
        pydantic: A ``BaseModel`` subclass or an instance of one.
        current_depth: Depth at which nested-schema expansion starts; expansion
            stops once the depth reaches the internal limit of 5.

    Returns:
        ``JsonSchema(...).model_dump(mode="json", exclude_none=True)``.
    """
    MAX_DEPTH = 5
    model = pydantic if isinstance(pydantic, type) else pydantic.__class__
    schema = model.model_json_schema()

    def resolve_ref(ref: str, definitions: dict) -> dict:
        """Look up a ``#/$defs/<name>`` reference; anything else yields ``{}``."""
        prefix = "#/$defs/"
        if ref.startswith(prefix):
            return definitions.get(ref[len(prefix):], {})
        return {}

    def deref(node: dict) -> dict:
        """Return the schema a node points at, or the node itself if inline."""
        if "$ref" in node:
            return resolve_ref(node["$ref"], schema.get("$defs", {}))
        return node

    def with_allowed_values(text: str, values) -> str:
        """Append a bulleted "Allowed values" list to a description."""
        bullets = "\n".join(f"- {val}" for val in values)
        return f"{text}\nAllowed values:\n{bullets}"

    def get_enum_description(details: dict) -> str:
        """Description of a field, extended with enum values when present."""
        description = details.get("description", "")
        # An inline enum takes precedence over one reached through a $ref.
        if "enum" in details:
            return with_allowed_values(description, details["enum"])
        if "$ref" in details:
            target = resolve_ref(details["$ref"], schema.get("$defs", {}))
            if "enum" in target:
                return with_allowed_values(description, target["enum"])
        return description

    def get_nested_schema(field_schema: dict, depth: int) -> dict:
        """Recursively describe an object schema's properties (depth-limited)."""
        if depth >= MAX_DEPTH:
            return {}

        field_schema = deref(field_schema)
        if field_schema.get("type") != "object":
            return {}

        nested_props = {}
        for name, details in field_schema.get("properties", {}).items():
            if details.get("type") == "object" or "$ref" in details:
                nested_props[name] = {
                    "type": "object",
                    "description": get_enum_description(details),
                    "properties": get_nested_schema(deref(details), depth + 1),
                }
            else:
                nested_props[name] = {
                    "type": details.get("type", "string"),
                    "description": get_enum_description(details),
                }
        return nested_props

    properties = []
    for name, details in schema.get("properties", {}).items():
        field_type = details.get("type")

        if field_type == "array":
            # Lists: describe the element layout inside the description text.
            items = deref(details.get("items", {}))
            item_schema = get_nested_schema(items, current_depth)
            description = get_enum_description(details)
            if item_schema:
                rendered = json.dumps(item_schema, indent=2)
                description = f"{description}\nList items schema:\n{rendered}"
            prop = Property(name=name, prop_type="list", description=description)

        elif field_type == "object" or "$ref" in details:
            # NOTE(review): Property, as defined alongside this function,
            # declares no `properties` field. Under pydantic's default config
            # unknown keyword arguments are ignored, so this nested schema is
            # presumably dropped from the output -- confirm intended behavior.
            prop = Property(
                name=name,
                prop_type="object",
                description=get_enum_description(details),
                properties=get_nested_schema(deref(details), current_depth),
            )

        else:
            default = details.get("default")
            prop = Property(
                name=name,
                prop_type=details.get("type", "string"),
                description=get_enum_description(details),
                default=None if default is None else str(default),
            )

        properties.append(prop)

    json_schema = JsonSchema(
        title=schema.get("title", model.__name__), properties=properties
    )
    return json_schema.model_dump(mode="json", exclude_none=True)
chunkr_ai/api/task.py DELETED
@@ -1,66 +0,0 @@
1
- from .config import Configuration
2
- from .misc import prepare_upload_data
3
- from .task_base import TaskBase
4
- import time
5
-
6
-
7
class TaskResponse(TaskBase):
    # Synchronous task handle: every operation goes through the attached
    # client's `_session` and `_headers()`.

    def _poll_request(self) -> dict:
        """Fetch the task's current JSON payload, retrying on connection errors.

        Returns:
            The decoded JSON body of a GET on ``task_url``.

        Raises:
            ValueError: If ``task_url`` or the client session is missing.
            Exception: Any other error (including HTTP errors from
                ``raise_for_status``) propagates unchanged.
        """
        while True:
            try:
                if not self.task_url:
                    raise ValueError("Task URL not found in response")
                if not self._client._session:
                    raise ValueError("Client session not found")
                r = self._client._session.get(
                    self.task_url, headers=self._client._headers()
                )
                r.raise_for_status()
                return r.json()
            except (ConnectionError, TimeoutError):
                # NOTE(review): these are the builtin exception classes; if the
                # session raises library-specific connection/timeout errors
                # that do not derive from them, this retry never triggers --
                # confirm against the HTTP client in use.
                print("Connection error while polling the task, retrying...")
                time.sleep(0.5)

    def poll(self) -> "TaskResponse":
        """Block until the task leaves the "Starting"/"Processing" states.

        The instance is refreshed in place from each poll response.

        Returns:
            This task, once ``_check_status`` reports completion.

        Raises:
            ValueError: If the task failed (raised by ``_check_status``) or the
                URL/session is missing.
        """
        while True:
            response = self._poll_request()
            updated_task = TaskResponse(**response).with_client(self._client)
            self.__dict__.update(updated_task.__dict__)
            if result := self._check_status():
                return result
            time.sleep(0.5)

    def update(self, config: Configuration) -> "TaskResponse":
        """PATCH the task with a new configuration, then poll to completion.

        Raises:
            ValueError: If ``task_url`` or the client session is missing.
        """
        if not self.task_url:
            raise ValueError("Task URL not found")
        if not self._client._session:
            raise ValueError("Client session not found")
        files = prepare_upload_data(None, config)
        r = self._client._session.patch(
            self.task_url, files=files, headers=self._client._headers()
        )
        r.raise_for_status()
        updated = TaskResponse(**r.json()).with_client(self._client)
        self.__dict__.update(updated.__dict__)
        return self.poll()

    def cancel(self) -> "TaskResponse":
        """Request cancellation of the task, then poll for its final state.

        Returns:
            The polled task. (Previously the poll result was discarded and
            ``None`` was returned, unlike the async counterpart and the
            ``TaskBase.cancel`` signature.)

        Raises:
            ValueError: If ``task_url`` or the client session is missing.
        """
        if not self.task_url:
            raise ValueError("Task URL not found")
        if not self._client._session:
            raise ValueError("Client session not found")
        r = self._client._session.get(
            f"{self.task_url}/cancel", headers=self._client._headers()
        )
        r.raise_for_status()
        return self.poll()

    def delete(self) -> None:
        """Send a DELETE for the task.

        Raises:
            ValueError: If ``task_url`` or the client session is missing.
        """
        if not self.task_url:
            raise ValueError("Task URL not found")
        if not self._client._session:
            raise ValueError("Client session not found")
        r = self._client._session.delete(self.task_url, headers=self._client._headers())
        r.raise_for_status()
@@ -1,69 +0,0 @@
1
- from .config import Configuration
2
- from .misc import prepare_upload_data
3
- from .task_base import TaskBase
4
- import asyncio
5
-
6
-
7
class TaskResponseAsync(TaskBase):
    # Asynchronous task handle: every operation awaits the attached client's
    # `_client` and uses its `_headers()`.

    async def _poll_request(self) -> dict:
        """Fetch the task's current JSON payload, retrying on connection errors.

        The request is retried in a loop. Without the loop, a connection error
        would sleep once and then fall off the end of the function, returning
        ``None`` and making ``poll()`` fail on ``TaskResponseAsync(**None)``.

        Returns:
            The decoded JSON body of a GET on ``task_url``.

        Raises:
            ValueError: If the underlying client is missing.
            Exception: Any other error (including HTTP errors from
                ``raise_for_status``) propagates unchanged.
        """
        while True:
            try:
                if not self._client._client:
                    raise ValueError("Client not found")
                r = await self._client._client.get(
                    self.task_url, headers=self._client._headers()
                )
                r.raise_for_status()
                return r.json()
            except (ConnectionError, TimeoutError):
                # NOTE(review): these are the builtin exception classes; if the
                # client raises library-specific connection/timeout errors
                # that do not derive from them, this retry never triggers --
                # confirm against the HTTP client in use.
                print("Connection error while polling the task, retrying...")
                await asyncio.sleep(0.5)

    async def poll(self) -> "TaskResponseAsync":
        """Wait until the task leaves the "Starting"/"Processing" states.

        The instance is refreshed in place from each poll response.

        Returns:
            This task, once ``_check_status`` reports completion.

        Raises:
            ValueError: If the task failed (raised by ``_check_status``) or the
                URL/client is missing.
        """
        if not self.task_url:
            raise ValueError("Task URL not found")
        if not self._client._client:
            raise ValueError("Client not found")
        while True:
            j = await self._poll_request()
            updated = TaskResponseAsync(**j).with_client(self._client)
            self.__dict__.update(updated.__dict__)
            if res := self._check_status():
                return res
            await asyncio.sleep(0.5)

    async def update(self, config: Configuration) -> "TaskResponseAsync":
        """PATCH the task with a new configuration, then poll to completion.

        Raises:
            ValueError: If ``task_url`` or the client is missing.
        """
        if not self.task_url:
            raise ValueError("Task URL not found")
        if not self._client._client:
            raise ValueError("Client not found")
        f = prepare_upload_data(None, config)
        r = await self._client._client.patch(
            self.task_url, files=f, headers=self._client._headers()
        )
        r.raise_for_status()
        updated = TaskResponseAsync(**r.json()).with_client(self._client)
        self.__dict__.update(updated.__dict__)
        return await self.poll()

    async def cancel(self) -> "TaskResponseAsync":
        """Request cancellation of the task, then poll for its final state.

        Raises:
            ValueError: If ``task_url`` or the client is missing.
        """
        if not self.task_url:
            raise ValueError("Task URL not found")
        if not self._client._client:
            raise ValueError("Client not found")
        r = await self._client._client.get(
            f"{self.task_url}/cancel", headers=self._client._headers()
        )
        r.raise_for_status()
        return await self.poll()

    async def delete(self) -> None:
        """Send a DELETE for the task.

        Raises:
            ValueError: If ``task_url`` or the client is missing.
        """
        if not self.task_url:
            raise ValueError("Task URL not found")
        if not self._client._client:
            raise ValueError("Client not found")
        r = await self._client._client.delete(
            self.task_url, headers=self._client._headers()
        )
        r.raise_for_status()
@@ -1,85 +0,0 @@
1
- from .config import Configuration, Status, OutputResponse
2
- from .protocol import ChunkrClientProtocol
3
- from abc import ABC, abstractmethod
4
- from typing import TypeVar, Optional, Generic
5
- from pydantic import BaseModel, PrivateAttr
6
- from datetime import datetime
7
-
8
- T = TypeVar("T", bound="TaskBase")
9
-
10
class TaskBase(BaseModel, ABC, Generic[T]):
    # Shared model for a task payload plus the abstract operations that the
    # concrete task classes implement. (Kept as `#` comments: a class
    # docstring on a pydantic model would be emitted as its JSON-schema
    # description.)

    configuration: Configuration
    created_at: datetime
    expires_at: Optional[datetime]
    file_name: Optional[str]
    finished_at: Optional[datetime]
    input_file_url: Optional[str]
    message: str
    output: Optional[OutputResponse]
    page_count: Optional[int]
    pdf_url: Optional[str]
    started_at: Optional[datetime]
    status: Status
    task_id: str
    task_url: Optional[str]
    # Client used for follow-up requests; attached via with_client().
    _client: Optional[ChunkrClientProtocol] = PrivateAttr(default=None)

    @abstractmethod
    def _poll_request(self) -> dict:
        """Make one polling request (with retry logic) and return its payload."""

    @abstractmethod
    def poll(self) -> T:
        """Poll the task until it completes."""

    @abstractmethod
    def update(self, config: Configuration) -> T:
        """Apply a new configuration to the task."""

    @abstractmethod
    def cancel(self) -> T:
        """Cancel the task."""

    @abstractmethod
    def delete(self) -> T:
        """Delete the task."""

    def with_client(self, client: ChunkrClientProtocol) -> T:
        """Attach ``client`` to this task and return the task itself."""
        self._client = client
        return self

    def _check_status(self) -> Optional[T]:
        """Return the task once it is no longer running.

        Returns:
            ``self`` when the status is neither "Starting" nor "Processing",
            otherwise ``None``.

        Raises:
            ValueError: With the task's ``message`` when the status is "Failed".
        """
        if self.status == "Failed":
            raise ValueError(self.message)
        still_running = self.status in ("Starting", "Processing")
        return None if still_running else self

    def html(self) -> str:
        """Return the ``html`` of every segment, joined by newlines."""
        return self._get_content("html")

    def markdown(self) -> str:
        """Return the ``markdown`` of every segment, joined by newlines."""
        return self._get_content("markdown")

    def content(self) -> str:
        """Return the ``content`` of every segment, joined by newlines."""
        return self._get_content("content")

    def _get_content(self, t: str) -> str:
        """Join attribute ``t`` of all segments of all output chunks.

        Falsy attribute values are skipped; a task without output yields "".
        """
        if not self.output:
            return ""
        segments = (seg for chunk in self.output.chunks for seg in chunk.segments)
        values = (getattr(seg, t) for seg in segments)
        return "\n".join(v for v in values if v)
@@ -1,21 +0,0 @@
1
- chunkr_ai/__init__.py,sha256=q5YosvCNXPNGjV10pZY1gcvdosqUh38nVQTQA9g8EuM,110
2
- chunkr_ai/models.py,sha256=hahbtxtTyzE_ygFgmlZwbfM6Vj2k5uSDEP02psxDOSQ,924
3
- chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- chunkr_ai/api/api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- chunkr_ai/api/auth.py,sha256=hlv0GiUmlsbFO1wLL9sslqOnsBSoBqkL_6Mk2SDvxgE,413
6
- chunkr_ai/api/base.py,sha256=QvHl8FInKHYKPLWDeEPpCchB1uktzOwTW7iPnyXccUc,6449
7
- chunkr_ai/api/chunkr.py,sha256=0extAWVeZtI7B-g14smTfFZD_csdJNCcVNXx2_L69OQ,2617
8
- chunkr_ai/api/chunkr_async.py,sha256=aa0s_tnYoujHBsfe8uLiPpVEnb2l9A3CXwPP34w9Mk8,4127
9
- chunkr_ai/api/chunkr_base.py,sha256=k34Dyt1f21NBWZvZJ3w6Svvpg4SKnzr2ldGQ4ib96Wc,4951
10
- chunkr_ai/api/config.py,sha256=TWl0Az6acKQCS1LIpKD4qr_lQ_63wqQ5M6calpLOlDM,5040
11
- chunkr_ai/api/misc.py,sha256=bQpURc7soT5GL2ZpY7EiYyvPYWEzDM9qaX-UHa-oFeI,4909
12
- chunkr_ai/api/protocol.py,sha256=lxIR_qoCA2a1OXjpq3LrWMdS0jRHct1bEmBlUzV8gvE,526
13
- chunkr_ai/api/schema.py,sha256=yYesvueGgtmRa7Fi_Tpdv8A2bzHlx-B-5DxRAPlaDHo,4926
14
- chunkr_ai/api/task.py,sha256=28J4dR8BDjvtkh3CQjW_YUEkgPXhCHBGu0wH6AQKKuE,2474
15
- chunkr_ai/api/task_async.py,sha256=K5hTEOnmD42snPZg_JtJsVWg6QBUFZ1aBz1Abwv58-A,2529
16
- chunkr_ai/api/task_base.py,sha256=KLiMhvvbCgcilguQKrtEPMlNs8oaatfQUtn8pYt9t6g,2467
17
- chunkr_ai-0.0.17.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
- chunkr_ai-0.0.17.dist-info/METADATA,sha256=giy1xeKYXk18W5U-baNoAAlvXciJldhA_EBi87NqKpA,4839
19
- chunkr_ai-0.0.17.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
20
- chunkr_ai-0.0.17.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
21
- chunkr_ai-0.0.17.dist-info/RECORD,,