chunkr-ai 0.0.10__tar.gz → 0.0.12__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
Files changed (29) hide show
  1. {chunkr_ai-0.0.10/src/chunkr_ai.egg-info → chunkr_ai-0.0.12}/PKG-INFO +2 -1
  2. {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/pyproject.toml +2 -1
  3. {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai/api/config.py +17 -2
  4. {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai/api/misc.py +1 -32
  5. chunkr_ai-0.0.12/src/chunkr_ai/api/schema.py +128 -0
  6. chunkr_ai-0.0.12/src/chunkr_ai/api/task.py +61 -0
  7. chunkr_ai-0.0.12/src/chunkr_ai/api/task_async.py +50 -0
  8. chunkr_ai-0.0.12/src/chunkr_ai/api/task_base.py +83 -0
  9. {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai/models.py +5 -2
  10. {chunkr_ai-0.0.10 → chunkr_ai-0.0.12/src/chunkr_ai.egg-info}/PKG-INFO +2 -1
  11. {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai.egg-info/SOURCES.txt +1 -1
  12. {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai.egg-info/requires.txt +1 -0
  13. chunkr_ai-0.0.10/src/chunkr_ai/api/task.py +0 -176
  14. chunkr_ai-0.0.10/src/chunkr_ai/api/task_async.py +0 -111
  15. chunkr_ai-0.0.10/src/chunkr_ai/api/task_base.py +0 -31
  16. chunkr_ai-0.0.10/src/chunkr_ai/main.py +0 -12
  17. {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/LICENSE +0 -0
  18. {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/README.md +0 -0
  19. {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/setup.cfg +0 -0
  20. {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai/__init__.py +0 -0
  21. {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai/api/__init__.py +0 -0
  22. {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai/api/auth.py +0 -0
  23. {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai/api/chunkr.py +0 -0
  24. {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai/api/chunkr_async.py +0 -0
  25. {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai/api/chunkr_base.py +0 -0
  26. {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai/api/protocol.py +0 -0
  27. {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
  28. {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/src/chunkr_ai.egg-info/top_level.txt +0 -0
  29. {chunkr_ai-0.0.10 → chunkr_ai-0.0.12}/tests/test_chunkr.py +0 -0
@@ -1,12 +1,13 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: chunkr-ai
3
- Version: 0.0.10
3
+ Version: 0.0.12
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  Project-URL: Homepage, https://chunkr.ai
7
7
  Description-Content-Type: text/markdown
8
8
  License-File: LICENSE
9
9
  Requires-Dist: httpx>=0.25.0
10
+ Requires-Dist: httpx>=0.25.0
10
11
  Requires-Dist: pillow>=10.0.0
11
12
  Requires-Dist: pydantic>=2.0.0
12
13
  Requires-Dist: pytest-asyncio>=0.21.0
@@ -4,13 +4,14 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "chunkr-ai"
7
- version = "0.0.10"
7
+ version = "0.0.12"
8
8
  authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
9
9
  description = "Python client for Chunkr: open source document intelligence"
10
10
  readme = "README.md"
11
11
  license = {"file" = "LICENSE"}
12
12
  urls = {Homepage = "https://chunkr.ai"}
13
13
  dependencies = [
14
+ "httpx>=0.25.0",
14
15
  "httpx>=0.25.0",
15
16
  "pillow>=10.0.0",
16
17
  "pydantic>=2.0.0",
@@ -1,6 +1,7 @@
1
1
  from pydantic import BaseModel, Field, model_validator, ConfigDict
2
2
  from enum import Enum
3
- from typing import Optional, List, Dict
3
+ from typing import Optional, List, Dict, Union, Type
4
+ from .schema import from_pydantic
4
5
 
5
6
  class GenerationStrategy(str, Enum):
6
7
  LLM = "LLM"
@@ -114,7 +115,7 @@ class Configuration(BaseModel):
114
115
  chunk_processing: Optional[ChunkProcessing] = Field(default=None)
115
116
  expires_in: Optional[int] = Field(default=None)
116
117
  high_resolution: Optional[bool] = Field(default=None)
117
- json_schema: Optional[JsonSchema] = Field(default=None)
118
+ json_schema: Optional[Union[JsonSchema, Type[BaseModel], BaseModel]] = Field(default=None)
118
119
  model: Optional[Model] = Field(default=None)
119
120
  ocr_strategy: Optional[OcrStrategy] = Field(default=None)
120
121
  segment_processing: Optional[SegmentProcessing] = Field(default=None)
@@ -128,3 +129,17 @@ class Configuration(BaseModel):
128
129
  values["chunk_processing"] = values.get("chunk_processing", {}) or {}
129
130
  values["chunk_processing"]["target_length"] = target_length
130
131
  return values
132
+
133
+ @model_validator(mode='after')
134
+ def convert_json_schema(self) -> 'Configuration':
135
+ if self.json_schema is not None and not isinstance(self.json_schema, JsonSchema):
136
+ if isinstance(self.json_schema, (BaseModel, type)) and issubclass(getattr(self.json_schema, '__class__', type), BaseModel):
137
+ self.json_schema = JsonSchema(**from_pydantic(self.json_schema))
138
+ return self
139
+
140
+ class Status(str, Enum):
141
+ STARTING = "Starting"
142
+ PROCESSING = "Processing"
143
+ SUCCEEDED = "Succeeded"
144
+ FAILED = "Failed"
145
+ CANCELLED = "Cancelled"
@@ -1,11 +1,10 @@
1
- from .config import Configuration, Property, JsonSchema
1
+ from .config import Configuration
2
2
  import io
3
3
  import json
4
4
  from pathlib import Path
5
5
  from PIL import Image
6
6
  import requests
7
7
  from typing import Union, Tuple, BinaryIO, Optional
8
- from pydantic import BaseModel
9
8
 
10
9
  def prepare_file(
11
10
  file: Union[str, Path, BinaryIO, Image.Image]
@@ -127,33 +126,3 @@ def prepare_upload_data(
127
126
  files[key] = (None, json.dumps(value), 'application/json')
128
127
 
129
128
  return files
130
-
131
- def from_pydantic(pydantic: BaseModel) -> dict:
132
- """Convert a Pydantic model to a Chunk json schema.
133
-
134
- Args:
135
- pydantic: A Pydantic BaseModel class or instance
136
-
137
- Returns:
138
- dict: A JSON schema compatible with Chunk's format
139
- """
140
- model = pydantic if isinstance(pydantic, type) else pydantic.__class__
141
- schema = model.model_json_schema()
142
- print(schema)
143
- properties = []
144
- for name, details in schema.get('properties', {}).items():
145
- prop = Property(
146
- name=name,
147
- title=details.get('title'),
148
- prop_type=details.get('type', 'string'),
149
- description=details.get('description'),
150
- default=str(details.get('default')) if details.get('default') is not None else None
151
- )
152
- properties.append(prop)
153
-
154
- json_schema = JsonSchema(
155
- title=schema.get('title', model.__name__),
156
- properties=properties
157
- )
158
-
159
- return json_schema.model_dump(mode="json", exclude_none=True)
@@ -0,0 +1,128 @@
1
+ from pydantic import BaseModel
2
+ from typing import Optional, List, Union, Type
3
+ import json
4
+
5
+ class Property(BaseModel):
6
+ name: str
7
+ prop_type: str
8
+ description: Optional[str] = None
9
+ default: Optional[str] = None
10
+
11
+ class JsonSchema(BaseModel):
12
+ title: str
13
+ properties: List[Property]
14
+
15
+ def from_pydantic(pydantic: Union[BaseModel, Type[BaseModel]], current_depth: int = 0) -> dict:
16
+ """Convert a Pydantic model to a Chunk json schema."""
17
+ MAX_DEPTH = 5
18
+ model = pydantic if isinstance(pydantic, type) else pydantic.__class__
19
+ schema = model.model_json_schema()
20
+ properties = []
21
+
22
+ def get_enum_description(details: dict) -> str:
23
+ """Get description including enum values if they exist"""
24
+ description = details.get('description', '')
25
+
26
+ # First check if this is a direct enum
27
+ if 'enum' in details:
28
+ enum_values = details['enum']
29
+ enum_str = '\nAllowed values:\n' + '\n'.join(f'- {val}' for val in enum_values)
30
+ return f"{description}{enum_str}"
31
+
32
+ # Then check if it's a reference to an enum
33
+ if '$ref' in details:
34
+ ref_schema = resolve_ref(details['$ref'], schema.get('$defs', {}))
35
+ if 'enum' in ref_schema:
36
+ enum_values = ref_schema['enum']
37
+ enum_str = '\nAllowed values:\n' + '\n'.join(f'- {val}' for val in enum_values)
38
+ return f"{description}{enum_str}"
39
+
40
+ return description
41
+
42
+ def resolve_ref(ref: str, definitions: dict) -> dict:
43
+ """Resolve a $ref reference to its actual schema"""
44
+ if not ref.startswith('#/$defs/'):
45
+ return {}
46
+ ref_name = ref[len('#/$defs/'):]
47
+ return definitions.get(ref_name, {})
48
+
49
+ def get_nested_schema(field_schema: dict, depth: int) -> dict:
50
+ if depth >= MAX_DEPTH:
51
+ return {}
52
+
53
+ # If there's a $ref, resolve it first
54
+ if '$ref' in field_schema:
55
+ field_schema = resolve_ref(field_schema['$ref'], schema.get('$defs', {}))
56
+
57
+ nested_props = {}
58
+ if field_schema.get('type') == 'object':
59
+ for name, details in field_schema.get('properties', {}).items():
60
+ if details.get('type') == 'object' or '$ref' in details:
61
+ ref_schema = details
62
+ if '$ref' in details:
63
+ ref_schema = resolve_ref(details['$ref'], schema.get('$defs', {}))
64
+ nested_schema = get_nested_schema(ref_schema, depth + 1)
65
+ nested_props[name] = {
66
+ 'type': 'object',
67
+ 'description': get_enum_description(details),
68
+ 'properties': nested_schema
69
+ }
70
+ else:
71
+ nested_props[name] = {
72
+ 'type': details.get('type', 'string'),
73
+ 'description': get_enum_description(details)
74
+ }
75
+ return nested_props
76
+
77
+ for name, details in schema.get('properties', {}).items():
78
+ # Handle arrays
79
+ if details.get('type') == 'array':
80
+ items = details.get('items', {})
81
+ if '$ref' in items:
82
+ items = resolve_ref(items['$ref'], schema.get('$defs', {}))
83
+
84
+ # Get nested schema for array items
85
+ item_schema = get_nested_schema(items, current_depth)
86
+ description = get_enum_description(details)
87
+
88
+ if item_schema:
89
+ description = f"{description}\nList items schema:\n{json.dumps(item_schema, indent=2)}"
90
+
91
+ prop = Property(
92
+ name=name,
93
+ prop_type='list',
94
+ description=description
95
+ )
96
+ # Handle objects and references
97
+ elif details.get('type') == 'object' or '$ref' in details:
98
+ prop_type = 'object'
99
+ ref_schema = details
100
+ if '$ref' in details:
101
+ ref_schema = resolve_ref(details['$ref'], schema.get('$defs', {}))
102
+
103
+ nested_schema = get_nested_schema(ref_schema, current_depth)
104
+
105
+ prop = Property(
106
+ name=name,
107
+ prop_type=prop_type,
108
+ description=get_enum_description(details),
109
+ properties=nested_schema
110
+ )
111
+
112
+ # Handle primitive types
113
+ else:
114
+ prop = Property(
115
+ name=name,
116
+ prop_type=details.get('type', 'string'),
117
+ description=get_enum_description(details),
118
+ default=str(details.get('default')) if details.get('default') is not None else None
119
+ )
120
+
121
+ properties.append(prop)
122
+
123
+ json_schema = JsonSchema(
124
+ title=schema.get('title', model.__name__),
125
+ properties=properties
126
+ )
127
+
128
+ return json_schema.model_dump(mode="json", exclude_none=True)
@@ -0,0 +1,61 @@
1
+ from .config import Configuration
2
+ from .misc import prepare_upload_data
3
+ from .task_base import TaskBase
4
+ import time
5
+
6
+ class TaskResponse(TaskBase):
7
+ def _poll_request(self) -> dict:
8
+ while True:
9
+ try:
10
+ r = self._client._session.get(self.task_url, headers=self._client._headers())
11
+ r.raise_for_status()
12
+ return r.json()
13
+ except (ConnectionError, TimeoutError) as _:
14
+ print("Connection error while polling the task, retrying...")
15
+ time.sleep(0.5)
16
+ except Exception as e:
17
+ raise
18
+
19
+ def poll(self) -> 'TaskResponse':
20
+ if not self.task_url:
21
+ raise ValueError("Task URL not found in response")
22
+ while True:
23
+ response = self._poll_request_sync()
24
+ updated_task = TaskResponse(**response).with_client(self._client)
25
+ self.__dict__.update(updated_task.__dict__)
26
+ if result := self._check_status():
27
+ return result
28
+ time.sleep(0.5)
29
+
30
+ def update(self, config: Configuration) -> 'TaskResponse':
31
+ if not self.task_url:
32
+ raise ValueError("Task URL not found")
33
+ files = prepare_upload_data(None, config)
34
+ r = self._client._session.patch(
35
+ f"{self.task_url}",
36
+ files=files,
37
+ headers=self._client._headers()
38
+ )
39
+ r.raise_for_status()
40
+ updated = TaskResponse(**r.json()).with_client(self._client)
41
+ self.__dict__.update(updated.__dict__)
42
+ return self.poll()
43
+
44
+ def cancel(self):
45
+ if not self.task_url:
46
+ raise ValueError("Task URL not found")
47
+ r = self._client._session.get(
48
+ f"{self.task_url}/cancel",
49
+ headers=self._client._headers()
50
+ )
51
+ r.raise_for_status()
52
+ self.poll()
53
+
54
+ def delete(self):
55
+ if not self.task_url:
56
+ raise ValueError("Task URL not found")
57
+ r = self._client._session.delete(
58
+ self.task_url,
59
+ headers=self._client._headers()
60
+ )
61
+ r.raise_for_status()
@@ -0,0 +1,50 @@
1
+ from .config import Configuration
2
+ from .misc import prepare_upload_data
3
+ from .task_base import TaskBase
4
+ import asyncio
5
+
6
+ class TaskResponseAsync(TaskBase):
7
+ async def _poll_request(self) -> dict:
8
+ try:
9
+ r = await self._client._client.get(self.task_url, headers=self._client._headers())
10
+ r.raise_for_status()
11
+ return r.json()
12
+ except (ConnectionError, TimeoutError) as _:
13
+ print("Connection error while polling the task, retrying...")
14
+ await asyncio.sleep(0.5)
15
+ except Exception as e:
16
+ raise
17
+
18
+ async def poll(self) -> 'TaskResponseAsync':
19
+ if not self.task_url:
20
+ raise ValueError("Task URL not found")
21
+ while True:
22
+ j = await self._poll_request()
23
+ updated = TaskResponseAsync(**j).with_client(self._client)
24
+ self.__dict__.update(updated.__dict__)
25
+ if res := self._check_status():
26
+ return res
27
+ await asyncio.sleep(0.5)
28
+
29
+ async def update(self, config: Configuration) -> 'TaskResponseAsync':
30
+ if not self.task_url:
31
+ raise ValueError("Task URL not found")
32
+ f = prepare_upload_data(None, config)
33
+ r = await self._client._client.patch(self.task_url, files=f, headers=self._client._headers())
34
+ r.raise_for_status()
35
+ updated = TaskResponseAsync(**r.json()).with_client(self._client)
36
+ self.__dict__.update(updated.__dict__)
37
+ return await self.poll()
38
+
39
+ async def cancel(self):
40
+ if not self.task_url:
41
+ raise ValueError("Task URL not found")
42
+ r = await self._client._client.get(f"{self.task_url}/cancel", headers=self._client._headers())
43
+ r.raise_for_status()
44
+ return await self.poll()
45
+
46
+ async def delete(self):
47
+ if not self.task_url:
48
+ raise ValueError("Task URL not found")
49
+ r = await self._client._client.delete(self.task_url, headers=self._client._headers())
50
+ r.raise_for_status()
@@ -0,0 +1,83 @@
1
+ from .config import Configuration
2
+ from .protocol import ChunkrClientProtocol
3
+ from ..models import Status, OutputResponse
4
+ from abc import ABC, abstractmethod
5
+ from typing import TypeVar, Optional, Generic, Union
6
+ from pydantic import BaseModel, PrivateAttr
7
+ from datetime import datetime
8
+
9
+ T = TypeVar('T', bound='TaskBase')
10
+
11
+ class TaskBase(BaseModel, ABC, Generic[T]):
12
+ configuration: Configuration
13
+ created_at: datetime
14
+ expires_at: Optional[datetime]
15
+ file_name: Optional[str]
16
+ finished_at: Optional[datetime]
17
+ input_file_url: Optional[str]
18
+ message: str
19
+ output: Optional[OutputResponse]
20
+ page_count: Optional[int]
21
+ pdf_url: Optional[str]
22
+ started_at: Optional[datetime]
23
+ status: Status
24
+ task_id: str
25
+ task_url: Optional[str]
26
+ _client: Optional[Union[ChunkrClientProtocol]] = PrivateAttr(default=None)
27
+
28
+ @abstractmethod
29
+ def _poll_request(self) -> dict:
30
+ """Helper method to make polling request with retry logic (synchronous)"""
31
+ pass
32
+
33
+ @abstractmethod
34
+ def poll(self) -> T:
35
+ """Poll the task for completion."""
36
+ pass
37
+
38
+ @abstractmethod
39
+ def update(self, config: Configuration) -> T:
40
+ """Update the task configuration."""
41
+ pass
42
+
43
+ @abstractmethod
44
+ def cancel(self) -> T:
45
+ """Cancel the task."""
46
+ pass
47
+
48
+ @abstractmethod
49
+ def delete(self) -> T:
50
+ """Delete the task."""
51
+ pass
52
+
53
+ def with_client(self, client: Union[ChunkrClientProtocol]) -> T:
54
+ self._client = client
55
+ return self
56
+
57
+ def _check_status(self) -> Optional[T]:
58
+ """Helper method to check task status and handle completion/failure"""
59
+ if self.status == "Failed":
60
+ raise ValueError(self.message)
61
+ if self.status not in ("Starting", "Processing"):
62
+ return self
63
+ return None
64
+
65
+ def html(self) -> str:
66
+ return self._get_content("html")
67
+
68
+ def markdown(self) -> str:
69
+ return self._get_content("markdown")
70
+
71
+ def content(self) -> str:
72
+ return self._get_content("content")
73
+
74
+ def _get_content(self, t: str) -> str:
75
+ if not self.output:
76
+ return ""
77
+ parts = []
78
+ for c in self.output.chunks:
79
+ for s in c.segments:
80
+ v = getattr(s, t)
81
+ if v:
82
+ parts.append(v)
83
+ return "\n".join(parts)
@@ -17,9 +17,11 @@ from .api.config import (
17
17
  SegmentProcessing,
18
18
  SegmentType,
19
19
  SegmentationStrategy,
20
+ Status,
20
21
  )
21
22
 
22
- from .api.task import TaskResponse, Status
23
+ from .api.task import TaskResponse
24
+ from .api.task_async import TaskResponseAsync
23
25
 
24
26
  __all__ = [
25
27
  'BoundingBox',
@@ -42,5 +44,6 @@ __all__ = [
42
44
  'SegmentType',
43
45
  'SegmentationStrategy',
44
46
  'Status',
45
- 'TaskResponse'
47
+ 'TaskResponse',
48
+ 'TaskResponseAsync',
46
49
  ]
@@ -1,12 +1,13 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: chunkr-ai
3
- Version: 0.0.10
3
+ Version: 0.0.12
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  Project-URL: Homepage, https://chunkr.ai
7
7
  Description-Content-Type: text/markdown
8
8
  License-File: LICENSE
9
9
  Requires-Dist: httpx>=0.25.0
10
+ Requires-Dist: httpx>=0.25.0
10
11
  Requires-Dist: pillow>=10.0.0
11
12
  Requires-Dist: pydantic>=2.0.0
12
13
  Requires-Dist: pytest-asyncio>=0.21.0
@@ -2,7 +2,6 @@ LICENSE
2
2
  README.md
3
3
  pyproject.toml
4
4
  src/chunkr_ai/__init__.py
5
- src/chunkr_ai/main.py
6
5
  src/chunkr_ai/models.py
7
6
  src/chunkr_ai.egg-info/PKG-INFO
8
7
  src/chunkr_ai.egg-info/SOURCES.txt
@@ -17,6 +16,7 @@ src/chunkr_ai/api/chunkr_base.py
17
16
  src/chunkr_ai/api/config.py
18
17
  src/chunkr_ai/api/misc.py
19
18
  src/chunkr_ai/api/protocol.py
19
+ src/chunkr_ai/api/schema.py
20
20
  src/chunkr_ai/api/task.py
21
21
  src/chunkr_ai/api/task_async.py
22
22
  src/chunkr_ai/api/task_base.py
@@ -1,4 +1,5 @@
1
1
  httpx>=0.25.0
2
+ httpx>=0.25.0
2
3
  pillow>=10.0.0
3
4
  pydantic>=2.0.0
4
5
  pytest-asyncio>=0.21.0
@@ -1,176 +0,0 @@
1
- from .protocol import ChunkrClientProtocol
2
- from .config import Configuration, OutputResponse
3
- from .misc import prepare_upload_data
4
- import asyncio
5
- from datetime import datetime
6
- from enum import Enum
7
- from pydantic import BaseModel, PrivateAttr
8
- import time
9
- from typing import Optional, Union
10
-
11
- class Status(str, Enum):
12
- STARTING = "Starting"
13
- PROCESSING = "Processing"
14
- SUCCEEDED = "Succeeded"
15
- FAILED = "Failed"
16
- CANCELLED = "Cancelled"
17
-
18
- class TaskResponse(BaseModel):
19
- configuration: Configuration
20
- created_at: datetime
21
- expires_at: Optional[datetime] = None
22
- file_name: Optional[str] = None
23
- finished_at: Optional[datetime] = None
24
- input_file_url: Optional[str] = None
25
- message: str
26
- output: Optional[OutputResponse] = None
27
- page_count: Optional[int] = None
28
- pdf_url: Optional[str] = None
29
- started_at: Optional[datetime] = None
30
- status: Status
31
- task_id: str
32
- task_url: Optional[str] = None
33
- _client: Optional[Union[ChunkrClientProtocol]] = PrivateAttr(default=None)
34
-
35
- def with_client(self, client: Union[ChunkrClientProtocol]) -> 'TaskResponse':
36
- self._client = client
37
- return self
38
-
39
- def _poll_request_sync(self) -> dict:
40
- """Helper method to make polling request with retry logic (synchronous)"""
41
- if not self.task_url:
42
- raise ValueError("Task URL not found in response")
43
-
44
- while True:
45
- try:
46
- r = self._client._session.get(self.task_url, headers=self._client._headers())
47
- r.raise_for_status()
48
- return r.json()
49
- except (ConnectionError, TimeoutError) as _:
50
- print("Connection error while polling the task, retrying...")
51
- time.sleep(0.5)
52
- except Exception as e:
53
- raise
54
-
55
- async def _poll_request_async(self) -> dict:
56
- """Helper method to make polling request with retry logic (asynchronous)"""
57
- if not self.task_url:
58
- raise ValueError("Task URL not found in response")
59
-
60
- while True:
61
- try:
62
- r = await self._client._client.get(self.task_url, headers=self._client._headers())
63
- r.raise_for_status()
64
- response = r.json()
65
- return response
66
- except (ConnectionError, TimeoutError) as _:
67
- print("Connection error while polling the task, retrying...")
68
- await asyncio.sleep(0.5)
69
- except Exception as e:
70
- raise
71
-
72
- def _check_status(self) -> Optional['TaskResponse']:
73
- """Helper method to check task status and handle completion/failure"""
74
- if self.status == "Failed":
75
- raise ValueError(self.message)
76
- if self.status not in ("Starting", "Processing"):
77
- return self
78
- return None
79
-
80
- def poll(self) -> 'TaskResponse':
81
- """Poll the task for completion."""
82
- while True:
83
- response = self._poll_request_sync()
84
- updated_task = TaskResponse(**response).with_client(self._client)
85
- self.__dict__.update(updated_task.__dict__)
86
-
87
- if result := self._check_status():
88
- return result
89
-
90
- time.sleep(0.5)
91
-
92
- async def poll_async(self) -> 'TaskResponse':
93
- """Poll the task for completion asynchronously."""
94
- while True:
95
- response = await self._poll_request_async()
96
- updated_task = TaskResponse(**response).with_client(self._client)
97
- self.__dict__.update(updated_task.__dict__)
98
-
99
- if result := self._check_status():
100
- return result
101
-
102
- await asyncio.sleep(0.5)
103
-
104
- def _get_content(self, content_type: str) -> str:
105
- """Helper method to get either HTML, Markdown, or raw content."""
106
- if not self.output:
107
- return ""
108
- parts = []
109
- for c in self.output.chunks:
110
- for s in c.segments:
111
- content = getattr(s, content_type)
112
- if content:
113
- parts.append(content)
114
- return "\n".join(parts)
115
-
116
- def update(self, config: Configuration) -> 'TaskResponse':
117
- files = prepare_upload_data(None, config)
118
- r = self._client._session.patch(
119
- f"{self.task_url}",
120
- files=files,
121
- headers=self._client._headers()
122
- )
123
- r.raise_for_status()
124
- return TaskResponse(**r.json()).with_client(self._client)
125
-
126
- async def update_async(self, config: Configuration) -> 'TaskResponse':
127
- files = prepare_upload_data(None, config)
128
- r = await self._client._client.patch(
129
- f"{self.task_url}",
130
- files=files,
131
- headers=self._client._headers()
132
- )
133
- r.raise_for_status()
134
- return TaskResponse(**r.json()).with_client(self._client)
135
-
136
- def cancel(self):
137
- r = self._client._session.get(
138
- f"{self.task_url}/cancel",
139
- headers=self._client._headers()
140
- )
141
- r.raise_for_status()
142
- self.poll()
143
-
144
- async def cancel_async(self):
145
- r = await self._client._client.get(
146
- f"{self.task_url}/cancel",
147
- headers=self._client._headers()
148
- )
149
- r.raise_for_status()
150
- await self.poll_async()
151
-
152
- def delete(self):
153
- r = self._client._session.delete(
154
- f"{self.task_url}",
155
- headers=self._client._headers()
156
- )
157
- r.raise_for_status()
158
-
159
- async def delete_async(self):
160
- r = await self._client._client.delete(
161
- f"{self.task_url}",
162
- headers=self._client._headers()
163
- )
164
- r.raise_for_status()
165
-
166
- def html(self) -> str:
167
- """Get full HTML for the task"""
168
- return self._get_content("html")
169
-
170
- def markdown(self) -> str:
171
- """Get full markdown for the task"""
172
- return self._get_content("markdown")
173
-
174
- def content(self) -> str:
175
- """Get full text for the task"""
176
- return self._get_content("content")
@@ -1,111 +0,0 @@
1
- import asyncio
2
- from pydantic import BaseModel, PrivateAttr
3
- from datetime import datetime
4
- from enum import Enum
5
- from typing import Optional, Union
6
- from .task_base import TaskBase
7
- from .protocol import ChunkrClientProtocol
8
- from .config import Configuration, OutputResponse
9
- from .misc import prepare_upload_data
10
-
11
- class Status(str, Enum):
12
- STARTING = "Starting"
13
- PROCESSING = "Processing"
14
- SUCCEEDED = "Succeeded"
15
- FAILED = "Failed"
16
- CANCELLED = "Cancelled"
17
-
18
- class TaskResponseAsync(BaseModel, TaskBase):
19
- configuration: Configuration
20
- created_at: datetime
21
- expires_at: Optional[datetime]
22
- file_name: Optional[str]
23
- finished_at: Optional[datetime]
24
- input_file_url: Optional[str]
25
- message: str
26
- output: Optional[OutputResponse]
27
- page_count: Optional[int]
28
- pdf_url: Optional[str]
29
- started_at: Optional[datetime]
30
- status: Status
31
- task_id: str
32
- task_url: Optional[str]
33
- _client: Optional[Union[ChunkrClientProtocol]] = PrivateAttr(default=None)
34
-
35
- def with_client(self, client: Union[ChunkrClientProtocol]) -> 'TaskResponseAsync':
36
- self._client = client
37
- return self
38
-
39
- async def poll(self) -> 'TaskResponseAsync':
40
- while True:
41
- j = await self._poll_request()
42
- updated = TaskResponseAsync(**j).with_client(self._client)
43
- self.__dict__.update(updated.__dict__)
44
- if res := self._check_status():
45
- return res
46
- await asyncio.sleep(0.5)
47
-
48
- async def _poll_request(self) -> dict:
49
- if not self.task_url:
50
- raise ValueError("Task URL not found")
51
- while True:
52
- try:
53
- r = await self._client._client.get(self.task_url, headers=self._client._headers())
54
- r.raise_for_status()
55
- return r.json()
56
- except Exception as e:
57
- if self.status == Status.FAILED:
58
- raise ValueError(self.message) from e
59
- await asyncio.sleep(0.5)
60
-
61
- def _check_status(self) -> Optional['TaskResponseAsync']:
62
- if self.status == Status.FAILED:
63
- raise ValueError(f"Task failed: {self.message}")
64
- if self.status == Status.CANCELLED:
65
- return self
66
- if self.status not in [Status.STARTING, Status.PROCESSING]:
67
- return self
68
- return None
69
-
70
- async def update(self, config: Configuration) -> 'TaskResponseAsync':
71
- if not self.task_url:
72
- raise ValueError("Task URL not found")
73
- f = prepare_upload_data(None, config)
74
- r = await self._client._client.patch(self.task_url, files=f, headers=self._client._headers())
75
- r.raise_for_status()
76
- updated = TaskResponseAsync(**r.json()).with_client(self._client)
77
- self.__dict__.update(updated.__dict__)
78
- return await self.poll()
79
-
80
- async def cancel(self):
81
- if not self.task_url:
82
- raise ValueError("Task URL not found")
83
- r = await self._client._client.get(f"{self.task_url}/cancel", headers=self._client._headers())
84
- r.raise_for_status()
85
- return await self.poll()
86
-
87
- async def delete(self):
88
- r = await self._client._client.delete(self.task_url, headers=self._client._headers())
89
- r.raise_for_status()
90
-
91
- def html(self) -> str:
92
- return self._get_content("html")
93
-
94
- def markdown(self) -> str:
95
- return self._get_content("markdown")
96
-
97
- def content(self) -> str:
98
- return self._get_content("content")
99
-
100
- def _get_content(self, t: str) -> str:
101
- if not self.output:
102
- return ""
103
- parts = []
104
- for c in self.output.chunks:
105
- for s in c.segments:
106
- v = getattr(s, t)
107
- if v:
108
- parts.append(v)
109
- return "\n".join(parts)
110
-
111
- # Satisfying TaskBase abstract methods with stubs
@@ -1,31 +0,0 @@
1
- from abc import ABC, abstractmethod
2
- from .config import Configuration
3
-
4
- class TaskBase(ABC):
5
- @abstractmethod
6
- def poll(self):
7
- pass
8
-
9
- @abstractmethod
10
- def update(self, config: Configuration):
11
- pass
12
-
13
- @abstractmethod
14
- def cancel(self):
15
- pass
16
-
17
- @abstractmethod
18
- def delete(self):
19
- pass
20
-
21
- @abstractmethod
22
- def html(self) -> str:
23
- pass
24
-
25
- @abstractmethod
26
- def markdown(self) -> str:
27
- pass
28
-
29
- @abstractmethod
30
- def content(self) -> str:
31
- pass
@@ -1,12 +0,0 @@
1
- from chunkr_ai.api.chunkr import Chunkr
2
- from chunkr_ai.models import Configuration
3
- from chunkr_ai.api.config import SegmentationStrategy, ChunkProcessing
4
-
5
- if __name__ == "__main__":
6
- chunkr = Chunkr()
7
- task = chunkr.update_task("556b4fe5-e3f7-48dc-9f56-0fb7fbacdb87", Configuration(
8
- chunk_processing=ChunkProcessing(
9
- target_length=1000
10
- )
11
- ))
12
- print(task)
File without changes
File without changes
File without changes