chunkr-ai 0.0.11__py3-none-any.whl → 0.0.12__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
chunkr_ai/api/config.py CHANGED
@@ -142,4 +142,4 @@ class Status(str, Enum):
142
142
  PROCESSING = "Processing"
143
143
  SUCCEEDED = "Succeeded"
144
144
  FAILED = "Failed"
145
- CANCELLED = "Cancelled"
145
+ CANCELLED = "Cancelled"
chunkr_ai/api/task.py CHANGED
@@ -1,38 +1,10 @@
1
- from .protocol import ChunkrClientProtocol
2
- from .config import Configuration, OutputResponse, Status
1
+ from .config import Configuration
3
2
  from .misc import prepare_upload_data
4
- import asyncio
5
- from datetime import datetime
6
- from pydantic import BaseModel, PrivateAttr
3
+ from .task_base import TaskBase
7
4
  import time
8
- from typing import Optional, Union
9
-
10
- class TaskResponse(BaseModel):
11
- configuration: Configuration
12
- created_at: datetime
13
- expires_at: Optional[datetime] = None
14
- file_name: Optional[str] = None
15
- finished_at: Optional[datetime] = None
16
- input_file_url: Optional[str] = None
17
- message: str
18
- output: Optional[OutputResponse] = None
19
- page_count: Optional[int] = None
20
- pdf_url: Optional[str] = None
21
- started_at: Optional[datetime] = None
22
- status: Status
23
- task_id: str
24
- task_url: Optional[str] = None
25
- _client: Optional[Union[ChunkrClientProtocol]] = PrivateAttr(default=None)
26
-
27
- def with_client(self, client: Union[ChunkrClientProtocol]) -> 'TaskResponse':
28
- self._client = client
29
- return self
30
-
31
- def _poll_request_sync(self) -> dict:
32
- """Helper method to make polling request with retry logic (synchronous)"""
33
- if not self.task_url:
34
- raise ValueError("Task URL not found in response")
35
5
 
6
+ class TaskResponse(TaskBase):
7
+ def _poll_request(self) -> dict:
36
8
  while True:
37
9
  try:
38
10
  r = self._client._session.get(self.task_url, headers=self._client._headers())
@@ -44,68 +16,20 @@ class TaskResponse(BaseModel):
44
16
  except Exception as e:
45
17
  raise
46
18
 
47
- async def _poll_request_async(self) -> dict:
48
- """Helper method to make polling request with retry logic (asynchronous)"""
19
+ def poll(self) -> 'TaskResponse':
49
20
  if not self.task_url:
50
21
  raise ValueError("Task URL not found in response")
51
-
52
- while True:
53
- try:
54
- r = await self._client._client.get(self.task_url, headers=self._client._headers())
55
- r.raise_for_status()
56
- response = r.json()
57
- return response
58
- except (ConnectionError, TimeoutError) as _:
59
- print("Connection error while polling the task, retrying...")
60
- await asyncio.sleep(0.5)
61
- except Exception as e:
62
- raise
63
-
64
- def _check_status(self) -> Optional['TaskResponse']:
65
- """Helper method to check task status and handle completion/failure"""
66
- if self.status == "Failed":
67
- raise ValueError(self.message)
68
- if self.status not in ("Starting", "Processing"):
69
- return self
70
- return None
71
-
72
- def poll(self) -> 'TaskResponse':
73
- """Poll the task for completion."""
74
22
  while True:
75
23
  response = self._poll_request_sync()
76
24
  updated_task = TaskResponse(**response).with_client(self._client)
77
25
  self.__dict__.update(updated_task.__dict__)
78
-
79
26
  if result := self._check_status():
80
27
  return result
81
-
82
28
  time.sleep(0.5)
83
-
84
- async def poll_async(self) -> 'TaskResponse':
85
- """Poll the task for completion asynchronously."""
86
- while True:
87
- response = await self._poll_request_async()
88
- updated_task = TaskResponse(**response).with_client(self._client)
89
- self.__dict__.update(updated_task.__dict__)
90
-
91
- if result := self._check_status():
92
- return result
93
-
94
- await asyncio.sleep(0.5)
95
-
96
- def _get_content(self, content_type: str) -> str:
97
- """Helper method to get either HTML, Markdown, or raw content."""
98
- if not self.output:
99
- return ""
100
- parts = []
101
- for c in self.output.chunks:
102
- for s in c.segments:
103
- content = getattr(s, content_type)
104
- if content:
105
- parts.append(content)
106
- return "\n".join(parts)
107
29
 
108
30
  def update(self, config: Configuration) -> 'TaskResponse':
31
+ if not self.task_url:
32
+ raise ValueError("Task URL not found")
109
33
  files = prepare_upload_data(None, config)
110
34
  r = self._client._session.patch(
111
35
  f"{self.task_url}",
@@ -113,56 +37,25 @@ class TaskResponse(BaseModel):
113
37
  headers=self._client._headers()
114
38
  )
115
39
  r.raise_for_status()
116
- return TaskResponse(**r.json()).with_client(self._client)
117
-
118
- async def update_async(self, config: Configuration) -> 'TaskResponse':
119
- files = prepare_upload_data(None, config)
120
- r = await self._client._client.patch(
121
- f"{self.task_url}",
122
- files=files,
123
- headers=self._client._headers()
124
- )
125
- r.raise_for_status()
126
- return TaskResponse(**r.json()).with_client(self._client)
40
+ updated = TaskResponse(**r.json()).with_client(self._client)
41
+ self.__dict__.update(updated.__dict__)
42
+ return self.poll()
127
43
 
128
44
  def cancel(self):
45
+ if not self.task_url:
46
+ raise ValueError("Task URL not found")
129
47
  r = self._client._session.get(
130
48
  f"{self.task_url}/cancel",
131
49
  headers=self._client._headers()
132
50
  )
133
51
  r.raise_for_status()
134
52
  self.poll()
135
-
136
- async def cancel_async(self):
137
- r = await self._client._client.get(
138
- f"{self.task_url}/cancel",
139
- headers=self._client._headers()
140
- )
141
- r.raise_for_status()
142
- await self.poll_async()
143
53
 
144
54
  def delete(self):
55
+ if not self.task_url:
56
+ raise ValueError("Task URL not found")
145
57
  r = self._client._session.delete(
146
- f"{self.task_url}",
58
+ self.task_url,
147
59
  headers=self._client._headers()
148
60
  )
149
61
  r.raise_for_status()
150
-
151
- async def delete_async(self):
152
- r = await self._client._client.delete(
153
- f"{self.task_url}",
154
- headers=self._client._headers()
155
- )
156
- r.raise_for_status()
157
-
158
- def html(self) -> str:
159
- """Get full HTML for the task"""
160
- return self._get_content("html")
161
-
162
- def markdown(self) -> str:
163
- """Get full markdown for the task"""
164
- return self._get_content("markdown")
165
-
166
- def content(self) -> str:
167
- """Get full text for the task"""
168
- return self._get_content("content")
chunkr_ai/api/task_async.py CHANGED
@@ -1,34 +1,23 @@
1
- import asyncio
2
- from pydantic import BaseModel, PrivateAttr
3
- from datetime import datetime
4
- from typing import Optional, Union
5
- from .task_base import TaskBase
6
- from .protocol import ChunkrClientProtocol
7
- from .config import Configuration, OutputResponse, Status
1
+ from .config import Configuration
8
2
  from .misc import prepare_upload_data
3
+ from .task_base import TaskBase
4
+ import asyncio
9
5
 
10
- class TaskResponseAsync(BaseModel, TaskBase):
11
- configuration: Configuration
12
- created_at: datetime
13
- expires_at: Optional[datetime]
14
- file_name: Optional[str]
15
- finished_at: Optional[datetime]
16
- input_file_url: Optional[str]
17
- message: str
18
- output: Optional[OutputResponse]
19
- page_count: Optional[int]
20
- pdf_url: Optional[str]
21
- started_at: Optional[datetime]
22
- status: Status
23
- task_id: str
24
- task_url: Optional[str]
25
- _client: Optional[Union[ChunkrClientProtocol]] = PrivateAttr(default=None)
26
-
27
- def with_client(self, client: Union[ChunkrClientProtocol]) -> 'TaskResponseAsync':
28
- self._client = client
29
- return self
6
+ class TaskResponseAsync(TaskBase):
7
+ async def _poll_request(self) -> dict:
8
+ try:
9
+ r = await self._client._client.get(self.task_url, headers=self._client._headers())
10
+ r.raise_for_status()
11
+ return r.json()
12
+ except (ConnectionError, TimeoutError) as _:
13
+ print("Connection error while polling the task, retrying...")
14
+ await asyncio.sleep(0.5)
15
+ except Exception as e:
16
+ raise
30
17
 
31
18
  async def poll(self) -> 'TaskResponseAsync':
19
+ if not self.task_url:
20
+ raise ValueError("Task URL not found")
32
21
  while True:
33
22
  j = await self._poll_request()
34
23
  updated = TaskResponseAsync(**j).with_client(self._client)
@@ -37,28 +26,6 @@ class TaskResponseAsync(BaseModel, TaskBase):
37
26
  return res
38
27
  await asyncio.sleep(0.5)
39
28
 
40
- async def _poll_request(self) -> dict:
41
- if not self.task_url:
42
- raise ValueError("Task URL not found")
43
- while True:
44
- try:
45
- r = await self._client._client.get(self.task_url, headers=self._client._headers())
46
- r.raise_for_status()
47
- return r.json()
48
- except Exception as e:
49
- if self.status == Status.FAILED:
50
- raise ValueError(self.message) from e
51
- await asyncio.sleep(0.5)
52
-
53
- def _check_status(self) -> Optional['TaskResponseAsync']:
54
- if self.status == Status.FAILED:
55
- raise ValueError(f"Task failed: {self.message}")
56
- if self.status == Status.CANCELLED:
57
- return self
58
- if self.status not in [Status.STARTING, Status.PROCESSING]:
59
- return self
60
- return None
61
-
62
29
  async def update(self, config: Configuration) -> 'TaskResponseAsync':
63
30
  if not self.task_url:
64
31
  raise ValueError("Task URL not found")
@@ -77,27 +44,7 @@ class TaskResponseAsync(BaseModel, TaskBase):
77
44
  return await self.poll()
78
45
 
79
46
  async def delete(self):
47
+ if not self.task_url:
48
+ raise ValueError("Task URL not found")
80
49
  r = await self._client._client.delete(self.task_url, headers=self._client._headers())
81
- r.raise_for_status()
82
-
83
- def html(self) -> str:
84
- return self._get_content("html")
85
-
86
- def markdown(self) -> str:
87
- return self._get_content("markdown")
88
-
89
- def content(self) -> str:
90
- return self._get_content("content")
91
-
92
- def _get_content(self, t: str) -> str:
93
- if not self.output:
94
- return ""
95
- parts = []
96
- for c in self.output.chunks:
97
- for s in c.segments:
98
- v = getattr(s, t)
99
- if v:
100
- parts.append(v)
101
- return "\n".join(parts)
102
-
103
- # Satisfying TaskBase abstract methods with stubs
50
+ r.raise_for_status()
chunkr_ai/api/task_base.py CHANGED
@@ -1,31 +1,83 @@
1
- from abc import ABC, abstractmethod
2
1
  from .config import Configuration
2
+ from .protocol import ChunkrClientProtocol
3
+ from ..models import Status, OutputResponse
4
+ from abc import ABC, abstractmethod
5
+ from typing import TypeVar, Optional, Generic, Union
6
+ from pydantic import BaseModel, PrivateAttr
7
+ from datetime import datetime
8
+
9
+ T = TypeVar('T', bound='TaskBase')
10
+
11
+ class TaskBase(BaseModel, ABC, Generic[T]):
12
+ configuration: Configuration
13
+ created_at: datetime
14
+ expires_at: Optional[datetime]
15
+ file_name: Optional[str]
16
+ finished_at: Optional[datetime]
17
+ input_file_url: Optional[str]
18
+ message: str
19
+ output: Optional[OutputResponse]
20
+ page_count: Optional[int]
21
+ pdf_url: Optional[str]
22
+ started_at: Optional[datetime]
23
+ status: Status
24
+ task_id: str
25
+ task_url: Optional[str]
26
+ _client: Optional[Union[ChunkrClientProtocol]] = PrivateAttr(default=None)
3
27
 
4
- class TaskBase(ABC):
5
28
  @abstractmethod
6
- def poll(self):
29
+ def _poll_request(self) -> dict:
30
+ """Helper method to make polling request with retry logic (synchronous)"""
7
31
  pass
8
32
 
9
33
  @abstractmethod
10
- def update(self, config: Configuration):
34
+ def poll(self) -> T:
35
+ """Poll the task for completion."""
11
36
  pass
12
37
 
13
38
  @abstractmethod
14
- def cancel(self):
39
+ def update(self, config: Configuration) -> T:
40
+ """Update the task configuration."""
15
41
  pass
16
42
 
17
43
  @abstractmethod
18
- def delete(self):
44
+ def cancel(self) -> T:
45
+ """Cancel the task."""
19
46
  pass
20
47
 
21
48
  @abstractmethod
22
- def html(self) -> str:
49
+ def delete(self) -> T:
50
+ """Delete the task."""
23
51
  pass
24
52
 
25
- @abstractmethod
53
+ def with_client(self, client: Union[ChunkrClientProtocol]) -> T:
54
+ self._client = client
55
+ return self
56
+
57
+ def _check_status(self) -> Optional[T]:
58
+ """Helper method to check task status and handle completion/failure"""
59
+ if self.status == "Failed":
60
+ raise ValueError(self.message)
61
+ if self.status not in ("Starting", "Processing"):
62
+ return self
63
+ return None
64
+
65
+ def html(self) -> str:
66
+ return self._get_content("html")
67
+
26
68
  def markdown(self) -> str:
27
- pass
69
+ return self._get_content("markdown")
28
70
 
29
- @abstractmethod
30
71
  def content(self) -> str:
31
- pass
72
+ return self._get_content("content")
73
+
74
+ def _get_content(self, t: str) -> str:
75
+ if not self.output:
76
+ return ""
77
+ parts = []
78
+ for c in self.output.chunks:
79
+ for s in c.segments:
80
+ v = getattr(s, t)
81
+ if v:
82
+ parts.append(v)
83
+ return "\n".join(parts)
chunkr_ai-0.0.11.dist-info/METADATA → chunkr_ai-0.0.12.dist-info/METADATA CHANGED
@@ -1,12 +1,13 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: chunkr-ai
3
- Version: 0.0.11
3
+ Version: 0.0.12
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  Project-URL: Homepage, https://chunkr.ai
7
7
  Description-Content-Type: text/markdown
8
8
  License-File: LICENSE
9
9
  Requires-Dist: httpx>=0.25.0
10
+ Requires-Dist: httpx>=0.25.0
10
11
  Requires-Dist: pillow>=10.0.0
11
12
  Requires-Dist: pydantic>=2.0.0
12
13
  Requires-Dist: pytest-asyncio>=0.21.0
chunkr_ai-0.0.11.dist-info/RECORD → chunkr_ai-0.0.12.dist-info/RECORD CHANGED
@@ -5,15 +5,15 @@ chunkr_ai/api/auth.py,sha256=iSd5Jek2BFaHGw9HY-RrqgwP56BHFU0xbSuJS4fU6AA,425
5
5
  chunkr_ai/api/chunkr.py,sha256=0qpV9b1hOpDhA9EuKkXW9X_laUmw5NY3ZYq0cUOTbww,5190
6
6
  chunkr_ai/api/chunkr_async.py,sha256=ZkLBrn4cqzu3sqMfS8cfZZgSvpdyQuWZP95lfGxuHx0,4900
7
7
  chunkr_ai/api/chunkr_base.py,sha256=IYO0pmoL02GchIggj6_Q5nvtAUoOvYAAvT7VLFU6scY,2506
8
- chunkr_ai/api/config.py,sha256=y6wZz01ihRJ_5_cK_JklFWn397yll7jfXntd8bBBa5s,4861
8
+ chunkr_ai/api/config.py,sha256=joTn7jiOlJXTwwza-jHauLV-39CMzaxZVGB9JBm8Cok,4862
9
9
  chunkr_ai/api/misc.py,sha256=9vnfrbJ7sFlZqwEIQ4NTMb5rhPOmETT7e1jR-b42PXM,4977
10
10
  chunkr_ai/api/protocol.py,sha256=XKS9RmtvBpJItYhPg18qlOCKpaSHdOuQTRSUxAdUz2g,479
11
11
  chunkr_ai/api/schema.py,sha256=OeLOhBRXeRBgEImg0Q6O9Z10ojT6aSEVvwnDR8UeENo,4971
12
- chunkr_ai/api/task.py,sha256=Z5Da_Ijvih5rBz5ry98oAYNcJEDbQhhDWBQ35nHCRK4,5881
13
- chunkr_ai/api/task_async.py,sha256=o7tXvViIrdcrdclxaGzxrgIv-n-W8-twQ7XsDLXfXhM,3659
14
- chunkr_ai/api/task_base.py,sha256=Tkk7dhIeB3ic5M9g_b-MVRdNv4XQTvajpaUy8JylQ8A,526
15
- chunkr_ai-0.0.11.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
- chunkr_ai-0.0.11.dist-info/METADATA,sha256=s8UeXDnBDVG_1RN5colcJCGhwrICRy9VMQWmTUKVRJc,4845
17
- chunkr_ai-0.0.11.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
18
- chunkr_ai-0.0.11.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
19
- chunkr_ai-0.0.11.dist-info/RECORD,,
12
+ chunkr_ai/api/task.py,sha256=4insrdGEVzBHs1ejZvde8bbEetVzgJELa47UjhfBqCA,2116
13
+ chunkr_ai/api/task_async.py,sha256=LqS-LL-mCOgfGsgvuSXhKkSEUM6MMro-EZHl_ZedQQk,1998
14
+ chunkr_ai/api/task_base.py,sha256=iS5UVIDEPIiDoWrn21Oh_dQurkd_hvKQ8ng32j6sGoA,2369
15
+ chunkr_ai-0.0.12.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
+ chunkr_ai-0.0.12.dist-info/METADATA,sha256=dfo9myRizW2A5W0H6FpIoBzHa4QxmEe3lsedPYhwjXM,4874
17
+ chunkr_ai-0.0.12.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
18
+ chunkr_ai-0.0.12.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
19
+ chunkr_ai-0.0.12.dist-info/RECORD,,