chunkr-ai 0.0.7__py3-none-any.whl → 0.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
chunkr_ai/api/config.py CHANGED
@@ -86,9 +86,9 @@ class Segment(BaseModel):
86
86
  bbox: BoundingBox
87
87
  content: str
88
88
  page_height: float
89
- html: Optional[str]
90
- image: Optional[str]
91
- markdown: Optional[str]
89
+ html: Optional[str] = None
90
+ image: Optional[str] = None
91
+ markdown: Optional[str] = None
92
92
  ocr: List[OCRResult]
93
93
  page_number: int
94
94
  page_width: float
@@ -104,8 +104,8 @@ class ExtractedJson(BaseModel):
104
104
  data: Dict
105
105
 
106
106
  class OutputResponse(BaseModel):
107
- chunks: List[Chunk] = []
108
- extracted_json: Optional[ExtractedJson]
107
+ chunks: List[Chunk]
108
+ extracted_json: Optional[ExtractedJson] = Field(default=None)
109
109
 
110
110
  class Model(str, Enum):
111
111
  FAST = "Fast"
chunkr_ai/api/task.py CHANGED
@@ -18,18 +18,18 @@ class Status(str, Enum):
18
18
  class TaskResponse(BaseModel):
19
19
  configuration: Configuration
20
20
  created_at: datetime
21
- expires_at: Optional[datetime]
22
- file_name: Optional[str]
23
- finished_at: Optional[datetime]
24
- input_file_url: Optional[str]
21
+ expires_at: Optional[datetime] = None
22
+ file_name: Optional[str] = None
23
+ finished_at: Optional[datetime] = None
24
+ input_file_url: Optional[str] = None
25
25
  message: str
26
- output: Optional[OutputResponse]
27
- page_count: Optional[int]
28
- pdf_url: Optional[str]
29
- started_at: Optional[datetime]
26
+ output: Optional[OutputResponse] = None
27
+ page_count: Optional[int] = None
28
+ pdf_url: Optional[str] = None
29
+ started_at: Optional[datetime] = None
30
30
  status: Status
31
31
  task_id: str
32
- task_url: Optional[str]
32
+ task_url: Optional[str] = None
33
33
  _client: Optional[Union[ChunkrClientProtocol]] = PrivateAttr(default=None)
34
34
 
35
35
  def with_client(self, client: Union[ChunkrClientProtocol]) -> 'TaskResponse':
chunkr_ai/api/task_async.py ADDED
@@ -0,0 +1,111 @@
1
+ import asyncio
2
+ from pydantic import BaseModel, PrivateAttr
3
+ from datetime import datetime
4
+ from enum import Enum
5
+ from typing import Optional, Union
6
+ from .task_base import TaskBase
7
+ from .protocol import ChunkrClientProtocol
8
+ from .config import Configuration, OutputResponse
9
+ from .misc import prepare_upload_data
10
+
11
+ class Status(str, Enum):
12
+ STARTING = "Starting"
13
+ PROCESSING = "Processing"
14
+ SUCCEEDED = "Succeeded"
15
+ FAILED = "Failed"
16
+ CANCELLED = "Cancelled"
17
+
18
+ class TaskResponseAsync(BaseModel, TaskBase):
19
+ configuration: Configuration
20
+ created_at: datetime
21
+ expires_at: Optional[datetime]
22
+ file_name: Optional[str]
23
+ finished_at: Optional[datetime]
24
+ input_file_url: Optional[str]
25
+ message: str
26
+ output: Optional[OutputResponse]
27
+ page_count: Optional[int]
28
+ pdf_url: Optional[str]
29
+ started_at: Optional[datetime]
30
+ status: Status
31
+ task_id: str
32
+ task_url: Optional[str]
33
+ _client: Optional[Union[ChunkrClientProtocol]] = PrivateAttr(default=None)
34
+
35
+ def with_client(self, client: Union[ChunkrClientProtocol]) -> 'TaskResponseAsync':
36
+ self._client = client
37
+ return self
38
+
39
+ async def poll(self) -> 'TaskResponseAsync':
40
+ while True:
41
+ j = await self._poll_request()
42
+ updated = TaskResponseAsync(**j).with_client(self._client)
43
+ self.__dict__.update(updated.__dict__)
44
+ if res := self._check_status():
45
+ return res
46
+ await asyncio.sleep(0.5)
47
+
48
+ async def _poll_request(self) -> dict:
49
+ if not self.task_url:
50
+ raise ValueError("Task URL not found")
51
+ while True:
52
+ try:
53
+ r = await self._client._client.get(self.task_url, headers=self._client._headers())
54
+ r.raise_for_status()
55
+ return r.json()
56
+ except Exception as e:
57
+ if self.status == Status.FAILED:
58
+ raise ValueError(self.message) from e
59
+ await asyncio.sleep(0.5)
60
+
61
+ def _check_status(self) -> Optional['TaskResponseAsync']:
62
+ if self.status == Status.FAILED:
63
+ raise ValueError(f"Task failed: {self.message}")
64
+ if self.status == Status.CANCELLED:
65
+ return self
66
+ if self.status not in [Status.STARTING, Status.PROCESSING]:
67
+ return self
68
+ return None
69
+
70
+ async def update(self, config: Configuration) -> 'TaskResponseAsync':
71
+ if not self.task_url:
72
+ raise ValueError("Task URL not found")
73
+ f = prepare_upload_data(None, config)
74
+ r = await self._client._client.patch(self.task_url, files=f, headers=self._client._headers())
75
+ r.raise_for_status()
76
+ updated = TaskResponseAsync(**r.json()).with_client(self._client)
77
+ self.__dict__.update(updated.__dict__)
78
+ return await self.poll()
79
+
80
+ async def cancel(self):
81
+ if not self.task_url:
82
+ raise ValueError("Task URL not found")
83
+ r = await self._client._client.get(f"{self.task_url}/cancel", headers=self._client._headers())
84
+ r.raise_for_status()
85
+ return await self.poll()
86
+
87
+ async def delete(self):
88
+ r = await self._client._client.delete(self.task_url, headers=self._client._headers())
89
+ r.raise_for_status()
90
+
91
+ def html(self) -> str:
92
+ return self._get_content("html")
93
+
94
+ def markdown(self) -> str:
95
+ return self._get_content("markdown")
96
+
97
+ def content(self) -> str:
98
+ return self._get_content("content")
99
+
100
+ def _get_content(self, t: str) -> str:
101
+ if not self.output:
102
+ return ""
103
+ parts = []
104
+ for c in self.output.chunks:
105
+ for s in c.segments:
106
+ v = getattr(s, t)
107
+ if v:
108
+ parts.append(v)
109
+ return "\n".join(parts)
110
+
111
+ # Satisfying TaskBase abstract methods with stubs
chunkr_ai/api/task_base.py ADDED
@@ -0,0 +1,31 @@
1
+ from abc import ABC, abstractmethod
2
+ from .config import Configuration
3
+
4
+ class TaskBase(ABC):
5
+ @abstractmethod
6
+ def poll(self):
7
+ pass
8
+
9
+ @abstractmethod
10
+ def update(self, config: Configuration):
11
+ pass
12
+
13
+ @abstractmethod
14
+ def cancel(self):
15
+ pass
16
+
17
+ @abstractmethod
18
+ def delete(self):
19
+ pass
20
+
21
+ @abstractmethod
22
+ def html(self) -> str:
23
+ pass
24
+
25
+ @abstractmethod
26
+ def markdown(self) -> str:
27
+ pass
28
+
29
+ @abstractmethod
30
+ def content(self) -> str:
31
+ pass
@@ -1,20 +1,20 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: chunkr-ai
3
- Version: 0.0.7
3
+ Version: 0.0.9
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  Project-URL: Homepage, https://chunkr.ai
7
7
  Description-Content-Type: text/markdown
8
8
  License-File: LICENSE
9
- Requires-Dist: httpx>=0.28.1
10
- Requires-Dist: pillow>=11.1.0
11
- Requires-Dist: pydantic>=2.10.4
12
- Requires-Dist: pytest-asyncio>=0.25.2
13
- Requires-Dist: python-dotenv>=1.0.1
14
- Requires-Dist: requests>=2.32.3
9
+ Requires-Dist: httpx>=0.24.0
10
+ Requires-Dist: pillow>=10.0.0
11
+ Requires-Dist: pydantic>=2.0.0
12
+ Requires-Dist: pytest-asyncio>=0.21.0
13
+ Requires-Dist: python-dotenv>=0.19.0
14
+ Requires-Dist: requests>=2.28.0
15
15
  Provides-Extra: test
16
- Requires-Dist: pytest>=8.3.4; extra == "test"
17
- Requires-Dist: pytest-xdist>=3.6.1; extra == "test"
16
+ Requires-Dist: pytest>=7.0.0; extra == "test"
17
+ Requires-Dist: pytest-xdist>=3.0.0; extra == "test"
18
18
 
19
19
  # Chunkr Python Client
20
20
 
@@ -7,12 +7,14 @@ chunkr_ai/api/auth.py,sha256=iSd5Jek2BFaHGw9HY-RrqgwP56BHFU0xbSuJS4fU6AA,425
7
7
  chunkr_ai/api/base.py,sha256=IYO0pmoL02GchIggj6_Q5nvtAUoOvYAAvT7VLFU6scY,2506
8
8
  chunkr_ai/api/chunkr.py,sha256=PmrK37HbK2T1KUPitKnt4wZqIujL61Jo12qW9DEpNMI,5186
9
9
  chunkr_ai/api/chunkr_async.py,sha256=2yYyAO9-j2xKQYH0fJb2S6gL26hgbtL4QyqlG9l0QBY,4893
10
- chunkr_ai/api/config.py,sha256=r5N-noRs4HHZqgT8PsSSe0HTR5gPA5_SEcFm_tHLw0M,4197
10
+ chunkr_ai/api/config.py,sha256=XIqXZ_8q7U_BEmY5wyIC9mbQGZBw1956EN9yhC4svD0,4235
11
11
  chunkr_ai/api/misc.py,sha256=tScsUUcrqeVh_bZv1YlbmjGkQSTDQN8NyKxoNwAG6XA,3792
12
12
  chunkr_ai/api/protocol.py,sha256=XKS9RmtvBpJItYhPg18qlOCKpaSHdOuQTRSUxAdUz2g,479
13
- chunkr_ai/api/task.py,sha256=hFuZoJVEknDpeX8AVNBC6V8fqfOaxmsBa3rO51iWTlA,5996
14
- chunkr_ai-0.0.7.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
- chunkr_ai-0.0.7.dist-info/METADATA,sha256=vlQ1YS8yz_IvMTDx2s7e_dypxM87dFfvZctxAR0FnME,4844
16
- chunkr_ai-0.0.7.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
17
- chunkr_ai-0.0.7.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
18
- chunkr_ai-0.0.7.dist-info/RECORD,,
13
+ chunkr_ai/api/task.py,sha256=EB6RK8ms7EaNj57tNJZoNgNMHGWKXFhkQ1WC7gk5ht4,6059
14
+ chunkr_ai/api/task_async.py,sha256=Dd-Fenie0Q6GxXce7OlXvuQ14NQ58F_0b9P7AGKWyYA,3833
15
+ chunkr_ai/api/task_base.py,sha256=Tkk7dhIeB3ic5M9g_b-MVRdNv4XQTvajpaUy8JylQ8A,526
16
+ chunkr_ai-0.0.9.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
+ chunkr_ai-0.0.9.dist-info/METADATA,sha256=XFGPjuDARO1VYvdcyMOHhxZK1FYjEr0_ySI0Ni6tWMc,4844
18
+ chunkr_ai-0.0.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
19
+ chunkr_ai-0.0.9.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
20
+ chunkr_ai-0.0.9.dist-info/RECORD,,