chunkr-ai 0.0.11__tar.gz → 0.0.12__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
Files changed (28) hide show
  1. {chunkr_ai-0.0.11/src/chunkr_ai.egg-info → chunkr_ai-0.0.12}/PKG-INFO +2 -1
  2. {chunkr_ai-0.0.11 → chunkr_ai-0.0.12}/pyproject.toml +2 -1
  3. {chunkr_ai-0.0.11 → chunkr_ai-0.0.12}/src/chunkr_ai/api/config.py +1 -1
  4. chunkr_ai-0.0.12/src/chunkr_ai/api/task.py +61 -0
  5. chunkr_ai-0.0.12/src/chunkr_ai/api/task_async.py +50 -0
  6. chunkr_ai-0.0.12/src/chunkr_ai/api/task_base.py +83 -0
  7. {chunkr_ai-0.0.11 → chunkr_ai-0.0.12/src/chunkr_ai.egg-info}/PKG-INFO +2 -1
  8. {chunkr_ai-0.0.11 → chunkr_ai-0.0.12}/src/chunkr_ai.egg-info/requires.txt +1 -0
  9. chunkr_ai-0.0.11/src/chunkr_ai/api/task.py +0 -168
  10. chunkr_ai-0.0.11/src/chunkr_ai/api/task_async.py +0 -103
  11. chunkr_ai-0.0.11/src/chunkr_ai/api/task_base.py +0 -31
  12. {chunkr_ai-0.0.11 → chunkr_ai-0.0.12}/LICENSE +0 -0
  13. {chunkr_ai-0.0.11 → chunkr_ai-0.0.12}/README.md +0 -0
  14. {chunkr_ai-0.0.11 → chunkr_ai-0.0.12}/setup.cfg +0 -0
  15. {chunkr_ai-0.0.11 → chunkr_ai-0.0.12}/src/chunkr_ai/__init__.py +0 -0
  16. {chunkr_ai-0.0.11 → chunkr_ai-0.0.12}/src/chunkr_ai/api/__init__.py +0 -0
  17. {chunkr_ai-0.0.11 → chunkr_ai-0.0.12}/src/chunkr_ai/api/auth.py +0 -0
  18. {chunkr_ai-0.0.11 → chunkr_ai-0.0.12}/src/chunkr_ai/api/chunkr.py +0 -0
  19. {chunkr_ai-0.0.11 → chunkr_ai-0.0.12}/src/chunkr_ai/api/chunkr_async.py +0 -0
  20. {chunkr_ai-0.0.11 → chunkr_ai-0.0.12}/src/chunkr_ai/api/chunkr_base.py +0 -0
  21. {chunkr_ai-0.0.11 → chunkr_ai-0.0.12}/src/chunkr_ai/api/misc.py +0 -0
  22. {chunkr_ai-0.0.11 → chunkr_ai-0.0.12}/src/chunkr_ai/api/protocol.py +0 -0
  23. {chunkr_ai-0.0.11 → chunkr_ai-0.0.12}/src/chunkr_ai/api/schema.py +0 -0
  24. {chunkr_ai-0.0.11 → chunkr_ai-0.0.12}/src/chunkr_ai/models.py +0 -0
  25. {chunkr_ai-0.0.11 → chunkr_ai-0.0.12}/src/chunkr_ai.egg-info/SOURCES.txt +0 -0
  26. {chunkr_ai-0.0.11 → chunkr_ai-0.0.12}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
  27. {chunkr_ai-0.0.11 → chunkr_ai-0.0.12}/src/chunkr_ai.egg-info/top_level.txt +0 -0
  28. {chunkr_ai-0.0.11 → chunkr_ai-0.0.12}/tests/test_chunkr.py +0 -0
@@ -1,12 +1,13 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: chunkr-ai
3
- Version: 0.0.11
3
+ Version: 0.0.12
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  Project-URL: Homepage, https://chunkr.ai
7
7
  Description-Content-Type: text/markdown
8
8
  License-File: LICENSE
9
9
  Requires-Dist: httpx>=0.25.0
10
+ Requires-Dist: httpx>=0.25.0
10
11
  Requires-Dist: pillow>=10.0.0
11
12
  Requires-Dist: pydantic>=2.0.0
12
13
  Requires-Dist: pytest-asyncio>=0.21.0
@@ -4,13 +4,14 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "chunkr-ai"
7
- version = "0.0.11"
7
+ version = "0.0.12"
8
8
  authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
9
9
  description = "Python client for Chunkr: open source document intelligence"
10
10
  readme = "README.md"
11
11
  license = {"file" = "LICENSE"}
12
12
  urls = {Homepage = "https://chunkr.ai"}
13
13
  dependencies = [
14
+ "httpx>=0.25.0",
14
15
  "httpx>=0.25.0",
15
16
  "pillow>=10.0.0",
16
17
  "pydantic>=2.0.0",
@@ -142,4 +142,4 @@ class Status(str, Enum):
142
142
  PROCESSING = "Processing"
143
143
  SUCCEEDED = "Succeeded"
144
144
  FAILED = "Failed"
145
- CANCELLED = "Cancelled"
145
+ CANCELLED = "Cancelled"
@@ -0,0 +1,61 @@
1
+ from .config import Configuration
2
+ from .misc import prepare_upload_data
3
+ from .task_base import TaskBase
4
+ import time
5
+
6
class TaskResponse(TaskBase):
    """Synchronous task handle.

    All requests go through the attached client's ``_session`` and
    ``_headers()``; the client is attached with ``with_client()``.
    """

    def _poll_request(self) -> dict:
        """Fetch the current task state, retrying on connection errors.

        Returns:
            The decoded JSON body returned by the task URL.

        Raises:
            Any exception other than ``ConnectionError``/``TimeoutError``
            raised by the request or by ``raise_for_status()`` propagates
            unchanged.
        """
        while True:
            try:
                r = self._client._session.get(self.task_url, headers=self._client._headers())
                r.raise_for_status()
                return r.json()
            except (ConnectionError, TimeoutError):
                # Transient failure: wait briefly and try the same request again.
                print("Connection error while polling the task, retrying...")
                time.sleep(0.5)

    def poll(self) -> 'TaskResponse':
        """Poll until the task leaves the "Starting"/"Processing" states.

        The instance is refreshed in place after every request.

        Returns:
            This task, once ``_check_status()`` reports it as finished.

        Raises:
            ValueError: If the task has no URL, or (via ``_check_status()``)
                if the task status is "Failed".
        """
        if not self.task_url:
            raise ValueError("Task URL not found in response")
        while True:
            # The helper is named ``_poll_request`` in this class; the former
            # ``_poll_request_sync`` name no longer exists.
            response = self._poll_request()
            updated_task = TaskResponse(**response).with_client(self._client)
            self.__dict__.update(updated_task.__dict__)
            if result := self._check_status():
                return result
            time.sleep(0.5)

    def update(self, config: Configuration) -> 'TaskResponse':
        """Send a new configuration for this task and wait for it to finish.

        Args:
            config: Configuration to upload with the PATCH request.

        Returns:
            This task after polling has completed.

        Raises:
            ValueError: If the task has no URL.
        """
        if not self.task_url:
            raise ValueError("Task URL not found")
        files = prepare_upload_data(None, config)
        r = self._client._session.patch(
            f"{self.task_url}",
            files=files,
            headers=self._client._headers(),
        )
        r.raise_for_status()
        updated = TaskResponse(**r.json()).with_client(self._client)
        self.__dict__.update(updated.__dict__)
        return self.poll()

    def cancel(self) -> 'TaskResponse':
        """Request cancellation, then poll until the task settles.

        Returns:
            This task after polling, matching what the async variant returns.

        Raises:
            ValueError: If the task has no URL.
        """
        if not self.task_url:
            raise ValueError("Task URL not found")
        r = self._client._session.get(
            f"{self.task_url}/cancel",
            headers=self._client._headers(),
        )
        r.raise_for_status()
        return self.poll()

    def delete(self):
        """Delete the task on the server.

        Raises:
            ValueError: If the task has no URL.
        """
        if not self.task_url:
            raise ValueError("Task URL not found")
        r = self._client._session.delete(
            self.task_url,
            headers=self._client._headers(),
        )
        r.raise_for_status()
@@ -0,0 +1,50 @@
1
+ from .config import Configuration
2
+ from .misc import prepare_upload_data
3
+ from .task_base import TaskBase
4
+ import asyncio
5
+
6
class TaskResponseAsync(TaskBase):
    """Asynchronous task handle.

    All requests go through the attached client's ``_client`` and
    ``_headers()``; the client is attached with ``with_client()``.
    """

    async def _poll_request(self) -> dict:
        """Fetch the current task state, retrying on connection errors.

        Returns:
            The decoded JSON body returned by the task URL.

        Raises:
            Any exception other than ``ConnectionError``/``TimeoutError``
            raised by the request or by ``raise_for_status()`` propagates
            unchanged.
        """
        # Loop so that a handled connection error really retries; without it
        # the coroutine would fall through and return None to ``poll()``.
        while True:
            try:
                r = await self._client._client.get(self.task_url, headers=self._client._headers())
                r.raise_for_status()
                return r.json()
            except (ConnectionError, TimeoutError):
                print("Connection error while polling the task, retrying...")
                await asyncio.sleep(0.5)

    async def poll(self) -> 'TaskResponseAsync':
        """Poll until the task leaves the "Starting"/"Processing" states.

        The instance is refreshed in place after every request.

        Returns:
            This task, once ``_check_status()`` reports it as finished.

        Raises:
            ValueError: If the task has no URL, or (via ``_check_status()``)
                if the task status is "Failed".
        """
        if not self.task_url:
            raise ValueError("Task URL not found")
        while True:
            j = await self._poll_request()
            updated = TaskResponseAsync(**j).with_client(self._client)
            self.__dict__.update(updated.__dict__)
            if res := self._check_status():
                return res
            await asyncio.sleep(0.5)

    async def update(self, config: Configuration) -> 'TaskResponseAsync':
        """Send a new configuration for this task and wait for it to finish.

        Args:
            config: Configuration to upload with the PATCH request.

        Returns:
            This task after polling has completed.

        Raises:
            ValueError: If the task has no URL.
        """
        if not self.task_url:
            raise ValueError("Task URL not found")
        f = prepare_upload_data(None, config)
        r = await self._client._client.patch(self.task_url, files=f, headers=self._client._headers())
        r.raise_for_status()
        updated = TaskResponseAsync(**r.json()).with_client(self._client)
        self.__dict__.update(updated.__dict__)
        return await self.poll()

    async def cancel(self):
        """Request cancellation, then poll until the task settles.

        Returns:
            This task after polling has completed.

        Raises:
            ValueError: If the task has no URL.
        """
        if not self.task_url:
            raise ValueError("Task URL not found")
        r = await self._client._client.get(f"{self.task_url}/cancel", headers=self._client._headers())
        r.raise_for_status()
        return await self.poll()

    async def delete(self):
        """Delete the task on the server.

        Raises:
            ValueError: If the task has no URL.
        """
        if not self.task_url:
            raise ValueError("Task URL not found")
        r = await self._client._client.delete(self.task_url, headers=self._client._headers())
        r.raise_for_status()
@@ -0,0 +1,83 @@
1
+ from .config import Configuration
2
+ from .protocol import ChunkrClientProtocol
3
+ from ..models import Status, OutputResponse
4
+ from abc import ABC, abstractmethod
5
+ from typing import TypeVar, Optional, Generic, Union
6
+ from pydantic import BaseModel, PrivateAttr
7
+ from datetime import datetime
8
+
9
+ T = TypeVar('T', bound='TaskBase')
10
+
11
class TaskBase(BaseModel, ABC, Generic[T]):
    """Shared model and helpers for the sync and async task handles.

    Subclasses supply the transport-specific operations (``_poll_request``,
    ``poll``, ``update``, ``cancel``, ``delete``); this base holds the task
    fields, status checking and the content accessors.
    """

    # Optional fields default to None so a payload that omits them still
    # validates; under pydantic v2 an Optional field without a default is
    # required.
    configuration: Configuration
    created_at: datetime
    expires_at: Optional[datetime] = None
    file_name: Optional[str] = None
    finished_at: Optional[datetime] = None
    input_file_url: Optional[str] = None
    message: str
    output: Optional[OutputResponse] = None
    page_count: Optional[int] = None
    pdf_url: Optional[str] = None
    started_at: Optional[datetime] = None
    status: Status
    task_id: str
    task_url: Optional[str] = None
    # Client used for follow-up requests; set through with_client().
    _client: Optional[Union[ChunkrClientProtocol]] = PrivateAttr(default=None)

    @abstractmethod
    def _poll_request(self) -> dict:
        """Helper method to make polling request with retry logic."""
        pass

    @abstractmethod
    def poll(self) -> T:
        """Poll the task for completion."""
        pass

    @abstractmethod
    def update(self, config: Configuration) -> T:
        """Update the task configuration."""
        pass

    @abstractmethod
    def cancel(self) -> T:
        """Cancel the task."""
        pass

    @abstractmethod
    def delete(self) -> T:
        """Delete the task."""
        pass

    def with_client(self, client: Union[ChunkrClientProtocol]) -> T:
        """Attach ``client`` to this task and return the task for chaining."""
        self._client = client
        return self

    def _check_status(self) -> Optional[T]:
        """Helper method to check task status and handle completion/failure.

        Returns:
            ``self`` when the status is neither "Starting" nor "Processing",
            otherwise ``None`` (the task is still running).

        Raises:
            ValueError: With the task message, if the status is "Failed".
        """
        if self.status == "Failed":
            raise ValueError(self.message)
        if self.status not in ("Starting", "Processing"):
            return self
        return None

    def html(self) -> str:
        """Return the ``html`` of every segment, joined by newlines."""
        return self._get_content("html")

    def markdown(self) -> str:
        """Return the ``markdown`` of every segment, joined by newlines."""
        return self._get_content("markdown")

    def content(self) -> str:
        """Return the ``content`` of every segment, joined by newlines."""
        return self._get_content("content")

    def _get_content(self, t: str) -> str:
        """Collect attribute ``t`` from every segment of every output chunk.

        Args:
            t: Name of the segment attribute to read.

        Returns:
            The non-empty values joined with newlines, or ``""`` when the
            task has no output.
        """
        if not self.output:
            return ""
        parts = []
        for c in self.output.chunks:
            for s in c.segments:
                v = getattr(s, t)
                if v:
                    parts.append(v)
        return "\n".join(parts)
@@ -1,12 +1,13 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: chunkr-ai
3
- Version: 0.0.11
3
+ Version: 0.0.12
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  Project-URL: Homepage, https://chunkr.ai
7
7
  Description-Content-Type: text/markdown
8
8
  License-File: LICENSE
9
9
  Requires-Dist: httpx>=0.25.0
10
+ Requires-Dist: httpx>=0.25.0
10
11
  Requires-Dist: pillow>=10.0.0
11
12
  Requires-Dist: pydantic>=2.0.0
12
13
  Requires-Dist: pytest-asyncio>=0.21.0
@@ -1,4 +1,5 @@
1
1
  httpx>=0.25.0
2
+ httpx>=0.25.0
2
3
  pillow>=10.0.0
3
4
  pydantic>=2.0.0
4
5
  pytest-asyncio>=0.21.0
@@ -1,168 +0,0 @@
1
- from .protocol import ChunkrClientProtocol
2
- from .config import Configuration, OutputResponse, Status
3
- from .misc import prepare_upload_data
4
- import asyncio
5
- from datetime import datetime
6
- from pydantic import BaseModel, PrivateAttr
7
- import time
8
- from typing import Optional, Union
9
-
10
- class TaskResponse(BaseModel):
11
- configuration: Configuration
12
- created_at: datetime
13
- expires_at: Optional[datetime] = None
14
- file_name: Optional[str] = None
15
- finished_at: Optional[datetime] = None
16
- input_file_url: Optional[str] = None
17
- message: str
18
- output: Optional[OutputResponse] = None
19
- page_count: Optional[int] = None
20
- pdf_url: Optional[str] = None
21
- started_at: Optional[datetime] = None
22
- status: Status
23
- task_id: str
24
- task_url: Optional[str] = None
25
- _client: Optional[Union[ChunkrClientProtocol]] = PrivateAttr(default=None)
26
-
27
- def with_client(self, client: Union[ChunkrClientProtocol]) -> 'TaskResponse':
28
- self._client = client
29
- return self
30
-
31
- def _poll_request_sync(self) -> dict:
32
- """Helper method to make polling request with retry logic (synchronous)"""
33
- if not self.task_url:
34
- raise ValueError("Task URL not found in response")
35
-
36
- while True:
37
- try:
38
- r = self._client._session.get(self.task_url, headers=self._client._headers())
39
- r.raise_for_status()
40
- return r.json()
41
- except (ConnectionError, TimeoutError) as _:
42
- print("Connection error while polling the task, retrying...")
43
- time.sleep(0.5)
44
- except Exception as e:
45
- raise
46
-
47
- async def _poll_request_async(self) -> dict:
48
- """Helper method to make polling request with retry logic (asynchronous)"""
49
- if not self.task_url:
50
- raise ValueError("Task URL not found in response")
51
-
52
- while True:
53
- try:
54
- r = await self._client._client.get(self.task_url, headers=self._client._headers())
55
- r.raise_for_status()
56
- response = r.json()
57
- return response
58
- except (ConnectionError, TimeoutError) as _:
59
- print("Connection error while polling the task, retrying...")
60
- await asyncio.sleep(0.5)
61
- except Exception as e:
62
- raise
63
-
64
- def _check_status(self) -> Optional['TaskResponse']:
65
- """Helper method to check task status and handle completion/failure"""
66
- if self.status == "Failed":
67
- raise ValueError(self.message)
68
- if self.status not in ("Starting", "Processing"):
69
- return self
70
- return None
71
-
72
- def poll(self) -> 'TaskResponse':
73
- """Poll the task for completion."""
74
- while True:
75
- response = self._poll_request_sync()
76
- updated_task = TaskResponse(**response).with_client(self._client)
77
- self.__dict__.update(updated_task.__dict__)
78
-
79
- if result := self._check_status():
80
- return result
81
-
82
- time.sleep(0.5)
83
-
84
- async def poll_async(self) -> 'TaskResponse':
85
- """Poll the task for completion asynchronously."""
86
- while True:
87
- response = await self._poll_request_async()
88
- updated_task = TaskResponse(**response).with_client(self._client)
89
- self.__dict__.update(updated_task.__dict__)
90
-
91
- if result := self._check_status():
92
- return result
93
-
94
- await asyncio.sleep(0.5)
95
-
96
- def _get_content(self, content_type: str) -> str:
97
- """Helper method to get either HTML, Markdown, or raw content."""
98
- if not self.output:
99
- return ""
100
- parts = []
101
- for c in self.output.chunks:
102
- for s in c.segments:
103
- content = getattr(s, content_type)
104
- if content:
105
- parts.append(content)
106
- return "\n".join(parts)
107
-
108
- def update(self, config: Configuration) -> 'TaskResponse':
109
- files = prepare_upload_data(None, config)
110
- r = self._client._session.patch(
111
- f"{self.task_url}",
112
- files=files,
113
- headers=self._client._headers()
114
- )
115
- r.raise_for_status()
116
- return TaskResponse(**r.json()).with_client(self._client)
117
-
118
- async def update_async(self, config: Configuration) -> 'TaskResponse':
119
- files = prepare_upload_data(None, config)
120
- r = await self._client._client.patch(
121
- f"{self.task_url}",
122
- files=files,
123
- headers=self._client._headers()
124
- )
125
- r.raise_for_status()
126
- return TaskResponse(**r.json()).with_client(self._client)
127
-
128
- def cancel(self):
129
- r = self._client._session.get(
130
- f"{self.task_url}/cancel",
131
- headers=self._client._headers()
132
- )
133
- r.raise_for_status()
134
- self.poll()
135
-
136
- async def cancel_async(self):
137
- r = await self._client._client.get(
138
- f"{self.task_url}/cancel",
139
- headers=self._client._headers()
140
- )
141
- r.raise_for_status()
142
- await self.poll_async()
143
-
144
- def delete(self):
145
- r = self._client._session.delete(
146
- f"{self.task_url}",
147
- headers=self._client._headers()
148
- )
149
- r.raise_for_status()
150
-
151
- async def delete_async(self):
152
- r = await self._client._client.delete(
153
- f"{self.task_url}",
154
- headers=self._client._headers()
155
- )
156
- r.raise_for_status()
157
-
158
- def html(self) -> str:
159
- """Get full HTML for the task"""
160
- return self._get_content("html")
161
-
162
- def markdown(self) -> str:
163
- """Get full markdown for the task"""
164
- return self._get_content("markdown")
165
-
166
- def content(self) -> str:
167
- """Get full text for the task"""
168
- return self._get_content("content")
@@ -1,103 +0,0 @@
1
- import asyncio
2
- from pydantic import BaseModel, PrivateAttr
3
- from datetime import datetime
4
- from typing import Optional, Union
5
- from .task_base import TaskBase
6
- from .protocol import ChunkrClientProtocol
7
- from .config import Configuration, OutputResponse, Status
8
- from .misc import prepare_upload_data
9
-
10
- class TaskResponseAsync(BaseModel, TaskBase):
11
- configuration: Configuration
12
- created_at: datetime
13
- expires_at: Optional[datetime]
14
- file_name: Optional[str]
15
- finished_at: Optional[datetime]
16
- input_file_url: Optional[str]
17
- message: str
18
- output: Optional[OutputResponse]
19
- page_count: Optional[int]
20
- pdf_url: Optional[str]
21
- started_at: Optional[datetime]
22
- status: Status
23
- task_id: str
24
- task_url: Optional[str]
25
- _client: Optional[Union[ChunkrClientProtocol]] = PrivateAttr(default=None)
26
-
27
- def with_client(self, client: Union[ChunkrClientProtocol]) -> 'TaskResponseAsync':
28
- self._client = client
29
- return self
30
-
31
- async def poll(self) -> 'TaskResponseAsync':
32
- while True:
33
- j = await self._poll_request()
34
- updated = TaskResponseAsync(**j).with_client(self._client)
35
- self.__dict__.update(updated.__dict__)
36
- if res := self._check_status():
37
- return res
38
- await asyncio.sleep(0.5)
39
-
40
- async def _poll_request(self) -> dict:
41
- if not self.task_url:
42
- raise ValueError("Task URL not found")
43
- while True:
44
- try:
45
- r = await self._client._client.get(self.task_url, headers=self._client._headers())
46
- r.raise_for_status()
47
- return r.json()
48
- except Exception as e:
49
- if self.status == Status.FAILED:
50
- raise ValueError(self.message) from e
51
- await asyncio.sleep(0.5)
52
-
53
- def _check_status(self) -> Optional['TaskResponseAsync']:
54
- if self.status == Status.FAILED:
55
- raise ValueError(f"Task failed: {self.message}")
56
- if self.status == Status.CANCELLED:
57
- return self
58
- if self.status not in [Status.STARTING, Status.PROCESSING]:
59
- return self
60
- return None
61
-
62
- async def update(self, config: Configuration) -> 'TaskResponseAsync':
63
- if not self.task_url:
64
- raise ValueError("Task URL not found")
65
- f = prepare_upload_data(None, config)
66
- r = await self._client._client.patch(self.task_url, files=f, headers=self._client._headers())
67
- r.raise_for_status()
68
- updated = TaskResponseAsync(**r.json()).with_client(self._client)
69
- self.__dict__.update(updated.__dict__)
70
- return await self.poll()
71
-
72
- async def cancel(self):
73
- if not self.task_url:
74
- raise ValueError("Task URL not found")
75
- r = await self._client._client.get(f"{self.task_url}/cancel", headers=self._client._headers())
76
- r.raise_for_status()
77
- return await self.poll()
78
-
79
- async def delete(self):
80
- r = await self._client._client.delete(self.task_url, headers=self._client._headers())
81
- r.raise_for_status()
82
-
83
- def html(self) -> str:
84
- return self._get_content("html")
85
-
86
- def markdown(self) -> str:
87
- return self._get_content("markdown")
88
-
89
- def content(self) -> str:
90
- return self._get_content("content")
91
-
92
- def _get_content(self, t: str) -> str:
93
- if not self.output:
94
- return ""
95
- parts = []
96
- for c in self.output.chunks:
97
- for s in c.segments:
98
- v = getattr(s, t)
99
- if v:
100
- parts.append(v)
101
- return "\n".join(parts)
102
-
103
- # Satisfying TaskBase abstract methods with stubs
@@ -1,31 +0,0 @@
1
- from abc import ABC, abstractmethod
2
- from .config import Configuration
3
-
4
- class TaskBase(ABC):
5
- @abstractmethod
6
- def poll(self):
7
- pass
8
-
9
- @abstractmethod
10
- def update(self, config: Configuration):
11
- pass
12
-
13
- @abstractmethod
14
- def cancel(self):
15
- pass
16
-
17
- @abstractmethod
18
- def delete(self):
19
- pass
20
-
21
- @abstractmethod
22
- def html(self) -> str:
23
- pass
24
-
25
- @abstractmethod
26
- def markdown(self) -> str:
27
- pass
28
-
29
- @abstractmethod
30
- def content(self) -> str:
31
- pass
File without changes
File without changes
File without changes