chunkr-ai 0.0.4__tar.gz → 0.0.6__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
Files changed (24) hide show
  1. {chunkr_ai-0.0.4/src/chunkr_ai.egg-info → chunkr_ai-0.0.6}/PKG-INFO +2 -11
  2. {chunkr_ai-0.0.4 → chunkr_ai-0.0.6}/README.md +1 -11
  3. {chunkr_ai-0.0.4 → chunkr_ai-0.0.6}/pyproject.toml +3 -1
  4. {chunkr_ai-0.0.4 → chunkr_ai-0.0.6}/src/chunkr_ai/api/task.py +5 -14
  5. {chunkr_ai-0.0.4 → chunkr_ai-0.0.6}/src/chunkr_ai/models.py +1 -2
  6. {chunkr_ai-0.0.4 → chunkr_ai-0.0.6/src/chunkr_ai.egg-info}/PKG-INFO +2 -11
  7. {chunkr_ai-0.0.4 → chunkr_ai-0.0.6}/src/chunkr_ai.egg-info/requires.txt +1 -0
  8. chunkr_ai-0.0.6/tests/test_chunkr.py +212 -0
  9. chunkr_ai-0.0.4/tests/test_chunkr.py +0 -158
  10. {chunkr_ai-0.0.4 → chunkr_ai-0.0.6}/LICENSE +0 -0
  11. {chunkr_ai-0.0.4 → chunkr_ai-0.0.6}/setup.cfg +0 -0
  12. {chunkr_ai-0.0.4 → chunkr_ai-0.0.6}/src/chunkr_ai/__init__.py +0 -0
  13. {chunkr_ai-0.0.4 → chunkr_ai-0.0.6}/src/chunkr_ai/api/__init__.py +0 -0
  14. {chunkr_ai-0.0.4 → chunkr_ai-0.0.6}/src/chunkr_ai/api/api.py +0 -0
  15. {chunkr_ai-0.0.4 → chunkr_ai-0.0.6}/src/chunkr_ai/api/auth.py +0 -0
  16. {chunkr_ai-0.0.4 → chunkr_ai-0.0.6}/src/chunkr_ai/api/base.py +0 -0
  17. {chunkr_ai-0.0.4 → chunkr_ai-0.0.6}/src/chunkr_ai/api/chunkr.py +0 -0
  18. {chunkr_ai-0.0.4 → chunkr_ai-0.0.6}/src/chunkr_ai/api/chunkr_async.py +0 -0
  19. {chunkr_ai-0.0.4 → chunkr_ai-0.0.6}/src/chunkr_ai/api/config.py +0 -0
  20. {chunkr_ai-0.0.4 → chunkr_ai-0.0.6}/src/chunkr_ai/api/protocol.py +0 -0
  21. {chunkr_ai-0.0.4 → chunkr_ai-0.0.6}/src/chunkr_ai/main.py +0 -0
  22. {chunkr_ai-0.0.4 → chunkr_ai-0.0.6}/src/chunkr_ai.egg-info/SOURCES.txt +0 -0
  23. {chunkr_ai-0.0.4 → chunkr_ai-0.0.6}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
  24. {chunkr_ai-0.0.4 → chunkr_ai-0.0.6}/src/chunkr_ai.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: chunkr-ai
3
- Version: 0.0.4
3
+ Version: 0.0.6
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  Project-URL: Homepage, https://chunkr.ai
@@ -9,6 +9,7 @@ License-File: LICENSE
9
9
  Requires-Dist: httpx>=0.28.1
10
10
  Requires-Dist: pillow>=11.1.0
11
11
  Requires-Dist: pydantic>=2.10.4
12
+ Requires-Dist: pytest-asyncio>=0.25.2
12
13
  Requires-Dist: python-dotenv>=1.0.1
13
14
  Requires-Dist: requests>=2.32.3
14
15
  Provides-Extra: test
@@ -192,13 +193,3 @@ chunkr = Chunkr(
192
193
  url="https://api.chunkr.ai"
193
194
  )
194
195
  ```
195
-
196
- ## Run tests
197
-
198
- ```python
199
- # Install dependencies
200
- uv pip install -e ".[test]"
201
-
202
- # Run tests
203
- uv run pytest
204
- ```
@@ -174,14 +174,4 @@ chunkr = Chunkr(
174
174
  api_key="your-api-key",
175
175
  url="https://api.chunkr.ai"
176
176
  )
177
- ```
178
-
179
- ## Run tests
180
-
181
- ```python
182
- # Install dependencies
183
- uv pip install -e ".[test]"
184
-
185
- # Run tests
186
- uv run pytest
187
- ```
177
+ ```
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "chunkr-ai"
7
- version = "0.0.4"
7
+ version = "0.0.6"
8
8
  authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
9
9
  description = "Python client for Chunkr: open source document intelligence"
10
10
  readme = "README.md"
@@ -14,6 +14,7 @@ dependencies = [
14
14
  "httpx>=0.28.1",
15
15
  "pillow>=11.1.0",
16
16
  "pydantic>=2.10.4",
17
+ "pytest-asyncio>=0.25.2",
17
18
  "python-dotenv>=1.0.1",
18
19
  "requests>=2.32.3",
19
20
  ]
@@ -23,3 +24,4 @@ test = [
23
24
  "pytest>=8.3.4",
24
25
  "pytest-xdist>=3.6.1",
25
26
  ]
27
+
@@ -24,6 +24,7 @@ class TaskResponse(BaseModel):
24
24
  output: Optional[OutputResponse]
25
25
  page_count: Optional[int]
26
26
  pdf_url: Optional[str]
27
+ started_at: Optional[datetime]
27
28
  status: Status
28
29
  task_id: str
29
30
  task_url: Optional[str]
@@ -57,8 +58,9 @@ class TaskResponse(BaseModel):
57
58
  while True:
58
59
  try:
59
60
  r = await self._client._client.get(self.task_url, headers=self._client._headers())
60
- await r.raise_for_status()
61
- return await r.json()
61
+ r.raise_for_status()
62
+ response = r.json()
63
+ return response
62
64
  except (ConnectionError, TimeoutError) as _:
63
65
  print("Connection error while polling the task, retrying...")
64
66
  await asyncio.sleep(0.5)
@@ -117,15 +119,4 @@ class TaskResponse(BaseModel):
117
119
 
118
120
  def content(self) -> str:
119
121
  """Get full text for the task"""
120
- return self._get_content("content")
121
-
122
- class TaskPayload(BaseModel):
123
- current_configuration: Configuration
124
- file_name: str
125
- image_folder_location: str
126
- input_location: str
127
- output_location: str
128
- pdf_location: str
129
- previous_configuration: Optional[Configuration]
130
- task_id: str
131
- user_id: str
122
+ return self._get_content("content")
@@ -20,7 +20,7 @@ from .api.config import (
20
20
  SegmentationStrategy,
21
21
  )
22
22
 
23
- from .api.task import TaskResponse, TaskPayload, Status
23
+ from .api.task import TaskResponse, Status
24
24
 
25
25
  __all__ = [
26
26
  'BoundingBox',
@@ -43,6 +43,5 @@ __all__ = [
43
43
  'SegmentType',
44
44
  'SegmentationStrategy',
45
45
  'Status',
46
- 'TaskPayload',
47
46
  'TaskResponse'
48
47
  ]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: chunkr-ai
3
- Version: 0.0.4
3
+ Version: 0.0.6
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  Project-URL: Homepage, https://chunkr.ai
@@ -9,6 +9,7 @@ License-File: LICENSE
9
9
  Requires-Dist: httpx>=0.28.1
10
10
  Requires-Dist: pillow>=11.1.0
11
11
  Requires-Dist: pydantic>=2.10.4
12
+ Requires-Dist: pytest-asyncio>=0.25.2
12
13
  Requires-Dist: python-dotenv>=1.0.1
13
14
  Requires-Dist: requests>=2.32.3
14
15
  Provides-Extra: test
@@ -192,13 +193,3 @@ chunkr = Chunkr(
192
193
  url="https://api.chunkr.ai"
193
194
  )
194
195
  ```
195
-
196
- ## Run tests
197
-
198
- ```python
199
- # Install dependencies
200
- uv pip install -e ".[test]"
201
-
202
- # Run tests
203
- uv run pytest
204
- ```
@@ -1,6 +1,7 @@
1
1
  httpx>=0.28.1
2
2
  pillow>=11.1.0
3
3
  pydantic>=2.10.4
4
+ pytest-asyncio>=0.25.2
4
5
  python-dotenv>=1.0.1
5
6
  requests>=2.32.3
6
7
 
@@ -0,0 +1,212 @@
1
+ import pytest
2
+ import pytest_asyncio
3
+ from pathlib import Path
4
+ from PIL import Image
5
+
6
+ from chunkr_ai import Chunkr, ChunkrAsync
7
+ from chunkr_ai.models import (
8
+ ChunkProcessing,
9
+ Configuration,
10
+ GenerationStrategy,
11
+ GenerationConfig,
12
+ JsonSchema,
13
+ OcrStrategy,
14
+ Property,
15
+ SegmentationStrategy,
16
+ SegmentProcessing,
17
+ TaskResponse,
18
+ )
19
+
20
+ @pytest.fixture(params=[
21
+ pytest.param(("sync", Chunkr()), id="sync"),
22
+ pytest.param(("async", ChunkrAsync()), id="async")
23
+ ])
24
+ def chunkr_client(request):
25
+ return request.param
26
+
27
+ @pytest.fixture
28
+ def sample_path():
29
+ return Path("tests/files/test.pdf")
30
+
31
+ @pytest.fixture
32
+ def sample_image():
33
+ img = Image.open("tests/files/test.jpg")
34
+ return img
35
+
36
+ @pytest.mark.asyncio
37
+ async def test_send_file_path(chunkr_client, sample_path):
38
+ client_type, client = chunkr_client
39
+ response = await client.upload(sample_path) if client_type == "async" else client.upload(sample_path)
40
+
41
+ assert isinstance(response, TaskResponse)
42
+ assert response.task_id is not None
43
+ assert response.status == "Succeeded"
44
+ assert response.output is not None
45
+
46
+ @pytest.mark.asyncio
47
+ async def test_send_file_path_str(chunkr_client, sample_path):
48
+ client_type, client = chunkr_client
49
+ response = await client.upload(str(sample_path)) if client_type == "async" else client.upload(str(sample_path))
50
+
51
+ assert isinstance(response, TaskResponse)
52
+ assert response.task_id is not None
53
+ assert response.status == "Succeeded"
54
+ assert response.output is not None
55
+
56
+ @pytest.mark.asyncio
57
+ async def test_send_opened_file(chunkr_client, sample_path):
58
+ client_type, client = chunkr_client
59
+ with open(sample_path, 'rb') as f:
60
+ response = await client.upload(f) if client_type == "async" else client.upload(f)
61
+
62
+ assert isinstance(response, TaskResponse)
63
+ assert response.task_id is not None
64
+ assert response.status == "Succeeded"
65
+ assert response.output is not None
66
+
67
+ @pytest.mark.asyncio
68
+ async def test_send_pil_image(chunkr_client, sample_image):
69
+ client_type, client = chunkr_client
70
+ response = await client.upload(sample_image) if client_type == "async" else client.upload(sample_image)
71
+
72
+ assert isinstance(response, TaskResponse)
73
+ assert response.task_id is not None
74
+ assert response.status == "Succeeded"
75
+
76
+ @pytest.mark.asyncio
77
+ async def test_ocr_auto(chunkr_client, sample_path):
78
+ client_type, client = chunkr_client
79
+ response = await client.upload(sample_path, Configuration(
80
+ ocr_strategy=OcrStrategy.AUTO
81
+ )) if client_type == "async" else client.upload(sample_path, Configuration(
82
+ ocr_strategy=OcrStrategy.AUTO
83
+ ))
84
+
85
+ assert isinstance(response, TaskResponse)
86
+ assert response.task_id is not None
87
+ assert response.status == "Succeeded"
88
+ assert response.output is not None
89
+
90
+ @pytest.mark.asyncio
91
+ async def test_expires_in(chunkr_client, sample_path):
92
+ client_type, client = chunkr_client
93
+ response = await client.upload(sample_path, Configuration(
94
+ expires_in=10
95
+ )) if client_type == "async" else client.upload(sample_path, Configuration(
96
+ expires_in=10
97
+ ))
98
+
99
+ assert isinstance(response, TaskResponse)
100
+ assert response.task_id is not None
101
+ assert response.status == "Succeeded"
102
+ assert response.output is not None
103
+
104
+ @pytest.mark.asyncio
105
+ async def test_chunk_processing(chunkr_client, sample_path):
106
+ client_type, client = chunkr_client
107
+ response = await client.upload(sample_path, Configuration(
108
+ chunk_processing=ChunkProcessing(
109
+ target_length=1024
110
+ )
111
+ )) if client_type == "async" else client.upload(sample_path, Configuration(
112
+ chunk_processing=ChunkProcessing(
113
+ target_length=1024
114
+ )
115
+ ))
116
+
117
+ assert isinstance(response, TaskResponse)
118
+ assert response.task_id is not None
119
+ assert response.status == "Succeeded"
120
+ assert response.output is not None
121
+
122
+ @pytest.mark.asyncio
123
+ async def test_segmentation_strategy_page(chunkr_client, sample_path):
124
+ client_type, client = chunkr_client
125
+ response = await client.upload(sample_path, Configuration(
126
+ segmentation_strategy=SegmentationStrategy.PAGE
127
+ )) if client_type == "async" else client.upload(sample_path, Configuration(
128
+ segmentation_strategy=SegmentationStrategy.PAGE
129
+ ))
130
+
131
+ assert isinstance(response, TaskResponse)
132
+ assert response.task_id is not None
133
+ assert response.status == "Succeeded"
134
+ assert response.output is not None
135
+
136
+ @pytest.mark.asyncio
137
+ async def test_page_llm_html(chunkr_client, sample_path):
138
+ client_type, client = chunkr_client
139
+ response = await client.upload(sample_path, Configuration(
140
+ segmentation_strategy=SegmentationStrategy.PAGE,
141
+ segment_processing=SegmentProcessing(
142
+ page=GenerationConfig(
143
+ html=GenerationStrategy.LLM
144
+ )
145
+ )
146
+ )) if client_type == "async" else client.upload(sample_path, Configuration(
147
+ segmentation_strategy=SegmentationStrategy.PAGE,
148
+ segment_processing=SegmentProcessing(
149
+ page=GenerationConfig(
150
+ html=GenerationStrategy.LLM
151
+ )
152
+ )
153
+ ))
154
+
155
+ assert isinstance(response, TaskResponse)
156
+ assert response.task_id is not None
157
+ assert response.status == "Succeeded"
158
+ assert response.output is not None
159
+
160
+ @pytest.mark.asyncio
161
+ async def test_page_llm(chunkr_client, sample_path):
162
+ client_type, client = chunkr_client
163
+ response = await client.upload(sample_path, Configuration(
164
+ segmentation_strategy=SegmentationStrategy.PAGE,
165
+ segment_processing=SegmentProcessing(
166
+ page=GenerationConfig(
167
+ html=GenerationStrategy.LLM,
168
+ markdown=GenerationStrategy.LLM
169
+ )
170
+ )
171
+ )) if client_type == "async" else client.upload(sample_path, Configuration(
172
+ segmentation_strategy=SegmentationStrategy.PAGE,
173
+ segment_processing=SegmentProcessing(
174
+ page=GenerationConfig(
175
+ html=GenerationStrategy.LLM,
176
+ markdown=GenerationStrategy.LLM
177
+ )
178
+ )
179
+ ))
180
+
181
+ assert isinstance(response, TaskResponse)
182
+ assert response.task_id is not None
183
+ assert response.status == "Succeeded"
184
+ assert response.output is not None
185
+
186
+ @pytest.mark.asyncio
187
+ async def test_json_schema(chunkr_client, sample_path):
188
+ client_type, client = chunkr_client
189
+ response = await client.upload(sample_path, Configuration(
190
+ json_schema=JsonSchema(
191
+ title="Sales Data",
192
+ properties=[
193
+ Property(name="Person with highest sales", prop_type="string", description="The person with the highest sales"),
194
+ Property(name="Person with lowest sales", prop_type="string", description="The person with the lowest sales"),
195
+ ]
196
+ )
197
+ )) if client_type == "async" else client.upload(sample_path, Configuration(
198
+ json_schema=JsonSchema(
199
+ title="Sales Data",
200
+ properties=[
201
+ Property(name="Person with highest sales", prop_type="string", description="The person with the highest sales"),
202
+ Property(name="Person with lowest sales", prop_type="string", description="The person with the lowest sales"),
203
+ ]
204
+ )
205
+ ))
206
+
207
+ assert isinstance(response, TaskResponse)
208
+ assert response.task_id is not None
209
+ assert response.status == "Succeeded"
210
+ assert response.output is not None
211
+
212
+
@@ -1,158 +0,0 @@
1
- import pytest
2
- from pathlib import Path
3
- from PIL import Image
4
-
5
- from chunkr_ai import Chunkr, ChunkrAsync
6
- from chunkr_ai.models import (
7
- ChunkProcessing,
8
- Configuration,
9
- GenerationStrategy,
10
- GenerationConfig,
11
- JsonSchema,
12
- OcrStrategy,
13
- Property,
14
- SegmentationStrategy,
15
- SegmentProcessing,
16
- TaskResponse,
17
- )
18
-
19
- @pytest.fixture
20
- def chunkr():
21
- return Chunkr()
22
-
23
- @pytest.fixture
24
- def async_chunkr():
25
- return ChunkrAsync()
26
-
27
- @pytest.fixture
28
- def sample_path():
29
- return Path("tests/files/test.pdf")
30
-
31
- @pytest.fixture
32
- def sample_image():
33
- img = Image.open("tests/files/test.jpg")
34
- return img
35
-
36
- def test_send_file_path(chunkr, sample_path):
37
- response = chunkr.upload(sample_path)
38
-
39
- assert isinstance(response, TaskResponse)
40
- assert response.task_id is not None
41
- assert response.status == "Succeeded"
42
- assert response.output is not None
43
-
44
- def test_send_file_path_str(chunkr, sample_path):
45
- response = chunkr.upload(str(sample_path))
46
-
47
- assert isinstance(response, TaskResponse)
48
- assert response.task_id is not None
49
- assert response.status == "Succeeded"
50
- assert response.output is not None
51
-
52
- def test_send_opened_file(chunkr, sample_path):
53
- with open(sample_path, 'rb') as f:
54
- response = chunkr.upload(f)
55
-
56
- assert isinstance(response, TaskResponse)
57
- assert response.task_id is not None
58
- assert response.status == "Succeeded"
59
- assert response.output is not None
60
-
61
- def test_send_pil_image(chunkr, sample_image):
62
- response = chunkr.upload(sample_image)
63
-
64
- assert isinstance(response, TaskResponse)
65
- assert response.task_id is not None
66
- assert response.status == "Succeeded"
67
-
68
- def test_ocr_auto(chunkr, sample_path):
69
- response = chunkr.upload(sample_path, Configuration(
70
- ocr_strategy=OcrStrategy.AUTO
71
- ))
72
- assert isinstance(response, TaskResponse)
73
- assert response.task_id is not None
74
- assert response.status == "Succeeded"
75
- assert response.output is not None
76
-
77
- def test_expires_in(chunkr, sample_path):
78
- response = chunkr.upload(sample_path, Configuration(
79
- expires_in=10
80
- ))
81
- assert isinstance(response, TaskResponse)
82
- assert response.task_id is not None
83
- assert response.status == "Succeeded"
84
- assert response.output is not None
85
-
86
- def test_chunk_processing(chunkr, sample_path):
87
- response = chunkr.upload(sample_path, Configuration(
88
- chunk_processing=ChunkProcessing(
89
- target_length=1024
90
- )
91
- ))
92
- assert isinstance(response, TaskResponse)
93
- assert response.task_id is not None
94
- assert response.status == "Succeeded"
95
- assert response.output is not None
96
-
97
- def test_segmentation_strategy_page(chunkr, sample_path):
98
- response = chunkr.upload(sample_path, Configuration(
99
- segmentation_strategy=SegmentationStrategy.PAGE
100
- ))
101
- assert isinstance(response, TaskResponse)
102
- assert response.task_id is not None
103
- assert response.status == "Succeeded"
104
- assert response.output is not None
105
-
106
- def test_page_llm_html(chunkr, sample_path):
107
- response = chunkr.upload(sample_path, Configuration(
108
- segmentation_strategy=SegmentationStrategy.PAGE,
109
- segment_processing=SegmentProcessing(
110
- page=GenerationConfig(
111
- html=GenerationStrategy.LLM
112
- )
113
- )
114
- ))
115
- assert isinstance(response, TaskResponse)
116
- assert response.task_id is not None
117
- assert response.status == "Succeeded"
118
- assert response.output is not None
119
-
120
- def test_page_llm(chunkr, sample_path):
121
- response = chunkr.upload(sample_path, Configuration(
122
- segmentation_strategy=SegmentationStrategy.PAGE,
123
- segment_processing=SegmentProcessing(
124
- page=GenerationConfig(
125
- html=GenerationStrategy.LLM,
126
- markdown=GenerationStrategy.LLM
127
- )
128
- )
129
- ))
130
- assert isinstance(response, TaskResponse)
131
- assert response.task_id is not None
132
- assert response.status == "Succeeded"
133
- assert response.output is not None
134
-
135
- def test_json_schema(chunkr, sample_path):
136
- response = chunkr.upload(sample_path, Configuration(
137
- json_schema=JsonSchema(
138
- title="Sales Data",
139
- properties=[
140
- Property(name="Person with highest sales", prop_type="string", description="The person with the highest sales"),
141
- Property(name="Person with lowest sales", prop_type="string", description="The person with the lowest sales"),
142
- ]
143
- )
144
- ))
145
- assert isinstance(response, TaskResponse)
146
- assert response.task_id is not None
147
- assert response.status == "Succeeded"
148
- assert response.output is not None
149
-
150
- async def test_async_send_file_path(async_chunkr, sample_path):
151
- response = await async_chunkr.upload(sample_path)
152
-
153
- assert isinstance(response, TaskResponse)
154
- assert response.task_id is not None
155
- assert response.status == "Succeeded"
156
- assert response.output is not None
157
-
158
-
File without changes
File without changes