chunkr-ai 0.0.47__tar.gz → 0.0.49__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {chunkr_ai-0.0.47/src/chunkr_ai.egg-info → chunkr_ai-0.0.49}/PKG-INFO +1 -1
- {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/pyproject.toml +1 -1
- {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/src/chunkr_ai/api/chunkr.py +2 -2
- {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/src/chunkr_ai/api/configuration.py +1 -0
- {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/src/chunkr_ai/api/misc.py +49 -17
- {chunkr_ai-0.0.47 → chunkr_ai-0.0.49/src/chunkr_ai.egg-info}/PKG-INFO +1 -1
- {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/src/chunkr_ai.egg-info/SOURCES.txt +2 -1
- {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/tests/test_chunkr.py +92 -116
- chunkr_ai-0.0.49/tests/test_file_handling.py +362 -0
- {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/LICENSE +0 -0
- {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/README.md +0 -0
- {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/setup.cfg +0 -0
- {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/src/chunkr_ai/__init__.py +0 -0
- {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/src/chunkr_ai/api/__init__.py +0 -0
- {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/src/chunkr_ai/api/auth.py +0 -0
- {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/src/chunkr_ai/api/chunkr_base.py +0 -0
- {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/src/chunkr_ai/api/decorators.py +0 -0
- {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/src/chunkr_ai/api/protocol.py +0 -0
- {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/src/chunkr_ai/api/task_response.py +0 -0
- {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/src/chunkr_ai/models.py +0 -0
- {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
- {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/src/chunkr_ai.egg-info/requires.txt +0 -0
- {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/src/chunkr_ai.egg-info/top_level.txt +0 -0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "chunkr-ai"
|
7
|
-
version = "0.0.47"
|
7
|
+
version = "0.0.49"
|
8
8
|
authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
|
9
9
|
description = "Python client for Chunkr: open source document intelligence"
|
10
10
|
readme = "README.md"
|
@@ -16,7 +16,7 @@ class Chunkr(ChunkrBase):
|
|
16
16
|
@ensure_client()
|
17
17
|
async def upload(
|
18
18
|
self,
|
19
|
-
file: Union[str, Path, BinaryIO, Image.Image],
|
19
|
+
file: Union[str, Path, BinaryIO, Image.Image, bytes, bytearray, memoryview],
|
20
20
|
config: Optional[Configuration] = None,
|
21
21
|
filename: Optional[str] = None,
|
22
22
|
) -> TaskResponse:
|
@@ -34,7 +34,7 @@ class Chunkr(ChunkrBase):
|
|
34
34
|
@retry_on_429()
|
35
35
|
async def create_task(
|
36
36
|
self,
|
37
|
-
file: Union[str, Path, BinaryIO, Image.Image],
|
37
|
+
file: Union[str, Path, BinaryIO, Image.Image, bytes, bytearray, memoryview],
|
38
38
|
config: Optional[Configuration] = None,
|
39
39
|
filename: Optional[str] = None,
|
40
40
|
) -> TaskResponse:
|
@@ -23,6 +23,7 @@ class GenerationConfig(BaseModel):
|
|
23
23
|
markdown: Optional[GenerationStrategy] = None
|
24
24
|
crop_image: Optional[CroppingStrategy] = None
|
25
25
|
embed_sources: Optional[List[EmbedSource]] = Field(default_factory=lambda: [EmbedSource.MARKDOWN])
|
26
|
+
extended_context: Optional[bool] = None
|
26
27
|
|
27
28
|
class SegmentProcessing(BaseModel):
|
28
29
|
model_config = ConfigDict(populate_by_name=True, alias_generator=str.title)
|
@@ -3,9 +3,9 @@ import base64
|
|
3
3
|
import io
|
4
4
|
from pathlib import Path
|
5
5
|
from PIL import Image
|
6
|
-
from typing import Union, Tuple, BinaryIO, Optional
|
6
|
+
from typing import Union, Tuple, BinaryIO, Optional, Any
|
7
7
|
|
8
|
-
async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[Optional[str], str]:
|
8
|
+
async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image, bytes, bytearray, memoryview]) -> Tuple[Optional[str], str]:
|
9
9
|
"""Convert various file types into a tuple of (filename, file content).
|
10
10
|
|
11
11
|
Args:
|
@@ -15,6 +15,7 @@ async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[O
|
|
15
15
|
- Local file path (will be converted to base64)
|
16
16
|
- Opened binary file (will be converted to base64)
|
17
17
|
- PIL/Pillow Image object (will be converted to base64)
|
18
|
+
- Bytes object (will be converted to base64)
|
18
19
|
|
19
20
|
Returns:
|
20
21
|
Tuple[Optional[str], str]: (filename, content) where content is either a URL or base64 string
|
@@ -26,22 +27,54 @@ async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[O
|
|
26
27
|
ValueError: If the URL is invalid or unreachable
|
27
28
|
ValueError: If the MIME type is unsupported
|
28
29
|
"""
|
29
|
-
# Handle
|
30
|
+
# Handle bytes-like objects
|
31
|
+
if isinstance(file, (bytes, bytearray, memoryview)):
|
32
|
+
# Convert to bytes first if it's not already
|
33
|
+
file_bytes = bytes(file)
|
34
|
+
|
35
|
+
# Check if this might be an already-encoded base64 string in bytes form
|
36
|
+
try:
|
37
|
+
# Try to decode the bytes to a string and see if it's valid base64
|
38
|
+
potential_base64 = file_bytes.decode('utf-8', errors='strict')
|
39
|
+
base64.b64decode(potential_base64)
|
40
|
+
# If we get here, it was a valid base64 string in bytes form
|
41
|
+
return None, potential_base64
|
42
|
+
except:
|
43
|
+
# Not a base64 string in bytes form, encode it as base64
|
44
|
+
base64_str = base64.b64encode(file_bytes).decode()
|
45
|
+
return None, base64_str
|
46
|
+
|
47
|
+
# Handle strings - urls or paths or base64
|
30
48
|
if isinstance(file, str):
|
49
|
+
# Handle URLs
|
31
50
|
if file.startswith(('http://', 'https://')):
|
32
51
|
return None, file
|
33
|
-
|
34
|
-
|
35
|
-
if
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
52
|
+
|
53
|
+
# Handle data URLs
|
54
|
+
if file.startswith('data:'):
|
55
|
+
return None, file
|
56
|
+
|
57
|
+
# Try to handle as a file path
|
58
|
+
try:
|
59
|
+
path = Path(file)
|
60
|
+
if path.exists():
|
61
|
+
# It's a valid file path, convert to Path object and continue processing
|
62
|
+
file = path
|
63
|
+
else:
|
64
|
+
# If not a valid file path, try treating as base64
|
65
|
+
try:
|
66
|
+
# Just test if it's valid base64, don't store the result
|
67
|
+
base64.b64decode(file)
|
68
|
+
return None, file
|
69
|
+
except:
|
70
|
+
raise ValueError(f"File not found: {file} and it's not a valid base64 string")
|
71
|
+
except Exception as e:
|
72
|
+
# If string can't be converted to Path or decoded as base64, it might still be a base64 string
|
40
73
|
try:
|
41
74
|
base64.b64decode(file)
|
42
75
|
return None, file
|
43
76
|
except:
|
44
|
-
raise ValueError(f"
|
77
|
+
raise ValueError(f"Unable to process file: {e}")
|
45
78
|
|
46
79
|
# Handle file paths - convert to base64
|
47
80
|
if isinstance(file, Path):
|
@@ -71,17 +104,16 @@ async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[O
|
|
71
104
|
file.seek(0)
|
72
105
|
file_content = file.read()
|
73
106
|
name = getattr(file, "name", "document")
|
74
|
-
|
75
|
-
|
76
|
-
raise ValueError("File must have an extension")
|
107
|
+
if not name or not isinstance(name, str):
|
108
|
+
name = None
|
77
109
|
base64_str = base64.b64encode(file_content).decode()
|
78
|
-
return
|
110
|
+
return name, base64_str
|
79
111
|
|
80
112
|
raise TypeError(f"Unsupported file type: {type(file)}")
|
81
113
|
|
82
114
|
|
83
115
|
async def prepare_upload_data(
|
84
|
-
file: Optional[Union[str, Path, BinaryIO, Image.Image]] = None,
|
116
|
+
file: Optional[Union[str, Path, BinaryIO, Image.Image, bytes, bytearray, memoryview]] = None,
|
85
117
|
filename: Optional[str] = None,
|
86
118
|
config: Optional[Configuration] = None,
|
87
119
|
) -> dict:
|
@@ -89,8 +121,8 @@ async def prepare_upload_data(
|
|
89
121
|
|
90
122
|
Args:
|
91
123
|
file: The file to upload
|
124
|
+
filename: Optional filename to use (overrides any filename from the file)
|
92
125
|
config: Optional configuration settings
|
93
|
-
client: HTTP client for downloading remote files
|
94
126
|
|
95
127
|
Returns:
|
96
128
|
dict: JSON-serializable data dictionary ready for upload
|
@@ -2,24 +2,25 @@ import pytest
|
|
2
2
|
from pathlib import Path
|
3
3
|
from PIL import Image
|
4
4
|
import asyncio
|
5
|
-
import base64
|
5
|
+
from typing import Awaitable
|
6
6
|
|
7
7
|
from chunkr_ai import Chunkr
|
8
8
|
from chunkr_ai.models import (
|
9
|
+
ChunkProcessing,
|
9
10
|
Configuration,
|
10
|
-
|
11
|
+
EmbedSource,
|
12
|
+
ErrorHandlingStrategy,
|
13
|
+
FallbackStrategy,
|
11
14
|
GenerationConfig,
|
15
|
+
GenerationStrategy,
|
16
|
+
LlmProcessing,
|
12
17
|
OcrStrategy,
|
13
18
|
Pipeline,
|
14
19
|
SegmentationStrategy,
|
15
20
|
SegmentProcessing,
|
16
|
-
|
21
|
+
Status,
|
17
22
|
TaskResponse,
|
18
|
-
EmbedSource,
|
19
|
-
ErrorHandlingStrategy,
|
20
23
|
Tokenizer,
|
21
|
-
LlmProcessing,
|
22
|
-
FallbackStrategy,
|
23
24
|
)
|
24
25
|
|
25
26
|
@pytest.fixture
|
@@ -164,56 +165,20 @@ def model_fallback_config():
|
|
164
165
|
),
|
165
166
|
)
|
166
167
|
|
167
|
-
@pytest.
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
@pytest.mark.asyncio
|
182
|
-
async def test_send_file_relative_path_str(client, sample_relative_path_str):
|
183
|
-
response = await client.upload(sample_relative_path_str)
|
184
|
-
assert response.task_id is not None
|
185
|
-
assert response.status == "Succeeded"
|
186
|
-
assert response.output is not None
|
187
|
-
|
188
|
-
@pytest.mark.asyncio
|
189
|
-
async def test_send_file_url(client, sample_url):
|
190
|
-
response = await client.upload(sample_url)
|
191
|
-
assert response.task_id is not None
|
192
|
-
assert response.status == "Succeeded"
|
193
|
-
assert response.output is not None
|
194
|
-
|
195
|
-
@pytest.mark.asyncio
|
196
|
-
async def test_send_file_path_as_str(client, sample_path):
|
197
|
-
response = await client.upload(str(sample_path))
|
198
|
-
assert response.task_id is not None
|
199
|
-
assert response.status == "Succeeded"
|
200
|
-
assert response.output is not None
|
201
|
-
|
202
|
-
@pytest.mark.asyncio
|
203
|
-
async def test_send_opened_file(client, sample_path):
|
204
|
-
with open(sample_path, "rb") as f:
|
205
|
-
response = await client.upload(f)
|
206
|
-
assert response.task_id is not None
|
207
|
-
assert response.status == "Succeeded"
|
208
|
-
assert response.output is not None
|
209
|
-
|
210
|
-
@pytest.mark.asyncio
|
211
|
-
async def test_send_pil_image(client, sample_image):
|
212
|
-
response = await client.upload(sample_image)
|
213
|
-
assert response.task_id is not None
|
214
|
-
assert response.status == "Succeeded"
|
215
|
-
assert response.output is not None
|
216
|
-
assert response.output is not None
|
168
|
+
@pytest.fixture
|
169
|
+
def extended_context_config():
|
170
|
+
return Configuration(
|
171
|
+
segment_processing=SegmentProcessing(
|
172
|
+
picture=GenerationConfig(
|
173
|
+
extended_context=True,
|
174
|
+
html=GenerationStrategy.LLM,
|
175
|
+
),
|
176
|
+
table=GenerationConfig(
|
177
|
+
extended_context=True,
|
178
|
+
html=GenerationStrategy.LLM,
|
179
|
+
)
|
180
|
+
),
|
181
|
+
)
|
217
182
|
|
218
183
|
@pytest.mark.asyncio
|
219
184
|
async def test_ocr_auto(client, sample_path):
|
@@ -313,9 +278,18 @@ async def test_cancel_task(client, sample_path):
|
|
313
278
|
@pytest.mark.asyncio
|
314
279
|
async def test_cancel_task_direct(client, sample_path):
|
315
280
|
task = await client.create_task(sample_path)
|
316
|
-
assert isinstance(task, TaskResponse)
|
317
281
|
assert task.status == "Starting"
|
318
|
-
|
282
|
+
try:
|
283
|
+
await task.cancel()
|
284
|
+
except Exception as e:
|
285
|
+
task = await client.get_task(task.task_id)
|
286
|
+
print(task.status)
|
287
|
+
if task.status == Status.PROCESSING:
|
288
|
+
print("Task is processing, so it can't be cancelled")
|
289
|
+
assert True
|
290
|
+
else:
|
291
|
+
print("Task status:", task.status)
|
292
|
+
raise e
|
319
293
|
assert task.status == "Cancelled"
|
320
294
|
|
321
295
|
@pytest.mark.asyncio
|
@@ -352,6 +326,7 @@ async def test_pipeline_type_azure(client, sample_path):
|
|
352
326
|
assert response.task_id is not None
|
353
327
|
assert response.status == "Succeeded"
|
354
328
|
assert response.output is not None
|
329
|
+
assert response.configuration.pipeline == Pipeline.AZURE
|
355
330
|
|
356
331
|
@pytest.mark.asyncio
|
357
332
|
async def test_pipeline_type_chunkr(client, sample_path):
|
@@ -359,7 +334,8 @@ async def test_pipeline_type_chunkr(client, sample_path):
|
|
359
334
|
assert response.task_id is not None
|
360
335
|
assert response.status == "Succeeded"
|
361
336
|
assert response.output is not None
|
362
|
-
|
337
|
+
assert response.configuration.pipeline == Pipeline.CHUNKR
|
338
|
+
|
363
339
|
@pytest.mark.asyncio
|
364
340
|
async def test_client_lifecycle(client, sample_path):
|
365
341
|
response1 = await client.upload(sample_path)
|
@@ -375,36 +351,6 @@ async def test_task_operations_after_client_close(client, sample_path):
|
|
375
351
|
result = await task.poll()
|
376
352
|
assert result.status == "Succeeded"
|
377
353
|
|
378
|
-
@pytest.mark.asyncio
|
379
|
-
async def test_send_base64_file(client, sample_path):
|
380
|
-
# Read file and convert to base64
|
381
|
-
with open(sample_path, "rb") as f:
|
382
|
-
base64_content = base64.b64encode(f.read()).decode('utf-8')
|
383
|
-
response = await client.upload(base64_content)
|
384
|
-
assert response.task_id is not None
|
385
|
-
assert response.status == "Succeeded"
|
386
|
-
assert response.output is not None
|
387
|
-
|
388
|
-
@pytest.mark.asyncio
|
389
|
-
async def test_send_base64_file_with_data_url(client, sample_path):
|
390
|
-
with open(sample_path, "rb") as f:
|
391
|
-
base64_content = base64.b64encode(f.read()).decode('utf-8')
|
392
|
-
response = await client.upload(f"data:application/pdf;base64,{base64_content}")
|
393
|
-
assert response.task_id is not None
|
394
|
-
assert response.status == "Succeeded"
|
395
|
-
assert response.output is not None
|
396
|
-
|
397
|
-
@pytest.mark.asyncio
|
398
|
-
async def test_send_base64_file_with_filename(client, sample_path):
|
399
|
-
# Read file and convert to base64
|
400
|
-
with open(sample_path, "rb") as f:
|
401
|
-
base64_content = base64.b64encode(f.read()).decode('utf-8')
|
402
|
-
|
403
|
-
response = await client.upload(base64_content, filename="test.pdf")
|
404
|
-
assert response.task_id is not None
|
405
|
-
assert response.status == "Succeeded"
|
406
|
-
assert response.output is not None
|
407
|
-
|
408
354
|
@pytest.mark.asyncio
|
409
355
|
async def test_output_files_no_dir(client, sample_path, tmp_path):
|
410
356
|
task = await client.upload(sample_path)
|
@@ -444,6 +390,35 @@ async def test_output_files_with_dirs(client, sample_path, tmp_path):
|
|
444
390
|
assert content_file.exists()
|
445
391
|
assert json_file.exists()
|
446
392
|
|
393
|
+
|
394
|
+
@pytest.mark.asyncio
|
395
|
+
async def test_combined_config_with_llm_and_other_settings(client, sample_path):
|
396
|
+
# Test combining LLM settings with other configuration options
|
397
|
+
config = Configuration(
|
398
|
+
llm_processing=LlmProcessing(
|
399
|
+
model_id="qwen-2.5-vl-7b-instruct",
|
400
|
+
fallback_strategy=FallbackStrategy.model("gemini-flash-2.0"),
|
401
|
+
temperature=0.4
|
402
|
+
),
|
403
|
+
segmentation_strategy=SegmentationStrategy.PAGE,
|
404
|
+
segment_processing=SegmentProcessing(
|
405
|
+
Page=GenerationConfig(
|
406
|
+
html=GenerationStrategy.LLM,
|
407
|
+
markdown=GenerationStrategy.LLM
|
408
|
+
)
|
409
|
+
),
|
410
|
+
chunk_processing=ChunkProcessing(target_length=1024)
|
411
|
+
)
|
412
|
+
|
413
|
+
response = await client.upload(sample_path, config)
|
414
|
+
assert response.task_id is not None
|
415
|
+
assert response.status == "Succeeded"
|
416
|
+
assert response.output is not None
|
417
|
+
assert response.configuration.llm_processing is not None
|
418
|
+
assert response.configuration.llm_processing.model_id == "qwen-2.5-vl-7b-instruct"
|
419
|
+
assert response.configuration.segmentation_strategy == SegmentationStrategy.PAGE
|
420
|
+
assert response.configuration.chunk_processing.target_length == 1024
|
421
|
+
|
447
422
|
@pytest.mark.asyncio
|
448
423
|
async def test_embed_sources_markdown_only(client, sample_path, markdown_embed_config):
|
449
424
|
response = await client.upload(sample_path, markdown_embed_config)
|
@@ -580,29 +555,30 @@ async def test_fallback_strategy_serialization():
|
|
580
555
|
assert str(model_strategy) == "Model(gpt-4.1)"
|
581
556
|
|
582
557
|
@pytest.mark.asyncio
|
583
|
-
async def
|
584
|
-
|
585
|
-
|
586
|
-
|
587
|
-
|
588
|
-
|
589
|
-
|
590
|
-
|
591
|
-
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
|
608
|
-
|
558
|
+
async def test_extended_context(client, sample_path, extended_context_config):
|
559
|
+
"""Tests uploading with extended context enabled for pictures and tables."""
|
560
|
+
print("\nTesting extended context for Pictures and Tables...")
|
561
|
+
try:
|
562
|
+
task = await client.upload(sample_path, config=extended_context_config)
|
563
|
+
print(f"Task created with extended context config: {task.task_id}")
|
564
|
+
print(f"Initial Status: {task.status}")
|
565
|
+
|
566
|
+
# Poll the task until it finishes or fails
|
567
|
+
print(f"Final Status: {task.status}")
|
568
|
+
print(f"Message: {task.message}")
|
569
|
+
|
570
|
+
# Basic assertion: Check if the task completed (either succeeded or failed)
|
571
|
+
assert task.status in [Status.SUCCEEDED, Status.FAILED], f"Task ended in unexpected state: {task.status}"
|
572
|
+
|
573
|
+
# More specific assertions based on expected outcomes with your local server
|
574
|
+
# if task.status == Status.FAILED:
|
575
|
+
# assert "context_length_exceeded" in task.message, "Expected context length error"
|
576
|
+
# elif task.status == Status.SUCCEEDED:
|
577
|
+
# # Check if output reflects extended context usage if possible
|
578
|
+
# pass
|
579
|
+
|
580
|
+
print("Extended context test completed.")
|
581
|
+
|
582
|
+
except Exception as e:
|
583
|
+
print(f"Error during extended context test: {e}")
|
584
|
+
raise # Re-raise the exception to fail the test explicitly
|
@@ -0,0 +1,362 @@
|
|
1
|
+
import pytest
|
2
|
+
from pathlib import Path
|
3
|
+
from PIL import Image
|
4
|
+
import base64
|
5
|
+
import io
|
6
|
+
import tempfile
|
7
|
+
|
8
|
+
from chunkr_ai import Chunkr
|
9
|
+
|
10
|
+
@pytest.fixture
|
11
|
+
def sample_path():
|
12
|
+
return Path("tests/files/test.pdf")
|
13
|
+
|
14
|
+
@pytest.fixture
|
15
|
+
def sample_url():
|
16
|
+
return "https://chunkr-web.s3.us-east-1.amazonaws.com/landing_page/input/science.pdf"
|
17
|
+
|
18
|
+
@pytest.fixture
|
19
|
+
def sample_image():
|
20
|
+
return Image.open("tests/files/test.jpg")
|
21
|
+
|
22
|
+
@pytest.fixture
|
23
|
+
def client():
|
24
|
+
client = Chunkr()
|
25
|
+
yield client
|
26
|
+
|
27
|
+
@pytest.mark.asyncio
|
28
|
+
async def test_send_file_path(client, sample_path):
|
29
|
+
response = await client.upload(sample_path)
|
30
|
+
assert response.task_id is not None
|
31
|
+
assert response.status == "Succeeded"
|
32
|
+
assert response.output is not None
|
33
|
+
|
34
|
+
@pytest.mark.asyncio
|
35
|
+
async def test_send_file_path_str(client, sample_path):
|
36
|
+
response = await client.upload(str(sample_path))
|
37
|
+
assert response.task_id is not None
|
38
|
+
assert response.status == "Succeeded"
|
39
|
+
assert response.output is not None
|
40
|
+
|
41
|
+
@pytest.mark.asyncio
|
42
|
+
async def test_send_file_relative_path_str(client):
|
43
|
+
response = await client.upload("./tests/files/test.pdf")
|
44
|
+
assert response.task_id is not None
|
45
|
+
assert response.status == "Succeeded"
|
46
|
+
assert response.output is not None
|
47
|
+
|
48
|
+
@pytest.mark.asyncio
|
49
|
+
async def test_send_file_url(client, sample_url):
|
50
|
+
response = await client.upload(sample_url)
|
51
|
+
assert response.task_id is not None
|
52
|
+
assert response.status == "Succeeded"
|
53
|
+
assert response.output is not None
|
54
|
+
|
55
|
+
@pytest.mark.asyncio
|
56
|
+
async def test_send_opened_file(client, sample_path):
|
57
|
+
with open(sample_path, "rb") as f:
|
58
|
+
response = await client.upload(f)
|
59
|
+
assert response.task_id is not None
|
60
|
+
assert response.status == "Succeeded"
|
61
|
+
assert response.output is not None
|
62
|
+
|
63
|
+
@pytest.mark.asyncio
|
64
|
+
async def test_send_pil_image(client, sample_image):
|
65
|
+
response = await client.upload(sample_image)
|
66
|
+
assert response.task_id is not None
|
67
|
+
assert response.status == "Succeeded"
|
68
|
+
assert response.output is not None
|
69
|
+
assert response.output is not None
|
70
|
+
|
71
|
+
@pytest.mark.asyncio
|
72
|
+
async def test_send_base64_file(client, sample_path):
|
73
|
+
# Read file and convert to base64
|
74
|
+
with open(sample_path, "rb") as f:
|
75
|
+
base64_content = base64.b64encode(f.read())
|
76
|
+
response = await client.upload(base64_content)
|
77
|
+
assert response.task_id is not None
|
78
|
+
assert response.status == "Succeeded"
|
79
|
+
assert response.output is not None
|
80
|
+
|
81
|
+
@pytest.mark.asyncio
|
82
|
+
async def test_send_base64_file_w_decode(client, sample_path):
|
83
|
+
# Read file and convert to base64
|
84
|
+
with open(sample_path, "rb") as f:
|
85
|
+
base64_content = base64.b64encode(f.read()).decode()
|
86
|
+
response = await client.upload(base64_content)
|
87
|
+
assert response.task_id is not None
|
88
|
+
assert response.status == "Succeeded"
|
89
|
+
assert response.output is not None
|
90
|
+
|
91
|
+
@pytest.mark.asyncio
|
92
|
+
async def test_send_base64_file_with_data_url(client, sample_path):
|
93
|
+
with open(sample_path, "rb") as f:
|
94
|
+
base64_content = base64.b64encode(f.read()).decode('utf-8')
|
95
|
+
response = await client.upload(f"data:application/pdf;base64,{base64_content}")
|
96
|
+
assert response.task_id is not None
|
97
|
+
assert response.status == "Succeeded"
|
98
|
+
assert response.output is not None
|
99
|
+
|
100
|
+
@pytest.mark.asyncio
|
101
|
+
async def test_send_base64_file_with_filename(client, sample_path):
|
102
|
+
# Read file and convert to base64
|
103
|
+
with open(sample_path, "rb") as f:
|
104
|
+
base64_content = base64.b64encode(f.read()).decode('utf-8')
|
105
|
+
|
106
|
+
response = await client.upload(base64_content, filename="test.pdf")
|
107
|
+
assert response.task_id is not None
|
108
|
+
assert response.status == "Succeeded"
|
109
|
+
assert response.output is not None
|
110
|
+
|
111
|
+
@pytest.mark.asyncio
|
112
|
+
async def test_file_like_no_name_attribute(client, sample_path):
|
113
|
+
# Create a file-like object without a name attribute
|
114
|
+
class NamelessBuffer:
|
115
|
+
def __init__(self, content):
|
116
|
+
self.buffer = io.BytesIO(content)
|
117
|
+
|
118
|
+
def read(self):
|
119
|
+
return self.buffer.read()
|
120
|
+
|
121
|
+
def seek(self, pos):
|
122
|
+
return self.buffer.seek(pos)
|
123
|
+
|
124
|
+
with open(sample_path, "rb") as f:
|
125
|
+
content = f.read()
|
126
|
+
|
127
|
+
nameless_buffer = NamelessBuffer(content)
|
128
|
+
response = await client.upload(nameless_buffer, filename="test.pdf")
|
129
|
+
assert response.task_id is not None
|
130
|
+
assert response.status == "Succeeded"
|
131
|
+
assert response.output is not None
|
132
|
+
|
133
|
+
@pytest.mark.asyncio
|
134
|
+
async def test_file_like_none_name(client, sample_path):
|
135
|
+
# Create a file-like object with None as name
|
136
|
+
class NoneNameBuffer:
|
137
|
+
def __init__(self, content):
|
138
|
+
self.buffer = io.BytesIO(content)
|
139
|
+
self.name = None
|
140
|
+
|
141
|
+
def read(self):
|
142
|
+
return self.buffer.read()
|
143
|
+
|
144
|
+
def seek(self, pos):
|
145
|
+
return self.buffer.seek(pos)
|
146
|
+
|
147
|
+
with open(sample_path, "rb") as f:
|
148
|
+
content = f.read()
|
149
|
+
|
150
|
+
none_name_buffer = NoneNameBuffer(content)
|
151
|
+
response = await client.upload(none_name_buffer, filename="test.pdf")
|
152
|
+
assert response.task_id is not None
|
153
|
+
assert response.status == "Succeeded"
|
154
|
+
assert response.output is not None
|
155
|
+
|
156
|
+
@pytest.mark.asyncio
|
157
|
+
async def test_file_like_no_extension(client, sample_path):
|
158
|
+
# Create a file-like object with a name but no extension
|
159
|
+
class NoExtensionBuffer:
|
160
|
+
def __init__(self, content):
|
161
|
+
self.buffer = io.BytesIO(content)
|
162
|
+
self.name = "test_document"
|
163
|
+
|
164
|
+
def read(self):
|
165
|
+
return self.buffer.read()
|
166
|
+
|
167
|
+
def seek(self, pos):
|
168
|
+
return self.buffer.seek(pos)
|
169
|
+
|
170
|
+
with open(sample_path, "rb") as f:
|
171
|
+
content = f.read()
|
172
|
+
|
173
|
+
no_ext_buffer = NoExtensionBuffer(content)
|
174
|
+
response = await client.upload(no_ext_buffer, filename="test.pdf")
|
175
|
+
assert response.task_id is not None
|
176
|
+
assert response.status == "Succeeded"
|
177
|
+
assert response.output is not None
|
178
|
+
|
179
|
+
@pytest.mark.asyncio
|
180
|
+
async def test_spooled_temporary_file(client, sample_path):
|
181
|
+
# Test with SpooledTemporaryFile which is what the user is using
|
182
|
+
with open(sample_path, "rb") as f:
|
183
|
+
content = f.read()
|
184
|
+
|
185
|
+
temp_file = tempfile.SpooledTemporaryFile()
|
186
|
+
temp_file.write(content)
|
187
|
+
temp_file.seek(0)
|
188
|
+
|
189
|
+
response = await client.upload(temp_file, filename="test.pdf")
|
190
|
+
assert response.task_id is not None
|
191
|
+
assert response.status == "Succeeded"
|
192
|
+
assert response.output is not None
|
193
|
+
|
194
|
+
@pytest.mark.asyncio
|
195
|
+
async def test_send_bytearray(client, sample_path):
|
196
|
+
# Read file and convert to bytearray
|
197
|
+
with open(sample_path, "rb") as f:
|
198
|
+
content = bytearray(f.read())
|
199
|
+
|
200
|
+
response = await client.upload(content, filename="test.pdf")
|
201
|
+
assert response.task_id is not None
|
202
|
+
assert response.status == "Succeeded"
|
203
|
+
assert response.output is not None
|
204
|
+
|
205
|
+
@pytest.mark.asyncio
|
206
|
+
async def test_send_memoryview(client, sample_path):
|
207
|
+
# Read file and convert to memoryview
|
208
|
+
with open(sample_path, "rb") as f:
|
209
|
+
content_bytes = f.read()
|
210
|
+
content = memoryview(content_bytes)
|
211
|
+
|
212
|
+
response = await client.upload(content, filename="test.pdf")
|
213
|
+
assert response.task_id is not None
|
214
|
+
assert response.status == "Succeeded"
|
215
|
+
assert response.output is not None
|
216
|
+
|
217
|
+
@pytest.mark.asyncio
|
218
|
+
async def test_with_explicit_filename_pdf(client, sample_path):
|
219
|
+
response = await client.upload(sample_path, filename="custom_name.pdf")
|
220
|
+
assert response.task_id is not None
|
221
|
+
assert response.status == "Succeeded"
|
222
|
+
assert response.output is not None
|
223
|
+
|
224
|
+
@pytest.mark.asyncio
|
225
|
+
async def test_with_explicit_filename_image(client, sample_image):
|
226
|
+
response = await client.upload(sample_image, filename="custom_image.jpg")
|
227
|
+
assert response.task_id is not None
|
228
|
+
assert response.status == "Succeeded"
|
229
|
+
assert response.output is not None
|
230
|
+
|
231
|
+
@pytest.mark.asyncio
|
232
|
+
async def test_with_special_character_filename(client, sample_path):
|
233
|
+
response = await client.upload(sample_path, filename="test file (1)&%$#@!.pdf")
|
234
|
+
assert response.task_id is not None
|
235
|
+
assert response.status == "Succeeded"
|
236
|
+
assert response.output is not None
|
237
|
+
|
238
|
+
@pytest.mark.asyncio
|
239
|
+
async def test_filename_with_non_matching_extension(client, sample_path):
|
240
|
+
# Test providing a filename with a different extension than the actual file
|
241
|
+
response = await client.upload(sample_path, filename="document.docx")
|
242
|
+
assert response.task_id is not None
|
243
|
+
assert response.status == "Succeeded"
|
244
|
+
assert response.output is not None
|
245
|
+
|
246
|
+
@pytest.mark.asyncio
|
247
|
+
async def test_bytes_with_explicit_filename(client, sample_path):
|
248
|
+
with open(sample_path, "rb") as f:
|
249
|
+
content = f.read()
|
250
|
+
|
251
|
+
# For bytes objects, filename is required to know the file type
|
252
|
+
response = await client.upload(content, filename="document.pdf")
|
253
|
+
assert response.task_id is not None
|
254
|
+
assert response.status == "Succeeded"
|
255
|
+
assert response.output is not None
|
256
|
+
|
257
|
+
@pytest.mark.asyncio
|
258
|
+
async def test_bytearray_with_explicit_filename(client, sample_path):
|
259
|
+
with open(sample_path, "rb") as f:
|
260
|
+
content = bytearray(f.read())
|
261
|
+
|
262
|
+
response = await client.upload(content, filename="document.pdf")
|
263
|
+
assert response.task_id is not None
|
264
|
+
assert response.status == "Succeeded"
|
265
|
+
assert response.output is not None
|
266
|
+
|
267
|
+
@pytest.mark.asyncio
|
268
|
+
async def test_memoryview_with_explicit_filename(client, sample_path):
|
269
|
+
with open(sample_path, "rb") as f:
|
270
|
+
content_bytes = f.read()
|
271
|
+
content = memoryview(content_bytes)
|
272
|
+
|
273
|
+
response = await client.upload(content, filename="document.pdf")
|
274
|
+
assert response.task_id is not None
|
275
|
+
assert response.status == "Succeeded"
|
276
|
+
assert response.output is not None
|
277
|
+
|
278
|
+
@pytest.mark.asyncio
|
279
|
+
async def test_unicode_filename(client, sample_path):
|
280
|
+
# Test with a filename containing Unicode characters
|
281
|
+
response = await client.upload(sample_path, filename="测试文件.pdf")
|
282
|
+
assert response.task_id is not None
|
283
|
+
assert response.status == "Succeeded"
|
284
|
+
assert response.output is not None
|
285
|
+
|
286
|
+
@pytest.mark.asyncio
|
287
|
+
async def test_very_long_filename(client, sample_path):
|
288
|
+
# Test with an extremely long filename
|
289
|
+
long_name = "a" * 200 + ".pdf" # 200 character filename
|
290
|
+
response = await client.upload(sample_path, filename=long_name)
|
291
|
+
assert response.task_id is not None
|
292
|
+
assert response.status == "Succeeded"
|
293
|
+
assert response.output is not None
|
294
|
+
|
295
|
+
@pytest.mark.asyncio
|
296
|
+
async def test_filename_without_extension(client, sample_path):
|
297
|
+
# Test with a filename that has no extension
|
298
|
+
with open(sample_path, "rb") as f:
|
299
|
+
content = f.read()
|
300
|
+
|
301
|
+
# This test verifies that the system uses the provided filename even without extension
|
302
|
+
response = await client.upload(content, filename="document_without_extension")
|
303
|
+
assert response.task_id is not None
|
304
|
+
assert response.status == "Succeeded"
|
305
|
+
assert response.output is not None
|
306
|
+
|
307
|
+
@pytest.mark.asyncio
|
308
|
+
async def test_custom_file_like_with_filename(client, sample_path):
|
309
|
+
# A more complex file-like object implementation
|
310
|
+
class CustomFileWrapper:
|
311
|
+
def __init__(self, content):
|
312
|
+
self.buffer = io.BytesIO(content)
|
313
|
+
self.position = 0
|
314
|
+
self.name = "original_name.txt" # Should be overridden by explicit filename
|
315
|
+
|
316
|
+
def read(self, size=-1):
|
317
|
+
return self.buffer.read(size)
|
318
|
+
|
319
|
+
def seek(self, position, whence=0):
|
320
|
+
return self.buffer.seek(position, whence)
|
321
|
+
|
322
|
+
def tell(self):
|
323
|
+
return self.buffer.tell()
|
324
|
+
|
325
|
+
def close(self):
|
326
|
+
self.buffer.close()
|
327
|
+
|
328
|
+
with open(sample_path, "rb") as f:
|
329
|
+
content = f.read()
|
330
|
+
|
331
|
+
custom_file = CustomFileWrapper(content)
|
332
|
+
response = await client.upload(custom_file, filename="custom_wrapper.pdf")
|
333
|
+
assert response.task_id is not None
|
334
|
+
assert response.status == "Succeeded"
|
335
|
+
assert response.output is not None
|
336
|
+
|
337
|
+
@pytest.mark.asyncio
|
338
|
+
async def test_seek_at_nonzero_position(client, sample_path):
|
339
|
+
# Test with a file-like object that's not at position 0
|
340
|
+
with open(sample_path, "rb") as f:
|
341
|
+
content = f.read()
|
342
|
+
|
343
|
+
buffer = io.BytesIO(content)
|
344
|
+
buffer.seek(100) # Move position to 100
|
345
|
+
|
346
|
+
response = await client.upload(buffer, filename="seek_test.pdf")
|
347
|
+
assert response.task_id is not None
|
348
|
+
assert response.status == "Succeeded"
|
349
|
+
assert response.output is not None
|
350
|
+
|
351
|
+
@pytest.mark.asyncio
|
352
|
+
async def test_reused_file_object(client, sample_path):
|
353
|
+
# Test that a file object can be reused after being processed
|
354
|
+
with open(sample_path, "rb") as f:
|
355
|
+
response1 = await client.upload(f, filename="first_use.pdf")
|
356
|
+
f.seek(0) # Reset position
|
357
|
+
response2 = await client.upload(f, filename="second_use.pdf")
|
358
|
+
|
359
|
+
assert response1.task_id is not None
|
360
|
+
assert response1.status == "Succeeded"
|
361
|
+
assert response2.task_id is not None
|
362
|
+
assert response2.status == "Succeeded"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|