chunkr-ai 0.0.47__tar.gz → 0.0.49__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23) hide show
  1. {chunkr_ai-0.0.47/src/chunkr_ai.egg-info → chunkr_ai-0.0.49}/PKG-INFO +1 -1
  2. {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/pyproject.toml +1 -1
  3. {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/src/chunkr_ai/api/chunkr.py +2 -2
  4. {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/src/chunkr_ai/api/configuration.py +1 -0
  5. {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/src/chunkr_ai/api/misc.py +49 -17
  6. {chunkr_ai-0.0.47 → chunkr_ai-0.0.49/src/chunkr_ai.egg-info}/PKG-INFO +1 -1
  7. {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/src/chunkr_ai.egg-info/SOURCES.txt +2 -1
  8. {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/tests/test_chunkr.py +92 -116
  9. chunkr_ai-0.0.49/tests/test_file_handling.py +362 -0
  10. {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/LICENSE +0 -0
  11. {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/README.md +0 -0
  12. {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/setup.cfg +0 -0
  13. {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/src/chunkr_ai/__init__.py +0 -0
  14. {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/src/chunkr_ai/api/__init__.py +0 -0
  15. {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/src/chunkr_ai/api/auth.py +0 -0
  16. {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/src/chunkr_ai/api/chunkr_base.py +0 -0
  17. {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/src/chunkr_ai/api/decorators.py +0 -0
  18. {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/src/chunkr_ai/api/protocol.py +0 -0
  19. {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/src/chunkr_ai/api/task_response.py +0 -0
  20. {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/src/chunkr_ai/models.py +0 -0
  21. {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
  22. {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/src/chunkr_ai.egg-info/requires.txt +0 -0
  23. {chunkr_ai-0.0.47 → chunkr_ai-0.0.49}/src/chunkr_ai.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chunkr-ai
3
- Version: 0.0.47
3
+ Version: 0.0.49
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  License: MIT License
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "chunkr-ai"
7
- version = "0.0.47"
7
+ version = "0.0.49"
8
8
  authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
9
9
  description = "Python client for Chunkr: open source document intelligence"
10
10
  readme = "README.md"
@@ -16,7 +16,7 @@ class Chunkr(ChunkrBase):
16
16
  @ensure_client()
17
17
  async def upload(
18
18
  self,
19
- file: Union[str, Path, BinaryIO, Image.Image],
19
+ file: Union[str, Path, BinaryIO, Image.Image, bytes, bytearray, memoryview],
20
20
  config: Optional[Configuration] = None,
21
21
  filename: Optional[str] = None,
22
22
  ) -> TaskResponse:
@@ -34,7 +34,7 @@ class Chunkr(ChunkrBase):
34
34
  @retry_on_429()
35
35
  async def create_task(
36
36
  self,
37
- file: Union[str, Path, BinaryIO, Image.Image],
37
+ file: Union[str, Path, BinaryIO, Image.Image, bytes, bytearray, memoryview],
38
38
  config: Optional[Configuration] = None,
39
39
  filename: Optional[str] = None,
40
40
  ) -> TaskResponse:
@@ -23,6 +23,7 @@ class GenerationConfig(BaseModel):
23
23
  markdown: Optional[GenerationStrategy] = None
24
24
  crop_image: Optional[CroppingStrategy] = None
25
25
  embed_sources: Optional[List[EmbedSource]] = Field(default_factory=lambda: [EmbedSource.MARKDOWN])
26
+ extended_context: Optional[bool] = None
26
27
 
27
28
  class SegmentProcessing(BaseModel):
28
29
  model_config = ConfigDict(populate_by_name=True, alias_generator=str.title)
@@ -3,9 +3,9 @@ import base64
3
3
  import io
4
4
  from pathlib import Path
5
5
  from PIL import Image
6
- from typing import Union, Tuple, BinaryIO, Optional
6
+ from typing import Union, Tuple, BinaryIO, Optional, Any
7
7
 
8
- async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[Optional[str], str]:
8
+ async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image, bytes, bytearray, memoryview]) -> Tuple[Optional[str], str]:
9
9
  """Convert various file types into a tuple of (filename, file content).
10
10
 
11
11
  Args:
@@ -15,6 +15,7 @@ async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[O
15
15
  - Local file path (will be converted to base64)
16
16
  - Opened binary file (will be converted to base64)
17
17
  - PIL/Pillow Image object (will be converted to base64)
18
+ - Bytes object (will be converted to base64)
18
19
 
19
20
  Returns:
20
21
  Tuple[Optional[str], str]: (filename, content) where content is either a URL or base64 string
@@ -26,22 +27,54 @@ async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[O
26
27
  ValueError: If the URL is invalid or unreachable
27
28
  ValueError: If the MIME type is unsupported
28
29
  """
29
- # Handle strings
30
+ # Handle bytes-like objects
31
+ if isinstance(file, (bytes, bytearray, memoryview)):
32
+ # Convert to bytes first if it's not already
33
+ file_bytes = bytes(file)
34
+
35
+ # Check if this might be an already-encoded base64 string in bytes form
36
+ try:
37
+ # Try to decode the bytes to a string and see if it's valid base64
38
+ potential_base64 = file_bytes.decode('utf-8', errors='strict')
39
+ base64.b64decode(potential_base64)
40
+ # If we get here, it was a valid base64 string in bytes form
41
+ return None, potential_base64
42
+ except:
43
+ # Not a base64 string in bytes form, encode it as base64
44
+ base64_str = base64.b64encode(file_bytes).decode()
45
+ return None, base64_str
46
+
47
+ # Handle strings - urls or paths or base64
30
48
  if isinstance(file, str):
49
+ # Handle URLs
31
50
  if file.startswith(('http://', 'https://')):
32
51
  return None, file
33
- # Try to handle as a file path first
34
- path = Path(file)
35
- if path.exists():
36
- # It's a valid file path, convert to Path object and continue processing
37
- file = path
38
- else:
39
- # If not a valid file path, try treating as base64
52
+
53
+ # Handle data URLs
54
+ if file.startswith('data:'):
55
+ return None, file
56
+
57
+ # Try to handle as a file path
58
+ try:
59
+ path = Path(file)
60
+ if path.exists():
61
+ # It's a valid file path, convert to Path object and continue processing
62
+ file = path
63
+ else:
64
+ # If not a valid file path, try treating as base64
65
+ try:
66
+ # Just test if it's valid base64, don't store the result
67
+ base64.b64decode(file)
68
+ return None, file
69
+ except:
70
+ raise ValueError(f"File not found: {file} and it's not a valid base64 string")
71
+ except Exception as e:
72
+ # If string can't be converted to Path or decoded as base64, it might still be a base64 string
40
73
  try:
41
74
  base64.b64decode(file)
42
75
  return None, file
43
76
  except:
44
- raise ValueError(f"File not found: {file} and it's not a valid base64 string")
77
+ raise ValueError(f"Unable to process file: {e}")
45
78
 
46
79
  # Handle file paths - convert to base64
47
80
  if isinstance(file, Path):
@@ -71,17 +104,16 @@ async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[O
71
104
  file.seek(0)
72
105
  file_content = file.read()
73
106
  name = getattr(file, "name", "document")
74
- file_ext = Path(name).suffix.lower().lstrip('.')
75
- if not file_ext:
76
- raise ValueError("File must have an extension")
107
+ if not name or not isinstance(name, str):
108
+ name = None
77
109
  base64_str = base64.b64encode(file_content).decode()
78
- return Path(name).name, base64_str
110
+ return name, base64_str
79
111
 
80
112
  raise TypeError(f"Unsupported file type: {type(file)}")
81
113
 
82
114
 
83
115
  async def prepare_upload_data(
84
- file: Optional[Union[str, Path, BinaryIO, Image.Image]] = None,
116
+ file: Optional[Union[str, Path, BinaryIO, Image.Image, bytes, bytearray, memoryview]] = None,
85
117
  filename: Optional[str] = None,
86
118
  config: Optional[Configuration] = None,
87
119
  ) -> dict:
@@ -89,8 +121,8 @@ async def prepare_upload_data(
89
121
 
90
122
  Args:
91
123
  file: The file to upload
124
+ filename: Optional filename to use (overrides any filename from the file)
92
125
  config: Optional configuration settings
93
- client: HTTP client for downloading remote files
94
126
 
95
127
  Returns:
96
128
  dict: JSON-serializable data dictionary ready for upload
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chunkr-ai
3
- Version: 0.0.47
3
+ Version: 0.0.49
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  License: MIT License
@@ -17,4 +17,5 @@ src/chunkr_ai/api/decorators.py
17
17
  src/chunkr_ai/api/misc.py
18
18
  src/chunkr_ai/api/protocol.py
19
19
  src/chunkr_ai/api/task_response.py
20
- tests/test_chunkr.py
20
+ tests/test_chunkr.py
21
+ tests/test_file_handling.py
@@ -2,24 +2,25 @@ import pytest
2
2
  from pathlib import Path
3
3
  from PIL import Image
4
4
  import asyncio
5
- import base64
5
+ from typing import Awaitable
6
6
 
7
7
  from chunkr_ai import Chunkr
8
8
  from chunkr_ai.models import (
9
+ ChunkProcessing,
9
10
  Configuration,
10
- GenerationStrategy,
11
+ EmbedSource,
12
+ ErrorHandlingStrategy,
13
+ FallbackStrategy,
11
14
  GenerationConfig,
15
+ GenerationStrategy,
16
+ LlmProcessing,
12
17
  OcrStrategy,
13
18
  Pipeline,
14
19
  SegmentationStrategy,
15
20
  SegmentProcessing,
16
- ChunkProcessing,
21
+ Status,
17
22
  TaskResponse,
18
- EmbedSource,
19
- ErrorHandlingStrategy,
20
23
  Tokenizer,
21
- LlmProcessing,
22
- FallbackStrategy,
23
24
  )
24
25
 
25
26
  @pytest.fixture
@@ -164,56 +165,20 @@ def model_fallback_config():
164
165
  ),
165
166
  )
166
167
 
167
- @pytest.mark.asyncio
168
- async def test_send_file_path(client, sample_path):
169
- response = await client.upload(sample_path)
170
- assert response.task_id is not None
171
- assert response.status == "Succeeded"
172
- assert response.output is not None
173
-
174
- @pytest.mark.asyncio
175
- async def test_send_file_path_str(client, sample_absolute_path_str):
176
- response = await client.upload(sample_absolute_path_str)
177
- assert response.task_id is not None
178
- assert response.status == "Succeeded"
179
- assert response.output is not None
180
-
181
- @pytest.mark.asyncio
182
- async def test_send_file_relative_path_str(client, sample_relative_path_str):
183
- response = await client.upload(sample_relative_path_str)
184
- assert response.task_id is not None
185
- assert response.status == "Succeeded"
186
- assert response.output is not None
187
-
188
- @pytest.mark.asyncio
189
- async def test_send_file_url(client, sample_url):
190
- response = await client.upload(sample_url)
191
- assert response.task_id is not None
192
- assert response.status == "Succeeded"
193
- assert response.output is not None
194
-
195
- @pytest.mark.asyncio
196
- async def test_send_file_path_as_str(client, sample_path):
197
- response = await client.upload(str(sample_path))
198
- assert response.task_id is not None
199
- assert response.status == "Succeeded"
200
- assert response.output is not None
201
-
202
- @pytest.mark.asyncio
203
- async def test_send_opened_file(client, sample_path):
204
- with open(sample_path, "rb") as f:
205
- response = await client.upload(f)
206
- assert response.task_id is not None
207
- assert response.status == "Succeeded"
208
- assert response.output is not None
209
-
210
- @pytest.mark.asyncio
211
- async def test_send_pil_image(client, sample_image):
212
- response = await client.upload(sample_image)
213
- assert response.task_id is not None
214
- assert response.status == "Succeeded"
215
- assert response.output is not None
216
- assert response.output is not None
168
+ @pytest.fixture
169
+ def extended_context_config():
170
+ return Configuration(
171
+ segment_processing=SegmentProcessing(
172
+ picture=GenerationConfig(
173
+ extended_context=True,
174
+ html=GenerationStrategy.LLM,
175
+ ),
176
+ table=GenerationConfig(
177
+ extended_context=True,
178
+ html=GenerationStrategy.LLM,
179
+ )
180
+ ),
181
+ )
217
182
 
218
183
  @pytest.mark.asyncio
219
184
  async def test_ocr_auto(client, sample_path):
@@ -313,9 +278,18 @@ async def test_cancel_task(client, sample_path):
313
278
  @pytest.mark.asyncio
314
279
  async def test_cancel_task_direct(client, sample_path):
315
280
  task = await client.create_task(sample_path)
316
- assert isinstance(task, TaskResponse)
317
281
  assert task.status == "Starting"
318
- await task.cancel()
282
+ try:
283
+ await task.cancel()
284
+ except Exception as e:
285
+ task = await client.get_task(task.task_id)
286
+ print(task.status)
287
+ if task.status == Status.PROCESSING:
288
+ print("Task is processing, so it can't be cancelled")
289
+ assert True
290
+ else:
291
+ print("Task status:", task.status)
292
+ raise e
319
293
  assert task.status == "Cancelled"
320
294
 
321
295
  @pytest.mark.asyncio
@@ -352,6 +326,7 @@ async def test_pipeline_type_azure(client, sample_path):
352
326
  assert response.task_id is not None
353
327
  assert response.status == "Succeeded"
354
328
  assert response.output is not None
329
+ assert response.configuration.pipeline == Pipeline.AZURE
355
330
 
356
331
  @pytest.mark.asyncio
357
332
  async def test_pipeline_type_chunkr(client, sample_path):
@@ -359,7 +334,8 @@ async def test_pipeline_type_chunkr(client, sample_path):
359
334
  assert response.task_id is not None
360
335
  assert response.status == "Succeeded"
361
336
  assert response.output is not None
362
-
337
+ assert response.configuration.pipeline == Pipeline.CHUNKR
338
+
363
339
  @pytest.mark.asyncio
364
340
  async def test_client_lifecycle(client, sample_path):
365
341
  response1 = await client.upload(sample_path)
@@ -375,36 +351,6 @@ async def test_task_operations_after_client_close(client, sample_path):
375
351
  result = await task.poll()
376
352
  assert result.status == "Succeeded"
377
353
 
378
- @pytest.mark.asyncio
379
- async def test_send_base64_file(client, sample_path):
380
- # Read file and convert to base64
381
- with open(sample_path, "rb") as f:
382
- base64_content = base64.b64encode(f.read()).decode('utf-8')
383
- response = await client.upload(base64_content)
384
- assert response.task_id is not None
385
- assert response.status == "Succeeded"
386
- assert response.output is not None
387
-
388
- @pytest.mark.asyncio
389
- async def test_send_base64_file_with_data_url(client, sample_path):
390
- with open(sample_path, "rb") as f:
391
- base64_content = base64.b64encode(f.read()).decode('utf-8')
392
- response = await client.upload(f"data:application/pdf;base64,{base64_content}")
393
- assert response.task_id is not None
394
- assert response.status == "Succeeded"
395
- assert response.output is not None
396
-
397
- @pytest.mark.asyncio
398
- async def test_send_base64_file_with_filename(client, sample_path):
399
- # Read file and convert to base64
400
- with open(sample_path, "rb") as f:
401
- base64_content = base64.b64encode(f.read()).decode('utf-8')
402
-
403
- response = await client.upload(base64_content, filename="test.pdf")
404
- assert response.task_id is not None
405
- assert response.status == "Succeeded"
406
- assert response.output is not None
407
-
408
354
  @pytest.mark.asyncio
409
355
  async def test_output_files_no_dir(client, sample_path, tmp_path):
410
356
  task = await client.upload(sample_path)
@@ -444,6 +390,35 @@ async def test_output_files_with_dirs(client, sample_path, tmp_path):
444
390
  assert content_file.exists()
445
391
  assert json_file.exists()
446
392
 
393
+
394
+ @pytest.mark.asyncio
395
+ async def test_combined_config_with_llm_and_other_settings(client, sample_path):
396
+ # Test combining LLM settings with other configuration options
397
+ config = Configuration(
398
+ llm_processing=LlmProcessing(
399
+ model_id="qwen-2.5-vl-7b-instruct",
400
+ fallback_strategy=FallbackStrategy.model("gemini-flash-2.0"),
401
+ temperature=0.4
402
+ ),
403
+ segmentation_strategy=SegmentationStrategy.PAGE,
404
+ segment_processing=SegmentProcessing(
405
+ Page=GenerationConfig(
406
+ html=GenerationStrategy.LLM,
407
+ markdown=GenerationStrategy.LLM
408
+ )
409
+ ),
410
+ chunk_processing=ChunkProcessing(target_length=1024)
411
+ )
412
+
413
+ response = await client.upload(sample_path, config)
414
+ assert response.task_id is not None
415
+ assert response.status == "Succeeded"
416
+ assert response.output is not None
417
+ assert response.configuration.llm_processing is not None
418
+ assert response.configuration.llm_processing.model_id == "qwen-2.5-vl-7b-instruct"
419
+ assert response.configuration.segmentation_strategy == SegmentationStrategy.PAGE
420
+ assert response.configuration.chunk_processing.target_length == 1024
421
+
447
422
  @pytest.mark.asyncio
448
423
  async def test_embed_sources_markdown_only(client, sample_path, markdown_embed_config):
449
424
  response = await client.upload(sample_path, markdown_embed_config)
@@ -580,29 +555,30 @@ async def test_fallback_strategy_serialization():
580
555
  assert str(model_strategy) == "Model(gpt-4.1)"
581
556
 
582
557
  @pytest.mark.asyncio
583
- async def test_combined_config_with_llm_and_other_settings(client, sample_path):
584
- # Test combining LLM settings with other configuration options
585
- config = Configuration(
586
- llm_processing=LlmProcessing(
587
- model_id="qwen-2.5-vl-7b-instruct",
588
- fallback_strategy=FallbackStrategy.model("gemini-flash-2.0"),
589
- temperature=0.4
590
- ),
591
- segmentation_strategy=SegmentationStrategy.PAGE,
592
- segment_processing=SegmentProcessing(
593
- Page=GenerationConfig(
594
- html=GenerationStrategy.LLM,
595
- markdown=GenerationStrategy.LLM
596
- )
597
- ),
598
- chunk_processing=ChunkProcessing(target_length=1024)
599
- )
600
-
601
- response = await client.upload(sample_path, config)
602
- assert response.task_id is not None
603
- assert response.status == "Succeeded"
604
- assert response.output is not None
605
- assert response.configuration.llm_processing is not None
606
- assert response.configuration.llm_processing.model_id == "qwen-2.5-vl-7b-instruct"
607
- assert response.configuration.segmentation_strategy == SegmentationStrategy.PAGE
608
- assert response.configuration.chunk_processing.target_length == 1024
558
+ async def test_extended_context(client, sample_path, extended_context_config):
559
+ """Tests uploading with extended context enabled for pictures and tables."""
560
+ print("\nTesting extended context for Pictures and Tables...")
561
+ try:
562
+ task = await client.upload(sample_path, config=extended_context_config)
563
+ print(f"Task created with extended context config: {task.task_id}")
564
+ print(f"Initial Status: {task.status}")
565
+
566
+ # Poll the task until it finishes or fails
567
+ print(f"Final Status: {task.status}")
568
+ print(f"Message: {task.message}")
569
+
570
+ # Basic assertion: Check if the task completed (either succeeded or failed)
571
+ assert task.status in [Status.SUCCEEDED, Status.FAILED], f"Task ended in unexpected state: {task.status}"
572
+
573
+ # More specific assertions based on expected outcomes with your local server
574
+ # if task.status == Status.FAILED:
575
+ # assert "context_length_exceeded" in task.message, "Expected context length error"
576
+ # elif task.status == Status.SUCCEEDED:
577
+ # # Check if output reflects extended context usage if possible
578
+ # pass
579
+
580
+ print("Extended context test completed.")
581
+
582
+ except Exception as e:
583
+ print(f"Error during extended context test: {e}")
584
+ raise # Re-raise the exception to fail the test explicitly
@@ -0,0 +1,362 @@
1
+ import pytest
2
+ from pathlib import Path
3
+ from PIL import Image
4
+ import base64
5
+ import io
6
+ import tempfile
7
+
8
+ from chunkr_ai import Chunkr
9
+
10
+ @pytest.fixture
11
+ def sample_path():
12
+ return Path("tests/files/test.pdf")
13
+
14
+ @pytest.fixture
15
+ def sample_url():
16
+ return "https://chunkr-web.s3.us-east-1.amazonaws.com/landing_page/input/science.pdf"
17
+
18
+ @pytest.fixture
19
+ def sample_image():
20
+ return Image.open("tests/files/test.jpg")
21
+
22
+ @pytest.fixture
23
+ def client():
24
+ client = Chunkr()
25
+ yield client
26
+
27
+ @pytest.mark.asyncio
28
+ async def test_send_file_path(client, sample_path):
29
+ response = await client.upload(sample_path)
30
+ assert response.task_id is not None
31
+ assert response.status == "Succeeded"
32
+ assert response.output is not None
33
+
34
+ @pytest.mark.asyncio
35
+ async def test_send_file_path_str(client, sample_path):
36
+ response = await client.upload(str(sample_path))
37
+ assert response.task_id is not None
38
+ assert response.status == "Succeeded"
39
+ assert response.output is not None
40
+
41
+ @pytest.mark.asyncio
42
+ async def test_send_file_relative_path_str(client):
43
+ response = await client.upload("./tests/files/test.pdf")
44
+ assert response.task_id is not None
45
+ assert response.status == "Succeeded"
46
+ assert response.output is not None
47
+
48
+ @pytest.mark.asyncio
49
+ async def test_send_file_url(client, sample_url):
50
+ response = await client.upload(sample_url)
51
+ assert response.task_id is not None
52
+ assert response.status == "Succeeded"
53
+ assert response.output is not None
54
+
55
+ @pytest.mark.asyncio
56
+ async def test_send_opened_file(client, sample_path):
57
+ with open(sample_path, "rb") as f:
58
+ response = await client.upload(f)
59
+ assert response.task_id is not None
60
+ assert response.status == "Succeeded"
61
+ assert response.output is not None
62
+
63
+ @pytest.mark.asyncio
64
+ async def test_send_pil_image(client, sample_image):
65
+ response = await client.upload(sample_image)
66
+ assert response.task_id is not None
67
+ assert response.status == "Succeeded"
68
+ assert response.output is not None
69
+ assert response.output is not None
70
+
71
+ @pytest.mark.asyncio
72
+ async def test_send_base64_file(client, sample_path):
73
+ # Read file and convert to base64
74
+ with open(sample_path, "rb") as f:
75
+ base64_content = base64.b64encode(f.read())
76
+ response = await client.upload(base64_content)
77
+ assert response.task_id is not None
78
+ assert response.status == "Succeeded"
79
+ assert response.output is not None
80
+
81
+ @pytest.mark.asyncio
82
+ async def test_send_base64_file_w_decode(client, sample_path):
83
+ # Read file and convert to base64
84
+ with open(sample_path, "rb") as f:
85
+ base64_content = base64.b64encode(f.read()).decode()
86
+ response = await client.upload(base64_content)
87
+ assert response.task_id is not None
88
+ assert response.status == "Succeeded"
89
+ assert response.output is not None
90
+
91
+ @pytest.mark.asyncio
92
+ async def test_send_base64_file_with_data_url(client, sample_path):
93
+ with open(sample_path, "rb") as f:
94
+ base64_content = base64.b64encode(f.read()).decode('utf-8')
95
+ response = await client.upload(f"data:application/pdf;base64,{base64_content}")
96
+ assert response.task_id is not None
97
+ assert response.status == "Succeeded"
98
+ assert response.output is not None
99
+
100
+ @pytest.mark.asyncio
101
+ async def test_send_base64_file_with_filename(client, sample_path):
102
+ # Read file and convert to base64
103
+ with open(sample_path, "rb") as f:
104
+ base64_content = base64.b64encode(f.read()).decode('utf-8')
105
+
106
+ response = await client.upload(base64_content, filename="test.pdf")
107
+ assert response.task_id is not None
108
+ assert response.status == "Succeeded"
109
+ assert response.output is not None
110
+
111
+ @pytest.mark.asyncio
112
+ async def test_file_like_no_name_attribute(client, sample_path):
113
+ # Create a file-like object without a name attribute
114
+ class NamelessBuffer:
115
+ def __init__(self, content):
116
+ self.buffer = io.BytesIO(content)
117
+
118
+ def read(self):
119
+ return self.buffer.read()
120
+
121
+ def seek(self, pos):
122
+ return self.buffer.seek(pos)
123
+
124
+ with open(sample_path, "rb") as f:
125
+ content = f.read()
126
+
127
+ nameless_buffer = NamelessBuffer(content)
128
+ response = await client.upload(nameless_buffer, filename="test.pdf")
129
+ assert response.task_id is not None
130
+ assert response.status == "Succeeded"
131
+ assert response.output is not None
132
+
133
+ @pytest.mark.asyncio
134
+ async def test_file_like_none_name(client, sample_path):
135
+ # Create a file-like object with None as name
136
+ class NoneNameBuffer:
137
+ def __init__(self, content):
138
+ self.buffer = io.BytesIO(content)
139
+ self.name = None
140
+
141
+ def read(self):
142
+ return self.buffer.read()
143
+
144
+ def seek(self, pos):
145
+ return self.buffer.seek(pos)
146
+
147
+ with open(sample_path, "rb") as f:
148
+ content = f.read()
149
+
150
+ none_name_buffer = NoneNameBuffer(content)
151
+ response = await client.upload(none_name_buffer, filename="test.pdf")
152
+ assert response.task_id is not None
153
+ assert response.status == "Succeeded"
154
+ assert response.output is not None
155
+
156
+ @pytest.mark.asyncio
157
+ async def test_file_like_no_extension(client, sample_path):
158
+ # Create a file-like object with a name but no extension
159
+ class NoExtensionBuffer:
160
+ def __init__(self, content):
161
+ self.buffer = io.BytesIO(content)
162
+ self.name = "test_document"
163
+
164
+ def read(self):
165
+ return self.buffer.read()
166
+
167
+ def seek(self, pos):
168
+ return self.buffer.seek(pos)
169
+
170
+ with open(sample_path, "rb") as f:
171
+ content = f.read()
172
+
173
+ no_ext_buffer = NoExtensionBuffer(content)
174
+ response = await client.upload(no_ext_buffer, filename="test.pdf")
175
+ assert response.task_id is not None
176
+ assert response.status == "Succeeded"
177
+ assert response.output is not None
178
+
179
+ @pytest.mark.asyncio
180
+ async def test_spooled_temporary_file(client, sample_path):
181
+ # Test with SpooledTemporaryFile which is what the user is using
182
+ with open(sample_path, "rb") as f:
183
+ content = f.read()
184
+
185
+ temp_file = tempfile.SpooledTemporaryFile()
186
+ temp_file.write(content)
187
+ temp_file.seek(0)
188
+
189
+ response = await client.upload(temp_file, filename="test.pdf")
190
+ assert response.task_id is not None
191
+ assert response.status == "Succeeded"
192
+ assert response.output is not None
193
+
194
+ @pytest.mark.asyncio
195
+ async def test_send_bytearray(client, sample_path):
196
+ # Read file and convert to bytearray
197
+ with open(sample_path, "rb") as f:
198
+ content = bytearray(f.read())
199
+
200
+ response = await client.upload(content, filename="test.pdf")
201
+ assert response.task_id is not None
202
+ assert response.status == "Succeeded"
203
+ assert response.output is not None
204
+
205
+ @pytest.mark.asyncio
206
+ async def test_send_memoryview(client, sample_path):
207
+ # Read file and convert to memoryview
208
+ with open(sample_path, "rb") as f:
209
+ content_bytes = f.read()
210
+ content = memoryview(content_bytes)
211
+
212
+ response = await client.upload(content, filename="test.pdf")
213
+ assert response.task_id is not None
214
+ assert response.status == "Succeeded"
215
+ assert response.output is not None
216
+
217
+ @pytest.mark.asyncio
218
+ async def test_with_explicit_filename_pdf(client, sample_path):
219
+ response = await client.upload(sample_path, filename="custom_name.pdf")
220
+ assert response.task_id is not None
221
+ assert response.status == "Succeeded"
222
+ assert response.output is not None
223
+
224
+ @pytest.mark.asyncio
225
+ async def test_with_explicit_filename_image(client, sample_image):
226
+ response = await client.upload(sample_image, filename="custom_image.jpg")
227
+ assert response.task_id is not None
228
+ assert response.status == "Succeeded"
229
+ assert response.output is not None
230
+
231
+ @pytest.mark.asyncio
232
+ async def test_with_special_character_filename(client, sample_path):
233
+ response = await client.upload(sample_path, filename="test file (1)&%$#@!.pdf")
234
+ assert response.task_id is not None
235
+ assert response.status == "Succeeded"
236
+ assert response.output is not None
237
+
238
+ @pytest.mark.asyncio
239
+ async def test_filename_with_non_matching_extension(client, sample_path):
240
+ # Test providing a filename with a different extension than the actual file
241
+ response = await client.upload(sample_path, filename="document.docx")
242
+ assert response.task_id is not None
243
+ assert response.status == "Succeeded"
244
+ assert response.output is not None
245
+
246
+ @pytest.mark.asyncio
247
+ async def test_bytes_with_explicit_filename(client, sample_path):
248
+ with open(sample_path, "rb") as f:
249
+ content = f.read()
250
+
251
+ # For bytes objects, filename is required to know the file type
252
+ response = await client.upload(content, filename="document.pdf")
253
+ assert response.task_id is not None
254
+ assert response.status == "Succeeded"
255
+ assert response.output is not None
256
+
257
+ @pytest.mark.asyncio
258
+ async def test_bytearray_with_explicit_filename(client, sample_path):
259
+ with open(sample_path, "rb") as f:
260
+ content = bytearray(f.read())
261
+
262
+ response = await client.upload(content, filename="document.pdf")
263
+ assert response.task_id is not None
264
+ assert response.status == "Succeeded"
265
+ assert response.output is not None
266
+
267
+ @pytest.mark.asyncio
268
+ async def test_memoryview_with_explicit_filename(client, sample_path):
269
+ with open(sample_path, "rb") as f:
270
+ content_bytes = f.read()
271
+ content = memoryview(content_bytes)
272
+
273
+ response = await client.upload(content, filename="document.pdf")
274
+ assert response.task_id is not None
275
+ assert response.status == "Succeeded"
276
+ assert response.output is not None
277
+
278
+ @pytest.mark.asyncio
279
+ async def test_unicode_filename(client, sample_path):
280
+ # Test with a filename containing Unicode characters
281
+ response = await client.upload(sample_path, filename="测试文件.pdf")
282
+ assert response.task_id is not None
283
+ assert response.status == "Succeeded"
284
+ assert response.output is not None
285
+
286
+ @pytest.mark.asyncio
287
+ async def test_very_long_filename(client, sample_path):
288
+ # Test with an extremely long filename
289
+ long_name = "a" * 200 + ".pdf" # 200 character filename
290
+ response = await client.upload(sample_path, filename=long_name)
291
+ assert response.task_id is not None
292
+ assert response.status == "Succeeded"
293
+ assert response.output is not None
294
+
295
+ @pytest.mark.asyncio
296
+ async def test_filename_without_extension(client, sample_path):
297
+ # Test with a filename that has no extension
298
+ with open(sample_path, "rb") as f:
299
+ content = f.read()
300
+
301
+ # This test verifies that the system uses the provided filename even without extension
302
+ response = await client.upload(content, filename="document_without_extension")
303
+ assert response.task_id is not None
304
+ assert response.status == "Succeeded"
305
+ assert response.output is not None
306
+
307
+ @pytest.mark.asyncio
308
+ async def test_custom_file_like_with_filename(client, sample_path):
309
+ # A more complex file-like object implementation
310
+ class CustomFileWrapper:
311
+ def __init__(self, content):
312
+ self.buffer = io.BytesIO(content)
313
+ self.position = 0
314
+ self.name = "original_name.txt" # Should be overridden by explicit filename
315
+
316
+ def read(self, size=-1):
317
+ return self.buffer.read(size)
318
+
319
+ def seek(self, position, whence=0):
320
+ return self.buffer.seek(position, whence)
321
+
322
+ def tell(self):
323
+ return self.buffer.tell()
324
+
325
+ def close(self):
326
+ self.buffer.close()
327
+
328
+ with open(sample_path, "rb") as f:
329
+ content = f.read()
330
+
331
+ custom_file = CustomFileWrapper(content)
332
+ response = await client.upload(custom_file, filename="custom_wrapper.pdf")
333
+ assert response.task_id is not None
334
+ assert response.status == "Succeeded"
335
+ assert response.output is not None
336
+
337
+ @pytest.mark.asyncio
338
+ async def test_seek_at_nonzero_position(client, sample_path):
339
+ # Test with a file-like object that's not at position 0
340
+ with open(sample_path, "rb") as f:
341
+ content = f.read()
342
+
343
+ buffer = io.BytesIO(content)
344
+ buffer.seek(100) # Move position to 100
345
+
346
+ response = await client.upload(buffer, filename="seek_test.pdf")
347
+ assert response.task_id is not None
348
+ assert response.status == "Succeeded"
349
+ assert response.output is not None
350
+
351
+ @pytest.mark.asyncio
352
+ async def test_reused_file_object(client, sample_path):
353
+ # Test that a file object can be reused after being processed
354
+ with open(sample_path, "rb") as f:
355
+ response1 = await client.upload(f, filename="first_use.pdf")
356
+ f.seek(0) # Reset position
357
+ response2 = await client.upload(f, filename="second_use.pdf")
358
+
359
+ assert response1.task_id is not None
360
+ assert response1.status == "Succeeded"
361
+ assert response2.task_id is not None
362
+ assert response2.status == "Succeeded"
File without changes
File without changes
File without changes