chunkr-ai 0.0.47__tar.gz → 0.0.48__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23)
  1. {chunkr_ai-0.0.47/src/chunkr_ai.egg-info → chunkr_ai-0.0.48}/PKG-INFO +1 -1
  2. {chunkr_ai-0.0.47 → chunkr_ai-0.0.48}/pyproject.toml +1 -1
  3. {chunkr_ai-0.0.47 → chunkr_ai-0.0.48}/src/chunkr_ai/api/chunkr.py +2 -2
  4. {chunkr_ai-0.0.47 → chunkr_ai-0.0.48}/src/chunkr_ai/api/misc.py +49 -17
  5. {chunkr_ai-0.0.47 → chunkr_ai-0.0.48/src/chunkr_ai.egg-info}/PKG-INFO +1 -1
  6. {chunkr_ai-0.0.47 → chunkr_ai-0.0.48}/src/chunkr_ai.egg-info/SOURCES.txt +2 -1
  7. {chunkr_ai-0.0.47 → chunkr_ai-0.0.48}/tests/test_chunkr.py +34 -111
  8. chunkr_ai-0.0.48/tests/test_file_handling.py +362 -0
  9. {chunkr_ai-0.0.47 → chunkr_ai-0.0.48}/LICENSE +0 -0
  10. {chunkr_ai-0.0.47 → chunkr_ai-0.0.48}/README.md +0 -0
  11. {chunkr_ai-0.0.47 → chunkr_ai-0.0.48}/setup.cfg +0 -0
  12. {chunkr_ai-0.0.47 → chunkr_ai-0.0.48}/src/chunkr_ai/__init__.py +0 -0
  13. {chunkr_ai-0.0.47 → chunkr_ai-0.0.48}/src/chunkr_ai/api/__init__.py +0 -0
  14. {chunkr_ai-0.0.47 → chunkr_ai-0.0.48}/src/chunkr_ai/api/auth.py +0 -0
  15. {chunkr_ai-0.0.47 → chunkr_ai-0.0.48}/src/chunkr_ai/api/chunkr_base.py +0 -0
  16. {chunkr_ai-0.0.47 → chunkr_ai-0.0.48}/src/chunkr_ai/api/configuration.py +0 -0
  17. {chunkr_ai-0.0.47 → chunkr_ai-0.0.48}/src/chunkr_ai/api/decorators.py +0 -0
  18. {chunkr_ai-0.0.47 → chunkr_ai-0.0.48}/src/chunkr_ai/api/protocol.py +0 -0
  19. {chunkr_ai-0.0.47 → chunkr_ai-0.0.48}/src/chunkr_ai/api/task_response.py +0 -0
  20. {chunkr_ai-0.0.47 → chunkr_ai-0.0.48}/src/chunkr_ai/models.py +0 -0
  21. {chunkr_ai-0.0.47 → chunkr_ai-0.0.48}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
  22. {chunkr_ai-0.0.47 → chunkr_ai-0.0.48}/src/chunkr_ai.egg-info/requires.txt +0 -0
  23. {chunkr_ai-0.0.47 → chunkr_ai-0.0.48}/src/chunkr_ai.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chunkr-ai
3
- Version: 0.0.47
3
+ Version: 0.0.48
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  License: MIT License
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "chunkr-ai"
7
- version = "0.0.47"
7
+ version = "0.0.48"
8
8
  authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
9
9
  description = "Python client for Chunkr: open source document intelligence"
10
10
  readme = "README.md"
@@ -16,7 +16,7 @@ class Chunkr(ChunkrBase):
16
16
  @ensure_client()
17
17
  async def upload(
18
18
  self,
19
- file: Union[str, Path, BinaryIO, Image.Image],
19
+ file: Union[str, Path, BinaryIO, Image.Image, bytes, bytearray, memoryview],
20
20
  config: Optional[Configuration] = None,
21
21
  filename: Optional[str] = None,
22
22
  ) -> TaskResponse:
@@ -34,7 +34,7 @@ class Chunkr(ChunkrBase):
34
34
  @retry_on_429()
35
35
  async def create_task(
36
36
  self,
37
- file: Union[str, Path, BinaryIO, Image.Image],
37
+ file: Union[str, Path, BinaryIO, Image.Image, bytes, bytearray, memoryview],
38
38
  config: Optional[Configuration] = None,
39
39
  filename: Optional[str] = None,
40
40
  ) -> TaskResponse:
@@ -3,9 +3,9 @@ import base64
3
3
  import io
4
4
  from pathlib import Path
5
5
  from PIL import Image
6
- from typing import Union, Tuple, BinaryIO, Optional
6
+ from typing import Union, Tuple, BinaryIO, Optional, Any
7
7
 
8
- async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[Optional[str], str]:
8
+ async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image, bytes, bytearray, memoryview]) -> Tuple[Optional[str], str]:
9
9
  """Convert various file types into a tuple of (filename, file content).
10
10
 
11
11
  Args:
@@ -15,6 +15,7 @@ async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[O
15
15
  - Local file path (will be converted to base64)
16
16
  - Opened binary file (will be converted to base64)
17
17
  - PIL/Pillow Image object (will be converted to base64)
18
+ - Bytes object (will be converted to base64)
18
19
 
19
20
  Returns:
20
21
  Tuple[Optional[str], str]: (filename, content) where content is either a URL or base64 string
@@ -26,22 +27,54 @@ async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[O
26
27
  ValueError: If the URL is invalid or unreachable
27
28
  ValueError: If the MIME type is unsupported
28
29
  """
29
- # Handle strings
30
+ # Handle bytes-like objects
31
+ if isinstance(file, (bytes, bytearray, memoryview)):
32
+ # Convert to bytes first if it's not already
33
+ file_bytes = bytes(file)
34
+
35
+ # Check if this might be an already-encoded base64 string in bytes form
36
+ try:
37
+ # Try to decode the bytes to a string and see if it's valid base64
38
+ potential_base64 = file_bytes.decode('utf-8', errors='strict')
39
+ base64.b64decode(potential_base64)
40
+ # If we get here, it was a valid base64 string in bytes form
41
+ return None, potential_base64
42
+ except:
43
+ # Not a base64 string in bytes form, encode it as base64
44
+ base64_str = base64.b64encode(file_bytes).decode()
45
+ return None, base64_str
46
+
47
+ # Handle strings - urls or paths or base64
30
48
  if isinstance(file, str):
49
+ # Handle URLs
31
50
  if file.startswith(('http://', 'https://')):
32
51
  return None, file
33
- # Try to handle as a file path first
34
- path = Path(file)
35
- if path.exists():
36
- # It's a valid file path, convert to Path object and continue processing
37
- file = path
38
- else:
39
- # If not a valid file path, try treating as base64
52
+
53
+ # Handle data URLs
54
+ if file.startswith('data:'):
55
+ return None, file
56
+
57
+ # Try to handle as a file path
58
+ try:
59
+ path = Path(file)
60
+ if path.exists():
61
+ # It's a valid file path, convert to Path object and continue processing
62
+ file = path
63
+ else:
64
+ # If not a valid file path, try treating as base64
65
+ try:
66
+ # Just test if it's valid base64, don't store the result
67
+ base64.b64decode(file)
68
+ return None, file
69
+ except:
70
+ raise ValueError(f"File not found: {file} and it's not a valid base64 string")
71
+ except Exception as e:
72
+ # If string can't be converted to Path or decoded as base64, it might still be a base64 string
40
73
  try:
41
74
  base64.b64decode(file)
42
75
  return None, file
43
76
  except:
44
- raise ValueError(f"File not found: {file} and it's not a valid base64 string")
77
+ raise ValueError(f"Unable to process file: {e}")
45
78
 
46
79
  # Handle file paths - convert to base64
47
80
  if isinstance(file, Path):
@@ -71,17 +104,16 @@ async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[O
71
104
  file.seek(0)
72
105
  file_content = file.read()
73
106
  name = getattr(file, "name", "document")
74
- file_ext = Path(name).suffix.lower().lstrip('.')
75
- if not file_ext:
76
- raise ValueError("File must have an extension")
107
+ if not name or not isinstance(name, str):
108
+ name = None
77
109
  base64_str = base64.b64encode(file_content).decode()
78
- return Path(name).name, base64_str
110
+ return name, base64_str
79
111
 
80
112
  raise TypeError(f"Unsupported file type: {type(file)}")
81
113
 
82
114
 
83
115
  async def prepare_upload_data(
84
- file: Optional[Union[str, Path, BinaryIO, Image.Image]] = None,
116
+ file: Optional[Union[str, Path, BinaryIO, Image.Image, bytes, bytearray, memoryview]] = None,
85
117
  filename: Optional[str] = None,
86
118
  config: Optional[Configuration] = None,
87
119
  ) -> dict:
@@ -89,8 +121,8 @@ async def prepare_upload_data(
89
121
 
90
122
  Args:
91
123
  file: The file to upload
124
+ filename: Optional filename to use (overrides any filename from the file)
92
125
  config: Optional configuration settings
93
- client: HTTP client for downloading remote files
94
126
 
95
127
  Returns:
96
128
  dict: JSON-serializable data dictionary ready for upload
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chunkr-ai
3
- Version: 0.0.47
3
+ Version: 0.0.48
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  License: MIT License
@@ -17,4 +17,5 @@ src/chunkr_ai/api/decorators.py
17
17
  src/chunkr_ai/api/misc.py
18
18
  src/chunkr_ai/api/protocol.py
19
19
  src/chunkr_ai/api/task_response.py
20
- tests/test_chunkr.py
20
+ tests/test_chunkr.py
21
+ tests/test_file_handling.py
@@ -3,6 +3,9 @@ from pathlib import Path
3
3
  from PIL import Image
4
4
  import asyncio
5
5
  import base64
6
+ import io
7
+ import tempfile
8
+ from typing import Awaitable
6
9
 
7
10
  from chunkr_ai import Chunkr
8
11
  from chunkr_ai.models import (
@@ -164,57 +167,6 @@ def model_fallback_config():
164
167
  ),
165
168
  )
166
169
 
167
- @pytest.mark.asyncio
168
- async def test_send_file_path(client, sample_path):
169
- response = await client.upload(sample_path)
170
- assert response.task_id is not None
171
- assert response.status == "Succeeded"
172
- assert response.output is not None
173
-
174
- @pytest.mark.asyncio
175
- async def test_send_file_path_str(client, sample_absolute_path_str):
176
- response = await client.upload(sample_absolute_path_str)
177
- assert response.task_id is not None
178
- assert response.status == "Succeeded"
179
- assert response.output is not None
180
-
181
- @pytest.mark.asyncio
182
- async def test_send_file_relative_path_str(client, sample_relative_path_str):
183
- response = await client.upload(sample_relative_path_str)
184
- assert response.task_id is not None
185
- assert response.status == "Succeeded"
186
- assert response.output is not None
187
-
188
- @pytest.mark.asyncio
189
- async def test_send_file_url(client, sample_url):
190
- response = await client.upload(sample_url)
191
- assert response.task_id is not None
192
- assert response.status == "Succeeded"
193
- assert response.output is not None
194
-
195
- @pytest.mark.asyncio
196
- async def test_send_file_path_as_str(client, sample_path):
197
- response = await client.upload(str(sample_path))
198
- assert response.task_id is not None
199
- assert response.status == "Succeeded"
200
- assert response.output is not None
201
-
202
- @pytest.mark.asyncio
203
- async def test_send_opened_file(client, sample_path):
204
- with open(sample_path, "rb") as f:
205
- response = await client.upload(f)
206
- assert response.task_id is not None
207
- assert response.status == "Succeeded"
208
- assert response.output is not None
209
-
210
- @pytest.mark.asyncio
211
- async def test_send_pil_image(client, sample_image):
212
- response = await client.upload(sample_image)
213
- assert response.task_id is not None
214
- assert response.status == "Succeeded"
215
- assert response.output is not None
216
- assert response.output is not None
217
-
218
170
  @pytest.mark.asyncio
219
171
  async def test_ocr_auto(client, sample_path):
220
172
  response = await client.upload(sample_path, Configuration(ocr_strategy=OcrStrategy.AUTO))
@@ -313,7 +265,7 @@ async def test_cancel_task(client, sample_path):
313
265
  @pytest.mark.asyncio
314
266
  async def test_cancel_task_direct(client, sample_path):
315
267
  task = await client.create_task(sample_path)
316
- assert isinstance(task, TaskResponse)
268
+ assert isinstance(task, Awaitable) and isinstance(task, TaskResponse)
317
269
  assert task.status == "Starting"
318
270
  await task.cancel()
319
271
  assert task.status == "Cancelled"
@@ -375,36 +327,6 @@ async def test_task_operations_after_client_close(client, sample_path):
375
327
  result = await task.poll()
376
328
  assert result.status == "Succeeded"
377
329
 
378
- @pytest.mark.asyncio
379
- async def test_send_base64_file(client, sample_path):
380
- # Read file and convert to base64
381
- with open(sample_path, "rb") as f:
382
- base64_content = base64.b64encode(f.read()).decode('utf-8')
383
- response = await client.upload(base64_content)
384
- assert response.task_id is not None
385
- assert response.status == "Succeeded"
386
- assert response.output is not None
387
-
388
- @pytest.mark.asyncio
389
- async def test_send_base64_file_with_data_url(client, sample_path):
390
- with open(sample_path, "rb") as f:
391
- base64_content = base64.b64encode(f.read()).decode('utf-8')
392
- response = await client.upload(f"data:application/pdf;base64,{base64_content}")
393
- assert response.task_id is not None
394
- assert response.status == "Succeeded"
395
- assert response.output is not None
396
-
397
- @pytest.mark.asyncio
398
- async def test_send_base64_file_with_filename(client, sample_path):
399
- # Read file and convert to base64
400
- with open(sample_path, "rb") as f:
401
- base64_content = base64.b64encode(f.read()).decode('utf-8')
402
-
403
- response = await client.upload(base64_content, filename="test.pdf")
404
- assert response.task_id is not None
405
- assert response.status == "Succeeded"
406
- assert response.output is not None
407
-
408
330
  @pytest.mark.asyncio
409
331
  async def test_output_files_no_dir(client, sample_path, tmp_path):
410
332
  task = await client.upload(sample_path)
@@ -444,6 +366,35 @@ async def test_output_files_with_dirs(client, sample_path, tmp_path):
444
366
  assert content_file.exists()
445
367
  assert json_file.exists()
446
368
 
369
+
370
+ @pytest.mark.asyncio
371
+ async def test_combined_config_with_llm_and_other_settings(client, sample_path):
372
+ # Test combining LLM settings with other configuration options
373
+ config = Configuration(
374
+ llm_processing=LlmProcessing(
375
+ model_id="qwen-2.5-vl-7b-instruct",
376
+ fallback_strategy=FallbackStrategy.model("gemini-flash-2.0"),
377
+ temperature=0.4
378
+ ),
379
+ segmentation_strategy=SegmentationStrategy.PAGE,
380
+ segment_processing=SegmentProcessing(
381
+ Page=GenerationConfig(
382
+ html=GenerationStrategy.LLM,
383
+ markdown=GenerationStrategy.LLM
384
+ )
385
+ ),
386
+ chunk_processing=ChunkProcessing(target_length=1024)
387
+ )
388
+
389
+ response = await client.upload(sample_path, config)
390
+ assert response.task_id is not None
391
+ assert response.status == "Succeeded"
392
+ assert response.output is not None
393
+ assert response.configuration.llm_processing is not None
394
+ assert response.configuration.llm_processing.model_id == "qwen-2.5-vl-7b-instruct"
395
+ assert response.configuration.segmentation_strategy == SegmentationStrategy.PAGE
396
+ assert response.configuration.chunk_processing.target_length == 1024
397
+
447
398
  @pytest.mark.asyncio
448
399
  async def test_embed_sources_markdown_only(client, sample_path, markdown_embed_config):
449
400
  response = await client.upload(sample_path, markdown_embed_config)
@@ -577,32 +528,4 @@ async def test_fallback_strategy_serialization():
577
528
  # Test string representation
578
529
  assert str(none_strategy) == "None"
579
530
  assert str(default_strategy) == "Default"
580
- assert str(model_strategy) == "Model(gpt-4.1)"
581
-
582
- @pytest.mark.asyncio
583
- async def test_combined_config_with_llm_and_other_settings(client, sample_path):
584
- # Test combining LLM settings with other configuration options
585
- config = Configuration(
586
- llm_processing=LlmProcessing(
587
- model_id="qwen-2.5-vl-7b-instruct",
588
- fallback_strategy=FallbackStrategy.model("gemini-flash-2.0"),
589
- temperature=0.4
590
- ),
591
- segmentation_strategy=SegmentationStrategy.PAGE,
592
- segment_processing=SegmentProcessing(
593
- Page=GenerationConfig(
594
- html=GenerationStrategy.LLM,
595
- markdown=GenerationStrategy.LLM
596
- )
597
- ),
598
- chunk_processing=ChunkProcessing(target_length=1024)
599
- )
600
-
601
- response = await client.upload(sample_path, config)
602
- assert response.task_id is not None
603
- assert response.status == "Succeeded"
604
- assert response.output is not None
605
- assert response.configuration.llm_processing is not None
606
- assert response.configuration.llm_processing.model_id == "qwen-2.5-vl-7b-instruct"
607
- assert response.configuration.segmentation_strategy == SegmentationStrategy.PAGE
608
- assert response.configuration.chunk_processing.target_length == 1024
531
+ assert str(model_strategy) == "Model(gpt-4.1)"
@@ -0,0 +1,362 @@
1
+ import pytest
2
+ from pathlib import Path
3
+ from PIL import Image
4
+ import base64
5
+ import io
6
+ import tempfile
7
+
8
+ from chunkr_ai import Chunkr
9
+
10
+ @pytest.fixture
11
+ def sample_path():
12
+ return Path("tests/files/test.pdf")
13
+
14
+ @pytest.fixture
15
+ def sample_url():
16
+ return "https://chunkr-web.s3.us-east-1.amazonaws.com/landing_page/input/science.pdf"
17
+
18
+ @pytest.fixture
19
+ def sample_image():
20
+ return Image.open("tests/files/test.jpg")
21
+
22
+ @pytest.fixture
23
+ def client():
24
+ client = Chunkr()
25
+ yield client
26
+
27
+ @pytest.mark.asyncio
28
+ async def test_send_file_path(client, sample_path):
29
+ response = await client.upload(sample_path)
30
+ assert response.task_id is not None
31
+ assert response.status == "Succeeded"
32
+ assert response.output is not None
33
+
34
+ @pytest.mark.asyncio
35
+ async def test_send_file_path_str(client, sample_path):
36
+ response = await client.upload(str(sample_path))
37
+ assert response.task_id is not None
38
+ assert response.status == "Succeeded"
39
+ assert response.output is not None
40
+
41
+ @pytest.mark.asyncio
42
+ async def test_send_file_relative_path_str(client):
43
+ response = await client.upload("./tests/files/test.pdf")
44
+ assert response.task_id is not None
45
+ assert response.status == "Succeeded"
46
+ assert response.output is not None
47
+
48
+ @pytest.mark.asyncio
49
+ async def test_send_file_url(client, sample_url):
50
+ response = await client.upload(sample_url)
51
+ assert response.task_id is not None
52
+ assert response.status == "Succeeded"
53
+ assert response.output is not None
54
+
55
+ @pytest.mark.asyncio
56
+ async def test_send_opened_file(client, sample_path):
57
+ with open(sample_path, "rb") as f:
58
+ response = await client.upload(f)
59
+ assert response.task_id is not None
60
+ assert response.status == "Succeeded"
61
+ assert response.output is not None
62
+
63
+ @pytest.mark.asyncio
64
+ async def test_send_pil_image(client, sample_image):
65
+ response = await client.upload(sample_image)
66
+ assert response.task_id is not None
67
+ assert response.status == "Succeeded"
68
+ assert response.output is not None
69
+ assert response.output is not None
70
+
71
+ @pytest.mark.asyncio
72
+ async def test_send_base64_file(client, sample_path):
73
+ # Read file and convert to base64
74
+ with open(sample_path, "rb") as f:
75
+ base64_content = base64.b64encode(f.read())
76
+ response = await client.upload(base64_content)
77
+ assert response.task_id is not None
78
+ assert response.status == "Succeeded"
79
+ assert response.output is not None
80
+
81
+ @pytest.mark.asyncio
82
+ async def test_send_base64_file_w_decode(client, sample_path):
83
+ # Read file and convert to base64
84
+ with open(sample_path, "rb") as f:
85
+ base64_content = base64.b64encode(f.read()).decode()
86
+ response = await client.upload(base64_content)
87
+ assert response.task_id is not None
88
+ assert response.status == "Succeeded"
89
+ assert response.output is not None
90
+
91
+ @pytest.mark.asyncio
92
+ async def test_send_base64_file_with_data_url(client, sample_path):
93
+ with open(sample_path, "rb") as f:
94
+ base64_content = base64.b64encode(f.read()).decode('utf-8')
95
+ response = await client.upload(f"data:application/pdf;base64,{base64_content}")
96
+ assert response.task_id is not None
97
+ assert response.status == "Succeeded"
98
+ assert response.output is not None
99
+
100
+ @pytest.mark.asyncio
101
+ async def test_send_base64_file_with_filename(client, sample_path):
102
+ # Read file and convert to base64
103
+ with open(sample_path, "rb") as f:
104
+ base64_content = base64.b64encode(f.read()).decode('utf-8')
105
+
106
+ response = await client.upload(base64_content, filename="test.pdf")
107
+ assert response.task_id is not None
108
+ assert response.status == "Succeeded"
109
+ assert response.output is not None
110
+
111
+ @pytest.mark.asyncio
112
+ async def test_file_like_no_name_attribute(client, sample_path):
113
+ # Create a file-like object without a name attribute
114
+ class NamelessBuffer:
115
+ def __init__(self, content):
116
+ self.buffer = io.BytesIO(content)
117
+
118
+ def read(self):
119
+ return self.buffer.read()
120
+
121
+ def seek(self, pos):
122
+ return self.buffer.seek(pos)
123
+
124
+ with open(sample_path, "rb") as f:
125
+ content = f.read()
126
+
127
+ nameless_buffer = NamelessBuffer(content)
128
+ response = await client.upload(nameless_buffer, filename="test.pdf")
129
+ assert response.task_id is not None
130
+ assert response.status == "Succeeded"
131
+ assert response.output is not None
132
+
133
+ @pytest.mark.asyncio
134
+ async def test_file_like_none_name(client, sample_path):
135
+ # Create a file-like object with None as name
136
+ class NoneNameBuffer:
137
+ def __init__(self, content):
138
+ self.buffer = io.BytesIO(content)
139
+ self.name = None
140
+
141
+ def read(self):
142
+ return self.buffer.read()
143
+
144
+ def seek(self, pos):
145
+ return self.buffer.seek(pos)
146
+
147
+ with open(sample_path, "rb") as f:
148
+ content = f.read()
149
+
150
+ none_name_buffer = NoneNameBuffer(content)
151
+ response = await client.upload(none_name_buffer, filename="test.pdf")
152
+ assert response.task_id is not None
153
+ assert response.status == "Succeeded"
154
+ assert response.output is not None
155
+
156
+ @pytest.mark.asyncio
157
+ async def test_file_like_no_extension(client, sample_path):
158
+ # Create a file-like object with a name but no extension
159
+ class NoExtensionBuffer:
160
+ def __init__(self, content):
161
+ self.buffer = io.BytesIO(content)
162
+ self.name = "test_document"
163
+
164
+ def read(self):
165
+ return self.buffer.read()
166
+
167
+ def seek(self, pos):
168
+ return self.buffer.seek(pos)
169
+
170
+ with open(sample_path, "rb") as f:
171
+ content = f.read()
172
+
173
+ no_ext_buffer = NoExtensionBuffer(content)
174
+ response = await client.upload(no_ext_buffer, filename="test.pdf")
175
+ assert response.task_id is not None
176
+ assert response.status == "Succeeded"
177
+ assert response.output is not None
178
+
179
+ @pytest.mark.asyncio
180
+ async def test_spooled_temporary_file(client, sample_path):
181
+ # Test with SpooledTemporaryFile which is what the user is using
182
+ with open(sample_path, "rb") as f:
183
+ content = f.read()
184
+
185
+ temp_file = tempfile.SpooledTemporaryFile()
186
+ temp_file.write(content)
187
+ temp_file.seek(0)
188
+
189
+ response = await client.upload(temp_file, filename="test.pdf")
190
+ assert response.task_id is not None
191
+ assert response.status == "Succeeded"
192
+ assert response.output is not None
193
+
194
+ @pytest.mark.asyncio
195
+ async def test_send_bytearray(client, sample_path):
196
+ # Read file and convert to bytearray
197
+ with open(sample_path, "rb") as f:
198
+ content = bytearray(f.read())
199
+
200
+ response = await client.upload(content, filename="test.pdf")
201
+ assert response.task_id is not None
202
+ assert response.status == "Succeeded"
203
+ assert response.output is not None
204
+
205
+ @pytest.mark.asyncio
206
+ async def test_send_memoryview(client, sample_path):
207
+ # Read file and convert to memoryview
208
+ with open(sample_path, "rb") as f:
209
+ content_bytes = f.read()
210
+ content = memoryview(content_bytes)
211
+
212
+ response = await client.upload(content, filename="test.pdf")
213
+ assert response.task_id is not None
214
+ assert response.status == "Succeeded"
215
+ assert response.output is not None
216
+
217
+ @pytest.mark.asyncio
218
+ async def test_with_explicit_filename_pdf(client, sample_path):
219
+ response = await client.upload(sample_path, filename="custom_name.pdf")
220
+ assert response.task_id is not None
221
+ assert response.status == "Succeeded"
222
+ assert response.output is not None
223
+
224
+ @pytest.mark.asyncio
225
+ async def test_with_explicit_filename_image(client, sample_image):
226
+ response = await client.upload(sample_image, filename="custom_image.jpg")
227
+ assert response.task_id is not None
228
+ assert response.status == "Succeeded"
229
+ assert response.output is not None
230
+
231
+ @pytest.mark.asyncio
232
+ async def test_with_special_character_filename(client, sample_path):
233
+ response = await client.upload(sample_path, filename="test file (1)&%$#@!.pdf")
234
+ assert response.task_id is not None
235
+ assert response.status == "Succeeded"
236
+ assert response.output is not None
237
+
238
+ @pytest.mark.asyncio
239
+ async def test_filename_with_non_matching_extension(client, sample_path):
240
+ # Test providing a filename with a different extension than the actual file
241
+ response = await client.upload(sample_path, filename="document.docx")
242
+ assert response.task_id is not None
243
+ assert response.status == "Succeeded"
244
+ assert response.output is not None
245
+
246
+ @pytest.mark.asyncio
247
+ async def test_bytes_with_explicit_filename(client, sample_path):
248
+ with open(sample_path, "rb") as f:
249
+ content = f.read()
250
+
251
+ # For bytes objects, filename is required to know the file type
252
+ response = await client.upload(content, filename="document.pdf")
253
+ assert response.task_id is not None
254
+ assert response.status == "Succeeded"
255
+ assert response.output is not None
256
+
257
+ @pytest.mark.asyncio
258
+ async def test_bytearray_with_explicit_filename(client, sample_path):
259
+ with open(sample_path, "rb") as f:
260
+ content = bytearray(f.read())
261
+
262
+ response = await client.upload(content, filename="document.pdf")
263
+ assert response.task_id is not None
264
+ assert response.status == "Succeeded"
265
+ assert response.output is not None
266
+
267
+ @pytest.mark.asyncio
268
+ async def test_memoryview_with_explicit_filename(client, sample_path):
269
+ with open(sample_path, "rb") as f:
270
+ content_bytes = f.read()
271
+ content = memoryview(content_bytes)
272
+
273
+ response = await client.upload(content, filename="document.pdf")
274
+ assert response.task_id is not None
275
+ assert response.status == "Succeeded"
276
+ assert response.output is not None
277
+
278
+ @pytest.mark.asyncio
279
+ async def test_unicode_filename(client, sample_path):
280
+ # Test with a filename containing Unicode characters
281
+ response = await client.upload(sample_path, filename="测试文件.pdf")
282
+ assert response.task_id is not None
283
+ assert response.status == "Succeeded"
284
+ assert response.output is not None
285
+
286
+ @pytest.mark.asyncio
287
+ async def test_very_long_filename(client, sample_path):
288
+ # Test with an extremely long filename
289
+ long_name = "a" * 200 + ".pdf" # 200 character filename
290
+ response = await client.upload(sample_path, filename=long_name)
291
+ assert response.task_id is not None
292
+ assert response.status == "Succeeded"
293
+ assert response.output is not None
294
+
295
+ @pytest.mark.asyncio
296
+ async def test_filename_without_extension(client, sample_path):
297
+ # Test with a filename that has no extension
298
+ with open(sample_path, "rb") as f:
299
+ content = f.read()
300
+
301
+ # This test verifies that the system uses the provided filename even without extension
302
+ response = await client.upload(content, filename="document_without_extension")
303
+ assert response.task_id is not None
304
+ assert response.status == "Succeeded"
305
+ assert response.output is not None
306
+
307
+ @pytest.mark.asyncio
308
+ async def test_custom_file_like_with_filename(client, sample_path):
309
+ # A more complex file-like object implementation
310
+ class CustomFileWrapper:
311
+ def __init__(self, content):
312
+ self.buffer = io.BytesIO(content)
313
+ self.position = 0
314
+ self.name = "original_name.txt" # Should be overridden by explicit filename
315
+
316
+ def read(self, size=-1):
317
+ return self.buffer.read(size)
318
+
319
+ def seek(self, position, whence=0):
320
+ return self.buffer.seek(position, whence)
321
+
322
+ def tell(self):
323
+ return self.buffer.tell()
324
+
325
+ def close(self):
326
+ self.buffer.close()
327
+
328
+ with open(sample_path, "rb") as f:
329
+ content = f.read()
330
+
331
+ custom_file = CustomFileWrapper(content)
332
+ response = await client.upload(custom_file, filename="custom_wrapper.pdf")
333
+ assert response.task_id is not None
334
+ assert response.status == "Succeeded"
335
+ assert response.output is not None
336
+
337
+ @pytest.mark.asyncio
338
+ async def test_seek_at_nonzero_position(client, sample_path):
339
+ # Test with a file-like object that's not at position 0
340
+ with open(sample_path, "rb") as f:
341
+ content = f.read()
342
+
343
+ buffer = io.BytesIO(content)
344
+ buffer.seek(100) # Move position to 100
345
+
346
+ response = await client.upload(buffer, filename="seek_test.pdf")
347
+ assert response.task_id is not None
348
+ assert response.status == "Succeeded"
349
+ assert response.output is not None
350
+
351
+ @pytest.mark.asyncio
352
+ async def test_reused_file_object(client, sample_path):
353
+ # Test that a file object can be reused after being processed
354
+ with open(sample_path, "rb") as f:
355
+ response1 = await client.upload(f, filename="first_use.pdf")
356
+ f.seek(0) # Reset position
357
+ response2 = await client.upload(f, filename="second_use.pdf")
358
+
359
+ assert response1.task_id is not None
360
+ assert response1.status == "Succeeded"
361
+ assert response2.task_id is not None
362
+ assert response2.status == "Succeeded"
File without changes
File without changes
File without changes