chunkr-ai 0.0.46__tar.gz → 0.0.48__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23) hide show
  1. {chunkr_ai-0.0.46/src/chunkr_ai.egg-info → chunkr_ai-0.0.48}/PKG-INFO +1 -1
  2. {chunkr_ai-0.0.46 → chunkr_ai-0.0.48}/pyproject.toml +1 -1
  3. {chunkr_ai-0.0.46 → chunkr_ai-0.0.48}/src/chunkr_ai/api/auth.py +1 -0
  4. {chunkr_ai-0.0.46 → chunkr_ai-0.0.48}/src/chunkr_ai/api/chunkr.py +19 -13
  5. {chunkr_ai-0.0.46 → chunkr_ai-0.0.48}/src/chunkr_ai/api/chunkr_base.py +12 -6
  6. {chunkr_ai-0.0.46 → chunkr_ai-0.0.48}/src/chunkr_ai/api/decorators.py +7 -10
  7. {chunkr_ai-0.0.46 → chunkr_ai-0.0.48}/src/chunkr_ai/api/misc.py +50 -14
  8. {chunkr_ai-0.0.46 → chunkr_ai-0.0.48}/src/chunkr_ai/api/task_response.py +41 -18
  9. {chunkr_ai-0.0.46 → chunkr_ai-0.0.48/src/chunkr_ai.egg-info}/PKG-INFO +1 -1
  10. {chunkr_ai-0.0.46 → chunkr_ai-0.0.48}/src/chunkr_ai.egg-info/SOURCES.txt +2 -1
  11. {chunkr_ai-0.0.46 → chunkr_ai-0.0.48}/tests/test_chunkr.py +49 -104
  12. chunkr_ai-0.0.48/tests/test_file_handling.py +362 -0
  13. {chunkr_ai-0.0.46 → chunkr_ai-0.0.48}/LICENSE +0 -0
  14. {chunkr_ai-0.0.46 → chunkr_ai-0.0.48}/README.md +0 -0
  15. {chunkr_ai-0.0.46 → chunkr_ai-0.0.48}/setup.cfg +0 -0
  16. {chunkr_ai-0.0.46 → chunkr_ai-0.0.48}/src/chunkr_ai/__init__.py +0 -0
  17. {chunkr_ai-0.0.46 → chunkr_ai-0.0.48}/src/chunkr_ai/api/__init__.py +0 -0
  18. {chunkr_ai-0.0.46 → chunkr_ai-0.0.48}/src/chunkr_ai/api/configuration.py +0 -0
  19. {chunkr_ai-0.0.46 → chunkr_ai-0.0.48}/src/chunkr_ai/api/protocol.py +0 -0
  20. {chunkr_ai-0.0.46 → chunkr_ai-0.0.48}/src/chunkr_ai/models.py +0 -0
  21. {chunkr_ai-0.0.46 → chunkr_ai-0.0.48}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
  22. {chunkr_ai-0.0.46 → chunkr_ai-0.0.48}/src/chunkr_ai.egg-info/requires.txt +0 -0
  23. {chunkr_ai-0.0.46 → chunkr_ai-0.0.48}/src/chunkr_ai.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chunkr-ai
3
- Version: 0.0.46
3
+ Version: 0.0.48
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  License: MIT License
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "chunkr-ai"
7
- version = "0.0.46"
7
+ version = "0.0.48"
8
8
  authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
9
9
  description = "Python client for Chunkr: open source document intelligence"
10
10
  readme = "README.md"
@@ -1,5 +1,6 @@
1
1
  class HeadersMixin:
2
2
  """Mixin class for handling authorization headers"""
3
+ _api_key: str = ""
3
4
 
4
5
  def get_api_key(self) -> str:
5
6
  """Get the API key"""
@@ -1,12 +1,13 @@
1
1
  from pathlib import Path
2
2
  from PIL import Image
3
- from typing import Union, BinaryIO, Optional
3
+ from typing import Union, BinaryIO, Optional, cast, Awaitable
4
4
 
5
5
  from .configuration import Configuration
6
6
  from .decorators import anywhere, ensure_client, retry_on_429
7
7
  from .misc import prepare_upload_data
8
8
  from .task_response import TaskResponse
9
9
  from .chunkr_base import ChunkrBase
10
+ from .protocol import ChunkrClientProtocol
10
11
 
11
12
  class Chunkr(ChunkrBase):
12
13
  """Chunkr API client that works in both sync and async contexts"""
@@ -15,49 +16,51 @@ class Chunkr(ChunkrBase):
15
16
  @ensure_client()
16
17
  async def upload(
17
18
  self,
18
- file: Union[str, Path, BinaryIO, Image.Image],
19
- config: Configuration = None,
19
+ file: Union[str, Path, BinaryIO, Image.Image, bytes, bytearray, memoryview],
20
+ config: Optional[Configuration] = None,
20
21
  filename: Optional[str] = None,
21
22
  ) -> TaskResponse:
22
- task = await self.create_task(file, config, filename)
23
- return await task.poll()
23
+ task = await cast(Awaitable[TaskResponse], self.create_task(file, config, filename))
24
+ return await cast(Awaitable[TaskResponse], task.poll())
24
25
 
25
26
  @anywhere()
26
27
  @ensure_client()
27
28
  async def update(self, task_id: str, config: Configuration) -> TaskResponse:
28
- task = await self.update_task(task_id, config)
29
- return await task.poll()
29
+ task = await cast(Awaitable[TaskResponse], self.update_task(task_id, config))
30
+ return await cast(Awaitable[TaskResponse], task.poll())
30
31
 
31
32
  @anywhere()
32
33
  @ensure_client()
33
34
  @retry_on_429()
34
35
  async def create_task(
35
36
  self,
36
- file: Union[str, Path, BinaryIO, Image.Image],
37
- config: Configuration = None,
37
+ file: Union[str, Path, BinaryIO, Image.Image, bytes, bytearray, memoryview],
38
+ config: Optional[Configuration] = None,
38
39
  filename: Optional[str] = None,
39
40
  ) -> TaskResponse:
40
41
  """Create a new task with the given file and configuration."""
41
42
  data = await prepare_upload_data(file, filename, config)
43
+ assert self._client is not None
42
44
  r = await self._client.post(
43
45
  f"{self.url}/api/v1/task/parse", json=data, headers=self._headers()
44
46
  )
45
47
  r.raise_for_status()
46
- return TaskResponse(**r.json()).with_client(self, True, False)
48
+ return TaskResponse(**r.json()).with_client(cast(ChunkrClientProtocol, self), True, False)
47
49
 
48
50
  @anywhere()
49
51
  @ensure_client()
50
52
  @retry_on_429()
51
- async def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
53
+ async def update_task(self, task_id: str, config: Optional[Configuration] = None) -> TaskResponse:
52
54
  """Update an existing task with new configuration."""
53
55
  data = await prepare_upload_data(None, None, config)
56
+ assert self._client is not None
54
57
  r = await self._client.patch(
55
58
  f"{self.url}/api/v1/task/{task_id}/parse",
56
59
  json=data,
57
60
  headers=self._headers(),
58
61
  )
59
62
  r.raise_for_status()
60
- return TaskResponse(**r.json()).with_client(self, True, False)
63
+ return TaskResponse(**r.json()).with_client(cast(ChunkrClientProtocol, self), True, False)
61
64
 
62
65
  @anywhere()
63
66
  @ensure_client()
@@ -66,17 +69,19 @@ class Chunkr(ChunkrBase):
66
69
  "base64_urls": str(base64_urls).lower(),
67
70
  "include_chunks": str(include_chunks).lower()
68
71
  }
72
+ assert self._client is not None
69
73
  r = await self._client.get(
70
74
  f"{self.url}/api/v1/task/{task_id}",
71
75
  params=params,
72
76
  headers=self._headers()
73
77
  )
74
78
  r.raise_for_status()
75
- return TaskResponse(**r.json()).with_client(self, include_chunks, base64_urls)
79
+ return TaskResponse(**r.json()).with_client(cast(ChunkrClientProtocol, self), include_chunks, base64_urls)
76
80
 
77
81
  @anywhere()
78
82
  @ensure_client()
79
83
  async def delete_task(self, task_id: str) -> None:
84
+ assert self._client is not None
80
85
  r = await self._client.delete(
81
86
  f"{self.url}/api/v1/task/{task_id}", headers=self._headers()
82
87
  )
@@ -85,6 +90,7 @@ class Chunkr(ChunkrBase):
85
90
  @anywhere()
86
91
  @ensure_client()
87
92
  async def cancel_task(self, task_id: str) -> None:
93
+ assert self._client is not None
88
94
  r = await self._client.get(
89
95
  f"{self.url}/api/v1/task/{task_id}/cancel", headers=self._headers()
90
96
  )
@@ -18,17 +18,23 @@ class ChunkrBase(HeadersMixin):
18
18
  raise_on_failure: Whether to raise an exception if the task fails. Defaults to False.
19
19
  """
20
20
 
21
- def __init__(self, url: str = None, api_key: str = None, raise_on_failure: bool = False):
21
+ url: str
22
+ _api_key: str
23
+ raise_on_failure: bool
24
+ _client: Optional[httpx.AsyncClient]
25
+
26
+ def __init__(self, url: Optional[str] = None, api_key: Optional[str] = None, raise_on_failure: bool = False):
22
27
  load_dotenv(override=True)
23
28
  self.url = url or os.getenv("CHUNKR_URL") or "https://api.chunkr.ai"
24
- self._api_key = api_key or os.getenv("CHUNKR_API_KEY")
29
+ _api_key = api_key or os.getenv("CHUNKR_API_KEY")
25
30
  self.raise_on_failure = raise_on_failure
26
31
 
27
- if not self._api_key:
32
+ if not _api_key:
28
33
  raise ValueError(
29
34
  "API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai"
30
35
  )
31
36
 
37
+ self._api_key = _api_key
32
38
  self.url = self.url.rstrip("/")
33
39
  self._client = httpx.AsyncClient()
34
40
 
@@ -36,7 +42,7 @@ class ChunkrBase(HeadersMixin):
36
42
  def upload(
37
43
  self,
38
44
  file: Union[str, Path, BinaryIO, Image.Image],
39
- config: Configuration = None,
45
+ config: Optional[Configuration] = None,
40
46
  filename: Optional[str] = None,
41
47
  ) -> TaskResponse:
42
48
  """Upload a file and wait for processing to complete.
@@ -90,7 +96,7 @@ class ChunkrBase(HeadersMixin):
90
96
  def create_task(
91
97
  self,
92
98
  file: Union[str, Path, BinaryIO, Image.Image],
93
- config: Configuration = None,
99
+ config: Optional[Configuration] = None,
94
100
  filename: Optional[str] = None,
95
101
  ) -> TaskResponse:
96
102
  """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`.
@@ -127,7 +133,7 @@ class ChunkrBase(HeadersMixin):
127
133
 
128
134
  @abstractmethod
129
135
  def update_task(
130
- self, task_id: str, config: Configuration
136
+ self, task_id: str, config: Optional[Configuration] = None
131
137
  ) -> TaskResponse:
132
138
  """Update a task by its ID and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`.
133
139
 
@@ -13,10 +13,7 @@ P = ParamSpec('P')
13
13
 
14
14
  _sync_loop = None
15
15
 
16
- @overload
17
- def anywhere() -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Union[Awaitable[T], T]]]: ...
18
-
19
- def anywhere():
16
+ def anywhere() -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Union[Awaitable[T], T]]]:
20
17
  """Decorator that allows an async function to run anywhere - sync or async context."""
21
18
  def decorator(async_func: Callable[P, Awaitable[T]]) -> Callable[P, Union[Awaitable[T], T]]:
22
19
  @functools.wraps(async_func)
@@ -42,22 +39,22 @@ def anywhere():
42
39
  return wrapper
43
40
  return decorator
44
41
 
45
- def ensure_client() -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]:
42
+ def ensure_client() -> Callable[[Callable[..., Awaitable[T]]], Callable[..., Awaitable[T]]]:
46
43
  """Decorator that ensures a valid httpx.AsyncClient exists before executing the method"""
47
- def decorator(async_func: Callable[P, Awaitable[T]]) -> Callable[P, Awaitable[T]]:
44
+ def decorator(async_func: Callable[..., Awaitable[T]]) -> Callable[..., Awaitable[T]]:
48
45
  @functools.wraps(async_func)
49
- async def wrapper(self: Any, *args: P.args, **kwargs: P.kwargs) -> T:
46
+ async def wrapper(self: Any, *args: Any, **kwargs: Any) -> T:
50
47
  if not self._client or self._client.is_closed:
51
48
  self._client = httpx.AsyncClient()
52
49
  return await async_func(self, *args, **kwargs)
53
50
  return wrapper
54
51
  return decorator
55
52
 
56
- def require_task() -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]:
53
+ def require_task() -> Callable[[Callable[..., Awaitable[T]]], Callable[..., Awaitable[T]]]:
57
54
  """Decorator that ensures task has required attributes and valid client before execution"""
58
- def decorator(async_func: Callable[P, Awaitable[T]]) -> Callable[P, Awaitable[T]]:
55
+ def decorator(async_func: Callable[..., Awaitable[T]]) -> Callable[..., Awaitable[T]]:
59
56
  @functools.wraps(async_func)
60
- async def wrapper(self: Any, *args: P.args, **kwargs: P.kwargs) -> T:
57
+ async def wrapper(self: Any, *args: Any, **kwargs: Any) -> T:
61
58
  if not self.task_url:
62
59
  raise ValueError("Task URL not found")
63
60
  if not self._client:
@@ -3,9 +3,9 @@ import base64
3
3
  import io
4
4
  from pathlib import Path
5
5
  from PIL import Image
6
- from typing import Union, Tuple, BinaryIO, Optional
6
+ from typing import Union, Tuple, BinaryIO, Optional, Any
7
7
 
8
- async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[Optional[str], str]:
8
+ async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image, bytes, bytearray, memoryview]) -> Tuple[Optional[str], str]:
9
9
  """Convert various file types into a tuple of (filename, file content).
10
10
 
11
11
  Args:
@@ -15,6 +15,7 @@ async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[O
15
15
  - Local file path (will be converted to base64)
16
16
  - Opened binary file (will be converted to base64)
17
17
  - PIL/Pillow Image object (will be converted to base64)
18
+ - Bytes object (will be converted to base64)
18
19
 
19
20
  Returns:
20
21
  Tuple[Optional[str], str]: (filename, content) where content is either a URL or base64 string
@@ -26,18 +27,54 @@ async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[O
26
27
  ValueError: If the URL is invalid or unreachable
27
28
  ValueError: If the MIME type is unsupported
28
29
  """
29
- # Handle strings
30
+ # Handle bytes-like objects
31
+ if isinstance(file, (bytes, bytearray, memoryview)):
32
+ # Convert to bytes first if it's not already
33
+ file_bytes = bytes(file)
34
+
35
+ # Check if this might be an already-encoded base64 string in bytes form
36
+ try:
37
+ # Try to decode the bytes to a string and see if it's valid base64
38
+ potential_base64 = file_bytes.decode('utf-8', errors='strict')
39
+ base64.b64decode(potential_base64)
40
+ # If we get here, it was a valid base64 string in bytes form
41
+ return None, potential_base64
42
+ except:
43
+ # Not a base64 string in bytes form, encode it as base64
44
+ base64_str = base64.b64encode(file_bytes).decode()
45
+ return None, base64_str
46
+
47
+ # Handle strings - urls or paths or base64
30
48
  if isinstance(file, str):
49
+ # Handle URLs
31
50
  if file.startswith(('http://', 'https://')):
32
51
  return None, file
33
- try:
34
- base64.b64decode(file)
52
+
53
+ # Handle data URLs
54
+ if file.startswith('data:'):
35
55
  return None, file
36
- except:
56
+
57
+ # Try to handle as a file path
58
+ try:
59
+ path = Path(file)
60
+ if path.exists():
61
+ # It's a valid file path, convert to Path object and continue processing
62
+ file = path
63
+ else:
64
+ # If not a valid file path, try treating as base64
65
+ try:
66
+ # Just test if it's valid base64, don't store the result
67
+ base64.b64decode(file)
68
+ return None, file
69
+ except:
70
+ raise ValueError(f"File not found: {file} and it's not a valid base64 string")
71
+ except Exception as e:
72
+ # If string can't be converted to Path or decoded as base64, it might still be a base64 string
37
73
  try:
38
- file = Path(file)
74
+ base64.b64decode(file)
75
+ return None, file
39
76
  except:
40
- raise ValueError("File must be a valid path, URL, or base64 string")
77
+ raise ValueError(f"Unable to process file: {e}")
41
78
 
42
79
  # Handle file paths - convert to base64
43
80
  if isinstance(file, Path):
@@ -67,17 +104,16 @@ async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[O
67
104
  file.seek(0)
68
105
  file_content = file.read()
69
106
  name = getattr(file, "name", "document")
70
- file_ext = Path(name).suffix.lower().lstrip('.')
71
- if not file_ext:
72
- raise ValueError("File must have an extension")
107
+ if not name or not isinstance(name, str):
108
+ name = None
73
109
  base64_str = base64.b64encode(file_content).decode()
74
- return Path(name).name, base64_str
110
+ return name, base64_str
75
111
 
76
112
  raise TypeError(f"Unsupported file type: {type(file)}")
77
113
 
78
114
 
79
115
  async def prepare_upload_data(
80
- file: Optional[Union[str, Path, BinaryIO, Image.Image]] = None,
116
+ file: Optional[Union[str, Path, BinaryIO, Image.Image, bytes, bytearray, memoryview]] = None,
81
117
  filename: Optional[str] = None,
82
118
  config: Optional[Configuration] = None,
83
119
  ) -> dict:
@@ -85,8 +121,8 @@ async def prepare_upload_data(
85
121
 
86
122
  Args:
87
123
  file: The file to upload
124
+ filename: Optional filename to use (overrides any filename from the file)
88
125
  config: Optional configuration settings
89
- client: HTTP client for downloading remote files
90
126
 
91
127
  Returns:
92
128
  dict: JSON-serializable data dictionary ready for upload
@@ -1,5 +1,5 @@
1
1
  from datetime import datetime
2
- from typing import TypeVar, Optional, Generic
2
+ from typing import Optional, cast, Awaitable, Union
3
3
  from pydantic import BaseModel, PrivateAttr
4
4
  import asyncio
5
5
  import json
@@ -11,9 +11,7 @@ from .protocol import ChunkrClientProtocol
11
11
  from .misc import prepare_upload_data
12
12
  from .decorators import anywhere, require_task, retry_on_429
13
13
 
14
- T = TypeVar("T", bound="TaskResponse")
15
-
16
- class TaskResponse(BaseModel, Generic[T]):
14
+ class TaskResponse(BaseModel):
17
15
  configuration: OutputConfiguration
18
16
  created_at: datetime
19
17
  expires_at: Optional[datetime] = None
@@ -28,13 +26,13 @@ class TaskResponse(BaseModel, Generic[T]):
28
26
  _base64_urls: bool = False
29
27
  _client: Optional[ChunkrClientProtocol] = PrivateAttr(default=None)
30
28
 
31
- def with_client(self, client: ChunkrClientProtocol, include_chunks: bool = False, base64_urls: bool = False) -> T:
29
+ def with_client(self, client: ChunkrClientProtocol, include_chunks: bool = False, base64_urls: bool = False) -> "TaskResponse":
32
30
  self._client = client
33
31
  self.include_chunks = include_chunks
34
32
  self._base64_urls = base64_urls
35
33
  return self
36
34
 
37
- def _check_status(self) -> Optional[T]:
35
+ def _check_status(self) -> Optional["TaskResponse"]:
38
36
  """Helper method to check task status and handle completion/failure"""
39
37
  if self.status == "Failed":
40
38
  if getattr(self._client, 'raise_on_failure', True):
@@ -47,6 +45,11 @@ class TaskResponse(BaseModel, Generic[T]):
47
45
  @require_task()
48
46
  async def _poll_request(self) -> dict:
49
47
  try:
48
+ if not self._client:
49
+ raise ValueError("Chunkr client protocol is not initialized")
50
+ if not self._client._client or self._client._client.is_closed:
51
+ raise ValueError("httpx client is not open")
52
+ assert self.task_url is not None
50
53
  r = await self._client._client.get(
51
54
  self.task_url, headers=self._client._headers()
52
55
  )
@@ -64,10 +67,12 @@ class TaskResponse(BaseModel, Generic[T]):
64
67
  raise e
65
68
 
66
69
  @anywhere()
67
- async def poll(self) -> T:
70
+ async def poll(self) -> "TaskResponse":
68
71
  """Poll the task for completion."""
69
72
  while True:
70
73
  j = await self._poll_request()
74
+ if not self._client:
75
+ raise ValueError("Chunkr client protocol is not initialized")
71
76
  updated = TaskResponse(**j).with_client(self._client)
72
77
  self.__dict__.update(updated.__dict__)
73
78
  if res := self._check_status():
@@ -77,9 +82,14 @@ class TaskResponse(BaseModel, Generic[T]):
77
82
  @anywhere()
78
83
  @require_task()
79
84
  @retry_on_429()
80
- async def update(self, config: Configuration) -> T:
85
+ async def update(self, config: Configuration) -> "TaskResponse":
81
86
  """Update the task configuration."""
82
87
  data = await prepare_upload_data(None, None, config)
88
+ if not self._client:
89
+ raise ValueError("Chunkr client protocol is not initialized")
90
+ if not self._client._client or self._client._client.is_closed:
91
+ raise ValueError("httpx client is not open")
92
+ assert self.task_url is not None
83
93
  r = await self._client._client.patch(
84
94
  f"{self.task_url}/parse",
85
95
  json=data,
@@ -88,12 +98,17 @@ class TaskResponse(BaseModel, Generic[T]):
88
98
  r.raise_for_status()
89
99
  updated = TaskResponse(**r.json()).with_client(self._client)
90
100
  self.__dict__.update(updated.__dict__)
91
- return await self.poll()
101
+ return cast(TaskResponse, self.poll())
92
102
 
93
103
  @anywhere()
94
104
  @require_task()
95
- async def delete(self) -> T:
105
+ async def delete(self) -> "TaskResponse":
96
106
  """Delete the task."""
107
+ if not self._client:
108
+ raise ValueError("Chunkr client protocol is not initialized")
109
+ if not self._client._client or self._client._client.is_closed:
110
+ raise ValueError("httpx client is not open")
111
+ assert self.task_url is not None
97
112
  r = await self._client._client.delete(
98
113
  self.task_url, headers=self._client._headers()
99
114
  )
@@ -102,15 +117,20 @@ class TaskResponse(BaseModel, Generic[T]):
102
117
 
103
118
  @anywhere()
104
119
  @require_task()
105
- async def cancel(self) -> T:
120
+ async def cancel(self) -> "TaskResponse":
106
121
  """Cancel the task."""
122
+ if not self._client:
123
+ raise ValueError("Chunkr client protocol is not initialized")
124
+ if not self._client._client or self._client._client.is_closed:
125
+ raise ValueError("httpx client is not open")
126
+ assert self.task_url is not None
107
127
  r = await self._client._client.get(
108
128
  f"{self.task_url}/cancel", headers=self._client._headers()
109
129
  )
110
130
  r.raise_for_status()
111
- return await self.poll()
131
+ return cast(TaskResponse, self.poll())
112
132
 
113
- def _write_to_file(self, content: str | dict, output_file: str, is_json: bool = False) -> None:
133
+ def _write_to_file(self, content: Union[str, dict], output_file: Optional[str], is_json: bool = False) -> None:
114
134
  """Helper method to write content to a file
115
135
 
116
136
  Args:
@@ -131,9 +151,12 @@ class TaskResponse(BaseModel, Generic[T]):
131
151
  if is_json:
132
152
  json.dump(content, f, cls=DateTimeEncoder, indent=2)
133
153
  else:
134
- f.write(content)
154
+ if isinstance(content, str):
155
+ f.write(content)
156
+ else:
157
+ raise ValueError("Content is not a string")
135
158
 
136
- def html(self, output_file: str = None) -> str:
159
+ def html(self, output_file: Optional[str] = None) -> str:
137
160
  """Get the full HTML of the task
138
161
 
139
162
  Args:
@@ -143,7 +166,7 @@ class TaskResponse(BaseModel, Generic[T]):
143
166
  self._write_to_file(content, output_file)
144
167
  return content
145
168
 
146
- def markdown(self, output_file: str = None) -> str:
169
+ def markdown(self, output_file: Optional[str] = None) -> str:
147
170
  """Get the full markdown of the task
148
171
 
149
172
  Args:
@@ -153,7 +176,7 @@ class TaskResponse(BaseModel, Generic[T]):
153
176
  self._write_to_file(content, output_file)
154
177
  return content
155
178
 
156
- def content(self, output_file: str = None) -> str:
179
+ def content(self, output_file: Optional[str] = None) -> str:
157
180
  """Get the full content of the task
158
181
 
159
182
  Args:
@@ -163,7 +186,7 @@ class TaskResponse(BaseModel, Generic[T]):
163
186
  self._write_to_file(content, output_file)
164
187
  return content
165
188
 
166
- def json(self, output_file: str = None) -> dict:
189
+ def json(self, output_file: Optional[str] = None) -> dict:
167
190
  """Get the full task data as JSON
168
191
 
169
192
  Args:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chunkr-ai
3
- Version: 0.0.46
3
+ Version: 0.0.48
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  License: MIT License
@@ -17,4 +17,5 @@ src/chunkr_ai/api/decorators.py
17
17
  src/chunkr_ai/api/misc.py
18
18
  src/chunkr_ai/api/protocol.py
19
19
  src/chunkr_ai/api/task_response.py
20
- tests/test_chunkr.py
20
+ tests/test_chunkr.py
21
+ tests/test_file_handling.py
@@ -3,6 +3,9 @@ from pathlib import Path
3
3
  from PIL import Image
4
4
  import asyncio
5
5
  import base64
6
+ import io
7
+ import tempfile
8
+ from typing import Awaitable
6
9
 
7
10
  from chunkr_ai import Chunkr
8
11
  from chunkr_ai.models import (
@@ -26,6 +29,14 @@ from chunkr_ai.models import (
26
29
  def sample_path():
27
30
  return Path("tests/files/test.pdf")
28
31
 
32
+ @pytest.fixture
33
+ def sample_absolute_path_str():
34
+ return "tests/files/test.pdf"
35
+
36
+ @pytest.fixture
37
+ def sample_relative_path_str():
38
+ return "./tests/files/test.pdf"
39
+
29
40
  @pytest.fixture
30
41
  def sample_image():
31
42
  return Image.open("tests/files/test.jpg")
@@ -43,7 +54,7 @@ def client():
43
54
  def markdown_embed_config():
44
55
  return Configuration(
45
56
  segment_processing=SegmentProcessing(
46
- page=GenerationConfig(
57
+ Page=GenerationConfig(
47
58
  html=GenerationStrategy.LLM,
48
59
  markdown=GenerationStrategy.LLM,
49
60
  embed_sources=[EmbedSource.MARKDOWN]
@@ -55,7 +66,7 @@ def markdown_embed_config():
55
66
  def html_embed_config():
56
67
  return Configuration(
57
68
  segment_processing=SegmentProcessing(
58
- page=GenerationConfig(
69
+ Page=GenerationConfig(
59
70
  html=GenerationStrategy.LLM,
60
71
  markdown=GenerationStrategy.LLM,
61
72
  embed_sources=[EmbedSource.HTML]
@@ -67,7 +78,7 @@ def html_embed_config():
67
78
  def multiple_embed_config():
68
79
  return Configuration(
69
80
  segment_processing=SegmentProcessing(
70
- page=GenerationConfig(
81
+ Page=GenerationConfig(
71
82
  html=GenerationStrategy.LLM,
72
83
  markdown=GenerationStrategy.LLM,
73
84
  llm="Generate a summary of this content",
@@ -115,7 +126,7 @@ def xlm_roberta_with_html_content_config():
115
126
  tokenizer=Tokenizer.XLM_ROBERTA_BASE
116
127
  ),
117
128
  segment_processing=SegmentProcessing(
118
- page=GenerationConfig(
129
+ Page=GenerationConfig(
119
130
  html=GenerationStrategy.LLM,
120
131
  markdown=GenerationStrategy.LLM,
121
132
  embed_sources=[EmbedSource.HTML, EmbedSource.CONTENT]
@@ -156,43 +167,6 @@ def model_fallback_config():
156
167
  ),
157
168
  )
158
169
 
159
- @pytest.mark.asyncio
160
- async def test_send_file_path(client, sample_path):
161
- response = await client.upload(sample_path)
162
- assert response.task_id is not None
163
- assert response.status == "Succeeded"
164
- assert response.output is not None
165
-
166
- @pytest.mark.asyncio
167
- async def test_send_file_url(client, sample_url):
168
- response = await client.upload(sample_url)
169
- assert response.task_id is not None
170
- assert response.status == "Succeeded"
171
- assert response.output is not None
172
-
173
- @pytest.mark.asyncio
174
- async def test_send_file_path_str(client, sample_path):
175
- response = await client.upload(str(sample_path))
176
- assert response.task_id is not None
177
- assert response.status == "Succeeded"
178
- assert response.output is not None
179
-
180
- @pytest.mark.asyncio
181
- async def test_send_opened_file(client, sample_path):
182
- with open(sample_path, "rb") as f:
183
- response = await client.upload(f)
184
- assert response.task_id is not None
185
- assert response.status == "Succeeded"
186
- assert response.output is not None
187
-
188
- @pytest.mark.asyncio
189
- async def test_send_pil_image(client, sample_image):
190
- response = await client.upload(sample_image)
191
- assert response.task_id is not None
192
- assert response.status == "Succeeded"
193
- assert response.output is not None
194
- assert response.output is not None
195
-
196
170
  @pytest.mark.asyncio
197
171
  async def test_ocr_auto(client, sample_path):
198
172
  response = await client.upload(sample_path, Configuration(ocr_strategy=OcrStrategy.AUTO))
@@ -240,7 +214,7 @@ async def test_page_llm_html(client, sample_path):
240
214
  Configuration(
241
215
  segmentation_strategy=SegmentationStrategy.PAGE,
242
216
  segment_processing=SegmentProcessing(
243
- page=GenerationConfig(html=GenerationStrategy.LLM)
217
+ Page=GenerationConfig(html=GenerationStrategy.LLM)
244
218
  ),
245
219
  ),
246
220
  )
@@ -253,7 +227,7 @@ async def test_page_llm(client, sample_path):
253
227
  configuration = Configuration(
254
228
  segmentation_strategy=SegmentationStrategy.PAGE,
255
229
  segment_processing=SegmentProcessing(
256
- page=GenerationConfig(
230
+ Page=GenerationConfig(
257
231
  html=GenerationStrategy.LLM, markdown=GenerationStrategy.LLM
258
232
  )
259
233
  ),
@@ -291,7 +265,7 @@ async def test_cancel_task(client, sample_path):
291
265
  @pytest.mark.asyncio
292
266
  async def test_cancel_task_direct(client, sample_path):
293
267
  task = await client.create_task(sample_path)
294
- assert isinstance(task, TaskResponse)
268
+ assert isinstance(task, Awaitable) and isinstance(task, TaskResponse)
295
269
  assert task.status == "Starting"
296
270
  await task.cancel()
297
271
  assert task.status == "Cancelled"
@@ -332,7 +306,7 @@ async def test_pipeline_type_azure(client, sample_path):
332
306
  assert response.output is not None
333
307
 
334
308
  @pytest.mark.asyncio
335
- async def test_pipeline_type_azure(client, sample_path):
309
+ async def test_pipeline_type_chunkr(client, sample_path):
336
310
  response = await client.upload(sample_path, Configuration(pipeline=Pipeline.CHUNKR))
337
311
  assert response.task_id is not None
338
312
  assert response.status == "Succeeded"
@@ -353,36 +327,6 @@ async def test_task_operations_after_client_close(client, sample_path):
353
327
  result = await task.poll()
354
328
  assert result.status == "Succeeded"
355
329
 
356
- @pytest.mark.asyncio
357
- async def test_send_base64_file(client, sample_path):
358
- # Read file and convert to base64
359
- with open(sample_path, "rb") as f:
360
- base64_content = base64.b64encode(f.read()).decode('utf-8')
361
- response = await client.upload(base64_content)
362
- assert response.task_id is not None
363
- assert response.status == "Succeeded"
364
- assert response.output is not None
365
-
366
- @pytest.mark.asyncio
367
- async def test_send_base64_file_with_data_url(client, sample_path):
368
- with open(sample_path, "rb") as f:
369
- base64_content = base64.b64encode(f.read()).decode('utf-8')
370
- response = await client.upload(f"data:application/pdf;base64,{base64_content}")
371
- assert response.task_id is not None
372
- assert response.status == "Succeeded"
373
- assert response.output is not None
374
-
375
- @pytest.mark.asyncio
376
- async def test_send_base64_file_with_filename(client, sample_path):
377
- # Read file and convert to base64
378
- with open(sample_path, "rb") as f:
379
- base64_content = base64.b64encode(f.read()).decode('utf-8')
380
-
381
- response = await client.upload(base64_content, filename="test.pdf")
382
- assert response.task_id is not None
383
- assert response.status == "Succeeded"
384
- assert response.output is not None
385
-
386
330
  @pytest.mark.asyncio
387
331
  async def test_output_files_no_dir(client, sample_path, tmp_path):
388
332
  task = await client.upload(sample_path)
@@ -422,6 +366,35 @@ async def test_output_files_with_dirs(client, sample_path, tmp_path):
422
366
  assert content_file.exists()
423
367
  assert json_file.exists()
424
368
 
369
+
370
+ @pytest.mark.asyncio
371
+ async def test_combined_config_with_llm_and_other_settings(client, sample_path):
372
+ # Test combining LLM settings with other configuration options
373
+ config = Configuration(
374
+ llm_processing=LlmProcessing(
375
+ model_id="qwen-2.5-vl-7b-instruct",
376
+ fallback_strategy=FallbackStrategy.model("gemini-flash-2.0"),
377
+ temperature=0.4
378
+ ),
379
+ segmentation_strategy=SegmentationStrategy.PAGE,
380
+ segment_processing=SegmentProcessing(
381
+ Page=GenerationConfig(
382
+ html=GenerationStrategy.LLM,
383
+ markdown=GenerationStrategy.LLM
384
+ )
385
+ ),
386
+ chunk_processing=ChunkProcessing(target_length=1024)
387
+ )
388
+
389
+ response = await client.upload(sample_path, config)
390
+ assert response.task_id is not None
391
+ assert response.status == "Succeeded"
392
+ assert response.output is not None
393
+ assert response.configuration.llm_processing is not None
394
+ assert response.configuration.llm_processing.model_id == "qwen-2.5-vl-7b-instruct"
395
+ assert response.configuration.segmentation_strategy == SegmentationStrategy.PAGE
396
+ assert response.configuration.chunk_processing.target_length == 1024
397
+
425
398
  @pytest.mark.asyncio
426
399
  async def test_embed_sources_markdown_only(client, sample_path, markdown_embed_config):
427
400
  response = await client.upload(sample_path, markdown_embed_config)
@@ -555,32 +528,4 @@ async def test_fallback_strategy_serialization():
555
528
  # Test string representation
556
529
  assert str(none_strategy) == "None"
557
530
  assert str(default_strategy) == "Default"
558
- assert str(model_strategy) == "Model(gpt-4.1)"
559
-
560
- @pytest.mark.asyncio
561
- async def test_combined_config_with_llm_and_other_settings(client, sample_path):
562
- # Test combining LLM settings with other configuration options
563
- config = Configuration(
564
- llm_processing=LlmProcessing(
565
- model_id="qwen-2.5-vl-7b-instruct",
566
- fallback_strategy=FallbackStrategy.model("gemini-flash-2.0"),
567
- temperature=0.4
568
- ),
569
- segmentation_strategy=SegmentationStrategy.PAGE,
570
- segment_processing=SegmentProcessing(
571
- page=GenerationConfig(
572
- html=GenerationStrategy.LLM,
573
- markdown=GenerationStrategy.LLM
574
- )
575
- ),
576
- chunk_processing=ChunkProcessing(target_length=1024)
577
- )
578
-
579
- response = await client.upload(sample_path, config)
580
- assert response.task_id is not None
581
- assert response.status == "Succeeded"
582
- assert response.output is not None
583
- assert response.configuration.llm_processing is not None
584
- assert response.configuration.llm_processing.model_id == "qwen-2.5-vl-7b-instruct"
585
- assert response.configuration.segmentation_strategy == SegmentationStrategy.PAGE
586
- assert response.configuration.chunk_processing.target_length == 1024
531
+ assert str(model_strategy) == "Model(gpt-4.1)"
@@ -0,0 +1,362 @@
1
+ import pytest
2
+ from pathlib import Path
3
+ from PIL import Image
4
+ import base64
5
+ import io
6
+ import tempfile
7
+
8
+ from chunkr_ai import Chunkr
9
+
10
+ @pytest.fixture
11
+ def sample_path():
12
+ return Path("tests/files/test.pdf")
13
+
14
+ @pytest.fixture
15
+ def sample_url():
16
+ return "https://chunkr-web.s3.us-east-1.amazonaws.com/landing_page/input/science.pdf"
17
+
18
+ @pytest.fixture
19
+ def sample_image():
20
+ return Image.open("tests/files/test.jpg")
21
+
22
+ @pytest.fixture
23
+ def client():
24
+ client = Chunkr()
25
+ yield client
26
+
27
+ @pytest.mark.asyncio
28
+ async def test_send_file_path(client, sample_path):
29
+ response = await client.upload(sample_path)
30
+ assert response.task_id is not None
31
+ assert response.status == "Succeeded"
32
+ assert response.output is not None
33
+
34
+ @pytest.mark.asyncio
35
+ async def test_send_file_path_str(client, sample_path):
36
+ response = await client.upload(str(sample_path))
37
+ assert response.task_id is not None
38
+ assert response.status == "Succeeded"
39
+ assert response.output is not None
40
+
41
+ @pytest.mark.asyncio
42
+ async def test_send_file_relative_path_str(client):
43
+ response = await client.upload("./tests/files/test.pdf")
44
+ assert response.task_id is not None
45
+ assert response.status == "Succeeded"
46
+ assert response.output is not None
47
+
48
+ @pytest.mark.asyncio
49
+ async def test_send_file_url(client, sample_url):
50
+ response = await client.upload(sample_url)
51
+ assert response.task_id is not None
52
+ assert response.status == "Succeeded"
53
+ assert response.output is not None
54
+
55
+ @pytest.mark.asyncio
56
+ async def test_send_opened_file(client, sample_path):
57
+ with open(sample_path, "rb") as f:
58
+ response = await client.upload(f)
59
+ assert response.task_id is not None
60
+ assert response.status == "Succeeded"
61
+ assert response.output is not None
62
+
63
+ @pytest.mark.asyncio
64
+ async def test_send_pil_image(client, sample_image):
65
+ response = await client.upload(sample_image)
66
+ assert response.task_id is not None
67
+ assert response.status == "Succeeded"
68
+ assert response.output is not None
69
+ assert response.output is not None
70
+
71
+ @pytest.mark.asyncio
72
+ async def test_send_base64_file(client, sample_path):
73
+ # Read file and convert to base64
74
+ with open(sample_path, "rb") as f:
75
+ base64_content = base64.b64encode(f.read())
76
+ response = await client.upload(base64_content)
77
+ assert response.task_id is not None
78
+ assert response.status == "Succeeded"
79
+ assert response.output is not None
80
+
81
+ @pytest.mark.asyncio
82
+ async def test_send_base64_file_w_decode(client, sample_path):
83
+ # Read file and convert to base64
84
+ with open(sample_path, "rb") as f:
85
+ base64_content = base64.b64encode(f.read()).decode()
86
+ response = await client.upload(base64_content)
87
+ assert response.task_id is not None
88
+ assert response.status == "Succeeded"
89
+ assert response.output is not None
90
+
91
+ @pytest.mark.asyncio
92
+ async def test_send_base64_file_with_data_url(client, sample_path):
93
+ with open(sample_path, "rb") as f:
94
+ base64_content = base64.b64encode(f.read()).decode('utf-8')
95
+ response = await client.upload(f"data:application/pdf;base64,{base64_content}")
96
+ assert response.task_id is not None
97
+ assert response.status == "Succeeded"
98
+ assert response.output is not None
99
+
100
+ @pytest.mark.asyncio
101
+ async def test_send_base64_file_with_filename(client, sample_path):
102
+ # Read file and convert to base64
103
+ with open(sample_path, "rb") as f:
104
+ base64_content = base64.b64encode(f.read()).decode('utf-8')
105
+
106
+ response = await client.upload(base64_content, filename="test.pdf")
107
+ assert response.task_id is not None
108
+ assert response.status == "Succeeded"
109
+ assert response.output is not None
110
+
111
+ @pytest.mark.asyncio
112
+ async def test_file_like_no_name_attribute(client, sample_path):
113
+ # Create a file-like object without a name attribute
114
+ class NamelessBuffer:
115
+ def __init__(self, content):
116
+ self.buffer = io.BytesIO(content)
117
+
118
+ def read(self):
119
+ return self.buffer.read()
120
+
121
+ def seek(self, pos):
122
+ return self.buffer.seek(pos)
123
+
124
+ with open(sample_path, "rb") as f:
125
+ content = f.read()
126
+
127
+ nameless_buffer = NamelessBuffer(content)
128
+ response = await client.upload(nameless_buffer, filename="test.pdf")
129
+ assert response.task_id is not None
130
+ assert response.status == "Succeeded"
131
+ assert response.output is not None
132
+
133
+ @pytest.mark.asyncio
134
+ async def test_file_like_none_name(client, sample_path):
135
+ # Create a file-like object with None as name
136
+ class NoneNameBuffer:
137
+ def __init__(self, content):
138
+ self.buffer = io.BytesIO(content)
139
+ self.name = None
140
+
141
+ def read(self):
142
+ return self.buffer.read()
143
+
144
+ def seek(self, pos):
145
+ return self.buffer.seek(pos)
146
+
147
+ with open(sample_path, "rb") as f:
148
+ content = f.read()
149
+
150
+ none_name_buffer = NoneNameBuffer(content)
151
+ response = await client.upload(none_name_buffer, filename="test.pdf")
152
+ assert response.task_id is not None
153
+ assert response.status == "Succeeded"
154
+ assert response.output is not None
155
+
156
+ @pytest.mark.asyncio
157
+ async def test_file_like_no_extension(client, sample_path):
158
+ # Create a file-like object with a name but no extension
159
+ class NoExtensionBuffer:
160
+ def __init__(self, content):
161
+ self.buffer = io.BytesIO(content)
162
+ self.name = "test_document"
163
+
164
+ def read(self):
165
+ return self.buffer.read()
166
+
167
+ def seek(self, pos):
168
+ return self.buffer.seek(pos)
169
+
170
+ with open(sample_path, "rb") as f:
171
+ content = f.read()
172
+
173
+ no_ext_buffer = NoExtensionBuffer(content)
174
+ response = await client.upload(no_ext_buffer, filename="test.pdf")
175
+ assert response.task_id is not None
176
+ assert response.status == "Succeeded"
177
+ assert response.output is not None
178
+
179
+ @pytest.mark.asyncio
180
+ async def test_spooled_temporary_file(client, sample_path):
181
+ # Test with SpooledTemporaryFile which is what the user is using
182
+ with open(sample_path, "rb") as f:
183
+ content = f.read()
184
+
185
+ temp_file = tempfile.SpooledTemporaryFile()
186
+ temp_file.write(content)
187
+ temp_file.seek(0)
188
+
189
+ response = await client.upload(temp_file, filename="test.pdf")
190
+ assert response.task_id is not None
191
+ assert response.status == "Succeeded"
192
+ assert response.output is not None
193
+
194
+ @pytest.mark.asyncio
195
+ async def test_send_bytearray(client, sample_path):
196
+ # Read file and convert to bytearray
197
+ with open(sample_path, "rb") as f:
198
+ content = bytearray(f.read())
199
+
200
+ response = await client.upload(content, filename="test.pdf")
201
+ assert response.task_id is not None
202
+ assert response.status == "Succeeded"
203
+ assert response.output is not None
204
+
205
+ @pytest.mark.asyncio
206
+ async def test_send_memoryview(client, sample_path):
207
+ # Read file and convert to memoryview
208
+ with open(sample_path, "rb") as f:
209
+ content_bytes = f.read()
210
+ content = memoryview(content_bytes)
211
+
212
+ response = await client.upload(content, filename="test.pdf")
213
+ assert response.task_id is not None
214
+ assert response.status == "Succeeded"
215
+ assert response.output is not None
216
+
217
+ @pytest.mark.asyncio
218
+ async def test_with_explicit_filename_pdf(client, sample_path):
219
+ response = await client.upload(sample_path, filename="custom_name.pdf")
220
+ assert response.task_id is not None
221
+ assert response.status == "Succeeded"
222
+ assert response.output is not None
223
+
224
+ @pytest.mark.asyncio
225
+ async def test_with_explicit_filename_image(client, sample_image):
226
+ response = await client.upload(sample_image, filename="custom_image.jpg")
227
+ assert response.task_id is not None
228
+ assert response.status == "Succeeded"
229
+ assert response.output is not None
230
+
231
+ @pytest.mark.asyncio
232
+ async def test_with_special_character_filename(client, sample_path):
233
+ response = await client.upload(sample_path, filename="test file (1)&%$#@!.pdf")
234
+ assert response.task_id is not None
235
+ assert response.status == "Succeeded"
236
+ assert response.output is not None
237
+
238
+ @pytest.mark.asyncio
239
+ async def test_filename_with_non_matching_extension(client, sample_path):
240
+ # Test providing a filename with a different extension than the actual file
241
+ response = await client.upload(sample_path, filename="document.docx")
242
+ assert response.task_id is not None
243
+ assert response.status == "Succeeded"
244
+ assert response.output is not None
245
+
246
+ @pytest.mark.asyncio
247
+ async def test_bytes_with_explicit_filename(client, sample_path):
248
+ with open(sample_path, "rb") as f:
249
+ content = f.read()
250
+
251
+ # For bytes objects, filename is required to know the file type
252
+ response = await client.upload(content, filename="document.pdf")
253
+ assert response.task_id is not None
254
+ assert response.status == "Succeeded"
255
+ assert response.output is not None
256
+
257
+ @pytest.mark.asyncio
258
+ async def test_bytearray_with_explicit_filename(client, sample_path):
259
+ with open(sample_path, "rb") as f:
260
+ content = bytearray(f.read())
261
+
262
+ response = await client.upload(content, filename="document.pdf")
263
+ assert response.task_id is not None
264
+ assert response.status == "Succeeded"
265
+ assert response.output is not None
266
+
267
+ @pytest.mark.asyncio
268
+ async def test_memoryview_with_explicit_filename(client, sample_path):
269
+ with open(sample_path, "rb") as f:
270
+ content_bytes = f.read()
271
+ content = memoryview(content_bytes)
272
+
273
+ response = await client.upload(content, filename="document.pdf")
274
+ assert response.task_id is not None
275
+ assert response.status == "Succeeded"
276
+ assert response.output is not None
277
+
278
+ @pytest.mark.asyncio
279
+ async def test_unicode_filename(client, sample_path):
280
+ # Test with a filename containing Unicode characters
281
+ response = await client.upload(sample_path, filename="测试文件.pdf")
282
+ assert response.task_id is not None
283
+ assert response.status == "Succeeded"
284
+ assert response.output is not None
285
+
286
+ @pytest.mark.asyncio
287
+ async def test_very_long_filename(client, sample_path):
288
+ # Test with an extremely long filename
289
+ long_name = "a" * 200 + ".pdf" # 200-character stem plus ".pdf" (204 characters total)
290
+ response = await client.upload(sample_path, filename=long_name)
291
+ assert response.task_id is not None
292
+ assert response.status == "Succeeded"
293
+ assert response.output is not None
294
+
295
+ @pytest.mark.asyncio
296
+ async def test_filename_without_extension(client, sample_path):
297
+ # Test with a filename that has no extension
298
+ with open(sample_path, "rb") as f:
299
+ content = f.read()
300
+
301
+ # This test verifies that the upload still succeeds when the provided filename has no extension
302
+ response = await client.upload(content, filename="document_without_extension")
303
+ assert response.task_id is not None
304
+ assert response.status == "Succeeded"
305
+ assert response.output is not None
306
+
307
+ @pytest.mark.asyncio
308
+ async def test_custom_file_like_with_filename(client, sample_path):
309
+ # A more complex file-like object implementation
310
+ class CustomFileWrapper:
311
+ def __init__(self, content):
312
+ self.buffer = io.BytesIO(content)
313
+ self.position = 0
314
+ self.name = "original_name.txt" # Should be overridden by explicit filename
315
+
316
+ def read(self, size=-1):
317
+ return self.buffer.read(size)
318
+
319
+ def seek(self, position, whence=0):
320
+ return self.buffer.seek(position, whence)
321
+
322
+ def tell(self):
323
+ return self.buffer.tell()
324
+
325
+ def close(self):
326
+ self.buffer.close()
327
+
328
+ with open(sample_path, "rb") as f:
329
+ content = f.read()
330
+
331
+ custom_file = CustomFileWrapper(content)
332
+ response = await client.upload(custom_file, filename="custom_wrapper.pdf")
333
+ assert response.task_id is not None
334
+ assert response.status == "Succeeded"
335
+ assert response.output is not None
336
+
337
+ @pytest.mark.asyncio
338
+ async def test_seek_at_nonzero_position(client, sample_path):
339
+ # Test with a file-like object that's not at position 0
340
+ with open(sample_path, "rb") as f:
341
+ content = f.read()
342
+
343
+ buffer = io.BytesIO(content)
344
+ buffer.seek(100) # Move position to 100
345
+
346
+ response = await client.upload(buffer, filename="seek_test.pdf")
347
+ assert response.task_id is not None
348
+ assert response.status == "Succeeded"
349
+ assert response.output is not None
350
+
351
+ @pytest.mark.asyncio
352
+ async def test_reused_file_object(client, sample_path):
353
+ # Test that a file object can be reused after being processed
354
+ with open(sample_path, "rb") as f:
355
+ response1 = await client.upload(f, filename="first_use.pdf")
356
+ f.seek(0) # Reset position
357
+ response2 = await client.upload(f, filename="second_use.pdf")
358
+
359
+ assert response1.task_id is not None
360
+ assert response1.status == "Succeeded"
361
+ assert response2.task_id is not None
362
+ assert response2.status == "Succeeded"
File without changes
File without changes
File without changes