chunkr-ai 0.0.46__tar.gz → 0.0.47__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {chunkr_ai-0.0.46/src/chunkr_ai.egg-info → chunkr_ai-0.0.47}/PKG-INFO +1 -1
- {chunkr_ai-0.0.46 → chunkr_ai-0.0.47}/pyproject.toml +1 -1
- {chunkr_ai-0.0.46 → chunkr_ai-0.0.47}/src/chunkr_ai/api/auth.py +1 -0
- {chunkr_ai-0.0.46 → chunkr_ai-0.0.47}/src/chunkr_ai/api/chunkr.py +17 -11
- {chunkr_ai-0.0.46 → chunkr_ai-0.0.47}/src/chunkr_ai/api/chunkr_base.py +12 -6
- {chunkr_ai-0.0.46 → chunkr_ai-0.0.47}/src/chunkr_ai/api/decorators.py +7 -10
- {chunkr_ai-0.0.46 → chunkr_ai-0.0.47}/src/chunkr_ai/api/misc.py +10 -6
- {chunkr_ai-0.0.46 → chunkr_ai-0.0.47}/src/chunkr_ai/api/task_response.py +41 -18
- {chunkr_ai-0.0.46 → chunkr_ai-0.0.47/src/chunkr_ai.egg-info}/PKG-INFO +1 -1
- {chunkr_ai-0.0.46 → chunkr_ai-0.0.47}/tests/test_chunkr.py +31 -9
- {chunkr_ai-0.0.46 → chunkr_ai-0.0.47}/LICENSE +0 -0
- {chunkr_ai-0.0.46 → chunkr_ai-0.0.47}/README.md +0 -0
- {chunkr_ai-0.0.46 → chunkr_ai-0.0.47}/setup.cfg +0 -0
- {chunkr_ai-0.0.46 → chunkr_ai-0.0.47}/src/chunkr_ai/__init__.py +0 -0
- {chunkr_ai-0.0.46 → chunkr_ai-0.0.47}/src/chunkr_ai/api/__init__.py +0 -0
- {chunkr_ai-0.0.46 → chunkr_ai-0.0.47}/src/chunkr_ai/api/configuration.py +0 -0
- {chunkr_ai-0.0.46 → chunkr_ai-0.0.47}/src/chunkr_ai/api/protocol.py +0 -0
- {chunkr_ai-0.0.46 → chunkr_ai-0.0.47}/src/chunkr_ai/models.py +0 -0
- {chunkr_ai-0.0.46 → chunkr_ai-0.0.47}/src/chunkr_ai.egg-info/SOURCES.txt +0 -0
- {chunkr_ai-0.0.46 → chunkr_ai-0.0.47}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
- {chunkr_ai-0.0.46 → chunkr_ai-0.0.47}/src/chunkr_ai.egg-info/requires.txt +0 -0
- {chunkr_ai-0.0.46 → chunkr_ai-0.0.47}/src/chunkr_ai.egg-info/top_level.txt +0 -0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "chunkr-ai"
|
7
|
-
version = "0.0.46"
|
7
|
+
version = "0.0.47"
|
8
8
|
authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
|
9
9
|
description = "Python client for Chunkr: open source document intelligence"
|
10
10
|
readme = "README.md"
|
@@ -1,12 +1,13 @@
|
|
1
1
|
from pathlib import Path
|
2
2
|
from PIL import Image
|
3
|
-
from typing import Union, BinaryIO, Optional
|
3
|
+
from typing import Union, BinaryIO, Optional, cast, Awaitable
|
4
4
|
|
5
5
|
from .configuration import Configuration
|
6
6
|
from .decorators import anywhere, ensure_client, retry_on_429
|
7
7
|
from .misc import prepare_upload_data
|
8
8
|
from .task_response import TaskResponse
|
9
9
|
from .chunkr_base import ChunkrBase
|
10
|
+
from .protocol import ChunkrClientProtocol
|
10
11
|
|
11
12
|
class Chunkr(ChunkrBase):
|
12
13
|
"""Chunkr API client that works in both sync and async contexts"""
|
@@ -16,17 +17,17 @@ class Chunkr(ChunkrBase):
|
|
16
17
|
async def upload(
|
17
18
|
self,
|
18
19
|
file: Union[str, Path, BinaryIO, Image.Image],
|
19
|
-
config: Configuration = None,
|
20
|
+
config: Optional[Configuration] = None,
|
20
21
|
filename: Optional[str] = None,
|
21
22
|
) -> TaskResponse:
|
22
|
-
task = await self.create_task(file, config, filename)
|
23
|
-
return await task.poll()
|
23
|
+
task = await cast(Awaitable[TaskResponse], self.create_task(file, config, filename))
|
24
|
+
return await cast(Awaitable[TaskResponse], task.poll())
|
24
25
|
|
25
26
|
@anywhere()
|
26
27
|
@ensure_client()
|
27
28
|
async def update(self, task_id: str, config: Configuration) -> TaskResponse:
|
28
|
-
task = await self.update_task(task_id, config)
|
29
|
-
return await task.poll()
|
29
|
+
task = await cast(Awaitable[TaskResponse], self.update_task(task_id, config))
|
30
|
+
return await cast(Awaitable[TaskResponse], task.poll())
|
30
31
|
|
31
32
|
@anywhere()
|
32
33
|
@ensure_client()
|
@@ -34,30 +35,32 @@ class Chunkr(ChunkrBase):
|
|
34
35
|
async def create_task(
|
35
36
|
self,
|
36
37
|
file: Union[str, Path, BinaryIO, Image.Image],
|
37
|
-
config: Configuration = None,
|
38
|
+
config: Optional[Configuration] = None,
|
38
39
|
filename: Optional[str] = None,
|
39
40
|
) -> TaskResponse:
|
40
41
|
"""Create a new task with the given file and configuration."""
|
41
42
|
data = await prepare_upload_data(file, filename, config)
|
43
|
+
assert self._client is not None
|
42
44
|
r = await self._client.post(
|
43
45
|
f"{self.url}/api/v1/task/parse", json=data, headers=self._headers()
|
44
46
|
)
|
45
47
|
r.raise_for_status()
|
46
|
-
return TaskResponse(**r.json()).with_client(self, True, False)
|
48
|
+
return TaskResponse(**r.json()).with_client(cast(ChunkrClientProtocol, self), True, False)
|
47
49
|
|
48
50
|
@anywhere()
|
49
51
|
@ensure_client()
|
50
52
|
@retry_on_429()
|
51
|
-
async def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
|
53
|
+
async def update_task(self, task_id: str, config: Optional[Configuration] = None) -> TaskResponse:
|
52
54
|
"""Update an existing task with new configuration."""
|
53
55
|
data = await prepare_upload_data(None, None, config)
|
56
|
+
assert self._client is not None
|
54
57
|
r = await self._client.patch(
|
55
58
|
f"{self.url}/api/v1/task/{task_id}/parse",
|
56
59
|
json=data,
|
57
60
|
headers=self._headers(),
|
58
61
|
)
|
59
62
|
r.raise_for_status()
|
60
|
-
return TaskResponse(**r.json()).with_client(self, True, False)
|
63
|
+
return TaskResponse(**r.json()).with_client(cast(ChunkrClientProtocol, self), True, False)
|
61
64
|
|
62
65
|
@anywhere()
|
63
66
|
@ensure_client()
|
@@ -66,17 +69,19 @@ class Chunkr(ChunkrBase):
|
|
66
69
|
"base64_urls": str(base64_urls).lower(),
|
67
70
|
"include_chunks": str(include_chunks).lower()
|
68
71
|
}
|
72
|
+
assert self._client is not None
|
69
73
|
r = await self._client.get(
|
70
74
|
f"{self.url}/api/v1/task/{task_id}",
|
71
75
|
params=params,
|
72
76
|
headers=self._headers()
|
73
77
|
)
|
74
78
|
r.raise_for_status()
|
75
|
-
return TaskResponse(**r.json()).with_client(self, include_chunks, base64_urls)
|
79
|
+
return TaskResponse(**r.json()).with_client(cast(ChunkrClientProtocol, self), include_chunks, base64_urls)
|
76
80
|
|
77
81
|
@anywhere()
|
78
82
|
@ensure_client()
|
79
83
|
async def delete_task(self, task_id: str) -> None:
|
84
|
+
assert self._client is not None
|
80
85
|
r = await self._client.delete(
|
81
86
|
f"{self.url}/api/v1/task/{task_id}", headers=self._headers()
|
82
87
|
)
|
@@ -85,6 +90,7 @@ class Chunkr(ChunkrBase):
|
|
85
90
|
@anywhere()
|
86
91
|
@ensure_client()
|
87
92
|
async def cancel_task(self, task_id: str) -> None:
|
93
|
+
assert self._client is not None
|
88
94
|
r = await self._client.get(
|
89
95
|
f"{self.url}/api/v1/task/{task_id}/cancel", headers=self._headers()
|
90
96
|
)
|
@@ -18,17 +18,23 @@ class ChunkrBase(HeadersMixin):
|
|
18
18
|
raise_on_failure: Whether to raise an exception if the task fails. Defaults to False.
|
19
19
|
"""
|
20
20
|
|
21
|
-
|
21
|
+
url: str
|
22
|
+
_api_key: str
|
23
|
+
raise_on_failure: bool
|
24
|
+
_client: Optional[httpx.AsyncClient]
|
25
|
+
|
26
|
+
def __init__(self, url: Optional[str] = None, api_key: Optional[str] = None, raise_on_failure: bool = False):
|
22
27
|
load_dotenv(override=True)
|
23
28
|
self.url = url or os.getenv("CHUNKR_URL") or "https://api.chunkr.ai"
|
24
|
-
|
29
|
+
_api_key = api_key or os.getenv("CHUNKR_API_KEY")
|
25
30
|
self.raise_on_failure = raise_on_failure
|
26
31
|
|
27
|
-
if not
|
32
|
+
if not _api_key:
|
28
33
|
raise ValueError(
|
29
34
|
"API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai"
|
30
35
|
)
|
31
36
|
|
37
|
+
self._api_key = _api_key
|
32
38
|
self.url = self.url.rstrip("/")
|
33
39
|
self._client = httpx.AsyncClient()
|
34
40
|
|
@@ -36,7 +42,7 @@ class ChunkrBase(HeadersMixin):
|
|
36
42
|
def upload(
|
37
43
|
self,
|
38
44
|
file: Union[str, Path, BinaryIO, Image.Image],
|
39
|
-
config: Configuration = None,
|
45
|
+
config: Optional[Configuration] = None,
|
40
46
|
filename: Optional[str] = None,
|
41
47
|
) -> TaskResponse:
|
42
48
|
"""Upload a file and wait for processing to complete.
|
@@ -90,7 +96,7 @@ class ChunkrBase(HeadersMixin):
|
|
90
96
|
def create_task(
|
91
97
|
self,
|
92
98
|
file: Union[str, Path, BinaryIO, Image.Image],
|
93
|
-
config: Configuration = None,
|
99
|
+
config: Optional[Configuration] = None,
|
94
100
|
filename: Optional[str] = None,
|
95
101
|
) -> TaskResponse:
|
96
102
|
"""Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`.
|
@@ -127,7 +133,7 @@ class ChunkrBase(HeadersMixin):
|
|
127
133
|
|
128
134
|
@abstractmethod
|
129
135
|
def update_task(
|
130
|
-
self, task_id: str, config: Configuration
|
136
|
+
self, task_id: str, config: Optional[Configuration] = None
|
131
137
|
) -> TaskResponse:
|
132
138
|
"""Update a task by its ID and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`.
|
133
139
|
|
@@ -13,10 +13,7 @@ P = ParamSpec('P')
|
|
13
13
|
|
14
14
|
_sync_loop = None
|
15
15
|
|
16
|
-
|
17
|
-
def anywhere() -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Union[Awaitable[T], T]]]: ...
|
18
|
-
|
19
|
-
def anywhere():
|
16
|
+
def anywhere() -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Union[Awaitable[T], T]]]:
|
20
17
|
"""Decorator that allows an async function to run anywhere - sync or async context."""
|
21
18
|
def decorator(async_func: Callable[P, Awaitable[T]]) -> Callable[P, Union[Awaitable[T], T]]:
|
22
19
|
@functools.wraps(async_func)
|
@@ -42,22 +39,22 @@ def anywhere():
|
|
42
39
|
return wrapper
|
43
40
|
return decorator
|
44
41
|
|
45
|
-
def ensure_client() -> Callable[[Callable[
|
42
|
+
def ensure_client() -> Callable[[Callable[..., Awaitable[T]]], Callable[..., Awaitable[T]]]:
|
46
43
|
"""Decorator that ensures a valid httpx.AsyncClient exists before executing the method"""
|
47
|
-
def decorator(async_func: Callable[
|
44
|
+
def decorator(async_func: Callable[..., Awaitable[T]]) -> Callable[..., Awaitable[T]]:
|
48
45
|
@functools.wraps(async_func)
|
49
|
-
async def wrapper(self: Any, *args:
|
46
|
+
async def wrapper(self: Any, *args: Any, **kwargs: Any) -> T:
|
50
47
|
if not self._client or self._client.is_closed:
|
51
48
|
self._client = httpx.AsyncClient()
|
52
49
|
return await async_func(self, *args, **kwargs)
|
53
50
|
return wrapper
|
54
51
|
return decorator
|
55
52
|
|
56
|
-
def require_task() -> Callable[[Callable[
|
53
|
+
def require_task() -> Callable[[Callable[..., Awaitable[T]]], Callable[..., Awaitable[T]]]:
|
57
54
|
"""Decorator that ensures task has required attributes and valid client before execution"""
|
58
|
-
def decorator(async_func: Callable[
|
55
|
+
def decorator(async_func: Callable[..., Awaitable[T]]) -> Callable[..., Awaitable[T]]:
|
59
56
|
@functools.wraps(async_func)
|
60
|
-
async def wrapper(self: Any, *args:
|
57
|
+
async def wrapper(self: Any, *args: Any, **kwargs: Any) -> T:
|
61
58
|
if not self.task_url:
|
62
59
|
raise ValueError("Task URL not found")
|
63
60
|
if not self._client:
|
@@ -30,14 +30,18 @@ async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[O
|
|
30
30
|
if isinstance(file, str):
|
31
31
|
if file.startswith(('http://', 'https://')):
|
32
32
|
return None, file
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
33
|
+
# Try to handle as a file path first
|
34
|
+
path = Path(file)
|
35
|
+
if path.exists():
|
36
|
+
# It's a valid file path, convert to Path object and continue processing
|
37
|
+
file = path
|
38
|
+
else:
|
39
|
+
# If not a valid file path, try treating as base64
|
37
40
|
try:
|
38
|
-
|
41
|
+
base64.b64decode(file)
|
42
|
+
return None, file
|
39
43
|
except:
|
40
|
-
raise ValueError("File
|
44
|
+
raise ValueError(f"File not found: {file} and it's not a valid base64 string")
|
41
45
|
|
42
46
|
# Handle file paths - convert to base64
|
43
47
|
if isinstance(file, Path):
|
@@ -1,5 +1,5 @@
|
|
1
1
|
from datetime import datetime
|
2
|
-
from typing import
|
2
|
+
from typing import Optional, cast, Awaitable, Union
|
3
3
|
from pydantic import BaseModel, PrivateAttr
|
4
4
|
import asyncio
|
5
5
|
import json
|
@@ -11,9 +11,7 @@ from .protocol import ChunkrClientProtocol
|
|
11
11
|
from .misc import prepare_upload_data
|
12
12
|
from .decorators import anywhere, require_task, retry_on_429
|
13
13
|
|
14
|
-
|
15
|
-
|
16
|
-
class TaskResponse(BaseModel, Generic[T]):
|
14
|
+
class TaskResponse(BaseModel):
|
17
15
|
configuration: OutputConfiguration
|
18
16
|
created_at: datetime
|
19
17
|
expires_at: Optional[datetime] = None
|
@@ -28,13 +26,13 @@ class TaskResponse(BaseModel, Generic[T]):
|
|
28
26
|
_base64_urls: bool = False
|
29
27
|
_client: Optional[ChunkrClientProtocol] = PrivateAttr(default=None)
|
30
28
|
|
31
|
-
def with_client(self, client: ChunkrClientProtocol, include_chunks: bool = False, base64_urls: bool = False) ->
|
29
|
+
def with_client(self, client: ChunkrClientProtocol, include_chunks: bool = False, base64_urls: bool = False) -> "TaskResponse":
|
32
30
|
self._client = client
|
33
31
|
self.include_chunks = include_chunks
|
34
32
|
self._base64_urls = base64_urls
|
35
33
|
return self
|
36
34
|
|
37
|
-
def _check_status(self) -> Optional[
|
35
|
+
def _check_status(self) -> Optional["TaskResponse"]:
|
38
36
|
"""Helper method to check task status and handle completion/failure"""
|
39
37
|
if self.status == "Failed":
|
40
38
|
if getattr(self._client, 'raise_on_failure', True):
|
@@ -47,6 +45,11 @@ class TaskResponse(BaseModel, Generic[T]):
|
|
47
45
|
@require_task()
|
48
46
|
async def _poll_request(self) -> dict:
|
49
47
|
try:
|
48
|
+
if not self._client:
|
49
|
+
raise ValueError("Chunkr client protocol is not initialized")
|
50
|
+
if not self._client._client or self._client._client.is_closed:
|
51
|
+
raise ValueError("httpx client is not open")
|
52
|
+
assert self.task_url is not None
|
50
53
|
r = await self._client._client.get(
|
51
54
|
self.task_url, headers=self._client._headers()
|
52
55
|
)
|
@@ -64,10 +67,12 @@ class TaskResponse(BaseModel, Generic[T]):
|
|
64
67
|
raise e
|
65
68
|
|
66
69
|
@anywhere()
|
67
|
-
async def poll(self) ->
|
70
|
+
async def poll(self) -> "TaskResponse":
|
68
71
|
"""Poll the task for completion."""
|
69
72
|
while True:
|
70
73
|
j = await self._poll_request()
|
74
|
+
if not self._client:
|
75
|
+
raise ValueError("Chunkr client protocol is not initialized")
|
71
76
|
updated = TaskResponse(**j).with_client(self._client)
|
72
77
|
self.__dict__.update(updated.__dict__)
|
73
78
|
if res := self._check_status():
|
@@ -77,9 +82,14 @@ class TaskResponse(BaseModel, Generic[T]):
|
|
77
82
|
@anywhere()
|
78
83
|
@require_task()
|
79
84
|
@retry_on_429()
|
80
|
-
async def update(self, config: Configuration) ->
|
85
|
+
async def update(self, config: Configuration) -> "TaskResponse":
|
81
86
|
"""Update the task configuration."""
|
82
87
|
data = await prepare_upload_data(None, None, config)
|
88
|
+
if not self._client:
|
89
|
+
raise ValueError("Chunkr client protocol is not initialized")
|
90
|
+
if not self._client._client or self._client._client.is_closed:
|
91
|
+
raise ValueError("httpx client is not open")
|
92
|
+
assert self.task_url is not None
|
83
93
|
r = await self._client._client.patch(
|
84
94
|
f"{self.task_url}/parse",
|
85
95
|
json=data,
|
@@ -88,12 +98,17 @@ class TaskResponse(BaseModel, Generic[T]):
|
|
88
98
|
r.raise_for_status()
|
89
99
|
updated = TaskResponse(**r.json()).with_client(self._client)
|
90
100
|
self.__dict__.update(updated.__dict__)
|
91
|
-
return
|
101
|
+
return cast(TaskResponse, self.poll())
|
92
102
|
|
93
103
|
@anywhere()
|
94
104
|
@require_task()
|
95
|
-
async def delete(self) ->
|
105
|
+
async def delete(self) -> "TaskResponse":
|
96
106
|
"""Delete the task."""
|
107
|
+
if not self._client:
|
108
|
+
raise ValueError("Chunkr client protocol is not initialized")
|
109
|
+
if not self._client._client or self._client._client.is_closed:
|
110
|
+
raise ValueError("httpx client is not open")
|
111
|
+
assert self.task_url is not None
|
97
112
|
r = await self._client._client.delete(
|
98
113
|
self.task_url, headers=self._client._headers()
|
99
114
|
)
|
@@ -102,15 +117,20 @@ class TaskResponse(BaseModel, Generic[T]):
|
|
102
117
|
|
103
118
|
@anywhere()
|
104
119
|
@require_task()
|
105
|
-
async def cancel(self) ->
|
120
|
+
async def cancel(self) -> "TaskResponse":
|
106
121
|
"""Cancel the task."""
|
122
|
+
if not self._client:
|
123
|
+
raise ValueError("Chunkr client protocol is not initialized")
|
124
|
+
if not self._client._client or self._client._client.is_closed:
|
125
|
+
raise ValueError("httpx client is not open")
|
126
|
+
assert self.task_url is not None
|
107
127
|
r = await self._client._client.get(
|
108
128
|
f"{self.task_url}/cancel", headers=self._client._headers()
|
109
129
|
)
|
110
130
|
r.raise_for_status()
|
111
|
-
return
|
131
|
+
return cast(TaskResponse, self.poll())
|
112
132
|
|
113
|
-
def _write_to_file(self, content: str
|
133
|
+
def _write_to_file(self, content: Union[str, dict], output_file: Optional[str], is_json: bool = False) -> None:
|
114
134
|
"""Helper method to write content to a file
|
115
135
|
|
116
136
|
Args:
|
@@ -131,9 +151,12 @@ class TaskResponse(BaseModel, Generic[T]):
|
|
131
151
|
if is_json:
|
132
152
|
json.dump(content, f, cls=DateTimeEncoder, indent=2)
|
133
153
|
else:
|
134
|
-
|
154
|
+
if isinstance(content, str):
|
155
|
+
f.write(content)
|
156
|
+
else:
|
157
|
+
raise ValueError("Content is not a string")
|
135
158
|
|
136
|
-
def html(self, output_file: str = None) -> str:
|
159
|
+
def html(self, output_file: Optional[str] = None) -> str:
|
137
160
|
"""Get the full HTML of the task
|
138
161
|
|
139
162
|
Args:
|
@@ -143,7 +166,7 @@ class TaskResponse(BaseModel, Generic[T]):
|
|
143
166
|
self._write_to_file(content, output_file)
|
144
167
|
return content
|
145
168
|
|
146
|
-
def markdown(self, output_file: str = None) -> str:
|
169
|
+
def markdown(self, output_file: Optional[str] = None) -> str:
|
147
170
|
"""Get the full markdown of the task
|
148
171
|
|
149
172
|
Args:
|
@@ -153,7 +176,7 @@ class TaskResponse(BaseModel, Generic[T]):
|
|
153
176
|
self._write_to_file(content, output_file)
|
154
177
|
return content
|
155
178
|
|
156
|
-
def content(self, output_file: str = None) -> str:
|
179
|
+
def content(self, output_file: Optional[str] = None) -> str:
|
157
180
|
"""Get the full content of the task
|
158
181
|
|
159
182
|
Args:
|
@@ -163,7 +186,7 @@ class TaskResponse(BaseModel, Generic[T]):
|
|
163
186
|
self._write_to_file(content, output_file)
|
164
187
|
return content
|
165
188
|
|
166
|
-
def json(self, output_file: str = None) -> dict:
|
189
|
+
def json(self, output_file: Optional[str] = None) -> dict:
|
167
190
|
"""Get the full task data as JSON
|
168
191
|
|
169
192
|
Args:
|
@@ -26,6 +26,14 @@ from chunkr_ai.models import (
|
|
26
26
|
def sample_path():
|
27
27
|
return Path("tests/files/test.pdf")
|
28
28
|
|
29
|
+
@pytest.fixture
|
30
|
+
def sample_absolute_path_str():
|
31
|
+
return "tests/files/test.pdf"
|
32
|
+
|
33
|
+
@pytest.fixture
|
34
|
+
def sample_relative_path_str():
|
35
|
+
return "./tests/files/test.pdf"
|
36
|
+
|
29
37
|
@pytest.fixture
|
30
38
|
def sample_image():
|
31
39
|
return Image.open("tests/files/test.jpg")
|
@@ -43,7 +51,7 @@ def client():
|
|
43
51
|
def markdown_embed_config():
|
44
52
|
return Configuration(
|
45
53
|
segment_processing=SegmentProcessing(
|
46
|
-
|
54
|
+
Page=GenerationConfig(
|
47
55
|
html=GenerationStrategy.LLM,
|
48
56
|
markdown=GenerationStrategy.LLM,
|
49
57
|
embed_sources=[EmbedSource.MARKDOWN]
|
@@ -55,7 +63,7 @@ def markdown_embed_config():
|
|
55
63
|
def html_embed_config():
|
56
64
|
return Configuration(
|
57
65
|
segment_processing=SegmentProcessing(
|
58
|
-
|
66
|
+
Page=GenerationConfig(
|
59
67
|
html=GenerationStrategy.LLM,
|
60
68
|
markdown=GenerationStrategy.LLM,
|
61
69
|
embed_sources=[EmbedSource.HTML]
|
@@ -67,7 +75,7 @@ def html_embed_config():
|
|
67
75
|
def multiple_embed_config():
|
68
76
|
return Configuration(
|
69
77
|
segment_processing=SegmentProcessing(
|
70
|
-
|
78
|
+
Page=GenerationConfig(
|
71
79
|
html=GenerationStrategy.LLM,
|
72
80
|
markdown=GenerationStrategy.LLM,
|
73
81
|
llm="Generate a summary of this content",
|
@@ -115,7 +123,7 @@ def xlm_roberta_with_html_content_config():
|
|
115
123
|
tokenizer=Tokenizer.XLM_ROBERTA_BASE
|
116
124
|
),
|
117
125
|
segment_processing=SegmentProcessing(
|
118
|
-
|
126
|
+
Page=GenerationConfig(
|
119
127
|
html=GenerationStrategy.LLM,
|
120
128
|
markdown=GenerationStrategy.LLM,
|
121
129
|
embed_sources=[EmbedSource.HTML, EmbedSource.CONTENT]
|
@@ -163,6 +171,20 @@ async def test_send_file_path(client, sample_path):
|
|
163
171
|
assert response.status == "Succeeded"
|
164
172
|
assert response.output is not None
|
165
173
|
|
174
|
+
@pytest.mark.asyncio
|
175
|
+
async def test_send_file_path_str(client, sample_absolute_path_str):
|
176
|
+
response = await client.upload(sample_absolute_path_str)
|
177
|
+
assert response.task_id is not None
|
178
|
+
assert response.status == "Succeeded"
|
179
|
+
assert response.output is not None
|
180
|
+
|
181
|
+
@pytest.mark.asyncio
|
182
|
+
async def test_send_file_relative_path_str(client, sample_relative_path_str):
|
183
|
+
response = await client.upload(sample_relative_path_str)
|
184
|
+
assert response.task_id is not None
|
185
|
+
assert response.status == "Succeeded"
|
186
|
+
assert response.output is not None
|
187
|
+
|
166
188
|
@pytest.mark.asyncio
|
167
189
|
async def test_send_file_url(client, sample_url):
|
168
190
|
response = await client.upload(sample_url)
|
@@ -171,7 +193,7 @@ async def test_send_file_url(client, sample_url):
|
|
171
193
|
assert response.output is not None
|
172
194
|
|
173
195
|
@pytest.mark.asyncio
|
174
|
-
async def
|
196
|
+
async def test_send_file_path_as_str(client, sample_path):
|
175
197
|
response = await client.upload(str(sample_path))
|
176
198
|
assert response.task_id is not None
|
177
199
|
assert response.status == "Succeeded"
|
@@ -240,7 +262,7 @@ async def test_page_llm_html(client, sample_path):
|
|
240
262
|
Configuration(
|
241
263
|
segmentation_strategy=SegmentationStrategy.PAGE,
|
242
264
|
segment_processing=SegmentProcessing(
|
243
|
-
|
265
|
+
Page=GenerationConfig(html=GenerationStrategy.LLM)
|
244
266
|
),
|
245
267
|
),
|
246
268
|
)
|
@@ -253,7 +275,7 @@ async def test_page_llm(client, sample_path):
|
|
253
275
|
configuration = Configuration(
|
254
276
|
segmentation_strategy=SegmentationStrategy.PAGE,
|
255
277
|
segment_processing=SegmentProcessing(
|
256
|
-
|
278
|
+
Page=GenerationConfig(
|
257
279
|
html=GenerationStrategy.LLM, markdown=GenerationStrategy.LLM
|
258
280
|
)
|
259
281
|
),
|
@@ -332,7 +354,7 @@ async def test_pipeline_type_azure(client, sample_path):
|
|
332
354
|
assert response.output is not None
|
333
355
|
|
334
356
|
@pytest.mark.asyncio
|
335
|
-
async def
|
357
|
+
async def test_pipeline_type_chunkr(client, sample_path):
|
336
358
|
response = await client.upload(sample_path, Configuration(pipeline=Pipeline.CHUNKR))
|
337
359
|
assert response.task_id is not None
|
338
360
|
assert response.status == "Succeeded"
|
@@ -568,7 +590,7 @@ async def test_combined_config_with_llm_and_other_settings(client, sample_path):
|
|
568
590
|
),
|
569
591
|
segmentation_strategy=SegmentationStrategy.PAGE,
|
570
592
|
segment_processing=SegmentProcessing(
|
571
|
-
|
593
|
+
Page=GenerationConfig(
|
572
594
|
html=GenerationStrategy.LLM,
|
573
595
|
markdown=GenerationStrategy.LLM
|
574
596
|
)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|