retab 0.0.66__py3-none-any.whl → 0.0.68__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- retab/__init__.py +2 -2
- retab/client.py +89 -6
- retab/generate_types.py +2 -2
- retab/resources/projects/client.py +25 -78
- retab/types/mime.py +56 -14
- retab/types/projects/__init__.py +2 -27
- retab/types/projects/model.py +51 -27
- retab/types/schemas/chat.py +2 -2
- retab/types/schemas/model.py +2 -2
- {retab-0.0.66.dist-info → retab-0.0.68.dist-info}/METADATA +3 -1
- {retab-0.0.66.dist-info → retab-0.0.68.dist-info}/RECORD +13 -19
- retab/resources/projects/documents.py +0 -257
- retab/resources/projects/iterations.py +0 -433
- retab/types/projects/Untitled-2.py +0 -16671
- retab/types/projects/documents.py +0 -38
- retab/types/projects/iterations.py +0 -123
- retab/types/projects/v2.py +0 -137
- {retab-0.0.66.dist-info → retab-0.0.68.dist-info}/WHEEL +0 -0
- {retab-0.0.66.dist-info → retab-0.0.68.dist-info}/top_level.txt +0 -0
retab/__init__.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from .client import AsyncRetab, Retab
|
|
1
|
+
from .client import AsyncRetab, Retab, SignatureVerificationError
|
|
2
2
|
from . import utils
|
|
3
3
|
from . import types
|
|
4
|
-
__all__ = ["Retab", "AsyncRetab", "utils", "types"]
|
|
4
|
+
__all__ = ["Retab", "AsyncRetab", "SignatureVerificationError", "utils", "types"]
|
retab/client.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import hmac
|
|
1
3
|
import json
|
|
2
4
|
import os
|
|
3
5
|
from types import TracebackType
|
|
@@ -12,6 +14,11 @@ from .resources import documents, models, schemas, projects
|
|
|
12
14
|
from .types.standards import PreparedRequest, FieldUnset
|
|
13
15
|
|
|
14
16
|
|
|
17
|
+
class SignatureVerificationError(Exception):
|
|
18
|
+
"""Raised when webhook signature verification fails."""
|
|
19
|
+
pass
|
|
20
|
+
|
|
21
|
+
|
|
15
22
|
class MaxRetriesExceeded(Exception):
|
|
16
23
|
pass
|
|
17
24
|
|
|
@@ -34,7 +41,7 @@ class BaseRetab:
|
|
|
34
41
|
Args:
|
|
35
42
|
api_key (str, optional): Retab API key. If not provided, will look for RETAB_API_KEY env variable.
|
|
36
43
|
base_url (str, optional): Base URL for API requests. Defaults to https://api.retab.com
|
|
37
|
-
timeout (float): Request timeout in seconds. Defaults to
|
|
44
|
+
timeout (float): Request timeout in seconds. Defaults to 1800.0 (30 minutes)
|
|
38
45
|
max_retries (int): Maximum number of retries for failed requests. Defaults to 3
|
|
39
46
|
openai_api_key (str, optional): OpenAI API key. Will look for OPENAI_API_KEY env variable if not provided
|
|
40
47
|
|
|
@@ -46,7 +53,7 @@ class BaseRetab:
|
|
|
46
53
|
self,
|
|
47
54
|
api_key: Optional[str] = None,
|
|
48
55
|
base_url: Optional[str] = None,
|
|
49
|
-
timeout: float =
|
|
56
|
+
timeout: float = 1800.0,
|
|
50
57
|
max_retries: int = 3,
|
|
51
58
|
openai_api_key: Optional[str] = FieldUnset,
|
|
52
59
|
gemini_api_key: Optional[str] = FieldUnset,
|
|
@@ -140,7 +147,7 @@ class Retab(BaseRetab):
|
|
|
140
147
|
Args:
|
|
141
148
|
api_key (str, optional): Retab API key. If not provided, will look for RETAB_API_KEY env variable.
|
|
142
149
|
base_url (str, optional): Base URL for API requests. Defaults to https://api.retab.com
|
|
143
|
-
timeout (float): Request timeout in seconds. Defaults to
|
|
150
|
+
timeout (float): Request timeout in seconds. Defaults to 1800.0 (30 minutes)
|
|
144
151
|
max_retries (int): Maximum number of retries for failed requests. Defaults to 3
|
|
145
152
|
openai_api_key (str, optional): OpenAI API key. Will look for OPENAI_API_KEY env variable if not provided
|
|
146
153
|
gemini_api_key (str, optional): Gemini API key. Will look for GEMINI_API_KEY env variable if not provided
|
|
@@ -161,7 +168,7 @@ class Retab(BaseRetab):
|
|
|
161
168
|
self,
|
|
162
169
|
api_key: Optional[str] = None,
|
|
163
170
|
base_url: Optional[str] = None,
|
|
164
|
-
timeout: float =
|
|
171
|
+
timeout: float = 1800.0,
|
|
165
172
|
max_retries: int = 3,
|
|
166
173
|
openai_api_key: Optional[str] = FieldUnset,
|
|
167
174
|
gemini_api_key: Optional[str] = FieldUnset,
|
|
@@ -385,6 +392,44 @@ class Retab(BaseRetab):
|
|
|
385
392
|
"""
|
|
386
393
|
self.close()
|
|
387
394
|
|
|
395
|
+
@staticmethod
|
|
396
|
+
def verify_event(event_body: bytes, event_signature: str, secret: str) -> Any:
|
|
397
|
+
"""Verify the signature of a webhook event.
|
|
398
|
+
|
|
399
|
+
Args:
|
|
400
|
+
event_body: The raw request body as bytes
|
|
401
|
+
event_signature: The signature from the request header (x-retab-signature)
|
|
402
|
+
secret: The webhook secret key used for signing
|
|
403
|
+
|
|
404
|
+
Returns:
|
|
405
|
+
Any: The parsed event payload (JSON)
|
|
406
|
+
|
|
407
|
+
Raises:
|
|
408
|
+
SignatureVerificationError: If the signature verification fails
|
|
409
|
+
|
|
410
|
+
Example:
|
|
411
|
+
```python
|
|
412
|
+
from retab import Retab
|
|
413
|
+
|
|
414
|
+
# In your webhook handler
|
|
415
|
+
secret = "your_webhook_secret"
|
|
416
|
+
body = request.body # Raw bytes
|
|
417
|
+
signature = request.headers.get("x-retab-signature")
|
|
418
|
+
|
|
419
|
+
try:
|
|
420
|
+
event = Retab.verify_event(body, signature, secret)
|
|
421
|
+
print(f"Verified event: {event}")
|
|
422
|
+
except SignatureVerificationError:
|
|
423
|
+
print("Invalid signature!")
|
|
424
|
+
```
|
|
425
|
+
"""
|
|
426
|
+
expected_signature = hmac.new(secret.encode(), event_body, hashlib.sha256).hexdigest()
|
|
427
|
+
|
|
428
|
+
if not hmac.compare_digest(event_signature, expected_signature):
|
|
429
|
+
raise SignatureVerificationError("Invalid signature")
|
|
430
|
+
|
|
431
|
+
return json.loads(event_body.decode("utf-8"))
|
|
432
|
+
|
|
388
433
|
|
|
389
434
|
class AsyncRetab(BaseRetab):
|
|
390
435
|
"""Asynchronous client for interacting with the Retab API.
|
|
@@ -395,7 +440,7 @@ class AsyncRetab(BaseRetab):
|
|
|
395
440
|
Args:
|
|
396
441
|
api_key (str, optional): Retab API key. If not provided, will look for RETAB_API_KEY env variable.
|
|
397
442
|
base_url (str, optional): Base URL for API requests. Defaults to https://api.retab.com
|
|
398
|
-
timeout (float): Request timeout in seconds. Defaults to
|
|
443
|
+
timeout (float): Request timeout in seconds. Defaults to 1800.0 (30 minutes)
|
|
399
444
|
max_retries (int): Maximum number of retries for failed requests. Defaults to 3
|
|
400
445
|
openai_api_key (str, optional): OpenAI API key. Will look for OPENAI_API_KEY env variable if not provided
|
|
401
446
|
claude_api_key (str, optional): Claude API key. Will look for CLAUDE_API_KEY env variable if not provided
|
|
@@ -418,7 +463,7 @@ class AsyncRetab(BaseRetab):
|
|
|
418
463
|
self,
|
|
419
464
|
api_key: Optional[str] = None,
|
|
420
465
|
base_url: Optional[str] = None,
|
|
421
|
-
timeout: float =
|
|
466
|
+
timeout: float = 1800.0,
|
|
422
467
|
max_retries: int = 3,
|
|
423
468
|
openai_api_key: Optional[str] = FieldUnset,
|
|
424
469
|
gemini_api_key: Optional[str] = FieldUnset,
|
|
@@ -661,3 +706,41 @@ class AsyncRetab(BaseRetab):
|
|
|
661
706
|
traceback: The traceback of the exception that was raised, if any
|
|
662
707
|
"""
|
|
663
708
|
await self.close()
|
|
709
|
+
@staticmethod
|
|
710
|
+
def verify_event(event_body: bytes, event_signature: str, secret: str) -> Any:
|
|
711
|
+
"""Verify the signature of a webhook event.
|
|
712
|
+
|
|
713
|
+
Args:
|
|
714
|
+
event_body: The raw request body as bytes
|
|
715
|
+
event_signature: The signature from the request header (x-retab-signature)
|
|
716
|
+
secret: The webhook secret key used for signing
|
|
717
|
+
|
|
718
|
+
Returns:
|
|
719
|
+
Any: The parsed event payload (JSON)
|
|
720
|
+
|
|
721
|
+
Raises:
|
|
722
|
+
SignatureVerificationError: If the signature verification fails
|
|
723
|
+
|
|
724
|
+
Example:
|
|
725
|
+
```python
|
|
726
|
+
from retab import AsyncRetab
|
|
727
|
+
|
|
728
|
+
# In your async webhook handler
|
|
729
|
+
secret = "your_webhook_secret"
|
|
730
|
+
body = await request.body() # Raw bytes
|
|
731
|
+
signature = request.headers.get("x-retab-signature")
|
|
732
|
+
|
|
733
|
+
try:
|
|
734
|
+
event = AsyncRetab.verify_event(body, signature, secret)
|
|
735
|
+
print(f"Verified event: {event}")
|
|
736
|
+
except SignatureVerificationError:
|
|
737
|
+
print("Invalid signature!")
|
|
738
|
+
```
|
|
739
|
+
"""
|
|
740
|
+
expected_signature = hmac.new(secret.encode(), event_body, hashlib.sha256).hexdigest()
|
|
741
|
+
|
|
742
|
+
if not hmac.compare_digest(event_signature, expected_signature):
|
|
743
|
+
raise SignatureVerificationError("Invalid signature")
|
|
744
|
+
|
|
745
|
+
return json.loads(event_body.decode("utf-8"))
|
|
746
|
+
|
retab/generate_types.py
CHANGED
|
@@ -59,7 +59,7 @@ def type_to_zod(field_type: Any, put_names: bool = True, ts: bool = False) -> st
|
|
|
59
59
|
optional = True
|
|
60
60
|
typename = make_union([type_to_zod(x) for x in args])
|
|
61
61
|
ts_typename = make_ts_union([type_to_zod(x, ts=True) for x in args])
|
|
62
|
-
elif issubclass(origin, BaseModel) or is_typeddict(origin) or is_typeddict_ext(origin):
|
|
62
|
+
elif isinstance(origin, type) and (issubclass(origin, BaseModel) or is_typeddict(origin) or is_typeddict_ext(origin)):
|
|
63
63
|
if put_names:
|
|
64
64
|
name = get_class_name(origin)
|
|
65
65
|
typename = "Z" + name
|
|
@@ -77,7 +77,7 @@ def type_to_zod(field_type: Any, put_names: bool = True, ts: bool = False) -> st
|
|
|
77
77
|
|
|
78
78
|
typename += "z.object({\n"
|
|
79
79
|
ts_typename += "{\n"
|
|
80
|
-
props = [(n, f.annotation, f.default) for n, f in origin.model_fields.items() if not f.exclude] if issubclass(origin, BaseModel) else \
|
|
80
|
+
props = [(n, f.annotation, f.default) for n, f in origin.model_fields.items() if not f.exclude] if isinstance(origin, type) and issubclass(origin, BaseModel) else \
|
|
81
81
|
[(n, f, PydanticUndefined) for n, f in origin.__annotations__.items()]
|
|
82
82
|
|
|
83
83
|
for field_name, field, default in props:
|
retab/resources/projects/client.py
CHANGED
|
@@ -8,11 +8,8 @@ from pydantic import HttpUrl
|
|
|
8
8
|
from ..._resource import AsyncAPIResource, SyncAPIResource
|
|
9
9
|
from ...utils.mime import MIMEData, prepare_mime_document
|
|
10
10
|
from ...types.documents.extract import RetabParsedChatCompletion
|
|
11
|
-
from ...types.projects import Project, PatchProjectRequest,
|
|
11
|
+
from ...types.projects import Project, PatchProjectRequest, CreateProjectRequest
|
|
12
12
|
from ...types.standards import PreparedRequest, DeleteResponse, FieldUnset
|
|
13
|
-
from .documents import Documents, AsyncDocuments
|
|
14
|
-
from .iterations import Iterations, AsyncIterations
|
|
15
|
-
|
|
16
13
|
|
|
17
14
|
class ProjectsMixin:
|
|
18
15
|
def prepare_create(
|
|
@@ -29,7 +26,7 @@ class ProjectsMixin:
|
|
|
29
26
|
if extra_body:
|
|
30
27
|
eval_dict.update(extra_body)
|
|
31
28
|
|
|
32
|
-
eval_data =
|
|
29
|
+
eval_data = CreateProjectRequest(**eval_dict)
|
|
33
30
|
return PreparedRequest(method="POST", url="/v1/projects", data=eval_data.model_dump(exclude_unset=True, mode="json"))
|
|
34
31
|
|
|
35
32
|
def prepare_get(self, project_id: str) -> PreparedRequest:
|
|
@@ -76,6 +73,10 @@ class ProjectsMixin:
|
|
|
76
73
|
def prepare_delete(self, id: str) -> PreparedRequest:
|
|
77
74
|
return PreparedRequest(method="DELETE", url=f"/v1/projects/{id}")
|
|
78
75
|
|
|
76
|
+
def prepare_publish(self, project_id: str, **extra_body: Any) -> PreparedRequest:
|
|
77
|
+
data = extra_body or None
|
|
78
|
+
return PreparedRequest(method="POST", url=f"/v1/projects/{project_id}/publish", data=data)
|
|
79
|
+
|
|
79
80
|
def prepare_extract(
|
|
80
81
|
self,
|
|
81
82
|
project_id: str,
|
|
@@ -160,8 +161,6 @@ class Projects(SyncAPIResource, ProjectsMixin):
|
|
|
160
161
|
|
|
161
162
|
def __init__(self, *args, **kwargs):
|
|
162
163
|
super().__init__(*args, **kwargs)
|
|
163
|
-
self.documents = Documents(self._client)
|
|
164
|
-
self.iterations = Iterations(self._client)
|
|
165
164
|
|
|
166
165
|
def create(
|
|
167
166
|
self,
|
|
@@ -174,9 +173,7 @@ class Projects(SyncAPIResource, ProjectsMixin):
|
|
|
174
173
|
|
|
175
174
|
Args:
|
|
176
175
|
name: The name of the project
|
|
177
|
-
json_schema: The
|
|
178
|
-
documents: The documents to associate with the project
|
|
179
|
-
|
|
176
|
+
json_schema: The json schema of the project
|
|
180
177
|
Returns:
|
|
181
178
|
Project: The created project
|
|
182
179
|
Raises:
|
|
@@ -202,37 +199,6 @@ class Projects(SyncAPIResource, ProjectsMixin):
|
|
|
202
199
|
response = self._client._prepared_request(request)
|
|
203
200
|
return Project(**response)
|
|
204
201
|
|
|
205
|
-
def update(
|
|
206
|
-
self,
|
|
207
|
-
project_id: str,
|
|
208
|
-
name: str = FieldUnset,
|
|
209
|
-
json_schema: dict[str, Any] = FieldUnset,
|
|
210
|
-
**extra_body: Any,
|
|
211
|
-
) -> Project:
|
|
212
|
-
"""
|
|
213
|
-
Update an project with partial updates.
|
|
214
|
-
|
|
215
|
-
Args:
|
|
216
|
-
project_id: The ID of the project to update
|
|
217
|
-
name: Optional new name for the project
|
|
218
|
-
json_schema: Optional new JSON schema
|
|
219
|
-
documents: Optional list of documents to update
|
|
220
|
-
iterations: Optional list of iterations to update
|
|
221
|
-
|
|
222
|
-
Returns:
|
|
223
|
-
Project: The updated project
|
|
224
|
-
Raises:
|
|
225
|
-
HTTPException if the request fails
|
|
226
|
-
"""
|
|
227
|
-
request = self.prepare_update(
|
|
228
|
-
project_id=project_id,
|
|
229
|
-
name=name,
|
|
230
|
-
json_schema=json_schema,
|
|
231
|
-
**extra_body,
|
|
232
|
-
)
|
|
233
|
-
response = self._client._prepared_request(request)
|
|
234
|
-
return Project(**response)
|
|
235
|
-
|
|
236
202
|
def list(self, **extra_params: Any) -> List[Project]:
|
|
237
203
|
"""
|
|
238
204
|
List projects for a project.
|
|
@@ -262,6 +228,12 @@ class Projects(SyncAPIResource, ProjectsMixin):
|
|
|
262
228
|
request = self.prepare_delete(project_id)
|
|
263
229
|
return self._client._prepared_request(request)
|
|
264
230
|
|
|
231
|
+
def publish(self, project_id: str, **extra_body: Any) -> Project:
|
|
232
|
+
"""Publish a project's draft configuration."""
|
|
233
|
+
request = self.prepare_publish(project_id, **extra_body)
|
|
234
|
+
response = self._client._prepared_request(request)
|
|
235
|
+
return Project(**response)
|
|
236
|
+
|
|
265
237
|
def extract(
|
|
266
238
|
self,
|
|
267
239
|
project_id: str,
|
|
@@ -314,23 +286,20 @@ class AsyncProjects(AsyncAPIResource, ProjectsMixin):
|
|
|
314
286
|
|
|
315
287
|
def __init__(self, *args, **kwargs):
|
|
316
288
|
super().__init__(*args, **kwargs)
|
|
317
|
-
self.documents = AsyncDocuments(self._client)
|
|
318
|
-
self.iterations = AsyncIterations(self._client)
|
|
319
289
|
|
|
320
|
-
async def create(self, name: str, json_schema:
|
|
290
|
+
async def create(self, name: str, json_schema: dict[str, Any], **extra_body: Any) -> Project:
|
|
321
291
|
"""
|
|
322
292
|
Create a new project.
|
|
323
293
|
|
|
324
294
|
Args:
|
|
325
295
|
name: The name of the project
|
|
326
|
-
json_schema: The
|
|
327
|
-
|
|
296
|
+
json_schema: The json schema of the project
|
|
328
297
|
Returns:
|
|
329
298
|
Project: The created project
|
|
330
299
|
Raises:
|
|
331
300
|
HTTPException if the request fails
|
|
332
301
|
"""
|
|
333
|
-
request = self.prepare_create(name, json_schema)
|
|
302
|
+
request = self.prepare_create(name, json_schema, **extra_body)
|
|
334
303
|
response = await self._client._prepared_request(request)
|
|
335
304
|
return Project(**response)
|
|
336
305
|
|
|
@@ -350,36 +319,7 @@ class AsyncProjects(AsyncAPIResource, ProjectsMixin):
|
|
|
350
319
|
response = await self._client._prepared_request(request)
|
|
351
320
|
return Project(**response)
|
|
352
321
|
|
|
353
|
-
async def
|
|
354
|
-
self,
|
|
355
|
-
project_id: str,
|
|
356
|
-
name: str = FieldUnset,
|
|
357
|
-
json_schema: dict[str, Any] = FieldUnset,
|
|
358
|
-
) -> Project:
|
|
359
|
-
"""
|
|
360
|
-
Update an project with partial updates.
|
|
361
|
-
|
|
362
|
-
Args:
|
|
363
|
-
id: The ID of the project to update
|
|
364
|
-
name: Optional new name for the project
|
|
365
|
-
json_schema: Optional new JSON schema
|
|
366
|
-
documents: Optional list of documents to update
|
|
367
|
-
iterations: Optional list of iterations to update
|
|
368
|
-
|
|
369
|
-
Returns:
|
|
370
|
-
Project: The updated project
|
|
371
|
-
Raises:
|
|
372
|
-
HTTPException if the request fails
|
|
373
|
-
"""
|
|
374
|
-
request = self.prepare_update(
|
|
375
|
-
project_id=project_id,
|
|
376
|
-
name=name,
|
|
377
|
-
json_schema=json_schema,
|
|
378
|
-
)
|
|
379
|
-
response = await self._client._prepared_request(request)
|
|
380
|
-
return Project(**response)
|
|
381
|
-
|
|
382
|
-
async def list(self) -> List[Project]:
|
|
322
|
+
async def list(self, **extra_params: Any) -> List[Project]:
|
|
383
323
|
"""
|
|
384
324
|
List projects for a project.
|
|
385
325
|
|
|
@@ -388,7 +328,7 @@ class AsyncProjects(AsyncAPIResource, ProjectsMixin):
|
|
|
388
328
|
Raises:
|
|
389
329
|
HTTPException if the request fails
|
|
390
330
|
"""
|
|
391
|
-
request = self.prepare_list()
|
|
331
|
+
request = self.prepare_list(**extra_params)
|
|
392
332
|
response = await self._client._prepared_request(request)
|
|
393
333
|
return [Project(**item) for item in response.get("data", [])]
|
|
394
334
|
|
|
@@ -407,6 +347,12 @@ class AsyncProjects(AsyncAPIResource, ProjectsMixin):
|
|
|
407
347
|
request = self.prepare_delete(project_id)
|
|
408
348
|
return await self._client._prepared_request(request)
|
|
409
349
|
|
|
350
|
+
async def publish(self, project_id: str, **extra_body: Any) -> Project:
|
|
351
|
+
"""Publish a project's draft configuration."""
|
|
352
|
+
request = self.prepare_publish(project_id, **extra_body)
|
|
353
|
+
response = await self._client._prepared_request(request)
|
|
354
|
+
return Project(**response)
|
|
355
|
+
|
|
410
356
|
async def extract(
|
|
411
357
|
self,
|
|
412
358
|
project_id: str,
|
|
@@ -419,6 +365,7 @@ class AsyncProjects(AsyncAPIResource, ProjectsMixin):
|
|
|
419
365
|
n_consensus: int | None = None,
|
|
420
366
|
seed: int | None = None,
|
|
421
367
|
store: bool = True,
|
|
368
|
+
**extra_form: Any,
|
|
422
369
|
) -> RetabParsedChatCompletion:
|
|
423
370
|
"""Extract documents from a project.
|
|
424
371
|
|
retab/types/mime.py
CHANGED
|
@@ -8,6 +8,8 @@ from typing import Any, Optional, Self, Sequence
|
|
|
8
8
|
from pydantic import BaseModel, Field, field_validator
|
|
9
9
|
from ..utils.hashing import generate_blake2b_hash_from_base64
|
|
10
10
|
|
|
11
|
+
import io
|
|
12
|
+
|
|
11
13
|
# Add webp and heic to the list of supported mime types
|
|
12
14
|
mimetypes.add_type("image/webp", ".webp")
|
|
13
15
|
mimetypes.add_type("image/heic", ".heic")
|
|
@@ -85,8 +87,17 @@ class OCR(BaseModel):
|
|
|
85
87
|
|
|
86
88
|
|
|
87
89
|
class MIMEData(BaseModel):
|
|
88
|
-
filename: str = Field(
|
|
89
|
-
|
|
90
|
+
filename: str = Field(
|
|
91
|
+
description="The filename of the file",
|
|
92
|
+
examples=["file.pdf", "image.png", "data.txt"]
|
|
93
|
+
)
|
|
94
|
+
url: str = Field(
|
|
95
|
+
description="The URL of the file in base64 format",
|
|
96
|
+
examples=["..."]
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
# Internal resource
|
|
100
|
+
_buffer: Optional[io.BytesIO] = None
|
|
90
101
|
|
|
91
102
|
@property
|
|
92
103
|
def id(self) -> str:
|
|
@@ -99,18 +110,14 @@ class MIMEData(BaseModel):
|
|
|
99
110
|
@property
|
|
100
111
|
def content(self) -> str:
|
|
101
112
|
if self.url.startswith("data:"):
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
return base64_content
|
|
105
|
-
else:
|
|
106
|
-
raise ValueError("Content is not available for this file")
|
|
113
|
+
return self.url.split(",")[1]
|
|
114
|
+
raise ValueError("Content is not available for this file")
|
|
107
115
|
|
|
108
116
|
@property
|
|
109
117
|
def mime_type(self) -> str:
|
|
110
118
|
if self.url.startswith("data:"):
|
|
111
119
|
return self.url.split(";")[0].split(":")[1]
|
|
112
|
-
|
|
113
|
-
return mimetypes.guess_type(self.filename)[0] or "application/octet-stream"
|
|
120
|
+
return mimetypes.guess_type(self.filename)[0] or "application/octet-stream"
|
|
114
121
|
|
|
115
122
|
@property
|
|
116
123
|
def unique_filename(self) -> str:
|
|
@@ -118,22 +125,57 @@ class MIMEData(BaseModel):
|
|
|
118
125
|
|
|
119
126
|
@property
|
|
120
127
|
def size(self) -> int:
|
|
121
|
-
# size in bytes
|
|
122
128
|
return len(base64.b64decode(self.content))
|
|
123
129
|
|
|
130
|
+
# def to_bytesio(self) -> io.BytesIO:
|
|
131
|
+
# """Decode base64 and return a BytesIO (without leaking references)."""
|
|
132
|
+
# buf = io.BytesIO(base64.b64decode(self.content))
|
|
133
|
+
# buf.seek(0)
|
|
134
|
+
# return buf
|
|
135
|
+
|
|
136
|
+
# # -------- Context manager interface --------
|
|
137
|
+
|
|
138
|
+
# def __enter__(self) -> io.BytesIO:
|
|
139
|
+
# """Opens the internal buffer so you can use it like a file."""
|
|
140
|
+
# if self._buffer is None:
|
|
141
|
+
# self._buffer = self.to_bytesio()
|
|
142
|
+
# return self._buffer
|
|
143
|
+
|
|
144
|
+
# def __exit__(self, exc_type, exc_val, exc_tb):
|
|
145
|
+
# """Close and cleanup the buffer."""
|
|
146
|
+
# if self._buffer is not None:
|
|
147
|
+
# self._buffer.close()
|
|
148
|
+
# self._buffer = None
|
|
149
|
+
|
|
150
|
+
# # -------- Optional convenience methods --------
|
|
151
|
+
|
|
152
|
+
# def open(self) -> io.BytesIO:
|
|
153
|
+
# """Manual open without `with`."""
|
|
154
|
+
# return self.__enter__()
|
|
155
|
+
|
|
156
|
+
# def close(self):
|
|
157
|
+
# """Manual close."""
|
|
158
|
+
# self.__exit__(None, None, None)
|
|
159
|
+
|
|
124
160
|
def __str__(self) -> str:
|
|
125
161
|
truncated_url = self.url[:50] + "..." if len(self.url) > 50 else self.url
|
|
126
|
-
|
|
127
|
-
|
|
162
|
+
return (
|
|
163
|
+
f"MIMEData(filename='{self.filename}', "
|
|
164
|
+
f"url='{truncated_url}', "
|
|
165
|
+
f"mime_type='{self.mime_type}', "
|
|
166
|
+
f"size='{self.size}', "
|
|
167
|
+
f"extension='{self.extension}')"
|
|
168
|
+
)
|
|
128
169
|
|
|
129
170
|
def __repr__(self) -> str:
|
|
130
171
|
return self.__str__()
|
|
131
172
|
|
|
132
173
|
|
|
174
|
+
|
|
133
175
|
class BaseMIMEData(MIMEData):
|
|
134
176
|
@classmethod
|
|
135
177
|
def model_validate(
|
|
136
|
-
cls, obj: Any, *, strict: bool | None = None, from_attributes: bool | None = None, context: Any | None = None, by_alias: bool | None = None, by_name: bool | None = None
|
|
178
|
+
cls, obj: Any, *, strict: bool | None = None, extra: Any | None = None, from_attributes: bool | None = None, context: Any | None = None, by_alias: bool | None = None, by_name: bool | None = None
|
|
137
179
|
) -> Self:
|
|
138
180
|
if isinstance(obj, MIMEData):
|
|
139
181
|
# Convert MIMEData instance to dict
|
|
@@ -153,7 +195,7 @@ class BaseMIMEData(MIMEData):
|
|
|
153
195
|
else:
|
|
154
196
|
# If there's no comma (unexpected format), truncate to 996 chars (multiple of 4)
|
|
155
197
|
obj["url"] = obj["url"][:996]
|
|
156
|
-
return super().model_validate(obj, strict=strict, from_attributes=from_attributes, context=context, by_alias=by_alias, by_name=by_name)
|
|
198
|
+
return super().model_validate(obj, strict=strict, extra=extra, from_attributes=from_attributes, context=context, by_alias=by_alias, by_name=by_name)
|
|
157
199
|
|
|
158
200
|
@property
|
|
159
201
|
def id(self) -> str:
|
retab/types/projects/__init__.py
CHANGED
|
@@ -1,33 +1,8 @@
|
|
|
1
|
-
from .model import Project,
|
|
2
|
-
from .documents import AnnotatedDocument, DocumentItem, ProjectDocument, CreateProjectDocumentRequest, PatchProjectDocumentRequest
|
|
3
|
-
from .iterations import (
|
|
4
|
-
BaseIteration,
|
|
5
|
-
Iteration,
|
|
6
|
-
CreateIterationRequest,
|
|
7
|
-
PatchIterationRequest,
|
|
8
|
-
ProcessIterationRequest,
|
|
9
|
-
DocumentStatus,
|
|
10
|
-
IterationDocumentStatusResponse,
|
|
11
|
-
AddIterationFromJsonlRequest,
|
|
12
|
-
)
|
|
1
|
+
from .model import Project, CreateProjectRequest, PatchProjectRequest
|
|
13
2
|
|
|
14
3
|
|
|
15
4
|
__all__ = [
|
|
16
5
|
"Project",
|
|
17
|
-
"BaseProject",
|
|
18
6
|
"CreateProjectRequest",
|
|
19
|
-
"PatchProjectRequest"
|
|
20
|
-
"AnnotatedDocument",
|
|
21
|
-
"DocumentItem",
|
|
22
|
-
"ProjectDocument",
|
|
23
|
-
"CreateProjectDocumentRequest",
|
|
24
|
-
"PatchProjectDocumentRequest",
|
|
25
|
-
"BaseIteration",
|
|
26
|
-
"Iteration",
|
|
27
|
-
"CreateIterationRequest",
|
|
28
|
-
"PatchIterationRequest",
|
|
29
|
-
"ProcessIterationRequest",
|
|
30
|
-
"DocumentStatus",
|
|
31
|
-
"IterationDocumentStatusResponse",
|
|
32
|
-
"AddIterationFromJsonlRequest",
|
|
7
|
+
"PatchProjectRequest"
|
|
33
8
|
]
|
retab/types/projects/model.py
CHANGED
|
@@ -4,8 +4,6 @@ from typing import Any, Optional
|
|
|
4
4
|
import nanoid # type: ignore
|
|
5
5
|
from pydantic import BaseModel, Field, ConfigDict
|
|
6
6
|
|
|
7
|
-
from .documents import ProjectDocument
|
|
8
|
-
from .iterations import Iteration
|
|
9
7
|
from ..inference_settings import InferenceSettings
|
|
10
8
|
|
|
11
9
|
default_inference_settings = InferenceSettings(
|
|
@@ -17,42 +15,68 @@ default_inference_settings = InferenceSettings(
|
|
|
17
15
|
browser_canvas="A4",
|
|
18
16
|
n_consensus=1,
|
|
19
17
|
)
|
|
18
|
+
class Function(BaseModel):
|
|
19
|
+
model_config = ConfigDict(extra="ignore")
|
|
20
|
+
id: str = Field(default_factory=lambda: "function_" + nanoid.generate())
|
|
21
|
+
path: str
|
|
22
|
+
code: Optional[str] = Field(default=None, description="The code of the function")
|
|
23
|
+
function_registry_id: Optional[str] = Field(default=None, description="The function registry id of the function")
|
|
24
|
+
|
|
25
|
+
# @model_validator(mode="before")
|
|
26
|
+
# @classmethod
|
|
27
|
+
# def validate_function(cls, data: Any):
|
|
28
|
+
# if isinstance(data, dict):
|
|
29
|
+
# code = data.get("code")
|
|
30
|
+
# function_registry_id = data.get("function_registry_id")
|
|
31
|
+
# if code is None and function_registry_id is None:
|
|
32
|
+
# raise ValueError("Either code or function_registry_id must be provided")
|
|
33
|
+
# return data
|
|
34
|
+
|
|
35
|
+
class FunctionHilCriterion(BaseModel):
|
|
36
|
+
path: str
|
|
37
|
+
agentic_fix: bool = Field(default=False, description="Whether to use agentic fix for the criterion")
|
|
20
38
|
|
|
21
|
-
class
|
|
22
|
-
|
|
23
|
-
|
|
39
|
+
class HumanInTheLoopParams(BaseModel):
|
|
40
|
+
enabled: bool = Field(default=False)
|
|
41
|
+
url: str = Field(default="", description="The URL of the human in the loop endpoint")
|
|
42
|
+
headers: dict[str, str] = Field(default_factory=dict, description="The headers to send to the human in the loop endpoint")
|
|
43
|
+
criteria: list[FunctionHilCriterion] = Field(default_factory=list, description="The criteria to use for the human in the loop")
|
|
24
44
|
|
|
25
|
-
class
|
|
45
|
+
class PublishedConfig(BaseModel):
|
|
46
|
+
inference_settings: InferenceSettings = default_inference_settings
|
|
47
|
+
json_schema: dict[str, Any] = Field(default_factory=dict, description="The json schema of the project")
|
|
48
|
+
human_in_the_loop_params: HumanInTheLoopParams = Field(default_factory=HumanInTheLoopParams)
|
|
49
|
+
origin: str = Field(default="manual", description="The origin of the published config. Either 'Manual' or the iteration id that was used to generate the config")
|
|
50
|
+
class DraftConfig(BaseModel):
|
|
51
|
+
inference_settings: InferenceSettings = default_inference_settings
|
|
52
|
+
json_schema: dict[str, Any] = Field(default_factory=dict, description="The json schema of the builder config")
|
|
53
|
+
human_in_the_loop_criteria: list[FunctionHilCriterion] = Field(default_factory=list)
|
|
54
|
+
class Project(BaseModel):
|
|
26
55
|
model_config = ConfigDict(extra="ignore")
|
|
27
|
-
id: str = Field(default_factory=lambda: "
|
|
56
|
+
id: str = Field(default_factory=lambda: "project_" + nanoid.generate())
|
|
28
57
|
name: str = Field(default="", description="The name of the project")
|
|
29
|
-
json_schema: dict[str, Any] = Field(default_factory=dict, description="The json schema of the project")
|
|
30
58
|
updated_at: datetime.datetime = Field(default_factory=lambda: datetime.datetime.now(tz=datetime.timezone.utc))
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
59
|
+
published_config: PublishedConfig
|
|
60
|
+
draft_config: DraftConfig
|
|
61
|
+
is_published: bool = False
|
|
62
|
+
#computation_spec: ComputationSpec = Field(default_factory=ComputationSpec, description="The computation spec of the project")
|
|
63
|
+
functions: list[Function] = Field(default_factory=list, description="The functions of the project")
|
|
34
64
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
iterations: list[Iteration] = Field(default_factory=list)
|
|
65
|
+
class StoredProject(Project):
|
|
66
|
+
"""Project model with organization_id for database storage"""
|
|
67
|
+
organization_id: str
|
|
39
68
|
|
|
40
69
|
class CreateProjectRequest(BaseModel):
|
|
41
70
|
model_config = ConfigDict(extra="ignore")
|
|
42
71
|
name: str
|
|
43
|
-
json_schema: dict[str, Any]
|
|
44
|
-
|
|
72
|
+
json_schema: dict[str, Any] = Field(default_factory=dict, description="The json schema of the project")
|
|
45
73
|
|
|
46
|
-
# This is basically the same as
|
|
47
|
-
# Could be achieved by convert_basemodel_to_partial_basemodel(BaseProject) but we prefer explicitness
|
|
74
|
+
# This is basically the same as Project, but everything is optional.
|
|
48
75
|
class PatchProjectRequest(BaseModel):
|
|
49
76
|
model_config = ConfigDict(extra="ignore")
|
|
50
77
|
name: Optional[str] = Field(default=None, description="The name of the document")
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
class AddIterationFromJsonlRequest(BaseModel):
|
|
57
|
-
model_config = ConfigDict(extra="ignore")
|
|
58
|
-
jsonl_gcs_path: str
|
|
78
|
+
published_config: Optional[PublishedConfig] = Field(default=None, description="The published config of the project")
|
|
79
|
+
draft_config: Optional[DraftConfig] = Field(default=None, description="The draft config of the project")
|
|
80
|
+
is_published: Optional[bool] = Field(default=None, description="The published status of the project")
|
|
81
|
+
#computation_spec: Optional[ComputationSpec] = Field(default=None, description="The computation spec of the project")
|
|
82
|
+
functions: Optional[list[Function]] = Field(default=None, description="The functions of the project")
|
retab/types/schemas/chat.py
CHANGED
|
@@ -25,8 +25,8 @@ from openai.types.responses.response_input_message_content_list_param import Res
|
|
|
25
25
|
from openai.types.responses.response_input_param import ResponseInputItemParam
|
|
26
26
|
from openai.types.responses.response_input_text_param import ResponseInputTextParam
|
|
27
27
|
|
|
28
|
-
from
|
|
29
|
-
from
|
|
28
|
+
from ...types.chat import ChatCompletionRetabMessage
|
|
29
|
+
from ...types.documents.extract import RetabParsedChatCompletion, RetabParsedChoice
|
|
30
30
|
|
|
31
31
|
|
|
32
32
|
MediaType = Literal["image/jpeg", "image/png", "image/gif", "image/webp"]
|
retab/types/schemas/model.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import copy
|
|
2
2
|
import json
|
|
3
3
|
from typing import Any, Literal, Optional, Self, Union, Type, MutableMapping, Tuple, MutableSequence
|
|
4
|
-
from
|
|
4
|
+
from ...utils.hashing import generate_blake2b_hash_from_string
|
|
5
5
|
|
|
6
6
|
import datetime
|
|
7
7
|
from pathlib import Path
|
|
@@ -16,7 +16,7 @@ from pydantic import BaseModel, Field, PrivateAttr, computed_field, model_valida
|
|
|
16
16
|
from .chat import convert_to_anthropic_format, convert_to_google_genai_format
|
|
17
17
|
from .chat import convert_to_openai_completions_api_format
|
|
18
18
|
|
|
19
|
-
from
|
|
19
|
+
from ...utils.json_schema import convert_json_schema_to_basemodel, expand_refs, load_json_schema
|
|
20
20
|
from .chat import convert_to_openai_responses_api_format
|
|
21
21
|
from ..standards import StreamingBaseModel
|
|
22
22
|
from ..chat import ChatCompletionRetabMessage
|