retab 0.0.68__py3-none-any.whl → 0.0.70__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- retab/client.py +3 -1
- retab/resources/documents/client.py +44 -138
- retab/resources/extractions/__init__.py +3 -0
- retab/resources/extractions/client.py +288 -0
- retab/resources/projects/client.py +7 -1
- retab/resources/schemas.py +0 -8
- retab/types/documents/create_messages.py +10 -12
- retab/types/documents/extract.py +16 -81
- retab/types/documents/parse.py +0 -2
- retab/types/extractions/__init__.py +0 -0
- retab/types/extractions/types.py +3 -0
- retab/types/inference_settings.py +6 -4
- retab/types/mime.py +4 -38
- retab/types/pagination.py +8 -0
- retab/types/projects/model.py +49 -36
- retab/types/schemas/generate.py +0 -4
- {retab-0.0.68.dist-info → retab-0.0.70.dist-info}/METADATA +1 -1
- {retab-0.0.68.dist-info → retab-0.0.70.dist-info}/RECORD +20 -18
- retab/client copy.py +0 -693
- retab/types/browser_canvas.py +0 -3
- {retab-0.0.68.dist-info → retab-0.0.70.dist-info}/WHEEL +0 -0
- {retab-0.0.68.dist-info → retab-0.0.70.dist-info}/top_level.txt +0 -0
retab/client.py
CHANGED
|
@@ -10,7 +10,7 @@ import backoff.types
|
|
|
10
10
|
import httpx
|
|
11
11
|
import truststore
|
|
12
12
|
|
|
13
|
-
from .resources import documents, models, schemas, projects
|
|
13
|
+
from .resources import documents, models, schemas, projects, extractions
|
|
14
14
|
from .types.standards import PreparedRequest, FieldUnset
|
|
15
15
|
|
|
16
16
|
|
|
@@ -184,6 +184,7 @@ class Retab(BaseRetab):
|
|
|
184
184
|
|
|
185
185
|
self.client = httpx.Client(timeout=self.timeout)
|
|
186
186
|
self.projects = projects.Projects(client=self)
|
|
187
|
+
self.extractions = extractions.Extractions(client=self)
|
|
187
188
|
self.documents = documents.Documents(client=self)
|
|
188
189
|
self.models = models.Models(client=self)
|
|
189
190
|
self.schemas = schemas.Schemas(client=self)
|
|
@@ -480,6 +481,7 @@ class AsyncRetab(BaseRetab):
|
|
|
480
481
|
self.client = httpx.AsyncClient(timeout=self.timeout)
|
|
481
482
|
|
|
482
483
|
self.projects = projects.AsyncProjects(client=self)
|
|
484
|
+
self.extractions = extractions.AsyncExtractions(client=self)
|
|
483
485
|
self.documents = documents.AsyncDocuments(client=self)
|
|
484
486
|
self.models = models.AsyncModels(client=self)
|
|
485
487
|
self.schemas = schemas.AsyncSchemas(client=self)
|
|
@@ -14,7 +14,6 @@ from ...utils.stream_context_managers import as_async_context_manager, as_contex
|
|
|
14
14
|
from ...types.documents.create_messages import DocumentCreateInputRequest, DocumentCreateMessageRequest, DocumentMessage
|
|
15
15
|
from ...types.documents.extract import DocumentExtractRequest, RetabParsedChatCompletion, RetabParsedChatCompletionChunk, RetabParsedChoice, maybe_parse_to_pydantic
|
|
16
16
|
from ...types.documents.parse import ParseRequest, ParseResult, TableParsingFormat
|
|
17
|
-
from ...types.browser_canvas import BrowserCanvas
|
|
18
17
|
from ...types.mime import MIMEData
|
|
19
18
|
from ...types.standards import PreparedRequest, FieldUnset
|
|
20
19
|
from ...utils.json_schema import load_json_schema, unflatten_dict
|
|
@@ -26,8 +25,6 @@ class BaseDocumentsMixin:
|
|
|
26
25
|
self,
|
|
27
26
|
document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
|
|
28
27
|
image_resolution_dpi: int = FieldUnset,
|
|
29
|
-
browser_canvas: BrowserCanvas = FieldUnset,
|
|
30
|
-
idempotency_key: str | None = None,
|
|
31
28
|
**extra_body: Any,
|
|
32
29
|
) -> PreparedRequest:
|
|
33
30
|
mime_document = prepare_mime_document(document)
|
|
@@ -35,8 +32,6 @@ class BaseDocumentsMixin:
|
|
|
35
32
|
loading_request_dict: dict[str, Any] = {"document": mime_document}
|
|
36
33
|
if image_resolution_dpi is not FieldUnset:
|
|
37
34
|
loading_request_dict["image_resolution_dpi"] = image_resolution_dpi
|
|
38
|
-
if browser_canvas is not FieldUnset:
|
|
39
|
-
loading_request_dict["browser_canvas"] = browser_canvas
|
|
40
35
|
|
|
41
36
|
# Merge any extra fields provided by the caller
|
|
42
37
|
if extra_body:
|
|
@@ -44,7 +39,7 @@ class BaseDocumentsMixin:
|
|
|
44
39
|
|
|
45
40
|
loading_request = DocumentCreateMessageRequest(**loading_request_dict)
|
|
46
41
|
return PreparedRequest(
|
|
47
|
-
method="POST", url="/v1/documents/create_messages", data=loading_request.model_dump(mode="json", exclude_unset=True)
|
|
42
|
+
method="POST", url="/v1/documents/create_messages", data=loading_request.model_dump(mode="json", exclude_unset=True)
|
|
48
43
|
)
|
|
49
44
|
|
|
50
45
|
def _prepare_get_extraction(self, extraction_id: str) -> PreparedRequest:
|
|
@@ -61,8 +56,6 @@ class BaseDocumentsMixin:
|
|
|
61
56
|
document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
|
|
62
57
|
json_schema: dict[str, Any] | Path | str,
|
|
63
58
|
image_resolution_dpi: int = FieldUnset,
|
|
64
|
-
browser_canvas: BrowserCanvas = FieldUnset,
|
|
65
|
-
idempotency_key: str | None = None,
|
|
66
59
|
**extra_body: Any,
|
|
67
60
|
) -> PreparedRequest:
|
|
68
61
|
mime_document = prepare_mime_document(document)
|
|
@@ -74,15 +67,13 @@ class BaseDocumentsMixin:
|
|
|
74
67
|
}
|
|
75
68
|
if image_resolution_dpi is not FieldUnset:
|
|
76
69
|
loading_request_dict["image_resolution_dpi"] = image_resolution_dpi
|
|
77
|
-
if browser_canvas is not FieldUnset:
|
|
78
|
-
loading_request_dict["browser_canvas"] = browser_canvas
|
|
79
70
|
|
|
80
71
|
# Merge any extra fields provided by the caller
|
|
81
72
|
if extra_body:
|
|
82
73
|
loading_request_dict.update(extra_body)
|
|
83
74
|
|
|
84
75
|
loading_request = DocumentCreateInputRequest(**loading_request_dict)
|
|
85
|
-
return PreparedRequest(method="POST", url="/v1/documents/create_inputs", data=loading_request.model_dump(mode="json", exclude_unset=True)
|
|
76
|
+
return PreparedRequest(method="POST", url="/v1/documents/create_inputs", data=loading_request.model_dump(mode="json", exclude_unset=True))
|
|
86
77
|
|
|
87
78
|
def _prepare_correct_image_orientation(self, document: Path | str | IOBase | MIMEData | PIL.Image.Image) -> PreparedRequest:
|
|
88
79
|
mime_document = prepare_mime_document(document)
|
|
@@ -102,8 +93,6 @@ class BaseDocumentsMixin:
|
|
|
102
93
|
model: str,
|
|
103
94
|
table_parsing_format: TableParsingFormat = FieldUnset,
|
|
104
95
|
image_resolution_dpi: int = FieldUnset,
|
|
105
|
-
browser_canvas: BrowserCanvas = FieldUnset,
|
|
106
|
-
idempotency_key: str | None = None,
|
|
107
96
|
**extra_body: Any,
|
|
108
97
|
) -> PreparedRequest:
|
|
109
98
|
mime_document = prepare_mime_document(document)
|
|
@@ -116,50 +105,40 @@ class BaseDocumentsMixin:
|
|
|
116
105
|
request_dict["table_parsing_format"] = table_parsing_format
|
|
117
106
|
if image_resolution_dpi is not FieldUnset:
|
|
118
107
|
request_dict["image_resolution_dpi"] = image_resolution_dpi
|
|
119
|
-
if browser_canvas is not FieldUnset:
|
|
120
|
-
request_dict["browser_canvas"] = browser_canvas
|
|
121
108
|
|
|
122
109
|
# Merge any extra fields provided by the caller
|
|
123
110
|
if extra_body:
|
|
124
111
|
request_dict.update(extra_body)
|
|
125
112
|
|
|
126
113
|
parse_request = ParseRequest(**request_dict)
|
|
127
|
-
return PreparedRequest(method="POST", url="/v1/documents/parse", data=parse_request.model_dump(mode="json", exclude_unset=True)
|
|
114
|
+
return PreparedRequest(method="POST", url="/v1/documents/parse", data=parse_request.model_dump(mode="json", exclude_unset=True))
|
|
128
115
|
|
|
129
116
|
def _prepare_extract(
|
|
130
117
|
self,
|
|
131
118
|
json_schema: dict[str, Any] | Path | str,
|
|
132
119
|
model: str,
|
|
133
|
-
document: Path | str | IOBase | HttpUrl
|
|
134
|
-
documents: list[Path | str | IOBase | HttpUrl] | None = None,
|
|
120
|
+
document: Path | str | IOBase | HttpUrl,
|
|
135
121
|
image_resolution_dpi: int = FieldUnset,
|
|
136
|
-
browser_canvas: BrowserCanvas = FieldUnset,
|
|
137
122
|
temperature: float = FieldUnset,
|
|
138
123
|
reasoning_effort: ChatCompletionReasoningEffort = FieldUnset,
|
|
139
124
|
n_consensus: int = FieldUnset,
|
|
140
125
|
stream: bool = FieldUnset,
|
|
141
126
|
store: bool = FieldUnset,
|
|
142
|
-
|
|
127
|
+
metadata: dict[str, str] = FieldUnset,
|
|
143
128
|
**extra_body: Any,
|
|
144
129
|
) -> PreparedRequest:
|
|
145
130
|
loaded_schema = load_json_schema(json_schema)
|
|
146
131
|
|
|
147
|
-
# Handle
|
|
148
|
-
if document is
|
|
149
|
-
raise ValueError("
|
|
132
|
+
# Handle document parameter
|
|
133
|
+
if document is None:
|
|
134
|
+
raise ValueError("Must provide 'document' parameter.")
|
|
150
135
|
|
|
151
|
-
|
|
152
|
-
if document is not None:
|
|
153
|
-
processed_documents = [prepare_mime_document(document)]
|
|
154
|
-
elif documents is not None:
|
|
155
|
-
processed_documents = [prepare_mime_document(doc) for doc in documents]
|
|
156
|
-
else:
|
|
157
|
-
raise ValueError("Must provide either 'document' or 'documents' parameter.")
|
|
136
|
+
processed_document = prepare_mime_document(document)
|
|
158
137
|
|
|
159
138
|
# Build request dictionary with only provided fields
|
|
160
|
-
request_dict = {
|
|
139
|
+
request_dict: dict[str, Any] = {
|
|
161
140
|
"json_schema": loaded_schema,
|
|
162
|
-
"
|
|
141
|
+
"document": processed_document,
|
|
163
142
|
"model": model,
|
|
164
143
|
}
|
|
165
144
|
if stream is not FieldUnset:
|
|
@@ -174,8 +153,8 @@ class BaseDocumentsMixin:
|
|
|
174
153
|
request_dict["n_consensus"] = n_consensus
|
|
175
154
|
if image_resolution_dpi is not FieldUnset:
|
|
176
155
|
request_dict["image_resolution_dpi"] = image_resolution_dpi
|
|
177
|
-
if
|
|
178
|
-
request_dict["
|
|
156
|
+
if metadata is not FieldUnset:
|
|
157
|
+
request_dict["metadata"] = metadata
|
|
179
158
|
|
|
180
159
|
# Merge any extra fields provided by the caller
|
|
181
160
|
if extra_body:
|
|
@@ -186,7 +165,7 @@ class BaseDocumentsMixin:
|
|
|
186
165
|
|
|
187
166
|
# Use the same URL as extractions.py for consistency when streaming
|
|
188
167
|
url = "/v1/documents/extractions" if stream else "/v1/documents/extract"
|
|
189
|
-
return PreparedRequest(method="POST", url=url, data=extract_request.model_dump(mode="json", exclude_unset=True, exclude_defaults=True)
|
|
168
|
+
return PreparedRequest(method="POST", url=url, data=extract_request.model_dump(mode="json", exclude_unset=True, exclude_defaults=True))
|
|
190
169
|
|
|
191
170
|
|
|
192
171
|
class Documents(SyncAPIResource, BaseDocumentsMixin):
|
|
@@ -199,8 +178,6 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
|
|
|
199
178
|
self,
|
|
200
179
|
document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
|
|
201
180
|
image_resolution_dpi: int = FieldUnset,
|
|
202
|
-
browser_canvas: BrowserCanvas = FieldUnset,
|
|
203
|
-
idempotency_key: str | None = None,
|
|
204
181
|
**extra_body: Any,
|
|
205
182
|
) -> DocumentMessage:
|
|
206
183
|
"""
|
|
@@ -209,8 +186,6 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
|
|
|
209
186
|
Args:
|
|
210
187
|
document: The document to process. Can be a file path (Path or str) or a file-like object.
|
|
211
188
|
image_resolution_dpi: Optional image resolution DPI.
|
|
212
|
-
browser_canvas: Optional browser canvas size.
|
|
213
|
-
idempotency_key: Optional idempotency key for the request
|
|
214
189
|
Returns:
|
|
215
190
|
DocumentMessage: The processed document message containing extracted content.
|
|
216
191
|
|
|
@@ -220,8 +195,6 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
|
|
|
220
195
|
request = self._prepare_create_messages(
|
|
221
196
|
document=document,
|
|
222
197
|
image_resolution_dpi=image_resolution_dpi,
|
|
223
|
-
browser_canvas=browser_canvas,
|
|
224
|
-
idempotency_key=idempotency_key,
|
|
225
198
|
**extra_body,
|
|
226
199
|
)
|
|
227
200
|
response = self._client._prepared_request(request)
|
|
@@ -232,8 +205,6 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
|
|
|
232
205
|
document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
|
|
233
206
|
json_schema: dict[str, Any] | Path | str,
|
|
234
207
|
image_resolution_dpi: int = FieldUnset,
|
|
235
|
-
browser_canvas: BrowserCanvas = FieldUnset,
|
|
236
|
-
idempotency_key: str | None = None,
|
|
237
208
|
**extra_body: Any,
|
|
238
209
|
) -> DocumentMessage:
|
|
239
210
|
"""
|
|
@@ -243,8 +214,6 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
|
|
|
243
214
|
document: The document to process. Can be a file path (Path or str), file-like object, MIMEData, PIL Image, or URL.
|
|
244
215
|
json_schema: The JSON schema to use for structuring the document content.
|
|
245
216
|
image_resolution_dpi: Optional image resolution DPI.
|
|
246
|
-
browser_canvas: Optional browser canvas size.
|
|
247
|
-
idempotency_key: Optional idempotency key for the request
|
|
248
217
|
Returns:
|
|
249
218
|
DocumentMessage: The processed document message containing extracted content with schema context.
|
|
250
219
|
|
|
@@ -255,8 +224,6 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
|
|
|
255
224
|
document=document,
|
|
256
225
|
json_schema=json_schema,
|
|
257
226
|
image_resolution_dpi=image_resolution_dpi,
|
|
258
|
-
browser_canvas=browser_canvas,
|
|
259
|
-
idempotency_key=idempotency_key,
|
|
260
227
|
**extra_body,
|
|
261
228
|
)
|
|
262
229
|
response = self._client._prepared_request(request)
|
|
@@ -266,55 +233,48 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
|
|
|
266
233
|
self,
|
|
267
234
|
json_schema: dict[str, Any] | Path | str,
|
|
268
235
|
model: str,
|
|
269
|
-
document: Path | str | IOBase | HttpUrl
|
|
270
|
-
documents: list[Path | str | IOBase | HttpUrl] | None = None,
|
|
236
|
+
document: Path | str | IOBase | HttpUrl,
|
|
271
237
|
image_resolution_dpi: int = FieldUnset,
|
|
272
|
-
browser_canvas: BrowserCanvas = FieldUnset,
|
|
273
238
|
temperature: float = FieldUnset,
|
|
274
239
|
reasoning_effort: ChatCompletionReasoningEffort = FieldUnset,
|
|
275
240
|
n_consensus: int = FieldUnset,
|
|
276
|
-
idempotency_key: str | None = None,
|
|
277
241
|
store: bool = FieldUnset,
|
|
242
|
+
metadata: dict[str, str] = FieldUnset,
|
|
278
243
|
**extra_body: Any,
|
|
279
244
|
) -> RetabParsedChatCompletion:
|
|
280
245
|
"""
|
|
281
|
-
Process
|
|
246
|
+
Process a document using the Retab API for structured data extraction.
|
|
282
247
|
|
|
283
|
-
This method provides a direct interface to document extraction functionality
|
|
284
|
-
intended to replace the current `.extractions.parse()` pattern.
|
|
248
|
+
This method provides a direct interface to document extraction functionality.
|
|
285
249
|
|
|
286
250
|
Args:
|
|
287
251
|
json_schema: JSON schema defining the expected data structure
|
|
288
252
|
model: The AI model to use for processing
|
|
289
|
-
document:
|
|
290
|
-
documents: List of documents to process (use either this or document, not both)
|
|
253
|
+
document: Document to process (file path, URL, or file-like object)
|
|
291
254
|
image_resolution_dpi: Optional image resolution DPI
|
|
292
|
-
browser_canvas: Optional browser canvas size
|
|
293
255
|
temperature: Model temperature setting (0-1)
|
|
294
256
|
reasoning_effort: The effort level for the model to reason about the input data
|
|
295
257
|
n_consensus: Number of consensus extractions to perform
|
|
296
|
-
idempotency_key: Idempotency key for request
|
|
297
258
|
store: Whether to store the document in the Retab database
|
|
259
|
+
metadata: User-defined metadata to associate with this extraction
|
|
298
260
|
|
|
299
261
|
Returns:
|
|
300
262
|
RetabParsedChatCompletion: Parsed response from the API
|
|
301
263
|
|
|
302
264
|
Raises:
|
|
303
|
-
ValueError: If
|
|
265
|
+
ValueError: If document is not provided
|
|
304
266
|
HTTPException: If the request fails
|
|
305
267
|
"""
|
|
306
268
|
request = self._prepare_extract(
|
|
307
269
|
json_schema=json_schema,
|
|
308
270
|
model=model,
|
|
309
271
|
document=document,
|
|
310
|
-
documents=documents,
|
|
311
272
|
image_resolution_dpi=image_resolution_dpi,
|
|
312
|
-
browser_canvas=browser_canvas,
|
|
313
273
|
temperature=temperature,
|
|
314
274
|
reasoning_effort=reasoning_effort,
|
|
315
275
|
n_consensus=n_consensus,
|
|
316
276
|
store=store,
|
|
317
|
-
|
|
277
|
+
metadata=metadata,
|
|
318
278
|
**extra_body,
|
|
319
279
|
)
|
|
320
280
|
response = self._client._prepared_request(request)
|
|
@@ -414,64 +374,52 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
|
|
|
414
374
|
self,
|
|
415
375
|
json_schema: dict[str, Any] | Path | str,
|
|
416
376
|
model: str,
|
|
417
|
-
document: Path | str | IOBase | HttpUrl
|
|
418
|
-
documents: list[Path | str | IOBase | HttpUrl] | None = None,
|
|
377
|
+
document: Path | str | IOBase | HttpUrl,
|
|
419
378
|
image_resolution_dpi: int = FieldUnset,
|
|
420
|
-
browser_canvas: BrowserCanvas = FieldUnset,
|
|
421
379
|
temperature: float = FieldUnset,
|
|
422
380
|
reasoning_effort: ChatCompletionReasoningEffort = FieldUnset,
|
|
423
381
|
n_consensus: int = FieldUnset,
|
|
424
|
-
idempotency_key: str | None = None,
|
|
425
382
|
store: bool = FieldUnset,
|
|
383
|
+
metadata: dict[str, str] = FieldUnset,
|
|
426
384
|
**extra_body: Any,
|
|
427
385
|
) -> Generator[RetabParsedChatCompletion, None, None]:
|
|
428
386
|
"""
|
|
429
|
-
Process
|
|
387
|
+
Process a document using the Retab API with streaming enabled.
|
|
430
388
|
|
|
431
389
|
Args:
|
|
432
390
|
json_schema: JSON schema defining the expected data structure
|
|
433
391
|
model: The AI model to use for processing
|
|
434
|
-
document:
|
|
435
|
-
documents: List of documents to process (use either this or document, not both)
|
|
392
|
+
document: Document to process (file path, URL, or file-like object)
|
|
436
393
|
image_resolution_dpi: Optional image resolution DPI.
|
|
437
|
-
browser_canvas: Optional browser canvas size.
|
|
438
394
|
temperature: Model temperature setting (0-1)
|
|
439
395
|
reasoning_effort: The effort level for the model to reason about the input data.
|
|
440
396
|
n_consensus: Number of consensus extractions to perform (default: 1 which computes a single extraction and the likelihoods comes from the model logprobs)
|
|
441
|
-
idempotency_key: Idempotency key for request
|
|
442
397
|
store: Whether to store the document in the Retab database
|
|
398
|
+
metadata: User-defined metadata to associate with this extraction
|
|
443
399
|
|
|
444
400
|
Returns:
|
|
445
401
|
Generator[RetabParsedChatCompletion]: Stream of parsed responses
|
|
446
402
|
Raises:
|
|
447
|
-
ValueError: If
|
|
403
|
+
ValueError: If document is not provided
|
|
448
404
|
HTTPException: If the request fails
|
|
449
405
|
Usage:
|
|
450
406
|
```python
|
|
451
|
-
# Single document
|
|
452
407
|
with retab.documents.extract_stream(json_schema, model, document=document) as stream:
|
|
453
408
|
for response in stream:
|
|
454
409
|
print(response)
|
|
455
|
-
|
|
456
|
-
# Multiple documents
|
|
457
|
-
with retab.documents.extract_stream(json_schema, model, documents=[doc1, doc2]) as stream:
|
|
458
|
-
for response in stream:
|
|
459
|
-
print(response)
|
|
460
410
|
```
|
|
461
411
|
"""
|
|
462
412
|
request = self._prepare_extract(
|
|
463
413
|
json_schema=json_schema,
|
|
464
414
|
document=document,
|
|
465
|
-
documents=documents,
|
|
466
415
|
image_resolution_dpi=image_resolution_dpi,
|
|
467
|
-
browser_canvas=browser_canvas,
|
|
468
416
|
model=model,
|
|
469
417
|
temperature=temperature,
|
|
470
418
|
reasoning_effort=reasoning_effort,
|
|
471
419
|
stream=True,
|
|
472
420
|
n_consensus=n_consensus,
|
|
473
421
|
store=store,
|
|
474
|
-
|
|
422
|
+
metadata=metadata,
|
|
475
423
|
**extra_body,
|
|
476
424
|
)
|
|
477
425
|
schema = load_json_schema(json_schema)
|
|
@@ -521,8 +469,6 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
|
|
|
521
469
|
model: str,
|
|
522
470
|
table_parsing_format: TableParsingFormat = FieldUnset,
|
|
523
471
|
image_resolution_dpi: int = FieldUnset,
|
|
524
|
-
browser_canvas: BrowserCanvas = FieldUnset,
|
|
525
|
-
idempotency_key: str | None = None,
|
|
526
472
|
**extra_body: Any,
|
|
527
473
|
) -> ParseResult:
|
|
528
474
|
"""
|
|
@@ -536,8 +482,6 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
|
|
|
536
482
|
model: The AI model to use for document parsing.
|
|
537
483
|
table_parsing_format: Format for parsing tables. Options: "html", "json", "yaml", "markdown". Defaults to "html".
|
|
538
484
|
image_resolution_dpi: DPI for image processing. Defaults to 72.
|
|
539
|
-
browser_canvas: Canvas size for document rendering. Defaults to "A4".
|
|
540
|
-
idempotency_key: Optional idempotency key for the request.
|
|
541
485
|
|
|
542
486
|
Returns:
|
|
543
487
|
ParseResult: Parsed response containing document metadata, usage information, and page text content.
|
|
@@ -550,8 +494,6 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
|
|
|
550
494
|
model=model,
|
|
551
495
|
table_parsing_format=table_parsing_format,
|
|
552
496
|
image_resolution_dpi=image_resolution_dpi,
|
|
553
|
-
browser_canvas=browser_canvas,
|
|
554
|
-
idempotency_key=idempotency_key,
|
|
555
497
|
**extra_body,
|
|
556
498
|
)
|
|
557
499
|
response = self._client._prepared_request(request)
|
|
@@ -569,8 +511,6 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
|
|
|
569
511
|
self,
|
|
570
512
|
document: Path | str | IOBase | MIMEData | PIL.Image.Image,
|
|
571
513
|
image_resolution_dpi: int = FieldUnset,
|
|
572
|
-
browser_canvas: BrowserCanvas = FieldUnset,
|
|
573
|
-
idempotency_key: str | None = None,
|
|
574
514
|
**extra_body: Any,
|
|
575
515
|
) -> DocumentMessage:
|
|
576
516
|
"""
|
|
@@ -578,7 +518,6 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
|
|
|
578
518
|
|
|
579
519
|
Args:
|
|
580
520
|
document: The document to process. Can be a file path (Path or str) or a file-like object.
|
|
581
|
-
idempotency_key: Idempotency key for request
|
|
582
521
|
Returns:
|
|
583
522
|
DocumentMessage: The processed document message containing extracted content.
|
|
584
523
|
|
|
@@ -588,8 +527,6 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
|
|
|
588
527
|
request = self._prepare_create_messages(
|
|
589
528
|
document=document,
|
|
590
529
|
image_resolution_dpi=image_resolution_dpi,
|
|
591
|
-
browser_canvas=browser_canvas,
|
|
592
|
-
idempotency_key=idempotency_key,
|
|
593
530
|
**extra_body,
|
|
594
531
|
)
|
|
595
532
|
response = await self._client._prepared_request(request)
|
|
@@ -600,8 +537,6 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
|
|
|
600
537
|
document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
|
|
601
538
|
json_schema: dict[str, Any] | Path | str,
|
|
602
539
|
image_resolution_dpi: int = FieldUnset,
|
|
603
|
-
browser_canvas: BrowserCanvas = FieldUnset,
|
|
604
|
-
idempotency_key: str | None = None,
|
|
605
540
|
**extra_body: Any,
|
|
606
541
|
) -> DocumentMessage:
|
|
607
542
|
"""
|
|
@@ -611,8 +546,6 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
|
|
|
611
546
|
document: The document to process. Can be a file path (Path or str), file-like object, MIMEData, PIL Image, or URL.
|
|
612
547
|
json_schema: The JSON schema to use for structuring the document content.
|
|
613
548
|
image_resolution_dpi: Optional image resolution DPI.
|
|
614
|
-
browser_canvas: Optional browser canvas size.
|
|
615
|
-
idempotency_key: Idempotency key for request
|
|
616
549
|
Returns:
|
|
617
550
|
DocumentMessage: The processed document message containing extracted content with schema context.
|
|
618
551
|
|
|
@@ -623,8 +556,6 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
|
|
|
623
556
|
document=document,
|
|
624
557
|
json_schema=json_schema,
|
|
625
558
|
image_resolution_dpi=image_resolution_dpi,
|
|
626
|
-
browser_canvas=browser_canvas,
|
|
627
|
-
idempotency_key=idempotency_key,
|
|
628
559
|
**extra_body,
|
|
629
560
|
)
|
|
630
561
|
response = await self._client._prepared_request(request)
|
|
@@ -634,55 +565,48 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
|
|
|
634
565
|
self,
|
|
635
566
|
json_schema: dict[str, Any] | Path | str,
|
|
636
567
|
model: str,
|
|
637
|
-
document: Path | str | IOBase | HttpUrl
|
|
638
|
-
documents: list[Path | str | IOBase | HttpUrl] | None = None,
|
|
568
|
+
document: Path | str | IOBase | HttpUrl,
|
|
639
569
|
image_resolution_dpi: int = FieldUnset,
|
|
640
|
-
browser_canvas: BrowserCanvas = FieldUnset,
|
|
641
570
|
temperature: float = FieldUnset,
|
|
642
571
|
reasoning_effort: ChatCompletionReasoningEffort = FieldUnset,
|
|
643
572
|
n_consensus: int = FieldUnset,
|
|
644
|
-
idempotency_key: str | None = None,
|
|
645
573
|
store: bool = FieldUnset,
|
|
574
|
+
metadata: dict[str, str] = FieldUnset,
|
|
646
575
|
**extra_body: Any,
|
|
647
576
|
) -> RetabParsedChatCompletion:
|
|
648
577
|
"""
|
|
649
|
-
Process
|
|
578
|
+
Process a document using the Retab API for structured data extraction asynchronously.
|
|
650
579
|
|
|
651
|
-
This method provides a direct interface to document extraction functionality
|
|
652
|
-
intended to replace the current `.extractions.parse()` pattern.
|
|
580
|
+
This method provides a direct interface to document extraction functionality.
|
|
653
581
|
|
|
654
582
|
Args:
|
|
655
583
|
json_schema: JSON schema defining the expected data structure
|
|
656
584
|
model: The AI model to use for processing
|
|
657
|
-
document:
|
|
658
|
-
documents: List of documents to process (use either this or document, not both)
|
|
585
|
+
document: Document to process (file path, URL, or file-like object)
|
|
659
586
|
image_resolution_dpi: Optional image resolution DPI
|
|
660
|
-
browser_canvas: Optional browser canvas size
|
|
661
587
|
temperature: Model temperature setting (0-1)
|
|
662
588
|
reasoning_effort: The effort level for the model to reason about the input data
|
|
663
589
|
n_consensus: Number of consensus extractions to perform
|
|
664
|
-
idempotency_key: Idempotency key for request
|
|
665
590
|
store: Whether to store the document in the Retab database
|
|
591
|
+
metadata: User-defined metadata to associate with this extraction
|
|
666
592
|
|
|
667
593
|
Returns:
|
|
668
594
|
RetabParsedChatCompletion: Parsed response from the API
|
|
669
595
|
|
|
670
596
|
Raises:
|
|
671
|
-
ValueError: If
|
|
597
|
+
ValueError: If document is not provided
|
|
672
598
|
HTTPException: If the request fails
|
|
673
599
|
"""
|
|
674
600
|
request = self._prepare_extract(
|
|
675
601
|
json_schema=json_schema,
|
|
676
602
|
model=model,
|
|
677
603
|
document=document,
|
|
678
|
-
documents=documents,
|
|
679
604
|
image_resolution_dpi=image_resolution_dpi,
|
|
680
|
-
browser_canvas=browser_canvas,
|
|
681
605
|
temperature=temperature,
|
|
682
606
|
reasoning_effort=reasoning_effort,
|
|
683
607
|
n_consensus=n_consensus,
|
|
684
608
|
store=store,
|
|
685
|
-
|
|
609
|
+
metadata=metadata,
|
|
686
610
|
**extra_body,
|
|
687
611
|
)
|
|
688
612
|
response = await self._client._prepared_request(request)
|
|
@@ -694,63 +618,51 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
|
|
|
694
618
|
self,
|
|
695
619
|
json_schema: dict[str, Any] | Path | str,
|
|
696
620
|
model: str,
|
|
697
|
-
document: Path | str | IOBase | HttpUrl
|
|
698
|
-
documents: list[Path | str | IOBase | HttpUrl] | None = None,
|
|
621
|
+
document: Path | str | IOBase | HttpUrl,
|
|
699
622
|
image_resolution_dpi: int = FieldUnset,
|
|
700
|
-
browser_canvas: BrowserCanvas = FieldUnset,
|
|
701
623
|
temperature: float = FieldUnset,
|
|
702
624
|
reasoning_effort: ChatCompletionReasoningEffort = FieldUnset,
|
|
703
625
|
n_consensus: int = FieldUnset,
|
|
704
|
-
idempotency_key: str | None = None,
|
|
705
626
|
store: bool = FieldUnset,
|
|
627
|
+
metadata: dict[str, str] = FieldUnset,
|
|
706
628
|
**extra_body: Any,
|
|
707
629
|
) -> AsyncGenerator[RetabParsedChatCompletion, None]:
|
|
708
630
|
"""
|
|
709
|
-
Extract structured data from
|
|
631
|
+
Extract structured data from a document asynchronously with streaming.
|
|
710
632
|
|
|
711
633
|
Args:
|
|
712
634
|
json_schema: JSON schema defining the expected data structure.
|
|
713
635
|
model: The AI model to use.
|
|
714
|
-
document:
|
|
715
|
-
documents: List of documents to process (use either this or document, not both)
|
|
636
|
+
document: Document to process (file path, URL, or file-like object)
|
|
716
637
|
image_resolution_dpi: Optional image resolution DPI.
|
|
717
|
-
browser_canvas: Optional browser canvas size.
|
|
718
638
|
temperature: Model temperature setting (0-1).
|
|
719
639
|
reasoning_effort: The effort level for the model to reason about the input data.
|
|
720
640
|
n_consensus: Number of consensus extractions to perform (default: 1 which computes a single extraction and the likelihoods comes from the model logprobs)
|
|
721
|
-
idempotency_key: Idempotency key for request
|
|
722
641
|
store: Whether to store the document in the Retab database
|
|
642
|
+
metadata: User-defined metadata to associate with this extraction
|
|
723
643
|
Returns:
|
|
724
644
|
AsyncGenerator[RetabParsedChatCompletion, None]: Stream of parsed responses.
|
|
725
645
|
Raises:
|
|
726
|
-
ValueError: If
|
|
646
|
+
ValueError: If document is not provided
|
|
727
647
|
|
|
728
648
|
Usage:
|
|
729
649
|
```python
|
|
730
|
-
# Single document
|
|
731
650
|
async with retab.documents.extract_stream(json_schema, model, document=document) as stream:
|
|
732
651
|
async for response in stream:
|
|
733
652
|
print(response)
|
|
734
|
-
|
|
735
|
-
# Multiple documents
|
|
736
|
-
async with retab.documents.extract_stream(json_schema, model, documents=[doc1, doc2]) as stream:
|
|
737
|
-
async for response in stream:
|
|
738
|
-
print(response)
|
|
739
653
|
```
|
|
740
654
|
"""
|
|
741
655
|
request = self._prepare_extract(
|
|
742
656
|
json_schema=json_schema,
|
|
743
657
|
document=document,
|
|
744
|
-
documents=documents,
|
|
745
658
|
image_resolution_dpi=image_resolution_dpi,
|
|
746
|
-
browser_canvas=browser_canvas,
|
|
747
659
|
model=model,
|
|
748
660
|
temperature=temperature,
|
|
749
661
|
reasoning_effort=reasoning_effort,
|
|
750
662
|
stream=True,
|
|
751
663
|
n_consensus=n_consensus,
|
|
752
664
|
store=store,
|
|
753
|
-
|
|
665
|
+
metadata=metadata,
|
|
754
666
|
**extra_body,
|
|
755
667
|
)
|
|
756
668
|
schema = load_json_schema(json_schema)
|
|
@@ -800,8 +712,6 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
|
|
|
800
712
|
model: str,
|
|
801
713
|
table_parsing_format: TableParsingFormat = FieldUnset,
|
|
802
714
|
image_resolution_dpi: int = FieldUnset,
|
|
803
|
-
browser_canvas: BrowserCanvas = FieldUnset,
|
|
804
|
-
idempotency_key: str | None = None,
|
|
805
715
|
**extra_body: Any,
|
|
806
716
|
) -> ParseResult:
|
|
807
717
|
"""
|
|
@@ -815,8 +725,6 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
|
|
|
815
725
|
model: The AI model to use for document parsing.
|
|
816
726
|
table_parsing_format: Format for parsing tables. Options: "html", "json", "yaml", "markdown". Defaults to "html".
|
|
817
727
|
image_resolution_dpi: DPI for image processing. Defaults to 96.
|
|
818
|
-
browser_canvas: Canvas size for document rendering. Defaults to "A4".
|
|
819
|
-
idempotency_key: Optional idempotency key for the request.
|
|
820
728
|
|
|
821
729
|
Returns:
|
|
822
730
|
ParseResult: Parsed response containing document metadata, usage information, and page text content.
|
|
@@ -829,8 +737,6 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
|
|
|
829
737
|
model=model,
|
|
830
738
|
table_parsing_format=table_parsing_format,
|
|
831
739
|
image_resolution_dpi=image_resolution_dpi,
|
|
832
|
-
browser_canvas=browser_canvas,
|
|
833
|
-
idempotency_key=idempotency_key,
|
|
834
740
|
**extra_body,
|
|
835
741
|
)
|
|
836
742
|
response = await self._client._prepared_request(request)
|