retab 0.0.76__tar.gz → 0.0.78__tar.gz
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {retab-0.0.76 → retab-0.0.78}/PKG-INFO +1 -1
- {retab-0.0.76 → retab-0.0.78}/retab/resources/documents/client.py +167 -19
- retab-0.0.78/retab/types/documents/__init__.py +12 -0
- {retab-0.0.76 → retab-0.0.78}/retab/types/documents/create_messages.py +1 -1
- {retab-0.0.76 → retab-0.0.78}/retab/types/documents/edit.py +18 -5
- {retab-0.0.76 → retab-0.0.78}/retab/types/documents/extract.py +1 -1
- {retab-0.0.76 → retab-0.0.78}/retab/types/documents/parse.py +1 -1
- retab-0.0.78/retab/types/documents/split.py +32 -0
- {retab-0.0.76 → retab-0.0.78}/retab/types/inference_settings.py +1 -1
- {retab-0.0.76 → retab-0.0.78}/retab/types/projects/model.py +1 -1
- {retab-0.0.76 → retab-0.0.78}/retab/types/schemas/model.py +81 -1
- {retab-0.0.76 → retab-0.0.78}/retab/utils/json_schema.py +2 -2
- {retab-0.0.76 → retab-0.0.78}/retab.egg-info/PKG-INFO +1 -1
- {retab-0.0.76 → retab-0.0.78}/retab.egg-info/SOURCES.txt +1 -2
- {retab-0.0.76 → retab-0.0.78}/setup.py +1 -1
- retab-0.0.76/retab/types/documents/__init__.py +0 -3
- retab-0.0.76/retab/utils/usage/__init__.py +0 -0
- retab-0.0.76/retab/utils/usage/json_schema.py +0 -2197
- {retab-0.0.76 → retab-0.0.78}/README.md +0 -0
- {retab-0.0.76 → retab-0.0.78}/pyproject.toml +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/__init__.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/_resource.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/client.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/generate_types.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/py.typed +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/resources/__init__.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/resources/documents/__init__.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/resources/extractions/__init__.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/resources/extractions/client.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/resources/models.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/resources/projects/__init__.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/resources/projects/client.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/resources/schemas.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/types/__init__.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/types/chat.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/types/documents/correct_orientation.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/types/extractions/__init__.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/types/extractions/types.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/types/mime.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/types/modality.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/types/pagination.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/types/projects/__init__.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/types/projects/metrics.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/types/projects/predictions.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/types/schemas/__init__.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/types/schemas/chat.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/types/schemas/generate.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/types/schemas/layout.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/types/schemas/templates.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/types/standards.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/utils/__init__.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/utils/display.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/utils/hashing.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/utils/mime.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab/utils/stream_context_managers.py +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab.egg-info/dependency_links.txt +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab.egg-info/requires.txt +0 -0
- {retab-0.0.76 → retab-0.0.78}/retab.egg-info/top_level.txt +0 -0
- {retab-0.0.76 → retab-0.0.78}/setup.cfg +0 -0
- {retab-0.0.76 → retab-0.0.78}/tests/test_projects.py +0 -0
|
@@ -16,6 +16,7 @@ from ...types.chat import ChatCompletionRetabMessage
|
|
|
16
16
|
from ...types.documents.edit import EditRequest, EditResponse
|
|
17
17
|
from ...types.documents.extract import DocumentExtractRequest, RetabParsedChatCompletion, RetabParsedChatCompletionChunk, RetabParsedChoice, maybe_parse_to_pydantic
|
|
18
18
|
from ...types.documents.parse import ParseRequest, ParseResult, TableParsingFormat
|
|
19
|
+
from ...types.documents.split import Category, SplitRequest, SplitResponse
|
|
19
20
|
from ...types.mime import MIMEData
|
|
20
21
|
from ...types.standards import PreparedRequest, FieldUnset
|
|
21
22
|
from ...utils.json_schema import load_json_schema, unflatten_dict
|
|
@@ -117,19 +118,24 @@ class BaseDocumentsMixin:
|
|
|
117
118
|
|
|
118
119
|
def _prepare_edit(
|
|
119
120
|
self,
|
|
120
|
-
document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
|
|
121
121
|
filling_instructions: str,
|
|
122
|
+
document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl | None = None,
|
|
122
123
|
model: str = FieldUnset,
|
|
124
|
+
template_id: str | None = FieldUnset,
|
|
123
125
|
**extra_body: Any,
|
|
124
126
|
) -> PreparedRequest:
|
|
125
|
-
mime_document = prepare_mime_document(document)
|
|
126
|
-
|
|
127
127
|
request_dict: dict[str, Any] = {
|
|
128
|
-
"document": mime_document,
|
|
129
128
|
"filling_instructions": filling_instructions,
|
|
130
129
|
}
|
|
130
|
+
|
|
131
|
+
if document is not None:
|
|
132
|
+
mime_document = prepare_mime_document(document)
|
|
133
|
+
request_dict["document"] = mime_document
|
|
134
|
+
|
|
131
135
|
if model is not FieldUnset:
|
|
132
136
|
request_dict["model"] = model
|
|
137
|
+
if template_id is not FieldUnset:
|
|
138
|
+
request_dict["template_id"] = template_id
|
|
133
139
|
|
|
134
140
|
# Merge any extra fields provided by the caller
|
|
135
141
|
if extra_body:
|
|
@@ -138,11 +144,39 @@ class BaseDocumentsMixin:
|
|
|
138
144
|
edit_request = EditRequest(**request_dict)
|
|
139
145
|
return PreparedRequest(method="POST", url="/v1/documents/edit", data=edit_request.model_dump(mode="json", exclude_unset=True))
|
|
140
146
|
|
|
147
|
+
def _prepare_split(
|
|
148
|
+
self,
|
|
149
|
+
document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
|
|
150
|
+
categories: list[Category] | list[dict[str, str]],
|
|
151
|
+
model: str,
|
|
152
|
+
**extra_body: Any,
|
|
153
|
+
) -> PreparedRequest:
|
|
154
|
+
mime_document = prepare_mime_document(document)
|
|
155
|
+
|
|
156
|
+
# Convert dict categories to Category objects if needed
|
|
157
|
+
category_objects = [
|
|
158
|
+
Category(**cat) if isinstance(cat, dict) else cat
|
|
159
|
+
for cat in categories
|
|
160
|
+
]
|
|
161
|
+
|
|
162
|
+
request_dict: dict[str, Any] = {
|
|
163
|
+
"document": mime_document,
|
|
164
|
+
"categories": category_objects,
|
|
165
|
+
"model": model,
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
# Merge any extra fields provided by the caller
|
|
169
|
+
if extra_body:
|
|
170
|
+
request_dict.update(extra_body)
|
|
171
|
+
|
|
172
|
+
split_request = SplitRequest(**request_dict)
|
|
173
|
+
return PreparedRequest(method="POST", url="/v1/documents/split", data=split_request.model_dump(mode="json", exclude_unset=True))
|
|
174
|
+
|
|
141
175
|
def _prepare_extract(
|
|
142
176
|
self,
|
|
143
177
|
json_schema: dict[str, Any] | Path | str,
|
|
144
178
|
model: str,
|
|
145
|
-
document: Path | str | IOBase | HttpUrl,
|
|
179
|
+
document: Path | str | IOBase | HttpUrl | MIMEData,
|
|
146
180
|
image_resolution_dpi: int = FieldUnset,
|
|
147
181
|
temperature: float = FieldUnset,
|
|
148
182
|
reasoning_effort: ChatCompletionReasoningEffort = FieldUnset,
|
|
@@ -261,7 +295,7 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
|
|
|
261
295
|
self,
|
|
262
296
|
json_schema: dict[str, Any] | Path | str,
|
|
263
297
|
model: str,
|
|
264
|
-
document: Path | str | IOBase | HttpUrl,
|
|
298
|
+
document: Path | str | IOBase | HttpUrl | MIMEData,
|
|
265
299
|
image_resolution_dpi: int = FieldUnset,
|
|
266
300
|
temperature: float = FieldUnset,
|
|
267
301
|
reasoning_effort: ChatCompletionReasoningEffort = FieldUnset,
|
|
@@ -279,7 +313,7 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
|
|
|
279
313
|
Args:
|
|
280
314
|
json_schema: JSON schema defining the expected data structure
|
|
281
315
|
model: The AI model to use for processing
|
|
282
|
-
document: Document to process (file path, URL,
|
|
316
|
+
document: Document to process (file path, URL, file-like object, or MIMEData)
|
|
283
317
|
image_resolution_dpi: Optional image resolution DPI
|
|
284
318
|
temperature: Model temperature setting (0-1)
|
|
285
319
|
reasoning_effort: The effort level for the model to reason about the input data
|
|
@@ -405,7 +439,7 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
|
|
|
405
439
|
self,
|
|
406
440
|
json_schema: dict[str, Any] | Path | str,
|
|
407
441
|
model: str,
|
|
408
|
-
document: Path | str | IOBase | HttpUrl,
|
|
442
|
+
document: Path | str | IOBase | HttpUrl | MIMEData,
|
|
409
443
|
image_resolution_dpi: int = FieldUnset,
|
|
410
444
|
temperature: float = FieldUnset,
|
|
411
445
|
reasoning_effort: ChatCompletionReasoningEffort = FieldUnset,
|
|
@@ -535,9 +569,10 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
|
|
|
535
569
|
|
|
536
570
|
def edit(
|
|
537
571
|
self,
|
|
538
|
-
document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
|
|
539
572
|
filling_instructions: str,
|
|
573
|
+
document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl | None = None,
|
|
540
574
|
model: str = FieldUnset,
|
|
575
|
+
template_id: str | None = FieldUnset,
|
|
541
576
|
**extra_body: Any,
|
|
542
577
|
) -> EditResponse:
|
|
543
578
|
"""
|
|
@@ -549,10 +584,15 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
|
|
|
549
584
|
3. LLM-based form filling using the provided instructions
|
|
550
585
|
4. Returns the filled PDF with form field values populated
|
|
551
586
|
|
|
587
|
+
Either `document` OR `template_id` must be provided, but not both.
|
|
588
|
+
|
|
552
589
|
Args:
|
|
553
|
-
document: The document to edit. Can be a file path (Path or str), file-like object, MIMEData, PIL Image, or URL.
|
|
554
590
|
filling_instructions: Instructions describing how to fill the form fields.
|
|
555
|
-
|
|
591
|
+
document: The document to edit. Can be a file path (Path or str), file-like object, MIMEData, PIL Image, or URL.
|
|
592
|
+
Mutually exclusive with template_id.
|
|
593
|
+
model: The LLM model to use for inference. Defaults to "retab-small".
|
|
594
|
+
template_id: Template ID to use for filling. When provided, uses the template's pre-defined form fields
|
|
595
|
+
and empty PDF. Only works for PDF documents. Mutually exclusive with document.
|
|
556
596
|
|
|
557
597
|
Returns:
|
|
558
598
|
EditResponse: Response containing:
|
|
@@ -563,14 +603,65 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
|
|
|
563
603
|
HTTPException: If the request fails.
|
|
564
604
|
"""
|
|
565
605
|
request = self._prepare_edit(
|
|
566
|
-
document=document,
|
|
567
606
|
filling_instructions=filling_instructions,
|
|
607
|
+
document=document,
|
|
568
608
|
model=model,
|
|
609
|
+
template_id=template_id,
|
|
569
610
|
**extra_body,
|
|
570
611
|
)
|
|
571
612
|
response = self._client._prepared_request(request)
|
|
572
613
|
return EditResponse.model_validate(response)
|
|
573
614
|
|
|
615
|
+
def split(
|
|
616
|
+
self,
|
|
617
|
+
document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
|
|
618
|
+
categories: list[Category] | list[dict[str, str]],
|
|
619
|
+
model: str,
|
|
620
|
+
**extra_body: Any,
|
|
621
|
+
) -> SplitResponse:
|
|
622
|
+
"""
|
|
623
|
+
Split a document into sections based on provided categories.
|
|
624
|
+
|
|
625
|
+
This method analyzes a multi-page document and classifies pages into
|
|
626
|
+
user-defined categories, returning the page ranges for each section.
|
|
627
|
+
|
|
628
|
+
Args:
|
|
629
|
+
document: The document to split. Can be a file path (Path or str), file-like object, MIMEData, PIL Image, or URL.
|
|
630
|
+
categories: List of categories to split the document into. Each category should have a 'name' and 'description'.
|
|
631
|
+
Can be Category objects or dicts with 'name' and 'description' keys.
|
|
632
|
+
model: The AI model to use for document splitting (e.g., "gemini-2.5-flash").
|
|
633
|
+
|
|
634
|
+
Returns:
|
|
635
|
+
SplitResponse: Response containing:
|
|
636
|
+
- splits: List of SplitResult objects with name, start_page, and end_page for each section.
|
|
637
|
+
|
|
638
|
+
Raises:
|
|
639
|
+
HTTPException: If the request fails.
|
|
640
|
+
|
|
641
|
+
Example:
|
|
642
|
+
```python
|
|
643
|
+
response = retab.documents.split(
|
|
644
|
+
document="invoice_batch.pdf",
|
|
645
|
+
model="gemini-2.5-flash",
|
|
646
|
+
categories=[
|
|
647
|
+
{"name": "invoice", "description": "Invoice documents with billing information"},
|
|
648
|
+
{"name": "receipt", "description": "Receipt documents for payments"},
|
|
649
|
+
{"name": "contract", "description": "Legal contract documents"},
|
|
650
|
+
]
|
|
651
|
+
)
|
|
652
|
+
for split in response.splits:
|
|
653
|
+
print(f"{split.name}: pages {split.start_page}-{split.end_page}")
|
|
654
|
+
```
|
|
655
|
+
"""
|
|
656
|
+
request = self._prepare_split(
|
|
657
|
+
document=document,
|
|
658
|
+
categories=categories,
|
|
659
|
+
model=model,
|
|
660
|
+
**extra_body,
|
|
661
|
+
)
|
|
662
|
+
response = self._client._prepared_request(request)
|
|
663
|
+
return SplitResponse.model_validate(response)
|
|
664
|
+
|
|
574
665
|
|
|
575
666
|
class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
|
|
576
667
|
"""Documents API wrapper for asynchronous usage."""
|
|
@@ -637,7 +728,7 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
|
|
|
637
728
|
self,
|
|
638
729
|
json_schema: dict[str, Any] | Path | str,
|
|
639
730
|
model: str,
|
|
640
|
-
document: Path | str | IOBase | HttpUrl,
|
|
731
|
+
document: Path | str | IOBase | HttpUrl | MIMEData,
|
|
641
732
|
image_resolution_dpi: int = FieldUnset,
|
|
642
733
|
temperature: float = FieldUnset,
|
|
643
734
|
reasoning_effort: ChatCompletionReasoningEffort = FieldUnset,
|
|
@@ -655,7 +746,7 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
|
|
|
655
746
|
Args:
|
|
656
747
|
json_schema: JSON schema defining the expected data structure
|
|
657
748
|
model: The AI model to use for processing
|
|
658
|
-
document: Document to process (file path, URL,
|
|
749
|
+
document: Document to process (file path, URL, file-like object, or MIMEData)
|
|
659
750
|
image_resolution_dpi: Optional image resolution DPI
|
|
660
751
|
temperature: Model temperature setting (0-1)
|
|
661
752
|
reasoning_effort: The effort level for the model to reason about the input data
|
|
@@ -693,7 +784,7 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
|
|
|
693
784
|
self,
|
|
694
785
|
json_schema: dict[str, Any] | Path | str,
|
|
695
786
|
model: str,
|
|
696
|
-
document: Path | str | IOBase | HttpUrl,
|
|
787
|
+
document: Path | str | IOBase | HttpUrl | MIMEData,
|
|
697
788
|
image_resolution_dpi: int = FieldUnset,
|
|
698
789
|
temperature: float = FieldUnset,
|
|
699
790
|
reasoning_effort: ChatCompletionReasoningEffort = FieldUnset,
|
|
@@ -709,7 +800,7 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
|
|
|
709
800
|
Args:
|
|
710
801
|
json_schema: JSON schema defining the expected data structure.
|
|
711
802
|
model: The AI model to use.
|
|
712
|
-
document: Document to process (file path, URL,
|
|
803
|
+
document: Document to process (file path, URL, file-like object, or MIMEData)
|
|
713
804
|
image_resolution_dpi: Optional image resolution DPI.
|
|
714
805
|
temperature: Model temperature setting (0-1).
|
|
715
806
|
reasoning_effort: The effort level for the model to reason about the input data.
|
|
@@ -822,9 +913,10 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
|
|
|
822
913
|
|
|
823
914
|
async def edit(
|
|
824
915
|
self,
|
|
825
|
-
document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
|
|
826
916
|
filling_instructions: str,
|
|
917
|
+
document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl | None = None,
|
|
827
918
|
model: str = FieldUnset,
|
|
919
|
+
template_id: str | None = FieldUnset,
|
|
828
920
|
**extra_body: Any,
|
|
829
921
|
) -> EditResponse:
|
|
830
922
|
"""
|
|
@@ -836,10 +928,15 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
|
|
|
836
928
|
3. LLM-based form filling using the provided instructions
|
|
837
929
|
4. Returns the filled PDF with form field values populated
|
|
838
930
|
|
|
931
|
+
Either `document` OR `template_id` must be provided, but not both.
|
|
932
|
+
|
|
839
933
|
Args:
|
|
840
|
-
document: The document to edit. Can be a file path (Path or str), file-like object, MIMEData, PIL Image, or URL.
|
|
841
934
|
filling_instructions: Instructions describing how to fill the form fields.
|
|
935
|
+
document: The document to edit. Can be a file path (Path or str), file-like object, MIMEData, PIL Image, or URL.
|
|
936
|
+
Mutually exclusive with template_id.
|
|
842
937
|
model: The LLM model to use for inference. Defaults to "retab-small".
|
|
938
|
+
template_id: Template ID to use for filling. When provided, uses the template's pre-defined form fields
|
|
939
|
+
and empty PDF. Only works for PDF documents. Mutually exclusive with document.
|
|
843
940
|
|
|
844
941
|
Returns:
|
|
845
942
|
EditResponse: Response containing:
|
|
@@ -850,10 +947,61 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
|
|
|
850
947
|
HTTPException: If the request fails.
|
|
851
948
|
"""
|
|
852
949
|
request = self._prepare_edit(
|
|
853
|
-
document=document,
|
|
854
950
|
filling_instructions=filling_instructions,
|
|
951
|
+
document=document,
|
|
855
952
|
model=model,
|
|
953
|
+
template_id=template_id,
|
|
856
954
|
**extra_body,
|
|
857
955
|
)
|
|
858
956
|
response = await self._client._prepared_request(request)
|
|
859
957
|
return EditResponse.model_validate(response)
|
|
958
|
+
|
|
959
|
+
async def split(
|
|
960
|
+
self,
|
|
961
|
+
document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
|
|
962
|
+
categories: list[Category] | list[dict[str, str]],
|
|
963
|
+
model: str,
|
|
964
|
+
**extra_body: Any,
|
|
965
|
+
) -> SplitResponse:
|
|
966
|
+
"""
|
|
967
|
+
Split a document into sections based on provided categories asynchronously.
|
|
968
|
+
|
|
969
|
+
This method analyzes a multi-page document and classifies pages into
|
|
970
|
+
user-defined categories, returning the page ranges for each section.
|
|
971
|
+
|
|
972
|
+
Args:
|
|
973
|
+
document: The document to split. Can be a file path (Path or str), file-like object, MIMEData, PIL Image, or URL.
|
|
974
|
+
categories: List of categories to split the document into. Each category should have a 'name' and 'description'.
|
|
975
|
+
Can be Category objects or dicts with 'name' and 'description' keys.
|
|
976
|
+
model: The AI model to use for document splitting (e.g., "gemini-2.5-flash").
|
|
977
|
+
|
|
978
|
+
Returns:
|
|
979
|
+
SplitResponse: Response containing:
|
|
980
|
+
- splits: List of SplitResult objects with name, start_page, and end_page for each section.
|
|
981
|
+
|
|
982
|
+
Raises:
|
|
983
|
+
HTTPException: If the request fails.
|
|
984
|
+
|
|
985
|
+
Example:
|
|
986
|
+
```python
|
|
987
|
+
response = await retab.documents.split(
|
|
988
|
+
document="invoice_batch.pdf",
|
|
989
|
+
model="gemini-2.5-flash",
|
|
990
|
+
categories=[
|
|
991
|
+
{"name": "invoice", "description": "Invoice documents with billing information"},
|
|
992
|
+
{"name": "receipt", "description": "Receipt documents for payments"},
|
|
993
|
+
{"name": "contract", "description": "Legal contract documents"},
|
|
994
|
+
]
|
|
995
|
+
)
|
|
996
|
+
for split in response.splits:
|
|
997
|
+
print(f"{split.name}: pages {split.start_page}-{split.end_page}")
|
|
998
|
+
```
|
|
999
|
+
"""
|
|
1000
|
+
request = self._prepare_split(
|
|
1001
|
+
document=document,
|
|
1002
|
+
categories=categories,
|
|
1003
|
+
model=model,
|
|
1004
|
+
**extra_body,
|
|
1005
|
+
)
|
|
1006
|
+
response = await self._client._prepared_request(request)
|
|
1007
|
+
return SplitResponse.model_validate(response)
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from .parse import ParseRequest, ParseResult, RetabUsage
|
|
2
|
+
from .split import Category, SplitRequest, SplitResult, SplitResponse
|
|
3
|
+
|
|
4
|
+
__all__ = [
|
|
5
|
+
"ParseRequest",
|
|
6
|
+
"ParseResult",
|
|
7
|
+
"RetabUsage",
|
|
8
|
+
"Category",
|
|
9
|
+
"SplitRequest",
|
|
10
|
+
"SplitResult",
|
|
11
|
+
"SplitResponse",
|
|
12
|
+
]
|
|
@@ -22,7 +22,7 @@ class DocumentCreateMessageRequest(BaseModel):
|
|
|
22
22
|
model_config = ConfigDict(extra="ignore")
|
|
23
23
|
document: MIMEData = Field(description="The document to load.")
|
|
24
24
|
image_resolution_dpi: int = Field(default=192, description="Resolution of the image sent to the LLM")
|
|
25
|
-
model: str = Field(default="
|
|
25
|
+
model: str = Field(default="retab-small", description="The model to use for the document.")
|
|
26
26
|
|
|
27
27
|
class DocumentCreateInputRequest(DocumentCreateMessageRequest):
|
|
28
28
|
json_schema: dict[str, Any] = Field(description="The json schema to use for the document.")
|
|
@@ -60,6 +60,10 @@ class BaseFormField(BaseModel):
|
|
|
60
60
|
...,
|
|
61
61
|
description="Type of field. Currently supported values: 'text' and 'checkbox'.",
|
|
62
62
|
)
|
|
63
|
+
key: str = Field(
|
|
64
|
+
...,
|
|
65
|
+
description="Key of the field. This is used to identify the field in the form data.",
|
|
66
|
+
)
|
|
63
67
|
|
|
64
68
|
|
|
65
69
|
class FormField(BaseFormField):
|
|
@@ -113,10 +117,12 @@ class OCRResult(BaseModel):
|
|
|
113
117
|
|
|
114
118
|
|
|
115
119
|
class InferFormSchemaRequest(BaseModel):
|
|
116
|
-
"""Request to infer form schema from a PDF."""
|
|
120
|
+
"""Request to infer form schema from a PDF or DOCX document."""
|
|
117
121
|
|
|
118
|
-
document: MIMEData = Field(..., description="Input document (PDF)")
|
|
119
|
-
model: str = Field(default="
|
|
122
|
+
document: MIMEData = Field(..., description="Input document (PDF or DOCX). DOCX files will be converted to PDF.")
|
|
123
|
+
model: str = Field(default="retab-small", description="LLM model to use for inference")
|
|
124
|
+
instructions: Optional[str] = Field(default=None, description="Optional instructions to guide form field detection (e.g., which fields to focus on, specific areas to look for)")
|
|
125
|
+
per_page: Optional[bool] = Field(default=None, description="If True, process each page separately for better accuracy on long PDFs. If None (default), automatically uses per-page for PDFs with more than 3 pages.")
|
|
120
126
|
|
|
121
127
|
|
|
122
128
|
class InferFormSchemaResponse(BaseModel):
|
|
@@ -127,10 +133,17 @@ class InferFormSchemaResponse(BaseModel):
|
|
|
127
133
|
form_fields_pdf: MIMEData = Field(..., description="PDF with form field bounding boxes")
|
|
128
134
|
|
|
129
135
|
|
|
130
|
-
class EditRequest(
|
|
131
|
-
"""Request for the infer_and_fill_schema endpoint.
|
|
136
|
+
class EditRequest(BaseModel):
|
|
137
|
+
"""Request for the infer_and_fill_schema endpoint.
|
|
132
138
|
|
|
139
|
+
Either `document` OR `template_id` must be provided, but not both.
|
|
140
|
+
- When `document` is provided: OCR + LLM inference to detect and fill form fields
|
|
141
|
+
- When `template_id` is provided: Uses pre-defined form fields from the template (PDF only)
|
|
142
|
+
"""
|
|
143
|
+
document: Optional[MIMEData] = Field(default=None, description="Input document (PDF or DOCX). DOCX files will be converted to PDF. Mutually exclusive with template_id.")
|
|
144
|
+
model: str = Field(default="retab-small", description="LLM model to use for inference")
|
|
133
145
|
filling_instructions: str = Field(..., description="Instructions to fill the form")
|
|
146
|
+
template_id: Optional[str] = Field(default=None, description="Template ID to use for filling. When provided, uses the template's pre-defined form fields and empty PDF. Only works for PDF documents. Mutually exclusive with document.")
|
|
134
147
|
|
|
135
148
|
class EditResponse(BaseModel):
|
|
136
149
|
"""Response from the fill_form endpoint.
|
|
@@ -34,7 +34,7 @@ class DocumentExtractRequest(BaseModel):
|
|
|
34
34
|
stream: bool = Field(default=False, description="If true, the extraction will be streamed to the user using the active WebSocket connection")
|
|
35
35
|
seed: int | None = Field(default=None, description="Seed for the random number generator. If not provided, a random seed will be generated.", examples=[None])
|
|
36
36
|
store: bool = Field(default=True, description="If true, the extraction will be stored in the database")
|
|
37
|
-
|
|
37
|
+
chunking_keys: Optional[dict[str, str]] = Field(default=None, description="If set, keys to be used for the extraction of long lists of data using Parallel OCR", examples=[{"properties": "ID", "products": "identity.id"}])
|
|
38
38
|
web_search: bool = Field(default=False, description="Enable web search enrichment with Parallel AI to add external context during extraction")
|
|
39
39
|
metadata: dict[str, str] = Field(default_factory=dict, description="User-defined metadata to associate with this extraction")
|
|
40
40
|
extraction_id: Optional[str] = Field(default=None, description="Extraction ID to use for this extraction. If not provided, a new ID will be generated.")
|
|
@@ -18,7 +18,7 @@ class ParseRequest(BaseModel):
|
|
|
18
18
|
model_config = ConfigDict(extra="ignore")
|
|
19
19
|
|
|
20
20
|
document: MIMEData = Field(..., description="Document to parse")
|
|
21
|
-
model: str = Field(default="
|
|
21
|
+
model: str = Field(default="retab-small", description="Model to use for parsing")
|
|
22
22
|
table_parsing_format: TableParsingFormat = Field(default="html", description="Format for parsing tables")
|
|
23
23
|
image_resolution_dpi: int = Field(default=192, description="DPI for image processing", ge=96, le=300)
|
|
24
24
|
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from pydantic import BaseModel, Field
|
|
2
|
+
from ..mime import MIMEData
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class Category(BaseModel):
|
|
6
|
+
name: str = Field(..., description="The name of the category")
|
|
7
|
+
description: str = Field(..., description="The description of the category")
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class SplitRequest(BaseModel):
|
|
11
|
+
document: MIMEData = Field(..., description="The document to split")
|
|
12
|
+
categories: list[Category] = Field(..., description="The categories to split the document into")
|
|
13
|
+
model: str = Field(default="retab-small", description="The model to use to split the document")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class SplitResult(BaseModel):
|
|
17
|
+
name: str = Field(..., description="The name of the category")
|
|
18
|
+
start_page: int = Field(..., description="The start page of the category (1-indexed)")
|
|
19
|
+
end_page: int = Field(..., description="The end page of the category (1-indexed, inclusive)")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class SplitResponse(BaseModel):
|
|
23
|
+
splits: list[SplitResult] = Field(..., description="The list of document splits with their page ranges")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class SplitOutputSchema(BaseModel):
|
|
28
|
+
"""Schema for LLM structured output."""
|
|
29
|
+
splits: list[SplitResult] = Field(
|
|
30
|
+
...,
|
|
31
|
+
description="List of document sections, each classified into one of the provided categories with their page ranges"
|
|
32
|
+
)
|
|
@@ -8,7 +8,7 @@ class InferenceSettings(BaseModel):
|
|
|
8
8
|
reasoning_effort: ChatCompletionReasoningEffort = "minimal"
|
|
9
9
|
image_resolution_dpi: int = Field(default=192, description="Resolution of the image sent to the LLM", ge=96, le=300)
|
|
10
10
|
n_consensus: int = Field(default=1, ge=1, le=8, description="Number of consensus rounds to perform")
|
|
11
|
-
|
|
11
|
+
chunking_keys: dict[str, str] | None = Field(default=None, description="If set, keys to be used for the extraction of long lists of data using Parallel OCR", examples=[{"properties": "ID", "products": "identity.id"}])
|
|
12
12
|
web_search: bool = Field(default=False, description="Enable web search enrichment with Parallel AI to add external context during extraction")
|
|
13
13
|
model_config = ConfigDict(extra="ignore")
|
|
14
14
|
|
|
@@ -9,7 +9,7 @@ from ..inference_settings import InferenceSettings
|
|
|
9
9
|
from .predictions import PredictionData
|
|
10
10
|
|
|
11
11
|
default_inference_settings = InferenceSettings(
|
|
12
|
-
model="
|
|
12
|
+
model="retab-small",
|
|
13
13
|
temperature=0.5,
|
|
14
14
|
reasoning_effort="minimal",
|
|
15
15
|
image_resolution_dpi=192,
|
|
@@ -103,7 +103,57 @@ def _insert_reasoning_fields_inner(schema: dict[str, Any]) -> tuple[dict[str, An
|
|
|
103
103
|
return schema, reasoning_desc
|
|
104
104
|
|
|
105
105
|
|
|
106
|
-
def
|
|
106
|
+
def _insert_quote_fields_inner(schema: dict[str, Any]) -> dict[str, Any]:
|
|
107
|
+
"""
|
|
108
|
+
Inner function that processes a schema and adds source___ fields for leaf nodes with X-SourceQuote: true.
|
|
109
|
+
Only applies to leaf fields, never to the root.
|
|
110
|
+
"""
|
|
111
|
+
if not isinstance(schema, dict):
|
|
112
|
+
return schema
|
|
113
|
+
|
|
114
|
+
# Create a copy to avoid modifying the original
|
|
115
|
+
new_schema = copy.deepcopy(schema)
|
|
116
|
+
|
|
117
|
+
# Process children recursively
|
|
118
|
+
if "properties" in new_schema and isinstance(new_schema["properties"], dict):
|
|
119
|
+
new_props = {}
|
|
120
|
+
for property_key, property_value in new_schema["properties"].items():
|
|
121
|
+
updated_prop_schema_value = _insert_quote_fields_inner(property_value)
|
|
122
|
+
has_quote_field = updated_prop_schema_value.get("X-SourceQuote") is True
|
|
123
|
+
|
|
124
|
+
# Check if this property is a leaf with X-SourceQuote: true
|
|
125
|
+
if has_quote_field:
|
|
126
|
+
# Add the quote field
|
|
127
|
+
quote_key = f"source___{property_key}"
|
|
128
|
+
new_props[quote_key] = {"type": "string", "description": f"The exact quote from the source document that supports the extracted value for '{property_key}'."}
|
|
129
|
+
|
|
130
|
+
# Add the quote field to required if the property is required
|
|
131
|
+
if "required" in new_schema and property_key in new_schema["required"]:
|
|
132
|
+
# add the quote field to required just before the property_key
|
|
133
|
+
new_schema["required"].insert(new_schema["required"].index(property_key), quote_key)
|
|
134
|
+
|
|
135
|
+
# Remove the X-SourceQuote field
|
|
136
|
+
updated_prop_schema_value.pop("X-SourceQuote", None)
|
|
137
|
+
|
|
138
|
+
new_props[property_key] = updated_prop_schema_value
|
|
139
|
+
new_schema["properties"] = new_props
|
|
140
|
+
|
|
141
|
+
elif "items" in new_schema:
|
|
142
|
+
# Recurse into items if present
|
|
143
|
+
updated_items = _insert_quote_fields_inner(new_schema["items"])
|
|
144
|
+
new_schema["items"] = updated_items
|
|
145
|
+
|
|
146
|
+
# Process $defs as well
|
|
147
|
+
if "$defs" in new_schema and isinstance(new_schema["$defs"], dict):
|
|
148
|
+
new_defs = {}
|
|
149
|
+
for dk, dv in new_schema["$defs"].items():
|
|
150
|
+
new_defs[dk] = _insert_quote_fields_inner(dv)
|
|
151
|
+
new_schema["$defs"] = new_defs
|
|
152
|
+
|
|
153
|
+
return new_schema
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def filter_auxiliary_fields(data: dict[str, Any], prefixes: list[str] = ["reasoning___", "source___"]) -> dict[str, Any]:
|
|
107
157
|
"""
|
|
108
158
|
Recursively filters out fields that start with any of the prefixes in `prefixes` from the input data.
|
|
109
159
|
"""
|
|
@@ -142,6 +192,9 @@ def create_reasoning_schema_without_ref_expansion(json_schema: dict[str, Any]) -
|
|
|
142
192
|
if "required" in updated_schema:
|
|
143
193
|
updated_schema["required"].append("reasoning___root")
|
|
144
194
|
|
|
195
|
+
# Insert quote fields for leaf nodes with X-SourceQuote: true
|
|
196
|
+
updated_schema = _insert_quote_fields_inner(updated_schema)
|
|
197
|
+
|
|
145
198
|
# Clean the schema (remove defaults, etc)
|
|
146
199
|
updated_schema = clean_schema(updated_schema, remove_custom_fields=True)
|
|
147
200
|
return updated_schema
|
|
@@ -167,6 +220,9 @@ def create_reasoning_schema(json_schema: dict[str, Any]) -> dict[str, Any]:
|
|
|
167
220
|
if "required" in updated_schema:
|
|
168
221
|
updated_schema["required"].append("reasoning___root")
|
|
169
222
|
|
|
223
|
+
# Insert quote fields for leaf nodes with X-SourceQuote: true
|
|
224
|
+
updated_schema = _insert_quote_fields_inner(updated_schema)
|
|
225
|
+
|
|
170
226
|
# Clean the schema (remove defaults, etc)
|
|
171
227
|
updated_schema = clean_schema(updated_schema, remove_custom_fields=True)
|
|
172
228
|
return updated_schema
|
|
@@ -1118,6 +1174,30 @@ No ambiguities."
|
|
|
1118
1174
|
|
|
1119
1175
|
---
|
|
1120
1176
|
|
|
1177
|
+
## Source Quote Fields
|
|
1178
|
+
|
|
1179
|
+
The schema may include source quote fields (`source___*`) for capturing exact quotes from the document that support extracted values. These fields appear as siblings to the fields they document.
|
|
1180
|
+
|
|
1181
|
+
Naming:
|
|
1182
|
+
- `source___[fieldname]` for each field marked with X-SourceQuote in the schema
|
|
1183
|
+
|
|
1184
|
+
Guidelines:
|
|
1185
|
+
- Extract the exact verbatim text from the document that supports the extracted value.
|
|
1186
|
+
- Include surrounding context when helpful for verification.
|
|
1187
|
+
- For missing data, use an empty string `""`.
|
|
1188
|
+
- These fields are internal and omitted from final outputs.
|
|
1189
|
+
|
|
1190
|
+
### Example
|
|
1191
|
+
If extracting a company name with source quote:
|
|
1192
|
+
```json
|
|
1193
|
+
{
|
|
1194
|
+
"source___company_name": "Registered Office: ACME Corporation Ltd",
|
|
1195
|
+
"company_name": "ACME Corporation Ltd"
|
|
1196
|
+
}
|
|
1197
|
+
```
|
|
1198
|
+
|
|
1199
|
+
---
|
|
1200
|
+
|
|
1121
1201
|
## Extraction Principles
|
|
1122
1202
|
|
|
1123
1203
|
- **Transparency**: Justify every decision with evidence.
|
|
@@ -368,7 +368,7 @@ def convert_basemodel_to_partial_basemodel(base_model: Type[BaseModel]) -> Type[
|
|
|
368
368
|
|
|
369
369
|
|
|
370
370
|
|
|
371
|
-
def filter_auxiliary_fields(data: dict[str, Any], prefixes: list[str] = ["reasoning___", "
|
|
371
|
+
def filter_auxiliary_fields(data: dict[str, Any], prefixes: list[str] = ["reasoning___", "source___"]) -> dict[str, Any]:
|
|
372
372
|
"""
|
|
373
373
|
Recursively filters out fields that start with any of the prefixes in `prefixes` from the input data.
|
|
374
374
|
"""
|
|
@@ -388,7 +388,7 @@ def filter_auxiliary_fields(data: dict[str, Any], prefixes: list[str] = ["reason
|
|
|
388
388
|
return filtered
|
|
389
389
|
|
|
390
390
|
|
|
391
|
-
def filter_auxiliary_fields_json(data: str, prefixes: list[str] = ["reasoning___", "
|
|
391
|
+
def filter_auxiliary_fields_json(data: str, prefixes: list[str] = ["reasoning___", "source___"]) -> dict[str, Any]:
|
|
392
392
|
"""
|
|
393
393
|
Recursively filters out fields that start with any of the prefixes in `prefixes` from the input JSON data.
|
|
394
394
|
"""
|
|
@@ -33,6 +33,7 @@ retab/types/documents/create_messages.py
|
|
|
33
33
|
retab/types/documents/edit.py
|
|
34
34
|
retab/types/documents/extract.py
|
|
35
35
|
retab/types/documents/parse.py
|
|
36
|
+
retab/types/documents/split.py
|
|
36
37
|
retab/types/extractions/__init__.py
|
|
37
38
|
retab/types/extractions/types.py
|
|
38
39
|
retab/types/projects/__init__.py
|
|
@@ -51,6 +52,4 @@ retab/utils/hashing.py
|
|
|
51
52
|
retab/utils/json_schema.py
|
|
52
53
|
retab/utils/mime.py
|
|
53
54
|
retab/utils/stream_context_managers.py
|
|
54
|
-
retab/utils/usage/__init__.py
|
|
55
|
-
retab/utils/usage/json_schema.py
|
|
56
55
|
tests/test_projects.py
|
|
File without changes
|