retab 0.0.76__py3-none-any.whl → 0.0.77__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- retab/resources/documents/client.py +137 -8
- retab/types/documents/__init__.py +10 -1
- retab/types/documents/split.py +32 -0
- retab/types/schemas/model.py +81 -1
- retab/utils/json_schema.py +2 -2
- {retab-0.0.76.dist-info → retab-0.0.77.dist-info}/METADATA +1 -1
- {retab-0.0.76.dist-info → retab-0.0.77.dist-info}/RECORD +9 -10
- retab/utils/usage/__init__.py +0 -0
- retab/utils/usage/json_schema.py +0 -2197
- {retab-0.0.76.dist-info → retab-0.0.77.dist-info}/WHEEL +0 -0
- {retab-0.0.76.dist-info → retab-0.0.77.dist-info}/top_level.txt +0 -0

retab/resources/documents/client.py
CHANGED

@@ -16,6 +16,7 @@ from ...types.chat import ChatCompletionRetabMessage
 from ...types.documents.edit import EditRequest, EditResponse
 from ...types.documents.extract import DocumentExtractRequest, RetabParsedChatCompletion, RetabParsedChatCompletionChunk, RetabParsedChoice, maybe_parse_to_pydantic
 from ...types.documents.parse import ParseRequest, ParseResult, TableParsingFormat
+from ...types.documents.split import Category, SplitRequest, SplitResponse
 from ...types.mime import MIMEData
 from ...types.standards import PreparedRequest, FieldUnset
 from ...utils.json_schema import load_json_schema, unflatten_dict
@@ -138,11 +139,39 @@ class BaseDocumentsMixin:
         edit_request = EditRequest(**request_dict)
         return PreparedRequest(method="POST", url="/v1/documents/edit", data=edit_request.model_dump(mode="json", exclude_unset=True))

+    def _prepare_split(
+        self,
+        document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
+        categories: list[Category] | list[dict[str, str]],
+        model: str,
+        **extra_body: Any,
+    ) -> PreparedRequest:
+        mime_document = prepare_mime_document(document)
+
+        # Convert dict categories to Category objects if needed
+        category_objects = [
+            Category(**cat) if isinstance(cat, dict) else cat
+            for cat in categories
+        ]
+
+        request_dict: dict[str, Any] = {
+            "document": mime_document,
+            "categories": category_objects,
+            "model": model,
+        }
+
+        # Merge any extra fields provided by the caller
+        if extra_body:
+            request_dict.update(extra_body)
+
+        split_request = SplitRequest(**request_dict)
+        return PreparedRequest(method="POST", url="/v1/documents/split", data=split_request.model_dump(mode="json", exclude_unset=True))
+
     def _prepare_extract(
         self,
         json_schema: dict[str, Any] | Path | str,
         model: str,
-        document: Path | str | IOBase | HttpUrl,
+        document: Path | str | IOBase | HttpUrl | MIMEData,
         image_resolution_dpi: int = FieldUnset,
         temperature: float = FieldUnset,
         reasoning_effort: ChatCompletionReasoningEffort = FieldUnset,
@@ -261,7 +290,7 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
         self,
         json_schema: dict[str, Any] | Path | str,
         model: str,
-        document: Path | str | IOBase | HttpUrl,
+        document: Path | str | IOBase | HttpUrl | MIMEData,
         image_resolution_dpi: int = FieldUnset,
         temperature: float = FieldUnset,
         reasoning_effort: ChatCompletionReasoningEffort = FieldUnset,
@@ -279,7 +308,7 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
         Args:
             json_schema: JSON schema defining the expected data structure
             model: The AI model to use for processing
-            document: Document to process (file path, URL,
+            document: Document to process (file path, URL, file-like object, or MIMEData)
             image_resolution_dpi: Optional image resolution DPI
             temperature: Model temperature setting (0-1)
             reasoning_effort: The effort level for the model to reason about the input data
@@ -405,7 +434,7 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
         self,
         json_schema: dict[str, Any] | Path | str,
         model: str,
-        document: Path | str | IOBase | HttpUrl,
+        document: Path | str | IOBase | HttpUrl | MIMEData,
         image_resolution_dpi: int = FieldUnset,
         temperature: float = FieldUnset,
         reasoning_effort: ChatCompletionReasoningEffort = FieldUnset,
@@ -571,6 +600,56 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
         response = self._client._prepared_request(request)
         return EditResponse.model_validate(response)

+    def split(
+        self,
+        document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
+        categories: list[Category] | list[dict[str, str]],
+        model: str,
+        **extra_body: Any,
+    ) -> SplitResponse:
+        """
+        Split a document into sections based on provided categories.
+
+        This method analyzes a multi-page document and classifies pages into
+        user-defined categories, returning the page ranges for each section.
+
+        Args:
+            document: The document to split. Can be a file path (Path or str), file-like object, MIMEData, PIL Image, or URL.
+            categories: List of categories to split the document into. Each category should have a 'name' and 'description'.
+                Can be Category objects or dicts with 'name' and 'description' keys.
+            model: The AI model to use for document splitting (e.g., "gemini-2.5-flash").
+
+        Returns:
+            SplitResponse: Response containing:
+                - splits: List of SplitResult objects with name, start_page, and end_page for each section.
+
+        Raises:
+            HTTPException: If the request fails.
+
+        Example:
+            ```python
+            response = retab.documents.split(
+                document="invoice_batch.pdf",
+                model="gemini-2.5-flash",
+                categories=[
+                    {"name": "invoice", "description": "Invoice documents with billing information"},
+                    {"name": "receipt", "description": "Receipt documents for payments"},
+                    {"name": "contract", "description": "Legal contract documents"},
+                ]
+            )
+            for split in response.splits:
+                print(f"{split.name}: pages {split.start_page}-{split.end_page}")
+            ```
+        """
+        request = self._prepare_split(
+            document=document,
+            categories=categories,
+            model=model,
+            **extra_body,
+        )
+        response = self._client._prepared_request(request)
+        return SplitResponse.model_validate(response)
+

 class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
     """Documents API wrapper for asynchronous usage."""
@@ -637,7 +716,7 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
         self,
         json_schema: dict[str, Any] | Path | str,
         model: str,
-        document: Path | str | IOBase | HttpUrl,
+        document: Path | str | IOBase | HttpUrl | MIMEData,
         image_resolution_dpi: int = FieldUnset,
         temperature: float = FieldUnset,
         reasoning_effort: ChatCompletionReasoningEffort = FieldUnset,
@@ -655,7 +734,7 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
         Args:
             json_schema: JSON schema defining the expected data structure
             model: The AI model to use for processing
-            document: Document to process (file path, URL,
+            document: Document to process (file path, URL, file-like object, or MIMEData)
             image_resolution_dpi: Optional image resolution DPI
             temperature: Model temperature setting (0-1)
             reasoning_effort: The effort level for the model to reason about the input data
@@ -693,7 +772,7 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
         self,
         json_schema: dict[str, Any] | Path | str,
         model: str,
-        document: Path | str | IOBase | HttpUrl,
+        document: Path | str | IOBase | HttpUrl | MIMEData,
         image_resolution_dpi: int = FieldUnset,
         temperature: float = FieldUnset,
         reasoning_effort: ChatCompletionReasoningEffort = FieldUnset,
@@ -709,7 +788,7 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
         Args:
             json_schema: JSON schema defining the expected data structure.
             model: The AI model to use.
-            document: Document to process (file path, URL,
+            document: Document to process (file path, URL, file-like object, or MIMEData)
            image_resolution_dpi: Optional image resolution DPI.
            temperature: Model temperature setting (0-1).
            reasoning_effort: The effort level for the model to reason about the input data.
@@ -857,3 +936,53 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
         )
         response = await self._client._prepared_request(request)
         return EditResponse.model_validate(response)
+
+    async def split(
+        self,
+        document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
+        categories: list[Category] | list[dict[str, str]],
+        model: str,
+        **extra_body: Any,
+    ) -> SplitResponse:
+        """
+        Split a document into sections based on provided categories asynchronously.
+
+        This method analyzes a multi-page document and classifies pages into
+        user-defined categories, returning the page ranges for each section.
+
+        Args:
+            document: The document to split. Can be a file path (Path or str), file-like object, MIMEData, PIL Image, or URL.
+            categories: List of categories to split the document into. Each category should have a 'name' and 'description'.
+                Can be Category objects or dicts with 'name' and 'description' keys.
+            model: The AI model to use for document splitting (e.g., "gemini-2.5-flash").
+
+        Returns:
+            SplitResponse: Response containing:
+                - splits: List of SplitResult objects with name, start_page, and end_page for each section.
+
+        Raises:
+            HTTPException: If the request fails.
+
+        Example:
+            ```python
+            response = await retab.documents.split(
+                document="invoice_batch.pdf",
+                model="gemini-2.5-flash",
+                categories=[
+                    {"name": "invoice", "description": "Invoice documents with billing information"},
+                    {"name": "receipt", "description": "Receipt documents for payments"},
+                    {"name": "contract", "description": "Legal contract documents"},
+                ]
+            )
+            for split in response.splits:
+                print(f"{split.name}: pages {split.start_page}-{split.end_page}")
+            ```
+        """
+        request = self._prepare_split(
+            document=document,
+            categories=categories,
+            model=model,
+            **extra_body,
+        )
+        response = await self._client._prepared_request(request)
+        return SplitResponse.model_validate(response)
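
The new `split()` methods above are thin wrappers around `POST /v1/documents/split`. Below is a minimal usage sketch for both the sync and async resources; the `Retab`/`AsyncRetab` client names and API-key handling are assumptions (client construction is not part of this diff), while the `documents.split(...)` call itself mirrors the docstring example.

```python
# Sketch only: "Retab" / "AsyncRetab" and their configuration are assumed here;
# only the split() call and the SplitResponse shape come from the diff above.
import asyncio

from retab import Retab, AsyncRetab  # assumed import path

categories = [
    {"name": "invoice", "description": "Invoice documents with billing information"},
    {"name": "receipt", "description": "Receipt documents for payments"},
]

# Synchronous client
client = Retab()
response = client.documents.split(
    document="invoice_batch.pdf",
    model="gemini-2.5-flash",
    categories=categories,
)
for section in response.splits:
    print(f"{section.name}: pages {section.start_page}-{section.end_page}")

# Asynchronous client
async def main() -> None:
    aclient = AsyncRetab()
    aresponse = await aclient.documents.split(
        document="invoice_batch.pdf",
        model="gemini-2.5-flash",
        categories=categories,
    )
    print([s.name for s in aresponse.splits])

asyncio.run(main())
```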

retab/types/documents/__init__.py
CHANGED

@@ -1,3 +1,12 @@
 from .parse import ParseRequest, ParseResult, RetabUsage
+from .split import Category, SplitRequest, SplitResult, SplitResponse

-__all__ = [
+__all__ = [
+    "ParseRequest",
+    "ParseResult",
+    "RetabUsage",
+    "Category",
+    "SplitRequest",
+    "SplitResult",
+    "SplitResponse",
+]
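
With this re-export, the split types become part of the public documents-types surface. A tiny sketch of the import path that follows directly from the diff above:

```python
# The re-export makes the split types importable from retab.types.documents.
from retab.types.documents import Category

category = Category(name="invoice", description="Invoice documents with billing information")
print(category.model_dump())  # {'name': 'invoice', 'description': 'Invoice documents with billing information'}
```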

retab/types/documents/split.py
ADDED

@@ -0,0 +1,32 @@
+from pydantic import BaseModel, Field
+from ..mime import MIMEData
+
+
+class Category(BaseModel):
+    name: str = Field(..., description="The name of the category")
+    description: str = Field(..., description="The description of the category")
+
+
+class SplitRequest(BaseModel):
+    document: MIMEData = Field(..., description="The document to split")
+    categories: list[Category] = Field(..., description="The categories to split the document into")
+    model: str = Field(..., description="The model to use to split the document")
+
+
+class SplitResult(BaseModel):
+    name: str = Field(..., description="The name of the category")
+    start_page: int = Field(..., description="The start page of the category (1-indexed)")
+    end_page: int = Field(..., description="The end page of the category (1-indexed, inclusive)")
+
+
+class SplitResponse(BaseModel):
+    splits: list[SplitResult] = Field(..., description="The list of document splits with their page ranges")
+
+
+
+class SplitOutputSchema(BaseModel):
+    """Schema for LLM structured output."""
+    splits: list[SplitResult] = Field(
+        ...,
+        description="List of document sections, each classified into one of the provided categories with their page ranges"
+    )
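
These are plain Pydantic v2 models, so split payloads can be built and validated directly. A short sketch; the sample response payload is invented, but the field names and types come from the file above.

```python
from retab.types.documents.split import Category, SplitResponse

# Categories the API is asked to classify pages into
categories = [
    Category(name="invoice", description="Invoice documents with billing information"),
    Category(name="receipt", description="Receipt documents for payments"),
]

# A hypothetical API payload, validated into typed results
raw = {
    "splits": [
        {"name": "invoice", "start_page": 1, "end_page": 3},
        {"name": "receipt", "start_page": 4, "end_page": 4},
    ]
}
response = SplitResponse.model_validate(raw)
assert response.splits[0].end_page == 3  # pages are 1-indexed and inclusive
```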
retab/types/schemas/model.py
CHANGED

@@ -103,7 +103,57 @@ def _insert_reasoning_fields_inner(schema: dict[str, Any]) -> tuple[dict[str, An
     return schema, reasoning_desc


-def 
+def _insert_quote_fields_inner(schema: dict[str, Any]) -> dict[str, Any]:
+    """
+    Inner function that processes a schema and adds source___ fields for leaf nodes with X-SourceQuote: true.
+    Only applies to leaf fields, never to the root.
+    """
+    if not isinstance(schema, dict):
+        return schema
+
+    # Create a copy to avoid modifying the original
+    new_schema = copy.deepcopy(schema)
+
+    # Process children recursively
+    if "properties" in new_schema and isinstance(new_schema["properties"], dict):
+        new_props = {}
+        for property_key, property_value in new_schema["properties"].items():
+            updated_prop_schema_value = _insert_quote_fields_inner(property_value)
+            has_quote_field = updated_prop_schema_value.get("X-SourceQuote") is True
+
+            # Check if this property is a leaf with X-SourceQuote: true
+            if has_quote_field:
+                # Add the quote field
+                quote_key = f"source___{property_key}"
+                new_props[quote_key] = {"type": "string", "description": f"The exact quote from the source document that supports the extracted value for '{property_key}'."}
+
+                # Add the quote field to required if the property is required
+                if "required" in new_schema and property_key in new_schema["required"]:
+                    # add the quote field to required just before the property_key
+                    new_schema["required"].insert(new_schema["required"].index(property_key), quote_key)
+
+                # Remove the X-SourceQuote field
+                updated_prop_schema_value.pop("X-SourceQuote", None)
+
+            new_props[property_key] = updated_prop_schema_value
+        new_schema["properties"] = new_props
+
+    elif "items" in new_schema:
+        # Recurse into items if present
+        updated_items = _insert_quote_fields_inner(new_schema["items"])
+        new_schema["items"] = updated_items
+
+    # Process $defs as well
+    if "$defs" in new_schema and isinstance(new_schema["$defs"], dict):
+        new_defs = {}
+        for dk, dv in new_schema["$defs"].items():
+            new_defs[dk] = _insert_quote_fields_inner(dv)
+        new_schema["$defs"] = new_defs
+
+    return new_schema
+
+
+def filter_auxiliary_fields(data: dict[str, Any], prefixes: list[str] = ["reasoning___", "source___"]) -> dict[str, Any]:
     """
     Recursively filters out fields that start with any of the prefixes in `prefixes` from the input data.
     """
@@ -142,6 +192,9 @@ create_reasoning_schema_without_ref_expansion(json_schema: dict[str, Any]) -
     if "required" in updated_schema:
         updated_schema["required"].append("reasoning___root")

+    # Insert quote fields for leaf nodes with X-SourceQuote: true
+    updated_schema = _insert_quote_fields_inner(updated_schema)
+
     # Clean the schema (remove defaults, etc)
     updated_schema = clean_schema(updated_schema, remove_custom_fields=True)
     return updated_schema
@@ -167,6 +220,9 @@ def create_reasoning_schema(json_schema: dict[str, Any]) -> dict[str, Any]:
     if "required" in updated_schema:
         updated_schema["required"].append("reasoning___root")

+    # Insert quote fields for leaf nodes with X-SourceQuote: true
+    updated_schema = _insert_quote_fields_inner(updated_schema)
+
     # Clean the schema (remove defaults, etc)
     updated_schema = clean_schema(updated_schema, remove_custom_fields=True)
     return updated_schema
@@ -1118,6 +1174,30 @@ No ambiguities."

 ---

+## Source Quote Fields
+
+The schema may include source quote fields (`source___*`) for capturing exact quotes from the document that support extracted values. These fields appear as siblings to the fields they document.
+
+Naming:
+- `source___[fieldname]` for each field marked with X-SourceQuote in the schema
+
+Guidelines:
+- Extract the exact verbatim text from the document that supports the extracted value.
+- Include surrounding context when helpful for verification.
+- For missing data, use an empty string `""`.
+- These fields are internal and omitted from final outputs.
+
+### Example
+If extracting a company name with source quote:
+```json
+{
+  "source___company_name": "Registered Office: ACME Corporation Ltd",
+  "company_name": "ACME Corporation Ltd"
+}
+```
+
+---
+
 ## Extraction Principles

 - **Transparency**: Justify every decision with evidence.
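
To make the `X-SourceQuote` behaviour concrete, here is a sketch of the transformation `_insert_quote_fields_inner` performs. Importing a private helper is for illustration only; the input schema is made up, and the expected output follows from the code in the hunk above.

```python
from retab.types.schemas.model import _insert_quote_fields_inner  # private helper, shown for illustration

schema = {
    "type": "object",
    "properties": {
        "company_name": {"type": "string", "X-SourceQuote": True},
        "total": {"type": "number"},
    },
    "required": ["company_name", "total"],
}

out = _insert_quote_fields_inner(schema)
# The marker is consumed and a sibling string field is added:
#   properties: source___company_name (string), company_name, total
#   required:   ["source___company_name", "company_name", "total"]
assert "source___company_name" in out["properties"]
assert "X-SourceQuote" not in out["properties"]["company_name"]
assert out["required"] == ["source___company_name", "company_name", "total"]
```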
retab/utils/json_schema.py
CHANGED

@@ -368,7 +368,7 @@ def convert_basemodel_to_partial_basemodel(base_model: Type[BaseModel]) -> Type[



-def filter_auxiliary_fields(data: dict[str, Any], prefixes: list[str] = ["reasoning___", "
+def filter_auxiliary_fields(data: dict[str, Any], prefixes: list[str] = ["reasoning___", "source___"]) -> dict[str, Any]:
     """
     Recursively filters out fields that start with any of the prefixes in `prefixes` from the input data.
     """
@@ -388,7 +388,7 @@ def filter_auxiliary_fields(data: dict[str, Any], prefixes: list[str] = ["reason
     return filtered


-def filter_auxiliary_fields_json(data: str, prefixes: list[str] = ["reasoning___", "
+def filter_auxiliary_fields_json(data: str, prefixes: list[str] = ["reasoning___", "source___"]) -> dict[str, Any]:
     """
     Recursively filters out fields that start with any of the prefixes in `prefixes` from the input JSON data.
     """
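
The widened default prefixes mean that both the `reasoning___*` and the new `source___*` helper fields are stripped from extraction payloads. A small sketch of the behaviour described by the docstring; the sample data is invented.

```python
from retab.utils.json_schema import filter_auxiliary_fields

extracted = {
    "reasoning___root": "The document is a single invoice ...",
    "source___company_name": "Registered Office: ACME Corporation Ltd",
    "company_name": "ACME Corporation Ltd",
    "total": 1234.5,
}

clean = filter_auxiliary_fields(extracted)
# Only the real payload fields survive; reasoning___*/source___* keys are dropped recursively.
print(clean)  # {'company_name': 'ACME Corporation Ltd', 'total': 1234.5}
```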

{retab-0.0.76.dist-info → retab-0.0.77.dist-info}/RECORD
CHANGED

@@ -7,7 +7,7 @@ retab/resources/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 retab/resources/models.py,sha256=4WidFBnTGZEA65DSn2pLP2SRnCVXkMTw7o_m8xVCFC4,2469
 retab/resources/schemas.py,sha256=rZ6OzfmoYv-mGaRVzvXjO09dD-KxP74mZhOO8sMgcDQ,4632
 retab/resources/documents/__init__.py,sha256=OjXmngFN0RKqO4SI-mJBNzr6Ex6rMxfq0DxaqzP0RQs,89
-retab/resources/documents/client.py,sha256=
+retab/resources/documents/client.py,sha256=xiHZPvaxETqZGXanOzp1zFQBcSB7WlgiXGtiv6Ys1dQ,42496
 retab/resources/extractions/__init__.py,sha256=2H1ezUG8hI5SmTRy6NFzXdYLOdGFFsFrI60uzkitV20,97
 retab/resources/extractions/client.py,sha256=sEoNjOgX91FTOgoJUV-I1A9A9xl1ciCdPlhYwjhEjbA,11035
 retab/resources/projects/__init__.py,sha256=tPR3_3tr7bsoYd618qmGjnYN2R23PmF5oCFd7Z5_HGY,85
@@ -19,12 +19,13 @@ retab/types/mime.py,sha256=ZLNCD3pvgn5cbGfJwzrdkjgB9dMHCbN67YEV9bx47zE,10063
 retab/types/modality.py,sha256=4B8LctdUBZVgIjtS2FjrJpljn2Eyse0XE1bpFsGb9O4,131
 retab/types/pagination.py,sha256=A0Fw06baPTfEaYwo3kvNs4vaupzlqylBc6tQH-2DFuY,279
 retab/types/standards.py,sha256=7aGtuvzzkKidvqY8JB2Cjfn43V80FeKwrTtp162kjKc,1477
-retab/types/documents/__init__.py,sha256=
+retab/types/documents/__init__.py,sha256=YDsvsmwkS5lfGXk5aBqSqmFh6LKX3dM6q_cUo5oIydU,277
 retab/types/documents/correct_orientation.py,sha256=e-ivsslI6L6Gl0YkcslXw_DH620xMGEYVp4tdeviXeM,261
 retab/types/documents/create_messages.py,sha256=Cox0QgIyLhTXIvw1Nzd2BCnB9-5KAYgw_gads5eTaDw,7272
 retab/types/documents/edit.py,sha256=HjDjhHlj08Kks7ABVohTrAJ9QngDgwVj32AxXitjrv0,4804
 retab/types/documents/extract.py,sha256=DhS9jm0lUgXVLObKm2CnSJQ2eqMmsBfttO0K9TndfIw,16728
 retab/types/documents/parse.py,sha256=Jd6i-1UXhAtgntRBZItEHGHeevyLdLmbTQa1-HNrico,1305
+retab/types/documents/split.py,sha256=Sjp2u7Ob6nBRQL23RlgiabgyUmoyf8aEyr7zdvUdU-M,1228
 retab/types/extractions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 retab/types/extractions/types.py,sha256=mnCYSfJoEKsXN2eG-PrahnnQyR6RDjP5VO9sHC1Opmg,102
 retab/types/projects/__init__.py,sha256=I7P_dems5_LOLgYQ-4Bzt9B6P6jRlQwP-D_9GxRDhVk,155
@@ -35,17 +36,15 @@ retab/types/schemas/__init__.py,sha256=9ODWiC_4pUVKxoIKglYZjvRjRyd1ZCVxG8GBdQgHN
 retab/types/schemas/chat.py,sha256=ppTidxsNslTKE5aBva04i9IxeARMqYpXYLjtR7V6pBc,21219
 retab/types/schemas/generate.py,sha256=8c9LzFgsG9BpteKzjPaLJEneEHsjCyYvGo1jdko-DI4,743
 retab/types/schemas/layout.py,sha256=JLPwQGIWfPBoe1Y5r-MhiNDJigzZ-yKZnVGgox0uqMk,1487
-retab/types/schemas/model.py,sha256=
+retab/types/schemas/model.py,sha256=kIMB1C_q7YjYJeVV3y06n3m_ebCGSLXyjDs34Ye-oes,72863
 retab/types/schemas/templates.py,sha256=XihWTHi6t_6QjxN07n_1dee5KdhHiuoHAYfmKwI7gQg,1708
 retab/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 retab/utils/display.py,sha256=ZFPbiBnwEWGR-suS8e9Xilz9OqyYRDwsKYWfbFSJPJM,18868
 retab/utils/hashing.py,sha256=_BMVUvftOcJav68QL0rLkH2dbhW9RRJPzeGC2akR0fc,757
-retab/utils/json_schema.py,sha256=
+retab/utils/json_schema.py,sha256=F3MLNGskpfPh1IkXHPLp60ceOEFD79GyL8mVvr0OiVM,19583
 retab/utils/mime.py,sha256=mTP_lqSPttOP5DYJxopiWaeFXrUCPjhwd7y53nCVGO4,6189
 retab/utils/stream_context_managers.py,sha256=gI1gVQSj3nWz6Mvjz7Ix5AiY0g6vSL-c2tPfuP04izo,2314
-retab/
-retab/
-retab-0.0.
-retab-0.0.
-retab-0.0.76.dist-info/top_level.txt,sha256=waQR0EGdhLIQtztoE3AXg7ik5ONQ9q_bsKVpyFuJdq0,6
-retab-0.0.76.dist-info/RECORD,,
+retab-0.0.77.dist-info/METADATA,sha256=F2-lc5_Am2m8rqSaVLrlsp0Uwdhe1pLZmiwcplBM9KA,4532
+retab-0.0.77.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+retab-0.0.77.dist-info/top_level.txt,sha256=waQR0EGdhLIQtztoE3AXg7ik5ONQ9q_bsKVpyFuJdq0,6
+retab-0.0.77.dist-info/RECORD,,

retab/utils/usage/__init__.py
DELETED

File without changes