retab-0.0.76.tar.gz → retab-0.0.78.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. {retab-0.0.76 → retab-0.0.78}/PKG-INFO +1 -1
  2. {retab-0.0.76 → retab-0.0.78}/retab/resources/documents/client.py +167 -19
  3. retab-0.0.78/retab/types/documents/__init__.py +12 -0
  4. {retab-0.0.76 → retab-0.0.78}/retab/types/documents/create_messages.py +1 -1
  5. {retab-0.0.76 → retab-0.0.78}/retab/types/documents/edit.py +18 -5
  6. {retab-0.0.76 → retab-0.0.78}/retab/types/documents/extract.py +1 -1
  7. {retab-0.0.76 → retab-0.0.78}/retab/types/documents/parse.py +1 -1
  8. retab-0.0.78/retab/types/documents/split.py +32 -0
  9. {retab-0.0.76 → retab-0.0.78}/retab/types/inference_settings.py +1 -1
  10. {retab-0.0.76 → retab-0.0.78}/retab/types/projects/model.py +1 -1
  11. {retab-0.0.76 → retab-0.0.78}/retab/types/schemas/model.py +81 -1
  12. {retab-0.0.76 → retab-0.0.78}/retab/utils/json_schema.py +2 -2
  13. {retab-0.0.76 → retab-0.0.78}/retab.egg-info/PKG-INFO +1 -1
  14. {retab-0.0.76 → retab-0.0.78}/retab.egg-info/SOURCES.txt +1 -2
  15. {retab-0.0.76 → retab-0.0.78}/setup.py +1 -1
  16. retab-0.0.76/retab/types/documents/__init__.py +0 -3
  17. retab-0.0.76/retab/utils/usage/__init__.py +0 -0
  18. retab-0.0.76/retab/utils/usage/json_schema.py +0 -2197
  19. {retab-0.0.76 → retab-0.0.78}/README.md +0 -0
  20. {retab-0.0.76 → retab-0.0.78}/pyproject.toml +0 -0
  21. {retab-0.0.76 → retab-0.0.78}/retab/__init__.py +0 -0
  22. {retab-0.0.76 → retab-0.0.78}/retab/_resource.py +0 -0
  23. {retab-0.0.76 → retab-0.0.78}/retab/client.py +0 -0
  24. {retab-0.0.76 → retab-0.0.78}/retab/generate_types.py +0 -0
  25. {retab-0.0.76 → retab-0.0.78}/retab/py.typed +0 -0
  26. {retab-0.0.76 → retab-0.0.78}/retab/resources/__init__.py +0 -0
  27. {retab-0.0.76 → retab-0.0.78}/retab/resources/documents/__init__.py +0 -0
  28. {retab-0.0.76 → retab-0.0.78}/retab/resources/extractions/__init__.py +0 -0
  29. {retab-0.0.76 → retab-0.0.78}/retab/resources/extractions/client.py +0 -0
  30. {retab-0.0.76 → retab-0.0.78}/retab/resources/models.py +0 -0
  31. {retab-0.0.76 → retab-0.0.78}/retab/resources/projects/__init__.py +0 -0
  32. {retab-0.0.76 → retab-0.0.78}/retab/resources/projects/client.py +0 -0
  33. {retab-0.0.76 → retab-0.0.78}/retab/resources/schemas.py +0 -0
  34. {retab-0.0.76 → retab-0.0.78}/retab/types/__init__.py +0 -0
  35. {retab-0.0.76 → retab-0.0.78}/retab/types/chat.py +0 -0
  36. {retab-0.0.76 → retab-0.0.78}/retab/types/documents/correct_orientation.py +0 -0
  37. {retab-0.0.76 → retab-0.0.78}/retab/types/extractions/__init__.py +0 -0
  38. {retab-0.0.76 → retab-0.0.78}/retab/types/extractions/types.py +0 -0
  39. {retab-0.0.76 → retab-0.0.78}/retab/types/mime.py +0 -0
  40. {retab-0.0.76 → retab-0.0.78}/retab/types/modality.py +0 -0
  41. {retab-0.0.76 → retab-0.0.78}/retab/types/pagination.py +0 -0
  42. {retab-0.0.76 → retab-0.0.78}/retab/types/projects/__init__.py +0 -0
  43. {retab-0.0.76 → retab-0.0.78}/retab/types/projects/metrics.py +0 -0
  44. {retab-0.0.76 → retab-0.0.78}/retab/types/projects/predictions.py +0 -0
  45. {retab-0.0.76 → retab-0.0.78}/retab/types/schemas/__init__.py +0 -0
  46. {retab-0.0.76 → retab-0.0.78}/retab/types/schemas/chat.py +0 -0
  47. {retab-0.0.76 → retab-0.0.78}/retab/types/schemas/generate.py +0 -0
  48. {retab-0.0.76 → retab-0.0.78}/retab/types/schemas/layout.py +0 -0
  49. {retab-0.0.76 → retab-0.0.78}/retab/types/schemas/templates.py +0 -0
  50. {retab-0.0.76 → retab-0.0.78}/retab/types/standards.py +0 -0
  51. {retab-0.0.76 → retab-0.0.78}/retab/utils/__init__.py +0 -0
  52. {retab-0.0.76 → retab-0.0.78}/retab/utils/display.py +0 -0
  53. {retab-0.0.76 → retab-0.0.78}/retab/utils/hashing.py +0 -0
  54. {retab-0.0.76 → retab-0.0.78}/retab/utils/mime.py +0 -0
  55. {retab-0.0.76 → retab-0.0.78}/retab/utils/stream_context_managers.py +0 -0
  56. {retab-0.0.76 → retab-0.0.78}/retab.egg-info/dependency_links.txt +0 -0
  57. {retab-0.0.76 → retab-0.0.78}/retab.egg-info/requires.txt +0 -0
  58. {retab-0.0.76 → retab-0.0.78}/retab.egg-info/top_level.txt +0 -0
  59. {retab-0.0.76 → retab-0.0.78}/setup.cfg +0 -0
  60. {retab-0.0.76 → retab-0.0.78}/tests/test_projects.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: retab
-Version: 0.0.76
+Version: 0.0.78
 Summary: Retab official python library
 Home-page: https://github.com/retab-dev/retab
 Author: Retab
@@ -16,6 +16,7 @@ from ...types.chat import ChatCompletionRetabMessage
 from ...types.documents.edit import EditRequest, EditResponse
 from ...types.documents.extract import DocumentExtractRequest, RetabParsedChatCompletion, RetabParsedChatCompletionChunk, RetabParsedChoice, maybe_parse_to_pydantic
 from ...types.documents.parse import ParseRequest, ParseResult, TableParsingFormat
+from ...types.documents.split import Category, SplitRequest, SplitResponse
 from ...types.mime import MIMEData
 from ...types.standards import PreparedRequest, FieldUnset
 from ...utils.json_schema import load_json_schema, unflatten_dict
@@ -117,19 +118,24 @@ class BaseDocumentsMixin:
 
     def _prepare_edit(
         self,
-        document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
         filling_instructions: str,
+        document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl | None = None,
         model: str = FieldUnset,
+        template_id: str | None = FieldUnset,
         **extra_body: Any,
     ) -> PreparedRequest:
-        mime_document = prepare_mime_document(document)
-
         request_dict: dict[str, Any] = {
-            "document": mime_document,
             "filling_instructions": filling_instructions,
         }
+
+        if document is not None:
+            mime_document = prepare_mime_document(document)
+            request_dict["document"] = mime_document
+
         if model is not FieldUnset:
             request_dict["model"] = model
+        if template_id is not FieldUnset:
+            request_dict["template_id"] = template_id
 
         # Merge any extra fields provided by the caller
         if extra_body:
@@ -138,11 +144,39 @@ class BaseDocumentsMixin:
         edit_request = EditRequest(**request_dict)
         return PreparedRequest(method="POST", url="/v1/documents/edit", data=edit_request.model_dump(mode="json", exclude_unset=True))
 
+    def _prepare_split(
+        self,
+        document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
+        categories: list[Category] | list[dict[str, str]],
+        model: str,
+        **extra_body: Any,
+    ) -> PreparedRequest:
+        mime_document = prepare_mime_document(document)
+
+        # Convert dict categories to Category objects if needed
+        category_objects = [
+            Category(**cat) if isinstance(cat, dict) else cat
+            for cat in categories
+        ]
+
+        request_dict: dict[str, Any] = {
+            "document": mime_document,
+            "categories": category_objects,
+            "model": model,
+        }
+
+        # Merge any extra fields provided by the caller
+        if extra_body:
+            request_dict.update(extra_body)
+
+        split_request = SplitRequest(**request_dict)
+        return PreparedRequest(method="POST", url="/v1/documents/split", data=split_request.model_dump(mode="json", exclude_unset=True))
+
     def _prepare_extract(
         self,
         json_schema: dict[str, Any] | Path | str,
         model: str,
-        document: Path | str | IOBase | HttpUrl,
+        document: Path | str | IOBase | HttpUrl | MIMEData,
         image_resolution_dpi: int = FieldUnset,
         temperature: float = FieldUnset,
         reasoning_effort: ChatCompletionReasoningEffort = FieldUnset,
@@ -261,7 +295,7 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
         self,
         json_schema: dict[str, Any] | Path | str,
         model: str,
-        document: Path | str | IOBase | HttpUrl,
+        document: Path | str | IOBase | HttpUrl | MIMEData,
         image_resolution_dpi: int = FieldUnset,
         temperature: float = FieldUnset,
         reasoning_effort: ChatCompletionReasoningEffort = FieldUnset,
@@ -279,7 +313,7 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
         Args:
             json_schema: JSON schema defining the expected data structure
             model: The AI model to use for processing
-            document: Document to process (file path, URL, or file-like object)
+            document: Document to process (file path, URL, file-like object, or MIMEData)
             image_resolution_dpi: Optional image resolution DPI
             temperature: Model temperature setting (0-1)
             reasoning_effort: The effort level for the model to reason about the input data
@@ -405,7 +439,7 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
         self,
         json_schema: dict[str, Any] | Path | str,
         model: str,
-        document: Path | str | IOBase | HttpUrl,
+        document: Path | str | IOBase | HttpUrl | MIMEData,
         image_resolution_dpi: int = FieldUnset,
         temperature: float = FieldUnset,
         reasoning_effort: ChatCompletionReasoningEffort = FieldUnset,
@@ -535,9 +569,10 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
 
     def edit(
         self,
-        document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
         filling_instructions: str,
+        document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl | None = None,
         model: str = FieldUnset,
+        template_id: str | None = FieldUnset,
         **extra_body: Any,
     ) -> EditResponse:
         """
@@ -549,10 +584,15 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
         3. LLM-based form filling using the provided instructions
         4. Returns the filled PDF with form field values populated
 
+        Either `document` OR `template_id` must be provided, but not both.
+
         Args:
-            document: The document to edit. Can be a file path (Path or str), file-like object, MIMEData, PIL Image, or URL.
             filling_instructions: Instructions describing how to fill the form fields.
-            model: The LLM model to use for inference. Defaults to "gemini-2.5-pro".
+            document: The document to edit. Can be a file path (Path or str), file-like object, MIMEData, PIL Image, or URL.
+                Mutually exclusive with template_id.
+            model: The LLM model to use for inference. Defaults to "retab-small".
+            template_id: Template ID to use for filling. When provided, uses the template's pre-defined form fields
+                and empty PDF. Only works for PDF documents. Mutually exclusive with document.
 
         Returns:
             EditResponse: Response containing:
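To make the new calling convention concrete, here is a minimal usage sketch (assuming a configured `retab` client instance, as in the SDK's own docstring examples; the template ID is a made-up placeholder):

```python
# Fill a concrete document: form fields are inferred from the file itself
filled = retab.documents.edit(
    filling_instructions="Fill in the applicant's name and the signing date.",
    document="application_form.pdf",
)

# Or fill from a pre-registered PDF template instead of uploading a document
filled_from_template = retab.documents.edit(
    filling_instructions="Fill in the applicant's name and the signing date.",
    template_id="tpl_example_123",  # hypothetical ID; mutually exclusive with `document`
)
```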
@@ -563,14 +603,65 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
             HTTPException: If the request fails.
         """
         request = self._prepare_edit(
-            document=document,
             filling_instructions=filling_instructions,
+            document=document,
             model=model,
+            template_id=template_id,
             **extra_body,
         )
         response = self._client._prepared_request(request)
         return EditResponse.model_validate(response)
 
+    def split(
+        self,
+        document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
+        categories: list[Category] | list[dict[str, str]],
+        model: str,
+        **extra_body: Any,
+    ) -> SplitResponse:
+        """
+        Split a document into sections based on provided categories.
+
+        This method analyzes a multi-page document and classifies pages into
+        user-defined categories, returning the page ranges for each section.
+
+        Args:
+            document: The document to split. Can be a file path (Path or str), file-like object, MIMEData, PIL Image, or URL.
+            categories: List of categories to split the document into. Each category should have a 'name' and 'description'.
+                Can be Category objects or dicts with 'name' and 'description' keys.
+            model: The AI model to use for document splitting (e.g., "gemini-2.5-flash").
+
+        Returns:
+            SplitResponse: Response containing:
+                - splits: List of SplitResult objects with name, start_page, and end_page for each section.
+
+        Raises:
+            HTTPException: If the request fails.
+
+        Example:
+            ```python
+            response = retab.documents.split(
+                document="invoice_batch.pdf",
+                model="gemini-2.5-flash",
+                categories=[
+                    {"name": "invoice", "description": "Invoice documents with billing information"},
+                    {"name": "receipt", "description": "Receipt documents for payments"},
+                    {"name": "contract", "description": "Legal contract documents"},
+                ]
+            )
+            for split in response.splits:
+                print(f"{split.name}: pages {split.start_page}-{split.end_page}")
+            ```
+        """
+        request = self._prepare_split(
+            document=document,
+            categories=categories,
+            model=model,
+            **extra_body,
+        )
+        response = self._client._prepared_request(request)
+        return SplitResponse.model_validate(response)
+
 
 class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
     """Documents API wrapper for asynchronous usage."""
@@ -637,7 +728,7 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
         self,
         json_schema: dict[str, Any] | Path | str,
         model: str,
-        document: Path | str | IOBase | HttpUrl,
+        document: Path | str | IOBase | HttpUrl | MIMEData,
         image_resolution_dpi: int = FieldUnset,
         temperature: float = FieldUnset,
         reasoning_effort: ChatCompletionReasoningEffort = FieldUnset,
@@ -655,7 +746,7 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
         Args:
             json_schema: JSON schema defining the expected data structure
             model: The AI model to use for processing
-            document: Document to process (file path, URL, or file-like object)
+            document: Document to process (file path, URL, file-like object, or MIMEData)
             image_resolution_dpi: Optional image resolution DPI
             temperature: Model temperature setting (0-1)
             reasoning_effort: The effort level for the model to reason about the input data
@@ -693,7 +784,7 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
         self,
         json_schema: dict[str, Any] | Path | str,
         model: str,
-        document: Path | str | IOBase | HttpUrl,
+        document: Path | str | IOBase | HttpUrl | MIMEData,
         image_resolution_dpi: int = FieldUnset,
         temperature: float = FieldUnset,
         reasoning_effort: ChatCompletionReasoningEffort = FieldUnset,
@@ -709,7 +800,7 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
         Args:
             json_schema: JSON schema defining the expected data structure.
             model: The AI model to use.
-            document: Document to process (file path, URL, or file-like object)
+            document: Document to process (file path, URL, file-like object, or MIMEData)
             image_resolution_dpi: Optional image resolution DPI.
             temperature: Model temperature setting (0-1).
             reasoning_effort: The effort level for the model to reason about the input data.
@@ -822,9 +913,10 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
 
     async def edit(
         self,
-        document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
         filling_instructions: str,
+        document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl | None = None,
         model: str = FieldUnset,
+        template_id: str | None = FieldUnset,
         **extra_body: Any,
     ) -> EditResponse:
         """
@@ -836,10 +928,15 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
         3. LLM-based form filling using the provided instructions
         4. Returns the filled PDF with form field values populated
 
+        Either `document` OR `template_id` must be provided, but not both.
+
         Args:
-            document: The document to edit. Can be a file path (Path or str), file-like object, MIMEData, PIL Image, or URL.
             filling_instructions: Instructions describing how to fill the form fields.
+            document: The document to edit. Can be a file path (Path or str), file-like object, MIMEData, PIL Image, or URL.
+                Mutually exclusive with template_id.
             model: The LLM model to use for inference. Defaults to "gemini-2.5-pro".
+            template_id: Template ID to use for filling. When provided, uses the template's pre-defined form fields
+                and empty PDF. Only works for PDF documents. Mutually exclusive with document.
 
         Returns:
             EditResponse: Response containing:
@@ -850,10 +947,61 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
             HTTPException: If the request fails.
         """
         request = self._prepare_edit(
-            document=document,
             filling_instructions=filling_instructions,
+            document=document,
             model=model,
+            template_id=template_id,
             **extra_body,
         )
         response = await self._client._prepared_request(request)
         return EditResponse.model_validate(response)
+
+    async def split(
+        self,
+        document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
+        categories: list[Category] | list[dict[str, str]],
+        model: str,
+        **extra_body: Any,
+    ) -> SplitResponse:
+        """
+        Split a document into sections based on provided categories asynchronously.
+
+        This method analyzes a multi-page document and classifies pages into
+        user-defined categories, returning the page ranges for each section.
+
+        Args:
+            document: The document to split. Can be a file path (Path or str), file-like object, MIMEData, PIL Image, or URL.
+            categories: List of categories to split the document into. Each category should have a 'name' and 'description'.
+                Can be Category objects or dicts with 'name' and 'description' keys.
+            model: The AI model to use for document splitting (e.g., "gemini-2.5-flash").
+
+        Returns:
+            SplitResponse: Response containing:
+                - splits: List of SplitResult objects with name, start_page, and end_page for each section.
+
+        Raises:
+            HTTPException: If the request fails.
+
+        Example:
+            ```python
+            response = await retab.documents.split(
+                document="invoice_batch.pdf",
+                model="gemini-2.5-flash",
+                categories=[
+                    {"name": "invoice", "description": "Invoice documents with billing information"},
+                    {"name": "receipt", "description": "Receipt documents for payments"},
+                    {"name": "contract", "description": "Legal contract documents"},
+                ]
+            )
+            for split in response.splits:
+                print(f"{split.name}: pages {split.start_page}-{split.end_page}")
+            ```
+        """
+        request = self._prepare_split(
+            document=document,
+            categories=categories,
+            model=model,
+            **extra_body,
+        )
+        response = await self._client._prepared_request(request)
+        return SplitResponse.model_validate(response)
@@ -0,0 +1,12 @@
+from .parse import ParseRequest, ParseResult, RetabUsage
+from .split import Category, SplitRequest, SplitResult, SplitResponse
+
+__all__ = [
+    "ParseRequest",
+    "ParseResult",
+    "RetabUsage",
+    "Category",
+    "SplitRequest",
+    "SplitResult",
+    "SplitResponse",
+]
@@ -22,7 +22,7 @@ class DocumentCreateMessageRequest(BaseModel):
     model_config = ConfigDict(extra="ignore")
     document: MIMEData = Field(description="The document to load.")
     image_resolution_dpi: int = Field(default=192, description="Resolution of the image sent to the LLM")
-    model: str = Field(default="gemini-2.5-flash", description="The model to use for the document.")
+    model: str = Field(default="retab-small", description="The model to use for the document.")
 
 class DocumentCreateInputRequest(DocumentCreateMessageRequest):
     json_schema: dict[str, Any] = Field(description="The json schema to use for the document.")
@@ -60,6 +60,10 @@ class BaseFormField(BaseModel):
         ...,
         description="Type of field. Currently supported values: 'text' and 'checkbox'.",
     )
+    key: str = Field(
+        ...,
+        description="Key of the field. This is used to identify the field in the form data.",
+    )
 
 
 class FormField(BaseFormField):
@@ -113,10 +117,12 @@ class OCRResult(BaseModel):
 
 
 class InferFormSchemaRequest(BaseModel):
-    """Request to infer form schema from a PDF."""
+    """Request to infer form schema from a PDF or DOCX document."""
 
-    document: MIMEData = Field(..., description="Input document (PDF)")
-    model: str = Field(default="gemini-2.5-pro", description="LLM model to use for inference")
+    document: MIMEData = Field(..., description="Input document (PDF or DOCX). DOCX files will be converted to PDF.")
+    model: str = Field(default="retab-small", description="LLM model to use for inference")
+    instructions: Optional[str] = Field(default=None, description="Optional instructions to guide form field detection (e.g., which fields to focus on, specific areas to look for)")
+    per_page: Optional[bool] = Field(default=None, description="If True, process each page separately for better accuracy on long PDFs. If None (default), automatically uses per-page for PDFs with more than 3 pages.")
 
 
 class InferFormSchemaResponse(BaseModel):
@@ -127,10 +133,17 @@ class InferFormSchemaResponse(BaseModel):
     form_fields_pdf: MIMEData = Field(..., description="PDF with form field bounding boxes")
 
 
-class EditRequest(InferFormSchemaRequest):
-    """Request for the infer_and_fill_schema endpoint."""
+class EditRequest(BaseModel):
+    """Request for the infer_and_fill_schema endpoint.
 
+    Either `document` OR `template_id` must be provided, but not both.
+    - When `document` is provided: OCR + LLM inference to detect and fill form fields
+    - When `template_id` is provided: Uses pre-defined form fields from the template (PDF only)
+    """
+    document: Optional[MIMEData] = Field(default=None, description="Input document (PDF or DOCX). DOCX files will be converted to PDF. Mutually exclusive with template_id.")
+    model: str = Field(default="retab-small", description="LLM model to use for inference")
     filling_instructions: str = Field(..., description="Instructions to fill the form")
+    template_id: Optional[str] = Field(default=None, description="Template ID to use for filling. When provided, uses the template's pre-defined form fields and empty PDF. Only works for PDF documents. Mutually exclusive with document.")
 
 class EditResponse(BaseModel):
     """Response from the fill_form endpoint.
@@ -34,7 +34,7 @@ class DocumentExtractRequest(BaseModel):
     stream: bool = Field(default=False, description="If true, the extraction will be streamed to the user using the active WebSocket connection")
     seed: int | None = Field(default=None, description="Seed for the random number generator. If not provided, a random seed will be generated.", examples=[None])
     store: bool = Field(default=True, description="If true, the extraction will be stored in the database")
-    parallel_ocr_keys: Optional[dict[str, str]] = Field(default=None, description="If set, keys to be used for the extraction of long lists of data using Parallel OCR", examples=[{"properties": "ID", "products": "identity.id"}])
+    chunking_keys: Optional[dict[str, str]] = Field(default=None, description="If set, keys to be used for the extraction of long lists of data using Parallel OCR", examples=[{"properties": "ID", "products": "identity.id"}])
     web_search: bool = Field(default=False, description="Enable web search enrichment with Parallel AI to add external context during extraction")
     metadata: dict[str, str] = Field(default_factory=dict, description="User-defined metadata to associate with this extraction")
     extraction_id: Optional[str] = Field(default=None, description="Extraction ID to use for this extraction. If not provided, a new ID will be generated.")
@@ -18,7 +18,7 @@ class ParseRequest(BaseModel):
     model_config = ConfigDict(extra="ignore")
 
     document: MIMEData = Field(..., description="Document to parse")
-    model: str = Field(default="gemini-2.5-flash", description="Model to use for parsing")
+    model: str = Field(default="retab-small", description="Model to use for parsing")
     table_parsing_format: TableParsingFormat = Field(default="html", description="Format for parsing tables")
     image_resolution_dpi: int = Field(default=192, description="DPI for image processing", ge=96, le=300)
 
@@ -0,0 +1,32 @@
+from pydantic import BaseModel, Field
+from ..mime import MIMEData
+
+
+class Category(BaseModel):
+    name: str = Field(..., description="The name of the category")
+    description: str = Field(..., description="The description of the category")
+
+
+class SplitRequest(BaseModel):
+    document: MIMEData = Field(..., description="The document to split")
+    categories: list[Category] = Field(..., description="The categories to split the document into")
+    model: str = Field(default="retab-small", description="The model to use to split the document")
+
+
+class SplitResult(BaseModel):
+    name: str = Field(..., description="The name of the category")
+    start_page: int = Field(..., description="The start page of the category (1-indexed)")
+    end_page: int = Field(..., description="The end page of the category (1-indexed, inclusive)")
+
+
+class SplitResponse(BaseModel):
+    splits: list[SplitResult] = Field(..., description="The list of document splits with their page ranges")
+
+
+
+class SplitOutputSchema(BaseModel):
+    """Schema for LLM structured output."""
+    splits: list[SplitResult] = Field(
+        ...,
+        description="List of document sections, each classified into one of the provided categories with their page ranges"
+    )
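For orientation, a brief sketch of how the new split types fit together (the response payload below is invented purely to show the model's shape):

```python
from retab.types.documents.split import Category, SplitResponse

# Categories a caller would pass to documents.split(); plain dicts with the same keys also work
categories = [
    Category(name="invoice", description="Invoice documents with billing information"),
    Category(name="receipt", description="Receipt documents for payments"),
]

# Validate a made-up payload shaped like the /v1/documents/split response
response = SplitResponse.model_validate(
    {
        "splits": [
            {"name": "invoice", "start_page": 1, "end_page": 3},
            {"name": "receipt", "start_page": 4, "end_page": 4},
        ]
    }
)
for section in response.splits:
    print(f"{section.name}: pages {section.start_page}-{section.end_page}")
```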
@@ -8,7 +8,7 @@ class InferenceSettings(BaseModel):
     reasoning_effort: ChatCompletionReasoningEffort = "minimal"
     image_resolution_dpi: int = Field(default=192, description="Resolution of the image sent to the LLM", ge=96, le=300)
     n_consensus: int = Field(default=1, ge=1, le=8, description="Number of consensus rounds to perform")
-    parallel_ocr_keys: dict[str, str] | None = Field(default=None, description="If set, keys to be used for the extraction of long lists of data using Parallel OCR", examples=[{"properties": "ID", "products": "identity.id"}])
+    chunking_keys: dict[str, str] | None = Field(default=None, description="If set, keys to be used for the extraction of long lists of data using Parallel OCR", examples=[{"properties": "ID", "products": "identity.id"}])
     web_search: bool = Field(default=False, description="Enable web search enrichment with Parallel AI to add external context during extraction")
     model_config = ConfigDict(extra="ignore")
 
@@ -9,7 +9,7 @@ from ..inference_settings import InferenceSettings
 from .predictions import PredictionData
 
 default_inference_settings = InferenceSettings(
-    model="auto-small",
+    model="retab-small",
     temperature=0.5,
     reasoning_effort="minimal",
     image_resolution_dpi=192,
@@ -103,7 +103,57 @@ def _insert_reasoning_fields_inner(schema: dict[str, Any]) -> tuple[dict[str, An
     return schema, reasoning_desc
 
 
-def filter_auxiliary_fields(data: dict[str, Any], prefixes: list[str] = ["reasoning___"]) -> dict[str, Any]:
+def _insert_quote_fields_inner(schema: dict[str, Any]) -> dict[str, Any]:
+    """
+    Inner function that processes a schema and adds source___ fields for leaf nodes with X-SourceQuote: true.
+    Only applies to leaf fields, never to the root.
+    """
+    if not isinstance(schema, dict):
+        return schema
+
+    # Create a copy to avoid modifying the original
+    new_schema = copy.deepcopy(schema)
+
+    # Process children recursively
+    if "properties" in new_schema and isinstance(new_schema["properties"], dict):
+        new_props = {}
+        for property_key, property_value in new_schema["properties"].items():
+            updated_prop_schema_value = _insert_quote_fields_inner(property_value)
+            has_quote_field = updated_prop_schema_value.get("X-SourceQuote") is True
+
+            # Check if this property is a leaf with X-SourceQuote: true
+            if has_quote_field:
+                # Add the quote field
+                quote_key = f"source___{property_key}"
+                new_props[quote_key] = {"type": "string", "description": f"The exact quote from the source document that supports the extracted value for '{property_key}'."}
+
+                # Add the quote field to required if the property is required
+                if "required" in new_schema and property_key in new_schema["required"]:
+                    # add the quote field to required just before the property_key
+                    new_schema["required"].insert(new_schema["required"].index(property_key), quote_key)
+
+                # Remove the X-SourceQuote field
+                updated_prop_schema_value.pop("X-SourceQuote", None)
+
+            new_props[property_key] = updated_prop_schema_value
+        new_schema["properties"] = new_props
+
+    elif "items" in new_schema:
+        # Recurse into items if present
+        updated_items = _insert_quote_fields_inner(new_schema["items"])
+        new_schema["items"] = updated_items
+
+    # Process $defs as well
+    if "$defs" in new_schema and isinstance(new_schema["$defs"], dict):
+        new_defs = {}
+        for dk, dv in new_schema["$defs"].items():
+            new_defs[dk] = _insert_quote_fields_inner(dv)
+        new_schema["$defs"] = new_defs
+
+    return new_schema
+
+
+def filter_auxiliary_fields(data: dict[str, Any], prefixes: list[str] = ["reasoning___", "source___"]) -> dict[str, Any]:
     """
     Recursively filters out fields that start with any of the prefixes in `prefixes` from the input data.
     """
@@ -142,6 +192,9 @@ def create_reasoning_schema_without_ref_expansion(json_schema: dict[str, Any]) -
     if "required" in updated_schema:
         updated_schema["required"].append("reasoning___root")
 
+    # Insert quote fields for leaf nodes with X-SourceQuote: true
+    updated_schema = _insert_quote_fields_inner(updated_schema)
+
     # Clean the schema (remove defaults, etc)
     updated_schema = clean_schema(updated_schema, remove_custom_fields=True)
     return updated_schema
@@ -167,6 +220,9 @@ def create_reasoning_schema(json_schema: dict[str, Any]) -> dict[str, Any]:
     if "required" in updated_schema:
         updated_schema["required"].append("reasoning___root")
 
+    # Insert quote fields for leaf nodes with X-SourceQuote: true
+    updated_schema = _insert_quote_fields_inner(updated_schema)
+
     # Clean the schema (remove defaults, etc)
     updated_schema = clean_schema(updated_schema, remove_custom_fields=True)
     return updated_schema
@@ -1118,6 +1174,30 @@ No ambiguities."
 
 ---
 
+## Source Quote Fields
+
+The schema may include source quote fields (`source___*`) for capturing exact quotes from the document that support extracted values. These fields appear as siblings to the fields they document.
+
+Naming:
+- `source___[fieldname]` for each field marked with X-SourceQuote in the schema
+
+Guidelines:
+- Extract the exact verbatim text from the document that supports the extracted value.
+- Include surrounding context when helpful for verification.
+- For missing data, use an empty string `""`.
+- These fields are internal and omitted from final outputs.
+
+### Example
+If extracting a company name with source quote:
+```json
+{
+    "source___company_name": "Registered Office: ACME Corporation Ltd",
+    "company_name": "ACME Corporation Ltd"
+}
+```
+
+---
+
 ## Extraction Principles
 
 - **Transparency**: Justify every decision with evidence.
@@ -368,7 +368,7 @@ def convert_basemodel_to_partial_basemodel(base_model: Type[BaseModel]) -> Type[
 
 
 
-def filter_auxiliary_fields(data: dict[str, Any], prefixes: list[str] = ["reasoning___", "quote___"]) -> dict[str, Any]:
+def filter_auxiliary_fields(data: dict[str, Any], prefixes: list[str] = ["reasoning___", "source___"]) -> dict[str, Any]:
     """
     Recursively filters out fields that start with any of the prefixes in `prefixes` from the input data.
     """
@@ -388,7 +388,7 @@ def filter_auxiliary_fields(data: dict[str, Any], prefixes: list[str] = ["reason
     return filtered
 
 
-def filter_auxiliary_fields_json(data: str, prefixes: list[str] = ["reasoning___", "quote___"]) -> dict[str, Any]:
+def filter_auxiliary_fields_json(data: str, prefixes: list[str] = ["reasoning___", "source___"]) -> dict[str, Any]:
     """
     Recursively filters out fields that start with any of the prefixes in `prefixes` from the input JSON data.
     """
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: retab
-Version: 0.0.76
+Version: 0.0.78
 Summary: Retab official python library
 Home-page: https://github.com/retab-dev/retab
 Author: Retab
@@ -33,6 +33,7 @@ retab/types/documents/create_messages.py
 retab/types/documents/edit.py
 retab/types/documents/extract.py
 retab/types/documents/parse.py
+retab/types/documents/split.py
 retab/types/extractions/__init__.py
 retab/types/extractions/types.py
 retab/types/projects/__init__.py
@@ -51,6 +52,4 @@ retab/utils/hashing.py
 retab/utils/json_schema.py
 retab/utils/mime.py
 retab/utils/stream_context_managers.py
-retab/utils/usage/__init__.py
-retab/utils/usage/json_schema.py
 tests/test_projects.py
@@ -6,7 +6,7 @@ with open("requirements.txt") as f:
 
 setup(
     name="retab",
-    version="0.0.76",
+    version="0.0.78",
     author="Retab",
     author_email="contact@retab.com",
     description="Retab official python library",
@@ -1,3 +0,0 @@
-from .parse import ParseRequest, ParseResult, RetabUsage
-
-__all__ = ["ParseRequest", "ParseResult", "RetabUsage"]
The remaining files listed above have no content changes between the two versions.