PyPI - retab - Versions diffs - 0.0.76__tar.gz → 0.0.78__tar.gz - Mend

retab 0.0.76__tar.gz → 0.0.78__tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (60) hide show

{retab-0.0.76 → retab-0.0.78}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: retab
-Version: 0.0.76
+Version: 0.0.78
 Summary: Retab official python library
 Home-page: https://github.com/retab-dev/retab
 Author: Retab

{retab-0.0.76 → retab-0.0.78}/retab/resources/documents/client.py RENAMED Viewed

@@ -16,6 +16,7 @@ from ...types.chat import ChatCompletionRetabMessage
 from ...types.documents.edit import EditRequest, EditResponse
 from ...types.documents.extract import DocumentExtractRequest, RetabParsedChatCompletion, RetabParsedChatCompletionChunk, RetabParsedChoice, maybe_parse_to_pydantic
 from ...types.documents.parse import ParseRequest, ParseResult, TableParsingFormat
+from ...types.documents.split import Category, SplitRequest, SplitResponse
 from ...types.mime import MIMEData
 from ...types.standards import PreparedRequest, FieldUnset
 from ...utils.json_schema import load_json_schema, unflatten_dict
@@ -117,19 +118,24 @@ class BaseDocumentsMixin:
     def _prepare_edit(
         self,
-        document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
         filling_instructions: str,
+        document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl | None = None,
         model: str = FieldUnset,
+        template_id: str | None = FieldUnset,
         **extra_body: Any,
     ) -> PreparedRequest:
-        mime_document = prepare_mime_document(document)
         request_dict: dict[str, Any] = {
-            "document": mime_document,
             "filling_instructions": filling_instructions,
         }
+        if document is not None:
+            mime_document = prepare_mime_document(document)
+            request_dict["document"] = mime_document
         if model is not FieldUnset:
             request_dict["model"] = model
+        if template_id is not FieldUnset:
+            request_dict["template_id"] = template_id
         # Merge any extra fields provided by the caller
         if extra_body:
@@ -138,11 +144,39 @@ class BaseDocumentsMixin:
         edit_request = EditRequest(**request_dict)
         return PreparedRequest(method="POST", url="/v1/documents/edit", data=edit_request.model_dump(mode="json", exclude_unset=True))
+    def _prepare_split(
+        self,
+        document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
+        categories: list[Category] | list[dict[str, str]],
+        model: str,
+        **extra_body: Any,
+    ) -> PreparedRequest:
+        mime_document = prepare_mime_document(document)
+        # Convert dict categories to Category objects if needed
+        category_objects = [
+            Category(**cat) if isinstance(cat, dict) else cat
+            for cat in categories
+        ]
+        request_dict: dict[str, Any] = {
+            "document": mime_document,
+            "categories": category_objects,
+            "model": model,
+        }
+        # Merge any extra fields provided by the caller
+        if extra_body:
+            request_dict.update(extra_body)
+        split_request = SplitRequest(**request_dict)
+        return PreparedRequest(method="POST", url="/v1/documents/split", data=split_request.model_dump(mode="json", exclude_unset=True))
     def _prepare_extract(
         self,
         json_schema: dict[str, Any] | Path | str,
         model: str,
-        document: Path | str | IOBase | HttpUrl,
+        document: Path | str | IOBase | HttpUrl | MIMEData,
         image_resolution_dpi: int = FieldUnset,
         temperature: float = FieldUnset,
         reasoning_effort: ChatCompletionReasoningEffort = FieldUnset,
@@ -261,7 +295,7 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
         self,
         json_schema: dict[str, Any] | Path | str,
         model: str,
-        document: Path | str | IOBase | HttpUrl,
+        document: Path | str | IOBase | HttpUrl | MIMEData,
         image_resolution_dpi: int = FieldUnset,
         temperature: float = FieldUnset,
         reasoning_effort: ChatCompletionReasoningEffort = FieldUnset,
@@ -279,7 +313,7 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
         Args:
             json_schema: JSON schema defining the expected data structure
             model: The AI model to use for processing
-            document: Document to process (file path, URL, or file-like object)
+            document: Document to process (file path, URL, file-like object, or MIMEData)
             image_resolution_dpi: Optional image resolution DPI
             temperature: Model temperature setting (0-1)
             reasoning_effort: The effort level for the model to reason about the input data
@@ -405,7 +439,7 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
         self,
         json_schema: dict[str, Any] | Path | str,
         model: str,
-        document: Path | str | IOBase | HttpUrl,
+        document: Path | str | IOBase | HttpUrl | MIMEData,
         image_resolution_dpi: int = FieldUnset,
         temperature: float = FieldUnset,
         reasoning_effort: ChatCompletionReasoningEffort = FieldUnset,
@@ -535,9 +569,10 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
     def edit(
         self,
-        document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
         filling_instructions: str,
+        document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl | None = None,
         model: str = FieldUnset,
+        template_id: str | None = FieldUnset,
         **extra_body: Any,
     ) -> EditResponse:
         """
@@ -549,10 +584,15 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
         3. LLM-based form filling using the provided instructions
         4. Returns the filled PDF with form field values populated
+        Either `document` OR `template_id` must be provided, but not both.
         Args:
-            document: The document to edit. Can be a file path (Path or str), file-like object, MIMEData, PIL Image, or URL.
             filling_instructions: Instructions describing how to fill the form fields.
-            model: The LLM model to use for inference. Defaults to "gemini-2.5-pro".
+            document: The document to edit. Can be a file path (Path or str), file-like object, MIMEData, PIL Image, or URL.
+                Mutually exclusive with template_id.
+            model: The LLM model to use for inference. Defaults to "retab-small".
+            template_id: Template ID to use for filling. When provided, uses the template's pre-defined form fields
+                and empty PDF. Only works for PDF documents. Mutually exclusive with document.
         Returns:
             EditResponse: Response containing:
@@ -563,14 +603,65 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
             HTTPException: If the request fails.
         """
         request = self._prepare_edit(
-            document=document,
             filling_instructions=filling_instructions,
+            document=document,
             model=model,
+            template_id=template_id,
             **extra_body,
         )
         response = self._client._prepared_request(request)
         return EditResponse.model_validate(response)
+    def split(
+        self,
+        document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
+        categories: list[Category] | list[dict[str, str]],
+        model: str,
+        **extra_body: Any,
+    ) -> SplitResponse:
+        """
+        Split a document into sections based on provided categories.
+        This method analyzes a multi-page document and classifies pages into
+        user-defined categories, returning the page ranges for each section.
+        Args:
+            document: The document to split. Can be a file path (Path or str), file-like object, MIMEData, PIL Image, or URL.
+            categories: List of categories to split the document into. Each category should have a 'name' and 'description'.
+                Can be Category objects or dicts with 'name' and 'description' keys.
+            model: The AI model to use for document splitting (e.g., "gemini-2.5-flash").
+        Returns:
+            SplitResponse: Response containing:
+                - splits: List of SplitResult objects with name, start_page, and end_page for each section.
+        Raises:
+            HTTPException: If the request fails.
+        Example:
+            ```python
+            response = retab.documents.split(
+                document="invoice_batch.pdf",
+                model="gemini-2.5-flash",
+                categories=[
+                    {"name": "invoice", "description": "Invoice documents with billing information"},
+                    {"name": "receipt", "description": "Receipt documents for payments"},
+                    {"name": "contract", "description": "Legal contract documents"},
+                ]
+            )
+            for split in response.splits:
+                print(f"{split.name}: pages {split.start_page}-{split.end_page}")
+            ```
+        """
+        request = self._prepare_split(
+            document=document,
+            categories=categories,
+            model=model,
+            **extra_body,
+        )
+        response = self._client._prepared_request(request)
+        return SplitResponse.model_validate(response)
 class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
     """Documents API wrapper for asynchronous usage."""
@@ -637,7 +728,7 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
         self,
         json_schema: dict[str, Any] | Path | str,
         model: str,
-        document: Path | str | IOBase | HttpUrl,
+        document: Path | str | IOBase | HttpUrl | MIMEData,
         image_resolution_dpi: int = FieldUnset,
         temperature: float = FieldUnset,
         reasoning_effort: ChatCompletionReasoningEffort = FieldUnset,
@@ -655,7 +746,7 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
         Args:
             json_schema: JSON schema defining the expected data structure
             model: The AI model to use for processing
-            document: Document to process (file path, URL, or file-like object)
+            document: Document to process (file path, URL, file-like object, or MIMEData)
             image_resolution_dpi: Optional image resolution DPI
             temperature: Model temperature setting (0-1)
             reasoning_effort: The effort level for the model to reason about the input data
@@ -693,7 +784,7 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
         self,
         json_schema: dict[str, Any] | Path | str,
         model: str,
-        document: Path | str | IOBase | HttpUrl,
+        document: Path | str | IOBase | HttpUrl | MIMEData,
         image_resolution_dpi: int = FieldUnset,
         temperature: float = FieldUnset,
         reasoning_effort: ChatCompletionReasoningEffort = FieldUnset,
@@ -709,7 +800,7 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
         Args:
             json_schema: JSON schema defining the expected data structure.
             model: The AI model to use.
-            document: Document to process (file path, URL, or file-like object)
+            document: Document to process (file path, URL, file-like object, or MIMEData)
             image_resolution_dpi: Optional image resolution DPI.
             temperature: Model temperature setting (0-1).
             reasoning_effort: The effort level for the model to reason about the input data.
@@ -822,9 +913,10 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
     async def edit(
         self,
-        document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
         filling_instructions: str,
+        document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl | None = None,
         model: str = FieldUnset,
+        template_id: str | None = FieldUnset,
         **extra_body: Any,
     ) -> EditResponse:
         """
@@ -836,10 +928,15 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
         3. LLM-based form filling using the provided instructions
         4. Returns the filled PDF with form field values populated
+        Either `document` OR `template_id` must be provided, but not both.
         Args:
-            document: The document to edit. Can be a file path (Path or str), file-like object, MIMEData, PIL Image, or URL.
             filling_instructions: Instructions describing how to fill the form fields.
+            document: The document to edit. Can be a file path (Path or str), file-like object, MIMEData, PIL Image, or URL.
+                Mutually exclusive with template_id.
             model: The LLM model to use for inference. Defaults to "gemini-2.5-pro".
+            template_id: Template ID to use for filling. When provided, uses the template's pre-defined form fields
+                and empty PDF. Only works for PDF documents. Mutually exclusive with document.
         Returns:
             EditResponse: Response containing:
@@ -850,10 +947,61 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
             HTTPException: If the request fails.
         """
         request = self._prepare_edit(
-            document=document,
             filling_instructions=filling_instructions,
+            document=document,
             model=model,
+            template_id=template_id,
             **extra_body,
         )
         response = await self._client._prepared_request(request)
         return EditResponse.model_validate(response)
+    async def split(
+        self,
+        document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
+        categories: list[Category] | list[dict[str, str]],
+        model: str,
+        **extra_body: Any,
+    ) -> SplitResponse:
+        """
+        Split a document into sections based on provided categories asynchronously.
+        This method analyzes a multi-page document and classifies pages into
+        user-defined categories, returning the page ranges for each section.
+        Args:
+            document: The document to split. Can be a file path (Path or str), file-like object, MIMEData, PIL Image, or URL.
+            categories: List of categories to split the document into. Each category should have a 'name' and 'description'.
+                Can be Category objects or dicts with 'name' and 'description' keys.
+            model: The AI model to use for document splitting (e.g., "gemini-2.5-flash").
+        Returns:
+            SplitResponse: Response containing:
+                - splits: List of SplitResult objects with name, start_page, and end_page for each section.
+        Raises:
+            HTTPException: If the request fails.
+        Example:
+            ```python
+            response = await retab.documents.split(
+                document="invoice_batch.pdf",
+                model="gemini-2.5-flash",
+                categories=[
+                    {"name": "invoice", "description": "Invoice documents with billing information"},
+                    {"name": "receipt", "description": "Receipt documents for payments"},
+                    {"name": "contract", "description": "Legal contract documents"},
+                ]
+            )
+            for split in response.splits:
+                print(f"{split.name}: pages {split.start_page}-{split.end_page}")
+            ```
+        """
+        request = self._prepare_split(
+            document=document,
+            categories=categories,
+            model=model,
+            **extra_body,
+        )
+        response = await self._client._prepared_request(request)
+        return SplitResponse.model_validate(response)

retab-0.0.78/retab/types/documents/__init__.py ADDED Viewed

@@ -0,0 +1,12 @@
+from .parse import ParseRequest, ParseResult, RetabUsage
+from .split import Category, SplitRequest, SplitResult, SplitResponse
+__all__ = [
+    "ParseRequest",
+    "ParseResult",
+    "RetabUsage",
+    "Category",
+    "SplitRequest",
+    "SplitResult",
+    "SplitResponse",
+]

{retab-0.0.76 → retab-0.0.78}/retab/types/documents/create_messages.py RENAMED Viewed

@@ -22,7 +22,7 @@ class DocumentCreateMessageRequest(BaseModel):
     model_config = ConfigDict(extra="ignore")
     document: MIMEData = Field(description="The document to load.")
     image_resolution_dpi: int = Field(default=192, description="Resolution of the image sent to the LLM")
-    model: str = Field(default="gemini-2.5-flash", description="The model to use for the document.")
+    model: str = Field(default="retab-small", description="The model to use for the document.")
 class DocumentCreateInputRequest(DocumentCreateMessageRequest):
     json_schema: dict[str, Any] = Field(description="The json schema to use for the document.")

{retab-0.0.76 → retab-0.0.78}/retab/types/documents/edit.py RENAMED Viewed

@@ -60,6 +60,10 @@ class BaseFormField(BaseModel):
         ...,
         description="Type of field. Currently supported values: 'text' and 'checkbox'.",
     )
+    key: str = Field(
+        ...,
+        description="Key of the field. This is used to identify the field in the form data.",
+    )
 class FormField(BaseFormField):
@@ -113,10 +117,12 @@ class OCRResult(BaseModel):
 class InferFormSchemaRequest(BaseModel):
-    """Request to infer form schema from a PDF."""
+    """Request to infer form schema from a PDF or DOCX document."""
-    document: MIMEData = Field(..., description="Input document (PDF)")
-    model: str = Field(default="gemini-2.5-pro", description="LLM model to use for inference")
+    document: MIMEData = Field(..., description="Input document (PDF or DOCX). DOCX files will be converted to PDF.")
+    model: str = Field(default="retab-small", description="LLM model to use for inference")
+    instructions: Optional[str] = Field(default=None, description="Optional instructions to guide form field detection (e.g., which fields to focus on, specific areas to look for)")
+    per_page: Optional[bool] = Field(default=None, description="If True, process each page separately for better accuracy on long PDFs. If None (default), automatically uses per-page for PDFs with more than 3 pages.")
 class InferFormSchemaResponse(BaseModel):
@@ -127,10 +133,17 @@ class InferFormSchemaResponse(BaseModel):
     form_fields_pdf: MIMEData = Field(..., description="PDF with form field bounding boxes")
-class EditRequest(InferFormSchemaRequest):
-    """Request for the infer_and_fill_schema endpoint."""
+class EditRequest(BaseModel):
+    """Request for the infer_and_fill_schema endpoint.
+    Either `document` OR `template_id` must be provided, but not both.
+    - When `document` is provided: OCR + LLM inference to detect and fill form fields
+    - When `template_id` is provided: Uses pre-defined form fields from the template (PDF only)
+    """
+    document: Optional[MIMEData] = Field(default=None, description="Input document (PDF or DOCX). DOCX files will be converted to PDF. Mutually exclusive with template_id.")
+    model: str = Field(default="retab-small", description="LLM model to use for inference")
     filling_instructions: str = Field(..., description="Instructions to fill the form")
+    template_id: Optional[str] = Field(default=None, description="Template ID to use for filling. When provided, uses the template's pre-defined form fields and empty PDF. Only works for PDF documents. Mutually exclusive with document.")
 class EditResponse(BaseModel):
     """Response from the fill_form endpoint.

{retab-0.0.76 → retab-0.0.78}/retab/types/documents/extract.py RENAMED Viewed

@@ -34,7 +34,7 @@ class DocumentExtractRequest(BaseModel):
     stream: bool = Field(default=False, description="If true, the extraction will be streamed to the user using the active WebSocket connection")
     seed: int | None = Field(default=None, description="Seed for the random number generator. If not provided, a random seed will be generated.", examples=[None])
     store: bool = Field(default=True, description="If true, the extraction will be stored in the database")
-    parallel_ocr_keys: Optional[dict[str, str]] = Field(default=None, description="If set, keys to be used for the extraction of long lists of data using Parallel OCR", examples=[{"properties": "ID", "products": "identity.id"}])
+    chunking_keys: Optional[dict[str, str]] = Field(default=None, description="If set, keys to be used for the extraction of long lists of data using Parallel OCR", examples=[{"properties": "ID", "products": "identity.id"}])
     web_search: bool = Field(default=False, description="Enable web search enrichment with Parallel AI to add external context during extraction")
     metadata: dict[str, str] = Field(default_factory=dict, description="User-defined metadata to associate with this extraction")
     extraction_id: Optional[str] = Field(default=None, description="Extraction ID to use for this extraction. If not provided, a new ID will be generated.")

{retab-0.0.76 → retab-0.0.78}/retab/types/documents/parse.py RENAMED Viewed

@@ -18,7 +18,7 @@ class ParseRequest(BaseModel):
     model_config = ConfigDict(extra="ignore")
     document: MIMEData = Field(..., description="Document to parse")
-    model: str = Field(default="gemini-2.5-flash", description="Model to use for parsing")
+    model: str = Field(default="retab-small", description="Model to use for parsing")
     table_parsing_format: TableParsingFormat = Field(default="html", description="Format for parsing tables")
     image_resolution_dpi: int = Field(default=192, description="DPI for image processing", ge=96, le=300)

retab-0.0.78/retab/types/documents/split.py ADDED Viewed

@@ -0,0 +1,32 @@
+from pydantic import BaseModel, Field
+from ..mime import MIMEData
+class Category(BaseModel):
+    name: str = Field(..., description="The name of the category")
+    description: str = Field(..., description="The description of the category")
+class SplitRequest(BaseModel):
+    document: MIMEData = Field(..., description="The document to split")
+    categories: list[Category] = Field(..., description="The categories to split the document into")
+    model: str = Field(default="retab-small", description="The model to use to split the document")
+class SplitResult(BaseModel):
+    name: str = Field(..., description="The name of the category")
+    start_page: int = Field(..., description="The start page of the category (1-indexed)")
+    end_page: int = Field(..., description="The end page of the category (1-indexed, inclusive)")
+class SplitResponse(BaseModel):
+    splits: list[SplitResult] = Field(..., description="The list of document splits with their page ranges")
+class SplitOutputSchema(BaseModel):
+    """Schema for LLM structured output."""
+    splits: list[SplitResult] = Field(
+        ...,
+        description="List of document sections, each classified into one of the provided categories with their page ranges"
+    )

{retab-0.0.76 → retab-0.0.78}/retab/types/inference_settings.py RENAMED Viewed

@@ -8,7 +8,7 @@ class InferenceSettings(BaseModel):
     reasoning_effort: ChatCompletionReasoningEffort = "minimal"
     image_resolution_dpi: int = Field(default=192, description="Resolution of the image sent to the LLM", ge=96, le=300)
     n_consensus: int = Field(default=1, ge=1, le=8, description="Number of consensus rounds to perform")
-    parallel_ocr_keys: dict[str, str] | None = Field(default=None, description="If set, keys to be used for the extraction of long lists of data using Parallel OCR", examples=[{"properties": "ID", "products": "identity.id"}])
+    chunking_keys: dict[str, str] | None = Field(default=None, description="If set, keys to be used for the extraction of long lists of data using Parallel OCR", examples=[{"properties": "ID", "products": "identity.id"}])
     web_search: bool = Field(default=False, description="Enable web search enrichment with Parallel AI to add external context during extraction")
     model_config = ConfigDict(extra="ignore")

{retab-0.0.76 → retab-0.0.78}/retab/types/projects/model.py RENAMED Viewed

@@ -9,7 +9,7 @@ from ..inference_settings import InferenceSettings
 from .predictions import PredictionData
 default_inference_settings = InferenceSettings(
-    model="auto-small",
+    model="retab-small",
     temperature=0.5,
     reasoning_effort="minimal",
     image_resolution_dpi=192,

{retab-0.0.76 → retab-0.0.78}/retab/types/schemas/model.py RENAMED Viewed

@@ -103,7 +103,57 @@ def _insert_reasoning_fields_inner(schema: dict[str, Any]) -> tuple[dict[str, An
     return schema, reasoning_desc
-def filter_auxiliary_fields(data: dict[str, Any], prefixes: list[str] = ["reasoning___"]) -> dict[str, Any]:
+def _insert_quote_fields_inner(schema: dict[str, Any]) -> dict[str, Any]:
+    """
+    Inner function that processes a schema and adds source___ fields for leaf nodes with X-SourceQuote: true.
+    Only applies to leaf fields, never to the root.
+    """
+    if not isinstance(schema, dict):
+        return schema
+    # Create a copy to avoid modifying the original
+    new_schema = copy.deepcopy(schema)
+    # Process children recursively
+    if "properties" in new_schema and isinstance(new_schema["properties"], dict):
+        new_props = {}
+        for property_key, property_value in new_schema["properties"].items():
+            updated_prop_schema_value = _insert_quote_fields_inner(property_value)
+            has_quote_field = updated_prop_schema_value.get("X-SourceQuote") is True
+            # Check if this property is a leaf with X-SourceQuote: true
+            if has_quote_field:
+                # Add the quote field
+                quote_key = f"source___{property_key}"
+                new_props[quote_key] = {"type": "string", "description": f"The exact quote from the source document that supports the extracted value for '{property_key}'."}
+                # Add the quote field to required if the property is required
+                if "required" in new_schema and property_key in new_schema["required"]:
+                    # add the quote field to required just before the property_key
+                    new_schema["required"].insert(new_schema["required"].index(property_key), quote_key)
+                # Remove the X-SourceQuote field
+                updated_prop_schema_value.pop("X-SourceQuote", None)
+            new_props[property_key] = updated_prop_schema_value
+        new_schema["properties"] = new_props
+    elif "items" in new_schema:
+        # Recurse into items if present
+        updated_items = _insert_quote_fields_inner(new_schema["items"])
+        new_schema["items"] = updated_items
+    # Process $defs as well
+    if "$defs" in new_schema and isinstance(new_schema["$defs"], dict):
+        new_defs = {}
+        for dk, dv in new_schema["$defs"].items():
+            new_defs[dk] = _insert_quote_fields_inner(dv)
+        new_schema["$defs"] = new_defs
+    return new_schema
+def filter_auxiliary_fields(data: dict[str, Any], prefixes: list[str] = ["reasoning___", "source___"]) -> dict[str, Any]:
     """
     Recursively filters out fields that start with any of the prefixes in `prefixes` from the input data.
     """
@@ -142,6 +192,9 @@ def create_reasoning_schema_without_ref_expansion(json_schema: dict[str, Any]) -
         if "required" in updated_schema:
             updated_schema["required"].append("reasoning___root")
+    # Insert quote fields for leaf nodes with X-SourceQuote: true
+    updated_schema = _insert_quote_fields_inner(updated_schema)
     # Clean the schema (remove defaults, etc)
     updated_schema = clean_schema(updated_schema, remove_custom_fields=True)
     return updated_schema
@@ -167,6 +220,9 @@ def create_reasoning_schema(json_schema: dict[str, Any]) -> dict[str, Any]:
         if "required" in updated_schema:
             updated_schema["required"].append("reasoning___root")
+    # Insert quote fields for leaf nodes with X-SourceQuote: true
+    updated_schema = _insert_quote_fields_inner(updated_schema)
     # Clean the schema (remove defaults, etc)
     updated_schema = clean_schema(updated_schema, remove_custom_fields=True)
     return updated_schema
@@ -1118,6 +1174,30 @@ No ambiguities."
 ---
+## Source Quote Fields
+The schema may include source quote fields (`source___*`) for capturing exact quotes from the document that support extracted values. These fields appear as siblings to the fields they document.
+Naming:
+- `source___[fieldname]` for each field marked with X-SourceQuote in the schema
+Guidelines:
+- Extract the exact verbatim text from the document that supports the extracted value.
+- Include surrounding context when helpful for verification.
+- For missing data, use an empty string `""`.
+- These fields are internal and omitted from final outputs.
+### Example
+If extracting a company name with source quote:
+```json
+{
+  "source___company_name": "Registered Office: ACME Corporation Ltd",
+  "company_name": "ACME Corporation Ltd"
+}
+```
+---
 ## Extraction Principles
 - **Transparency**: Justify every decision with evidence.

{retab-0.0.76 → retab-0.0.78}/retab/utils/json_schema.py RENAMED Viewed

@@ -368,7 +368,7 @@ def convert_basemodel_to_partial_basemodel(base_model: Type[BaseModel]) -> Type[
-def filter_auxiliary_fields(data: dict[str, Any], prefixes: list[str] = ["reasoning___", "quote___"]) -> dict[str, Any]:
+def filter_auxiliary_fields(data: dict[str, Any], prefixes: list[str] = ["reasoning___", "source___"]) -> dict[str, Any]:
     """
     Recursively filters out fields that start with any of the prefixes in `prefixes` from the input data.
     """
@@ -388,7 +388,7 @@ def filter_auxiliary_fields(data: dict[str, Any], prefixes: list[str] = ["reason
     return filtered
-def filter_auxiliary_fields_json(data: str, prefixes: list[str] = ["reasoning___", "quote___"]) -> dict[str, Any]:
+def filter_auxiliary_fields_json(data: str, prefixes: list[str] = ["reasoning___", "source___"]) -> dict[str, Any]:
     """
     Recursively filters out fields that start with any of the prefixes in `prefixes` from the input JSON data.
     """

{retab-0.0.76 → retab-0.0.78}/retab.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: retab
-Version: 0.0.76
+Version: 0.0.78
 Summary: Retab official python library
 Home-page: https://github.com/retab-dev/retab
 Author: Retab

{retab-0.0.76 → retab-0.0.78}/retab.egg-info/SOURCES.txt RENAMED Viewed

@@ -33,6 +33,7 @@ retab/types/documents/create_messages.py
 retab/types/documents/edit.py
 retab/types/documents/extract.py
 retab/types/documents/parse.py
+retab/types/documents/split.py
 retab/types/extractions/__init__.py
 retab/types/extractions/types.py
 retab/types/projects/__init__.py
@@ -51,6 +52,4 @@ retab/utils/hashing.py
 retab/utils/json_schema.py
 retab/utils/mime.py
 retab/utils/stream_context_managers.py
-retab/utils/usage/__init__.py
-retab/utils/usage/json_schema.py
 tests/test_projects.py

{retab-0.0.76 → retab-0.0.78}/setup.py RENAMED Viewed

@@ -6,7 +6,7 @@ with open("requirements.txt") as f:
 setup(
     name="retab",
-    version="0.0.76",
+    version="0.0.78",
     author="Retab",
     author_email="contact@retab.com",
     description="Retab official python library",

retab-0.0.76/retab/types/documents/__init__.py DELETED Viewed

@@ -1,3 +0,0 @@
-from .parse import ParseRequest, ParseResult, RetabUsage
-__all__ = ["ParseRequest", "ParseResult", "RetabUsage"]

retab-0.0.76/retab/utils/usage/__init__.py DELETED Viewed

File without changes

retab 0.0.76__tar.gz → 0.0.78__tar.gz

retab 0.0.76__tar.gz → 0.0.78__tar.gz