retab-0.0.79-py3-none-any.whl → retab-0.0.81-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,6 +17,7 @@ from ...types.documents.edit import EditRequest, EditResponse
  from ...types.documents.extract import DocumentExtractRequest, RetabParsedChatCompletion, RetabParsedChatCompletionChunk, RetabParsedChoice, maybe_parse_to_pydantic
  from ...types.documents.parse import ParseRequest, ParseResult, TableParsingFormat
  from ...types.documents.split import Category, SplitRequest, SplitResponse
+ from ...types.documents.classify import ClassifyRequest, ClassifyResponse
  from ...types.mime import MIMEData
  from ...types.standards import PreparedRequest, FieldUnset
  from ...utils.json_schema import load_json_schema, unflatten_dict
@@ -172,6 +173,34 @@ class BaseDocumentsMixin:
          split_request = SplitRequest(**request_dict)
          return PreparedRequest(method="POST", url="/v1/documents/split", data=split_request.model_dump(mode="json", exclude_unset=True))

+     def _prepare_classify(
+         self,
+         document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
+         categories: list[Category] | list[dict[str, str]],
+         model: str,
+         **extra_body: Any,
+     ) -> PreparedRequest:
+         mime_document = prepare_mime_document(document)
+
+         # Convert dict categories to Category objects if needed
+         category_objects = [
+             Category(**cat) if isinstance(cat, dict) else cat
+             for cat in categories
+         ]
+
+         request_dict: dict[str, Any] = {
+             "document": mime_document,
+             "categories": category_objects,
+             "model": model,
+         }
+
+         # Merge any extra fields provided by the caller
+         if extra_body:
+             request_dict.update(extra_body)
+
+         classify_request = ClassifyRequest(**request_dict)
+         return PreparedRequest(method="POST", url="/v1/documents/classify", data=classify_request.model_dump(mode="json", exclude_unset=True))
+
      def _prepare_extract(
          self,
          json_schema: dict[str, Any] | Path | str,
@@ -662,6 +691,57 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
          response = self._client._prepared_request(request)
          return SplitResponse.model_validate(response)

+     def classify(
+         self,
+         document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
+         categories: list[Category] | list[dict[str, str]],
+         model: str,
+         **extra_body: Any,
+     ) -> ClassifyResponse:
+         """
+         Classify a document into one of the provided categories.
+
+         This method analyzes a document and classifies it into exactly one
+         of the user-defined categories, returning the classification with
+         chain-of-thought reasoning explaining the decision.
+
+         Args:
+             document: The document to classify. Can be a file path (Path or str), file-like object, MIMEData, PIL Image, or URL.
+             categories: List of categories to classify the document into. Each category should have a 'name' and 'description'.
+                 Can be Category objects or dicts with 'name' and 'description' keys.
+             model: The AI model to use for document classification (e.g., "gemini-2.5-flash").
+
+         Returns:
+             ClassifyResponse: Response containing:
+                 - result: ClassifyResult with reasoning and classification.
+
+         Raises:
+             HTTPException: If the request fails.
+
+         Example:
+             ```python
+             response = retab.documents.classify(
+                 document="invoice.pdf",
+                 model="gemini-2.5-flash",
+                 categories=[
+                     {"name": "invoice", "description": "Invoice documents with billing information"},
+                     {"name": "receipt", "description": "Receipt documents for payments"},
+                     {"name": "contract", "description": "Legal contract documents"},
+                 ]
+             )
+             print(f"Classification: {response.result.classification}")
+             print(f"Reasoning: {response.result.reasoning}")
+             ```
+         """
+         request = self._prepare_classify(
+             document=document,
+             categories=categories,
+             model=model,
+             **extra_body,
+         )
+         response = self._client._prepared_request(request)
+         return ClassifyResponse.model_validate(response)
+

  class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
      """Documents API wrapper for asynchronous usage."""
@@ -1005,3 +1085,54 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
          )
          response = await self._client._prepared_request(request)
          return SplitResponse.model_validate(response)
+
+     async def classify(
+         self,
+         document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
+         categories: list[Category] | list[dict[str, str]],
+         model: str,
+         **extra_body: Any,
+     ) -> ClassifyResponse:
+         """
+         Classify a document into one of the provided categories asynchronously.
+
+         This method analyzes a document and classifies it into exactly one
+         of the user-defined categories, returning the classification with
+         chain-of-thought reasoning explaining the decision.
+
+         Args:
+             document: The document to classify. Can be a file path (Path or str), file-like object, MIMEData, PIL Image, or URL.
+             categories: List of categories to classify the document into. Each category should have a 'name' and 'description'.
+                 Can be Category objects or dicts with 'name' and 'description' keys.
+             model: The AI model to use for document classification (e.g., "gemini-2.5-flash").
+
+         Returns:
+             ClassifyResponse: Response containing:
+                 - result: ClassifyResult with reasoning and classification.
+
+         Raises:
+             HTTPException: If the request fails.
+
+         Example:
+             ```python
+             response = await retab.documents.classify(
+                 document="invoice.pdf",
+                 model="gemini-2.5-flash",
+                 categories=[
+                     {"name": "invoice", "description": "Invoice documents with billing information"},
+                     {"name": "receipt", "description": "Receipt documents for payments"},
+                     {"name": "contract", "description": "Legal contract documents"},
+                 ]
+             )
+             print(f"Classification: {response.result.classification}")
+             print(f"Reasoning: {response.result.reasoning}")
+             ```
+         """
+         request = self._prepare_classify(
+             document=document,
+             categories=categories,
+             model=model,
+             **extra_body,
+         )
+         response = await self._client._prepared_request(request)
+         return ClassifyResponse.model_validate(response)
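The docstring examples above pass categories as plain dicts; `_prepare_classify` also accepts `Category` objects and forwards them unchanged. A minimal sketch of that variant, assuming the SDK's `Retab` entry point exposes this resource as `client.documents` (the client construction itself is outside this diff):

```python
from retab import Retab  # assumed entry point; adjust to how your client is constructed
from retab.types.documents.split import Category

client = Retab()

# Same call as the dict-based docstring example, but with Category models.
response = client.documents.classify(
    document="invoice.pdf",
    model="gemini-2.5-flash",
    categories=[
        Category(name="invoice", description="Invoice documents with billing information"),
        Category(name="receipt", description="Receipt documents for payments"),
    ],
)
print(response.result.classification)
print(response.result.reasoning)
```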
@@ -0,0 +1,3 @@
+ from .client import AsyncWorkflows, Workflows
+
+ __all__ = ["Workflows", "AsyncWorkflows"]
@@ -0,0 +1,190 @@
+ from io import IOBase
+ from pathlib import Path
+ from typing import Any, Dict
+
+ import PIL.Image
+ from pydantic import HttpUrl
+
+ from ..._resource import AsyncAPIResource, SyncAPIResource
+ from ...utils.mime import MIMEData, prepare_mime_document
+ from ...types.standards import PreparedRequest
+ from ...types.workflows import WorkflowRun
+
+
+ # Type alias for document inputs
+ DocumentInput = Path | str | bytes | IOBase | MIMEData | PIL.Image.Image | HttpUrl
+
+
+ class WorkflowsMixin:
+     """Mixin providing shared methods for workflow operations."""
+
+     def prepare_run(
+         self,
+         workflow_id: str,
+         documents: Dict[str, DocumentInput],
+     ) -> PreparedRequest:
+         """Prepare a request to run a workflow with input documents.
+
+         Args:
+             workflow_id: The ID of the workflow to run
+             documents: Mapping of start node IDs to their input documents.
+                 Each document can be a file path, bytes, file-like object,
+                 MIMEData, PIL Image, or HttpUrl.
+
+         Returns:
+             PreparedRequest: The prepared request
+
+         Example:
+             >>> client.workflows.run(
+             ...     workflow_id="wf_abc123",
+             ...     documents={
+             ...         "start-node-1": Path("invoice.pdf"),
+             ...         "start-node-2": Path("receipt.pdf"),
+             ...     }
+             ... )
+         """
+         # Convert each document to MIMEData and then to the format expected by the backend
+         documents_payload: Dict[str, Dict[str, Any]] = {}
+         for node_id, document in documents.items():
+             mime_data = prepare_mime_document(document)
+             documents_payload[node_id] = {
+                 "filename": mime_data.filename,
+                 "content": mime_data.content,
+                 "mime_type": mime_data.mime_type,
+             }
+
+         data = {"documents": documents_payload}
+         return PreparedRequest(method="POST", url=f"/v1/workflows/{workflow_id}/run", data=data)
+
+     def prepare_get_run(self, run_id: str) -> PreparedRequest:
+         """Prepare a request to get a workflow run by ID.
+
+         Args:
+             run_id: The ID of the workflow run to retrieve
+
+         Returns:
+             PreparedRequest: The prepared request
+         """
+         return PreparedRequest(method="GET", url=f"/v1/workflows/runs/{run_id}")
+
+
+ class Workflows(SyncAPIResource, WorkflowsMixin):
+     """Workflows API wrapper for synchronous operations."""
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+     def run(
+         self,
+         workflow_id: str,
+         documents: Dict[str, DocumentInput],
+     ) -> WorkflowRun:
+         """Run a workflow with the provided input documents.
+
+         This creates a workflow run and starts execution in the background.
+         The returned WorkflowRun will have status "running" - use get_run()
+         to check for updates on the run status.
+
+         Args:
+             workflow_id: The ID of the workflow to run
+             documents: Mapping of start node IDs to their input documents.
+                 Each document can be a file path, bytes, file-like object,
+                 MIMEData, PIL Image, or HttpUrl.
+
+         Returns:
+             WorkflowRun: The created workflow run with status "running"
+
+         Raises:
+             HTTPException: If the request fails (e.g., workflow not found,
+                 missing input documents for start nodes)
+
+         Example:
+             >>> run = client.workflows.run(
+             ...     workflow_id="wf_abc123",
+             ...     documents={
+             ...         "start-node-1": Path("invoice.pdf"),
+             ...         "start-node-2": Path("receipt.pdf"),
+             ...     }
+             ... )
+             >>> print(f"Run started: {run.id}, status: {run.status}")
+         """
+         request = self.prepare_run(workflow_id=workflow_id, documents=documents)
+         response = self._client._prepared_request(request)
+         return WorkflowRun.model_validate(response)
+
+     def get_run(self, run_id: str) -> WorkflowRun:
+         """Get a workflow run by ID.
+
+         Args:
+             run_id: The ID of the workflow run to retrieve
+
+         Returns:
+             WorkflowRun: The workflow run
+
+         Raises:
+             HTTPException: If the request fails (e.g., run not found)
+         """
+         request = self.prepare_get_run(run_id)
+         response = self._client._prepared_request(request)
+         return WorkflowRun.model_validate(response)
+
+
+ class AsyncWorkflows(AsyncAPIResource, WorkflowsMixin):
+     """Workflows API wrapper for asynchronous operations."""
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+     async def run(
+         self,
+         workflow_id: str,
+         documents: Dict[str, DocumentInput],
+     ) -> WorkflowRun:
+         """Run a workflow with the provided input documents.
+
+         This creates a workflow run and starts execution in the background.
+         The returned WorkflowRun will have status "running" - use get_run()
+         to check for updates on the run status.
+
+         Args:
+             workflow_id: The ID of the workflow to run
+             documents: Mapping of start node IDs to their input documents.
+                 Each document can be a file path, bytes, file-like object,
+                 MIMEData, PIL Image, or HttpUrl.
+
+         Returns:
+             WorkflowRun: The created workflow run with status "running"
+
+         Raises:
+             HTTPException: If the request fails (e.g., workflow not found,
+                 missing input documents for start nodes)
+
+         Example:
+             >>> run = await client.workflows.run(
+             ...     workflow_id="wf_abc123",
+             ...     documents={
+             ...         "start-node-1": Path("invoice.pdf"),
+             ...         "start-node-2": Path("receipt.pdf"),
+             ...     }
+             ... )
+             >>> print(f"Run started: {run.id}, status: {run.status}")
+         """
+         request = self.prepare_run(workflow_id=workflow_id, documents=documents)
+         response = await self._client._prepared_request(request)
+         return WorkflowRun.model_validate(response)
+
+     async def get_run(self, run_id: str) -> WorkflowRun:
+         """Get a workflow run by ID.
+
+         Args:
+             run_id: The ID of the workflow run to retrieve
+
+         Returns:
+             WorkflowRun: The workflow run
+
+         Raises:
+             HTTPException: If the request fails (e.g., run not found)
+         """
+         request = self.prepare_get_run(run_id)
+         response = await self._client._prepared_request(request)
+         return WorkflowRun.model_validate(response)
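Since `run()` returns as soon as the run is created with status "running", callers typically poll `get_run()` until the run settles. A minimal polling sketch, assuming the resource is wired onto the client as `client.workflows` and that `Retab` is the SDK's entry point (both outside this diff); the two-second interval and node IDs are illustrative:

```python
import time
from pathlib import Path

from retab import Retab  # assumed entry point

client = Retab()

run = client.workflows.run(
    workflow_id="wf_abc123",
    documents={"start-node-1": Path("invoice.pdf")},
)

# Poll until the run leaves the pending/running states
# (it ends up completed, error, or waiting_for_human).
while run.status in ("pending", "running"):
    time.sleep(2)
    run = client.workflows.get_run(run.id)

print(f"Run {run.id} finished with status {run.status} in {run.duration_ms} ms")
for step in run.steps:
    print(f"  {step.node_label}: {step.status}" + (f" ({step.error})" if step.error else ""))
```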
@@ -1,5 +1,6 @@
  from .parse import ParseRequest, ParseResult, RetabUsage
  from .split import Category, SplitRequest, SplitResult, SplitResponse
+ from .classify import ClassifyRequest, ClassifyResult, ClassifyResponse

  __all__ = [
      "ParseRequest",
@@ -9,4 +10,7 @@ __all__ = [
      "SplitRequest",
      "SplitResult",
      "SplitResponse",
+     "ClassifyRequest",
+     "ClassifyResult",
+     "ClassifyResponse",
  ]
@@ -0,0 +1,31 @@
+ from pydantic import BaseModel, Field
+ from ..mime import MIMEData
+ from .split import Category
+
+
+ class ClassifyRequest(BaseModel):
+     document: MIMEData = Field(..., description="The document to classify")
+     categories: list[Category] = Field(..., description="The categories to classify the document into")
+     model: str = Field(default="retab-small", description="The model to use for classification")
+
+
+ class ClassifyResult(BaseModel):
+     reasoning: str = Field(..., description="The reasoning for the classification decision")
+     classification: str = Field(..., description="The category name that the document belongs to")
+
+
+ class ClassifyResponse(BaseModel):
+     result: ClassifyResult = Field(..., description="The classification result with reasoning")
+
+
+ class ClassifyOutputSchema(BaseModel):
+     """Schema for LLM structured output."""
+     reasoning: str = Field(
+         ...,
+         description="Step-by-step reasoning explaining why this document belongs to the chosen category"
+     )
+     classification: str = Field(
+         ...,
+         description="The category name that this document belongs to"
+     )
+
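For reference, this is the payload shape `_prepare_classify` builds from these models before POSTing to `/v1/documents/classify`. A minimal sketch; `prepare_mime_document` is the same helper the resource classes use, and the file name and categories are illustrative:

```python
from retab.types.documents.classify import ClassifyRequest
from retab.types.documents.split import Category
from retab.utils.mime import prepare_mime_document

request = ClassifyRequest(
    document=prepare_mime_document("invoice.pdf"),
    categories=[
        Category(name="invoice", description="Invoice documents with billing information"),
        Category(name="receipt", description="Receipt documents for payments"),
    ],
    model="gemini-2.5-flash",
)

# _prepare_classify serializes the request exactly like this before sending it.
payload = request.model_dump(mode="json", exclude_unset=True)
```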
@@ -155,6 +155,9 @@ class RetabParsedChoiceDeltaChunk(ChoiceDeltaChunk):
      flat_deleted_keys: list[str] = []
      is_valid_json: bool = False
      key_mapping: dict[str, Optional[str]] | None = Field(default=None, description="Mapping of consensus keys to original model keys")
+     # Full parsed object from the LLM (when available). Used to avoid data corruption
+     # from unflatten_dict when null values are not transmitted in streaming deltas.
+     full_parsed: dict[str, Any] | None = Field(default=None, description="Complete parsed object from LLM, used instead of unflatten_dict when available")


  class RetabParsedChoiceChunk(ChoiceChunk):
@@ -183,6 +186,7 @@ class RetabParsedChatCompletionChunk(StreamingBaseModel, ChatCompletionChunk):
                  flat_parsed={},
                  flat_likelihoods={},
                  is_valid_json=False,
+                 full_parsed=None,
              )

          max_choices = max(len(self.choices), len(previous_cumulated_chunk.choices)) if previous_cumulated_chunk is not None else len(self.choices)
@@ -201,6 +205,8 @@ class RetabParsedChatCompletionChunk(StreamingBaseModel, ChatCompletionChunk):
          acc_flat_parsed = [safe_get_delta(previous_cumulated_chunk, i).flat_parsed | safe_get_delta(self, i).flat_parsed for i in range(max_choices)]
          acc_flat_likelihoods = [safe_get_delta(previous_cumulated_chunk, i).flat_likelihoods | safe_get_delta(self, i).flat_likelihoods for i in range(max_choices)]
          acc_key_mapping = [safe_get_delta(previous_cumulated_chunk, i).key_mapping or safe_get_delta(self, i).key_mapping for i in range(max_choices)]
+         # Preserve full_parsed: use the current chunk's full_parsed if available, otherwise keep the previous one
+         acc_full_parsed = [safe_get_delta(self, i).full_parsed or safe_get_delta(previous_cumulated_chunk, i).full_parsed for i in range(max_choices)]

          acc_content = [(safe_get_delta(previous_cumulated_chunk, i).content or "") + (safe_get_delta(self, i).content or "") for i in range(max_choices)]

@@ -219,6 +225,7 @@ class RetabParsedChatCompletionChunk(StreamingBaseModel, ChatCompletionChunk):
                      flat_deleted_keys=acc_flat_deleted_keys[i],
                      is_valid_json=acc_is_valid_json[i],
                      key_mapping=acc_key_mapping[i],
+                     full_parsed=acc_full_parsed[i],
                  ),
                  index=i,
              )
@@ -238,7 +245,18 @@ class RetabParsedChatCompletionChunk(StreamingBaseModel, ChatCompletionChunk):
          if override_final_flat_parseds is None:
              override_final_flat_parseds = [self.choices[idx].delta.flat_parsed for idx in range(len(self.choices))]

-         final_parsed_list = [unflatten_dict(override_final_flat_parseds[idx]) for idx in range(len(self.choices))]
+         # Build final_parsed_list using full_parsed when available (correct data from LLM),
+         # falling back to unflatten_dict for backward compatibility
+         final_parsed_list = []
+         for idx in range(len(self.choices)):
+             full_parsed = self.choices[idx].delta.full_parsed
+             if full_parsed is not None:
+                 # Use the complete parsed object from the LLM (avoids data corruption from unflatten_dict)
+                 final_parsed_list.append(full_parsed)
+             else:
+                 # Fallback: reconstruct from flat_parsed (may lose null values in sparse arrays)
+                 final_parsed_list.append(unflatten_dict(override_final_flat_parseds[idx]))
+
          final_content_list = [json.dumps(final_parsed_list[idx]) for idx in range(len(self.choices))]

          # The final likelihoods are only on the first choice.
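To make the motivation for `full_parsed` concrete: when null values are never streamed, rebuilding the object from flat keys cannot recover array slots that were only ever null. A toy illustration of the failure mode; this is not Retab's actual `unflatten_dict` or flat-key format, just a simplified stand-in:

```python
# What the model actually produced: a two-element array whose first slot is null.
original = {"items": [None, {"total": 12.5}]}

# If nulls are dropped from the streamed deltas, only this flat key ever arrives.
flat_deltas = {"items.1.total": 12.5}

def naive_unflatten(flat: dict) -> dict:
    """Rebuild nested dicts from dotted keys; a simplified stand-in for illustration."""
    result: dict = {}
    for key, value in flat.items():
        *parents, leaf = key.split(".")
        node = result
        for part in parents:
            node = node.setdefault(part, {})
        node[leaf] = value
    return result

# The reconstruction loses both the null slot and the list shape, which is why the
# accumulator now prefers delta.full_parsed whenever the server provides it.
print(naive_unflatten(flat_deltas))  # {'items': {'1': {'total': 12.5}}}
print(original)                      # {'items': [None, {'total': 12.5}]}
```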
@@ -264,7 +282,7 @@ class RetabParsedChatCompletionChunk(StreamingBaseModel, ChatCompletionChunk):
                      role="assistant",
                      parsed=final_parsed_list[idx],
                  ),
-                 key_mapping=self.choices[idx].delta.key_mapping,
+                 key_mapping=self.choices[idx].delta.key_mapping,  # type: ignore[call-arg]
                  finish_reason="stop",
                  logprobs=None,
              )
@@ -0,0 +1,11 @@
+ from .model import WorkflowRun, StepStatus, StepIOReference, HandlePayload, NodeType
+
+
+ __all__ = [
+     "WorkflowRun",
+     "StepStatus",
+     "StepIOReference",
+     "HandlePayload",
+     "NodeType",
+ ]
+
@@ -0,0 +1,76 @@
+ import datetime
+ from typing import Any, Dict, List, Literal, Optional
+
+ from pydantic import BaseModel, Field, ConfigDict
+
+
+ class StepIOReference(BaseModel):
+     """Reference to step input/output stored in GCS"""
+     file_id: Optional[str] = Field(default=None, description="File ID for document storage lookup")
+     gcs_path: Optional[str] = Field(default=None, description="GCS path to the stored file")
+     filename: Optional[str] = Field(default=None, description="Original filename")
+     mime_type: Optional[str] = Field(default=None, description="MIME type of the file")
+
+
+ class HandlePayload(BaseModel):
+     """
+     Payload for a single output handle.
+
+     Each output handle on a node produces a typed payload that can be:
+     - file: A document reference (PDF, image, etc.)
+     - json: Structured JSON data (extracted data, etc.)
+     - text: Plain text content
+     """
+     type: Literal["file", "json", "text"] = Field(..., description="Type of payload")
+     document: Optional[StepIOReference] = Field(default=None, description="For file handles: document reference")
+     data: Optional[dict] = Field(default=None, description="For JSON handles: structured data")
+     text: Optional[str] = Field(default=None, description="For text handles: text content")
+
+
+ NodeType = Literal["start", "extract", "split", "end", "hil"]
+
+
+ class StepStatus(BaseModel):
+     """Status of a single step in workflow execution"""
+     node_id: str = Field(..., description="ID of the node")
+     node_type: NodeType = Field(..., description="Type of the node")
+     node_label: str = Field(..., description="Label of the node")
+     status: Literal["pending", "running", "completed", "error", "waiting_for_human"] = Field(..., description="Current status")
+     started_at: Optional[datetime.datetime] = Field(default=None, description="When the step started")
+     completed_at: Optional[datetime.datetime] = Field(default=None, description="When the step completed")
+     duration_ms: Optional[int] = Field(default=None, description="Duration in milliseconds")
+     error: Optional[str] = Field(default=None, description="Error message if failed")
+     output: Optional[dict] = Field(default=None, description="Output data from the step")
+     handle_outputs: Optional[Dict[str, HandlePayload]] = Field(
+         default=None,
+         description="Output payloads keyed by handle ID (e.g., 'output-file-0', 'output-json-0')"
+     )
+     input_document: Optional[StepIOReference] = Field(default=None, description="Reference to input document")
+     output_document: Optional[StepIOReference] = Field(default=None, description="Reference to output document")
+     split_documents: Optional[Dict[str, StepIOReference]] = Field(default=None, description="For split nodes: category -> document reference")
+     requires_human_review: Optional[bool] = Field(default=None, description="Whether this step requires human review")
+     human_reviewed_at: Optional[datetime.datetime] = Field(default=None, description="When human review was completed")
+     human_review_approved: Optional[bool] = Field(default=None, description="Whether human approved or rejected")
+
+
+ class WorkflowRun(BaseModel):
+     """A stored workflow run record"""
+     model_config = ConfigDict(extra="ignore")
+
+     id: str = Field(..., description="Unique ID for this run")
+     workflow_id: str = Field(..., description="ID of the workflow that was run")
+     workflow_name: str = Field(..., description="Name of the workflow at time of execution")
+     organization_id: str = Field(..., description="Organization that owns this run")
+     status: Literal["pending", "running", "completed", "error", "waiting_for_human"] = Field(default="pending", description="Overall status")
+     started_at: datetime.datetime = Field(..., description="When the workflow started")
+     completed_at: Optional[datetime.datetime] = Field(default=None, description="When the workflow completed")
+     duration_ms: Optional[int] = Field(default=None, description="Total duration in milliseconds")
+     steps: List[StepStatus] = Field(default_factory=list, description="Status of each step")
+     input_documents: Optional[Dict[str, StepIOReference]] = Field(default=None, description="Start node ID -> input document reference")
+     final_outputs: Optional[dict] = Field(default=None, description="Final outputs from end nodes")
+     error: Optional[str] = Field(default=None, description="Error message if workflow failed")
+     created_at: datetime.datetime = Field(..., description="When the run was created")
+     updated_at: datetime.datetime = Field(..., description="When the run was last updated")
+     waiting_for_node_ids: List[str] = Field(default_factory=list, description="Node IDs that are waiting for human review")
+     pending_node_outputs: Optional[dict] = Field(default=None, description="Serialized node outputs to resume from")
+
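These models are plain Pydantic, so a run payload returned by the API can be validated and inspected directly. A small sketch with made-up values that satisfy the required fields above:

```python
from retab.types.workflows import WorkflowRun

payload = {
    "id": "run_001",
    "workflow_id": "wf_abc123",
    "workflow_name": "Invoice pipeline",
    "organization_id": "org_123",
    "status": "completed",
    "started_at": "2024-01-01T00:00:00Z",
    "completed_at": "2024-01-01T00:00:42Z",
    "duration_ms": 42000,
    "created_at": "2024-01-01T00:00:00Z",
    "updated_at": "2024-01-01T00:00:42Z",
    "steps": [
        {
            "node_id": "extract-1",
            "node_type": "extract",
            "node_label": "Extract invoice fields",
            "status": "completed",
            "handle_outputs": {
                "output-json-0": {"type": "json", "data": {"total": 12.5}},
            },
        }
    ],
}

run = WorkflowRun.model_validate(payload)
for step in run.steps:
    print(step.node_label, step.status)
    for handle_id, handle in (step.handle_outputs or {}).items():
        print("  ", handle_id, handle.type, handle.data)
```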
@@ -158,23 +158,43 @@ def expand_refs(schema: dict[str, Any], definitions: dict[str, dict[str, Any]] |
          print("Cyclic refs found, keeping it as is")
          return schema

+     # Support both $defs (draft 2019-09+) and definitions (draft-07)
      if definitions is None:
-         definitions = schema.pop("$defs", {})
+         definitions = schema.pop("$defs", None) or schema.pop("definitions", {})

      assert isinstance(definitions, dict)

-     if "allOf" in schema:
-         # Some schemas (notably the one converted from a pydantic model) have allOf. We only accept one element in allOf
-         if len(schema["allOf"]) != 1:
-             raise ValueError(f"Property schema must have a single element in 'allOf'. Found: {schema['allOf']}")
-         schema.update(schema.pop("allOf", [{}])[0])
+     # Handle allOf - merge all elements
+     if "allOf" in schema and isinstance(schema["allOf"], list) and len(schema["allOf"]) > 0:
+         all_of_elements = schema.pop("allOf")
+         for element in all_of_elements:
+             if isinstance(element, dict):
+                 # Recursively expand refs in each allOf element first
+                 expanded = expand_refs(element, definitions)
+                 # Deep merge properties if both have them
+                 if "properties" in expanded and "properties" in schema:
+                     schema["properties"] = {**schema["properties"], **expanded["properties"]}
+                     del expanded["properties"]
+                 # Merge required arrays if both have them
+                 if "required" in expanded and "required" in schema:
+                     schema["required"] = list(set(schema["required"] + expanded["required"]))
+                     del expanded["required"]
+                 schema.update(expanded)

      if "$ref" in schema:
          ref: str = schema["$ref"]
+         def_name: str | None = None
+
+         # Support both #/$defs/ and #/definitions/ formats
          if ref.startswith("#/$defs/"):
              def_name = ref.removeprefix("#/$defs/")
+         elif ref.startswith("#/definitions/"):
+             def_name = ref.removeprefix("#/definitions/")
+
+         if def_name is not None:
              if def_name not in definitions:
-                 raise ValueError(f"Reference {ref} not found in definitions.")
+                 # Return schema as-is if reference not found (might be external)
+                 return schema
              target = definitions[def_name]
              merged = merge_descriptions(schema, target)
              merged.pop("$ref", None)
@@ -184,7 +204,8 @@ def expand_refs(schema: dict[str, Any], definitions: dict[str, dict[str, Any]] |

      result: dict[str, Any] = {}
      for annotation, subschema in schema.items():
-         if annotation in ["properties", "$defs"]:
+         # Handle properties, $defs, and definitions (draft-07) keys
+         if annotation in ["properties", "$defs", "definitions"]:
              if isinstance(subschema, dict):
                  new_dict = {}
                  for pk, pv in subschema.items():
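The net effect of these two hunks is that `expand_refs` now accepts draft-07 style schemas (a `definitions` block with `#/definitions/...` refs) and merges multi-element `allOf` lists instead of rejecting them. A minimal sketch of the new behavior; the exact merged output also depends on `merge_descriptions` and the recursive property expansion, so the expectations in the comments are indicative rather than exhaustive:

```python
from retab.utils.json_schema import expand_refs

schema = {
    "type": "object",
    "allOf": [
        {"properties": {"name": {"type": "string"}}, "required": ["name"]},
        {"$ref": "#/definitions/Address"},
    ],
    "definitions": {
        "Address": {
            "properties": {"street": {"type": "string"}, "city": {"type": "string"}},
            "required": ["street"],
        }
    },
}

expanded = expand_refs(schema)

# The allOf elements are folded into the parent: their "properties" are combined and
# their "required" lists are unioned, so fields from both branches should be present.
print(sorted(expanded.get("properties", {})))  # expected to include city, name, street
print(sorted(expanded.get("required", [])))    # expected to include name and street
```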
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: retab
- Version: 0.0.79
+ Version: 0.0.81
  Summary: Retab official python library
  Home-page: https://github.com/retab-dev/retab
  Author: Retab
@@ -7,11 +7,13 @@ retab/resources/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  retab/resources/models.py,sha256=4WidFBnTGZEA65DSn2pLP2SRnCVXkMTw7o_m8xVCFC4,2469
  retab/resources/schemas.py,sha256=rZ6OzfmoYv-mGaRVzvXjO09dD-KxP74mZhOO8sMgcDQ,4632
  retab/resources/documents/__init__.py,sha256=OjXmngFN0RKqO4SI-mJBNzr6Ex6rMxfq0DxaqzP0RQs,89
- retab/resources/documents/client.py,sha256=XxWo9FlktrpuskAPyKWTx9UIA2VA81g0SbHjHYnigMM,43583
+ retab/resources/documents/client.py,sha256=E8v0aBF4-9ATYo5hkQ629OP5mm2AtodTzznlj2xRWtQ,49000
  retab/resources/extractions/__init__.py,sha256=2H1ezUG8hI5SmTRy6NFzXdYLOdGFFsFrI60uzkitV20,97
  retab/resources/extractions/client.py,sha256=sEoNjOgX91FTOgoJUV-I1A9A9xl1ciCdPlhYwjhEjbA,11035
  retab/resources/projects/__init__.py,sha256=tPR3_3tr7bsoYd618qmGjnYN2R23PmF5oCFd7Z5_HGY,85
  retab/resources/projects/client.py,sha256=5LPAhJt5-nqBP4VWYvo0k7cW6HLGF6K9xMiHKQzIXho,15593
+ retab/resources/workflows/__init__.py,sha256=-I0QNX7XKEr8ZJTV4-awMyKxZqGlSkKMdibiHiB7cZ0,89
+ retab/resources/workflows/client.py,sha256=svKOmkqB1-P56IjzauWNdfQtzT0rlWRIu3EddwX-HiM,6743
  retab/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  retab/types/chat.py,sha256=x9VbtPMa4w6Gc0HrFC3ILl6cCnfEn5ytDnwJtZmlcys,1436
  retab/types/inference_settings.py,sha256=wIivYffvEE7v6lhbjbhAZGssK4uYr64Oq6cZKxzY5_M,1131
@@ -19,11 +21,12 @@ retab/types/mime.py,sha256=ZLNCD3pvgn5cbGfJwzrdkjgB9dMHCbN67YEV9bx47zE,10063
  retab/types/modality.py,sha256=4B8LctdUBZVgIjtS2FjrJpljn2Eyse0XE1bpFsGb9O4,131
  retab/types/pagination.py,sha256=A0Fw06baPTfEaYwo3kvNs4vaupzlqylBc6tQH-2DFuY,279
  retab/types/standards.py,sha256=7aGtuvzzkKidvqY8JB2Cjfn43V80FeKwrTtp162kjKc,1477
- retab/types/documents/__init__.py,sha256=YDsvsmwkS5lfGXk5aBqSqmFh6LKX3dM6q_cUo5oIydU,277
+ retab/types/documents/__init__.py,sha256=t1jXdpYqi-zQMC_9uM0m7eA1hRU0MCROwUx89ccD2-c,418
+ retab/types/documents/classify.py,sha256=Tb6d_7kuTlWLr7bPn782dHrjtUVBCvXV3o9zm7j2lmE,1128
  retab/types/documents/correct_orientation.py,sha256=e-ivsslI6L6Gl0YkcslXw_DH620xMGEYVp4tdeviXeM,261
  retab/types/documents/create_messages.py,sha256=Uym0SnVUGkyt1C5AOD37BsZ3puyeu_igR6X9SboojfA,7267
  retab/types/documents/edit.py,sha256=ZY-a_Q9Y76e4oojeJJsisoCZbNSU6gqwAgb9fq9S76w,5930
- retab/types/documents/extract.py,sha256=eMaVl76K_1CeuLmdttfrf4yoQqs27f10w9rNBePb0DY,16724
+ retab/types/documents/extract.py,sha256=x_59fm69-icsxxGRgpFd0NN-SLRoMYqbvfCZuG7zyGc,18033
  retab/types/documents/parse.py,sha256=MXe7zh3DusWQhGe0Sr95nPy6cB8DRX8MA4Hmjj_AP7E,1300
  retab/types/documents/split.py,sha256=xRdJ6IpSRAPi_ZtAG2FNqg5A-v5tzfb1QQkW5UfO2pY,1246
  retab/types/extractions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -38,13 +41,15 @@ retab/types/schemas/generate.py,sha256=8c9LzFgsG9BpteKzjPaLJEneEHsjCyYvGo1jdko-D
  retab/types/schemas/layout.py,sha256=JLPwQGIWfPBoe1Y5r-MhiNDJigzZ-yKZnVGgox0uqMk,1487
  retab/types/schemas/model.py,sha256=kIMB1C_q7YjYJeVV3y06n3m_ebCGSLXyjDs34Ye-oes,72863
  retab/types/schemas/templates.py,sha256=XihWTHi6t_6QjxN07n_1dee5KdhHiuoHAYfmKwI7gQg,1708
+ retab/types/workflows/__init__.py,sha256=lyFqR2wWTb3l0Uq_84kU4GK6xPKCxUvHtC1hQ6yAHs0,200
+ retab/types/workflows/model.py,sha256=t3VCulEsSZN17uHO-TTU_y5kELwcCvmfjgjY9sg3qPM,4863
  retab/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  retab/utils/display.py,sha256=ZFPbiBnwEWGR-suS8e9Xilz9OqyYRDwsKYWfbFSJPJM,18868
  retab/utils/hashing.py,sha256=_BMVUvftOcJav68QL0rLkH2dbhW9RRJPzeGC2akR0fc,757
- retab/utils/json_schema.py,sha256=F3MLNGskpfPh1IkXHPLp60ceOEFD79GyL8mVvr0OiVM,19583
+ retab/utils/json_schema.py,sha256=zP4pQLpVHBKWo_abCjb_dU4kA0azhHopd-1TFUgVEvc,20655
  retab/utils/mime.py,sha256=mTP_lqSPttOP5DYJxopiWaeFXrUCPjhwd7y53nCVGO4,6189
  retab/utils/stream_context_managers.py,sha256=gI1gVQSj3nWz6Mvjz7Ix5AiY0g6vSL-c2tPfuP04izo,2314
- retab-0.0.79.dist-info/METADATA,sha256=GAgtfkDV8Zu0Bc4dBl7vL87xLutKpGUqpwCY3RxGFP0,4532
- retab-0.0.79.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- retab-0.0.79.dist-info/top_level.txt,sha256=waQR0EGdhLIQtztoE3AXg7ik5ONQ9q_bsKVpyFuJdq0,6
- retab-0.0.79.dist-info/RECORD,,
+ retab-0.0.81.dist-info/METADATA,sha256=1dsE31zFzslvv3Up5BOM62auWgQNbLCie0hZ2NfwP5Y,4532
+ retab-0.0.81.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ retab-0.0.81.dist-info/top_level.txt,sha256=waQR0EGdhLIQtztoE3AXg7ik5ONQ9q_bsKVpyFuJdq0,6
+ retab-0.0.81.dist-info/RECORD,,