retab 0.0.79__tar.gz → 0.0.80__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {retab-0.0.79 → retab-0.0.80}/PKG-INFO +1 -1
- retab-0.0.80/retab/resources/workflows/__init__.py +3 -0
- retab-0.0.80/retab/resources/workflows/client.py +190 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/documents/__init__.py +4 -0
- retab-0.0.80/retab/types/documents/classify.py +31 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/documents/extract.py +20 -2
- retab-0.0.80/retab/types/workflows/__init__.py +11 -0
- retab-0.0.80/retab/types/workflows/model.py +76 -0
- {retab-0.0.79 → retab-0.0.80}/retab/utils/json_schema.py +29 -8
- {retab-0.0.79 → retab-0.0.80}/retab.egg-info/PKG-INFO +1 -1
- {retab-0.0.79 → retab-0.0.80}/retab.egg-info/SOURCES.txt +5 -0
- {retab-0.0.79 → retab-0.0.80}/setup.py +1 -1
- {retab-0.0.79 → retab-0.0.80}/README.md +0 -0
- {retab-0.0.79 → retab-0.0.80}/pyproject.toml +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/__init__.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/_resource.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/client.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/generate_types.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/py.typed +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/resources/__init__.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/resources/documents/__init__.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/resources/documents/client.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/resources/extractions/__init__.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/resources/extractions/client.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/resources/models.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/resources/projects/__init__.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/resources/projects/client.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/resources/schemas.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/__init__.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/chat.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/documents/correct_orientation.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/documents/create_messages.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/documents/edit.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/documents/parse.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/documents/split.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/extractions/__init__.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/extractions/types.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/inference_settings.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/mime.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/modality.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/pagination.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/projects/__init__.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/projects/metrics.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/projects/model.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/projects/predictions.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/schemas/__init__.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/schemas/chat.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/schemas/generate.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/schemas/layout.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/schemas/model.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/schemas/templates.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/standards.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/utils/__init__.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/utils/display.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/utils/hashing.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/utils/mime.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/utils/stream_context_managers.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab.egg-info/dependency_links.txt +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab.egg-info/requires.txt +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab.egg-info/top_level.txt +0 -0
- {retab-0.0.79 → retab-0.0.80}/setup.cfg +0 -0
- {retab-0.0.79 → retab-0.0.80}/tests/test_projects.py +0 -0

retab-0.0.80/retab/resources/workflows/client.py

@@ -0,0 +1,190 @@
+from io import IOBase
+from pathlib import Path
+from typing import Any, Dict
+
+import PIL.Image
+from pydantic import HttpUrl
+
+from ..._resource import AsyncAPIResource, SyncAPIResource
+from ...utils.mime import MIMEData, prepare_mime_document
+from ...types.standards import PreparedRequest
+from ...types.workflows import WorkflowRun
+
+
+# Type alias for document inputs
+DocumentInput = Path | str | bytes | IOBase | MIMEData | PIL.Image.Image | HttpUrl
+
+
+class WorkflowsMixin:
+    """Mixin providing shared methods for workflow operations."""
+
+    def prepare_run(
+        self,
+        workflow_id: str,
+        documents: Dict[str, DocumentInput],
+    ) -> PreparedRequest:
+        """Prepare a request to run a workflow with input documents.
+
+        Args:
+            workflow_id: The ID of the workflow to run
+            documents: Mapping of start node IDs to their input documents.
+                Each document can be a file path, bytes, file-like object,
+                MIMEData, PIL Image, or HttpUrl.
+
+        Returns:
+            PreparedRequest: The prepared request
+
+        Example:
+            >>> client.workflows.run(
+            ...     workflow_id="wf_abc123",
+            ...     documents={
+            ...         "start-node-1": Path("invoice.pdf"),
+            ...         "start-node-2": Path("receipt.pdf"),
+            ...     }
+            ... )
+        """
+        # Convert each document to MIMEData and then to the format expected by the backend
+        documents_payload: Dict[str, Dict[str, Any]] = {}
+        for node_id, document in documents.items():
+            mime_data = prepare_mime_document(document)
+            documents_payload[node_id] = {
+                "filename": mime_data.filename,
+                "content": mime_data.content,
+                "mime_type": mime_data.mime_type,
+            }
+
+        data = {"documents": documents_payload}
+        return PreparedRequest(method="POST", url=f"/v1/workflows/{workflow_id}/run", data=data)
+
+    def prepare_get_run(self, run_id: str) -> PreparedRequest:
+        """Prepare a request to get a workflow run by ID.
+
+        Args:
+            run_id: The ID of the workflow run to retrieve
+
+        Returns:
+            PreparedRequest: The prepared request
+        """
+        return PreparedRequest(method="GET", url=f"/v1/workflows/runs/{run_id}")
+
+
+class Workflows(SyncAPIResource, WorkflowsMixin):
+    """Workflows API wrapper for synchronous operations."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def run(
+        self,
+        workflow_id: str,
+        documents: Dict[str, DocumentInput],
+    ) -> WorkflowRun:
+        """Run a workflow with the provided input documents.
+
+        This creates a workflow run and starts execution in the background.
+        The returned WorkflowRun will have status "running" - use get_run()
+        to check for updates on the run status.
+
+        Args:
+            workflow_id: The ID of the workflow to run
+            documents: Mapping of start node IDs to their input documents.
+                Each document can be a file path, bytes, file-like object,
+                MIMEData, PIL Image, or HttpUrl.
+
+        Returns:
+            WorkflowRun: The created workflow run with status "running"
+
+        Raises:
+            HTTPException: If the request fails (e.g., workflow not found,
+                missing input documents for start nodes)
+
+        Example:
+            >>> run = client.workflows.run(
+            ...     workflow_id="wf_abc123",
+            ...     documents={
+            ...         "start-node-1": Path("invoice.pdf"),
+            ...         "start-node-2": Path("receipt.pdf"),
+            ...     }
+            ... )
+            >>> print(f"Run started: {run.id}, status: {run.status}")
+        """
+        request = self.prepare_run(workflow_id=workflow_id, documents=documents)
+        response = self._client._prepared_request(request)
+        return WorkflowRun.model_validate(response)
+
+    def get_run(self, run_id: str) -> WorkflowRun:
+        """Get a workflow run by ID.
+
+        Args:
+            run_id: The ID of the workflow run to retrieve
+
+        Returns:
+            WorkflowRun: The workflow run
+
+        Raises:
+            HTTPException: If the request fails (e.g., run not found)
+        """
+        request = self.prepare_get_run(run_id)
+        response = self._client._prepared_request(request)
+        return WorkflowRun.model_validate(response)
+
+
+class AsyncWorkflows(AsyncAPIResource, WorkflowsMixin):
+    """Workflows API wrapper for asynchronous operations."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    async def run(
+        self,
+        workflow_id: str,
+        documents: Dict[str, DocumentInput],
+    ) -> WorkflowRun:
+        """Run a workflow with the provided input documents.
+
+        This creates a workflow run and starts execution in the background.
+        The returned WorkflowRun will have status "running" - use get_run()
+        to check for updates on the run status.
+
+        Args:
+            workflow_id: The ID of the workflow to run
+            documents: Mapping of start node IDs to their input documents.
+                Each document can be a file path, bytes, file-like object,
+                MIMEData, PIL Image, or HttpUrl.
+
+        Returns:
+            WorkflowRun: The created workflow run with status "running"
+
+        Raises:
+            HTTPException: If the request fails (e.g., workflow not found,
+                missing input documents for start nodes)
+
+        Example:
+            >>> run = await client.workflows.run(
+            ...     workflow_id="wf_abc123",
+            ...     documents={
+            ...         "start-node-1": Path("invoice.pdf"),
+            ...         "start-node-2": Path("receipt.pdf"),
+            ...     }
+            ... )
+            >>> print(f"Run started: {run.id}, status: {run.status}")
+        """
+        request = self.prepare_run(workflow_id=workflow_id, documents=documents)
+        response = await self._client._prepared_request(request)
+        return WorkflowRun.model_validate(response)
+
+    async def get_run(self, run_id: str) -> WorkflowRun:
+        """Get a workflow run by ID.
+
+        Args:
+            run_id: The ID of the workflow run to retrieve
+
+        Returns:
+            WorkflowRun: The workflow run
+
+        Raises:
+            HTTPException: If the request fails (e.g., run not found)
+        """
+        request = self.prepare_get_run(run_id)
+        response = await self._client._prepared_request(request)
+        return WorkflowRun.model_validate(response)
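
The new resource is deliberately thin: `run()` posts the input documents and returns immediately, and `get_run()` is the poller. A minimal polling sketch, assuming the top-level client is constructed as `Retab()` and exposes this resource as `client.workflows` (the client constructor itself is not part of this diff):

```python
import time
from pathlib import Path

from retab import Retab  # assumed entry point; the client class is not shown in this diff

client = Retab()  # assumes credentials are picked up from the environment

run = client.workflows.run(
    workflow_id="wf_abc123",
    documents={"start-node-1": Path("invoice.pdf")},
)

# run() returns with status "running"; poll until the run reaches a terminal state.
while run.status in ("pending", "running"):
    time.sleep(2)
    run = client.workflows.get_run(run.id)

if run.status == "completed":
    print(run.final_outputs)
elif run.status == "waiting_for_human":
    print("Waiting on human review for nodes:", run.waiting_for_node_ids)
else:
    print("Workflow failed:", run.error)
```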

{retab-0.0.79 → retab-0.0.80}/retab/types/documents/__init__.py

@@ -1,5 +1,6 @@
 from .parse import ParseRequest, ParseResult, RetabUsage
 from .split import Category, SplitRequest, SplitResult, SplitResponse
+from .classify import ClassifyRequest, ClassifyResult, ClassifyResponse
 
 __all__ = [
     "ParseRequest",
@@ -9,4 +10,7 @@ __all__ = [
     "SplitRequest",
     "SplitResult",
     "SplitResponse",
+    "ClassifyRequest",
+    "ClassifyResult",
+    "ClassifyResponse",
 ]

retab-0.0.80/retab/types/documents/classify.py

@@ -0,0 +1,31 @@
+from pydantic import BaseModel, Field
+from ..mime import MIMEData
+from .split import Category
+
+
+class ClassifyRequest(BaseModel):
+    document: MIMEData = Field(..., description="The document to classify")
+    categories: list[Category] = Field(..., description="The categories to classify the document into")
+    model: str = Field(default="retab-small", description="The model to use for classification")
+
+
+class ClassifyResult(BaseModel):
+    reasoning: str = Field(..., description="The reasoning for the classification decision")
+    classification: str = Field(..., description="The category name that the document belongs to")
+
+
+class ClassifyResponse(BaseModel):
+    result: ClassifyResult = Field(..., description="The classification result with reasoning")
+
+
+class ClassifyOutputSchema(BaseModel):
+    """Schema for LLM structured output."""
+    reasoning: str = Field(
+        ...,
+        description="Step-by-step reasoning explaining why this document belongs to the chosen category"
+    )
+    classification: str = Field(
+        ...,
+        description="The category name that this document belongs to"
+    )
+
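
Since these are ordinary pydantic v2 models, a classification response body can be validated directly; a small sketch using only the fields defined above (the payload values are illustrative):

```python
from retab.types.documents.classify import ClassifyResponse

payload = {
    "result": {
        "reasoning": "The document lists line items, a total, and a due date.",
        "classification": "invoice",
    }
}

response = ClassifyResponse.model_validate(payload)
print(response.result.classification)  # -> invoice
```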

{retab-0.0.79 → retab-0.0.80}/retab/types/documents/extract.py

@@ -155,6 +155,9 @@ class RetabParsedChoiceDeltaChunk(ChoiceDeltaChunk):
     flat_deleted_keys: list[str] = []
     is_valid_json: bool = False
     key_mapping: dict[str, Optional[str]] | None = Field(default=None, description="Mapping of consensus keys to original model keys")
+    # Full parsed object from the LLM (when available). Used to avoid data corruption
+    # from unflatten_dict when null values are not transmitted in streaming deltas.
+    full_parsed: dict[str, Any] | None = Field(default=None, description="Complete parsed object from LLM, used instead of unflatten_dict when available")
 
 
 class RetabParsedChoiceChunk(ChoiceChunk):
@@ -183,6 +186,7 @@ class RetabParsedChatCompletionChunk(StreamingBaseModel, ChatCompletionChunk):
                 flat_parsed={},
                 flat_likelihoods={},
                 is_valid_json=False,
+                full_parsed=None,
             )
 
         max_choices = max(len(self.choices), len(previous_cumulated_chunk.choices)) if previous_cumulated_chunk is not None else len(self.choices)
@@ -201,6 +205,8 @@ class RetabParsedChatCompletionChunk(StreamingBaseModel, ChatCompletionChunk):
         acc_flat_parsed = [safe_get_delta(previous_cumulated_chunk, i).flat_parsed | safe_get_delta(self, i).flat_parsed for i in range(max_choices)]
         acc_flat_likelihoods = [safe_get_delta(previous_cumulated_chunk, i).flat_likelihoods | safe_get_delta(self, i).flat_likelihoods for i in range(max_choices)]
         acc_key_mapping = [safe_get_delta(previous_cumulated_chunk, i).key_mapping or safe_get_delta(self, i).key_mapping for i in range(max_choices)]
+        # Preserve full_parsed: use the current chunk's full_parsed if available, otherwise keep the previous one
+        acc_full_parsed = [safe_get_delta(self, i).full_parsed or safe_get_delta(previous_cumulated_chunk, i).full_parsed for i in range(max_choices)]
 
         acc_content = [(safe_get_delta(previous_cumulated_chunk, i).content or "") + (safe_get_delta(self, i).content or "") for i in range(max_choices)]
 
@@ -219,6 +225,7 @@ class RetabParsedChatCompletionChunk(StreamingBaseModel, ChatCompletionChunk):
                     flat_deleted_keys=acc_flat_deleted_keys[i],
                     is_valid_json=acc_is_valid_json[i],
                     key_mapping=acc_key_mapping[i],
+                    full_parsed=acc_full_parsed[i],
                 ),
                 index=i,
             )
@@ -238,7 +245,18 @@ class RetabParsedChatCompletionChunk(StreamingBaseModel, ChatCompletionChunk):
         if override_final_flat_parseds is None:
             override_final_flat_parseds = [self.choices[idx].delta.flat_parsed for idx in range(len(self.choices))]
 
-        final_parsed_list
+        # Build final_parsed_list using full_parsed when available (correct data from LLM),
+        # falling back to unflatten_dict for backward compatibility
+        final_parsed_list = []
+        for idx in range(len(self.choices)):
+            full_parsed = self.choices[idx].delta.full_parsed
+            if full_parsed is not None:
+                # Use the complete parsed object from the LLM (avoids data corruption from unflatten_dict)
+                final_parsed_list.append(full_parsed)
+            else:
+                # Fallback: reconstruct from flat_parsed (may lose null values in sparse arrays)
+                final_parsed_list.append(unflatten_dict(override_final_flat_parseds[idx]))
+
         final_content_list = [json.dumps(final_parsed_list[idx]) for idx in range(len(self.choices))]
 
         # The final likelihoods are only on the first choice.
@@ -264,7 +282,7 @@ class RetabParsedChatCompletionChunk(StreamingBaseModel, ChatCompletionChunk):
                     role="assistant",
                     parsed=final_parsed_list[idx],
                 ),
-                key_mapping=self.choices[idx].delta.key_mapping,
+                key_mapping=self.choices[idx].delta.key_mapping,  # type: ignore[call-arg]
                 finish_reason="stop",
                 logprobs=None,
             )
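
The motivation for `full_parsed` is easiest to see with a toy comparison: when a streaming delta omits null-valued keys, rebuilding the object from the flattened keys silently drops them, while the complete parsed object keeps them. The sketch below mirrors the new preference logic with a simplified stand-in for `unflatten_dict` (the real helper in the package is not reproduced here):

```python
def toy_unflatten(flat: dict) -> dict:
    """Simplified stand-in for unflatten_dict: split dotted keys into nested dicts."""
    out: dict = {}
    for key, value in flat.items():
        node = out
        parts = key.split(".")
        for part in parts[:-1]:
            node = node.setdefault(part, {})
        node[parts[-1]] = value
    return out


# Deltas that never transmitted the null field "customer.vat_id":
flat_parsed = {"customer.name": "ACME", "total": 120.5}
full_parsed = {"customer": {"name": "ACME", "vat_id": None}, "total": 120.5}

# Same preference as the new code path: use full_parsed when it is available.
final = full_parsed if full_parsed is not None else toy_unflatten(flat_parsed)
print(final["customer"])  # {'name': 'ACME', 'vat_id': None} (the explicit null survives)
```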

retab-0.0.80/retab/types/workflows/model.py

@@ -0,0 +1,76 @@
+import datetime
+from typing import Any, Dict, List, Literal, Optional
+
+from pydantic import BaseModel, Field, ConfigDict
+
+
+class StepIOReference(BaseModel):
+    """Reference to step input/output stored in GCS"""
+    file_id: Optional[str] = Field(default=None, description="File ID for document storage lookup")
+    gcs_path: Optional[str] = Field(default=None, description="GCS path to the stored file")
+    filename: Optional[str] = Field(default=None, description="Original filename")
+    mime_type: Optional[str] = Field(default=None, description="MIME type of the file")
+
+
+class HandlePayload(BaseModel):
+    """
+    Payload for a single output handle.
+
+    Each output handle on a node produces a typed payload that can be:
+    - file: A document reference (PDF, image, etc.)
+    - json: Structured JSON data (extracted data, etc.)
+    - text: Plain text content
+    """
+    type: Literal["file", "json", "text"] = Field(..., description="Type of payload")
+    document: Optional[StepIOReference] = Field(default=None, description="For file handles: document reference")
+    data: Optional[dict] = Field(default=None, description="For JSON handles: structured data")
+    text: Optional[str] = Field(default=None, description="For text handles: text content")
+
+
+NodeType = Literal["start", "extract", "split", "end", "hil"]
+
+
+class StepStatus(BaseModel):
+    """Status of a single step in workflow execution"""
+    node_id: str = Field(..., description="ID of the node")
+    node_type: NodeType = Field(..., description="Type of the node")
+    node_label: str = Field(..., description="Label of the node")
+    status: Literal["pending", "running", "completed", "error", "waiting_for_human"] = Field(..., description="Current status")
+    started_at: Optional[datetime.datetime] = Field(default=None, description="When the step started")
+    completed_at: Optional[datetime.datetime] = Field(default=None, description="When the step completed")
+    duration_ms: Optional[int] = Field(default=None, description="Duration in milliseconds")
+    error: Optional[str] = Field(default=None, description="Error message if failed")
+    output: Optional[dict] = Field(default=None, description="Output data from the step")
+    handle_outputs: Optional[Dict[str, HandlePayload]] = Field(
+        default=None,
+        description="Output payloads keyed by handle ID (e.g., 'output-file-0', 'output-json-0')"
+    )
+    input_document: Optional[StepIOReference] = Field(default=None, description="Reference to input document")
+    output_document: Optional[StepIOReference] = Field(default=None, description="Reference to output document")
+    split_documents: Optional[Dict[str, StepIOReference]] = Field(default=None, description="For split nodes: category -> document reference")
+    requires_human_review: Optional[bool] = Field(default=None, description="Whether this step requires human review")
+    human_reviewed_at: Optional[datetime.datetime] = Field(default=None, description="When human review was completed")
+    human_review_approved: Optional[bool] = Field(default=None, description="Whether human approved or rejected")
+
+
+class WorkflowRun(BaseModel):
+    """A stored workflow run record"""
+    model_config = ConfigDict(extra="ignore")
+
+    id: str = Field(..., description="Unique ID for this run")
+    workflow_id: str = Field(..., description="ID of the workflow that was run")
+    workflow_name: str = Field(..., description="Name of the workflow at time of execution")
+    organization_id: str = Field(..., description="Organization that owns this run")
+    status: Literal["pending", "running", "completed", "error", "waiting_for_human"] = Field(default="pending", description="Overall status")
+    started_at: datetime.datetime = Field(..., description="When the workflow started")
+    completed_at: Optional[datetime.datetime] = Field(default=None, description="When the workflow completed")
+    duration_ms: Optional[int] = Field(default=None, description="Total duration in milliseconds")
+    steps: List[StepStatus] = Field(default_factory=list, description="Status of each step")
+    input_documents: Optional[Dict[str, StepIOReference]] = Field(default=None, description="Start node ID -> input document reference")
+    final_outputs: Optional[dict] = Field(default=None, description="Final outputs from end nodes")
+    error: Optional[str] = Field(default=None, description="Error message if workflow failed")
+    created_at: datetime.datetime = Field(..., description="When the run was created")
+    updated_at: datetime.datetime = Field(..., description="When the run was last updated")
+    waiting_for_node_ids: List[str] = Field(default_factory=list, description="Node IDs that are waiting for human review")
+    pending_node_outputs: Optional[dict] = Field(default=None, description="Serialized node outputs to resume from")
+
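
Because `WorkflowRun` is configured with `extra="ignore"`, a stored run document validates cleanly even if the backend adds fields later. A sketch that checks which steps are waiting on review, using an illustrative payload built only from the fields defined above:

```python
from retab.types.workflows import WorkflowRun

payload = {
    "id": "run_123",
    "workflow_id": "wf_abc123",
    "workflow_name": "Invoice pipeline",
    "organization_id": "org_1",
    "status": "waiting_for_human",
    "started_at": "2024-01-01T00:00:00Z",
    "created_at": "2024-01-01T00:00:00Z",
    "updated_at": "2024-01-01T00:00:05Z",
    "waiting_for_node_ids": ["hil-1"],
    "steps": [
        {
            "node_id": "hil-1",
            "node_type": "hil",
            "node_label": "Review extraction",
            "status": "waiting_for_human",
            "requires_human_review": True,
        }
    ],
    "some_future_field": "dropped by extra='ignore'",
}

run = WorkflowRun.model_validate(payload)
waiting = [step.node_label for step in run.steps if step.status == "waiting_for_human"]
print(run.status, waiting)  # waiting_for_human ['Review extraction']
```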

{retab-0.0.79 → retab-0.0.80}/retab/utils/json_schema.py

@@ -158,23 +158,43 @@ def expand_refs(schema: dict[str, Any], definitions: dict[str, dict[str, Any]] |
         print("Cyclic refs found, keeping it as is")
         return schema
 
+    # Support both $defs (draft 2019-09+) and definitions (draft-07)
     if definitions is None:
-        definitions = schema.pop("$defs", {})
+        definitions = schema.pop("$defs", None) or schema.pop("definitions", {})
 
     assert isinstance(definitions, dict)
 
-
-
-
-
-
+    # Handle allOf - merge all elements
+    if "allOf" in schema and isinstance(schema["allOf"], list) and len(schema["allOf"]) > 0:
+        all_of_elements = schema.pop("allOf")
+        for element in all_of_elements:
+            if isinstance(element, dict):
+                # Recursively expand refs in each allOf element first
+                expanded = expand_refs(element, definitions)
+                # Deep merge properties if both have them
+                if "properties" in expanded and "properties" in schema:
+                    schema["properties"] = {**schema["properties"], **expanded["properties"]}
+                    del expanded["properties"]
+                # Merge required arrays if both have them
+                if "required" in expanded and "required" in schema:
+                    schema["required"] = list(set(schema["required"] + expanded["required"]))
+                    del expanded["required"]
+                schema.update(expanded)
 
     if "$ref" in schema:
         ref: str = schema["$ref"]
+        def_name: str | None = None
+
+        # Support both #/$defs/ and #/definitions/ formats
         if ref.startswith("#/$defs/"):
             def_name = ref.removeprefix("#/$defs/")
+        elif ref.startswith("#/definitions/"):
+            def_name = ref.removeprefix("#/definitions/")
+
+        if def_name is not None:
             if def_name not in definitions:
-
+                # Return schema as-is if reference not found (might be external)
+                return schema
             target = definitions[def_name]
             merged = merge_descriptions(schema, target)
             merged.pop("$ref", None)
@@ -184,7 +204,8 @@ def expand_refs(schema: dict[str, Any], definitions: dict[str, dict[str, Any]] |
 
     result: dict[str, Any] = {}
     for annotation, subschema in schema.items():
-
+        # Handle properties, $defs, and definitions (draft-07) keys
+        if annotation in ["properties", "$defs", "definitions"]:
             if isinstance(subschema, dict):
                 new_dict = {}
                 for pk, pv in subschema.items():
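
With these branches, `expand_refs` now resolves draft-07 style `#/definitions/...` references and folds `allOf` members into the parent schema, merging `properties` and taking the union of `required`. A small sketch, assuming the function is importable from `retab.utils.json_schema` as the file path suggests:

```python
from retab.utils.json_schema import expand_refs

schema = {
    "type": "object",
    "allOf": [{"$ref": "#/definitions/Address"}],
    "properties": {"name": {"type": "string"}},
    "required": ["name"],
    "definitions": {
        "Address": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        }
    },
}

expanded = expand_refs(schema)
# After expansion, the allOf member is merged: properties hold both "name" and "city",
# and "required" is the union of the two lists.
print(sorted(expanded["properties"]), sorted(expanded["required"]))
```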

{retab-0.0.79 → retab-0.0.80}/retab.egg-info/SOURCES.txt

@@ -20,6 +20,8 @@ retab/resources/extractions/__init__.py
 retab/resources/extractions/client.py
 retab/resources/projects/__init__.py
 retab/resources/projects/client.py
+retab/resources/workflows/__init__.py
+retab/resources/workflows/client.py
 retab/types/__init__.py
 retab/types/chat.py
 retab/types/inference_settings.py
@@ -28,6 +30,7 @@ retab/types/modality.py
 retab/types/pagination.py
 retab/types/standards.py
 retab/types/documents/__init__.py
+retab/types/documents/classify.py
 retab/types/documents/correct_orientation.py
 retab/types/documents/create_messages.py
 retab/types/documents/edit.py
@@ -46,6 +49,8 @@ retab/types/schemas/generate.py
 retab/types/schemas/layout.py
 retab/types/schemas/model.py
 retab/types/schemas/templates.py
+retab/types/workflows/__init__.py
+retab/types/workflows/model.py
 retab/utils/__init__.py
 retab/utils/display.py
 retab/utils/hashing.py

All remaining files (listed above with +0 -0) are unchanged between 0.0.79 and 0.0.80.