retab-0.0.79-py3-none-any.whl → retab-0.0.81-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,6 +17,7 @@ from ...types.documents.edit import EditRequest, EditResponse
  from ...types.documents.extract import DocumentExtractRequest, RetabParsedChatCompletion, RetabParsedChatCompletionChunk, RetabParsedChoice, maybe_parse_to_pydantic
  from ...types.documents.parse import ParseRequest, ParseResult, TableParsingFormat
  from ...types.documents.split import Category, SplitRequest, SplitResponse
+ from ...types.documents.classify import ClassifyRequest, ClassifyResponse
  from ...types.mime import MIMEData
  from ...types.standards import PreparedRequest, FieldUnset
  from ...utils.json_schema import load_json_schema, unflatten_dict
@@ -172,6 +173,34 @@ class BaseDocumentsMixin:
          split_request = SplitRequest(**request_dict)
          return PreparedRequest(method="POST", url="/v1/documents/split", data=split_request.model_dump(mode="json", exclude_unset=True))

+     def _prepare_classify(
+         self,
+         document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
+         categories: list[Category] | list[dict[str, str]],
+         model: str,
+         **extra_body: Any,
+     ) -> PreparedRequest:
+         mime_document = prepare_mime_document(document)
+
+         # Convert dict categories to Category objects if needed
+         category_objects = [
+             Category(**cat) if isinstance(cat, dict) else cat
+             for cat in categories
+         ]
+
+         request_dict: dict[str, Any] = {
+             "document": mime_document,
+             "categories": category_objects,
+             "model": model,
+         }
+
+         # Merge any extra fields provided by the caller
+         if extra_body:
+             request_dict.update(extra_body)
+
+         classify_request = ClassifyRequest(**request_dict)
+         return PreparedRequest(method="POST", url="/v1/documents/classify", data=classify_request.model_dump(mode="json", exclude_unset=True))
+
      def _prepare_extract(
          self,
          json_schema: dict[str, Any] | Path | str,
@@ -662,6 +691,57 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
          response = self._client._prepared_request(request)
          return SplitResponse.model_validate(response)

+     def classify(
+         self,
+         document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
+         categories: list[Category] | list[dict[str, str]],
+         model: str,
+         **extra_body: Any,
+     ) -> ClassifyResponse:
+         """
+         Classify a document into one of the provided categories.
+
+         This method analyzes a document and classifies it into exactly one
+         of the user-defined categories, returning the classification with
+         chain-of-thought reasoning explaining the decision.
+
+         Args:
+             document: The document to classify. Can be a file path (Path or str), file-like object, MIMEData, PIL Image, or URL.
+             categories: List of categories to classify the document into. Each category should have a 'name' and 'description'.
+                 Can be Category objects or dicts with 'name' and 'description' keys.
+             model: The AI model to use for document classification (e.g., "gemini-2.5-flash").
+
+         Returns:
+             ClassifyResponse: Response containing:
+                 - result: ClassifyResult with reasoning and classification.
+
+         Raises:
+             HTTPException: If the request fails.
+
+         Example:
+             ```python
+             response = retab.documents.classify(
+                 document="invoice.pdf",
+                 model="gemini-2.5-flash",
+                 categories=[
+                     {"name": "invoice", "description": "Invoice documents with billing information"},
+                     {"name": "receipt", "description": "Receipt documents for payments"},
+                     {"name": "contract", "description": "Legal contract documents"},
+                 ]
+             )
+             print(f"Classification: {response.result.classification}")
+             print(f"Reasoning: {response.result.reasoning}")
+             ```
+         """
+         request = self._prepare_classify(
+             document=document,
+             categories=categories,
+             model=model,
+             **extra_body,
+         )
+         response = self._client._prepared_request(request)
+         return ClassifyResponse.model_validate(response)
+

  class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
      """Documents API wrapper for asynchronous usage."""
@@ -1005,3 +1085,54 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
          )
          response = await self._client._prepared_request(request)
          return SplitResponse.model_validate(response)
+
+     async def classify(
+         self,
+         document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
+         categories: list[Category] | list[dict[str, str]],
+         model: str,
+         **extra_body: Any,
+     ) -> ClassifyResponse:
+         """
+         Classify a document into one of the provided categories asynchronously.
+
+         This method analyzes a document and classifies it into exactly one
+         of the user-defined categories, returning the classification with
+         chain-of-thought reasoning explaining the decision.
+
+         Args:
+             document: The document to classify. Can be a file path (Path or str), file-like object, MIMEData, PIL Image, or URL.
+             categories: List of categories to classify the document into. Each category should have a 'name' and 'description'.
+                 Can be Category objects or dicts with 'name' and 'description' keys.
+             model: The AI model to use for document classification (e.g., "gemini-2.5-flash").
+
+         Returns:
+             ClassifyResponse: Response containing:
+                 - result: ClassifyResult with reasoning and classification.
+
+         Raises:
+             HTTPException: If the request fails.
+
+         Example:
+             ```python
+             response = await retab.documents.classify(
+                 document="invoice.pdf",
+                 model="gemini-2.5-flash",
+                 categories=[
+                     {"name": "invoice", "description": "Invoice documents with billing information"},
+                     {"name": "receipt", "description": "Receipt documents for payments"},
+                     {"name": "contract", "description": "Legal contract documents"},
+                 ]
+             )
+             print(f"Classification: {response.result.classification}")
+             print(f"Reasoning: {response.result.reasoning}")
+             ```
+         """
+         request = self._prepare_classify(
+             document=document,
+             categories=categories,
+             model=model,
+             **extra_body,
+         )
+         response = await self._client._prepared_request(request)
+         return ClassifyResponse.model_validate(response)
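The docstring examples above pass categories as plain dicts; `_prepare_classify` also accepts `Category` objects and forwards them unchanged. A minimal sketch of that variant, assuming the SDK's `Retab` entry point exposes this resource as `client.documents` (the client construction itself is outside this diff):

```python
from retab import Retab  # assumed entry point; adjust to how your client is constructed
from retab.types.documents.split import Category

client = Retab()

# Same call as the dict-based docstring example, but with Category models.
response = client.documents.classify(
    document="invoice.pdf",
    model="gemini-2.5-flash",
    categories=[
        Category(name="invoice", description="Invoice documents with billing information"),
        Category(name="receipt", description="Receipt documents for payments"),
    ],
)
print(response.result.classification)
print(response.result.reasoning)
```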
@@ -0,0 +1,3 @@
+ from .client import AsyncWorkflows, Workflows
+
+ __all__ = ["Workflows", "AsyncWorkflows"]
@@ -0,0 +1,190 @@
+ from io import IOBase
+ from pathlib import Path
+ from typing import Any, Dict
+
+ import PIL.Image
+ from pydantic import HttpUrl
+
+ from ..._resource import AsyncAPIResource, SyncAPIResource
+ from ...utils.mime import MIMEData, prepare_mime_document
+ from ...types.standards import PreparedRequest
+ from ...types.workflows import WorkflowRun
+
+
+ # Type alias for document inputs
+ DocumentInput = Path | str | bytes | IOBase | MIMEData | PIL.Image.Image | HttpUrl
+
+
+ class WorkflowsMixin:
+     """Mixin providing shared methods for workflow operations."""
+
+     def prepare_run(
+         self,
+         workflow_id: str,
+         documents: Dict[str, DocumentInput],
+     ) -> PreparedRequest:
+         """Prepare a request to run a workflow with input documents.
+
+         Args:
+             workflow_id: The ID of the workflow to run
+             documents: Mapping of start node IDs to their input documents.
+                 Each document can be a file path, bytes, file-like object,
+                 MIMEData, PIL Image, or HttpUrl.
+
+         Returns:
+             PreparedRequest: The prepared request
+
+         Example:
+             >>> client.workflows.run(
+             ...     workflow_id="wf_abc123",
+             ...     documents={
+             ...         "start-node-1": Path("invoice.pdf"),
+             ...         "start-node-2": Path("receipt.pdf"),
+             ...     }
+             ... )
+         """
+         # Convert each document to MIMEData and then to the format expected by the backend
+         documents_payload: Dict[str, Dict[str, Any]] = {}
+         for node_id, document in documents.items():
+             mime_data = prepare_mime_document(document)
+             documents_payload[node_id] = {
+                 "filename": mime_data.filename,
+                 "content": mime_data.content,
+                 "mime_type": mime_data.mime_type,
+             }
+
+         data = {"documents": documents_payload}
+         return PreparedRequest(method="POST", url=f"/v1/workflows/{workflow_id}/run", data=data)
+
+     def prepare_get_run(self, run_id: str) -> PreparedRequest:
+         """Prepare a request to get a workflow run by ID.
+
+         Args:
+             run_id: The ID of the workflow run to retrieve
+
+         Returns:
+             PreparedRequest: The prepared request
+         """
+         return PreparedRequest(method="GET", url=f"/v1/workflows/runs/{run_id}")
+
+
+ class Workflows(SyncAPIResource, WorkflowsMixin):
+     """Workflows API wrapper for synchronous operations."""
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+     def run(
+         self,
+         workflow_id: str,
+         documents: Dict[str, DocumentInput],
+     ) -> WorkflowRun:
+         """Run a workflow with the provided input documents.
+
+         This creates a workflow run and starts execution in the background.
+         The returned WorkflowRun will have status "running" - use get_run()
+         to check for updates on the run status.
+
+         Args:
+             workflow_id: The ID of the workflow to run
+             documents: Mapping of start node IDs to their input documents.
+                 Each document can be a file path, bytes, file-like object,
+                 MIMEData, PIL Image, or HttpUrl.
+
+         Returns:
+             WorkflowRun: The created workflow run with status "running"
+
+         Raises:
+             HTTPException: If the request fails (e.g., workflow not found,
+                 missing input documents for start nodes)
+
+         Example:
+             >>> run = client.workflows.run(
+             ...     workflow_id="wf_abc123",
+             ...     documents={
+             ...         "start-node-1": Path("invoice.pdf"),
+             ...         "start-node-2": Path("receipt.pdf"),
+             ...     }
+             ... )
+             >>> print(f"Run started: {run.id}, status: {run.status}")
+         """
+         request = self.prepare_run(workflow_id=workflow_id, documents=documents)
+         response = self._client._prepared_request(request)
+         return WorkflowRun.model_validate(response)
+
+     def get_run(self, run_id: str) -> WorkflowRun:
+         """Get a workflow run by ID.
+
+         Args:
+             run_id: The ID of the workflow run to retrieve
+
+         Returns:
+             WorkflowRun: The workflow run
+
+         Raises:
+             HTTPException: If the request fails (e.g., run not found)
+         """
+         request = self.prepare_get_run(run_id)
+         response = self._client._prepared_request(request)
+         return WorkflowRun.model_validate(response)
+
+
+ class AsyncWorkflows(AsyncAPIResource, WorkflowsMixin):
+     """Workflows API wrapper for asynchronous operations."""
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+     async def run(
+         self,
+         workflow_id: str,
+         documents: Dict[str, DocumentInput],
+     ) -> WorkflowRun:
+         """Run a workflow with the provided input documents.
+
+         This creates a workflow run and starts execution in the background.
+         The returned WorkflowRun will have status "running" - use get_run()
+         to check for updates on the run status.
+
+         Args:
+             workflow_id: The ID of the workflow to run
+             documents: Mapping of start node IDs to their input documents.
+                 Each document can be a file path, bytes, file-like object,
+                 MIMEData, PIL Image, or HttpUrl.
+
+         Returns:
+             WorkflowRun: The created workflow run with status "running"
+
+         Raises:
+             HTTPException: If the request fails (e.g., workflow not found,
+                 missing input documents for start nodes)
+
+         Example:
+             >>> run = await client.workflows.run(
+             ...     workflow_id="wf_abc123",
+             ...     documents={
+             ...         "start-node-1": Path("invoice.pdf"),
+             ...         "start-node-2": Path("receipt.pdf"),
+             ...     }
+             ... )
+             >>> print(f"Run started: {run.id}, status: {run.status}")
+         """
+         request = self.prepare_run(workflow_id=workflow_id, documents=documents)
+         response = await self._client._prepared_request(request)
+         return WorkflowRun.model_validate(response)
+
+     async def get_run(self, run_id: str) -> WorkflowRun:
+         """Get a workflow run by ID.
+
+         Args:
+             run_id: The ID of the workflow run to retrieve
+
+         Returns:
+             WorkflowRun: The workflow run
+
+         Raises:
+             HTTPException: If the request fails (e.g., run not found)
+         """
+         request = self.prepare_get_run(run_id)
+         response = await self._client._prepared_request(request)
+         return WorkflowRun.model_validate(response)
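Since `run()` returns as soon as the run is created with status "running", callers typically poll `get_run()` until the run settles. A minimal polling sketch, assuming the resource is wired onto the client as `client.workflows` and that `Retab` is the SDK's entry point (both outside this diff); the two-second interval and node IDs are illustrative:

```python
import time
from pathlib import Path

from retab import Retab  # assumed entry point

client = Retab()

run = client.workflows.run(
    workflow_id="wf_abc123",
    documents={"start-node-1": Path("invoice.pdf")},
)

# Poll until the run leaves the pending/running states
# (it ends up completed, error, or waiting_for_human).
while run.status in ("pending", "running"):
    time.sleep(2)
    run = client.workflows.get_run(run.id)

print(f"Run {run.id} finished with status {run.status} in {run.duration_ms} ms")
for step in run.steps:
    print(f"  {step.node_label}: {step.status}" + (f" ({step.error})" if step.error else ""))
```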
@@ -1,5 +1,6 @@
  from .parse import ParseRequest, ParseResult, RetabUsage
  from .split import Category, SplitRequest, SplitResult, SplitResponse
+ from .classify import ClassifyRequest, ClassifyResult, ClassifyResponse

  __all__ = [
      "ParseRequest",
@@ -9,4 +10,7 @@ __all__ = [
      "SplitRequest",
      "SplitResult",
      "SplitResponse",
+     "ClassifyRequest",
+     "ClassifyResult",
+     "ClassifyResponse",
  ]
@@ -0,0 +1,31 @@
+ from pydantic import BaseModel, Field
+ from ..mime import MIMEData
+ from .split import Category
+
+
+ class ClassifyRequest(BaseModel):
+     document: MIMEData = Field(..., description="The document to classify")
+     categories: list[Category] = Field(..., description="The categories to classify the document into")
+     model: str = Field(default="retab-small", description="The model to use for classification")
+
+
+ class ClassifyResult(BaseModel):
+     reasoning: str = Field(..., description="The reasoning for the classification decision")
+     classification: str = Field(..., description="The category name that the document belongs to")
+
+
+ class ClassifyResponse(BaseModel):
+     result: ClassifyResult = Field(..., description="The classification result with reasoning")
+
+
+ class ClassifyOutputSchema(BaseModel):
+     """Schema for LLM structured output."""
+     reasoning: str = Field(
+         ...,
+         description="Step-by-step reasoning explaining why this document belongs to the chosen category"
+     )
+     classification: str = Field(
+         ...,
+         description="The category name that this document belongs to"
+     )
+
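For reference, this is the payload shape `_prepare_classify` builds from these models before POSTing to `/v1/documents/classify`. A minimal sketch; `prepare_mime_document` is the same helper the resource classes use, and the file name and categories are illustrative:

```python
from retab.types.documents.classify import ClassifyRequest
from retab.types.documents.split import Category
from retab.utils.mime import prepare_mime_document

request = ClassifyRequest(
    document=prepare_mime_document("invoice.pdf"),
    categories=[
        Category(name="invoice", description="Invoice documents with billing information"),
        Category(name="receipt", description="Receipt documents for payments"),
    ],
    model="gemini-2.5-flash",
)

# _prepare_classify serializes the request exactly like this before sending it.
payload = request.model_dump(mode="json", exclude_unset=True)
```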
@@ -155,6 +155,9 @@ class RetabParsedChoiceDeltaChunk(ChoiceDeltaChunk):
      flat_deleted_keys: list[str] = []
      is_valid_json: bool = False
      key_mapping: dict[str, Optional[str]] | None = Field(default=None, description="Mapping of consensus keys to original model keys")
+     # Full parsed object from the LLM (when available). Used to avoid data corruption
+     # from unflatten_dict when null values are not transmitted in streaming deltas.
+     full_parsed: dict[str, Any] | None = Field(default=None, description="Complete parsed object from LLM, used instead of unflatten_dict when available")


  class RetabParsedChoiceChunk(ChoiceChunk):
@@ -183,6 +186,7 @@ class RetabParsedChatCompletionChunk(StreamingBaseModel, ChatCompletionChunk):
                  flat_parsed={},
                  flat_likelihoods={},
                  is_valid_json=False,
+                 full_parsed=None,
              )

          max_choices = max(len(self.choices), len(previous_cumulated_chunk.choices)) if previous_cumulated_chunk is not None else len(self.choices)
@@ -201,6 +205,8 @@ class RetabParsedChatCompletionChunk(StreamingBaseModel, ChatCompletionChunk):
          acc_flat_parsed = [safe_get_delta(previous_cumulated_chunk, i).flat_parsed | safe_get_delta(self, i).flat_parsed for i in range(max_choices)]
          acc_flat_likelihoods = [safe_get_delta(previous_cumulated_chunk, i).flat_likelihoods | safe_get_delta(self, i).flat_likelihoods for i in range(max_choices)]
          acc_key_mapping = [safe_get_delta(previous_cumulated_chunk, i).key_mapping or safe_get_delta(self, i).key_mapping for i in range(max_choices)]
+         # Preserve full_parsed: use the current chunk's full_parsed if available, otherwise keep the previous one
+         acc_full_parsed = [safe_get_delta(self, i).full_parsed or safe_get_delta(previous_cumulated_chunk, i).full_parsed for i in range(max_choices)]

          acc_content = [(safe_get_delta(previous_cumulated_chunk, i).content or "") + (safe_get_delta(self, i).content or "") for i in range(max_choices)]

@@ -219,6 +225,7 @@ class RetabParsedChatCompletionChunk(StreamingBaseModel, ChatCompletionChunk):
                      flat_deleted_keys=acc_flat_deleted_keys[i],
                      is_valid_json=acc_is_valid_json[i],
                      key_mapping=acc_key_mapping[i],
+                     full_parsed=acc_full_parsed[i],
                  ),
                  index=i,
              )
@@ -238,7 +245,18 @@ class RetabParsedChatCompletionChunk(StreamingBaseModel, ChatCompletionChunk):
          if override_final_flat_parseds is None:
              override_final_flat_parseds = [self.choices[idx].delta.flat_parsed for idx in range(len(self.choices))]

-         final_parsed_list = [unflatten_dict(override_final_flat_parseds[idx]) for idx in range(len(self.choices))]
+         # Build final_parsed_list using full_parsed when available (correct data from LLM),
+         # falling back to unflatten_dict for backward compatibility
+         final_parsed_list = []
+         for idx in range(len(self.choices)):
+             full_parsed = self.choices[idx].delta.full_parsed
+             if full_parsed is not None:
+                 # Use the complete parsed object from the LLM (avoids data corruption from unflatten_dict)
+                 final_parsed_list.append(full_parsed)
+             else:
+                 # Fallback: reconstruct from flat_parsed (may lose null values in sparse arrays)
+                 final_parsed_list.append(unflatten_dict(override_final_flat_parseds[idx]))
+
          final_content_list = [json.dumps(final_parsed_list[idx]) for idx in range(len(self.choices))]

          # The final likelihoods are only on the first choice.
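To make the motivation for `full_parsed` concrete: when null values are never streamed, rebuilding the object from flat keys cannot recover array slots that were only ever null. A toy illustration of the failure mode; this is not Retab's actual `unflatten_dict` or flat-key format, just a simplified stand-in:

```python
# What the model actually produced: a two-element array whose first slot is null.
original = {"items": [None, {"total": 12.5}]}

# If nulls are dropped from the streamed deltas, only this flat key ever arrives.
flat_deltas = {"items.1.total": 12.5}

def naive_unflatten(flat: dict) -> dict:
    """Rebuild nested dicts from dotted keys; a simplified stand-in for illustration."""
    result: dict = {}
    for key, value in flat.items():
        *parents, leaf = key.split(".")
        node = result
        for part in parents:
            node = node.setdefault(part, {})
        node[leaf] = value
    return result

# The reconstruction loses both the null slot and the list shape, which is why the
# accumulator now prefers delta.full_parsed whenever the server provides it.
print(naive_unflatten(flat_deltas))  # {'items': {'1': {'total': 12.5}}}
print(original)                      # {'items': [None, {'total': 12.5}]}
```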
@@ -264,7 +282,7 @@ class RetabParsedChatCompletionChunk(StreamingBaseModel, ChatCompletionChunk):
                      role="assistant",
                      parsed=final_parsed_list[idx],
                  ),
-                 key_mapping=self.choices[idx].delta.key_mapping,
+                 key_mapping=self.choices[idx].delta.key_mapping,  # type: ignore[call-arg]
                  finish_reason="stop",
                  logprobs=None,
              )
@@ -0,0 +1,11 @@
+ from .model import WorkflowRun, StepStatus, StepIOReference, HandlePayload, NodeType
+
+
+ __all__ = [
+     "WorkflowRun",
+     "StepStatus",
+     "StepIOReference",
+     "HandlePayload",
+     "NodeType",
+ ]
+
@@ -0,0 +1,76 @@
+ import datetime
+ from typing import Any, Dict, List, Literal, Optional
+
+ from pydantic import BaseModel, Field, ConfigDict
+
+
+ class StepIOReference(BaseModel):
+     """Reference to step input/output stored in GCS"""
+     file_id: Optional[str] = Field(default=None, description="File ID for document storage lookup")
+     gcs_path: Optional[str] = Field(default=None, description="GCS path to the stored file")
+     filename: Optional[str] = Field(default=None, description="Original filename")
+     mime_type: Optional[str] = Field(default=None, description="MIME type of the file")
+
+
+ class HandlePayload(BaseModel):
+     """
+     Payload for a single output handle.
+
+     Each output handle on a node produces a typed payload that can be:
+     - file: A document reference (PDF, image, etc.)
+     - json: Structured JSON data (extracted data, etc.)
+     - text: Plain text content
+     """
+     type: Literal["file", "json", "text"] = Field(..., description="Type of payload")
+     document: Optional[StepIOReference] = Field(default=None, description="For file handles: document reference")
+     data: Optional[dict] = Field(default=None, description="For JSON handles: structured data")
+     text: Optional[str] = Field(default=None, description="For text handles: text content")
+
+
+ NodeType = Literal["start", "extract", "split", "end", "hil"]
+
+
+ class StepStatus(BaseModel):
+     """Status of a single step in workflow execution"""
+     node_id: str = Field(..., description="ID of the node")
+     node_type: NodeType = Field(..., description="Type of the node")
+     node_label: str = Field(..., description="Label of the node")
+     status: Literal["pending", "running", "completed", "error", "waiting_for_human"] = Field(..., description="Current status")
+     started_at: Optional[datetime.datetime] = Field(default=None, description="When the step started")
+     completed_at: Optional[datetime.datetime] = Field(default=None, description="When the step completed")
+     duration_ms: Optional[int] = Field(default=None, description="Duration in milliseconds")
+     error: Optional[str] = Field(default=None, description="Error message if failed")
+     output: Optional[dict] = Field(default=None, description="Output data from the step")
+     handle_outputs: Optional[Dict[str, HandlePayload]] = Field(
+         default=None,
+         description="Output payloads keyed by handle ID (e.g., 'output-file-0', 'output-json-0')"
+     )
+     input_document: Optional[StepIOReference] = Field(default=None, description="Reference to input document")
+     output_document: Optional[StepIOReference] = Field(default=None, description="Reference to output document")
+     split_documents: Optional[Dict[str, StepIOReference]] = Field(default=None, description="For split nodes: category -> document reference")
+     requires_human_review: Optional[bool] = Field(default=None, description="Whether this step requires human review")
+     human_reviewed_at: Optional[datetime.datetime] = Field(default=None, description="When human review was completed")
+     human_review_approved: Optional[bool] = Field(default=None, description="Whether human approved or rejected")
+
+
+ class WorkflowRun(BaseModel):
+     """A stored workflow run record"""
+     model_config = ConfigDict(extra="ignore")
+
+     id: str = Field(..., description="Unique ID for this run")
+     workflow_id: str = Field(..., description="ID of the workflow that was run")
+     workflow_name: str = Field(..., description="Name of the workflow at time of execution")
+     organization_id: str = Field(..., description="Organization that owns this run")
+     status: Literal["pending", "running", "completed", "error", "waiting_for_human"] = Field(default="pending", description="Overall status")
+     started_at: datetime.datetime = Field(..., description="When the workflow started")
+     completed_at: Optional[datetime.datetime] = Field(default=None, description="When the workflow completed")
+     duration_ms: Optional[int] = Field(default=None, description="Total duration in milliseconds")
+     steps: List[StepStatus] = Field(default_factory=list, description="Status of each step")
+     input_documents: Optional[Dict[str, StepIOReference]] = Field(default=None, description="Start node ID -> input document reference")
+     final_outputs: Optional[dict] = Field(default=None, description="Final outputs from end nodes")
+     error: Optional[str] = Field(default=None, description="Error message if workflow failed")
+     created_at: datetime.datetime = Field(..., description="When the run was created")
+     updated_at: datetime.datetime = Field(..., description="When the run was last updated")
+     waiting_for_node_ids: List[str] = Field(default_factory=list, description="Node IDs that are waiting for human review")
+     pending_node_outputs: Optional[dict] = Field(default=None, description="Serialized node outputs to resume from")
+
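These models are plain Pydantic, so a run payload returned by the API can be validated and inspected directly. A small sketch with made-up values that satisfy the required fields above:

```python
from retab.types.workflows import WorkflowRun

payload = {
    "id": "run_001",
    "workflow_id": "wf_abc123",
    "workflow_name": "Invoice pipeline",
    "organization_id": "org_123",
    "status": "completed",
    "started_at": "2024-01-01T00:00:00Z",
    "completed_at": "2024-01-01T00:00:42Z",
    "duration_ms": 42000,
    "created_at": "2024-01-01T00:00:00Z",
    "updated_at": "2024-01-01T00:00:42Z",
    "steps": [
        {
            "node_id": "extract-1",
            "node_type": "extract",
            "node_label": "Extract invoice fields",
            "status": "completed",
            "handle_outputs": {
                "output-json-0": {"type": "json", "data": {"total": 12.5}},
            },
        }
    ],
}

run = WorkflowRun.model_validate(payload)
for step in run.steps:
    print(step.node_label, step.status)
    for handle_id, handle in (step.handle_outputs or {}).items():
        print("  ", handle_id, handle.type, handle.data)
```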
@@ -158,23 +158,43 @@ def expand_refs(schema: dict[str, Any], definitions: dict[str, dict[str, Any]] |
          print("Cyclic refs found, keeping it as is")
          return schema

+     # Support both $defs (draft 2019-09+) and definitions (draft-07)
      if definitions is None:
-         definitions = schema.pop("$defs", {})
+         definitions = schema.pop("$defs", None) or schema.pop("definitions", {})

      assert isinstance(definitions, dict)

-     if "allOf" in schema:
-         # Some schemas (notably the one converted from a pydantic model) have allOf. We only accept one element in allOf
-         if len(schema["allOf"]) != 1:
-             raise ValueError(f"Property schema must have a single element in 'allOf'. Found: {schema['allOf']}")
-         schema.update(schema.pop("allOf", [{}])[0])
+     # Handle allOf - merge all elements
+     if "allOf" in schema and isinstance(schema["allOf"], list) and len(schema["allOf"]) > 0:
+         all_of_elements = schema.pop("allOf")
+         for element in all_of_elements:
+             if isinstance(element, dict):
+                 # Recursively expand refs in each allOf element first
+                 expanded = expand_refs(element, definitions)
+                 # Deep merge properties if both have them
+                 if "properties" in expanded and "properties" in schema:
+                     schema["properties"] = {**schema["properties"], **expanded["properties"]}
+                     del expanded["properties"]
+                 # Merge required arrays if both have them
+                 if "required" in expanded and "required" in schema:
+                     schema["required"] = list(set(schema["required"] + expanded["required"]))
+                     del expanded["required"]
+                 schema.update(expanded)

      if "$ref" in schema:
          ref: str = schema["$ref"]
+         def_name: str | None = None
+
+         # Support both #/$defs/ and #/definitions/ formats
          if ref.startswith("#/$defs/"):
              def_name = ref.removeprefix("#/$defs/")
+         elif ref.startswith("#/definitions/"):
+             def_name = ref.removeprefix("#/definitions/")
+
+         if def_name is not None:
              if def_name not in definitions:
-                 raise ValueError(f"Reference {ref} not found in definitions.")
+                 # Return schema as-is if reference not found (might be external)
+                 return schema
              target = definitions[def_name]
              merged = merge_descriptions(schema, target)
              merged.pop("$ref", None)
@@ -184,7 +204,8 @@ def expand_refs(schema: dict[str, Any], definitions: dict[str, dict[str, Any]] |

      result: dict[str, Any] = {}
      for annotation, subschema in schema.items():
-         if annotation in ["properties", "$defs"]:
+         # Handle properties, $defs, and definitions (draft-07) keys
+         if annotation in ["properties", "$defs", "definitions"]:
              if isinstance(subschema, dict):
                  new_dict = {}
                  for pk, pv in subschema.items():
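The net effect of these two hunks is that `expand_refs` now accepts draft-07 style schemas (a `definitions` block with `#/definitions/...` refs) and merges multi-element `allOf` lists instead of rejecting them. A minimal sketch of the new behavior; the exact merged output also depends on `merge_descriptions` and the recursive property expansion, so the expectations in the comments are indicative rather than exhaustive:

```python
from retab.utils.json_schema import expand_refs

schema = {
    "type": "object",
    "allOf": [
        {"properties": {"name": {"type": "string"}}, "required": ["name"]},
        {"$ref": "#/definitions/Address"},
    ],
    "definitions": {
        "Address": {
            "properties": {"street": {"type": "string"}, "city": {"type": "string"}},
            "required": ["street"],
        }
    },
}

expanded = expand_refs(schema)

# The allOf elements are folded into the parent: their "properties" are combined and
# their "required" lists are unioned, so fields from both branches should be present.
print(sorted(expanded.get("properties", {})))  # expected to include city, name, street
print(sorted(expanded.get("required", [])))    # expected to include name and street
```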
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: retab
- Version: 0.0.79
+ Version: 0.0.81
  Summary: Retab official python library
  Home-page: https://github.com/retab-dev/retab
  Author: Retab
@@ -7,11 +7,13 @@ retab/resources/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  retab/resources/models.py,sha256=4WidFBnTGZEA65DSn2pLP2SRnCVXkMTw7o_m8xVCFC4,2469
  retab/resources/schemas.py,sha256=rZ6OzfmoYv-mGaRVzvXjO09dD-KxP74mZhOO8sMgcDQ,4632
  retab/resources/documents/__init__.py,sha256=OjXmngFN0RKqO4SI-mJBNzr6Ex6rMxfq0DxaqzP0RQs,89
- retab/resources/documents/client.py,sha256=XxWo9FlktrpuskAPyKWTx9UIA2VA81g0SbHjHYnigMM,43583
+ retab/resources/documents/client.py,sha256=E8v0aBF4-9ATYo5hkQ629OP5mm2AtodTzznlj2xRWtQ,49000
  retab/resources/extractions/__init__.py,sha256=2H1ezUG8hI5SmTRy6NFzXdYLOdGFFsFrI60uzkitV20,97
  retab/resources/extractions/client.py,sha256=sEoNjOgX91FTOgoJUV-I1A9A9xl1ciCdPlhYwjhEjbA,11035
  retab/resources/projects/__init__.py,sha256=tPR3_3tr7bsoYd618qmGjnYN2R23PmF5oCFd7Z5_HGY,85
  retab/resources/projects/client.py,sha256=5LPAhJt5-nqBP4VWYvo0k7cW6HLGF6K9xMiHKQzIXho,15593
+ retab/resources/workflows/__init__.py,sha256=-I0QNX7XKEr8ZJTV4-awMyKxZqGlSkKMdibiHiB7cZ0,89
+ retab/resources/workflows/client.py,sha256=svKOmkqB1-P56IjzauWNdfQtzT0rlWRIu3EddwX-HiM,6743
  retab/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  retab/types/chat.py,sha256=x9VbtPMa4w6Gc0HrFC3ILl6cCnfEn5ytDnwJtZmlcys,1436
  retab/types/inference_settings.py,sha256=wIivYffvEE7v6lhbjbhAZGssK4uYr64Oq6cZKxzY5_M,1131
@@ -19,11 +21,12 @@ retab/types/mime.py,sha256=ZLNCD3pvgn5cbGfJwzrdkjgB9dMHCbN67YEV9bx47zE,10063
  retab/types/modality.py,sha256=4B8LctdUBZVgIjtS2FjrJpljn2Eyse0XE1bpFsGb9O4,131
  retab/types/pagination.py,sha256=A0Fw06baPTfEaYwo3kvNs4vaupzlqylBc6tQH-2DFuY,279
  retab/types/standards.py,sha256=7aGtuvzzkKidvqY8JB2Cjfn43V80FeKwrTtp162kjKc,1477
- retab/types/documents/__init__.py,sha256=YDsvsmwkS5lfGXk5aBqSqmFh6LKX3dM6q_cUo5oIydU,277
+ retab/types/documents/__init__.py,sha256=t1jXdpYqi-zQMC_9uM0m7eA1hRU0MCROwUx89ccD2-c,418
+ retab/types/documents/classify.py,sha256=Tb6d_7kuTlWLr7bPn782dHrjtUVBCvXV3o9zm7j2lmE,1128
  retab/types/documents/correct_orientation.py,sha256=e-ivsslI6L6Gl0YkcslXw_DH620xMGEYVp4tdeviXeM,261
  retab/types/documents/create_messages.py,sha256=Uym0SnVUGkyt1C5AOD37BsZ3puyeu_igR6X9SboojfA,7267
  retab/types/documents/edit.py,sha256=ZY-a_Q9Y76e4oojeJJsisoCZbNSU6gqwAgb9fq9S76w,5930
- retab/types/documents/extract.py,sha256=eMaVl76K_1CeuLmdttfrf4yoQqs27f10w9rNBePb0DY,16724
+ retab/types/documents/extract.py,sha256=x_59fm69-icsxxGRgpFd0NN-SLRoMYqbvfCZuG7zyGc,18033
  retab/types/documents/parse.py,sha256=MXe7zh3DusWQhGe0Sr95nPy6cB8DRX8MA4Hmjj_AP7E,1300
  retab/types/documents/split.py,sha256=xRdJ6IpSRAPi_ZtAG2FNqg5A-v5tzfb1QQkW5UfO2pY,1246
  retab/types/extractions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -38,13 +41,15 @@ retab/types/schemas/generate.py,sha256=8c9LzFgsG9BpteKzjPaLJEneEHsjCyYvGo1jdko-D
  retab/types/schemas/layout.py,sha256=JLPwQGIWfPBoe1Y5r-MhiNDJigzZ-yKZnVGgox0uqMk,1487
  retab/types/schemas/model.py,sha256=kIMB1C_q7YjYJeVV3y06n3m_ebCGSLXyjDs34Ye-oes,72863
  retab/types/schemas/templates.py,sha256=XihWTHi6t_6QjxN07n_1dee5KdhHiuoHAYfmKwI7gQg,1708
+ retab/types/workflows/__init__.py,sha256=lyFqR2wWTb3l0Uq_84kU4GK6xPKCxUvHtC1hQ6yAHs0,200
+ retab/types/workflows/model.py,sha256=t3VCulEsSZN17uHO-TTU_y5kELwcCvmfjgjY9sg3qPM,4863
  retab/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  retab/utils/display.py,sha256=ZFPbiBnwEWGR-suS8e9Xilz9OqyYRDwsKYWfbFSJPJM,18868
  retab/utils/hashing.py,sha256=_BMVUvftOcJav68QL0rLkH2dbhW9RRJPzeGC2akR0fc,757
- retab/utils/json_schema.py,sha256=F3MLNGskpfPh1IkXHPLp60ceOEFD79GyL8mVvr0OiVM,19583
+ retab/utils/json_schema.py,sha256=zP4pQLpVHBKWo_abCjb_dU4kA0azhHopd-1TFUgVEvc,20655
  retab/utils/mime.py,sha256=mTP_lqSPttOP5DYJxopiWaeFXrUCPjhwd7y53nCVGO4,6189
  retab/utils/stream_context_managers.py,sha256=gI1gVQSj3nWz6Mvjz7Ix5AiY0g6vSL-c2tPfuP04izo,2314
- retab-0.0.79.dist-info/METADATA,sha256=GAgtfkDV8Zu0Bc4dBl7vL87xLutKpGUqpwCY3RxGFP0,4532
- retab-0.0.79.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- retab-0.0.79.dist-info/top_level.txt,sha256=waQR0EGdhLIQtztoE3AXg7ik5ONQ9q_bsKVpyFuJdq0,6
- retab-0.0.79.dist-info/RECORD,,
+ retab-0.0.81.dist-info/METADATA,sha256=1dsE31zFzslvv3Up5BOM62auWgQNbLCie0hZ2NfwP5Y,4532
+ retab-0.0.81.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ retab-0.0.81.dist-info/top_level.txt,sha256=waQR0EGdhLIQtztoE3AXg7ik5ONQ9q_bsKVpyFuJdq0,6
+ retab-0.0.81.dist-info/RECORD,,