retab 0.0.79__tar.gz → 0.0.80__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {retab-0.0.79 → retab-0.0.80}/PKG-INFO +1 -1
- retab-0.0.80/retab/resources/workflows/__init__.py +3 -0
- retab-0.0.80/retab/resources/workflows/client.py +190 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/documents/__init__.py +4 -0
- retab-0.0.80/retab/types/documents/classify.py +31 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/documents/extract.py +20 -2
- retab-0.0.80/retab/types/workflows/__init__.py +11 -0
- retab-0.0.80/retab/types/workflows/model.py +76 -0
- {retab-0.0.79 → retab-0.0.80}/retab/utils/json_schema.py +29 -8
- {retab-0.0.79 → retab-0.0.80}/retab.egg-info/PKG-INFO +1 -1
- {retab-0.0.79 → retab-0.0.80}/retab.egg-info/SOURCES.txt +5 -0
- {retab-0.0.79 → retab-0.0.80}/setup.py +1 -1
- {retab-0.0.79 → retab-0.0.80}/README.md +0 -0
- {retab-0.0.79 → retab-0.0.80}/pyproject.toml +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/__init__.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/_resource.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/client.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/generate_types.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/py.typed +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/resources/__init__.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/resources/documents/__init__.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/resources/documents/client.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/resources/extractions/__init__.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/resources/extractions/client.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/resources/models.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/resources/projects/__init__.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/resources/projects/client.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/resources/schemas.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/__init__.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/chat.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/documents/correct_orientation.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/documents/create_messages.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/documents/edit.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/documents/parse.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/documents/split.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/extractions/__init__.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/extractions/types.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/inference_settings.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/mime.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/modality.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/pagination.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/projects/__init__.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/projects/metrics.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/projects/model.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/projects/predictions.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/schemas/__init__.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/schemas/chat.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/schemas/generate.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/schemas/layout.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/schemas/model.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/schemas/templates.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/types/standards.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/utils/__init__.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/utils/display.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/utils/hashing.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/utils/mime.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab/utils/stream_context_managers.py +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab.egg-info/dependency_links.txt +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab.egg-info/requires.txt +0 -0
- {retab-0.0.79 → retab-0.0.80}/retab.egg-info/top_level.txt +0 -0
- {retab-0.0.79 → retab-0.0.80}/setup.cfg +0 -0
- {retab-0.0.79 → retab-0.0.80}/tests/test_projects.py +0 -0

retab-0.0.80/retab/resources/workflows/client.py

@@ -0,0 +1,190 @@
+from io import IOBase
+from pathlib import Path
+from typing import Any, Dict
+
+import PIL.Image
+from pydantic import HttpUrl
+
+from ..._resource import AsyncAPIResource, SyncAPIResource
+from ...utils.mime import MIMEData, prepare_mime_document
+from ...types.standards import PreparedRequest
+from ...types.workflows import WorkflowRun
+
+
+# Type alias for document inputs
+DocumentInput = Path | str | bytes | IOBase | MIMEData | PIL.Image.Image | HttpUrl
+
+
+class WorkflowsMixin:
+    """Mixin providing shared methods for workflow operations."""
+
+    def prepare_run(
+        self,
+        workflow_id: str,
+        documents: Dict[str, DocumentInput],
+    ) -> PreparedRequest:
+        """Prepare a request to run a workflow with input documents.
+
+        Args:
+            workflow_id: The ID of the workflow to run
+            documents: Mapping of start node IDs to their input documents.
+                Each document can be a file path, bytes, file-like object,
+                MIMEData, PIL Image, or HttpUrl.
+
+        Returns:
+            PreparedRequest: The prepared request
+
+        Example:
+            >>> client.workflows.run(
+            ...     workflow_id="wf_abc123",
+            ...     documents={
+            ...         "start-node-1": Path("invoice.pdf"),
+            ...         "start-node-2": Path("receipt.pdf"),
+            ...     }
+            ... )
+        """
+        # Convert each document to MIMEData and then to the format expected by the backend
+        documents_payload: Dict[str, Dict[str, Any]] = {}
+        for node_id, document in documents.items():
+            mime_data = prepare_mime_document(document)
+            documents_payload[node_id] = {
+                "filename": mime_data.filename,
+                "content": mime_data.content,
+                "mime_type": mime_data.mime_type,
+            }
+
+        data = {"documents": documents_payload}
+        return PreparedRequest(method="POST", url=f"/v1/workflows/{workflow_id}/run", data=data)
+
+    def prepare_get_run(self, run_id: str) -> PreparedRequest:
+        """Prepare a request to get a workflow run by ID.
+
+        Args:
+            run_id: The ID of the workflow run to retrieve
+
+        Returns:
+            PreparedRequest: The prepared request
+        """
+        return PreparedRequest(method="GET", url=f"/v1/workflows/runs/{run_id}")
+
+
+class Workflows(SyncAPIResource, WorkflowsMixin):
+    """Workflows API wrapper for synchronous operations."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def run(
+        self,
+        workflow_id: str,
+        documents: Dict[str, DocumentInput],
+    ) -> WorkflowRun:
+        """Run a workflow with the provided input documents.
+
+        This creates a workflow run and starts execution in the background.
+        The returned WorkflowRun will have status "running" - use get_run()
+        to check for updates on the run status.
+
+        Args:
+            workflow_id: The ID of the workflow to run
+            documents: Mapping of start node IDs to their input documents.
+                Each document can be a file path, bytes, file-like object,
+                MIMEData, PIL Image, or HttpUrl.
+
+        Returns:
+            WorkflowRun: The created workflow run with status "running"
+
+        Raises:
+            HTTPException: If the request fails (e.g., workflow not found,
+                missing input documents for start nodes)
+
+        Example:
+            >>> run = client.workflows.run(
+            ...     workflow_id="wf_abc123",
+            ...     documents={
+            ...         "start-node-1": Path("invoice.pdf"),
+            ...         "start-node-2": Path("receipt.pdf"),
+            ...     }
+            ... )
+            >>> print(f"Run started: {run.id}, status: {run.status}")
+        """
+        request = self.prepare_run(workflow_id=workflow_id, documents=documents)
+        response = self._client._prepared_request(request)
+        return WorkflowRun.model_validate(response)
+
+    def get_run(self, run_id: str) -> WorkflowRun:
+        """Get a workflow run by ID.
+
+        Args:
+            run_id: The ID of the workflow run to retrieve
+
+        Returns:
+            WorkflowRun: The workflow run
+
+        Raises:
+            HTTPException: If the request fails (e.g., run not found)
+        """
+        request = self.prepare_get_run(run_id)
+        response = self._client._prepared_request(request)
+        return WorkflowRun.model_validate(response)
+
+
+class AsyncWorkflows(AsyncAPIResource, WorkflowsMixin):
+    """Workflows API wrapper for asynchronous operations."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    async def run(
+        self,
+        workflow_id: str,
+        documents: Dict[str, DocumentInput],
+    ) -> WorkflowRun:
+        """Run a workflow with the provided input documents.
+
+        This creates a workflow run and starts execution in the background.
+        The returned WorkflowRun will have status "running" - use get_run()
+        to check for updates on the run status.
+
+        Args:
+            workflow_id: The ID of the workflow to run
+            documents: Mapping of start node IDs to their input documents.
+                Each document can be a file path, bytes, file-like object,
+                MIMEData, PIL Image, or HttpUrl.
+
+        Returns:
+            WorkflowRun: The created workflow run with status "running"
+
+        Raises:
+            HTTPException: If the request fails (e.g., workflow not found,
+                missing input documents for start nodes)
+
+        Example:
+            >>> run = await client.workflows.run(
+            ...     workflow_id="wf_abc123",
+            ...     documents={
+            ...         "start-node-1": Path("invoice.pdf"),
+            ...         "start-node-2": Path("receipt.pdf"),
+            ...     }
+            ... )
+            >>> print(f"Run started: {run.id}, status: {run.status}")
+        """
+        request = self.prepare_run(workflow_id=workflow_id, documents=documents)
+        response = await self._client._prepared_request(request)
+        return WorkflowRun.model_validate(response)
+
+    async def get_run(self, run_id: str) -> WorkflowRun:
+        """Get a workflow run by ID.
+
+        Args:
+            run_id: The ID of the workflow run to retrieve
+
+        Returns:
+            WorkflowRun: The workflow run
+
+        Raises:
+            HTTPException: If the request fails (e.g., run not found)
+        """
+        request = self.prepare_get_run(run_id)
+        response = await self._client._prepared_request(request)
+        return WorkflowRun.model_validate(response)
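
The new resource is deliberately thin: `run()` posts the input documents and returns immediately, and `get_run()` is the poller. A minimal polling sketch, assuming the top-level client is constructed as `Retab()` and exposes this resource as `client.workflows` (the client constructor itself is not part of this diff):

```python
import time
from pathlib import Path

from retab import Retab  # assumed entry point; the client class is not shown in this diff

client = Retab()  # assumes credentials are picked up from the environment

run = client.workflows.run(
    workflow_id="wf_abc123",
    documents={"start-node-1": Path("invoice.pdf")},
)

# run() returns with status "running"; poll until the run reaches a terminal state.
while run.status in ("pending", "running"):
    time.sleep(2)
    run = client.workflows.get_run(run.id)

if run.status == "completed":
    print(run.final_outputs)
elif run.status == "waiting_for_human":
    print("Waiting on human review for nodes:", run.waiting_for_node_ids)
else:
    print("Workflow failed:", run.error)
```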

{retab-0.0.79 → retab-0.0.80}/retab/types/documents/__init__.py

@@ -1,5 +1,6 @@
 from .parse import ParseRequest, ParseResult, RetabUsage
 from .split import Category, SplitRequest, SplitResult, SplitResponse
+from .classify import ClassifyRequest, ClassifyResult, ClassifyResponse
 
 __all__ = [
     "ParseRequest",
@@ -9,4 +10,7 @@ __all__ = [
     "SplitRequest",
     "SplitResult",
     "SplitResponse",
+    "ClassifyRequest",
+    "ClassifyResult",
+    "ClassifyResponse",
 ]

retab-0.0.80/retab/types/documents/classify.py

@@ -0,0 +1,31 @@
+from pydantic import BaseModel, Field
+from ..mime import MIMEData
+from .split import Category
+
+
+class ClassifyRequest(BaseModel):
+    document: MIMEData = Field(..., description="The document to classify")
+    categories: list[Category] = Field(..., description="The categories to classify the document into")
+    model: str = Field(default="retab-small", description="The model to use for classification")
+
+
+class ClassifyResult(BaseModel):
+    reasoning: str = Field(..., description="The reasoning for the classification decision")
+    classification: str = Field(..., description="The category name that the document belongs to")
+
+
+class ClassifyResponse(BaseModel):
+    result: ClassifyResult = Field(..., description="The classification result with reasoning")
+
+
+class ClassifyOutputSchema(BaseModel):
+    """Schema for LLM structured output."""
+    reasoning: str = Field(
+        ...,
+        description="Step-by-step reasoning explaining why this document belongs to the chosen category"
+    )
+    classification: str = Field(
+        ...,
+        description="The category name that this document belongs to"
+    )
+
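
Since these are ordinary pydantic v2 models, a classification response body can be validated directly; a small sketch using only the fields defined above (the payload values are illustrative):

```python
from retab.types.documents.classify import ClassifyResponse

payload = {
    "result": {
        "reasoning": "The document lists line items, a total, and a due date.",
        "classification": "invoice",
    }
}

response = ClassifyResponse.model_validate(payload)
print(response.result.classification)  # -> invoice
```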

{retab-0.0.79 → retab-0.0.80}/retab/types/documents/extract.py

@@ -155,6 +155,9 @@ class RetabParsedChoiceDeltaChunk(ChoiceDeltaChunk):
     flat_deleted_keys: list[str] = []
     is_valid_json: bool = False
     key_mapping: dict[str, Optional[str]] | None = Field(default=None, description="Mapping of consensus keys to original model keys")
+    # Full parsed object from the LLM (when available). Used to avoid data corruption
+    # from unflatten_dict when null values are not transmitted in streaming deltas.
+    full_parsed: dict[str, Any] | None = Field(default=None, description="Complete parsed object from LLM, used instead of unflatten_dict when available")
 
 
 class RetabParsedChoiceChunk(ChoiceChunk):
@@ -183,6 +186,7 @@ class RetabParsedChatCompletionChunk(StreamingBaseModel, ChatCompletionChunk):
                 flat_parsed={},
                 flat_likelihoods={},
                 is_valid_json=False,
+                full_parsed=None,
             )
 
         max_choices = max(len(self.choices), len(previous_cumulated_chunk.choices)) if previous_cumulated_chunk is not None else len(self.choices)
@@ -201,6 +205,8 @@ class RetabParsedChatCompletionChunk(StreamingBaseModel, ChatCompletionChunk):
         acc_flat_parsed = [safe_get_delta(previous_cumulated_chunk, i).flat_parsed | safe_get_delta(self, i).flat_parsed for i in range(max_choices)]
         acc_flat_likelihoods = [safe_get_delta(previous_cumulated_chunk, i).flat_likelihoods | safe_get_delta(self, i).flat_likelihoods for i in range(max_choices)]
         acc_key_mapping = [safe_get_delta(previous_cumulated_chunk, i).key_mapping or safe_get_delta(self, i).key_mapping for i in range(max_choices)]
+        # Preserve full_parsed: use the current chunk's full_parsed if available, otherwise keep the previous one
+        acc_full_parsed = [safe_get_delta(self, i).full_parsed or safe_get_delta(previous_cumulated_chunk, i).full_parsed for i in range(max_choices)]
 
         acc_content = [(safe_get_delta(previous_cumulated_chunk, i).content or "") + (safe_get_delta(self, i).content or "") for i in range(max_choices)]
 
@@ -219,6 +225,7 @@ class RetabParsedChatCompletionChunk(StreamingBaseModel, ChatCompletionChunk):
                     flat_deleted_keys=acc_flat_deleted_keys[i],
                     is_valid_json=acc_is_valid_json[i],
                     key_mapping=acc_key_mapping[i],
+                    full_parsed=acc_full_parsed[i],
                 ),
                 index=i,
             )
@@ -238,7 +245,18 @@ class RetabParsedChatCompletionChunk(StreamingBaseModel, ChatCompletionChunk):
         if override_final_flat_parseds is None:
             override_final_flat_parseds = [self.choices[idx].delta.flat_parsed for idx in range(len(self.choices))]
 
-        final_parsed_list
+        # Build final_parsed_list using full_parsed when available (correct data from LLM),
+        # falling back to unflatten_dict for backward compatibility
+        final_parsed_list = []
+        for idx in range(len(self.choices)):
+            full_parsed = self.choices[idx].delta.full_parsed
+            if full_parsed is not None:
+                # Use the complete parsed object from the LLM (avoids data corruption from unflatten_dict)
+                final_parsed_list.append(full_parsed)
+            else:
+                # Fallback: reconstruct from flat_parsed (may lose null values in sparse arrays)
+                final_parsed_list.append(unflatten_dict(override_final_flat_parseds[idx]))
+
         final_content_list = [json.dumps(final_parsed_list[idx]) for idx in range(len(self.choices))]
 
         # The final likelihoods are only on the first choice.
@@ -264,7 +282,7 @@ class RetabParsedChatCompletionChunk(StreamingBaseModel, ChatCompletionChunk):
                     role="assistant",
                     parsed=final_parsed_list[idx],
                 ),
-                key_mapping=self.choices[idx].delta.key_mapping,
+                key_mapping=self.choices[idx].delta.key_mapping,  # type: ignore[call-arg]
                 finish_reason="stop",
                 logprobs=None,
             )
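
The motivation for `full_parsed` is easiest to see with a toy comparison: when a streaming delta omits null-valued keys, rebuilding the object from the flattened keys silently drops them, while the complete parsed object keeps them. The sketch below mirrors the new preference logic with a simplified stand-in for `unflatten_dict` (the real helper in the package is not reproduced here):

```python
def toy_unflatten(flat: dict) -> dict:
    """Simplified stand-in for unflatten_dict: split dotted keys into nested dicts."""
    out: dict = {}
    for key, value in flat.items():
        node = out
        parts = key.split(".")
        for part in parts[:-1]:
            node = node.setdefault(part, {})
        node[parts[-1]] = value
    return out


# Deltas that never transmitted the null field "customer.vat_id":
flat_parsed = {"customer.name": "ACME", "total": 120.5}
full_parsed = {"customer": {"name": "ACME", "vat_id": None}, "total": 120.5}

# Same preference as the new code path: use full_parsed when it is available.
final = full_parsed if full_parsed is not None else toy_unflatten(flat_parsed)
print(final["customer"])  # {'name': 'ACME', 'vat_id': None} (the explicit null survives)
```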

retab-0.0.80/retab/types/workflows/model.py

@@ -0,0 +1,76 @@
+import datetime
+from typing import Any, Dict, List, Literal, Optional
+
+from pydantic import BaseModel, Field, ConfigDict
+
+
+class StepIOReference(BaseModel):
+    """Reference to step input/output stored in GCS"""
+    file_id: Optional[str] = Field(default=None, description="File ID for document storage lookup")
+    gcs_path: Optional[str] = Field(default=None, description="GCS path to the stored file")
+    filename: Optional[str] = Field(default=None, description="Original filename")
+    mime_type: Optional[str] = Field(default=None, description="MIME type of the file")
+
+
+class HandlePayload(BaseModel):
+    """
+    Payload for a single output handle.
+
+    Each output handle on a node produces a typed payload that can be:
+    - file: A document reference (PDF, image, etc.)
+    - json: Structured JSON data (extracted data, etc.)
+    - text: Plain text content
+    """
+    type: Literal["file", "json", "text"] = Field(..., description="Type of payload")
+    document: Optional[StepIOReference] = Field(default=None, description="For file handles: document reference")
+    data: Optional[dict] = Field(default=None, description="For JSON handles: structured data")
+    text: Optional[str] = Field(default=None, description="For text handles: text content")
+
+
+NodeType = Literal["start", "extract", "split", "end", "hil"]
+
+
+class StepStatus(BaseModel):
+    """Status of a single step in workflow execution"""
+    node_id: str = Field(..., description="ID of the node")
+    node_type: NodeType = Field(..., description="Type of the node")
+    node_label: str = Field(..., description="Label of the node")
+    status: Literal["pending", "running", "completed", "error", "waiting_for_human"] = Field(..., description="Current status")
+    started_at: Optional[datetime.datetime] = Field(default=None, description="When the step started")
+    completed_at: Optional[datetime.datetime] = Field(default=None, description="When the step completed")
+    duration_ms: Optional[int] = Field(default=None, description="Duration in milliseconds")
+    error: Optional[str] = Field(default=None, description="Error message if failed")
+    output: Optional[dict] = Field(default=None, description="Output data from the step")
+    handle_outputs: Optional[Dict[str, HandlePayload]] = Field(
+        default=None,
+        description="Output payloads keyed by handle ID (e.g., 'output-file-0', 'output-json-0')"
+    )
+    input_document: Optional[StepIOReference] = Field(default=None, description="Reference to input document")
+    output_document: Optional[StepIOReference] = Field(default=None, description="Reference to output document")
+    split_documents: Optional[Dict[str, StepIOReference]] = Field(default=None, description="For split nodes: category -> document reference")
+    requires_human_review: Optional[bool] = Field(default=None, description="Whether this step requires human review")
+    human_reviewed_at: Optional[datetime.datetime] = Field(default=None, description="When human review was completed")
+    human_review_approved: Optional[bool] = Field(default=None, description="Whether human approved or rejected")
+
+
+class WorkflowRun(BaseModel):
+    """A stored workflow run record"""
+    model_config = ConfigDict(extra="ignore")
+
+    id: str = Field(..., description="Unique ID for this run")
+    workflow_id: str = Field(..., description="ID of the workflow that was run")
+    workflow_name: str = Field(..., description="Name of the workflow at time of execution")
+    organization_id: str = Field(..., description="Organization that owns this run")
+    status: Literal["pending", "running", "completed", "error", "waiting_for_human"] = Field(default="pending", description="Overall status")
+    started_at: datetime.datetime = Field(..., description="When the workflow started")
+    completed_at: Optional[datetime.datetime] = Field(default=None, description="When the workflow completed")
+    duration_ms: Optional[int] = Field(default=None, description="Total duration in milliseconds")
+    steps: List[StepStatus] = Field(default_factory=list, description="Status of each step")
+    input_documents: Optional[Dict[str, StepIOReference]] = Field(default=None, description="Start node ID -> input document reference")
+    final_outputs: Optional[dict] = Field(default=None, description="Final outputs from end nodes")
+    error: Optional[str] = Field(default=None, description="Error message if workflow failed")
+    created_at: datetime.datetime = Field(..., description="When the run was created")
+    updated_at: datetime.datetime = Field(..., description="When the run was last updated")
+    waiting_for_node_ids: List[str] = Field(default_factory=list, description="Node IDs that are waiting for human review")
+    pending_node_outputs: Optional[dict] = Field(default=None, description="Serialized node outputs to resume from")
+
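
Because `WorkflowRun` is configured with `extra="ignore"`, a stored run document validates cleanly even if the backend adds fields later. A sketch that checks which steps are waiting on review, using an illustrative payload built only from the fields defined above:

```python
from retab.types.workflows import WorkflowRun

payload = {
    "id": "run_123",
    "workflow_id": "wf_abc123",
    "workflow_name": "Invoice pipeline",
    "organization_id": "org_1",
    "status": "waiting_for_human",
    "started_at": "2024-01-01T00:00:00Z",
    "created_at": "2024-01-01T00:00:00Z",
    "updated_at": "2024-01-01T00:00:05Z",
    "waiting_for_node_ids": ["hil-1"],
    "steps": [
        {
            "node_id": "hil-1",
            "node_type": "hil",
            "node_label": "Review extraction",
            "status": "waiting_for_human",
            "requires_human_review": True,
        }
    ],
    "some_future_field": "dropped by extra='ignore'",
}

run = WorkflowRun.model_validate(payload)
waiting = [step.node_label for step in run.steps if step.status == "waiting_for_human"]
print(run.status, waiting)  # waiting_for_human ['Review extraction']
```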

{retab-0.0.79 → retab-0.0.80}/retab/utils/json_schema.py

@@ -158,23 +158,43 @@ def expand_refs(schema: dict[str, Any], definitions: dict[str, dict[str, Any]] |
         print("Cyclic refs found, keeping it as is")
         return schema
 
+    # Support both $defs (draft 2019-09+) and definitions (draft-07)
     if definitions is None:
-        definitions = schema.pop("$defs", {})
+        definitions = schema.pop("$defs", None) or schema.pop("definitions", {})
 
     assert isinstance(definitions, dict)
 
-
-
-
-
-
+    # Handle allOf - merge all elements
+    if "allOf" in schema and isinstance(schema["allOf"], list) and len(schema["allOf"]) > 0:
+        all_of_elements = schema.pop("allOf")
+        for element in all_of_elements:
+            if isinstance(element, dict):
+                # Recursively expand refs in each allOf element first
+                expanded = expand_refs(element, definitions)
+                # Deep merge properties if both have them
+                if "properties" in expanded and "properties" in schema:
+                    schema["properties"] = {**schema["properties"], **expanded["properties"]}
+                    del expanded["properties"]
+                # Merge required arrays if both have them
+                if "required" in expanded and "required" in schema:
+                    schema["required"] = list(set(schema["required"] + expanded["required"]))
+                    del expanded["required"]
+                schema.update(expanded)
 
     if "$ref" in schema:
         ref: str = schema["$ref"]
+        def_name: str | None = None
+
+        # Support both #/$defs/ and #/definitions/ formats
         if ref.startswith("#/$defs/"):
             def_name = ref.removeprefix("#/$defs/")
+        elif ref.startswith("#/definitions/"):
+            def_name = ref.removeprefix("#/definitions/")
+
+        if def_name is not None:
             if def_name not in definitions:
-
+                # Return schema as-is if reference not found (might be external)
+                return schema
             target = definitions[def_name]
             merged = merge_descriptions(schema, target)
             merged.pop("$ref", None)
@@ -184,7 +204,8 @@ def expand_refs(schema: dict[str, Any], definitions: dict[str, dict[str, Any]] |
 
     result: dict[str, Any] = {}
     for annotation, subschema in schema.items():
-
+        # Handle properties, $defs, and definitions (draft-07) keys
+        if annotation in ["properties", "$defs", "definitions"]:
             if isinstance(subschema, dict):
                 new_dict = {}
                 for pk, pv in subschema.items():
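
With these branches, `expand_refs` now resolves draft-07 style `#/definitions/...` references and folds `allOf` members into the parent schema, merging `properties` and taking the union of `required`. A small sketch, assuming the function is importable from `retab.utils.json_schema` as the file path suggests:

```python
from retab.utils.json_schema import expand_refs

schema = {
    "type": "object",
    "allOf": [{"$ref": "#/definitions/Address"}],
    "properties": {"name": {"type": "string"}},
    "required": ["name"],
    "definitions": {
        "Address": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        }
    },
}

expanded = expand_refs(schema)
# After expansion, the allOf member is merged: properties hold both "name" and "city",
# and "required" is the union of the two lists.
print(sorted(expanded["properties"]), sorted(expanded["required"]))
```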

{retab-0.0.79 → retab-0.0.80}/retab.egg-info/SOURCES.txt

@@ -20,6 +20,8 @@ retab/resources/extractions/__init__.py
 retab/resources/extractions/client.py
 retab/resources/projects/__init__.py
 retab/resources/projects/client.py
+retab/resources/workflows/__init__.py
+retab/resources/workflows/client.py
 retab/types/__init__.py
 retab/types/chat.py
 retab/types/inference_settings.py
@@ -28,6 +30,7 @@ retab/types/modality.py
 retab/types/pagination.py
 retab/types/standards.py
 retab/types/documents/__init__.py
+retab/types/documents/classify.py
 retab/types/documents/correct_orientation.py
 retab/types/documents/create_messages.py
 retab/types/documents/edit.py
@@ -46,6 +49,8 @@ retab/types/schemas/generate.py
 retab/types/schemas/layout.py
 retab/types/schemas/model.py
 retab/types/schemas/templates.py
+retab/types/workflows/__init__.py
+retab/types/workflows/model.py
 retab/utils/__init__.py
 retab/utils/display.py
 retab/utils/hashing.py

All remaining files (listed above with +0 -0) are unchanged between 0.0.79 and 0.0.80.