retab 0.0.89__py3-none-any.whl → 0.0.91__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- retab/client.py +3 -1
- retab/resources/documents/client.py +22 -21
- retab/resources/jobs/__init__.py +3 -0
- retab/resources/jobs/client.py +252 -0
- retab/types/documents/__init__.py +4 -2
- retab/types/documents/classify.py +4 -1
- retab/types/documents/split.py +13 -13
- retab/types/jobs.py +90 -0
- {retab-0.0.89.dist-info → retab-0.0.91.dist-info}/METADATA +1 -1
- {retab-0.0.89.dist-info → retab-0.0.91.dist-info}/RECORD +12 -9
- {retab-0.0.89.dist-info → retab-0.0.91.dist-info}/WHEEL +0 -0
- {retab-0.0.89.dist-info → retab-0.0.91.dist-info}/top_level.txt +0 -0
retab/client.py
CHANGED
|
@@ -10,7 +10,7 @@ import backoff.types
|
|
|
10
10
|
import httpx
|
|
11
11
|
import truststore
|
|
12
12
|
|
|
13
|
-
from .resources import documents, models, schemas, projects, extractions, edit, workflows
|
|
13
|
+
from .resources import documents, models, schemas, projects, extractions, edit, workflows, jobs
|
|
14
14
|
from .types.standards import PreparedRequest, FieldUnset
|
|
15
15
|
|
|
16
16
|
|
|
@@ -190,6 +190,7 @@ class Retab(BaseRetab):
|
|
|
190
190
|
self.schemas = schemas.Schemas(client=self)
|
|
191
191
|
self.edit = edit.Edit(client=self)
|
|
192
192
|
self.workflows = workflows.Workflows(client=self)
|
|
193
|
+
self.jobs = jobs.Jobs(client=self)
|
|
193
194
|
def _request(
|
|
194
195
|
self,
|
|
195
196
|
method: str,
|
|
@@ -488,6 +489,7 @@ class AsyncRetab(BaseRetab):
|
|
|
488
489
|
self.schemas = schemas.AsyncSchemas(client=self)
|
|
489
490
|
self.edit = edit.AsyncEdit(client=self)
|
|
490
491
|
self.workflows = workflows.AsyncWorkflows(client=self)
|
|
492
|
+
self.jobs = jobs.AsyncJobs(client=self)
|
|
491
493
|
|
|
492
494
|
def _parse_response(self, response: httpx.Response) -> Any:
|
|
493
495
|
"""Parse response based on content-type.
|
|
@@ -16,7 +16,8 @@ from ...types.chat import ChatCompletionRetabMessage
|
|
|
16
16
|
from ...types.documents.edit import EditRequest, EditResponse
|
|
17
17
|
from ...types.documents.extract import DocumentExtractRequest, RetabParsedChatCompletion, RetabParsedChatCompletionChunk, RetabParsedChoice, maybe_parse_to_pydantic
|
|
18
18
|
from ...types.documents.parse import ParseRequest, ParseResult, TableParsingFormat
|
|
19
|
-
from ...types.documents.split import
|
|
19
|
+
from ...types.documents.split import Subdocument, SplitRequest, SplitResponse
|
|
20
|
+
from ...types.documents.classify import Category
|
|
20
21
|
from ...types.documents.classify import ClassifyRequest, ClassifyResponse
|
|
21
22
|
from ...types.mime import MIMEData
|
|
22
23
|
from ...types.standards import PreparedRequest, FieldUnset
|
|
@@ -148,21 +149,21 @@ class BaseDocumentsMixin:
|
|
|
148
149
|
def _prepare_split(
|
|
149
150
|
self,
|
|
150
151
|
document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
|
|
151
|
-
|
|
152
|
+
subdocuments: list[Subdocument] | list[dict[str, str]],
|
|
152
153
|
model: str,
|
|
153
154
|
**extra_body: Any,
|
|
154
155
|
) -> PreparedRequest:
|
|
155
156
|
mime_document = prepare_mime_document(document)
|
|
156
157
|
|
|
157
|
-
# Convert dict
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
for
|
|
158
|
+
# Convert dict subdocuments to Subdocument objects if needed
|
|
159
|
+
subdocument_objects = [
|
|
160
|
+
Subdocument(**subdoc) if isinstance(subdoc, dict) else subdoc
|
|
161
|
+
for subdoc in subdocuments
|
|
161
162
|
]
|
|
162
163
|
|
|
163
164
|
request_dict: dict[str, Any] = {
|
|
164
165
|
"document": mime_document,
|
|
165
|
-
"
|
|
166
|
+
"subdocuments": subdocument_objects,
|
|
166
167
|
"model": model,
|
|
167
168
|
}
|
|
168
169
|
|
|
@@ -644,20 +645,20 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
|
|
|
644
645
|
def split(
|
|
645
646
|
self,
|
|
646
647
|
document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
|
|
647
|
-
|
|
648
|
+
subdocuments: list[Subdocument] | list[dict[str, str]],
|
|
648
649
|
model: str,
|
|
649
650
|
**extra_body: Any,
|
|
650
651
|
) -> SplitResponse:
|
|
651
652
|
"""
|
|
652
|
-
Split a document into sections based on provided
|
|
653
|
+
Split a document into sections based on provided subdocuments.
|
|
653
654
|
|
|
654
655
|
This method analyzes a multi-page document and classifies pages into
|
|
655
|
-
user-defined
|
|
656
|
+
user-defined subdocuments, returning the page ranges for each section.
|
|
656
657
|
|
|
657
658
|
Args:
|
|
658
659
|
document: The document to split. Can be a file path (Path or str), file-like object, MIMEData, PIL Image, or URL.
|
|
659
|
-
|
|
660
|
-
Can be
|
|
660
|
+
subdocuments: List of subdocuments to split the document into. Each subdocument should have a 'name' and 'description'.
|
|
661
|
+
Can be Subdocument objects or dicts with 'name' and 'description' keys.
|
|
661
662
|
model: The AI model to use for document splitting (e.g., "gemini-2.5-flash").
|
|
662
663
|
|
|
663
664
|
Returns:
|
|
@@ -672,7 +673,7 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
|
|
|
672
673
|
response = retab.documents.split(
|
|
673
674
|
document="invoice_batch.pdf",
|
|
674
675
|
model="gemini-2.5-flash",
|
|
675
|
-
|
|
676
|
+
subdocuments=[
|
|
676
677
|
{"name": "invoice", "description": "Invoice documents with billing information"},
|
|
677
678
|
{"name": "receipt", "description": "Receipt documents for payments"},
|
|
678
679
|
{"name": "contract", "description": "Legal contract documents"},
|
|
@@ -684,7 +685,7 @@ class Documents(SyncAPIResource, BaseDocumentsMixin):
|
|
|
684
685
|
"""
|
|
685
686
|
request = self._prepare_split(
|
|
686
687
|
document=document,
|
|
687
|
-
|
|
688
|
+
subdocuments=subdocuments,
|
|
688
689
|
model=model,
|
|
689
690
|
**extra_body,
|
|
690
691
|
)
|
|
@@ -1039,20 +1040,20 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
|
|
|
1039
1040
|
async def split(
|
|
1040
1041
|
self,
|
|
1041
1042
|
document: Path | str | IOBase | MIMEData | PIL.Image.Image | HttpUrl,
|
|
1042
|
-
|
|
1043
|
+
subdocuments: list[Subdocument] | list[dict[str, str]],
|
|
1043
1044
|
model: str,
|
|
1044
1045
|
**extra_body: Any,
|
|
1045
1046
|
) -> SplitResponse:
|
|
1046
1047
|
"""
|
|
1047
|
-
Split a document into sections based on provided
|
|
1048
|
+
Split a document into sections based on provided subdocuments asynchronously.
|
|
1048
1049
|
|
|
1049
1050
|
This method analyzes a multi-page document and classifies pages into
|
|
1050
|
-
user-defined
|
|
1051
|
+
user-defined subdocuments, returning the page ranges for each section.
|
|
1051
1052
|
|
|
1052
1053
|
Args:
|
|
1053
1054
|
document: The document to split. Can be a file path (Path or str), file-like object, MIMEData, PIL Image, or URL.
|
|
1054
|
-
|
|
1055
|
-
Can be
|
|
1055
|
+
subdocuments: List of subdocuments to split the document into. Each subdocument should have a 'name' and 'description'.
|
|
1056
|
+
Can be Subdocument objects or dicts with 'name' and 'description' keys.
|
|
1056
1057
|
model: The AI model to use for document splitting (e.g., "gemini-2.5-flash").
|
|
1057
1058
|
|
|
1058
1059
|
Returns:
|
|
@@ -1067,7 +1068,7 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
|
|
|
1067
1068
|
response = await retab.documents.split(
|
|
1068
1069
|
document="invoice_batch.pdf",
|
|
1069
1070
|
model="gemini-2.5-flash",
|
|
1070
|
-
|
|
1071
|
+
subdocuments=[
|
|
1071
1072
|
{"name": "invoice", "description": "Invoice documents with billing information"},
|
|
1072
1073
|
{"name": "receipt", "description": "Receipt documents for payments"},
|
|
1073
1074
|
{"name": "contract", "description": "Legal contract documents"},
|
|
@@ -1079,7 +1080,7 @@ class AsyncDocuments(AsyncAPIResource, BaseDocumentsMixin):
|
|
|
1079
1080
|
"""
|
|
1080
1081
|
request = self._prepare_split(
|
|
1081
1082
|
document=document,
|
|
1082
|
-
|
|
1083
|
+
subdocuments=subdocuments,
|
|
1083
1084
|
model=model,
|
|
1084
1085
|
**extra_body,
|
|
1085
1086
|
)
|
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Jobs API Resource
|
|
3
|
+
|
|
4
|
+
Provides synchronous and asynchronous clients for the Jobs API.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from ..._resource import AsyncAPIResource, SyncAPIResource
|
|
10
|
+
from ...types.jobs import Job, JobListResponse, JobStatus, SupportedEndpoint
|
|
11
|
+
from ...types.standards import PreparedRequest
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class BaseJobsMixin:
|
|
15
|
+
"""Shared methods for preparing Jobs API requests."""
|
|
16
|
+
|
|
17
|
+
def _prepare_create(
|
|
18
|
+
self,
|
|
19
|
+
endpoint: SupportedEndpoint,
|
|
20
|
+
request: dict[str, Any],
|
|
21
|
+
metadata: dict[str, str] | None = None,
|
|
22
|
+
) -> PreparedRequest:
|
|
23
|
+
data = {
|
|
24
|
+
"endpoint": endpoint,
|
|
25
|
+
"request": request,
|
|
26
|
+
}
|
|
27
|
+
if metadata is not None:
|
|
28
|
+
data["metadata"] = metadata
|
|
29
|
+
return PreparedRequest(method="POST", url="/v1/jobs", data=data)
|
|
30
|
+
|
|
31
|
+
def _prepare_retrieve(self, job_id: str) -> PreparedRequest:
|
|
32
|
+
return PreparedRequest(method="GET", url=f"/v1/jobs/{job_id}")
|
|
33
|
+
|
|
34
|
+
def _prepare_cancel(self, job_id: str) -> PreparedRequest:
|
|
35
|
+
return PreparedRequest(method="POST", url=f"/v1/jobs/{job_id}/cancel")
|
|
36
|
+
|
|
37
|
+
def _prepare_list(
|
|
38
|
+
self,
|
|
39
|
+
after: str | None = None,
|
|
40
|
+
limit: int = 20,
|
|
41
|
+
status: JobStatus | None = None,
|
|
42
|
+
) -> PreparedRequest:
|
|
43
|
+
params: dict[str, Any] = {"limit": limit}
|
|
44
|
+
if after is not None:
|
|
45
|
+
params["after"] = after
|
|
46
|
+
if status is not None:
|
|
47
|
+
params["status"] = status
|
|
48
|
+
return PreparedRequest(method="GET", url="/v1/jobs", params=params)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class Jobs(SyncAPIResource, BaseJobsMixin):
|
|
52
|
+
"""
|
|
53
|
+
Synchronous Jobs API client.
|
|
54
|
+
|
|
55
|
+
The Jobs API allows you to submit long-running extract or parse operations
|
|
56
|
+
asynchronously and poll for their results.
|
|
57
|
+
|
|
58
|
+
Example:
|
|
59
|
+
>>> from retab import Retab
|
|
60
|
+
>>> client = Retab(api_key="your-api-key")
|
|
61
|
+
>>>
|
|
62
|
+
>>> # Create an async extraction job
|
|
63
|
+
>>> job = client.jobs.create(
|
|
64
|
+
... endpoint="/v1/documents/extract",
|
|
65
|
+
... request={
|
|
66
|
+
... "document": {"content": "...", "mime_type": "application/pdf"},
|
|
67
|
+
... "json_schema": {"type": "object", ...},
|
|
68
|
+
... "model": "gpt-4o",
|
|
69
|
+
... }
|
|
70
|
+
... )
|
|
71
|
+
>>>
|
|
72
|
+
>>> # Poll for completion
|
|
73
|
+
>>> while job.status not in ("completed", "failed", "cancelled"):
|
|
74
|
+
... import time
|
|
75
|
+
... time.sleep(5)
|
|
76
|
+
... job = client.jobs.retrieve(job.id)
|
|
77
|
+
>>>
|
|
78
|
+
>>> if job.status == "completed":
|
|
79
|
+
... print(job.response.body)
|
|
80
|
+
"""
|
|
81
|
+
|
|
82
|
+
def create(
|
|
83
|
+
self,
|
|
84
|
+
endpoint: SupportedEndpoint,
|
|
85
|
+
request: dict[str, Any],
|
|
86
|
+
metadata: dict[str, str] | None = None,
|
|
87
|
+
) -> Job:
|
|
88
|
+
"""
|
|
89
|
+
Create a new asynchronous job.
|
|
90
|
+
|
|
91
|
+
Args:
|
|
92
|
+
endpoint: The API endpoint to call ("/v1/documents/extract" or "/v1/documents/parse")
|
|
93
|
+
request: The full request body for the target endpoint
|
|
94
|
+
metadata: Optional metadata (max 16 pairs; keys ≤64 chars, values ≤512 chars)
|
|
95
|
+
|
|
96
|
+
Returns:
|
|
97
|
+
Job: The created job with status "queued"
|
|
98
|
+
"""
|
|
99
|
+
prepared = self._prepare_create(endpoint, request, metadata)
|
|
100
|
+
response = self._client._prepared_request(prepared)
|
|
101
|
+
return Job.model_validate(response)
|
|
102
|
+
|
|
103
|
+
def retrieve(self, job_id: str) -> Job:
|
|
104
|
+
"""
|
|
105
|
+
Retrieve a job by ID.
|
|
106
|
+
|
|
107
|
+
Args:
|
|
108
|
+
job_id: The job ID to retrieve
|
|
109
|
+
|
|
110
|
+
Returns:
|
|
111
|
+
Job: The job with current status and result (if completed)
|
|
112
|
+
"""
|
|
113
|
+
prepared = self._prepare_retrieve(job_id)
|
|
114
|
+
response = self._client._prepared_request(prepared)
|
|
115
|
+
return Job.model_validate(response)
|
|
116
|
+
|
|
117
|
+
def cancel(self, job_id: str) -> Job:
|
|
118
|
+
"""
|
|
119
|
+
Cancel a queued or in-progress job.
|
|
120
|
+
|
|
121
|
+
Args:
|
|
122
|
+
job_id: The job ID to cancel
|
|
123
|
+
|
|
124
|
+
Returns:
|
|
125
|
+
Job: The updated job with status "cancelled"
|
|
126
|
+
"""
|
|
127
|
+
prepared = self._prepare_cancel(job_id)
|
|
128
|
+
response = self._client._prepared_request(prepared)
|
|
129
|
+
return Job.model_validate(response)
|
|
130
|
+
|
|
131
|
+
def list(
|
|
132
|
+
self,
|
|
133
|
+
after: str | None = None,
|
|
134
|
+
limit: int = 20,
|
|
135
|
+
status: JobStatus | None = None,
|
|
136
|
+
) -> JobListResponse:
|
|
137
|
+
"""
|
|
138
|
+
List jobs with pagination and optional status filtering.
|
|
139
|
+
|
|
140
|
+
Args:
|
|
141
|
+
after: Pagination cursor (last ID from previous page)
|
|
142
|
+
limit: Number of jobs to return (1-100, default 20)
|
|
143
|
+
status: Filter by job status
|
|
144
|
+
|
|
145
|
+
Returns:
|
|
146
|
+
JobListResponse: List of jobs with pagination info
|
|
147
|
+
"""
|
|
148
|
+
prepared = self._prepare_list(after, limit, status)
|
|
149
|
+
response = self._client._prepared_request(prepared)
|
|
150
|
+
return JobListResponse.model_validate(response)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
class AsyncJobs(AsyncAPIResource, BaseJobsMixin):
|
|
154
|
+
"""
|
|
155
|
+
Asynchronous Jobs API client.
|
|
156
|
+
|
|
157
|
+
The Jobs API allows you to submit long-running extract or parse operations
|
|
158
|
+
asynchronously and poll for their results.
|
|
159
|
+
|
|
160
|
+
Example:
|
|
161
|
+
>>> from retab import AsyncRetab
|
|
162
|
+
>>> client = AsyncRetab(api_key="your-api-key")
|
|
163
|
+
>>>
|
|
164
|
+
>>> # Create an async extraction job
|
|
165
|
+
>>> job = await client.jobs.create(
|
|
166
|
+
... endpoint="/v1/documents/extract",
|
|
167
|
+
... request={
|
|
168
|
+
... "document": {"content": "...", "mime_type": "application/pdf"},
|
|
169
|
+
... "json_schema": {"type": "object", ...},
|
|
170
|
+
... "model": "gpt-4o",
|
|
171
|
+
... }
|
|
172
|
+
... )
|
|
173
|
+
>>>
|
|
174
|
+
>>> # Poll for completion
|
|
175
|
+
>>> while job.status not in ("completed", "failed", "cancelled"):
|
|
176
|
+
... import asyncio
|
|
177
|
+
... await asyncio.sleep(5)
|
|
178
|
+
... job = await client.jobs.retrieve(job.id)
|
|
179
|
+
>>>
|
|
180
|
+
>>> if job.status == "completed":
|
|
181
|
+
... print(job.response.body)
|
|
182
|
+
"""
|
|
183
|
+
|
|
184
|
+
async def create(
|
|
185
|
+
self,
|
|
186
|
+
endpoint: SupportedEndpoint,
|
|
187
|
+
request: dict[str, Any],
|
|
188
|
+
metadata: dict[str, str] | None = None,
|
|
189
|
+
) -> Job:
|
|
190
|
+
"""
|
|
191
|
+
Create a new asynchronous job.
|
|
192
|
+
|
|
193
|
+
Args:
|
|
194
|
+
endpoint: The API endpoint to call ("/v1/documents/extract" or "/v1/documents/parse")
|
|
195
|
+
request: The full request body for the target endpoint
|
|
196
|
+
metadata: Optional metadata (max 16 pairs; keys ≤64 chars, values ≤512 chars)
|
|
197
|
+
|
|
198
|
+
Returns:
|
|
199
|
+
Job: The created job with status "queued"
|
|
200
|
+
"""
|
|
201
|
+
prepared = self._prepare_create(endpoint, request, metadata)
|
|
202
|
+
response = await self._client._prepared_request(prepared)
|
|
203
|
+
return Job.model_validate(response)
|
|
204
|
+
|
|
205
|
+
async def retrieve(self, job_id: str) -> Job:
|
|
206
|
+
"""
|
|
207
|
+
Retrieve a job by ID.
|
|
208
|
+
|
|
209
|
+
Args:
|
|
210
|
+
job_id: The job ID to retrieve
|
|
211
|
+
|
|
212
|
+
Returns:
|
|
213
|
+
Job: The job with current status and result (if completed)
|
|
214
|
+
"""
|
|
215
|
+
prepared = self._prepare_retrieve(job_id)
|
|
216
|
+
response = await self._client._prepared_request(prepared)
|
|
217
|
+
return Job.model_validate(response)
|
|
218
|
+
|
|
219
|
+
async def cancel(self, job_id: str) -> Job:
|
|
220
|
+
"""
|
|
221
|
+
Cancel a queued or in-progress job.
|
|
222
|
+
|
|
223
|
+
Args:
|
|
224
|
+
job_id: The job ID to cancel
|
|
225
|
+
|
|
226
|
+
Returns:
|
|
227
|
+
Job: The updated job with status "cancelled"
|
|
228
|
+
"""
|
|
229
|
+
prepared = self._prepare_cancel(job_id)
|
|
230
|
+
response = await self._client._prepared_request(prepared)
|
|
231
|
+
return Job.model_validate(response)
|
|
232
|
+
|
|
233
|
+
async def list(
|
|
234
|
+
self,
|
|
235
|
+
after: str | None = None,
|
|
236
|
+
limit: int = 20,
|
|
237
|
+
status: JobStatus | None = None,
|
|
238
|
+
) -> JobListResponse:
|
|
239
|
+
"""
|
|
240
|
+
List jobs with pagination and optional status filtering.
|
|
241
|
+
|
|
242
|
+
Args:
|
|
243
|
+
after: Pagination cursor (last ID from previous page)
|
|
244
|
+
limit: Number of jobs to return (1-100, default 20)
|
|
245
|
+
status: Filter by job status
|
|
246
|
+
|
|
247
|
+
Returns:
|
|
248
|
+
JobListResponse: List of jobs with pagination info
|
|
249
|
+
"""
|
|
250
|
+
prepared = self._prepare_list(after, limit, status)
|
|
251
|
+
response = await self._client._prepared_request(prepared)
|
|
252
|
+
return JobListResponse.model_validate(response)
|
|
@@ -1,12 +1,14 @@
|
|
|
1
1
|
from .parse import ParseRequest, ParseResult, RetabUsage
|
|
2
|
-
from .split import
|
|
3
|
-
from .classify import ClassifyRequest, ClassifyResult, ClassifyResponse
|
|
2
|
+
from .split import Subdocument, SplitRequest, SplitResult, SplitResponse
|
|
3
|
+
from .classify import ClassifyRequest, ClassifyResult, ClassifyResponse, Category
|
|
4
|
+
|
|
4
5
|
|
|
5
6
|
__all__ = [
|
|
6
7
|
"ParseRequest",
|
|
7
8
|
"ParseResult",
|
|
8
9
|
"RetabUsage",
|
|
9
10
|
"Category",
|
|
11
|
+
"Subdocument",
|
|
10
12
|
"SplitRequest",
|
|
11
13
|
"SplitResult",
|
|
12
14
|
"SplitResponse",
|
|
@@ -1,13 +1,16 @@
|
|
|
1
1
|
from pydantic import BaseModel, Field
|
|
2
2
|
from ..mime import MIMEData
|
|
3
|
-
from .split import Category
|
|
4
3
|
|
|
4
|
+
class Category(BaseModel):
|
|
5
|
+
name: str = Field(..., description="The name of the category")
|
|
6
|
+
description: str = Field(..., description="The description of the category")
|
|
5
7
|
|
|
6
8
|
class ClassifyRequest(BaseModel):
|
|
7
9
|
document: MIMEData = Field(..., description="The document to classify")
|
|
8
10
|
categories: list[Category] = Field(..., description="The categories to classify the document into")
|
|
9
11
|
model: str = Field(default="retab-small", description="The model to use for classification")
|
|
10
12
|
first_n_pages: int | None = Field(default=None, description="Only use the first N pages of the document for classification. Useful for large documents where classification can be determined from early pages.")
|
|
13
|
+
context: str | None = Field(default=None, description="Additional context for classification (e.g., iteration context from a loop)")
|
|
11
14
|
|
|
12
15
|
|
|
13
16
|
class ClassifyResult(BaseModel):
|
retab/types/documents/split.py
CHANGED
|
@@ -1,17 +1,17 @@
|
|
|
1
1
|
from pydantic import BaseModel, Field
|
|
2
2
|
from ..mime import MIMEData
|
|
3
3
|
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
partition_key: str | None = Field(default=None, description="The key to partition the category")
|
|
4
|
+
class Subdocument(BaseModel):
|
|
5
|
+
name: str = Field(..., description="The name of the subdocument")
|
|
6
|
+
description: str = Field(..., description="The description of the subdocument")
|
|
7
|
+
partition_key: str | None = Field(default=None, description="The key to partition the subdocument")
|
|
9
8
|
|
|
10
9
|
|
|
11
10
|
class SplitRequest(BaseModel):
|
|
12
11
|
document: MIMEData = Field(..., description="The document to split")
|
|
13
|
-
|
|
12
|
+
subdocuments: list[Subdocument] = Field(..., description="The subdocuments to split the document into")
|
|
14
13
|
model: str = Field(default="retab-small", description="The model to use to split the document")
|
|
14
|
+
context: str | None = Field(default=None, description="Additional context for the split operation (e.g., iteration context from a loop)")
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
class Partition(BaseModel):
|
|
@@ -21,9 +21,9 @@ class Partition(BaseModel):
|
|
|
21
21
|
last_page_y_end: float = Field(default=1.0, description="The y coordinate of the last page of the partition")
|
|
22
22
|
|
|
23
23
|
class SplitResult(BaseModel):
|
|
24
|
-
name: str = Field(..., description="The name of the
|
|
25
|
-
pages: list[int] = Field(..., description="The pages of the
|
|
26
|
-
partitions: list[Partition] = Field(default_factory=list, description="The partitions of the
|
|
24
|
+
name: str = Field(..., description="The name of the subdocument")
|
|
25
|
+
pages: list[int] = Field(..., description="The pages of the subdocument (1-indexed)")
|
|
26
|
+
partitions: list[Partition] = Field(default_factory=list, description="The partitions of the subdocument")
|
|
27
27
|
|
|
28
28
|
|
|
29
29
|
class SplitResponse(BaseModel):
|
|
@@ -32,14 +32,14 @@ class SplitResponse(BaseModel):
|
|
|
32
32
|
|
|
33
33
|
class SplitOutputItem(BaseModel):
|
|
34
34
|
"""Internal schema item for LLM structured output validation."""
|
|
35
|
-
name: str = Field(..., description="The name of the
|
|
36
|
-
start_page: int = Field(..., description="The start page of the
|
|
37
|
-
end_page: int = Field(..., description="The end page of the
|
|
35
|
+
name: str = Field(..., description="The name of the subdocument")
|
|
36
|
+
start_page: int = Field(..., description="The start page of the subdocument (1-indexed)")
|
|
37
|
+
end_page: int = Field(..., description="The end page of the subdocument (1-indexed, inclusive)")
|
|
38
38
|
|
|
39
39
|
|
|
40
40
|
class SplitOutputSchema(BaseModel):
|
|
41
41
|
"""Schema for LLM structured output."""
|
|
42
42
|
splits: list[SplitOutputItem] = Field(
|
|
43
43
|
...,
|
|
44
|
-
description="List of document sections, each classified into one of the provided
|
|
44
|
+
description="List of document sections, each classified into one of the provided subdocuments with their page ranges"
|
|
45
45
|
)
|
retab/types/jobs.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Jobs API Types
|
|
3
|
+
|
|
4
|
+
Pydantic models for the asynchronous Jobs API.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import Any, Literal
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, Field
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
JobStatus = Literal[
|
|
13
|
+
"validating",
|
|
14
|
+
"queued",
|
|
15
|
+
"in_progress",
|
|
16
|
+
"completed",
|
|
17
|
+
"failed",
|
|
18
|
+
"cancelled",
|
|
19
|
+
"expired",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
SupportedEndpoint = Literal[
|
|
23
|
+
"/v1/documents/extract",
|
|
24
|
+
"/v1/documents/parse",
|
|
25
|
+
"/v1/documents/split",
|
|
26
|
+
"/v1/documents/classify",
|
|
27
|
+
"/v1/schemas/generate",
|
|
28
|
+
"/v1/edit/agent/fill",
|
|
29
|
+
"/v1/edit/templates/fill",
|
|
30
|
+
"/v1/edit/templates/generate",
|
|
31
|
+
"/v1/projects/extract", # Requires "project_id" in request body
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class JobResponse(BaseModel):
|
|
36
|
+
"""Response stored when job completes successfully."""
|
|
37
|
+
status_code: int
|
|
38
|
+
body: dict[str, Any]
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class JobError(BaseModel):
|
|
42
|
+
"""Error details when job fails."""
|
|
43
|
+
code: str
|
|
44
|
+
message: str
|
|
45
|
+
details: dict[str, Any] | None = None
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class Job(BaseModel):
|
|
49
|
+
"""
|
|
50
|
+
Job object representing an asynchronous operation.
|
|
51
|
+
|
|
52
|
+
Use this to track the status of long-running operations like extract, parse,
|
|
53
|
+
split, classify, schema generation, and template operations.
|
|
54
|
+
"""
|
|
55
|
+
id: str
|
|
56
|
+
object: Literal["job"] = "job"
|
|
57
|
+
status: JobStatus
|
|
58
|
+
endpoint: SupportedEndpoint
|
|
59
|
+
request: dict[str, Any]
|
|
60
|
+
response: JobResponse | None = None
|
|
61
|
+
error: JobError | None = None
|
|
62
|
+
|
|
63
|
+
# Timestamps (Unix timestamps)
|
|
64
|
+
created_at: int
|
|
65
|
+
started_at: int | None = None
|
|
66
|
+
completed_at: int | None = None
|
|
67
|
+
expires_at: int
|
|
68
|
+
|
|
69
|
+
# User context
|
|
70
|
+
organization_id: str
|
|
71
|
+
metadata: dict[str, str] | None = None
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class CreateJobRequest(BaseModel):
|
|
75
|
+
"""Request body for creating a new job."""
|
|
76
|
+
endpoint: SupportedEndpoint
|
|
77
|
+
request: dict[str, Any]
|
|
78
|
+
metadata: dict[str, str] | None = Field(
|
|
79
|
+
default=None,
|
|
80
|
+
description="Max 16 pairs; keys ≤64 chars, values ≤512 chars"
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class JobListResponse(BaseModel):
|
|
85
|
+
"""Response for listing jobs."""
|
|
86
|
+
object: Literal["list"] = "list"
|
|
87
|
+
data: list[Job]
|
|
88
|
+
first_id: str | None = None
|
|
89
|
+
last_id: str | None = None
|
|
90
|
+
has_more: bool = False
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
retab/__init__.py,sha256=s4GawWTRBYz4VY-CyAV5-ZdFtdw8V5oopGIYm9GgdSo,188
|
|
2
2
|
retab/_resource.py,sha256=JfAU4UTa05ugWfbrpO7fsVr_pFewht99NkoIfK6kBQM,577
|
|
3
|
-
retab/client.py,sha256=
|
|
3
|
+
retab/client.py,sha256=Ds-Sy3ynN9GusN5rDrc2ogX3ATv-Dq1MuiZeDnLOWGk,30408
|
|
4
4
|
retab/generate_types.py,sha256=cUu1IX65uU__MHivmEb_PZtzAi8DYsvppZvcY30hj90,8425
|
|
5
5
|
retab/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
6
|
retab/resources/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
7
|
retab/resources/models.py,sha256=4WidFBnTGZEA65DSn2pLP2SRnCVXkMTw7o_m8xVCFC4,2469
|
|
8
8
|
retab/resources/schemas.py,sha256=rZ6OzfmoYv-mGaRVzvXjO09dD-KxP74mZhOO8sMgcDQ,4632
|
|
9
9
|
retab/resources/documents/__init__.py,sha256=OjXmngFN0RKqO4SI-mJBNzr6Ex6rMxfq0DxaqzP0RQs,89
|
|
10
|
-
retab/resources/documents/client.py,sha256=
|
|
10
|
+
retab/resources/documents/client.py,sha256=GrVgOM37M3K62PdEXyBQNo1J4oyqXHhafR-hmHwe3Lw,49049
|
|
11
11
|
retab/resources/edit/__init__.py,sha256=yycIstpTSKsz2qXbrY3Buzd35UDcPWvb5hw6Eb2rLow,69
|
|
12
12
|
retab/resources/edit/client.py,sha256=DJKlwh8xui7IDRjwPmiGKTC1_HshXLYXX-xr93FhSbo,1270
|
|
13
13
|
retab/resources/edit/agent/__init__.py,sha256=i5IdOMhwOOQmnhPFeBbh7-ChqwQh5q7oLow1zJ0ZAwM,74
|
|
@@ -16,6 +16,8 @@ retab/resources/edit/templates/__init__.py,sha256=n-zA_HXo7iGgeIclSwcsxmSueXJIRM
|
|
|
16
16
|
retab/resources/edit/templates/client.py,sha256=kEyqat5I84_QBeWSjptteSwvlMGRZ1UF9KDzH7p0f9s,20173
|
|
17
17
|
retab/resources/extractions/__init__.py,sha256=2H1ezUG8hI5SmTRy6NFzXdYLOdGFFsFrI60uzkitV20,97
|
|
18
18
|
retab/resources/extractions/client.py,sha256=sEoNjOgX91FTOgoJUV-I1A9A9xl1ciCdPlhYwjhEjbA,11035
|
|
19
|
+
retab/resources/jobs/__init__.py,sha256=g7WnNAw69CExMSyfxU9ROcSj-KODjxeLe2YlUqi8l0c,69
|
|
20
|
+
retab/resources/jobs/client.py,sha256=Cf7bafUzECqCXbCeKW396Q4fRFOMgjKDtgQ3e_ThIQY,8115
|
|
19
21
|
retab/resources/projects/__init__.py,sha256=tPR3_3tr7bsoYd618qmGjnYN2R23PmF5oCFd7Z5_HGY,85
|
|
20
22
|
retab/resources/projects/client.py,sha256=5LPAhJt5-nqBP4VWYvo0k7cW6HLGF6K9xMiHKQzIXho,15593
|
|
21
23
|
retab/resources/workflows/__init__.py,sha256=-I0QNX7XKEr8ZJTV4-awMyKxZqGlSkKMdibiHiB7cZ0,89
|
|
@@ -25,18 +27,19 @@ retab/resources/workflows/runs/client.py,sha256=GopedV363XnGl0mL3bZHWaOay12uAeTq
|
|
|
25
27
|
retab/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
28
|
retab/types/chat.py,sha256=x9VbtPMa4w6Gc0HrFC3ILl6cCnfEn5ytDnwJtZmlcys,1436
|
|
27
29
|
retab/types/inference_settings.py,sha256=wIivYffvEE7v6lhbjbhAZGssK4uYr64Oq6cZKxzY5_M,1131
|
|
30
|
+
retab/types/jobs.py,sha256=iVCl2EmlvvgvdO217gOGqvkuX-38le89C9XR7gnEb3E,2086
|
|
28
31
|
retab/types/mime.py,sha256=ZLNCD3pvgn5cbGfJwzrdkjgB9dMHCbN67YEV9bx47zE,10063
|
|
29
32
|
retab/types/modality.py,sha256=4B8LctdUBZVgIjtS2FjrJpljn2Eyse0XE1bpFsGb9O4,131
|
|
30
33
|
retab/types/pagination.py,sha256=A0Fw06baPTfEaYwo3kvNs4vaupzlqylBc6tQH-2DFuY,279
|
|
31
34
|
retab/types/standards.py,sha256=7aGtuvzzkKidvqY8JB2Cjfn43V80FeKwrTtp162kjKc,1477
|
|
32
|
-
retab/types/documents/__init__.py,sha256=
|
|
33
|
-
retab/types/documents/classify.py,sha256=
|
|
35
|
+
retab/types/documents/__init__.py,sha256=7vRh1a5WRy1Ue0Tg69CsXeOltGmPawJ6GgAKpRy8huk,451
|
|
36
|
+
retab/types/documents/classify.py,sha256=PBeF5bmCc3GUHxR8hcQ-49DBndO84WWFiuGRcAcpBBo,1626
|
|
34
37
|
retab/types/documents/correct_orientation.py,sha256=e-ivsslI6L6Gl0YkcslXw_DH620xMGEYVp4tdeviXeM,261
|
|
35
38
|
retab/types/documents/create_messages.py,sha256=Uym0SnVUGkyt1C5AOD37BsZ3puyeu_igR6X9SboojfA,7267
|
|
36
39
|
retab/types/documents/edit.py,sha256=b6UcYLOJkClpMu4QyYmdp-X4WtN8U_3oiMBc1KLklVY,5663
|
|
37
40
|
retab/types/documents/extract.py,sha256=x_59fm69-icsxxGRgpFd0NN-SLRoMYqbvfCZuG7zyGc,18033
|
|
38
41
|
retab/types/documents/parse.py,sha256=MXe7zh3DusWQhGe0Sr95nPy6cB8DRX8MA4Hmjj_AP7E,1300
|
|
39
|
-
retab/types/documents/split.py,sha256=
|
|
42
|
+
retab/types/documents/split.py,sha256=z670Ppg7tiu_eBf4zPcsy5l57f_2T5yTYTzacquZpn4,2346
|
|
40
43
|
retab/types/edit/__init__.py,sha256=M8hF97h7fX8RP9IsB6qpkw0eyvO0DFQvP6FmWL8caCQ,331
|
|
41
44
|
retab/types/edit/templates.py,sha256=RLRIMdXzU-5_3XPf0iMSozjRTAP5Tliq0nrjlZn0l8E,2412
|
|
42
45
|
retab/types/extractions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -59,7 +62,7 @@ retab/utils/hashing.py,sha256=_BMVUvftOcJav68QL0rLkH2dbhW9RRJPzeGC2akR0fc,757
|
|
|
59
62
|
retab/utils/json_schema.py,sha256=zP4pQLpVHBKWo_abCjb_dU4kA0azhHopd-1TFUgVEvc,20655
|
|
60
63
|
retab/utils/mime.py,sha256=mTP_lqSPttOP5DYJxopiWaeFXrUCPjhwd7y53nCVGO4,6189
|
|
61
64
|
retab/utils/stream_context_managers.py,sha256=gI1gVQSj3nWz6Mvjz7Ix5AiY0g6vSL-c2tPfuP04izo,2314
|
|
62
|
-
retab-0.0.
|
|
63
|
-
retab-0.0.
|
|
64
|
-
retab-0.0.
|
|
65
|
-
retab-0.0.
|
|
65
|
+
retab-0.0.91.dist-info/METADATA,sha256=f3TPkuDOc9Wmpqjmx8zJpEySKkSq3Re8EXjB7ev92oE,4532
|
|
66
|
+
retab-0.0.91.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
67
|
+
retab-0.0.91.dist-info/top_level.txt,sha256=waQR0EGdhLIQtztoE3AXg7ik5ONQ9q_bsKVpyFuJdq0,6
|
|
68
|
+
retab-0.0.91.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|