retab 0.0.42-py3-none-any.whl → 0.0.43-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- retab/__init__.py +2 -1
- retab/client.py +16 -45
- retab/resources/consensus/client.py +1 -1
- retab/resources/consensus/responses.py +1 -1
- retab/resources/documents/client.py +94 -68
- retab/resources/documents/extractions.py +55 -46
- retab/resources/evaluations/client.py +32 -19
- retab/resources/evaluations/documents.py +12 -11
- retab/resources/evaluations/iterations.py +48 -30
- retab/resources/jsonlUtils.py +3 -4
- retab/resources/processors/automations/endpoints.py +49 -39
- retab/resources/processors/automations/links.py +52 -43
- retab/resources/processors/automations/mailboxes.py +74 -59
- retab/resources/processors/automations/outlook.py +104 -82
- retab/resources/processors/client.py +35 -30
- retab/resources/usage.py +2 -0
- retab/types/ai_models.py +1 -1
- retab/types/deprecated_evals.py +195 -0
- retab/types/evaluations/__init__.py +5 -2
- retab/types/evaluations/iterations.py +9 -43
- retab/types/evaluations/model.py +20 -22
- retab/types/extractions.py +1 -0
- retab/types/logs.py +5 -6
- retab/types/mime.py +1 -10
- retab/types/schemas/enhance.py +22 -5
- retab/types/schemas/evaluate.py +1 -1
- retab/types/schemas/object.py +26 -0
- retab/types/standards.py +2 -2
- retab/utils/__init__.py +3 -0
- retab/utils/ai_models.py +127 -12
- retab/utils/hashing.py +24 -0
- retab/utils/json_schema.py +1 -26
- retab/utils/mime.py +0 -17
- {retab-0.0.42.dist-info → retab-0.0.43.dist-info}/METADATA +3 -5
- {retab-0.0.42.dist-info → retab-0.0.43.dist-info}/RECORD +37 -51
- retab/_utils/__init__.py +0 -0
- retab/_utils/_model_cards/anthropic.yaml +0 -59
- retab/_utils/_model_cards/auto.yaml +0 -43
- retab/_utils/_model_cards/gemini.yaml +0 -117
- retab/_utils/_model_cards/openai.yaml +0 -301
- retab/_utils/_model_cards/xai.yaml +0 -28
- retab/_utils/ai_models.py +0 -138
- retab/_utils/benchmarking.py +0 -484
- retab/_utils/chat.py +0 -327
- retab/_utils/display.py +0 -440
- retab/_utils/json_schema.py +0 -2156
- retab/_utils/mime.py +0 -165
- retab/_utils/responses.py +0 -169
- retab/_utils/stream_context_managers.py +0 -52
- retab/_utils/usage/__init__.py +0 -0
- retab/_utils/usage/usage.py +0 -301
- {retab-0.0.42.dist-info → retab-0.0.43.dist-info}/WHEEL +0 -0
- {retab-0.0.42.dist-info → retab-0.0.43.dist-info}/top_level.txt +0 -0
retab/types/deprecated_evals.py
ADDED
@@ -0,0 +1,195 @@
+import datetime
+from typing import Any, List, Literal, Optional
+
+import nanoid  # type: ignore
+from pydantic import BaseModel, Field, computed_field
+
+from ..utils.json_schema import generate_schema_data_id, generate_schema_id
+from .ai_models import Amount
+from .inference_settings import InferenceSettings
+from .mime import MIMEData
+
+# Define the type alias for MetricType
+MetricType = Literal["levenshtein", "jaccard", "hamming"]
+
+
+# Define the structure for an individual item metric
+class ItemMetric(BaseModel):
+    id: str = Field(description="The ID of the item being measured")
+    name: str = Field(description="The name of the item being measured")
+    similarity: float = Field(description="The similarity score between 0 and 1")
+    similarities: dict[str, Any] = Field(description="The similarity scores for each item in the list")
+    flat_similarities: dict[str, Optional[float]] = Field(description="The similarity scores for each item in the list in dot notation format")
+    aligned_similarity: float = Field(description="The similarity score between 0 and 1, after alignment")
+    aligned_similarities: dict[str, Any] = Field(description="The similarity scores for each item in the list, after alignment")
+    aligned_flat_similarities: dict[str, Optional[float]] = Field(description="The similarity scores for each item in the list in dot notation format, after alignment")
+
+
+# Define the main MetricResult model
+class MetricResult(BaseModel):
+    item_metrics: List[ItemMetric] = Field(description="List of similarity metrics for individual items")
+    mean_similarity: float = Field(description="The average similarity score across all items")
+    aligned_mean_similarity: float = Field(description="The average similarity score across all items, after alignment")
+    metric_type: MetricType = Field(description="The type of similarity metric used for comparison")
+
+
+class DistancesResult(BaseModel):
+    distances: dict[str, Any] = Field(description="List of distances for individual items")
+    mean_distance: float = Field(description="The average distance across all items")
+    metric_type: MetricType = Field(description="The type of distance metric used for comparison")
+
+
+class PredictionMetadata(BaseModel):
+    extraction_id: Optional[str] = Field(default=None, description="The ID of the extraction")
+    likelihoods: Optional[dict[str, Any]] = Field(default=None, description="The likelihoods of the extraction")
+    field_locations: Optional[dict[str, Any]] = Field(default=None, description="The field locations of the extraction")
+    agentic_field_locations: Optional[dict[str, Any]] = Field(default=None, description="The field locations of the extraction extracted by an llm")
+    consensus_details: Optional[list[dict[str, Any]]] = Field(default=None, description="The consensus details of the extraction")
+    api_cost: Optional[Amount] = Field(default=None, description="The cost of the API call for this document (if any -- ground truth for example)")
+
+
+class PredictionData(BaseModel):
+    prediction: dict[str, Any] = Field(default={}, description="The result of the extraction or manual annotation")
+    metadata: Optional[PredictionMetadata] = Field(default=None, description="The metadata of the prediction")
+
+
+class Iteration(BaseModel):
+    id: str = Field(default_factory=lambda: "eval_iter_" + nanoid.generate())
+    inference_settings: InferenceSettings
+    json_schema: dict[str, Any]
+    predictions: list[PredictionData] = Field(default_factory=list, description="The predictions of the iteration for all the documents")
+    metric_results: Optional[MetricResult] = Field(default=None, description="The metric results of the iteration")
+
+    @computed_field  # type: ignore
+    @property
+    def schema_data_id(self) -> str:
+        """Returns the SHA1 hash of the schema data, ignoring all prompt/description/default fields.
+
+        Returns:
+            str: A SHA1 hash string representing the schema data version.
+        """
+        return generate_schema_data_id(self.json_schema)
+
+    # This is a computed field, it is exposed when serializing the object
+    @computed_field  # type: ignore
+    @property
+    def schema_id(self) -> str:
+        """Returns the SHA1 hash of the complete schema.
+
+        Returns:
+            str: A SHA1 hash string representing the complete schema version.
+        """
+        return generate_schema_id(self.json_schema)
+
+
+class AnnotatedDocument(BaseModel):
+    mime_data: MIMEData = Field(
+        description="The mime data of the document. Can also be a BaseMIMEData, which is why we have this id field (to be able to identify the file, but id is equal to mime_data.id)"
+    )
+    annotation: dict[str, Any] = Field(default={}, description="The ground truth of the document")
+
+
+class DocumentItem(AnnotatedDocument):
+    annotation_metadata: Optional[PredictionMetadata] = Field(default=None, description="The metadata of the annotation when the annotation is a prediction")
+
+
+class EvaluationDocument(DocumentItem):
+    id: str = Field(description="The ID of the document. Equal to mime_data.id but robust to the case where mime_data is a BaseMIMEData")
+
+
+class CreateIterationRequest(BaseModel):
+    """
+    Request model for performing a new iteration with custom inference settings and optional JSON schema.
+    """
+
+    inference_settings: InferenceSettings
+    json_schema: Optional[dict[str, Any]] = None
+
+
+class UpdateEvaluationDocumentRequest(BaseModel):
+    annotation: Optional[dict[str, Any]] = Field(default=None, description="The ground truth of the document")
+    annotation_metadata: Optional[PredictionMetadata] = Field(default=None, description="The metadata of the annotation when the annotation is a prediction")
+
+
+class UpdateEvaluationRequest(BaseModel):
+    name: Optional[str] = Field(default=None, description="The name of the document")
+    documents: Optional[list[EvaluationDocument]] = Field(default=None, description="The documents of the evaluation")
+    iterations: Optional[list[Iteration]] = Field(default=None, description="The iterations of the evaluation")
+    json_schema: Optional[dict[str, Any]] = Field(default=None, description="The json schema of the evaluation")
+
+    project_id: Optional[str] = Field(default=None, description="The ID of the project")
+    default_inference_settings: Optional[InferenceSettings] = Field(default=None, description="The default inference properties for the evaluation (mostly used in the frontend)")
+
+    @computed_field  # type: ignore
+    @property
+    def schema_data_id(self) -> Optional[str]:
+        """Returns the SHA1 hash of the schema data, ignoring all prompt/description/default fields.
+
+        Returns:
+            str: A SHA1 hash string representing the schema data version.
+        """
+        if self.json_schema is None:
+            return None
+
+        return generate_schema_data_id(self.json_schema)
+
+    # This is a computed field, it is exposed when serializing the object
+    @computed_field  # type: ignore
+    @property
+    def schema_id(self) -> Optional[str]:
+        """Returns the SHA1 hash of the complete schema.
+
+        Returns:
+            str: A SHA1 hash string representing the complete schema version.
+        """
+        if self.json_schema is None:
+            return None
+        return generate_schema_id(self.json_schema)
+
+
+class Evaluation(BaseModel):
+    id: str = Field(default_factory=lambda: "eval_" + nanoid.generate())
+    updated_at: datetime.datetime = Field(default_factory=lambda: datetime.datetime.now(tz=datetime.timezone.utc))
+
+    name: str
+    old_documents: list[EvaluationDocument] | None = None
+    documents: list[EvaluationDocument]
+    iterations: list[Iteration]
+    json_schema: dict[str, Any]
+
+    project_id: str = Field(description="The ID of the project", default="default_spreadsheets")
+    default_inference_settings: Optional[InferenceSettings] = Field(default=None, description="The default inference properties for the evaluation (mostly used in the frontend)")
+
+    # @field_validator('iterations')
+    # def validate_iterations_content_length(cls: Any, v: list[Iteration], values: Any) -> list[Iteration]:
+    #     if 'ground_truth' in values:
+    #         ground_truth_length = len(values['ground_truth'])
+    #         for iteration in v:
+    #             if len(iteration.content) != ground_truth_length:
+    #                 raise ValueError(f"Iteration content length must match ground_truth length ({ground_truth_length})")
+    #     return v
+
+    @computed_field  # type: ignore
+    @property
+    def schema_data_id(self) -> str:
+        """Returns the SHA1 hash of the schema data, ignoring all prompt/description/default fields.
+
+        Returns:
+            str: A SHA1 hash string representing the schema data version.
+        """
+        return generate_schema_data_id(self.json_schema)
+
+    # This is a computed field, it is exposed when serializing the object
+    @computed_field  # type: ignore
+    @property
+    def schema_id(self) -> str:
+        """Returns the SHA1 hash of the complete schema.
+
+        Returns:
+            str: A SHA1 hash string representing the complete schema version.
+        """
+        return generate_schema_id(self.json_schema)
+
+
+class AddIterationFromJsonlRequest(BaseModel):
+    jsonl_gcs_path: str
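Note on the metric types above: MetricType names three classic string-similarity measures. As a reference for what a 0-to-1 "levenshtein" score such as ItemMetric.similarity can mean, here is a minimal, self-contained sketch; it is an illustration only, not retab's implementation:

```python
# Illustration only: one way to turn edit distance into a 0-1 similarity score.
def levenshtein_similarity(a: str, b: str) -> float:
    if not a and not b:
        return 1.0
    # Classic Wagner-Fischer edit-distance DP, computed row by row.
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, start=1):
        cur = [i]
        for j, cb in enumerate(b, start=1):
            cur.append(min(prev[j] + 1,                 # deletion
                           cur[j - 1] + 1,              # insertion
                           prev[j - 1] + (ca != cb)))   # substitution
        prev = cur
    # Normalize the distance by the longer string's length.
    return 1.0 - prev[-1] / max(len(a), len(b))

print(levenshtein_similarity("invoice", "invoices"))  # 0.875
```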
retab/types/evaluations/__init__.py
CHANGED
@@ -1,6 +1,7 @@
-from .model import Evaluation,
+from .model import Evaluation, BaseEvaluation, CreateEvaluationRequest, PatchEvaluationRequest, ListEvaluationParams
 from .documents import AnnotatedDocument, DocumentItem, EvaluationDocument, CreateEvaluationDocumentRequest, PatchEvaluationDocumentRequest
 from .iterations import (
+    BaseIteration,
     Iteration,
     CreateIterationRequest,
     PatchIterationRequest,
@@ -13,7 +14,8 @@ from .iterations import (
 
 __all__ = [
     "Evaluation",
-    "
+    "BaseEvaluation",
+    "CreateEvaluationRequest",
     "PatchEvaluationRequest",
     "ListEvaluationParams",
     "AnnotatedDocument",
@@ -21,6 +23,7 @@ __all__ = [
     "EvaluationDocument",
     "CreateEvaluationDocumentRequest",
     "PatchEvaluationDocumentRequest",
+    "BaseIteration",
     "Iteration",
     "CreateIterationRequest",
     "PatchIterationRequest",
retab/types/evaluations/iterations.py
CHANGED
@@ -1,59 +1,24 @@
-import copy
 import datetime
-import json
 from typing import Any, Optional, Self
 
 import nanoid  # type: ignore
-from pydantic import BaseModel, Field,
+from pydantic import BaseModel, Field, model_validator
 
-from ...utils.json_schema import clean_schema
-from ...utils.mime import generate_blake2b_hash_from_string
 from ..inference_settings import InferenceSettings
-from ..metrics import MetricResult
 from ..predictions import PredictionData
 
 
-class
+class BaseIteration(BaseModel):
     id: str = Field(default_factory=lambda: "eval_iter_" + nanoid.generate())
+    inference_settings: InferenceSettings
+    json_schema: dict[str, Any]
     updated_at: datetime.datetime = Field(
         default_factory=lambda: datetime.datetime.now(tz=datetime.timezone.utc),
         description="The last update date of inference settings or json schema",
     )
-    inference_settings: InferenceSettings
-    json_schema: dict[str, Any]
-    predictions: dict[str, PredictionData] = Field(default_factory=dict, description="The predictions of the iteration for all the documents")
-    metric_results: Optional[MetricResult] = Field(default=None, description="The metric results of the iteration")
-
-    @computed_field  # type: ignore
-    @property
-    def schema_data_id(self) -> str:
-        """Returns the SHA1 hash of the schema data, ignoring all prompt/description/default fields.
-
-        Returns:
-            str: A SHA1 hash string representing the schema data version.
-        """
-        return "sch_data_id_" + generate_blake2b_hash_from_string(
-            json.dumps(
-                clean_schema(
-                    copy.deepcopy(self.json_schema),
-                    remove_custom_fields=True,
-                    fields_to_remove=["description", "default", "title", "required", "examples", "deprecated", "readOnly", "writeOnly"],
-                ),
-                sort_keys=True,
-            ).strip()
-        )
-
-    # This is a computed field, it is exposed when serializing the object
-    @computed_field  # type: ignore
-    @property
-    def schema_id(self) -> str:
-        """Returns the SHA1 hash of the complete schema.
-
-        Returns:
-            str: A SHA1 hash string representing the complete schema version.
-        """
-        return "sch_id_" + generate_blake2b_hash_from_string(json.dumps(self.json_schema, sort_keys=True).strip())
 
+class Iteration(BaseIteration):
+    predictions: dict[str, PredictionData] = Field(default_factory=dict, description="The predictions of the iteration for all the documents")
 
 class CreateIterationRequest(BaseModel):
     """
@@ -71,13 +36,14 @@ class CreateIterationRequest(BaseModel):
     @model_validator(mode="after")
     def validate_one_of_from_iteration_id_or_json_schema(self) -> Self:
         if (self.from_iteration_id is None) ^ (self.json_schema is None):
-
-
+            return self
+        raise ValueError("Exactly one of from_iteration_id or json_schema must be provided")
 
 
 class PatchIterationRequest(BaseModel):
     inference_settings: Optional[InferenceSettings] = Field(default=None, description="The new inference settings of the iteration")
     json_schema: Optional[dict[str, Any]] = Field(default=None, description="The new json schema of the iteration")
+    version: Optional[int] = Field(default=None, description="Current version for optimistic locking")
 
 
 class ProcessIterationRequest(BaseModel):
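Across this release, the inline blake2b hashing removed above is replaced by two shared helpers imported from retab/utils/json_schema.py. A plausible reconstruction of those helpers, pieced together from the deleted BaseIteration code; the _strip_fields stand-in below only approximates the package's own clean_schema:

```python
# Sketch reconstructed from the removed BaseIteration code; the real helpers
# live in retab/utils/json_schema.py and may differ in detail.
import copy
import hashlib
import json
from typing import Any

PROMPT_FIELDS = ["description", "default", "title", "required", "examples", "deprecated", "readOnly", "writeOnly"]


def _blake2b(s: str) -> str:
    # Matches the 8-byte blake2b digest used by the removed code.
    return hashlib.blake2b(s.encode("utf-8"), digest_size=8).hexdigest()


def _strip_fields(node: Any, fields: set[str]) -> Any:
    # Crude stand-in for retab's clean_schema(remove_custom_fields=True, ...).
    if isinstance(node, dict):
        return {k: _strip_fields(v, fields) for k, v in node.items() if k not in fields}
    if isinstance(node, list):
        return [_strip_fields(item, fields) for item in node]
    return node


def generate_schema_id(json_schema: dict[str, Any]) -> str:
    # Identity of the complete schema, serialized deterministically.
    return "sch_id_" + _blake2b(json.dumps(json_schema, sort_keys=True).strip())


def generate_schema_data_id(json_schema: dict[str, Any]) -> str:
    # Identity of the data shape only: prompt/description/default fields are
    # ignored, so cosmetic prompt edits keep the same schema_data_id.
    cleaned = _strip_fields(copy.deepcopy(json_schema), set(PROMPT_FIELDS))
    return "sch_data_id_" + _blake2b(json.dumps(cleaned, sort_keys=True).strip())
```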
retab/types/evaluations/model.py
CHANGED
@@ -1,31 +1,27 @@
 import datetime
-import json
 from typing import Any, Optional
 
 import nanoid  # type: ignore
 from pydantic import BaseModel, Field, computed_field
 
-from ...utils.json_schema import
-from ...utils.mime import generate_blake2b_hash_from_string
+from ...utils.json_schema import generate_schema_data_id, generate_schema_id
 from ..inference_settings import InferenceSettings
 from .documents import EvaluationDocument
 from .iterations import Iteration
 
-
-# Actual Object stored in DB
-class Evaluation(BaseModel):
+class BaseEvaluation(BaseModel):
     id: str = Field(default_factory=lambda: "eval_" + nanoid.generate())
+    name: str = Field(default="", description="The name of the evaluation")
+    json_schema: dict[str, Any] = Field(default_factory=dict, description="The json schema of the evaluation")
+    project_id: str = Field(description="The ID of the project", default="default_spreadsheets")
+    default_inference_settings: InferenceSettings = Field(default=InferenceSettings(), description="The default inference properties for the evaluation.")
     updated_at: datetime.datetime = Field(default_factory=lambda: datetime.datetime.now(tz=datetime.timezone.utc))
 
-
+
+# Actual Object stored in DB
+class Evaluation(BaseEvaluation):
     documents: list[EvaluationDocument] = Field(default_factory=list)
     iterations: list[Iteration] = Field(default_factory=list)
-    json_schema: dict[str, Any]
-
-    project_id: str = Field(description="The ID of the project", default="default_spreadsheets")
-    default_inference_settings: InferenceSettings = Field(
-        default=InferenceSettings(), description="The default inference properties for the evaluation (mostly used in the frontend)"
-    )
 
     @computed_field  # type: ignore
     @property
@@ -35,7 +31,7 @@ class Evaluation(BaseModel):
         Returns:
             str: A SHA1 hash string representing the schema data version.
         """
-        return
+        return generate_schema_data_id(self.json_schema)
 
     # This is a computed field, it is exposed when serializing the object
     @computed_field  # type: ignore
@@ -46,14 +42,7 @@ class Evaluation(BaseModel):
         Returns:
             str: A SHA1 hash string representing the complete schema version.
         """
-        return
-
-
-class CreateEvaluation(BaseModel):
-    name: str
-    json_schema: dict[str, Any]
-    project_id: str = Field(description="The ID of the project", default="default_spreadsheets")
-    default_inference_settings: InferenceSettings = Field(default=InferenceSettings(), description="The default inference properties for the evaluation.")
+        return generate_schema_id(self.json_schema)
 
 
 class ListEvaluationParams(BaseModel):
@@ -62,6 +51,15 @@ class ListEvaluationParams(BaseModel):
     schema_data_id: Optional[str] = Field(default=None, description="The ID of the schema data")
 
 
+class CreateEvaluationRequest(BaseModel):
+    name: str
+    project_id: str
+    json_schema: dict[str, Any]
+    default_inference_settings: InferenceSettings
+
+
+# This is basically the same as BaseEvaluation, but everything is optional.
+# Could be achieved by convert_basemodel_to_partial_basemodel(BaseEvaluation) but we prefer explicitness
 class PatchEvaluationRequest(BaseModel):
     name: Optional[str] = Field(default=None, description="The name of the document")
     json_schema: Optional[dict[str, Any]] = Field(default=None, description="The json schema of the evaluation")
retab/types/extractions.py
CHANGED
@@ -77,6 +77,7 @@ class Extraction(BaseModel):
         default=None,
         description="The effort level for the model to reason about the input data.",
     )
+    n_consensus: int = Field(default=1, description="Number of consensus models used for the extraction")
     timings: list[ExtractionTimingStep] = Field(default_factory=list, description="Timings of the extraction")
 
     # Infered from the schema
retab/types/logs.py
CHANGED
@@ -7,8 +7,7 @@ from openai.types.chat.chat_completion import ChatCompletion
 from openai.types.chat.chat_completion_reasoning_effort import ChatCompletionReasoningEffort
 from pydantic import BaseModel, EmailStr, Field, HttpUrl, computed_field, field_validator
 
-from ..utils.json_schema import
-from ..utils.mime import generate_blake2b_hash_from_string
+from ..utils.json_schema import generate_schema_data_id, generate_schema_id
 from ..utils.usage.usage import CostBreakdown, compute_cost_from_model, compute_cost_from_model_with_breakdown
 from .ai_models import Amount
 from .documents.extractions import RetabParsedChatCompletion
@@ -47,7 +46,7 @@ class ProcessorConfig(BaseModel):
         Returns:
             str: A SHA1 hash string representing the schema data version.
         """
-        return
+        return generate_schema_data_id(self.json_schema)
 
     # This is a computed field, it is exposed when serializing the object
     @computed_field  # type: ignore
@@ -58,7 +57,7 @@ class ProcessorConfig(BaseModel):
         Returns:
             str: A SHA1 hash string representing the complete schema version.
         """
-        return
+        return generate_schema_id(self.json_schema)
 
 
 class AutomationConfig(BaseModel):
@@ -111,7 +110,7 @@ class UpdateProcessorRequest(BaseModel):
         """
         if self.json_schema is None:
            return None
-        return
+        return generate_schema_data_id(self.json_schema)
 
     @computed_field  # type: ignore
     @property
@@ -123,7 +122,7 @@ class UpdateProcessorRequest(BaseModel):
         """
         if self.json_schema is None:
            return None
-        return
+        return generate_schema_id(self.json_schema)
 
 
 class UpdateAutomationRequest(BaseModel):
retab/types/mime.py
CHANGED
@@ -1,21 +1,12 @@
 import base64
 import datetime
 import gzip
-import hashlib
 import mimetypes
 import re
 from typing import Any, Optional, Self, Sequence
 
 from pydantic import BaseModel, Field, field_validator
-
-
-def generate_blake2b_hash_from_bytes(bytes_: bytes) -> str:
-    return hashlib.blake2b(bytes_, digest_size=8).hexdigest()
-
-
-def generate_blake2b_hash_from_base64(base64_string: str) -> str:
-    return generate_blake2b_hash_from_bytes(base64.b64decode(base64_string))
-
+from ..utils.hashing import generate_blake2b_hash_from_base64
 
 # **** OCR DATACLASSES (DocumentAI-compatible) ****
 class Point(BaseModel):
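The helpers deleted from mime.py move into the new retab/utils/hashing.py (+24 lines in the file list above). A plausible reconstruction of that module, with the string variant (previously imported from utils.mime elsewhere in the old code) included as an assumption:

```python
# Plausible contents of the new retab/utils/hashing.py, inferred from the
# functions deleted here; the real module may add more helpers.
import base64
import hashlib


def generate_blake2b_hash_from_bytes(bytes_: bytes) -> str:
    # 8-byte blake2b digest, verbatim from the code removed from mime.py.
    return hashlib.blake2b(bytes_, digest_size=8).hexdigest()


def generate_blake2b_hash_from_base64(base64_string: str) -> str:
    return generate_blake2b_hash_from_bytes(base64.b64decode(base64_string))


def generate_blake2b_hash_from_string(input_string: str) -> str:
    # Assumed: the string variant the old iterations.py/model.py imported from
    # utils.mime; an encode-then-hash bridge is the obvious shape.
    return generate_blake2b_hash_from_bytes(input_string.encode("utf-8"))
```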
retab/types/schemas/enhance.py
CHANGED
@@ -1,6 +1,8 @@
 from typing import Any, Self, TypedDict
 
-from openai.types.chat.chat_completion_reasoning_effort import
+from openai.types.chat.chat_completion_reasoning_effort import (
+    ChatCompletionReasoningEffort,
+)
 from pydantic import BaseModel, Field, model_validator
 
 from ..mime import MIMEData
@@ -9,23 +11,35 @@ from ..browser_canvas import BrowserCanvas
 
 
 class EnhanceSchemaConfig(BaseModel):
+    allow_reasoning_fields_added: bool = True  # Whether to allow the llm to add reasoning fields
     allow_field_description_update: bool = False  # Whether to allow the llm to update the description of existing fields
     allow_system_prompt_update: bool = True  # Whether to allow the llm to update the system prompt
-
+    allow_field_simple_type_change: bool = False  # Whether to allow the llm to make simple type changes (optional, string to date, etc.)
+    allow_field_data_structure_breakdown: bool = False  # Whether to allow the llm to make complex data-structure changes (raw diff)
 
     # Model validator
     @model_validator(mode="after")
     def check_at_least_one_tool_allowed(self) -> Self:
-        if not any(
+        if not any(
+            [
+                self.allow_reasoning_fields_added,
+                self.allow_field_description_update,
+                self.allow_system_prompt_update,
+                self.allow_field_simple_type_change,
+                self.allow_field_data_structure_breakdown,
+            ]
+        ):
            raise ValueError("At least one tool must be allowed")
        return self
 
 
 # Define a typed Dict for EnhanceSchemaConfig (for now it is kind static, but we will add more flexibility in the future)
 class EnhanceSchemaConfigDict(TypedDict, total=False):
+    allow_reasoning_fields_added: bool
     allow_field_description_update: bool
     allow_system_prompt_update: bool
-
+    allow_field_simple_type_change: bool
+    allow_field_data_structure_breakdown: bool
 
 
 class EnhanceSchemaRequest(BaseModel):
@@ -48,7 +62,10 @@ class EnhanceSchemaRequest(BaseModel):
     stream: bool = False
     """Whether to stream the response."""
 
-    tools_config: EnhanceSchemaConfig = Field(
+    tools_config: EnhanceSchemaConfig = Field(
+        default_factory=EnhanceSchemaConfig,
+        description="The configuration for the tools to use",
+    )
 
     json_schema: dict[str, Any]
     instructions: str | None = None
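With the two new allow_field_* switches, check_at_least_one_tool_allowed now guards five flags. A quick usage sketch (in pydantic v2 the ValueError surfaces wrapped in a ValidationError, which itself subclasses ValueError):

```python
from retab.types.schemas.enhance import EnhanceSchemaConfig

# Defaults keep reasoning-field additions and system-prompt updates enabled.
config = EnhanceSchemaConfig()
print(config.allow_reasoning_fields_added)  # True

# Disabling every tool trips the validator added in this diff.
try:
    EnhanceSchemaConfig(
        allow_reasoning_fields_added=False,
        allow_field_description_update=False,
        allow_system_prompt_update=False,
        allow_field_simple_type_change=False,
        allow_field_data_structure_breakdown=False,
    )
except ValueError as exc:
    print(exc)  # mentions "At least one tool must be allowed"
```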
retab/types/schemas/evaluate.py
CHANGED
@@ -3,7 +3,7 @@ from typing import Any, Self
 from openai.types.chat.chat_completion_reasoning_effort import ChatCompletionReasoningEffort
 from pydantic import BaseModel, Field, model_validator
 
-from ..
+from ..metrics import ItemMetric
 from ..mime import MIMEData
 from ..modalities import Modality
 from ..browser_canvas import BrowserCanvas
retab/types/schemas/object.py
CHANGED
@@ -233,6 +233,32 @@ When provided with a **JSON schema** and a **document**, you must:
 
 ---
 
+## Date and Time Formatting
+
+When extracting date, time, or datetime values:
+
+- **Always use ISO format** for dates and times (e.g., "2023-12-25", "14:30:00", "2023-12-25T14:30:00Z")
+- **Include timezone information** when available (e.g., "2023-12-25T14:30:00+02:00")
+- **Use UTC timezone** when timezone is not specified or unclear (e.g., "2023-12-25T14:30:00Z")
+- **Maintain precision** as found in the source document (seconds, milliseconds if present)
+
+**Examples:**
+
+```json
+// Correct ISO formats:
+{"date": "2023-12-25"}
+{"time": "14:30:00"}
+{"datetime": "2023-12-25T14:30:00Z"}
+{"datetime_with_tz": "2023-12-25T14:30:00+02:00"}
+
+// Incorrect formats:
+{"date": "12/25/2023"}
+{"time": "2:30 PM"}
+{"datetime": "Dec 25, 2023 at 2:30 PM"}
+```
+
+---
+
 ## Handling Missing and Nullable Fields
 
 ### Nullable Leaf Attributes
retab/types/standards.py
CHANGED
@@ -1,14 +1,14 @@
 from typing import Any, List, Literal, Optional, Tuple, TypeVar, TypedDict
 
 from pydantic import BaseModel, Field
-from
+from pydantic_core import PydanticUndefined
 
 # API Standards
 
 # Define a type variable to represent the content type
 T = TypeVar("T")
 
-FieldUnset =
+FieldUnset: Any = PydanticUndefined
 
 
 # Define the ErrorDetail model
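FieldUnset now aliases pydantic's PydanticUndefined sentinel. One common use of such a sentinel is separating "argument not supplied" from an explicit None; a sketch under that assumption (the helper below is hypothetical, not retab API):

```python
from typing import Any

from pydantic_core import PydanticUndefined

FieldUnset: Any = PydanticUndefined


# Hypothetical helper: only arguments the caller actually supplied survive,
# while an explicit None still passes through (unlike a falsy check).
def build_patch_payload(name: Any = FieldUnset, json_schema: Any = FieldUnset) -> dict[str, Any]:
    candidates = {"name": name, "json_schema": json_schema}
    return {key: value for key, value in candidates.items() if value is not FieldUnset}


print(build_patch_payload(name="invoices"))  # {'name': 'invoices'}
print(build_patch_payload(name=None))        # {'name': None}
print(build_patch_payload())                 # {}
```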
retab/utils/__init__.py
CHANGED