retab-0.0.35-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- retab-0.0.35.dist-info/METADATA +417 -0
- retab-0.0.35.dist-info/RECORD +111 -0
- retab-0.0.35.dist-info/WHEEL +5 -0
- retab-0.0.35.dist-info/top_level.txt +1 -0
- uiform/__init__.py +4 -0
- uiform/_resource.py +28 -0
- uiform/_utils/__init__.py +0 -0
- uiform/_utils/ai_models.py +100 -0
- uiform/_utils/benchmarking copy.py +588 -0
- uiform/_utils/benchmarking.py +485 -0
- uiform/_utils/chat.py +332 -0
- uiform/_utils/display.py +443 -0
- uiform/_utils/json_schema.py +2161 -0
- uiform/_utils/mime.py +168 -0
- uiform/_utils/responses.py +163 -0
- uiform/_utils/stream_context_managers.py +52 -0
- uiform/_utils/usage/__init__.py +0 -0
- uiform/_utils/usage/usage.py +300 -0
- uiform/client.py +701 -0
- uiform/py.typed +0 -0
- uiform/resources/__init__.py +0 -0
- uiform/resources/consensus/__init__.py +3 -0
- uiform/resources/consensus/client.py +114 -0
- uiform/resources/consensus/completions.py +252 -0
- uiform/resources/consensus/completions_stream.py +278 -0
- uiform/resources/consensus/responses.py +325 -0
- uiform/resources/consensus/responses_stream.py +373 -0
- uiform/resources/deployments/__init__.py +9 -0
- uiform/resources/deployments/client.py +78 -0
- uiform/resources/deployments/endpoints.py +322 -0
- uiform/resources/deployments/links.py +452 -0
- uiform/resources/deployments/logs.py +211 -0
- uiform/resources/deployments/mailboxes.py +496 -0
- uiform/resources/deployments/outlook.py +531 -0
- uiform/resources/deployments/tests.py +158 -0
- uiform/resources/documents/__init__.py +3 -0
- uiform/resources/documents/client.py +255 -0
- uiform/resources/documents/extractions.py +441 -0
- uiform/resources/evals.py +812 -0
- uiform/resources/files.py +24 -0
- uiform/resources/finetuning.py +62 -0
- uiform/resources/jsonlUtils.py +1046 -0
- uiform/resources/models.py +45 -0
- uiform/resources/openai_example.py +22 -0
- uiform/resources/processors/__init__.py +3 -0
- uiform/resources/processors/automations/__init__.py +9 -0
- uiform/resources/processors/automations/client.py +78 -0
- uiform/resources/processors/automations/endpoints.py +317 -0
- uiform/resources/processors/automations/links.py +356 -0
- uiform/resources/processors/automations/logs.py +211 -0
- uiform/resources/processors/automations/mailboxes.py +435 -0
- uiform/resources/processors/automations/outlook.py +444 -0
- uiform/resources/processors/automations/tests.py +158 -0
- uiform/resources/processors/client.py +474 -0
- uiform/resources/prompt_optimization.py +76 -0
- uiform/resources/schemas.py +369 -0
- uiform/resources/secrets/__init__.py +9 -0
- uiform/resources/secrets/client.py +20 -0
- uiform/resources/secrets/external_api_keys.py +109 -0
- uiform/resources/secrets/webhook.py +62 -0
- uiform/resources/usage.py +271 -0
- uiform/types/__init__.py +0 -0
- uiform/types/ai_models.py +645 -0
- uiform/types/automations/__init__.py +0 -0
- uiform/types/automations/cron.py +58 -0
- uiform/types/automations/endpoints.py +21 -0
- uiform/types/automations/links.py +28 -0
- uiform/types/automations/mailboxes.py +60 -0
- uiform/types/automations/outlook.py +68 -0
- uiform/types/automations/webhooks.py +21 -0
- uiform/types/chat.py +8 -0
- uiform/types/completions.py +93 -0
- uiform/types/consensus.py +10 -0
- uiform/types/db/__init__.py +0 -0
- uiform/types/db/annotations.py +24 -0
- uiform/types/db/files.py +36 -0
- uiform/types/deployments/__init__.py +0 -0
- uiform/types/deployments/cron.py +59 -0
- uiform/types/deployments/endpoints.py +28 -0
- uiform/types/deployments/links.py +36 -0
- uiform/types/deployments/mailboxes.py +67 -0
- uiform/types/deployments/outlook.py +76 -0
- uiform/types/deployments/webhooks.py +21 -0
- uiform/types/documents/__init__.py +0 -0
- uiform/types/documents/correct_orientation.py +13 -0
- uiform/types/documents/create_messages.py +226 -0
- uiform/types/documents/extractions.py +297 -0
- uiform/types/evals.py +207 -0
- uiform/types/events.py +76 -0
- uiform/types/extractions.py +85 -0
- uiform/types/jobs/__init__.py +0 -0
- uiform/types/jobs/base.py +150 -0
- uiform/types/jobs/batch_annotation.py +22 -0
- uiform/types/jobs/evaluation.py +133 -0
- uiform/types/jobs/finetune.py +6 -0
- uiform/types/jobs/prompt_optimization.py +41 -0
- uiform/types/jobs/webcrawl.py +6 -0
- uiform/types/logs.py +231 -0
- uiform/types/mime.py +257 -0
- uiform/types/modalities.py +68 -0
- uiform/types/pagination.py +6 -0
- uiform/types/schemas/__init__.py +0 -0
- uiform/types/schemas/enhance.py +53 -0
- uiform/types/schemas/evaluate.py +55 -0
- uiform/types/schemas/generate.py +32 -0
- uiform/types/schemas/layout.py +58 -0
- uiform/types/schemas/object.py +631 -0
- uiform/types/schemas/templates.py +107 -0
- uiform/types/secrets/__init__.py +0 -0
- uiform/types/secrets/external_api_keys.py +22 -0
- uiform/types/standards.py +39 -0
uiform/types/evals.py
ADDED
@@ -0,0 +1,207 @@
+import copy
+import datetime
+import json
+from typing import Any, List, Literal, Optional, Union
+
+import nanoid  # type: ignore
+from pydantic import BaseModel, Field, computed_field
+
+
+from .._utils.json_schema import clean_schema, compute_schema_data_id
+from .._utils.mime import generate_blake2b_hash_from_string
+from .ai_models import Amount, LLMModel
+from .jobs.base import InferenceSettings
+from .mime import MIMEData
+
+
+# Define the type alias for MetricType
+MetricType = Literal["levenshtein", "jaccard", "hamming"]
+
+
+# Define the structure for an individual item metric
+class ItemMetric(BaseModel):
+    id: str = Field(description="The ID of the item being measured")
+    name: str = Field(description="The name of the item being measured")
+    similarity: float = Field(description="The similarity score between 0 and 1")
+    similarities: dict[str, Any] = Field(description="The similarity scores for each item in the list")
+    flat_similarities: dict[str, Optional[float]] = Field(description="The similarity scores for each item in the list in dot notation format")
+    aligned_similarity: float = Field(description="The similarity score between 0 and 1, after alignment")
+    aligned_similarities: dict[str, Any] = Field(description="The similarity scores for each item in the list, after alignment")
+    aligned_flat_similarities: dict[str, Optional[float]] = Field(description="The similarity scores for each item in the list in dot notation format, after alignment")
+
+
+# Define the main MetricResult model
+class MetricResult(BaseModel):
+    item_metrics: List[ItemMetric] = Field(description="List of similarity metrics for individual items")
+    mean_similarity: float = Field(description="The average similarity score across all items")
+    aligned_mean_similarity: float = Field(description="The average similarity score across all items, after alignment")
+    metric_type: MetricType = Field(description="The type of similarity metric used for comparison")
+
+
+class DistancesResult(BaseModel):
+    distances: dict[str, Any] = Field(description="List of distances for individual items")
+    mean_distance: float = Field(description="The average distance across all items")
+    metric_type: MetricType = Field(description="The type of distance metric used for comparison")
+
+
+class PredictionMetadata(BaseModel):
+    extraction_id: Optional[str] = Field(default=None, description="The ID of the extraction")
+    likelihoods: Optional[dict[str, Any]] = Field(default=None, description="The likelihoods of the extraction")
+    field_locations: Optional[dict[str, Any]] = Field(default=None, description="The field locations of the extraction")
+    agentic_field_locations: Optional[dict[str, Any]] = Field(default=None, description="The field locations of the extraction extracted by an llm")
+    consensus_details: Optional[list[dict[str, Any]]] = Field(default=None, description="The consensus details of the extraction")
+    api_cost: Optional[Amount] = Field(default=None, description="The cost of the API call for this document (if any -- ground truth for example)")
+
+
+class PredictionData(BaseModel):
+    prediction: dict[str, Any] = Field(default={}, description="The result of the extraction or manual annotation")
+    metadata: Optional[PredictionMetadata] = Field(default=None, description="The metadata of the prediction")
+
+
+class Iteration(BaseModel):
+    id: str = Field(default_factory=lambda: "eval_iter_" + nanoid.generate())
+    inference_settings: InferenceSettings
+    json_schema: dict[str, Any]
+    predictions: list[PredictionData] = Field(default_factory=list, description="The predictions of the iteration for all the documents")
+    metric_results: Optional[MetricResult] = Field(default=None, description="The metric results of the iteration")
+
+    @computed_field  # type: ignore
+    @property
+    def schema_data_id(self) -> str:
+        """Returns the SHA1 hash of the schema data, ignoring all prompt/description/default fields.
+
+        Returns:
+            str: A SHA1 hash string representing the schema data version.
+        """
+        return "sch_data_id_" + generate_blake2b_hash_from_string(
+            json.dumps(
+                clean_schema(
+                    copy.deepcopy(self.json_schema),
+                    remove_custom_fields=True,
+                    fields_to_remove=["description", "default", "title", "required", "examples", "deprecated", "readOnly", "writeOnly"],
+                ),
+                sort_keys=True,
+            ).strip()
+        )
+
+    # This is a computed field, it is exposed when serializing the object
+    @computed_field  # type: ignore
+    @property
+    def schema_id(self) -> str:
+        """Returns the SHA1 hash of the complete schema.
+
+        Returns:
+            str: A SHA1 hash string representing the complete schema version.
+        """
+        return "sch_id_" + generate_blake2b_hash_from_string(json.dumps(self.json_schema, sort_keys=True).strip())
+
+
+class AnnotatedDocument(BaseModel):
+    mime_data: MIMEData = Field(
+        description="The mime data of the document. Can also be a BaseMIMEData, which is why we have this id field (to be able to identify the file, but id is equal to mime_data.id)"
+    )
+    annotation: dict[str, Any] = Field(default={}, description="The ground truth of the document")
+
+
+class DocumentItem(AnnotatedDocument):
+    annotation_metadata: Optional[PredictionMetadata] = Field(default=None, description="The metadata of the annotation when the annotation is a prediction")
+
+
+class EvaluationDocument(DocumentItem):
+    id: str = Field(description="The ID of the document. Equal to mime_data.id but robust to the case where mime_data is a BaseMIMEData")
+
+
+class CreateIterationRequest(BaseModel):
+    """
+    Request model for performing a new iteration with custom inference settings and optional JSON schema.
+    """
+
+    inference_settings: InferenceSettings
+    json_schema: Optional[dict[str, Any]] = None
+
+
+class UpdateEvaluationDocumentRequest(BaseModel):
+    annotation: Optional[dict[str, Any]] = Field(default=None, description="The ground truth of the document")
+    annotation_metadata: Optional[PredictionMetadata] = Field(default=None, description="The metadata of the annotation when the annotation is a prediction")
+
+
+class UpdateEvaluationRequest(BaseModel):
+    name: Optional[str] = Field(default=None, description="The name of the document")
+    documents: Optional[list[EvaluationDocument]] = Field(default=None, description="The documents of the evaluation")
+    iterations: Optional[list[Iteration]] = Field(default=None, description="The iterations of the evaluation")
+    json_schema: Optional[dict[str, Any]] = Field(default=None, description="The json schema of the evaluation")
+
+    project_id: Optional[str] = Field(default=None, description="The ID of the project")
+
+    @computed_field  # type: ignore
+    @property
+    def schema_data_id(self) -> Optional[str]:
+        """Returns the SHA1 hash of the schema data, ignoring all prompt/description/default fields.
+
+        Returns:
+            str: A SHA1 hash string representing the schema data version.
+        """
+        if self.json_schema is None:
+            return None
+
+        return compute_schema_data_id(self.json_schema)
+
+    # This is a computed field, it is exposed when serializing the object
+    @computed_field  # type: ignore
+    @property
+    def schema_id(self) -> Optional[str]:
+        """Returns the SHA1 hash of the complete schema.
+
+        Returns:
+            str: A SHA1 hash string representing the complete schema version.
+        """
+        if self.json_schema is None:
+            return None
+        return "sch_id_" + generate_blake2b_hash_from_string(json.dumps(self.json_schema, sort_keys=True).strip())
+
+
+class Evaluation(BaseModel):
+    id: str = Field(default_factory=lambda: "eval_" + nanoid.generate())
+    updated_at: datetime.datetime = Field(default_factory=lambda: datetime.datetime.now(tz=datetime.timezone.utc))
+
+    name: str
+    documents: list[EvaluationDocument]
+    iterations: list[Iteration]
+    json_schema: dict[str, Any]
+
+    project_id: str = Field(description="The ID of the project", default="default_spreadsheets")
+    default_inference_settings: Optional[InferenceSettings] = Field(default=None, description="The default inference properties for the evaluation (mostly used in the frontend)")
+
+    # @field_validator('iterations')
+    # def validate_iterations_content_length(cls: Any, v: list[Iteration], values: Any) -> list[Iteration]:
+    #     if 'ground_truth' in values:
+    #         ground_truth_length = len(values['ground_truth'])
+    #         for iteration in v:
+    #             if len(iteration.content) != ground_truth_length:
+    #                 raise ValueError(f"Iteration content length must match ground_truth length ({ground_truth_length})")
+    #     return v
+
+    @computed_field  # type: ignore
+    @property
+    def schema_data_id(self) -> str:
+        """Returns the SHA1 hash of the schema data, ignoring all prompt/description/default fields.
+
+        Returns:
+            str: A SHA1 hash string representing the schema data version.
+        """
+        return compute_schema_data_id(self.json_schema)
+
+    # This is a computed field, it is exposed when serializing the object
+    @computed_field  # type: ignore
+    @property
+    def schema_id(self) -> str:
+        """Returns the SHA1 hash of the complete schema.
+
+        Returns:
+            str: A SHA1 hash string representing the complete schema version.
+        """
+        return "sch_id_" + generate_blake2b_hash_from_string(json.dumps(self.json_schema, sort_keys=True).strip())
+
+
+class AddIterationFromJsonlRequest(BaseModel):
+    jsonl_gcs_path: str
uiform/types/events.py
ADDED
@@ -0,0 +1,76 @@
+import datetime
+from typing import Any, Literal, Optional
+
+import nanoid  # type: ignore
+from pydantic import BaseModel, Field
+
+metadata_key = Literal[
+    'user',
+    'organization',
+    'link',
+    'mailbox',
+    'cron',
+    'outlook',
+    'extraction',
+    'webhook',
+    'reconciliation',
+    'preprocessing',
+    'schema',
+    'data_structure',
+    'file',
+    'preprocessing',
+    'dataset',
+    'dataset_membership',
+    'endpoint',
+    'automation',
+    'template',
+]
+
+event_type = Literal[
+    'extraction.created',
+    'messages.created',
+    'document.orientation_corrected',
+    'consensus.reconciled',
+    'automation.created',
+    'automation.updated',
+    'automation.deleted',
+    'automation.webhook',
+    'preprocessing.created',
+    'link.created',
+    'link.updated',
+    'link.deleted',
+    'link.webhook',
+    'mailbox.created',
+    'mailbox.updated',
+    'mailbox.deleted',
+    'mailbox.webhook',
+    'outlook.created',
+    'outlook.updated',
+    'outlook.deleted',
+    'outlook.webhook',
+    'schema.generated',
+    'schema.promptified',
+    'schema.system_promptfile.created',
+    'file.updated',
+    'file.deleted',
+    'template.created',
+    'template.deleted',
+    'template.sample_document_uploaded',
+    'template.sample_document_deleted',
+    'template.updated',
+]
+
+
+class Event(BaseModel):
+    object: Literal['event'] = "event"
+    id: str = Field(default_factory=lambda: "event_" + nanoid.generate(), description="Unique identifier for the event")
+    event: str = Field(..., description="A string that distinguishes the event type. Ex: user.created, user.updated, user.deleted, etc.")
+    created_at: datetime.datetime = Field(default_factory=lambda: datetime.datetime.now(datetime.timezone.utc))
+    data: dict[str, Any] = Field(..., description="Event payload. Payloads match the corresponding API objects.")
+    metadata: Optional[dict[metadata_key, str]] = Field(
+        default=None, description="Ids giving informations about the event. Ex: user.created.metadata = {'user': 'usr_8478973619047837'}"
+    )
+
+
+class StoredEvent(Event):
+    organization_id: str = Field(..., description="Organization ID")
uiform/types/extractions.py
ADDED
@@ -0,0 +1,85 @@
+import datetime
+from typing import Any, Literal, Optional
+
+import nanoid  # type: ignore
+from openai.types.chat import ChatCompletion
+from openai.types.chat.chat_completion_reasoning_effort import ChatCompletionReasoningEffort
+from pydantic import BaseModel, Field, computed_field
+
+from uiform.types.chat import ChatCompletionUiformMessage
+from uiform.types.documents.extractions import UiParsedChatCompletion
+
+from .._utils.usage.usage import compute_cost_from_model, compute_cost_from_model_with_breakdown, CostBreakdown
+from .ai_models import Amount
+from .modalities import Modality
+
+ValidationsState = Literal["pending", "validated", "invalid"]
+
+
+class ExtractionSource(BaseModel):
+    type: Literal["api", "annotation", "processor", "automation.link", "automation.mailbox", "automation.cron", "automation.outlook", "automation.endpoint", "schema.extract"] = Field(
+        description="Type of extraction"
+    )
+    id: str | None = Field(default=None, description="ID the trigger of the extraction")
+
+
+ExtractionSteps = str | Literal['initialization', 'prepare_messages', 'yield_first_token', 'completion']  # Steps are meant to not overlap
+
+
+class ExtractionTimingStep(BaseModel):
+    name: ExtractionSteps
+    duration: float  # in seconds
+    notes: str | None = None
+
+
+class Extraction(BaseModel):
+    id: str = Field(default_factory=lambda: "extr_" + nanoid.generate(), description="Unique identifier of the analysis")
+    messages: list[ChatCompletionUiformMessage] = Field(default_factory=list)
+    messages_gcs: str = Field(..., description="GCS path to the messages")
+    file_gcs: str = Field(..., description="GCS path to the file")
+    file_id: str = Field(..., description="ID of the file")
+
+    status: Literal["success", "failed"] = Field(..., description="Whether the analysis was successful")
+    completion: UiParsedChatCompletion | ChatCompletion = Field(..., description="Response generated by the analysis")
+    json_schema: Any = Field(..., description="Response format (JSON Schema or pydantic_v2.BaseModel)")
+    model: str = Field(..., description="Model used for the analysis")
+    temperature: float = Field(default=0.0, description="Temperature used for the analysis")
+    source: ExtractionSource = Field(..., description="Source of the extraction")
+    image_resolution_dpi: int = Field(default=96, description="Resolution of the image sent to the LLM")
+    browser_canvas: Literal['A3', 'A4', 'A5'] = Field(default='A4', description="Sets the size of the browser canvas for rendering documents in browser-based processing. Choose a size that matches the document type.")
+    modality: Modality = Field(default="native", description="Modality of the extraction")
+    reasoning_effort: Optional[ChatCompletionReasoningEffort] = Field(default=None, description="The effort level for the model to reason about the input data.")
+    timings: list[ExtractionTimingStep] = Field(default_factory=list, description="Timings of the extraction")
+
+    # Infered from the schema
+    schema_id: str = Field(..., description="Version of the schema used for the analysis")
+    schema_data_id: str = Field(..., description="Version of the schema data used for the analysis")
+    created_at: datetime.datetime = Field(default_factory=lambda: datetime.datetime.now(datetime.timezone.utc), description="Timestamp of the creation of the extraction object")
+    request_at: datetime.datetime | None = Field(default=None, description="Timestamp of the extraction request if provided.")
+    organization_id: str = Field(..., description="Organization ID of the user or application")
+    validation_state: Optional[ValidationsState] = Field(default=None, description="Validation state of the extraction")
+    billed: bool = Field(default=False, description="Whether the extraction has been billed or not")
+
+    @computed_field
+    @property
+    def api_cost(self) -> Optional[Amount]:
+        if self.completion and self.completion.usage:
+            try:
+                cost = compute_cost_from_model(self.completion.model, self.completion.usage)
+                return cost
+            except Exception as e:
+                print(f"Error computing cost: {e}")
+                return None
+        return None
+
+    @computed_field  # type: ignore
+    @property
+    def cost_breakdown(self) -> Optional[CostBreakdown]:
+        if self.completion and self.completion.usage:
+            try:
+                cost = compute_cost_from_model_with_breakdown(self.completion.model, self.completion.usage)
+                return cost
+            except Exception as e:
+                print(f"Error computing cost: {e}")
+                return None
+        return None
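
Usage sketch (editorial illustration, not part of the published diff) for the helper models above; a full Extraction additionally requires a ChatCompletion-style completion, and its api_cost / cost_breakdown computed fields are only non-None when completion.usage is set and the model is known to the pricing helpers.

# Illustrative sketch only -- not from the package; assumes this wheel is installed.
from uiform.types.extractions import ExtractionSource, ExtractionTimingStep

source = ExtractionSource(type="automation.mailbox", id="mb_example")  # hypothetical trigger id
timings = [
    ExtractionTimingStep(name="prepare_messages", duration=0.42),
    ExtractionTimingStep(name="completion", duration=3.1, notes="hypothetical timing"),
]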
uiform/types/jobs/__init__.py
File without changes
uiform/types/jobs/base.py
ADDED
@@ -0,0 +1,150 @@
+from typing import Literal, Optional, Self
+
+from openai.types.chat.chat_completion_reasoning_effort import ChatCompletionReasoningEffort
+from pydantic import BaseModel, Field, model_validator
+
+from ..modalities import Modality
+
+SelectionMode = Literal["all", "manual"]
+
+
+# This is the input data for the prepare_dataset job
+class PrepareDatasetInputData(BaseModel):
+    dataset_id: Optional[str] = None
+    schema_id: Optional[str] = None
+    schema_data_id: Optional[str] = None
+
+    selection_model: SelectionMode = "all"
+
+    @model_validator(mode="after")
+    def validate_input(self) -> Self:
+        # The preference is:
+        # 1. dataset_id
+        # 2. schema_id
+        # 3. schema_data_id
+        if self.dataset_id is None and self.schema_id is None and self.schema_data_id is None:
+            raise ValueError("At least one of dataset_id, schema_id, or schema_data_id must be provided")
+
+        return self
+
+
+# This is the input data for the split_dataset job
+class DatasetSplitInputData(BaseModel):
+    dataset_id: str
+    train_size: Optional[int | float] = None
+    eval_size: Optional[int | float] = None
+
+    @model_validator(mode="after")
+    def validate_input(self) -> Self:
+        if self.train_size is not None and self.eval_size is not None:
+            raise ValueError("train_size and eval_size cannot both be provided")
+        return self
+
+
+# This is the input data for the batch annotation job
+class InferenceSettings(BaseModel):
+    model: str = "gpt-4o-mini"
+    temperature: float = 0.0
+    modality: Modality = "native"
+    reasoning_effort: ChatCompletionReasoningEffort = "medium"
+    image_resolution_dpi: int = 96
+    browser_canvas: Literal['A3', 'A4', 'A5'] = 'A4'
+    n_consensus: int = Field(default=1, description="Number of consensus rounds to perform")
+
+
+class AnnotationInputData(BaseModel):
+    data_file: str
+    schema_id: str
+    inference_settings: InferenceSettings
+
+
+# This is the input data for the evaluation job
+class EvaluationInputData(BaseModel):
+    eval_data_file: str
+    schema_id: str
+    inference_settings_1: InferenceSettings | None = None
+    inference_settings_2: InferenceSettings
+
+
+# from pydantic import BaseModel, Field, model_validator
+# from typing import Literal, Optional, Any
+# import datetime
+
+
+# JobType = Literal["prompt-optimization", "annotate-files", "finetune-dataset", "webcrawl"]
+# JobStatus = Literal["pending", "running", "completed", "failed"]
+#### JOBS ####
+
+# class JobTemplateCreateRequest(BaseModel):
+#     job_type: JobType
+#     default_input_data: dict = Field(default_factory=dict)
+#     description: Optional[str] = None
+#     cron: Optional[str] = None
+
+
+# class JobTemplateDocument(BaseModel):
+#     object: Literal["job_template"] = "job_template"
+#     id: str
+#     type: JobType
+#     identity: Any | None = None
+#     description: Optional[str] = None
+#     default_input_data: dict = Field(default_factory=dict)
+#     # For scheduled jobs, include a valid CRON expression (None for on-demand only jobs)
+#     cron: Optional[str] = None
+#     next_run: Optional[datetime.datetime] = None
+#     created_at: Optional[datetime.datetime] = None
+#     updated_at: Optional[datetime.datetime] = None
+#     is_active: bool = True  # Change to status.
+
+# class JobTemplateUpdateRequest(BaseModel):
+#     cron: Optional[str] = None
+#     default_input_data: Optional[dict] = None
+#     description: Optional[str] = None
+#     is_active: Optional[bool] = None  # Change to status.
+
+
+#### EXECUTIONS ####
+
+# class JobExecutionCreateRequest(BaseModel):
+#     type: JobType
+#     template_id: Optional[str] = None
+#     input_data: dict = Field(default_factory=dict)
+
+#     @model_validator(mode='before')
+#     @classmethod
+#     def validate_job_identifiers(cls, data: Any) -> Any:
+#         if isinstance(data, dict):
+#             if bool(data.get('job_type')) == bool(data.get('job_template_id')):
+#                 raise ValueError("Either job_type or job_template_id must be provided")
+#         return data
+
+# class JobExecutionResponse(BaseModel):
+#     id: str
+#     template_id: Optional[str] = None
+#     type: JobType
+#     status: JobStatus
+#     result: Optional[dict] = None
+#     error: Optional[str] = None
+#     created_at: Optional[datetime.datetime] = None
+#     updated_at: Optional[datetime.datetime] = None
+
+# class JobExecutionDocument(BaseModel):
+#     object: Literal["job_execution"] = "job_execution"
+#     id: str
+#     template_id: Optional[str] = None
+#     type: JobType
+#     identity: Any | None = None
+#     status: JobStatus
+#     input_data_gcs_path: str
+#     result: Optional[dict] = None
+#     error: Optional[str] = None
+#     created_at: Optional[datetime.datetime] = None
+#     updated_at: Optional[datetime.datetime] = None
+#     checkpoint: Any = None  # Useful for jobs that need to be resumed
+#     checkpoint_data: Optional[dict] = None
+#     needs: list[str] = Field(default_factory=list, description="list of jobs execution id that must be completed before this job can run")
+
+
+# class Workflow(BaseModel):
+#     name: str
+#     jobs: list[JobExecutionDocument]
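
Usage sketch (editorial illustration, not part of the published diff) of how the model_validator(mode="after") checks above behave, assuming the wheel is installed.

# Illustrative sketch only -- not from the package; assumes this wheel is installed.
from pydantic import ValidationError
from uiform.types.jobs.base import DatasetSplitInputData, PrepareDatasetInputData

PrepareDatasetInputData(schema_id="sch_id_example")             # ok: one identifier is enough (hypothetical id)
DatasetSplitInputData(dataset_id="ds_example", train_size=0.8)  # ok: only one of train_size / eval_size is set

try:
    PrepareDatasetInputData()                                   # no identifier at all
except ValidationError as exc:
    # The ValueError raised in validate_input surfaces as a pydantic validation error:
    # "... At least one of dataset_id, schema_id, or schema_data_id must be provided"
    print(exc.errors()[0]["msg"])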
uiform/types/jobs/batch_annotation.py
ADDED
@@ -0,0 +1,22 @@
+from typing import Literal, Optional
+
+from openai.types.chat.chat_completion_reasoning_effort import ChatCompletionReasoningEffort
+from pydantic import BaseModel, Field
+
+from ..modalities import Modality
+
+
+class InferenceSettings(BaseModel):
+    model: str = "gpt-4o-mini"
+    temperature: float = 0.0
+    modality: Modality
+    image_resolution_dpi: int = 96
+    browser_canvas: Literal['A3', 'A4', 'A5'] = 'A4'
+    reasoning_effort: ChatCompletionReasoningEffort = "medium"
+
+
+class AnnotationInputData(BaseModel):
+    dataset_id: str
+    files_ids: Optional[list[str]] = None
+    upsert: bool = False
+    inference_settings: InferenceSettings
uiform/types/jobs/evaluation.py
ADDED
@@ -0,0 +1,133 @@
+from pydantic import BaseModel
+
+from ..._utils.benchmarking import EvalMetrics, SingleFileEval, compute_dict_difference
+from .batch_annotation import AnnotationInputData, InferenceSettings
+
+# This job will generate two datasets from the original dataset, one with the first annotation and one with the second annotation
+# It will then evaluate the two datasets using the evaluation metrics and return an EvalMetrics object
+
+
+class EvaluationInputData(BaseModel):
+    original_dataset_id: str
+    schema_id: str
+    schema_data_id: str
+    inference_settings_1: InferenceSettings
+    inference_settings_2: InferenceSettings
+
+
+# def evaluate_datasets(
+#     original_dataset_id: str,
+#     inference_settings_1: InferenceSettings,
+#     inference_settings_2: InferenceSettings,
+#     identity: Identity,
+#     job_execution_id: str,
+#     settings: Settings,
+#     dashboard_db: AsyncIOMotorDatabase,
+# ) -> EvalMetrics:
+#     # Generate two datasets from the original dataset
+
+#     # Create the actual dataset objects.
+
+#     # Solution:
+#     # 1. Create the two datasets objects
+#     # 2. Duplicate all the dataset membership objects for the two datasets (with the right dataset_id)
+
+#     # 3. Annotate the two datasets with the two annotation props
+#     annotation_job_1 = AnnotationJob(
+#         input_data=AnnotationInputData(
+#             dataset_id=original_dataset_id,
+#             files_ids=None,
+#             upsert=True,
+#             inference_settings=inference_settings_1
+#         )
+#     )
+
+#     annotation_job_2 = AnnotationJob(
+#         input_data=AnnotationInputData(
+#             dataset_id=original_dataset_id,
+#             files_ids=None,
+#             upsert=True,
+#             inference_settings=inference_settings_2
+#         )
+#     )
+#     batch_annotate_job_with_checkpoints(
+#         identity=identity,
+#         job_execution_id=job_execution_id,
+#         annotation_job=annotation_job_1,
+#         settings=settings,
+#         dashboard_db=dashboard_db,
+#     )
+
+#     batch_annotate_job_with_checkpoints(
+#         identity=identity,
+#         job_execution_id=job_execution_id,
+#         annotation_job=annotation_job_2,
+#         settings=settings,
+#         dashboard_db=dashboard_db,
+#     )
+
+#     def compute_all_single_file_evals(
+#         dataset_1: Dataset,
+#         dataset_2: Dataset,
+#     ) -> list[SingleFileEval]:
+
+#         single_file_evals: list[SingleFileEval] = []
+#         for file_id in dataset_1.file_ids:
+#             single_file_evals.append(
+#                 SingleFileEval(
+#                     file_id=file_id,
+#                     dict_1=dataset_1,
+#                     dict_2=dataset_2.get_file(file_id),
+#                 )
+#             )
+
+#         for file_id in dataset_2.file_ids:
+#             single_file_evals.append(
+#                 SingleFileEval(
+#                     file_id=file_id,
+#                     dict_1=dataset_2.get_file(file_id),
+#                     dict_2=dataset_1,
+#                 )
+#             )
+
+#         for file_id in dataset_1.file_ids:
+#             single_file_evals.append(SingleFileEval(
+#                 file_id=file_id,
+#                 dict_1=dataset_1.get_file(file_id),
+#                 dict_2=dataset_2.get_file(file_id),
+#                 schema_id=schema_id,
+#                 schema_data_id=schema_data_id,
+#                 dataset_membership_id_1=dataset_1.get_file(file_id).id,
+#                 dataset_membership_id_2=dataset_2.get_file(file_id).id,
+#                 hamming_similarity=compute_dict_difference(
+#                     dict_1=dataset_1.get_file(file_id),
+#                     dict_2=dataset_2.get_file(file_id),
+#                     metric="hamming_similarity"
+#                 ),
+#                 jaccard_similarity=compute_dict_difference(
+#                     dict_1=dataset_1.get_file(file_id),
+#                     dict_2=dataset_2.get_file(file_id),
+#                     metric="jaccard_similarity"
+#                 ),
+#                 levenshtein_similarity=compute_dict_difference(
+#                     dict_1=dataset_1.get_file(file_id),
+#                     dict_2=dataset_2.get_file(file_id),
+#                     metric="levenshtein_similarity"
+#                 )
+#             )
+
+
+#     )
+#     # Then go through all the entries in the datasets and compute the evaluation metrics
+#     compute_all_single_file_evals(
+#         dataset_1=dataset_1,
+#         dataset_2=dataset_2,
+#     )
+#     # Return the EvalMetrics object
+
+#     compute_eval_metrics
+
+
+#     raise NotImplementedError("Not implemented")
+
+#     return eval_metrics