retab-0.0.35-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. retab-0.0.35.dist-info/METADATA +417 -0
  2. retab-0.0.35.dist-info/RECORD +111 -0
  3. retab-0.0.35.dist-info/WHEEL +5 -0
  4. retab-0.0.35.dist-info/top_level.txt +1 -0
  5. uiform/__init__.py +4 -0
  6. uiform/_resource.py +28 -0
  7. uiform/_utils/__init__.py +0 -0
  8. uiform/_utils/ai_models.py +100 -0
  9. uiform/_utils/benchmarking copy.py +588 -0
  10. uiform/_utils/benchmarking.py +485 -0
  11. uiform/_utils/chat.py +332 -0
  12. uiform/_utils/display.py +443 -0
  13. uiform/_utils/json_schema.py +2161 -0
  14. uiform/_utils/mime.py +168 -0
  15. uiform/_utils/responses.py +163 -0
  16. uiform/_utils/stream_context_managers.py +52 -0
  17. uiform/_utils/usage/__init__.py +0 -0
  18. uiform/_utils/usage/usage.py +300 -0
  19. uiform/client.py +701 -0
  20. uiform/py.typed +0 -0
  21. uiform/resources/__init__.py +0 -0
  22. uiform/resources/consensus/__init__.py +3 -0
  23. uiform/resources/consensus/client.py +114 -0
  24. uiform/resources/consensus/completions.py +252 -0
  25. uiform/resources/consensus/completions_stream.py +278 -0
  26. uiform/resources/consensus/responses.py +325 -0
  27. uiform/resources/consensus/responses_stream.py +373 -0
  28. uiform/resources/deployments/__init__.py +9 -0
  29. uiform/resources/deployments/client.py +78 -0
  30. uiform/resources/deployments/endpoints.py +322 -0
  31. uiform/resources/deployments/links.py +452 -0
  32. uiform/resources/deployments/logs.py +211 -0
  33. uiform/resources/deployments/mailboxes.py +496 -0
  34. uiform/resources/deployments/outlook.py +531 -0
  35. uiform/resources/deployments/tests.py +158 -0
  36. uiform/resources/documents/__init__.py +3 -0
  37. uiform/resources/documents/client.py +255 -0
  38. uiform/resources/documents/extractions.py +441 -0
  39. uiform/resources/evals.py +812 -0
  40. uiform/resources/files.py +24 -0
  41. uiform/resources/finetuning.py +62 -0
  42. uiform/resources/jsonlUtils.py +1046 -0
  43. uiform/resources/models.py +45 -0
  44. uiform/resources/openai_example.py +22 -0
  45. uiform/resources/processors/__init__.py +3 -0
  46. uiform/resources/processors/automations/__init__.py +9 -0
  47. uiform/resources/processors/automations/client.py +78 -0
  48. uiform/resources/processors/automations/endpoints.py +317 -0
  49. uiform/resources/processors/automations/links.py +356 -0
  50. uiform/resources/processors/automations/logs.py +211 -0
  51. uiform/resources/processors/automations/mailboxes.py +435 -0
  52. uiform/resources/processors/automations/outlook.py +444 -0
  53. uiform/resources/processors/automations/tests.py +158 -0
  54. uiform/resources/processors/client.py +474 -0
  55. uiform/resources/prompt_optimization.py +76 -0
  56. uiform/resources/schemas.py +369 -0
  57. uiform/resources/secrets/__init__.py +9 -0
  58. uiform/resources/secrets/client.py +20 -0
  59. uiform/resources/secrets/external_api_keys.py +109 -0
  60. uiform/resources/secrets/webhook.py +62 -0
  61. uiform/resources/usage.py +271 -0
  62. uiform/types/__init__.py +0 -0
  63. uiform/types/ai_models.py +645 -0
  64. uiform/types/automations/__init__.py +0 -0
  65. uiform/types/automations/cron.py +58 -0
  66. uiform/types/automations/endpoints.py +21 -0
  67. uiform/types/automations/links.py +28 -0
  68. uiform/types/automations/mailboxes.py +60 -0
  69. uiform/types/automations/outlook.py +68 -0
  70. uiform/types/automations/webhooks.py +21 -0
  71. uiform/types/chat.py +8 -0
  72. uiform/types/completions.py +93 -0
  73. uiform/types/consensus.py +10 -0
  74. uiform/types/db/__init__.py +0 -0
  75. uiform/types/db/annotations.py +24 -0
  76. uiform/types/db/files.py +36 -0
  77. uiform/types/deployments/__init__.py +0 -0
  78. uiform/types/deployments/cron.py +59 -0
  79. uiform/types/deployments/endpoints.py +28 -0
  80. uiform/types/deployments/links.py +36 -0
  81. uiform/types/deployments/mailboxes.py +67 -0
  82. uiform/types/deployments/outlook.py +76 -0
  83. uiform/types/deployments/webhooks.py +21 -0
  84. uiform/types/documents/__init__.py +0 -0
  85. uiform/types/documents/correct_orientation.py +13 -0
  86. uiform/types/documents/create_messages.py +226 -0
  87. uiform/types/documents/extractions.py +297 -0
  88. uiform/types/evals.py +207 -0
  89. uiform/types/events.py +76 -0
  90. uiform/types/extractions.py +85 -0
  91. uiform/types/jobs/__init__.py +0 -0
  92. uiform/types/jobs/base.py +150 -0
  93. uiform/types/jobs/batch_annotation.py +22 -0
  94. uiform/types/jobs/evaluation.py +133 -0
  95. uiform/types/jobs/finetune.py +6 -0
  96. uiform/types/jobs/prompt_optimization.py +41 -0
  97. uiform/types/jobs/webcrawl.py +6 -0
  98. uiform/types/logs.py +231 -0
  99. uiform/types/mime.py +257 -0
  100. uiform/types/modalities.py +68 -0
  101. uiform/types/pagination.py +6 -0
  102. uiform/types/schemas/__init__.py +0 -0
  103. uiform/types/schemas/enhance.py +53 -0
  104. uiform/types/schemas/evaluate.py +55 -0
  105. uiform/types/schemas/generate.py +32 -0
  106. uiform/types/schemas/layout.py +58 -0
  107. uiform/types/schemas/object.py +631 -0
  108. uiform/types/schemas/templates.py +107 -0
  109. uiform/types/secrets/__init__.py +0 -0
  110. uiform/types/secrets/external_api_keys.py +22 -0
  111. uiform/types/standards.py +39 -0
uiform/types/evals.py ADDED
@@ -0,0 +1,207 @@
+ import copy
+ import datetime
+ import json
+ from typing import Any, List, Literal, Optional, Union
+
+ import nanoid  # type: ignore
+ from pydantic import BaseModel, Field, computed_field
+
+
+ from .._utils.json_schema import clean_schema, compute_schema_data_id
+ from .._utils.mime import generate_blake2b_hash_from_string
+ from .ai_models import Amount, LLMModel
+ from .jobs.base import InferenceSettings
+ from .mime import MIMEData
+
+
+ # Define the type alias for MetricType
+ MetricType = Literal["levenshtein", "jaccard", "hamming"]
+
+
+ # Define the structure for an individual item metric
+ class ItemMetric(BaseModel):
+     id: str = Field(description="The ID of the item being measured")
+     name: str = Field(description="The name of the item being measured")
+     similarity: float = Field(description="The similarity score between 0 and 1")
+     similarities: dict[str, Any] = Field(description="The similarity scores for each item in the list")
+     flat_similarities: dict[str, Optional[float]] = Field(description="The similarity scores for each item in the list in dot notation format")
+     aligned_similarity: float = Field(description="The similarity score between 0 and 1, after alignment")
+     aligned_similarities: dict[str, Any] = Field(description="The similarity scores for each item in the list, after alignment")
+     aligned_flat_similarities: dict[str, Optional[float]] = Field(description="The similarity scores for each item in the list in dot notation format, after alignment")
+
+
+ # Define the main MetricResult model
+ class MetricResult(BaseModel):
+     item_metrics: List[ItemMetric] = Field(description="List of similarity metrics for individual items")
+     mean_similarity: float = Field(description="The average similarity score across all items")
+     aligned_mean_similarity: float = Field(description="The average similarity score across all items, after alignment")
+     metric_type: MetricType = Field(description="The type of similarity metric used for comparison")
+
+
+ class DistancesResult(BaseModel):
+     distances: dict[str, Any] = Field(description="List of distances for individual items")
+     mean_distance: float = Field(description="The average distance across all items")
+     metric_type: MetricType = Field(description="The type of distance metric used for comparison")
+
+
+ class PredictionMetadata(BaseModel):
+     extraction_id: Optional[str] = Field(default=None, description="The ID of the extraction")
+     likelihoods: Optional[dict[str, Any]] = Field(default=None, description="The likelihoods of the extraction")
+     field_locations: Optional[dict[str, Any]] = Field(default=None, description="The field locations of the extraction")
+     agentic_field_locations: Optional[dict[str, Any]] = Field(default=None, description="The field locations of the extraction extracted by an LLM")
+     consensus_details: Optional[list[dict[str, Any]]] = Field(default=None, description="The consensus details of the extraction")
+     api_cost: Optional[Amount] = Field(default=None, description="The cost of the API call for this document (if any -- ground truth for example)")
+
+
+ class PredictionData(BaseModel):
+     prediction: dict[str, Any] = Field(default={}, description="The result of the extraction or manual annotation")
+     metadata: Optional[PredictionMetadata] = Field(default=None, description="The metadata of the prediction")
+
+
+ class Iteration(BaseModel):
+     id: str = Field(default_factory=lambda: "eval_iter_" + nanoid.generate())
+     inference_settings: InferenceSettings
+     json_schema: dict[str, Any]
+     predictions: list[PredictionData] = Field(default_factory=list, description="The predictions of the iteration for all the documents")
+     metric_results: Optional[MetricResult] = Field(default=None, description="The metric results of the iteration")
+
+     @computed_field  # type: ignore
+     @property
+     def schema_data_id(self) -> str:
+         """Returns the BLAKE2b hash of the schema data, ignoring all prompt/description/default fields.
+
+         Returns:
+             str: A BLAKE2b hash string representing the schema data version.
+         """
+         return "sch_data_id_" + generate_blake2b_hash_from_string(
+             json.dumps(
+                 clean_schema(
+                     copy.deepcopy(self.json_schema),
+                     remove_custom_fields=True,
+                     fields_to_remove=["description", "default", "title", "required", "examples", "deprecated", "readOnly", "writeOnly"],
+                 ),
+                 sort_keys=True,
+             ).strip()
+         )
+
+     # This is a computed field, it is exposed when serializing the object
+     @computed_field  # type: ignore
+     @property
+     def schema_id(self) -> str:
+         """Returns the BLAKE2b hash of the complete schema.
+
+         Returns:
+             str: A BLAKE2b hash string representing the complete schema version.
+         """
+         return "sch_id_" + generate_blake2b_hash_from_string(json.dumps(self.json_schema, sort_keys=True).strip())
+
+
+ class AnnotatedDocument(BaseModel):
+     mime_data: MIMEData = Field(
+         description="The mime data of the document. Can also be a BaseMIMEData, which is why we have this id field (to be able to identify the file, but id is equal to mime_data.id)"
+     )
+     annotation: dict[str, Any] = Field(default={}, description="The ground truth of the document")
+
+
+ class DocumentItem(AnnotatedDocument):
+     annotation_metadata: Optional[PredictionMetadata] = Field(default=None, description="The metadata of the annotation when the annotation is a prediction")
+
+
+ class EvaluationDocument(DocumentItem):
+     id: str = Field(description="The ID of the document. Equal to mime_data.id but robust to the case where mime_data is a BaseMIMEData")
+
+
+ class CreateIterationRequest(BaseModel):
+     """
+     Request model for performing a new iteration with custom inference settings and optional JSON schema.
+     """
+
+     inference_settings: InferenceSettings
+     json_schema: Optional[dict[str, Any]] = None
+
+
+ class UpdateEvaluationDocumentRequest(BaseModel):
+     annotation: Optional[dict[str, Any]] = Field(default=None, description="The ground truth of the document")
+     annotation_metadata: Optional[PredictionMetadata] = Field(default=None, description="The metadata of the annotation when the annotation is a prediction")
+
+
+ class UpdateEvaluationRequest(BaseModel):
+     name: Optional[str] = Field(default=None, description="The name of the document")
+     documents: Optional[list[EvaluationDocument]] = Field(default=None, description="The documents of the evaluation")
+     iterations: Optional[list[Iteration]] = Field(default=None, description="The iterations of the evaluation")
+     json_schema: Optional[dict[str, Any]] = Field(default=None, description="The json schema of the evaluation")
+
+     project_id: Optional[str] = Field(default=None, description="The ID of the project")
+
+     @computed_field  # type: ignore
+     @property
+     def schema_data_id(self) -> Optional[str]:
+         """Returns the BLAKE2b hash of the schema data, ignoring all prompt/description/default fields.
+
+         Returns:
+             str: A BLAKE2b hash string representing the schema data version.
+         """
+         if self.json_schema is None:
+             return None
+
+         return compute_schema_data_id(self.json_schema)
+
+     # This is a computed field, it is exposed when serializing the object
+     @computed_field  # type: ignore
+     @property
+     def schema_id(self) -> Optional[str]:
+         """Returns the BLAKE2b hash of the complete schema.
+
+         Returns:
+             str: A BLAKE2b hash string representing the complete schema version.
+         """
+         if self.json_schema is None:
+             return None
+         return "sch_id_" + generate_blake2b_hash_from_string(json.dumps(self.json_schema, sort_keys=True).strip())
+
+
+ class Evaluation(BaseModel):
+     id: str = Field(default_factory=lambda: "eval_" + nanoid.generate())
+     updated_at: datetime.datetime = Field(default_factory=lambda: datetime.datetime.now(tz=datetime.timezone.utc))
+
+     name: str
+     documents: list[EvaluationDocument]
+     iterations: list[Iteration]
+     json_schema: dict[str, Any]
+
+     project_id: str = Field(description="The ID of the project", default="default_spreadsheets")
+     default_inference_settings: Optional[InferenceSettings] = Field(default=None, description="The default inference properties for the evaluation (mostly used in the frontend)")
+
+     # @field_validator('iterations')
+     # def validate_iterations_content_length(cls: Any, v: list[Iteration], values: Any) -> list[Iteration]:
+     #     if 'ground_truth' in values:
+     #         ground_truth_length = len(values['ground_truth'])
+     #         for iteration in v:
+     #             if len(iteration.content) != ground_truth_length:
+     #                 raise ValueError(f"Iteration content length must match ground_truth length ({ground_truth_length})")
+     #     return v
+
+     @computed_field  # type: ignore
+     @property
+     def schema_data_id(self) -> str:
+         """Returns the BLAKE2b hash of the schema data, ignoring all prompt/description/default fields.
+
+         Returns:
+             str: A BLAKE2b hash string representing the schema data version.
+         """
+         return compute_schema_data_id(self.json_schema)
+
+     # This is a computed field, it is exposed when serializing the object
+     @computed_field  # type: ignore
+     @property
+     def schema_id(self) -> str:
+         """Returns the BLAKE2b hash of the complete schema.
+
+         Returns:
+             str: A BLAKE2b hash string representing the complete schema version.
+         """
+         return "sch_id_" + generate_blake2b_hash_from_string(json.dumps(self.json_schema, sort_keys=True).strip())
+
+
+ class AddIterationFromJsonlRequest(BaseModel):
+     jsonl_gcs_path: str
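
For orientation, a minimal usage sketch of the models above, assuming the wheel installs the uiform package shown in the file list and that pydantic and nanoid are available; the JSON schema and values are hypothetical.

# Hypothetical sketch: build an Iteration and read its computed schema hashes.
from uiform.types.evals import Iteration, PredictionData
from uiform.types.jobs.base import InferenceSettings

schema = {"type": "object", "properties": {"total": {"type": "number", "description": "Invoice total"}}}
iteration = Iteration(
    inference_settings=InferenceSettings(),  # defaults: gpt-4o-mini, temperature 0.0, native modality
    json_schema=schema,
    predictions=[PredictionData(prediction={"total": 42.0})],
)
# schema_data_id ignores descriptions/defaults, schema_id hashes the full schema;
# both are computed fields, so they also appear in the serialized output.
print(iteration.schema_data_id, iteration.schema_id)
print(iteration.model_dump()["schema_id"])
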
uiform/types/events.py ADDED
@@ -0,0 +1,76 @@
+ import datetime
+ from typing import Any, Literal, Optional
+
+ import nanoid  # type: ignore
+ from pydantic import BaseModel, Field
+
+ metadata_key = Literal[
+     'user',
+     'organization',
+     'link',
+     'mailbox',
+     'cron',
+     'outlook',
+     'extraction',
+     'webhook',
+     'reconciliation',
+     'preprocessing',
+     'schema',
+     'data_structure',
+     'file',
+     'preprocessing',
+     'dataset',
+     'dataset_membership',
+     'endpoint',
+     'automation',
+     'template',
+ ]
+
+ event_type = Literal[
+     'extraction.created',
+     'messages.created',
+     'document.orientation_corrected',
+     'consensus.reconciled',
+     'automation.created',
+     'automation.updated',
+     'automation.deleted',
+     'automation.webhook',
+     'preprocessing.created',
+     'link.created',
+     'link.updated',
+     'link.deleted',
+     'link.webhook',
+     'mailbox.created',
+     'mailbox.updated',
+     'mailbox.deleted',
+     'mailbox.webhook',
+     'outlook.created',
+     'outlook.updated',
+     'outlook.deleted',
+     'outlook.webhook',
+     'schema.generated',
+     'schema.promptified',
+     'schema.system_promptfile.created',
+     'file.updated',
+     'file.deleted',
+     'template.created',
+     'template.deleted',
+     'template.sample_document_uploaded',
+     'template.sample_document_deleted',
+     'template.updated',
+ ]
+
+
+ class Event(BaseModel):
+     object: Literal['event'] = "event"
+     id: str = Field(default_factory=lambda: "event_" + nanoid.generate(), description="Unique identifier for the event")
+     event: str = Field(..., description="A string that distinguishes the event type. Ex: user.created, user.updated, user.deleted, etc.")
+     created_at: datetime.datetime = Field(default_factory=lambda: datetime.datetime.now(datetime.timezone.utc))
+     data: dict[str, Any] = Field(..., description="Event payload. Payloads match the corresponding API objects.")
+     metadata: Optional[dict[metadata_key, str]] = Field(
+         default=None, description="IDs giving information about the event. Ex: user.created.metadata = {'user': 'usr_8478973619047837'}"
+     )
+
+
+ class StoredEvent(Event):
+     organization_id: str = Field(..., description="Organization ID")
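
As a quick sanity check, the sketch below constructs an Event with one of the literal event types; the IDs and payload are hypothetical.

# Hypothetical sketch: create an Event and serialize it.
from uiform.types.events import Event

event = Event(
    event="extraction.created",                    # one of the event_type literals above
    data={"id": "extr_123", "status": "success"},  # payload mirrors the corresponding API object
    metadata={"extraction": "extr_123"},           # keys must be metadata_key literals
)
print(event.model_dump_json())
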
uiform/types/extractions.py ADDED
@@ -0,0 +1,85 @@
+ import datetime
+ from typing import Any, Literal, Optional
+
+ import nanoid  # type: ignore
+ from openai.types.chat import ChatCompletion
+ from openai.types.chat.chat_completion_reasoning_effort import ChatCompletionReasoningEffort
+ from pydantic import BaseModel, Field, computed_field
+
+ from uiform.types.chat import ChatCompletionUiformMessage
+ from uiform.types.documents.extractions import UiParsedChatCompletion
+
+ from .._utils.usage.usage import compute_cost_from_model, compute_cost_from_model_with_breakdown, CostBreakdown
+ from .ai_models import Amount
+ from .modalities import Modality
+
+ ValidationsState = Literal["pending", "validated", "invalid"]
+
+
+ class ExtractionSource(BaseModel):
+     type: Literal["api", "annotation", "processor", "automation.link", "automation.mailbox", "automation.cron", "automation.outlook", "automation.endpoint", "schema.extract"] = Field(
+         description="Type of extraction"
+     )
+     id: str | None = Field(default=None, description="ID of the trigger of the extraction")
+
+
+ ExtractionSteps = str | Literal['initialization', 'prepare_messages', 'yield_first_token', 'completion']  # Steps are meant to not overlap
+
+
+ class ExtractionTimingStep(BaseModel):
+     name: ExtractionSteps
+     duration: float  # in seconds
+     notes: str | None = None
+
+
+ class Extraction(BaseModel):
+     id: str = Field(default_factory=lambda: "extr_" + nanoid.generate(), description="Unique identifier of the analysis")
+     messages: list[ChatCompletionUiformMessage] = Field(default_factory=list)
+     messages_gcs: str = Field(..., description="GCS path to the messages")
+     file_gcs: str = Field(..., description="GCS path to the file")
+     file_id: str = Field(..., description="ID of the file")
+
+     status: Literal["success", "failed"] = Field(..., description="Whether the analysis was successful")
+     completion: UiParsedChatCompletion | ChatCompletion = Field(..., description="Response generated by the analysis")
+     json_schema: Any = Field(..., description="Response format (JSON Schema or pydantic_v2.BaseModel)")
+     model: str = Field(..., description="Model used for the analysis")
+     temperature: float = Field(default=0.0, description="Temperature used for the analysis")
+     source: ExtractionSource = Field(..., description="Source of the extraction")
+     image_resolution_dpi: int = Field(default=96, description="Resolution of the image sent to the LLM")
+     browser_canvas: Literal['A3', 'A4', 'A5'] = Field(default='A4', description="Sets the size of the browser canvas for rendering documents in browser-based processing. Choose a size that matches the document type.")
+     modality: Modality = Field(default="native", description="Modality of the extraction")
+     reasoning_effort: Optional[ChatCompletionReasoningEffort] = Field(default=None, description="The effort level for the model to reason about the input data.")
+     timings: list[ExtractionTimingStep] = Field(default_factory=list, description="Timings of the extraction")
+
+     # Inferred from the schema
+     schema_id: str = Field(..., description="Version of the schema used for the analysis")
+     schema_data_id: str = Field(..., description="Version of the schema data used for the analysis")
+     created_at: datetime.datetime = Field(default_factory=lambda: datetime.datetime.now(datetime.timezone.utc), description="Timestamp of the creation of the extraction object")
+     request_at: datetime.datetime | None = Field(default=None, description="Timestamp of the extraction request if provided.")
+     organization_id: str = Field(..., description="Organization ID of the user or application")
+     validation_state: Optional[ValidationsState] = Field(default=None, description="Validation state of the extraction")
+     billed: bool = Field(default=False, description="Whether the extraction has been billed or not")
+
+     @computed_field
+     @property
+     def api_cost(self) -> Optional[Amount]:
+         if self.completion and self.completion.usage:
+             try:
+                 cost = compute_cost_from_model(self.completion.model, self.completion.usage)
+                 return cost
+             except Exception as e:
+                 print(f"Error computing cost: {e}")
+                 return None
+         return None
+
+     @computed_field  # type: ignore
+     @property
+     def cost_breakdown(self) -> Optional[CostBreakdown]:
+         if self.completion and self.completion.usage:
+             try:
+                 cost = compute_cost_from_model_with_breakdown(self.completion.model, self.completion.usage)
+                 return cost
+             except Exception as e:
+                 print(f"Error computing cost: {e}")
+                 return None
+         return None
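
A small, hypothetical sketch of the lighter helper models above; constructing a full Extraction requires a completion object and GCS paths, so it is omitted here.

# Hypothetical sketch: extraction source and timing metadata.
from uiform.types.extractions import ExtractionSource, ExtractionTimingStep

source = ExtractionSource(type="automation.mailbox", id="mb_123")  # id value is made up
timings = [
    ExtractionTimingStep(name="prepare_messages", duration=0.42),
    ExtractionTimingStep(name="completion", duration=3.10, notes="first token after 0.8s"),
]
total_seconds = sum(step.duration for step in timings)
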
uiform/types/jobs/__init__.py (file without changes)
uiform/types/jobs/base.py ADDED
@@ -0,0 +1,150 @@
+ from typing import Literal, Optional, Self
+
+ from openai.types.chat.chat_completion_reasoning_effort import ChatCompletionReasoningEffort
+ from pydantic import BaseModel, Field, model_validator
+
+ from ..modalities import Modality
+
+ SelectionMode = Literal["all", "manual"]
+
+
+ # This is the input data for the prepare_dataset job
+ class PrepareDatasetInputData(BaseModel):
+     dataset_id: Optional[str] = None
+     schema_id: Optional[str] = None
+     schema_data_id: Optional[str] = None
+
+     selection_model: SelectionMode = "all"
+
+     @model_validator(mode="after")
+     def validate_input(self) -> Self:
+         # The preference is:
+         # 1. dataset_id
+         # 2. schema_id
+         # 3. schema_data_id
+         if self.dataset_id is None and self.schema_id is None and self.schema_data_id is None:
+             raise ValueError("At least one of dataset_id, schema_id, or schema_data_id must be provided")
+
+         return self
+
+
+ # This is the input data for the split_dataset job
+ class DatasetSplitInputData(BaseModel):
+     dataset_id: str
+     train_size: Optional[int | float] = None
+     eval_size: Optional[int | float] = None
+
+     @model_validator(mode="after")
+     def validate_input(self) -> Self:
+         if self.train_size is not None and self.eval_size is not None:
+             raise ValueError("train_size and eval_size cannot both be provided")
+         return self
+
+
+ # This is the input data for the batch annotation job
+ class InferenceSettings(BaseModel):
+     model: str = "gpt-4o-mini"
+     temperature: float = 0.0
+     modality: Modality = "native"
+     reasoning_effort: ChatCompletionReasoningEffort = "medium"
+     image_resolution_dpi: int = 96
+     browser_canvas: Literal['A3', 'A4', 'A5'] = 'A4'
+     n_consensus: int = Field(default=1, description="Number of consensus rounds to perform")
+
+
+ class AnnotationInputData(BaseModel):
+     data_file: str
+     schema_id: str
+     inference_settings: InferenceSettings
+
+
+ # This is the input data for the evaluation job
+ class EvaluationInputData(BaseModel):
+     eval_data_file: str
+     schema_id: str
+     inference_settings_1: InferenceSettings | None = None
+     inference_settings_2: InferenceSettings
+
+
+ # from pydantic import BaseModel, Field, model_validator
+ # from typing import Literal, Optional, Any
+ # import datetime
+
+
+ # JobType = Literal["prompt-optimization", "annotate-files", "finetune-dataset", "webcrawl"]
+ # JobStatus = Literal["pending", "running", "completed", "failed"]
+ #### JOBS ####
+
+ # class JobTemplateCreateRequest(BaseModel):
+ #     job_type: JobType
+ #     default_input_data: dict = Field(default_factory=dict)
+ #     description: Optional[str] = None
+ #     cron: Optional[str] = None
+
+
+ # class JobTemplateDocument(BaseModel):
+ #     object: Literal["job_template"] = "job_template"
+ #     id: str
+ #     type: JobType
+ #     identity: Any | None = None
+ #     description: Optional[str] = None
+ #     default_input_data: dict = Field(default_factory=dict)
+ #     # For scheduled jobs, include a valid CRON expression (None for on-demand only jobs)
+ #     cron: Optional[str] = None
+ #     next_run: Optional[datetime.datetime] = None
+ #     created_at: Optional[datetime.datetime] = None
+ #     updated_at: Optional[datetime.datetime] = None
+ #     is_active: bool = True  # Change to status.
+
+ # class JobTemplateUpdateRequest(BaseModel):
+ #     cron: Optional[str] = None
+ #     default_input_data: Optional[dict] = None
+ #     description: Optional[str] = None
+ #     is_active: Optional[bool] = None  # Change to status.
+
+
+ #### EXECUTIONS ####
+
+ # class JobExecutionCreateRequest(BaseModel):
+ #     type: JobType
+ #     template_id: Optional[str] = None
+ #     input_data: dict = Field(default_factory=dict)
+
+ #     @model_validator(mode='before')
+ #     @classmethod
+ #     def validate_job_identifiers(cls, data: Any) -> Any:
+ #         if isinstance(data, dict):
+ #             if bool(data.get('job_type')) == bool(data.get('job_template_id')):
+ #                 raise ValueError("Either job_type or job_template_id must be provided")
+ #         return data
+
+ # class JobExecutionResponse(BaseModel):
+ #     id: str
+ #     template_id: Optional[str] = None
+ #     type: JobType
+ #     status: JobStatus
+ #     result: Optional[dict] = None
+ #     error: Optional[str] = None
+ #     created_at: Optional[datetime.datetime] = None
+ #     updated_at: Optional[datetime.datetime] = None
+
+ # class JobExecutionDocument(BaseModel):
+ #     object: Literal["job_execution"] = "job_execution"
+ #     id: str
+ #     template_id: Optional[str] = None
+ #     type: JobType
+ #     identity: Any | None = None
+ #     status: JobStatus
+ #     input_data_gcs_path: str
+ #     result: Optional[dict] = None
+ #     error: Optional[str] = None
+ #     created_at: Optional[datetime.datetime] = None
+ #     updated_at: Optional[datetime.datetime] = None
+ #     checkpoint: Any = None  # Useful for jobs that need to be resumed
+ #     checkpoint_data: Optional[dict] = None
+ #     needs: list[str] = Field(default_factory=list, description="list of jobs execution id that must be completed before this job can run")
+
+
+ # class Workflow(BaseModel):
+ #     name: str
+ #     jobs: list[JobExecutionDocument]
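
The two model validators above reject under-specified or over-specified inputs. A brief sketch of the expected behavior, with hypothetical IDs:

# Hypothetical sketch: exercising the model validators.
from pydantic import ValidationError
from uiform.types.jobs.base import DatasetSplitInputData, PrepareDatasetInputData

PrepareDatasetInputData(dataset_id="ds_123")                # ok: at least one identifier given
DatasetSplitInputData(dataset_id="ds_123", train_size=0.8)  # ok: only one split size given

try:
    PrepareDatasetInputData()                               # no identifier at all
except ValidationError as exc:
    print(exc.errors()[0]["msg"])

try:
    DatasetSplitInputData(dataset_id="ds_123", train_size=0.8, eval_size=0.2)
except ValidationError as exc:
    print(exc.errors()[0]["msg"])
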
uiform/types/jobs/batch_annotation.py ADDED
@@ -0,0 +1,22 @@
+ from typing import Literal, Optional
+
+ from openai.types.chat.chat_completion_reasoning_effort import ChatCompletionReasoningEffort
+ from pydantic import BaseModel, Field
+
+ from ..modalities import Modality
+
+
+ class InferenceSettings(BaseModel):
+     model: str = "gpt-4o-mini"
+     temperature: float = 0.0
+     modality: Modality
+     image_resolution_dpi: int = 96
+     browser_canvas: Literal['A3', 'A4', 'A5'] = 'A4'
+     reasoning_effort: ChatCompletionReasoningEffort = "medium"
+
+
+ class AnnotationInputData(BaseModel):
+     dataset_id: str
+     files_ids: Optional[list[str]] = None
+     upsert: bool = False
+     inference_settings: InferenceSettings
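
Unlike the variant in jobs/base.py, this InferenceSettings has no default for modality. A hypothetical construction, assuming "native" is a valid Modality value as the default in jobs/base.py suggests:

# Hypothetical sketch: annotation input for a whole dataset.
from uiform.types.jobs.batch_annotation import AnnotationInputData, InferenceSettings

input_data = AnnotationInputData(
    dataset_id="ds_123",
    files_ids=None,  # mirrors the commented evaluate_datasets example below
    upsert=True,
    inference_settings=InferenceSettings(modality="native"),
)
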
uiform/types/jobs/evaluation.py ADDED
@@ -0,0 +1,133 @@
+ from pydantic import BaseModel
+
+ from ..._utils.benchmarking import EvalMetrics, SingleFileEval, compute_dict_difference
+ from .batch_annotation import AnnotationInputData, InferenceSettings
+
+ # This job will generate two datasets from the original dataset, one with the first annotation and one with the second annotation
+ # It will then evaluate the two datasets using the evaluation metrics and return an EvalMetrics object
+
+
+ class EvaluationInputData(BaseModel):
+     original_dataset_id: str
+     schema_id: str
+     schema_data_id: str
+     inference_settings_1: InferenceSettings
+     inference_settings_2: InferenceSettings
+
+
+ # def evaluate_datasets(
+ #     original_dataset_id: str,
+ #     inference_settings_1: InferenceSettings,
+ #     inference_settings_2: InferenceSettings,
+ #     identity: Identity,
+ #     job_execution_id: str,
+ #     settings: Settings,
+ #     dashboard_db: AsyncIOMotorDatabase,
+ # ) -> EvalMetrics:
+ #     # Generate two datasets from the original dataset
+
+ #     # Create the actual dataset objects.
+
+ #     # Solution:
+ #     # 1. Create the two datasets objects
+ #     # 2. Duplicate all the dataset membership objects for the two datasets (with the right dataset_id)
+
+ #     # 3. Annotate the two datasets with the two annotation props
+ #     annotation_job_1 = AnnotationJob(
+ #         input_data=AnnotationInputData(
+ #             dataset_id=original_dataset_id,
+ #             files_ids=None,
+ #             upsert=True,
+ #             inference_settings=inference_settings_1
+ #         )
+ #     )
+
+ #     annotation_job_2 = AnnotationJob(
+ #         input_data=AnnotationInputData(
+ #             dataset_id=original_dataset_id,
+ #             files_ids=None,
+ #             upsert=True,
+ #             inference_settings=inference_settings_2
+ #         )
+ #     )
+ #     batch_annotate_job_with_checkpoints(
+ #         identity=identity,
+ #         job_execution_id=job_execution_id,
+ #         annotation_job=annotation_job_1,
+ #         settings=settings,
+ #         dashboard_db=dashboard_db,
+ #     )
+
+ #     batch_annotate_job_with_checkpoints(
+ #         identity=identity,
+ #         job_execution_id=job_execution_id,
+ #         annotation_job=annotation_job_2,
+ #         settings=settings,
+ #         dashboard_db=dashboard_db,
+ #     )
+
+ #     def compute_all_single_file_evals(
+ #         dataset_1: Dataset,
+ #         dataset_2: Dataset,
+ #     ) -> list[SingleFileEval]:
+
+ #         single_file_evals: list[SingleFileEval] = []
+ #         for file_id in dataset_1.file_ids:
+ #             single_file_evals.append(
+ #                 SingleFileEval(
+ #                     file_id=file_id,
+ #                     dict_1=dataset_1,
+ #                     dict_2=dataset_2.get_file(file_id),
+ #                 )
+ #             )
+
+ #         for file_id in dataset_2.file_ids:
+ #             single_file_evals.append(
+ #                 SingleFileEval(
+ #                     file_id=file_id,
+ #                     dict_1=dataset_2.get_file(file_id),
+ #                     dict_2=dataset_1,
+ #                 )
+ #             )
+
+ #         for file_id in dataset_1.file_ids:
+ #             single_file_evals.append(SingleFileEval(
+ #                 file_id=file_id,
+ #                 dict_1=dataset_1.get_file(file_id),
+ #                 dict_2=dataset_2.get_file(file_id),
+ #                 schema_id=schema_id,
+ #                 schema_data_id=schema_data_id,
+ #                 dataset_membership_id_1=dataset_1.get_file(file_id).id,
+ #                 dataset_membership_id_2=dataset_2.get_file(file_id).id,
+ #                 hamming_similarity=compute_dict_difference(
+ #                     dict_1=dataset_1.get_file(file_id),
+ #                     dict_2=dataset_2.get_file(file_id),
+ #                     metric="hamming_similarity"
+ #                 ),
+ #                 jaccard_similarity=compute_dict_difference(
+ #                     dict_1=dataset_1.get_file(file_id),
+ #                     dict_2=dataset_2.get_file(file_id),
+ #                     metric="jaccard_similarity"
+ #                 ),
+ #                 levenshtein_similarity=compute_dict_difference(
+ #                     dict_1=dataset_1.get_file(file_id),
+ #                     dict_2=dataset_2.get_file(file_id),
+ #                     metric="levenshtein_similarity"
+ #                 )
+ #             )
+
+
+ #             )
+ #     # Then go through all the entries in the datasets and compute the evaluation metrics
+ #     compute_all_single_file_evals(
+ #         dataset_1=dataset_1,
+ #         dataset_2=dataset_2,
+ #     )
+ #     # Return the EvalMetrics object
+
+ #     compute_eval_metrics
+
+
+ #     raise NotImplementedError("Not implemented")
+
+ #     return eval_metrics
uiform/types/jobs/finetune.py ADDED
@@ -0,0 +1,6 @@
+ # from pydantic import BaseModel
+ # from .batch_annotation import InferenceSettings
+
+ # class FineTuningInputData(BaseModel):
+ #     dataset_id: str
+ #     finetuning_props: FinetuningProps