retab-0.0.35-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- retab-0.0.35.dist-info/METADATA +417 -0
- retab-0.0.35.dist-info/RECORD +111 -0
- retab-0.0.35.dist-info/WHEEL +5 -0
- retab-0.0.35.dist-info/top_level.txt +1 -0
- uiform/__init__.py +4 -0
- uiform/_resource.py +28 -0
- uiform/_utils/__init__.py +0 -0
- uiform/_utils/ai_models.py +100 -0
- uiform/_utils/benchmarking copy.py +588 -0
- uiform/_utils/benchmarking.py +485 -0
- uiform/_utils/chat.py +332 -0
- uiform/_utils/display.py +443 -0
- uiform/_utils/json_schema.py +2161 -0
- uiform/_utils/mime.py +168 -0
- uiform/_utils/responses.py +163 -0
- uiform/_utils/stream_context_managers.py +52 -0
- uiform/_utils/usage/__init__.py +0 -0
- uiform/_utils/usage/usage.py +300 -0
- uiform/client.py +701 -0
- uiform/py.typed +0 -0
- uiform/resources/__init__.py +0 -0
- uiform/resources/consensus/__init__.py +3 -0
- uiform/resources/consensus/client.py +114 -0
- uiform/resources/consensus/completions.py +252 -0
- uiform/resources/consensus/completions_stream.py +278 -0
- uiform/resources/consensus/responses.py +325 -0
- uiform/resources/consensus/responses_stream.py +373 -0
- uiform/resources/deployments/__init__.py +9 -0
- uiform/resources/deployments/client.py +78 -0
- uiform/resources/deployments/endpoints.py +322 -0
- uiform/resources/deployments/links.py +452 -0
- uiform/resources/deployments/logs.py +211 -0
- uiform/resources/deployments/mailboxes.py +496 -0
- uiform/resources/deployments/outlook.py +531 -0
- uiform/resources/deployments/tests.py +158 -0
- uiform/resources/documents/__init__.py +3 -0
- uiform/resources/documents/client.py +255 -0
- uiform/resources/documents/extractions.py +441 -0
- uiform/resources/evals.py +812 -0
- uiform/resources/files.py +24 -0
- uiform/resources/finetuning.py +62 -0
- uiform/resources/jsonlUtils.py +1046 -0
- uiform/resources/models.py +45 -0
- uiform/resources/openai_example.py +22 -0
- uiform/resources/processors/__init__.py +3 -0
- uiform/resources/processors/automations/__init__.py +9 -0
- uiform/resources/processors/automations/client.py +78 -0
- uiform/resources/processors/automations/endpoints.py +317 -0
- uiform/resources/processors/automations/links.py +356 -0
- uiform/resources/processors/automations/logs.py +211 -0
- uiform/resources/processors/automations/mailboxes.py +435 -0
- uiform/resources/processors/automations/outlook.py +444 -0
- uiform/resources/processors/automations/tests.py +158 -0
- uiform/resources/processors/client.py +474 -0
- uiform/resources/prompt_optimization.py +76 -0
- uiform/resources/schemas.py +369 -0
- uiform/resources/secrets/__init__.py +9 -0
- uiform/resources/secrets/client.py +20 -0
- uiform/resources/secrets/external_api_keys.py +109 -0
- uiform/resources/secrets/webhook.py +62 -0
- uiform/resources/usage.py +271 -0
- uiform/types/__init__.py +0 -0
- uiform/types/ai_models.py +645 -0
- uiform/types/automations/__init__.py +0 -0
- uiform/types/automations/cron.py +58 -0
- uiform/types/automations/endpoints.py +21 -0
- uiform/types/automations/links.py +28 -0
- uiform/types/automations/mailboxes.py +60 -0
- uiform/types/automations/outlook.py +68 -0
- uiform/types/automations/webhooks.py +21 -0
- uiform/types/chat.py +8 -0
- uiform/types/completions.py +93 -0
- uiform/types/consensus.py +10 -0
- uiform/types/db/__init__.py +0 -0
- uiform/types/db/annotations.py +24 -0
- uiform/types/db/files.py +36 -0
- uiform/types/deployments/__init__.py +0 -0
- uiform/types/deployments/cron.py +59 -0
- uiform/types/deployments/endpoints.py +28 -0
- uiform/types/deployments/links.py +36 -0
- uiform/types/deployments/mailboxes.py +67 -0
- uiform/types/deployments/outlook.py +76 -0
- uiform/types/deployments/webhooks.py +21 -0
- uiform/types/documents/__init__.py +0 -0
- uiform/types/documents/correct_orientation.py +13 -0
- uiform/types/documents/create_messages.py +226 -0
- uiform/types/documents/extractions.py +297 -0
- uiform/types/evals.py +207 -0
- uiform/types/events.py +76 -0
- uiform/types/extractions.py +85 -0
- uiform/types/jobs/__init__.py +0 -0
- uiform/types/jobs/base.py +150 -0
- uiform/types/jobs/batch_annotation.py +22 -0
- uiform/types/jobs/evaluation.py +133 -0
- uiform/types/jobs/finetune.py +6 -0
- uiform/types/jobs/prompt_optimization.py +41 -0
- uiform/types/jobs/webcrawl.py +6 -0
- uiform/types/logs.py +231 -0
- uiform/types/mime.py +257 -0
- uiform/types/modalities.py +68 -0
- uiform/types/pagination.py +6 -0
- uiform/types/schemas/__init__.py +0 -0
- uiform/types/schemas/enhance.py +53 -0
- uiform/types/schemas/evaluate.py +55 -0
- uiform/types/schemas/generate.py +32 -0
- uiform/types/schemas/layout.py +58 -0
- uiform/types/schemas/object.py +631 -0
- uiform/types/schemas/templates.py +107 -0
- uiform/types/secrets/__init__.py +0 -0
- uiform/types/secrets/external_api_keys.py +22 -0
- uiform/types/standards.py +39 -0
uiform/types/jobs/prompt_optimization.py
ADDED
@@ -0,0 +1,41 @@
# from typing import Literal, Any
# from pydantic import BaseModel, computed_field
# from ..mime import MIMEData
# from ..._utils.benchmarking import ExtractionAnalysis

# MAX_CONCURRENCY = 15


# class PromptOptimizationObject(BaseModel):
#     mime_document: MIMEData
#     target: dict
#     extracted: dict | None = None

#     @computed_field  # type: ignore
#     @property
#     def analysis(self) -> ExtractionAnalysis | None:
#         if self.extracted is None:
#             return None
#         return ExtractionAnalysis(ground_truth=self.target, prediction=self.extracted)


# Metrics = Literal["levenshtein_similarity_per_field", "accuracy_per_field"]


# # Insert default values for the parameters
# class PromptOptimizationProps(BaseModel):
#     start_hierarchy_level: int = 0
#     threshold: float = 0.9
#     model: str = "gpt-4o-mini"
#     iterations_per_level: int = 1
#     metric: Metrics = "accuracy_per_field"

# class PromptOptimizationJobInputData(BaseModel):
#     json_schema: dict[str, Any]
#     optimization_objects: list[PromptOptimizationObject]
#     schema_optimization_props: PromptOptimizationProps


# class PromptOptimizationJob(BaseModel):
#     job_type: Literal["prompt-optimization"] = "prompt-optimization"
#     input_data: PromptOptimizationJobInputData
uiform/types/logs.py
ADDED
@@ -0,0 +1,231 @@
import copy
import datetime
import json
from typing import Any, Dict, List, Literal, Optional

import nanoid  # type: ignore
from openai import OpenAI
from openai.types.chat.chat_completion_reasoning_effort import ChatCompletionReasoningEffort
from pydantic import BaseModel, EmailStr, Field, HttpUrl, computed_field, field_serializer
from pydantic_core import Url

from .._utils.json_schema import clean_schema, compute_schema_data_id
from .._utils.mime import generate_blake2b_hash_from_string
from .._utils.usage.usage import compute_cost_from_model, compute_cost_from_model_with_breakdown, CostBreakdown
from .ai_models import Amount
from .documents.extractions import UiParsedChatCompletion
from .mime import BaseMIMEData
from .modalities import Modality
from .pagination import ListMetadata


class ProcessorConfig(BaseModel):
    object: str = Field(default="processor", description="Type of the object")
    id: str = Field(default_factory=lambda: "proc_" + nanoid.generate(), description="Unique identifier for the processor")
    updated_at: datetime.datetime = Field(default_factory=lambda: datetime.datetime.now(datetime.timezone.utc))
    name: str = Field(..., description="Name of the processor")

    modality: Modality
    image_resolution_dpi: int = Field(default=96, description="Resolution of the image sent to the LLM")
    browser_canvas: Literal['A3', 'A4', 'A5'] = Field(default='A4', description="Sets the size of the browser canvas for rendering documents in browser-based processing. Choose a size that matches the document type.")

    # New attributes
    model: str = Field(..., description="Model used for chat completion")
    json_schema: dict[str, Any] = Field(..., description="JSON schema format used to validate the output data.")
    temperature: float = Field(default=0.0, description="Temperature for sampling. If not provided, the default temperature for the model will be used.", examples=[0.0])
    reasoning_effort: ChatCompletionReasoningEffort = Field(
        default="medium", description="The effort level for the model to reason about the input data. If not provided, the default reasoning effort for the model will be used."
    )
    n_consensus: int = Field(default=1, description="Number of consensus required to validate the data")

    @computed_field  # type: ignore
    @property
    def schema_data_id(self) -> str:
        """Returns the SHA1 hash of the schema data, ignoring all prompt/description/default fields.

        Returns:
            str: A SHA1 hash string representing the schema data version.
        """
        return compute_schema_data_id(self.json_schema)

    # This is a computed field, it is exposed when serializing the object
    @computed_field  # type: ignore
    @property
    def schema_id(self) -> str:
        """Returns the SHA1 hash of the complete schema.

        Returns:
            str: A SHA1 hash string representing the complete schema version.
        """
        return "sch_id_" + generate_blake2b_hash_from_string(json.dumps(self.json_schema, sort_keys=True).strip())


class AutomationConfig(BaseModel):
    object: str = Field(default="automation", description="Type of the object")
    id: str = Field(default_factory=lambda: "auto_" + nanoid.generate(), description="Unique identifier for the automation")
    name: str = Field(..., description="Name of the automation")
    processor_id: str = Field(..., description="ID of the processor to use for the automation")
    updated_at: datetime.datetime = Field(default_factory=lambda: datetime.datetime.now(datetime.timezone.utc), description="Timestamp of the last update")

    default_language: str = Field(default="en", description="Default language for the automation")

    # HTTP Config
    webhook_url: HttpUrl = Field(..., description="Url of the webhook to send the data to")
    webhook_headers: Dict[str, str] = Field(default_factory=dict, description="Headers to send with the request")

    need_validation: bool = Field(default=False, description="If the automation needs to be validated before running")

    @field_serializer('webhook_url')
    def url2str(self, val: HttpUrl) -> str:
        return str(val)


class UpdateProcessorRequest(BaseModel):
    # ------------------------------
    # Processor Parameters
    # ------------------------------
    name: Optional[str] = None
    modality: Optional[Modality] = None
    image_resolution_dpi: Optional[int] = None
    browser_canvas: Optional[Literal['A3', 'A4', 'A5']] = None
    model: Optional[str] = None
    json_schema: Optional[Dict] = None
    temperature: Optional[float] = None
    reasoning_effort: Optional[ChatCompletionReasoningEffort] = None
    n_consensus: Optional[int] = None

    @computed_field  # type: ignore
    @property
    def schema_data_id(self) -> Optional[str]:
        """Returns the SHA1 hash of the schema data, ignoring all prompt/description/default fields.

        Returns:
            str: A SHA1 hash string representing the schema data version.
        """
        if self.json_schema is None:
            return None
        return compute_schema_data_id(self.json_schema)

    @computed_field  # type: ignore
    @property
    def schema_id(self) -> Optional[str]:
        """Returns the SHA1 hash of the complete schema.

        Returns:
            str: A SHA1 hash string representing the complete schema version.
        """
        if self.json_schema is None:
            return None
        return "sch_id_" + generate_blake2b_hash_from_string(json.dumps(self.json_schema, sort_keys=True).strip())


class UpdateAutomationRequest(BaseModel):
    name: Optional[str] = None
    processor_id: Optional[str] = None

    default_language: Optional[str] = None

    webhook_url: Optional[HttpUrl] = None
    webhook_headers: Optional[Dict[str, str]] = None

    need_validation: Optional[bool] = None

    @field_serializer('webhook_url')
    def url2str(self, val: HttpUrl | None) -> str | None:
        if isinstance(val, HttpUrl):
            return str(val)
        return val


class OpenAIRequestConfig(BaseModel):
    object: Literal['openai_request'] = "openai_request"
    id: str = Field(default_factory=lambda: "openai_req_" + nanoid.generate(), description="Unique identifier for the openai request")
    model: str
    json_schema: dict[str, Any]
    reasoning_effort: Optional[ChatCompletionReasoningEffort] = None


# ------------------------------
# ------------------------------
# ------------------------------

# from .automations.mailboxes import Mailbox
# from .automations.links import Link
# from .automations.cron import ScrappingConfig
# from .automations.outlook import Outlook

# class OpenAILog(BaseModel):
#     request_config: OpenAIRequestConfig
#     completion: ChatCompletion


class ExternalRequestLog(BaseModel):
    webhook_url: Optional[HttpUrl]
    request_body: dict[str, Any]
    request_headers: dict[str, str]
    request_at: datetime.datetime

    response_body: dict[str, Any]
    response_headers: dict[str, str]
    response_at: datetime.datetime

    status_code: int
    error: Optional[str] = None
    duration_ms: float

    @field_serializer('webhook_url')
    def url2str(self, val: HttpUrl | None) -> str | None:
        if isinstance(val, HttpUrl):
            return str(val)
        return val


from openai.types.chat import completion_create_params
from openai.types.chat.chat_completion import ChatCompletion


class LogCompletionRequest(BaseModel):
    json_schema: dict[str, Any]
    completion: ChatCompletion


class AutomationLog(BaseModel):
    object: Literal['automation_log'] = "automation_log"
    id: str = Field(default_factory=lambda: "log_auto_" + nanoid.generate(), description="Unique identifier for the automation log")
    user_email: Optional[EmailStr]  # When the user is logged or when he forwards an email
    organization_id: str
    created_at: datetime.datetime = Field(default_factory=lambda: datetime.datetime.now(datetime.timezone.utc))
    automation_snapshot: AutomationConfig
    completion: UiParsedChatCompletion | ChatCompletion
    file_metadata: Optional[BaseMIMEData]
    external_request_log: Optional[ExternalRequestLog]
    extraction_id: Optional[str] = Field(default=None, description="ID of the extraction")

    @computed_field  # type: ignore
    @property
    def api_cost(self) -> Optional[Amount]:
        if self.completion and self.completion.usage:
            try:
                cost = compute_cost_from_model(self.completion.model, self.completion.usage)
                return cost
            except Exception as e:
                print(f"Error computing cost: {e}")
                return None
        return None

    @computed_field  # type: ignore
    @property
    def cost_breakdown(self) -> Optional[CostBreakdown]:
        if self.completion and self.completion.usage:
            try:
                cost = compute_cost_from_model_with_breakdown(self.completion.model, self.completion.usage)
                return cost
            except Exception as e:
                print(f"Error computing cost: {e}")
                return None
        return None


class ListLogs(BaseModel):
    data: List[AutomationLog]
    list_metadata: ListMetadata
uiform/types/mime.py
ADDED
@@ -0,0 +1,257 @@
import base64
import datetime
import gzip
import hashlib
import mimetypes
import re
from typing import Any, Optional, Self, Sequence

from pydantic import BaseModel, Field, field_validator


def generate_blake2b_hash_from_bytes(bytes_: bytes) -> str:
    return hashlib.blake2b(bytes_, digest_size=8).hexdigest()


def generate_blake2b_hash_from_base64(base64_string: str) -> str:
    return generate_blake2b_hash_from_bytes(base64.b64decode(base64_string))


# **** OCR DATACLASSES (DocumentAI-compatible) ****
class Point(BaseModel):
    x: int
    y: int


class Matrix(BaseModel):
    """Representation for transformation matrix, compatible with OpenCV format.

    This represents transformation matrices that were applied to the original
    document image to produce the processed page image.
    """

    rows: int = Field(description="Number of rows in the matrix")
    cols: int = Field(description="Number of columns in the matrix")
    type_: int = Field(description="OpenCV data type (e.g., 0 for CV_8U)")
    data: str = Field(description="The matrix data compressed with gzip and encoded as base64 string for JSON serialization")

    @property
    def data_bytes(self) -> bytes:
        """Get the matrix data as bytes."""
        # Decode base64 then decompress with gzip
        compressed_data = base64.b64decode(self.data)
        return gzip.decompress(compressed_data)

    @classmethod
    def from_bytes(cls, rows: int, cols: int, type_: int, data_bytes: bytes) -> Self:
        """Create a Matrix from raw bytes data."""
        # Compress with gzip then encode with base64
        compressed_data = gzip.compress(data_bytes, compresslevel=6)  # Good balance of speed vs compression
        encoded_data = base64.b64encode(compressed_data).decode("utf-8")
        return cls(rows=rows, cols=cols, type_=type_, data=encoded_data)


class TextBox(BaseModel):
    width: int
    height: int
    center: Point
    vertices: tuple[Point, Point, Point, Point] = Field(description="(top-left, top-right, bottom-right, bottom-left)")
    text: str

    @field_validator('width', 'height')
    @classmethod
    def check_positive_dimensions(cls, v: int) -> int:
        if not isinstance(v, int) or v <= 0:
            raise ValueError(f"Dimension must be a positive integer, got {v}")
        return v


class Page(BaseModel):
    page_number: int
    width: int
    height: int
    unit: str = Field(default="pixels", description="The unit of the page dimensions")
    blocks: list[TextBox]
    lines: list[TextBox]
    tokens: list[TextBox]
    transforms: list[Matrix] = Field(default=[], description="Transformation matrices applied to the original document image")

    @field_validator('width', 'height')
    @classmethod
    def check_positive_dimensions(cls, v: int) -> int:
        if not isinstance(v, int) or v <= 0:
            raise ValueError(f"Page dimension must be a positive integer, got {v}")
        return v


class OCR(BaseModel):
    pages: list[Page]


class MIMEData(BaseModel):
    filename: str = Field(description="The filename of the file", examples=["file.pdf", "image.png", "data.txt"])
    url: str = Field(description="The URL of the file in base64 format", examples=["data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAADIA..."])

    @property
    def id(self) -> str:
        return f"file_{generate_blake2b_hash_from_base64(self.content)}"

    @property
    def extension(self) -> str:
        return self.filename.split('.')[-1].lower()

    @property
    def content(self) -> str:
        if self.url.startswith('data:'):
            # Extract base64 content from data URL
            base64_content = self.url.split(',')[1]
            return base64_content
        else:
            raise ValueError("Content is not available for this file")

    @property
    def mime_type(self) -> str:
        if self.url.startswith('data:'):
            return self.url.split(';')[0].split(':')[1]
        else:
            return mimetypes.guess_type(self.filename)[0] or "application/octet-stream"

    @property
    def unique_filename(self) -> str:
        return f"{self.id}.{self.extension}"

    @property
    def size(self) -> int:
        # size in bytes
        return len(base64.b64decode(self.content))

    def __str__(self) -> str:
        truncated_url = self.url[:50] + '...' if len(self.url) > 50 else self.url
        # truncated_content = self.content[:50] + '...' if len(self.content) > 50 else self.content
        return f"MIMEData(filename='{self.filename}', url='{truncated_url}', mime_type='{self.mime_type}', size='{self.size}', extension='{self.extension}')"

    def __repr__(self) -> str:
        return self.__str__()


class BaseMIMEData(MIMEData):
    @classmethod
    def model_validate(cls, obj: Any, *, strict: bool | None = None, from_attributes: bool | None = None, context: Any | None = None) -> Self:
        if isinstance(obj, MIMEData):
            # Convert MIMEData instance to dict
            obj = obj.model_dump()
        if isinstance(obj, dict) and 'url' in obj:
            # Truncate URL to 1000 chars or less, ensuring it's a valid base64 string
            if len(obj['url']) > 1000:
                # Find the position of the base64 data
                if ',' in obj['url']:
                    prefix, base64_data = obj['url'].split(',', 1)
                    # Calculate how many characters we can keep (must be a multiple of 4)
                    max_base64_len = 1000 - len(prefix) - 1  # -1 for the comma
                    # Ensure the length is a multiple of 4
                    max_base64_len = max_base64_len - (max_base64_len % 4)
                    # Truncate and reassemble
                    obj['url'] = prefix + ',' + base64_data[:max_base64_len]
                else:
                    # If there's no comma (unexpected format), truncate to 996 chars (multiple of 4)
                    obj['url'] = obj['url'][:996]
        return super().model_validate(obj, strict=strict, from_attributes=from_attributes, context=context)

    @property
    def id(self) -> str:
        raise NotImplementedError("id is not implemented for BaseMIMEData - id is the hash of the content, so it's not possible to generate it from the base class")

    def __str__(self) -> str:
        truncated_url = self.url[:50] + '...' if len(self.url) > 50 else self.url
        truncated_content = self.content[:50] + '...' if len(self.content) > 50 else self.content
        return f"BaseMIMEData(filename='{self.filename}', url='{truncated_url}', content='{truncated_content}', mime_type='{self.mime_type}', extension='{self.extension}')"

    def __repr__(self) -> str:
        return self.__str__()


# **** MIME DATACLASSES ****
class AttachmentMetadata(BaseModel):
    is_inline: bool = Field(default=False, description="Whether the attachment is inline or not.")
    inline_cid: Optional[str] = Field(default=None, description="CID reference for inline attachments.")
    source: Optional[str] = Field(
        default=None,
        description="Source of the attachment in dot notation attachment_id, or email_id.attachment_id, allow us to keep track of the origin of the attachment, for search purposes. ",
    )


class BaseAttachmentMIMEData(BaseMIMEData):
    metadata: AttachmentMetadata = Field(default=AttachmentMetadata(), description="Additional metadata about the attachment.")


class AttachmentMIMEData(MIMEData):
    metadata: AttachmentMetadata = Field(default=AttachmentMetadata(), description="Additional metadata about the attachment.")


# **** EMAIL DATACLASSES ****


class EmailAddressData(BaseModel):
    email: str = Field(..., description="The email address")
    display_name: Optional[str] = Field(default=None, description="The display name associated with the email address")

    def __str__(self) -> str:
        if self.display_name:
            return f"{self.display_name} <{self.email}>"
        else:
            return f"<{self.email}>"


# Light EmailData object that can conveniently be stored in mongoDB for search
class BaseEmailData(BaseModel):
    id: str = Field(..., description="The Message-ID header of the email")
    tree_id: str = Field(..., description="The root email ID, which is references[0] if it exists, otherwise the email's ID")

    subject: Optional[str] = Field(default=None, description="The subject of the email")
    body_plain: Optional[str] = Field(default=None, description="The plain text body of the email")
    body_html: Optional[str] = Field(default=None, description="The HTML body of the email")
    sender: EmailAddressData = Field(..., description="The sender's email address information")
    recipients_to: list[EmailAddressData] = Field(..., description="List of primary recipients' email address information")
    recipients_cc: list[EmailAddressData] = Field(default=[], description="List of carbon copy recipients' email address information")
    recipients_bcc: list[EmailAddressData] = Field(default=[], description="List of blind carbon copy recipients' email address information")
    sent_at: datetime.datetime = Field(..., description="The date and time when the email was sent")
    received_at: Optional[datetime.datetime] = Field(default=None, description="The date and time when the email was received")

    in_reply_to: Optional[str] = Field(default=None, description="The Message-ID of the email this is replying to")
    references: list[str] = Field(default=[], description="List of Message-IDs this email references")
    headers: dict[str, str] = Field(default={}, description="Dictionary of email headers")

    url: Optional[str] = Field(default=None, description="URL where the email content can be accessed")

    attachments: Sequence[BaseAttachmentMIMEData] = Field(default=[], description="List of email attachments")

    @property
    def unique_filename(self) -> str:
        cleaned_id = re.sub(r'[\s<>]', '', self.id)
        return f"{cleaned_id}.eml"

    def __repr__(self) -> str:
        recipient_count = len(self.recipients_to) + len(self.recipients_cc) + len(self.recipients_bcc)
        attachment_count = len(self.attachments)

        subject_preview = self.subject
        body_preview = self.body_plain[:5000] + '...' if self.body_plain and len(self.body_plain) > 5000 else self.body_plain

        return (
            f"BaseEmailData("
            f"id='{self.id}', "
            f"subject='{subject_preview}', "
            f"body='{body_preview}', "
            f"sender='{self.sender.email}', "
            f"recipients={recipient_count}, "
            f"attachments={attachment_count}, "
            f"sent_at='{self.sent_at.strftime('%Y-%m-%d %H:%M:%S')}'"
            f")"
        )

    def __str__(self) -> str:
        return self.__repr__()


class EmailData(BaseEmailData):
    attachments: Sequence[AttachmentMIMEData] = Field([], description="List of email attachments")  # type: ignore
uiform/types/modalities.py
ADDED
@@ -0,0 +1,68 @@
from typing import Literal

BaseModality = Literal["text", "image"]  # "video" , "audio"
Modality = Literal[BaseModality, "native", "image+text"]
TYPE_FAMILIES = Literal["excel", "word", "powerpoint", "pdf", "image", "text", "email", "audio", "html", "web"]
NativeModalities: dict[TYPE_FAMILIES, Modality] = {
    'excel': 'image',
    'word': 'image',
    'html': 'text',
    'powerpoint': 'image',
    'pdf': 'image',
    'image': 'image',
    'web': 'image',
    'text': 'text',
    'email': 'native',
    'audio': 'text',
}

EXCEL_TYPES = Literal[".xls", ".xlsx", ".ods"]
WORD_TYPES = Literal[".doc", ".docx", ".odt"]
PPT_TYPES = Literal[".ppt", ".pptx", ".odp"]
PDF_TYPES = Literal[".pdf"]
IMAGE_TYPES = Literal[".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp"]
TEXT_TYPES = Literal[
    ".txt",
    ".csv",
    ".tsv",
    ".md",
    ".log",
    ".xml",
    ".json",
    ".yaml",
    ".yml",
    ".rtf",
    ".ini",
    ".conf",
    ".cfg",
    ".nfo",
    ".srt",
    ".sql",
    ".sh",
    ".bat",
    ".ps1",
    ".js",
    ".jsx",
    ".ts",
    ".tsx",
    ".py",
    ".java",
    ".c",
    ".cpp",
    ".cs",
    ".rb",
    ".php",
    ".swift",
    ".kt",
    ".go",
    ".rs",
    ".pl",
    ".r",
    ".m",
    ".scala",
]
HTML_TYPES = Literal[".html", ".htm"]
WEB_TYPES = Literal[".mhtml"]
EMAIL_TYPES = Literal[".eml", ".msg"]
AUDIO_TYPES = Literal[".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm"]
SUPPORTED_TYPES = Literal[EXCEL_TYPES, WORD_TYPES, PPT_TYPES, PDF_TYPES, IMAGE_TYPES, TEXT_TYPES, HTML_TYPES, WEB_TYPES, EMAIL_TYPES, AUDIO_TYPES]
File without changes
uiform/types/schemas/enhance.py
ADDED
@@ -0,0 +1,53 @@
from typing import Any, Self, TypedDict, Literal
from openai.types.chat.chat_completion_reasoning_effort import ChatCompletionReasoningEffort
from pydantic import BaseModel, Field, model_validator

from ..mime import MIMEData
from ..modalities import Modality


class EnhanceSchemaConfig(BaseModel):
    allow_field_description_update: bool = False  # Whether to allow the llm to update the description of existing fields
    allow_system_prompt_update: bool = True  # Whether to allow the llm to update the system prompt
    allow_reasoning_field_toggle: bool = False  # Whether to allow the llm to toggle the reasoning for fields

    # Model validator
    @model_validator(mode="after")
    def check_at_least_one_tool_allowed(self) -> Self:
        if not any([self.allow_field_description_update, self.allow_system_prompt_update, self.allow_reasoning_field_toggle]):
            raise ValueError("At least one tool must be allowed")
        return self


# Define a typed Dict for EnhanceSchemaConfig (for now it is kind of static, but we will add more flexibility in the future)
class EnhanceSchemaConfigDict(TypedDict, total=False):
    allow_field_description_update: bool
    allow_system_prompt_update: bool
    allow_reasoning_field_toggle: bool


class EnhanceSchemaRequest(BaseModel):
    """
    The request body for enhancing a JSON Schema.
    """

    documents: list[MIMEData]
    ground_truths: list[dict[str, Any]] | None = None
    model: str = "gpt-4o-mini"
    temperature: float = 0.0
    reasoning_effort: ChatCompletionReasoningEffort = "medium"
    modality: Modality
    """The modality of the document to load."""

    image_resolution_dpi: int = 96
    browser_canvas: Literal['A3', 'A4', 'A5'] = 'A4'
    """The image operations to apply to the document."""

    stream: bool = False
    """Whether to stream the response."""

    tools_config: EnhanceSchemaConfig = Field(default_factory=EnhanceSchemaConfig, description="The configuration for the tools to use")

    json_schema: dict[str, Any]
    instructions: str | None = None
    flat_likelihoods: list[dict[str, float]] | dict[str, float] | None = None