retab 0.0.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. retab-0.0.35.dist-info/METADATA +417 -0
  2. retab-0.0.35.dist-info/RECORD +111 -0
  3. retab-0.0.35.dist-info/WHEEL +5 -0
  4. retab-0.0.35.dist-info/top_level.txt +1 -0
  5. uiform/__init__.py +4 -0
  6. uiform/_resource.py +28 -0
  7. uiform/_utils/__init__.py +0 -0
  8. uiform/_utils/ai_models.py +100 -0
  9. uiform/_utils/benchmarking copy.py +588 -0
  10. uiform/_utils/benchmarking.py +485 -0
  11. uiform/_utils/chat.py +332 -0
  12. uiform/_utils/display.py +443 -0
  13. uiform/_utils/json_schema.py +2161 -0
  14. uiform/_utils/mime.py +168 -0
  15. uiform/_utils/responses.py +163 -0
  16. uiform/_utils/stream_context_managers.py +52 -0
  17. uiform/_utils/usage/__init__.py +0 -0
  18. uiform/_utils/usage/usage.py +300 -0
  19. uiform/client.py +701 -0
  20. uiform/py.typed +0 -0
  21. uiform/resources/__init__.py +0 -0
  22. uiform/resources/consensus/__init__.py +3 -0
  23. uiform/resources/consensus/client.py +114 -0
  24. uiform/resources/consensus/completions.py +252 -0
  25. uiform/resources/consensus/completions_stream.py +278 -0
  26. uiform/resources/consensus/responses.py +325 -0
  27. uiform/resources/consensus/responses_stream.py +373 -0
  28. uiform/resources/deployments/__init__.py +9 -0
  29. uiform/resources/deployments/client.py +78 -0
  30. uiform/resources/deployments/endpoints.py +322 -0
  31. uiform/resources/deployments/links.py +452 -0
  32. uiform/resources/deployments/logs.py +211 -0
  33. uiform/resources/deployments/mailboxes.py +496 -0
  34. uiform/resources/deployments/outlook.py +531 -0
  35. uiform/resources/deployments/tests.py +158 -0
  36. uiform/resources/documents/__init__.py +3 -0
  37. uiform/resources/documents/client.py +255 -0
  38. uiform/resources/documents/extractions.py +441 -0
  39. uiform/resources/evals.py +812 -0
  40. uiform/resources/files.py +24 -0
  41. uiform/resources/finetuning.py +62 -0
  42. uiform/resources/jsonlUtils.py +1046 -0
  43. uiform/resources/models.py +45 -0
  44. uiform/resources/openai_example.py +22 -0
  45. uiform/resources/processors/__init__.py +3 -0
  46. uiform/resources/processors/automations/__init__.py +9 -0
  47. uiform/resources/processors/automations/client.py +78 -0
  48. uiform/resources/processors/automations/endpoints.py +317 -0
  49. uiform/resources/processors/automations/links.py +356 -0
  50. uiform/resources/processors/automations/logs.py +211 -0
  51. uiform/resources/processors/automations/mailboxes.py +435 -0
  52. uiform/resources/processors/automations/outlook.py +444 -0
  53. uiform/resources/processors/automations/tests.py +158 -0
  54. uiform/resources/processors/client.py +474 -0
  55. uiform/resources/prompt_optimization.py +76 -0
  56. uiform/resources/schemas.py +369 -0
  57. uiform/resources/secrets/__init__.py +9 -0
  58. uiform/resources/secrets/client.py +20 -0
  59. uiform/resources/secrets/external_api_keys.py +109 -0
  60. uiform/resources/secrets/webhook.py +62 -0
  61. uiform/resources/usage.py +271 -0
  62. uiform/types/__init__.py +0 -0
  63. uiform/types/ai_models.py +645 -0
  64. uiform/types/automations/__init__.py +0 -0
  65. uiform/types/automations/cron.py +58 -0
  66. uiform/types/automations/endpoints.py +21 -0
  67. uiform/types/automations/links.py +28 -0
  68. uiform/types/automations/mailboxes.py +60 -0
  69. uiform/types/automations/outlook.py +68 -0
  70. uiform/types/automations/webhooks.py +21 -0
  71. uiform/types/chat.py +8 -0
  72. uiform/types/completions.py +93 -0
  73. uiform/types/consensus.py +10 -0
  74. uiform/types/db/__init__.py +0 -0
  75. uiform/types/db/annotations.py +24 -0
  76. uiform/types/db/files.py +36 -0
  77. uiform/types/deployments/__init__.py +0 -0
  78. uiform/types/deployments/cron.py +59 -0
  79. uiform/types/deployments/endpoints.py +28 -0
  80. uiform/types/deployments/links.py +36 -0
  81. uiform/types/deployments/mailboxes.py +67 -0
  82. uiform/types/deployments/outlook.py +76 -0
  83. uiform/types/deployments/webhooks.py +21 -0
  84. uiform/types/documents/__init__.py +0 -0
  85. uiform/types/documents/correct_orientation.py +13 -0
  86. uiform/types/documents/create_messages.py +226 -0
  87. uiform/types/documents/extractions.py +297 -0
  88. uiform/types/evals.py +207 -0
  89. uiform/types/events.py +76 -0
  90. uiform/types/extractions.py +85 -0
  91. uiform/types/jobs/__init__.py +0 -0
  92. uiform/types/jobs/base.py +150 -0
  93. uiform/types/jobs/batch_annotation.py +22 -0
  94. uiform/types/jobs/evaluation.py +133 -0
  95. uiform/types/jobs/finetune.py +6 -0
  96. uiform/types/jobs/prompt_optimization.py +41 -0
  97. uiform/types/jobs/webcrawl.py +6 -0
  98. uiform/types/logs.py +231 -0
  99. uiform/types/mime.py +257 -0
  100. uiform/types/modalities.py +68 -0
  101. uiform/types/pagination.py +6 -0
  102. uiform/types/schemas/__init__.py +0 -0
  103. uiform/types/schemas/enhance.py +53 -0
  104. uiform/types/schemas/evaluate.py +55 -0
  105. uiform/types/schemas/generate.py +32 -0
  106. uiform/types/schemas/layout.py +58 -0
  107. uiform/types/schemas/object.py +631 -0
  108. uiform/types/schemas/templates.py +107 -0
  109. uiform/types/secrets/__init__.py +0 -0
  110. uiform/types/secrets/external_api_keys.py +22 -0
  111. uiform/types/standards.py +39 -0
@@ -0,0 +1,41 @@
1
+ # from typing import Literal, Any
2
+ # from pydantic import BaseModel, computed_field
3
+ # from ..mime import MIMEData
4
+ # from ..._utils.benchmarking import ExtractionAnalysis
5
+
6
+ # MAX_CONCURRENCY = 15
7
+
8
+
9
+ # class PromptOptimizationObject(BaseModel):
10
+ # mime_document: MIMEData
11
+ # target: dict
12
+ # extracted: dict | None = None
13
+
14
+ # @computed_field # type: ignore
15
+ # @property
16
+ # def analysis(self) -> ExtractionAnalysis | None:
17
+ # if self.extracted is None:
18
+ # return None
19
+ # return ExtractionAnalysis(ground_truth=self.target, prediction=self.extracted)
20
+
21
+
22
+ # Metrics = Literal["levenshtein_similarity_per_field", "accuracy_per_field"]
23
+
24
+
25
+ # # Insert default values for the parameters
26
+ # class PromptOptimizationProps(BaseModel):
27
+ # start_hierarchy_level: int = 0
28
+ # threshold: float = 0.9
29
+ # model: str = "gpt-4o-mini"
30
+ # iterations_per_level: int = 1
31
+ # metric: Metrics = "accuracy_per_field"
32
+
33
+ # class PromptOptimizationJobInputData(BaseModel):
34
+ # json_schema: dict[str, Any]
35
+ # optimization_objects: list[PromptOptimizationObject]
36
+ # schema_optimization_props: PromptOptimizationProps
37
+
38
+
39
+ # class PromptOptimizationJob(BaseModel):
40
+ # job_type: Literal["prompt-optimization"] = "prompt-optimization"
41
+ # input_data: PromptOptimizationJobInputData
@@ -0,0 +1,6 @@
1
+ # from pydantic import BaseModel, Field
2
+ # from typing import Optional, Literal
3
+
4
+ # class WebcrawlInputData(BaseModel):
5
+ # url: str
6
+ # limit: int = Field(default=3, ge=1, le=100)
uiform/types/logs.py ADDED
@@ -0,0 +1,231 @@
1
+ import copy
2
+ import datetime
3
+ import json
4
+ from typing import Any, Dict, List, Literal, Optional
5
+
6
+ import nanoid # type: ignore
7
+ from openai import OpenAI
8
+ from openai.types.chat.chat_completion_reasoning_effort import ChatCompletionReasoningEffort
9
+ from pydantic import BaseModel, EmailStr, Field, HttpUrl, computed_field, field_serializer
10
+ from pydantic_core import Url
11
+
12
+ from .._utils.json_schema import clean_schema, compute_schema_data_id
13
+ from .._utils.mime import generate_blake2b_hash_from_string
14
+ from .._utils.usage.usage import compute_cost_from_model, compute_cost_from_model_with_breakdown, CostBreakdown
15
+ from .ai_models import Amount
16
+ from .documents.extractions import UiParsedChatCompletion
17
+ from .mime import BaseMIMEData
18
+ from .modalities import Modality
19
+ from .pagination import ListMetadata
20
+
21
+
22
# Pydantic model holding a processor's settings: identity, input rendering
# options, and the model/schema/sampling parameters used for extraction.
# NOTE(review): documented with `#` comments on purpose -- pydantic appears to surface
# class and computed-field docstrings as JSON-schema descriptions; confirm before
# adding or editing docstrings here.
class ProcessorConfig(BaseModel):
    object: str = Field(default="processor", description="Type of the object")
    # Generated per instance: "proc_" followed by a fresh nanoid.
    id: str = Field(default_factory=lambda: "proc_" + nanoid.generate(), description="Unique identifier for the processor")
    # Defaults to the timezone-aware UTC time at which the instance is created.
    updated_at: datetime.datetime = Field(default_factory=lambda: datetime.datetime.now(datetime.timezone.utc))
    name: str = Field(..., description="Name of the processor")

    modality: Modality
    image_resolution_dpi: int = Field(default=96, description="Resolution of the image sent to the LLM")
    browser_canvas: Literal['A3', 'A4', 'A5'] = Field(default='A4', description="Sets the size of the browser canvas for rendering documents in browser-based processing. Choose a size that matches the document type.")

    # New attributes
    model: str = Field(..., description="Model used for chat completion")
    json_schema: dict[str, Any] = Field(..., description="JSON schema format used to validate the output data.")
    temperature: float = Field(default=0.0, description="Temperature for sampling. If not provided, the default temperature for the model will be used.", examples=[0.0])
    reasoning_effort: ChatCompletionReasoningEffort = Field(
        default="medium", description="The effort level for the model to reason about the input data. If not provided, the default reasoning effort for the model will be used."
    )
    n_consensus: int = Field(default=1, description="Number of consensus required to validate the data")

    @computed_field  # type: ignore
    @property
    def schema_data_id(self) -> str:
        """Returns the SHA1 hash of the schema data, ignoring all prompt/description/default fields.

        Returns:
            str: A SHA1 hash string representing the schema data version.
        """
        # The hashing itself is delegated to compute_schema_data_id; the algorithm
        # is not visible from this module (the docstring's "SHA1" is unverified here).
        return compute_schema_data_id(self.json_schema)

    # This is a computed field, it is exposed when serializing the object
    @computed_field  # type: ignore
    @property
    def schema_id(self) -> str:
        """Returns the SHA1 hash of the complete schema.

        Returns:
            str: A SHA1 hash string representing the complete schema version.
        """
        # NOTE(review): despite the docstring, the value is "sch_id_" + the result of
        # generate_blake2b_hash_from_string (a blake2b helper by name) applied to the
        # key-sorted JSON dump of the schema -- docstring wording looks stale.
        return "sch_id_" + generate_blake2b_hash_from_string(json.dumps(self.json_schema, sort_keys=True).strip())
61
+
62
+
63
# Pydantic model describing an automation: which processor it uses and the
# webhook its data is sent to.
class AutomationConfig(BaseModel):
    object: str = Field(default="automation", description="Type of the object")
    # Generated per instance: "auto_" followed by a fresh nanoid.
    id: str = Field(default_factory=lambda: "auto_" + nanoid.generate(), description="Unique identifier for the automation")
    name: str = Field(..., description="Name of the automation")
    processor_id: str = Field(..., description="ID of the processor to use for the automation")
    # Defaults to the timezone-aware UTC time at which the instance is created.
    updated_at: datetime.datetime = Field(default_factory=lambda: datetime.datetime.now(datetime.timezone.utc), description="Timestamp of the last update")

    default_language: str = Field(default="en", description="Default language for the automation")

    # HTTP Config
    webhook_url: HttpUrl = Field(..., description="Url of the webhook to send the data to")
    webhook_headers: Dict[str, str] = Field(default_factory=dict, description="Headers to send with the request")

    need_validation: bool = Field(default=False, description="If the automation needs to be validated before running")

    @field_serializer('webhook_url')
    def url2str(self, val: HttpUrl) -> str:
        """Serialize ``webhook_url`` as a plain string instead of a URL object."""
        return str(val)
81
+
82
+
83
# Partial-update payload for a processor: every field is optional and defaults
# to None; both computed ids are None when no json_schema is supplied.
class UpdateProcessorRequest(BaseModel):
    # ------------------------------
    # Processor Parameters
    # ------------------------------
    name: Optional[str] = None
    modality: Optional[Modality] = None
    image_resolution_dpi: Optional[int] = None
    browser_canvas: Optional[Literal['A3', 'A4', 'A5']] = None
    model: Optional[str] = None
    json_schema: Optional[Dict] = None
    temperature: Optional[float] = None
    reasoning_effort: Optional[ChatCompletionReasoningEffort] = None
    n_consensus: Optional[int] = None

    @computed_field  # type: ignore
    @property
    def schema_data_id(self) -> Optional[str]:
        """Returns the SHA1 hash of the schema data, ignoring all prompt/description/default fields.

        Returns:
            str: A SHA1 hash string representing the schema data version.
        """
        # No schema in the update -> no id to report.
        if self.json_schema is None:
            return None
        # Hashing is delegated; the algorithm is not visible from this module.
        return compute_schema_data_id(self.json_schema)

    @computed_field  # type: ignore
    @property
    def schema_id(self) -> Optional[str]:
        """Returns the SHA1 hash of the complete schema.

        Returns:
            str: A SHA1 hash string representing the complete schema version.
        """
        if self.json_schema is None:
            return None
        # NOTE(review): despite the docstring, this is "sch_id_" + a blake2b-helper hash
        # of the key-sorted JSON dump of the schema -- docstring wording looks stale.
        return "sch_id_" + generate_blake2b_hash_from_string(json.dumps(self.json_schema, sort_keys=True).strip())
120
+
121
+
122
# Partial-update payload for an automation: every field is optional and
# defaults to None.
class UpdateAutomationRequest(BaseModel):
    name: Optional[str] = None
    processor_id: Optional[str] = None

    default_language: Optional[str] = None

    webhook_url: Optional[HttpUrl] = None
    webhook_headers: Optional[Dict[str, str]] = None

    need_validation: Optional[bool] = None

    @field_serializer('webhook_url')
    def url2str(self, val: HttpUrl | None) -> str | None:
        """Serialize ``webhook_url`` as a plain string, passing ``None`` through.

        Args:
            val: The current ``webhook_url`` value, or ``None`` when unset.

        Returns:
            ``None`` when no URL is set, otherwise ``str(val)``.
        """
        # Deliberately a None check rather than isinstance(val, HttpUrl): on pydantic
        # releases where HttpUrl is an Annotated alias instead of a real class the
        # isinstance call raises TypeError, and where it is a class any other URL-like
        # value would be returned unconverted despite the declared ``str`` return type.
        return None if val is None else str(val)
139
+
140
# Pydantic model recording the parameters of an OpenAI request: model name,
# JSON schema and optional reasoning effort.
class OpenAIRequestConfig(BaseModel):
    # Discriminator-style constant; only the literal "openai_request" is accepted.
    object: Literal['openai_request'] = "openai_request"
    # Generated per instance: "openai_req_" followed by a fresh nanoid.
    id: str = Field(default_factory=lambda: "openai_req_" + nanoid.generate(), description="Unique identifier for the openai request")
    model: str
    json_schema: dict[str, Any]
    reasoning_effort: Optional[ChatCompletionReasoningEffort] = None
146
+
147
+
148
+ # ------------------------------
149
+ # ------------------------------
150
+ # ------------------------------
151
+
152
+ # from .automations.mailboxes import Mailbox
153
+ # from .automations.links import Link
154
+ # from .automations.cron import ScrappingConfig
155
+ # from .automations.outlook import Outlook
156
+
157
+ # class OpenAILog(BaseModel):
158
+ # request_config: OpenAIRequestConfig
159
+ # completion: ChatCompletion
160
+
161
+
162
# Record of one outbound request and its response: bodies, headers,
# timestamps, status code, optional error text and duration in milliseconds.
class ExternalRequestLog(BaseModel):
    # Optional but without a default: the key must be supplied, possibly as None.
    webhook_url: Optional[HttpUrl]
    request_body: dict[str, Any]
    request_headers: dict[str, str]
    request_at: datetime.datetime

    response_body: dict[str, Any]
    response_headers: dict[str, str]
    response_at: datetime.datetime

    status_code: int
    error: Optional[str] = None
    duration_ms: float

    @field_serializer('webhook_url')
    def url2str(self, val: HttpUrl | None) -> str | None:
        """Serialize ``webhook_url`` as a plain string, passing ``None`` through.

        Args:
            val: The current ``webhook_url`` value, or ``None`` when unset.

        Returns:
            ``None`` when no URL is set, otherwise ``str(val)``.
        """
        # Deliberately a None check rather than isinstance(val, HttpUrl): on pydantic
        # releases where HttpUrl is an Annotated alias instead of a real class the
        # isinstance call raises TypeError, and where it is a class any other URL-like
        # value would be returned unconverted despite the declared ``str`` return type.
        return None if val is None else str(val)
181
+
182
+
183
+ from openai.types.chat import completion_create_params
184
+ from openai.types.chat.chat_completion import ChatCompletion
185
+
186
+
187
# Request payload pairing a JSON schema with an OpenAI ChatCompletion object.
class LogCompletionRequest(BaseModel):
    json_schema: dict[str, Any]
    completion: ChatCompletion
190
+
191
+
192
# Log entry for one automation run: a snapshot of the automation config, the
# completion it produced, optional file metadata and the outbound request log.
# Both cost fields are computed on access and fall back to None on any failure.
class AutomationLog(BaseModel):
    object: Literal['automation_log'] = "automation_log"
    # Generated per instance: "log_auto_" followed by a fresh nanoid.
    id: str = Field(default_factory=lambda: "log_auto_" + nanoid.generate(), description="Unique identifier for the automation log")
    user_email: Optional[EmailStr]  # When the user is logged or when he forwards an email
    organization_id: str
    created_at: datetime.datetime = Field(default_factory=lambda: datetime.datetime.now(datetime.timezone.utc))
    automation_snapshot: AutomationConfig
    completion: UiParsedChatCompletion | ChatCompletion
    file_metadata: Optional[BaseMIMEData]
    external_request_log: Optional[ExternalRequestLog]
    extraction_id: Optional[str] = Field(default=None, description="ID of the extraction")

    @computed_field  # type: ignore
    @property
    def api_cost(self) -> Optional[Amount]:
        # Without usage data there is nothing to price.
        if not (self.completion and self.completion.usage):
            return None
        try:
            return compute_cost_from_model(self.completion.model, self.completion.usage)
        except Exception as e:
            # Best effort: report the problem and expose no cost instead of failing.
            print(f"Error computing cost: {e}")
        return None

    @computed_field  # type: ignore
    @property
    def cost_breakdown(self) -> Optional[CostBreakdown]:
        # Same contract as api_cost, but with the itemised breakdown helper.
        if not (self.completion and self.completion.usage):
            return None
        try:
            return compute_cost_from_model_with_breakdown(self.completion.model, self.completion.usage)
        except Exception as e:
            print(f"Error computing cost: {e}")
        return None
227
+
228
+
229
# One page of automation logs together with its pagination metadata.
class ListLogs(BaseModel):
    data: List[AutomationLog]
    list_metadata: ListMetadata
uiform/types/mime.py ADDED
@@ -0,0 +1,257 @@
1
+ import base64
2
+ import datetime
3
+ import gzip
4
+ import hashlib
5
+ import mimetypes
6
+ import re
7
+ from typing import Any, Optional, Self, Sequence
8
+
9
+ from pydantic import BaseModel, Field, field_validator
10
+
11
+
12
def generate_blake2b_hash_from_bytes(bytes_: bytes) -> str:
    """Return the BLAKE2b digest of ``bytes_`` as 16 hexadecimal characters.

    The digest size is fixed at 8 bytes, hence the 16-character result.
    """
    hasher = hashlib.blake2b(digest_size=8)
    hasher.update(bytes_)
    return hasher.hexdigest()
14
+
15
+
16
def generate_blake2b_hash_from_base64(base64_string: str) -> str:
    """Decode ``base64_string`` and return the blake2b hash of the decoded bytes."""
    raw = base64.b64decode(base64_string)
    return generate_blake2b_hash_from_bytes(raw)
18
+
19
+
20
+ # **** OCR DATACLASSES (DocumentAI-compatible) ****
21
# Integer 2-D coordinate used for text-box centers and vertices.
class Point(BaseModel):
    x: int
    y: int
24
+
25
+
26
class Matrix(BaseModel):
    """Representation for transformation matrix, compatible with OpenCV format.

    This represents transformation matrices that were applied to the original
    document image to produce the processed page image.
    """

    rows: int = Field(description="Number of rows in the matrix")
    cols: int = Field(description="Number of columns in the matrix")
    type_: int = Field(description="OpenCV data type (e.g., 0 for CV_8U)")
    data: str = Field(description="The matrix data compressed with gzip and encoded as base64 string for JSON serialization")

    @property
    def data_bytes(self) -> bytes:
        """Return the raw matrix bytes: base64-decode ``data``, then gunzip it."""
        return gzip.decompress(base64.b64decode(self.data))

    @classmethod
    def from_bytes(cls, rows: int, cols: int, type_: int, data_bytes: bytes) -> Self:
        """Build a Matrix from raw bytes, storing them gzipped and base64-encoded."""
        # Level 6 is kept as the speed/size trade-off for the gzip step.
        encoded = base64.b64encode(gzip.compress(data_bytes, compresslevel=6))
        return cls(rows=rows, cols=cols, type_=type_, data=encoded.decode("utf-8"))
52
+
53
+
54
# A piece of recognised text with its bounding geometry (size, center, corners).
class TextBox(BaseModel):
    width: int
    height: int
    center: Point
    vertices: tuple[Point, Point, Point, Point] = Field(description="(top-left, top-right, bottom-right, bottom-left)")
    text: str

    @field_validator('width', 'height')
    @classmethod
    def check_positive_dimensions(cls, v: int) -> int:
        """Accept a dimension only when it is a strictly positive integer."""
        if isinstance(v, int) and v > 0:
            return v
        raise ValueError(f"Dimension must be a positive integer, got {v}")
67
+
68
+
69
# One OCR page: its dimensions, the text boxes found at block/line/token level
# and the transformation matrices applied to the original image.
class Page(BaseModel):
    page_number: int
    width: int
    height: int
    unit: str = Field(default="pixels", description="The unit of the page dimensions")
    blocks: list[TextBox]
    lines: list[TextBox]
    tokens: list[TextBox]
    transforms: list[Matrix] = Field(default=[], description="Transformation matrices applied to the original document image")

    @field_validator('width', 'height')
    @classmethod
    def check_positive_dimensions(cls, v: int) -> int:
        """Accept a page dimension only when it is a strictly positive integer."""
        if isinstance(v, int) and v > 0:
            return v
        raise ValueError(f"Page dimension must be a positive integer, got {v}")
85
+
86
+
87
# OCR result for a document: the list of its pages.
class OCR(BaseModel):
    pages: list[Page]
89
+
90
+
91
# A file described by its filename and a URL. When the URL is a ``data:`` URL
# the payload after the first comma is treated as base64 content; identifier,
# size and unique filename are all derived from that payload.
class MIMEData(BaseModel):
    filename: str = Field(description="The filename of the file", examples=["file.pdf", "image.png", "data.txt"])
    url: str = Field(description="The URL of the file in base64 format", examples=["..."])

    @property
    def id(self) -> str:
        """Content-derived identifier: ``file_`` plus the hash of the decoded payload.

        Raises:
            ValueError: If no inline content is available (see ``content``).
        """
        return f"file_{generate_blake2b_hash_from_base64(self.content)}"

    @property
    def extension(self) -> str:
        """Lower-cased text after the last ``.`` of ``filename``.

        NOTE(review): a filename without any dot yields the whole lower-cased
        filename; left unchanged because ``unique_filename`` depends on it.
        """
        return self.filename.split('.')[-1].lower()

    @property
    def content(self) -> str:
        """Payload of a ``data:`` URL, i.e. everything after the first comma.

        Raises:
            ValueError: If ``url`` is not a ``data:`` URL, or is one without a
                comma-separated payload (previously an undocumented IndexError).
        """
        if self.url.startswith('data:'):
            _, comma, payload = self.url.partition(',')
            if comma:
                return payload
        raise ValueError("Content is not available for this file")

    @property
    def mime_type(self) -> str:
        """MIME type declared by a ``data:`` URL, otherwise guessed from ``filename``.

        Falls back to ``application/octet-stream`` when the filename gives no hint.
        """
        if self.url.startswith('data:'):
            # Look only at the header before the first comma so that a URL without
            # any ';' parameter does not leak the payload into the MIME type.
            header = self.url.partition(',')[0]
            return header[len('data:'):].split(';')[0]
        return mimetypes.guess_type(self.filename)[0] or "application/octet-stream"

    @property
    def unique_filename(self) -> str:
        """``<id>.<extension>``; raises whenever ``id`` cannot be computed."""
        return f"{self.id}.{self.extension}"

    @property
    def size(self) -> int:
        """Size in bytes of the base64-decoded content.

        Raises:
            ValueError: If no inline content is available or it is not valid base64.
        """
        return len(base64.b64decode(self.content))

    def __str__(self) -> str:
        """Short description with the URL truncated to 50 characters.

        Never raises because of the URL shape: when the size cannot be computed
        (no inline content or invalid base64) it is reported as ``None``.
        """
        truncated_url = self.url[:50] + '...' if len(self.url) > 50 else self.url
        try:
            size = self.size
        except ValueError:
            # binascii.Error is a ValueError subclass, so bad base64 lands here too.
            size = None
        return f"MIMEData(filename='{self.filename}', url='{truncated_url}', mime_type='{self.mime_type}', size='{size}', extension='{self.extension}')"

    def __repr__(self) -> str:
        """Same text as ``__str__``."""
        return self.__str__()
135
+
136
+
137
# Light variant of MIMEData whose URL is cut down to at most 1000 characters
# when built through ``model_validate``; consequently it has no content-based id.
class BaseMIMEData(MIMEData):
    @classmethod
    def model_validate(cls, obj: Any, *, strict: bool | None = None, from_attributes: bool | None = None, context: Any | None = None) -> Self:
        """Validate ``obj`` after truncating an over-long ``url`` to 1000 characters.

        A ``MIMEData`` instance is first dumped to a dict. For ``data:``-style URLs
        (containing a comma) the part after the comma is cut to a multiple of 4
        characters so it stays a well-formed base64 prefix; other URLs are cut to
        996 characters. The caller's dict is never modified: a shallow copy
        carrying the shortened URL is validated instead.

        NOTE(review): this hook presumably only runs for direct ``model_validate``
        calls, not when the model is validated as a nested field -- confirm.
        """
        if isinstance(obj, MIMEData):
            # Convert MIMEData instance to dict
            obj = obj.model_dump()
        if isinstance(obj, dict):
            url = obj.get('url')
            # Non-string values are left alone so pydantic reports them as a
            # validation error instead of len() crashing here.
            if isinstance(url, str) and len(url) > 1000:
                if ',' in url:
                    prefix, base64_data = url.split(',', 1)
                    # Budget left for the payload (-1 for the comma); clamped so an
                    # over-long prefix cannot turn into a negative slice bound.
                    max_base64_len = max(0, 1000 - len(prefix) - 1)
                    # Ensure the length is a multiple of 4
                    max_base64_len -= max_base64_len % 4
                    url = prefix + ',' + base64_data[:max_base64_len]
                else:
                    # If there's no comma (unexpected format), truncate to 996 chars (multiple of 4)
                    url = url[:996]
                obj = {**obj, 'url': url}
        return super().model_validate(obj, strict=strict, from_attributes=from_attributes, context=context)

    @property
    def id(self) -> str:
        """Always raises: the id hashes the full content, which is not kept here."""
        raise NotImplementedError("id is not implemented for BaseMIMEData - id is the hash of the content, so it's not possible to generate it from the base class")

    def __str__(self) -> str:
        """Short description with URL and content truncated to 50 characters.

        Content is reported as ``None`` when the URL carries no inline payload,
        so printing an instance never raises because of the URL shape.
        """
        truncated_url = self.url[:50] + '...' if len(self.url) > 50 else self.url
        try:
            content = self.content
        except (ValueError, IndexError):
            truncated_content = None
        else:
            truncated_content = content[:50] + '...' if len(content) > 50 else content
        return f"BaseMIMEData(filename='{self.filename}', url='{truncated_url}', content='{truncated_content}', mime_type='{self.mime_type}', extension='{self.extension}')"

    def __repr__(self) -> str:
        """Same text as ``__str__``."""
        return self.__str__()
171
+
172
+
173
+ # **** MIME DATACLASSES ****
174
# Extra information about an attachment: inline flag, inline CID and origin.
class AttachmentMetadata(BaseModel):
    is_inline: bool = Field(default=False, description="Whether the attachment is inline or not.")
    inline_cid: Optional[str] = Field(default=None, description="CID reference for inline attachments.")
    source: Optional[str] = Field(
        default=None,
        description="Source of the attachment in dot notation attachment_id, or email_id.attachment_id, allow us to keep track of the origin of the attachment, for search purposes. ",
    )
181
+
182
+
183
# BaseMIMEData (truncated-URL variant) extended with attachment metadata.
class BaseAttachmentMIMEData(BaseMIMEData):
    # The default is a pre-built AttachmentMetadata instance with all-default values.
    metadata: AttachmentMetadata = Field(default=AttachmentMetadata(), description="Additional metadata about the attachment.")
185
+
186
+
187
# MIMEData (full-content variant) extended with attachment metadata.
class AttachmentMIMEData(MIMEData):
    # The default is a pre-built AttachmentMetadata instance with all-default values.
    metadata: AttachmentMetadata = Field(default=AttachmentMetadata(), description="Additional metadata about the attachment.")
189
+
190
+
191
+ # **** EMAIL DATACLASSES ****
192
+
193
+
194
# An email address with an optional display name.
class EmailAddressData(BaseModel):
    email: str = Field(..., description="The email address")
    display_name: Optional[str] = Field(default=None, description="The display name associated with the email address")

    def __str__(self) -> str:
        """Format as ``Name <address>``, or just ``<address>`` without a display name."""
        label = f"{self.display_name} " if self.display_name else ""
        return f"{label}<{self.email}>"
203
+
204
+
205
+ # Light EmailData object that can conveniently be stored in mongoDB for search
206
# Email record with headers, bodies, participants and truncated-URL attachments.
class BaseEmailData(BaseModel):
    id: str = Field(..., description="The Message-ID header of the email")
    tree_id: str = Field(..., description="The root email ID, which is references[0] if it exists, otherwise the email's ID")

    subject: Optional[str] = Field(default=None, description="The subject of the email")
    body_plain: Optional[str] = Field(default=None, description="The plain text body of the email")
    body_html: Optional[str] = Field(default=None, description="The HTML body of the email")
    sender: EmailAddressData = Field(..., description="The sender's email address information")
    recipients_to: list[EmailAddressData] = Field(..., description="List of primary recipients' email address information")
    recipients_cc: list[EmailAddressData] = Field(default=[], description="List of carbon copy recipients' email address information")
    recipients_bcc: list[EmailAddressData] = Field(default=[], description="List of blind carbon copy recipients' email address information")
    sent_at: datetime.datetime = Field(..., description="The date and time when the email was sent")
    received_at: Optional[datetime.datetime] = Field(default=None, description="The date and time when the email was received")

    in_reply_to: Optional[str] = Field(default=None, description="The Message-ID of the email this is replying to")
    references: list[str] = Field(default=[], description="List of Message-IDs this email references")
    headers: dict[str, str] = Field(default={}, description="Dictionary of email headers")

    url: Optional[str] = Field(default=None, description="URL where the email content can be accessed")

    attachments: Sequence[BaseAttachmentMIMEData] = Field(default=[], description="List of email attachments")

    @property
    def unique_filename(self) -> str:
        """``.eml`` filename built from ``id`` with whitespace and angle brackets removed."""
        return re.sub(r'[\s<>]', '', self.id) + ".eml"

    def __repr__(self) -> str:
        """Compact summary: ids, previews, and recipient/attachment counts."""
        # Only bodies longer than 5000 characters are shortened (with a trailing '...').
        body = self.body_plain
        if body and len(body) > 5000:
            body = body[:5000] + '...'

        recipient_groups = (self.recipients_to, self.recipients_cc, self.recipients_bcc)
        parts = [
            f"id='{self.id}'",
            f"subject='{self.subject}'",
            f"body='{body}'",
            f"sender='{self.sender.email}'",
            f"recipients={sum(len(group) for group in recipient_groups)}",
            f"attachments={len(self.attachments)}",
            f"sent_at='{self.sent_at.strftime('%Y-%m-%d %H:%M:%S')}'",
        ]
        return "BaseEmailData(" + ", ".join(parts) + ")"

    def __str__(self) -> str:
        """Same text as ``__repr__``."""
        return self.__repr__()
254
+
255
+
256
# BaseEmailData whose attachments are AttachmentMIMEData (built on MIMEData)
# instead of BaseAttachmentMIMEData; the override is why the type-ignore is needed.
class EmailData(BaseEmailData):
    attachments: Sequence[AttachmentMIMEData] = Field([], description="List of email attachments")  # type: ignore
@@ -0,0 +1,68 @@
1
+ from typing import Literal
2
+
3
# Elementary modalities.
BaseModality = Literal["text", "image"]  # "video" , "audio"
# Every accepted modality: the base ones plus "native" and the combined "image+text".
Modality = Literal[BaseModality, "native", "image+text"]
# Coarse document families; each of them is a key of NativeModalities below.
TYPE_FAMILIES = Literal["excel", "word", "powerpoint", "pdf", "image", "text", "email", "audio", "html", "web"]
# Modality associated with each type family.
# NOTE(review): presumably what the "native" modality resolves to per family --
# confirm against callers (note that 'email' itself maps to 'native').
NativeModalities: dict[TYPE_FAMILIES, Modality] = {
    'excel': 'image',
    'word': 'image',
    'html': 'text',
    'powerpoint': 'image',
    'pdf': 'image',
    'image': 'image',
    'web': 'image',
    'text': 'text',
    'email': 'native',
    'audio': 'text',
}
18
+
19
# File extensions (lower case, with the leading dot) grouped by type family.
EXCEL_TYPES = Literal[".xls", ".xlsx", ".ods"]
WORD_TYPES = Literal[".doc", ".docx", ".odt"]
PPT_TYPES = Literal[".ppt", ".pptx", ".odp"]
PDF_TYPES = Literal[".pdf"]
IMAGE_TYPES = Literal[".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp"]
# Plain-text formats: data/markup files, configuration files and source code.
TEXT_TYPES = Literal[
    ".txt",
    ".csv",
    ".tsv",
    ".md",
    ".log",
    ".xml",
    ".json",
    ".yaml",
    ".yml",
    ".rtf",
    ".ini",
    ".conf",
    ".cfg",
    ".nfo",
    ".srt",
    ".sql",
    ".sh",
    ".bat",
    ".ps1",
    ".js",
    ".jsx",
    ".ts",
    ".tsx",
    ".py",
    ".java",
    ".c",
    ".cpp",
    ".cs",
    ".rb",
    ".php",
    ".swift",
    ".kt",
    ".go",
    ".rs",
    ".pl",
    ".r",
    ".m",
    ".scala",
]
HTML_TYPES = Literal[".html", ".htm"]
WEB_TYPES = Literal[".mhtml"]
EMAIL_TYPES = Literal[".eml", ".msg"]
AUDIO_TYPES = Literal[".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm"]
# Union of every per-family extension list above.
SUPPORTED_TYPES = Literal[EXCEL_TYPES, WORD_TYPES, PPT_TYPES, PDF_TYPES, IMAGE_TYPES, TEXT_TYPES, HTML_TYPES, WEB_TYPES, EMAIL_TYPES, AUDIO_TYPES]
@@ -0,0 +1,6 @@
1
+ from pydantic import BaseModel
2
+
3
+
4
# Pagination metadata attached to list responses.
# Neither field has a default, so both must be provided explicitly (None is allowed).
# NOTE(review): presumably cursor values for the previous/next page -- confirm.
class ListMetadata(BaseModel):
    before: str | None
    after: str | None
File without changes
@@ -0,0 +1,53 @@
1
+ from typing import Any, Self, TypedDict, Literal
2
+ from openai.types.chat.chat_completion_reasoning_effort import ChatCompletionReasoningEffort
3
+ from pydantic import BaseModel, Field, model_validator
4
+
5
+ from ..mime import MIMEData
6
+ from ..modalities import Modality
7
+
8
+
9
# Switches controlling what the LLM may change when enhancing a schema;
# validation rejects a configuration in which every switch is off.
class EnhanceSchemaConfig(BaseModel):
    allow_field_description_update: bool = False  # Whether to allow the llm to update the description of existing fields
    allow_system_prompt_update: bool = True  # Whether to allow the llm to update the system prompt
    allow_reasoning_field_toggle: bool = False  # Whether to allow the llm to toggle the reasoning for fields

    # Model validator
    @model_validator(mode="after")
    def check_at_least_one_tool_allowed(self) -> Self:
        """Return the instance unchanged if at least one switch is enabled.

        Raises:
            ValueError: If all three switches are disabled.
        """
        if self.allow_field_description_update or self.allow_system_prompt_update or self.allow_reasoning_field_toggle:
            return self
        raise ValueError("At least one tool must be allowed")
20
+
21
+
22
+ # Define a typed Dict for EnhanceSchemaConfig (for now it is kind static, but we will add more flexibility in the future)
23
# Dict counterpart of EnhanceSchemaConfig with the same three keys.
# total=False makes every key optional.
class EnhanceSchemaConfigDict(TypedDict, total=False):
    allow_field_description_update: bool
    allow_system_prompt_update: bool
    allow_reasoning_field_toggle: bool
27
+
28
+
29
class EnhanceSchemaRequest(BaseModel):
    """
    The request body for enhancing a JSON Schema.
    """

    # Documents the enhancement is based on; required.
    documents: list[MIMEData]
    # NOTE(review): presumably one ground-truth dict per document, in the same
    # order -- nothing in this model enforces matching lengths; confirm.
    ground_truths: list[dict[str, Any]] | None = None
    model: str = "gpt-4o-mini"
    temperature: float = 0.0
    reasoning_effort: ChatCompletionReasoningEffort = "medium"
    modality: Modality
    """The modality of the document to load."""

    image_resolution_dpi: int = 96
    browser_canvas: Literal['A3', 'A4', 'A5'] = 'A4'
    # NOTE(review): the attribute docstring below talks about "image operations" but
    # sits under browser_canvas -- it looks left over from an earlier field.
    """The image operations to apply to the document."""

    stream: bool = False
    """Whether to stream the response."""

    # default_factory builds a fresh EnhanceSchemaConfig (with its own defaults) per request.
    tools_config: EnhanceSchemaConfig = Field(default_factory=EnhanceSchemaConfig, description="The configuration for the tools to use")

    # The schema to enhance; required even though it is declared after defaulted fields.
    json_schema: dict[str, Any]
    instructions: str | None = None
    # Accepts either a single mapping or a list of mappings of str -> float.
    flat_likelihoods: list[dict[str, float]] | dict[str, float] | None = None