retab 0.0.42-py3-none-any.whl → 0.0.44-py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- retab/__init__.py +2 -1
- retab/client.py +26 -51
- retab/generate_types.py +180 -0
- retab/resources/consensus/client.py +1 -1
- retab/resources/consensus/responses.py +1 -1
- retab/resources/deployments/__init__.py +3 -0
- retab/resources/deployments/automations/__init__.py +9 -0
- retab/resources/deployments/automations/client.py +244 -0
- retab/resources/deployments/automations/endpoints.py +290 -0
- retab/resources/deployments/automations/links.py +303 -0
- retab/resources/deployments/automations/logs.py +222 -0
- retab/resources/deployments/automations/mailboxes.py +423 -0
- retab/resources/deployments/automations/outlook.py +377 -0
- retab/resources/deployments/automations/tests.py +161 -0
- retab/resources/deployments/client.py +148 -0
- retab/resources/documents/client.py +94 -68
- retab/resources/documents/extractions.py +55 -46
- retab/resources/evaluations/__init__.py +2 -2
- retab/resources/evaluations/client.py +61 -77
- retab/resources/evaluations/documents.py +48 -37
- retab/resources/evaluations/iterations.py +58 -40
- retab/resources/jsonlUtils.py +3 -4
- retab/resources/processors/automations/endpoints.py +49 -39
- retab/resources/processors/automations/links.py +52 -43
- retab/resources/processors/automations/mailboxes.py +74 -59
- retab/resources/processors/automations/outlook.py +104 -82
- retab/resources/processors/client.py +35 -30
- retab/resources/projects/__init__.py +3 -0
- retab/resources/projects/client.py +285 -0
- retab/resources/projects/documents.py +244 -0
- retab/resources/projects/iterations.py +470 -0
- retab/resources/usage.py +2 -0
- retab/types/ai_models.py +2 -1
- retab/types/deprecated_evals.py +195 -0
- retab/types/evaluations/__init__.py +5 -2
- retab/types/evaluations/iterations.py +9 -43
- retab/types/evaluations/model.py +19 -24
- retab/types/extractions.py +1 -0
- retab/types/jobs/base.py +1 -1
- retab/types/jobs/evaluation.py +1 -1
- retab/types/logs.py +5 -6
- retab/types/mime.py +1 -10
- retab/types/projects/__init__.py +34 -0
- retab/types/projects/documents.py +30 -0
- retab/types/projects/iterations.py +78 -0
- retab/types/projects/model.py +68 -0
- retab/types/schemas/enhance.py +22 -5
- retab/types/schemas/evaluate.py +2 -2
- retab/types/schemas/object.py +27 -25
- retab/types/standards.py +2 -2
- retab/utils/__init__.py +3 -0
- retab/utils/ai_models.py +127 -12
- retab/utils/hashing.py +24 -0
- retab/utils/json_schema.py +1 -26
- retab/utils/mime.py +0 -17
- retab/utils/usage/usage.py +0 -1
- {retab-0.0.42.dist-info → retab-0.0.44.dist-info}/METADATA +4 -6
- {retab-0.0.42.dist-info → retab-0.0.44.dist-info}/RECORD +60 -55
- retab/_utils/__init__.py +0 -0
- retab/_utils/_model_cards/anthropic.yaml +0 -59
- retab/_utils/_model_cards/auto.yaml +0 -43
- retab/_utils/_model_cards/gemini.yaml +0 -117
- retab/_utils/_model_cards/openai.yaml +0 -301
- retab/_utils/_model_cards/xai.yaml +0 -28
- retab/_utils/ai_models.py +0 -138
- retab/_utils/benchmarking.py +0 -484
- retab/_utils/chat.py +0 -327
- retab/_utils/display.py +0 -440
- retab/_utils/json_schema.py +0 -2156
- retab/_utils/mime.py +0 -165
- retab/_utils/responses.py +0 -169
- retab/_utils/stream_context_managers.py +0 -52
- retab/_utils/usage/__init__.py +0 -0
- retab/_utils/usage/usage.py +0 -301
- {retab-0.0.42.dist-info → retab-0.0.44.dist-info}/WHEEL +0 -0
- {retab-0.0.42.dist-info → retab-0.0.44.dist-info}/top_level.txt +0 -0
retab/types/projects/model.py
ADDED
@@ -0,0 +1,68 @@
+import datetime
+from typing import Any, Optional
+
+import nanoid  # type: ignore
+from pydantic import BaseModel, Field, computed_field
+
+from ...utils.json_schema import generate_schema_data_id, generate_schema_id
+from ..inference_settings import InferenceSettings
+from .documents import ProjectDocument
+from .iterations import Iteration
+
+
+class BaseProject(BaseModel):
+    id: str = Field(default_factory=lambda: "proj_" + nanoid.generate())
+    name: str = Field(default="", description="The name of the project")
+    json_schema: dict[str, Any] = Field(default_factory=dict, description="The json schema of the project")
+    default_inference_settings: InferenceSettings = Field(default=InferenceSettings(), description="The default inference properties for the project.")
+    updated_at: datetime.datetime = Field(default_factory=lambda: datetime.datetime.now(tz=datetime.timezone.utc))
+
+
+# Actual Object stored in DB
+class Project(BaseProject):
+    documents: list[ProjectDocument] = Field(default_factory=list)
+    iterations: list[Iteration] = Field(default_factory=list)
+
+    @computed_field  # type: ignore
+    @property
+    def schema_data_id(self) -> str:
+        """Returns the SHA1 hash of the schema data, ignoring all prompt/description/default fields.
+
+        Returns:
+            str: A SHA1 hash string representing the schema data version.
+        """
+        return generate_schema_data_id(self.json_schema)
+
+    # This is a computed field, it is exposed when serializing the object
+    @computed_field  # type: ignore
+    @property
+    def schema_id(self) -> str:
+        """Returns the SHA1 hash of the complete schema.
+
+        Returns:
+            str: A SHA1 hash string representing the complete schema version.
+        """
+        return generate_schema_id(self.json_schema)
+
+
+class ListProjectParams(BaseModel):
+    schema_id: Optional[str] = Field(default=None, description="The ID of the schema")
+    schema_data_id: Optional[str] = Field(default=None, description="The ID of the schema data")
+
+
+class CreateProjectRequest(BaseModel):
+    name: str
+    json_schema: dict[str, Any]
+    default_inference_settings: InferenceSettings
+
+
+# This is basically the same as BaseProject, but everything is optional.
+# Could be achieved by convert_basemodel_to_partial_basemodel(BaseProject) but we prefer explicitness
+class PatchProjectRequest(BaseModel):
+    name: Optional[str] = Field(default=None, description="The name of the document")
+    json_schema: Optional[dict[str, Any]] = Field(default=None, description="The json schema of the project")
+    default_inference_settings: Optional[InferenceSettings] = Field(default=None, description="The default inference properties for the project (mostly used in the frontend)")
+
+
+class AddIterationFromJsonlRequest(BaseModel):
+    jsonl_gcs_path: str
retab/types/schemas/enhance.py
CHANGED
@@ -1,6 +1,8 @@
 from typing import Any, Self, TypedDict

-from openai.types.chat.chat_completion_reasoning_effort import ChatCompletionReasoningEffort
+from openai.types.chat.chat_completion_reasoning_effort import (
+    ChatCompletionReasoningEffort,
+)
 from pydantic import BaseModel, Field, model_validator

 from ..mime import MIMEData
@@ -9,23 +11,35 @@ from ..browser_canvas import BrowserCanvas


 class EnhanceSchemaConfig(BaseModel):
+    allow_reasoning_fields_added: bool = True  # Whether to allow the llm to add reasoning fields
     allow_field_description_update: bool = False  # Whether to allow the llm to update the description of existing fields
     allow_system_prompt_update: bool = True  # Whether to allow the llm to update the system prompt
-
+    allow_field_simple_type_change: bool = False  # Whether to allow the llm to make simple type changes (optional, string to date, etc.)
+    allow_field_data_structure_breakdown: bool = False  # Whether to allow the llm to make complex data-structure changes (raw diff)

     # Model validator
     @model_validator(mode="after")
     def check_at_least_one_tool_allowed(self) -> Self:
-        if not any(
+        if not any(
+            [
+                self.allow_reasoning_fields_added,
+                self.allow_field_description_update,
+                self.allow_system_prompt_update,
+                self.allow_field_simple_type_change,
+                self.allow_field_data_structure_breakdown,
+            ]
+        ):
             raise ValueError("At least one tool must be allowed")
         return self


 # Define a typed Dict for EnhanceSchemaConfig (for now it is kind static, but we will add more flexibility in the future)
 class EnhanceSchemaConfigDict(TypedDict, total=False):
+    allow_reasoning_fields_added: bool
     allow_field_description_update: bool
     allow_system_prompt_update: bool
-
+    allow_field_simple_type_change: bool
+    allow_field_data_structure_breakdown: bool


 class EnhanceSchemaRequest(BaseModel):
@@ -48,7 +62,10 @@ class EnhanceSchemaRequest(BaseModel):
     stream: bool = False
     """Whether to stream the response."""

-    tools_config: EnhanceSchemaConfig = Field(
+    tools_config: EnhanceSchemaConfig = Field(
+        default_factory=EnhanceSchemaConfig,
+        description="The configuration for the tools to use",
+    )

     json_schema: dict[str, Any]
     instructions: str | None = None
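As a quick illustration of the enlarged `EnhanceSchemaConfig`, the sketch below constructs the config and shows the validator rejecting an all-disabled configuration. It assumes retab 0.0.44 and the import path `retab.types.schemas.enhance`.

```python
# Illustrative sketch; the import path is inferred from the file shown above.
from retab.types.schemas.enhance import EnhanceSchemaConfig

# Defaults keep reasoning-field addition and system-prompt updates enabled.
config = EnhanceSchemaConfig(allow_field_simple_type_change=True)
print(config.allow_reasoning_fields_added)  # True

# Disabling every tool trips the model validator.
try:
    EnhanceSchemaConfig(
        allow_reasoning_fields_added=False,
        allow_field_description_update=False,
        allow_system_prompt_update=False,
        allow_field_simple_type_change=False,
        allow_field_data_structure_breakdown=False,
    )
except ValueError as exc:  # pydantic's ValidationError subclasses ValueError
    print(exc)  # message includes "At least one tool must be allowed"
```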
retab/types/schemas/evaluate.py
CHANGED
@@ -3,7 +3,7 @@ from typing import Any, Self
 from openai.types.chat.chat_completion_reasoning_effort import ChatCompletionReasoningEffort
 from pydantic import BaseModel, Field, model_validator

-from ..
+from ..metrics import ItemMetric
 from ..mime import MIMEData
 from ..modalities import Modality
 from ..browser_canvas import BrowserCanvas
@@ -45,7 +45,7 @@ class EvaluateSchemaRequest(BaseModel):
         if len(self.documents) != len(self.ground_truths):
             raise ValueError("Distance mode requires equal number of documents and ground_truths")
         if len(self.documents) == 0:
-            raise ValueError("
+            raise ValueError("Project mode requires at least one document")

         return self

retab/types/schemas/object.py
CHANGED
@@ -233,6 +233,29 @@ When provided with a **JSON schema** and a **document**, you must:

 ---

+## Date and Time Formatting
+
+When extracting date, time, or datetime values:
+
+- **Always use ISO format** for dates and times (e.g., "2023-12-25", "14:30:00", "2023-12-25T14:30:00")
+
+**Examples:**
+
+```json
+// Correct ISO formats:
+{"date": "2023-12-25"}
+{"time": "14:30:00"}
+{"datetime": "2023-12-25T14:30:00Z"}
+{"datetime_with_tz": "2023-12-25T14:30:00+02:00"}
+
+// Incorrect formats:
+{"date": "12/25/2023"}
+{"time": "2:30 PM"}
+{"datetime": "Dec 25, 2023 at 2:30 PM"}
+```
+
+---
+
 ## Handling Missing and Nullable Fields

 ### Nullable Leaf Attributes
@@ -357,32 +380,11 @@ When performing extraction, explicitly follow these core principles:
 - **Structure Preservation**: Always maintain explicitly the full schema structure, even when entire nested objects lack data (leaf attributes as null).


-
-
-Some leaf fields require you to explicitly provide the source of the data (verbatim from the document).
-The idea is to simply provide a verbatim quote from the document, without any additional formatting or commentary, keeping it as close as possible to the original text.
-Make sure to reasonably include some surrounding text to provide context about the quote.
-
-You can easily identify the fields that require a source by the `quote___[attributename]` naming pattern.
-
-**Example:**
-
-```json
-{
-"quote___name": "NAME:\nJohn Doe",
-"name": "John Doe"
-}
-```
-
----
-
-# User Defined System Prompt
-
-"""
+---"""

     @property
-    def user_system_prompt(self) -> str:
-        return self.json_schema.get("X-SystemPrompt",
+    def user_system_prompt(self) -> str | None:
+        return self.json_schema.get("X-SystemPrompt", None)

     @property
     def schema_system_prompt(self) -> str:
@@ -397,7 +399,7 @@ You can easily identify the fields that require a source by the `quote___[attrib
         Returns:
             str: The combined system prompt string.
         """
-        return self.developer_system_prompt + "\n\n" + self.user_system_prompt + "\n\n" + self.schema_system_prompt
+        return self.developer_system_prompt + "\n\n" + (self.user_system_prompt + "\n\n" if self.user_system_prompt else "") + self.schema_system_prompt

     @property
     def title(self) -> str:
retab/types/standards.py
CHANGED
@@ -1,14 +1,14 @@
 from typing import Any, List, Literal, Optional, Tuple, TypeVar, TypedDict

 from pydantic import BaseModel, Field
-from
+from pydantic_core import PydanticUndefined

 # API Standards

 # Define a type variable to represent the content type
 T = TypeVar("T")

-FieldUnset =
+FieldUnset: Any = PydanticUndefined


 # Define the ErrorDetail model
retab/utils/__init__.py
CHANGED
retab/utils/ai_models.py
CHANGED
@@ -6,6 +6,7 @@ from ..types.ai_models import AIProvider, GeminiModel, OpenAIModel, xAI_Model, R

 MODEL_CARDS_DIR = os.path.join(os.path.dirname(__file__), "_model_cards")

+
 def merge_model_cards(base: dict, override: dict) -> dict:
     result = base.copy()
     for key, value in override.items():
@@ -17,6 +18,7 @@ def merge_model_cards(base: dict, override: dict) -> dict:
             result[key] = value
     return result

+
 def load_model_cards(yaml_file: str) -> list[ModelCard]:
     raw_cards = yaml.safe_load(open(yaml_file))
     name_to_card = {c["model"]: c for c in raw_cards if "inherits" not in c}
@@ -31,14 +33,18 @@ def load_model_cards(yaml_file: str) -> list[ModelCard]:
         final_cards.append(ModelCard(**card))
     return final_cards

+
 # Load all model cards
-model_cards = sum(
-
-
-
-
-
-
+model_cards = sum(
+    [
+        load_model_cards(os.path.join(MODEL_CARDS_DIR, "openai.yaml")),
+        load_model_cards(os.path.join(MODEL_CARDS_DIR, "anthropic.yaml")),
+        load_model_cards(os.path.join(MODEL_CARDS_DIR, "xai.yaml")),
+        load_model_cards(os.path.join(MODEL_CARDS_DIR, "gemini.yaml")),
+        load_model_cards(os.path.join(MODEL_CARDS_DIR, "auto.yaml")),
+    ],
+    [],
+)
 model_cards_dict = {card.model: card for card in model_cards}


@@ -108,7 +114,7 @@ def get_provider_for_model(model_id: str) -> AIProvider:


 def assert_valid_model_extraction(model: str) -> None:
-    try:
+    try:
         get_provider_for_model(model)
     except ValueError:
         raise ValueError(
@@ -132,7 +138,116 @@ def assert_valid_model_schema_generation(model: str) -> None:
         return
     else:
         raise ValueError(
-
-
-
-
+            f"Invalid model format: {model}. Must be either:\n"
+            f"1. A standard model: {get_args(OpenAIModel)}\n"
+            f"2. A fine-tuned model in format 'base_model:id' where base_model is one of the standard openai models"
+        ) from None
+
+
+def get_model_credits(model: str) -> float:
+    """
+    Get the credit cost for a given model based on its capabilities and size.
+
+    Credit tiers:
+    - 0.1 credits: Micro/nano models (fastest, cheapest)
+    - 0.5 credits: Small/mini models (balanced performance)
+    - 2.0 credits: Large/advanced models (highest capability)
+
+    Args:
+        model: The model name to look up
+
+    Returns:
+        The credit cost for the model
+
+    Raises:
+        ValueError: If no model card is found for the specified model
+    """
+    try:
+        model_card = get_model_card(model)
+        model_name = get_model_from_model_id(model)
+    except ValueError:
+        # Unknown model, return 0 credits (no billing)
+        return 0.0
+
+    # Define credit mapping based on model capabilities and naming patterns
+    model_credits = {
+        # 0.1 credit models - Micro/Nano tier (fastest, most efficient)
+        "auto-micro": 0.1,
+        "gemini-flash-lite": 0.1,
+        "gpt-4o-mini": 0.1,
+        "gpt-3.5-turbo": 0.1,
+        "gpt-4.1-nano": 0.1,  # Future model
+        # 0.5 credit models - Small/Mini tier (balanced performance)
+        "auto-small": 0.5,
+        "gemini-flash": 0.5,
+        "gpt-4o": 0.5,
+        "gpt-4-turbo": 0.5,
+        "gpt-4.1-mini": 0.5,  # Future model
+        "claude-3-haiku": 0.5,
+        "claude-3.5-haiku": 0.5,
+        # 2.0 credit models - Large/Advanced tier (highest capability)
+        "auto-large": 2.0,
+        "gemini-pro": 2.0,
+        "gpt-4": 2.0,
+        "gpt-4.1": 2.0,  # Future model
+        "o1-mini": 2.0,
+        "o1-preview": 2.0,
+        "o3": 5.0,  # Future model
+        "claude-3-sonnet": 2.0,
+        "claude-3-opus": 2.0,
+        "claude-3.5-sonnet": 2.0,
+        "grok-beta": 2.0,
+        "grok-2": 2.0,
+        # Special reasoning models - Higher tier
+        "o1": 3.0,
+        "o3-max": 3.0,  # Future model, highest tier
+    }
+
+    # Return the credits for the specific model
+    if model_name in model_credits:
+        return model_credits[model_name]
+
+    # Fallback logic based on model patterns and capabilities
+    model_lower = model_name.lower()
+
+    # Auto-model fallback logic
+    if model_lower.startswith("auto-"):
+        if "micro" in model_lower or "nano" in model_lower:
+            return 0.1
+        elif "small" in model_lower or "mini" in model_lower:
+            return 0.5
+        elif "large" in model_lower or "pro" in model_lower:
+            return 2.0
+
+    # Gemini model fallback logic
+    if "gemini" in model_lower:
+        if "lite" in model_lower or "nano" in model_lower:
+            return 0.1
+        elif "flash" in model_lower:
+            return 0.5
+        elif "pro" in model_lower or "ultra" in model_lower:
+            return 2.0
+
+    # GPT model fallback logic
+    if "gpt" in model_lower:
+        if "mini" in model_lower or "3.5" in model_lower:
+            return 0.1
+        elif "4o" in model_lower and "mini" not in model_lower:
+            return 0.5
+        elif "4" in model_lower or "o1" in model_lower:
+            return 2.0
+
+    # Claude model fallback logic
+    if "claude" in model_lower:
+        if "haiku" in model_lower:
+            return 0.5
+        elif "sonnet" in model_lower or "opus" in model_lower:
+            return 2.0
+
+    # Default for unknown models - use model card info if available
+    try:
+        # Try to determine based on model card properties
+        # This could be enhanced based on the actual ModelCard structure
+        return 1.0  # Default middle tier
+    except:
+        return 0.0  # No billing for completely unknown models
retab/utils/hashing.py
ADDED
@@ -0,0 +1,24 @@
+import base64
+import hashlib
+import json
+
+from fastapi.encoders import jsonable_encoder
+
+# ************* Generalistic utils *************
+
+
+def generate_blake2b_hash_from_bytes(bytes_: bytes) -> str:
+    return hashlib.blake2b(bytes_, digest_size=8).hexdigest()
+
+
+def generate_blake2b_hash_from_base64(base64_string: str) -> str:
+    return generate_blake2b_hash_from_bytes(base64.b64decode(base64_string))
+
+
+def generate_blake2b_hash_from_string(input_string: str) -> str:
+    return generate_blake2b_hash_from_bytes(input_string.encode("utf-8"))
+
+
+def generate_blake2b_hash_from_dict(input_dict: dict) -> str:
+    jsonable_dict = jsonable_encoder(input_dict)
+    return generate_blake2b_hash_from_string(json.dumps(jsonable_dict, sort_keys=True).strip())
retab/utils/json_schema.py
CHANGED
@@ -15,7 +15,7 @@ from pydantic import BaseModel, BeforeValidator, Field, create_model
 from pydantic.config import ConfigDict

 from ..types.schemas.layout import Column, FieldItem, Layout, RefObject, Row, RowList
-from .
+from .hashing import generate_blake2b_hash_from_string

 # **** Validation Functions ****

@@ -2091,31 +2091,6 @@ def sanitize(instance: Any, schema: dict[str, Any]) -> Any:
     return __sanitize_instance(instance, expanded_schema)


-def compute_schema_data_id(json_schema: dict[str, Any]) -> str:
-    """Returns the schema_data_id for a given JSON schema.
-
-    The schema_data_id is a hash of the schema data, ignoring all prompt/description/default fields
-    and other non-structural metadata.
-
-    Args:
-        json_schema: The JSON schema to compute the ID for
-
-    Returns:
-        str: A hash string representing the schema data version with "sch_data_id_" prefix
-    """
-
-    return "sch_data_id_" + generate_blake2b_hash_from_string(
-        json.dumps(
-            clean_schema(
-                copy.deepcopy(json_schema),
-                remove_custom_fields=True,
-                fields_to_remove=["description", "default", "title", "required", "examples", "deprecated", "readOnly", "writeOnly"],
-            ),
-            sort_keys=True,
-        ).strip()
-    )
-
-
 def validate_json_against_schema(
     data: Any,
     schema: dict[str, Any],
retab/utils/mime.py
CHANGED
@@ -16,23 +16,6 @@ from ..types.modalities import SUPPORTED_TYPES

 T = TypeVar("T")

-
-def generate_blake2b_hash_from_bytes(bytes_: bytes) -> str:
-    return hashlib.blake2b(bytes_, digest_size=8).hexdigest()
-
-
-def generate_blake2b_hash_from_base64(base64_string: str) -> str:
-    return generate_blake2b_hash_from_bytes(base64.b64decode(base64_string))
-
-
-def generate_blake2b_hash_from_string(input_string: str) -> str:
-    return generate_blake2b_hash_from_bytes(input_string.encode("utf-8"))
-
-
-def generate_blake2b_hash_from_dict(input_dict: dict) -> str:
-    return generate_blake2b_hash_from_string(json.dumps(input_dict, sort_keys=True).strip())
-
-
 def convert_pil_image_to_mime_data(image: PIL.Image.Image) -> MIMEData:
     """Convert a PIL Image object to a MIMEData object.

retab/utils/usage/usage.py
CHANGED
@@ -117,7 +117,6 @@ class CompletionsUsage(BaseModel):
     input_audio_tokens: int = Field(description="The aggregated number of audio input tokens used, including cached tokens.")
     output_audio_tokens: int = Field(description="The aggregated number of audio output tokens used.")
     num_model_requests: int = Field(description="The count of requests made to the model.")
-    project_id: Optional[str] = Field(default=None, description="When group_by=project_id, this field provides the project ID of the grouped usage result.")
     user_id: Optional[str] = Field(default=None, description="When group_by=user_id, this field provides the user ID of the grouped usage result.")
     api_key_id: Optional[str] = Field(default=None, description="When group_by=api_key_id, this field provides the API key ID of the grouped usage result.")
     model: Optional[str] = Field(default=None, description="When group_by=model, this field provides the model name of the grouped usage result.")
{retab-0.0.42.dist-info → retab-0.0.44.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: retab
-Version: 0.0.42
+Version: 0.0.44
 Summary: Retab official python library
 Home-page: https://github.com/Retab-dev/retab
 Author: Retab
@@ -61,13 +61,13 @@ Made with love by the team at [Retab](https://retab.com) 🤍.

 ### What is Retab?

-Retab solves all the major challenges in document processing with
+Retab solves all the major challenges in document processing with Large Language Models:

 1. **Universal Document Preprocessing**: Convert any file type (PDFs, Excel, emails, etc.) into LLM-ready format without writing custom parsers
 2. **Structured, Schema-driven Extraction**: Get consistent, reliable outputs using schema-based prompt engineering
 3. **Processors**: Publish a live, stable, shareable document processor.
 4. **Automations**: Create document processing workflows that can be triggered by events (mailbox, upload link, endpoint, outlook plugin).
-5. **
+5. **Projects**: Evaluate the performance of models against annotated datasets
 6. **Optimizations**: Identify the most used processors and help you finetune models to reduce costs and improve performance

 We are offering you all the software-defined primitives to build your own document processing solutions. We see it as **Stripe** for document processing.
@@ -90,7 +90,7 @@ Many people haven't yet realized how powerful LLMs have become at document proce

 ## Code examples

-
+You can check our Github repository to see code examples: [python examples](https://github.com/Retab-dev/retab/tree/main/examples) and [jupyter notebooks](https://github.com/Retab-dev/retab-nodejs/tree/main/notebooks).

 ## Community

@@ -112,8 +112,6 @@ We share our roadmap publicly on [Github](https://github.com/Retab-dev/retab)
 Among the features we're working on:

 * [ ] Node.js SDK
-* [ ] Low-level speed optimizations for Evals Frontend
 * [ ] Schema optimization autopilot
 * [ ] Sources API
-* [ ] Parse API for RAG
