retab 0.0.42__py3-none-any.whl → 0.0.44__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. retab/__init__.py +2 -1
  2. retab/client.py +26 -51
  3. retab/generate_types.py +180 -0
  4. retab/resources/consensus/client.py +1 -1
  5. retab/resources/consensus/responses.py +1 -1
  6. retab/resources/deployments/__init__.py +3 -0
  7. retab/resources/deployments/automations/__init__.py +9 -0
  8. retab/resources/deployments/automations/client.py +244 -0
  9. retab/resources/deployments/automations/endpoints.py +290 -0
  10. retab/resources/deployments/automations/links.py +303 -0
  11. retab/resources/deployments/automations/logs.py +222 -0
  12. retab/resources/deployments/automations/mailboxes.py +423 -0
  13. retab/resources/deployments/automations/outlook.py +377 -0
  14. retab/resources/deployments/automations/tests.py +161 -0
  15. retab/resources/deployments/client.py +148 -0
  16. retab/resources/documents/client.py +94 -68
  17. retab/resources/documents/extractions.py +55 -46
  18. retab/resources/evaluations/__init__.py +2 -2
  19. retab/resources/evaluations/client.py +61 -77
  20. retab/resources/evaluations/documents.py +48 -37
  21. retab/resources/evaluations/iterations.py +58 -40
  22. retab/resources/jsonlUtils.py +3 -4
  23. retab/resources/processors/automations/endpoints.py +49 -39
  24. retab/resources/processors/automations/links.py +52 -43
  25. retab/resources/processors/automations/mailboxes.py +74 -59
  26. retab/resources/processors/automations/outlook.py +104 -82
  27. retab/resources/processors/client.py +35 -30
  28. retab/resources/projects/__init__.py +3 -0
  29. retab/resources/projects/client.py +285 -0
  30. retab/resources/projects/documents.py +244 -0
  31. retab/resources/projects/iterations.py +470 -0
  32. retab/resources/usage.py +2 -0
  33. retab/types/ai_models.py +2 -1
  34. retab/types/deprecated_evals.py +195 -0
  35. retab/types/evaluations/__init__.py +5 -2
  36. retab/types/evaluations/iterations.py +9 -43
  37. retab/types/evaluations/model.py +19 -24
  38. retab/types/extractions.py +1 -0
  39. retab/types/jobs/base.py +1 -1
  40. retab/types/jobs/evaluation.py +1 -1
  41. retab/types/logs.py +5 -6
  42. retab/types/mime.py +1 -10
  43. retab/types/projects/__init__.py +34 -0
  44. retab/types/projects/documents.py +30 -0
  45. retab/types/projects/iterations.py +78 -0
  46. retab/types/projects/model.py +68 -0
  47. retab/types/schemas/enhance.py +22 -5
  48. retab/types/schemas/evaluate.py +2 -2
  49. retab/types/schemas/object.py +27 -25
  50. retab/types/standards.py +2 -2
  51. retab/utils/__init__.py +3 -0
  52. retab/utils/ai_models.py +127 -12
  53. retab/utils/hashing.py +24 -0
  54. retab/utils/json_schema.py +1 -26
  55. retab/utils/mime.py +0 -17
  56. retab/utils/usage/usage.py +0 -1
  57. {retab-0.0.42.dist-info → retab-0.0.44.dist-info}/METADATA +4 -6
  58. {retab-0.0.42.dist-info → retab-0.0.44.dist-info}/RECORD +60 -55
  59. retab/_utils/__init__.py +0 -0
  60. retab/_utils/_model_cards/anthropic.yaml +0 -59
  61. retab/_utils/_model_cards/auto.yaml +0 -43
  62. retab/_utils/_model_cards/gemini.yaml +0 -117
  63. retab/_utils/_model_cards/openai.yaml +0 -301
  64. retab/_utils/_model_cards/xai.yaml +0 -28
  65. retab/_utils/ai_models.py +0 -138
  66. retab/_utils/benchmarking.py +0 -484
  67. retab/_utils/chat.py +0 -327
  68. retab/_utils/display.py +0 -440
  69. retab/_utils/json_schema.py +0 -2156
  70. retab/_utils/mime.py +0 -165
  71. retab/_utils/responses.py +0 -169
  72. retab/_utils/stream_context_managers.py +0 -52
  73. retab/_utils/usage/__init__.py +0 -0
  74. retab/_utils/usage/usage.py +0 -301
  75. {retab-0.0.42.dist-info → retab-0.0.44.dist-info}/WHEEL +0 -0
  76. {retab-0.0.42.dist-info → retab-0.0.44.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,68 @@
1
+ import datetime
2
+ from typing import Any, Optional
3
+
4
+ import nanoid # type: ignore
5
+ from pydantic import BaseModel, Field, computed_field
6
+
7
+ from ...utils.json_schema import generate_schema_data_id, generate_schema_id
8
+ from ..inference_settings import InferenceSettings
9
+ from .documents import ProjectDocument
10
+ from .iterations import Iteration
11
+
12
+
13
class BaseProject(BaseModel):
    # Shared project fields. Every field has a default, so the model can be
    # built with no arguments.

    # Identifier: the literal prefix "proj_" followed by a generated nanoid.
    id: str = Field(default_factory=lambda: f"proj_{nanoid.generate()}")
    name: str = Field(default="", description="The name of the project")
    json_schema: dict[str, Any] = Field(
        default_factory=dict,
        description="The json schema of the project",
    )
    default_inference_settings: InferenceSettings = Field(
        default=InferenceSettings(),
        description="The default inference properties for the project.",
    )
    # Defaults to the current timezone-aware UTC time when no value is supplied.
    updated_at: datetime.datetime = Field(
        default_factory=lambda: datetime.datetime.now(datetime.timezone.utc),
    )
19
+
20
+
21
+ # Actual Object stored in DB
22
class Project(BaseProject):
    # Both collections default to a fresh empty list per instance.
    documents: list[ProjectDocument] = Field(default_factory=list)
    iterations: list[Iteration] = Field(default_factory=list)

    # NOTE(review): the docstrings below say "SHA1", but the actual algorithm is
    # whatever generate_schema_data_id / generate_schema_id implement (not
    # visible here) -- confirm before relying on the wording. The docstrings are
    # left untouched because pydantic uses a computed field's docstring as its
    # schema description.
    @computed_field  # type: ignore
    @property
    def schema_data_id(self) -> str:
        """Returns the SHA1 hash of the schema data, ignoring all prompt/description/default fields.

        Returns:
            str: A SHA1 hash string representing the schema data version.
        """
        current_schema = self.json_schema
        return generate_schema_data_id(current_schema)

    # This is a computed field, it is exposed when serializing the object
    @computed_field  # type: ignore
    @property
    def schema_id(self) -> str:
        """Returns the SHA1 hash of the complete schema.

        Returns:
            str: A SHA1 hash string representing the complete schema version.
        """
        current_schema = self.json_schema
        return generate_schema_id(current_schema)
46
+
47
+
48
class ListProjectParams(BaseModel):
    # Both parameters are optional and default to None.
    schema_id: Optional[str] = Field(
        default=None,
        description="The ID of the schema",
    )
    schema_data_id: Optional[str] = Field(
        default=None,
        description="The ID of the schema data",
    )
51
+
52
+
53
class CreateProjectRequest(BaseModel):
    # All three fields are required: none of them declares a default.
    name: str
    json_schema: dict[str, Any]
    default_inference_settings: InferenceSettings
57
+
58
+
59
+ # This is basically the same as BaseProject, but everything is optional.
60
+ # Could be achieved by convert_basemodel_to_partial_basemodel(BaseProject) but we prefer explicitness
61
class PatchProjectRequest(BaseModel):
    # Every field is optional and defaults to None.
    # NOTE(review): presumably None means "leave this field unchanged" --
    # confirm against the code that applies the patch.
    name: Optional[str] = Field(
        default=None,
        # Fixed copy-paste wording: this request patches a project, not a document.
        description="The name of the project",
    )
    json_schema: Optional[dict[str, Any]] = Field(
        default=None,
        description="The json schema of the project",
    )
    default_inference_settings: Optional[InferenceSettings] = Field(
        default=None,
        description="The default inference properties for the project (mostly used in the frontend)",
    )
65
+
66
+
67
class AddIterationFromJsonlRequest(BaseModel):
    # Single required string field.
    # NOTE(review): the name suggests a GCS path to a JSONL file -- confirm
    # the expected format with the endpoint that consumes this request.
    jsonl_gcs_path: str
@@ -1,6 +1,8 @@
1
1
  from typing import Any, Self, TypedDict
2
2
 
3
- from openai.types.chat.chat_completion_reasoning_effort import ChatCompletionReasoningEffort
3
+ from openai.types.chat.chat_completion_reasoning_effort import (
4
+ ChatCompletionReasoningEffort,
5
+ )
4
6
  from pydantic import BaseModel, Field, model_validator
5
7
 
6
8
  from ..mime import MIMEData
@@ -9,23 +11,35 @@ from ..browser_canvas import BrowserCanvas
9
11
 
10
12
 
11
13
  class EnhanceSchemaConfig(BaseModel):
14
+ allow_reasoning_fields_added: bool = True # Whether to allow the llm to add reasoning fields
12
15
  allow_field_description_update: bool = False # Whether to allow the llm to update the description of existing fields
13
16
  allow_system_prompt_update: bool = True # Whether to allow the llm to update the system prompt
14
- allow_reasoning_field_toggle: bool = False # Whether to allow the llm to toggle the reasoning for fields
17
+ allow_field_simple_type_change: bool = False # Whether to allow the llm to make simple type changes (optional, string to date, etc.)
18
+ allow_field_data_structure_breakdown: bool = False # Whether to allow the llm to make complex data-structure changes (raw diff)
15
19
 
16
20
  # Model validator
17
21
  @model_validator(mode="after")
18
22
  def check_at_least_one_tool_allowed(self) -> Self:
19
- if not any([self.allow_field_description_update, self.allow_system_prompt_update, self.allow_reasoning_field_toggle]):
23
+ if not any(
24
+ [
25
+ self.allow_reasoning_fields_added,
26
+ self.allow_field_description_update,
27
+ self.allow_system_prompt_update,
28
+ self.allow_field_simple_type_change,
29
+ self.allow_field_data_structure_breakdown,
30
+ ]
31
+ ):
20
32
  raise ValueError("At least one tool must be allowed")
21
33
  return self
22
34
 
23
35
 
24
36
  # Define a typed Dict for EnhanceSchemaConfig (for now it is kind of static, but we will add more flexibility in the future)
25
37
  class EnhanceSchemaConfigDict(TypedDict, total=False):
38
+ allow_reasoning_fields_added: bool
26
39
  allow_field_description_update: bool
27
40
  allow_system_prompt_update: bool
28
- allow_reasoning_field_toggle: bool
41
+ allow_field_simple_type_change: bool
42
+ allow_field_data_structure_breakdown: bool
29
43
 
30
44
 
31
45
  class EnhanceSchemaRequest(BaseModel):
@@ -48,7 +62,10 @@ class EnhanceSchemaRequest(BaseModel):
48
62
  stream: bool = False
49
63
  """Whether to stream the response."""
50
64
 
51
- tools_config: EnhanceSchemaConfig = Field(default_factory=EnhanceSchemaConfig, description="The configuration for the tools to use")
65
+ tools_config: EnhanceSchemaConfig = Field(
66
+ default_factory=EnhanceSchemaConfig,
67
+ description="The configuration for the tools to use",
68
+ )
52
69
 
53
70
  json_schema: dict[str, Any]
54
71
  instructions: str | None = None
@@ -3,7 +3,7 @@ from typing import Any, Self
3
3
  from openai.types.chat.chat_completion_reasoning_effort import ChatCompletionReasoningEffort
4
4
  from pydantic import BaseModel, Field, model_validator
5
5
 
6
- from ..evals import ItemMetric
6
+ from ..metrics import ItemMetric
7
7
  from ..mime import MIMEData
8
8
  from ..modalities import Modality
9
9
  from ..browser_canvas import BrowserCanvas
@@ -45,7 +45,7 @@ class EvaluateSchemaRequest(BaseModel):
45
45
  if len(self.documents) != len(self.ground_truths):
46
46
  raise ValueError("Distance mode requires equal number of documents and ground_truths")
47
47
  if len(self.documents) == 0:
48
- raise ValueError("Evaluation mode requires at least one document")
48
+ raise ValueError("Project mode requires at least one document")
49
49
 
50
50
  return self
51
51
 
@@ -233,6 +233,29 @@ When provided with a **JSON schema** and a **document**, you must:
233
233
 
234
234
  ---
235
235
 
236
+ ## Date and Time Formatting
237
+
238
+ When extracting date, time, or datetime values:
239
+
240
+ - **Always use ISO format** for dates and times (e.g., "2023-12-25", "14:30:00", "2023-12-25T14:30:00")
241
+
242
+ **Examples:**
243
+
244
+ ```json
245
+ // Correct ISO formats:
246
+ {"date": "2023-12-25"}
247
+ {"time": "14:30:00"}
248
+ {"datetime": "2023-12-25T14:30:00Z"}
249
+ {"datetime_with_tz": "2023-12-25T14:30:00+02:00"}
250
+
251
+ // Incorrect formats:
252
+ {"date": "12/25/2023"}
253
+ {"time": "2:30 PM"}
254
+ {"datetime": "Dec 25, 2023 at 2:30 PM"}
255
+ ```
256
+
257
+ ---
258
+
236
259
  ## Handling Missing and Nullable Fields
237
260
 
238
261
  ### Nullable Leaf Attributes
@@ -357,32 +380,11 @@ When performing extraction, explicitly follow these core principles:
357
380
  - **Structure Preservation**: Always maintain explicitly the full schema structure, even when entire nested objects lack data (leaf attributes as null).
358
381
 
359
382
 
360
- ## Source Fields
361
-
362
- Some leaf fields require you to explicitly provide the source of the data (verbatim from the document).
363
- The idea is to simply provide a verbatim quote from the document, without any additional formatting or commentary, keeping it as close as possible to the original text.
364
- Make sure to reasonably include some surrounding text to provide context about the quote.
365
-
366
- You can easily identify the fields that require a source by the `quote___[attributename]` naming pattern.
367
-
368
- **Example:**
369
-
370
- ```json
371
- {
372
- "quote___name": "NAME:\nJohn Doe",
373
- "name": "John Doe"
374
- }
375
- ```
376
-
377
- ---
378
-
379
- # User Defined System Prompt
380
-
381
- """
383
+ ---"""
382
384
 
383
385
  @property
384
- def user_system_prompt(self) -> str:
385
- return self.json_schema.get("X-SystemPrompt", "")
386
+ def user_system_prompt(self) -> str | None:
387
+ return self.json_schema.get("X-SystemPrompt", None)
386
388
 
387
389
  @property
388
390
  def schema_system_prompt(self) -> str:
@@ -397,7 +399,7 @@ You can easily identify the fields that require a source by the `quote___[attrib
397
399
  Returns:
398
400
  str: The combined system prompt string.
399
401
  """
400
- return self.developer_system_prompt + "\n\n" + self.user_system_prompt + "\n\n" + self.schema_system_prompt
402
+ return self.developer_system_prompt + "\n\n" + (self.user_system_prompt + "\n\n" if self.user_system_prompt else "") + self.schema_system_prompt
401
403
 
402
404
  @property
403
405
  def title(self) -> str:
retab/types/standards.py CHANGED
@@ -1,14 +1,14 @@
1
1
  from typing import Any, List, Literal, Optional, Tuple, TypeVar, TypedDict
2
2
 
3
3
  from pydantic import BaseModel, Field
4
- from pydantic.fields import _Unset
4
+ from pydantic_core import PydanticUndefined
5
5
 
6
6
  # API Standards
7
7
 
8
8
  # Define a type variable to represent the content type
9
9
  T = TypeVar("T")
10
10
 
11
- FieldUnset = _Unset
11
+ FieldUnset: Any = PydanticUndefined
12
12
 
13
13
 
14
14
  # Define the ErrorDetail model
retab/utils/__init__.py CHANGED
@@ -0,0 +1,3 @@
1
+ from .json_schema import filter_auxiliary_fields, flatten_dict, unflatten_dict
2
+
3
+ __all__ = ["filter_auxiliary_fields", "flatten_dict", "unflatten_dict"]
retab/utils/ai_models.py CHANGED
@@ -6,6 +6,7 @@ from ..types.ai_models import AIProvider, GeminiModel, OpenAIModel, xAI_Model, R
6
6
 
7
7
  MODEL_CARDS_DIR = os.path.join(os.path.dirname(__file__), "_model_cards")
8
8
 
9
+
9
10
  def merge_model_cards(base: dict, override: dict) -> dict:
10
11
  result = base.copy()
11
12
  for key, value in override.items():
@@ -17,6 +18,7 @@ def merge_model_cards(base: dict, override: dict) -> dict:
17
18
  result[key] = value
18
19
  return result
19
20
 
21
+
20
22
  def load_model_cards(yaml_file: str) -> list[ModelCard]:
21
23
  raw_cards = yaml.safe_load(open(yaml_file))
22
24
  name_to_card = {c["model"]: c for c in raw_cards if "inherits" not in c}
@@ -31,14 +33,18 @@ def load_model_cards(yaml_file: str) -> list[ModelCard]:
31
33
  final_cards.append(ModelCard(**card))
32
34
  return final_cards
33
35
 
36
+
34
37
  # Load all model cards
35
- model_cards = sum([
36
- load_model_cards(os.path.join(MODEL_CARDS_DIR, "openai.yaml")),
37
- load_model_cards(os.path.join(MODEL_CARDS_DIR, "anthropic.yaml")),
38
- load_model_cards(os.path.join(MODEL_CARDS_DIR, "xai.yaml")),
39
- load_model_cards(os.path.join(MODEL_CARDS_DIR, "gemini.yaml")),
40
- load_model_cards(os.path.join(MODEL_CARDS_DIR, "auto.yaml")),
41
- ], [])
38
+ model_cards = sum(
39
+ [
40
+ load_model_cards(os.path.join(MODEL_CARDS_DIR, "openai.yaml")),
41
+ load_model_cards(os.path.join(MODEL_CARDS_DIR, "anthropic.yaml")),
42
+ load_model_cards(os.path.join(MODEL_CARDS_DIR, "xai.yaml")),
43
+ load_model_cards(os.path.join(MODEL_CARDS_DIR, "gemini.yaml")),
44
+ load_model_cards(os.path.join(MODEL_CARDS_DIR, "auto.yaml")),
45
+ ],
46
+ [],
47
+ )
42
48
  model_cards_dict = {card.model: card for card in model_cards}
43
49
 
44
50
 
@@ -108,7 +114,7 @@ def get_provider_for_model(model_id: str) -> AIProvider:
108
114
 
109
115
 
110
116
  def assert_valid_model_extraction(model: str) -> None:
111
- try:
117
+ try:
112
118
  get_provider_for_model(model)
113
119
  except ValueError:
114
120
  raise ValueError(
@@ -132,7 +138,116 @@ def assert_valid_model_schema_generation(model: str) -> None:
132
138
  return
133
139
  else:
134
140
  raise ValueError(
135
- f"Invalid model format: {model}. Must be either:\n"
136
- f"1. A standard model: {get_args(OpenAIModel)}\n"
137
- f"2. A fine-tuned model in format 'base_model:id' where base_model is one of the standard openai models"
138
- ) from None
141
+ f"Invalid model format: {model}. Must be either:\n"
142
+ f"1. A standard model: {get_args(OpenAIModel)}\n"
143
+ f"2. A fine-tuned model in format 'base_model:id' where base_model is one of the standard openai models"
144
+ ) from None
145
+
146
+
147
+ def get_model_credits(model: str) -> float:
148
+ """
149
+ Get the credit cost for a given model based on its capabilities and size.
150
+
151
+ Credit tiers:
152
+ - 0.1 credits: Micro/nano models (fastest, cheapest)
153
+ - 0.5 credits: Small/mini models (balanced performance)
154
+ - 2.0 credits: Large/advanced models (highest capability)
155
+
156
+ Args:
157
+ model: The model name to look up
158
+
159
+ Returns:
160
+ The credit cost for the model
161
+
162
+ Raises:
163
+ ValueError: If no model card is found for the specified model
164
+ """
165
+ try:
166
+ model_card = get_model_card(model)
167
+ model_name = get_model_from_model_id(model)
168
+ except ValueError:
169
+ # Unknown model, return 0 credits (no billing)
170
+ return 0.0
171
+
172
+ # Define credit mapping based on model capabilities and naming patterns
173
+ model_credits = {
174
+ # 0.1 credit models - Micro/Nano tier (fastest, most efficient)
175
+ "auto-micro": 0.1,
176
+ "gemini-flash-lite": 0.1,
177
+ "gpt-4o-mini": 0.1,
178
+ "gpt-3.5-turbo": 0.1,
179
+ "gpt-4.1-nano": 0.1, # Future model
180
+ # 0.5 credit models - Small/Mini tier (balanced performance)
181
+ "auto-small": 0.5,
182
+ "gemini-flash": 0.5,
183
+ "gpt-4o": 0.5,
184
+ "gpt-4-turbo": 0.5,
185
+ "gpt-4.1-mini": 0.5, # Future model
186
+ "claude-3-haiku": 0.5,
187
+ "claude-3.5-haiku": 0.5,
188
+ # 2.0 credit models - Large/Advanced tier (highest capability)
189
+ "auto-large": 2.0,
190
+ "gemini-pro": 2.0,
191
+ "gpt-4": 2.0,
192
+ "gpt-4.1": 2.0, # Future model
193
+ "o1-mini": 2.0,
194
+ "o1-preview": 2.0,
195
+ "o3": 5.0, # Future model
196
+ "claude-3-sonnet": 2.0,
197
+ "claude-3-opus": 2.0,
198
+ "claude-3.5-sonnet": 2.0,
199
+ "grok-beta": 2.0,
200
+ "grok-2": 2.0,
201
+ # Special reasoning models - Higher tier
202
+ "o1": 3.0,
203
+ "o3-max": 3.0, # Future model, highest tier
204
+ }
205
+
206
+ # Return the credits for the specific model
207
+ if model_name in model_credits:
208
+ return model_credits[model_name]
209
+
210
+ # Fallback logic based on model patterns and capabilities
211
+ model_lower = model_name.lower()
212
+
213
+ # Auto-model fallback logic
214
+ if model_lower.startswith("auto-"):
215
+ if "micro" in model_lower or "nano" in model_lower:
216
+ return 0.1
217
+ elif "small" in model_lower or "mini" in model_lower:
218
+ return 0.5
219
+ elif "large" in model_lower or "pro" in model_lower:
220
+ return 2.0
221
+
222
+ # Gemini model fallback logic
223
+ if "gemini" in model_lower:
224
+ if "lite" in model_lower or "nano" in model_lower:
225
+ return 0.1
226
+ elif "flash" in model_lower:
227
+ return 0.5
228
+ elif "pro" in model_lower or "ultra" in model_lower:
229
+ return 2.0
230
+
231
+ # GPT model fallback logic
232
+ if "gpt" in model_lower:
233
+ if "mini" in model_lower or "3.5" in model_lower:
234
+ return 0.1
235
+ elif "4o" in model_lower and "mini" not in model_lower:
236
+ return 0.5
237
+ elif "4" in model_lower or "o1" in model_lower:
238
+ return 2.0
239
+
240
+ # Claude model fallback logic
241
+ if "claude" in model_lower:
242
+ if "haiku" in model_lower:
243
+ return 0.5
244
+ elif "sonnet" in model_lower or "opus" in model_lower:
245
+ return 2.0
246
+
247
+ # Default for unknown models - use model card info if available
248
+ try:
249
+ # Try to determine based on model card properties
250
+ # This could be enhanced based on the actual ModelCard structure
251
+ return 1.0 # Default middle tier
252
+ except:
253
+ return 0.0 # No billing for completely unknown models
retab/utils/hashing.py ADDED
@@ -0,0 +1,24 @@
1
+ import base64
2
+ import hashlib
3
+ import json
4
+
5
+ from fastapi.encoders import jsonable_encoder
6
+
7
+ # ************* Generalistic utils *************
8
+
9
+
10
def generate_blake2b_hash_from_bytes(bytes_: bytes) -> str:
    """Return the hex BLAKE2b digest of ``bytes_`` using an 8-byte digest size (16 hex characters)."""
    hasher = hashlib.blake2b(digest_size=8)
    hasher.update(bytes_)
    return hasher.hexdigest()
12
+
13
+
14
def generate_blake2b_hash_from_base64(base64_string: str) -> str:
    """Decode ``base64_string`` from base64 and hash the resulting bytes with generate_blake2b_hash_from_bytes."""
    decoded_bytes = base64.b64decode(base64_string)
    return generate_blake2b_hash_from_bytes(decoded_bytes)
16
+
17
+
18
def generate_blake2b_hash_from_string(input_string: str) -> str:
    """Encode ``input_string`` as UTF-8 and hash the bytes with generate_blake2b_hash_from_bytes."""
    utf8_bytes = input_string.encode("utf-8")
    return generate_blake2b_hash_from_bytes(utf8_bytes)
20
+
21
+
22
def generate_blake2b_hash_from_dict(input_dict: dict) -> str:
    """Hash a dict via its JSON form.

    The dict is first passed through ``jsonable_encoder``, then dumped with
    sorted keys so the key order of the input does not affect the hash, and
    the resulting string is hashed with generate_blake2b_hash_from_string.
    """
    serialized = json.dumps(jsonable_encoder(input_dict), sort_keys=True)
    return generate_blake2b_hash_from_string(serialized.strip())
@@ -15,7 +15,7 @@ from pydantic import BaseModel, BeforeValidator, Field, create_model
15
15
  from pydantic.config import ConfigDict
16
16
 
17
17
  from ..types.schemas.layout import Column, FieldItem, Layout, RefObject, Row, RowList
18
- from .mime import generate_blake2b_hash_from_string
18
+ from .hashing import generate_blake2b_hash_from_string
19
19
 
20
20
  # **** Validation Functions ****
21
21
 
@@ -2091,31 +2091,6 @@ def sanitize(instance: Any, schema: dict[str, Any]) -> Any:
2091
2091
  return __sanitize_instance(instance, expanded_schema)
2092
2092
 
2093
2093
 
2094
- def compute_schema_data_id(json_schema: dict[str, Any]) -> str:
2095
- """Returns the schema_data_id for a given JSON schema.
2096
-
2097
- The schema_data_id is a hash of the schema data, ignoring all prompt/description/default fields
2098
- and other non-structural metadata.
2099
-
2100
- Args:
2101
- json_schema: The JSON schema to compute the ID for
2102
-
2103
- Returns:
2104
- str: A hash string representing the schema data version with "sch_data_id_" prefix
2105
- """
2106
-
2107
- return "sch_data_id_" + generate_blake2b_hash_from_string(
2108
- json.dumps(
2109
- clean_schema(
2110
- copy.deepcopy(json_schema),
2111
- remove_custom_fields=True,
2112
- fields_to_remove=["description", "default", "title", "required", "examples", "deprecated", "readOnly", "writeOnly"],
2113
- ),
2114
- sort_keys=True,
2115
- ).strip()
2116
- )
2117
-
2118
-
2119
2094
  def validate_json_against_schema(
2120
2095
  data: Any,
2121
2096
  schema: dict[str, Any],
retab/utils/mime.py CHANGED
@@ -16,23 +16,6 @@ from ..types.modalities import SUPPORTED_TYPES
16
16
 
17
17
  T = TypeVar("T")
18
18
 
19
-
20
- def generate_blake2b_hash_from_bytes(bytes_: bytes) -> str:
21
- return hashlib.blake2b(bytes_, digest_size=8).hexdigest()
22
-
23
-
24
- def generate_blake2b_hash_from_base64(base64_string: str) -> str:
25
- return generate_blake2b_hash_from_bytes(base64.b64decode(base64_string))
26
-
27
-
28
- def generate_blake2b_hash_from_string(input_string: str) -> str:
29
- return generate_blake2b_hash_from_bytes(input_string.encode("utf-8"))
30
-
31
-
32
- def generate_blake2b_hash_from_dict(input_dict: dict) -> str:
33
- return generate_blake2b_hash_from_string(json.dumps(input_dict, sort_keys=True).strip())
34
-
35
-
36
19
  def convert_pil_image_to_mime_data(image: PIL.Image.Image) -> MIMEData:
37
20
  """Convert a PIL Image object to a MIMEData object.
38
21
 
@@ -117,7 +117,6 @@ class CompletionsUsage(BaseModel):
117
117
  input_audio_tokens: int = Field(description="The aggregated number of audio input tokens used, including cached tokens.")
118
118
  output_audio_tokens: int = Field(description="The aggregated number of audio output tokens used.")
119
119
  num_model_requests: int = Field(description="The count of requests made to the model.")
120
- project_id: Optional[str] = Field(default=None, description="When group_by=project_id, this field provides the project ID of the grouped usage result.")
121
120
  user_id: Optional[str] = Field(default=None, description="When group_by=user_id, this field provides the user ID of the grouped usage result.")
122
121
  api_key_id: Optional[str] = Field(default=None, description="When group_by=api_key_id, this field provides the API key ID of the grouped usage result.")
123
122
  model: Optional[str] = Field(default=None, description="When group_by=model, this field provides the model name of the grouped usage result.")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: retab
3
- Version: 0.0.42
3
+ Version: 0.0.44
4
4
  Summary: Retab official python library
5
5
  Home-page: https://github.com/Retab-dev/retab
6
6
  Author: Retab
@@ -61,13 +61,13 @@ Made with love by the team at [Retab](https://retab.com) 🤍.
61
61
 
62
62
  ### What is Retab?
63
63
 
64
- Retab solves all the major challenges in document processing with LLMs:
64
+ Retab solves all the major challenges in document processing with Large Language Models:
65
65
 
66
66
  1. **Universal Document Preprocessing**: Convert any file type (PDFs, Excel, emails, etc.) into LLM-ready format without writing custom parsers
67
67
  2. **Structured, Schema-driven Extraction**: Get consistent, reliable outputs using schema-based prompt engineering
68
68
  3. **Processors**: Publish a live, stable, shareable document processor.
69
69
  4. **Automations**: Create document processing workflows that can be triggered by events (mailbox, upload link, endpoint, outlook plugin).
70
- 5. **Evaluations**: Evaluate the performance of models against annotated datasets
70
+ 5. **Projects**: Evaluate the performance of models against annotated datasets
71
71
  6. **Optimizations**: Identify the most used processors and help you finetune models to reduce costs and improve performance
72
72
 
73
73
  We are offering you all the software-defined primitives to build your own document processing solutions. We see it as **Stripe** for document processing.
@@ -90,7 +90,7 @@ Many people haven't yet realized how powerful LLMs have become at document proce
90
90
 
91
91
  ## Code examples
92
92
 
93
- ## You can check our Github repository to see code examples: [python examples](https://github.com/Retab-dev/retab/tree/main/examples) and [jupyter notebooks](https://github.com/Retab-dev/retab-nodejs/tree/main/notebooks).
93
+ You can check our Github repository to see code examples: [python examples](https://github.com/Retab-dev/retab/tree/main/examples) and [jupyter notebooks](https://github.com/Retab-dev/retab-nodejs/tree/main/notebooks).
94
94
 
95
95
  ## Community
96
96
 
@@ -112,8 +112,6 @@ We share our roadmap publicly on [Github](https://github.com/Retab-dev/retab)
112
112
  Among the features we're working on:
113
113
 
114
114
  * [ ] Node.js SDK
115
- * [ ] Low-level speed optimizations for Evals Frontend
116
115
  * [ ] Schema optimization autopilot
117
116
  * [ ] Sources API
118
- * [ ] Parse API for RAG
119
117