retab 0.0.35-py3-none-any.whl → 0.0.37-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. {uiform → retab}/_utils/ai_models.py +2 -2
  2. {uiform → retab}/_utils/benchmarking.py +15 -16
  3. {uiform → retab}/_utils/chat.py +9 -14
  4. {uiform → retab}/_utils/display.py +0 -3
  5. {uiform → retab}/_utils/json_schema.py +9 -14
  6. {uiform → retab}/_utils/mime.py +11 -14
  7. {uiform → retab}/_utils/responses.py +9 -3
  8. {uiform → retab}/_utils/stream_context_managers.py +1 -1
  9. {uiform → retab}/_utils/usage/usage.py +28 -28
  10. {uiform → retab}/client.py +32 -31
  11. {uiform → retab}/resources/consensus/client.py +17 -36
  12. {uiform → retab}/resources/consensus/completions.py +24 -47
  13. {uiform → retab}/resources/consensus/completions_stream.py +26 -38
  14. {uiform → retab}/resources/consensus/responses.py +31 -80
  15. {uiform → retab}/resources/consensus/responses_stream.py +31 -79
  16. {uiform → retab}/resources/documents/client.py +59 -45
  17. {uiform → retab}/resources/documents/extractions.py +181 -90
  18. {uiform → retab}/resources/evals.py +56 -43
  19. retab/resources/evaluations/__init__.py +3 -0
  20. retab/resources/evaluations/client.py +301 -0
  21. retab/resources/evaluations/documents.py +233 -0
  22. retab/resources/evaluations/iterations.py +452 -0
  23. {uiform → retab}/resources/files.py +2 -2
  24. {uiform → retab}/resources/jsonlUtils.py +220 -216
  25. retab/resources/models.py +73 -0
  26. retab/resources/processors/automations/client.py +244 -0
  27. {uiform → retab}/resources/processors/automations/endpoints.py +77 -118
  28. retab/resources/processors/automations/links.py +294 -0
  29. {uiform → retab}/resources/processors/automations/logs.py +30 -19
  30. {uiform → retab}/resources/processors/automations/mailboxes.py +136 -174
  31. retab/resources/processors/automations/outlook.py +337 -0
  32. {uiform → retab}/resources/processors/automations/tests.py +22 -25
  33. {uiform → retab}/resources/processors/client.py +179 -164
  34. {uiform → retab}/resources/schemas.py +78 -66
  35. {uiform → retab}/resources/secrets/external_api_keys.py +1 -5
  36. retab/resources/secrets/webhook.py +64 -0
  37. {uiform → retab}/resources/usage.py +39 -2
  38. {uiform → retab}/types/ai_models.py +13 -13
  39. {uiform → retab}/types/automations/cron.py +19 -12
  40. {uiform → retab}/types/automations/endpoints.py +7 -4
  41. {uiform → retab}/types/automations/links.py +7 -3
  42. {uiform → retab}/types/automations/mailboxes.py +9 -9
  43. {uiform → retab}/types/automations/outlook.py +15 -11
  44. retab/types/browser_canvas.py +3 -0
  45. {uiform → retab}/types/chat.py +2 -2
  46. {uiform → retab}/types/completions.py +9 -12
  47. retab/types/consensus.py +19 -0
  48. {uiform → retab}/types/db/annotations.py +3 -3
  49. {uiform → retab}/types/db/files.py +8 -6
  50. {uiform → retab}/types/documents/create_messages.py +18 -20
  51. {uiform → retab}/types/documents/extractions.py +69 -24
  52. {uiform → retab}/types/evals.py +5 -5
  53. retab/types/evaluations/__init__.py +31 -0
  54. retab/types/evaluations/documents.py +30 -0
  55. retab/types/evaluations/iterations.py +112 -0
  56. retab/types/evaluations/model.py +73 -0
  57. retab/types/events.py +79 -0
  58. {uiform → retab}/types/extractions.py +33 -10
  59. retab/types/inference_settings.py +15 -0
  60. retab/types/jobs/base.py +54 -0
  61. retab/types/jobs/batch_annotation.py +12 -0
  62. {uiform → retab}/types/jobs/evaluation.py +1 -2
  63. {uiform → retab}/types/logs.py +37 -34
  64. retab/types/metrics.py +32 -0
  65. {uiform → retab}/types/mime.py +22 -20
  66. {uiform → retab}/types/modalities.py +10 -10
  67. retab/types/predictions.py +19 -0
  68. {uiform → retab}/types/schemas/enhance.py +4 -2
  69. {uiform → retab}/types/schemas/evaluate.py +7 -4
  70. {uiform → retab}/types/schemas/generate.py +6 -3
  71. {uiform → retab}/types/schemas/layout.py +1 -1
  72. {uiform → retab}/types/schemas/object.py +13 -14
  73. {uiform → retab}/types/schemas/templates.py +1 -3
  74. {uiform → retab}/types/secrets/external_api_keys.py +0 -1
  75. {uiform → retab}/types/standards.py +18 -1
  76. {retab-0.0.35.dist-info → retab-0.0.37.dist-info}/METADATA +7 -6
  77. retab-0.0.37.dist-info/RECORD +107 -0
  78. retab-0.0.37.dist-info/top_level.txt +1 -0
  79. retab-0.0.35.dist-info/RECORD +0 -111
  80. retab-0.0.35.dist-info/top_level.txt +0 -1
  81. uiform/_utils/benchmarking copy.py +0 -588
  82. uiform/resources/deployments/__init__.py +0 -9
  83. uiform/resources/deployments/client.py +0 -78
  84. uiform/resources/deployments/endpoints.py +0 -322
  85. uiform/resources/deployments/links.py +0 -452
  86. uiform/resources/deployments/logs.py +0 -211
  87. uiform/resources/deployments/mailboxes.py +0 -496
  88. uiform/resources/deployments/outlook.py +0 -531
  89. uiform/resources/deployments/tests.py +0 -158
  90. uiform/resources/models.py +0 -45
  91. uiform/resources/processors/automations/client.py +0 -78
  92. uiform/resources/processors/automations/links.py +0 -356
  93. uiform/resources/processors/automations/outlook.py +0 -444
  94. uiform/resources/secrets/webhook.py +0 -62
  95. uiform/types/consensus.py +0 -10
  96. uiform/types/deployments/cron.py +0 -59
  97. uiform/types/deployments/endpoints.py +0 -28
  98. uiform/types/deployments/links.py +0 -36
  99. uiform/types/deployments/mailboxes.py +0 -67
  100. uiform/types/deployments/outlook.py +0 -76
  101. uiform/types/deployments/webhooks.py +0 -21
  102. uiform/types/events.py +0 -76
  103. uiform/types/jobs/base.py +0 -150
  104. uiform/types/jobs/batch_annotation.py +0 -22
  105. uiform/types/secrets/__init__.py +0 -0
  106. {uiform → retab}/__init__.py +0 -0
  107. {uiform → retab}/_resource.py +0 -0
  108. {uiform → retab}/_utils/__init__.py +0 -0
  109. {uiform → retab}/_utils/usage/__init__.py +0 -0
  110. {uiform → retab}/py.typed +0 -0
  111. {uiform → retab}/resources/__init__.py +0 -0
  112. {uiform → retab}/resources/consensus/__init__.py +0 -0
  113. {uiform → retab}/resources/documents/__init__.py +0 -0
  114. {uiform → retab}/resources/finetuning.py +0 -0
  115. {uiform → retab}/resources/openai_example.py +0 -0
  116. {uiform → retab}/resources/processors/__init__.py +0 -0
  117. {uiform → retab}/resources/processors/automations/__init__.py +0 -0
  118. {uiform → retab}/resources/prompt_optimization.py +0 -0
  119. {uiform → retab}/resources/secrets/__init__.py +0 -0
  120. {uiform → retab}/resources/secrets/client.py +0 -0
  121. {uiform → retab}/types/__init__.py +0 -0
  122. {uiform → retab}/types/automations/__init__.py +0 -0
  123. {uiform → retab}/types/automations/webhooks.py +0 -0
  124. {uiform → retab}/types/db/__init__.py +0 -0
  125. {uiform/types/deployments → retab/types/documents}/__init__.py +0 -0
  126. {uiform → retab}/types/documents/correct_orientation.py +0 -0
  127. {uiform/types/documents → retab/types/jobs}/__init__.py +0 -0
  128. {uiform → retab}/types/jobs/finetune.py +0 -0
  129. {uiform → retab}/types/jobs/prompt_optimization.py +0 -0
  130. {uiform → retab}/types/jobs/webcrawl.py +0 -0
  131. {uiform → retab}/types/pagination.py +0 -0
  132. {uiform/types/jobs → retab/types/schemas}/__init__.py +0 -0
  133. {uiform/types/schemas → retab/types/secrets}/__init__.py +0 -0
  134. {retab-0.0.35.dist-info → retab-0.0.37.dist-info}/WHEEL +0 -0
retab/types/evaluations/iterations.py ADDED
@@ -0,0 +1,112 @@
+import copy
+import datetime
+import json
+from typing import Any, Optional, Self
+
+import nanoid  # type: ignore
+from pydantic import BaseModel, Field, computed_field, model_validator
+
+from ..._utils.json_schema import clean_schema
+from ..._utils.mime import generate_blake2b_hash_from_string
+from ..inference_settings import InferenceSettings
+from ..metrics import MetricResult
+from ..predictions import PredictionData
+
+
+class Iteration(BaseModel):
+    id: str = Field(default_factory=lambda: "eval_iter_" + nanoid.generate())
+    updated_at: datetime.datetime = Field(
+        default_factory=lambda: datetime.datetime.now(tz=datetime.timezone.utc),
+        description="The last update date of inference settings or json schema",
+    )
+    inference_settings: InferenceSettings
+    json_schema: dict[str, Any]
+    predictions: dict[str, PredictionData] = Field(default_factory=dict, description="The predictions of the iteration for all the documents")
+    metric_results: Optional[MetricResult] = Field(default=None, description="The metric results of the iteration")
+
+    @computed_field  # type: ignore
+    @property
+    def schema_data_id(self) -> str:
+        """Returns the SHA1 hash of the schema data, ignoring all prompt/description/default fields.
+
+        Returns:
+            str: A SHA1 hash string representing the schema data version.
+        """
+        return "sch_data_id_" + generate_blake2b_hash_from_string(
+            json.dumps(
+                clean_schema(
+                    copy.deepcopy(self.json_schema),
+                    remove_custom_fields=True,
+                    fields_to_remove=["description", "default", "title", "required", "examples", "deprecated", "readOnly", "writeOnly"],
+                ),
+                sort_keys=True,
+            ).strip()
+        )
+
+    # This is a computed field, it is exposed when serializing the object
+    @computed_field  # type: ignore
+    @property
+    def schema_id(self) -> str:
+        """Returns the SHA1 hash of the complete schema.
+
+        Returns:
+            str: A SHA1 hash string representing the complete schema version.
+        """
+        return "sch_id_" + generate_blake2b_hash_from_string(json.dumps(self.json_schema, sort_keys=True).strip())
+
+
+class CreateIterationRequest(BaseModel):
+    """
+    Request model for performing a new iteration with custom inference settings and optional JSON schema.
+    """
+
+    inference_settings: InferenceSettings
+    json_schema: Optional[dict[str, Any]] = None
+    from_iteration_id: Optional[str] = Field(
+        default=None,
+        description="The ID of the iteration to copy the JSON Schema from.",
+    )
+
+    # validate that exactly one of from_iteration_id or json_schema is provided
+    @model_validator(mode="after")
+    def validate_one_of_from_iteration_id_or_json_schema(self) -> Self:
+        if (self.from_iteration_id is None) ^ (self.json_schema is None):
+            raise ValueError("Exactly one of from_iteration_id or json_schema must be provided")
+        return self
+
+
+class PatchIterationRequest(BaseModel):
+    inference_settings: Optional[InferenceSettings] = Field(default=None, description="The new inference settings of the iteration")
+    json_schema: Optional[dict[str, Any]] = Field(default=None, description="The new json schema of the iteration")
+
+
+class ProcessIterationRequest(BaseModel):
+    """Request model for processing an iteration - running extractions on documents."""
+
+    document_ids: Optional[list[str]] = Field(default=None, description="Specific document IDs to process. If None, all documents will be processed.")
+    only_outdated: bool = Field(default=True, description="Only process documents that need updates (prediction.updated_at is None or older than iteration.updated_at)")
+
+
+class DocumentStatus(BaseModel):
+    """Status of a document within an iteration."""
+
+    document_id: str
+    filename: str
+    needs_update: bool = Field(description="True if prediction is missing or outdated")
+    has_prediction: bool = Field(description="True if any prediction exists")
+    prediction_updated_at: Optional[datetime.datetime] = Field(description="When the prediction was last updated")
+    iteration_updated_at: datetime.datetime = Field(description="When the iteration settings were last updated")
+
+
+class IterationDocumentStatusResponse(BaseModel):
+    """Response showing the status of all documents in an iteration."""
+
+    iteration_id: str
+    documents: list[DocumentStatus]
+    total_documents: int
+    documents_needing_update: int
+    documents_up_to_date: int
+
+
+class AddIterationFromJsonlRequest(BaseModel):
+    jsonl_gcs_path: str
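
Note on the two computed hashes above: schema_id fingerprints the complete JSON schema, while schema_data_id first strips prompt-like fields (description, default, title, examples, ...) so that prompt-only edits do not bump the data version. Despite the docstrings mentioning SHA1, the helper generate_blake2b_hash_from_string produces a BLAKE2b digest. A minimal standard-library sketch of the idea — strip_fields here is an illustrative stand-in for the package's clean_schema, not its actual implementation:

import hashlib
import json
from typing import Any

FIELDS_TO_REMOVE = {"description", "default", "title", "examples"}

def strip_fields(node: Any) -> Any:
    # Recursively drop prompt-like keys so only structural schema data remains.
    if isinstance(node, dict):
        return {k: strip_fields(v) for k, v in node.items() if k not in FIELDS_TO_REMOVE}
    if isinstance(node, list):
        return [strip_fields(v) for v in node]
    return node

def blake2b_hex(payload: str) -> str:
    return hashlib.blake2b(payload.encode()).hexdigest()

schema_a = {"type": "object", "properties": {"name": {"type": "string", "description": "Full name"}}}
schema_b = {"type": "object", "properties": {"name": {"type": "string", "description": "Customer name"}}}

# Same structure, different descriptions: the "data" hash matches, the full hash does not.
assert blake2b_hex(json.dumps(strip_fields(schema_a), sort_keys=True)) == blake2b_hex(json.dumps(strip_fields(schema_b), sort_keys=True))
assert blake2b_hex(json.dumps(schema_a, sort_keys=True)) != blake2b_hex(json.dumps(schema_b, sort_keys=True))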
retab/types/evaluations/model.py ADDED
@@ -0,0 +1,73 @@
+import datetime
+import json
+from typing import Any, Optional
+
+import nanoid  # type: ignore
+from pydantic import BaseModel, Field, computed_field
+
+from ..._utils.json_schema import compute_schema_data_id
+from ..._utils.mime import generate_blake2b_hash_from_string
+from ..inference_settings import InferenceSettings
+from .documents import EvaluationDocument
+from .iterations import Iteration
+
+
+# Actual Object stored in DB
+class Evaluation(BaseModel):
+    id: str = Field(default_factory=lambda: "eval_" + nanoid.generate())
+    updated_at: datetime.datetime = Field(default_factory=lambda: datetime.datetime.now(tz=datetime.timezone.utc))
+
+    name: str
+    documents: list[EvaluationDocument] = Field(default_factory=list)
+    iterations: list[Iteration] = Field(default_factory=list)
+    json_schema: dict[str, Any]
+
+    project_id: str = Field(description="The ID of the project", default="default_spreadsheets")
+    default_inference_settings: InferenceSettings = Field(
+        default=InferenceSettings(), description="The default inference properties for the evaluation (mostly used in the frontend)"
+    )
+
+    @computed_field  # type: ignore
+    @property
+    def schema_data_id(self) -> str:
+        """Returns the SHA1 hash of the schema data, ignoring all prompt/description/default fields.
+
+        Returns:
+            str: A SHA1 hash string representing the schema data version.
+        """
+        return compute_schema_data_id(self.json_schema)
+
+    # This is a computed field, it is exposed when serializing the object
+    @computed_field  # type: ignore
+    @property
+    def schema_id(self) -> str:
+        """Returns the SHA1 hash of the complete schema.
+
+        Returns:
+            str: A SHA1 hash string representing the complete schema version.
+        """
+        return "sch_id_" + generate_blake2b_hash_from_string(json.dumps(self.json_schema, sort_keys=True).strip())
+
+
+class CreateEvaluation(BaseModel):
+    name: str
+    json_schema: dict[str, Any]
+    project_id: str = Field(description="The ID of the project", default="default_spreadsheets")
+    default_inference_settings: InferenceSettings = Field(default=InferenceSettings(), description="The default inference properties for the evaluation.")
+
+
+class ListEvaluationParams(BaseModel):
+    project_id: Optional[str] = Field(default=None, description="The ID of the project")
+    schema_id: Optional[str] = Field(default=None, description="The ID of the schema")
+    schema_data_id: Optional[str] = Field(default=None, description="The ID of the schema data")
+
+
+class PatchEvaluationRequest(BaseModel):
+    name: Optional[str] = Field(default=None, description="The name of the document")
+    json_schema: Optional[dict[str, Any]] = Field(default=None, description="The json schema of the evaluation")
+    project_id: Optional[str] = Field(default=None, description="The ID of the project")
+    default_inference_settings: Optional[InferenceSettings] = Field(default=None, description="The default inference properties for the evaluation (mostly used in the frontend)")
+
+
+class AddIterationFromJsonlRequest(BaseModel):
+    jsonl_gcs_path: str
retab/types/events.py ADDED
@@ -0,0 +1,79 @@
+import datetime
+from typing import Any, Literal, Optional
+
+import nanoid  # type: ignore
+from pydantic import BaseModel, Field
+
+metadata_key = Literal[
+    "automation",
+    "cron",
+    "data_structure",
+    "dataset",
+    "dataset_membership",
+    "endpoint",
+    "evaluation",
+    "extraction",
+    "file",
+    "files",
+    "link",
+    "mailbox",
+    "organization",
+    "outlook",
+    "preprocessing",
+    "preprocessing",
+    "reconciliation",
+    "schema",
+    "schema_data",
+    "template",
+    "user",
+    "webhook",
+]
+
+event_type = Literal[
+    "extraction.created",
+    "messages.created",
+    "document.orientation_corrected",
+    "consensus.reconciled",
+    "automation.created",
+    "automation.updated",
+    "automation.deleted",
+    "automation.webhook",
+    "preprocessing.created",
+    "link.created",
+    "link.updated",
+    "link.deleted",
+    "link.webhook",
+    "mailbox.created",
+    "mailbox.updated",
+    "mailbox.deleted",
+    "mailbox.webhook",
+    "outlook.created",
+    "outlook.updated",
+    "outlook.deleted",
+    "outlook.webhook",
+    "schema.generated",
+    "schema.promptified",
+    "schema.system_promptfile.created",
+    "file.updated",
+    "file.deleted",
+    "template.created",
+    "template.deleted",
+    "template.sample_document_uploaded",
+    "template.sample_document_deleted",
+    "template.updated",
+]
+
+
+class Event(BaseModel):
+    object: Literal["event"] = "event"
+    id: str = Field(default_factory=lambda: "event_" + nanoid.generate(), description="Unique identifier for the event")
+    event: str = Field(..., description="A string that distinguishes the event type. Ex: user.created, user.updated, user.deleted, etc.")
+    created_at: datetime.datetime = Field(default_factory=lambda: datetime.datetime.now(datetime.timezone.utc))
+    data: dict[str, Any] = Field(..., description="Event payload. Payloads match the corresponding API objects.")
+    metadata: Optional[dict[metadata_key, str]] = Field(
+        default=None, description="Ids giving informations about the event. Ex: user.created.metadata = {'user': 'usr_8478973619047837'}"
+    )
+
+
+class StoredEvent(Event):
+    organization_id: str = Field(..., description="Organization ID")
{uiform → retab}/types/extractions.py CHANGED
@@ -4,12 +4,12 @@ from typing import Any, Literal, Optional
 import nanoid  # type: ignore
 from openai.types.chat import ChatCompletion
 from openai.types.chat.chat_completion_reasoning_effort import ChatCompletionReasoningEffort
-from pydantic import BaseModel, Field, computed_field
+from pydantic import BaseModel, Field, computed_field, model_validator
 
 from uiform.types.chat import ChatCompletionUiformMessage
 from uiform.types.documents.extractions import UiParsedChatCompletion
 
-from .._utils.usage.usage import compute_cost_from_model, compute_cost_from_model_with_breakdown, CostBreakdown
+from .._utils.usage.usage import CostBreakdown, compute_cost_from_model, compute_cost_from_model_with_breakdown
 from .ai_models import Amount
 from .modalities import Modality
 
@@ -17,13 +17,14 @@ ValidationsState = Literal["pending", "validated", "invalid"]
 
 
 class ExtractionSource(BaseModel):
-    type: Literal["api", "annotation","processor", "automation.link", "automation.mailbox", "automation.cron", "automation.outlook", "automation.endpoint", "schema.extract"] = Field(
-        description="Type of extraction"
+    type: Literal["api", "annotation", "processor", "automation.link", "automation.mailbox", "automation.cron", "automation.outlook", "automation.endpoint", "schema.extract"] = (
+        Field(description="Type of extraction")
     )
     id: str | None = Field(default=None, description="ID the trigger of the extraction")
 
 
-ExtractionSteps = str | Literal['initialization', 'prepare_messages', 'yield_first_token', 'completion']  # Steps are meant to not overlap
+ExtractionSteps = str | Literal["initialization", "prepare_messages", "yield_first_token", "completion"]  # Steps are meant to not overlap
+BrowserCanvas = Literal["A3", "A4", "A5"]
 
 
 class ExtractionTimingStep(BaseModel):
@@ -36,8 +37,11 @@ class Extraction(BaseModel):
     id: str = Field(default_factory=lambda: "extr_" + nanoid.generate(), description="Unique identifier of the analysis")
     messages: list[ChatCompletionUiformMessage] = Field(default_factory=list)
     messages_gcs: str = Field(..., description="GCS path to the messages")
-    file_gcs: str = Field(..., description="GCS path to the file")
-    file_id: str = Field(..., description="ID of the file")
+    file_gcs_paths: list[str] = Field(..., description="GCS paths to the files")
+    file_ids: list[str] = Field(..., description="IDs of the files")
+    # Legacy fields for backward compatibility
+    file_gcs: str = Field(default="", description="GCS path to the first file (deprecated)")
+    file_id: str = Field(default="", description="ID of the first file (deprecated)")
 
     status: Literal["success", "failed"] = Field(..., description="Whether the analysis was successful")
     completion: UiParsedChatCompletion | ChatCompletion = Field(..., description="Response generated by the analysis")
@@ -46,7 +50,9 @@ class Extraction(BaseModel):
     temperature: float = Field(default=0.0, description="Temperature used for the analysis")
     source: ExtractionSource = Field(..., description="Source of the extraction")
     image_resolution_dpi: int = Field(default=96, description="Resolution of the image sent to the LLM")
-    browser_canvas: Literal['A3', 'A4', 'A5'] = Field(default='A4', description="Sets the size of the browser canvas for rendering documents in browser-based processing. Choose a size that matches the document type.")
+    browser_canvas: BrowserCanvas = Field(
+        default="A4", description="Sets the size of the browser canvas for rendering documents in browser-based processing. Choose a size that matches the document type."
+    )
     modality: Modality = Field(default="native", description="Modality of the extraction")
     reasoning_effort: Optional[ChatCompletionReasoningEffort] = Field(default=None, description="The effort level for the model to reason about the input data.")
     timings: list[ExtractionTimingStep] = Field(default_factory=list, description="Timings of the extraction")
@@ -60,7 +66,24 @@ class Extraction(BaseModel):
     validation_state: Optional[ValidationsState] = Field(default=None, description="Validation state of the extraction")
     billed: bool = Field(default=False, description="Whether the extraction has been billed or not")
 
-    @computed_field
+    @model_validator(mode="before")
+    def handle_legacy_fields(cls, data):
+        """Handle backward compatibility for legacy file_gcs and file_id fields."""
+        if isinstance(data, dict):
+            # If only legacy fields are provided, convert to new format
+            if "file_gcs" in data and "file_gcs_paths" not in data:
+                data["file_gcs_paths"] = [data["file_gcs"]]
+            if "file_id" in data and "file_ids" not in data:
+                data["file_ids"] = [data["file_id"]]
+
+            # Set legacy fields from new format for backward compatibility
+            if "file_gcs_paths" in data and data["file_gcs_paths"]:
+                data["file_gcs"] = data["file_gcs_paths"][0]
+            if "file_ids" in data and data["file_ids"]:
+                data["file_id"] = data["file_ids"][0]
+        return data
+
+    @computed_field  # type: ignore
     @property
     def api_cost(self) -> Optional[Amount]:
         if self.completion and self.completion.usage:
@@ -71,7 +94,7 @@ class Extraction(BaseModel):
                 print(f"Error computing cost: {e}")
                 return None
         return None
-
+
     @computed_field  # type: ignore
     @property
     def cost_breakdown(self) -> Optional[CostBreakdown]:
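
The mode="before" validator introduced above migrates payloads in both directions: old payloads carrying only file_gcs / file_id are expanded into the new list fields, and list-based payloads mirror their first element back into the legacy scalar fields. A self-contained sketch of the same pattern (field names follow the diff; the model is trimmed to just the fields involved):

from pydantic import BaseModel, Field, model_validator

class ExtractionFiles(BaseModel):
    file_gcs_paths: list[str] = Field(default_factory=list)
    file_ids: list[str] = Field(default_factory=list)
    # Legacy single-file fields kept for backward compatibility
    file_gcs: str = ""
    file_id: str = ""

    @model_validator(mode="before")
    @classmethod
    def handle_legacy_fields(cls, data):
        if isinstance(data, dict):
            # Old-style payload: promote the scalar fields to the new list fields
            if "file_gcs" in data and "file_gcs_paths" not in data:
                data["file_gcs_paths"] = [data["file_gcs"]]
            if "file_id" in data and "file_ids" not in data:
                data["file_ids"] = [data["file_id"]]
            # New-style payload: mirror the first element into the legacy fields
            if data.get("file_gcs_paths"):
                data["file_gcs"] = data["file_gcs_paths"][0]
            if data.get("file_ids"):
                data["file_id"] = data["file_ids"][0]
        return data

old = ExtractionFiles(file_gcs="gs://bucket/a.pdf", file_id="file_1")
assert old.file_gcs_paths == ["gs://bucket/a.pdf"]

new = ExtractionFiles(file_gcs_paths=["gs://b/x.pdf", "gs://b/y.pdf"], file_ids=["f_1", "f_2"])
assert new.file_gcs == "gs://b/x.pdf" and new.file_id == "f_1"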
retab/types/inference_settings.py ADDED
@@ -0,0 +1,15 @@
+from openai.types.chat.chat_completion_reasoning_effort import ChatCompletionReasoningEffort
+from pydantic import BaseModel, Field
+
+from .browser_canvas import BrowserCanvas
+from .modalities import Modality
+
+
+class InferenceSettings(BaseModel):
+    model: str = "gpt-4.1-mini"
+    temperature: float = 0.0
+    modality: Modality = "native"
+    reasoning_effort: ChatCompletionReasoningEffort = "medium"
+    image_resolution_dpi: int = 96
+    browser_canvas: BrowserCanvas = "A4"
+    n_consensus: int = Field(default=1, description="Number of consensus rounds to perform")
retab/types/jobs/base.py ADDED
@@ -0,0 +1,54 @@
+from typing import Literal, Optional, Self
+
+from pydantic import BaseModel, model_validator
+from ..inference_settings import InferenceSettings
+
+SelectionMode = Literal["all", "manual"]
+
+
+# This is the input data for the prepare_dataset job
+class PrepareDatasetInputData(BaseModel):
+    dataset_id: Optional[str] = None
+    schema_id: Optional[str] = None
+    schema_data_id: Optional[str] = None
+
+    selection_model: SelectionMode = "all"
+
+    @model_validator(mode="after")
+    def validate_input(self) -> Self:
+        # The preference is:
+        # 1. dataset_id
+        # 2. schema_id
+        # 3. schema_data_id
+        if self.dataset_id is None and self.schema_id is None and self.schema_data_id is None:
+            raise ValueError("At least one of dataset_id, schema_id, or schema_data_id must be provided")
+
+        return self
+
+
+# This is the input data for the split_dataset job
+class DatasetSplitInputData(BaseModel):
+    dataset_id: str
+    train_size: Optional[int | float] = None
+    eval_size: Optional[int | float] = None
+
+    @model_validator(mode="after")
+    def validate_input(self) -> Self:
+        if self.train_size is not None and self.eval_size is not None:
+            raise ValueError("train_size and eval_size cannot both be provided")
+        return self
+
+
+# This is the input data for the batch annotation job
+class AnnotationInputData(BaseModel):
+    data_file: str
+    schema_id: str
+    inference_settings: InferenceSettings
+
+
+# This is the input data for the evaluation job
+class EvaluationInputData(BaseModel):
+    eval_data_file: str
+    schema_id: str
+    inference_settings_1: InferenceSettings | None = None
+    inference_settings_2: InferenceSettings
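
Both job models above rely on mode="after" validators to enforce cross-field constraints that no single field type can express. A short usage sketch of the DatasetSplitInputData rule (class and field names from the diff):

from typing import Optional, Self

from pydantic import BaseModel, ValidationError, model_validator

class DatasetSplitInputData(BaseModel):
    dataset_id: str
    train_size: Optional[int | float] = None
    eval_size: Optional[int | float] = None

    @model_validator(mode="after")
    def validate_input(self) -> Self:
        # train_size and eval_size are two ways of specifying the same split
        if self.train_size is not None and self.eval_size is not None:
            raise ValueError("train_size and eval_size cannot both be provided")
        return self

DatasetSplitInputData(dataset_id="ds_1", train_size=0.8)  # ok
try:
    DatasetSplitInputData(dataset_id="ds_1", train_size=0.8, eval_size=0.2)
except ValidationError:
    pass  # rejected: the two sizes are mutually exclusive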
retab/types/jobs/batch_annotation.py ADDED
@@ -0,0 +1,12 @@
+from typing import Optional
+
+from pydantic import BaseModel
+
+from ..inference_settings import InferenceSettings
+
+
+class AnnotationInputData(BaseModel):
+    dataset_id: str
+    files_ids: Optional[list[str]] = None
+    upsert: bool = False
+    inference_settings: InferenceSettings
{uiform → retab}/types/jobs/evaluation.py CHANGED
@@ -1,7 +1,6 @@
 from pydantic import BaseModel
 
-from ..._utils.benchmarking import EvalMetrics, SingleFileEval, compute_dict_difference
-from .batch_annotation import AnnotationInputData, InferenceSettings
+from ..inference_settings import InferenceSettings
 
 # This job will generate two datasets from the original dataset, one with the first annotation and one with the second annotation
 # It will then evaluate the two datasets using the evaluation metrics and return an EvalMetrics object
{uiform → retab}/types/logs.py CHANGED
@@ -1,22 +1,21 @@
-import copy
 import datetime
 import json
 from typing import Any, Dict, List, Literal, Optional
 
 import nanoid  # type: ignore
-from openai import OpenAI
+from openai.types.chat.chat_completion import ChatCompletion
 from openai.types.chat.chat_completion_reasoning_effort import ChatCompletionReasoningEffort
-from pydantic import BaseModel, EmailStr, Field, HttpUrl, computed_field, field_serializer
-from pydantic_core import Url
+from pydantic import BaseModel, EmailStr, Field, HttpUrl, computed_field, field_validator
 
-from .._utils.json_schema import clean_schema, compute_schema_data_id
+from .._utils.json_schema import compute_schema_data_id
 from .._utils.mime import generate_blake2b_hash_from_string
-from .._utils.usage.usage import compute_cost_from_model, compute_cost_from_model_with_breakdown, CostBreakdown
+from .._utils.usage.usage import CostBreakdown, compute_cost_from_model, compute_cost_from_model_with_breakdown
 from .ai_models import Amount
 from .documents.extractions import UiParsedChatCompletion
 from .mime import BaseMIMEData
 from .modalities import Modality
 from .pagination import ListMetadata
+from .browser_canvas import BrowserCanvas
 
 
 class ProcessorConfig(BaseModel):
@@ -27,7 +26,9 @@ class ProcessorConfig(BaseModel):
 
     modality: Modality
     image_resolution_dpi: int = Field(default=96, description="Resolution of the image sent to the LLM")
-    browser_canvas: Literal['A3', 'A4', 'A5'] = Field(default='A4', description="Sets the size of the browser canvas for rendering documents in browser-based processing. Choose a size that matches the document type.")
+    browser_canvas: BrowserCanvas = Field(
+        default="A4", description="Sets the size of the browser canvas for rendering documents in browser-based processing. Choose a size that matches the document type."
+    )
 
     # New attributes
     model: str = Field(..., description="Model used for chat completion")
@@ -61,7 +62,11 @@ class ProcessorConfig(BaseModel):
 
 
 class AutomationConfig(BaseModel):
-    object: str = Field(default="automation", description="Type of the object")
+    @computed_field
+    @property
+    def object(self) -> str:
+        return "automation"
+
     id: str = Field(default_factory=lambda: "auto_" + nanoid.generate(), description="Unique identifier for the automation")
     name: str = Field(..., description="Name of the automation")
     processor_id: str = Field(..., description="ID of the processor to use for the automation")
@@ -70,14 +75,16 @@ class AutomationConfig(BaseModel):
     default_language: str = Field(default="en", description="Default language for the automation")
 
     # HTTP Config
-    webhook_url: HttpUrl = Field(..., description="Url of the webhook to send the data to")
+    webhook_url: str = Field(..., description="Url of the webhook to send the data to")
     webhook_headers: Dict[str, str] = Field(default_factory=dict, description="Headers to send with the request")
 
     need_validation: bool = Field(default=False, description="If the automation needs to be validated before running")
 
-    @field_serializer('webhook_url')
-    def url2str(self, val: HttpUrl) -> str:
-        return str(val)
+    @field_validator("webhook_url", mode="after")
+    def validate_httpurl(cls, val: Any) -> Any:
+        if isinstance(val, str):
+            HttpUrl(val)
+        return val
 
 
 class UpdateProcessorRequest(BaseModel):
@@ -87,7 +94,7 @@ class UpdateProcessorRequest(BaseModel):
     name: Optional[str] = None
     modality: Optional[Modality] = None
     image_resolution_dpi: Optional[int] = None
-    browser_canvas: Optional[Literal['A3', 'A4', 'A5']] = None
+    browser_canvas: Optional[BrowserCanvas] = None
     model: Optional[str] = None
    json_schema: Optional[Dict] = None
     temperature: Optional[float] = None
@@ -121,24 +128,24 @@ class UpdateProcessorRequest(BaseModel):
 
 class UpdateAutomationRequest(BaseModel):
     name: Optional[str] = None
-    processor_id: Optional[str] = None
+    # processor_id: Optional[str] = None  # TODO: Is it allowed to change the processor_id?
 
     default_language: Optional[str] = None
 
-    webhook_url: Optional[HttpUrl] = None
-    webhook_headers: Optional[Dict[str, str]] = None
-
-    need_validation: Optional[bool] = None
+    webhook_url: Optional[str] = None
+    webhook_headers: Optional[dict[str, str]] = None
 
+    need_validation: Optional[bool] = None
 
-    @field_serializer('webhook_url')
-    def url2str(self, val: HttpUrl | None) -> str | None:
-        if isinstance(val, HttpUrl):
-            return str(val)
+    @field_validator("webhook_url", mode="after")
+    def validate_httpurl(cls, val: Any) -> Any:
+        if isinstance(val, str):
+            HttpUrl(val)
         return val
 
+
 class OpenAIRequestConfig(BaseModel):
-    object: Literal['openai_request'] = "openai_request"
+    object: Literal["openai_request"] = "openai_request"
     id: str = Field(default_factory=lambda: "openai_req_" + nanoid.generate(), description="Unique identifier for the openai request")
     model: str
     json_schema: dict[str, Any]
@@ -160,7 +167,7 @@ class OpenAIRequestConfig(BaseModel):
 
 
 class ExternalRequestLog(BaseModel):
-    webhook_url: Optional[HttpUrl]
+    webhook_url: Optional[str]
     request_body: dict[str, Any]
     request_headers: dict[str, str]
     request_at: datetime.datetime
@@ -173,24 +180,20 @@ class ExternalRequestLog(BaseModel):
     error: Optional[str] = None
     duration_ms: float
 
-    @field_serializer('webhook_url')
-    def url2str(self, val: HttpUrl | None) -> str | None:
-        if isinstance(val, HttpUrl):
-            return str(val)
+    @field_validator("webhook_url", mode="after")
+    def validate_httpurl(cls, val: Any) -> Any:
+        if isinstance(val, str):
+            HttpUrl(val)
         return val
 
 
-from openai.types.chat import completion_create_params
-from openai.types.chat.chat_completion import ChatCompletion
-
-
 class LogCompletionRequest(BaseModel):
     json_schema: dict[str, Any]
     completion: ChatCompletion
 
 
 class AutomationLog(BaseModel):
-    object: Literal['automation_log'] = "automation_log"
+    object: Literal["automation_log"] = "automation_log"
     id: str = Field(default_factory=lambda: "log_auto_" + nanoid.generate(), description="Unique identifier for the automation log")
     user_email: Optional[EmailStr]  # When the user is logged or when he forwards an email
     organization_id: str
@@ -212,7 +215,7 @@ class AutomationLog(BaseModel):
             print(f"Error computing cost: {e}")
             return None
         return None
-
+
     @computed_field  # type: ignore
     @property
     def cost_breakdown(self) -> Optional[CostBreakdown]:
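
Throughout logs.py the webhook_url fields switch from HttpUrl with a field_serializer to plain str with a field_validator: the value is still parsed by HttpUrl at validation time, but what is stored and serialized stays a string, which is why the url2str serializers are dropped. A small sketch of the resulting behaviour (trimmed to the webhook_url field; the validator body is as in the diff):

from typing import Any

from pydantic import BaseModel, Field, HttpUrl, ValidationError, field_validator

class AutomationConfig(BaseModel):
    webhook_url: str = Field(..., description="Url of the webhook to send the data to")

    @field_validator("webhook_url", mode="after")
    @classmethod
    def validate_httpurl(cls, val: Any) -> Any:
        if isinstance(val, str):
            HttpUrl(val)  # raises if not a valid URL; the stored value stays a str
        return val

cfg = AutomationConfig(webhook_url="https://example.com/hook")
assert cfg.model_dump()["webhook_url"] == "https://example.com/hook"  # plain str, no serializer needed

try:
    AutomationConfig(webhook_url="not a url")
except ValidationError:
    pass  # invalid URLs are rejected at validation time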
retab/types/metrics.py ADDED
@@ -0,0 +1,32 @@
+from typing import Any, Literal, Optional
+from pydantic import BaseModel, Field
+
+
+# Define the type alias for MetricType
+MetricType = Literal["levenshtein", "jaccard", "hamming"]
+
+
+# Define the structure for an individual item metric
+class ItemMetric(BaseModel):
+    id: str = Field(description="The ID of the item being measured")
+    name: str = Field(description="The name of the item being measured")
+    similarity: float = Field(description="The similarity score between 0 and 1")
+    similarities: dict[str, Any] = Field(description="The similarity scores for each item in the list")
+    flat_similarities: dict[str, Optional[float]] = Field(description="The similarity scores for each item in the list in dot notation format")
+    aligned_similarity: float = Field(description="The similarity score between 0 and 1, after alignment")
+    aligned_similarities: dict[str, Any] = Field(description="The similarity scores for each item in the list, after alignment")
+    aligned_flat_similarities: dict[str, Optional[float]] = Field(description="The similarity scores for each item in the list in dot notation format, after alignment")
+
+
+# Define the main MetricResult model
+class MetricResult(BaseModel):
+    item_metrics: list[ItemMetric] = Field(description="List of similarity metrics for individual items")
+    mean_similarity: float = Field(description="The average similarity score across all items")
+    aligned_mean_similarity: float = Field(description="The average similarity score across all items, after alignment")
+    metric_type: MetricType = Field(description="The type of similarity metric used for comparison")
+
+
+class DistancesResult(BaseModel):
+    distances: dict[str, Any] = Field(description="List of distances for individual items")
+    mean_distance: float = Field(description="The average distance across all items")
+    metric_type: MetricType = Field(description="The type of distance metric used for comparison")