retab 0.0.36-py3-none-any.whl → 0.0.37-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {uiform → retab}/_utils/ai_models.py +2 -2
- {uiform → retab}/_utils/benchmarking.py +15 -16
- {uiform → retab}/_utils/chat.py +9 -14
- {uiform → retab}/_utils/display.py +0 -3
- {uiform → retab}/_utils/json_schema.py +9 -14
- {uiform → retab}/_utils/mime.py +11 -14
- {uiform → retab}/_utils/responses.py +9 -3
- {uiform → retab}/_utils/stream_context_managers.py +1 -1
- {uiform → retab}/_utils/usage/usage.py +28 -28
- {uiform → retab}/client.py +32 -31
- {uiform → retab}/resources/consensus/client.py +17 -36
- {uiform → retab}/resources/consensus/completions.py +24 -47
- {uiform → retab}/resources/consensus/completions_stream.py +26 -38
- {uiform → retab}/resources/consensus/responses.py +31 -80
- {uiform → retab}/resources/consensus/responses_stream.py +31 -79
- {uiform → retab}/resources/documents/client.py +59 -45
- {uiform → retab}/resources/documents/extractions.py +181 -90
- {uiform → retab}/resources/evals.py +56 -43
- retab/resources/evaluations/__init__.py +3 -0
- retab/resources/evaluations/client.py +301 -0
- retab/resources/evaluations/documents.py +233 -0
- retab/resources/evaluations/iterations.py +452 -0
- {uiform → retab}/resources/files.py +2 -2
- {uiform → retab}/resources/jsonlUtils.py +220 -216
- retab/resources/models.py +73 -0
- retab/resources/processors/automations/client.py +244 -0
- {uiform → retab}/resources/processors/automations/endpoints.py +77 -118
- retab/resources/processors/automations/links.py +294 -0
- {uiform → retab}/resources/processors/automations/logs.py +30 -19
- {uiform → retab}/resources/processors/automations/mailboxes.py +136 -174
- retab/resources/processors/automations/outlook.py +337 -0
- {uiform → retab}/resources/processors/automations/tests.py +22 -25
- {uiform → retab}/resources/processors/client.py +179 -164
- {uiform → retab}/resources/schemas.py +78 -66
- {uiform → retab}/resources/secrets/external_api_keys.py +1 -5
- retab/resources/secrets/webhook.py +64 -0
- {uiform → retab}/resources/usage.py +39 -2
- {uiform → retab}/types/ai_models.py +13 -13
- {uiform → retab}/types/automations/cron.py +19 -12
- {uiform → retab}/types/automations/endpoints.py +7 -4
- {uiform → retab}/types/automations/links.py +7 -3
- {uiform → retab}/types/automations/mailboxes.py +9 -9
- {uiform → retab}/types/automations/outlook.py +15 -11
- retab/types/browser_canvas.py +3 -0
- {uiform → retab}/types/chat.py +2 -2
- {uiform → retab}/types/completions.py +9 -12
- retab/types/consensus.py +19 -0
- {uiform → retab}/types/db/annotations.py +3 -3
- {uiform → retab}/types/db/files.py +8 -6
- {uiform → retab}/types/documents/create_messages.py +18 -20
- {uiform → retab}/types/documents/extractions.py +69 -24
- {uiform → retab}/types/evals.py +5 -5
- retab/types/evaluations/__init__.py +31 -0
- retab/types/evaluations/documents.py +30 -0
- retab/types/evaluations/iterations.py +112 -0
- retab/types/evaluations/model.py +73 -0
- retab/types/events.py +79 -0
- {uiform → retab}/types/extractions.py +33 -10
- retab/types/inference_settings.py +15 -0
- retab/types/jobs/base.py +54 -0
- retab/types/jobs/batch_annotation.py +12 -0
- {uiform → retab}/types/jobs/evaluation.py +1 -2
- {uiform → retab}/types/logs.py +37 -34
- retab/types/metrics.py +32 -0
- {uiform → retab}/types/mime.py +22 -20
- {uiform → retab}/types/modalities.py +10 -10
- retab/types/predictions.py +19 -0
- {uiform → retab}/types/schemas/enhance.py +4 -2
- {uiform → retab}/types/schemas/evaluate.py +7 -4
- {uiform → retab}/types/schemas/generate.py +6 -3
- {uiform → retab}/types/schemas/layout.py +1 -1
- {uiform → retab}/types/schemas/object.py +13 -14
- {uiform → retab}/types/schemas/templates.py +1 -3
- {uiform → retab}/types/secrets/external_api_keys.py +0 -1
- {uiform → retab}/types/standards.py +18 -1
- {retab-0.0.36.dist-info → retab-0.0.37.dist-info}/METADATA +7 -6
- retab-0.0.37.dist-info/RECORD +107 -0
- retab-0.0.37.dist-info/top_level.txt +1 -0
- retab-0.0.36.dist-info/RECORD +0 -96
- retab-0.0.36.dist-info/top_level.txt +0 -1
- uiform/_utils/benchmarking copy.py +0 -588
- uiform/resources/models.py +0 -45
- uiform/resources/processors/automations/client.py +0 -78
- uiform/resources/processors/automations/links.py +0 -356
- uiform/resources/processors/automations/outlook.py +0 -444
- uiform/resources/secrets/webhook.py +0 -62
- uiform/types/consensus.py +0 -10
- uiform/types/events.py +0 -76
- uiform/types/jobs/base.py +0 -150
- uiform/types/jobs/batch_annotation.py +0 -22
- {uiform → retab}/__init__.py +0 -0
- {uiform → retab}/_resource.py +0 -0
- {uiform → retab}/_utils/__init__.py +0 -0
- {uiform → retab}/_utils/usage/__init__.py +0 -0
- {uiform → retab}/py.typed +0 -0
- {uiform → retab}/resources/__init__.py +0 -0
- {uiform → retab}/resources/consensus/__init__.py +0 -0
- {uiform → retab}/resources/documents/__init__.py +0 -0
- {uiform → retab}/resources/finetuning.py +0 -0
- {uiform → retab}/resources/openai_example.py +0 -0
- {uiform → retab}/resources/processors/__init__.py +0 -0
- {uiform → retab}/resources/processors/automations/__init__.py +0 -0
- {uiform → retab}/resources/prompt_optimization.py +0 -0
- {uiform → retab}/resources/secrets/__init__.py +0 -0
- {uiform → retab}/resources/secrets/client.py +0 -0
- {uiform → retab}/types/__init__.py +0 -0
- {uiform → retab}/types/automations/__init__.py +0 -0
- {uiform → retab}/types/automations/webhooks.py +0 -0
- {uiform → retab}/types/db/__init__.py +0 -0
- {uiform → retab}/types/documents/__init__.py +0 -0
- {uiform → retab}/types/documents/correct_orientation.py +0 -0
- {uiform → retab}/types/jobs/__init__.py +0 -0
- {uiform → retab}/types/jobs/finetune.py +0 -0
- {uiform → retab}/types/jobs/prompt_optimization.py +0 -0
- {uiform → retab}/types/jobs/webcrawl.py +0 -0
- {uiform → retab}/types/pagination.py +0 -0
- {uiform → retab}/types/schemas/__init__.py +0 -0
- {uiform → retab}/types/secrets/__init__.py +0 -0
- {retab-0.0.36.dist-info → retab-0.0.37.dist-info}/WHEEL +0 -0
retab/types/evaluations/iterations.py
ADDED
@@ -0,0 +1,112 @@
+import copy
+import datetime
+import json
+from typing import Any, Optional, Self
+
+import nanoid  # type: ignore
+from pydantic import BaseModel, Field, computed_field, model_validator
+
+from ..._utils.json_schema import clean_schema
+from ..._utils.mime import generate_blake2b_hash_from_string
+from ..inference_settings import InferenceSettings
+from ..metrics import MetricResult
+from ..predictions import PredictionData
+
+
+class Iteration(BaseModel):
+    id: str = Field(default_factory=lambda: "eval_iter_" + nanoid.generate())
+    updated_at: datetime.datetime = Field(
+        default_factory=lambda: datetime.datetime.now(tz=datetime.timezone.utc),
+        description="The last update date of inference settings or json schema",
+    )
+    inference_settings: InferenceSettings
+    json_schema: dict[str, Any]
+    predictions: dict[str, PredictionData] = Field(default_factory=dict, description="The predictions of the iteration for all the documents")
+    metric_results: Optional[MetricResult] = Field(default=None, description="The metric results of the iteration")
+
+    @computed_field  # type: ignore
+    @property
+    def schema_data_id(self) -> str:
+        """Returns the SHA1 hash of the schema data, ignoring all prompt/description/default fields.
+
+        Returns:
+            str: A SHA1 hash string representing the schema data version.
+        """
+        return "sch_data_id_" + generate_blake2b_hash_from_string(
+            json.dumps(
+                clean_schema(
+                    copy.deepcopy(self.json_schema),
+                    remove_custom_fields=True,
+                    fields_to_remove=["description", "default", "title", "required", "examples", "deprecated", "readOnly", "writeOnly"],
+                ),
+                sort_keys=True,
+            ).strip()
+        )
+
+    # This is a computed field, it is exposed when serializing the object
+    @computed_field  # type: ignore
+    @property
+    def schema_id(self) -> str:
+        """Returns the SHA1 hash of the complete schema.
+
+        Returns:
+            str: A SHA1 hash string representing the complete schema version.
+        """
+        return "sch_id_" + generate_blake2b_hash_from_string(json.dumps(self.json_schema, sort_keys=True).strip())
+
+
+class CreateIterationRequest(BaseModel):
+    """
+    Request model for performing a new iteration with custom inference settings and optional JSON schema.
+    """
+
+    inference_settings: InferenceSettings
+    json_schema: Optional[dict[str, Any]] = None
+    from_iteration_id: Optional[str] = Field(
+        default=None,
+        description="The ID of the iteration to copy the JSON Schema from.",
+    )
+
+    # validate that exactly one of from_iteration_id or json_schema is provided
+    @model_validator(mode="after")
+    def validate_one_of_from_iteration_id_or_json_schema(self) -> Self:
+        if (self.from_iteration_id is None) ^ (self.json_schema is None):
+            raise ValueError("Exactly one of from_iteration_id or json_schema must be provided")
+        return self
+
+
+class PatchIterationRequest(BaseModel):
+    inference_settings: Optional[InferenceSettings] = Field(default=None, description="The new inference settings of the iteration")
+    json_schema: Optional[dict[str, Any]] = Field(default=None, description="The new json schema of the iteration")
+
+
+class ProcessIterationRequest(BaseModel):
+    """Request model for processing an iteration - running extractions on documents."""
+
+    document_ids: Optional[list[str]] = Field(default=None, description="Specific document IDs to process. If None, all documents will be processed.")
+    only_outdated: bool = Field(default=True, description="Only process documents that need updates (prediction.updated_at is None or older than iteration.updated_at)")
+
+
+class DocumentStatus(BaseModel):
+    """Status of a document within an iteration."""

+    document_id: str
+    filename: str
+    needs_update: bool = Field(description="True if prediction is missing or outdated")
+    has_prediction: bool = Field(description="True if any prediction exists")
+    prediction_updated_at: Optional[datetime.datetime] = Field(description="When the prediction was last updated")
+    iteration_updated_at: datetime.datetime = Field(description="When the iteration settings were last updated")
+
+
+class IterationDocumentStatusResponse(BaseModel):
+    """Response showing the status of all documents in an iteration."""
+
+    iteration_id: str
+    documents: list[DocumentStatus]
+    total_documents: int
+    documents_needing_update: int
+    documents_up_to_date: int
+
+
+class AddIterationFromJsonlRequest(BaseModel):
+    jsonl_gcs_path: str
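A minimal usage sketch (not part of the diff; import paths assumed from the file layout above): because clean_schema strips "description", "default", "title" and similar fields before hashing, two schemas that differ only in prompt wording share a schema_data_id, while schema_id tracks the full schema byte-for-byte.

from retab.types.evaluations.iterations import Iteration  # assumed path
from retab.types.inference_settings import InferenceSettings

schema_a = {"type": "object", "properties": {"name": {"type": "string", "description": "Full name"}}}
schema_b = {"type": "object", "properties": {"name": {"type": "string", "description": "Customer name"}}}

it_a = Iteration(inference_settings=InferenceSettings(), json_schema=schema_a)
it_b = Iteration(inference_settings=InferenceSettings(), json_schema=schema_b)

assert it_a.schema_data_id == it_b.schema_data_id  # prompt-only changes keep the data version stable
assert it_a.schema_id != it_b.schema_id            # the full-schema hash does change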
retab/types/evaluations/model.py
ADDED
@@ -0,0 +1,73 @@
+import datetime
+import json
+from typing import Any, Optional
+
+import nanoid  # type: ignore
+from pydantic import BaseModel, Field, computed_field
+
+from ..._utils.json_schema import compute_schema_data_id
+from ..._utils.mime import generate_blake2b_hash_from_string
+from ..inference_settings import InferenceSettings
+from .documents import EvaluationDocument
+from .iterations import Iteration
+
+
+# Actual Object stored in DB
+class Evaluation(BaseModel):
+    id: str = Field(default_factory=lambda: "eval_" + nanoid.generate())
+    updated_at: datetime.datetime = Field(default_factory=lambda: datetime.datetime.now(tz=datetime.timezone.utc))
+
+    name: str
+    documents: list[EvaluationDocument] = Field(default_factory=list)
+    iterations: list[Iteration] = Field(default_factory=list)
+    json_schema: dict[str, Any]
+
+    project_id: str = Field(description="The ID of the project", default="default_spreadsheets")
+    default_inference_settings: InferenceSettings = Field(
+        default=InferenceSettings(), description="The default inference properties for the evaluation (mostly used in the frontend)"
+    )
+
+    @computed_field  # type: ignore
+    @property
+    def schema_data_id(self) -> str:
+        """Returns the SHA1 hash of the schema data, ignoring all prompt/description/default fields.
+
+        Returns:
+            str: A SHA1 hash string representing the schema data version.
+        """
+        return compute_schema_data_id(self.json_schema)
+
+    # This is a computed field, it is exposed when serializing the object
+    @computed_field  # type: ignore
+    @property
+    def schema_id(self) -> str:
+        """Returns the SHA1 hash of the complete schema.
+
+        Returns:
+            str: A SHA1 hash string representing the complete schema version.
+        """
+        return "sch_id_" + generate_blake2b_hash_from_string(json.dumps(self.json_schema, sort_keys=True).strip())
+
+
+class CreateEvaluation(BaseModel):
+    name: str
+    json_schema: dict[str, Any]
+    project_id: str = Field(description="The ID of the project", default="default_spreadsheets")
+    default_inference_settings: InferenceSettings = Field(default=InferenceSettings(), description="The default inference properties for the evaluation.")
+
+
+class ListEvaluationParams(BaseModel):
+    project_id: Optional[str] = Field(default=None, description="The ID of the project")
+    schema_id: Optional[str] = Field(default=None, description="The ID of the schema")
+    schema_data_id: Optional[str] = Field(default=None, description="The ID of the schema data")
+
+
+class PatchEvaluationRequest(BaseModel):
+    name: Optional[str] = Field(default=None, description="The name of the document")
+    json_schema: Optional[dict[str, Any]] = Field(default=None, description="The json schema of the evaluation")
+    project_id: Optional[str] = Field(default=None, description="The ID of the project")
+    default_inference_settings: Optional[InferenceSettings] = Field(default=None, description="The default inference properties for the evaluation (mostly used in the frontend)")
+
+
+class AddIterationFromJsonlRequest(BaseModel):
+    jsonl_gcs_path: str
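A short sketch (assumed import path): the @computed_field properties are emitted on serialization, so a stored Evaluation carries its schema hashes without them being settable fields.

from retab.types.evaluations.model import Evaluation  # assumed path

ev = Evaluation(name="invoices", json_schema={"type": "object", "properties": {}})
dump = ev.model_dump()
assert dump["schema_id"].startswith("sch_id_")  # computed from json_schema, not passed in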
retab/types/events.py
ADDED
@@ -0,0 +1,79 @@
+import datetime
+from typing import Any, Literal, Optional
+
+import nanoid  # type: ignore
+from pydantic import BaseModel, Field
+
+metadata_key = Literal[
+    "automation",
+    "cron",
+    "data_structure",
+    "dataset",
+    "dataset_membership",
+    "endpoint",
+    "evaluation",
+    "extraction",
+    "file",
+    "files",
+    "link",
+    "mailbox",
+    "organization",
+    "outlook",
+    "preprocessing",
+    "preprocessing",
+    "reconciliation",
+    "schema",
+    "schema_data",
+    "template",
+    "user",
+    "webhook",
+]
+
+event_type = Literal[
+    "extraction.created",
+    "messages.created",
+    "document.orientation_corrected",
+    "consensus.reconciled",
+    "automation.created",
+    "automation.updated",
+    "automation.deleted",
+    "automation.webhook",
+    "preprocessing.created",
+    "link.created",
+    "link.updated",
+    "link.deleted",
+    "link.webhook",
+    "mailbox.created",
+    "mailbox.updated",
+    "mailbox.deleted",
+    "mailbox.webhook",
+    "outlook.created",
+    "outlook.updated",
+    "outlook.deleted",
+    "outlook.webhook",
+    "schema.generated",
+    "schema.promptified",
+    "schema.system_promptfile.created",
+    "file.updated",
+    "file.deleted",
+    "template.created",
+    "template.deleted",
+    "template.sample_document_uploaded",
+    "template.sample_document_deleted",
+    "template.updated",
+]
+
+
+class Event(BaseModel):
+    object: Literal["event"] = "event"
+    id: str = Field(default_factory=lambda: "event_" + nanoid.generate(), description="Unique identifier for the event")
+    event: str = Field(..., description="A string that distinguishes the event type. Ex: user.created, user.updated, user.deleted, etc.")
+    created_at: datetime.datetime = Field(default_factory=lambda: datetime.datetime.now(datetime.timezone.utc))
+    data: dict[str, Any] = Field(..., description="Event payload. Payloads match the corresponding API objects.")
+    metadata: Optional[dict[metadata_key, str]] = Field(
+        default=None, description="Ids giving informations about the event. Ex: user.created.metadata = {'user': 'usr_8478973619047837'}"
+    )
+
+
+class StoredEvent(Event):
+    organization_id: str = Field(..., description="Organization ID")
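A usage sketch (assumed import path): Event is a generic envelope where `event` carries a dotted type string and `metadata` maps the entity kinds listed above to their ids.

from retab.types.events import Event  # assumed path

evt = Event(
    event="automation.webhook",
    data={"status": "delivered"},
    metadata={"automation": "auto_8478973619047837"},
)
print(evt.id)  # "event_..." generated by nanoid; evt.created_at defaults to now (UTC)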
{uiform → retab}/types/extractions.py
RENAMED
@@ -4,12 +4,12 @@ from typing import Any, Literal, Optional
 import nanoid  # type: ignore
 from openai.types.chat import ChatCompletion
 from openai.types.chat.chat_completion_reasoning_effort import ChatCompletionReasoningEffort
-from pydantic import BaseModel, Field, computed_field
+from pydantic import BaseModel, Field, computed_field, model_validator
 
 from uiform.types.chat import ChatCompletionUiformMessage
 from uiform.types.documents.extractions import UiParsedChatCompletion
 
-from .._utils.usage.usage import compute_cost_from_model, compute_cost_from_model_with_breakdown
+from .._utils.usage.usage import CostBreakdown, compute_cost_from_model, compute_cost_from_model_with_breakdown
 from .ai_models import Amount
 from .modalities import Modality
 
@@ -17,13 +17,14 @@ ValidationsState = Literal["pending", "validated", "invalid"]
 
 
 class ExtractionSource(BaseModel):
-    type: Literal["api", "annotation","processor", "automation.link", "automation.mailbox", "automation.cron", "automation.outlook", "automation.endpoint", "schema.extract"] =
-        description="Type of extraction"
+    type: Literal["api", "annotation", "processor", "automation.link", "automation.mailbox", "automation.cron", "automation.outlook", "automation.endpoint", "schema.extract"] = (
+        Field(description="Type of extraction")
     )
     id: str | None = Field(default=None, description="ID the trigger of the extraction")
 
 
-ExtractionSteps = str | Literal[
+ExtractionSteps = str | Literal["initialization", "prepare_messages", "yield_first_token", "completion"]  # Steps are meant to not overlap
+BrowserCanvas = Literal["A3", "A4", "A5"]
 
 
 class ExtractionTimingStep(BaseModel):
@@ -36,8 +37,11 @@ class Extraction(BaseModel):
     id: str = Field(default_factory=lambda: "extr_" + nanoid.generate(), description="Unique identifier of the analysis")
     messages: list[ChatCompletionUiformMessage] = Field(default_factory=list)
     messages_gcs: str = Field(..., description="GCS path to the messages")
-
-
+    file_gcs_paths: list[str] = Field(..., description="GCS paths to the files")
+    file_ids: list[str] = Field(..., description="IDs of the files")
+    # Legacy fields for backward compatibility
+    file_gcs: str = Field(default="", description="GCS path to the first file (deprecated)")
+    file_id: str = Field(default="", description="ID of the first file (deprecated)")
 
     status: Literal["success", "failed"] = Field(..., description="Whether the analysis was successful")
     completion: UiParsedChatCompletion | ChatCompletion = Field(..., description="Response generated by the analysis")
@@ -46,7 +50,9 @@ class Extraction(BaseModel):
     temperature: float = Field(default=0.0, description="Temperature used for the analysis")
     source: ExtractionSource = Field(..., description="Source of the extraction")
     image_resolution_dpi: int = Field(default=96, description="Resolution of the image sent to the LLM")
-    browser_canvas:
+    browser_canvas: BrowserCanvas = Field(
+        default="A4", description="Sets the size of the browser canvas for rendering documents in browser-based processing. Choose a size that matches the document type."
+    )
     modality: Modality = Field(default="native", description="Modality of the extraction")
     reasoning_effort: Optional[ChatCompletionReasoningEffort] = Field(default=None, description="The effort level for the model to reason about the input data.")
     timings: list[ExtractionTimingStep] = Field(default_factory=list, description="Timings of the extraction")
@@ -60,7 +66,24 @@ class Extraction(BaseModel):
     validation_state: Optional[ValidationsState] = Field(default=None, description="Validation state of the extraction")
     billed: bool = Field(default=False, description="Whether the extraction has been billed or not")
 
-    @
+    @model_validator(mode="before")
+    def handle_legacy_fields(cls, data):
+        """Handle backward compatibility for legacy file_gcs and file_id fields."""
+        if isinstance(data, dict):
+            # If only legacy fields are provided, convert to new format
+            if "file_gcs" in data and "file_gcs_paths" not in data:
+                data["file_gcs_paths"] = [data["file_gcs"]]
+            if "file_id" in data and "file_ids" not in data:
+                data["file_ids"] = [data["file_id"]]
+
+            # Set legacy fields from new format for backward compatibility
+            if "file_gcs_paths" in data and data["file_gcs_paths"]:
+                data["file_gcs"] = data["file_gcs_paths"][0]
+            if "file_ids" in data and data["file_ids"]:
+                data["file_id"] = data["file_ids"][0]
+        return data
+
+    @computed_field  # type: ignore
     @property
     def api_cost(self) -> Optional[Amount]:
         if self.completion and self.completion.usage:
@@ -71,7 +94,7 @@ class Extraction(BaseModel):
             print(f"Error computing cost: {e}")
             return None
         return None
-
+
     @computed_field  # type: ignore
     @property
     def cost_breakdown(self) -> Optional[CostBreakdown]:
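The mode="before" validator above rewrites raw payloads before field validation, so documents stored with the old singular fields still load. A standalone sketch of that rewrite (kept free of Extraction itself, so no full ChatCompletion payload is needed to run it):

# Old-style payload carrying only the legacy singular fields:
data = {"file_gcs": "gs://bucket/doc.pdf", "file_id": "file_123"}

# Same logic as handle_legacy_fields:
if "file_gcs" in data and "file_gcs_paths" not in data:
    data["file_gcs_paths"] = [data["file_gcs"]]
if "file_id" in data and "file_ids" not in data:
    data["file_ids"] = [data["file_id"]]

assert data["file_gcs_paths"] == ["gs://bucket/doc.pdf"]
assert data["file_ids"] == ["file_123"]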
retab/types/inference_settings.py
ADDED
@@ -0,0 +1,15 @@
+from openai.types.chat.chat_completion_reasoning_effort import ChatCompletionReasoningEffort
+from pydantic import BaseModel, Field
+
+from .browser_canvas import BrowserCanvas
+from .modalities import Modality
+
+
+class InferenceSettings(BaseModel):
+    model: str = "gpt-4.1-mini"
+    temperature: float = 0.0
+    modality: Modality = "native"
+    reasoning_effort: ChatCompletionReasoningEffort = "medium"
+    image_resolution_dpi: int = 96
+    browser_canvas: BrowserCanvas = "A4"
+    n_consensus: int = Field(default=1, description="Number of consensus rounds to perform")
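A quick sketch: every field has a default, so callers override only what they need, and the model doubles as the default-settings object used throughout this release.

from retab.types.inference_settings import InferenceSettings  # assumed path

settings = InferenceSettings(model="gpt-4.1", n_consensus=3)
assert settings.browser_canvas == "A4" and settings.reasoning_effort == "medium"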
retab/types/jobs/base.py
ADDED
@@ -0,0 +1,54 @@
+from typing import Literal, Optional, Self
+
+from pydantic import BaseModel, model_validator
+from ..inference_settings import InferenceSettings
+
+SelectionMode = Literal["all", "manual"]
+
+
+# This is the input data for the prepare_dataset job
+class PrepareDatasetInputData(BaseModel):
+    dataset_id: Optional[str] = None
+    schema_id: Optional[str] = None
+    schema_data_id: Optional[str] = None
+
+    selection_model: SelectionMode = "all"
+
+    @model_validator(mode="after")
+    def validate_input(self) -> Self:
+        # The preference is:
+        # 1. dataset_id
+        # 2. schema_id
+        # 3. schema_data_id
+        if self.dataset_id is None and self.schema_id is None and self.schema_data_id is None:
+            raise ValueError("At least one of dataset_id, schema_id, or schema_data_id must be provided")
+
+        return self
+
+
+# This is the input data for the split_dataset job
+class DatasetSplitInputData(BaseModel):
+    dataset_id: str
+    train_size: Optional[int | float] = None
+    eval_size: Optional[int | float] = None
+
+    @model_validator(mode="after")
+    def validate_input(self) -> Self:
+        if self.train_size is not None and self.eval_size is not None:
+            raise ValueError("train_size and eval_size cannot both be provided")
+        return self
+
+
+# This is the input data for the batch annotation job
+class AnnotationInputData(BaseModel):
+    data_file: str
+    schema_id: str
+    inference_settings: InferenceSettings
+
+
+# This is the input data for the evaluation job
+class EvaluationInputData(BaseModel):
+    eval_data_file: str
+    schema_id: str
+    inference_settings_1: InferenceSettings | None = None
+    inference_settings_2: InferenceSettings
retab/types/jobs/batch_annotation.py
ADDED
@@ -0,0 +1,12 @@
+from typing import Optional
+
+from pydantic import BaseModel
+
+from ..inference_settings import InferenceSettings
+
+
+class AnnotationInputData(BaseModel):
+    dataset_id: str
+    files_ids: Optional[list[str]] = None
+    upsert: bool = False
+    inference_settings: InferenceSettings
{uiform → retab}/types/jobs/evaluation.py
RENAMED
@@ -1,7 +1,6 @@
 from pydantic import BaseModel
 
-from
-from .batch_annotation import AnnotationInputData, InferenceSettings
+from ..inference_settings import InferenceSettings
 
 # This job will generate two datasets from the original dataset, one with the first annotation and one with the second annotation
 # It will then evaluate the two datasets using the evaluation metrics and return an EvalMetrics object
{uiform → retab}/types/logs.py
RENAMED
@@ -1,22 +1,21 @@
-import copy
 import datetime
 import json
 from typing import Any, Dict, List, Literal, Optional
 
 import nanoid  # type: ignore
-from openai import
+from openai.types.chat.chat_completion import ChatCompletion
 from openai.types.chat.chat_completion_reasoning_effort import ChatCompletionReasoningEffort
-from pydantic import BaseModel, EmailStr, Field, HttpUrl, computed_field,
-from pydantic_core import Url
+from pydantic import BaseModel, EmailStr, Field, HttpUrl, computed_field, field_validator
 
-from .._utils.json_schema import
+from .._utils.json_schema import compute_schema_data_id
 from .._utils.mime import generate_blake2b_hash_from_string
-from .._utils.usage.usage import compute_cost_from_model, compute_cost_from_model_with_breakdown
+from .._utils.usage.usage import CostBreakdown, compute_cost_from_model, compute_cost_from_model_with_breakdown
 from .ai_models import Amount
 from .documents.extractions import UiParsedChatCompletion
 from .mime import BaseMIMEData
 from .modalities import Modality
 from .pagination import ListMetadata
+from .browser_canvas import BrowserCanvas
 
 
 class ProcessorConfig(BaseModel):
@@ -27,7 +26,9 @@ class ProcessorConfig(BaseModel):
 
     modality: Modality
     image_resolution_dpi: int = Field(default=96, description="Resolution of the image sent to the LLM")
-    browser_canvas:
+    browser_canvas: BrowserCanvas = Field(
+        default="A4", description="Sets the size of the browser canvas for rendering documents in browser-based processing. Choose a size that matches the document type."
+    )
 
     # New attributes
     model: str = Field(..., description="Model used for chat completion")
@@ -61,7 +62,11 @@ class ProcessorConfig(BaseModel):
 
 
 class AutomationConfig(BaseModel):
-
+    @computed_field
+    @property
+    def object(self) -> str:
+        return "automation"
+
     id: str = Field(default_factory=lambda: "auto_" + nanoid.generate(), description="Unique identifier for the automation")
     name: str = Field(..., description="Name of the automation")
     processor_id: str = Field(..., description="ID of the processor to use for the automation")
@@ -70,14 +75,16 @@ class AutomationConfig(BaseModel):
     default_language: str = Field(default="en", description="Default language for the automation")
 
     # HTTP Config
-    webhook_url:
+    webhook_url: str = Field(..., description="Url of the webhook to send the data to")
     webhook_headers: Dict[str, str] = Field(default_factory=dict, description="Headers to send with the request")
 
     need_validation: bool = Field(default=False, description="If the automation needs to be validated before running")
 
-    @
-    def
-
+    @field_validator("webhook_url", mode="after")
+    def validate_httpurl(cls, val: Any) -> Any:
+        if isinstance(val, str):
+            HttpUrl(val)
+        return val
 
 
 class UpdateProcessorRequest(BaseModel):
@@ -87,7 +94,7 @@ class UpdateProcessorRequest(BaseModel):
     name: Optional[str] = None
     modality: Optional[Modality] = None
     image_resolution_dpi: Optional[int] = None
-    browser_canvas: Optional[
+    browser_canvas: Optional[BrowserCanvas] = None
     model: Optional[str] = None
     json_schema: Optional[Dict] = None
     temperature: Optional[float] = None
@@ -121,24 +128,24 @@ class UpdateProcessorRequest(BaseModel):
 
 class UpdateAutomationRequest(BaseModel):
     name: Optional[str] = None
-    processor_id: Optional[str] = None
+    # processor_id: Optional[str] = None  # TODO: Is it allowed to change the processor_id?
 
     default_language: Optional[str] = None
 
-    webhook_url: Optional[
-    webhook_headers: Optional[
-
-    need_validation: Optional[bool] = None
+    webhook_url: Optional[str] = None
+    webhook_headers: Optional[dict[str, str]] = None
 
+    need_validation: Optional[bool] = None
 
-    @
-    def
-    if isinstance(val,
-
+    @field_validator("webhook_url", mode="after")
+    def validate_httpurl(cls, val: Any) -> Any:
+        if isinstance(val, str):
+            HttpUrl(val)
         return val
 
+
 class OpenAIRequestConfig(BaseModel):
-    object: Literal[
+    object: Literal["openai_request"] = "openai_request"
     id: str = Field(default_factory=lambda: "openai_req_" + nanoid.generate(), description="Unique identifier for the openai request")
     model: str
     json_schema: dict[str, Any]
@@ -160,7 +167,7 @@ class OpenAIRequestConfig(BaseModel):
 
 
 class ExternalRequestLog(BaseModel):
-    webhook_url: Optional[
+    webhook_url: Optional[str]
     request_body: dict[str, Any]
     request_headers: dict[str, str]
     request_at: datetime.datetime
@@ -173,24 +180,20 @@ class ExternalRequestLog(BaseModel):
     error: Optional[str] = None
     duration_ms: float
 
-    @
-    def
-    if isinstance(val,
-
+    @field_validator("webhook_url", mode="after")
+    def validate_httpurl(cls, val: Any) -> Any:
+        if isinstance(val, str):
+            HttpUrl(val)
         return val
 
 
-from openai.types.chat import completion_create_params
-from openai.types.chat.chat_completion import ChatCompletion
-
-
 class LogCompletionRequest(BaseModel):
     json_schema: dict[str, Any]
     completion: ChatCompletion
 
 
 class AutomationLog(BaseModel):
-    object: Literal[
+    object: Literal["automation_log"] = "automation_log"
     id: str = Field(default_factory=lambda: "log_auto_" + nanoid.generate(), description="Unique identifier for the automation log")
     user_email: Optional[EmailStr]  # When the user is logged or when he forwards an email
     organization_id: str
@@ -212,7 +215,7 @@ class AutomationLog(BaseModel):
             print(f"Error computing cost: {e}")
             return None
         return None
-
+
     @computed_field  # type: ignore
     @property
     def cost_breakdown(self) -> Optional[CostBreakdown]:
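The field_validator pattern now repeated three times in this file, shown in isolation (a sketch assuming pydantic v2): HttpUrl(val) is constructed purely for its validation side effect, so the stored value stays a plain str instead of a pydantic_core.Url.

from typing import Any

from pydantic import BaseModel, HttpUrl, field_validator


class WebhookModel(BaseModel):
    webhook_url: str

    @field_validator("webhook_url", mode="after")
    def validate_httpurl(cls, val: Any) -> Any:
        if isinstance(val, str):
            HttpUrl(val)  # raises on an invalid URL; the constructed Url is discarded
        return val


m = WebhookModel(webhook_url="https://example.com/hook")
assert isinstance(m.webhook_url, str)  # no Url coercion in the model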
retab/types/metrics.py
ADDED
@@ -0,0 +1,32 @@
+from typing import Any, Literal, Optional
+from pydantic import BaseModel, Field
+
+
+# Define the type alias for MetricType
+MetricType = Literal["levenshtein", "jaccard", "hamming"]
+
+
+# Define the structure for an individual item metric
+class ItemMetric(BaseModel):
+    id: str = Field(description="The ID of the item being measured")
+    name: str = Field(description="The name of the item being measured")
+    similarity: float = Field(description="The similarity score between 0 and 1")
+    similarities: dict[str, Any] = Field(description="The similarity scores for each item in the list")
+    flat_similarities: dict[str, Optional[float]] = Field(description="The similarity scores for each item in the list in dot notation format")
+    aligned_similarity: float = Field(description="The similarity score between 0 and 1, after alignment")
+    aligned_similarities: dict[str, Any] = Field(description="The similarity scores for each item in the list, after alignment")
+    aligned_flat_similarities: dict[str, Optional[float]] = Field(description="The similarity scores for each item in the list in dot notation format, after alignment")
+
+
+# Define the main MetricResult model
+class MetricResult(BaseModel):
+    item_metrics: list[ItemMetric] = Field(description="List of similarity metrics for individual items")
+    mean_similarity: float = Field(description="The average similarity score across all items")
+    aligned_mean_similarity: float = Field(description="The average similarity score across all items, after alignment")
+    metric_type: MetricType = Field(description="The type of similarity metric used for comparison")
+
+
+class DistancesResult(BaseModel):
+    distances: dict[str, Any] = Field(description="List of distances for individual items")
+    mean_distance: float = Field(description="The average distance across all items")
+    metric_type: MetricType = Field(description="The type of distance metric used for comparison")