retab-0.0.42-py3-none-any.whl → retab-0.0.44-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. retab/__init__.py +2 -1
  2. retab/client.py +26 -51
  3. retab/generate_types.py +180 -0
  4. retab/resources/consensus/client.py +1 -1
  5. retab/resources/consensus/responses.py +1 -1
  6. retab/resources/deployments/__init__.py +3 -0
  7. retab/resources/deployments/automations/__init__.py +9 -0
  8. retab/resources/deployments/automations/client.py +244 -0
  9. retab/resources/deployments/automations/endpoints.py +290 -0
  10. retab/resources/deployments/automations/links.py +303 -0
  11. retab/resources/deployments/automations/logs.py +222 -0
  12. retab/resources/deployments/automations/mailboxes.py +423 -0
  13. retab/resources/deployments/automations/outlook.py +377 -0
  14. retab/resources/deployments/automations/tests.py +161 -0
  15. retab/resources/deployments/client.py +148 -0
  16. retab/resources/documents/client.py +94 -68
  17. retab/resources/documents/extractions.py +55 -46
  18. retab/resources/evaluations/__init__.py +2 -2
  19. retab/resources/evaluations/client.py +61 -77
  20. retab/resources/evaluations/documents.py +48 -37
  21. retab/resources/evaluations/iterations.py +58 -40
  22. retab/resources/jsonlUtils.py +3 -4
  23. retab/resources/processors/automations/endpoints.py +49 -39
  24. retab/resources/processors/automations/links.py +52 -43
  25. retab/resources/processors/automations/mailboxes.py +74 -59
  26. retab/resources/processors/automations/outlook.py +104 -82
  27. retab/resources/processors/client.py +35 -30
  28. retab/resources/projects/__init__.py +3 -0
  29. retab/resources/projects/client.py +285 -0
  30. retab/resources/projects/documents.py +244 -0
  31. retab/resources/projects/iterations.py +470 -0
  32. retab/resources/usage.py +2 -0
  33. retab/types/ai_models.py +2 -1
  34. retab/types/deprecated_evals.py +195 -0
  35. retab/types/evaluations/__init__.py +5 -2
  36. retab/types/evaluations/iterations.py +9 -43
  37. retab/types/evaluations/model.py +19 -24
  38. retab/types/extractions.py +1 -0
  39. retab/types/jobs/base.py +1 -1
  40. retab/types/jobs/evaluation.py +1 -1
  41. retab/types/logs.py +5 -6
  42. retab/types/mime.py +1 -10
  43. retab/types/projects/__init__.py +34 -0
  44. retab/types/projects/documents.py +30 -0
  45. retab/types/projects/iterations.py +78 -0
  46. retab/types/projects/model.py +68 -0
  47. retab/types/schemas/enhance.py +22 -5
  48. retab/types/schemas/evaluate.py +2 -2
  49. retab/types/schemas/object.py +27 -25
  50. retab/types/standards.py +2 -2
  51. retab/utils/__init__.py +3 -0
  52. retab/utils/ai_models.py +127 -12
  53. retab/utils/hashing.py +24 -0
  54. retab/utils/json_schema.py +1 -26
  55. retab/utils/mime.py +0 -17
  56. retab/utils/usage/usage.py +0 -1
  57. {retab-0.0.42.dist-info → retab-0.0.44.dist-info}/METADATA +4 -6
  58. {retab-0.0.42.dist-info → retab-0.0.44.dist-info}/RECORD +60 -55
  59. retab/_utils/__init__.py +0 -0
  60. retab/_utils/_model_cards/anthropic.yaml +0 -59
  61. retab/_utils/_model_cards/auto.yaml +0 -43
  62. retab/_utils/_model_cards/gemini.yaml +0 -117
  63. retab/_utils/_model_cards/openai.yaml +0 -301
  64. retab/_utils/_model_cards/xai.yaml +0 -28
  65. retab/_utils/ai_models.py +0 -138
  66. retab/_utils/benchmarking.py +0 -484
  67. retab/_utils/chat.py +0 -327
  68. retab/_utils/display.py +0 -440
  69. retab/_utils/json_schema.py +0 -2156
  70. retab/_utils/mime.py +0 -165
  71. retab/_utils/responses.py +0 -169
  72. retab/_utils/stream_context_managers.py +0 -52
  73. retab/_utils/usage/__init__.py +0 -0
  74. retab/_utils/usage/usage.py +0 -301
  75. {retab-0.0.42.dist-info → retab-0.0.44.dist-info}/WHEEL +0 -0
  76. {retab-0.0.42.dist-info → retab-0.0.44.dist-info}/top_level.txt +0 -0
retab/types/deprecated_evals.py ADDED
@@ -0,0 +1,195 @@
+ import datetime
+ from typing import Any, List, Literal, Optional
+
+ import nanoid # type: ignore
+ from pydantic import BaseModel, Field, computed_field
+
+ from ..utils.json_schema import generate_schema_data_id, generate_schema_id
+ from .ai_models import Amount
+ from .inference_settings import InferenceSettings
+ from .mime import MIMEData
+
+ # Define the type alias for MetricType
+ MetricType = Literal["levenshtein", "jaccard", "hamming"]
+
+
+ # Define the structure for an individual item metric
+ class ItemMetric(BaseModel):
+     id: str = Field(description="The ID of the item being measured")
+     name: str = Field(description="The name of the item being measured")
+     similarity: float = Field(description="The similarity score between 0 and 1")
+     similarities: dict[str, Any] = Field(description="The similarity scores for each item in the list")
+     flat_similarities: dict[str, Optional[float]] = Field(description="The similarity scores for each item in the list in dot notation format")
+     aligned_similarity: float = Field(description="The similarity score between 0 and 1, after alignment")
+     aligned_similarities: dict[str, Any] = Field(description="The similarity scores for each item in the list, after alignment")
+     aligned_flat_similarities: dict[str, Optional[float]] = Field(description="The similarity scores for each item in the list in dot notation format, after alignment")
+
+
+ # Define the main MetricResult model
+ class MetricResult(BaseModel):
+     item_metrics: List[ItemMetric] = Field(description="List of similarity metrics for individual items")
+     mean_similarity: float = Field(description="The average similarity score across all items")
+     aligned_mean_similarity: float = Field(description="The average similarity score across all items, after alignment")
+     metric_type: MetricType = Field(description="The type of similarity metric used for comparison")
+
+
+ class DistancesResult(BaseModel):
+     distances: dict[str, Any] = Field(description="List of distances for individual items")
+     mean_distance: float = Field(description="The average distance across all items")
+     metric_type: MetricType = Field(description="The type of distance metric used for comparison")
+
+
+ class PredictionMetadata(BaseModel):
+     extraction_id: Optional[str] = Field(default=None, description="The ID of the extraction")
+     likelihoods: Optional[dict[str, Any]] = Field(default=None, description="The likelihoods of the extraction")
+     field_locations: Optional[dict[str, Any]] = Field(default=None, description="The field locations of the extraction")
+     agentic_field_locations: Optional[dict[str, Any]] = Field(default=None, description="The field locations of the extraction extracted by an llm")
+     consensus_details: Optional[list[dict[str, Any]]] = Field(default=None, description="The consensus details of the extraction")
+     api_cost: Optional[Amount] = Field(default=None, description="The cost of the API call for this document (if any -- ground truth for example)")
+
+
+ class PredictionData(BaseModel):
+     prediction: dict[str, Any] = Field(default={}, description="The result of the extraction or manual annotation")
+     metadata: Optional[PredictionMetadata] = Field(default=None, description="The metadata of the prediction")
+
+
+ class Iteration(BaseModel):
+     id: str = Field(default_factory=lambda: "eval_iter_" + nanoid.generate())
+     inference_settings: InferenceSettings
+     json_schema: dict[str, Any]
+     predictions: list[PredictionData] = Field(default_factory=list, description="The predictions of the iteration for all the documents")
+     metric_results: Optional[MetricResult] = Field(default=None, description="The metric results of the iteration")
+
+     @computed_field # type: ignore
+     @property
+     def schema_data_id(self) -> str:
+         """Returns the SHA1 hash of the schema data, ignoring all prompt/description/default fields.
+
+         Returns:
+             str: A SHA1 hash string representing the schema data version.
+         """
+         return generate_schema_data_id(self.json_schema)
+
+     # This is a computed field, it is exposed when serializing the object
+     @computed_field # type: ignore
+     @property
+     def schema_id(self) -> str:
+         """Returns the SHA1 hash of the complete schema.
+
+         Returns:
+             str: A SHA1 hash string representing the complete schema version.
+         """
+         return generate_schema_id(self.json_schema)
+
+
+ class AnnotatedDocument(BaseModel):
+     mime_data: MIMEData = Field(
+         description="The mime data of the document. Can also be a BaseMIMEData, which is why we have this id field (to be able to identify the file, but id is equal to mime_data.id)"
+     )
+     annotation: dict[str, Any] = Field(default={}, description="The ground truth of the document")
+
+
+ class DocumentItem(AnnotatedDocument):
+     annotation_metadata: Optional[PredictionMetadata] = Field(default=None, description="The metadata of the annotation when the annotation is a prediction")
+
+
+ class ProjectDocument(DocumentItem):
+     id: str = Field(description="The ID of the document. Equal to mime_data.id but robust to the case where mime_data is a BaseMIMEData")
+
+
+ class CreateIterationRequest(BaseModel):
+     """
+     Request model for performing a new iteration with custom inference settings and optional JSON schema.
+     """
+
+     inference_settings: InferenceSettings
+     json_schema: Optional[dict[str, Any]] = None
+
+
+ class UpdateProjectDocumentRequest(BaseModel):
+     annotation: Optional[dict[str, Any]] = Field(default=None, description="The ground truth of the document")
+     annotation_metadata: Optional[PredictionMetadata] = Field(default=None, description="The metadata of the annotation when the annotation is a prediction")
+
+
+ class UpdateProjectRequest(BaseModel):
+     name: Optional[str] = Field(default=None, description="The name of the document")
+     documents: Optional[list[ProjectDocument]] = Field(default=None, description="The documents of the evaluation")
+     iterations: Optional[list[Iteration]] = Field(default=None, description="The iterations of the evaluation")
+     json_schema: Optional[dict[str, Any]] = Field(default=None, description="The json schema of the evaluation")
+
+     project_id: Optional[str] = Field(default=None, description="The ID of the project")
+     default_inference_settings: Optional[InferenceSettings] = Field(default=None, description="The default inference properties for the evaluation (mostly used in the frontend)")
+
+     @computed_field # type: ignore
+     @property
+     def schema_data_id(self) -> Optional[str]:
+         """Returns the SHA1 hash of the schema data, ignoring all prompt/description/default fields.
+
+         Returns:
+             str: A SHA1 hash string representing the schema data version.
+         """
+         if self.json_schema is None:
+             return None
+
+         return generate_schema_data_id(self.json_schema)
+
+     # This is a computed field, it is exposed when serializing the object
+     @computed_field # type: ignore
+     @property
+     def schema_id(self) -> Optional[str]:
+         """Returns the SHA1 hash of the complete schema.
+
+         Returns:
+             str: A SHA1 hash string representing the complete schema version.
+         """
+         if self.json_schema is None:
+             return None
+         return generate_schema_id(self.json_schema)
+
+
+ class Project(BaseModel):
+     id: str = Field(default_factory=lambda: "eval_" + nanoid.generate())
+     updated_at: datetime.datetime = Field(default_factory=lambda: datetime.datetime.now(tz=datetime.timezone.utc))
+
+     name: str
+     old_documents: list[ProjectDocument] | None = None
+     documents: list[ProjectDocument]
+     iterations: list[Iteration]
+     json_schema: dict[str, Any]
+
+     project_id: str = Field(description="The ID of the project", default="default_spreadsheets")
+     default_inference_settings: Optional[InferenceSettings] = Field(default=None, description="The default inference properties for the evaluation (mostly used in the frontend)")
+
+     # @field_validator('iterations')
+     # def validate_iterations_content_length(cls: Any, v: list[Iteration], values: Any) -> list[Iteration]:
+     #     if 'ground_truth' in values:
+     #         ground_truth_length = len(values['ground_truth'])
+     #         for iteration in v:
+     #             if len(iteration.content) != ground_truth_length:
+     #                 raise ValueError(f"Iteration content length must match ground_truth length ({ground_truth_length})")
+     #     return v
+
+     @computed_field # type: ignore
+     @property
+     def schema_data_id(self) -> str:
+         """Returns the SHA1 hash of the schema data, ignoring all prompt/description/default fields.
+
+         Returns:
+             str: A SHA1 hash string representing the schema data version.
+         """
+         return generate_schema_data_id(self.json_schema)
+
+     # This is a computed field, it is exposed when serializing the object
+     @computed_field # type: ignore
+     @property
+     def schema_id(self) -> str:
+         """Returns the SHA1 hash of the complete schema.
+
+         Returns:
+             str: A SHA1 hash string representing the complete schema version.
+         """
+         return generate_schema_id(self.json_schema)
+
+
+ class AddIterationFromJsonlRequest(BaseModel):
+     jsonl_gcs_path: str
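Note on the new schema-ID helpers: this file (and several below) calls generate_schema_data_id and generate_schema_id from retab/utils/json_schema, replacing inline blake2b hashing that this release deletes from evaluations/iterations.py, evaluations/model.py, and logs.py. (The docstrings say "SHA1 hash", but the removed implementations hash with blake2b.) A minimal sketch of what the helpers presumably do, reconstructed from that removed inline code; the actual 0.0.44 implementation may differ:

import copy
import hashlib
import json
from typing import Any

from retab.utils.json_schema import clean_schema  # existing retab helper, imported by the removed 0.0.42 code


def _blake2b_8(s: str) -> str:
    # Same digest parameters as the generate_blake2b_hash_* helpers removed from types/mime.py.
    return hashlib.blake2b(s.encode("utf-8"), digest_size=8).hexdigest()


def generate_schema_id(json_schema: dict[str, Any]) -> str:
    # Hash of the complete schema, canonicalized by sorting keys.
    return "sch_id_" + _blake2b_8(json.dumps(json_schema, sort_keys=True).strip())


def generate_schema_data_id(json_schema: dict[str, Any]) -> str:
    # Hash of the schema with prompt/description/default-style fields stripped,
    # so purely cosmetic schema edits do not change the data id.
    cleaned = clean_schema(
        copy.deepcopy(json_schema),
        remove_custom_fields=True,
        fields_to_remove=["description", "default", "title", "required", "examples", "deprecated", "readOnly", "writeOnly"],
    )
    return "sch_data_id_" + _blake2b_8(json.dumps(cleaned, sort_keys=True).strip())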
retab/types/evaluations/__init__.py CHANGED
@@ -1,6 +1,7 @@
- from .model import Evaluation, CreateEvaluation, PatchEvaluationRequest, ListEvaluationParams
+ from .model import Evaluation, BaseEvaluation, CreateEvaluationRequest, PatchEvaluationRequest, ListEvaluationParams
  from .documents import AnnotatedDocument, DocumentItem, EvaluationDocument, CreateEvaluationDocumentRequest, PatchEvaluationDocumentRequest
  from .iterations import (
+     BaseIteration,
      Iteration,
      CreateIterationRequest,
      PatchIterationRequest,
@@ -13,7 +14,8 @@ from .iterations import (

  __all__ = [
      "Evaluation",
-     "CreateEvaluation",
+     "BaseEvaluation",
+     "CreateEvaluationRequest",
      "PatchEvaluationRequest",
      "ListEvaluationParams",
      "AnnotatedDocument",
@@ -21,6 +23,7 @@ __all__ = [
      "EvaluationDocument",
      "CreateEvaluationDocumentRequest",
      "PatchEvaluationDocumentRequest",
+     "BaseIteration",
      "Iteration",
      "CreateIterationRequest",
      "PatchIterationRequest",
retab/types/evaluations/iterations.py CHANGED
@@ -1,59 +1,24 @@
- import copy
  import datetime
- import json
  from typing import Any, Optional, Self

  import nanoid # type: ignore
- from pydantic import BaseModel, Field, computed_field, model_validator
+ from pydantic import BaseModel, Field, model_validator

- from ...utils.json_schema import clean_schema
- from ...utils.mime import generate_blake2b_hash_from_string
  from ..inference_settings import InferenceSettings
- from ..metrics import MetricResult
  from ..predictions import PredictionData


- class Iteration(BaseModel):
+ class BaseIteration(BaseModel):
      id: str = Field(default_factory=lambda: "eval_iter_" + nanoid.generate())
+     inference_settings: InferenceSettings
+     json_schema: dict[str, Any]
      updated_at: datetime.datetime = Field(
          default_factory=lambda: datetime.datetime.now(tz=datetime.timezone.utc),
          description="The last update date of inference settings or json schema",
      )
-     inference_settings: InferenceSettings
-     json_schema: dict[str, Any]
-     predictions: dict[str, PredictionData] = Field(default_factory=dict, description="The predictions of the iteration for all the documents")
-     metric_results: Optional[MetricResult] = Field(default=None, description="The metric results of the iteration")
-
-     @computed_field # type: ignore
-     @property
-     def schema_data_id(self) -> str:
-         """Returns the SHA1 hash of the schema data, ignoring all prompt/description/default fields.
-
-         Returns:
-             str: A SHA1 hash string representing the schema data version.
-         """
-         return "sch_data_id_" + generate_blake2b_hash_from_string(
-             json.dumps(
-                 clean_schema(
-                     copy.deepcopy(self.json_schema),
-                     remove_custom_fields=True,
-                     fields_to_remove=["description", "default", "title", "required", "examples", "deprecated", "readOnly", "writeOnly"],
-                 ),
-                 sort_keys=True,
-             ).strip()
-         )
-
-     # This is a computed field, it is exposed when serializing the object
-     @computed_field # type: ignore
-     @property
-     def schema_id(self) -> str:
-         """Returns the SHA1 hash of the complete schema.
-
-         Returns:
-             str: A SHA1 hash string representing the complete schema version.
-         """
-         return "sch_id_" + generate_blake2b_hash_from_string(json.dumps(self.json_schema, sort_keys=True).strip())

+ class Iteration(BaseIteration):
+     predictions: dict[str, PredictionData] = Field(default_factory=dict, description="The predictions of the iteration for all the documents")


  class CreateIterationRequest(BaseModel):
      """
@@ -71,13 +36,14 @@ class CreateIterationRequest(BaseModel):
      @model_validator(mode="after")
      def validate_one_of_from_iteration_id_or_json_schema(self) -> Self:
          if (self.from_iteration_id is None) ^ (self.json_schema is None):
-             raise ValueError("Exactly one of from_iteration_id or json_schema must be provided")
-         return self
+             return self
+         raise ValueError("Exactly one of from_iteration_id or json_schema must be provided")


  class PatchIterationRequest(BaseModel):
      inference_settings: Optional[InferenceSettings] = Field(default=None, description="The new inference settings of the iteration")
      json_schema: Optional[dict[str, Any]] = Field(default=None, description="The new json schema of the iteration")
+     version: Optional[int] = Field(default=None, description="Current version for optimistic locking")


  class ProcessIterationRequest(BaseModel):
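The validator change above is a behavior fix, not just a refactor: the 0.0.42 code raised exactly when the XOR held, i.e. when exactly one of from_iteration_id / json_schema was set, which is the opposite of the rule its error message states. 0.0.44 swaps the branches. A quick sanity check of the XOR rule (placeholder values, not retab API calls):

cases = [(None, None), ("iter_123", None), (None, {"type": "object"}), ("iter_123", {"type": "object"})]
for from_iteration_id, json_schema in cases:
    exactly_one = (from_iteration_id is None) ^ (json_schema is None)
    # 0.0.44 behavior: validation passes when exactly one is provided, raises otherwise.
    print(f"{from_iteration_id!r:12} {json_schema!r:20} -> {'ok' if exactly_one else 'ValueError'}")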
retab/types/evaluations/model.py CHANGED
@@ -1,31 +1,27 @@
  import datetime
- import json
  from typing import Any, Optional

  import nanoid # type: ignore
  from pydantic import BaseModel, Field, computed_field

- from ...utils.json_schema import compute_schema_data_id
- from ...utils.mime import generate_blake2b_hash_from_string
+ from ...utils.json_schema import generate_schema_data_id, generate_schema_id
  from ..inference_settings import InferenceSettings
  from .documents import EvaluationDocument
  from .iterations import Iteration


- # Actual Object stored in DB
- class Evaluation(BaseModel):
-     id: str = Field(default_factory=lambda: "eval_" + nanoid.generate())
+ class BaseEvaluation(BaseModel):
+     id: str = Field(default_factory=lambda: "proj_" + nanoid.generate())
+     name: str = Field(default="", description="The name of the evaluation")
+     json_schema: dict[str, Any] = Field(default_factory=dict, description="The json schema of the evaluation")
+     default_inference_settings: InferenceSettings = Field(default=InferenceSettings(), description="The default inference properties for the evaluation.")
      updated_at: datetime.datetime = Field(default_factory=lambda: datetime.datetime.now(tz=datetime.timezone.utc))

-     name: str
+
+ # Actual Object stored in DB
+ class Evaluation(BaseEvaluation):
      documents: list[EvaluationDocument] = Field(default_factory=list)
      iterations: list[Iteration] = Field(default_factory=list)
-     json_schema: dict[str, Any]
-
-     project_id: str = Field(description="The ID of the project", default="default_spreadsheets")
-     default_inference_settings: InferenceSettings = Field(
-         default=InferenceSettings(), description="The default inference properties for the evaluation (mostly used in the frontend)"
-     )

      @computed_field # type: ignore
      @property
@@ -35,7 +31,7 @@ class Evaluation(BaseModel):
          Returns:
              str: A SHA1 hash string representing the schema data version.
          """
-         return compute_schema_data_id(self.json_schema)
+         return generate_schema_data_id(self.json_schema)

      # This is a computed field, it is exposed when serializing the object
      @computed_field # type: ignore
@@ -46,26 +42,25 @@ class Evaluation(BaseModel):
          Returns:
              str: A SHA1 hash string representing the complete schema version.
          """
-         return "sch_id_" + generate_blake2b_hash_from_string(json.dumps(self.json_schema, sort_keys=True).strip())
-
-
- class CreateEvaluation(BaseModel):
-     name: str
-     json_schema: dict[str, Any]
-     project_id: str = Field(description="The ID of the project", default="default_spreadsheets")
-     default_inference_settings: InferenceSettings = Field(default=InferenceSettings(), description="The default inference properties for the evaluation.")
+         return generate_schema_id(self.json_schema)


  class ListEvaluationParams(BaseModel):
-     project_id: Optional[str] = Field(default=None, description="The ID of the project")
      schema_id: Optional[str] = Field(default=None, description="The ID of the schema")
      schema_data_id: Optional[str] = Field(default=None, description="The ID of the schema data")


+ class CreateEvaluationRequest(BaseModel):
+     name: str
+     json_schema: dict[str, Any]
+     default_inference_settings: InferenceSettings
+
+
+ # This is basically the same as BaseEvaluation, but everything is optional.
+ # Could be achieved by convert_basemodel_to_partial_basemodel(BaseEvaluation) but we prefer explicitness
  class PatchEvaluationRequest(BaseModel):
      name: Optional[str] = Field(default=None, description="The name of the document")
      json_schema: Optional[dict[str, Any]] = Field(default=None, description="The json schema of the evaluation")
-     project_id: Optional[str] = Field(default=None, description="The ID of the project")
      default_inference_settings: Optional[InferenceSettings] = Field(default=None, description="The default inference properties for the evaluation (mostly used in the frontend)")

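For callers, the CreateEvaluation → CreateEvaluationRequest rename also changes the required fields: project_id disappears and default_inference_settings loses its default. A hypothetical call-site migration (names and values are placeholders; import paths inferred from the relative imports and __init__ exports shown above):

from retab.types.evaluations import CreateEvaluationRequest
from retab.types.inference_settings import InferenceSettings

schema = {"type": "object", "properties": {"total": {"type": "number"}}}

# 0.0.42: CreateEvaluation(name="invoices", json_schema=schema, project_id="default_spreadsheets")
# 0.0.44: project_id is gone and default_inference_settings must now be passed explicitly.
request = CreateEvaluationRequest(name="invoices", json_schema=schema, default_inference_settings=InferenceSettings())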
retab/types/extractions.py CHANGED
@@ -77,6 +77,7 @@ class Extraction(BaseModel):
          default=None,
          description="The effort level for the model to reason about the input data.",
      )
+     n_consensus: int = Field(default=1, description="Number of consensus models used for the extraction")
      timings: list[ExtractionTimingStep] = Field(default_factory=list, description="Timings of the extraction")

      # Infered from the schema
retab/types/jobs/base.py CHANGED
@@ -47,7 +47,7 @@ class AnnotationInputData(BaseModel):


  # This is the input data for the evaluation job
- class EvaluationInputData(BaseModel):
+ class ProjectInputData(BaseModel):
      eval_data_file: str
      schema_id: str
      inference_settings_1: InferenceSettings | None = None
retab/types/jobs/evaluation.py CHANGED
@@ -6,7 +6,7 @@ from ..inference_settings import InferenceSettings
  # It will then evaluate the two datasets using the evaluation metrics and return an EvalMetrics object


- class EvaluationInputData(BaseModel):
+ class ProjectInputData(BaseModel):
      original_dataset_id: str
      schema_id: str
      schema_data_id: str
retab/types/logs.py CHANGED
@@ -7,8 +7,7 @@ from openai.types.chat.chat_completion import ChatCompletion
  from openai.types.chat.chat_completion_reasoning_effort import ChatCompletionReasoningEffort
  from pydantic import BaseModel, EmailStr, Field, HttpUrl, computed_field, field_validator

- from ..utils.json_schema import compute_schema_data_id
- from ..utils.mime import generate_blake2b_hash_from_string
+ from ..utils.json_schema import generate_schema_data_id, generate_schema_id
  from ..utils.usage.usage import CostBreakdown, compute_cost_from_model, compute_cost_from_model_with_breakdown
  from .ai_models import Amount
  from .documents.extractions import RetabParsedChatCompletion
@@ -47,7 +46,7 @@ class ProcessorConfig(BaseModel):
          Returns:
              str: A SHA1 hash string representing the schema data version.
          """
-         return compute_schema_data_id(self.json_schema)
+         return generate_schema_data_id(self.json_schema)

      # This is a computed field, it is exposed when serializing the object
      @computed_field # type: ignore
@@ -58,7 +57,7 @@ class ProcessorConfig(BaseModel):
          Returns:
              str: A SHA1 hash string representing the complete schema version.
          """
-         return "sch_id_" + generate_blake2b_hash_from_string(json.dumps(self.json_schema, sort_keys=True).strip())
+         return generate_schema_id(self.json_schema)


  class AutomationConfig(BaseModel):
@@ -111,7 +110,7 @@ class UpdateProcessorRequest(BaseModel):
          """
          if self.json_schema is None:
              return None
-         return compute_schema_data_id(self.json_schema)
+         return generate_schema_data_id(self.json_schema)

      @computed_field # type: ignore
      @property
@@ -123,7 +122,7 @@ class UpdateProcessorRequest(BaseModel):
          """
          if self.json_schema is None:
              return None
-         return "sch_id_" + generate_blake2b_hash_from_string(json.dumps(self.json_schema, sort_keys=True).strip())
+         return generate_schema_id(self.json_schema)


  class UpdateAutomationRequest(BaseModel):
retab/types/mime.py CHANGED
@@ -1,21 +1,12 @@
  import base64
  import datetime
  import gzip
- import hashlib
  import mimetypes
  import re
  from typing import Any, Optional, Self, Sequence

  from pydantic import BaseModel, Field, field_validator
-
-
- def generate_blake2b_hash_from_bytes(bytes_: bytes) -> str:
-     return hashlib.blake2b(bytes_, digest_size=8).hexdigest()
-
-
- def generate_blake2b_hash_from_base64(base64_string: str) -> str:
-     return generate_blake2b_hash_from_bytes(base64.b64decode(base64_string))
-
+ from ..utils.hashing import generate_blake2b_hash_from_base64

  # **** OCR DATACLASSES (DocumentAI-compatible) ****
  class Point(BaseModel):
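The two helpers removed here, plus the string variant that evaluations/iterations.py and logs.py previously imported from utils.mime, are consolidated into the new retab/utils/hashing.py (+24 lines in the file list above). A plausible sketch of that module, inferred from the deleted code rather than copied from the 0.0.44 file:

import base64
import hashlib


def generate_blake2b_hash_from_bytes(bytes_: bytes) -> str:
    # 8-byte blake2b digest, as in the function removed from types/mime.py.
    return hashlib.blake2b(bytes_, digest_size=8).hexdigest()


def generate_blake2b_hash_from_string(input_string: str) -> str:
    return generate_blake2b_hash_from_bytes(input_string.encode("utf-8"))


def generate_blake2b_hash_from_base64(base64_string: str) -> str:
    return generate_blake2b_hash_from_bytes(base64.b64decode(base64_string))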
retab/types/projects/__init__.py ADDED
@@ -0,0 +1,34 @@
+ from .model import Project, BaseProject, CreateProjectRequest, PatchProjectRequest, ListProjectParams
+ from .documents import AnnotatedDocument, DocumentItem, ProjectDocument, CreateProjectDocumentRequest, PatchProjectDocumentRequest
+ from .iterations import (
+     BaseIteration,
+     Iteration,
+     CreateIterationRequest,
+     PatchIterationRequest,
+     ProcessIterationRequest,
+     DocumentStatus,
+     IterationDocumentStatusResponse,
+     AddIterationFromJsonlRequest,
+ )
+
+
+ __all__ = [
+     "Project",
+     "BaseProject",
+     "CreateProjectRequest",
+     "PatchProjectRequest",
+     "ListProjectParams",
+     "AnnotatedDocument",
+     "DocumentItem",
+     "ProjectDocument",
+     "CreateProjectDocumentRequest",
+     "PatchProjectDocumentRequest",
+     "BaseIteration",
+     "Iteration",
+     "CreateIterationRequest",
+     "PatchIterationRequest",
+     "ProcessIterationRequest",
+     "DocumentStatus",
+     "IterationDocumentStatusResponse",
+     "AddIterationFromJsonlRequest",
+ ]
retab/types/projects/documents.py ADDED
@@ -0,0 +1,30 @@
+ from typing import Any, Optional
+
+ from pydantic import BaseModel, Field
+
+ from ..mime import MIMEData
+ from ..predictions import PredictionMetadata
+
+
+ class AnnotatedDocument(BaseModel):
+     mime_data: MIMEData = Field(
+         description="The mime data of the document. Can also be a BaseMIMEData, which is why we have this id field (to be able to identify the file, but id is equal to mime_data.id)"
+     )
+     annotation: dict[str, Any] = Field(default={}, description="The ground truth of the document")
+
+
+ class DocumentItem(AnnotatedDocument):
+     annotation_metadata: Optional[PredictionMetadata] = Field(default=None, description="The metadata of the annotation when the annotation is a prediction")
+
+
+ class ProjectDocument(DocumentItem):
+     id: str = Field(description="The ID of the document. Equal to mime_data.id but robust to the case where mime_data is a BaseMIMEData")
+
+
+ class CreateProjectDocumentRequest(DocumentItem):
+     pass
+
+
+ class PatchProjectDocumentRequest(BaseModel):
+     annotation: Optional[dict[str, Any]] = Field(default=None, description="The ground truth of the document")
+     annotation_metadata: Optional[PredictionMetadata] = Field(default=None, description="The metadata of the annotation when the annotation is a prediction")
retab/types/projects/iterations.py ADDED
@@ -0,0 +1,78 @@
+ import datetime
+ from typing import Any, Optional, Self
+
+ import nanoid # type: ignore
+ from pydantic import BaseModel, Field, model_validator
+
+ from ..inference_settings import InferenceSettings
+ from ..predictions import PredictionData
+
+
+ class BaseIteration(BaseModel):
+     id: str = Field(default_factory=lambda: "eval_iter_" + nanoid.generate())
+     inference_settings: InferenceSettings
+     json_schema: dict[str, Any]
+     updated_at: datetime.datetime = Field(
+         default_factory=lambda: datetime.datetime.now(tz=datetime.timezone.utc),
+         description="The last update date of inference settings or json schema",
+     )
+
+ class Iteration(BaseIteration):
+     predictions: dict[str, PredictionData] = Field(default_factory=dict, description="The predictions of the iteration for all the documents")
+
+ class CreateIterationRequest(BaseModel):
+     """
+     Request model for performing a new iteration with custom inference settings and optional JSON schema.
+     """
+
+     inference_settings: InferenceSettings
+     json_schema: Optional[dict[str, Any]] = None
+     from_iteration_id: Optional[str] = Field(
+         default=None,
+         description="The ID of the iteration to copy the JSON Schema from.",
+     )
+
+     # validate that exactly one of from_iteration_id or json_schema is provided
+     @model_validator(mode="after")
+     def validate_one_of_from_iteration_id_or_json_schema(self) -> Self:
+         if (self.from_iteration_id is None) ^ (self.json_schema is None):
+             return self
+         raise ValueError("Exactly one of from_iteration_id or json_schema must be provided")
+
+
+ class PatchIterationRequest(BaseModel):
+     inference_settings: Optional[InferenceSettings] = Field(default=None, description="The new inference settings of the iteration")
+     json_schema: Optional[dict[str, Any]] = Field(default=None, description="The new json schema of the iteration")
+     version: Optional[int] = Field(default=None, description="Current version for optimistic locking")
+
+
+ class ProcessIterationRequest(BaseModel):
+     """Request model for processing an iteration - running extractions on documents."""
+
+     document_ids: Optional[list[str]] = Field(default=None, description="Specific document IDs to process. If None, all documents will be processed.")
+     only_outdated: bool = Field(default=True, description="Only process documents that need updates (prediction.updated_at is None or older than iteration.updated_at)")
+
+
+ class DocumentStatus(BaseModel):
+     """Status of a document within an iteration."""
+
+     document_id: str
+     filename: str
+     needs_update: bool = Field(description="True if prediction is missing or outdated")
+     has_prediction: bool = Field(description="True if any prediction exists")
+     prediction_updated_at: Optional[datetime.datetime] = Field(description="When the prediction was last updated")
+     iteration_updated_at: datetime.datetime = Field(description="When the iteration settings were last updated")
+
+
+ class IterationDocumentStatusResponse(BaseModel):
+     """Response showing the status of all documents in an iteration."""
+
+     iteration_id: str
+     documents: list[DocumentStatus]
+     total_documents: int
+     documents_needing_update: int
+     documents_up_to_date: int
+
+
+ class AddIterationFromJsonlRequest(BaseModel):
+     jsonl_gcs_path: str
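The only_outdated flag on ProcessIterationRequest and the needs_update field on DocumentStatus both encode the staleness rule spelled out in their field descriptions: a document needs re-processing when it has no prediction timestamp, or when its prediction predates the iteration's last settings/schema update. As a standalone sketch of that rule (the helper name is ours, not the package's):

import datetime
from typing import Optional


def needs_update(prediction_updated_at: Optional[datetime.datetime], iteration_updated_at: datetime.datetime) -> bool:
    # Missing prediction, or prediction older than the last change to the
    # iteration's inference settings / json schema => re-run the extraction.
    return prediction_updated_at is None or prediction_updated_at < iteration_updated_at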