retab-0.0.36-py3-none-any.whl → retab-0.0.38-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- retab/__init__.py +4 -0
- {uiform → retab}/_resource.py +5 -5
- {uiform → retab}/_utils/ai_models.py +2 -2
- {uiform → retab}/_utils/benchmarking.py +15 -16
- {uiform → retab}/_utils/chat.py +29 -34
- {uiform → retab}/_utils/display.py +0 -3
- {uiform → retab}/_utils/json_schema.py +9 -14
- {uiform → retab}/_utils/mime.py +11 -14
- {uiform → retab}/_utils/responses.py +16 -10
- {uiform → retab}/_utils/stream_context_managers.py +1 -1
- {uiform → retab}/_utils/usage/usage.py +31 -31
- {uiform → retab}/client.py +54 -53
- {uiform → retab}/resources/consensus/client.py +19 -38
- {uiform → retab}/resources/consensus/completions.py +36 -59
- {uiform → retab}/resources/consensus/completions_stream.py +35 -47
- {uiform → retab}/resources/consensus/responses.py +37 -86
- {uiform → retab}/resources/consensus/responses_stream.py +41 -89
- retab/resources/documents/client.py +455 -0
- {uiform → retab}/resources/documents/extractions.py +192 -101
- {uiform → retab}/resources/evals.py +56 -43
- retab/resources/evaluations/__init__.py +3 -0
- retab/resources/evaluations/client.py +301 -0
- retab/resources/evaluations/documents.py +233 -0
- retab/resources/evaluations/iterations.py +452 -0
- {uiform → retab}/resources/files.py +2 -2
- {uiform → retab}/resources/jsonlUtils.py +225 -221
- retab/resources/models.py +73 -0
- retab/resources/processors/automations/client.py +244 -0
- {uiform → retab}/resources/processors/automations/endpoints.py +79 -120
- retab/resources/processors/automations/links.py +294 -0
- {uiform → retab}/resources/processors/automations/logs.py +30 -19
- retab/resources/processors/automations/mailboxes.py +397 -0
- retab/resources/processors/automations/outlook.py +337 -0
- {uiform → retab}/resources/processors/automations/tests.py +22 -25
- {uiform → retab}/resources/processors/client.py +181 -166
- {uiform → retab}/resources/schemas.py +78 -66
- {uiform → retab}/resources/secrets/external_api_keys.py +1 -5
- retab/resources/secrets/webhook.py +64 -0
- {uiform → retab}/resources/usage.py +41 -4
- {uiform → retab}/types/ai_models.py +17 -17
- {uiform → retab}/types/automations/cron.py +19 -12
- {uiform → retab}/types/automations/endpoints.py +7 -4
- {uiform → retab}/types/automations/links.py +7 -3
- {uiform → retab}/types/automations/mailboxes.py +10 -10
- {uiform → retab}/types/automations/outlook.py +15 -11
- {uiform → retab}/types/automations/webhooks.py +1 -1
- retab/types/browser_canvas.py +3 -0
- retab/types/chat.py +8 -0
- {uiform → retab}/types/completions.py +12 -15
- retab/types/consensus.py +19 -0
- {uiform → retab}/types/db/annotations.py +3 -3
- {uiform → retab}/types/db/files.py +8 -6
- {uiform → retab}/types/documents/create_messages.py +20 -22
- {uiform → retab}/types/documents/extractions.py +71 -26
- {uiform → retab}/types/evals.py +5 -5
- retab/types/evaluations/__init__.py +31 -0
- retab/types/evaluations/documents.py +30 -0
- retab/types/evaluations/iterations.py +112 -0
- retab/types/evaluations/model.py +73 -0
- retab/types/events.py +79 -0
- {uiform → retab}/types/extractions.py +36 -13
- retab/types/inference_settings.py +15 -0
- retab/types/jobs/base.py +54 -0
- retab/types/jobs/batch_annotation.py +12 -0
- {uiform → retab}/types/jobs/evaluation.py +1 -2
- {uiform → retab}/types/logs.py +37 -34
- retab/types/metrics.py +32 -0
- {uiform → retab}/types/mime.py +22 -20
- {uiform → retab}/types/modalities.py +10 -10
- retab/types/predictions.py +19 -0
- {uiform → retab}/types/schemas/enhance.py +4 -2
- {uiform → retab}/types/schemas/evaluate.py +7 -4
- {uiform → retab}/types/schemas/generate.py +6 -3
- {uiform → retab}/types/schemas/layout.py +1 -1
- {uiform → retab}/types/schemas/object.py +16 -17
- {uiform → retab}/types/schemas/templates.py +1 -3
- {uiform → retab}/types/secrets/external_api_keys.py +0 -1
- {uiform → retab}/types/standards.py +18 -1
- {retab-0.0.36.dist-info → retab-0.0.38.dist-info}/METADATA +78 -77
- retab-0.0.38.dist-info/RECORD +107 -0
- retab-0.0.38.dist-info/top_level.txt +1 -0
- retab-0.0.36.dist-info/RECORD +0 -96
- retab-0.0.36.dist-info/top_level.txt +0 -1
- uiform/__init__.py +0 -4
- uiform/_utils/benchmarking copy.py +0 -588
- uiform/resources/documents/client.py +0 -255
- uiform/resources/models.py +0 -45
- uiform/resources/processors/automations/client.py +0 -78
- uiform/resources/processors/automations/links.py +0 -356
- uiform/resources/processors/automations/mailboxes.py +0 -435
- uiform/resources/processors/automations/outlook.py +0 -444
- uiform/resources/secrets/webhook.py +0 -62
- uiform/types/chat.py +0 -8
- uiform/types/consensus.py +0 -10
- uiform/types/events.py +0 -76
- uiform/types/jobs/base.py +0 -150
- uiform/types/jobs/batch_annotation.py +0 -22
- {uiform → retab}/_utils/__init__.py +0 -0
- {uiform → retab}/_utils/usage/__init__.py +0 -0
- {uiform → retab}/py.typed +0 -0
- {uiform → retab}/resources/__init__.py +0 -0
- {uiform → retab}/resources/consensus/__init__.py +0 -0
- {uiform → retab}/resources/documents/__init__.py +0 -0
- {uiform → retab}/resources/finetuning.py +0 -0
- {uiform → retab}/resources/openai_example.py +0 -0
- {uiform → retab}/resources/processors/__init__.py +0 -0
- {uiform → retab}/resources/processors/automations/__init__.py +0 -0
- {uiform → retab}/resources/prompt_optimization.py +0 -0
- {uiform → retab}/resources/secrets/__init__.py +0 -0
- {uiform → retab}/resources/secrets/client.py +0 -0
- {uiform → retab}/types/__init__.py +0 -0
- {uiform → retab}/types/automations/__init__.py +0 -0
- {uiform → retab}/types/db/__init__.py +0 -0
- {uiform → retab}/types/documents/__init__.py +0 -0
- {uiform → retab}/types/documents/correct_orientation.py +0 -0
- {uiform → retab}/types/jobs/__init__.py +0 -0
- {uiform → retab}/types/jobs/finetune.py +0 -0
- {uiform → retab}/types/jobs/prompt_optimization.py +0 -0
- {uiform → retab}/types/jobs/webcrawl.py +0 -0
- {uiform → retab}/types/pagination.py +0 -0
- {uiform → retab}/types/schemas/__init__.py +0 -0
- {uiform → retab}/types/secrets/__init__.py +0 -0
- {retab-0.0.36.dist-info → retab-0.0.38.dist-info}/WHEEL +0 -0
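The dominant change in this release is the rename of the uiform package to retab: every module moves under the retab namespace, the client classes referenced throughout the diffs below become Retab and AsyncRetab, and the message type becomes ChatCompletionRetabMessage. A minimal sketch of what calling code looks like after the rename; whether the classes are re-exported from the package root is an assumption, since the four added lines of retab/__init__.py are not shown in this diff:

# Hedged sketch of the uiform -> retab rename from a caller's perspective.
# The re-export of Retab / AsyncRetab from the package root is an assumption;
# this diff only shows them being imported from retab/client.py in _resource.py.
from retab import AsyncRetab, Retab  # assumed top-level re-exports


def describe(client: Retab | AsyncRetab) -> str:
    # SDK resources hold a reference to one of these two client classes
    # (see the _resource.py diff below).
    return type(client).__name__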
retab/__init__.py
ADDED
{uiform → retab}/_resource.py
RENAMED
@@ -5,13 +5,13 @@ import time
 from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
-    from .client import
+    from .client import AsyncRetab, Retab
 
 
 class SyncAPIResource:
-    _client:
+    _client: Retab
 
-    def __init__(self, client:
+    def __init__(self, client: Retab) -> None:
         self._client = client
 
     def _sleep(self, seconds: float) -> None:
@@ -19,9 +19,9 @@ class SyncAPIResource:
 
 
 class AsyncAPIResource:
-    _client:
+    _client: AsyncRetab
 
-    def __init__(self, client:
+    def __init__(self, client: AsyncRetab) -> None:
         self._client = client
 
     async def _sleep(self, seconds: float) -> None:
{uiform → retab}/_utils/ai_models.py
RENAMED
@@ -61,7 +61,7 @@ def assert_valid_model_batch_processing(model: str) -> None:
             raise ValueError(f"Invalid base model in fine-tuned model '{model}'. Base model must be one of: {get_args(OpenAIModel)}")
         if not model_id or not model_id.strip():
             raise ValueError(f"Model ID cannot be empty in fine-tuned model '{model}'")
-    except ValueError
+    except ValueError:
         if ":" not in model:
             raise ValueError(
                 f"Invalid model format: {model}. Must be either:\n"
@@ -90,7 +90,7 @@ def assert_valid_model_schema_generation(model: str) -> None:
             raise ValueError(f"Invalid base model in fine-tuned model '{model}'. Base model must be one of: {get_args(OpenAIModel)}")
         if not model_id or not model_id.strip():
             raise ValueError(f"Model ID cannot be empty in fine-tuned model '{model}'")
-    except ValueError
+    except ValueError:
         if ":" not in model:
             raise ValueError(
                 f"Invalid model format: {model}. Must be either:\n"
{uiform → retab}/_utils/benchmarking.py
RENAMED
@@ -5,7 +5,7 @@ import shutil
 # The goal is to leverage this piece of code to open a jsonl file and get an analysis of the performance of the model using a one-liner.
 ############# BENCHMARKING MODELS #############
 from itertools import zip_longest
-from typing import Any, Callable, Literal, Optional
+from typing import Any, Callable, Literal, Optional, cast
 
 import pandas as pd  # type: ignore
 from Levenshtein import distance as levenshtein_distance
@@ -27,7 +27,7 @@ def normalize_string(text: str) -> str:
     if not text:
         return ""
     # Remove all non-alphanumeric characters and convert to lowercase
-    return re.sub(r
+    return re.sub(r"[^a-zA-Z0-9]", "", text).lower()
 
 
 def hamming_distance_padded(s: str, t: str) -> int:
@@ -45,7 +45,7 @@ def hamming_distance_padded(s: str, t: str) -> int:
     s = normalize_string(s)
     t = normalize_string(t)
 
-    return sum(a != b for a, b in zip_longest(s, t, fillvalue=
+    return sum(a != b for a, b in zip_longest(s, t, fillvalue=" "))
 
 
 def hamming_similarity(str_1: str, str_2: str) -> float:
@@ -385,7 +385,7 @@ class EvalMetrics(BaseModel):
     distances: dict[dictionary_metrics, EvalMetric]
 
 
-def flatten_dict(d: dict[str, Any], parent_key: str =
+def flatten_dict(d: dict[str, Any], parent_key: str = "", sep: str = ".") -> dict[str, Any]:
     """Flatten a nested dictionary with dot-separated keys."""
     items: list[tuple[str, Any]] = []
     for k, v in d.items():
@@ -408,16 +408,14 @@ def plot_metrics_with_uncertainty(analysis: dict[str, Any], uncertainties: Optio
     """
     # Flatten the dictionaries
     flattened_analysis = flatten_dict(analysis)
-    if uncertainties:
-        flattened_uncertainties = flatten_dict(uncertainties)
-    else:
-        uncertainties_list = None
-
     # Prepare data by matching fields
     fields = list(flattened_analysis.keys())
     similarities = [flattened_analysis[field] for field in fields]
 
+    # Prepare uncertainties if provided
+    uncertainties_list = None
     if uncertainties:
+        flattened_uncertainties = flatten_dict(uncertainties)
         uncertainties_list = [flattened_uncertainties.get(field, None) for field in fields]
 
     # Create a DataFrame
@@ -454,10 +452,11 @@ def plot_metrics_with_uncertainty(analysis: dict[str, Any], uncertainties: Optio
 
         if similarity is None:
             continue  # Skip fields with no similarity value
-
+        similarity = cast(float, similarity)
         # Calculate bar length and uncertainty range
         bar_len = round(similarity * scale)
         if uncertainty is not None and uncertainty > 0:
+            uncertainty = cast(float, uncertainty)
            uncertainty_start = max(0, round((similarity - uncertainty) * scale))
            uncertainty_end = min(bar_width, round((similarity + uncertainty) * scale))
         else:
@@ -465,21 +464,21 @@ def plot_metrics_with_uncertainty(analysis: dict[str, Any], uncertainties: Optio
             uncertainty_end = bar_len  # No uncertainty to display
 
         # Build the bar string
-        bar_string =
+        bar_string = ""
         for i in range(bar_width):
             if i < bar_len:
                 if i < uncertainty_start:
-                    char =
+                    char = "█"  # Solid block for certain part
                 else:
-                    char =
+                    char = "█"  # Lighter block for uncertainty overlap
             else:
                 if i < uncertainty_end:
-                    char =
+                    char = "░"  # Dash for upper uncertainty range
                 else:
-                    char =
+                    char = " "  # Space for empty area
             bar_string += char
 
         # Print the label and bar
-        score_field = f
+        score_field = f"[{similarity:.4f}]"
 
         print(f"{field:<{label_width}} {score_field} | {bar_string}")
{uiform → retab}/_utils/chat.py
RENAMED
@@ -1,31 +1,26 @@
 import base64
-import io
 import logging
 from typing import List, Literal, Optional, Union, cast
 
 import requests
-from anthropic.types.
-from anthropic.types.image_block_param import ImageBlockParam, Source
+from anthropic.types.image_block_param import ImageBlockParam
 from anthropic.types.message_param import MessageParam
 from anthropic.types.text_block_param import TextBlockParam
-from anthropic.types.tool_result_block_param import ToolResultBlockParam
-from anthropic.types.tool_use_block_param import ToolUseBlockParam
 from google.genai.types import BlobDict, ContentDict, ContentUnionDict, PartDict  # type: ignore
 from openai.types.chat.chat_completion_content_part_image_param import ChatCompletionContentPartImageParam
 from openai.types.chat.chat_completion_content_part_input_audio_param import ChatCompletionContentPartInputAudioParam
 from openai.types.chat.chat_completion_content_part_param import ChatCompletionContentPartParam
 from openai.types.chat.chat_completion_content_part_text_param import ChatCompletionContentPartTextParam
 from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam
-from PIL import Image
 
-from ..types.chat import
+from ..types.chat import ChatCompletionRetabMessage
 
 MediaType = Literal["image/jpeg", "image/png", "image/gif", "image/webp"]
 
 
-def convert_to_google_genai_format(messages: List[
+def convert_to_google_genai_format(messages: List[ChatCompletionRetabMessage]) -> tuple[str, list[ContentUnionDict]]:
     """
-    Converts a list of
+    Converts a list of ChatCompletionRetabMessage to a format compatible with the google.genai SDK.
 
 
     Example:
@@ -45,7 +40,7 @@ def convert_to_google_genai_format(messages: List[ChatCompletionUiformMessage])
     ```
 
     Args:
-        messages (List[
+        messages (List[ChatCompletionRetabMessage]): List of chat messages.
 
     Returns:
         List[Union[Dict[str, str], str]]: A list of formatted inputs for the google.genai SDK.
@@ -64,7 +59,7 @@ def convert_to_google_genai_format(messages: List[ChatCompletionUiformMessage])
             continue
         parts: list[PartDict] = []
 
-        message_content = message[
+        message_content = message["content"]
         if isinstance(message_content, str):
             # Direct string content is treated as the prompt for the SDK
             parts.append(PartDict(text=message_content))
@@ -74,8 +69,8 @@ def convert_to_google_genai_format(messages: List[ChatCompletionUiformMessage])
                 if part["type"] == "text":
                     parts.append(PartDict(text=part["text"]))
                 elif part["type"] == "image_url":
-                    url = part[
-                    if url.startswith(
+                    url = part["image_url"].get("url", "")  # type: ignore
+                    if url.startswith("data:image"):
                         # Extract base64 data and add it to the formatted inputs
                         media_type, data_content = url.split(";base64,")
                         media_type = media_type.split("data:")[-1]  # => "image/jpeg"
@@ -99,12 +94,12 @@ def convert_to_google_genai_format(messages: List[ChatCompletionUiformMessage])
     return system_message, formatted_content
 
 
-def convert_to_anthropic_format(messages: List[
+def convert_to_anthropic_format(messages: List[ChatCompletionRetabMessage]) -> tuple[str, List[MessageParam]]:
     """
-    Converts a list of
+    Converts a list of ChatCompletionRetabMessage to a format compatible with the Anthropic SDK.
 
     Args:
-        messages (List[
+        messages (List[ChatCompletionRetabMessage]): List of chat messages.
 
     Returns:
         (system_message, formatted_messages):
@@ -133,24 +128,24 @@ def convert_to_anthropic_format(messages: List[ChatCompletionUiformMessage]) ->
         # -----------------------
         # Handle non-system roles
         # -----------------------
-        if isinstance(message[
+        if isinstance(message["content"], str):
             # Direct string content is treated as a single text block
             content_blocks.append(
                 {
                     "type": "text",
-                    "text": message[
+                    "text": message["content"],
                 }
             )
 
-        elif isinstance(message[
+        elif isinstance(message["content"], list):
             # Handle structured content
-            for part in message[
+            for part in message["content"]:
                 if part["type"] == "text":
                     part = cast(ChatCompletionContentPartTextParam, part)
                     content_blocks.append(
                         {
                             "type": "text",
-                            "text": part[
+                            "text": part["text"],  # type: ignore
                         }
                     )
 
@@ -221,11 +216,11 @@ def convert_to_anthropic_format(messages: List[ChatCompletionUiformMessage]) ->
     return system_message, formatted_messages
 
 
-def convert_from_anthropic_format(messages: list[MessageParam], system_prompt: str) -> list[
+def convert_from_anthropic_format(messages: list[MessageParam], system_prompt: str) -> list[ChatCompletionRetabMessage]:
     """
-    Converts a list of Anthropic MessageParam to a list of
+    Converts a list of Anthropic MessageParam to a list of ChatCompletionRetabMessage.
     """
-    formatted_messages: list[
+    formatted_messages: list[ChatCompletionRetabMessage] = [ChatCompletionRetabMessage(role="developer", content=system_prompt)]
 
     for message in messages:
         role = message["role"]
@@ -234,7 +229,7 @@ def convert_from_anthropic_format(messages: list[MessageParam], system_prompt: s
         # Handle different content structures
         if isinstance(content_blocks, list) and len(content_blocks) == 1 and isinstance(content_blocks[0], dict) and content_blocks[0].get("type") == "text":
             # Simple text message
-            formatted_messages.append(cast(
+            formatted_messages.append(cast(ChatCompletionRetabMessage, {"role": role, "content": content_blocks[0].get("text", "")}))
         elif isinstance(content_blocks, list):
             # Message with multiple content parts or non-text content
             formatted_content: list[ChatCompletionContentPartParam] = []
@@ -253,22 +248,22 @@ def convert_from_anthropic_format(messages: list[MessageParam], system_prompt: s
 
                 formatted_content.append(cast(ChatCompletionContentPartParam, {"type": "image_url", "image_url": {"url": image_url}}))
 
-            formatted_messages.append(cast(
+            formatted_messages.append(cast(ChatCompletionRetabMessage, {"role": role, "content": formatted_content}))
 
     return formatted_messages
 
 
-def convert_to_openai_format(messages: List[
+def convert_to_openai_format(messages: List[ChatCompletionRetabMessage]) -> List[ChatCompletionMessageParam]:
     return cast(list[ChatCompletionMessageParam], messages)
 
 
-def convert_from_openai_format(messages: list[ChatCompletionMessageParam]) -> list[
-    return cast(list[
+def convert_from_openai_format(messages: list[ChatCompletionMessageParam]) -> list[ChatCompletionRetabMessage]:
+    return cast(list[ChatCompletionRetabMessage], messages)
 
 
 def separate_messages(
-    messages: list[
-) -> tuple[Optional[
+    messages: list[ChatCompletionRetabMessage],
+) -> tuple[Optional[ChatCompletionRetabMessage], list[ChatCompletionRetabMessage], list[ChatCompletionRetabMessage]]:
     """
     Separates messages into system, user and assistant messages.
 
@@ -296,12 +291,12 @@ def separate_messages(
     return system_message, user_messages, assistant_messages
 
 
-def str_messages(messages: list[
+def str_messages(messages: list[ChatCompletionRetabMessage], max_length: int = 100) -> str:
     """
     Converts a list of chat messages into a string representation with faithfully serialized structure.
 
     Args:
-        messages (list[
+        messages (list[ChatCompletionRetabMessage]): The list of chat messages.
         max_length (int): Maximum length for content before truncation.
 
     Returns:
@@ -312,7 +307,7 @@ def str_messages(messages: list[ChatCompletionUiformMessage], max_length: int =
         """Truncate text to max_len with ellipsis."""
         return text if len(text) <= max_len else f"{text[:max_len]}..."
 
-    serialized: list[
+    serialized: list[ChatCompletionRetabMessage] = []
    for message in messages:
        role = message["role"]
        content = message["content"]
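For reference, the converters renamed in {uiform → retab}/_utils/chat.py above keep their shape: each takes a list of ChatCompletionRetabMessage (a role/content mapping, and constructed with keyword arguments in the hunks above) and returns a provider-specific message format. A hedged usage sketch, assuming the import paths follow the renamed layout in the file list; nothing here is verified against the published wheel:

# Hedged usage sketch for the renamed converters in retab/_utils/chat.py.
from retab._utils.chat import convert_to_anthropic_format, convert_to_openai_format
from retab.types.chat import ChatCompletionRetabMessage

messages: list[ChatCompletionRetabMessage] = [
    ChatCompletionRetabMessage(role="developer", content="Extract the invoice fields."),
    ChatCompletionRetabMessage(role="user", content="Total due: 42.50 EUR"),
]

# Anthropic needs the system prompt split out from the conversation.
system_prompt, anthropic_messages = convert_to_anthropic_format(messages)

# The OpenAI chat format is structurally identical, so this is only a cast.
openai_messages = convert_to_openai_format(messages)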
{uiform → retab}/_utils/display.py
RENAMED
@@ -105,9 +105,6 @@ def count_image_tokens(image_url: str, detail: Literal["low", "high", "auto"] =
     total_tiles = tiles_wide * tiles_high
 
     return base_token_cost + (token_per_tile * total_tiles)
-
-
-
 
 
 def process_jsonl_file(jsonl_path: str) -> List[TokenCounts]:
{uiform → retab}/_utils/json_schema.py
RENAMED
@@ -14,8 +14,8 @@ from email_validator import validate_email
 from pydantic import BaseModel, BeforeValidator, Field, create_model
 from pydantic.config import ConfigDict
 
-from
-from
+from ..types.schemas.layout import Column, FieldItem, Layout, RefObject, Row, RowList
+from .mime import generate_blake2b_hash_from_string
 
 # **** Validation Functions ****
 
@@ -116,7 +116,7 @@ def validate_vat_number(v: Any) -> Optional[str]:
     try:
         if stdnum.eu.vat.is_valid(v_str):
             return stdnum.eu.vat.validate(v_str)
-    except:
+    except Exception:
         pass
     return None
 
@@ -150,7 +150,7 @@ def validate_email_address(v: Any) -> Optional[str]:
         return None
     try:
         return validate_email(v_str).normalized
-    except:
+    except Exception:
         return None
 
 
@@ -170,7 +170,7 @@ def validate_frenchpostcode(v: Any) -> Optional[str]:
         if not v_str.isdigit():
             return None
         return v_str
-    except:
+    except Exception:
         return None
 
 
@@ -201,7 +201,7 @@ def validate_un_code(v: Any) -> Optional[int]:
         val = int(float(v_str))  # handle numeric strings
         if 0 <= val <= 3481:
             return val
-    except:
+    except Exception:
         pass
     return None
 
@@ -242,7 +242,7 @@ def validate_integer(v: Any) -> Optional[int]:
         return None
     try:
         return int(float(v_str))
-    except:
+    except Exception:
         return None
 
 
@@ -257,7 +257,7 @@ def validate_float(v: Any) -> Optional[float]:
         return None
     try:
         return float(v_str)
-    except:
+    except Exception:
         return None
 
 
@@ -333,7 +333,7 @@ def validate_bool(v: Any) -> bool:
             return True
         elif v_str in false_values:
             return False
-    except:
+    except Exception:
         pass
 
     return False
@@ -2091,11 +2091,6 @@ def sanitize(instance: Any, schema: dict[str, Any]) -> Any:
     return __sanitize_instance(instance, expanded_schema)
 
 
-import copy
-import json
-from .mime import generate_blake2b_hash_from_string
-
-
 def compute_schema_data_id(json_schema: dict[str, Any]) -> str:
     """Returns the schema_data_id for a given JSON schema.
 
{uiform → retab}/_utils/mime.py
RENAMED
@@ -4,16 +4,17 @@ import io
 import json
 import mimetypes
 from pathlib import Path
-from typing import
+from typing import Sequence, TypeVar, get_args
 
 import httpx
 import PIL.Image
+import puremagic
 from pydantic import HttpUrl
 
 from ..types.mime import MIMEData
 from ..types.modalities import SUPPORTED_TYPES
 
-T = TypeVar(
+T = TypeVar("T")
 
 
 def generate_blake2b_hash_from_bytes(bytes_: bytes) -> str:
@@ -25,7 +26,7 @@ def generate_blake2b_hash_from_base64(base64_string: str) -> str:
 
 
 def generate_blake2b_hash_from_string(input_string: str) -> str:
-    return generate_blake2b_hash_from_bytes(input_string.encode(
+    return generate_blake2b_hash_from_bytes(input_string.encode("utf-8"))
 
 
 def generate_blake2b_hash_from_dict(input_dict: dict) -> str:
@@ -43,7 +44,7 @@ def convert_pil_image_to_mime_data(image: PIL.Image.Image) -> MIMEData:
     """
     # Convert PIL image to base64 string
     buffered = io.BytesIO()
-    choosen_format = image.format if (image.format and image.format.lower() in [
+    choosen_format = image.format if (image.format and image.format.lower() in ["png", "jpeg", "gif", "webp"]) else "JPEG"
     image.save(buffered, format=choosen_format)
     base64_content = base64.b64encode(buffered.getvalue()).decode("utf-8")
 
@@ -98,13 +99,11 @@ def prepare_mime_document(document: Path | str | bytes | io.IOBase | MIMEData |
     if isinstance(document, bytes):
         # `document` is already the raw bytes
         try:
-            import puremagic
-
             extension = puremagic.from_string(document)
             if extension.lower() in [".jpg", ".jpeg", ".jfif"]:
                 extension = ".jpeg"
-        except:
-            extension =
+        except Exception:
+            extension = ".txt"
         file_bytes = document
         filename = "uploaded_file" + extension
     elif isinstance(document, io.IOBase):
@@ -112,19 +111,17 @@ def prepare_mime_document(document: Path | str | bytes | io.IOBase | MIMEData |
         file_bytes = document.read()
         filename = getattr(document, "name", "uploaded_file")
         filename = Path(filename).name
-    elif hasattr(document,
+    elif hasattr(document, "unicode_string") and callable(getattr(document, "unicode_string")):
         with httpx.Client() as client:
             url: str = document.unicode_string()  # type: ignore
             response = client.get(url)
             response.raise_for_status()
             try:
-                import puremagic
-
                 extension = puremagic.from_string(response.content)
                 if extension.lower() in [".jpg", ".jpeg", ".jfif"]:
                     extension = ".jpeg"
-            except:
-                extension =
+            except Exception:
+                extension = ".txt"
         file_bytes = response.content  # Fix: Use response.content instead of document
         filename = "uploaded_file" + extension
     else:
@@ -139,7 +136,7 @@ def prepare_mime_document(document: Path | str | bytes | io.IOBase | MIMEData |
     encoded_content = base64.b64encode(file_bytes).decode("utf-8")
     # Compute SHA-256 hash over the *base64-encoded* content
     hash_obj = hashlib.sha256(encoded_content.encode("utf-8"))
-
+    hash_obj.hexdigest()
 
     # Guess MIME type based on file extension
     guessed_type, _ = mimetypes.guess_type(filename)
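prepare_mime_document (partially visible above) normalises bytes, file-like objects, paths, URLs, or an existing MIMEData into a single MIME payload; this release moves the puremagic import to module level, narrows the bare except clauses to except Exception, and makes ".txt" the explicit fallback extension. A hedged sketch of the bytes branch; the import path follows the renamed layout in the file list, and the MIMEData return type is an assumption since the return annotation is outside the hunks shown:

# Hedged sketch: the bytes branch of prepare_mime_document.
from retab._utils.mime import prepare_mime_document

raw = b"plain text payload"  # content type is sniffed with puremagic; ".txt" is the fallback
mime_doc = prepare_mime_document(raw)  # filename is synthesised as "uploaded_file" + extension
print(type(mime_doc).__name__)  # expected to be MIMEData (assumption)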
{uiform → retab}/_utils/responses.py
RENAMED
@@ -16,13 +16,13 @@ from openai.types.responses.response_input_message_content_list_param import Res
 from openai.types.responses.response_input_param import ResponseInputItemParam
 from openai.types.responses.response_input_text_param import ResponseInputTextParam
 
-from ..types.chat import
+from ..types.chat import ChatCompletionRetabMessage
 from ..types.documents.extractions import UiParsedChatCompletion, UiParsedChoice
 
 
-def convert_to_openai_format(messages: list[
+def convert_to_openai_format(messages: list[ChatCompletionRetabMessage]) -> list[ResponseInputItemParam]:
     """
-    Converts a list of
+    Converts a list of ChatCompletionRetabMessage to the OpenAI ResponseInputParam format.
 
     Args:
         messages: List of chat messages in UIForm format
@@ -64,9 +64,9 @@ def convert_to_openai_format(messages: list[ChatCompletionUiformMessage]) -> lis
     return formatted_messages
 
 
-def convert_from_openai_format(messages: list[ResponseInputItemParam]) -> list[
+def convert_from_openai_format(messages: list[ResponseInputItemParam]) -> list[ChatCompletionRetabMessage]:
     """
-    Converts messages from OpenAI ResponseInputParam format to
+    Converts messages from OpenAI ResponseInputParam format to ChatCompletionRetabMessage format.
 
     Args:
         messages: Messages in OpenAI ResponseInputParam format
@@ -74,16 +74,22 @@ def convert_from_openai_format(messages: list[ResponseInputItemParam]) -> list[C
     Returns:
         List of chat messages in UIForm format
     """
-    formatted_messages: list[
+    formatted_messages: list[ChatCompletionRetabMessage] = []
 
     for message in messages:
+        if "role" not in message or "content" not in message:
+            # Mandatory fields for a message
+            if message.get("type") != "message":
+                print(f"Not supported message type: {message.get('type')}... Skipping...")
+            continue
+
+        role = message["role"]
+        content = message["content"]
+
         if "type" not in message:
             # The type is required by all other sub-types of ResponseInputItemParam except for EasyInputMessageParam and Message, which are messages.
             message["type"] = "message"
 
-        if message["type"] != "message":
-            print(f"Not supported message type: {message['type']}... Skipping...")
-            continue
         role = message["role"]
         content = message["content"]
         formatted_content: str | list[ChatCompletionContentPartParam]
@@ -104,7 +110,7 @@ def convert_from_openai_format(messages: list[ResponseInputItemParam]) -> list[C
                 print(f"Not supported content type: {part['type']}... Skipping...")
 
         # Create message in UIForm format
-        formatted_message =
+        formatted_message = ChatCompletionRetabMessage(role=role, content=formatted_content)
         formatted_messages.append(formatted_message)
 
     return formatted_messages
{uiform → retab}/_utils/stream_context_managers.py
RENAMED
@@ -1,7 +1,7 @@
 from contextlib import AbstractAsyncContextManager, AbstractContextManager
 from typing import Any, AsyncGenerator, Callable, Generator, TypeVar, Union
 
-T = TypeVar(
+T = TypeVar("T")
 
 
 class AsyncGeneratorContextManager(AbstractAsyncContextManager[AsyncGenerator[T, None]]):