retab-0.0.36-py3-none-any.whl → retab-0.0.38-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- retab/__init__.py +4 -0
- {uiform → retab}/_resource.py +5 -5
- {uiform → retab}/_utils/ai_models.py +2 -2
- {uiform → retab}/_utils/benchmarking.py +15 -16
- {uiform → retab}/_utils/chat.py +29 -34
- {uiform → retab}/_utils/display.py +0 -3
- {uiform → retab}/_utils/json_schema.py +9 -14
- {uiform → retab}/_utils/mime.py +11 -14
- {uiform → retab}/_utils/responses.py +16 -10
- {uiform → retab}/_utils/stream_context_managers.py +1 -1
- {uiform → retab}/_utils/usage/usage.py +31 -31
- {uiform → retab}/client.py +54 -53
- {uiform → retab}/resources/consensus/client.py +19 -38
- {uiform → retab}/resources/consensus/completions.py +36 -59
- {uiform → retab}/resources/consensus/completions_stream.py +35 -47
- {uiform → retab}/resources/consensus/responses.py +37 -86
- {uiform → retab}/resources/consensus/responses_stream.py +41 -89
- retab/resources/documents/client.py +455 -0
- {uiform → retab}/resources/documents/extractions.py +192 -101
- {uiform → retab}/resources/evals.py +56 -43
- retab/resources/evaluations/__init__.py +3 -0
- retab/resources/evaluations/client.py +301 -0
- retab/resources/evaluations/documents.py +233 -0
- retab/resources/evaluations/iterations.py +452 -0
- {uiform → retab}/resources/files.py +2 -2
- {uiform → retab}/resources/jsonlUtils.py +225 -221
- retab/resources/models.py +73 -0
- retab/resources/processors/automations/client.py +244 -0
- {uiform → retab}/resources/processors/automations/endpoints.py +79 -120
- retab/resources/processors/automations/links.py +294 -0
- {uiform → retab}/resources/processors/automations/logs.py +30 -19
- retab/resources/processors/automations/mailboxes.py +397 -0
- retab/resources/processors/automations/outlook.py +337 -0
- {uiform → retab}/resources/processors/automations/tests.py +22 -25
- {uiform → retab}/resources/processors/client.py +181 -166
- {uiform → retab}/resources/schemas.py +78 -66
- {uiform → retab}/resources/secrets/external_api_keys.py +1 -5
- retab/resources/secrets/webhook.py +64 -0
- {uiform → retab}/resources/usage.py +41 -4
- {uiform → retab}/types/ai_models.py +17 -17
- {uiform → retab}/types/automations/cron.py +19 -12
- {uiform → retab}/types/automations/endpoints.py +7 -4
- {uiform → retab}/types/automations/links.py +7 -3
- {uiform → retab}/types/automations/mailboxes.py +10 -10
- {uiform → retab}/types/automations/outlook.py +15 -11
- {uiform → retab}/types/automations/webhooks.py +1 -1
- retab/types/browser_canvas.py +3 -0
- retab/types/chat.py +8 -0
- {uiform → retab}/types/completions.py +12 -15
- retab/types/consensus.py +19 -0
- {uiform → retab}/types/db/annotations.py +3 -3
- {uiform → retab}/types/db/files.py +8 -6
- {uiform → retab}/types/documents/create_messages.py +20 -22
- {uiform → retab}/types/documents/extractions.py +71 -26
- {uiform → retab}/types/evals.py +5 -5
- retab/types/evaluations/__init__.py +31 -0
- retab/types/evaluations/documents.py +30 -0
- retab/types/evaluations/iterations.py +112 -0
- retab/types/evaluations/model.py +73 -0
- retab/types/events.py +79 -0
- {uiform → retab}/types/extractions.py +36 -13
- retab/types/inference_settings.py +15 -0
- retab/types/jobs/base.py +54 -0
- retab/types/jobs/batch_annotation.py +12 -0
- {uiform → retab}/types/jobs/evaluation.py +1 -2
- {uiform → retab}/types/logs.py +37 -34
- retab/types/metrics.py +32 -0
- {uiform → retab}/types/mime.py +22 -20
- {uiform → retab}/types/modalities.py +10 -10
- retab/types/predictions.py +19 -0
- {uiform → retab}/types/schemas/enhance.py +4 -2
- {uiform → retab}/types/schemas/evaluate.py +7 -4
- {uiform → retab}/types/schemas/generate.py +6 -3
- {uiform → retab}/types/schemas/layout.py +1 -1
- {uiform → retab}/types/schemas/object.py +16 -17
- {uiform → retab}/types/schemas/templates.py +1 -3
- {uiform → retab}/types/secrets/external_api_keys.py +0 -1
- {uiform → retab}/types/standards.py +18 -1
- {retab-0.0.36.dist-info → retab-0.0.38.dist-info}/METADATA +78 -77
- retab-0.0.38.dist-info/RECORD +107 -0
- retab-0.0.38.dist-info/top_level.txt +1 -0
- retab-0.0.36.dist-info/RECORD +0 -96
- retab-0.0.36.dist-info/top_level.txt +0 -1
- uiform/__init__.py +0 -4
- uiform/_utils/benchmarking copy.py +0 -588
- uiform/resources/documents/client.py +0 -255
- uiform/resources/models.py +0 -45
- uiform/resources/processors/automations/client.py +0 -78
- uiform/resources/processors/automations/links.py +0 -356
- uiform/resources/processors/automations/mailboxes.py +0 -435
- uiform/resources/processors/automations/outlook.py +0 -444
- uiform/resources/secrets/webhook.py +0 -62
- uiform/types/chat.py +0 -8
- uiform/types/consensus.py +0 -10
- uiform/types/events.py +0 -76
- uiform/types/jobs/base.py +0 -150
- uiform/types/jobs/batch_annotation.py +0 -22
- {uiform → retab}/_utils/__init__.py +0 -0
- {uiform → retab}/_utils/usage/__init__.py +0 -0
- {uiform → retab}/py.typed +0 -0
- {uiform → retab}/resources/__init__.py +0 -0
- {uiform → retab}/resources/consensus/__init__.py +0 -0
- {uiform → retab}/resources/documents/__init__.py +0 -0
- {uiform → retab}/resources/finetuning.py +0 -0
- {uiform → retab}/resources/openai_example.py +0 -0
- {uiform → retab}/resources/processors/__init__.py +0 -0
- {uiform → retab}/resources/processors/automations/__init__.py +0 -0
- {uiform → retab}/resources/prompt_optimization.py +0 -0
- {uiform → retab}/resources/secrets/__init__.py +0 -0
- {uiform → retab}/resources/secrets/client.py +0 -0
- {uiform → retab}/types/__init__.py +0 -0
- {uiform → retab}/types/automations/__init__.py +0 -0
- {uiform → retab}/types/db/__init__.py +0 -0
- {uiform → retab}/types/documents/__init__.py +0 -0
- {uiform → retab}/types/documents/correct_orientation.py +0 -0
- {uiform → retab}/types/jobs/__init__.py +0 -0
- {uiform → retab}/types/jobs/finetune.py +0 -0
- {uiform → retab}/types/jobs/prompt_optimization.py +0 -0
- {uiform → retab}/types/jobs/webcrawl.py +0 -0
- {uiform → retab}/types/pagination.py +0 -0
- {uiform → retab}/types/schemas/__init__.py +0 -0
- {uiform → retab}/types/secrets/__init__.py +0 -0
- {retab-0.0.36.dist-info → retab-0.0.38.dist-info}/WHEEL +0 -0
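Taken together, the listing amounts to a wholesale rename of the top-level package from `uiform` to `retab`, with module paths otherwise preserved. A minimal, hypothetical compatibility sketch follows; only the two package names come from the listing above, and whether both versions expose identical attributes is not verified by this diff:

```python
# Hypothetical shim for the uiform -> retab rename shown in the listing above.
# Assumes only the top-level package name changed between 0.0.36 and 0.0.38.
try:
    import retab as sdk      # 0.0.38: new top-level package
except ImportError:
    import uiform as sdk     # 0.0.36 and earlier: old top-level package

print(sdk.__name__)          # prints "retab" when the new wheel is installed
```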
uiform/_utils/benchmarking copy.py

@@ -1,588 +0,0 @@
import re
import unicodedata
from collections import defaultdict
from typing import Any, Literal, Optional

import numpy as np
import pandas as pd
import termplotlib as tpl  # type: ignore
from Levenshtein import distance as levenshtein_distance
from pydantic import BaseModel, computed_field

# The goal is to leverage this piece of code to open a jsonl file and get an analysis of the performance of the model using a one-liner.


############# BENCHMARKING MODELS #############


class DictionaryComparisonMetrics(BaseModel):
    # Pure dict comparison
    unchanged_fields: int
    total_fields: int
    is_equal: dict[str, bool]
    false_positives: list[dict[str, Any]]
    false_negatives: list[dict[str, Any]]
    mismatched_values: list[dict[str, Any]]
    keys_only_on_1: list[str]
    keys_only_on_2: list[str]

    # Some metrics
    valid_comparisons: int
    total_accuracy: float
    false_positive_rate: float
    false_negative_rate: float
    mismatched_value_rate: float

    similarity_levenshtein: dict[str, float]
    similarity_jaccard: dict[str, float]

    avg_similarity_levenshtein: float
    avg_similarity_jaccard: float
    total_similarity_levenshtein: float
    total_similarity_jaccard: float


def flatten_dict(obj: Any, prefix: str = '') -> dict[str, Any]:
    items = []  # type: ignore
    if isinstance(obj, dict):
        for k, v in obj.items():
            new_key = f"{prefix}.{k}" if prefix else k
            items.extend(flatten_dict(v, new_key).items())
    elif isinstance(obj, list):
        for i, v in enumerate(obj):
            new_key = f"{prefix}.{i}"
            items.extend(flatten_dict(v, new_key).items())
    else:
        items.append((prefix, obj))
    return dict(items)


def normalize_value(val: Any) -> str:
    """Convert value to uppercase and remove all spacing for comparison."""
    if val is None:
        return ""
    prep = re.sub(r'\s+', '', str(val).upper())
    # Remove all accents (é -> e, etc.)
    return unicodedata.normalize('NFKD', prep).encode('ASCII', 'ignore').decode()


def key_normalization(key: str) -> str:
    """This method is useful to compare keys under list indexes (that refers to the same kind of error but on different list index position)"""
    # We will replace all .{i} with .* where i is the index of the list (using regex for this)
    key_parts = key.split(".")
    new_key_parts = []
    for key_part in key_parts:
        if key_part.isdigit():
            new_key_parts.append("*")
        else:
            new_key_parts.append(key_part)
    return ".".join(new_key_parts)


def should_ignore_key(
    key: str, exclude_field_patterns: list[str] | None, include_field_patterns: list[str] | None = None, information_presence_per_field: dict[str, bool] | None = None
) -> bool:
    if information_presence_per_field and information_presence_per_field.get(key) is False:
        # If we have the information_presence_per_field dict and the key is marked as false, then we should ignore it
        should_ignore = True
    else:
        # If exclude_field_patterns is None, we should not ignore any key
        normalized_key = key_normalization(key)
        should_ignore = any(normalized_key.startswith(key_normalization(pattern)) for pattern in exclude_field_patterns or [])

    if include_field_patterns and not should_ignore:
        # If include_field_patterns is not None, we should ignore the key if it does not start with any of the include_field_patterns and is not in the exclude_field_patterns
        should_ignore = not any(normalized_key.startswith(key_normalization(pattern)) for pattern in include_field_patterns)

    return should_ignore


def levenshtein_similarity(val1: Any, val2: Any) -> float:
    """
    Calculate similarity between two values using Levenshtein distance.
    Returns a similarity score between 0.0 and 1.0.
    """
    # Handle None/empty and general cases
    if (val1 or "") == (val2 or ""):
        return 1.0

    # Check if both values are numeric, compare with 5% tolerance
    if isinstance(val1, (int, float)) and isinstance(val2, (int, float)):
        return 1.0 if abs(val1 - val2) <= 0.05 * max(abs(val1), abs(val2)) else 0.0

    # Convert to normalized strings
    str1 = normalize_value(val1)
    str2 = normalize_value(val2)

    if str1 == str2:
        return 1.0

    # Calculate Levenshtein distance
    if str1 and str2:  # Only if both strings are non-empty
        max_len = max(len(str1), len(str2))
        if max_len == 0:
            return 1.0

        dist = levenshtein_distance(str1, str2)
        return 1 - (dist / max_len)

    return 0.0


def jaccard_similarity(val1: Any, val2: Any) -> float:
    """
    Calculate Jaccard similarity between two values.
    Returns a similarity score between 0.0 and 1.0.
    """
    # Handle None/empty and general cases
    if (val1 or "") == (val2 or ""):
        return 1.0

    # Check if both values are numeric, compare with 5% tolerance
    if isinstance(val1, (int, float)) and isinstance(val2, (int, float)):
        return 1.0 if abs(val1 - val2) <= 0.05 * max(abs(val1), abs(val2)) else 0.0

    # Convert to normalized strings and split into words
    str1 = set(normalize_value(val1).split())
    str2 = set(normalize_value(val2).split())

    if not str1 and not str2:
        return 1.0

    # Calculate Jaccard similarity
    intersection = len(str1.intersection(str2))
    union = len(str1.union(str2))

    return intersection / union if union > 0 else 0.0


def compare_dicts(
    ground_truth: dict[str, Any],
    prediction: dict[str, Any],
    include_fields: list[str] | None = None,
    exclude_fields: list[str] | None = None,
    information_presence_per_field: dict[str, bool] | None = None,
    levenshtein_threshold: float = 0.0,  # 0.0 means exact match
) -> DictionaryComparisonMetrics:
    flat_ground_truth = flatten_dict(ground_truth)
    flat_prediction = flatten_dict(prediction)

    flat_ground_truth = {k: v for k, v in flat_ground_truth.items() if not should_ignore_key(k, exclude_fields, include_fields, information_presence_per_field)}
    flat_prediction = {k: v for k, v in flat_prediction.items() if not should_ignore_key(k, exclude_fields, include_fields, information_presence_per_field)}

    keys_ground_truth = set(flat_ground_truth.keys())
    keys_prediction = set(flat_prediction.keys())
    common_keys = keys_ground_truth & keys_prediction

    keys_only_on_1 = sorted(list(keys_ground_truth - keys_prediction))
    keys_only_on_2 = sorted(list(keys_prediction - keys_ground_truth))

    total_fields = len(common_keys)
    unchanged_fields = 0
    is_equal_per_field = {}

    false_positives = []
    false_negatives = []
    mismatched_values = []

    total_similarity_levenshtein = 0.0
    total_similarity_jaccard = 0.0
    similarity_levenshtein_per_field = {}
    similarity_jaccard_per_field = {}

    valid_comparisons = 0

    for key in common_keys:
        llm_value = flat_ground_truth[key]
        extraction_value = flat_prediction[key]

        coerced_llm_value = llm_value or ""
        coerced_extraction_value = extraction_value or ""

        similarity_lev = levenshtein_similarity(llm_value, extraction_value)
        similarity_jac = jaccard_similarity(llm_value, extraction_value)
        # print("Jaccard similarity", similarity_jac)

        # Use Levenshtein for equality comparison (you can adjust this if needed)
        is_equal = similarity_lev >= (1 - levenshtein_threshold)

        similarity_levenshtein_per_field[key] = similarity_lev
        similarity_jaccard_per_field[key] = similarity_jac
        is_equal_per_field[key] = is_equal

        # Only count non-empty comparisons for average similarity
        if coerced_llm_value != "" and coerced_extraction_value != "":
            total_similarity_levenshtein += similarity_lev
            total_similarity_jaccard += similarity_jac
            valid_comparisons += 1

        if is_equal:
            unchanged_fields += 1
        else:
            if coerced_llm_value != "" and coerced_extraction_value == "":
                false_positives.append({"key": key, "expected": extraction_value, "got": llm_value, "similarity": similarity_lev})
            elif coerced_llm_value == "" and coerced_extraction_value != "":
                false_negatives.append({"key": key, "expected": extraction_value, "got": llm_value, "similarity": similarity_lev})
            elif coerced_llm_value != "" and coerced_extraction_value != "":
                # Both are non-empty but not equal
                mismatched_values.append({"key": key, "expected": extraction_value, "got": llm_value, "similarity": similarity_lev})
    # Some metrics
    avg_similarity_levenshtein = total_similarity_levenshtein / valid_comparisons if valid_comparisons > 0 else 1.0
    avg_similarity_jaccard = total_similarity_jaccard / valid_comparisons if valid_comparisons > 0 else 1.0
    total_accuracy = unchanged_fields / total_fields if total_fields > 0 else 1.0
    false_positive_rate = len(false_positives) / total_fields if total_fields > 0 else 0.0
    false_negative_rate = len(false_negatives) / total_fields if total_fields > 0 else 0.0
    mismatched_value_rate = len(mismatched_values) / total_fields if total_fields > 0 else 0.0

    return DictionaryComparisonMetrics(
        unchanged_fields=unchanged_fields,
        total_fields=total_fields,
        is_equal=is_equal_per_field,
        false_positives=false_positives,
        false_negatives=false_negatives,
        mismatched_values=mismatched_values,
        keys_only_on_1=keys_only_on_1,
        keys_only_on_2=keys_only_on_2,
        valid_comparisons=valid_comparisons,
        total_accuracy=total_accuracy,
        false_positive_rate=false_positive_rate,
        false_negative_rate=false_negative_rate,
        mismatched_value_rate=mismatched_value_rate,
        similarity_levenshtein=similarity_levenshtein_per_field,
        similarity_jaccard=similarity_jaccard_per_field,
        avg_similarity_levenshtein=avg_similarity_levenshtein,
        avg_similarity_jaccard=avg_similarity_jaccard,
        total_similarity_levenshtein=total_similarity_levenshtein,
        total_similarity_jaccard=total_similarity_jaccard,
    )


class ExtractionAnalysis(BaseModel):
    ground_truth: dict[str, Any]
    prediction: dict[str, Any]
    time_spent: Optional[float] = None
    include_fields: list[str] | None = None
    exclude_fields: list[str] | None = None
    information_presence_per_field: dict[str, bool] | None = None
    levenshtein_threshold: float = 0.0

    @computed_field  # type: ignore
    @property
    def comparison(self) -> DictionaryComparisonMetrics:
        return compare_dicts(
            self.ground_truth,
            self.prediction,
            include_fields=self.include_fields,
            exclude_fields=self.exclude_fields,
            information_presence_per_field=self.information_presence_per_field,
            levenshtein_threshold=self.levenshtein_threshold,
        )


class BenchmarkMetrics(BaseModel):
    ai_model: str
    accuracy: float
    levenshtein_similarity: float
    jaccard_similarity: float
    false_positive_rate: float
    false_negative_rate: float
    mismatched_value_rate: float


from rich.console import Console
from rich.table import Table


def display_benchmark_metrics(benchmark_metrics: list[BenchmarkMetrics]) -> None:
    """
    Display benchmark metrics for multiple models in a formatted table.

    Args:
        benchmark_metrics: List of BenchmarkMetrics objects containing model performance data
    """
    console = Console(style="on grey23")
    table = Table(title="Model Benchmark Comparison", show_lines=True)

    # Add columns
    table.add_column("Model", justify="left", style="#BDE8F6", no_wrap=True)
    table.add_column("Accuracy", justify="right", style="#C2BDF6")
    table.add_column("Levenshtein", justify="right", style="#F6BDBD")
    table.add_column("Jaccard", justify="right", style="#F6E4BD")
    table.add_column("False Positive Rate", justify="right", style="#BDF6C0")
    table.add_column("False Negative Rate", justify="right", style="#F6BDE4")
    table.add_column("Mismatched Value Rate", justify="right", style="#E4F6BD")

    # Find best values for each metric
    best_values = {
        'accuracy': max(m.accuracy for m in benchmark_metrics),
        'levenshtein': max(m.levenshtein_similarity for m in benchmark_metrics),
        'jaccard': max(m.jaccard_similarity for m in benchmark_metrics),
        'fp_rate': min(m.false_positive_rate for m in benchmark_metrics),
        'fn_rate': min(m.false_negative_rate for m in benchmark_metrics),
        'mismatch_rate': min(m.mismatched_value_rate for m in benchmark_metrics),
    }

    # Add rows for each model's metrics
    for metrics in benchmark_metrics:
        table.add_row(
            metrics.ai_models,
            f"[bold]{metrics.accuracy:.3f}[/bold]" if metrics.accuracy == best_values['accuracy'] else f"[dim]{metrics.accuracy:.3f}[/dim]",
            f"[bold]{metrics.levenshtein_similarity:.3f}[/bold]"
            if metrics.levenshtein_similarity == best_values['levenshtein']
            else f"[dim]{metrics.levenshtein_similarity:.3f}[/dim]",
            f"[bold]{metrics.jaccard_similarity:.3f}[/bold]" if metrics.jaccard_similarity == best_values['jaccard'] else f"[dim]{metrics.jaccard_similarity:.3f}[/dim]",
            f"[bold]{metrics.false_positive_rate:.3f}[/bold]" if metrics.false_positive_rate == best_values['fp_rate'] else f"[dim]{metrics.false_positive_rate:.3f}[/dim]",
            f"[bold]{metrics.false_negative_rate:.3f}[/bold]" if metrics.false_negative_rate == best_values['fn_rate'] else f"[dim]{metrics.false_negative_rate:.3f}[/dim]",
            f"[bold]{metrics.mismatched_value_rate:.3f}[/bold]"
            if metrics.mismatched_value_rate == best_values['mismatch_rate']
            else f"[dim]{metrics.mismatched_value_rate:.3f}[/dim]",
        )

    # Print the table
    console.print(table)


class ComparisonMetrics(BaseModel):
    # Total Values (count or sum) per Field
    false_positive_counts: dict[str, int] = defaultdict(int)
    false_positive_rate_per_field: dict[str, float] = defaultdict(float)

    false_negative_counts: dict[str, int] = defaultdict(int)
    false_negative_rate_per_field: dict[str, float] = defaultdict(float)

    mismatched_value_counts: dict[str, int] = defaultdict(int)
    mismatched_value_rate_per_field: dict[str, float] = defaultdict(float)

    common_presence_counts: dict[str, int] = defaultdict(int)
    accuracy_per_field: dict[str, float] = defaultdict(float)

    jaccard_similarity_per_field: dict[str, float] = defaultdict(float)
    total_jaccard_similarity_per_field: dict[str, float] = defaultdict(float)

    levenshtein_similarity_per_field: dict[str, float] = defaultdict(float)
    total_levenshtein_similarity_per_field: dict[str, float] = defaultdict(float)

    @computed_field  # type: ignore
    @property
    def accuracy(self) -> float:
        return sum(self.accuracy_per_field.values()) / len(self.accuracy_per_field)

    @computed_field  # type: ignore
    @property
    def levenshtein_similarity(self) -> float:
        return sum(self.levenshtein_similarity_per_field.values()) / len(self.levenshtein_similarity_per_field)

    @computed_field  # type: ignore
    @property
    def jaccard_similarity(self) -> float:
        return sum(self.jaccard_similarity_per_field.values()) / len(self.jaccard_similarity_per_field)

    @computed_field  # type: ignore
    @property
    def false_positive_rate(self) -> float:
        return sum(self.false_positive_rate_per_field.values()) / len(self.false_positive_rate_per_field)

    @computed_field  # type: ignore
    @property
    def false_negative_rate(self) -> float:
        return sum(self.false_negative_rate_per_field.values()) / len(self.false_negative_rate_per_field)

    @computed_field  # type: ignore
    @property
    def mismatched_value_rate(self) -> float:
        return sum(self.mismatched_value_rate_per_field.values()) / len(self.mismatched_value_rate_per_field)


def normalized_comparison_metrics(list_analyses: list[ExtractionAnalysis], min_freq: float = 0.2) -> ComparisonMetrics:
    false_positive_counts: dict[str, int] = defaultdict(int)
    false_negative_counts: dict[str, int] = defaultdict(int)
    mismatched_value_counts: dict[str, int] = defaultdict(int)
    common_presence_counts: dict[str, int] = defaultdict(int)
    is_equal_per_field: dict[str, int] = defaultdict(int)

    total_levenshtein_similarity_per_field: dict[str, float] = defaultdict(float)
    total_jaccard_similarity_per_field: dict[str, float] = defaultdict(float)
    false_positive_rate_per_field: dict[str, float] = defaultdict(float)
    false_negative_rate_per_field: dict[str, float] = defaultdict(float)
    mismatched_value_rate_per_field: dict[str, float] = defaultdict(float)

    for analysis in list_analyses:
        # Count false positives
        for error in analysis.comparison.false_positives:
            key = error["key"]
            false_positive_counts[key_normalization(key)] += 1

        # Count false negatives
        for error in analysis.comparison.false_negatives:
            key = error["key"]
            false_negative_counts[key_normalization(key)] += 1

        # Count Wrong Predictions
        for error in analysis.comparison.mismatched_values:
            key = error["key"]
            mismatched_value_counts[key_normalization(key)] += 1

        # Count total errors per field (Levenshtein)
        for key, similarity in analysis.comparison.similarity_levenshtein.items():
            common_presence_counts[key_normalization(key)] += 1
            total_levenshtein_similarity_per_field[key_normalization(key)] += similarity

        for key, is_equal in analysis.comparison.is_equal.items():
            is_equal_per_field[key_normalization(key)] += int(is_equal)

        # Count Jaccard Similarity
        for key, similarity in analysis.comparison.similarity_jaccard.items():
            total_jaccard_similarity_per_field[key_normalization(key)] += similarity

    accuracy_per_field = {
        key: is_equal_per_field[key] / common_presence_counts[key] for key in common_presence_counts if common_presence_counts[key] > int(min_freq * len(list_analyses))
    }
    levenshtein_similarity_per_field = {
        key: total_levenshtein_similarity_per_field[key] / common_presence_counts[key]
        for key in common_presence_counts
        if common_presence_counts[key] > int(min_freq * len(list_analyses))
    }
    jaccard_similarity_per_field = {
        key: total_jaccard_similarity_per_field[key] / common_presence_counts[key]
        for key in common_presence_counts
        if common_presence_counts[key] > int(min_freq * len(list_analyses))
    }
    false_positive_rate_per_field = {
        key: false_positive_counts[key] / common_presence_counts[key] for key in common_presence_counts if common_presence_counts[key] > int(min_freq * len(list_analyses))
    }
    false_negative_rate_per_field = {
        key: false_negative_counts[key] / common_presence_counts[key] for key in common_presence_counts if common_presence_counts[key] > int(min_freq * len(list_analyses))
    }
    mismatched_value_rate_per_field = {
        key: mismatched_value_counts[key] / common_presence_counts[key] for key in common_presence_counts if common_presence_counts[key] > int(min_freq * len(list_analyses))
    }

    return ComparisonMetrics(
        false_positive_counts=false_positive_counts,
        false_negative_counts=false_negative_counts,
        mismatched_value_counts=mismatched_value_counts,
        common_presence_counts=common_presence_counts,
        total_levenshtein_similarity_per_field=total_levenshtein_similarity_per_field,
        total_jaccard_similarity_per_field=total_jaccard_similarity_per_field,
        accuracy_per_field=accuracy_per_field,
        levenshtein_similarity_per_field=levenshtein_similarity_per_field,
        jaccard_similarity_per_field=jaccard_similarity_per_field,
        false_positive_rate_per_field=false_positive_rate_per_field,
        false_negative_rate_per_field=false_negative_rate_per_field,
        mismatched_value_rate_per_field=mismatched_value_rate_per_field,
    )


def plot_metric(
    analysis: ComparisonMetrics,
    value_type: Literal["accuracy", "levenshtein_similarity", "jaccard_similarity", "false_positive_rate", "false_negative_rate", "mismatched_value_rate"] = "accuracy",
    top_n: int = 20,
    ascending: bool = False,
) -> None:
    """Plot a metric from analysis results using a horizontal bar chart.

    Args:
        analysis: ComparisonMetrics object containing the analysis results
    """
    # Create dataframe from accuracy data
    df = pd.DataFrame(list(analysis.__getattribute__(value_type + "_per_field").items()), columns=["field", value_type]).sort_values(by=value_type, ascending=ascending)

    # Filter top n fields with the lowest accuracy
    top_n_df = df.head(top_n)

    # Create the plot
    fig = tpl.figure()
    fig.barh(np.array(top_n_df[value_type]).round(4), np.array(top_n_df["field"]), force_ascii=False)

    fig.show()


def plot_comparison_metrics(analysis: ComparisonMetrics, top_n: int = 20) -> None:
    metric_ascendency_dict: dict[
        Literal["accuracy", "levenshtein_similarity", "jaccard_similarity", "false_positive_rate", "false_negative_rate", "mismatched_value_rate"], bool
    ] = {"accuracy": True, "levenshtein_similarity": True, "jaccard_similarity": True, "false_positive_rate": False, "false_negative_rate": False, "mismatched_value_rate": False}

    print(f"#########################################")
    print(f"############ AVERAGE METRICS ############")
    print(f"#########################################")
    print(f"Accuracy: {analysis.accuracy:.2f}")
    print(f"Levenshtein Similarity: {analysis.levenshtein_similarity:.2f}")
    print(f"Jaccard Similarity (IOU): {analysis.jaccard_similarity:.2f}")
    print(f"False Positive Rate: {analysis.false_positive_rate:.2f}")
    print(f"False Negative Rate: {analysis.false_negative_rate:.2f}")
    print(f"Mismatched Value Rate: {analysis.mismatched_value_rate:.2f}")

    for metric, ascending in metric_ascendency_dict.items():
        print(f"\n\n############ {metric.upper()} ############")
        plot_metric(analysis, metric, top_n, ascending)


def get_aggregation_metrics(metric: dict[str, float | int], _hierarchy_level: int) -> dict[str, float]:
    if _hierarchy_level == 0:
        # For level 0, aggregate all values under empty string key
        return {"": sum(metric.values()) / len(metric)}

    aggregated_metrics: dict[str, list[float | int]] = {}
    for key, value in metric.items():
        # Split key and handle array notation by replacing array indices with '*'
        key_parts: list[str] = []
        for part in key.split('.'):
            if part.isdigit():
                key_parts.append('*')
            else:
                key_parts.append(part)
        actual_depth = len([part for part in key_parts if part != '*'])
        if actual_depth < _hierarchy_level:
            continue

        used_depth = 0
        aggregation_prefix_parts: list[str] = []
        for part in key_parts:
            if part == "*":
                aggregation_prefix_parts.append("*")
            else:
                aggregation_prefix_parts.append(part)
                used_depth += 1
            if used_depth == _hierarchy_level:
                break
        aggregation_prefix = '.'.join(aggregation_prefix_parts)

        # Aggregate metrics
        if aggregation_prefix not in aggregated_metrics:
            aggregated_metrics[aggregation_prefix] = []
        aggregated_metrics[aggregation_prefix].append(value)

    # Calculate averages
    return {key: sum(values) / len(values) if len(values) > 0 else 0 for key, values in aggregated_metrics.items()}


def get_max_depth(metric: dict[str, float | int]) -> int:
    max_depth_upper_bound = max(len(key.split('.')) for key in metric.keys())
    for max_depth in range(max_depth_upper_bound, 0, -1):
        if get_aggregation_metrics(metric, max_depth):
            return max_depth
    return 0


def aggregate_metric_per_hierarchy_level(metric: dict[str, float | int], hierarchy_level: int) -> dict[str, float | int]:
    """Aggregates metrics by grouping and averaging values at a specified hierarchy level in the key structure.

    Args:
        metric: Dictionary mapping hierarchical keys (dot-separated strings) to numeric values.
            Array indices in keys are treated as wildcards.
        hierarchy_level: The depth level at which to aggregate the metrics.
            E.g. level 1 aggregates at first dot separator.

    Returns:
        Dictionary mapping aggregated keys to averaged values. Keys are truncated to the specified
        hierarchy level with array indices replaced by '*'.

    Raises:
        ValueError: If the requested hierarchy level exceeds the maximum depth in the data.
    """
    max_depth = get_max_depth(metric)

    if hierarchy_level > max_depth:
        raise ValueError(f"Hierarchy level {hierarchy_level} is greater than the maximum depth {max_depth}")

    return get_aggregation_metrics(metric, hierarchy_level)