retab-0.0.35-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. retab-0.0.35.dist-info/METADATA +417 -0
  2. retab-0.0.35.dist-info/RECORD +111 -0
  3. retab-0.0.35.dist-info/WHEEL +5 -0
  4. retab-0.0.35.dist-info/top_level.txt +1 -0
  5. uiform/__init__.py +4 -0
  6. uiform/_resource.py +28 -0
  7. uiform/_utils/__init__.py +0 -0
  8. uiform/_utils/ai_models.py +100 -0
  9. uiform/_utils/benchmarking copy.py +588 -0
  10. uiform/_utils/benchmarking.py +485 -0
  11. uiform/_utils/chat.py +332 -0
  12. uiform/_utils/display.py +443 -0
  13. uiform/_utils/json_schema.py +2161 -0
  14. uiform/_utils/mime.py +168 -0
  15. uiform/_utils/responses.py +163 -0
  16. uiform/_utils/stream_context_managers.py +52 -0
  17. uiform/_utils/usage/__init__.py +0 -0
  18. uiform/_utils/usage/usage.py +300 -0
  19. uiform/client.py +701 -0
  20. uiform/py.typed +0 -0
  21. uiform/resources/__init__.py +0 -0
  22. uiform/resources/consensus/__init__.py +3 -0
  23. uiform/resources/consensus/client.py +114 -0
  24. uiform/resources/consensus/completions.py +252 -0
  25. uiform/resources/consensus/completions_stream.py +278 -0
  26. uiform/resources/consensus/responses.py +325 -0
  27. uiform/resources/consensus/responses_stream.py +373 -0
  28. uiform/resources/deployments/__init__.py +9 -0
  29. uiform/resources/deployments/client.py +78 -0
  30. uiform/resources/deployments/endpoints.py +322 -0
  31. uiform/resources/deployments/links.py +452 -0
  32. uiform/resources/deployments/logs.py +211 -0
  33. uiform/resources/deployments/mailboxes.py +496 -0
  34. uiform/resources/deployments/outlook.py +531 -0
  35. uiform/resources/deployments/tests.py +158 -0
  36. uiform/resources/documents/__init__.py +3 -0
  37. uiform/resources/documents/client.py +255 -0
  38. uiform/resources/documents/extractions.py +441 -0
  39. uiform/resources/evals.py +812 -0
  40. uiform/resources/files.py +24 -0
  41. uiform/resources/finetuning.py +62 -0
  42. uiform/resources/jsonlUtils.py +1046 -0
  43. uiform/resources/models.py +45 -0
  44. uiform/resources/openai_example.py +22 -0
  45. uiform/resources/processors/__init__.py +3 -0
  46. uiform/resources/processors/automations/__init__.py +9 -0
  47. uiform/resources/processors/automations/client.py +78 -0
  48. uiform/resources/processors/automations/endpoints.py +317 -0
  49. uiform/resources/processors/automations/links.py +356 -0
  50. uiform/resources/processors/automations/logs.py +211 -0
  51. uiform/resources/processors/automations/mailboxes.py +435 -0
  52. uiform/resources/processors/automations/outlook.py +444 -0
  53. uiform/resources/processors/automations/tests.py +158 -0
  54. uiform/resources/processors/client.py +474 -0
  55. uiform/resources/prompt_optimization.py +76 -0
  56. uiform/resources/schemas.py +369 -0
  57. uiform/resources/secrets/__init__.py +9 -0
  58. uiform/resources/secrets/client.py +20 -0
  59. uiform/resources/secrets/external_api_keys.py +109 -0
  60. uiform/resources/secrets/webhook.py +62 -0
  61. uiform/resources/usage.py +271 -0
  62. uiform/types/__init__.py +0 -0
  63. uiform/types/ai_models.py +645 -0
  64. uiform/types/automations/__init__.py +0 -0
  65. uiform/types/automations/cron.py +58 -0
  66. uiform/types/automations/endpoints.py +21 -0
  67. uiform/types/automations/links.py +28 -0
  68. uiform/types/automations/mailboxes.py +60 -0
  69. uiform/types/automations/outlook.py +68 -0
  70. uiform/types/automations/webhooks.py +21 -0
  71. uiform/types/chat.py +8 -0
  72. uiform/types/completions.py +93 -0
  73. uiform/types/consensus.py +10 -0
  74. uiform/types/db/__init__.py +0 -0
  75. uiform/types/db/annotations.py +24 -0
  76. uiform/types/db/files.py +36 -0
  77. uiform/types/deployments/__init__.py +0 -0
  78. uiform/types/deployments/cron.py +59 -0
  79. uiform/types/deployments/endpoints.py +28 -0
  80. uiform/types/deployments/links.py +36 -0
  81. uiform/types/deployments/mailboxes.py +67 -0
  82. uiform/types/deployments/outlook.py +76 -0
  83. uiform/types/deployments/webhooks.py +21 -0
  84. uiform/types/documents/__init__.py +0 -0
  85. uiform/types/documents/correct_orientation.py +13 -0
  86. uiform/types/documents/create_messages.py +226 -0
  87. uiform/types/documents/extractions.py +297 -0
  88. uiform/types/evals.py +207 -0
  89. uiform/types/events.py +76 -0
  90. uiform/types/extractions.py +85 -0
  91. uiform/types/jobs/__init__.py +0 -0
  92. uiform/types/jobs/base.py +150 -0
  93. uiform/types/jobs/batch_annotation.py +22 -0
  94. uiform/types/jobs/evaluation.py +133 -0
  95. uiform/types/jobs/finetune.py +6 -0
  96. uiform/types/jobs/prompt_optimization.py +41 -0
  97. uiform/types/jobs/webcrawl.py +6 -0
  98. uiform/types/logs.py +231 -0
  99. uiform/types/mime.py +257 -0
  100. uiform/types/modalities.py +68 -0
  101. uiform/types/pagination.py +6 -0
  102. uiform/types/schemas/__init__.py +0 -0
  103. uiform/types/schemas/enhance.py +53 -0
  104. uiform/types/schemas/evaluate.py +55 -0
  105. uiform/types/schemas/generate.py +32 -0
  106. uiform/types/schemas/layout.py +58 -0
  107. uiform/types/schemas/object.py +631 -0
  108. uiform/types/schemas/templates.py +107 -0
  109. uiform/types/secrets/__init__.py +0 -0
  110. uiform/types/secrets/external_api_keys.py +22 -0
  111. uiform/types/standards.py +39 -0
uiform/_utils/benchmarking copy.py
@@ -0,0 +1,588 @@
+ import re
+ import unicodedata
+ from collections import defaultdict
+ from typing import Any, Literal, Optional
+
+ import numpy as np
+ import pandas as pd
+ import termplotlib as tpl  # type: ignore
+ from Levenshtein import distance as levenshtein_distance
+ from pydantic import BaseModel, computed_field
+
+ # The goal is to leverage this piece of code to open a jsonl file and get an analysis of the performance of the model using a one-liner.
+
+
+ ############# BENCHMARKING MODELS #############
+
+
+ class DictionaryComparisonMetrics(BaseModel):
+     # Pure dict comparison
+     unchanged_fields: int
+     total_fields: int
+     is_equal: dict[str, bool]
+     false_positives: list[dict[str, Any]]
+     false_negatives: list[dict[str, Any]]
+     mismatched_values: list[dict[str, Any]]
+     keys_only_on_1: list[str]
+     keys_only_on_2: list[str]
+
+     # Some metrics
+     valid_comparisons: int
+     total_accuracy: float
+     false_positive_rate: float
+     false_negative_rate: float
+     mismatched_value_rate: float
+
+     similarity_levenshtein: dict[str, float]
+     similarity_jaccard: dict[str, float]
+
+     avg_similarity_levenshtein: float
+     avg_similarity_jaccard: float
+     total_similarity_levenshtein: float
+     total_similarity_jaccard: float
+
+
+ def flatten_dict(obj: Any, prefix: str = '') -> dict[str, Any]:
+     items = []  # type: ignore
+     if isinstance(obj, dict):
+         for k, v in obj.items():
+             new_key = f"{prefix}.{k}" if prefix else k
+             items.extend(flatten_dict(v, new_key).items())
+     elif isinstance(obj, list):
+         for i, v in enumerate(obj):
+             new_key = f"{prefix}.{i}"
+             items.extend(flatten_dict(v, new_key).items())
+     else:
+         items.append((prefix, obj))
+     return dict(items)
+
+
+ def normalize_value(val: Any) -> str:
+     """Convert value to uppercase and remove all spacing for comparison."""
+     if val is None:
+         return ""
+     prep = re.sub(r'\s+', '', str(val).upper())
+     # Remove all accents (é -> e, etc.)
+     return unicodedata.normalize('NFKD', prep).encode('ASCII', 'ignore').decode()
+
+
+ def key_normalization(key: str) -> str:
+     """Normalize keys that contain list indexes, so that the same kind of error at different list positions maps to the same key."""
+     # Replace every numeric path segment (i.e. a list index) with "*", e.g. "items.3.price" -> "items.*.price"
+     key_parts = key.split(".")
+     new_key_parts = []
+     for key_part in key_parts:
+         if key_part.isdigit():
+             new_key_parts.append("*")
+         else:
+             new_key_parts.append(key_part)
+     return ".".join(new_key_parts)
+
+
+ def should_ignore_key(
+     key: str, exclude_field_patterns: list[str] | None, include_field_patterns: list[str] | None = None, information_presence_per_field: dict[str, bool] | None = None
+ ) -> bool:
+     if information_presence_per_field and information_presence_per_field.get(key) is False:
+         # If we have the information_presence_per_field dict and the key is marked as false, then we should ignore it
+         should_ignore = True
+     else:
+         # If exclude_field_patterns is None, we should not ignore any key
+         normalized_key = key_normalization(key)
+         should_ignore = any(normalized_key.startswith(key_normalization(pattern)) for pattern in exclude_field_patterns or [])
+
+     if include_field_patterns and not should_ignore:
+         # If include_field_patterns is not None, we should ignore the key if it does not start with any of the include_field_patterns and is not in the exclude_field_patterns
+         should_ignore = not any(normalized_key.startswith(key_normalization(pattern)) for pattern in include_field_patterns)
+
+     return should_ignore
+
+
+ def levenshtein_similarity(val1: Any, val2: Any) -> float:
+     """
+     Calculate similarity between two values using Levenshtein distance.
+     Returns a similarity score between 0.0 and 1.0.
+     """
+     # Handle None/empty and general cases
+     if (val1 or "") == (val2 or ""):
+         return 1.0
+
+     # Check if both values are numeric, compare with 5% tolerance
+     if isinstance(val1, (int, float)) and isinstance(val2, (int, float)):
+         return 1.0 if abs(val1 - val2) <= 0.05 * max(abs(val1), abs(val2)) else 0.0
+
+     # Convert to normalized strings
+     str1 = normalize_value(val1)
+     str2 = normalize_value(val2)
+
+     if str1 == str2:
+         return 1.0
+
+     # Calculate Levenshtein distance
+     if str1 and str2:  # Only if both strings are non-empty
+         max_len = max(len(str1), len(str2))
+         if max_len == 0:
+             return 1.0
+
+         dist = levenshtein_distance(str1, str2)
+         return 1 - (dist / max_len)
+
+     return 0.0
+
+
+ def jaccard_similarity(val1: Any, val2: Any) -> float:
+     """
+     Calculate Jaccard similarity between two values.
+     Returns a similarity score between 0.0 and 1.0.
+     """
+     # Handle None/empty and general cases
+     if (val1 or "") == (val2 or ""):
+         return 1.0
+
+     # Check if both values are numeric, compare with 5% tolerance
+     if isinstance(val1, (int, float)) and isinstance(val2, (int, float)):
+         return 1.0 if abs(val1 - val2) <= 0.05 * max(abs(val1), abs(val2)) else 0.0
+
+     # Convert to normalized strings and split into words
+     str1 = set(normalize_value(val1).split())
+     str2 = set(normalize_value(val2).split())
+
+     if not str1 and not str2:
+         return 1.0
+
+     # Calculate Jaccard similarity
+     intersection = len(str1.intersection(str2))
+     union = len(str1.union(str2))
+
+     return intersection / union if union > 0 else 0.0
+
+
+ def compare_dicts(
+     ground_truth: dict[str, Any],
+     prediction: dict[str, Any],
+     include_fields: list[str] | None = None,
+     exclude_fields: list[str] | None = None,
+     information_presence_per_field: dict[str, bool] | None = None,
+     levenshtein_threshold: float = 0.0,  # 0.0 means exact match
+ ) -> DictionaryComparisonMetrics:
+     flat_ground_truth = flatten_dict(ground_truth)
+     flat_prediction = flatten_dict(prediction)
+
+     flat_ground_truth = {k: v for k, v in flat_ground_truth.items() if not should_ignore_key(k, exclude_fields, include_fields, information_presence_per_field)}
+     flat_prediction = {k: v for k, v in flat_prediction.items() if not should_ignore_key(k, exclude_fields, include_fields, information_presence_per_field)}
+
+     keys_ground_truth = set(flat_ground_truth.keys())
+     keys_prediction = set(flat_prediction.keys())
+     common_keys = keys_ground_truth & keys_prediction
+
+     keys_only_on_1 = sorted(list(keys_ground_truth - keys_prediction))
+     keys_only_on_2 = sorted(list(keys_prediction - keys_ground_truth))
+
+     total_fields = len(common_keys)
+     unchanged_fields = 0
+     is_equal_per_field = {}
+
+     false_positives = []
+     false_negatives = []
+     mismatched_values = []
+
+     total_similarity_levenshtein = 0.0
+     total_similarity_jaccard = 0.0
+     similarity_levenshtein_per_field = {}
+     similarity_jaccard_per_field = {}
+
+     valid_comparisons = 0
+
+     for key in common_keys:
+         llm_value = flat_ground_truth[key]
+         extraction_value = flat_prediction[key]
+
+         coerced_llm_value = llm_value or ""
+         coerced_extraction_value = extraction_value or ""
+
+         similarity_lev = levenshtein_similarity(llm_value, extraction_value)
+         similarity_jac = jaccard_similarity(llm_value, extraction_value)
+         # print("Jaccard similarity", similarity_jac)
+
+         # Use Levenshtein for equality comparison (you can adjust this if needed)
+         is_equal = similarity_lev >= (1 - levenshtein_threshold)
+
+         similarity_levenshtein_per_field[key] = similarity_lev
+         similarity_jaccard_per_field[key] = similarity_jac
+         is_equal_per_field[key] = is_equal
+
+         # Only count non-empty comparisons for average similarity
+         if coerced_llm_value != "" and coerced_extraction_value != "":
+             total_similarity_levenshtein += similarity_lev
+             total_similarity_jaccard += similarity_jac
+             valid_comparisons += 1
+
+         if is_equal:
+             unchanged_fields += 1
+         else:
+             if coerced_llm_value != "" and coerced_extraction_value == "":
+                 false_positives.append({"key": key, "expected": extraction_value, "got": llm_value, "similarity": similarity_lev})
+             elif coerced_llm_value == "" and coerced_extraction_value != "":
+                 false_negatives.append({"key": key, "expected": extraction_value, "got": llm_value, "similarity": similarity_lev})
+             elif coerced_llm_value != "" and coerced_extraction_value != "":
+                 # Both are non-empty but not equal
+                 mismatched_values.append({"key": key, "expected": extraction_value, "got": llm_value, "similarity": similarity_lev})
+     # Some metrics
+     avg_similarity_levenshtein = total_similarity_levenshtein / valid_comparisons if valid_comparisons > 0 else 1.0
+     avg_similarity_jaccard = total_similarity_jaccard / valid_comparisons if valid_comparisons > 0 else 1.0
+     total_accuracy = unchanged_fields / total_fields if total_fields > 0 else 1.0
+     false_positive_rate = len(false_positives) / total_fields if total_fields > 0 else 0.0
+     false_negative_rate = len(false_negatives) / total_fields if total_fields > 0 else 0.0
+     mismatched_value_rate = len(mismatched_values) / total_fields if total_fields > 0 else 0.0
+
+     return DictionaryComparisonMetrics(
+         unchanged_fields=unchanged_fields,
+         total_fields=total_fields,
+         is_equal=is_equal_per_field,
+         false_positives=false_positives,
+         false_negatives=false_negatives,
+         mismatched_values=mismatched_values,
+         keys_only_on_1=keys_only_on_1,
+         keys_only_on_2=keys_only_on_2,
+         valid_comparisons=valid_comparisons,
+         total_accuracy=total_accuracy,
+         false_positive_rate=false_positive_rate,
+         false_negative_rate=false_negative_rate,
+         mismatched_value_rate=mismatched_value_rate,
+         similarity_levenshtein=similarity_levenshtein_per_field,
+         similarity_jaccard=similarity_jaccard_per_field,
+         avg_similarity_levenshtein=avg_similarity_levenshtein,
+         avg_similarity_jaccard=avg_similarity_jaccard,
+         total_similarity_levenshtein=total_similarity_levenshtein,
+         total_similarity_jaccard=total_similarity_jaccard,
+     )
+
+
+ class ExtractionAnalysis(BaseModel):
+     ground_truth: dict[str, Any]
+     prediction: dict[str, Any]
+     time_spent: Optional[float] = None
+     include_fields: list[str] | None = None
+     exclude_fields: list[str] | None = None
+     information_presence_per_field: dict[str, bool] | None = None
+     levenshtein_threshold: float = 0.0
+
+     @computed_field  # type: ignore
+     @property
+     def comparison(self) -> DictionaryComparisonMetrics:
+         return compare_dicts(
+             self.ground_truth,
+             self.prediction,
+             include_fields=self.include_fields,
+             exclude_fields=self.exclude_fields,
+             information_presence_per_field=self.information_presence_per_field,
+             levenshtein_threshold=self.levenshtein_threshold,
+         )
+
+
+ class BenchmarkMetrics(BaseModel):
+     ai_model: str
+     accuracy: float
+     levenshtein_similarity: float
+     jaccard_similarity: float
+     false_positive_rate: float
+     false_negative_rate: float
+     mismatched_value_rate: float
+
+
+ from rich.console import Console
+ from rich.table import Table
+
+
+ def display_benchmark_metrics(benchmark_metrics: list[BenchmarkMetrics]) -> None:
+     """
+     Display benchmark metrics for multiple models in a formatted table.
+
+     Args:
+         benchmark_metrics: List of BenchmarkMetrics objects containing model performance data
+     """
+     console = Console(style="on grey23")
+     table = Table(title="Model Benchmark Comparison", show_lines=True)
+
+     # Add columns
+     table.add_column("Model", justify="left", style="#BDE8F6", no_wrap=True)
+     table.add_column("Accuracy", justify="right", style="#C2BDF6")
+     table.add_column("Levenshtein", justify="right", style="#F6BDBD")
+     table.add_column("Jaccard", justify="right", style="#F6E4BD")
+     table.add_column("False Positive Rate", justify="right", style="#BDF6C0")
+     table.add_column("False Negative Rate", justify="right", style="#F6BDE4")
+     table.add_column("Mismatched Value Rate", justify="right", style="#E4F6BD")
+
+     # Find best values for each metric
+     best_values = {
+         'accuracy': max(m.accuracy for m in benchmark_metrics),
+         'levenshtein': max(m.levenshtein_similarity for m in benchmark_metrics),
+         'jaccard': max(m.jaccard_similarity for m in benchmark_metrics),
+         'fp_rate': min(m.false_positive_rate for m in benchmark_metrics),
+         'fn_rate': min(m.false_negative_rate for m in benchmark_metrics),
+         'mismatch_rate': min(m.mismatched_value_rate for m in benchmark_metrics),
+     }
+
+     # Add rows for each model's metrics
+     for metrics in benchmark_metrics:
+         table.add_row(
+             metrics.ai_model,
+             f"[bold]{metrics.accuracy:.3f}[/bold]" if metrics.accuracy == best_values['accuracy'] else f"[dim]{metrics.accuracy:.3f}[/dim]",
+             f"[bold]{metrics.levenshtein_similarity:.3f}[/bold]"
+             if metrics.levenshtein_similarity == best_values['levenshtein']
+             else f"[dim]{metrics.levenshtein_similarity:.3f}[/dim]",
+             f"[bold]{metrics.jaccard_similarity:.3f}[/bold]" if metrics.jaccard_similarity == best_values['jaccard'] else f"[dim]{metrics.jaccard_similarity:.3f}[/dim]",
+             f"[bold]{metrics.false_positive_rate:.3f}[/bold]" if metrics.false_positive_rate == best_values['fp_rate'] else f"[dim]{metrics.false_positive_rate:.3f}[/dim]",
+             f"[bold]{metrics.false_negative_rate:.3f}[/bold]" if metrics.false_negative_rate == best_values['fn_rate'] else f"[dim]{metrics.false_negative_rate:.3f}[/dim]",
+             f"[bold]{metrics.mismatched_value_rate:.3f}[/bold]"
+             if metrics.mismatched_value_rate == best_values['mismatch_rate']
+             else f"[dim]{metrics.mismatched_value_rate:.3f}[/dim]",
+         )
+
+     # Print the table
+     console.print(table)
+
+
+ class ComparisonMetrics(BaseModel):
+     # Total Values (count or sum) per Field
+     false_positive_counts: dict[str, int] = defaultdict(int)
+     false_positive_rate_per_field: dict[str, float] = defaultdict(float)
+
+     false_negative_counts: dict[str, int] = defaultdict(int)
+     false_negative_rate_per_field: dict[str, float] = defaultdict(float)
+
+     mismatched_value_counts: dict[str, int] = defaultdict(int)
+     mismatched_value_rate_per_field: dict[str, float] = defaultdict(float)
+
+     common_presence_counts: dict[str, int] = defaultdict(int)
+     accuracy_per_field: dict[str, float] = defaultdict(float)
+
+     jaccard_similarity_per_field: dict[str, float] = defaultdict(float)
+     total_jaccard_similarity_per_field: dict[str, float] = defaultdict(float)
+
+     levenshtein_similarity_per_field: dict[str, float] = defaultdict(float)
+     total_levenshtein_similarity_per_field: dict[str, float] = defaultdict(float)
+
+     @computed_field  # type: ignore
+     @property
+     def accuracy(self) -> float:
+         return sum(self.accuracy_per_field.values()) / len(self.accuracy_per_field)
+
+     @computed_field  # type: ignore
+     @property
+     def levenshtein_similarity(self) -> float:
+         return sum(self.levenshtein_similarity_per_field.values()) / len(self.levenshtein_similarity_per_field)
+
+     @computed_field  # type: ignore
+     @property
+     def jaccard_similarity(self) -> float:
+         return sum(self.jaccard_similarity_per_field.values()) / len(self.jaccard_similarity_per_field)
+
+     @computed_field  # type: ignore
+     @property
+     def false_positive_rate(self) -> float:
+         return sum(self.false_positive_rate_per_field.values()) / len(self.false_positive_rate_per_field)
+
+     @computed_field  # type: ignore
+     @property
+     def false_negative_rate(self) -> float:
+         return sum(self.false_negative_rate_per_field.values()) / len(self.false_negative_rate_per_field)
+
+     @computed_field  # type: ignore
+     @property
+     def mismatched_value_rate(self) -> float:
+         return sum(self.mismatched_value_rate_per_field.values()) / len(self.mismatched_value_rate_per_field)
+
+
+ def normalized_comparison_metrics(list_analyses: list[ExtractionAnalysis], min_freq: float = 0.2) -> ComparisonMetrics:
+     false_positive_counts: dict[str, int] = defaultdict(int)
+     false_negative_counts: dict[str, int] = defaultdict(int)
+     mismatched_value_counts: dict[str, int] = defaultdict(int)
+     common_presence_counts: dict[str, int] = defaultdict(int)
+     is_equal_per_field: dict[str, int] = defaultdict(int)
+
+     total_levenshtein_similarity_per_field: dict[str, float] = defaultdict(float)
+     total_jaccard_similarity_per_field: dict[str, float] = defaultdict(float)
+     false_positive_rate_per_field: dict[str, float] = defaultdict(float)
+     false_negative_rate_per_field: dict[str, float] = defaultdict(float)
+     mismatched_value_rate_per_field: dict[str, float] = defaultdict(float)
+
+     for analysis in list_analyses:
+         # Count false positives
+         for error in analysis.comparison.false_positives:
+             key = error["key"]
+             false_positive_counts[key_normalization(key)] += 1
+
+         # Count false negatives
+         for error in analysis.comparison.false_negatives:
+             key = error["key"]
+             false_negative_counts[key_normalization(key)] += 1
+
+         # Count Wrong Predictions
+         for error in analysis.comparison.mismatched_values:
+             key = error["key"]
+             mismatched_value_counts[key_normalization(key)] += 1
+
+         # Count total errors per field (Levenshtein)
+         for key, similarity in analysis.comparison.similarity_levenshtein.items():
+             common_presence_counts[key_normalization(key)] += 1
+             total_levenshtein_similarity_per_field[key_normalization(key)] += similarity
+
+         for key, is_equal in analysis.comparison.is_equal.items():
+             is_equal_per_field[key_normalization(key)] += int(is_equal)
+
+         # Count Jaccard Similarity
+         for key, similarity in analysis.comparison.similarity_jaccard.items():
+             total_jaccard_similarity_per_field[key_normalization(key)] += similarity
+
+     accuracy_per_field = {
+         key: is_equal_per_field[key] / common_presence_counts[key] for key in common_presence_counts if common_presence_counts[key] > int(min_freq * len(list_analyses))
+     }
+     levenshtein_similarity_per_field = {
+         key: total_levenshtein_similarity_per_field[key] / common_presence_counts[key]
+         for key in common_presence_counts
+         if common_presence_counts[key] > int(min_freq * len(list_analyses))
+     }
+     jaccard_similarity_per_field = {
+         key: total_jaccard_similarity_per_field[key] / common_presence_counts[key]
+         for key in common_presence_counts
+         if common_presence_counts[key] > int(min_freq * len(list_analyses))
+     }
+     false_positive_rate_per_field = {
+         key: false_positive_counts[key] / common_presence_counts[key] for key in common_presence_counts if common_presence_counts[key] > int(min_freq * len(list_analyses))
+     }
+     false_negative_rate_per_field = {
+         key: false_negative_counts[key] / common_presence_counts[key] for key in common_presence_counts if common_presence_counts[key] > int(min_freq * len(list_analyses))
+     }
+     mismatched_value_rate_per_field = {
+         key: mismatched_value_counts[key] / common_presence_counts[key] for key in common_presence_counts if common_presence_counts[key] > int(min_freq * len(list_analyses))
+     }
+
+     return ComparisonMetrics(
+         false_positive_counts=false_positive_counts,
+         false_negative_counts=false_negative_counts,
+         mismatched_value_counts=mismatched_value_counts,
+         common_presence_counts=common_presence_counts,
+         total_levenshtein_similarity_per_field=total_levenshtein_similarity_per_field,
+         total_jaccard_similarity_per_field=total_jaccard_similarity_per_field,
+         accuracy_per_field=accuracy_per_field,
+         levenshtein_similarity_per_field=levenshtein_similarity_per_field,
+         jaccard_similarity_per_field=jaccard_similarity_per_field,
+         false_positive_rate_per_field=false_positive_rate_per_field,
+         false_negative_rate_per_field=false_negative_rate_per_field,
+         mismatched_value_rate_per_field=mismatched_value_rate_per_field,
+     )
+
+
+ def plot_metric(
+     analysis: ComparisonMetrics,
+     value_type: Literal["accuracy", "levenshtein_similarity", "jaccard_similarity", "false_positive_rate", "false_negative_rate", "mismatched_value_rate"] = "accuracy",
+     top_n: int = 20,
+     ascending: bool = False,
+ ) -> None:
+     """Plot a metric from analysis results using a horizontal bar chart.
+
+     Args:
+         analysis: ComparisonMetrics object containing the analysis results
+     """
+     # Create dataframe from accuracy data
+     df = pd.DataFrame(list(analysis.__getattribute__(value_type + "_per_field").items()), columns=["field", value_type]).sort_values(by=value_type, ascending=ascending)
+
+     # Filter top n fields with the lowest accuracy
+     top_n_df = df.head(top_n)
+
+     # Create the plot
+     fig = tpl.figure()
+     fig.barh(np.array(top_n_df[value_type]).round(4), np.array(top_n_df["field"]), force_ascii=False)
+
+     fig.show()
+
+
+ def plot_comparison_metrics(analysis: ComparisonMetrics, top_n: int = 20) -> None:
+     metric_ascendency_dict: dict[
+         Literal["accuracy", "levenshtein_similarity", "jaccard_similarity", "false_positive_rate", "false_negative_rate", "mismatched_value_rate"], bool
+     ] = {"accuracy": True, "levenshtein_similarity": True, "jaccard_similarity": True, "false_positive_rate": False, "false_negative_rate": False, "mismatched_value_rate": False}
+
+     print(f"#########################################")
+     print(f"############ AVERAGE METRICS ############")
+     print(f"#########################################")
+     print(f"Accuracy: {analysis.accuracy:.2f}")
+     print(f"Levenshtein Similarity: {analysis.levenshtein_similarity:.2f}")
+     print(f"Jaccard Similarity (IOU): {analysis.jaccard_similarity:.2f}")
+     print(f"False Positive Rate: {analysis.false_positive_rate:.2f}")
+     print(f"False Negative Rate: {analysis.false_negative_rate:.2f}")
+     print(f"Mismatched Value Rate: {analysis.mismatched_value_rate:.2f}")
+
+     for metric, ascending in metric_ascendency_dict.items():
+         print(f"\n\n############ {metric.upper()} ############")
+         plot_metric(analysis, metric, top_n, ascending)
+
+
+ def get_aggregation_metrics(metric: dict[str, float | int], _hierarchy_level: int) -> dict[str, float]:
+     if _hierarchy_level == 0:
+         # For level 0, aggregate all values under empty string key
+         return {"": sum(metric.values()) / len(metric)}
+
+     aggregated_metrics: dict[str, list[float | int]] = {}
+     for key, value in metric.items():
+         # Split key and handle array notation by replacing array indices with '*'
+         key_parts: list[str] = []
+         for part in key.split('.'):
+             if part.isdigit():
+                 key_parts.append('*')
+             else:
+                 key_parts.append(part)
+         actual_depth = len([part for part in key_parts if part != '*'])
+         if actual_depth < _hierarchy_level:
+             continue
+
+         used_depth = 0
+         aggregation_prefix_parts: list[str] = []
+         for part in key_parts:
+             if part == "*":
+                 aggregation_prefix_parts.append("*")
+             else:
+                 aggregation_prefix_parts.append(part)
+                 used_depth += 1
+             if used_depth == _hierarchy_level:
+                 break
+         aggregation_prefix = '.'.join(aggregation_prefix_parts)
+
+         # Aggregate metrics
+         if aggregation_prefix not in aggregated_metrics:
+             aggregated_metrics[aggregation_prefix] = []
+         aggregated_metrics[aggregation_prefix].append(value)
+
+     # Calculate averages
+     return {key: sum(values) / len(values) if len(values) > 0 else 0 for key, values in aggregated_metrics.items()}
+
+
+ def get_max_depth(metric: dict[str, float | int]) -> int:
+     max_depth_upper_bound = max(len(key.split('.')) for key in metric.keys())
+     for max_depth in range(max_depth_upper_bound, 0, -1):
+         if get_aggregation_metrics(metric, max_depth):
+             return max_depth
+     return 0
+
+
+ def aggregate_metric_per_hierarchy_level(metric: dict[str, float | int], hierarchy_level: int) -> dict[str, float | int]:
+     """Aggregates metrics by grouping and averaging values at a specified hierarchy level in the key structure.
+
+     Args:
+         metric: Dictionary mapping hierarchical keys (dot-separated strings) to numeric values.
+             Array indices in keys are treated as wildcards.
+         hierarchy_level: The depth level at which to aggregate the metrics.
+             E.g. level 1 aggregates at first dot separator.
+
+     Returns:
+         Dictionary mapping aggregated keys to averaged values. Keys are truncated to the specified
+         hierarchy level with array indices replaced by '*'.
+
+     Raises:
+         ValueError: If the requested hierarchy level exceeds the maximum depth in the data.
+     """
+     max_depth = get_max_depth(metric)
+
+     if hierarchy_level > max_depth:
+         raise ValueError(f"Hierarchy level {hierarchy_level} is greater than the maximum depth {max_depth}")
+
+     return get_aggregation_metrics(metric, hierarchy_level)
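
Example usage (not part of the packaged file above): a minimal sketch of how the pieces defined in this module could be combined, assuming the same definitions are importable from uiform._utils.benchmarking (the non-"copy" sibling listed in the file table) and using made-up ground-truth/prediction dictionaries.

# Illustrative sketch only; assumes uiform._utils.benchmarking exposes the same
# classes and helpers shown in the diff above, and uses hypothetical example data.
from uiform._utils.benchmarking import (
    ExtractionAnalysis,
    normalized_comparison_metrics,
    plot_comparison_metrics,
)

# One analysis per document: ground truth vs. model prediction (made-up values).
analyses = [
    ExtractionAnalysis(
        ground_truth={"invoice": {"total": 100.0, "currency": "EUR"}},
        prediction={"invoice": {"total": 101.0, "currency": "EUR"}},
    ),
    ExtractionAnalysis(
        ground_truth={"invoice": {"total": 50.0, "currency": "USD"}},
        prediction={"invoice": {"total": 50.0, "currency": None}},
    ),
]

# Aggregate per-field accuracy, similarity, and error rates across all analyses.
metrics = normalized_comparison_metrics(analyses)
print(f"accuracy={metrics.accuracy:.3f}  levenshtein={metrics.levenshtein_similarity:.3f}")

# Render the per-field breakdown as terminal bar charts.
plot_comparison_metrics(metrics, top_n=10)

Once aggregate numbers are available for several models, BenchmarkMetrics and display_benchmark_metrics can be used in the same way to compare them side by side in a single table.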