levelapp 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of levelapp might be problematic.
- levelapp/__init__.py +0 -0
- levelapp/aspects/__init__.py +8 -0
- levelapp/aspects/loader.py +253 -0
- levelapp/aspects/logger.py +59 -0
- levelapp/aspects/monitor.py +614 -0
- levelapp/aspects/sanitizer.py +168 -0
- levelapp/clients/__init__.py +119 -0
- levelapp/clients/anthropic.py +112 -0
- levelapp/clients/ionos.py +116 -0
- levelapp/clients/mistral.py +106 -0
- levelapp/clients/openai.py +102 -0
- levelapp/comparator/__init__.py +5 -0
- levelapp/comparator/comparator.py +232 -0
- levelapp/comparator/extractor.py +108 -0
- levelapp/comparator/schemas.py +61 -0
- levelapp/comparator/scorer.py +271 -0
- levelapp/comparator/utils.py +136 -0
- levelapp/config/__init__.py +5 -0
- levelapp/config/endpoint.py +190 -0
- levelapp/config/prompts.py +35 -0
- levelapp/core/__init__.py +0 -0
- levelapp/core/base.py +386 -0
- levelapp/core/session.py +214 -0
- levelapp/evaluator/__init__.py +3 -0
- levelapp/evaluator/evaluator.py +265 -0
- levelapp/metrics/__init__.py +67 -0
- levelapp/metrics/embedding.py +2 -0
- levelapp/metrics/exact.py +182 -0
- levelapp/metrics/fuzzy.py +80 -0
- levelapp/metrics/token.py +103 -0
- levelapp/plugins/__init__.py +0 -0
- levelapp/repository/__init__.py +3 -0
- levelapp/repository/firestore.py +282 -0
- levelapp/simulator/__init__.py +3 -0
- levelapp/simulator/schemas.py +89 -0
- levelapp/simulator/simulator.py +441 -0
- levelapp/simulator/utils.py +201 -0
- levelapp/workflow/__init__.py +5 -0
- levelapp/workflow/base.py +113 -0
- levelapp/workflow/factory.py +51 -0
- levelapp/workflow/registration.py +6 -0
- levelapp/workflow/schemas.py +121 -0
- levelapp-0.1.0.dist-info/METADATA +254 -0
- levelapp-0.1.0.dist-info/RECORD +46 -0
- levelapp-0.1.0.dist-info/WHEEL +4 -0
- levelapp-0.1.0.dist-info/licenses/LICENSE +0 -0
@@ -0,0 +1,102 @@
"""levelapp/clients/openai.py"""
import os

from typing import Dict, Any
from levelapp.core.base import BaseChatClient


class OpenAIClient(BaseChatClient):
    """
    Client for interacting with OpenAI's Chat Completions API.

    This implementation adapts requests and responses to the OpenAI API
    format, including chat message structure, headers, and token usage reporting.

    Attributes:
        model (str): Target model ID (default: "gpt-4o-mini").
        base_url (str): Base endpoint for OpenAI API (default: https://api.openai.com/v1).
        api_key (str): Authentication token for the OpenAI API.
        max_tokens (int): Maximum tokens allowed in the completion.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.model = kwargs.get('model') or "gpt-4o-mini"
        self.base_url = kwargs.get('base_url') or "https://api.openai.com/v1"
        self.api_key = kwargs.get('api_key') or os.environ.get('OPENAI_API_KEY')
        self.max_tokens = kwargs.get('max_tokens') or 1024

        if not self.api_key:
            raise ValueError("OpenAI API key not set")

    @property
    def endpoint_path(self) -> str:
        """
        API-specific endpoint path for chat completions.

        Returns:
            str: "/chat/completions"
        """
        return "/chat/completions"

    def _build_endpoint(self) -> str:
        """
        Construct the full API endpoint URL.

        Returns:
            str: Concatenation of base_url and endpoint_path.
        """
        return f"{self.base_url}/{self.endpoint_path.lstrip('/')}"

    def _build_headers(self) -> Dict[str, str]:
        """
        Build HTTP headers for the OpenAI API request.

        Returns:
            Dict[str, str]: Headers with authentication and content type.
        """
        return {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

    def _build_payload(self, message: str) -> Dict[str, Any]:
        """
        Construct the JSON payload for the OpenAI Chat Completions API.

        Args:
            message (str): User input or prompt to evaluate.

        Returns:
            Dict[str, Any]: Payload containing model ID, messages, and token limit.
        """
        return {
            "model": self.model,
            "messages": [{"role": "user", "content": message}],
            "max_tokens": self.max_tokens,
        }

    def parse_response(self, response: Dict[str, Any]) -> Dict[str, Any]:
        """
        Parse and normalize the OpenAI API response.

        - Extracts text output from `choices[0].message.content`.
        - Attempts to JSON-parse the result if it contains structured content.
        - Collects token usage metadata from `usage`.

        Args:
            response (Dict[str, Any]): Raw JSON response from OpenAI.

        Returns:
            Dict[str, Any]: {
                "output": Parsed model output (dict or str),
                "metadata": {
                    "input_tokens": int,
                    "output_tokens": int
                }
            }
        """
        input_tokens = response.get("usage", {}).get("prompt_tokens", 0)
        output_tokens = response.get("usage", {}).get("completion_tokens", 0)
        output = response.get("choices", [{}])[0].get("message", {}).get("content", "")
        parsed = self.sanitizer.safe_load_json(text=output)
        return {"output": parsed, "metadata": {"input_tokens": input_tokens, "output_tokens": output_tokens}}
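For reference, a minimal sketch of the response shape parse_response() consumes. The raw_response literal below is illustrative only (not from the package), and the sanitizer step from BaseChatClient is skipped.

# Hypothetical Chat Completions response; field names match what parse_response() reads.
raw_response = {
    "choices": [{"message": {"content": '{"verdict": "pass"}'}}],
    "usage": {"prompt_tokens": 42, "completion_tokens": 7},
}

# Same field access as OpenAIClient.parse_response, minus the JSON sanitizing step.
output = raw_response.get("choices", [{}])[0].get("message", {}).get("content", "")
usage = raw_response.get("usage", {})
print(output)                                               # '{"verdict": "pass"}'
print(usage["prompt_tokens"], usage["completion_tokens"])   # 42 7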
@@ -0,0 +1,232 @@
"""'comparator/service.py':"""
from collections.abc import Mapping
from typing import Any, Dict, List, Tuple, Literal

from pydantic import BaseModel

from levelapp.core.base import BaseProcess
from levelapp.comparator.extractor import DataExtractor
from levelapp.comparator.scorer import MetricsManager, ComparisonResults
from levelapp.comparator.schemas import EntityMetric, SetMetric, MetricConfig
from levelapp.comparator.utils import format_evaluation_results


class MetadataComparator(BaseProcess):
    """Metadata comparator component."""

    def __init__(
        self,
        reference: BaseModel | None = None,
        generated: BaseModel | None = None,
        metrics_manager: MetricsManager | None = None,
    ):
        """
        Initialize the MetadataComparator.

        Args:
            reference (BaseModel): Reference BaseModel
            generated (BaseModel): Extracted BaseModel
            metrics_manager (MetricsManager): MetricsManager
        """
        self.extractor = DataExtractor()

        self._reference = reference
        self._generated = generated
        self._metrics_manager = metrics_manager

        self._evaluation_data: List[
            Tuple[str, list[str], list[str], Any, Any, Any, Any, float]
        ] = []

    @property
    def reference_data(self) -> BaseModel:
        return self._reference

    @property
    def generated_data(self) -> BaseModel:
        return self._generated

    @property
    def metrics_manager(self) -> MetricsManager:
        return self._metrics_manager

    @reference_data.setter
    def reference_data(self, value: BaseModel):
        self._reference = value

    @generated_data.setter
    def generated_data(self, value: BaseModel):
        self._generated = value

    @metrics_manager.setter
    def metrics_manager(self, value: MetricsManager):
        self._metrics_manager = value

    def _get_score(self, field: str) -> Tuple[EntityMetric, SetMetric, float]:
        """
        Retrieve the scoring metric and threshold for a given field.

        Args:
            field: The field for which to retrieve the metric and threshold.

        Returns:
            A tuple containing the scoring metric and its threshold.
        """
        if self._metrics_manager:
            config = self._metrics_manager.get_metrics_config(field=field)
        else:
            config = MetricConfig()

        return config.entity_metric, config.set_metric, config.threshold

    def _format_results(
        self,
        output_type: Literal["json", "csv"] = "json"
    ) -> Dict[int, Any]:
        """
        Format the internal evaluation data for reporting or storage.

        Args:
            output_type: 'json' returns a list of dictionaries; 'csv' returns a DataFrame.

        Returns:
            Formatted evaluation results or None if no data.
        """
        formatted_results = format_evaluation_results(self._evaluation_data, output_type=output_type)

        return dict(enumerate(formatted_results))

    def evaluate(
        self,
        reference_list: List[str],
        extracted_list: List[str],
        entity_metric: EntityMetric,
        set_metric: SetMetric,
        threshold: float,
    ) -> ComparisonResults:
        """
        Evaluates pairwise similarity between elements in two lists using fuzzy matching.

        Args:
            reference_list: Ground-truth list of strings.
            extracted_list: Extracted list of strings to compare.
            entity_metric (EntityMetric): entity-level comparison metric.
            set_metric (SetMetric): set-level comparison metric.
            threshold: Similarity threshold (0–100) for considering a match.

        Returns:
            A dict with accuracy, precision, recall, and F1-score.
        """
        if not (reference_list or extracted_list):
            return ComparisonResults("", "", entity_metric.value, None, set_metric.value, None)

        scores = self._metrics_manager.compute_entity_scores(
            reference_seq=reference_list,
            extracted_seq=extracted_list,
            scorer=entity_metric,
            pairwise=False
        )

        return self._metrics_manager.compute_set_scores(
            data=scores,
            scorer=set_metric,
            threshold=threshold,
        )

    def _recursive_compare(
        self,
        ref_node: Any,
        ext_node: Any,
        results: Dict[str, Dict[str, float]],
        prefix: str = "",
        threshold: float = 99.0,
    ) -> None:
        """
        Recursively compare extracted vs. reference metadata nodes.

        Args:
            ref_node: dict or list (from deep_extract reference metadata)
            ext_node: dict or list (from deep_extract extracted metadata)
            results: Dict to accumulate comp_results keyed by hierarchical attribute paths.
            prefix: str, current path prefix to form hierarchical keys.
        """
        # Case 1: Both nodes are dicts -> recurse on keys
        if isinstance(ref_node, Mapping) and isinstance(ext_node, Mapping):
            all_keys = set(ref_node.keys()) | set(ext_node.keys())
            for key in all_keys:
                new_prefix = f"{prefix}.{key}" if prefix else key
                ref_subnode = ref_node.get(key, [])
                ext_subnode = ext_node.get(key, [])
                self._recursive_compare(
                    ref_node=ref_subnode,
                    ext_node=ext_subnode,
                    results=results,
                    prefix=new_prefix,
                    threshold=threshold,
                )

        # Case 2: Leaf nodes (lists) -> evaluate directly
        else:
            # Defensive: convert to list if not list
            ref_list = ref_node if isinstance(ref_node, list) else [ref_node]
            ext_list = ext_node if isinstance(ext_node, list) else [ext_node]

            # Convert all to strings for consistent fuzzy matching
            ref_list_str = list(map(str, ref_list))
            ext_list_str = list(map(str, ext_list))

            entity_metric_, set_metric_, threshold = self._get_score(field=prefix)

            # Evaluate similarity metrics
            comp_results = self.evaluate(
                reference_list=ref_list_str,
                extracted_list=ext_list_str,
                entity_metric=entity_metric_,
                set_metric=set_metric_,
                threshold=threshold,
            )

            if comp_results:
                self._evaluation_data.append(
                    (
                        prefix,
                        ref_list_str,
                        ext_list_str,
                        comp_results.e_metric,
                        comp_results.e_score,
                        comp_results.s_metric,
                        comp_results.s_score,
                        threshold,
                    )
                )

            results[prefix] = comp_results or {"accuracy": 0}

    def run(self, indexed_mode: bool = False) -> Dict[int, Any]:
        """
        Launch a metadata comparison process between reference and extracted data.

        Args:
            indexed_mode: Flag to use indexed mode for metadata extraction.

        Returns:
            Dictionary with comparison results, keyed by attribute paths.
        """
        self._evaluation_data.clear()

        ref_data = self.extractor.deep_extract(model=self.reference_data, indexed=indexed_mode)
        ext_data = self.extractor.deep_extract(model=self.generated_data, indexed=indexed_mode)

        results: Dict[str, Dict[str, float]] = {}

        self._recursive_compare(
            ref_node=ref_data,
            ext_node=ext_data,
            results=results,
            prefix="",
            threshold=1,
        )

        formatted_results = self._format_results()

        return formatted_results
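As a rough illustration of how these pieces fit together, here is a hypothetical usage sketch. The Invoice model is made up, and the import paths as well as MetricsManager's no-argument constructor are assumptions that this diff does not confirm.

# Hypothetical usage sketch (assumes levelapp is installed and MetricsManager()
# can be default-constructed; neither is verified by this diff).
from typing import List
from pydantic import BaseModel

from levelapp.comparator.comparator import MetadataComparator
from levelapp.comparator.scorer import MetricsManager


class Invoice(BaseModel):
    vendor: str
    items: List[str]


reference = Invoice(vendor="ACME Corp", items=["bolt", "nut"])
generated = Invoice(vendor="ACME Corp.", items=["bolt", "nuts"])

comparator = MetadataComparator(
    reference=reference,
    generated=generated,
    metrics_manager=MetricsManager(),
)

# run() flattens both models, compares them field by field, and returns the
# formatted evaluation rows keyed by row index.
print(comparator.run(indexed_mode=False))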
@@ -0,0 +1,108 @@
"""levelapp/comparator/extractor.py"""

from collections import defaultdict
from collections.abc import Sequence
from typing import List, Dict, Any
from pydantic import BaseModel


class DataExtractor:
    """
    Extracts primitive values from nested Pydantic models, dicts, and sequences.
    """
    def deep_extract(
        self, model: BaseModel,
        indexed: bool = False
    ) -> Dict[str, List[str]]:
        """
        Recursively extract data from a Pydantic model.

        Args:
            model: An instance of a BaseModel.
            indexed: Switch parameter to select the extraction approach.

        Returns:
            A dictionary where keys are attribute names and values are lists of string values.
        """
        result: Dict[str, List[str]] = defaultdict(list)
        for field_name, field_info in type(model).model_fields.items():
            field_value = getattr(model, field_name)
            self._extract_field_values(
                value=field_value, prefix=field_name, result=result, indexed=indexed
            )

        return result

    def _extract_field_values(
        self,
        value: Any,
        prefix: str,
        result: Dict[str, List[str]],
        indexed: bool = False,
    ) -> None:
        """
        Recursively extract values from a field, storing them in result with the field path as key.

        Args:
            value: The value to extract (BaseModel, dict, list, or primitive).
            prefix: The current field path (e.g., 'documents.tribunal_members').
            result: Dictionary to store field paths and their value lists.
            indexed: Switch parameter to select the extraction approach.
        """
        if isinstance(value, BaseModel):
            self._handle_model(model=value, prefix=prefix, result=result)

        elif isinstance(value, Sequence) and not isinstance(value, (str, bytes)):
            self._handle_sequence(
                sequence=value, prefix=prefix, result=result, indexed=indexed
            )

        else:
            result[prefix].append(value)

    def _handle_model(
        self, model: BaseModel, prefix: str, result: Dict[str, List[str]]
    ) -> None:
        """
        Extract values from a Pydantic model recursively.

        Args:
            model: Pydantic BaseModel instance.
            prefix: Current field path.
            result: Dictionary to store field paths and value lists.
        """
        for field_name, field_info in type(model).model_fields.items():
            field_value = getattr(model, field_name)
            new_prefix = f"{prefix}.{field_name}" if prefix else field_name
            self._extract_field_values(
                value=field_value, prefix=new_prefix, result=result
            )

    def _handle_sequence(
        self,
        sequence: Sequence,
        prefix: str,
        result: Dict[str, List[str]],
        indexed: bool = False,
    ) -> None:
        """
        Extract values from a sequence (list or tuple) recursively.

        Args:
            sequence: List or tuple of values.
            prefix: Current field path.
            result: Dictionary to store field paths and value lists.
            indexed: Switch parameter to select the extraction approach.
        """
        if not sequence:
            result[prefix] = []

        if indexed:
            for i, item in enumerate(sequence):
                new_prefix = f"{prefix}[{i}]" if prefix else f"[{i}]"
                self._extract_field_values(value=item, prefix=new_prefix, result=result)
        else:
            for i, item in enumerate(sequence):
                self._extract_field_values(
                    value=item, prefix=prefix, result=result, indexed=indexed
                )
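A small sketch of what deep_extract produces for a nested model. The Book and Library models are invented for illustration, and the output comments show the expected shape given the code above (assuming the package is importable).

# Sketch only; Book/Library are hypothetical models, not part of levelapp.
from typing import List
from pydantic import BaseModel

from levelapp.comparator.extractor import DataExtractor


class Book(BaseModel):
    title: str
    tags: List[str]


class Library(BaseModel):
    name: str
    books: List[Book]


library = Library(name="Central", books=[Book(title="Dune", tags=["sci-fi", "classic"])])
extractor = DataExtractor()

print(extractor.deep_extract(model=library, indexed=False))
# expected shape: {'name': ['Central'], 'books.title': ['Dune'], 'books.tags': ['sci-fi', 'classic']}

print(extractor.deep_extract(model=library, indexed=True))
# indexed=True adds positional keys such as 'books[0].title' and 'books[0].tags'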
@@ -0,0 +1,61 @@
"""'comparator/schemas.py': Defines Pydantic models for extracted metadata."""

from enum import Enum

from pydantic import BaseModel, Field
from rapidfuzz import fuzz, utils


class AttrCompMixin:
    def __eq__(self, other) -> bool:
        if not isinstance(other, type(self)):
            return False

        attr_name = next(iter(self.__dict__.keys()))
        _cond = (
            fuzz.ratio(
                s1=getattr(self, attr_name),
                s2=getattr(other, attr_name),
                processor=utils.default_process,
            )
            > 99
        )
        return _cond


class CompScoreMixin:
    def comp_score(self, other) -> float:
        attr_name = next(iter(self.__dict__.keys()))
        _score = fuzz.ratio(
            s1=getattr(self, attr_name),
            s2=getattr(other, attr_name),
            processor=utils.default_process,
        )
        return _score


class EntityMetric(str, Enum):
    WRATIO = "wratio"
    LEV_NORM = "lev-norm"
    JARO_WINKLER = "jaro-winkler"
    TOKEN_SORT_RATIO = "token-sort-ratio"
    TOKEN_SET_RATIO = "token-set-ratio"

    @classmethod
    def list(cls):
        return [field.value for field in cls]


class SetMetric(str, Enum):
    ACCURACY = "accuracy"
    F1_SCORE = "f1-score"


class MetricConfig(BaseModel):
    """
    Configuration for a field's comparison metric.
    """
    field_name: str = Field(default="lev_norm", description="Name of the field")
    entity_metric: EntityMetric = Field(default=EntityMetric.LEV_NORM, description="Entity level metric")
    set_metric: SetMetric = Field(default=SetMetric.ACCURACY, description="Set level metric")
    threshold: float = Field(default=100, ge=0, le=100, description="Match threshold")