netrias_client 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of netrias_client might be problematic. Click here for more details.
- netrias_client/__init__.py +9 -0
- netrias_client/_adapter.py +288 -0
- netrias_client/_client.py +251 -0
- netrias_client/_config.py +95 -0
- netrias_client/_core.py +560 -0
- netrias_client/_discovery.py +437 -0
- netrias_client/_errors.py +33 -0
- netrias_client/_gateway_bypass.py +208 -0
- netrias_client/_http.py +126 -0
- netrias_client/_io.py +28 -0
- netrias_client/_logging.py +46 -0
- netrias_client/_models.py +72 -0
- netrias_client/_validators.py +173 -0
- netrias_client/scripts.py +313 -0
- netrias_client-0.0.1.dist-info/METADATA +222 -0
- netrias_client-0.0.1.dist-info/RECORD +19 -0
- netrias_client-0.0.1.dist-info/WHEEL +4 -0
- netrias_client-0.0.1.dist-info/entry_points.txt +5 -0
- netrias_client-0.0.1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,288 @@
|
|
|
1
|
+
"""Translate discovery results into manifest-friendly mappings.
|
|
2
|
+
|
|
3
|
+
'why': bridge API recommendations to harmonization manifests while respecting confidence bounds
|
|
4
|
+
"""
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
import logging
|
|
9
|
+
from collections.abc import Iterable, Mapping
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Final, cast
|
|
12
|
+
|
|
13
|
+
from ._models import MappingDiscoveryResult, MappingRecommendationOption, MappingSuggestion
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def build_column_mapping_payload(
    result: MappingDiscoveryResult,
    threshold: float,
    logger: logging.Logger | None = None,
) -> dict[str, dict[str, dict[str, object]]]:
    """Convert discovery output into the manifest structure expected by harmonization.

    Falls back to the package-level "netrias_client" logger when none is supplied.
    """
    log = logging.getLogger("netrias_client") if logger is None else logger
    picks = strongest_targets(result, threshold=threshold, logger=log)
    payload: dict[str, dict[str, dict[str, object]]] = {
        "column_mappings": _column_entries(picks, log),
    }
    return payload
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# Static per-column manifest metadata keyed by source column name. Entries here
# take precedence over discovered targets in _initial_entry and are force-added
# by _apply_metadata_defaults. Negative cdeId values appear to be internal
# sentinels for the sagemaker routes rather than real CDE ids — TODO confirm.
# The commented entries document passthrough columns that are currently disabled.
_COLUMN_METADATA: Final[dict[str, dict[str, object]]] = {
    # "study_name": {"route": "api:passthrough", "targetField": "study_name"},
    # "number_of_participants": {"route": "api:passthrough", "targetField": "number_of_participants"},
    # "number_of_samples": {"route": "api:passthrough", "targetField": "number_of_samples"},
    # "study_data_types": {
    #     "route": "api:passthrough",
    #     "targetField": "study_data_types",
    #     "cdeId": 12_571_096,
    #     "cde_id": 12_571_096,
    # },
    # "participant_id": {"route": "api:passthrough", "targetField": "participant_id"},
    # "sample_id": {"route": "api:passthrough", "targetField": "sample_id"},
    # "file_name": {"route": "api:passthrough", "targetField": "file_name"},
    "primary_diagnosis": {
        "route": "sagemaker:primary",
        "targetField": "primary_diagnosis",
        "cdeId": -200,
        "cde_id": -200,
    },
    "therapeutic_agents": {
        "route": "sagemaker:therapeutic_agents",
        "targetField": "therapeutic_agents",
        "cdeId": -203,
        "cde_id": -203,
    },
    "morphology": {
        "route": "sagemaker:morphology",
        "targetField": "morphology",
        "cdeId": -201,
        "cde_id": -201,
    },
    # "tissue_or_organ_of_origin": {
    #     "route": "sagemaker:tissue_origin",
    #     "targetField": "tissue_or_organ_of_origin",
    #     "cdeId": -204,
    #     "cde_id": -204,
    # },
    # "site_of_resection_or_biopsy": {
    #     "route": "sagemaker:sample_anatomic_site",
    #     "targetField": "site_of_resection_or_biopsy",
    #     "cdeId": -202,
    #     "cde_id": -202,
    # },
}
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def strongest_targets(
    result: MappingDiscoveryResult,
    threshold: float,
    logger: logging.Logger,
) -> dict[str, str]:
    """Return the highest-confidence target per column, filtered by threshold.

    Prefers structured suggestions; falls back to parsing the raw payload when
    the result carries none.
    """
    selected = (
        _from_suggestions(result.suggestions, threshold)
        if result.suggestions
        else _from_raw_payload(result.raw, threshold)
    )
    if not selected:
        logger.warning("adapter strongest targets empty after filtering")
        return selected
    logger.info("adapter strongest targets: %s", selected)
    return selected
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _column_entries(
    strongest: Mapping[str, str],
    logger: logging.Logger,
) -> dict[str, dict[str, object]]:
    """Build manifest entries for the chosen targets, logging columns without a CDE id."""
    entries: dict[str, dict[str, object]] = {
        source: _initial_entry(source, target) for source, target in strongest.items()
    }
    unresolved = {
        source: target
        for source, target in strongest.items()
        if _needs_cde(entries[source])
    }
    # Configured columns are always present, even when discovery skipped them.
    _apply_metadata_defaults(entries)
    if unresolved:
        logger.info("adapter unresolved targets (no CDE id mapping): %s", unresolved)
    return entries
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _initial_entry(source: str, target: str) -> dict[str, object]:
    """Seed a manifest entry from configured metadata, else from the discovered target."""
    try:
        # Preserve the configured targetField when metadata defines this column.
        return dict(_COLUMN_METADATA[source])
    except KeyError:
        return {"targetField": target}
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _needs_cde(entry: Mapping[str, object]) -> bool:
|
|
122
|
+
return "cdeId" not in entry
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _apply_metadata_defaults(entries: dict[str, dict[str, object]]) -> None:
    """Ensure every configured column appears, without clobbering discovered entries."""
    for source, metadata in _COLUMN_METADATA.items():
        # setdefault keeps an existing discovered entry; a copy is inserted otherwise.
        entries.setdefault(source, dict(metadata))
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _from_suggestions(
    suggestions: Iterable[MappingSuggestion], threshold: float
) -> dict[str, str]:
    """Pick the best eligible target for each suggestion's source column."""
    chosen: dict[str, str] = {}
    for suggestion in suggestions:
        best = _top_option(suggestion.options, threshold)
        if best is not None and best.target is not None:
            chosen[suggestion.source_column] = best.target
    return chosen
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _from_raw_payload(payload: Mapping[str, object], threshold: float) -> dict[str, str]:
    """Derive per-column targets straight from the raw recommendation payload."""
    chosen: dict[str, str] = {}
    for column, value in payload.items():
        best = _top_option(_coerce_options(value), threshold)
        if best is not None and best.target is not None:
            chosen[column] = best.target
    return chosen
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _coerce_options(value: object) -> tuple[MappingRecommendationOption, ...]:
    """Interpret a raw payload value as recommendation options; non-lists yield none."""
    if isinstance(value, list):
        return tuple(_option_iterator(cast(list[object], value)))
    return ()
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _option_iterator(items: list[object]) -> Iterable[MappingRecommendationOption]:
    """Yield parsed options, silently skipping entries that are not mappings."""
    for entry in items:
        if isinstance(entry, Mapping):
            parsed = _option_from_mapping(cast(Mapping[str, object], entry))
            if parsed is not None:
                yield parsed
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _option_from_mapping(item: Mapping[str, object]) -> MappingRecommendationOption | None:
    """Build an option from one raw mapping entry; a string "target" is required.

    A non-numeric or absent "similarity" yields a None confidence, which later
    filtering treats as never meeting the threshold.
    """
    target = item.get("target")
    if not isinstance(target, str):
        return None
    similarity = item.get("similarity")
    score = float(similarity) if isinstance(similarity, (float, int)) else None
    return MappingRecommendationOption(target=target, confidence=score, raw=item)
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def _top_option(
    options: Iterable[MappingRecommendationOption], threshold: float
) -> MappingRecommendationOption | None:
    """Return the highest-confidence option meeting the threshold, if any."""
    eligible = [candidate for candidate in options if _meets_threshold(candidate, threshold)]
    if not eligible:
        return None
    # _meets_threshold filters out None confidences; the -inf fallback is a guard.
    return max(eligible, key=lambda candidate: candidate.confidence or float("-inf"))
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def _meets_threshold(option: MappingRecommendationOption, threshold: float) -> bool:
    """Return True when the option carries a confidence at or above the threshold.

    Options with no confidence score never qualify.
    """
    return option.confidence is not None and option.confidence >= threshold
|
|
194
|
+
|
|
195
|
+
def normalize_manifest_mapping(
    manifest: Path | Mapping[str, object] | None,
) -> dict[str, int]:
    """Normalize manifest column→CDE entries for harmonization payloads.

    Accepts a JSON file path, an in-memory mapping, or None (empty result).
    Entries whose field name or CDE id cannot be normalized are dropped.
    """
    if manifest is None:
        return {}
    mapping = _mapping_dict(_load_manifest_raw(manifest))
    normalized: dict[str, int] = {}
    for field, value in mapping.items():
        _apply_cde_entry(normalized, field, value)
    return normalized
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def _load_manifest_raw(manifest: Path | Mapping[str, object]) -> Mapping[str, object]:
|
|
211
|
+
if isinstance(manifest, Path):
|
|
212
|
+
content = manifest.read_text(encoding="utf-8")
|
|
213
|
+
try:
|
|
214
|
+
return cast(Mapping[str, object], json.loads(content))
|
|
215
|
+
except json.JSONDecodeError as exc:
|
|
216
|
+
raise ValueError(f"manifest must be valid JSON: {exc}") from exc
|
|
217
|
+
return manifest
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def _mapping_dict(raw: Mapping[str, object]) -> dict[str, object]:
    """Extract the column-mapping section, preferring a nested "column_mappings" key."""
    top = _dict_if_str_mapping(raw)
    if top is None:
        return {}
    nested = _dict_if_str_mapping(top.get("column_mappings"))
    if nested is not None:
        return nested
    return top
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def _dict_if_str_mapping(value: object) -> dict[str, object] | None:
|
|
229
|
+
if isinstance(value, Mapping):
|
|
230
|
+
typed = cast(Mapping[str, object], value)
|
|
231
|
+
return dict(typed)
|
|
232
|
+
return None
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def _apply_cde_entry(destination: dict[str, int], field: object, value: object) -> None:
    """Record one column→CDE pair when both pieces normalize cleanly."""
    name = _clean_field(field)
    if name is None:
        return
    cde_id = _coerce_cde_id(value)
    if cde_id is None:
        return
    destination[name] = cde_id
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def _clean_field(field: object) -> str | None:
|
|
244
|
+
if not isinstance(field, str):
|
|
245
|
+
return None
|
|
246
|
+
name = field.strip()
|
|
247
|
+
return name or None
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def _coerce_cde_id(value: object) -> int | None:
    """Normalize a manifest value (scalar or entry mapping) into an int CDE id."""
    candidate = _cde_candidate(value)
    return None if candidate is None else _int_from_candidate(candidate)
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def _cde_candidate(value: object) -> object | None:
|
|
258
|
+
mapping = _dict_if_str_mapping(value)
|
|
259
|
+
if mapping is not None:
|
|
260
|
+
return mapping.get("cdeId") or mapping.get("cde_id")
|
|
261
|
+
return value
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def _int_from_candidate(candidate: object) -> int | None:
    """Convert a scalar candidate into an int, or None when it cannot be one."""
    # NOTE(review): bools are deliberately intercepted first (bool is an int
    # subclass) and map to 0/1 — confirm that treatment is intended upstream.
    if isinstance(candidate, bool):
        return int(candidate)
    if isinstance(candidate, str):
        return _int_from_string(candidate)
    if isinstance(candidate, (int, float)):
        return _int_from_number(candidate)
    return None
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def _int_from_number(value: int | float) -> int | None:
|
|
275
|
+
try:
|
|
276
|
+
return int(value)
|
|
277
|
+
except (TypeError, ValueError):
|
|
278
|
+
return None
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def _int_from_string(value: str) -> int | None:
|
|
282
|
+
stripped = value.strip()
|
|
283
|
+
if not stripped:
|
|
284
|
+
return None
|
|
285
|
+
try:
|
|
286
|
+
return int(stripped)
|
|
287
|
+
except ValueError:
|
|
288
|
+
return None
|
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
"""Coordinate stateful access to discovery and harmonization APIs.
|
|
2
|
+
|
|
3
|
+
'why': provide a single, inspectable entry point that captures configuration once
|
|
4
|
+
and exposes typed discovery and harmonization helpers (sync/async) for consumers
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
import threading
|
|
10
|
+
from collections.abc import Mapping, Sequence
|
|
11
|
+
from dataclasses import replace
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from uuid import uuid4
|
|
14
|
+
|
|
15
|
+
from ._core import harmonize as _harmonize
|
|
16
|
+
from ._core import harmonize_async as _harmonize_async
|
|
17
|
+
from ._discovery import (
|
|
18
|
+
discover_cde_mapping as _discover_cde_mapping,
|
|
19
|
+
discover_mapping as _discover_mapping,
|
|
20
|
+
discover_mapping_async as _discover_mapping_async,
|
|
21
|
+
discover_mapping_from_csv_async as _discover_mapping_from_csv_async,
|
|
22
|
+
)
|
|
23
|
+
from ._config import build_settings
|
|
24
|
+
from ._errors import ClientConfigurationError
|
|
25
|
+
from ._logging import configure_logger
|
|
26
|
+
from ._models import HarmonizationResult, LogLevel, Settings
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# Shape of discovery output: {"column_mappings": {source_column: {field: value}}}.
ManifestPayload = dict[str, dict[str, dict[str, object]]]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class NetriasClient:
    """Expose discovery and harmonization workflows behind instance state.

    A `NetriasClient` manages configuration snapshots (API key, URLs, thresholds,
    bypass preferences) and threads them through every outbound call. Consumers
    typically instantiate a client, call :meth:`configure`, and then interact via
    the discovery/harmonization methods below.
    """

    def __init__(self) -> None:
        """Initialise an empty client awaiting configuration."""
        self._lock: threading.Lock = threading.Lock()
        self._settings: Settings | None = None
        # Unique per-instance logger name so two clients never share handlers.
        self._logger_name: str = f"netrias_client.{uuid4().hex}"
        self._logger: logging.Logger | None = None

    def configure(
        self,
        api_key: str,
        timeout: float | None = None,
        log_level: LogLevel | str | None = None,
        confidence_threshold: float | None = None,
        discovery_use_gateway_bypass: bool | None = None,
        log_directory: Path | str | None = None,
    ) -> None:
        """Validate inputs and persist a new immutable settings snapshot.

        Parameters
        ----------
        api_key:
            Netrias API bearer token used for authentication.
        timeout:
            Overall request timeout in seconds (defaults to six hours).
        log_level:
            Desired logging verbosity as a :class:`~netrias_client._models.LogLevel`
            (string aliases are also accepted for convenience).
        confidence_threshold:
            Minimum confidence score required for discovery recommendations.
        discovery_use_gateway_bypass:
            When ``True`` (default) calls the temporary Lambda bypass instead of
            API Gateway.
        log_directory:
            Optional directory where this client's log files should be written.
            When omitted, logging remains stream-only.

        Calling this method multiple times replaces the active snapshot and
        reconfigures the package logger.
        """
        # Validation and logger setup happen outside the lock; only the final
        # publication of the (settings, logger) pair is synchronized.
        settings = build_settings(
            api_key=api_key,
            timeout=timeout,
            log_level=log_level,
            confidence_threshold=confidence_threshold,
            discovery_use_gateway_bypass=discovery_use_gateway_bypass,
            log_directory=log_directory,
        )
        logger = configure_logger(
            self._logger_name,
            settings.log_level,
            settings.log_directory,
        )
        with self._lock:
            self._settings = settings
            self._logger = logger

    @property
    def settings(self) -> Settings:
        """Return a defensive copy of the current settings.

        'why': aid observability without exposing internal state for mutation
        """
        return self._snapshot_settings()

    def discover_mapping(
        self,
        target_schema: str,
        column_samples: Mapping[str, Sequence[object]],
    ) -> ManifestPayload:
        """Perform synchronous mapping discovery for the provided schema."""
        settings = self._snapshot_settings()
        return _discover_mapping(
            settings=settings,
            target_schema=target_schema,
            column_samples=column_samples,
            logger=self._require_logger(),
        )

    async def discover_mapping_async(
        self,
        target_schema: str,
        column_samples: Mapping[str, Sequence[object]],
    ) -> ManifestPayload:
        """Async variant of :meth:`discover_mapping` with identical semantics."""
        settings = self._snapshot_settings()
        return await _discover_mapping_async(
            settings=settings,
            target_schema=target_schema,
            column_samples=column_samples,
            logger=self._require_logger(),
        )

    def discover_mapping_from_csv(
        self,
        source_csv: Path,
        target_schema: str,
        sample_limit: int = 25,
    ) -> ManifestPayload:
        """Derive column samples from a CSV file then perform mapping discovery."""
        settings = self._snapshot_settings()
        return _discover_cde_mapping(
            settings=settings,
            source_csv=source_csv,
            target_schema=target_schema,
            sample_limit=sample_limit,
            logger=self._require_logger(),
        )

    def discover_cde_mapping(
        self,
        source_csv: Path,
        target_schema: str,
        sample_limit: int = 25,
    ) -> ManifestPayload:
        """Compatibility alias for :meth:`discover_mapping_from_csv`."""
        return self.discover_mapping_from_csv(
            source_csv=source_csv,
            target_schema=target_schema,
            sample_limit=sample_limit,
        )

    async def discover_mapping_from_csv_async(
        self,
        source_csv: Path,
        target_schema: str,
        sample_limit: int = 25,
    ) -> ManifestPayload:
        """Async variant of :meth:`discover_mapping_from_csv`."""
        settings = self._snapshot_settings()
        return await _discover_mapping_from_csv_async(
            settings=settings,
            source_csv=source_csv,
            target_schema=target_schema,
            sample_limit=sample_limit,
            logger=self._require_logger(),
        )

    def harmonize(
        self,
        source_path: Path,
        manifest: Path | Mapping[str, object],
        output_path: Path | None = None,
        manifest_output_path: Path | None = None,
    ) -> HarmonizationResult:
        """Execute the harmonization workflow synchronously and block.

        The method accepts either a manifest mapping or a JSON file path and
        writes the harmonized CSV to the resolved output location (which may be
        auto-versioned). A :class:`HarmonizationResult` is always returned even on
        failure, allowing callers to inspect status and description.
        """
        settings = self._snapshot_settings()
        return _harmonize(
            settings=settings,
            source_path=source_path,
            manifest=manifest,
            output_path=output_path,
            manifest_output_path=manifest_output_path,
            logger=self._require_logger(),
        )

    async def harmonize_async(
        self,
        source_path: Path,
        manifest: Path | Mapping[str, object],
        output_path: Path | None = None,
        manifest_output_path: Path | None = None,
    ) -> HarmonizationResult:
        """Async counterpart to :meth:`harmonize` with identical semantics."""
        settings = self._snapshot_settings()
        return await _harmonize_async(
            settings=settings,
            source_path=source_path,
            manifest=manifest,
            output_path=output_path,
            manifest_output_path=manifest_output_path,
            logger=self._require_logger(),
        )

    def _snapshot_settings(self) -> Settings:
        """Return a copy of the current settings or raise if not configured."""
        with self._lock:
            if self._settings is None:
                raise ClientConfigurationError(
                    "client not configured; call configure(api_key=...) before use"
                )
            return replace(self._settings)

    def _require_logger(self) -> logging.Logger:
        """Return the configured logger or raise if not configured.

        'why': configure() publishes settings and logger together under the
        lock; reading under the same lock (previously it read unsynchronized)
        keeps the pair coherent across threads.
        """
        with self._lock:
            if self._logger is None:
                raise ClientConfigurationError(
                    "client not configured; call configure(api_key=...) before use"
                )
            return self._logger
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""Manage runtime client configuration.
|
|
2
|
+
|
|
3
|
+
'why': centralize settings creation and validation for NetriasClient
|
|
4
|
+
"""
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from ._errors import ClientConfigurationError
|
|
10
|
+
from ._models import LogLevel, Settings
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Service endpoints baked into every Settings snapshot by build_settings.
DISCOVERY_BASE_URL = "https://api.netriasbdf.cloud"
HARMONIZATION_BASE_URL = "https://tbdxz7nffi.execute-api.us-east-2.amazonaws.com"
# TODO: remove once API Gateway latency constraints are resolved.
# Lambda identifiers used when the discovery gateway bypass is enabled.
BYPASS_FUNCTION = "cde-recommendation"
BYPASS_ALIAS = "prod"
BYPASS_REGION = "us-east-2"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def build_settings(
    api_key: str,
    timeout: float | None = None,
    log_level: LogLevel | str | None = None,
    confidence_threshold: float | None = None,
    discovery_use_gateway_bypass: bool | None = None,
    log_directory: Path | str | None = None,
) -> Settings:
    """Return a validated Settings snapshot for the provided configuration.

    Raises ClientConfigurationError when any input fails validation.
    """
    token = (api_key or "").strip()
    if not token:
        raise ClientConfigurationError("api_key must be a non-empty string; call configure(api_key=...) before use")
    # Validation order is significant only for which error surfaces first;
    # directory validation goes last because it creates the directory.
    level = _normalized_level(log_level)
    seconds = _validated_timeout(timeout)
    min_confidence = _validated_confidence_threshold(confidence_threshold)
    use_bypass = _normalized_bool(discovery_use_gateway_bypass, default=True)
    log_dir = _validated_log_directory(log_directory)
    return Settings(
        api_key=token,
        discovery_url=DISCOVERY_BASE_URL,
        harmonization_url=HARMONIZATION_BASE_URL,
        timeout=seconds,
        log_level=level,
        confidence_threshold=min_confidence,
        discovery_use_gateway_bypass=use_bypass,
        log_directory=log_dir,
    )
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _normalized_level(level: LogLevel | str | None) -> LogLevel:
    """Coerce a user-supplied level (enum, name, or None) into a LogLevel."""
    if isinstance(level, LogLevel):
        return level
    if level is None:
        return LogLevel.INFO
    try:
        # String aliases are matched case-insensitively against enum names.
        return LogLevel[level.upper()]
    except KeyError as exc:
        raise ClientConfigurationError(f"unsupported log_level: {level}") from exc
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _validated_timeout(timeout: float | None) -> float:
|
|
66
|
+
if timeout is None:
|
|
67
|
+
return 21600.0 # default to 6 hours to accommodate long-running jobs
|
|
68
|
+
if timeout <= 0:
|
|
69
|
+
raise ClientConfigurationError("timeout must be positive when provided")
|
|
70
|
+
return float(timeout)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _validated_confidence_threshold(value: float | None) -> float:
|
|
74
|
+
if value is None:
|
|
75
|
+
return 0.8
|
|
76
|
+
if not (0.0 <= value <= 1.0):
|
|
77
|
+
raise ClientConfigurationError("confidence_threshold must be between 0.0 and 1.0")
|
|
78
|
+
return float(value)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _normalized_bool(value: bool | None, default: bool = False) -> bool:
|
|
82
|
+
if value is None:
|
|
83
|
+
return default
|
|
84
|
+
return bool(value)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _validated_log_directory(value: Path | str | None) -> Path | None:
|
|
88
|
+
if value is None:
|
|
89
|
+
return None
|
|
90
|
+
directory = Path(value)
|
|
91
|
+
try:
|
|
92
|
+
directory.mkdir(parents=True, exist_ok=True)
|
|
93
|
+
except OSError as exc:
|
|
94
|
+
raise ClientConfigurationError(f"unable to create log directory {directory}: {exc}") from exc
|
|
95
|
+
return directory
|