netrias_client 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of netrias_client might be problematic. Click here for more details.

@@ -0,0 +1,9 @@
1
+ """Expose the Netrias client facade and package metadata."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from ._client import NetriasClient
6
+
7
# Names re-exported as the package's public API.
__all__ = ["NetriasClient", "__version__"]

# Version string of this package release.
__version__ = "0.0.1"
@@ -0,0 +1,288 @@
1
+ """Translate discovery results into manifest-friendly mappings.
2
+
3
+ 'why': bridge API recommendations to harmonization manifests while respecting confidence bounds
4
+ """
5
+ from __future__ import annotations
6
+
7
+ import json
8
+ import logging
9
+ from collections.abc import Iterable, Mapping
10
+ from pathlib import Path
11
+ from typing import Final, cast
12
+
13
+ from ._models import MappingDiscoveryResult, MappingRecommendationOption, MappingSuggestion
14
+
15
+
16
+
17
def build_column_mapping_payload(
    result: MappingDiscoveryResult,
    threshold: float,
    logger: logging.Logger | None = None,
) -> dict[str, dict[str, dict[str, object]]]:
    """Build the ``column_mappings`` manifest from a discovery result.

    The strongest target per source column (at or above ``threshold``) is
    selected first, then expanded into per-column manifest entries. When no
    ``logger`` is given, the ``"netrias_client"`` logger is used.
    """

    if logger:
        log = logger
    else:
        log = logging.getLogger("netrias_client")

    targets = strongest_targets(result, threshold=threshold, logger=log)
    entries = _column_entries(targets, log)
    return {"column_mappings": entries}
27
+
28
+
29
# Static manifest entries keyed by source column name.
# Each active entry carries a route, a targetField, and the same CDE identifier
# under both the "cdeId" and "cde_id" spellings. Entries that are commented out
# are currently disabled and therefore never injected into a manifest.
_COLUMN_METADATA: Final[dict[str, dict[str, object]]] = {
    # "study_name": {"route": "api:passthrough", "targetField": "study_name"},
    # "number_of_participants": {"route": "api:passthrough", "targetField": "number_of_participants"},
    # "number_of_samples": {"route": "api:passthrough", "targetField": "number_of_samples"},
    # "study_data_types": {
    #     "route": "api:passthrough",
    #     "targetField": "study_data_types",
    #     "cdeId": 12_571_096,
    #     "cde_id": 12_571_096,
    # },
    # "participant_id": {"route": "api:passthrough", "targetField": "participant_id"},
    # "sample_id": {"route": "api:passthrough", "targetField": "sample_id"},
    # "file_name": {"route": "api:passthrough", "targetField": "file_name"},
    "primary_diagnosis": {
        "route": "sagemaker:primary",
        "targetField": "primary_diagnosis",
        "cdeId": -200,
        "cde_id": -200,
    },
    "therapeutic_agents": {
        "route": "sagemaker:therapeutic_agents",
        "targetField": "therapeutic_agents",
        "cdeId": -203,
        "cde_id": -203,
    },
    "morphology": {
        "route": "sagemaker:morphology",
        "targetField": "morphology",
        "cdeId": -201,
        "cde_id": -201,
    },
    # "tissue_or_organ_of_origin": {
    #     "route": "sagemaker:tissue_origin",
    #     "targetField": "tissue_or_organ_of_origin",
    #     "cdeId": -204,
    #     "cde_id": -204,
    # },
    # "site_of_resection_or_biopsy": {
    #     "route": "sagemaker:sample_anatomic_site",
    #     "targetField": "site_of_resection_or_biopsy",
    #     "cdeId": -202,
    #     "cde_id": -202,
    # },
}
73
+
74
+
75
def strongest_targets(
    result: MappingDiscoveryResult,
    threshold: float,
    logger: logging.Logger,
) -> dict[str, str]:
    """Pick the best target for each source column, honouring ``threshold``.

    Parsed suggestions are preferred; when the result carries none, the raw
    payload is interpreted instead. The outcome is logged at INFO when any
    column survived filtering and at WARNING when nothing did.
    """

    selected = (
        _from_suggestions(result.suggestions, threshold)
        if result.suggestions
        else _from_raw_payload(result.raw, threshold)
    )

    if not selected:
        logger.warning("adapter strongest targets empty after filtering")
        return selected

    logger.info("adapter strongest targets: %s", selected)
    return selected
92
+
93
+
94
def _column_entries(
    strongest: Mapping[str, str],
    logger: logging.Logger,
) -> dict[str, dict[str, object]]:
    """Expand ``source -> target`` picks into per-column manifest entries.

    Columns whose entry has no ``cdeId`` are reported through ``logger``.
    Configured metadata columns absent from ``strongest`` are appended after
    the discovered ones.
    """
    entries: dict[str, dict[str, object]] = {
        column: _initial_entry(column, target) for column, target in strongest.items()
    }
    # Collected before defaults are applied so only discovered columns are reported.
    missing_cde: dict[str, str] = {
        column: strongest[column] for column, entry in entries.items() if _needs_cde(entry)
    }

    _apply_metadata_defaults(entries)

    if missing_cde:
        logger.info("adapter unresolved targets (no CDE id mapping): %s", missing_cde)
    return entries
111
+
112
+
113
def _initial_entry(source: str, target: str) -> dict[str, object]:
    """Return the starting manifest entry for ``source``.

    A column with configured metadata gets a shallow copy of that metadata
    (its own ``targetField`` wins over ``target``); any other column gets a
    bare ``{"targetField": target}`` entry.
    """
    if source in _COLUMN_METADATA:
        return dict(_COLUMN_METADATA[source])
    return {"targetField": target}
119
+
120
+
121
+ def _needs_cde(entry: Mapping[str, object]) -> bool:
122
+ return "cdeId" not in entry
123
+
124
+
125
def _apply_metadata_defaults(entries: dict[str, dict[str, object]]) -> None:
    """Add a copy of every configured metadata entry missing from ``entries``.

    Mutates ``entries`` in place; existing keys are left untouched.
    """
    absent = [column for column in _COLUMN_METADATA if column not in entries]
    for column in absent:
        entries[column] = dict(_COLUMN_METADATA[column])
129
+
130
+
131
def _from_suggestions(
    suggestions: Iterable[MappingSuggestion], threshold: float
) -> dict[str, str]:
    """Map each suggestion's source column to its best qualifying target.

    Suggestions without an option meeting ``threshold``, or whose best option
    has no target, are left out.
    """
    picks: dict[str, str] = {}
    for item in suggestions:
        best = _top_option(item.options, threshold)
        if best is not None and best.target is not None:
            picks[item.source_column] = best.target
    return picks
141
+
142
+
143
def _from_raw_payload(payload: Mapping[str, object], threshold: float) -> dict[str, str]:
    """Derive ``column -> target`` picks directly from an unparsed payload.

    Each payload value is coerced into recommendation options; columns with no
    qualifying option (or a best option without a target) are skipped.
    """
    picks: dict[str, str] = {}
    for column, raw_options in payload.items():
        best = _top_option(_coerce_options(raw_options), threshold)
        if best is not None and best.target is not None:
            picks[column] = best.target
    return picks
152
+
153
+
154
def _coerce_options(value: object) -> tuple[MappingRecommendationOption, ...]:
    """Turn a raw payload value into a tuple of options.

    Anything that is not a ``list`` yields an empty tuple.
    """
    if isinstance(value, list):
        return tuple(_option_iterator(cast(list[object], value)))
    return ()
158
+
159
+
160
def _option_iterator(items: list[object]) -> Iterable[MappingRecommendationOption]:
    """Yield an option for every mapping item that can be parsed into one.

    Non-mapping items and mappings that do not parse are silently skipped.
    """
    mappings = (
        cast(Mapping[str, object], entry) for entry in items if isinstance(entry, Mapping)
    )
    for mapping in mappings:
        parsed = _option_from_mapping(mapping)
        if parsed is None:
            continue
        yield parsed
167
+
168
+
169
+ def _option_from_mapping(item: Mapping[str, object]) -> MappingRecommendationOption | None:
170
+ target = item.get("target")
171
+ if not isinstance(target, str):
172
+ return None
173
+ similarity = item.get("similarity")
174
+ score: float | None = None
175
+ if isinstance(similarity, (float, int)):
176
+ score = float(similarity)
177
+ return MappingRecommendationOption(target=target, confidence=score, raw=item)
178
+
179
+
180
+ def _top_option(
181
+ options: Iterable[MappingRecommendationOption], threshold: float
182
+ ) -> MappingRecommendationOption | None:
183
+ eligible = [opt for opt in options if _meets_threshold(opt, threshold)]
184
+ if not eligible:
185
+ return None
186
+ return max(eligible, key=lambda opt: opt.confidence or float("-inf"))
187
+
188
+
189
+ def _meets_threshold(option: MappingRecommendationOption, threshold: float) -> bool:
190
+ score = option.confidence
191
+ if score is None:
192
+ return False
193
+ return score >= threshold
194
+
195
def normalize_manifest_mapping(
    manifest: Path | Mapping[str, object] | None,
) -> dict[str, int]:
    """Reduce a manifest to a flat ``column name -> CDE id`` dictionary.

    ``manifest`` may be a JSON file path, an in-memory mapping, or ``None``
    (which yields an empty result). Entries whose name or CDE id cannot be
    resolved are dropped.
    """

    normalized: dict[str, int] = {}
    if manifest is None:
        return normalized

    entries = _mapping_dict(_load_manifest_raw(manifest))
    for field, value in entries.items():
        _apply_cde_entry(normalized, field, value)
    return normalized
208
+
209
+
210
+ def _load_manifest_raw(manifest: Path | Mapping[str, object]) -> Mapping[str, object]:
211
+ if isinstance(manifest, Path):
212
+ content = manifest.read_text(encoding="utf-8")
213
+ try:
214
+ return cast(Mapping[str, object], json.loads(content))
215
+ except json.JSONDecodeError as exc:
216
+ raise ValueError(f"manifest must be valid JSON: {exc}") from exc
217
+ return manifest
218
+
219
+
220
def _mapping_dict(raw: Mapping[str, object]) -> dict[str, object]:
    """Locate the column mapping dictionary inside a raw manifest.

    The nested ``"column_mappings"`` mapping is used when present; otherwise
    the top-level mapping itself is. A non-mapping input yields ``{}``.
    """
    top_level = _dict_if_str_mapping(raw)
    if top_level is None:
        return {}

    nested = _dict_if_str_mapping(top_level.get("column_mappings"))
    if nested is None:
        return top_level
    return nested
226
+
227
+
228
+ def _dict_if_str_mapping(value: object) -> dict[str, object] | None:
229
+ if isinstance(value, Mapping):
230
+ typed = cast(Mapping[str, object], value)
231
+ return dict(typed)
232
+ return None
233
+
234
+
235
def _apply_cde_entry(destination: dict[str, int], field: object, value: object) -> None:
    """Store ``field -> CDE id`` in ``destination`` when both parts resolve.

    Nothing is written if the field name is unusable or no integer CDE id can
    be derived from ``value``.
    """
    column = _clean_field(field)
    identifier = _coerce_cde_id(value)
    if column is not None and identifier is not None:
        destination[column] = identifier
241
+
242
+
243
+ def _clean_field(field: object) -> str | None:
244
+ if not isinstance(field, str):
245
+ return None
246
+ name = field.strip()
247
+ return name or None
248
+
249
+
250
def _coerce_cde_id(value: object) -> int | None:
    """Extract an integer CDE id from a manifest value, or ``None`` if absent."""
    candidate = _cde_candidate(value)
    return None if candidate is None else _int_from_candidate(candidate)
255
+
256
+
257
+ def _cde_candidate(value: object) -> object | None:
258
+ mapping = _dict_if_str_mapping(value)
259
+ if mapping is not None:
260
+ return mapping.get("cdeId") or mapping.get("cde_id")
261
+ return value
262
+
263
+
264
+ def _int_from_candidate(candidate: object) -> int | None:
265
+ if isinstance(candidate, bool):
266
+ return int(candidate)
267
+ if isinstance(candidate, (int, float)):
268
+ return _int_from_number(candidate)
269
+ if isinstance(candidate, str):
270
+ return _int_from_string(candidate)
271
+ return None
272
+
273
+
274
+ def _int_from_number(value: int | float) -> int | None:
275
+ try:
276
+ return int(value)
277
+ except (TypeError, ValueError):
278
+ return None
279
+
280
+
281
+ def _int_from_string(value: str) -> int | None:
282
+ stripped = value.strip()
283
+ if not stripped:
284
+ return None
285
+ try:
286
+ return int(stripped)
287
+ except ValueError:
288
+ return None
@@ -0,0 +1,251 @@
1
+ """Coordinate stateful access to discovery and harmonization APIs.
2
+
3
+ 'why': provide a single, inspectable entry point that captures configuration once
4
+ and exposes typed discovery and harmonization helpers (sync/async) for consumers
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import logging
9
+ import threading
10
+ from collections.abc import Mapping, Sequence
11
+ from dataclasses import replace
12
+ from pathlib import Path
13
+ from uuid import uuid4
14
+
15
+ from ._core import harmonize as _harmonize
16
+ from ._core import harmonize_async as _harmonize_async
17
+ from ._discovery import (
18
+ discover_cde_mapping as _discover_cde_mapping,
19
+ discover_mapping as _discover_mapping,
20
+ discover_mapping_async as _discover_mapping_async,
21
+ discover_mapping_from_csv_async as _discover_mapping_from_csv_async,
22
+ )
23
+ from ._config import build_settings
24
+ from ._errors import ClientConfigurationError
25
+ from ._logging import configure_logger
26
+ from ._models import HarmonizationResult, LogLevel, Settings
27
+
28
+
29
# Type alias for the manifest payload returned by the discovery methods:
# three nested levels of string-keyed dictionaries.
ManifestPayload = dict[str, dict[str, dict[str, object]]]
30
+
31
+
32
class NetriasClient:
    """Expose discovery and harmonization workflows behind instance state.

    A `NetriasClient` manages configuration snapshots (API key, URLs, thresholds,
    bypass preferences) and threads them through every outbound call. Consumers
    typically instantiate a client, call :meth:`configure`, and then interact via
    the discovery/harmonization methods below.

    The settings snapshot and the logger are always read together under the
    instance lock, so a concurrent :meth:`configure` cannot pair one call's
    settings with another call's logger.
    """

    def __init__(self) -> None:
        """Initialise an empty client awaiting configuration."""

        self._lock: threading.Lock = threading.Lock()
        self._settings: Settings | None = None
        # Each instance logs under its own unique logger name.
        self._logger_name: str = f"netrias_client.{uuid4().hex}"
        self._logger: logging.Logger | None = None

    def configure(
        self,
        api_key: str,
        timeout: float | None = None,
        log_level: LogLevel | str | None = None,
        confidence_threshold: float | None = None,
        discovery_use_gateway_bypass: bool | None = None,
        log_directory: Path | str | None = None,
    ) -> None:
        """Validate inputs and persist a new immutable settings snapshot.

        Parameters
        ----------
        api_key:
            Netrias API bearer token used for authentication.
        timeout:
            Overall request timeout in seconds (defaults to six hours).
        log_level:
            Desired logging verbosity as a :class:`~netrias_client._models.LogLevel`
            (string aliases are also accepted for convenience).
        confidence_threshold:
            Minimum confidence score required for discovery recommendations.
        discovery_use_gateway_bypass:
            When ``True`` (default) calls the temporary Lambda bypass instead of
            API Gateway.
        log_directory:
            Optional directory where this client's log files should be written.
            When omitted, logging remains stream-only.

        Calling this method multiple times replaces the active snapshot and
        reconfigures this client's logger. Validation errors are raised before
        any state is replaced.
        """

        settings = build_settings(
            api_key=api_key,
            timeout=timeout,
            log_level=log_level,
            confidence_threshold=confidence_threshold,
            discovery_use_gateway_bypass=discovery_use_gateway_bypass,
            log_directory=log_directory,
        )
        # Configure the logger and publish both values under the lock so that
        # concurrent configure() calls cannot interleave their logger setup
        # with each other's settings.
        with self._lock:
            logger = configure_logger(
                self._logger_name,
                settings.log_level,
                settings.log_directory,
            )
            self._settings = settings
            self._logger = logger

    @property
    def settings(self) -> Settings:
        """Return a defensive copy of the current settings.

        'why': aid observability without exposing internal state for mutation
        """

        return self._snapshot_settings()

    def discover_mapping(
        self,
        target_schema: str,
        column_samples: Mapping[str, Sequence[object]],
    ) -> ManifestPayload:
        """Perform synchronous mapping discovery for the provided schema."""

        settings, logger = self._snapshot_state()

        return _discover_mapping(
            settings=settings,
            target_schema=target_schema,
            column_samples=column_samples,
            logger=logger,
        )

    async def discover_mapping_async(
        self,
        target_schema: str,
        column_samples: Mapping[str, Sequence[object]],
    ) -> ManifestPayload:
        """Async variant of :meth:`discover_mapping` with identical semantics."""

        settings, logger = self._snapshot_state()

        return await _discover_mapping_async(
            settings=settings,
            target_schema=target_schema,
            column_samples=column_samples,
            logger=logger,
        )

    def discover_mapping_from_csv(
        self,
        source_csv: Path,
        target_schema: str,
        sample_limit: int = 25,
    ) -> ManifestPayload:
        """Derive column samples from a CSV file then perform mapping discovery."""

        settings, logger = self._snapshot_state()

        return _discover_cde_mapping(
            settings=settings,
            source_csv=source_csv,
            target_schema=target_schema,
            sample_limit=sample_limit,
            logger=logger,
        )

    def discover_cde_mapping(
        self,
        source_csv: Path,
        target_schema: str,
        sample_limit: int = 25,
    ) -> ManifestPayload:
        """Compatibility alias for :meth:`discover_mapping_from_csv`."""

        return self.discover_mapping_from_csv(
            source_csv=source_csv,
            target_schema=target_schema,
            sample_limit=sample_limit,
        )

    async def discover_mapping_from_csv_async(
        self,
        source_csv: Path,
        target_schema: str,
        sample_limit: int = 25,
    ) -> ManifestPayload:
        """Async variant of :meth:`discover_mapping_from_csv`."""

        settings, logger = self._snapshot_state()

        return await _discover_mapping_from_csv_async(
            settings=settings,
            source_csv=source_csv,
            target_schema=target_schema,
            sample_limit=sample_limit,
            logger=logger,
        )

    def harmonize(
        self,
        source_path: Path,
        manifest: Path | Mapping[str, object],
        output_path: Path | None = None,
        manifest_output_path: Path | None = None,
    ) -> HarmonizationResult:
        """Execute the harmonization workflow synchronously and block.

        The method accepts either a manifest mapping or a JSON file path and
        writes the harmonized CSV to the resolved output location (which may be
        auto-versioned). A :class:`HarmonizationResult` is always returned even on
        failure, allowing callers to inspect status and description.
        """

        settings, logger = self._snapshot_state()

        return _harmonize(
            settings=settings,
            source_path=source_path,
            manifest=manifest,
            output_path=output_path,
            manifest_output_path=manifest_output_path,
            logger=logger,
        )

    async def harmonize_async(
        self,
        source_path: Path,
        manifest: Path | Mapping[str, object],
        output_path: Path | None = None,
        manifest_output_path: Path | None = None,
    ) -> HarmonizationResult:
        """Async counterpart to :meth:`harmonize` with identical semantics."""

        settings, logger = self._snapshot_state()

        return await _harmonize_async(
            settings=settings,
            source_path=source_path,
            manifest=manifest,
            output_path=output_path,
            manifest_output_path=manifest_output_path,
            logger=logger,
        )

    def _snapshot_state(self) -> tuple[Settings, logging.Logger]:
        """Return a settings copy and the logger from one consistent read.

        Raises :class:`ClientConfigurationError` if :meth:`configure` has not
        been called yet.
        """

        with self._lock:
            if self._settings is None or self._logger is None:
                raise ClientConfigurationError(
                    "client not configured; call configure(api_key=...) before use"
                )
            return replace(self._settings), self._logger

    def _snapshot_settings(self) -> Settings:
        """Return a copy of the current settings or raise if not configured."""

        with self._lock:
            if self._settings is None:
                raise ClientConfigurationError(
                    "client not configured; call configure(api_key=...) before use"
                )
            return replace(self._settings)

    def _require_logger(self) -> logging.Logger:
        """Return the configured logger or raise if not configured."""

        # Read under the same lock configure() writes under.
        with self._lock:
            if self._logger is None:
                raise ClientConfigurationError(
                    "client not configured; call configure(api_key=...) before use"
                )
            return self._logger
@@ -0,0 +1,95 @@
1
+ """Manage runtime client configuration.
2
+
3
+ 'why': centralize settings creation and validation for NetriasClient
4
+ """
5
+ from __future__ import annotations
6
+
7
+ from pathlib import Path
8
+
9
+ from ._errors import ClientConfigurationError
10
+ from ._models import LogLevel, Settings
11
+
12
+
13
# Base URLs copied into every Settings snapshot by build_settings.
DISCOVERY_BASE_URL = "https://api.netriasbdf.cloud"
HARMONIZATION_BASE_URL = "https://tbdxz7nffi.execute-api.us-east-2.amazonaws.com"
# TODO: remove once API Gateway latency constraints are resolved.
# Function name, alias, and region for the bypass; not referenced elsewhere in this module.
BYPASS_FUNCTION = "cde-recommendation"
BYPASS_ALIAS = "prod"
BYPASS_REGION = "us-east-2"
19
+
20
+
21
def build_settings(
    api_key: str,
    timeout: float | None = None,
    log_level: LogLevel | str | None = None,
    confidence_threshold: float | None = None,
    discovery_use_gateway_bypass: bool | None = None,
    log_directory: Path | str | None = None,
) -> Settings:
    """Validate the supplied options and assemble a ``Settings`` snapshot.

    The API key is stripped and must be non-empty. Every other option is
    normalised by its dedicated helper, falling back to that helper's default
    when ``None``. The discovery and harmonization URLs come from the
    module-level constants.

    Raises:
        ClientConfigurationError: if the API key is empty or blank, or a
            helper rejects its value.
    """

    key = api_key.strip() if api_key else ""
    if not key:
        raise ClientConfigurationError("api_key must be a non-empty string; call configure(api_key=...) before use")

    # Keyword arguments are evaluated in written order, so validation runs as:
    # level, timeout, threshold, bypass flag, log directory.
    return Settings(
        api_key=key,
        discovery_url=DISCOVERY_BASE_URL,
        harmonization_url=HARMONIZATION_BASE_URL,
        log_level=_normalized_level(log_level),
        timeout=_validated_timeout(timeout),
        confidence_threshold=_validated_confidence_threshold(confidence_threshold),
        discovery_use_gateway_bypass=_normalized_bool(discovery_use_gateway_bypass, default=True),
        log_directory=_validated_log_directory(log_directory),
    )
51
+
52
+
53
def _normalized_level(level: LogLevel | str | None) -> LogLevel:
    """Resolve a log level given as enum member, name string, or ``None``.

    ``None`` maps to ``LogLevel.INFO``; strings are upper-cased and looked up
    by member name.

    Raises:
        ClientConfigurationError: if the string names no ``LogLevel`` member.
    """
    if isinstance(level, LogLevel):
        return level
    if level is None:
        return LogLevel.INFO

    member_name = level.upper()
    try:
        return LogLevel[member_name]
    except KeyError as exc:
        raise ClientConfigurationError(f"unsupported log_level: {level}") from exc
63
+
64
+
65
+ def _validated_timeout(timeout: float | None) -> float:
66
+ if timeout is None:
67
+ return 21600.0 # default to 6 hours to accommodate long-running jobs
68
+ if timeout <= 0:
69
+ raise ClientConfigurationError("timeout must be positive when provided")
70
+ return float(timeout)
71
+
72
+
73
+ def _validated_confidence_threshold(value: float | None) -> float:
74
+ if value is None:
75
+ return 0.8
76
+ if not (0.0 <= value <= 1.0):
77
+ raise ClientConfigurationError("confidence_threshold must be between 0.0 and 1.0")
78
+ return float(value)
79
+
80
+
81
+ def _normalized_bool(value: bool | None, default: bool = False) -> bool:
82
+ if value is None:
83
+ return default
84
+ return bool(value)
85
+
86
+
87
+ def _validated_log_directory(value: Path | str | None) -> Path | None:
88
+ if value is None:
89
+ return None
90
+ directory = Path(value)
91
+ try:
92
+ directory.mkdir(parents=True, exist_ok=True)
93
+ except OSError as exc:
94
+ raise ClientConfigurationError(f"unable to create log directory {directory}: {exc}") from exc
95
+ return directory