netrias_client 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,559 @@
1
+ """Coordinate stateful access to discovery and harmonization APIs.
2
+
3
+ 'why': provide a single, inspectable entry point that captures configuration once
4
+ and exposes typed discovery and harmonization helpers (sync/async) for consumers
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import logging
9
+ import threading
10
+ from collections.abc import Mapping, Sequence
11
+ from dataclasses import replace
12
+ from pathlib import Path
13
+ from uuid import uuid4
14
+
15
+ from ._core import harmonize as _harmonize
16
+ from ._core import harmonize_async as _harmonize_async
17
+ from ._data_model_store import (
18
+ get_pv_set as _get_pv_set,
19
+ get_pv_set_async as _get_pv_set_async,
20
+ list_cdes as _list_cdes,
21
+ list_cdes_async as _list_cdes_async,
22
+ list_data_models as _list_data_models,
23
+ list_data_models_async as _list_data_models_async,
24
+ list_pvs as _list_pvs,
25
+ list_pvs_async as _list_pvs_async,
26
+ )
27
+ from ._discovery import (
28
+ discover_cde_mapping as _discover_cde_mapping,
29
+ discover_mapping as _discover_mapping,
30
+ discover_mapping_async as _discover_mapping_async,
31
+ discover_mapping_from_csv_async as _discover_mapping_from_csv_async,
32
+ )
33
+ from ._config import build_settings
34
+ from ._errors import ClientConfigurationError
35
+ from ._logging import configure_logger
36
+ from ._models import CDE, DataModel, HarmonizationResult, LogLevel, PermissibleValue, Settings
37
+
38
+
39
# Return annotation shared by the mapping-discovery methods of NetriasClient:
# a three-level nested dict with string keys and arbitrary leaf objects.
ManifestPayload = dict[str, dict[str, dict[str, object]]]
40
+
41
+
42
class NetriasClient:
    """Expose discovery and harmonization workflows behind instance state.

    A `NetriasClient` manages configuration snapshots (API key, URLs, thresholds,
    bypass preferences) and threads them through every outbound call. Consumers
    typically instantiate a client, call :meth:`configure`, and then interact via
    the discovery/harmonization methods below.

    Every public operation raises :class:`ClientConfigurationError` when it is
    invoked before :meth:`configure`.
    """

    _NOT_CONFIGURED_MESSAGE = "client not configured; call configure(api_key=...) before use"

    def __init__(self) -> None:
        """Initialise an empty client awaiting configuration."""

        # Guards the (_settings, _logger) pair, which configure() replaces together.
        self._lock: threading.Lock = threading.Lock()
        self._settings: Settings | None = None
        # A unique logger name per instance keeps separate clients from sharing a logger.
        self._logger_name: str = f"netrias_client.{uuid4().hex}"
        self._logger: logging.Logger | None = None

    def configure(
        self,
        api_key: str,
        timeout: float | None = None,
        log_level: LogLevel | str | None = None,
        confidence_threshold: float | None = None,
        discovery_use_gateway_bypass: bool | None = None,
        log_directory: Path | str | None = None,
    ) -> None:
        """Validate inputs and persist a new immutable settings snapshot.

        Parameters
        ----------
        api_key:
            Netrias API bearer token used for authentication.
        timeout:
            Overall request timeout in seconds (defaults to six hours).
        log_level:
            Desired logging verbosity as a :class:`~netrias_client._models.LogLevel`
            (string aliases are also accepted for convenience).
        confidence_threshold:
            Minimum confidence score required for discovery recommendations.
        discovery_use_gateway_bypass:
            When ``True`` (default) calls the temporary Lambda bypass instead of
            API Gateway.
        log_directory:
            Optional directory where this client's log files should be written.
            When omitted, logging remains stream-only.

        Calling this method multiple times replaces the active snapshot and
        reconfigures this client's logger.
        """

        settings = build_settings(
            api_key=api_key,
            timeout=timeout,
            log_level=log_level,
            confidence_threshold=confidence_threshold,
            discovery_use_gateway_bypass=discovery_use_gateway_bypass,
            log_directory=log_directory,
        )
        logger = configure_logger(
            self._logger_name,
            settings.log_level,
            settings.log_directory,
        )
        # Publish both values in one critical section so readers never observe
        # settings from one configure() call next to the logger of another.
        with self._lock:
            self._settings = settings
            self._logger = logger

    @property
    def settings(self) -> Settings:
        """Return a defensive copy of the current settings.

        'why': aid observability without exposing internal state for mutation
        """

        return self._snapshot_settings()

    def discover_mapping(
        self,
        target_schema: str,
        target_version: str,
        column_samples: Mapping[str, Sequence[object]],
        top_k: int | None = None,
    ) -> ManifestPayload:
        """Perform synchronous mapping discovery for the provided schema."""

        settings, logger = self._snapshot_state()

        return _discover_mapping(
            settings=settings,
            target_schema=target_schema,
            target_version=target_version,
            column_samples=column_samples,
            logger=logger,
            top_k=top_k,
        )

    async def discover_mapping_async(
        self,
        target_schema: str,
        target_version: str,
        column_samples: Mapping[str, Sequence[object]],
        top_k: int | None = None,
    ) -> ManifestPayload:
        """Async variant of :meth:`discover_mapping` with identical semantics."""

        settings, logger = self._snapshot_state()

        return await _discover_mapping_async(
            settings=settings,
            target_schema=target_schema,
            target_version=target_version,
            column_samples=column_samples,
            logger=logger,
            top_k=top_k,
        )

    def discover_mapping_from_csv(
        self,
        source_csv: Path,
        target_schema: str,
        target_version: str,
        sample_limit: int = 25,
        top_k: int | None = None,
    ) -> ManifestPayload:
        """Derive column samples from a CSV file then perform mapping discovery."""

        settings, logger = self._snapshot_state()

        return _discover_cde_mapping(
            settings=settings,
            source_csv=source_csv,
            target_schema=target_schema,
            target_version=target_version,
            sample_limit=sample_limit,
            logger=logger,
            top_k=top_k,
        )

    def discover_cde_mapping(
        self,
        source_csv: Path,
        target_schema: str,
        target_version: str,
        sample_limit: int = 25,
        top_k: int | None = None,
    ) -> ManifestPayload:
        """Compatibility alias for :meth:`discover_mapping_from_csv`."""

        return self.discover_mapping_from_csv(
            source_csv=source_csv,
            target_schema=target_schema,
            target_version=target_version,
            sample_limit=sample_limit,
            top_k=top_k,
        )

    async def discover_mapping_from_csv_async(
        self,
        source_csv: Path,
        target_schema: str,
        target_version: str,
        sample_limit: int = 25,
        top_k: int | None = None,
    ) -> ManifestPayload:
        """Async variant of :meth:`discover_mapping_from_csv`."""

        settings, logger = self._snapshot_state()

        return await _discover_mapping_from_csv_async(
            settings=settings,
            source_csv=source_csv,
            target_schema=target_schema,
            target_version=target_version,
            sample_limit=sample_limit,
            logger=logger,
            top_k=top_k,
        )

    def harmonize(
        self,
        source_path: Path,
        manifest: Path | Mapping[str, object],
        output_path: Path | None = None,
        manifest_output_path: Path | None = None,
    ) -> HarmonizationResult:
        """Execute the harmonization workflow synchronously and block.

        The method accepts either a manifest mapping or a JSON file path and
        writes the harmonized CSV to the resolved output location (which may be
        auto-versioned). Workflow failures are reported through the returned
        :class:`HarmonizationResult` (status and description) rather than
        raised; an unconfigured client still raises
        :class:`ClientConfigurationError`.
        """

        settings, logger = self._snapshot_state()

        return _harmonize(
            settings=settings,
            source_path=source_path,
            manifest=manifest,
            output_path=output_path,
            manifest_output_path=manifest_output_path,
            logger=logger,
        )

    async def harmonize_async(
        self,
        source_path: Path,
        manifest: Path | Mapping[str, object],
        output_path: Path | None = None,
        manifest_output_path: Path | None = None,
    ) -> HarmonizationResult:
        """Async counterpart to :meth:`harmonize` with identical semantics."""

        settings, logger = self._snapshot_state()

        return await _harmonize_async(
            settings=settings,
            source_path=source_path,
            manifest=manifest,
            output_path=output_path,
            manifest_output_path=manifest_output_path,
            logger=logger,
        )

    # ---- Data Model Store methods ----

    def list_data_models(
        self,
        query: str | None = None,
        include_versions: bool = False,
        include_counts: bool = False,
        limit: int | None = None,
        offset: int = 0,
    ) -> tuple[DataModel, ...]:
        """Fetch data models from the Data Model Store.

        Parameters
        ----------
        query:
            Substring search on model key or name.
        include_versions:
            Include version metadata per model.
        include_counts:
            Include CDE/PV counts per version.
        limit:
            Maximum number of results to return.
        offset:
            Number of results to skip.
        """

        settings = self._snapshot_settings()

        return _list_data_models(
            settings=settings,
            query=query,
            include_versions=include_versions,
            include_counts=include_counts,
            limit=limit,
            offset=offset,
        )

    async def list_data_models_async(
        self,
        query: str | None = None,
        include_versions: bool = False,
        include_counts: bool = False,
        limit: int | None = None,
        offset: int = 0,
    ) -> tuple[DataModel, ...]:
        """Async variant of :meth:`list_data_models`."""

        settings = self._snapshot_settings()

        return await _list_data_models_async(
            settings=settings,
            query=query,
            include_versions=include_versions,
            include_counts=include_counts,
            limit=limit,
            offset=offset,
        )

    def list_cdes(
        self,
        model_key: str,
        version: str,
        include_description: bool = False,
        query: str | None = None,
        limit: int | None = None,
        offset: int = 0,
    ) -> tuple[CDE, ...]:
        """Fetch CDEs for a data model version from the Data Model Store.

        Parameters
        ----------
        model_key:
            Data model key (e.g., 'ccdi').
        version:
            Version label (e.g., 'v1').
        include_description:
            Include CDE descriptions.
        query:
            Substring search on cde_key.
        limit:
            Maximum number of results to return.
        offset:
            Number of results to skip.
        """

        settings = self._snapshot_settings()

        return _list_cdes(
            settings=settings,
            model_key=model_key,
            version=version,
            include_description=include_description,
            query=query,
            limit=limit,
            offset=offset,
        )

    async def list_cdes_async(
        self,
        model_key: str,
        version: str,
        include_description: bool = False,
        query: str | None = None,
        limit: int | None = None,
        offset: int = 0,
    ) -> tuple[CDE, ...]:
        """Async variant of :meth:`list_cdes`."""

        settings = self._snapshot_settings()

        return await _list_cdes_async(
            settings=settings,
            model_key=model_key,
            version=version,
            include_description=include_description,
            query=query,
            limit=limit,
            offset=offset,
        )

    def list_pvs(
        self,
        model_key: str,
        version: str,
        cde_key: str,
        include_inactive: bool = False,
        query: str | None = None,
        limit: int | None = None,
        offset: int = 0,
    ) -> tuple[PermissibleValue, ...]:
        """Fetch permissible values for a CDE from the Data Model Store.

        Parameters
        ----------
        model_key:
            Data model key (e.g., 'ccdi').
        version:
            Version label (e.g., 'v1').
        cde_key:
            CDE key (e.g., 'sex_at_birth').
        include_inactive:
            Include inactive permissible values.
        query:
            Substring search on PV value.
        limit:
            Maximum number of results to return.
        offset:
            Number of results to skip.
        """

        settings = self._snapshot_settings()

        return _list_pvs(
            settings=settings,
            model_key=model_key,
            version=version,
            cde_key=cde_key,
            include_inactive=include_inactive,
            query=query,
            limit=limit,
            offset=offset,
        )

    async def list_pvs_async(
        self,
        model_key: str,
        version: str,
        cde_key: str,
        include_inactive: bool = False,
        query: str | None = None,
        limit: int | None = None,
        offset: int = 0,
    ) -> tuple[PermissibleValue, ...]:
        """Async variant of :meth:`list_pvs`."""

        settings = self._snapshot_settings()

        return await _list_pvs_async(
            settings=settings,
            model_key=model_key,
            version=version,
            cde_key=cde_key,
            include_inactive=include_inactive,
            query=query,
            limit=limit,
            offset=offset,
        )

    def get_pv_set(
        self,
        model_key: str,
        version: str,
        cde_key: str,
        include_inactive: bool = False,
    ) -> frozenset[str]:
        """Return all permissible values for a CDE as a set for O(1) membership testing.

        'why': validation use case requires efficient lookup; auto-paginates all results

        Parameters
        ----------
        model_key:
            Data model key (e.g., 'ccdi').
        version:
            Version label (e.g., 'v1').
        cde_key:
            CDE key (e.g., 'sex_at_birth').
        include_inactive:
            Include inactive permissible values.
        """

        settings = self._snapshot_settings()

        return _get_pv_set(
            settings=settings,
            model_key=model_key,
            version=version,
            cde_key=cde_key,
            include_inactive=include_inactive,
        )

    async def get_pv_set_async(
        self,
        model_key: str,
        version: str,
        cde_key: str,
        include_inactive: bool = False,
    ) -> frozenset[str]:
        """Async variant of :meth:`get_pv_set`."""

        settings = self._snapshot_settings()

        return await _get_pv_set_async(
            settings=settings,
            model_key=model_key,
            version=version,
            cde_key=cde_key,
            include_inactive=include_inactive,
        )

    def validate_value(
        self,
        value: str,
        model_key: str,
        version: str,
        cde_key: str,
        include_inactive: bool = False,
    ) -> bool:
        """Check if a value is in the permissible values for a CDE.

        'why': convenience wrapper for the common validation use case

        Parameters
        ----------
        value:
            The value to validate.
        model_key:
            Data model key (e.g., 'ccdi').
        version:
            Version label (e.g., 'v1').
        cde_key:
            CDE key (e.g., 'sex_at_birth').
        include_inactive:
            Forwarded to :meth:`get_pv_set`; when ``True`` inactive permissible
            values also count as valid. Defaults to ``False``.
        """

        pv_set = self.get_pv_set(
            model_key,
            version,
            cde_key,
            include_inactive=include_inactive,
        )
        return value in pv_set

    async def validate_value_async(
        self,
        value: str,
        model_key: str,
        version: str,
        cde_key: str,
        include_inactive: bool = False,
    ) -> bool:
        """Async variant of :meth:`validate_value`."""

        pv_set = await self.get_pv_set_async(
            model_key,
            version,
            cde_key,
            include_inactive=include_inactive,
        )
        return value in pv_set

    def _snapshot_settings(self) -> Settings:
        """Return a copy of the current settings or raise if not configured."""

        with self._lock:
            current = self._settings
        if current is None:
            raise ClientConfigurationError(self._NOT_CONFIGURED_MESSAGE)
        return replace(current)

    def _require_logger(self) -> logging.Logger:
        """Return the configured logger or raise if not configured."""

        # Read under the same lock configure() writes under.
        with self._lock:
            logger = self._logger
        if logger is None:
            raise ClientConfigurationError(self._NOT_CONFIGURED_MESSAGE)
        return logger

    def _snapshot_state(self) -> tuple[Settings, logging.Logger]:
        """Return ``(settings copy, logger)`` captured in one critical section.

        'why': configure() replaces both values together, so reading them in two
        separate steps could pair the settings of one configuration with the
        logger of another when configure() runs concurrently
        """

        with self._lock:
            current, logger = self._settings, self._logger
        if current is None or logger is None:
            raise ClientConfigurationError(self._NOT_CONFIGURED_MESSAGE)
        return replace(current), logger
@@ -0,0 +1,101 @@
1
+ """Manage runtime client configuration.
2
+
3
+ 'why': centralize settings creation and validation for NetriasClient
4
+ """
5
+ from __future__ import annotations
6
+
7
+ from pathlib import Path
8
+
9
+ from ._errors import ClientConfigurationError
10
+ from ._models import DataModelStoreEndpoints, LogLevel, Settings
11
+
12
+
13
# Base URL that build_settings() stores as Settings.discovery_url.
DISCOVERY_BASE_URL = "https://api.netriasbdf.cloud"
# Base URL that build_settings() stores as Settings.harmonization_url.
HARMONIZATION_BASE_URL = "https://tbdxz7nffi.execute-api.us-east-2.amazonaws.com"
# Base URL that build_settings() wraps in DataModelStoreEndpoints.
DATA_MODEL_STORE_BASE_URL = "https://85fnwlcuc2.execute-api.us-east-2.amazonaws.com/default"
# TODO: remove once API Gateway latency constraints are resolved.
# NOTE(review): these presumably name the Lambda function, alias and AWS region
# used by the discovery gateway bypass; nothing in the code shown here reads
# them, so confirm against the discovery module.
BYPASS_FUNCTION = "cde-recommendation"
BYPASS_ALIAS = "prod"
BYPASS_REGION = "us-east-2"
20
+
21
+
22
def build_settings(
    api_key: str,
    timeout: float | None = None,
    log_level: LogLevel | str | None = None,
    confidence_threshold: float | None = None,
    discovery_use_gateway_bypass: bool | None = None,
    log_directory: Path | str | None = None,
) -> Settings:
    """Assemble a validated ``Settings`` snapshot from user-supplied options.

    The API key is stripped of surrounding whitespace and must be non-empty.
    Each optional argument is normalised by its dedicated helper; the service
    URLs come from the module-level constants.

    Raises
    ------
    ClientConfigurationError
        If the API key is empty or any helper rejects its argument.
    """

    stripped_key = (api_key or "").strip()
    if not stripped_key:
        raise ClientConfigurationError("api_key must be a non-empty string; call configure(api_key=...) before use")

    # The helpers run in this fixed order so that, when several arguments are
    # invalid, the same one is reported first; the log directory is created last.
    resolved_level = _normalized_level(log_level)
    resolved_timeout = _validated_timeout(timeout)
    resolved_threshold = _validated_confidence_threshold(confidence_threshold)
    use_bypass = _normalized_bool(discovery_use_gateway_bypass, default=True)
    resolved_directory = _validated_log_directory(log_directory)

    return Settings(
        api_key=stripped_key,
        discovery_url=DISCOVERY_BASE_URL,
        harmonization_url=HARMONIZATION_BASE_URL,
        timeout=resolved_timeout,
        log_level=resolved_level,
        confidence_threshold=resolved_threshold,
        discovery_use_gateway_bypass=use_bypass,
        log_directory=resolved_directory,
        data_model_store_endpoints=DataModelStoreEndpoints(
            base_url=DATA_MODEL_STORE_BASE_URL,
        ),
    )
57
+
58
+
59
def _normalized_level(level: LogLevel | str | None) -> LogLevel:
    """Coerce *level* to a ``LogLevel`` member.

    ``None`` maps to ``LogLevel.INFO`` and an existing member is returned as is.
    A string is upper-cased and looked up by member name; an unknown name raises
    ``ClientConfigurationError`` chained to the underlying ``KeyError``.
    """

    if level is None:
        return LogLevel.INFO
    if isinstance(level, LogLevel):
        return level

    member_name = level.upper()
    try:
        member = LogLevel[member_name]
    except KeyError as exc:
        raise ClientConfigurationError(f"unsupported log_level: {level}") from exc
    else:
        return member
69
+
70
+
71
# Six hours, to accommodate long-running jobs.
_DEFAULT_TIMEOUT_SECONDS = 21600.0


def _validated_timeout(timeout: float | None) -> float:
    """Return the request timeout in seconds as a float.

    ``None`` selects the six-hour default. Any other value must be strictly
    positive.

    Raises
    ------
    ClientConfigurationError
        If *timeout* is zero, negative, or NaN.
    """

    if timeout is None:
        return _DEFAULT_TIMEOUT_SECONDS
    # ``not timeout > 0`` instead of ``timeout <= 0``: every comparison with NaN
    # is False, so the latter would let NaN through as a "valid" timeout.
    if not timeout > 0:
        raise ClientConfigurationError("timeout must be positive when provided")
    return float(timeout)
77
+
78
+
79
def _validated_confidence_threshold(value: float | None) -> float:
    """Return the confidence threshold as a float, defaulting to ``0.8``.

    A supplied value must lie in the closed interval ``[0.0, 1.0]``; anything
    else raises ``ClientConfigurationError``.
    """

    # Keep the chained comparison: it is False for NaN, so NaN is rejected too.
    acceptable = value is None or 0.0 <= value <= 1.0
    if not acceptable:
        raise ClientConfigurationError("confidence_threshold must be between 0.0 and 1.0")
    return 0.8 if value is None else float(value)
85
+
86
+
87
def _normalized_bool(value: bool | None, default: bool = False) -> bool:
    """Return *default* when *value* is ``None``, otherwise ``bool(value)``."""

    return default if value is None else bool(value)
91
+
92
+
93
def _validated_log_directory(value: Path | str | None) -> Path | None:
    """Return *value* as a ``Path`` after making sure the directory exists.

    ``None`` is passed through unchanged. Otherwise the directory and any
    missing parents are created; an ``OSError`` during creation is re-raised as
    ``ClientConfigurationError`` with the original error chained.
    """

    if value is None:
        return None

    target = Path(value)
    try:
        target.mkdir(parents=True, exist_ok=True)
    except OSError as exc:
        raise ClientConfigurationError(f"unable to create log directory {target}: {exc}") from exc
    else:
        return target