netrias_client 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,525 @@
1
+ """Mapping discovery workflow functions.
2
+
3
+ 'why': call the recommendation service and normalize responses for callers
4
+ """
5
+ from __future__ import annotations
6
+
7
+ import asyncio
8
+ import json
9
+ import csv
10
+ import time
11
+ from pathlib import Path
12
+ from collections.abc import Mapping, Sequence
13
+ from typing import cast
14
+
15
+ import httpx
16
+ import logging
17
+
18
+ from ._adapter import build_column_mapping_payload
19
+ from ._config import BYPASS_ALIAS, BYPASS_FUNCTION, BYPASS_REGION
20
+ from ._errors import MappingDiscoveryError, NetriasAPIUnavailable
21
+ from ._gateway_bypass import GatewayBypassError, invoke_cde_recommendation_alias
22
+ from ._http import request_mapping_discovery
23
+ from ._models import MappingDiscoveryResult, MappingRecommendationOption, MappingSuggestion, Settings
24
+ from ._validators import validate_column_samples, validate_target_schema, validate_target_version, validate_top_k, validate_source_path
25
+
26
+
27
# Shape of the discovery manifest: top-level sections (e.g. "column_mappings")
# -> per-column entries -> arbitrary mapping detail fields.
ManifestPayload = dict[str, dict[str, dict[str, object]]]
28
+
29
+
30
async def _discover_mapping_async(
    settings: Settings,
    target_schema: str,
    target_version: str,
    column_samples: Mapping[str, Sequence[object]],
    logger: logging.Logger,
    top_k: int | None = None,
) -> ManifestPayload:
    """Perform mapping discovery via the recommendation endpoint.

    Validates all inputs, calls the configured backend (HTTP API or
    gateway bypass, chosen inside `_discover_with_backend`), and converts
    the discovery result into a manifest payload.

    Raises:
        NetriasAPIUnavailable: on timeout, transport, or bypass failures
            (re-raised by `_handle_discovery_error`).
    """

    # Validate before any network work so bad input fails fast.
    schema = validate_target_schema(target_schema)
    version = validate_target_version(target_version)
    validated_top_k = validate_top_k(top_k)
    samples: dict[str, list[str]] = validate_column_samples(column_samples)
    started = time.perf_counter()
    logger.info("discover mapping start: schema=%s version=%s columns=%s", schema, version, len(samples))

    try:
        result = await _discover_with_backend(settings, schema, version, samples, logger, validated_top_k)
    except (httpx.TimeoutException, httpx.HTTPError, GatewayBypassError) as exc:
        _handle_discovery_error(schema, started, exc, logger)
        # _handle_discovery_error always raises; this line only satisfies
        # static analysis that the except branch cannot fall through.
        raise AssertionError("_handle_discovery_error should raise") from exc

    manifest = build_column_mapping_payload(
        result,
        threshold=settings.confidence_threshold,
        logger=logger,
    )
    elapsed = time.perf_counter() - started
    logger.info(
        "discover mapping complete: schema=%s version=%s columns=%s duration=%.2fs",
        schema,
        version,
        len(manifest.get("column_mappings", {})),
        elapsed,
    )
    return manifest
67
+
68
+
69
def discover_mapping(
    settings: Settings,
    target_schema: str,
    target_version: str,
    column_samples: Mapping[str, Sequence[object]],
    logger: logging.Logger,
    top_k: int | None = None,
) -> ManifestPayload:
    """Blocking facade over `_discover_mapping_async`."""

    coroutine = _discover_mapping_async(
        settings=settings,
        target_schema=target_schema,
        target_version=target_version,
        column_samples=column_samples,
        logger=logger,
        top_k=top_k,
    )
    return asyncio.run(coroutine)
89
+
90
+
91
async def discover_mapping_async(
    settings: Settings,
    target_schema: str,
    target_version: str,
    column_samples: Mapping[str, Sequence[object]],
    logger: logging.Logger,
    top_k: int | None = None,
) -> ManifestPayload:
    """Async entry point with the same contract as `discover_mapping`."""

    # Parameters line up one-to-one with the private worker.
    return await _discover_mapping_async(
        settings,
        target_schema,
        target_version,
        column_samples,
        logger,
        top_k,
    )
109
+
110
+
111
def discover_cde_mapping(
    settings: Settings,
    source_csv: Path,
    target_schema: str,
    target_version: str,
    sample_limit: int,
    logger: logging.Logger,
    top_k: int | None = None,
) -> ManifestPayload:
    """Discover mappings for a CSV file by sampling its columns first."""

    column_samples = _samples_from_csv(source_csv, sample_limit)
    return discover_mapping(
        settings,
        target_schema,
        target_version,
        column_samples,
        logger,
        top_k,
    )
131
+
132
+
133
async def discover_mapping_from_csv_async(
    settings: Settings,
    source_csv: Path,
    target_schema: str,
    target_version: str,
    sample_limit: int,
    logger: logging.Logger,
    top_k: int | None = None,
) -> ManifestPayload:
    """Async variant of `discover_cde_mapping`.

    Reads up to `sample_limit` rows from `source_csv` to build per-column
    samples, then forwards them to `discover_mapping_async`.
    """
    # Fix: the docstring previously referenced a nonexistent
    # `discover_mapping_from_csv`; the sync counterpart is `discover_cde_mapping`.

    samples = _samples_from_csv(source_csv, sample_limit)
    return await discover_mapping_async(
        settings=settings,
        target_schema=target_schema,
        target_version=target_version,
        column_samples=samples,
        logger=logger,
        top_k=top_k,
    )
153
+
154
+
155
async def _discover_with_backend(
    settings: Settings,
    schema: str,
    version: str,
    samples: Mapping[str, Sequence[str]],
    logger: logging.Logger,
    top_k: int | None = None,
) -> MappingDiscoveryResult:
    """Dispatch discovery to the gateway-bypass alias or the HTTP API.

    `settings.discovery_use_gateway_bypass` selects the backend; both
    paths normalize their raw payload into a `MappingDiscoveryResult`.
    """
    if settings.discovery_use_gateway_bypass:
        logger.debug("discover backend via bypass alias")
        # Direct alias invocation (skips the public HTTP gateway).
        payload = invoke_cde_recommendation_alias(
            target_schema=schema,
            target_version=version,
            columns=samples,
            function_name=BYPASS_FUNCTION,
            alias=BYPASS_ALIAS,
            region_name=BYPASS_REGION,
            timeout_seconds=settings.timeout,
            logger=logger,
            top_k=top_k,
        )
        return _result_from_payload(payload, schema)

    logger.debug("discover backend via HTTP API")
    response = await request_mapping_discovery(
        base_url=settings.discovery_url,
        api_key=settings.api_key,
        timeout=settings.timeout,
        schema=schema,
        version=version,
        columns=samples,
        top_k=top_k,
    )
    # HTTP path must also translate error statuses into client exceptions.
    return _interpret_discovery_response(response, schema)
189
+
190
+
191
def _handle_discovery_error(
    schema: str,
    started: float,
    exc: Exception,
    logger: logging.Logger,
) -> None:
    """Log a discovery failure and re-raise it as `NetriasAPIUnavailable`.

    Never returns normally: every branch raises.
    """
    elapsed = time.perf_counter() - started
    if isinstance(exc, httpx.TimeoutException):  # pragma: no cover - exercised via integration tests
        logger.error("discover mapping timeout: schema=%s duration=%.2fs err=%s", schema, elapsed, exc)
        raise NetriasAPIUnavailable("mapping discovery timed out") from exc
    if isinstance(exc, GatewayBypassError):
        logger.error("discover mapping bypass error: schema=%s duration=%.2fs err=%s", schema, elapsed, exc)
        raise NetriasAPIUnavailable(f"gateway bypass error: {exc}") from exc
    # Anything else from the caller's except clause is a transport problem.
    logger.error("discover mapping transport error: schema=%s duration=%.2fs err=%s", schema, elapsed, exc)
    raise NetriasAPIUnavailable(f"mapping discovery transport error: {exc}") from exc
217
+
218
+
219
def _interpret_discovery_response(response: httpx.Response, requested_schema: str) -> MappingDiscoveryResult:
    """Translate an HTTP response into a discovery result, raising on 4xx/5xx."""
    status = response.status_code
    if status >= 400:
        message = _error_message(response)
        # 5xx -> service unavailable; 4xx -> caller/request problem.
        if status >= 500:
            raise NetriasAPIUnavailable(message)
        raise MappingDiscoveryError(message)
    return _result_from_payload(_load_payload(response), requested_schema)
229
+
230
+
231
def _result_from_payload(payload: Mapping[str, object], requested_schema: str) -> MappingDiscoveryResult:
    """Assemble a `MappingDiscoveryResult` from a decoded payload."""
    return MappingDiscoveryResult(
        schema=_resolved_schema(payload, requested_schema),
        suggestions=_suggestions_from_payload(payload),
        raw=payload,
    )
235
+
236
+
237
def _error_message(response: httpx.Response) -> str:
    """Best-effort human-readable error text for a failed response."""
    extracted = _message_from_mapping(_mapping_or_none(_safe_json(response)))
    # Fall back to a generic status-based message when the body is unhelpful.
    return extracted or _default_error(response)
243
+
244
+
245
+ def _extract_message(payload: Mapping[str, object]) -> str | None:
246
+ for key in ("message", "error", "detail"):
247
+ value = payload.get(key)
248
+ if isinstance(value, str) and value.strip():
249
+ return value.strip()
250
+ return None
251
+
252
+
253
def _message_from_mapping(payload: Mapping[str, object] | None) -> str | None:
    """Search the payload, then its nested `body`, for an error message."""
    if payload is None:
        return None
    message = _extract_message(payload)
    if message is not None:
        return message
    body = _resolve_body_optional(payload)
    return _extract_message(body) if body else None
263
+
264
+
265
+ def _mapping_or_none(data: object) -> Mapping[str, object] | None:
266
+ if isinstance(data, Mapping):
267
+ return cast(Mapping[str, object], data)
268
+ return None
269
+
270
+
271
+ def _safe_json(response: httpx.Response) -> object:
272
+ try:
273
+ return cast(object, response.json())
274
+ except json.JSONDecodeError:
275
+ return None
276
+
277
+
278
+ def _default_error(response: httpx.Response) -> str:
279
+ return f"mapping discovery failed (HTTP {response.status_code})"
280
+
281
+
282
def _resolve_body_optional(container: Mapping[str, object]) -> dict[str, object] | None:
    """Leniently decode `container['body']` into a str-keyed dict, or None."""
    raw = container.get("body")
    if raw is None:
        return None
    decoded = _decode_body(raw, strict=False)
    if not isinstance(decoded, dict):
        return None
    return _coerce_mapping(cast(Mapping[object, object], decoded), strict=False)
290
+
291
+
292
def _expect_mapping(data: object) -> dict[str, object]:
    """Require *data* to be a str-keyed JSON object; raise otherwise."""
    if isinstance(data, dict):
        coerced = _coerce_mapping(cast(Mapping[object, object], data), strict=True)
        if coerced is not None:
            return coerced
    raise MappingDiscoveryError("mapping discovery response body must be a JSON object")
298
+
299
+
300
def _extract_body_object(container: Mapping[str, object]) -> dict[str, object] | None:
    """Strictly decode `container['body']` when present; raise if not an object."""
    if "body" not in container:
        return None
    decoded = _decode_body(container["body"], strict=True)
    if isinstance(decoded, dict):
        coerced = _coerce_mapping(cast(Mapping[object, object], decoded), strict=True)
        if coerced is not None:
            return coerced
    raise MappingDiscoveryError("mapping discovery response body must be a JSON object")
309
+
310
+
311
+ def _entries_from_value(value: object) -> tuple[Mapping[str, object], ...]:
312
+ if not isinstance(value, list):
313
+ return ()
314
+ collected: list[Mapping[str, object]] = []
315
+ items = cast(list[object], value)
316
+ for item in items:
317
+ if isinstance(item, Mapping):
318
+ collected.append(cast(Mapping[str, object], item))
319
+ return tuple(collected)
320
+
321
+
322
+ def _coerce_mapping(obj: Mapping[object, object], strict: bool) -> dict[str, object] | None:
323
+ result: dict[str, object] = {}
324
+ for key, value in obj.items():
325
+ if not isinstance(key, str):
326
+ if strict:
327
+ raise MappingDiscoveryError("mapping discovery response body must be a JSON object")
328
+ return None
329
+ result[key] = value
330
+ return result
331
+
332
+
333
def _samples_from_csv(csv_path: Path, sample_limit: int) -> dict[str, list[str]]:
    """Build per-column sample lists from a CSV, dropping columns with no values."""
    dataset = validate_source_path(csv_path)
    headers, rows = _read_limited_rows(dataset, sample_limit)
    collected: dict[str, list[str]] = {header: [] for header in headers}
    _fill_samples(collected, rows)
    return {header: values for header, values in collected.items() if values}
339
+
340
+
341
+ def _read_limited_rows(dataset: Path, sample_limit: int) -> tuple[list[str], list[dict[str, str | None]]]:
342
+ headers: list[str] = []
343
+ rows: list[dict[str, str | None]] = []
344
+ with dataset.open("r", encoding="utf-8", newline="") as handle:
345
+ reader = csv.DictReader(handle)
346
+ headers = [header for header in reader.fieldnames or [] if header]
347
+ for index, row in enumerate(reader):
348
+ if index >= sample_limit:
349
+ break
350
+ rows.append(row)
351
+ return headers, rows
352
+
353
+
354
+ def _fill_samples(samples: dict[str, list[str]], rows: list[dict[str, str | None]]) -> None:
355
+ for row in rows:
356
+ _append_row(samples, row)
357
+
358
+
359
+ def _append_row(samples: dict[str, list[str]], row: dict[str, str | None]) -> None:
360
+ for header, raw_value in row.items():
361
+ if header not in samples or raw_value is None:
362
+ continue
363
+ value = raw_value.strip()
364
+ if value:
365
+ samples[header].append(value)
366
+
367
+
368
+ def _decode_body(body: object, strict: bool) -> object:
369
+ if not isinstance(body, str):
370
+ return body
371
+ try:
372
+ return cast(object, json.loads(body))
373
+ except json.JSONDecodeError as exc:
374
+ if strict:
375
+ raise MappingDiscoveryError("mapping discovery body was not valid JSON") from exc
376
+ return None
377
+
378
+
379
def _load_payload(response: httpx.Response) -> dict[str, object]:
    """Decode the response JSON, unwrapping a nested `body` object when present."""
    envelope = _expect_mapping(_safe_json(response))
    body = _extract_body_object(envelope)
    return envelope if body is None else body
386
+
387
+
388
+ def _resolved_schema(payload: Mapping[str, object], requested_schema: str) -> str:
389
+ for key in ("target_schema", "schema", "recommended_schema"):
390
+ value = payload.get(key)
391
+ if isinstance(value, str) and value.strip():
392
+ return value.strip()
393
+ return requested_schema
394
+
395
+
396
def _suggestions_from_payload(payload: Mapping[str, object]) -> tuple[MappingSuggestion, ...]:
    """Parse suggestions from the dict-based `results` format, else the legacy array format."""
    by_column = _results_dict_from_payload(payload)
    if by_column:
        return _suggestions_from_results_dict(by_column)

    # Legacy path: a list of per-column entry objects.
    collected: list[MappingSuggestion] = []
    for entry in _candidate_entries(payload):
        source = _source_column(entry)
        if not source:
            continue
        collected.append(
            MappingSuggestion(
                source_column=source,
                options=_options_from_entry(entry),
                raw=entry,
            )
        )
    return tuple(collected)
414
+
415
+
416
+ def _results_dict_from_payload(payload: Mapping[str, object]) -> dict[str, list[object]] | None:
417
+ """Extract the new dict-based results structure if present."""
418
+
419
+ results = payload.get("results")
420
+ if not isinstance(results, dict):
421
+ return None
422
+ return cast(dict[str, list[object]], results)
423
+
424
+
425
def _suggestions_from_results_dict(results: dict[str, list[object]]) -> tuple[MappingSuggestion, ...]:
    """Turn each column's option list into a `MappingSuggestion`; skip non-list values."""
    built: list[MappingSuggestion] = []
    for column, raw_options in results.items():
        if not isinstance(raw_options, list):
            continue
        entry: dict[str, object] = {"column": column, "options": raw_options}
        built.append(
            MappingSuggestion(
                source_column=column,
                options=_options_from_list(raw_options),
                raw=entry,
            )
        )
    return tuple(built)
438
+
439
+
440
def _options_from_list(options_list: list[object]) -> tuple[MappingRecommendationOption, ...]:
    """Build recommendation options from mapping-shaped list items; skip the rest."""
    built: list[MappingRecommendationOption] = []
    for candidate in options_list:
        if not isinstance(candidate, Mapping):
            continue
        option = cast(Mapping[str, object], candidate)
        built.append(
            MappingRecommendationOption(
                target=_option_target(option),
                confidence=_option_confidence(option),
                target_cde_id=_option_target_cde_id(option),
                raw=option,
            )
        )
    return tuple(built)
457
+
458
+
459
def _candidate_entries(payload: Mapping[str, object]) -> tuple[Mapping[str, object], ...]:
    """Probe the legacy list-valued keys and return the first non-empty entry set."""
    probes = (
        _entries_from_value(payload.get(key))
        for key in ("recommendations", "columns", "suggestions")
    )
    return next((entries for entries in probes if entries), ())
465
+
466
+
467
+ def _source_column(entry: Mapping[str, object]) -> str | None:
468
+ candidates = (
469
+ entry.get("column"),
470
+ entry.get("source_column"),
471
+ entry.get("name"),
472
+ entry.get("field"),
473
+ )
474
+ for candidate in candidates:
475
+ if isinstance(candidate, str):
476
+ name = candidate.strip()
477
+ if name:
478
+ return name
479
+ return None
480
+
481
+
482
def _options_from_entry(entry: Mapping[str, object]) -> tuple[MappingRecommendationOption, ...]:
    """Extract recommendation options from a legacy array-format entry.

    Looks for the first truthy list under `suggestions`/`options`/`targets`
    and delegates item conversion to `_options_from_list`, so both response
    formats share a single option-parsing path (previously this loop was a
    verbatim duplicate of `_options_from_list`).
    """
    raw_options = entry.get("suggestions") or entry.get("options") or entry.get("targets")
    if not isinstance(raw_options, list):
        return ()
    return _options_from_list(cast(list[object], raw_options))
501
+
502
+
503
+ def _option_target(option: Mapping[str, object]) -> str | None:
504
+ for key in ("target", "cde", "field", "name", "qualified_name"):
505
+ value = option.get(key)
506
+ if isinstance(value, str):
507
+ candidate = value.strip()
508
+ if candidate:
509
+ return candidate
510
+ return None
511
+
512
+
513
+ def _option_confidence(option: Mapping[str, object]) -> float | None:
514
+ for key in ("similarity", "confidence", "score", "probability"):
515
+ value = option.get(key)
516
+ if isinstance(value, (int, float)):
517
+ return float(value)
518
+ return None
519
+
520
+
521
+ def _option_target_cde_id(option: Mapping[str, object]) -> int | None:
522
+ value = option.get("target_cde_id")
523
+ if isinstance(value, int):
524
+ return value
525
+ return None
@@ -0,0 +1,37 @@
1
+ """Define client-specific exceptions.
2
+
3
+ 'why': keep error taxonomy explicit and lightweight
4
+ """
5
+ from __future__ import annotations
6
+
7
+
8
class NetriasClientError(Exception):
    """Base class for all client-specific exceptions.

    Catching this single type handles any error raised by the client.
    """
10
+
11
+
12
class ClientConfigurationError(NetriasClientError):
    """Raised when client configuration is incomplete or malformed."""
14
+
15
+
16
class FileValidationError(NetriasClientError):
    """Raised for unreadable files, unsupported extensions, or size violations."""
18
+
19
+
20
class MappingValidationError(NetriasClientError):
    """Raised when mapping discovery inputs fail client-side validation."""
22
+
23
+
24
class OutputLocationError(NetriasClientError):
    """Raised when the output path is unwritable or collides with an existing directory."""
26
+
27
+
28
class NetriasAPIUnavailable(NetriasClientError):
    """Raised for timeouts or network/transport failures reaching the API."""
30
+
31
+
32
class MappingDiscoveryError(NetriasClientError):
    """Raised when the mapping discovery API returns an error payload or malformed body."""
34
+
35
+
36
class DataModelStoreError(NetriasClientError):
    """Raised when the Data Model Store API returns an error."""