netrias_client 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,560 @@
1
+ """Core harmonization workflow functions.
2
+
3
+ 'why': unify sync/async paths via a single async implementation
4
+ """
5
+ from __future__ import annotations
6
+
7
+ import asyncio
8
+ import json
9
+ import logging
10
+ import time
11
+ from collections.abc import Mapping, Sequence
12
+ from pathlib import Path
13
+ from typing import Final, TypeAlias, cast
14
+
15
+ import httpx
16
+
17
+ from ._errors import NetriasAPIUnavailable
18
+ from ._http import build_harmonize_payload, fetch_job_status, submit_harmonize_job
19
+ from ._io import stream_download_to_file
20
+ from ._models import HarmonizationResult, Settings
21
+ from ._validators import validate_manifest_path, validate_output_path, validate_source_path
22
+
23
+ JSONPrimitive: TypeAlias = str | int | float | bool | None
24
+ JSONValue: TypeAlias = JSONPrimitive | Mapping[str, "JSONValue"] | Sequence["JSONValue"]
25
+ JOB_POLL_INTERVAL_SECONDS: Final[float] = 3.0
26
+ _MESSAGE_KEYS: Final[tuple[str, ...]] = (
27
+ "message",
28
+ "detail",
29
+ "error",
30
+ "description",
31
+ "statusMessage",
32
+ )
33
+
34
+
35
class HarmonizationJobError(RuntimeError):
    """Signal that a harmonization job failed before a result was produced.

    Raised by the submit, polling, and payload-validation helpers in this
    module; `_harmonize_async` catches it and converts it into a "failed"
    `HarmonizationResult` whose description is the exception text.
    """
37
+
38
+
39
async def _harmonize_async(
    settings: Settings,
    source_path: Path,
    manifest: Path | Mapping[str, object],
    output_path: Path | None = None,
    manifest_output_path: Path | None = None,
    logger: logging.Logger | None = None,
) -> HarmonizationResult:
    """Run the full harmonization workflow through the asynchronous job API.

    Steps: validate inputs, submit the job, poll until it finishes, then
    download the harmonized file to the validated destination.

    Returns:
        The result of the download step, or a ``"failed"`` result when any
        step before the download raises `HarmonizationJobError`.

    Raises:
        Whatever the validators raise for bad inputs, and any non-job error
        (for example `NetriasAPIUnavailable`) raised by the helper calls.
    """

    log = logger or logging.getLogger("netrias_client")
    csv_path = validate_source_path(source_path)
    manifest_input = _resolve_manifest(manifest, manifest_output_path)
    dest = validate_output_path(output_path, source_name=csv_path.stem, allow_versioning=True)

    # Stays "error" when an unexpected exception escapes the workflow.
    outcome = "error"
    t0 = time.perf_counter()
    log.info("harmonize start: file=%s", csv_path)

    try:
        try:
            payload = build_harmonize_payload(csv_path, manifest_input)
            job_payload = await _submit_job_response(
                base_url=settings.harmonization_url,
                api_key=settings.api_key,
                timeout=settings.timeout,
                payload=payload,
                csv_path=csv_path,
                logger=log,
            )
            job_id = _require_job_id(job_payload, csv_path, log)
            log.info("harmonize job queued: file=%s job_id=%s", csv_path, job_id)
            final_payload = await _resolve_final_payload(
                base_url=settings.harmonization_url,
                api_key=settings.api_key,
                job_id=job_id,
                timeout=settings.timeout,
                csv_path=csv_path,
                logger=log,
            )
            final_url = _require_final_url(final_payload, csv_path, log)
        except HarmonizationJobError as exc:
            outcome = "failed"
            return HarmonizationResult(file_path=dest, status="failed", description=str(exc))

        # Only reached when the job produced a download URL.
        result = await _download_final(final_url, dest, settings.timeout, csv_path, log)
        outcome = result.status
        return result
    finally:
        # Always emit the summary line, whichever way the workflow ended.
        log.info(
            "harmonize finished: file=%s status=%s duration=%.2fs",
            csv_path,
            outcome,
            time.perf_counter() - t0,
        )
94
+
95
+
96
def harmonize(
    settings: Settings,
    source_path: Path,
    manifest: Path | Mapping[str, object],
    output_path: Path | None = None,
    manifest_output_path: Path | None = None,
    logger: logging.Logger | None = None,
) -> HarmonizationResult:
    """Blocking entry point: drive the async workflow to completion with `asyncio.run`."""

    workflow = _harmonize_async(
        settings=settings,
        source_path=source_path,
        manifest=manifest,
        output_path=output_path,
        manifest_output_path=manifest_output_path,
        logger=logger,
    )
    return asyncio.run(workflow)
116
+
117
+
118
async def harmonize_async(
    settings: Settings,
    source_path: Path,
    manifest: Path | Mapping[str, object],
    output_path: Path | None = None,
    manifest_output_path: Path | None = None,
    logger: logging.Logger | None = None,
) -> HarmonizationResult:
    """Awaitable entry point; delegates to the same workflow that `harmonize` runs."""

    outcome = await _harmonize_async(
        settings=settings,
        source_path=source_path,
        manifest=manifest,
        output_path=output_path,
        manifest_output_path=manifest_output_path,
        logger=logger,
    )
    return outcome
136
+
137
+
138
def _resolve_manifest(
    manifest: Path | Mapping[str, object], manifest_output_path: Path | None
) -> Path | Mapping[str, object]:
    """Dispatch manifest handling according to whether it is a path or a mapping."""
    if not isinstance(manifest, Path):
        return _manifest_from_mapping(manifest, manifest_output_path)
    return _manifest_from_path(manifest, manifest_output_path)
144
+
145
+
146
def _manifest_from_path(
    manifest_path: Path, manifest_output_path: Path | None
) -> Path:
    """Validate a manifest file and optionally copy it to `manifest_output_path`.

    Returns the validated path when no distinct output path is requested;
    otherwise copies the manifest text (UTF-8) and returns the output path.
    """
    source = validate_manifest_path(manifest_path)
    copy_requested = manifest_output_path is not None and not (manifest_output_path == source)
    if not copy_requested:
        return source

    # Create the destination directory first, then copy the text across.
    manifest_output_path.parent.mkdir(parents=True, exist_ok=True)
    contents = source.read_text(encoding="utf-8")
    _ = manifest_output_path.write_text(contents, encoding="utf-8")
    return manifest_output_path
158
+
159
+
160
def _manifest_from_mapping(
    manifest: Mapping[str, object], manifest_output_path: Path | None
) -> Path | Mapping[str, object]:
    """Normalize an in-memory manifest and optionally persist it as JSON.

    Returns the normalized mapping when no output path is given; otherwise
    writes it as indented JSON (UTF-8) and returns the output path.
    """
    normalized = _normalize_manifest_mapping(manifest)
    if manifest_output_path is not None:
        manifest_output_path.parent.mkdir(parents=True, exist_ok=True)
        rendered = json.dumps(normalized, indent=2)
        _ = manifest_output_path.write_text(rendered, encoding="utf-8")
        return manifest_output_path
    return normalized
172
+
173
+
174
+ def _normalize_manifest_mapping(manifest: Mapping[str, object]) -> dict[str, object]:
175
+ try:
176
+ serialized = json.dumps(manifest)
177
+ except TypeError as exc: # pragma: no cover - guarded by tests
178
+ raise ValueError("manifest mapping must be JSON-serializable") from exc
179
+ return cast(dict[str, object], json.loads(serialized))
180
+
181
+
182
async def _submit_job_response(
    base_url: str,
    api_key: str,
    timeout: float,
    payload: bytes,
    csv_path: Path,
    logger: logging.Logger,
) -> Mapping[str, JSONValue]:
    """Submit the harmonization job and return its decoded JSON response.

    Raises:
        HarmonizationJobError: on an HTTP error status, or when the response
            body does not decode to a non-empty JSON object.
    """
    response = await _submit_job_http(
        base_url=base_url,
        api_key=api_key,
        timeout=timeout,
        payload=payload,
        csv_path=csv_path,
        logger=logger,
    )
    _ensure_submit_success(response, csv_path, logger)

    decoded = _json_mapping(response)
    if decoded:
        return decoded
    logger.error("harmonize submit response was not JSON: file=%s", csv_path)
    raise HarmonizationJobError("harmonization job response was not JSON")
204
+
205
+
206
async def _submit_job_http(
    base_url: str,
    api_key: str,
    timeout: float,
    payload: bytes,
    csv_path: Path,
    logger: logging.Logger,
) -> httpx.Response:
    """Send the submit request, translating httpx failures into client errors.

    Raises:
        HarmonizationJobError: when the request times out.
        NetriasAPIUnavailable: on any other `httpx.HTTPError`.
    """
    try:
        response = await submit_harmonize_job(
            base_url=base_url,
            api_key=api_key,
            payload_gz=payload,
            timeout=timeout,
        )
    except httpx.TimeoutException as exc:
        # Checked before the generic HTTPError clause so timeouts keep their own error type.
        logger.error("harmonize submit timeout: file=%s err=%s", csv_path, exc)
        raise HarmonizationJobError("harmonization submit request timed out") from exc
    except httpx.HTTPError as exc:
        logger.error("harmonize submit transport error: file=%s err=%s", csv_path, exc)
        raise NetriasAPIUnavailable(f"transport error: {exc}") from exc
    else:
        return response
227
+
228
+
229
def _ensure_submit_success(response: httpx.Response, csv_path: Path, logger: logging.Logger) -> None:
    """Raise `HarmonizationJobError` when the submit response has status >= 400."""
    status = response.status_code
    if status >= 400:
        message, payload_for_log = _error_description(
            status=status,
            body_text=response.text,
            default="harmonization submit failed",
        )
        logger.error(
            "harmonize submit failed: file=%s status=%s body=%s",
            csv_path,
            status,
            _formatted_body(payload_for_log),
        )
        raise HarmonizationJobError(message)
244
+
245
+
246
def _require_job_id(
    payload: Mapping[str, JSONValue],
    csv_path: Path,
    logger: logging.Logger,
) -> str:
    """Return the non-empty ``jobId`` string from `payload`.

    Raises:
        HarmonizationJobError: when ``jobId`` is absent, blank, or not a string.
    """
    job_id = _string_field(payload, "jobId")
    if not job_id:
        logger.error("harmonize submit response missing jobId: file=%s body=%s", csv_path, payload)
        raise HarmonizationJobError("harmonization job response missing jobId")
    return job_id
256
+
257
+
258
async def _resolve_final_payload(
    base_url: str,
    api_key: str,
    job_id: str,
    timeout: float,
    csv_path: Path,
    logger: logging.Logger,
) -> Mapping[str, JSONValue]:
    """Poll the job-status endpoint until the job finishes or `timeout` elapses.

    The delay between polls is ``timeout / 60`` clamped to the range
    ``[1.0, JOB_POLL_INTERVAL_SECONDS]`` seconds. The final sleep is shortened
    to the remaining time budget, so the timeout is reported as soon as the
    deadline passes rather than up to one poll interval later.

    Returns:
        The status payload once `_interpret_job_status` returns one.

    Raises:
        HarmonizationJobError: when the deadline passes without a final
            payload, or when a status check reports a failure.
    """
    started = time.monotonic()
    deadline = started + timeout
    poll_interval = max(1.0, min(JOB_POLL_INTERVAL_SECONDS, timeout / 60 if timeout else JOB_POLL_INTERVAL_SECONDS))

    while time.monotonic() < deadline:
        elapsed = time.monotonic() - started
        response = await _job_status_http(
            base_url=base_url,
            api_key=api_key,
            job_id=job_id,
            timeout=timeout,
            csv_path=csv_path,
            logger=logger,
        )

        payload = _interpret_job_status(response, csv_path, logger)
        if payload is not None:
            logger.info(
                "harmonize job polling: file=%s job_id=%s status=%s elapsed=%.2fs",
                csv_path,
                job_id,
                payload.get("status"),
                elapsed,
            )
            return payload

        logger.info(
            "harmonize job polling: file=%s job_id=%s status=pending elapsed=%.2fs",
            csv_path,
            job_id,
            elapsed,
        )
        # Never sleep beyond the deadline: cap the wait at the remaining budget.
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        await asyncio.sleep(min(poll_interval, remaining))

    total_elapsed = time.monotonic() - started
    logger.error("harmonize job polling timed out: file=%s elapsed=%.2fs", csv_path, total_elapsed)
    raise HarmonizationJobError("harmonization job polling timed out")
303
+
304
+
305
async def _job_status_http(
    base_url: str,
    api_key: str,
    job_id: str,
    timeout: float,
    csv_path: Path,
    logger: logging.Logger,
) -> httpx.Response:
    """Fetch the job status, translating httpx failures into client errors.

    Raises:
        HarmonizationJobError: when the status request times out.
        NetriasAPIUnavailable: on any other `httpx.HTTPError`.
    """
    try:
        response = await fetch_job_status(
            base_url=base_url,
            api_key=api_key,
            job_id=job_id,
            timeout=timeout,
        )
    except httpx.TimeoutException as exc:
        # Checked before the generic HTTPError clause so timeouts keep their own error type.
        logger.error("harmonize job status timeout: file=%s err=%s", csv_path, exc)
        raise HarmonizationJobError("harmonization job status timed out") from exc
    except httpx.HTTPError as exc:
        logger.error("harmonize job status transport error: file=%s err=%s", csv_path, exc)
        raise NetriasAPIUnavailable(f"transport error: {exc}") from exc
    else:
        return response
326
+
327
+
328
def _interpret_job_status(response: httpx.Response, csv_path: Path, logger: logging.Logger) -> Mapping[str, JSONValue] | None:
    """Classify one status response.

    Returns the payload when the job succeeded and ``None`` while it is still
    pending; a 404 response is also treated as pending.

    Raises:
        HarmonizationJobError: when the job reports ``FAILED`` or the
            response itself is invalid.
    """
    if response.status_code == 404:
        return None

    payload = _validated_status_payload(response, csv_path, logger)
    state = _job_state(payload)
    if state == "SUCCEEDED":
        return payload
    if state != "FAILED":
        return None

    message = _job_failure_message(payload)
    logger.error("harmonize job failed: file=%s message=%s", csv_path, message)
    raise HarmonizationJobError(message)
341
+
342
+
343
def _validated_status_payload(response: httpx.Response, csv_path: Path, logger: logging.Logger) -> Mapping[str, JSONValue]:
    """Return the decoded status payload.

    Raises:
        HarmonizationJobError: on an HTTP status >= 400, or when the body
            does not decode to a non-empty JSON object.
    """
    status = response.status_code
    if status < 400:
        decoded = _json_mapping(response)
        if decoded:
            return decoded
        logger.error("harmonize job status response was not JSON: file=%s", csv_path)
        raise HarmonizationJobError("harmonization job status response was not JSON")

    message, payload_for_log = _error_description(
        status=status,
        body_text=response.text,
        default="harmonization job status failed",
    )
    logger.error(
        "harmonize job status failed: file=%s status=%s body=%s",
        csv_path,
        status,
        _formatted_body(payload_for_log),
    )
    raise HarmonizationJobError(message)
363
+
364
+
365
def _job_state(payload: Mapping[str, JSONValue]) -> str:
    """Map the payload's ``status`` field to ``SUCCEEDED``, ``FAILED``, or ``PENDING``.

    Matching is case-insensitive; any other or missing value means ``PENDING``.
    """
    label = (_string_field(payload, "status") or "").upper()
    return label if label in ("SUCCEEDED", "FAILED") else "PENDING"
372
+
373
+
374
def _require_final_url(
    payload: Mapping[str, JSONValue],
    csv_path: Path,
    logger: logging.Logger,
) -> str:
    """Return the non-empty ``finalUrl`` string from `payload`.

    Raises:
        HarmonizationJobError: when ``finalUrl`` is absent, blank, or not a string.
    """
    final_url = _string_field(payload, "finalUrl")
    if not final_url:
        logger.error("harmonize job missing finalUrl: file=%s payload=%s", csv_path, payload)
        raise HarmonizationJobError("harmonization job completed without a download URL")
    return final_url
384
+
385
+
386
async def _download_final(
    final_url: str,
    dest: Path,
    timeout: float,
    csv_path: Path,
    logger: logging.Logger,
) -> HarmonizationResult:
    """Stream the harmonized file from `final_url` into `dest`.

    Returns:
        A ``"succeeded"`` result on a 2xx response, ``"failed"`` on any other
        status, and ``"timeout"`` when the request times out.

    Raises:
        NetriasAPIUnavailable: on any other `httpx.HTTPError`.
    """
    try:
        async with httpx.AsyncClient(timeout=httpx.Timeout(timeout)) as client, client.stream(
            "GET", final_url
        ) as response:
            status = response.status_code
            if not 200 <= status < 300:
                # Read the error body once; it feeds both the description and the log line.
                body_bytes = await response.aread()
                description = _download_error_message(status, body_bytes)
                logger.error(
                    "harmonize download failed: file=%s status=%s body=%s",
                    csv_path,
                    status,
                    _formatted_body(_payload_for_logging(body_bytes)),
                )
                return HarmonizationResult(file_path=dest, status="failed", description=description)

            _ = await stream_download_to_file(response, dest)
            logger.info("harmonize complete: file=%s -> %s", csv_path, dest)
            return HarmonizationResult(file_path=dest, status="succeeded", description="harmonization succeeded")
    except httpx.TimeoutException as exc:
        logger.error("harmonize download timeout: file=%s err=%s", csv_path, exc)
        return HarmonizationResult(file_path=dest, status="timeout", description="download timed out")
    except httpx.HTTPError as exc:
        logger.error("harmonize download transport error: file=%s err=%s", csv_path, exc)
        raise NetriasAPIUnavailable(f"transport error: {exc}") from exc
416
+
417
+
418
def _error_description(status: int, body_text: str, default: str) -> tuple[str, JSONValue | str]:
    """Build a user-facing error description plus the payload to log.

    The description is the first available of: a message found in the JSON
    body, a status-specific hint, then `default`. The logged payload is the
    parsed JSON when the body parses, otherwise the raw text.
    """
    parsed = _try_parse_json(body_text)
    mapping = parsed if isinstance(parsed, Mapping) else None
    description = _message_from_mapping(mapping) or _failure_hint(status) or default
    payload_for_log: JSONValue | str = body_text if parsed is None else parsed
    return description, payload_for_log
428
+
429
+
430
def _json_mapping(response: httpx.Response) -> Mapping[str, JSONValue]:
    """Decode the response body as a JSON object; return ``{}`` for anything else."""
    try:
        data = cast(object, response.json())
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return {}
    return cast(Mapping[str, JSONValue], data) if isinstance(data, Mapping) else {}
438
+
439
+
440
+ def _string_field(payload: Mapping[str, JSONValue], key: str) -> str | None:
441
+ value = payload.get(key)
442
+ if isinstance(value, str):
443
+ stripped = value.strip()
444
+ if stripped:
445
+ return stripped
446
+ return None
447
+
448
+
449
def _job_failure_message(payload: Mapping[str, JSONValue]) -> str:
    """Pick the best available failure message from a job-status payload.

    Tries the generic message keys first, then ``statusReason``,
    ``failureReason`` and ``errorMessage``, and finally a fixed default.
    """
    message = _message_from_mapping(payload)
    if message:
        return message
    # Lazily evaluated so lookup stops at the first non-empty field.
    candidates = (
        _string_field(payload, key)
        for key in ("statusReason", "failureReason", "errorMessage")
    )
    return next((text for text in candidates if text), "harmonization job failed")
458
+
459
+
460
def _download_error_message(status: int, body: bytes) -> str:
    """Describe a failed download.

    Uses the first available of: a message found in the JSON body, a
    status-specific hint, then a generic message that includes the status.
    """
    payload = _payload_for_logging(body)
    mapping = payload if isinstance(payload, Mapping) else None
    return (
        _message_from_mapping(mapping)
        or _failure_hint(status)
        or f"harmonization download failed (HTTP {status})"
    )
469
+
470
+
471
def _message_from_mapping(payload: Mapping[str, JSONValue] | None) -> str | None:
    """Find a message in `payload`, checking top-level keys before the ``body`` field."""
    return _direct_message(payload) or _message_from_body_field(payload)
476
+
477
+
478
def _direct_message(payload: Mapping[str, JSONValue] | None) -> str | None:
    """Return the first non-blank string stored under one of `_MESSAGE_KEYS`, if any."""
    if payload is None:
        return None
    # Generator keeps the lookup lazy: later keys are not read once a match is found.
    candidates = (_coerce_message(payload.get(key)) for key in _MESSAGE_KEYS)
    return next((text for text in candidates if text), None)
486
+
487
+
488
def _message_from_body_field(payload: Mapping[str, JSONValue] | None) -> str | None:
    """Look for a message inside the payload's nested ``body`` mapping, if it has one."""
    nested = _body_mapping(payload)
    return None if nested is None else _message_from_mapping(nested)
493
+
494
+
495
+ def _coerce_message(value: JSONValue | None) -> str | None:
496
+ if isinstance(value, str):
497
+ stripped = value.strip()
498
+ if stripped:
499
+ return stripped
500
+ return None
501
+
502
+
503
def _body_mapping(payload: Mapping[str, JSONValue] | None) -> Mapping[str, JSONValue] | None:
    """Return the payload's ``body`` as a mapping, or ``None``.

    ``body`` may already be a mapping or may be a JSON-encoded string; a
    string that does not decode to a mapping yields ``None``.
    """
    if payload is None:
        return None
    body = payload.get("body")
    if isinstance(body, Mapping):
        return cast(Mapping[str, JSONValue], body)
    if isinstance(body, str):
        decoded = _try_parse_json(body)
        if isinstance(decoded, Mapping):
            return decoded
    return None
513
+
514
+
515
def _payload_for_logging(body: bytes) -> JSONValue | str:
    """Decode `body` as UTF-8 (replacing bad bytes) and parse it as JSON when possible.

    Returns the parsed value, or the decoded text when parsing yields ``None``.
    """
    decoded = body.decode("utf-8", errors="replace")
    parsed = _try_parse_json(decoded)
    if parsed is None:
        return decoded
    return parsed
519
+
520
+
521
+ def _failure_hint(status: int) -> str | None:
522
+ if status in {401, 403}:
523
+ return "harmonization request was rejected (check API credentials and permissions)"
524
+ if status == 404:
525
+ return "harmonization endpoint not found (confirm base URL/path)"
526
+ if 500 <= status < 600:
527
+ return "harmonization service encountered an internal error"
528
+ return None
529
+
530
+
531
def _formatted_body(payload: JSONValue | str) -> str:
    """Render a payload for logging, truncated to a bounded length.

    Dicts and lists are pretty-printed as JSON; strings are parsed as JSON
    first when possible; everything else goes through ``str()``.
    """
    if isinstance(payload, (dict, list)):
        return _render_json(payload)
    if isinstance(payload, str):
        return _formatted_string_body(payload)
    return _truncate(str(payload))
537
+
538
+
539
def _formatted_string_body(raw: str) -> str:
    """Pretty-print `raw` when it holds a JSON object or array; otherwise truncate it as-is."""
    parsed = _try_parse_json(raw)
    return _render_json(parsed) if isinstance(parsed, (dict, list)) else _truncate(raw)
544
+
545
+
546
def _try_parse_json(raw: str) -> JSONValue | None:
    """Parse `raw` as JSON, returning ``None`` on any failure.

    Note that a JSON ``null`` document also yields ``None``.
    """
    try:
        decoded = cast(JSONValue, json.loads(raw))
    except Exception:  # deliberate best-effort: parsing only feeds messages and logs
        return None
    else:
        return decoded
551
+
552
+
553
def _render_json(data: Mapping[str, JSONValue] | Sequence[JSONValue]) -> str:
    """Serialize `data` as indented, key-sorted JSON and truncate the result."""
    rendered = json.dumps(data, indent=2, sort_keys=True)
    return _truncate(rendered)
555
+
556
+
557
+ def _truncate(text: str, limit: int = 2000) -> str:
558
+ if len(text) <= limit:
559
+ return text
560
+ return f"{text[: limit - 1]}…"