netrias_client 0.0.1__py3-none-any.whl

This diff shows the content of publicly available package versions as released to their public registries, and is provided for informational purposes only.

This version of netrias_client has been flagged as a potentially problematic release.

@@ -0,0 +1,126 @@
+ """HTTP helpers for harmonization and discovery."""
+ from __future__ import annotations
+
+ import csv
+ import gzip
+ import json
+ from collections.abc import Mapping, Sequence
+ from pathlib import Path
+ from typing import Final
+
+ import httpx
+
+ from ._adapter import normalize_manifest_mapping
+
+ SCHEMA_VERSION: Final[str] = "1.0"
+ DEFAULT_MODEL_VERSION: Final[str] = "v1"
+ MAX_COMPRESSED_BYTES: Final[int] = 10 * 1024 * 1024
+
+ def build_harmonize_payload(
+     csv_path: Path,
+     manifest: Path | Mapping[str, object] | None,
+     model_version: str = DEFAULT_MODEL_VERSION,
+ ) -> bytes:
+     """Return gzip-compressed harmonization payload for the given CSV and manifest."""
+
+     rows = _read_tabular(csv_path)
+     header = rows[0] if rows else []
+     data_rows = rows[1:] if len(rows) > 1 else []
+
+     envelope: dict[str, object] = {
+         "schemaVersion": SCHEMA_VERSION,
+         "modelVersion": model_version,
+         "document": {
+             "name": csv_path.name,
+             "sheetName": None,
+             "header": header,
+             "rows": data_rows,
+         },
+     }
+
+     mapping = normalize_manifest_mapping(manifest)
+     if mapping:
+         envelope["mapping"] = mapping
+
+     raw = json.dumps(envelope, ensure_ascii=False, separators=(",", ":")).encode("utf-8")
+     compressed = gzip.compress(raw)
+     if len(compressed) > MAX_COMPRESSED_BYTES:
+         raise ValueError("compressed harmonization payload exceeds 10 MiB")
+     return compressed
+
+ async def submit_harmonize_job(
+     base_url: str,
+     api_key: str,
+     payload_gz: bytes,
+     timeout: float,
+     idempotency_key: str | None = None,
+ ) -> httpx.Response:
+     """Submit a harmonization job request and return the raw response."""
+
+     url = _build_job_submit_url(base_url)
+     headers = {
+         "Authorization": f"Bearer {api_key}",
+         "Content-Type": "application/json",
+         "Content-Encoding": "gzip",
+     }
+     if idempotency_key:
+         headers["Idempotency-Key"] = idempotency_key
+
+     async with httpx.AsyncClient(timeout=httpx.Timeout(timeout)) as client:
+         return await client.post(url, content=payload_gz, headers=headers)
+
+ async def fetch_job_status(
+     base_url: str,
+     api_key: str,
+     job_id: str,
+     timeout: float,
+ ) -> httpx.Response:
+     """Return the status response for a previously submitted harmonization job."""
+
+     url = _build_job_status_url(base_url, job_id)
+     headers = {"Authorization": f"Bearer {api_key}"}
+     async with httpx.AsyncClient(timeout=httpx.Timeout(timeout)) as client:
+         return await client.get(url, headers=headers)
+
+ async def request_mapping_discovery(
+     base_url: str,
+     api_key: str,
+     timeout: float,
+     schema: str,
+     columns: Mapping[str, Sequence[str]],
+ ) -> httpx.Response:
+     """Submit column samples for mapping recommendations."""
+
+     url = _build_discovery_url(base_url)
+     headers = {
+         "Content-Type": "application/json",
+         "x-api-key": api_key,
+     }
+     body = {"target_schema": schema, "data": columns}
+     payload = {"body": json.dumps(body)}
+     async with httpx.AsyncClient(timeout=httpx.Timeout(timeout)) as client:
+         return await client.post(url, headers=headers, json=payload)
+
+ def _build_job_submit_url(base_url: str) -> str:
+     base = base_url.rstrip("/")
+     return f"{base}/v1/jobs/harmonize"
+
+ def _build_job_status_url(base_url: str, job_id: str) -> str:
+     base = base_url.rstrip("/")
+     return f"{base}/v1/jobs/{job_id}"
+
+ def _build_discovery_url(base_url: str) -> str:
+     base = base_url.rstrip("/")
+     return f"{base}/cde-recommendation"
+
+ def _read_tabular(path: Path) -> list[list[str]]:
+     if not path.exists():
+         raise FileNotFoundError(path)
+     ext = path.suffix.lower()
+     if ext not in {".csv", ".tsv"}:
+         raise ValueError("harmonization only supports CSV or TSV inputs")
+     delimiter = "," if ext == ".csv" else "\t"
+     with path.open("r", encoding="utf-8", newline="") as handle:
+         reader = csv.reader(handle, delimiter=delimiter)
+         return [list(row) for row in reader]
+
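The module above builds a gzip-compressed JSON envelope (capped at 10 MiB after compression) and submits it with bearer-token auth. A minimal usage sketch follows; the import path is an assumption, since the diff does not show this hunk's file name, and the endpoint and key are placeholders.

import asyncio
from pathlib import Path

# Import path assumed; the diff does not show this hunk's file name.
from netrias_client._http import build_harmonize_payload, submit_harmonize_job

async def main() -> None:
    # "samples.csv" is an invented input; any local CSV works.
    payload = build_harmonize_payload(Path("samples.csv"), manifest=None)
    response = await submit_harmonize_job(
        base_url="https://api.example.invalid",  # placeholder endpoint
        api_key="YOUR_API_KEY",  # placeholder credential
        payload_gz=payload,
        timeout=30.0,
    )
    print(response.status_code)

asyncio.run(main())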
netrias_client/_io.py ADDED
@@ -0,0 +1,28 @@
+ """I/O helpers for streaming responses.
+
+ 'why': keep file operations small and testable; avoid partial outputs
+ """
+ from __future__ import annotations
+
+ import tempfile
+ from pathlib import Path
+
+ import httpx
+
+
+ async def stream_download_to_file(response: httpx.Response, dest_path: Path) -> Path:
+     """Stream an HTTP response body to `dest_path` atomically.
+
+     Writes to a temporary file in the destination directory and then renames.
+     """
+
+     dest_path = Path(dest_path)
+     tmp_dir = dest_path.parent
+     tmp_dir.mkdir(parents=True, exist_ok=True)
+     with tempfile.NamedTemporaryFile(dir=tmp_dir, delete=False, suffix=".partial") as tmp:
+         async for chunk in response.aiter_bytes():
+             _ = tmp.write(chunk)
+         tmp_path = Path(tmp.name)
+     _ = tmp_path.replace(dest_path)
+     return dest_path
+
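stream_download_to_file consumes response.aiter_bytes(), so the response has to be opened in streaming mode rather than pre-read. A sketch of driving it with httpx's client.stream (the URL is a placeholder):

import asyncio
from pathlib import Path

import httpx

from netrias_client._io import stream_download_to_file

async def download(url: str, dest: Path) -> Path:
    async with httpx.AsyncClient() as client:
        # client.stream leaves the body unread so it can be iterated in chunks.
        async with client.stream("GET", url) as response:
            response.raise_for_status()
            return await stream_download_to_file(response, dest)

asyncio.run(download("https://api.example.invalid/result.csv", Path("result.csv")))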
@@ -0,0 +1,46 @@
+ """Logger helpers for the Netrias client."""
+ from __future__ import annotations
+
+ import logging
+ from pathlib import Path
+ from typing import Final
+
+ from ._models import LogLevel
+
+
+ _FORMAT: Final[str] = "%(asctime)s %(levelname)s netrias_client: %(message)s"
+
+
+ def configure_logger(
+     name: str,
+     level: LogLevel,
+     log_directory: Path | None,
+ ) -> logging.Logger:
+     """Configure and return a logger dedicated to a Netrias client instance."""
+
+     logger = logging.getLogger(name)
+     logger.handlers.clear()
+     logger.propagate = False
+
+     formatter = logging.Formatter(fmt=_FORMAT)
+
+     stream_handler = logging.StreamHandler()
+     stream_handler.setFormatter(formatter)
+     logger.addHandler(stream_handler)
+
+     if log_directory is not None:
+         log_directory.mkdir(parents=True, exist_ok=True)
+         file_path = log_directory / f"{name.replace('.', '_')}.log"
+         file_handler = logging.FileHandler(file_path, encoding="utf-8")
+         file_handler.setFormatter(formatter)
+         logger.addHandler(file_handler)
+
+     mapping = {
+         LogLevel.CRITICAL: logging.CRITICAL,
+         LogLevel.ERROR: logging.ERROR,
+         LogLevel.WARNING: logging.WARNING,
+         LogLevel.INFO: logging.INFO,
+         LogLevel.DEBUG: logging.DEBUG,
+     }
+     logger.setLevel(mapping[level])
+     return logger
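Because configure_logger clears existing handlers and disables propagation, reconfiguring the same logger name is safe. A sketch, with the module's import path assumed (this hunk's file name is not shown in the diff):

from pathlib import Path

# Import path assumed; the diff does not show this hunk's file name.
from netrias_client._logging import configure_logger
from netrias_client._models import LogLevel

logger = configure_logger("netrias_client.demo", LogLevel.DEBUG, Path("logs"))
# Dots in the name become underscores: the file handler writes logs/netrias_client_demo.log.
logger.debug("logger configured")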
@@ -0,0 +1,72 @@
+ """Define dataclasses and types for the client.
+
+ 'why': capture configuration and results in typed, testable shapes
+ """
+ from __future__ import annotations
+
+ from collections.abc import Mapping
+ from dataclasses import dataclass
+ from enum import Enum
+ from pathlib import Path
+ from typing import Literal
+
+
+ class LogLevel(str, Enum):
+     """Enumerate supported logging levels for the client."""
+
+     CRITICAL = "CRITICAL"
+     ERROR = "ERROR"
+     WARNING = "WARNING"
+     INFO = "INFO"
+     DEBUG = "DEBUG"
+
+
+ @dataclass(frozen=True)
+ class Settings:
+     """Capture runtime settings for API calls."""
+
+     api_key: str
+     discovery_url: str
+     harmonization_url: str
+     timeout: float
+     log_level: LogLevel
+     confidence_threshold: float
+     discovery_use_gateway_bypass: bool
+     log_directory: Path | None
+
+
+ @dataclass(frozen=True)
+ class HarmonizationResult:
+     """Communicate harmonization outcome in a consistent shape."""
+
+     file_path: Path
+     status: Literal["succeeded", "failed", "timeout"]
+     description: str
+     mapping_id: str | None = None
+
+
+ @dataclass(frozen=True)
+ class MappingRecommendationOption:
+     """Capture a single recommended target for a source column."""
+
+     target: str | None
+     confidence: float | None
+     raw: Mapping[str, object] | None = None
+
+
+ @dataclass(frozen=True)
+ class MappingSuggestion:
+     """Group recommendation options for a single source column."""
+
+     source_column: str
+     options: tuple[MappingRecommendationOption, ...]
+     raw: Mapping[str, object] | None = None
+
+
+ @dataclass(frozen=True)
+ class MappingDiscoveryResult:
+     """Communicate column mapping recommendations for a dataset."""
+
+     schema: str
+     suggestions: tuple[MappingSuggestion, ...]
+     raw: Mapping[str, object]
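The result types nest: options inside a suggestion, suggestions inside a discovery result, all frozen. A construction sketch with invented values; the netrias_client._models path matches the "from ._models import LogLevel" import seen earlier in this diff:

from netrias_client._models import (
    MappingDiscoveryResult,
    MappingRecommendationOption,
    MappingSuggestion,
)

# All values below are invented for illustration.
option = MappingRecommendationOption(target="subject_id", confidence=0.92)
suggestion = MappingSuggestion(source_column="patient_id", options=(option,))
result = MappingDiscoveryResult(schema="example_schema", suggestions=(suggestion,), raw={})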
@@ -0,0 +1,173 @@
+ """Validate inputs for harmonization.
+
+ 'why': fail fast with clear, actionable messages prior to network calls
+ """
+ from __future__ import annotations
+
+ import os
+ from collections.abc import Mapping, Sequence
+ from pathlib import Path
+
+ from ._errors import FileValidationError, MappingValidationError, OutputLocationError
+
+
+ # OBVIOUS HARD-CODED SIZE LIMIT: 250 MB maximum CSV size prior to upload
+ HARD_MAX_CSV_BYTES = 250 * 1024 * 1024
+
+
+ def validate_source_path(path: Path) -> Path:
+     """Ensure the CSV exists, is a file, has a .csv extension, and respects size limits."""
+
+     _require_exists(path, "source CSV not found")
+     _require_is_file(path, "source path is not a file")
+     _require_suffix(path, ".csv", "unsupported file extension for source CSV")
+     _require_not_too_large(path)
+     return path
+
+
+ def validate_manifest_path(path: Path) -> Path:
+     """Ensure the manifest JSON exists, is a file, and has a .json extension."""
+
+     _require_exists(path, "manifest JSON not found")
+     _require_is_file(path, "manifest path is not a file")
+     _require_suffix(path, ".json", "manifest must be a .json file")
+     return path
+
+
+ def validate_output_path(path: Path | None, source_name: str, allow_versioning: bool = False) -> Path:
+     """Return a valid output file path, creating parent directories when needed.
+
+     Defaults to `<CWD>/<source_name>.harmonized.csv` when `path` is None, or `<path>/<source_name>.harmonized.csv` when `path` is an existing directory.
+     """
+
+     candidate = _resolve_output_candidate(path, source_name)
+     _ensure_parent(candidate)
+     _require_parent_writable(candidate)
+     if allow_versioning:
+         candidate = _next_available_path(candidate)
+     else:
+         _require_not_exists(candidate)
+     return candidate
+
+
+ def validate_target_schema(schema: str) -> str:
+     """Ensure the target schema identifier is a non-empty string."""
+
+     candidate = (schema or "").strip()
+     if not candidate:
+         raise MappingValidationError("target_schema must be a non-empty string")
+     return candidate
+
+
+ def validate_column_samples(columns: Mapping[str, Sequence[object]]) -> dict[str, list[str]]:
+     """Normalize column sample data for mapping discovery."""
+
+     if not columns:
+         raise MappingValidationError("column data must include at least one column")
+     normalized: dict[str, list[str]] = {}
+     for raw_name, values in columns.items():
+         name = _normalized_column_name(raw_name)
+         samples = _normalized_samples(name, values)
+         normalized[name] = samples
+     return normalized
+
+
+ def _require_exists(path: Path, message: str) -> None:
+     if not path.exists():
+         raise FileValidationError(f"{message}: {path}")
+
+
+ def _require_is_file(path: Path, message: str) -> None:
+     if not path.is_file():
+         raise FileValidationError(f"{message}: {path}")
+
+
+ def _require_suffix(path: Path, suffix: str, message: str) -> None:
+     if path.suffix.lower() != suffix:
+         raise FileValidationError(f"{message}: {path.suffix}")
+
+
+ def _require_not_too_large(path: Path) -> None:
+     try:
+         size = os.path.getsize(path)
+     except OSError as exc:
+         raise FileValidationError(f"unable to stat source CSV: {exc}") from exc
+     if size > HARD_MAX_CSV_BYTES:
+         raise FileValidationError(
+             f"source CSV exceeds hard-coded limit of {HARD_MAX_CSV_BYTES // (1024 * 1024)} MB (got {size} bytes)"
+         )
+
+
+ def _resolve_output_candidate(path: Path | None, source_name: str) -> Path:
+     if path is None:
+         return Path.cwd() / f"{source_name}.harmonized.csv"
+     if path.exists() and path.is_dir():
+         return path / f"{source_name}.harmonized.csv"
+     return path
+
+
+ def _ensure_parent(candidate: Path) -> None:
+     parent = candidate.parent
+     if not parent.exists():
+         try:
+             parent.mkdir(parents=True, exist_ok=True)
+         except OSError as exc:
+             raise OutputLocationError(f"unable to create output directory {parent}: {exc}") from exc
+
+
+ def _require_parent_writable(candidate: Path) -> None:
+     parent = candidate.parent
+     if parent.exists() and not os.access(parent, os.W_OK):
+         raise OutputLocationError(f"output directory not writable: {parent}")
+
+
+ def _require_not_exists(candidate: Path) -> None:
+     if candidate.exists():
+         raise OutputLocationError(f"refusing to overwrite existing file: {candidate}")
+
+
+ def _next_available_path(candidate: Path) -> Path:
+     if not candidate.exists():
+         return candidate
+     stem = candidate.stem
+     suffix = candidate.suffix
+     parent = candidate.parent
+     index = 1
+     while index < 1000:
+         versioned = parent / f"{stem}.v{index}{suffix}"
+         if not versioned.exists():
+             return versioned
+         index += 1
+     raise OutputLocationError(
+         f"unable to determine unique output path after {index - 1} attempts for {candidate}"
+     )
+
+
+ def _normalized_column_name(raw_name: object) -> str:
+     if not isinstance(raw_name, str):
+         raise MappingValidationError("column names must be strings")
+     name = raw_name.strip()
+     if not name:
+         raise MappingValidationError("column names must be non-empty strings")
+     return name
+
+
+ def _normalized_samples(column_name: str, values: Sequence[object] | None) -> list[str]:
+     sequence = _require_sequence(column_name, values)
+     samples = [sample for sample in (_coerced_sample(value) for value in sequence) if sample]
+     if not samples:
+         raise MappingValidationError(f"column '{column_name}' must include at least one non-empty sample value")
+     return samples
+
+
+ def _require_sequence(column_name: str, values: Sequence[object] | None) -> Sequence[object]:
+     if values is None or isinstance(values, (str, bytes)):
+         raise MappingValidationError(f"column '{column_name}' values must be a sequence of samples")
+     return values
+
+
+ def _coerced_sample(value: object) -> str | None:
+     if value is None:
+         return None
+     text = str(value).strip()
+     return text or None
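Each validator either returns a normalized value or raises one of the typed errors before any network traffic happens. A happy-path sketch (import path assumed, sample data invented):

# Import path assumed; the diff does not show this hunk's file name.
from netrias_client._validation import validate_column_samples, validate_target_schema

schema = validate_target_schema("  example_schema  ")  # -> "example_schema"
columns = validate_column_samples({"patient_id": ["P001", None, " P002 "]})
# -> {"patient_id": ["P001", "P002"]}: None and blank samples are dropped,
# and the remaining values are coerced to stripped strings.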