netrias_client-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,217 @@
+ """Temporary gateway bypass helpers for direct Lambda invocation.
+
+ 'why': mitigate API Gateway timeouts by calling the CDE recommendation alias directly
+
+ # TODO: remove this module once API Gateway latency is resolved and direct Lambda
+ # calls are no longer necessary.
+ """
+ from __future__ import annotations
+
+ import json
+ import logging
+ from collections.abc import Mapping, Sequence
+ from typing import Callable, IO, Protocol, cast
+
+
+ class GatewayBypassError(RuntimeError):
+     """Raised when the direct Lambda invocation fails."""
+
+
+ class _LambdaClient(Protocol):
+     def invoke(
+         self,
+         FunctionName: str,
+         Qualifier: str,
+         Payload: bytes,
+     ) -> Mapping[str, object]:
+         ...
+
+
+ class _ClientFactory(Protocol):
+     def __call__(self, service_name: str, **kwargs: object) -> object:
+         ...
+
+
+ class _SessionProtocol(Protocol):
+     def client(self, service_name: str, **kwargs: object) -> object:
+         ...
+
+
+ def invoke_cde_recommendation_alias(
+     target_schema: str,
+     target_version: str,
+     columns: Mapping[str, Sequence[object]],
+     function_name: str = "cde-recommendation",
+     alias: str = "prod",
+     region_name: str = "us-east-2",
+     timeout_seconds: float | None = None,
+     profile_name: str | None = None,
+     logger: logging.Logger | None = None,
+     top_k: int | None = None,
+ ) -> Mapping[str, object]:
+     """Call the CDE recommendation Lambda alias directly and return its parsed payload.
+
+     NOTE: This bypass is temporary. Prefer the public API once API Gateway limits are addressed.
+     """
+
+     client = _build_lambda_client(
+         region_name=region_name,
+         profile_name=profile_name,
+         timeout_seconds=timeout_seconds,
+     )
+     normalized_columns = _normalized_columns(columns)
+     body_dict: dict[str, object] = {
+         "target_schema": target_schema,
+         "target_version": target_version,
+         "data": normalized_columns,
+     }
+     if top_k is not None:
+         body_dict["top_k"] = top_k
+     body = json.dumps(body_dict)
+     event = {"body": body, "isBase64Encoded": False}
+
+     active_logger = logger or logging.getLogger("netrias_client")
+
+     active_logger.info(
+         "gateway bypass invoke start: function=%s alias=%s schema=%s columns=%s",
+         function_name,
+         alias,
+         target_schema,
+         len(columns),
+     )
+
+     try:
+         response = client.invoke(
+             FunctionName=function_name,
+             Qualifier=alias,
+             Payload=json.dumps(event).encode("utf-8"),
+         )
+     except Exception as exc:  # pragma: no cover - boto3 specific
+         active_logger.error(
+             "gateway bypass invoke failed: function=%s alias=%s err=%s",
+             function_name,
+             alias,
+             exc,
+         )
+         raise GatewayBypassError(f"lambda invoke failed: {exc}") from exc
+
+     status_code = response.get("StatusCode")
+     payload_stream = cast(IO[bytes] | None, response.get("Payload"))
+     raw_payload = _read_lambda_payload(payload_stream)
+     payload = _json_payload(raw_payload)
+
+     active_logger.info(
+         "gateway bypass invoke complete: function=%s alias=%s status=%s",
+         function_name,
+         alias,
+         status_code,
+     )
+
+     return _extract_body_mapping(payload)
+
+
+ def _build_lambda_client(
+     region_name: str,
+     profile_name: str | None,
+     timeout_seconds: float | None,
+ ) -> _LambdaClient:
+     boto3, Config = _load_boto_dependencies()
+     config = (
+         Config(
+             read_timeout=timeout_seconds,
+             connect_timeout=min(timeout_seconds, 10.0),
+         )
+         if timeout_seconds is not None
+         else None
+     )
+
+     if profile_name:
+         session_factory = cast(
+             Callable[..., object],
+             getattr(boto3, "Session"),
+         )
+         session = cast(
+             _SessionProtocol,
+             session_factory(profile_name=profile_name, region_name=region_name),
+         )
+         factory = cast(_ClientFactory, session.client)
+     else:
+         factory = cast(_ClientFactory, getattr(boto3, "client"))
+
+     return _lambda_client_from_factory(factory, region_name=region_name, config=config)
+
+
+ def _load_boto_dependencies():
+     try:
+         import boto3  # pyright: ignore[reportMissingTypeStubs]
+         from botocore.config import Config  # pyright: ignore[reportMissingTypeStubs]
+     except ImportError as exc:  # pragma: no cover - optional dependency
+         raise GatewayBypassError(
+             "boto3 is required for the gateway bypass helper; install netrias-client[aws] or boto3 explicitly"
+         ) from exc
+     return boto3, Config
+
+
+ def _lambda_client_from_factory(
+     factory: _ClientFactory,
+     region_name: str,
+     config: object | None,
+ ) -> _LambdaClient:
+     kwargs: dict[str, object] = {"region_name": region_name}
+     if config is not None:
+         kwargs["config"] = config
+     client_obj = factory("lambda", **kwargs)
+     return cast(_LambdaClient, client_obj)
+
+
+ def _read_lambda_payload(stream: IO[bytes] | None) -> bytes:
+     if stream is None:
+         return b""
+     return stream.read()
+
+
+ def _json_payload(raw_payload: bytes) -> Mapping[str, object]:
+     if not raw_payload:
+         return {}
+     try:
+         return cast(Mapping[str, object], json.loads(raw_payload.decode("utf-8")))
+     except json.JSONDecodeError as exc:  # pragma: no cover - unexpected lambda output
+         raise GatewayBypassError(f"lambda returned non-JSON payload: {exc}") from exc
+
+
+ def _extract_body_mapping(payload: Mapping[str, object]) -> Mapping[str, object]:
+     body = payload.get("body")
+     if isinstance(body, str):
+         try:
+             return cast(Mapping[str, object], json.loads(body))
+         except json.JSONDecodeError as exc:  # pragma: no cover - unexpected lambda output
+             raise GatewayBypassError(f"lambda body was not valid JSON: {exc}") from exc
+     return payload
+
+
+ def _normalized_columns(columns: Mapping[str, Sequence[object]]) -> dict[str, list[str]]:
+     normalized: dict[str, list[str]] = {}
+     for key, values in columns.items():
+         name = _normalized_column_key(key)
+         if name is None:
+             continue
+         cleaned = _normalized_column_values(values)
+         if cleaned:
+             normalized[name] = cleaned
+     return normalized
+
+
+ def _normalized_column_key(raw: str) -> str | None:
+     text = raw.strip()
+     return text or None
+
+
+ def _normalized_column_values(values: Sequence[object]) -> list[str]:
+     return [text for text in (_normalized_column_value(value) for value in values) if text]
+
+
+ def _normalized_column_value(value: object) -> str | None:
+     if value is None:
+         return None
+     text = str(value).strip()
+     return text or None
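As a minimal usage sketch of the helper above (the schema name, version, column samples, and top_k value are illustrative placeholders, and AWS credentials are assumed to be available in the environment):

    # Hypothetical call; the argument values are placeholders, not real Netrias data.
    recommendations = invoke_cde_recommendation_alias(
        target_schema="example_schema",
        target_version="1.0",
        columns={"sex": ["male", "female", None]},  # None entries are dropped during normalization
        top_k=3,
    )
    print(recommendations)  # parsed JSON body returned by the Lambda alias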
@@ -0,0 +1,234 @@
+ """HTTP helpers for harmonization and discovery."""
+ from __future__ import annotations
+
+ import csv
+ import gzip
+ import json
+ from collections.abc import Mapping, Sequence
+ from pathlib import Path
+ from typing import Final
+ from urllib.parse import quote
+
+ import httpx
+
+ from ._adapter import normalize_manifest_mapping
+
+ SCHEMA_VERSION: Final[str] = "1.0"
+ DEFAULT_MODEL_VERSION: Final[str] = "v1"
+ MAX_COMPRESSED_BYTES: Final[int] = 10 * 1024 * 1024
+
+ def build_harmonize_payload(
+     csv_path: Path,
+     manifest: Path | Mapping[str, object] | None,
+     model_version: str = DEFAULT_MODEL_VERSION,
+ ) -> bytes:
+     """Return gzip-compressed harmonization payload for the given CSV and manifest."""
+
+     rows = _read_tabular(csv_path)
+     header = rows[0] if rows else []
+     data_rows = rows[1:] if len(rows) > 1 else []
+
+     envelope: dict[str, object] = {
+         "schemaVersion": SCHEMA_VERSION,
+         "modelVersion": model_version,
+         "document": {
+             "name": csv_path.name,
+             "sheetName": None,
+             "header": header,
+             "rows": data_rows,
+         },
+     }
+
+     mapping = normalize_manifest_mapping(manifest)
+     if mapping:
+         envelope["mapping"] = mapping
+
+     raw = json.dumps(envelope, ensure_ascii=False, separators=(",", ":")).encode("utf-8")
+     compressed = gzip.compress(raw)
+     if len(compressed) > MAX_COMPRESSED_BYTES:
+         raise ValueError("compressed harmonization payload exceeds 10 MiB")
+     return compressed
+
+ async def submit_harmonize_job(
+     base_url: str,
+     api_key: str,
+     payload_gz: bytes,
+     timeout: float,
+     idempotency_key: str | None = None,
+ ) -> httpx.Response:
+     """Submit a harmonization job request and return the raw response."""
+
+     url = _build_job_submit_url(base_url)
+     headers = {
+         "Authorization": f"Bearer {api_key}",
+         "Content-Type": "application/json",
+         "Content-Encoding": "gzip",
+     }
+     if idempotency_key:
+         headers["Idempotency-Key"] = idempotency_key
+
+     async with httpx.AsyncClient(timeout=httpx.Timeout(timeout)) as client:
+         return await client.post(url, content=payload_gz, headers=headers)
+
+ async def fetch_job_status(
+     base_url: str,
+     api_key: str,
+     job_id: str,
+     timeout: float,
+ ) -> httpx.Response:
+     """Return the status response for a previously submitted harmonization job."""
+
+     url = _build_job_status_url(base_url, job_id)
+     headers = {"Authorization": f"Bearer {api_key}"}
+     async with httpx.AsyncClient(timeout=httpx.Timeout(timeout)) as client:
+         return await client.get(url, headers=headers)
+
+ async def request_mapping_discovery(
+     base_url: str,
+     api_key: str,
+     timeout: float,
+     schema: str,
+     version: str,
+     columns: Mapping[str, Sequence[str]],
+     top_k: int | None = None,
+ ) -> httpx.Response:
+     """Submit column samples for mapping recommendations."""
+
+     url = _build_discovery_url(base_url)
+     headers = {
+         "Content-Type": "application/json",
+         "x-api-key": api_key,
+     }
+     body: dict[str, object] = {
+         "target_schema": schema,
+         "target_version": version,
+         "data": columns,
+     }
+     if top_k is not None:
+         body["top_k"] = top_k
+     payload = {"body": json.dumps(body)}
+     async with httpx.AsyncClient(timeout=httpx.Timeout(timeout)) as client:
+         return await client.post(url, headers=headers, json=payload)
+
+
+ async def fetch_data_models(
+     base_url: str,
+     api_key: str,
+     timeout: float,
+     query: str | None = None,
+     include_versions: bool = False,
+     include_counts: bool = False,
+     limit: int | None = None,
+     offset: int = 0,
+ ) -> httpx.Response:
+     """Fetch data models from the Data Model Store."""
+
+     url = f"{base_url.rstrip('/')}/data-models"
+     headers = {"x-api-key": api_key}
+     params = _build_data_models_params(query, include_versions, include_counts, limit, offset)
+
+     async with httpx.AsyncClient(timeout=httpx.Timeout(timeout)) as client:
+         return await client.get(url, headers=headers, params=params)
+
+
+ def _build_data_models_params(
+     query: str | None,
+     include_versions: bool,
+     include_counts: bool,
+     limit: int | None,
+     offset: int,
+ ) -> dict[str, str | int]:
+     """Build query parameters for data models endpoint."""
+
+     candidates: list[tuple[str, str | int | None]] = [
+         ("offset", offset),
+         ("q", query),
+         ("include_versions", "true" if include_versions else None),
+         ("include_counts", "true" if include_counts else None),
+         ("limit", limit),
+     ]
+     return {k: v for k, v in candidates if v is not None}
+
+
+ async def fetch_cdes(
+     base_url: str,
+     api_key: str,
+     timeout: float,
+     model_key: str,
+     version: str,
+     include_description: bool = False,
+     query: str | None = None,
+     limit: int | None = None,
+     offset: int = 0,
+ ) -> httpx.Response:
+     """Fetch CDEs for a data model version from the Data Model Store."""
+
+     url = f"{base_url.rstrip('/')}/data-models/{quote(model_key, safe='')}/versions/{quote(version, safe='')}/cdes"
+     headers = {"x-api-key": api_key}
+     params: dict[str, str | int] = {"offset": offset}
+     if include_description:
+         params["include_description"] = "true"
+     if query:
+         params["q"] = query
+     if limit is not None:
+         params["limit"] = limit
+
+     async with httpx.AsyncClient(timeout=httpx.Timeout(timeout)) as client:
+         return await client.get(url, headers=headers, params=params)
+
+
+ async def fetch_pvs(
+     base_url: str,
+     api_key: str,
+     timeout: float,
+     model_key: str,
+     version: str,
+     cde_key: str,
+     include_inactive: bool = False,
+     query: str | None = None,
+     limit: int | None = None,
+     offset: int = 0,
+ ) -> httpx.Response:
+     """Fetch permissible values for a CDE from the Data Model Store."""
+
+     path = (
+         f"/data-models/{quote(model_key, safe='')}"
+         f"/versions/{quote(version, safe='')}"
+         f"/cdes/{quote(cde_key, safe='')}/pvs"
+     )
+     url = f"{base_url.rstrip('/')}{path}"
+     headers = {"x-api-key": api_key}
+     params: dict[str, str | int] = {"offset": offset}
+     if include_inactive:
+         params["include_inactive"] = "true"
+     if query:
+         params["q"] = query
+     if limit is not None:
+         params["limit"] = limit
+
+     async with httpx.AsyncClient(timeout=httpx.Timeout(timeout)) as client:
+         return await client.get(url, headers=headers, params=params)
+
+ def _build_job_submit_url(base_url: str) -> str:
+     base = base_url.rstrip("/")
+     return f"{base}/v1/jobs/harmonize"
+
+ def _build_job_status_url(base_url: str, job_id: str) -> str:
+     base = base_url.rstrip("/")
+     return f"{base}/v1/jobs/{job_id}"
+
+ def _build_discovery_url(base_url: str) -> str:
+     base = base_url.rstrip("/")
+     return f"{base}/cde-recommendation"
+
+ def _read_tabular(path: Path) -> list[list[str]]:
+     if not path.exists():
+         raise FileNotFoundError(path)
+     ext = path.suffix.lower()
+     if ext not in {".csv", ".tsv"}:
+         raise ValueError("harmonization only supports CSV or TSV inputs")
+     delimiter = "," if ext == ".csv" else "\t"
+     with path.open("r", encoding="utf-8", newline="") as handle:
+         reader = csv.reader(handle, delimiter=delimiter)
+         return [list(row) for row in reader]
+
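A sketch of how these helpers compose into a submit-then-poll flow (the base URL, API key, and CSV path are placeholders, and the job-id field name in the submit response is an assumption not shown in this diff):

    import asyncio
    from pathlib import Path

    async def run() -> None:
        # Build the gzip payload, submit it, then fetch the job's status once.
        payload = build_harmonize_payload(Path("samples.csv"), manifest=None)
        submitted = await submit_harmonize_job(
            "https://api.example.com", "MY_API_KEY", payload, timeout=30.0
        )
        job_id = submitted.json().get("job_id")  # assumed response field name
        if job_id:
            status = await fetch_job_status(
                "https://api.example.com", "MY_API_KEY", job_id, timeout=30.0
            )
            print(status.json())

    asyncio.run(run())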
netrias_client/_io.py ADDED
@@ -0,0 +1,28 @@
+ """I/O helpers for streaming responses.
+
+ 'why': keep file operations small and testable; avoid partial outputs
+ """
+ from __future__ import annotations
+
+ import tempfile
+ from pathlib import Path
+
+ import httpx
+
+
+ async def stream_download_to_file(response: httpx.Response, dest_path: Path) -> Path:
+     """Stream an HTTP response body to `dest_path` atomically.
+
+     Writes to a temporary file in the destination directory and then renames.
+     """
+
+     dest_path = Path(dest_path)
+     tmp_dir = dest_path.parent
+     tmp_dir.mkdir(parents=True, exist_ok=True)
+     with tempfile.NamedTemporaryFile(dir=tmp_dir, delete=False, suffix=".partial") as tmp:
+         async for chunk in response.aiter_bytes():
+             _ = tmp.write(chunk)
+         tmp_path = Path(tmp.name)
+     _ = tmp_path.replace(dest_path)
+     return dest_path
+
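Because stream_download_to_file consumes response.aiter_bytes(), it is meant to be paired with an httpx streaming response; a minimal sketch, with a placeholder URL:

    import asyncio
    from pathlib import Path

    import httpx

    async def download(url: str, dest: Path) -> Path:
        # Stream the body so large files never sit fully in memory.
        async with httpx.AsyncClient() as client:
            async with client.stream("GET", url) as response:
                response.raise_for_status()
                return await stream_download_to_file(response, dest)

    asyncio.run(download("https://example.com/result.csv", Path("out/result.csv")))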
@@ -0,0 +1,46 @@
+ """Logger helpers for the Netrias client."""
+ from __future__ import annotations
+
+ import logging
+ from pathlib import Path
+ from typing import Final
+
+ from ._models import LogLevel
+
+
+ _FORMAT: Final[str] = "%(asctime)s %(levelname)s netrias_client: %(message)s"
+
+
+ def configure_logger(
+     name: str,
+     level: LogLevel,
+     log_directory: Path | None,
+ ) -> logging.Logger:
+     """Configure and return a logger dedicated to a Netrias client instance."""
+
+     logger = logging.getLogger(name)
+     logger.handlers.clear()
+     logger.propagate = False
+
+     formatter = logging.Formatter(fmt=_FORMAT)
+
+     stream_handler = logging.StreamHandler()
+     stream_handler.setFormatter(formatter)
+     logger.addHandler(stream_handler)
+
+     if log_directory is not None:
+         log_directory.mkdir(parents=True, exist_ok=True)
+         file_path = log_directory / f"{name.replace('.', '_')}.log"
+         file_handler = logging.FileHandler(file_path, encoding="utf-8")
+         file_handler.setFormatter(formatter)
+         logger.addHandler(file_handler)
+
+     mapping = {
+         LogLevel.CRITICAL: logging.CRITICAL,
+         LogLevel.ERROR: logging.ERROR,
+         LogLevel.WARNING: logging.WARNING,
+         LogLevel.INFO: logging.INFO,
+         LogLevel.DEBUG: logging.DEBUG,
+     }
+     logger.setLevel(mapping[level])
+     return logger
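A minimal sketch of configuring a per-instance logger (the logger name and directory are placeholders):

    from pathlib import Path

    logger = configure_logger("netrias_client.demo", LogLevel.DEBUG, Path("logs"))
    logger.debug("configured")  # also written to logs/netrias_client_demo.log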
netrias_client/_models.py ADDED
@@ -0,0 +1,115 @@
+ """Define dataclasses and types for the client.
+
+ 'why': capture configuration and results in typed, testable shapes
+ """
+ from __future__ import annotations
+
+ from collections.abc import Mapping
+ from dataclasses import dataclass
+ from enum import Enum
+ from pathlib import Path
+ from typing import Literal
+
+
+ class LogLevel(str, Enum):
+     """Enumerate supported logging levels for the client."""
+
+     CRITICAL = "CRITICAL"
+     ERROR = "ERROR"
+     WARNING = "WARNING"
+     INFO = "INFO"
+     DEBUG = "DEBUG"
+
+
+ @dataclass(frozen=True)
+ class Settings:
+     """Capture runtime settings for API calls."""
+
+     api_key: str
+     discovery_url: str
+     harmonization_url: str
+     timeout: float
+     log_level: LogLevel
+     confidence_threshold: float
+     discovery_use_gateway_bypass: bool
+     log_directory: Path | None
+     data_model_store_endpoints: DataModelStoreEndpoints | None = None
+
+
+ @dataclass(frozen=True)
+ class HarmonizationResult:
+     """Communicate harmonization outcome in a consistent shape."""
+
+     file_path: Path
+     status: Literal["succeeded", "failed", "timeout"]
+     description: str
+     mapping_id: str | None = None
+
+
+ @dataclass(frozen=True)
+ class MappingRecommendationOption:
+     """Capture a single recommended target for a source column."""
+
+     target: str | None
+     confidence: float | None
+     target_cde_id: int | None = None
+     raw: Mapping[str, object] | None = None
+
+
+ @dataclass(frozen=True)
+ class MappingSuggestion:
+     """Group recommendation options for a single source column."""
+
+     source_column: str
+     options: tuple[MappingRecommendationOption, ...]
+     raw: Mapping[str, object] | None = None
+
+
+ @dataclass(frozen=True)
+ class MappingDiscoveryResult:
+     """Communicate column mapping recommendations for a dataset."""
+
+     schema: str
+     suggestions: tuple[MappingSuggestion, ...]
+     raw: Mapping[str, object]
+
+
+ @dataclass(frozen=True)
+ class DataModelStoreEndpoints:
+     """Encapsulate Data Model Store endpoint URLs for swappability.
+
+     'why': endpoints may change; grouping them enables single-point override
+     """
+
+     base_url: str
+
+
+ @dataclass(frozen=True)
+ class DataModel:
+     """Represent a data commons/model from the Data Model Store."""
+
+     data_commons_id: int
+     key: str
+     name: str
+     description: str | None
+     is_active: bool
+
+
+ @dataclass(frozen=True)
+ class CDE:
+     """Represent a Common Data Element within a data model version."""
+
+     cde_key: str
+     cde_id: int
+     cde_version_id: int
+     description: str | None = None
+
+
+ @dataclass(frozen=True)
+ class PermissibleValue:
+     """Represent a permissible value for a CDE."""
+
+     pv_id: int
+     value: str
+     description: str | None
+     is_active: bool
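For reference, an illustrative construction of the frozen Settings dataclass (all URLs and the key are placeholders, not real Netrias endpoints):

    from pathlib import Path

    settings = Settings(
        api_key="MY_API_KEY",
        discovery_url="https://discovery.example.com",
        harmonization_url="https://harmonization.example.com",
        timeout=30.0,
        log_level=LogLevel.INFO,
        confidence_threshold=0.8,
        discovery_use_gateway_bypass=False,
        log_directory=None,
        data_model_store_endpoints=DataModelStoreEndpoints(base_url="https://dms.example.com"),
    )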