netrias_client 0.0.1__py3-none-any.whl

This diff shows the content of publicly available package versions as released to their public registries, and is provided for informational purposes only.

This version of netrias_client has been flagged as a potentially problematic release.

@@ -0,0 +1,126 @@
+ """HTTP helpers for harmonization and discovery."""
+ from __future__ import annotations
+
+ import csv
+ import gzip
+ import json
+ from collections.abc import Mapping, Sequence
+ from pathlib import Path
+ from typing import Final
+
+ import httpx
+
+ from ._adapter import normalize_manifest_mapping
+
+ SCHEMA_VERSION: Final[str] = "1.0"
+ DEFAULT_MODEL_VERSION: Final[str] = "v1"
+ MAX_COMPRESSED_BYTES: Final[int] = 10 * 1024 * 1024
+
+ def build_harmonize_payload(
+     csv_path: Path,
+     manifest: Path | Mapping[str, object] | None,
+     model_version: str = DEFAULT_MODEL_VERSION,
+ ) -> bytes:
+     """Return gzip-compressed harmonization payload for the given CSV and manifest."""
+
+     rows = _read_tabular(csv_path)
+     header = rows[0] if rows else []
+     data_rows = rows[1:] if len(rows) > 1 else []
+
+     envelope: dict[str, object] = {
+         "schemaVersion": SCHEMA_VERSION,
+         "modelVersion": model_version,
+         "document": {
+             "name": csv_path.name,
+             "sheetName": None,
+             "header": header,
+             "rows": data_rows,
+         },
+     }
+
+     mapping = normalize_manifest_mapping(manifest)
+     if mapping:
+         envelope["mapping"] = mapping
+
+     raw = json.dumps(envelope, ensure_ascii=False, separators=(",", ":")).encode("utf-8")
+     compressed = gzip.compress(raw)
+     if len(compressed) > MAX_COMPRESSED_BYTES:
+         raise ValueError("compressed harmonization payload exceeds 10 MiB")
+     return compressed
+
+ async def submit_harmonize_job(
+     base_url: str,
+     api_key: str,
+     payload_gz: bytes,
+     timeout: float,
+     idempotency_key: str | None = None,
+ ) -> httpx.Response:
+     """Submit a harmonization job request and return the raw response."""
+
+     url = _build_job_submit_url(base_url)
+     headers = {
+         "Authorization": f"Bearer {api_key}",
+         "Content-Type": "application/json",
+         "Content-Encoding": "gzip",
+     }
+     if idempotency_key:
+         headers["Idempotency-Key"] = idempotency_key
+
+     async with httpx.AsyncClient(timeout=httpx.Timeout(timeout)) as client:
+         return await client.post(url, content=payload_gz, headers=headers)
+
+ async def fetch_job_status(
+     base_url: str,
+     api_key: str,
+     job_id: str,
+     timeout: float,
+ ) -> httpx.Response:
+     """Return the status response for a previously submitted harmonization job."""
+
+     url = _build_job_status_url(base_url, job_id)
+     headers = {"Authorization": f"Bearer {api_key}"}
+     async with httpx.AsyncClient(timeout=httpx.Timeout(timeout)) as client:
+         return await client.get(url, headers=headers)
+
+ async def request_mapping_discovery(
+     base_url: str,
+     api_key: str,
+     timeout: float,
+     schema: str,
+     columns: Mapping[str, Sequence[str]],
+ ) -> httpx.Response:
+     """Submit column samples for mapping recommendations."""
+
+     url = _build_discovery_url(base_url)
+     headers = {
+         "Content-Type": "application/json",
+         "x-api-key": api_key,
+     }
+     body = {"target_schema": schema, "data": columns}
+     payload = {"body": json.dumps(body)}
+     async with httpx.AsyncClient(timeout=httpx.Timeout(timeout)) as client:
+         return await client.post(url, headers=headers, json=payload)
+
+ def _build_job_submit_url(base_url: str) -> str:
+     base = base_url.rstrip("/")
+     return f"{base}/v1/jobs/harmonize"
+
+ def _build_job_status_url(base_url: str, job_id: str) -> str:
+     base = base_url.rstrip("/")
+     return f"{base}/v1/jobs/{job_id}"
+
+ def _build_discovery_url(base_url: str) -> str:
+     base = base_url.rstrip("/")
+     return f"{base}/cde-recommendation"
+
+ def _read_tabular(path: Path) -> list[list[str]]:
+     if not path.exists():
+         raise FileNotFoundError(path)
+     ext = path.suffix.lower()
+     if ext not in {".csv", ".tsv"}:
+         raise ValueError("harmonization only supports CSV or TSV inputs")
+     delimiter = "," if ext == ".csv" else "\t"
+     with path.open("r", encoding="utf-8", newline="") as handle:
+         reader = csv.reader(handle, delimiter=delimiter)
+         return [list(row) for row in reader]
+
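The module above builds a gzip-compressed JSON envelope (capped at 10 MiB after compression) and submits it with bearer-token auth. A minimal usage sketch follows; the import path is an assumption, since the diff does not show this hunk's file name, and the endpoint and key are placeholders.

import asyncio
from pathlib import Path

# Import path assumed; the diff does not show this hunk's file name.
from netrias_client._http import build_harmonize_payload, submit_harmonize_job

async def main() -> None:
    # "samples.csv" is an invented input; any local CSV works.
    payload = build_harmonize_payload(Path("samples.csv"), manifest=None)
    response = await submit_harmonize_job(
        base_url="https://api.example.invalid",  # placeholder endpoint
        api_key="YOUR_API_KEY",  # placeholder credential
        payload_gz=payload,
        timeout=30.0,
    )
    print(response.status_code)

asyncio.run(main())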
netrias_client/_io.py ADDED
@@ -0,0 +1,28 @@
+ """I/O helpers for streaming responses.
+
+ 'why': keep file operations small and testable; avoid partial outputs
+ """
+ from __future__ import annotations
+
+ import tempfile
+ from pathlib import Path
+
+ import httpx
+
+
+ async def stream_download_to_file(response: httpx.Response, dest_path: Path) -> Path:
+     """Stream an HTTP response body to `dest_path` atomically.
+
+     Writes to a temporary file in the destination directory and then renames.
+     """
+
+     dest_path = Path(dest_path)
+     tmp_dir = dest_path.parent
+     tmp_dir.mkdir(parents=True, exist_ok=True)
+     with tempfile.NamedTemporaryFile(dir=tmp_dir, delete=False, suffix=".partial") as tmp:
+         async for chunk in response.aiter_bytes():
+             _ = tmp.write(chunk)
+         tmp_path = Path(tmp.name)
+     _ = tmp_path.replace(dest_path)
+     return dest_path
+
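stream_download_to_file consumes response.aiter_bytes(), so the response has to be opened in streaming mode rather than pre-read. A sketch of driving it with httpx's client.stream (the URL is a placeholder):

import asyncio
from pathlib import Path

import httpx

from netrias_client._io import stream_download_to_file

async def download(url: str, dest: Path) -> Path:
    async with httpx.AsyncClient() as client:
        # client.stream leaves the body unread so it can be iterated in chunks.
        async with client.stream("GET", url) as response:
            response.raise_for_status()
            return await stream_download_to_file(response, dest)

asyncio.run(download("https://api.example.invalid/result.csv", Path("result.csv")))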
@@ -0,0 +1,46 @@
+ """Logger helpers for the Netrias client."""
+ from __future__ import annotations
+
+ import logging
+ from pathlib import Path
+ from typing import Final
+
+ from ._models import LogLevel
+
+
+ _FORMAT: Final[str] = "%(asctime)s %(levelname)s netrias_client: %(message)s"
+
+
+ def configure_logger(
+     name: str,
+     level: LogLevel,
+     log_directory: Path | None,
+ ) -> logging.Logger:
+     """Configure and return a logger dedicated to a Netrias client instance."""
+
+     logger = logging.getLogger(name)
+     logger.handlers.clear()
+     logger.propagate = False
+
+     formatter = logging.Formatter(fmt=_FORMAT)
+
+     stream_handler = logging.StreamHandler()
+     stream_handler.setFormatter(formatter)
+     logger.addHandler(stream_handler)
+
+     if log_directory is not None:
+         log_directory.mkdir(parents=True, exist_ok=True)
+         file_path = log_directory / f"{name.replace('.', '_')}.log"
+         file_handler = logging.FileHandler(file_path, encoding="utf-8")
+         file_handler.setFormatter(formatter)
+         logger.addHandler(file_handler)
+
+     mapping = {
+         LogLevel.CRITICAL: logging.CRITICAL,
+         LogLevel.ERROR: logging.ERROR,
+         LogLevel.WARNING: logging.WARNING,
+         LogLevel.INFO: logging.INFO,
+         LogLevel.DEBUG: logging.DEBUG,
+     }
+     logger.setLevel(mapping[level])
+     return logger
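Because configure_logger clears existing handlers and disables propagation, reconfiguring the same logger name is safe. A sketch, with the module's import path assumed (this hunk's file name is not shown in the diff):

from pathlib import Path

# Import path assumed; the diff does not show this hunk's file name.
from netrias_client._logging import configure_logger
from netrias_client._models import LogLevel

logger = configure_logger("netrias_client.demo", LogLevel.DEBUG, Path("logs"))
# Dots in the name become underscores: the file handler writes logs/netrias_client_demo.log.
logger.debug("logger configured")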
@@ -0,0 +1,72 @@
+ """Define dataclasses and types for the client.
+
+ 'why': capture configuration and results in typed, testable shapes
+ """
+ from __future__ import annotations
+
+ from collections.abc import Mapping
+ from dataclasses import dataclass
+ from enum import Enum
+ from pathlib import Path
+ from typing import Literal
+
+
+ class LogLevel(str, Enum):
+     """Enumerate supported logging levels for the client."""
+
+     CRITICAL = "CRITICAL"
+     ERROR = "ERROR"
+     WARNING = "WARNING"
+     INFO = "INFO"
+     DEBUG = "DEBUG"
+
+
+ @dataclass(frozen=True)
+ class Settings:
+     """Capture runtime settings for API calls."""
+
+     api_key: str
+     discovery_url: str
+     harmonization_url: str
+     timeout: float
+     log_level: LogLevel
+     confidence_threshold: float
+     discovery_use_gateway_bypass: bool
+     log_directory: Path | None
+
+
+ @dataclass(frozen=True)
+ class HarmonizationResult:
+     """Communicate harmonization outcome in a consistent shape."""
+
+     file_path: Path
+     status: Literal["succeeded", "failed", "timeout"]
+     description: str
+     mapping_id: str | None = None
+
+
+ @dataclass(frozen=True)
+ class MappingRecommendationOption:
+     """Capture a single recommended target for a source column."""
+
+     target: str | None
+     confidence: float | None
+     raw: Mapping[str, object] | None = None
+
+
+ @dataclass(frozen=True)
+ class MappingSuggestion:
+     """Group recommendation options for a single source column."""
+
+     source_column: str
+     options: tuple[MappingRecommendationOption, ...]
+     raw: Mapping[str, object] | None = None
+
+
+ @dataclass(frozen=True)
+ class MappingDiscoveryResult:
+     """Communicate column mapping recommendations for a dataset."""
+
+     schema: str
+     suggestions: tuple[MappingSuggestion, ...]
+     raw: Mapping[str, object]
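The result types nest: options inside a suggestion, suggestions inside a discovery result, all frozen. A construction sketch with invented values; the netrias_client._models path matches the "from ._models import LogLevel" import seen earlier in this diff:

from netrias_client._models import (
    MappingDiscoveryResult,
    MappingRecommendationOption,
    MappingSuggestion,
)

# All values below are invented for illustration.
option = MappingRecommendationOption(target="subject_id", confidence=0.92)
suggestion = MappingSuggestion(source_column="patient_id", options=(option,))
result = MappingDiscoveryResult(schema="example_schema", suggestions=(suggestion,), raw={})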
@@ -0,0 +1,173 @@
+ """Validate inputs for harmonization.
+
+ 'why': fail fast with clear, actionable messages prior to network calls
+ """
+ from __future__ import annotations
+
+ import os
+ from collections.abc import Mapping, Sequence
+ from pathlib import Path
+
+ from ._errors import FileValidationError, MappingValidationError, OutputLocationError
+
+
+ # OBVIOUS HARD-CODED SIZE LIMIT: 250 MB maximum CSV size prior to upload
+ HARD_MAX_CSV_BYTES = 250 * 1024 * 1024
+
+
+ def validate_source_path(path: Path) -> Path:
+     """Ensure the CSV exists, is a file, has a .csv extension, and respects size limits."""
+
+     _require_exists(path, "source CSV not found")
+     _require_is_file(path, "source path is not a file")
+     _require_suffix(path, ".csv", "unsupported file extension for source CSV")
+     _require_not_too_large(path)
+     return path
+
+
+ def validate_manifest_path(path: Path) -> Path:
+     """Ensure the manifest JSON exists, is a file, and has a .json extension."""
+
+     _require_exists(path, "manifest JSON not found")
+     _require_is_file(path, "manifest path is not a file")
+     _require_suffix(path, ".json", "manifest must be a .json file")
+     return path
+
+
+ def validate_output_path(path: Path | None, source_name: str, allow_versioning: bool = False) -> Path:
+     """Return a valid output file path, creating parent directories when needed.
+
+     Defaults to `<CWD>/<source_name>.harmonized.csv` when `path` is None, or `<path>/<source_name>.harmonized.csv` when `path` is an existing directory.
+     """
+
+     candidate = _resolve_output_candidate(path, source_name)
+     _ensure_parent(candidate)
+     _require_parent_writable(candidate)
+     if allow_versioning:
+         candidate = _next_available_path(candidate)
+     else:
+         _require_not_exists(candidate)
+     return candidate
+
+
+ def validate_target_schema(schema: str) -> str:
+     """Ensure the target schema identifier is a non-empty string."""
+
+     candidate = (schema or "").strip()
+     if not candidate:
+         raise MappingValidationError("target_schema must be a non-empty string")
+     return candidate
+
+
+ def validate_column_samples(columns: Mapping[str, Sequence[object]]) -> dict[str, list[str]]:
+     """Normalize column sample data for mapping discovery."""
+
+     if not columns:
+         raise MappingValidationError("column data must include at least one column")
+     normalized: dict[str, list[str]] = {}
+     for raw_name, values in columns.items():
+         name = _normalized_column_name(raw_name)
+         samples = _normalized_samples(name, values)
+         normalized[name] = samples
+     return normalized
+
+
+ def _require_exists(path: Path, message: str) -> None:
+     if not path.exists():
+         raise FileValidationError(f"{message}: {path}")
+
+
+ def _require_is_file(path: Path, message: str) -> None:
+     if not path.is_file():
+         raise FileValidationError(f"{message}: {path}")
+
+
+ def _require_suffix(path: Path, suffix: str, message: str) -> None:
+     if path.suffix.lower() != suffix:
+         raise FileValidationError(f"{message}: {path.suffix}")
+
+
+ def _require_not_too_large(path: Path) -> None:
+     try:
+         size = os.path.getsize(path)
+     except OSError as exc:
+         raise FileValidationError(f"unable to stat source CSV: {exc}") from exc
+     if size > HARD_MAX_CSV_BYTES:
+         raise FileValidationError(
+             f"source CSV exceeds hard-coded limit of {HARD_MAX_CSV_BYTES // (1024 * 1024)} MB (got {size} bytes)"
+         )
+
+
+ def _resolve_output_candidate(path: Path | None, source_name: str) -> Path:
+     if path is None:
+         return Path.cwd() / f"{source_name}.harmonized.csv"
+     if path.exists() and path.is_dir():
+         return path / f"{source_name}.harmonized.csv"
+     return path
+
+
+ def _ensure_parent(candidate: Path) -> None:
+     parent = candidate.parent
+     if not parent.exists():
+         try:
+             parent.mkdir(parents=True, exist_ok=True)
+         except OSError as exc:
+             raise OutputLocationError(f"unable to create output directory {parent}: {exc}") from exc
+
+
+ def _require_parent_writable(candidate: Path) -> None:
+     parent = candidate.parent
+     if parent.exists() and not os.access(parent, os.W_OK):
+         raise OutputLocationError(f"output directory not writable: {parent}")
+
+
+ def _require_not_exists(candidate: Path) -> None:
+     if candidate.exists():
+         raise OutputLocationError(f"refusing to overwrite existing file: {candidate}")
+
+
+ def _next_available_path(candidate: Path) -> Path:
+     if not candidate.exists():
+         return candidate
+     stem = candidate.stem
+     suffix = candidate.suffix
+     parent = candidate.parent
+     index = 1
+     while index < 1000:
+         versioned = parent / f"{stem}.v{index}{suffix}"
+         if not versioned.exists():
+             return versioned
+         index += 1
+     raise OutputLocationError(
+         f"unable to determine unique output path after {index - 1} attempts for {candidate}"
+     )
+
+
+ def _normalized_column_name(raw_name: object) -> str:
+     if not isinstance(raw_name, str):
+         raise MappingValidationError("column names must be strings")
+     name = raw_name.strip()
+     if not name:
+         raise MappingValidationError("column names must be non-empty strings")
+     return name
+
+
+ def _normalized_samples(column_name: str, values: Sequence[object] | None) -> list[str]:
+     sequence = _require_sequence(column_name, values)
+     samples = [sample for sample in (_coerced_sample(value) for value in sequence) if sample]
+     if not samples:
+         raise MappingValidationError(f"column '{column_name}' must include at least one non-empty sample value")
+     return samples
+
+
+ def _require_sequence(column_name: str, values: Sequence[object] | None) -> Sequence[object]:
+     if values is None or isinstance(values, (str, bytes)):
+         raise MappingValidationError(f"column '{column_name}' values must be a sequence of samples")
+     return values
+
+
+ def _coerced_sample(value: object) -> str | None:
+     if value is None:
+         return None
+     text = str(value).strip()
+     return text or None
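Each validator either returns a normalized value or raises one of the typed errors before any network traffic happens. A happy-path sketch (import path assumed, sample data invented):

# Import path assumed; the diff does not show this hunk's file name.
from netrias_client._validation import validate_column_samples, validate_target_schema

schema = validate_target_schema("  example_schema  ")  # -> "example_schema"
columns = validate_column_samples({"patient_id": ["P001", None, " P002 "]})
# -> {"patient_id": ["P001", "P002"]}: None and blank samples are dropped,
# and the remaining values are coerced to stripped strings.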