PyPI - dq-made-easy-utils - Versions diffs - 0.1.0__tar.gz - Mend

dq-made-easy-utils 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

dq_made_easy_utils-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,14 @@
+Metadata-Version: 2.4
+Name: dq-made-easy-utils
+Version: 0.1.0
+Summary: Shared utilities for dq-made-easy Python services
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+Requires-Dist: jsonschema>=4.25.1
+Requires-Dist: requests>=2.32.0
+# dq-made-easy-utils
+Shared Python utilities used across dq-made-easy services.
+Import package name: `dq_utils`.

dq_made_easy_utils-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,5 @@
+# dq-made-easy-utils
+Shared Python utilities used across dq-made-easy services.
+Import package name: `dq_utils`.

dq_made_easy_utils-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,20 @@
+[build-system]
+requires = ["setuptools>=69", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "dq-made-easy-utils"
+version = "0.1.0"
+description = "Shared utilities for dq-made-easy Python services"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+  "jsonschema>=4.25.1",
+  "requests>=2.32.0",
+]
+[tool.setuptools]
+package-dir = {"" = "src"}
+[tool.setuptools.packages.find]
+where = ["src"]

dq_made_easy_utils-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

dq_made_easy_utils-0.1.0/src/dq_made_easy_utils.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,14 @@
+Metadata-Version: 2.4
+Name: dq-made-easy-utils
+Version: 0.1.0
+Summary: Shared utilities for dq-made-easy Python services
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+Requires-Dist: jsonschema>=4.25.1
+Requires-Dist: requests>=2.32.0
+# dq-made-easy-utils
+Shared Python utilities used across dq-made-easy services.
+Import package name: `dq_utils`.

dq_made_easy_utils-0.1.0/src/dq_made_easy_utils.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,15 @@
+README.md
+pyproject.toml
+src/dq_made_easy_utils.egg-info/PKG-INFO
+src/dq_made_easy_utils.egg-info/SOURCES.txt
+src/dq_made_easy_utils.egg-info/dependency_links.txt
+src/dq_made_easy_utils.egg-info/requires.txt
+src/dq_made_easy_utils.egg-info/top_level.txt
+src/dq_utils/__init__.py
+src/dq_utils/auth_utils.py
+src/dq_utils/internal_api_contracts.py
+src/dq_utils/logging_utils.py
+src/dq_utils/spark_jars.py
+src/dq_utils/spark_runtime.py
+tests/test_auth_utils.py
+tests/test_logging_utils.py

dq_made_easy_utils-0.1.0/src/dq_made_easy_utils.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+

dq_made_easy_utils-0.1.0/src/dq_made_easy_utils.egg-info/requires.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ jsonschema>=4.25.1
2	+ requests>=2.32.0

dq_made_easy_utils-0.1.0/src/dq_made_easy_utils.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ dq_utils

dq_made_easy_utils-0.1.0/src/dq_utils/__init__.py ADDED Viewed

@@ -0,0 +1,11 @@
+"""Shared utilities for dq-made-easy Python services."""
+from dq_utils.internal_api_contracts import InternalApiContractLookupError
+from dq_utils.internal_api_contracts import InternalApiContractRegistry
+from dq_utils.internal_api_contracts import InternalApiContractValidationError
+__all__ = [
+	"InternalApiContractLookupError",
+	"InternalApiContractRegistry",
+	"InternalApiContractValidationError",
+]

dq_made_easy_utils-0.1.0/src/dq_utils/auth_utils.py ADDED Viewed

@@ -0,0 +1,303 @@
+from __future__ import annotations
+import os
+import time
+from dataclasses import dataclass
+from typing import Protocol
+import requests
+class AuthConfigError(RuntimeError):
+    pass
+class TokenProvider(Protocol):
+    def get_token(self, *, correlation_id: str) -> str: ...
+@dataclass
+class TokenBundle:
+    # An access token together with the moment it stops being valid.
+    # The token string exactly as it should be handed to callers.
+    access_token: str
+    # Absolute expiry as a Unix timestamp in seconds (same clock as time.time()).
+    expires_at_epoch_seconds: float
+class StaticTokenProvider:
+    def __init__(self, token: str) -> None:
+        token = str(token or "").strip()
+        if not token:
+            raise AuthConfigError("Static token is empty")
+        self._token = token
+    def get_token(self, *, correlation_id: str) -> str:
+        _ = correlation_id
+        return self._token
+class OidcClientCredentialsTokenProvider:
+    def __init__(
+        self,
+        *,
+        token_url: str,
+        client_id: str,
+        client_secret: str,
+        scope: str | None = None,
+        refresh_skew_seconds: int = 60,
+        timeout_seconds: int = 10,
+    ) -> None:
+        token_url = str(token_url or "").strip()
+        client_id = str(client_id or "").strip()
+        client_secret = str(client_secret or "").strip()
+        scope = str(scope or "").strip() or None
+        if not token_url:
+            raise AuthConfigError("OIDC token_url is required")
+        if not client_id:
+            raise AuthConfigError("OIDC client_id is required")
+        if not client_secret:
+            raise AuthConfigError("OIDC client_secret is required")
+        self._token_url = token_url
+        self._client_id = client_id
+        self._client_secret = client_secret
+        self._scope = scope
+        self._refresh_skew_seconds = int(refresh_skew_seconds)
+        self._timeout_seconds = int(timeout_seconds)
+        self._cached: TokenBundle | None = None
+    def get_token(self, *, correlation_id: str) -> str:
+        now = time.time()
+        if self._cached is not None and (self._cached.expires_at_epoch_seconds - self._refresh_skew_seconds) > now:
+            return self._cached.access_token
+        data: dict[str, str] = {
+            "grant_type": "client_credentials",
+            "client_id": self._client_id,
+            "client_secret": self._client_secret,
+        }
+        if self._scope:
+            data["scope"] = self._scope
+        try:
+            response = requests.post(
+                self._token_url,
+                data=data,
+                headers={"X-Correlation-ID": correlation_id},
+                timeout=self._timeout_seconds,
+            )
+        except Exception as exc:
+            raise AuthConfigError(
+                f"Unable to obtain OIDC access token (token endpoint unreachable at '{self._token_url}')"
+            ) from exc
+        if response.status_code >= 400:
+            raise AuthConfigError(
+                f"Unable to obtain OIDC access token (token endpoint returned {response.status_code})"
+            )
+        try:
+            payload = response.json()
+        except Exception as exc:
+            raise AuthConfigError("OIDC token endpoint returned non-JSON response") from exc
+        token = str(payload.get("access_token") or "").strip()
+        expires_in = payload.get("expires_in")
+        try:
+            expires_in_seconds = int(expires_in)
+        except Exception:
+            expires_in_seconds = 0
+        if not token:
+            raise AuthConfigError("OIDC token endpoint response missing access_token")
+        if expires_in_seconds <= 0:
+            raise AuthConfigError("OIDC token endpoint response missing/invalid expires_in")
+        self._cached = TokenBundle(
+            access_token=token,
+            expires_at_epoch_seconds=now + float(expires_in_seconds),
+        )
+        return token
+class OidcPasswordTokenProvider:
+    def __init__(
+        self,
+        *,
+        token_url: str,
+        client_id: str,
+        username: str,
+        password: str,
+        client_secret: str | None = None,
+        scope: str | None = None,
+        refresh_skew_seconds: int = 60,
+        timeout_seconds: int = 10,
+    ) -> None:
+        token_url = str(token_url or "").strip()
+        client_id = str(client_id or "").strip()
+        username = str(username or "").strip()
+        password = str(password or "").strip()
+        client_secret = str(client_secret or "").strip() or None
+        scope = str(scope or "").strip() or None
+        if not token_url:
+            raise AuthConfigError("OIDC token_url is required")
+        if not client_id:
+            raise AuthConfigError("OIDC client_id is required")
+        if not username:
+            raise AuthConfigError("OIDC username is required")
+        if not password:
+            raise AuthConfigError("OIDC password is required")
+        self._token_url = token_url
+        self._client_id = client_id
+        self._username = username
+        self._password = password
+        self._client_secret = client_secret
+        self._scope = scope
+        self._refresh_skew_seconds = int(refresh_skew_seconds)
+        self._timeout_seconds = int(timeout_seconds)
+        self._cached: TokenBundle | None = None
+    def get_token(self, *, correlation_id: str) -> str:
+        now = time.time()
+        if self._cached is not None and (self._cached.expires_at_epoch_seconds - self._refresh_skew_seconds) > now:
+            return self._cached.access_token
+        data: dict[str, str] = {
+            "grant_type": "password",
+            "client_id": self._client_id,
+            "username": self._username,
+            "password": self._password,
+        }
+        if self._client_secret:
+            data["client_secret"] = self._client_secret
+        if self._scope:
+            data["scope"] = self._scope
+        try:
+            response = requests.post(
+                self._token_url,
+                data=data,
+                headers={"X-Correlation-ID": correlation_id},
+                timeout=self._timeout_seconds,
+            )
+        except Exception as exc:
+            raise AuthConfigError(
+                f"Unable to obtain OIDC access token (token endpoint unreachable at '{self._token_url}')"
+            ) from exc
+        if response.status_code >= 400:
+            raise AuthConfigError(
+                f"Unable to obtain OIDC access token (token endpoint returned {response.status_code})"
+            )
+        try:
+            payload = response.json()
+        except Exception as exc:
+            raise AuthConfigError("OIDC token endpoint returned non-JSON response") from exc
+        token = str(payload.get("access_token") or "").strip()
+        expires_in = payload.get("expires_in")
+        try:
+            expires_in_seconds = int(expires_in)
+        except Exception:
+            expires_in_seconds = 0
+        if not token:
+            raise AuthConfigError("OIDC token endpoint response missing access_token")
+        if expires_in_seconds <= 0:
+            raise AuthConfigError("OIDC token endpoint response missing/invalid expires_in")
+        self._cached = TokenBundle(
+            access_token=token,
+            expires_at_epoch_seconds=now + float(expires_in_seconds),
+        )
+        return token
+def resolve_oidc_token_url(*, issuer: str | None, token_url: str | None) -> str | None:
+    token_url_value = str(token_url or "").strip()
+    if token_url_value:
+        return token_url_value
+    issuer_value = str(issuer or "").strip().rstrip("/")
+    if issuer_value:
+        return issuer_value + "/protocol/openid-connect/token"
+    return None
+def build_token_provider_from_env(
+    *,
+    static_token_env_var: str,
+    issuer_env_var: str,
+    token_url_env_var: str,
+    client_id_env_var: str,
+    client_secret_env_var: str,
+    scope_env_var: str,
+    refresh_skew_seconds: int = 60,
+) -> TokenProvider:
+    static_token = str(os.getenv(static_token_env_var) or "").strip()
+    if static_token:
+        return StaticTokenProvider(static_token)
+    token_url = resolve_oidc_token_url(
+        issuer=os.getenv(issuer_env_var),
+        token_url=os.getenv(token_url_env_var),
+    )
+    client_id = str(os.getenv(client_id_env_var) or "").strip()
+    client_secret = str(os.getenv(client_secret_env_var) or "").strip()
+    scope = str(os.getenv(scope_env_var) or "").strip() or None
+    if token_url and client_id and client_secret:
+        return OidcClientCredentialsTokenProvider(
+            token_url=token_url,
+            client_id=client_id,
+            client_secret=client_secret,
+            scope=scope,
+            refresh_skew_seconds=refresh_skew_seconds,
+        )
+    raise AuthConfigError(
+        "Auth is not configured. Set a static bearer token in "
+        f"{static_token_env_var}, or configure OIDC client credentials using "
+        f"({issuer_env_var} or {token_url_env_var}) plus {client_id_env_var} and {client_secret_env_var}."
+    )
+def build_oidc_token_provider_from_env(
+    *,
+    issuer_env_var: str,
+    token_url_env_var: str,
+    client_id_env_var: str,
+    client_secret_env_var: str,
+    scope_env_var: str,
+    refresh_skew_seconds: int = 60,
+) -> TokenProvider:
+    """Build an OIDC client-credentials token provider from env.
+    This intentionally does not support static bearer tokens. Callers that need
+    fail-fast token rotation should use this helper.
+    """
+    token_url = resolve_oidc_token_url(
+        issuer=os.getenv(issuer_env_var),
+        token_url=os.getenv(token_url_env_var),
+    )
+    client_id = str(os.getenv(client_id_env_var) or "").strip()
+    client_secret = str(os.getenv(client_secret_env_var) or "").strip()
+    scope = str(os.getenv(scope_env_var) or "").strip() or None
+    if token_url and client_id and client_secret:
+        return OidcClientCredentialsTokenProvider(
+            token_url=token_url,
+            client_id=client_id,
+            client_secret=client_secret,
+            scope=scope,
+            refresh_skew_seconds=refresh_skew_seconds,
+        )
+    raise AuthConfigError(
+        "OIDC auth is not configured. Configure OIDC client credentials using "
+        f"({issuer_env_var} or {token_url_env_var}) plus {client_id_env_var} and {client_secret_env_var}."
+    )

dq_made_easy_utils-0.1.0/src/dq_utils/internal_api_contracts.py ADDED Viewed

@@ -0,0 +1,185 @@
+from __future__ import annotations
+from dataclasses import dataclass
+import json
+from pathlib import Path
+from typing import Any
+from jsonschema import Draft202012Validator
+def _format_json_path(segments: tuple[Any, ...]) -> str:
+    path = "$"
+    for segment in segments:
+        if isinstance(segment, int):
+            path += f"[{segment}]"
+            continue
+        text = str(segment)
+        if text.isidentifier():
+            path += f".{text}"
+            continue
+        path += f"[{json.dumps(text)}]"
+    return path
+@dataclass(frozen=True)
+class ContractValidationIssue:
+    json_path: str
+    schema_path: str
+    message: str
+    validator: str
+    def as_dict(self) -> dict[str, str]:
+        return {
+            "json_path": self.json_path,
+            "schema_path": self.schema_path,
+            "message": self.message,
+            "validator": self.validator,
+        }
+@dataclass(frozen=True)
+class OperationContract:
+    # Immutable description of one API operation as read from an operations manifest.
+    # Version of the aggregate bundle the operation was loaded from.
+    version: str
+    # HTTP method, stored upper-cased by the registry loader.
+    method: str
+    # Request path exactly as written in the manifest.
+    path: str
+    # Manifest "operation_id", whitespace-trimmed.
+    operation_id: str
+    # Manifest request_body.required flag (False when absent).
+    request_body_required: bool
+    # schema_ref of the "application/json" content entry, or None when there is none.
+    request_body_schema_ref: str | None
+    # Sorted media types declared under request_body.content.
+    request_content_types: tuple[str, ...]
+class InternalApiContractLookupError(RuntimeError):
+    pass
+class InternalApiContractValidationError(RuntimeError):
+    def __init__(self, operation: OperationContract, issues: list[ContractValidationIssue]) -> None:
+        self.operation = operation
+        self.issues = tuple(issues)
+        super().__init__(
+            f"Request payload does not match contract for {operation.method} {operation.path} ({operation.operation_id})"
+        )
+    def as_dict(self) -> dict[str, Any]:
+        return {
+            "operation_id": self.operation.operation_id,
+            "path": self.operation.path,
+            "method": self.operation.method,
+            "validation_errors": [issue.as_dict() for issue in self.issues],
+        }
+class InternalApiContractRegistry:
+    def __init__(self, contracts_root: str | Path) -> None:
+        self._contracts_root = Path(contracts_root)
+        self._operations: dict[tuple[str, str], OperationContract] = {}
+        self._schema_bundles: dict[str, dict[str, Any]] = {}
+        self._validators: dict[tuple[str, str], Draft202012Validator] = {}
+        self._load()
+    @property
+    def contracts_root(self) -> Path:
+        return self._contracts_root
+    def get_operation(self, method: str, path: str) -> OperationContract | None:
+        return self._operations.get((str(method or "").upper(), str(path or "")))
+    def validate_request_payload(self, method: str, path: str, payload: Any) -> OperationContract:
+        operation = self.get_operation(method, path)
+        if operation is None:
+            raise InternalApiContractLookupError(f"No internal API contract found for {method} {path}")
+        if operation.request_body_schema_ref is None:
+            return operation
+        validator = self._get_validator(operation.version, operation.request_body_schema_ref)
+        errors = sorted(validator.iter_errors(payload), key=lambda err: (list(err.path), list(err.schema_path)))
+        if not errors:
+            return operation
+        issues = [
+            ContractValidationIssue(
+                json_path=_format_json_path(tuple(error.path)),
+                schema_path=_format_json_path(tuple(error.schema_path)),
+                message=error.message,
+                validator=str(error.validator),
+            )
+            for error in errors
+        ]
+        raise InternalApiContractValidationError(operation, issues)
+    def _get_validator(self, version: str, schema_ref: str) -> Draft202012Validator:
+        cache_key = (version, schema_ref)
+        cached = self._validators.get(cache_key)
+        if cached is not None:
+            return cached
+        schema_bundle = self._schema_bundles.get(version)
+        if schema_bundle is None:
+            raise InternalApiContractLookupError(f"No schema bundle loaded for internal API version {version}")
+        validation_schema = {
+            "$schema": schema_bundle.get("$schema", "https://json-schema.org/draft/2020-12/schema"),
+            "$defs": schema_bundle.get("$defs", {}),
+            "allOf": [{"$ref": schema_ref}],
+        }
+        validator = Draft202012Validator(validation_schema)
+        self._validators[cache_key] = validator
+        return validator
+    def _load(self) -> None:
+        index_path = self._contracts_root / "index.json"
+        if not index_path.exists():
+            raise RuntimeError(f"Internal API contract index is missing: {index_path}")
+        index_payload = json.loads(index_path.read_text())
+        contracts = index_payload.get("contracts")
+        if not isinstance(contracts, list):
+            raise RuntimeError(f"Internal API contract index is invalid: {index_path}")
+        aggregate_contracts = [
+            contract for contract in contracts if isinstance(contract, dict) and contract.get("kind") == "aggregate"
+        ]
+        if not aggregate_contracts:
+            raise RuntimeError(f"Internal API contract index has no aggregate bundle entries: {index_path}")
+        for contract in aggregate_contracts:
+            version = str(contract.get("version") or "").strip()
+            files = contract.get("files") or {}
+            schema_path = self._contracts_root / str(files.get("schema") or "")
+            operations_path = self._contracts_root / str(files.get("operations") or "")
+            if not version or not schema_path.exists() or not operations_path.exists():
+                raise RuntimeError(
+                    f"Internal API aggregate contract bundle is incomplete for version {version or '<unknown>'}: {contract}"
+                )
+            schema_bundle = json.loads(schema_path.read_text())
+            operations_manifest = json.loads(operations_path.read_text())
+            operations = operations_manifest.get("operations")
+            if not isinstance(operations, list):
+                raise RuntimeError(f"Internal API operations manifest is invalid: {operations_path}")
+            self._schema_bundles[version] = schema_bundle
+            for operation in operations:
+                if not isinstance(operation, dict):
+                    continue
+                method = str(operation.get("method") or "").upper()
+                path = str(operation.get("path") or "")
+                operation_id = str(operation.get("operation_id") or "").strip()
+                request_body = operation.get("request_body") or {}
+                content = request_body.get("content") or {}
+                request_content_types = tuple(sorted(str(media_type) for media_type in content.keys()))
+                application_json = content.get("application/json") if isinstance(content, dict) else None
+                schema_ref = None
+                if isinstance(application_json, dict):
+                    schema_ref = application_json.get("schema_ref")
+                self._operations[(method, path)] = OperationContract(
+                    version=version,
+                    method=method,
+                    path=path,
+                    operation_id=operation_id,
+                    request_body_required=bool(request_body.get("required", False)),
+                    request_body_schema_ref=str(schema_ref) if schema_ref else None,
+                    request_content_types=request_content_types,
+                )

dq_made_easy_utils-0.1.0/src/dq_utils/logging_utils.py ADDED Viewed

@@ -0,0 +1,76 @@
+from __future__ import annotations
+import json
+import logging
+import time
+from typing import Any
+_STD_KEYS = frozenset(
+    {
+        "name",
+        "msg",
+        "args",
+        "created",
+        "relativeCreated",
+        "levelname",
+        "levelno",
+        "pathname",
+        "filename",
+        "module",
+        "funcName",
+        "lineno",
+        "thread",
+        "threadName",
+        "processName",
+        "process",
+        "msecs",
+        "exc_info",
+        "exc_text",
+        "stack_info",
+        "taskName",
+        "message",
+    }
+)
+class _JsonFormatter(logging.Formatter):
+    def format(self, record: logging.LogRecord) -> str:
+        payload: dict[str, Any] = {
+            "ts": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(record.created)),
+            "level": record.levelname,
+            "logger": record.name,
+            "msg": record.getMessage(),
+        }
+        for key, value in record.__dict__.items():
+            if key.startswith("_") or key in _STD_KEYS:
+                continue
+            payload[key] = value
+        if record.exc_info:
+            payload["exception"] = self.formatException(record.exc_info)
+        return json.dumps(payload, default=str)
+def configure_logging(level: str = "INFO") -> None:
+    handler = logging.StreamHandler()
+    handler.setFormatter(_JsonFormatter())
+    root = logging.getLogger()
+    root.handlers.clear()
+    root.addHandler(handler)
+    root.setLevel(getattr(logging, level.upper(), logging.INFO))
+def log_event(logger: logging.Logger, event: str, level: str = "info", **context: Any) -> None:
+    raw_extra = {"event": event, **context}
+    # Never allow callers to overwrite reserved LogRecord attributes.
+    # Python's logging will raise KeyError (and can crash workers) if `extra`
+    # contains any standard LogRecord keys such as `message`.
+    safe_extra: dict[str, Any] = {}
+    for key, value in raw_extra.items():
+        if key in _STD_KEYS:
+            safe_extra[f"ctx_{key}"] = value
+        else:
+            safe_extra[key] = value
+    getattr(logger, level.lower())(event, extra=safe_extra)

dq_made_easy_utils-0.1.0/src/dq_utils/spark_jars.py ADDED Viewed

@@ -0,0 +1,99 @@
+from __future__ import annotations
+import os
+import re
+from pathlib import Path
+from typing import Any
+# Jar directory used when the DQ_SPARK_JAR_DIR environment variable is unset or empty.
+DEFAULT_SPARK_JAR_DIR = Path.home() / ".dq-spark-jars"
+# Artifact names that are checked for conflicting versions: finding two
+# different versions of any of them in the jar directory aborts start-up.
+DIRECT_SPARK_PACKAGE_ARTIFACTS = (
+    "spark-avro_2.13",
+    "hadoop-aws",
+    "delta-spark_2.13",
+    "delta-storage",
+    "iceberg-spark-runtime-4.0_2.13",
+)
+def _artifact_versions(jar_paths: list[Path], artifact_name: str) -> dict[str, list[str]]:
+    versions: dict[str, list[str]] = {}
+    pattern = re.compile(rf"(?:^|_){re.escape(artifact_name)}-(?P<version>[^/]+)\.jar$")
+    for path in jar_paths:
+        match = pattern.search(path.name)
+        if match is None:
+            continue
+        versions.setdefault(match.group("version"), []).append(path.name)
+    return versions
+def _reject_duplicate_direct_artifacts(jar_paths: list[Path]) -> None:
+    conflicts: list[str] = []
+    for artifact_name in DIRECT_SPARK_PACKAGE_ARTIFACTS:
+        versions = _artifact_versions(jar_paths, artifact_name)
+        if len(versions) < 2:
+            continue
+        version_list = ", ".join(f"{version} ({', '.join(names)})" for version, names in sorted(versions.items()))
+        conflicts.append(f"{artifact_name}: {version_list}")
+    if conflicts:
+        raise SystemExit(
+            "Conflicting Spark package jar versions found in the shared Spark jar directory: "
+            + "; ".join(conflicts)
+            + ". Re-run dq-engine-warmup or clear the spark-jars volume so only the canonical package versions remain."
+        )
+def spark_jar_paths() -> list[Path]:
+    """Return the sorted jar files to hand to Spark from the local jar directory.
+    The directory comes from DQ_SPARK_JAR_DIR (default: DEFAULT_SPARK_JAR_DIR).
+    Jars larger than DQ_SPARK_MAX_JAR_SIZE_MB (default 200) are left out unless
+    DQ_SPARK_INCLUDE_LARGE_JARS is "1", "true" or "yes". The remaining jars are
+    checked for conflicting versions of the direct Spark package artifacts.
+    Raises:
+        SystemExit: if the directory is missing, contains no jars, no jar
+            survives the size filter, or conflicting versions are found.
+    """
+    jar_dir = Path(os.getenv("DQ_SPARK_JAR_DIR") or DEFAULT_SPARK_JAR_DIR)
+    if not jar_dir.is_dir():
+        raise SystemExit(
+            f"Spark jar directory not found: {jar_dir}. The dq-engine image must bake the required Spark jars during the build phase."
+        )
+    # Only regular files directly inside the directory; sub-directories are not searched.
+    all_jars = sorted(path for path in jar_dir.glob("*.jar") if path.is_file())
+    if not all_jars:
+        raise SystemExit(
+            f"No Spark jars were found in {jar_dir}. The dq-engine image must copy the build-time Spark cache into that directory."
+        )
+    max_mb_env = os.getenv("DQ_SPARK_MAX_JAR_SIZE_MB")
+    # An unset, empty or unparsable limit silently falls back to 200 MB.
+    try:
+        max_mb = int(max_mb_env) if max_mb_env else 200
+    except Exception:
+        max_mb = 200
+    include_large = os.getenv("DQ_SPARK_INCLUDE_LARGE_JARS", "").strip().lower() in ("1", "true", "yes")
+    filtered: list[Path] = []
+    excluded: list[tuple[str, float]] = []
+    for p in all_jars:
+        # A jar whose size cannot be read counts as 0 MB and is therefore kept.
+        try:
+            size_mb = p.stat().st_size / (1024 * 1024)
+        except Exception:
+            size_mb = 0.0
+        if size_mb > max_mb and not include_large:
+            excluded.append((p.name, size_mb))
+            continue
+        filtered.append(p)
+    if not filtered:
+        raise SystemExit(
+            f"No Spark jars remain after applying size filter (max {max_mb}MB)."
+            " Set DQ_SPARK_INCLUDE_LARGE_JARS=1 to include large jars or increase DQ_SPARK_MAX_JAR_SIZE_MB."
+        )
+    # Version conflicts are checked only among the jars that will actually be used.
+    _reject_duplicate_direct_artifacts(filtered)
+    if excluded:
+        # Name at most ten excluded jars; "..." marks that the list was cut.
+        names = ", ".join(name for name, _ in excluded[:10])
+        print(
+            f"warning: excluded {len(excluded)} large jar(s) >{max_mb}MB: {names}{'...' if len(excluded)>10 else ''}"
+        )
+    return filtered
+def configure_spark_builder_with_local_jars(builder: Any) -> Any:
+    jar_paths = spark_jar_paths()
+    return builder.config("spark.jars", ",".join(str(path) for path in jar_paths))

dq_made_easy_utils-0.1.0/src/dq_utils/spark_runtime.py ADDED Viewed

@@ -0,0 +1,66 @@
+from __future__ import annotations
+import os
+from typing import Any
+# Spark master used when DQ_SPARK_MASTER is unset or blank.
+DEFAULT_SPARK_MASTER = "local[*]"
+# Spark UI port used when neither an explicit value nor DQ_SPARK_UI_PORT is given.
+DEFAULT_SPARK_UI_PORT = 4044
+# Session time zone constant; not referenced by the functions in this module.
+DEFAULT_SPARK_SESSION_TIMEZONE = "UTC"
+def resolve_spark_master(default: str = DEFAULT_SPARK_MASTER) -> str:
+    return str(os.getenv("DQ_SPARK_MASTER") or default).strip() or default
+def resolve_spark_ui_port(raw_value: str | int | None = None) -> int:
+    if raw_value is None:
+        raw_value = os.getenv("DQ_SPARK_UI_PORT") or str(DEFAULT_SPARK_UI_PORT)
+    normalized = str(raw_value).strip()
+    try:
+        parsed = int(normalized)
+    except Exception as exc:
+        raise ValueError("DQ_SPARK_UI_PORT must be a positive integer") from exc
+    if parsed < 1:
+        raise ValueError("DQ_SPARK_UI_PORT must be a positive integer")
+    return parsed
+def configure_spark_builder(
+    builder: Any,
+    *,
+    spark_ui_port: str | int | None = None,
+    session_timezone: str | None = None,
+) -> Any:
+    configured = builder.config("spark.ui.port", str(resolve_spark_ui_port(spark_ui_port)))
+    if session_timezone:
+        configured = configured.config("spark.sql.session.timeZone", str(session_timezone))
+    # Allow overriding driver/executor memory from environment variables.
+    # Respect DQ-prefixed vars first, then fall back to Spark-standard names.
+    driver_mem = os.getenv("DQ_SPARK_DRIVER_MEMORY") or os.getenv("SPARK_DRIVER_MEMORY")
+    executor_mem = os.getenv("DQ_SPARK_EXECUTOR_MEMORY") or os.getenv("SPARK_EXECUTOR_MEMORY")
+    if driver_mem:
+        configured = configured.config("spark.driver.memory", str(driver_mem))
+    if executor_mem:
+        configured = configured.config("spark.executor.memory", str(executor_mem))
+    return configured
+def build_spark_session_builder(
+    *,
+    SparkSession: Any,
+    app_name: str,
+    master: str | None = None,
+    spark_ui_port: str | int | None = None,
+    session_timezone: str | None = None,
+) -> Any:
+    builder = SparkSession.builder.appName(app_name)
+    if master is not None:
+        builder = builder.master(master)
+    return configure_spark_builder(
+        builder,
+        spark_ui_port=spark_ui_port,
+        session_timezone=session_timezone,
+    )

dq_made_easy_utils-0.1.0/tests/test_auth_utils.py ADDED Viewed

@@ -0,0 +1,143 @@
+import os
+import sys
+import importlib.util
+import types
+import logging
+import pytest
+# Make the local source tree importable. ROOT_DIR is two directories above
+# this file, and the sources are expected under <ROOT_DIR>/dq-utils/src.
+ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
+SRC_DIR = os.path.join(ROOT_DIR, "dq-utils", "src")
+if SRC_DIR not in sys.path:
+    sys.path.insert(0, SRC_DIR)
+# Load auth_utils.py directly from its file path instead of importing the
+# real dq_utils package, to avoid package-level side effects.
+mod_path = os.path.join(SRC_DIR, "dq_utils", "auth_utils.py")
+# Register an empty stand-in "dq_utils" package in sys.modules so dataclasses
+# and module-level references resolve correctly while the module is loading.
+# NOTE(review): this assignment is unconditional, so it replaces any dq_utils
+# module already imported in this process - confirm that is intended.
+pkg = types.ModuleType("dq_utils")
+pkg.__path__ = [os.path.join(SRC_DIR, "dq_utils")]
+sys.modules["dq_utils"] = pkg
+spec = importlib.util.spec_from_file_location("dq_utils.auth_utils", mod_path)
+auth_utils = importlib.util.module_from_spec(spec)
+# Register the module under its intended name before executing it, so
+# decorators (dataclasses) can resolve module references during class creation.
+sys.modules[spec.name] = auth_utils
+assert spec.loader is not None
+spec.loader.exec_module(auth_utils)
+def test_static_token_provider_accepts_and_returns_token():
+    with pytest.raises(auth_utils.AuthConfigError):
+        auth_utils.StaticTokenProvider("")
+    p = auth_utils.StaticTokenProvider(" secret ")
+    assert p.get_token(correlation_id="cid") == "secret"
+def test_resolve_oidc_token_url_behaviour():
+    assert (
+        auth_utils.resolve_oidc_token_url(issuer="https://issuer", token_url=None)
+        == "https://issuer/protocol/openid-connect/token"
+    )
+    assert (
+        auth_utils.resolve_oidc_token_url(issuer=None, token_url="https://t")
+        == "https://t"
+    )
+    assert auth_utils.resolve_oidc_token_url(issuer=None, token_url=None) is None
+class DummyResponse:
+    def __init__(self, status_code=200, payload=None, json_raises=False):
+        self.status_code = status_code
+        self._payload = payload or {}
+        self._json_raises = json_raises
+    def json(self):
+        if self._json_raises:
+            raise ValueError("not json")
+        return self._payload
+def test_oidc_client_credentials_get_token_success_and_errors(monkeypatch):
+    calls = {}
+    def fake_post_success(url, data=None, headers=None, timeout=None):
+        calls['last'] = dict(url=url, data=data, headers=headers, timeout=timeout)
+        return DummyResponse(status_code=200, payload={"access_token": "abc", "expires_in": 3600})
+    provider = auth_utils.OidcClientCredentialsTokenProvider(
+        token_url="https://tok",
+        client_id="cid",
+        client_secret="cs",
+        scope=None,
+        refresh_skew_seconds=60,
+        timeout_seconds=1,
+    )
+    # network success
+    monkeypatch.setattr(auth_utils.requests, "post", fake_post_success)
+    token = provider.get_token(correlation_id="cid")
+    assert token == "abc"
+    # Clear cache so subsequent calls actually invoke the token endpoint
+    provider._cached = None
+    # response with error status
+    def fake_post_400(*a, **k):
+        return DummyResponse(status_code=400, payload={})
+    monkeypatch.setattr(auth_utils.requests, "post", fake_post_400)
+    with pytest.raises(auth_utils.AuthConfigError):
+        provider.get_token(correlation_id="cid")
+    # response with non-json
+    # Clear cache again for next scenario
+    provider._cached = None
+    def fake_post_nonjson(*a, **k):
+        return DummyResponse(status_code=200, json_raises=True)
+    monkeypatch.setattr(auth_utils.requests, "post", fake_post_nonjson)
+    with pytest.raises(auth_utils.AuthConfigError):
+        provider.get_token(correlation_id="cid")
+    # network exception
+    def fake_post_exc(*a, **k):
+        raise RuntimeError("boom")
+    monkeypatch.setattr(auth_utils.requests, "post", fake_post_exc)
+    with pytest.raises(auth_utils.AuthConfigError):
+        provider.get_token(correlation_id="cid")
+def test_build_token_provider_from_env_prefers_static(monkeypatch):
+    monkeypatch.setenv("MY_STATIC_TOKEN", "s1")
+    p = auth_utils.build_token_provider_from_env(
+        static_token_env_var="MY_STATIC_TOKEN",
+        issuer_env_var="ISS",
+        token_url_env_var="T",
+        client_id_env_var="CID",
+        client_secret_env_var="CS",
+        scope_env_var="S",
+    )
+    assert isinstance(p, auth_utils.StaticTokenProvider)
+    assert p.get_token(correlation_id="c") == "s1"
+def test_build_oidc_token_provider_from_env_raises_when_missing(monkeypatch):
+    monkeypatch.delenv("ISS", raising=False)
+    monkeypatch.delenv("T", raising=False)
+    monkeypatch.delenv("CID", raising=False)
+    monkeypatch.delenv("CS", raising=False)
+    with pytest.raises(auth_utils.AuthConfigError):
+        auth_utils.build_oidc_token_provider_from_env(
+            issuer_env_var="ISS",
+            token_url_env_var="T",
+            client_id_env_var="CID",
+            client_secret_env_var="CS",
+            scope_env_var="S",
+        )

dq_made_easy_utils-0.1.0/tests/test_logging_utils.py ADDED Viewed

@@ -0,0 +1,83 @@
+import os
+import sys
+import json
+import logging
+# Ensure local package source is importable when running this test file directly.
+# ROOT_DIR is two directories above this file; the sources are expected under
+# <ROOT_DIR>/dq-utils/src.
+ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
+SRC_DIR = os.path.join(ROOT_DIR, "dq-utils", "src")
+if SRC_DIR not in sys.path:
+    sys.path.insert(0, SRC_DIR)
+import importlib.util
+# Load the module directly from its source file path to avoid importing
+# dq_utils.__init__ (which pulls heavy optional deps during import).
+# The module is loaded under the standalone name "dq_utils_logging_utils" and
+# is not registered in sys.modules here.
+mod_path = os.path.join(SRC_DIR, "dq_utils", "logging_utils.py")
+spec = importlib.util.spec_from_file_location("dq_utils_logging_utils", mod_path)
+logging_utils = importlib.util.module_from_spec(spec)
+assert spec.loader is not None
+spec.loader.exec_module(logging_utils)
+# Short aliases for the objects exercised by the tests below.
+_JsonFormatter = logging_utils._JsonFormatter
+configure_logging = logging_utils.configure_logging
+log_event = logging_utils.log_event
+def test_json_formatter_includes_custom_fields():
+    fmt = _JsonFormatter()
+    record = logging.LogRecord(
+        name="mylogger",
+        level=logging.INFO,
+        pathname=__file__,
+        lineno=10,
+        msg="hello",
+        args=(),
+        exc_info=None,
+    )
+    # add a non-standard attribute which should be included in the JSON
+    record.__dict__["custom_key"] = "custom_value"
+    payload = fmt.format(record)
+    data = json.loads(payload)
+    assert data["logger"] == "mylogger"
+    assert data["msg"] == "hello"
+    assert data["custom_key"] == "custom_value"
+    assert "ts" in data and "level" in data
+def test_configure_logging_sets_handler_and_level():
+    # configure logging and assert root logger has a StreamHandler and correct level
+    configure_logging("WARNING")
+    root = logging.getLogger()
+    assert any(isinstance(h, logging.StreamHandler) for h in root.handlers)
+    assert root.level == logging.WARNING
+def test_log_event_safe_extra_and_reserved_prefix():
+    captured = []
+    class ListHandler(logging.Handler):
+        def emit(self, rec: logging.LogRecord) -> None:  # type: ignore[override]
+            # store a shallow copy of the record dict so assertions can inspect it
+            captured.append(rec.__dict__.copy())
+    logger = logging.getLogger("test_logger_for_log_event")
+    # ensure a clean handler set for the logger used in this test
+    logger.handlers.clear()
+    handler = ListHandler()
+    logger.addHandler(handler)
+    logger.setLevel(logging.DEBUG)
+    # Call log_event with a reserved key ('message') and a normal key ('user')
+    log_event(logger, "evt", level="info", message="danger", user="alice")
+    assert captured, "expected a log record to be emitted"
+    rec = captured[-1]
+    # reserved key should be prefixed to avoid overwriting LogRecord internals
+    assert rec.get("ctx_message") == "danger"
+    assert rec.get("user") == "alice"
+    assert rec.get("msg") == "evt"