PyPI - idi-ftm2j-shared - Versions diffs - 0.1.1__py3-none-any.whl - Mend

idi-ftm2j-shared 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

idi_ftm2j_shared/__init__.py +5 -0
idi_ftm2j_shared/api.py +266 -0
idi_ftm2j_shared/failures.py +168 -0
idi_ftm2j_shared/logs.py +171 -0
idi_ftm2j_shared/storage.py +177 -0
idi_ftm2j_shared-0.1.1.dist-info/METADATA +236 -0
idi_ftm2j_shared-0.1.1.dist-info/RECORD +9 -0
idi_ftm2j_shared-0.1.1.dist-info/WHEEL +4 -0
idi_ftm2j_shared-0.1.1.dist-info/licenses/LICENSE +28 -0

idi_ftm2j_shared/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""Shared runtime utilities for FTM2J pipeline processors."""
+from importlib.metadata import version
+# Read the version from the installed distribution's metadata. importlib.metadata.version
+# raises PackageNotFoundError when the "idi-ftm2j-shared" distribution is not installed.
+__version__ = version("idi-ftm2j-shared")

idi_ftm2j_shared/api.py ADDED Viewed

@@ -0,0 +1,266 @@
+"""Provides API utilities for use across the application."""
+# Standard library imports
+import contextlib
+import logging
+import threading
+import time
+from abc import ABC, abstractmethod
+from functools import cached_property
+from typing import Any, Literal
+# Third party imports
+import requests
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+# Application imports
+from idi_ftm2j_shared.logs import get_logger
+class ApiClient(ABC):
+    """Base class for API clients.
+    Wraps a lazily created ``requests.Session`` configured with automatic retries,
+    offers optional thread-safe rate limiting, and provides a helper that reports
+    request failures through an ``error`` key in the returned dict instead of raising.
+    """
+    DEFAULT_MAX_RETRIES: int = 3
+    REQUEST_TIMEOUT: tuple[int, int] = (10, 30)  # Passed to requests as (connect, read) seconds
+    # urllib3 sleeps about ``factor * 2 ** (retry - 1)`` seconds between attempts, i.e.
+    # 2, 4, 8 seconds for a factor of 2 (urllib3 1.x skips the sleep before the first retry).
+    RETRY_BACKOFF_FACTOR: int = 2
+    RETRY_STATUS_FORCELIST: list[int] = [429, 500, 502, 503, 504]
+    def __init__(
+        self,
+        api_key: str = "",
+        max_retries: int = DEFAULT_MAX_RETRIES,
+        rate_limit: float | None = None,
+    ) -> None:
+        """Initialize the ApiClient.
+        Args:
+            api_key: The API key.
+            max_retries: The maximum number of retries. ``None`` falls back to
+                ``DEFAULT_MAX_RETRIES``.
+            rate_limit: Minimum seconds between requests. None disables rate limiting.
+        """
+        self.api_key: str = api_key
+        self.max_retries: int = max_retries if max_retries is not None else self.DEFAULT_MAX_RETRIES
+        self.logger: logging.Logger = get_logger(type(self).__name__)
+        self._rate_limit = rate_limit
+        # Use the monotonic clock: unlike time.time() it never jumps when the system
+        # clock is adjusted, so the computed spacing cannot become negative or huge.
+        self._last_request = time.monotonic()
+        self._lock: threading.Lock | contextlib.AbstractContextManager = (
+            threading.Lock() if rate_limit is not None else contextlib.nullcontext()
+        )
+    def rate_limit(self) -> None:
+        """Enforce rate limit between requests.
+        No-op when rate_limit was not set at construction time.
+        Thread-safe: serializes callers when rate_limit is configured. The sleep happens
+        while the lock is held, so waiting callers are released one interval apart.
+        """
+        if self._rate_limit is None:
+            return
+        with self._lock:
+            elapsed = time.monotonic() - self._last_request
+            if elapsed < self._rate_limit:
+                time.sleep(self._rate_limit - elapsed)
+            self._last_request = time.monotonic()
+    @cached_property
+    def session(self) -> requests.Session:
+        """Create a requests Session with retry strategy.
+        The session is built on first access and cached on the instance.
+        Returns:
+            Configured requests.Session with retry logic
+        """
+        session = requests.Session()
+        # Configure retry strategy
+        # NOTE(review): POST is in allowed_methods, so a non-idempotent request may be
+        # re-sent after a retryable status; confirm this is intended for all subclasses.
+        retry_strategy = Retry(
+            total=self.max_retries,
+            backoff_factor=self.RETRY_BACKOFF_FACTOR,  # Exponential back-off, see class constant
+            status_forcelist=self.RETRY_STATUS_FORCELIST,
+            allowed_methods=["GET", "POST"],
+            respect_retry_after_header=True,
+        )
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        session.mount("http://", adapter)
+        session.mount("https://", adapter)
+        return session
+    def get(
+        self, url: str, params: dict | None = None, headers: dict | None = None, **kwargs: object
+    ) -> requests.Response:
+        """Get a resource from the API.
+        Args:
+            url: The URL to get from.
+            params: The parameters to pass to the API.
+            headers: The headers to pass to the API.
+            kwargs: Additional keyword arguments forwarded to ``Session.get``. ``timeout``
+                defaults to ``REQUEST_TIMEOUT`` when not supplied.
+        Returns:
+            The response from the API.
+        Raises:
+            requests.exceptions.HTTPError: If the response status is 4xx or 5xx.
+        """
+        kwargs.setdefault("timeout", self.REQUEST_TIMEOUT)
+        response = self.session.get(url, params=params, headers=headers, **kwargs)
+        response.raise_for_status()
+        return response
+    def post(
+        self,
+        url: str,
+        data: str | dict | None = None,
+        headers: dict | None = None,
+        **kwargs: object,
+    ) -> requests.Response:
+        """Post a resource to the API.
+        Args:
+            url: The URL to post to.
+            data: The data to post to the API.
+            headers: The headers to post to the API.
+            kwargs: Additional keyword arguments forwarded to ``Session.post``. ``timeout``
+                defaults to ``REQUEST_TIMEOUT`` when not supplied.
+        Returns:
+            The response from the API.
+        Raises:
+            requests.exceptions.HTTPError: If the response status is 4xx or 5xx.
+        """
+        kwargs.setdefault("timeout", self.REQUEST_TIMEOUT)
+        response = self.session.post(url, headers=headers, data=data, **kwargs)
+        response.raise_for_status()
+        return response
+    def _query_with_error_handling(
+        self,
+        url: str,
+        data: str | dict | None = None,
+        params: dict | None = None,
+        headers: dict | None = None,
+        method: Literal["get", "post"] = "get",
+        return_json: bool = True,
+        return_bytes: bool = False,
+    ) -> dict[str, Any]:
+        """Query an endpoint with error handling, capturing errors in the return value.
+        On success the returned dict contains ``status_code``, ``url``, and ``data`` keys.
+        On failure an ``error`` key is added; HTTP errors also include ``status_code``,
+        timeouts also include ``timeout: True``, and a response whose body cannot be
+        parsed as JSON keeps ``status_code`` and ``url`` but has no ``data``.
+        Request and parsing errors are never re-raised — callers should check for the
+        ``error`` key.
+        Args:
+            url: The URL to query.
+            data: The data to post to the API. Only used when ``method`` is ``"post"``.
+            params: Query-string parameters to pass to the API. Only used when ``method``
+                is ``"get"``.
+            headers: HTTP headers to pass to the API.
+            method: HTTP verb to use — ``"get"`` or ``"post"``.
+            return_json: If True, parse the response body as JSON; otherwise return raw text.
+            return_bytes: If True, return raw response bytes (overrides ``return_json``).
+        Returns:
+            Dict with ``status_code``, ``url``, and ``data`` keys on success.
+            On error, ``error`` is added and ``data`` is absent.
+        """
+        response: requests.Response | None = None
+        error: str | None = None
+        error_exc: Exception | None = None
+        response_data: dict[str, Any] = {}
+        try:
+            response = (
+                self.get(url=url, params=params, headers=headers)
+                if method == "get"
+                else self.post(url=url, data=data, headers=headers)
+            )
+        except requests.exceptions.Timeout as e:
+            error = f"Timeout querying {url}: {e}"
+            error_exc = e
+            self.logger.error(error)
+            response_data["timeout"] = True
+        except requests.exceptions.RequestException as e:
+            error = f"Error querying {url}: {e}"
+            error_exc = e
+            self.logger.error(error)
+        if isinstance(error_exc, requests.exceptions.HTTPError) and error_exc.response is not None:
+            response_data["status_code"] = error_exc.response.status_code
+        if response is not None:
+            # Record the response metadata first so it survives a body-parsing failure.
+            response_data["status_code"] = response.status_code
+            response_data["url"] = response.url
+            try:
+                if return_bytes:
+                    response_data["data"] = response.content
+                elif return_json:
+                    response_data["data"] = response.json()
+                else:
+                    response_data["data"] = response.text
+            except ValueError as e:
+                # Surface the failure through ``error`` so callers that follow the
+                # documented contract can tell an unparseable body from a success.
+                error = f"Error parsing JSON response from {url}: {e}"
+                self.logger.error(f"Error parsing JSON response from {url}: {response.text}")
+        if error is not None:
+            response_data["error"] = error
+        return response_data
+    @abstractmethod
+    def query_endpoint(self, **kwargs: object) -> dict[str, Any]:
+        """Query the API endpoint specific to this client.
+        Subclasses define the exact positional/keyword parameters relevant to their
+        endpoint. The return dict follows the ``_query_with_error_handling`` contract:
+        ``status_code``, ``url``, and ``data`` on success; ``error`` on failure.
+        Args:
+            **kwargs: Endpoint-specific arguments defined by each subclass.
+        Returns:
+            Dict with ``status_code``, ``url``, and ``data`` on success, plus ``error``
+            on failure.
+        """
+        ...
+class SecClient(ApiClient):
+    """Rate-limited client for retrieving documents from the SEC EDGAR archive."""
+    SEC_URL = "https://www.sec.gov/Archives/edgar/data"
+    def __init__(self, rate_limit: float = 0.2, user_agent: str = "") -> None:
+        """Set up the SEC client.
+        Args:
+            rate_limit: Minimum number of seconds between consecutive requests.
+            user_agent: Value sent in the ``User-Agent`` header on every request.
+        """
+        super().__init__(rate_limit=rate_limit)
+        self._sec_headers = {"User-Agent": user_agent}
+    @property
+    def sec_headers(self) -> dict:
+        """Headers sent with every SEC request (the ``User-Agent`` entry)."""
+        return self._sec_headers
+    def query_endpoint(
+        self, sec_url: str, return_json: bool = True, return_bytes: bool = False
+    ) -> dict[str, Any]:
+        """Fetch a SEC EDGAR URL via GET, honouring the configured rate limit.
+        Args:
+            sec_url: Full SEC EDGAR URL to query.
+            return_json: If True, parse response as JSON; otherwise return raw text.
+            return_bytes: If True, return raw response bytes (overrides ``return_json``).
+        Returns:
+            The dict produced by ``_query_with_error_handling``: ``status_code``, ``url``
+            and ``data`` on success, plus ``error`` on failure.
+        """
+        # Throttle before issuing the request.
+        self.rate_limit()
+        request_args: dict[str, Any] = {
+            "url": sec_url,
+            "headers": self._sec_headers,
+            "method": "get",
+            "return_json": return_json,
+            "return_bytes": return_bytes,
+        }
+        return self._query_with_error_handling(**request_args)

idi_ftm2j_shared/failures.py ADDED Viewed

@@ -0,0 +1,168 @@
+"""Generic failure classification and registry for pipeline processors."""
+# Standard library imports
+import json
+import pathlib
+import threading
+from abc import ABC, abstractmethod
+from enum import StrEnum
+# Application imports
+from idi_ftm2j_shared.storage import load_json, save_json
+# Minimum number of elements a persisted entry needs for FailureRegistry.load to accept it.
+_MIN_ENTRY_LEN = 2
+class FailureClassifier(ABC):
+    """Abstract strategy for classifying failures and deciding whether to retry them.
+    Concrete subclasses provide the domain-specific failure types and rules.
+    """
+    @property
+    @abstractmethod
+    def do_not_retry(self) -> frozenset[StrEnum]:
+        """Failure types that are permanent and must not be retried."""
+        ...
+    def is_retryable(self, failure_type: StrEnum) -> bool:
+        """Tell whether a failure of the given type is worth retrying.
+        Args:
+            failure_type: The type of failure.
+        Returns:
+            True unless ``failure_type`` is a member of ``do_not_retry``.
+        """
+        permanent_types = self.do_not_retry
+        return failure_type not in permanent_types
+    @abstractmethod
+    def classify_from_response(self, response: dict, **kwargs: object) -> StrEnum:
+        """Derive the failure type from an API response.
+        Args:
+            response: API response dict with status_code and optional error.
+            **kwargs: Additional keyword arguments for subclass implementations.
+        Returns:
+            The classified failure type.
+        """
+        ...
+class FailureRegistry:
+    """Persists permanent failures to avoid retrying entities that will always fail.
+    New failures are buffered in memory and written to ``file_path`` once
+    ``flush_every`` of them have accumulated; call ``flush()`` to persist the rest.
+    ``add``, ``flush``, ``save`` and the final state swap in ``load`` share one
+    re-entrant lock.
+    """
+    def __init__(
+        self, file_path: str, classifier: FailureClassifier, flush_every: int = 10
+    ) -> None:
+        """Initialize the FailureRegistry and load any previously persisted entries.
+        Args:
+            file_path: Path to the JSON file for persistence. Empty disables persistence.
+            classifier: Domain-specific failure classifier.
+            flush_every: Number of new failures to buffer before writing to disk.
+        """
+        self.file_path = file_path
+        self._classifier = classifier
+        self._flush_every = flush_every
+        self._pending = 0
+        self._entries: set[tuple[str, str]] = set()
+        self._reasons: dict[tuple[str, str], str] = {}
+        # Re-entrant because flush() calls save() while already holding the lock.
+        self._lock = threading.RLock()
+        self.load()
+    def load(self) -> None:
+        """Load persisted failure entries from disk into memory.
+        The registry is reset to empty when ``file_path`` is empty, when a local file
+        does not exist, when the file holds invalid JSON (``json.JSONDecodeError`` is
+        caught so the pipeline can continue), or when the document is not a JSON object.
+        Malformed entries — anything other than a list of at least ``_MIN_ENTRY_LEN``
+        strings — are skipped instead of aborting the load.
+        Returns:
+            None
+        Raises:
+            botocore.exceptions.ClientError: If ``load_json`` propagates an S3 error
+                while reading the persistence file.
+        """
+        entries: set[tuple[str, str]] = set()
+        reasons: dict[tuple[str, str], str] = {}
+        data = None
+        # S3 paths cannot be checked with pathlib; load_json handles missing keys itself.
+        is_remote = bool(self.file_path) and self.file_path.startswith("s3://")
+        if self.file_path and (is_remote or pathlib.Path(self.file_path).exists()):
+            try:
+                data = load_json(self.file_path, return_type="dict")
+            except json.JSONDecodeError:
+                data = None
+        if isinstance(data, dict):
+            entries_data = data.get("entries", [])
+            reasons_data = data.get("reasons", {})
+            if not isinstance(entries_data, list):
+                entries_data = []
+            if not isinstance(reasons_data, dict):
+                reasons_data = {}
+            for raw in entries_data:
+                if not isinstance(raw, (list, tuple)) or len(raw) < _MIN_ENTRY_LEN:
+                    continue
+                # Reason keys are space-joined entries, so every element must be a string.
+                if not all(isinstance(part, str) for part in raw):
+                    continue
+                entry = tuple(raw)
+                entries.add(entry)
+                reason_key = " ".join(entry)
+                if reason_key in reasons_data:
+                    reasons[entry] = reasons_data[reason_key]
+        # Swap the state in one step so readers never observe a half-loaded registry.
+        with self._lock:
+            self._entries = entries
+            self._reasons = reasons
+    def save(self) -> None:
+        """Persist current failure entries and reasons to the configured file path.
+        Writes entries as a JSON object with ``entries`` (list of lists) and
+        ``reasons`` (dict keyed by space-joined entry tuples) keys. If
+        ``file_path`` is empty the call is a no-op.
+        Returns:
+            None
+        """
+        if not self.file_path:
+            return
+        # Hold the lock so a concurrent add() cannot resize the set while it is being
+        # iterated and so overlapping writers cannot interleave.
+        with self._lock:
+            entries_list = [list(e) for e in self._entries]
+            reasons_dict = {" ".join(e): self._reasons.get(e, "") for e in self._entries}
+            save_json(self.file_path, {"entries": entries_list, "reasons": reasons_dict})
+    def add(self, key: tuple[str, str], failure_type: StrEnum) -> None:
+        """Add a permanent failure entry.
+        Retryable failure types (per the classifier) and keys that are already
+        registered are ignored. Every ``flush_every`` new entries trigger a flush.
+        Args:
+            key: Tuple of identifier and associated file or relevant metadata.
+            failure_type: The classified failure type.
+        """
+        if self._classifier.is_retryable(failure_type):
+            return
+        with self._lock:
+            if key in self._entries:
+                return
+            self._entries.add(key)
+            self._reasons[key] = str(failure_type)
+            self._pending += 1
+            if self._pending >= self._flush_every:
+                self.flush()
+    def flush(self) -> None:
+        """Write all buffered failures to disk and reset the pending counter."""
+        with self._lock:
+            self.save()
+            self._pending = 0
+    def __contains__(self, key: tuple[str, str]) -> bool:
+        """Set-like membership check.
+        Args:
+            key: Tuple of identifier and associated file or relevant metadata.
+        Returns:
+            True if ``key`` is registered as a permanent failure.
+        """
+        return key in self._entries

idi_ftm2j_shared/logs.py ADDED Viewed

@@ -0,0 +1,171 @@
+"""Provides loggers for use across the application."""
+# Standard library imports
+import datetime
+import logging
+import os
+# Third party imports
+import boto3
+import requests
+import tqdm
+import watchtower
+# Names of loggers that get_logger has already configured, so handlers are added only once.
+_configured_loggers: set[str] = set()
+# Timestamp taken once at import time (naive local time); part of the CloudWatch stream name.
+_EXECUTION_ID = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
+# EC2 instance metadata service endpoints used by _get_instance_id.
+EC2_METADATA_BASE = "http://169.254.169.254"
+EC2_METADATA_TOKEN_URL = f"{EC2_METADATA_BASE}/latest/api/token"
+EC2_METADATA_INSTANCE_ID_URL = f"{EC2_METADATA_BASE}/latest/meta-data/instance-id"
+# Passed to watchtower as log_group_retention_days.
+LOG_RETENTION_DAYS = (
+    30  # Possible values are: 1, 3, 5, 7, 14, 30, 60, 90, 120, 150, 180, 365, 400, ...
+)
+def _get_instance_id() -> str:
+    """Resolve an identifier for the host running this process.
+    Resolution order: a non-empty ``INSTANCE_ID`` environment variable, then the EC2
+    instance metadata service (IMDSv2), then the first dot-separated label of the
+    ``HOSTNAME`` environment variable (``"unknown"`` when it is unset).
+    """
+    # An explicit override wins (e.g. in Docker, where the metadata service may be unreachable).
+    explicit_id = os.environ.get("INSTANCE_ID")
+    if explicit_id:
+        return explicit_id
+    try:
+        # IMDSv2 needs a session token before any metadata can be read.
+        token_response = requests.put(
+            EC2_METADATA_TOKEN_URL,
+            headers={"X-aws-ec2-metadata-token-ttl-seconds": "60"},
+            timeout=1,
+        )
+        token_response.raise_for_status()
+        metadata_headers = {"X-aws-ec2-metadata-token": token_response.text.strip()}
+        id_response = requests.get(
+            EC2_METADATA_INSTANCE_ID_URL,
+            headers=metadata_headers,
+            timeout=1,
+        )
+        id_response.raise_for_status()
+        return id_response.text.strip()
+    except Exception:
+        # Best effort: any failure falls back to the short host name.
+        fallback_host = os.environ.get("HOSTNAME", "unknown")
+        short_name, _, _ = fallback_host.partition(".")
+        return short_name
+class TqdmLoggingHandler(logging.Handler):
+    """Logging handler that writes via tqdm.write() to avoid disrupting progress bars."""
+    def emit(self, record: logging.LogRecord) -> None:
+        """Format a log record and write it with ``tqdm.tqdm.write``.
+        Exceptions raised while formatting or writing are not propagated by this
+        method; they are handed to ``self.handleError(record)``.
+        Args:
+            record: The log record to emit.
+        """
+        try:
+            tqdm.tqdm.write(self.format(record))
+        except Exception:
+            # Delegate to the logging framework's standard error hook.
+            self.handleError(record)
+def get_logger(
+    name: str, level: int = logging.INFO, log_group_name: str = "", log_stream_prefix: str = ""
+) -> logging.Logger:
+    """Creates a logger with the given name and level.
+    Attaches a stream handler that prints logs in a standard format to the console.
+    CloudWatch logging is delegated to ``_configure_cloudwatch``, which needs both
+    ``log_group_name`` and ``log_stream_prefix`` to be provided.
+    A logger is configured only once per name: later calls return the existing logger
+    unchanged, so arguments passed on those calls are ignored.
+    Args:
+        name: The logger name.
+        level: The initial level. Defaults to 20 ("INFO"). Overridden by the
+            ``LOG_LEVEL`` environment variable when it names a valid logging level.
+        log_group_name: The name of the log group. Defaults to empty string.
+        log_stream_prefix: The prefix of the log stream. Defaults to empty string.
+    Returns:
+        The logger
+    """
+    # Check if logger has already been configured
+    if name in _configured_loggers:
+        return logging.getLogger(name)
+    # Accept only real level names. ``hasattr(logging, name)`` would also match unrelated
+    # module attributes (e.g. ``BASIC_FORMAT``) and make ``setLevel`` raise.
+    env_level = os.environ.get("LOG_LEVEL", "").strip().upper()
+    level = logging.getLevelNamesMapping().get(env_level, level)
+    # Create logger and set level
+    logger = logging.getLogger(name)
+    logger.setLevel(level)
+    logger.propagate = False  # Prevent log messages from being propagated to the root logger
+    # Create console handler and set level
+    ch = TqdmLoggingHandler()
+    ch.setLevel(level)
+    # Create formatter and add to handler
+    log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+    formatter = logging.Formatter(log_format)
+    ch.setFormatter(formatter)
+    # Add handler to logger
+    logger.addHandler(ch)
+    # Attach the CloudWatch handler when it is enabled and fully configured
+    _configure_cloudwatch(logger, name, log_group_name, log_stream_prefix)
+    # Add logger to set of configured loggers
+    _configured_loggers.add(name)
+    return logger
+def _configure_cloudwatch(
+    logger: logging.Logger, name: str, log_group_name: str, log_stream_prefix: str
+) -> None:
+    """Attach a CloudWatch handler to the logger when CloudWatch logging is requested.
+    CloudWatch is enabled only when all of the following hold:
+    - ``CLOUDWATCH_LOGS_ENABLED`` is ``true``, ``1`` or ``yes`` (case-insensitive), and
+    - both ``log_group_name`` and ``log_stream_prefix`` are non-empty.
+    Otherwise the call is a no-op. The EC2 metadata endpoint is consulted only to
+    derive the instance ID used in the log stream name; reaching it does not by itself
+    enable CloudWatch.
+    Args:
+        logger: The logger to configure.
+        name: The name of the logger.
+        log_group_name: The name of the log group.
+        log_stream_prefix: The prefix of the log stream.
+    """
+    # Enable only when explicitly requested (e.g. Docker on EC2) and fully configured
+    env_enabled = os.environ.get("CLOUDWATCH_LOGS_ENABLED", "").lower() in ("true", "1", "yes")
+    if not (env_enabled and log_group_name and log_stream_prefix):
+        return
+    instance_id = _get_instance_id()
+    # One stream per prefix, host and process start time.
+    log_stream_name = f"{log_stream_prefix}/{instance_id}/{_EXECUTION_ID}"
+    if "AWS_REGION" in os.environ:
+        logs_client = boto3.client("logs", region_name=os.environ["AWS_REGION"])
+    else:
+        logs_client = boto3.client("logs")
+    handler = watchtower.CloudWatchLogHandler(
+        log_group_name=log_group_name,
+        log_stream_name=log_stream_name,
+        use_queues=False,
+        boto3_client=logs_client,
+        log_group_retention_days=LOG_RETENTION_DAYS,
+    )
+    # No %(asctime)s here, unlike the console format.
+    log_format = "%(name)s - %(levelname)s - %(message)s"
+    handler.setFormatter(logging.Formatter(log_format))
+    logger.addHandler(handler)
+    logger.info(
+        "CloudWatch logging enabled: name=%s, log_group=%s log_stream=%s",
+        name,
+        log_group_name,
+        log_stream_name,
+    )

idi_ftm2j_shared/storage.py ADDED Viewed

@@ -0,0 +1,177 @@
+"""Provides storage utilities for use across the application."""
+# Standard library imports
+import json
+import tempfile
+import zipfile
+from collections.abc import Iterator
+from contextlib import contextmanager
+# Third party imports
+import smart_open
+from botocore.exceptions import ClientError
+def _empty_for_return_type(return_type: str) -> dict | list:
+    """Build the empty container named by ``return_type``.
+    Args:
+        return_type: Either ``"dict"`` or ``"list"``.
+    Returns:
+        A new empty ``dict`` or ``list``.
+    Raises:
+        ValueError: If ``return_type`` is neither ``"dict"`` nor ``"list"``.
+    """
+    match return_type:
+        case "dict":
+            return {}
+        case "list":
+            return []
+        case _:
+            raise ValueError(f"Invalid return type: {return_type}")
+def load_json(file_path: str, return_type: str = "dict") -> dict | list:
+    """Read and parse a JSON document from a path that ``smart_open`` can open.
+    Any ``OSError`` raised while opening or reading the file (a missing local file
+    included) and an S3 ``NoSuchKey`` error are treated as "file absent": the empty
+    container selected by ``return_type`` is returned instead of raising.
+    Args:
+        file_path: Local filesystem path or ``s3://bucket/key`` URL of the JSON file.
+        return_type: ``"dict"`` or ``"list"``; selects the empty value returned when
+            the file is absent. The parsed document itself is returned as-is.
+    Returns:
+        The parsed JSON content, or an empty ``dict``/``list`` when the file is absent.
+    Raises:
+        ValueError: If the file is absent and ``return_type`` is not ``"dict"`` or ``"list"``.
+        botocore.exceptions.ClientError: For a ``ClientError`` whose code is not ``NoSuchKey``.
+        json.JSONDecodeError: If the file exists but contains invalid JSON.
+    """
+    try:
+        with smart_open.open(file_path) as handle:
+            parsed = json.load(handle)
+    except OSError:
+        # FileNotFoundError is a subclass of OSError, so missing files land here too.
+        return _empty_for_return_type(return_type)
+    except ClientError as exc:
+        error_code = exc.response.get("Error", {}).get("Code")
+        if error_code != "NoSuchKey":
+            raise
+        return _empty_for_return_type(return_type)
+    return parsed
+def save_json(file_path: str, data: dict | list, mode: str = "w") -> None:
+    """Save a JSON file to the given path.
+    Efficient writing: https://github.com/piskvorky/smart_open/blob/develop/howto.md#how-to-write-to-s3-efficiently
+    Can write in append mode for local files, S3 files are always overwritten.
+    Note that appending to an existing JSON document yields several concatenated
+    documents, which ``json.load`` cannot read back as one.
+    Args:
+        file_path: The path to the JSON file.
+        data: The JSON data to save to the file as a dictionary or list.
+        mode: File open mode ("w" to overwrite, "a" to append). S3 paths always overwrite.
+    """
+    # Match on the scheme prefix (as FailureRegistry.load does) so that a local path
+    # which merely contains "s3://" is not treated as an S3 URL.
+    if file_path.startswith("s3://"):
+        # Hand smart_open a temporary file as its write buffer (see the how-to above).
+        with tempfile.NamedTemporaryFile() as tmp:
+            tp = {"writebuffer": tmp}
+            with smart_open.open(file_path, "w", transport_params=tp) as fout:
+                json.dump(data, fout, indent=2)
+    else:
+        with smart_open.open(file_path, mode) as fout:
+            json.dump(data, fout, indent=2)
+def key_exists(file_path: str) -> bool:
+    """Check whether a file can be opened and read at the given path.
+    Supports local filesystem paths and ``s3://`` URLs. The check opens the file in
+    binary mode and reads a single byte.
+    Args:
+        file_path: Local path or ``s3://`` URL to check.
+    Returns:
+        True if the file could be opened and read; False when an ``OSError`` occurs or
+        S3 reports ``NoSuchKey`` / ``404``.
+    Raises:
+        botocore.exceptions.ClientError: For a ``ClientError`` with any other error code.
+    """
+    try:
+        with smart_open.open(file_path, "rb") as handle:
+            handle.read(1)
+    except OSError:
+        # FileNotFoundError is a subclass of OSError, so missing files land here too.
+        return False
+    except ClientError as exc:
+        error_code = exc.response.get("Error", {}).get("Code")
+        if error_code not in ("NoSuchKey", "404"):
+            raise
+        return False
+    return True
+def load_content(file_path: str) -> str:
+    """Read the full text of a file from a local path or S3 URL.
+    Any ``OSError`` raised while opening or reading (a missing local file included)
+    and an S3 ``NoSuchKey`` error yield an empty string instead of raising.
+    Args:
+        file_path: Local filesystem path or ``s3://`` URL of the text file.
+    Returns:
+        File contents as a string, or ``""`` when the file is treated as absent.
+    Raises:
+        botocore.exceptions.ClientError: For a ``ClientError`` whose code is not ``NoSuchKey``.
+    """
+    try:
+        with smart_open.open(file_path) as handle:
+            text = handle.read()
+    except OSError:
+        # FileNotFoundError is a subclass of OSError, so missing files land here too.
+        return ""
+    except ClientError as exc:
+        error_code = exc.response.get("Error", {}).get("Code")
+        if error_code != "NoSuchKey":
+            raise
+        return ""
+    return text
+def save_content(file_path: str, content: str) -> None:
+    """Save text content to a local path or S3 URL.
+    Existing content at ``file_path`` is overwritten.
+    Args:
+        file_path: Local filesystem path or ``s3://`` URL to write to.
+        content: Text content to write.
+    Raises:
+        ValueError: If writing raises ``ValueError``; re-raised with the target path
+            added to the message and the original exception chained as the cause.
+    """
+    try:
+        # Match on the scheme prefix (as FailureRegistry.load does) so that a local path
+        # which merely contains "s3://" is not treated as an S3 URL.
+        if file_path.startswith("s3://"):
+            # Hand smart_open a temporary file as its write buffer.
+            with tempfile.NamedTemporaryFile() as tmp:
+                tp = {"writebuffer": tmp}
+                with smart_open.open(file_path, "w", transport_params=tp) as fout:
+                    fout.write(content)
+        else:
+            with smart_open.open(file_path, "w") as fout:
+                fout.write(content)
+    except ValueError as e:
+        raise ValueError(f"Failed to save content to {file_path!r}: {e}") from e
+@contextmanager
+def open_zip(file_path: str, headers: dict | None = None) -> Iterator[zipfile.ZipFile]:
+    """Context manager yielding a ``zipfile.ZipFile`` read through ``smart_open``.
+    Accepts any path scheme handled by smart_open (local, s3://, https://).
+    HTTPS requires the server to support range requests (Accept-Ranges: bytes).
+    Args:
+        file_path: Path to the ZIP file — local filesystem path, ``s3://`` URL, or
+            ``https://`` URL.
+        headers: Optional HTTP headers (e.g. ``User-Agent`` for SEC EDGAR). When
+            non-empty they are passed to smart_open as the ``headers`` transport param.
+    Yields:
+        An open ``zipfile.ZipFile`` object. Both the archive and the underlying stream
+        are closed when the context manager exits.
+    Raises:
+        zipfile.BadZipFile: If the file is not a valid ZIP archive.
+        OSError: If the file cannot be opened or read.
+    """
+    transport_params = {"headers": headers} if headers else {}
+    with (
+        smart_open.open(file_path, "rb", transport_params=transport_params) as stream,
+        zipfile.ZipFile(stream) as archive,
+    ):
+        yield archive

idi_ftm2j_shared-0.1.1.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,236 @@
+Metadata-Version: 2.4
+Name: idi-ftm2j-shared
+Version: 0.1.1
+Summary: Shared runtime utilities and Pulumi factories for FTM2J processors
+Author-email: UChicago DSI Core Facility <corefacility@uchicago.edu>
+License-Expression: BSD-3-Clause
+License-File: LICENSE
+Requires-Python: >=3.13
+Requires-Dist: requests>=2.33.1
+Requires-Dist: smart-open[s3]>=7.6.0
+Requires-Dist: tqdm>=4.67.3
+Requires-Dist: watchtower>=3.4.0
+Description-Content-Type: text/markdown
+# idi-ftm2j-shared
+Shared AWS infrastructure for the FTM2J terminal ecosystem. It consists of two independent Pulumi stacks: deploy `pulumi-bootstrap` first, then `pulumi` (the shared infrastructure).
+---
+## `pulumi-bootstrap` — GitHub Actions OIDC
+Provisions the account-level OIDC identity provider and the two IAM roles that all `dsi-clinic` repos use to authenticate with AWS from GitHub Actions.
+> **Run locally.** This stack must be deployed from a workstation with AWS credentials — it creates the very roles that CI uses, so CI cannot deploy it itself.
+```bash
+cd pulumi-bootstrap
+pulumi stack select dev
+pulumi preview
+pulumi up
+```
+**Roles created:**
+| Role | Assumed by | Access |
+|------|-----------|--------|
+| `checks` | Pull requests, manual `workflow_dispatch` runs | Read-only (`pulumi preview`) |
+| `deploy` | Pushes to `main`, `dev`, `release/**` | Full deploy (`pulumi up`) |
+Both roles trust any repository in the `dsi-clinic` org — no updates needed when new repos are added.
+---
+## `pulumi` — Shared Infrastructure
+Provisions the AWS resources shared across all FTM2J processor pipelines. Individual processor stacks reference these outputs rather than creating their own copies.
+```bash
+cd pulumi
+pulumi stack select dev
+pulumi preview
+pulumi up
+```
+**Resources:**
+| Resource | Description |
+|----------|-------------|
+| S3 bucket | Pipeline input, output, and failure storage. Encrypted at rest; retained on stack destroy to prevent data loss. |
+| S3 VPC gateway endpoint | Routes S3 traffic over the private AWS network, avoiding internet egress from ECS tasks. |
+| SQS dead-letter queue | Captures EventBridge Scheduler invocation failures for inspection and replay. |
+**Stack outputs** consumed by downstream processor stacks:
+```
+processor_bucket_name
+processor_bucket_arn
+s3_endpoint_id
+s3_endpoint_arn
+dlq_url
+dlq_arn
+```
+> `deploy.yml` is path-filtered: the version/publish jobs run only when `src/**` or `pyproject.toml` has changed; the Pulumi deploy job runs only when `pulumi-shared/**` has changed.
+---
+# development + contributing
+Install all dependency groups (includes `dev` tools: pytest, ruff):
+```bash
+uv sync --all-groups
+```
+## tests
+```bash
+uv run pytest
+```
+## linting + formatting
+```bash
+uv run ruff check .    # lint
+uv run ruff format .   # format
+```
+## code style
+| Rule | Value |
+|---|---|
+| Line length | 100 characters |
+| Docstring convention | Google (`pydocstyle`) |
+| Type annotations | Required on all public functions and classes |
+| String quotes | Double-quoted (ruff `Q` ruleset) |
+## branching strategy + versioning
+Two-branch model with short-lived issue branches.
+### long-lived branches
+| Branch | Purpose      | Version style                | Deploy target |
+| ------ | ------------ | ---------------------------- | ------------- |
+| `dev`  | Integration  | `X.Y.Z-alphaN` (pre-release) | `dev` stack   |
+| `main` | Production   | `X.Y.Z` (stable)             | `prod` stack  |
+Both branches are protected. All changes occur via pull request.
+### short-lived branches
+- **`issue-<number>-<slug>`** — feature, bug-fix, and chore work.
+    - Branch from `dev`, PR back to `dev`.
+    - While the PR is open, only [`checks.yml`](../.github/workflows/checks.yml) runs (lint, tests, security, Pulumi preview). Pushes to the issue branch do not bump the version or deploy.
+    - On merge, the push to `dev` triggers [`deploy.yml`](../.github/workflows/deploy.yml): bumps the alpha version and deploys the `dev` stack.
+    - Note: Use this naming convention so that you can manually deploy these branches to the `dev` stack for testing. See [manual deploys](#manual-deploys).
+- **Hotfix** — urgent production fix.
+    - Branch from `main` as `issue-<number>-hotfix-<slug>`, PR back to `main`.
+    - After release, merge `main` back into `dev` (see [Syncing main back into dev](#3-syncing-main-back-into-dev)).
+### ci/cd pipelines
+Validation and deployment are split across two workflows:
+- [`checks.yml`](../.github/workflows/checks.yml) — runs on every PR, required before merge. Lint, tests, security scan, Pulumi preview.
+- [`deploy.yml`](../.github/workflows/deploy.yml) — runs on push to `dev` or `main` (i.e. after a merge). Bumps version, tags, releases, deploys Pulumi, publishes to PyPI (`main` only, if the repo includes a package).
+### versioning
+Versions live in `pyproject.toml` and are bumped by `deploy.yml` using `uv version`.
+| Trigger                          | Bump command                           | Example               |
+| -------------------------------- | -------------------------------------- | --------------------- |
+| Push to `dev`, no existing alpha | `uv version --bump patch --bump alpha` | `1.4.0` → `1.4.1a1`   |
+| Push to `dev`, existing alpha    | `uv version --bump alpha`              | `1.4.1a1` → `1.4.1a2` |
+| Push to `main`                   | `uv version --bump stable`             | `1.4.1a3` → `1.4.1`   |
+Each successful deploy:
+1. Commits the bumped `pyproject.toml` + `uv.lock` with `[skip ci]`.
+2. Pushes a `vX.Y.Z[aN]` git tag.
+3. Creates a GitHub Release — pre-release on `dev`, stable on `main`.
+4. On `main`: builds the wheel/sdist and publishes to PyPI (if the repo ships a package).
+### development cycle
+#### 1. dev → issue → alpha release
+```
+                                    PR
+issue-123-add-feature  ────────────────────────────────► dev
+        ▲                                              │
+        │ branch                                       │ push triggers deploy.yml
+        │                                              ▼
+       dev ◄──────────────────────────────────── 1.4.1a1, 1.4.1a2, ...
+                        merge                    deployed to dev stack
+```
+1. `git switch dev && git pull`
+2. `git switch -c issue-123-add-feature`
+3. Commit, push, open PR targeting `dev`. `checks.yml` runs.
+4. Merge the PR (squash recommended). The push to `dev` triggers `deploy.yml`:
+   - Bumps to the next alpha (`1.4.1a1` if no alpha exists yet, otherwise increments the alpha counter).
+   - Tags, creates a pre-release, deploys the `dev` Pulumi stack, publishes the image. PyPI publish is skipped.
+5. More issue PRs into `dev` keep stacking alphas (`1.4.1a2`, `1.4.1a3`, …) on the same patch line until a stable release cuts that line off.
+#### 2. dev → main → stable release
+```
+dev (1.4.1a3) ───────── PR ─────────► main
+ ▲                                      │ push triggers deploy.yml
+ │                                      ▼
+  ◄────────────────────────────────── 1.4.1 (stable)
+                sync/merge            deployed to prod stack
+                                      published to PyPI
+```
+1. When `dev` is ready to ship, open a PR from `dev` → `main`. `checks.yml` runs against the `prod` Pulumi stack preview.
+2. Review and merge. **Do not squash** — preserve the alpha history so release notes capture every change. A merge commit is fine.
+3. The push to `main` triggers `deploy.yml`:
+   - `uv version --bump stable` drops the `aN` suffix (`1.4.1a3` → `1.4.1`).
+   - Tags `v1.4.1`, creates a stable GitHub Release, deploys the `prod` Pulumi stack, publishes to PyPI (if applicable).
+#### 3. syncing main back into dev
+After every stable release (and any hotfix that lands directly on `main`), merge `main` back into `dev` so `dev` stays ahead of `main` and the histories stay aligned.
+```bash
+git switch main && git pull
+git switch dev && git pull
+git merge main          # bring in the stable bump commit + any hotfixes
+git push
+```
+The next push to `dev` produces `1.4.2a1` — a new alpha line above the just-released `1.4.1`.
+On a `pyproject.toml` conflict, keep `main`'s stable version. The next `dev` deploy bumps from there.
+### manual deploys
+`deploy.yml` accepts `workflow_dispatch`:
+- From `dev` it deploys the `dev` stack.
+- From `main` it deploys the `prod` stack.
+Use this to redeploy Pulumi without a code change (e.g. after rotating a secret). Version/publish jobs stay gated on `src/**` or `pyproject.toml` changes.
+### summary
+- `dev` is the only place new work lands; every merge produces an alpha.
+- `main` cuts stable releases from whatever alpha `dev` is on.
+- After every release on `main`, merge `main` back into `dev`.
+## branch protection rules
+- Default branch is set to `dev`
+- There are two rulesets: `dev` and `main`
+- Deploy keys are added to the bypass list and set to "Always allow"
+- Each ruleset's branch targeting criterion is set to its own branch: `dev` or `main`
+- ✅ Restrict deletions
+- ✅ Require a pull request before merging
+- ✅ Require status checks to pass: Lint, Test, Security, Pulumi Preview
+- ✅ Block force pushes
+- ✅ Require code scanning results; set to CodeQL security alerts "High or higher"

idi_ftm2j_shared-0.1.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,9 @@
+idi_ftm2j_shared/__init__.py,sha256=78ErW2BC3eZrnPznjDscP2gBGelm2fHTfz4L8WI_5AA,145
+idi_ftm2j_shared/api.py,sha256=25BV2YXKEEMRA8WTNKcvhcTPHwx9bxUC_Li-9xuLCN0,9400
+idi_ftm2j_shared/failures.py,sha256=x80FSSehzvPNzChFn98px5sQVplerLCpIP4Vix9NADA,5394
+idi_ftm2j_shared/logs.py,sha256=wO_d6RZAVmAVZxQqSfpNImx8-CsjEEqN4r5xQEuyEWM,5609
+idi_ftm2j_shared/storage.py,sha256=M_5IeAB1v5sXvPPQ-t0k1bVo_vhVzR8Q1-UkML7FWkE,6214
+idi_ftm2j_shared-0.1.1.dist-info/METADATA,sha256=96H4qeqe8Naa7bDPzAkpth1V0ryS3OZpkyI0y6CuyYs,9456
+idi_ftm2j_shared-0.1.1.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
+idi_ftm2j_shared-0.1.1.dist-info/licenses/LICENSE,sha256=qlWEZ_QLy9KO01sLoPcyXHd9-nqrPqPPTSTA9hebLfE,1515
+idi_ftm2j_shared-0.1.1.dist-info/RECORD,,

idi_ftm2j_shared-0.1.1.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.29.0
+Root-Is-Purelib: true
+Tag: py3-none-any

idi_ftm2j_shared-0.1.1.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,28 @@
+BSD 3-Clause License
+Copyright (c) 2026, UChicago Data Science Clinic
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.