idi-ftm2j-shared 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ """Shared runtime utilities for FTM2J pipeline processors."""
2
+
3
+ from importlib.metadata import version
4
+
5
+ __version__ = version("idi-ftm2j-shared")
@@ -0,0 +1,266 @@
1
+ """Provides API utilities for use across the application."""
2
+
3
+ # Standard library imports
4
+ import contextlib
5
+ import logging
6
+ import threading
7
+ import time
8
+ from abc import ABC, abstractmethod
9
+ from functools import cached_property
10
+ from typing import Any, Literal
11
+
12
+ # Third party imports
13
+ import requests
14
+ from requests.adapters import HTTPAdapter
15
+ from urllib3.util.retry import Retry
16
+
17
+ # Application imports
18
+ from idi_ftm2j_shared.logs import get_logger
19
+
20
+
21
class ApiClient(ABC):
    """Base class for HTTP API clients with retries, timeouts and optional rate limiting.

    Subclasses implement ``query_endpoint`` and normally delegate to
    ``_query_with_error_handling`` so that request failures are reported in the
    returned dict instead of being raised.
    """

    DEFAULT_MAX_RETRIES: int = 3
    # Passed to ``requests`` as ``timeout``: (connect timeout, read timeout) in seconds.
    REQUEST_TIMEOUT: tuple[int, int] = (10, 30)
    # urllib3 sleeps ``backoff_factor * 2 ** (n - 1)`` seconds before the n-th
    # consecutive retry and does not sleep before the first one, so a factor of 2
    # gives roughly 0, 4, 8 seconds (subject to urllib3's ``backoff_max`` cap).
    RETRY_BACKOFF_FACTOR: int = 2
    RETRY_STATUS_FORCELIST: list[int] = [429, 500, 502, 503, 504]

    def __init__(
        self,
        api_key: str = "",
        max_retries: int = DEFAULT_MAX_RETRIES,
        rate_limit: float | None = None,
    ) -> None:
        """Initialize the ApiClient.

        Args:
            api_key: The API key.
            max_retries: The maximum number of retries. ``None`` falls back to
                ``DEFAULT_MAX_RETRIES``.
            rate_limit: Minimum seconds between requests. None disables rate limiting.
        """
        self.api_key: str = api_key
        self.max_retries: int = max_retries if max_retries is not None else self.DEFAULT_MAX_RETRIES
        self.logger: logging.Logger = get_logger(type(self).__name__)
        self._rate_limit = rate_limit
        # Monotonic clock: the interval measurement must not be affected by
        # wall-clock adjustments (NTP steps, manual changes).
        self._last_request = time.monotonic()
        self._lock: threading.Lock | contextlib.AbstractContextManager = (
            threading.Lock() if rate_limit is not None else contextlib.nullcontext()
        )

    def rate_limit(self) -> None:
        """Enforce the minimum interval between requests.

        No-op when rate_limit was not set at construction time. When it is set,
        callers are serialized through a lock and each one sleeps for whatever
        remains of the interval since the previous call.
        """
        if self._rate_limit is None:
            return
        with self._lock:
            remaining = self._rate_limit - (time.monotonic() - self._last_request)
            if remaining > 0:
                time.sleep(remaining)
            self._last_request = time.monotonic()

    @cached_property
    def session(self) -> requests.Session:
        """Create a requests Session with the retry strategy mounted.

        The session is built on first access and cached on the instance.

        Returns:
            Configured requests.Session with retry logic for http:// and https://.
        """
        session = requests.Session()

        # Retry GET and POST on the configured status codes, honouring any
        # Retry-After header sent by the server.
        retry_strategy = Retry(
            total=self.max_retries,
            backoff_factor=self.RETRY_BACKOFF_FACTOR,
            status_forcelist=self.RETRY_STATUS_FORCELIST,
            allowed_methods=["GET", "POST"],
            respect_retry_after_header=True,
        )

        adapter = HTTPAdapter(max_retries=retry_strategy)
        session.mount("http://", adapter)
        session.mount("https://", adapter)

        return session

    def get(
        self, url: str, params: dict | None = None, headers: dict | None = None, **kwargs: object
    ) -> requests.Response:
        """Get a resource from the API.

        Args:
            url: The URL to get from.
            params: The parameters to pass to the API.
            headers: The headers to pass to the API.
            kwargs: Additional keyword arguments forwarded to ``Session.get``.
                ``timeout`` defaults to ``REQUEST_TIMEOUT`` when not supplied.

        Returns:
            The response from the API.

        Raises:
            requests.exceptions.HTTPError: If the response status is 4xx/5xx.
        """
        kwargs.setdefault("timeout", self.REQUEST_TIMEOUT)
        response = self.session.get(url, params=params, headers=headers, **kwargs)
        response.raise_for_status()
        return response

    def post(
        self,
        url: str,
        data: str | dict | None = None,
        headers: dict | None = None,
        **kwargs: object,
    ) -> requests.Response:
        """Post a resource to the API.

        Args:
            url: The URL to post to.
            data: The data to post to the API.
            headers: The headers to post to the API.
            kwargs: Additional keyword arguments forwarded to ``Session.post``.
                ``timeout`` defaults to ``REQUEST_TIMEOUT`` when not supplied.

        Returns:
            The response from the API.

        Raises:
            requests.exceptions.HTTPError: If the response status is 4xx/5xx.
        """
        kwargs.setdefault("timeout", self.REQUEST_TIMEOUT)
        response = self.session.post(url, headers=headers, data=data, **kwargs)
        response.raise_for_status()
        return response

    def _query_with_error_handling(
        self,
        url: str,
        data: str | dict | None = None,
        params: dict | None = None,
        headers: dict | None = None,
        method: Literal["get", "post"] = "get",
        return_json: bool = True,
        return_bytes: bool = False,
    ) -> dict[str, Any]:
        """Query an endpoint with error handling, capturing errors in the return value.

        On success the returned dict contains ``status_code``, ``url``, and ``data`` keys.
        On failure an ``error`` key is added; HTTP errors also include ``status_code``,
        timeouts also include ``timeout: True``, and a body that cannot be parsed as
        JSON includes ``status_code`` and ``url`` but no ``data``.
        Request and parse exceptions are never re-raised — callers should check for
        the ``error`` key.

        Args:
            url: The URL to query.
            data: The data to post to the API. Only used when ``method`` is ``"post"``.
            params: Query-string parameters to pass to the API. Only used for ``"get"``.
            headers: HTTP headers to pass to the API.
            method: HTTP verb to use — ``"get"`` or ``"post"``. Any value other than
                ``"get"`` is sent as a POST.
            return_json: If True, parse the response body as JSON; otherwise return raw text.
            return_bytes: If True, return raw response bytes (overrides ``return_json``).

        Returns:
            Dict with ``status_code``, ``url``, and ``data`` keys on success.
            On error, ``error`` is added and ``data`` may be absent.
        """
        response, error, error_exc = None, None, None
        response_data: dict = {}
        try:
            response = (
                self.get(url=url, params=params, headers=headers)
                if method == "get"
                else self.post(url=url, data=data, headers=headers)
            )

        except requests.exceptions.Timeout as e:
            error = f"Timeout querying {url}: {e}"
            error_exc = e
            self.logger.error(error)
            response_data["timeout"] = True

        except requests.exceptions.RequestException as e:
            error = f"Error querying {url}: {e}"
            error_exc = e
            self.logger.error(error)

        if isinstance(error_exc, requests.exceptions.HTTPError) and error_exc.response is not None:
            response_data["status_code"] = error_exc.response.status_code

        if response is not None:
            try:
                if return_bytes:
                    r_data = response.content
                elif return_json:
                    r_data = response.json()
                else:
                    r_data = response.text

                response_data.update(
                    {
                        "status_code": response.status_code,
                        "url": response.url,
                        "data": r_data,
                    }
                )
            except ValueError as e:
                # The request succeeded but the body is not valid JSON. Report it
                # through the same ``error`` channel as transport failures so the
                # caller does not receive an empty dict with no explanation.
                error = f"Error parsing JSON response from {url}: {e}"
                self.logger.error(f"Error parsing JSON response from {url}: {response.text}")
                response_data.update(
                    {
                        "status_code": response.status_code,
                        "url": response.url,
                    }
                )

        if error is not None:
            response_data.update({"error": error})

        return response_data

    @abstractmethod
    def query_endpoint(self, **kwargs: object) -> dict[str, Any]:
        """Query the API endpoint specific to this client.

        Subclasses define the exact positional/keyword parameters relevant to their
        endpoint. The return dict follows the ``_query_with_error_handling`` contract:
        ``status_code``, ``url``, and ``data`` on success; ``error`` on failure.

        Args:
            **kwargs: Endpoint-specific arguments defined by each subclass.

        Returns:
            Dict with ``status_code``, ``url``, and ``data`` on success, plus ``error``
            on failure.
        """
        ...
223
+
224
+
225
class SecClient(ApiClient):
    """Rate-limited client for the SEC EDGAR archive."""

    SEC_URL = "https://www.sec.gov/Archives/edgar/data"

    def __init__(self, rate_limit: float = 0.2, user_agent: str = "") -> None:
        """Set up the client and remember the header sent with every SEC request.

        Args:
            rate_limit: Minimum number of seconds between consecutive requests.
            user_agent: Value sent in the ``User-Agent`` header.
        """
        super().__init__(rate_limit=rate_limit)
        self._sec_headers = {"User-Agent": user_agent}

    @property
    def sec_headers(self) -> dict:
        """Headers attached to every SEC query (the same dict object each time)."""
        return self._sec_headers

    def query_endpoint(
        self, sec_url: str, return_json: bool = True, return_bytes: bool = False
    ) -> dict[str, Any]:
        """Fetch one SEC EDGAR URL via GET after waiting out the rate limit.

        Args:
            sec_url: Full SEC EDGAR URL to query.
            return_json: If True, parse the body as JSON; otherwise return raw text.
            return_bytes: If True, return raw response bytes (overrides ``return_json``).

        Returns:
            Dict with ``status_code``, ``url``, and ``data`` on success, plus ``error``
            on failure.
        """
        # Throttle first so the wait happens before the request is issued.
        self.rate_limit()
        request_options = {
            "url": sec_url,
            "headers": self._sec_headers,
            "method": "get",
            "return_json": return_json,
            "return_bytes": return_bytes,
        }
        return self._query_with_error_handling(**request_options)
@@ -0,0 +1,168 @@
1
+ """Generic failure classification and registry for pipeline processors."""
2
+
3
+ # Standard library imports
4
+ import json
5
+ import pathlib
6
+ import threading
7
+ from abc import ABC, abstractmethod
8
+ from enum import StrEnum
9
+
10
+ # Application imports
11
+ from idi_ftm2j_shared.storage import load_json, save_json
12
+
13
+ _MIN_ENTRY_LEN = 2
14
+
15
+
16
+ class FailureClassifier(ABC):
17
+ """Base class for failure classification. Subclasses implement domain-specific logic."""
18
+
19
+ @property
20
+ @abstractmethod
21
+ def do_not_retry(self) -> frozenset[StrEnum]:
22
+ """Return the set of failure types that should not be retried."""
23
+ ...
24
+
25
+ def is_retryable(self, failure_type: StrEnum) -> bool:
26
+ """Check if a failure type should be retried.
27
+
28
+ Args:
29
+ failure_type: The type of failure.
30
+
31
+ Returns:
32
+ True if the failure is transient and should be retried.
33
+ """
34
+ return failure_type not in self.do_not_retry
35
+
36
+ @abstractmethod
37
+ def classify_from_response(self, response: dict, **kwargs: object) -> StrEnum:
38
+ """Classify a failure from an API response.
39
+
40
+ Args:
41
+ response: API response dict with status_code and optional error.
42
+ **kwargs: Additional keyword arguments for subclass implementations.
43
+
44
+ Returns:
45
+ The classified failure type.
46
+ """
47
+ ...
48
+
49
+
50
+ class FailureRegistry:
51
+ """Persists permanent failures to avoid retrying entities that will always fail."""
52
+
53
+ def __init__(
54
+ self, file_path: str, classifier: FailureClassifier, flush_every: int = 10
55
+ ) -> None:
56
+ """Initialize the FailureRegistry.
57
+
58
+ Args:
59
+ file_path: Path to the JSON file for persistence.
60
+ classifier: Domain-specific failure classifier.
61
+ flush_every: Number of new failures to buffer before writing to disk.
62
+ """
63
+ self.file_path = file_path
64
+ self._classifier = classifier
65
+ self._flush_every = flush_every
66
+ self._pending = 0
67
+ self._entries: set[tuple[str, str]] = set()
68
+ self._reasons: dict[tuple[str, str], str] = {}
69
+ self._lock = threading.RLock()
70
+ self.load()
71
+
72
+ def load(self) -> None:
73
+ """Load persisted failure entries from disk into memory.
74
+
75
+ If the file does not exist (locally or on S3), the registry is initialised
76
+ as empty. A ``json.JSONDecodeError`` from a corrupt file is silently caught
77
+ and the registry is reset to empty so the pipeline can continue.
78
+
79
+ Returns:
80
+ None
81
+
82
+ Raises:
83
+ botocore.exceptions.ClientError: If an S3 error other than ``NoSuchKey``
84
+ occurs when reading the persistence file.
85
+ """
86
+ if not self.file_path or (
87
+ not self.file_path.startswith("s3://") and not pathlib.Path(self.file_path).exists()
88
+ ):
89
+ self._entries = set()
90
+ self._reasons = {}
91
+ return
92
+
93
+ try:
94
+ data = load_json(self.file_path, return_type="dict")
95
+ except json.JSONDecodeError:
96
+ self._entries = set()
97
+ self._reasons = {}
98
+ return
99
+
100
+ if not isinstance(data, dict):
101
+ self._entries = set()
102
+ self._reasons = {}
103
+ return
104
+
105
+ entries_data = data.get("entries", [])
106
+ reasons_data = data.get("reasons", {})
107
+
108
+ self._entries = {tuple(e) for e in entries_data if len(e) >= _MIN_ENTRY_LEN}
109
+ self._reasons = {}
110
+ for entry in self._entries:
111
+ key = " ".join(entry)
112
+ if key in reasons_data:
113
+ self._reasons[entry] = reasons_data[key]
114
+
115
+ def save(self) -> None:
116
+ """Persist current failure entries and reasons to the configured file path.
117
+
118
+ Writes entries as a JSON object with ``entries`` (list of lists) and
119
+ ``reasons`` (dict keyed by space-joined entry tuples) keys. If
120
+ ``file_path`` is empty the call is a no-op.
121
+
122
+ Returns:
123
+ None
124
+ """
125
+ if not self.file_path:
126
+ return
127
+
128
+ entries_list = [list(e) for e in self._entries]
129
+ reasons_dict = {" ".join(e): self._reasons.get(e, "") for e in self._entries}
130
+ save_json(self.file_path, {"entries": entries_list, "reasons": reasons_dict})
131
+
132
+ def add(self, key: tuple[str, str], failure_type: StrEnum) -> None:
133
+ """Add a permanent failure entry.
134
+
135
+ Args:
136
+ key: Tuple of identifier and associated file or relevant metadata.
137
+ failure_type: The classified failure type.
138
+ """
139
+ if self._classifier.is_retryable(failure_type):
140
+ return
141
+
142
+ with self._lock:
143
+ if key in self._entries:
144
+ return
145
+
146
+ self._entries.add(key)
147
+ self._reasons[key] = str(failure_type)
148
+
149
+ self._pending += 1
150
+ if self._pending >= self._flush_every:
151
+ self.flush()
152
+
153
+ def flush(self) -> None:
154
+ """Write all buffered failures to disk and reset the pending counter."""
155
+ with self._lock:
156
+ self.save()
157
+ self._pending = 0
158
+
159
+ def __contains__(self, key: tuple[str, str]) -> bool:
160
+ """Set-like membership check.
161
+
162
+ Args:
163
+ key: Tuple of identifier and associated file or relevant metadata.
164
+
165
+ Returns:
166
+ True if the filing should not be retried.
167
+ """
168
+ return key in self._entries
@@ -0,0 +1,171 @@
1
+ """Provides loggers for use across the application."""
2
+
3
+ # Standard library imports
4
+ import datetime
5
+ import logging
6
+ import os
7
+
8
+ # Third party imports
9
+ import boto3
10
+ import requests
11
+ import tqdm
12
+ import watchtower
13
+
14
+ _configured_loggers: set[str] = set()
15
+
16
+ _EXECUTION_ID = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
17
+
18
+ EC2_METADATA_BASE = "http://169.254.169.254"
19
+ EC2_METADATA_TOKEN_URL = f"{EC2_METADATA_BASE}/latest/api/token"
20
+ EC2_METADATA_INSTANCE_ID_URL = f"{EC2_METADATA_BASE}/latest/meta-data/instance-id"
21
+
22
+ LOG_RETENTION_DAYS = (
23
+ 30 # Possible values are: 1, 3, 5, 7, 14, 30, 60, 90, 120, 150, 180, 365, 400, ...
24
+ )
25
+
26
+
27
+ def _get_instance_id() -> str:
28
+ """Returns the EC2 instance ID when available, otherwise a fallback identifier."""
29
+ # Prefer explicit env var (e.g. when running in Docker where metadata may be unreachable)
30
+ if instance_id := os.environ.get("INSTANCE_ID"):
31
+ return instance_id
32
+ try:
33
+ # IMDSv2: obtain session token first (required when IMDSv2 is enforced)
34
+ token_resp = requests.put(
35
+ EC2_METADATA_TOKEN_URL,
36
+ headers={"X-aws-ec2-metadata-token-ttl-seconds": "60"},
37
+ timeout=1,
38
+ )
39
+ token_resp.raise_for_status()
40
+ token = token_resp.text.strip()
41
+
42
+ # Fetch instance-id with token
43
+ instance_resp = requests.get(
44
+ EC2_METADATA_INSTANCE_ID_URL,
45
+ headers={"X-aws-ec2-metadata-token": token},
46
+ timeout=1,
47
+ )
48
+ instance_resp.raise_for_status()
49
+ return instance_resp.text.strip()
50
+
51
+ except Exception:
52
+ hostname = os.environ.get("HOSTNAME", "unknown")
53
+ return hostname.split(".")[0]
54
+
55
+
56
class TqdmLoggingHandler(logging.Handler):
    """Handler that prints records through ``tqdm.write`` so progress bars stay intact."""

    def emit(self, record: logging.LogRecord) -> None:
        """Format the record and write it via tqdm.

        Any exception raised while formatting or writing is passed to
        ``handleError`` instead of propagating to the caller.

        Args:
            record: The log record to emit.
        """
        try:
            message = self.format(record)
            tqdm.tqdm.write(message)
        except Exception:
            self.handleError(record)
72
+
73
+
74
def get_logger(
    name: str, level: int = logging.INFO, log_group_name: str = "", log_stream_prefix: str = ""
) -> logging.Logger:
    """Creates a logger with the given name and level.

    Attaches a console handler that prints logs in a standard format through
    ``tqdm.write``. The ``LOG_LEVEL`` environment variable, when it names a
    level known to ``logging`` (e.g. ``DEBUG``, ``warning``), overrides ``level``;
    any other value is ignored.

    CloudWatch configuration is delegated to ``_configure_cloudwatch`` using
    ``log_group_name`` and ``log_stream_prefix``.

    A logger is configured only once per name: later calls with the same name
    return the existing logger unchanged and ignore the other arguments.

    Args:
        name: The logger name.
        level: The initial level. Defaults to 20 ("INFO").
        log_group_name: The name of the log group. Defaults to empty string.
        log_stream_prefix: The prefix of the log stream. Defaults to empty string.

    Returns:
        The logger
    """
    # Check if logger has already been configured
    if name in _configured_loggers:
        return logging.getLogger(name)

    # Look the name up in logging's own level table. A plain
    # ``hasattr(logging, NAME)`` would also accept non-level attributes such as
    # ``BASIC_FORMAT`` and make ``setLevel`` raise.
    env_level = os.environ.get("LOG_LEVEL", "").strip().upper()
    level = logging.getLevelNamesMapping().get(env_level, level)

    # Create logger and set level
    logger = logging.getLogger(name)
    logger.setLevel(level)
    logger.propagate = False  # Prevent log messages from being propagated to the root logger

    # Create console handler and set level
    ch = TqdmLoggingHandler()
    ch.setLevel(level)

    # Create formatter and add to handler
    log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    ch.setFormatter(logging.Formatter(log_format))

    # Add handler to logger
    logger.addHandler(ch)

    # Attach the CloudWatch handler when the helper's conditions are met
    _configure_cloudwatch(logger, name, log_group_name, log_stream_prefix)

    # Add logger to set of configured loggers
    _configured_loggers.add(name)

    return logger
124
+
125
+
126
+ def _configure_cloudwatch(
127
+ logger: logging.Logger, name: str, log_group_name: str, log_stream_prefix: str
128
+ ) -> None:
129
+ """Configures the logger to send logs to CloudWatch if executing in AWS.
130
+
131
+ Enables CloudWatch when:
132
+ - EC2 metadata endpoint is reachable, or
133
+ - CLOUDWATCH_LOGS_ENABLED=true (e.g. when running in Docker on EC2).
134
+
135
+ Args:
136
+ logger: The logger to configure.
137
+ name: The name of the logger.
138
+ log_group_name: The name of the log group.
139
+ log_stream_prefix: The prefix of the log stream.
140
+ """
141
+ # Enable when explicitly requested (e.g. Docker on EC2)
142
+ env_enabled = os.environ.get("CLOUDWATCH_LOGS_ENABLED", "").lower() in ("true", "1", "yes")
143
+
144
+ if env_enabled and log_group_name and log_stream_prefix:
145
+ instance_id = _get_instance_id()
146
+ log_stream_name = f"{log_stream_prefix}/{instance_id}/{_EXECUTION_ID}"
147
+
148
+ if "AWS_REGION" in os.environ:
149
+ logs_client = boto3.client("logs", region_name=os.environ["AWS_REGION"])
150
+ else:
151
+ logs_client = boto3.client("logs")
152
+
153
+ handler = watchtower.CloudWatchLogHandler(
154
+ log_group_name=log_group_name,
155
+ log_stream_name=log_stream_name,
156
+ use_queues=False,
157
+ boto3_client=logs_client,
158
+ log_group_retention_days=LOG_RETENTION_DAYS,
159
+ )
160
+
161
+ format = "%(name)s - %(levelname)s - %(message)s"
162
+ formatter = logging.Formatter(format)
163
+ handler.setFormatter(formatter)
164
+
165
+ logger.addHandler(handler)
166
+ logger.info(
167
+ "CloudWatch logging enabled: name=%s, log_group=%s log_stream=%s",
168
+ name,
169
+ log_group_name,
170
+ log_stream_name,
171
+ )
@@ -0,0 +1,177 @@
1
+ """Provides storage utilities for use across the application."""
2
+
3
+ # Standard library imports
4
+ import json
5
+ import tempfile
6
+ import zipfile
7
+ from collections.abc import Iterator
8
+ from contextlib import contextmanager
9
+
10
+ # Third party imports
11
+ import smart_open
12
+ from botocore.exceptions import ClientError
13
+
14
+
15
+ def _empty_for_return_type(return_type: str) -> dict | list:
16
+ """Return empty dict or list per return_type."""
17
+ if return_type == "dict":
18
+ return {}
19
+ if return_type == "list":
20
+ return []
21
+ raise ValueError(f"Invalid return type: {return_type}")
22
+
23
+
24
def load_json(file_path: str, return_type: str = "dict") -> dict | list:
    """Read and parse a JSON document from a path opened via ``smart_open``.

    When the file cannot be opened or read (any ``OSError``), or S3 reports
    ``NoSuchKey``, an empty container chosen by ``return_type`` is returned
    instead of raising.

    Args:
        file_path: Local filesystem path or ``s3://bucket/key`` URL of the JSON file.
        return_type: ``"dict"`` or ``"list"``. Selects the empty value returned on
            the fallback path; it is only validated on that path.

    Returns:
        The parsed JSON content, or an empty ``dict``/``list`` on the fallback path.

    Raises:
        ValueError: If the fallback path is taken and ``return_type`` is not
            ``"dict"`` or ``"list"``.
        botocore.exceptions.ClientError: If an S3 error other than ``NoSuchKey`` occurs.
        json.JSONDecodeError: If the file exists but contains invalid JSON.
    """
    try:
        with smart_open.open(file_path) as handle:
            parsed = json.load(handle)
    except OSError:
        # FileNotFoundError is an OSError subclass, so it is covered here as well.
        return _empty_for_return_type(return_type)
    except ClientError as err:
        error_code = err.response.get("Error", {}).get("Code")
        if error_code != "NoSuchKey":
            raise
        return _empty_for_return_type(return_type)
    return parsed
57
+
58
+
59
def save_json(file_path: str, data: dict | list, mode: str = "w") -> None:
    """Serialize ``data`` as indented JSON and write it to ``file_path``.

    Paths containing ``s3://`` are always opened in ``"w"`` mode, with a temporary
    file handed to ``smart_open`` as its ``writebuffer`` transport parameter; see
    https://github.com/piskvorky/smart_open/blob/develop/howto.md#how-to-write-to-s3-efficiently

    Every other path is opened with the caller-supplied ``mode``.

    Args:
        file_path: The path to the JSON file.
        data: The JSON data to save to the file as a dictionary or list.
        mode: File open mode ("w" to overwrite, "a" to append). Ignored for S3 paths.
    """
    if "s3://" not in file_path:
        with smart_open.open(file_path, mode) as sink:
            json.dump(data, sink, indent=2)
        return

    with tempfile.NamedTemporaryFile() as spool:
        transport = {"writebuffer": spool}
        with smart_open.open(file_path, "w", transport_params=transport) as sink:
            json.dump(data, sink, indent=2)
79
+
80
+
81
def key_exists(file_path: str) -> bool:
    """Check that ``file_path`` can be opened and read.

    The path is opened in binary mode through ``smart_open`` and a single byte
    is read from it.

    Args:
        file_path: Local path or ``s3://`` URL to check.

    Returns:
        True if the open and read succeed; False on any ``OSError`` or when S3
        reports ``NoSuchKey`` / ``404``.

    Raises:
        botocore.exceptions.ClientError: For S3 errors with any other error code.
    """
    not_found_codes = ("NoSuchKey", "404")
    try:
        with smart_open.open(file_path, "rb") as handle:
            handle.read(1)
    except OSError:
        # FileNotFoundError is an OSError subclass, so it is covered here as well.
        return False
    except ClientError as err:
        if err.response.get("Error", {}).get("Code") not in not_found_codes:
            raise
        return False
    return True
105
+
106
+
107
def load_content(file_path: str) -> str:
    """Read the full text of a file opened via ``smart_open``.

    Args:
        file_path: Local filesystem path or ``s3://`` URL of the text file.

    Returns:
        The file contents, or ``""`` when opening/reading raises ``OSError`` or
        S3 reports ``NoSuchKey``.

    Raises:
        botocore.exceptions.ClientError: If an S3 error other than ``NoSuchKey`` occurs.
    """
    try:
        with smart_open.open(file_path) as handle:
            text = handle.read()
    except OSError:
        # FileNotFoundError is an OSError subclass, so it is covered here as well.
        return ""
    except ClientError as err:
        error_code = err.response.get("Error", {}).get("Code")
        if error_code != "NoSuchKey":
            raise
        return ""
    return text
130
+
131
+
132
def save_content(file_path: str, content: str) -> None:
    """Write text to ``file_path``, overwriting whatever is there.

    Paths containing ``s3://`` are written with a temporary file handed to
    ``smart_open`` as its ``writebuffer`` transport parameter.

    Args:
        file_path: Local filesystem path or ``s3://`` URL to write to.
        content: Text content to write.

    Raises:
        ValueError: Re-raised with the target path added to the message when the
            write raises ``ValueError``.
    """
    try:
        if "s3://" not in file_path:
            with smart_open.open(file_path, "w") as sink:
                sink.write(content)
        else:
            with (
                tempfile.NamedTemporaryFile() as spool,
                smart_open.open(file_path, "w", transport_params={"writebuffer": spool}) as sink,
            ):
                sink.write(content)
    except ValueError as e:
        raise ValueError(f"Failed to save content to {file_path!r}: {e}") from e
150
+
151
+
152
@contextmanager
def open_zip(file_path: str, headers: dict | None = None) -> Iterator[zipfile.ZipFile]:
    """Context manager yielding a ``ZipFile`` read from a path opened via ``smart_open``.

    The path may be anything ``smart_open`` understands (local, ``s3://``,
    ``https://``). For HTTPS the server has to support range requests
    (``Accept-Ranges: bytes``).

    Args:
        file_path: Path to the ZIP file — local filesystem path, ``s3://`` URL, or
            ``https://`` URL.
        headers: Optional HTTP headers (e.g. ``User-Agent`` for SEC EDGAR). When
            non-empty they are passed to ``smart_open`` as the ``headers``
            transport parameter.

    Yields:
        An open ``zipfile.ZipFile``. Both the archive and the underlying stream
        are closed when the context manager exits.

    Raises:
        zipfile.BadZipFile: If the file is not a valid ZIP archive.
        OSError: If the file cannot be opened or read.
    """
    transport: dict = {}
    if headers:
        transport["headers"] = headers

    with (
        smart_open.open(file_path, "rb", transport_params=transport) as raw_stream,
        zipfile.ZipFile(raw_stream) as archive,
    ):
        yield archive
@@ -0,0 +1,236 @@
1
+ Metadata-Version: 2.4
2
+ Name: idi-ftm2j-shared
3
+ Version: 0.1.1
4
+ Summary: Shared runtime utilities and Pulumi factories for FTM2J processors
5
+ Author-email: UChicago DSI Core Facility <corefacility@uchicago.edu>
6
+ License-Expression: BSD-3-Clause
7
+ License-File: LICENSE
8
+ Requires-Python: >=3.13
9
+ Requires-Dist: requests>=2.33.1
10
+ Requires-Dist: smart-open[s3]>=7.6.0
11
+ Requires-Dist: tqdm>=4.67.3
12
+ Requires-Dist: watchtower>=3.4.0
13
+ Description-Content-Type: text/markdown
14
+
15
+ # idi-ftm2j-shared
16
+
17
+ Shared AWS infrastructure for the FTM2J terminal ecosystem. Two independent Pulumi stacks — deploy bootstrap first, then shared.
18
+
19
+ ---
20
+
21
+ ## `pulumi-bootstrap` — GitHub Actions OIDC
22
+
23
+ Provisions the account-level OIDC identity provider and the two IAM roles that all `dsi-clinic` repos use to authenticate with AWS from GitHub Actions.
24
+
25
+ > **Run locally.** This stack must be deployed from a workstation with AWS credentials — it creates the very roles that CI uses, so CI cannot deploy it itself.
26
+
27
+ ```bash
28
+ cd pulumi-bootstrap
29
+ pulumi stack select dev
30
+ pulumi preview
31
+ pulumi up
32
+ ```
33
+
34
+ **Roles created:**
35
+
36
+ | Role | Assumed by | Access |
37
+ |------|-----------|--------|
38
+ | `checks` | Pull requests, manual `workflow_dispatch` runs | Read-only (`pulumi preview`) |
39
+ | `deploy` | Pushes to `main`, `dev`, `release/**` | Full deploy (`pulumi up`) |
40
+
41
+ Both roles trust any repository in the `dsi-clinic` org — no updates needed when new repos are added.
42
+
43
+ ---
44
+
45
+ ## `pulumi` — Shared Infrastructure
46
+
47
+ Provisions the AWS resources shared across all FTM2J processor pipelines. Individual processor stacks reference these outputs rather than creating their own copies.
48
+
49
+ ```bash
50
+ cd pulumi
51
+ pulumi stack select dev
52
+ pulumi preview
53
+ pulumi up
54
+ ```
55
+
56
+ **Resources:**
57
+
58
+ | Resource | Description |
59
+ |----------|-------------|
60
+ | S3 bucket | Pipeline input, output, and failure storage. Encrypted at rest; retained on stack destroy to prevent data loss. |
61
+ | S3 VPC gateway endpoint | Routes S3 traffic over the private AWS network, avoiding internet egress from ECS tasks. |
62
+ | SQS dead-letter queue | Captures EventBridge Scheduler invocation failures for inspection and replay. |
63
+
64
+ **Stack outputs** consumed by downstream processor stacks:
65
+
66
+ ```
67
+ processor_bucket_name
68
+ processor_bucket_arn
69
+ s3_endpoint_id
70
+ s3_endpoint_arn
71
+ dlq_url
72
+ dlq_arn
73
+ ```
74
+
75
+ > `deploy.yml` is path-filtered: the version/publish jobs run only when `src/**` or `pyproject.toml` changed, and the Pulumi deploy job runs only when `pulumi-shared/**` changed.
76
+
77
+ ---
78
+
79
+ # development + contributing
80
+
81
+ Install all dependency groups (includes `dev` tools: pytest, ruff):
82
+
83
+ ```bash
84
+ uv sync --all-groups
85
+ ```
86
+
87
+ ## tests
88
+
89
+ ```bash
90
+ uv run pytest
91
+ ```
92
+
93
+ ## linting + formatting
94
+
95
+ ```bash
96
+ uv run ruff check . # lint
97
+ uv run ruff format . # format
98
+ ```
99
+
100
+ ## code style
101
+
102
+ | Rule | Value |
103
+ |---|---|
104
+ | Line length | 100 characters |
105
+ | Docstring convention | Google (`pydocstyle`) |
106
+ | Type annotations | Required on all public functions and classes |
107
+ | String quotes | Double-quoted (ruff `Q` ruleset) |
108
+
109
+ ## branching strategy + versioning
110
+
111
+ Two-branch model with short-lived issue branches.
112
+
113
+ ### long-lived branches
114
+
115
+ | Branch | Purpose | Version style | Deploy target |
116
+ | ------ | ------------ | ---------------------------- | ------------- |
117
+ | `dev` | Integration | `X.Y.Z-alphaN` (pre-release) | `dev` stack |
118
+ | `main` | Production | `X.Y.Z` (stable) | `prod` stack |
119
+
120
+ Both branches are protected. All changes occur via pull request.
121
+
122
+ ### short-lived branches
123
+
124
+ - **`issue-<number>-<slug>`** — feature, bug-fix, and chore work.
125
+ - Branch from `dev`, PR back to `dev`.
126
+ - While the PR is open, only [`checks.yml`](../.github/workflows/checks.yml) runs (lint, tests, security, Pulumi preview). Pushes to the issue branch do not bump the version or deploy.
127
+ - On merge, the push to `dev` triggers [`deploy.yml`](../.github/workflows/deploy.yml): bumps the alpha version and deploys the `dev` stack.
128
+ - Note: Prefer this naming convention for branches, because branches that follow it can be manually deployed to the `dev` stack for testing. See [Manual deploys](#manual-deploys).
129
+ - **Hotfix** — urgent production fix.
130
+ - Branch from `main` as `issue-<number>-hotfix-<slug>`, PR back to `main`.
131
+ - After release, merge `main` back into `dev` (see [Syncing main back into dev](#3-syncing-main-back-into-dev)).
132
+
133
+ ### ci/cd pipelines
134
+
135
+ Validation and deployment are split across two workflows:
136
+
137
+ - [`checks.yml`](../.github/workflows/checks.yml) — runs on every PR, required before merge. Lint, tests, security scan, Pulumi preview.
138
+ - [`deploy.yml`](../.github/workflows/deploy.yml) — runs on push to `dev` or `main` (i.e. after a merge). Bumps version, tags, releases, deploys Pulumi, publishes to PyPI (`main` only, if the repo includes a package).
139
+
140
+ ### versioning
141
+
142
+ Versions live in `pyproject.toml` and are bumped by `deploy.yml` using `uv version`.
143
+
144
+ | Trigger | Bump command | Example |
145
+ | -------------------------------- | -------------------------------------- | --------------------- |
146
+ | Push to `dev`, no existing alpha | `uv version --bump patch --bump alpha` | `1.4.0` → `1.4.1a1` |
147
+ | Push to `dev`, existing alpha | `uv version --bump alpha` | `1.4.1a1` → `1.4.1a2` |
148
+ | Push to `main` | `uv version --bump stable` | `1.4.1a3` → `1.4.1` |
149
+
150
+ Each successful deploy:
151
+
152
+ 1. Commits the bumped `pyproject.toml` + `uv.lock` with `[skip ci]`.
153
+ 2. Pushes a `vX.Y.Z[aN]` git tag.
154
+ 3. Creates a GitHub Release — pre-release on `dev`, stable on `main`.
155
+ 4. On `main`: builds the wheel/sdist and publishes to PyPI (if the repo ships a package).
156
+
157
+ ### development cycle
158
+
159
+ #### 1. dev → issue → alpha release
160
+
161
+ ```
162
+ PR
163
+ issue-123-add-feature ────────────────────────────────► dev
164
+ ▲ │
165
+ │ branch │ push triggers deploy.yml
166
+ │ ▼
167
+ dev ◄──────────────────────────────────── 1.4.1a1, 1.4.1a2, ...
168
+ merge deployed to dev stack
169
+ ```
170
+
171
+ 1. `git switch dev && git pull`
172
+ 2. `git switch -c issue-123-add-feature`
173
+ 3. Commit, push, open PR targeting `dev`. `checks.yml` runs.
174
+ 4. Merge the PR (squash recommended). The push to `dev` triggers `deploy.yml`:
175
+ - Bumps to the next alpha (`1.4.1a1` if no alpha exists yet, otherwise increments the alpha counter).
176
+ - Tags, creates a pre-release, deploys the `dev` Pulumi stack, publishes the image. PyPI publish is skipped.
177
+ 5. More issue PRs into `dev` keep stacking alphas (`1.4.1a2`, `1.4.1a3`, …) on the same patch line until a stable release cuts that line off.
178
+
179
+ #### 2. dev → main → stable release
180
+
181
+ ```
182
+ dev (1.4.1a3) ───────── PR ─────────► main
183
+ ▲ │ push triggers deploy.yml
184
+ │ ▼
185
+ ◄────────────────────────────────── 1.4.1 (stable)
186
+ sync/merge deployed to prod stack
187
+ published to PyPI
188
+ ```
189
+
190
+ 1. When `dev` is ready to ship, open a PR from `dev` → `main`. `checks.yml` runs against the `prod` Pulumi stack preview.
191
+ 2. Review and merge. **Do not squash** — preserve the alpha history so release notes capture every change. A merge commit is fine.
192
+ 3. The push to `main` triggers `deploy.yml`:
193
+ - `uv version --bump stable` drops the `aN` suffix (`1.4.1a3` → `1.4.1`).
194
+ - Tags `v1.4.1`, creates a stable GitHub Release, deploys the `prod` Pulumi stack, publishes to PyPI (if applicable).
195
+
196
+ #### 3. syncing main back into dev
197
+
198
+ After every stable release (and any hotfix that lands directly on `main`), merge `main` back into `dev` so `dev` stays ahead of `main` and the histories stay aligned.
199
+
200
+ ```bash
201
+ git switch main && git pull
202
+ git switch dev && git pull
203
+ git merge main # bring in the stable bump commit + any hotfixes
204
+ git push
205
+ ```
206
+
207
+ The next push to `dev` produces `1.4.2a1` — a new alpha line above the just-released `1.4.1`.
208
+
209
+ On a `pyproject.toml` conflict, keep `main`'s stable version. The next `dev` deploy bumps from there.
210
+
211
+ ### manual deploys
212
+
213
+ `deploy.yml` accepts `workflow_dispatch`:
214
+
215
+ - From `dev` it deploys the `dev` stack.
216
+ - From `main` it deploys the `prod` stack.
217
+
218
+ Use this to redeploy Pulumi without a code change (e.g. after rotating a secret). Version/publish jobs stay gated on `src/**` changes.
219
+
220
+ ### summary
221
+
222
+ - `dev` is the only place new work lands; every merge produces an alpha.
223
+ - `main` cuts stable releases from whatever alpha `dev` is on.
224
+ - After every release on `main`, merge `main` back into `dev`.
225
+
226
+ ## branch protection rules
227
+
228
+ - Default branch is set to `dev`
229
+ - There are two rulesets: `dev` and `main`
230
+ - Deploy keys are added to the bypass list and set to "Always allow"
231
+ - The branch targeting criteria are set to either `dev` or `main`
232
+ - ✅ Restrict deletions
233
+ - ✅ Require a pull request before merging
234
+ - ✅ Require status checks to pass: Lint, Test, Security, Pulumi Preview
235
+ - ✅ Block force pushes
236
+ - ✅ Require code scanning results; set to CodeQL security alerts "High or higher"
@@ -0,0 +1,9 @@
1
+ idi_ftm2j_shared/__init__.py,sha256=78ErW2BC3eZrnPznjDscP2gBGelm2fHTfz4L8WI_5AA,145
2
+ idi_ftm2j_shared/api.py,sha256=25BV2YXKEEMRA8WTNKcvhcTPHwx9bxUC_Li-9xuLCN0,9400
3
+ idi_ftm2j_shared/failures.py,sha256=x80FSSehzvPNzChFn98px5sQVplerLCpIP4Vix9NADA,5394
4
+ idi_ftm2j_shared/logs.py,sha256=wO_d6RZAVmAVZxQqSfpNImx8-CsjEEqN4r5xQEuyEWM,5609
5
+ idi_ftm2j_shared/storage.py,sha256=M_5IeAB1v5sXvPPQ-t0k1bVo_vhVzR8Q1-UkML7FWkE,6214
6
+ idi_ftm2j_shared-0.1.1.dist-info/METADATA,sha256=96H4qeqe8Naa7bDPzAkpth1V0ryS3OZpkyI0y6CuyYs,9456
7
+ idi_ftm2j_shared-0.1.1.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
8
+ idi_ftm2j_shared-0.1.1.dist-info/licenses/LICENSE,sha256=qlWEZ_QLy9KO01sLoPcyXHd9-nqrPqPPTSTA9hebLfE,1515
9
+ idi_ftm2j_shared-0.1.1.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,28 @@
1
+ BSD 3-Clause License
2
+
3
+ Copyright (c) 2026, UChicago Data Science Clinic
4
+
5
+ Redistribution and use in source and binary forms, with or without
6
+ modification, are permitted provided that the following conditions are met:
7
+
8
+ 1. Redistributions of source code must retain the above copyright notice, this
9
+ list of conditions and the following disclaimer.
10
+
11
+ 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ this list of conditions and the following disclaimer in the documentation
13
+ and/or other materials provided with the distribution.
14
+
15
+ 3. Neither the name of the copyright holder nor the names of its
16
+ contributors may be used to endorse or promote products derived from
17
+ this software without specific prior written permission.
18
+
19
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.