alias-mapper 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,305 @@
1
+ """
2
+ bootstrap.py
3
+ ------------
4
+ First-run setup: download the latest alias TSV from GitHub Releases
5
+ and build a local SQLite database.
6
+
7
+ The CLI checks for a cached local DB on startup. If it's missing,
8
+ this module runs to bring it into being. The same code path is
9
+ exposed as `alias-mapper update` for manual refresh.
10
+
11
+ Design notes:
12
+ - The TSV is the source of truth; the local DB is a derived,
13
+ disposable cache. `update` always does a full rebuild rather
14
+ than trying to merge new rows into an existing DB.
15
+ - Latest-release discovery uses the GitHub API to find the most
16
+ recent `data-*` tagged release. We don't trust filesystem dates
17
+ or any client-side heuristic; the server is authoritative.
18
+ - All failure paths print a clear manual fallback (download the
19
+ TSV yourself, run build_alias_db.py yourself) so a user with a
20
+ flaky network or rate-limited GitHub can always work around it.
21
+ """
22
+
23
+ import json
24
+ import sys
25
+ import urllib.error
26
+ import urllib.request
27
+ from pathlib import Path
28
+
29
+ import platformdirs
30
+
31
+ from .build_alias_db import build_db
32
+ from .alias_source import StaleSchemaError, verify_schema_version
33
+ from ._ssl import SSL_CONTEXT
34
+
35
+
36
# Where the project lives on GitHub. Defined once at the top of the module
# so a fork or a repository move only needs an edit here.
GITHUB_OWNER = "guigolab"
GITHUB_REPO = "alias-mapper"
RELEASES_API_URL = "/".join(
    ("https://api.github.com/repos", GITHUB_OWNER, GITHUB_REPO, "releases")
)

# Filename of the TSV asset attached to every data release.
TSV_ASSET_NAME = "aliases.tsv.gz"

# Tag prefix that tells data releases apart from code releases (e.g. v1.0.0).
DATA_RELEASE_PREFIX = "data-"

# Basename of the locally cached database file.
LOCAL_DB_NAME = "aliases.db"

# After a successful build the downloaded TSV stays next to the DB instead
# of being removed. That way a schema-only rebuild (the cached DB no longer
# matches what an upgraded CLI expects) can run from the local copy with no
# network access. `alias-mapper update` re-downloads regardless, because
# fetching fresher data is the reason it exists.
LOCAL_TSV_NAME = TSV_ASSET_NAME

# Sent as the User-Agent header on every HTTP request. GitHub prefers a
# real identifier, and some endpoints refuse requests that lack one.
USER_AGENT = "{repo}/bootstrap (https://github.com/{owner}/{repo})".format(
    owner=GITHUB_OWNER,
    repo=GITHUB_REPO,
)
62
+
63
+
64
class BootstrapError(Exception):
    """Signals that first-run setup failed; the CLI turns it into a user-facing error."""
66
+
67
+
68
def default_cache_path() -> Path:
    """
    Locate the cached DB inside the per-user cache directory.

    The directory is whatever `platformdirs.user_cache_dir` reports for
    this project, for example:

        macOS:   ~/Library/Caches/alias-mapper/aliases.db
        Linux:   ~/.cache/alias-mapper/aliases.db
        Windows: %LOCALAPPDATA%\\alias-mapper\\Cache\\aliases.db
    """
    return Path(platformdirs.user_cache_dir(GITHUB_REPO), LOCAL_DB_NAME)
78
+
79
+
80
def _http_get_json(url: str):
    """
    GET a URL and return its body parsed as JSON.

    Args:
        url: The URL to request. GitHub's JSON media type is requested
            via the Accept header.

    Returns:
        The decoded JSON value (whatever `json.loads` produces).

    Raises:
        BootstrapError: on an HTTP error status (with a dedicated message
            when the body indicates rate limiting), on any network-level
            failure, or when the body is not valid UTF-8 JSON. The
            original exception is chained as `__cause__`.
    """
    req = urllib.request.Request(
        url,
        headers={
            "User-Agent": USER_AGENT,
            "Accept": "application/vnd.github+json",
        },
    )
    try:
        with urllib.request.urlopen(req, timeout=30, context=SSL_CONTEXT) as r:
            raw = r.read()
    except urllib.error.HTTPError as e:
        # Reading the error body is best-effort: it is only used to tell
        # rate limiting apart from other failures, so any problem here
        # simply means "no body available".
        try:
            body = e.read().decode("utf-8", errors="replace")
        except Exception:
            body = ""
        # GitHub reports rate limiting with either 403 or 429.
        if e.code in (403, 429) and "rate limit" in body.lower():
            raise BootstrapError(
                "GitHub API rate limit exceeded. Retry later, or download "
                f"{TSV_ASSET_NAME} manually from "
                f"https://github.com/{GITHUB_OWNER}/{GITHUB_REPO}/releases"
            ) from e
        raise BootstrapError(f"GitHub API returned HTTP {e.code}: {e}") from e
    except urllib.error.URLError as e:
        raise BootstrapError(f"could not reach GitHub API: {e.reason}") from e
    except OSError as e:
        # Timeouts and connection resets that happen while reading the
        # body are plain OSError subclasses, not URLError.
        raise BootstrapError(f"could not reach GitHub API: {e}") from e

    # Parse outside the network try-block so a decoding problem is reported
    # as such. UnicodeDecodeError and json.JSONDecodeError both derive from
    # ValueError.
    try:
        return json.loads(raw.decode("utf-8"))
    except ValueError as e:
        raise BootstrapError(
            f"GitHub API returned a response that is not valid JSON: {e}"
        ) from e
107
+
108
+
109
def find_latest_data_release_url() -> str:
    """
    Query the GitHub API for the most recent data-* release.

    Returns:
        The `browser_download_url` of that release's aliases.tsv.gz asset.

    Raises:
        BootstrapError: if the API call fails, the response does not have
            the expected shape, no data release is listed, or the release
            has no downloadable aliases.tsv.gz asset.
    """
    print(" Looking up latest data release on GitHub...", file=sys.stderr)
    releases = _http_get_json(RELEASES_API_URL)

    # The endpoint is expected to return a JSON array. Anything else (for
    # instance an error object) would otherwise fail further down with an
    # unhelpful AttributeError.
    if not isinstance(releases, list):
        raise BootstrapError(
            f"unexpected response from the GitHub releases API "
            f"(expected a list, got {type(releases).__name__}). Check "
            f"https://github.com/{GITHUB_OWNER}/{GITHUB_REPO}/releases"
        )

    # Take the first data-* entry. This relies on the API listing the
    # newest releases first, and only the first page of results is
    # inspected. Entries with a missing, null or non-string tag are skipped.
    latest = next(
        (
            r for r in releases
            if isinstance(r, dict)
            and isinstance(r.get("tag_name"), str)
            and r["tag_name"].startswith(DATA_RELEASE_PREFIX)
        ),
        None,
    )

    if latest is None:
        raise BootstrapError(
            f"no data release found in the {GITHUB_OWNER}/{GITHUB_REPO} repo. "
            f"This shouldn't happen unless the weekly workflow has never run "
            f"successfully. Check "
            f"https://github.com/{GITHUB_OWNER}/{GITHUB_REPO}/releases"
        )

    tag = latest["tag_name"]
    print(f" Found: {tag}", file=sys.stderr)

    # Find the aliases.tsv.gz asset. An asset entry without a download URL
    # is treated the same as a missing asset.
    for asset in latest.get("assets") or []:
        if isinstance(asset, dict) and asset.get("name") == TSV_ASSET_NAME:
            download_url = asset.get("browser_download_url")
            if download_url:
                return download_url

    raise BootstrapError(
        f"data release {tag} exists but does not contain {TSV_ASSET_NAME}. "
        f"Check the assets at "
        f"https://github.com/{GITHUB_OWNER}/{GITHUB_REPO}/releases/tag/{tag}"
    )
149
+
150
+
151
def download_with_progress(url: str, dest: Path) -> None:
    """
    Stream a URL to a local file, printing a progress line as bytes arrive.

    Data is written to a sibling `.part` file and renamed onto `dest` only
    after the whole body has arrived, so an interrupted download never
    leaves a half-finished file looking like a finished one. The `.part`
    file is removed on every failure path, including Ctrl-C.

    Args:
        url: Location to download from.
        dest: Final path of the downloaded file; parent directories are
            created if needed.

    Raises:
        BootstrapError: on an HTTP error status, a network or filesystem
            failure, or when fewer/more bytes arrive than the server
            announced in Content-Length.
    """
    # Function-scope import: only the HTTPException type is needed here.
    import http.client

    part = dest.with_suffix(dest.suffix + ".part")
    part.parent.mkdir(parents=True, exist_ok=True)

    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        try:
            with urllib.request.urlopen(req, timeout=60, context=SSL_CONTEXT) as response:
                # A missing or malformed Content-Length just means the
                # total is unknown; it must not abort the download.
                total_str = (response.headers.get("Content-Length") or "").strip()
                total = int(total_str) if total_str.isdigit() else None
                downloaded = 0
                chunk_size = 64 * 1024  # 64 KB

                try:
                    with open(part, "wb") as f:
                        while chunk := response.read(chunk_size):
                            f.write(chunk)
                            downloaded += len(chunk)
                            _print_progress(downloaded, total)
                finally:
                    # Newline after the carriage-return progress line so
                    # the next stderr message (including an error) doesn't
                    # overwrite it.
                    print("", file=sys.stderr)

            # read(amt) can return b"" on a dropped connection without
            # raising, so verify the byte count ourselves before trusting
            # the file.
            if total is not None and downloaded != total:
                raise BootstrapError(
                    f"download incomplete: received {downloaded} of {total} bytes"
                )

            # Atomic rename: only after the full download succeeds.
            part.replace(dest)
        except urllib.error.HTTPError as e:
            raise BootstrapError(f"download failed with HTTP {e.code}: {e}") from e
        except urllib.error.URLError as e:
            raise BootstrapError(f"download failed: {e.reason}") from e
        except (OSError, http.client.HTTPException) as e:
            # Timeouts, connection resets, disk errors and protocol-level
            # failures are neither HTTPError nor URLError.
            raise BootstrapError(f"download failed: {e}") from e
    except BaseException:
        # Covers BootstrapError as well as KeyboardInterrupt: never leave
        # a partial file behind. Cleanup is best-effort so it cannot mask
        # the original failure.
        try:
            part.unlink(missing_ok=True)
        except OSError:
            pass
        raise
195
+
196
+
197
+ def _print_progress(downloaded: int, total: int | None) -> None:
198
+ """Print a one-line progress indicator that updates in place."""
199
+ mb_down = downloaded / (1024 * 1024)
200
+ if total:
201
+ mb_total = total / (1024 * 1024)
202
+ pct = (downloaded / total) * 100
203
+ msg = f" Downloading... {mb_down:6.2f} MB / {mb_total:6.2f} MB ({pct:5.1f}%)"
204
+ else:
205
+ msg = f" Downloading... {mb_down:6.2f} MB"
206
+ sys.stderr.write("\r" + msg)
207
+ sys.stderr.flush()
208
+
209
+
210
def ensure_db(db_path: Path | None = None, force: bool = False) -> Path:
    """
    Ensure a local DB exists, downloading and building it if needed.

    Args:
        db_path: Where to place the DB. Defaults to the platform cache path.
        force: If True, rebuild even if the DB already exists. Used by
            `alias-mapper update`.

    Returns:
        The path to the DB (same as db_path, or the default if None).

    Raises:
        BootstrapError: if the release lookup, the download, or the
            database build fails.
    """
    if db_path is None:
        db_path = default_cache_path()

    # Tracks why we're (re)building, which decides whether a locally
    # cached TSV may be reused instead of re-downloading.
    stale_schema_rebuild = False

    if db_path.exists() and not force:
        # A stale cache forces a rebuild even if the user didn't ask for
        # one. This is the silent-upgrade path when a user updates the CLI
        # and their old DB no longer matches the expected schema.
        try:
            verify_schema_version(db_path)
        except StaleSchemaError as e:
            print(
                f"Cached alias DB has stale schema ({e}); rebuilding...",
                file=sys.stderr,
            )
            force = True
            stale_schema_rebuild = True
        else:
            return db_path

    if force and db_path.exists():
        print(f"Refreshing alias database at {db_path}", file=sys.stderr)
    else:
        print("No local alias database found. Setting up...", file=sys.stderr)

    db_path.parent.mkdir(parents=True, exist_ok=True)
    tsv_path = db_path.parent / LOCAL_TSV_NAME

    # A schema-only rebuild may reuse a cached TSV: the data is current,
    # only the DB shape is stale. An explicit `update` (force=True but not
    # stale_schema_rebuild) always re-downloads, since fetching fresher
    # data is the point. First runs have no TSV and download as well.
    rebuilt_offline = stale_schema_rebuild and _rebuild_from_cached_tsv(tsv_path, db_path)
    if not rebuilt_offline:
        _download_and_build(tsv_path, db_path)

    # The TSV is kept on success; it is the offline-rebuild fallback for
    # the next schema bump.
    print(f" Cached at {db_path}", file=sys.stderr)
    return db_path


def _rebuild_from_cached_tsv(tsv_path: Path, db_path: Path) -> bool:
    """Try to rebuild the DB from an already-downloaded TSV; return True on success."""
    if not tsv_path.exists():
        return False

    print(
        f" Reusing cached TSV at {tsv_path} (offline rebuild).",
        file=sys.stderr,
    )
    try:
        build_db(tsv_path, db_path)
    except (OSError, ValueError, EOFError) as e:
        # The cached TSV is unusable; let the caller download a fresh one
        # rather than failing. OSError (which includes FileNotFoundError)
        # and EOFError are caught because a corrupt or truncated gzip
        # raises those -- assumes build_db lets them propagate; TODO confirm.
        print(
            f" Cached TSV could not be used ({e}); downloading fresh...",
            file=sys.stderr,
        )
        return False
    return True


def _download_and_build(tsv_path: Path, db_path: Path) -> None:
    """Download the latest data release TSV to tsv_path and build the DB from it."""
    url = find_latest_data_release_url()
    download_with_progress(url, tsv_path)
    print(" Building local database from TSV...", file=sys.stderr)
    try:
        build_db(tsv_path, db_path)
    except (OSError, ValueError, EOFError) as e:
        # Don't keep a freshly downloaded TSV that failed to build; it
        # would only be picked up again by a later offline rebuild.
        tsv_path.unlink(missing_ok=True)
        raise BootstrapError(f"build failed: {e}") from e