alias-mapper 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,407 @@
1
+ """
2
+ build_alias_db.py
3
+ -----------------
4
+ Builds a SQLite alias database from a TSV produced by the weekly
5
+ collection workflow.
6
+
7
+ The TSV is the merged-row format (schema v3): one row per assembly,
8
+ with per-molecule data as pipe-separated (|), position-aligned list
9
+ columns. This builder explodes those lists back into per-molecule
10
+ rows for indexed lookup at query time. The TSV is the human-readable
11
+ source of truth; the DB is whatever shape is fastest for queries.
12
+
13
+ The same code is invoked two ways:
14
+ - Via the CLI's bootstrap flow on first run (and on `update`)
15
+ - Via scripts/build_alias_db.py from the GitHub Actions workflow
16
+
17
+ Schema:
18
+ - `_meta`: schema version + build date. Used by SqliteAliasSource
19
+ to detect stale caches and trigger rebuild.
20
+ - `assemblies`: one row per assembly. Holds assembly-level metadata
21
+ (organism, taxid, level, etc) plus the paired RefSeq assembly
22
+ accession when known. Small (~50k rows).
23
+ - `aliases`: one row per molecule (exploded from the TSV's list
24
+ columns). Holds per-molecule names. Big (~3M rows).
25
+ - Single index on aliases(accession). Other indexes are intentionally
26
+ not created — the CLI's only filter is by accession, and unused
27
+ indexes would inflate the file by hundreds of MB for no benefit.
28
+
29
+ Destructive idempotency: the DB is dropped and rebuilt on each run.
30
+ """
31
+
32
+ import csv
33
+ import gzip
34
+ import sqlite3
35
+ import sys
36
+ from datetime import date
37
+ from pathlib import Path
38
+
39
# Bumped whenever the SQLite schema changes incompatibly. Kept here
# rather than imported from alias_source to avoid a circular import
# at script-invocation time; alias_source imports the same number.
#
# Kept as an int in code so versions can be compared numerically
# ("is this cache older than v3?") without string-vs-numeric pitfalls.
# Note that `_meta.value` is a TEXT column and build_db() writes
# str(SCHEMA_VERSION), so a reader has to convert the stored value back
# with int() before comparing.
SCHEMA_VERSION = 3
46
+
47
# Key/value metadata table. build_db() writes two rows into it,
# "schema_version" and "build_date"; both values are stored as TEXT.
CREATE_META_SQL = """
CREATE TABLE _meta (
    key TEXT PRIMARY KEY,
    value TEXT
)
"""

# One row per assembly, keyed by the assembly-level GenBank accession.
# `last_updated` is filled by build_db() with the build date.
CREATE_ASSEMBLIES_SQL = """
CREATE TABLE assemblies (
    accession TEXT PRIMARY KEY, -- assembly-level GenBank acc (GCA_*)
    assembly_name TEXT,
    paired_refseq_acc TEXT, -- assembly-level RefSeq acc (GCF_*), if paired
    taxid INTEGER,
    organism_name TEXT,
    group_name TEXT, -- "group" is a reserved word in SQL
    assembly_level TEXT,
    genome_coverage_pct REAL, -- kept-molecule length / genome_size, percent
    genome_coverage_ungapped_pct REAL, -- kept-molecule length / genome_size_ungapped, percent
    last_updated TEXT
)
"""

# One row per molecule, exploded from the TSV's list columns.
# `accession` refers to assemblies.accession by convention only: this
# statement declares neither a FOREIGN KEY constraint nor a primary key.
CREATE_ALIASES_SQL = """
CREATE TABLE aliases (
    accession TEXT NOT NULL, -- FK to assemblies.accession
    position INTEGER NOT NULL, -- 0-based, longest molecule first
    sequence_name TEXT,
    assigned_molecule TEXT,
    genbank_acc TEXT, -- per-sequence (e.g. CM000663.2)
    refseq_acc TEXT, -- per-sequence (e.g. NC_000001.11)
    ucsc_name TEXT,
    length INTEGER
)
"""

# Single index. The CLI always filters by accession; other columns are
# returned but not used as filters. Adding more indexes would inflate
# the DB by ~200 MB for queries the CLI never makes.
# build_db() creates it only after all rows have been inserted.
INDEX_SQL = "CREATE INDEX idx_accession ON aliases(accession)"
86
+
87
# Header columns that a schema-v3 (merged-row) TSV must provide. The
# order here is also the order in which missing columns are reported.
EXPECTED_TSV_COLS = [
    "genbank_acc",
    "refseq_acc",
    "assembly_name",
    "taxid",
    "organism_name",
    "group",
    "assembly_level",
    "sequence_names",
    "genbank_seq_accs",
    "refseq_seq_accs",
    "ucsc_names",
    "assigned_molecules",
    "lengths",
    "genome_coverage_pct",
    "genome_coverage_ungapped_pct",
]

# Header names characteristic of the old schema-v1 (per-molecule) TSV.
# They exist purely for detection: when all of them are present we can
# raise a targeted "wrong schema" error rather than a long list of
# missing column names.
V1_TSV_COLS_MARKER = {
    "ACCESSION",
    "GENBANK_ACC",
    "REFSEQ_ACC",
    "LENGTH",
}

# Separator used inside a list cell of the merged-row TSV. Must match
# collect_aliases.LIST_DELIM. A pipe is used instead of a comma because
# NCBI Sequence-Name / Assigned-Molecule values can themselves contain
# commas, which would break position alignment.
LIST_DELIM = "|"

# Per-molecule list columns as (TSV column, SQLite `aliases` column)
# pairs: the TSV name comes first, the SQLite name second.
LIST_COLUMN_MAP = [
    ("sequence_names", "sequence_name"),
    ("assigned_molecules", "assigned_molecule"),
    ("genbank_seq_accs", "genbank_acc"),
    ("refseq_seq_accs", "refseq_acc"),
    ("ucsc_names", "ucsc_name"),
    ("lengths", "length"),
]
118
+
119
+
120
def open_tsv(path: Path):
    """Open the TSV for text reading, transparently handling .gz compression.

    The file is opened with ``newline=""`` because the result is fed to
    the ``csv`` module, which wants to see line endings untranslated;
    otherwise CR/LF characters inside quoted fields are rewritten by the
    universal-newlines layer before the parser sees them.

    Args:
        path: Location of the TSV. A plain string is accepted as well
            as a ``Path``.

    Returns:
        A UTF-8 text-mode file object. The caller is responsible for
        closing it (typically via ``with``).
    """
    path = Path(path)
    if path.suffix == ".gz":
        return gzip.open(path, "rt", encoding="utf-8", newline="")
    return open(path, "r", encoding="utf-8", newline="")
125
+
126
+
127
def _text_or_none(v):
    """Strip surrounding whitespace from *v*; return None if nothing remains.

    Falsy inputs (None, "") are treated like an empty string.
    """
    stripped = v.strip() if v else ""
    return stripped or None
131
+
132
+
133
def _int_or_none(v):
    """Parse *v* as an integer, returning None when it does not look like one.

    Accepted shape after stripping whitespace: an optional single leading
    "-" followed by characters for which ``str.isdigit`` is true. Anything
    else (empty, None, "+5", "1.5", "1_000") yields None, as does a
    digit-like string that ``int`` itself rejects.
    """
    text = (v or "").strip()
    # Only the part after an optional minus sign has to be all digits.
    magnitude = text[1:] if text.startswith("-") else text
    if not magnitude.isdigit():
        return None
    try:
        return int(text)
    except ValueError:
        # isdigit() is true for some characters int() cannot parse.
        return None
142
+
143
+
144
def _float_or_none(v):
    """Parse *v* as a float; empty, None or unparseable input gives None."""
    text = v.strip() if v else ""
    try:
        return float(text) if text else None
    except ValueError:
        return None
153
+
154
+
155
def _explode_row(row: dict[str, str]) -> tuple[dict, list[tuple]]:
    """Split one merged TSV row into an assembly record and molecule rows.

    Args:
        row: One ``csv.DictReader`` row of the merged-row TSV. The
            assembly-level columns must be present as keys; list columns
            and the two coverage columns may be missing or empty.

    Returns:
        A pair ``(assembly_record, molecule_tuples)``:

        - ``assembly_record``: dict of assembly-level values keyed by the
          ``assemblies`` column names (without ``last_updated``).
        - ``molecule_tuples``: one tuple per molecule, in position order,
          laid out as ``(accession, position, sequence_name,
          assigned_molecule, genbank_acc, refseq_acc, ucsc_name, length)``.

    Raises:
        ValueError: if the row has no ``genbank_acc``, or if its
            non-empty list columns do not all have the same number of
            entries.
        KeyError: if a required assembly-level column is absent.
    """
    accession = _text_or_none(row["genbank_acc"])
    if not accession:
        # The TSV writer should never produce a row without a genbank_acc.
        # If it ever does, fail loudly rather than silently drop.
        raise ValueError(f"TSV row missing genbank_acc: {row!r}")

    assembly_record = {
        "accession": accession,
        "assembly_name": _text_or_none(row["assembly_name"]),
        "paired_refseq_acc": _text_or_none(row["refseq_acc"]),
        "taxid": _int_or_none(row["taxid"]),
        "organism_name": _text_or_none(row["organism_name"]),
        "group_name": _text_or_none(row["group"]),
        "assembly_level": _text_or_none(row["assembly_level"]),
        "genome_coverage_pct": _float_or_none(row.get("genome_coverage_pct", "")),
        "genome_coverage_ungapped_pct": _float_or_none(row.get("genome_coverage_ungapped_pct", "")),
    }

    # Split each list column on the list delimiter. A completely empty
    # (or missing) cell means "no data for this column" and becomes [];
    # splitting it would give [""], i.e. one phantom molecule. A cell
    # that still contains delimiters (e.g. "||") is a real list of empty
    # entries and takes part in the alignment check below.
    split_lists = {}
    for tsv_col, _sql_col in LIST_COLUMN_MAP:
        raw = row.get(tsv_col, "") or ""
        split_lists[tsv_col] = raw.split(LIST_DELIM) if raw else []

    # Validate position alignment. All non-empty lists must have the
    # same length. (Empty lists are tolerated for fields that genuinely
    # have no data, e.g. UCSC names absent for non-vertebrates.)
    lengths = {col: len(vals) for col, vals in split_lists.items() if vals}
    if len(set(lengths.values())) > 1:
        raise ValueError(
            f"TSV row for {accession} has misaligned list lengths: {lengths}"
        )
    n_molecules = next(iter(lengths.values()), 0)

    # Columns without data are padded with empty strings once, up front,
    # so the per-molecule loop is a plain zip instead of re-creating an
    # accessor closure for every molecule.
    blanks = [""] * n_molecules

    def _column(tsv_col):
        return split_lists[tsv_col] or blanks

    per_molecule = zip(
        _column("sequence_names"),
        _column("assigned_molecules"),
        _column("genbank_seq_accs"),
        _column("refseq_seq_accs"),
        _column("ucsc_names"),
        _column("lengths"),
    )
    molecule_tuples = [
        (
            accession,
            pos,
            _text_or_none(seq_name),
            _text_or_none(molecule),
            _text_or_none(genbank),
            _text_or_none(refseq),
            _text_or_none(ucsc),
            _int_or_none(length),
        )
        for pos, (seq_name, molecule, genbank, refseq, ucsc, length)
        in enumerate(per_molecule)
    ]

    return assembly_record, molecule_tuples
227
+
228
+
229
def _check_tsv_header(fieldnames, tsv_path: Path) -> None:
    """Raise ValueError unless *fieldnames* contains every schema-v3 column."""
    # fieldnames is None when the TSV has no header line at all.
    found = list(fieldnames or [])
    missing = [c for c in EXPECTED_TSV_COLS if c not in found]
    if not missing:
        return
    # Detect the v1 schema specifically so we can give a useful
    # upgrade message instead of dumping column lists.
    if V1_TSV_COLS_MARKER.issubset(found):
        raise ValueError(
            f"TSV at {tsv_path} uses schema v1 (per-molecule rows), "
            f"but this CLI expects the merged-row schema (v3). This "
            f"usually means the data release predates the schema "
            f"upgrade. Either wait for the next weekly data release, "
            f"or downgrade the CLI to a v1-compatible version."
        )
    raise ValueError(
        f"TSV missing expected columns: {missing}. "
        f"Found: {fieldnames}"
    )


def _discard_db(path: Path) -> None:
    """Best-effort removal of a SQLite file and any sidecar files next to it."""
    for suffix in ("", "-wal", "-shm", "-journal"):
        try:
            path.with_name(path.name + suffix).unlink(missing_ok=True)
        except OSError:
            # Cleanup must never mask the error that triggered it.
            pass


def _populate_db(tsv_path: Path, db_file: Path, batch_size: int) -> tuple[int, int]:
    """Create the schema in *db_file*, load *tsv_path* into it, return row counts.

    Returns ``(alias_rows, assembly_rows)`` as counted from the finished
    tables. The connection is always closed, also on failure.
    """
    conn = sqlite3.connect(db_file)
    try:
        cur = conn.cursor()

        # Speed up the bulk insert by relaxing durability guarantees.
        cur.execute("PRAGMA journal_mode = OFF")
        cur.execute("PRAGMA synchronous = OFF")
        cur.execute("PRAGMA temp_store = MEMORY")

        print("Creating tables...", file=sys.stderr)
        cur.execute(CREATE_META_SQL)
        cur.execute(CREATE_ASSEMBLIES_SQL)
        cur.execute(CREATE_ALIASES_SQL)

        today_iso = date.today().isoformat()

        cur.execute("INSERT INTO _meta (key, value) VALUES (?, ?)",
                    ("schema_version", str(SCHEMA_VERSION)))
        cur.execute("INSERT INTO _meta (key, value) VALUES (?, ?)",
                    ("build_date", today_iso))

        insert_assembly_sql = """
            INSERT OR IGNORE INTO assemblies (
                accession, assembly_name, paired_refseq_acc, taxid,
                organism_name, group_name, assembly_level,
                genome_coverage_pct, genome_coverage_ungapped_pct, last_updated
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """
        insert_alias_sql = """
            INSERT INTO aliases (
                accession, position, sequence_name, assigned_molecule,
                genbank_acc, refseq_acc, ucsc_name, length
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """

        n_assemblies = 0
        n_aliases = 0
        alias_batch: list[tuple] = []
        # Batches are flushed at uneven sizes (a row adds all of its
        # molecules at once), so progress is reported when a 100k
        # boundary is crossed rather than when it is hit exactly.
        report_every = 100_000
        next_report = report_every

        print(f"Reading {tsv_path} and inserting rows...", file=sys.stderr)
        with open_tsv(tsv_path) as f:
            reader = csv.DictReader(f, delimiter="\t")
            _check_tsv_header(reader.fieldnames, tsv_path)

            for row in reader:
                assembly_record, molecule_tuples = _explode_row(row)

                cur.execute(insert_assembly_sql, (
                    assembly_record["accession"],
                    assembly_record["assembly_name"],
                    assembly_record["paired_refseq_acc"],
                    assembly_record["taxid"],
                    assembly_record["organism_name"],
                    assembly_record["group_name"],
                    assembly_record["assembly_level"],
                    assembly_record["genome_coverage_pct"],
                    assembly_record["genome_coverage_ungapped_pct"],
                    today_iso,
                ))
                n_assemblies += 1

                alias_batch.extend(molecule_tuples)

                if len(alias_batch) >= batch_size:
                    cur.executemany(insert_alias_sql, alias_batch)
                    n_aliases += len(alias_batch)
                    alias_batch.clear()
                    if n_aliases >= next_report:
                        print(f" ... {n_aliases:>10,} molecule rows", file=sys.stderr)
                        next_report = (n_aliases // report_every + 1) * report_every

        if alias_batch:
            cur.executemany(insert_alias_sql, alias_batch)
            n_aliases += len(alias_batch)

        print(
            f"Inserted {n_aliases:,} molecule rows across "
            f"{n_assemblies:,} assemblies.",
            file=sys.stderr,
        )
        print("Creating index...", file=sys.stderr)
        cur.execute(INDEX_SQL)

        conn.commit()

        # Restore safe defaults after the bulk load.
        cur.execute("PRAGMA journal_mode = WAL")
        cur.execute("PRAGMA synchronous = NORMAL")

        cur.execute("SELECT COUNT(*) FROM aliases")
        total = cur.fetchone()[0]
        cur.execute("SELECT COUNT(*) FROM assemblies")
        asm_total = cur.fetchone()[0]
        return total, asm_total
    finally:
        conn.close()


def build_db(tsv_path: Path, db_path: Path, batch_size: int = 10_000) -> None:
    """
    Build a SQLite alias DB from a merged-row TSV (schema v3).

    The database is built in a staging file next to db_path and renamed
    over db_path only after the build has fully succeeded. A failed
    build therefore leaves any existing DB untouched and never leaves a
    partial DB behind at db_path; a successful build replaces whatever
    was there. Prints progress to stderr.

    Args:
        tsv_path: Input TSV, plain or gzipped.
        db_path: Destination of the SQLite DB. Parent directories are
            created if needed.
        batch_size: Number of molecule rows buffered before they are
            handed to a single executemany() call.

    Raises:
        FileNotFoundError: if tsv_path doesn't exist.
        ValueError: if the TSV is empty, is missing expected columns or
            has misaligned position-aligned lists.
    """
    tsv_path = Path(tsv_path)
    db_path = Path(db_path)

    if not tsv_path.exists():
        raise FileNotFoundError(f"TSV not found at {tsv_path}")

    db_path.parent.mkdir(parents=True, exist_ok=True)

    # Same directory as the target, so the final rename stays on one
    # filesystem.
    staging_path = db_path.with_name(db_path.name + ".building")
    _discard_db(staging_path)  # leftovers from an interrupted earlier run

    print(f"Creating SQLite database at {db_path}", file=sys.stderr)
    published = False
    try:
        total, asm_total = _populate_db(tsv_path, staging_path, batch_size)
        if db_path.exists():
            print(f"Removing existing DB at {db_path}", file=sys.stderr)
        staging_path.replace(db_path)
        published = True
    finally:
        if not published:
            _discard_db(staging_path)

    db_size_mb = db_path.stat().st_size / (1024 * 1024)
    print(
        f"Done.\n"
        f" Schema version: {SCHEMA_VERSION}\n"
        f" Molecule rows: {total:,}\n"
        f" Assemblies: {asm_total:,}\n"
        f" DB size: {db_size_mb:.1f} MB\n"
        f" Path: {db_path}",
        file=sys.stderr,
    )
377
+
378
+
379
def _cli():
    """Command-line entry point: parse the flags and run build_db().

    A missing TSV or an invalid TSV ends the process with an
    "error: ..." message instead of a traceback.
    """
    import argparse

    parser = argparse.ArgumentParser(description=__doc__)
    argument_specs = (
        ("--tsv", {
            "type": Path,
            "required": True,
            "help": "Path to the input TSV (gzipped or plain).",
        }),
        ("--db", {
            "type": Path,
            "required": True,
            "help": "Path to the SQLite DB to create (overwritten if it exists).",
        }),
        ("--batch-size", {
            "type": int,
            "default": 10_000,
            "help": "Number of rows to insert per transaction (default: 10000).",
        }),
    )
    for flag, options in argument_specs:
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    try:
        build_db(args.tsv, args.db, args.batch_size)
    except (FileNotFoundError, ValueError) as exc:
        sys.exit(f"error: {exc}")


if __name__ == "__main__":
    _cli()