alias-mapper 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,8 @@
"""
alias-mapper rewrites the chromosome / scaffold names used in
bioinformatics files from one naming convention into another.

Usage and architecture are covered in README.md and docs/design.md.
"""

# Release version of the package.
__version__ = "1.0.0"
alias_mapper/_ssl.py ADDED
@@ -0,0 +1,40 @@
1
+ """
2
+ _ssl.py
3
+ -------
4
+ Shared SSL context setup for the installed alias-mapper package.
5
+
6
+ Mirrors scripts/_http.py's setup, but lives inside the package so
7
+ bootstrap.py and any future HTTP-using module (e.g. HttpAliasSource)
8
+ can import it without depending on scripts/.
9
+
10
+ Order of preference: truststore > certifi > stdlib defaults.
11
+
12
+ - truststore: uses the system keychain (necessary on networks with
13
+ TLS inspection like CRG's, which inject a non-Mozilla root cert)
14
+ - certifi: Mozilla's CA bundle, covers most environments including
15
+ GitHub Actions runners
16
+ - stdlib: last fallback, used if neither extra is installed
17
+
18
+ Both truststore and certifi are optional installs. The package will
19
+ work without them on any network where the system already trusts the
20
+ NCBI/GitHub cert chains.
21
+ """
22
+
23
+ import ssl
24
+
# Step 1: prefer truststore when it is installed. SSL_BACKEND stays None
# if it is missing so that the next step can record its own choice.
SSL_BACKEND = None
try:
    import truststore
    truststore.inject_into_ssl()
except ImportError:
    pass
else:
    SSL_BACKEND = "truststore"

# Step 2: build the shared context. certifi's CA bundle is used when the
# package is importable; otherwise the stdlib defaults apply. The backend
# label is only filled in here when truststore did not already claim it.
try:
    import certifi
    SSL_CONTEXT = ssl.create_default_context(cafile=certifi.where())
    SSL_BACKEND = SSL_BACKEND or "certifi"
except ImportError:
    SSL_CONTEXT = ssl.create_default_context()
    SSL_BACKEND = SSL_BACKEND or "stdlib"
@@ -0,0 +1,358 @@
1
+ """
2
+ alias_source.py
3
+ ---------------
4
+ Abstraction over the alias data source. The CLI and translator code
5
+ ask an `AliasSource` for an alias map; they don't care whether the
6
+ source is a local SQLite file, a TSV, or (eventually) a remote HTTP
7
+ endpoint.
8
+
9
+ Today's only implementation is SqliteAliasSource. When the hosted API
10
+ ships, HttpAliasSource will live alongside it implementing the same
11
+ interface, and the CLI will pick one based on config.
12
+ """
13
+
14
+ from abc import ABC, abstractmethod
15
+ from dataclasses import dataclass
16
+ from pathlib import Path
17
+ import sqlite3
18
+ import sys
19
+
20
+
# Schema version this module expects to find in the SQLite DB. Bump it on
# every incompatible schema change. build_alias_db.SCHEMA_VERSION mirrors
# this value and the two must agree at runtime.
#
# It is an int, not a str, so later versions can be compared numerically
# ("is this cache older than v3?") and avoid lexical traps such as
# "10" < "2".
CURRENT_SCHEMA_VERSION: int = 3


# Names of the convention columns in the aliases table. They are defined
# here rather than in the CLI so that SqliteAliasSource can loop over them
# during detection without the CLI passing the list in.
CONVENTION_COLUMNS: tuple[str, ...] = (
    "genbank_acc",
    "refseq_acc",
    "ucsc_name",
    "sequence_name",
    "assigned_molecule",
)
40
+
41
+
class AliasNotFoundError(Exception):
    """Signals that an assembly has no rows for the requested convention pair."""
44
+
45
+
class AssemblyNotFoundError(Exception):
    """Signals that the requested assembly accession is absent from the source."""
48
+
49
+
50
+ class StaleSchemaError(Exception):
51
+ """
52
+ Raised when the local DB exists but was built against an older schema
53
+ (or by a build script that didn't write _meta at all).
54
+
55
+ Caught by bootstrap.ensure_db, which responds by forcing a rebuild.
56
+ """
57
+ def __init__(self, found: int | None, expected: int):
58
+ self.found = found
59
+ self.expected = expected
60
+ super().__init__(
61
+ f"DB schema is {found!r}, expected {expected!r}"
62
+ )
63
+
64
+
class LowConfidenceDetection(Exception):
    """
    Signals that auto-detection found no clear winner.

    When this is raised the user has to pass the matching flag
    (--from or --assembly) explicitly.
    """
72
+
73
+
74
+ @dataclass
75
+ class DetectionResult:
76
+ """One auto-detection outcome with the runner-up for confidence checks."""
77
+ winner: str
78
+ winner_score: int
79
+ runner_up: str | None
80
+ runner_up_score: int
81
+
82
+
class AliasSource(ABC):
    """
    Contract for any backend that can answer alias lookup queries.

    Implementations:
      - SqliteAliasSource: backed by the local SQLite DB that
        build_alias_db.py builds
      - (future) HttpAliasSource: backed by the hosted API
    """

    @abstractmethod
    def assembly_exists(self, assembly: str) -> bool:
        """Tell whether this source knows the given assembly accession."""

    @abstractmethod
    def get_map(
        self,
        assembly: str,
        source_column: str,
        target_column: str,
    ) -> dict[str, str]:
        """
        Build the {source_name -> target_name} mapping for a single assembly.

        Any row holding NULL in either of the two columns is left out.

        Raises:
            AssemblyNotFoundError: the source has no such assembly accession.
            AliasNotFoundError: the assembly is known, but not a single row
                has both source_column and target_column populated.
        """

    @abstractmethod
    def detect_convention(self, sample_names: list[str]) -> DetectionResult:
        """
        Choose the convention column that best matches the sample names.

        Every candidate convention column is scored by how many of the
        sample names occur in it (anywhere in the DB). The result carries
        the winner and its score together with the runner-up.

        Raises:
            LowConfidenceDetection: no convention wins clearly
                (the rule lives in _is_confident).
        """

    @abstractmethod
    def detect_assembly(self, sample_names: list[str]) -> DetectionResult:
        """
        Choose the assembly whose rows best match the sample names.

        Every assembly is scored by how many of the sample names match any
        of its convention columns. The result carries the winner and its
        score together with the runner-up.

        Raises:
            LowConfidenceDetection: no assembly wins clearly.
        """
140
+
141
+
# Thresholds of the confidence rule used by both detection methods. They
# are deliberately conservative and can be relaxed once real-world failure
# modes are known.
MIN_ABSOLUTE_MATCHES = 5
MIN_RATIO_OVER_RUNNER_UP = 2.0


def _is_confident(winner_score: int, runner_up_score: int) -> bool:
    """
    Decide whether a detection winner is trustworthy.

    The winner must reach MIN_ABSOLUTE_MATCHES and, when the runner-up
    scored at all, beat it by at least MIN_RATIO_OVER_RUNNER_UP. Kept in
    one place so both detection paths apply the same rule.
    """
    if winner_score >= MIN_ABSOLUTE_MATCHES:
        if runner_up_score == 0:
            # Nothing competes with the winner, so there is no ratio to check.
            return True
        lead = winner_score / runner_up_score
        return lead >= MIN_RATIO_OVER_RUNNER_UP
    return False
155
+
156
+
def verify_schema_version(db_path: Path) -> None:
    """
    Confirm the SQLite file at db_path matches CURRENT_SCHEMA_VERSION.

    Returns silently when db_path does not exist (absent is not stale; the
    caller decides whether to build) or when the stored version matches.

    Cheap: opens the DB, runs one SELECT, closes. Safe to call before any
    other DB work.

    _meta.value is read back through int() before comparing, because the
    stored value may be text. A value that cannot be parsed is reported as
    StaleSchemaError(found=None), the same outcome as a missing _meta table.

    Raises:
        StaleSchemaError: the DB exists but the version query fails (for
            example _meta is missing, or the file is not a readable SQLite
            database), the stored version is missing or non-numeric, or it
            differs from CURRENT_SCHEMA_VERSION. The caller (typically
            bootstrap.ensure_db) is expected to respond by rebuilding.
    """
    if not db_path.exists():
        # Not stale, just absent. Caller decides whether to build.
        return
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()
        try:
            cur.execute(
                "SELECT value FROM _meta WHERE key = 'schema_version'"
            )
            row = cur.fetchone()
        except sqlite3.DatabaseError as err:
            # DatabaseError is the parent of OperationalError, so this still
            # covers a missing _meta table and additionally covers a file
            # that is not a valid SQLite database (truncated or corrupt).
            # Either way the cache is unusable, so report it as stale.
            raise StaleSchemaError(
                found=None, expected=CURRENT_SCHEMA_VERSION
            ) from err
        try:
            found = int(row[0]) if row else None
        except (TypeError, ValueError):
            # _meta.value isn't a number: corrupted or written by an
            # incompatible build script. Treat as stale and rebuild.
            found = None
        if found != CURRENT_SCHEMA_VERSION:
            raise StaleSchemaError(found=found, expected=CURRENT_SCHEMA_VERSION)
    finally:
        conn.close()
197
+
198
+
class SqliteAliasSource(AliasSource):
    """
    Alias source backed by a local SQLite DB (the one build_alias_db.py produces).

    Every public method opens its own connection through _connect() and
    closes it before returning; only the DB path is kept on the instance.
    """

    def __init__(self, db_path: Path):
        """
        Remember db_path after checking that the DB is present and current.

        Exits the process (SystemExit) when the file does not exist.
        verify_schema_version may raise StaleSchemaError for an outdated DB.
        """
        if not db_path.exists():
            sys.exit(f"error: alias database not found at {db_path}")
        # Verify schema upfront so a stale cache surfaces as StaleSchemaError
        # rather than a confusing query error later.
        verify_schema_version(db_path)
        self.db_path = db_path

    def _connect(self):
        """Open a new sqlite3 connection to the DB file."""
        return sqlite3.connect(self.db_path)

    @staticmethod
    def _checked_column(column: str) -> str:
        """
        Return column unchanged if it is a plain identifier, else exit.

        Column names cannot be bound as SQL parameters, so they are spliced
        into the query text. Restricting them to bare identifiers keeps
        arbitrary SQL out of the statement. A well-formed but unknown column
        still fails inside SQLite and is reported by the caller.
        """
        if not (isinstance(column, str) and column.isidentifier()):
            sys.exit(f"error: invalid column name {column!r}")
        return column

    @staticmethod
    def _unique_names(sample_names: list[str]) -> list[str]:
        """Drop duplicate names, keeping first-seen order."""
        return list(dict.fromkeys(sample_names))

    def assembly_exists(self, assembly: str) -> bool:
        """Return True if the assemblies table has a row for this accession."""
        conn = self._connect()
        try:
            cur = conn.cursor()
            cur.execute("SELECT 1 FROM assemblies WHERE accession = ?", (assembly,))
            return cur.fetchone() is not None
        finally:
            conn.close()

    def get_map(
        self,
        assembly: str,
        source_column: str,
        target_column: str,
    ) -> dict[str, str]:
        """
        Return a {source_name -> target_name} dict for one assembly.

        Rows where either column is NULL are skipped.

        Raises:
            AssemblyNotFoundError: assembly accession not in the assemblies table.
            AliasNotFoundError: assembly exists, but no rows have both
                source_column and target_column populated.
            SystemExit: a column name is not a plain identifier, or SQLite
                rejects the query (for example an unknown column).
        """
        conn = self._connect()
        try:
            cur = conn.cursor()

            # Sanity check the assembly exists before running the lookup.
            cur.execute("SELECT 1 FROM assemblies WHERE accession = ?", (assembly,))
            if not cur.fetchone():
                raise AssemblyNotFoundError(assembly)

            # Validate the identifiers before they are interpolated below.
            source_column = self._checked_column(source_column)
            target_column = self._checked_column(target_column)

            # idx_accession makes this fast.
            query = f"""
                SELECT {source_column}, {target_column}
                FROM aliases
                WHERE accession = ?
                  AND {source_column} IS NOT NULL
                  AND {target_column} IS NOT NULL
            """
            try:
                cur.execute(query, (assembly,))
                rows = cur.fetchall()
            except sqlite3.OperationalError as e:
                sys.exit(f"error: SQL query failed: {e}")

            if not rows:
                raise AliasNotFoundError(
                    f"no rows for assembly {assembly!r} with both "
                    f"{source_column} and {target_column} populated"
                )

            return dict(rows)
        finally:
            conn.close()

    def detect_convention(self, sample_names: list[str]) -> DetectionResult:
        """
        Pick the convention column in which most sample names occur.

        Each column in CONVENTION_COLUMNS is scored by the number of distinct
        sample names found in it across the whole aliases table. Ties keep
        the CONVENTION_COLUMNS order because the sort is stable.

        Raises:
            LowConfidenceDetection: sample_names is empty, or the top score
                does not satisfy _is_confident against the runner-up.
        """
        if not sample_names:
            raise LowConfidenceDetection("no sample names to detect from")

        # Duplicates cannot change COUNT(DISTINCT ...), so bind each name once.
        # NOTE(review): one parameter is still bound per unique name; a very
        # large sample could exceed SQLite's bound-parameter limit. Confirm
        # that callers cap the sample size.
        names = self._unique_names(sample_names)
        placeholders = ",".join("?" * len(names))
        scores: list[tuple[str, int]] = []

        conn = self._connect()
        try:
            cur = conn.cursor()
            for col in CONVENTION_COLUMNS:
                # Count how many distinct sample names appear in this column.
                # COUNT(DISTINCT) so a name that appears for many assemblies
                # counts as one match, not many.
                query = f"""
                    SELECT COUNT(DISTINCT {col})
                    FROM aliases
                    WHERE {col} IN ({placeholders})
                """
                cur.execute(query, names)
                count = cur.fetchone()[0] or 0
                scores.append((col, count))
        finally:
            conn.close()

        scores.sort(key=lambda x: x[1], reverse=True)
        winner_col, winner_score = scores[0]
        runner_up_col, runner_up_score = scores[1] if len(scores) > 1 else (None, 0)

        if not _is_confident(winner_score, runner_up_score):
            raise LowConfidenceDetection(
                f"could not determine source convention from sample. "
                f"Top candidate {winner_col!r} matched {winner_score}/{len(sample_names)}, "
                f"runner-up {runner_up_col!r} matched {runner_up_score}. "
                f"Pass --from to specify explicitly."
            )

        return DetectionResult(
            winner=winner_col,
            winner_score=winner_score,
            runner_up=runner_up_col,
            runner_up_score=runner_up_score,
        )

    def detect_assembly(self, sample_names: list[str]) -> DetectionResult:
        """
        Pick the assembly with the most alias rows matching the sample names.

        A row counts as a hit when any of its CONVENTION_COLUMNS holds one of
        the sample names. Only the two best-scoring accessions are fetched.

        Raises:
            LowConfidenceDetection: sample_names is empty, no row matches at
                all, or the top score does not satisfy _is_confident against
                the runner-up.
        """
        if not sample_names:
            raise LowConfidenceDetection("no sample names to detect from")

        # Duplicates do not change which rows match, so bind each name once.
        # That matters here because the list is repeated for every column.
        # NOTE(review): the parameter count is unique names x columns; a very
        # large sample could exceed SQLite's bound-parameter limit. Confirm
        # that callers cap the sample size.
        names = self._unique_names(sample_names)
        placeholders = ",".join("?" * len(names))

        # Build the WHERE clause: a name matches if it appears in ANY
        # convention column. We bind the sample names once per column.
        column_clauses = " OR ".join(
            f"{col} IN ({placeholders})" for col in CONVENTION_COLUMNS
        )
        params = names * len(CONVENTION_COLUMNS)

        # The accession tie-break makes the reported pair deterministic when
        # several assemblies score the same number of hits.
        query = f"""
            SELECT accession, COUNT(*) AS hits
            FROM aliases
            WHERE {column_clauses}
            GROUP BY accession
            ORDER BY hits DESC, accession
            LIMIT 2
        """

        conn = self._connect()
        try:
            cur = conn.cursor()
            cur.execute(query, params)
            rows = cur.fetchall()
        finally:
            conn.close()

        if not rows:
            raise LowConfidenceDetection(
                f"no assembly in the database contained any of the {len(sample_names)} "
                f"sample names. The input may use names from an assembly we don't have, "
                f"or the names may be lab-internal IDs. Pass --assembly to specify."
            )

        winner, winner_score = rows[0]
        runner_up, runner_up_score = rows[1] if len(rows) > 1 else (None, 0)

        if not _is_confident(winner_score, runner_up_score):
            raise LowConfidenceDetection(
                f"could not determine assembly from sample. "
                f"Top candidate {winner!r} matched {winner_score}/{len(sample_names)}, "
                f"runner-up {runner_up!r} matched {runner_up_score}. "
                f"Pass --assembly to specify explicitly."
            )

        return DetectionResult(
            winner=winner,
            winner_score=winner_score,
            runner_up=runner_up,
            runner_up_score=runner_up_score,
        )