alias-mapper 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
alias_mapper/cli.py ADDED
@@ -0,0 +1,585 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ cli.py
4
+ ------
5
+ Command-line entry point for alias-mapper.
6
+
7
+ Translates the chromosome / scaffold names in GFF, GTF, or FASTA files
8
+ from one naming convention to another, using an alias source (SQLite
9
+ DB today; HTTP API in the future).
10
+
11
+ Modes of `convert`:
12
+
13
+ Single file:
14
+ alias-mapper convert INPUT --to ucsc -o OUTPUT
15
+ Multi file, conform (omit --overwrite-to): conform the annotations to whatever
16
+ convention the reference FASTA is already in; the FASTA is left
17
+ untouched:
18
+ alias-mapper convert --fasta REF.fa ANN1.gff ANN2.gtf --out-dir OUT/
19
+ Multi file, overwrite (--overwrite-to): convert the FASTA and every
20
+ annotation into one chosen convention:
21
+ alias-mapper convert --fasta REF.fa ANN1.gff --overwrite-to ucsc --out-dir OUT/
22
+
23
+ In multi-file mode the assembly is detected ONCE from the FASTA. Conform
24
+ mode then maps any recognized name (in any convention) to the FASTA's own
25
+ convention, matching the common workflow where you have one genome and
26
+ want its annotations to line up with it. Overwrite mode instead detects
27
+ the shared source convention from the FASTA and forces everything to the
28
+ target. Outputs are written to --out-dir as `<stem>.<conv>.<ext>` (gzip
29
+ preserved); in conform mode the FASTA itself is not written, since it is
30
+ unchanged.
31
+
32
+ Input files may be gzipped: compression is detected from contents, and
33
+ output is gzipped when the chosen path ends in .gz.
34
+
35
+ If --from or --assembly is omitted, the tool samples the input (or the
36
+ FASTA, in multi-file mode) and auto-detects from the database.
37
+
38
+ Subcommands:
39
+ convert Translate one file, or a FASTA + its annotation files.
40
+ update Re-download the latest alias data and rebuild the cache.
41
+
42
+ On first run `convert` downloads the latest alias TSV from GitHub
43
+ Releases and builds a local SQLite database in the platform cache
44
+ directory; later invocations reuse it. Run `update` to refresh.
45
+
46
+ Usage:
47
+ alias-mapper convert INPUT.gff --to ucsc -o OUTPUT.gff
48
+ alias-mapper convert INPUT.gff.gz --to ucsc -o OUTPUT.gff.gz
49
+ alias-mapper convert --fasta REF.fa ann1.gff ann2.gtf --out-dir out/
50
+ alias-mapper convert --fasta REF.fa ann1.gff --overwrite-to ucsc --out-dir out/
51
+ alias-mapper convert INPUT.gff --from refseq --to ucsc \\
52
+ --assembly GCF_000001405.40 -o OUTPUT.gff
53
+ alias-mapper update
54
+ """
55
+
56
+ import argparse
57
+ import sys
58
+ from pathlib import Path
59
+
60
+ from .alias_source import (
61
+ SqliteAliasSource,
62
+ AssemblyNotFoundError,
63
+ AliasNotFoundError,
64
+ LowConfidenceDetection,
65
+ CONVENTION_COLUMNS,
66
+ )
67
+ from .formats import translator_for, open_text_read, open_text_write
68
+ from .bootstrap import (
69
+ BootstrapError,
70
+ default_cache_path,
71
+ ensure_db,
72
+ )
73
+
74
# CLI-facing convention names, each paired with the column of the aliases
# table that holds names in that convention.
CONVENTIONS = dict(
    [
        ("genbank", "genbank_acc"),
        ("refseq", "refseq_acc"),
        ("ucsc", "ucsc_name"),
        ("sequence-name", "sequence_name"),
        ("assigned-molecule", "assigned_molecule"),
    ]
)

# Inverse of CONVENTIONS (column name -> CLI-facing name), so that
# auto-detection results can be reported in the user's own vocabulary.
COLUMN_TO_CONVENTION = dict(zip(CONVENTIONS.values(), CONVENTIONS.keys()))
86
+
87
+
88
def build_parser() -> argparse.ArgumentParser:
    """
    Assemble the `alias-mapper` command-line parser.

    Registers the `convert` and `update` subcommands. The selected
    subcommand name is stored in `args.command`, and one must be given.
    """
    root = argparse.ArgumentParser(
        prog="alias-mapper",
        description="Translate sequence names in GFF/GTF/FASTA files.",
    )
    commands = root.add_subparsers(dest="command", required=True)
    convention_names = CONVENTIONS.keys()

    # ---- convert ----------------------------------------------------------
    convert_description = (
        "Single-file: convert INPUT --to TGT -o OUT\n"
        "Multi-file conform: convert --fasta REF [ANN ...] --out-dir DIR\n"
        "Multi-file force: convert --fasta REF [ANN ...] --overwrite-to TGT --out-dir DIR\n\n"
        "In multi-file mode the assembly is detected once from the FASTA. "
        "Without --overwrite-to, the annotations are conformed to the "
        "FASTA's own convention and the FASTA is left unchanged. With "
        "--overwrite-to, the FASTA and every annotation are forced to the "
        "target convention."
    )
    convert = commands.add_parser(
        "convert",
        help="Translate one file, or a FASTA plus its annotation files.",
        description=convert_description,
        # Raw formatter keeps the hand-laid-out mode lines above intact.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    input_help = (
        "Single-file mode: one input file (GFF, GTF, or FASTA; optionally "
        ".gz). Multi-file mode (with --fasta): the annotation files to "
        "convert alongside the FASTA."
    )
    convert.add_argument("input", type=Path, nargs="*", help=input_help)

    fasta_help = (
        "Reference FASTA. Enables multi-file mode: detect the assembly "
        "from this FASTA, then conform the annotation inputs to its "
        "convention (or, with --overwrite-to, force everything to a "
        "chosen convention)."
    )
    convert.add_argument("--fasta", type=Path, default=None, help=fasta_help)

    from_help = (
        "Source naming convention. Auto-detected if omitted. Not used in "
        "conform mode (the FASTA's convention is the target there)."
    )
    convert.add_argument(
        "--from", dest="src", choices=convention_names, help=from_help
    )

    to_help = (
        "Target naming convention. Required in single-file mode. In "
        "--fasta mode use --overwrite-to instead (or omit to conform)."
    )
    convert.add_argument(
        "--to", dest="tgt", choices=convention_names, help=to_help
    )

    overwrite_help = (
        "(--fasta mode) Force the FASTA and all annotations to this "
        "convention. Omit to conform the annotations to the FASTA's own "
        "convention, leaving the FASTA unchanged."
    )
    convert.add_argument(
        "--overwrite-to",
        dest="overwrite_to",
        choices=convention_names,
        help=overwrite_help,
    )

    convert.add_argument(
        "--assembly",
        help="Assembly accession (e.g. GCF_000001405.40). Auto-detected if omitted.",
    )
    convert.add_argument(
        "-o",
        "--output",
        type=Path,
        default=None,
        help="Output path (single-file mode only; gzipped if it ends in .gz).",
    )

    out_dir_help = (
        "Output directory (multi-file/--fasta mode only). Each converted "
        "input is written as <stem>.<conv>.<ext>, preserving any .gz."
    )
    convert.add_argument(
        "--out-dir", dest="out_dir", type=Path, default=None, help=out_dir_help
    )

    convert_db_help = (
        "Path to the alias SQLite database. Defaults to the platform cache "
        "location (created on first run if missing)."
    )
    convert.add_argument(
        "--alias-db", type=Path, default=None, help=convert_db_help
    )

    # ---- update -----------------------------------------------------------
    update = commands.add_parser(
        "update",
        help="Re-download the latest alias data and rebuild the local cache.",
    )
    update_db_help = (
        "Path to the alias SQLite database to refresh. Defaults to the "
        "platform cache location."
    )
    update.add_argument(
        "--alias-db", type=Path, default=None, help=update_db_help
    )

    return root
187
+
188
+
189
def cmd_update(args) -> int:
    """
    Handle `alias-mapper update`: rebuild the local alias database.

    Calls ensure_db() with force=True on the path given by --alias-db
    (None selects ensure_db's default). A BootstrapError ends the process
    with an "error: ..." message; otherwise the resulting path is reported
    on stderr and 0 is returned.
    """
    try:
        db_location = ensure_db(args.alias_db, force=True)
    except BootstrapError as exc:
        sys.exit(f"error: {exc}")
    else:
        print(
            f"Done. Local alias database is up to date at {db_location}",
            file=sys.stderr,
        )
    return 0
197
+
198
+
199
def _open_source(args):
    """
    Open the alias database selected by --alias-db (or the default).

    Returns a `(SqliteAliasSource, db_path)` pair. Exits with an error
    message if an explicitly requested database file is missing, or if
    ensure_db() raises BootstrapError.
    """
    requested = args.alias_db
    explicit_but_missing = requested is not None and not requested.exists()
    if explicit_but_missing:
        # A path the user chose themselves is never silently populated by
        # a download; they are pointed at `update` instead.
        sys.exit(
            f"error: alias database not found at {args.alias_db}. "
            f"Either omit --alias-db to use the cached default, or run "
            f"`alias-mapper update --alias-db {args.alias_db}` to create it there."
        )

    try:
        resolved = ensure_db(args.alias_db)
    except BootstrapError as exc:
        sys.exit(f"error: {exc}")

    alias_source = SqliteAliasSource(resolved)
    return alias_source, resolved
214
+
215
+
216
def _resolve_from_assembly(source, sample_path, args, role="source convention"):
    """
    Work out the convention and assembly to use for `sample_path`.

    Values given via --from / --assembly are used as-is. Whichever is
    missing is auto-detected from sequence names sampled out of
    `sample_path`. Multi-file mode passes the FASTA here, so detection
    runs once and its result is shared by all annotation files.

    `role` is only the label shown next to the detected convention in the
    progress output (conform mode reports it as the target).

    Returns `(conv_col, conv_name, assembly)`. Exits with an error if no
    names can be sampled or a detection raises LowConfidenceDetection.
    """
    translator = translator_for(sample_path)
    detect_convention = args.src is None
    detect_assembly = args.assembly is None

    names = None
    if detect_convention or detect_assembly:
        names = translator.sample_names(sample_path)
        if not names:
            sys.exit(
                f"error: no sequence names found in {sample_path} for auto-detection. "
                f"Pass --from and --assembly explicitly."
            )
        print(
            f"Sampled {len(names)} unique sequence names from {sample_path} "
            f"for auto-detection.",
            file=sys.stderr,
        )

    # Convention: explicit --from wins, otherwise ask the alias source.
    if not detect_convention:
        conv_name = args.src
        conv_col = CONVENTIONS[conv_name]
    else:
        try:
            convention_hit = source.detect_convention(names)
        except LowConfidenceDetection as exc:
            sys.exit(f"error: {exc}")
        conv_col = convention_hit.winner
        # Fall back to the raw column name if it has no CLI-facing alias.
        conv_name = COLUMN_TO_CONVENTION.get(conv_col, conv_col)
        print(
            f" detected {role}: {conv_name} "
            f"({convention_hit.winner_score}/{len(names)} matches, "
            f"runner-up {convention_hit.runner_up_score})",
            file=sys.stderr,
        )

    # Assembly: explicit --assembly wins, otherwise ask the alias source.
    if not detect_assembly:
        assembly = args.assembly
    else:
        try:
            assembly_hit = source.detect_assembly(names)
        except LowConfidenceDetection as exc:
            sys.exit(f"error: {exc}")
        assembly = assembly_hit.winner
        print(
            f" detected assembly: {assembly} "
            f"({assembly_hit.winner_score}/{len(names)} matches, "
            f"runner-up {assembly_hit.runner_up_score})",
            file=sys.stderr,
        )

    return conv_col, conv_name, assembly
274
+
275
+
276
def _load_map(source, assembly, src_col, src_name, tgt_col):
    """
    Load the {source_name -> target_name} alias map for one assembly.

    Exits with an error message when source and target columns are the
    same, when the assembly is unknown to the alias source, or when the
    alias source reports AliasNotFoundError for the requested pair.
    """
    same_convention = src_col == tgt_col
    if same_convention:
        sys.exit(
            f"error: source and target conventions are the same ({src_name}). "
            f"Nothing to translate."
        )

    try:
        mapping = source.get_map(assembly, src_col, tgt_col)
    except AssemblyNotFoundError:
        sys.exit(
            f"error: assembly {assembly!r} not found in the database. "
            f"Check the --assembly value."
        )
    except AliasNotFoundError as exc:
        sys.exit(
            f"error: {exc}. This assembly may not have aliases in those conventions."
        )

    entry_count = len(mapping)
    print(f" -> {entry_count} entries loaded", file=sys.stderr)
    return mapping
296
+
297
+
298
def _load_conform_map(source, assembly, target_col, target_name):
    """
    Build the {name_in_any_convention -> target_name} map for conform mode.

    get_map() is called once per convention column other than the target
    and the results are merged, with later columns overriding earlier
    ones on duplicate keys. Columns for which get_map() raises
    AliasNotFoundError are skipped.

    Names already written in the target convention are never keys of the
    result, so callers leave them unchanged.

    Exits with an error if the assembly is not in the alias source.
    """
    if not source.assembly_exists(assembly):
        sys.exit(
            f"error: assembly {assembly!r} not found in the database. "
            f"Check the --assembly value."
        )

    merged: dict[str, str] = {}
    used_conventions: list[str] = []
    other_columns = [col for col in CONVENTION_COLUMNS if col != target_col]
    for column in other_columns:
        try:
            piece = source.get_map(assembly, column, target_col)
        except AliasNotFoundError:
            # Nothing in this convention is paired with the target for
            # this assembly, so it has nothing to add to the merge.
            continue
        merged.update(piece)
        used_conventions.append(COLUMN_TO_CONVENTION.get(column, column))

    if not used_conventions:
        print(
            f" warning: no other convention has data paired to {target_name} "
            f"for this assembly; nothing can be conformed. Annotation names "
            f"already in {target_name} will pass through unchanged.",
            file=sys.stderr,
        )
    else:
        print(
            f" conform map: {len(merged)} names -> {target_name} "
            f"(from {', '.join(used_conventions)})",
            file=sys.stderr,
        )
    return merged
346
+
347
+
348
def _translate_file(in_path: Path, out_path: Path, alias_map: dict,
                    conform_target: str | None = None) -> dict:
    """
    Translate one file with a prepared alias map. Returns its stats.

    Each line of `in_path` is handed to the format translator's
    translate_line() together with `alias_map` and the stats dict, and
    the returned text is written to `out_path`.

    If reading, translating, or writing fails part-way, an output file
    that this call created is removed before the error propagates. A
    truncated result is therefore never left on disk, and a retry is not
    blocked by the callers' refuse-to-overwrite checks. A file that
    already existed before the call is never deleted.

    When `conform_target` is set (conform mode), the passthrough message
    is worded as a neutral note rather than a warning: a name that is
    already in the target convention is not in the conform map and so is
    correctly left unchanged, which is not an error.

    Returns the stats dict with keys "mapped", "unmapped", and
    "unmapped_examples".
    """
    translator = translator_for(in_path)
    stats = {"mapped": 0, "unmapped": 0, "unmapped_examples": set()}
    print(f"Translating {in_path} → {out_path}", file=sys.stderr)

    # Only clean up what we created ourselves.
    created_here = not out_path.exists()
    try:
        with open_text_read(in_path) as in_f, open_text_write(out_path) as out_f:
            for line in in_f:
                out_f.write(translator.translate_line(line, alias_map, stats))
    except BaseException:
        # BaseException so an interrupted run (Ctrl-C) is cleaned up too.
        if created_here:
            try:
                out_path.unlink(missing_ok=True)
            except OSError:
                # Best-effort cleanup: the original failure is the error
                # worth reporting, so it is re-raised below regardless.
                pass
        raise

    print(
        f" {in_path.name}: mapped={stats['mapped']}, unmapped={stats['unmapped']}",
        file=sys.stderr,
    )
    if stats["unmapped"]:
        # Cap the examples so the message stays readable.
        examples = sorted(stats["unmapped_examples"])[:5]
        if conform_target is not None:
            print(
                f" note: {stats['unmapped']} names in {in_path.name} were already "
                f"in {conform_target} convention or not recognized; passed through "
                f"unchanged. Examples: {examples}",
                file=sys.stderr,
            )
        else:
            print(
                f" warning: {stats['unmapped']} names in {in_path.name} not found in "
                f"the alias database for this assembly; passed through unchanged. "
                f"Examples: {examples}",
                file=sys.stderr,
            )
    return stats
385
+
386
+
387
+ def _output_name(in_path: Path, to: str) -> str:
388
+ """
389
+ Build the multi-file output filename: insert `.<to>` before the
390
+ extension(s), preserving a trailing .gz.
391
+
392
+ genome.fa.gz -> genome.<to>.fa.gz ; ann1.gff -> ann1.<to>.gff
393
+ """
394
+ p = Path(in_path)
395
+ if p.suffix.lower() == ".gz":
396
+ base = Path(p.stem).stem
397
+ exts = Path(p.stem).suffix + p.suffix
398
+ else:
399
+ base = p.stem
400
+ exts = p.suffix
401
+ return f"{base}.{to}{exts}"
402
+
403
+
404
def cmd_convert(args) -> int:
    """Run `convert` in multi-file mode when --fasta is given, else single-file."""
    handler = _convert_single if args.fasta is None else _convert_multi
    return handler(args)
409
+
410
+
411
def _convert_single(args) -> int:
    """
    Single-file `convert`: translate one input file into -o/--output.

    Option misuse, a missing input, an existing output, and an
    unsupported extension are each rejected with an error exit before
    the alias database is opened. Returns 0 on success.
    """
    # Ordered (violated?, message) pairs; the first violation wins.
    option_checks = (
        (
            args.overwrite_to is not None,
            "error: --overwrite-to is only for --fasta (multi-file) mode. "
            "Use --to for single-file output.",
        ),
        (
            args.tgt is None,
            "error: --to is required in single-file mode.",
        ),
        (
            args.out_dir is not None,
            "error: --out-dir is only for --fasta (multi-file) mode. "
            "Use -o for single-file output.",
        ),
        (
            len(args.input) != 1,
            "error: single-file mode takes exactly one input file. For multiple "
            "files use --fasta REF ANN... with --out-dir.",
        ),
        (
            args.output is None,
            "error: -o/--output is required in single-file mode.",
        ),
    )
    for violated, message in option_checks:
        if violated:
            sys.exit(message)

    in_path = args.input[0]
    out_path = args.output
    if not in_path.exists():
        sys.exit(f"error: input file not found: {in_path}")
    if out_path.exists():
        sys.exit(
            f"error: output file already exists: {out_path} "
            f"(refusing to overwrite — choose another path or delete it first)"
        )
    # Called only to reject an unsupported extension early.
    try:
        translator_for(in_path)
    except ValueError as exc:
        sys.exit(f"error: {exc}")

    out_path.parent.mkdir(parents=True, exist_ok=True)
    source, db_path = _open_source(args)
    src_col, src_name, assembly = _resolve_from_assembly(source, in_path, args)
    tgt_col = CONVENTIONS[args.tgt]
    print(
        f"Loading alias table from {db_path}\n"
        f" assembly={assembly}, from={src_name}, to={args.tgt}",
        file=sys.stderr,
    )
    mapping = _load_map(source, assembly, src_col, src_name, tgt_col)
    _translate_file(in_path, out_path, mapping)
    print("Done.", file=sys.stderr)
    return 0
458
+
459
+
460
def _convert_multi(args) -> int:
    """
    Multi-file `convert` (--fasta given).

    Conform mode (no --overwrite-to): the FASTA's detected convention is
    the target, only the annotation files are rewritten, and the FASTA is
    not written. Overwrite mode (--overwrite-to): the FASTA's convention
    is the shared source, and the FASTA plus every annotation file are
    rewritten into the chosen convention.

    Outputs go to --out-dir under the names produced by _output_name().
    Existing outputs and two inputs mapping to the same output are both
    refused. Returns 0 on success.
    """
    # Ordered (violated?, message) pairs; the first violation wins.
    option_checks = (
        (
            args.output is not None,
            "error: -o/--output is for single-file mode. In --fasta mode, "
            "outputs go to --out-dir.",
        ),
        (
            args.out_dir is None,
            "error: --out-dir is required in --fasta (multi-file) mode.",
        ),
        (
            args.tgt is not None,
            "error: --to is single-file only. In --fasta mode, use "
            "--overwrite-to to force every file into one convention, or omit "
            "it to conform the annotations to the FASTA's own convention.",
        ),
    )
    for violated, message in option_checks:
        if violated:
            sys.exit(message)

    fasta = args.fasta
    if not fasta.exists():
        sys.exit(f"error: FASTA not found: {fasta}")
    annotations = list(args.input)
    for annotation in annotations:
        if not annotation.exists():
            sys.exit(f"error: input file not found: {annotation}")

    conform = args.overwrite_to is None
    if conform:
        if args.src is not None:
            sys.exit(
                "error: --from is not used in conform mode. The FASTA's own "
                "convention is detected and used as the target. To force a "
                "specific convention for every file, use --overwrite-to."
            )
        if not annotations:
            sys.exit(
                "error: conform mode needs at least one annotation file to conform "
                "to the FASTA. (To convert just the FASTA, use --overwrite-to.)"
            )

    # Reject unsupported extensions before the database is opened (which
    # may involve a large download).
    for candidate in (fasta, *annotations):
        try:
            translator_for(candidate)
        except ValueError as exc:
            sys.exit(f"error: {exc}")

    out_dir = args.out_dir
    source, db_path = _open_source(args)

    # Detection always samples the FASTA. Its convention is the target in
    # conform mode and the shared source in overwrite mode. The target
    # name is needed to name the outputs, so this precedes planning.
    if conform:
        tgt_col, tgt_name, assembly = _resolve_from_assembly(
            source, fasta, args, role="FASTA convention (conform target)"
        )
        to_convert = annotations  # the FASTA is the reference and is not rewritten
    else:
        src_col, src_name, assembly = _resolve_from_assembly(source, fasta, args)
        tgt_name = args.overwrite_to
        tgt_col = CONVENTIONS[args.overwrite_to]
        to_convert = [fasta, *annotations]

    # Plan every output first, so nothing is written if any would clash.
    planned = []
    claimed_by = {}
    for src_path in to_convert:
        dst_path = out_dir / _output_name(src_path, tgt_name)
        if dst_path in claimed_by:
            sys.exit(
                f"error: inputs {claimed_by[dst_path]} and {src_path} both map to output "
                f"{dst_path.name}. Rename one."
            )
        claimed_by[dst_path] = src_path
        if dst_path.exists():
            sys.exit(f"error: output already exists: {dst_path} (refusing to overwrite).")
        planned.append((src_path, dst_path))

    out_dir.mkdir(parents=True, exist_ok=True)

    mode_label = "conform" if conform else "overwrite"
    print(
        f"Loading alias table from {db_path}\n"
        f" assembly={assembly}, target={tgt_name}, "
        f"mode={mode_label}",
        file=sys.stderr,
    )

    if conform:
        alias_map = _load_conform_map(source, assembly, tgt_col, tgt_name)
        conform_target = tgt_name
        print(
            f"Conforming {len(planned)} annotation file(s) to {tgt_name} in "
            f"{out_dir}/; FASTA {fasta.name} left unchanged.",
            file=sys.stderr,
        )
    else:
        alias_map = _load_map(source, assembly, src_col, src_name, tgt_col)
        conform_target = None
        print(f"Converting {len(planned)} file(s) into {out_dir}/", file=sys.stderr)

    mapped_total = 0
    unmapped_total = 0
    for src_path, dst_path in planned:
        file_stats = _translate_file(
            src_path, dst_path, alias_map, conform_target=conform_target
        )
        mapped_total += file_stats["mapped"]
        unmapped_total += file_stats["unmapped"]

    print(
        f"Done. {len(planned)} file(s), total mapped={mapped_total}, "
        f"unmapped={unmapped_total}",
        file=sys.stderr,
    )
    return 0
570
+
571
+
572
def main():
    """
    Parse the command line and run the selected subcommand.

    Returns the subcommand handler's return value, which the module guard
    below passes to sys.exit().
    """
    parser = build_parser()
    args = parser.parse_args()

    handlers = {"convert": cmd_convert, "update": cmd_update}
    handler = handlers.get(args.command)
    if handler is None:
        # parser.error() prints usage and exits the process.
        return parser.error(f"unknown command: {args.command}")
    return handler(args)


if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,68 @@
1
+ """
2
+ File format translators.
3
+
4
+ Each translator class handles one file format's specifics — which lines
5
+ contain sequence names, how to extract and rewrite them. The CLI
6
+ dispatches to the right translator by file extension via TRANSLATORS.
7
+
8
+ Input and output may be gzipped transparently; see _io for the rules.
9
+ Format detection ignores a trailing .gz, so `genome.fa.gz` resolves to
10
+ the FASTA translator.
11
+
12
+ Name lookup during translation goes through resolve_alias (see
13
+ _resolve), which falls back to conservative name normalizations only
14
+ when an exact map hit is missing.
15
+
16
+ Adding a new format: write a class in a new module, import it here,
17
+ add an entry to TRANSLATORS.
18
+ """
19
+
20
+ from .base import FileTranslator
21
+ from .gff import GffTranslator
22
+ from .fasta import FastaTranslator
23
+ from ._io import open_text_read, open_text_write, is_gzip, effective_suffix
24
+ from ._resolve import resolve_alias
25
+
26
# Registry of supported extensions, grouped by the translator class that
# handles them. Lookups use the input file's suffix with any trailing .gz
# already removed.
TRANSLATORS: dict[str, type[FileTranslator]] = {
    **dict.fromkeys((".gff", ".gff3", ".gtf"), GffTranslator),
    **dict.fromkeys((".fa", ".fasta", ".fna"), FastaTranslator),
}
36
+
37
+
38
def translator_for(path) -> FileTranslator:
    """
    Return a new translator instance suited to `path`'s extension.

    The extension comes from effective_suffix(), so a trailing .gz does
    not count: `genome.gff.gz` gets the GFF translator. Raises ValueError,
    listing the supported extensions, when the extension is not
    registered in TRANSLATORS.
    """
    ext = effective_suffix(path)
    translator_cls = TRANSLATORS.get(ext)
    if translator_cls is not None:
        return translator_cls()

    supported = sorted(TRANSLATORS.keys())
    raise ValueError(
        f"no translator registered for {path} (extension {ext!r}). "
        f"Supported: {supported} (optionally .gz-compressed)"
    )
55
+
56
+
57
# Public API of this package: the translator classes, the extension
# registry and its lookup helper, and the I/O and alias-resolution helpers
# imported above from the private submodules.
__all__ = [
    "FileTranslator",
    "GffTranslator",
    "FastaTranslator",
    "TRANSLATORS",
    "translator_for",
    "open_text_read",
    "open_text_write",
    "is_gzip",
    "effective_suffix",
    "resolve_alias",
]