alias-mapper 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alias_mapper/__init__.py +8 -0
- alias_mapper/_ssl.py +40 -0
- alias_mapper/alias_source.py +358 -0
- alias_mapper/bootstrap.py +305 -0
- alias_mapper/build_alias_db.py +407 -0
- alias_mapper/cli.py +585 -0
- alias_mapper/formats/__init__.py +68 -0
- alias_mapper/formats/_io.py +73 -0
- alias_mapper/formats/_resolve.py +117 -0
- alias_mapper/formats/base.py +51 -0
- alias_mapper/formats/fasta.py +91 -0
- alias_mapper/formats/gff.py +63 -0
- alias_mapper-1.0.0.dist-info/METADATA +217 -0
- alias_mapper-1.0.0.dist-info/RECORD +18 -0
- alias_mapper-1.0.0.dist-info/WHEEL +5 -0
- alias_mapper-1.0.0.dist-info/entry_points.txt +2 -0
- alias_mapper-1.0.0.dist-info/licenses/LICENSE +21 -0
- alias_mapper-1.0.0.dist-info/top_level.txt +1 -0
alias_mapper/cli.py
ADDED
|
@@ -0,0 +1,585 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
cli.py
|
|
4
|
+
------
|
|
5
|
+
Command-line entry point for alias-mapper.
|
|
6
|
+
|
|
7
|
+
Translates the chromosome / scaffold names in GFF, GTF, or FASTA files
|
|
8
|
+
from one naming convention to another, using an alias source (SQLite
|
|
9
|
+
DB today; HTTP API in the future).
|
|
10
|
+
|
|
11
|
+
Modes of `convert`:
|
|
12
|
+
|
|
13
|
+
Single file:
|
|
14
|
+
alias-mapper convert INPUT --to ucsc -o OUTPUT
|
|
15
|
+
Multi file, conform (omit --overwrite-to): conform the annotations to whatever
|
|
16
|
+
convention the reference FASTA is already in; the FASTA is left
|
|
17
|
+
untouched:
|
|
18
|
+
alias-mapper convert --fasta REF.fa ANN1.gff ANN2.gtf --out-dir OUT/
|
|
19
|
+
Multi file, overwrite (--overwrite-to): convert the FASTA and every
|
|
20
|
+
annotation into one chosen convention:
|
|
21
|
+
alias-mapper convert --fasta REF.fa ANN1.gff --overwrite-to ucsc --out-dir OUT/
|
|
22
|
+
|
|
23
|
+
In multi-file mode the assembly is detected ONCE from the FASTA. Conform
|
|
24
|
+
mode then maps any recognized name (in any convention) to the FASTA's own
|
|
25
|
+
convention, matching the common workflow where you have one genome and
|
|
26
|
+
want its annotations to line up with it. Overwrite mode instead detects
|
|
27
|
+
the shared source convention from the FASTA and forces everything to the
|
|
28
|
+
target. Outputs are written to --out-dir as `<stem>.<conv>.<ext>` (gzip
|
|
29
|
+
preserved); in conform mode the FASTA itself is not written, since it is
|
|
30
|
+
unchanged.
|
|
31
|
+
|
|
32
|
+
Input files may be gzipped: compression is detected from contents, and
|
|
33
|
+
output is gzipped when the chosen path ends in .gz.
|
|
34
|
+
|
|
35
|
+
If --from or --assembly is omitted, the tool samples the input (or the
|
|
36
|
+
FASTA, in multi-file mode) and auto-detects from the database.
|
|
37
|
+
|
|
38
|
+
Subcommands:
|
|
39
|
+
convert Translate one file, or a FASTA + its annotation files.
|
|
40
|
+
update Re-download the latest alias data and rebuild the cache.
|
|
41
|
+
|
|
42
|
+
On first run `convert` downloads the latest alias TSV from GitHub
|
|
43
|
+
Releases and builds a local SQLite database in the platform cache
|
|
44
|
+
directory; later invocations reuse it. Run `update` to refresh.
|
|
45
|
+
|
|
46
|
+
Usage:
|
|
47
|
+
alias-mapper convert INPUT.gff --to ucsc -o OUTPUT.gff
|
|
48
|
+
alias-mapper convert INPUT.gff.gz --to ucsc -o OUTPUT.gff.gz
|
|
49
|
+
alias-mapper convert --fasta REF.fa ann1.gff ann2.gtf --out-dir out/
|
|
50
|
+
alias-mapper convert --fasta REF.fa ann1.gff --overwrite-to ucsc --out-dir out/
|
|
51
|
+
alias-mapper convert INPUT.gff --from refseq --to ucsc \\
|
|
52
|
+
--assembly GCF_000001405.40 -o OUTPUT.gff
|
|
53
|
+
alias-mapper update
|
|
54
|
+
"""
|
|
55
|
+
|
|
56
|
+
import argparse
|
|
57
|
+
import sys
|
|
58
|
+
from pathlib import Path
|
|
59
|
+
|
|
60
|
+
from .alias_source import (
|
|
61
|
+
SqliteAliasSource,
|
|
62
|
+
AssemblyNotFoundError,
|
|
63
|
+
AliasNotFoundError,
|
|
64
|
+
LowConfidenceDetection,
|
|
65
|
+
CONVENTION_COLUMNS,
|
|
66
|
+
)
|
|
67
|
+
from .formats import translator_for, open_text_read, open_text_write
|
|
68
|
+
from .bootstrap import (
|
|
69
|
+
BootstrapError,
|
|
70
|
+
default_cache_path,
|
|
71
|
+
ensure_db,
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
# CLI-facing convention name -> column of the aliases table holding it.
# Insertion order is the order the names are offered as CLI choices.
CONVENTIONS = dict(
    (
        ("genbank", "genbank_acc"),
        ("refseq", "refseq_acc"),
        ("ucsc", "ucsc_name"),
        ("sequence-name", "sequence_name"),
        ("assigned-molecule", "assigned_molecule"),
    )
)

# Inverse of CONVENTIONS (column -> CLI name), so that auto-detected
# columns can be reported to the user with the names they type.
COLUMN_TO_CONVENTION = dict(zip(CONVENTIONS.values(), CONVENTIONS.keys()))
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def build_parser() -> argparse.ArgumentParser:
    """
    Assemble the `alias-mapper` argument parser.

    Registers the two subcommands, `convert` and `update`; naming one of
    them is mandatory. The parser only declares the options: it defines
    no mutually-exclusive groups, so which combinations are valid for
    single-file vs. --fasta mode is not decided here.

    Returns:
        The configured top-level parser.
    """
    root = argparse.ArgumentParser(
        prog="alias-mapper",
        description="Translate sequence names in GFF/GTF/FASTA files.",
    )
    commands = root.add_subparsers(dest="command", required=True)
    convention_names = CONVENTIONS.keys()

    # ---- convert ----------------------------------------------------------
    convert_description = (
        "Single-file: convert INPUT --to TGT -o OUT\n"
        "Multi-file conform: convert --fasta REF [ANN ...] --out-dir DIR\n"
        "Multi-file force: convert --fasta REF [ANN ...] --overwrite-to TGT --out-dir DIR\n\n"
        "In multi-file mode the assembly is detected once from the FASTA. "
        "Without --overwrite-to, the annotations are conformed to the "
        "FASTA's own convention and the FASTA is left unchanged. With "
        "--overwrite-to, the FASTA and every annotation are forced to the "
        "target convention."
    )
    convert = commands.add_parser(
        "convert",
        help="Translate one file, or a FASTA plus its annotation files.",
        description=convert_description,
        # The description carries its own line breaks; keep them as written.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # (flags, keyword arguments) for every `convert` option, in the order
    # they are registered (and therefore listed in --help).
    convert_options = [
        (
            ("input",),
            {
                "type": Path,
                "nargs": "*",
                "help": (
                    "Single-file mode: one input file (GFF, GTF, or FASTA; optionally "
                    ".gz). Multi-file mode (with --fasta): the annotation files to "
                    "convert alongside the FASTA."
                ),
            },
        ),
        (
            ("--fasta",),
            {
                "type": Path,
                "default": None,
                "help": (
                    "Reference FASTA. Enables multi-file mode: detect the assembly "
                    "from this FASTA, then conform the annotation inputs to its "
                    "convention (or, with --overwrite-to, force everything to a "
                    "chosen convention)."
                ),
            },
        ),
        (
            ("--from",),
            {
                "dest": "src",
                "choices": convention_names,
                "help": (
                    "Source naming convention. Auto-detected if omitted. Not used in "
                    "conform mode (the FASTA's convention is the target there)."
                ),
            },
        ),
        (
            ("--to",),
            {
                "dest": "tgt",
                "choices": convention_names,
                "help": (
                    "Target naming convention. Required in single-file mode. In "
                    "--fasta mode use --overwrite-to instead (or omit to conform)."
                ),
            },
        ),
        (
            ("--overwrite-to",),
            {
                "dest": "overwrite_to",
                "choices": convention_names,
                "help": (
                    "(--fasta mode) Force the FASTA and all annotations to this "
                    "convention. Omit to conform the annotations to the FASTA's own "
                    "convention, leaving the FASTA unchanged."
                ),
            },
        ),
        (
            ("--assembly",),
            {
                "help": "Assembly accession (e.g. GCF_000001405.40). Auto-detected if omitted.",
            },
        ),
        (
            ("-o", "--output"),
            {
                "type": Path,
                "default": None,
                "help": "Output path (single-file mode only; gzipped if it ends in .gz).",
            },
        ),
        (
            ("--out-dir",),
            {
                "dest": "out_dir",
                "type": Path,
                "default": None,
                "help": (
                    "Output directory (multi-file/--fasta mode only). Each converted "
                    "input is written as <stem>.<conv>.<ext>, preserving any .gz."
                ),
            },
        ),
        (
            ("--alias-db",),
            {
                "type": Path,
                "default": None,
                "help": (
                    "Path to the alias SQLite database. Defaults to the platform cache "
                    "location (created on first run if missing)."
                ),
            },
        ),
    ]
    for flags, options in convert_options:
        convert.add_argument(*flags, **options)

    # ---- update -----------------------------------------------------------
    update = commands.add_parser(
        "update",
        help="Re-download the latest alias data and rebuild the local cache.",
    )
    update.add_argument(
        "--alias-db",
        type=Path,
        default=None,
        help=(
            "Path to the alias SQLite database to refresh. Defaults to the "
            "platform cache location."
        ),
    )

    return root
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def cmd_update(args) -> int:
    """
    Handle `alias-mapper update`.

    Calls ensure_db with force=True for the path given by --alias-db
    (None when the option was omitted) and reports the resulting
    location on stderr. A BootstrapError terminates the process with an
    "error: ..." message.

    Returns:
        0 on success.
    """
    try:
        db_location = ensure_db(args.alias_db, force=True)
    except BootstrapError as exc:
        sys.exit(f"error: {exc}")
    else:
        print(
            f"Done. Local alias database is up to date at {db_location}",
            file=sys.stderr,
        )
        return 0
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def _open_source(args):
    """
    Open the alias database selected by --alias-db (or the default).

    An explicitly supplied --alias-db path that does not exist is treated
    as a user mistake and ends the run, rather than being handed to
    ensure_db. Otherwise ensure_db resolves the path; a BootstrapError
    from it also ends the run with an "error: ..." message.

    Returns:
        A (SqliteAliasSource, db_path) pair.
    """
    requested = args.alias_db
    explicit_but_absent = not (requested is None or requested.exists())
    if explicit_but_absent:
        message = (
            f"error: alias database not found at {requested}. "
            f"Either omit --alias-db to use the cached default, or run "
            f"`alias-mapper update --alias-db {requested}` to create it there."
        )
        sys.exit(message)

    try:
        db_path = ensure_db(requested)
    except BootstrapError as exc:
        sys.exit(f"error: {exc}")

    alias_source = SqliteAliasSource(db_path)
    return alias_source, db_path
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def _resolve_from_assembly(source, sample_path, args, role="source convention"):
    """
    Work out the naming convention and assembly for `sample_path`.

    Values given on the command line (args.src, args.assembly) are used
    as-is. Whichever one is missing is auto-detected from names sampled
    out of `sample_path`; the file is sampled at most once and the same
    sample feeds both detections. Progress is reported on stderr.

    `role` is only the label printed next to a detected convention.

    The process exits with an "error: ..." message when the sample is
    empty or a detection raises LowConfidenceDetection.

    Returns:
        (conv_col, conv_name, assembly): the aliases-table column, its
        CLI-facing name, and the assembly identifier.
    """
    translator = translator_for(sample_path)
    need_convention = args.src is None
    need_assembly = args.assembly is None

    sample = None
    if need_convention or need_assembly:
        sample = translator.sample_names(sample_path)
        if not sample:
            sys.exit(
                f"error: no sequence names found in {sample_path} for auto-detection. "
                f"Pass --from and --assembly explicitly."
            )
        print(
            f"Sampled {len(sample)} unique sequence names from {sample_path} "
            f"for auto-detection.",
            file=sys.stderr,
        )

    def _scores(outcome) -> str:
        # Shared "(hits/sampled matches, runner-up N)" suffix of both reports.
        return (
            f"({outcome.winner_score}/{len(sample)} matches, "
            f"runner-up {outcome.runner_up_score})"
        )

    # --- convention --------------------------------------------------------
    if need_convention:
        try:
            conv_hit = source.detect_convention(sample)
        except LowConfidenceDetection as exc:
            sys.exit(f"error: {exc}")
        conv_col = conv_hit.winner
        # Fall back to the raw column name if it has no CLI-facing alias.
        conv_name = COLUMN_TO_CONVENTION.get(conv_col, conv_col)
        print(f" detected {role}: {conv_name} " + _scores(conv_hit), file=sys.stderr)
    else:
        conv_col = CONVENTIONS[args.src]
        conv_name = args.src

    # --- assembly ----------------------------------------------------------
    if need_assembly:
        try:
            asm_hit = source.detect_assembly(sample)
        except LowConfidenceDetection as exc:
            sys.exit(f"error: {exc}")
        assembly = asm_hit.winner
        print(f" detected assembly: {assembly} " + _scores(asm_hit), file=sys.stderr)
    else:
        assembly = args.assembly

    return conv_col, conv_name, assembly
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def _load_map(source, assembly, src_col, src_name, tgt_col):
    """
    Load the source-name -> target-name map of one assembly.

    Exits with an "error: ..." message when source and target columns
    are identical, when the assembly is unknown to `source`
    (AssemblyNotFoundError) or when get_map raises AliasNotFoundError.
    On success the number of loaded entries is reported on stderr.

    Returns:
        The mapping produced by source.get_map(assembly, src_col, tgt_col).
    """
    if src_col == tgt_col:
        same_convention = (
            f"error: source and target conventions are the same ({src_name}). "
            f"Nothing to translate."
        )
        sys.exit(same_convention)

    try:
        mapping = source.get_map(assembly, src_col, tgt_col)
    except AssemblyNotFoundError:
        unknown_assembly = (
            f"error: assembly {assembly!r} not found in the database. "
            f"Check the --assembly value."
        )
        sys.exit(unknown_assembly)
    except AliasNotFoundError as exc:
        no_aliases = (
            f"error: {exc}. This assembly may not have aliases in those conventions."
        )
        sys.exit(no_aliases)

    entry_count = len(mapping)
    print(f" -> {entry_count} entries loaded", file=sys.stderr)
    return mapping
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def _load_conform_map(source, assembly, target_col, target_name):
    """
    Build the conform-mode map: a name in any other convention -> its
    name in the target convention.

    The map is the union of source.get_map(assembly, col, target_col)
    over every column in CONVENTION_COLUMNS other than `target_col`.
    A column for which get_map raises AliasNotFoundError contributes
    nothing and is skipped.

    Names already written in the target convention are never keys of
    the result, so a translator using this map finds no entry for them.

    Exits with an "error: ..." message when `assembly` is unknown to
    `source`. A summary (or, when no column contributed, a warning) is
    printed on stderr.

    Returns:
        The merged dict; empty when no column contributed.
    """
    if not source.assembly_exists(assembly):
        sys.exit(
            f"error: assembly {assembly!r} not found in the database. "
            f"Check the --assembly value."
        )

    merged: dict[str, str] = {}
    used: list[str] = []
    other_columns = (c for c in CONVENTION_COLUMNS if c != target_col)
    for column in other_columns:
        try:
            piece = source.get_map(assembly, column, target_col)
        except AliasNotFoundError:
            # No rows pair this column with the target for this assembly.
            continue
        # NOTE: on a key clash the column visited later wins (dict.update).
        merged.update(piece)
        used.append(COLUMN_TO_CONVENTION.get(column, column))

    if not used:
        print(
            f" warning: no other convention has data paired to {target_name} "
            f"for this assembly; nothing can be conformed. Annotation names "
            f"already in {target_name} will pass through unchanged.",
            file=sys.stderr,
        )
        return merged

    print(
        f" conform map: {len(merged)} names -> {target_name} "
        f"(from {', '.join(used)})",
        file=sys.stderr,
    )
    return merged
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
def _translate_file(in_path: Path, out_path: Path, alias_map: dict,
                    conform_target: str | None = None) -> dict:
    """
    Translate one file with a prepared alias map and return its stats.

    Every line of `in_path` is passed through the translator chosen for
    that path and the result is written to `out_path`. Per-file counts
    are reported on stderr.

    If translation is interrupted by any exception, an output file that
    this call created is removed before the exception propagates. The
    convert commands refuse to overwrite existing outputs, so a leftover
    truncated file would both look like a finished result and block the
    retry. A file that already existed before the call is never deleted.

    Args:
        in_path: File to read (its name selects the translator).
        out_path: File to write.
        alias_map: Name map handed to translator.translate_line.
        conform_target: Convention name in conform mode, else None. When
            set, the passthrough message is a neutral note instead of a
            warning, because a name missing from the conform map may
            simply already be in the target convention.

    Returns:
        The stats dict with keys "mapped", "unmapped" and
        "unmapped_examples", as updated by the translator.
    """
    translator = translator_for(in_path)
    stats = {"mapped": 0, "unmapped": 0, "unmapped_examples": set()}
    print(f"Translating {in_path} → {out_path}", file=sys.stderr)

    # Remember whether the output was ours to create, so that cleanup on
    # failure can never remove a file that pre-dates this call.
    preexisting = out_path.exists()
    finished = False
    try:
        with open_text_read(in_path) as in_f, open_text_write(out_path) as out_f:
            for line in in_f:
                out_f.write(translator.translate_line(line, alias_map, stats))
        finished = True
    finally:
        if not finished and not preexisting:
            try:
                out_path.unlink(missing_ok=True)
            except OSError:
                # Best-effort cleanup: keep the original exception visible.
                pass

    print(
        f" {in_path.name}: mapped={stats['mapped']}, unmapped={stats['unmapped']}",
        file=sys.stderr,
    )
    if stats["unmapped"]:
        # Show at most five examples, in sorted order for stable output.
        examples = sorted(stats["unmapped_examples"])[:5]
        if conform_target is not None:
            print(
                f" note: {stats['unmapped']} names in {in_path.name} were already "
                f"in {conform_target} convention or not recognized; passed through "
                f"unchanged. Examples: {examples}",
                file=sys.stderr,
            )
        else:
            print(
                f" warning: {stats['unmapped']} names in {in_path.name} not found in "
                f"the alias database for this assembly; passed through unchanged. "
                f"Examples: {examples}",
                file=sys.stderr,
            )
    return stats
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
def _output_name(in_path: Path, to: str) -> str:
|
|
388
|
+
"""
|
|
389
|
+
Build the multi-file output filename: insert `.<to>` before the
|
|
390
|
+
extension(s), preserving a trailing .gz.
|
|
391
|
+
|
|
392
|
+
genome.fa.gz -> genome.<to>.fa.gz ; ann1.gff -> ann1.<to>.gff
|
|
393
|
+
"""
|
|
394
|
+
p = Path(in_path)
|
|
395
|
+
if p.suffix.lower() == ".gz":
|
|
396
|
+
base = Path(p.stem).stem
|
|
397
|
+
exts = Path(p.stem).suffix + p.suffix
|
|
398
|
+
else:
|
|
399
|
+
base = p.stem
|
|
400
|
+
exts = p.suffix
|
|
401
|
+
return f"{base}.{to}{exts}"
|
|
402
|
+
|
|
403
|
+
|
|
404
|
+
def cmd_convert(args) -> int:
    """
    Handle `alias-mapper convert`.

    The presence of --fasta selects multi-file mode; without it the
    single-file path is taken. Returns the chosen handler's result.
    """
    handler = _convert_single if args.fasta is None else _convert_multi
    return handler(args)
|
|
409
|
+
|
|
410
|
+
|
|
411
|
+
def _convert_single(args) -> int:
    """
    Run single-file mode: translate one input into -o/--output.

    Option misuse is rejected first, then the paths are checked (the
    input must exist, the output must not, and the input's extension
    must have a translator). Only after that is the alias database
    opened, the source convention / assembly resolved, and the file
    translated. Every failure ends the process with an "error: ..."
    message.

    Returns:
        0 on success.
    """
    # (violated?, message) pairs, checked in order; the first hit exits.
    usage_checks = (
        (
            args.overwrite_to is not None,
            "error: --overwrite-to is only for --fasta (multi-file) mode. "
            "Use --to for single-file output.",
        ),
        (
            args.tgt is None,
            "error: --to is required in single-file mode.",
        ),
        (
            args.out_dir is not None,
            "error: --out-dir is only for --fasta (multi-file) mode. "
            "Use -o for single-file output.",
        ),
        (
            len(args.input) != 1,
            "error: single-file mode takes exactly one input file. For multiple "
            "files use --fasta REF ANN... with --out-dir.",
        ),
        (
            args.output is None,
            "error: -o/--output is required in single-file mode.",
        ),
    )
    for violated, message in usage_checks:
        if violated:
            sys.exit(message)

    in_path = args.input[0]
    out_path = args.output
    if not in_path.exists():
        sys.exit(f"error: input file not found: {in_path}")
    if out_path.exists():
        sys.exit(
            f"error: output file already exists: {out_path} "
            f"(refusing to overwrite — choose another path or delete it first)"
        )
    # Called only for its ValueError on an unsupported extension.
    try:
        translator_for(in_path)
    except ValueError as exc:
        sys.exit(f"error: {exc}")

    out_path.parent.mkdir(parents=True, exist_ok=True)
    source, db_path = _open_source(args)
    src_col, src_name, assembly = _resolve_from_assembly(source, in_path, args)
    tgt_col = CONVENTIONS[args.tgt]
    print(
        f"Loading alias table from {db_path}\n"
        f" assembly={assembly}, from={src_name}, to={args.tgt}",
        file=sys.stderr,
    )

    name_map = _load_map(source, assembly, src_col, src_name, tgt_col)
    _translate_file(in_path, out_path, name_map)
    print("Done.", file=sys.stderr)
    return 0
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
def _convert_multi(args) -> int:
    """
    Run multi-file (--fasta) mode.

    Without --overwrite-to ("conform"), the convention detected from the
    FASTA is the target: only the annotation files are translated and
    the FASTA is not written. With --overwrite-to ("overwrite"), the
    convention resolved from the FASTA is the shared source and the
    FASTA plus every annotation are translated to the requested target.

    Steps, in order: reject option misuse, check that all inputs exist
    and have a translator, open the alias database, resolve convention
    and assembly from the FASTA, plan the output paths (refusing
    existing files and two inputs that map to the same output), create
    --out-dir, load the map, translate each planned file. Every failure
    ends the process with an "error: ..." message.

    Returns:
        0 on success.
    """
    # (violated?, message) pairs, checked in order; the first hit exits.
    usage_checks = (
        (
            args.output is not None,
            "error: -o/--output is for single-file mode. In --fasta mode, "
            "outputs go to --out-dir.",
        ),
        (
            args.out_dir is None,
            "error: --out-dir is required in --fasta (multi-file) mode.",
        ),
        (
            args.tgt is not None,
            "error: --to is single-file only. In --fasta mode, use "
            "--overwrite-to to force every file into one convention, or omit "
            "it to conform the annotations to the FASTA's own convention.",
        ),
    )
    for violated, message in usage_checks:
        if violated:
            sys.exit(message)

    fasta = args.fasta
    if not fasta.exists():
        sys.exit(f"error: FASTA not found: {fasta}")
    annotations = list(args.input)
    missing = next((p for p in annotations if not p.exists()), None)
    if missing is not None:
        sys.exit(f"error: input file not found: {missing}")

    conform = args.overwrite_to is None
    if conform:
        if args.src is not None:
            sys.exit(
                "error: --from is not used in conform mode. The FASTA's own "
                "convention is detected and used as the target. To force a "
                "specific convention for every file, use --overwrite-to."
            )
        if not annotations:
            sys.exit(
                "error: conform mode needs at least one annotation file to conform "
                "to the FASTA. (To convert just the FASTA, use --overwrite-to.)"
            )

    # Reject unsupported extensions before the alias database is opened
    # (which may involve a large download).
    for candidate in [fasta, *annotations]:
        try:
            translator_for(candidate)
        except ValueError as exc:
            sys.exit(f"error: {exc}")

    out_dir = args.out_dir
    source, db_path = _open_source(args)

    # Detection always samples the FASTA. What it yields is the target
    # convention in conform mode and the shared source convention in
    # overwrite mode. Output names embed the target name, so this has to
    # happen before the outputs are planned.
    if conform:
        tgt_col, tgt_name, assembly = _resolve_from_assembly(
            source, fasta, args, role="FASTA convention (conform target)"
        )
        files_to_convert = annotations  # the reference FASTA is not rewritten
    else:
        src_col, src_name, assembly = _resolve_from_assembly(source, fasta, args)
        tgt_name = args.overwrite_to
        tgt_col = CONVENTIONS[tgt_name]
        files_to_convert = [fasta, *annotations]

    # Plan every output first so nothing is written if any output would
    # clobber an existing file or collide with another planned output.
    claimed = {}  # output path -> the input that produces it
    planned = []
    for in_file in files_to_convert:
        out_path = out_dir / _output_name(in_file, tgt_name)
        if out_path in claimed:
            sys.exit(
                f"error: inputs {claimed[out_path]} and {in_file} both map to output "
                f"{out_path.name}. Rename one."
            )
        claimed[out_path] = in_file
        if out_path.exists():
            sys.exit(f"error: output already exists: {out_path} (refusing to overwrite).")
        planned.append((in_file, out_path))

    out_dir.mkdir(parents=True, exist_ok=True)

    mode_label = "conform" if conform else "overwrite"
    print(
        f"Loading alias table from {db_path}\n"
        f" assembly={assembly}, target={tgt_name}, "
        f"mode={mode_label}",
        file=sys.stderr,
    )

    if conform:
        alias_map = _load_conform_map(source, assembly, tgt_col, tgt_name)
        conform_target = tgt_name
        print(
            f"Conforming {len(planned)} annotation file(s) to {tgt_name} in "
            f"{out_dir}/; FASTA {fasta.name} left unchanged.",
            file=sys.stderr,
        )
    else:
        alias_map = _load_map(source, assembly, src_col, src_name, tgt_col)
        conform_target = None
        print(f"Converting {len(planned)} file(s) into {out_dir}/", file=sys.stderr)

    mapped_total = 0
    unmapped_total = 0
    for in_file, out_path in planned:
        file_stats = _translate_file(
            in_file, out_path, alias_map, conform_target=conform_target
        )
        mapped_total += file_stats["mapped"]
        unmapped_total += file_stats["unmapped"]

    print(
        f"Done. {len(planned)} file(s), total mapped={mapped_total}, "
        f"unmapped={unmapped_total}",
        file=sys.stderr,
    )
    return 0
|
|
570
|
+
|
|
571
|
+
|
|
572
|
+
def main():
    """
    Parse the command line and run the selected subcommand.

    Returns the subcommand handler's result. A command name without a
    handler is reported through parser.error.
    """
    parser = build_parser()
    args = parser.parse_args()

    handlers = {"convert": cmd_convert, "update": cmd_update}
    handler = handlers.get(args.command)
    if handler is not None:
        return handler(args)
    parser.error(f"unknown command: {args.command}")


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""
|
|
2
|
+
File format translators.
|
|
3
|
+
|
|
4
|
+
Each translator class handles one file format's specifics — which lines
|
|
5
|
+
contain sequence names, how to extract and rewrite them. The CLI
|
|
6
|
+
dispatches to the right translator by file extension via TRANSLATORS.
|
|
7
|
+
|
|
8
|
+
Input and output may be gzipped transparently; see _io for the rules.
|
|
9
|
+
Format detection ignores a trailing .gz, so `genome.fa.gz` resolves to
|
|
10
|
+
the FASTA translator.
|
|
11
|
+
|
|
12
|
+
Name lookup during translation goes through resolve_alias (see
|
|
13
|
+
_resolve), which falls back to conservative name normalizations only
|
|
14
|
+
when an exact map hit is missing.
|
|
15
|
+
|
|
16
|
+
Adding a new format: write a class in a new module, import it here,
|
|
17
|
+
add an entry to TRANSLATORS.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from .base import FileTranslator
|
|
21
|
+
from .gff import GffTranslator
|
|
22
|
+
from .fasta import FastaTranslator
|
|
23
|
+
from ._io import open_text_read, open_text_write, is_gzip, effective_suffix
|
|
24
|
+
from ._resolve import resolve_alias
|
|
25
|
+
|
|
26
|
+
# File suffix -> translator class, grouped by format family. Lookup is by
# the input file's suffix once a trailing .gz has been stripped.
TRANSLATORS: dict[str, type[FileTranslator]] = {
    **dict.fromkeys((".gff", ".gff3", ".gtf"), GffTranslator),
    **dict.fromkeys((".fa", ".fasta", ".fna"), FastaTranslator),
}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def translator_for(path) -> FileTranslator:
    """
    Return a new translator instance for the file at `path`.

    The translator class is looked up in TRANSLATORS by the value of
    effective_suffix(path), so `genome.gff.gz` is handled as GFF.

    Raises:
        ValueError: no translator is registered for that suffix; the
            message lists the supported suffixes.
    """
    ext = effective_suffix(path)
    if ext not in TRANSLATORS:
        supported = sorted(TRANSLATORS)
        raise ValueError(
            f"no translator registered for {path} (extension {ext!r}). "
            f"Supported: {supported} (optionally .gz-compressed)"
        )
    translator_cls = TRANSLATORS[ext]
    return translator_cls()
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# Public names of the package, in the order they are exported.
__all__ = [
    # translator classes and the suffix registry
    "FileTranslator", "GffTranslator", "FastaTranslator",
    "TRANSLATORS", "translator_for",
    # gzip-aware I/O helpers
    "open_text_read", "open_text_write", "is_gzip", "effective_suffix",
    # name lookup
    "resolve_alias",
]
|