alias-mapper 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alias_mapper/__init__.py +8 -0
- alias_mapper/_ssl.py +40 -0
- alias_mapper/alias_source.py +358 -0
- alias_mapper/bootstrap.py +305 -0
- alias_mapper/build_alias_db.py +407 -0
- alias_mapper/cli.py +585 -0
- alias_mapper/formats/__init__.py +68 -0
- alias_mapper/formats/_io.py +73 -0
- alias_mapper/formats/_resolve.py +117 -0
- alias_mapper/formats/base.py +51 -0
- alias_mapper/formats/fasta.py +91 -0
- alias_mapper/formats/gff.py +63 -0
- alias_mapper-1.0.0.dist-info/METADATA +217 -0
- alias_mapper-1.0.0.dist-info/RECORD +18 -0
- alias_mapper-1.0.0.dist-info/WHEEL +5 -0
- alias_mapper-1.0.0.dist-info/entry_points.txt +2 -0
- alias_mapper-1.0.0.dist-info/licenses/LICENSE +21 -0
- alias_mapper-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Transparent gzip handling for input and output files.
|
|
3
|
+
|
|
4
|
+
Genomics files (FASTA, GFF, GTF) are very often distributed gzipped,
|
|
5
|
+
sometimes without a telltale .gz suffix (e.g. a browser "download"
|
|
6
|
+
endpoint). These helpers let the rest of the package open a path
|
|
7
|
+
without caring whether it's compressed:
|
|
8
|
+
|
|
9
|
+
- reads sniff the gzip magic bytes, so a gzipped file works even if
|
|
10
|
+
its name doesn't end in .gz;
|
|
11
|
+
- writes compress when the chosen output path ends in .gz;
|
|
12
|
+
- format detection ignores a trailing .gz so `genome.fa.gz` still
|
|
13
|
+
resolves to the FASTA translator.
|
|
14
|
+
|
|
15
|
+
This is the only place in the package that decides "is this gzip?",
|
|
16
|
+
so the rule lives in one spot.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import gzip
|
|
20
|
+
import io
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
|
|
23
|
+
# Every gzip stream opens with this two-byte identifier (RFC 1952).
_GZIP_MAGIC = b"\x1f\x8b"


def is_gzip(path) -> bool:
    """
    Report whether the file at `path` holds gzip data.

    The verdict comes from the file's leading bytes, never from its
    name, so a compressed file is recognised with or without a .gz
    suffix. A file that cannot be opened or read (any OSError) yields
    False instead of raising.
    """
    try:
        with open(path, "rb") as stream:
            # Only as many bytes as the signature itself are needed.
            header = stream.read(len(_GZIP_MAGIC))
    except OSError:
        return False
    return header == _GZIP_MAGIC
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def open_text_read(path, encoding: str = "utf-8") -> io.TextIOBase:
    """
    Return a text-mode reader for `path`.

    Whether to decompress is decided by is_gzip(), i.e. by the file's
    content: gzip data is read through gzip.open, anything else through
    the builtin open. The caller owns (and must close) the returned
    stream.
    """
    if not is_gzip(path):
        return open(path, "r", encoding=encoding)
    return gzip.open(path, "rt", encoding=encoding)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def open_text_write(path, encoding: str = "utf-8") -> io.TextIOBase:
    """
    Return a text-mode writer for `path`, gzipping when it ends in .gz.

    Nothing has been written yet, so there is no content to sniff; the
    file name's final suffix (compared case-insensitively) decides:
    `out.gff.gz` is written gzipped, `out.gff` plain. The caller owns
    (and must close) the returned stream.
    """
    wants_gzip = Path(path).suffix.lower() == ".gz"
    if wants_gzip:
        return gzip.open(path, "wt", encoding=encoding)
    return open(path, "w", encoding=encoding)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def effective_suffix(path) -> str:
    """
    Return the lower-cased suffix that identifies the file format.

    A trailing .gz (any case) is skipped and the suffix underneath it is
    returned instead, so extension-based format detection also works on
    compressed files:

        genome.fa.gz -> ".fa"
        genome.gff   -> ".gff"
        genome.gz    -> ""
    """
    target = Path(path)
    outer = target.suffix.lower()
    if outer != ".gz":
        return outer
    # Drop the .gz layer and look at what remains of the file name.
    return Path(target.stem).suffix.lower()
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Fallback name resolution for alias lookups.
|
|
3
|
+
|
|
4
|
+
The primary lookup is an exact dict hit: ``alias_map[name]``. When that
|
|
5
|
+
misses, real-world inputs often carry the *same* identifier in a
|
|
6
|
+
different surface form. Rather than fail those outright, we try a small,
|
|
7
|
+
conservative set of normalizations and retry the lookup. The fallbacks
|
|
8
|
+
only run on a miss, so the common path stays a single dict lookup.
|
|
9
|
+
|
|
10
|
+
Two variant classes are handled, both low-risk normalizations of the
|
|
11
|
+
same underlying accession (not fuzzy matching):
|
|
12
|
+
|
|
13
|
+
Version separator (.N <-> vN)
|
|
14
|
+
UCSC writes unplaced/unlocalized scaffolds with a 'v' version
|
|
15
|
+
separator (NW_013982187v1) where GenBank/RefSeq use a dot
|
|
16
|
+
(NW_013982187.1). If a name in one form misses, try the other.
|
|
17
|
+
|
|
18
|
+
ENA pipe-prefixed headers (ENA|<unversioned>|<versioned>)
|
|
19
|
+
ENA FASTA headers wrap the accession as 'ENA|ACC|ACC.v'. The bare
|
|
20
|
+
accession is what matches our columns, so on a miss we peel the ENA
|
|
21
|
+
wrapper and retry on the inner accession(s), most-likely first.
|
|
22
|
+
|
|
23
|
+
Resolution order: exact name, then ENA-unwrapped accession(s), with a
|
|
24
|
+
version-separator swap tried on each. First hit wins. None means the
|
|
25
|
+
name is genuinely unmapped and the caller passes the line through
|
|
26
|
+
unchanged, exactly as before this fallback existed.
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
import re
|
|
30
|
+
|
|
31
|
+
# Trailing version suffix in its two surface forms: ".1" and "v1".
# The `$` anchor together with the digits-only version group forces the
# split onto the LAST separator in the name
# (e.g. "GL000.2.1" -> base "GL000.2", version "1").
_DOT_VERSION = re.compile(r"^(.*?)\.(\d+)$")
_V_VERSION = re.compile(r"^(.*?)v(\d+)$")


def _swap_version_suffix(name):
    """
    Yield ``name`` rewritten with the other version separator, if any.

    "NW_013982187.1" becomes "NW_013982187v1" and the reverse. The dot
    form is checked first; at most one string is produced. A name with
    neither a trailing ".N" nor a trailing "vN" produces nothing, so the
    result can always be iterated without a prior check.
    """
    # (pattern that recognises one form, separator of the opposite form)
    for pattern, other_separator in ((_DOT_VERSION, "v"), (_V_VERSION, ".")):
        found = pattern.match(name)
        if found:
            stem, version = found.groups()
            yield f"{stem}{other_separator}{version}"
            return
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _strip_ena_prefix(name):
    """
    Yield the accession field(s) wrapped inside an 'ENA|...|...' name.

    For "ENA|CAJVCV010000001|CAJVCV010000001.1" this produces
    "CAJVCV010000001.1" first and "CAJVCV010000001" second: fields are
    emitted last-to-first because the versioned accession usually sits
    in the last field and is the likeliest to match. Empty fields are
    skipped. A name whose first pipe-separated field is not "ENA"
    (compared case-insensitively), or that contains no pipe at all,
    produces nothing.
    """
    prefix, pipe, payload = name.partition("|")
    if not pipe or prefix.upper() != "ENA":
        return
    accessions = [field for field in payload.split("|") if field]
    yield from reversed(accessions)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _candidates(name):
    """
    Generate lookup candidates for ``name`` in priority order.

    Order: the exact name, its version-separator swap, then each
    ENA-unwrapped accession followed by that accession's swap. Empty
    strings are never yielded and every surface form is yielded at most
    once.

    The generator is lazy: the ENA wrapper is only inspected once the
    consumer asks for more than the exact name and its swap, so a
    caller that stops at the first candidate pays for nothing else.
    """
    seen = set()

    def first_time(candidate):
        # True exactly once per non-empty candidate string.
        if candidate and candidate not in seen:
            seen.add(candidate)
            return True
        return False

    def forms_of(base):
        # A base form, then its alternate version-separator form.
        if first_time(base):
            yield base
        for swapped in _swap_version_suffix(base):
            if first_time(swapped):
                yield swapped

    yield from forms_of(name)
    # ENA-unwrapped accessions are further base forms the version swap
    # can act on (this is what handles ENA-wrapped vN names).
    for accession in _strip_ena_prefix(name):
        yield from forms_of(accession)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def resolve_alias(name: str, alias_map: dict) -> str | None:
|
|
106
|
+
"""
|
|
107
|
+
Look up ``name`` in ``alias_map``, trying conservative fallbacks on a miss.
|
|
108
|
+
|
|
109
|
+
Returns the mapped target name, or None if no candidate form of the
|
|
110
|
+
name is present in the map. The first candidate is always the exact
|
|
111
|
+
name, so an ordinary hit is a single dict lookup with no overhead.
|
|
112
|
+
"""
|
|
113
|
+
for candidate in _candidates(name):
|
|
114
|
+
hit = alias_map.get(candidate)
|
|
115
|
+
if hit is not None:
|
|
116
|
+
return hit
|
|
117
|
+
return None
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""Abstract base class for file format translators."""
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class FileTranslator(ABC):
    """
    Interface for rewriting sequence names in a single file format.

    A concrete subclass knows where its format keeps the sequence name
    and how to replace it. Everything else in the file (comments,
    headers, blank lines) is expected to come out exactly as it went in.
    """

    @abstractmethod
    def translate_line(self, line: str, alias_map: dict, stats: dict) -> str:
        """
        Translate a single input line.

        Args:
            line: One line of the input file, trailing newline included.
            alias_map: Mapping {source_name -> target_name} supplied by
                AliasSource.
            stats: Running counters, mutated in place by the
                translator. Expected keys: 'mapped' (int), 'unmapped'
                (int) and 'unmapped_examples' (set).

        Returns:
            The line with its sequence name translated, trailing
            newline included. A line that carries no translatable
            sequence name is returned as-is.
        """

    @abstractmethod
    def sample_names(self, path: Path, limit: int = 50) -> list[str]:
        """
        Collect up to `limit` distinct sequence names from the top of a file.

        Auto-detection uses the sample to work out which naming
        convention and assembly the input follows. Reading stops as
        soon as `limit` distinct names have been gathered, so the cost
        is O(limit) however large the file is.

        Args:
            path: Location of the input file.
            limit: Upper bound on the number of distinct names returned.

        Returns:
            The distinct names in order of first appearance. Shorter
            than `limit` when the file holds fewer distinct names.
        """
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""FASTA translator. Sequence name lives in the header line, after the '>'."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from .base import FileTranslator
|
|
6
|
+
from ._io import open_text_read
|
|
7
|
+
from ._resolve import resolve_alias
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class FastaTranslator(FileTranslator):
    """
    Translator for FASTA files.

    FASTA structure:
      - Header lines start with '>'. Format: '>NAME [WHITESPACE DESCRIPTION]'.
        Only the NAME (first whitespace-separated token after '>') is the
        sequence identifier. The description, if present, is preserved
        verbatim including the exact whitespace between name and description.
      - Sequence lines (the ACGT content) pass through unchanged.
      - Blank lines pass through unchanged.

    Translation rule:
      - If the header's name resolves in the alias map, replace the name
        with the target. Description is preserved exactly.
      - If the name doesn't resolve, pass the whole line through
        unchanged and count it as unmapped.

    Name lookup goes through resolve_alias: an exact map hit is used when
    present, and conservative fallbacks (ENA prefix strip, .N/vN
    version-separator swap) are tried only when the exact name misses.
    """

    @staticmethod
    def _name_end(body: str) -> int:
        """
        Return the index where the sequence name stops in a header body.

        `body` is the header text after '>' with the newline removed.
        The name runs up to the first whitespace character, or to the
        end of `body` when there is none. Both public methods use this
        one helper, so a sampled name is always the same string that
        translate_line would look up.
        """
        return next(
            (pos for pos, char in enumerate(body) if char.isspace()),
            len(body),
        )

    def translate_line(self, line: str, alias_map: dict, stats: dict) -> str:
        """
        Translate the sequence name of one FASTA line.

        Args:
            line: One input line, normally with its trailing newline.
            alias_map: {source_name -> target_name} mapping.
            stats: Counters updated in place: 'mapped' / 'unmapped' are
                incremented and unmapped names are added to the
                'unmapped_examples' set.

        Returns:
            Non-header lines and headers without a name ('>' or
            '> description') unchanged and uncounted; unresolved headers
            unchanged; otherwise the header with only the name replaced.
        """
        if not line.startswith(">"):
            # Sequence line, blank line, or anything else non-header.
            return line

        # Detach only the newline (never other trailing whitespace) so
        # the description is reproduced exactly when reassembling.
        newline = "\n" if line.endswith("\n") else ""
        body = line[1:len(line) - len(newline)]

        cut = self._name_end(body)
        name, rest = body[:cut], body[cut:]
        if not name:
            return line

        new_name = resolve_alias(name, alias_map)
        if new_name is None:
            stats["unmapped"] += 1
            stats["unmapped_examples"].add(name)
            return line

        stats["mapped"] += 1
        return f">{new_name}{rest}{newline}"

    def sample_names(self, path: Path, limit: int = 50) -> list[str]:
        """
        Return up to `limit` distinct header names from the start of `path`.

        Names keep their order of first appearance. The file is opened
        via open_text_read and reading stops as soon as `limit` names
        have been collected. A `limit` of zero or less returns an empty
        list without opening the file.
        """
        if limit <= 0:
            # The bound is otherwise only checked after an append, which
            # would let one name through.
            return []

        names: list[str] = []
        seen: set[str] = set()
        with open_text_read(path) as f:
            for line in f:
                if not line.startswith(">"):
                    continue
                body = line[1:].rstrip("\n")
                name = body[: self._name_end(body)]
                if not name or name in seen:
                    continue
                seen.add(name)
                names.append(name)
                if len(names) >= limit:
                    break
        return names
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""GFF / GTF translator. Both formats put the sequence name in column 1."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from .base import FileTranslator
|
|
6
|
+
from ._io import open_text_read
|
|
7
|
+
from ._resolve import resolve_alias
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class GffTranslator(FileTranslator):
    """
    Translator for GFF, GFF3, and GTF files.

    All three are tab-separated with the sequence name in column 1.
    Lines starting with '#' are comments/headers and pass through
    unchanged, as do blank lines and lines whose first column is empty.

    Name lookup goes through resolve_alias, so an exact map hit is used
    when present and a small set of conservative fallbacks (ENA prefix
    strip, .N/vN version-separator swap) is tried only when the exact
    name misses.

    Known limitation: '##sequence-region <name> ...' metadata lines
    contain a sequence name that this translator does not translate,
    because every '#' line is passed through untouched.

    NOTE(review): any other non-'#' line is treated as a feature row, so
    the contents of an embedded '##FASTA' section would presumably be
    looked up (and counted as unmapped) too -- confirm whether callers
    need that handled.
    """

    def translate_line(self, line: str, alias_map: dict, stats: dict) -> str:
        """
        Translate the column-1 sequence name of one GFF/GTF line.

        Args:
            line: One input line, normally with its trailing newline.
            alias_map: {source_name -> target_name} mapping.
            stats: Counters updated in place: 'mapped' / 'unmapped' are
                incremented and unmapped names are added to the
                'unmapped_examples' set.

        Returns:
            Comment lines, blank lines and lines with an empty first
            column unchanged and uncounted; unresolved rows unchanged;
            otherwise the row with only column 1 replaced. The line
            terminator is reproduced exactly as received, so a final
            line without a newline does not gain one.
        """
        if not line or line.startswith("#"):
            return line

        body = line.rstrip("\n")
        newline = line[len(body):]
        seq_name, tab, rest = body.partition("\t")

        if not seq_name.strip():
            # Blank line (or empty first column): there is no name to
            # look up, so it must not be reported as unmapped.
            return line

        new_name = resolve_alias(seq_name, alias_map)
        if new_name is None:
            stats["unmapped"] += 1
            stats["unmapped_examples"].add(seq_name)
            return line

        stats["mapped"] += 1
        return f"{new_name}{tab}{rest}{newline}"

    def sample_names(self, path: Path, limit: int = 50) -> list[str]:
        """
        Return up to `limit` distinct column-1 names from the start of `path`.

        Comment lines and lines with a blank first column are skipped.
        Names keep their order of first appearance. The file is opened
        via open_text_read and reading stops as soon as `limit` names
        have been collected. A `limit` of zero or less returns an empty
        list without opening the file.
        """
        if limit <= 0:
            # The bound is otherwise only checked after an append, which
            # would let one name through.
            return []

        names: list[str] = []
        seen: set[str] = set()
        with open_text_read(path) as f:
            for line in f:
                if not line or line.startswith("#"):
                    continue
                name = line.rstrip("\n").partition("\t")[0]
                if not name.strip() or name in seen:
                    continue
                seen.add(name)
                names.append(name)
                if len(names) >= limit:
                    break
        return names
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: alias-mapper
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Translate chromosome/scaffold names in bioinformatics files between naming conventions
|
|
5
|
+
Author: Max Reese
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/guigolab/alias-mapper
|
|
8
|
+
Project-URL: Issues, https://github.com/guigolab/alias-mapper/issues
|
|
9
|
+
Keywords: bioinformatics,genomics,gff,fasta,naming-conventions,ncbi
|
|
10
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Operating System :: OS Independent
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: platformdirs>=4.0
|
|
24
|
+
Requires-Dist: certifi
|
|
25
|
+
Provides-Extra: trusted
|
|
26
|
+
Requires-Dist: truststore; extra == "trusted"
|
|
27
|
+
Provides-Extra: test
|
|
28
|
+
Requires-Dist: pytest>=7; extra == "test"
|
|
29
|
+
Dynamic: license-file
|
|
30
|
+
|
|
31
|
+
# alias-mapper
|
|
32
|
+
|
|
33
|
+
Translate chromosome and scaffold names in bioinformatics files
|
|
34
|
+
between naming conventions (GenBank, RefSeq, UCSC, and others).
|
|
35
|
+
|
|
36
|
+
## What it does
|
|
37
|
+
|
|
38
|
+
Research files from different sources use different names for the same
|
|
39
|
+
sequences: `chr1`, `NC_000001.11`, `CM000663.2`, and `1` can all refer
|
|
40
|
+
to the same human chromosome. Files using different conventions can't
|
|
41
|
+
be combined without translation.
|
|
42
|
+
|
|
43
|
+
`alias-mapper` rewrites the sequence names in GFF, GTF, and FASTA
|
|
44
|
+
files from one convention to another using a precomputed alias table
|
|
45
|
+
built from NCBI assembly reports. Source convention and genome
|
|
46
|
+
assembly are auto-detected from the input by default.
|
|
47
|
+
|
|
48
|
+
## Install
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
pip install git+https://github.com/guigolab/alias-mapper.git
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
On networks that perform TLS inspection (corporate / institutional,
|
|
55
|
+
e.g. CRG), also install the `trusted` extra so the tool uses the
|
|
56
|
+
system keychain for cert verification:
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
pip install "alias-mapper[trusted] @ git+https://github.com/guigolab/alias-mapper.git"
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
The first time you run `convert`, the tool downloads the latest alias
|
|
63
|
+
data (~100 MB) from GitHub Releases and builds a local SQLite database
|
|
64
|
+
in your platform cache directory:
|
|
65
|
+
|
|
66
|
+
- macOS: `~/Library/Caches/alias-mapper/aliases.db`
|
|
67
|
+
- Linux: `~/.cache/alias-mapper/aliases.db`
|
|
68
|
+
- Windows: `%LOCALAPPDATA%\alias-mapper\Cache\aliases.db`
|
|
69
|
+
|
|
70
|
+
First-run setup takes about a minute. Subsequent runs use the cached
|
|
71
|
+
database directly. If the database schema changes in a newer release,
|
|
72
|
+
the cache is rebuilt automatically.
|
|
73
|
+
|
|
74
|
+
## Quickstart
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
alias-mapper convert annotations.gff --to ucsc -o annotations.ucsc.gff
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
A summary on stderr reports how many rows were translated and how many
|
|
81
|
+
had sequence names not in the alias database (those rows are passed
|
|
82
|
+
through unchanged with a warning).
|
|
83
|
+
|
|
84
|
+
## Usage
|
|
85
|
+
|
|
86
|
+
```
|
|
87
|
+
# single file
|
|
88
|
+
alias-mapper convert <input> --to <convention> -o <output> [options]
|
|
89
|
+
|
|
90
|
+
# multi-file: conform annotations to a reference FASTA (FASTA untouched)
|
|
91
|
+
alias-mapper convert --fasta <ref> [<ann> ...] --out-dir <dir> [options]
|
|
92
|
+
|
|
93
|
+
# multi-file: force the FASTA and annotations to one convention
|
|
94
|
+
alias-mapper convert --fasta <ref> [<ann> ...] --overwrite-to <convention> --out-dir <dir>
|
|
95
|
+
|
|
96
|
+
alias-mapper update
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### Subcommands
|
|
100
|
+
|
|
101
|
+
- **`convert`** — translate a single file, or a reference FASTA plus
|
|
102
|
+
its annotation files (multi-file mode; see [Multi-file mode](#multi-file-mode)).
|
|
103
|
+
- **`update`** — re-download the latest alias data and rebuild the
|
|
104
|
+
cached database. Run manually when you want newer data.
|
|
105
|
+
|
|
106
|
+
### Supported file types
|
|
107
|
+
|
|
108
|
+
GFF (`.gff`, `.gff3`), GTF (`.gtf`), and FASTA (`.fa`, `.fasta`,
|
|
109
|
+
`.fna`). The translator is picked by file extension; a trailing `.gz` is ignored, so gzipped inputs such as `genome.fa.gz` work too.
|
|
110
|
+
|
|
111
|
+
### Supported conventions
|
|
112
|
+
|
|
113
|
+
`genbank`, `refseq`, `ucsc`, `sequence-name`, `assigned-molecule`.
|
|
114
|
+
|
|
115
|
+
### Examples
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
# Translate from RefSeq to UCSC explicitly
|
|
119
|
+
alias-mapper convert annotations.gff \
|
|
120
|
+
--from refseq --to ucsc \
|
|
121
|
+
-o out.gff
|
|
122
|
+
|
|
123
|
+
# Pin the assembly when auto-detection is ambiguous
|
|
124
|
+
alias-mapper convert annotations.gff \
|
|
125
|
+
--to ucsc \
|
|
126
|
+
--assembly GCF_000001405.40 \
|
|
127
|
+
-o out.gff
|
|
128
|
+
|
|
129
|
+
# FASTA — same syntax, different file
|
|
130
|
+
alias-mapper convert reference.fa \
|
|
131
|
+
--from genbank --to sequence-name \
|
|
132
|
+
--assembly GCA_963924405.1 \
|
|
133
|
+
-o reference.renamed.fa
|
|
134
|
+
|
|
135
|
+
# Multi-file conform: rewrite the annotations to match reference.fa's
|
|
136
|
+
# own convention; reference.fa is left untouched
|
|
137
|
+
alias-mapper convert --fasta reference.fa genes.gff peaks.bed.gff \
|
|
138
|
+
--out-dir conformed/
|
|
139
|
+
|
|
140
|
+
# Multi-file overwrite: force reference.fa and its annotations to UCSC
|
|
141
|
+
alias-mapper convert --fasta reference.fa genes.gff \
|
|
142
|
+
--overwrite-to ucsc --out-dir ucsc_out/
|
|
143
|
+
|
|
144
|
+
# Refresh the cached alias data
|
|
145
|
+
alias-mapper update
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
### Multi-file mode
|
|
149
|
+
|
|
150
|
+
Pass `--fasta <ref>` to process a reference FASTA together with its
|
|
151
|
+
annotation files in one invocation. The assembly is detected once from
|
|
152
|
+
the FASTA and the alias table is loaded once for the whole batch.
|
|
153
|
+
Outputs go to `--out-dir`, named `<stem>.<convention>.<ext>` (gzip
|
|
154
|
+
preserved).
|
|
155
|
+
|
|
156
|
+
There are two modes:
|
|
157
|
+
|
|
158
|
+
- **Conform** (the default, when `--overwrite-to` is omitted): each
|
|
159
|
+
annotation is rewritten to match the FASTA's *own* convention, and
|
|
160
|
+
the FASTA is left unchanged. Use this to make a set of annotations
|
|
161
|
+
agree with a genome you already have. The FASTA is not copied into
|
|
162
|
+
the output directory, since it is unchanged.
|
|
163
|
+
- **Overwrite** (`--overwrite-to <convention>`): the FASTA and every
|
|
164
|
+
annotation are converted to the named convention.
|
|
165
|
+
|
|
166
|
+
`--to` is single-file only; in `--fasta` mode use `--overwrite-to`
|
|
167
|
+
(or omit it to conform).
|
|
168
|
+
|
|
169
|
+
### Flags (`convert`)
|
|
170
|
+
|
|
171
|
+
| Flag | Mode | Purpose |
|
|
172
|
+
| ---------------- | ----------- | ------------------------------------------------------------- |
|
|
173
|
+
| `--to` | single-file | Target naming convention (required in single-file mode) |
|
|
174
|
+
| `-o` | single-file | Output path |
|
|
175
|
+
| `--fasta` | multi-file | Reference FASTA; enables multi-file mode |
|
|
176
|
+
| `--overwrite-to` | multi-file | Force the FASTA and all annotations to this convention |
|
|
177
|
+
| `--out-dir` | multi-file | Output directory for the converted files |
|
|
178
|
+
| `--from` | both | Source convention. Auto-detected if absent (not used to conform) |
|
|
179
|
+
| `--assembly` | both | Assembly accession. Auto-detected if absent |
|
|
180
|
+
| `--alias-db` | both | Path to a specific alias SQLite database (overrides cache) |
|
|
181
|
+
|
|
182
|
+
### Auto-detection
|
|
183
|
+
|
|
184
|
+
When `--from` or `--assembly` is omitted, the tool reads up to 50
|
|
185
|
+
unique sequence names from the input and scores them against the
|
|
186
|
+
database. It commits to a result only when the top candidate has at
|
|
187
|
+
least 5 matches and beats the runner-up by 2× or more. Otherwise it
|
|
188
|
+
errors out and asks for the flag explicitly.
|
|
189
|
+
|
|
190
|
+
### Unmapped names
|
|
191
|
+
|
|
192
|
+
If a sequence name in the input isn't in the alias database, the line
|
|
193
|
+
is written to the output unchanged and counted in the unmapped total.
|
|
194
|
+
Up to five example names are printed at the end of the run so you can
|
|
195
|
+
see what didn't translate.
|
|
196
|
+
|
|
197
|
+
Before giving up on a name, the tool tries a couple of conservative
|
|
198
|
+
fallbacks: swapping a UCSC-style `vN` version separator for the `.N`
|
|
199
|
+
form (and vice versa), and stripping an `ENA|...|accession` header
|
|
200
|
+
wrapper down to the bare accession. These only run when the exact name
|
|
201
|
+
isn't found, so they never override a direct match.
|
|
202
|
+
|
|
203
|
+
## Data updates
|
|
204
|
+
|
|
205
|
+
A weekly GitHub Actions workflow rebuilds the alias dataset from
|
|
206
|
+
NCBI's published assembly summaries and publishes it as a
|
|
207
|
+
`data-YYYY-MM-DD` GitHub Release. Each release ships three artifacts:
|
|
208
|
+
|
|
209
|
+
- `aliases.tsv.gz` — the merged-row alias data the CLI consumes.
|
|
210
|
+
- `historical.tsv.gz` — dead-accession lookup with suppression dates
|
|
211
|
+
and best-effort replacements.
|
|
212
|
+
- `failures.tsv` — per-assembly collection failure log.
|
|
213
|
+
|
|
214
|
+
## More
|
|
215
|
+
|
|
216
|
+
See [`docs/design.md`](docs/design.md) for architecture, design
|
|
217
|
+
decisions, and direction.
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
alias_mapper/__init__.py,sha256=O0JAPQuXu12gyFvu0kp_KSoZed5yAWYthF0X5l5ZXdE,197
|
|
2
|
+
alias_mapper/_ssl.py,sha256=mEiEkR_I92WSvS1hrRrKx39iddiS0e5MqIhZopDFH5A,1219
|
|
3
|
+
alias_mapper/alias_source.py,sha256=y4WduMiDOJoUqW0uH3hr3ZFx22dp28VS82xh3wi8XKA,12350
|
|
4
|
+
alias_mapper/bootstrap.py,sha256=Xxb9LiXGq9U49JzxqlIHqIYqGL-kCyOwRa7OIg6248A,11453
|
|
5
|
+
alias_mapper/build_alias_db.py,sha256=9QsAXfYsf5mKOOpIekUMxoINIaa9gdOeHfW0b7geoT4,15006
|
|
6
|
+
alias_mapper/cli.py,sha256=nQKO0SpdNEoTkclKiThOZIuSZh3utFrCCs9gBP9DBUk,21935
|
|
7
|
+
alias_mapper/formats/__init__.py,sha256=YHS55y63dO67zwbbnFC_mrRQNlAxueKA7voydgFiHsk,2091
|
|
8
|
+
alias_mapper/formats/_io.py,sha256=BbNLeW0hAlalBDrmnbGde0KAu1UVfKw-hjHlBkO6eX4,2429
|
|
9
|
+
alias_mapper/formats/_resolve.py,sha256=MWtpZfqT6tLc1H2yhq2_4s0hRQvCZYQ3P9Axc1Pyedo,4105
|
|
10
|
+
alias_mapper/formats/base.py,sha256=sXNjtI8RYaTayVJ83yKTpoNIFiH4QHefRSo9vNTPGwY,1805
|
|
11
|
+
alias_mapper/formats/fasta.py,sha256=gVic5h7Ww_H1BbpA5KAAEWKE-DlQJv9sMLX_VEi9d4A,3441
|
|
12
|
+
alias_mapper/formats/gff.py,sha256=7TITMSxfN10nF72c9BOb7Vd9YXLJfrlA-yhqrKh3gL0,2069
|
|
13
|
+
alias_mapper-1.0.0.dist-info/licenses/LICENSE,sha256=KT4Ss1FW0agpBfeCymuPj_WaHXCTmBw-Pa13nSOvYFU,1066
|
|
14
|
+
alias_mapper-1.0.0.dist-info/METADATA,sha256=3jBwGt2qVFKP2C3xDCr1Xji3x297t58812VdrU8PW0g,8237
|
|
15
|
+
alias_mapper-1.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
16
|
+
alias_mapper-1.0.0.dist-info/entry_points.txt,sha256=5N8tR8vDiFTXnLW7skIV1si8w9SWN6I30tPsdjyY_EM,55
|
|
17
|
+
alias_mapper-1.0.0.dist-info/top_level.txt,sha256=TdP8YWOGPZqC_3it-AUCX47RPt29G_dnxaG_l3CFRW0,13
|
|
18
|
+
alias_mapper-1.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Max Reese
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
alias_mapper
|