alias-mapper 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,73 @@
1
+ """
2
+ Transparent gzip handling for input and output files.
3
+
4
+ Genomics files (FASTA, GFF, GTF) are very often distributed gzipped,
5
+ sometimes without a telltale .gz suffix (e.g. a browser "download"
6
+ endpoint). These helpers let the rest of the package open a path
7
+ without caring whether it's compressed:
8
+
9
+ - reads sniff the gzip magic bytes, so a gzipped file works even if
10
+ its name doesn't end in .gz;
11
+ - writes compress when the chosen output path ends in .gz;
12
+ - format detection ignores a trailing .gz so `genome.fa.gz` still
13
+ resolves to the FASTA translator.
14
+
15
+ This is the only place in the package that decides "is this gzip?",
16
+ so the rule lives in one spot.
17
+ """
18
+
19
+ import gzip
20
+ import io
21
+ from pathlib import Path
22
+
23
# Every gzip stream opens with this two-byte signature (RFC 1952).
_GZIP_MAGIC = b"\x1f\x8b"


def is_gzip(path) -> bool:
    """
    Tell whether the file at `path` holds gzip data, judging by content.

    The decision is made by comparing the file's leading bytes with the
    gzip signature, so the file name (with or without a .gz suffix) plays
    no part. Any OSError raised while opening or reading the file (for
    example because it does not exist) results in False rather than an
    exception.
    """
    try:
        with open(path, "rb") as stream:
            head = stream.read(len(_GZIP_MAGIC))
    except OSError:
        return False
    return head == _GZIP_MAGIC
41
+
42
+
43
def open_text_read(path, encoding: str = "utf-8") -> io.TextIOBase:
    """
    Return a text-mode reader for `path`, gunzipping on the fly if needed.

    Whether the file is compressed is decided by is_gzip(), i.e. by its
    content rather than its name. Plain files are opened with the builtin
    open(); gzipped files with gzip.open() in text mode. Both use the
    given `encoding`.
    """
    if not is_gzip(path):
        return open(path, "r", encoding=encoding)
    return gzip.open(path, "rt", encoding=encoding)
48
+
49
+
50
def open_text_write(path, encoding: str = "utf-8") -> io.TextIOBase:
    """
    Return a text-mode writer for `path`, gzipping if the name ends in .gz.

    There is no content to inspect before writing, so the choice is made
    from the file name alone: a final suffix of ".gz" (compared
    case-insensitively) selects gzip output, anything else is written as
    plain text. Both writers use the given `encoding`.
    """
    compressed = Path(path).suffix.lower() == ".gz"
    opener, mode = (gzip.open, "wt") if compressed else (open, "w")
    return opener(path, mode, encoding=encoding)
60
+
61
+
62
def effective_suffix(path) -> str:
    """
    Return the lower-cased suffix that identifies the file format.

    A final ".gz" is treated as a compression wrapper and skipped, so the
    suffix underneath it is reported instead:

        genome.fa.gz -> ".fa"
        genome.gff   -> ".gff"
        genome.gz    -> ""

    This lets extension-based format detection work on compressed files.
    """
    target = Path(path)
    outer = target.suffix.lower()
    if outer != ".gz":
        return outer
    # Drop the .gz layer and report whatever suffix the stem carries.
    return Path(target.stem).suffix.lower()
@@ -0,0 +1,117 @@
1
+ """
2
+ Fallback name resolution for alias lookups.
3
+
4
+ The primary lookup is an exact dict hit: ``alias_map[name]``. When that
5
+ misses, real-world inputs often carry the *same* identifier in a
6
+ different surface form. Rather than fail those outright, we try a small,
7
+ conservative set of normalizations and retry the lookup. The fallbacks
8
+ only run on a miss, so the common path stays a single dict lookup.
9
+
10
+ Two variant classes are handled, both low-risk normalizations of the
11
+ same underlying accession (not fuzzy matching):
12
+
13
+ Version separator (.N <-> vN)
14
+ UCSC writes unplaced/unlocalized scaffolds with a 'v' version
15
+ separator (NW_013982187v1) where GenBank/RefSeq use a dot
16
+ (NW_013982187.1). If a name in one form misses, try the other.
17
+
18
+ ENA pipe-prefixed headers (ENA|<unversioned>|<versioned>)
19
+ ENA FASTA headers wrap the accession as 'ENA|ACC|ACC.v'. The bare
20
+ accession is what matches our columns, so on a miss we peel the ENA
21
+ wrapper and retry on the inner accession(s), most-likely first.
22
+
23
+ Resolution order: exact name, then ENA-unwrapped accession(s), with a
24
+ version-separator swap tried on each. First hit wins. None means the
25
+ name is genuinely unmapped and the caller passes the line through
26
+ unchanged, exactly as before this fallback existed.
27
+ """
28
+
29
+ import re
30
+
31
# Patterns for a trailing version in either surface form: ".1" or "v1".
# The base group is lazy, but the version group only accepts digits and is
# anchored at the end of the string, so the split always falls on the LAST
# separator (e.g. "GL000.2.1" -> base "GL000.2", version "1").
_DOT_VERSION = re.compile(r"^(.*?)\.(\d+)$")
_V_VERSION = re.compile(r"^(.*?)v(\d+)$")


def _swap_version_suffix(name):
    """
    Yield ``name`` rewritten with the other version separator, if it has one.

    A trailing ".N" becomes "vN" ("NW_013982187.1" -> "NW_013982187v1")
    and a trailing "vN" becomes ".N". The dot form is checked first. A
    name with neither ending produces an empty iteration, so callers can
    loop over the result without testing first.
    """
    dotted = _DOT_VERSION.match(name)
    if dotted is not None:
        base, version = dotted.groups()
        yield f"{base}v{version}"
    else:
        v_form = _V_VERSION.match(name)
        if v_form is not None:
            base, version = v_form.groups()
            yield f"{base}.{version}"
53
+
54
+
55
def _strip_ena_prefix(name):
    """
    Yield the accession field(s) wrapped inside an 'ENA|...|...' name.

    The name is split on '|'. If the first field is "ENA" (compared
    case-insensitively), the remaining non-empty fields are yielded from
    last to first. For ENA|CAJVCV010000001|CAJVCV010000001.1 that gives
    "CAJVCV010000001.1" (the versioned accession) followed by
    "CAJVCV010000001" (the unversioned one). Any name without a '|' or
    without the ENA tag yields nothing.
    """
    tag, pipe, remainder = name.partition("|")
    if not pipe or tag.upper() != "ENA":
        return
    accessions = [field for field in remainder.split("|") if field]
    # Last field first: it is usually the versioned accession, which is
    # the most likely form to match.
    yield from reversed(accessions)
74
+
75
+
76
def _candidates(name):
    """
    Yield the forms of ``name`` to look up, most preferred first.

    The sequence is: the name itself, its version-separator swap, then
    each ENA-unwrapped accession followed by its own swap. Empty strings
    and forms already yielded are skipped, so every surface form appears
    at most once. A non-empty ``name`` is always the first item.
    """
    emitted = set()

    def fresh(forms):
        # Pass through only non-empty forms that have not been yielded yet.
        for form in forms:
            if form and form not in emitted:
                emitted.add(form)
                yield form

    # The ENA-unwrapped accessions join the original name as base forms,
    # so the version swap is also attempted on them.
    for base in [name, *_strip_ena_prefix(name)]:
        yield from fresh((base,))
        yield from fresh(_swap_version_suffix(base))
103
+
104
+
105
def resolve_alias(name: str, alias_map: dict) -> str | None:
    """
    Resolve ``name`` through ``alias_map``, falling back to variant forms.

    Each form produced by _candidates() is looked up in turn, and the
    first one whose mapped value is not None is returned. If no form has
    such a value the result is None. Forms are generated lazily, so a hit
    on the first candidate stops before later variants are computed.
    """
    targets = (alias_map.get(form) for form in _candidates(name))
    return next((target for target in targets if target is not None), None)
@@ -0,0 +1,51 @@
1
+ """Abstract base class for file format translators."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from pathlib import Path
5
+
6
+
7
class FileTranslator(ABC):
    """
    Interface for rewriting sequence names in a single file format.

    A concrete translator knows where its format stores sequence names and
    how to substitute them. Lines that carry no translatable name
    (comments, headers, blank lines) are expected to be returned as they
    came in.
    """

    @abstractmethod
    def translate_line(self, line: str, alias_map: dict, stats: dict) -> str:
        """
        Rewrite the sequence name on a single line of input.

        Args:
            line: A line read from the input file, trailing newline
                included.
            alias_map: Mapping of source name to target name, as built by
                AliasSource.
            stats: Running totals shared across calls and updated in
                place. Expected keys: 'mapped' (int), 'unmapped' (int)
                and 'unmapped_examples' (set).

        Returns:
            The line with its sequence name replaced, trailing newline
            included. A line without a translatable sequence name comes
            back unchanged.
        """

    @abstractmethod
    def sample_names(self, path: Path, limit: int = 50) -> list[str]:
        """
        Collect the first `limit` distinct sequence names found in a file.

        Auto-detection uses this sample to work out which naming
        convention and assembly the input follows. Implementations stop
        reading as soon as `limit` distinct names have been gathered, so a
        large file is not scanned past that point.

        Args:
            path: Location of the input file.
            limit: Upper bound on the number of distinct names returned.

        Returns:
            The distinct names in order of first appearance. The list is
            shorter than `limit` when the file contains fewer distinct
            names.
        """
@@ -0,0 +1,91 @@
1
+ """FASTA translator. Sequence name lives in the header line, after the '>'."""
2
+
3
+ from pathlib import Path
4
+
5
+ from .base import FileTranslator
6
+ from ._io import open_text_read
7
+ from ._resolve import resolve_alias
8
+
9
+
10
def _split_fasta_header(body: str) -> tuple[str, str]:
    """
    Split a FASTA header body into ``(name, rest)``.

    ``body`` is the header text after the leading '>' with the trailing
    newline already removed. ``name`` is everything before the first
    whitespace character; ``rest`` is everything from that character on
    (whitespace included), or "" when the header has no whitespace.
    ``name`` is "" when the body is empty or starts with whitespace.
    """
    cut = next((i for i, ch in enumerate(body) if ch.isspace()), len(body))
    return body[:cut], body[cut:]


class FastaTranslator(FileTranslator):
    """
    Translator for FASTA files.

    FASTA structure:
      - Header lines start with '>'. Format: '>NAME [WHITESPACE DESCRIPTION]'.
        Only the NAME (the text between '>' and the first whitespace
        character) is the sequence identifier. The description, if
        present, is preserved verbatim, including the exact whitespace
        between name and description.
      - Every other line (sequence content, blank lines) passes through
        unchanged.

    Translation rule:
      - If the header's name resolves in the alias map, the name is
        replaced with the target and the rest of the line is kept as is.
      - If the name doesn't resolve, the whole line passes through
        unchanged and is counted as unmapped.

    Name lookup goes through resolve_alias: an exact map hit is used when
    present, and conservative fallbacks (ENA prefix strip, .N/vN
    version-separator swap) are tried only when the exact name misses.
    """

    def translate_line(self, line: str, alias_map: dict, stats: dict) -> str:
        """
        Translate the name on a FASTA header line; pass other lines through.

        Args:
            line: One input line, normally with its trailing newline.
            alias_map: Mapping of source name to target name.
            stats: Dict with 'mapped' and 'unmapped' counters and an
                'unmapped_examples' set, updated in place for header
                lines that carry a name.

        Returns:
            The header with its name replaced when the name resolves;
            otherwise the input line unchanged.
        """
        if not line.startswith(">"):
            # Sequence line, blank line, or anything else non-header.
            return line

        # Detach only the newline (not all trailing whitespace) so that
        # whitespace inside the description survives and the newline can
        # be reattached exactly as it was.
        if line.endswith("\n"):
            body, newline = line[1:-1], "\n"
        else:
            body, newline = line[1:], ""

        name, rest = _split_fasta_header(body)
        if not name:
            # Malformed header like '>' or '> description'. Nothing to
            # translate; pass through unchanged and uncounted.
            return line

        new_name = resolve_alias(name, alias_map)
        if new_name is None:
            stats["unmapped"] += 1
            stats["unmapped_examples"].add(name)
            return line

        stats["mapped"] += 1
        return f">{new_name}{rest}{newline}"

    def sample_names(self, path: Path, limit: int = 50) -> list[str]:
        """
        Return up to `limit` distinct header names, in order of appearance.

        Headers are parsed with the same helper translate_line uses, so a
        name collected here is the same string that would be looked up.
        Reading stops once `limit` names have been collected. A `limit`
        of zero or less returns an empty list without opening the file.
        """
        names: list[str] = []
        if limit <= 0:
            return names
        seen: set[str] = set()
        with open_text_read(path) as f:
            for line in f:
                if not line.startswith(">"):
                    continue
                name, _ = _split_fasta_header(line[1:].rstrip("\n"))
                if name and name not in seen:
                    seen.add(name)
                    names.append(name)
                    if len(names) >= limit:
                        break
        return names
@@ -0,0 +1,63 @@
1
+ """GFF / GTF translator. Both formats put the sequence name in column 1."""
2
+
3
+ from pathlib import Path
4
+
5
+ from .base import FileTranslator
6
+ from ._io import open_text_read
7
+ from ._resolve import resolve_alias
8
+
9
+
10
class GffTranslator(FileTranslator):
    """
    Translator for GFF, GFF3, and GTF files.

    All three are tab-separated with the sequence name in column 1.
    Lines starting with '#' (comments and directives) and blank lines
    pass through unchanged and are not counted in the statistics.

    Name lookup goes through resolve_alias, so an exact map hit is used
    when present and a small set of conservative fallbacks (ENA prefix
    strip, .N/vN version-separator swap) is tried only when the exact
    name misses.

    Known limitation: '##sequence-region <name> ...' lines contain a
    sequence name, but because they start with '#' they are passed
    through untranslated.
    """

    def translate_line(self, line: str, alias_map: dict, stats: dict) -> str:
        """
        Translate the column-1 sequence name of one GFF/GTF line.

        Args:
            line: One input line, normally with its trailing newline.
            alias_map: Mapping of source name to target name.
            stats: Dict with 'mapped' and 'unmapped' counters and an
                'unmapped_examples' set, updated in place for lines that
                carry a sequence name.

        Returns:
            The line with column 1 replaced when the name resolves;
            otherwise the input line unchanged. The line ending is
            preserved exactly: a line without a trailing newline does not
            gain one.
        """
        # Comments, directives, and blank or whitespace-only lines carry
        # no sequence name. They must not be counted as unmapped.
        if line.startswith("#") or not line.strip():
            return line

        # Detach at most one trailing newline so it can be reattached
        # exactly as it was.
        if line.endswith("\n"):
            body, newline = line[:-1], "\n"
        else:
            body, newline = line, ""

        seq_name, tab, remainder = body.partition("\t")
        if not seq_name:
            # Empty column 1: malformed, nothing to look up.
            return line

        new_name = resolve_alias(seq_name, alias_map)
        if new_name is None:
            stats["unmapped"] += 1
            stats["unmapped_examples"].add(seq_name)
            return line

        stats["mapped"] += 1
        return f"{new_name}{tab}{remainder}{newline}"

    def sample_names(self, path: Path, limit: int = 50) -> list[str]:
        """
        Return up to `limit` distinct column-1 names, in order of appearance.

        Lines starting with '#' and lines with an empty first column are
        skipped. Reading stops once `limit` names have been collected. A
        `limit` of zero or less returns an empty list without opening the
        file.
        """
        names: list[str] = []
        if limit <= 0:
            return names
        seen: set[str] = set()
        with open_text_read(path) as f:
            for line in f:
                if line.startswith("#"):
                    continue
                name = line.rstrip("\n").partition("\t")[0]
                if name and name not in seen:
                    seen.add(name)
                    names.append(name)
                    if len(names) >= limit:
                        break
        return names
@@ -0,0 +1,217 @@
1
+ Metadata-Version: 2.4
2
+ Name: alias-mapper
3
+ Version: 1.0.0
4
+ Summary: Translate chromosome/scaffold names in bioinformatics files between naming conventions
5
+ Author: Max Reese
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/guigolab/alias-mapper
8
+ Project-URL: Issues, https://github.com/guigolab/alias-mapper/issues
9
+ Keywords: bioinformatics,genomics,gff,fasta,naming-conventions,ncbi
10
+ Classifier: Development Status :: 5 - Production/Stable
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Operating System :: OS Independent
20
+ Requires-Python: >=3.10
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: platformdirs>=4.0
24
+ Requires-Dist: certifi
25
+ Provides-Extra: trusted
26
+ Requires-Dist: truststore; extra == "trusted"
27
+ Provides-Extra: test
28
+ Requires-Dist: pytest>=7; extra == "test"
29
+ Dynamic: license-file
30
+
31
+ # alias-mapper
32
+
33
+ Translate chromosome and scaffold names in bioinformatics files
34
+ between naming conventions (GenBank, RefSeq, UCSC, and others).
35
+
36
+ ## What it does
37
+
38
+ Research files from different sources use different names for the same
39
+ sequences: `chr1`, `NC_000001.11`, `CM000663.2`, and `1` can all refer
40
+ to the same human chromosome. Files using different conventions can't
41
+ be combined without translation.
42
+
43
+ `alias-mapper` rewrites the sequence names in GFF, GTF, and FASTA
44
+ files from one convention to another using a precomputed alias table
45
+ built from NCBI assembly reports. Source convention and genome
46
+ assembly are auto-detected from the input by default.
47
+
48
+ ## Install
49
+
50
+ ```bash
51
+ pip install git+https://github.com/guigolab/alias-mapper.git
52
+ ```
53
+
54
+ On networks that perform TLS inspection (corporate / institutional,
55
+ e.g. CRG), also install the `trusted` extra so the tool uses the
56
+ system keychain for cert verification:
57
+
58
+ ```bash
59
+ pip install "alias-mapper[trusted] @ git+https://github.com/guigolab/alias-mapper.git"
60
+ ```
61
+
62
+ The first time you run `convert`, the tool downloads the latest alias
63
+ data (~100 MB) from GitHub Releases and builds a local SQLite database
64
+ in your platform cache directory:
65
+
66
+ - macOS: `~/Library/Caches/alias-mapper/aliases.db`
67
+ - Linux: `~/.cache/alias-mapper/aliases.db`
68
+ - Windows: `%LOCALAPPDATA%\alias-mapper\Cache\aliases.db`
69
+
70
+ First-run setup takes about a minute. Subsequent runs use the cached
71
+ database directly. If the database schema changes in a newer release,
72
+ the cache is rebuilt automatically.
73
+
74
+ ## Quickstart
75
+
76
+ ```bash
77
+ alias-mapper convert annotations.gff --to ucsc -o annotations.ucsc.gff
78
+ ```
79
+
80
+ A summary on stderr reports how many rows were translated and how many
81
+ had sequence names not in the alias database (those rows are passed
82
+ through unchanged with a warning).
83
+
84
+ ## Usage
85
+
86
+ ```
87
+ # single file
88
+ alias-mapper convert <input> --to <convention> -o <output> [options]
89
+
90
+ # multi-file: conform annotations to a reference FASTA (FASTA untouched)
91
+ alias-mapper convert --fasta <ref> [<ann> ...] --out-dir <dir> [options]
92
+
93
+ # multi-file: force the FASTA and annotations to one convention
94
+ alias-mapper convert --fasta <ref> [<ann> ...] --overwrite-to <convention> --out-dir <dir>
95
+
96
+ alias-mapper update
97
+ ```
98
+
99
+ ### Subcommands
100
+
101
+ - **`convert`** — translate a single file, or a reference FASTA plus
102
+ its annotation files (multi-file mode; see [Multi-file mode](#multi-file-mode)).
103
+ - **`update`** — re-download the latest alias data and rebuild the
104
+ cached database. Run manually when you want newer data.
105
+
106
+ ### Supported file types
107
+
108
+ GFF (`.gff`, `.gff3`), GTF (`.gtf`), and FASTA (`.fa`, `.fasta`,
109
+ `.fna`). The translator is picked by file extension; a trailing `.gz` is ignored, so `genome.fa.gz` still resolves to the FASTA translator.
110
+
111
+ ### Supported conventions
112
+
113
+ `genbank`, `refseq`, `ucsc`, `sequence-name`, `assigned-molecule`.
114
+
115
+ ### Examples
116
+
117
+ ```bash
118
+ # Translate from RefSeq to UCSC explicitly
119
+ alias-mapper convert annotations.gff \
120
+ --from refseq --to ucsc \
121
+ -o out.gff
122
+
123
+ # Pin the assembly when auto-detection is ambiguous
124
+ alias-mapper convert annotations.gff \
125
+ --to ucsc \
126
+ --assembly GCF_000001405.40 \
127
+ -o out.gff
128
+
129
+ # FASTA — same syntax, different file
130
+ alias-mapper convert reference.fa \
131
+ --from genbank --to sequence-name \
132
+ --assembly GCA_963924405.1 \
133
+ -o reference.renamed.fa
134
+
135
+ # Multi-file conform: rewrite the annotations to match reference.fa's
136
+ # own convention; reference.fa is left untouched
137
+ alias-mapper convert --fasta reference.fa genes.gff peaks.bed.gff \
138
+ --out-dir conformed/
139
+
140
+ # Multi-file overwrite: force reference.fa and its annotations to UCSC
141
+ alias-mapper convert --fasta reference.fa genes.gff \
142
+ --overwrite-to ucsc --out-dir ucsc_out/
143
+
144
+ # Refresh the cached alias data
145
+ alias-mapper update
146
+ ```
147
+
148
+ ### Multi-file mode
149
+
150
+ Pass `--fasta <ref>` to process a reference FASTA together with its
151
+ annotation files in one invocation. The assembly is detected once from
152
+ the FASTA and the alias table is loaded once for the whole batch.
153
+ Outputs go to `--out-dir`, named `<stem>.<convention>.<ext>` (gzip
154
+ preserved).
155
+
156
+ There are two modes:
157
+
158
+ - **Conform** (the default, when `--overwrite-to` is omitted): each
159
+ annotation is rewritten to match the FASTA's *own* convention, and
160
+ the FASTA is left unchanged. Use this to make a set of annotations
161
+ agree with a genome you already have. The FASTA is not copied into
162
+ the output directory, since it is unchanged.
163
+ - **Overwrite** (`--overwrite-to <convention>`): the FASTA and every
164
+ annotation are converted to the named convention.
165
+
166
+ `--to` is single-file only; in `--fasta` mode use `--overwrite-to`
167
+ (or omit it to conform).
168
+
169
+ ### Flags (`convert`)
170
+
171
+ | Flag | Mode | Purpose |
172
+ | ---------------- | ----------- | ------------------------------------------------------------- |
173
+ | `--to` | single-file | Target naming convention (required in single-file mode) |
174
+ | `-o` | single-file | Output path |
175
+ | `--fasta` | multi-file | Reference FASTA; enables multi-file mode |
176
+ | `--overwrite-to` | multi-file | Force the FASTA and all annotations to this convention |
177
+ | `--out-dir` | multi-file | Output directory for the converted files |
178
+ | `--from` | both | Source convention. Auto-detected if absent (not used to conform) |
179
+ | `--assembly` | both | Assembly accession. Auto-detected if absent |
180
+ | `--alias-db` | both | Path to a specific alias SQLite database (overrides cache) |
181
+
182
+ ### Auto-detection
183
+
184
+ When `--from` or `--assembly` is omitted, the tool reads up to 50
185
+ unique sequence names from the input and scores them against the
186
+ database. It commits to a result only when the top candidate has at
187
+ least 5 matches and beats the runner-up by 2× or more. Otherwise it
188
+ errors out and asks for the flag explicitly.
189
+
190
+ ### Unmapped names
191
+
192
+ If a sequence name in the input isn't in the alias database, the line
193
+ is written to the output unchanged and counted in the unmapped total.
194
+ Up to five example names are printed at the end of the run so you can
195
+ see what didn't translate.
196
+
197
+ Before giving up on a name, the tool tries a couple of conservative
198
+ fallbacks: swapping a UCSC-style `vN` version separator for the `.N`
199
+ form (and vice versa), and stripping an `ENA|...|accession` header
200
+ wrapper down to the bare accession. These only run when the exact name
201
+ isn't found, so they never override a direct match.
202
+
203
+ ## Data updates
204
+
205
+ A weekly GitHub Actions workflow rebuilds the alias dataset from
206
+ NCBI's published assembly summaries and publishes it as a
207
+ `data-YYYY-MM-DD` GitHub Release. Each release ships three artifacts:
208
+
209
+ - `aliases.tsv.gz` — the merged-row alias data the CLI consumes.
210
+ - `historical.tsv.gz` — dead-accession lookup with suppression dates
211
+ and best-effort replacements.
212
+ - `failures.tsv` — per-assembly collection failure log.
213
+
214
+ ## More
215
+
216
+ See [`docs/design.md`](docs/design.md) for architecture, design
217
+ decisions, and direction.
@@ -0,0 +1,18 @@
1
+ alias_mapper/__init__.py,sha256=O0JAPQuXu12gyFvu0kp_KSoZed5yAWYthF0X5l5ZXdE,197
2
+ alias_mapper/_ssl.py,sha256=mEiEkR_I92WSvS1hrRrKx39iddiS0e5MqIhZopDFH5A,1219
3
+ alias_mapper/alias_source.py,sha256=y4WduMiDOJoUqW0uH3hr3ZFx22dp28VS82xh3wi8XKA,12350
4
+ alias_mapper/bootstrap.py,sha256=Xxb9LiXGq9U49JzxqlIHqIYqGL-kCyOwRa7OIg6248A,11453
5
+ alias_mapper/build_alias_db.py,sha256=9QsAXfYsf5mKOOpIekUMxoINIaa9gdOeHfW0b7geoT4,15006
6
+ alias_mapper/cli.py,sha256=nQKO0SpdNEoTkclKiThOZIuSZh3utFrCCs9gBP9DBUk,21935
7
+ alias_mapper/formats/__init__.py,sha256=YHS55y63dO67zwbbnFC_mrRQNlAxueKA7voydgFiHsk,2091
8
+ alias_mapper/formats/_io.py,sha256=BbNLeW0hAlalBDrmnbGde0KAu1UVfKw-hjHlBkO6eX4,2429
9
+ alias_mapper/formats/_resolve.py,sha256=MWtpZfqT6tLc1H2yhq2_4s0hRQvCZYQ3P9Axc1Pyedo,4105
10
+ alias_mapper/formats/base.py,sha256=sXNjtI8RYaTayVJ83yKTpoNIFiH4QHefRSo9vNTPGwY,1805
11
+ alias_mapper/formats/fasta.py,sha256=gVic5h7Ww_H1BbpA5KAAEWKE-DlQJv9sMLX_VEi9d4A,3441
12
+ alias_mapper/formats/gff.py,sha256=7TITMSxfN10nF72c9BOb7Vd9YXLJfrlA-yhqrKh3gL0,2069
13
+ alias_mapper-1.0.0.dist-info/licenses/LICENSE,sha256=KT4Ss1FW0agpBfeCymuPj_WaHXCTmBw-Pa13nSOvYFU,1066
14
+ alias_mapper-1.0.0.dist-info/METADATA,sha256=3jBwGt2qVFKP2C3xDCr1Xji3x297t58812VdrU8PW0g,8237
15
+ alias_mapper-1.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
16
+ alias_mapper-1.0.0.dist-info/entry_points.txt,sha256=5N8tR8vDiFTXnLW7skIV1si8w9SWN6I30tPsdjyY_EM,55
17
+ alias_mapper-1.0.0.dist-info/top_level.txt,sha256=TdP8YWOGPZqC_3it-AUCX47RPt29G_dnxaG_l3CFRW0,13
18
+ alias_mapper-1.0.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ alias-mapper = alias_mapper.cli:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Max Reese
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ alias_mapper