alias-mapper 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alias_mapper-1.0.0/LICENSE +21 -0
- alias_mapper-1.0.0/PKG-INFO +217 -0
- alias_mapper-1.0.0/README.md +187 -0
- alias_mapper-1.0.0/pyproject.toml +68 -0
- alias_mapper-1.0.0/setup.cfg +4 -0
- alias_mapper-1.0.0/src/alias_mapper/__init__.py +8 -0
- alias_mapper-1.0.0/src/alias_mapper/_ssl.py +40 -0
- alias_mapper-1.0.0/src/alias_mapper/alias_source.py +358 -0
- alias_mapper-1.0.0/src/alias_mapper/bootstrap.py +305 -0
- alias_mapper-1.0.0/src/alias_mapper/build_alias_db.py +407 -0
- alias_mapper-1.0.0/src/alias_mapper/cli.py +585 -0
- alias_mapper-1.0.0/src/alias_mapper/formats/__init__.py +68 -0
- alias_mapper-1.0.0/src/alias_mapper/formats/_io.py +73 -0
- alias_mapper-1.0.0/src/alias_mapper/formats/_resolve.py +117 -0
- alias_mapper-1.0.0/src/alias_mapper/formats/base.py +51 -0
- alias_mapper-1.0.0/src/alias_mapper/formats/fasta.py +91 -0
- alias_mapper-1.0.0/src/alias_mapper/formats/gff.py +63 -0
- alias_mapper-1.0.0/src/alias_mapper.egg-info/PKG-INFO +217 -0
- alias_mapper-1.0.0/src/alias_mapper.egg-info/SOURCES.txt +28 -0
- alias_mapper-1.0.0/src/alias_mapper.egg-info/dependency_links.txt +1 -0
- alias_mapper-1.0.0/src/alias_mapper.egg-info/entry_points.txt +2 -0
- alias_mapper-1.0.0/src/alias_mapper.egg-info/requires.txt +8 -0
- alias_mapper-1.0.0/src/alias_mapper.egg-info/top_level.txt +1 -0
- alias_mapper-1.0.0/tests/test_alias_source.py +115 -0
- alias_mapper-1.0.0/tests/test_build_db.py +78 -0
- alias_mapper-1.0.0/tests/test_cli_multi.py +213 -0
- alias_mapper-1.0.0/tests/test_cli_smoke.py +77 -0
- alias_mapper-1.0.0/tests/test_formats.py +115 -0
- alias_mapper-1.0.0/tests/test_io.py +75 -0
- alias_mapper-1.0.0/tests/test_resolve.py +99 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Max Reese
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: alias-mapper
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Translate chromosome/scaffold names in bioinformatics files between naming conventions
|
|
5
|
+
Author: Max Reese
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/guigolab/alias-mapper
|
|
8
|
+
Project-URL: Issues, https://github.com/guigolab/alias-mapper/issues
|
|
9
|
+
Keywords: bioinformatics,genomics,gff,fasta,naming-conventions,ncbi
|
|
10
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Operating System :: OS Independent
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: platformdirs>=4.0
|
|
24
|
+
Requires-Dist: certifi
|
|
25
|
+
Provides-Extra: trusted
|
|
26
|
+
Requires-Dist: truststore; extra == "trusted"
|
|
27
|
+
Provides-Extra: test
|
|
28
|
+
Requires-Dist: pytest>=7; extra == "test"
|
|
29
|
+
Dynamic: license-file
|
|
30
|
+
|
|
31
|
+
# alias-mapper
|
|
32
|
+
|
|
33
|
+
Translate chromosome and scaffold names in bioinformatics files
|
|
34
|
+
between naming conventions (GenBank, RefSeq, UCSC, and others).
|
|
35
|
+
|
|
36
|
+
## What it does
|
|
37
|
+
|
|
38
|
+
Research files from different sources use different names for the same
|
|
39
|
+
sequences: `chr1`, `NC_000001.11`, `CM000663.2`, and `1` can all refer
|
|
40
|
+
to the same human chromosome. Files using different conventions can't
|
|
41
|
+
be combined without translation.
|
|
42
|
+
|
|
43
|
+
`alias-mapper` rewrites the sequence names in GFF, GTF, and FASTA
|
|
44
|
+
files from one convention to another using a precomputed alias table
|
|
45
|
+
built from NCBI assembly reports. Source convention and genome
|
|
46
|
+
assembly are auto-detected from the input by default.
|
|
47
|
+
|
|
48
|
+
## Install
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
pip install git+https://github.com/guigolab/alias-mapper.git
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
On networks that perform TLS inspection (corporate / institutional,
|
|
55
|
+
e.g. CRG), also install the `trusted` extra so the tool uses the
|
|
56
|
+
system keychain for cert verification:
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
pip install "alias-mapper[trusted] @ git+https://github.com/guigolab/alias-mapper.git"
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
The first time you run `convert`, the tool downloads the latest alias
|
|
63
|
+
data (~100 MB) from GitHub Releases and builds a local SQLite database
|
|
64
|
+
in your platform cache directory:
|
|
65
|
+
|
|
66
|
+
- macOS: `~/Library/Caches/alias-mapper/aliases.db`
|
|
67
|
+
- Linux: `~/.cache/alias-mapper/aliases.db`
|
|
68
|
+
- Windows: `%LOCALAPPDATA%\alias-mapper\Cache\aliases.db`
|
|
69
|
+
|
|
70
|
+
First-run setup takes about a minute. Subsequent runs use the cached
|
|
71
|
+
database directly. If the database schema changes in a newer release,
|
|
72
|
+
the cache is rebuilt automatically.
|
|
73
|
+
|
|
74
|
+
## Quickstart
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
alias-mapper convert annotations.gff --to ucsc -o annotations.ucsc.gff
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
A summary on stderr reports how many rows were translated and how many
|
|
81
|
+
had sequence names not in the alias database (those rows are passed
|
|
82
|
+
through unchanged with a warning).
|
|
83
|
+
|
|
84
|
+
## Usage
|
|
85
|
+
|
|
86
|
+
```
|
|
87
|
+
# single file
|
|
88
|
+
alias-mapper convert <input> --to <convention> -o <output> [options]
|
|
89
|
+
|
|
90
|
+
# multi-file: conform annotations to a reference FASTA (FASTA untouched)
|
|
91
|
+
alias-mapper convert --fasta <ref> [<ann> ...] --out-dir <dir> [options]
|
|
92
|
+
|
|
93
|
+
# multi-file: force the FASTA and annotations to one convention
|
|
94
|
+
alias-mapper convert --fasta <ref> [<ann> ...] --overwrite-to <convention> --out-dir <dir>
|
|
95
|
+
|
|
96
|
+
alias-mapper update
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### Subcommands
|
|
100
|
+
|
|
101
|
+
- **`convert`** — translate a single file, or a reference FASTA plus
|
|
102
|
+
its annotation files (multi-file mode; see [Multi-file mode](#multi-file-mode)).
|
|
103
|
+
- **`update`** — re-download the latest alias data and rebuild the
|
|
104
|
+
cached database. Run manually when you want newer data.
|
|
105
|
+
|
|
106
|
+
### Supported file types
|
|
107
|
+
|
|
108
|
+
GFF (`.gff`, `.gff3`), GTF (`.gtf`), and FASTA (`.fa`, `.fasta`,
|
|
109
|
+
`.fna`). The translator is picked by file extension.
|
|
110
|
+
|
|
111
|
+
### Supported conventions
|
|
112
|
+
|
|
113
|
+
`genbank`, `refseq`, `ucsc`, `sequence-name`, `assigned-molecule`.
|
|
114
|
+
|
|
115
|
+
### Examples
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
# Translate from RefSeq to UCSC explicitly
|
|
119
|
+
alias-mapper convert annotations.gff \
|
|
120
|
+
--from refseq --to ucsc \
|
|
121
|
+
-o out.gff
|
|
122
|
+
|
|
123
|
+
# Pin the assembly when auto-detection is ambiguous
|
|
124
|
+
alias-mapper convert annotations.gff \
|
|
125
|
+
--to ucsc \
|
|
126
|
+
--assembly GCF_000001405.40 \
|
|
127
|
+
-o out.gff
|
|
128
|
+
|
|
129
|
+
# FASTA — same syntax, different file
|
|
130
|
+
alias-mapper convert reference.fa \
|
|
131
|
+
--from genbank --to sequence-name \
|
|
132
|
+
--assembly GCA_963924405.1 \
|
|
133
|
+
-o reference.renamed.fa
|
|
134
|
+
|
|
135
|
+
# Multi-file conform: rewrite the annotations to match reference.fa's
|
|
136
|
+
# own convention; reference.fa is left untouched
|
|
137
|
+
alias-mapper convert --fasta reference.fa genes.gff peaks.bed.gff \
|
|
138
|
+
--out-dir conformed/
|
|
139
|
+
|
|
140
|
+
# Multi-file overwrite: force reference.fa and its annotations to UCSC
|
|
141
|
+
alias-mapper convert --fasta reference.fa genes.gff \
|
|
142
|
+
--overwrite-to ucsc --out-dir ucsc_out/
|
|
143
|
+
|
|
144
|
+
# Refresh the cached alias data
|
|
145
|
+
alias-mapper update
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
### Multi-file mode
|
|
149
|
+
|
|
150
|
+
Pass `--fasta <ref>` to process a reference FASTA together with its
|
|
151
|
+
annotation files in one invocation. The assembly is detected once from
|
|
152
|
+
the FASTA and the alias table is loaded once for the whole batch.
|
|
153
|
+
Outputs go to `--out-dir`, named `<stem>.<convention>.<ext>` (gzip
|
|
154
|
+
preserved).
|
|
155
|
+
|
|
156
|
+
There are two modes:
|
|
157
|
+
|
|
158
|
+
- **Conform** (the default, when `--overwrite-to` is omitted): each
|
|
159
|
+
annotation is rewritten to match the FASTA's *own* convention, and
|
|
160
|
+
the FASTA is left unchanged. Use this to make a set of annotations
|
|
161
|
+
agree with a genome you already have. The FASTA is not copied into
|
|
162
|
+
the output directory, since it is unchanged.
|
|
163
|
+
- **Overwrite** (`--overwrite-to <convention>`): the FASTA and every
|
|
164
|
+
annotation are converted to the named convention.
|
|
165
|
+
|
|
166
|
+
`--to` is single-file only; in `--fasta` mode use `--overwrite-to`
|
|
167
|
+
(or omit it to conform).
|
|
168
|
+
|
|
169
|
+
### Flags (`convert`)
|
|
170
|
+
|
|
171
|
+
| Flag | Mode | Purpose |
|
|
172
|
+
| ---------------- | ----------- | ------------------------------------------------------------- |
|
|
173
|
+
| `--to` | single-file | Target naming convention (required in single-file mode) |
|
|
174
|
+
| `-o` | single-file | Output path |
|
|
175
|
+
| `--fasta` | multi-file | Reference FASTA; enables multi-file mode |
|
|
176
|
+
| `--overwrite-to` | multi-file | Force the FASTA and all annotations to this convention |
|
|
177
|
+
| `--out-dir` | multi-file | Output directory for the converted files |
|
|
178
|
+
| `--from` | both | Source convention. Auto-detected if absent (not used to conform) |
|
|
179
|
+
| `--assembly` | both | Assembly accession. Auto-detected if absent |
|
|
180
|
+
| `--alias-db` | both | Path to a specific alias SQLite database (overrides cache) |
|
|
181
|
+
|
|
182
|
+
### Auto-detection
|
|
183
|
+
|
|
184
|
+
When `--from` or `--assembly` is omitted, the tool reads up to 50
|
|
185
|
+
unique sequence names from the input and scores them against the
|
|
186
|
+
database. It commits to a result only when the top candidate has at
|
|
187
|
+
least 5 matches and beats the runner-up by 2× or more. Otherwise it
|
|
188
|
+
errors out and asks for the flag explicitly.
|
|
189
|
+
|
|
190
|
+
### Unmapped names
|
|
191
|
+
|
|
192
|
+
If a sequence name in the input isn't in the alias database, the line
|
|
193
|
+
is written to the output unchanged and counted in the unmapped total.
|
|
194
|
+
Up to five example names are printed at the end of the run so you can
|
|
195
|
+
see what didn't translate.
|
|
196
|
+
|
|
197
|
+
Before giving up on a name, the tool tries a couple of conservative
|
|
198
|
+
fallbacks: swapping a UCSC-style `vN` version separator for the `.N`
|
|
199
|
+
form (and vice versa), and stripping an `ENA|...|accession` header
|
|
200
|
+
wrapper down to the bare accession. These only run when the exact name
|
|
201
|
+
isn't found, so they never override a direct match.
|
|
202
|
+
|
|
203
|
+
## Data updates
|
|
204
|
+
|
|
205
|
+
A weekly GitHub Actions workflow rebuilds the alias dataset from
|
|
206
|
+
NCBI's published assembly summaries and publishes it as a
|
|
207
|
+
`data-YYYY-MM-DD` GitHub Release. Each release ships three artifacts:
|
|
208
|
+
|
|
209
|
+
- `aliases.tsv.gz` — the merged-row alias data the CLI consumes.
|
|
210
|
+
- `historical.tsv.gz` — dead-accession lookup with suppression dates
|
|
211
|
+
and best-effort replacements.
|
|
212
|
+
- `failures.tsv` — per-assembly collection failure log.
|
|
213
|
+
|
|
214
|
+
## More
|
|
215
|
+
|
|
216
|
+
See [`docs/design.md`](docs/design.md) for architecture, design
|
|
217
|
+
decisions, and direction.
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
# alias-mapper
|
|
2
|
+
|
|
3
|
+
Translate chromosome and scaffold names in bioinformatics files
|
|
4
|
+
between naming conventions (GenBank, RefSeq, UCSC, and others).
|
|
5
|
+
|
|
6
|
+
## What it does
|
|
7
|
+
|
|
8
|
+
Research files from different sources use different names for the same
|
|
9
|
+
sequences: `chr1`, `NC_000001.11`, `CM000663.2`, and `1` can all refer
|
|
10
|
+
to the same human chromosome. Files using different conventions can't
|
|
11
|
+
be combined without translation.
|
|
12
|
+
|
|
13
|
+
`alias-mapper` rewrites the sequence names in GFF, GTF, and FASTA
|
|
14
|
+
files from one convention to another using a precomputed alias table
|
|
15
|
+
built from NCBI assembly reports. Source convention and genome
|
|
16
|
+
assembly are auto-detected from the input by default.
|
|
17
|
+
|
|
18
|
+
## Install
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install git+https://github.com/guigolab/alias-mapper.git
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
On networks that perform TLS inspection (corporate / institutional,
|
|
25
|
+
e.g. CRG), also install the `trusted` extra so the tool uses the
|
|
26
|
+
system trust store for certificate verification:
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install "alias-mapper[trusted] @ git+https://github.com/guigolab/alias-mapper.git"
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
The first time you run `convert`, the tool downloads the latest alias
|
|
33
|
+
data (~100 MB) from GitHub Releases and builds a local SQLite database
|
|
34
|
+
in your platform cache directory:
|
|
35
|
+
|
|
36
|
+
- macOS: `~/Library/Caches/alias-mapper/aliases.db`
|
|
37
|
+
- Linux: `~/.cache/alias-mapper/aliases.db`
|
|
38
|
+
- Windows: `%LOCALAPPDATA%\alias-mapper\Cache\aliases.db`
|
|
39
|
+
|
|
40
|
+
First-run setup takes about a minute. Subsequent runs use the cached
|
|
41
|
+
database directly. If the database schema changes in a newer release,
|
|
42
|
+
the cache is rebuilt automatically.
|
|
43
|
+
|
|
44
|
+
## Quickstart
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
alias-mapper convert annotations.gff --to ucsc -o annotations.ucsc.gff
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
A summary on stderr reports how many rows were translated and how many
|
|
51
|
+
had sequence names not in the alias database (those rows are passed
|
|
52
|
+
through unchanged with a warning).
|
|
53
|
+
|
|
54
|
+
## Usage
|
|
55
|
+
|
|
56
|
+
```
|
|
57
|
+
# single file
|
|
58
|
+
alias-mapper convert <input> --to <convention> -o <output> [options]
|
|
59
|
+
|
|
60
|
+
# multi-file: conform annotations to a reference FASTA (FASTA untouched)
|
|
61
|
+
alias-mapper convert --fasta <ref> [<ann> ...] --out-dir <dir> [options]
|
|
62
|
+
|
|
63
|
+
# multi-file: force the FASTA and annotations to one convention
|
|
64
|
+
alias-mapper convert --fasta <ref> [<ann> ...] --overwrite-to <convention> --out-dir <dir>
|
|
65
|
+
|
|
66
|
+
alias-mapper update
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### Subcommands
|
|
70
|
+
|
|
71
|
+
- **`convert`** — translate a single file, or a reference FASTA plus
|
|
72
|
+
its annotation files (multi-file mode; see [Multi-file mode](#multi-file-mode)).
|
|
73
|
+
- **`update`** — re-download the latest alias data and rebuild the
|
|
74
|
+
cached database. Run manually when you want newer data.
|
|
75
|
+
|
|
76
|
+
### Supported file types
|
|
77
|
+
|
|
78
|
+
GFF (`.gff`, `.gff3`), GTF (`.gtf`), and FASTA (`.fa`, `.fasta`,
|
|
79
|
+
`.fna`). The translator is picked by file extension.
|
|
80
|
+
|
|
81
|
+
### Supported conventions
|
|
82
|
+
|
|
83
|
+
`genbank`, `refseq`, `ucsc`, `sequence-name`, `assigned-molecule`.
|
|
84
|
+
|
|
85
|
+
### Examples
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
# Translate from RefSeq to UCSC explicitly
|
|
89
|
+
alias-mapper convert annotations.gff \
|
|
90
|
+
--from refseq --to ucsc \
|
|
91
|
+
-o out.gff
|
|
92
|
+
|
|
93
|
+
# Pin the assembly when auto-detection is ambiguous
|
|
94
|
+
alias-mapper convert annotations.gff \
|
|
95
|
+
--to ucsc \
|
|
96
|
+
--assembly GCF_000001405.40 \
|
|
97
|
+
-o out.gff
|
|
98
|
+
|
|
99
|
+
# FASTA — same syntax, different file
|
|
100
|
+
alias-mapper convert reference.fa \
|
|
101
|
+
--from genbank --to sequence-name \
|
|
102
|
+
--assembly GCA_963924405.1 \
|
|
103
|
+
-o reference.renamed.fa
|
|
104
|
+
|
|
105
|
+
# Multi-file conform: rewrite the annotations to match reference.fa's
|
|
106
|
+
# own convention; reference.fa is left untouched
|
|
107
|
+
alias-mapper convert --fasta reference.fa genes.gff peaks.bed.gff \
|
|
108
|
+
--out-dir conformed/
|
|
109
|
+
|
|
110
|
+
# Multi-file overwrite: force reference.fa and its annotations to UCSC
|
|
111
|
+
alias-mapper convert --fasta reference.fa genes.gff \
|
|
112
|
+
--overwrite-to ucsc --out-dir ucsc_out/
|
|
113
|
+
|
|
114
|
+
# Refresh the cached alias data
|
|
115
|
+
alias-mapper update
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### Multi-file mode
|
|
119
|
+
|
|
120
|
+
Pass `--fasta <ref>` to process a reference FASTA together with its
|
|
121
|
+
annotation files in one invocation. The assembly is detected once from
|
|
122
|
+
the FASTA and the alias table is loaded once for the whole batch.
|
|
123
|
+
Outputs go to `--out-dir`, named `<stem>.<convention>.<ext>` (gzip
|
|
124
|
+
preserved).
|
|
125
|
+
|
|
126
|
+
There are two modes:
|
|
127
|
+
|
|
128
|
+
- **Conform** (the default, when `--overwrite-to` is omitted): each
|
|
129
|
+
annotation is rewritten to match the FASTA's *own* convention, and
|
|
130
|
+
the FASTA is left unchanged. Use this to make a set of annotations
|
|
131
|
+
agree with a genome you already have. The FASTA is not copied into
|
|
132
|
+
the output directory, since it is unchanged.
|
|
133
|
+
- **Overwrite** (`--overwrite-to <convention>`): the FASTA and every
|
|
134
|
+
annotation are converted to the named convention.
|
|
135
|
+
|
|
136
|
+
`--to` is single-file only; in `--fasta` mode use `--overwrite-to`
|
|
137
|
+
(or omit it to conform).
|
|
138
|
+
|
|
139
|
+
### Flags (`convert`)
|
|
140
|
+
|
|
141
|
+
| Flag | Mode | Purpose |
|
|
142
|
+
| ---------------- | ----------- | ------------------------------------------------------------- |
|
|
143
|
+
| `--to` | single-file | Target naming convention (required in single-file mode) |
|
|
144
|
+
| `-o` | single-file | Output path |
|
|
145
|
+
| `--fasta` | multi-file | Reference FASTA; enables multi-file mode |
|
|
146
|
+
| `--overwrite-to` | multi-file | Force the FASTA and all annotations to this convention |
|
|
147
|
+
| `--out-dir` | multi-file | Output directory for the converted files |
|
|
148
|
+
| `--from` | both | Source convention. Auto-detected if absent (not used to conform) |
|
|
149
|
+
| `--assembly` | both | Assembly accession. Auto-detected if absent |
|
|
150
|
+
| `--alias-db` | both | Path to a specific alias SQLite database (overrides cache) |
|
|
151
|
+
|
|
152
|
+
### Auto-detection
|
|
153
|
+
|
|
154
|
+
When `--from` or `--assembly` is omitted, the tool reads up to 50
|
|
155
|
+
unique sequence names from the input and scores them against the
|
|
156
|
+
database. It commits to a result only when the top candidate has at
|
|
157
|
+
least 5 matches and beats the runner-up by 2× or more. Otherwise it
|
|
158
|
+
errors out and asks for the flag explicitly.
|
|
159
|
+
|
|
160
|
+
### Unmapped names
|
|
161
|
+
|
|
162
|
+
If a sequence name in the input isn't in the alias database, the line
|
|
163
|
+
is written to the output unchanged and counted in the unmapped total.
|
|
164
|
+
Up to five example names are printed at the end of the run so you can
|
|
165
|
+
see what didn't translate.
|
|
166
|
+
|
|
167
|
+
Before giving up on a name, the tool tries a couple of conservative
|
|
168
|
+
fallbacks: swapping a UCSC-style `vN` version separator for the `.N`
|
|
169
|
+
form (and vice versa), and stripping an `ENA|...|accession` header
|
|
170
|
+
wrapper down to the bare accession. These only run when the exact name
|
|
171
|
+
isn't found, so they never override a direct match.
|
|
172
|
+
|
|
173
|
+
## Data updates
|
|
174
|
+
|
|
175
|
+
A weekly GitHub Actions workflow rebuilds the alias dataset from
|
|
176
|
+
NCBI's published assembly summaries and publishes it as a
|
|
177
|
+
`data-YYYY-MM-DD` GitHub Release. Each release ships three artifacts:
|
|
178
|
+
|
|
179
|
+
- `aliases.tsv.gz` — the merged-row alias data the CLI consumes.
|
|
180
|
+
- `historical.tsv.gz` — dead-accession lookup with suppression dates
|
|
181
|
+
and best-effort replacements.
|
|
182
|
+
- `failures.tsv` — per-assembly collection failure log.
|
|
183
|
+
|
|
184
|
+
## More
|
|
185
|
+
|
|
186
|
+
See [`docs/design.md`](docs/design.md) for architecture, design
|
|
187
|
+
decisions, and direction.
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "alias-mapper"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "Translate chromosome/scaffold names in bioinformatics files between naming conventions"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Max Reese" },
|
|
14
|
+
]
|
|
15
|
+
keywords = ["bioinformatics", "genomics", "gff", "fasta", "naming-conventions", "ncbi"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 5 - Production/Stable",
|
|
18
|
+
"Intended Audience :: Science/Research",
|
|
19
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
20
|
+
"License :: OSI Approved :: MIT License",
|
|
21
|
+
"Programming Language :: Python :: 3",
|
|
22
|
+
"Programming Language :: Python :: 3.10",
|
|
23
|
+
"Programming Language :: Python :: 3.11",
|
|
24
|
+
"Programming Language :: Python :: 3.12",
|
|
25
|
+
"Programming Language :: Python :: 3.13",
|
|
26
|
+
"Operating System :: OS Independent",
|
|
27
|
+
]
|
|
28
|
+
dependencies = [
|
|
29
|
+
"platformdirs>=4.0",
|
|
30
|
+
"certifi",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.optional-dependencies]
|
|
34
|
+
# truststore makes the package use the system keychain for TLS
|
|
35
|
+
# verification, which is necessary on networks that do TLS inspection
|
|
36
|
+
# (e.g. corporate / institutional networks like CRG's). Harmless
|
|
37
|
+
# elsewhere. Install with: pip install "alias-mapper[trusted]"
|
|
38
|
+
trusted = [
|
|
39
|
+
"truststore",
|
|
40
|
+
]
|
|
41
|
+
# Test dependencies. Install with: pip install -e .[test]
|
|
42
|
+
test = [
|
|
43
|
+
"pytest>=7",
|
|
44
|
+
]
|
|
45
|
+
|
|
46
|
+
[project.urls]
|
|
47
|
+
Homepage = "https://github.com/guigolab/alias-mapper"
|
|
48
|
+
Issues = "https://github.com/guigolab/alias-mapper/issues"
|
|
49
|
+
|
|
50
|
+
[project.scripts]
|
|
51
|
+
alias-mapper = "alias_mapper.cli:main"
|
|
52
|
+
|
|
53
|
+
# Single-source the version from the package so a release is one number to
|
|
54
|
+
# bump (src/alias_mapper/__init__.py). setuptools reads __version__ without
|
|
55
|
+
# importing the package (when it is a plain literal assignment), so this stays cheap and import-safe.
|
|
56
|
+
[tool.setuptools.dynamic]
|
|
57
|
+
version = { attr = "alias_mapper.__version__" }
|
|
58
|
+
|
|
59
|
+
[tool.setuptools.packages.find]
|
|
60
|
+
where = ["src"]
|
|
61
|
+
include = ["alias_mapper*"]
|
|
62
|
+
|
|
63
|
+
[tool.pytest.ini_options]
|
|
64
|
+
testpaths = ["tests"]
|
|
65
|
+
# Put src/ on sys.path so `pytest` finds the package without an install
|
|
66
|
+
# (CI does `pip install -e .[test]`, but this keeps a bare `pytest` working
|
|
67
|
+
# locally too). pythonpath requires pytest >= 7.
|
|
68
|
+
pythonpath = ["src"]
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""
|
|
2
|
+
_ssl.py
|
|
3
|
+
-------
|
|
4
|
+
Shared SSL context setup for the installed alias-mapper package.
|
|
5
|
+
|
|
6
|
+
Mirrors scripts/_http.py's setup, but lives inside the package so
|
|
7
|
+
bootstrap.py and any future HTTP-using module (e.g. HttpAliasSource)
|
|
8
|
+
can import it without depending on scripts/.
|
|
9
|
+
|
|
10
|
+
Order of preference: truststore > certifi > stdlib defaults.
|
|
11
|
+
|
|
12
|
+
- truststore: uses the system keychain (necessary on networks with
|
|
13
|
+
TLS inspection like CRG's, which inject a non-Mozilla root cert)
|
|
14
|
+
- certifi: Mozilla's CA bundle, covers most environments including
|
|
15
|
+
GitHub Actions runners
|
|
16
|
+
- stdlib: last fallback, used if neither package can be imported
|
|
17
|
+
|
|
18
|
+
certifi is a declared dependency and truststore is an optional extra. The package will
|
|
19
|
+
work without them on any network where the system already trusts the
|
|
20
|
+
NCBI/GitHub cert chains.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
import ssl
|
|
24
|
+
|
|
25
|
+
# Stage 1: if truststore is importable, inject it into the ssl module and
# record it as the active backend. Otherwise leave SSL_BACKEND as None so
# that stage 2 below can claim it.
try:
|
|
26
|
+
import truststore
|
|
27
|
+
truststore.inject_into_ssl()
|
|
28
|
+
SSL_BACKEND = "truststore"
|
|
29
|
+
except ImportError:
|
|
30
|
+
SSL_BACKEND = None
|
|
31
|
+
|
|
32
|
+
# Stage 2: build the shared SSL_CONTEXT. Use certifi's CA bundle when certifi
# is importable, else the stdlib default context. SSL_BACKEND is only set
# here when stage 1 did not already set it to "truststore".
try:
|
|
33
|
+
import certifi
|
|
34
|
+
SSL_CONTEXT = ssl.create_default_context(cafile=certifi.where())
|
|
35
|
+
if SSL_BACKEND is None:
|
|
36
|
+
SSL_BACKEND = "certifi"
|
|
37
|
+
except ImportError:
|
|
38
|
+
SSL_CONTEXT = ssl.create_default_context()
|
|
39
|
+
if SSL_BACKEND is None:
|
|
40
|
+
SSL_BACKEND = "stdlib"
|