gene-tidy 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gene_tidy-0.1.0/LICENSE +21 -0
- gene_tidy-0.1.0/PKG-INFO +292 -0
- gene_tidy-0.1.0/README.md +261 -0
- gene_tidy-0.1.0/pyproject.toml +56 -0
- gene_tidy-0.1.0/setup.cfg +4 -0
- gene_tidy-0.1.0/src/gene_tidy/__init__.py +31 -0
- gene_tidy-0.1.0/src/gene_tidy/cli.py +98 -0
- gene_tidy-0.1.0/src/gene_tidy/columns.py +129 -0
- gene_tidy-0.1.0/src/gene_tidy/data/hgnc_complete_set.tsv.gz +0 -0
- gene_tidy-0.1.0/src/gene_tidy/data/hgnc_version.json +26 -0
- gene_tidy-0.1.0/src/gene_tidy/detect.py +92 -0
- gene_tidy-0.1.0/src/gene_tidy/examples/__init__.py +54 -0
- gene_tidy-0.1.0/src/gene_tidy/examples/messy_example.xlsx +0 -0
- gene_tidy-0.1.0/src/gene_tidy/excel_fix.py +123 -0
- gene_tidy-0.1.0/src/gene_tidy/hgnc.py +243 -0
- gene_tidy-0.1.0/src/gene_tidy/io.py +132 -0
- gene_tidy-0.1.0/src/gene_tidy/pipeline.py +212 -0
- gene_tidy-0.1.0/src/gene_tidy/resolver.py +302 -0
- gene_tidy-0.1.0/src/gene_tidy.egg-info/PKG-INFO +292 -0
- gene_tidy-0.1.0/src/gene_tidy.egg-info/SOURCES.txt +31 -0
- gene_tidy-0.1.0/src/gene_tidy.egg-info/dependency_links.txt +1 -0
- gene_tidy-0.1.0/src/gene_tidy.egg-info/entry_points.txt +2 -0
- gene_tidy-0.1.0/src/gene_tidy.egg-info/requires.txt +11 -0
- gene_tidy-0.1.0/src/gene_tidy.egg-info/top_level.txt +1 -0
- gene_tidy-0.1.0/tests/test_cli.py +58 -0
- gene_tidy-0.1.0/tests/test_columns.py +56 -0
- gene_tidy-0.1.0/tests/test_data_boundary.py +99 -0
- gene_tidy-0.1.0/tests/test_detect.py +63 -0
- gene_tidy-0.1.0/tests/test_excel_fix.py +56 -0
- gene_tidy-0.1.0/tests/test_golden.py +64 -0
- gene_tidy-0.1.0/tests/test_io.py +146 -0
- gene_tidy-0.1.0/tests/test_paths_and_packaging.py +77 -0
- gene_tidy-0.1.0/tests/test_resolver.py +130 -0
gene_tidy-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 gene-tidy contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
gene_tidy-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gene-tidy
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: No-code cleaning of messy gene/protein identifier tables, fully offline, with explicit ambiguity handling.
|
|
5
|
+
Author: gene-tidy contributors
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/MargoSolo/gene-tidy
|
|
8
|
+
Project-URL: Issues, https://github.com/MargoSolo/gene-tidy/issues
|
|
9
|
+
Keywords: bioinformatics,genomics,HGNC,gene symbols,identifier mapping,data cleaning
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: pandas>=1.5
|
|
22
|
+
Requires-Dist: openpyxl>=3.0
|
|
23
|
+
Requires-Dist: typer>=0.9
|
|
24
|
+
Provides-Extra: test
|
|
25
|
+
Requires-Dist: pytest>=7.0; extra == "test"
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
28
|
+
Requires-Dist: build>=1.0; extra == "dev"
|
|
29
|
+
Requires-Dist: twine>=4.0; extra == "dev"
|
|
30
|
+
Dynamic: license-file
|
|
31
|
+
|
|
32
|
+
# gene-tidy
|
|
33
|
+
|
|
34
|
+
[PyPI](https://pypi.org/project/gene-tidy/)
|
|
35
|
+
[Python 3.10+](https://www.python.org/)
|
|
36
|
+
[License: MIT](LICENSE)
|
|
37
|
+
[Open in Colab](https://colab.research.google.com/github/MargoSolo/gene-tidy/blob/main/notebooks/gene_tidy_colab.ipynb)
|
|
38
|
+
|
|
39
|
+
<!-- No CI badge until a real GitHub Actions workflow exists. -->
|
|
40
|
+
|
|
41
|
+
**Clean messy gene/protein identifier tables — fully offline, fully audited, no code required.**
|
|
42
|
+
|
|
43
|
+
Drop in a TXT/CSV/XLSX from a paper, a supplementary file, or a lab Excel sheet,
|
|
44
|
+
and get back a clean, multi-ID, fully-traceable table. Every value is mapped to
|
|
45
|
+
the current HGNC approved symbol plus Ensembl / UniProt / Entrez / RefSeq
|
|
46
|
+
cross-references — and nothing is ever guessed silently or dropped.
|
|
47
|
+
|
|
48
|
+
Inspired by [HGNChelper](https://cran.r-project.org/package=HGNChelper) (R), but
|
|
49
|
+
in Python, mapping to all major IDs, with **explicit ambiguity handling** and
|
|
50
|
+
**Excel date-corruption recovery** (Excel silently rewrites `SEPT2` as `2-Sep` and `MARCH1` as `1-Mar`).
|
|
51
|
+
|
|
52
|
+

|
|
53
|
+
|
|
54
|
+
<!-- TODO: replace docs/demo.gif with a real screencast of the CLI / Colab run. -->
|
|
55
|
+
|
|
56
|
+
## Scope
|
|
57
|
+
|
|
58
|
+
gene-tidy is **HGNC-centered, offline, and reproducible**: it standardises human
|
|
59
|
+
gene/protein identifiers against a bundled static HGNC complete set and records
|
|
60
|
+
exactly which version it used. It is **not** a full
|
|
61
|
+
[BioMart](https://www.ensembl.org/biomart/) / [VEP](https://www.ensembl.org/vep)
|
|
62
|
+
/ [UniProt](https://www.uniprot.org/) mapping service — it does no live lookups,
|
|
63
|
+
no transcript/variant annotation, and no cross-database ID expansion beyond the
|
|
64
|
+
gene-level cross-references HGNC itself provides. If you need exhaustive,
|
|
65
|
+
always-current, multi-database mapping, use those tools; if you need a fast,
|
|
66
|
+
offline, auditable HGNC cleanup you can cite in a methods section, use gene-tidy.
|
|
67
|
+
|
|
68
|
+
## Why
|
|
69
|
+
|
|
70
|
+
- **Works offline, out of the box.** The full [HGNC](https://www.genenames.org/)
|
|
71
|
+
complete set (all Approved gene records) ships inside the package as a gzipped
|
|
72
|
+
TSV. No network, no API keys, no surprises — and the exact HGNC version is
|
|
73
|
+
recorded in every run.
|
|
74
|
+
- **Never guesses silently.** One-to-many or uncertain mappings are flagged
|
|
75
|
+
`ambiguous` / `manual_review_required` and routed to a separate file.
|
|
76
|
+
- **Never drops a row.** Clean, ambiguous, and failed rows are all accounted for.
|
|
77
|
+
- **Reproducible.** Every run emits a `methods_text.txt` paragraph (tool version,
|
|
78
|
+
HGNC version + date) ready to paste into a supplementary methods section.
|
|
79
|
+
|
|
80
|
+
## Install
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
pip install gene-tidy
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
Requires Python 3.10+. Dependencies: `pandas`, `openpyxl`, `typer`.
|
|
87
|
+
|
|
88
|
+
To install the latest development version directly from GitHub:
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
pip install git+https://github.com/MargoSolo/gene-tidy.git
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
From source (recommended for development):
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
git clone https://github.com/MargoSolo/gene-tidy
|
|
98
|
+
cd gene-tidy
|
|
99
|
+
pip install -e .
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
## Quickstart (CLI)
|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
gene-tidy input.xlsx --out outputs/
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
That's it. `outputs/` will contain six files (see below). Works the same on
|
|
109
|
+
`.txt`, `.csv`, `.tsv`, and `.xlsx`:
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
gene-tidy my_genes.txt --out outputs/
|
|
113
|
+
gene-tidy supp_table.csv --out outputs/
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Useful flags:
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
gene-tidy data.xlsx -o out/ --column gene_symbol # force the identifier column
|
|
120
|
+
gene-tidy data.csv -o out/ --column symbol -c ensembl_id # multiple columns
|
|
121
|
+
gene-tidy data.xlsx -o out/ --hgnc-file hgnc_complete_set.txt # use a different / newer HGNC release
|
|
122
|
+
gene-tidy --version # tool + HGNC dump version
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
## Quickstart (Python)
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
from gene_tidy import tidy_file, tidy_values
|
|
129
|
+
|
|
130
|
+
# Whole file -> writes the six output files, returns a result object.
|
|
131
|
+
result = tidy_file("supp_table.xlsx", "outputs/")
|
|
132
|
+
print(result.counts) # {'total': 21, 'clean': 16, 'ambiguous': 3, 'failed': 2}
|
|
133
|
+
|
|
134
|
+
# Or clean an in-memory list of identifiers (no files written):
|
|
135
|
+
result = tidy_values(["TP53", "p53", "Sep-7", "ENSG00000141510", "1-Mar", "FOOBAR1"])
|
|
136
|
+
print(result.audit[["input_value", "approved_symbol", "match_status"]])
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
```text
|
|
140
|
+
input_value approved_symbol match_status
|
|
141
|
+
0 TP53 TP53 matched
|
|
142
|
+
1 p53 TP53 matched_alias
|
|
143
|
+
2 Sep-7 SEPTIN7 recovered_excel
|
|
144
|
+
3 ENSG00000141510 TP53 matched
|
|
145
|
+
4 1-Mar MARCHF1;MTARC1 ambiguous
|
|
146
|
+
5 FOOBAR1 unmatched
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
`1-Mar` (ambiguous between `MARCHF1` and `MTARC1`) lands in `result.ambiguous`;
|
|
150
|
+
`FOOBAR1` lands in `result.failed`. Nothing is dropped.
|
|
151
|
+
|
|
152
|
+
## What it handles
|
|
153
|
+
|
|
154
|
+
| Input | Example | Result |
|
|
155
|
+
|---|---|---|
|
|
156
|
+
| Approved symbol | `TP53` | `matched` → TP53 |
|
|
157
|
+
| Alias symbol | `p53`, `HER2` | `matched_alias` (warns "resolved from alias") |
|
|
158
|
+
| Previous symbol | `FRAP1`, `VEGF` | `matched_prev` (warns "resolved from previous symbol") |
|
|
159
|
+
| Ensembl gene | `ENSG00000141510` | `matched` → TP53 |
|
|
160
|
+
| UniProt | `P38398` | `matched` → BRCA1 |
|
|
161
|
+
| Entrez | `672` | `matched` → BRCA1 |
|
|
162
|
+
| RefSeq | `NM_000546` | `matched` → TP53 |
|
|
163
|
+
| HGNC ID | `HGNC:11998` | `matched` → TP53 |
|
|
164
|
+
| **Excel date corruption** | `Sep-7` | `recovered_excel` → SEPTIN7 (always warns) |
|
|
165
|
+
| **Ambiguous corruption** | `1-Mar`, `2-Sep`, `1-Dec` | `ambiguous` → e.g. MARCHF1/MTARC1, SEPTIN2/SEPTIN6 → manual review |
|
|
166
|
+
| Multiple IDs per cell | `KRAS, NRAS` | split and resolved independently |
|
|
167
|
+
| Case / whitespace | ` tp53 ` | normalised → TP53 |
|
|
168
|
+
| Duplicates | `TP53` ×2 | kept, flagged in `warning` |
|
|
169
|
+
| No match | `FOOBAR1` | `unmatched` → `failed_rows.csv` |
|
|
170
|
+
|
|
171
|
+
## Output files
|
|
172
|
+
|
|
173
|
+
Every run writes six files to `--out`:
|
|
174
|
+
|
|
175
|
+
| File | Contents |
|
|
176
|
+
|---|---|
|
|
177
|
+
| `clean_table.xlsx` / `clean_table.csv` | confidently resolved rows |
|
|
178
|
+
| `ambiguous_rows.csv` | one-to-many / uncertain rows needing manual review |
|
|
179
|
+
| `failed_rows.csv` | unmatched and empty rows |
|
|
180
|
+
| `mapping_audit.csv` | **every** input → output, with full provenance (see below) |
|
|
181
|
+
| `methods_text.txt` | paste-ready methods paragraph (tool + HGNC version/date) |
|
|
182
|
+
|
|
183
|
+
### Columns (required schema)
|
|
184
|
+
|
|
185
|
+
`input_value`, `detected_type`, `approved_symbol`, `hgnc_id`,
|
|
186
|
+
`ensembl_gene_id`, `uniprot_id`, `entrez_id`, `refseq_id`, `match_status`,
|
|
187
|
+
`warning`, `source_used`, `manual_review_required`
|
|
188
|
+
(plus `source_row` / `source_column` for traceability back to the original table).
|
|
189
|
+
|
|
190
|
+
`match_status` is one of: `matched`, `matched_alias`, `matched_prev`,
|
|
191
|
+
`recovered_excel` (→ clean) · `ambiguous` (→ review) · `unmatched`, `empty`
|
|
192
|
+
(→ failed).
|
|
193
|
+
|
|
194
|
+
Every table also carries per-row provenance — `matched_field` (which HGNC field
|
|
195
|
+
matched: `symbol` / `alias_symbol` / `prev_symbol` / `ensembl_gene_id` /
|
|
196
|
+
`uniprot_ids` / `entrez_id` / `refseq_accession` / `hgnc_id` / `excel_recovery`),
|
|
197
|
+
`match_reason` (human-readable), and `candidate_count` (1 for a clean hit, N for
|
|
198
|
+
ambiguous, 0 for no match). `mapping_audit.csv` additionally records
|
|
199
|
+
`hgnc_dump_date` and `gene_tidy_version` on every row for full reproducibility.
|
|
200
|
+
|
|
201
|
+
## Source of truth & offline guarantee
|
|
202
|
+
|
|
203
|
+
Resolution runs against a **static, bundled HGNC complete set** —
|
|
204
|
+
`src/gene_tidy/data/hgnc_complete_set.tsv.gz`, containing all ~45,000 Approved
|
|
205
|
+
HGNC gene records — matched against the approved symbol, `alias_symbol`, and
|
|
206
|
+
`prev_symbol` fields. The accompanying `hgnc_version.json` records the source
|
|
207
|
+
URL, HGNC license (CC0), download date, release tag, and record count; the same
|
|
208
|
+
provenance is printed by `gene-tidy --version`, written into every
|
|
209
|
+
`mapping_audit.csv` row, and summarised in `methods_text.txt`.
|
|
210
|
+
|
|
211
|
+
To use a **different / newer** HGNC release, pass
|
|
212
|
+
`--hgnc-file path/to/hgnc_complete_set.txt`, set the `GENE_TIDY_HGNC_FILE`
|
|
213
|
+
environment variable, or regenerate the bundled dump with
|
|
214
|
+
`python tools/build_hgnc_data.py hgnc_complete_set.txt`. A user-supplied file is
|
|
215
|
+
filtered to `status == Approved` automatically.
|
|
216
|
+
|
|
217
|
+
The package and its **tests never require network access.** (The test suite
|
|
218
|
+
resolves against a tiny curated fixture in `tests/fixtures/` for speed; the real
|
|
219
|
+
bundled dump is exercised separately in `tests/test_data_boundary.py`.)
|
|
220
|
+
|
|
221
|
+
> Real-world note: because the bundled data is the *real* HGNC set, genuine
|
|
222
|
+
> one-to-many cases surface honestly. For example `SEPT2` is a previous symbol of
|
|
223
|
+
> `SEPTIN2` **and** an alias of `SEPTIN6`, so gene-tidy reports it `ambiguous`
|
|
224
|
+
> rather than guessing.
|
|
225
|
+
|
|
226
|
+
## Colab notebook
|
|
227
|
+
|
|
228
|
+
Zero-setup, in-browser: upload a file → run → preview clean/failed/ambiguous
|
|
229
|
+
rows → download a ZIP of all outputs. A bundled `messy_example.xlsx` lets you
|
|
230
|
+
click **Run** and see results immediately.
|
|
231
|
+
|
|
232
|
+
[`notebooks/gene_tidy_colab.ipynb`](notebooks/gene_tidy_colab.ipynb)
|
|
233
|
+
<!-- TODO: add an "Open in Colab" badge pointing at the hosted repo path. -->
|
|
234
|
+
|
|
235
|
+
## Limitations (v0.1)
|
|
236
|
+
|
|
237
|
+
- Ensembl **transcript/protein** IDs (`ENST…`/`ENSP…`) are detected but not
|
|
238
|
+
resolved offline (gene-level dump only); they are flagged for manual review.
|
|
239
|
+
- Numeric Excel date *serials* (e.g. `44075`) are indistinguishable from Entrez
|
|
240
|
+
IDs and are intentionally **not** reinterpreted.
|
|
241
|
+
- Human only. No HGVS / ClinVar / VEP / gnomAD / liftover / genome-build
|
|
242
|
+
detection / clinical interpretation (out of scope for v0.1).
|
|
243
|
+
|
|
244
|
+
## Development
|
|
245
|
+
|
|
246
|
+
```bash
|
|
247
|
+
pip install -e ".[dev]" # installs pytest, build, and twine
|
|
248
|
+
pytest # 116 tests, all offline
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
Test coverage: ID-type detection, column detection, resolver (alias / prev /
|
|
252
|
+
Excel-corruption / ambiguity), input/output file handling, CLI, golden-output
|
|
253
|
+
regression on the bundled example, and a data-boundary test that loads the real
|
|
254
|
+
bundled HGNC complete set. Most tests use a small curated fixture
|
|
255
|
+
(`tests/fixtures/hgnc_subset.tsv`) so the suite runs in seconds.
|
|
256
|
+
|
|
257
|
+
To refresh the bundled HGNC data (deterministic: the same input always produces
|
|
258
|
+
a byte-identical `.tsv.gz`, and the run records `raw_download_sha256` +
|
|
259
|
+
`bundled_tsv_gz_sha256` in `hgnc_version.json`):
|
|
260
|
+
|
|
261
|
+
```bash
|
|
262
|
+
python tools/build_hgnc_data.py path/to/hgnc_complete_set.txt # from a pinned file
|
|
263
|
+
python tools/build_hgnc_data.py --download # or fetch current
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
## Attribution & citing HGNC
|
|
267
|
+
|
|
268
|
+
gene-tidy resolves identifiers using data from the **HUGO Gene Nomenclature
|
|
269
|
+
Committee (HGNC)**.
|
|
270
|
+
|
|
271
|
+
- **Source:** HGNC complete set (`hgnc_complete_set.txt`) from the HGNC download archive. The exact source URL, snapshot date, and SHA-256 hashes are recorded in `src/gene_tidy/data/hgnc_version.json`.
|
|
272
|
+
- **Snapshot bundled in this release:** see `downloaded_date` and
|
|
273
|
+
`bundled_tsv_gz_sha256` in
|
|
274
|
+
[`src/gene_tidy/data/hgnc_version.json`](src/gene_tidy/data/hgnc_version.json)
|
|
275
|
+
(also printed by `gene-tidy --version` and written into every
|
|
276
|
+
`mapping_audit.csv` / `methods_text.txt`).
|
|
277
|
+
- **License:** HGNC data are released under a
|
|
278
|
+
[CC0 1.0 public-domain dedication](https://www.genenames.org/about/license/),
|
|
279
|
+
so they are free to redistribute; gene-tidy bundles a column-trimmed,
|
|
280
|
+
Approved-only snapshot.
|
|
281
|
+
- **Recommendation:** in your own methods/supplementary text, cite HGNC and
|
|
282
|
+
state the **retrieval month/year** of the dump you used (e.g. *"HGNC complete
|
|
283
|
+
set, retrieved June 2026, via gene-tidy v0.1.0"*). The exact date and hash are
|
|
284
|
+
in `hgnc_version.json` and the generated `methods_text.txt`.
|
|
285
|
+
|
|
286
|
+
Please cite HGNC: Seal RL, *et al.* *Genenames.org: the HGNC resources in 2023.*
|
|
287
|
+
Nucleic Acids Res. 2023;51(D1):D1003–D1009.
|
|
288
|
+
|
|
289
|
+
## License
|
|
290
|
+
|
|
291
|
+
gene-tidy itself is MIT — see [LICENSE](LICENSE). The bundled HGNC data is CC0
|
|
292
|
+
(see Attribution above).
|
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
# gene-tidy
|
|
2
|
+
|
|
3
|
+
[PyPI](https://pypi.org/project/gene-tidy/)
|
|
4
|
+
[Python 3.10+](https://www.python.org/)
|
|
5
|
+
[License: MIT](LICENSE)
|
|
6
|
+
[Open in Colab](https://colab.research.google.com/github/MargoSolo/gene-tidy/blob/main/notebooks/gene_tidy_colab.ipynb)
|
|
7
|
+
|
|
8
|
+
<!-- No CI badge until a real GitHub Actions workflow exists. -->
|
|
9
|
+
|
|
10
|
+
**Clean messy gene/protein identifier tables — fully offline, fully audited, no code required.**
|
|
11
|
+
|
|
12
|
+
Drop in a TXT/CSV/XLSX from a paper, a supplementary file, or a lab Excel sheet,
|
|
13
|
+
and get back a clean, multi-ID, fully-traceable table. Every value is mapped to
|
|
14
|
+
the current HGNC approved symbol plus Ensembl / UniProt / Entrez / RefSeq
|
|
15
|
+
cross-references — and nothing is ever guessed silently or dropped.
|
|
16
|
+
|
|
17
|
+
Inspired by [HGNChelper](https://cran.r-project.org/package=HGNChelper) (R), but
|
|
18
|
+
in Python, mapping to all major IDs, with **explicit ambiguity handling** and
|
|
19
|
+
**Excel date-corruption recovery** (Excel silently rewrites `SEPT2` as `2-Sep` and `MARCH1` as `1-Mar`).
|
|
20
|
+
|
|
21
|
+

|
|
22
|
+
|
|
23
|
+
<!-- TODO: replace docs/demo.gif with a real screencast of the CLI / Colab run. -->
|
|
24
|
+
|
|
25
|
+
## Scope
|
|
26
|
+
|
|
27
|
+
gene-tidy is **HGNC-centered, offline, and reproducible**: it standardises human
|
|
28
|
+
gene/protein identifiers against a bundled static HGNC complete set and records
|
|
29
|
+
exactly which version it used. It is **not** a full
|
|
30
|
+
[BioMart](https://www.ensembl.org/biomart/) / [VEP](https://www.ensembl.org/vep)
|
|
31
|
+
/ [UniProt](https://www.uniprot.org/) mapping service — it does no live lookups,
|
|
32
|
+
no transcript/variant annotation, and no cross-database ID expansion beyond the
|
|
33
|
+
gene-level cross-references HGNC itself provides. If you need exhaustive,
|
|
34
|
+
always-current, multi-database mapping, use those tools; if you need a fast,
|
|
35
|
+
offline, auditable HGNC cleanup you can cite in a methods section, use gene-tidy.
|
|
36
|
+
|
|
37
|
+
## Why
|
|
38
|
+
|
|
39
|
+
- **Works offline, out of the box.** The full [HGNC](https://www.genenames.org/)
|
|
40
|
+
complete set (all Approved gene records) ships inside the package as a gzipped
|
|
41
|
+
TSV. No network, no API keys, no surprises — and the exact HGNC version is
|
|
42
|
+
recorded in every run.
|
|
43
|
+
- **Never guesses silently.** One-to-many or uncertain mappings are flagged
|
|
44
|
+
`ambiguous` / `manual_review_required` and routed to a separate file.
|
|
45
|
+
- **Never drops a row.** Clean, ambiguous, and failed rows are all accounted for.
|
|
46
|
+
- **Reproducible.** Every run emits a `methods_text.txt` paragraph (tool version,
|
|
47
|
+
HGNC version + date) ready to paste into a supplementary methods section.
|
|
48
|
+
|
|
49
|
+
## Install
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
pip install gene-tidy
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
Requires Python 3.10+. Dependencies: `pandas`, `openpyxl`, `typer`.
|
|
56
|
+
|
|
57
|
+
To install the latest development version directly from GitHub:
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pip install git+https://github.com/MargoSolo/gene-tidy.git
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
From source (recommended for development):
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
git clone https://github.com/MargoSolo/gene-tidy
|
|
67
|
+
cd gene-tidy
|
|
68
|
+
pip install -e .
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## Quickstart (CLI)
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
gene-tidy input.xlsx --out outputs/
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
That's it. `outputs/` will contain six files (see below). Works the same on
|
|
78
|
+
`.txt`, `.csv`, `.tsv`, and `.xlsx`:
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
gene-tidy my_genes.txt --out outputs/
|
|
82
|
+
gene-tidy supp_table.csv --out outputs/
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
Useful flags:
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
gene-tidy data.xlsx -o out/ --column gene_symbol # force the identifier column
|
|
89
|
+
gene-tidy data.csv -o out/ --column symbol -c ensembl_id # multiple columns
|
|
90
|
+
gene-tidy data.xlsx -o out/ --hgnc-file hgnc_complete_set.txt # use a different / newer HGNC release
|
|
91
|
+
gene-tidy --version # tool + HGNC dump version
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Quickstart (Python)
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
from gene_tidy import tidy_file, tidy_values
|
|
98
|
+
|
|
99
|
+
# Whole file -> writes the six output files, returns a result object.
|
|
100
|
+
result = tidy_file("supp_table.xlsx", "outputs/")
|
|
101
|
+
print(result.counts) # {'total': 21, 'clean': 16, 'ambiguous': 3, 'failed': 2}
|
|
102
|
+
|
|
103
|
+
# Or clean an in-memory list of identifiers (no files written):
|
|
104
|
+
result = tidy_values(["TP53", "p53", "Sep-7", "ENSG00000141510", "1-Mar", "FOOBAR1"])
|
|
105
|
+
print(result.audit[["input_value", "approved_symbol", "match_status"]])
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
```text
|
|
109
|
+
input_value approved_symbol match_status
|
|
110
|
+
0 TP53 TP53 matched
|
|
111
|
+
1 p53 TP53 matched_alias
|
|
112
|
+
2 Sep-7 SEPTIN7 recovered_excel
|
|
113
|
+
3 ENSG00000141510 TP53 matched
|
|
114
|
+
4 1-Mar MARCHF1;MTARC1 ambiguous
|
|
115
|
+
5 FOOBAR1 unmatched
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
`1-Mar` (ambiguous between `MARCHF1` and `MTARC1`) lands in `result.ambiguous`;
|
|
119
|
+
`FOOBAR1` lands in `result.failed`. Nothing is dropped.
|
|
120
|
+
|
|
121
|
+
## What it handles
|
|
122
|
+
|
|
123
|
+
| Input | Example | Result |
|
|
124
|
+
|---|---|---|
|
|
125
|
+
| Approved symbol | `TP53` | `matched` → TP53 |
|
|
126
|
+
| Alias symbol | `p53`, `HER2` | `matched_alias` (warns "resolved from alias") |
|
|
127
|
+
| Previous symbol | `FRAP1`, `VEGF` | `matched_prev` (warns "resolved from previous symbol") |
|
|
128
|
+
| Ensembl gene | `ENSG00000141510` | `matched` → TP53 |
|
|
129
|
+
| UniProt | `P38398` | `matched` → BRCA1 |
|
|
130
|
+
| Entrez | `672` | `matched` → BRCA1 |
|
|
131
|
+
| RefSeq | `NM_000546` | `matched` → TP53 |
|
|
132
|
+
| HGNC ID | `HGNC:11998` | `matched` → TP53 |
|
|
133
|
+
| **Excel date corruption** | `Sep-7` | `recovered_excel` → SEPTIN7 (always warns) |
|
|
134
|
+
| **Ambiguous corruption** | `1-Mar`, `2-Sep`, `1-Dec` | `ambiguous` → e.g. MARCHF1/MTARC1, SEPTIN2/SEPTIN6 → manual review |
|
|
135
|
+
| Multiple IDs per cell | `KRAS, NRAS` | split and resolved independently |
|
|
136
|
+
| Case / whitespace | ` tp53 ` | normalised → TP53 |
|
|
137
|
+
| Duplicates | `TP53` ×2 | kept, flagged in `warning` |
|
|
138
|
+
| No match | `FOOBAR1` | `unmatched` → `failed_rows.csv` |
|
|
139
|
+
|
|
140
|
+
## Output files
|
|
141
|
+
|
|
142
|
+
Every run writes six files to `--out`:
|
|
143
|
+
|
|
144
|
+
| File | Contents |
|
|
145
|
+
|---|---|
|
|
146
|
+
| `clean_table.xlsx` / `clean_table.csv` | confidently resolved rows |
|
|
147
|
+
| `ambiguous_rows.csv` | one-to-many / uncertain rows needing manual review |
|
|
148
|
+
| `failed_rows.csv` | unmatched and empty rows |
|
|
149
|
+
| `mapping_audit.csv` | **every** input → output, with full provenance (see below) |
|
|
150
|
+
| `methods_text.txt` | paste-ready methods paragraph (tool + HGNC version/date) |
|
|
151
|
+
|
|
152
|
+
### Columns (required schema)
|
|
153
|
+
|
|
154
|
+
`input_value`, `detected_type`, `approved_symbol`, `hgnc_id`,
|
|
155
|
+
`ensembl_gene_id`, `uniprot_id`, `entrez_id`, `refseq_id`, `match_status`,
|
|
156
|
+
`warning`, `source_used`, `manual_review_required`
|
|
157
|
+
(plus `source_row` / `source_column` for traceability back to the original table).
|
|
158
|
+
|
|
159
|
+
`match_status` is one of: `matched`, `matched_alias`, `matched_prev`,
|
|
160
|
+
`recovered_excel` (→ clean) · `ambiguous` (→ review) · `unmatched`, `empty`
|
|
161
|
+
(→ failed).
|
|
162
|
+
|
|
163
|
+
Every table also carries per-row provenance — `matched_field` (which HGNC field
|
|
164
|
+
matched: `symbol` / `alias_symbol` / `prev_symbol` / `ensembl_gene_id` /
|
|
165
|
+
`uniprot_ids` / `entrez_id` / `refseq_accession` / `hgnc_id` / `excel_recovery`),
|
|
166
|
+
`match_reason` (human-readable), and `candidate_count` (1 for a clean hit, N for
|
|
167
|
+
ambiguous, 0 for no match). `mapping_audit.csv` additionally records
|
|
168
|
+
`hgnc_dump_date` and `gene_tidy_version` on every row for full reproducibility.
|
|
169
|
+
|
|
170
|
+
## Source of truth & offline guarantee
|
|
171
|
+
|
|
172
|
+
Resolution runs against a **static, bundled HGNC complete set** —
|
|
173
|
+
`src/gene_tidy/data/hgnc_complete_set.tsv.gz`, containing all ~45,000 Approved
|
|
174
|
+
HGNC gene records — matched against the approved symbol, `alias_symbol`, and
|
|
175
|
+
`prev_symbol` fields. The accompanying `hgnc_version.json` records the source
|
|
176
|
+
URL, HGNC license (CC0), download date, release tag, and record count; the same
|
|
177
|
+
provenance is printed by `gene-tidy --version`, written into every
|
|
178
|
+
`mapping_audit.csv` row, and summarised in `methods_text.txt`.
|
|
179
|
+
|
|
180
|
+
To use a **different / newer** HGNC release, pass
|
|
181
|
+
`--hgnc-file path/to/hgnc_complete_set.txt`, set the `GENE_TIDY_HGNC_FILE`
|
|
182
|
+
environment variable, or regenerate the bundled dump with
|
|
183
|
+
`python tools/build_hgnc_data.py hgnc_complete_set.txt`. A user-supplied file is
|
|
184
|
+
filtered to `status == Approved` automatically.
|
|
185
|
+
|
|
186
|
+
The package and its **tests never require network access.** (The test suite
|
|
187
|
+
resolves against a tiny curated fixture in `tests/fixtures/` for speed; the real
|
|
188
|
+
bundled dump is exercised separately in `tests/test_data_boundary.py`.)
|
|
189
|
+
|
|
190
|
+
> Real-world note: because the bundled data is the *real* HGNC set, genuine
|
|
191
|
+
> one-to-many cases surface honestly. For example `SEPT2` is a previous symbol of
|
|
192
|
+
> `SEPTIN2` **and** an alias of `SEPTIN6`, so gene-tidy reports it `ambiguous`
|
|
193
|
+
> rather than guessing.
|
|
194
|
+
|
|
195
|
+
## Colab notebook
|
|
196
|
+
|
|
197
|
+
Zero-setup, in-browser: upload a file → run → preview clean/failed/ambiguous
|
|
198
|
+
rows → download a ZIP of all outputs. A bundled `messy_example.xlsx` lets you
|
|
199
|
+
click **Run** and see results immediately.
|
|
200
|
+
|
|
201
|
+
[`notebooks/gene_tidy_colab.ipynb`](notebooks/gene_tidy_colab.ipynb)
|
|
202
|
+
<!-- TODO: add an "Open in Colab" badge pointing at the hosted repo path. -->
|
|
203
|
+
|
|
204
|
+
## Limitations (v0.1)
|
|
205
|
+
|
|
206
|
+
- Ensembl **transcript/protein** IDs (`ENST…`/`ENSP…`) are detected but not
|
|
207
|
+
resolved offline (gene-level dump only); they are flagged for manual review.
|
|
208
|
+
- Numeric Excel date *serials* (e.g. `44075`) are indistinguishable from Entrez
|
|
209
|
+
IDs and are intentionally **not** reinterpreted.
|
|
210
|
+
- Human only. No HGVS / ClinVar / VEP / gnomAD / liftover / genome-build
|
|
211
|
+
detection / clinical interpretation (out of scope for v0.1).
|
|
212
|
+
|
|
213
|
+
## Development
|
|
214
|
+
|
|
215
|
+
```bash
|
|
216
|
+
pip install -e ".[dev]" # installs pytest, build, and twine
|
|
217
|
+
pytest # 116 tests, all offline
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
Test coverage: ID-type detection, column detection, resolver (alias / prev /
|
|
221
|
+
Excel-corruption / ambiguity), input/output file handling, CLI, golden-output
|
|
222
|
+
regression on the bundled example, and a data-boundary test that loads the real
|
|
223
|
+
bundled HGNC complete set. Most tests use a small curated fixture
|
|
224
|
+
(`tests/fixtures/hgnc_subset.tsv`) so the suite runs in seconds.
|
|
225
|
+
|
|
226
|
+
To refresh the bundled HGNC data (deterministic: the same input always produces
|
|
227
|
+
a byte-identical `.tsv.gz`, and the run records `raw_download_sha256` +
|
|
228
|
+
`bundled_tsv_gz_sha256` in `hgnc_version.json`):
|
|
229
|
+
|
|
230
|
+
```bash
|
|
231
|
+
python tools/build_hgnc_data.py path/to/hgnc_complete_set.txt # from a pinned file
|
|
232
|
+
python tools/build_hgnc_data.py --download # or fetch current
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
## Attribution & citing HGNC
|
|
236
|
+
|
|
237
|
+
gene-tidy resolves identifiers using data from the **HUGO Gene Nomenclature
|
|
238
|
+
Committee (HGNC)**.
|
|
239
|
+
|
|
240
|
+
- **Source:** HGNC complete set (`hgnc_complete_set.txt`) from the HGNC download archive. The exact source URL, snapshot date, and SHA-256 hashes are recorded in `src/gene_tidy/data/hgnc_version.json`.
|
|
241
|
+
- **Snapshot bundled in this release:** see `downloaded_date` and
|
|
242
|
+
`bundled_tsv_gz_sha256` in
|
|
243
|
+
[`src/gene_tidy/data/hgnc_version.json`](src/gene_tidy/data/hgnc_version.json)
|
|
244
|
+
(also printed by `gene-tidy --version` and written into every
|
|
245
|
+
`mapping_audit.csv` / `methods_text.txt`).
|
|
246
|
+
- **License:** HGNC data are released under a
|
|
247
|
+
[CC0 1.0 public-domain dedication](https://www.genenames.org/about/license/),
|
|
248
|
+
so they are free to redistribute; gene-tidy bundles a column-trimmed,
|
|
249
|
+
Approved-only snapshot.
|
|
250
|
+
- **Recommendation:** in your own methods/supplementary text, cite HGNC and
|
|
251
|
+
state the **retrieval month/year** of the dump you used (e.g. *"HGNC complete
|
|
252
|
+
set, retrieved June 2026, via gene-tidy v0.1.0"*). The exact date and hash are
|
|
253
|
+
in `hgnc_version.json` and the generated `methods_text.txt`.
|
|
254
|
+
|
|
255
|
+
Please cite HGNC: Seal RL, *et al.* *Genenames.org: the HGNC resources in 2023.*
|
|
256
|
+
Nucleic Acids Res. 2023;51(D1):D1003–D1009.
|
|
257
|
+
|
|
258
|
+
## License
|
|
259
|
+
|
|
260
|
+
gene-tidy itself is MIT — see [LICENSE](LICENSE). The bundled HGNC data is CC0
|
|
261
|
+
(see Attribution above).
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=64", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "gene-tidy"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "No-code cleaning of messy gene/protein identifier tables, fully offline, with explicit ambiguity handling."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "gene-tidy contributors" }]
|
|
13
|
+
keywords = [
|
|
14
|
+
"bioinformatics",
|
|
15
|
+
"genomics",
|
|
16
|
+
"HGNC",
|
|
17
|
+
"gene symbols",
|
|
18
|
+
"identifier mapping",
|
|
19
|
+
"data cleaning",
|
|
20
|
+
]
|
|
21
|
+
classifiers = [
|
|
22
|
+
"Development Status :: 4 - Beta",
|
|
23
|
+
"Intended Audience :: Science/Research",
|
|
24
|
+
"License :: OSI Approved :: MIT License",
|
|
25
|
+
"Programming Language :: Python :: 3",
|
|
26
|
+
"Programming Language :: Python :: 3.10",
|
|
27
|
+
"Programming Language :: Python :: 3.11",
|
|
28
|
+
"Programming Language :: Python :: 3.12",
|
|
29
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
30
|
+
]
|
|
31
|
+
dependencies = [
|
|
32
|
+
"pandas>=1.5",
|
|
33
|
+
"openpyxl>=3.0",
|
|
34
|
+
"typer>=0.9",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[project.optional-dependencies]
|
|
38
|
+
test = ["pytest>=7.0"]
|
|
39
|
+
dev = ["pytest>=7.0", "build>=1.0", "twine>=4.0"]
|
|
40
|
+
|
|
41
|
+
[project.urls]
|
|
42
|
+
Homepage = "https://github.com/MargoSolo/gene-tidy"
|
|
43
|
+
Issues = "https://github.com/MargoSolo/gene-tidy/issues"
|
|
44
|
+
|
|
45
|
+
[project.scripts]
|
|
46
|
+
gene-tidy = "gene_tidy.cli:app"
|
|
47
|
+
|
|
48
|
+
[tool.setuptools.packages.find]
|
|
49
|
+
where = ["src"]
|
|
50
|
+
|
|
51
|
+
[tool.setuptools.package-data]
|
|
52
|
+
gene_tidy = ["data/*.tsv.gz", "data/*.json", "examples/*.xlsx"]
|
|
53
|
+
|
|
54
|
+
[tool.pytest.ini_options]
|
|
55
|
+
testpaths = ["tests"]
|
|
56
|
+
addopts = "-q"
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""gene-tidy: clean messy gene/protein identifier tables, fully offline.
|
|
2
|
+
|
|
3
|
+
Public API
|
|
4
|
+
----------
|
|
5
|
+
- ``tidy_file``: end-to-end cleaning of a file -> output files on disk.
|
|
6
|
+
- ``tidy_dataframe`` / ``tidy_values``: in-memory cleaning (handy in notebooks).
|
|
7
|
+
- ``OUTPUT_COLUMNS``: the canonical output schema.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from .pipeline import (
|
|
11
|
+
OUTPUT_COLUMNS,
|
|
12
|
+
TidyResult,
|
|
13
|
+
tidy_dataframe,
|
|
14
|
+
tidy_file,
|
|
15
|
+
tidy_values,
|
|
16
|
+
)
|
|
17
|
+
from .hgnc import HgncData, load_hgnc, hgnc_version_info
|
|
18
|
+
|
|
19
|
+
__version__ = "0.1.0"
|
|
20
|
+
|
|
21
|
+
__all__ = [
|
|
22
|
+
"__version__",
|
|
23
|
+
"OUTPUT_COLUMNS",
|
|
24
|
+
"TidyResult",
|
|
25
|
+
"tidy_file",
|
|
26
|
+
"tidy_dataframe",
|
|
27
|
+
"tidy_values",
|
|
28
|
+
"HgncData",
|
|
29
|
+
"load_hgnc",
|
|
30
|
+
"hgnc_version_info",
|
|
31
|
+
]
|