gene-tidy 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. gene_tidy-0.1.0/LICENSE +21 -0
  2. gene_tidy-0.1.0/PKG-INFO +292 -0
  3. gene_tidy-0.1.0/README.md +261 -0
  4. gene_tidy-0.1.0/pyproject.toml +56 -0
  5. gene_tidy-0.1.0/setup.cfg +4 -0
  6. gene_tidy-0.1.0/src/gene_tidy/__init__.py +31 -0
  7. gene_tidy-0.1.0/src/gene_tidy/cli.py +98 -0
  8. gene_tidy-0.1.0/src/gene_tidy/columns.py +129 -0
  9. gene_tidy-0.1.0/src/gene_tidy/data/hgnc_complete_set.tsv.gz +0 -0
  10. gene_tidy-0.1.0/src/gene_tidy/data/hgnc_version.json +26 -0
  11. gene_tidy-0.1.0/src/gene_tidy/detect.py +92 -0
  12. gene_tidy-0.1.0/src/gene_tidy/examples/__init__.py +54 -0
  13. gene_tidy-0.1.0/src/gene_tidy/examples/messy_example.xlsx +0 -0
  14. gene_tidy-0.1.0/src/gene_tidy/excel_fix.py +123 -0
  15. gene_tidy-0.1.0/src/gene_tidy/hgnc.py +243 -0
  16. gene_tidy-0.1.0/src/gene_tidy/io.py +132 -0
  17. gene_tidy-0.1.0/src/gene_tidy/pipeline.py +212 -0
  18. gene_tidy-0.1.0/src/gene_tidy/resolver.py +302 -0
  19. gene_tidy-0.1.0/src/gene_tidy.egg-info/PKG-INFO +292 -0
  20. gene_tidy-0.1.0/src/gene_tidy.egg-info/SOURCES.txt +31 -0
  21. gene_tidy-0.1.0/src/gene_tidy.egg-info/dependency_links.txt +1 -0
  22. gene_tidy-0.1.0/src/gene_tidy.egg-info/entry_points.txt +2 -0
  23. gene_tidy-0.1.0/src/gene_tidy.egg-info/requires.txt +11 -0
  24. gene_tidy-0.1.0/src/gene_tidy.egg-info/top_level.txt +1 -0
  25. gene_tidy-0.1.0/tests/test_cli.py +58 -0
  26. gene_tidy-0.1.0/tests/test_columns.py +56 -0
  27. gene_tidy-0.1.0/tests/test_data_boundary.py +99 -0
  28. gene_tidy-0.1.0/tests/test_detect.py +63 -0
  29. gene_tidy-0.1.0/tests/test_excel_fix.py +56 -0
  30. gene_tidy-0.1.0/tests/test_golden.py +64 -0
  31. gene_tidy-0.1.0/tests/test_io.py +146 -0
  32. gene_tidy-0.1.0/tests/test_paths_and_packaging.py +77 -0
  33. gene_tidy-0.1.0/tests/test_resolver.py +130 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 gene-tidy contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,292 @@
1
+ Metadata-Version: 2.4
2
+ Name: gene-tidy
3
+ Version: 0.1.0
4
+ Summary: No-code cleaning of messy gene/protein identifier tables, fully offline, with explicit ambiguity handling.
5
+ Author: gene-tidy contributors
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/MargoSolo/gene-tidy
8
+ Project-URL: Issues, https://github.com/MargoSolo/gene-tidy/issues
9
+ Keywords: bioinformatics,genomics,HGNC,gene symbols,identifier mapping,data cleaning
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
18
+ Requires-Python: >=3.10
19
+ Description-Content-Type: text/markdown
20
+ License-File: LICENSE
21
+ Requires-Dist: pandas>=1.5
22
+ Requires-Dist: openpyxl>=3.0
23
+ Requires-Dist: typer>=0.9
24
+ Provides-Extra: test
25
+ Requires-Dist: pytest>=7.0; extra == "test"
26
+ Provides-Extra: dev
27
+ Requires-Dist: pytest>=7.0; extra == "dev"
28
+ Requires-Dist: build>=1.0; extra == "dev"
29
+ Requires-Dist: twine>=4.0; extra == "dev"
30
+ Dynamic: license-file
31
+
32
+ # gene-tidy
33
+
34
+ [![PyPI](https://img.shields.io/pypi/v/gene-tidy)](https://pypi.org/project/gene-tidy/)
35
+ [![Python](https://img.shields.io/badge/python-3.10%2B-blue)](https://www.python.org/)
36
+ [![License: MIT](https://img.shields.io/badge/license-MIT-green)](LICENSE)
37
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/MargoSolo/gene-tidy/blob/main/notebooks/gene_tidy_colab.ipynb)
38
+
39
+ <!-- No CI badge until a real GitHub Actions workflow exists. -->
40
+
41
+ **Clean messy gene/protein identifier tables — fully offline, fully audited, no code required.**
42
+
43
+ Drop in a TXT/CSV/XLSX from a paper, a supplementary file, or a lab Excel sheet,
44
+ and get back a clean, multi-ID, fully-traceable table. Every value is mapped to
45
+ the current HGNC approved symbol plus Ensembl / UniProt / Entrez / RefSeq
46
+ cross-references — and nothing is ever guessed silently or dropped.
47
+
48
+ Inspired by [HGNChelper](https://cran.r-project.org/package=HGNChelper) (R), but
49
+ in Python, mapping to all major IDs, with **explicit ambiguity handling** and
50
+ **Excel date-corruption recovery** (`SEPT2 → "2-Sep"`, `MARCH1 → "1-Mar"`).
51
+
52
+ ![gene-tidy demo](docs/demo.gif)
53
+
54
+ <!-- TODO: replace docs/demo.gif with a real screencast of the CLI / Colab run. -->
55
+
56
+ ## Scope
57
+
58
+ gene-tidy is **HGNC-centered, offline, and reproducible**: it standardises human
59
+ gene/protein identifiers against a bundled static HGNC complete set and records
60
+ exactly which version it used. It is **not** a full
61
+ [BioMart](https://www.ensembl.org/biomart/) / [VEP](https://www.ensembl.org/vep)
62
+ / [UniProt](https://www.uniprot.org/) mapping service — it does no live lookups,
63
+ no transcript/variant annotation, and no cross-database ID expansion beyond the
64
+ gene-level cross-references HGNC itself provides. If you need exhaustive,
65
+ always-current, multi-database mapping, use those tools; if you need a fast,
66
+ offline, auditable HGNC cleanup you can cite in a methods section, use gene-tidy.
67
+
68
+ ## Why
69
+
70
+ - **Works offline, out of the box.** The full [HGNC](https://www.genenames.org/)
71
+ complete set (all Approved gene records) ships inside the package as a gzipped
72
+ TSV. No network, no API keys, no surprises — and the exact HGNC version is
73
+ recorded in every run.
74
+ - **Never guesses silently.** One-to-many or uncertain mappings are flagged
75
+ `ambiguous` / `manual_review_required` and routed to a separate file.
76
+ - **Never drops a row.** Clean, ambiguous, and failed rows are all accounted for.
77
+ - **Reproducible.** Every run emits a `methods_text.txt` paragraph (tool version,
78
+ HGNC version + date) ready to paste into a supplementary methods section.
79
+
80
+ ## Install
81
+
82
+ ```bash
83
+ pip install gene-tidy
84
+ ```
85
+
86
+ Requires Python 3.10+. Dependencies: `pandas`, `openpyxl`, `typer`.
87
+
88
+ To install the latest development version directly from GitHub:
89
+
90
+ ```bash
91
+ pip install git+https://github.com/MargoSolo/gene-tidy.git
92
+ ```
93
+
94
+ From source (recommended for development):
95
+
96
+ ```bash
97
+ git clone https://github.com/MargoSolo/gene-tidy
98
+ cd gene-tidy
99
+ pip install -e .
100
+ ```
101
+
102
+ ## Quickstart (CLI)
103
+
104
+ ```bash
105
+ gene-tidy input.xlsx --out outputs/
106
+ ```
107
+
108
+ That's it. `outputs/` will contain six files (see below). Works the same on
109
+ `.txt`, `.csv`, `.tsv`, and `.xlsx`:
110
+
111
+ ```bash
112
+ gene-tidy my_genes.txt --out outputs/
113
+ gene-tidy supp_table.csv --out outputs/
114
+ ```
115
+
116
+ Useful flags:
117
+
118
+ ```bash
119
+ gene-tidy data.xlsx -o out/ --column gene_symbol # force the identifier column
120
+ gene-tidy data.csv -o out/ --column symbol -c ensembl_id # multiple columns
121
+ gene-tidy data.xlsx -o out/ --hgnc-file hgnc_complete_set.txt # use a different/newer HGNC release instead of the bundled one
122
+ gene-tidy --version # tool + HGNC dump version
123
+ ```
124
+
125
+ ## Quickstart (Python)
126
+
127
+ ```python
128
+ from gene_tidy import tidy_file, tidy_values
129
+
130
+ # Whole file -> writes the six output files, returns a result object.
131
+ result = tidy_file("supp_table.xlsx", "outputs/")
132
+ print(result.counts) # {'total': 21, 'clean': 16, 'ambiguous': 3, 'failed': 2}
133
+
134
+ # Or clean an in-memory list of identifiers (no files written):
135
+ result = tidy_values(["TP53", "p53", "Sep-7", "ENSG00000141510", "1-Mar", "FOOBAR1"])
136
+ print(result.audit[["input_value", "approved_symbol", "match_status"]])
137
+ ```
138
+
139
+ ```text
140
+ input_value approved_symbol match_status
141
+ 0 TP53 TP53 matched
142
+ 1 p53 TP53 matched_alias
143
+ 2 Sep-7 SEPTIN7 recovered_excel
144
+ 3 ENSG00000141510 TP53 matched
145
+ 4 1-Mar MARCHF1;MTARC1 ambiguous
146
+ 5 FOOBAR1 unmatched
147
+ ```
148
+
149
+ `1-Mar` (ambiguous between `MARCHF1` and `MTARC1`) lands in `result.ambiguous`;
150
+ `FOOBAR1` lands in `result.failed`. Nothing is dropped.
151
+
152
+ ## What it handles
153
+
154
+ | Input | Example | Result |
155
+ |---|---|---|
156
+ | Approved symbol | `TP53` | `matched` → TP53 |
157
+ | Alias symbol | `p53`, `HER2` | `matched_alias` (warns "resolved from alias") |
158
+ | Previous symbol | `FRAP1`, `VEGF` | `matched_prev` (warns "resolved from previous symbol") |
159
+ | Ensembl gene | `ENSG00000141510` | `matched` → TP53 |
160
+ | UniProt | `P38398` | `matched` → BRCA1 |
161
+ | Entrez | `672` | `matched` → BRCA1 |
162
+ | RefSeq | `NM_000546` | `matched` → TP53 |
163
+ | HGNC ID | `HGNC:11998` | `matched` → TP53 |
164
+ | **Excel date corruption** | `Sep-7` | `recovered_excel` → SEPTIN7 (always warns) |
165
+ | **Ambiguous corruption** | `1-Mar`, `2-Sep`, `1-Dec` | `ambiguous` → e.g. MARCHF1/MTARC1, SEPTIN2/SEPTIN6 → manual review |
166
+ | Multiple IDs per cell | `KRAS, NRAS` | split and resolved independently |
167
+ | Case / whitespace | ` tp53 ` | normalised → TP53 |
168
+ | Duplicates | `TP53` ×2 | kept, flagged in `warning` |
169
+ | No match | `FOOBAR1` | `unmatched` → `failed_rows.csv` |
170
+
171
+ ## Output files
172
+
173
+ Every run writes six files to `--out`:
174
+
175
+ | File | Contents |
176
+ |---|---|
177
+ | `clean_table.xlsx` / `clean_table.csv` | confidently resolved rows |
178
+ | `ambiguous_rows.csv` | one-to-many / uncertain rows needing manual review |
179
+ | `failed_rows.csv` | unmatched and empty rows |
180
+ | `mapping_audit.csv` | **every** input → output, with full provenance (see below) |
181
+ | `methods_text.txt` | paste-ready methods paragraph (tool + HGNC version/date) |
182
+
183
+ ### Columns (required schema)
184
+
185
+ `input_value`, `detected_type`, `approved_symbol`, `hgnc_id`,
186
+ `ensembl_gene_id`, `uniprot_id`, `entrez_id`, `refseq_id`, `match_status`,
187
+ `warning`, `source_used`, `manual_review_required`
188
+ (plus `source_row` / `source_column` for traceability back to the original table).
189
+
190
+ `match_status` is one of: `matched`, `matched_alias`, `matched_prev`,
191
+ `recovered_excel` (→ clean) · `ambiguous` (→ review) · `unmatched`, `empty`
192
+ (→ failed).
193
+
194
+ Every table also carries per-row provenance — `matched_field` (which HGNC field
195
+ matched: `symbol` / `alias_symbol` / `prev_symbol` / `ensembl_gene_id` /
196
+ `uniprot_ids` / `entrez_id` / `refseq_accession` / `hgnc_id` / `excel_recovery`),
197
+ `match_reason` (human-readable), and `candidate_count` (1 for a clean hit, N for
198
+ ambiguous, 0 for no match). `mapping_audit.csv` additionally records
199
+ `hgnc_dump_date` and `gene_tidy_version` on every row for full reproducibility.
200
+
201
+ ## Source of truth & offline guarantee
202
+
203
+ Resolution runs against a **static, bundled HGNC complete set** —
204
+ `src/gene_tidy/data/hgnc_complete_set.tsv.gz`, containing all ~45,000 Approved
205
+ HGNC gene records — matched against the approved symbol, `alias_symbol`, and
206
+ `prev_symbol` fields. The accompanying `hgnc_version.json` records the source
207
+ URL, HGNC license (CC0), download date, release tag, and record count; the same
208
+ provenance is printed by `gene-tidy --version`, written into every
209
+ `mapping_audit.csv` row, and summarised in `methods_text.txt`.
210
+
211
+ To use a **different / newer** HGNC release, pass
212
+ `--hgnc-file path/to/hgnc_complete_set.txt`, set the `GENE_TIDY_HGNC_FILE`
213
+ environment variable, or regenerate the bundled dump with
214
+ `python tools/build_hgnc_data.py hgnc_complete_set.txt`. A user-supplied file is
215
+ filtered to `status == Approved` automatically.
216
+
217
+ The package and its **tests never require network access.** (The test suite
218
+ resolves against a tiny curated fixture in `tests/fixtures/` for speed; the real
219
+ bundled dump is exercised separately in `tests/test_data_boundary.py`.)
220
+
221
+ > Real-world note: because the bundled data is the *real* HGNC set, genuine
222
+ > one-to-many cases surface honestly. For example `SEPT2` is a previous symbol of
223
+ > `SEPTIN2` **and** an alias of `SEPTIN6`, so gene-tidy reports it `ambiguous`
224
+ > rather than guessing.
225
+
226
+ ## Colab notebook
227
+
228
+ Zero-setup, in-browser: upload a file → run → preview clean/failed/ambiguous
229
+ rows → download a ZIP of all outputs. A bundled `messy_example.xlsx` lets you
230
+ click **Run** and see results immediately.
231
+
232
+ [`notebooks/gene_tidy_colab.ipynb`](notebooks/gene_tidy_colab.ipynb)
233
+ <!-- The "Open in Colab" badge at the top of this README already points at the hosted notebook. -->
234
+
235
+ ## Limitations (v0.1)
236
+
237
+ - Ensembl **transcript/protein** IDs (`ENST…`/`ENSP…`) are detected but not
238
+ resolved offline (gene-level dump only); they are flagged for manual review.
239
+ - Numeric Excel date *serials* (e.g. `44075`) are indistinguishable from Entrez
240
+ IDs and are intentionally **not** reinterpreted.
241
+ - Human only. No HGVS / ClinVar / VEP / gnomAD / liftover / genome-build
242
+ detection / clinical interpretation (out of scope for v0.1).
243
+
244
+ ## Development
245
+
246
+ ```bash
247
+ pip install -e ".[dev]" # installs pytest, build, and twine
248
+ pytest # 116 tests, all offline
249
+ ```
250
+
251
+ Test coverage: ID-type detection, column detection, resolver (alias / prev /
252
+ Excel-corruption / ambiguity), input/output file handling, CLI, golden-output
253
+ regression on the bundled example, and a data-boundary test that loads the real
254
+ bundled HGNC complete set. Most tests use a small curated fixture
255
+ (`tests/fixtures/hgnc_subset.tsv`) so the suite runs in seconds.
256
+
257
+ To refresh the bundled HGNC data (deterministic: the same input always produces
258
+ a byte-identical `.tsv.gz`, and the run records `raw_download_sha256` +
259
+ `bundled_tsv_gz_sha256` in `hgnc_version.json`):
260
+
261
+ ```bash
262
+ python tools/build_hgnc_data.py path/to/hgnc_complete_set.txt # from a pinned file
263
+ python tools/build_hgnc_data.py --download # or fetch current
264
+ ```
265
+
266
+ ## Attribution & citing HGNC
267
+
268
+ gene-tidy resolves identifiers using data from the **HUGO Gene Nomenclature
269
+ Committee (HGNC)**.
270
+
271
+ - **Source:** HGNC complete set (`hgnc_complete_set.txt`) from the HGNC download archive. The exact source URL, snapshot date, and SHA-256 hashes are recorded in `src/gene_tidy/data/hgnc_version.json`.
272
+ - **Snapshot bundled in this release:** see `downloaded_date` and
273
+ `bundled_tsv_gz_sha256` in
274
+ [`src/gene_tidy/data/hgnc_version.json`](src/gene_tidy/data/hgnc_version.json)
275
+ (also printed by `gene-tidy --version` and written into every
276
+ `mapping_audit.csv` / `methods_text.txt`).
277
+ - **License:** HGNC data are released under a
278
+ [CC0 1.0 public-domain dedication](https://www.genenames.org/about/license/),
279
+ so they are free to redistribute; gene-tidy bundles a column-trimmed,
280
+ Approved-only snapshot.
281
+ - **Recommendation:** in your own methods/supplementary text, cite HGNC and
282
+ state the **retrieval month/year** of the dump you used (e.g. *"HGNC complete
283
+ set, retrieved June 2026, via gene-tidy v0.1.0"*). The exact date and hash are
284
+ in `hgnc_version.json` and the generated `methods_text.txt`.
285
+
286
+ Please cite HGNC: Seal RL, *et al.* *Genenames.org: the HGNC resources in 2023.*
287
+ Nucleic Acids Res. 2023;51(D1):D1003–D1009.
288
+
289
+ ## License
290
+
291
+ gene-tidy itself is MIT — see [LICENSE](LICENSE). The bundled HGNC data is CC0
292
+ (see Attribution above).
@@ -0,0 +1,261 @@
1
+ # gene-tidy
2
+
3
+ [![PyPI](https://img.shields.io/pypi/v/gene-tidy)](https://pypi.org/project/gene-tidy/)
4
+ [![Python](https://img.shields.io/badge/python-3.10%2B-blue)](https://www.python.org/)
5
+ [![License: MIT](https://img.shields.io/badge/license-MIT-green)](LICENSE)
6
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/MargoSolo/gene-tidy/blob/main/notebooks/gene_tidy_colab.ipynb)
7
+
8
+ <!-- No CI badge until a real GitHub Actions workflow exists. -->
9
+
10
+ **Clean messy gene/protein identifier tables — fully offline, fully audited, no code required.**
11
+
12
+ Drop in a TXT/CSV/XLSX from a paper, a supplementary file, or a lab Excel sheet,
13
+ and get back a clean, multi-ID, fully-traceable table. Every value is mapped to
14
+ the current HGNC approved symbol plus Ensembl / UniProt / Entrez / RefSeq
15
+ cross-references — and nothing is ever guessed silently or dropped.
16
+
17
+ Inspired by [HGNChelper](https://cran.r-project.org/package=HGNChelper) (R), but
18
+ in Python, mapping to all major IDs, with **explicit ambiguity handling** and
19
+ **Excel date-corruption recovery** (`SEPT2 → "2-Sep"`, `MARCH1 → "1-Mar"`).
20
+
21
+ ![gene-tidy demo](docs/demo.gif)
22
+
23
+ <!-- TODO: replace docs/demo.gif with a real screencast of the CLI / Colab run. -->
24
+
25
+ ## Scope
26
+
27
+ gene-tidy is **HGNC-centered, offline, and reproducible**: it standardises human
28
+ gene/protein identifiers against a bundled static HGNC complete set and records
29
+ exactly which version it used. It is **not** a full
30
+ [BioMart](https://www.ensembl.org/biomart/) / [VEP](https://www.ensembl.org/vep)
31
+ / [UniProt](https://www.uniprot.org/) mapping service — it does no live lookups,
32
+ no transcript/variant annotation, and no cross-database ID expansion beyond the
33
+ gene-level cross-references HGNC itself provides. If you need exhaustive,
34
+ always-current, multi-database mapping, use those tools; if you need a fast,
35
+ offline, auditable HGNC cleanup you can cite in a methods section, use gene-tidy.
36
+
37
+ ## Why
38
+
39
+ - **Works offline, out of the box.** The full [HGNC](https://www.genenames.org/)
40
+ complete set (all Approved gene records) ships inside the package as a gzipped
41
+ TSV. No network, no API keys, no surprises — and the exact HGNC version is
42
+ recorded in every run.
43
+ - **Never guesses silently.** One-to-many or uncertain mappings are flagged
44
+ `ambiguous` / `manual_review_required` and routed to a separate file.
45
+ - **Never drops a row.** Clean, ambiguous, and failed rows are all accounted for.
46
+ - **Reproducible.** Every run emits a `methods_text.txt` paragraph (tool version,
47
+ HGNC version + date) ready to paste into a supplementary methods section.
48
+
49
+ ## Install
50
+
51
+ ```bash
52
+ pip install gene-tidy
53
+ ```
54
+
55
+ Requires Python 3.10+. Dependencies: `pandas`, `openpyxl`, `typer`.
56
+
57
+ To install the latest development version directly from GitHub:
58
+
59
+ ```bash
60
+ pip install git+https://github.com/MargoSolo/gene-tidy.git
61
+ ```
62
+
63
+ From source (recommended for development):
64
+
65
+ ```bash
66
+ git clone https://github.com/MargoSolo/gene-tidy
67
+ cd gene-tidy
68
+ pip install -e .
69
+ ```
70
+
71
+ ## Quickstart (CLI)
72
+
73
+ ```bash
74
+ gene-tidy input.xlsx --out outputs/
75
+ ```
76
+
77
+ That's it. `outputs/` will contain six files (see below). Works the same on
78
+ `.txt`, `.csv`, `.tsv`, and `.xlsx`:
79
+
80
+ ```bash
81
+ gene-tidy my_genes.txt --out outputs/
82
+ gene-tidy supp_table.csv --out outputs/
83
+ ```
84
+
85
+ Useful flags:
86
+
87
+ ```bash
88
+ gene-tidy data.xlsx -o out/ --column gene_symbol # force the identifier column
89
+ gene-tidy data.csv -o out/ --column symbol -c ensembl_id # multiple columns
90
+ gene-tidy data.xlsx -o out/ --hgnc-file hgnc_complete_set.txt # use a different/newer HGNC release instead of the bundled one
91
+ gene-tidy --version # tool + HGNC dump version
92
+ ```
93
+
94
+ ## Quickstart (Python)
95
+
96
+ ```python
97
+ from gene_tidy import tidy_file, tidy_values
98
+
99
+ # Whole file -> writes the six output files, returns a result object.
100
+ result = tidy_file("supp_table.xlsx", "outputs/")
101
+ print(result.counts) # {'total': 21, 'clean': 16, 'ambiguous': 3, 'failed': 2}
102
+
103
+ # Or clean an in-memory list of identifiers (no files written):
104
+ result = tidy_values(["TP53", "p53", "Sep-7", "ENSG00000141510", "1-Mar", "FOOBAR1"])
105
+ print(result.audit[["input_value", "approved_symbol", "match_status"]])
106
+ ```
107
+
108
+ ```text
109
+ input_value approved_symbol match_status
110
+ 0 TP53 TP53 matched
111
+ 1 p53 TP53 matched_alias
112
+ 2 Sep-7 SEPTIN7 recovered_excel
113
+ 3 ENSG00000141510 TP53 matched
114
+ 4 1-Mar MARCHF1;MTARC1 ambiguous
115
+ 5 FOOBAR1 unmatched
116
+ ```
117
+
118
+ `1-Mar` (ambiguous between `MARCHF1` and `MTARC1`) lands in `result.ambiguous`;
119
+ `FOOBAR1` lands in `result.failed`. Nothing is dropped.
120
+
121
+ ## What it handles
122
+
123
+ | Input | Example | Result |
124
+ |---|---|---|
125
+ | Approved symbol | `TP53` | `matched` → TP53 |
126
+ | Alias symbol | `p53`, `HER2` | `matched_alias` (warns "resolved from alias") |
127
+ | Previous symbol | `FRAP1`, `VEGF` | `matched_prev` (warns "resolved from previous symbol") |
128
+ | Ensembl gene | `ENSG00000141510` | `matched` → TP53 |
129
+ | UniProt | `P38398` | `matched` → BRCA1 |
130
+ | Entrez | `672` | `matched` → BRCA1 |
131
+ | RefSeq | `NM_000546` | `matched` → TP53 |
132
+ | HGNC ID | `HGNC:11998` | `matched` → TP53 |
133
+ | **Excel date corruption** | `Sep-7` | `recovered_excel` → SEPTIN7 (always warns) |
134
+ | **Ambiguous corruption** | `1-Mar`, `2-Sep`, `1-Dec` | `ambiguous` → e.g. MARCHF1/MTARC1, SEPTIN2/SEPTIN6 → manual review |
135
+ | Multiple IDs per cell | `KRAS, NRAS` | split and resolved independently |
136
+ | Case / whitespace | ` tp53 ` | normalised → TP53 |
137
+ | Duplicates | `TP53` ×2 | kept, flagged in `warning` |
138
+ | No match | `FOOBAR1` | `unmatched` → `failed_rows.csv` |
139
+
140
+ ## Output files
141
+
142
+ Every run writes six files to `--out`:
143
+
144
+ | File | Contents |
145
+ |---|---|
146
+ | `clean_table.xlsx` / `clean_table.csv` | confidently resolved rows |
147
+ | `ambiguous_rows.csv` | one-to-many / uncertain rows needing manual review |
148
+ | `failed_rows.csv` | unmatched and empty rows |
149
+ | `mapping_audit.csv` | **every** input → output, with full provenance (see below) |
150
+ | `methods_text.txt` | paste-ready methods paragraph (tool + HGNC version/date) |
151
+
152
+ ### Columns (required schema)
153
+
154
+ `input_value`, `detected_type`, `approved_symbol`, `hgnc_id`,
155
+ `ensembl_gene_id`, `uniprot_id`, `entrez_id`, `refseq_id`, `match_status`,
156
+ `warning`, `source_used`, `manual_review_required`
157
+ (plus `source_row` / `source_column` for traceability back to the original table).
158
+
159
+ `match_status` is one of: `matched`, `matched_alias`, `matched_prev`,
160
+ `recovered_excel` (→ clean) · `ambiguous` (→ review) · `unmatched`, `empty`
161
+ (→ failed).
162
+
163
+ Every table also carries per-row provenance — `matched_field` (which HGNC field
164
+ matched: `symbol` / `alias_symbol` / `prev_symbol` / `ensembl_gene_id` /
165
+ `uniprot_ids` / `entrez_id` / `refseq_accession` / `hgnc_id` / `excel_recovery`),
166
+ `match_reason` (human-readable), and `candidate_count` (1 for a clean hit, N for
167
+ ambiguous, 0 for no match). `mapping_audit.csv` additionally records
168
+ `hgnc_dump_date` and `gene_tidy_version` on every row for full reproducibility.
169
+
170
+ ## Source of truth & offline guarantee
171
+
172
+ Resolution runs against a **static, bundled HGNC complete set** —
173
+ `src/gene_tidy/data/hgnc_complete_set.tsv.gz`, containing all ~45,000 Approved
174
+ HGNC gene records — matched against the approved symbol, `alias_symbol`, and
175
+ `prev_symbol` fields. The accompanying `hgnc_version.json` records the source
176
+ URL, HGNC license (CC0), download date, release tag, and record count; the same
177
+ provenance is printed by `gene-tidy --version`, written into every
178
+ `mapping_audit.csv` row, and summarised in `methods_text.txt`.
179
+
180
+ To use a **different / newer** HGNC release, pass
181
+ `--hgnc-file path/to/hgnc_complete_set.txt`, set the `GENE_TIDY_HGNC_FILE`
182
+ environment variable, or regenerate the bundled dump with
183
+ `python tools/build_hgnc_data.py hgnc_complete_set.txt`. A user-supplied file is
184
+ filtered to `status == Approved` automatically.
185
+
186
+ The package and its **tests never require network access.** (The test suite
187
+ resolves against a tiny curated fixture in `tests/fixtures/` for speed; the real
188
+ bundled dump is exercised separately in `tests/test_data_boundary.py`.)
189
+
190
+ > Real-world note: because the bundled data is the *real* HGNC set, genuine
191
+ > one-to-many cases surface honestly. For example `SEPT2` is a previous symbol of
192
+ > `SEPTIN2` **and** an alias of `SEPTIN6`, so gene-tidy reports it `ambiguous`
193
+ > rather than guessing.
194
+
195
+ ## Colab notebook
196
+
197
+ Zero-setup, in-browser: upload a file → run → preview clean/failed/ambiguous
198
+ rows → download a ZIP of all outputs. A bundled `messy_example.xlsx` lets you
199
+ click **Run** and see results immediately.
200
+
201
+ [`notebooks/gene_tidy_colab.ipynb`](notebooks/gene_tidy_colab.ipynb)
202
+ <!-- The "Open in Colab" badge at the top of this README already points at the hosted notebook. -->
203
+
204
+ ## Limitations (v0.1)
205
+
206
+ - Ensembl **transcript/protein** IDs (`ENST…`/`ENSP…`) are detected but not
207
+ resolved offline (gene-level dump only); they are flagged for manual review.
208
+ - Numeric Excel date *serials* (e.g. `44075`) are indistinguishable from Entrez
209
+ IDs and are intentionally **not** reinterpreted.
210
+ - Human only. No HGVS / ClinVar / VEP / gnomAD / liftover / genome-build
211
+ detection / clinical interpretation (out of scope for v0.1).
212
+
213
+ ## Development
214
+
215
+ ```bash
216
+ pip install -e ".[dev]" # installs pytest, build, and twine
217
+ pytest # 116 tests, all offline
218
+ ```
219
+
220
+ Test coverage: ID-type detection, column detection, resolver (alias / prev /
221
+ Excel-corruption / ambiguity), input/output file handling, CLI, golden-output
222
+ regression on the bundled example, and a data-boundary test that loads the real
223
+ bundled HGNC complete set. Most tests use a small curated fixture
224
+ (`tests/fixtures/hgnc_subset.tsv`) so the suite runs in seconds.
225
+
226
+ To refresh the bundled HGNC data (deterministic: the same input always produces
227
+ a byte-identical `.tsv.gz`, and the run records `raw_download_sha256` +
228
+ `bundled_tsv_gz_sha256` in `hgnc_version.json`):
229
+
230
+ ```bash
231
+ python tools/build_hgnc_data.py path/to/hgnc_complete_set.txt # from a pinned file
232
+ python tools/build_hgnc_data.py --download # or fetch current
233
+ ```
234
+
235
+ ## Attribution & citing HGNC
236
+
237
+ gene-tidy resolves identifiers using data from the **HUGO Gene Nomenclature
238
+ Committee (HGNC)**.
239
+
240
+ - **Source:** HGNC complete set (`hgnc_complete_set.txt`) from the HGNC download archive. The exact source URL, snapshot date, and SHA-256 hashes are recorded in `src/gene_tidy/data/hgnc_version.json`.
241
+ - **Snapshot bundled in this release:** see `downloaded_date` and
242
+ `bundled_tsv_gz_sha256` in
243
+ [`src/gene_tidy/data/hgnc_version.json`](src/gene_tidy/data/hgnc_version.json)
244
+ (also printed by `gene-tidy --version` and written into every
245
+ `mapping_audit.csv` / `methods_text.txt`).
246
+ - **License:** HGNC data are released under a
247
+ [CC0 1.0 public-domain dedication](https://www.genenames.org/about/license/),
248
+ so they are free to redistribute; gene-tidy bundles a column-trimmed,
249
+ Approved-only snapshot.
250
+ - **Recommendation:** in your own methods/supplementary text, cite HGNC and
251
+ state the **retrieval month/year** of the dump you used (e.g. *"HGNC complete
252
+ set, retrieved June 2026, via gene-tidy v0.1.0"*). The exact date and hash are
253
+ in `hgnc_version.json` and the generated `methods_text.txt`.
254
+
255
+ Please cite HGNC: Seal RL, *et al.* *Genenames.org: the HGNC resources in 2023.*
256
+ Nucleic Acids Res. 2023;51(D1):D1003–D1009.
257
+
258
+ ## License
259
+
260
+ gene-tidy itself is MIT — see [LICENSE](LICENSE). The bundled HGNC data is CC0
261
+ (see Attribution above).
@@ -0,0 +1,56 @@
1
+ [build-system]
2
+ requires = ["setuptools>=64", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "gene-tidy"
7
+ version = "0.1.0"
8
+ description = "No-code cleaning of messy gene/protein identifier tables, fully offline, with explicit ambiguity handling."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "gene-tidy contributors" }]
13
+ keywords = [
14
+ "bioinformatics",
15
+ "genomics",
16
+ "HGNC",
17
+ "gene symbols",
18
+ "identifier mapping",
19
+ "data cleaning",
20
+ ]
21
+ classifiers = [
22
+ "Development Status :: 4 - Beta",
23
+ "Intended Audience :: Science/Research",
24
+ "License :: OSI Approved :: MIT License",
25
+ "Programming Language :: Python :: 3",
26
+ "Programming Language :: Python :: 3.10",
27
+ "Programming Language :: Python :: 3.11",
28
+ "Programming Language :: Python :: 3.12",
29
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
30
+ ]
31
+ dependencies = [
32
+ "pandas>=1.5",
33
+ "openpyxl>=3.0",
34
+ "typer>=0.9",
35
+ ]
36
+
37
+ [project.optional-dependencies]
38
+ test = ["pytest>=7.0"]
39
+ dev = ["pytest>=7.0", "build>=1.0", "twine>=4.0"]
40
+
41
+ [project.urls]
42
+ Homepage = "https://github.com/MargoSolo/gene-tidy"
43
+ Issues = "https://github.com/MargoSolo/gene-tidy/issues"
44
+
45
+ [project.scripts]
46
+ gene-tidy = "gene_tidy.cli:app"
47
+
48
+ [tool.setuptools.packages.find]
49
+ where = ["src"]
50
+
51
+ [tool.setuptools.package-data]
52
+ gene_tidy = ["data/*.tsv.gz", "data/*.json", "examples/*.xlsx"]
53
+
54
+ [tool.pytest.ini_options]
55
+ testpaths = ["tests"]
56
+ addopts = "-q"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,31 @@
1
+ """gene-tidy: clean messy gene/protein identifier tables, fully offline.
2
+
3
+ Public API
4
+ ----------
5
+ - ``tidy_file``: end-to-end cleaning of a file -> output files on disk.
6
+ - ``tidy_dataframe`` / ``tidy_values``: in-memory cleaning (handy in notebooks).
7
+ - ``OUTPUT_COLUMNS``: the canonical output schema.
8
+ """
9
+
10
+ from .pipeline import (
11
+ OUTPUT_COLUMNS,
12
+ TidyResult,
13
+ tidy_dataframe,
14
+ tidy_file,
15
+ tidy_values,
16
+ )
17
+ from .hgnc import HgncData, load_hgnc, hgnc_version_info
18
+
19
+ __version__ = "0.1.0"
20
+
21
+ __all__ = [
22
+ "__version__",
23
+ "OUTPUT_COLUMNS",
24
+ "TidyResult",
25
+ "tidy_file",
26
+ "tidy_dataframe",
27
+ "tidy_values",
28
+ "HgncData",
29
+ "load_hgnc",
30
+ "hgnc_version_info",
31
+ ]