fetchm2 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fetchm2-0.1.0/LICENSE +21 -0
- fetchm2-0.1.0/MANIFEST.in +5 -0
- fetchm2-0.1.0/PKG-INFO +208 -0
- fetchm2-0.1.0/README.md +172 -0
- fetchm2-0.1.0/docs/RELEASE_CHECKLIST.md +30 -0
- fetchm2-0.1.0/docs/SEQUENCE_DOWNLOAD.md +45 -0
- fetchm2-0.1.0/docs/STANDARDIZATION.md +34 -0
- fetchm2-0.1.0/docs/VALIDATION_REPORT.md +99 -0
- fetchm2-0.1.0/examples/offline_metadata.tsv +6 -0
- fetchm2-0.1.0/examples/test_ncbi_dataset.tsv +201 -0
- fetchm2-0.1.0/pyproject.toml +62 -0
- fetchm2-0.1.0/setup.cfg +4 -0
- fetchm2-0.1.0/src/fetchm2/__init__.py +6 -0
- fetchm2-0.1.0/src/fetchm2/audit.py +126 -0
- fetchm2-0.1.0/src/fetchm2/cli.py +175 -0
- fetchm2-0.1.0/src/fetchm2/data/__init__.py +2 -0
- fetchm2-0.1.0/src/fetchm2/data/approved_broad_categories.csv +51 -0
- fetchm2-0.1.0/src/fetchm2/data/controlled_categories.csv +7506 -0
- fetchm2-0.1.0/src/fetchm2/data/country_mapping.json +810 -0
- fetchm2-0.1.0/src/fetchm2/data/geography_reviewed_rules.csv +17 -0
- fetchm2-0.1.0/src/fetchm2/data/host_negative_rules.csv +409 -0
- fetchm2-0.1.0/src/fetchm2/data/host_synonyms.csv +7114 -0
- fetchm2-0.1.0/src/fetchm2/metadata.py +244 -0
- fetchm2-0.1.0/src/fetchm2/sequence.py +194 -0
- fetchm2-0.1.0/src/fetchm2/standardization.py +586 -0
- fetchm2-0.1.0/src/fetchm2/utils.py +54 -0
- fetchm2-0.1.0/src/fetchm2.egg-info/PKG-INFO +208 -0
- fetchm2-0.1.0/src/fetchm2.egg-info/SOURCES.txt +32 -0
- fetchm2-0.1.0/src/fetchm2.egg-info/dependency_links.txt +1 -0
- fetchm2-0.1.0/src/fetchm2.egg-info/entry_points.txt +3 -0
- fetchm2-0.1.0/src/fetchm2.egg-info/requires.txt +13 -0
- fetchm2-0.1.0/src/fetchm2.egg-info/top_level.txt +1 -0
- fetchm2-0.1.0/tests/test_cli.py +70 -0
- fetchm2-0.1.0/tests/test_standardization.py +55 -0
fetchm2-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Tasnimul Arabi Anik
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
fetchm2-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fetchm2
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Standalone comprehensive genome metadata standardization and sequence download toolkit.
|
|
5
|
+
Author-email: Tasnimul Arabi Anik <arabianik987@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Tasnimul-Arabi-Anik/FetchM2
|
|
8
|
+
Project-URL: Repository, https://github.com/Tasnimul-Arabi-Anik/FetchM2
|
|
9
|
+
Project-URL: Issues, https://github.com/Tasnimul-Arabi-Anik/FetchM2/issues
|
|
10
|
+
Keywords: NCBI,BioSample,metadata,genomics,standardization,sequence-download
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: pandas>=2.0
|
|
24
|
+
Requires-Dist: requests>=2.31
|
|
25
|
+
Requires-Dist: tqdm>=4.66
|
|
26
|
+
Requires-Dist: matplotlib>=3.7
|
|
27
|
+
Requires-Dist: seaborn>=0.13
|
|
28
|
+
Requires-Dist: plotly>=5.20
|
|
29
|
+
Requires-Dist: kaleido<1.0.0,>=0.2.1
|
|
30
|
+
Requires-Dist: xmltodict>=0.13
|
|
31
|
+
Provides-Extra: dev
|
|
32
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
33
|
+
Requires-Dist: build>=1.2; extra == "dev"
|
|
34
|
+
Requires-Dist: twine>=5.0; extra == "dev"
|
|
35
|
+
Dynamic: license-file
|
|
36
|
+
|
|
37
|
+
# FetchM2
|
|
38
|
+
|
|
39
|
+
FetchM2 is a standalone command-line toolkit for genome metadata retrieval, comprehensive metadata standardization, audit reporting, and optional sequence download.
|
|
40
|
+
|
|
41
|
+
It keeps the simple standalone installation model of the original public [`FetchM`](https://github.com/Tasnimul-Arabi-Anik/FetchM), while packaging deterministic rule files and QA concepts developed in FetchM Web.
|
|
42
|
+
|
|
43
|
+
## What FetchM2 Does
|
|
44
|
+
|
|
45
|
+
- Reads NCBI Genome Datasets TSV/CSV exports.
|
|
46
|
+
- Optionally fetches linked BioSample metadata from NCBI.
|
|
47
|
+
- Standardizes host, country/geography, collection year, sample type, isolation source, isolation site, environment medium, host disease, and host health state.
|
|
48
|
+
- Adds host TaxID, rank, lineage fields, match method, confidence, and review status.
|
|
49
|
+
- Writes clean metadata tables and audit reports.
|
|
50
|
+
- Downloads genome FASTA files from NCBI with flexible filters.
|
|
51
|
+
- Runs offline on already annotated tables for reproducible tests and local standardization.
|
|
52
|
+
|
|
53
|
+
## Installation
|
|
54
|
+
|
|
55
|
+
Recommended clean environment:
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
python -m venv fetchm2-env
|
|
59
|
+
source fetchm2-env/bin/activate
|
|
60
|
+
pip install fetchm2
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
For development from source:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
git clone https://github.com/Tasnimul-Arabi-Anik/FetchM2.git
|
|
67
|
+
cd FetchM2
|
|
68
|
+
python -m pip install -e ".[dev]"
|
|
69
|
+
pytest
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
FetchM2 uses Python dependencies only. `taxonkit` is optional. If available, FetchM2 can use it to enrich less common host TaxIDs with lineage fields; common host lineages are bundled.
|
|
73
|
+
|
|
74
|
+
## Quick Start
|
|
75
|
+
|
|
76
|
+
Offline smoke test using the bundled example:
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
fetchm2 metadata --input examples/offline_metadata.tsv --outdir demo_out --offline
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
Full BioSample metadata retrieval:
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
fetchm2 metadata --input ncbi_dataset.tsv --outdir results
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
With NCBI API key:
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
export NCBI_API_KEY=YOUR_NCBI_API_KEY
|
|
92
|
+
fetchm2 metadata --input ncbi_dataset.tsv --outdir results --workers 6 --sleep 0.15
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
All-in-one metadata plus sequence download:
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
fetchm2 run --input ncbi_dataset.tsv --outdir results --download
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
Filtered sequence download from a clean table:
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
fetchm2 seq \
|
|
105
|
+
--input results/metadata_output/fetchm2_clean.csv \
|
|
106
|
+
--outdir results/sequence \
|
|
107
|
+
--host "Homo sapiens" \
|
|
108
|
+
--country Bangladesh \
|
|
109
|
+
--year-from 2018 \
|
|
110
|
+
--year-to 2024
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## Main Commands
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
fetchm2 metadata --help
|
|
117
|
+
fetchm2 run --help
|
|
118
|
+
fetchm2 seq --help
|
|
119
|
+
fetchm2 audit --help
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
## Metadata Outputs
|
|
123
|
+
|
|
124
|
+
FetchM2 writes:
|
|
125
|
+
|
|
126
|
+
- `metadata_output/fetchm2_clean.csv`
|
|
127
|
+
- `metadata_output/fetchm2_clean.tsv`
|
|
128
|
+
- `metadata_output/fetchm2_report.md`
|
|
129
|
+
- `audit/standardization_summary.csv`
|
|
130
|
+
- `audit/top_host_review_needed.csv`
|
|
131
|
+
- `audit/standardization_audit.md`
|
|
132
|
+
|
|
133
|
+
Important standardized fields include:
|
|
134
|
+
|
|
135
|
+
- `Host_SD`, `Host_TaxID`, `Host_Rank`, `Host_Superkingdom`, `Host_Phylum`, `Host_Class`, `Host_Order`, `Host_Family`, `Host_Genus`, `Host_Species`
|
|
136
|
+
- `Host_Common_Name`, `Host_Match_Method`, `Host_Confidence`, `Host_Review_Status`
|
|
137
|
+
- `Sample_Type_SD`, `Sample_Type_SD_Broad`
|
|
138
|
+
- `Isolation_Source_SD`, `Isolation_Source_SD_Broad`
|
|
139
|
+
- `Isolation_Site_SD`
|
|
140
|
+
- `Environment_Medium_SD`, `Environment_Medium_SD_Broad`
|
|
141
|
+
- `Environment_Broad_Scale_SD`, `Environment_Local_Scale_SD`
|
|
142
|
+
- `Host_Disease_SD`, `Host_Health_State_SD`
|
|
143
|
+
- `Country`, `Continent`, `Subcontinent`, `Collection_Year`
|
|
144
|
+
|
|
145
|
+
## Sequence Download Options
|
|
146
|
+
|
|
147
|
+
FetchM2 supports filtering by:
|
|
148
|
+
|
|
149
|
+
- host
|
|
150
|
+
- host rank
|
|
151
|
+
- country
|
|
152
|
+
- continent
|
|
153
|
+
- subcontinent
|
|
154
|
+
- sample type
|
|
155
|
+
- isolation source
|
|
156
|
+
- environment medium
|
|
157
|
+
- collection year range
|
|
158
|
+
- maximum genomes
|
|
159
|
+
|
|
160
|
+
Use `--check-only` to audit a sequence output directory without downloading.
|
|
161
|
+
|
|
162
|
+
## API Keys
|
|
163
|
+
|
|
164
|
+
For NCBI, prefer environment variables:
|
|
165
|
+
|
|
166
|
+
```bash
|
|
167
|
+
export NCBI_API_KEY=YOUR_NCBI_API_KEY
|
|
168
|
+
export NCBI_EMAIL=you@example.com
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
Do not place API keys in scripts, notebooks, README files, or Git commits.
|
|
172
|
+
|
|
173
|
+
## Design Compared With FetchM and FetchM Web
|
|
174
|
+
|
|
175
|
+
FetchM2 uses the original FetchM standalone flow as the command-line baseline:
|
|
176
|
+
|
|
177
|
+
- metadata
|
|
178
|
+
- run
|
|
179
|
+
- seq
|
|
180
|
+
- SQLite cache
|
|
181
|
+
- NCBI BioSample fetch
|
|
182
|
+
- sequence download from NCBI FTP
|
|
183
|
+
|
|
184
|
+
FetchM2 adds FetchM Web-style standardized metadata fields and deterministic rule files:
|
|
185
|
+
|
|
186
|
+
- host synonyms and negative host rules
|
|
187
|
+
- controlled source/sample/environment categories
|
|
188
|
+
- approved broad vocabulary
|
|
189
|
+
- production-style audit gate
|
|
190
|
+
- richer sequence filtering on standardized fields
|
|
191
|
+
|
|
192
|
+
FetchM2 intentionally does not use embeddings or AI for production mappings. Embeddings can be used later as a review assistant, but final production rules should remain deterministic and auditable.
|
|
193
|
+
|
|
194
|
+
## Testing
|
|
195
|
+
|
|
196
|
+
Run:
|
|
197
|
+
|
|
198
|
+
```bash
|
|
199
|
+
pytest
|
|
200
|
+
python -m build
|
|
201
|
+
python -m pip install dist/fetchm2-*.whl
|
|
202
|
+
fetchm2 metadata --input examples/offline_metadata.tsv --outdir smoke_out --offline
|
|
203
|
+
fetchm2 seq --input smoke_out/metadata_output/fetchm2_clean.csv --outdir smoke_seq --country Bangladesh --check-only
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
## License
|
|
207
|
+
|
|
208
|
+
MIT License.
|
fetchm2-0.1.0/README.md
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
# FetchM2
|
|
2
|
+
|
|
3
|
+
FetchM2 is a standalone command-line toolkit for genome metadata retrieval, comprehensive metadata standardization, audit reporting, and optional sequence download.
|
|
4
|
+
|
|
5
|
+
It keeps the simple standalone installation model of the original public [`FetchM`](https://github.com/Tasnimul-Arabi-Anik/FetchM), while packaging deterministic rule files and QA concepts developed in FetchM Web.
|
|
6
|
+
|
|
7
|
+
## What FetchM2 Does
|
|
8
|
+
|
|
9
|
+
- Reads NCBI Genome Datasets TSV/CSV exports.
|
|
10
|
+
- Optionally fetches linked BioSample metadata from NCBI.
|
|
11
|
+
- Standardizes host, country/geography, collection year, sample type, isolation source, isolation site, environment medium, host disease, and host health state.
|
|
12
|
+
- Adds host TaxID, rank, lineage fields, match method, confidence, and review status.
|
|
13
|
+
- Writes clean metadata tables and audit reports.
|
|
14
|
+
- Downloads genome FASTA files from NCBI with flexible filters.
|
|
15
|
+
- Runs offline on already annotated tables for reproducible tests and local standardization.
|
|
16
|
+
|
|
17
|
+
## Installation
|
|
18
|
+
|
|
19
|
+
Recommended clean environment:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
python -m venv fetchm2-env
|
|
23
|
+
source fetchm2-env/bin/activate
|
|
24
|
+
pip install fetchm2
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
For development from source:
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
git clone https://github.com/Tasnimul-Arabi-Anik/FetchM2.git
|
|
31
|
+
cd FetchM2
|
|
32
|
+
python -m pip install -e ".[dev]"
|
|
33
|
+
pytest
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
FetchM2 requires only Python packages; the external `taxonkit` tool is optional. When `taxonkit` is available, FetchM2 can use it to add lineage fields for less common host TaxIDs; lineages for common hosts are bundled.
|
|
37
|
+
|
|
38
|
+
## Quick Start
|
|
39
|
+
|
|
40
|
+
Offline smoke test using the example table in the repository's `examples/` directory (run from the repository root):
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
fetchm2 metadata --input examples/offline_metadata.tsv --outdir demo_out --offline
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Full BioSample metadata retrieval:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
fetchm2 metadata --input ncbi_dataset.tsv --outdir results
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
With NCBI API key:
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
export NCBI_API_KEY=YOUR_NCBI_API_KEY
|
|
56
|
+
fetchm2 metadata --input ncbi_dataset.tsv --outdir results --workers 6 --sleep 0.15
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
All-in-one metadata plus sequence download:
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
fetchm2 run --input ncbi_dataset.tsv --outdir results --download
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
Filtered sequence download from a clean table:
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
fetchm2 seq \
|
|
69
|
+
--input results/metadata_output/fetchm2_clean.csv \
|
|
70
|
+
--outdir results/sequence \
|
|
71
|
+
--host "Homo sapiens" \
|
|
72
|
+
--country Bangladesh \
|
|
73
|
+
--year-from 2018 \
|
|
74
|
+
--year-to 2024
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Main Commands
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
fetchm2 metadata --help
|
|
81
|
+
fetchm2 run --help
|
|
82
|
+
fetchm2 seq --help
|
|
83
|
+
fetchm2 audit --help
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## Metadata Outputs
|
|
87
|
+
|
|
88
|
+
FetchM2 writes:
|
|
89
|
+
|
|
90
|
+
- `metadata_output/fetchm2_clean.csv`
|
|
91
|
+
- `metadata_output/fetchm2_clean.tsv`
|
|
92
|
+
- `metadata_output/fetchm2_report.md`
|
|
93
|
+
- `audit/standardization_summary.csv`
|
|
94
|
+
- `audit/top_host_review_needed.csv`
|
|
95
|
+
- `audit/standardization_audit.md`
|
|
96
|
+
|
|
97
|
+
Important standardized fields include:
|
|
98
|
+
|
|
99
|
+
- `Host_SD`, `Host_TaxID`, `Host_Rank`, `Host_Superkingdom`, `Host_Phylum`, `Host_Class`, `Host_Order`, `Host_Family`, `Host_Genus`, `Host_Species`
|
|
100
|
+
- `Host_Common_Name`, `Host_Match_Method`, `Host_Confidence`, `Host_Review_Status`
|
|
101
|
+
- `Sample_Type_SD`, `Sample_Type_SD_Broad`
|
|
102
|
+
- `Isolation_Source_SD`, `Isolation_Source_SD_Broad`
|
|
103
|
+
- `Isolation_Site_SD`
|
|
104
|
+
- `Environment_Medium_SD`, `Environment_Medium_SD_Broad`
|
|
105
|
+
- `Environment_Broad_Scale_SD`, `Environment_Local_Scale_SD`
|
|
106
|
+
- `Host_Disease_SD`, `Host_Health_State_SD`
|
|
107
|
+
- `Country`, `Continent`, `Subcontinent`, `Collection_Year`
|
|
108
|
+
|
|
109
|
+
## Sequence Download Options
|
|
110
|
+
|
|
111
|
+
FetchM2 supports filtering by:
|
|
112
|
+
|
|
113
|
+
- host
|
|
114
|
+
- host rank
|
|
115
|
+
- country
|
|
116
|
+
- continent
|
|
117
|
+
- subcontinent
|
|
118
|
+
- sample type
|
|
119
|
+
- isolation source
|
|
120
|
+
- environment medium
|
|
121
|
+
- collection year range
|
|
122
|
+
- maximum genomes
|
|
123
|
+
|
|
124
|
+
Use `--check-only` to compare the expected accessions against an existing sequence output directory without downloading anything.
|
|
125
|
+
|
|
126
|
+
## API Keys
|
|
127
|
+
|
|
128
|
+
For NCBI, prefer environment variables:
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
export NCBI_API_KEY=YOUR_NCBI_API_KEY
|
|
132
|
+
export NCBI_EMAIL=you@example.com
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
Do not place API keys in scripts, notebooks, README files, or Git commits.
|
|
136
|
+
|
|
137
|
+
## Design Compared With FetchM and FetchM Web
|
|
138
|
+
|
|
139
|
+
FetchM2 uses the original FetchM standalone flow as the command-line baseline:
|
|
140
|
+
|
|
141
|
+
- metadata
|
|
142
|
+
- run
|
|
143
|
+
- seq
|
|
144
|
+
- SQLite cache
|
|
145
|
+
- NCBI BioSample fetch
|
|
146
|
+
- sequence download from NCBI FTP
|
|
147
|
+
|
|
148
|
+
FetchM2 adds FetchM Web-style standardized metadata fields and deterministic rule files:
|
|
149
|
+
|
|
150
|
+
- host synonyms and negative host rules
|
|
151
|
+
- controlled source/sample/environment categories
|
|
152
|
+
- approved broad vocabulary
|
|
153
|
+
- production-style audit gate
|
|
154
|
+
- richer sequence filtering on standardized fields
|
|
155
|
+
|
|
156
|
+
FetchM2 intentionally does not use embeddings or AI for production mappings. Embeddings can be used later as a review assistant, but final production rules should remain deterministic and auditable.
|
|
157
|
+
|
|
158
|
+
## Testing
|
|
159
|
+
|
|
160
|
+
Run:
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
pytest
|
|
164
|
+
python -m build
|
|
165
|
+
python -m pip install dist/fetchm2-*.whl
|
|
166
|
+
fetchm2 metadata --input examples/offline_metadata.tsv --outdir smoke_out --offline
|
|
167
|
+
fetchm2 seq --input smoke_out/metadata_output/fetchm2_clean.csv --outdir smoke_seq --country Bangladesh --check-only
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
## License
|
|
171
|
+
|
|
172
|
+
MIT License.
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# FetchM2 Release Checklist
|
|
2
|
+
|
|
3
|
+
Use this checklist before each GitHub/PyPI release.
|
|
4
|
+
|
|
5
|
+
## Pre-release
|
|
6
|
+
|
|
7
|
+
- Update `src/fetchm2/__init__.py` version.
|
|
8
|
+
- Update `pyproject.toml` version.
|
|
9
|
+
- Run `pytest`.
|
|
10
|
+
- Run `python -m build`.
|
|
11
|
+
- Run `python -m twine check dist/*`.
|
|
12
|
+
- Install the wheel in a clean environment.
|
|
13
|
+
- Run offline metadata smoke test.
|
|
14
|
+
- Run sequence `--check-only` smoke test.
|
|
15
|
+
- Run a small live NCBI smoke test when network access is available.
|
|
16
|
+
- Confirm no API keys, tokens, caches, or output directories are committed.
|
|
17
|
+
|
|
18
|
+
## Release
|
|
19
|
+
|
|
20
|
+
- Commit all source, data, docs, tests, and examples.
|
|
21
|
+
- Tag the commit, for example `v0.1.0`.
|
|
22
|
+
- Push branch and tag to GitHub.
|
|
23
|
+
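- Upload `dist/*` to PyPI with `twine`, supplying the PyPI API token through an environment variable (for example `TWINE_PASSWORD`) or a secure prompt, never on the command line or in a committed file.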
|
|
24
|
+
|
|
25
|
+
## Post-release
|
|
26
|
+
|
|
27
|
+
- Install from PyPI in a fresh environment.
|
|
28
|
+
- Run `fetchm2 --version`.
|
|
29
|
+
- Run the offline smoke test from the PyPI-installed package.
|
|
30
|
+
- Record results in `docs/VALIDATION_REPORT.md`.
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# FetchM2 Sequence Download
|
|
2
|
+
|
|
3
|
+
FetchM2 downloads genome FASTA files from the public NCBI genomes FTP layout using the assembly accession and assembly name in `fetchm2_clean.csv`.
|
|
4
|
+
|
|
5
|
+
## Typical Workflow
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
fetchm2 metadata --input ncbi_dataset.tsv --outdir results
|
|
9
|
+
fetchm2 seq --input results/metadata_output/fetchm2_clean.csv --outdir results/sequence
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
## Filtered Download
|
|
13
|
+
|
|
14
|
+
You can filter using standardized metadata:
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
fetchm2 seq \
|
|
18
|
+
--input results/metadata_output/fetchm2_clean.csv \
|
|
19
|
+
--outdir results/sequence_human_bd \
|
|
20
|
+
--host "Homo sapiens" \
|
|
21
|
+
--country Bangladesh \
|
|
22
|
+
--year-from 2018 \
|
|
23
|
+
--year-to 2024
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
Supported filters include:
|
|
27
|
+
|
|
28
|
+
- `--host`
|
|
29
|
+
- `--host-rank`
|
|
30
|
+
- `--country`
|
|
31
|
+
- `--continent`
|
|
32
|
+
- `--subcontinent`
|
|
33
|
+
- `--sample-type`
|
|
34
|
+
- `--isolation-source`
|
|
35
|
+
- `--environment-medium`
|
|
36
|
+
- `--year-from` and `--year-to`
|
|
37
|
+
- `--max-genomes`
|
|
38
|
+
|
|
39
|
+
## Check Only
|
|
40
|
+
|
|
41
|
+
Use `--check-only` to compare expected accessions against an output directory without downloading:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
fetchm2 seq --input fetchm2_clean.csv --outdir sequence --check-only
|
|
45
|
+
```
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# FetchM2 Standardization Notes
|
|
2
|
+
|
|
3
|
+
FetchM2 packages deterministic standardization rules from FetchM Web so the standalone tool can reproduce the major standardized metadata fields without requiring a web database.
|
|
4
|
+
|
|
5
|
+
## Rule Sources
|
|
6
|
+
|
|
7
|
+
The packaged files live in `src/fetchm2/data/`:
|
|
8
|
+
|
|
9
|
+
- `host_synonyms.csv`: exact and broad host mappings with TaxID.
|
|
10
|
+
- `host_negative_rules.csv`: values blocked from `Host_SD`, including source/material/lab artifacts.
|
|
11
|
+
- `controlled_categories.csv`: source, sample, environment, disease, and health-state rules.
|
|
12
|
+
- `approved_broad_categories.csv`: allowed broad-category vocabulary.
|
|
13
|
+
- `geography_reviewed_rules.csv`: reviewed special geography cases.
|
|
14
|
+
- `country_mapping.json`: country, continent, and subcontinent mapping extracted from public FetchM.
|
|
15
|
+
|
|
16
|
+
## Output Fields
|
|
17
|
+
|
|
18
|
+
FetchM2 writes the original input columns plus standardized columns including:
|
|
19
|
+
|
|
20
|
+
- `Host_SD`, `Host_TaxID`, `Host_Rank`, host lineage fields, match method, confidence, and review status.
|
|
21
|
+
- `Sample_Type_SD`, `Isolation_Source_SD`, `Isolation_Site_SD`.
|
|
22
|
+
- `Environment_Medium_SD`, `Environment_Broad_Scale_SD`, `Environment_Local_Scale_SD`.
|
|
23
|
+
- `Host_Disease_SD`, `Host_Health_State_SD`.
|
|
24
|
+
- `Country`, `Continent`, `Subcontinent`, `Collection_Year`.
|
|
25
|
+
|
|
26
|
+
## Production Gate
|
|
27
|
+
|
|
28
|
+
The audit gate fails on obvious category leakage:
|
|
29
|
+
|
|
30
|
+
- non-country values in `Country`
|
|
31
|
+
- host-only values in `Sample_Type_SD`
|
|
32
|
+
- unapproved `Isolation_Source_SD_Broad` values
|
|
33
|
+
|
|
34
|
+
Warnings are used for curation backlogs such as high host review counts.
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# FetchM2 Validation Report
|
|
2
|
+
|
|
3
|
+
Validation date: 2026-05-05
|
|
4
|
+
|
|
5
|
+
## Source Baselines
|
|
6
|
+
|
|
7
|
+
FetchM2 was built using:
|
|
8
|
+
|
|
9
|
+
- public standalone FetchM GitHub repository as the CLI/workflow baseline
|
|
10
|
+
- FetchM Web standardization rule files as deterministic packaged data
|
|
11
|
+
|
|
12
|
+
The public FetchM baseline inspected from GitHub was commit:
|
|
13
|
+
|
|
14
|
+
```text
|
|
15
|
+
11070b9 Avoid Chrome requirement for map export
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Packaged Rule Counts
|
|
19
|
+
|
|
20
|
+
FetchM2 packages these deterministic rule resources:
|
|
21
|
+
|
|
22
|
+
| Rule file | Rows |
|
|
23
|
+
| --- | ---: |
|
|
24
|
+
| `host_synonyms.csv` | 7,113 |
|
|
25
|
+
| `host_negative_rules.csv` | 408 |
|
|
26
|
+
| `controlled_categories.csv` | 7,505 |
|
|
27
|
+
| `approved_broad_categories.csv` | 50 |
|
|
28
|
+
| `geography_reviewed_rules.csv` | 16 |
|
|
29
|
+
| `country_mapping.json` | 202 countries/regions |
|
|
30
|
+
|
|
31
|
+
## Commands Validated
|
|
32
|
+
|
|
33
|
+
The following commands were validated in isolated environments:
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
fetchm2 --version
|
|
37
|
+
fetchm2 metadata --input examples/offline_metadata.tsv --outdir /tmp/fetchm2_smoke --offline
|
|
38
|
+
fetchm2 audit --input /tmp/fetchm2_smoke/metadata_output/fetchm2_clean.csv --outdir /tmp/fetchm2_smoke_audit
|
|
39
|
+
fetchm2 seq --input /tmp/fetchm2_smoke/metadata_output/fetchm2_clean.csv --outdir /tmp/fetchm2_smoke_seq --country Bangladesh --check-only
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Test Results
|
|
43
|
+
|
|
44
|
+
Regression tests:
|
|
45
|
+
|
|
46
|
+
```text
|
|
47
|
+
6 passed
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
Package build:
|
|
51
|
+
|
|
52
|
+
```text
|
|
53
|
+
python -m build: passed
|
|
54
|
+
twine check dist/*: passed
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Wheel installation:
|
|
58
|
+
|
|
59
|
+
```text
|
|
60
|
+
pip install dist/fetchm2-0.1.0-py3-none-any.whl: passed
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
Wheel smoke test:
|
|
64
|
+
|
|
65
|
+
```text
|
|
66
|
+
metadata command: production gate PASS
|
|
67
|
+
sequence check-only: selected 1 accession, wrote failed_accessions.txt as expected
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Live NCBI smoke test:
|
|
71
|
+
|
|
72
|
+
```text
|
|
73
|
+
Input: first two rows from public FetchM test.tsv
|
|
74
|
+
BioSample records requested: 1 unique BioSample
|
|
75
|
+
Result: metadata command completed and production gate PASS
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## Regression Scenarios Covered
|
|
79
|
+
|
|
80
|
+
The test suite checks:
|
|
81
|
+
|
|
82
|
+
- `human blood` maps to `Host_SD=Homo sapiens`, `Host_TaxID=9606`, and `Sample_Type_SD=blood`
|
|
83
|
+
- `bacteria culture` is blocked from `Host_SD`
|
|
84
|
+
- `Hospital` is blocked from standardized `Country`
|
|
85
|
+
- `turkey breast sandwich` does not become a country or host
|
|
86
|
+
- `water deer` remains a valid host and is not treated as water
|
|
87
|
+
- metadata CLI writes clean outputs and audit files
|
|
88
|
+
- sequence `--check-only` works with standardized filters
|
|
89
|
+
|
|
90
|
+
## Release Readiness
|
|
91
|
+
|
|
92
|
+
FetchM2 is ready for an initial `0.1.0` GitHub/PyPI release as a standalone alpha package.
|
|
93
|
+
|
|
94
|
+
Known scope limits for `0.1.0`:
|
|
95
|
+
|
|
96
|
+
- host lineage is bundled for common hosts and optionally enriched with `taxonkit` when installed
|
|
97
|
+
- figures are not yet as extensive as FetchM Web dashboards
|
|
98
|
+
- embeddings/BGE are intentionally not used in production standardization
|
|
99
|
+
- large-scale sequence download was not run during this validation to avoid unnecessary NCBI load
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
Assembly Accession Assembly Name Organism Name ANI Check status CheckM completeness Host Collection Date Geographic Location Isolation Source Sample Type Isolation Site Environment Medium Host Disease Host Health State
|
|
2
|
+
GCF_000001.1 Test assembly 1 Escherichia coli OK 99.1 human blood 2020-04-01 Bangladesh: Dhaka blood blood vein healthy healthy
|
|
3
|
+
GCF_000002.1 Test assembly 2 Salmonella enterica OK 98.4 cattle feces 2019 United States: Texas feces feces rectum diseased mastitis
|
|
4
|
+
GCF_000003.1 Test assembly 3 Vibrio cholerae OK 97.0 not collected 2021-07 soil around the Arctic Ocean soil soil soil
|
|
5
|
+
GCF_000004.1 Test assembly 4 Staphylococcus aureus Failed 96.2 bacteria culture 2022 Hospital pure culture culture unknown
|
|
6
|
+
GCF_000005.1 Test assembly 5 Campylobacter jejuni OK 95.5 turkey breast sandwich 2023 ground turkey food meat retail food
|