fetchm2 0.1.1__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {fetchm2-0.1.1/src/fetchm2.egg-info → fetchm2-0.1.2}/PKG-INFO +80 -1
- {fetchm2-0.1.1 → fetchm2-0.1.2}/README.md +79 -0
- {fetchm2-0.1.1 → fetchm2-0.1.2}/docs/STANDARDIZATION.md +14 -1
- {fetchm2-0.1.1 → fetchm2-0.1.2}/docs/VALIDATION_REPORT.md +78 -8
- {fetchm2-0.1.1 → fetchm2-0.1.2}/pyproject.toml +1 -1
- {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2/__init__.py +1 -1
- {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2/analysis.py +1 -0
- fetchm2-0.1.2/src/fetchm2/audit.py +419 -0
- {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2/cli.py +68 -3
- fetchm2-0.1.2/src/fetchm2/data/collection_date_reviewed_rules.csv +1 -0
- {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2/sequence.py +10 -0
- {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2/standardization.py +27 -1
- {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2/utils.py +6 -4
- {fetchm2-0.1.1 → fetchm2-0.1.2/src/fetchm2.egg-info}/PKG-INFO +80 -1
- {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2.egg-info/SOURCES.txt +1 -0
- fetchm2-0.1.2/tests/test_cli.py +207 -0
- {fetchm2-0.1.1 → fetchm2-0.1.2}/tests/test_standardization.py +14 -0
- fetchm2-0.1.1/src/fetchm2/audit.py +0 -126
- fetchm2-0.1.1/tests/test_cli.py +0 -107
- {fetchm2-0.1.1 → fetchm2-0.1.2}/LICENSE +0 -0
- {fetchm2-0.1.1 → fetchm2-0.1.2}/MANIFEST.in +0 -0
- {fetchm2-0.1.1 → fetchm2-0.1.2}/docs/METADATA_ANALYSIS.md +0 -0
- {fetchm2-0.1.1 → fetchm2-0.1.2}/docs/RELEASE_CHECKLIST.md +0 -0
- {fetchm2-0.1.1 → fetchm2-0.1.2}/docs/SEQUENCE_DOWNLOAD.md +0 -0
- {fetchm2-0.1.1 → fetchm2-0.1.2}/environment.yml +0 -0
- {fetchm2-0.1.1 → fetchm2-0.1.2}/examples/offline_metadata.tsv +0 -0
- {fetchm2-0.1.1 → fetchm2-0.1.2}/examples/test_ncbi_dataset.tsv +0 -0
- {fetchm2-0.1.1 → fetchm2-0.1.2}/setup.cfg +0 -0
- {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2/data/__init__.py +0 -0
- {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2/data/approved_broad_categories.csv +0 -0
- {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2/data/controlled_categories.csv +0 -0
- {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2/data/country_mapping.json +0 -0
- {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2/data/geography_reviewed_rules.csv +0 -0
- {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2/data/host_negative_rules.csv +0 -0
- {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2/data/host_synonyms.csv +0 -0
- {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2/metadata.py +0 -0
- {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2.egg-info/dependency_links.txt +0 -0
- {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2.egg-info/entry_points.txt +0 -0
- {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2.egg-info/requires.txt +0 -0
- {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2.egg-info/top_level.txt +0 -0
- {fetchm2-0.1.1 → fetchm2-0.1.2}/test.tsv +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: fetchm2
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.1.2
|
|
4
4
|
Summary: Standalone comprehensive genome metadata standardization and sequence download toolkit.
|
|
5
5
|
Author-email: Tasnimul Arabi Anik <arabianik987@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -40,6 +40,12 @@ FetchM2 is a comprehensive standalone command-line toolkit for bacterial genome
|
|
|
40
40
|
|
|
41
41
|
FetchM2 is designed as the updated successor to the original FetchM standalone tool. It keeps the same practical command-line workflow, but adds many more standardized metadata fields, richer filtering, packaged curation rules, audit outputs, and reproducible test data.
|
|
42
42
|
|
|
43
|
+
Recommended one-command workflow:
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
fetchm2 run --input ncbi_dataset.tsv --outdir results --download
|
|
47
|
+
```
|
|
48
|
+
|
|
43
49
|
## Key Features
|
|
44
50
|
|
|
45
51
|
- Standalone command-line tool installable with `pip` or a conda environment.
|
|
@@ -136,6 +142,29 @@ Run metadata standardization and sequence download in one command:
|
|
|
136
142
|
fetchm2 run --input ncbi_dataset.tsv --outdir results --download
|
|
137
143
|
```
|
|
138
144
|
|
|
145
|
+
## Typical Species/Genus Workflow
|
|
146
|
+
|
|
147
|
+
1. Download an NCBI Genome Datasets TSV or CSV for your target species or genus.
|
|
148
|
+
2. Run FetchM2:
|
|
149
|
+
|
|
150
|
+
```bash
|
|
151
|
+
fetchm2 run --input ncbi_dataset.tsv --outdir results --download
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
3. Review the main outputs:
|
|
155
|
+
|
|
156
|
+
- `results/metadata_output/fetchm2_clean.csv`
|
|
157
|
+
- `results/metadata_analysis/metadata_analysis_report.md`
|
|
158
|
+
- `results/audit/standardization_audit.md`
|
|
159
|
+
- `results/audit/production_readiness_gate.md`
|
|
160
|
+
- `results/sequence/`
|
|
161
|
+
|
|
162
|
+
For large NCBI retrieval jobs without an API key, use a conservative request delay:
|
|
163
|
+
|
|
164
|
+
```bash
|
|
165
|
+
fetchm2 run --input ncbi_dataset.tsv --outdir results --download --workers 3 --sleep 0.4
|
|
166
|
+
```
|
|
167
|
+
|
|
139
168
|
## Metadata Retrieval Workflow
|
|
140
169
|
|
|
141
170
|
FetchM2 can work in two modes.
|
|
@@ -175,6 +204,8 @@ fetchm2 metadata --help
|
|
|
175
204
|
fetchm2 run --help
|
|
176
205
|
fetchm2 seq --help
|
|
177
206
|
fetchm2 audit --help
|
|
207
|
+
fetchm2 validate --help
|
|
208
|
+
fetchm2 analyze --help
|
|
178
209
|
```
|
|
179
210
|
|
|
180
211
|
### `fetchm2 metadata`
|
|
@@ -256,6 +287,16 @@ fetchm2 audit \
|
|
|
256
287
|
--outdir results/audit_rerun
|
|
257
288
|
```
|
|
258
289
|
|
|
290
|
+
### `fetchm2 validate`
|
|
291
|
+
|
|
292
|
+
Runs the same production-readiness checks as `audit`, but names the workflow explicitly for CLI validation:
|
|
293
|
+
|
|
294
|
+
```bash
|
|
295
|
+
fetchm2 validate \
|
|
296
|
+
--input results/metadata_output/fetchm2_clean.csv \
|
|
297
|
+
--outdir results/validation
|
|
298
|
+
```
|
|
299
|
+
|
|
259
300
|
### `fetchm2 analyze`
|
|
260
301
|
|
|
261
302
|
Generates metadata analysis outputs from any existing clean metadata CSV.
|
|
@@ -283,6 +324,41 @@ FetchM2 writes:
|
|
|
283
324
|
- `metadata_analysis/tables/numeric_summary.csv`
|
|
284
325
|
- `metadata_analysis/figures/*.png`
|
|
285
326
|
|
|
327
|
+
Typical output structure:
|
|
328
|
+
|
|
329
|
+
```text
|
|
330
|
+
results/
|
|
331
|
+
├── metadata_output/
|
|
332
|
+
│ ├── fetchm2_clean.csv
|
|
333
|
+
│ ├── fetchm2_clean.tsv
|
|
334
|
+
│ └── fetchm2_report.md
|
|
335
|
+
├── metadata_analysis/
|
|
336
|
+
│ ├── metadata_analysis_report.md
|
|
337
|
+
│ ├── tables/
|
|
338
|
+
│ └── figures/
|
|
339
|
+
├── audit/
|
|
340
|
+
│ ├── standardization_summary.csv
|
|
341
|
+
│ ├── standardization_audit.md
|
|
342
|
+
│ ├── production_readiness_gate.md
|
|
343
|
+
│ ├── production_readiness_gate.json
|
|
344
|
+
│ ├── top_host_review_needed.csv
|
|
345
|
+
│ ├── non_country_values_in_country.csv
|
|
346
|
+
│ ├── country_continent_mismatch.csv
|
|
347
|
+
│ ├── country_subcontinent_mismatch.csv
|
|
348
|
+
│ ├── invalid_collection_years.csv
|
|
349
|
+
│ ├── invalid_host_like_sample_type.csv
|
|
350
|
+
│ ├── source_like_mapped_hosts.csv
|
|
351
|
+
│ ├── source_like_unmapped_hosts_for_review.csv
|
|
352
|
+
│ ├── broad_vocabulary_leakage.csv
|
|
353
|
+
│ ├── sequence_readiness.csv
|
|
354
|
+
│ └── rule_count_summary.csv
|
|
355
|
+
└── sequence/
|
|
356
|
+
├── *.fna
|
|
357
|
+
├── failed_accessions.txt
|
|
358
|
+
├── sequence_download_summary.csv
|
|
359
|
+
└── fetchm2_sequence_cache.sqlite3
|
|
360
|
+
```
|
|
361
|
+
|
|
286
362
|
## Standardized Metadata Fields
|
|
287
363
|
|
|
288
364
|
FetchM2 keeps the original input columns and adds standardized fields.
|
|
@@ -304,6 +380,7 @@ Original FetchM had host-oriented metadata summaries. FetchM2 expands this into
|
|
|
304
380
|
- `Host_Genus`
|
|
305
381
|
- `Host_Species`
|
|
306
382
|
- `Host_Common_Name`
|
|
383
|
+
- `Host_Context_SD`
|
|
307
384
|
- `Host_Match_Method`
|
|
308
385
|
- `Host_Confidence`
|
|
309
386
|
- `Host_Review_Status`
|
|
@@ -428,6 +505,7 @@ FetchM2 ships deterministic rules in `src/fetchm2/data/`:
|
|
|
428
505
|
- `controlled_categories.csv`
|
|
429
506
|
- `approved_broad_categories.csv`
|
|
430
507
|
- `geography_reviewed_rules.csv`
|
|
508
|
+
- `collection_date_reviewed_rules.csv`
|
|
431
509
|
- `country_mapping.json`
|
|
432
510
|
|
|
433
511
|
These rules let the standalone tool produce richer standardized fields without needing a web database.
|
|
@@ -453,6 +531,7 @@ python -m build
|
|
|
453
531
|
python -m twine check dist/*
|
|
454
532
|
python -m pip install dist/fetchm2-*.whl
|
|
455
533
|
fetchm2 metadata --input examples/offline_metadata.tsv --outdir smoke_out --offline
|
|
534
|
+
fetchm2 validate --input smoke_out/metadata_output/fetchm2_clean.csv --outdir smoke_out/validation
|
|
456
535
|
fetchm2 seq --input smoke_out/metadata_output/fetchm2_clean.csv --outdir smoke_seq --country Bangladesh --check-only
|
|
457
536
|
```
|
|
458
537
|
|
|
@@ -4,6 +4,12 @@ FetchM2 is a comprehensive standalone command-line toolkit for bacterial genome
|
|
|
4
4
|
|
|
5
5
|
FetchM2 is designed as the updated successor to the original FetchM standalone tool. It keeps the same practical command-line workflow, but adds many more standardized metadata fields, richer filtering, packaged curation rules, audit outputs, and reproducible test data.
|
|
6
6
|
|
|
7
|
+
Recommended one-command workflow:
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
fetchm2 run --input ncbi_dataset.tsv --outdir results --download
|
|
11
|
+
```
|
|
12
|
+
|
|
7
13
|
## Key Features
|
|
8
14
|
|
|
9
15
|
- Standalone command-line tool installable with `pip` or a conda environment.
|
|
@@ -100,6 +106,29 @@ Run metadata standardization and sequence download in one command:
|
|
|
100
106
|
fetchm2 run --input ncbi_dataset.tsv --outdir results --download
|
|
101
107
|
```
|
|
102
108
|
|
|
109
|
+
## Typical Species/Genus Workflow
|
|
110
|
+
|
|
111
|
+
1. Download an NCBI Genome Datasets TSV or CSV for your target species or genus.
|
|
112
|
+
2. Run FetchM2:
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
fetchm2 run --input ncbi_dataset.tsv --outdir results --download
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
3. Review the main outputs:
|
|
119
|
+
|
|
120
|
+
- `results/metadata_output/fetchm2_clean.csv`
|
|
121
|
+
- `results/metadata_analysis/metadata_analysis_report.md`
|
|
122
|
+
- `results/audit/standardization_audit.md`
|
|
123
|
+
- `results/audit/production_readiness_gate.md`
|
|
124
|
+
- `results/sequence/`
|
|
125
|
+
|
|
126
|
+
For large NCBI retrieval jobs without an API key, use a conservative request delay:
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
fetchm2 run --input ncbi_dataset.tsv --outdir results --download --workers 3 --sleep 0.4
|
|
130
|
+
```
|
|
131
|
+
|
|
103
132
|
## Metadata Retrieval Workflow
|
|
104
133
|
|
|
105
134
|
FetchM2 can work in two modes.
|
|
@@ -139,6 +168,8 @@ fetchm2 metadata --help
|
|
|
139
168
|
fetchm2 run --help
|
|
140
169
|
fetchm2 seq --help
|
|
141
170
|
fetchm2 audit --help
|
|
171
|
+
fetchm2 validate --help
|
|
172
|
+
fetchm2 analyze --help
|
|
142
173
|
```
|
|
143
174
|
|
|
144
175
|
### `fetchm2 metadata`
|
|
@@ -220,6 +251,16 @@ fetchm2 audit \
|
|
|
220
251
|
--outdir results/audit_rerun
|
|
221
252
|
```
|
|
222
253
|
|
|
254
|
+
### `fetchm2 validate`
|
|
255
|
+
|
|
256
|
+
Runs the same production-readiness checks as `audit`, but names the workflow explicitly for CLI validation:
|
|
257
|
+
|
|
258
|
+
```bash
|
|
259
|
+
fetchm2 validate \
|
|
260
|
+
--input results/metadata_output/fetchm2_clean.csv \
|
|
261
|
+
--outdir results/validation
|
|
262
|
+
```
|
|
263
|
+
|
|
223
264
|
### `fetchm2 analyze`
|
|
224
265
|
|
|
225
266
|
Generates metadata analysis outputs from any existing clean metadata CSV.
|
|
@@ -247,6 +288,41 @@ FetchM2 writes:
|
|
|
247
288
|
- `metadata_analysis/tables/numeric_summary.csv`
|
|
248
289
|
- `metadata_analysis/figures/*.png`
|
|
249
290
|
|
|
291
|
+
Typical output structure:
|
|
292
|
+
|
|
293
|
+
```text
|
|
294
|
+
results/
|
|
295
|
+
├── metadata_output/
|
|
296
|
+
│ ├── fetchm2_clean.csv
|
|
297
|
+
│ ├── fetchm2_clean.tsv
|
|
298
|
+
│ └── fetchm2_report.md
|
|
299
|
+
├── metadata_analysis/
|
|
300
|
+
│ ├── metadata_analysis_report.md
|
|
301
|
+
│ ├── tables/
|
|
302
|
+
│ └── figures/
|
|
303
|
+
├── audit/
|
|
304
|
+
│ ├── standardization_summary.csv
|
|
305
|
+
│ ├── standardization_audit.md
|
|
306
|
+
│ ├── production_readiness_gate.md
|
|
307
|
+
│ ├── production_readiness_gate.json
|
|
308
|
+
│ ├── top_host_review_needed.csv
|
|
309
|
+
│ ├── non_country_values_in_country.csv
|
|
310
|
+
│ ├── country_continent_mismatch.csv
|
|
311
|
+
│ ├── country_subcontinent_mismatch.csv
|
|
312
|
+
│ ├── invalid_collection_years.csv
|
|
313
|
+
│ ├── invalid_host_like_sample_type.csv
|
|
314
|
+
│ ├── source_like_mapped_hosts.csv
|
|
315
|
+
│ ├── source_like_unmapped_hosts_for_review.csv
|
|
316
|
+
│ ├── broad_vocabulary_leakage.csv
|
|
317
|
+
│ ├── sequence_readiness.csv
|
|
318
|
+
│ └── rule_count_summary.csv
|
|
319
|
+
└── sequence/
|
|
320
|
+
├── *.fna
|
|
321
|
+
├── failed_accessions.txt
|
|
322
|
+
├── sequence_download_summary.csv
|
|
323
|
+
└── fetchm2_sequence_cache.sqlite3
|
|
324
|
+
```
|
|
325
|
+
|
|
250
326
|
## Standardized Metadata Fields
|
|
251
327
|
|
|
252
328
|
FetchM2 keeps the original input columns and adds standardized fields.
|
|
@@ -268,6 +344,7 @@ Original FetchM had host-oriented metadata summaries. FetchM2 expands this into
|
|
|
268
344
|
- `Host_Genus`
|
|
269
345
|
- `Host_Species`
|
|
270
346
|
- `Host_Common_Name`
|
|
347
|
+
- `Host_Context_SD`
|
|
271
348
|
- `Host_Match_Method`
|
|
272
349
|
- `Host_Confidence`
|
|
273
350
|
- `Host_Review_Status`
|
|
@@ -392,6 +469,7 @@ FetchM2 ships deterministic rules in `src/fetchm2/data/`:
|
|
|
392
469
|
- `controlled_categories.csv`
|
|
393
470
|
- `approved_broad_categories.csv`
|
|
394
471
|
- `geography_reviewed_rules.csv`
|
|
472
|
+
- `collection_date_reviewed_rules.csv`
|
|
395
473
|
- `country_mapping.json`
|
|
396
474
|
|
|
397
475
|
These rules let the standalone tool produce richer standardized fields without needing a web database.
|
|
@@ -417,6 +495,7 @@ python -m build
|
|
|
417
495
|
python -m twine check dist/*
|
|
418
496
|
python -m pip install dist/fetchm2-*.whl
|
|
419
497
|
fetchm2 metadata --input examples/offline_metadata.tsv --outdir smoke_out --offline
|
|
498
|
+
fetchm2 validate --input smoke_out/metadata_output/fetchm2_clean.csv --outdir smoke_out/validation
|
|
420
499
|
fetchm2 seq --input smoke_out/metadata_output/fetchm2_clean.csv --outdir smoke_seq --country Bangladesh --check-only
|
|
421
500
|
```
|
|
422
501
|
|
|
@@ -11,13 +11,14 @@ The packaged files live in `src/fetchm2/data/`:
|
|
|
11
11
|
- `controlled_categories.csv`: source, sample, environment, disease, and health-state rules.
|
|
12
12
|
- `approved_broad_categories.csv`: allowed broad-category vocabulary.
|
|
13
13
|
- `geography_reviewed_rules.csv`: reviewed special geography cases.
|
|
14
|
+
- `collection_date_reviewed_rules.csv`: reviewed date phrases that need explicit year recovery.
|
|
14
15
|
- `country_mapping.json`: country, continent, and subcontinent mapping extracted from public FetchM.
|
|
15
16
|
|
|
16
17
|
## Output Fields
|
|
17
18
|
|
|
18
19
|
FetchM2 writes the original input columns plus standardized columns including:
|
|
19
20
|
|
|
20
|
-
- `Host_SD`, `Host_TaxID`, `Host_Rank`, host lineage fields, match method, confidence, and review status.
|
|
21
|
+
- `Host_SD`, `Host_TaxID`, `Host_Rank`, host lineage fields, `Host_Context_SD`, match method, confidence, and review status.
|
|
21
22
|
- `Sample_Type_SD`, `Isolation_Source_SD`, `Isolation_Site_SD`.
|
|
22
23
|
- `Environment_Medium_SD`, `Environment_Broad_Scale_SD`, `Environment_Local_Scale_SD`.
|
|
23
24
|
- `Host_Disease_SD`, `Host_Health_State_SD`.
|
|
@@ -28,7 +29,19 @@ FetchM2 writes the original input columns plus standardized columns including:
|
|
|
28
29
|
The audit gate fails on obvious category leakage:
|
|
29
30
|
|
|
30
31
|
- non-country values in `Country`
|
|
32
|
+
- country-continent and country-subcontinent mismatches
|
|
33
|
+
- invalid or future `Collection_Year` values
|
|
31
34
|
- host-only values in `Sample_Type_SD`
|
|
32
35
|
- unapproved `Isolation_Source_SD_Broad` values
|
|
36
|
+
- missing required clean standardized columns
|
|
33
37
|
|
|
34
38
|
Warnings are used for curation backlogs such as high host review counts.
|
|
39
|
+
|
|
40
|
+
Each `metadata`, `audit`, or `validate` run writes:
|
|
41
|
+
|
|
42
|
+
- `standardization_audit.md`
|
|
43
|
+
- `standardization_summary.csv`
|
|
44
|
+
- `production_readiness_gate.md`
|
|
45
|
+
- `production_readiness_gate.json`
|
|
46
|
+
- `top_host_review_needed.csv`
|
|
47
|
+
- issue-specific review CSVs for country, year, sample/source/host leakage, broad vocabulary, and sequence readiness.
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# FetchM2 Validation Report
|
|
2
2
|
|
|
3
3
|
Validation date: 2026-05-05
|
|
4
|
-
Current validation target: `fetchm2 0.1.1`
|
|
4
|
+
Current validation target: `fetchm2 0.1.2`
|
|
5
5
|
|
|
6
6
|
## Source Baselines
|
|
7
7
|
|
|
@@ -27,6 +27,7 @@ FetchM2 packages these deterministic rule resources:
|
|
|
27
27
|
| `controlled_categories.csv` | 7,505 |
|
|
28
28
|
| `approved_broad_categories.csv` | 50 |
|
|
29
29
|
| `geography_reviewed_rules.csv` | 16 |
|
|
30
|
+
| `collection_date_reviewed_rules.csv` | 0 reviewed rows |
|
|
30
31
|
| `country_mapping.json` | 202 countries/regions |
|
|
31
32
|
|
|
32
33
|
## Commands Validated
|
|
@@ -37,6 +38,7 @@ The following commands were validated in isolated environments:
|
|
|
37
38
|
fetchm2 --version
|
|
38
39
|
fetchm2 metadata --input examples/offline_metadata.tsv --outdir /tmp/fetchm2_smoke --offline
|
|
39
40
|
fetchm2 audit --input /tmp/fetchm2_smoke/metadata_output/fetchm2_clean.csv --outdir /tmp/fetchm2_smoke_audit
|
|
41
|
+
fetchm2 validate --input /tmp/fetchm2_smoke/metadata_output/fetchm2_clean.csv --outdir /tmp/fetchm2_smoke_validation
|
|
40
42
|
fetchm2 seq --input /tmp/fetchm2_smoke/metadata_output/fetchm2_clean.csv --outdir /tmp/fetchm2_smoke_seq --country Bangladesh --check-only
|
|
41
43
|
```
|
|
42
44
|
|
|
@@ -45,28 +47,30 @@ fetchm2 seq --input /tmp/fetchm2_smoke/metadata_output/fetchm2_clean.csv --outdi
|
|
|
45
47
|
Regression tests:
|
|
46
48
|
|
|
47
49
|
```text
|
|
48
|
-
|
|
50
|
+
11 passed
|
|
49
51
|
```
|
|
50
52
|
|
|
51
|
-
Package build for `0.1.1`:
|
|
53
|
+
Package build for `0.1.2`:
|
|
52
54
|
|
|
53
55
|
```text
|
|
54
56
|
python -m build: passed
|
|
55
57
|
twine check dist/*: passed
|
|
56
58
|
```
|
|
57
59
|
|
|
58
|
-
Wheel installation for `0.1.1`:
|
|
60
|
+
Wheel installation for `0.1.2`:
|
|
59
61
|
|
|
60
62
|
```text
|
|
61
|
-
pip install fetchm2-0.1.1-py3-none-any.whl: passed
|
|
62
|
-
fetchm2 --version: fetchm2 0.1.1
|
|
63
|
+
pip install fetchm2-0.1.2-py3-none-any.whl: passed
|
|
64
|
+
fetchm2 --version: fetchm2 0.1.2
|
|
63
65
|
```
|
|
64
66
|
|
|
65
|
-
Wheel smoke test for `0.1.1`:
|
|
67
|
+
Wheel smoke test for `0.1.2`:
|
|
66
68
|
|
|
67
69
|
```text
|
|
68
70
|
metadata command: production gate PASS
|
|
69
71
|
metadata analysis outputs: generated
|
|
72
|
+
validate command: production gate PASS
|
|
73
|
+
sequence check-only: selected 1 accession and completed without download
|
|
70
74
|
```
|
|
71
75
|
|
|
72
76
|
Live NCBI smoke test:
|
|
@@ -159,7 +163,73 @@ FetchM2 `0.1.0` has been released on GitHub and PyPI.
|
|
|
159
163
|
|
|
160
164
|
The `0.1.1` update has passed local tests, package build validation, clean wheel installation, offline packaged-test validation, 20-row live BioSample validation, and full `test.tsv` live BioSample validation.
|
|
161
165
|
|
|
162
|
-
|
|
166
|
+
## Additional 0.1.2 CLI Hardening Validation
|
|
167
|
+
|
|
168
|
+
The 0.1.2 development update adds:
|
|
169
|
+
|
|
170
|
+
- `fetchm2 validate`
|
|
171
|
+
- `production_readiness_gate.md`
|
|
172
|
+
- `production_readiness_gate.json`
|
|
173
|
+
- issue-specific validation CSVs for country, collection year, sample/source/host leakage, broad vocabulary, missing columns, and sequence readiness
|
|
174
|
+
- `Host_Context_SD`
|
|
175
|
+
- packaged `collection_date_reviewed_rules.csv` support
|
|
176
|
+
- final terminal summaries after `metadata`, `run`, `audit`, and `validate`
|
|
177
|
+
- `sequence_download_summary.csv` in `--check-only` mode
|
|
178
|
+
|
|
179
|
+
Local 0.1.2 validation:
|
|
180
|
+
|
|
181
|
+
```text
|
|
182
|
+
pytest: 11 passed
|
|
183
|
+
examples/offline_metadata.tsv metadata run: production gate PASS
|
|
184
|
+
validate command: production gate PASS
|
|
185
|
+
sequence check-only: wrote failed_accessions.txt and sequence_download_summary.csv
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
Live 20-row `test.tsv` validation under 0.1.2:
|
|
189
|
+
|
|
190
|
+
```text
|
|
191
|
+
Rows scanned: 20
|
|
192
|
+
Metadata fetch status: 20 ok
|
|
193
|
+
Metadata fetch errors: 0
|
|
194
|
+
Production gate: PASS
|
|
195
|
+
Host TaxID mapped: 14 / 20 (70.0%)
|
|
196
|
+
Country present: 14 / 20 (70.0%)
|
|
197
|
+
Collection year present: 18 / 20 (90.0%)
|
|
198
|
+
Invalid host-like Sample_Type_SD rows: 0
|
|
199
|
+
Non-country values in Country rows: 0
|
|
200
|
+
Country-continent mismatch rows: 0
|
|
201
|
+
Country-subcontinent mismatch rows: 0
|
|
202
|
+
Invalid/future Collection_Year rows: 0
|
|
203
|
+
Unapproved broad-category rows: 0
|
|
204
|
+
Metadata analysis figures generated: 47
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
Live full `test.tsv` validation under 0.1.2:
|
|
208
|
+
|
|
209
|
+
```text
|
|
210
|
+
Rows scanned: 200
|
|
211
|
+
Production gate: PASS
|
|
212
|
+
Host TaxID mapped: 92 / 200 (46.0%)
|
|
213
|
+
Host review needed: 54
|
|
214
|
+
Country present: 178 / 200 (89.0%)
|
|
215
|
+
Collection year present: 196 / 200 (98.0%)
|
|
216
|
+
Sample_Type_SD present: 66 / 200 (33.0%)
|
|
217
|
+
Isolation_Source_SD present: 28 / 200 (14.0%)
|
|
218
|
+
Isolation_Site_SD present: 46 / 200 (23.0%)
|
|
219
|
+
Environment_Medium_SD present: 10 / 200 (5.0%)
|
|
220
|
+
Invalid host-like Sample_Type_SD rows: 0
|
|
221
|
+
Non-country values in Country rows: 0
|
|
222
|
+
Country-continent mismatch rows: 0
|
|
223
|
+
Country-subcontinent mismatch rows: 0
|
|
224
|
+
Invalid/future Collection_Year rows: 0
|
|
225
|
+
Unapproved Isolation_Source_SD_Broad rows: 0
|
|
226
|
+
Unapproved broad-category rows: 0
|
|
227
|
+
Source-like mapped host rows: 6
|
|
228
|
+
Source-like unmapped host rows: 8
|
|
229
|
+
Sequence-readiness issue rows: 0
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
Known scope notes for `0.1.2`:
|
|
163
233
|
|
|
164
234
|
- host lineage is bundled for common hosts and optionally enriched with `taxonkit` when installed
|
|
165
235
|
- embeddings/BGE are intentionally not used in production standardization
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "fetchm2"
|
|
7
|
-
version = "0.1.1"
|
|
7
|
+
version = "0.1.2"
|
|
8
8
|
description = "Standalone comprehensive genome metadata standardization and sequence download toolkit."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10"
|