fetchm2 0.1.1__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. {fetchm2-0.1.1/src/fetchm2.egg-info → fetchm2-0.1.2}/PKG-INFO +80 -1
  2. {fetchm2-0.1.1 → fetchm2-0.1.2}/README.md +79 -0
  3. {fetchm2-0.1.1 → fetchm2-0.1.2}/docs/STANDARDIZATION.md +14 -1
  4. {fetchm2-0.1.1 → fetchm2-0.1.2}/docs/VALIDATION_REPORT.md +78 -8
  5. {fetchm2-0.1.1 → fetchm2-0.1.2}/pyproject.toml +1 -1
  6. {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2/__init__.py +1 -1
  7. {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2/analysis.py +1 -0
  8. fetchm2-0.1.2/src/fetchm2/audit.py +419 -0
  9. {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2/cli.py +68 -3
  10. fetchm2-0.1.2/src/fetchm2/data/collection_date_reviewed_rules.csv +1 -0
  11. {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2/sequence.py +10 -0
  12. {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2/standardization.py +27 -1
  13. {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2/utils.py +6 -4
  14. {fetchm2-0.1.1 → fetchm2-0.1.2/src/fetchm2.egg-info}/PKG-INFO +80 -1
  15. {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2.egg-info/SOURCES.txt +1 -0
  16. fetchm2-0.1.2/tests/test_cli.py +207 -0
  17. {fetchm2-0.1.1 → fetchm2-0.1.2}/tests/test_standardization.py +14 -0
  18. fetchm2-0.1.1/src/fetchm2/audit.py +0 -126
  19. fetchm2-0.1.1/tests/test_cli.py +0 -107
  20. {fetchm2-0.1.1 → fetchm2-0.1.2}/LICENSE +0 -0
  21. {fetchm2-0.1.1 → fetchm2-0.1.2}/MANIFEST.in +0 -0
  22. {fetchm2-0.1.1 → fetchm2-0.1.2}/docs/METADATA_ANALYSIS.md +0 -0
  23. {fetchm2-0.1.1 → fetchm2-0.1.2}/docs/RELEASE_CHECKLIST.md +0 -0
  24. {fetchm2-0.1.1 → fetchm2-0.1.2}/docs/SEQUENCE_DOWNLOAD.md +0 -0
  25. {fetchm2-0.1.1 → fetchm2-0.1.2}/environment.yml +0 -0
  26. {fetchm2-0.1.1 → fetchm2-0.1.2}/examples/offline_metadata.tsv +0 -0
  27. {fetchm2-0.1.1 → fetchm2-0.1.2}/examples/test_ncbi_dataset.tsv +0 -0
  28. {fetchm2-0.1.1 → fetchm2-0.1.2}/setup.cfg +0 -0
  29. {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2/data/__init__.py +0 -0
  30. {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2/data/approved_broad_categories.csv +0 -0
  31. {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2/data/controlled_categories.csv +0 -0
  32. {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2/data/country_mapping.json +0 -0
  33. {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2/data/geography_reviewed_rules.csv +0 -0
  34. {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2/data/host_negative_rules.csv +0 -0
  35. {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2/data/host_synonyms.csv +0 -0
  36. {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2/metadata.py +0 -0
  37. {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2.egg-info/dependency_links.txt +0 -0
  38. {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2.egg-info/entry_points.txt +0 -0
  39. {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2.egg-info/requires.txt +0 -0
  40. {fetchm2-0.1.1 → fetchm2-0.1.2}/src/fetchm2.egg-info/top_level.txt +0 -0
  41. {fetchm2-0.1.1 → fetchm2-0.1.2}/test.tsv +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fetchm2
3
- Version: 0.1.1
3
+ Version: 0.1.2
4
4
  Summary: Standalone comprehensive genome metadata standardization and sequence download toolkit.
5
5
  Author-email: Tasnimul Arabi Anik <arabianik987@gmail.com>
6
6
  License-Expression: MIT
@@ -40,6 +40,12 @@ FetchM2 is a comprehensive standalone command-line toolkit for bacterial genome
40
40
 
41
41
  FetchM2 is designed as the updated successor to the original FetchM standalone tool. It keeps the same practical command-line workflow, but adds many more standardized metadata fields, richer filtering, packaged curation rules, audit outputs, and reproducible test data.
42
42
 
43
+ Recommended one-command workflow:
44
+
45
+ ```bash
46
+ fetchm2 run --input ncbi_dataset.tsv --outdir results --download
47
+ ```
48
+
43
49
  ## Key Features
44
50
 
45
51
  - Standalone command-line tool installable with `pip` or a conda environment.
@@ -136,6 +142,29 @@ Run metadata standardization and sequence download in one command:
136
142
  fetchm2 run --input ncbi_dataset.tsv --outdir results --download
137
143
  ```
138
144
 
145
+ ## Typical Species/Genus Workflow
146
+
147
+ 1. Download an NCBI Genome Datasets TSV or CSV for your target species or genus.
148
+ 2. Run FetchM2:
149
+
150
+ ```bash
151
+ fetchm2 run --input ncbi_dataset.tsv --outdir results --download
152
+ ```
153
+
154
+ 3. Review the main outputs:
155
+
156
+ - `results/metadata_output/fetchm2_clean.csv`
157
+ - `results/metadata_analysis/metadata_analysis_report.md`
158
+ - `results/audit/standardization_audit.md`
159
+ - `results/audit/production_readiness_gate.md`
160
+ - `results/sequence/`
161
+
162
+ For large NCBI retrieval jobs without an API key, use a conservative request delay:
163
+
164
+ ```bash
165
+ fetchm2 run --input ncbi_dataset.tsv --outdir results --download --workers 3 --sleep 0.4
166
+ ```
167
+
139
168
  ## Metadata Retrieval Workflow
140
169
 
141
170
  FetchM2 can work in two modes.
@@ -175,6 +204,8 @@ fetchm2 metadata --help
175
204
  fetchm2 run --help
176
205
  fetchm2 seq --help
177
206
  fetchm2 audit --help
207
+ fetchm2 validate --help
208
+ fetchm2 analyze --help
178
209
  ```
179
210
 
180
211
  ### `fetchm2 metadata`
@@ -256,6 +287,16 @@ fetchm2 audit \
256
287
  --outdir results/audit_rerun
257
288
  ```
258
289
 
290
+ ### `fetchm2 validate`
291
+
292
+ Runs the same production-readiness checks as `audit`, exposed under a dedicated command name for validation workflows:
293
+
294
+ ```bash
295
+ fetchm2 validate \
296
+ --input results/metadata_output/fetchm2_clean.csv \
297
+ --outdir results/validation
298
+ ```
299
+
259
300
  ### `fetchm2 analyze`
260
301
 
261
302
  Generates metadata analysis outputs from any existing clean metadata CSV.
@@ -283,6 +324,41 @@ FetchM2 writes:
283
324
  - `metadata_analysis/tables/numeric_summary.csv`
284
325
  - `metadata_analysis/figures/*.png`
285
326
 
327
+ Typical output structure:
328
+
329
+ ```text
330
+ results/
331
+ ├── metadata_output/
332
+ │ ├── fetchm2_clean.csv
333
+ │ ├── fetchm2_clean.tsv
334
+ │ └── fetchm2_report.md
335
+ ├── metadata_analysis/
336
+ │ ├── metadata_analysis_report.md
337
+ │ ├── tables/
338
+ │ └── figures/
339
+ ├── audit/
340
+ │ ├── standardization_summary.csv
341
+ │ ├── standardization_audit.md
342
+ │ ├── production_readiness_gate.md
343
+ │ ├── production_readiness_gate.json
344
+ │ ├── top_host_review_needed.csv
345
+ │ ├── non_country_values_in_country.csv
346
+ │ ├── country_continent_mismatch.csv
347
+ │ ├── country_subcontinent_mismatch.csv
348
+ │ ├── invalid_collection_years.csv
349
+ │ ├── invalid_host_like_sample_type.csv
350
+ │ ├── source_like_mapped_hosts.csv
351
+ │ ├── source_like_unmapped_hosts_for_review.csv
352
+ │ ├── broad_vocabulary_leakage.csv
353
+ │ ├── sequence_readiness.csv
354
+ │ └── rule_count_summary.csv
355
+ └── sequence/
356
+ ├── *.fna
357
+ ├── failed_accessions.txt
358
+ ├── sequence_download_summary.csv
359
+ └── fetchm2_sequence_cache.sqlite3
360
+ ```
361
+
286
362
  ## Standardized Metadata Fields
287
363
 
288
364
  FetchM2 keeps the original input columns and adds standardized fields.
@@ -304,6 +380,7 @@ Original FetchM had host-oriented metadata summaries. FetchM2 expands this into
304
380
  - `Host_Genus`
305
381
  - `Host_Species`
306
382
  - `Host_Common_Name`
383
+ - `Host_Context_SD`
307
384
  - `Host_Match_Method`
308
385
  - `Host_Confidence`
309
386
  - `Host_Review_Status`
@@ -428,6 +505,7 @@ FetchM2 ships deterministic rules in `src/fetchm2/data/`:
428
505
  - `controlled_categories.csv`
429
506
  - `approved_broad_categories.csv`
430
507
  - `geography_reviewed_rules.csv`
508
+ - `collection_date_reviewed_rules.csv`
431
509
  - `country_mapping.json`
432
510
 
433
511
  These rules let the standalone tool produce richer standardized fields without needing a web database.
@@ -453,6 +531,7 @@ python -m build
453
531
  python -m twine check dist/*
454
532
  python -m pip install dist/fetchm2-*.whl
455
533
  fetchm2 metadata --input examples/offline_metadata.tsv --outdir smoke_out --offline
534
+ fetchm2 validate --input smoke_out/metadata_output/fetchm2_clean.csv --outdir smoke_out/validation
456
535
  fetchm2 seq --input smoke_out/metadata_output/fetchm2_clean.csv --outdir smoke_seq --country Bangladesh --check-only
457
536
  ```
458
537
 
@@ -4,6 +4,12 @@ FetchM2 is a comprehensive standalone command-line toolkit for bacterial genome
4
4
 
5
5
  FetchM2 is designed as the updated successor to the original FetchM standalone tool. It keeps the same practical command-line workflow, but adds many more standardized metadata fields, richer filtering, packaged curation rules, audit outputs, and reproducible test data.
6
6
 
7
+ Recommended one-command workflow:
8
+
9
+ ```bash
10
+ fetchm2 run --input ncbi_dataset.tsv --outdir results --download
11
+ ```
12
+
7
13
  ## Key Features
8
14
 
9
15
  - Standalone command-line tool installable with `pip` or a conda environment.
@@ -100,6 +106,29 @@ Run metadata standardization and sequence download in one command:
100
106
  fetchm2 run --input ncbi_dataset.tsv --outdir results --download
101
107
  ```
102
108
 
109
+ ## Typical Species/Genus Workflow
110
+
111
+ 1. Download an NCBI Genome Datasets TSV or CSV for your target species or genus.
112
+ 2. Run FetchM2:
113
+
114
+ ```bash
115
+ fetchm2 run --input ncbi_dataset.tsv --outdir results --download
116
+ ```
117
+
118
+ 3. Review the main outputs:
119
+
120
+ - `results/metadata_output/fetchm2_clean.csv`
121
+ - `results/metadata_analysis/metadata_analysis_report.md`
122
+ - `results/audit/standardization_audit.md`
123
+ - `results/audit/production_readiness_gate.md`
124
+ - `results/sequence/`
125
+
126
+ For large NCBI retrieval jobs without an API key, use a conservative request delay:
127
+
128
+ ```bash
129
+ fetchm2 run --input ncbi_dataset.tsv --outdir results --download --workers 3 --sleep 0.4
130
+ ```
131
+
103
132
  ## Metadata Retrieval Workflow
104
133
 
105
134
  FetchM2 can work in two modes.
@@ -139,6 +168,8 @@ fetchm2 metadata --help
139
168
  fetchm2 run --help
140
169
  fetchm2 seq --help
141
170
  fetchm2 audit --help
171
+ fetchm2 validate --help
172
+ fetchm2 analyze --help
142
173
  ```
143
174
 
144
175
  ### `fetchm2 metadata`
@@ -220,6 +251,16 @@ fetchm2 audit \
220
251
  --outdir results/audit_rerun
221
252
  ```
222
253
 
254
+ ### `fetchm2 validate`
255
+
256
+ Runs the same production-readiness checks as `audit`, exposed under a dedicated command name for validation workflows:
257
+
258
+ ```bash
259
+ fetchm2 validate \
260
+ --input results/metadata_output/fetchm2_clean.csv \
261
+ --outdir results/validation
262
+ ```
263
+
223
264
  ### `fetchm2 analyze`
224
265
 
225
266
  Generates metadata analysis outputs from any existing clean metadata CSV.
@@ -247,6 +288,41 @@ FetchM2 writes:
247
288
  - `metadata_analysis/tables/numeric_summary.csv`
248
289
  - `metadata_analysis/figures/*.png`
249
290
 
291
+ Typical output structure:
292
+
293
+ ```text
294
+ results/
295
+ ├── metadata_output/
296
+ │ ├── fetchm2_clean.csv
297
+ │ ├── fetchm2_clean.tsv
298
+ │ └── fetchm2_report.md
299
+ ├── metadata_analysis/
300
+ │ ├── metadata_analysis_report.md
301
+ │ ├── tables/
302
+ │ └── figures/
303
+ ├── audit/
304
+ │ ├── standardization_summary.csv
305
+ │ ├── standardization_audit.md
306
+ │ ├── production_readiness_gate.md
307
+ │ ├── production_readiness_gate.json
308
+ │ ├── top_host_review_needed.csv
309
+ │ ├── non_country_values_in_country.csv
310
+ │ ├── country_continent_mismatch.csv
311
+ │ ├── country_subcontinent_mismatch.csv
312
+ │ ├── invalid_collection_years.csv
313
+ │ ├── invalid_host_like_sample_type.csv
314
+ │ ├── source_like_mapped_hosts.csv
315
+ │ ├── source_like_unmapped_hosts_for_review.csv
316
+ │ ├── broad_vocabulary_leakage.csv
317
+ │ ├── sequence_readiness.csv
318
+ │ └── rule_count_summary.csv
319
+ └── sequence/
320
+ ├── *.fna
321
+ ├── failed_accessions.txt
322
+ ├── sequence_download_summary.csv
323
+ └── fetchm2_sequence_cache.sqlite3
324
+ ```
325
+
250
326
  ## Standardized Metadata Fields
251
327
 
252
328
  FetchM2 keeps the original input columns and adds standardized fields.
@@ -268,6 +344,7 @@ Original FetchM had host-oriented metadata summaries. FetchM2 expands this into
268
344
  - `Host_Genus`
269
345
  - `Host_Species`
270
346
  - `Host_Common_Name`
347
+ - `Host_Context_SD`
271
348
  - `Host_Match_Method`
272
349
  - `Host_Confidence`
273
350
  - `Host_Review_Status`
@@ -392,6 +469,7 @@ FetchM2 ships deterministic rules in `src/fetchm2/data/`:
392
469
  - `controlled_categories.csv`
393
470
  - `approved_broad_categories.csv`
394
471
  - `geography_reviewed_rules.csv`
472
+ - `collection_date_reviewed_rules.csv`
395
473
  - `country_mapping.json`
396
474
 
397
475
  These rules let the standalone tool produce richer standardized fields without needing a web database.
@@ -417,6 +495,7 @@ python -m build
417
495
  python -m twine check dist/*
418
496
  python -m pip install dist/fetchm2-*.whl
419
497
  fetchm2 metadata --input examples/offline_metadata.tsv --outdir smoke_out --offline
498
+ fetchm2 validate --input smoke_out/metadata_output/fetchm2_clean.csv --outdir smoke_out/validation
420
499
  fetchm2 seq --input smoke_out/metadata_output/fetchm2_clean.csv --outdir smoke_seq --country Bangladesh --check-only
421
500
  ```
422
501
 
@@ -11,13 +11,14 @@ The packaged files live in `src/fetchm2/data/`:
11
11
  - `controlled_categories.csv`: source, sample, environment, disease, and health-state rules.
12
12
  - `approved_broad_categories.csv`: allowed broad-category vocabulary.
13
13
  - `geography_reviewed_rules.csv`: reviewed special geography cases.
14
+ - `collection_date_reviewed_rules.csv`: reviewed date phrases that need explicit year recovery.
14
15
  - `country_mapping.json`: country, continent, and subcontinent mapping extracted from public FetchM.
15
16
 
16
17
  ## Output Fields
17
18
 
18
19
  FetchM2 writes the original input columns plus standardized columns including:
19
20
 
20
- - `Host_SD`, `Host_TaxID`, `Host_Rank`, host lineage fields, match method, confidence, and review status.
21
+ - `Host_SD`, `Host_TaxID`, `Host_Rank`, host lineage fields, `Host_Context_SD`, match method, confidence, and review status.
21
22
  - `Sample_Type_SD`, `Isolation_Source_SD`, `Isolation_Site_SD`.
22
23
  - `Environment_Medium_SD`, `Environment_Broad_Scale_SD`, `Environment_Local_Scale_SD`.
23
24
  - `Host_Disease_SD`, `Host_Health_State_SD`.
@@ -28,7 +29,19 @@ FetchM2 writes the original input columns plus standardized columns including:
28
29
  The audit gate fails on obvious category leakage:
29
30
 
30
31
  - non-country values in `Country`
32
+ - country-continent and country-subcontinent mismatches
33
+ - invalid or future `Collection_Year` values
31
34
  - host-only values in `Sample_Type_SD`
32
35
  - unapproved `Isolation_Source_SD_Broad` values
36
+ - missing required clean standardized columns
33
37
 
34
38
  Warnings are used for curation backlogs such as high host review counts.
39
+
40
+ Each `metadata`, `audit`, or `validate` run writes:
41
+
42
+ - `standardization_audit.md`
43
+ - `standardization_summary.csv`
44
+ - `production_readiness_gate.md`
45
+ - `production_readiness_gate.json`
46
+ - `top_host_review_needed.csv`
47
+ - issue-specific review CSVs for country, year, sample/source/host leakage, broad vocabulary, and sequence readiness.
@@ -1,7 +1,7 @@
1
1
  # FetchM2 Validation Report
2
2
 
3
3
  Validation date: 2026-05-05
4
- Current validation target: `fetchm2 0.1.1`
4
+ Current validation target: `fetchm2 0.1.2`
5
5
 
6
6
  ## Source Baselines
7
7
 
@@ -27,6 +27,7 @@ FetchM2 packages these deterministic rule resources:
27
27
  | `controlled_categories.csv` | 7,505 |
28
28
  | `approved_broad_categories.csv` | 50 |
29
29
  | `geography_reviewed_rules.csv` | 16 |
30
+ | `collection_date_reviewed_rules.csv` | 0 reviewed rows |
30
31
  | `country_mapping.json` | 202 countries/regions |
31
32
 
32
33
  ## Commands Validated
@@ -37,6 +38,7 @@ The following commands were validated in isolated environments:
37
38
  fetchm2 --version
38
39
  fetchm2 metadata --input examples/offline_metadata.tsv --outdir /tmp/fetchm2_smoke --offline
39
40
  fetchm2 audit --input /tmp/fetchm2_smoke/metadata_output/fetchm2_clean.csv --outdir /tmp/fetchm2_smoke_audit
41
+ fetchm2 validate --input /tmp/fetchm2_smoke/metadata_output/fetchm2_clean.csv --outdir /tmp/fetchm2_smoke_validation
40
42
  fetchm2 seq --input /tmp/fetchm2_smoke/metadata_output/fetchm2_clean.csv --outdir /tmp/fetchm2_smoke_seq --country Bangladesh --check-only
41
43
  ```
42
44
 
@@ -45,28 +47,30 @@ fetchm2 seq --input /tmp/fetchm2_smoke/metadata_output/fetchm2_clean.csv --outdi
45
47
  Regression tests:
46
48
 
47
49
  ```text
48
- 7 passed
50
+ 11 passed
49
51
  ```
50
52
 
51
- Package build for `0.1.1`:
53
+ Package build for `0.1.2`:
52
54
 
53
55
  ```text
54
56
  python -m build: passed
55
57
  twine check dist/*: passed
56
58
  ```
57
59
 
58
- Wheel installation for `0.1.1`:
60
+ Wheel installation for `0.1.2`:
59
61
 
60
62
  ```text
61
- pip install fetchm2-0.1.1-py3-none-any.whl: passed
62
- fetchm2 --version: fetchm2 0.1.1
63
+ pip install fetchm2-0.1.2-py3-none-any.whl: passed
64
+ fetchm2 --version: fetchm2 0.1.2
63
65
  ```
64
66
 
65
- Wheel smoke test for `0.1.1`:
67
+ Wheel smoke test for `0.1.2`:
66
68
 
67
69
  ```text
68
70
  metadata command: production gate PASS
69
71
  metadata analysis outputs: generated
72
+ validate command: production gate PASS
73
+ sequence check-only: selected 1 accession and completed without download
70
74
  ```
71
75
 
72
76
  Live NCBI smoke test:
@@ -159,7 +163,73 @@ FetchM2 `0.1.0` has been released on GitHub and PyPI.
159
163
 
160
164
  The `0.1.1` update has passed local tests, package build validation, clean wheel installation, offline packaged-test validation, 20-row live BioSample validation, and full `test.tsv` live BioSample validation.
161
165
 
162
- Known scope notes for `0.1.1`:
166
+ ## Additional 0.1.2 CLI Hardening Validation
167
+
168
+ The 0.1.2 development update adds:
169
+
170
+ - `fetchm2 validate`
171
+ - `production_readiness_gate.md`
172
+ - `production_readiness_gate.json`
173
+ - issue-specific validation CSVs for country, collection year, sample/source/host leakage, broad vocabulary, missing columns, and sequence readiness
174
+ - `Host_Context_SD`
175
+ - packaged `collection_date_reviewed_rules.csv` support
176
+ - final terminal summaries after `metadata`, `run`, `audit`, and `validate`
177
+ - `sequence_download_summary.csv` in `--check-only` mode
178
+
179
+ Local 0.1.2 validation:
180
+
181
+ ```text
182
+ pytest: 11 passed
183
+ examples/offline_metadata.tsv metadata run: production gate PASS
184
+ validate command: production gate PASS
185
+ sequence check-only: wrote failed_accessions.txt and sequence_download_summary.csv
186
+ ```
187
+
188
+ Live 20-row `test.tsv` validation under 0.1.2:
189
+
190
+ ```text
191
+ Rows scanned: 20
192
+ Metadata fetch status: 20 ok
193
+ Metadata fetch errors: 0
194
+ Production gate: PASS
195
+ Host TaxID mapped: 14 / 20 (70.0%)
196
+ Country present: 14 / 20 (70.0%)
197
+ Collection year present: 18 / 20 (90.0%)
198
+ Invalid host-like Sample_Type_SD rows: 0
199
+ Non-country values in Country rows: 0
200
+ Country-continent mismatch rows: 0
201
+ Country-subcontinent mismatch rows: 0
202
+ Invalid/future Collection_Year rows: 0
203
+ Unapproved broad-category rows: 0
204
+ Metadata analysis figures generated: 47
205
+ ```
206
+
207
+ Live full `test.tsv` validation under 0.1.2:
208
+
209
+ ```text
210
+ Rows scanned: 200
211
+ Production gate: PASS
212
+ Host TaxID mapped: 92 / 200 (46.0%)
213
+ Host review needed: 54
214
+ Country present: 178 / 200 (89.0%)
215
+ Collection year present: 196 / 200 (98.0%)
216
+ Sample_Type_SD present: 66 / 200 (33.0%)
217
+ Isolation_Source_SD present: 28 / 200 (14.0%)
218
+ Isolation_Site_SD present: 46 / 200 (23.0%)
219
+ Environment_Medium_SD present: 10 / 200 (5.0%)
220
+ Invalid host-like Sample_Type_SD rows: 0
221
+ Non-country values in Country rows: 0
222
+ Country-continent mismatch rows: 0
223
+ Country-subcontinent mismatch rows: 0
224
+ Invalid/future Collection_Year rows: 0
225
+ Unapproved Isolation_Source_SD_Broad rows: 0
226
+ Unapproved broad-category rows: 0
227
+ Source-like mapped host rows: 6
228
+ Source-like unmapped host rows: 8
229
+ Sequence-readiness issue rows: 0
230
+ ```
231
+
232
+ Known scope notes for `0.1.2`:
163
233
 
164
234
  - host lineage is bundled for common hosts and optionally enriched with `taxonkit` when installed
165
235
  - embeddings/BGE are intentionally not used in production standardization
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "fetchm2"
7
- version = "0.1.1"
7
+ version = "0.1.2"
8
8
  description = "Standalone comprehensive genome metadata standardization and sequence download toolkit."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -2,4 +2,4 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- __version__ = "0.1.1"
5
+ __version__ = "0.1.2"
@@ -55,6 +55,7 @@ CORE_STANDARDIZED_FIELDS = [
55
55
  "Host_Genus",
56
56
  "Host_Species",
57
57
  "Host_Common_Name",
58
+ "Host_Context_SD",
58
59
  "Host_Match_Method",
59
60
  "Host_Confidence",
60
61
  "Host_Review_Status",