fetchm2 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fetchm2/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """FetchM2 standalone package."""
2
+
3
+ from __future__ import annotations
4
+
5
+ __version__ = "0.1.0"
6
+
fetchm2/audit.py ADDED
@@ -0,0 +1,126 @@
1
+ from __future__ import annotations
2
+
3
+ from collections import Counter
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ from .standardization import load_rules, normalize_lookup
8
+ from .utils import write_csv, write_text
9
+
10
# Ocean and sea names that are accepted in the "Country" column even though
# they are not countries; the audit does not count them as non-country values.
ALLOWED_MARINE_REGIONS = {
    # Oceans
    "Arctic Ocean",
    "Atlantic Ocean",
    "Indian Ocean",
    "Pacific Ocean",
    "Southern Ocean",
    # Seas
    "Mediterranean Sea",
    "North Sea",
    "Baltic Sea",
}
20
+
21
+
22
def value_present(value: Any) -> bool:
    """Return True when *value* carries non-blank content.

    Falsy values (``None``, ``""``, ``0``, ``False``) and strings made up only
    of whitespace are reported as absent. Float NaN is reported as absent as
    well: it is truthy and stringifies to ``"nan"``, so without the explicit
    check a missing cell (e.g. the NaN pandas uses for empty CSV fields) would
    be counted as present.
    """
    # NaN is the only float value that compares unequal to itself.
    if isinstance(value, float) and value != value:
        return False
    return bool(str(value or "").strip())
24
+
25
+
26
# Sample_Type_SD values that name a host rather than a kind of sample.
_HOST_LIKE_SAMPLE_TYPES = frozenset(
    {"human", "patient", "animal", "poultry", "cattle", "pig", "plant", "bacteria"}
)


def _count_present(rows: list[dict[str, Any]], column: str) -> int:
    """Count the rows whose *column* passes ``value_present``."""
    return sum(1 for row in rows if value_present(row.get(column)))


def _percent(count: int, total: int) -> float:
    """Return *count* as a percentage of *total* (2 decimals), or 0 when *total* is 0."""
    return round((count / total) * 100, 2) if total else 0


def summarize_rows(rows: list[dict[str, Any]]) -> dict[str, Any]:
    """Compute coverage and validity counters for standardized metadata rows.

    Args:
        rows: One dict per record, keyed by column name. Missing keys are
            treated like empty values.

    Returns:
        A flat dict of counters and percentages. Key order is kept stable
        because the dict is also written out as a single CSV row.
    """
    total = len(rows)
    host_taxid = _count_present(rows, "Host_TaxID")
    host_review = sum(1 for row in rows if row.get("Host_Review_Status") == "review_needed")
    country = _count_present(rows, "Country")
    collection_year = _count_present(rows, "Collection_Year")

    invalid_sample_rows = sum(
        1
        for row in rows
        if normalize_lookup(row.get("Sample_Type_SD")) in _HOST_LIKE_SAMPLE_TYPES
    )

    # Load the rules once per call; the lookup tables do not depend on the row.
    rules = load_rules()
    country_mapping = rules.country_mapping
    approved_broad = rules.approved_broad.get("Isolation_Source_SD_Broad", set())

    # A Country value is flagged when it is present but is neither a key of
    # the country mapping nor one of the accepted marine regions.
    non_country_rows = 0
    for row in rows:
        country_value = row.get("Country")
        if not value_present(country_value):
            continue
        if country_value in country_mapping or country_value in ALLOWED_MARINE_REGIONS:
            continue
        non_country_rows += 1

    broad_values = Counter(
        str(row.get("Isolation_Source_SD_Broad") or "").strip()
        for row in rows
        if value_present(row.get("Isolation_Source_SD_Broad"))
    )
    unapproved_broad_rows = sum(
        count
        for value, count in broad_values.items()
        if value and value not in approved_broad
    )

    return {
        "rows": total,
        "host_taxid_mapped": host_taxid,
        "host_taxid_percent": _percent(host_taxid, total),
        "host_review_needed": host_review,
        "country_present": country,
        "country_percent": _percent(country, total),
        "collection_year_present": collection_year,
        "collection_year_percent": _percent(collection_year, total),
        "sample_type_present": _count_present(rows, "Sample_Type_SD"),
        "isolation_source_present": _count_present(rows, "Isolation_Source_SD"),
        "isolation_site_present": _count_present(rows, "Isolation_Site_SD"),
        "environment_medium_present": _count_present(rows, "Environment_Medium_SD"),
        "host_disease_present": _count_present(rows, "Host_Disease_SD"),
        "host_health_state_present": _count_present(rows, "Host_Health_State_SD"),
        "invalid_host_like_sample_type_rows": invalid_sample_rows,
        "non_country_values_in_country_rows": non_country_rows,
        "unapproved_isolation_source_broad_rows": unapproved_broad_rows,
        "unique_isolation_source_broad_values": len(broad_values),
    }
77
+
78
+
79
def write_audit_outputs(rows: list[dict[str, Any]], output_dir: Path) -> dict[str, Any]:
    """Summarize *rows* and write the audit artifacts into *output_dir*.

    Three files are produced: ``standardization_summary.csv`` (the summary as
    a single row), ``top_host_review_needed.csv`` (the 200 most frequent
    original host values among rows flagged ``review_needed``) and
    ``standardization_audit.md`` (a text report of selected counters).

    Returns:
        The summary dict produced by ``summarize_rows``.
    """
    summary = summarize_rows(rows)
    output_dir.mkdir(parents=True, exist_ok=True)
    write_csv(output_dir / "standardization_summary.csv", [summary])

    # Tally the original host strings of every row that still needs review.
    review_counts = Counter()
    for row in rows:
        if row.get("Host_Review_Status") != "review_needed":
            continue
        review_counts[str(row.get("Host_Original") or "").strip()] += 1
    review_records = [
        {"host_original": host, "count": occurrences}
        for host, occurrences in review_counts.most_common(200)
    ]
    write_csv(output_dir / "top_host_review_needed.csv", review_records)

    report_lines = [
        "# FetchM2 Metadata Standardization Audit",
        "",
        f"Rows scanned: {summary['rows']}",
        f"Host TaxID mapped: {summary['host_taxid_mapped']} ({summary['host_taxid_percent']}%)",
        f"Host review needed: {summary['host_review_needed']}",
        f"Country present: {summary['country_present']} ({summary['country_percent']}%)",
        f"Collection year present: {summary['collection_year_present']} ({summary['collection_year_percent']}%)",
        f"Sample_Type_SD present: {summary['sample_type_present']}",
        f"Isolation_Source_SD present: {summary['isolation_source_present']}",
        f"Isolation_Site_SD present: {summary['isolation_site_present']}",
        f"Environment_Medium_SD present: {summary['environment_medium_present']}",
        f"Invalid host-like Sample_Type_SD rows: {summary['invalid_host_like_sample_type_rows']}",
        f"Non-country values in Country rows: {summary['non_country_values_in_country_rows']}",
        f"Unapproved Isolation_Source_SD_Broad rows: {summary['unapproved_isolation_source_broad_rows']}",
    ]
    # Every report line, including the last one, ends with a newline.
    report_text = "".join(f"{line}\n" for line in report_lines)
    write_text(output_dir / "standardization_audit.md", report_text)
    return summary
112
+
113
+
114
def production_gate(
    summary: dict[str, Any],
    *,
    host_review_threshold: int = 1000,
) -> tuple[bool, list[str], list[str]]:
    """Decide whether an audit summary is fit for production use.

    Args:
        summary: Counters as produced by ``summarize_rows``. Missing or empty
            counters are treated as 0.
        host_review_threshold: A warning is emitted when
            ``host_review_needed`` is strictly greater than this value.

    Returns:
        ``(ready, hard_failures, warnings)``. ``ready`` is True only when no
        hard failure was recorded; warnings never affect it. Each message has
        the form ``"<counter>=<value>"``.
    """
    blocking_counters = (
        "invalid_host_like_sample_type_rows",
        "non_country_values_in_country_rows",
        "unapproved_isolation_source_broad_rows",
    )
    # Any non-zero blocking counter fails the gate.
    hard_failures: list[str] = [
        f"{key}={summary[key]}"
        for key in blocking_counters
        if int(summary.get(key) or 0) > 0
    ]
    warnings: list[str] = []
    if int(summary.get("host_review_needed") or 0) > host_review_threshold:
        warnings.append(f"host_review_needed={summary['host_review_needed']}")
    return not hard_failures, hard_failures, warnings
fetchm2/cli.py ADDED
@@ -0,0 +1,175 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import os
5
+ from pathlib import Path
6
+
7
+ from . import __version__
8
+ from .audit import production_gate, summarize_rows, write_audit_outputs
9
+ from .metadata import run_metadata
10
+ from .sequence import run_sequence_downloads
11
+
12
+
13
def add_filter_args(parser: argparse.ArgumentParser) -> None:
    """Register the row-filter options on *parser*.

    The list-valued filters accept one or more values each; the year bounds
    and the genome cap are parsed as integers. All of them default to None.
    """
    multi_value_options = (
        ("--host", "Filter sequence downloads by Host_SD."),
        ("--host-rank", "Filter sequence downloads by Host_Rank."),
        ("--country", "Filter sequence downloads by standardized Country."),
        ("--continent", "Filter sequence downloads by Continent."),
        ("--subcontinent", "Filter sequence downloads by Subcontinent."),
        ("--sample-type", "Filter by Sample_Type_SD."),
        ("--isolation-source", "Filter by Isolation_Source_SD."),
        ("--environment-medium", "Filter by Environment_Medium_SD."),
    )
    for flag, help_text in multi_value_options:
        parser.add_argument(flag, nargs="+", help=help_text)

    integer_options = (
        ("--year-from", "Minimum Collection_Year."),
        ("--year-to", "Maximum Collection_Year."),
        ("--max-genomes", "Maximum selected genomes for sequence download."),
    )
    for flag, help_text in integer_options:
        parser.add_argument(flag, type=int, help=help_text)
25
+
26
+
27
def _add_metadata_args(parser: argparse.ArgumentParser) -> None:
    """Register the input/output and BioSample-fetch options shared by ``metadata`` and ``run``."""
    parser.add_argument("--input", required=True, type=Path, help="NCBI Datasets TSV/CSV input.")
    parser.add_argument("--outdir", required=True, type=Path, help="Output directory.")
    parser.add_argument("--ani", nargs="+", default=["all"], help="ANI Check status filter.")
    parser.add_argument("--checkm", type=float, help="Minimum CheckM completeness.")
    # Environment variables are read when the parser is built and only supply defaults.
    parser.add_argument("--api-key", default=os.environ.get("NCBI_API_KEY"), help="NCBI API key or NCBI_API_KEY env var.")
    parser.add_argument("--email", default=os.environ.get("NCBI_EMAIL"), help="Optional NCBI contact email.")
    parser.add_argument("--workers", type=int, default=3, help="Metadata fetch workers.")
    parser.add_argument("--sleep", type=float, default=0.34, help="Delay before BioSample requests.")
    parser.add_argument("--offline", action="store_true", help="Do not fetch BioSample metadata; standardize existing columns only.")


def _add_download_args(parser: argparse.ArgumentParser) -> None:
    """Register the worker/retry options shared by ``run`` and ``seq``."""
    parser.add_argument("--download-workers", type=int, default=4, help="Sequence download workers.")
    parser.add_argument("--retries", type=int, default=3, help="Download retries.")
    parser.add_argument("--retry-delay", type=float, default=5.0, help="Download retry delay.")


def build_parser() -> argparse.ArgumentParser:
    """Build the ``fetchm2`` command-line parser.

    The parser has four required subcommands (``metadata``, ``run``, ``seq``
    and ``audit``). Each one stores its handler in the ``func`` default, so
    the caller dispatches with ``args.func(args)``.

    Returns:
        The fully configured top-level parser.
    """
    parser = argparse.ArgumentParser(
        prog="fetchm2",
        description="Comprehensive standalone metadata standardization and sequence download toolkit.",
    )
    parser.add_argument("--version", action="version", version=f"fetchm2 {__version__}")
    subparsers = parser.add_subparsers(dest="command", required=True)

    metadata = subparsers.add_parser("metadata", help="Fetch/standardize metadata and write audit outputs.")
    _add_metadata_args(metadata)
    metadata.set_defaults(func=run_metadata_command)

    # Options are registered in the same order as before so --help output is unchanged.
    run = subparsers.add_parser("run", help="Run metadata standardization and optionally download sequences.")
    _add_metadata_args(run)
    run.add_argument("--download", action="store_true", help="Download sequences after metadata standardization.")
    _add_download_args(run)
    run.add_argument("--keep-gz", action="store_true", help="Keep compressed FASTA files instead of decompressing.")
    add_filter_args(run)
    run.set_defaults(func=run_all_command)

    seq = subparsers.add_parser("seq", help="Download sequences from fetchm2_clean.csv.")
    seq.add_argument("--input", required=True, type=Path, help="Path to fetchm2_clean.csv.")
    seq.add_argument("--outdir", required=True, type=Path, help="Sequence output directory.")
    _add_download_args(seq)
    seq.add_argument("--check-only", action="store_true", help="Audit sequence directory without downloading.")
    seq.add_argument("--keep-gz", action="store_true", help="Keep compressed FASTA files instead of decompressing.")
    add_filter_args(seq)
    seq.set_defaults(func=run_seq_command)

    audit = subparsers.add_parser("audit", help="Audit an existing standardized CSV.")
    audit.add_argument("--input", required=True, type=Path, help="Path to fetchm2_clean.csv.")
    audit.add_argument("--outdir", required=True, type=Path, help="Audit output directory.")
    audit.set_defaults(func=run_audit_command)
    return parser
81
+
82
+
83
def filter_dict(args: argparse.Namespace) -> dict[str, object]:
    """Collect the row-filter options from *args* into a plain dict.

    The keys match the attribute names on the namespace; ``max_genomes`` is
    intentionally not part of the result.
    """
    filter_names = (
        "host",
        "host_rank",
        "country",
        "continent",
        "subcontinent",
        "sample_type",
        "isolation_source",
        "environment_medium",
        "year_from",
        "year_to",
    )
    return {name: getattr(args, name) for name in filter_names}
96
+
97
+
98
def run_metadata_command(args: argparse.Namespace) -> None:
    """Handle the ``metadata`` subcommand.

    Forwards the parsed options to ``run_metadata`` and prints the path of
    the clean metadata file followed by the production-gate verdict.
    """
    metadata_options = {
        "input_path": args.input,
        "outdir": args.outdir,
        "ani": args.ani,
        "checkm": args.checkm,
        "api_key": args.api_key,
        "email": args.email,
        "workers": args.workers,
        "sleep": args.sleep,
        "offline": args.offline,
    }
    result = run_metadata(**metadata_options)
    print(f"Wrote clean metadata: {result['clean_path']}")
    verdict = "PASS" if result["production_ready"] else "FAIL"
    print(f"Production gate: {verdict}")
112
+
113
+
114
def run_all_command(args: argparse.Namespace) -> None:
    """Handle the ``run`` subcommand.

    Always runs metadata standardization. When ``--download`` was given, the
    resulting clean file is then used as the input for sequence downloads,
    which are written to the ``sequence`` subdirectory of the output directory.
    """
    metadata_options = {
        "input_path": args.input,
        "outdir": args.outdir,
        "ani": args.ani,
        "checkm": args.checkm,
        "api_key": args.api_key,
        "email": args.email,
        "workers": args.workers,
        "sleep": args.sleep,
        "offline": args.offline,
    }
    result = run_metadata(**metadata_options)
    print(f"Wrote clean metadata: {result['clean_path']}")

    if not args.download:
        return

    sequence_dir = args.outdir / "sequence"
    summary = run_sequence_downloads(
        input_path=Path(result["clean_path"]),
        outdir=sequence_dir,
        filters=filter_dict(args),
        retries=args.retries,
        retry_delay=args.retry_delay,
        workers=args.download_workers,
        max_genomes=args.max_genomes,
        keep_gz=args.keep_gz,
    )
    print(f"Sequence summary: {summary}")
139
+
140
+
141
def run_seq_command(args: argparse.Namespace) -> None:
    """Handle the ``seq`` subcommand.

    Passes the parsed options and row filters to ``run_sequence_downloads``
    and prints the summary it returns.
    """
    download_options = {
        "input_path": args.input,
        "outdir": args.outdir,
        "filters": filter_dict(args),
        "retries": args.retries,
        "retry_delay": args.retry_delay,
        "workers": args.download_workers,
        "check_only": args.check_only,
        "max_genomes": args.max_genomes,
        "keep_gz": args.keep_gz,
    }
    summary = run_sequence_downloads(**download_options)
    print(f"Sequence summary: {summary}")
154
+
155
+
156
def run_audit_command(args: argparse.Namespace) -> None:
    """Handle the ``audit`` subcommand.

    Reads the CSV at ``args.input``, writes the audit outputs to
    ``args.outdir`` and prints the production-gate verdict, followed by any
    hard failures and warnings.
    """
    # Imported lazily so the other subcommands do not pay for loading pandas here.
    import pandas as pd

    frame = pd.read_csv(args.input)
    # Missing cells become empty strings before the rows are audited.
    records = frame.fillna("").to_dict(orient="records")
    summary = write_audit_outputs(records, args.outdir)
    ready, failures, warnings = production_gate(summary)

    verdict = "PASS" if ready else "FAIL"
    print(f"Production gate: {verdict}")
    for label, messages in (("Hard failures", failures), ("Warnings", warnings)):
        if messages:
            print(f"{label}: {messages}")
167
+
168
+
169
def main() -> None:
    """Parse the command line and dispatch to the selected subcommand handler."""
    namespace = build_parser().parse_args()
    # Each subparser stores its handler under ``func`` via ``set_defaults``.
    handler = namespace.func
    handler(namespace)


if __name__ == "__main__":
    main()
@@ -0,0 +1,2 @@
1
+ """Packaged FetchM2 standardization data."""
2
+
@@ -0,0 +1,51 @@
1
+ field,approved_value,description,examples
2
+ Isolation_Source_SD_Broad,clinical/host-associated material,Clinical or host-associated biological material,blood; sputum; tissue
3
+ Isolation_Source_SD_Broad,host-associated context,Host-associated context where a specific specimen/source is not available,whole organism; wildlife
4
+ Isolation_Source_SD_Broad,feces/stool,Fecal or stool material,feces; stool; faeces
5
+ Isolation_Source_SD_Broad,food,Food or food product,ready-to-eat food; fermented food
6
+ Isolation_Source_SD_Broad,food/meat,Meat or animal-origin food product,chicken meat; beef; pork
7
+ Isolation_Source_SD_Broad,food/dairy,Dairy food or dairy product,milk; cheese; dairy product
8
+ Isolation_Source_SD_Broad,food/produce,Produce or plant-origin food,spinach; papaya; vegetable
9
+ Isolation_Source_SD_Broad,food/plant product,Processed plant-origin food,peanut butter
10
+ Isolation_Source_SD_Broad,food/processing environment,Food-processing or food-contact environment,food-contact surface
11
+ Isolation_Source_SD_Broad,water,Aquatic or water-associated source,river water; lake water; seawater
12
+ Isolation_Source_SD_Broad,wastewater/sewage,Wastewater sewage or sludge source,wastewater; sewage; activated sludge
13
+ Isolation_Source_SD_Broad,soil,Soil-associated material,soil; rhizosphere soil
14
+ Isolation_Source_SD_Broad,sediment,Sediment-associated material,marine sediment; pond sediment
15
+ Isolation_Source_SD_Broad,environmental material,Generic environmental material,environmental sample
16
+ Isolation_Source_SD_Broad,environmental/geologic material,Geologic or extreme environmental material,rock; hydrothermal vent
17
+ Isolation_Source_SD_Broad,healthcare-associated environment,Healthcare or hospital environment,hospital surface; ICU environment
18
+ Isolation_Source_SD_Broad,agricultural environment,Agricultural or farm environment,farm; dairy farm
19
+ Isolation_Source_SD_Broad,agricultural fecal material,Agricultural fecal material,manure
20
+ Isolation_Source_SD_Broad,animal-associated environment,Animal production or animal-associated environment,poultry house; animal facility
21
+ Isolation_Source_SD_Broad,plant-associated material,Plant-associated non-food material,root; rhizosphere; leaves
22
+ Isolation_Source_SD_Broad,culture,Culture material or culture condition,pure culture; mixed culture
23
+ Isolation_Source_SD_Broad,culture/assembly,Assembly or metagenomic culture descriptor,metagenomic assembly
24
+ Isolation_Source_SD_Broad,culture/isolate,Microbial isolate or culture isolate,microbial isolate
25
+ Isolation_Source_SD_Broad,culture medium,Culture medium,blood agar culture medium
26
+ Isolation_Source_SD_Broad,laboratory environment,Laboratory source or context,laboratory
27
+ Isolation_Source_SD_Broad,built environment,Built environment or built surface,sink; drain; cleanroom floor
28
+ Isolation_Source_SD_Broad,surface sample,Surface sample or surface context,surface
29
+ Isolation_Source_SD_Broad,biofilm,Biofilm source,biofilm; wall biofilm
30
+ Isolation_Source_SD_Broad,respiratory sample,Respiratory sample category,tracheal aspirate; bronchoalveolar lavage
31
+ Isolation_Source_SD_Broad,upper respiratory tract,Upper respiratory tract site,nasopharynx/oropharynx
32
+ Isolation_Source_SD_Broad,upper respiratory site,Upper respiratory site,nasal site
33
+ Isolation_Source_SD_Broad,oral cavity,Oral cavity site,dental plaque; oral cavity
34
+ Isolation_Source_SD_Broad,urogenital site,Urogenital site,urethra; cervix
35
+ Isolation_Source_SD_Broad,gastrointestinal site,Gastrointestinal site,colon; ileum
36
+ Isolation_Source_SD_Broad,gut content,Gut or intestinal content,intestine; stomach; rumen
37
+ Isolation_Source_SD_Broad,tissue,Tissue specimen,FFPE tissue
38
+ Isolation_Source_SD_Broad,swab,Swab specimen category,rectal swab; nasal swab
39
+ Isolation_Source_SD_Broad,clinical fluid/material,Clinical fluid or material,pus; abscess; pleural fluid
40
+ Isolation_Source_SD_Broad,medical device,Medical device or catheter context,catheter
41
+ Isolation_Source_SD_Broad,aquatic food product,Aquatic food product,seafood; fish product
42
+ Isolation_Source_SD_Broad,biological/clinical product,Biological or clinical product,biological product
43
+ Isolation_Source_SD_Broad,molecular extract,Molecular extract,DNA extract
44
+ Isolation_Source_SD_Broad,single cell,Single-cell source,single cell
45
+ Isolation_Source_SD_Broad,sample,Generic sample label,sample
46
+ Isolation_Source_SD_Broad,cloacal sample,Cloacal specimen,cloacal sample
47
+ Isolation_Source_SD_Broad,fermented food,Fermented food,kimchi; fermented food
48
+ Isolation_Source_SD_Broad,surface/sample collection material,Collection material or surface sampling tool,sponge
49
+ Isolation_Source_SD_Broad,gut/host-associated material,Gut-associated host material,intestinal epithelial cells
50
+ Isolation_Source_SD_Broad,wastewater/organic waste,Wastewater or organic waste digestion source,anaerobic digester
51
+ Isolation_Source_SD_Broad,metadata descriptor / non-source,Metadata descriptor rather than biological source,metagenome