fetchm2 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fetchm2/metadata.py ADDED
@@ -0,0 +1,244 @@
1
+ from __future__ import annotations
2
+
3
+ import sqlite3
4
+ import threading
5
+ import time
6
+ from concurrent.futures import ThreadPoolExecutor, as_completed
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ import pandas as pd
11
+ import requests
12
+ import xmltodict
13
+ from tqdm import tqdm
14
+
15
+ from .audit import production_gate, write_audit_outputs
16
+ from .standardization import standardize_rows
17
+
18
+ NCBI_EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
19
+ NCBI_TIMEOUT = 60
20
+
21
+
22
def read_table(path: Path) -> pd.DataFrame:
    """Load a delimited text table into a DataFrame.

    A ``.csv`` suffix (compared case-insensitively) is read as
    comma-separated; any other suffix is read as tab-separated.
    """
    is_csv = path.suffix.lower() == ".csv"
    if not is_csv:
        return pd.read_csv(path, sep="\t")
    return pd.read_csv(path)
26
+
27
+
28
def filter_quality(df: pd.DataFrame, ani: list[str] | None, checkm: float | None) -> pd.DataFrame:
    """Return a filtered copy of *df* based on ANI status and CheckM completeness.

    Args:
        df: Input table; it is copied, never modified.
        ani: Accepted values for the "ANI Check status" column. The ANI
            filter is skipped when this is empty/None, when any entry
            equals "all" (case-insensitively), or when the column is absent.
        checkm: Minimum "CheckM completeness". Values that cannot be parsed
            as numbers become NaN and are dropped. Skipped when None or
            when the column is absent.
    """
    result = df.copy()

    lowered = [status.lower() for status in ani] if ani else []
    if ani and "all" not in lowered and "ANI Check status" in result.columns:
        ani_status = result["ANI Check status"].astype(str)
        result = result[ani_status.isin(ani)]

    if checkm is not None and "CheckM completeness" in result.columns:
        completeness = pd.to_numeric(result["CheckM completeness"], errors="coerce")
        result = result[completeness >= checkm]

    return result
35
+
36
+
37
class MetadataCache:
    """SQLite-backed store of raw BioSample payloads keyed by accession.

    A single connection is opened with ``check_same_thread=False`` and every
    statement is executed while holding ``self.lock``.
    """

    def __init__(self, path: Path) -> None:
        """Open (or create) the database at *path* and ensure the table exists."""
        self.path = path
        self.lock = threading.Lock()
        self.conn = sqlite3.connect(path, check_same_thread=False)
        schema = """
            CREATE TABLE IF NOT EXISTS biosample_cache (
                biosample TEXT PRIMARY KEY,
                payload TEXT,
                fetched_at REAL
            )
            """
        self.conn.execute(schema)
        self.conn.commit()

    def get(self, biosample: str) -> str | None:
        """Return the stored payload for *biosample*, or None when absent."""
        query = "SELECT payload FROM biosample_cache WHERE biosample = ?"
        with self.lock:
            hit = self.conn.execute(query, (biosample,)).fetchone()
        if hit is None:
            return None
        return str(hit[0])

    def set(self, biosample: str, payload: str) -> None:
        """Insert or overwrite the payload for *biosample*, stamped with the current time."""
        statement = "INSERT OR REPLACE INTO biosample_cache (biosample, payload, fetched_at) VALUES (?, ?, ?)"
        with self.lock:
            self.conn.execute(statement, (biosample, payload, time.time()))
            self.conn.commit()

    def close(self) -> None:
        """Close the underlying connection."""
        with self.lock:
            self.conn.close()
69
+
70
+
71
def biosample_accession(row: dict[str, Any]) -> str:
    """Return the first non-blank BioSample accession found in *row*.

    The columns "Assembly BioSample Accession", "BioSample Accession" and
    "BioSample" are tried in that order; values are stringified and
    stripped. An empty string is returned when none of them has content.
    """
    columns = ("Assembly BioSample Accession", "BioSample Accession", "BioSample")
    # Lazy generator: later columns are only read if earlier ones are blank.
    candidates = (str(row.get(column) or "").strip() for column in columns)
    return next((text for text in candidates if text), "")
77
+
78
+
79
def parse_biosample_xml(xml_text: str) -> dict[str, str]:
    """Extract BioSample attributes from an efetch XML document.

    Only the first ``BioSample`` element of the ``BioSampleSet`` is read.
    Each ``Attribute`` with a name and a non-blank text value becomes one
    entry. Known names (matched case-insensitively, with ``-`` and spaces
    treated as ``_``) are renamed to canonical column titles; the
    ``attribute_name`` is tried first and the ``harmonized_name`` second, so
    a free-form submitter label still maps when a harmonized name is given.
    Unknown names are kept under the attribute's own name.

    Args:
        xml_text: Raw XML text; blank input yields an empty dict.

    Returns:
        Mapping of column title to attribute value. Later attributes
        overwrite earlier ones that resolve to the same title.

    Raises:
        Whatever ``xmltodict.parse`` raises for malformed XML.
    """
    if not xml_text.strip():
        return {}
    parsed = xmltodict.parse(xml_text)

    # xmltodict represents an empty element as None, so a ``.get(key, {})``
    # default is not enough: normalise None to {} before descending.
    sample_set = parsed.get("BioSampleSet") or {}
    if not isinstance(sample_set, dict):
        return {}
    sample = sample_set.get("BioSample")
    if isinstance(sample, list):
        sample = sample[0] if sample else {}
    if not isinstance(sample, dict):
        return {}
    attribute_block = sample.get("Attributes") or {}
    if not isinstance(attribute_block, dict):
        return {}
    attributes = attribute_block.get("Attribute") or []
    if not isinstance(attributes, list):
        # A single <Attribute> is returned as a bare dict (or str), not a list.
        attributes = [attributes]

    key_map = {
        "isolation_source": "Isolation Source",
        "collection_date": "Collection Date",
        "geo_loc_name": "Geographic Location",
        "host": "Host",
        "sample_type": "Sample Type",
        "env_medium": "Environment Medium",
        "env_broad_scale": "Environment Broad Scale",
        "env_local_scale": "Environment Local Scale",
        "disease": "Host Disease",
        "host_disease": "Host Disease",
        "host_health_state": "Host Health State",
    }
    output: dict[str, str] = {}
    for attr in attributes:
        if not isinstance(attr, dict):
            # Text-only element: there is no name to file the value under.
            continue
        attribute_name = str(attr.get("@attribute_name") or "").strip()
        harmonized_name = str(attr.get("@harmonized_name") or "").strip()
        name = attribute_name or harmonized_name
        value = str(attr.get("#text") or "").strip()
        if not name or not value:
            continue
        key = name
        for candidate in (attribute_name, harmonized_name):
            normalized = candidate.lower().replace("-", "_").replace(" ", "_")
            canonical = key_map.get(normalized)
            if canonical is not None:
                key = canonical
                break
        output[key] = value
    return output
113
+
114
+
115
def fetch_biosample_metadata(
    biosample: str,
    *,
    api_key: str | None,
    email: str | None,
    sleep: float,
    cache: MetadataCache,
) -> dict[str, str]:
    """Return parsed BioSample attributes, using *cache* before the network.

    Args:
        biosample: Accession sent as the efetch ``id``.
        api_key: Optional NCBI API key, sent only when truthy.
        email: Optional contact e-mail, sent only when truthy.
        sleep: Seconds to pause before the HTTP request (skipped when <= 0).
        cache: Store of raw XML payloads keyed by accession.

    Returns:
        The attribute mapping produced by ``parse_biosample_xml``.

    Raises:
        requests.HTTPError: When the response status indicates failure.
        Any error raised by the request itself or by XML parsing.
    """
    cached = cache.get(biosample)
    if cached is not None:
        return parse_biosample_xml(cached)

    params = {
        "db": "biosample",
        "id": biosample,
        "retmode": "xml",
    }
    if api_key:
        params["api_key"] = api_key
    if email:
        params["email"] = email
    if sleep > 0:
        time.sleep(sleep)
    response = requests.get(NCBI_EFETCH_URL, params=params, timeout=NCBI_TIMEOUT)
    response.raise_for_status()

    # Parse before caching: a payload that cannot be parsed must not be
    # stored, otherwise every later call would fail on the cached copy and
    # never retry the download.
    metadata = parse_biosample_xml(response.text)
    cache.set(biosample, response.text)
    return metadata
141
+
142
+
143
def enrich_rows_with_biosample(
    rows: list[dict[str, Any]],
    *,
    cache_path: Path,
    api_key: str | None,
    email: str | None,
    workers: int,
    sleep: float,
    offline: bool,
) -> list[dict[str, Any]]:
    """Fill blank columns of each row with fetched BioSample attributes.

    When *offline* is true the input list is returned untouched. Otherwise
    each distinct accession is fetched once on a thread pool of at least one
    worker, and copies of the rows are returned with fetched values written
    only into columns that are missing or blank. A failed fetch contributes
    a single "Metadata Fetch Error" entry holding the exception text.
    """
    if offline:
        return rows

    cache = MetadataCache(cache_path)
    try:
        unique_accessions = set()
        for row in rows:
            accession = biosample_accession(row)
            if accession:
                unique_accessions.add(accession)

        fetched: dict[str, dict[str, str]] = {}
        pool_size = max(1, workers)
        with ThreadPoolExecutor(max_workers=pool_size) as executor:
            pending = {}
            for accession in sorted(unique_accessions):
                job = executor.submit(
                    fetch_biosample_metadata,
                    accession,
                    api_key=api_key,
                    email=email,
                    sleep=sleep,
                    cache=cache,
                )
                pending[job] = accession
            progress = tqdm(as_completed(pending), total=len(pending), desc="Fetching BioSample metadata")
            for job in progress:
                accession = pending[job]
                try:
                    fetched[accession] = job.result()
                except Exception as exc:
                    # Record the failure on the row instead of aborting the run.
                    fetched[accession] = {"Metadata Fetch Error": str(exc)}

        enriched = []
        for row in rows:
            merged = dict(row)
            extra = fetched.get(biosample_accession(row), {})
            for column, value in extra.items():
                existing = str(merged.get(column) or "").strip()
                if existing:
                    continue  # values already present in the input win
                merged[column] = value
            enriched.append(merged)
        return enriched
    finally:
        cache.close()
187
+
188
+
189
def run_metadata(
    *,
    input_path: Path,
    outdir: Path,
    ani: list[str] | None = None,
    checkm: float | None = None,
    api_key: str | None = None,
    email: str | None = None,
    workers: int = 3,
    sleep: float = 0.34,
    offline: bool = False,
) -> dict[str, Any]:
    """Run the metadata pipeline: read, filter, enrich, standardize, audit, report.

    Writes under ``outdir/metadata_output``: the BioSample cache database,
    ``fetchm2_clean.csv``, ``fetchm2_clean.tsv`` and ``fetchm2_report.md``.
    ``outdir/audit`` is passed to ``write_audit_outputs``.

    Returns:
        Dict with keys "clean_path", "summary", "production_ready",
        "hard_failures" and "warnings".
    """
    outdir.mkdir(parents=True, exist_ok=True)
    metadata_dir = outdir / "metadata_output"
    audit_dir = outdir / "audit"
    metadata_dir.mkdir(parents=True, exist_ok=True)

    table = filter_quality(read_table(input_path), ani, checkm)
    records = enrich_rows_with_biosample(
        table.fillna("").to_dict(orient="records"),
        cache_path=metadata_dir / "fetchm2_biosample_cache.sqlite3",
        api_key=api_key,
        email=email,
        workers=workers,
        sleep=sleep,
        offline=offline,
    )
    standardized = standardize_rows(records)

    # The same cleaned table is written twice: comma- and tab-separated.
    clean_df = pd.DataFrame(standardized)
    clean_path = metadata_dir / "fetchm2_clean.csv"
    clean_df.to_csv(clean_path, index=False)
    clean_df.to_csv(metadata_dir / "fetchm2_clean.tsv", sep="\t", index=False)

    summary = write_audit_outputs(standardized, audit_dir)
    production_ready, hard_failures, warnings = production_gate(summary)

    gate_label = "PASS" if production_ready else "FAIL"
    report_lines = [
        "# FetchM2 Run Report",
        "",
        f"Input: {input_path}",
        f"Rows processed: {summary['rows']}",
        f"Clean table: {clean_path}",
        f"Production gate: {gate_label}",
    ]
    if hard_failures:
        report_lines.append(f"Hard failures: {', '.join(hard_failures)}")
    if warnings:
        report_lines.append(f"Warnings: {', '.join(warnings)}")
    report_text = "\n".join(report_lines) + "\n"
    (metadata_dir / "fetchm2_report.md").write_text(report_text, encoding="utf-8")

    return {
        "clean_path": str(clean_path),
        "summary": summary,
        "production_ready": production_ready,
        "hard_failures": hard_failures,
        "warnings": warnings,
    }
fetchm2/sequence.py ADDED
@@ -0,0 +1,194 @@
1
+ from __future__ import annotations
2
+
3
+ import gzip
4
+ import re
5
+ import shutil
6
+ import sqlite3
7
+ from concurrent.futures import ThreadPoolExecutor, as_completed
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ import pandas as pd
12
+ import requests
13
+ from tqdm import tqdm
14
+
15
+ BASE_URL = "https://ftp.ncbi.nlm.nih.gov/genomes/all"
16
+
17
+
18
def normalize_text(value: Any) -> str:
    """Return *value* as stripped, lower-cased text; any falsy value becomes ""."""
    text = str(value) if value else ""
    return text.strip().lower()
20
+
21
+
22
def normalize_assembly_name(name: str) -> str:
    """Return *name* stripped with each space turned into "_", or "NA" when blank."""
    trimmed = str(name or "").strip()
    if not trimmed:
        return "NA"
    # Splitting on a single space and re-joining maps every space to one "_".
    return "_".join(trimmed.split(" "))
25
+
26
+
27
def build_parent_url(accession: str) -> str:
    """Return the directory URL under ``BASE_URL`` that holds *accession*.

    The accession is split at its first "_" into a prefix and a numeric
    part; any ".version" suffix is dropped and the digits are cut into
    three 3-character path segments, with anything beyond nine characters
    as a fourth. Empty segments are omitted, so the result never ends in
    "/" and never contains "//" after the scheme.

    Args:
        accession: Assembly accession such as ``GCF_000001405.40``.

    Raises:
        ValueError: If the accession has no "_" separator or no characters
            after it.
    """
    prefix, separator, remainder = accession.partition("_")
    core = remainder.split(".", 1)[0]
    if not (prefix and separator and core):
        raise ValueError(f"Malformed assembly accession: {accession!r}")
    # A nine-digit accession leaves core[9:] empty; dropping empty segments
    # avoids a trailing slash that callers would turn into "//" when joining.
    segments = [core[:3], core[3:6], core[6:9], core[9:]]
    return "/".join([BASE_URL, prefix, *(segment for segment in segments if segment)])
31
+
32
+
33
class DirectoryCache:
    """SQLite-backed cache of resolved remote assembly directory names.

    The cache is handed to thread-pool workers, so the connection is opened
    with ``check_same_thread=False`` and every statement runs under
    ``self.lock``. With sqlite3's default same-thread check, any call from a
    worker thread raises ``sqlite3.ProgrammingError``.
    """

    def __init__(self, path: Path) -> None:
        """Open (or create) the database at *path* and ensure the table exists."""
        # Local import: this module's import block does not include threading.
        import threading

        self.lock = threading.Lock()
        self.conn = sqlite3.connect(path, check_same_thread=False)
        with self.lock:
            self.conn.execute(
                "CREATE TABLE IF NOT EXISTS assembly_directory_cache (accession TEXT PRIMARY KEY, assembly_name TEXT, directory TEXT)"
            )
            self.conn.commit()

    def get(self, accession: str, name: str) -> str | None:
        """Return the cached directory for *accession* and *name*, or None.

        *name* is normalized with ``normalize_assembly_name`` before lookup.
        """
        normalized = normalize_assembly_name(name)
        with self.lock:
            row = self.conn.execute(
                "SELECT directory FROM assembly_directory_cache WHERE accession = ? AND assembly_name = ?",
                (accession, normalized),
            ).fetchone()
        return None if row is None else str(row[0])

    def set(self, accession: str, name: str, directory: str) -> None:
        """Store *directory* for *accession*, replacing any previous entry."""
        normalized = normalize_assembly_name(name)
        with self.lock:
            self.conn.execute(
                "INSERT OR REPLACE INTO assembly_directory_cache (accession, assembly_name, directory) VALUES (?, ?, ?)",
                (accession, normalized, directory),
            )
            self.conn.commit()

    def close(self) -> None:
        """Close the underlying connection."""
        with self.lock:
            self.conn.close()
57
+
58
+
59
def resolve_assembly_directory(accession: str, name: str, cache: DirectoryCache) -> str:
    """Return the remote directory name that holds *accession*.

    Resolution order: the cache; then direct probes of
    ``{accession}_{normalized name}`` and ``{accession}_NA``; finally the
    first entry of the parent directory listing whose link starts with
    ``{accession}_``. Any name found is written to the cache.

    Raises:
        FileNotFoundError: When the listing has no matching directory.
        requests.HTTPError: When the listing request fails.
    """
    cached = cache.get(accession, name)
    if cached:
        return cached

    # Strip a trailing slash so joins below never produce "//".
    parent_url = build_parent_url(accession).rstrip("/")
    normalized_name = normalize_assembly_name(name)
    # dict.fromkeys de-duplicates while keeping order: when the name already
    # normalizes to "NA" the same URL would otherwise be probed twice.
    candidates = list(dict.fromkeys([f"{accession}_{normalized_name}", f"{accession}_NA"]))

    # Context managers release the session and each probe response.
    with requests.Session() as session:
        for candidate in candidates:
            with session.get(f"{parent_url}/{candidate}", timeout=30) as probe:
                found = probe.ok
            if found:
                cache.set(accession, name, candidate)
                return candidate
        with session.get(f"{parent_url}/", timeout=60) as response:
            response.raise_for_status()
            listing = response.text

    wanted_prefix = f"{accession}_"
    matches = [
        item.rstrip("/")
        for item in re.findall(r'href="([^"]+/)"', listing)
        if item.startswith(wanted_prefix)
    ]
    if not matches:
        raise FileNotFoundError(f"No remote assembly directory found for {accession}")
    cache.set(accession, name, matches[0])
    return matches[0]
78
+
79
+
80
def row_matches_filters(row: dict[str, Any], filters: dict[str, Any]) -> bool:
    """Tell whether *row* satisfies every active entry in *filters*.

    Categorical filters compare the row's column with the accepted values
    after ``normalize_text``; a filter whose value list is empty or missing
    is ignored. When "year_from" or "year_to" is set, the first four
    characters of "Collection_Year" (or, failing that, "Collection Date")
    must parse as an integer inside the inclusive range.
    """
    column_filters = (
        ("Country", "country"),
        ("Continent", "continent"),
        ("Subcontinent", "subcontinent"),
        ("Host_SD", "host"),
        ("Host_Rank", "host_rank"),
        ("Sample_Type_SD", "sample_type"),
        ("Isolation_Source_SD", "isolation_source"),
        ("Environment_Medium_SD", "environment_medium"),
    )
    for column, filter_key in column_filters:
        allowed = filters.get(filter_key)
        if not allowed:
            continue
        accepted = {normalize_text(item) for item in allowed}
        if normalize_text(row.get(column)) not in accepted:
            return False

    earliest = filters.get("year_from")
    latest = filters.get("year_to")
    if earliest is None and latest is None:
        return True

    raw_year = str(row.get("Collection_Year") or row.get("Collection Date") or "")
    try:
        year = int(raw_year[:4])
    except ValueError:
        # A year filter is active but the row has no usable year.
        return False
    if earliest is not None and year < earliest:
        return False
    return not (latest is not None and year > latest)
105
+
106
+
107
def select_rows(input_path: Path, filters: dict[str, Any], max_genomes: int | None) -> list[dict[str, Any]]:
    """Read the CSV at *input_path* and return the rows that pass *filters*.

    Missing cells are replaced by "" before filtering. When *max_genomes*
    is not None the result is cut with ``[:max_genomes]``.
    """
    records = pd.read_csv(input_path).fillna("").to_dict(orient="records")
    selected = []
    for record in records:
        if row_matches_filters(record, filters):
            selected.append(record)
    if max_genomes is None:
        return selected
    return selected[:max_genomes]
113
+
114
+
115
def download_one(row: dict[str, Any], outdir: Path, cache: DirectoryCache, retries: int, retry_delay: float, keep_gz: bool) -> tuple[str, str]:
    """Download (and optionally decompress) the genomic FASTA for one row.

    Data is streamed into an ``incomplete-`` prefixed file and renamed to
    its final name only once complete, so an interrupted attempt can never
    leave a truncated file that a later attempt or run reports as "exists".

    Args:
        row: Record with "Assembly Accession" and "Assembly Name".
        outdir: Directory that receives the ``.fna`` or ``.fna.gz`` file.
        cache: Directory-name cache passed to ``resolve_assembly_directory``.
        retries: Maximum number of attempts; zero means no attempt is made.
        retry_delay: Base pause in seconds, multiplied by the attempt number.
        keep_gz: Keep the compressed file instead of decompressing it.

    Returns:
        ``(accession, status)`` where status is "missing accession",
        "exists", "downloaded", "failed: <error>" or "failed".
    """
    # Kept function-local: this module's import block does not include time.
    import time

    accession = str(row.get("Assembly Accession") or "").strip()
    name = str(row.get("Assembly Name") or "").strip()
    if not accession:
        return "", "missing accession"

    for attempt in range(1, retries + 1):
        partial_paths: list[Path] = []
        try:
            directory = resolve_assembly_directory(accession, name, cache)
            gz_name = f"{directory}_genomic.fna.gz"
            fna_name = f"{directory}_genomic.fna"
            gz_path = outdir / gz_name
            fna_path = outdir / fna_name
            if fna_path.exists() or gz_path.exists():
                return accession, "exists"

            # rstrip avoids "//" if the parent URL ends with a slash.
            url = f"{build_parent_url(accession).rstrip('/')}/{directory}/{gz_name}"
            # The prefix keeps a leftover partial file from parsing as this
            # accession when file names are split on "_" elsewhere.
            gz_partial = outdir / f"incomplete-{gz_name}"
            partial_paths.append(gz_partial)
            with requests.get(url, stream=True, timeout=300) as response:
                response.raise_for_status()
                with gz_partial.open("wb") as handle:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            handle.write(chunk)

            if keep_gz:
                gz_partial.replace(gz_path)
            else:
                fna_partial = outdir / f"incomplete-{fna_name}"
                partial_paths.append(fna_partial)
                with gzip.open(gz_partial, "rb") as source, fna_partial.open("wb") as target:
                    shutil.copyfileobj(source, target)
                fna_partial.replace(fna_path)
                gz_partial.unlink()
            return accession, "downloaded"
        except Exception as exc:
            # Best-effort cleanup so the next attempt starts from scratch;
            # a partial file that is already gone is not an error.
            for leftover in partial_paths:
                try:
                    leftover.unlink()
                except OSError:
                    pass
            if attempt >= retries:
                return accession, f"failed: {exc}"
            time.sleep(retry_delay * attempt)
    return accession, "failed"
148
+
149
+
150
def run_sequence_downloads(
    *,
    input_path: Path,
    outdir: Path,
    filters: dict[str, Any] | None = None,
    retries: int = 3,
    retry_delay: float = 5.0,
    workers: int = 4,
    check_only: bool = False,
    max_genomes: int | None = None,
    keep_gz: bool = False,
) -> dict[str, Any]:
    """Download FASTA files for the selected rows, or only report missing ones.

    In *check_only* mode nothing is downloaded: accessions are recovered
    from the ``*_genomic.fna*`` file names in *outdir* and the expected
    accessions without a file are written to ``failed_accessions.txt``.
    Otherwise every selected row is passed to ``download_one`` on a thread
    pool, and ``failed_accessions.txt`` plus
    ``sequence_download_summary.csv`` are written.

    Returns:
        Count dict. Check-only mode: "selected", "missing", "downloaded",
        "failed". Download mode: "selected", "downloaded", "existing",
        "failed".
    """
    outdir.mkdir(parents=True, exist_ok=True)
    active_filters = filters or {}
    rows = select_rows(input_path, active_filters, max_genomes)
    expected = [str(row.get("Assembly Accession") or "").strip() for row in rows]
    failed_file = outdir / "failed_accessions.txt"

    if check_only:
        present = set()
        for path in outdir.glob("*_genomic.fna*"):
            pieces = path.name.split("_", 2)
            # The first two "_"-separated pieces form the accession.
            present.add(pieces[0] + "_" + pieces[1])
        missing = [accession for accession in expected if accession not in present]
        failed_file.write_text("\n".join(missing) + ("\n" if missing else ""), encoding="utf-8")
        return {"selected": len(rows), "missing": len(missing), "downloaded": 0, "failed": len(missing)}

    cache = DirectoryCache(outdir / "fetchm2_sequence_cache.sqlite3")
    results: list[tuple[str, str]] = []
    try:
        with ThreadPoolExecutor(max_workers=max(1, workers)) as executor:
            jobs = []
            for row in rows:
                jobs.append(executor.submit(download_one, row, outdir, cache, retries, retry_delay, keep_gz))
            for job in tqdm(as_completed(jobs), total=len(jobs), desc="Downloading FASTA"):
                results.append(job.result())
    finally:
        cache.close()

    failed = []
    downloaded_count = 0
    existing_count = 0
    for accession, status in results:
        if status.startswith("failed") or status == "missing accession":
            failed.append(accession)
        if status == "downloaded":
            downloaded_count += 1
        if status == "exists":
            existing_count += 1
    failed_file.write_text("\n".join(failed) + ("\n" if failed else ""), encoding="utf-8")

    summary = {
        "selected": len(rows),
        "downloaded": downloaded_count,
        "existing": existing_count,
        "failed": len(failed),
    }
    status_rows = [{"assembly_accession": accession, "status": status} for accession, status in results]
    pd.DataFrame(status_rows).to_csv(outdir / "sequence_download_summary.csv", index=False)
    return summary