PyPI - fetchm2 - Versions diffs - 0.1.0__py3-none-any.whl - Mend

fetchm2 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

fetchm2/__init__.py +6 -0
fetchm2/audit.py +126 -0
fetchm2/cli.py +175 -0
fetchm2/data/__init__.py +2 -0
fetchm2/data/approved_broad_categories.csv +51 -0
fetchm2/data/controlled_categories.csv +7506 -0
fetchm2/data/country_mapping.json +810 -0
fetchm2/data/geography_reviewed_rules.csv +17 -0
fetchm2/data/host_negative_rules.csv +409 -0
fetchm2/data/host_synonyms.csv +7114 -0
fetchm2/metadata.py +244 -0
fetchm2/sequence.py +194 -0
fetchm2/standardization.py +586 -0
fetchm2/utils.py +54 -0
fetchm2-0.1.0.dist-info/METADATA +208 -0
fetchm2-0.1.0.dist-info/RECORD +20 -0
fetchm2-0.1.0.dist-info/WHEEL +5 -0
fetchm2-0.1.0.dist-info/entry_points.txt +3 -0
fetchm2-0.1.0.dist-info/licenses/LICENSE +21 -0
fetchm2-0.1.0.dist-info/top_level.txt +1 -0

fetchm2/metadata.py ADDED Viewed

@@ -0,0 +1,244 @@
+from __future__ import annotations
+import sqlite3
+import threading
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import Any
+import pandas as pd
+import requests
+import xmltodict
+from tqdm import tqdm
+from .audit import production_gate, write_audit_outputs
+from .standardization import standardize_rows
+# NCBI E-utilities efetch endpoint requested by fetch_biosample_metadata.
+NCBI_EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+# Passed as `timeout=` to requests.get for every efetch request.
+NCBI_TIMEOUT = 60
+def read_table(path: Path) -> pd.DataFrame:
+    """Load a delimited text table into a DataFrame.
+
+    A path whose suffix is ``.csv`` (compared case-insensitively) is read with
+    the pandas default separator; any other path is read as tab-separated.
+    """
+    is_csv = path.suffix.lower() == ".csv"
+    if not is_csv:
+        return pd.read_csv(path, sep="\t")
+    return pd.read_csv(path)
+def filter_quality(df: pd.DataFrame, ani: list[str] | None, checkm: float | None) -> pd.DataFrame:
+    """Return a filtered copy of *df* based on ANI status and CheckM completeness.
+
+    * ``ani``: when non-empty and not containing ``"all"`` (case-insensitive),
+      keep rows whose ``ANI Check status`` (as text) is one of the values.
+    * ``checkm``: when not ``None``, keep rows whose ``CheckM completeness``
+      parses to a number greater than or equal to it; unparseable values are
+      dropped.
+
+    Either filter is skipped when its column is absent. The input frame is
+    never modified.
+    """
+    ani_column = "ANI Check status"
+    checkm_column = "CheckM completeness"
+    result = df.copy()
+    if ani:
+        lowered = [value.lower() for value in ani]
+        if "all" not in lowered and ani_column in result:
+            status_ok = result[ani_column].astype(str).isin(ani)
+            result = result[status_ok]
+    if checkm is not None and checkm_column in result:
+        completeness = pd.to_numeric(result[checkm_column], errors="coerce")
+        result = result[completeness >= checkm]
+    return result
+class MetadataCache:
+    """Store raw BioSample payloads in a SQLite file, keyed by accession.
+
+    The connection is opened with ``check_same_thread=False`` and every
+    statement is executed while holding ``self.lock``, so one instance can be
+    passed to several worker threads.
+    """
+    def __init__(self, path: Path) -> None:
+        """Open (or create) the database at *path* and ensure the table exists."""
+        self.path = path
+        self.lock = threading.Lock()
+        self.conn = sqlite3.connect(path, check_same_thread=False)
+        connection = self.conn
+        connection.execute(
+            """
+            CREATE TABLE IF NOT EXISTS biosample_cache (
+                biosample TEXT PRIMARY KEY,
+                payload TEXT,
+                fetched_at REAL
+            )
+            """
+        )
+        connection.commit()
+    def get(self, biosample: str) -> str | None:
+        """Return the stored payload for *biosample*, or ``None`` if absent."""
+        query = "SELECT payload FROM biosample_cache WHERE biosample = ?"
+        with self.lock:
+            hit = self.conn.execute(query, (biosample,)).fetchone()
+        if hit is None:
+            return None
+        return str(hit[0])
+    def set(self, biosample: str, payload: str) -> None:
+        """Insert or overwrite the payload for *biosample*, stamping the current time."""
+        with self.lock:
+            record = (biosample, payload, time.time())
+            self.conn.execute(
+                "INSERT OR REPLACE INTO biosample_cache (biosample, payload, fetched_at) VALUES (?, ?, ?)",
+                record,
+            )
+            self.conn.commit()
+    def close(self) -> None:
+        """Close the underlying SQLite connection."""
+        with self.lock:
+            self.conn.close()
+def biosample_accession(row: dict[str, Any]) -> str:
+    """Return the first non-blank BioSample accession found in *row*.
+
+    Columns are checked in priority order; values are converted to text and
+    stripped. An empty string is returned when none of them holds a value.
+    """
+    candidate_columns = ("Assembly BioSample Accession", "BioSample Accession", "BioSample")
+    cleaned = (str(row.get(column) or "").strip() for column in candidate_columns)
+    return next((text for text in cleaned if text), "")
+def parse_biosample_xml(xml_text: str) -> dict[str, str]:
+    """Extract sample attributes from a BioSample efetch XML payload.
+
+    Only the first ``BioSample`` of the ``BioSampleSet`` is read. Each
+    ``Attribute`` with a name and non-empty text becomes one output entry.
+    The output key is the canonical column from ``key_map`` when either the
+    attribute name or the harmonized name (lower-cased, ``-``/space replaced
+    by ``_``) is known; otherwise the attribute's own name is used verbatim.
+    When several attributes map to the same key, the last one wins.
+
+    Returns an empty dict for blank input or when the expected structure is
+    missing. Malformed XML still raises whatever ``xmltodict.parse`` raises.
+    """
+    if not xml_text.strip():
+        return {}
+    parsed = xmltodict.parse(xml_text)
+    # xmltodict represents an empty element (e.g. <BioSampleSet/>) as None, so
+    # a `.get(key, {})` default is not enough; check the type at every level.
+    sample_set = parsed.get("BioSampleSet")
+    if not isinstance(sample_set, dict):
+        return {}
+    sample = sample_set.get("BioSample")
+    if isinstance(sample, list):
+        sample = sample[0] if sample else {}
+    if not isinstance(sample, dict):
+        return {}
+    attribute_block = sample.get("Attributes")
+    if not isinstance(attribute_block, dict):
+        return {}
+    attributes = attribute_block.get("Attribute") or []
+    if not isinstance(attributes, list):
+        # A single <Attribute> is not wrapped in a list by xmltodict.
+        attributes = [attributes]
+    output: dict[str, str] = {}
+    key_map = {
+        "isolation_source": "Isolation Source",
+        "collection_date": "Collection Date",
+        "geo_loc_name": "Geographic Location",
+        "host": "Host",
+        "sample_type": "Sample Type",
+        "env_medium": "Environment Medium",
+        "env_broad_scale": "Environment Broad Scale",
+        "env_local_scale": "Environment Local Scale",
+        "disease": "Host Disease",
+        "host_disease": "Host Disease",
+        "host_health_state": "Host Health State",
+    }
+    for attr in attributes:
+        if not isinstance(attr, dict):
+            # Text-only element without XML attributes: there is no name to key on.
+            continue
+        attribute_name = str(attr.get("@attribute_name") or "").strip()
+        harmonized_name = str(attr.get("@harmonized_name") or "").strip()
+        name = attribute_name or harmonized_name
+        value = str(attr.get("#text") or "").strip()
+        if not name or not value:
+            continue
+        column = name
+        # Try the attribute name first (original behaviour), then fall back to
+        # the harmonized name so free-text attribute names still canonicalise.
+        for candidate in (attribute_name, harmonized_name):
+            mapped = key_map.get(candidate.lower().replace("-", "_").replace(" ", "_"))
+            if mapped is not None:
+                column = mapped
+                break
+        output[column] = value
+    return output
+def fetch_biosample_metadata(
+    biosample: str,
+    *,
+    api_key: str | None,
+    email: str | None,
+    sleep: float,
+    cache: MetadataCache,
+) -> dict[str, str]:
+    """Return parsed BioSample attributes for one accession.
+
+    A payload already present in *cache* is parsed and returned without any
+    network access. Otherwise the function optionally sleeps for *sleep*
+    seconds, requests the record from ``NCBI_EFETCH_URL``, stores the response
+    text in *cache* and returns the parsed result.
+
+    Raises whatever ``requests.get`` / ``raise_for_status`` raise; a failed
+    response is not cached.
+    """
+    cached_payload = cache.get(biosample)
+    if cached_payload is not None:
+        return parse_biosample_xml(cached_payload)
+    query = {"db": "biosample", "id": biosample, "retmode": "xml"}
+    # Optional identification parameters are only sent when provided.
+    for field, supplied in (("api_key", api_key), ("email", email)):
+        if supplied:
+            query[field] = supplied
+    if sleep > 0:
+        time.sleep(sleep)
+    response = requests.get(NCBI_EFETCH_URL, params=query, timeout=NCBI_TIMEOUT)
+    response.raise_for_status()
+    payload = response.text
+    cache.set(biosample, payload)
+    return parse_biosample_xml(payload)
+def enrich_rows_with_biosample(
+    rows: list[dict[str, Any]],
+    *,
+    cache_path: Path,
+    api_key: str | None,
+    email: str | None,
+    workers: int,
+    sleep: float,
+    offline: bool,
+) -> list[dict[str, Any]]:
+    """Fill blank row fields from BioSample metadata.
+
+    With ``offline=True`` the input list is returned untouched and no cache
+    is opened. Otherwise each distinct accession is fetched once on a thread
+    pool of at least one worker, and a new list of copied rows is returned in
+    which only blank fields have been filled from the fetched metadata.
+
+    A fetch that raises is recorded as a ``Metadata Fetch Error`` entry for
+    that accession instead of aborting the run. The cache is always closed.
+    """
+    if offline:
+        return rows
+    cache = MetadataCache(cache_path)
+    try:
+        unique_accessions = set()
+        for row in rows:
+            accession = biosample_accession(row)
+            if accession:
+                unique_accessions.add(accession)
+        fetched: dict[str, dict[str, str]] = {}
+        pool_size = max(1, workers)
+        with ThreadPoolExecutor(max_workers=pool_size) as executor:
+            pending = {}
+            for accession in sorted(unique_accessions):
+                job = executor.submit(
+                    fetch_biosample_metadata,
+                    accession,
+                    api_key=api_key,
+                    email=email,
+                    sleep=sleep,
+                    cache=cache,
+                )
+                pending[job] = accession
+            progress = tqdm(as_completed(pending), total=len(pending), desc="Fetching BioSample metadata")
+            for job in progress:
+                accession = pending[job]
+                try:
+                    fetched[accession] = job.result()
+                except Exception as exc:
+                    # Keep going: surface the failure as data on the affected rows.
+                    fetched[accession] = {"Metadata Fetch Error": str(exc)}
+        result = []
+        for row in rows:
+            combined = dict(row)
+            extra = fetched.get(biosample_accession(row), {})
+            for column, value in extra.items():
+                already_set = bool(str(combined.get(column) or "").strip())
+                if already_set:
+                    continue
+                combined[column] = value
+            result.append(combined)
+        return result
+    finally:
+        cache.close()
+def run_metadata(
+    *,
+    input_path: Path,
+    outdir: Path,
+    ani: list[str] | None = None,
+    checkm: float | None = None,
+    api_key: str | None = None,
+    email: str | None = None,
+    workers: int = 3,
+    sleep: float = 0.34,
+    offline: bool = False,
+) -> dict[str, Any]:
+    """Run the metadata pipeline end to end and write its outputs.
+
+    Steps: read and quality-filter the input table, enrich rows with BioSample
+    metadata, standardize them, write the clean table as CSV and TSV under
+    ``<outdir>/metadata_output``, write audit outputs under ``<outdir>/audit``,
+    evaluate the production gate and write a Markdown run report.
+
+    Returns a dict with the clean CSV path, the audit summary, the gate
+    verdict and the lists of hard failures and warnings.
+    """
+    outdir.mkdir(parents=True, exist_ok=True)
+    metadata_dir = outdir / "metadata_output"
+    audit_dir = outdir / "audit"
+    metadata_dir.mkdir(parents=True, exist_ok=True)
+    table = filter_quality(read_table(input_path), ani, checkm)
+    records = table.fillna("").to_dict(orient="records")
+    enriched = enrich_rows_with_biosample(
+        records,
+        cache_path=metadata_dir / "fetchm2_biosample_cache.sqlite3",
+        api_key=api_key,
+        email=email,
+        workers=workers,
+        sleep=sleep,
+        offline=offline,
+    )
+    standardized = standardize_rows(enriched)
+    clean_path = metadata_dir / "fetchm2_clean.csv"
+    clean_frame = pd.DataFrame(standardized)
+    clean_frame.to_csv(clean_path, index=False)
+    clean_frame.to_csv(metadata_dir / "fetchm2_clean.tsv", sep="\t", index=False)
+    summary = write_audit_outputs(standardized, audit_dir)
+    production_ready, hard_failures, warnings = production_gate(summary)
+    report_lines = ["# FetchM2 Run Report", ""]
+    report_lines.append(f"Input: {input_path}")
+    report_lines.append(f"Rows processed: {summary['rows']}")
+    report_lines.append(f"Clean table: {clean_path}")
+    report_lines.append(f"Production gate: {'PASS' if production_ready else 'FAIL'}")
+    # Failure and warning lines appear only when there is something to list.
+    if hard_failures:
+        report_lines.append(f"Hard failures: {', '.join(hard_failures)}")
+    if warnings:
+        report_lines.append(f"Warnings: {', '.join(warnings)}")
+    report_path = metadata_dir / "fetchm2_report.md"
+    report_path.write_text("\n".join(report_lines) + "\n", encoding="utf-8")
+    return {
+        "clean_path": str(clean_path),
+        "summary": summary,
+        "production_ready": production_ready,
+        "hard_failures": hard_failures,
+        "warnings": warnings,
+    }

fetchm2/sequence.py ADDED Viewed

@@ -0,0 +1,194 @@
+from __future__ import annotations
+import gzip
+import re
+import shutil
+import sqlite3
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import Any
+import pandas as pd
+import requests
+from tqdm import tqdm
+# Root URL under which build_parent_url builds per-accession directory paths.
+BASE_URL = "https://ftp.ncbi.nlm.nih.gov/genomes/all"
+def normalize_text(value: Any) -> str:
+    """Return *value* as stripped, lower-cased text; any falsy value becomes ``""``."""
+    text = str(value) if value else ""
+    return text.strip().lower()
+def normalize_assembly_name(name: str) -> str:
+    """Return *name* stripped with each space replaced by ``_``, or ``"NA"`` when blank."""
+    stripped = str(name or "").strip()
+    if not stripped:
+        return "NA"
+    return "_".join(stripped.split(" "))
+def build_parent_url(accession: str) -> str:
+    """Return the URL of the directory that contains an accession's assembly folders.
+
+    The accession is split at its first ``_`` into a prefix and a numeric
+    part; the version suffix after ``.`` is dropped, and the digits are laid
+    out as path segments ``[0:3]/[3:6]/[6:9]/[9:]`` beneath ``BASE_URL``.
+    Empty segments are omitted, so the result never ends in ``/`` and URLs
+    built as ``f"{parent}/{child}"`` do not contain ``//``.
+
+    Raises:
+        ValueError: if *accession* contains no ``_`` separator.
+    """
+    prefix, separator, digits = accession.partition("_")
+    if not separator:
+        raise ValueError(f"Malformed assembly accession (missing '_'): {accession!r}")
+    core = digits.split(".", 1)[0]
+    # For the common 9-digit accession the final slice is empty; skip it
+    # rather than emitting a trailing slash.
+    segments = [part for part in (core[:3], core[3:6], core[6:9], core[9:]) if part]
+    return "/".join([BASE_URL, prefix, *segments])
+class DirectoryCache:
+    """SQLite-backed cache of resolved remote assembly directory names.
+
+    Instances are created on one thread and used from thread-pool workers, so
+    the connection is opened with ``check_same_thread=False`` (the sqlite3
+    default would raise ``ProgrammingError`` on the first cross-thread call)
+    and every statement is serialised through ``self.lock``.
+    """
+    def __init__(self, path: Path) -> None:
+        """Open (or create) the database at *path* and ensure the table exists."""
+        import threading  # local import: this module's header does not import it
+        self.lock = threading.Lock()
+        self.conn = sqlite3.connect(path, check_same_thread=False)
+        self.conn.execute(
+            "CREATE TABLE IF NOT EXISTS assembly_directory_cache (accession TEXT PRIMARY KEY, assembly_name TEXT, directory TEXT)"
+        )
+        self.conn.commit()
+    def get(self, accession: str, name: str) -> str | None:
+        """Return the cached directory for *accession* and *name*, or ``None``.
+
+        The stored row must match both the accession and the normalized
+        assembly name.
+        """
+        with self.lock:
+            row = self.conn.execute(
+                "SELECT directory FROM assembly_directory_cache WHERE accession = ? AND assembly_name = ?",
+                (accession, normalize_assembly_name(name)),
+            ).fetchone()
+        return None if row is None else str(row[0])
+    def set(self, accession: str, name: str, directory: str) -> None:
+        """Insert or overwrite the directory recorded for *accession*."""
+        with self.lock:
+            self.conn.execute(
+                "INSERT OR REPLACE INTO assembly_directory_cache (accession, assembly_name, directory) VALUES (?, ?, ?)",
+                (accession, normalize_assembly_name(name), directory),
+            )
+            self.conn.commit()
+    def close(self) -> None:
+        """Close the underlying SQLite connection."""
+        with self.lock:
+            self.conn.close()
+def resolve_assembly_directory(accession: str, name: str, cache: DirectoryCache) -> str:
+    """Resolve the remote directory name that holds an assembly's files.
+
+    Resolution order:
+      1. a non-empty value already stored in *cache*;
+      2. the conventional names ``<accession>_<normalized name>`` and
+         ``<accession>_NA``, accepted when a GET for them succeeds;
+      3. the first link in the parent directory listing whose target starts
+         with ``<accession>_``.
+    The resolved name is written to *cache* before it is returned.
+
+    Raises:
+        FileNotFoundError: if the listing has no entry for the accession.
+        requests.HTTPError: if the parent listing request fails.
+    """
+    cached = cache.get(accession, name)
+    if cached:
+        return cached
+    parent_url = build_parent_url(accession)
+    normalized_name = normalize_assembly_name(name)
+    # dict.fromkeys de-duplicates while keeping order: a blank assembly name
+    # normalizes to "NA", which would otherwise be requested twice.
+    candidates = list(dict.fromkeys([f"{accession}_{normalized_name}", f"{accession}_NA"]))
+    # Use the session as a context manager so pooled connections are released.
+    with requests.Session() as session:
+        for candidate in candidates:
+            if session.get(f"{parent_url}/{candidate}", timeout=30).ok:
+                cache.set(accession, name, candidate)
+                return candidate
+        response = session.get(parent_url, timeout=60)
+        response.raise_for_status()
+        listing = response.text
+    wanted_prefix = f"{accession}_"
+    matches = [item.rstrip("/") for item in re.findall(r'href="([^"]+/)"', listing) if item.startswith(wanted_prefix)]
+    if not matches:
+        raise FileNotFoundError(f"No remote assembly directory found for {accession}")
+    cache.set(accession, name, matches[0])
+    return matches[0]
+def row_matches_filters(row: dict[str, Any], filters: dict[str, Any]) -> bool:
+    """Return whether *row* passes every active filter in *filters*.
+
+    Categorical filters compare the normalized row value against the
+    normalized allowed values; a missing or empty filter is ignored. The
+    optional ``year_from`` / ``year_to`` bounds are inclusive and use the
+    first four characters of ``Collection_Year`` (or ``Collection Date``);
+    a row whose year cannot be parsed fails whenever a bound is set.
+    """
+    column_filters = (
+        ("Country", "country"),
+        ("Continent", "continent"),
+        ("Subcontinent", "subcontinent"),
+        ("Host_SD", "host"),
+        ("Host_Rank", "host_rank"),
+        ("Sample_Type_SD", "sample_type"),
+        ("Isolation_Source_SD", "isolation_source"),
+        ("Environment_Medium_SD", "environment_medium"),
+    )
+    for column, filter_key in column_filters:
+        allowed = filters.get(filter_key)
+        if not allowed:
+            continue
+        if normalize_text(row.get(column)) not in {normalize_text(item) for item in allowed}:
+            return False
+    lower = filters.get("year_from")
+    upper = filters.get("year_to")
+    if lower is None and upper is None:
+        return True
+    try:
+        year = int(str(row.get("Collection_Year") or row.get("Collection Date") or "")[:4])
+    except ValueError:
+        return False
+    if lower is not None and year < lower:
+        return False
+    return not (upper is not None and year > upper)
+def select_rows(input_path: Path, filters: dict[str, Any], max_genomes: int | None) -> list[dict[str, Any]]:
+    """Read the CSV at *input_path* and return the rows that pass *filters*.
+
+    Missing values are replaced by empty strings before filtering. When
+    *max_genomes* is not ``None`` only that many leading matches are kept.
+    """
+    frame = pd.read_csv(input_path)
+    records = frame.fillna("").to_dict(orient="records")
+    selected = [record for record in records if row_matches_filters(record, filters)]
+    return selected if max_genomes is None else selected[:max_genomes]
+def download_one(row: dict[str, Any], outdir: Path, cache: DirectoryCache, retries: int, retry_delay: float, keep_gz: bool) -> tuple[str, str]:
+    """Download the genomic FASTA for one table row, with retries.
+
+    Returns ``(accession, status)`` where status is one of:
+      * ``"missing accession"``: the row has no ``Assembly Accession``
+        (the accession element is ``""``);
+      * ``"exists"``: the ``.fna`` or ``.fna.gz`` file is already in *outdir*;
+      * ``"downloaded"``: the file was fetched (and decompressed unless
+        *keep_gz* is true);
+      * ``"failed: <error>"``: every attempt raised;
+      * ``"failed"``: *retries* is less than 1, so nothing was attempted.
+
+    Data is written to scratch ``*.part`` files and renamed into place only
+    after the transfer (and decompression) completed. An interrupted attempt
+    therefore never leaves a truncated file at the final path, where it would
+    be reported as ``"exists"`` by the next attempt or run.
+    """
+    import time  # local import: this module's header does not import `time`
+    accession = str(row.get("Assembly Accession") or "").strip()
+    name = str(row.get("Assembly Name") or "").strip()
+    if not accession:
+        return "", "missing accession"
+    for attempt in range(1, retries + 1):
+        scratch_files = []
+        try:
+            directory = resolve_assembly_directory(accession, name, cache)
+            gz_name = f"{directory}_genomic.fna.gz"
+            fna_name = f"{directory}_genomic.fna"
+            gz_path = outdir / gz_name
+            fna_path = outdir / fna_name
+            if fna_path.exists() or gz_path.exists():
+                return accession, "exists"
+            url = f"{build_parent_url(accession)}/{directory}/{gz_name}"
+            # Scratch names deliberately avoid the "_genomic.fna" stem so a
+            # leftover from a killed process is not mistaken for a finished file.
+            gz_part = outdir / f"{directory}.gz.part"
+            scratch_files.append(gz_part)
+            with requests.get(url, stream=True, timeout=300) as response:
+                response.raise_for_status()
+                with gz_part.open("wb") as handle:
+                    for chunk in response.iter_content(chunk_size=1024 * 1024):
+                        if chunk:
+                            handle.write(chunk)
+            if keep_gz:
+                gz_part.replace(gz_path)
+            else:
+                fna_part = outdir / f"{directory}.fna.part"
+                scratch_files.append(fna_part)
+                with gzip.open(gz_part, "rb") as source, fna_part.open("wb") as target:
+                    shutil.copyfileobj(source, target)
+                fna_part.replace(fna_path)
+                gz_part.unlink()
+            return accession, "downloaded"
+        except Exception as exc:
+            # Best-effort removal of partial output; the original error is
+            # what gets reported, so cleanup problems are ignored.
+            for leftover in scratch_files:
+                try:
+                    leftover.unlink()
+                except OSError:
+                    pass
+            if attempt >= retries:
+                return accession, f"failed: {exc}"
+            # Linear back-off: wait longer after each failed attempt.
+            time.sleep(retry_delay * attempt)
+    return accession, "failed"
+def run_sequence_downloads(
+    *,
+    input_path: Path,
+    outdir: Path,
+    filters: dict[str, Any] | None = None,
+    retries: int = 3,
+    retry_delay: float = 5.0,
+    workers: int = 4,
+    check_only: bool = False,
+    max_genomes: int | None = None,
+    keep_gz: bool = False,
+) -> dict[str, Any]:
+    """Download (or just verify) FASTA files for the rows selected from *input_path*.
+
+    With ``check_only=True`` nothing is downloaded: accessions without a
+    matching ``*_genomic.fna*`` file in *outdir* are written to
+    ``failed_accessions.txt`` and counted as missing/failed.
+
+    Otherwise every selected row is downloaded on a thread pool of at least
+    one worker. Failed accessions go to ``failed_accessions.txt`` and a
+    per-accession status table to ``sequence_download_summary.csv``.
+
+    Returns a dict of counts; its keys differ between the two modes.
+    """
+    outdir.mkdir(parents=True, exist_ok=True)
+    filters = filters or {}
+    rows = select_rows(input_path, filters, max_genomes)
+    expected = [str(row.get("Assembly Accession") or "").strip() for row in rows]
+    failed_list_path = outdir / "failed_accessions.txt"
+    if check_only:
+        # A local file name starts with "<prefix>_<digits.version>_", i.e. the accession.
+        present = set()
+        for path in outdir.glob("*_genomic.fna*"):
+            present.add("_".join(path.name.split("_", 2)[:2]))
+        missing = [accession for accession in expected if accession not in present]
+        failed_list_path.write_text("\n".join(missing) + ("\n" if missing else ""), encoding="utf-8")
+        return {"selected": len(rows), "missing": len(missing), "downloaded": 0, "failed": len(missing)}
+    cache = DirectoryCache(outdir / "fetchm2_sequence_cache.sqlite3")
+    results: list[tuple[str, str]] = []
+    try:
+        pool_size = max(1, workers)
+        with ThreadPoolExecutor(max_workers=pool_size) as executor:
+            jobs = []
+            for row in rows:
+                jobs.append(executor.submit(download_one, row, outdir, cache, retries, retry_delay, keep_gz))
+            for job in tqdm(as_completed(jobs), total=len(jobs), desc="Downloading FASTA"):
+                results.append(job.result())
+    finally:
+        cache.close()
+    failed = []
+    for accession, status in results:
+        if status.startswith("failed") or status == "missing accession":
+            failed.append(accession)
+    failed_list_path.write_text("\n".join(failed) + ("\n" if failed else ""), encoding="utf-8")
+    statuses = [status for _, status in results]
+    summary = {
+        "selected": len(rows),
+        "downloaded": statuses.count("downloaded"),
+        "existing": statuses.count("exists"),
+        "failed": len(failed),
+    }
+    status_table = pd.DataFrame([{"assembly_accession": accession, "status": status} for accession, status in results])
+    status_table.to_csv(outdir / "sequence_download_summary.csv", index=False)
+    return summary