npm - @brainpilot/skills - Versions diffs - 0.0.6 → 0.0.7 - Mend

@brainpilot/skills 0.0.6 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (285) hide show

package/skills/06_fMRI_Neuroimaging/aomic-skill/scripts/aomic_qc_summary.py ADDED Viewed

@@ -0,0 +1,258 @@
+#!/usr/bin/env python3
+"""Generate per-subject QC summaries and exclusion lists for AOMIC.
+Combines fMRIPrep confounds and FreeSurfer recon-all metrics to produce a
+unified QC report with configurable exclusion criteria.
+"""
+import argparse
+import sys
+from pathlib import Path
+from typing import Dict, List, Tuple
+try:
+    import pandas as pd
+except ImportError:
+    print("Error: pandas is required. Install with: pip install pandas", file=sys.stderr)
+    sys.exit(1)
+def detect_delimiter(file_path: Path) -> str:
+    """Detect whether file uses tab or comma delimiter."""
+    with open(file_path, "r", encoding="utf-8") as f:
+        first_line = f.readline()
+    if "\t" in first_line:
+        return "\t"
+    return ","
+def collect_fmriprep_qc(fmriprep_dir: Path) -> Dict[str, Dict[str, float]]:
+    """Collect QC metrics from fMRIPrep outputs."""
+    qc = {}
+    confounds_patterns = [
+        "**/desc-confounds_timeseries.tsv",
+        "**/*_desc-confounds_timeseries.tsv",
+        "**/confounds.tsv",
+    ]
+    confounds_files = []
+    for pattern in confounds_patterns:
+        confounds_files.extend(fmriprep_dir.rglob(pattern))
+    for confounds_file in confounds_files:
+        subject_id = None
+        for part in confounds_file.parts:
+            if part.startswith("sub-"):
+                subject_id = part
+                break
+        if not subject_id:
+            continue
+        delimiter = detect_delimiter(confounds_file)
+        try:
+            df = pd.read_csv(confounds_file, sep=delimiter, low_memory=False)
+        except Exception:
+            continue
+        fd_col = None
+        for col_name in ["framewise_displacement", "fd", "FD"]:
+            if col_name in df.columns:
+                fd_col = col_name
+                break
+        metrics = {"n_volumes": len(df)}
+        if fd_col:
+            fd_values = pd.to_numeric(df[fd_col], errors="coerce").dropna()
+            if len(fd_values) > 0:
+                metrics["mean_fd"] = float(fd_values.mean())
+                metrics["max_fd"] = float(fd_values.max())
+            else:
+                metrics["mean_fd"] = 0.0
+                metrics["max_fd"] = 0.0
+        if subject_id in qc:
+            existing = qc[subject_id]
+            existing["mean_fd"] = max(existing.get("mean_fd", 0), metrics.get("mean_fd", 0))
+            existing["max_fd"] = max(existing.get("max_fd", 0), metrics.get("max_fd", 0))
+            existing["n_volumes"] = existing.get("n_volumes", 0) + metrics.get("n_volumes", 0)
+        else:
+            qc[subject_id] = metrics
+    return qc
+def collect_freesurfer_qc(freesurfer_dir: Path) -> Dict[str, Dict[str, float]]:
+    """Collect QC metrics from FreeSurfer recon-all outputs."""
+    qc = {}
+    for subject_dir in sorted(freesurfer_dir.rglob("sub-*")):
+        if not subject_dir.is_dir():
+            continue
+        subject_id = None
+        for part in subject_dir.parts:
+            if part.startswith("sub-"):
+                subject_id = part
+                break
+        if not subject_id:
+            subject_id = subject_dir.name
+        recon_log = None
+        matches = list(freesurfer_dir.rglob(f"{subject_id}/**/recon-all.log"))
+        if matches:
+            recon_log = matches[0]
+        completed = False
+        if recon_log and recon_log.exists():
+            try:
+                with open(recon_log, "r", encoding="utf-8", errors="ignore") as f:
+                    content = f.read()
+                    completed = "finished without error" in content.lower() or "recon-all -done" in content.lower()
+            except Exception:
+                pass
+        aseg_matches = list(freesurfer_dir.rglob(f"{subject_id}**/aseg.stats"))
+        total_brain_vol = None
+        etiv = None
+        if aseg_matches:
+            aseg_file = aseg_matches[0]
+            try:
+                with open(aseg_file, "r", encoding="utf-8", errors="ignore") as f:
+                    for line in f:
+                        if "EstimatedTotalIntraCranialVol" in line:
+                            parts = line.split()
+                            if len(parts) >= 4:
+                                try:
+                                    etiv = float(parts[3])
+                                except ValueError:
+                                    pass
+                        elif "BrainSegVol" in line and "Not" not in line:
+                            parts = line.split()
+                            if len(parts) >= 4:
+                                try:
+                                    total_brain_vol = float(parts[3])
+                                except ValueError:
+                                    pass
+            except Exception:
+                pass
+        metrics = {"completed": completed}
+        if total_brain_vol is not None:
+            metrics["total_brain_volume"] = total_brain_vol
+        if etiv is not None:
+            metrics["estimated_total_intracranial_volume"] = etiv
+        qc[subject_id] = metrics
+    return qc
+def generate_qc_summary(
+    fmriprep_qc: Dict[str, Dict[str, float]],
+    freesurfer_qc: Dict[str, Dict[str, float]],
+    fd_threshold: float = 0.3,
+    max_fd_threshold: float = 5.0,
+) -> Tuple["pd.DataFrame", List[str]]:
+    """Generate QC summary DataFrame and exclusion list."""
+    all_subjects = set()
+    all_subjects.update(fmriprep_qc.keys())
+    all_subjects.update(freesurfer_qc.keys())
+    rows = []
+    excluded = []
+    for sub_id in sorted(all_subjects):
+        row = {"subject_id": sub_id}
+        exclude_reasons = []
+        fp = fmriprep_qc.get(sub_id, {})
+        row["mean_fd"] = fp.get("mean_fd", None)
+        row["max_fd"] = fp.get("max_fd", None)
+        row["n_volumes"] = fp.get("n_volumes", None)
+        fs = freesurfer_qc.get(sub_id, {})
+        row["fs_completed"] = fs.get("completed", None)
+        row["total_brain_volume"] = fs.get("total_brain_volume", None)
+        row["etiv"] = fs.get("estimated_total_intracranial_volume", None)
+        if row["mean_fd"] is not None and row["mean_fd"] > fd_threshold:
+            exclude_reasons.append(f"mean_fd={row['mean_fd']:.3f}>{fd_threshold}")
+        if row["max_fd"] is not None and row["max_fd"] > max_fd_threshold:
+            exclude_reasons.append(f"max_fd={row['max_fd']:.3f}>{max_fd_threshold}")
+        if row["fs_completed"] is False:
+            exclude_reasons.append("FreeSurfer recon-all incomplete")
+        row["exclude"] = len(exclude_reasons) > 0
+        row["exclude_reasons"] = "; ".join(exclude_reasons) if exclude_reasons else ""
+        if exclude_reasons:
+            excluded.append(sub_id)
+        rows.append(row)
+    df = pd.DataFrame(rows)
+    return df, excluded
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description="Generate per-subject QC summaries and exclusion lists for AOMIC."
+    )
+    parser.add_argument("--fmriprep-dir", help="Path to fMRIPrep output directory")
+    parser.add_argument("--freesurfer-dir", help="Path to FreeSurfer output directory")
+    parser.add_argument("--output", required=True, help="Output path for QC summary CSV")
+    parser.add_argument("--exclude-output", help="Output path for exclusion list CSV")
+    parser.add_argument("--fd-threshold", type=float, default=0.3, help="Mean FD threshold (default: 0.3)")
+    parser.add_argument("--max-fd-threshold", type=float, default=5.0, help="Max FD threshold (default: 5.0)")
+    args = parser.parse_args()
+    fmriprep_qc = {}
+    freesurfer_qc = {}
+    if args.fmriprep_dir:
+        fp_dir = Path(args.fmriprep_dir).resolve()
+        if fp_dir.exists():
+            print(f"Collecting fMRIPrep QC from {fp_dir}...")
+            fmriprep_qc = collect_fmriprep_qc(fp_dir)
+            print(f"  Found {len(fmriprep_qc)} subjects")
+    if args.freesurfer_dir:
+        fs_dir = Path(args.freesurfer_dir).resolve()
+        if fs_dir.exists():
+            print(f"Collecting FreeSurfer QC from {fs_dir}...")
+            freesurfer_qc = collect_freesurfer_qc(fs_dir)
+            print(f"  Found {len(freesurfer_qc)} subjects")
+    if not fmriprep_qc and not freesurfer_qc:
+        print("[ERROR] No QC data collected. Check input paths.", file=sys.stderr)
+        return 1
+    summary_df, excluded = generate_qc_summary(
+        fmriprep_qc=fmriprep_qc,
+        freesurfer_qc=freesurfer_qc,
+        fd_threshold=args.fd_threshold,
+        max_fd_threshold=args.max_fd_threshold,
+    )
+    output_path = Path(args.output).resolve()
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    summary_df.to_csv(output_path, index=False)
+    print(f"\nQC Summary: {len(summary_df)} subjects -> {output_path}")
+    print(f"  Excluded: {len(excluded)} / {len(summary_df)} ({100*len(excluded)/max(len(summary_df),1):.1f}%)")
+    if args.exclude_output:
+        exclude_path = Path(args.exclude_output).resolve()
+        exclude_path.parent.mkdir(parents=True, exist_ok=True)
+        summary_df[summary_df["exclude"] == True][["subject_id", "exclude_reasons"]].to_csv(exclude_path, index=False)
+        print(f"  Exclusion list: {exclude_path}")
+    return 0
+# Run the CLI only when executed as a script; main()'s return value becomes the exit status.
+if __name__ == "__main__":
+    sys.exit(main())

package/skills/06_fMRI_Neuroimaging/aomic-skill/scripts/extract_aomic_phenotype.py ADDED Viewed

@@ -0,0 +1,284 @@
+#!/usr/bin/env python3
+"""Extract and merge AOMIC phenotype tables for downstream analysis.
+Reads AOMIC phenotype files (Big Five personality, Raven's progressive matrices,
+demographics), selects columns, and optionally cross-references with imaging
+subject lists.
+"""
+import argparse
+import sys
+from pathlib import Path
+from typing import Dict, List, Optional, Set
+try:
+    import pandas as pd
+except ImportError:
+    print("Error: pandas is required. Install with: pip install pandas", file=sys.stderr)
+    sys.exit(1)
+# Default phenotype files to look for
+DEFAULT_PHENOTYPE_FILES = [
+    "big_five.csv",
+    "big_five.tsv",
+    "personality.csv",
+    "personality.tsv",
+    "ravens.csv",
+    "ravens.tsv",
+    "ravens_progressive_matrices.csv",
+    "ravens_progressive_matrices.tsv",
+    "demographics.csv",
+    "demographics.tsv",
+    "participants.csv",
+    "participants.tsv",
+    "cognitive.csv",
+    "cognitive.tsv",
+]
+# Standard column name mapping: source column name -> canonical name.
+# merge_phenotype_tables() renames every listed column that is present in a
+# table; lookups are exact, so differently cased headers are left untouched.
+COLUMN_MAP = {
+    # Subject identifier aliases
+    "participant_id": "subject_id",
+    "subject": "subject_id",
+    "sub_id": "subject_id",
+    # Big Five traits under their full names (identity mappings)
+    "openness": "openness",
+    "conscientiousness": "conscientiousness",
+    "extraversion": "extraversion",
+    "agreeableness": "agreeableness",
+    "neuroticism": "neuroticism",
+    # Big Five traits under abbreviated names
+    "big5_o": "openness",
+    "big5_c": "conscientiousness",
+    "big5_e": "extraversion",
+    "big5_a": "agreeableness",
+    "big5_n": "neuroticism",
+    # Raven's progressive matrices score aliases
+    "ravens_score": "ravens_score",
+    "raven": "ravens_score",
+    "rpm": "ravens_score",
+    # Demographics
+    "age": "age",
+    "sex": "sex",
+    # NOTE(review): a table containing both "sex" and "gender" ends up with two
+    # columns named "sex" after renaming; confirm this cannot happen in the inputs.
+    "gender": "sex",
+    "handedness": "handedness",
+    "education": "education",
+}
+def detect_delimiter(file_path: Path) -> str:
+    """Detect whether file uses tab or comma delimiter."""
+    with open(file_path, "r", encoding="utf-8") as f:
+        first_line = f.readline()
+    if "\t" in first_line:
+        return "\t"
+    return ","
+def read_phenotype_file(file_path: Path) -> Optional["pd.DataFrame"]:
+    """Read a single AOMIC phenotype file."""
+    delimiter = detect_delimiter(file_path)
+    try:
+        df = pd.read_csv(file_path, sep=delimiter, low_memory=False)
+        return df
+    except Exception as e:
+        print(f"[WARN] Failed to read {file_path}: {e}", file=sys.stderr)
+        return None
+def merge_phenotype_tables(
+    phenotype_dir: Path,
+    columns: Optional[List[str]] = None,
+    imaging_ids: Optional[Set[str]] = None,
+    drop_missing_threshold: float = 0.5,
+) -> "pd.DataFrame":
+    """Merge multiple AOMIC phenotype tables.
+    The steps run in this order: discover and read the tables, normalise
+    column names via COLUMN_MAP, outer-merge on ``subject_id`` (or concatenate
+    when that key is not shared by all tables), filter rows by imaging IDs,
+    select the requested columns, and finally drop sparse columns. Progress
+    messages are printed to stdout.
+    Args:
+        phenotype_dir: Directory containing phenotype CSV/TSV files.
+        columns: Specific columns to select (None = all).
+        imaging_ids: Set of subject IDs from imaging data to filter by.
+            None or an empty set disables the filter.
+        drop_missing_threshold: Drop columns with > this fraction of missing values.
+            Values of 1.0 or more disable the step.
+    Returns:
+        Merged DataFrame; an empty DataFrame when no file is found or no
+        non-empty table could be read.
+    """
+    # Discover every .csv/.tsv file directly inside the directory (not recursive).
+    phenotype_files = []
+    for f in sorted(phenotype_dir.iterdir()):
+        if f.is_file() and f.suffix in (".csv", ".tsv"):
+            phenotype_files.append(f)
+    # Fallback to the well-known file names when the scan found nothing.
+    # NOTE(review): every default name ends in .csv/.tsv, so this can only add
+    # entries the scan rejected for not being regular files; verify it is needed.
+    if not phenotype_files:
+        for name in DEFAULT_PHENOTYPE_FILES:
+            candidate = phenotype_dir / name
+            if candidate.exists():
+                phenotype_files.append(candidate)
+    if not phenotype_files:
+        print(f"[ERROR] No phenotype files found in {phenotype_dir}", file=sys.stderr)
+        return pd.DataFrame()
+    print(f"Found {len(phenotype_files)} phenotype files:")
+    for f in phenotype_files:
+        print(f"  - {f.name}")
+    dataframes = []
+    for f in phenotype_files:
+        df = read_phenotype_file(f)
+        # Unreadable (None) and empty tables are skipped.
+        if df is not None and len(df) > 0:
+            # Rename only the COLUMN_MAP keys that actually occur in this table
+            rename_map = {k: v for k, v in COLUMN_MAP.items() if k in df.columns}
+            if rename_map:
+                df = df.rename(columns=rename_map)
+            # Ensure subject_id column exists
+            if "subject_id" not in df.columns:
+                # Heuristic: use the first column as subject ID when it holds
+                # objects (typically strings) or has one distinct value per row
+                first_col = df.columns[0]
+                if df[first_col].dtype == object or df[first_col].nunique() == len(df):
+                    df = df.rename(columns={first_col: "subject_id"})
+            # The table is kept even if no subject_id column could be established.
+            dataframes.append(df)
+    if not dataframes:
+        return pd.DataFrame()
+    # Columns present in every table; only used to decide on the merge key
+    common_cols = set(dataframes[0].columns)
+    for df in dataframes[1:]:
+        common_cols &= set(df.columns)
+    common_cols = sorted(common_cols)
+    print(f"Common columns across all tables: {len(common_cols)}")
+    # Merge on subject_id, which must be present in all tables
+    merge_keys = []
+    if "subject_id" in common_cols:
+        merge_keys.append("subject_id")
+    if not merge_keys:
+        # Without a shared key the tables are stacked row-wise instead.
+        print("[WARN] No merge key found (subject_id). Concatenating instead.")
+        merged = pd.concat(dataframes, ignore_index=True)
+    else:
+        merged = dataframes[0]
+        for df in dataframes[1:]:
+            # Take only the key plus columns not seen so far: for a column that
+            # occurs in several tables, the earliest table's values win.
+            new_cols = [c for c in df.columns if c not in merged.columns or c in merge_keys]
+            df_subset = df[new_cols]
+            # NOTE(review): the merge fails if subject_id has different dtypes
+            # across tables (e.g. integers vs. "sub-01" strings) — confirm inputs.
+            merged = pd.merge(merged, df_subset, on=merge_keys, how="outer", suffixes=("", "_dup"))
+    # Drop duplicate columns
+    # NOTE(review): overlapping columns are already removed before each merge,
+    # so this also removes any genuine input column whose name ends in "_dup".
+    dup_cols = [c for c in merged.columns if c.endswith("_dup")]
+    if dup_cols:
+        merged = merged.drop(columns=dup_cols)
+    # Filter by imaging IDs, accepting IDs written with or without "sub-"
+    if imaging_ids and "subject_id" in merged.columns:
+        normalized_imaging = set()
+        for sid in imaging_ids:
+            # str.replace removes every "sub-" occurrence, not only a leading one.
+            clean = sid.replace("sub-", "")
+            normalized_imaging.add(clean)
+            normalized_imaging.add(sid)
+        before_count = len(merged)
+        # A row is kept when its ID matches as-is or after adding the "sub-" prefix.
+        mask = merged["subject_id"].apply(
+            lambda x: str(x).strip() in normalized_imaging
+            or f"sub-{str(x).strip()}" in normalized_imaging
+        )
+        merged = merged[mask]
+        print(f"Filtered to imaging subjects: {before_count} -> {len(merged)} rows")
+    # Select specific columns; unknown names are reported and skipped
+    if columns:
+        available = [c for c in columns if c in merged.columns]
+        missing = [c for c in columns if c not in merged.columns]
+        if missing:
+            print(f"[WARN] Columns not found (skipped): {missing}")
+        # If none of the requested columns exist, the table is left unchanged.
+        if available:
+            merged = merged[available]
+    # Drop columns with too many missing values (strictly above the threshold)
+    if drop_missing_threshold < 1.0:
+        missing_frac = merged.isnull().mean()
+        cols_to_drop = missing_frac[missing_frac > drop_missing_threshold].index.tolist()
+        if cols_to_drop:
+            print(f"Dropping {len(cols_to_drop)} columns with >{drop_missing_threshold*100}% missing")
+            merged = merged.drop(columns=cols_to_drop)
+    return merged
+def load_imaging_ids(imaging_ids_file: Path) -> Set[str]:
+    """Load subject IDs from a BIDS participants.tsv or similar file."""
+    ids = set()
+    delimiter = detect_delimiter(imaging_ids_file)
+    try:
+        df = pd.read_csv(imaging_ids_file, sep=delimiter)
+        id_col = None
+        for col_name in ["participant_id", "subject_id", "sub_id"]:
+            if col_name in df.columns:
+                id_col = col_name
+                break
+        if id_col:
+            ids = set(df[id_col].astype(str).str.strip())
+    except Exception as e:
+        print(f"[WARN] Failed to read imaging IDs: {e}", file=sys.stderr)
+    return ids
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description="Extract and merge AOMIC phenotype tables."
+    )
+    parser.add_argument(
+        "--phenotype-dir",
+        required=True,
+        help="Directory containing AOMIC phenotype CSV/TSV files",
+    )
+    parser.add_argument(
+        "--output",
+        required=True,
+        help="Output path for merged phenotype CSV",
+    )
+    parser.add_argument(
+        "--columns",
+        help="Comma-separated list of columns to select (default: all)",
+    )
+    parser.add_argument(
+        "--imaging-ids",
+        help="Path to BIDS participants.tsv or file with imaging subject IDs",
+    )
+    parser.add_argument(
+        "--missing-threshold",
+        type=float,
+        default=0.5,
+        help="Drop columns with more than this fraction of missing values (default: 0.5)",
+    )
+    args = parser.parse_args()
+    phenotype_dir = Path(args.phenotype_dir).resolve()
+    if not phenotype_dir.exists() or not phenotype_dir.is_dir():
+        print(f"Phenotype directory does not exist: {phenotype_dir}", file=sys.stderr)
+        return 1
+    columns = None
+    if args.columns:
+        columns = [c.strip() for c in args.columns.split(",")]
+    imaging_ids = None
+    if args.imaging_ids:
+        imaging_ids_path = Path(args.imaging_ids).resolve()
+        if imaging_ids_path.exists():
+            imaging_ids = load_imaging_ids(imaging_ids_path)
+            print(f"Loaded {len(imaging_ids)} imaging subject IDs")
+    merged = merge_phenotype_tables(
+        phenotype_dir=phenotype_dir,
+        columns=columns,
+        imaging_ids=imaging_ids,
+        drop_missing_threshold=args.missing_threshold,
+    )
+    if merged.empty:
+        print("[ERROR] No data after merging. Check input files.", file=sys.stderr)
+        return 1
+    output_path = Path(args.output).resolve()
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    merged.to_csv(output_path, index=False)
+    print(f"\nWrote {len(merged)} rows x {len(merged.columns)} columns to {output_path}")
+    return 0
+# Run the CLI only when executed as a script; main()'s return value becomes the exit status.
+if __name__ == "__main__":
+    sys.exit(main())