npm - @bgicli/bgicli - Versions diffs - 2.1.1 → 2.2.0 - Mend

@bgicli/bgicli 2.1.1 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1266) hide show

package/data/workflows/clinicaltrials-landscape/scripts/generate_report.py ADDED Viewed

@@ -0,0 +1,1813 @@
+"""
+Generate comprehensive markdown landscape report for clinical trials.
+Config-driven: loads disease-specific settings (mechanism descriptions,
+highlight sections, indication categories) from a YAML config via
+disease_config.py. Falls back to generic behavior when no config is provided.
+Creates a detailed competitive intelligence report with:
+- Executive summary with strategic highlights
+- Mechanism deep-dives with drug pipeline tables
+- Geographic landscape analysis
+- Study design breakdown
+- Phase transition funnel
+- Combination therapy landscape
+- Late-stage pipeline analysis (Phase 3 trials)
+- Phase 3 endpoint comparison
+- Upcoming readouts / expected completion dates
+- Company competitive positioning and portfolio analysis
+- Indication breakdown (config-driven or data-driven)
+- Patient population analysis
+- Trial arms & comparator analysis
+- Collaborator network
+- Regulatory signals
+- Enrollment and investment signals (clean)
+- Biosimilar threat assessment
+- Whitespace / unmet needs analysis
+"""
+import re
+import numpy as np
+import pandas as pd
+from datetime import datetime, timedelta
+try:
+    from disease_config import (
+        get_mechanism_descriptions, get_disease_name, get_disease_short,
+        get_highlight_mechanisms, get_highlight_sponsors,
+        get_executive_highlights, get_indication_categories,
+    )
+except ImportError:
+    from scripts.disease_config import (
+        get_mechanism_descriptions, get_disease_name, get_disease_short,
+        get_highlight_mechanisms, get_highlight_sponsors,
+        get_executive_highlights, get_indication_categories,
+    )
+# ============================================================
+# SAFE COLUMN ACCESS HELPER
+# ============================================================
+def _safe_col(df, col, default=None):
+    """
+    Safely access a DataFrame column, returning a Series of `default` if missing.
+    This allows backward-compatibility when generating a report from older compiled
+    DataFrames that may not contain Phase-2B columns (geographic, study-design, etc.).
+    Parameters
+    ----------
+    df : pd.DataFrame
+    col : str
+        Column name to look up.
+    default : scalar or None
+        Value to fill if column is absent.  None -> pd.NA.
+    Returns
+    -------
+    pd.Series
+    """
+    if col in df.columns:
+        return df[col]
+    return pd.Series(default, index=df.index, name=col)
+def generate_report(trials_df, parameters=None, output_file="landscape_report.md", config=None):
+    """
+    Generate comprehensive markdown landscape report.
+    Parameters
+    ----------
+    trials_df : pd.DataFrame
+        Compiled trials from compile_trials().
+    parameters : dict, optional
+        Query parameters and metadata.
+    output_file : str
+        Output file path.
+    config : dict, optional
+        Disease configuration loaded via load_disease_config().
+        When provided, disease-specific names, descriptions, highlight
+        sections, and indication categories are drawn from config.
+        When None, generic/data-driven fallbacks are used.
+    Returns
+    -------
+    str
+        The generated report as a string.
+    """
+    if parameters is None:
+        parameters = {}
+    # ------------------------------------------------------------------
+    # Load config-driven data (all functions handle config=None gracefully)
+    # ------------------------------------------------------------------
+    disease_name = get_disease_name(config, default="Clinical Trial")
+    disease_short = get_disease_short(config, default="")
+    mechanism_descriptions = get_mechanism_descriptions(config)
+    highlight_mechanisms = get_highlight_mechanisms(config)
+    highlight_sponsors = get_highlight_sponsors(config)
+    exec_highlights = get_executive_highlights(config)
+    indication_cats = get_indication_categories(config)
+    sections = []
+    now = datetime.now()
+    # ==========================================
+    # TITLE
+    # ==========================================
+    sections.append(f"# {disease_name} Clinical Trial Landscape Report\n")
+    sections.append(f"*Competitive Intelligence Report \u2014 Generated: {now.strftime('%Y-%m-%d %H:%M')}*\n")
+    sections.append("---\n")
+    # ==========================================
+    # TABLE OF CONTENTS (dynamic based on config highlight sections)
+    # ==========================================
+    sections.append("## Table of Contents\n")
+    # Fixed sections 1-11
+    toc_entries = [
+        "Executive Summary",
+        "Mechanism \u00d7 Phase Overview",
+        "Mechanism Deep Dives",
+        "Geographic Landscape",
+        "Study Design Analysis",
+        "Phase Transition Funnel",
+        "Late-Stage Pipeline (Phase 3)",
+        "Phase 3 Endpoint Comparison",
+        "Combination Therapy Landscape",
+        "Upcoming Readouts",
+        "Drug-Level Pipeline",
+    ]
+    # Dynamic highlight sections from config (mechanisms then sponsors)
+    for hl in highlight_mechanisms:
+        toc_entries.append(hl["section_title"])
+    for hl in highlight_sponsors:
+        sponsor_title = hl["section_title"]
+        if disease_short and disease_short not in sponsor_title:
+            toc_entries.append(f"{sponsor_title}")
+        else:
+            toc_entries.append(sponsor_title)
+    # Remaining fixed sections
+    toc_entries.extend([
+        "Sponsor Competitive Landscape",
+        "Indication Breakdown",
+        "Patient Population Analysis",
+        "Trial Arms & Comparator Analysis",
+        "Enrollment & Investment Signals",
+        "Biosimilar Landscape",
+        "Whitespace & Unmet Needs",
+        "Collaborator Network",
+        "Regulatory Signals",
+        "Data Notes",
+    ])
+    for i, title in enumerate(toc_entries, 1):
+        anchor = re.sub(r'[^a-z0-9\s-]', '', title.lower()).strip().replace(' ', '-')
+        anchor = re.sub(r'-+', '-', anchor)
+        sections.append(f"{i}. [{title}](#{anchor})")
+    sections.append("")
+    # ==========================================
+    # 1. EXECUTIVE SUMMARY
+    # ==========================================
+    sections.append("---\n")
+    sections.append("## Executive Summary\n")
+    n_total = len(trials_df)
+    n_mechanisms = trials_df["mechanism"].nunique()
+    n_sponsors = trials_df["sponsor_normalized"].nunique()
+    n_industry = int(trials_df["is_industry"].sum())
+    n_academic = n_total - n_industry
+    # Pharmacological mechanism counts (exclude non-pharma and unclassified)
+    non_pharma = ["Non-pharmacological", "Unclassified", "Other Biologic", "Small Molecule (Other)"]
+    pharma_df = trials_df[~trials_df["mechanism"].isin(non_pharma)]
+    n_pharma_mechanisms = pharma_df["mechanism"].nunique()
+    top_mech = trials_df["mechanism"].value_counts()
+    pharma_mechs = top_mech[~top_mech.index.isin(non_pharma)]
+    top_sponsor = trials_df["sponsor_normalized"].value_counts()
+    # Phase counts
+    phase3_count = len(trials_df[trials_df["phase_normalized"] == "Phase 3"])
+    phase2_count = len(trials_df[trials_df["phase_normalized"].isin(["Phase 2", "Phase 2/3"])])
+    phase1_count = len(trials_df[trials_df["phase_normalized"].isin(["Phase 1", "Phase 1/2"])])
+    # Recruiting stats
+    n_recruiting = len(trials_df[trials_df["overall_status"] == "RECRUITING"])
+    n_not_yet = len(trials_df[trials_df["overall_status"] == "NOT_YET_RECRUITING"])
+    sections.append("### Key Figures\n")
+    sections.append(f"| Metric | Value |")
+    sections.append(f"|--------|-------|")
+    sections.append(f"| Total active clinical trials | **{n_total}** |")
+    sections.append(f"| Pharmacological mechanism classes | **{n_pharma_mechanisms}** |")
+    sections.append(f"| Unique sponsors | **{n_sponsors}** |")
+    sections.append(f"| Industry-sponsored | **{n_industry}** ({n_industry/n_total*100:.0f}%) |")
+    sections.append(f"| Academic / Other | **{n_academic}** ({n_academic/n_total*100:.0f}%) |")
+    sections.append(f"| Phase 3 trials | **{phase3_count}** |")
+    sections.append(f"| Phase 2 / 2-3 trials | **{phase2_count}** |")
+    sections.append(f"| Phase 1 / 1-2 trials | **{phase1_count}** |")
+    sections.append(f"| Currently recruiting | **{n_recruiting}** |")
+    sections.append(f"| Not yet recruiting | **{n_not_yet}** |")
+    sections.append("")
+    sections.append("### Strategic Highlights\n")
+    if len(pharma_mechs) > 0:
+        sections.append(f"- **Most active mechanism class:** {pharma_mechs.index[0]} ({pharma_mechs.iloc[0]} trials)")
+    if len(top_sponsor) > 0:
+        sections.append(f"- **Most active sponsor:** {top_sponsor.index[0]} ({top_sponsor.iloc[0]} trials)")
+    # Config-driven executive highlights (mechanisms)
+    exec_mechs = exec_highlights.get("mechanisms", []) if exec_highlights else []
+    exec_spons = exec_highlights.get("sponsors", []) if exec_highlights else []
+    for mech_name in exec_mechs:
+        mech_trials = trials_df[trials_df["mechanism"] == mech_name]
+        if len(mech_trials) > 0:
+            mech_p3 = len(mech_trials[mech_trials["phase_normalized"] == "Phase 3"])
+            p3_note = f" ({mech_p3} in Phase 3)" if mech_p3 > 0 else ""
+            sections.append(f"- **{mech_name}:** {len(mech_trials)} trials{p3_note}")
+    # Config-driven executive highlights (sponsors)
+    for sponsor_name in exec_spons:
+        spons_trials = trials_df[trials_df["sponsor_normalized"] == sponsor_name]
+        if len(spons_trials) > 0:
+            spons_mechs = spons_trials["mechanism"].nunique()
+            sections.append(f"- **{sponsor_name} portfolio:** {len(spons_trials)} active trials across {spons_mechs} mechanism(s)")
+    # Upcoming readouts
+    upcoming = _get_upcoming_readouts(trials_df, months=18)
+    if len(upcoming) > 0:
+        sections.append(f"- **Upcoming readouts (next 18 months):** {len(upcoming)} trials expected to report")
+    # Total enrollment (prefer enrollment_clean, fallback to enrollment)
+    enroll_col = "enrollment_clean" if "enrollment_clean" in trials_df.columns else "enrollment"
+    total_enrollment = trials_df[enroll_col].sum()
+    if total_enrollment > 0:
+        sections.append(f"- **Total enrolled/planned participants:** ~{int(total_enrollment):,}")
+    # --- NEW: Geographic summary ---
+    n_countries_col = _safe_col(trials_df, "n_countries", default=np.nan)
+    if n_countries_col.notna().any():
+        unique_countries = set()
+        for val in _safe_col(trials_df, "countries_str", default=""):
+            if pd.notna(val) and str(val).strip():
+                for c in str(val).split(";"):
+                    c = c.strip()
+                    if c:
+                        unique_countries.add(c)
+        if unique_countries:
+            sections.append(f"- **Geographic reach:** trials conducted across **{len(unique_countries)} countries**")
+    # --- NEW: Study design summary ---
+    study_design_col = _safe_col(trials_df, "study_design_category", default="")
+    rct_mask = study_design_col.astype(str).str.lower().str.contains("rct|randomized", na=False)
+    n_rct = int(rct_mask.sum())
+    if n_rct > 0:
+        sections.append(f"- **RCTs:** {n_rct} ({n_rct/n_total*100:.0f}%) of trials are randomized controlled trials")
+    # --- NEW: Combination therapy summary ---
+    combo_col = _safe_col(trials_df, "is_combination", default=False)
+    n_combo = int(combo_col.fillna(False).astype(bool).sum())
+    if n_combo > 0:
+        sections.append(f"- **Combination therapies:** {n_combo} trials testing multi-drug regimens")
+    sections.append("")
+    # ==========================================
+    # 2. MECHANISM x PHASE OVERVIEW
+    # ==========================================
+    sections.append("---\n")
+    sections.append("## Mechanism \u00d7 Phase Overview\n")
+    phase_order = ["Phase 1", "Phase 1/2", "Phase 2", "Phase 2/3", "Phase 3", "Phase 4"]
+    ct = pd.crosstab(trials_df["mechanism"], trials_df["phase_normalized"])
+    ct = ct.reindex(columns=[p for p in phase_order if p in ct.columns], fill_value=0)
+    ct["Total"] = ct.sum(axis=1)
+    ct = ct.sort_values("Total", ascending=False)
+    cols = list(ct.columns)
+    header = "| Mechanism | " + " | ".join(cols) + " |"
+    sep = "|---|" + "|".join(["---:"] * len(cols)) + "|"
+    sections.append(header)
+    sections.append(sep)
+    for mech, row in ct.iterrows():
+        vals = " | ".join(str(int(row[c])) for c in cols)
+        sections.append(f"| {mech} | {vals} |")
+    sections.append("")
+    # ==========================================
+    # 3. MECHANISM DEEP DIVES
+    # ==========================================
+    sections.append("---\n")
+    sections.append("## Mechanism Deep Dives\n")
+    # Only deep-dive into pharmacological mechanisms with 2+ trials
+    mech_order = pharma_mechs[pharma_mechs >= 2].index.tolist()
+    for mech in mech_order:
+        mech_df = trials_df[trials_df["mechanism"] == mech].copy()
+        sections.append(f"### {mech} ({len(mech_df)} trials)\n")
+        # Description
+        desc = mechanism_descriptions.get(mech, "")
+        if desc:
+            sections.append(f"{desc}\n")
+        # Phase breakdown
+        mech_phases = mech_df["phase_normalized"].value_counts()
+        sections.append("**Phase distribution:**")
+        for phase, count in mech_phases.items():
+            sections.append(f"- {phase}: {count} trial{'s' if count > 1 else ''}")
+        sections.append("")
+        # Status breakdown
+        mech_statuses = mech_df["overall_status"].value_counts()
+        status_parts = [f"{_format_status(s)}: {c}" for s, c in mech_statuses.items()]
+        sections.append(f"**Recruitment status:** {' | '.join(status_parts)}\n")
+        # Key sponsors for this mechanism
+        mech_sponsors = mech_df["sponsor_normalized"].value_counts()
+        if len(mech_sponsors) > 0:
+            sections.append("**Key sponsors:**")
+            for sponsor, count in mech_sponsors.head(8).items():
+                phases = sorted(mech_df[mech_df["sponsor_normalized"] == sponsor]["phase_normalized"].unique())
+                sections.append(f"- **{sponsor}**: {count} trial{'s' if count > 1 else ''} ({', '.join(phases)})")
+            sections.append("")
+        # Drug pipeline table for this mechanism
+        drugs = _extract_drug_table(mech_df, indication_cats=indication_cats)
+        if len(drugs) > 0:
+            sections.append("**Drug pipeline:**\n")
+            sections.append("| Drug / Intervention | Sponsor | Phase | Status | Indication | Enrollment | Expected Completion |")
+            sections.append("|---|---|---|---|---|---:|---|")
+            for _, drug_row in drugs.iterrows():
+                sections.append(
+                    f"| {drug_row['drug']} | {drug_row['sponsor']} | {drug_row['phase']} | "
+                    f"{drug_row['status']} | {drug_row['indication']} | "
+                    f"{drug_row['enrollment']} | {drug_row['completion']} |"
+                )
+            sections.append("")
+        # Notable trials (brief list with NCT IDs)
+        sections.append("**Trial listing:**\n")
+        sections.append("| NCT ID | Title | Sponsor | Phase | Status | Enrollment | Completion |")
+        sections.append("|---|---|---|---|---|---:|---|")
+        for _, trial in mech_df.head(15).iterrows():
+            enrollment_val = _get_enrollment(trial)
+            enrollment_str = f"{int(enrollment_val):,}" if pd.notna(enrollment_val) and enrollment_val > 0 else "\u2014"
+            completion_str = _format_date(trial.get("completion_date", ""))
+            sections.append(
+                f"| {trial['nct_id']} | {trial['brief_title'][:65]} | {trial['sponsor_normalized']} | "
+                f"{trial['phase_normalized']} | {_format_status(trial['overall_status'])} | "
+                f"{enrollment_str} | {completion_str} |"
+            )
+        if len(mech_df) > 15:
+            sections.append(f"\n*... and {len(mech_df) - 15} additional trials*")
+        sections.append("")
+    # ==========================================
+    # 4. GEOGRAPHIC LANDSCAPE
+    # ==========================================
+    sections.append("---\n")
+    sections.append("## Geographic Landscape\n")
+    sections.append(
+        f"Geographic distribution of {disease_short or disease_name} trials reveals where investment and patient "
+        "recruitment are concentrated. Multi-country trials signal global registration "
+        "intent; single-country trials may indicate regional strategies or early-phase work.\n"
+    )
+    countries_str_col = _safe_col(trials_df, "countries_str", default="")
+    n_countries_col = _safe_col(trials_df, "n_countries", default=np.nan)
+    regions_str_col = _safe_col(trials_df, "regions_str", default="")
+    if countries_str_col.astype(str).str.strip().replace("", pd.NA).dropna().shape[0] > 0:
+        # Build country-level counts
+        country_counts = {}
+        for val in countries_str_col:
+            if pd.notna(val) and str(val).strip():
+                for c in str(val).split(";"):
+                    c = c.strip()
+                    if c:
+                        country_counts[c] = country_counts.get(c, 0) + 1
+        if country_counts:
+            sorted_countries = sorted(country_counts.items(), key=lambda x: x[1], reverse=True)
+            sections.append("### Top 15 Countries by Trial Presence\n")
+            sections.append("| Rank | Country | Trials |")
+            sections.append("|---:|---|---:|")
+            for rank, (country, count) in enumerate(sorted_countries[:15], 1):
+                sections.append(f"| {rank} | {country} | {count} |")
+            if len(sorted_countries) > 15:
+                sections.append(f"\n*{len(sorted_countries) - 15} additional countries with active trials.*")
+            sections.append("")
+        # Region breakdown
+        region_counts = {}
+        for val in regions_str_col:
+            if pd.notna(val) and str(val).strip():
+                for r in str(val).split(";"):
+                    r = r.strip()
+                    if r:
+                        region_counts[r] = region_counts.get(r, 0) + 1
+        if region_counts:
+            sorted_regions = sorted(region_counts.items(), key=lambda x: x[1], reverse=True)
+            sections.append("### Region Breakdown\n")
+            sections.append("| Region | Trials |")
+            sections.append("|---|---:|")
+            for region, count in sorted_regions:
+                sections.append(f"| {region} | {count} |")
+            sections.append("")
+        # Mechanism x region cross-tab
+        if region_counts:
+            # Build a DataFrame with one row per trial-region pair
+            rows_mr = []
+            for idx, row in trials_df.iterrows():
+                rval = _safe_col_val(row, "regions_str", "")
+                if pd.notna(rval) and str(rval).strip():
+                    for r in str(rval).split(";"):
+                        r = r.strip()
+                        if r:
+                            rows_mr.append({"mechanism": row["mechanism"], "region": r})
+            if rows_mr:
+                mr_df = pd.DataFrame(rows_mr)
+                mr_ct = pd.crosstab(mr_df["mechanism"], mr_df["region"])
+                mr_ct["Total"] = mr_ct.sum(axis=1)
+                mr_ct = mr_ct.sort_values("Total", ascending=False)
+                sections.append("### Mechanism \u00d7 Region\n")
+                mr_cols = [c for c in mr_ct.columns if c != "Total"] + ["Total"]
+                mr_ct = mr_ct[mr_cols]
+                header = "| Mechanism | " + " | ".join(str(c) for c in mr_cols) + " |"
+                sep_line = "|---|" + "|".join(["---:"] * len(mr_cols)) + "|"
+                sections.append(header)
+                sections.append(sep_line)
+                for mech, vals in mr_ct.iterrows():
+                    row_str = " | ".join(str(int(vals[c])) for c in mr_cols)
+                    sections.append(f"| {mech} | {row_str} |")
+                sections.append("")
+        # Multi-country trial distribution
+        mc = n_countries_col.dropna()
+        if len(mc) > 0:
+            n_multi = int((mc > 1).sum())
+            n_single = int((mc == 1).sum())
+            sections.append("### Multi-Country Trial Distribution\n")
+            sections.append(f"- Single-country: {n_single} trials")
+            sections.append(f"- Multi-country: {n_multi} trials")
+            avg_countries = mc[mc > 1].mean()
+            if pd.notna(avg_countries):
+                sections.append(f"- Average countries per multi-country trial: {avg_countries:.1f}")
+            sections.append("")
+    else:
+        sections.append("*Geographic data not available in current dataset. Re-run compile_trials.py with Phase 2B to populate.*\n")
+    # ==========================================
+    # 5. STUDY DESIGN ANALYSIS
+    # ==========================================
+    sections.append("---\n")
+    sections.append("## Study Design Analysis\n")
+    sections.append(
+        "Study design choices impact data quality, regulatory acceptance, and time to "
+        "market. A high proportion of double-blind RCTs signals mature, registration-intent "
+        "programs.\n"
+    )
+    design_cat_col = _safe_col(trials_df, "study_design_category", default="")
+    allocation_col = _safe_col(trials_df, "allocation", default="")
+    masking_col = _safe_col(trials_df, "masking", default="")
+    model_col = _safe_col(trials_df, "intervention_model", default="")
+    has_design_data = design_cat_col.astype(str).str.strip().replace("", pd.NA).dropna().shape[0] > 0
+    if has_design_data:
+        # Study design category breakdown
+        design_vc = design_cat_col.astype(str).replace("", "Unknown").value_counts()
+        sections.append("### Study Design Category Breakdown\n")
+        sections.append("| Design Category | Trials | % |")
+        sections.append("|---|---:|---:|")
+        for cat, cnt in design_vc.items():
+            if str(cat).strip() and str(cat) != "nan":
+                sections.append(f"| {cat} | {cnt} | {cnt/n_total*100:.0f}% |")
+        sections.append("")
+        # Allocation breakdown
+        alloc_vc = allocation_col.astype(str).replace("", "Not reported").value_counts()
+        if len(alloc_vc) > 0:
+            sections.append("### Allocation\n")
+            sections.append("| Allocation | Trials |")
+            sections.append("|---|---:|")
+            for a, cnt in alloc_vc.items():
+                if str(a) != "nan":
+                    sections.append(f"| {a} | {cnt} |")
+            sections.append("")
+        # Masking breakdown
+        masking_vc = masking_col.astype(str).replace("", "Not reported").value_counts()
+        if len(masking_vc) > 0:
+            sections.append("### Masking / Blinding\n")
+            sections.append("| Masking | Trials |")
+            sections.append("|---|---:|")
+            for m, cnt in masking_vc.items():
+                if str(m) != "nan":
+                    sections.append(f"| {m} | {cnt} |")
+            sections.append("")
+        # % double-blind by phase
+        db_mask = masking_col.astype(str).str.lower().str.contains("double", na=False)
+        if db_mask.any():
+            sections.append("### % Double-Blind by Phase\n")
+            phase_groups = trials_df.groupby("phase_normalized")
+            sections.append("| Phase | Total Trials | Double-Blind | % Double-Blind |")
+            sections.append("|---|---:|---:|---:|")
+            for phase in phase_order:
+                if phase in phase_groups.groups:
+                    idx = phase_groups.groups[phase]
+                    n_phase = len(idx)
+                    n_db = int(db_mask.loc[idx].sum())
+                    pct = n_db / n_phase * 100 if n_phase > 0 else 0
+                    sections.append(f"| {phase} | {n_phase} | {n_db} | {pct:.0f}% |")
+            sections.append("")
+        # Intervention model
+        model_vc = model_col.astype(str).replace("", "Not reported").value_counts()
+        if len(model_vc) > 0:
+            sections.append("### Intervention Model\n")
+            sections.append("| Model | Trials |")
+            sections.append("|---|---:|")
+            for m, cnt in model_vc.items():
+                if str(m) != "nan":
+                    sections.append(f"| {m} | {cnt} |")
+            sections.append("")
+    else:
+        sections.append("*Study design data not available in current dataset. Re-run compile_trials.py with Phase 2B to populate.*\n")
+    # ==========================================
+    # 6. PHASE TRANSITION FUNNEL
+    # ==========================================
+    sections.append("---\n")
+    sections.append("## Phase Transition Funnel\n")
+    sections.append(
+        "The phase funnel shows how many trials each mechanism has at each stage, "
+        "providing a rough indicator of pipeline depth and implied transition rates. "
+        "Higher Phase 2-to-3 ratios suggest stronger conviction by sponsors.\n"
+    )
+    # Phase counts per mechanism
+    funnel_phases = ["Phase 1", "Phase 1/2", "Phase 2", "Phase 2/3", "Phase 3"]
+    funnel_mechs = pharma_mechs[pharma_mechs >= 2].index.tolist()
+    sections.append("### Trials per Phase by Mechanism\n")
+    sections.append("| Mechanism | Ph 1 | Ph 1/2 | Ph 2 | Ph 2/3 | Ph 3 | Total | Ph2\u21923 Ratio |")
+    sections.append("|---|---:|---:|---:|---:|---:|---:|---|")
+    for mech in funnel_mechs:
+        mech_df = trials_df[trials_df["mechanism"] == mech]
+        counts = {}
+        for p in funnel_phases:
+            counts[p] = len(mech_df[mech_df["phase_normalized"] == p])
+        total = sum(counts.values())
+        # Implied transition ratio: Phase 3 / (Phase 2 + Phase 2/3) if denominator > 0
+        denom = counts["Phase 2"] + counts["Phase 2/3"]
+        if denom > 0 and counts["Phase 3"] > 0:
+            ratio = f"{counts['Phase 3']/denom:.1f}"
+        elif counts["Phase 3"] > 0:
+            ratio = "\u221e"  # infinity — no Phase 2 trials
+        else:
+            ratio = "\u2014"
+        sections.append(
+            f"| {mech} | {counts['Phase 1']} | {counts['Phase 1/2']} | "
+            f"{counts['Phase 2']} | {counts['Phase 2/3']} | {counts['Phase 3']} | "
+            f"{total} | {ratio} |"
+        )
+    sections.append("")
+    sections.append(
+        "*Ph2\u21923 Ratio = Phase 3 count / (Phase 2 + Phase 2/3 count). Higher values imply "
+        "strong advancement conviction. \"\u221e\" means Phase 3 trials exist without active Phase 2.*\n"
+    )
+    # ==========================================
+    # 7. LATE-STAGE PIPELINE (Phase 3)
+    # ==========================================
+    sections.append("---\n")
+    sections.append("## Late-Stage Pipeline (Phase 3)\n")
+    p3_df = trials_df[trials_df["phase_normalized"].isin(["Phase 3", "Phase 3/4"])].copy()
+    if len(p3_df) > 0:
+        sections.append(f"**{len(p3_df)} Phase 3 trials** represent the most advanced pipeline with near-term commercial potential.\n")
+        # Phase 3 by mechanism
+        p3_mechs = p3_df["mechanism"].value_counts()
+        sections.append("**Phase 3 by mechanism:**")
+        for mech, count in p3_mechs.items():
+            sections.append(f"- {mech}: {count}")
+        sections.append("")
+        # Full Phase 3 table
+        sections.append("| NCT ID | Drug(s) | Mechanism | Sponsor | Indication | Enrollment | Status | Expected Completion |")
+        sections.append("|---|---|---|---|---|---:|---|---|")
+        for _, trial in p3_df.sort_values("completion_date").iterrows():
+            drugs = trial.get("drug_names_str", "")
+            if not drugs or drugs == "nan":
+                drugs = "\u2014"
+            elif len(drugs) > 40:
+                drugs = drugs[:37] + "..."
+            enrollment_val = _get_enrollment(trial)
+            enrollment_str = f"{int(enrollment_val):,}" if pd.notna(enrollment_val) and enrollment_val > 0 else "\u2014"
+            completion_str = _format_date(trial.get("completion_date", ""))
+            indication = _extract_indication(trial.get("conditions_str", ""), indication_cats=indication_cats)
+            sections.append(
+                f"| {trial['nct_id']} | {drugs} | {trial['mechanism']} | "
+                f"{trial['sponsor_normalized']} | {indication} | {enrollment_str} | "
+                f"{_format_status(trial['overall_status'])} | {completion_str} |"
+            )
+        sections.append("")
+    else:
+        sections.append("*No Phase 3 trials found in the current dataset.*\n")
+    # ==========================================
+    # 8. PHASE 3 ENDPOINT COMPARISON
+    # ==========================================
+    sections.append("---\n")
+    sections.append("## Phase 3 Endpoint Comparison\n")
+    sections.append(
+        "Primary endpoints in Phase 3 trials define the regulatory bar each program must clear. "
+        "Comparing endpoints across mechanisms reveals whether the field is converging on "
+        "standard outcomes (e.g., clinical remission at Week 12/52) or pursuing differentiated strategies.\n"
+    )
+    endpoint_col = _safe_col(trials_df, "primary_endpoint", default="")
+    timeframe_col = _safe_col(trials_df, "endpoint_timeframe", default="")
+    p3_with_endpoint = p3_df.copy() if len(p3_df) > 0 else pd.DataFrame()
+    if len(p3_with_endpoint) > 0:
+        p3_with_endpoint["_endpoint"] = endpoint_col.reindex(p3_with_endpoint.index).fillna("").astype(str)
+        p3_with_endpoint["_timeframe"] = timeframe_col.reindex(p3_with_endpoint.index).fillna("").astype(str)
+        has_ep = p3_with_endpoint["_endpoint"].str.strip().replace("", pd.NA).dropna()
+        if len(has_ep) > 0:
+            sections.append("### Primary Endpoints by Mechanism (Phase 3)\n")
+            sections.append("| NCT ID | Mechanism | Sponsor | Primary Endpoint | Timeframe |")
+            sections.append("|---|---|---|---|---|")
+            for mech in p3_with_endpoint["mechanism"].unique():
+                mech_ep = p3_with_endpoint[p3_with_endpoint["mechanism"] == mech]
+                for _, trial in mech_ep.iterrows():
+                    ep_text = str(trial["_endpoint"]).strip()
+                    if not ep_text or ep_text == "nan":
+                        ep_text = "\u2014"
+                    elif len(ep_text) > 80:
+                        ep_text = ep_text[:77] + "..."
+                    tf_text = str(trial["_timeframe"]).strip()
+                    if not tf_text or tf_text == "nan":
+                        tf_text = "\u2014"
+                    sections.append(
+                        f"| {trial['nct_id']} | {mech} | {trial['sponsor_normalized']} | "
+                        f"{ep_text} | {tf_text} |"
+                    )
+            sections.append("")
+            # Timeframe comparison summary
+            tf_series = p3_with_endpoint["_timeframe"].replace("", pd.NA).dropna()
+            if len(tf_series) > 0:
+                sections.append("### Timeframe Summary\n")
+                tf_counts = tf_series.value_counts().head(10)
+                sections.append("| Timeframe | Trials |")
+                sections.append("|---|---:|")
+                for tf, cnt in tf_counts.items():
+                    if str(tf).strip() and str(tf) != "nan":
+                        sections.append(f"| {tf} | {cnt} |")
+                sections.append("")
+        else:
+            sections.append("*Endpoint data not populated for Phase 3 trials. Re-run compile_trials.py with Phase 2B.*\n")
+    else:
+        sections.append("*No Phase 3 trials in dataset.*\n")
+    # ==========================================
+    # 9. COMBINATION THERAPY LANDSCAPE
+    # ==========================================
+    sections.append("---\n")
+    sections.append("## Combination Therapy Landscape\n")
+    sections.append(
+        f"Combination trials (2+ active drugs) represent a growing strategy in {disease_short or disease_name}, reflecting "
+        "the need for improved efficacy beyond monotherapy. Identifying which mechanism pairs "
+        "are being tested reveals emerging therapeutic paradigms.\n"
+    )
+    combo_col = _safe_col(trials_df, "is_combination", default=False)
+    combo_df = trials_df[combo_col.fillna(False).astype(bool)].copy()
+    if len(combo_df) > 0:
+        sections.append(f"**{len(combo_df)} combination therapy trials** identified.\n")
+        # By mechanism
+        combo_mechs = combo_df["mechanism"].value_counts()
+        sections.append("### Combination Trials by Mechanism\n")
+        sections.append("| Mechanism | Combo Trials | % of Mechanism Total |")
+        sections.append("|---|---:|---:|")
+        for mech, cnt in combo_mechs.items():
+            total_mech = len(trials_df[trials_df["mechanism"] == mech])
+            pct = cnt / total_mech * 100 if total_mech > 0 else 0
+            sections.append(f"| {mech} | {cnt} | {pct:.0f}% |")
+        sections.append("")
+        # Full combo listing
+        sections.append("### Combination Trial Details\n")
+        sections.append("| NCT ID | Drug(s) | Mechanism | Sponsor | Phase | Status |")
+        sections.append("|---|---|---|---|---|---|")
+        for _, trial in combo_df.sort_values(["mechanism", "phase_normalized"]).iterrows():
+            drugs = trial.get("drug_names_str", "")
+            if not drugs or str(drugs) == "nan":
+                drugs = "\u2014"
+            elif len(str(drugs)) > 50:
+                drugs = str(drugs)[:47] + "..."
+            sections.append(
+                f"| {trial['nct_id']} | {drugs} | {trial['mechanism']} | "
+                f"{trial['sponsor_normalized']} | {trial['phase_normalized']} | "
+                f"{_format_status(trial['overall_status'])} |"
+            )
+        sections.append("")
+        # Sponsor activity in combinations
+        combo_sponsors = combo_df["sponsor_normalized"].value_counts()
+        if len(combo_sponsors) > 0:
+            sections.append("### Sponsors Active in Combination Trials\n")
+            sections.append("| Sponsor | Combo Trials |")
+            sections.append("|---|---:|")
+            for sponsor, cnt in combo_sponsors.head(10).items():
+                sections.append(f"| {sponsor} | {cnt} |")
+            sections.append("")
+    else:
+        combo_has_data = "is_combination" in trials_df.columns
+        if combo_has_data:
+            sections.append("*No combination therapy trials identified in the current dataset.*\n")
+        else:
+            sections.append("*Combination therapy data not available. Re-run compile_trials.py with Phase 2B to populate.*\n")
+    # ==========================================
+    # 10. UPCOMING READOUTS
+    # ==========================================
+    sections.append("---\n")
+    sections.append("## Upcoming Readouts\n")
+    upcoming_12 = _get_upcoming_readouts(trials_df, months=12)
+    upcoming_24 = _get_upcoming_readouts(trials_df, months=24)
+    if len(upcoming_24) > 0:
+        sections.append(
+            f"**{len(upcoming_12)} trials** expected to complete within 12 months, "
+            f"**{len(upcoming_24)}** within 24 months.\n"
+        )
+        sections.append("*Sorted by expected primary completion date.*\n")
+        sections.append("| NCT ID | Drug(s) | Mechanism | Sponsor | Phase | Enrollment | Expected Completion | Time to Readout |")
+        sections.append("|---|---|---|---|---|---:|---|---|")
+        for _, trial in upcoming_24.iterrows():
+            drugs = trial.get("drug_names_str", "")
+            if not drugs or str(drugs) == "nan":
+                drugs = "\u2014"
+            elif len(str(drugs)) > 35:
+                drugs = str(drugs)[:32] + "..."
+            enrollment_val = _get_enrollment(trial)
+            enrollment_str = f"{int(enrollment_val):,}" if pd.notna(enrollment_val) and enrollment_val > 0 else "\u2014"
+            completion_str = _format_date(trial.get("completion_date", ""))
+            time_to = _time_to_readout(trial.get("completion_date", ""))
+            sections.append(
+                f"| {trial['nct_id']} | {drugs} | {trial['mechanism']} | "
+                f"{trial['sponsor_normalized']} | {trial['phase_normalized']} | "
+                f"{enrollment_str} | {completion_str} | {time_to} |"
+            )
+        sections.append("")
+    else:
+        sections.append("*No trials with completion dates within 24 months found.*\n")
+    # ==========================================
+    # 11. DRUG-LEVEL PIPELINE
+    # ==========================================
+    sections.append("---\n")
+    sections.append("## Drug-Level Pipeline\n")
+    drug_table = _extract_drug_table(trials_df, indication_cats=indication_cats)
+    if len(drug_table) > 0:
+        sections.append(f"**{len(drug_table)} distinct drug/intervention programs** identified across all mechanisms.\n")
+        sections.append("| Drug / Intervention | Mechanism | Sponsor | Phase | Status | Indication | Enrollment | Completion |")
+        sections.append("|---|---|---|---|---|---|---:|---|")
+        for _, row in drug_table.iterrows():
+            sections.append(
+                f"| {row['drug']} | {row['mechanism']} | {row['sponsor']} | "
+                f"{row['phase']} | {row['status']} | {row['indication']} | "
+                f"{row['enrollment']} | {row['completion']} |"
+            )
+        sections.append("")
+    else:
+        sections.append("*No named drug interventions identified.*\n")
+    # ==========================================
+    # 12+. CONFIG-DRIVEN HIGHLIGHT SECTIONS (mechanisms)
+    # ==========================================
+    for hl in highlight_mechanisms:
+        hl_mech_name = hl["mechanism"]
+        hl_section_title = hl["section_title"]
+        hl_narrative = hl.get("narrative", "")
+        hl_trials = trials_df[trials_df["mechanism"] == hl_mech_name]
+        if len(hl_trials) > 0:
+            sections.append("---\n")
+            sections.append(f"## {hl_section_title}\n")
+            narrative_suffix = f" \u2014 {hl_narrative}" if hl_narrative else ""
+            sections.append(f"**{len(hl_trials)} active trials**{narrative_suffix}.\n")
+            desc = mechanism_descriptions.get(hl_mech_name, "")
+            if desc:
+                sections.append(f"> {desc}\n")
+            # Phase breakdown
+            hl_phases = hl_trials["phase_normalized"].value_counts()
+            for phase, count in hl_phases.items():
+                sections.append(f"- **{phase}:** {count} trials")
+            sections.append("")
+            # Key sponsors + their drugs
+            sections.append("### Key Sponsors & Drug Programs\n")
+            hl_sponsors = hl_trials["sponsor_normalized"].value_counts()
+            for sponsor, count in hl_sponsors.items():
+                sponsor_trials = hl_trials[hl_trials["sponsor_normalized"] == sponsor]
+                drugs = _get_unique_drugs(sponsor_trials)
+                phases = sorted(sponsor_trials["phase_normalized"].unique())
+                drug_str = f" ({', '.join(drugs)})" if drugs else ""
+                enrollments = sponsor_trials["enrollment"].dropna()
+                enroll_str = f", ~{int(enrollments.sum()):,} patients" if len(enrollments) > 0 else ""
+                sections.append(
+                    f"- **{sponsor}**: {count} trial{'s' if count > 1 else ''}{drug_str} "
+                    f"\u2014 {', '.join(phases)}{enroll_str}"
+                )
+            sections.append("")
+            # Indication split
+            sections.append("### Indication Split\n")
+            _add_indication_split(sections, hl_trials, indication_cats=indication_cats)
+            # Complete trial listing
+            sections.append("### Complete Trial Listing\n")
+            sections.append("| NCT ID | Title | Sponsor | Phase | Status | Enrollment | Completion |")
+            sections.append("|---|---|---|---|---|---:|---|")
+            for _, trial in hl_trials.iterrows():
+                enrollment_val = _get_enrollment(trial)
+                enrollment_str = f"{int(enrollment_val):,}" if pd.notna(enrollment_val) and enrollment_val > 0 else "\u2014"
+                completion_str = _format_date(trial.get("completion_date", ""))
+                sections.append(
+                    f"| {trial['nct_id']} | {trial['brief_title'][:60]} | "
+                    f"{trial['sponsor_normalized']} | {trial['phase_normalized']} | "
+                    f"{_format_status(trial['overall_status'])} | {enrollment_str} | {completion_str} |"
+                )
+            sections.append("")
+    # ==========================================
+    # CONFIG-DRIVEN HIGHLIGHT SECTIONS (sponsors)
+    # ==========================================
+    for hl in highlight_sponsors:
+        hl_sponsor_name = hl["sponsor"]
+        hl_section_title = hl["section_title"]
+        hl_trials = trials_df[trials_df["sponsor_normalized"] == hl_sponsor_name]
+        if len(hl_trials) > 0:
+            sections.append("---\n")
+            sections.append(f"## {hl_section_title}\n")
+            sections.append(f"**{len(hl_trials)} active trials** from {hl_sponsor_name}.\n")
+            # Portfolio by mechanism
+            sections.append("### Portfolio by Mechanism\n")
+            hl_mechs = hl_trials["mechanism"].value_counts()
+            for mech, count in hl_mechs.items():
+                mech_trials = hl_trials[hl_trials["mechanism"] == mech]
+                drugs = _get_unique_drugs(mech_trials)
+                phases = sorted(mech_trials["phase_normalized"].unique())
+                drug_str = f": {', '.join(drugs)}" if drugs else ""
+                sections.append(f"- **{mech}** ({count} trial{'s' if count > 1 else ''}){drug_str} \u2014 {', '.join(phases)}")
+            sections.append("")
+            # Indication split
+            sections.append("### Indication Coverage\n")
+            _add_indication_split(sections, hl_trials, indication_cats=indication_cats)
+            # Competitive context
+            sections.append("### Competitive Context\n")
+            for mech in hl_mechs.index:
+                all_mech = trials_df[trials_df["mechanism"] == mech]
+                sponsor_mech = hl_trials[hl_trials["mechanism"] == mech]
+                other_sponsors = all_mech[all_mech["sponsor_normalized"] != hl_sponsor_name]["sponsor_normalized"].nunique()
+                sections.append(
+                    f"- **{mech}**: {hl_sponsor_name} has {len(sponsor_mech)} of {len(all_mech)} total trials "
+                    f"(vs. {other_sponsors} other sponsors)"
+                )
+            sections.append("")
+            # Full trial listing
+            sections.append("### Complete Trial Listing\n")
+            sections.append("| NCT ID | Title | Mechanism | Phase | Status | Enrollment | Completion |")
+            sections.append("|---|---|---|---|---|---:|---|")
+            for _, trial in hl_trials.iterrows():
+                enrollment_val = _get_enrollment(trial)
+                enrollment_str = f"{int(enrollment_val):,}" if pd.notna(enrollment_val) and enrollment_val > 0 else "\u2014"
+                completion_str = _format_date(trial.get("completion_date", ""))
+                sections.append(
+                    f"| {trial['nct_id']} | {trial['brief_title'][:55]} | "
+                    f"{trial['mechanism']} | {trial['phase_normalized']} | "
+                    f"{_format_status(trial['overall_status'])} | {enrollment_str} | {completion_str} |"
+                )
+            sections.append("")
+    # ==========================================
+    # SPONSOR COMPETITIVE LANDSCAPE
+    # ==========================================
+    sections.append("---\n")
+    sections.append("## Sponsor Competitive Landscape\n")
+    # Top sponsors summary table
+    enroll_col = "enrollment_clean" if "enrollment_clean" in trials_df.columns else "enrollment"
+    sponsor_summary = (
+        trials_df.groupby("sponsor_normalized")
+        .agg(
+            trials=("nct_id", "count"),
+            mechanisms=("mechanism", "nunique"),
+            top_mechanism=("mechanism", lambda x: x.value_counts().index[0] if len(x) > 0 else ""),
+            phases=("phase_normalized", lambda x: ", ".join(sorted(x.unique()))),
+            total_enrollment=(enroll_col, "sum"),
+            industry=("is_industry", "first"),
+        )
+        .sort_values("trials", ascending=False)
+    )
+    top_n = 25
+    top_sponsors = sponsor_summary.head(top_n)
+    sections.append(f"### Top {min(top_n, len(top_sponsors))} Sponsors\n")
+    sections.append("| Rank | Sponsor | Trials | Mechanisms | Primary Focus | Phases | Est. Enrollment | Type |")
+    sections.append("|---:|---|---:|---:|---|---|---:|---|")
+    for rank, (sponsor, row) in enumerate(top_sponsors.iterrows(), 1):
+        enroll = f"{int(row['total_enrollment']):,}" if pd.notna(row["total_enrollment"]) and row["total_enrollment"] > 0 else "\u2014"
+        stype = "Industry" if row["industry"] else "Academic"
+        sections.append(
+            f"| {rank} | **{sponsor}** | {int(row['trials'])} | {int(row['mechanisms'])} | "
+            f"{row['top_mechanism']} | {row['phases'][:35]} | {enroll} | {stype} |"
+        )
+    sections.append("")
+    # Company deep-dives for top sponsors
+    sections.append("### Company Portfolio Analysis\n")
+    for sponsor, row in top_sponsors.head(10).iterrows():
+        sponsor_df = trials_df[trials_df["sponsor_normalized"] == sponsor]
+        mech_breakdown = sponsor_df["mechanism"].value_counts()
+        phase_breakdown = sponsor_df["phase_normalized"].value_counts()
+        sections.append(f"**{sponsor}** \u2014 {int(row['trials'])} trials\n")
+        # Mechanism focus
+        mech_parts = [f"{m} ({c})" for m, c in mech_breakdown.items()]
+        sections.append(f"- Mechanisms: {', '.join(mech_parts)}")
+        # Phase coverage
+        phase_parts = [f"{p}: {c}" for p, c in phase_breakdown.items()]
+        sections.append(f"- Phases: {', '.join(phase_parts)}")
+        # Key drugs
+        drugs = _get_unique_drugs(sponsor_df)
+        if drugs:
+            sections.append(f"- Key drugs/interventions: {', '.join(drugs[:5])}")
+        # Indication focus
+        ind_counts = _count_indications(sponsor_df, indication_cats=indication_cats)
+        ind_parts = [f"{label} ({cnt})" for label, _full, cnt in ind_counts if cnt > 0]
+        if ind_parts:
+            sections.append(f"- Indications: {', '.join(ind_parts)}")
+        sections.append("")
+    # Industry vs academic summary
+    sections.append("### Industry vs. Academic Split\n")
+    industry_df = trials_df[trials_df["is_industry"]]
+    academic_df = trials_df[~trials_df["is_industry"]]
+    sections.append(f"| | Industry | Academic/Other |")
+    sections.append(f"|---|---:|---:|")
+    sections.append(f"| Total trials | {len(industry_df)} | {len(academic_df)} |")
+    sections.append(f"| Unique sponsors | {industry_df['sponsor_normalized'].nunique()} | {academic_df['sponsor_normalized'].nunique()} |")
+    sections.append(f"| Phase 3 trials | {len(industry_df[industry_df['phase_normalized'] == 'Phase 3'])} | {len(academic_df[academic_df['phase_normalized'] == 'Phase 3'])} |")
+    sections.append(f"| Mechanisms covered | {industry_df['mechanism'].nunique()} | {academic_df['mechanism'].nunique()} |")
+    ind_enroll = industry_df[enroll_col].sum() if enroll_col in industry_df.columns else industry_df["enrollment"].sum()
+    aca_enroll = academic_df[enroll_col].sum() if enroll_col in academic_df.columns else academic_df["enrollment"].sum()
+    sections.append(f"| Total enrollment | {int(ind_enroll):,} | {int(aca_enroll):,} |")
+    sections.append("")
+    # ==========================================
+    # 16. INDICATION BREAKDOWN
+    # ==========================================
+    sections.append("---\n")
+    sections.append("## Indication Breakdown\n")
+    ind_totals = _count_indications(trials_df, indication_cats=indication_cats)
+    sections.append(f"| Indication | Trials |")
+    sections.append(f"|---|---:|")
+    for label, full_name, cnt in ind_totals:
+        display = f"{full_name} ({label})" if label != full_name else label
+        sections.append(f"| {display} | {cnt} |")
+    sections.append("")
+    # Mechanism x indication
+    ind_labels = [entry[0] for entry in ind_totals]
+    sections.append("### Mechanism \u00d7 Indication\n")
+    header = "| Mechanism | " + " | ".join(ind_labels) + " |"
+    sep = "|---|" + "|".join(["---:"] * len(ind_labels)) + "|"
+    sections.append(header)
+    sections.append(sep)
+    for mech in ct.index:
+        mech_df = trials_df[trials_df["mechanism"] == mech]
+        mech_ind = _count_indications(mech_df, indication_cats=indication_cats)
+        vals = " | ".join(str(entry[2]) for entry in mech_ind)
+        sections.append(f"| {mech} | {vals} |")
+    sections.append("")
+    # ==========================================
+    # 17. PATIENT POPULATION ANALYSIS
+    # ==========================================
+    sections.append("---\n")
+    sections.append("## Patient Population Analysis\n")
+    sections.append(
+        "Eligibility criteria reveal which patient segments are being targeted. "
+        f"Pediatric {disease_short or disease_name} remains underserved relative to adult populations; identifying "
+        "pediatric-specific programs is valuable for portfolio gap analysis.\n"
+    )
+    is_pediatric_col = _safe_col(trials_df, "is_pediatric", default=False)
+    min_age_years_col = _safe_col(trials_df, "min_age_years", default=np.nan)
+    max_age_years_col = _safe_col(trials_df, "max_age_years", default=np.nan)
+    sex_col = _safe_col(trials_df, "sex", default="")
+    has_pop_data = is_pediatric_col.astype(bool).any() or min_age_years_col.notna().any()
+    if has_pop_data:
+        n_ped = int(is_pediatric_col.fillna(False).astype(bool).sum())
+        n_adult = n_total - n_ped
+        sections.append("### Pediatric vs. Adult\n")
+        sections.append(f"| Population | Trials | % |")
+        sections.append(f"|---|---:|---:|")
+        sections.append(f"| Adult-only | {n_adult} | {n_adult/n_total*100:.0f}% |")
+        sections.append(f"| Includes pediatric | {n_ped} | {n_ped/n_total*100:.0f}% |")
+        sections.append("")
+        # Pediatric by mechanism
+        if n_ped > 0:
+            ped_df = trials_df[is_pediatric_col.fillna(False).astype(bool)]
+            ped_mechs = ped_df["mechanism"].value_counts()
+            sections.append("### Pediatric Trials by Mechanism\n")
+            sections.append("| Mechanism | Pediatric Trials |")
+            sections.append("|---|---:|")
+            for mech, cnt in ped_mechs.items():
+                sections.append(f"| {mech} | {cnt} |")
+            sections.append("")
+        # Age range summary
+        min_ages = min_age_years_col.dropna()
+        max_ages = max_age_years_col.dropna()
+        if len(min_ages) > 0 or len(max_ages) > 0:
+            sections.append("### Age Eligibility Summary\n")
+            if len(min_ages) > 0:
+                sections.append(f"- Minimum age range: {min_ages.min():.0f} \u2013 {min_ages.max():.0f} years (median: {min_ages.median():.0f})")
+            if len(max_ages) > 0:
+                sections.append(f"- Maximum age range: {max_ages.min():.0f} \u2013 {max_ages.max():.0f} years (median: {max_ages.median():.0f})")
+            sections.append("")
+        # Gender eligibility
+        sex_vc = sex_col.astype(str).replace("", "Not reported").value_counts()
+        if len(sex_vc) > 0 and not (len(sex_vc) == 1 and "Not reported" in sex_vc.index):
+            sections.append("### Gender Eligibility\n")
+            sections.append("| Sex | Trials |")
+            sections.append("|---|---:|")
+            for s, cnt in sex_vc.items():
+                if str(s) != "nan":
+                    sections.append(f"| {s} | {cnt} |")
+            sections.append("")
+    else:
+        sections.append("*Patient population data not available. Re-run compile_trials.py with Phase 2B to populate.*\n")
+    # ==========================================
+    # 18. TRIAL ARMS & COMPARATOR ANALYSIS
+    # ==========================================
+    sections.append("---\n")
+    sections.append("## Trial Arms & Comparator Analysis\n")
+    sections.append(
+        "Understanding comparator strategies is critical for competitive positioning. "
+        "Head-to-head trials directly compare therapies and carry outsize strategic "
+        "importance for prescriber adoption and market access.\n"
+    )
+    has_placebo_col = _safe_col(trials_df, "has_placebo_arm", default=False)
+    has_active_col = _safe_col(trials_df, "has_active_comparator", default=False)
+    is_h2h_col = _safe_col(trials_df, "is_head_to_head", default=False)
+    n_arms_col = _safe_col(trials_df, "n_arms", default=np.nan)
+    has_arms_data = has_placebo_col.astype(bool).any() or n_arms_col.notna().any()
+    if has_arms_data:
+        n_placebo = int(has_placebo_col.fillna(False).astype(bool).sum())
+        n_active = int(has_active_col.fillna(False).astype(bool).sum())
+        n_h2h = int(is_h2h_col.fillna(False).astype(bool).sum())
+        sections.append("### Comparator Summary\n")
+        sections.append("| Comparator Type | Trials | % |")
+        sections.append("|---|---:|---:|")
+        sections.append(f"| Has placebo arm | {n_placebo} | {n_placebo/n_total*100:.0f}% |")
+        sections.append(f"| Has active comparator | {n_active} | {n_active/n_total*100:.0f}% |")
+        sections.append(f"| Head-to-head | {n_h2h} | {n_h2h/n_total*100:.0f}% |")
+        sections.append("")
+        # Number of arms distribution
+        arms_valid = n_arms_col.dropna()
+        if len(arms_valid) > 0:
+            sections.append("### Number of Arms\n")
+            arms_vc = arms_valid.astype(int).value_counts().sort_index()
+            sections.append("| Arms | Trials |")
+            sections.append("|---:|---:|")
+            for n, cnt in arms_vc.items():
+                sections.append(f"| {n} | {cnt} |")
+            sections.append("")
+        # Head-to-head trial details
+        if n_h2h > 0:
+            h2h_df = trials_df[is_h2h_col.fillna(False).astype(bool)]
+            sections.append("### Head-to-Head Trials\n")
+            sections.append("| NCT ID | Drug(s) | Mechanism | Sponsor | Phase | Status |")
+            sections.append("|---|---|---|---|---|---|")
+            for _, trial in h2h_df.iterrows():
+                drugs = trial.get("drug_names_str", "")
+                if not drugs or str(drugs) == "nan":
+                    drugs = "\u2014"
+                elif len(str(drugs)) > 50:
+                    drugs = str(drugs)[:47] + "..."
+                sections.append(
+                    f"| {trial['nct_id']} | {drugs} | {trial['mechanism']} | "
+                    f"{trial['sponsor_normalized']} | {trial['phase_normalized']} | "
+                    f"{_format_status(trial['overall_status'])} |"
+                )
+            sections.append("")
+        # Placebo vs active by phase
+        sections.append("### Comparator Strategy by Phase\n")
+        sections.append("| Phase | Placebo | Active | Head-to-Head | Neither |")
+        sections.append("|---|---:|---:|---:|---:|")
+        for phase in phase_order:
+            phase_mask = trials_df["phase_normalized"] == phase
+            if phase_mask.sum() == 0:
+                continue
+            n_ph = int(phase_mask.sum())
+            n_pl = int((phase_mask & has_placebo_col.fillna(False).astype(bool)).sum())
+            n_ac = int((phase_mask & has_active_col.fillna(False).astype(bool)).sum())
+            n_hh = int((phase_mask & is_h2h_col.fillna(False).astype(bool)).sum())
+            n_neither = n_ph - max(n_pl, n_ac)
+            if n_neither < 0:
+                n_neither = 0
+            sections.append(f"| {phase} | {n_pl} | {n_ac} | {n_hh} | {n_neither} |")
+        sections.append("")
+    else:
+        sections.append("*Trial arm/comparator data not available. Re-run compile_trials.py with Phase 2B to populate.*\n")
+    # ==========================================
+    # 19. ENROLLMENT & INVESTMENT SIGNALS
+    # ==========================================
+    sections.append("---\n")
+    sections.append("## Enrollment & Investment Signals\n")
+    # Use enrollment_clean if available, else fallback to enrollment
+    enroll_col = "enrollment_clean" if "enrollment_clean" in trials_df.columns else "enrollment"
+    if enroll_col == "enrollment_clean":
+        sections.append("*Using cleaned enrollment figures (outliers flagged/removed).*\n")
+    # Largest trials
+    large_trials = trials_df[trials_df[enroll_col].notna() & (trials_df[enroll_col] > 0)].sort_values(enroll_col, ascending=False)
+    if len(large_trials) > 0:
+        sections.append("### Largest Trials by Enrollment\n")
+        sections.append(f"*Large enrollment signals significant investment and commercial intent.*\n")
+        sections.append("| NCT ID | Drug(s) | Mechanism | Sponsor | Phase | Enrollment | Status |")
+        sections.append("|---|---|---|---|---|---:|---|")
+        for _, trial in large_trials.head(20).iterrows():
+            drugs = trial.get("drug_names_str", "")
+            if not drugs or str(drugs) == "nan":
+                drugs = "\u2014"
+            elif len(str(drugs)) > 35:
+                drugs = str(drugs)[:32] + "..."
+            enrollment_val = trial[enroll_col]
+            sections.append(
+                f"| {trial['nct_id']} | {drugs} | {trial['mechanism']} | "
+                f"{trial['sponsor_normalized']} | {trial['phase_normalized']} | "
+                f"{int(enrollment_val):,} | {_format_status(trial['overall_status'])} |"
+            )
+        sections.append("")
+    # Enrollment by mechanism
+    sections.append("### Total Enrollment by Mechanism\n")
+    mech_enrollment = trials_df.groupby("mechanism")[enroll_col].agg(["sum", "count", "mean"]).sort_values("sum", ascending=False)
+    sections.append("| Mechanism | Total Enrollment | Trials | Avg. per Trial |")
+    sections.append("|---|---:|---:|---:|")
+    for mech, row in mech_enrollment.iterrows():
+        total = f"{int(row['sum']):,}" if pd.notna(row["sum"]) else "\u2014"
+        avg = f"{int(row['mean']):,}" if pd.notna(row["mean"]) else "\u2014"
+        sections.append(f"| {mech} | {total} | {int(row['count'])} | {avg} |")
+    sections.append("")
+    # New trials (started recently)
+    recent_starts = trials_df[trials_df["start_year"].notna()].copy()
+    if len(recent_starts) > 0:
+        current_year = now.year
+        new_trials = recent_starts[recent_starts["start_year"] >= current_year - 1]
+        if len(new_trials) > 0:
+            sections.append(f"### Recently Initiated Trials ({current_year - 1}\u2013{current_year})\n")
+            sections.append(f"**{len(new_trials)} trials** started in {current_year - 1}\u2013{current_year}.\n")
+            new_mechs = new_trials["mechanism"].value_counts()
+            for mech, count in new_mechs.items():
+                sections.append(f"- {mech}: {count}")
+            sections.append("")
+    # ==========================================
+    # 20. BIOSIMILAR LANDSCAPE
+    # ==========================================
+    sections.append("---\n")
+    sections.append("## Biosimilar Landscape\n")
+    sections.append(
+        "Biosimilar trials erode originator revenue and reshape competitive dynamics. "
+        "Mechanisms with heavy biosimilar activity (e.g., Anti-TNF) face price compression, "
+        "while mechanisms with no biosimilars retain pricing power.\n"
+    )
+    biosim_col = _safe_col(trials_df, "is_biosimilar", default=False)
+    has_biosim_data = "is_biosimilar" in trials_df.columns
+    if has_biosim_data:
+        n_biosim = int(biosim_col.fillna(False).astype(bool).sum())
+        n_originator = n_total - n_biosim
+        sections.append(f"**{n_biosim} biosimilar trials** vs. **{n_originator} originator/novel** trials.\n")
+        if n_biosim > 0:
+            biosim_df = trials_df[biosim_col.fillna(False).astype(bool)]
+            # Biosimilar by mechanism
+            biosim_mechs = biosim_df["mechanism"].value_counts()
+            sections.append("### Biosimilar Trials by Mechanism\n")
+            sections.append("| Mechanism | Biosimilar Trials | Originator Trials | Biosimilar % |")
+            sections.append("|---|---:|---:|---:|")
+            for mech in ct.index:
+                mech_total = len(trials_df[trials_df["mechanism"] == mech])
+                mech_biosim = int(biosim_mechs.get(mech, 0))
+                mech_orig = mech_total - mech_biosim
+                pct = mech_biosim / mech_total * 100 if mech_total > 0 else 0
+                if mech_biosim > 0 or mech_total >= 3:  # Only show mechanisms with biosimilars or enough trials
+                    sections.append(f"| {mech} | {mech_biosim} | {mech_orig} | {pct:.0f}% |")
+            sections.append("")
+            # Biosimilar sponsors
+            biosim_sponsors = biosim_df["sponsor_normalized"].value_counts()
+            sections.append("### Top Biosimilar Sponsors\n")
+            sections.append("| Sponsor | Biosimilar Trials |")
+            sections.append("|---|---:|")
+            for sponsor, cnt in biosim_sponsors.head(10).items():
+                sections.append(f"| {sponsor} | {cnt} |")
+            sections.append("")
+        else:
+            sections.append("*No biosimilar trials identified in current dataset.*\n")
+    else:
+        sections.append("*Biosimilar classification not available. Re-run compile_trials.py with Phase 2B to populate.*\n")
+    # ==========================================
+    # 21. WHITESPACE & UNMET NEEDS
+    # ==========================================
+    sections.append("---\n")
+    sections.append("## Whitespace & Unmet Needs\n")
+    sections.append(
+        "Identifying gaps in the current pipeline highlights opportunities for BD and "
+        "portfolio strategy. Mechanisms without Phase 3 programs, indications with thin "
+        "coverage, and under-tested patient segments represent potential whitespace.\n"
+    )
+    # Mechanisms with no Phase 3
+    sections.append("### Mechanisms Without Phase 3 Trials\n")
+    mechs_no_p3 = []
+    for mech in pharma_mechs.index:
+        mech_p3 = len(trials_df[(trials_df["mechanism"] == mech) &
+                                 (trials_df["phase_normalized"].isin(["Phase 3", "Phase 3/4"]))])
+        if mech_p3 == 0:
+            total_for_mech = int(pharma_mechs[mech])
+            mechs_no_p3.append((mech, total_for_mech))
+    if mechs_no_p3:
+        sections.append("| Mechanism | Total Trials | Highest Phase |")
+        sections.append("|---|---:|---|")
+        for mech, total in mechs_no_p3:
+            mech_df_ws = trials_df[trials_df["mechanism"] == mech]
+            highest = mech_df_ws["phase_normalized"].value_counts().index[0] if len(mech_df_ws) > 0 else "\u2014"
+            sections.append(f"| {mech} | {total} | {highest} |")
+        sections.append("")
+    else:
+        sections.append("*All pharmacological mechanisms have at least one Phase 3 trial.*\n")
+    # Indication gaps — mechanisms x indication with 0 trials
+    sections.append("### Underserved Mechanism \u00d7 Indication Combinations\n")
+    sections.append(
+        "*Mechanism-indication pairs where no active trials exist represent potential "
+        "first-mover opportunities.*\n"
+    )
+    gaps = []
+    for mech in pharma_mechs.index[:10]:  # Top 10 mechanisms
+        mech_df_gap = trials_df[trials_df["mechanism"] == mech]
+        mech_ind_counts = _count_indications(mech_df_gap, indication_cats=indication_cats)
+        for label, _full, cnt in mech_ind_counts:
+            # Skip default/catch-all categories for gap analysis
+            is_default = False
+            if indication_cats:
+                for cat in indication_cats:
+                    if cat.get("label") == label and cat.get("is_default"):
+                        is_default = True
+                        break
+            if not is_default and cnt == 0:
+                gaps.append((mech, label))
+    if gaps:
+        sections.append("| Mechanism | Missing Indication |")
+        sections.append("|---|---|")
+        for mech, ind in gaps:
+            sections.append(f"| {mech} | {ind} |")
+        sections.append("")
+    else:
+        sections.append("*No obvious mechanism \u00d7 indication gaps among top mechanisms.*\n")
+    # Pediatric gaps
+    if has_pop_data and n_ped > 0:
+        sections.append("### Pediatric Coverage Gaps\n")
+        ped_mechs_set = set(trials_df[is_pediatric_col.fillna(False).astype(bool)]["mechanism"].unique())
+        all_pharma_mechs = set(pharma_mechs.index)
+        no_ped = all_pharma_mechs - ped_mechs_set
+        if no_ped:
+            sections.append("*Mechanisms with NO pediatric trials:*\n")
+            for mech in sorted(no_ped):
+                sections.append(f"- {mech}")
+            sections.append("")
+        else:
+            sections.append("*All pharmacological mechanisms have at least one pediatric trial.*\n")
+    # ==========================================
+    # 22. COLLABORATOR NETWORK
+    # ==========================================
+    sections.append("---\n")
+    sections.append("## Collaborator Network\n")
+    sections.append(
+        "Collaborating institutions (listed as collaborators on ClinicalTrials.gov) "
+        f"reveal partnership density and key academic centers driving {disease_short or disease_name} research.\n"
+    )
+    collab_col = _safe_col(trials_df, "collaborators_str", default="")
+    has_collab = collab_col.astype(str).str.strip().replace("", pd.NA).dropna().shape[0] > 0
+    if has_collab:
+        # Count collaborator appearances
+        collab_counts = {}
+        for val in collab_col:
+            if pd.notna(val) and str(val).strip():
+                for c in str(val).split(";"):
+                    c = c.strip()
+                    if c:
+                        collab_counts[c] = collab_counts.get(c, 0) + 1
+        if collab_counts:
+            sorted_collabs = sorted(collab_counts.items(), key=lambda x: x[1], reverse=True)
+            sections.append(f"**{len(sorted_collabs)} unique collaborating institutions** identified.\n")
+            sections.append("### Top 20 Collaborators\n")
+            sections.append("| Rank | Collaborator | Trials |")
+            sections.append("|---:|---|---:|")
+            for rank, (collab, cnt) in enumerate(sorted_collabs[:20], 1):
+                sections.append(f"| {rank} | {collab} | {cnt} |")
+            sections.append("")
+            # Trials with collaborators vs. without
+            n_with_collab = int(collab_col.astype(str).str.strip().replace("", pd.NA).dropna().shape[0])
+            sections.append(f"- Trials with listed collaborators: {n_with_collab} ({n_with_collab/n_total*100:.0f}%)")
+            sections.append(f"- Trials without collaborators: {n_total - n_with_collab} ({(n_total - n_with_collab)/n_total*100:.0f}%)")
+            sections.append("")
+        else:
+            sections.append("*No collaborator data found.*\n")
+    else:
+        sections.append("*Collaborator data not available. Re-run compile_trials.py with Phase 2B to populate.*\n")
+    # ==========================================
+    # 23. REGULATORY SIGNALS
+    # ==========================================
+    sections.append("---\n")
+    sections.append("## Regulatory Signals\n")
+    sections.append(
+        "FDA-regulated status and Data Safety Monitoring Board (DSMB) presence are "
+        "proxies for regulatory seriousness and risk management maturity.\n"
+    )
+    fda_col = _safe_col(trials_df, "is_fda_regulated_drug", default=False)
+    dmc_col = _safe_col(trials_df, "has_dmc", default=False)
+    has_reg_data = "is_fda_regulated_drug" in trials_df.columns or "has_dmc" in trials_df.columns
+    if has_reg_data:
+        # FDA-regulated breakdown
+        if "is_fda_regulated_drug" in trials_df.columns:
+            n_fda = int(fda_col.fillna(False).astype(bool).sum())
+            n_non_fda = n_total - n_fda
+            sections.append("### FDA-Regulated Drug Trials\n")
+            sections.append(f"| | Trials | % |")
+            sections.append(f"|---|---:|---:|")
+            sections.append(f"| FDA-regulated | {n_fda} | {n_fda/n_total*100:.0f}% |")
+            sections.append(f"| Non-FDA-regulated | {n_non_fda} | {n_non_fda/n_total*100:.0f}% |")
+            sections.append("")
+            # FDA by mechanism
+            sections.append("### FDA-Regulated by Mechanism\n")
+            sections.append("| Mechanism | FDA-Regulated | Non-FDA | % FDA |")
+            sections.append("|---|---:|---:|---:|")
+            for mech in ct.index:
+                mech_mask = trials_df["mechanism"] == mech
+                mech_n = int(mech_mask.sum())
+                mech_fda = int((mech_mask & fda_col.fillna(False).astype(bool)).sum())
+                pct = mech_fda / mech_n * 100 if mech_n > 0 else 0
+                sections.append(f"| {mech} | {mech_fda} | {mech_n - mech_fda} | {pct:.0f}% |")
+            sections.append("")
+        # DSMB breakdown
+        if "has_dmc" in trials_df.columns:
+            n_dmc = int(dmc_col.fillna(False).astype(bool).sum())
+            sections.append("### Data Safety Monitoring Board (DSMB)\n")
+            sections.append(f"| | Trials | % |")
+            sections.append(f"|---|---:|---:|")
+            sections.append(f"| Has DSMB | {n_dmc} | {n_dmc/n_total*100:.0f}% |")
+            sections.append(f"| No DSMB | {n_total - n_dmc} | {(n_total - n_dmc)/n_total*100:.0f}% |")
+            sections.append("")
+            # DSMB rate by phase
+            sections.append("### DSMB Rate by Phase\n")
+            sections.append("| Phase | Has DSMB | Total | % DSMB |")
+            sections.append("|---|---:|---:|---:|")
+            for phase in phase_order:
+                phase_mask = trials_df["phase_normalized"] == phase
+                if phase_mask.sum() == 0:
+                    continue
+                n_ph = int(phase_mask.sum())
+                n_dmc_ph = int((phase_mask & dmc_col.fillna(False).astype(bool)).sum())
+                pct = n_dmc_ph / n_ph * 100 if n_ph > 0 else 0
+                sections.append(f"| {phase} | {n_dmc_ph} | {n_ph} | {pct:.0f}% |")
+            sections.append("")
+    else:
+        sections.append("*Regulatory signal data not available. Re-run compile_trials.py with Phase 2B to populate.*\n")
+    # ==========================================
+    # 24. DATA NOTES
+    # ==========================================
+    sections.append("---\n")
+    sections.append("## Data Notes\n")
+    sections.append(f"- **Source:** ClinicalTrials.gov API v2 (free, no authentication)")
+    sections.append(f"- **Query date:** {now.strftime('%Y-%m-%d')}")
+    if parameters:
+        conditions = parameters.get("conditions", [])
+        if conditions:
+            sections.append(f"- **Conditions searched:** {', '.join(conditions)}")
+        statuses = parameters.get("statuses", [])
+        if statuses:
+            sections.append(f"- **Status filter:** {', '.join(_format_status(s) for s in statuses)}")
+        highlight_mech = parameters.get("highlight_mechanism")
+        if highlight_mech:
+            sections.append(f"- **Mechanism focus:** {highlight_mech}")
+        highlight_sponsor = parameters.get("highlight_sponsor")
+        if highlight_sponsor:
+            sections.append(f"- **Sponsor focus:** {highlight_sponsor}")
+    unclassified = trials_df[trials_df["mechanism"].isin(["Unclassified", "Other Biologic", "Small Molecule (Other)"])]
+    if len(unclassified) > 0:
+        sections.append(
+            f"- **Unclassified trials:** {len(unclassified)} ({len(unclassified)/n_total*100:.0f}% of total) "
+            f"\u2014 trials with generic or unrecognized interventions"
+        )
+    sections.append(f"- **Classification method:** Pattern matching on intervention names, descriptions, and trial titles")
+    sections.append(f"- **Limitations:** Mechanism classification is automated and may misclassify novel or ambiguous interventions. "
+                    f"Completion dates are sponsor-reported estimates and may change.")
+    if "enrollment_clean" in trials_df.columns:
+        n_outliers = int(_safe_col(trials_df, "enrollment_outlier", default=False).fillna(False).astype(bool).sum())
+        if n_outliers > 0:
+            sections.append(f"- **Enrollment cleaning:** {n_outliers} enrollment outlier(s) flagged and excluded from summary statistics")
+    sections.append("")
+    # ==========================================
+    # BUILD AND WRITE
+    # ==========================================
+    report = "\n".join(sections)
+    with open(output_file, "w") as f:
+        f.write(report)
+    print(f"\u2713 Markdown report generated: {output_file}")
+    return report
+# ============================================================
+# HELPER FUNCTIONS
+# ============================================================
+def _safe_col_val(row, col, default=""):
+    """Safely get a value from a row (Series), returning default if key missing."""
+    try:
+        val = row[col]
+        if pd.isna(val):
+            return default
+        return val
+    except (KeyError, IndexError):
+        return default
+def _get_enrollment(trial):
+    """
+    Get enrollment from a trial row, preferring enrollment_clean over enrollment.
+    Returns the numeric value or NaN.
+    """
+    for col in ("enrollment_clean", "enrollment"):
+        try:
+            val = trial[col]
+            if pd.notna(val) and val > 0:
+                return val
+        except (KeyError, IndexError):
+            continue
+    return np.nan
+def _format_status(status):
+    """Convert API status to readable format."""
+    status_map = {
+        "RECRUITING": "Recruiting",
+        "ACTIVE_NOT_RECRUITING": "Active, not recruiting",
+        "NOT_YET_RECRUITING": "Not yet recruiting",
+        "ENROLLING_BY_INVITATION": "Enrolling by invitation",
+        "COMPLETED": "Completed",
+        "SUSPENDED": "Suspended",
+        "TERMINATED": "Terminated",
+        "WITHDRAWN": "Withdrawn",
+    }
+    return status_map.get(status, status)
+def _format_date(date_str):
+    """Format date string to readable format, return dash if empty."""
+    if not date_str or str(date_str) == "nan":
+        return "\u2014"
+    # Clean up: just return the date as-is but shorter
+    date_str = str(date_str).strip()
+    if len(date_str) > 10:
+        date_str = date_str[:10]
+    return date_str if date_str else "\u2014"
+def _time_to_readout(completion_date):
+    """Calculate time from now to expected completion."""
+    if not completion_date or str(completion_date) == "nan":
+        return "\u2014"
+    try:
+        date_str = str(completion_date).strip()
+        # Parse various date formats
+        for fmt in ("%Y-%m-%d", "%Y-%m", "%Y"):
+            try:
+                dt = datetime.strptime(date_str[:len(fmt.replace("%", "").replace("-", "") + date_str[:4])], fmt)
+                break
+            except ValueError:
+                continue
+        else:
+            # Try simple parse
+            if len(date_str) >= 7:
+                dt = datetime.strptime(date_str[:7], "%Y-%m")
+            elif len(date_str) >= 4:
+                dt = datetime.strptime(date_str[:4], "%Y")
+            else:
+                return "\u2014"
+        delta = dt - datetime.now()
+        months = delta.days / 30.44
+        if months < 0:
+            return "Overdue"
+        elif months < 1:
+            return "<1 month"
+        elif months < 12:
+            return f"~{int(months)} months"
+        else:
+            years = months / 12
+            return f"~{years:.1f} years"
+    except (ValueError, TypeError):
+        return "\u2014"
+def _get_upcoming_readouts(trials_df, months=18):
+    """Get trials with completion dates within the next N months."""
+    now = datetime.now()
+    cutoff = now + timedelta(days=months * 30.44)
+    upcoming = []
+    for _, trial in trials_df.iterrows():
+        cd = trial.get("completion_date", "")
+        if not cd or str(cd) == "nan":
+            continue
+        try:
+            date_str = str(cd).strip()
+            if len(date_str) >= 10:
+                dt = datetime.strptime(date_str[:10], "%Y-%m-%d")
+            elif len(date_str) >= 7:
+                dt = datetime.strptime(date_str[:7], "%Y-%m")
+            elif len(date_str) >= 4:
+                dt = datetime.strptime(date_str[:4], "%Y")
+            else:
+                continue
+            if now <= dt <= cutoff:
+                upcoming.append(trial)
+        except (ValueError, TypeError):
+            continue
+    if not upcoming:
+        return pd.DataFrame()
+    result = pd.DataFrame(upcoming)
+    # Sort by completion date
+    result = result.sort_values("completion_date")
+    return result
+def _extract_drug_table(df, indication_cats=None):
+    """Extract a drug-level pipeline table from trials DataFrame."""
+    records = []
+    seen_drugs = set()
+    for _, trial in df.iterrows():
+        drugs_str = trial.get("drug_names_str", "")
+        if not drugs_str or str(drugs_str) == "nan":
+            continue
+        drugs = [d.strip() for d in str(drugs_str).split(";") if d.strip()]
+        for drug in drugs:
+            # Skip placebo, generic names
+            drug_lower = drug.lower()
+            if any(skip in drug_lower for skip in ["placebo", "saline", "sodium chloride", "standard of care",
+                                                     "standard care", "usual care", "sham", "no intervention"]):
+                continue
+            # Deduplicate by drug + sponsor
+            key = (drug_lower, trial.get("sponsor_normalized", ""))
+            if key in seen_drugs:
+                continue
+            seen_drugs.add(key)
+            enrollment_val = _get_enrollment(trial)
+            enrollment_str = f"{int(enrollment_val):,}" if pd.notna(enrollment_val) and enrollment_val > 0 else "\u2014"
+            indication = _extract_indication(trial.get("conditions_str", ""), indication_cats=indication_cats)
+            completion_str = _format_date(trial.get("completion_date", ""))
+            records.append({
+                "drug": drug[:40],
+                "mechanism": trial.get("mechanism", ""),
+                "sponsor": trial.get("sponsor_normalized", ""),
+                "phase": trial.get("phase_normalized", ""),
+                "status": _format_status(trial.get("overall_status", "")),
+                "indication": indication,
+                "enrollment": enrollment_str,
+                "completion": completion_str,
+                "phase_numeric": trial.get("phase_numeric", 0),
+            })
+    if not records:
+        return pd.DataFrame()
+    result = pd.DataFrame(records)
+    result = result.sort_values(["phase_numeric", "drug"], ascending=[False, True])
+    return result.drop(columns=["phase_numeric"])
+def _get_unique_drugs(df):
+    """Extract unique meaningful drug names from a trials DataFrame."""
+    drugs = set()
+    skip_terms = {"placebo", "saline", "sodium chloride", "standard of care", "standard care",
+                  "usual care", "sham", "no intervention", "data collection"}
+    for _, trial in df.iterrows():
+        drugs_str = trial.get("drug_names_str", "")
+        if not drugs_str or str(drugs_str) == "nan":
+            continue
+        for d in str(drugs_str).split(";"):
+            d = d.strip()
+            if d and not any(skip in d.lower() for skip in skip_terms):
+                drugs.add(d)
+    return sorted(drugs)
+def _extract_indication(conditions_str, indication_cats=None):
+    """
+    Extract simplified indication from conditions string.
+    Parameters
+    ----------
+    conditions_str : str
+        The trial's conditions string.
+    indication_cats : list of dict, optional
+        Config-driven indication categories. Each dict has 'label',
+        'pattern' (substring match), and optionally 'is_default'.
+        When None, falls back to returning the raw conditions (truncated).
+    """
+    if not conditions_str or str(conditions_str) == "nan":
+        return "\u2014"
+    conditions_lower = str(conditions_str).lower()
+    if indication_cats:
+        matched = []
+        for cat in indication_cats:
+            if cat.get("is_default"):
+                continue
+            if cat.get("pattern") and cat["pattern"].lower() in conditions_lower:
+                matched.append(cat["label"])
+        if matched:
+            return " + ".join(matched)
+        # Check for default category
+        for cat in indication_cats:
+            if cat.get("is_default"):
+                return cat["label"]
+        return "Other"
+    else:
+        # No config: return truncated conditions
+        raw = str(conditions_str).strip()
+        return raw[:40] if len(raw) > 40 else raw if raw else "Other"
+def _count_indications(df, indication_cats=None):
+    """
+    Count trials per indication category.
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Trials DataFrame.
+    indication_cats : list of dict, optional
+        Config-driven indication categories. When None, falls back to
+        counting top conditions from the data.
+    Returns
+    -------
+    list of (label, full_name, count) tuples if indication_cats provided,
+    or (uc_count, cd_count, other_count) tuple for backward compatibility.
+    """
+    if indication_cats:
+        counts = []
+        matched_indices = set()
+        for cat in indication_cats:
+            if cat.get("is_default"):
+                continue
+            label = cat["label"]
+            full_name = cat.get("full_name", label)
+            pattern = cat.get("pattern", "").lower()
+            cat_count = 0
+            for idx, trial in df.iterrows():
+                cond = str(trial.get("conditions_str", "")).lower()
+                if pattern and pattern in cond:
+                    cat_count += 1
+                    matched_indices.add(idx)
+            counts.append((label, full_name, cat_count))
+        # Handle default category (remaining unmatched)
+        for cat in indication_cats:
+            if cat.get("is_default"):
+                label = cat["label"]
+                full_name = cat.get("full_name", label)
+                default_count = len(df) - len(matched_indices)
+                counts.append((label, full_name, default_count))
+                break
+        return counts
+    else:
+        # Fallback: no config, return top conditions from data
+        cond_counts = {}
+        for _, trial in df.iterrows():
+            cond = str(trial.get("conditions_str", "")).strip()
+            if cond and cond != "nan":
+                # Split semicolon-separated conditions
+                for c in cond.split(";"):
+                    c = c.strip()
+                    if c:
+                        cond_counts[c] = cond_counts.get(c, 0) + 1
+        # Return as list of (label, full_name, count) tuples
+        sorted_conds = sorted(cond_counts.items(), key=lambda x: x[1], reverse=True)
+        return [(c, c, n) for c, n in sorted_conds[:10]]
+def _add_indication_split(sections, df, indication_cats=None):
+    """Add indication split analysis to sections."""
+    counts = _count_indications(df, indication_cats=indication_cats)
+    total = len(df)
+    for entry in counts:
+        label, full_name, cnt = entry
+        display = f"{full_name} ({label})" if label != full_name else label
+        if total > 0:
+            sections.append(f"- {display}: {cnt} ({cnt/total*100:.0f}%)")
+        else:
+            sections.append(f"- {display}: 0")
+    sections.append("")