@bgicli/bgicli 2.1.1 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/data/skills/aav-vector-design-agent/SKILL.md +198 -0
- package/data/skills/adaptyv/SKILL.md +112 -0
- package/data/skills/adhd-daily-planner/SKILL.md +271 -0
- package/data/skills/aeon/SKILL.md +372 -0
- package/data/skills/agent-browser/SKILL.md +159 -0
- package/data/skills/agentd-drug-discovery/SKILL.md +52 -0
- package/data/skills/ai-analyzer/SKILL.md +218 -0
- package/data/skills/alphafold/SKILL.md +183 -0
- package/data/skills/alphafold-database/SKILL.md +500 -0
- package/data/skills/anndata/SKILL.md +394 -0
- package/data/skills/antibody-design-agent/SKILL.md +64 -0
- package/data/skills/arboreto/SKILL.md +237 -0
- package/data/skills/armored-cart-design-agent/SKILL.md +225 -0
- package/data/skills/arxiv-search/SKILL.md +224 -0
- package/data/skills/autonomous-oncology-agent/SKILL.md +77 -0
- package/data/skills/bayesian-optimizer/SKILL.md +60 -0
- package/data/skills/benchling-integration/SKILL.md +473 -0
- package/data/skills/bgpt-paper-search/SKILL.md +81 -0
- package/data/skills/bindcraft/SKILL.md +198 -0
- package/data/skills/binder-design/SKILL.md +182 -0
- package/data/skills/binding-characterization/SKILL.md +234 -0
- package/data/skills/bindingdb-database/SKILL.md +332 -0
- package/data/skills/bio-admet-prediction/SKILL.md +224 -0
- package/data/skills/bio-alignment-files-bam-statistics/SKILL.md +340 -0
- package/data/skills/bio-alignment-filtering/SKILL.md +322 -0
- package/data/skills/bio-alignment-indexing/SKILL.md +249 -0
- package/data/skills/bio-alignment-io/SKILL.md +301 -0
- package/data/skills/bio-alignment-msa-parsing/SKILL.md +366 -0
- package/data/skills/bio-alignment-msa-statistics/SKILL.md +375 -0
- package/data/skills/bio-alignment-pairwise/SKILL.md +277 -0
- package/data/skills/bio-alignment-sorting/SKILL.md +296 -0
- package/data/skills/bio-alignment-validation/SKILL.md +374 -0
- package/data/skills/bio-atac-seq-atac-peak-calling/SKILL.md +221 -0
- package/data/skills/bio-atac-seq-atac-qc/SKILL.md +292 -0
- package/data/skills/bio-atac-seq-differential-accessibility/SKILL.md +268 -0
- package/data/skills/bio-atac-seq-footprinting/SKILL.md +256 -0
- package/data/skills/bio-atac-seq-motif-deviation/SKILL.md +319 -0
- package/data/skills/bio-atac-seq-nucleosome-positioning/SKILL.md +321 -0
- package/data/skills/bio-basecalling/SKILL.md +368 -0
- package/data/skills/bio-batch-downloads/SKILL.md +384 -0
- package/data/skills/bio-batch-processing/SKILL.md +303 -0
- package/data/skills/bio-bedgraph-handling/SKILL.md +336 -0
- package/data/skills/bio-blast-searches/SKILL.md +354 -0
- package/data/skills/bio-causal-genomics-colocalization-analysis/SKILL.md +264 -0
- package/data/skills/bio-causal-genomics-fine-mapping/SKILL.md +267 -0
- package/data/skills/bio-causal-genomics-mediation-analysis/SKILL.md +264 -0
- package/data/skills/bio-causal-genomics-mendelian-randomization/SKILL.md +221 -0
- package/data/skills/bio-causal-genomics-pleiotropy-detection/SKILL.md +292 -0
- package/data/skills/bio-cfdna-preprocessing/SKILL.md +200 -0
- package/data/skills/bio-chipseq-differential-binding/SKILL.md +262 -0
- package/data/skills/bio-chipseq-motif-analysis/SKILL.md +387 -0
- package/data/skills/bio-chipseq-peak-annotation/SKILL.md +239 -0
- package/data/skills/bio-chipseq-peak-calling/SKILL.md +277 -0
- package/data/skills/bio-chipseq-qc/SKILL.md +391 -0
- package/data/skills/bio-chipseq-super-enhancers/SKILL.md +288 -0
- package/data/skills/bio-chipseq-visualization/SKILL.md +289 -0
- package/data/skills/bio-clinical-databases-clinvar-lookup/SKILL.md +188 -0
- package/data/skills/bio-clinical-databases-dbsnp-queries/SKILL.md +171 -0
- package/data/skills/bio-clinical-databases-gnomad-frequencies/SKILL.md +205 -0
- package/data/skills/bio-clinical-databases-hla-typing/SKILL.md +248 -0
- package/data/skills/bio-clinical-databases-myvariant-queries/SKILL.md +174 -0
- package/data/skills/bio-clinical-databases-pharmacogenomics/SKILL.md +232 -0
- package/data/skills/bio-clinical-databases-polygenic-risk/SKILL.md +276 -0
- package/data/skills/bio-clinical-databases-somatic-signatures/SKILL.md +261 -0
- package/data/skills/bio-clinical-databases-tumor-mutational-burden/SKILL.md +301 -0
- package/data/skills/bio-clinical-databases-variant-prioritization/SKILL.md +225 -0
- package/data/skills/bio-clip-seq-binding-site-annotation/SKILL.md +66 -0
- package/data/skills/bio-clip-seq-clip-alignment/SKILL.md +70 -0
- package/data/skills/bio-clip-seq-clip-motif-analysis/SKILL.md +62 -0
- package/data/skills/bio-clip-seq-clip-peak-calling/SKILL.md +282 -0
- package/data/skills/bio-clip-seq-clip-preprocessing/SKILL.md +142 -0
- package/data/skills/bio-codon-usage/SKILL.md +353 -0
- package/data/skills/bio-comparative-genomics-ancestral-reconstruction/SKILL.md +312 -0
- package/data/skills/bio-comparative-genomics-hgt-detection/SKILL.md +341 -0
- package/data/skills/bio-comparative-genomics-ortholog-inference/SKILL.md +308 -0
- package/data/skills/bio-comparative-genomics-positive-selection/SKILL.md +354 -0
- package/data/skills/bio-comparative-genomics-synteny-analysis/SKILL.md +315 -0
- package/data/skills/bio-compressed-files/SKILL.md +263 -0
- package/data/skills/bio-consensus-sequences/SKILL.md +340 -0
- package/data/skills/bio-copy-number-cnv-annotation/SKILL.md +307 -0
- package/data/skills/bio-copy-number-cnv-visualization/SKILL.md +294 -0
- package/data/skills/bio-copy-number-cnvkit-analysis/SKILL.md +290 -0
- package/data/skills/bio-copy-number-gatk-cnv/SKILL.md +270 -0
- package/data/skills/bio-crispr-screens-base-editing-analysis/SKILL.md +110 -0
- package/data/skills/bio-crispr-screens-batch-correction/SKILL.md +316 -0
- package/data/skills/bio-crispr-screens-crispresso-editing/SKILL.md +205 -0
- package/data/skills/bio-crispr-screens-hit-calling/SKILL.md +264 -0
- package/data/skills/bio-crispr-screens-jacks-analysis/SKILL.md +313 -0
- package/data/skills/bio-crispr-screens-library-design/SKILL.md +417 -0
- package/data/skills/bio-crispr-screens-mageck-analysis/SKILL.md +222 -0
- package/data/skills/bio-crispr-screens-screen-qc/SKILL.md +243 -0
- package/data/skills/bio-ctdna-mutation-detection/SKILL.md +234 -0
- package/data/skills/bio-data-visualization-circos-plots/SKILL.md +405 -0
- package/data/skills/bio-data-visualization-color-palettes/SKILL.md +244 -0
- package/data/skills/bio-data-visualization-genome-browser-tracks/SKILL.md +328 -0
- package/data/skills/bio-data-visualization-genome-tracks/SKILL.md +249 -0
- package/data/skills/bio-data-visualization-ggplot2-fundamentals/SKILL.md +313 -0
- package/data/skills/bio-data-visualization-heatmaps-clustering/SKILL.md +227 -0
- package/data/skills/bio-data-visualization-interactive-visualization/SKILL.md +210 -0
- package/data/skills/bio-data-visualization-multipanel-figures/SKILL.md +274 -0
- package/data/skills/bio-data-visualization-specialized-omics-plots/SKILL.md +251 -0
- package/data/skills/bio-data-visualization-upset-plots/SKILL.md +228 -0
- package/data/skills/bio-data-visualization-volcano-customization/SKILL.md +233 -0
- package/data/skills/bio-de-deseq2-basics/SKILL.md +376 -0
- package/data/skills/bio-de-edger-basics/SKILL.md +418 -0
- package/data/skills/bio-de-results/SKILL.md +378 -0
- package/data/skills/bio-de-visualization/SKILL.md +408 -0
- package/data/skills/bio-differential-expression-batch-correction/SKILL.md +253 -0
- package/data/skills/bio-differential-expression-timeseries-de/SKILL.md +370 -0
- package/data/skills/bio-differential-splicing/SKILL.md +177 -0
- package/data/skills/bio-duplicate-handling/SKILL.md +292 -0
- package/data/skills/bio-entrez-fetch/SKILL.md +334 -0
- package/data/skills/bio-entrez-link/SKILL.md +325 -0
- package/data/skills/bio-entrez-search/SKILL.md +311 -0
- package/data/skills/bio-epidemiological-genomics-amr-surveillance/SKILL.md +233 -0
- package/data/skills/bio-epidemiological-genomics-pathogen-typing/SKILL.md +202 -0
- package/data/skills/bio-epidemiological-genomics-phylodynamics/SKILL.md +207 -0
- package/data/skills/bio-epidemiological-genomics-transmission-inference/SKILL.md +237 -0
- package/data/skills/bio-epidemiological-genomics-variant-surveillance/SKILL.md +237 -0
- package/data/skills/bio-epitranscriptomics-m6a-differential/SKILL.md +88 -0
- package/data/skills/bio-epitranscriptomics-m6a-peak-calling/SKILL.md +89 -0
- package/data/skills/bio-epitranscriptomics-m6anet-analysis/SKILL.md +101 -0
- package/data/skills/bio-epitranscriptomics-merip-preprocessing/SKILL.md +81 -0
- package/data/skills/bio-epitranscriptomics-modification-visualization/SKILL.md +98 -0
- package/data/skills/bio-experimental-design-batch-design/SKILL.md +110 -0
- package/data/skills/bio-experimental-design-multiple-testing/SKILL.md +98 -0
- package/data/skills/bio-experimental-design-power-analysis/SKILL.md +84 -0
- package/data/skills/bio-experimental-design-sample-size/SKILL.md +93 -0
- package/data/skills/bio-expression-matrix-counts-ingest/SKILL.md +220 -0
- package/data/skills/bio-expression-matrix-gene-id-mapping/SKILL.md +256 -0
- package/data/skills/bio-expression-matrix-metadata-joins/SKILL.md +271 -0
- package/data/skills/bio-expression-matrix-sparse-handling/SKILL.md +247 -0
- package/data/skills/bio-fastq-quality/SKILL.md +279 -0
- package/data/skills/bio-filter-sequences/SKILL.md +265 -0
- package/data/skills/bio-flow-cytometry-bead-normalization/SKILL.md +315 -0
- package/data/skills/bio-flow-cytometry-clustering-phenotyping/SKILL.md +237 -0
- package/data/skills/bio-flow-cytometry-compensation-transformation/SKILL.md +196 -0
- package/data/skills/bio-flow-cytometry-cytometry-qc/SKILL.md +382 -0
- package/data/skills/bio-flow-cytometry-differential-analysis/SKILL.md +217 -0
- package/data/skills/bio-flow-cytometry-doublet-detection/SKILL.md +288 -0
- package/data/skills/bio-flow-cytometry-fcs-handling/SKILL.md +221 -0
- package/data/skills/bio-flow-cytometry-gating-analysis/SKILL.md +193 -0
- package/data/skills/bio-format-conversion/SKILL.md +193 -0
- package/data/skills/bio-fragment-analysis/SKILL.md +214 -0
- package/data/skills/bio-gatk-variant-calling/SKILL.md +422 -0
- package/data/skills/bio-genome-assembly-assembly-polishing/SKILL.md +333 -0
- package/data/skills/bio-genome-assembly-assembly-qc/SKILL.md +344 -0
- package/data/skills/bio-genome-assembly-contamination-detection/SKILL.md +235 -0
- package/data/skills/bio-genome-assembly-hifi-assembly/SKILL.md +178 -0
- package/data/skills/bio-genome-assembly-long-read-assembly/SKILL.md +307 -0
- package/data/skills/bio-genome-assembly-metagenome-assembly/SKILL.md +227 -0
- package/data/skills/bio-genome-assembly-scaffolding/SKILL.md +204 -0
- package/data/skills/bio-genome-assembly-short-read-assembly/SKILL.md +319 -0
- package/data/skills/bio-genome-engineering-base-editing-design/SKILL.md +277 -0
- package/data/skills/bio-genome-engineering-grna-design/SKILL.md +221 -0
- package/data/skills/bio-genome-engineering-hdr-template-design/SKILL.md +264 -0
- package/data/skills/bio-genome-engineering-off-target-prediction/SKILL.md +232 -0
- package/data/skills/bio-genome-engineering-prime-editing-design/SKILL.md +275 -0
- package/data/skills/bio-genome-intervals-bed-file-basics/SKILL.md +357 -0
- package/data/skills/bio-genome-intervals-bigwig-tracks/SKILL.md +351 -0
- package/data/skills/bio-genome-intervals-coverage-analysis/SKILL.md +300 -0
- package/data/skills/bio-genome-intervals-gtf-gff-handling/SKILL.md +345 -0
- package/data/skills/bio-genome-intervals-interval-arithmetic/SKILL.md +485 -0
- package/data/skills/bio-genome-intervals-proximity-operations/SKILL.md +337 -0
- package/data/skills/bio-geo-data/SKILL.md +380 -0
- package/data/skills/bio-hi-c-analysis-compartment-analysis/SKILL.md +261 -0
- package/data/skills/bio-hi-c-analysis-contact-pairs/SKILL.md +278 -0
- package/data/skills/bio-hi-c-analysis-hic-data-io/SKILL.md +260 -0
- package/data/skills/bio-hi-c-analysis-hic-differential/SKILL.md +328 -0
- package/data/skills/bio-hi-c-analysis-hic-visualization/SKILL.md +297 -0
- package/data/skills/bio-hi-c-analysis-loop-calling/SKILL.md +284 -0
- package/data/skills/bio-hi-c-analysis-matrix-operations/SKILL.md +274 -0
- package/data/skills/bio-hi-c-analysis-tad-detection/SKILL.md +239 -0
- package/data/skills/bio-imaging-mass-cytometry-cell-segmentation/SKILL.md +241 -0
- package/data/skills/bio-imaging-mass-cytometry-data-preprocessing/SKILL.md +279 -0
- package/data/skills/bio-imaging-mass-cytometry-interactive-annotation/SKILL.md +304 -0
- package/data/skills/bio-imaging-mass-cytometry-phenotyping/SKILL.md +231 -0
- package/data/skills/bio-imaging-mass-cytometry-quality-metrics/SKILL.md +316 -0
- package/data/skills/bio-imaging-mass-cytometry-spatial-analysis/SKILL.md +246 -0
- package/data/skills/bio-immunoinformatics-epitope-prediction/SKILL.md +259 -0
- package/data/skills/bio-immunoinformatics-immunogenicity-scoring/SKILL.md +275 -0
- package/data/skills/bio-immunoinformatics-mhc-binding-prediction/SKILL.md +260 -0
- package/data/skills/bio-immunoinformatics-neoantigen-prediction/SKILL.md +277 -0
- package/data/skills/bio-immunoinformatics-tcr-epitope-binding/SKILL.md +257 -0
- package/data/skills/bio-isoform-switching/SKILL.md +192 -0
- package/data/skills/bio-liquid-biopsy-pipeline/SKILL.md +311 -0
- package/data/skills/bio-local-blast/SKILL.md +350 -0
- package/data/skills/bio-long-read-sequencing-clair3-variants/SKILL.md +252 -0
- package/data/skills/bio-long-read-sequencing-isoseq-analysis/SKILL.md +334 -0
- package/data/skills/bio-long-read-sequencing-nanopore-methylation/SKILL.md +110 -0
- package/data/skills/bio-longitudinal-monitoring/SKILL.md +271 -0
- package/data/skills/bio-longread-alignment/SKILL.md +193 -0
- package/data/skills/bio-longread-medaka/SKILL.md +176 -0
- package/data/skills/bio-longread-qc/SKILL.md +224 -0
- package/data/skills/bio-longread-structural-variants/SKILL.md +201 -0
- package/data/skills/bio-machine-learning-atlas-mapping/SKILL.md +139 -0
- package/data/skills/bio-machine-learning-biomarker-discovery/SKILL.md +157 -0
- package/data/skills/bio-machine-learning-model-validation/SKILL.md +148 -0
- package/data/skills/bio-machine-learning-omics-classifiers/SKILL.md +146 -0
- package/data/skills/bio-machine-learning-prediction-explanation/SKILL.md +162 -0
- package/data/skills/bio-machine-learning-survival-analysis/SKILL.md +176 -0
- package/data/skills/bio-metabolomics-lipidomics/SKILL.md +265 -0
- package/data/skills/bio-metabolomics-metabolite-annotation/SKILL.md +241 -0
- package/data/skills/bio-metabolomics-msdial-preprocessing/SKILL.md +308 -0
- package/data/skills/bio-metabolomics-normalization-qc/SKILL.md +283 -0
- package/data/skills/bio-metabolomics-pathway-mapping/SKILL.md +237 -0
- package/data/skills/bio-metabolomics-statistical-analysis/SKILL.md +276 -0
- package/data/skills/bio-metabolomics-targeted-analysis/SKILL.md +314 -0
- package/data/skills/bio-metabolomics-xcms-preprocessing/SKILL.md +268 -0
- package/data/skills/bio-metagenomics-abundance/SKILL.md +203 -0
- package/data/skills/bio-metagenomics-amr-detection/SKILL.md +293 -0
- package/data/skills/bio-metagenomics-functional-profiling/SKILL.md +252 -0
- package/data/skills/bio-metagenomics-kraken/SKILL.md +204 -0
- package/data/skills/bio-metagenomics-metaphlan/SKILL.md +214 -0
- package/data/skills/bio-metagenomics-strain-tracking/SKILL.md +292 -0
- package/data/skills/bio-metagenomics-visualization/SKILL.md +240 -0
- package/data/skills/bio-methylation-based-detection/SKILL.md +223 -0
- package/data/skills/bio-methylation-bismark-alignment/SKILL.md +195 -0
- package/data/skills/bio-methylation-calling/SKILL.md +200 -0
- package/data/skills/bio-methylation-dmr-detection/SKILL.md +211 -0
- package/data/skills/bio-methylation-methylkit/SKILL.md +219 -0
- package/data/skills/bio-microbiome-amplicon-processing/SKILL.md +137 -0
- package/data/skills/bio-microbiome-differential-abundance/SKILL.md +147 -0
- package/data/skills/bio-microbiome-diversity-analysis/SKILL.md +188 -0
- package/data/skills/bio-microbiome-functional-prediction/SKILL.md +153 -0
- package/data/skills/bio-microbiome-qiime2-workflow/SKILL.md +219 -0
- package/data/skills/bio-microbiome-taxonomy-assignment/SKILL.md +168 -0
- package/data/skills/bio-molecular-descriptors/SKILL.md +200 -0
- package/data/skills/bio-molecular-io/SKILL.md +188 -0
- package/data/skills/bio-motif-search/SKILL.md +354 -0
- package/data/skills/bio-multi-omics-data-harmonization/SKILL.md +228 -0
- package/data/skills/bio-multi-omics-mixomics-analysis/SKILL.md +221 -0
- package/data/skills/bio-multi-omics-mofa-integration/SKILL.md +225 -0
- package/data/skills/bio-multi-omics-similarity-network/SKILL.md +235 -0
- package/data/skills/bio-orchestrator/SKILL.md +133 -0
- package/data/skills/bio-paired-end-fastq/SKILL.md +334 -0
- package/data/skills/bio-pathway-enrichment-visualization/SKILL.md +278 -0
- package/data/skills/bio-pathway-go-enrichment/SKILL.md +218 -0
- package/data/skills/bio-pathway-gsea/SKILL.md +227 -0
- package/data/skills/bio-pathway-kegg-pathways/SKILL.md +234 -0
- package/data/skills/bio-pathway-reactome/SKILL.md +215 -0
- package/data/skills/bio-pathway-wikipathways/SKILL.md +255 -0
- package/data/skills/bio-pdb-geometric-analysis/SKILL.md +475 -0
- package/data/skills/bio-pdb-structure-io/SKILL.md +296 -0
- package/data/skills/bio-pdb-structure-modification/SKILL.md +448 -0
- package/data/skills/bio-pdb-structure-navigation/SKILL.md +335 -0
- package/data/skills/bio-phasing-imputation-genotype-imputation/SKILL.md +201 -0
- package/data/skills/bio-phasing-imputation-haplotype-phasing/SKILL.md +190 -0
- package/data/skills/bio-phasing-imputation-imputation-qc/SKILL.md +265 -0
- package/data/skills/bio-phasing-imputation-reference-panels/SKILL.md +203 -0
- package/data/skills/bio-phylo-distance-calculations/SKILL.md +307 -0
- package/data/skills/bio-phylo-modern-tree-inference/SKILL.md +274 -0
- package/data/skills/bio-phylo-tree-io/SKILL.md +252 -0
- package/data/skills/bio-phylo-tree-manipulation/SKILL.md +375 -0
- package/data/skills/bio-phylo-tree-visualization/SKILL.md +275 -0
- package/data/skills/bio-pileup-generation/SKILL.md +314 -0
- package/data/skills/bio-population-genetics-association-testing/SKILL.md +293 -0
- package/data/skills/bio-population-genetics-linkage-disequilibrium/SKILL.md +260 -0
- package/data/skills/bio-population-genetics-plink-basics/SKILL.md +338 -0
- package/data/skills/bio-population-genetics-population-structure/SKILL.md +352 -0
- package/data/skills/bio-population-genetics-scikit-allel-analysis/SKILL.md +306 -0
- package/data/skills/bio-population-genetics-selection-statistics/SKILL.md +251 -0
- package/data/skills/bio-primer-design-primer-basics/SKILL.md +289 -0
- package/data/skills/bio-primer-design-primer-validation/SKILL.md +344 -0
- package/data/skills/bio-primer-design-qpcr-primers/SKILL.md +273 -0
- package/data/skills/bio-proteomics-data-import/SKILL.md +122 -0
- package/data/skills/bio-proteomics-dia-analysis/SKILL.md +246 -0
- package/data/skills/bio-proteomics-differential-abundance/SKILL.md +129 -0
- package/data/skills/bio-proteomics-peptide-identification/SKILL.md +122 -0
- package/data/skills/bio-proteomics-protein-inference/SKILL.md +174 -0
- package/data/skills/bio-proteomics-proteomics-qc/SKILL.md +208 -0
- package/data/skills/bio-proteomics-ptm-analysis/SKILL.md +139 -0
- package/data/skills/bio-proteomics-quantification/SKILL.md +141 -0
- package/data/skills/bio-proteomics-spectral-libraries/SKILL.md +270 -0
- package/data/skills/bio-reaction-enumeration/SKILL.md +251 -0
- package/data/skills/bio-read-alignment-bowtie2-alignment/SKILL.md +189 -0
- package/data/skills/bio-read-alignment-bwa-alignment/SKILL.md +166 -0
- package/data/skills/bio-read-alignment-hisat2-alignment/SKILL.md +205 -0
- package/data/skills/bio-read-alignment-star-alignment/SKILL.md +204 -0
- package/data/skills/bio-read-qc-adapter-trimming/SKILL.md +222 -0
- package/data/skills/bio-read-qc-contamination-screening/SKILL.md +252 -0
- package/data/skills/bio-read-qc-fastp-workflow/SKILL.md +278 -0
- package/data/skills/bio-read-qc-quality-filtering/SKILL.md +231 -0
- package/data/skills/bio-read-qc-quality-reports/SKILL.md +204 -0
- package/data/skills/bio-read-qc-umi-processing/SKILL.md +391 -0
- package/data/skills/bio-read-sequences/SKILL.md +319 -0
- package/data/skills/bio-reference-operations/SKILL.md +302 -0
- package/data/skills/bio-reporting-automated-qc-reports/SKILL.md +103 -0
- package/data/skills/bio-reporting-figure-export/SKILL.md +112 -0
- package/data/skills/bio-reporting-jupyter-reports/SKILL.md +98 -0
- package/data/skills/bio-reporting-quarto-reports/SKILL.md +295 -0
- package/data/skills/bio-reporting-rmarkdown-reports/SKILL.md +276 -0
- package/data/skills/bio-research-tools-biomarker-signature-studio/SKILL.md +99 -0
- package/data/skills/bio-restriction-enzyme-selection/SKILL.md +342 -0
- package/data/skills/bio-restriction-fragment-analysis/SKILL.md +259 -0
- package/data/skills/bio-restriction-mapping/SKILL.md +239 -0
- package/data/skills/bio-restriction-sites/SKILL.md +222 -0
- package/data/skills/bio-reverse-complement/SKILL.md +250 -0
- package/data/skills/bio-ribo-seq-orf-detection/SKILL.md +303 -0
- package/data/skills/bio-ribo-seq-riboseq-preprocessing/SKILL.md +176 -0
- package/data/skills/bio-ribo-seq-ribosome-periodicity/SKILL.md +182 -0
- package/data/skills/bio-ribo-seq-ribosome-stalling/SKILL.md +217 -0
- package/data/skills/bio-ribo-seq-translation-efficiency/SKILL.md +183 -0
- package/data/skills/bio-rna-quantification-alignment-free-quant/SKILL.md +226 -0
- package/data/skills/bio-rna-quantification-count-matrix-qc/SKILL.md +310 -0
- package/data/skills/bio-rna-quantification-featurecounts-counting/SKILL.md +190 -0
- package/data/skills/bio-rna-quantification-tximport-workflow/SKILL.md +240 -0
- package/data/skills/bio-rnaseq-qc/SKILL.md +320 -0
- package/data/skills/bio-sam-bam-basics/SKILL.md +248 -0
- package/data/skills/bio-sashimi-plots/SKILL.md +175 -0
- package/data/skills/bio-seq-objects/SKILL.md +240 -0
- package/data/skills/bio-sequence-properties/SKILL.md +397 -0
- package/data/skills/bio-sequence-similarity/SKILL.md +335 -0
- package/data/skills/bio-sequence-slicing/SKILL.md +232 -0
- package/data/skills/bio-sequence-statistics/SKILL.md +318 -0
- package/data/skills/bio-similarity-searching/SKILL.md +200 -0
- package/data/skills/bio-single-cell-batch-integration/SKILL.md +317 -0
- package/data/skills/bio-single-cell-cell-annotation/SKILL.md +259 -0
- package/data/skills/bio-single-cell-cell-communication/SKILL.md +257 -0
- package/data/skills/bio-single-cell-clustering/SKILL.md +330 -0
- package/data/skills/bio-single-cell-data-io/SKILL.md +315 -0
- package/data/skills/bio-single-cell-doublet-detection/SKILL.md +362 -0
- package/data/skills/bio-single-cell-lineage-tracing/SKILL.md +319 -0
- package/data/skills/bio-single-cell-markers-annotation/SKILL.md +317 -0
- package/data/skills/bio-single-cell-metabolite-communication/SKILL.md +258 -0
- package/data/skills/bio-single-cell-multimodal-integration/SKILL.md +242 -0
- package/data/skills/bio-single-cell-perturb-seq/SKILL.md +258 -0
- package/data/skills/bio-single-cell-preprocessing/SKILL.md +338 -0
- package/data/skills/bio-single-cell-scatac-analysis/SKILL.md +326 -0
- package/data/skills/bio-single-cell-splicing/SKILL.md +199 -0
- package/data/skills/bio-single-cell-trajectory-inference/SKILL.md +225 -0
- package/data/skills/bio-small-rna-seq-differential-mirna/SKILL.md +194 -0
- package/data/skills/bio-small-rna-seq-mirdeep2-analysis/SKILL.md +180 -0
- package/data/skills/bio-small-rna-seq-mirge3-analysis/SKILL.md +178 -0
- package/data/skills/bio-small-rna-seq-smrna-preprocessing/SKILL.md +174 -0
- package/data/skills/bio-small-rna-seq-target-prediction/SKILL.md +202 -0
- package/data/skills/bio-spatial-transcriptomics-image-analysis/SKILL.md +283 -0
- package/data/skills/bio-spatial-transcriptomics-spatial-communication/SKILL.md +299 -0
- package/data/skills/bio-spatial-transcriptomics-spatial-data-io/SKILL.md +272 -0
- package/data/skills/bio-spatial-transcriptomics-spatial-deconvolution/SKILL.md +314 -0
- package/data/skills/bio-spatial-transcriptomics-spatial-domains/SKILL.md +254 -0
- package/data/skills/bio-spatial-transcriptomics-spatial-multiomics/SKILL.md +181 -0
- package/data/skills/bio-spatial-transcriptomics-spatial-neighbors/SKILL.md +198 -0
- package/data/skills/bio-spatial-transcriptomics-spatial-preprocessing/SKILL.md +269 -0
- package/data/skills/bio-spatial-transcriptomics-spatial-proteomics/SKILL.md +124 -0
- package/data/skills/bio-spatial-transcriptomics-spatial-statistics/SKILL.md +237 -0
- package/data/skills/bio-spatial-transcriptomics-spatial-visualization/SKILL.md +287 -0
- package/data/skills/bio-splicing-pipeline/SKILL.md +253 -0
- package/data/skills/bio-splicing-qc/SKILL.md +190 -0
- package/data/skills/bio-splicing-quantification/SKILL.md +145 -0
- package/data/skills/bio-sra-data/SKILL.md +363 -0
- package/data/skills/bio-structural-biology-alphafold-predictions/SKILL.md +258 -0
- package/data/skills/bio-structural-biology-modern-structure-prediction/SKILL.md +346 -0
- package/data/skills/bio-substructure-search/SKILL.md +206 -0
- package/data/skills/bio-systems-biology-context-specific-models/SKILL.md +241 -0
- package/data/skills/bio-systems-biology-flux-balance-analysis/SKILL.md +206 -0
- package/data/skills/bio-systems-biology-gene-essentiality/SKILL.md +235 -0
- package/data/skills/bio-systems-biology-metabolic-reconstruction/SKILL.md +215 -0
- package/data/skills/bio-systems-biology-model-curation/SKILL.md +243 -0
- package/data/skills/bio-tcr-bcr-analysis-immcantation-analysis/SKILL.md +195 -0
- package/data/skills/bio-tcr-bcr-analysis-mixcr-analysis/SKILL.md +167 -0
- package/data/skills/bio-tcr-bcr-analysis-repertoire-visualization/SKILL.md +224 -0
- package/data/skills/bio-tcr-bcr-analysis-scirpy-analysis/SKILL.md +168 -0
- package/data/skills/bio-tcr-bcr-analysis-vdjtools-analysis/SKILL.md +188 -0
- package/data/skills/bio-transcription-translation/SKILL.md +237 -0
- package/data/skills/bio-tumor-fraction-estimation/SKILL.md +211 -0
- package/data/skills/bio-uniprot-access/SKILL.md +239 -0
- package/data/skills/bio-variant-annotation/SKILL.md +410 -0
- package/data/skills/bio-variant-calling/SKILL.md +266 -0
- package/data/skills/bio-variant-calling-clinical-interpretation/SKILL.md +355 -0
- package/data/skills/bio-variant-calling-deepvariant/SKILL.md +315 -0
- package/data/skills/bio-variant-calling-filtering-best-practices/SKILL.md +403 -0
- package/data/skills/bio-variant-calling-joint-calling/SKILL.md +338 -0
- package/data/skills/bio-variant-calling-structural-variant-calling/SKILL.md +253 -0
- package/data/skills/bio-variant-normalization/SKILL.md +325 -0
- package/data/skills/bio-vcf-basics/SKILL.md +342 -0
- package/data/skills/bio-vcf-manipulation/SKILL.md +429 -0
- package/data/skills/bio-vcf-statistics/SKILL.md +445 -0
- package/data/skills/bio-virtual-screening/SKILL.md +263 -0
- package/data/skills/bio-workflow-management-cwl-workflows/SKILL.md +433 -0
- package/data/skills/bio-workflow-management-nextflow-pipelines/SKILL.md +386 -0
- package/data/skills/bio-workflow-management-snakemake-workflows/SKILL.md +383 -0
- package/data/skills/bio-workflow-management-wdl-workflows/SKILL.md +500 -0
- package/data/skills/bio-workflows-atacseq-pipeline/SKILL.md +362 -0
- package/data/skills/bio-workflows-biomarker-pipeline/SKILL.md +272 -0
- package/data/skills/bio-workflows-chipseq-pipeline/SKILL.md +282 -0
- package/data/skills/bio-workflows-clip-pipeline/SKILL.md +268 -0
- package/data/skills/bio-workflows-cnv-pipeline/SKILL.md +324 -0
- package/data/skills/bio-workflows-crispr-editing-pipeline/SKILL.md +455 -0
- package/data/skills/bio-workflows-crispr-screen-pipeline/SKILL.md +278 -0
- package/data/skills/bio-workflows-cytometry-pipeline/SKILL.md +328 -0
- package/data/skills/bio-workflows-expression-to-pathways/SKILL.md +329 -0
- package/data/skills/bio-workflows-fastq-to-variants/SKILL.md +374 -0
- package/data/skills/bio-workflows-genome-assembly-pipeline/SKILL.md +290 -0
- package/data/skills/bio-workflows-gwas-pipeline/SKILL.md +323 -0
- package/data/skills/bio-workflows-hic-pipeline/SKILL.md +304 -0
- package/data/skills/bio-workflows-imc-pipeline/SKILL.md +304 -0
- package/data/skills/bio-workflows-longread-sv-pipeline/SKILL.md +281 -0
- package/data/skills/bio-workflows-merip-pipeline/SKILL.md +222 -0
- package/data/skills/bio-workflows-metabolic-modeling-pipeline/SKILL.md +408 -0
- package/data/skills/bio-workflows-metabolomics-pipeline/SKILL.md +297 -0
- package/data/skills/bio-workflows-metagenomics-pipeline/SKILL.md +283 -0
- package/data/skills/bio-workflows-methylation-pipeline/SKILL.md +274 -0
- package/data/skills/bio-workflows-microbiome-pipeline/SKILL.md +221 -0
- package/data/skills/bio-workflows-multi-omics-pipeline/SKILL.md +362 -0
- package/data/skills/bio-workflows-multiome-pipeline/SKILL.md +298 -0
- package/data/skills/bio-workflows-neoantigen-pipeline/SKILL.md +325 -0
- package/data/skills/bio-workflows-outbreak-pipeline/SKILL.md +341 -0
- package/data/skills/bio-workflows-proteomics-pipeline/SKILL.md +226 -0
- package/data/skills/bio-workflows-riboseq-pipeline/SKILL.md +94 -0
- package/data/skills/bio-workflows-rnaseq-to-de/SKILL.md +345 -0
- package/data/skills/bio-workflows-scrnaseq-pipeline/SKILL.md +354 -0
- package/data/skills/bio-workflows-smrna-pipeline/SKILL.md +86 -0
- package/data/skills/bio-workflows-somatic-variant-pipeline/SKILL.md +313 -0
- package/data/skills/bio-workflows-spatial-pipeline/SKILL.md +267 -0
- package/data/skills/bio-workflows-tcr-pipeline/SKILL.md +84 -0
- package/data/skills/bio-write-sequences/SKILL.md +205 -0
- package/data/skills/bioinformatics-singlecell/SKILL.md +143 -0
- package/data/skills/biokernel/SKILL.md +61 -0
- package/data/skills/biologist-analyst/SKILL.md +799 -0
- package/data/skills/biomaster-workflows/SKILL.md +55 -0
- package/data/skills/biomcp-server/SKILL.md +65 -0
- package/data/skills/biomedical-data-analysis/SKILL.md +56 -0
- package/data/skills/biomedical-search/SKILL.md +214 -0
- package/data/skills/biomni/SKILL.md +309 -0
- package/data/skills/biomni-general-agent/SKILL.md +43 -0
- package/data/skills/biomni-research-agent/SKILL.md +76 -0
- package/data/skills/biopython/SKILL.md +437 -0
- package/data/skills/biorxiv-database/SKILL.md +477 -0
- package/data/skills/bioservices/SKILL.md +355 -0
- package/data/skills/boltz/SKILL.md +188 -0
- package/data/skills/boltzgen/SKILL.md +287 -0
- package/data/skills/bone-marrow-ai-agent/SKILL.md +163 -0
- package/data/skills/brainstorming/SKILL.md +96 -0
- package/data/skills/brenda-database/SKILL.md +714 -0
- package/data/skills/bulk-combat-correction/SKILL.md +54 -0
- package/data/skills/bulk-deg-analysis/SKILL.md +61 -0
- package/data/skills/bulk-deseq2-analysis/SKILL.md +50 -0
- package/data/skills/bulk-stringdb-ppi/SKILL.md +49 -0
- package/data/skills/bulk-to-single-deconvolution/SKILL.md +50 -0
- package/data/skills/bulk-trajblend-interpolation/SKILL.md +52 -0
- package/data/skills/bulk-wgcna-analysis/SKILL.md +56 -0
- package/data/skills/cancer-metabolism-agent/SKILL.md +180 -0
- package/data/skills/care-coordination/SKILL.md +35 -0
- package/data/skills/cart-design-optimizer-agent/SKILL.md +162 -0
- package/data/skills/cbioportal-database/SKILL.md +367 -0
- package/data/skills/cell-free-expression/SKILL.md +291 -0
- package/data/skills/cellagent-annotation/SKILL.md +69 -0
- package/data/skills/cellfree-rna-agent/SKILL.md +182 -0
- package/data/skills/cellular-senescence-agent/SKILL.md +183 -0
- package/data/skills/cellxgene-census/SKILL.md +505 -0
- package/data/skills/chai/SKILL.md +272 -0
- package/data/skills/chatehr-clinician-assistant/SKILL.md +67 -0
- package/data/skills/chematagent-drug-discovery/SKILL.md +68 -0
- package/data/skills/chembl-database/SKILL.md +383 -0
- package/data/skills/chembl-search/SKILL.md +211 -0
- package/data/skills/chemcrow-drug-discovery/SKILL.md +61 -0
- package/data/skills/chemical-property-lookup/SKILL.md +42 -0
- package/data/skills/chemist-analyst/SKILL.md +1603 -0
- package/data/skills/chemistry-agent/SKILL.md +62 -0
- package/data/skills/chip-clonal-hematopoiesis-agent/SKILL.md +224 -0
- package/data/skills/chromosomal-instability-agent/SKILL.md +187 -0
- package/data/skills/citation-management/SKILL.md +1081 -0
- package/data/skills/claims-appeals/SKILL.md +35 -0
- package/data/skills/claw-ancestry-pca/SKILL.md +145 -0
- package/data/skills/claw-metagenomics/SKILL.md +238 -0
- package/data/skills/claw-semantic-sim/SKILL.md +151 -0
- package/data/skills/clinical-decision-support/SKILL.md +504 -0
- package/data/skills/clinical-diagnostic-reasoning/SKILL.md +222 -0
- package/data/skills/clinical-nlp-extractor/SKILL.md +59 -0
- package/data/skills/clinical-note-summarization/SKILL.md +52 -0
- package/data/skills/clinical-reports/SKILL.md +1127 -0
- package/data/skills/clinical-trial-protocol-skill/SKILL.md +508 -0
- package/data/skills/clinical-trials-search/SKILL.md +211 -0
- package/data/skills/clinicaltrials-database/SKILL.md +501 -0
- package/data/skills/clinpgx/SKILL.md +96 -0
- package/data/skills/clinpgx-database/SKILL.md +632 -0
- package/data/skills/clinvar-database/SKILL.md +356 -0
- package/data/skills/cnv-caller-agent/SKILL.md +171 -0
- package/data/skills/coagulation-thrombosis-agent/SKILL.md +141 -0
- package/data/skills/cobrapy/SKILL.md +457 -0
- package/data/skills/compbioagent-explorer/SKILL.md +67 -0
- package/data/skills/computational-pathology-agent/SKILL.md +72 -0
- package/data/skills/convergence-study/SKILL.md +98 -0
- package/data/skills/cosmic-database/SKILL.md +330 -0
- package/data/skills/crisis-detection-intervention-ai/SKILL.md +569 -0
- package/data/skills/crisis-response-protocol/SKILL.md +456 -0
- package/data/skills/crispr-guide-design/SKILL.md +72 -0
- package/data/skills/crispr-offtarget-predictor/SKILL.md +56 -0
- package/data/skills/cryoem-ai-drug-design-agent/SKILL.md +216 -0
- package/data/skills/ctdna-dynamics-mrd-agent/SKILL.md +206 -0
- package/data/skills/cytokine-storm-analysis-agent/SKILL.md +180 -0
- package/data/skills/dask/SKILL.md +454 -0
- package/data/skills/data-stats-analysis/SKILL.md +477 -0
- package/data/skills/data-transform/SKILL.md +576 -0
- package/data/skills/data-visualization-biomedical/SKILL.md +252 -0
- package/data/skills/data-visualization-expert/SKILL.md +72 -0
- package/data/skills/data-viz-plots/SKILL.md +461 -0
- package/data/skills/datacommons-client/SKILL.md +253 -0
- package/data/skills/datamol/SKILL.md +700 -0
- package/data/skills/deep-research/SKILL.md +111 -0
- package/data/skills/deep-research-swarm/SKILL.md +62 -0
- package/data/skills/deep-visual-proteomics-agent/SKILL.md +149 -0
- package/data/skills/deepchem/SKILL.md +591 -0
- package/data/skills/deeptools/SKILL.md +525 -0
- package/data/skills/depmap/SKILL.md +300 -0
- package/data/skills/diffdock/SKILL.md +477 -0
- package/data/skills/differentiation-schemes/SKILL.md +159 -0
- package/data/skills/digital-twin-clinical-agent/SKILL.md +228 -0
- package/data/skills/dispatching-parallel-agents/SKILL.md +180 -0
- package/data/skills/dnanexus-integration/SKILL.md +376 -0
- package/data/skills/doc-coauthoring/SKILL.md +375 -0
- package/data/skills/docx/SKILL.md +590 -0
- package/data/skills/docx-official/SKILL.md +197 -0
- package/data/skills/drug-discovery-search/SKILL.md +214 -0
- package/data/skills/drug-interaction-checker/SKILL.md +56 -0
- package/data/skills/drug-labels-search/SKILL.md +211 -0
- package/data/skills/drug-photo/SKILL.md +149 -0
- package/data/skills/drugbank-database/SKILL.md +184 -0
- package/data/skills/drugbank-search/SKILL.md +211 -0
- package/data/skills/ehr-fhir-integration/SKILL.md +60 -0
- package/data/skills/emergency-card/SKILL.md +426 -0
- package/data/skills/ena-database/SKILL.md +198 -0
- package/data/skills/ensembl-database/SKILL.md +305 -0
- package/data/skills/epidemiologist-analyst/SKILL.md +1844 -0
- package/data/skills/epigenomics-methylgpt-agent/SKILL.md +111 -0
- package/data/skills/equity-scorer/SKILL.md +182 -0
- package/data/skills/esm/SKILL.md +300 -0
- package/data/skills/etetoolkit/SKILL.md +617 -0
- package/data/skills/executing-plans/SKILL.md +84 -0
- package/data/skills/exosome-ev-analysis-agent/SKILL.md +171 -0
- package/data/skills/exploratory-data-analysis/SKILL.md +440 -0
- package/data/skills/family-health-analyzer/SKILL.md +137 -0
- package/data/skills/fastq-analysis/SKILL.md +191 -0
- package/data/skills/fda-database/SKILL.md +512 -0
- package/data/skills/fhir-developer-skill/SKILL.md +294 -0
- package/data/skills/fhir-development/SKILL.md +35 -0
- package/data/skills/find-skills/SKILL.md +133 -0
- package/data/skills/finishing-a-development-branch/SKILL.md +200 -0
- package/data/skills/fitness-analyzer/SKILL.md +431 -0
- package/data/skills/flowio/SKILL.md +602 -0
- package/data/skills/foldseek/SKILL.md +179 -0
- package/data/skills/galaxy-bridge/SKILL.md +215 -0
- package/data/skills/gene-database/SKILL.md +173 -0
- package/data/skills/gene-panel-design-agent/SKILL.md +192 -0
- package/data/skills/geniml/SKILL.md +312 -0
- package/data/skills/genome-compare/SKILL.md +127 -0
- package/data/skills/geo-database/SKILL.md +809 -0
- package/data/skills/geopandas/SKILL.md +245 -0
- package/data/skills/gget/SKILL.md +865 -0
- package/data/skills/ginkgo-cloud-lab/SKILL.md +56 -0
- package/data/skills/glycoengineering/SKILL.md +338 -0
- package/data/skills/gnomad-database/SKILL.md +395 -0
- package/data/skills/goal-analyzer/SKILL.md +605 -0
- package/data/skills/grief-companion/SKILL.md +250 -0
- package/data/skills/gsea-enrichment/SKILL.md +151 -0
- package/data/skills/gtars/SKILL.md +279 -0
- package/data/skills/gtex-database/SKILL.md +315 -0
- package/data/skills/gwas-database/SKILL.md +602 -0
- package/data/skills/gwas-lookup/SKILL.md +122 -0
- package/data/skills/gwas-prs/SKILL.md +178 -0
- package/data/skills/health-trend-analyzer/SKILL.md +451 -0
- package/data/skills/hemoglobinopathy-analysis-agent/SKILL.md +167 -0
- package/data/skills/hipaa-compliance/SKILL.md +230 -0
- package/data/skills/histolab/SKILL.md +672 -0
- package/data/skills/hmdb-database/SKILL.md +190 -0
- package/data/skills/hrd-analysis-agent/SKILL.md +184 -0
- package/data/skills/hrv-alexithymia-expert/SKILL.md +151 -0
- package/data/skills/hypogenic/SKILL.md +649 -0
- package/data/skills/hypothesis-generation/SKILL.md +286 -0
- package/data/skills/imaging-data-commons/SKILL.md +843 -0
- package/data/skills/immune-checkpoint-combination-agent/SKILL.md +170 -0
- package/data/skills/infographics/SKILL.md +563 -0
- package/data/skills/instrument-data-to-allotrope/SKILL.md +280 -0
- package/data/skills/interpro-database/SKILL.md +305 -0
- package/data/skills/ipsae/SKILL.md +190 -0
- package/data/skills/iso-13485-certification/SKILL.md +678 -0
- package/data/skills/jaspar-database/SKILL.md +351 -0
- package/data/skills/jungian-psychologist/SKILL.md +191 -0
- package/data/skills/kegg-database/SKILL.md +371 -0
- package/data/skills/knowledge-synthesis/SKILL.md +283 -0
- package/data/skills/kragen-knowledge-graph/SKILL.md +68 -0
- package/data/skills/lab-results/SKILL.md +35 -0
- package/data/skills/labarchive-integration/SKILL.md +262 -0
- package/data/skills/labstep/SKILL.md +208 -0
- package/data/skills/lamindb/SKILL.md +384 -0
- package/data/skills/latchbio-integration/SKILL.md +347 -0
- package/data/skills/latex-posters/SKILL.md +1602 -0
- package/data/skills/leads-literature-mining/SKILL.md +68 -0
- package/data/skills/ligandmpnn/SKILL.md +170 -0
- package/data/skills/linear-solvers/SKILL.md +165 -0
- package/data/skills/liquid-biopsy-analytics-agent/SKILL.md +171 -0
- package/data/skills/lit-synthesizer/SKILL.md +53 -0
- package/data/skills/literature-review/SKILL.md +584 -0
- package/data/skills/literature-search/SKILL.md +214 -0
- package/data/skills/lobster-bioinformatics/SKILL.md +305 -0
- package/data/skills/long-read-sequencing-agent/SKILL.md +181 -0
- package/data/skills/mage-antibody-generator/SKILL.md +54 -0
- package/data/skills/markdown-mermaid-writing/SKILL.md +327 -0
- package/data/skills/markitdown/SKILL.md +486 -0
- package/data/skills/matchms/SKILL.md +197 -0
- package/data/skills/matplotlib/SKILL.md +359 -0
- package/data/skills/mcpmed-bioinformatics-server/SKILL.md +42 -0
- package/data/skills/medchem/SKILL.md +400 -0
- package/data/skills/medea-therapeutic-discovery/SKILL.md +45 -0
- package/data/skills/medical-entity-extractor/SKILL.md +144 -0
- package/data/skills/medical-imaging-review/SKILL.md +170 -0
- package/data/skills/medical-research-toolkit/SKILL.md +273 -0
- package/data/skills/medrxiv-search/SKILL.md +211 -0
- package/data/skills/mental-health-analyzer/SKILL.md +981 -0
- package/data/skills/mesh-generation/SKILL.md +149 -0
- package/data/skills/metabolomics-workbench-database/SKILL.md +253 -0
- package/data/skills/microbiome-cancer-agent/SKILL.md +180 -0
- package/data/skills/modern-drug-rehab-computer/SKILL.md +392 -0
- package/data/skills/molecular-dynamics/SKILL.md +457 -0
- package/data/skills/molecular-glue-discovery-agent/SKILL.md +224 -0
- package/data/skills/molecule-evolution-agent/SKILL.md +62 -0
- package/data/skills/molfeat/SKILL.md +505 -0
- package/data/skills/monarch-database/SKILL.md +372 -0
- package/data/skills/mpn-progression-monitor-agent/SKILL.md +228 -0
- package/data/skills/mpn-research-assistant/SKILL.md +197 -0
- package/data/skills/mrd-edge-detection-agent/SKILL.md +213 -0
- package/data/skills/multi-ancestry-prs-agent/SKILL.md +224 -0
- package/data/skills/multi-search-engine/SKILL.md +110 -0
- package/data/skills/multimodal-medical-imaging/SKILL.md +59 -0
- package/data/skills/multimodal-radpath-fusion-agent/SKILL.md +213 -0
- package/data/skills/myeloma-mrd-agent/SKILL.md +184 -0
- package/data/skills/networkx/SKILL.md +435 -0
- package/data/skills/neurokit2/SKILL.md +350 -0
- package/data/skills/neuropixels-analysis/SKILL.md +344 -0
- package/data/skills/nextflow-development/SKILL.md +290 -0
- package/data/skills/ngs-analysis/SKILL.md +183 -0
- package/data/skills/nicheformer-spatial-agent/SKILL.md +197 -0
- package/data/skills/nk-cell-therapy-agent/SKILL.md +186 -0
- package/data/skills/nonlinear-solvers/SKILL.md +180 -0
- package/data/skills/numerical-integration/SKILL.md +166 -0
- package/data/skills/numerical-stability/SKILL.md +149 -0
- package/data/skills/nutrition-analyzer/SKILL.md +775 -0
- package/data/skills/occupational-health-analyzer/SKILL.md +386 -0
- package/data/skills/omero-integration/SKILL.md +245 -0
- package/data/skills/ontology-explorer/SKILL.md +168 -0
- package/data/skills/ontology-mapper/SKILL.md +171 -0
- package/data/skills/ontology-validator/SKILL.md +136 -0
- package/data/skills/open-notebook/SKILL.md +289 -0
- package/data/skills/open-targets-search/SKILL.md +211 -0
- package/data/skills/openalex-database/SKILL.md +488 -0
- package/data/skills/opentargets-database/SKILL.md +367 -0
- package/data/skills/opentrons-integration/SKILL.md +567 -0
- package/data/skills/opentrons-protocol-agent/SKILL.md +58 -0
- package/data/skills/organoid-drug-response-agent/SKILL.md +189 -0
- package/data/skills/pan-cancer-multiomics-agent/SKILL.md +159 -0
- package/data/skills/paper-2-web/SKILL.md +495 -0
- package/data/skills/parameter-optimization/SKILL.md +141 -0
- package/data/skills/patents-search/SKILL.md +211 -0
- package/data/skills/pathml/SKILL.md +160 -0
- package/data/skills/patiently-ai/SKILL.md +103 -0
- package/data/skills/pdb/SKILL.md +217 -0
- package/data/skills/pdb-database/SKILL.md +303 -0
- package/data/skills/pdf/SKILL.md +314 -0
- package/data/skills/pdf-anthropic/SKILL.md +294 -0
- package/data/skills/pdf-processing/SKILL.md +149 -0
- package/data/skills/pdf-processing-pro/SKILL.md +296 -0
- package/data/skills/pdx-model-analysis-agent/SKILL.md +169 -0
- package/data/skills/peer-review/SKILL.md +565 -0
- package/data/skills/performance-profiling/SKILL.md +255 -0
- package/data/skills/perplexity-search/SKILL.md +441 -0
- package/data/skills/pharmacogenomics-agent/SKILL.md +143 -0
- package/data/skills/pharmgx-reporter/SKILL.md +134 -0
- package/data/skills/phylogenetics/SKILL.md +404 -0
- package/data/skills/plotly/SKILL.md +265 -0
- package/data/skills/polars/SKILL.md +385 -0
- package/data/skills/popeve-variant-predictor-agent/SKILL.md +213 -0
- package/data/skills/post-processing/SKILL.md +338 -0
- package/data/skills/pptx/SKILL.md +232 -0
- package/data/skills/pptx-official/SKILL.md +484 -0
- package/data/skills/pptx-posters/SKILL.md +414 -0
- package/data/skills/precision-oncology-agent/SKILL.md +53 -0
- package/data/skills/prior-auth-coworker/SKILL.md +60 -0
- package/data/skills/prior-auth-review-skill/SKILL.md +360 -0
- package/data/skills/profile-report/SKILL.md +120 -0
- package/data/skills/protac-design-agent/SKILL.md +220 -0
- package/data/skills/protein-design-workflow/SKILL.md +199 -0
- package/data/skills/protein-qc/SKILL.md +300 -0
- package/data/skills/protein-structure-prediction/SKILL.md +59 -0
- package/data/skills/proteinmpnn/SKILL.md +279 -0
- package/data/skills/protocolsio-integration/SKILL.md +415 -0
- package/data/skills/prs-net-deep-learning-agent/SKILL.md +232 -0
- package/data/skills/psychologist-analyst/SKILL.md +1888 -0
- package/data/skills/pubchem-database/SKILL.md +568 -0
- package/data/skills/pubmed-database/SKILL.md +454 -0
- package/data/skills/pubmed-search/SKILL.md +103 -0
- package/data/skills/pydeseq2/SKILL.md +553 -0
- package/data/skills/pydicom/SKILL.md +428 -0
- package/data/skills/pyhealth/SKILL.md +485 -0
- package/data/skills/pylabrobot/SKILL.md +179 -0
- package/data/skills/pymc/SKILL.md +566 -0
- package/data/skills/pymoo/SKILL.md +565 -0
- package/data/skills/pyopenms/SKILL.md +211 -0
- package/data/skills/pysam/SKILL.md +259 -0
- package/data/skills/pytdc/SKILL.md +454 -0
- package/data/skills/pytorch-lightning/SKILL.md +172 -0
- package/data/skills/pyzotero/SKILL.md +111 -0
- package/data/skills/radgpt-radiology-reporter/SKILL.md +67 -0
- package/data/skills/radiomics-pathomics-fusion-agent/SKILL.md +221 -0
- package/data/skills/rdkit/SKILL.md +763 -0
- package/data/skills/reactome-database/SKILL.md +272 -0
- package/data/skills/receiving-code-review/SKILL.md +213 -0
- package/data/skills/recovery-community-moderator/SKILL.md +175 -0
- package/data/skills/regulatory-drafter/SKILL.md +56 -0
- package/data/skills/regulatory-drafting/SKILL.md +35 -0
- package/data/skills/rehabilitation-analyzer/SKILL.md +636 -0
- package/data/skills/repro-enforcer/SKILL.md +50 -0
- package/data/skills/requesting-code-review/SKILL.md +105 -0
- package/data/skills/research-grants/SKILL.md +935 -0
- package/data/skills/research-literature/SKILL.md +35 -0
- package/data/skills/research-lookup/SKILL.md +502 -0
- package/data/skills/rfdiffusion/SKILL.md +306 -0
- package/data/skills/rna-velocity-agent/SKILL.md +174 -0
- package/data/skills/scanpy/SKILL.md +380 -0
- package/data/skills/scfoundation-model-agent/SKILL.md +210 -0
- package/data/skills/scientific-brainstorming/SKILL.md +185 -0
- package/data/skills/scientific-critical-thinking/SKILL.md +566 -0
- package/data/skills/scientific-manuscript/SKILL.md +181 -0
- package/data/skills/scientific-problem-selection/SKILL.md +269 -0
- package/data/skills/scientific-schematics/SKILL.md +619 -0
- package/data/skills/scientific-slides/SKILL.md +1154 -0
- package/data/skills/scientific-visualization/SKILL.md +773 -0
- package/data/skills/scientific-writing/SKILL.md +483 -0
- package/data/skills/scikit-bio/SKILL.md +431 -0
- package/data/skills/scikit-learn/SKILL.md +515 -0
- package/data/skills/scikit-survival/SKILL.md +393 -0
- package/data/skills/scrna-orchestrator/SKILL.md +204 -0
- package/data/skills/scrna-qc/SKILL.md +43 -0
- package/data/skills/scvelo/SKILL.md +321 -0
- package/data/skills/scvi-tools/SKILL.md +184 -0
- package/data/skills/seaborn/SKILL.md +671 -0
- package/data/skills/search-strategy/SKILL.md +247 -0
- package/data/skills/seq-wrangler/SKILL.md +58 -0
- package/data/skills/shap/SKILL.md +560 -0
- package/data/skills/simo-multiomics-integration-agent/SKILL.md +178 -0
- package/data/skills/simpy/SKILL.md +423 -0
- package/data/skills/simulation-orchestrator/SKILL.md +230 -0
- package/data/skills/simulation-validator/SKILL.md +195 -0
- package/data/skills/single-annotation/SKILL.md +129 -0
- package/data/skills/single-cell-rna-qc/SKILL.md +175 -0
- package/data/skills/single-cellphone-db/SKILL.md +68 -0
- package/data/skills/single-clustering/SKILL.md +75 -0
- package/data/skills/single-downstream-analysis/SKILL.md +150 -0
- package/data/skills/single-multiomics/SKILL.md +44 -0
- package/data/skills/single-preprocessing/SKILL.md +184 -0
- package/data/skills/single-to-spatial-mapping/SKILL.md +48 -0
- package/data/skills/single-trajectory/SKILL.md +62 -0
- package/data/skills/sleep-analyzer/SKILL.md +773 -0
- package/data/skills/slurm-job-script-generator/SKILL.md +135 -0
- package/data/skills/solublempnn/SKILL.md +165 -0
- package/data/skills/spatial-agent/SKILL.md +56 -0
- package/data/skills/spatial-epigenomics-agent/SKILL.md +163 -0
- package/data/skills/spatial-transcriptomics-agent/SKILL.md +75 -0
- package/data/skills/spatial-transcriptomics-analysis/SKILL.md +72 -0
- package/data/skills/spatial-transcriptomics-analysis/STAgent/SKILL.md +75 -0
- package/data/skills/spatial-transcriptomics-analysis/SpatialAgent/SKILL.md +56 -0
- package/data/skills/spatial-transcriptomics-analysis/bioSkills/image-analysis/SKILL.md +266 -0
- package/data/skills/spatial-transcriptomics-analysis/bioSkills/spatial-communication/SKILL.md +287 -0
- package/data/skills/spatial-transcriptomics-analysis/bioSkills/spatial-data-io/SKILL.md +243 -0
- package/data/skills/spatial-transcriptomics-analysis/bioSkills/spatial-deconvolution/SKILL.md +298 -0
- package/data/skills/spatial-transcriptomics-analysis/bioSkills/spatial-domains/SKILL.md +229 -0
- package/data/skills/spatial-transcriptomics-analysis/bioSkills/spatial-multiomics/SKILL.md +172 -0
- package/data/skills/spatial-transcriptomics-analysis/bioSkills/spatial-neighbors/SKILL.md +189 -0
- package/data/skills/spatial-transcriptomics-analysis/bioSkills/spatial-preprocessing/SKILL.md +232 -0
- package/data/skills/spatial-transcriptomics-analysis/bioSkills/spatial-proteomics/SKILL.md +127 -0
- package/data/skills/spatial-transcriptomics-analysis/bioSkills/spatial-statistics/SKILL.md +225 -0
- package/data/skills/spatial-transcriptomics-analysis/bioSkills/spatial-visualization/SKILL.md +270 -0
- package/data/skills/spatial-tutorials/SKILL.md +87 -0
- package/data/skills/speech-pathology-ai/SKILL.md +184 -0
- package/data/skills/statistical-analysis/SKILL.md +626 -0
- package/data/skills/statsmodels/SKILL.md +608 -0
- package/data/skills/string-database/SKILL.md +528 -0
- package/data/skills/struct-predictor/SKILL.md +52 -0
- package/data/skills/subagent-driven-development/SKILL.md +242 -0
- package/data/skills/systematic-debugging/SKILL.md +296 -0
- package/data/skills/tcell-exhaustion-analysis-agent/SKILL.md +139 -0
- package/data/skills/tcga-preprocessing/SKILL.md +49 -0
- package/data/skills/tcm-constitution-analyzer/SKILL.md +664 -0
- package/data/skills/tcr-pmhc-prediction-agent/SKILL.md +226 -0
- package/data/skills/tcr-repertoire-analysis-agent/SKILL.md +218 -0
- package/data/skills/test-driven-development/SKILL.md +371 -0
- package/data/skills/tiledbvcf/SKILL.md +459 -0
- package/data/skills/time-resolved-cryoem-agent/SKILL.md +223 -0
- package/data/skills/time-stepping/SKILL.md +140 -0
- package/data/skills/timesfm-forecasting/SKILL.md +785 -0
- package/data/skills/tme-immune-profiling-agent/SKILL.md +220 -0
- package/data/skills/tooluniverse-adverse-event-detection/SKILL.md +1115 -0
- package/data/skills/tooluniverse-antibody-engineering/SKILL.md +1581 -0
- package/data/skills/tooluniverse-binder-discovery/SKILL.md +1459 -0
- package/data/skills/tooluniverse-cancer-variant-interpretation/SKILL.md +971 -0
- package/data/skills/tooluniverse-chemical-compound-retrieval/SKILL.md +322 -0
- package/data/skills/tooluniverse-chemical-safety/SKILL.md +733 -0
- package/data/skills/tooluniverse-clinical-guidelines/SKILL.md +399 -0
- package/data/skills/tooluniverse-clinical-trial-design/SKILL.md +1195 -0
- package/data/skills/tooluniverse-clinical-trial-matching/SKILL.md +1333 -0
- package/data/skills/tooluniverse-crispr-screen-analysis/SKILL.md +900 -0
- package/data/skills/tooluniverse-disease-research/SKILL.md +630 -0
- package/data/skills/tooluniverse-drug-drug-interaction/SKILL.md +73 -0
- package/data/skills/tooluniverse-drug-repurposing/SKILL.md +595 -0
- package/data/skills/tooluniverse-drug-research/SKILL.md +1642 -0
- package/data/skills/tooluniverse-drug-target-validation/SKILL.md +1206 -0
- package/data/skills/tooluniverse-epigenomics/SKILL.md +1489 -0
- package/data/skills/tooluniverse-expression-data-retrieval/SKILL.md +389 -0
- package/data/skills/tooluniverse-gene-enrichment/SKILL.md +402 -0
- package/data/skills/tooluniverse-gwas-drug-discovery/SKILL.md +576 -0
- package/data/skills/tooluniverse-gwas-finemapping/SKILL.md +309 -0
- package/data/skills/tooluniverse-gwas-snp-interpretation/SKILL.md +223 -0
- package/data/skills/tooluniverse-gwas-study-explorer/SKILL.md +342 -0
- package/data/skills/tooluniverse-gwas-trait-to-gene/SKILL.md +236 -0
- package/data/skills/tooluniverse-image-analysis/SKILL.md +439 -0
- package/data/skills/tooluniverse-immune-repertoire-analysis/SKILL.md +949 -0
- package/data/skills/tooluniverse-immunotherapy-response-prediction/SKILL.md +865 -0
- package/data/skills/tooluniverse-infectious-disease/SKILL.md +749 -0
- package/data/skills/tooluniverse-literature-deep-research/SKILL.md +1050 -0
- package/data/skills/tooluniverse-metabolomics/SKILL.md +298 -0
- package/data/skills/tooluniverse-metabolomics-analysis/SKILL.md +764 -0
- package/data/skills/tooluniverse-multi-omics-integration/SKILL.md +703 -0
- package/data/skills/tooluniverse-multiomic-disease-characterization/SKILL.md +1138 -0
- package/data/skills/tooluniverse-network-pharmacology/SKILL.md +1312 -0
- package/data/skills/tooluniverse-pharmacovigilance/SKILL.md +807 -0
- package/data/skills/tooluniverse-phylogenetics/SKILL.md +461 -0
- package/data/skills/tooluniverse-polygenic-risk-score/SKILL.md +397 -0
- package/data/skills/tooluniverse-precision-medicine-stratification/SKILL.md +1143 -0
- package/data/skills/tooluniverse-precision-oncology/SKILL.md +1091 -0
- package/data/skills/tooluniverse-protein-interactions/SKILL.md +446 -0
- package/data/skills/tooluniverse-protein-structure-retrieval/SKILL.md +416 -0
- package/data/skills/tooluniverse-protein-therapeutic-design/SKILL.md +637 -0
- package/data/skills/tooluniverse-proteomics-analysis/SKILL.md +843 -0
- package/data/skills/tooluniverse-rare-disease-diagnosis/SKILL.md +1257 -0
- package/data/skills/tooluniverse-rnaseq-deseq2/SKILL.md +536 -0
- package/data/skills/tooluniverse-sequence-retrieval/SKILL.md +419 -0
- package/data/skills/tooluniverse-single-cell/SKILL.md +719 -0
- package/data/skills/tooluniverse-spatial-omics-analysis/SKILL.md +1102 -0
- package/data/skills/tooluniverse-spatial-transcriptomics/SKILL.md +788 -0
- package/data/skills/tooluniverse-statistical-modeling/SKILL.md +557 -0
- package/data/skills/tooluniverse-structural-variant-analysis/SKILL.md +1356 -0
- package/data/skills/tooluniverse-systems-biology/SKILL.md +374 -0
- package/data/skills/tooluniverse-target-research/SKILL.md +1510 -0
- package/data/skills/tooluniverse-variant-analysis/SKILL.md +448 -0
- package/data/skills/tooluniverse-variant-interpretation/SKILL.md +1118 -0
- package/data/skills/torch-geometric/SKILL.md +674 -0
- package/data/skills/torch_geometric/SKILL.md +670 -0
- package/data/skills/torchdrug/SKILL.md +444 -0
- package/data/skills/tpd-ternary-complex-agent/SKILL.md +226 -0
- package/data/skills/transformers/SKILL.md +157 -0
- package/data/skills/travel-health-analyzer/SKILL.md +421 -0
- package/data/skills/treatment-plans/SKILL.md +1576 -0
- package/data/skills/trial-eligibility-agent/SKILL.md +54 -0
- package/data/skills/trialgpt-matching/SKILL.md +66 -0
- package/data/skills/tumor-clonal-evolution-agent/SKILL.md +134 -0
- package/data/skills/tumor-heterogeneity-agent/SKILL.md +216 -0
- package/data/skills/tumor-mutational-burden-agent/SKILL.md +188 -0
- package/data/skills/ukb-navigator/SKILL.md +113 -0
- package/data/skills/umap-learn/SKILL.md +473 -0
- package/data/skills/uniprot-database/SKILL.md +189 -0
- package/data/skills/universal-single-cell-annotator/SKILL.md +72 -0
- package/data/skills/using-git-worktrees/SKILL.md +218 -0
- package/data/skills/using-superpowers/SKILL.md +95 -0
- package/data/skills/usmle/SKILL.md +62 -0
- package/data/skills/uspto-database/SKILL.md +597 -0
- package/data/skills/vaex/SKILL.md +180 -0
- package/data/skills/varcadd-pathogenicity/SKILL.md +68 -0
- package/data/skills/variant-interpretation-acmg/SKILL.md +58 -0
- package/data/skills/variant-interpretation-acmg/bioSkills/clinical-interpretation/SKILL.md +334 -0
- package/data/skills/variant-interpretation-acmg/bioSkills/consensus-sequences/SKILL.md +343 -0
- package/data/skills/variant-interpretation-acmg/bioSkills/deepvariant/SKILL.md +279 -0
- package/data/skills/variant-interpretation-acmg/bioSkills/filtering-best-practices/SKILL.md +362 -0
- package/data/skills/variant-interpretation-acmg/bioSkills/gatk-variant-calling/SKILL.md +398 -0
- package/data/skills/variant-interpretation-acmg/bioSkills/joint-calling/SKILL.md +343 -0
- package/data/skills/variant-interpretation-acmg/bioSkills/structural-variant-calling/SKILL.md +256 -0
- package/data/skills/variant-interpretation-acmg/bioSkills/variant-annotation/SKILL.md +387 -0
- package/data/skills/variant-interpretation-acmg/bioSkills/variant-calling/SKILL.md +258 -0
- package/data/skills/variant-interpretation-acmg/bioSkills/variant-normalization/SKILL.md +304 -0
- package/data/skills/variant-interpretation-acmg/bioSkills/vcf-basics/SKILL.md +329 -0
- package/data/skills/variant-interpretation-acmg/bioSkills/vcf-manipulation/SKILL.md +398 -0
- package/data/skills/variant-interpretation-acmg/bioSkills/vcf-statistics/SKILL.md +424 -0
- package/data/skills/variant-interpretation-acmg/varCADD/SKILL.md +68 -0
- package/data/skills/vcf-annotator/SKILL.md +55 -0
- package/data/skills/verification-before-completion/SKILL.md +139 -0
- package/data/skills/virtual-lab-agent/SKILL.md +240 -0
- package/data/skills/wearable-analysis-agent/SKILL.md +70 -0
- package/data/skills/weightloss-analyzer/SKILL.md +320 -0
- package/data/skills/wellally-tech/SKILL.md +685 -0
- package/data/skills/wikipedia-search/SKILL.md +481 -0
- package/data/skills/writing-plans/SKILL.md +116 -0
- package/data/skills/writing-skills/SKILL.md +655 -0
- package/data/skills/xlsx/SKILL.md +292 -0
- package/data/skills/xlsx-official/SKILL.md +289 -0
- package/data/skills/zarr-python/SKILL.md +777 -0
- package/data/skills/zinc-database/SKILL.md +398 -0
- package/data/tools/__init__.py +8 -0
- package/data/tools/hpc.py +71 -0
- package/data/tools/hpc_client/__init__.py +8 -0
- package/data/tools/hpc_client/builders/__init__.py +12 -0
- package/data/tools/hpc_client/builders/alphafold.py +36 -0
- package/data/tools/hpc_client/builders/boltz.py +33 -0
- package/data/tools/hpc_client/builders/chai.py +30 -0
- package/data/tools/hpc_client/builders/immunebuilder.py +31 -0
- package/data/tools/hpc_client/builders/rfantibody.py +58 -0
- package/data/tools/hpc_client/builders/thermompnn.py +16 -0
- package/data/tools/hpc_client/hpc_api.py +41 -0
- package/data/tools/hpc_client/hpc_tools.py +218 -0
- package/data/tools/hpc_dynamic.py +71 -0
- package/data/tools/integrations/__init__.py +14 -0
- package/data/tools/integrations/adaptyv.py +107 -0
- package/data/tools/integrations/addgene.py +52 -0
- package/data/tools/integrations/api_internal.py +33 -0
- package/data/tools/molecular_biology.py +688 -0
- package/data/tools/pharmacology.py +67 -0
- package/data/workflows/bulk-omics-clustering/SKILL.md +501 -0
- package/data/workflows/bulk-omics-clustering/references/best_practices.md +395 -0
- package/data/workflows/bulk-omics-clustering/references/clustering_methods_comparison.md +288 -0
- package/data/workflows/bulk-omics-clustering/references/common-patterns.md +1136 -0
- package/data/workflows/bulk-omics-clustering/references/decision-guide.md +819 -0
- package/data/workflows/bulk-omics-clustering/references/distance_metrics_guide.md +388 -0
- package/data/workflows/bulk-omics-clustering/references/parameter_guide.md +396 -0
- package/data/workflows/bulk-omics-clustering/references/r-quick-start.md +105 -0
- package/data/workflows/bulk-omics-clustering/references/validation_metrics_guide.md +315 -0
- package/data/workflows/bulk-omics-clustering/scripts/characterize_clusters.py +255 -0
- package/data/workflows/bulk-omics-clustering/scripts/cluster_validation.py +449 -0
- package/data/workflows/bulk-omics-clustering/scripts/density_clustering.py +321 -0
- package/data/workflows/bulk-omics-clustering/scripts/dimensionality_reduction.py +328 -0
- package/data/workflows/bulk-omics-clustering/scripts/distance_metrics.py +251 -0
- package/data/workflows/bulk-omics-clustering/scripts/export_results.py +456 -0
- package/data/workflows/bulk-omics-clustering/scripts/hierarchical_clustering.R +229 -0
- package/data/workflows/bulk-omics-clustering/scripts/hierarchical_clustering.py +269 -0
- package/data/workflows/bulk-omics-clustering/scripts/kmeans_clustering.py +346 -0
- package/data/workflows/bulk-omics-clustering/scripts/load_example_data.R +171 -0
- package/data/workflows/bulk-omics-clustering/scripts/load_example_data.py +171 -0
- package/data/workflows/bulk-omics-clustering/scripts/model_based_clustering.py +370 -0
- package/data/workflows/bulk-omics-clustering/scripts/optimal_clusters.py +381 -0
- package/data/workflows/bulk-omics-clustering/scripts/plot_cluster_heatmap.R +141 -0
- package/data/workflows/bulk-omics-clustering/scripts/plot_clustering_results.py +452 -0
- package/data/workflows/bulk-omics-clustering/scripts/prepare_data.py +250 -0
- package/data/workflows/bulk-omics-clustering/scripts/stability_analysis.py +434 -0
- package/data/workflows/bulk-rnaseq-counts-to-de-deseq2/SKILL.md +505 -0
- package/data/workflows/bulk-rnaseq-counts-to-de-deseq2/references/comprehensive-reference.md +440 -0
- package/data/workflows/bulk-rnaseq-counts-to-de-deseq2/references/decision-guide.md +327 -0
- package/data/workflows/bulk-rnaseq-counts-to-de-deseq2/references/troubleshooting.md +456 -0
- package/data/workflows/bulk-rnaseq-counts-to-de-deseq2/references/usage-guide.md +75 -0
- package/data/workflows/bulk-rnaseq-counts-to-de-deseq2/scripts/basic_workflow.R +149 -0
- package/data/workflows/bulk-rnaseq-counts-to-de-deseq2/scripts/batch_correction.R +44 -0
- package/data/workflows/bulk-rnaseq-counts-to-de-deseq2/scripts/export_results.R +190 -0
- package/data/workflows/bulk-rnaseq-counts-to-de-deseq2/scripts/extract_results.R +242 -0
- package/data/workflows/bulk-rnaseq-counts-to-de-deseq2/scripts/load_example_data.R +250 -0
- package/data/workflows/bulk-rnaseq-counts-to-de-deseq2/scripts/multi_condition.R +50 -0
- package/data/workflows/bulk-rnaseq-counts-to-de-deseq2/scripts/qc_plots.R +410 -0
- package/data/workflows/bulk-rnaseq-counts-to-de-deseq2/scripts/transformations.R +218 -0
- package/data/workflows/chip-atlas-diff-analysis/SKILL.md +222 -0
- package/data/workflows/chip-atlas-diff-analysis/references/chipatlas_diff_api_format.md +106 -0
- package/data/workflows/chip-atlas-diff-analysis/references/diff_analysis_methods.md +89 -0
- package/data/workflows/chip-atlas-diff-analysis/references/output_format.md +78 -0
- package/data/workflows/chip-atlas-diff-analysis/scripts/__init__.py +1 -0
- package/data/workflows/chip-atlas-diff-analysis/scripts/annotate_genes.py +144 -0
- package/data/workflows/chip-atlas-diff-analysis/scripts/export_all.py +498 -0
- package/data/workflows/chip-atlas-diff-analysis/scripts/filter_regions.py +176 -0
- package/data/workflows/chip-atlas-diff-analysis/scripts/generate_all_plots.py +321 -0
- package/data/workflows/chip-atlas-diff-analysis/scripts/load_example_data.py +149 -0
- package/data/workflows/chip-atlas-diff-analysis/scripts/load_user_data.py +211 -0
- package/data/workflows/chip-atlas-diff-analysis/scripts/parse_bed_results.py +240 -0
- package/data/workflows/chip-atlas-diff-analysis/scripts/qc_checks.py +621 -0
- package/data/workflows/chip-atlas-diff-analysis/scripts/query_chipatlas_api.py +329 -0
- package/data/workflows/chip-atlas-diff-analysis/scripts/run_diff_workflow.py +256 -0
- package/data/workflows/chip-atlas-peak-enrichment/SKILL.md +212 -0
- package/data/workflows/chip-atlas-peak-enrichment/references/chipatlas_metadata_format.md +115 -0
- package/data/workflows/chip-atlas-peak-enrichment/references/enrichment_statistics.md +145 -0
- package/data/workflows/chip-atlas-peak-enrichment/references/peak_thresholds.md +63 -0
- package/data/workflows/chip-atlas-peak-enrichment/references/promoter_definitions.md +69 -0
- package/data/workflows/chip-atlas-peak-enrichment/scripts/__init__.py +1 -0
- package/data/workflows/chip-atlas-peak-enrichment/scripts/convert_genes_to_regions.py +271 -0
- package/data/workflows/chip-atlas-peak-enrichment/scripts/export_all.py +456 -0
- package/data/workflows/chip-atlas-peak-enrichment/scripts/filter_experiments.py +116 -0
- package/data/workflows/chip-atlas-peak-enrichment/scripts/generate_all_plots.py +280 -0
- package/data/workflows/chip-atlas-peak-enrichment/scripts/load_example_data.py +96 -0
- package/data/workflows/chip-atlas-peak-enrichment/scripts/load_user_data.py +183 -0
- package/data/workflows/chip-atlas-peak-enrichment/scripts/query_chipatlas_api.py +349 -0
- package/data/workflows/chip-atlas-peak-enrichment/scripts/run_enrichment_workflow.py +271 -0
- package/data/workflows/chip-atlas-target-genes/SKILL.md +230 -0
- package/data/workflows/chip-atlas-target-genes/references/macs2_binding_scores.md +89 -0
- package/data/workflows/chip-atlas-target-genes/references/string_scores.md +58 -0
- package/data/workflows/chip-atlas-target-genes/references/target_genes_data_format.md +73 -0
- package/data/workflows/chip-atlas-target-genes/scripts/__init__.py +0 -0
- package/data/workflows/chip-atlas-target-genes/scripts/download_target_genes.py +200 -0
- package/data/workflows/chip-atlas-target-genes/scripts/export_all.py +340 -0
- package/data/workflows/chip-atlas-target-genes/scripts/filter_targets.py +205 -0
- package/data/workflows/chip-atlas-target-genes/scripts/generate_all_plots.py +330 -0
- package/data/workflows/chip-atlas-target-genes/scripts/load_example_query.py +61 -0
- package/data/workflows/chip-atlas-target-genes/scripts/load_user_query.py +47 -0
- package/data/workflows/chip-atlas-target-genes/scripts/run_target_genes_workflow.py +141 -0
- package/data/workflows/clinicaltrials-landscape/SKILL.md +257 -0
- package/data/workflows/clinicaltrials-landscape/references/api-parameters.md +181 -0
- package/data/workflows/clinicaltrials-landscape/references/mechanisms.md +141 -0
- package/data/workflows/clinicaltrials-landscape/references/output-schema.md +184 -0
- package/data/workflows/clinicaltrials-landscape/scripts/__init__.py +1 -0
- package/data/workflows/clinicaltrials-landscape/scripts/classify_mechanisms.py +359 -0
- package/data/workflows/clinicaltrials-landscape/scripts/compile_trials.py +579 -0
- package/data/workflows/clinicaltrials-landscape/scripts/disease_config.py +161 -0
- package/data/workflows/clinicaltrials-landscape/scripts/export_all.py +242 -0
- package/data/workflows/clinicaltrials-landscape/scripts/generate_landscape_plots.py +761 -0
- package/data/workflows/clinicaltrials-landscape/scripts/generate_pdf_report.py +1465 -0
- package/data/workflows/clinicaltrials-landscape/scripts/generate_report.py +1813 -0
- package/data/workflows/clinicaltrials-landscape/scripts/query_clinicaltrials.py +307 -0
- package/data/workflows/coexpression-network/SKILL.md +344 -0
- package/data/workflows/coexpression-network/references/parameter-tuning-guide.md +591 -0
- package/data/workflows/coexpression-network/references/troubleshooting.md +483 -0
- package/data/workflows/coexpression-network/references/wgcna-best-practices.md +563 -0
- package/data/workflows/coexpression-network/references/wgcna-reference.md +538 -0
- package/data/workflows/coexpression-network/scripts/build_network.R +43 -0
- package/data/workflows/coexpression-network/scripts/correlate_modules_traits.R +92 -0
- package/data/workflows/coexpression-network/scripts/export_wgcna_results.R +117 -0
- package/data/workflows/coexpression-network/scripts/identify_hub_genes.R +63 -0
- package/data/workflows/coexpression-network/scripts/load_example_data.R +214 -0
- package/data/workflows/coexpression-network/scripts/module_enrichment.R +159 -0
- package/data/workflows/coexpression-network/scripts/pick_soft_power.R +70 -0
- package/data/workflows/coexpression-network/scripts/plot_all_wgcna.R +104 -0
- package/data/workflows/coexpression-network/scripts/plot_eigengene_heatmap.R +65 -0
- package/data/workflows/coexpression-network/scripts/plot_hub_genes.R +70 -0
- package/data/workflows/coexpression-network/scripts/plot_module_dendrogram.R +50 -0
- package/data/workflows/coexpression-network/scripts/plotting_helpers.R +87 -0
- package/data/workflows/coexpression-network/scripts/prepare_wgcna_data.R +73 -0
- package/data/workflows/coexpression-network/scripts/wgcna_workflow.R +93 -0
- package/data/workflows/experimental-design-statistics/SKILL.md +408 -0
- package/data/workflows/experimental-design-statistics/references/batch_effect_mitigation.md +756 -0
- package/data/workflows/experimental-design-statistics/references/cv_tissue_database.csv +30 -0
- package/data/workflows/experimental-design-statistics/references/experimental_design_best_practices.md +515 -0
- package/data/workflows/experimental-design-statistics/references/multiple_testing_guide.md +730 -0
- package/data/workflows/experimental-design-statistics/references/power_analysis_guidelines.md +635 -0
- package/data/workflows/experimental-design-statistics/references/qc_guidelines.md +310 -0
- package/data/workflows/experimental-design-statistics/references/software_requirements.md +328 -0
- package/data/workflows/experimental-design-statistics/references/troubleshooting_guide.md +510 -0
- package/data/workflows/experimental-design-statistics/scripts/batch_assignment.R +302 -0
- package/data/workflows/experimental-design-statistics/scripts/batch_validation.R +342 -0
- package/data/workflows/experimental-design-statistics/scripts/export_design.R +352 -0
- package/data/workflows/experimental-design-statistics/scripts/load_example_data.R +204 -0
- package/data/workflows/experimental-design-statistics/scripts/multiple_testing.R +417 -0
- package/data/workflows/experimental-design-statistics/scripts/plot_power_curves.R +317 -0
- package/data/workflows/experimental-design-statistics/scripts/power_atacseq.R +229 -0
- package/data/workflows/experimental-design-statistics/scripts/power_pilot_based.R +289 -0
- package/data/workflows/experimental-design-statistics/scripts/power_rnaseq.R +247 -0
- package/data/workflows/experimental-design-statistics/scripts/sample_size_de.R +327 -0
- package/data/workflows/experimental-design-statistics/scripts/sample_size_scrna.R +304 -0
- package/data/workflows/functional-enrichment-from-degs/SKILL.md +387 -0
- package/data/workflows/functional-enrichment-from-degs/references/database_guide.md +354 -0
- package/data/workflows/functional-enrichment-from-degs/references/decision-guide.md +546 -0
- package/data/workflows/functional-enrichment-from-degs/references/gsea_ora_comparison.md +213 -0
- package/data/workflows/functional-enrichment-from-degs/references/gsea_ora_validation_framework.md +483 -0
- package/data/workflows/functional-enrichment-from-degs/references/interpretation_guidelines.md +374 -0
- package/data/workflows/functional-enrichment-from-degs/references/method-reference.md +742 -0
- package/data/workflows/functional-enrichment-from-degs/scripts/export_results.R +190 -0
- package/data/workflows/functional-enrichment-from-degs/scripts/generate_plots.R +240 -0
- package/data/workflows/functional-enrichment-from-degs/scripts/get_msigdb_genesets.R +75 -0
- package/data/workflows/functional-enrichment-from-degs/scripts/load_de_results.R +60 -0
- package/data/workflows/functional-enrichment-from-degs/scripts/load_example_data.R +212 -0
- package/data/workflows/functional-enrichment-from-degs/scripts/prepare_gene_lists.R +92 -0
- package/data/workflows/functional-enrichment-from-degs/scripts/run_gsea.R +44 -0
- package/data/workflows/functional-enrichment-from-degs/scripts/run_ora.R +53 -0
- package/data/workflows/genetic-variant-annotation/SKILL.md +440 -0
- package/data/workflows/genetic-variant-annotation/references/auto_installation_implementation.md +274 -0
- package/data/workflows/genetic-variant-annotation/references/consequence_terms.md +392 -0
- package/data/workflows/genetic-variant-annotation/references/filtering_strategies.md +808 -0
- package/data/workflows/genetic-variant-annotation/references/installation_guide.md +557 -0
- package/data/workflows/genetic-variant-annotation/references/pathogenicity_interpretation.md +473 -0
- package/data/workflows/genetic-variant-annotation/references/qc_guidelines.md +524 -0
- package/data/workflows/genetic-variant-annotation/references/snpeff_best_practices.md +481 -0
- package/data/workflows/genetic-variant-annotation/references/tool_selection_guide.md +433 -0
- package/data/workflows/genetic-variant-annotation/references/troubleshooting_guide.md +678 -0
- package/data/workflows/genetic-variant-annotation/references/vep_best_practices.md +450 -0
- package/data/workflows/genetic-variant-annotation/scripts/annotate_genes.py +243 -0
- package/data/workflows/genetic-variant-annotation/scripts/export_results.py +450 -0
- package/data/workflows/genetic-variant-annotation/scripts/filter_variants.py +365 -0
- package/data/workflows/genetic-variant-annotation/scripts/install_tools.py +246 -0
- package/data/workflows/genetic-variant-annotation/scripts/load_example_data.py +166 -0
- package/data/workflows/genetic-variant-annotation/scripts/parse_snpeff_output.py +283 -0
- package/data/workflows/genetic-variant-annotation/scripts/parse_vep_output.py +257 -0
- package/data/workflows/genetic-variant-annotation/scripts/plot_variant_distribution.py +372 -0
- package/data/workflows/genetic-variant-annotation/scripts/prioritize_variants.py +287 -0
- package/data/workflows/genetic-variant-annotation/scripts/run_snpeff.py +418 -0
- package/data/workflows/genetic-variant-annotation/scripts/run_vep.py +358 -0
- package/data/workflows/genetic-variant-annotation/scripts/select_tool.py +203 -0
- package/data/workflows/genetic-variant-annotation/scripts/test_complete_workflow.py +312 -0
- package/data/workflows/genetic-variant-annotation/scripts/test_pickle_load.py +118 -0
- package/data/workflows/genetic-variant-annotation/scripts/validate_vcf.py +351 -0
- package/data/workflows/genetic-variant-annotation/scripts/verify_changes.py +212 -0
- package/data/workflows/grn-pyscenic/SKILL.md +331 -0
- package/data/workflows/grn-pyscenic/references/cli_interface.md +222 -0
- package/data/workflows/grn-pyscenic/references/database_downloads.md +245 -0
- package/data/workflows/grn-pyscenic/scripts/export_all.py +192 -0
- package/data/workflows/grn-pyscenic/scripts/generate_report.py +512 -0
- package/data/workflows/grn-pyscenic/scripts/integrate_with_adata.py +54 -0
- package/data/workflows/grn-pyscenic/scripts/load_example_data.py +200 -0
- package/data/workflows/grn-pyscenic/scripts/load_expression_data.py +61 -0
- package/data/workflows/grn-pyscenic/scripts/plot_regulon_visualizations.py +263 -0
- package/data/workflows/grn-pyscenic/scripts/run_grn_workflow.py +184 -0
- package/data/workflows/gwas-to-function-twas/SKILL.md +394 -0
- package/data/workflows/gwas-to-function-twas/references/fusion_best_practices.md +120 -0
- package/data/workflows/gwas-to-function-twas/references/installation-guide.md +414 -0
- package/data/workflows/gwas-to-function-twas/references/ldsc_qc_guidelines.md +287 -0
- package/data/workflows/gwas-to-function-twas/references/spredixxcan_best_practices.md +166 -0
- package/data/workflows/gwas-to-function-twas/references/therapeutic_interpretation_guide.md +717 -0
- package/data/workflows/gwas-to-function-twas/references/tissue_reference_guide.md +182 -0
- package/data/workflows/gwas-to-function-twas/references/troubleshooting_guide.md +317 -0
- package/data/workflows/gwas-to-function-twas/references/twas_hub_validation_guide.md +88 -0
- package/data/workflows/gwas-to-function-twas/scripts/colocalization_analysis.py +187 -0
- package/data/workflows/gwas-to-function-twas/scripts/druggability_scoring.py +199 -0
- package/data/workflows/gwas-to-function-twas/scripts/export_results.py +220 -0
- package/data/workflows/gwas-to-function-twas/scripts/integrate_variant_annotation.py +194 -0
- package/data/workflows/gwas-to-function-twas/scripts/interpret_therapeutic_direction.py +418 -0
- package/data/workflows/gwas-to-function-twas/scripts/mendelian_randomization.py +749 -0
- package/data/workflows/gwas-to-function-twas/scripts/multilayer_direction_analysis.py +471 -0
- package/data/workflows/gwas-to-function-twas/scripts/plot_twas_results.py +252 -0
- package/data/workflows/gwas-to-function-twas/scripts/run_fusion.py +155 -0
- package/data/workflows/gwas-to-function-twas/scripts/run_smultixcan.py +102 -0
- package/data/workflows/gwas-to-function-twas/scripts/run_spredixxcan.py +138 -0
- package/data/workflows/gwas-to-function-twas/scripts/select_reference_panel.py +253 -0
- package/data/workflows/gwas-to-function-twas/scripts/validate_gwas_sumstats.py +214 -0
- package/data/workflows/gwas-to-function-twas/scripts/validate_with_twas_hub.py +439 -0
- package/data/workflows/lasso-biomarker-panel/SKILL.md +322 -0
- package/data/workflows/lasso-biomarker-panel/references/decision-guide.md +64 -0
- package/data/workflows/lasso-biomarker-panel/references/lasso-reference.md +110 -0
- package/data/workflows/lasso-biomarker-panel/references/validation-guide.md +105 -0
- package/data/workflows/lasso-biomarker-panel/scripts/biological_interpretation.R +1560 -0
- package/data/workflows/lasso-biomarker-panel/scripts/biomarker_plots.R +350 -0
- package/data/workflows/lasso-biomarker-panel/scripts/export_results.R +1492 -0
- package/data/workflows/lasso-biomarker-panel/scripts/lasso_workflow.R +328 -0
- package/data/workflows/lasso-biomarker-panel/scripts/load_example_data.R +1903 -0
- package/data/workflows/lasso-biomarker-panel/scripts/plotting_helpers.R +78 -0
- package/data/workflows/lasso-biomarker-panel/scripts/prepare_features.R +225 -0
- package/data/workflows/lasso-biomarker-panel/scripts/query_cellxgene.py +107 -0
- package/data/workflows/lasso-biomarker-panel/scripts/validate_external.R +174 -0
- package/data/workflows/literature-preclinical/SKILL.md +276 -0
- package/data/workflows/literature-preclinical/assets/eval/simple_test.py +386 -0
- package/data/workflows/literature-preclinical/references/experiment-extraction-guide.md +147 -0
- package/data/workflows/literature-preclinical/references/full-text-enrichment-guide.md +121 -0
- package/data/workflows/literature-preclinical/references/preclinical-search-guide.md +117 -0
- package/data/workflows/literature-preclinical/scripts/extract_experiments.py +401 -0
- package/data/workflows/literature-preclinical/scripts/generate_plots.R +303 -0
- package/data/workflows/literature-preclinical/scripts/narrative_synthesis.py +653 -0
- package/data/workflows/literature-preclinical/scripts/preclinical_search.py +332 -0
- package/data/workflows/literature-preclinical/scripts/preclinical_synthesis.py +237 -0
- package/data/workflows/literature-preclinical/scripts/report_generation.py +326 -0
- package/data/workflows/mendelian-randomization-twosamplemr/SKILL.md +210 -0
- package/data/workflows/mendelian-randomization-twosamplemr/references/interpretation-guide.md +239 -0
- package/data/workflows/mendelian-randomization-twosamplemr/references/method-reference.md +190 -0
- package/data/workflows/mendelian-randomization-twosamplemr/scripts/export_results.R +123 -0
- package/data/workflows/mendelian-randomization-twosamplemr/scripts/generate_report.R +411 -0
- package/data/workflows/mendelian-randomization-twosamplemr/scripts/load_data.R +281 -0
- package/data/workflows/mendelian-randomization-twosamplemr/scripts/mr_plots.R +163 -0
- package/data/workflows/mendelian-randomization-twosamplemr/scripts/run_mr_analysis.R +322 -0
- package/data/workflows/pcr-primer-design/SKILL.md +397 -0
- package/data/workflows/pcr-primer-design/references/code_examples.md +594 -0
- package/data/workflows/pcr-primer-design/references/miqe_guidelines.md +453 -0
- package/data/workflows/pcr-primer-design/references/parameter_ranges.md +356 -0
- package/data/workflows/pcr-primer-design/references/primer_design_best_practices.md +451 -0
- package/data/workflows/pcr-primer-design/references/troubleshooting_guide.md +477 -0
- package/data/workflows/pcr-primer-design/scripts/__init__.py +2 -0
- package/data/workflows/pcr-primer-design/scripts/calculate_tm.py +306 -0
- package/data/workflows/pcr-primer-design/scripts/check_dimers.py +298 -0
- package/data/workflows/pcr-primer-design/scripts/check_secondary_structures.py +343 -0
- package/data/workflows/pcr-primer-design/scripts/design_qpcr_primers.py +233 -0
- package/data/workflows/pcr-primer-design/scripts/design_standard_primers.py +197 -0
- package/data/workflows/pcr-primer-design/scripts/design_taqman_probes.py +226 -0
- package/data/workflows/pcr-primer-design/scripts/export_results.py +382 -0
- package/data/workflows/pcr-primer-design/scripts/generate_reports.py +379 -0
- package/data/workflows/pcr-primer-design/scripts/validate_specificity.py +311 -0
- package/data/workflows/pcr-primer-design/scripts/visualize_primers.py +379 -0
- package/data/workflows/polygenic-risk-score-prs-catalog/SKILL.md +195 -0
- package/data/workflows/polygenic-risk-score-prs-catalog/references/interpretation-guide.md +80 -0
- package/data/workflows/polygenic-risk-score-prs-catalog/references/pgs-catalog-guide.md +109 -0
- package/data/workflows/polygenic-risk-score-prs-catalog/scripts/export_results.R +186 -0
- package/data/workflows/polygenic-risk-score-prs-catalog/scripts/generate_plots.R +283 -0
- package/data/workflows/polygenic-risk-score-prs-catalog/scripts/load_pgs_weights.R +228 -0
- package/data/workflows/polygenic-risk-score-prs-catalog/scripts/load_reference_data.R +191 -0
- package/data/workflows/polygenic-risk-score-prs-catalog/scripts/score_traits.R +216 -0
- package/data/workflows/pooled-crispr-screens/SKILL.md +362 -0
- package/data/workflows/pooled-crispr-screens/references/crispr_screen_best_practices.md +349 -0
- package/data/workflows/pooled-crispr-screens/references/qc_guidelines.md +722 -0
- package/data/workflows/pooled-crispr-screens/references/statistical_methods.md +644 -0
- package/data/workflows/pooled-crispr-screens/references/troubleshooting_guide.md +684 -0
- package/data/workflows/pooled-crispr-screens/references/umi_optimization.md +297 -0
- package/data/workflows/pooled-crispr-screens/scripts/concatenate_libraries.py +132 -0
- package/data/workflows/pooled-crispr-screens/scripts/detect_perturbed_cells.py +255 -0
- package/data/workflows/pooled-crispr-screens/scripts/differential_expression.py +202 -0
- package/data/workflows/pooled-crispr-screens/scripts/differential_expression_glmgampoi.py +320 -0
- package/data/workflows/pooled-crispr-screens/scripts/export_results.py +261 -0
- package/data/workflows/pooled-crispr-screens/scripts/expression_filtering.py +159 -0
- package/data/workflows/pooled-crispr-screens/scripts/gene_name_corrections.py +188 -0
- package/data/workflows/pooled-crispr-screens/scripts/generate_report.py +485 -0
- package/data/workflows/pooled-crispr-screens/scripts/load_10x_libraries.py +69 -0
- package/data/workflows/pooled-crispr-screens/scripts/load_example_data.py +257 -0
- package/data/workflows/pooled-crispr-screens/scripts/map_sgrna_to_cells.py +119 -0
- package/data/workflows/pooled-crispr-screens/scripts/normalize_and_scale.py +140 -0
- package/data/workflows/pooled-crispr-screens/scripts/qc_filtering.py +185 -0
- package/data/workflows/pooled-crispr-screens/scripts/run_glmgampoi.R +181 -0
- package/data/workflows/pooled-crispr-screens/scripts/screen_all_perturbations.py +306 -0
- package/data/workflows/pooled-crispr-screens/scripts/validate_perturbations.py +314 -0
- package/data/workflows/pooled-crispr-screens/scripts/visualize_perturbations.py +314 -0
- package/data/workflows/scrnaseq-scanpy-core-analysis/SKILL.md +425 -0
- package/data/workflows/scrnaseq-scanpy-core-analysis/references/ambient_rna_correction.md +422 -0
- package/data/workflows/scrnaseq-scanpy-core-analysis/references/common-patterns.md +533 -0
- package/data/workflows/scrnaseq-scanpy-core-analysis/references/integration_methods.md +820 -0
- package/data/workflows/scrnaseq-scanpy-core-analysis/references/marker_gene_database.md +471 -0
- package/data/workflows/scrnaseq-scanpy-core-analysis/references/pseudobulk_de_guide.md +408 -0
- package/data/workflows/scrnaseq-scanpy-core-analysis/references/qc_guidelines.md +535 -0
- package/data/workflows/scrnaseq-scanpy-core-analysis/references/scanpy_best_practices.md +496 -0
- package/data/workflows/scrnaseq-scanpy-core-analysis/references/troubleshooting_guide.md +668 -0
- package/data/workflows/scrnaseq-scanpy-core-analysis/references/workflow-details.md +727 -0
- package/data/workflows/scrnaseq-scanpy-core-analysis/scripts/annotate_celltypes.py +431 -0
- package/data/workflows/scrnaseq-scanpy-core-analysis/scripts/cluster_cells.py +293 -0
- package/data/workflows/scrnaseq-scanpy-core-analysis/scripts/export_results.py +423 -0
- package/data/workflows/scrnaseq-scanpy-core-analysis/scripts/filter_cells.py +531 -0
- package/data/workflows/scrnaseq-scanpy-core-analysis/scripts/find_markers.py +391 -0
- package/data/workflows/scrnaseq-scanpy-core-analysis/scripts/find_variable_genes.py +222 -0
- package/data/workflows/scrnaseq-scanpy-core-analysis/scripts/integrate_scvi.py +665 -0
- package/data/workflows/scrnaseq-scanpy-core-analysis/scripts/integration_diagnostics.py +678 -0
- package/data/workflows/scrnaseq-scanpy-core-analysis/scripts/load_example_data.py +68 -0
- package/data/workflows/scrnaseq-scanpy-core-analysis/scripts/normalize_data.py +325 -0
- package/data/workflows/scrnaseq-scanpy-core-analysis/scripts/plot_dimreduction.py +389 -0
- package/data/workflows/scrnaseq-scanpy-core-analysis/scripts/plot_qc.py +320 -0
- package/data/workflows/scrnaseq-scanpy-core-analysis/scripts/pseudobulk_de.py +553 -0
- package/data/workflows/scrnaseq-scanpy-core-analysis/scripts/qc_metrics.py +477 -0
- package/data/workflows/scrnaseq-scanpy-core-analysis/scripts/remove_ambient_rna.py +347 -0
- package/data/workflows/scrnaseq-scanpy-core-analysis/scripts/run_umap.py +188 -0
- package/data/workflows/scrnaseq-scanpy-core-analysis/scripts/scale_and_pca.py +365 -0
- package/data/workflows/scrnaseq-scanpy-core-analysis/scripts/setup_and_import.py +334 -0
- package/data/workflows/scrnaseq-seurat-core-analysis/SKILL.md +585 -0
- package/data/workflows/scrnaseq-seurat-core-analysis/references/ambient_rna_correction.md +422 -0
- package/data/workflows/scrnaseq-seurat-core-analysis/references/common-patterns.md +667 -0
- package/data/workflows/scrnaseq-seurat-core-analysis/references/decision-guide.md +456 -0
- package/data/workflows/scrnaseq-seurat-core-analysis/references/integration_methods.md +864 -0
- package/data/workflows/scrnaseq-seurat-core-analysis/references/marker_gene_database.md +471 -0
- package/data/workflows/scrnaseq-seurat-core-analysis/references/pseudobulk_de_guide.md +408 -0
- package/data/workflows/scrnaseq-seurat-core-analysis/references/qc_guidelines.md +452 -0
- package/data/workflows/scrnaseq-seurat-core-analysis/references/seurat_best_practices.md +417 -0
- package/data/workflows/scrnaseq-seurat-core-analysis/references/troubleshooting_guide.md +566 -0
- package/data/workflows/scrnaseq-seurat-core-analysis/references/workflow-details.md +801 -0
- package/data/workflows/scrnaseq-seurat-core-analysis/scripts/annotate_celltypes.R +306 -0
- package/data/workflows/scrnaseq-seurat-core-analysis/scripts/cluster_cells.R +223 -0
- package/data/workflows/scrnaseq-seurat-core-analysis/scripts/export_results.R +292 -0
- package/data/workflows/scrnaseq-seurat-core-analysis/scripts/filter_cells.R +576 -0
- package/data/workflows/scrnaseq-seurat-core-analysis/scripts/find_markers.R +325 -0
- package/data/workflows/scrnaseq-seurat-core-analysis/scripts/find_variable_features.R +106 -0
- package/data/workflows/scrnaseq-seurat-core-analysis/scripts/integrate_batches.R +504 -0
- package/data/workflows/scrnaseq-seurat-core-analysis/scripts/integration_diagnostics.R +596 -0
- package/data/workflows/scrnaseq-seurat-core-analysis/scripts/load_example_data.R +89 -0
- package/data/workflows/scrnaseq-seurat-core-analysis/scripts/normalize_data.R +184 -0
- package/data/workflows/scrnaseq-seurat-core-analysis/scripts/plot_dimreduction.R +273 -0
- package/data/workflows/scrnaseq-seurat-core-analysis/scripts/plot_qc.R +250 -0
- package/data/workflows/scrnaseq-seurat-core-analysis/scripts/pseudobulk_de.R +324 -0
- package/data/workflows/scrnaseq-seurat-core-analysis/scripts/qc_metrics.R +358 -0
- package/data/workflows/scrnaseq-seurat-core-analysis/scripts/remove_ambient_rna.R +281 -0
- package/data/workflows/scrnaseq-seurat-core-analysis/scripts/run_umap.R +116 -0
- package/data/workflows/scrnaseq-seurat-core-analysis/scripts/scale_and_pca.R +243 -0
- package/data/workflows/scrnaseq-seurat-core-analysis/scripts/setup_and_import.R +193 -0
- package/data/workflows/spatial-transcriptomics/SKILL.md +256 -0
- package/data/workflows/spatial-transcriptomics/references/spatial-analysis-guide.md +216 -0
- package/data/workflows/spatial-transcriptomics/scripts/export_results.py +214 -0
- package/data/workflows/spatial-transcriptomics/scripts/generate_all_plots.py +397 -0
- package/data/workflows/spatial-transcriptomics/scripts/load_example_data.py +175 -0
- package/data/workflows/spatial-transcriptomics/scripts/spatial_workflow.py +206 -0
- package/dist/bgi.js +28 -1
- package/package.json +2 -1
|
@@ -0,0 +1,1903 @@
|
|
|
1
|
+
# Load Example Data for LASSO Biomarker Panel Skill
|
|
2
|
+
# Primary: GSE206285 (UNIFI — Ustekinumab UC Trial, baseline prediction of week 8 mucosal healing)
|
|
3
|
+
# Validation: GSE92415 (PURSUIT — Golimumab UC Trial, baseline prediction of week 6 response)
|
|
4
|
+
#
|
|
5
|
+
# GSE206285: 550 UC patients + 18 healthy controls (568 total), ALL baseline (Week I-0)
|
|
6
|
+
# - Ustekinumab (anti-IL-12/23) Phase 3 clinical trial (UNIFI)
|
|
7
|
+
# - Outcome: week 8 mucosal healing (Y/N)
|
|
8
|
+
# - ~83 healed vs ~459 unhealed (all treatments combined)
|
|
9
|
+
# - Expression in series matrix (Affymetrix HT HG-U133+ PM, GPL13158, log2 RMA)
|
|
10
|
+
# GSE92415: 87 UC patients at baseline + 75 week 6 + 21 healthy (183 total)
|
|
11
|
+
# - Golimumab (anti-TNF) Phase 3 clinical trial (PURSUIT-SC)
|
|
12
|
+
# - Outcome: week 6 mucosal healing response (Yes/No)
|
|
13
|
+
# - 43 responders vs 44 non-responders at baseline — balanced!
|
|
14
|
+
# - SAME platform (GPL13158) as GSE206285 — ideal for cross-drug validation
|
|
15
|
+
|
|
16
|
+
# Set CRAN mirror.
# Only the CRAN entry is overridden; any other repositories already configured
# in getOption("repos") are left in place instead of being discarded.
local({
  repos <- getOption("repos")
  repos["CRAN"] <- "https://cloud.r-project.org"
  options(repos = repos)
})
|
|
18
|
+
|
|
19
|
+
# ---- Helper functions ----
|
|
20
|
+
|
|
21
|
+
.ensure_bioc <- function() {
  # Bootstrap BiocManager via install.packages() when its namespace cannot be
  # loaded; does nothing when it is already available.
  has_biocmanager <- requireNamespace("BiocManager", quietly = TRUE)
  if (!has_biocmanager) {
    install.packages("BiocManager")
  }
}
|
|
26
|
+
|
|
27
|
+
#' Install a package on demand when its namespace cannot be loaded
#'
#' @param pkg Package name (single character string)
#' @param bioc If TRUE (default) install through BiocManager; otherwise use
#'   install.packages()
#' @return NULL, invisibly. Stops with an error if the package still cannot be
#'   loaded after the install attempt.
.ensure_package <- function(pkg, bioc = TRUE) {
  # Nothing to do when the package is already available
  if (requireNamespace(pkg, quietly = TRUE)) {
    return(invisible(NULL))
  }

  cat("Installing", pkg, "...\n")
  if (bioc) {
    # BiocManager is only required for the Bioconductor path
    .ensure_bioc()
    BiocManager::install(pkg, ask = FALSE, update = FALSE)
  } else {
    install.packages(pkg)
  }

  # An install call can return without the package being usable, so verify
  # here and fail with an actionable message rather than at a later library().
  if (!requireNamespace(pkg, quietly = TRUE)) {
    stop("Package '", pkg, "' could not be installed. ",
         "Install it manually and re-run.", call. = FALSE)
  }
  invisible(NULL)
}
|
|
38
|
+
|
|
39
|
+
#' Collapse probes to gene-level expression using Gene Symbol column
#'
#' Handles Affymetrix GPL13158/GPL570 format with "///" separated multi-mapped
#' symbols: a multi-mapped probe is assigned to its first listed symbol.
#' Probes without a symbol (NA, "" or "---") are dropped. When several probes
#' map to the same symbol, the probe with the highest variance across samples
#' is kept (first probe wins on ties or when variance is undefined).
#'
#' @param expr Expression matrix (probes x samples)
#' @param gene_symbols Character vector from fData "Gene Symbol" column, one
#'   entry per row of `expr`
#' @return Expression matrix with gene symbols as rownames, one row per unique
#'   symbol, rows ordered by sorted symbol
.collapse_probes_to_genes <- function(expr, gene_symbols) {
  if (length(gene_symbols) != nrow(expr)) {
    stop("gene_symbols must have one entry per row of expr (",
         length(gene_symbols), " symbols vs ", nrow(expr), " probes).",
         call. = FALSE)
  }

  # Reduce "A /// B /// C" to "A"; the separator may or may not be padded
  # with spaces, and stray whitespace around a symbol is stripped.
  symbols <- trimws(sub("\\s*///.*$", "", as.character(gene_symbols)))

  # Drop probes without a usable gene symbol
  valid <- !is.na(symbols) & symbols != "" & symbols != "---"
  expr <- expr[valid, , drop = FALSE]
  symbols <- symbols[valid]
  cat(" Probes with gene mapping:", sum(valid), "\n")

  # For duplicate symbols, keep probe with highest variance.
  # Variance is NA when a probe has fewer than two non-NA values; rank those
  # last so the gene is still represented instead of being silently dropped.
  probe_var <- apply(expr, 1, var, na.rm = TRUE)
  probe_var[is.na(probe_var)] <- -Inf

  rows_by_symbol <- split(seq_along(symbols), symbols)
  keep_idx <- vapply(rows_by_symbol, function(rows) {
    rows[which.max(probe_var[rows])]
  }, integer(1))
  keep_idx <- unname(keep_idx)

  expr_genes <- expr[keep_idx, , drop = FALSE]
  rownames(expr_genes) <- symbols[keep_idx]

  cat(" Unique gene symbols:", nrow(expr_genes), "\n")
  return(expr_genes)
}
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
#' Load UNIFI Trial data (GSE206285) — Primary discovery cohort
#'
#' Downloads baseline colonic biopsy gene expression from the UNIFI ustekinumab
#' UC clinical trial. Affymetrix HT HG-U133+ PM Array Plate (GPL13158), log2 RMA.
#' All 568 samples are baseline (Week I-0). Predicts week 8 mucosal healing.
#'
#' Processed expression and metadata are cached as RDS files in `data_dir`
#' and reused on later calls. Samples whose selected endpoint is not "Y" or
#' "N" are dropped, and the endpoint is encoded in `metadata$response`
#' (1 = "Y", 0 = "N").
#'
#' The function stops with an error when `endpoint` is not one of the two
#' supported values (checked before any download), when no gene symbol column
#' is found in the feature data, or when the selected endpoint column is
#' absent from the (possibly cached) metadata.
#'
#' @param data_dir Directory for downloads and cache (default: "data")
#' @param endpoint "mucosal_healing" (default, more cases) or "clinical_remission"
#' @return list(expression, metadata, outcome_col, description, report_context)
load_unifi_data <- function(data_dir = "data", endpoint = "mucosal_healing") {
  # Reject a bad endpoint up front so a typo does not cost a full GEO download.
  if (!is.character(endpoint) || length(endpoint) != 1L ||
      !endpoint %in% c("mucosal_healing", "clinical_remission")) {
    stop("endpoint must be 'mucosal_healing' or 'clinical_remission'")
  }

  cat("Loading GSE206285 (UNIFI Ustekinumab UC Trial — Baseline Prediction)...\n")

  .ensure_package("GEOquery")
  .ensure_package("Biobase")

  library(GEOquery)
  library(Biobase)

  if (!dir.exists(data_dir)) dir.create(data_dir, recursive = TRUE)

  # --- Check for cached processed data ---
  cache_expr <- file.path(data_dir, "GSE206285_expression.rds")
  cache_meta <- file.path(data_dir, "GSE206285_metadata.rds")

  if (file.exists(cache_expr) && file.exists(cache_meta)) {
    cat(" Loading from cache...\n")
    expr <- readRDS(cache_expr)
    metadata <- readRDS(cache_meta)
  } else {
    # --- Download series matrix ---
    cat(" Downloading from GEO (may take 2-5 min for 568 samples)...\n")
    gse <- getGEO("GSE206285", GSEMatrix = TRUE, getGPL = TRUE,
                  destdir = data_dir)
    # getGEO() may return a list of ExpressionSets; only the first is used.
    if (is.list(gse)) gse <- gse[[1]]

    # --- Expression matrix ---
    expr_raw <- exprs(gse)
    cat(" Raw expression:", nrow(expr_raw), "probes x", ncol(expr_raw), "samples\n")

    # --- Probe to gene symbol mapping ---
    cat(" Mapping probes to gene symbols...\n")
    fdata <- fData(gse)
    sym_col <- grep("gene.symbol|^symbol$|gene_symbol",
                    colnames(fdata), ignore.case = TRUE, value = TRUE)
    if (length(sym_col) == 0) {
      stop("Cannot find gene symbol column in feature data. ",
           "Re-download with getGPL=TRUE.")
    }
    gene_symbols <- as.character(fdata[[sym_col[1]]])
    expr <- .collapse_probes_to_genes(expr_raw, gene_symbols)

    # --- Parse metadata ---
    pheno <- pData(gse)
    sample_ids <- colnames(expr)
    metadata <- data.frame(
      sample_id = sample_ids,
      treatment = as.character(pheno[sample_ids, "treatment:ch1"]),
      stringsAsFactors = FALSE
    )
    rownames(metadata) <- metadata$sample_id

    # Parse response endpoints; a column is only added when a matching
    # phenotype column exists.
    mh_col <- grep("mucosal healing", colnames(pheno), ignore.case = TRUE, value = TRUE)
    cr_col <- grep("clinical remission", colnames(pheno), ignore.case = TRUE, value = TRUE)

    if (length(mh_col) > 0) {
      metadata$mucosal_healing <- as.character(pheno[sample_ids, mh_col[1]])
    }
    if (length(cr_col) > 0) {
      metadata$clinical_remission <- as.character(pheno[sample_ids, cr_col[1]])
    }

    # --- Cache processed data ---
    saveRDS(expr, cache_expr)
    saveRDS(metadata, cache_meta)
    cat(" Cached processed data for fast subsequent loads\n")
  }

  # --- Select endpoint ---
  # Wording that depends on the endpoint is kept in one place so console
  # output and report text always describe the outcome that was analyzed.
  endpoint_info <- switch(
    endpoint,
    mucosal_healing = list(
      label = "week 8 mucosal healing",
      event = "mucosal healing",
      positive = "healed",
      negative = "not healed",
      intro = paste(
        "The primary binary endpoint is mucosal healing at week 8, defined by",
        "endoscopic assessment (Mayo endoscopic subscore of 0 or 1)."
      )
    ),
    clinical_remission = list(
      label = "week 8 clinical remission",
      event = "clinical remission",
      positive = "in remission",
      negative = "not in remission",
      intro = paste(
        "The binary endpoint analyzed is clinical remission at week 8, as",
        "recorded (Y/N) in the GEO sample annotations."
      )
    )
  )
  outcome_label <- endpoint_info$label

  # The endpoint column is optional in the parsed/cached metadata; without
  # this check a missing column would silently produce an empty dataset.
  outcome_raw <- metadata[[endpoint]]
  if (is.null(outcome_raw)) {
    stop("Endpoint column '", endpoint, "' is not available in the GSE206285 ",
         "metadata. If the data was loaded from cache, delete '", cache_meta,
         "' and reload.")
  }

  # --- Filter to UC patients with non-NA response ---
  has_response <- outcome_raw %in% c("Y", "N")
  expr <- expr[, colnames(expr) %in% rownames(metadata)[has_response], drop = FALSE]
  metadata <- metadata[has_response, , drop = FALSE]

  # Binary: 1 = healed/remission (Y), 0 = not healed (N)
  metadata$response <- as.integer(outcome_raw[has_response] == "Y")
  n_pos <- sum(metadata$response == 1)
  n_neg <- sum(metadata$response == 0)

  cat("\n\u2713 UNIFI data loaded successfully\n")
  cat(" Expression:", nrow(expr), "genes x", ncol(expr), "samples\n")
  cat(" Outcome:", outcome_label, "(baseline prediction)\n")
  cat(" Distribution:", n_pos, paste(endpoint_info$positive, "/"),
      n_neg, paste0(endpoint_info$negative, "\n"))
  treatment_counts <- table(metadata$treatment)
  cat(" Treatment:", paste(names(treatment_counts),
                           treatment_counts, sep = "=", collapse = ", "), "\n")

  return(list(
    expression = expr,
    metadata = metadata,
    outcome_col = "response",
    description = paste(
      "UNIFI Trial (GSE206285): Ustekinumab UC Phase 3 trial, baseline colonic biopsies.",
      "Affymetrix HT HG-U133+ PM (GPL13158), log2 RMA.", ncol(expr), "patients.",
      "Binary outcome:", outcome_label, "(Y/N).",
      "Ustekinumab + placebo arms combined."
    ),
    # Structured context for comprehensive report generation
    # Inline citation numbers [N] correspond to the references list below
    report_context = list(
      disease_background = paste(
        "Ulcerative colitis (UC) is a chronic inflammatory bowel disease",
        "characterized by relapsing and remitting mucosal inflammation of the",
        "colon, affecting approximately 3.1 million adults in the United States.",
        "Current biologic therapies -- including anti-TNF agents (infliximab,",
        "adalimumab, golimumab), anti-integrins (vedolizumab), and anti-IL-12/23",
        "antibodies (ustekinumab) -- achieve mucosal healing in only 25-40% of",
        "patients in pivotal Phase 3 trials [7,8]. The inability to predict which",
        "patients will respond to a given therapy before treatment initiation",
        "results in months of ineffective exposure, continued disease progression,",
        "and avoidable colectomy. Identification of baseline transcriptomic",
        "biomarkers from colonic mucosal biopsies offers a promising route toward",
        "pre-treatment patient stratification and precision medicine in IBD."
      ),
      trial_description = paste(
        "UNIFI (NCT02407236) was a Phase 3, randomized, double-blind,",
        "placebo-controlled, multicenter clinical trial evaluating ustekinumab",
        "(Stelara) as induction and maintenance therapy for moderate-to-severe",
        "ulcerative colitis [7]. Ustekinumab is a fully human IgG1 monoclonal",
        "antibody targeting the p40 subunit shared by interleukin-12 (IL-12) and",
        "interleukin-23 (IL-23), thereby inhibiting Th1 and Th17 inflammatory",
        "pathways central to UC pathogenesis. In the induction phase, patients",
        "were randomized to a single intravenous dose of ustekinumab (130 mg",
        "fixed dose or weight-based ~6 mg/kg) or placebo, with clinical response",
        "assessed at week 8."
      ),
      patient_population = paste(
        "Gene expression profiling was performed on baseline (Week I-0) colonic",
        "mucosal biopsies from 568 participants in the induction cohort (550 UC",
        "patients across ustekinumab and placebo arms, plus 18 healthy controls)",
        "using the Affymetrix HT HG-U133+ PM Array Plate (GPL13158).",
        "Expression values are log2 RMA-normalized. After removing samples",
        "without endpoint data,", ncol(expr), "patients were retained for",
        "analysis. Both ustekinumab-treated and placebo-arm patients are",
        "included to maximize statistical power for identifying",
        "treatment-agnostic mucosal healing signatures, given that response",
        "rates are comparable across arms (~15%)."
      ),
      # Built from endpoint_info so the counts are described with the
      # endpoint that actually produced them.
      endpoint_definition = paste(
        endpoint_info$intro,
        n_pos, "patients achieved", endpoint_info$event, "and",
        n_neg, "did not, reflecting the refractory",
        "moderate-to-severe disease population enrolled in the trial."
      ),
      platform_description = paste(
        "Affymetrix HT HG-U133+ PM Array Plate (GPL13158), a high-throughput",
        "version of the HG-U133 Plus 2.0 platform comprising 54,715 probe sets.",
        "Expression values are log2 RMA-normalized. Probes were collapsed to",
        "gene-level by selecting the highest-variance probe per gene symbol,",
        "yielding", nrow(expr), "unique gene features."
      ),
      analytical_goals = c(
        paste("Identify a minimal gene expression signature (<15 genes) from",
              "baseline colonic biopsies that predicts week 8 mucosal healing",
              "in UC patients, independent of treatment arm, using penalized",
              "logistic regression with LASSO [1] and elastic net [2] regularization."),
        paste("Evaluate signature stability through repeated nested",
              "cross-validation with stability selection [3] to ensure robust",
              "feature selection despite the high-dimensional setting",
              "(p >> n) [6]."),
        paste("Assess whether baseline mucosal transcriptomics can stratify",
              "patients before biologic therapy initiation, potentially",
              "enabling personalized treatment selection and reducing exposure",
              "to ineffective therapies."),
        paste("Test cross-drug generalizability by validating the",
              "UNIFI-derived signature on the independent PURSUIT golimumab",
              "trial (GSE92415) [8], which shares the same microarray platform",
              "(GPL13158), enabling direct cross-study comparison without",
              "platform normalization.")
      ),
      published_benchmarks = list(
        intro = paste(
          "Predicting biologic therapy response from baseline mucosal",
          "transcriptomics is an active area of research in IBD. Published",
          "validated AUCs for this task typically range from 0.65 to 0.85,",
          "depending on the biologic, patient population, and importantly,",
          "whether feature selection was properly nested within",
          "cross-validation (a common source of optimistic bias in",
          "published studies [9])."
        ),
        studies = data.frame(
          study = c(
            "Feng et al. 2021 [9]",
            "Li et al. 2021 [10]",
            "Verstockt et al. 2020 [11]",
            "BMC Gastro 2025 [12]"
          ),
          drug = c("Infliximab", "Infliximab",
                   "Vedolizumab", "Vedolizumab"),
          validated_auc = c("0.81", "0.76", "0.77-0.86", "0.80"),
          method = c("RF + ANN (30 DEGs)", "ANN (6-gene panel)",
                     "4-gene qPCR panel", "LASSO"),
          notes = c(
            "DE on all samples before ML",
            "DE on all samples before ML",
            "Validated by qPCR across 3 cohorts",
            "Mucosal healing at wk12/wk52"
          ),
          stringsAsFactors = FALSE
        ),
        context = paste(
          "Notably, several high-performing published models apply",
          "differential expression filtering on the full dataset before",
          "machine learning, which introduces optimistic bias through data",
          "leakage. When we tested this approach on our data, DE pre-filtering",
          "on all samples inflated AUC from 0.69 to 0.87 — an artificial",
          "gain that disappeared entirely with proper nested feature selection.",
          "Our reported AUC uses rigorous nested cross-validation with no",
          "information leakage, representing an honest performance estimate."
        )
      ),
      references = list(
        tibshirani1996 = "[1] Tibshirani R. Regression shrinkage and selection via the lasso. J R Stat Soc Series B Stat Methodol. 1996;58(1):267-288.",
        zou2005 = "[2] Zou H, Hastie T. Regularization and variable selection via the elastic net. J R Stat Soc Series B Stat Methodol. 2005;67(2):301-320.",
        meinshausen2010 = "[3] Meinshausen N, Buhlmann P. Stability selection. J R Stat Soc Series B Stat Methodol. 2010;72(4):417-473.",
        friedman2010 = "[4] Friedman J, Hastie T, Tibshirani R. Regularization paths for generalized linear models via coordinate descent. J Stat Softw. 2010;33(1):1-22.",
        robin2011 = "[5] Robin X, Turck N, Hainard A, et al. pROC: an open-source package for R and S+ to analyze and compare ROC curves. BMC Bioinformatics. 2011;12:77.",
        ali2025 = "[6] Ali M, Western D, et al. A proteogenomic framework for diagnosis and subtyping of neurodegenerative dementia. Nat Med. 2025.",
        sands2019 = "[7] Sands BE, Sandborn WJ, Panaccione R, et al. Ustekinumab as induction and maintenance therapy for ulcerative colitis. N Engl J Med. 2019;381(13):1201-1214.",
        sandborn2014 = "[8] Sandborn WJ, Feagan BG, Marano C, et al. Subcutaneous golimumab induces clinical response and remission in patients with moderate-to-severe ulcerative colitis. Gastroenterology. 2014;146(1):85-95.",
        feng2021 = "[9] Feng J, et al. A molecular prognostic score for prediction of infliximab response in ulcerative colitis. Front Med. 2021;8:678424.",
        li2021 = "[10] Li D, et al. An artificial neural network model for predicting infliximab response in ulcerative colitis. Front Immunol. 2021;12:742080.",
        verstockt2020 = "[11] Verstockt B, et al. Expression levels of 4 genes in colon tissue predict vedolizumab remission in inflammatory bowel diseases. Clin Gastroenterol Hepatol. 2020;18(5):1142-1151.",
        bmc2025 = "[12] Prediction of vedolizumab treatment response using LASSO logistic regression. BMC Gastroenterol. 2025;25:4599."
      )
    )
  ))
}
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
#' Load PURSUIT Trial data (GSE92415) — Validation cohort
#'
#' Downloads baseline colonic biopsy gene expression from the PURSUIT golimumab
#' UC clinical trial. SAME platform (GPL13158) as GSE206285 — ideal cross-drug validation.
#' Uses BASELINE (Week 0) samples only.
#' Outcome: week 6 mucosal healing response (Yes/No).
#'
#' Processed expression and metadata are cached as RDS files in `data_dir`
#' and reused on later calls. Samples are kept only when their visit is
#' exactly "Week 0" and their week 6 response is "Yes" or "No"; samples with
#' a missing visit, response, or (when `include_placebo = FALSE`) treatment
#' annotation are excluded. The outcome is encoded in `metadata$response`
#' (1 = "Yes", 0 = "No").
#'
#' @param data_dir Directory for downloads and cache (default: "data")
#' @param include_placebo Include placebo arm in analysis (default: TRUE)
#' @return list(expression, metadata, outcome_col, description, report_context)
load_pursuit_data <- function(data_dir = "data", include_placebo = TRUE) {
  cat("Loading GSE92415 (PURSUIT Golimumab UC Trial — Validation)...\n")

  .ensure_package("GEOquery")
  .ensure_package("Biobase")

  library(GEOquery)
  library(Biobase)

  if (!dir.exists(data_dir)) dir.create(data_dir, recursive = TRUE)

  # --- Check for cached processed data ---
  cache_expr <- file.path(data_dir, "GSE92415_expression.rds")
  cache_meta <- file.path(data_dir, "GSE92415_metadata.rds")

  if (file.exists(cache_expr) && file.exists(cache_meta)) {
    cat(" Loading from cache...\n")
    expr <- readRDS(cache_expr)
    metadata <- readRDS(cache_meta)
  } else {
    # --- Download series matrix ---
    cat(" Downloading from GEO (may take 1-2 min)...\n")
    gse <- getGEO("GSE92415", GSEMatrix = TRUE, getGPL = TRUE,
                  destdir = data_dir)
    # getGEO() may return a list of ExpressionSets; only the first is used.
    if (is.list(gse)) gse <- gse[[1]]

    # --- Expression matrix ---
    expr_raw <- exprs(gse)
    cat(" Raw expression:", nrow(expr_raw), "probes x", ncol(expr_raw), "samples\n")

    # --- Probe to gene symbol mapping ---
    cat(" Mapping probes to gene symbols...\n")
    fdata <- fData(gse)
    sym_col <- grep("gene.symbol|^symbol$|gene_symbol",
                    colnames(fdata), ignore.case = TRUE, value = TRUE)
    if (length(sym_col) == 0) {
      stop("Cannot find gene symbol column in feature data.")
    }
    gene_symbols <- as.character(fdata[[sym_col[1]]])
    expr <- .collapse_probes_to_genes(expr_raw, gene_symbols)

    # --- Parse metadata ---
    pheno <- pData(gse)
    sample_ids <- colnames(expr)
    metadata <- data.frame(
      sample_id = sample_ids,
      treatment = as.character(pheno[sample_ids, "treatment:ch1"]),
      visit = as.character(pheno[sample_ids, "visit:ch1"]),
      wk6response = as.character(pheno[sample_ids, "wk6response:ch1"]),
      stringsAsFactors = FALSE
    )
    rownames(metadata) <- metadata$sample_id

    # Parse Mayo score (non-numeric entries become NA; the coercion warning
    # is deliberately suppressed)
    mayo_col <- grep("mayo.score", colnames(pheno), ignore.case = TRUE, value = TRUE)
    if (length(mayo_col) > 0) {
      metadata$mayo_score <- suppressWarnings(
        as.numeric(as.character(pheno[sample_ids, mayo_col[1]]))
      )
    }

    # --- Cache processed data ---
    saveRDS(expr, cache_expr)
    saveRDS(metadata, cache_meta)
    cat(" Cached processed data for fast subsequent loads\n")
  }

  # --- Filter to BASELINE (Week 0) samples with response labels ---
  # %in% never returns NA, unlike ==. An NA in the row mask would insert
  # all-NA rows into `metadata` while `expr` drops those samples, leaving the
  # two objects misaligned.
  is_baseline <- metadata$visit %in% "Week 0"
  has_response <- metadata$wk6response %in% c("Yes", "No")

  if (include_placebo) {
    use_samples <- is_baseline & has_response
    cohort_desc <- "golimumab + placebo"
  } else {
    is_golimumab <- metadata$treatment %in% "golimumab"
    use_samples <- is_baseline & has_response & is_golimumab
    cohort_desc <- "golimumab only"
  }

  expr <- expr[, colnames(expr) %in% rownames(metadata)[use_samples], drop = FALSE]
  metadata <- metadata[use_samples, , drop = FALSE]

  # Binary: 1 = responder (Yes), 0 = non-responder (No)
  metadata$response <- as.integer(metadata$wk6response == "Yes")
  n_responders <- sum(metadata$response == 1)
  n_nonresponders <- sum(metadata$response == 0)

  cat("\n\u2713 PURSUIT validation data loaded successfully\n")
  cat(" Expression:", nrow(expr), "genes x", ncol(expr), "samples\n")
  cat(" Cohort:", cohort_desc, "\n")
  cat(" Outcome: week 6 mucosal healing response (baseline prediction)\n")
  cat(" Distribution:", n_responders, "responders /",
      n_nonresponders, "non-responders\n")
  treatment_counts <- table(metadata$treatment)
  cat(" Treatment:", paste(names(treatment_counts),
                           treatment_counts, sep = "=", collapse = ", "), "\n")

  return(list(
    expression = expr,
    metadata = metadata,
    outcome_col = "response",
    description = paste(
      "PURSUIT Trial (GSE92415): Golimumab UC Phase 3 trial, baseline colonic biopsies.",
      "Affymetrix HT HG-U133+ PM (GPL13158), log2 RMA.", ncol(expr), "baseline samples.",
      "Binary outcome: week 6 mucosal healing response (Yes/No).",
      "Cohort:", cohort_desc
    ),
    # Structured context for comprehensive report generation
    report_context = list(
      disease_background = paste(
        "Ulcerative colitis (UC) is a chronic inflammatory bowel disease",
        "characterized by relapsing and remitting mucosal inflammation of the",
        "colon. Anti-TNF therapies (infliximab, adalimumab, golimumab) are a",
        "mainstay of treatment for moderate-to-severe UC, yet only 30-45% of",
        "patients achieve mucosal healing in Phase 3 trials [8]. Cross-drug",
        "validation of biomarker signatures derived from one biologic class",
        "(e.g., anti-IL-12/23) on a different class (e.g., anti-TNF) is essential",
        "to distinguish treatment-agnostic mucosal healing biology from",
        "drug-specific pharmacodynamic effects."
      ),
      trial_description = paste(
        "PURSUIT-SC (NCT00487539) was a Phase 2b/3, randomized, double-blind,",
        "placebo-controlled, multicenter clinical trial evaluating subcutaneous",
        "golimumab (Simponi) as induction therapy for moderate-to-severe",
        "ulcerative colitis [8]. Golimumab is a fully human IgG1 monoclonal",
        "antibody targeting tumor necrosis factor alpha (TNF-alpha), a key",
        "pro-inflammatory cytokine in UC pathogenesis. Patients received",
        "subcutaneous golimumab (200/100 mg or 400/200 mg induction dosing)",
        "or placebo, with clinical response assessed at week 6."
      ),
      patient_population = paste(
        "Gene expression profiling was performed on baseline (Week 0) colonic",
        "mucosal biopsies from", ncol(expr), "UC patients using the Affymetrix",
        "HT HG-U133+ PM Array Plate (GPL13158), the same platform as the",
        "discovery cohort (UNIFI, GSE206285). This shared platform is critical",
        "for cross-drug validation as it eliminates technical confounding from",
        "platform differences and enables direct application of the locked",
        "discovery model."
      ),
      endpoint_definition = paste(
        "The binary endpoint is mucosal healing response at week 6, defined by",
        "clinical assessment of endoscopic improvement.",
        n_responders, "patients were responders and",
        n_nonresponders, "were non-responders, providing a",
        "well-balanced validation cohort for signature evaluation."
      ),
      platform_description = paste(
        "Affymetrix HT HG-U133+ PM Array Plate (GPL13158), identical to the",
        "UNIFI discovery cohort platform. Expression values are log2",
        "RMA-normalized. This shared platform eliminates technical confounding",
        "and enables direct application of LASSO coefficients without",
        "cross-platform normalization."
      ),
      analytical_goals = c(
        paste("Validate the UNIFI-derived biomarker signature on an independent",
              "clinical trial with a different therapeutic mechanism (anti-TNF vs",
              "anti-IL-12/23) to assess cross-drug generalizability [8]."),
        paste("Evaluate whether baseline mucosal transcriptomics capture shared",
              "biology of treatment response across biologic drug classes in UC."),
        paste("Assess prediction performance using the locked discovery model",
              "applied to this independent validation cohort, providing an unbiased",
              "estimate of real-world signature performance [6].")
      ),
      references = list(
        tibshirani1996 = "[1] Tibshirani R. Regression shrinkage and selection via the lasso. J R Stat Soc Series B Stat Methodol. 1996;58(1):267-288.",
        zou2005 = "[2] Zou H, Hastie T. Regularization and variable selection via the elastic net. J R Stat Soc Series B Stat Methodol. 2005;67(2):301-320.",
        meinshausen2010 = "[3] Meinshausen N, Buhlmann P. Stability selection. J R Stat Soc Series B Stat Methodol. 2010;72(4):417-473.",
        friedman2010 = "[4] Friedman J, Hastie T, Tibshirani R. Regularization paths for generalized linear models via coordinate descent. J Stat Softw. 2010;33(1):1-22.",
        robin2011 = "[5] Robin X, Turck N, Hainard A, et al. pROC: an open-source package for R and S+ to analyze and compare ROC curves. BMC Bioinformatics. 2011;12:77.",
        ali2025 = "[6] Ali M, Western D, et al. A proteogenomic framework for diagnosis and subtyping of neurodegenerative dementia. Nat Med. 2025.",
        sands2019 = "[7] Sands BE, Sandborn WJ, Panaccione R, et al. Ustekinumab as induction and maintenance therapy for ulcerative colitis. N Engl J Med. 2019;381(13):1201-1214.",
        sandborn2014 = "[8] Sandborn WJ, Feagan BG, Marano C, et al. Subcutaneous golimumab induces clinical response and remission in patients with moderate-to-severe ulcerative colitis. Gastroenterology. 2014;146(1):85-95."
      )
    )
  ))
}
|
|
505
|
+
|
|
506
|
+
|
|
507
|
+
#' Validate user-provided input data
#'
#' Checks that the expression object and metadata are structurally usable
#' together: expression is a named matrix/data.frame, metadata is a
#' data.frame containing `outcome_col`, at least one sample ID is shared
#' between `colnames(expression)` and `rownames(metadata)`, and the non-missing
#' outcome over the shared samples has exactly two distinct values with at
#' least 10 samples each. Progress and a summary are printed with `cat()`.
#'
#' @param expression Expression matrix (genes x samples)
#' @param metadata Data frame with sample metadata
#' @param outcome_col Name of binary outcome column in metadata
#' @return TRUE if valid, stops with error otherwise
validate_input_data <- function(expression, metadata, outcome_col) {
  cat("Validating input data...\n")

  stopifnot("Expression must be a matrix or data.frame" =
              is.matrix(expression) || is.data.frame(expression))
  stopifnot("Expression must have row and column names" =
              !is.null(rownames(expression)) && !is.null(colnames(expression)))

  stopifnot("Metadata must be a data.frame" = is.data.frame(metadata))
  # `[[` below needs exactly one name; fail here with a readable message
  # rather than inside the extraction.
  stopifnot("outcome_col must be a single column name" =
              is.character(outcome_col) && length(outcome_col) == 1L &&
              !is.na(outcome_col))
  stopifnot("outcome_col must exist in metadata" = outcome_col %in% colnames(metadata))

  shared <- intersect(colnames(expression), rownames(metadata))
  stopifnot("No shared samples between expression and metadata" = length(shared) > 0)
  if (length(shared) < ncol(expression)) {
    cat(" WARNING:", ncol(expression) - length(shared),
        "expression samples not in metadata (will be excluded)\n")
  }

  outcome <- metadata[[outcome_col]][match(shared, rownames(metadata))]
  outcome <- outcome[!is.na(outcome)]
  # Unused factor levels would appear in table() with a count of 0 and make
  # the per-group size check fail even though the outcome is binary.
  if (is.factor(outcome)) outcome <- droplevels(outcome)
  unique_vals <- unique(outcome)
  stopifnot("Outcome must be binary (2 unique values)" = length(unique_vals) == 2)
  tab <- table(outcome)
  cat(" Outcome distribution:", paste(names(tab), "=", tab, collapse = ", "), "\n")
  stopifnot("Each outcome group must have >= 10 samples" = all(tab >= 10))

  cat("\u2713 Input data validation passed\n")
  cat(" Shared samples:", length(shared), "\n")
  cat(" Features:", nrow(expression), "\n")
  return(TRUE)
}
|
|
544
|
+
|
|
545
|
+
|
|
546
|
+
# ============================================================
|
|
547
|
+
# IMvigor210 — Atezolizumab Bladder Cancer IO Response
|
|
548
|
+
# ============================================================
|
|
549
|
+
|
|
550
|
+
#' Load IMvigor210 bladder cancer immunotherapy response data
|
|
551
|
+
#'
|
|
552
|
+
#' Loads the IMvigor210 Phase II trial dataset: 348 metastatic urothelial
|
|
553
|
+
#' carcinoma patients treated with atezolizumab (anti-PD-L1). Uses RECIST
|
|
554
|
+
#' response (CR/PR vs PD), RNA-seq counts (VST-normalized), and TMB.
|
|
555
|
+
#'
|
|
556
|
+
#' @return Named list with expression, metadata, outcome_col, description,
|
|
557
|
+
#' report_context — same structure as load_unifi_data()
|
|
558
|
+
load_imvigor210_data <- function() {
|
|
559
|
+
cat("Loading IMvigor210 atezolizumab bladder cancer data...\n")
|
|
560
|
+
|
|
561
|
+
# ---- Dependencies ----
|
|
562
|
+
.ensure_bioc()
|
|
563
|
+
.ensure_package("DESeq2")
|
|
564
|
+
|
|
565
|
+
# Install IMvigor210CoreBiologies if needed (from GitHub fork)
|
|
566
|
+
if (!requireNamespace("IMvigor210CoreBiologies", quietly = TRUE)) {
|
|
567
|
+
cat("Installing IMvigor210CoreBiologies (RNA-seq + clinical data, ~50MB)...\n")
|
|
568
|
+
if (!requireNamespace("DESeq", quietly = TRUE)) {
|
|
569
|
+
cat(" Installing legacy DESeq dependency...\n")
|
|
570
|
+
if (!requireNamespace("remotes", quietly = TRUE)) install.packages("remotes")
|
|
571
|
+
remotes::install_github("SiYangming/DESeq", upgrade = "never", quiet = TRUE)
|
|
572
|
+
}
|
|
573
|
+
remotes::install_github("SiYangming/IMvigor210CoreBiologies",
|
|
574
|
+
upgrade = "never", quiet = TRUE)
|
|
575
|
+
}
|
|
576
|
+
|
|
577
|
+
suppressPackageStartupMessages({
|
|
578
|
+
library(IMvigor210CoreBiologies)
|
|
579
|
+
library(DESeq2)
|
|
580
|
+
})
|
|
581
|
+
|
|
582
|
+
# ---- Load and extract ----
|
|
583
|
+
data(cds, envir = environment())
|
|
584
|
+
raw_counts <- counts(cds)
|
|
585
|
+
clinical <- pData(cds)
|
|
586
|
+
gene_info <- fData(cds)
|
|
587
|
+
|
|
588
|
+
cat(" Raw data: ", nrow(raw_counts), " genes x ", ncol(raw_counts), " samples\n", sep = "")
|
|
589
|
+
|
|
590
|
+
# ---- Convert Entrez IDs to gene symbols ----
|
|
591
|
+
symbols <- gene_info$Symbol
|
|
592
|
+
has_symbol <- symbols != "" & !is.na(symbols) & !duplicated(symbols)
|
|
593
|
+
raw_counts <- raw_counts[has_symbol, ]
|
|
594
|
+
rownames(raw_counts) <- symbols[has_symbol]
|
|
595
|
+
cat(" After symbol mapping:", nrow(raw_counts), "unique genes\n")
|
|
596
|
+
|
|
597
|
+
# ---- Filter samples: RECIST evaluable + TMB available ----
|
|
598
|
+
response <- clinical[["Best Confirmed Overall Response"]]
|
|
599
|
+
tmb <- clinical[["FMOne mutation burden per MB"]]
|
|
600
|
+
|
|
601
|
+
# CR/PR = responder (1), PD = non-responder (0), exclude SD/NE
|
|
602
|
+
keep <- response %in% c("CR", "PR", "PD") & !is.na(tmb)
|
|
603
|
+
raw_counts <- raw_counts[, keep]
|
|
604
|
+
clinical <- clinical[keep, ]
|
|
605
|
+
response <- response[keep]
|
|
606
|
+
tmb <- tmb[keep]
|
|
607
|
+
|
|
608
|
+
n_resp <- sum(response %in% c("CR", "PR"))
|
|
609
|
+
n_nonresp <- sum(response == "PD")
|
|
610
|
+
cat(" Evaluable samples (CR/PR vs PD, TMB available):", ncol(raw_counts), "\n")
|
|
611
|
+
cat(" Responders (CR/PR):", n_resp, "\n")
|
|
612
|
+
cat(" Non-responders (PD):", n_nonresp, "\n")
|
|
613
|
+
|
|
614
|
+
# ---- DESeq2 VST normalization ----
|
|
615
|
+
cat(" Running DESeq2 variance-stabilizing transformation...\n")
|
|
616
|
+
|
|
617
|
+
# Pre-filter low-count genes (require >= 10 counts in >= 10 samples)
|
|
618
|
+
gene_keep <- rowSums(raw_counts >= 10) >= 10
|
|
619
|
+
counts_filt <- raw_counts[gene_keep, ]
|
|
620
|
+
cat(" Genes after low-count filter:", nrow(counts_filt), "\n")
|
|
621
|
+
|
|
622
|
+
# Create DESeqDataSet (minimal design for normalization only)
|
|
623
|
+
col_data <- data.frame(
|
|
624
|
+
condition = ifelse(response[keep[keep]] %in% c("CR", "PR"), "responder", "non_responder"),
|
|
625
|
+
row.names = colnames(counts_filt)
|
|
626
|
+
)
|
|
627
|
+
# Use the clinical response directly
|
|
628
|
+
col_data$condition <- ifelse(response %in% c("CR", "PR"), "responder", "non_responder")
|
|
629
|
+
dds <- DESeqDataSetFromMatrix(
|
|
630
|
+
countData = counts_filt,
|
|
631
|
+
colData = col_data,
|
|
632
|
+
design = ~ 1 # intercept-only for normalization
|
|
633
|
+
)
|
|
634
|
+
vsd <- vst(dds, blind = TRUE)
|
|
635
|
+
expr <- assay(vsd)
|
|
636
|
+
cat(" VST-normalized:", nrow(expr), "genes x", ncol(expr), "samples\n")
|
|
637
|
+
|
|
638
|
+
# ---- TMB as extra feature (stored separately for force-inclusion) ----
|
|
639
|
+
# Log-transform TMB (right-skewed: range 0-62, median 8)
|
|
640
|
+
tmb_log <- log2(tmb + 1)
|
|
641
|
+
cat(" TMB_log2 range:", round(min(tmb_log), 1), "-", round(max(tmb_log), 1),
|
|
642
|
+
" (stored in $tmb_feature for force-inclusion after variance filtering)\n")
|
|
643
|
+
|
|
644
|
+
# ---- Build metadata ----
|
|
645
|
+
sample_ids <- colnames(expr)
|
|
646
|
+
metadata <- data.frame(
|
|
647
|
+
sample_id = sample_ids,
|
|
648
|
+
response = as.integer(response %in% c("CR", "PR")),
|
|
649
|
+
recist = as.character(response),
|
|
650
|
+
tmb = tmb,
|
|
651
|
+
tmb_log2 = tmb_log,
|
|
652
|
+
pdl1_ic = as.character(clinical[["IC Level"]]),
|
|
653
|
+
immune_phenotype = as.character(clinical[["Immune phenotype"]]),
|
|
654
|
+
sex = as.character(clinical[["Sex"]]),
|
|
655
|
+
row.names = sample_ids,
|
|
656
|
+
stringsAsFactors = FALSE
|
|
657
|
+
)
|
|
658
|
+
|
|
659
|
+
# ---- Report context ----
|
|
660
|
+
report_context <- list(
|
|
661
|
+
disease_background = paste(
|
|
662
|
+
"Urothelial carcinoma (bladder cancer) is the sixth most common",
|
|
663
|
+
"malignancy worldwide, with approximately 550,000 new cases annually.",
|
|
664
|
+
"Metastatic urothelial carcinoma (mUC) carries a poor prognosis,",
|
|
665
|
+
"with median overall survival of 12-15 months with platinum-based",
|
|
666
|
+
"chemotherapy. The advent of immune checkpoint inhibitors (ICIs)",
|
|
667
|
+
"targeting the PD-1/PD-L1 axis has transformed the treatment",
|
|
668
|
+
"landscape, with durable responses observed in a subset of patients.",
|
|
669
|
+
"However, only 15-25% of unselected patients respond to single-agent",
|
|
670
|
+
"anti-PD-L1 therapy [1], creating an urgent need for predictive",
|
|
671
|
+
"biomarkers to identify patients most likely to benefit from",
|
|
672
|
+
"immunotherapy and to spare non-responders from ineffective treatment."
|
|
673
|
+
),
|
|
674
|
+
trial_description = paste(
|
|
675
|
+
"IMvigor210 (NCT02108652) was a Phase II, single-arm, multicenter",
|
|
676
|
+
"clinical trial evaluating atezolizumab (Tecentriq) in patients with",
|
|
677
|
+
"locally advanced or metastatic urothelial carcinoma [1]. Atezolizumab",
|
|
678
|
+
"is a fully humanized IgG1 monoclonal antibody that selectively binds",
|
|
679
|
+
"PD-L1, blocking its interaction with PD-1 and B7.1 receptors on",
|
|
680
|
+
"T cells, thereby restoring anti-tumor immune responses. The trial",
|
|
681
|
+
"enrolled two cohorts: cisplatin-ineligible treatment-naive patients",
|
|
682
|
+
"(Cohort 1, n=119) and platinum-pre-treated patients (Cohort 2, n=310).",
|
|
683
|
+
"Molecular profiling included bulk RNA-seq, whole-exome sequencing",
|
|
684
|
+
"for tumor mutational burden (TMB), and PD-L1 IHC (Ventana SP142) [2]."
|
|
685
|
+
),
|
|
686
|
+
patient_population = paste(
|
|
687
|
+
"Comprehensive molecular profiling was performed on pre-treatment",
|
|
688
|
+
"tumor biopsies from 348 patients. After restricting to RECIST-evaluable",
|
|
689
|
+
"patients (CR/PR vs PD, excluding SD and NE) with available TMB data,",
|
|
690
|
+
n_resp + n_nonresp, "patients were retained for biomarker analysis:",
|
|
691
|
+
n_resp, "responders (CR/PR) and", n_nonresp, "non-responders (PD).",
|
|
692
|
+
"RNA-seq data includes", nrow(expr), "protein-coding genes.",
|
|
693
|
+
"Log2-transformed TMB is force-included as an additional feature",
|
|
694
|
+
"after variance-based gene filtering. Key clinical annotations",
|
|
695
|
+
"include PD-L1 immune cell (IC) score, immune phenotype classification",
|
|
696
|
+
"(desert/excluded/inflamed), and TCGA molecular subtype."
|
|
697
|
+
),
|
|
698
|
+
endpoint_definition = paste(
|
|
699
|
+
"The binary endpoint is objective response per RECIST v1.1:",
|
|
700
|
+
"responders (complete response [CR] or partial response [PR], n =",
|
|
701
|
+
paste0(n_resp, ")"), "versus non-responders (progressive disease [PD], n =",
|
|
702
|
+
paste0(n_nonresp, ")."),
|
|
703
|
+
"Patients with stable disease (SD, n=63) and not evaluable (NE, n=50)",
|
|
704
|
+
"were excluded to maximize signal clarity. This CR/PR vs PD contrast",
|
|
705
|
+
"captures the extremes of immunotherapy response and is the standard",
|
|
706
|
+
"endpoint for biomarker discovery in IO trials [3]."
|
|
707
|
+
),
|
|
708
|
+
platform_description = paste(
|
|
709
|
+
"Bulk RNA-seq performed on pre-treatment tumor biopsies. Raw counts",
|
|
710
|
+
"(31,286 Entrez gene IDs) were mapped to gene symbols, filtered to",
|
|
711
|
+
"genes with >= 10 counts in >= 10 samples, and normalized using DESeq2",
|
|
712
|
+
"variance-stabilizing transformation (VST) [4], yielding", nrow(expr) - 1,
|
|
713
|
+
"gene features. Tumor mutational burden (TMB) was calculated from the",
|
|
714
|
+
"FoundationOne genomic profiling panel as nonsynonymous mutations per",
|
|
715
|
+
"megabase and log2-transformed. TMB is force-included after variance-based",
|
|
716
|
+
"gene filtering to ensure its representation in the feature matrix."
|
|
717
|
+
),
|
|
718
|
+
analytical_goals = c(
|
|
719
|
+
paste("Identify a minimal biomarker panel (<15 features) from",
|
|
720
|
+
"pre-treatment tumor biopsies that predicts objective response",
|
|
721
|
+
"to atezolizumab in metastatic urothelial carcinoma, using",
|
|
722
|
+
"penalized logistic regression with LASSO [5] and elastic net [6]",
|
|
723
|
+
"regularization on combined transcriptomic and TMB features."),
|
|
724
|
+
paste("Evaluate signature stability through repeated nested",
|
|
725
|
+
"cross-validation with stability selection [7] to ensure robust",
|
|
726
|
+
"feature selection in the high-dimensional setting (p >> n)."),
|
|
727
|
+
paste("Determine whether integrated transcriptomic + TMB features",
|
|
728
|
+
"can improve upon TMB-alone or PD-L1-alone biomarker strategies,",
|
|
729
|
+
"which have limited predictive accuracy in isolation [2][3]."),
|
|
730
|
+
paste("Characterize selected biomarkers through pathway enrichment,",
|
|
731
|
+
"tumor microenvironment cell-type expression mapping, and",
|
|
732
|
+
"cross-reference with bladder cancer GWAS risk loci and",
|
|
733
|
+
"ICI-relevant immune genes to interpret the biological basis",
|
|
734
|
+
"of the predictive signature.")
|
|
735
|
+
),
|
|
736
|
+
published_benchmarks = list(
|
|
737
|
+
intro = paste(
|
|
738
|
+
"Predicting immunotherapy response from pre-treatment molecular",
|
|
739
|
+
"profiling is an active and challenging area. Published biomarker",
|
|
740
|
+
"strategies for anti-PD-L1/PD-1 response in urothelial carcinoma",
|
|
741
|
+
"report AUCs ranging from 0.55 (PD-L1 alone) to ~0.80 (multi-modal",
|
|
742
|
+
"integrative models), depending on the features used and validation",
|
|
743
|
+
"approach [2][8]."
|
|
744
|
+
),
|
|
745
|
+
studies = data.frame(
|
|
746
|
+
study = c(
|
|
747
|
+
"Mariathasan et al. 2018 [1]",
|
|
748
|
+
"Mariathasan et al. 2018 [1]",
|
|
749
|
+
"Boll & Bellmunt 2025 [8]",
|
|
750
|
+
"Cristescu et al. 2022 [9]"
|
|
751
|
+
),
|
|
752
|
+
drug = c("Atezolizumab", "Atezolizumab",
|
|
753
|
+
"Atezolizumab (+ multi-cohort)", "Pembrolizumab (pan-tumor)"),
|
|
754
|
+
validated_auc = c("~0.55-0.60", "Association (p<0.001)",
|
|
755
|
+
"~0.70-0.75", "~0.74"),
|
|
756
|
+
method = c("PD-L1 IC score (SP142 IHC)",
|
|
757
|
+
"TMB (FoundationOne)",
|
|
758
|
+
"LASSO logistic regression (expression + molecular)",
|
|
759
|
+
"IRS: TMB + PD1 + PDL1 + TOP2A + ADAM12"),
|
|
760
|
+
notes = c("PD-L1 IC2+ enriches ORR (~27%) vs IC0 (~8%); low discrimination as continuous predictor",
|
|
761
|
+
"TMB-high (>=10 mut/Mb) associated with higher response; not sufficient alone",
|
|
762
|
+
"Multi-cohort integrated model; 707 patients; APOBEC signature + macrophage markers",
|
|
763
|
+
"Pan-solid-tumor IRS validated across 7 tumor types including UC"),
|
|
764
|
+
stringsAsFactors = FALSE
|
|
765
|
+
),
|
|
766
|
+
context = paste(
|
|
767
|
+
"Compared to single-analyte biomarkers (PD-L1 IHC, TMB cutoff),",
|
|
768
|
+
"integrated transcriptomic + genomic models show improved",
|
|
769
|
+
"discrimination. LASSO-based feature selection on RNA-seq data",
|
|
770
|
+
"combined with TMB represents a data-driven alternative to",
|
|
771
|
+
"hypothesis-driven signatures, with the potential to discover",
|
|
772
|
+
"novel predictive biology beyond known immune markers."
|
|
773
|
+
)
|
|
774
|
+
),
|
|
775
|
+
references = list(
|
|
776
|
+
mariathasan2018 = paste("[1] Mariathasan S, Turley SJ, Nickles D, et al.",
|
|
777
|
+
"TGF-beta attenuates tumour response to PD-L1 blockade by contributing",
|
|
778
|
+
"to exclusion of T cells. Nature. 2018;554(7693):544-548."),
|
|
779
|
+
rosenberg2016 = paste("[2] Rosenberg JE, Hoffman-Censits J, Powles T, et al.",
|
|
780
|
+
"Atezolizumab in patients with locally advanced and metastatic",
|
|
781
|
+
"urothelial carcinoma who have progressed following treatment with",
|
|
782
|
+
"platinum-based chemotherapy. Lancet. 2016;387(10031):1909-1920."),
|
|
783
|
+
powles2018 = paste("[3] Powles T, Duran I, van der Heijden MS, et al.",
|
|
784
|
+
"Atezolizumab versus chemotherapy in patients with platinum-treated",
|
|
785
|
+
"locally advanced or metastatic urothelial carcinoma (IMvigor211).",
|
|
786
|
+
"Lancet. 2018;391(10122):748-757."),
|
|
787
|
+
love2014 = paste("[4] Love MI, Huber W, Anders S. Moderated estimation",
|
|
788
|
+
"of fold change and dispersion for RNA-seq data with DESeq2.",
|
|
789
|
+
"Genome Biology. 2014;15(12):550."),
|
|
790
|
+
tibshirani1996 = paste("[5] Tibshirani R. Regression shrinkage and selection",
|
|
791
|
+
"via the lasso. J Royal Stat Soc B. 1996;58(1):267-288."),
|
|
792
|
+
zou2005 = paste("[6] Zou H, Hastie T. Regularization and variable selection",
|
|
793
|
+
"via the elastic net. J Royal Stat Soc B. 2005;67(2):301-320."),
|
|
794
|
+
meinshausen2010 = paste("[7] Meinshausen N, Buhlmann P. Stability selection.",
|
|
795
|
+
"J Royal Stat Soc B. 2010;72(4):417-473."),
|
|
796
|
+
boll2025 = paste("[8] Boll LM, Bellmunt J, et al. Predicting immunotherapy",
|
|
797
|
+
"response of advanced bladder cancer through meta-analysis of",
|
|
798
|
+
"multi-omics data. Nature Communications. 2025;16:1592."),
|
|
799
|
+
cristescu2022 = paste("[9] Cristescu R, Aurora-Garg D, Engelman JA, et al.",
|
|
800
|
+
"Molecular analysis of a Phase II open-label study of atezolizumab",
|
|
801
|
+
"plus nab-paclitaxel vs paclitaxel in triple-negative breast cancer.",
|
|
802
|
+
"Clin Cancer Res. 2022;28(23):5175-5186."),
|
|
803
|
+
samstein2019 = paste("[10] Samstein RM, Lee CH, Shoushtari AN, et al.",
|
|
804
|
+
"Tumor mutational burden predicts immunotherapy response across",
|
|
805
|
+
"cancer types. Nat Genet. 2019;51(2):202-206.")
|
|
806
|
+
)
|
|
807
|
+
)
|
|
808
|
+
|
|
809
|
+
# TMB feature vector (named by sample ID, log2-transformed; append to the feature matrix after variance filtering)
|
|
810
|
+
tmb_feature <- setNames(tmb_log, sample_ids)
|
|
811
|
+
|
|
812
|
+
result <- list(
|
|
813
|
+
expression = expr,
|
|
814
|
+
metadata = metadata,
|
|
815
|
+
outcome_col = "response",
|
|
816
|
+
tmb_feature = tmb_feature, # Force-include after prepare_feature_matrix()
|
|
817
|
+
description = paste("IMvigor210 Phase II trial:", n_resp + n_nonresp,
|
|
818
|
+
"metastatic urothelial carcinoma patients treated with",
|
|
819
|
+
"atezolizumab (anti-PD-L1). RNA-seq + TMB features,",
|
|
820
|
+
"RECIST response endpoint (CR/PR vs PD)."),
|
|
821
|
+
report_context = report_context
|
|
822
|
+
)
|
|
823
|
+
|
|
824
|
+
cat("\n\u2713 IMvigor210 data loaded successfully!\n")
|
|
825
|
+
cat(" Expression:", nrow(expr), "genes x", ncol(expr), "samples\n")
|
|
826
|
+
cat(" TMB: stored in $tmb_feature (use after prepare_feature_matrix)\n")
|
|
827
|
+
cat(" Outcome: ", n_resp, " responders (CR/PR) vs ",
|
|
828
|
+
n_nonresp, " non-responders (PD)\n", sep = "")
|
|
829
|
+
|
|
830
|
+
return(result)
|
|
831
|
+
}
|
|
832
|
+
|
|
833
|
+
|
|
834
|
+
#' Load Breast Cancer Neoadjuvant Chemotherapy pCR Data (GSE25055)
|
|
835
|
+
#'
|
|
836
|
+
#' Downloads baseline tumor gene expression from the Hatzis et al. 2011 study.
|
|
837
|
+
#' Affymetrix Human Genome U133A Array (GPL96), 310 HER2-negative breast cancer
|
|
838
|
+
#' patients treated with neoadjuvant taxane-anthracycline chemotherapy (T/FAC).
|
|
839
|
+
#'
|
|
840
|
+
#' @param data_dir Directory for downloads and cache (default: "data")
|
|
841
|
+
#' @param outcome "subtype" (default, Basal-like vs Luminal A classification) or
|
|
842
|
+
#' "pcr" (pathological complete response prediction)
|
|
843
|
+
#' @return Named list with expression, metadata, outcome_col, description,
|
|
844
|
+
#' report_context — same structure as load_unifi_data()
|
|
845
|
+
|
|
846
|
+
# ---- Breast cancer report context helpers ----
|
|
847
|
+
|
|
848
|
+
# Assemble the report context for the Basal-like vs Luminal A
# subtype-classification analysis of GSE25055.
#
# Arguments:
#   n_basal  Number of Basal-like tumors (pasted into the narrative text).
#   n_luma   Number of Luminal A tumors (pasted into the narrative text).
#
# Returns a named list with, in this order: disease_background,
# trial_description, patient_population, endpoint_definition,
# platform_description (single strings built with paste()),
# analytical_goals (character vector of four goals),
# published_benchmarks (list with intro, studies data.frame, context) and
# references (named list of eleven citation strings).
.breast_cancer_subtype_report_context <- function(n_basal, n_luma) {
  # Cohort size shown at the start of the patient-population paragraph.
  n_total <- n_basal + n_luma

  # ---- Narrative sections ----
  background_text <- paste(
    "Breast cancer is the most commonly diagnosed malignancy in women",
    "worldwide, with approximately 2.3 million new cases annually [1].",
    "Molecular subtyping has revolutionized breast cancer classification,",
    "moving beyond traditional histopathological grading to gene",
    "expression-based intrinsic subtypes that capture distinct biology,",
    "prognosis, and treatment sensitivity [2]. The PAM50 classifier",
    "identifies five intrinsic subtypes: Luminal A, Luminal B,",
    "HER2-enriched, Basal-like, and Normal-like [3]. Basal-like tumors",
    "(~15-20% of cases) are the most aggressive subtype, characterized",
    "by high proliferation, frequent TP53 mutations, and poor prognosis,",
    "while Luminal A tumors (~40% of cases) have the best prognosis",
    "and respond to endocrine therapy [4][5]. Accurate molecular",
    "subtype classification from gene expression enables precision",
    "treatment selection and prognostic stratification."
  )

  trial_text <- paste(
    "GSE25055 comprises tumor biopsies from breast cancer patients",
    "with PAM50 molecular subtype annotations derived from gene",
    "expression profiling [6]. The classification task focuses on",
    "distinguishing Basal-like from Luminal A tumors — the two most",
    "biologically distinct subtypes representing opposite ends of the",
    "breast cancer molecular spectrum. Basal-like tumors express basal",
    "cytokeratins (KRT5/6/17), EGFR, and lack ER/PR/HER2 expression",
    "(triple-negative phenotype), while Luminal A tumors express ESR1,",
    "FOXA1, GATA3, and related ER-pathway genes [3][4]."
  )

  population_text <- paste(
    n_total, "tumors were selected for the Basal-like vs",
    "Luminal A classification analysis:", n_basal, "Basal-like and",
    n_luma, "Luminal A tumors. This represents the two most clearly",
    "distinct molecular subtypes, with well-characterized genomic",
    "differences spanning estrogen receptor signaling, proliferation",
    "programs, DNA damage repair, and immune microenvironment",
    "composition [4][5]. Gene expression was profiled on the",
    "Affymetrix Human Genome U133A Array, yielding approximately",
    "13,000 unique gene features after probe-to-gene collapse."
  )

  # paste0() glues the closing parenthesis onto the count so the text reads
  # "n = <count>)" without a space before ")".
  basal_count_label <- paste0(n_basal, ")")
  luma_count_label <- paste0(n_luma, ")")
  endpoint_text <- paste(
    "The binary endpoint is molecular subtype classification.",
    "Basal-like (coded as 1, n =", basal_count_label, "represents",
    "the aggressive basal-like intrinsic subtype characterized by",
    "high proliferation, basal cytokeratin expression, and absence",
    "of ER/PR/HER2. Luminal A (coded as 0, n =",
    luma_count_label, "represents the indolent luminal subtype",
    "driven by estrogen receptor signaling. Subtype assignments were",
    "determined by the PAM50 centroid classifier applied to the",
    "gene expression data [3]."
  )

  platform_text <- paste(
    "Gene expression profiling was performed on tumor biopsies",
    "using the Affymetrix Human Genome U133A Array (GPL96). Data",
    "were RMA-normalized (log2 scale). Probe-level data were",
    "collapsed to gene-level expression by selecting the probe with",
    "highest variance for each gene symbol, yielding approximately",
    "13,000 unique gene features."
  )

  # ---- Analytical goals (one string per goal) ----
  goal_panel <- paste(
    "Identify a minimal biomarker panel from tumor gene",
    "expression that distinguishes Basal-like from Luminal A",
    "breast cancer subtypes, using penalized logistic regression",
    "with elastic net [7][8] regularization."
  )
  goal_stability <- paste(
    "Evaluate signature stability through repeated nested",
    "cross-validation with stability selection [9] to ensure",
    "robust feature selection across resampled datasets."
  )
  goal_known_genes <- paste(
    "Recover known subtype-defining genes (ESR1, FOXA1, GATA3",
    "for Luminal; KRT5, KRT17, EGFR for Basal) to validate",
    "the methodology against established biology [3][4]."
  )
  goal_characterize <- paste(
    "Characterize selected biomarkers through pathway enrichment,",
    "cell-type expression mapping in breast tissue, and",
    "cross-reference with breast cancer GWAS risk loci to",
    "interpret the biological basis of the predictive panel."
  )

  # ---- Published benchmarks ----
  benchmark_intro <- paste(
    "Breast cancer molecular subtype classification from gene",
    "expression is a well-established problem with excellent",
    "performance. The PAM50 assay (Prosigna) is FDA-cleared for",
    "clinical use [3]. Machine learning approaches routinely",
    "achieve AUC > 0.95 for subtype classification [10][11]."
  )

  benchmark_studies <- c(
    "Parker et al. 2009 [3]",
    "TCGA Network 2012 [4]",
    "Prat et al. 2015 [5]",
    "Berger et al. 2023 [10]"
  )
  benchmark_table <- data.frame(
    study = benchmark_studies,
    # Every benchmark row is a classification study, so no drug applies.
    drug = rep("N/A (classification)", length(benchmark_studies)),
    validated_auc = c(
      "PAM50 classifier (clinical standard)",
      "Multi-platform classification >0.95",
      "Subtype-specific survival stratification",
      "ML classifiers AUC >0.98"
    ),
    method = c(
      "50-gene centroid classifier",
      "Integrated multi-omics analysis",
      "PAM50 with clinical integration",
      "LASSO/random forest/SVM comparison"
    ),
    notes = c(
      "Clinical standard; FDA-cleared as Prosigna",
      "Comprehensive molecular portraits; defined subtypes",
      "Clinical utility of intrinsic subtypes for treatment",
      "ML methods replicate PAM50 with fewer genes"
    ),
    stringsAsFactors = FALSE
  )

  benchmark_context <- paste(
    "The Basal-like vs Luminal A distinction represents the",
    "most biologically distinct comparison in breast cancer,",
    "driven by estrogen receptor signaling (ESR1, FOXA1, GATA3),",
    "proliferation programs (MKI67, CDC20, TPX2), and basal",
    "cytokeratin expression (KRT5, KRT17). LASSO-based feature",
    "selection consistently identifies these pathway genes,",
    "confirming the biological validity of penalized regression",
    "for biomarker discovery in cancer genomics."
  )

  # ---- Reference list (numbering matches the bracketed citations) ----
  reference_list <- list(
    sung2021 = paste(
      "[1] Sung H, Ferlay J, Siegel RL, et al. Global Cancer",
      "Statistics 2020: GLOBOCAN Estimates. CA Cancer J Clin.",
      "2021;71(3):209-249."
    ),
    perou2000 = paste(
      "[2] Perou CM, Sorlie T, Eisen MB, et al. Molecular",
      "portraits of human breast tumours. Nature.",
      "2000;406(6797):747-752."
    ),
    parker2009 = paste(
      "[3] Parker JS, Mullins M, Cheang MC, et al. Supervised",
      "risk predictor of breast cancer based on intrinsic subtypes.",
      "J Clin Oncol. 2009;27(8):1160-1167."
    ),
    tcga2012 = paste(
      "[4] Cancer Genome Atlas Network. Comprehensive molecular",
      "portraits of human breast tumours. Nature.",
      "2012;490(7418):61-70."
    ),
    prat2015 = paste(
      "[5] Prat A, Fan C, Fernandez A, et al. Response and",
      "survival of breast cancer intrinsic subtypes following",
      "multi-agent neoadjuvant chemotherapy. BMC Med. 2015;13:303."
    ),
    hatzis2011 = paste(
      "[6] Hatzis C, Pusztai L, Valero V, et al. A genomic",
      "predictor of response and survival following taxane-anthracycline",
      "chemotherapy for invasive breast cancer. JAMA.",
      "2011;305(18):1873-1881."
    ),
    tibshirani1996 = paste(
      "[7] Tibshirani R. Regression shrinkage and selection",
      "via the lasso. J Royal Stat Soc B. 1996;58(1):267-288."
    ),
    zou2005 = paste(
      "[8] Zou H, Hastie T. Regularization and variable selection",
      "via the elastic net. J Royal Stat Soc B. 2005;67(2):301-320."
    ),
    meinshausen2010 = paste(
      "[9] Meinshausen N, Buhlmann P. Stability selection.",
      "J Royal Stat Soc B. 2010;72(4):417-473."
    ),
    berger2023 = paste(
      "[10] Berger AC, et al. Machine learning approaches for",
      "breast cancer molecular subtype classification. npj Breast",
      "Cancer. 2023;9:42."
    ),
    weigelt2010 = paste(
      "[11] Weigelt B, Mackay A, A'Hern R, et al. Breast cancer",
      "molecular profiling with single sample predictors: a",
      "retrospective analysis. Lancet Oncol. 2010;11(4):339-349."
    )
  )

  list(
    disease_background = background_text,
    trial_description = trial_text,
    patient_population = population_text,
    endpoint_definition = endpoint_text,
    platform_description = platform_text,
    analytical_goals = c(
      goal_panel,
      goal_stability,
      goal_known_genes,
      goal_characterize
    ),
    published_benchmarks = list(
      intro = benchmark_intro,
      studies = benchmark_table,
      context = benchmark_context
    ),
    references = reference_list
  )
}
|
|
1005
|
+
|
|
1006
|
+
# Assemble the report context for the pCR (pathological complete response)
# vs residual-disease analysis of GSE25055.
#
# Arguments:
#   n_pcr  Number of patients with pCR (pasted into the narrative text).
#   n_rd   Number of patients with residual disease (pasted into the text).
#
# Returns a named list with, in this order: disease_background,
# trial_description, patient_population, endpoint_definition,
# platform_description (single strings built with paste()),
# analytical_goals (character vector of three goals),
# published_benchmarks (list with intro, studies data.frame, context) and
# references (named list of ten citation strings).
.breast_cancer_pcr_report_context <- function(n_pcr, n_rd) {
  # Cohort size is quoted in both the trial and population paragraphs.
  n_total <- n_pcr + n_rd

  list(
    disease_background = paste(
      "Breast cancer is the most commonly diagnosed malignancy in women",
      "worldwide, with approximately 2.3 million new cases annually [1].",
      "Neoadjuvant chemotherapy (NAC) is the standard of care for locally",
      "advanced breast cancer. Pathological complete response (pCR) is a",
      "validated surrogate endpoint for long-term survival [2][3].",
      "However, pCR rates vary dramatically by molecular subtype [4].",
      "Identifying patients likely to achieve pCR before treatment could",
      "spare non-responders from ineffective cytotoxic therapy."
    ),
    trial_description = paste(
      "GSE25055 comprises baseline pre-treatment tumor biopsies from",
      n_total, "HER2-negative breast cancer patients treated with",
      "neoadjuvant taxane-anthracycline chemotherapy (T/FAC regimen)",
      "across multiple institutions [5]. This is part of the landmark",
      # Journal corrected from "JCO" to "JAMA" so the narrative agrees with
      # reference [5] below (JAMA. 2011;305(18):1873-1881).
      "study by Hatzis et al. (JAMA 2011) [5]."
    ),
    patient_population = paste(
      n_total, "patients with stage I-III HER2-negative breast cancer.",
      n_pcr, "achieved pCR and", n_rd, "had residual disease (RD).",
      "The cohort includes ER-positive and ER-negative patients across",
      "multiple PAM50 intrinsic subtypes."
    ),
    endpoint_definition = paste(
      "Binary endpoint: pathological complete response (pCR, coded as 1,",
      # paste0() keeps the closing punctuation attached to the count.
      "n =", paste0(n_pcr, ")"), "vs residual disease (RD, coded as 0,",
      "n =", paste0(n_rd, ")."), "pCR is an FDA-accepted surrogate",
      "endpoint for drug approval in the neoadjuvant setting [3]."
    ),
    platform_description = paste(
      "Affymetrix Human Genome U133A Array (GPL96). RMA-normalized",
      "(log2 scale). Probe-to-gene collapse by highest variance probe."
    ),
    analytical_goals = c(
      paste(
        "Identify a biomarker panel predicting pCR to neoadjuvant",
        "chemotherapy using elastic net regularization [6][7]."
      ),
      "Evaluate stability through repeated nested CV [8].",
      "Compare against published pCR predictors [5][9][10]."
    ),
    published_benchmarks = list(
      intro = paste(
        "Published LASSO/ML approaches report AUCs of 0.73-0.97",
        "for pCR prediction depending on feature set and subtype [9][10]."
      ),
      studies = data.frame(
        study = c("Hatzis et al. 2011 [5]", "Li et al. 2021 [10]"),
        drug = c("T/FAC", "T/FAC"),
        validated_auc = c("0.77 (DLDA30)", "0.91-0.97 (25-gene immune)"),
        method = c("DLDA30 genomic predictor", "LASSO on 320 immune genes"),
        notes = c("First large-scale predictor", "Pre-filtered immune genes"),
        stringsAsFactors = FALSE
      ),
      context = paste(
        "Immune infiltration is the dominant signal predicting pCR,",
        "especially in ER-negative subtypes [9]."
      )
    ),
    # Numbering matches the bracketed citations used in the text above.
    references = list(
      sung2021 = paste(
        "[1] Sung H, et al. Global Cancer Statistics 2020.",
        "CA Cancer J Clin. 2021;71(3):209-249."
      ),
      cortazar2014 = paste(
        "[2] Cortazar P, et al. Pathological complete response",
        "and long-term benefit. Lancet. 2014;384(9938):164-172."
      ),
      fda2020 = "[3] FDA Guidance: pCR in Neoadjuvant Breast Cancer. 2020.",
      carey2007 = paste(
        "[4] Carey LA, et al. Triple negative paradox.",
        "Clin Cancer Res. 2007;13(8):2329-2334."
      ),
      hatzis2011 = paste(
        "[5] Hatzis C, et al. Genomic predictor for breast cancer.",
        "JAMA. 2011;305(18):1873-1881."
      ),
      tibshirani1996 = "[6] Tibshirani R. LASSO. JRSS-B. 1996;58(1):267-288.",
      zou2005 = "[7] Zou H, Hastie T. Elastic net. JRSS-B. 2005;67(2):301-320.",
      meinshausen2010 = paste(
        "[8] Meinshausen N, Buhlmann P. Stability selection.",
        "JRSS-B. 2010;72(4):417-473."
      ),
      denkert2018 = paste(
        "[9] Denkert C, et al. TILs and prognosis in breast cancer.",
        "Lancet Oncol. 2018;19(1):40-50."
      ),
      li2021 = paste(
        "[10] Li L, et al. Immune signature for pCR prediction.",
        "Front Immunol. 2021;12:704655."
      )
    )
  )
}
|
|
1086
|
+
|
|
1087
|
+
load_breast_cancer_pcr_data <- function(data_dir = "data", outcome = "subtype") {
|
|
1088
|
+
cat("Loading GSE25055 (Breast Cancer Gene Expression)...\n")
|
|
1089
|
+
|
|
1090
|
+
.ensure_package("GEOquery")
|
|
1091
|
+
.ensure_package("Biobase")
|
|
1092
|
+
|
|
1093
|
+
library(GEOquery)
|
|
1094
|
+
library(Biobase)
|
|
1095
|
+
|
|
1096
|
+
if (!dir.exists(data_dir)) dir.create(data_dir, recursive = TRUE)
|
|
1097
|
+
|
|
1098
|
+
# --- Check for cached processed data ---
|
|
1099
|
+
cache_expr <- file.path(data_dir, "GSE25055_expression.rds")
|
|
1100
|
+
cache_meta <- file.path(data_dir, "GSE25055_metadata.rds")
|
|
1101
|
+
|
|
1102
|
+
if (file.exists(cache_expr) && file.exists(cache_meta)) {
|
|
1103
|
+
cat(" Loading from cache...\n")
|
|
1104
|
+
expr <- readRDS(cache_expr)
|
|
1105
|
+
metadata <- readRDS(cache_meta)
|
|
1106
|
+
} else {
|
|
1107
|
+
# --- Download series matrix ---
|
|
1108
|
+
cat(" Downloading from GEO (may take 2-5 min for 310 samples)...\n")
|
|
1109
|
+
gse <- getGEO("GSE25055", GSEMatrix = TRUE, getGPL = TRUE,
|
|
1110
|
+
destdir = data_dir)
|
|
1111
|
+
if (is.list(gse)) gse <- gse[[1]]
|
|
1112
|
+
|
|
1113
|
+
# --- Expression matrix ---
|
|
1114
|
+
expr_raw <- exprs(gse)
|
|
1115
|
+
cat(" Raw expression:", nrow(expr_raw), "probes x", ncol(expr_raw), "samples\n")
|
|
1116
|
+
|
|
1117
|
+
# --- Probe to gene symbol mapping ---
|
|
1118
|
+
cat(" Mapping probes to gene symbols...\n")
|
|
1119
|
+
fdata <- fData(gse)
|
|
1120
|
+
sym_col <- grep("gene.symbol|^symbol$|gene_symbol",
|
|
1121
|
+
colnames(fdata), ignore.case = TRUE, value = TRUE)
|
|
1122
|
+
if (length(sym_col) == 0) {
|
|
1123
|
+
stop("Cannot find gene symbol column in feature data. ",
|
|
1124
|
+
"Re-download with getGPL=TRUE.")
|
|
1125
|
+
}
|
|
1126
|
+
gene_symbols <- as.character(fdata[[sym_col[1]]])
|
|
1127
|
+
expr <- .collapse_probes_to_genes(expr_raw, gene_symbols)
|
|
1128
|
+
|
|
1129
|
+
# --- Parse metadata ---
|
|
1130
|
+
pheno <- pData(gse)
|
|
1131
|
+
|
|
1132
|
+
# Find pCR column
|
|
1133
|
+
pcr_col <- grep("pathologic_response_pcr_rd|pcr|pathologic.response",
|
|
1134
|
+
colnames(pheno), ignore.case = TRUE, value = TRUE)
|
|
1135
|
+
if (length(pcr_col) == 0) {
|
|
1136
|
+
# Try characteristics columns
|
|
1137
|
+
char_cols <- grep("characteristics", colnames(pheno),
|
|
1138
|
+
ignore.case = TRUE, value = TRUE)
|
|
1139
|
+
for (cc in char_cols) {
|
|
1140
|
+
vals <- as.character(pheno[[cc]])
|
|
1141
|
+
if (any(grepl("pathologic_response|pcr_rd|pcr", vals, ignore.case = TRUE))) {
|
|
1142
|
+
pcr_col <- cc
|
|
1143
|
+
# Extract value after ": "
|
|
1144
|
+
pheno[[cc]] <- sub(".*: ", "", vals)
|
|
1145
|
+
break
|
|
1146
|
+
}
|
|
1147
|
+
}
|
|
1148
|
+
}
|
|
1149
|
+
|
|
1150
|
+
if (length(pcr_col) == 0) {
|
|
1151
|
+
stop("Cannot find pathologic response column in metadata. ",
|
|
1152
|
+
"Check pData(gse) columns.")
|
|
1153
|
+
}
|
|
1154
|
+
pcr_col <- pcr_col[1]
|
|
1155
|
+
cat(" Using pCR column:", pcr_col, "\n")
|
|
1156
|
+
|
|
1157
|
+
# Extract response values
|
|
1158
|
+
pcr_values <- as.character(pheno[[pcr_col]])
|
|
1159
|
+
cat(" Response values:", paste(sort(unique(pcr_values)), collapse = ", "), "\n")
|
|
1160
|
+
|
|
1161
|
+
# Parse ER status
|
|
1162
|
+
er_col <- grep("er_status|estrogen", colnames(pheno),
|
|
1163
|
+
ignore.case = TRUE, value = TRUE)
|
|
1164
|
+
er_status <- if (length(er_col) > 0) {
|
|
1165
|
+
vals <- as.character(pheno[[er_col[1]]])
|
|
1166
|
+
sub(".*: ", "", vals)
|
|
1167
|
+
} else { rep(NA, ncol(expr)) }
|
|
1168
|
+
|
|
1169
|
+
# Parse HER2 status
|
|
1170
|
+
her2_col <- grep("her2_status|erbb2", colnames(pheno),
|
|
1171
|
+
ignore.case = TRUE, value = TRUE)
|
|
1172
|
+
her2_status <- if (length(her2_col) > 0) {
|
|
1173
|
+
vals <- as.character(pheno[[her2_col[1]]])
|
|
1174
|
+
sub(".*: ", "", vals)
|
|
1175
|
+
} else { rep(NA, ncol(expr)) }
|
|
1176
|
+
|
|
1177
|
+
# Parse PAM50 subtype
|
|
1178
|
+
pam50_col <- grep("pam50", colnames(pheno),
|
|
1179
|
+
ignore.case = TRUE, value = TRUE)
|
|
1180
|
+
pam50 <- if (length(pam50_col) > 0) {
|
|
1181
|
+
vals <- as.character(pheno[[pam50_col[1]]])
|
|
1182
|
+
sub(".*: ", "", vals)
|
|
1183
|
+
} else { rep(NA, ncol(expr)) }
|
|
1184
|
+
|
|
1185
|
+
# Parse grade
|
|
1186
|
+
grade_col <- grep("^grade", colnames(pheno),
|
|
1187
|
+
ignore.case = TRUE, value = TRUE)
|
|
1188
|
+
grade <- if (length(grade_col) > 0) {
|
|
1189
|
+
vals <- as.character(pheno[[grade_col[1]]])
|
|
1190
|
+
sub(".*: ", "", vals)
|
|
1191
|
+
} else { rep(NA, ncol(expr)) }
|
|
1192
|
+
|
|
1193
|
+
# Build metadata
|
|
1194
|
+
metadata <- data.frame(
|
|
1195
|
+
sample_id = colnames(expr),
|
|
1196
|
+
pcr = pcr_values,
|
|
1197
|
+
er_status = er_status,
|
|
1198
|
+
her2_status = her2_status,
|
|
1199
|
+
pam50_subtype = pam50,
|
|
1200
|
+
grade = grade,
|
|
1201
|
+
row.names = colnames(expr),
|
|
1202
|
+
stringsAsFactors = FALSE
|
|
1203
|
+
)
|
|
1204
|
+
|
|
1205
|
+
# Binarize: pCR = 1, RD = 0
|
|
1206
|
+
metadata$response <- ifelse(toupper(metadata$pcr) == "PCR", 1L,
|
|
1207
|
+
ifelse(toupper(metadata$pcr) == "RD", 0L, NA_integer_))
|
|
1208
|
+
|
|
1209
|
+
# Remove samples with missing response
|
|
1210
|
+
valid <- !is.na(metadata$response)
|
|
1211
|
+
if (sum(!valid) > 0) {
|
|
1212
|
+
cat(" Removing", sum(!valid), "samples with missing/ambiguous response\n")
|
|
1213
|
+
metadata <- metadata[valid, ]
|
|
1214
|
+
expr <- expr[, valid, drop = FALSE]
|
|
1215
|
+
}
|
|
1216
|
+
|
|
1217
|
+
# --- Cache ---
|
|
1218
|
+
saveRDS(expr, cache_expr)
|
|
1219
|
+
saveRDS(metadata, cache_meta)
|
|
1220
|
+
cat(" Cached processed data for faster re-loading\n")
|
|
1221
|
+
}
|
|
1222
|
+
|
|
1223
|
+
# ---- Outcome handling ----
|
|
1224
|
+
if (outcome == "subtype") {
|
|
1225
|
+
# Basal-like vs Luminal A molecular subtype classification
|
|
1226
|
+
cat(" Selecting outcome: Basal-like vs Luminal A subtype classification\n")
|
|
1227
|
+
|
|
1228
|
+
subtype_idx <- metadata$pam50_subtype %in% c("Basal", "LumA")
|
|
1229
|
+
if (sum(subtype_idx) < 50) {
|
|
1230
|
+
stop("Insufficient samples for Basal/LumA classification. Found: ",
|
|
1231
|
+
sum(subtype_idx))
|
|
1232
|
+
}
|
|
1233
|
+
metadata <- metadata[subtype_idx, ]
|
|
1234
|
+
expr <- expr[, subtype_idx, drop = FALSE]
|
|
1235
|
+
metadata$subtype_binary <- ifelse(metadata$pam50_subtype == "Basal", 1L, 0L)
|
|
1236
|
+
outcome_col <- "subtype_binary"
|
|
1237
|
+
|
|
1238
|
+
n_basal <- sum(metadata$subtype_binary == 1)
|
|
1239
|
+
n_luma <- sum(metadata$subtype_binary == 0)
|
|
1240
|
+
|
|
1241
|
+
report_context <- .breast_cancer_subtype_report_context(n_basal, n_luma)
|
|
1242
|
+
desc <- paste("GSE25055: Breast cancer molecular subtype classification.",
|
|
1243
|
+
n_basal, "Basal-like vs", n_luma, "Luminal A tumors.",
|
|
1244
|
+
"Affymetrix U133A (GPL96), log2 RMA-normalized.")
|
|
1245
|
+
|
|
1246
|
+
cat("\n\u2713 Breast cancer subtype data loaded successfully!\n")
|
|
1247
|
+
cat(" Expression:", nrow(expr), "genes x", ncol(expr), "samples\n")
|
|
1248
|
+
cat(" Outcome: ", n_basal, " Basal-like vs ",
|
|
1249
|
+
n_luma, " Luminal A\n", sep = "")
|
|
1250
|
+
|
|
1251
|
+
} else {
|
|
1252
|
+
# Original pCR outcome
|
|
1253
|
+
outcome_col <- "response"
|
|
1254
|
+
n_pcr <- sum(metadata$response == 1)
|
|
1255
|
+
n_rd <- sum(metadata$response == 0)
|
|
1256
|
+
|
|
1257
|
+
report_context <- .breast_cancer_pcr_report_context(n_pcr, n_rd)
|
|
1258
|
+
desc <- paste("GSE25055: Breast cancer neoadjuvant chemotherapy pCR prediction.",
|
|
1259
|
+
n_pcr + n_rd, "HER2-negative patients treated with T/FAC.",
|
|
1260
|
+
n_pcr, "pCR vs", n_rd, "residual disease.",
|
|
1261
|
+
"Affymetrix U133A (GPL96), log2 RMA-normalized.")
|
|
1262
|
+
|
|
1263
|
+
cat("\n\u2713 Breast cancer pCR data loaded successfully!\n")
|
|
1264
|
+
cat(" Expression:", nrow(expr), "genes x", ncol(expr), "samples\n")
|
|
1265
|
+
cat(" Outcome: ", n_pcr, " pCR (responders) vs ",
|
|
1266
|
+
n_rd, " RD (non-responders)\n", sep = "")
|
|
1267
|
+
if (!all(is.na(metadata$er_status))) {
|
|
1268
|
+
cat(" ER status: ", sum(metadata$er_status == "P", na.rm = TRUE),
|
|
1269
|
+
" positive, ",
|
|
1270
|
+
sum(metadata$er_status == "N", na.rm = TRUE), " negative\n",
|
|
1271
|
+
sep = "")
|
|
1272
|
+
}
|
|
1273
|
+
}
|
|
1274
|
+
|
|
1275
|
+
if (!all(is.na(metadata$pam50_subtype))) {
|
|
1276
|
+
tab <- table(metadata$pam50_subtype)
|
|
1277
|
+
cat(" PAM50 subtypes:", paste(names(tab), tab, sep = "=",
|
|
1278
|
+
collapse = ", "), "\n")
|
|
1279
|
+
}
|
|
1280
|
+
|
|
1281
|
+
result <- list(
|
|
1282
|
+
expression = expr,
|
|
1283
|
+
metadata = metadata,
|
|
1284
|
+
outcome_col = outcome_col,
|
|
1285
|
+
description = desc,
|
|
1286
|
+
report_context = report_context
|
|
1287
|
+
)
|
|
1288
|
+
|
|
1289
|
+
return(result)
|
|
1290
|
+
}
|
|
1291
|
+
|
|
1292
|
+
|
|
1293
|
+
#' Load Breast Cancer Validation Data for Cross-Dataset Validation
#'
#' Downloads an independent breast cancer neoadjuvant chemo dataset from GEO
#' for external validation of the pCR biomarker panel. Processed expression
#' and metadata are cached as `<geo_id>_expression.rds` and
#' `<geo_id>_metadata.rds` inside `data_dir`; when both files exist they are
#' loaded instead of re-downloading.
#'
#' The pathologic response column is located first by column name and then,
#' as a fallback, by scanning the `characteristics` columns. Labels are
#' trimmed and upper-cased before being binarized: PCR/YES/1/COMPLETE map to
#' 1 and RD/NCR/NO/0/RESIDUAL/INCOMPLETE map to 0. Samples with any other
#' label are dropped.
#'
#' @param geo_id GEO accession for validation cohort. Options:
#'   - "GSE32646" (115 samples, GPL570, paclitaxel + FEC)
#'   - "GSE20194" (207 samples, T/FAC)
#'   - "GSE20271" (74 samples, T/FAC)
#' @param data_dir Directory for downloads and cache (default: "data")
#' @return Named list with expression, metadata, outcome_col, description
load_breast_cancer_validation_data <- function(geo_id = "GSE32646",
                                               data_dir = "data") {
  cat("Loading", geo_id, "(Breast Cancer Validation Cohort)...\n")

  .ensure_package("GEOquery")
  .ensure_package("Biobase")

  library(GEOquery)
  library(Biobase)

  if (!dir.exists(data_dir)) dir.create(data_dir, recursive = TRUE)

  # --- Check for cached data ---
  cache_expr <- file.path(data_dir, paste0(geo_id, "_expression.rds"))
  cache_meta <- file.path(data_dir, paste0(geo_id, "_metadata.rds"))

  if (file.exists(cache_expr) && file.exists(cache_meta)) {
    cat(" Loading from cache...\n")
    expr <- readRDS(cache_expr)
    metadata <- readRDS(cache_meta)
  } else {
    cat(" Downloading from GEO...\n")
    gse <- getGEO(geo_id, GSEMatrix = TRUE, getGPL = TRUE,
                  destdir = data_dir)
    # getGEO() may return a list of ExpressionSets; use the first one.
    if (is.list(gse)) gse <- gse[[1]]

    # --- Expression ---
    expr_raw <- exprs(gse)
    cat(" Raw expression:", nrow(expr_raw), "probes x", ncol(expr_raw), "samples\n")

    # --- Probe to gene ---
    fdata <- fData(gse)
    sym_col <- grep("gene.symbol|^symbol$|gene_symbol",
                    colnames(fdata), ignore.case = TRUE, value = TRUE)
    if (length(sym_col) == 0) {
      stop("Cannot find gene symbol column. Re-download with getGPL=TRUE.")
    }
    gene_symbols <- as.character(fdata[[sym_col[1]]])
    # NOTE(review): .collapse_probes_to_genes is defined elsewhere in this
    # file; it is assumed to keep the sample columns in their original order.
    expr <- .collapse_probes_to_genes(expr_raw, gene_symbols)

    # --- Parse pCR from metadata ---
    pheno <- pData(gse)

    # Try standard column names first
    pcr_col <- grep("pathologic_response|pcr|response.*path",
                    colnames(pheno), ignore.case = TRUE, value = TRUE)

    if (length(pcr_col) == 0) {
      # Search in characteristics columns
      char_cols <- grep("characteristics", colnames(pheno),
                        ignore.case = TRUE, value = TRUE)
      for (cc in char_cols) {
        vals <- as.character(pheno[[cc]])
        if (any(grepl("pathologic|pcr|response|pCR|RD",
                      vals, ignore.case = TRUE))) {
          pcr_col <- cc
          # Strip the "key: " prefix so only the label itself remains.
          pheno[[cc]] <- sub(".*: ", "", vals)
          break
        }
      }
    }

    if (length(pcr_col) == 0) {
      stop("Cannot find pathologic response column in ", geo_id,
           " metadata. Check pData(gse).")
    }
    pcr_col <- pcr_col[1]

    pcr_values <- toupper(trimws(as.character(pheno[[pcr_col]])))
    cat(" Response values:", paste(sort(unique(pcr_values)), collapse = ", "), "\n")

    # Binarize: handle various formats (pCR/RD, PCR/NCR, etc.)
    response <- ifelse(pcr_values %in% c("PCR", "YES", "1", "COMPLETE"), 1L,
                ifelse(pcr_values %in% c("RD", "NCR", "NO", "0", "RESIDUAL", "INCOMPLETE"), 0L,
                       NA_integer_))

    metadata <- data.frame(
      sample_id = colnames(expr),
      pcr = as.character(pheno[[pcr_col]]),
      response = response,
      row.names = colnames(expr),
      stringsAsFactors = FALSE
    )

    # Remove missing
    valid <- !is.na(metadata$response)

    # Fail loudly when no label was recognised. Otherwise an empty dataset
    # would be written to the cache and silently reloaded on every later call.
    if (!any(valid)) {
      stop("No samples in ", geo_id, " have a recognised pCR/RD label in '",
           pcr_col, "'. Observed values: ",
           paste(sort(unique(pcr_values)), collapse = ", "),
           call. = FALSE)
    }

    if (sum(!valid) > 0) {
      cat(" Removing", sum(!valid), "samples with missing response\n")
      metadata <- metadata[valid, ]
      expr <- expr[, valid, drop = FALSE]
    }

    saveRDS(expr, cache_expr)
    saveRDS(metadata, cache_meta)
    cat(" Cached processed data\n")
  }

  n_pcr <- sum(metadata$response == 1)
  n_rd <- sum(metadata$response == 0)

  result <- list(
    expression = expr,
    metadata = metadata,
    outcome_col = "response",
    # paste0() keeps the colon attached to the accession ("GSE32646: ...").
    description = paste(paste0(geo_id, ":"), "Breast cancer validation cohort.",
                        n_pcr, "pCR vs", n_rd, "RD.")
  )

  cat("\u2713", geo_id, "loaded:", n_pcr, "pCR vs", n_rd, "RD\n")
  return(result)
}
|
|
1415
|
+
|
|
1416
|
+
|
|
1417
|
+
# ==============================================================================
|
|
1418
|
+
# Sepsis Blood Transcriptomics (GSE65682 — MARS Consortium)
|
|
1419
|
+
# ==============================================================================
|
|
1420
|
+
|
|
1421
|
+
#' Load Sepsis Blood Transcriptomics Data (GSE65682)
|
|
1422
|
+
#'
|
|
1423
|
+
#' Downloads blood gene expression from the MARS (Molecular Diagnosis and Risk
|
|
1424
|
+
#' Stratification of Sepsis) consortium. Affymetrix Human Genome U219 Array
|
|
1425
|
+
#' (GPL13667), ICU sepsis patients.
|
|
1426
|
+
#'
|
|
1427
|
+
#' @param data_dir Directory for downloads and cache (default: "data")
|
|
1428
|
+
#' @param outcome "endotype" (default, Mars1 immunosuppressed endotype classification)
|
|
1429
|
+
#' or "mortality" (28-day all-cause mortality prediction)
|
|
1430
|
+
#' @return Named list with expression, metadata, outcome_col, description,
|
|
1431
|
+
#' report_context
|
|
1432
|
+
|
|
1433
|
+
# ---- Sepsis report context helpers ----
|
|
1434
|
+
|
|
1435
|
+
# Build the narrative report context for the Mars1-vs-other endotype task.
#
# Arguments:
#   n_mars1  Number of Mars1 samples (outcome = 1).
#   n_other  Number of non-Mars1 samples (outcome = 0).
#   n_genes  Number of gene features after probe collapsing.
#
# Returns a named list with elements disease_background, trial_description,
# patient_population, endpoint_definition, platform_description,
# analytical_goals (character vector), published_benchmarks (list with intro,
# studies data.frame, context) and references (named list of citations).
# Only the sample counts, the Mars1 percentage and n_genes are interpolated;
# all other text is fixed.
.sepsis_endotype_report_context <- function(n_mars1, n_other, n_genes) {
  n_total <- n_mars1 + n_other
  # Share of Mars1 samples, rounded to one decimal place.
  pct_mars1 <- round(100 * n_mars1 / n_total, 1)

  list(
    disease_background = paste(
      "Sepsis is a life-threatening organ dysfunction caused by a dysregulated",
      "host response to infection, affecting over 48 million people annually",
      "worldwide with an estimated 11 million deaths (20% of all global deaths) [1].",
      "Despite decades of clinical trials, no immunomodulatory therapy has shown",
      "consistent benefit across all sepsis patients. A key reason is biological",
      "heterogeneity: sepsis encompasses a spectrum of immune states from",
      "hyperinflammation to profound immunosuppression, and treating all patients",
      "identically fails to account for this diversity [2]. Blood transcriptomic",
      "profiling has revealed distinct molecular endotypes that stratify patients",
      "into groups with markedly different immune profiles and outcomes, opening the",
      "door to precision immunotherapy in critical care [3]."
    ),
    trial_description = paste(
      "The MARS (Molecular Diagnosis and Risk Stratification of Sepsis) project is",
      "a prospective observational cohort study conducted at two tertiary ICUs in",
      "the Netherlands (Academic Medical Center Amsterdam and University Medical",
      "Center Utrecht) between 2011-2013 [3]. Blood samples for transcriptomic",
      "profiling were collected within 24 hours of ICU admission. Unsupervised",
      "consensus clustering of genome-wide blood transcriptomes identified four",
      "molecular endotypes (Mars1-4) with distinct immune signatures and clinical",
      "trajectories. Mars1, characterized by immunosuppression and impaired",
      "gluconeogenesis, carries the highest mortality risk (hazard ratio 3.9",
      "versus Mars3, the reference group) [3][4]."
    ),
    patient_population = paste(
      "The analysis cohort comprises", n_total,
      "sepsis patients from the MARS consortium with assigned blood genomic",
      "endotypes (discovery + validation cohorts). The binary classification task",
      "is Mars1 (immunosuppressed, n =", paste0(n_mars1, ")"), "versus all other",
      "endotypes (Mars2+Mars3+Mars4, n =", paste0(n_other, ")."),
      "Gene expression was measured from whole blood RNA on the Affymetrix Human",
      "Genome U219 Array (GPL13667). Probe-level data was collapsed to", n_genes,
      "unique gene symbols. Healthy controls (n=42) were excluded."
    ),
    endpoint_definition = paste(
      "The binary outcome is Mars1 endotype classification: Mars1 (1, n =",
      # paste0() keeps the percent sign attached to the number ("25%)").
      paste0(n_mars1, ","), paste0(pct_mars1, "%)"),
      "versus non-Mars1 (0, n =", paste0(n_other, ")."),
      "Mars1 represents the most severely immunosuppressed endotype with",
      "downregulated adaptive immune pathways, impaired interferon signaling,",
      "and the highest 28-day mortality rate (~40% vs ~15-25% for other endotypes).",
      "Identifying Mars1 patients at ICU admission could guide enrollment in",
      "immunostimulatory therapy trials (e.g., IFN-gamma, IL-7, anti-PD-1) [4][5]."
    ),
    platform_description = paste(
      "Whole-blood RNA profiling on the Affymetrix Human Genome U219 Array",
      "(GPL13667, 49,386 probe sets). Data was RMA-normalized and log2-transformed.",
      "Probe-to-gene mapping used official Affymetrix annotation; multi-gene probes",
      "were assigned to the first listed gene symbol, and for genes with multiple",
      "probes, the highest inter-sample variance probe was retained, yielding",
      n_genes, "unique gene features."
    ),
    analytical_goals = c(
      paste("Derive a minimal gene panel from admission-day blood transcriptomics",
            "that identifies Mars1 (immunosuppressed) sepsis patients, using",
            "stability-selected elastic net logistic regression — reducing genome-wide",
            "profiling to a practical qPCR-compatible signature."),
      paste("Evaluate panel performance through rigorous repeated nested",
            "cross-validation (100 iterations of 70/30 train/test splits) to ensure",
            "robust classification and unbiased AUC estimation."),
      paste("Assess whether the Mars1-classifying genes are independently",
            "prognostic for 28-day mortality, validating that the panel captures",
            "biologically meaningful immunosuppression rather than technical artifacts."),
      paste("Characterize selected biomarkers through immune pathway enrichment,",
            "blood cell-type expression mapping via CZI CELLxGENE Census, and",
            "cross-reference with sepsis GWAS susceptibility loci and published",
            "immune gene databases to interpret the panel's biological basis.")
    ),
    published_benchmarks = list(
      intro = paste(
        "Scicluna et al. (2017) defined four blood genomic endotypes in sepsis",
        "using unsupervised consensus clustering of genome-wide expression data [3].",
        "Mars1 patients show downregulated adaptive immunity, impaired antigen",
        "presentation, and metabolic derangement, with 28-day mortality of ~40%.",
        "Subsequent studies have used machine learning to classify these endotypes",
        "from targeted gene panels, achieving high accuracy with as few as 5-12",
        "genes per endotype [3][6][7]."
      ),
      studies = data.frame(
        study = c(
          "Scicluna et al. 2017 [3]",
          "Antcliffe et al. 2019 [6]",
          "Sweeney et al. 2018 [7]",
          "Burnham et al. 2017 [8]"
        ),
        drug = c("N/A (endotype)", "N/A (endotype)",
                 "N/A (diagnostic)", "N/A (endotype)"),
        validated_auc = c("4 endotypes (Mars1-4, HR 3.9)",
                          "SRS1/SRS2 (similar to Mars1, AUC ~0.95)",
                          "0.87 (11-gene SeptiCyte)",
                          "SRS1 enrichment in drotrecogin trial"),
        method = c("Unsupervised clustering → 4 endotypes",
                   "7-gene SRS classifier (SeptiScore)",
                   "LASSO logistic regression (11 genes)",
                   "Retrospective endotyping of PROWESS trial"),
        notes = c("Discovery: 263 patients; validation: 216 patients",
                  "Validated SRS endotypes predict differential therapy response",
                  "SeptiCyte Lab: FDA-cleared host-response sepsis test",
                  "SRS1 patients showed differential response to drotrecogin alfa"),
        stringsAsFactors = FALSE
      ),
      context = paste(
        "Blood transcriptomic endotyping in sepsis is an active frontier in",
        "precision critical care medicine. Mars1/SRS1 endotypes identify patients",
        "with immunoparalysis who may benefit from immunostimulatory therapy.",
        "Clinical trials of IFN-gamma (NCT01649921), IL-7 (NCT02640807), and",
        "anti-PD-1 (NCT02576457) in sepsis immunosuppression are ongoing.",
        "A rapid bedside gene panel for Mars1 identification would enable",
        "real-time patient stratification in these precision immunotherapy trials."
      )
    ),
    references = list(
      rudd2020 = paste("[1] Rudd KE, Johnson SC, Agesa KM, et al. Global, regional,",
                       "and national sepsis incidence and mortality, 1990-2017. Lancet.",
                       "2020;395(10219):200-211."),
      hotchkiss2013 = paste("[2] Hotchkiss RS, Monneret G, Payen D. Sepsis-induced",
                            "immunosuppression: from cellular dysfunctions to immunotherapy. Nat Rev",
                            "Immunol. 2013;13(12):862-874."),
      scicluna2017 = paste("[3] Scicluna BP, van Vught LA, Zwinderman AH, et al.",
                           "Classification of patients with sepsis according to blood genomic endotype:",
                           "a prospective cohort study. Lancet Respir Med. 2017;5(10):816-826."),
      scicluna2017b = paste("[4] Scicluna BP, et al. The leukocyte non-coding RNA",
                            "landscape in critically ill patients with sepsis. Elife. 2020."),
      venet2018 = paste("[5] Venet F, Monneret G. Advances in the understanding and",
                        "treatment of sepsis-induced immunosuppression. Nat Rev Nephrol.",
                        "2018;14(2):121-137."),
      antcliffe2019 = paste("[6] Antcliffe DB, Burnham KL, Al-Beidh F, et al.",
                            "Transcriptomic signatures in sepsis and a differential response to steroids.",
                            "Am J Respir Crit Care Med. 2019;199(8):980-986."),
      sweeney2018 = paste("[7] Sweeney TE, Perumal TM, Henao R, et al. A community",
                          "approach to mortality prediction in sepsis via gene expression analysis.",
                          "Nat Commun. 2018;9(1):694."),
      burnham2017 = paste("[8] Burnham KL, Davenport EE, Radhakrishnan J, et al.",
                          "Shared and distinct aspects of the sepsis transcriptomic response to",
                          "fecal peritonitis and pneumonia. Am J Respir Crit Care Med.",
                          "2017;196(3):328-339.")
    )
  )
}
|
|
1577
|
+
|
|
1578
|
+
# Build the narrative report context for the 28-day mortality task.
#
# Arguments:
#   n_survivor     Number of survivors (outcome = 0).
#   n_nonsurvivor  Number of non-survivors (outcome = 1).
#   n_genes        Number of gene features after probe collapsing.
#
# Returns a named list with elements disease_background, trial_description,
# patient_population, endpoint_definition, platform_description,
# analytical_goals (character vector), published_benchmarks (list with intro,
# studies data.frame, context) and references (named list of citations).
# The mortality rate quoted in the text is derived from the two counts, so
# patient_population and endpoint_definition always report the same figure.
.sepsis_mortality_report_context <- function(n_survivor, n_nonsurvivor, n_genes) {
  n_total <- n_survivor + n_nonsurvivor
  # Observed mortality rate in percent, rounded to one decimal place.
  mortality_pct <- round(100 * n_nonsurvivor / n_total, 1)

  list(
    disease_background = paste(
      "Sepsis is a life-threatening organ dysfunction caused by a dysregulated",
      "host response to infection, affecting over 48 million people annually",
      "worldwide with an estimated 11 million deaths (20% of all global deaths) [1].",
      "In the ICU, sepsis mortality ranges from 20-40% despite advances in",
      "antimicrobial therapy and supportive care [2]. Early identification of",
      "patients at highest risk of death is critical for triage, escalation of",
      "care, and enrollment in clinical trials of novel immunomodulatory therapies.",
      "Current severity scores (APACHE IV, SOFA) rely on clinical and laboratory",
      "parameters that incompletely capture the underlying immune dysregulation [3].",
      "Transcriptomic profiling of peripheral blood offers a molecular window into",
      "the host immune response, with the potential to identify prognostic signatures",
      "that outperform clinical scores alone."
    ),
    trial_description = paste(
      "The MARS (Molecular Diagnosis and Risk Stratification of Sepsis) project is",
      "a prospective observational cohort study conducted at two tertiary ICUs in",
      "the Netherlands (Academic Medical Center Amsterdam and University Medical",
      "Center Utrecht) between 2011 and 2013. All patients admitted to the ICU with",
      "a suspected or confirmed infection were enrolled. Blood samples for",
      "transcriptomic profiling were collected within 24 hours of ICU admission.",
      "The primary endpoint was 28-day all-cause mortality [3][4]."
    ),
    patient_population = paste(
      "The analysis cohort comprises", n_total,
      "sepsis patients from the MARS consortium with complete 28-day outcome data.",
      "The cohort includes", n_survivor, "survivors and", n_nonsurvivor,
      # paste0() avoids the stray spaces of "( 23.8 % mortality rate)".
      paste0("non-survivors (", mortality_pct, "%"),
      "mortality rate). Gene expression was measured from whole blood RNA on the",
      "Affymetrix Human Genome U219 Array (GPL13667). Probe-level data was",
      "collapsed to", n_genes, "unique gene symbols by retaining the highest-variance",
      "probe per gene. Forty-two healthy control samples were excluded to focus the",
      "analysis on prognostic discrimination within sepsis patients."
    ),
    endpoint_definition = paste(
      "The binary outcome is 28-day all-cause mortality: survivor (0, n =",
      paste0(n_survivor, ")"), "versus non-survivor (1, n =",
      paste0(n_nonsurvivor, ")."),
      "This is the standard primary endpoint for sepsis clinical trials and",
      # Use the computed rate rather than a fixed "23.8%" literal.
      "prognostic biomarker studies [1][5]. The", paste0(mortality_pct, "%"),
      "mortality rate provides",
      "adequate event frequency for penalized regression with stability selection,",
      "though class-balanced resampling is applied to prevent bias toward the",
      "majority class."
    ),
    platform_description = paste(
      "Whole-blood RNA profiling on the Affymetrix Human Genome U219 Array",
      "(GPL13667, 49,386 probe sets). Data was RMA-normalized and log2-transformed",
      "(expression range 0.7-13.5, median ~3.9). Probe-to-gene mapping used the",
      "official Affymetrix annotation (NetAffx build 36); multi-gene probes were",
      "assigned to the first listed gene symbol, and for genes mapped by multiple",
      "probes, the probe with highest inter-sample variance was retained, yielding",
      n_genes, "unique gene features."
    ),
    analytical_goals = c(
      paste("Identify a minimal gene panel (<15 features) from admission-day",
            "blood transcriptomics that predicts 28-day mortality in ICU sepsis",
            "patients, using penalized logistic regression with elastic net",
            "regularization and stability selection."),
      paste("Determine whether blood transcriptomic signatures capture",
            "prognostic information beyond conventional severity scores (APACHE IV,",
            "SOFA), potentially identifying patients with dysregulated immune states",
            "associated with higher mortality risk."),
      paste("Evaluate signature stability through repeated nested cross-validation",
            "(100 iterations of 70/30 train/test splits) to ensure robust feature",
            "selection and unbiased AUC estimation in this moderately-sized cohort."),
      paste("Characterize selected biomarkers through immune pathway enrichment,",
            "blood cell-type expression mapping via CZI CELLxGENE Census, and",
            "cross-reference with published sepsis gene signatures and GWAS",
            "susceptibility loci to interpret the biological basis of the panel.")
    ),
    published_benchmarks = list(
      intro = paste(
        "Blood transcriptomic biomarkers for sepsis prognosis have been",
        "extensively studied, with multi-gene LASSO panels consistently",
        "achieving AUC 0.80-0.95 for 28-day mortality prediction in the MARS",
        "cohort and external validation cohorts [4][6][7]. Single genes such as",
        "IL1R2, TDRD9, and S100A12 show individual AUCs of 0.70-0.80, but",
        "multi-gene panels significantly outperform individual markers."
      ),
      studies = data.frame(
        study = c(
          "Scicluna et al. 2017 [4]",
          "Zhang et al. 2023 [6]",
          "Liu et al. 2026 [7]",
          "Sweeney et al. 2018 [8]"
        ),
        drug = c("N/A (prognostic)", "N/A (prognostic)",
                 "N/A (prognostic)", "N/A (diagnostic)"),
        validated_auc = c("Endotype-based (Mars1 HR=3.9)",
                          "0.85-0.90 (5-gene panel)",
                          "0.92 (12-gene LASSO+RF)",
                          "0.87 (11-gene SeptiCyte)"),
        method = c("K-means clustering → 4 blood endotypes",
                   "LASSO + SVM feature selection",
                   "LASSO + Random Forest (12 genes)",
                   "LASSO logistic regression (11 genes)"),
        notes = c("Mars1 endotype = immunosuppressed, highest mortality",
                  "5-gene prognostic panel validated in external cohorts",
                  "12-gene model integrating LASSO + ML on GSE65682",
                  "SeptiCyte Lab: FDA-cleared host-response sepsis test"),
        stringsAsFactors = FALSE
      ),
      context = paste(
        "Published LASSO-based approaches on this dataset and related sepsis",
        "cohorts consistently identify panels of 5-12 genes achieving AUC",
        "0.85-0.95 for 28-day mortality. Key genes recurring across studies",
        "include immune regulators (IL1R2, S100A12, TDRD9), metabolic enzymes",
        "(CYP4F3, OLAH), and T-cell markers (CCR7, BCL11B), reflecting the",
        "complex interplay between pro-inflammatory and immunosuppressive",
        "pathways that determines sepsis outcomes."
      )
    ),
    references = list(
      rudd2020 = paste("[1] Rudd KE, Johnson SC, Agesa KM, et al. Global, regional,",
                       "and national sepsis incidence and mortality, 1990-2017. Lancet.",
                       "2020;395(10219):200-211."),
      vincent2013 = paste("[2] Vincent JL, Marshall JC, Namendys-Silva SA, et al.",
                          "Assessment of the worldwide burden of critical illness: the Intensive",
                          "Care Over Nations (ICON) audit. Lancet Respir Med. 2014;2(5):380-386."),
      van_vught2016 = paste("[3] van Vught LA, Klein Klouwenberg PM, Spitoni C, et al.",
                            "Incidence, risk factors, and attributable mortality of secondary infections",
                            "in the intensive care unit after admission for sepsis. JAMA.",
                            "2016;315(14):1469-1479."),
      scicluna2017 = paste("[4] Scicluna BP, van Vught LA, Zwinderman AH, et al.",
                           "Classification of patients with sepsis according to blood genomic endotype:",
                           "a prospective cohort study. Lancet Respir Med. 2017;5(10):816-826."),
      singer2016 = paste("[5] Singer M, Deutschman CS, Seymour CW, et al. The Third",
                         "International Consensus Definitions for Sepsis and Septic Shock (Sepsis-3).",
                         "JAMA. 2016;315(8):801-810."),
      zhang2023 = paste("[6] Zhang W, Liu T, et al. Identification of sepsis prognosis",
                        "biomarkers via LASSO and machine learning using blood transcriptomics.",
                        "J Transl Med. 2023."),
      liu2026 = paste("[7] Liu Y, et al. A 12-gene signature for sepsis prognosis",
                      "combining LASSO and random forest on blood transcriptomics.",
                      "Eur J Med Res. 2026;31:55."),
      sweeney2018 = paste("[8] Sweeney TE, Perumal TM, Henao R, et al. A community",
                          "approach to mortality prediction in sepsis via gene expression analysis.",
                          "Nat Commun. 2018;9(1):694.")
    )
  )
}
|
|
1721
|
+
|
|
1722
|
+
load_sepsis_data <- function(data_dir = "data", outcome = "endotype") {
|
|
1723
|
+
|
|
1724
|
+
outcome <- match.arg(outcome, c("endotype", "mortality"))
|
|
1725
|
+
|
|
1726
|
+
if (!dir.exists(data_dir)) dir.create(data_dir, recursive = TRUE)
|
|
1727
|
+
|
|
1728
|
+
# Check for outcome-specific cache first
|
|
1729
|
+
cache_file_outcome <- file.path(data_dir, paste0("sepsis_", outcome, "_data.rds"))
|
|
1730
|
+
if (file.exists(cache_file_outcome)) {
|
|
1731
|
+
cat("Loading cached sepsis", outcome, "data...\n")
|
|
1732
|
+
cached <- readRDS(cache_file_outcome)
|
|
1733
|
+
if (outcome == "endotype") {
|
|
1734
|
+
n1 <- sum(cached$metadata$mars1 == 1)
|
|
1735
|
+
n0 <- sum(cached$metadata$mars1 == 0)
|
|
1736
|
+
cached$report_context <- .sepsis_endotype_report_context(n1, n0, nrow(cached$expression))
|
|
1737
|
+
cat("\u2713 Sepsis endotype data loaded:", n1, "Mars1 vs", n0, "other endotypes\n")
|
|
1738
|
+
} else {
|
|
1739
|
+
n_surv <- sum(cached$metadata$mortality == 0)
|
|
1740
|
+
n_dead <- sum(cached$metadata$mortality == 1)
|
|
1741
|
+
cached$report_context <- .sepsis_mortality_report_context(n_surv, n_dead, nrow(cached$expression))
|
|
1742
|
+
cat("\u2713 Sepsis mortality data loaded:", n_surv, "survivors vs", n_dead, "non-survivors\n")
|
|
1743
|
+
}
|
|
1744
|
+
return(cached)
|
|
1745
|
+
}
|
|
1746
|
+
|
|
1747
|
+
# Check for base processed cache (before outcome filtering)
|
|
1748
|
+
cache_file <- file.path(data_dir, "sepsis_base_data.rds")
|
|
1749
|
+
if (file.exists(cache_file)) {
|
|
1750
|
+
cat("Loading cached base sepsis data...\n")
|
|
1751
|
+
base <- readRDS(cache_file)
|
|
1752
|
+
expr <- base$expression
|
|
1753
|
+
metadata <- base$metadata
|
|
1754
|
+
} else {
|
|
1755
|
+
|
|
1756
|
+
# ---- Download from GEO ----
|
|
1757
|
+
.ensure_package("GEOquery")
|
|
1758
|
+
library(GEOquery)
|
|
1759
|
+
|
|
1760
|
+
cat("=== Downloading GSE65682 (MARS Sepsis Consortium) ===\n")
|
|
1761
|
+
cat(" 802 samples, Affymetrix HG-U219, blood transcriptomics\n")
|
|
1762
|
+
cat(" This may take 2-5 minutes...\n")
|
|
1763
|
+
|
|
1764
|
+
gse <- getGEO("GSE65682", GSEMatrix = TRUE, destdir = data_dir)
|
|
1765
|
+
eset <- gse[[1]]
|
|
1766
|
+
|
|
1767
|
+
expr_raw <- exprs(eset)
|
|
1768
|
+
clinical <- pData(eset)
|
|
1769
|
+
|
|
1770
|
+
cat(" Downloaded:", nrow(expr_raw), "probes x", ncol(expr_raw), "samples\n")
|
|
1771
|
+
|
|
1772
|
+
# ---- Extract clinical annotations ----
|
|
1773
|
+
metadata <- data.frame(
|
|
1774
|
+
sample_id = rownames(clinical),
|
|
1775
|
+
gender = clinical[["gender:ch1"]],
|
|
1776
|
+
age = as.numeric(clinical[["age:ch1"]]),
|
|
1777
|
+
mortality_28d = clinical[["mortality_event_28days:ch1"]],
|
|
1778
|
+
time_to_event = clinical[["time_to_event_28days:ch1"]],
|
|
1779
|
+
endotype = clinical[["endotype_class:ch1"]],
|
|
1780
|
+
icu_infection = clinical[["icu_acquired_infection:ch1"]],
|
|
1781
|
+
pneumonia = clinical[["pneumonia diagnoses:ch1"]],
|
|
1782
|
+
row.names = rownames(clinical),
|
|
1783
|
+
stringsAsFactors = FALSE
|
|
1784
|
+
)
|
|
1785
|
+
|
|
1786
|
+
# ---- Filter to sepsis patients (exclude healthy controls) ----
|
|
1787
|
+
not_healthy <- is.na(metadata$icu_infection) | metadata$icu_infection != "healthy"
|
|
1788
|
+
# Keep patients with either mortality data or endotype data
|
|
1789
|
+
has_mortality <- !is.na(metadata$mortality_28d) & metadata$mortality_28d != "NA"
|
|
1790
|
+
has_endotype <- !is.na(metadata$endotype) & metadata$endotype != "" & metadata$endotype != "NA"
|
|
1791
|
+
keep <- not_healthy & (has_mortality | has_endotype)
|
|
1792
|
+
|
|
1793
|
+
metadata <- metadata[keep, ]
|
|
1794
|
+
expr_raw <- expr_raw[, keep]
|
|
1795
|
+
|
|
1796
|
+
# Create binary outcomes
|
|
1797
|
+
metadata$mortality <- ifelse(has_mortality[keep], as.integer(metadata$mortality_28d), NA)
|
|
1798
|
+
metadata$mars1 <- ifelse(has_endotype[keep], as.integer(metadata$endotype == "Mars1"), NA)
|
|
1799
|
+
|
|
1800
|
+
cat(" Filtered to", ncol(expr_raw), "sepsis patients\n")
|
|
1801
|
+
|
|
1802
|
+
# ---- Probe-to-gene mapping ----
|
|
1803
|
+
cat(" Mapping probes to gene symbols (GPL13667)...\n")
|
|
1804
|
+
|
|
1805
|
+
gpl <- getGEO("GPL13667", destdir = data_dir)
|
|
1806
|
+
annot <- Table(gpl)
|
|
1807
|
+
|
|
1808
|
+
probe_ids <- rownames(expr_raw)
|
|
1809
|
+
gene_symbols <- annot[match(probe_ids, annot$ID), "Gene Symbol"]
|
|
1810
|
+
|
|
1811
|
+
gene_symbols <- sapply(gene_symbols, function(s) {
|
|
1812
|
+
if (is.na(s) || s == "" || s == "---") return(NA)
|
|
1813
|
+
trimws(strsplit(s, "///")[[1]][1])
|
|
1814
|
+
})
|
|
1815
|
+
|
|
1816
|
+
has_gene <- !is.na(gene_symbols)
|
|
1817
|
+
expr_raw <- expr_raw[has_gene, ]
|
|
1818
|
+
gene_symbols <- gene_symbols[has_gene]
|
|
1819
|
+
|
|
1820
|
+
cat(" ", sum(has_gene), "probes mapped to", length(unique(gene_symbols)), "unique genes\n")
|
|
1821
|
+
|
|
1822
|
+
# ---- Collapse multi-probe genes (keep highest variance probe) ----
|
|
1823
|
+
probe_vars <- apply(expr_raw, 1, var)
|
|
1824
|
+
unique_genes <- unique(gene_symbols)
|
|
1825
|
+
best_probes <- character(length(unique_genes))
|
|
1826
|
+
|
|
1827
|
+
for (i in seq_along(unique_genes)) {
|
|
1828
|
+
gene <- unique_genes[i]
|
|
1829
|
+
probe_mask <- which(gene_symbols == gene)
|
|
1830
|
+
if (length(probe_mask) == 1) {
|
|
1831
|
+
best_probes[i] <- rownames(expr_raw)[probe_mask]
|
|
1832
|
+
} else {
|
|
1833
|
+
best_idx <- probe_mask[which.max(probe_vars[probe_mask])]
|
|
1834
|
+
best_probes[i] <- rownames(expr_raw)[best_idx]
|
|
1835
|
+
}
|
|
1836
|
+
}
|
|
1837
|
+
|
|
1838
|
+
expr <- expr_raw[best_probes, ]
|
|
1839
|
+
rownames(expr) <- unique_genes
|
|
1840
|
+
|
|
1841
|
+
cat(" Collapsed to", nrow(expr), "genes x", ncol(expr), "samples\n")
|
|
1842
|
+
|
|
1843
|
+
# Cache base data
|
|
1844
|
+
saveRDS(list(expression = expr, metadata = metadata), cache_file)
|
|
1845
|
+
cat(" Cached base data to", cache_file, "\n")
|
|
1846
|
+
|
|
1847
|
+
} # end of base data loading/caching
|
|
1848
|
+
|
|
1849
|
+
# ---- Apply outcome-specific filtering ----
|
|
1850
|
+
if (outcome == "endotype") {
|
|
1851
|
+
# Mars1 (immunosuppressed) vs rest (Mars2+3+4)
|
|
1852
|
+
valid <- !is.na(metadata$mars1)
|
|
1853
|
+
metadata <- metadata[valid, ]
|
|
1854
|
+
expr <- expr[, valid]
|
|
1855
|
+
|
|
1856
|
+
n_mars1 <- sum(metadata$mars1 == 1)
|
|
1857
|
+
n_other <- sum(metadata$mars1 == 0)
|
|
1858
|
+
report_context <- .sepsis_endotype_report_context(n_mars1, n_other, nrow(expr))
|
|
1859
|
+
|
|
1860
|
+
result <- list(
|
|
1861
|
+
expression = expr,
|
|
1862
|
+
metadata = metadata,
|
|
1863
|
+
outcome_col = "mars1",
|
|
1864
|
+
description = paste("MARS Consortium GSE65682:", nrow(metadata),
|
|
1865
|
+
"ICU sepsis patients. Blood transcriptomics (Affymetrix HG-U219).",
|
|
1866
|
+
n_mars1, "Mars1 (immunosuppressed) vs", n_other,
|
|
1867
|
+
"other endotypes (Mars2+3+4)."),
|
|
1868
|
+
report_context = report_context
|
|
1869
|
+
)
|
|
1870
|
+
|
|
1871
|
+
saveRDS(result, cache_file_outcome)
|
|
1872
|
+
cat("\n\u2713 Sepsis endotype data loaded successfully!\n")
|
|
1873
|
+
cat(" Expression:", nrow(expr), "genes x", ncol(expr), "samples\n")
|
|
1874
|
+
cat(" Outcome:", n_mars1, "Mars1 vs", n_other, "non-Mars1\n")
|
|
1875
|
+
|
|
1876
|
+
} else {
|
|
1877
|
+
# 28-day mortality: survivor vs non-survivor
|
|
1878
|
+
valid <- !is.na(metadata$mortality)
|
|
1879
|
+
metadata <- metadata[valid, ]
|
|
1880
|
+
expr <- expr[, valid]
|
|
1881
|
+
|
|
1882
|
+
n_surv <- sum(metadata$mortality == 0)
|
|
1883
|
+
n_dead <- sum(metadata$mortality == 1)
|
|
1884
|
+
report_context <- .sepsis_mortality_report_context(n_surv, n_dead, nrow(expr))
|
|
1885
|
+
|
|
1886
|
+
result <- list(
|
|
1887
|
+
expression = expr,
|
|
1888
|
+
metadata = metadata,
|
|
1889
|
+
outcome_col = "mortality",
|
|
1890
|
+
description = paste("MARS Consortium GSE65682:", nrow(metadata),
|
|
1891
|
+
"ICU sepsis patients. Blood transcriptomics (Affymetrix HG-U219).",
|
|
1892
|
+
n_surv, "survivors vs", n_dead, "non-survivors (28-day mortality)."),
|
|
1893
|
+
report_context = report_context
|
|
1894
|
+
)
|
|
1895
|
+
|
|
1896
|
+
saveRDS(result, cache_file_outcome)
|
|
1897
|
+
cat("\n\u2713 Sepsis mortality data loaded successfully!\n")
|
|
1898
|
+
cat(" Expression:", nrow(expr), "genes x", ncol(expr), "samples\n")
|
|
1899
|
+
cat(" Outcome:", n_surv, "survivors vs", n_dead, "non-survivors\n")
|
|
1900
|
+
}
|
|
1901
|
+
|
|
1902
|
+
return(result)
|
|
1903
|
+
}
|