PyPI - oncoref - Versions diffs - 1.6.0__py3-none-any.whl - Mend

oncoref 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (94)

oncoref/__init__.py +390 -0
oncoref/apd1.py +59 -0
oncoref/cancer_genes.py +149 -0
oncoref/cancer_types.py +827 -0
oncoref/catalog.py +261 -0
oncoref/cli.py +633 -0
oncoref/coverage.py +282 -0
oncoref/cta.py +297 -0
oncoref/cta_regen.py +560 -0
oncoref/cta_tissues.py +152 -0
oncoref/data/cancer-apd1-response.csv +82 -0
oncoref/data/cancer-code-burden-map.csv +5 -0
oncoref/data/cancer-cohort-aggregates.csv +18 -0
oncoref/data/cancer-driver-genes.csv +740 -0
oncoref/data/cancer-driver-variants.csv +580 -0
oncoref/data/cancer-expression-source-candidates.csv +59 -0
oncoref/data/cancer-frameshift-burden.csv +37 -0
oncoref/data/cancer-fusions.csv +172 -0
oncoref/data/cancer-ici-response.csv +105 -0
oncoref/data/cancer-incidence-mortality.csv +38 -0
oncoref/data/cancer-key-genes.csv +852 -0
oncoref/data/cancer-lineage-group-overrides.csv +2 -0
oncoref/data/cancer-lineage-groups.csv +28 -0
oncoref/data/cancer-reference-expression-samples.csv.gz +0 -0
oncoref/data/cancer-response-signatures.csv +25 -0
oncoref/data/cancer-subtype-groupings.csv +13 -0
oncoref/data/cancer-testis-antigens.csv +398 -0
oncoref/data/cancer-tmb.csv +123 -0
oncoref/data/cancer-type-genes.csv +578 -0
oncoref/data/cancer-type-registry.csv +165 -0
oncoref/data/cancer-viral-antigens.csv +7 -0
oncoref/data/cdna-identical-gene-groups.csv +327 -0
oncoref/data/censored-gene-reference-tpm.csv +1729 -0
oncoref/data/clean-tpm-censored-genes.csv +2787 -0
oncoref/data/cohort-registry.csv +42 -0
oncoref/data/cta-candidate-references.csv +13 -0
oncoref/data/cta-ihc-unreliable.csv +7 -0
oncoref/data/degenerate-subtype-pairs.csv +8 -0
oncoref/data/disease-state-rules.csv +9 -0
oncoref/data/ensembl-id-aliases.csv +468 -0
oncoref/data/expression_sources.yaml +964 -0
oncoref/data/extra-tx-mappings.csv +213 -0
oncoref/data/family-burden-map.csv +10 -0
oncoref/data/fusion-expression-effects.csv +7 -0
oncoref/data/fusion-surrogate-expression.csv +52 -0
oncoref/data/hemoglobin-genes.csv +13 -0
oncoref/data/histone-genes.csv +193 -0
oncoref/data/housekeeping-genes.csv +31 -0
oncoref/data/mitochondrial-genes.csv +38 -0
oncoref/data/narrative-gene-sets.csv +4 -0
oncoref/data/ncbi-symbol-synonyms.csv.gz +0 -0
oncoref/data/nuclear-retained-lncrnas.csv +5 -0
oncoref/data/numt-pseudogenes.csv +413 -0
oncoref/data/proteoform-collapse-overrides.csv +2 -0
oncoref/data/proteoform-groups-genome.csv +411 -0
oncoref/data/proteoform-groups.csv +48 -0
oncoref/data/rare-cancer-fusion-rules.csv +27 -0
oncoref/data/ribosomal-protein-genes.csv +123 -0
oncoref/data/ribosomal-protein-pseudogenes.csv +1641 -0
oncoref/data/rrna-and-pseudogenes.csv +558 -0
oncoref/data/small-noncoding-rnas.csv +6396 -0
oncoref/data/source-matrices.csv +119 -0
oncoref/data/tissue-burden-map.csv +71 -0
oncoref/data_bundle.py +378 -0
oncoref/data_manifest.py +283 -0
oncoref/expression.py +1062 -0
oncoref/expression_builders.py +334 -0
oncoref/expression_engine.py +168 -0
oncoref/expression_registry.py +162 -0
oncoref/fusions.py +186 -0
oncoref/gene_families.py +126 -0
oncoref/gene_ids.py +88 -0
oncoref/gene_qc.py +233 -0
oncoref/genome.py +298 -0
oncoref/hpa.py +101 -0
oncoref/ici.py +169 -0
oncoref/incidence.py +156 -0
oncoref/load_dataset.py +233 -0
oncoref/normalization.py +607 -0
oncoref/peptides.py +237 -0
oncoref/plots.py +1083 -0
oncoref/proteoforms.py +302 -0
oncoref/reference_data.py +287 -0
oncoref/response_signatures.py +92 -0
oncoref/samples.py +70 -0
oncoref/source_matrices.py +145 -0
oncoref/tmb.py +101 -0
oncoref/version.py +31 -0
oncoref-1.6.0.dist-info/METADATA +169 -0
oncoref-1.6.0.dist-info/RECORD +94 -0
oncoref-1.6.0.dist-info/WHEEL +5 -0
oncoref-1.6.0.dist-info/entry_points.txt +2 -0
oncoref-1.6.0.dist-info/licenses/LICENSE +201 -0
oncoref-1.6.0.dist-info/top_level.txt +1 -0

oncoref/__init__.py ADDED Viewed

@@ -0,0 +1,390 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""oncoref — curated cancer reference data (ontology, TMB, incidence/
+mortality, and expression) with a single fetch/cache surface.
+Bottom-of-stack: depends only on pandas/numpy/pyarrow, never on the analysis
+or target-selection libraries that consume it.
+"""
+from .apd1 import cancer_apd1_response, cancer_apd1_response_df
+from .cancer_types import (
+    CANCER_TYPE_ALIASES,
+    CANCER_TYPE_NAMES,
+    cancer_lineage_group,
+    cancer_lineage_group_overrides,
+    cancer_lineage_groups,
+    cancer_subtype_group,
+    cancer_subtype_groupings,
+    cancer_type_ancestors,
+    cancer_type_descendants,
+    cancer_type_families,
+    cancer_type_info,
+    cancer_type_lineage,
+    cancer_type_registry,
+    cancer_type_subtypes_of,
+    cancer_type_synonyms,
+    cancer_type_tree,
+    cancer_types_by_tissue,
+    cancer_types_in_family,
+    canonical_cancer_code,
+    cohort_aggregate_members,
+    cohort_aggregates,
+    cohort_aggregates_df,
+    cohort_kind,
+    cohort_registry,
+    cohort_registry_df,
+    family_display_name,
+    format_cancer_code_label,
+    fusion_status,
+    is_mixture_cohort,
+    known_cohort_ids,
+    mixture_cohort_codes,
+    resolve_cancer_type,
+    sarcoma_lineage_codes,
+    tissue_of_origin,
+    viral_status,
+)
+from .coverage import (
+    addressable_fraction,
+    addressable_fraction_by_cohort,
+    cta_patient_fractions,
+    greedy_coverage,
+    mean_antigens_per_patient,
+    mean_antigens_per_patient_by_cohort,
+)
+from .cta import (
+    cta_candidate_references,
+    cta_df,
+    cta_evidence,
+    cta_excluded_gene_names,
+    cta_filtered_gene_ids,
+    cta_filtered_gene_names,
+    cta_gene_id_to_name,
+    cta_gene_ids,
+    cta_gene_names,
+    cta_never_expressed_gene_names,
+    cta_unfiltered_gene_ids,
+    cta_unfiltered_gene_names,
+)
+from .expression import (
+    SHARD_DATASETS,
+    ShardDataset,
+    available_percentile_cohorts,
+    available_representative_cohorts,
+    available_within_sample_cohorts,
+    cohort_gene_percentiles,
+    cohort_mean_expression,
+    cohort_stats,
+    gene_cohort_mean_expression,
+    gene_cohort_percentiles,
+    gene_cohort_stats,
+    gene_per_sample_expression,
+    gene_pooled_cohort_stats,
+    gene_representative_samples,
+    gene_within_sample_top_fraction,
+    pan_cancer_expression,
+    per_sample_expression,
+    pooled_cohort_stats,
+    proteoform_cohort_mean_expression,
+    proteoform_cohort_percentiles,
+    proteoform_cohort_stats,
+    proteoform_per_sample_expression,
+    proteoform_pooled_cohort_stats,
+    proteoform_representative_samples,
+    proteoform_within_sample_top_fraction,
+    representative_cohort_samples,
+    within_sample_top_fraction,
+)
+from .expression_engine import aggregate_transcripts_to_genes, id_columns, sample_columns
+from .expression_registry import (
+    ExpressionSource,
+    expression_source,
+    expression_sources,
+    expression_sources_df,
+    sources_for_cancer_code,
+)
+from .fusions import (
+    cancer_fusions,
+    cancer_fusions_df,
+    cancer_types_with_fusion,
+    fusion_partners,
+    protein_family,
+)
+from .gene_families import TECHNICAL_RNA_FAMILIES
+from .gene_qc import TECHNICAL_RNA_GROUPS, GeneQcClass, classify_gene_qc, is_rescue_feature
+from .genome import (
+    aggregate_gene_expression,
+    canonical_gene_id_and_name,
+    canonical_gene_ids_and_names,
+    find_gene_id_by_name,
+    find_gene_name_from_ensembl_gene_id,
+    find_gene_name_from_ensembl_transcript_id,
+    genomes,
+)
+from .hpa import (
+    gene_cell_type_ntpm,
+    gene_protein_tissues,
+    gene_tissue_ntpm,
+    hpa_normal_tissue,
+    hpa_rna_consensus,
+    hpa_single_cell,
+)
+from .ici import (
+    REGIMEN_FALLBACK,
+    REGIMEN_LABELS,
+    cancer_ici_regimen,
+    cancer_ici_response,
+    cancer_ici_response_df,
+    ici_regimens,
+)
+from .incidence import (
+    burden_category,
+    cancer_burden,
+    cancer_burden_df,
+    cancer_code_burden_map,
+)
+from .normalization import (
+    BIOLOGICAL_FRACTION,
+    OTHER_TECHNICAL_FRACTION,
+    RIBOSOMAL_PROTEIN_FRACTION,
+    TECHNICAL_FRACTION,
+    clean_tpm,
+    drop_technical_rna,
+    filter_technical_rna,
+    fpkm_to_tpm,
+    is_expression_value_col,
+    log1p_transform,
+    log2_transform,
+    normalize_expression,
+    normalize_technical_rna_columns,
+    normalize_technical_rna_long_table,
+    normalize_to_housekeeping,
+    percentile_rank,
+    renormalize_to_million,
+    tpm_to_housekeeping_normalized,
+)
+from .peptides import (
+    cta_specific_9mer_counts,
+    cta_specific_9mer_load,
+    cta_specific_9mer_weights,
+)
+from .proteoforms import (
+    collapse_to_proteoforms,
+    expression_level,
+    gene_to_proteoform,
+    gene_to_proteoform_id,
+    proteoform_aliases,
+    proteoform_for_gene,
+    proteoform_group_map,
+    proteoform_groups,
+    proteoform_key,
+    proteoform_members_for_gene,
+    proteoform_symbol,
+    proteoform_symbol_map,
+)
+from .response_signatures import (
+    response_signature_direction,
+    response_signature_genes,
+    response_signature_names,
+    response_signatures_df,
+    signature_score,
+)
+from .samples import (
+    sample_counts_by_cancer_code,
+    sample_manifest,
+    samples_for_cancer_code,
+    samples_for_cohort,
+)
+from .tmb import cancer_tmb, cancer_tmb_df
+from .version import __version__
+__all__ = [
+    # Kept sorted, so each topic comment labels only the names directly below it.
+    "BIOLOGICAL_FRACTION",
+    # ontology / registry constants (from .cancer_types)
+    "CANCER_TYPE_ALIASES",
+    "CANCER_TYPE_NAMES",
+    "OTHER_TECHNICAL_FRACTION",
+    "REGIMEN_FALLBACK",
+    "REGIMEN_LABELS",
+    "RIBOSOMAL_PROTEIN_FRACTION",
+    "SHARD_DATASETS",
+    "TECHNICAL_FRACTION",
+    "TECHNICAL_RNA_FAMILIES",
+    "TECHNICAL_RNA_GROUPS",
+    "ExpressionSource",
+    "GeneQcClass",
+    "ShardDataset",
+    "__version__",
+    # coverage helpers (from .coverage)
+    "addressable_fraction",
+    "addressable_fraction_by_cohort",
+    "aggregate_gene_expression",
+    "aggregate_transcripts_to_genes",
+    "available_percentile_cohorts",
+    "available_representative_cohorts",
+    "available_within_sample_cohorts",
+    "burden_category",
+    # anti-PD-1 response (from .apd1)
+    "cancer_apd1_response",
+    "cancer_apd1_response_df",
+    # incidence / mortality (from .incidence)
+    "cancer_burden",
+    "cancer_burden_df",
+    "cancer_code_burden_map",
+    "cancer_fusions",
+    "cancer_fusions_df",
+    "cancer_ici_regimen",
+    "cancer_ici_response",
+    "cancer_ici_response_df",
+    "cancer_lineage_group",
+    "cancer_lineage_group_overrides",
+    "cancer_lineage_groups",
+    "cancer_subtype_group",
+    "cancer_subtype_groupings",
+    # TMB (from .tmb)
+    "cancer_tmb",
+    "cancer_tmb_df",
+    "cancer_type_ancestors",
+    "cancer_type_descendants",
+    "cancer_type_families",
+    "cancer_type_info",
+    "cancer_type_lineage",
+    "cancer_type_registry",
+    "cancer_type_subtypes_of",
+    "cancer_type_synonyms",
+    "cancer_type_tree",
+    "cancer_types_by_tissue",
+    "cancer_types_in_family",
+    "cancer_types_with_fusion",
+    "canonical_cancer_code",
+    "canonical_gene_id_and_name",
+    "canonical_gene_ids_and_names",
+    "classify_gene_qc",
+    "clean_tpm",
+    "cohort_aggregate_members",
+    # cohort vocabulary (cohort_* names from .expression are interleaved by the sort)
+    "cohort_aggregates",
+    "cohort_aggregates_df",
+    "cohort_gene_percentiles",
+    "cohort_kind",
+    "cohort_mean_expression",
+    "cohort_registry",
+    "cohort_registry_df",
+    "cohort_stats",
+    "collapse_to_proteoforms",
+    "cta_candidate_references",
+    "cta_df",
+    # cancer-testis antigens (the cta_* run begins two names above)
+    "cta_evidence",
+    "cta_excluded_gene_names",
+    "cta_filtered_gene_ids",
+    "cta_filtered_gene_names",
+    "cta_gene_id_to_name",
+    "cta_gene_ids",
+    "cta_gene_names",
+    "cta_never_expressed_gene_names",
+    "cta_patient_fractions",
+    "cta_specific_9mer_counts",
+    "cta_specific_9mer_load",
+    "cta_specific_9mer_weights",
+    "cta_unfiltered_gene_ids",
+    "cta_unfiltered_gene_names",
+    "drop_technical_rna",
+    "expression_level",
+    "expression_source",
+    "expression_sources",
+    "expression_sources_df",
+    "family_display_name",
+    "filter_technical_rna",
+    "find_gene_id_by_name",
+    "find_gene_name_from_ensembl_gene_id",
+    "find_gene_name_from_ensembl_transcript_id",
+    "format_cancer_code_label",
+    "fpkm_to_tpm",
+    "fusion_partners",
+    "fusion_status",
+    "gene_cell_type_ntpm",
+    "gene_cohort_mean_expression",
+    "gene_cohort_percentiles",
+    "gene_cohort_stats",
+    "gene_per_sample_expression",
+    "gene_pooled_cohort_stats",
+    "gene_protein_tissues",
+    "gene_representative_samples",
+    "gene_tissue_ntpm",
+    "gene_to_proteoform",
+    "gene_to_proteoform_id",
+    "gene_within_sample_top_fraction",
+    "genomes",
+    "greedy_coverage",
+    "hpa_normal_tissue",
+    # HPA normal-tissue reference data (the hpa_* run begins one name above)
+    "hpa_rna_consensus",
+    "hpa_single_cell",
+    "ici_regimens",
+    "id_columns",
+    "is_expression_value_col",
+    "is_mixture_cohort",
+    "is_rescue_feature",
+    "known_cohort_ids",
+    "log1p_transform",
+    "log2_transform",
+    "mean_antigens_per_patient",
+    "mean_antigens_per_patient_by_cohort",
+    "mixture_cohort_codes",
+    "normalize_expression",
+    "normalize_technical_rna_columns",
+    "normalize_technical_rna_long_table",
+    "normalize_to_housekeeping",
+    "pan_cancer_expression",
+    "per_sample_expression",
+    "percentile_rank",
+    "pooled_cohort_stats",
+    "protein_family",
+    "proteoform_aliases",
+    "proteoform_cohort_mean_expression",
+    "proteoform_cohort_percentiles",
+    "proteoform_cohort_stats",
+    "proteoform_for_gene",
+    "proteoform_group_map",
+    "proteoform_groups",
+    "proteoform_key",
+    "proteoform_members_for_gene",
+    "proteoform_per_sample_expression",
+    "proteoform_pooled_cohort_stats",
+    "proteoform_representative_samples",
+    "proteoform_symbol",
+    "proteoform_symbol_map",
+    "proteoform_within_sample_top_fraction",
+    "renormalize_to_million",
+    "representative_cohort_samples",
+    "resolve_cancer_type",
+    "response_signature_direction",
+    "response_signature_genes",
+    "response_signature_names",
+    "response_signatures_df",
+    "sample_columns",
+    "sample_counts_by_cancer_code",
+    "sample_manifest",
+    "samples_for_cancer_code",
+    "samples_for_cohort",
+    "sarcoma_lineage_codes",
+    "signature_score",
+    "sources_for_cancer_code",
+    "tissue_of_origin",
+    "tpm_to_housekeeping_normalized",
+    "viral_status",
+    "within_sample_top_fraction",
+]

oncoref/apd1.py ADDED Viewed

@@ -0,0 +1,59 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Anti-PD-1 monotherapy response (objective response rate) by cancer type."""
+from __future__ import annotations
+from .cancer_types import cancer_type_registry, resolve_cancer_type
+from .load_dataset import get_data
+def cancer_apd1_response_df():
+    """Return the curated ``cancer-apd1-response.csv`` reference: representative
+    objective response rate (ORR, %) to anti-PD-1 **monotherapy**
+    (pembrolizumab / nivolumab) per cancer-type code, with the drug, pivotal
+    trial, treatment setting, a published source PMID/DOI, and a confidence flag.
+    Intended as a per-cancer-type plotting axis (e.g. TMB vs aPD1 ORR, CTA burden
+    vs aPD1 ORR). Values are representative anchors, not exact reproducible
+    constants — they shift with data cutoff, line of therapy, and biomarker
+    selection (PD-L1 / MSI / MMR); the ``setting`` and ``notes`` columns record
+    that context."""
+    return get_data("cancer-apd1-response")
+def cancer_apd1_response(cancer_type=None, *, inherit=True):
+    """Anti-PD-1 monotherapy ORR (%) for one cancer type, or the whole
+    ``{code: orr_pct}`` map. ``cancer_type`` is resolved through
+    :func:`resolve_cancer_type`; with ``inherit`` (default) a code with no
+    curated row of its own inherits its nearest ancestor's value via the registry
+    ``parent_code`` chain. Returns ``None`` if neither the code nor any ancestor
+    has a value. Mirrors :func:`oncoref.cancer_tmb`."""
+    df = cancer_apd1_response_df()
+    vals = df.dropna(subset=["apd1_orr_pct"])
+    mapping = dict(zip(vals["cancer_code"].astype(str), vals["apd1_orr_pct"].astype(float)))
+    if cancer_type is None:
+        return mapping
+    code = resolve_cancer_type(cancer_type)
+    if code in mapping or not inherit:
+        return mapping.get(code)
+    reg = cancer_type_registry().set_index("code")
+    cur, seen = code, set()
+    while cur and cur not in seen:
+        seen.add(cur)
+        if cur in mapping:
+            return mapping[cur]
+        if cur not in reg.index:
+            break
+        cur = str(reg.loc[cur].get("parent_code", "") or "").strip() or None
+    return None

oncoref/cancer_genes.py ADDED Viewed

@@ -0,0 +1,149 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Per-cancer-type gene biology: drivers, key (biomarker/target) genes, role-
+stratified type genes, viral antigens, and a few narrative/rule tables.
+The curated ontology metadata that hangs off the cancer-type registry. All code
+arguments are alias-resolved via :func:`oncoref.resolve_cancer_type`.
+"""
+from __future__ import annotations
+import pandas as pd
+from .cancer_types import resolve_cancer_type
+from .load_dataset import get_data
+def _split(value, sep=";") -> list[str]:
+    return [x.strip() for x in str(value).split(sep) if x.strip() and x.strip().lower() != "nan"]
+# ---------- driver genes / variants ----------
+def cancer_driver_genes_df() -> pd.DataFrame:
+    """Curated cancer driver genes (``Symbol``, ``Cancer``, ``Function``,
+    ``Ensembl_Gene_ID``, …). Defensive copy."""
+    return get_data("cancer-driver-genes").copy()
+def cancer_driver_variants_df() -> pd.DataFrame:
+    """Curated driver variants (``Symbol``, ``Mutation``, ``Ensembl_Gene_ID``, …)."""
+    return get_data("cancer-driver-variants").copy()
+# ---------- key genes: biomarkers + therapy targets ----------
+def cancer_key_genes_df() -> pd.DataFrame:
+    """Per-type key genes — ``role`` ∈ {biomarker, target} with agent/phase/
+    indication context. Defensive copy."""
+    return get_data("cancer-key-genes").copy()
+def _key_genes_for(cancer_type, *, subtype=None) -> pd.DataFrame:
+    df = cancer_key_genes_df()
+    df = df[df["cancer_code"].astype(str) == resolve_cancer_type(cancer_type)]
+    if subtype is not None:
+        df = df[df["subtype"].astype(str) == str(subtype)]
+    return df
+def cancer_biomarker_genes(cancer_type, *, subtype=None) -> list[str]:
+    """Biomarker gene symbols for a cancer type (ordered, de-duplicated)."""
+    df = _key_genes_for(cancer_type, subtype=subtype)
+    syms = df[df["role"].astype(str) == "biomarker"]["symbol"].astype(str)
+    return list(dict.fromkeys(syms))
+def cancer_therapy_targets(cancer_type, *, subtype=None) -> pd.DataFrame:
+    """Therapy-target rows for a cancer type (agent / phase / indication)."""
+    df = _key_genes_for(cancer_type, subtype=subtype)
+    return df[df["role"].astype(str) == "target"].reset_index(drop=True)
+# ---------- role-stratified type genes ----------
+def cancer_type_genes_df() -> pd.DataFrame:
+    """Role-stratified per-type genes (``Symbol``, ``Ensembl_Gene_ID``,
+    ``Cancer_Type``, ``Role``). Defensive copy."""
+    return get_data("cancer-type-genes").copy()
+def cancer_type_gene_sets(cancer_type) -> dict[str, dict[str, str]]:
+    """``{role: {ensembl_id: symbol}}`` for one cancer type (empty if none curated)."""
+    code = resolve_cancer_type(cancer_type)
+    df = cancer_type_genes_df()
+    df = df[df["Cancer_Type"].astype(str) == code]
+    out: dict[str, dict[str, str]] = {}
+    for _, row in df.iterrows():
+        out.setdefault(str(row["Role"]), {})[str(row["Ensembl_Gene_ID"])] = str(row["Symbol"])
+    return out
+# ---------- viral antigens ----------
+def cancer_viral_antigens_df() -> pd.DataFrame:
+    """Per-oncovirus targetable antigens (``virus``, ``targetable_antigens``,
+    ``associated_cohorts``, …). Defensive copy."""
+    return get_data("cancer-viral-antigens").copy()
+def cancer_viral_antigens(virus: str | None = None):
+    """Targetable viral antigens. With ``virus`` (case-insensitive), the list for
+    that virus (``[]`` if unknown); otherwise a ``{virus: [antigen, …]}`` map."""
+    df = cancer_viral_antigens_df()
+    if virus is not None:
+        hit = df[df["virus"].astype(str).str.lower() == str(virus).strip().lower()]
+        return _split(hit.iloc[0]["targetable_antigens"]) if not hit.empty else []
+    return {str(r.virus): _split(r.targetable_antigens) for r in df.itertuples()}
+def viral_antigens_for_cancer(cancer_type) -> list[tuple[str, list[str]]]:
+    """``[(virus, [antigen, …]), …]`` for a cancer type — the reverse lookup over
+    ``associated_cohorts``. Empty for a non-virally-driven entity."""
+    code = resolve_cancer_type(cancer_type)
+    out = []
+    for r in cancer_viral_antigens_df().itertuples():
+        if code in _split(r.associated_cohorts):
+            out.append((str(r.virus), _split(r.targetable_antigens)))
+    return out
+# ---------- narrative / rule tables ----------
+def narrative_gene_sets_df() -> pd.DataFrame:
+    """Named narrative gene sets (``set_name``, ``members``, ``notes``)."""
+    return get_data("narrative-gene-sets").copy()
+def narrative_gene_set(set_name: str) -> list[str]:
+    """Member gene symbols of a named narrative set (``[]`` if unknown)."""
+    df = narrative_gene_sets_df()
+    hit = df[df["set_name"].astype(str) == str(set_name)]
+    return _split(hit.iloc[0]["members"]) if not hit.empty else []
+def disease_state_rules_df() -> pd.DataFrame:
+    """Declarative disease-state rules (``rule_id``, ``cancer_code``, ``claims``,
+    ``conditions``, ``narrative``). Defensive copy."""
+    return get_data("disease-state-rules").copy()
+def degenerate_subtype_pairs_df() -> pd.DataFrame:
+    """Expression-degenerate subtype pairs + their tiebreaker rules. Defensive copy."""
+    return get_data("degenerate-subtype-pairs").copy()