oncoref 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. oncoref/__init__.py +390 -0
  2. oncoref/apd1.py +59 -0
  3. oncoref/cancer_genes.py +149 -0
  4. oncoref/cancer_types.py +827 -0
  5. oncoref/catalog.py +261 -0
  6. oncoref/cli.py +633 -0
  7. oncoref/coverage.py +282 -0
  8. oncoref/cta.py +297 -0
  9. oncoref/cta_regen.py +560 -0
  10. oncoref/cta_tissues.py +152 -0
  11. oncoref/data/cancer-apd1-response.csv +82 -0
  12. oncoref/data/cancer-code-burden-map.csv +5 -0
  13. oncoref/data/cancer-cohort-aggregates.csv +18 -0
  14. oncoref/data/cancer-driver-genes.csv +740 -0
  15. oncoref/data/cancer-driver-variants.csv +580 -0
  16. oncoref/data/cancer-expression-source-candidates.csv +59 -0
  17. oncoref/data/cancer-frameshift-burden.csv +37 -0
  18. oncoref/data/cancer-fusions.csv +172 -0
  19. oncoref/data/cancer-ici-response.csv +105 -0
  20. oncoref/data/cancer-incidence-mortality.csv +38 -0
  21. oncoref/data/cancer-key-genes.csv +852 -0
  22. oncoref/data/cancer-lineage-group-overrides.csv +2 -0
  23. oncoref/data/cancer-lineage-groups.csv +28 -0
  24. oncoref/data/cancer-reference-expression-samples.csv.gz +0 -0
  25. oncoref/data/cancer-response-signatures.csv +25 -0
  26. oncoref/data/cancer-subtype-groupings.csv +13 -0
  27. oncoref/data/cancer-testis-antigens.csv +398 -0
  28. oncoref/data/cancer-tmb.csv +123 -0
  29. oncoref/data/cancer-type-genes.csv +578 -0
  30. oncoref/data/cancer-type-registry.csv +165 -0
  31. oncoref/data/cancer-viral-antigens.csv +7 -0
  32. oncoref/data/cdna-identical-gene-groups.csv +327 -0
  33. oncoref/data/censored-gene-reference-tpm.csv +1729 -0
  34. oncoref/data/clean-tpm-censored-genes.csv +2787 -0
  35. oncoref/data/cohort-registry.csv +42 -0
  36. oncoref/data/cta-candidate-references.csv +13 -0
  37. oncoref/data/cta-ihc-unreliable.csv +7 -0
  38. oncoref/data/degenerate-subtype-pairs.csv +8 -0
  39. oncoref/data/disease-state-rules.csv +9 -0
  40. oncoref/data/ensembl-id-aliases.csv +468 -0
  41. oncoref/data/expression_sources.yaml +964 -0
  42. oncoref/data/extra-tx-mappings.csv +213 -0
  43. oncoref/data/family-burden-map.csv +10 -0
  44. oncoref/data/fusion-expression-effects.csv +7 -0
  45. oncoref/data/fusion-surrogate-expression.csv +52 -0
  46. oncoref/data/hemoglobin-genes.csv +13 -0
  47. oncoref/data/histone-genes.csv +193 -0
  48. oncoref/data/housekeeping-genes.csv +31 -0
  49. oncoref/data/mitochondrial-genes.csv +38 -0
  50. oncoref/data/narrative-gene-sets.csv +4 -0
  51. oncoref/data/ncbi-symbol-synonyms.csv.gz +0 -0
  52. oncoref/data/nuclear-retained-lncrnas.csv +5 -0
  53. oncoref/data/numt-pseudogenes.csv +413 -0
  54. oncoref/data/proteoform-collapse-overrides.csv +2 -0
  55. oncoref/data/proteoform-groups-genome.csv +411 -0
  56. oncoref/data/proteoform-groups.csv +48 -0
  57. oncoref/data/rare-cancer-fusion-rules.csv +27 -0
  58. oncoref/data/ribosomal-protein-genes.csv +123 -0
  59. oncoref/data/ribosomal-protein-pseudogenes.csv +1641 -0
  60. oncoref/data/rrna-and-pseudogenes.csv +558 -0
  61. oncoref/data/small-noncoding-rnas.csv +6396 -0
  62. oncoref/data/source-matrices.csv +119 -0
  63. oncoref/data/tissue-burden-map.csv +71 -0
  64. oncoref/data_bundle.py +378 -0
  65. oncoref/data_manifest.py +283 -0
  66. oncoref/expression.py +1062 -0
  67. oncoref/expression_builders.py +334 -0
  68. oncoref/expression_engine.py +168 -0
  69. oncoref/expression_registry.py +162 -0
  70. oncoref/fusions.py +186 -0
  71. oncoref/gene_families.py +126 -0
  72. oncoref/gene_ids.py +88 -0
  73. oncoref/gene_qc.py +233 -0
  74. oncoref/genome.py +298 -0
  75. oncoref/hpa.py +101 -0
  76. oncoref/ici.py +169 -0
  77. oncoref/incidence.py +156 -0
  78. oncoref/load_dataset.py +233 -0
  79. oncoref/normalization.py +607 -0
  80. oncoref/peptides.py +237 -0
  81. oncoref/plots.py +1083 -0
  82. oncoref/proteoforms.py +302 -0
  83. oncoref/reference_data.py +287 -0
  84. oncoref/response_signatures.py +92 -0
  85. oncoref/samples.py +70 -0
  86. oncoref/source_matrices.py +145 -0
  87. oncoref/tmb.py +101 -0
  88. oncoref/version.py +31 -0
  89. oncoref-1.6.0.dist-info/METADATA +169 -0
  90. oncoref-1.6.0.dist-info/RECORD +94 -0
  91. oncoref-1.6.0.dist-info/WHEEL +5 -0
  92. oncoref-1.6.0.dist-info/entry_points.txt +2 -0
  93. oncoref-1.6.0.dist-info/licenses/LICENSE +201 -0
  94. oncoref-1.6.0.dist-info/top_level.txt +1 -0
oncoref/__init__.py ADDED
@@ -0,0 +1,390 @@
1
+ # Licensed under the Apache License, Version 2.0 (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ """oncoref — curated cancer reference data (ontology, TMB, incidence/
14
+ mortality, and expression) with a single fetch/cache surface.
15
+
16
+ Bottom-of-stack: depends only on pandas/numpy/pyarrow, never on the analysis
17
+ or target-selection libraries that consume it.
18
+ """
19
+
20
+ from .apd1 import cancer_apd1_response, cancer_apd1_response_df
21
+ from .cancer_types import (
22
+ CANCER_TYPE_ALIASES,
23
+ CANCER_TYPE_NAMES,
24
+ cancer_lineage_group,
25
+ cancer_lineage_group_overrides,
26
+ cancer_lineage_groups,
27
+ cancer_subtype_group,
28
+ cancer_subtype_groupings,
29
+ cancer_type_ancestors,
30
+ cancer_type_descendants,
31
+ cancer_type_families,
32
+ cancer_type_info,
33
+ cancer_type_lineage,
34
+ cancer_type_registry,
35
+ cancer_type_subtypes_of,
36
+ cancer_type_synonyms,
37
+ cancer_type_tree,
38
+ cancer_types_by_tissue,
39
+ cancer_types_in_family,
40
+ canonical_cancer_code,
41
+ cohort_aggregate_members,
42
+ cohort_aggregates,
43
+ cohort_aggregates_df,
44
+ cohort_kind,
45
+ cohort_registry,
46
+ cohort_registry_df,
47
+ family_display_name,
48
+ format_cancer_code_label,
49
+ fusion_status,
50
+ is_mixture_cohort,
51
+ known_cohort_ids,
52
+ mixture_cohort_codes,
53
+ resolve_cancer_type,
54
+ sarcoma_lineage_codes,
55
+ tissue_of_origin,
56
+ viral_status,
57
+ )
58
+ from .coverage import (
59
+ addressable_fraction,
60
+ addressable_fraction_by_cohort,
61
+ cta_patient_fractions,
62
+ greedy_coverage,
63
+ mean_antigens_per_patient,
64
+ mean_antigens_per_patient_by_cohort,
65
+ )
66
+ from .cta import (
67
+ cta_candidate_references,
68
+ cta_df,
69
+ cta_evidence,
70
+ cta_excluded_gene_names,
71
+ cta_filtered_gene_ids,
72
+ cta_filtered_gene_names,
73
+ cta_gene_id_to_name,
74
+ cta_gene_ids,
75
+ cta_gene_names,
76
+ cta_never_expressed_gene_names,
77
+ cta_unfiltered_gene_ids,
78
+ cta_unfiltered_gene_names,
79
+ )
80
+ from .expression import (
81
+ SHARD_DATASETS,
82
+ ShardDataset,
83
+ available_percentile_cohorts,
84
+ available_representative_cohorts,
85
+ available_within_sample_cohorts,
86
+ cohort_gene_percentiles,
87
+ cohort_mean_expression,
88
+ cohort_stats,
89
+ gene_cohort_mean_expression,
90
+ gene_cohort_percentiles,
91
+ gene_cohort_stats,
92
+ gene_per_sample_expression,
93
+ gene_pooled_cohort_stats,
94
+ gene_representative_samples,
95
+ gene_within_sample_top_fraction,
96
+ pan_cancer_expression,
97
+ per_sample_expression,
98
+ pooled_cohort_stats,
99
+ proteoform_cohort_mean_expression,
100
+ proteoform_cohort_percentiles,
101
+ proteoform_cohort_stats,
102
+ proteoform_per_sample_expression,
103
+ proteoform_pooled_cohort_stats,
104
+ proteoform_representative_samples,
105
+ proteoform_within_sample_top_fraction,
106
+ representative_cohort_samples,
107
+ within_sample_top_fraction,
108
+ )
109
+ from .expression_engine import aggregate_transcripts_to_genes, id_columns, sample_columns
110
+ from .expression_registry import (
111
+ ExpressionSource,
112
+ expression_source,
113
+ expression_sources,
114
+ expression_sources_df,
115
+ sources_for_cancer_code,
116
+ )
117
+ from .fusions import (
118
+ cancer_fusions,
119
+ cancer_fusions_df,
120
+ cancer_types_with_fusion,
121
+ fusion_partners,
122
+ protein_family,
123
+ )
124
+ from .gene_families import TECHNICAL_RNA_FAMILIES
125
+ from .gene_qc import TECHNICAL_RNA_GROUPS, GeneQcClass, classify_gene_qc, is_rescue_feature
126
+ from .genome import (
127
+ aggregate_gene_expression,
128
+ canonical_gene_id_and_name,
129
+ canonical_gene_ids_and_names,
130
+ find_gene_id_by_name,
131
+ find_gene_name_from_ensembl_gene_id,
132
+ find_gene_name_from_ensembl_transcript_id,
133
+ genomes,
134
+ )
135
+ from .hpa import (
136
+ gene_cell_type_ntpm,
137
+ gene_protein_tissues,
138
+ gene_tissue_ntpm,
139
+ hpa_normal_tissue,
140
+ hpa_rna_consensus,
141
+ hpa_single_cell,
142
+ )
143
+ from .ici import (
144
+ REGIMEN_FALLBACK,
145
+ REGIMEN_LABELS,
146
+ cancer_ici_regimen,
147
+ cancer_ici_response,
148
+ cancer_ici_response_df,
149
+ ici_regimens,
150
+ )
151
+ from .incidence import (
152
+ burden_category,
153
+ cancer_burden,
154
+ cancer_burden_df,
155
+ cancer_code_burden_map,
156
+ )
157
+ from .normalization import (
158
+ BIOLOGICAL_FRACTION,
159
+ OTHER_TECHNICAL_FRACTION,
160
+ RIBOSOMAL_PROTEIN_FRACTION,
161
+ TECHNICAL_FRACTION,
162
+ clean_tpm,
163
+ drop_technical_rna,
164
+ filter_technical_rna,
165
+ fpkm_to_tpm,
166
+ is_expression_value_col,
167
+ log1p_transform,
168
+ log2_transform,
169
+ normalize_expression,
170
+ normalize_technical_rna_columns,
171
+ normalize_technical_rna_long_table,
172
+ normalize_to_housekeeping,
173
+ percentile_rank,
174
+ renormalize_to_million,
175
+ tpm_to_housekeeping_normalized,
176
+ )
177
+ from .peptides import (
178
+ cta_specific_9mer_counts,
179
+ cta_specific_9mer_load,
180
+ cta_specific_9mer_weights,
181
+ )
182
+ from .proteoforms import (
183
+ collapse_to_proteoforms,
184
+ expression_level,
185
+ gene_to_proteoform,
186
+ gene_to_proteoform_id,
187
+ proteoform_aliases,
188
+ proteoform_for_gene,
189
+ proteoform_group_map,
190
+ proteoform_groups,
191
+ proteoform_key,
192
+ proteoform_members_for_gene,
193
+ proteoform_symbol,
194
+ proteoform_symbol_map,
195
+ )
196
+ from .response_signatures import (
197
+ response_signature_direction,
198
+ response_signature_genes,
199
+ response_signature_names,
200
+ response_signatures_df,
201
+ signature_score,
202
+ )
203
+ from .samples import (
204
+ sample_counts_by_cancer_code,
205
+ sample_manifest,
206
+ samples_for_cancer_code,
207
+ samples_for_cohort,
208
+ )
209
+ from .tmb import cancer_tmb, cancer_tmb_df
210
+ from .version import __version__
211
+
212
+ __all__ = [
213
+ # expression sources + per-sample curation
214
+ "BIOLOGICAL_FRACTION",
215
+ # ontology / registry
216
+ "CANCER_TYPE_ALIASES",
217
+ "CANCER_TYPE_NAMES",
218
+ "OTHER_TECHNICAL_FRACTION",
219
+ "REGIMEN_FALLBACK",
220
+ "REGIMEN_LABELS",
221
+ "RIBOSOMAL_PROTEIN_FRACTION",
222
+ "SHARD_DATASETS",
223
+ "TECHNICAL_FRACTION",
224
+ "TECHNICAL_RNA_FAMILIES",
225
+ "TECHNICAL_RNA_GROUPS",
226
+ "ExpressionSource",
227
+ "GeneQcClass",
228
+ "ShardDataset",
229
+ "__version__",
230
+ # expression (read accessors over the downloadable bundle)
231
+ "addressable_fraction",
232
+ "addressable_fraction_by_cohort",
233
+ "aggregate_gene_expression",
234
+ "aggregate_transcripts_to_genes",
235
+ "available_percentile_cohorts",
236
+ "available_representative_cohorts",
237
+ "available_within_sample_cohorts",
238
+ "burden_category",
239
+ # anti-PD-1 response
240
+ "cancer_apd1_response",
241
+ "cancer_apd1_response_df",
242
+ # incidence / mortality
243
+ "cancer_burden",
244
+ "cancer_burden_df",
245
+ "cancer_code_burden_map",
246
+ "cancer_fusions",
247
+ "cancer_fusions_df",
248
+ "cancer_ici_regimen",
249
+ "cancer_ici_response",
250
+ "cancer_ici_response_df",
251
+ "cancer_lineage_group",
252
+ "cancer_lineage_group_overrides",
253
+ "cancer_lineage_groups",
254
+ "cancer_subtype_group",
255
+ "cancer_subtype_groupings",
256
+ # TMB
257
+ "cancer_tmb",
258
+ "cancer_tmb_df",
259
+ "cancer_type_ancestors",
260
+ "cancer_type_descendants",
261
+ "cancer_type_families",
262
+ "cancer_type_info",
263
+ "cancer_type_lineage",
264
+ "cancer_type_registry",
265
+ "cancer_type_subtypes_of",
266
+ "cancer_type_synonyms",
267
+ "cancer_type_tree",
268
+ "cancer_types_by_tissue",
269
+ "cancer_types_in_family",
270
+ "cancer_types_with_fusion",
271
+ "canonical_cancer_code",
272
+ "canonical_gene_id_and_name",
273
+ "canonical_gene_ids_and_names",
274
+ "classify_gene_qc",
275
+ "clean_tpm",
276
+ "cohort_aggregate_members",
277
+ # cohort vocabulary
278
+ "cohort_aggregates",
279
+ "cohort_aggregates_df",
280
+ "cohort_gene_percentiles",
281
+ "cohort_kind",
282
+ "cohort_mean_expression",
283
+ "cohort_registry",
284
+ "cohort_registry_df",
285
+ "cohort_stats",
286
+ "collapse_to_proteoforms",
287
+ "cta_candidate_references",
288
+ "cta_df",
289
+ # cancer-testis antigens
290
+ "cta_evidence",
291
+ "cta_excluded_gene_names",
292
+ "cta_filtered_gene_ids",
293
+ "cta_filtered_gene_names",
294
+ "cta_gene_id_to_name",
295
+ "cta_gene_ids",
296
+ "cta_gene_names",
297
+ "cta_never_expressed_gene_names",
298
+ "cta_patient_fractions",
299
+ "cta_specific_9mer_counts",
300
+ "cta_specific_9mer_load",
301
+ "cta_specific_9mer_weights",
302
+ "cta_unfiltered_gene_ids",
303
+ "cta_unfiltered_gene_names",
304
+ "drop_technical_rna",
305
+ "expression_level",
306
+ "expression_source",
307
+ "expression_sources",
308
+ "expression_sources_df",
309
+ "family_display_name",
310
+ "filter_technical_rna",
311
+ "find_gene_id_by_name",
312
+ "find_gene_name_from_ensembl_gene_id",
313
+ "find_gene_name_from_ensembl_transcript_id",
314
+ "format_cancer_code_label",
315
+ "fpkm_to_tpm",
316
+ "fusion_partners",
317
+ "fusion_status",
318
+ "gene_cell_type_ntpm",
319
+ "gene_cohort_mean_expression",
320
+ "gene_cohort_percentiles",
321
+ "gene_cohort_stats",
322
+ "gene_per_sample_expression",
323
+ "gene_pooled_cohort_stats",
324
+ "gene_protein_tissues",
325
+ "gene_representative_samples",
326
+ "gene_tissue_ntpm",
327
+ "gene_to_proteoform",
328
+ "gene_to_proteoform_id",
329
+ "gene_within_sample_top_fraction",
330
+ "genomes",
331
+ "greedy_coverage",
332
+ "hpa_normal_tissue",
333
+ # HPA normal-tissue reference data
334
+ "hpa_rna_consensus",
335
+ "hpa_single_cell",
336
+ "ici_regimens",
337
+ "id_columns",
338
+ "is_expression_value_col",
339
+ "is_mixture_cohort",
340
+ "is_rescue_feature",
341
+ "known_cohort_ids",
342
+ "log1p_transform",
343
+ "log2_transform",
344
+ "mean_antigens_per_patient",
345
+ "mean_antigens_per_patient_by_cohort",
346
+ "mixture_cohort_codes",
347
+ "normalize_expression",
348
+ "normalize_technical_rna_columns",
349
+ "normalize_technical_rna_long_table",
350
+ "normalize_to_housekeeping",
351
+ "pan_cancer_expression",
352
+ "per_sample_expression",
353
+ "percentile_rank",
354
+ "pooled_cohort_stats",
355
+ "protein_family",
356
+ "proteoform_aliases",
357
+ "proteoform_cohort_mean_expression",
358
+ "proteoform_cohort_percentiles",
359
+ "proteoform_cohort_stats",
360
+ "proteoform_for_gene",
361
+ "proteoform_group_map",
362
+ "proteoform_groups",
363
+ "proteoform_key",
364
+ "proteoform_members_for_gene",
365
+ "proteoform_per_sample_expression",
366
+ "proteoform_pooled_cohort_stats",
367
+ "proteoform_representative_samples",
368
+ "proteoform_symbol",
369
+ "proteoform_symbol_map",
370
+ "proteoform_within_sample_top_fraction",
371
+ "renormalize_to_million",
372
+ "representative_cohort_samples",
373
+ "resolve_cancer_type",
374
+ "response_signature_direction",
375
+ "response_signature_genes",
376
+ "response_signature_names",
377
+ "response_signatures_df",
378
+ "sample_columns",
379
+ "sample_counts_by_cancer_code",
380
+ "sample_manifest",
381
+ "samples_for_cancer_code",
382
+ "samples_for_cohort",
383
+ "sarcoma_lineage_codes",
384
+ "signature_score",
385
+ "sources_for_cancer_code",
386
+ "tissue_of_origin",
387
+ "tpm_to_housekeeping_normalized",
388
+ "viral_status",
389
+ "within_sample_top_fraction",
390
+ ]
oncoref/apd1.py ADDED
@@ -0,0 +1,59 @@
1
+ # Licensed under the Apache License, Version 2.0 (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ """Anti-PD-1 monotherapy response (objective response rate) by cancer type."""
14
+
15
+ from __future__ import annotations
16
+
17
+ from .cancer_types import cancer_type_registry, resolve_cancer_type
18
+ from .load_dataset import get_data
19
+
20
+
21
def cancer_apd1_response_df():
    """Load the curated anti-PD-1 monotherapy response reference table.

    Backed by ``cancer-apd1-response.csv``: one representative objective
    response rate (ORR, %) to anti-PD-1 **monotherapy** (pembrolizumab /
    nivolumab) per cancer-type code, alongside the drug, pivotal trial,
    treatment setting, a published source PMID/DOI, and a confidence flag.

    The table is meant to be used as a per-cancer-type plotting axis (e.g. TMB
    vs aPD1 ORR, CTA burden vs aPD1 ORR). The values are representative
    anchors rather than exact reproducible constants: they move with data
    cutoff, line of therapy, and biomarker selection (PD-L1 / MSI / MMR). The
    ``setting`` and ``notes`` columns capture that context.
    """
    table = get_data("cancer-apd1-response")
    return table
33
+
34
+
35
def cancer_apd1_response(cancer_type=None, *, inherit=True):
    """Look up the anti-PD-1 monotherapy ORR (%) for a cancer type.

    With no ``cancer_type`` the full ``{code: orr_pct}`` map is returned.
    Otherwise ``cancer_type`` is resolved through :func:`resolve_cancer_type`.
    When ``inherit`` is true (the default), a code without a curated row of its
    own takes the value of its nearest ancestor, found by following the
    registry ``parent_code`` chain. ``None`` is returned when neither the code
    nor any ancestor has a value. Mirrors :func:`oncoref.cancer_tmb`.
    """
    # Only rows that actually carry an ORR take part in the lookup.
    curated = cancer_apd1_response_df().dropna(subset=["apd1_orr_pct"])
    orr_by_code = dict(
        zip(
            curated["cancer_code"].astype(str),
            curated["apd1_orr_pct"].astype(float),
        )
    )
    if cancer_type is None:
        return orr_by_code

    code = resolve_cancer_type(cancer_type)
    if code in orr_by_code or not inherit:
        return orr_by_code.get(code)

    # Walk up the parent chain; ``visited`` guards against a cyclic registry.
    registry = cancer_type_registry().set_index("code")
    visited = set()
    node = code
    while node and node not in visited:
        visited.add(node)
        if node in orr_by_code:
            return orr_by_code[node]
        if node not in registry.index:
            return None
        parent = registry.loc[node].get("parent_code", "") or ""
        node = str(parent).strip() or None
    return None
oncoref/cancer_genes.py ADDED
@@ -0,0 +1,149 @@
1
+ # Licensed under the Apache License, Version 2.0 (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ """Per-cancer-type gene biology: drivers, key (biomarker/target) genes, role-
14
+ stratified type genes, viral antigens, and a few narrative/rule tables.
15
+
16
+ The curated ontology metadata that hangs off the cancer-type registry. All code
17
+ arguments are alias-resolved via :func:`oncoref.resolve_cancer_type`.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import pandas as pd
23
+
24
+ from .cancer_types import resolve_cancer_type
25
+ from .load_dataset import get_data
26
+
27
+
28
+ def _split(value, sep=";") -> list[str]:
29
+ return [x.strip() for x in str(value).split(sep) if x.strip() and x.strip().lower() != "nan"]
30
+
31
+
32
+ # ---------- driver genes / variants ----------
33
+
34
+
35
def cancer_driver_genes_df() -> pd.DataFrame:
    """Return a defensive copy of the curated cancer driver-gene table.

    Columns include ``Symbol``, ``Cancer``, ``Function``, ``Ensembl_Gene_ID``
    and others.
    """
    table = get_data("cancer-driver-genes")
    return table.copy()
39
+
40
+
41
def cancer_driver_variants_df() -> pd.DataFrame:
    """Return a copy of the curated driver-variant table.

    Columns include ``Symbol``, ``Mutation``, ``Ensembl_Gene_ID`` and others.
    """
    table = get_data("cancer-driver-variants")
    return table.copy()
44
+
45
+
46
+ # ---------- key genes: biomarkers + therapy targets ----------
47
+
48
+
49
def cancer_key_genes_df() -> pd.DataFrame:
    """Return a defensive copy of the per-type key-gene table.

    Each row has a ``role`` of either ``biomarker`` or ``target``, with
    agent / phase / indication context.
    """
    table = get_data("cancer-key-genes")
    return table.copy()
53
+
54
+
55
def _key_genes_for(cancer_type, *, subtype=None) -> pd.DataFrame:
    """Select the key-gene rows for one cancer type, optionally one subtype.

    ``cancer_type`` is alias-resolved with :func:`resolve_cancer_type` and
    compared against the stringified ``cancer_code`` column. When ``subtype``
    is given, rows are further restricted to those whose stringified
    ``subtype`` equals ``str(subtype)``.
    """
    table = cancer_key_genes_df()
    row_codes = table["cancer_code"].astype(str)
    selected = table[row_codes == resolve_cancer_type(cancer_type)]
    if subtype is None:
        return selected
    row_subtypes = selected["subtype"].astype(str)
    return selected[row_subtypes == str(subtype)]
61
+
62
+
63
def cancer_biomarker_genes(cancer_type, *, subtype=None) -> list[str]:
    """List the biomarker gene symbols curated for a cancer type.

    Symbols keep their table order and each appears once.
    """
    rows = _key_genes_for(cancer_type, subtype=subtype)
    is_biomarker = rows["role"].astype(str) == "biomarker"
    symbols = rows[is_biomarker]["symbol"].astype(str)
    # dict keys preserve first-seen order, giving an ordered de-duplication.
    unique_in_order = dict.fromkeys(symbols)
    return list(unique_in_order)
68
+
69
+
70
def cancer_therapy_targets(cancer_type, *, subtype=None) -> pd.DataFrame:
    """Return the therapy-target rows (agent / phase / indication) for a
    cancer type, re-indexed from zero."""
    rows = _key_genes_for(cancer_type, subtype=subtype)
    is_target = rows["role"].astype(str) == "target"
    targets = rows[is_target]
    return targets.reset_index(drop=True)
74
+
75
+
76
+ # ---------- role-stratified type genes ----------
77
+
78
+
79
def cancer_type_genes_df() -> pd.DataFrame:
    """Return a defensive copy of the role-stratified per-type gene table.

    Columns: ``Symbol``, ``Ensembl_Gene_ID``, ``Cancer_Type``, ``Role``.
    """
    table = get_data("cancer-type-genes")
    return table.copy()
83
+
84
+
85
def cancer_type_gene_sets(cancer_type) -> dict[str, dict[str, str]]:
    """Group one cancer type's curated genes by role.

    Returns ``{role: {ensembl_id: symbol}}``; the mapping is empty when nothing
    is curated for the (alias-resolved) cancer type.
    """
    code = resolve_cancer_type(cancer_type)
    table = cancer_type_genes_df()
    rows = table[table["Cancer_Type"].astype(str) == code]

    by_role: dict[str, dict[str, str]] = {}
    for _, rec in rows.iterrows():
        symbol = str(rec["Symbol"])
        bucket = by_role.setdefault(str(rec["Role"]), {})
        # A repeated Ensembl ID within a role keeps the last symbol seen.
        bucket[str(rec["Ensembl_Gene_ID"])] = symbol
    return by_role
94
+
95
+
96
+ # ---------- viral antigens ----------
97
+
98
+
99
def cancer_viral_antigens_df() -> pd.DataFrame:
    """Return a defensive copy of the per-oncovirus targetable-antigen table.

    Columns include ``virus``, ``targetable_antigens``, ``associated_cohorts``
    and others.
    """
    table = get_data("cancer-viral-antigens")
    return table.copy()
103
+
104
+
105
def cancer_viral_antigens(virus: str | None = None):
    """Return targetable viral antigens, for one virus or for all of them.

    When ``virus`` is given it is matched case-insensitively and the antigen
    list of the first matching row is returned (``[]`` if the virus is
    unknown). Without ``virus`` the result is a ``{virus: [antigen, …]}`` map
    over the whole table.
    """
    table = cancer_viral_antigens_df()
    if virus is None:
        return {str(rec.virus): _split(rec.targetable_antigens) for rec in table.itertuples()}

    known = table["virus"].astype(str).str.lower()
    wanted = str(virus).strip().lower()
    matches = table[known == wanted]
    if matches.empty:
        return []
    return _split(matches.iloc[0]["targetable_antigens"])
113
+
114
+
115
def viral_antigens_for_cancer(cancer_type) -> list[tuple[str, list[str]]]:
    """Reverse lookup over ``associated_cohorts``: which viruses, and which of
    their antigens, apply to a cancer type.

    Returns ``[(virus, [antigen, …]), …]`` in table order. The list is empty
    for an entity that is not virally driven.
    """
    code = resolve_cancer_type(cancer_type)
    return [
        (str(rec.virus), _split(rec.targetable_antigens))
        for rec in cancer_viral_antigens_df().itertuples()
        if code in _split(rec.associated_cohorts)
    ]
124
+
125
+
126
+ # ---------- narrative / rule tables ----------
127
+
128
+
129
def narrative_gene_sets_df() -> pd.DataFrame:
    """Return a copy of the named narrative gene-set table.

    Columns: ``set_name``, ``members``, ``notes``.
    """
    table = get_data("narrative-gene-sets")
    return table.copy()
132
+
133
+
134
def narrative_gene_set(set_name: str) -> list[str]:
    """Return the member gene symbols of a named narrative set.

    The name is matched exactly against the stringified ``set_name`` column;
    an unknown name yields ``[]``.
    """
    table = narrative_gene_sets_df()
    matches = table[table["set_name"].astype(str) == str(set_name)]
    if matches.empty:
        return []
    return _split(matches.iloc[0]["members"])
139
+
140
+
141
def disease_state_rules_df() -> pd.DataFrame:
    """Return a defensive copy of the declarative disease-state rule table.

    Columns: ``rule_id``, ``cancer_code``, ``claims``, ``conditions``,
    ``narrative``.
    """
    table = get_data("disease-state-rules")
    return table.copy()
145
+
146
+
147
def degenerate_subtype_pairs_df() -> pd.DataFrame:
    """Return a defensive copy of the expression-degenerate subtype pairs and
    their tiebreaker rules."""
    table = get_data("degenerate-subtype-pairs")
    return table.copy()