TSUMUGI 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- TSUMUGI/annotator.py +103 -0
- TSUMUGI/argparser.py +599 -0
- TSUMUGI/core.py +185 -0
- TSUMUGI/data/impc_phenodigm.csv +3406 -0
- TSUMUGI/data/mp.obo +143993 -0
- TSUMUGI/filterer.py +36 -0
- TSUMUGI/formatter.py +122 -0
- TSUMUGI/genewise_annotation_builder.py +94 -0
- TSUMUGI/io_handler.py +189 -0
- TSUMUGI/main.py +300 -0
- TSUMUGI/network_constructor.py +603 -0
- TSUMUGI/ontology_handler.py +62 -0
- TSUMUGI/pairwise_similarity_builder.py +66 -0
- TSUMUGI/report_generator.py +122 -0
- TSUMUGI/similarity_calculator.py +498 -0
- TSUMUGI/subcommands/count_filterer.py +47 -0
- TSUMUGI/subcommands/genes_filterer.py +89 -0
- TSUMUGI/subcommands/graphml_builder.py +158 -0
- TSUMUGI/subcommands/life_stage_filterer.py +48 -0
- TSUMUGI/subcommands/mp_filterer.py +142 -0
- TSUMUGI/subcommands/score_filterer.py +22 -0
- TSUMUGI/subcommands/sex_filterer.py +48 -0
- TSUMUGI/subcommands/webapp_builder.py +358 -0
- TSUMUGI/subcommands/zygosity_filterer.py +48 -0
- TSUMUGI/validator.py +65 -0
- TSUMUGI/web/app/css/app.css +1129 -0
- TSUMUGI/web/app/genelist/network_genelist.html +339 -0
- TSUMUGI/web/app/genelist/network_genelist.js +421 -0
- TSUMUGI/web/app/js/data/dataLoader.js +41 -0
- TSUMUGI/web/app/js/export/graphExporter.js +214 -0
- TSUMUGI/web/app/js/graph/centrality.js +495 -0
- TSUMUGI/web/app/js/graph/components.js +30 -0
- TSUMUGI/web/app/js/graph/filters.js +158 -0
- TSUMUGI/web/app/js/graph/highlighter.js +52 -0
- TSUMUGI/web/app/js/graph/layoutController.js +454 -0
- TSUMUGI/web/app/js/graph/valueScaler.js +43 -0
- TSUMUGI/web/app/js/search/geneSearcher.js +93 -0
- TSUMUGI/web/app/js/search/phenotypeSearcher.js +292 -0
- TSUMUGI/web/app/js/ui/dynamicFontSize.js +30 -0
- TSUMUGI/web/app/js/ui/mobilePanel.js +77 -0
- TSUMUGI/web/app/js/ui/slider.js +22 -0
- TSUMUGI/web/app/js/ui/tooltips.js +514 -0
- TSUMUGI/web/app/js/viewer/pageSetup.js +217 -0
- TSUMUGI/web/app/viewer.html +515 -0
- TSUMUGI/web/app/viewer.js +1593 -0
- TSUMUGI/web/css/sanitize.css +363 -0
- TSUMUGI/web/css/top.css +391 -0
- TSUMUGI/web/image/tsumugi-favicon.ico +0 -0
- TSUMUGI/web/image/tsumugi-icon.png +0 -0
- TSUMUGI/web/image/tsumugi-logo.png +0 -0
- TSUMUGI/web/image/tsumugi-logo.svg +69 -0
- TSUMUGI/web/js/genelist_formatter.js +123 -0
- TSUMUGI/web/js/top.js +338 -0
- TSUMUGI/web/open_webapp_linux.sh +25 -0
- TSUMUGI/web/open_webapp_mac.command +25 -0
- TSUMUGI/web/open_webapp_windows.bat +37 -0
- TSUMUGI/web/serve_index.py +110 -0
- TSUMUGI/web/template/template_index.html +197 -0
- TSUMUGI/web_deployer.py +150 -0
- tsumugi-1.0.1.dist-info/METADATA +504 -0
- tsumugi-1.0.1.dist-info/RECORD +64 -0
- tsumugi-1.0.1.dist-info/WHEEL +4 -0
- tsumugi-1.0.1.dist-info/entry_points.txt +3 -0
- tsumugi-1.0.1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,603 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import gzip
|
|
4
|
+
import json
|
|
5
|
+
import math
|
|
6
|
+
import random
|
|
7
|
+
from collections import defaultdict
|
|
8
|
+
from itertools import combinations
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
from tqdm import tqdm
|
|
12
|
+
|
|
13
|
+
random.seed(0)  # fixed seed so repeated builds are deterministic


# Maps raw zygosity values (long and abbreviated forms) to display labels.
# NOTE(review): not referenced within this module — presumably imported by
# other TSUMUGI modules; confirm before removing.
ZYGOSITY_MAP = {
    "homozygote": "Homo",
    "heterozygote": "Hetero",
    "hemizygote": "Hemi",
    "hom": "Homo",
    "het": "Hetero",
    "hem": "Hemi",
}


# Hard cap on the number of genes included in one generated network.
MAX_GENE_COUNT = 150
# Acceptance window for the binary search over similarity-score thresholds
# in _find_optimal_scores(): a threshold is accepted when it yields between
# GENE_COUNT_LOWER_BOUND and GENE_COUNT_UPPER_BOUND connected genes.
GENE_COUNT_LOWER_BOUND = 100
GENE_COUNT_UPPER_BOUND = 150
|
|
29
|
+
|
|
30
|
+
###############################################################################
|
|
31
|
+
# Compose datasets
|
|
32
|
+
###############################################################################
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _create_annotation_string(*parts: str) -> str:
|
|
36
|
+
"""Join non-empty parts with commas."""
|
|
37
|
+
return ", ".join(part for part in parts if part)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# ----------------------------------------------------------
|
|
41
|
+
# Compose genewise_phenotype_significants
|
|
42
|
+
# ----------------------------------------------------------
|
|
43
|
+
def _compose_genewise_phenotype_significants(
|
|
44
|
+
genewise_phenotype_significants: list[dict[str, str | float]],
|
|
45
|
+
) -> dict[str, list[dict[str, str | float]]]:
|
|
46
|
+
"""Compose genewise_phenotype_significants into gene_records_map for Nodes."""
|
|
47
|
+
|
|
48
|
+
gene_records_map = defaultdict(list)
|
|
49
|
+
for record in genewise_phenotype_significants:
|
|
50
|
+
zygosity = record["zygosity"]
|
|
51
|
+
life_stage = record.get("life_stage", "")
|
|
52
|
+
sexual_dimorphism = record.get("sexual_dimorphism", "")
|
|
53
|
+
sexual_dimorphism = "" if sexual_dimorphism == "None" else sexual_dimorphism
|
|
54
|
+
|
|
55
|
+
annotation_str = _create_annotation_string(zygosity, life_stage, sexual_dimorphism)
|
|
56
|
+
phenotype_composed = f"{record['mp_term_name']} ({annotation_str})"
|
|
57
|
+
|
|
58
|
+
effect_size = record["effect_size"]
|
|
59
|
+
|
|
60
|
+
gene_records_map[record["marker_symbol"]].append(
|
|
61
|
+
{
|
|
62
|
+
"mp_term_name": record["mp_term_name"],
|
|
63
|
+
"effect_size": effect_size,
|
|
64
|
+
"phenotype": phenotype_composed,
|
|
65
|
+
}
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
return dict(gene_records_map)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
# ----------------------------------------------------------
|
|
72
|
+
# Compose biological annotations
|
|
73
|
+
# ----------------------------------------------------------
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _compose_pairwise_similarity_annotations(
|
|
77
|
+
pairwise_similarity_annotations: list[dict[str, list[dict[str, str]] | int]],
|
|
78
|
+
) -> dict[tuple[str], dict[str, list[str] | int]]:
|
|
79
|
+
"""Compose pair similarity annotations (Edges) into strings."""
|
|
80
|
+
pairwise_similarity_annotations_composed = {}
|
|
81
|
+
for record in pairwise_similarity_annotations:
|
|
82
|
+
pair_annotations_composed = set()
|
|
83
|
+
for annotation in record["phenotype_shared_annotations"]:
|
|
84
|
+
mp_term_name = annotation["phenotype"]
|
|
85
|
+
zygosity = annotation["zygosity"]
|
|
86
|
+
life_stage = annotation.get("life_stage", "")
|
|
87
|
+
sexual_dimorphism = annotation.get("sexual_dimorphism", "")
|
|
88
|
+
sexual_dimorphism = "" if sexual_dimorphism == "None" else sexual_dimorphism
|
|
89
|
+
|
|
90
|
+
annotation_str = _create_annotation_string(zygosity, life_stage, sexual_dimorphism)
|
|
91
|
+
pair_annotations_composed.add(f"{mp_term_name} ({annotation_str})")
|
|
92
|
+
|
|
93
|
+
gene_pair = (record["gene1_symbol"], record["gene2_symbol"])
|
|
94
|
+
|
|
95
|
+
pairwise_similarity_annotations_composed[gene_pair] = {
|
|
96
|
+
"phenotype_shared_annotations": sorted(pair_annotations_composed),
|
|
97
|
+
"phenotype_similarity_score": record["phenotype_similarity_score"],
|
|
98
|
+
}
|
|
99
|
+
return pairwise_similarity_annotations_composed
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
# ----------------------------------------------------------
|
|
103
|
+
# Compose disease_annotations_by_allele
|
|
104
|
+
# ----------------------------------------------------------
|
|
105
|
+
def _compose_disease_annotations_by_allele(
|
|
106
|
+
disease_annotations_by_allele: dict[str, list[dict[str, str]]],
|
|
107
|
+
) -> dict[str, set[str]]:
|
|
108
|
+
disease_annotations_composed = defaultdict(set)
|
|
109
|
+
for marker_symbol, records in disease_annotations_by_allele.items():
|
|
110
|
+
for record in records:
|
|
111
|
+
disorder_name = record["disorder_name"]
|
|
112
|
+
zygosity = record["zygosity"]
|
|
113
|
+
life_stage = record["life_stage"]
|
|
114
|
+
|
|
115
|
+
annotation = []
|
|
116
|
+
annotation.append(zygosity)
|
|
117
|
+
annotation.append(life_stage)
|
|
118
|
+
annotation = ", ".join(annotation)
|
|
119
|
+
|
|
120
|
+
disease_annotations_composed[marker_symbol].add(f"{disorder_name} ({annotation})")
|
|
121
|
+
|
|
122
|
+
return dict(disease_annotations_composed)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _compose_dataset(genewise_phenotype_significants, pairwise_similarity_annotations, disease_annotations_by_allele):
    """Compose the three raw datasets into node, edge and disease lookup maps."""
    return (
        _compose_genewise_phenotype_significants(genewise_phenotype_significants),
        _compose_pairwise_similarity_annotations(pairwise_similarity_annotations),
        _compose_disease_annotations_by_allele(disease_annotations_by_allele),
    )
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
###############################################################################
|
|
135
|
+
# Build network JSON
|
|
136
|
+
###############################################################################
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _scale_to_1_100(x: int, min_val: int, max_val: int) -> int:
|
|
140
|
+
if max_val == min_val:
|
|
141
|
+
return 100
|
|
142
|
+
if x <= min_val:
|
|
143
|
+
return 1
|
|
144
|
+
if x >= max_val:
|
|
145
|
+
return 100
|
|
146
|
+
|
|
147
|
+
scale = 99 / (max_val - min_val)
|
|
148
|
+
shifted = x - min_val
|
|
149
|
+
scaled_score = 1 + shifted * scale
|
|
150
|
+
|
|
151
|
+
return int(scaled_score)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _scale_phenotype_similarity_scores(pairwise_similarity_annotations_filtered, target_gene: str | None = None):
    """Return a copy of the pair annotations with similarity scores scaled to 1-100.

    When ``target_gene`` is given, the min/max used for scaling are taken only
    from pairs that contain it, so the target gene's edges span the full range.
    Every pair in the input is still scaled and returned.

    Robustness fix: previously ``min([])``/``max([])`` raised ``ValueError``
    when ``target_gene`` matched no pair, or when the input dict was empty.
    Now an unmatched target falls back to all scores, and an empty input
    returns an empty dict.
    """
    if target_gene:
        scores = [
            annotation["phenotype_similarity_score"]
            for pair, annotation in pairwise_similarity_annotations_filtered.items()
            if target_gene in pair
        ]
    else:
        scores = [a["phenotype_similarity_score"] for a in pairwise_similarity_annotations_filtered.values()]

    if not scores:
        # target_gene appears in no pair: fall back to scaling over all scores.
        scores = [a["phenotype_similarity_score"] for a in pairwise_similarity_annotations_filtered.values()]
    if not scores:
        return {}

    min_val = min(scores)
    max_val = max(scores)

    scaled_annotations = {}

    for pair, annotation in pairwise_similarity_annotations_filtered.items():
        scaled_annotation = annotation.copy()
        scaled_annotation["phenotype_similarity_score"] = _scale_to_1_100(
            annotation["phenotype_similarity_score"],
            min_val,
            max_val,
        )
        scaled_annotations[pair] = scaled_annotation

    return scaled_annotations
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _scale_effect_sizes(gene_records_map_filtered, mp_term_name):
    """Scale effect sizes of ``mp_term_name`` records in place to the 1-100 range.

    If every matching effect size equals 1 (binary phenotype), each is mapped
    straight to 100; otherwise values are log1p-transformed and min-max scaled
    via _scale_to_1_100. Returns the (mutated) input mapping.
    """
    matching = [
        record
        for records in gene_records_map_filtered.values()
        for record in records
        if record["mp_term_name"] == mp_term_name
    ]

    # Binary effect sizes (0 or 1): map 1 directly to the maximum value.
    if all(record["effect_size"] == 1 for record in matching):
        for record in matching:
            record["effect_size"] = 100
        return gene_records_map_filtered

    log_values = [math.log1p(record["effect_size"]) for record in matching]
    lo = min(log_values)
    hi = max(log_values)
    for record in matching:
        record["effect_size"] = _scale_to_1_100(math.log1p(record["effect_size"]), lo, hi)

    return gene_records_map_filtered
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def _find_optimal_scores(
    sorted_scores,
    related_genes,
    pairwise_similarity_annotations_composed,
    low_threshold=GENE_COUNT_LOWER_BOUND,
    high_threshold=GENE_COUNT_UPPER_BOUND,
):
    """Binary-search ``sorted_scores`` for a similarity threshold that keeps
    between ``low_threshold`` and ``high_threshold`` genes connected.

    Returns the accepted score, or -1 when no threshold lands in the window.

    Perf fix: the annotated-pair lookup over all gene combinations used to be
    recomputed on every binary-search iteration; it is now built once up
    front, and each probe only re-counts genes against the candidate
    threshold.
    """
    # Precompute (gene1, gene2, score) for every annotated pair once.
    scored_pairs = []
    for gene1, gene2 in combinations(sorted(related_genes), 2):
        gene_pair = tuple(sorted([gene1, gene2]))
        pair_annotations = pairwise_similarity_annotations_composed.get(gene_pair)
        if pair_annotations is None:
            continue
        scored_pairs.append((gene1, gene2, pair_annotations["phenotype_similarity_score"]))

    low = 0
    high = len(sorted_scores) - 1
    while low <= high:
        mid = (low + high) // 2
        threshold = sorted_scores[mid]

        # Count genes that keep at least one edge at this threshold.
        count_genes = set()
        for gene1, gene2, score in scored_pairs:
            if score >= threshold:
                count_genes.add(gene1)
                count_genes.add(gene2)

        n = len(count_genes)

        if low_threshold <= n <= high_threshold:
            return threshold
        elif n < low_threshold:
            # Too many genes survive a lower cut elsewhere? No: too few genes
            # connected — relaxing means trying a HIGHER threshold index keeps
            # the original search direction.
            low = mid + 1
        else:
            high = mid - 1
    return -1
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def _filter_related_genes(
    records: list[dict[str, str | float]],
    related_genes: set[str],
    pairwise_similarity_annotations_composed: dict[tuple[str], dict[str, list[str] | int]],
    is_gene_network: bool = False,
) -> set[str]:
    """
    Reduce ``related_genes`` to roughly MAX_GENE_COUNT genes.

    Strategy:
    1) If possible, select by a threshold on phenotype similarity score found via _find_optimal_scores().
    2) Otherwise (phenotype networks only, ``is_gene_network=False``), take the
       top MAX_GENE_COUNT genes by maximum effect size (desc) — but only when
       those top entries are not all tied on the same value.
    3) Final fallback: take the top MAX_GENE_COUNT genes by maximum number of
       shared phenotypes (desc).
    Notes:
    - NaN effect sizes are treated as 0.
    - Ties in the sorted rankings keep dict insertion order (no further tie-break).
    - For speed, pair stats are computed in a single pass over unique gene pairs.
    """

    # --- Compute maximum values per gene ---
    phenotype_similarity_scores = []
    gene_max_score = defaultdict(float)
    gene_max_shared_phenotype = defaultdict(int)

    for gene1, gene2 in combinations(sorted(related_genes), 2):
        gene_pair = tuple(sorted([gene1, gene2]))
        if gene_pair not in pairwise_similarity_annotations_composed:
            continue

        pair_annotations = pairwise_similarity_annotations_composed[gene_pair]
        score = pair_annotations["phenotype_similarity_score"]
        num_shared_phenotypes = len(pair_annotations["phenotype_shared_annotations"])

        phenotype_similarity_scores.append(score)

        # Update maximum similarity score for each gene
        gene_max_score[gene1] = max(gene_max_score[gene1], score)
        gene_max_score[gene2] = max(gene_max_score[gene2], score)

        # Update maximum number of shared phenotypes for each gene
        gene_max_shared_phenotype[gene1] = max(gene_max_shared_phenotype[gene1], num_shared_phenotypes)
        gene_max_shared_phenotype[gene2] = max(gene_max_shared_phenotype[gene2], num_shared_phenotypes)

    # 1. Filter genes by phenotype similarity score
    unique_phenotype_similarity_scores = sorted(set(phenotype_similarity_scores))

    optimal_score = _find_optimal_scores(
        unique_phenotype_similarity_scores,
        related_genes,
        pairwise_similarity_annotations_composed,
        low_threshold=GENE_COUNT_LOWER_BOUND,
        high_threshold=GENE_COUNT_UPPER_BOUND,
    )
    # _find_optimal_scores returns -1 when no threshold yields an acceptable
    # gene count; any real score is > -1.
    if optimal_score > -1:
        return {gene for gene, max_score in gene_max_score.items() if max_score >= optimal_score}

    if is_gene_network is False:
        # For gene networks, effect size is only 0 or 1, so skip effect size filtering

        # Compute maximum effect size per gene (NaN counts as 0.0)
        gene_max_effect_sizes = defaultdict(float)
        for record in records:
            gene = record["marker_symbol"]
            if gene in related_genes:
                effect_size = record["effect_size"] if not math.isnan(record["effect_size"]) else 0.0
                gene_max_effect_sizes[gene] = max(gene_max_effect_sizes[gene], effect_size)

        # 2. Filter genes by effect size
        filtered_effect_sizes = {g: s for g, s in gene_max_effect_sizes.items() if g in related_genes}
        gene_max_effect_sizes_sorted = sorted(filtered_effect_sizes.items(), key=lambda x: x[1], reverse=True)

        # If the top MAX_GENE_COUNT entries have different effect sizes, return them
        # (all-tied effect sizes carry no ranking information, so fall through).
        if len({score for _, score in gene_max_effect_sizes_sorted[:MAX_GENE_COUNT]}) > 1:
            return {gene for gene, _ in gene_max_effect_sizes_sorted[:MAX_GENE_COUNT]}

    # 3. Filter genes by number of shared phenotypes
    filtered_shared_phenotypes = {g: s for g, s in gene_max_shared_phenotype.items() if g in related_genes}
    gene_max_shared_phenotype_sorted = sorted(filtered_shared_phenotypes.items(), key=lambda x: x[1], reverse=True)
    return {gene for gene, _ in gene_max_shared_phenotype_sorted[:MAX_GENE_COUNT]}
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
###############################################################################
|
|
323
|
+
# build_phenotype_network_json
|
|
324
|
+
###############################################################################
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
def _convert_to_nodes_json(
    related_genes: set[str],
    mp_term_name: str,
    gene_records_map: dict[str, list[dict[str, str | float]]],
    disease_annotations_composed: dict[str, set[str]],
    hide_severity: bool = False,
) -> list[dict[str, dict[str, str | list[str] | int]]]:
    """Build Cytoscape-style node dicts for the genes of one phenotype network."""
    subset = {gene: gene_records_map[gene] for gene in related_genes}

    # Effect sizes for this phenotype become node colors, scaled to 1-100.
    subset = _scale_effect_sizes(subset, mp_term_name)

    nodes_json = []
    for gene, records in subset.items():
        disease_labels: set[str] = disease_annotations_composed.get(gene, set())

        # Color by the (scaled) effect size of this phenotype; 1 when absent.
        color = 1
        for record in records:
            if record["mp_term_name"] == mp_term_name:
                color = record["effect_size"]
                break

        data = {
            "id": gene,
            "label": gene,
            "phenotype": sorted(record["phenotype"] for record in records),
            "disease": sorted(disease_labels) if disease_labels else "",
            "node_color": color,
        }
        if hide_severity:
            data["hide_severity"] = True
        nodes_json.append({"data": data})

    return nodes_json
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
def _convert_to_edges_json(
    related_genes: set[str],
    pairwise_similarity_annotations_composed: dict[tuple[str], dict[str, list[str] | int]],
) -> list[dict[str, dict[str, str | list[str] | float]]]:
    """Build Cytoscape-style edge dicts for every annotated pair of related genes."""
    selected = {}
    for left, right in combinations(sorted(related_genes), 2):
        key = tuple(sorted((left, right)))
        annotation = pairwise_similarity_annotations_composed.get(key)
        if annotation is not None:
            selected[key] = annotation

    if not selected:
        return []

    # Similarity scores become edge widths, scaled to 1-100.
    selected = _scale_phenotype_similarity_scores(selected, target_gene=None)

    edges_json = []
    for key, annotation in selected.items():
        source, target = sorted(key)
        edges_json.append(
            {
                "data": {
                    "source": source,
                    "target": target,
                    "phenotype": sorted(annotation["phenotype_shared_annotations"]),
                    "edge_size": annotation["phenotype_similarity_score"],
                }
            }
        )
    return edges_json
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
def build_phenotype_network_json(
    genewise_phenotype_significants: list[dict[str, str | float]],
    pairwise_similarity_annotations: dict[tuple[str], dict[str, dict[str, dict[str, str] | int]]],
    disease_annotations_by_gene: dict[str, dict[str, str]],
    output_dir,
    binary_phenotypes: set[str] | None = None,
    hide_severity: bool = False,
) -> None:
    """Write one gzipped Cytoscape-JSON network per phenotype into ``output_dir``.

    For each MP term with at least two connected genes, builds node/edge
    lists (trimmed to at most MAX_GENE_COUNT genes), and dumps them to
    ``<mp_term_name with spaces and "/" as "_">.json.gz``.

    Severity coloring is hidden when ``hide_severity`` is set or when the term
    is listed in ``binary_phenotypes``.
    """
    gene_records_map, pairwise_similarity_annotations_composed, disease_annotations_composed = _compose_dataset(
        genewise_phenotype_significants, pairwise_similarity_annotations, disease_annotations_by_gene
    )

    # Group significant records by MP term.
    phenotype_records_map: dict[str, list[dict[str, str | float]]] = defaultdict(list)
    for record in genewise_phenotype_significants:
        phenotype_records_map[record["mp_term_name"]].append(record)
    phenotype_records_map = dict(phenotype_records_map)

    # All genes that appear in at least one annotated pair.
    gene_lists = set()
    for pair in pairwise_similarity_annotations_composed.keys():
        for gene in pair:
            gene_lists.add(gene)

    for mp_term_name in tqdm(phenotype_records_map.keys(), total=len(phenotype_records_map)):
        records = phenotype_records_map[mp_term_name]
        # Only genes with at least one pairwise annotation can form edges.
        related_genes = {r["marker_symbol"] for r in records if r["marker_symbol"] in gene_lists}

        if len(related_genes) < 2:
            continue

        if len(related_genes) > MAX_GENE_COUNT:
            related_genes = _filter_related_genes(records, related_genes, pairwise_similarity_annotations_composed)

        is_binary = False
        if binary_phenotypes:
            is_binary = mp_term_name in binary_phenotypes

        edges_json = _convert_to_edges_json(related_genes, pairwise_similarity_annotations_composed)

        if not edges_json:
            continue

        # Remove unconnected nodes
        connected_node_ids = set()
        for edge in edges_json:
            connected_node_ids.add(edge["data"]["source"])
            connected_node_ids.add(edge["data"]["target"])

        if not connected_node_ids:
            continue

        nodes_json = _convert_to_nodes_json(
            connected_node_ids,
            mp_term_name,
            gene_records_map,
            disease_annotations_composed,
            hide_severity=hide_severity or is_binary,
        )

        # Sort nodes for stability
        nodes_json = sorted(nodes_json, key=lambda n: n["data"]["id"])
        edges_json = sorted(edges_json, key=lambda e: (e["data"]["source"], e["data"]["target"]))

        network_json = nodes_json + edges_json

        # Sanitize the term name for use as a file name.
        mp_term_name_underscore = mp_term_name.replace(" ", "_").replace("/", "_")
        output_json = Path(output_dir / f"{mp_term_name_underscore}.json.gz")
        with gzip.open(output_json, "wt", encoding="utf-8") as f:
            json.dump(network_json, f, indent=4)
|
|
464
|
+
|
|
465
|
+
|
|
466
|
+
###############################################################################
|
|
467
|
+
# build_gene_network_json
|
|
468
|
+
###############################################################################
|
|
469
|
+
|
|
470
|
+
|
|
471
|
+
def _build_node_info(
|
|
472
|
+
gene: str,
|
|
473
|
+
gene_records_map: dict[str, list[dict[str, str | float]]],
|
|
474
|
+
disease_annotations_composed: dict[str, set[str]],
|
|
475
|
+
target_gene: str,
|
|
476
|
+
hide_severity: bool = False,
|
|
477
|
+
) -> dict[str, dict[str, str | list[str] | float]]:
|
|
478
|
+
phenotypes: list[str] = [r["phenotype"] for r in gene_records_map.get(gene, [])]
|
|
479
|
+
diseases: set[str] = disease_annotations_composed.get(gene, set())
|
|
480
|
+
node_color: int = 100 if target_gene == gene else 1
|
|
481
|
+
|
|
482
|
+
node = {
|
|
483
|
+
"data": {
|
|
484
|
+
"id": gene,
|
|
485
|
+
"label": gene,
|
|
486
|
+
"phenotype": sorted(phenotypes),
|
|
487
|
+
"disease": sorted(diseases) if diseases else "",
|
|
488
|
+
"node_color": node_color,
|
|
489
|
+
}
|
|
490
|
+
}
|
|
491
|
+
if hide_severity:
|
|
492
|
+
node["data"]["hide_severity"] = True
|
|
493
|
+
return node
|
|
494
|
+
|
|
495
|
+
|
|
496
|
+
def build_gene_network_json(
    genewise_phenotype_significants: list[dict[str, str | float]],
    pairwise_similarity_annotations: dict[tuple[str], dict[str, dict[str, str] | int]],
    disease_annotations_by_gene: dict[str, dict[str, str]],
    output_dir,
    hide_severity: bool = True,
) -> None:
    """Write one gzipped Cytoscape-JSON network per gene into ``output_dir``.

    For each gene that appears in a pairwise annotation, builds the network of
    genes sharing phenotypes with it (trimmed to about MAX_GENE_COUNT) and
    dumps it to ``<gene>.json.gz``.

    Bug fix: the "no filtered pairs" guard used ``return []``, which aborted
    the whole build loop (and returned a list from a ``-> None`` function) as
    soon as one gene had nothing to draw; it now skips just that gene.
    """
    gene_records_map, pairwise_similarity_annotations_composed, disease_annotations_composed = _compose_dataset(
        genewise_phenotype_significants, pairwise_similarity_annotations, disease_annotations_by_gene
    )

    # All genes that appear in at least one annotated pair.
    gene_sets = set()
    for pair in pairwise_similarity_annotations_composed.keys():
        for gene in pair:
            gene_sets.add(gene)

    for target_gene in tqdm(gene_sets, total=len(gene_sets)):
        # Genes directly paired with the target.
        related_pairs_with_target_gene = []
        for pair in pairwise_similarity_annotations_composed.keys():
            if target_gene not in pair:
                continue
            related_pairs_with_target_gene.append(pair)

        related_genes = set()
        for genes in related_pairs_with_target_gene:
            gene1, gene2 = genes
            related_genes.add(gene1)
            related_genes.add(gene2)

        # Skip if less than 2 related genes
        if len(related_genes) < 2:
            continue

        # All annotated pairs among the related genes (not just those touching
        # the target), so neighbor-neighbor edges are drawn too.
        related_pairs = []
        for gene1, gene2 in combinations(related_genes, 2):
            gene_pair = tuple(sorted([gene1, gene2]))
            if gene_pair not in pairwise_similarity_annotations_composed:
                continue
            related_pairs.append(gene_pair)

        # Filter genes if more than MAX_GENE_COUNT (always keep the target)
        if len(related_genes) > MAX_GENE_COUNT:
            related_genes_filtered = _filter_related_genes(
                genewise_phenotype_significants, related_genes, pairwise_similarity_annotations_composed
            )
            related_genes_filtered.add(target_gene)
            related_pairs = [pairs for pairs in related_pairs if all(gene in related_genes_filtered for gene in pairs)]

        # ---------------------------------------
        # Nodes
        # ---------------------------------------
        nodes_json = []
        visited_genes = set()
        for pair in related_pairs:
            gene1, gene2 = pair
            if gene1 not in visited_genes:
                visited_genes.add(gene1)
                node_json = _build_node_info(
                    gene1, gene_records_map, disease_annotations_composed, target_gene, hide_severity=hide_severity
                )
                nodes_json.append(node_json)
            if gene2 not in visited_genes:
                visited_genes.add(gene2)
                node_json = _build_node_info(
                    gene2, gene_records_map, disease_annotations_composed, target_gene, hide_severity=hide_severity
                )
                nodes_json.append(node_json)

        # ---------------------------------------
        # Edges
        # ---------------------------------------
        pairwise_similarity_annotations_filtered = {
            pair: pairwise_similarity_annotations_composed[pair] for pair in related_pairs
        }

        if not pairwise_similarity_annotations_filtered:
            # FIX: was `return []`, which stopped processing all remaining genes.
            continue

        # Scale phenotype similarity scores to 1-100
        pairwise_similarity_annotations_scaled = _scale_phenotype_similarity_scores(
            pairwise_similarity_annotations_filtered, target_gene
        )

        edges_json = []
        for pair in related_pairs:
            gene1, gene2 = sorted(pair)
            phenotypes = pairwise_similarity_annotations_scaled[pair]["phenotype_shared_annotations"]
            phenodigm_score = pairwise_similarity_annotations_scaled[pair]["phenotype_similarity_score"]
            edges_json.append(
                {
                    "data": {
                        "source": gene1,
                        "target": gene2,
                        "phenotype": sorted(phenotypes),
                        "edge_size": phenodigm_score,
                    }
                }
            )

        # Sort nodes for stability
        nodes_json = sorted(nodes_json, key=lambda n: n["data"]["id"])
        edges_json = sorted(edges_json, key=lambda e: (e["data"]["source"], e["data"]["target"]))

        network_json = nodes_json + edges_json

        output_json = Path(output_dir / f"{target_gene}.json.gz")
        with gzip.open(output_json, "wt", encoding="utf-8") as f:
            json.dump(network_json, f, indent=4)
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def build_term_hierarchy(
    ontology_terms: dict[str, dict],
) -> tuple[dict[str, set[str]], dict[str, set[str]]]:
    """Build parent-child hierarchy relationships from ontology terms."""
    parents: dict[str, set[str]] = defaultdict(set)  # term_id -> parent ids
    children: dict[str, set[str]] = defaultdict(set)  # term_id -> child ids

    for term_id, term_data in ontology_terms.items():
        for parent_id in term_data.get("is_a", ()):
            parents[term_id].add(parent_id)
            children[parent_id].add(term_id)

    return dict(parents), dict(children)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def find_all_ancestor_terms(term_id: str, parent_term_map: dict[str, set[str]]) -> set[str]:
    """Find all (transitive) ancestor terms for a given term.

    Perf fix: traversal used ``list.pop(0)``, which is O(n) per pop and made
    the walk quadratic on deep hierarchies. An explicit stack with O(1)
    ``pop()`` is used instead; the visiting order differs but the returned
    set is identical.
    """
    ancestor_terms: set[str] = set()
    pending = [term_id]

    while pending:
        current_term = pending.pop()
        for parent_term in parent_term_map.get(current_term, ()):
            if parent_term not in ancestor_terms:
                ancestor_terms.add(parent_term)
                pending.append(parent_term)

    return ancestor_terms
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def find_all_descendant_terms(term_id: str, child_term_map: dict[str, set[str]]) -> set[str]:
    """Find all (transitive) descendant terms for a given term.

    Perf fix: traversal used ``list.pop(0)``, which is O(n) per pop and made
    the walk quadratic on deep hierarchies. An explicit stack with O(1)
    ``pop()`` is used instead; the visiting order differs but the returned
    set is identical.
    """
    descendant_terms: set[str] = set()
    pending = [term_id]

    while pending:
        current_term = pending.pop()
        for child_term in child_term_map.get(current_term, ()):
            if child_term not in descendant_terms:
                descendant_terms.add(child_term)
                pending.append(child_term)

    return descendant_terms
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def find_common_ancestors(term1_id: str, term2_id: str, parent_term_map: dict[str, set[str]]) -> set[str]:
    """Find common ancestors of two terms; each term counts as its own ancestor."""
    lineage_one = find_all_ancestor_terms(term1_id, parent_term_map) | {term1_id}
    lineage_two = find_all_ancestor_terms(term2_id, parent_term_map) | {term2_id}
    return lineage_one & lineage_two
|