TSUMUGI 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. TSUMUGI/annotator.py +103 -0
  2. TSUMUGI/argparser.py +599 -0
  3. TSUMUGI/core.py +185 -0
  4. TSUMUGI/data/impc_phenodigm.csv +3406 -0
  5. TSUMUGI/data/mp.obo +143993 -0
  6. TSUMUGI/filterer.py +36 -0
  7. TSUMUGI/formatter.py +122 -0
  8. TSUMUGI/genewise_annotation_builder.py +94 -0
  9. TSUMUGI/io_handler.py +189 -0
  10. TSUMUGI/main.py +300 -0
  11. TSUMUGI/network_constructor.py +603 -0
  12. TSUMUGI/ontology_handler.py +62 -0
  13. TSUMUGI/pairwise_similarity_builder.py +66 -0
  14. TSUMUGI/report_generator.py +122 -0
  15. TSUMUGI/similarity_calculator.py +498 -0
  16. TSUMUGI/subcommands/count_filterer.py +47 -0
  17. TSUMUGI/subcommands/genes_filterer.py +89 -0
  18. TSUMUGI/subcommands/graphml_builder.py +158 -0
  19. TSUMUGI/subcommands/life_stage_filterer.py +48 -0
  20. TSUMUGI/subcommands/mp_filterer.py +142 -0
  21. TSUMUGI/subcommands/score_filterer.py +22 -0
  22. TSUMUGI/subcommands/sex_filterer.py +48 -0
  23. TSUMUGI/subcommands/webapp_builder.py +358 -0
  24. TSUMUGI/subcommands/zygosity_filterer.py +48 -0
  25. TSUMUGI/validator.py +65 -0
  26. TSUMUGI/web/app/css/app.css +1129 -0
  27. TSUMUGI/web/app/genelist/network_genelist.html +339 -0
  28. TSUMUGI/web/app/genelist/network_genelist.js +421 -0
  29. TSUMUGI/web/app/js/data/dataLoader.js +41 -0
  30. TSUMUGI/web/app/js/export/graphExporter.js +214 -0
  31. TSUMUGI/web/app/js/graph/centrality.js +495 -0
  32. TSUMUGI/web/app/js/graph/components.js +30 -0
  33. TSUMUGI/web/app/js/graph/filters.js +158 -0
  34. TSUMUGI/web/app/js/graph/highlighter.js +52 -0
  35. TSUMUGI/web/app/js/graph/layoutController.js +454 -0
  36. TSUMUGI/web/app/js/graph/valueScaler.js +43 -0
  37. TSUMUGI/web/app/js/search/geneSearcher.js +93 -0
  38. TSUMUGI/web/app/js/search/phenotypeSearcher.js +292 -0
  39. TSUMUGI/web/app/js/ui/dynamicFontSize.js +30 -0
  40. TSUMUGI/web/app/js/ui/mobilePanel.js +77 -0
  41. TSUMUGI/web/app/js/ui/slider.js +22 -0
  42. TSUMUGI/web/app/js/ui/tooltips.js +514 -0
  43. TSUMUGI/web/app/js/viewer/pageSetup.js +217 -0
  44. TSUMUGI/web/app/viewer.html +515 -0
  45. TSUMUGI/web/app/viewer.js +1593 -0
  46. TSUMUGI/web/css/sanitize.css +363 -0
  47. TSUMUGI/web/css/top.css +391 -0
  48. TSUMUGI/web/image/tsumugi-favicon.ico +0 -0
  49. TSUMUGI/web/image/tsumugi-icon.png +0 -0
  50. TSUMUGI/web/image/tsumugi-logo.png +0 -0
  51. TSUMUGI/web/image/tsumugi-logo.svg +69 -0
  52. TSUMUGI/web/js/genelist_formatter.js +123 -0
  53. TSUMUGI/web/js/top.js +338 -0
  54. TSUMUGI/web/open_webapp_linux.sh +25 -0
  55. TSUMUGI/web/open_webapp_mac.command +25 -0
  56. TSUMUGI/web/open_webapp_windows.bat +37 -0
  57. TSUMUGI/web/serve_index.py +110 -0
  58. TSUMUGI/web/template/template_index.html +197 -0
  59. TSUMUGI/web_deployer.py +150 -0
  60. tsumugi-1.0.1.dist-info/METADATA +504 -0
  61. tsumugi-1.0.1.dist-info/RECORD +64 -0
  62. tsumugi-1.0.1.dist-info/WHEEL +4 -0
  63. tsumugi-1.0.1.dist-info/entry_points.txt +3 -0
  64. tsumugi-1.0.1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,603 @@
1
from __future__ import annotations

import gzip
import json
import math
import random
from collections import defaultdict
from itertools import combinations
from pathlib import Path

from tqdm import tqdm

# Fix the RNG seed so any randomized behavior is reproducible across runs.
random.seed(0)


# Map IMPC zygosity spellings (long and abbreviated forms) to display labels.
ZYGOSITY_MAP = {
    "homozygote": "Homo",
    "heterozygote": "Hetero",
    "hemizygote": "Hemi",
    "hom": "Homo",
    "het": "Hetero",
    "hem": "Hemi",
}


# Hard cap on the number of genes rendered in a single network.
MAX_GENE_COUNT = 150
# Acceptance window for the similarity-score threshold search
# (_find_optimal_scores): a threshold is accepted when it keeps
# between GENE_COUNT_LOWER_BOUND and GENE_COUNT_UPPER_BOUND genes.
GENE_COUNT_LOWER_BOUND = 100
GENE_COUNT_UPPER_BOUND = 150

###############################################################################
# Compose datasets
###############################################################################
33
+
34
+
35
+ def _create_annotation_string(*parts: str) -> str:
36
+ """Join non-empty parts with commas."""
37
+ return ", ".join(part for part in parts if part)
38
+
39
+
40
+ # ----------------------------------------------------------
41
+ # Compose genewise_phenotype_significants
42
+ # ----------------------------------------------------------
43
def _compose_genewise_phenotype_significants(
    genewise_phenotype_significants: list[dict[str, str | float]],
) -> dict[str, list[dict[str, str | float]]]:
    """Compose genewise_phenotype_significants into gene_records_map for Nodes.

    Each record is reduced to its phenotype name, effect size, and a display
    string "<mp_term_name> (<zygosity>, <life stage>, <dimorphism>)", keyed
    by marker symbol.
    """
    gene_records_map: dict[str, list[dict[str, str | float]]] = defaultdict(list)
    for rec in genewise_phenotype_significants:
        # The upstream data encodes "no dimorphism" as the literal string "None".
        dimorphism = rec.get("sexual_dimorphism", "")
        if dimorphism == "None":
            dimorphism = ""

        label = _create_annotation_string(rec["zygosity"], rec.get("life_stage", ""), dimorphism)

        gene_records_map[rec["marker_symbol"]].append(
            {
                "mp_term_name": rec["mp_term_name"],
                "effect_size": rec["effect_size"],
                "phenotype": f"{rec['mp_term_name']} ({label})",
            }
        )

    return dict(gene_records_map)
69
+
70
+
71
+ # ----------------------------------------------------------
72
+ # Compose biological annotations
73
+ # ----------------------------------------------------------
74
+
75
+
76
def _compose_pairwise_similarity_annotations(
    pairwise_similarity_annotations: list[dict[str, list[dict[str, str]] | int]],
) -> dict[tuple[str], dict[str, list[str] | int]]:
    """Compose pair similarity annotations (Edges) into strings.

    Returns a mapping (gene1, gene2) -> {sorted shared-phenotype labels,
    phenotype similarity score}.
    """
    composed: dict[tuple[str], dict[str, list[str] | int]] = {}
    for record in pairwise_similarity_annotations:
        labels = set()
        for annotation in record["phenotype_shared_annotations"]:
            # "None" is the upstream placeholder for no sexual dimorphism.
            dimorphism = annotation.get("sexual_dimorphism", "")
            if dimorphism == "None":
                dimorphism = ""

            suffix = _create_annotation_string(
                annotation["zygosity"], annotation.get("life_stage", ""), dimorphism
            )
            labels.add(f"{annotation['phenotype']} ({suffix})")

        key = (record["gene1_symbol"], record["gene2_symbol"])
        composed[key] = {
            "phenotype_shared_annotations": sorted(labels),
            "phenotype_similarity_score": record["phenotype_similarity_score"],
        }
    return composed
100
+
101
+
102
+ # ----------------------------------------------------------
103
+ # Compose disease_annotations_by_allele
104
+ # ----------------------------------------------------------
105
+ def _compose_disease_annotations_by_allele(
106
+ disease_annotations_by_allele: dict[str, list[dict[str, str]]],
107
+ ) -> dict[str, set[str]]:
108
+ disease_annotations_composed = defaultdict(set)
109
+ for marker_symbol, records in disease_annotations_by_allele.items():
110
+ for record in records:
111
+ disorder_name = record["disorder_name"]
112
+ zygosity = record["zygosity"]
113
+ life_stage = record["life_stage"]
114
+
115
+ annotation = []
116
+ annotation.append(zygosity)
117
+ annotation.append(life_stage)
118
+ annotation = ", ".join(annotation)
119
+
120
+ disease_annotations_composed[marker_symbol].add(f"{disorder_name} ({annotation})")
121
+
122
+ return dict(disease_annotations_composed)
123
+
124
+
125
def _compose_dataset(genewise_phenotype_significants, pairwise_similarity_annotations, disease_annotations_by_allele):
    """Run the three composers and return (node records, edge annotations, disease strings)."""
    node_records = _compose_genewise_phenotype_significants(genewise_phenotype_significants)
    edge_annotations = _compose_pairwise_similarity_annotations(pairwise_similarity_annotations)
    disease_strings = _compose_disease_annotations_by_allele(disease_annotations_by_allele)
    return node_records, edge_annotations, disease_strings
132
+
133
+
134
+ ###############################################################################
135
+ # Build network JSON
136
+ ###############################################################################
137
+
138
+
139
+ def _scale_to_1_100(x: int, min_val: int, max_val: int) -> int:
140
+ if max_val == min_val:
141
+ return 100
142
+ if x <= min_val:
143
+ return 1
144
+ if x >= max_val:
145
+ return 100
146
+
147
+ scale = 99 / (max_val - min_val)
148
+ shifted = x - min_val
149
+ scaled_score = 1 + shifted * scale
150
+
151
+ return int(scaled_score)
152
+
153
+
154
def _scale_phenotype_similarity_scores(pairwise_similarity_annotations_filtered, target_gene: str | None = None):
    """Rescale every pair's similarity score to the 1-100 range.

    When *target_gene* is given, the min/max used for scaling come only from
    pairs containing that gene; otherwise they come from all pairs.
    """
    if target_gene:
        reference_scores = [
            annotation["phenotype_similarity_score"]
            for pair, annotation in pairwise_similarity_annotations_filtered.items()
            if target_gene in pair
        ]
    else:
        reference_scores = [
            annotation["phenotype_similarity_score"]
            for annotation in pairwise_similarity_annotations_filtered.values()
        ]

    lo = min(reference_scores)
    hi = max(reference_scores)

    rescaled = {}
    for pair, annotation in pairwise_similarity_annotations_filtered.items():
        updated = annotation.copy()
        updated["phenotype_similarity_score"] = _scale_to_1_100(
            annotation["phenotype_similarity_score"], lo, hi
        )
        rescaled[pair] = updated

    return rescaled
179
+
180
+
181
+ def _scale_effect_sizes(gene_records_map_filtered, mp_term_name):
182
+ effect_sizes = []
183
+ for records in gene_records_map_filtered.values():
184
+ for record in records:
185
+ if record["mp_term_name"] == mp_term_name:
186
+ effect_sizes.append(record["effect_size"])
187
+
188
+ # For binary effect sizes (0 or 1), set 1 to 100 directly
189
+ if all(es == 1 for es in effect_sizes):
190
+ for records in gene_records_map_filtered.values():
191
+ for record in records:
192
+ if record["mp_term_name"] == mp_term_name:
193
+ record["effect_size"] = 100
194
+ return gene_records_map_filtered
195
+
196
+ effect_sizes_log1p = [math.log1p(es) for es in effect_sizes]
197
+ min_val = min(effect_sizes_log1p)
198
+ max_val = max(effect_sizes_log1p)
199
+ for records in gene_records_map_filtered.values():
200
+ for record in records:
201
+ if record["mp_term_name"] == mp_term_name:
202
+ effect_size_scaled = _scale_to_1_100(math.log1p(record["effect_size"]), min_val, max_val)
203
+ record["effect_size"] = effect_size_scaled
204
+ return gene_records_map_filtered
205
+
206
+
207
def _find_optimal_scores(
    sorted_scores,
    related_genes,
    pairwise_similarity_annotations_composed,
    low_threshold=GENE_COUNT_LOWER_BOUND,
    high_threshold=GENE_COUNT_UPPER_BOUND,
):
    """Binary-search *sorted_scores* (ascending) for a similarity threshold
    that keeps between *low_threshold* and *high_threshold* genes connected.

    A gene is "kept" by a threshold when at least one of its pairs scores
    >= that threshold. Since raising the threshold can only shrink the set
    of qualifying pairs, the kept-gene count is non-increasing in the
    threshold — which is what makes binary search valid here.

    Returns:
        An acceptable threshold from *sorted_scores*, or -1 when no
        threshold yields a gene count inside the target window.
    """
    low = 0
    high = len(sorted_scores) - 1
    while low <= high:
        mid = (low + high) // 2

        # Genes retaining at least one edge at the candidate threshold.
        count_genes = set()
        for gene1, gene2 in combinations(sorted(related_genes), 2):
            gene_pair = tuple(sorted([gene1, gene2]))
            if gene_pair not in pairwise_similarity_annotations_composed:
                continue
            pair_annotations = pairwise_similarity_annotations_composed[gene_pair]
            if pair_annotations["phenotype_similarity_score"] >= sorted_scores[mid]:
                count_genes.add(gene1)
                count_genes.add(gene2)

        n = len(count_genes)

        if low_threshold <= n <= high_threshold:
            return sorted_scores[mid]
        elif n < low_threshold:
            # Too few genes survive: the threshold is too strict, so search
            # the smaller scores. (Bug fix: the original moved toward larger
            # scores here, which shrinks the gene count even further.)
            high = mid - 1
        else:
            # Too many genes survive: tighten by searching larger scores.
            low = mid + 1
    return -1
238
+
239
+
240
def _filter_related_genes(
    records: list[dict[str, str | float]],
    related_genes: set[str],
    pairwise_similarity_annotations_composed: dict[tuple[str], dict[str, list[str] | int]],
    is_gene_network: bool = False,
) -> set[str]:
    """Reduce *related_genes* to at most MAX_GENE_COUNT genes.

    Strategy:
    1) If possible, select by a threshold on phenotype similarity score
       found via _find_optimal_scores().
    2) Otherwise, take the top MAX_GENE_COUNT genes by maximum effect size
       (skipped when is_gene_network=True, where effect sizes are binary
       0/1 and carry no ranking information).
    3) Finally, fall back to the top MAX_GENE_COUNT genes by maximum number
       of shared phenotypes.

    Notes:
        - NaN effect sizes are treated as 0.
        - Bug fix: previously, when is_gene_network=True and no optimal
          threshold existed, the function fell off the end and returned
          None despite its ``-> set[str]`` annotation. Step 3 now runs in
          every fallback path.
    """
    # --- Per-gene maxima over all annotated pairs ---
    phenotype_similarity_scores = []
    gene_max_score = defaultdict(float)
    gene_max_shared_phenotype = defaultdict(int)

    for gene1, gene2 in combinations(sorted(related_genes), 2):
        gene_pair = tuple(sorted([gene1, gene2]))
        if gene_pair not in pairwise_similarity_annotations_composed:
            continue

        pair_annotations = pairwise_similarity_annotations_composed[gene_pair]
        score = pair_annotations["phenotype_similarity_score"]
        num_shared_phenotypes = len(pair_annotations["phenotype_shared_annotations"])

        phenotype_similarity_scores.append(score)

        # Update maximum similarity score for each gene
        gene_max_score[gene1] = max(gene_max_score[gene1], score)
        gene_max_score[gene2] = max(gene_max_score[gene2], score)

        # Update maximum number of shared phenotypes for each gene
        gene_max_shared_phenotype[gene1] = max(gene_max_shared_phenotype[gene1], num_shared_phenotypes)
        gene_max_shared_phenotype[gene2] = max(gene_max_shared_phenotype[gene2], num_shared_phenotypes)

    # 1. Filter genes by phenotype similarity score
    optimal_score = _find_optimal_scores(
        sorted(set(phenotype_similarity_scores)),
        related_genes,
        pairwise_similarity_annotations_composed,
        low_threshold=GENE_COUNT_LOWER_BOUND,
        high_threshold=GENE_COUNT_UPPER_BOUND,
    )
    if optimal_score > -1:
        return {gene for gene, max_score in gene_max_score.items() if max_score >= optimal_score}

    # 2. Filter genes by effect size (skipped for gene networks).
    if not is_gene_network:
        # Maximum effect size per gene; NaN counts as 0.
        gene_max_effect_sizes = defaultdict(float)
        for record in records:
            gene = record["marker_symbol"]
            if gene in related_genes:
                effect_size = record["effect_size"] if not math.isnan(record["effect_size"]) else 0.0
                gene_max_effect_sizes[gene] = max(gene_max_effect_sizes[gene], effect_size)

        gene_max_effect_sizes_sorted = sorted(gene_max_effect_sizes.items(), key=lambda x: x[1], reverse=True)

        # Use effect sizes only when they actually discriminate among the top genes.
        if len({score for _, score in gene_max_effect_sizes_sorted[:MAX_GENE_COUNT]}) > 1:
            return {gene for gene, _ in gene_max_effect_sizes_sorted[:MAX_GENE_COUNT]}

    # 3. Filter genes by number of shared phenotypes (always reachable).
    filtered_shared_phenotypes = {g: s for g, s in gene_max_shared_phenotype.items() if g in related_genes}
    gene_max_shared_phenotype_sorted = sorted(filtered_shared_phenotypes.items(), key=lambda x: x[1], reverse=True)
    return {gene for gene, _ in gene_max_shared_phenotype_sorted[:MAX_GENE_COUNT]}
320
+
321
+
322
+ ###############################################################################
323
+ # build_phenotype_network_json
324
+ ###############################################################################
325
+
326
+
327
def _convert_to_nodes_json(
    related_genes: set[str],
    mp_term_name: str,
    gene_records_map: dict[str, list[dict[str, str | float]]],
    disease_annotations_composed: dict[str, set[str]],
    hide_severity: bool = False,
) -> list[dict[str, dict[str, str | list[str] | int]]]:
    """Build Cytoscape-style node dicts for *related_genes*.

    Node color is the (scaled) effect size of the gene's record for
    *mp_term_name*, defaulting to 1 when absent.
    """
    subset = {gene: gene_records_map[gene] for gene in related_genes}

    # Scale effect sizes to 1-100
    subset = _scale_effect_sizes(subset, mp_term_name)

    nodes_json = []
    for gene, records in subset.items():
        diseases = disease_annotations_composed.get(gene, set())
        node_color = next(
            (record["effect_size"] for record in records if record["mp_term_name"] == mp_term_name), 1
        )

        data = {
            "id": gene,
            "label": gene,
            "phenotype": sorted(record["phenotype"] for record in records),
            "disease": sorted(diseases) if diseases else "",
            "node_color": node_color,
        }
        if hide_severity:
            data["hide_severity"] = True
        nodes_json.append({"data": data})

    return nodes_json
359
+
360
+
361
def _convert_to_edges_json(
    related_genes: set[str],
    pairwise_similarity_annotations_composed: dict[tuple[str], dict[str, list[str] | int]],
) -> list[dict[str, dict[str, str | list[str] | float]]]:
    """Build Cytoscape-style edge dicts for every annotated pair within *related_genes*."""
    filtered = {}
    for pair in combinations(sorted(related_genes), 2):
        key = tuple(sorted(pair))
        if key in pairwise_similarity_annotations_composed:
            filtered[key] = pairwise_similarity_annotations_composed[key]

    if not filtered:
        return []

    # Scale phenotype similarity scores to 1-100
    filtered = _scale_phenotype_similarity_scores(filtered, target_gene=None)

    edges_json = []
    for pair, annotations in filtered.items():
        source, target = sorted(pair)
        edges_json.append(
            {
                "data": {
                    "source": source,
                    "target": target,
                    "phenotype": sorted(annotations["phenotype_shared_annotations"]),
                    "edge_size": annotations["phenotype_similarity_score"],
                }
            }
        )
    return edges_json
394
+
395
+
396
def build_phenotype_network_json(
    genewise_phenotype_significants: list[dict[str, str | float]],
    pairwise_similarity_annotations: dict[tuple[str], dict[str, dict[str, dict[str, str] | int]]],
    disease_annotations_by_gene: dict[str, dict[str, str]],
    output_dir,
    binary_phenotypes: set[str] | None = None,
    hide_severity: bool = False,
) -> None:
    """Write one gzipped Cytoscape-style JSON network per phenotype.

    For each phenotype (mp_term_name) with at least two genes that appear in
    pairwise annotations, build nodes + edges and dump them to
    ``output_dir/<mp_term_name_with_underscores>.json.gz``.

    Args:
        genewise_phenotype_significants: significant phenotype records per gene.
        pairwise_similarity_annotations: shared-phenotype annotations per gene pair.
        disease_annotations_by_gene: disease records per gene.
        output_dir: directory for the output files; assumed Path-like — TODO confirm.
        binary_phenotypes: phenotypes whose effect sizes are binary; their
            networks get ``hide_severity`` set on every node.
        hide_severity: force-hide severity on all nodes regardless of phenotype.
    """
    gene_records_map, pairwise_similarity_annotations_composed, disease_annotations_composed = _compose_dataset(
        genewise_phenotype_significants, pairwise_similarity_annotations, disease_annotations_by_gene
    )

    # Group significant records by phenotype name.
    phenotype_records_map: dict[str, list[dict[str, str | float]]] = defaultdict(list)
    for record in genewise_phenotype_significants:
        phenotype_records_map[record["mp_term_name"]].append(record)
    phenotype_records_map = dict(phenotype_records_map)

    # Genes that appear in at least one annotated pair (only these can have edges).
    gene_lists = set()
    for pair in pairwise_similarity_annotations_composed.keys():
        for gene in pair:
            gene_lists.add(gene)

    for mp_term_name in tqdm(phenotype_records_map.keys(), total=len(phenotype_records_map)):
        records = phenotype_records_map[mp_term_name]
        related_genes = {r["marker_symbol"] for r in records if r["marker_symbol"] in gene_lists}

        # A network needs at least two genes.
        if len(related_genes) < 2:
            continue

        # Cap the network size at MAX_GENE_COUNT genes.
        if len(related_genes) > MAX_GENE_COUNT:
            related_genes = _filter_related_genes(records, related_genes, pairwise_similarity_annotations_composed)

        is_binary = False
        if binary_phenotypes:
            is_binary = mp_term_name in binary_phenotypes

        edges_json = _convert_to_edges_json(related_genes, pairwise_similarity_annotations_composed)

        if not edges_json:
            continue

        # Remove unconnected nodes
        connected_node_ids = set()
        for edge in edges_json:
            connected_node_ids.add(edge["data"]["source"])
            connected_node_ids.add(edge["data"]["target"])

        if not connected_node_ids:
            continue

        nodes_json = _convert_to_nodes_json(
            connected_node_ids,
            mp_term_name,
            gene_records_map,
            disease_annotations_composed,
            # Binary phenotypes carry no meaningful severity, so hide it.
            hide_severity=hide_severity or is_binary,
        )

        # Sort nodes for stability
        nodes_json = sorted(nodes_json, key=lambda n: n["data"]["id"])
        edges_json = sorted(edges_json, key=lambda e: (e["data"]["source"], e["data"]["target"]))

        network_json = nodes_json + edges_json

        # Make the phenotype name filesystem-safe before using it as a filename.
        mp_term_name_underscore = mp_term_name.replace(" ", "_").replace("/", "_")
        output_json = Path(output_dir / f"{mp_term_name_underscore}.json.gz")
        with gzip.open(output_json, "wt", encoding="utf-8") as f:
            json.dump(network_json, f, indent=4)
464
+
465
+
466
+ ###############################################################################
467
+ # build_gene_network_json
468
+ ###############################################################################
469
+
470
+
471
+ def _build_node_info(
472
+ gene: str,
473
+ gene_records_map: dict[str, list[dict[str, str | float]]],
474
+ disease_annotations_composed: dict[str, set[str]],
475
+ target_gene: str,
476
+ hide_severity: bool = False,
477
+ ) -> dict[str, dict[str, str | list[str] | float]]:
478
+ phenotypes: list[str] = [r["phenotype"] for r in gene_records_map.get(gene, [])]
479
+ diseases: set[str] = disease_annotations_composed.get(gene, set())
480
+ node_color: int = 100 if target_gene == gene else 1
481
+
482
+ node = {
483
+ "data": {
484
+ "id": gene,
485
+ "label": gene,
486
+ "phenotype": sorted(phenotypes),
487
+ "disease": sorted(diseases) if diseases else "",
488
+ "node_color": node_color,
489
+ }
490
+ }
491
+ if hide_severity:
492
+ node["data"]["hide_severity"] = True
493
+ return node
494
+
495
+
496
def build_gene_network_json(
    genewise_phenotype_significants: list[dict[str, str | float]],
    pairwise_similarity_annotations: dict[tuple[str], dict[str, dict[str, str] | int]],
    disease_annotations_by_gene: dict[str, dict[str, str]],
    output_dir,
    hide_severity: bool = True,
) -> None:
    """Write one gzipped Cytoscape-style JSON network per gene.

    For each gene appearing in a pairwise annotation, build the subnetwork of
    its directly connected genes (plus all annotated edges among them) and
    dump it to ``output_dir/<gene>.json.gz``.

    Args:
        genewise_phenotype_significants: significant phenotype records per gene.
        pairwise_similarity_annotations: shared-phenotype annotations per gene pair.
        disease_annotations_by_gene: disease records per gene.
        output_dir: directory for the output files; assumed Path-like — TODO confirm.
        hide_severity: propagate the hide_severity flag to every node.
    """
    gene_records_map, pairwise_similarity_annotations_composed, disease_annotations_composed = _compose_dataset(
        genewise_phenotype_significants, pairwise_similarity_annotations, disease_annotations_by_gene
    )

    # Every gene that appears in at least one annotated pair.
    gene_sets = set()
    for pair in pairwise_similarity_annotations_composed.keys():
        for gene in pair:
            gene_sets.add(gene)

    for target_gene in tqdm(gene_sets, total=len(gene_sets)):
        # Genes directly connected to the target gene (includes the target itself).
        related_genes = set()
        for pair in pairwise_similarity_annotations_composed.keys():
            if target_gene in pair:
                related_genes.update(pair)

        # Skip if less than 2 related genes
        if len(related_genes) < 2:
            continue

        # All annotated pairs among the related genes, not just pairs with the target.
        related_pairs = []
        for gene1, gene2 in combinations(related_genes, 2):
            gene_pair = tuple(sorted([gene1, gene2]))
            if gene_pair in pairwise_similarity_annotations_composed:
                related_pairs.append(gene_pair)

        # Filter genes if more than MAX_GENE_COUNT; always keep the target gene.
        if len(related_genes) > MAX_GENE_COUNT:
            related_genes_filtered = _filter_related_genes(
                genewise_phenotype_significants, related_genes, pairwise_similarity_annotations_composed
            )
            related_genes_filtered.add(target_gene)
            related_pairs = [pair for pair in related_pairs if all(gene in related_genes_filtered for gene in pair)]

        # ---------------------------------------
        # Nodes
        # ---------------------------------------
        nodes_json = []
        visited_genes = set()
        for pair in related_pairs:
            for gene in pair:
                if gene not in visited_genes:
                    visited_genes.add(gene)
                    nodes_json.append(
                        _build_node_info(
                            gene,
                            gene_records_map,
                            disease_annotations_composed,
                            target_gene,
                            hide_severity=hide_severity,
                        )
                    )

        # ---------------------------------------
        # Edges
        # ---------------------------------------
        pairwise_similarity_annotations_filtered = {
            pair: pairwise_similarity_annotations_composed[pair] for pair in related_pairs
        }

        # Bug fix: the original did `return []` here, which both returned a
        # list from a `-> None` function and silently aborted all remaining
        # genes; skip only this gene instead.
        if not pairwise_similarity_annotations_filtered:
            continue

        # Scale phenotype similarity scores to 1-100
        pairwise_similarity_annotations_scaled = _scale_phenotype_similarity_scores(
            pairwise_similarity_annotations_filtered, target_gene
        )

        edges_json = []
        for pair in related_pairs:
            gene1, gene2 = sorted(pair)
            annotation = pairwise_similarity_annotations_scaled[pair]
            edges_json.append(
                {
                    "data": {
                        "source": gene1,
                        "target": gene2,
                        "phenotype": sorted(annotation["phenotype_shared_annotations"]),
                        "edge_size": annotation["phenotype_similarity_score"],
                    }
                }
            )

        # Sort nodes for stability
        nodes_json = sorted(nodes_json, key=lambda n: n["data"]["id"])
        edges_json = sorted(edges_json, key=lambda e: (e["data"]["source"], e["data"]["target"]))

        network_json = nodes_json + edges_json

        output_json = Path(output_dir / f"{target_gene}.json.gz")
        with gzip.open(output_json, "wt", encoding="utf-8") as f:
            json.dump(network_json, f, indent=4)
@@ -0,0 +1,62 @@
1
+ from __future__ import annotations
2
+
3
+ from collections import defaultdict
4
+
5
+
6
def build_term_hierarchy(
    ontology_terms: dict[str, dict],
) -> tuple[dict[str, set[str]], dict[str, set[str]]]:
    """Build parent-child hierarchy relationships from ontology terms.

    Returns (parent_term_map, child_term_map): term_id -> parent ids and
    term_id -> child ids, derived from each term's "is_a" entries.
    """
    parent_term_map: dict[str, set[str]] = defaultdict(set)
    child_term_map: dict[str, set[str]] = defaultdict(set)

    for term_id, term_data in ontology_terms.items():
        for parent_id in term_data.get("is_a", []):
            parent_term_map[term_id].add(parent_id)
            child_term_map[parent_id].add(term_id)

    return dict(parent_term_map), dict(child_term_map)
20
+
21
+
22
def find_all_ancestor_terms(term_id: str, parent_term_map: dict[str, set[str]]) -> set[str]:
    """Find all ancestor terms for a given term (transitive closure over parents)."""
    ancestor_terms: set[str] = set()
    # Stack-based traversal; the result is a set, so visit order is irrelevant.
    pending = [term_id]

    while pending:
        current = pending.pop()
        for parent in parent_term_map.get(current, ()):
            if parent not in ancestor_terms:
                ancestor_terms.add(parent)
                pending.append(parent)

    return ancestor_terms
36
+
37
+
38
def find_all_descendant_terms(term_id: str, child_term_map: dict[str, set[str]]) -> set[str]:
    """Find all descendant terms for a given term (transitive closure over children)."""
    descendant_terms: set[str] = set()
    # Stack-based traversal; the result is a set, so visit order is irrelevant.
    pending = [term_id]

    while pending:
        current = pending.pop()
        for child in child_term_map.get(current, ()):
            if child not in descendant_terms:
                descendant_terms.add(child)
                pending.append(child)

    return descendant_terms
52
+
53
+
54
def find_common_ancestors(term1_id: str, term2_id: str, parent_term_map: dict[str, set[str]]) -> set[str]:
    """Find common ancestors of two terms; each term counts as its own ancestor."""
    closure1 = find_all_ancestor_terms(term1_id, parent_term_map) | {term1_id}
    closure2 = find_all_ancestor_terms(term2_id, parent_term_map) | {term2_id}
    return closure1 & closure2