tsumugi-1.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. TSUMUGI/annotator.py +103 -0
  2. TSUMUGI/argparser.py +599 -0
  3. TSUMUGI/core.py +185 -0
  4. TSUMUGI/data/impc_phenodigm.csv +3406 -0
  5. TSUMUGI/data/mp.obo +143993 -0
  6. TSUMUGI/filterer.py +36 -0
  7. TSUMUGI/formatter.py +122 -0
  8. TSUMUGI/genewise_annotation_builder.py +94 -0
  9. TSUMUGI/io_handler.py +189 -0
  10. TSUMUGI/main.py +300 -0
  11. TSUMUGI/network_constructor.py +603 -0
  12. TSUMUGI/ontology_handler.py +62 -0
  13. TSUMUGI/pairwise_similarity_builder.py +66 -0
  14. TSUMUGI/report_generator.py +122 -0
  15. TSUMUGI/similarity_calculator.py +498 -0
  16. TSUMUGI/subcommands/count_filterer.py +47 -0
  17. TSUMUGI/subcommands/genes_filterer.py +89 -0
  18. TSUMUGI/subcommands/graphml_builder.py +158 -0
  19. TSUMUGI/subcommands/life_stage_filterer.py +48 -0
  20. TSUMUGI/subcommands/mp_filterer.py +142 -0
  21. TSUMUGI/subcommands/score_filterer.py +22 -0
  22. TSUMUGI/subcommands/sex_filterer.py +48 -0
  23. TSUMUGI/subcommands/webapp_builder.py +358 -0
  24. TSUMUGI/subcommands/zygosity_filterer.py +48 -0
  25. TSUMUGI/validator.py +65 -0
  26. TSUMUGI/web/app/css/app.css +1129 -0
  27. TSUMUGI/web/app/genelist/network_genelist.html +339 -0
  28. TSUMUGI/web/app/genelist/network_genelist.js +421 -0
  29. TSUMUGI/web/app/js/data/dataLoader.js +41 -0
  30. TSUMUGI/web/app/js/export/graphExporter.js +214 -0
  31. TSUMUGI/web/app/js/graph/centrality.js +495 -0
  32. TSUMUGI/web/app/js/graph/components.js +30 -0
  33. TSUMUGI/web/app/js/graph/filters.js +158 -0
  34. TSUMUGI/web/app/js/graph/highlighter.js +52 -0
  35. TSUMUGI/web/app/js/graph/layoutController.js +454 -0
  36. TSUMUGI/web/app/js/graph/valueScaler.js +43 -0
  37. TSUMUGI/web/app/js/search/geneSearcher.js +93 -0
  38. TSUMUGI/web/app/js/search/phenotypeSearcher.js +292 -0
  39. TSUMUGI/web/app/js/ui/dynamicFontSize.js +30 -0
  40. TSUMUGI/web/app/js/ui/mobilePanel.js +77 -0
  41. TSUMUGI/web/app/js/ui/slider.js +22 -0
  42. TSUMUGI/web/app/js/ui/tooltips.js +514 -0
  43. TSUMUGI/web/app/js/viewer/pageSetup.js +217 -0
  44. TSUMUGI/web/app/viewer.html +515 -0
  45. TSUMUGI/web/app/viewer.js +1593 -0
  46. TSUMUGI/web/css/sanitize.css +363 -0
  47. TSUMUGI/web/css/top.css +391 -0
  48. TSUMUGI/web/image/tsumugi-favicon.ico +0 -0
  49. TSUMUGI/web/image/tsumugi-icon.png +0 -0
  50. TSUMUGI/web/image/tsumugi-logo.png +0 -0
  51. TSUMUGI/web/image/tsumugi-logo.svg +69 -0
  52. TSUMUGI/web/js/genelist_formatter.js +123 -0
  53. TSUMUGI/web/js/top.js +338 -0
  54. TSUMUGI/web/open_webapp_linux.sh +25 -0
  55. TSUMUGI/web/open_webapp_mac.command +25 -0
  56. TSUMUGI/web/open_webapp_windows.bat +37 -0
  57. TSUMUGI/web/serve_index.py +110 -0
  58. TSUMUGI/web/template/template_index.html +197 -0
  59. TSUMUGI/web_deployer.py +150 -0
  60. tsumugi-1.0.1.dist-info/METADATA +504 -0
  61. tsumugi-1.0.1.dist-info/RECORD +64 -0
  62. tsumugi-1.0.1.dist-info/WHEEL +4 -0
  63. tsumugi-1.0.1.dist-info/entry_points.txt +3 -0
  64. tsumugi-1.0.1.dist-info/licenses/LICENSE +21 -0
TSUMUGI/pairwise_similarity_builder.py @@ -0,0 +1,66 @@
+ from __future__ import annotations
+
+ import logging
+ import pickle
+ from collections.abc import Iterator
+ from pathlib import Path
+
+ from TSUMUGI import similarity_calculator
+
+
+ def build_pairwise_similarity(
+     genewise_phenotype_significants: list[dict], ontology_terms: set, args
+ ) -> Iterator[dict[str, dict[str, str] | int]]:
+     mp_term_ids = {r["mp_term_id"] for r in genewise_phenotype_significants}
+
+     terms_similarity_map, term_ic_map = similarity_calculator.calculate_all_pairwise_similarities(
+         ontology_terms, mp_term_ids, ic_threshold=5, threads=args.threads
+     )
+
+     if args.debug:
+         logging.debug("Caching terms similarity map and term IC map...")
+
+     # --------------------------------------------------------
+     # Cache results
+     # --------------------------------------------------------
+
+     output_dir = Path(args.output_dir, ".tempdir", "preprocessed")
+     output_dir.mkdir(parents=True, exist_ok=True)
+
+     with open(output_dir / "terms_similarity_map.pkl", "wb") as f:
+         pickle.dump(terms_similarity_map, f)
+
+     with open(output_dir / "term_ic_map.pkl", "wb") as f:
+         pickle.dump(term_ic_map, f)
+
+     # ----------------------------------------
+     # Calculate phenotype similarity
+     # ----------------------------------------
+
+     total_genes = {r["marker_symbol"] for r in genewise_phenotype_significants}
+     num_pairs = len(total_genes) * (len(total_genes) - 1) // 2
+
+     logging.info(f"Annotate phenotype ancestors for {num_pairs} pairs...")
+     phenotype_ancestors = similarity_calculator.annotate_phenotype_ancestors(
+         genewise_phenotype_significants,
+         terms_similarity_map,
+         ontology_terms,
+     )
+
+     phenodigm_scores = similarity_calculator.calculate_phenodigm_score(
+         genewise_phenotype_significants, terms_similarity_map, term_ic_map
+     )
+
+     # ----------------------------------------
+     # Summarize the phenotype similarity results
+     # ----------------------------------------
+
+     logging.info(f"Compute phenotype annotations and similarity scores for {num_pairs} pairs...")
+
+     pairwise_similarity_annotations: Iterator[dict[str, dict[str, str] | int]] = (
+         similarity_calculator.summarize_similarity_annotations(
+             ontology_terms, phenotype_ancestors, phenodigm_scores, num_pairs
+         )
+     )
+
+     return pairwise_similarity_annotations
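The num_pairs count uses the handshake formula N*(N-1)/2, which matches what itertools.combinations yields downstream in the similarity calculator. A quick standalone check of that arithmetic (toy gene names, not IMPC data):

    from itertools import combinations

    genes = {"GeneA", "GeneB", "GeneC", "GeneD"}
    num_pairs = len(genes) * (len(genes) - 1) // 2
    assert num_pairs == len(list(combinations(genes, 2))) == 6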
TSUMUGI/formatter.py @@ -0,0 +1,122 @@
+ from __future__ import annotations
+
+ import gzip
+ import json
+ from collections import defaultdict
+ from pathlib import Path
+
+
+ # available mp terms
+ def write_available_mp_terms_txt(TEMPDIR: Path, output_file: Path) -> None:
+     with open(output_file, "w") as f:
+         for path_phenotype in Path(TEMPDIR, "network", "phenotype").glob("*.json.gz"):
+             mp_term_name = path_phenotype.name.replace(".json.gz", "").replace("_", " ")
+             f.write(f"{mp_term_name}\n")
+
+
+ def write_available_mp_terms_json(TEMPDIR: Path, output_file: Path) -> None:
+     mp_term_name_json = {}
+     for path_phenotype in Path(TEMPDIR, "network", "phenotype").glob("*.json.gz"):
+         mp_term_name_underscore = path_phenotype.name.replace(".json.gz", "")
+         mp_term_name = mp_term_name_underscore.replace("_", " ")
+         mp_term_name_json[mp_term_name] = mp_term_name_underscore
+     # Save as a JSON file
+     with open(output_file, "w") as f:
+         json.dump(mp_term_name_json, f, ensure_ascii=False, indent=2)
+
+
+ def write_mp_term_id_lookup(records_significants, available_mp_terms_file: Path, output_file: Path) -> None:
+     """
+     Build a mapping from phenotype slug to MP term ID, using the most frequent ID for each name.
+     """
+     with open(available_mp_terms_file) as f:
+         available_mp_terms = json.load(f)
+
+     term_id_counts = defaultdict(lambda: defaultdict(int))
+     for record in records_significants:
+         name = record.get("mp_term_name")
+         mp_id = record.get("mp_term_id")
+         if not name or not mp_id:
+             continue
+         term_id_counts[name][mp_id] += 1
+
+     slug_to_mp_id = {}
+     for display_name, slug in available_mp_terms.items():
+         counts = term_id_counts.get(display_name)
+         if not counts:
+             continue
+         slug_to_mp_id[slug] = max(counts.items(), key=lambda item: item[1])[0]
+
+     with open(output_file, "w") as f:
+         json.dump(slug_to_mp_id, f, ensure_ascii=False, indent=2)
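The lookup keeps the modal ID per phenotype name: the selection is a plain max over a count dict. A standalone illustration of that one line (toy counts, not from IMPC data):

    counts = {"MP:0001": 7, "MP:0009": 2}
    assert max(counts.items(), key=lambda item: item[1])[0] == "MP:0001"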
+
+
+ # binary phenotypes
+ def write_binary_phenotypes_txt(records_significants, TEMPDIR: Path, output_file: Path) -> None:
+     paths_available_mp_terms = Path(TEMPDIR, "network", "phenotype").glob("*.json.gz")
+     available_mp_terms = {p.name.replace(".json.gz", "").replace("_", " ") for p in paths_available_mp_terms}
+
+     mp_term_names_effect_size = defaultdict(set)
+     for record in records_significants:
+         mp_term_name = record["mp_term_name"]
+         if mp_term_name not in available_mp_terms:
+             continue
+         effect_size = record["effect_size"]
+         mp_term_names_effect_size[mp_term_name].add(effect_size)
+
+     binary_phenotypes = set()
+     for mp_term_name, effect_sizes in mp_term_names_effect_size.items():
+         if all(es in {0, 1} for es in effect_sizes):
+             binary_phenotypes.add(mp_term_name)
+
+     binary_phenotypes = sorted(binary_phenotypes)
+     with open(output_file, "w") as f:
+         for bp in binary_phenotypes:
+             f.write(f"{bp}\n")
+
+
+ # available gene symbols
+ def write_available_gene_symbols_txt(TEMPDIR: Path, output_file: Path) -> None:
+     with open(output_file, "w") as f:
+         for path_genesymbol in Path(TEMPDIR, "network", "genesymbol").glob("*.json.gz"):
+             gene_symbol = path_genesymbol.name.replace(".json.gz", "")
+             f.write(f"{gene_symbol}\n")
+
+
+ def write_marker_symbol_accession_id_json(records_significants, TEMPDIR: Path, output_file: Path) -> None:
+     marker_symbol_accession_id = {}
+     paths_genesymbol = Path(TEMPDIR, "network", "genesymbol").glob("*.json.gz")
+     available_gene_symbols = {p.name.replace(".json.gz", "") for p in paths_genesymbol}
+     for record in records_significants:
+         if record["marker_symbol"] not in available_gene_symbols:
+             continue
+         marker_symbol_accession_id[record["marker_symbol"]] = record.get("marker_accession_id")
+     # Save as a JSON file
+     with open(output_file, "w") as f:
+         json.dump(marker_symbol_accession_id, f, ensure_ascii=False, indent=2)
+
+
+ def write_records_jsonl_gz(records, output_file: Path) -> None:
+     with gzip.open(output_file, "wt", encoding="utf-8") as f:
+         for record in records:
+             f.write(json.dumps(record) + "\n")
+
+
+ def write_pairwise_similarity_annotations(pairwise_similarity_annotations, output_file: Path) -> None:
+     with gzip.open(output_file, "wt", encoding="utf-8") as f:
+         for gene_pair, annotation in pairwise_similarity_annotations.items():
+             gene1_symbol, gene2_symbol = sorted(gene_pair)
+             if not annotation["phenotype_shared_annotations"]:
+                 continue
+             phenotype_similarity_score = annotation["phenotype_similarity_score"]
+             f.write(
+                 json.dumps(
+                     {
+                         "gene1_symbol": gene1_symbol,
+                         "gene2_symbol": gene2_symbol,
+                         "phenotype_shared_annotations": annotation["phenotype_shared_annotations"],
+                         "phenotype_similarity_score": phenotype_similarity_score,
+                     }
+                 )
+                 + "\n"
+             )
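write_records_jsonl_gz emits one JSON object per line inside a gzip stream, so readers can stream the file back line by line. A self-contained round-trip sketch (the tempfile path is illustrative, not part of the package):

    import gzip
    import json
    import tempfile
    from pathlib import Path

    records = [{"gene": "GeneA"}, {"gene": "GeneB"}]
    path = Path(tempfile.mkdtemp()) / "records.jsonl.gz"
    with gzip.open(path, "wt", encoding="utf-8") as f:
        for record in records:
            f.write(json.dumps(record) + "\n")
    with gzip.open(path, "rt", encoding="utf-8") as f:
        assert [json.loads(line) for line in f] == records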
TSUMUGI/similarity_calculator.py @@ -0,0 +1,498 @@
+ from __future__ import annotations
+
+ import math
+ from collections import defaultdict
+ from collections.abc import Iterator
+ from concurrent.futures import ProcessPoolExecutor
+ from itertools import combinations, combinations_with_replacement
+
+ import numpy as np
+ from tqdm import tqdm
+
+ from TSUMUGI.ontology_handler import (
+     build_term_hierarchy,
+     find_all_descendant_terms,
+     find_common_ancestors,
+ )
+
+ ###########################################################
+ # Pairwise term similarity (with multiprocessing)
+ ###########################################################
+
+
+ def _calculate_term_ic_map(
+     ontology_terms: dict[str, dict], child_term_map: dict[str, set[str]], ic_threshold: int = 5
+ ) -> dict[str, float]:
+     """
+     Calculate the information content (IC) of every ontology term.
+     Terms whose IC falls below the given percentile (default: 5th) are set to 0.
+     """
+     total_term_count = len(ontology_terms)
+     term_ic_map: dict[str, float] = {}
+
+     for term_id in ontology_terms:
+         descendants = find_all_descendant_terms(term_id, child_term_map)
+         term_count = len(descendants) + 1
+         probability = term_count / total_term_count
+         term_ic_map[term_id] = -math.log(probability)
+
+     ic_values = list(term_ic_map.values())
+     ic_cutoff = np.percentile(ic_values, ic_threshold)
+     for term_id, ic in term_ic_map.items():
+         if ic < ic_cutoff:
+             term_ic_map[term_id] = 0.0
+
+     return term_ic_map
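Information content here is the negative log of a term's descendant frequency, so rarer (more specific) terms score higher, and the percentile floor then zeroes out near-root terms. A worked toy example, independent of the MP ontology:

    import math

    # In a 100-term ontology, a leaf term with no descendants is highly informative:
    assert round(-math.log((0 + 1) / 100), 2) == 4.61
    # A broad term covering 49 descendants carries far less information:
    assert round(-math.log((49 + 1) / 100), 2) == 0.69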
+
+
+ _worker_parent_term_map: dict[str, set[str]] | None = None
+ _worker_child_term_map: dict[str, set[str]] | None = None
+ _worker_term_ic_map: dict[str, float] | None = None
+
+
+ def _init_worker(
+     parent_term_map: dict[str, set[str]], child_term_map: dict[str, set[str]], term_ic_map: dict[str, float]
+ ) -> None:
+     """Initializer for worker processes to avoid repeatedly pickling large objects."""
+     global _worker_parent_term_map, _worker_child_term_map, _worker_term_ic_map
+     _worker_parent_term_map = parent_term_map
+     _worker_child_term_map = child_term_map
+     _worker_term_ic_map = term_ic_map
+
+
+ def _calculate_pair_mica_and_resnik(
+     term1_id: str, term2_id: str, parent_term_map: dict[str, set[str]], term_ic_map: dict[str, float]
+ ) -> tuple[str | None, float]:
+     """Find the most informative common ancestor (MICA, selected by IC) and the Resnik similarity from precalculated IC."""
+     if term1_id == term2_id:
+         return term1_id, term_ic_map.get(term1_id, 0.0)
+     common_ancestors = find_common_ancestors(term1_id, term2_id, parent_term_map)
+     if not common_ancestors:
+         return None, 0.0
+
+     msca = max(common_ancestors, key=lambda t: term_ic_map.get(t, 0.0))
+     similarity = term_ic_map.get(msca, 0.0)
+     return msca, similarity
+
+
+ def _calculate_pair_jaccard(
+     term1_id: str, term2_id: str, parent_term_map: dict[str, set[str]], term_ic_map: dict[str, float]
+ ) -> float:
+     """Calculate the Jaccard index over the two terms' ancestor sets."""
+     if term1_id == term2_id:
+         return 1.0
+     ancestors1 = parent_term_map.get(term1_id, set())
+     ancestors2 = parent_term_map.get(term2_id, set())
+
+     intersection = ancestors1.intersection(ancestors2)
+     union = ancestors1.union(ancestors2)
+
+     if not union:
+         return 0.0
+
+     jaccard_index = len(intersection) / len(union)
+     return jaccard_index
+
+
+ def _calculate_pair_worker(term_pair: tuple[str, str]) -> tuple[tuple[str, str], dict[str | None, float]]:
+     """Worker-side calculation using the globals set by _init_worker."""
+     term1_id, term2_id = term_pair
+
+     parent_term_map = _worker_parent_term_map
+     term_ic_map = _worker_term_ic_map
+
+     if parent_term_map is None or term_ic_map is None:
+         raise RuntimeError("Worker maps are not initialized.")
+
+     if term1_id == term2_id:
+         msca = term1_id
+         sim = term_ic_map.get(term1_id, 0.0)
+     else:
+         msca, sim = _calculate_pair_mica_and_resnik(term1_id, term2_id, parent_term_map, term_ic_map)
+
+     term_pairs = tuple(sorted((term1_id, term2_id)))
+     return term_pairs, {msca: sim}
+
+
+ def _calculate_pair_similarity_score(
+     term1_id: str, term2_id: str, parent_term_map: dict[str, set[str]], term_ic_map: dict[str, float]
+ ) -> tuple[tuple[str, str], dict[str | None, float]]:
+     """Calculate pairwise similarity (Phenodigm score).
+     MICA: most informative common ancestor (selected by IC)
+     Phenodigm score: sqrt(Resnik similarity * Jaccard index)
+     """
+     msca, resnik = _calculate_pair_mica_and_resnik(term1_id, term2_id, parent_term_map, term_ic_map)
+     jaccard = _calculate_pair_jaccard(term1_id, term2_id, parent_term_map, term_ic_map)
+
+     term_pairs = tuple(sorted((term1_id, term2_id)))
+     score = math.sqrt(resnik * jaccard)
+     return term_pairs, {msca: score}
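The per-pair score is the geometric mean of the two notions of closeness, so a term pair must do well on both IC and ancestor overlap to score high. A toy check of sqrt(Resnik * Jaccard):

    import math

    resnik, jaccard = 4.0, 0.25
    assert math.sqrt(resnik * jaccard) == 1.0  # high IC but low ancestor overlap is damped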
+
+
+ def calculate_all_pairwise_similarities(
+     ontology_terms: dict[str, dict],
+     all_term_ids: set[str],
+     ic_threshold: int = 5,
+     threads: int | None = None,
+ ) -> tuple[dict[tuple[str, str], dict[str | None, float]], dict[str, float]]:
+     """Calculate pairwise Resnik similarities for all term IDs."""
+     parent_term_map, child_term_map = build_term_hierarchy(ontology_terms)
+     term_ic_map = _calculate_term_ic_map(ontology_terms, child_term_map, ic_threshold=ic_threshold)
+     term_list = sorted(all_term_ids)
+
+     terms_similarity_map: dict[tuple[str, str], dict[str | None, float]] = {}
+
+     if threads == 1:
+         for term1_id, term2_id in combinations_with_replacement(term_list, 2):
+             term_pairs, ancestor_ic_map = _calculate_pair_similarity_score(
+                 term1_id, term2_id, parent_term_map, term_ic_map
+             )
+             terms_similarity_map[term_pairs] = ancestor_ic_map
+         return terms_similarity_map, term_ic_map
+
+     with ProcessPoolExecutor(
+         max_workers=threads,
+         initializer=_init_worker,
+         initargs=(parent_term_map, child_term_map, term_ic_map),
+     ) as executor:
+         for term_pairs, ancestor_ic_map in executor.map(
+             _calculate_pair_worker, combinations_with_replacement(term_list, 2)
+         ):
+             terms_similarity_map[term_pairs] = ancestor_ic_map
+
+     return terms_similarity_map, term_ic_map
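combinations_with_replacement over N terms yields N*(N+1)/2 pairs, and the initializer/globals pattern ships the large maps to each worker once instead of pickling them per task. The same pattern in miniature (toy payload; all names here are illustrative, not part of TSUMUGI):

    from concurrent.futures import ProcessPoolExecutor
    from itertools import combinations_with_replacement

    _payload = None

    def _init(payload):
        global _payload
        _payload = payload  # set once per worker process

    def _work(pair):
        return pair, _payload[pair[0]] + _payload[pair[1]]

    if __name__ == "__main__":
        data = {"a": 1, "b": 2, "c": 3}
        with ProcessPoolExecutor(max_workers=2, initializer=_init, initargs=(data,)) as ex:
            results = dict(ex.map(_work, combinations_with_replacement(sorted(data), 2)))
        assert results[("a", "b")] == 3 and len(results) == 6  # 3 * 4 / 2 pairs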
+
+
+ ###########################################################
+ # Phenotype ancestor annotation
+ ###########################################################
+
+
+ def _delete_parent_terms_from_ancestors(
+     candidate_ancestors: list[dict[str, str]],
+     child_term_map: dict[str, set[str]],
+ ) -> list[dict[str, str]]:
+     """
+     Remove parent terms from the common ancestors.
+     Keep only the most specific terms among candidates with identical metadata.
+     """
+     to_delete: set[int] = set()
+     phenotype_to_meta = defaultdict(list)
+
+     for ancestor in candidate_ancestors:
+         phenotype_to_meta[ancestor["phenotype"]].append({k: v for k, v in ancestor.items() if k != "phenotype"})
+
+     for idx, ancestor in enumerate(candidate_ancestors):
+         term_id = ancestor["phenotype"]
+         term_meta = {k: v for k, v in ancestor.items() if k != "phenotype"}
+
+         if idx in to_delete:
+             continue
+
+         stack = list(child_term_map.get(term_id, ()))
+         while stack:
+             child_id = stack.pop()
+             child_metas = phenotype_to_meta.get(child_id, [])
+             if any(child_meta == term_meta for child_meta in child_metas):
+                 to_delete.add(idx)
+                 break
+             stack.extend(child_term_map.get(child_id, ()))
+
+     return [ancestor for i, ancestor in enumerate(candidate_ancestors) if i not in to_delete]
+
+
+ # ---------------------------------------------------------
+ # Helper functions for building gene metadata maps
+ # ---------------------------------------------------------
+
+
+ def _build_gene_metadata_maps(
+     gene_records_map: dict[str, list[dict[str, str | float]]],
+     annotations: set[str],
+ ) -> tuple[dict[str, dict[tuple[str, str, str], list[str]]], dict[tuple[str, str, str], dict[str, str]]]:
+     """
+     Group gene records by metadata signature for faster matching.
+     Returns:
+         gene_metadata_map: example: {"GeneA": {("Homo", "Embryo", "None"): ["MP:0001", "MP:0002"]}}
+         meta_dict_cache: example: {("Homo", "Embryo", "None"): {"zygosity": "Homo", "life_stage": "Embryo", "sexual_dimorphism": "None"}}
+     """
+     gene_metadata_map: dict[str, dict[tuple[str, str, str], list[str]]] = {}
+     meta_dict_cache: dict[tuple[str, str, str], dict[str, str]] = {}
+
+     for gene_symbol, records in gene_records_map.items():
+         per_gene_map: dict[tuple[str, str, str], list[str]] = defaultdict(list)
+         for record in records:
+             meta_signature = (
+                 record["zygosity"],
+                 record["life_stage"],
+                 record.get("sexual_dimorphism", "None"),
+             )
+             per_gene_map[meta_signature].append(record["mp_term_id"])
+             if meta_signature not in meta_dict_cache:
+                 meta_dict_cache[meta_signature] = {k: v for k, v in record.items() if k in annotations}
+         gene_metadata_map[gene_symbol] = dict(per_gene_map)
+
+     return gene_metadata_map, meta_dict_cache
+
+
+ def _annotate_ancestors(
+     gene1_meta_map: dict,
+     gene2_meta_map: dict,
+     meta_dict_cache: dict,
+     terms_similarity_map: dict[tuple[str, str], dict[str | None, float]],
+     child_term_map: dict[str, set[str]],
+ ) -> list[dict[str, str]]:
+     """Annotate phenotype ancestors for a single gene pair."""
+
+     candidate_ancestors: list[dict[str, str]] = []
+     added_keys: set[tuple[str, tuple[str, str, str]]] = set()
+
+     shared_meta_signatures = set(gene1_meta_map.keys()) & set(gene2_meta_map.keys())
+
+     for meta_signature in shared_meta_signatures:
+         gene1_terms = gene1_meta_map[meta_signature]
+         gene2_terms = gene2_meta_map[meta_signature]
+         meta_dict = meta_dict_cache[meta_signature]
+
+         for gene1_mp_term_id in gene1_terms:
+             for gene2_mp_term_id in gene2_terms:
+                 pair_key = tuple(sorted([gene1_mp_term_id, gene2_mp_term_id]))
+                 mapping = terms_similarity_map.get(pair_key)
+                 if not mapping:
+                     continue
+
+                 common_ancestor, similarity = next(iter(mapping.items()))
+
+                 if not common_ancestor or similarity == 0.0:
+                     continue
+
+                 current_key = (common_ancestor, meta_signature)
+
+                 if current_key in added_keys:
+                     continue
+
+                 candidate_ancestors.append({"phenotype": common_ancestor, **meta_dict})
+                 added_keys.add(current_key)
+
+     # Remove parent terms from candidate ancestors
+     ancestors = _delete_parent_terms_from_ancestors(candidate_ancestors, child_term_map)
+
+     return ancestors
+
+
+ def annotate_phenotype_ancestors(
+     genewise_phenotype_significants: list[dict[str, str | float]],
+     terms_similarity_map: dict[tuple[str, str], dict[str | None, float]],
+     ontology_terms: dict[str, dict[str, str]],
+ ) -> Iterator[dict[str, str | list[dict[str, str]]]]:
+     """
+     Annotate phenotype ancestors for each gene pair.
+     """
+     # Build gene -> records map
+     gene_records_map: dict[str, list[dict[str, str | float]]] = defaultdict(list)
+     for record in genewise_phenotype_significants:
+         gene_records_map[record["marker_symbol"]].append(record)
+
+     # Build the hierarchy and group records by metadata signature
+     _, child_term_map = build_term_hierarchy(ontology_terms)
+     annotations: set[str] = {"zygosity", "life_stage", "sexual_dimorphism"}
+     gene_metadata_map, meta_dict_cache = _build_gene_metadata_maps(gene_records_map, annotations)
+
+     for (gene1_symbol, gene1_meta_map), (gene2_symbol, gene2_meta_map) in combinations(gene_metadata_map.items(), 2):
+         ancestors = _annotate_ancestors(
+             gene1_meta_map=gene1_meta_map,
+             gene2_meta_map=gene2_meta_map,
+             meta_dict_cache=meta_dict_cache,
+             terms_similarity_map=terms_similarity_map,
+             child_term_map=child_term_map,
+         )
+         yield {
+             "gene1_symbol": gene1_symbol,
+             "gene2_symbol": gene2_symbol,
+             "phenotype_shared_annotations": ancestors,
+         }
+
+
+ ###########################################################
+ # Phenodigm score calculation
+ ###########################################################
+
+
+ def _calculate_weighted_similarity_matrix(
+     gene1_record: dict[str, np.ndarray],
+     gene2_record: dict[str, np.ndarray],
+     terms_similarity_map: dict[tuple[str, str], dict[str | None, float]],
+ ) -> np.ndarray:
+     """Calculate the weighted similarity matrix between two genes based on their phenotype records."""
+     gene1_terms = gene1_record["terms"]
+     gene2_terms = gene2_record["terms"]
+
+     similarity_matrix = np.zeros((len(gene1_terms), len(gene2_terms)), dtype=float)
+     for i, term1 in enumerate(gene1_terms):
+         row = similarity_matrix[i]
+         for j, term2 in enumerate(gene2_terms):
+             _, similarity = next(iter(terms_similarity_map.get(tuple(sorted([term1, term2])), {None: 0.0}).items()))
+             row[j] = similarity
+
+     z_match = gene1_record["zygosity"][:, None] == gene2_record["zygosity"][None, :]
+     l_match = gene1_record["life_stage"][:, None] == gene2_record["life_stage"][None, :]
+     s_match = gene1_record["sexual_dimorphism"][:, None] == gene2_record["sexual_dimorphism"][None, :]
+     match_counts = z_match.astype(int) + l_match.astype(int) + s_match.astype(int)
+
+     weight_lookup = np.array([0.25, 0.5, 0.75, 1.0])
+     weight_matrix = weight_lookup[match_counts]
+
+     return similarity_matrix * weight_matrix
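The metadata comparisons broadcast into a full matrix of 0-3 match counts, which index into the weight table: a record pair matching on all of zygosity, life stage, and sexual dimorphism keeps its full similarity, while a triple mismatch keeps a quarter. The indexing trick in miniature (one criterion only, for brevity; toy values):

    import numpy as np

    z1 = np.array(["Homo", "Het"], dtype=object)
    z2 = np.array(["Homo"], dtype=object)
    match_counts = (z1[:, None] == z2[None, :]).astype(int)  # shape (2, 1)
    weights = np.array([0.25, 0.5, 0.75, 1.0])[match_counts]
    assert weights.tolist() == [[0.5], [0.25]]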
+
+
+ def _apply_phenodigm_scaling(
+     weighted_similarity_matrix: np.ndarray,
+     gene1_record: dict[str, np.ndarray],
+     gene2_record: dict[str, np.ndarray],
+ ) -> int:
+     """Apply the Phenodigm scaling method to similarity scores (0-100)."""
+     gene1_information_content_scores = gene1_record["ic_scores"]
+     gene2_information_content_scores = gene2_record["ic_scores"]
+
+     max_gene1_information_content = gene1_record["ic_max"]
+     max_gene2_information_content = gene2_record["ic_max"]
+
+     row_max_similarities = weighted_similarity_matrix.max(axis=1)
+     column_max_similarities = weighted_similarity_matrix.max(axis=0)
+
+     max_score_actual = np.max([np.max(row_max_similarities), np.max(column_max_similarities)])
+     average_score_actual = (
+         np.mean(np.concatenate([row_max_similarities, column_max_similarities]))
+         if (len(row_max_similarities) > 0 or len(column_max_similarities) > 0)
+         else 0.0
+     )
+
+     max_score_theoretical = max(max_gene1_information_content, max_gene2_information_content)
+
+     combined_ic_scores = np.concatenate([gene1_information_content_scores, gene2_information_content_scores])
+     average_score_theoretical = float(np.mean(combined_ic_scores)) if combined_ic_scores.size else 0.0
+
+     normalized_max_score = max_score_actual / max_score_theoretical if max_score_theoretical > 0 else 0.0
+     normalized_average_score = (
+         average_score_actual / average_score_theoretical if average_score_theoretical > 0 else 0.0
+     )
+
+     phenodigm_score = 100 * (normalized_max_score + normalized_average_score) / 2
+
+     return int(phenodigm_score)
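The scaling averages the best-match ratio and the mean-match ratio against their theoretical maxima and stretches the result to 0-100, so a perfect self-match scores 100. A worked toy case of the final arithmetic:

    max_actual, max_theoretical = 3.0, 4.0    # best observed vs best possible IC
    avg_actual, avg_theoretical = 1.5, 3.0    # mean of row/column maxima vs mean IC
    score = 100 * (max_actual / max_theoretical + avg_actual / avg_theoretical) / 2
    assert int(score) == 62                   # (0.75 + 0.5) / 2 = 0.625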
+
+
+ def _calculate_phenodigm(
+     gene1_record: dict[str, np.ndarray],
+     gene2_record: dict[str, np.ndarray],
+     terms_similarity_map: dict[tuple[str, str], dict[str | None, float]],
+ ) -> int:
+     """Calculate the Phenodigm score for a single gene pair."""
+     weighted_similarity_matrix = _calculate_weighted_similarity_matrix(
+         gene1_record,
+         gene2_record,
+         terms_similarity_map,
+     )
+
+     score = _apply_phenodigm_scaling(
+         weighted_similarity_matrix,
+         gene1_record,
+         gene2_record,
+     )
+
+     return score
+
+
+ def _build_gene_data_map(
+     gene_records_map: dict[str, list[dict[str, str | float]]],
+     term_ic_map: dict[str, float],
+ ) -> dict[str, dict[str, np.ndarray]]:
+     """Convert raw gene records into an array-based representation for faster scoring."""
+     gene_data_map: dict[str, dict[str, np.ndarray]] = {}
+     for gene_symbol, records in gene_records_map.items():
+         terms = np.array([r["mp_term_id"] for r in records], dtype=object)
+         zygosity = np.array([r["zygosity"] for r in records], dtype=object)
+         life_stage = np.array([r["life_stage"] for r in records], dtype=object)
+         sexual_dimorphism = np.array([r.get("sexual_dimorphism", "None") for r in records], dtype=object)
+         ic_scores = np.array([term_ic_map.get(term, 0.0) for term in terms], dtype=float)
+
+         gene_data_map[gene_symbol] = {
+             "terms": terms,
+             "zygosity": zygosity,
+             "life_stage": life_stage,
+             "sexual_dimorphism": sexual_dimorphism,
+             "ic_scores": ic_scores,
+             "ic_max": float(ic_scores.max()) if ic_scores.size else 0.0,
+         }
+
+     return gene_data_map
+
+
+ def calculate_phenodigm_score(
+     genewise_phenotype_significants: list[dict[str, str | float]],
+     terms_similarity_map: dict[tuple[str, str], dict[str | None, float]],
+     term_ic_map: dict[str, float],
+ ) -> Iterator[dict[str, str | int]]:
+     """
+     Calculate the Phenodigm score between gene pairs.
+     """
+     # Build gene -> records map
+     gene_records_map: dict[str, list[dict[str, str | float]]] = defaultdict(list)
+     for record in genewise_phenotype_significants:
+         gene_records_map[record["marker_symbol"]].append(record)
+
+     gene_data_map = _build_gene_data_map(gene_records_map, term_ic_map)
+
+     for (gene1_symbol, gene1_record), (gene2_symbol, gene2_record) in combinations(gene_data_map.items(), 2):
+         score = _calculate_phenodigm(
+             gene1_record=gene1_record,
+             gene2_record=gene2_record,
+             terms_similarity_map=terms_similarity_map,
+         )
+         yield {"gene1_symbol": gene1_symbol, "gene2_symbol": gene2_symbol, "phenotype_similarity_score": score}
+
+
+ ###########################################################
+ # Summarize the phenotype similarity results
+ ###########################################################
+
+
+ def summarize_similarity_annotations(
+     ontology_terms: dict[str, dict[str, str]],
+     phenotype_ancestors: Iterator[dict[str, str | list[dict[str, str]]]],
+     phenodigm_scores: Iterator[dict[str, str | int]],
+     total_pairs: int,
+ ) -> Iterator[dict[str, list[dict[str, str]] | int]]:
+     """Summarize similarity annotations, including common ancestors and Phenodigm scores."""
+
+     id_name_map = {v["id"]: v["name"] for v in ontology_terms.values()}
+
+     for phenotype_ancestor, phenodigm_score in tqdm(zip(phenotype_ancestors, phenodigm_scores), total=total_pairs):
+         gene1_symbol = phenotype_ancestor["gene1_symbol"]
+         gene2_symbol = phenotype_ancestor["gene2_symbol"]
+
+         ancestors: list[dict[str, str]] = phenotype_ancestor["phenotype_shared_annotations"]
+
+         ancestors_renamed = []
+         for ancestor in ancestors:
+             renamed_ancestor = {}
+             for k, v in ancestor.items():
+                 if k == "phenotype" and v in id_name_map:
+                     renamed_ancestor["phenotype"] = id_name_map[v]
+                 else:
+                     renamed_ancestor[k] = v
+             ancestors_renamed.append(renamed_ancestor)
+
+         phenodigm_score = phenodigm_score["phenotype_similarity_score"]
+
+         annotations = {
+             "gene1_symbol": gene1_symbol,
+             "gene2_symbol": gene2_symbol,
+             "phenotype_shared_annotations": sorted(
+                 ancestors_renamed,
+                 key=lambda x: [x["phenotype"], x["zygosity"], x["life_stage"], x["sexual_dimorphism"]],
+             ),
+             "phenotype_similarity_score": phenodigm_score if ancestors_renamed else 0,
+         }
+
+         yield annotations
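Both upstream streams are generators built from combinations over gene maps derived from the same insertion-ordered records, so the zip above pairs each ancestor record with the score for the same gene pair; consuming either stream out of order would desynchronize them. A minimal sketch of that invariant, with stub streams standing in for the two calculators:

    from itertools import combinations

    genes = {"GeneA": None, "GeneB": None, "GeneC": None}  # dicts preserve insertion order
    stream1 = (pair for pair in combinations(genes, 2))
    stream2 = (pair for pair in combinations(genes, 2))
    for left, right in zip(stream1, stream2):
        assert left == right  # same pair order on both sides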