TSUMUGI 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- TSUMUGI/annotator.py +103 -0
- TSUMUGI/argparser.py +599 -0
- TSUMUGI/core.py +185 -0
- TSUMUGI/data/impc_phenodigm.csv +3406 -0
- TSUMUGI/data/mp.obo +143993 -0
- TSUMUGI/filterer.py +36 -0
- TSUMUGI/formatter.py +122 -0
- TSUMUGI/genewise_annotation_builder.py +94 -0
- TSUMUGI/io_handler.py +189 -0
- TSUMUGI/main.py +300 -0
- TSUMUGI/network_constructor.py +603 -0
- TSUMUGI/ontology_handler.py +62 -0
- TSUMUGI/pairwise_similarity_builder.py +66 -0
- TSUMUGI/report_generator.py +122 -0
- TSUMUGI/similarity_calculator.py +498 -0
- TSUMUGI/subcommands/count_filterer.py +47 -0
- TSUMUGI/subcommands/genes_filterer.py +89 -0
- TSUMUGI/subcommands/graphml_builder.py +158 -0
- TSUMUGI/subcommands/life_stage_filterer.py +48 -0
- TSUMUGI/subcommands/mp_filterer.py +142 -0
- TSUMUGI/subcommands/score_filterer.py +22 -0
- TSUMUGI/subcommands/sex_filterer.py +48 -0
- TSUMUGI/subcommands/webapp_builder.py +358 -0
- TSUMUGI/subcommands/zygosity_filterer.py +48 -0
- TSUMUGI/validator.py +65 -0
- TSUMUGI/web/app/css/app.css +1129 -0
- TSUMUGI/web/app/genelist/network_genelist.html +339 -0
- TSUMUGI/web/app/genelist/network_genelist.js +421 -0
- TSUMUGI/web/app/js/data/dataLoader.js +41 -0
- TSUMUGI/web/app/js/export/graphExporter.js +214 -0
- TSUMUGI/web/app/js/graph/centrality.js +495 -0
- TSUMUGI/web/app/js/graph/components.js +30 -0
- TSUMUGI/web/app/js/graph/filters.js +158 -0
- TSUMUGI/web/app/js/graph/highlighter.js +52 -0
- TSUMUGI/web/app/js/graph/layoutController.js +454 -0
- TSUMUGI/web/app/js/graph/valueScaler.js +43 -0
- TSUMUGI/web/app/js/search/geneSearcher.js +93 -0
- TSUMUGI/web/app/js/search/phenotypeSearcher.js +292 -0
- TSUMUGI/web/app/js/ui/dynamicFontSize.js +30 -0
- TSUMUGI/web/app/js/ui/mobilePanel.js +77 -0
- TSUMUGI/web/app/js/ui/slider.js +22 -0
- TSUMUGI/web/app/js/ui/tooltips.js +514 -0
- TSUMUGI/web/app/js/viewer/pageSetup.js +217 -0
- TSUMUGI/web/app/viewer.html +515 -0
- TSUMUGI/web/app/viewer.js +1593 -0
- TSUMUGI/web/css/sanitize.css +363 -0
- TSUMUGI/web/css/top.css +391 -0
- TSUMUGI/web/image/tsumugi-favicon.ico +0 -0
- TSUMUGI/web/image/tsumugi-icon.png +0 -0
- TSUMUGI/web/image/tsumugi-logo.png +0 -0
- TSUMUGI/web/image/tsumugi-logo.svg +69 -0
- TSUMUGI/web/js/genelist_formatter.js +123 -0
- TSUMUGI/web/js/top.js +338 -0
- TSUMUGI/web/open_webapp_linux.sh +25 -0
- TSUMUGI/web/open_webapp_mac.command +25 -0
- TSUMUGI/web/open_webapp_windows.bat +37 -0
- TSUMUGI/web/serve_index.py +110 -0
- TSUMUGI/web/template/template_index.html +197 -0
- TSUMUGI/web_deployer.py +150 -0
- tsumugi-1.0.1.dist-info/METADATA +504 -0
- tsumugi-1.0.1.dist-info/RECORD +64 -0
- tsumugi-1.0.1.dist-info/WHEEL +4 -0
- tsumugi-1.0.1.dist-info/entry_points.txt +3 -0
- tsumugi-1.0.1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,358 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import gzip
|
|
4
|
+
import json
|
|
5
|
+
import shutil
|
|
6
|
+
from collections import defaultdict
|
|
7
|
+
from collections.abc import Iterator
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from TSUMUGI import io_handler
|
|
11
|
+
|
|
12
|
+
# Largest number of nodes build_webapp_network() accepts before raising
# ValueError and pointing the user at the GraphML workflow instead.
MAX_NODE_COUNT = 150

from importlib.resources import as_file, files

# Packaged web assets, located through importlib.resources so they are
# resolved relative to the installed TSUMUGI package.
WEB_DIR = files("TSUMUGI") / "web"
WEB_APP_DIR = WEB_DIR / "app"
# NOTE(review): the wheel's file listing shows only
# web/template/template_index.html under web/template/ -- confirm that the two
# template directories below are actually shipped with the package.
TEMPLATE_HTML_DIR = WEB_DIR / "template" / "template-app-html"
TEMPLATE_JS_DIR = WEB_DIR / "template" / "template-app-js"
# Launcher scripts (and the local server script) copied next to index.html
# by _copy_launchers().
LAUNCHER_FILES = [
    "open_webapp_linux.sh",
    "open_webapp_mac.command",
    "open_webapp_windows.bat",
    "serve_index.py",
]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _create_annotation_string(*parts):
|
|
29
|
+
"""Join non-empty parts with commas."""
|
|
30
|
+
return ", ".join([p for p in parts if p])
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
###############################################################################
|
|
34
|
+
# Node building
|
|
35
|
+
###############################################################################
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _format_suffix(zygosity: str, life_stage: str, sexual_dimorphism: str) -> str:
|
|
39
|
+
"""Return a suffix like '(Homo, Early, Male)'; omit 'None'."""
|
|
40
|
+
parts = [zygosity, life_stage]
|
|
41
|
+
if sexual_dimorphism and sexual_dimorphism != "None":
|
|
42
|
+
parts.append(sexual_dimorphism)
|
|
43
|
+
return f"({', '.join(parts)})"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def build_nodes(gene_to_records, all_genes, hide_severity: bool = False):
    """Build one Cytoscape.js node dict per gene, in sorted gene order.

    Args:
        gene_to_records: Mapping of gene symbol to its list of genewise
            annotation records (dicts). Genes missing from the mapping get a
            node with no phenotype lines.
        all_genes: Iterable of gene symbols that need a node.
        hide_severity: When True, ``hide_severity: True`` is added to each
            node's data.

    Returns:
        A list of ``{"data": {...}}`` dicts. ``data["phenotype"]`` holds lines
        like ``"{mp_term_name} (zygosity, life_stage, sexual_dimorphism)"`` and
        ``data["disease"]`` holds the matching disease lines, or ``""`` when
        the gene has none.
    """
    nodes = []

    for gene in sorted(all_genes):
        phenotype_lines: list[str] = []
        disease_lines: list[str] = []

        for record in gene_to_records.get(gene, []):
            suffix = _format_suffix(
                zygosity=record.get("zygosity", ""),
                life_stage=record.get("life_stage", ""),
                sexual_dimorphism=record.get("sexual_dimorphism", "None"),
            )

            mp_term_name = record.get("mp_term_name", "")
            if mp_term_name:
                phenotype_lines.append(f"{mp_term_name} {suffix}")

            # "disease_annotation" may be absent or None; treat both as empty.
            for disease in record.get("disease_annotation", []) or []:
                disease_lines.append(f"{disease} {suffix}")

        # De-duplicate while keeping first-seen order. list(set(...)) would
        # reorder the lines differently from run to run (string hashing is
        # randomised), making the generated JSON non-reproducible.
        phenotype_lines = list(dict.fromkeys(phenotype_lines))
        disease_lines = list(dict.fromkeys(disease_lines))

        node = {
            "data": {
                "id": gene,
                "label": gene,
                "phenotype": phenotype_lines,
                "disease": disease_lines if disease_lines else "",
                "node_color": 1,
            }
        }
        if hide_severity:
            node["data"]["hide_severity"] = True

        nodes.append(node)

    return nodes
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
###############################################################################
|
|
107
|
+
# Edge building
|
|
108
|
+
###############################################################################
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _build_edges(pairwise_similarity_annotations: Iterator[dict]):
    """Convert pairwise similarity records into a list of Cytoscape.js edge dicts.

    Each edge carries the two gene symbols as source/target, the shared
    phenotype labels (term name plus an optional "(zygosity, life_stage, sex)"
    qualifier), and the similarity score as ``edge_size`` (0 when absent).
    """
    edges = []

    for record in pairwise_similarity_annotations:
        source_gene = record["gene1_symbol"]
        target_gene = record["gene2_symbol"]

        # A missing or None mapping is handled as "no shared phenotypes".
        shared_annotations = record.get("phenotype_shared_annotations", {}) or {}

        phenotype_labels = []
        for term_name, annotation in shared_annotations.items():
            sex = annotation.get("sexual_dimorphism", "")
            qualifier = _create_annotation_string(
                annotation.get("zygosity", ""),
                annotation.get("life_stage", ""),
                "" if sex == "None" else sex,
            )
            if not term_name:
                continue
            label = f"{term_name} ({qualifier})" if qualifier else term_name
            phenotype_labels.append(label)

        edge_data = {
            "source": source_gene,
            "target": target_gene,
            "phenotype": phenotype_labels,
            "edge_size": record.get("phenotype_similarity_score", 0),
        }
        edges.append({"data": edge_data})

    return edges
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
###############################################################################
|
|
154
|
+
# Main builder
|
|
155
|
+
###############################################################################
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _build_symbol_to_id_map(gene_to_records: dict[str, list[dict]]) -> dict[str, str]:
|
|
159
|
+
symbol_to_id: dict[str, str] = {}
|
|
160
|
+
for symbol, recs in gene_to_records.items():
|
|
161
|
+
for r in recs:
|
|
162
|
+
acc = r.get("marker_accession_id")
|
|
163
|
+
if isinstance(acc, str) and acc and symbol not in symbol_to_id:
|
|
164
|
+
symbol_to_id[symbol] = acc
|
|
165
|
+
break
|
|
166
|
+
return symbol_to_id
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def build_webapp_network(genewise_path, pairwise_path, hide_severity: bool = False):
    """Load the annotation files and build the elements of the web-app network.

    Args:
        genewise_path: Path to the genewise phenotype annotations (JSONL).
        pairwise_path: Path to the pairwise similarity annotations (JSONL).
        hide_severity: Forwarded to build_nodes().

    Returns:
        A 3-tuple ``(nodes, edges, symbol_to_id)``: the Cytoscape.js node
        dicts, the Cytoscape.js edge dicts, and a mapping of gene symbol to
        marker accession ID.

    Raises:
        ValueError: If the network would contain more than MAX_NODE_COUNT nodes.
    """
    # The pairwise records are needed twice (gene collection, edge building),
    # so they are materialised into a list.
    pairwise_similarity_annotations: list[dict] = list(io_handler.read_jsonl(pairwise_path))

    all_genes = set()
    for record in pairwise_similarity_annotations:
        all_genes.add(record["gene1_symbol"])
        all_genes.add(record["gene2_symbol"])

    # build_nodes() emits exactly one node per gene, so the size limit can be
    # enforced here, before the genewise file is read and the nodes are built.
    node_count = len(all_genes)
    if node_count > MAX_NODE_COUNT:
        raise ValueError(
            f"Number of nodes ({node_count}) exceeds the maximum allowed ({MAX_NODE_COUNT}). "
            "For large networks, please generate a GraphML file using the `tsumugi build-graphml` "
            "command and visualize it with Cytoscape or another network visualization tool."
        )

    # Group the genewise annotations by marker_symbol.
    gene_to_records = defaultdict(list)
    for rec in io_handler.read_jsonl(genewise_path):
        gene_to_records[rec["marker_symbol"]].append(rec)
    gene_to_records = dict(gene_to_records)

    nodes = build_nodes(gene_to_records, all_genes, hide_severity=hide_severity)
    edges = _build_edges(pairwise_similarity_annotations)
    symbol_to_id = _build_symbol_to_id_map(gene_to_records)

    return nodes, edges, symbol_to_id
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def build_and_save_webapp_network(genewise_path, pairwise_path, output_dir):
    """Build the gene-list network and write a web-app bundle into ``output_dir``.

    Creates ``output_dir`` if needed, then writes ``network.json.gz`` (the node
    dicts followed by the edge dicts, as gzip-compressed JSON) and
    ``marker_symbol_accession_id.json``, and finally generates the bundle's
    assets, entry script and index page via _create_webapp_bundle().
    """
    bundle_dir = Path(output_dir)
    bundle_dir.mkdir(parents=True, exist_ok=True)

    # This entry point always builds the network with severity hidden.
    nodes, edges, symbol_to_id = build_webapp_network(genewise_path, pairwise_path, hide_severity=True)

    network_file = bundle_dir / "network.json.gz"
    with gzip.open(network_file, "wt", encoding="utf-8") as gz_out:
        json.dump(nodes + edges, gz_out, indent=4)

    symbol_map_file = bundle_dir / "marker_symbol_accession_id.json"
    with open(symbol_map_file, "w", encoding="utf-8") as json_out:
        json.dump(symbol_to_id, json_out, ensure_ascii=False, indent=2)

    _create_webapp_bundle(
        output_dir=bundle_dir,
        data_filename=network_file.name,
        network_label="Gene List",
    )
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
###############################################################################
|
|
226
|
+
# Helpers for HTML/JS generation
|
|
227
|
+
###############################################################################
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def _safe_filename(name: str) -> str:
|
|
231
|
+
if not name:
|
|
232
|
+
return "gene_list"
|
|
233
|
+
safe = "".join(ch if ch.isalnum() or ch in ("-", "_") else "_" for ch in name)
|
|
234
|
+
return safe or "gene_list"
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def _copy_asset_tree(src: Path, dst: Path) -> None:
|
|
238
|
+
with as_file(src) as src_on_fs:
|
|
239
|
+
src = Path(src_on_fs)
|
|
240
|
+
if src.exists():
|
|
241
|
+
shutil.copytree(src, dst, dirs_exist_ok=True)
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def _copy_launchers(output_dir: Path) -> None:
    """Copy each launcher listed in LAUNCHER_FILES from the packaged web dir to ``output_dir``.

    Launchers that are not present in the package are skipped.
    """
    for filename in LAUNCHER_FILES:
        launcher = WEB_DIR / filename
        # WEB_DIR comes from importlib.resources.files(); the Traversable
        # interface provides is_file() but not exists(), so is_file() is the
        # check that works for every resource provider.
        if not launcher.is_file():
            continue
        with as_file(launcher) as launcher_on_fs:
            shutil.copy(launcher_on_fs, output_dir / filename)
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def _create_webapp_bundle(
    output_dir: Path,
    data_filename: str,
    network_label: str,
) -> None:
    """Assemble the static web-app bundle inside ``output_dir``.

    Copies the packaged css/js/image directories and the launcher files, then
    generates the entry JavaScript file and ``index.html``. The entry script is
    named after ``network_label`` passed through _safe_filename().
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    asset_sources = (
        (WEB_APP_DIR / "css", "css"),
        (WEB_APP_DIR / "js", "js"),
        (WEB_DIR / "image", "image"),
    )
    for packaged_dir, target_name in asset_sources:
        _copy_asset_tree(packaged_dir, output_dir / target_name)
    _copy_launchers(output_dir)

    entry_name = _safe_filename(network_label)
    _generate_genelist_entry_script(
        output_dir=output_dir,
        entry_js_name=entry_name,
        data_filename=data_filename,
        export_label=entry_name,
    )
    _generate_index_html(
        output_dir=output_dir,
        entry_js_name=entry_name,
        network_label=network_label,
    )
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def _read_template(path: Path) -> str:
|
|
279
|
+
with open(path, encoding="utf-8") as fh:
|
|
280
|
+
return fh.read()
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def _generate_genelist_entry_script(
    output_dir: Path,
    entry_js_name: str,
    data_filename: str,
    export_label: str,
) -> None:
    """Render the gene-list entry script to ``output_dir/js/{entry_js_name}.js``.

    Starts from ``template_app.js``, drops every line that mentions a
    node-colour placeholder, and then substitutes the remaining placeholders:
    the filter snippet, the edge min/max statement, the data loader call for
    ``data_filename``, the export name, and the bundle-relative location of
    ``marker_symbol_accession_id.json``.
    """
    raw_template = _read_template(TEMPLATE_JS_DIR / "template_app.js")
    dropped_markers = ("XXX_NODE_COLOR_INITIALIZATION", "XXX_NODE_COLOR_UPDATE")
    script = "\n".join(
        line for line in raw_template.splitlines() if not any(marker in line for marker in dropped_markers)
    )

    genelist_filter_js = _read_template(TEMPLATE_JS_DIR / "filterByNodeColorAndEdgeSize_genelist.js")

    # Applied strictly in this order: the isGeneSymbolPage statement itself
    # contains XXX_ELEMENTS, so it has to be rewritten before that placeholder.
    substitutions = [
        (
            'const isGeneSymbolPage = "XXX_ELEMENTS".includes("genesymbol");',
            "const isGeneSymbolPage = false;",
        ),
        ("XXX_FILTER_BY_NODE_COLOR_AND_EDGE_SIZE", genelist_filter_js),
        ("XXX_NODE_MIN_MAX", ""),
        (
            "XXX_EDGE_MIN_MAX",
            "const edgeMin = Math.min(...edgeSizes); const edgeMax = Math.max(...edgeSizes);",
        ),
        ("XXX_ELEMENTS", f"loadJSONGz('./{data_filename}')"),
        ("XXX_PHENOTYPE", ""),
        ("XXX_NAME", export_label),
        (
            'const map_symbol_to_id = loadJSON("../../data/marker_symbol_accession_id.json");',
            'const map_symbol_to_id = loadJSON("./marker_symbol_accession_id.json");',
        ),
    ]
    for placeholder, replacement in substitutions:
        script = script.replace(placeholder, replacement)

    script_path = output_dir / "js" / f"{entry_js_name}.js"
    with open(script_path, "w", encoding="utf-8") as out:
        out.write(script)
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
def _generate_index_html(
    output_dir: Path,
    entry_js_name: str,
    network_label: str,
) -> None:
    """Render ``output_dir/index.html`` from the packaged HTML templates.

    The head, header, body-container and cy-container fragments are filled in
    separately and then spliced into ``template_app.html``. Asset URLs in the
    head fragment are rewritten to be relative to the bundle root.
    """

    def _load(fragment_name: str) -> str:
        # Read one fragment from the packaged HTML template directory.
        return _read_template(TEMPLATE_HTML_DIR / fragment_name)

    body_html = _load("body-container.html").replace("XXX_PHENOTYPE_SEVERITY", "")
    cy_html = _load("cy-container.html").replace("XXX_PHENOTYPE_SEVERITY", "")

    page_title = network_label or "Gene List"

    head_html = _load("head.html")
    # Order matters: the full script src attribute is rewritten before the
    # bare XXX_JS_FILE_NAME placeholder.
    head_substitutions = (
        ("XXX_TITLE", page_title),
        ('src="./XXX_JS_FILE_NAME.js"', f'src="./js/{entry_js_name}.js"'),
        ("XXX_JS_FILE_NAME", entry_js_name),
        ("../js/", "./js/"),
        ("../css/", "./css/"),
        ("../../image", "./image"),
    )
    for placeholder, replacement in head_substitutions:
        head_html = head_html.replace(placeholder, replacement)

    has_custom_label = bool(network_label) and network_label.lower() != "gene list"
    header_title = f"Gene List: {network_label}" if has_custom_label else "Gene List"
    header_html = _load("header.html").replace("XXX_TITLE", header_title)

    page = _load("template_app.html")
    page_substitutions = (
        ("XXX_HEAD", head_html),
        ("XXX_H1", header_html),
        ("XXX_BODY_CONTAINER", body_html),
        ("XXX_CY_CONTAINER", cy_html),
    )
    for placeholder, fragment in page_substitutions:
        page = page.replace(placeholder, fragment)

    with open(output_dir / "index.html", "w", encoding="utf-8") as out:
        out.write(page)
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
from collections.abc import Generator
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from TSUMUGI import io_handler
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def _filter_annotations_by_zygosity(
|
|
8
|
+
pairwise_similarity_annotations: list[dict[str, str | dict[str, dict] | dict[str | int]]],
|
|
9
|
+
zygosity: str = "Homo",
|
|
10
|
+
keep: bool = False,
|
|
11
|
+
drop: bool = False,
|
|
12
|
+
) -> Generator[frozenset[str], dict[str, dict, int]]:
|
|
13
|
+
for pairwise_similarity_annotation in pairwise_similarity_annotations:
|
|
14
|
+
phenotype_shared_annotations = pairwise_similarity_annotation["phenotype_shared_annotations"]
|
|
15
|
+
|
|
16
|
+
if len(phenotype_shared_annotations) == 0:
|
|
17
|
+
continue
|
|
18
|
+
|
|
19
|
+
phenotype_shared_annotations_filtered = {}
|
|
20
|
+
for term_name, annotation in phenotype_shared_annotations.items():
|
|
21
|
+
if annotation["zygosity"] == zygosity and keep:
|
|
22
|
+
phenotype_shared_annotations_filtered[term_name] = annotation
|
|
23
|
+
if annotation["zygosity"] != zygosity and drop:
|
|
24
|
+
phenotype_shared_annotations_filtered[term_name] = annotation
|
|
25
|
+
|
|
26
|
+
if len(phenotype_shared_annotations_filtered) == 0:
|
|
27
|
+
continue
|
|
28
|
+
|
|
29
|
+
pairwise_similarity_annotation["phenotype_shared_annotations"] = phenotype_shared_annotations_filtered
|
|
30
|
+
|
|
31
|
+
yield pairwise_similarity_annotation
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def filter_annotations_by_zygosity(
    path_pairwise_similarity_annotations: str | Path,
    zygosity: str,
    keep: bool = False,
    drop: bool = False,
) -> None:
    """Read pairwise annotations (JSONL), filter them by zygosity, and print the result.

    Every record that survives _filter_annotations_by_zygosity() is written to
    stdout as one JSONL line.
    """
    records = io_handler.read_jsonl(path_pairwise_similarity_annotations)
    filtered_records = _filter_annotations_by_zygosity(records, zygosity, keep, drop)
    for filtered_record in filtered_records:
        io_handler.write_jsonl_to_stdout(filtered_record)
|
TSUMUGI/validator.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from TSUMUGI import io_handler
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def validate_statistical_results(file_path: str) -> None:
    """Check that the statistical results CSV provides every required column.

    Only the first record is inspected.

    Args:
        file_path: Path to the statistical results CSV file.

    Raises:
        ValueError: If the file yields no records, or if any required column
            is missing.
    """
    required_columns = {
        "marker_symbol",
        "marker_accession_id",
        "mp_term_name",
        "mp_term_id",
        "p_value",
        "effect_size",
        "female_ko_effect_p_value",  # sex differences
        "male_ko_effect_p_value",  # sex differences
        "zygosity",  # zygosity
        "pipeline_name",  # life-stage
        "procedure_name",  # life-stage
        "allele_symbol",  # map to Phenodigm
    }
    records = io_handler.load_csv_as_dicts(file_path)

    # A file without data rows must be reported as invalid input rather than
    # leaking StopIteration out of a bare next() call.
    first_record = next(iter(records), None)
    if first_record is None:
        raise ValueError(f"Invalid file: No records found in {file_path}")

    missing_columns = required_columns - first_record.keys()
    if missing_columns:
        raise ValueError(f"Invalid file: Missing columns {missing_columns} in {file_path}")
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def validate_obo_file(file_path: str) -> None:
    """Run a quick structural check on an OBO file.

    Scans lines up to the first ``[Term]`` stanza, ignoring blank lines and
    lines starting with "!". The file passes when a ``format-version:`` line
    was seen before that stanza and the stanza itself exists.

    Raises:
        ValueError: If ``format-version:`` or a ``[Term]`` stanza is missing.
    """
    seen_format_version = False
    seen_term_stanza = False

    with open(file_path, encoding="utf-8") as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if text == "" or text[0] == "!":
                continue
            if text.startswith("[Term]"):
                # The first stanza is all that is needed; stop reading here.
                seen_term_stanza = True
                break
            if text.startswith("format-version:"):
                seen_format_version = True

    if not seen_format_version:
        raise ValueError("Invalid OBO file: missing 'format-version:' in header.")
    if not seen_term_stanza:
        raise ValueError("Invalid OBO file: missing '[Term]' stanza.")
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def validate_mp_term_id(term_id: str, mp_obo_path: str) -> None:
    """Ensure ``term_id`` is present among the terms parsed from the OBO file.

    Raises:
        ValueError: If ``term_id`` is not found in the parsed ontology terms.
    """
    known_terms = io_handler.parse_obo_file(mp_obo_path)
    if term_id in known_terms:
        return
    raise ValueError(f"MP term ID '{term_id}' not found in OBO file '{mp_obo_path}'.")
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def validate_phenodigm_file(file_path: str) -> None:
    """Check that the Phenodigm CSV provides the columns this package reads.

    Only the first record is inspected.

    Args:
        file_path: Path to the Phenodigm CSV file.

    Raises:
        ValueError: If the file yields no records, or if "Disorder name" or
            "Mouse model description" is missing.
    """
    required_columns = {"Disorder name", "Mouse model description"}

    # A file without data rows must be reported as invalid input rather than
    # leaking StopIteration out of a bare next() call.
    first_record = next(iter(io_handler.load_csv_as_dicts(file_path)), None)
    if first_record is None:
        raise ValueError(f"Invalid file: No records found in {file_path}")

    missing_columns = required_columns - first_record.keys()
    if missing_columns:
        raise ValueError(f"Invalid file: Missing {missing_columns} in {file_path}")
|