TSUMUGI 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- TSUMUGI/annotator.py +103 -0
- TSUMUGI/argparser.py +599 -0
- TSUMUGI/core.py +185 -0
- TSUMUGI/data/impc_phenodigm.csv +3406 -0
- TSUMUGI/data/mp.obo +143993 -0
- TSUMUGI/filterer.py +36 -0
- TSUMUGI/formatter.py +122 -0
- TSUMUGI/genewise_annotation_builder.py +94 -0
- TSUMUGI/io_handler.py +189 -0
- TSUMUGI/main.py +300 -0
- TSUMUGI/network_constructor.py +603 -0
- TSUMUGI/ontology_handler.py +62 -0
- TSUMUGI/pairwise_similarity_builder.py +66 -0
- TSUMUGI/report_generator.py +122 -0
- TSUMUGI/similarity_calculator.py +498 -0
- TSUMUGI/subcommands/count_filterer.py +47 -0
- TSUMUGI/subcommands/genes_filterer.py +89 -0
- TSUMUGI/subcommands/graphml_builder.py +158 -0
- TSUMUGI/subcommands/life_stage_filterer.py +48 -0
- TSUMUGI/subcommands/mp_filterer.py +142 -0
- TSUMUGI/subcommands/score_filterer.py +22 -0
- TSUMUGI/subcommands/sex_filterer.py +48 -0
- TSUMUGI/subcommands/webapp_builder.py +358 -0
- TSUMUGI/subcommands/zygosity_filterer.py +48 -0
- TSUMUGI/validator.py +65 -0
- TSUMUGI/web/app/css/app.css +1129 -0
- TSUMUGI/web/app/genelist/network_genelist.html +339 -0
- TSUMUGI/web/app/genelist/network_genelist.js +421 -0
- TSUMUGI/web/app/js/data/dataLoader.js +41 -0
- TSUMUGI/web/app/js/export/graphExporter.js +214 -0
- TSUMUGI/web/app/js/graph/centrality.js +495 -0
- TSUMUGI/web/app/js/graph/components.js +30 -0
- TSUMUGI/web/app/js/graph/filters.js +158 -0
- TSUMUGI/web/app/js/graph/highlighter.js +52 -0
- TSUMUGI/web/app/js/graph/layoutController.js +454 -0
- TSUMUGI/web/app/js/graph/valueScaler.js +43 -0
- TSUMUGI/web/app/js/search/geneSearcher.js +93 -0
- TSUMUGI/web/app/js/search/phenotypeSearcher.js +292 -0
- TSUMUGI/web/app/js/ui/dynamicFontSize.js +30 -0
- TSUMUGI/web/app/js/ui/mobilePanel.js +77 -0
- TSUMUGI/web/app/js/ui/slider.js +22 -0
- TSUMUGI/web/app/js/ui/tooltips.js +514 -0
- TSUMUGI/web/app/js/viewer/pageSetup.js +217 -0
- TSUMUGI/web/app/viewer.html +515 -0
- TSUMUGI/web/app/viewer.js +1593 -0
- TSUMUGI/web/css/sanitize.css +363 -0
- TSUMUGI/web/css/top.css +391 -0
- TSUMUGI/web/image/tsumugi-favicon.ico +0 -0
- TSUMUGI/web/image/tsumugi-icon.png +0 -0
- TSUMUGI/web/image/tsumugi-logo.png +0 -0
- TSUMUGI/web/image/tsumugi-logo.svg +69 -0
- TSUMUGI/web/js/genelist_formatter.js +123 -0
- TSUMUGI/web/js/top.js +338 -0
- TSUMUGI/web/open_webapp_linux.sh +25 -0
- TSUMUGI/web/open_webapp_mac.command +25 -0
- TSUMUGI/web/open_webapp_windows.bat +37 -0
- TSUMUGI/web/serve_index.py +110 -0
- TSUMUGI/web/template/template_index.html +197 -0
- TSUMUGI/web_deployer.py +150 -0
- tsumugi-1.0.1.dist-info/METADATA +504 -0
- tsumugi-1.0.1.dist-info/RECORD +64 -0
- tsumugi-1.0.1.dist-info/WHEEL +4 -0
- tsumugi-1.0.1.dist-info/entry_points.txt +3 -0
- tsumugi-1.0.1.dist-info/licenses/LICENSE +21 -0
TSUMUGI/filterer.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Generator, Iterable, Iterator
|
|
4
|
+
from itertools import groupby
|
|
5
|
+
from operator import itemgetter
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def subset_columns(records: Iterator[dict[str, str]], columns: set[str]) -> Generator[dict[str, str]]:
    """Lazily yield a copy of each record keeping only the requested columns.

    Missing keys become empty strings. Note this is a generator, not a list,
    so the input stream is consumed on demand.
    """
    return ({col: record.get(col, "") for col in columns} for record in records)
###########################################################
|
|
14
|
+
# Others
|
|
15
|
+
###########################################################
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def distinct_records_with_max_effect(
    records: Iterable[dict[str, str | float]], unique_keys: list[str]
) -> Generator[dict[str, str | float]]:
    """Yield one record per group of records sharing the ``unique_keys`` values:
    the record with the largest effect_size in that group.

    effect_size is assumed to already be an absolute (non-negative) value.
    Records missing 'effect_size' are ranked as -1, so any record that has
    one wins over them.
    """
    group_key = itemgetter(*unique_keys)

    # groupby only merges adjacent equal keys, so sort on the same key first.
    for _, grouped in groupby(sorted(records, key=group_key), key=group_key):
        best = None
        best_effect = None
        for candidate in grouped:
            effect = candidate.get("effect_size", -1)
            # Strict '>' keeps the first maximal record on ties, like max().
            if best is None or effect > best_effect:
                best, best_effect = candidate, effect
        yield best
TSUMUGI/formatter.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import math
|
|
4
|
+
from collections import defaultdict
|
|
5
|
+
from collections.abc import Generator, Iterable
|
|
6
|
+
|
|
7
|
+
###########################################################
|
|
8
|
+
# String to Float
|
|
9
|
+
###########################################################
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _to_float(x: str | None) -> float:
|
|
13
|
+
"""Convert a string to float; empty/None becomes NaN."""
|
|
14
|
+
return float(x) if x not in (None, "") else float("nan")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def floatinize_columns(records: Iterable[dict[str, str]], columns: list[str]) -> Generator[dict[str, str | float]]:
    """Yield each record with the named fields coerced to float.

    Missing keys and empty strings become NaN. Records are mutated in
    place and then yielded.
    """
    for record in records:
        for col in columns:
            raw = record.get(col)
            # Empty string or absent value maps to NaN, otherwise parse as float.
            record[col] = float(raw) if raw not in (None, "") else float("nan")
        yield record
def abs_effect_size(
    records: Iterable[dict[str, str | float]], effect_size_columns: list[str]
) -> Generator[dict[str, str | float]]:
    """Yield each record with the effect-size fields replaced by their
    absolute value; NaN becomes 0.0. Records are mutated in place.
    """
    for record in records:
        for col in effect_size_columns:
            value = record[col]
            # NaN means "no measurement": treat it as zero effect.
            record[col] = 0.0 if math.isnan(value) else abs(value)
        yield record
###########################################################
|
|
38
|
+
# Zygosity Formatting
|
|
39
|
+
###########################################################
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def format_zygosity(
    records: Iterable[dict[str, str | float]], zygosity_converter: dict[str, str]
) -> Generator[dict[str, str | float]]:
    """Normalize each record's 'zygosity' via the converter mapping.

    Values absent from the mapping pass through unchanged. Records are
    mutated in place and yielded.
    """
    for record in records:
        raw = record["zygosity"]
        record["zygosity"] = zygosity_converter.get(raw, raw)
        yield record
###########################################################
|
|
52
|
+
# IMPC human disease_annotations
|
|
53
|
+
###########################################################
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _select_fields_from_disease_annotations(disease_annotations: list[dict[str, str]]) -> list[dict[str, str]]:
|
|
57
|
+
"""Select disorder_name and description fields from IMPC human disease_annotations."""
|
|
58
|
+
|
|
59
|
+
# Heuristic to identify the correct fields
|
|
60
|
+
prev_disorder_name = None
|
|
61
|
+
prev_description = None
|
|
62
|
+
for record in disease_annotations:
|
|
63
|
+
for key, value in record.items():
|
|
64
|
+
if "Syndrome" in value:
|
|
65
|
+
prev_disorder_name = key
|
|
66
|
+
if "<em1(IMPC)Bay>" in value:
|
|
67
|
+
prev_description = key
|
|
68
|
+
if prev_disorder_name is not None and prev_description is not None:
|
|
69
|
+
break
|
|
70
|
+
|
|
71
|
+
if prev_disorder_name is None or prev_description is None:
|
|
72
|
+
raise ValueError("Could not identify disorder_name or description fields in disease_annotations.")
|
|
73
|
+
|
|
74
|
+
# Select the fields
|
|
75
|
+
disease_annotations_selected = []
|
|
76
|
+
for record in disease_annotations:
|
|
77
|
+
disease_annotations_selected.append(
|
|
78
|
+
{"disorder_name": record[prev_disorder_name], "description": record[prev_description]}
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
return disease_annotations_selected
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def format_disease_annotations(disease_annotations: list[dict[str, str | float]]) -> dict[str, list[dict[str, str]]]:
    """Format the IMPC human disease_annotations for output.

    Pipeline: select the disorder_name/description columns, parse the
    description ("<allele_symbol> <zygosity> <life_stage>") into separate
    fields, normalize zygosity/life-stage wording, then deduplicate and
    group the records by gene (marker_symbol).

    Returns:
        Mapping from marker_symbol to its list of
        {"disorder_name", "zygosity", "life_stage"} records.
    """

    # Select "description" and "disorder_name" fields
    disease_annotations = _select_fields_from_disease_annotations(disease_annotations)

    # Filter out records with unexpected description format: expected format is "<allele_symbol> <zygosity> <life_stage>"
    disease_annotations = [record for record in disease_annotations if len(record["description"].split(" ")) == 3]

    # Map Phenodigm abbreviations to the display wording used elsewhere in TSUMUGI.
    zygosity_converter = {"het": "Hetero", "hom": "Homo", "hem": "Hemi"}
    life_stage_converter = {"middle": "Interval", "late": "Late", "early": "Early", "embryo": "Embryo"}

    # Parse each description in place: records are mutated, not copied.
    for record in disease_annotations:
        description = record["description"]
        description_split = description.split(" ")
        # The gene symbol is everything before the allele marker "<...>".
        marker_symbol = description.split("<")[0].strip()
        zygosity = description_split[-2]
        life_stage = description_split[-1]
        # Apply converters
        zygosity = zygosity_converter.get(zygosity, zygosity)
        life_stage = life_stage_converter.get(life_stage, life_stage)
        # Update record with new values
        record["marker_symbol"] = marker_symbol
        record["zygosity"] = zygosity
        record["life_stage"] = life_stage
        # Delete used fields
        del record["description"]

    # Using marker_symbol as the key makes it easier to join with IMPC phenotype records later
    disease_annotations_by_gene = defaultdict(list)
    # Dedup via tuple(record.items()); note this depends on consistent dict
    # insertion order across records (all records are built the same way above).
    appended_records = set()
    for record in disease_annotations:
        if tuple(record.items()) not in appended_records:
            appended_records.add(tuple(record.items()))
            marker_symbol = record["marker_symbol"]
            # marker_symbol becomes the grouping key, so drop it from the record.
            del record["marker_symbol"]
            disease_annotations_by_gene[marker_symbol].append(record)

    return dict(disease_annotations_by_gene)
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterator
|
|
4
|
+
|
|
5
|
+
from TSUMUGI import annotator, filterer, formatter
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def build_genewise_phenotype_annotations(
    records: Iterator[dict], ontology_terms: dict, disease_annotations_by_gene: dict
) -> Iterator[dict]:
    """Build the gene-wise phenotype annotation stream.

    Pipeline: coerce numeric columns -> normalize zygosity -> take absolute
    effect sizes -> annotate life stage / sexual dimorphism / human diseases /
    significance -> subset columns -> drop obsolete MP terms -> keep one
    record (max effect size) per unique key combination.
    """
    # Numeric columns to coerce to float (empty strings become NaN).
    numeric_columns = [
        "p_value",
        "effect_size",
        "female_ko_effect_p_value",
        "female_ko_parameter_estimate",
        "male_ko_effect_p_value",
        "male_ko_parameter_estimate",
    ]
    pipeline = formatter.floatinize_columns(records, numeric_columns)

    # Normalize zygosity wording.
    pipeline = formatter.format_zygosity(
        pipeline, {"heterozygote": "Hetero", "homozygote": "Homo", "hemizygote": "Hemi"}
    )

    # Effect sizes are compared by magnitude only.
    pipeline = formatter.abs_effect_size(
        pipeline, ["effect_size", "female_ko_parameter_estimate", "male_ko_parameter_estimate"]
    )

    # Assays performed on embryos; used to derive the life stage.
    embryo_assays = {
        "E9.5",
        "E10.5",
        "E12.5",
        "Embryo LacZ",  # E12.5
        "E14.5",
        "E14.5-E15.5",
        "E18.5",
    }
    pipeline = annotator.annotate_life_stage(pipeline, embryo_assays)
    pipeline = annotator.annotate_sexual_dimorphism(pipeline, threshold=1e-4)
    pipeline = annotator.annotate_diseases(pipeline, disease_annotations_by_gene)
    pipeline = annotator.annotate_significant(pipeline)

    # Keep only the columns downstream builders consume.
    pipeline = filterer.subset_columns(
        pipeline,
        {
            "marker_symbol",
            "marker_accession_id",
            "mp_term_id",
            "mp_term_name",
            "zygosity",
            "life_stage",
            "sexual_dimorphism",
            "effect_size",
            "significant",
            "disease_annotation",
        },
    )

    # Keep only records whose mp_term_id is in the ontology file (= not obsolete).
    pipeline = (record for record in pipeline if record["mp_term_id"] in ontology_terms)

    # One record per unique key combination, keeping the max effect size.
    return filterer.distinct_records_with_max_effect(
        pipeline,
        ["marker_symbol", "mp_term_id", "zygosity", "life_stage", "sexual_dimorphism"],
    )
|
TSUMUGI/io_handler.py
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import csv
|
|
4
|
+
import gzip
|
|
5
|
+
import json
|
|
6
|
+
import pickle
|
|
7
|
+
import sys
|
|
8
|
+
from collections.abc import Iterable, Iterator
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
from tqdm import tqdm
|
|
12
|
+
|
|
13
|
+
from TSUMUGI import formatter
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def count_newline(file_path: str | Path, chunk_size: int = 1024 * 1024) -> int:
    """Count newline bytes in a plain or gzip-compressed file, chunk by chunk.

    Reading fixed-size binary chunks keeps memory usage flat regardless of
    file size; a ".gz" suffix selects transparent decompression.
    """
    opener = gzip.open if Path(file_path).suffix == ".gz" else open
    total = 0
    with opener(file_path, "rb") as stream:
        while True:
            chunk = stream.read(chunk_size)
            if not chunk:
                break
            total += chunk.count(b"\n")
    return total
def load_csv_as_dicts(file_path: str | Path, encoding: str = "utf-8") -> Iterator[dict[str, str]]:
    """Stream a CSV file (optionally gzip-compressed), yielding one
    {header: value} dict per row.
    """
    path = Path(file_path)

    # A ".gz" suffix selects transparent gzip decompression.
    opener = gzip.open if path.suffix == ".gz" else open

    # Open in text mode; newline="" is required for the csv module.
    with opener(path, mode="rt", newline="", encoding=encoding) as handle:
        yield from map(dict, csv.DictReader(handle))
def write_pickle(obj: object, file_path: str | Path) -> None:
    """Save any Python object in pickle format.

    Uses the highest available pickle protocol. Note: the annotation is
    ``object`` — the builtin ``any`` is a function, not a type.
    """
    with open(file_path, "wb") as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
def read_pickle(file_path: str | Path) -> object:
    """Load a pickle file and return the stored Python object.

    Note: the return annotation is ``object`` — the builtin ``any`` is a
    function, not a type. Only unpickle trusted files.
    """
    with open(file_path, "rb") as f:
        return pickle.load(f)
def write_pickle_iter(iterable: Iterator, file_path: str | Path) -> None:
    """Stream-pickle an iterable: each element is dumped as its own pickle
    frame, so generators can be written without materializing them in memory.
    """
    with open(file_path, "wb") as sink:
        dump = pickle.dump  # hoist the attribute lookup out of the loop
        for element in iterable:
            dump(element, sink, protocol=pickle.HIGHEST_PROTOCOL)
def read_pickle_iter(file_path: str | Path) -> Iterator[object]:
    """Lazily yield objects from a stream-pickled file until EOF.

    Counterpart to write_pickle_iter. Note: the annotation uses ``object`` —
    the builtin ``any`` is a function, not a type.
    """
    with open(file_path, "rb") as f:
        while True:
            try:
                yield pickle.load(f)
            except EOFError:
                # Normal termination: no more pickle frames in the file.
                break
def parse_obo_file(file_path: str | Path) -> dict[str, dict[str, str]]:
    """Parse an ontology file (OBO format) and extract term information.

    Returns a mapping from term id to a dict with keys: id, name,
    is_a (list of parent term ids), is_obsolete. Obsolete terms are skipped.

    Bug fix vs. original: a file whose final [Term] stanza is not followed
    by a blank line no longer drops that last term (flushed at EOF).
    """
    ontology_terms: dict[str, dict[str, str]] = {}
    current_term_data = None

    with open(file_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()

            # Start of a new term stanza.
            if line == "[Term]":
                current_term_data = {}
                continue

            # Any other stanza header (e.g. [Typedef]) ends term collection.
            if line.startswith("[") and line.endswith("]") and line != "[Term]":
                current_term_data = None
                continue

            # Lines outside a [Term] stanza (e.g. the file header) are ignored.
            if current_term_data is None:
                continue

            if ":" in line:
                key, value = line.split(":", 1)
                key = key.strip()
                value = value.strip()

                if key == "id":
                    current_term_data["id"] = value
                elif key == "name":
                    current_term_data["name"] = value
                elif key == "is_a":
                    if "is_a" not in current_term_data:
                        current_term_data["is_a"] = []
                    # "is_a: MP:0000001 ! parent name" -> keep only the id.
                    parent_id = value.split("!")[0].strip()
                    current_term_data["is_a"].append(parent_id)
                elif key == "is_obsolete":
                    current_term_data["is_obsolete"] = value.lower() == "true"

            # A blank line terminates the current stanza: store it if valid.
            if line == "" and current_term_data and "id" in current_term_data:
                if not current_term_data.get("is_obsolete", False):
                    ontology_terms[current_term_data["id"]] = current_term_data
                current_term_data = None

    # Flush a final term that was not followed by a blank line before EOF.
    if current_term_data and "id" in current_term_data:
        if not current_term_data.get("is_obsolete", False):
            ontology_terms[current_term_data["id"]] = current_term_data

    return ontology_terms
def parse_impc_phenodigm(file_path: str | Path) -> dict[str, list[dict[str, str]]]:
    """Read the IMPC Phenodigm CSV and return disease annotations grouped by gene."""
    with open(file_path) as handle:
        rows = list(csv.DictReader(handle))
    return formatter.format_disease_annotations(rows)
def read_jsonl(path_jsonl: str | Path | None) -> Iterator[dict]:
    """Stream records from a JSONL file (.jsonl or .jsonl.gz).

    None, "-", or sys.stdin itself selects standard input. Blank lines are
    skipped. Because this is a generator, the file stays open while it is
    being consumed (no 'I/O operation on closed file').
    """
    use_stdin = path_jsonl is None or str(path_jsonl) == "-" or path_jsonl == sys.stdin
    if use_stdin:
        for raw in sys.stdin:
            if raw.strip():
                yield json.loads(raw)
        return

    path = Path(path_jsonl)
    # A ".gz" suffix selects transparent gzip decompression.
    opener = gzip.open if path.suffix == ".gz" else open
    with opener(path, "rt", encoding="utf-8") as handle:
        for raw in handle:
            if raw.strip():
                yield json.loads(raw)
def write_jsonl(records: Iterable[dict], path_jsonl: str | Path | None) -> None:
    """Write an iterable of records as JSONL (.jsonl or .jsonl.gz).

    A ".gz" suffix enables gzip compression at level 9. Progress is shown
    with tqdm.

    NOTE(review): the annotation admits None, but Path(None) raises
    TypeError — confirm whether a stdout fallback was intended.
    """
    path = Path(path_jsonl)

    if path.suffix == ".gz":
        def _open(target: Path, mode: str, encoding: str):
            return gzip.open(target, mode, encoding=encoding, compresslevel=9)
    else:
        def _open(target: Path, mode: str, encoding: str):
            return open(target, mode, encoding=encoding)

    message = f"Writing JSONL to {path_jsonl}"
    with _open(path, "wt", encoding="utf-8") as sink:
        for record in tqdm(records, desc=message):
            json.dump(record, sink, ensure_ascii=False)
            sink.write("\n")
def write_jsonl_to_stdout(record: dict) -> None:
    """Emit one record as a JSONL line on stdout, exiting quietly on a
    broken pipe (e.g. when piped into `head`)."""
    try:
        sys.stdout.write(json.dumps(record, ensure_ascii=False))
        sys.stdout.write("\n")
    except BrokenPipeError:
        # Close stdout best-effort so Python doesn't print a warning at exit.
        try:
            sys.stdout.close()
        except Exception:
            pass
        sys.exit(0)