TSUMUGI 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. TSUMUGI/annotator.py +103 -0
  2. TSUMUGI/argparser.py +599 -0
  3. TSUMUGI/core.py +185 -0
  4. TSUMUGI/data/impc_phenodigm.csv +3406 -0
  5. TSUMUGI/data/mp.obo +143993 -0
  6. TSUMUGI/filterer.py +36 -0
  7. TSUMUGI/formatter.py +122 -0
  8. TSUMUGI/genewise_annotation_builder.py +94 -0
  9. TSUMUGI/io_handler.py +189 -0
  10. TSUMUGI/main.py +300 -0
  11. TSUMUGI/network_constructor.py +603 -0
  12. TSUMUGI/ontology_handler.py +62 -0
  13. TSUMUGI/pairwise_similarity_builder.py +66 -0
  14. TSUMUGI/report_generator.py +122 -0
  15. TSUMUGI/similarity_calculator.py +498 -0
  16. TSUMUGI/subcommands/count_filterer.py +47 -0
  17. TSUMUGI/subcommands/genes_filterer.py +89 -0
  18. TSUMUGI/subcommands/graphml_builder.py +158 -0
  19. TSUMUGI/subcommands/life_stage_filterer.py +48 -0
  20. TSUMUGI/subcommands/mp_filterer.py +142 -0
  21. TSUMUGI/subcommands/score_filterer.py +22 -0
  22. TSUMUGI/subcommands/sex_filterer.py +48 -0
  23. TSUMUGI/subcommands/webapp_builder.py +358 -0
  24. TSUMUGI/subcommands/zygosity_filterer.py +48 -0
  25. TSUMUGI/validator.py +65 -0
  26. TSUMUGI/web/app/css/app.css +1129 -0
  27. TSUMUGI/web/app/genelist/network_genelist.html +339 -0
  28. TSUMUGI/web/app/genelist/network_genelist.js +421 -0
  29. TSUMUGI/web/app/js/data/dataLoader.js +41 -0
  30. TSUMUGI/web/app/js/export/graphExporter.js +214 -0
  31. TSUMUGI/web/app/js/graph/centrality.js +495 -0
  32. TSUMUGI/web/app/js/graph/components.js +30 -0
  33. TSUMUGI/web/app/js/graph/filters.js +158 -0
  34. TSUMUGI/web/app/js/graph/highlighter.js +52 -0
  35. TSUMUGI/web/app/js/graph/layoutController.js +454 -0
  36. TSUMUGI/web/app/js/graph/valueScaler.js +43 -0
  37. TSUMUGI/web/app/js/search/geneSearcher.js +93 -0
  38. TSUMUGI/web/app/js/search/phenotypeSearcher.js +292 -0
  39. TSUMUGI/web/app/js/ui/dynamicFontSize.js +30 -0
  40. TSUMUGI/web/app/js/ui/mobilePanel.js +77 -0
  41. TSUMUGI/web/app/js/ui/slider.js +22 -0
  42. TSUMUGI/web/app/js/ui/tooltips.js +514 -0
  43. TSUMUGI/web/app/js/viewer/pageSetup.js +217 -0
  44. TSUMUGI/web/app/viewer.html +515 -0
  45. TSUMUGI/web/app/viewer.js +1593 -0
  46. TSUMUGI/web/css/sanitize.css +363 -0
  47. TSUMUGI/web/css/top.css +391 -0
  48. TSUMUGI/web/image/tsumugi-favicon.ico +0 -0
  49. TSUMUGI/web/image/tsumugi-icon.png +0 -0
  50. TSUMUGI/web/image/tsumugi-logo.png +0 -0
  51. TSUMUGI/web/image/tsumugi-logo.svg +69 -0
  52. TSUMUGI/web/js/genelist_formatter.js +123 -0
  53. TSUMUGI/web/js/top.js +338 -0
  54. TSUMUGI/web/open_webapp_linux.sh +25 -0
  55. TSUMUGI/web/open_webapp_mac.command +25 -0
  56. TSUMUGI/web/open_webapp_windows.bat +37 -0
  57. TSUMUGI/web/serve_index.py +110 -0
  58. TSUMUGI/web/template/template_index.html +197 -0
  59. TSUMUGI/web_deployer.py +150 -0
  60. tsumugi-1.0.1.dist-info/METADATA +504 -0
  61. tsumugi-1.0.1.dist-info/RECORD +64 -0
  62. tsumugi-1.0.1.dist-info/WHEEL +4 -0
  63. tsumugi-1.0.1.dist-info/entry_points.txt +3 -0
  64. tsumugi-1.0.1.dist-info/licenses/LICENSE +21 -0
TSUMUGI/filterer.py ADDED
@@ -0,0 +1,36 @@
+ from __future__ import annotations
+
+ from collections.abc import Generator, Iterable, Iterator
+ from itertools import groupby
+ from operator import itemgetter
+
+
+ def subset_columns(records: Iterator[dict[str, str]], columns: set[str]) -> Generator[dict[str, str]]:
+     """Yield dicts keeping only the requested columns; missing keys become empty strings."""
+     return ({col: record.get(col, "") for col in columns} for record in records)
+
+
+ ###########################################################
+ # Others
+ ###########################################################
+
+
+ def distinct_records_with_max_effect(
+     records: Iterable[dict[str, str | float]], unique_keys: list[str]
+ ) -> Generator[dict[str, str | float]]:
+     """
+     Group records by the specified keys and yield the record with the maximum
+     effect_size from each group.
+     Note: effect_size is already an absolute value.
+     """
+     # Dynamically define the key function based on unique_keys.
+     record_key_getter = itemgetter(*unique_keys)
+
+     # Pre-sort by the same key so that groupby works correctly.
+     records_sorted = sorted(records, key=record_key_getter)
+
+     for _, group in groupby(records_sorted, key=record_key_getter):
+         # Find the record with the maximum effect_size within the group.
+         # Use .get() to safely handle cases where the 'effect_size' key is missing.
+         record_with_max_effect = max(group, key=lambda r: r.get("effect_size", -1))
+         yield record_with_max_effect
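For orientation, a minimal sketch of how the two helpers compose; the records and column values below are hypothetical, not data shipped with the package (key order in the printed dict may vary, since columns is a set):

    from TSUMUGI.filterer import distinct_records_with_max_effect, subset_columns

    # Hypothetical input records.
    records = iter([
        {"marker_symbol": "Foxp2", "mp_term_id": "MP:0000001", "effect_size": 0.4},
        {"marker_symbol": "Foxp2", "mp_term_id": "MP:0000001", "effect_size": 0.9},
    ])
    # Keep two columns, then deduplicate on marker_symbol, retaining the larger effect_size.
    subset = subset_columns(records, {"marker_symbol", "effect_size"})
    print(list(distinct_records_with_max_effect(subset, ["marker_symbol"])))
    # [{'marker_symbol': 'Foxp2', 'effect_size': 0.9}]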
TSUMUGI/formatter.py ADDED
@@ -0,0 +1,122 @@
+ from __future__ import annotations
+
+ import math
+ from collections import defaultdict
+ from collections.abc import Generator, Iterable
+
+ ###########################################################
+ # String to Float
+ ###########################################################
+
+
+ def _to_float(x: str | None) -> float:
+     """Convert a string to float; empty/None becomes NaN."""
+     return float(x) if x not in (None, "") else float("nan")
+
+
+ def floatinize_columns(records: Iterable[dict[str, str]], columns: list[str]) -> Generator[dict[str, str | float]]:
+     """Yield records with the given numeric fields coerced to float/NaN."""
+     for record in records:
+         for col in columns:
+             record[col] = _to_float(record.get(col))
+         yield record
+
+
+ def abs_effect_size(
+     records: Iterable[dict[str, str | float]], effect_size_columns: list[str]
+ ) -> Generator[dict[str, str | float]]:
+     """Yield records with absolute effect sizes; NaN is replaced with 0."""
+     for record in records:
+         for col in effect_size_columns:
+             if math.isnan(record[col]):
+                 record[col] = 0.0
+             record[col] = abs(record[col])
+         yield record
+
+
+ ###########################################################
+ # Zygosity Formatting
+ ###########################################################
+
+
+ def format_zygosity(
+     records: Iterable[dict[str, str | float]], zygosity_converter: dict[str, str]
+ ) -> Generator[dict[str, str | float]]:
+     """Format zygosity values to a consistent style."""
+     for record in records:
+         record["zygosity"] = zygosity_converter.get(record["zygosity"], record["zygosity"])
+         yield record
+
+
+ ###########################################################
+ # IMPC human disease_annotations
+ ###########################################################
+
+
+ def _select_fields_from_disease_annotations(disease_annotations: list[dict[str, str]]) -> list[dict[str, str]]:
+     """Select the disorder_name and description fields from IMPC human disease_annotations."""
+
+     # Heuristic to identify the correct fields
+     prev_disorder_name = None
+     prev_description = None
+     for record in disease_annotations:
+         for key, value in record.items():
+             if "Syndrome" in value:
+                 prev_disorder_name = key
+             if "<em1(IMPC)Bay>" in value:
+                 prev_description = key
+         if prev_disorder_name is not None and prev_description is not None:
+             break
+
+     if prev_disorder_name is None or prev_description is None:
+         raise ValueError("Could not identify disorder_name or description fields in disease_annotations.")
+
+     # Select the fields
+     disease_annotations_selected = []
+     for record in disease_annotations:
+         disease_annotations_selected.append(
+             {"disorder_name": record[prev_disorder_name], "description": record[prev_description]}
+         )
+
+     return disease_annotations_selected
+
+
+ def format_disease_annotations(disease_annotations: list[dict[str, str | float]]) -> dict[str, list[dict[str, str]]]:
+     """Format the IMPC human disease_annotations for output."""
+
+     # Select the "disorder_name" and "description" fields
+     disease_annotations = _select_fields_from_disease_annotations(disease_annotations)
+
+     # Filter out records with an unexpected description format; expected: "<allele_symbol> <zygosity> <life_stage>"
+     disease_annotations = [record for record in disease_annotations if len(record["description"].split(" ")) == 3]
+
+     zygosity_converter = {"het": "Hetero", "hom": "Homo", "hem": "Hemi"}
+     life_stage_converter = {"middle": "Interval", "late": "Late", "early": "Early", "embryo": "Embryo"}
+
+     for record in disease_annotations:
+         description = record["description"]
+         description_split = description.split(" ")
+         marker_symbol = description.split("<")[0].strip()
+         zygosity = description_split[-2]
+         life_stage = description_split[-1]
+         # Apply converters
+         zygosity = zygosity_converter.get(zygosity, zygosity)
+         life_stage = life_stage_converter.get(life_stage, life_stage)
+         # Update the record with the new values
+         record["marker_symbol"] = marker_symbol
+         record["zygosity"] = zygosity
+         record["life_stage"] = life_stage
+         # Delete the consumed field
+         del record["description"]
+
+     # Using marker_symbol as the key makes it easier to join with IMPC phenotype records later
+     disease_annotations_by_gene = defaultdict(list)
+     appended_records = set()
+     for record in disease_annotations:
+         if tuple(record.items()) not in appended_records:
+             appended_records.add(tuple(record.items()))
+             marker_symbol = record["marker_symbol"]
+             del record["marker_symbol"]
+             disease_annotations_by_gene[marker_symbol].append(record)
+
+     return dict(disease_annotations_by_gene)
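As a usage sketch, format_disease_annotations can be exercised on two made-up Phenodigm-style rows; the field names ("disorder", "model") are illustrative only, since _select_fields_from_disease_annotations discovers the real ones heuristically:

    from TSUMUGI.formatter import format_disease_annotations

    # Hypothetical rows mimicking the IMPC Phenodigm export.
    rows = [
        {"disorder": "Example Syndrome A", "model": "Foxp2<em1(IMPC)Bay> hom early"},
        {"disorder": "Example Syndrome B", "model": "Trp53<em1(IMPC)Bay> het late"},
    ]
    print(format_disease_annotations(rows))
    # {'Foxp2': [{'disorder_name': 'Example Syndrome A', 'zygosity': 'Homo', 'life_stage': 'Early'}],
    #  'Trp53': [{'disorder_name': 'Example Syndrome B', 'zygosity': 'Hetero', 'life_stage': 'Late'}]}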
TSUMUGI/genewise_annotation_builder.py ADDED
@@ -0,0 +1,94 @@
+ from __future__ import annotations
+
+ from collections.abc import Iterator
+
+ from TSUMUGI import annotator, filterer, formatter
+
+
+ def build_genewise_phenotype_annotations(
+     records: Iterator[dict], ontology_terms: dict, disease_annotations_by_gene: dict
+ ) -> Iterator[dict]:
+     ###########################################################
+     # Preprocess data
+     ###########################################################
+
+     # --------------------------------------------------------
+     # Select columns, maintained MP terms, and significant genes
+     # --------------------------------------------------------
+
+     # Coerce numeric columns to float
+     float_columns = [
+         "p_value",
+         "effect_size",
+         "female_ko_effect_p_value",
+         "female_ko_parameter_estimate",
+         "male_ko_effect_p_value",
+         "male_ko_parameter_estimate",
+     ]
+     records_formatted = formatter.floatinize_columns(records, float_columns)
+
+     # Format zygosity
+     zygosity_converter = {"heterozygote": "Hetero", "homozygote": "Homo", "hemizygote": "Hemi"}
+     records_formatted = formatter.format_zygosity(records_formatted, zygosity_converter)
+     # Take the absolute value of effect sizes
+     effect_size_columns = ["effect_size", "female_ko_parameter_estimate", "male_ko_parameter_estimate"]
+     records_formatted = formatter.abs_effect_size(records_formatted, effect_size_columns)
+
+     # --------------------------------------------------------
+     # Annotate life stage and sexual dimorphism
+     # --------------------------------------------------------
+
+     embryo_assays = {
+         "E9.5",
+         "E10.5",
+         "E12.5",
+         "Embryo LacZ",  # E12.5
+         "E14.5",
+         "E14.5-E15.5",
+         "E18.5",
+     }
+     # Life stage
+     records_annotated = annotator.annotate_life_stage(records_formatted, embryo_assays)
+     # Sexual dimorphism
+     records_annotated = annotator.annotate_sexual_dimorphism(records_annotated, threshold=1e-4)
+     # Human diseases
+     records_annotated = annotator.annotate_diseases(records_annotated, disease_annotations_by_gene)
+     # Significance flag (True/False)
+     records_annotated = annotator.annotate_significant(records_annotated)
+
+     # --------------------------------------------------------
+     # Filter records
+     # --------------------------------------------------------
+     records_filtered = records_annotated
+
+     # Subset columns
+     to_keep_columns = {
+         "marker_symbol",
+         "marker_accession_id",
+         "mp_term_id",
+         "mp_term_name",
+         "zygosity",
+         "life_stage",
+         "sexual_dimorphism",
+         "effect_size",
+         "significant",
+         "disease_annotation",
+     }
+     records_filtered = filterer.subset_columns(records_filtered, to_keep_columns)
+
+     # Keep only records whose mp_term_id is in the ontology file (i.e., not obsolete)
+     records_filtered = (record for record in records_filtered if record["mp_term_id"] in ontology_terms)
+
+     # Deduplicate records, keeping the one with the maximum effect size
+     unique_keys = [
+         "marker_symbol",
+         "mp_term_id",
+         "zygosity",
+         "life_stage",
+         "sexual_dimorphism",
+     ]
+     records_filtered = filterer.distinct_records_with_max_effect(records_filtered, unique_keys)
+
+     genewise_phenotype_annotations = records_filtered
+
+     return genewise_phenotype_annotations
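The builder only composes generators, so no phenotype records are processed until the returned iterator is consumed. A minimal driver, assuming the io_handler helpers shown further below and a hypothetical input CSV path (the mp.obo and impc_phenodigm.csv paths refer to the data files that ship with the wheel):

    from TSUMUGI import genewise_annotation_builder, io_handler

    records = io_handler.load_csv_as_dicts("statistical_results.csv.gz")  # hypothetical input
    ontology_terms = io_handler.parse_obo_file("TSUMUGI/data/mp.obo")
    diseases_by_gene = io_handler.parse_impc_phenodigm("TSUMUGI/data/impc_phenodigm.csv")

    annotations = genewise_annotation_builder.build_genewise_phenotype_annotations(
        records, ontology_terms, diseases_by_gene
    )
    for record in annotations:  # evaluation happens here
        io_handler.write_jsonl_to_stdout(record)

Note that filterer.distinct_records_with_max_effect sorts its input, so the full record set is held in memory at that stage of the pipeline.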
TSUMUGI/io_handler.py ADDED
@@ -0,0 +1,189 @@
+ from __future__ import annotations
+
+ import csv
+ import gzip
+ import json
+ import pickle
+ import sys
+ from collections.abc import Iterable, Iterator
+ from pathlib import Path
+ from typing import Any
+
+ from tqdm import tqdm
+
+ from TSUMUGI import formatter
+
+
+ def count_newline(file_path: str | Path, chunk_size: int = 1024 * 1024) -> int:
+     """Efficiently count newline characters in a plain-text or gzip-compressed file."""
+     open_func = gzip.open if Path(file_path).suffix == ".gz" else open
+     count = 0
+     with open_func(file_path, "rb") as f:
+         while chunk := f.read(chunk_size):
+             count += chunk.count(b"\n")
+     return count
+
+
+ def load_csv_as_dicts(file_path: str | Path, encoding: str = "utf-8") -> Iterator[dict[str, str]]:
+     """
+     Read a CSV file and yield each row as a {header: value} dict.
+     """
+     file_path = Path(file_path)
+
+     # Detect gzip-compressed files
+     open_func = gzip.open if file_path.suffix == ".gz" else open
+
+     # Open and read as text
+     with open_func(file_path, mode="rt", newline="", encoding=encoding) as f:
+         reader = csv.DictReader(f)
+         for row in reader:
+             yield dict(row)
+
+
+ def write_pickle(obj: Any, file_path: str | Path) -> None:
+     """
+     Save any Python object in pickle format.
+     """
+     with open(file_path, "wb") as f:
+         pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
+
+
+ def read_pickle(file_path: str | Path) -> Any:
+     """
+     Load a pickle file and return it as a Python object.
+     """
+     with open(file_path, "rb") as f:
+         return pickle.load(f)
+
+
+ def write_pickle_iter(iterable: Iterator, file_path: str | Path) -> None:
+     """
+     Sequential (streaming) pickle save.
+     Write element by element from an iterable (including generators) without loading it all into memory.
+     """
+     with open(file_path, "wb") as f:
+         for item in iterable:
+             pickle.dump(item, f, protocol=pickle.HIGHEST_PROTOCOL)
+
+
+ def read_pickle_iter(file_path: str | Path) -> Iterator[Any]:
+     """
+     Generator that reads pickled objects sequentially (yields until EOF).
+     """
+     with open(file_path, "rb") as f:
+         while True:
+             try:
+                 yield pickle.load(f)
+             except EOFError:
+                 break
+
+
+ def parse_obo_file(file_path: str | Path) -> dict[str, dict[str, str]]:
+     """Parse an ontology file (OBO format) and extract term information.
+     Returns a dict with keys: id, name, is_a (parent terms), is_obsolete.
+     """
+     ontology_terms = {}
+     current_term_data = None
+     with open(file_path, encoding="utf-8") as f:
+         for line in f:
+             line = line.strip()
+
+             if line == "[Term]":
+                 current_term_data = {}
+                 continue
+
+             if line.startswith("[") and line.endswith("]") and line != "[Term]":
+                 current_term_data = None
+                 continue
+
+             if current_term_data is None:
+                 continue
+
+             if ":" in line:
+                 key, value = line.split(":", 1)
+                 key = key.strip()
+                 value = value.strip()
+
+                 if key == "id":
+                     current_term_data["id"] = value
+                 elif key == "name":
+                     current_term_data["name"] = value
+                 elif key == "is_a":
+                     if "is_a" not in current_term_data:
+                         current_term_data["is_a"] = []
+                     parent_id = value.split("!")[0].strip()
+                     current_term_data["is_a"].append(parent_id)
+                 elif key == "is_obsolete":
+                     current_term_data["is_obsolete"] = value.lower() == "true"
+
+             if line == "" and current_term_data and "id" in current_term_data:
+                 if not current_term_data.get("is_obsolete", False):
+                     ontology_terms[current_term_data["id"]] = current_term_data
+                 current_term_data = None
+
+     # Flush the final term in case the file does not end with a blank line
+     if current_term_data and "id" in current_term_data and not current_term_data.get("is_obsolete", False):
+         ontology_terms[current_term_data["id"]] = current_term_data
+
+     return ontology_terms
+
+
+ def parse_impc_phenodigm(file_path: str | Path) -> dict[str, list[dict[str, str]]]:
+     with open(Path(file_path)) as f:
+         disease_annotations_by_gene: dict[str, list[dict[str, str]]] = formatter.format_disease_annotations(
+             list(csv.DictReader(f))
+         )
+     return disease_annotations_by_gene
+
+
+ def read_jsonl(path_jsonl: str | Path | None) -> Iterator[dict]:
+     """
+     Stream JSONL (.jsonl or .jsonl.gz). If path_jsonl is None or "-", read from stdin.
+     Keeps the file open during iteration (avoids 'I/O operation on closed file' errors).
+     """
+     # stdin
+     if path_jsonl is None or str(path_jsonl) == "-" or path_jsonl == sys.stdin:
+         for line in sys.stdin:
+             if line.strip():
+                 yield json.loads(line)
+         return
+
+     # file / gzip
+     p = Path(path_jsonl)
+     open_func = gzip.open if p.suffix == ".gz" else open
+
+     with open_func(p, "rt", encoding="utf-8") as f:
+         for line in f:
+             if line.strip():
+                 yield json.loads(line)
+
+
+ def write_jsonl(records: Iterable[dict], path_jsonl: str | Path) -> None:
+     """
+     Write an iterable of records as JSONL (.jsonl or .jsonl.gz).
+
+     If the filename ends with .gz, use gzip compression (compresslevel=9).
+     """
+     p = Path(path_jsonl)
+
+     def open_text_file(path: Path, mode: str, encoding: str):
+         return open(path, mode, encoding=encoding)
+
+     def open_gzip_file(path: Path, mode: str, encoding: str):
+         return gzip.open(path, mode, encoding=encoding, compresslevel=9)
+
+     open_func = open_gzip_file if p.suffix == ".gz" else open_text_file
+
+     message = f"Writing JSONL to {path_jsonl}"
+     with open_func(p, "wt", encoding="utf-8") as f:
+         for record in tqdm(records, desc=message):
+             json.dump(record, f, ensure_ascii=False)
+             f.write("\n")
+
+
+ def write_jsonl_to_stdout(record: dict) -> None:
+     """Write a record as JSONL and suppress BrokenPipeError cleanly."""
+     try:
+         json.dump(record, sys.stdout, ensure_ascii=False)
+         sys.stdout.write("\n")
+     except BrokenPipeError:
+         try:
+             sys.stdout.close()
+         except Exception:
+             pass
+         sys.exit(0)
+ sys.exit(0)