omicsync 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,147 @@
1
+ """Generic CSV/TSV loader for omicsync."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from pathlib import Path
7
+ from typing import Dict, Optional, Union
8
+
9
+ import pandas as pd
10
+
11
+ from omicsync.core.dataset import OmicsDataset
12
+ from omicsync.core.modality import make_modality, OmicsModality
13
+ from omicsync.utils.logging import get_logger
14
+ from omicsync.utils.validation import validate_modality_type
15
+
16
# Module-level logger for the CSV/TSV loader, obtained from the project's
# get_logger helper under the name "loaders.csv".
+ logger = get_logger("loaders.csv")
17
+
18
+
19
+ def _detect_separator(path: Union[str, Path]) -> str:
20
+ path = Path(path)
21
+ suffix = path.suffix.lower()
22
+ if suffix in (".tsv", ".txt"):
23
+ return "\t"
24
+ if suffix == ".csv":
25
+ return ","
26
+ # Peek at first line to detect
27
+ with open(path, "r", encoding="utf-8") as fh:
28
+ first_line = fh.readline()
29
+ if first_line.count("\t") > first_line.count(","):
30
+ return "\t"
31
+ return ","
32
+
33
+
34
def load_csv(
    path: Union[str, Path],
    modality_type: str,
    sample_col: Optional[str] = "sample_id",
    feature_orientation: str = "samples_as_rows",
    source: str = "csv",
    **kwargs,
) -> OmicsModality:
    """Load a single CSV/TSV file into an :class:`~omicsync.core.modality.OmicsModality`.

    Parameters
    ----------
    path:
        Path to the CSV or TSV file.
    modality_type:
        One of ``"rna"``, ``"mutations"``, ``"methylation"``, ``"cnv"``,
        ``"protein"``.
    sample_col:
        With ``feature_orientation="samples_as_rows"``: name of the column
        holding sample IDs; it must exist and becomes the index. Set to
        ``None`` to keep the index produced by :func:`pandas.read_csv`.
        With ``feature_orientation="samples_as_columns"``: used as the index
        before transposing only if a column of that name exists; otherwise
        the index produced by :func:`pandas.read_csv` is kept (pass e.g.
        ``index_col=0`` through ``**kwargs`` to choose the feature-ID column).
    feature_orientation:
        ``"samples_as_rows"`` (default) — rows are samples, columns are
        features. ``"samples_as_columns"`` — transpose after reading.
    source:
        Source label passed to ``make_modality``.
    **kwargs:
        Additional keyword arguments forwarded to :func:`pandas.read_csv`.
        The separator is auto-detected only when the caller supplies none of
        ``sep``, ``delimiter`` or ``delim_whitespace``.

    Returns
    -------
    OmicsModality
        The object returned by ``make_modality`` for *modality_type*.

    Raises
    ------
    FileNotFoundError
        If *path* does not exist.
    ValueError
        If *feature_orientation* is invalid, or if *sample_col* is missing
        from the file in ``"samples_as_rows"`` mode. *modality_type* is
        checked by ``validate_modality_type``.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    validate_modality_type(modality_type)
    if feature_orientation not in ("samples_as_rows", "samples_as_columns"):
        raise ValueError(
            f"Unknown feature_orientation {feature_orientation!r}. "
            "Valid: 'samples_as_rows', 'samples_as_columns'."
        )

    # Sniff the separator only when the caller did not choose one. Detecting
    # unconditionally costs an extra file read and, combined with a
    # user-supplied ``delimiter``/``delim_whitespace``, makes pandas reject
    # the call for specifying the delimiter twice.
    caller_chose_delimiter = (
        "sep" in kwargs
        or kwargs.get("delimiter") is not None
        or bool(kwargs.get("delim_whitespace"))
    )
    if not caller_chose_delimiter:
        kwargs["sep"] = _detect_separator(path)
    df = pd.read_csv(path, **kwargs)

    if feature_orientation == "samples_as_rows":
        if sample_col is not None:
            if sample_col not in df.columns:
                raise ValueError(
                    f"sample_col={sample_col!r} not found in columns: {df.columns.tolist()[:10]}..."
                )
            df = df.set_index(sample_col)
    else:
        # Best effort: use sample_col as the index only if it is present.
        if sample_col is not None and sample_col in df.columns:
            df = df.set_index(sample_col)
        df = df.T

    # Non-numeric cells become NaN rather than raising.
    df = df.apply(pd.to_numeric, errors="coerce")

    logger.info(
        "load_csv: loaded %s modality from %s — shape %s.",
        modality_type,
        path.name,
        df.shape,
    )
    return make_modality(df, modality_type=modality_type, source=source)
110
+
111
+
112
def load_multimodal_csv(
    paths_dict: Dict[str, Union[str, Path]],
    modality_types: Optional[Dict[str, str]] = None,
    study_id: str = "custom",
    **kwargs,
) -> OmicsDataset:
    """Build an :class:`~omicsync.core.dataset.OmicsDataset` from several CSV/TSV files.

    Each entry of *paths_dict* is loaded with :func:`load_csv` and stored
    under its modality name.

    Parameters
    ----------
    paths_dict:
        Mapping from modality name to file path.
    modality_types:
        Optional mapping from modality name to modality_type string. Names
        missing from the mapping (or all names, when ``None``) use the
        modality name itself as the type.
    study_id:
        Study identifier passed to the resulting dataset.
    **kwargs:
        Forwarded unchanged to :func:`load_csv` for every modality.

    Returns
    -------
    OmicsDataset

    Raises
    ------
    ValueError
        Propagated from :func:`load_csv`, e.g. when a resolved modality type
        is rejected.
    """
    type_overrides = modality_types or {}
    loaded: Dict[str, OmicsModality] = {}

    for modality_name in paths_dict:
        file_path = paths_dict[modality_name]
        # Fall back to the modality name when no explicit type is mapped.
        resolved_type = type_overrides.get(modality_name, modality_name)
        logger.info("load_multimodal_csv: loading %r from %s.", modality_name, file_path)
        loaded[modality_name] = load_csv(
            file_path,
            modality_type=resolved_type,
            **kwargs,
        )

    return OmicsDataset(loaded, study_id=study_id)
@@ -0,0 +1,111 @@
1
+ """GEO loader using GEOparse."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+ from omicsync.core.modality import make_modality, OmicsModality
11
+ from omicsync.utils.logging import get_logger
12
+
13
# Module-level logger for the GEO loader, obtained from the project's
# get_logger helper under the name "loaders.geo".
+ logger = get_logger("loaders.geo")
14
+
15
+
16
def _select_geo_columns(table: pd.DataFrame) -> tuple:
    """Pick the ``(id_col, value_col)`` pair for one non-empty GSM table.

    Either element is ``None`` when no suitable column exists. The two
    columns are always distinct, so the identifier can be moved into the
    index without consuming the value column.
    """
    columns = list(table.columns)

    if "VALUE" in columns:
        if "ID_REF" in columns:
            return "ID_REF", "VALUE"
        # No ID_REF: use the first column that is not the value column.
        others = [col for col in columns if col != "VALUE"]
        return (others[0] if others else None), "VALUE"

    # No VALUE column: fix the identifier first, then take the first numeric
    # column that is not the identifier.
    id_col = "ID_REF" if "ID_REF" in columns else columns[0]
    for col in columns:
        if col != id_col and pd.api.types.is_numeric_dtype(table[col]):
            return id_col, col
    return id_col, None


def load_geo(
    accession: str,
    modality_type: str,
    destdir: str = ".",
    silent: bool = True,
    **kwargs,
) -> OmicsModality:
    """Download and parse a GEO series into an :class:`~omicsync.core.modality.OmicsModality`.

    Requires ``GEOparse`` to be installed (``pip install GEOparse``).

    One value column is taken from every sample (GSM) table and the columns
    are assembled into a samples × features matrix. Samples from all
    platforms of the series are combined.

    Parameters
    ----------
    accession:
        GEO series accession, e.g. ``"GSE12345"``.
    modality_type:
        One of ``"rna"``, ``"mutations"``, ``"methylation"``, ``"cnv"``,
        ``"protein"``.
    destdir:
        Directory to download GEO files into.
    silent:
        Suppress GEOparse download progress output (default ``True``).
    **kwargs:
        Additional keyword arguments forwarded to
        :func:`GEOparse.get_GEO`.

    Returns
    -------
    OmicsModality

    Raises
    ------
    ImportError
        If ``GEOparse`` is not installed.
    ValueError
        If the series contains no samples, or none of the samples yields a
        usable value column.
    """
    try:
        import GEOparse
    except ImportError as exc:
        raise ImportError(
            "GEOparse is required for load_geo(). "
            "Install it with: pip install GEOparse"
        ) from exc

    logger.info("load_geo: fetching %s from NCBI GEO.", accession)
    gse = GEOparse.get_GEO(accession, destdir=destdir, silent=silent, **kwargs)

    platforms = gse.gpls
    if len(platforms) > 1:
        # No per-platform filtering happens below, so the message says so.
        logger.warning(
            "load_geo: %s has %d platforms (%s). "
            "Samples from all platforms are combined; consider filtering manually.",
            accession,
            len(platforms),
            list(platforms.keys()),
        )

    # Build expression matrix from GSMs
    gsms = gse.gsms
    if not gsms:
        raise ValueError(f"GEO series {accession} contains no samples (GSMs).")

    frames = {}
    for sample_name, gsm in gsms.items():
        table = gsm.table
        if table.empty:
            logger.warning("load_geo: sample %s has an empty table; skipping.", sample_name)
            continue

        # Prefer "VALUE", else the first numeric column; never reuse the
        # identifier column as the value column.
        id_col, value_col = _select_geo_columns(table)
        if value_col is None:
            logger.warning("load_geo: cannot find value column in sample %s.", sample_name)
            continue
        if id_col is None:
            logger.warning(
                "load_geo: cannot find identifier column in sample %s.", sample_name
            )
            continue
        frames[sample_name] = table.set_index(id_col)[value_col]

    if not frames:
        raise ValueError(f"No usable data found in GEO series {accession}.")

    # Dict of per-sample Series -> features × samples; transpose to samples × features.
    df = pd.DataFrame(frames).T
    df.index.name = "sample_id"
    df = df.apply(pd.to_numeric, errors="coerce")

    logger.info(
        "load_geo: loaded %s — %d samples × %d features.",
        accession,
        df.shape[0],
        df.shape[1],
    )
    return make_modality(df, modality_type=modality_type, source=f"geo:{accession}")
@@ -0,0 +1,239 @@
1
+ """Open Targets Platform GraphQL API loader."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import time
6
+ from typing import Dict, List, Optional, Sequence
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ import requests
11
+
12
+ from omicsync.core.dataset import OmicsDataset
13
+ from omicsync.utils.logging import get_logger
14
+
15
# Module-level logger for the Open Targets loader, obtained from the
# project's get_logger helper under the name "loaders.open_targets".
+ logger = get_logger("loaders.open_targets")
16
+
17
# Default endpoint used by _graphql_request for every POST.
+ _OT_GRAPHQL_URL = "https://api.platform.opentargets.org/api/v4/graphql"
18
+
19
# GraphQL document sent by load_open_targets_targets with the variables
# "size", and optionally "diseaseIds" and "cursor".
# NOTE(review): $targetIds is declared in the operation signature but is never
# referenced in the selection below; GraphQL's "All Variables Used" validation
# rule presumably rejects such a document — confirm against the live API.
# NOTE(review): verify that a root-level associatedTargets(diseaseIds, size,
# cursor) field exists in the current Open Targets schema — TODO confirm.
+ _ASSOCIATION_QUERY = """
20
+ query targetDiseaseAssociations(
21
+ $diseaseIds: [String!],
22
+ $targetIds: [String!],
23
+ $size: Int!,
24
+ $cursor: String
25
+ ) {
26
+ associations: associatedTargets(
27
+ diseaseIds: $diseaseIds
28
+ size: $size
29
+ cursor: $cursor
30
+ ) {
31
+ count
32
+ cursor
33
+ rows {
34
+ target {
35
+ id
36
+ approvedSymbol
37
+ }
38
+ disease {
39
+ id
40
+ name
41
+ }
42
+ score
43
+ datatypeScores {
44
+ id
45
+ score
46
+ }
47
+ }
48
+ }
49
+ }
50
+ """
51
+
52
# Maps output DataFrame column name -> id looked up in a row's
# datatypeScores list (see load_open_targets_targets). Currently an identity
# mapping; a datatype id absent from a row yields NaN in that column.
+ _DATATYPE_COLUMNS = {
53
+ "genetic_association": "genetic_association",
54
+ "somatic_mutation": "somatic_mutation",
55
+ "literature": "literature",
56
+ "rna_expression": "rna_expression",
57
+ "animal_model": "animal_model",
58
+ "affected_pathway": "affected_pathway",
59
+ }
60
+
61
+
62
def _graphql_request(
    payload: Dict,
    url: str = _OT_GRAPHQL_URL,
    max_retries: int = 5,
    backoff_factor: float = 1.0,
) -> Dict:
    """POST *payload* as JSON to *url*, retrying failures with exponential backoff.

    Any ``requests`` exception (including a non-2xx status raised by
    ``raise_for_status``) triggers a retry after
    ``backoff_factor * 2 ** attempt`` seconds. When the final attempt also
    fails, a :class:`RuntimeError` chained to the last exception is raised.

    Returns the decoded JSON body of the first successful response.
    """
    attempt = 0
    while attempt < max_retries:
        try:
            reply = requests.post(url, json=payload, timeout=30)
            reply.raise_for_status()
            return reply.json()
        except requests.exceptions.RequestException as exc:
            attempts_made = attempt + 1
            if attempts_made == max_retries:
                # Out of retries: surface the last failure to the caller.
                raise RuntimeError(
                    f"Open Targets API request failed after {max_retries} attempts: {exc}"
                ) from exc
            delay = backoff_factor * (2 ** attempt)
            logger.warning(
                "Open Targets request failed (attempt %d/%d); retrying in %.1fs.",
                attempts_made, max_retries, delay,
            )
            time.sleep(delay)
        attempt += 1
    raise RuntimeError("Unreachable")  # pragma: no cover
86
+
87
+
88
def load_open_targets_targets(
    disease_ids: Optional[Sequence[str]] = None,
    target_ids: Optional[Sequence[str]] = None,
    evidence_types: Optional[Sequence[str]] = None,
    score_threshold: float = 0.0,
    page_size: int = 200,
) -> pd.DataFrame:
    """Query Open Targets Platform for target-disease associations.

    Pages through the association query until the API stops returning a
    cursor or returns a short page, then assembles the rows into a DataFrame.

    Parameters
    ----------
    disease_ids:
        EFO disease IDs to filter on, e.g. ``["EFO_0000305"]``. Sent to the
        API as the ``diseaseIds`` query variable.
        At least one of *disease_ids* or *target_ids* must be provided.
    target_ids:
        Ensembl gene IDs to filter on, e.g. ``["ENSG00000141736"]``. The
        filter is applied client-side to the returned rows (the query itself
        is only restricted by *disease_ids*).
    evidence_types:
        Evidence types to include in results. ``None`` returns all.
        Valid keys: ``"genetic_association"``, ``"somatic_mutation"``,
        ``"literature"``, ``"rna_expression"``, ``"animal_model"``,
        ``"affected_pathway"``. Rows are kept when at least one of the
        requested evidence columns is non-null; if none of the requested
        keys is valid a warning is logged and no filtering is done.
    score_threshold:
        Minimum overall association score (0–1).
    page_size:
        Results per API page.

    Returns
    -------
    pandas.DataFrame
        Columns: ``target_id``, ``target_name``, ``disease_id``,
        ``disease_name``, ``overall_score``, plus one column per evidence
        datatype.

    Raises
    ------
    ValueError
        If neither *disease_ids* nor *target_ids* is provided.
    RuntimeError
        If the request ultimately fails, or the API answers with GraphQL
        ``errors`` and no association data.
    """
    if disease_ids is None and target_ids is None:
        raise ValueError("Provide at least one of disease_ids or target_ids.")

    # Set for O(1) membership tests; None means "no target filter".
    wanted_targets = set(target_ids) if target_ids else None

    rows: List[Dict] = []
    cursor: Optional[str] = None
    total_fetched = 0

    while True:
        variables: Dict = {"size": page_size}
        if disease_ids:
            variables["diseaseIds"] = list(disease_ids)
        if cursor:
            variables["cursor"] = cursor

        result = _graphql_request({"query": _ASSOCIATION_QUERY, "variables": variables})

        # GraphQL reports failures in-band; JSON null arrives as None, so
        # ``or {}`` is used instead of ``.get(key, {})`` defaults.
        data = (result.get("data") or {}).get("associations") or {}
        errors = result.get("errors")
        if errors and not data:
            raise RuntimeError(f"Open Targets API returned errors: {errors}")

        page_rows = data.get("rows") or []
        cursor = data.get("cursor")

        for row in page_rows:
            target = row.get("target") or {}
            disease = row.get("disease") or {}
            overall_score = row.get("score", 0.0) or 0.0

            if overall_score < score_threshold:
                continue
            if wanted_targets is not None and target.get("id") not in wanted_targets:
                continue

            record: Dict = {
                "target_id": target.get("id"),
                "target_name": target.get("approvedSymbol"),
                "disease_id": disease.get("id"),
                "disease_name": disease.get("name"),
                "overall_score": overall_score,
            }

            dt_scores = {s["id"]: s["score"] for s in row.get("datatypeScores") or []}
            for col, key in _DATATYPE_COLUMNS.items():
                record[col] = dt_scores.get(key, np.nan)

            rows.append(record)

        total_fetched += len(page_rows)
        logger.info("load_open_targets_targets: fetched %d associations so far.", total_fetched)

        # A missing cursor or a short page marks the last page.
        if not cursor or len(page_rows) < page_size:
            break

    if not rows:
        logger.warning("load_open_targets_targets: no associations returned.")
        return pd.DataFrame(columns=[
            "target_id", "target_name", "disease_id", "disease_name",
            "overall_score", *_DATATYPE_COLUMNS,
        ])

    df = pd.DataFrame(rows)

    if evidence_types is not None:
        keep = set(evidence_types) & set(_DATATYPE_COLUMNS)
        if not keep:
            logger.warning(
                "load_open_targets_targets: none of %s are valid evidence types.", evidence_types
            )
        else:
            df = df[df[list(keep)].notna().any(axis=1)]

    logger.info(
        "load_open_targets_targets: returned %d associations.", len(df)
    )
    return df.reset_index(drop=True)
196
+
197
+
198
def add_open_targets_annotations(
    dataset: OmicsDataset,
    target_column: str = "gene_id",
    disease_ids: Optional[Sequence[str]] = None,
    **kwargs,
) -> OmicsDataset:
    """Attach Open Targets association rows to every modality of *dataset*.

    The associations are fetched once with :func:`load_open_targets_targets`.
    For each modality, the rows whose ``target_name`` appears among the
    modality's feature IDs are stored under ``metadata["open_targets"]``.

    Parameters
    ----------
    dataset:
        An :class:`~omicsync.core.dataset.OmicsDataset`; modified in place.
    target_column:
        Accepted for interface compatibility.
        NOTE(review): this argument is not read by the body — matching
        always uses the ``target_name`` column.
    disease_ids:
        Disease IDs to query. Forwarded to :func:`load_open_targets_targets`.
    **kwargs:
        Forwarded to :func:`load_open_targets_targets`.

    Returns
    -------
    OmicsDataset
        The same *dataset* object, with an ``open_targets`` entry set in each
        modality's metadata.
    """
    associations = load_open_targets_targets(disease_ids=disease_ids, **kwargs)

    for modality_name, modality in dataset._modalities.items():
        features = modality.feature_ids.tolist()
        is_known_feature = associations["target_name"].isin(features)
        # Copy so each modality holds an independent frame.
        matched = associations[is_known_feature].copy()
        modality.metadata["open_targets"] = matched

        annotated_count = len(matched["target_name"].unique())
        logger.info(
            "add_open_targets_annotations: %d/%d features annotated for modality %r.",
            annotated_count,
            len(features),
            modality_name,
        )

    return dataset