microarray 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. microarray/__init__.py +15 -0
  2. microarray/_version.py +3 -0
  3. microarray/datasets/__init__.py +3 -0
  4. microarray/datasets/_arrayexpress.py +1 -0
  5. microarray/datasets/_cdf_files.py +35 -0
  6. microarray/datasets/_geo.py +1 -0
  7. microarray/datasets/_utils.py +143 -0
  8. microarray/io/__init__.py +17 -0
  9. microarray/io/_anndata_converter.py +198 -0
  10. microarray/io/_cdf.py +575 -0
  11. microarray/io/_cel.py +591 -0
  12. microarray/io/_read.py +127 -0
  13. microarray/plotting/__init__.py +28 -0
  14. microarray/plotting/_base.py +253 -0
  15. microarray/plotting/_cel.py +75 -0
  16. microarray/plotting/_de_plots.py +239 -0
  17. microarray/plotting/_diagnostic_plots.py +268 -0
  18. microarray/plotting/_heatmap.py +279 -0
  19. microarray/plotting/_ma_plots.py +136 -0
  20. microarray/plotting/_pca.py +320 -0
  21. microarray/plotting/_qc_plots.py +335 -0
  22. microarray/plotting/_score.py +38 -0
  23. microarray/plotting/_top_table_heatmap.py +98 -0
  24. microarray/plotting/_utils.py +280 -0
  25. microarray/preprocessing/__init__.py +39 -0
  26. microarray/preprocessing/_background.py +862 -0
  27. microarray/preprocessing/_log2.py +77 -0
  28. microarray/preprocessing/_normalize.py +1292 -0
  29. microarray/preprocessing/_rma.py +243 -0
  30. microarray/preprocessing/_robust.py +170 -0
  31. microarray/preprocessing/_summarize.py +318 -0
  32. microarray/py.typed +0 -0
  33. microarray/tools/__init__.py +26 -0
  34. microarray/tools/_biomart.py +416 -0
  35. microarray/tools/_empirical_bayes.py +401 -0
  36. microarray/tools/_fdist.py +171 -0
  37. microarray/tools/_linear_models.py +387 -0
  38. microarray/tools/_mds.py +101 -0
  39. microarray/tools/_pca.py +88 -0
  40. microarray/tools/_score.py +86 -0
  41. microarray/tools/_toptable.py +360 -0
  42. microarray-0.1.0.dist-info/METADATA +75 -0
  43. microarray-0.1.0.dist-info/RECORD +44 -0
  44. microarray-0.1.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,416 @@
1
+ from collections.abc import Generator
2
+ from dataclasses import dataclass
3
+ from io import StringIO
4
+ from pathlib import Path
5
+ from xml.etree import ElementTree
6
+
7
+ import pandas as pd
8
+ import requests
9
+ from anndata import AnnData
10
+
11
+
12
@dataclass
class Attribute:
    """Biomart dataset attribute.

    Attributes:
        name (str): Attribute name (Biomart ``internalName``).
        display_name (str): Human-readable attribute name.
        description (str): Attribute description.
        default (bool): Whether the attribute is a default attribute
            (i.e. returned by queries when no attributes are requested).
    """

    # Field order matches the original __init__ signature, so positional
    # construction (Attribute("name", "disp", "desc", True)) is unchanged.
    name: str
    display_name: str = ""
    description: str = ""
    default: bool = False
35
+
36
@dataclass
class Filter:
    """Biomart dataset filter.

    Attributes:
        name (str): Filter name (Biomart ``internalName``).
        type (str): Filter type as reported by Biomart (e.g. ``"boolean"``).
            Named ``type`` (shadowing the builtin) to keep the public
            keyword-argument interface unchanged.
        description (str): Filter description.
    """

    # The original class carried @dataclass but defined __init__ by hand,
    # leaving the decorator inert (no annotated fields). Declaring real
    # fields keeps the exact same constructor signature while gaining the
    # generated __repr__ and __eq__.
    name: str
    type: str
    description: str = ""
52
+
53
class BiomartDataset:
    """Client for a single Biomart dataset served through the martservice API."""

    def __init__(
        self,
        name: str,
        display_name: str = "",
        schema: str = "default",
        host: str = "http://www.ensembl.org",
        path: str = "/biomart/martservice",
        port: int = 80,
        use_cache: bool = False,
    ):
        """BiomartDataset constructor.

        Args:
            name (str): Dataset name.
            display_name (str): Dataset display name.
            schema (str): Dataset schema.
            host (str): Biomart host.
            path (str): Biomart path.
            port (int): Biomart port.
            use_cache (bool): Whether to use caching.

        Raises:
            NotImplementedError: If ``use_cache`` is True (not implemented yet).
        """
        # Add http prefix and remove trailing slash.
        if not host.startswith("http://") and not host.startswith("https://"):
            host = "http://" + host
        if host.endswith("/"):
            host = host[:-1]

        # Ensure path starts with slash.
        if not path.startswith("/"):
            path = "/" + path

        self.host: str = host
        self.path: str = path
        self.port: int = port
        self.use_cache: bool = use_cache

        self.name = name
        self.display_name = display_name
        self.schema = schema

        # Lazily-fetched configuration caches; populated on first access to
        # the filters/attributes properties.
        self._filters: dict[str, Filter] | None = None
        self._attributes: dict[str, Attribute] | None = None
        self._default_attributes: dict[str, Attribute] | None = None

        if use_cache:
            raise NotImplementedError("Caching is not implemented yet.")

    @property
    def url(self) -> str:
        """Url used to connect to the biomart service.

        Returns:
            str: Url used to connect to the biomart service.
        """
        return f"{self.host}:{self.port}{self.path}"

    def get(self, **params) -> requests.models.Response:
        """Performs get request to the biomart service.

        Args:
            **params: Parameters to be sent in the get request.

        Returns:
            requests.models.Response: Response from the biomart service.

        Raises:
            requests.HTTPError: If the service returns an error status code.
        """
        # NOTE(review): no timeout is set, so a stalled server blocks forever;
        # adding one would change the public behavior, so it is left as-is.
        response = requests.get(self.url, params=params)
        response.raise_for_status()
        return response

    @property
    def filters(self) -> dict[str, Filter]:
        """List of filters available for the dataset."""
        if self._filters is None:
            self._filters, self._attributes = self._fetch_configuration()
        return self._filters

    @property
    def attributes(self) -> dict[str, Attribute]:
        """List of attributes available for the dataset (cached)."""
        if self._attributes is None:
            self._filters, self._attributes = self._fetch_configuration()
        return self._attributes

    @property
    def default_attributes(self) -> dict[str, Attribute]:
        """List of default attributes for the dataset."""
        if self._default_attributes is None:
            self._default_attributes = {name: attr for name, attr in self.attributes.items() if attr.default is True}
        return self._default_attributes

    def list_attributes(self) -> pd.DataFrame:
        """Lists available attributes in a readable DataFrame format.

        Returns:
            pd.DataFrame: Frame listing available attributes.
        """

        def _row_gen(attributes: dict[str, Attribute]):
            for attr in attributes.values():
                yield (attr.name, attr.display_name, attr.description)

        return pd.DataFrame.from_records(
            _row_gen(self.attributes),
            columns=["name", "display_name", "description"],
        )

    def list_filters(self) -> pd.DataFrame:
        """Lists available filters in a readable DataFrame format.

        Returns:
            pd.DataFrame: Frame listing available filters.
        """

        def _row_gen(filters: dict[str, Filter]):
            for filt in filters.values():
                yield (filt.name, filt.type, filt.description)

        return pd.DataFrame.from_records(
            _row_gen(self.filters),
            columns=["name", "type", "description"],
        )

    def _fetch_configuration(self) -> tuple[dict[str, Filter], dict[str, Attribute]]:
        """Fetches and parses the dataset's filter/attribute configuration."""
        # Get datasets using biomart.
        response = self.get(type="configuration", dataset=self.name)

        # Check response for problems; the service reports configuration
        # errors in the body with a 200 status, so the text must be inspected.
        if "Problem retrieving configuration" in response.text:
            raise RuntimeError("Failed to retrieve dataset configuration, check the dataset name and schema.")

        # Get filters and attributes from xml.
        xml = ElementTree.fromstring(response.content)

        filters = {f.name: f for f in self._filters_from_xml(xml)}
        attributes = {a.name: a for a in self._attributes_from_xml(xml)}

        return filters, attributes

    @staticmethod
    def _filters_from_xml(xml: ElementTree.Element) -> Generator[Filter, None, None]:
        """Yields Filter objects from a configuration XML tree."""
        for node in xml.iter("FilterDescription"):
            attrib = node.attrib
            yield Filter(name=attrib["internalName"], type=attrib.get("type", ""))

    @staticmethod
    def _attributes_from_xml(xml: ElementTree.Element) -> Generator[Attribute, None, None]:
        """Yields Attribute objects from a configuration XML tree."""
        for page_index, page in enumerate(xml.iter("AttributePage")):
            for desc in page.iter("AttributeDescription"):
                attrib = desc.attrib

                # Default attributes can only be from the first page.
                default = page_index == 0 and attrib.get("default", "") == "true"

                yield Attribute(
                    name=attrib["internalName"],
                    display_name=attrib.get("displayName", ""),
                    description=attrib.get("description", ""),
                    default=default,
                )

    def query(
        self,
        attributes: list[str] | None = None,
        filters: dict[str, str | bool | list | tuple] | None = None,
        only_unique: bool = True,
        use_attr_names: bool = False,
        dtypes: dict[str, type] | None = None,
    ) -> pd.DataFrame:
        """Queries the dataset to retrieve the contained data.

        Args:
            attributes (list[str] | None): List of attribute names to retrieve, if None default attributes are used.
            filters (dict[str, str | bool | list | tuple] | None): Dictionary of filter name to filter value, if None no filters are applied.
            only_unique (bool): Whether to only return unique rows.
            use_attr_names (bool): Whether to use attribute names instead of display names as column names in the result.
            dtypes (dict[str, type] | None): Optional dictionary mapping attribute names to data types for the resulting DataFrame.

        Returns:
            pandas.DataFrame: DataFrame containing the query results.

        Raises:
            KeyError: If an unknown attribute or filter name is requested.
            RuntimeError: If the service reports a query error.
            ValueError: If ``dtypes`` contains a type pandas does not understand.
        """
        # Setup query element.
        root = ElementTree.Element("Query")
        root.set("virtualSchemaName", self.schema)
        root.set("formatter", "TSV")
        root.set("header", "1")
        root.set("uniqueRows", str(int(only_unique)))
        root.set("datasetConfigVersion", "0.6")

        # Add dataset element.
        dataset = ElementTree.SubElement(root, "Dataset")
        dataset.set("name", self.name)
        dataset.set("interface", "default")

        # Default to default attributes if none requested.
        if attributes is None:
            attributes = list(self.default_attributes.keys())

        # Add attribute elements.
        for name in attributes:
            try:
                attr = self.attributes[name]
                self._add_attr_node(dataset, attr)
            except KeyError as err:
                raise KeyError(
                    f"Unknown attribute {name}, check dataset attributes for a list of valid attributes."
                ) from err

        if filters is not None:
            # Add filter elements.
            for name, value in filters.items():
                try:
                    filter_ = self.filters[name]
                    self._add_filter_node(dataset, filter_, value)
                except KeyError as err:
                    raise KeyError(
                        f"Unknown filter {name}, check dataset filters for a list of valid filters."
                    ) from err

        # Fetch response.
        response = self.get(query=ElementTree.tostring(root))

        # Raise exception if an error occurred; the service reports query
        # errors in the body with a 200 status.
        if "Query ERROR" in response.text:
            raise RuntimeError(response.text)

        # Parse results into a DataFrame.
        try:
            result = pd.read_csv(StringIO(response.text), sep="\t", dtype=dtypes)
        # Type error is raised if a data type is not understood by pandas
        except TypeError as err:
            raise ValueError("Non valid data type is used in dtypes") from err

        if use_attr_names:
            # Rename columns with attribute names instead of display names.
            column_map = {self.attributes[attr].display_name: attr for attr in attributes}
            result.rename(columns=column_map, inplace=True)

        return result

    @staticmethod
    def _add_attr_node(
        root: ElementTree.Element,
        attr: Attribute,
    ) -> None:
        """Adds attribute xml node to root."""
        attr_el = ElementTree.SubElement(root, "Attribute")
        attr_el.set("name", attr.name)

    @staticmethod
    def _add_filter_node(
        root: ElementTree.Element,
        filter_: Filter,
        value: str | bool | list | tuple,
    ) -> None:
        """Adds filter xml node to root.

        Raises:
            ValueError: If a boolean filter receives a value that is neither a
                bool nor one of the strings "included", "only", "excluded".
        """
        filter_el = ElementTree.SubElement(root, "Filter")
        filter_el.set("name", filter_.name)

        # Set filter value depending on type.
        if filter_.type == "boolean":
            # Boolean case. Bug fix: the original evaluated value.lower()
            # before the `value is False` check, so passing False raised
            # AttributeError (bool has no .lower()) instead of reaching the
            # intended "excluded" branch. Guard string methods with
            # isinstance so bools take their dedicated branches.
            if value is True or (isinstance(value, str) and value.lower() in {"included", "only"}):
                filter_el.set("excluded", "0")
            elif value is False or (isinstance(value, str) and value.lower() == "excluded"):
                filter_el.set("excluded", "1")
            else:
                raise ValueError(f"Invalid value for boolean filter ({value})")
        elif isinstance(value, (list, tuple)):
            # List case.
            filter_el.set("value", ",".join(map(str, value)))
        else:
            # Default case.
            filter_el.set("value", str(value))
333
+
334
def annotate(
    adata: AnnData,
    entrez_key: str = "gene_id",
    symbol_key: str = "gene_symbol",
    dataset_name: str = "hsapiens_gene_ensembl",
    cache_dir: str = ".cache/biomart",
    remove_unannotated: bool = False,
    copy: bool = False,
    use_cache: bool = True,
    set_index: bool = True,
) -> AnnData | None:
    """Annotate Entrez gene IDs with symbols using Biomart.

    Args:
        adata (AnnData): AnnData object with feature metadata in ``.var``.
        entrez_key (str): Column in ``adata.var`` containing Entrez gene IDs.
        symbol_key (str): Output column in ``adata.var`` for gene symbols.
        dataset_name (str): Biomart dataset name.
        cache_dir (str): Directory where annotation tables are cached as CSV.
        remove_unannotated (bool): If True, drop features without symbol annotation.
        copy (bool): If True, return a new AnnData object.
        use_cache (bool): Whether to use cached annotation tables if available.
        set_index (bool): Whether to set the resulting symbol column as the index of .var.

    Returns:
        AnnData | None: Annotated AnnData if ``copy=True``, otherwise ``None``.

    Raises:
        ValueError: If ``remove_unannotated=True`` with ``copy=False``.
        KeyError: If ``entrez_key`` is missing from ``adata.var``.
    """
    # Removing features requires subsetting, which cannot be done in-place.
    if remove_unannotated is True and copy is False:
        raise ValueError(
            "Cannot remove unannotated features when copy=False, as this would modify the input AnnData in-place."
        )

    if entrez_key not in adata.var.columns:
        raise KeyError(f"AnnData .var has no '{entrez_key}' column")

    # With copy=False, all further mutations apply directly to the caller's object.
    adata = adata.copy() if copy else adata

    # Nullable "string" dtype keeps missing IDs as pd.NA rather than "nan".
    entrez = adata.var[entrez_key].astype("string")
    valid_mask = entrez.notna() & (entrez.str.strip() != "")

    # Short-circuit: no usable IDs at all — emit an all-NA symbol column and
    # skip the Biomart query entirely.
    if not valid_mask.any():
        adata.var[symbol_key] = pd.Series(pd.NA, index=adata.var.index, dtype="string")
        if remove_unannotated:
            # All symbols are NA here, so this drops every feature.
            adata = adata[:, adata.var[symbol_key].notna()].copy()
        return adata if copy else None

    # Biomart attribute names for the two columns we fetch.
    entrez_col, symbol_col = "entrezgene_id", "hgnc_symbol"
    fetch_entries = [entrez_col, symbol_col]

    cache_path = Path(cache_dir) / f"{dataset_name}_entrez_symbol.csv"
    if cache_path.exists() and use_cache:
        # Read IDs as float so blank cells parse as NaN; they are converted to
        # nullable Int64 below. NOTE(review): assumes Entrez IDs fit exactly in
        # a float64 (true for current ID ranges < 2**53).
        annotations = pd.read_csv(
            cache_path,
            dtype={
                "entrezgene_id": float,
                "hgnc_symbol": str,
            },
        )
    else:
        # Cache miss (or caching disabled): query Biomart and refresh the cache.
        biomart = BiomartDataset(name=dataset_name, port=80)
        annotations = biomart.query(attributes=fetch_entries)
        # query() returns display-name columns; rename to attribute names so
        # both branches yield the same schema.
        annotations.columns = fetch_entries
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        annotations.to_csv(cache_path, index=False)

    # Clean the mapping table: drop missing IDs/symbols, normalize dtypes.
    annotations = annotations[[entrez_col, symbol_col]].copy()
    annotations = annotations[annotations[entrez_col].notna() & (annotations[entrez_col] != "")]
    # float -> Int64 -> string yields canonical integer strings ("1234", not
    # "1234.0"), matching how plain integer IDs in .var stringify.
    # NOTE(review): assumes adata.var IDs are plain integer strings — a float
    # formatted source column (e.g. "1234.0") would not match; verify callers.
    annotations[entrez_col] = annotations[entrez_col].astype("Int64").astype("string")
    annotations[symbol_col] = annotations[symbol_col].astype("string").str.strip()
    annotations = annotations[annotations[symbol_col].notna() & (annotations[symbol_col] != "")]
    # One symbol per Entrez ID; first occurrence wins.
    annotations = annotations.drop_duplicates(subset=[entrez_col], keep="first")

    # Map IDs to symbols; unmapped IDs become pd.NA in the new column.
    symbol_map = dict(zip(annotations[entrez_col], annotations[symbol_col], strict=False))
    adata.var[symbol_key] = entrez.map(symbol_map).astype("string")

    if remove_unannotated is True:
        # Keep only features that received a symbol (copy=True guaranteed above).
        adata = adata[:, adata.var[symbol_key].notnull()].copy()

    if set_index is True:
        # NOTE(review): with remove_unannotated=False the index may contain NA
        # and duplicate symbols; downstream .loc lookups should account for that.
        adata.var.set_index(symbol_key, inplace=True)
        adata.var.index.name = None

    return adata if copy else None