cap_sc_client 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. cap_sc_client-1.0.0/LICENSE +21 -0
  2. cap_sc_client-1.0.0/PKG-INFO +21 -0
  3. cap_sc_client-1.0.0/README.md +50 -0
  4. cap_sc_client-1.0.0/cap_sc_client/__init__.py +6 -0
  5. cap_sc_client-1.0.0/cap_sc_client/cap.py +539 -0
  6. cap_sc_client-1.0.0/cap_sc_client/client/__init__.py +240 -0
  7. cap_sc_client-1.0.0/cap_sc_client/client/base_client.py +211 -0
  8. cap_sc_client-1.0.0/cap_sc_client/client/base_model.py +29 -0
  9. cap_sc_client-1.0.0/cap_sc_client/client/client.py +670 -0
  10. cap_sc_client-1.0.0/cap_sc_client/client/cluster_types.py +30 -0
  11. cap_sc_client-1.0.0/cap_sc_client/client/create_session.py +68 -0
  12. cap_sc_client-1.0.0/cap_sc_client/client/dataset_initial_state_query.py +29 -0
  13. cap_sc_client-1.0.0/cap_sc_client/client/dataset_ready.py +22 -0
  14. cap_sc_client-1.0.0/cap_sc_client/client/download_urls.py +23 -0
  15. cap_sc_client-1.0.0/cap_sc_client/client/embedding_clusters.py +31 -0
  16. cap_sc_client-1.0.0/cap_sc_client/client/embedding_data.py +31 -0
  17. cap_sc_client-1.0.0/cap_sc_client/client/enums.py +3 -0
  18. cap_sc_client-1.0.0/cap_sc_client/client/exceptions.py +85 -0
  19. cap_sc_client-1.0.0/cap_sc_client/client/files_status.py +21 -0
  20. cap_sc_client-1.0.0/cap_sc_client/client/fragments.py +347 -0
  21. cap_sc_client-1.0.0/cap_sc_client/client/general_de.py +21 -0
  22. cap_sc_client-1.0.0/cap_sc_client/client/heatmap.py +70 -0
  23. cap_sc_client-1.0.0/cap_sc_client/client/highly_variable_genes.py +30 -0
  24. cap_sc_client-1.0.0/cap_sc_client/client/input_types.py +189 -0
  25. cap_sc_client-1.0.0/cap_sc_client/client/lookup_cells.py +23 -0
  26. cap_sc_client-1.0.0/cap_sc_client/client/md_commons_query.py +21 -0
  27. cap_sc_client-1.0.0/cap_sc_client/client/md_ready.py +21 -0
  28. cap_sc_client-1.0.0/cap_sc_client/client/search_datasets.py +22 -0
  29. cap_sc_client-1.0.0/cap_sc_client.egg-info/PKG-INFO +21 -0
  30. cap_sc_client-1.0.0/cap_sc_client.egg-info/SOURCES.txt +34 -0
  31. cap_sc_client-1.0.0/cap_sc_client.egg-info/dependency_links.txt +1 -0
  32. cap_sc_client-1.0.0/cap_sc_client.egg-info/requires.txt +7 -0
  33. cap_sc_client-1.0.0/cap_sc_client.egg-info/top_level.txt +5 -0
  34. cap_sc_client-1.0.0/pyproject.toml +52 -0
  35. cap_sc_client-1.0.0/setup.cfg +4 -0
  36. cap_sc_client-1.0.0/test/test_api.py +21 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Cell Annotation Platform (CAP) team
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,21 @@
1
+ Metadata-Version: 2.4
2
+ Name: cap_sc_client
3
+ Version: 1.0.0
4
+ Summary: Python client for Cell-Annotation-Platform (CAP) GraphQL API.
5
+ Author: R. Mukhin, E. Biederstedt, A. Isaev, M. Sokolov
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/cellannotation/cap-python-client
8
+ Project-URL: Documentation, https://github.com/cellannotation/cap-python-client/blob/main/README.md
9
+ Project-URL: Issues, https://github.com/cellannotation/cap-python-client/issues
10
+ Project-URL: Changelog, https://github.com/cellannotation/cap-python-client/blob/main/CHANGELOG.md
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Operating System :: OS Independent
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+ Requires-Dist: httpx>=0.27.2
16
+ Requires-Dist: pydantic>=2.10.3
17
+ Requires-Dist: pydantic_core>=2.27.1
18
+ Requires-Dist: pandas>=2.0.0
19
+ Provides-Extra: dev
20
+ Requires-Dist: ariadne; extra == "dev"
21
+ Dynamic: license-file
@@ -0,0 +1,50 @@
1
+
2
+ # Python client for Cell-Annotation-Platform GraphQL API
3
+ [![PyPI version](https://img.shields.io/pypi/v/cap-sc-client)](https://pypi.org/project/cap-sc-client/)
4
+
5
+ The Python package provides a simple interface to interact with the [Cell Annotation Platform](https://celltype.info/) (CAP) GraphQL API. The package allows users to search for datasets and cell label metadata, and to retrieve molecular profiles of cell types published on CAP.
6
+
7
+ ## Installation
8
+
9
+ ```bash
10
+ pip install -U cap-sc-client
11
+ ```
12
+
13
+ ## Basic usage
14
+
15
+ The main goal of this package is to provide an interface to access CAP datasets and cell annotation metadata (including marker genes, synonyms, rationales, etc.) via standard Python tooling. The outputs are returned as [pandas DataFrames](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html), which can be converted to other formats (CSV, JSON, etc.) if the user desires.
16
+
17
+
18
+ ```
19
+ >>> from cap_sc_client import CapClient
20
+ >>> cp = CapClient()
21
+ >>> datasets = cp.search_datasets(limit=5, offset=0, organism=["Homo sapiens"])
22
+ >>> datasets.head()
23
+ id name cell_count project
24
+ 0 1427 Skin fibroblasts - Pan-d... 337376.0 {'id': '613', 'name': 'Pan...
25
+ 1 1426 Skin fibroblast scRNA-seq ... 153546.0 {'id': '613', 'name': 'Pan...
26
+ 2 1157 Single cell atlas of the h... 72788.0 {'id': '544', 'name': 'Sin...
27
+ 3 1156 snRNA-seq of human retina ... 3177310.0 {'id': '544', 'name': 'Sin...
28
+ 4 1154 snRNA-seq of human retina ... 691008.0 {'id': '544', 'name': 'Sin...
29
+ >>> labels = cp.search_cell_labels(limit=10, offset=0)
30
+ >>> labels[["full_name", "ontology_term_exists", "marker_genes"]]
31
+ full_name ontology_term_exists marker_genes
32
+ 0 cycling stromal ... True [MKI67, TOP2A, C...
33
+ 1 alveolar type 1 ... True [PDPN, HOPX]
34
+ 2 mesoderm 2 (ZEB2) False [ZEB2]
35
+ 3 acinar cell True [PRSS1]
36
+ 4 neuron True [STMN2]
37
+ 5 smooth muscle cell True [DES, CNN1, ACTA...
38
+ 6 ciliated cell True [FOXJ1]
39
+ 7 Schwann cell True [MPZ]
40
+ 8 pancreatic cells False [PDX1]
41
+ 9 club cell True [SCGB1A1]
42
+ ```
43
+
44
+ There is also an `MDSession` class that allows users to interact with the molecular profiles of cell types within a specific dataset. However, this class requires users to be familiar with the CAP MD page.
45
+
46
+ For more examples, please refer to ["examples"](https://github.com/cellannotation/cap-python-client/tree/main/examples) and the [GitHub wiki](https://github.com/cellannotation/cap-python-client/wiki) for detailed documentation.
47
+
48
+ ## Documentation
49
+
50
+ Detailed documentation is available on [GitHub Wiki](https://github.com/cellannotation/cap-python-client/wiki).
@@ -0,0 +1,6 @@
1
+ from .cap import CapClient, MDSession
2
+
3
# Public API of the package: the two classes re-exported from ``.cap``.
__all__ = ["CapClient", "MDSession"]
@@ -0,0 +1,539 @@
1
+ from typing import List, Dict, Literal
2
+ from uuid import uuid4
3
+ import pandas as pd
4
+ import httpx
5
+
6
+ from .client.client import _Client
7
+ from .client.input_types import (
8
+ DatasetSearchOptions,
9
+ LookupDatasetsFiltersInput,
10
+ LookupDatasetsSearchInput,
11
+ SearchByMetadataArgs,
12
+ DatasetSearchSort,
13
+ CellLabelsSearchOptions,
14
+ LookupLabelsFilters,
15
+ LookupCellsSearch,
16
+ SearchLabelByMetadataArgs,
17
+ CellLabelsSearchSort,
18
+ GetDatasetEmbeddingDataInput,
19
+ GetGeneralDiffInput,
20
+ GetHighlyVariableGenesInput,
21
+ PostSaveEmbeddingSessionInput,
22
+ PostHeatmapInput
23
+ )
24
+ from .client.embedding_data import EmbeddingDataDatasetEmbeddingData
25
+ from .client.heatmap import HeatmapDatasetEmbeddingDiffHeatMap
26
+
27
# Default GraphQL endpoint used by CapClient when no URL is supplied.
CAP_API_URL = "https://celltype.info/graphql"

# Value of a labelset's ``mode`` that marks it as a cell-type labelset.
CELL_LABELS_MODE = "cell-labels"

# Readability aliases: all of these keys/identifiers are plain strings.
SESSION_ID = str
DIFF_KEY = str
SELECTION_KEY = str
33
+
34
+
35
class MDSession:
    """
    A session for processing molecular data (MD) page endpoints of one dataset.

    Obtain an instance through ``CapClient.md_session`` and call
    ``create_session`` first: it loads the dataset snapshot, the lists of
    embeddings, clusterings and cell-type labelsets, and saves a new session
    ID via the client. ``embedding_data`` and ``general_de`` raise
    ``RuntimeError`` when those lists have not been loaded yet.
    """

    def __init__(self, dataset_id: str, _client: _Client):
        """
        Initializes the MDSession with the provided dataset ID and client.
        Do not call directly, use CapClient.md_session instead.

        Args:
            dataset_id (str): The unique identifier of the dataset to be processed.
            _client (_Client): An instance of the client to interact with the backend API.
        """
        self.__client: _Client = _client
        self._dataset_id: str = dataset_id
        # Everything below stays None until create_session() fills it in.
        self._session_id: str = None
        self._dataset_snapshot = None
        self._embeddings: list[str] = None
        self._labelsets: list[str] = None
        self._clusterings: list[str] = None
        self._metadata: list[str] = None

    def __repr__(self) -> str:
        return f"Molecular Data page session for dataset id: {self.dataset_id}"

    def __str__(self) -> str:
        return self.__repr__()

    @property
    def dataset_id(self) -> str:
        """The dataset ID this session was created for."""
        return self._dataset_id

    @property
    def dataset_snapshot(self):
        """The dataset snapshot, or None before ``create_session`` is called."""
        return self._dataset_snapshot

    @property
    def embeddings(self) -> list[str]:
        """Embedding names, or None before ``create_session`` is called."""
        return self._embeddings

    @property
    def clusterings(self) -> list[str]:
        """Clustering names, or None before ``create_session`` is called."""
        return self._clusterings

    @property
    def labelsets(self) -> list[str]:
        """Cell-type labelset names, or None before ``create_session`` is called."""
        return self._labelsets

    @property
    def session_id(self) -> str:
        """The session ID, or None before ``create_session`` is called."""
        return self._session_id

    def _check_md_ready(self):
        """Raise RuntimeError unless the backend reports the embeddings as up to date."""
        ready = self.__client.dataset_ready(self.dataset_id)
        if not ready.dataset.is_embeddings_up_to_date:
            raise RuntimeError(f"The Molecular Data for the dataset {self.dataset_id} is not ready!")

    def _get_clusterings(self) -> list[str]:
        """Fetch the names of the dataset's embedding cluster types."""
        dataset = self.__client.cluster_types(self.dataset_id).dataset
        return [cluster.name for cluster in dataset.embedding_cluster_types]

    def _get_embeddings(self) -> list[str]:
        """Fetch the names of the dataset's embeddings."""
        dataset = self.__client.md_commons_query(self.dataset_id).dataset
        return [emb.name for emb in dataset.embeddings]

    def _get_cell_type_labelsets(self) -> list[str]:
        """
        Return the names of the snapshot labelsets whose mode is CELL_LABELS_MODE.

        Raises:
            RuntimeError: If the dataset snapshot has not been loaded yet.
        """
        if self.dataset_snapshot is None:
            raise RuntimeError("The dataset snapshot is not ready, call MDSession.create_session first!")

        return [
            lbst.name
            for lbst in self.dataset_snapshot.labelsets
            if lbst.mode == CELL_LABELS_MODE
        ]

    def create_session(
        self,
    ) -> SESSION_ID:
        """
        Creates a new session for embedding processing.

        This method performs a sanity check, retrieves the initial state of the dataset,
        fetches clusterings and embeddings, and then initializes a new session with a
        unique session ID. The session information is saved via the client.

        Returns:
            str: The unique session ID of the newly created embedding session.

        Raises:
            RuntimeError: If the molecular data of the dataset is not ready.
        """
        self._check_md_ready()

        initial_state = self.__client.dataset_initial_state_query(self.dataset_id)
        self._dataset_snapshot = initial_state.dataset
        self._clusterings = self._get_clusterings()
        self._embeddings = self._get_embeddings()
        self._labelsets = self._get_cell_type_labelsets()

        session_id = str(uuid4())
        payload = PostSaveEmbeddingSessionInput(
            session_id=session_id,
            dataset=self._dataset_snapshot.model_dump(),
        )
        response = self.__client.create_session(data=payload)

        # The snapshot returned by the save call replaces the initial one;
        # the session ID is only stored once the save call has returned.
        self._dataset_snapshot = response.save_embedding_session
        self._session_id = session_id
        return self.session_id

    def embedding_data(
        self,
        embedding: str,
        max_points: int,
        labelsets: List[str] = None,
        selection_gene: str = None,
        selection_key_major: str = None,
        selection_key_minor: str = None,
    ) -> EmbeddingDataDatasetEmbeddingData:
        """
        Retrieves embedding data for the specified embedding type, with optional filtering and downsampling.

        Parameters:
        -----------
        embedding : str
            The name of the embedding to retrieve. Must be present in `self.embeddings`.
        max_points : int
            The maximum number of points to include in the response. Data may be downsampled to meet this limit.
        labelsets : List[str], optional
            A list of label sets to include in the embedding data. Defaults to None.
        selection_gene : str, optional
            If provided, returns a list of expression values for the specified gene. Defaults to None.
        selection_key_major : str, optional
            If provided, returns a list of boolean markers indicating whether each point is within the major selection. Defaults to None.
        selection_key_minor : str, optional
            If provided, returns a list of boolean markers indicating whether each point is within the minor selection. Defaults to None.

        Returns:
        --------
        EmbeddingDataDatasetEmbeddingData
            An object containing the embedding data, including observation IDs, selections, embeddings, annotations,
            and gene expression values.

        Raises:
        -------
        RuntimeError
            If the embedding list has not been loaded yet (`create_session` was not called).
        ValueError
            If the specified embedding is not found in `self.embeddings`.
        """
        # Without this guard the membership test below fails with an opaque
        # TypeError because self.embeddings is still None.
        if self.embeddings is None:
            raise RuntimeError("Embeddings are not loaded, call MDSession.create_session first!")

        if embedding not in self.embeddings:
            raise ValueError(f"Embedding '{embedding}' is not found in the list of '{self.embeddings}'")

        options = GetDatasetEmbeddingDataInput(
            embedding=embedding,
            scale_max_plan=max_points,
            session_id=self.session_id,
            labelsets=labelsets,
            selection_gene=selection_gene,
            selection_key_major=selection_key_major,
            selection_key_minor=selection_key_minor,
        )
        response = self.__client.embedding_data(
            dataset_id=self.dataset_id,
            options=options,
        )
        return response.dataset.embedding_data

    def _labelset_id_from_name(self, labelset_name) -> str:
        """
        Return the ID of the first snapshot labelset named `labelset_name`.

        Raises:
            RuntimeError: If the dataset snapshot has not been loaded yet.
            ValueError: If no labelset with that name exists in the snapshot.
        """
        if self.dataset_snapshot is None:
            raise RuntimeError("Dataset snapshot is not ready, create session first!")

        for lbst in self.dataset_snapshot.labelsets:
            if lbst.name == labelset_name:
                return lbst.id

        raise ValueError(f"Can't find labelset '{labelset_name}' in dataset snapshot!")

    def general_de(
        self,
        labelset: str,
        random_seed: int = 42,
    ) -> DIFF_KEY:
        """
        Performs a general differential expression (DE) analysis.

        This method conducts a differential expression analysis, comparing each of the
        top 10 largest labels within the specified label set against all other data points.

        Parameters:
        -----------
        labelset : str
            The name of the label set to use for differential expression analysis.
            Must be present in `self.labelsets`.
        random_seed : int, optional
            The random seed for reproducibility. Defaults to 42.

        Returns:
        --------
        DIFF_KEY
            A string key associated with the results of the differential expression analysis.

        Raises:
        -------
        RuntimeError
            If the labelset list has not been loaded yet (`create_session` was not called).
        ValueError
            If the specified label set is not found in `self.labelsets`.
        """
        # Same guard as in embedding_data: self.labelsets is None before
        # create_session, which would otherwise surface as a TypeError.
        if self.labelsets is None:
            raise RuntimeError("Labelsets are not loaded, call MDSession.create_session first!")

        if labelset not in self.labelsets:
            raise ValueError(f"Labelset '{labelset}' is not found in the list of '{self.labelsets}'")

        options = GetGeneralDiffInput(
            random_seed=random_seed,
            session_id=self.session_id,
            labelset_id=self._labelset_id_from_name(labelset),
        )
        response = self.__client.general_de(
            dataset_id=self.dataset_id,
            options=options,
        )
        return response.dataset.general_diff

    def highly_variable_genes(
        self,
        gene_name_filter: str = None,
        pseudogenes_filter: bool = True,
        offset: int = 0,
        limit: int = 50,
        sort_order: Literal["desc", "asc"] = "desc"
    ) -> pd.DataFrame:
        """
        Retrieves a list of highly variable genes for this session's dataset.

        This method queries the dataset for highly variable genes based on dispersion values.
        It supports filtering by gene name, excluding pseudogenes, and sorting the results.
        The retrieved genes are returned as a Pandas DataFrame with columns for gene names
        and their respective dispersion values.

        Args:
            gene_name_filter (str, optional): A filter to include only genes matching a given prefix.
            pseudogenes_filter (bool, optional): If True, filters out genes which are often
                over-expressed but biologically non-informative. Defaults to True.
                See https://github.com/cellannotation/cap-gene-filtering for details.
            offset (int, optional): The starting index for pagination. Defaults to 0.
            limit (int, optional): The maximum number of genes to return. Defaults to 50.
            sort_order (Literal["desc", "asc"], optional): The sorting order for dispersion values.
                Defaults to "desc" (descending).

        Returns:
            pd.DataFrame: A DataFrame containing highly variable genes with two columns:
                - "gene_symbol" (str): The gene symbol.
                - "dispersion" (float): The dispersion value of the gene. Initially, the gene
                dispersion values are calculated over the log-transformed count matrix,
                these dispersion values are then log-transformed again before being displayed
                in the gene table.
        """
        options = GetHighlyVariableGenesInput(
            offset=offset,
            limit=limit,
            gene_name_filter=gene_name_filter,
            use_genes_pattern=pseudogenes_filter,
            sort_by="dispersion",
            sort_order=sort_order,
        )
        response = self.__client.highly_variable_genes(
            dataset_id=self.dataset_id,
            options=options,
        )
        genes = response.dataset.embedding_highly_variable_genes

        return pd.DataFrame({
            "gene_symbol": [gene.name for gene in genes],
            "dispersion": [gene.dispersion for gene in genes],
        })

    def is_md_cache_ready(self) -> bool:
        """
        Checks whether the molecular data cache is ready.

        This method queries the dataset's file status and determines if the
        molecular data page files are fully prepared and available.

        Returns:
        --------
        bool
            True if the reported MD files status equals "ready", otherwise False.
        """
        response = self.__client.files_status(self.dataset_id)
        return response.dataset.get_md_files_status == "ready"

    def heatmap(
        self,
        diff_key: DIFF_KEY,
        n_top_genes: int = 3,
        max_cells_displayed: int = 1000,
        gene_name_filter: str = None,
        pseudogenes_filter: bool = True,
        selection_key: SELECTION_KEY = None,
        include_reference: bool = True
    ) -> HeatmapDatasetEmbeddingDiffHeatMap:
        """
        Return the data to plot a heatmap for the top differentially expressed genes from specific DE analysis.

        Parameters:
        -----------
        diff_key : DIFF_KEY
            The string key associated with the differential expression analysis results.
        n_top_genes : int, optional
            The number of top differentially expressed genes to include in the heatmap. Default is 3.
        max_cells_displayed : int, optional
            The maximum number of cells to display in the heatmap. Default is 1000.
        gene_name_filter : str, optional
            A filter to include only genes matching a given prefix. Should be used to focus on specific gene. Default is None.
        pseudogenes_filter : bool, optional
            If True, filters out genes which are often over-expressed but biologically non-informative.
            Defaults to True. See https://github.com/cellannotation/cap-gene-filtering for details.
        selection_key : SELECTION_KEY, optional
            If provided, the heatmap will include only cells within the specified selection. Default is None.
        include_reference : bool, optional
            If True, includes a reference selection in the heatmap. Default is True.

        Returns:
        --------
        HeatmapDatasetEmbeddingDiffHeatMap
            An object containing the heatmap data, including gene names, cell IDs, expression values,
            and selection information.
        """
        options = PostHeatmapInput(
            diff_key=diff_key,
            n_genes=n_top_genes,
            scale_max_plan=max_cells_displayed,
            genes_filter=gene_name_filter,
            use_genes_pattern=pseudogenes_filter,
            session_id=self.session_id,
            include_reference_selection=include_reference,
            selection_key=selection_key,
        )
        response = self.__client.heatmap(
            dataset_id=self.dataset_id,
            options=options,
        )
        return response.dataset.embedding_diff_heat_map
393
+
394
+
395
class CapClient:
    """
    Client for the Cell Annotation Platform (CAP) GraphQL API.

    Provides dataset and cell-label searches that return pandas DataFrames,
    and a factory for ``MDSession`` objects bound to a single dataset.
    """

    def __init__(
        self,
        url: str = CAP_API_URL,
    ) -> None:
        """
        Create a client talking to the GraphQL endpoint at `url`.

        Parameters:
        -----------
        url : str, optional
            The GraphQL endpoint. Defaults to CAP_API_URL.
        """
        headers = None
        # One httpx client with a timeout of 300 is shared by all requests.
        http_client = httpx.Client(timeout=300, headers=headers)
        self.__client = _Client(url, headers=headers, http_client=http_client)

    @staticmethod
    def _build_sorting(sort, sort_cls) -> list:
        """Convert ``[{field: order}, ...]`` into a list of `sort_cls` objects."""
        sorting = []
        for item in sort or []:
            # Only the first key/value pair of each dict is used; an empty
            # dict raises IndexError.
            field, order = list(item.items())[0]
            sorting.append(sort_cls(field=field, order=order))
        return sorting

    @staticmethod
    def _build_metadata(args_cls, organism, tissue, assay) -> list:
        """Build one `args_cls` metadata filter per non-empty filter list."""
        named_filters = (
            ("organism", organism),
            ("tissue", tissue),
            ("assay", assay),
        )
        return [
            args_cls(field=field, values=values)
            for field, values in named_filters
            if values
        ]

    @staticmethod
    def _records_to_frame(records) -> pd.DataFrame:
        """Dump the records into a DataFrame, dropping the ``typename__`` column if present."""
        df = pd.DataFrame([record.model_dump() for record in records])
        if "typename__" in df.columns:
            df = df.drop(columns=["typename__"])
        return df

    def search_datasets(
        self,
        search: List[str] = None,
        organism: List[str] = None,
        tissue: List[str] = None,
        assay: List[str] = None,
        limit: int = 50,
        offset: int = 0,
        sort: List[Dict[str, str]] = None,
    ) -> pd.DataFrame:
        """
        Search public datasets, the analogue of the [dataset search page on CAP](https://celltype.info/search/datasets).

        Parameters:
        -----------
        search : List[str], optional
            A list of search terms to filter datasets by name. Defaults to None.
        organism : List[str], optional
            A list of organism names to filter datasets. Defaults to None.
        tissue : List[str], optional
            A list of tissue types to filter datasets. Defaults to None.
        assay : List[str], optional
            A list of assay types to filter datasets. Defaults to None.
        limit : int, optional
            The maximum number of datasets to return. Defaults to 50.
        offset : int, optional
            The number of datasets to skip before starting to collect the result set. Defaults to 0.
        sort : List[Dict[str, str]], optional
            A list of dictionaries specifying the sorting order. Each dictionary should have a single key-value pair
            where the key is the field to sort by and the value is either "asc" for ascending or "desc" for descending order.
            Example: [{"name": "asc"}, {"createdAt": "desc"}]. Defaults to None, which means no sorting.

        Returns:
        --------
        pd.DataFrame
            A DataFrame containing the search results with columns corresponding to dataset attributes.
        """
        search_options = DatasetSearchOptions(
            limit=limit,
            offset=offset,
            sort=self._build_sorting(sort, DatasetSearchSort),
        )
        search_filter = LookupDatasetsFiltersInput(
            metadata=self._build_metadata(SearchByMetadataArgs, organism, tissue, assay)
        )
        search_input = LookupDatasetsSearchInput(name=search) if search else None

        response = self.__client.search_datasets(
            options=search_options, filter=search_filter, search=search_input
        )
        return self._records_to_frame(response.results)

    def search_cell_labels(
        self,
        search: str = None,
        organism: List[str] = None,
        tissue: List[str] = None,
        assay: List[str] = None,
        limit: int = 50,
        offset: int = 0,
        sort: List[Dict[str, str]] = None,
    ) -> pd.DataFrame:
        """
        Search for cell labels. The analogue of the [cell labels search page on CAP](https://celltype.info/search/cell-labels).

        Parameters:
        -----------
        search : str, optional
            The search term passed as the name filter for cell labels. Defaults to None.
            NOTE(review): the type follows the signature; confirm against LookupCellsSearch.
        organism : List[str], optional
            A list of organism names to filter cell labels. Defaults to None.
        tissue : List[str], optional
            A list of tissue types to filter cell labels. Defaults to None.
        assay : List[str], optional
            A list of assay types to filter cell labels. Defaults to None.
        limit : int, optional
            The maximum number of cell labels to return. Defaults to 50.
        offset : int, optional
            The number of cell labels to skip before starting to collect the result set. Defaults to 0.
        sort : List[Dict[str, str]], optional
            A list of dictionaries specifying the sorting order. Each dictionary should have a single key-value pair
            where the key is the field to sort by and the value is either "asc" for ascending or "desc" for descending order.
            Example: [{"name": "asc"}, {"createdAt": "desc"}]. Defaults to None, which means no sorting.

        Returns:
        --------
        pd.DataFrame
            A DataFrame containing the search results with columns corresponding to cell annotation metadata attributes.
        """
        search_options = CellLabelsSearchOptions(
            limit=limit,
            offset=offset,
            sort=self._build_sorting(sort, CellLabelsSearchSort),
        )
        search_filter = LookupLabelsFilters(
            metadata=self._build_metadata(SearchLabelByMetadataArgs, organism, tissue, assay)
        )
        search_input = LookupCellsSearch(name=search) if search else None

        response = self.__client.lookup_cells(
            options=search_options, filter=search_filter, search=search_input
        )
        return self._records_to_frame(response.lookup_cells)

    def md_session(self, dataset_id: str) -> MDSession:
        """Return a new MDSession for `dataset_id` that shares this client's connection."""
        return MDSession(dataset_id=dataset_id, _client=self.__client)