polly-python 2.5.0__tar.gz → 3.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. {polly_python-2.5.0/polly_python.egg-info → polly_python-3.0.0}/PKG-INFO +7 -14
  2. polly_python-3.0.0/polly/__init__.py +1 -0
  3. {polly_python-2.5.0 → polly_python-3.0.0}/polly/constants.py +15 -15
  4. {polly_python-2.5.0 → polly_python-3.0.0}/polly/curation.py +243 -237
  5. {polly_python-2.5.0 → polly_python-3.0.0}/polly/errors.py +38 -38
  6. {polly_python-2.5.0 → polly_python-3.0.0}/polly/helpers.py +1 -56
  7. {polly_python-2.5.0 → polly_python-3.0.0}/polly/omixatlas.py +39 -33
  8. {polly_python-2.5.0 → polly_python-3.0.0}/polly/pipelines.py +119 -92
  9. {polly_python-2.5.0 → polly_python-3.0.0/polly_python.egg-info}/PKG-INFO +7 -14
  10. {polly_python-2.5.0 → polly_python-3.0.0}/polly_python.egg-info/SOURCES.txt +2 -4
  11. {polly_python-2.5.0 → polly_python-3.0.0}/polly_python.egg-info/requires.txt +5 -14
  12. {polly_python-2.5.0 → polly_python-3.0.0}/polly_services/reporting/reporting.py +49 -27
  13. {polly_python-2.5.0 → polly_python-3.0.0}/setup.cfg +6 -13
  14. polly_python-3.0.0/tests/test_help.py +81 -0
  15. {polly_python-2.5.0 → polly_python-3.0.0}/tests/test_helpers.py +10 -10
  16. {polly_python-2.5.0 → polly_python-3.0.0}/tests/test_omixatlas.py +9 -4
  17. {polly_python-2.5.0 → polly_python-3.0.0}/tests/test_pipelines.py +34 -25
  18. polly_python-3.0.0/tests/test_validation.py +135 -0
  19. polly_python-2.5.0/polly/__init__.py +0 -1
  20. polly_python-2.5.0/polly/bridge_cohort.py +0 -399
  21. polly_python-2.5.0/polly/cohort.py +0 -433
  22. polly_python-2.5.0/polly/core_cohort.py +0 -721
  23. polly_python-2.5.0/tests/test_cohort.py +0 -216
  24. {polly_python-2.5.0 → polly_python-3.0.0}/LICENSE.md +0 -0
  25. {polly_python-2.5.0 → polly_python-3.0.0}/MANIFEST.in +0 -0
  26. {polly_python-2.5.0 → polly_python-3.0.0}/README.md +0 -0
  27. {polly_python-2.5.0 → polly_python-3.0.0}/polly/analyze.py +0 -0
  28. {polly_python-2.5.0 → polly_python-3.0.0}/polly/application_error_info.py +0 -0
  29. {polly_python-2.5.0 → polly_python-3.0.0}/polly/atlas.py +0 -0
  30. {polly_python-2.5.0 → polly_python-3.0.0}/polly/auth.py +0 -0
  31. {polly_python-2.5.0 → polly_python-3.0.0}/polly/data_management.py +0 -0
  32. {polly_python-2.5.0 → polly_python-3.0.0}/polly/help.py +0 -0
  33. {polly_python-2.5.0 → polly_python-3.0.0}/polly/http_response_codes.py +0 -0
  34. {polly_python-2.5.0 → polly_python-3.0.0}/polly/index_schema_level_conversion_const.py +0 -0
  35. {polly_python-2.5.0 → polly_python-3.0.0}/polly/jobs.py +0 -0
  36. {polly_python-2.5.0 → polly_python-3.0.0}/polly/omixatlas_hlpr.py +0 -0
  37. {polly_python-2.5.0 → polly_python-3.0.0}/polly/s3_utils.py +0 -0
  38. {polly_python-2.5.0 → polly_python-3.0.0}/polly/session.py +0 -0
  39. {polly_python-2.5.0 → polly_python-3.0.0}/polly/threading_utils.py +0 -0
  40. {polly_python-2.5.0 → polly_python-3.0.0}/polly/tracking.py +0 -0
  41. {polly_python-2.5.0 → polly_python-3.0.0}/polly/validation.py +0 -0
  42. {polly_python-2.5.0 → polly_python-3.0.0}/polly/validation_hlpr.py +0 -0
  43. {polly_python-2.5.0 → polly_python-3.0.0}/polly/workspaces.py +0 -0
  44. {polly_python-2.5.0 → polly_python-3.0.0}/polly_interfaces/IFiles.py +0 -0
  45. {polly_python-2.5.0 → polly_python-3.0.0}/polly_interfaces/IReporting.py +0 -0
  46. {polly_python-2.5.0 → polly_python-3.0.0}/polly_interfaces/ISchema.py +0 -0
  47. {polly_python-2.5.0 → polly_python-3.0.0}/polly_interfaces/__init__.py +0 -0
  48. {polly_python-2.5.0 → polly_python-3.0.0}/polly_python.egg-info/dependency_links.txt +0 -0
  49. {polly_python-2.5.0 → polly_python-3.0.0}/polly_python.egg-info/top_level.txt +0 -0
  50. {polly_python-2.5.0 → polly_python-3.0.0}/polly_services/__init__.py +0 -0
  51. {polly_python-2.5.0 → polly_python-3.0.0}/polly_services/dataset.py +0 -0
  52. {polly_python-2.5.0 → polly_python-3.0.0}/polly_services/files/__init__.py +0 -0
  53. {polly_python-2.5.0 → polly_python-3.0.0}/polly_services/files/files.py +0 -0
  54. {polly_python-2.5.0 → polly_python-3.0.0}/polly_services/files/files_hlpr.py +0 -0
  55. {polly_python-2.5.0 → polly_python-3.0.0}/polly_services/polly_services_hlpr.py +0 -0
  56. {polly_python-2.5.0 → polly_python-3.0.0}/polly_services/reporting/__init__.py +0 -0
  57. {polly_python-2.5.0 → polly_python-3.0.0}/polly_services/reporting/reporting_hlpr.py +0 -0
  58. {polly_python-2.5.0 → polly_python-3.0.0}/polly_services/schema/__init__.py +0 -0
  59. {polly_python-2.5.0 → polly_python-3.0.0}/polly_services/schema/schema.py +0 -0
  60. {polly_python-2.5.0 → polly_python-3.0.0}/polly_services/schema/schema_const.py +0 -0
  61. {polly_python-2.5.0 → polly_python-3.0.0}/polly_services/schema/schema_hlpr.py +0 -0
  62. {polly_python-2.5.0 → polly_python-3.0.0}/polly_services/schema/validate_schema_hlpr.py +0 -0
  63. {polly_python-2.5.0 → polly_python-3.0.0}/pyproject.toml +0 -0
  64. {polly_python-2.5.0 → polly_python-3.0.0}/setup.py +0 -0
  65. {polly_python-2.5.0 → polly_python-3.0.0}/tests/test_constants.py +0 -0
  66. {polly_python-2.5.0 → polly_python-3.0.0}/tests/test_curation.py +0 -0
  67. {polly_python-2.5.0 → polly_python-3.0.0}/tests/test_data_management.py +0 -0
  68. {polly_python-2.5.0 → polly_python-3.0.0}/tests/test_jobs.py +0 -0
  69. {polly_python-2.5.0 → polly_python-3.0.0}/tests/test_s3_utils.py +0 -0
  70. {polly_python-2.5.0 → polly_python-3.0.0}/tests/test_schema_ux.py +0 -0
  71. {polly_python-2.5.0 → polly_python-3.0.0}/tests/test_threading_utils.py +0 -0
  72. {polly_python-2.5.0 → polly_python-3.0.0}/tests/test_workspaces.py +0 -0
@@ -1,14 +1,14 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: polly_python
3
- Version: 2.5.0
3
+ Version: 3.0.0
4
4
  Summary: Polly SDK
5
5
  Home-page: https://github.com/ElucidataInc/polly-python
6
6
  Project-URL: Documentation, https://docs.elucidata.io
7
7
  Project-URL: Tutorial Notebooks, https://github.com/ElucidataInc/polly-python
8
- Requires-Python: >3.8
8
+ Requires-Python: <=3.11,>=3.9
9
9
  Description-Content-Type: text/markdown
10
10
  License-File: LICENSE.md
11
- Requires-Dist: elucidatacmapPy==3.3.4
11
+ Requires-Dist: cmapPy<=4.0.1
12
12
  Requires-Dist: cloudpathlib>=0.15.0
13
13
  Requires-Dist: retrying==1.3.4
14
14
  Requires-Dist: rst2txt==1.1.0
@@ -17,22 +17,15 @@ Requires-Dist: mixpanel==4.10.0
17
17
  Requires-Dist: Deprecated>=1.2.12
18
18
  Requires-Dist: pytest>=6.2.5
19
19
  Requires-Dist: cryptography<=38.0.0,>=37.0.1
20
- Requires-Dist: plotly<5.0.0,>=4.8.1; python_version > "3.6" and python_version < "3.7"
21
20
  Requires-Dist: plotly>=5.0.0; python_version >= "3.7"
22
- Requires-Dist: pandas<1.2.0,>=1.1.0; python_version > "3.6" and python_version < "3.7"
23
- Requires-Dist: pandas>=1.3.5; python_version >= "3.7"
24
- Requires-Dist: pydantic<1.10.0a1,>=1.8.2; python_version > "3.6" and python_version < "3.7"
21
+ Requires-Dist: pandas<=2.2.2,>=1.3.5; python_version >= "3.7"
22
+ Requires-Dist: numpy<=1.26.4
25
23
  Requires-Dist: pydantic==1.10.12; python_version >= "3.7"
26
24
  Requires-Dist: requests==2.28.1
27
- Requires-Dist: numpy==1.26.4
28
- Requires-Dist: boto3<1.24.0,>=1.17.73; python_version > "3.6" and python_version < "3.7"
29
- Requires-Dist: boto3>=1.24.0; python_version >= "3.7"
30
- Requires-Dist: botocore<1.27.0,>=1.20.73; python_version > "3.6" and python_version < "3.7"
31
- Requires-Dist: botocore>=1.27.0; python_version >= "3.7"
32
- Requires-Dist: joblib<=1.1.0,>0.11.0; python_version > "3.6" and python_version < "3.7"
25
+ Requires-Dist: boto3<2.0,>=1.24.0; python_version >= "3.7"
26
+ Requires-Dist: botocore<2.0,>=1.27.0; python_version >= "3.7"
33
27
  Requires-Dist: joblib>=1.2.0; python_version >= "3.7"
34
28
  Requires-Dist: tabulate==0.9.0
35
- Requires-Dist: tqdm<4.65.0,>=4.61.0; python_version > "3.6" and python_version < "3.7"
36
29
  Requires-Dist: tqdm==4.65.0; python_version >= "3.7"
37
30
  Provides-Extra: testing
38
31
  Requires-Dist: black; extra == "testing"
@@ -0,0 +1 @@
1
+ __version__ = "3.0.0"
@@ -92,22 +92,22 @@ IO_CHUNKSIZE_LARGE_FILE_SIZE = 100 * MB
92
92
  EXPIRED_TOKEN = "ExpiredToken"
93
93
 
94
94
  # cohort constants
95
- COHORT_VERSION = "0.2"
96
- COHORT_CONSTANTS_URL = (
97
- "https://elucidatainc.github.io/PublicAssets/cohort_constants.txt"
98
- )
95
+ # COHORT_VERSION = "0.2"
96
+ # COHORT_CONSTANTS_URL = (
97
+ # "https://elucidatainc.github.io/PublicAssets/cohort_constants.txt"
98
+ # )
99
99
 
100
- OBSOLETE_METADATA_FIELDS = [
101
- "package",
102
- "region",
103
- "bucket",
104
- "key",
105
- "file_type",
106
- "file_location",
107
- "src_uri",
108
- "timestamp_",
109
- ]
110
- dot = "."
100
+ # OBSOLETE_METADATA_FIELDS = [
101
+ # "package",
102
+ # "region",
103
+ # "bucket",
104
+ # "key",
105
+ # "file_type",
106
+ # "file_location",
107
+ # "src_uri",
108
+ # "timestamp_",
109
+ # ]
110
+ # dot = "."
111
111
 
112
112
  GETTING_UPLOAD_URLS_PAYLOAD = {"data": {"type": "files", "attributes": {"folder": ""}}}
113
113
 
@@ -1,9 +1,11 @@
1
1
  import json
2
2
  from collections import namedtuple
3
- import os
4
- import shutil
3
+
4
+ # import os
5
+ # import shutil
5
6
  from typing import Dict, Optional, List
6
- import warnings
7
+
8
+ # import warnings
7
9
  import pandas as pd
8
10
  from functools import lru_cache
9
11
  from polly.errors import (
@@ -13,15 +15,19 @@ from polly.errors import (
13
15
  RequestException,
14
16
  UnauthorizedException,
15
17
  extract_json_api_error,
16
- paramException,
18
+ # paramException,
17
19
  )
18
20
  from polly.auth import Polly
19
- from polly.cohort import Cohort
21
+
22
+ # from polly.cohort import Cohort
20
23
  from polly import helpers, constants as const, application_error_info as app_err_info
21
24
  from polly.help import example
22
25
  import polly.http_response_codes as http_codes
23
- from polly.constants import SUPPORTED_ENTITY_TYPES, CURATION_COHORT_CACHE
24
- from polly.helpers import get_cohort_constants
26
+
27
+ # from polly.constants import SUPPORTED_ENTITY_TYPES, CURATION_COHORT_CACHE
28
+ from polly.constants import SUPPORTED_ENTITY_TYPES
29
+
30
+ # from polly.helpers import get_cohort_constants
25
31
  from polly.tracking import Track
26
32
 
27
33
 
@@ -60,162 +66,162 @@ class Curation:
60
66
  f"https://api.datalake.discover.{self.session.env}.elucidata.io/elastic/v2"
61
67
  )
62
68
  self.inference_url = f"https://api.discover.{self.session.env}.elucidata.io/curations/inferences/"
63
- self.cohort = Cohort()
64
- self.cohort_constants = get_cohort_constants()
69
+ # self.cohort = Cohort()
70
+ # self.cohort_constants = get_cohort_constants()
65
71
 
66
72
  def _handle_errors(self, response):
67
73
  detail = response.get("errors")[0].get("detail", [])
68
74
  title = response.get("errors")[0].get("title", [])
69
75
  return title, detail
70
76
 
71
- def _fetch_metadata_from_cohort(self, repo_name: str, dataset_ids: List[str]):
72
- """
73
- Utility function for fetching metadata using cohorts.
74
-
75
- Arguments:
76
- repo_name (str) : name of the repository for fetching datasets.
77
- dataset_ids (List[str]): dataset ids to be used for inference
78
-
79
- Returns:
80
- Returns sample metadata, dataset and sample ids.
81
- """
82
- sample_metadata = {}
83
- dataset_to_sample_id = {"dataset_id": [], "sample_id": []}
84
-
85
- if not (os.path.isdir(CURATION_COHORT_CACHE)):
86
- os.mkdir(CURATION_COHORT_CACHE)
87
- else:
88
- shutil.rmtree(CURATION_COHORT_CACHE)
89
- os.mkdir(CURATION_COHORT_CACHE)
90
-
91
- self.cohort.create_cohort(
92
- CURATION_COHORT_CACHE, "sample_metadata_query", "desc"
93
- )
94
-
95
- # Fetch metadata using cohorts
96
- for dataset_id in dataset_ids:
97
- datasets_sample_metadata = []
98
-
99
- if not (
100
- repo_name in self.cohort_constants
101
- and self.cohort_constants[repo_name]["file_structure"] != "multiple"
102
- ):
103
- # multiple mapped repo such as GEO
104
- self.cohort.add_to_cohort(repo_name, dataset_id=dataset_id)
105
- else:
106
- # for single mapped repos such as TCGA
107
- self.cohort.add_to_cohort(repo_name, dataset_id=[dataset_id])
108
-
109
- col_metadata = self.cohort.merge_data("sample")
110
- all_sample_ids = col_metadata.index.tolist()
111
-
112
- col_metadata.loc[:, "dataset_id"] = dataset_id
113
- dataset_to_sample_id["dataset_id"] += [dataset_id] * len(all_sample_ids)
114
-
115
- col_metadata.loc[:, "sample_id"] = all_sample_ids
116
- dataset_to_sample_id["sample_id"] += all_sample_ids
117
-
118
- datasets_sample_metadata += list(col_metadata.T.to_dict().values())
119
-
120
- if not (
121
- repo_name in self.cohort_constants
122
- and self.cohort_constants[repo_name]["file_structure"] != "multiple"
123
- ):
124
- self.cohort.remove_from_cohort(dataset_id)
125
- else:
126
- self.cohort.remove_from_cohort([dataset_id])
127
-
128
- sample_metadata[dataset_id] = datasets_sample_metadata
129
-
130
- dataset_to_sample_id = pd.DataFrame.from_dict(dataset_to_sample_id)
131
-
132
- return sample_metadata, dataset_to_sample_id
133
-
134
- def _clinical_model_param_checks(
135
- self,
136
- repo_name: str,
137
- dataset_ids: List[str],
138
- sample_ids: Optional[List[str]] = None,
139
- ):
140
- """
141
- Checking the parameter passed to the clinical label assigning model.
142
-
143
- Arguments:
144
- repo_name (str): repo name
145
- dataset_ids (list[str]): list of dataset ids
146
-
147
- Keyword Arguments:
148
- sample_ids (list[str], optional): Optional Parameter. List of sample ids.
149
- Default is 'None'.
150
-
151
- Raises:
152
- paramException
153
- """
154
- if dataset_ids is None or type(dataset_ids) is not list:
155
- raise paramException(
156
- title="Param Exception",
157
- detail="Dataset IDs should be given as a valid list of strings",
158
- )
159
-
160
- if sample_ids is not None and type(sample_ids) is not list:
161
- raise paramException(
162
- title="Param Exception",
163
- detail="Sample IDs should be given as a valid list of strings",
164
- )
165
-
166
- if repo_name != "geo" and not any(
167
- ["GSE" in dataset_id for dataset_id in dataset_ids]
168
- ):
169
- warnings.warn(
170
- "The model is tested with GEO metadata and the labels may be wrong for other repos"
171
- )
172
-
173
- def _post_process_clinical_tags(
174
- self,
175
- clinical_tags: pd.DataFrame,
176
- is_sample_tag: bool,
177
- sample_ids: Optional[List[str]] = None,
178
- ) -> pd.DataFrame:
179
- """
180
- process the response of the model (dataframe with clinical tags and samples)
181
- and return relevant feilds.
182
- incase no sample_ids are provided by the user, we return the dataset_ids and the clinical tags
183
- incase sample_ids are also provided, then we return the dataset_ids, the sample_ids and the clincal tags.
184
-
185
- Arguments:
186
- clinical_tags (pd.DataFrame): dataframe of the sample_ids and assigned clinical tags
187
- is_sample_tag (bool): if samples passed
188
-
189
- Keyword Arguments:
190
- sample_ids (list[str]): list of sample ids (default: {None})
191
-
192
- Returns:
193
- a dataframe with the the dataset_ids, sample_ids and the assigned clinical tags
194
- """
195
- if is_sample_tag:
196
- # if the user has provided list of samples, then we filter in just those sample ids
197
- # for the dataset ids.
198
- # taking only those clinical tags and samples where the sample_ids are in the sample_id list
199
- # provided by the user.
200
- clinical_tags = clinical_tags[
201
- clinical_tags["sample_id"].isin(sample_ids)
202
- ].reset_index(drop=True)
203
-
204
- # in case the sample_ids provided by the user are not present in the dataset_ids provided.
205
- if clinical_tags.empty or clinical_tags.shape[0] < len(sample_ids):
206
- warnings.warn(
207
- "The output is empty or has missing sample ids because they are not present in given datasets."
208
- )
209
-
210
- # return sample level tags here
211
- return clinical_tags
212
- # if no sample_ids were passed by the user, then
213
- # returning dataset level tags by removing sample id and removing duplicate columns
214
- return (
215
- clinical_tags.drop(columns=["sample_id"])
216
- .drop_duplicates()
217
- .reset_index(drop=True)
218
- )
77
+ # def _fetch_metadata_from_cohort(self, repo_name: str, dataset_ids: List[str]):
78
+ # """
79
+ # Utility function for fetching metadata using cohorts.
80
+
81
+ # Arguments:
82
+ # repo_name (str) : name of the repository for fetching datasets.
83
+ # dataset_ids (List[str]): dataset ids to be used for inference
84
+
85
+ # Returns:
86
+ # Returns sample metadata, dataset and sample ids.
87
+ # """
88
+ # sample_metadata = {}
89
+ # dataset_to_sample_id = {"dataset_id": [], "sample_id": []}
90
+
91
+ # if not (os.path.isdir(CURATION_COHORT_CACHE)):
92
+ # os.mkdir(CURATION_COHORT_CACHE)
93
+ # else:
94
+ # shutil.rmtree(CURATION_COHORT_CACHE)
95
+ # os.mkdir(CURATION_COHORT_CACHE)
96
+
97
+ # self.cohort.create_cohort(
98
+ # CURATION_COHORT_CACHE, "sample_metadata_query", "desc"
99
+ # )
100
+
101
+ # # Fetch metadata using cohorts
102
+ # for dataset_id in dataset_ids:
103
+ # datasets_sample_metadata = []
104
+
105
+ # if not (
106
+ # repo_name in self.cohort_constants
107
+ # and self.cohort_constants[repo_name]["file_structure"] != "multiple"
108
+ # ):
109
+ # # multiple mapped repo such as GEO
110
+ # self.cohort.add_to_cohort(repo_name, dataset_id=dataset_id)
111
+ # else:
112
+ # # for single mapped repos such as TCGA
113
+ # self.cohort.add_to_cohort(repo_name, dataset_id=[dataset_id])
114
+
115
+ # col_metadata = self.cohort.merge_data("sample")
116
+ # all_sample_ids = col_metadata.index.tolist()
117
+
118
+ # col_metadata.loc[:, "dataset_id"] = dataset_id
119
+ # dataset_to_sample_id["dataset_id"] += [dataset_id] * len(all_sample_ids)
120
+
121
+ # col_metadata.loc[:, "sample_id"] = all_sample_ids
122
+ # dataset_to_sample_id["sample_id"] += all_sample_ids
123
+
124
+ # datasets_sample_metadata += list(col_metadata.T.to_dict().values())
125
+
126
+ # if not (
127
+ # repo_name in self.cohort_constants
128
+ # and self.cohort_constants[repo_name]["file_structure"] != "multiple"
129
+ # ):
130
+ # self.cohort.remove_from_cohort(dataset_id)
131
+ # else:
132
+ # self.cohort.remove_from_cohort([dataset_id])
133
+
134
+ # sample_metadata[dataset_id] = datasets_sample_metadata
135
+
136
+ # dataset_to_sample_id = pd.DataFrame.from_dict(dataset_to_sample_id)
137
+
138
+ # return sample_metadata, dataset_to_sample_id
139
+
140
+ # def _clinical_model_param_checks(
141
+ # self,
142
+ # repo_name: str,
143
+ # dataset_ids: List[str],
144
+ # sample_ids: Optional[List[str]] = None,
145
+ # ):
146
+ # """
147
+ # Checking the parameter passed to the clinical label assigning model.
148
+
149
+ # Arguments:
150
+ # repo_name (str): repo name
151
+ # dataset_ids (list[str]): list of dataset ids
152
+
153
+ # Keyword Arguments:
154
+ # sample_ids (list[str], optional): Optional Parameter. List of sample ids.
155
+ # Default is 'None'.
156
+
157
+ # Raises:
158
+ # paramException
159
+ # """
160
+ # if dataset_ids is None or type(dataset_ids) is not list:
161
+ # raise paramException(
162
+ # title="Param Exception",
163
+ # detail="Dataset IDs should be given as a valid list of strings",
164
+ # )
165
+
166
+ # if sample_ids is not None and type(sample_ids) is not list:
167
+ # raise paramException(
168
+ # title="Param Exception",
169
+ # detail="Sample IDs should be given as a valid list of strings",
170
+ # )
171
+
172
+ # if repo_name != "geo" and not any(
173
+ # ["GSE" in dataset_id for dataset_id in dataset_ids]
174
+ # ):
175
+ # warnings.warn(
176
+ # "The model is tested with GEO metadata and the labels may be wrong for other repos"
177
+ # )
178
+
179
+ # def _post_process_clinical_tags(
180
+ # self,
181
+ # clinical_tags: pd.DataFrame,
182
+ # is_sample_tag: bool,
183
+ # sample_ids: Optional[List[str]] = None,
184
+ # ) -> pd.DataFrame:
185
+ # """
186
+ # process the response of the model (dataframe with clinical tags and samples)
187
+ # and return relevant feilds.
188
+ # incase no sample_ids are provided by the user, we return the dataset_ids and the clinical tags
189
+ # incase sample_ids are also provided, then we return the dataset_ids, the sample_ids and the clincal tags.
190
+
191
+ # Arguments:
192
+ # clinical_tags (pd.DataFrame): dataframe of the sample_ids and assigned clinical tags
193
+ # is_sample_tag (bool): if samples passed
194
+
195
+ # Keyword Arguments:
196
+ # sample_ids (list[str]): list of sample ids (default: {None})
197
+
198
+ # Returns:
199
+ # a dataframe with the the dataset_ids, sample_ids and the assigned clinical tags
200
+ # """
201
+ # if is_sample_tag:
202
+ # # if the user has provided list of samples, then we filter in just those sample ids
203
+ # # for the dataset ids.
204
+ # # taking only those clinical tags and samples where the sample_ids are in the sample_id list
205
+ # # provided by the user.
206
+ # clinical_tags = clinical_tags[
207
+ # clinical_tags["sample_id"].isin(sample_ids)
208
+ # ].reset_index(drop=True)
209
+
210
+ # # in case the sample_ids provided by the user are not present in the dataset_ids provided.
211
+ # if clinical_tags.empty or clinical_tags.shape[0] < len(sample_ids):
212
+ # warnings.warn(
213
+ # "The output is empty or has missing sample ids because they are not present in given datasets."
214
+ # )
215
+
216
+ # # return sample level tags here
217
+ # return clinical_tags
218
+ # # if no sample_ids were passed by the user, then
219
+ # # returning dataset level tags by removing sample id and removing duplicate columns
220
+ # return (
221
+ # clinical_tags.drop(columns=["sample_id"])
222
+ # .drop_duplicates()
223
+ # .reset_index(drop=True)
224
+ # )
219
225
 
220
226
  def _handle_perform_inference_api_error(self, response):
221
227
  if response.status_code == http_codes.UNAUTHORIZED:
@@ -483,83 +489,83 @@ class Curation:
483
489
  sample_metadata["control_prob"] = output["control_prob"].values
484
490
  return sample_metadata
485
491
 
486
- @Track.track_decorator
487
- def assign_clinical_labels(
488
- self,
489
- repo_name: str,
490
- dataset_ids: List[str],
491
- sample_ids: Optional[List[str]] = None,
492
- ) -> pd.DataFrame:
493
- """
494
- Returns a list of clinical or non clinical labels for the given datasets or samples.
495
-
496
- Arguments:
497
- repo_name (str): name of the repository for fetching datasets.
498
- dataset_ids (List[str]): dataset ids to be used for inference
499
-
500
- Keyword Arguments:
501
- sample_ids (List[str], optional): Optional Parameter. Sample ids if that is needed.
502
-
503
- Raises:
504
- RequestException: API response exception
505
- ParamException: Invalid parameters
506
- err
507
-
508
- Returns:
509
- dataframe which is a list of clinical tags for given ids
510
- """
511
- warnings.formatwarning = lambda msg, *args, **kwargs: f"WARNING: {msg}\n"
512
-
513
- try:
514
- self._clinical_model_param_checks(repo_name, dataset_ids, sample_ids)
515
- # evaluating the inference level based on if the user has provided sample_ids
516
- is_sample_tag = sample_ids is not None
517
- inference_level = "sample_id" if (is_sample_tag) else "dataset_id"
518
-
519
- sample_metadata, dataset_to_sample_id = self._fetch_metadata_from_cohort(
520
- repo_name=repo_name, dataset_ids=dataset_ids
521
- )
522
-
523
- clinical_model_predictions = []
524
-
525
- for dataset_id in sample_metadata:
526
- # Get output from model endpoint and structure output
527
- payload = {
528
- "sample_metadata": sample_metadata[dataset_id],
529
- "sample_id_column": "sample_id",
530
- "dataset_id_column": "dataset_id",
531
- "is_sample_tag": is_sample_tag,
532
- }
533
-
534
- output = self._perform_inference("clinical-classifier", payload)
535
- if "errors" in output:
536
- title, detail = self._handle_errors(output)
537
- raise RequestException(title, detail)
538
-
539
- output = output["clinical_predictions"]
540
-
541
- clinical_model_predictions += output
542
-
543
- # creating dataframe with inference_level and clinical_tags with values from the clinical_model_predictions
544
- clinical_tags = pd.DataFrame(
545
- {
546
- inference_level: [
547
- tag["tag_id"] for tag in clinical_model_predictions
548
- ],
549
- "clinical_tag": [
550
- tag["clinical_tag"] for tag in clinical_model_predictions
551
- ],
552
- }
553
- )
554
-
555
- clinical_tags = pd.merge(
556
- dataset_to_sample_id, clinical_tags, on=inference_level
557
- )
558
-
559
- clinical_tags = self._post_process_clinical_tags(
560
- clinical_tags, is_sample_tag, sample_ids
561
- )
562
- except Exception as err:
563
- raise err
564
-
565
- return clinical_tags
492
+ # @Track.track_decorator
493
+ # def assign_clinical_labels(
494
+ # self,
495
+ # repo_name: str,
496
+ # dataset_ids: List[str],
497
+ # sample_ids: Optional[List[str]] = None,
498
+ # ) -> pd.DataFrame:
499
+ # """
500
+ # Returns a list of clinical or non clinical labels for the given datasets or samples.
501
+
502
+ # Arguments:
503
+ # repo_name (str): name of the repository for fetching datasets.
504
+ # dataset_ids (List[str]): dataset ids to be used for inference
505
+
506
+ # Keyword Arguments:
507
+ # sample_ids (List[str], optional): Optional Parameter. Sample ids if that is needed.
508
+
509
+ # Raises:
510
+ # RequestException: API response exception
511
+ # ParamException: Invalid parameters
512
+ # err
513
+
514
+ # Returns:
515
+ # dataframe which is a list of clinical tags for given ids
516
+ # """
517
+ # warnings.formatwarning = lambda msg, *args, **kwargs: f"WARNING: {msg}\n"
518
+
519
+ # try:
520
+ # self._clinical_model_param_checks(repo_name, dataset_ids, sample_ids)
521
+ # # evaluating the inference level based on if the user has provided sample_ids
522
+ # is_sample_tag = sample_ids is not None
523
+ # inference_level = "sample_id" if (is_sample_tag) else "dataset_id"
524
+
525
+ # sample_metadata, dataset_to_sample_id = self._fetch_metadata_from_cohort(
526
+ # repo_name=repo_name, dataset_ids=dataset_ids
527
+ # )
528
+
529
+ # clinical_model_predictions = []
530
+
531
+ # for dataset_id in sample_metadata:
532
+ # # Get output from model endpoint and structure output
533
+ # payload = {
534
+ # "sample_metadata": sample_metadata[dataset_id],
535
+ # "sample_id_column": "sample_id",
536
+ # "dataset_id_column": "dataset_id",
537
+ # "is_sample_tag": is_sample_tag,
538
+ # }
539
+
540
+ # output = self._perform_inference("clinical-classifier", payload)
541
+ # if "errors" in output:
542
+ # title, detail = self._handle_errors(output)
543
+ # raise RequestException(title, detail)
544
+
545
+ # output = output["clinical_predictions"]
546
+
547
+ # clinical_model_predictions += output
548
+
549
+ # # creating dataframe with inference_level and clinical_tags with values from the clinical_model_predictions
550
+ # clinical_tags = pd.DataFrame(
551
+ # {
552
+ # inference_level: [
553
+ # tag["tag_id"] for tag in clinical_model_predictions
554
+ # ],
555
+ # "clinical_tag": [
556
+ # tag["clinical_tag"] for tag in clinical_model_predictions
557
+ # ],
558
+ # }
559
+ # )
560
+
561
+ # clinical_tags = pd.merge(
562
+ # dataset_to_sample_id, clinical_tags, on=inference_level
563
+ # )
564
+
565
+ # clinical_tags = self._post_process_clinical_tags(
566
+ # clinical_tags, is_sample_tag, sample_ids
567
+ # )
568
+ # except Exception as err:
569
+ # raise err
570
+
571
+ # return clinical_tags