polly-python 2.5.0__tar.gz → 3.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. {polly_python-2.5.0/polly_python.egg-info → polly_python-3.0.0}/PKG-INFO +7 -14
  2. polly_python-3.0.0/polly/__init__.py +1 -0
  3. {polly_python-2.5.0 → polly_python-3.0.0}/polly/constants.py +15 -15
  4. {polly_python-2.5.0 → polly_python-3.0.0}/polly/curation.py +243 -237
  5. {polly_python-2.5.0 → polly_python-3.0.0}/polly/errors.py +38 -38
  6. {polly_python-2.5.0 → polly_python-3.0.0}/polly/helpers.py +1 -56
  7. {polly_python-2.5.0 → polly_python-3.0.0}/polly/omixatlas.py +39 -33
  8. {polly_python-2.5.0 → polly_python-3.0.0}/polly/pipelines.py +119 -92
  9. {polly_python-2.5.0 → polly_python-3.0.0/polly_python.egg-info}/PKG-INFO +7 -14
  10. {polly_python-2.5.0 → polly_python-3.0.0}/polly_python.egg-info/SOURCES.txt +2 -4
  11. {polly_python-2.5.0 → polly_python-3.0.0}/polly_python.egg-info/requires.txt +5 -14
  12. {polly_python-2.5.0 → polly_python-3.0.0}/polly_services/reporting/reporting.py +49 -27
  13. {polly_python-2.5.0 → polly_python-3.0.0}/setup.cfg +6 -13
  14. polly_python-3.0.0/tests/test_help.py +81 -0
  15. {polly_python-2.5.0 → polly_python-3.0.0}/tests/test_helpers.py +10 -10
  16. {polly_python-2.5.0 → polly_python-3.0.0}/tests/test_omixatlas.py +9 -4
  17. {polly_python-2.5.0 → polly_python-3.0.0}/tests/test_pipelines.py +34 -25
  18. polly_python-3.0.0/tests/test_validation.py +135 -0
  19. polly_python-2.5.0/polly/__init__.py +0 -1
  20. polly_python-2.5.0/polly/bridge_cohort.py +0 -399
  21. polly_python-2.5.0/polly/cohort.py +0 -433
  22. polly_python-2.5.0/polly/core_cohort.py +0 -721
  23. polly_python-2.5.0/tests/test_cohort.py +0 -216
  24. {polly_python-2.5.0 → polly_python-3.0.0}/LICENSE.md +0 -0
  25. {polly_python-2.5.0 → polly_python-3.0.0}/MANIFEST.in +0 -0
  26. {polly_python-2.5.0 → polly_python-3.0.0}/README.md +0 -0
  27. {polly_python-2.5.0 → polly_python-3.0.0}/polly/analyze.py +0 -0
  28. {polly_python-2.5.0 → polly_python-3.0.0}/polly/application_error_info.py +0 -0
  29. {polly_python-2.5.0 → polly_python-3.0.0}/polly/atlas.py +0 -0
  30. {polly_python-2.5.0 → polly_python-3.0.0}/polly/auth.py +0 -0
  31. {polly_python-2.5.0 → polly_python-3.0.0}/polly/data_management.py +0 -0
  32. {polly_python-2.5.0 → polly_python-3.0.0}/polly/help.py +0 -0
  33. {polly_python-2.5.0 → polly_python-3.0.0}/polly/http_response_codes.py +0 -0
  34. {polly_python-2.5.0 → polly_python-3.0.0}/polly/index_schema_level_conversion_const.py +0 -0
  35. {polly_python-2.5.0 → polly_python-3.0.0}/polly/jobs.py +0 -0
  36. {polly_python-2.5.0 → polly_python-3.0.0}/polly/omixatlas_hlpr.py +0 -0
  37. {polly_python-2.5.0 → polly_python-3.0.0}/polly/s3_utils.py +0 -0
  38. {polly_python-2.5.0 → polly_python-3.0.0}/polly/session.py +0 -0
  39. {polly_python-2.5.0 → polly_python-3.0.0}/polly/threading_utils.py +0 -0
  40. {polly_python-2.5.0 → polly_python-3.0.0}/polly/tracking.py +0 -0
  41. {polly_python-2.5.0 → polly_python-3.0.0}/polly/validation.py +0 -0
  42. {polly_python-2.5.0 → polly_python-3.0.0}/polly/validation_hlpr.py +0 -0
  43. {polly_python-2.5.0 → polly_python-3.0.0}/polly/workspaces.py +0 -0
  44. {polly_python-2.5.0 → polly_python-3.0.0}/polly_interfaces/IFiles.py +0 -0
  45. {polly_python-2.5.0 → polly_python-3.0.0}/polly_interfaces/IReporting.py +0 -0
  46. {polly_python-2.5.0 → polly_python-3.0.0}/polly_interfaces/ISchema.py +0 -0
  47. {polly_python-2.5.0 → polly_python-3.0.0}/polly_interfaces/__init__.py +0 -0
  48. {polly_python-2.5.0 → polly_python-3.0.0}/polly_python.egg-info/dependency_links.txt +0 -0
  49. {polly_python-2.5.0 → polly_python-3.0.0}/polly_python.egg-info/top_level.txt +0 -0
  50. {polly_python-2.5.0 → polly_python-3.0.0}/polly_services/__init__.py +0 -0
  51. {polly_python-2.5.0 → polly_python-3.0.0}/polly_services/dataset.py +0 -0
  52. {polly_python-2.5.0 → polly_python-3.0.0}/polly_services/files/__init__.py +0 -0
  53. {polly_python-2.5.0 → polly_python-3.0.0}/polly_services/files/files.py +0 -0
  54. {polly_python-2.5.0 → polly_python-3.0.0}/polly_services/files/files_hlpr.py +0 -0
  55. {polly_python-2.5.0 → polly_python-3.0.0}/polly_services/polly_services_hlpr.py +0 -0
  56. {polly_python-2.5.0 → polly_python-3.0.0}/polly_services/reporting/__init__.py +0 -0
  57. {polly_python-2.5.0 → polly_python-3.0.0}/polly_services/reporting/reporting_hlpr.py +0 -0
  58. {polly_python-2.5.0 → polly_python-3.0.0}/polly_services/schema/__init__.py +0 -0
  59. {polly_python-2.5.0 → polly_python-3.0.0}/polly_services/schema/schema.py +0 -0
  60. {polly_python-2.5.0 → polly_python-3.0.0}/polly_services/schema/schema_const.py +0 -0
  61. {polly_python-2.5.0 → polly_python-3.0.0}/polly_services/schema/schema_hlpr.py +0 -0
  62. {polly_python-2.5.0 → polly_python-3.0.0}/polly_services/schema/validate_schema_hlpr.py +0 -0
  63. {polly_python-2.5.0 → polly_python-3.0.0}/pyproject.toml +0 -0
  64. {polly_python-2.5.0 → polly_python-3.0.0}/setup.py +0 -0
  65. {polly_python-2.5.0 → polly_python-3.0.0}/tests/test_constants.py +0 -0
  66. {polly_python-2.5.0 → polly_python-3.0.0}/tests/test_curation.py +0 -0
  67. {polly_python-2.5.0 → polly_python-3.0.0}/tests/test_data_management.py +0 -0
  68. {polly_python-2.5.0 → polly_python-3.0.0}/tests/test_jobs.py +0 -0
  69. {polly_python-2.5.0 → polly_python-3.0.0}/tests/test_s3_utils.py +0 -0
  70. {polly_python-2.5.0 → polly_python-3.0.0}/tests/test_schema_ux.py +0 -0
  71. {polly_python-2.5.0 → polly_python-3.0.0}/tests/test_threading_utils.py +0 -0
  72. {polly_python-2.5.0 → polly_python-3.0.0}/tests/test_workspaces.py +0 -0
@@ -1,14 +1,14 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: polly_python
3
- Version: 2.5.0
3
+ Version: 3.0.0
4
4
  Summary: Polly SDK
5
5
  Home-page: https://github.com/ElucidataInc/polly-python
6
6
  Project-URL: Documentation, https://docs.elucidata.io
7
7
  Project-URL: Tutorial Notebooks, https://github.com/ElucidataInc/polly-python
8
- Requires-Python: >3.8
8
+ Requires-Python: <=3.11,>=3.9
9
9
  Description-Content-Type: text/markdown
10
10
  License-File: LICENSE.md
11
- Requires-Dist: elucidatacmapPy==3.3.4
11
+ Requires-Dist: cmapPy<=4.0.1
12
12
  Requires-Dist: cloudpathlib>=0.15.0
13
13
  Requires-Dist: retrying==1.3.4
14
14
  Requires-Dist: rst2txt==1.1.0
@@ -17,22 +17,15 @@ Requires-Dist: mixpanel==4.10.0
17
17
  Requires-Dist: Deprecated>=1.2.12
18
18
  Requires-Dist: pytest>=6.2.5
19
19
  Requires-Dist: cryptography<=38.0.0,>=37.0.1
20
- Requires-Dist: plotly<5.0.0,>=4.8.1; python_version > "3.6" and python_version < "3.7"
21
20
  Requires-Dist: plotly>=5.0.0; python_version >= "3.7"
22
- Requires-Dist: pandas<1.2.0,>=1.1.0; python_version > "3.6" and python_version < "3.7"
23
- Requires-Dist: pandas>=1.3.5; python_version >= "3.7"
24
- Requires-Dist: pydantic<1.10.0a1,>=1.8.2; python_version > "3.6" and python_version < "3.7"
21
+ Requires-Dist: pandas<=2.2.2,>=1.3.5; python_version >= "3.7"
22
+ Requires-Dist: numpy<=1.26.4
25
23
  Requires-Dist: pydantic==1.10.12; python_version >= "3.7"
26
24
  Requires-Dist: requests==2.28.1
27
- Requires-Dist: numpy==1.26.4
28
- Requires-Dist: boto3<1.24.0,>=1.17.73; python_version > "3.6" and python_version < "3.7"
29
- Requires-Dist: boto3>=1.24.0; python_version >= "3.7"
30
- Requires-Dist: botocore<1.27.0,>=1.20.73; python_version > "3.6" and python_version < "3.7"
31
- Requires-Dist: botocore>=1.27.0; python_version >= "3.7"
32
- Requires-Dist: joblib<=1.1.0,>0.11.0; python_version > "3.6" and python_version < "3.7"
25
+ Requires-Dist: boto3<2.0,>=1.24.0; python_version >= "3.7"
26
+ Requires-Dist: botocore<2.0,>=1.27.0; python_version >= "3.7"
33
27
  Requires-Dist: joblib>=1.2.0; python_version >= "3.7"
34
28
  Requires-Dist: tabulate==0.9.0
35
- Requires-Dist: tqdm<4.65.0,>=4.61.0; python_version > "3.6" and python_version < "3.7"
36
29
  Requires-Dist: tqdm==4.65.0; python_version >= "3.7"
37
30
  Provides-Extra: testing
38
31
  Requires-Dist: black; extra == "testing"
@@ -0,0 +1 @@
1
+ __version__ = "3.0.0"
@@ -92,22 +92,22 @@ IO_CHUNKSIZE_LARGE_FILE_SIZE = 100 * MB
92
92
  EXPIRED_TOKEN = "ExpiredToken"
93
93
 
94
94
  # cohort constants
95
- COHORT_VERSION = "0.2"
96
- COHORT_CONSTANTS_URL = (
97
- "https://elucidatainc.github.io/PublicAssets/cohort_constants.txt"
98
- )
95
+ # COHORT_VERSION = "0.2"
96
+ # COHORT_CONSTANTS_URL = (
97
+ # "https://elucidatainc.github.io/PublicAssets/cohort_constants.txt"
98
+ # )
99
99
 
100
- OBSOLETE_METADATA_FIELDS = [
101
- "package",
102
- "region",
103
- "bucket",
104
- "key",
105
- "file_type",
106
- "file_location",
107
- "src_uri",
108
- "timestamp_",
109
- ]
110
- dot = "."
100
+ # OBSOLETE_METADATA_FIELDS = [
101
+ # "package",
102
+ # "region",
103
+ # "bucket",
104
+ # "key",
105
+ # "file_type",
106
+ # "file_location",
107
+ # "src_uri",
108
+ # "timestamp_",
109
+ # ]
110
+ # dot = "."
111
111
 
112
112
  GETTING_UPLOAD_URLS_PAYLOAD = {"data": {"type": "files", "attributes": {"folder": ""}}}
113
113
 
@@ -1,9 +1,11 @@
1
1
  import json
2
2
  from collections import namedtuple
3
- import os
4
- import shutil
3
+
4
+ # import os
5
+ # import shutil
5
6
  from typing import Dict, Optional, List
6
- import warnings
7
+
8
+ # import warnings
7
9
  import pandas as pd
8
10
  from functools import lru_cache
9
11
  from polly.errors import (
@@ -13,15 +15,19 @@ from polly.errors import (
13
15
  RequestException,
14
16
  UnauthorizedException,
15
17
  extract_json_api_error,
16
- paramException,
18
+ # paramException,
17
19
  )
18
20
  from polly.auth import Polly
19
- from polly.cohort import Cohort
21
+
22
+ # from polly.cohort import Cohort
20
23
  from polly import helpers, constants as const, application_error_info as app_err_info
21
24
  from polly.help import example
22
25
  import polly.http_response_codes as http_codes
23
- from polly.constants import SUPPORTED_ENTITY_TYPES, CURATION_COHORT_CACHE
24
- from polly.helpers import get_cohort_constants
26
+
27
+ # from polly.constants import SUPPORTED_ENTITY_TYPES, CURATION_COHORT_CACHE
28
+ from polly.constants import SUPPORTED_ENTITY_TYPES
29
+
30
+ # from polly.helpers import get_cohort_constants
25
31
  from polly.tracking import Track
26
32
 
27
33
 
@@ -60,162 +66,162 @@ class Curation:
60
66
  f"https://api.datalake.discover.{self.session.env}.elucidata.io/elastic/v2"
61
67
  )
62
68
  self.inference_url = f"https://api.discover.{self.session.env}.elucidata.io/curations/inferences/"
63
- self.cohort = Cohort()
64
- self.cohort_constants = get_cohort_constants()
69
+ # self.cohort = Cohort()
70
+ # self.cohort_constants = get_cohort_constants()
65
71
 
66
72
  def _handle_errors(self, response):
67
73
  detail = response.get("errors")[0].get("detail", [])
68
74
  title = response.get("errors")[0].get("title", [])
69
75
  return title, detail
70
76
 
71
- def _fetch_metadata_from_cohort(self, repo_name: str, dataset_ids: List[str]):
72
- """
73
- Utility function for fetching metadata using cohorts.
74
-
75
- Arguments:
76
- repo_name (str) : name of the repository for fetching datasets.
77
- dataset_ids (List[str]): dataset ids to be used for inference
78
-
79
- Returns:
80
- Returns sample metadata, dataset and sample ids.
81
- """
82
- sample_metadata = {}
83
- dataset_to_sample_id = {"dataset_id": [], "sample_id": []}
84
-
85
- if not (os.path.isdir(CURATION_COHORT_CACHE)):
86
- os.mkdir(CURATION_COHORT_CACHE)
87
- else:
88
- shutil.rmtree(CURATION_COHORT_CACHE)
89
- os.mkdir(CURATION_COHORT_CACHE)
90
-
91
- self.cohort.create_cohort(
92
- CURATION_COHORT_CACHE, "sample_metadata_query", "desc"
93
- )
94
-
95
- # Fetch metadata using cohorts
96
- for dataset_id in dataset_ids:
97
- datasets_sample_metadata = []
98
-
99
- if not (
100
- repo_name in self.cohort_constants
101
- and self.cohort_constants[repo_name]["file_structure"] != "multiple"
102
- ):
103
- # multiple mapped repo such as GEO
104
- self.cohort.add_to_cohort(repo_name, dataset_id=dataset_id)
105
- else:
106
- # for single mapped repos such as TCGA
107
- self.cohort.add_to_cohort(repo_name, dataset_id=[dataset_id])
108
-
109
- col_metadata = self.cohort.merge_data("sample")
110
- all_sample_ids = col_metadata.index.tolist()
111
-
112
- col_metadata.loc[:, "dataset_id"] = dataset_id
113
- dataset_to_sample_id["dataset_id"] += [dataset_id] * len(all_sample_ids)
114
-
115
- col_metadata.loc[:, "sample_id"] = all_sample_ids
116
- dataset_to_sample_id["sample_id"] += all_sample_ids
117
-
118
- datasets_sample_metadata += list(col_metadata.T.to_dict().values())
119
-
120
- if not (
121
- repo_name in self.cohort_constants
122
- and self.cohort_constants[repo_name]["file_structure"] != "multiple"
123
- ):
124
- self.cohort.remove_from_cohort(dataset_id)
125
- else:
126
- self.cohort.remove_from_cohort([dataset_id])
127
-
128
- sample_metadata[dataset_id] = datasets_sample_metadata
129
-
130
- dataset_to_sample_id = pd.DataFrame.from_dict(dataset_to_sample_id)
131
-
132
- return sample_metadata, dataset_to_sample_id
133
-
134
- def _clinical_model_param_checks(
135
- self,
136
- repo_name: str,
137
- dataset_ids: List[str],
138
- sample_ids: Optional[List[str]] = None,
139
- ):
140
- """
141
- Checking the parameter passed to the clinical label assigning model.
142
-
143
- Arguments:
144
- repo_name (str): repo name
145
- dataset_ids (list[str]): list of dataset ids
146
-
147
- Keyword Arguments:
148
- sample_ids (list[str], optional): Optional Parameter. List of sample ids.
149
- Default is 'None'.
150
-
151
- Raises:
152
- paramException
153
- """
154
- if dataset_ids is None or type(dataset_ids) is not list:
155
- raise paramException(
156
- title="Param Exception",
157
- detail="Dataset IDs should be given as a valid list of strings",
158
- )
159
-
160
- if sample_ids is not None and type(sample_ids) is not list:
161
- raise paramException(
162
- title="Param Exception",
163
- detail="Sample IDs should be given as a valid list of strings",
164
- )
165
-
166
- if repo_name != "geo" and not any(
167
- ["GSE" in dataset_id for dataset_id in dataset_ids]
168
- ):
169
- warnings.warn(
170
- "The model is tested with GEO metadata and the labels may be wrong for other repos"
171
- )
172
-
173
- def _post_process_clinical_tags(
174
- self,
175
- clinical_tags: pd.DataFrame,
176
- is_sample_tag: bool,
177
- sample_ids: Optional[List[str]] = None,
178
- ) -> pd.DataFrame:
179
- """
180
- process the response of the model (dataframe with clinical tags and samples)
181
- and return relevant feilds.
182
- incase no sample_ids are provided by the user, we return the dataset_ids and the clinical tags
183
- incase sample_ids are also provided, then we return the dataset_ids, the sample_ids and the clincal tags.
184
-
185
- Arguments:
186
- clinical_tags (pd.DataFrame): dataframe of the sample_ids and assigned clinical tags
187
- is_sample_tag (bool): if samples passed
188
-
189
- Keyword Arguments:
190
- sample_ids (list[str]): list of sample ids (default: {None})
191
-
192
- Returns:
193
- a dataframe with the the dataset_ids, sample_ids and the assigned clinical tags
194
- """
195
- if is_sample_tag:
196
- # if the user has provided list of samples, then we filter in just those sample ids
197
- # for the dataset ids.
198
- # taking only those clinical tags and samples where the sample_ids are in the sample_id list
199
- # provided by the user.
200
- clinical_tags = clinical_tags[
201
- clinical_tags["sample_id"].isin(sample_ids)
202
- ].reset_index(drop=True)
203
-
204
- # in case the sample_ids provided by the user are not present in the dataset_ids provided.
205
- if clinical_tags.empty or clinical_tags.shape[0] < len(sample_ids):
206
- warnings.warn(
207
- "The output is empty or has missing sample ids because they are not present in given datasets."
208
- )
209
-
210
- # return sample level tags here
211
- return clinical_tags
212
- # if no sample_ids were passed by the user, then
213
- # returning dataset level tags by removing sample id and removing duplicate columns
214
- return (
215
- clinical_tags.drop(columns=["sample_id"])
216
- .drop_duplicates()
217
- .reset_index(drop=True)
218
- )
77
+ # def _fetch_metadata_from_cohort(self, repo_name: str, dataset_ids: List[str]):
78
+ # """
79
+ # Utility function for fetching metadata using cohorts.
80
+
81
+ # Arguments:
82
+ # repo_name (str) : name of the repository for fetching datasets.
83
+ # dataset_ids (List[str]): dataset ids to be used for inference
84
+
85
+ # Returns:
86
+ # Returns sample metadata, dataset and sample ids.
87
+ # """
88
+ # sample_metadata = {}
89
+ # dataset_to_sample_id = {"dataset_id": [], "sample_id": []}
90
+
91
+ # if not (os.path.isdir(CURATION_COHORT_CACHE)):
92
+ # os.mkdir(CURATION_COHORT_CACHE)
93
+ # else:
94
+ # shutil.rmtree(CURATION_COHORT_CACHE)
95
+ # os.mkdir(CURATION_COHORT_CACHE)
96
+
97
+ # self.cohort.create_cohort(
98
+ # CURATION_COHORT_CACHE, "sample_metadata_query", "desc"
99
+ # )
100
+
101
+ # # Fetch metadata using cohorts
102
+ # for dataset_id in dataset_ids:
103
+ # datasets_sample_metadata = []
104
+
105
+ # if not (
106
+ # repo_name in self.cohort_constants
107
+ # and self.cohort_constants[repo_name]["file_structure"] != "multiple"
108
+ # ):
109
+ # # multiple mapped repo such as GEO
110
+ # self.cohort.add_to_cohort(repo_name, dataset_id=dataset_id)
111
+ # else:
112
+ # # for single mapped repos such as TCGA
113
+ # self.cohort.add_to_cohort(repo_name, dataset_id=[dataset_id])
114
+
115
+ # col_metadata = self.cohort.merge_data("sample")
116
+ # all_sample_ids = col_metadata.index.tolist()
117
+
118
+ # col_metadata.loc[:, "dataset_id"] = dataset_id
119
+ # dataset_to_sample_id["dataset_id"] += [dataset_id] * len(all_sample_ids)
120
+
121
+ # col_metadata.loc[:, "sample_id"] = all_sample_ids
122
+ # dataset_to_sample_id["sample_id"] += all_sample_ids
123
+
124
+ # datasets_sample_metadata += list(col_metadata.T.to_dict().values())
125
+
126
+ # if not (
127
+ # repo_name in self.cohort_constants
128
+ # and self.cohort_constants[repo_name]["file_structure"] != "multiple"
129
+ # ):
130
+ # self.cohort.remove_from_cohort(dataset_id)
131
+ # else:
132
+ # self.cohort.remove_from_cohort([dataset_id])
133
+
134
+ # sample_metadata[dataset_id] = datasets_sample_metadata
135
+
136
+ # dataset_to_sample_id = pd.DataFrame.from_dict(dataset_to_sample_id)
137
+
138
+ # return sample_metadata, dataset_to_sample_id
139
+
140
+ # def _clinical_model_param_checks(
141
+ # self,
142
+ # repo_name: str,
143
+ # dataset_ids: List[str],
144
+ # sample_ids: Optional[List[str]] = None,
145
+ # ):
146
+ # """
147
+ # Checking the parameter passed to the clinical label assigning model.
148
+
149
+ # Arguments:
150
+ # repo_name (str): repo name
151
+ # dataset_ids (list[str]): list of dataset ids
152
+
153
+ # Keyword Arguments:
154
+ # sample_ids (list[str], optional): Optional Parameter. List of sample ids.
155
+ # Default is 'None'.
156
+
157
+ # Raises:
158
+ # paramException
159
+ # """
160
+ # if dataset_ids is None or type(dataset_ids) is not list:
161
+ # raise paramException(
162
+ # title="Param Exception",
163
+ # detail="Dataset IDs should be given as a valid list of strings",
164
+ # )
165
+
166
+ # if sample_ids is not None and type(sample_ids) is not list:
167
+ # raise paramException(
168
+ # title="Param Exception",
169
+ # detail="Sample IDs should be given as a valid list of strings",
170
+ # )
171
+
172
+ # if repo_name != "geo" and not any(
173
+ # ["GSE" in dataset_id for dataset_id in dataset_ids]
174
+ # ):
175
+ # warnings.warn(
176
+ # "The model is tested with GEO metadata and the labels may be wrong for other repos"
177
+ # )
178
+
179
+ # def _post_process_clinical_tags(
180
+ # self,
181
+ # clinical_tags: pd.DataFrame,
182
+ # is_sample_tag: bool,
183
+ # sample_ids: Optional[List[str]] = None,
184
+ # ) -> pd.DataFrame:
185
+ # """
186
+ # process the response of the model (dataframe with clinical tags and samples)
187
+ # and return relevant feilds.
188
+ # incase no sample_ids are provided by the user, we return the dataset_ids and the clinical tags
189
+ # incase sample_ids are also provided, then we return the dataset_ids, the sample_ids and the clincal tags.
190
+
191
+ # Arguments:
192
+ # clinical_tags (pd.DataFrame): dataframe of the sample_ids and assigned clinical tags
193
+ # is_sample_tag (bool): if samples passed
194
+
195
+ # Keyword Arguments:
196
+ # sample_ids (list[str]): list of sample ids (default: {None})
197
+
198
+ # Returns:
199
+ # a dataframe with the the dataset_ids, sample_ids and the assigned clinical tags
200
+ # """
201
+ # if is_sample_tag:
202
+ # # if the user has provided list of samples, then we filter in just those sample ids
203
+ # # for the dataset ids.
204
+ # # taking only those clinical tags and samples where the sample_ids are in the sample_id list
205
+ # # provided by the user.
206
+ # clinical_tags = clinical_tags[
207
+ # clinical_tags["sample_id"].isin(sample_ids)
208
+ # ].reset_index(drop=True)
209
+
210
+ # # in case the sample_ids provided by the user are not present in the dataset_ids provided.
211
+ # if clinical_tags.empty or clinical_tags.shape[0] < len(sample_ids):
212
+ # warnings.warn(
213
+ # "The output is empty or has missing sample ids because they are not present in given datasets."
214
+ # )
215
+
216
+ # # return sample level tags here
217
+ # return clinical_tags
218
+ # # if no sample_ids were passed by the user, then
219
+ # # returning dataset level tags by removing sample id and removing duplicate columns
220
+ # return (
221
+ # clinical_tags.drop(columns=["sample_id"])
222
+ # .drop_duplicates()
223
+ # .reset_index(drop=True)
224
+ # )
219
225
 
220
226
  def _handle_perform_inference_api_error(self, response):
221
227
  if response.status_code == http_codes.UNAUTHORIZED:
@@ -483,83 +489,83 @@ class Curation:
483
489
  sample_metadata["control_prob"] = output["control_prob"].values
484
490
  return sample_metadata
485
491
 
486
- @Track.track_decorator
487
- def assign_clinical_labels(
488
- self,
489
- repo_name: str,
490
- dataset_ids: List[str],
491
- sample_ids: Optional[List[str]] = None,
492
- ) -> pd.DataFrame:
493
- """
494
- Returns a list of clinical or non clinical labels for the given datasets or samples.
495
-
496
- Arguments:
497
- repo_name (str): name of the repository for fetching datasets.
498
- dataset_ids (List[str]): dataset ids to be used for inference
499
-
500
- Keyword Arguments:
501
- sample_ids (List[str], optional): Optional Parameter. Sample ids if that is needed.
502
-
503
- Raises:
504
- RequestException: API response exception
505
- ParamException: Invalid parameters
506
- err
507
-
508
- Returns:
509
- dataframe which is a list of clinical tags for given ids
510
- """
511
- warnings.formatwarning = lambda msg, *args, **kwargs: f"WARNING: {msg}\n"
512
-
513
- try:
514
- self._clinical_model_param_checks(repo_name, dataset_ids, sample_ids)
515
- # evaluating the inference level based on if the user has provided sample_ids
516
- is_sample_tag = sample_ids is not None
517
- inference_level = "sample_id" if (is_sample_tag) else "dataset_id"
518
-
519
- sample_metadata, dataset_to_sample_id = self._fetch_metadata_from_cohort(
520
- repo_name=repo_name, dataset_ids=dataset_ids
521
- )
522
-
523
- clinical_model_predictions = []
524
-
525
- for dataset_id in sample_metadata:
526
- # Get output from model endpoint and structure output
527
- payload = {
528
- "sample_metadata": sample_metadata[dataset_id],
529
- "sample_id_column": "sample_id",
530
- "dataset_id_column": "dataset_id",
531
- "is_sample_tag": is_sample_tag,
532
- }
533
-
534
- output = self._perform_inference("clinical-classifier", payload)
535
- if "errors" in output:
536
- title, detail = self._handle_errors(output)
537
- raise RequestException(title, detail)
538
-
539
- output = output["clinical_predictions"]
540
-
541
- clinical_model_predictions += output
542
-
543
- # creating dataframe with inference_level and clinical_tags with values from the clinical_model_predictions
544
- clinical_tags = pd.DataFrame(
545
- {
546
- inference_level: [
547
- tag["tag_id"] for tag in clinical_model_predictions
548
- ],
549
- "clinical_tag": [
550
- tag["clinical_tag"] for tag in clinical_model_predictions
551
- ],
552
- }
553
- )
554
-
555
- clinical_tags = pd.merge(
556
- dataset_to_sample_id, clinical_tags, on=inference_level
557
- )
558
-
559
- clinical_tags = self._post_process_clinical_tags(
560
- clinical_tags, is_sample_tag, sample_ids
561
- )
562
- except Exception as err:
563
- raise err
564
-
565
- return clinical_tags
492
+ # @Track.track_decorator
493
+ # def assign_clinical_labels(
494
+ # self,
495
+ # repo_name: str,
496
+ # dataset_ids: List[str],
497
+ # sample_ids: Optional[List[str]] = None,
498
+ # ) -> pd.DataFrame:
499
+ # """
500
+ # Returns a list of clinical or non clinical labels for the given datasets or samples.
501
+
502
+ # Arguments:
503
+ # repo_name (str): name of the repository for fetching datasets.
504
+ # dataset_ids (List[str]): dataset ids to be used for inference
505
+
506
+ # Keyword Arguments:
507
+ # sample_ids (List[str], optional): Optional Parameter. Sample ids if that is needed.
508
+
509
+ # Raises:
510
+ # RequestException: API response exception
511
+ # ParamException: Invalid parameters
512
+ # err
513
+
514
+ # Returns:
515
+ # dataframe which is a list of clinical tags for given ids
516
+ # """
517
+ # warnings.formatwarning = lambda msg, *args, **kwargs: f"WARNING: {msg}\n"
518
+
519
+ # try:
520
+ # self._clinical_model_param_checks(repo_name, dataset_ids, sample_ids)
521
+ # # evaluating the inference level based on if the user has provided sample_ids
522
+ # is_sample_tag = sample_ids is not None
523
+ # inference_level = "sample_id" if (is_sample_tag) else "dataset_id"
524
+
525
+ # sample_metadata, dataset_to_sample_id = self._fetch_metadata_from_cohort(
526
+ # repo_name=repo_name, dataset_ids=dataset_ids
527
+ # )
528
+
529
+ # clinical_model_predictions = []
530
+
531
+ # for dataset_id in sample_metadata:
532
+ # # Get output from model endpoint and structure output
533
+ # payload = {
534
+ # "sample_metadata": sample_metadata[dataset_id],
535
+ # "sample_id_column": "sample_id",
536
+ # "dataset_id_column": "dataset_id",
537
+ # "is_sample_tag": is_sample_tag,
538
+ # }
539
+
540
+ # output = self._perform_inference("clinical-classifier", payload)
541
+ # if "errors" in output:
542
+ # title, detail = self._handle_errors(output)
543
+ # raise RequestException(title, detail)
544
+
545
+ # output = output["clinical_predictions"]
546
+
547
+ # clinical_model_predictions += output
548
+
549
+ # # creating dataframe with inference_level and clinical_tags with values from the clinical_model_predictions
550
+ # clinical_tags = pd.DataFrame(
551
+ # {
552
+ # inference_level: [
553
+ # tag["tag_id"] for tag in clinical_model_predictions
554
+ # ],
555
+ # "clinical_tag": [
556
+ # tag["clinical_tag"] for tag in clinical_model_predictions
557
+ # ],
558
+ # }
559
+ # )
560
+
561
+ # clinical_tags = pd.merge(
562
+ # dataset_to_sample_id, clinical_tags, on=inference_level
563
+ # )
564
+
565
+ # clinical_tags = self._post_process_clinical_tags(
566
+ # clinical_tags, is_sample_tag, sample_ids
567
+ # )
568
+ # except Exception as err:
569
+ # raise err
570
+
571
+ # return clinical_tags