PyPI - genal-python - Versions diffs - 0.6__tar.gz → 0.7__tar.gz - Mend

genal-python 0.6.tar.gz → 0.7.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (95) hide show

{genal_python-0.6 → genal_python-0.7}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: genal-python
-Version: 0.6
+Version: 0.7
 Summary: A python toolkit for polygenic risk scoring and mendelian randomization.
 Author-email: Cyprien Rivier <riviercyprien@gmail.com>
 Requires-Python: >=3.7
@@ -8,16 +8,19 @@ Description-Content-Type: text/markdown
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
 Classifier: Operating System :: OS Independent
-Requires-Dist: numpy>=1.26.2
+Requires-Dist: aiohttp==3.9.5
+Requires-Dist: nest_asyncio==1.5.5
+Requires-Dist: numpy>=1.24.4, <2.0
 Requires-Dist: pandas>=2.0.3
-Requires-Dist: plotnine>=0.12.3
-Requires-Dist: psutil>=5.9.1
-Requires-Dist: pyliftover>=0.4
+Requires-Dist: plotnine==0.12.3
+Requires-Dist: psutil==5.9.1
+Requires-Dist: pyliftover==0.4
 Requires-Dist: scikit_learn>=1.3.0
-Requires-Dist: scipy>=1.11.3
-Requires-Dist: statsmodels>=0.14.0
-Requires-Dist: tqdm>=4.66.1
-Requires-Dist: wget>=3.2
+Requires-Dist: scipy>=1.11.4
+Requires-Dist: sphinx_rtd_theme==1.3.0
+Requires-Dist: statsmodels==0.14.0
+Requires-Dist: tqdm==4.66.1
+Requires-Dist: wget==3.2
 Project-URL: Home, https://github.com/CypRiv/genal
 <center><h1> genal: A Python Toolkit for Genetic Risk Scoring and Mendelian Randomization </h1></center>
@@ -479,7 +482,7 @@ As expected, many MR methods indicate that SBP is strongly associated with strok
 To investigate horizontal pleiotropy in more detail, a very useful method is Mendelian Randomization Pleiotropy RESidual Sum and Outlier (MR-PRESSO), which is designed to detect and correct for horizontal pleiotropy. It identifies which instruments are likely to be pleiotropic in their effect on the outcome, and it reruns an inverse-variance weighted MR after excluding them. It can be run using the `genal.Geno.MRpresso` method:
 ```python
-SBP_clumped.MRpresso(action = 2, n_iterations = 30000)
+mod_table, GlobalTest, OutlierTest, BiasTest = SBP_clumped.MRpresso(action = 2, n_iterations = 30000)
 ```
 As with the `genal.Geno.MR` method, the `action` argument determines how the pleiotropic SNPs will be treated. The output is a tuple containing:

{genal_python-0.6 → genal_python-0.7}/README.md RENAMED Viewed

@@ -457,7 +457,7 @@ As expected, many MR methods indicate that SBP is strongly associated with strok
 To investigate horizontal pleiotropy in more detail, a very useful method is Mendelian Randomization Pleiotropy RESidual Sum and Outlier (MR-PRESSO), which is designed to detect and correct for horizontal pleiotropy. It identifies which instruments are likely to be pleiotropic in their effect on the outcome, and it reruns an inverse-variance weighted MR after excluding them. It can be run using the `genal.Geno.MRpresso` method:
 ```python
-SBP_clumped.MRpresso(action = 2, n_iterations = 30000)
+mod_table, GlobalTest, OutlierTest, BiasTest = SBP_clumped.MRpresso(action = 2, n_iterations = 30000)
 ```
 As with the `genal.Geno.MR` method, the `action` argument determines how the pleiotropic SNPs will be treated. The output is a tuple containing:

{genal_python-0.6 → genal_python-0.7}/genal/Geno.py RENAMED Viewed

@@ -33,6 +33,7 @@ from .geno_tools import (
 )
 from .association import set_phenotype_func, association_test_func
 from .extract_prs import extract_snps_func, prs_func
+from .snp_query import async_query_gwas_catalog
 from .constants import STANDARD_COLUMNS, REF_PANEL_COLUMNS, CHECKS_DICT, MR_METHODS_NAMES
 # Do all the MR steps (query_outcome, harmonize etc) based on CHR/POS and not SNPs
@@ -40,7 +41,6 @@ from .constants import STANDARD_COLUMNS, REF_PANEL_COLUMNS, CHECKS_DICT, MR_METH
 # Get proxies (simply return a list of proxies)
 # Multi-MR with python MR
 # Warning that users might not have shell (for the .ram attribute)
-# Phenoscanner
@@ -64,6 +64,7 @@ class Geno:
         name (str): ID of the object (for internal reference and debugging purposes).
         reference_panel (pd.DataFrame): Reference population SNP data used for SNP info
             adjustments. Initialized when first needed.
+        reference_panel_name (str): string to identify the reference_panel (path or population string)
     Methods:
         preprocess_data():
@@ -313,24 +314,27 @@ class Geno:
         Raises:
             ValueError: If the provided DataFrame doesn't have the necessary columns.
         """
-        # Check if the object already has a reference panel set
-        if not hasattr(self, "reference_panel"):
+        # Check if the user provided a dataframe
+        if isinstance(reference_panel, pd.DataFrame):
             # If the provided reference_panel is a DataFrame, verify its structure and dtypes
-            if isinstance(reference_panel, pd.DataFrame):
-                for col in REF_PANEL_COLUMNS:
-                    if col not in reference_panel.columns:
-                        raise ValueError(
-                            f"The {col} column is not present in the reference_panel provided and is necessary."
-                        )
+            for col in REF_PANEL_COLUMNS:
+                if col not in reference_panel.columns:
+                    raise ValueError(
+                        f"The {col} column is not present in the reference_panel provided and is necessary."
+                    )
-                print(
-                    "Using the provided reference_panel dataframe as the reference panel."
-                )
-                self.reference_panel = reference_panel.copy()
-            else:
-                # Load the reference panel based on the provided string identifier
-                self.reference_panel = load_reference_panel(reference_panel)
+            print(
+                "Using the provided reference_panel dataframe as the reference panel."
+            )
+            self.reference_panel = reference_panel.copy()
+            self.reference_panel_name = "USER_PROVIDED"
+        # Else, check if there is already a reference_panel with the same ID. If not, load it based on provided string
+        elif not (hasattr(self, "reference_panel") and
+                  hasattr(self, "reference_panel_name") and
+                  self.reference_panel_name==reference_panel):
+            self.reference_panel = load_reference_panel(reference_panel)
+            self.reference_panel_name = reference_panel
         return self.reference_panel
@@ -1067,7 +1071,7 @@ class Geno:
             cpus (int, optional): number of cpu cores to be used for the parallel random data generation.
         Returns:
-            list: Contains the following elements:
+            tuple: Contains the following elements:
                 - mod_table: DataFrame containing the original (before outlier removal)
                              and outlier-corrected (after outlier removal) inverse variance-weighted MR results.
                 - GlobalTest: p-value of the global MR-PRESSO test indicating the presence of horizontal pleiotropy.
@@ -1163,6 +1167,58 @@ class Geno:
             )
         return data
+    def query_gwas_catalog(
+        self,
+        p_threshold=5e-8,
+        return_p=False,
+        return_study=False,
+        replace=True):
+        """
+        Queries the GWAS Catalog Rest API and add an "ASSOC" column containing associated traits for each SNP.
+        Args:
+            p_threshold (float, optional): Only associations that are at least as significant are reported. Default is 5e-8.
+            return_p (bool, optional): If True, include the p-value in the results. Default is False.
+            return_study (bool, optional): If True, include the ID of the study from which the association is derived in the results. Default is False.
+            replace (bool, optional): If True, updates the data attribute in place. Default is True.
+        Returns:
+            pd.DataFrame: Data attribute with an additional column "ASSOC".
+                The elements of this column are lists of strings or tuples depending on the `return_p` and `return_study` flags. If the SNP could not be queried, the value is set to "FAILED_QUERY".
+        """
+        # Ensure mandatory column is present in the input data
+        if "SNP" not in self.data.columns:
+            raise ValueError(f"The SNP column is necessary for the GWAS query!")
+        # Select appropriate data or copy of data depending on replace argument
+        if not replace:
+            data = self.data.copy()
+        else:
+            data = self.data
+        print(
+            f"Querying the GWAS Catalog and creating the ASSOC column. "
+            f"Only associations with a p-value <= {p_threshold} are reported. Use the p_threshold argument to change the threshold. "
+            f"To report the p-value of each association, use return_p=True. To report the study ID of the association, use return_study=True. "
+            f"The .data attribute will {'be' if replace else 'not be'} modified. "
+            f"{'Use replace=False to leave it as is.' if replace else ''}"
+        )
+        # Call the async function to query all SNPs
+        results_snps, errors = async_query_gwas_catalog(
+            data.SNP.to_list(),
+            p_threshold=p_threshold,
+            return_p=return_p,
+            return_study=return_study)
+        # Create the column
+        data["ASSOC"] = data['SNP'].map(results_snps).fillna("FAILED_QUERY")
+        print("The ASSOC column has been successfully created.")
+        return data, errors
     def standardize(self):
         """

{genal_python-0.6 → genal_python-0.7}/genal/__init__.py RENAMED Viewed

@@ -3,7 +3,7 @@ import json
 from .tools import default_config, write_config, set_plink, delete_tmp, get_reference_panel_path
 from .geno_tools import Combine_Geno
-__version__ = "0.6"
+__version__ = "0.7"
 config_dir = os.path.expanduser(
     "~/.genal/"

{genal_python-0.6 → genal_python-0.7}/genal/geno_tools.py RENAMED Viewed

@@ -213,9 +213,11 @@ def fill_snpids_func(data, reference_panel_df):
             + ":"
             + data.loc[missing_snp_condition, "POS"].astype(str)
             + ":"
+            + data.loc[missing_snp_condition, "NEA"].astype(str)
+            + ":"
             + data.loc[missing_snp_condition, "EA"].astype(str)
         )
-        print_statement = f" and their ID set to CHR:POS:EA"
+        print_statement = f" and their ID set to CHR:POS:NEA:EA"
     perc_missing = n_missing / data.shape[0] * 100
@@ -239,7 +241,8 @@ def fill_snpids_func(data, reference_panel_df):
 def check_int_column(data, int_col):
     """Set the type of the int_col column to Int32 and non-numeric values to NA."""
     nrows = data.shape[0]
-    data[int_col] = pd.to_numeric(data[int_col], errors="coerce")
+    if not pd.api.types.is_integer_dtype(data[int_col].dtype):
+        data[int_col] = pd.to_numeric(data[int_col].astype(str).str.strip(), errors="coerce")
     data[int_col] = data[int_col].round(0).astype("Int32")
     n_nan = data[int_col].isna().sum()
     if n_nan > 0:

genal_python-0.7/genal/snp_query.py ADDED Viewed

@@ -0,0 +1,86 @@
+import aiohttp
+import asyncio
+import numpy as np
+import nest_asyncio
+from tqdm.asyncio import tqdm_asyncio
+# Using nest_asyncio to allow execution in notebooks
+nest_asyncio.apply()
+# Function to query GWAS Catalog API for SNP associations
+async def query_gwas_catalog_coroutine(snps, p_threshold=5e-8, return_p=False, return_study=False):
+    results_global = {}  # Dictionary storing the SNP (keys) and results for each SNP: a list of single strings or tuples
+    errors = []  # List storing SNP for which the GWAS Catalog could not be queried
+    async def fetch(session, url):
+        async with session.get(url) as response:
+            if response.status == 200:
+                return await response.json()
+            return None
+    async def process_snp(session, snp):
+        #print(f"Processing SNP {snp}")
+        results_snp = []  # List storing the results for each association found for this SNP
+        base_url = f"https://www.ebi.ac.uk/gwas/rest/api/singleNucleotidePolymorphisms/{snp}/associations?projection=associationBySnp"
+        base_data = await fetch(session, base_url)
+        if base_data:
+            # Process each association found for this SNP
+            for assoc in base_data.get('_embedded', {}).get('associations', []):
+                pvalue = assoc.get("pvalue", np.nan)
+                # If the pvalue of the association does not pass the threshold, the association is not processed further nor reported
+                if pvalue < p_threshold:
+                    trait = assoc.get("efoTraits", [])[0].get("trait", "")
+                    # If the return_study flag is active: query the page containing the GWAS Catalog study ID
+                    if return_study:
+                        study_url = assoc.get("_links", {}).get("study", {}).get("href", {})
+                        study_data = await fetch(session, study_url)
+                        study_id = study_data.get("accessionId", "") if study_data else "Not found"
+                    else:
+                        study_id = None
+                    # Return a tuple or a string depending on the return flags
+                    if return_p and return_study:
+                        result_assoc = (trait, "{:.4g}".format(pvalue), study_id)
+                    elif return_p:
+                        result_assoc = (trait, "{:.4g}".format(pvalue))
+                    elif return_study:
+                        result_assoc = (trait, study_id)
+                    else:
+                        result_assoc = trait
+                    results_snp.append(result_assoc)
+                else:
+                    continue
+            # Clean the associations depending on the flag
+            # If the P-value and Study ID are not returned, display each trait only once
+            if not return_p and not return_study:
+                results_snp = list(set(results_snp))
+            # If the P-value must be returned, return each trait once with the lowest p-value
+            elif return_p and not return_study:
+                min_trait = {}
+                for trait, pvalue in results_snp:
+                    if trait not in min_trait or pvalue < min_trait[trait]:
+                        min_trait[trait] = pvalue
+                results_snp = [(trait, min_trait[trait]) for trait in min_trait]
+            results_global[snp] = results_snp
+        else:
+            errors.append(snp)
+    async with aiohttp.ClientSession() as session:
+        tasks = [process_snp(session, snp) for snp in snps]
+        await tqdm_asyncio.gather(*tasks)
+    return results_global, errors
+# Main function to start the event loop and run the asynchronous query
+def async_query_gwas_catalog(snps, p_threshold=5e-8, return_p=False, return_study=False):
+    loop = asyncio.get_event_loop()
+    results_global, errors = loop.run_until_complete(query_gwas_catalog_coroutine(snps, p_threshold, return_p, return_study))
+    return results_global, errors

{genal_python-0.6 → genal_python-0.7}/genal/tools.py RENAMED Viewed

@@ -98,7 +98,7 @@ def set_reference_folder(path=""):
         None: The function prints messages to inform the user of the status and any errors.
     """
-    # If no path is provided, set default path to 'tmp_GENAL' in the current directory
+    # If no path is provided, set default path to root/.genal/Reference_files
     if not path:
         path = default_ref_path
         print(f"No path provided, defaulting to {default_ref_path}.")
@@ -188,9 +188,9 @@ def get_reference_panel_path(reference_panel="eur"):
             print(
                 "If you have already downloaded it, use genal.set_reference_folder(path) to avoid downloading again."
             )
-            url = f"https://storage.googleapis.com/genal_files/1kg.v3.tgz"
+            url = f"https://storage.googleapis.com/genal_files/reference_panels.tgz"
             try:
-                wget.download(url, out=os.path.join(ref_path, "1kg.v3.tgz"))
+                wget.download(url, out=os.path.join(ref_path, "reference_panels.tgz"))
             except Exception as e:
                 print(f"Download unsuccessful: {e}")
                 print(
@@ -199,7 +199,7 @@ def get_reference_panel_path(reference_panel="eur"):
                 raise FileNotFoundError(f"Reference panel {reference_panel} not found.")
             print("Download successful. Decompressing...")
-            with tarfile.open(os.path.join(ref_path, "1kg.v3.tgz"), "r:gz") as tar_ref:
+            with tarfile.open(os.path.join(ref_path, "reference_panels.tgz"), "r:gz") as tar_ref:
                 tar_ref.extractall(ref_path)
         else:
             print(f"Using the {ref_panel_name} reference panel.")
@@ -207,7 +207,6 @@ def get_reference_panel_path(reference_panel="eur"):
     return ref_panel_path
-## Need to do the multi option
 def load_reference_panel(reference_panel="eur"):
     """Load the bim file from the reference panel specified."""
@@ -227,7 +226,7 @@ def load_reference_panel(reference_panel="eur"):
     #Load it and return it
     reference_panel_df = pd.read_csv(
-        ref_panel_path + ".bim", sep ="\t", names=["CHR","SNP","F","POS","A1","A2"]
+        ref_panel_path + ".bim", sep="\t", names=["CHR","SNP","F","POS","A1","A2"]
     )
     return reference_panel_df

{genal_python-0.6 → genal_python-0.7}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "flit_core.buildapi"
 [project]
 name = "genal-python"  # Updated name for PyPI
-version = "0.6"
+version = "0.7"
 authors = [{name = "Cyprien Rivier", email = "riviercyprien@gmail.com"}]
 description = "A python toolkit for polygenic risk scoring and mendelian randomization."
 readme = "README.md"
@@ -18,16 +18,19 @@ classifiers = [
 # Dependencies section
 dependencies = [
-    "numpy>=1.26.2",
-    "pandas>=2.0.3",
-    "plotnine>=0.12.3",
-    "psutil>=5.9.1",
-    "pyliftover>=0.4",
-    "scikit_learn>=1.3.0",
-    "scipy>=1.11.3",
-    "statsmodels>=0.14.0",
-    "tqdm>=4.66.1",
-    "wget>=3.2"
+    "aiohttp==3.9.5",
+"nest_asyncio==1.5.5",
+"numpy>=1.24.4, <2.0",
+"pandas>=2.0.3",
+"plotnine==0.12.3",
+"psutil==5.9.1",
+"pyliftover==0.4",
+"scikit_learn>=1.3.0",
+"scipy>=1.11.4",
+"sphinx_rtd_theme==1.3.0",
+"statsmodels==0.14.0",
+"tqdm==4.66.1",
+"wget==3.2"
 ]
 [tool.setuptools.package-dir]

{genal_python-0.6 → genal_python-0.7}/requirements.txt RENAMED Viewed

@@ -1,3 +1,5 @@
+aiohttp==3.9.5
+nest_asyncio==1.5.5
 numpy>=1.24.4, <2.0
 pandas>=2.0.3
 plotnine==0.12.3
@@ -5,8 +7,7 @@ psutil==5.9.1
 pyliftover==0.4
 scikit_learn>=1.3.0
 scipy>=1.11.4
-setuptools==62.3.3
 sphinx_rtd_theme==1.3.0
-statsmodels>=0.14.0
+statsmodels==0.14.0
 tqdm==4.66.1
 wget==3.2