genal-python 1.4.0.tar.gz → 1.4.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. {genal_python-1.4.0 → genal_python-1.4.2}/.gitignore +3 -1
  2. {genal_python-1.4.0 → genal_python-1.4.2}/PKG-INFO +1 -1
  3. {genal_python-1.4.0 → genal_python-1.4.2}/docs/source/api.rst +1 -1
  4. {genal_python-1.4.0 → genal_python-1.4.2}/genal/Geno.py +53 -2
  5. {genal_python-1.4.0 → genal_python-1.4.2}/genal/__init__.py +3 -2
  6. genal_python-1.4.2/genal/genes.py +125 -0
  7. {genal_python-1.4.0 → genal_python-1.4.2}/genal/geno_tools.py +112 -146
  8. {genal_python-1.4.0 → genal_python-1.4.2}/pyproject.toml +1 -1
  9. genal_python-1.4.0/REVIEW_REPORT.md +0 -63
  10. {genal_python-1.4.0 → genal_python-1.4.2}/.DS_Store +0 -0
  11. {genal_python-1.4.0 → genal_python-1.4.2}/.readthedocs.yaml +0 -0
  12. {genal_python-1.4.0 → genal_python-1.4.2}/Genal_flowchart.png +0 -0
  13. {genal_python-1.4.0 → genal_python-1.4.2}/LICENSE +0 -0
  14. {genal_python-1.4.0 → genal_python-1.4.2}/README.md +0 -0
  15. {genal_python-1.4.0 → genal_python-1.4.2}/docs/.DS_Store +0 -0
  16. {genal_python-1.4.0 → genal_python-1.4.2}/docs/Makefile +0 -0
  17. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/.DS_Store +0 -0
  18. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/.buildinfo +0 -0
  19. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/.doctrees/api.doctree +0 -0
  20. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/.doctrees/environment.pickle +0 -0
  21. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/.doctrees/genal.doctree +0 -0
  22. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/.doctrees/index.doctree +0 -0
  23. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/.doctrees/introduction.doctree +0 -0
  24. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/.doctrees/modules.doctree +0 -0
  25. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_images/MR_plot_SBP_AS.png +0 -0
  26. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_modules/genal/Geno.html +0 -0
  27. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_modules/genal/MR.html +0 -0
  28. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_modules/genal/MR_tools.html +0 -0
  29. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_modules/genal/MRpresso.html +0 -0
  30. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_modules/genal/association.html +0 -0
  31. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_modules/genal/clump.html +0 -0
  32. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_modules/genal/extract_prs.html +0 -0
  33. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_modules/genal/geno_tools.html +0 -0
  34. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_modules/genal/lift.html +0 -0
  35. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_modules/genal/proxy.html +0 -0
  36. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_modules/genal/snp_query.html +0 -0
  37. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_modules/genal/tools.html +0 -0
  38. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_modules/index.html +0 -0
  39. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_sources/api.rst.txt +0 -0
  40. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_sources/genal.rst.txt +0 -0
  41. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_sources/index.rst.txt +0 -0
  42. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_sources/introduction.rst.txt +0 -0
  43. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_sources/modules.rst.txt +0 -0
  44. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/basic.css +0 -0
  45. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/badge_only.css +0 -0
  46. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/Roboto-Slab-Bold.woff +0 -0
  47. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/Roboto-Slab-Bold.woff2 +0 -0
  48. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/Roboto-Slab-Regular.woff +0 -0
  49. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/Roboto-Slab-Regular.woff2 +0 -0
  50. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/fontawesome-webfont.eot +0 -0
  51. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/fontawesome-webfont.svg +0 -0
  52. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/fontawesome-webfont.ttf +0 -0
  53. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/fontawesome-webfont.woff +0 -0
  54. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/fontawesome-webfont.woff2 +0 -0
  55. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/lato-bold-italic.woff +0 -0
  56. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/lato-bold-italic.woff2 +0 -0
  57. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/lato-bold.woff +0 -0
  58. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/lato-bold.woff2 +0 -0
  59. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/lato-normal-italic.woff +0 -0
  60. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/lato-normal-italic.woff2 +0 -0
  61. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/lato-normal.woff +0 -0
  62. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/fonts/lato-normal.woff2 +0 -0
  63. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/css/theme.css +0 -0
  64. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/doctools.js +0 -0
  65. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/documentation_options.js +0 -0
  66. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/file.png +0 -0
  67. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/js/badge_only.js +0 -0
  68. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/js/html5shiv-printshiv.min.js +0 -0
  69. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/js/html5shiv.min.js +0 -0
  70. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/js/theme.js +0 -0
  71. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/language_data.js +0 -0
  72. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/minus.png +0 -0
  73. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/plus.png +0 -0
  74. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/pygments.css +0 -0
  75. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/searchtools.js +0 -0
  76. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/_static/sphinx_highlight.js +0 -0
  77. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/api.html +0 -0
  78. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/genal.html +0 -0
  79. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/genindex.html +0 -0
  80. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/index.html +0 -0
  81. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/introduction.html +0 -0
  82. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/modules.html +0 -0
  83. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/objects.inv +0 -0
  84. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/py-modindex.html +0 -0
  85. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/search.html +0 -0
  86. {genal_python-1.4.0 → genal_python-1.4.2}/docs/build/searchindex.js +0 -0
  87. {genal_python-1.4.0 → genal_python-1.4.2}/docs/make.bat +0 -0
  88. {genal_python-1.4.0 → genal_python-1.4.2}/docs/requirements.txt +0 -0
  89. {genal_python-1.4.0 → genal_python-1.4.2}/docs/source/.DS_Store +0 -0
  90. {genal_python-1.4.0 → genal_python-1.4.2}/docs/source/Images/Genal_flowchart.png +0 -0
  91. {genal_python-1.4.0 → genal_python-1.4.2}/docs/source/Images/MR_plot_SBP_AS.png +0 -0
  92. {genal_python-1.4.0 → genal_python-1.4.2}/docs/source/Images/genal_logo.png +0 -0
  93. {genal_python-1.4.0 → genal_python-1.4.2}/docs/source/conf.py +0 -0
  94. {genal_python-1.4.0 → genal_python-1.4.2}/docs/source/index.rst +0 -0
  95. {genal_python-1.4.0 → genal_python-1.4.2}/docs/source/introduction.rst +0 -0
  96. {genal_python-1.4.0 → genal_python-1.4.2}/docs/source/modules.rst +0 -0
  97. {genal_python-1.4.0 → genal_python-1.4.2}/genal/MR.py +0 -0
  98. {genal_python-1.4.0 → genal_python-1.4.2}/genal/MR_tools.py +0 -0
  99. {genal_python-1.4.0 → genal_python-1.4.2}/genal/MRpresso.py +0 -0
  100. {genal_python-1.4.0 → genal_python-1.4.2}/genal/association.py +0 -0
  101. {genal_python-1.4.0 → genal_python-1.4.2}/genal/clump.py +0 -0
  102. {genal_python-1.4.0 → genal_python-1.4.2}/genal/colocalization.py +0 -0
  103. {genal_python-1.4.0 → genal_python-1.4.2}/genal/constants.py +0 -0
  104. {genal_python-1.4.0 → genal_python-1.4.2}/genal/extract_prs.py +0 -0
  105. {genal_python-1.4.0 → genal_python-1.4.2}/genal/lift.py +0 -0
  106. {genal_python-1.4.0 → genal_python-1.4.2}/genal/proxy.py +0 -0
  107. {genal_python-1.4.0 → genal_python-1.4.2}/genal/snp_query.py +0 -0
  108. {genal_python-1.4.0 → genal_python-1.4.2}/genal/tools.py +0 -0
  109. {genal_python-1.4.0 → genal_python-1.4.2}/genal_logo.png +0 -0
  110. {genal_python-1.4.0 → genal_python-1.4.2}/gitignore +0 -0
  111. {genal_python-1.4.0 → genal_python-1.4.2}/readthedocs.yaml +0 -0
@@ -6,4 +6,6 @@ genal/.ipynb_checkpoints/
  test_data/
  cursor/
  tests/
- tmp_GENAL/
+ tmp_GENAL/
+ REVIEW_REPORT.md
+ TASKS.md
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: genal-python
- Version: 1.4.0
+ Version: 1.4.2
  Summary: A python toolkit for polygenic risk scoring and mendelian randomization.
  Author-email: Cyprien Rivier <riviercyprien@gmail.com>
  Requires-Python: >=3.8
@@ -6,7 +6,7 @@ genal.GENO class
  -----------------

  .. autoclass:: genal.Geno
-    :members:
+    :members: __init__, preprocess_data, get_reference_panel, clump, update_snpids, extract_snps, prs, set_phenotype, association_test, query_outcome, MR, MR_plot, MR_forest, MRpresso, filter_by_gene, colocalize, lift, query_gwas_catalog, standardize_betas, update_eaf, sort_group, copy, save
     :undoc-members:
     :show-inheritance:

@@ -15,7 +15,13 @@ from .proxy import find_proxies, apply_proxies
  from .MR_tools import query_outcome_func, MR_func, mrpresso_func
  from .clump import clump_data_plink2
  from .lift import lift_data
- from .tools import create_tmp, load_reference_panel, setup_genetic_path, check_reference_panel
+ from .genes import filter_by_gene_func
+ from .tools import (
+     create_tmp,
+     load_reference_panel,
+     setup_genetic_path,
+     check_reference_panel
+ )
  from .geno_tools import (
      save_data,
      check_arguments,
@@ -31,7 +37,7 @@ from .geno_tools import (
      check_allele_column,
      check_snp_column,
      remove_na,
-     filter_by_gene_func
+     update_eaf_func,
  )
  from .association import set_phenotype_func, association_test_func_plink2
  from .extract_prs import extract_snps_func, prs_func
@@ -47,6 +53,7 @@ from .colocalization import coloc_abf_func
  # Check stability with variants on sexual chromosomes
  # Check the build of user data (potentially with a list of SNPs with different positions)
  # update_snpids function: take alleles into account during the merge if they are present in the user data
+ # Consider how update_snpids could be replaced by the extract range function in plink2 (to gain speed)


  class Geno:
@@ -1596,7 +1603,51 @@ class Geno:
          data['SE'] = data['SE'] / sd
          return data

+     def update_eaf(self, reference_panel="EUR_37", replace=False, fill=True):
+         """
+         Update or create the EAF (Effect Allele Frequency) column using a reference panel.
+
+         This method calculates allele frequencies from a specified reference panel using PLINK
+         and updates the 'EAF' column for SNPs in the dataset. It matches SNPs based on
+         CHR/POS or SNP ID and considers the effect allele ('EA') to assign the correct EAF.
+
+         Args:
+             reference_panel (str, optional): The reference panel to use for deriving EAF.
+                 Can be a standard name (e.g., "EUR_37", "AFR_38") or a path to a
+                 custom PLINK fileset (bed/bim/fam or pgen/pvar/psam).
+                 Defaults to "EUR_37".
+             replace (bool, optional): If True, modifies the instance's `data` attribute
+                 in place. If False (default), operates on a copy.
+             fill (bool, optional): If `True` (default), existing `EAF` values for SNPs not
+                 found in the reference panel will be preserved. If `False`, `EAF` values
+                 for unmatched SNPs will be set to `NaN`.
+
+         Returns:
+             pd.DataFrame: A new DataFrame with the updated 'EAF' column.

+         Raises:
+             ValueError: If the required columns ('EA' and either 'SNP' or 'CHR'/'POS')
+                 are not present in the data.
+         """
+
+         if 'EA' not in self.data.columns:
+             raise ValueError("The 'EA' column is required to update EAF.")
+         if not ('SNP' in self.data.columns or ('CHR' in self.data.columns and 'POS' in self.data.columns)):
+             raise ValueError("Either 'SNP' or both 'CHR' and 'POS' columns are required.")
+
+         data = self.data if replace else self.data.copy()
+
+         data_updated = update_eaf_func(
+             data=data,
+             reference_panel=reference_panel,
+             object_name=self.name,
+             ram=self.ram,
+             fill=fill
+         )
+
+         if replace:
+             self.data = data_updated
+         return data_updated
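The `fill` rule can be illustrated on its own with plain pandas. `merge_eaf` below is a hypothetical helper (not part of genal) that mirrors the keep-or-reset behavior described in the docstring, with `eaf_new` standing in for the panel-derived frequencies:

```python
import numpy as np
import pandas as pd

def merge_eaf(eaf_old, eaf_new, fill=True):
    """Combine existing EAF values with panel-derived ones.

    fill=True keeps the previous EAF for SNPs missing from the panel;
    fill=False resets those SNPs to NaN.
    """
    eaf_old = pd.Series(eaf_old, dtype="float64")
    eaf_new = pd.Series(eaf_new, dtype="float64")
    fallback = eaf_old if fill else np.nan
    return pd.Series(np.where(eaf_new.notna(), eaf_new, fallback))

# SNP 1 is found in the panel, SNP 2 is not
merge_eaf([0.10, 0.20], [0.35, np.nan], fill=True)   # 0.35, 0.20
merge_eaf([0.10, 0.20], [0.35, np.nan], fill=False)  # 0.35, NaN
```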
@@ -1,10 +1,11 @@
  import os
  import json
  from .tools import default_config, write_config, set_plink, install_plink, delete_tmp, get_reference_panel_path, get_plink_path
- from .geno_tools import Combine_Geno, filter_by_gene_func
+ from .geno_tools import Combine_Geno
+ from .genes import filter_by_gene_func
  from .constants import CONFIG_DIR

- __version__ = "1.4.0"
+ __version__ = "1.4.2"

  config_path = os.path.join(CONFIG_DIR, "config.json")

@@ -0,0 +1,125 @@
+ import pandas as pd
+ import numpy as np
+ import os
+ import wget
+
+ from .constants import BUCKET_URL
+ from .tools import read_config
+
+
+ def filter_by_gene_func(data, gene_identifier, id_type="symbol", window_size=1000000, build="37"):
+     """
+     Filter the data to include only variants that are within a specified distance of a specific gene.
+     Corresponds to the :meth:`Geno.filter_by_gene` method.
+
+     Args:
+         data (pd.DataFrame): Input data with at least 'CHR' and 'POS' columns.
+         gene_identifier (str): Identifier for the gene/protein to filter variants around.
+         id_type (str, optional): Type of identifier provided. Options are:
+             - "symbol": Gene symbol (e.g., "APOE")
+             - "HGNC": HGNC ID (e.g., "HGNC:613")
+             - "name": Full gene name (e.g., "apolipoprotein E")
+             - "Ensembl": Ensembl gene ID (e.g., "ENSG00000130203")
+             - "NCBI": NCBI gene ID (e.g., "348")
+             - "UCSC": UCSC gene ID (e.g., "uc001hbu.2")
+             - "Vega": Vega gene ID (e.g., "OTTHUMG00000019505")
+             Default is "symbol".
+         window_size (int, optional): Size of the window around the gene in base pairs. Default is 1,000,000 (1Mb).
+         build (str, optional): Genome build of the data. Default is "37".
+
+     Returns:
+         pd.DataFrame: Filtered DataFrame containing only variants within the specified window
+             around the gene, with additional column 'Distance'.
+
+     Notes:
+         - Distance is calculated from the nearest gene boundary (start or end position)
+         - A distance of 0 indicates the variant is within the gene
+     """
+
+     # Validate id_type
+     valid_id_types = ["symbol", "HGNC_id", "name", "gene_id", "NCBI_id", "UCSC_id", "Vega_id"]
+     if id_type in ["HGNC", "NCBI", "UCSC", "Vega"]:
+         id_type = id_type + "_id"
+     if id_type == "Ensembl":
+         id_type = "gene_id"
+     if id_type not in valid_id_types:
+         raise ValueError(f"Invalid id_type. Must be one of: {', '.join(valid_id_types)}")
+
+     # Validate build
+     if int(build) not in [37, 38]:
+         raise ValueError("Invalid build. Must be one of: 37, 38")
+
+     # Download the gene info file if not already present in the reference folder
+     config = read_config()
+     ref_path = config["paths"]["ref_path"]
+     gene_info_file = os.path.join(ref_path, "gene_id_mapping_filtered.parquet")
+     if not os.path.exists(gene_info_file):
+         # Download parquet file
+         print(f"Downloading gene info file to {gene_info_file}...")
+         url = BUCKET_URL + "gene_id_mapping_filtered.parquet"
+         try:
+             wget.download(url, gene_info_file)
+             print("\nDownload complete.")
+         except Exception as e:
+             if os.path.exists(gene_info_file):
+                 os.remove(gene_info_file)
+             raise RuntimeError(f"Failed to download gene info: {e}")
+
+     df_gene_info = pd.read_parquet(gene_info_file, engine="pyarrow")
+
+     # Find gene coordinates
+     gene_data = df_gene_info[df_gene_info[id_type] == gene_identifier]
+
+     if gene_data.empty:
+         raise ValueError(f"Gene with {id_type}='{gene_identifier}' not found in gene info database.")
+
+     if len(gene_data) > 1:
+         print(f"Warning: Multiple entries found for {id_type}='{gene_identifier}'. Using the first entry.")
+     gene_data = gene_data.iloc[0, :]
+
+     print(f"Filtering variants within {window_size}bp window based on genome build {build} around gene: {', '.join(f'{col}: {gene_data[col]}' for col in valid_id_types)}")
+
+     # Extract gene location information
+     chrom = gene_data['CHR']
+     # Convert to integer if possible
+     if str(chrom).isdigit():
+         chrom = int(chrom)
+     elif chrom == "X":
+         chrom = 23
+     else:
+         raise ValueError(f"Gene {gene_identifier} is located on chromosome {chrom}, which is not supported.")
+
+     gene_start = int(gene_data[f'gene_start_{build}'])
+     gene_end = int(gene_data[f'gene_end_{build}'])
+
+     # Define the window boundaries
+     window_start = max(0, gene_start - window_size/2)
+     window_end = gene_end + window_size/2
+
+     # Filter variants within the window
+     filtered = data[
+         (data['CHR'] == chrom) &
+         (data['POS'] >= window_start) &
+         (data['POS'] <= window_end)
+     ].copy()
+
+     if not filtered.empty:
+         # Calculate distance from gene: 0 inside the gene, negative before, positive after
+         filtered.loc[:, 'Distance'] = np.nan
+
+         # Create boolean masks
+         mask_inside = filtered['POS'].between(gene_start, gene_end)
+         mask_before = filtered['POS'] < gene_start
+         mask_after = filtered['POS'] > gene_end
+
+         filtered.loc[mask_inside, 'Distance'] = 0
+         filtered.loc[mask_before, 'Distance'] = filtered['POS'] - gene_start
+         filtered.loc[mask_after, 'Distance'] = filtered['POS'] - gene_end
+
+         filtered["Distance"] = filtered["Distance"].astype("Int64")
+
+         print(f"Found {len(filtered)} variants.")
+     else:
+         print(f"No variants found in a {window_size}bp window around {gene_identifier}")
+
+     return filtered
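The signed-distance convention (0 inside the gene, negative upstream of `gene_start`, positive downstream of `gene_end`) can be checked in isolation. `signed_distance` and the positions below are illustrative, not part of the package:

```python
import numpy as np
import pandas as pd

def signed_distance(positions, gene_start, gene_end):
    """Distance to the nearest gene boundary, mirroring the mask logic:
    0 inside [gene_start, gene_end], negative before, positive after."""
    pos = pd.Series(positions)
    dist = pd.Series(np.nan, index=pos.index)
    dist[pos.between(gene_start, gene_end)] = 0
    dist[pos < gene_start] = pos - gene_start
    dist[pos > gene_end] = pos - gene_end
    return dist.astype("Int64")

signed_distance([50, 150, 250], gene_start=100, gene_end=200)  # -50, 0, 50
```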
@@ -1,15 +1,12 @@
  import pandas as pd
  import numpy as np
  import scipy.stats as st
- import os, subprocess
- import shutil
+ import os
  import warnings
  from collections import Counter
- import wget
-
- from .constants import STANDARD_COLUMNS, BUCKET_URL
- from .tools import read_config

+ from .constants import STANDARD_COLUMNS
+ from .tools import get_reference_panel_path, run_plink_command, create_tmp, get_plink_path


  def remove_na(data):
@@ -282,7 +279,7 @@ def fill_snpids_func(data, reference_panel_df, keep_indel):
      return data

  def check_int_column(data, int_col):
-     """Set the type of the int_col column to Int64 and non-numeric values to NA."""
+     """Set the type of the int_col column to Int64 and non-numeric values to NA. This function is used to check the validity of the CHR and POS columns."""
      nrows = data.shape[0]
      # Remove any non-digit characters, convert to numeric, setting non-numeric to NaN
      data[int_col] = pd.to_numeric(data[int_col].astype(str).str.extract('(\d+)', expand=False), errors='coerce')
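The coercion used for CHR/POS can be reproduced standalone: extract the first run of digits, coerce to numeric, then cast to nullable Int64 so failures become `<NA>` rather than raising (illustrative helper, not the package's function):

```python
import pandas as pd

def coerce_int_column(values):
    """Strip non-digit characters and return a nullable Int64 Series;
    values without any digits become <NA>."""
    s = pd.Series(values)
    digits = s.astype(str).str.extract(r"(\d+)", expand=False)
    return pd.to_numeric(digits, errors="coerce").astype("Int64")

coerce_int_column(["chr1", "22", "X", None])  # 1, 22, <NA>, <NA>
```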
@@ -403,53 +400,39 @@ def check_arguments(

  def save_data(data, name, path="", fmt="h5", sep="\t", header=True):
      """
-     Save a DataFrame to a file in a given format.
-
-     Args:
-     - data (pd.DataFrame): The data to be saved.
-     - name (str): The name of the file without extension.
-     - path (str, optional): Directory path for saving. Default is the current directory.
-     - fmt (str, optional): Format for the file, e.g., "h5", "csv", "txt". Default is "h5".
-     - sep (str, optional): Delimiter for csv or txt files. Default is tab.
-     - header (bool, optional): Whether to include header in csv or txt files. Default is True.
+     Save data to a specified file format.

-     Returns:
-     None. But saves the data to a file and prints the file path.
+     Supported formats: .h5 (default), .csv, .txt.
+     Future supported formats: .vcf, .vcf.gz.

-     Raises:
-     - ValueError: If the provided format is not recognized.
+     Args:
+         data (pd.DataFrame): DataFrame to be saved.
+         name (str): A unique identifier for the data, used as the filename.
+         path (str, optional): The directory where the file will be saved. Defaults to current directory.
+         fmt (str, optional): The desired file format. Defaults to "h5".
+         sep (str, optional): Delimiter for text-based formats (.csv, .txt). Defaults to tab.
+         header (bool, optional): Whether to include column names in text-based formats. Defaults to True.
      """
-     if path:
-         path_name = os.path.join(path, f"{name}.{fmt}")
-     else:
-         path_name = f"{name}.{fmt}"
-
+     path = os.path.join(path, name)
      if fmt == "h5":
-         df = data.copy()
-         for col in df.select_dtypes(include="integer").columns:
-             df[col] = df[col].astype("float64")
-         df.to_hdf(path_name, mode="w", key="data")
-
-     elif fmt in ["csv", "txt"]:
-         data.to_csv(path_name, sep=sep, header=header, index=False)
-
+         data.to_hdf(f"{path}.h5", key="data", mode="w", format="table")
+     elif fmt == "csv":
+         data.to_csv(f"{path}.csv", sep=sep, header=header, index=False)
+     elif fmt == "txt":
+         data.to_csv(f"{path}.txt", sep=sep, header=header, index=False)
      else:
-         raise ValueError(
-             "The fmt argument takes value in (h5 (default), csv, txt)."
-         )
-
-     print(f"Data saved to {path_name}")
+         print(f"Format {fmt} is not supported yet.")


  def Combine_Geno(Gs):
      """
-     Combine a list of GWAS objects into one.
+     Combine multiple Geno instances.

      Args:
-     - Gs (list): List of GWAS objects.
+         Gs (list): A list of Geno instances to combine.

      Returns:
-     Geno object: Combined Geno object.
+         Geno: A new Geno instance containing the combined data.
      """
      from .Geno import Geno

@@ -462,118 +445,101 @@ def Combine_Geno(Gs):
462
445
 
463
446
  return Geno(C)
464
447
 
465
- def filter_by_gene_func(data, gene_identifier, id_type="symbol", window_size=1000000, build="37"):
448
+
449
+ def update_eaf_func(data, reference_panel, object_name, ram=10000, fill=True):
466
450
  """
467
- Filtering the data to include only variants that are within a specified distance of a specific gene.
468
- Corresponds to the :meth:`Geno.filter_by_gene` method.
469
- Args:
470
- data (pd.DataFrame): Input data with at least 'CHR' and 'POS' columns.
471
- gene_identifier (str): Identifier for the gene/protein to filter variants around.
472
- id_type (str, optional): Type of identifier provided. Options are:
473
- - "symbol": Gene symbol (e.g., "APOE")
474
- - "HGNC": HGNC ID (e.g., "HGNC:613")
475
- - "name": Full gene name (e.g., "apolipoprotein E")
476
- - "Ensembl": Ensembl gene ID (e.g., "ENSG00000130203")
477
- - "NCBI": NCBI gene ID (e.g., "348")
478
- - "UCSC": UCSC gene ID (e.g., "uc001hbu.2")
479
- - "Vega": Vega gene ID (e.g., "OTTHUMG00000019505")
480
- Default is "symbol".
481
- window_size (int, optional): Size of the window around the gene in base pairs. Default is 1,000,000 (1Mb).
482
- build (str, optional): Genome build of the data. Default is "37".
483
-
484
- Returns:
485
- pd.DataFrame: Filtered DataFrame containing only variants within the specified window
486
- around the gene, with additional column 'Distance'.
451
+ Core logic to update or create the EAF (Effect Allele Frequency) column.
487
452
 
488
- Notes:
489
- - Distance is calculated from the nearest gene boundary (start or end position)
490
- - Null distances indicate the variant is within the gene
453
+ This function calculates EAF from a reference panel. If CHR/POS are available,
454
+ it uses a fast, coordinate-based extraction with PLINK. Otherwise, it falls
455
+ back to SNP-ID-based extraction.
491
456
  """
457
+ ref_panel_path, ref_filetype = get_reference_panel_path(reference_panel)
458
+ create_tmp()
459
+
460
+ by_coordinate = "CHR" in data.columns and "POS" in data.columns
461
+
462
+ # --- Match by CHR/POS or SNP ID ---
463
+ if by_coordinate:
464
+ print("CHR/POS columns present. SNPs searched based on genomic positions.")
492
465
 
493
- # Validate id_type
494
- valid_id_types = ["symbol", "HGNC_id", "name", "gene_id", "NCBI_id", "UCSC_id", "Vega_id"]
495
- if id_type in ["HGNC", "NCBI", "UCSC", "Vega"]:
496
- id_type = id_type + "_id"
497
- if id_type == "Ensembl":
498
- id_type = "gene_id"
499
- if id_type not in valid_id_types:
500
- raise ValueError(f"Invalid id_type. Must be one of: {', '.join(valid_id_types)}")
501
-
502
- # Validate build
503
- if int(build) not in [37, 38]:
504
- raise ValueError(f"Invalid build. Must be one of: 37, 38")
505
-
506
- # Download the gene info file if not already present in the reference folder
507
- config = read_config()
508
- ref_path = config["paths"]["ref_path"]
509
- gene_info_file = os.path.join(ref_path, "gene_id_mapping_filtered.parquet")
510
- if not os.path.exists(gene_info_file):
511
- # Download parquet file
512
- print(f"Downloading gene info file to {gene_info_file}...")
513
- url = BUCKET_URL + "gene_id_mapping_filtered.parquet"
514
- try:
515
- wget.download(url, gene_info_file)
516
- print("\nDownload complete.")
517
- except Exception as e:
518
- if os.path.exists(gene_info_file):
519
- os.remove(gene_info_file)
520
- raise RuntimeError(f"Failed to download gene info: {e}")
521
-
522
- df_gene_info = pd.read_parquet(gene_info_file, engine="pyarrow")
523
-
524
- # Find gene coordinates
525
- gene_data = df_gene_info[df_gene_info[id_type] == gene_identifier]
526
-
527
- if gene_data.empty:
528
- raise ValueError(f"Gene with {id_type}='{gene_identifier}' not found in gene info database.")
529
-
530
- if len(gene_data) > 1:
531
- print(f"Warning: Multiple entries found for {id_type}='{gene_identifier}'. Using the first entry.")
532
- gene_data = gene_data.iloc[0,:]
466
+ # 1. Write coordinates to a temp file for PLINK's --extract range
467
+ coord_path = os.path.join("tmp_GENAL", f"{object_name}_coord_list.txt")
468
+ data[['CHR', 'POS', 'POS']].dropna().to_csv(coord_path, sep='\t', index=False, header=False)
469
+
470
+ # 2. Run --freq directly, extracting by range and adding POS to output
471
+ freq_prefix = os.path.join("tmp_GENAL", f"{object_name}_eaf_freqs")
472
+ plink_command = (
473
+ f"{get_plink_path()} --{'pfile' if ref_filetype == 'pgen' else 'bfile'} {ref_panel_path} "
474
+ f"--memory {ram} "
475
+ f"--extract range {coord_path} "
476
+ f"--freq cols=+pos "
477
+ f"--out {freq_prefix}"
478
+ )
479
+ run_plink_command(plink_command)
533
480
 
534
- print(f"Filtering variants within {window_size}bp window based on genome build {build} around gene: {', '.join(f'{col}: {gene_data[col]}' for col in valid_id_types)}")
535
-
536
- # Extract gene location information
537
- chrom = gene_data['CHR']
538
- # Convert to integer if possible
539
- if str(chrom).isdigit():
540
- chrom = int(chrom)
541
- elif chrom=="X":
542
- chrom=23
481
+ # 3. Load frequency results
482
+ freq_path = f"{freq_prefix}.afreq"
483
+ if not os.path.exists(freq_path) or os.path.getsize(freq_path) == 0:
484
+ warnings.warn("No variants from your data were found in the reference panel by coordinate, or PLINK failed.")
485
+ return data.copy()
486
+
487
+ freqs_df = pd.read_csv(freq_path, sep='\t')
488
+ freqs_df.rename(columns={'#CHROM': 'CHR', 'ALT': 'ALT_calc', 'ALT_FREQS': 'EAF_ref'}, inplace=True)
489
+
490
+ # 4. Merge with original data to get EA and compute final EAF
491
+ data = data.merge(freqs_df[["CHR", "POS", "ALT_calc", "EAF_ref"]], on=['CHR', 'POS'], how='left')
492
+
543
493
  else:
544
- raise ValueError(f"Gene {gene_identifier} is located on chromosome {chrom}, which is not supported.")
545
-
546
- gene_start = int(gene_data[f'gene_start_{build}'])
547
- gene_end = int(gene_data[f'gene_end_{build}'])
494
+ print("Using SNP IDs to extract frequencies.")
495
+ if "SNP" not in data.columns:
496
+ raise ValueError("SNP column is required when CHR/POS are not available.")
497
+
498
+    snp_list_path = os.path.join("tmp_GENAL", f"{object_name}_snp_list.txt")
+    data[["SNP"]].dropna().to_csv(snp_list_path, index=False, header=False)
+
+    freq_prefix = os.path.join("tmp_GENAL", f"{object_name}_eaf_freqs")
+    plink_command = (
+        f"{get_plink_path()} --{'pfile' if ref_filetype == 'pgen' else 'bfile'} {ref_panel_path} "
+        f"--memory {ram} "
+        f"--extract {snp_list_path} "
+        f"--freq "
+        f"--out {freq_prefix}"
+    )
+    run_plink_command(plink_command)
 
-    # Define the window boundaries
-    window_start = max(0, gene_start - window_size/2)
-    window_end = gene_end + window_size/2
-
-    # Filter variants within the window
-    filtered = data[
-        (data['CHR'] == chrom) &
-        (data['POS'] >= window_start) &
-        (data['POS'] <= window_end)
-    ].copy()
-
-    if not filtered.empty:
-        # Calculate distance from gene: if inside the gene, distance is 0, if before, distance is negative, if after, distance is positive
-        filtered.loc[:, 'Distance'] = np.nan
-
-        # Create boolean masks
-        mask_inside = filtered['POS'].between(gene_start, gene_end)
-        mask_before = filtered['POS'] < gene_start
-        mask_after = filtered['POS'] > gene_end
+    freq_path = f"{freq_prefix}.afreq"
+    if not os.path.exists(freq_path):
+        warnings.warn("PLINK did not generate a frequency file. Cannot update EAF.")
+        return data.copy()
 
-        filtered.loc[mask_inside, 'Distance'] = 0
-        filtered.loc[mask_before, 'Distance'] = filtered['POS'] - gene_start
-        filtered.loc[mask_after, 'Distance'] = filtered['POS'] - gene_end
+    freqs_df = pd.read_csv(freq_path, sep='\t')
+    freqs_df.rename(columns={"#CHROM": "CHR", "ID": "SNP", "ALT": "ALT_calc", "ALT_FREQS": "EAF_ref"}, inplace=True)
 
-        filtered["Distance"] = filtered["Distance"].astype("Int64")
-
-        print(f"Found {len(filtered)} variants.")
+    data = data.merge(freqs_df[["SNP", "ALT_calc", "EAF_ref"]], on="SNP", how="left")
+
+    if data["EAF_ref"].isna().all():
+        warnings.warn("No matching SNPs found in the reference panel.")
+        return data.copy()
+
+    # Handle allele direction to ensure correct EAF is returned
+    conditions = [
+        data["EA"] == data["ALT_calc"],
+        data["NEA"] == data["ALT_calc"],
+    ]
+    choices = [
+        data["EAF_ref"],
+        1 - data["EAF_ref"],
+    ]
+    data["EAF_new"] = np.select(conditions, choices, default=np.nan)
+
+    # Create updated EAF column
+    if 'EAF' not in data.columns:
+        data['EAF'] = np.nan
+    if fill:
+        data['EAF'] = np.where(pd.notna(data["EAF_new"]), data["EAF_new"], data['EAF'])
     else:
-        print(f"No variants found in a {window_size}bp window around {gene_identifier}")
-
-    return filtered
+        data['EAF'] = np.where(pd.notna(data["EAF_new"]), data["EAF_new"], np.nan)
+    data.drop(columns=["EAF_new", "EAF_ref", "ALT_calc"], inplace=True)
+
+    return data
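The allele-direction step added in this hunk can be exercised in isolation. The sketch below uses a toy DataFrame whose `ALT_calc`/`EAF_ref` columns mimic what the hunk derives from PLINK's `.afreq` output after the rename step; it is an illustration of the `np.select` logic, not a call into the packaged function:

```python
import numpy as np
import pandas as pd

# Toy summary statistics already merged with reference-panel frequencies.
# ALT_calc / EAF_ref mirror the columns the hunk builds from the .afreq file.
data = pd.DataFrame({
    "EA":       ["A", "G", "C"],
    "NEA":      ["G", "A", "T"],
    "ALT_calc": ["A", "A", "G"],   # third SNP matches neither allele
    "EAF_ref":  [0.30, 0.30, 0.30],
})

# If the effect allele is the reference ALT, keep the frequency;
# if the non-effect allele is ALT, the effect-allele frequency is 1 - EAF_ref;
# otherwise the alleles cannot be aligned and the frequency stays missing.
conditions = [data["EA"] == data["ALT_calc"], data["NEA"] == data["ALT_calc"]]
choices = [data["EAF_ref"], 1 - data["EAF_ref"]]
data["EAF_new"] = np.select(conditions, choices, default=np.nan)

print(data["EAF_new"].round(6).tolist())  # [0.3, 0.7, nan]
```

The third row shows why the `default=np.nan` matters: a SNP whose reported alleles match neither panel allele is left without a frequency rather than silently mis-assigned.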
@@ -4,7 +4,7 @@ build-backend = "flit_core.buildapi"
 
 [project]
 name = "genal-python" # Updated name for PyPI
-version = "1.4.0"
+version = "1.4.2"
 authors = [{name = "Cyprien Rivier", email = "riviercyprien@gmail.com"}]
 description = "A python toolkit for polygenic risk scoring and mendelian randomization."
 readme = "README.md"
@@ -1,63 +0,0 @@
-# Genal Code Review Report
-
-This report provides a thorough theoretical, methodological, and mathematical analysis of the `genal` codebase, following the guidelines provided in `CODE_REVIEW_GUIDE.md`.
-
----
-
-## **Finding 1: Allele harmonization logic for palindromic SNPs is flawed**
-
-**Severity:** Critical
-
-**Concern:** The `apply_action_2` function in `MR.py`, which is responsible for harmonizing palindromic SNPs using allele frequencies, contains flawed logic. The decision to flip an allele is based almost exclusively on the exposure allele frequency (`EAF_e`), without correctly considering the outcome allele frequency (`EAF_o`) or the relationship between the two. Specifically, it may incorrectly flip SNPs whose alleles are already concordant, simply because the exposure allele frequency is high. This can lead to incorrectly aligned effect alleles between the exposure and outcome, causing a sign error in the effect estimate (`BETA_e`). Such an error will severely bias the resulting Mendelian Randomization estimates, potentially reversing the direction of the causal effect or introducing spurious null results.
-
-**Recommendation:** The harmonization logic for palindromic SNPs should be rewritten to follow established best practices, such as those implemented in the `TwoSampleMR` R package. The corrected logic must properly compare `EAF_e` and `EAF_o` to infer strand orientation. For example, when alleles match, a flip should only be considered if frequencies are complementary (e.g., `EAF_e` > 0.5 and `EAF_o` < 0.5, or vice-versa, within a defined tolerance).
-
-**Evidence/Rationale:** The faulty logic is in the `apply_action_2` function in `genal/MR.py`. A correct implementation would compare both `EAF_e` and `EAF_o` to determine if they are on opposite strands. The current implementation's reliance on `EAF_e` alone is methodologically incorrect for inferring strand flips for palindromic SNPs.
-
----
-
-## **Finding 2: The MR-PRESSO distortion test implementation is incorrect**
-
-**Severity:** High
-
-**Concern:** The implementation of the MR-PRESSO distortion test in `MRpresso.py` appears to deviate from the methodology described in the original publication (Verbanck et al., *Nature Genetics*, 2018). The purpose of the distortion test is to assess whether the causal estimate changes significantly after removing outlier SNPs. This is tested by comparing the observed change to a null distribution of changes generated by removing the same number of *random* SNPs. The current implementation generates a null distribution by removing subsets of *non-outlier* SNPs, as indicated by a comment in the code from the original author expressing uncertainty (`## Is there an error in the MRPRESSO code?...`). This approach does not correctly simulate the null hypothesis and will likely produce an inaccurate p-value for the distortion test, leading to incorrect conclusions about the impact of pleiotropy.
-
-**Recommendation:** The logic for the distortion test's permutation procedure should be corrected to follow the original MR-PRESSO method. The null distribution should be generated by repeatedly removing a random set of SNPs (equal in number to the identified outliers) from the full dataset and calculating the resulting change in the causal estimate.
-
-**Evidence/Rationale:** The implementation in `genal/MRpresso.py` (around line 130 and in the `get_random_bias` function) and the associated comment from the developer strongly suggest the implementation is not faithful to the published MR-PRESSO methodology for the distortion test.
-
----
-
-## **Finding 3: The default LD clumping window size is excessively large**
-
-**Severity:** High
-
-**Concern:** The `clump` method in the `Geno` class uses a default clumping window of 10,000 kb (`kb=10000`). This is substantially larger than the commonly recommended window sizes of 250 kb to 1000 kb. Using such a large window risks incorrectly grouping SNPs that are not in linkage disequilibrium, especially across regions with moderate to high recombination rates. This can lead to the erroneous removal of genuinely independent genetic variants, reducing the resolution of the analysis and potentially biasing downstream results for Polygenic Risk Scores and Mendelian Randomization by discarding valid instruments.
-
-**Recommendation:** The default value for the `kb` parameter should be reduced to a more conventional and evidence-based value, such as 250 kb or 500 kb. The docstring should be updated to explain the choice and guide the user on selecting an appropriate window size based on their specific analysis and population.
-
-**Evidence/Rationale:** The `Geno.py` file, line 356, defines `def clump(self, kb=10000, ...)`. Peer-reviewed literature and best-practice guides (e.g., Privé et al., *Am J Hum Genet*, 2019; PLINK documentation) use smaller windows (e.g., 250kb-1000kb) as defaults for LD clumping. A 10,000kb window is typically only used for special cases like checking for long-range LD, not as a general-purpose default. A search of recent literature confirms this ("plink ld clumping window size best practice GWAS MR").
-
----
-
-## **Finding 4: The default window size for proxy SNP searching is inefficient**
-
-**Severity:** Medium
-
-**Concern:** The `query_outcome` and `prs` methods, when finding proxies, default to a search window of 5,000 kb. While the `r2` threshold of 0.8 is appropriate, searching for proxies over a 5 Mb window is computationally inefficient and methodologically questionable. Linkage disequilibrium is a local phenomenon, and proxies are almost always found within a much smaller window (e.g., 50-250 kb). Searching a 5 Mb window significantly increases computation time and risks identifying spurious, long-range correlations that do not reflect true local LD structure.
-
-**Recommendation:** Reduce the default `kb` parameter for proxy searches to a more standard and efficient value, such as 250 kb or 500 kb.
-
-**Evidence/Rationale:** `Geno.py`, line 810: `query_outcome(..., kb=5000, r2=0.8, ...)` and line 536: `prs(..., kb=5000, r2=0.8, ...)`. Standard practice for proxy searching is to use a much smaller window as LD is not expected to extend over several megabases. This large window offers little benefit at a significant computational cost.
-
----
-
-## **Finding 5: The default LD clumping r-squared threshold is unusually stringent**
-
-**Severity:** Medium
-
-**Concern:** The `clump` method defaults to an `r2` threshold of 0.01. While this ensures a set of highly independent SNPs, it is a very stringent cutoff compared to values commonly used in the literature (e.g., 0.1, 0.2, or 0.5 for selecting instruments). This may lead to an overly aggressive clumping procedure that retains multiple, weakly correlated signals from the same locus. For methods that assume independent instruments (like standard IVW MR), this could be problematic. Furthermore, for PRS construction, this stringent threshold might not be optimal for predictive power.
-
-**Recommendation:** Consider changing the default `r2` to a more moderate and widely used value, such as 0.2 or 0.1. The rationale for the default should be clearly documented, and the user should be encouraged to select a threshold appropriate for their specific analysis.
-
-**Evidence/Rationale:** `Geno.py`, line 356: `def clump(self, kb=10000, r2=0.01, ...)`. The paper "Making the Most of Clumping and Thresholding for Polygenic Scores" by Privé et al. (2019) demonstrates that the optimal `r2` varies widely by trait and that values higher than 0.01 are often optimal. Many tutorials and standard pipelines use less stringent thresholds.
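Finding 1 of the removed report recommends inferring strand orientation for palindromic SNPs by comparing exposure and outcome allele frequencies. A minimal sketch of that rule, with a hypothetical helper `harmonize_palindromic` and an assumed tolerance `tol` (this illustrates the report's recommendation, not genal's shipped implementation):

```python
def harmonize_palindromic(eaf_e, eaf_o, tol=0.08):
    """Decide how to treat a palindromic SNP from its allele frequencies.

    Returns "keep" when both frequencies sit on the same side of 0.5
    (alleles already concordant), "flip" when they sit on opposite sides
    (suggesting a strand flip), and "drop" when either frequency is
    within tol of 0.5 and strand cannot be inferred.
    """
    if abs(eaf_e - 0.5) < tol or abs(eaf_o - 0.5) < tol:
        return "drop"   # ambiguous: frequency too close to 0.5 to call
    if (eaf_e > 0.5) == (eaf_o > 0.5):
        return "keep"   # concordant frequencies: no flip needed
    return "flip"       # complementary frequencies: infer strand flip

print(harmonize_palindromic(0.20, 0.25))  # keep
print(harmonize_palindromic(0.20, 0.78))  # flip
print(harmonize_palindromic(0.48, 0.20))  # drop
```

The key point of the recommendation is visible in the second case: a flip is triggered only when the two frequencies are complementary, never from the exposure frequency alone.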
File without changes