gpu-coloc 0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gpu_coloc-0.1/PKG-INFO ADDED
@@ -0,0 +1,31 @@
1
+ Metadata-Version: 2.4
2
+ Name: gpu-coloc
3
+ Version: 0.1
4
+ Summary: Ultra-fast GPU-enabled Bayesian colocalisation
5
+ Home-page: https://github.com/mjesse-github/gpu-coloc
6
+ Author: Mihkel Jesse
7
+ License: MIT
8
+ Requires-Python: >=3.12
9
+ Requires-Dist: filelock>=3.17.0
10
+ Requires-Dist: fsspec>=2025.2.0
11
+ Requires-Dist: Jinja2>=3.1.5
12
+ Requires-Dist: MarkupSafe>=3.0.2
13
+ Requires-Dist: mpmath>=1.3.0
14
+ Requires-Dist: networkx>=3.4.2
15
+ Requires-Dist: numpy>=2.2.3
16
+ Requires-Dist: pandas>=2.2.3
17
+ Requires-Dist: pyarrow>=19.0.0
18
+ Requires-Dist: python-dateutil>=2.9.0.post0
19
+ Requires-Dist: pytz>=2025.1
20
+ Requires-Dist: six>=1.17.0
21
+ Requires-Dist: sympy>=1.13.1
22
+ Requires-Dist: torch>=2.6.0
23
+ Requires-Dist: tqdm>=4.67.1
24
+ Requires-Dist: typing_extensions>=4.12.2
25
+ Requires-Dist: tzdata>=2025.1
26
+ Dynamic: author
27
+ Dynamic: home-page
28
+ Dynamic: license
29
+ Dynamic: requires-dist
30
+ Dynamic: requires-python
31
+ Dynamic: summary
@@ -0,0 +1,86 @@
1
+ # gpu-coloc
2
+
3
+ **gpu-coloc** is a GPU-accelerated implementation of the Bayesian colocalization algorithm (COLOC), providing identical results to R's coloc.bf\_bf at approximately 1000x greater speed.
4
+
5
+ ## Citation
6
+
7
+ If you use **gpu-coloc**, please cite: *(citation placeholder)*
8
+
9
+ ## Installation
10
+
11
+ Clone the repository:
12
+
13
+ ```bash
14
+ git clone https://github.com/mjesse-github/gpu-coloc
15
+ ```
16
+
17
+ ### Dependencies
18
+
19
+ Install required Python libraries locally:
20
+
21
+ ```bash
22
+ pip install -r requirements.txt
23
+ ```
24
+
25
+ Or create a virtual environment using:
26
+
27
+ ```bash
28
+ python3 -m venv coloc_env
29
+ source coloc_env/bin/activate
30
+ pip3 install -r requirements.txt
31
+ ```
32
+
33
+ For Linux x64 servers, we recommend using our Singularity container:
34
+ *(Singularity link placeholder)*
35
+
36
+ ## Testing Installation
37
+
38
+ Run:
39
+
40
+ ```bash
41
+ bash test.sh
42
+ ```
43
+
44
+ ## Workflow
45
+
46
+ Note: The following example assumes gpu-coloc is downloaded into your working directory. Adjust paths accordingly if downloaded elsewhere.
47
+
48
+ Variants must follow a uniform naming convention, because the COLOC algorithm matches variants across datasets by name. Use the format: chr[chromosome]_[position]_[ref]_[alt]. Perform any renaming before Step 1 below. Note that the X chromosome is encoded as "X", not "23".
49
+
50
+ 1. **Prepare signals and summary files**
51
+
52
+ * **Signals files**: Each signal should be saved in `[signal].pickle` format, containing variants and their respective log Bayes Factors (lbf).
53
+
54
+ Format on which our formatting algorithm works:
55
+
56
+ ```
57
+ variant chrX_153412224_C_A chrX_153412528_C_T ...
58
+ lbf -0.060991 -1.508802 ...
59
+ ```
60
+
61
+ * **Summary file**: Tab-separated file with the structure below:
62
+
63
+ ```
64
+ signal chromosome location_min location_max signal_strength lead_variant
65
+ QTD000141_ENSG00000013563_L1 X 153412224 155341332 12.1069377174147 chrX_154403855_T_G
66
+ ...
67
+ ```
68
+
69
+ Example naming convention:
70
+
71
+ * `gwas_summary.tsv`
72
+ * Signals in directory `gwas_signals/[signal].pickle`
73
+
74
+ Scripts in `summary_and_signals_examples/` are provided as examples, but may require adjustments.
75
+
76
+ 2. **Format data:**
77
+
78
+ ```bash
79
+ python3 gpu-coloc/format.py --input [path_to_signals] --input_summary [summary_file] --output [output_folder]
80
+ ```
81
+
82
+ 3. **Run colocalization:**
83
+
84
+ ```bash
85
+ python3 gpu-coloc/coloc.py --dir1 [formatted_dataset_1] --dir2 [formatted_dataset_2] --results [results_output] --p12 1e-6 --H4 0.8
86
+ ```
File without changes
@@ -0,0 +1,14 @@
1
+ import sys
2
+ from gpu_coloc import coloc, format
3
+
4
def main():
    """Console entry point: dispatch to the coloc or format sub-script.

    Recognised flags (removed from sys.argv before dispatch so the
    sub-script's own argparse never sees them):
      -r / --run     run the colocalisation script (coloc.main)
      -f / --format  run the dataset formatting script (format.main)
    """
    if "-r" in sys.argv or "--run" in sys.argv:
        # Strip every alias, not just the first one found.
        for flag in ("-r", "--run"):
            if flag in sys.argv:
                sys.argv.remove(flag)
        coloc.main()
    elif "-f" in sys.argv or "--format" in sys.argv:
        for flag in ("-f", "--format"):
            if flag in sys.argv:
                sys.argv.remove(flag)
        format.main()
    else:
        print("Usage: gpu-coloc [-r|--run] or [-f|--format]")
        print("Use -r or --run to run the coloc script.")
        print("Use -f or --format to run the format script.")
@@ -0,0 +1,359 @@
1
+ import argparse
2
+ import math
3
+ import os
4
+ import torch
5
+ import pandas as pd
6
+ import numpy as np
7
+ from tqdm import tqdm
8
+
9
def logdiff_torch(a, b):
    """Elementwise, numerically stable log(exp(a) - exp(b)).

    Returns NaN wherever a <= b (the difference has no real logarithm).
    """
    mx = torch.maximum(a, b)
    diff = torch.exp(a - mx) - torch.exp(b - mx)
    # Non-positive differences are mapped to NaN before the log; the
    # original also re-assigned NaN afterwards, which was redundant.
    nan_fill = torch.full_like(diff, float("nan"))
    return mx + torch.log(torch.where(diff > 0, diff, nan_fill))
16
+
17
def coloc_bf_bf_torch(
    bf1_cpu, bf2_cpu,
    p1=1e-4, p2=1e-4, p12=5e-6,
    device="mps"
):
    """All-pairs Bayesian colocalisation from log Bayes factors (coloc.bf_bf).

    Parameters
    ----------
    bf1_cpu, bf2_cpu : pd.DataFrame or pd.Series
        Rows are signals, columns are variant IDs, values are log BFs.
        A Series is treated as a single-row frame.
    p1, p2, p12 : float
        Priors for trait-1-only, trait-2-only and shared causal variants.
    device : str
        torch device for the batched computation.

    Returns
    -------
    dict with keys "summary" (long-form per-pair PP.H3/PP.H4 with chunk-local
    row indices idx1/idx2), "pp_3d" (5 x N x K posteriors for H0..H4),
    "pp_H4_matrix" (N x K) and "priors". When there are no shared variants,
    "summary" is a one-row frame with a NaN 'nsnps' column.
    """
    if isinstance(bf1_cpu, pd.Series):
        bf1_cpu = bf1_cpu.to_frame().T
    if isinstance(bf2_cpu, pd.Series):
        bf2_cpu = bf2_cpu.to_frame().T

    # Shared variants only; sorted so the column order is deterministic
    # (bare set iteration order varies between runs).
    isnps = sorted(set(bf1_cpu.columns).intersection(bf2_cpu.columns) - {"null"})
    if not isnps:
        return {
            "summary": pd.DataFrame({"nsnps": [np.nan]}),
            "pp_3d": None,
            "pp_H4_matrix": None,
            "priors": {"p1": p1, "p2": p2, "p12": p12}
        }

    bf1_arr = torch.tensor(bf1_cpu[isnps].values, dtype=torch.float32, device=device)
    bf2_arr = torch.tensor(bf2_cpu[isnps].values, dtype=torch.float32, device=device)

    N = bf1_arr.shape[0]
    K = bf2_arr.shape[0]

    # log sum_s exp(bf1[i,s] + bf2[k,s]): evidence that i and k share
    # a single causal variant (numerator of H4).
    sum_3d_logexp = torch.logsumexp(bf1_arr.unsqueeze(1) + bf2_arr.unsqueeze(0), dim=2)

    # Per-signal total evidence across all shared variants.
    l1_sum = torch.logsumexp(bf1_arr, dim=1)
    l2_sum = torch.logsumexp(bf2_arr, dim=1)
    l1_sum_2d = l1_sum.unsqueeze(1).expand(N, K)
    l2_sum_2d = l2_sum.unsqueeze(0).expand(N, K)

    p1_t = torch.tensor(p1, dtype=torch.float32, device=device)
    p2_t = torch.tensor(p2, dtype=torch.float32, device=device)
    p12_t = torch.tensor(p12, dtype=torch.float32, device=device)

    # Unnormalised log posteriors for the five coloc hypotheses.
    lH0_2d = torch.zeros((N, K), device=device)
    lH1_2d = torch.log(p1_t) + l1_sum_2d
    lH2_2d = torch.log(p2_t) + l2_sum_2d
    lH4_2d = torch.log(p12_t) + sum_3d_logexp
    # H3 (two distinct causal variants) = all variant pairs minus the
    # same-variant pairs, in log space.
    lH3_2d = torch.log(p1_t) + torch.log(p2_t) + logdiff_torch(l1_sum_2d + l2_sum_2d, sum_3d_logexp)

    all_abf_3d = torch.stack([lH0_2d, lH1_2d, lH2_2d, lH3_2d, lH4_2d], dim=0)
    denom_2d = torch.logsumexp(all_abf_3d, dim=0)
    pp_abf_3d = torch.exp(all_abf_3d - denom_2d.unsqueeze(0))

    pp_H3_2d = pp_abf_3d[3]
    pp_H4_2d = pp_abf_3d[4]

    # Flatten the (N, K) grid into long form for the summary table.
    i_coords = torch.arange(N, device=device).unsqueeze(1).expand(N, K).flatten()
    j_coords = torch.arange(K, device=device).unsqueeze(0).expand(N, K).flatten()

    summary_df = pd.DataFrame({
        "idx1": i_coords.cpu().numpy(),
        "idx2": j_coords.cpu().numpy(),
        "PP.H3": pp_H3_2d.flatten().cpu().numpy(),
        "PP.H4": pp_H4_2d.flatten().cpu().numpy(),
    })

    return {
        "summary": summary_df,
        "pp_3d": pp_abf_3d.cpu().numpy(),
        "pp_H4_matrix": pp_H4_2d.cpu().numpy(),
        "priors": {"p1": p1, "p2": p2, "p12": p12}
    }
96
+
97
+
98
def logsum(arr):
    """Numerically stable log(sum(exp(arr))) for a 1-D array."""
    peak = np.max(arr)
    return peak + np.log(np.exp(arr - peak).sum())
101
+
102
def logbf_to_pp(df, pi, last_is_null):
    """Convert per-row log Bayes factors into posterior probabilities.

    Parameters
    ----------
    df : pd.DataFrame
        Rows are signals, columns are variants; when last_is_null is True
        the final column holds the null-model log BF and all BFs are taken
        relative to it.
    pi : float or array-like
        Prior probability per variant. A scalar is capped at 1/n and
        expanded, with the null prior 1 - n*pi appended when applicable.
    last_is_null : bool
        Whether the final column is the null model.

    Returns
    -------
    pd.DataFrame with df's index/columns; each row sums to 1.
    """
    n = df.shape[1] - 1 if last_is_null else df.shape[1]

    if isinstance(pi, (int, float)):
        if pi > 1 / n:
            pi = 1 / n
        pi = np.append(np.repeat(pi, n), 1 - n * pi) if last_is_null else np.repeat(pi, n)
    else:
        # Copy so a caller-supplied prior array is never mutated in place
        # (the original wrote 1e-16 back into the caller's array).
        pi = np.asarray(pi, dtype=float).copy()

    if np.any(pi == 0):
        pi[pi == 0] = 1e-16
        pi = pi / np.sum(pi)

    if last_is_null:
        # Express everything relative to the null model.
        df = df.subtract(df.iloc[:, -1], axis=0)

    scores = df.values + np.log(pi)

    # Row-wise stable log-sum-exp normalisation, fully vectorised
    # (replaces the per-row np.apply_along_axis(logsum, ...) pass).
    peak = scores.max(axis=1, keepdims=True)
    denom = peak + np.log(np.exp(scores - peak).sum(axis=1, keepdims=True))

    return pd.DataFrame(np.exp(scores - denom), index=df.index, columns=df.columns)
126
+
127
def trim(bf1, bf2, p1=1e-4, p2=1e-4, overlap_min=0.5, silent=True):
    """Enumerate row pairs (i, j) of bf1 x bf2 with sufficient shared posterior mass.

    A pair is dropped when either signal carries less than `overlap_min` of
    its non-null posterior probability on the variants common to both inputs.
    Returns a DataFrame with columns 'i' and 'j', or a one-row frame with a
    NaN 'nsnps' column when there is no usable overlap.
    """
    if isinstance(bf1, pd.Series):
        bf1 = bf1.to_frame().T
    if isinstance(bf2, pd.Series):
        bf2 = bf2.to_frame().T

    shared = list(set(bf1.columns).intersection(set(bf2.columns)).difference(['null']))

    if not shared:
        if not silent:
            print("No common SNPs found.")
        return pd.DataFrame({'nsnps': [np.nan]})

    # Posteriors are computed on the full column sets, before subsetting.
    pp1 = logbf_to_pp(bf1, p1, last_is_null=True)
    pp2 = logbf_to_pp(bf2, p2, last_is_null=True)

    bf1 = bf1[shared]
    bf2 = bf2[shared]

    # Fraction of each signal's non-null posterior mass on the shared variants.
    prop1 = pp1[shared].sum(axis=1) / pp1.loc[:, pp1.columns != "null"].sum(axis=1)
    prop2 = pp2[shared].sum(axis=1) / pp2.loc[:, pp2.columns != "null"].sum(axis=1)

    pairs = [(i, j) for i in range(bf1.shape[0]) for j in range(bf2.shape[0])]
    todo = pd.DataFrame(pairs, columns=['i', 'j'])

    drop = [prop1[i] < overlap_min or prop2[j] < overlap_min for i, j in pairs]

    if all(drop):
        if not silent:
            print("Warning: SNP overlap too small between datasets: too few SNPs with high posterior in one trait represented in other")

        return pd.DataFrame({'nsnps': [np.nan]})

    return todo[~pd.Series(drop)].reset_index(drop=True)
161
+
162
def coloc_loop(
    mat1: pd.DataFrame,
    mat2: pd.DataFrame,
    metadata1: pd.DataFrame,
    metadata2: pd.DataFrame,
    num_chunks1=0,
    num_chunks2=0,
    device="cuda",
    p1=1e-4, p2=1e-4, p12=1e-6, H4_threshold=0.8
):
    """Run chunked all-vs-all colocalisation between two lbf matrices.

    mat1/mat2 hold log Bayes factors (rows = signals, columns = variants);
    metadata1/metadata2 carry "signal" and "lead_variant" columns aligned
    positionally with the matrix rows. Rows are processed in chunks of 100
    to bound device memory. Returns a DataFrame of cross-dataset pairs with
    PP.H4 >= H4_threshold (empty on no overlap or error).
    """
    # Must match the chunk-local -> global index offset applied below.
    chunk_size = 100

    try:
        # Overlap filtering uses the per-trait priors; the original passed
        # p12 positionally into trim's p1 slot.
        overlapping_pairs = trim(mat1, mat2, p1, p2)
        # When trim reports no overlap it returns an 'nsnps' frame without
        # i/j columns; the resulting KeyError is caught and treated as empty.
        valid_pairs = set(overlapping_pairs[["i", "j"]].itertuples(index=False, name=None))
    except Exception as exc:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit escape.
        print(f"Possible error in trim function: {exc}")
        return pd.DataFrame()

    if overlapping_pairs.empty:
        return pd.DataFrame()

    def _split(frame, n_chunks):
        # Slice into n_chunks pieces of chunk_size rows; the final piece
        # absorbs the remainder.
        pieces, start = [], 0
        for k in range(n_chunks):
            end = start + chunk_size if k < n_chunks - 1 else len(frame)
            pieces.append(frame.iloc[start:end, :].copy())
            start = end
        return pieces

    # Metadata is looked up via global row positions below, so only the
    # matrices need chunking (the original also chunked metadata, unused).
    mat1_chunks = _split(mat1, num_chunks1)
    mat2_chunks = _split(mat2, num_chunks2)

    all_results = []
    chunk_pairs = [(i, j) for i in range(num_chunks1) for j in range(num_chunks2)]

    for ci, cj in tqdm(chunk_pairs, desc="All chunk pairs", leave=False):
        out = coloc_bf_bf_torch(
            bf1_cpu=mat1_chunks[ci],
            bf2_cpu=mat2_chunks[cj],
            p1=p1, p2=p2, p12=p12,
            device=device
        )
        if out is None or out["summary"] is None:
            continue

        summary_df = out["summary"]
        if "idx1" not in summary_df.columns:
            # No shared variants for this chunk pair (nsnps-only summary).
            continue

        # Translate chunk-local row indices to global row indices.
        summary_df.loc[:, "idx1"] = summary_df["idx1"] + ci * chunk_size
        summary_df.loc[:, "idx2"] = summary_df["idx2"] + cj * chunk_size

        # Membership test against trim's surviving pairs; zip is far cheaper
        # than a row-wise DataFrame.apply.
        keep = [
            (a, b) in valid_pairs
            for a, b in zip(summary_df["idx1"], summary_df["idx2"])
        ]
        summary_df = summary_df[keep]
        summary_df = summary_df[summary_df["PP.H4"] >= H4_threshold].reset_index(drop=True)

        if summary_df.empty:
            continue

        # Attach signal names and lead variants via global row positions.
        summary_df["signal1"] = metadata1["signal"].iloc[summary_df["idx1"]].values
        summary_df["lead1"] = metadata1["lead_variant"].iloc[summary_df["idx1"]].values
        summary_df["signal2"] = metadata2["signal"].iloc[summary_df["idx2"]].values
        summary_df["lead2"] = metadata2["lead_variant"].iloc[summary_df["idx2"]].values

        # Self-colocalisation is meaningless; drop identical signal pairs.
        summary_df = summary_df[summary_df["signal1"] != summary_df["signal2"]].reset_index(drop=True)
        summary_df.drop(columns=["idx1", "idx2"], inplace=True)

        all_results.append(summary_df)

    return pd.concat(all_results, ignore_index=True) if all_results else pd.DataFrame()
274
+
275
def main():
    """CLI entry point: pairwise colocalisation between two formatted datasets.

    Expects --dir1/--dir2 to contain one sub-directory per chromosome, each
    holding parquet files produced by format.py (6 metadata columns followed
    by per-variant lbf columns). Appends qualifying pairs to --results.
    """
    parser = argparse.ArgumentParser(description="Run coloc")

    parser.add_argument("--dir1", type=str, required=True, help="First directory of directories of parquet files, e.g., 'formatted_eqtls'.")
    parser.add_argument("--dir2", type=str, required=True, help="Second directory of directories of parquet files, e.g., 'formatted_metabolites'.")
    parser.add_argument("--results", type=str, required=True, help="File to write the colocalization results, e.g., 'results.tsv'.")
    parser.add_argument("--p12", type=float, required=True, help="p12 prior, e.g. 1e-6")
    parser.add_argument("--H4", type=float, required=False, help="Threshold for H4, e.g. 0.8", default=0.8)

    args = parser.parse_args()

    p12 = args.p12
    H4_threshold = args.H4
    output_file = args.results

    # Prefer CUDA, then Apple MPS, then CPU.
    if torch.cuda.is_available():
        device = torch.device("cuda")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")

    # Only the top level of dir1 holds chromosome sub-directories; deeper
    # walk levels have empty `dirs` and contribute nothing.
    for root, dirs, _ in os.walk(args.dir1):
        for directory in tqdm(dirs, desc="chromosomes"):
            dir_path = os.path.join(root, directory)
            met_files = os.listdir(dir_path)

            ge_dir_path = os.path.join(args.dir2, directory)
            ge_files = os.listdir(ge_dir_path)

            # Load every parquet for this chromosome once up front.
            met_cache = {i: pd.read_parquet(os.path.join(dir_path, name))
                         for i, name in enumerate(met_files)}
            ge_cache = {i: pd.read_parquet(os.path.join(ge_dir_path, name))
                        for i, name in enumerate(ge_files)}

            for i in tqdm(range(len(met_files)), desc="processing met", leave=False):
                input1 = met_cache[i]
                metadata1 = input1.iloc[:, :6].copy()
                mat1 = input1.iloc[:, 6:].copy()

                min_pos_1 = metadata1['location_min'].min()
                max_pos_1 = metadata1['location_max'].max()

                for j in tqdm(range(len(ge_files)), desc="running files", leave=False):
                    input2 = ge_cache[j]
                    metadata2 = input2.iloc[:, :6].copy()
                    mat2 = input2.iloc[:, 6:].copy()

                    min_pos_2 = metadata2['location_min'].min()
                    max_pos_2 = metadata2['location_max'].max()

                    # Skip file pairs whose genomic regions cannot overlap.
                    if max_pos_1 < min_pos_2 or max_pos_2 < min_pos_1:
                        continue

                    final_results = coloc_loop(
                        mat1=mat1,
                        mat2=mat2,
                        metadata1=metadata1,
                        metadata2=metadata2,
                        num_chunks1=math.ceil(mat1.shape[0]/100),
                        num_chunks2=math.ceil(mat2.shape[0]/100),
                        device=device,
                        p1=1e-4,
                        p2=1e-4,
                        p12=p12,
                        H4_threshold=H4_threshold,
                    )

                    if final_results is None or final_results.empty:
                        continue

                    # Write header only when creating the file; append after.
                    if not os.path.exists(output_file):
                        final_results.to_csv(output_file, sep="\t", index=False, mode='w', header=True)
                    else:
                        final_results.to_csv(output_file, sep="\t", index=False, mode='a', header=False)

if __name__ == "__main__":
    main()
@@ -0,0 +1,131 @@
1
+ import argparse
2
+ import os
3
+ import math
4
+ import pandas as pd
5
+ from tqdm import tqdm
6
+
7
# One record per parquet file written; dumped to --output_summary at the end.
parquet_records = []
# Directory of per-signal pickle files; set from --input in main().
signals_dir = None
9
+
10
def process_group(meta_group, index, chrom, chrom_dir, group_id=None):
    """Assemble one group of signals into a dense parquet matrix on disk.

    Builds a (signals x variants) lbf matrix over the union of each signal's
    variants (missing entries filled with -1e6), prefixed with the metadata
    columns, writes it to `chrom_dir`, and records the file in the
    module-level `parquet_records` list. Returns the next group index.
    """
    signal_names = meta_group.index.tolist()
    pickle_paths = [os.path.join(signals_dir, f"{name}.pickle") for name in signal_names]

    min_loc = meta_group["location_min"].min()
    if "location_max" in meta_group.columns:
        max_loc = meta_group["location_max"].max()
    else:
        max_loc = meta_group["location_min"].max()

    # First pass: union of variant columns across all signals in the group.
    # Frames are loaded one at a time and released to bound memory.
    all_snps = set()
    for path in pickle_paths:
        frame = pd.read_pickle(path)
        all_snps.update(frame.columns.tolist())
        del frame

    meta_cols = list(meta_group.columns)
    combined_df = pd.DataFrame(index=meta_group.index, columns=meta_cols + sorted(all_snps))
    for col in meta_cols:
        combined_df[col] = meta_group[col]
    combined_df.iloc[:, len(meta_cols):] = -1e6

    dense = combined_df.to_numpy()
    snp_to_col = {
        snp: pos
        for pos, snp in enumerate(combined_df.columns[len(meta_cols):], start=len(meta_cols))
    }

    # Second pass: scatter each signal's lbf values into its matrix row.
    for path in pickle_paths:
        name = os.path.splitext(os.path.basename(path))[0]
        frame = pd.read_pickle(path)
        row = combined_df.index.get_loc(name)
        for snp, value in zip(frame.columns, frame.iloc[0].values):
            if snp in snp_to_col:
                dense[row, snp_to_col[snp]] = value
        del frame

    combined_df = pd.DataFrame(dense, index=combined_df.index, columns=combined_df.columns)
    combined_df.reset_index(inplace=True)

    if group_id is not None:
        parquet_filename = f"chr{chrom}_group_{group_id}.parquet"
    else:
        parquet_filename = f"chr{chrom}_met_group_{index}_region_{min_loc}-{max_loc}.parquet"
    parquet_path = os.path.join(chrom_dir, parquet_filename)

    combined_df.to_parquet(parquet_path, engine="pyarrow")
    parquet_records.append({
        "chromosome": chrom,
        "group": group_id if group_id is not None else index,
        "n_signals": combined_df.shape[0],
        "min_position": min_loc,
        "max_position": max_loc,
        "parquet_file": parquet_path
    })

    return index + 1
61
+
62
def create_parquet(meta_sub, index, chrom, chrom_dir):
    """Recursively split signals at large genomic gaps, then emit parquet groups.

    Splits the group at the widest location gap while it exceeds 1 Mb, then
    chunks anything larger than 1000 signals before handing each piece to
    process_group(). Returns the updated running group index.
    """
    meta_sub.sort_values(by="location_min", inplace=True)
    positions = meta_sub["location_min"].tolist()

    if len(positions) >= 2:
        ordered = sorted(positions)
        # Locate the single widest gap between consecutive positions.
        widest = 0
        left = right = 0
        for k in range(len(ordered) - 1):
            gap = ordered[k + 1] - ordered[k]
            if gap > widest:
                widest, left, right = gap, k, k + 1
        if widest > 1_000_000:
            lower_bound = ordered[left]
            upper_bound = ordered[right]
            lower_part = meta_sub[meta_sub["location_min"] <= lower_bound].copy()
            upper_part = meta_sub[meta_sub["location_min"] >= upper_bound].copy()
            index = create_parquet(lower_part, index, chrom, chrom_dir)
            index = create_parquet(upper_part, index, chrom, chrom_dir)
            return index

    if len(meta_sub) > 1000:
        signals = meta_sub.index.tolist()
        step = 1000
        for start in range(0, len(signals), step):
            subset = meta_sub.loc[signals[start:start + step]].copy()
            index = process_group(subset, index, chrom, chrom_dir, group_id=index)
        return index

    return process_group(meta_sub, index, chrom, chrom_dir, group_id=index)
96
+
97
+
98
def main():
    """CLI entry point: convert per-signal pickle files into grouped parquet files.

    Reads the summary TSV (--input_summary), groups signals by chromosome,
    and writes one parquet per group under --output/<chromosome>/ via
    create_parquet(). Optionally writes a manifest of the written parquet
    files to --output_summary.
    """
    parser = argparse.ArgumentParser(
        # NOTE(review): this text says ">500k" but create_parquet splits on
        # gaps larger than 1,000,000 bp — confirm which value is intended.
        description="Process signals with recursive gap splitting (>500k) and chunking (max 1000 signals)"
    )
    parser.add_argument("--input", type=str, required=True, help="Directory containing signal pickle files")
    parser.add_argument("--output", type=str, required=True, help="Directory to save parquet files")
    parser.add_argument("--input_summary", type=str, required=True, help="Path to summary TSV file")
    parser.add_argument("--output_summary", type=str, help="Path to write parquet summary TSV")
    args = parser.parse_args()

    # process_group() reads the pickles from this module-level directory.
    global signals_dir
    signals_dir = args.input

    os.makedirs(args.output, exist_ok=True)
    metadata = pd.read_csv(args.input_summary, sep="\t")
    # Normalise so numeric and "X" chromosome labels compare consistently.
    metadata["chromosome"] = metadata["chromosome"].astype(str)
    # Optionally, filter metadata (e.g., signal_strength > 7) here.
    chromosomes = metadata["chromosome"].unique()

    # group_index is threaded through create_parquet so parquet group ids
    # stay unique across chromosomes.
    group_index = 0
    for chrom in tqdm(chromosomes, desc="Processing chromosomes"):
        chrom_dir = os.path.join(args.output, chrom)
        os.makedirs(chrom_dir, exist_ok=True)
        meta_sub = metadata[metadata["chromosome"] == chrom].copy()
        meta_sub.set_index("signal", inplace=True)
        meta_sub.sort_values(by="location_min", inplace=True)
        group_index = create_parquet(meta_sub, group_index, chrom, chrom_dir)

    if args.output_summary:
        pd.DataFrame(parquet_records).to_csv(args.output_summary, sep="\t", index=False)
    print("Done.")

if __name__ == "__main__":
    main()
@@ -0,0 +1,31 @@
1
+ Metadata-Version: 2.4
2
+ Name: gpu-coloc
3
+ Version: 0.1
4
+ Summary: Ultra-fast GPU-enabled Bayesian colocalisation
5
+ Home-page: https://github.com/mjesse-github/gpu-coloc
6
+ Author: Mihkel Jesse
7
+ License: MIT
8
+ Requires-Python: >=3.12
9
+ Requires-Dist: filelock>=3.17.0
10
+ Requires-Dist: fsspec>=2025.2.0
11
+ Requires-Dist: Jinja2>=3.1.5
12
+ Requires-Dist: MarkupSafe>=3.0.2
13
+ Requires-Dist: mpmath>=1.3.0
14
+ Requires-Dist: networkx>=3.4.2
15
+ Requires-Dist: numpy>=2.2.3
16
+ Requires-Dist: pandas>=2.2.3
17
+ Requires-Dist: pyarrow>=19.0.0
18
+ Requires-Dist: python-dateutil>=2.9.0.post0
19
+ Requires-Dist: pytz>=2025.1
20
+ Requires-Dist: six>=1.17.0
21
+ Requires-Dist: sympy>=1.13.1
22
+ Requires-Dist: torch>=2.6.0
23
+ Requires-Dist: tqdm>=4.67.1
24
+ Requires-Dist: typing_extensions>=4.12.2
25
+ Requires-Dist: tzdata>=2025.1
26
+ Dynamic: author
27
+ Dynamic: home-page
28
+ Dynamic: license
29
+ Dynamic: requires-dist
30
+ Dynamic: requires-python
31
+ Dynamic: summary
@@ -0,0 +1,12 @@
1
+ README.md
2
+ setup.py
3
+ gpu_coloc/__init__.py
4
+ gpu_coloc/cli.py
5
+ gpu_coloc/coloc.py
6
+ gpu_coloc/format.py
7
+ gpu_coloc.egg-info/PKG-INFO
8
+ gpu_coloc.egg-info/SOURCES.txt
9
+ gpu_coloc.egg-info/dependency_links.txt
10
+ gpu_coloc.egg-info/entry_points.txt
11
+ gpu_coloc.egg-info/requires.txt
12
+ gpu_coloc.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ gpu-coloc = gpu_coloc.cli:main
@@ -0,0 +1,17 @@
1
+ filelock>=3.17.0
2
+ fsspec>=2025.2.0
3
+ Jinja2>=3.1.5
4
+ MarkupSafe>=3.0.2
5
+ mpmath>=1.3.0
6
+ networkx>=3.4.2
7
+ numpy>=2.2.3
8
+ pandas>=2.2.3
9
+ pyarrow>=19.0.0
10
+ python-dateutil>=2.9.0.post0
11
+ pytz>=2025.1
12
+ six>=1.17.0
13
+ sympy>=1.13.1
14
+ torch>=2.6.0
15
+ tqdm>=4.67.1
16
+ typing_extensions>=4.12.2
17
+ tzdata>=2025.1
@@ -0,0 +1 @@
1
+ gpu_coloc
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
gpu_coloc-0.1/setup.py ADDED
@@ -0,0 +1,39 @@
1
+ from setuptools import setup, find_packages
2
+
3
+
4
# Packaging metadata for gpu-coloc. The dependency minimums mirror
# requirements.txt; the console script dispatches to gpu_coloc.cli:main.
setup(
    name="gpu-coloc",
    version="0.1",
    packages=find_packages(),
    license="MIT",

    description="Ultra-fast GPU-enabled Bayesian colocalisation",
    url="https://github.com/mjesse-github/gpu-coloc",
    author="Mihkel Jesse",

    install_requires=[
        "filelock>=3.17.0",
        "fsspec>=2025.2.0",
        "Jinja2>=3.1.5",
        "MarkupSafe>=3.0.2",
        "mpmath>=1.3.0",
        "networkx>=3.4.2",
        "numpy>=2.2.3",
        "pandas>=2.2.3",
        "pyarrow>=19.0.0",
        "python-dateutil>=2.9.0.post0",
        "pytz>=2025.1",
        "six>=1.17.0",
        "sympy>=1.13.1",
        "torch>=2.6.0",
        "tqdm>=4.67.1",
        "typing_extensions>=4.12.2",
        "tzdata>=2025.1"
    ],
    entry_points={
        "console_scripts": [
            # Installed command: `gpu-coloc` -> gpu_coloc/cli.py main().
            "gpu-coloc = gpu_coloc.cli:main",
        ],
    },
    python_requires=">=3.12",
)
+ )