gpu-coloc 0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpu_coloc-0.1/PKG-INFO +31 -0
- gpu_coloc-0.1/README.md +86 -0
- gpu_coloc-0.1/gpu_coloc/__init__.py +0 -0
- gpu_coloc-0.1/gpu_coloc/cli.py +14 -0
- gpu_coloc-0.1/gpu_coloc/coloc.py +359 -0
- gpu_coloc-0.1/gpu_coloc/format.py +131 -0
- gpu_coloc-0.1/gpu_coloc.egg-info/PKG-INFO +31 -0
- gpu_coloc-0.1/gpu_coloc.egg-info/SOURCES.txt +12 -0
- gpu_coloc-0.1/gpu_coloc.egg-info/dependency_links.txt +1 -0
- gpu_coloc-0.1/gpu_coloc.egg-info/entry_points.txt +2 -0
- gpu_coloc-0.1/gpu_coloc.egg-info/requires.txt +17 -0
- gpu_coloc-0.1/gpu_coloc.egg-info/top_level.txt +1 -0
- gpu_coloc-0.1/setup.cfg +4 -0
- gpu_coloc-0.1/setup.py +39 -0
gpu_coloc-0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gpu-coloc
|
|
3
|
+
Version: 0.1
|
|
4
|
+
Summary: Ultra-fast GPU-enabled Bayesian colocalisation
|
|
5
|
+
Home-page: https://github.com/mjesse-github/gpu-coloc
|
|
6
|
+
Author: Mihkel Jesse
|
|
7
|
+
License: MIT
|
|
8
|
+
Requires-Python: >=3.12
|
|
9
|
+
Requires-Dist: filelock>=3.17.0
|
|
10
|
+
Requires-Dist: fsspec>=2025.2.0
|
|
11
|
+
Requires-Dist: Jinja2>=3.1.5
|
|
12
|
+
Requires-Dist: MarkupSafe>=3.0.2
|
|
13
|
+
Requires-Dist: mpmath>=1.3.0
|
|
14
|
+
Requires-Dist: networkx>=3.4.2
|
|
15
|
+
Requires-Dist: numpy>=2.2.3
|
|
16
|
+
Requires-Dist: pandas>=2.2.3
|
|
17
|
+
Requires-Dist: pyarrow>=19.0.0
|
|
18
|
+
Requires-Dist: python-dateutil>=2.9.0.post0
|
|
19
|
+
Requires-Dist: pytz>=2025.1
|
|
20
|
+
Requires-Dist: six>=1.17.0
|
|
21
|
+
Requires-Dist: sympy>=1.13.1
|
|
22
|
+
Requires-Dist: torch>=2.6.0
|
|
23
|
+
Requires-Dist: tqdm>=4.67.1
|
|
24
|
+
Requires-Dist: typing_extensions>=4.12.2
|
|
25
|
+
Requires-Dist: tzdata>=2025.1
|
|
26
|
+
Dynamic: author
|
|
27
|
+
Dynamic: home-page
|
|
28
|
+
Dynamic: license
|
|
29
|
+
Dynamic: requires-dist
|
|
30
|
+
Dynamic: requires-python
|
|
31
|
+
Dynamic: summary
|
gpu_coloc-0.1/README.md
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# gpu-coloc
|
|
2
|
+
|
|
3
|
+
**gpu-coloc** is a GPU-accelerated implementation of the Bayesian colocalization algorithm (COLOC), providing identical results to R's coloc.bf\_bf at approximately 1000x greater speed.
|
|
4
|
+
|
|
5
|
+
## Citation
|
|
6
|
+
|
|
7
|
+
If you use **gpu-coloc**, please cite: *(citation placeholder)*
|
|
8
|
+
|
|
9
|
+
## Installation
|
|
10
|
+
|
|
11
|
+
Clone the repository:
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
git clone https://github.com/mjesse-github/gpu-coloc
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
### Dependencies
|
|
18
|
+
|
|
19
|
+
Install required Python libraries locally:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
pip install -r requirements.txt
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
Or create a virtual environment using:
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
python3 -m venv coloc_env
|
|
29
|
+
source coloc_env/bin/activate
|
|
30
|
+
pip3 install -r requirements.txt
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
For Linux x64 servers, we recommend using our Singularity container:
|
|
34
|
+
*(Singularity link placeholder)*
|
|
35
|
+
|
|
36
|
+
## Testing Installation
|
|
37
|
+
|
|
38
|
+
Run:
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
bash test.sh
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Workflow
|
|
45
|
+
|
|
46
|
+
Note: The following example assumes gpu-coloc is downloaded into your working directory. Adjust paths accordingly if downloaded elsewhere.
|
|
47
|
+
|
|
48
|
+
Variants must follow a uniform naming convention, as the COLOC algorithm requires consistent naming. Use the format: chr[chromosome]_[position]_[ref]_[alt]. Perform any renaming prior to Step 1 below. We use chromosome X, not 23.
|
|
49
|
+
|
|
50
|
+
1. **Prepare signals and summary files**
|
|
51
|
+
|
|
52
|
+
* **Signals files**: Each signal should be saved in `[signal].pickle` format, containing variants and their respective log Bayes Factors (lbf).
|
|
53
|
+
|
|
54
|
+
Format on which our formatting algorithm works:
|
|
55
|
+
|
|
56
|
+
```
|
|
57
|
+
variant chrX_153412224_C_A chrX_153412528_C_T ...
|
|
58
|
+
lbf -0.060991 -1.508802 ...
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
* **Summary file**: Tab-separated file with the structure below:
|
|
62
|
+
|
|
63
|
+
```
|
|
64
|
+
signal chromosome location_min location_max signal_strength lead_variant
|
|
65
|
+
QTD000141_ENSG00000013563_L1 X 153412224 155341332 12.1069377174147 chrX_154403855_T_G
|
|
66
|
+
...
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Example naming convention:
|
|
70
|
+
|
|
71
|
+
* `gwas_summary.tsv`
|
|
72
|
+
* Signals in directory `gwas_signals/[signal].pickle`
|
|
73
|
+
|
|
74
|
+
Scripts in `summary_and_signals_examples/` are provided as examples, but may require adjustments.
|
|
75
|
+
|
|
76
|
+
2. **Format data:**
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
python3 gpu-coloc/format.py --input [path_to_signals] --input_summary [summary_file] --output [output_folder]
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
3. **Run colocalization:**
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
python3 gpu-coloc/coloc.py --dir1 [formatted_dataset_1] --dir2 [formatted_dataset_2] --results [results_output] --p12 1e-6 --H4 0.8
|
|
86
|
+
```
|
|
File without changes
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from gpu_coloc import coloc, format
|
|
3
|
+
|
|
4
|
+
def main():
    """Dispatch to the coloc or format entry point based on a mode flag.

    Recognised flags (exactly one expected):
      -r / --run     run the colocalisation script (coloc.main)
      -f / --format  run the data-formatting script (format.main)

    The mode flag is removed from sys.argv before delegating so the
    sub-command's own argparse parser never sees it.
    """
    if "-r" in sys.argv or "--run" in sys.argv:
        # Strip whichever spelling was used (explicit if/else instead of a
        # conditional expression evaluated only for its side effect).
        if "-r" in sys.argv:
            sys.argv.remove("-r")
        else:
            sys.argv.remove("--run")
        coloc.main()
    elif "-f" in sys.argv or "--format" in sys.argv:
        if "-f" in sys.argv:
            sys.argv.remove("-f")
        else:
            sys.argv.remove("--format")
        format.main()
    else:
        print("Usage: gpu-coloc [-r|--run] or [-f|--format]")
        print("Use -r or --run to run the coloc script.")
        print("Use -f or --format to run the format script.")
|
|
@@ -0,0 +1,359 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import math
|
|
3
|
+
import os
|
|
4
|
+
import torch
|
|
5
|
+
import pandas as pd
|
|
6
|
+
import numpy as np
|
|
7
|
+
from tqdm import tqdm
|
|
8
|
+
|
|
9
|
+
def logdiff_torch(a, b):
    """Compute log(exp(a) - exp(b)) element-wise, numerically stably.

    The subtraction is done on max-shifted values so the exponentials
    cannot overflow.  Entries where the difference is not positive
    (b >= a) have no real logarithm and come back as NaN.
    """
    mx = torch.maximum(a, b)
    val = torch.exp(a - mx) - torch.exp(b - mx)
    mask = val <= 0
    # Substitute NaN where the log is undefined; torch.log propagates NaN,
    # so the original's extra `out[mask] = nan` assignment was redundant.
    safe = torch.where(mask, torch.full_like(val, float("nan")), val)
    return mx + torch.log(safe)
|
|
16
|
+
|
|
17
|
+
def coloc_bf_bf_torch(
    bf1_cpu, bf2_cpu,
    p1=1e-4, p2=1e-4, p12=5e-6,
    device="mps"
):
    """All-pairs Bayesian colocalisation from two log-Bayes-factor matrices.

    Each row of bf1_cpu / bf2_cpu is one signal, each column one SNP
    (log Bayes factors); a pseudo-column named "null" is excluded if present.
    Posterior probabilities of hypotheses H0..H4 are computed jointly for
    every (row1, row2) pair on the given torch device.

    Returns a dict with:
      summary      -- DataFrame of (idx1, idx2, PP.H3, PP.H4) for all pairs,
                      or a {"nsnps": [nan]} frame when no SNPs are shared
      pp_3d        -- (5, N, K) array of PP(H0..H4) per pair (None if no overlap)
      pp_H4_matrix -- (N, K) array of PP(H4) (None if no overlap)
      priors       -- the priors actually used
    """
    # Promote single-signal Series inputs to one-row DataFrames.
    if isinstance(bf1_cpu, pd.Series):
        bf1_cpu = bf1_cpu.to_frame().T
    if isinstance(bf2_cpu, pd.Series):
        bf2_cpu = bf2_cpu.to_frame().T

    # SNPs common to both datasets; set order is arbitrary but both matrices
    # are subset with the SAME list, so columns stay aligned pairwise.
    isnps = list(set(bf1_cpu.columns).intersection(bf2_cpu.columns) - {"null"})
    if not isnps:
        return {
            "summary": pd.DataFrame({"nsnps": [np.nan]}),
            "pp_3d": None,
            "pp_H4_matrix": None,
            "priors": {"p1": p1, "p2": p2, "p12": p12}
        }

    bf1_arr = torch.tensor(bf1_cpu[isnps].values, dtype=torch.float32, device=device)
    bf2_arr = torch.tensor(bf2_cpu[isnps].values, dtype=torch.float32, device=device)

    # N signals in dataset 1, K in dataset 2, M shared SNPs.
    N, M = bf1_arr.shape
    K, _ = bf2_arr.shape

    # Broadcast to (N, K, M): per-pair, per-SNP sum of log BFs.
    bf1_3d = bf1_arr.unsqueeze(1)
    bf2_3d = bf2_arr.unsqueeze(0)
    sum_3d = bf1_3d + bf2_3d

    # log sum_m exp(lbf1_m + lbf2_m): shared-causal-SNP evidence (H4 term).
    sum_3d_logexp = torch.logsumexp(sum_3d, dim=2)

    # Per-signal evidence for "some causal SNP" in each trait alone.
    l1_sum = torch.logsumexp(bf1_arr, dim=1)
    l2_sum = torch.logsumexp(bf2_arr, dim=1)

    l1_sum_2d = l1_sum.unsqueeze(1).expand(N, K)
    l2_sum_2d = l2_sum.unsqueeze(0).expand(N, K)

    p1_t = torch.tensor(p1, dtype=torch.float32, device=device)
    p2_t = torch.tensor(p2, dtype=torch.float32, device=device)
    p12_t = torch.tensor(p12, dtype=torch.float32, device=device)

    # Unnormalised log posteriors for H0..H4; H0 is the reference at 0.
    lH0_2d = torch.zeros((N, K), device=device)
    lH1_2d = torch.log(p1_t) + l1_sum_2d
    lH2_2d = torch.log(p2_t) + l2_sum_2d
    lH4_2d = torch.log(p12_t) + sum_3d_logexp
    # H3 (two distinct causal SNPs) = all cross pairs minus the same-SNP part.
    lH3_2d = torch.log(p1_t) + torch.log(p2_t) + logdiff_torch(l1_sum_2d + l2_sum_2d, sum_3d_logexp)

    # Normalise over the five hypotheses via logsumexp.
    all_abf_3d = torch.stack([lH0_2d, lH1_2d, lH2_2d, lH3_2d, lH4_2d], dim=0)
    denom_2d = torch.logsumexp(all_abf_3d, dim=0)
    pp_abf_3d = torch.exp(all_abf_3d - denom_2d.unsqueeze(0))

    pp_H3_2d = pp_abf_3d[3]
    pp_H4_2d = pp_abf_3d[4]


    # Flattened (row1, row2) index grids matching the flattened PP matrices.
    i_coords = torch.arange(N, device=device).unsqueeze(1).expand(N, K).flatten()
    j_coords = torch.arange(K, device=device).unsqueeze(0).expand(N, K).flatten()

    pp_H3_flat = pp_H3_2d.flatten()
    pp_H4_flat = pp_H4_2d.flatten()

    i_coords_cpu = i_coords.cpu().numpy()
    j_coords_cpu = j_coords.cpu().numpy()
    pp_H3_cpu = pp_H3_flat.cpu().numpy()
    pp_H4_cpu = pp_H4_flat.cpu().numpy()

    summary_df = pd.DataFrame({
        "idx1": i_coords_cpu,
        "idx2": j_coords_cpu,
        "PP.H3": pp_H3_cpu,
        "PP.H4": pp_H4_cpu,
    })

    return {
        "summary": summary_df,
        "pp_3d": pp_abf_3d.cpu().numpy(),
        "pp_H4_matrix": pp_H4_2d.cpu().numpy(),
        "priors": {"p1": p1, "p2": p2, "p12": p12}
    }
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def logsum(arr):
    """Return log(sum(exp(arr))), computed stably via the max-shift trick."""
    peak = np.max(arr)
    shifted = np.exp(arr - peak)
    return peak + np.log(np.sum(shifted))
|
|
101
|
+
|
|
102
|
+
def logbf_to_pp(df, pi, last_is_null):
    """Convert per-SNP log Bayes factors to per-SNP posterior probabilities.

    df            -- one row per signal, one column per SNP (log BFs); when
                     last_is_null, the final column is the null hypothesis.
    pi            -- scalar prior per SNP, or a full prior vector.
    last_is_null  -- whether df's last column is the null column.

    Returns a DataFrame of posteriors with df's index/columns; each row sums to 1.
    """
    # Number of real (non-null) hypotheses per row.
    n = df.shape[1] - 1 if last_is_null else df.shape[1]

    if isinstance(pi, (int, float)):
        # A scalar prior cannot exceed the uniform prior 1/n.
        if pi > 1 / n:
            pi = 1 / n
        pi = np.append(np.repeat(pi, n), 1 - n * pi) if last_is_null else np.repeat(pi, n)

    if any(pi == 0):
        # Avoid log(0); renormalise so the priors still sum to 1.
        pi[pi == 0] = 1e-16
        pi = pi / np.sum(pi)

    if last_is_null:
        # Express every BF relative to the null column.
        df = df.subtract(df.iloc[:, -1], axis=0)

    priors = np.tile(np.log(pi), (df.shape[0], 1))

    # Row-wise log normalising constant (stable log-sum-exp).
    denom = np.apply_along_axis(logsum, 1, df.values + priors)

    denom_df = pd.DataFrame(np.tile(denom, (df.shape[1], 1)).T, index=df.index, columns=df.columns)

    result = np.exp(df.values + priors - denom_df.values)

    return pd.DataFrame(result, index=df.index, columns=df.columns)
|
|
126
|
+
|
|
127
|
+
def trim(bf1, bf2, p1=1e-4, p2=1e-4, overlap_min=0.5, silent=True):
    """Pre-filter signal pairs by posterior-mass overlap on shared SNPs.

    For each signal (row) of bf1 and bf2, computes the fraction of that
    signal's non-null posterior mass falling on SNPs present in both
    datasets; a pair is dropped when either fraction is below overlap_min.

    Returns a DataFrame with columns 'i', 'j' of surviving row-index pairs,
    or a single-row {'nsnps': [nan]} frame when nothing usable overlaps.
    """
    # Promote single-signal Series inputs to one-row DataFrames.
    if isinstance(bf1, pd.Series):
        bf1 = bf1.to_frame().T
    if isinstance(bf2, pd.Series):
        bf2 = bf2.to_frame().T

    # SNPs shared by both datasets, excluding the pseudo-column 'null'.
    isnps = list(set(bf1.columns).intersection(set(bf2.columns)).difference(['null']))

    if not isnps:
        if not silent:
            print("No common SNPs found.")
        return pd.DataFrame({'nsnps': [np.nan]})

    # Posteriors are computed on the FULL SNP sets (before subsetting) so
    # the overlap fractions are relative to each dataset's own total mass.
    pp1 = logbf_to_pp(bf1, p1, last_is_null=True)
    pp2 = logbf_to_pp(bf2, p2, last_is_null=True)

    bf1 = bf1[isnps]
    bf2 = bf2[isnps]

    # Fraction of non-null posterior mass carried by the shared SNPs, per signal.
    prop1 = pp1[isnps].sum(axis=1) / pp1.loc[:, pp1.columns != "null"].sum(axis=1)

    prop2 = pp2[isnps].sum(axis=1) / pp2.loc[:, pp2.columns != "null"].sum(axis=1)

    # Cartesian product of the two datasets' row indices.
    todo = pd.DataFrame([(i, j) for i in range(bf1.shape[0]) for j in range(bf2.shape[0])], columns=['i', 'j'])

    # A pair is dropped if either side's overlap fraction is too small.
    drop = [prop1[todo['i'][k]] < overlap_min or prop2[todo['j'][k]] < overlap_min for k in range(len(todo))]

    if all(drop):
        if not silent:
            print("Warning: SNP overlap too small between datasets: too few SNPs with high posterior in one trait represented in other")

        return pd.DataFrame({'nsnps': [np.nan]})

    return todo[~pd.Series(drop)].reset_index(drop=True)
|
|
161
|
+
|
|
162
|
+
def _split_chunks(mat, metadata, num_chunks, chunk_size):
    """Slice mat/metadata row-wise into num_chunks aligned pieces of
    chunk_size rows; the final chunk absorbs any remainder."""
    mat_chunks, meta_chunks = [], []
    start = 0
    n_rows = mat.shape[0]
    for c in range(num_chunks):
        end = start + chunk_size if c < (num_chunks - 1) else n_rows
        mat_chunks.append(mat.iloc[start:end, :].copy())
        meta_chunks.append(metadata.iloc[start:end, :].copy())
        start = end
    return mat_chunks, meta_chunks


def coloc_loop(
    mat1: pd.DataFrame,
    mat2: pd.DataFrame,
    metadata1: pd.DataFrame,
    metadata2: pd.DataFrame,
    num_chunks1=0,
    num_chunks2=0,
    device="cuda",
    p1=1e-4, p2=1e-4, p12=1e-6, H4_threshold=0.8
):
    """Colocalise every chunk pair of two lbf matrices and collect hits.

    Rows of mat1/mat2 are signals (lbf columns); metadata1/metadata2 carry
    per-row 'signal' and 'lead_variant' columns aligned positionally.
    Chunks of 100 rows keep the (N x K x M) tensors small enough for GPU
    memory.  Returns a DataFrame of pairs with PP.H4 >= H4_threshold
    (empty on no hits / no overlap).
    """
    try:
        # NOTE(fix): the original passed p12 as trim()'s third positional
        # argument, which is trim's p1 — forward the per-trait priors instead.
        overlapping_pairs = trim(mat1, mat2, p1, p2)
        # KeyError here (no 'i'/'j' columns) means trim found no overlap.
        valid_pairs = set(overlapping_pairs[["i", "j"]].itertuples(index=False, name=None))
    except Exception as exc:  # narrowed from a bare except; keep best-effort flow
        print(f"Possible error in trim function: {exc}")
        return pd.DataFrame()

    if overlapping_pairs.empty:
        return pd.DataFrame()

    chunk_size = 100

    mat1_chunks, _ = _split_chunks(mat1, metadata1, num_chunks1, chunk_size)
    mat2_chunks, _ = _split_chunks(mat2, metadata2, num_chunks2, chunk_size)

    all_results = []

    chunk_pairs = [(i, j) for i in range(num_chunks1) for j in range(num_chunks2)]

    for ci, cj in tqdm(chunk_pairs, desc="All chunk pairs", leave=False):
        out = coloc_bf_bf_torch(
            bf1_cpu=mat1_chunks[ci],
            bf2_cpu=mat2_chunks[cj],
            p1=p1, p2=p2, p12=p12,
            device=device
        )
        if out is None or out["summary"] is None:
            continue

        summary_df = out["summary"]

        # Translate chunk-local row indices back to global row positions
        # (use chunk_size instead of the original duplicated literal 100).
        summary_df.loc[:, "idx1"] = summary_df["idx1"] + ci * chunk_size
        summary_df.loc[:, "idx2"] = summary_df["idx2"] + cj * chunk_size

        # Keep only pairs that survived the overlap pre-filter.
        summary_df = summary_df[summary_df.apply(lambda row: (row["idx1"], row["idx2"]) in valid_pairs, axis=1)]

        summary_df = summary_df[summary_df["PP.H4"] >= H4_threshold].reset_index(drop=True)

        if summary_df.empty:
            continue

        # Attach signal names and lead variants by global row position.
        summary_df["signal1"] = metadata1["signal"].iloc[summary_df["idx1"]].values
        summary_df["lead1"] = metadata1["lead_variant"].iloc[summary_df["idx1"]].values
        summary_df["signal2"] = metadata2["signal"].iloc[summary_df["idx2"]].values
        summary_df["lead2"] = metadata2["lead_variant"].iloc[summary_df["idx2"]].values

        # Drop self-colocalisations.
        summary_df = summary_df[summary_df["signal1"] != summary_df["signal2"]].reset_index(drop=True)

        summary_df.drop(columns=["idx1", "idx2"], inplace=True)

        all_results.append(summary_df)

    return pd.concat(all_results, ignore_index=True) if all_results else pd.DataFrame()
|
|
274
|
+
|
|
275
|
+
def main():
    """CLI entry point: colocalise every chunk file of --dir1 against --dir2.

    Both directories must contain one sub-directory per chromosome holding
    parquet files produced by format.py (first 6 columns metadata, the rest
    per-SNP lbfs).  Pairs whose genomic regions cannot overlap are skipped;
    hits with PP.H4 >= --H4 are appended to the --results TSV.
    """
    parser = argparse.ArgumentParser(description="Run coloc")

    parser.add_argument("--dir1", type=str, required=True, help="First directory of directories of parquet files, e.g., 'formatted_eqtls'.")
    parser.add_argument("--dir2", type=str, required=True, help="Second directory of directories of parquet files, e.g., 'formatted_metabolites'.")
    parser.add_argument("--results", type=str, required=True, help="File to write the colocalization results, e.g., 'results.tsv'.")
    parser.add_argument("--p12", type=float, required=True, help="p12 prior, e.g. 1e-6")
    parser.add_argument("--H4", type=float, required=False, help="Threshold for H4, e.g. 0.8", default=0.8)

    args = parser.parse_args()

    p12 = args.p12
    H4_threshold = args.H4
    # Hoisted out of the inner loop: constant for the whole run.
    output_file = args.results

    # Prefer CUDA, then Apple MPS, then CPU.
    if torch.cuda.is_available():
        device = torch.device("cuda")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")

    for root, dirs, _ in os.walk(args.dir1):
        for directory in tqdm(dirs, desc="chromosomes"):
            dir_path = os.path.join(root, directory)
            met_files = os.listdir(dir_path)

            # Matching chromosome directory in the second dataset.
            ge_dir_path = os.path.join(args.dir2, directory)
            files = os.listdir(ge_dir_path)

            # Read every parquet once up front so the nested loop below
            # never re-reads from disk (removed the unused `combination`
            # list and replaced the fill loops with dict comprehensions).
            met_cache = {i: pd.read_parquet(os.path.join(dir_path, name))
                         for i, name in enumerate(met_files)}
            ge_cache = {i: pd.read_parquet(os.path.join(ge_dir_path, name))
                        for i, name in enumerate(files)}

            for i in tqdm(range(len(met_files)), desc="processing met", leave=False):
                input1 = met_cache[i]
                metadata1 = input1.iloc[:, :6].copy()  # first 6 cols are metadata
                mat1 = input1.iloc[:, 6:].copy()       # remainder are per-SNP lbfs

                min_pos_1 = metadata1['location_min'].min()
                max_pos_1 = metadata1['location_max'].max()

                for j in tqdm(range(len(files)), desc="running files", leave=False):
                    input2 = ge_cache[j]
                    metadata2 = input2.iloc[:, :6].copy()
                    mat2 = input2.iloc[:, 6:].copy()

                    min_pos_2 = metadata2['location_min'].min()
                    max_pos_2 = metadata2['location_max'].max()

                    # Skip file pairs whose genomic regions cannot overlap.
                    if max_pos_1 < min_pos_2 or max_pos_2 < min_pos_1:
                        continue

                    final_results = coloc_loop(
                        mat1=mat1,
                        mat2=mat2,
                        metadata1=metadata1,
                        metadata2=metadata2,
                        num_chunks1=math.ceil(mat1.shape[0] / 100),
                        num_chunks2=math.ceil(mat2.shape[0] / 100),
                        device=device,
                        p1=1e-4,
                        p2=1e-4,
                        p12=p12,
                        H4_threshold=H4_threshold,
                    )

                    if final_results is None or final_results.empty:
                        continue

                    # Write header only on first creation; append afterwards.
                    if not os.path.exists(output_file):
                        final_results.to_csv(output_file, sep="\t", index=False, mode='w', header=True)
                    else:
                        final_results.to_csv(output_file, sep="\t", index=False, mode='a', header=False)

if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import os
|
|
3
|
+
import math
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from tqdm import tqdm
|
|
6
|
+
|
|
7
|
+
parquet_records = []
|
|
8
|
+
signals_dir = None
|
|
9
|
+
|
|
10
|
+
def process_group(meta_group, index, chrom, chrom_dir, group_id=None):
    """Merge one group of per-signal pickles into a single parquet file.

    Reads the pickles named after meta_group's index from the module-global
    `signals_dir`, aligns them on the union of their SNP columns (absent
    SNPs filled with -1e6, i.e. effectively log BF of -infinity), writes
    the combined frame to `chrom_dir`, and appends a bookkeeping record to
    the module-global `parquet_records`.

    Returns index + 1 so callers can thread a running group counter.
    """
    signals = meta_group.index.tolist()
    mat_files = [os.path.join(signals_dir, f"{sig}.pickle") for sig in signals]

    min_loc = meta_group["location_min"].min()
    max_loc = meta_group["location_max"].max() if "location_max" in meta_group.columns else meta_group["location_min"].max()

    # First pass over the pickles: collect the union of SNP columns.
    snp_set = set()
    for mat_file in mat_files:
        df_tmp = pd.read_pickle(mat_file)
        snp_set.update(df_tmp.columns.tolist())
        del df_tmp

    # Metadata columns first, then the sorted SNP columns.
    columns = list(meta_group.columns) + sorted(snp_set)
    combined_df = pd.DataFrame(index=meta_group.index, columns=columns)
    for col in meta_group.columns:
        combined_df[col] = meta_group[col]
    # Sentinel for "SNP absent from this signal" (log BF ~ -inf).
    combined_df.iloc[:, len(meta_group.columns):] = -1e6

    # Second pass: fill each signal's row with fast positional writes into
    # the underlying numpy array instead of per-cell DataFrame assignment.
    combined_array = combined_df.to_numpy()
    snp_columns = {snp: idx for idx, snp in enumerate(combined_df.columns[len(meta_group.columns):], start=len(meta_group.columns))}

    for mat_file in mat_files:
        signal_name = os.path.splitext(os.path.basename(mat_file))[0]
        df_mat = pd.read_pickle(mat_file)
        row_idx = combined_df.index.get_loc(signal_name)
        # NOTE(review): assumes each pickle holds a single row of lbfs
        # (only df_mat.iloc[0] is read) — confirm against the producer.
        for snp_col, value in zip(df_mat.columns, df_mat.iloc[0].values):
            if snp_col in snp_columns:
                combined_array[row_idx, snp_columns[snp_col]] = value
        del df_mat

    combined_df = pd.DataFrame(combined_array, index=combined_df.index, columns=combined_df.columns)
    combined_df.reset_index(inplace=True)

    # File name encodes either the explicit group id or the running index + region.
    if group_id is not None:
        parquet_filename = f"chr{chrom}_group_{group_id}.parquet"
    else:
        parquet_filename = f"chr{chrom}_met_group_{index}_region_{min_loc}-{max_loc}.parquet"
    parquet_path = os.path.join(chrom_dir, parquet_filename)

    combined_df.to_parquet(parquet_path, engine="pyarrow")
    parquet_records.append({
        "chromosome": chrom,
        "group": group_id if group_id is not None else index,
        "n_signals": combined_df.shape[0],
        "min_position": min_loc,
        "max_position": max_loc,
        "parquet_file": parquet_path
    })

    return index + 1
|
|
61
|
+
|
|
62
|
+
def create_parquet(meta_sub, index, chrom, chrom_dir):
    """Recursively partition a chromosome's signals and write parquet groups.

    Splitting rules, applied in order:
      1. if the largest gap between consecutive location_min values exceeds
         1,000,000 bp, split at that gap and recurse on each half;
      2. otherwise, if more than 1000 signals remain, write them in fixed
         chunks of 1000;
      3. otherwise write the whole set as one group via process_group().

    Returns the updated running group index.
    """
    meta_sub.sort_values(by="location_min", inplace=True)
    positions = meta_sub["location_min"].tolist()

    if len(positions) >= 2:
        positions_sorted = sorted(positions)
        # Locate the single largest gap between adjacent positions.
        max_gap, i1, j1 = 0, 0, 0
        for i in range(len(positions_sorted) - 1):
            gap = positions_sorted[i + 1] - positions_sorted[i]
            if gap > max_gap:
                max_gap, i1, j1 = gap, i, i + 1
        if max_gap > 1_000_000:
            # Split on either side of the gap; the halves are disjoint since
            # split_point_1 < split_point_2 are adjacent sorted values.
            split_point_1 = positions_sorted[i1]
            split_point_2 = positions_sorted[j1]
            df_part1 = meta_sub[meta_sub["location_min"] <= split_point_1].copy()
            df_part2 = meta_sub[meta_sub["location_min"] >= split_point_2].copy()
            index = create_parquet(df_part1, index, chrom, chrom_dir)
            index = create_parquet(df_part2, index, chrom, chrom_dir)
            return index

    if len(meta_sub) > 1000:
        # Too many signals for one parquet file: emit fixed-size chunks.
        signals = meta_sub.index.tolist()
        total = len(signals)
        chunk_size = 1000
        n_groups = math.ceil(total / chunk_size)
        for group in range(n_groups):
            start = group * chunk_size
            end = min(start + chunk_size, total)
            meta_group = meta_sub.loc[signals[start:end]].copy()
            index = process_group(meta_group, index, chrom, chrom_dir, group_id=index)
        return index

    index = process_group(meta_sub, index, chrom, chrom_dir, group_id=index)
    return index
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def main():
    """CLI entry point: convert per-signal pickle files into grouped parquets.

    Reads the summary TSV (--input_summary), groups signals per chromosome,
    and delegates the recursive gap-splitting / chunking to create_parquet().
    Optionally writes a summary of the produced parquet files
    (--output_summary) from the module-global `parquet_records`.
    """
    parser = argparse.ArgumentParser(
        # Fixed description: the code splits at gaps > 1,000,000 bp
        # (create_parquet), not 500k as the original message claimed.
        description="Process signals with recursive gap splitting (>1,000,000 bp) and chunking (max 1000 signals)"
    )
    parser.add_argument("--input", type=str, required=True, help="Directory containing signal pickle files")
    parser.add_argument("--output", type=str, required=True, help="Directory to save parquet files")
    parser.add_argument("--input_summary", type=str, required=True, help="Path to summary TSV file")
    parser.add_argument("--output_summary", type=str, help="Path to write parquet summary TSV")
    args = parser.parse_args()

    # process_group() locates the pickles through this module-level global.
    global signals_dir
    signals_dir = args.input

    os.makedirs(args.output, exist_ok=True)
    metadata = pd.read_csv(args.input_summary, sep="\t")
    # Normalise so 'X' and numeric chromosomes compare consistently.
    metadata["chromosome"] = metadata["chromosome"].astype(str)
    # Optionally, filter metadata (e.g., signal_strength > 7) here.
    chromosomes = metadata["chromosome"].unique()

    # Group ids run consecutively across chromosomes.
    group_index = 0
    for chrom in tqdm(chromosomes, desc="Processing chromosomes"):
        chrom_dir = os.path.join(args.output, chrom)
        os.makedirs(chrom_dir, exist_ok=True)
        meta_sub = metadata[metadata["chromosome"] == chrom].copy()
        meta_sub.set_index("signal", inplace=True)
        meta_sub.sort_values(by="location_min", inplace=True)
        group_index = create_parquet(meta_sub, group_index, chrom, chrom_dir)

    if args.output_summary:
        pd.DataFrame(parquet_records).to_csv(args.output_summary, sep="\t", index=False)
    print("Done.")

if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gpu-coloc
|
|
3
|
+
Version: 0.1
|
|
4
|
+
Summary: Ultra-fast GPU-enabled Bayesian colocalisation
|
|
5
|
+
Home-page: https://github.com/mjesse-github/gpu-coloc
|
|
6
|
+
Author: Mihkel Jesse
|
|
7
|
+
License: MIT
|
|
8
|
+
Requires-Python: >=3.12
|
|
9
|
+
Requires-Dist: filelock>=3.17.0
|
|
10
|
+
Requires-Dist: fsspec>=2025.2.0
|
|
11
|
+
Requires-Dist: Jinja2>=3.1.5
|
|
12
|
+
Requires-Dist: MarkupSafe>=3.0.2
|
|
13
|
+
Requires-Dist: mpmath>=1.3.0
|
|
14
|
+
Requires-Dist: networkx>=3.4.2
|
|
15
|
+
Requires-Dist: numpy>=2.2.3
|
|
16
|
+
Requires-Dist: pandas>=2.2.3
|
|
17
|
+
Requires-Dist: pyarrow>=19.0.0
|
|
18
|
+
Requires-Dist: python-dateutil>=2.9.0.post0
|
|
19
|
+
Requires-Dist: pytz>=2025.1
|
|
20
|
+
Requires-Dist: six>=1.17.0
|
|
21
|
+
Requires-Dist: sympy>=1.13.1
|
|
22
|
+
Requires-Dist: torch>=2.6.0
|
|
23
|
+
Requires-Dist: tqdm>=4.67.1
|
|
24
|
+
Requires-Dist: typing_extensions>=4.12.2
|
|
25
|
+
Requires-Dist: tzdata>=2025.1
|
|
26
|
+
Dynamic: author
|
|
27
|
+
Dynamic: home-page
|
|
28
|
+
Dynamic: license
|
|
29
|
+
Dynamic: requires-dist
|
|
30
|
+
Dynamic: requires-python
|
|
31
|
+
Dynamic: summary
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
setup.py
|
|
3
|
+
gpu_coloc/__init__.py
|
|
4
|
+
gpu_coloc/cli.py
|
|
5
|
+
gpu_coloc/coloc.py
|
|
6
|
+
gpu_coloc/format.py
|
|
7
|
+
gpu_coloc.egg-info/PKG-INFO
|
|
8
|
+
gpu_coloc.egg-info/SOURCES.txt
|
|
9
|
+
gpu_coloc.egg-info/dependency_links.txt
|
|
10
|
+
gpu_coloc.egg-info/entry_points.txt
|
|
11
|
+
gpu_coloc.egg-info/requires.txt
|
|
12
|
+
gpu_coloc.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
filelock>=3.17.0
|
|
2
|
+
fsspec>=2025.2.0
|
|
3
|
+
Jinja2>=3.1.5
|
|
4
|
+
MarkupSafe>=3.0.2
|
|
5
|
+
mpmath>=1.3.0
|
|
6
|
+
networkx>=3.4.2
|
|
7
|
+
numpy>=2.2.3
|
|
8
|
+
pandas>=2.2.3
|
|
9
|
+
pyarrow>=19.0.0
|
|
10
|
+
python-dateutil>=2.9.0.post0
|
|
11
|
+
pytz>=2025.1
|
|
12
|
+
six>=1.17.0
|
|
13
|
+
sympy>=1.13.1
|
|
14
|
+
torch>=2.6.0
|
|
15
|
+
tqdm>=4.67.1
|
|
16
|
+
typing_extensions>=4.12.2
|
|
17
|
+
tzdata>=2025.1
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
gpu_coloc
|
gpu_coloc-0.1/setup.cfg
ADDED
gpu_coloc-0.1/setup.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from setuptools import setup, find_packages


# Packaging metadata for the gpu-coloc distribution.  Exposes one console
# command, `gpu-coloc`, mapped to gpu_coloc.cli:main.
setup(
    name="gpu-coloc",
    version="0.1",
    packages=find_packages(),
    license="MIT",

    description="Ultra-fast GPU-enabled Bayesian colocalisation",
    url="https://github.com/mjesse-github/gpu-coloc",
    author="Mihkel Jesse",

    # Pinned transitive closure (torch + pandas stacks) rather than just
    # direct dependencies.
    install_requires=[
        "filelock>=3.17.0",
        "fsspec>=2025.2.0",
        "Jinja2>=3.1.5",
        "MarkupSafe>=3.0.2",
        "mpmath>=1.3.0",
        "networkx>=3.4.2",
        "numpy>=2.2.3",
        "pandas>=2.2.3",
        "pyarrow>=19.0.0",
        "python-dateutil>=2.9.0.post0",
        "pytz>=2025.1",
        "six>=1.17.0",
        "sympy>=1.13.1",
        "torch>=2.6.0",
        "tqdm>=4.67.1",
        "typing_extensions>=4.12.2",
        "tzdata>=2025.1"
    ],
    entry_points={
        "console_scripts": [
            "gpu-coloc = gpu_coloc.cli:main",
        ],
    },
    python_requires=">=3.12",
)
|