gpu-coloc 0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gpu_coloc-0.1/PKG-INFO ADDED
@@ -0,0 +1,31 @@
1
+ Metadata-Version: 2.4
2
+ Name: gpu-coloc
3
+ Version: 0.1
4
+ Summary: Ultra-fast GPU-enabled Bayesian colocalisation
5
+ Home-page: https://github.com/mjesse-github/gpu-coloc
6
+ Author: Mihkel Jesse
7
+ License: MIT
8
+ Requires-Python: >=3.12
9
+ Requires-Dist: filelock>=3.17.0
10
+ Requires-Dist: fsspec>=2025.2.0
11
+ Requires-Dist: Jinja2>=3.1.5
12
+ Requires-Dist: MarkupSafe>=3.0.2
13
+ Requires-Dist: mpmath>=1.3.0
14
+ Requires-Dist: networkx>=3.4.2
15
+ Requires-Dist: numpy>=2.2.3
16
+ Requires-Dist: pandas>=2.2.3
17
+ Requires-Dist: pyarrow>=19.0.0
18
+ Requires-Dist: python-dateutil>=2.9.0.post0
19
+ Requires-Dist: pytz>=2025.1
20
+ Requires-Dist: six>=1.17.0
21
+ Requires-Dist: sympy>=1.13.1
22
+ Requires-Dist: torch>=2.6.0
23
+ Requires-Dist: tqdm>=4.67.1
24
+ Requires-Dist: typing_extensions>=4.12.2
25
+ Requires-Dist: tzdata>=2025.1
26
+ Dynamic: author
27
+ Dynamic: home-page
28
+ Dynamic: license
29
+ Dynamic: requires-dist
30
+ Dynamic: requires-python
31
+ Dynamic: summary
@@ -0,0 +1,86 @@
1
+ # gpu-coloc
2
+
3
+ **gpu-coloc** is a GPU-accelerated implementation of the Bayesian colocalization algorithm (COLOC), providing identical results to R's coloc.bf\_bf at approximately 1000x greater speed.
4
+
5
+ ## Citation
6
+
7
+ If you use **gpu-coloc**, please cite: *(citation placeholder)*
8
+
9
+ ## Installation
10
+
11
+ Clone the repository:
12
+
13
+ ```bash
14
+ git clone https://github.com/mjesse-github/gpu-coloc
15
+ ```
16
+
17
+ ### Dependencies
18
+
19
+ Install required Python libraries locally:
20
+
21
+ ```bash
22
+ pip install -r requirements.txt
23
+ ```
24
+
25
+ Or create a virtual environment using:
26
+
27
+ ```bash
28
+ python3 -m venv coloc_env
29
+ source coloc_env/bin/activate
30
+ pip3 install -r requirements.txt
31
+ ```
32
+
33
+ For Linux x64 servers, we recommend using our Singularity container:
34
+ *(Singularity link placeholder)*
35
+
36
+ ## Testing Installation
37
+
38
+ Run:
39
+
40
+ ```bash
41
+ bash test.sh
42
+ ```
43
+
44
+ ## Workflow
45
+
46
+ Note: The following example assumes gpu-coloc is downloaded into your working directory. Adjust paths accordingly if downloaded elsewhere.
47
+
48
+ Variants must follow a uniform naming convention, because the COLOC algorithm matches variants across datasets by name. Use the format: chr[chromosome]_[position]_[ref]_[alt]. Perform any renaming before Step 1 below. Note that the X chromosome is encoded as "X", not "23".
49
+
50
+ 1. **Prepare signals and summary files**
51
+
52
+ * **Signals files**: Each signal should be saved in `[signal].pickle` format, containing variants and their respective log Bayes Factors (lbf).
53
+
54
+ Format on which our formatting algorithm works:
55
+
56
+ ```
57
+ variant chrX_153412224_C_A chrX_153412528_C_T ...
58
+ lbf -0.060991 -1.508802 ...
59
+ ```
60
+
61
+ * **Summary file**: Tab-separated file with the structure below:
62
+
63
+ ```
64
+ signal chromosome location_min location_max signal_strength lead_variant
65
+ QTD000141_ENSG00000013563_L1 X 153412224 155341332 12.1069377174147 chrX_154403855_T_G
66
+ ...
67
+ ```
68
+
69
+ Example naming convention:
70
+
71
+ * `gwas_summary.tsv`
72
+ * Signals in directory `gwas_signals/[signal].pickle`
73
+
74
+ Scripts in `summary_and_signals_examples/` are provided as examples, but may require adjustments.
75
+
76
+ 2. **Format data:**
77
+
78
+ ```bash
79
+ python3 gpu-coloc/format.py --input [path_to_signals] --input_summary [summary_file] --output [output_folder]
80
+ ```
81
+
82
+ 3. **Run colocalization:**
83
+
84
+ ```bash
85
+ python3 gpu-coloc/coloc.py --dir1 [formatted_dataset_1] --dir2 [formatted_dataset_2] --results [results_output] --p12 1e-6 --H4 0.8
86
+ ```
File without changes
@@ -0,0 +1,14 @@
1
+ import sys
2
+ from gpu_coloc import coloc, format
3
+
4
def main():
    """Console entry point: dispatch to the coloc or format sub-script.

    Recognised flags (removed from sys.argv before dispatch so the
    sub-script's own argparse never sees them):
      -r / --run     run the colocalisation script (coloc.main)
      -f / --format  run the dataset formatting script (format.main)
    """
    if "-r" in sys.argv or "--run" in sys.argv:
        # Strip every alias, not just the first one found.
        for flag in ("-r", "--run"):
            if flag in sys.argv:
                sys.argv.remove(flag)
        coloc.main()
    elif "-f" in sys.argv or "--format" in sys.argv:
        for flag in ("-f", "--format"):
            if flag in sys.argv:
                sys.argv.remove(flag)
        format.main()
    else:
        print("Usage: gpu-coloc [-r|--run] or [-f|--format]")
        print("Use -r or --run to run the coloc script.")
        print("Use -f or --format to run the format script.")
@@ -0,0 +1,359 @@
1
+ import argparse
2
+ import math
3
+ import os
4
+ import torch
5
+ import pandas as pd
6
+ import numpy as np
7
+ from tqdm import tqdm
8
+
9
def logdiff_torch(a, b):
    """Elementwise, numerically stable log(exp(a) - exp(b)).

    Returns NaN wherever a <= b (the difference has no real logarithm).
    """
    mx = torch.maximum(a, b)
    diff = torch.exp(a - mx) - torch.exp(b - mx)
    # Non-positive differences are mapped to NaN before the log; the
    # original also re-assigned NaN afterwards, which was redundant.
    nan_fill = torch.full_like(diff, float("nan"))
    return mx + torch.log(torch.where(diff > 0, diff, nan_fill))
16
+
17
def coloc_bf_bf_torch(
    bf1_cpu, bf2_cpu,
    p1=1e-4, p2=1e-4, p12=5e-6,
    device="mps"
):
    """All-pairs Bayesian colocalisation from log Bayes factors (coloc.bf_bf).

    Parameters
    ----------
    bf1_cpu, bf2_cpu : pd.DataFrame or pd.Series
        Rows are signals, columns are variant IDs, values are log BFs.
        A Series is treated as a single-row frame.
    p1, p2, p12 : float
        Priors for trait-1-only, trait-2-only and shared causal variants.
    device : str
        torch device for the batched computation.

    Returns
    -------
    dict with keys "summary" (long-form per-pair PP.H3/PP.H4 with chunk-local
    row indices idx1/idx2), "pp_3d" (5 x N x K posteriors for H0..H4),
    "pp_H4_matrix" (N x K) and "priors". When there are no shared variants,
    "summary" is a one-row frame with a NaN 'nsnps' column.
    """
    if isinstance(bf1_cpu, pd.Series):
        bf1_cpu = bf1_cpu.to_frame().T
    if isinstance(bf2_cpu, pd.Series):
        bf2_cpu = bf2_cpu.to_frame().T

    # Shared variants only; sorted so the column order is deterministic
    # (bare set iteration order varies between runs).
    isnps = sorted(set(bf1_cpu.columns).intersection(bf2_cpu.columns) - {"null"})
    if not isnps:
        return {
            "summary": pd.DataFrame({"nsnps": [np.nan]}),
            "pp_3d": None,
            "pp_H4_matrix": None,
            "priors": {"p1": p1, "p2": p2, "p12": p12}
        }

    bf1_arr = torch.tensor(bf1_cpu[isnps].values, dtype=torch.float32, device=device)
    bf2_arr = torch.tensor(bf2_cpu[isnps].values, dtype=torch.float32, device=device)

    N = bf1_arr.shape[0]
    K = bf2_arr.shape[0]

    # log sum_s exp(bf1[i,s] + bf2[k,s]): evidence that i and k share
    # a single causal variant (numerator of H4).
    sum_3d_logexp = torch.logsumexp(bf1_arr.unsqueeze(1) + bf2_arr.unsqueeze(0), dim=2)

    # Per-signal total evidence across all shared variants.
    l1_sum = torch.logsumexp(bf1_arr, dim=1)
    l2_sum = torch.logsumexp(bf2_arr, dim=1)
    l1_sum_2d = l1_sum.unsqueeze(1).expand(N, K)
    l2_sum_2d = l2_sum.unsqueeze(0).expand(N, K)

    p1_t = torch.tensor(p1, dtype=torch.float32, device=device)
    p2_t = torch.tensor(p2, dtype=torch.float32, device=device)
    p12_t = torch.tensor(p12, dtype=torch.float32, device=device)

    # Unnormalised log posteriors for the five coloc hypotheses.
    lH0_2d = torch.zeros((N, K), device=device)
    lH1_2d = torch.log(p1_t) + l1_sum_2d
    lH2_2d = torch.log(p2_t) + l2_sum_2d
    lH4_2d = torch.log(p12_t) + sum_3d_logexp
    # H3 (two distinct causal variants) = all variant pairs minus the
    # same-variant pairs, in log space.
    lH3_2d = torch.log(p1_t) + torch.log(p2_t) + logdiff_torch(l1_sum_2d + l2_sum_2d, sum_3d_logexp)

    all_abf_3d = torch.stack([lH0_2d, lH1_2d, lH2_2d, lH3_2d, lH4_2d], dim=0)
    denom_2d = torch.logsumexp(all_abf_3d, dim=0)
    pp_abf_3d = torch.exp(all_abf_3d - denom_2d.unsqueeze(0))

    pp_H3_2d = pp_abf_3d[3]
    pp_H4_2d = pp_abf_3d[4]

    # Flatten the (N, K) grid into long form for the summary table.
    i_coords = torch.arange(N, device=device).unsqueeze(1).expand(N, K).flatten()
    j_coords = torch.arange(K, device=device).unsqueeze(0).expand(N, K).flatten()

    summary_df = pd.DataFrame({
        "idx1": i_coords.cpu().numpy(),
        "idx2": j_coords.cpu().numpy(),
        "PP.H3": pp_H3_2d.flatten().cpu().numpy(),
        "PP.H4": pp_H4_2d.flatten().cpu().numpy(),
    })

    return {
        "summary": summary_df,
        "pp_3d": pp_abf_3d.cpu().numpy(),
        "pp_H4_matrix": pp_H4_2d.cpu().numpy(),
        "priors": {"p1": p1, "p2": p2, "p12": p12}
    }
96
+
97
+
98
def logsum(arr):
    """Numerically stable log(sum(exp(arr))) for a 1-D array."""
    peak = np.max(arr)
    return peak + np.log(np.exp(arr - peak).sum())
101
+
102
def logbf_to_pp(df, pi, last_is_null):
    """Convert per-row log Bayes factors into posterior probabilities.

    Parameters
    ----------
    df : pd.DataFrame
        Rows are signals, columns are variants; when last_is_null is True
        the final column holds the null-model log BF and all BFs are taken
        relative to it.
    pi : float or array-like
        Prior probability per variant. A scalar is capped at 1/n and
        expanded, with the null prior 1 - n*pi appended when applicable.
    last_is_null : bool
        Whether the final column is the null model.

    Returns
    -------
    pd.DataFrame with df's index/columns; each row sums to 1.
    """
    n = df.shape[1] - 1 if last_is_null else df.shape[1]

    if isinstance(pi, (int, float)):
        if pi > 1 / n:
            pi = 1 / n
        pi = np.append(np.repeat(pi, n), 1 - n * pi) if last_is_null else np.repeat(pi, n)
    else:
        # Copy so a caller-supplied prior array is never mutated in place
        # (the original wrote 1e-16 back into the caller's array).
        pi = np.asarray(pi, dtype=float).copy()

    if np.any(pi == 0):
        pi[pi == 0] = 1e-16
        pi = pi / np.sum(pi)

    if last_is_null:
        # Express everything relative to the null model.
        df = df.subtract(df.iloc[:, -1], axis=0)

    scores = df.values + np.log(pi)

    # Row-wise stable log-sum-exp normalisation, fully vectorised
    # (replaces the per-row np.apply_along_axis(logsum, ...) pass).
    peak = scores.max(axis=1, keepdims=True)
    denom = peak + np.log(np.exp(scores - peak).sum(axis=1, keepdims=True))

    return pd.DataFrame(np.exp(scores - denom), index=df.index, columns=df.columns)
126
+
127
def trim(bf1, bf2, p1=1e-4, p2=1e-4, overlap_min=0.5, silent=True):
    """Enumerate row pairs (i, j) of bf1 x bf2 with sufficient shared posterior mass.

    A pair is dropped when either signal carries less than `overlap_min` of
    its non-null posterior probability on the variants common to both inputs.
    Returns a DataFrame with columns 'i' and 'j', or a one-row frame with a
    NaN 'nsnps' column when there is no usable overlap.
    """
    if isinstance(bf1, pd.Series):
        bf1 = bf1.to_frame().T
    if isinstance(bf2, pd.Series):
        bf2 = bf2.to_frame().T

    shared = list(set(bf1.columns).intersection(set(bf2.columns)).difference(['null']))

    if not shared:
        if not silent:
            print("No common SNPs found.")
        return pd.DataFrame({'nsnps': [np.nan]})

    # Posteriors are computed on the full column sets, before subsetting.
    pp1 = logbf_to_pp(bf1, p1, last_is_null=True)
    pp2 = logbf_to_pp(bf2, p2, last_is_null=True)

    bf1 = bf1[shared]
    bf2 = bf2[shared]

    # Fraction of each signal's non-null posterior mass on the shared variants.
    prop1 = pp1[shared].sum(axis=1) / pp1.loc[:, pp1.columns != "null"].sum(axis=1)
    prop2 = pp2[shared].sum(axis=1) / pp2.loc[:, pp2.columns != "null"].sum(axis=1)

    pairs = [(i, j) for i in range(bf1.shape[0]) for j in range(bf2.shape[0])]
    todo = pd.DataFrame(pairs, columns=['i', 'j'])

    drop = [prop1[i] < overlap_min or prop2[j] < overlap_min for i, j in pairs]

    if all(drop):
        if not silent:
            print("Warning: SNP overlap too small between datasets: too few SNPs with high posterior in one trait represented in other")

        return pd.DataFrame({'nsnps': [np.nan]})

    return todo[~pd.Series(drop)].reset_index(drop=True)
161
+
162
def coloc_loop(
    mat1: pd.DataFrame,
    mat2: pd.DataFrame,
    metadata1: pd.DataFrame,
    metadata2: pd.DataFrame,
    num_chunks1=0,
    num_chunks2=0,
    device="cuda",
    p1=1e-4, p2=1e-4, p12=1e-6, H4_threshold=0.8
):
    """Run chunked all-vs-all colocalisation between two lbf matrices.

    mat1/mat2 hold log Bayes factors (rows = signals, columns = variants);
    metadata1/metadata2 carry "signal" and "lead_variant" columns aligned
    positionally with the matrix rows. Rows are processed in chunks of 100
    to bound device memory. Returns a DataFrame of cross-dataset pairs with
    PP.H4 >= H4_threshold (empty on no overlap or error).
    """
    # Must match the chunk-local -> global index offset applied below.
    chunk_size = 100

    try:
        # Overlap filtering uses the per-trait priors; the original passed
        # p12 positionally into trim's p1 slot.
        overlapping_pairs = trim(mat1, mat2, p1, p2)
        # When trim reports no overlap it returns an 'nsnps' frame without
        # i/j columns; the resulting KeyError is caught and treated as empty.
        valid_pairs = set(overlapping_pairs[["i", "j"]].itertuples(index=False, name=None))
    except Exception as exc:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit escape.
        print(f"Possible error in trim function: {exc}")
        return pd.DataFrame()

    if overlapping_pairs.empty:
        return pd.DataFrame()

    def _split(frame, n_chunks):
        # Slice into n_chunks pieces of chunk_size rows; the final piece
        # absorbs the remainder.
        pieces, start = [], 0
        for k in range(n_chunks):
            end = start + chunk_size if k < n_chunks - 1 else len(frame)
            pieces.append(frame.iloc[start:end, :].copy())
            start = end
        return pieces

    # Metadata is looked up via global row positions below, so only the
    # matrices need chunking (the original also chunked metadata, unused).
    mat1_chunks = _split(mat1, num_chunks1)
    mat2_chunks = _split(mat2, num_chunks2)

    all_results = []
    chunk_pairs = [(i, j) for i in range(num_chunks1) for j in range(num_chunks2)]

    for ci, cj in tqdm(chunk_pairs, desc="All chunk pairs", leave=False):
        out = coloc_bf_bf_torch(
            bf1_cpu=mat1_chunks[ci],
            bf2_cpu=mat2_chunks[cj],
            p1=p1, p2=p2, p12=p12,
            device=device
        )
        if out is None or out["summary"] is None:
            continue

        summary_df = out["summary"]
        if "idx1" not in summary_df.columns:
            # No shared variants for this chunk pair (nsnps-only summary).
            continue

        # Translate chunk-local row indices to global row indices.
        summary_df.loc[:, "idx1"] = summary_df["idx1"] + ci * chunk_size
        summary_df.loc[:, "idx2"] = summary_df["idx2"] + cj * chunk_size

        # Membership test against trim's surviving pairs; zip is far cheaper
        # than a row-wise DataFrame.apply.
        keep = [
            (a, b) in valid_pairs
            for a, b in zip(summary_df["idx1"], summary_df["idx2"])
        ]
        summary_df = summary_df[keep]
        summary_df = summary_df[summary_df["PP.H4"] >= H4_threshold].reset_index(drop=True)

        if summary_df.empty:
            continue

        # Attach signal names and lead variants via global row positions.
        summary_df["signal1"] = metadata1["signal"].iloc[summary_df["idx1"]].values
        summary_df["lead1"] = metadata1["lead_variant"].iloc[summary_df["idx1"]].values
        summary_df["signal2"] = metadata2["signal"].iloc[summary_df["idx2"]].values
        summary_df["lead2"] = metadata2["lead_variant"].iloc[summary_df["idx2"]].values

        # Self-colocalisation is meaningless; drop identical signal pairs.
        summary_df = summary_df[summary_df["signal1"] != summary_df["signal2"]].reset_index(drop=True)
        summary_df.drop(columns=["idx1", "idx2"], inplace=True)

        all_results.append(summary_df)

    return pd.concat(all_results, ignore_index=True) if all_results else pd.DataFrame()
274
+
275
def main():
    """CLI entry point: pairwise colocalisation between two formatted datasets.

    Expects --dir1/--dir2 to contain one sub-directory per chromosome, each
    holding parquet files produced by format.py (6 metadata columns followed
    by per-variant lbf columns). Appends qualifying pairs to --results.
    """
    parser = argparse.ArgumentParser(description="Run coloc")

    parser.add_argument("--dir1", type=str, required=True, help="First directory of directories of parquet files, e.g., 'formatted_eqtls'.")
    parser.add_argument("--dir2", type=str, required=True, help="Second directory of directories of parquet files, e.g., 'formatted_metabolites'.")
    parser.add_argument("--results", type=str, required=True, help="File to write the colocalization results, e.g., 'results.tsv'.")
    parser.add_argument("--p12", type=float, required=True, help="p12 prior, e.g. 1e-6")
    parser.add_argument("--H4", type=float, required=False, help="Threshold for H4, e.g. 0.8", default=0.8)

    args = parser.parse_args()

    p12 = args.p12
    H4_threshold = args.H4
    output_file = args.results

    # Prefer CUDA, then Apple MPS, then CPU.
    if torch.cuda.is_available():
        device = torch.device("cuda")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")

    # Only the top level of dir1 holds chromosome sub-directories; deeper
    # walk levels have empty `dirs` and contribute nothing.
    for root, dirs, _ in os.walk(args.dir1):
        for directory in tqdm(dirs, desc="chromosomes"):
            dir_path = os.path.join(root, directory)
            met_files = os.listdir(dir_path)

            ge_dir_path = os.path.join(args.dir2, directory)
            ge_files = os.listdir(ge_dir_path)

            # Load every parquet for this chromosome once up front.
            met_cache = {i: pd.read_parquet(os.path.join(dir_path, name))
                         for i, name in enumerate(met_files)}
            ge_cache = {i: pd.read_parquet(os.path.join(ge_dir_path, name))
                        for i, name in enumerate(ge_files)}

            for i in tqdm(range(len(met_files)), desc="processing met", leave=False):
                input1 = met_cache[i]
                metadata1 = input1.iloc[:, :6].copy()
                mat1 = input1.iloc[:, 6:].copy()

                min_pos_1 = metadata1['location_min'].min()
                max_pos_1 = metadata1['location_max'].max()

                for j in tqdm(range(len(ge_files)), desc="running files", leave=False):
                    input2 = ge_cache[j]
                    metadata2 = input2.iloc[:, :6].copy()
                    mat2 = input2.iloc[:, 6:].copy()

                    min_pos_2 = metadata2['location_min'].min()
                    max_pos_2 = metadata2['location_max'].max()

                    # Skip file pairs whose genomic regions cannot overlap.
                    if max_pos_1 < min_pos_2 or max_pos_2 < min_pos_1:
                        continue

                    final_results = coloc_loop(
                        mat1=mat1,
                        mat2=mat2,
                        metadata1=metadata1,
                        metadata2=metadata2,
                        num_chunks1=math.ceil(mat1.shape[0]/100),
                        num_chunks2=math.ceil(mat2.shape[0]/100),
                        device=device,
                        p1=1e-4,
                        p2=1e-4,
                        p12=p12,
                        H4_threshold=H4_threshold,
                    )

                    if final_results is None or final_results.empty:
                        continue

                    # Write header only when creating the file; append after.
                    if not os.path.exists(output_file):
                        final_results.to_csv(output_file, sep="\t", index=False, mode='w', header=True)
                    else:
                        final_results.to_csv(output_file, sep="\t", index=False, mode='a', header=False)

if __name__ == "__main__":
    main()
@@ -0,0 +1,131 @@
1
+ import argparse
2
+ import os
3
+ import math
4
+ import pandas as pd
5
+ from tqdm import tqdm
6
+
7
# One record per parquet file written; dumped to --output_summary at the end.
parquet_records = []
# Directory of per-signal pickle files; set from --input in main().
signals_dir = None
9
+
10
def process_group(meta_group, index, chrom, chrom_dir, group_id=None):
    """Assemble one group of signals into a dense parquet matrix on disk.

    Builds a (signals x variants) lbf matrix over the union of each signal's
    variants (missing entries filled with -1e6), prefixed with the metadata
    columns, writes it to `chrom_dir`, and records the file in the
    module-level `parquet_records` list. Returns the next group index.
    """
    signal_names = meta_group.index.tolist()
    pickle_paths = [os.path.join(signals_dir, f"{name}.pickle") for name in signal_names]

    min_loc = meta_group["location_min"].min()
    if "location_max" in meta_group.columns:
        max_loc = meta_group["location_max"].max()
    else:
        max_loc = meta_group["location_min"].max()

    # First pass: union of variant columns across all signals in the group.
    # Frames are loaded one at a time and released to bound memory.
    all_snps = set()
    for path in pickle_paths:
        frame = pd.read_pickle(path)
        all_snps.update(frame.columns.tolist())
        del frame

    meta_cols = list(meta_group.columns)
    combined_df = pd.DataFrame(index=meta_group.index, columns=meta_cols + sorted(all_snps))
    for col in meta_cols:
        combined_df[col] = meta_group[col]
    combined_df.iloc[:, len(meta_cols):] = -1e6

    dense = combined_df.to_numpy()
    snp_to_col = {
        snp: pos
        for pos, snp in enumerate(combined_df.columns[len(meta_cols):], start=len(meta_cols))
    }

    # Second pass: scatter each signal's lbf values into its matrix row.
    for path in pickle_paths:
        name = os.path.splitext(os.path.basename(path))[0]
        frame = pd.read_pickle(path)
        row = combined_df.index.get_loc(name)
        for snp, value in zip(frame.columns, frame.iloc[0].values):
            if snp in snp_to_col:
                dense[row, snp_to_col[snp]] = value
        del frame

    combined_df = pd.DataFrame(dense, index=combined_df.index, columns=combined_df.columns)
    combined_df.reset_index(inplace=True)

    if group_id is not None:
        parquet_filename = f"chr{chrom}_group_{group_id}.parquet"
    else:
        parquet_filename = f"chr{chrom}_met_group_{index}_region_{min_loc}-{max_loc}.parquet"
    parquet_path = os.path.join(chrom_dir, parquet_filename)

    combined_df.to_parquet(parquet_path, engine="pyarrow")
    parquet_records.append({
        "chromosome": chrom,
        "group": group_id if group_id is not None else index,
        "n_signals": combined_df.shape[0],
        "min_position": min_loc,
        "max_position": max_loc,
        "parquet_file": parquet_path
    })

    return index + 1
61
+
62
def create_parquet(meta_sub, index, chrom, chrom_dir):
    """Recursively split signals at large genomic gaps, then emit parquet groups.

    Splits the group at the widest location gap while it exceeds 1 Mb, then
    chunks anything larger than 1000 signals before handing each piece to
    process_group(). Returns the updated running group index.
    """
    meta_sub.sort_values(by="location_min", inplace=True)
    positions = meta_sub["location_min"].tolist()

    if len(positions) >= 2:
        ordered = sorted(positions)
        # Locate the single widest gap between consecutive positions.
        widest = 0
        left = right = 0
        for k in range(len(ordered) - 1):
            gap = ordered[k + 1] - ordered[k]
            if gap > widest:
                widest, left, right = gap, k, k + 1
        if widest > 1_000_000:
            lower_bound = ordered[left]
            upper_bound = ordered[right]
            lower_part = meta_sub[meta_sub["location_min"] <= lower_bound].copy()
            upper_part = meta_sub[meta_sub["location_min"] >= upper_bound].copy()
            index = create_parquet(lower_part, index, chrom, chrom_dir)
            index = create_parquet(upper_part, index, chrom, chrom_dir)
            return index

    if len(meta_sub) > 1000:
        signals = meta_sub.index.tolist()
        step = 1000
        for start in range(0, len(signals), step):
            subset = meta_sub.loc[signals[start:start + step]].copy()
            index = process_group(subset, index, chrom, chrom_dir, group_id=index)
        return index

    return process_group(meta_sub, index, chrom, chrom_dir, group_id=index)
96
+
97
+
98
def main():
    """CLI entry point: convert per-signal pickle files into grouped parquet files.

    Reads the summary TSV (--input_summary), groups signals by chromosome,
    and writes one parquet per group under --output/<chromosome>/ via
    create_parquet(). Optionally writes a manifest of the written parquet
    files to --output_summary.
    """
    parser = argparse.ArgumentParser(
        # NOTE(review): this text says ">500k" but create_parquet splits on
        # gaps larger than 1,000,000 bp — confirm which value is intended.
        description="Process signals with recursive gap splitting (>500k) and chunking (max 1000 signals)"
    )
    parser.add_argument("--input", type=str, required=True, help="Directory containing signal pickle files")
    parser.add_argument("--output", type=str, required=True, help="Directory to save parquet files")
    parser.add_argument("--input_summary", type=str, required=True, help="Path to summary TSV file")
    parser.add_argument("--output_summary", type=str, help="Path to write parquet summary TSV")
    args = parser.parse_args()

    # process_group() reads the pickles from this module-level directory.
    global signals_dir
    signals_dir = args.input

    os.makedirs(args.output, exist_ok=True)
    metadata = pd.read_csv(args.input_summary, sep="\t")
    # Normalise so numeric and "X" chromosome labels compare consistently.
    metadata["chromosome"] = metadata["chromosome"].astype(str)
    # Optionally, filter metadata (e.g., signal_strength > 7) here.
    chromosomes = metadata["chromosome"].unique()

    # group_index is threaded through create_parquet so parquet group ids
    # stay unique across chromosomes.
    group_index = 0
    for chrom in tqdm(chromosomes, desc="Processing chromosomes"):
        chrom_dir = os.path.join(args.output, chrom)
        os.makedirs(chrom_dir, exist_ok=True)
        meta_sub = metadata[metadata["chromosome"] == chrom].copy()
        meta_sub.set_index("signal", inplace=True)
        meta_sub.sort_values(by="location_min", inplace=True)
        group_index = create_parquet(meta_sub, group_index, chrom, chrom_dir)

    if args.output_summary:
        pd.DataFrame(parquet_records).to_csv(args.output_summary, sep="\t", index=False)
    print("Done.")

if __name__ == "__main__":
    main()
@@ -0,0 +1,31 @@
1
+ Metadata-Version: 2.4
2
+ Name: gpu-coloc
3
+ Version: 0.1
4
+ Summary: Ultra-fast GPU-enabled Bayesian colocalisation
5
+ Home-page: https://github.com/mjesse-github/gpu-coloc
6
+ Author: Mihkel Jesse
7
+ License: MIT
8
+ Requires-Python: >=3.12
9
+ Requires-Dist: filelock>=3.17.0
10
+ Requires-Dist: fsspec>=2025.2.0
11
+ Requires-Dist: Jinja2>=3.1.5
12
+ Requires-Dist: MarkupSafe>=3.0.2
13
+ Requires-Dist: mpmath>=1.3.0
14
+ Requires-Dist: networkx>=3.4.2
15
+ Requires-Dist: numpy>=2.2.3
16
+ Requires-Dist: pandas>=2.2.3
17
+ Requires-Dist: pyarrow>=19.0.0
18
+ Requires-Dist: python-dateutil>=2.9.0.post0
19
+ Requires-Dist: pytz>=2025.1
20
+ Requires-Dist: six>=1.17.0
21
+ Requires-Dist: sympy>=1.13.1
22
+ Requires-Dist: torch>=2.6.0
23
+ Requires-Dist: tqdm>=4.67.1
24
+ Requires-Dist: typing_extensions>=4.12.2
25
+ Requires-Dist: tzdata>=2025.1
26
+ Dynamic: author
27
+ Dynamic: home-page
28
+ Dynamic: license
29
+ Dynamic: requires-dist
30
+ Dynamic: requires-python
31
+ Dynamic: summary
@@ -0,0 +1,12 @@
1
+ README.md
2
+ setup.py
3
+ gpu_coloc/__init__.py
4
+ gpu_coloc/cli.py
5
+ gpu_coloc/coloc.py
6
+ gpu_coloc/format.py
7
+ gpu_coloc.egg-info/PKG-INFO
8
+ gpu_coloc.egg-info/SOURCES.txt
9
+ gpu_coloc.egg-info/dependency_links.txt
10
+ gpu_coloc.egg-info/entry_points.txt
11
+ gpu_coloc.egg-info/requires.txt
12
+ gpu_coloc.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ gpu-coloc = gpu_coloc.cli:main
@@ -0,0 +1,17 @@
1
+ filelock>=3.17.0
2
+ fsspec>=2025.2.0
3
+ Jinja2>=3.1.5
4
+ MarkupSafe>=3.0.2
5
+ mpmath>=1.3.0
6
+ networkx>=3.4.2
7
+ numpy>=2.2.3
8
+ pandas>=2.2.3
9
+ pyarrow>=19.0.0
10
+ python-dateutil>=2.9.0.post0
11
+ pytz>=2025.1
12
+ six>=1.17.0
13
+ sympy>=1.13.1
14
+ torch>=2.6.0
15
+ tqdm>=4.67.1
16
+ typing_extensions>=4.12.2
17
+ tzdata>=2025.1
@@ -0,0 +1 @@
1
+ gpu_coloc
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
gpu_coloc-0.1/setup.py ADDED
@@ -0,0 +1,39 @@
1
+ from setuptools import setup, find_packages
2
+
3
+
4
# Packaging metadata for gpu-coloc. The dependency minimums mirror
# requirements.txt; the console script dispatches to gpu_coloc.cli:main.
setup(
    name="gpu-coloc",
    version="0.1",
    packages=find_packages(),
    license="MIT",

    description="Ultra-fast GPU-enabled Bayesian colocalisation",
    url="https://github.com/mjesse-github/gpu-coloc",
    author="Mihkel Jesse",

    install_requires=[
        "filelock>=3.17.0",
        "fsspec>=2025.2.0",
        "Jinja2>=3.1.5",
        "MarkupSafe>=3.0.2",
        "mpmath>=1.3.0",
        "networkx>=3.4.2",
        "numpy>=2.2.3",
        "pandas>=2.2.3",
        "pyarrow>=19.0.0",
        "python-dateutil>=2.9.0.post0",
        "pytz>=2025.1",
        "six>=1.17.0",
        "sympy>=1.13.1",
        "torch>=2.6.0",
        "tqdm>=4.67.1",
        "typing_extensions>=4.12.2",
        "tzdata>=2025.1"
    ],
    entry_points={
        "console_scripts": [
            # Installed command: `gpu-coloc` -> gpu_coloc/cli.py main().
            "gpu-coloc = gpu_coloc.cli:main",
        ],
    },
    python_requires=">=3.12",
)
+ )