gsMap 1.73.2__py3-none-any.whl → 1.73.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gsMap/generate_ldscore.py CHANGED
@@ -13,12 +13,11 @@ from pathlib import Path
13
13
  import numpy as np
14
14
  import pandas as pd
15
15
  import pyranges as pr
16
- import zarr
17
16
  from scipy.sparse import csr_matrix
18
17
  from tqdm import trange
19
18
 
20
19
  from gsMap.config import GenerateLDScoreConfig
21
- from gsMap.utils.generate_r2_matrix import getBlockLefts, load_bfile
20
+ from gsMap.utils.generate_r2_matrix import PlinkBEDFile
22
21
 
23
22
  # Configure warning behavior more precisely
24
23
  warnings.filterwarnings("ignore", category=FutureWarning, module="pandas")
@@ -57,7 +56,8 @@ def load_gtf(
57
56
  gtf = gtf[gtf["Feature"] == "gene"]
58
57
 
59
58
  # Find common genes between GTF and marker scores
60
- common_gene = np.intersect1d(mk_score.index, gtf.gene_name)
59
+ # common_gene = np.intersect1d(mk_score.index, gtf.gene_name)
60
+ common_gene = list(set(mk_score.index) & set(gtf.gene_name))
61
61
  logger.info(f"Found {len(common_gene)} common genes between GTF and marker scores")
62
62
 
63
63
  # Filter GTF and marker scores to common genes
@@ -69,6 +69,9 @@ def load_gtf(
69
69
 
70
70
  # Process the GTF (open window around gene coordinates)
71
71
  gtf_bed = gtf[["Chromosome", "Start", "End", "gene_name", "Strand"]].copy()
72
+ gtf_bed["Chromosome"] = gtf_bed["Chromosome"].apply(
73
+ lambda x: f"chr{x}" if not str(x).startswith("chr") else x
74
+ )
72
75
  gtf_bed.loc[:, "TSS"] = gtf_bed["Start"]
73
76
  gtf_bed.loc[:, "TED"] = gtf_bed["End"]
74
77
 
@@ -109,44 +112,6 @@ def load_marker_score(mk_score_file: str) -> pd.DataFrame:
109
112
  return mk_score
110
113
 
111
114
 
112
- def load_bim(bfile_root: str, chrom: int) -> tuple[pd.DataFrame, pr.PyRanges]:
113
- """
114
- Load PLINK BIM file and convert to a PyRanges object.
115
-
116
- Parameters
117
- ----------
118
- bfile_root : str
119
- Root path for PLINK bfiles
120
- chrom : int
121
- Chromosome number
122
-
123
- Returns
124
- -------
125
- tuple
126
- A tuple containing (bim_df, bim_pr) where:
127
- - bim_df is a pandas DataFrame with BIM data
128
- - bim_pr is a PyRanges object with BIM data
129
- """
130
- bim_file = f"{bfile_root}.{chrom}.bim"
131
- logger.debug(f"Loading BIM file: {bim_file}")
132
-
133
- bim = pd.read_csv(bim_file, sep="\t", header=None)
134
- bim.columns = ["CHR", "SNP", "CM", "BP", "A1", "A2"]
135
-
136
- # Convert to PyRanges
137
- bim_pr = bim.copy()
138
- bim_pr.columns = ["Chromosome", "SNP", "CM", "Start", "A1", "A2"]
139
-
140
- # Adjust coordinates (BIM is 1-based, PyRanges uses 0-based)
141
- bim_pr["End"] = bim_pr["Start"].copy()
142
- bim_pr["Start"] = bim_pr["Start"] - 1
143
-
144
- bim_pr = pr.PyRanges(bim_pr)
145
- bim_pr.Chromosome = f"chr{chrom}"
146
-
147
- return bim, bim_pr
148
-
149
-
150
115
  def overlaps_gtf_bim(gtf_pr: pr.PyRanges, bim_pr: pr.PyRanges) -> pd.DataFrame:
151
116
  """
152
117
  Find overlaps between GTF and BIM data, and select nearest gene for each SNP.
@@ -176,282 +141,13 @@ def overlaps_gtf_bim(gtf_pr: pr.PyRanges, bim_pr: pr.PyRanges) -> pd.DataFrame:
176
141
  return nearest_genes
177
142
 
178
143
 
179
- def filter_snps_by_keep_snp(bim_df: pd.DataFrame, keep_snp_file: str) -> pd.DataFrame:
180
- """
181
- Filter BIM DataFrame to keep only SNPs in a provided list.
182
-
183
- Parameters
184
- ----------
185
- bim_df : pd.DataFrame
186
- DataFrame with BIM data
187
- keep_snp_file : str
188
- Path to a file with SNP IDs to keep
189
-
190
- Returns
191
- -------
192
- pd.DataFrame
193
- Filtered BIM DataFrame
194
- """
195
- # Read SNPs to keep
196
- keep_snp = pd.read_csv(keep_snp_file, header=None)[0].to_list()
197
-
198
- # Filter the BIM DataFrame
199
- filtered_bim_df = bim_df[bim_df["SNP"].isin(keep_snp)]
200
-
201
- logger.info(f"Kept {len(filtered_bim_df)} SNPs out of {len(bim_df)} after filtering")
202
-
203
- return filtered_bim_df
204
-
205
-
206
- def get_snp_counts(config: GenerateLDScoreConfig) -> dict:
207
- """
208
- Count SNPs per chromosome and calculate start positions for zarr arrays.
209
-
210
- Parameters
211
- ----------
212
- config : GenerateLDScoreConfig
213
- Configuration object
214
-
215
- Returns
216
- -------
217
- dict
218
- Dictionary with SNP counts and start positions
219
- """
220
- snp_counts = {}
221
- total_snp = 0
222
-
223
- for chrom in range(1, 23):
224
- bim_df, _ = load_bim(config.bfile_root, chrom)
225
-
226
- if config.keep_snp_root:
227
- keep_snp_file = f"{config.keep_snp_root}.{chrom}.snp"
228
- filtered_bim_df = filter_snps_by_keep_snp(bim_df, keep_snp_file)
229
- else:
230
- filtered_bim_df = bim_df
231
-
232
- snp_counts[chrom] = filtered_bim_df.shape[0]
233
- total_snp += snp_counts[chrom]
234
-
235
- snp_counts["total"] = total_snp
236
-
237
- # Calculate cumulative SNP counts for zarr array indexing
238
- chrom_snp_length_array = np.array([snp_counts[chrom] for chrom in range(1, 23)]).cumsum()
239
- snp_counts["chrom_snp_start_point"] = [0] + chrom_snp_length_array.tolist()
240
-
241
- return snp_counts
242
-
243
-
244
- def get_snp_pass_maf(bfile_root: str, chrom: int, maf_min: float = 0.05) -> list[str]:
245
- """
246
- Get SNPs that pass the minimum minor allele frequency (MAF) threshold.
247
-
248
- Parameters
249
- ----------
250
- bfile_root : str
251
- Root path for PLINK bfiles
252
- chrom : int
253
- Chromosome number
254
- maf_min : float, optional
255
- Minimum MAF threshold, by default 0.05
256
-
257
- Returns
258
- -------
259
- list
260
- List of SNP IDs that pass the MAF threshold
261
- """
262
- array_snps, array_indivs, geno_array = load_bfile(
263
- bfile_chr_prefix=f"{bfile_root}.{chrom}", mafMin=maf_min
264
- )
265
-
266
- m = len(array_snps.IDList)
267
- n = len(array_indivs.IDList)
268
- logger.info(
269
- f"Loading genotype data for {m} SNPs and {n} individuals from {bfile_root}.{chrom}"
270
- )
271
-
272
- # Filter SNPs by MAF
273
- snp_pass_maf = array_snps.IDList.iloc[geno_array.kept_snps]
274
- logger.info(f"After filtering SNPs with MAF < {maf_min}, {len(snp_pass_maf)} SNPs remain")
275
-
276
- return snp_pass_maf.SNP.to_list()
277
-
278
-
279
- def get_ldscore(
280
- bfile_root: str,
281
- chrom: int,
282
- annot_matrix: np.ndarray,
283
- ld_wind: float,
284
- ld_unit: str = "CM",
285
- keep_snps_index: list[int] = None,
286
- ) -> pd.DataFrame:
287
- """
288
- Calculate LD scores using PLINK data and an annotation matrix.
289
-
290
- Parameters
291
- ----------
292
- bfile_root : str
293
- Root path for PLINK bfiles
294
- chrom : int
295
- Chromosome number
296
- annot_matrix : np.ndarray
297
- Annotation matrix
298
- ld_wind : float
299
- LD window size
300
- ld_unit : str, optional
301
- Unit for the LD window, by default "CM"
302
- keep_snps_index : list[int], optional
303
- Indices of SNPs to keep, by default None
304
-
305
- Returns
306
- -------
307
- pd.DataFrame
308
- DataFrame with calculated LD scores
309
- """
310
- array_snps, array_indivs, geno_array = load_bfile(
311
- bfile_chr_prefix=f"{bfile_root}.{chrom}", keep_snps=keep_snps_index
312
- )
313
-
314
- # Configure LD window based on specified unit
315
- if ld_unit == "SNP":
316
- max_dist = ld_wind
317
- coords = np.array(range(geno_array.m))
318
- elif ld_unit == "KB":
319
- max_dist = ld_wind * 1000
320
- coords = np.array(array_snps.df["BP"])[geno_array.kept_snps]
321
- elif ld_unit == "CM":
322
- max_dist = ld_wind
323
- coords = np.array(array_snps.df["CM"])[geno_array.kept_snps]
324
- # Check if the CM is all 0
325
- if np.all(coords == 0):
326
- logger.warning(
327
- "All CM values are 0 in the BIM file. Using 1MB window size for LD score calculation."
328
- )
329
- max_dist = 1_000_000
330
- coords = np.array(array_snps.df["BP"])[geno_array.kept_snps]
331
- else:
332
- raise ValueError(f"Invalid ld_wind_unit: {ld_unit}. Must be one of: SNP, KB, CM")
333
-
334
- # Calculate blocks for LD computation
335
- block_left = getBlockLefts(coords, max_dist)
336
- assert block_left.sum() > 0, "Invalid window size, please check the ld_wind parameter."
337
-
338
- # Calculate LD scores
339
- ld_scores = pd.DataFrame(geno_array.ldScoreVarBlocks(block_left, 100, annot=annot_matrix))
340
-
341
- return ld_scores
342
-
343
-
344
- def calculate_ldscore_from_annotation(
345
- snp_annotation_df: pd.DataFrame,
346
- chrom: int,
347
- bfile_root: str,
348
- ld_wind: float = 1,
349
- ld_unit: str = "CM",
350
- ) -> pd.DataFrame:
351
- """
352
- Calculate LD scores from SNP annotation DataFrame.
353
-
354
- Parameters
355
- ----------
356
- snp_annotation_df : pd.DataFrame
357
- DataFrame with SNP annotations
358
- chrom : int
359
- Chromosome number
360
- bfile_root : str
361
- Root path for PLINK bfiles
362
- ld_wind : float, optional
363
- LD window size, by default 1
364
- ld_unit : str, optional
365
- Unit for the LD window, by default "CM"
366
-
367
- Returns
368
- -------
369
- pd.DataFrame
370
- DataFrame with calculated LD scores
371
- """
372
- # Calculate LD scores
373
- snp_gene_weight_matrix = get_ldscore(
374
- bfile_root, chrom, snp_annotation_df.values, ld_wind=ld_wind, ld_unit=ld_unit
375
- )
376
-
377
- # Set proper data types and indices
378
- snp_gene_weight_matrix = snp_gene_weight_matrix.astype(np.float32, copy=False)
379
- snp_gene_weight_matrix.index = snp_annotation_df.index
380
- snp_gene_weight_matrix.columns = snp_annotation_df.columns
381
-
382
- return snp_gene_weight_matrix
383
-
384
-
385
- def calculate_ldscore_from_multiple_annotation(
386
- snp_annotation_df_list: list[pd.DataFrame],
387
- chrom: int,
388
- bfile_root: str,
389
- ld_wind: float = 1,
390
- ld_unit: str = "CM",
391
- ) -> list[pd.DataFrame]:
392
- """
393
- Calculate LD scores from multiple SNP annotation DataFrames.
394
-
395
- Parameters
396
- ----------
397
- snp_annotation_df_list : list
398
- List of DataFrames with SNP annotations
399
- chrom : int
400
- Chromosome number
401
- bfile_root : str
402
- Root path for PLINK bfiles
403
- ld_wind : float, optional
404
- LD window size, by default 1
405
- ld_unit : str, optional
406
- Unit for the LD window, by default "CM"
407
-
408
- Returns
409
- -------
410
- list
411
- List of DataFrames with calculated LD scores
412
- """
413
- # Combine annotations
414
- combined_annotations = pd.concat(snp_annotation_df_list, axis=1).astype(np.float32, copy=False)
415
-
416
- # Calculate LD scores
417
- combined_ld_scores = get_ldscore(
418
- bfile_root, chrom, combined_annotations.values, ld_wind=ld_wind, ld_unit=ld_unit
419
- )
420
-
421
- # Apply proper indices and columns
422
- combined_ld_scores.index = combined_annotations.index
423
- combined_ld_scores.columns = combined_annotations.columns
424
-
425
- # Split back into separate DataFrames
426
- annotation_lengths = [len(df.columns) for df in snp_annotation_df_list]
427
- result_dataframes = []
428
- start_col = 0
429
-
430
- for length in annotation_lengths:
431
- end_col = start_col + length
432
- result_dataframes.append(combined_ld_scores.iloc[:, start_col:end_col])
433
- start_col = end_col
434
-
435
- return result_dataframes
436
-
437
-
438
144
  class LDScoreCalculator:
439
145
  """
440
146
  Class for calculating LD scores from gene specificity scores.
441
-
442
- This class handles the assignment of gene specificity scores to SNPs
443
- and the calculation of LD scores.
444
147
  """
445
148
 
446
149
  def __init__(self, config: GenerateLDScoreConfig):
447
- """
448
- Initialize LDScoreCalculator.
449
-
450
- Parameters
451
- ----------
452
- config : GenerateLDScoreConfig
453
- Configuration object
454
- """
150
+ """Initialize LDScoreCalculator."""
455
151
  self.config = config
456
152
  self.validate_config()
457
153
 
@@ -466,9 +162,6 @@ class LDScoreCalculator:
466
162
  # Initialize enhancer data if provided
467
163
  self.enhancer_pr = self._initialize_enhancer() if config.enhancer_annotation_file else None
468
164
 
469
- # Initialize zarr file if needed
470
- self._initialize_zarr_if_needed()
471
-
472
165
  def validate_config(self):
473
166
  """Validate configuration parameters."""
474
167
  if not Path(self.config.mkscore_feather_path).exists():
@@ -519,33 +212,6 @@ class LDScoreCalculator:
519
212
  # Convert to PyRanges
520
213
  return pr.PyRanges(enhancer_df.reset_index())
521
214
 
522
- def _initialize_zarr_if_needed(self):
523
- """Initialize zarr file if zarr format is specified."""
524
- if self.config.ldscore_save_format == "zarr":
525
- chrom_snp_length_dict = get_snp_counts(self.config)
526
- self.chrom_snp_start_point = chrom_snp_length_dict["chrom_snp_start_point"]
527
-
528
- zarr_path = (
529
- Path(self.config.ldscore_save_dir) / f"{self.config.sample_name}.ldscore.zarr"
530
- )
531
-
532
- if not zarr_path.exists():
533
- self.zarr_file = zarr.open(
534
- zarr_path.as_posix(),
535
- mode="a",
536
- dtype=np.float16,
537
- chunks=self.config.zarr_chunk_size,
538
- shape=(chrom_snp_length_dict["total"], self.mk_score_common.shape[1]),
539
- )
540
- zarr_path.parent.mkdir(parents=True, exist_ok=True)
541
-
542
- # Save metadata
543
- self.zarr_file.attrs["spot_names"] = self.mk_score_common.columns.to_list()
544
- self.zarr_file.attrs["chrom_snp_start_point"] = self.chrom_snp_start_point
545
-
546
- else:
547
- self.zarr_file = zarr.open(zarr_path.as_posix(), mode="a")
548
-
549
215
  def process_chromosome(self, chrom: int):
550
216
  """
551
217
  Process a single chromosome to calculate LD scores.
@@ -557,35 +223,42 @@ class LDScoreCalculator:
557
223
  """
558
224
  logger.info(f"Processing chromosome {chrom}")
559
225
 
560
- # Get SNPs passing MAF filter
561
- self.snp_pass_maf = get_snp_pass_maf(self.config.bfile_root, chrom, maf_min=0.05)
226
+ # Initialize PlinkBEDFile once for this chromosome
227
+ plink_bed = PlinkBEDFile(f"{self.config.bfile_root}.{chrom}")
228
+
229
+ # Get SNPs passing MAF filter using built-in method
230
+ self.snp_pass_maf = plink_bed.get_snps_by_maf(0.05)
562
231
 
563
232
  # Get SNP-gene dummy pairs
564
- self.snp_gene_pair_dummy = self._get_snp_gene_dummy(chrom)
233
+ self.snp_gene_pair_dummy = self._get_snp_gene_dummy(chrom, plink_bed)
565
234
 
566
235
  # Apply SNP filter if provided
567
236
  self._apply_snp_filter(chrom)
568
237
 
569
238
  # Process additional baseline annotations if provided
570
239
  if self.config.additional_baseline_annotation:
571
- self._process_additional_baseline(chrom)
240
+ self._process_additional_baseline(chrom, plink_bed)
572
241
  else:
573
- # Calculate SNP-gene weight matrix
574
- self.snp_gene_weight_matrix = calculate_ldscore_from_annotation(
575
- self.snp_gene_pair_dummy,
576
- chrom,
577
- self.config.bfile_root,
242
+ # Calculate SNP-gene weight matrix using built-in methods
243
+ ld_scores = plink_bed.get_ldscore(
244
+ annot_matrix=self.snp_gene_pair_dummy.values,
578
245
  ld_wind=self.config.ld_wind,
579
246
  ld_unit=self.config.ld_unit,
580
247
  )
581
248
 
249
+ self.snp_gene_weight_matrix = pd.DataFrame(
250
+ ld_scores,
251
+ index=self.snp_gene_pair_dummy.index,
252
+ columns=self.snp_gene_pair_dummy.columns,
253
+ )
254
+
582
255
  # Apply SNP filter if needed
583
256
  if self.keep_snp_mask is not None:
584
257
  self.snp_gene_weight_matrix = self.snp_gene_weight_matrix[self.keep_snp_mask]
585
258
 
586
259
  # Generate w_ld file if keep_snp_root is provided
587
260
  if self.config.keep_snp_root:
588
- self._generate_w_ld(chrom)
261
+ self._generate_w_ld(chrom, plink_bed)
589
262
 
590
263
  # Save pre-calculated SNP-gene weight matrix if requested
591
264
  self._save_snp_gene_weight_matrix_if_needed(chrom)
@@ -596,16 +269,16 @@ class LDScoreCalculator:
596
269
 
597
270
  # Calculate baseline LD scores
598
271
  logger.info(f"Calculating baseline LD scores for chr{chrom}")
599
- self._calculate_baseline_ldscores(chrom)
272
+ self._calculate_baseline_ldscores(chrom, plink_bed)
600
273
 
601
274
  # Calculate LD scores for annotation
602
275
  logger.info(f"Calculating annotation LD scores for chr{chrom}")
603
- self._calculate_annotation_ldscores(chrom)
276
+ self._calculate_annotation_ldscores(chrom, plink_bed)
604
277
 
605
278
  # Clear memory
606
279
  self._clear_memory()
607
280
 
608
- def _generate_w_ld(self, chrom: int):
281
+ def _generate_w_ld(self, chrom: int, plink_bed):
609
282
  """
610
283
  Generate w_ld file for the chromosome using filtered SNPs.
611
284
 
@@ -613,6 +286,8 @@ class LDScoreCalculator:
613
286
  ----------
614
287
  chrom : int
615
288
  Chromosome number
289
+ plink_bed : PlinkBEDFile
290
+ Initialized PlinkBEDFile object
616
291
  """
617
292
  if not self.config.keep_snp_root:
618
293
  logger.info(
@@ -622,48 +297,38 @@ class LDScoreCalculator:
622
297
 
623
298
  logger.info(f"Generating w_ld for chr{chrom}")
624
299
 
625
- # Get the indices of SNPs to keep based on the keep_snp_mask
626
- keep_snps_index = np.nonzero(self.keep_snp_mask)[0]
300
+ # Get the indices of SNPs to keep based on the keep_snp
301
+ keep_snps_indices = plink_bed.bim_df[
302
+ plink_bed.bim_df.SNP.isin(self.snp_name)
303
+ ].index.tolist()
627
304
 
628
305
  # Create a simple unit annotation (all ones) for the filtered SNPs
629
- unit_annotation = np.ones((len(keep_snps_index), 1))
306
+ unit_annotation = np.ones((len(keep_snps_indices), 1))
630
307
 
631
- # Calculate LD scores using the filtered SNPs
632
- w_ld_scores = get_ldscore(
633
- self.config.bfile_root,
634
- chrom,
635
- unit_annotation,
308
+ # Calculate LD scores
309
+ w_ld_scores = plink_bed.get_ldscore(
310
+ annot_matrix=unit_annotation,
636
311
  ld_wind=self.config.ld_wind,
637
312
  ld_unit=self.config.ld_unit,
638
- keep_snps_index=keep_snps_index.tolist(),
639
- )
640
-
641
- # Load the BIM file to get SNP information
642
- bim_data = pd.read_csv(
643
- f"{self.config.bfile_root}.{chrom}.bim",
644
- sep="\t",
645
- header=None,
646
- names=["CHR", "SNP", "CM", "BP", "A1", "A2"],
313
+ keep_snps_index=keep_snps_indices,
647
314
  )
648
315
 
649
- # Get SNP names for the kept indices
650
- kept_snp_names = bim_data.iloc[keep_snps_index].SNP.tolist()
651
-
652
316
  # Create the w_ld DataFrame
317
+ bim_subset = plink_bed.bim_df.loc[keep_snps_indices]
653
318
  w_ld_df = pd.DataFrame(
654
319
  {
655
- "SNP": kept_snp_names,
656
- "L2": w_ld_scores.values.flatten(),
657
- "CHR": bim_data.iloc[keep_snps_index].CHR.values,
658
- "BP": bim_data.iloc[keep_snps_index].BP.values,
659
- "CM": bim_data.iloc[keep_snps_index].CM.values,
320
+ "SNP": bim_subset.SNP,
321
+ "L2": w_ld_scores.flatten(),
322
+ "CHR": bim_subset.CHR,
323
+ "BP": bim_subset.BP,
324
+ "CM": bim_subset.CM,
660
325
  }
661
326
  )
662
327
 
663
328
  # Reorder columns
664
329
  w_ld_df = w_ld_df[["CHR", "SNP", "BP", "CM", "L2"]]
665
330
 
666
- # Save to feather format
331
+ # Save to file
667
332
  w_ld_dir = Path(self.config.ldscore_save_dir) / "w_ld"
668
333
  w_ld_dir.mkdir(parents=True, exist_ok=True)
669
334
  w_ld_file = w_ld_dir / f"weights.{chrom}.l2.ldscore.gz"
@@ -693,7 +358,7 @@ class LDScoreCalculator:
693
358
  logger.info(f"Using all {len(self.snp_name)} SNPs (no filter applied)")
694
359
  logger.warning("No keep_snp_root provided, all SNPs will be used to calculate w_ld.")
695
360
 
696
- def _process_additional_baseline(self, chrom: int):
361
+ def _process_additional_baseline(self, chrom: int, plink_bed):
697
362
  """
698
363
  Process additional baseline annotations.
699
364
 
@@ -701,6 +366,8 @@ class LDScoreCalculator:
701
366
  ----------
702
367
  chrom : int
703
368
  Chromosome number
369
+ plink_bed : PlinkBEDFile
370
+ Initialized PlinkBEDFile object
704
371
  """
705
372
  # Load additional baseline annotations
706
373
  additional_baseline_path = Path(self.config.additional_baseline_annotation)
@@ -730,25 +397,44 @@ class LDScoreCalculator:
730
397
  f"{missing_count} SNPs not found in additional baseline annotations. "
731
398
  "Setting their values to 0."
732
399
  )
733
- additional_baseline_df = additional_baseline_df.reindex(
734
- self.snp_gene_pair_dummy.index, fill_value=0
735
- )
736
- else:
737
- additional_baseline_df = additional_baseline_df.reindex(self.snp_gene_pair_dummy.index)
738
-
739
- # Calculate LD scores for both annotation sets together
740
- self.snp_gene_weight_matrix, additional_ldscore = (
741
- calculate_ldscore_from_multiple_annotation(
742
- [self.snp_gene_pair_dummy, additional_baseline_df],
743
- chrom,
744
- self.config.bfile_root,
745
- ld_wind=self.config.ld_wind,
746
- ld_unit=self.config.ld_unit,
747
- )
400
+ additional_baseline_df = additional_baseline_df.reindex(
401
+ self.snp_gene_pair_dummy.index, fill_value=0
402
+ )
403
+
404
+ # Combine annotations into a single matrix
405
+ combined_annotations = pd.concat(
406
+ [self.snp_gene_pair_dummy, additional_baseline_df], axis=1
748
407
  )
749
408
 
750
- # Filter additional ldscore
751
- additional_ldscore = additional_ldscore.loc[self.snp_name]
409
+ # Calculate LD scores
410
+ ld_scores = plink_bed.get_ldscore(
411
+ annot_matrix=combined_annotations.values.astype(np.float32, copy=False),
412
+ ld_wind=self.config.ld_wind,
413
+ ld_unit=self.config.ld_unit,
414
+ )
415
+
416
+ # Split results
417
+ # total_cols = combined_annotations.shape[1]
418
+ gene_cols = self.snp_gene_pair_dummy.shape[1]
419
+ # baseline_cols = additional_baseline_df.shape[1]
420
+
421
+ # Create DataFrames with proper indices and columns
422
+ self.snp_gene_weight_matrix = pd.DataFrame(
423
+ ld_scores[:, :gene_cols],
424
+ index=combined_annotations.index,
425
+ columns=self.snp_gene_pair_dummy.columns,
426
+ )
427
+
428
+ additional_ldscore = pd.DataFrame(
429
+ ld_scores[:, gene_cols:],
430
+ index=combined_annotations.index,
431
+ columns=additional_baseline_df.columns,
432
+ )
433
+
434
+ # Filter by keep_snp_mask if specified
435
+ if self.keep_snp_mask is not None:
436
+ additional_ldscore = additional_ldscore[self.keep_snp_mask]
437
+ self.snp_gene_weight_matrix = self.snp_gene_weight_matrix[self.keep_snp_mask]
752
438
 
753
439
  # Save additional baseline LD scores
754
440
  ld_score_file = f"{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.ldscore.feather"
@@ -793,7 +479,7 @@ class LDScoreCalculator:
793
479
  save_path = save_dir / f"{chrom}.snp_gene_weight_matrix.feather"
794
480
  self.snp_gene_weight_matrix.reset_index().to_feather(save_path)
795
481
 
796
- def _calculate_baseline_ldscores(self, chrom: int):
482
+ def _calculate_baseline_ldscores(self, chrom: int, plink_bed):
797
483
  """
798
484
  Calculate and save baseline LD scores.
799
485
 
@@ -801,6 +487,8 @@ class LDScoreCalculator:
801
487
  ----------
802
488
  chrom : int
803
489
  Chromosome number
490
+ plink_bed : PlinkBEDFile
491
+ Initialized PlinkBEDFile object
804
492
  """
805
493
  # Create baseline scores
806
494
  baseline_mk_score = np.ones((self.snp_gene_pair_dummy.shape[1], 2))
@@ -818,7 +506,9 @@ class LDScoreCalculator:
818
506
  m_5_file = f"{self.config.ldscore_save_dir}/baseline/baseline.{chrom}.l2.M_5_50"
819
507
 
820
508
  # Calculate LD scores
821
- ldscore_chunk = self._calculate_ldscore_from_weights(baseline_df, drop_dummy_na=False)
509
+ ldscore_chunk = self._calculate_ldscore_from_weights(
510
+ baseline_df, plink_bed, drop_dummy_na=False
511
+ )
822
512
 
823
513
  # Save LD scores and M values
824
514
  self._save_ldscore_to_feather(
@@ -836,9 +526,9 @@ class LDScoreCalculator:
836
526
 
837
527
  # If keep_snp_root is not provided, use the first column of baseline ldscore as w_ld
838
528
  if not self.config.keep_snp_root:
839
- self._save_baseline_as_w_ld(chrom, ldscore_chunk)
529
+ self._save_baseline_as_w_ld(chrom, ldscore_chunk, plink_bed)
840
530
 
841
- def _save_baseline_as_w_ld(self, chrom: int, ldscore_chunk: np.ndarray):
531
+ def _save_baseline_as_w_ld(self, chrom: int, ldscore_chunk: np.ndarray, plink_bed):
842
532
  """
843
533
  Save the first column of baseline ldscore as w_ld.
844
534
 
@@ -848,6 +538,8 @@ class LDScoreCalculator:
848
538
  Chromosome number
849
539
  ldscore_chunk : np.ndarray
850
540
  Array with baseline LD scores
541
+ plink_bed : PlinkBEDFile
542
+ Initialized PlinkBEDFile object
851
543
  """
852
544
  logger.info(f"Using first column of baseline ldscore as w_ld for chr{chrom}")
853
545
 
@@ -861,23 +553,24 @@ class LDScoreCalculator:
861
553
  # Extract the first column
862
554
  w_ld_values = ldscore_chunk[:, 0]
863
555
 
864
- # Create a DataFrame
865
- bim_data = pd.read_csv(
866
- f"{self.config.bfile_root}.{chrom}.bim",
867
- sep="\t",
868
- header=None,
869
- names=["CHR", "SNP", "CM", "BP", "A1", "A2"],
556
+ # Create a DataFrame with SNP information from the BIM file
557
+ snp_indices = (
558
+ plink_bed.kept_snps
559
+ if hasattr(plink_bed, "kept_snps")
560
+ else np.arange(len(self.snp_name))
870
561
  )
562
+ bim_subset = plink_bed.bim_df.iloc[snp_indices]
563
+
871
564
  w_ld_df = pd.DataFrame(
872
565
  {
873
566
  "SNP": self.snp_name,
874
567
  "L2": w_ld_values,
568
+ "CHR": bim_subset.CHR.values[: len(self.snp_name)], # Ensure length matches
569
+ "BP": bim_subset.BP.values[: len(self.snp_name)],
570
+ "CM": bim_subset.CM.values[: len(self.snp_name)],
875
571
  }
876
572
  )
877
573
 
878
- # Add CHR, BP, and CM information
879
- w_ld_df = w_ld_df.merge(bim_data[["SNP", "CHR", "BP", "CM"]], on="SNP", how="left")
880
-
881
574
  # Reorder columns
882
575
  w_ld_df = w_ld_df[["CHR", "SNP", "BP", "CM", "L2"]]
883
576
 
@@ -885,7 +578,7 @@ class LDScoreCalculator:
885
578
 
886
579
  logger.info(f"Saved w_ld for chr{chrom} to {w_ld_file}")
887
580
 
888
- def _calculate_annotation_ldscores(self, chrom: int):
581
+ def _calculate_annotation_ldscores(self, chrom: int, plink_bed):
889
582
  """
890
583
  Calculate and save LD scores for spatial annotations.
891
584
 
@@ -893,6 +586,8 @@ class LDScoreCalculator:
893
586
  ----------
894
587
  chrom : int
895
588
  Chromosome number
589
+ plink_bed : PlinkBEDFile
590
+ Initialized PlinkBEDFile object
896
591
  """
897
592
  # Get marker scores for gene columns (excluding dummy NA column)
898
593
  mk_scores = self.mk_score_common.loc[self.snp_gene_pair_dummy.columns[:-1]]
@@ -915,7 +610,7 @@ class LDScoreCalculator:
915
610
  m_5_file = f"{self.config.ldscore_save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M_5_50"
916
611
 
917
612
  # Calculate LD scores
918
- ldscore_chunk = self._calculate_ldscore_from_weights(mk_score_chunk)
613
+ ldscore_chunk = self._calculate_ldscore_from_weights(mk_score_chunk, plink_bed)
919
614
 
920
615
  # Save LD scores based on format
921
616
  if self.config.ldscore_save_format == "feather":
@@ -924,12 +619,6 @@ class LDScoreCalculator:
924
619
  column_names=mk_score_chunk.columns,
925
620
  save_file_name=ld_score_file,
926
621
  )
927
- elif self.config.ldscore_save_format == "zarr":
928
- self._save_ldscore_chunk_to_zarr(
929
- ldscore_chunk,
930
- chrom=chrom,
931
- start_col_index=i,
932
- )
933
622
  else:
934
623
  raise ValueError(f"Invalid ldscore_save_format: {self.config.ldscore_save_format}")
935
624
 
@@ -948,7 +637,7 @@ class LDScoreCalculator:
948
637
  gc.collect()
949
638
 
950
639
  def _calculate_ldscore_from_weights(
951
- self, marker_scores: pd.DataFrame, drop_dummy_na: bool = True
640
+ self, marker_scores: pd.DataFrame, plink_bed, drop_dummy_na: bool = True
952
641
  ) -> np.ndarray:
953
642
  """
954
643
  Calculate LD scores using SNP-gene weight matrix.
@@ -957,6 +646,8 @@ class LDScoreCalculator:
957
646
  ----------
958
647
  marker_scores : pd.DataFrame
959
648
  DataFrame with marker scores
649
+ plink_bed : PlinkBEDFile
650
+ Initialized PlinkBEDFile object
960
651
  drop_dummy_na : bool, optional
961
652
  Whether to drop the dummy NA column, by default True
962
653
 
@@ -1009,37 +700,6 @@ class LDScoreCalculator:
1009
700
  df.index.name = "SNP"
1010
701
  df.reset_index().to_feather(save_file_name)
1011
702
 
1012
- def _save_ldscore_chunk_to_zarr(
1013
- self, ldscore_data: np.ndarray, chrom: int, start_col_index: int
1014
- ):
1015
- """
1016
- Save LD scores to a zarr array.
1017
-
1018
- Parameters
1019
- ----------
1020
- ldscore_data : np.ndarray
1021
- Array with LD scores
1022
- chrom : int
1023
- Chromosome number
1024
- start_col_index : int
1025
- Starting column index in the zarr array
1026
- """
1027
- # Convert to float16 for storage efficiency
1028
- ldscore_data = ldscore_data.astype(np.float16, copy=False)
1029
-
1030
- # Handle numerical overflow
1031
- ldscore_data[np.isinf(ldscore_data)] = np.finfo(np.float16).max
1032
-
1033
- # Get start and end indices for this chromosome
1034
- chrom_start = self.chrom_snp_start_point[chrom - 1]
1035
- chrom_end = self.chrom_snp_start_point[chrom]
1036
-
1037
- # Save to zarr array
1038
- self.zarr_file[
1039
- chrom_start:chrom_end,
1040
- start_col_index : start_col_index + ldscore_data.shape[1],
1041
- ] = ldscore_data
1042
-
1043
703
  def _calculate_and_save_m_values(
1044
704
  self,
1045
705
  marker_scores: pd.DataFrame,
@@ -1084,7 +744,7 @@ class LDScoreCalculator:
1084
744
  np.savetxt(m_file_path, m_values, delimiter="\t")
1085
745
  np.savetxt(m_5_file_path, m_5_values, delimiter="\t")
1086
746
 
1087
- def _get_snp_gene_dummy(self, chrom: int) -> pd.DataFrame:
747
+ def _get_snp_gene_dummy(self, chrom: int, plink_bed) -> pd.DataFrame:
1088
748
  """
1089
749
  Get dummy matrix for SNP-gene pairs.
1090
750
 
@@ -1092,6 +752,7 @@ class LDScoreCalculator:
1092
752
  ----------
1093
753
  chrom : int
1094
754
  Chromosome number
755
+ plink_bed : PlinkBEDFile
1095
756
 
1096
757
  Returns
1097
758
  -------
@@ -1101,7 +762,8 @@ class LDScoreCalculator:
1101
762
  logger.info(f"Creating SNP-gene mappings for chromosome {chrom}")
1102
763
 
1103
764
  # Load BIM file
1104
- bim, bim_pr = load_bim(self.config.bfile_root, chrom)
765
+ bim = plink_bed.bim_df
766
+ bim_pr = plink_bed.convert_bim_to_pyrange(bim)
1105
767
 
1106
768
  # Determine mapping strategy
1107
769
  if self.config.gene_window_enhancer_priority in ["gene_window_first", "enhancer_first"]: