gsMap 1.73.3__py3-none-any.whl → 1.73.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gsMap/generate_ldscore.py CHANGED
@@ -13,12 +13,11 @@ from pathlib import Path
13
13
  import numpy as np
14
14
  import pandas as pd
15
15
  import pyranges as pr
16
- import zarr
17
16
  from scipy.sparse import csr_matrix
18
17
  from tqdm import trange
19
18
 
20
19
  from gsMap.config import GenerateLDScoreConfig
21
- from gsMap.utils.generate_r2_matrix import getBlockLefts, load_bfile
20
+ from gsMap.utils.generate_r2_matrix import PlinkBEDFile
22
21
 
23
22
  # Configure warning behavior more precisely
24
23
  warnings.filterwarnings("ignore", category=FutureWarning, module="pandas")
@@ -113,44 +112,6 @@ def load_marker_score(mk_score_file: str) -> pd.DataFrame:
113
112
  return mk_score
114
113
 
115
114
 
116
- def load_bim(bfile_root: str, chrom: int) -> tuple[pd.DataFrame, pr.PyRanges]:
117
- """
118
- Load PLINK BIM file and convert to a PyRanges object.
119
-
120
- Parameters
121
- ----------
122
- bfile_root : str
123
- Root path for PLINK bfiles
124
- chrom : int
125
- Chromosome number
126
-
127
- Returns
128
- -------
129
- tuple
130
- A tuple containing (bim_df, bim_pr) where:
131
- - bim_df is a pandas DataFrame with BIM data
132
- - bim_pr is a PyRanges object with BIM data
133
- """
134
- bim_file = f"{bfile_root}.{chrom}.bim"
135
- logger.info(f"Loading BIM file: {bim_file}")
136
-
137
- bim = pd.read_csv(bim_file, sep="\t", header=None)
138
- bim.columns = ["CHR", "SNP", "CM", "BP", "A1", "A2"]
139
-
140
- # Convert to PyRanges
141
- bim_pr = bim.copy()
142
- bim_pr.columns = ["Chromosome", "SNP", "CM", "Start", "A1", "A2"]
143
-
144
- # Adjust coordinates (BIM is 1-based, PyRanges uses 0-based)
145
- bim_pr["End"] = bim_pr["Start"].copy()
146
- bim_pr["Start"] = bim_pr["Start"] - 1
147
-
148
- bim_pr = pr.PyRanges(bim_pr)
149
- bim_pr.Chromosome = f"chr{chrom}"
150
-
151
- return bim, bim_pr
152
-
153
-
154
115
  def overlaps_gtf_bim(gtf_pr: pr.PyRanges, bim_pr: pr.PyRanges) -> pd.DataFrame:
155
116
  """
156
117
  Find overlaps between GTF and BIM data, and select nearest gene for each SNP.
@@ -180,284 +141,13 @@ def overlaps_gtf_bim(gtf_pr: pr.PyRanges, bim_pr: pr.PyRanges) -> pd.DataFrame:
180
141
  return nearest_genes
181
142
 
182
143
 
183
- def filter_snps_by_keep_snp(bim_df: pd.DataFrame, keep_snp_file: str) -> pd.DataFrame:
184
- """
185
- Filter BIM DataFrame to keep only SNPs in a provided list.
186
-
187
- Parameters
188
- ----------
189
- bim_df : pd.DataFrame
190
- DataFrame with BIM data
191
- keep_snp_file : str
192
- Path to a file with SNP IDs to keep
193
-
194
- Returns
195
- -------
196
- pd.DataFrame
197
- Filtered BIM DataFrame
198
- """
199
- # Read SNPs to keep
200
- keep_snp = pd.read_csv(keep_snp_file, header=None)[0].to_list()
201
-
202
- # Filter the BIM DataFrame
203
- filtered_bim_df = bim_df[bim_df["SNP"].isin(keep_snp)]
204
-
205
- logger.info(f"Kept {len(filtered_bim_df)} SNPs out of {len(bim_df)} after filtering")
206
-
207
- return filtered_bim_df
208
-
209
-
210
- def get_snp_counts(config: GenerateLDScoreConfig) -> dict:
211
- """
212
- Count SNPs per chromosome and calculate start positions for zarr arrays.
213
-
214
- Parameters
215
- ----------
216
- config : GenerateLDScoreConfig
217
- Configuration object
218
-
219
- Returns
220
- -------
221
- dict
222
- Dictionary with SNP counts and start positions
223
- """
224
- snp_counts = {}
225
- total_snp = 0
226
-
227
- for chrom in range(1, 23):
228
- bim_df, _ = load_bim(config.bfile_root, chrom)
229
-
230
- if config.keep_snp_root:
231
- keep_snp_file = f"{config.keep_snp_root}.{chrom}.snp"
232
- filtered_bim_df = filter_snps_by_keep_snp(bim_df, keep_snp_file)
233
- else:
234
- filtered_bim_df = bim_df
235
-
236
- snp_counts[chrom] = filtered_bim_df.shape[0]
237
- total_snp += snp_counts[chrom]
238
-
239
- snp_counts["total"] = total_snp
240
-
241
- # Calculate cumulative SNP counts for zarr array indexing
242
- chrom_snp_length_array = np.array([snp_counts[chrom] for chrom in range(1, 23)]).cumsum()
243
- snp_counts["chrom_snp_start_point"] = [0] + chrom_snp_length_array.tolist()
244
-
245
- return snp_counts
246
-
247
-
248
- def get_snp_pass_maf(bfile_root: str, chrom: int, maf_min: float = 0.05) -> list[str]:
249
- """
250
- Get SNPs that pass the minimum minor allele frequency (MAF) threshold.
251
-
252
- Parameters
253
- ----------
254
- bfile_root : str
255
- Root path for PLINK bfiles
256
- chrom : int
257
- Chromosome number
258
- maf_min : float, optional
259
- Minimum MAF threshold, by default 0.05
260
-
261
- Returns
262
- -------
263
- list
264
- List of SNP IDs that pass the MAF threshold
265
- """
266
- array_snps, array_indivs, geno_array = load_bfile(
267
- bfile_chr_prefix=f"{bfile_root}.{chrom}", mafMin=maf_min
268
- )
269
-
270
- m = len(array_snps.IDList)
271
- n = len(array_indivs.IDList)
272
- logger.info(
273
- f"Loading genotype data for {m} SNPs and {n} individuals from {bfile_root}.{chrom}"
274
- )
275
-
276
- # Filter SNPs by MAF
277
- snp_pass_maf = array_snps.IDList.iloc[geno_array.kept_snps]
278
- logger.info(f"After filtering SNPs with MAF < {maf_min}, {len(snp_pass_maf)} SNPs remain")
279
-
280
- return snp_pass_maf.SNP.to_list()
281
-
282
-
283
- def get_ldscore(
284
- bfile_root: str,
285
- chrom: int,
286
- annot_matrix: np.ndarray,
287
- ld_wind: float,
288
- ld_unit: str = "CM",
289
- keep_snps_index: list[int] = None,
290
- ) -> pd.DataFrame:
291
- """
292
- Calculate LD scores using PLINK data and an annotation matrix.
293
-
294
- Parameters
295
- ----------
296
- bfile_root : str
297
- Root path for PLINK bfiles
298
- chrom : int
299
- Chromosome number
300
- annot_matrix : np.ndarray
301
- Annotation matrix
302
- ld_wind : float
303
- LD window size
304
- ld_unit : str, optional
305
- Unit for the LD window, by default "CM"
306
- keep_snps_index : list[int], optional
307
- Indices of SNPs to keep, by default None
308
-
309
- Returns
310
- -------
311
- pd.DataFrame
312
- DataFrame with calculated LD scores
313
- """
314
- array_snps, array_indivs, geno_array = load_bfile(
315
- bfile_chr_prefix=f"{bfile_root}.{chrom}", keep_snps=keep_snps_index
316
- )
317
-
318
- annot_matrix = annot_matrix[geno_array.kept_snps, :]
319
-
320
- # Configure LD window based on specified unit
321
- if ld_unit == "SNP":
322
- max_dist = ld_wind
323
- coords = np.array(range(geno_array.m))
324
- elif ld_unit == "KB":
325
- max_dist = ld_wind * 1000
326
- coords = np.array(array_snps.df["BP"])[geno_array.kept_snps]
327
- elif ld_unit == "CM":
328
- max_dist = ld_wind
329
- coords = np.array(array_snps.df["CM"])[geno_array.kept_snps]
330
- # Check if the CM is all 0
331
- if np.all(coords == 0):
332
- logger.warning(
333
- "All CM values are 0 in the BIM file. Using 1MB window size for LD score calculation."
334
- )
335
- max_dist = 1_000_000
336
- coords = np.array(array_snps.df["BP"])[geno_array.kept_snps]
337
- else:
338
- raise ValueError(f"Invalid ld_wind_unit: {ld_unit}. Must be one of: SNP, KB, CM")
339
-
340
- # Calculate blocks for LD computation
341
- block_left = getBlockLefts(coords, max_dist)
342
- assert block_left.sum() > 0, "Invalid window size, please check the ld_wind parameter."
343
-
344
- # Calculate LD scores
345
- ld_scores = pd.DataFrame(geno_array.ldScoreVarBlocks(block_left, 100, annot=annot_matrix))
346
-
347
- return ld_scores
348
-
349
-
350
- def calculate_ldscore_from_annotation(
351
- snp_annotation_df: pd.DataFrame,
352
- chrom: int,
353
- bfile_root: str,
354
- ld_wind: float = 1,
355
- ld_unit: str = "CM",
356
- ) -> pd.DataFrame:
357
- """
358
- Calculate LD scores from SNP annotation DataFrame.
359
-
360
- Parameters
361
- ----------
362
- snp_annotation_df : pd.DataFrame
363
- DataFrame with SNP annotations
364
- chrom : int
365
- Chromosome number
366
- bfile_root : str
367
- Root path for PLINK bfiles
368
- ld_wind : float, optional
369
- LD window size, by default 1
370
- ld_unit : str, optional
371
- Unit for the LD window, by default "CM"
372
-
373
- Returns
374
- -------
375
- pd.DataFrame
376
- DataFrame with calculated LD scores
377
- """
378
- # Calculate LD scores
379
- snp_gene_weight_matrix = get_ldscore(
380
- bfile_root, chrom, snp_annotation_df.values, ld_wind=ld_wind, ld_unit=ld_unit
381
- )
382
-
383
- # Set proper data types and indices
384
- snp_gene_weight_matrix = snp_gene_weight_matrix.astype(np.float32, copy=False)
385
- snp_gene_weight_matrix.index = snp_annotation_df.index
386
- snp_gene_weight_matrix.columns = snp_annotation_df.columns
387
-
388
- return snp_gene_weight_matrix
389
-
390
-
391
- def calculate_ldscore_from_multiple_annotation(
392
- snp_annotation_df_list: list[pd.DataFrame],
393
- chrom: int,
394
- bfile_root: str,
395
- ld_wind: float = 1,
396
- ld_unit: str = "CM",
397
- ) -> list[pd.DataFrame]:
398
- """
399
- Calculate LD scores from multiple SNP annotation DataFrames.
400
-
401
- Parameters
402
- ----------
403
- snp_annotation_df_list : list
404
- List of DataFrames with SNP annotations
405
- chrom : int
406
- Chromosome number
407
- bfile_root : str
408
- Root path for PLINK bfiles
409
- ld_wind : float, optional
410
- LD window size, by default 1
411
- ld_unit : str, optional
412
- Unit for the LD window, by default "CM"
413
-
414
- Returns
415
- -------
416
- list
417
- List of DataFrames with calculated LD scores
418
- """
419
- # Combine annotations
420
- combined_annotations = pd.concat(snp_annotation_df_list, axis=1).astype(np.float32, copy=False)
421
-
422
- # Calculate LD scores
423
- combined_ld_scores = get_ldscore(
424
- bfile_root, chrom, combined_annotations.values, ld_wind=ld_wind, ld_unit=ld_unit
425
- )
426
-
427
- # Apply proper indices and columns
428
- combined_ld_scores.index = combined_annotations.index
429
- combined_ld_scores.columns = combined_annotations.columns
430
-
431
- # Split back into separate DataFrames
432
- annotation_lengths = [len(df.columns) for df in snp_annotation_df_list]
433
- result_dataframes = []
434
- start_col = 0
435
-
436
- for length in annotation_lengths:
437
- end_col = start_col + length
438
- result_dataframes.append(combined_ld_scores.iloc[:, start_col:end_col])
439
- start_col = end_col
440
-
441
- return result_dataframes
442
-
443
-
444
144
  class LDScoreCalculator:
445
145
  """
446
146
  Class for calculating LD scores from gene specificity scores.
447
-
448
- This class handles the assignment of gene specificity scores to SNPs
449
- and the calculation of LD scores.
450
147
  """
451
148
 
452
149
  def __init__(self, config: GenerateLDScoreConfig):
453
- """
454
- Initialize LDScoreCalculator.
455
-
456
- Parameters
457
- ----------
458
- config : GenerateLDScoreConfig
459
- Configuration object
460
- """
150
+ """Initialize LDScoreCalculator."""
461
151
  self.config = config
462
152
  self.validate_config()
463
153
 
@@ -472,9 +162,6 @@ class LDScoreCalculator:
472
162
  # Initialize enhancer data if provided
473
163
  self.enhancer_pr = self._initialize_enhancer() if config.enhancer_annotation_file else None
474
164
 
475
- # Initialize zarr file if needed
476
- self._initialize_zarr_if_needed()
477
-
478
165
  def validate_config(self):
479
166
  """Validate configuration parameters."""
480
167
  if not Path(self.config.mkscore_feather_path).exists():
@@ -525,33 +212,6 @@ class LDScoreCalculator:
525
212
  # Convert to PyRanges
526
213
  return pr.PyRanges(enhancer_df.reset_index())
527
214
 
528
- def _initialize_zarr_if_needed(self):
529
- """Initialize zarr file if zarr format is specified."""
530
- if self.config.ldscore_save_format == "zarr":
531
- chrom_snp_length_dict = get_snp_counts(self.config)
532
- self.chrom_snp_start_point = chrom_snp_length_dict["chrom_snp_start_point"]
533
-
534
- zarr_path = (
535
- Path(self.config.ldscore_save_dir) / f"{self.config.sample_name}.ldscore.zarr"
536
- )
537
-
538
- if not zarr_path.exists():
539
- self.zarr_file = zarr.open(
540
- zarr_path.as_posix(),
541
- mode="a",
542
- dtype=np.float16,
543
- chunks=self.config.zarr_chunk_size,
544
- shape=(chrom_snp_length_dict["total"], self.mk_score_common.shape[1]),
545
- )
546
- zarr_path.parent.mkdir(parents=True, exist_ok=True)
547
-
548
- # Save metadata
549
- self.zarr_file.attrs["spot_names"] = self.mk_score_common.columns.to_list()
550
- self.zarr_file.attrs["chrom_snp_start_point"] = self.chrom_snp_start_point
551
-
552
- else:
553
- self.zarr_file = zarr.open(zarr_path.as_posix(), mode="a")
554
-
555
215
  def process_chromosome(self, chrom: int):
556
216
  """
557
217
  Process a single chromosome to calculate LD scores.
@@ -563,35 +223,42 @@ class LDScoreCalculator:
563
223
  """
564
224
  logger.info(f"Processing chromosome {chrom}")
565
225
 
566
- # Get SNPs passing MAF filter
567
- self.snp_pass_maf = get_snp_pass_maf(self.config.bfile_root, chrom, maf_min=0.05)
226
+ # Initialize PlinkBEDFile once for this chromosome
227
+ plink_bed = PlinkBEDFile(f"{self.config.bfile_root}.{chrom}")
228
+
229
+ # Get SNPs passing MAF filter using built-in method
230
+ self.snp_pass_maf = plink_bed.get_snps_by_maf(0.05)
568
231
 
569
232
  # Get SNP-gene dummy pairs
570
- self.snp_gene_pair_dummy = self._get_snp_gene_dummy(chrom)
233
+ self.snp_gene_pair_dummy = self._get_snp_gene_dummy(chrom, plink_bed)
571
234
 
572
235
  # Apply SNP filter if provided
573
236
  self._apply_snp_filter(chrom)
574
237
 
575
238
  # Process additional baseline annotations if provided
576
239
  if self.config.additional_baseline_annotation:
577
- self._process_additional_baseline(chrom)
240
+ self._process_additional_baseline(chrom, plink_bed)
578
241
  else:
579
- # Calculate SNP-gene weight matrix
580
- self.snp_gene_weight_matrix = calculate_ldscore_from_annotation(
581
- self.snp_gene_pair_dummy,
582
- chrom,
583
- self.config.bfile_root,
242
+ # Calculate SNP-gene weight matrix using built-in methods
243
+ ld_scores = plink_bed.get_ldscore(
244
+ annot_matrix=self.snp_gene_pair_dummy.values,
584
245
  ld_wind=self.config.ld_wind,
585
246
  ld_unit=self.config.ld_unit,
586
247
  )
587
248
 
249
+ self.snp_gene_weight_matrix = pd.DataFrame(
250
+ ld_scores,
251
+ index=self.snp_gene_pair_dummy.index,
252
+ columns=self.snp_gene_pair_dummy.columns,
253
+ )
254
+
588
255
  # Apply SNP filter if needed
589
256
  if self.keep_snp_mask is not None:
590
257
  self.snp_gene_weight_matrix = self.snp_gene_weight_matrix[self.keep_snp_mask]
591
258
 
592
259
  # Generate w_ld file if keep_snp_root is provided
593
260
  if self.config.keep_snp_root:
594
- self._generate_w_ld(chrom)
261
+ self._generate_w_ld(chrom, plink_bed)
595
262
 
596
263
  # Save pre-calculated SNP-gene weight matrix if requested
597
264
  self._save_snp_gene_weight_matrix_if_needed(chrom)
@@ -602,16 +269,16 @@ class LDScoreCalculator:
602
269
 
603
270
  # Calculate baseline LD scores
604
271
  logger.info(f"Calculating baseline LD scores for chr{chrom}")
605
- self._calculate_baseline_ldscores(chrom)
272
+ self._calculate_baseline_ldscores(chrom, plink_bed)
606
273
 
607
274
  # Calculate LD scores for annotation
608
275
  logger.info(f"Calculating annotation LD scores for chr{chrom}")
609
- self._calculate_annotation_ldscores(chrom)
276
+ self._calculate_annotation_ldscores(chrom, plink_bed)
610
277
 
611
278
  # Clear memory
612
279
  self._clear_memory()
613
280
 
614
- def _generate_w_ld(self, chrom: int):
281
+ def _generate_w_ld(self, chrom: int, plink_bed):
615
282
  """
616
283
  Generate w_ld file for the chromosome using filtered SNPs.
617
284
 
@@ -619,6 +286,8 @@ class LDScoreCalculator:
619
286
  ----------
620
287
  chrom : int
621
288
  Chromosome number
289
+ plink_bed : PlinkBEDFile
290
+ Initialized PlinkBEDFile object
622
291
  """
623
292
  if not self.config.keep_snp_root:
624
293
  logger.info(
@@ -628,48 +297,38 @@ class LDScoreCalculator:
628
297
 
629
298
  logger.info(f"Generating w_ld for chr{chrom}")
630
299
 
631
- # Get the indices of SNPs to keep based on the keep_snp_mask
632
- keep_snps_index = np.nonzero(self.keep_snp_mask)[0]
300
+ # Get the indices of SNPs to keep based on the keep_snp
301
+ keep_snps_indices = plink_bed.bim_df[
302
+ plink_bed.bim_df.SNP.isin(self.snp_name)
303
+ ].index.tolist()
633
304
 
634
305
  # Create a simple unit annotation (all ones) for the filtered SNPs
635
- unit_annotation = np.ones((len(keep_snps_index), 1))
306
+ unit_annotation = np.ones((len(keep_snps_indices), 1))
636
307
 
637
- # Calculate LD scores using the filtered SNPs
638
- w_ld_scores = get_ldscore(
639
- self.config.bfile_root,
640
- chrom,
641
- unit_annotation,
308
+ # Calculate LD scores
309
+ w_ld_scores = plink_bed.get_ldscore(
310
+ annot_matrix=unit_annotation,
642
311
  ld_wind=self.config.ld_wind,
643
312
  ld_unit=self.config.ld_unit,
644
- keep_snps_index=keep_snps_index.tolist(),
645
- )
646
-
647
- # Load the BIM file to get SNP information
648
- bim_data = pd.read_csv(
649
- f"{self.config.bfile_root}.{chrom}.bim",
650
- sep="\t",
651
- header=None,
652
- names=["CHR", "SNP", "CM", "BP", "A1", "A2"],
313
+ keep_snps_index=keep_snps_indices,
653
314
  )
654
315
 
655
- # Get SNP names for the kept indices
656
- kept_snp_names = bim_data.iloc[keep_snps_index].SNP.tolist()
657
-
658
316
  # Create the w_ld DataFrame
317
+ bim_subset = plink_bed.bim_df.loc[keep_snps_indices]
659
318
  w_ld_df = pd.DataFrame(
660
319
  {
661
- "SNP": kept_snp_names,
662
- "L2": w_ld_scores.values.flatten(),
663
- "CHR": bim_data.iloc[keep_snps_index].CHR.values,
664
- "BP": bim_data.iloc[keep_snps_index].BP.values,
665
- "CM": bim_data.iloc[keep_snps_index].CM.values,
320
+ "SNP": bim_subset.SNP,
321
+ "L2": w_ld_scores.flatten(),
322
+ "CHR": bim_subset.CHR,
323
+ "BP": bim_subset.BP,
324
+ "CM": bim_subset.CM,
666
325
  }
667
326
  )
668
327
 
669
328
  # Reorder columns
670
329
  w_ld_df = w_ld_df[["CHR", "SNP", "BP", "CM", "L2"]]
671
330
 
672
- # Save to feather format
331
+ # Save to file
673
332
  w_ld_dir = Path(self.config.ldscore_save_dir) / "w_ld"
674
333
  w_ld_dir.mkdir(parents=True, exist_ok=True)
675
334
  w_ld_file = w_ld_dir / f"weights.{chrom}.l2.ldscore.gz"
@@ -699,7 +358,7 @@ class LDScoreCalculator:
699
358
  logger.info(f"Using all {len(self.snp_name)} SNPs (no filter applied)")
700
359
  logger.warning("No keep_snp_root provided, all SNPs will be used to calculate w_ld.")
701
360
 
702
- def _process_additional_baseline(self, chrom: int):
361
+ def _process_additional_baseline(self, chrom: int, plink_bed):
703
362
  """
704
363
  Process additional baseline annotations.
705
364
 
@@ -707,6 +366,8 @@ class LDScoreCalculator:
707
366
  ----------
708
367
  chrom : int
709
368
  Chromosome number
369
+ plink_bed : PlinkBEDFile
370
+ Initialized PlinkBEDFile object
710
371
  """
711
372
  # Load additional baseline annotations
712
373
  additional_baseline_path = Path(self.config.additional_baseline_annotation)
@@ -736,25 +397,44 @@ class LDScoreCalculator:
736
397
  f"{missing_count} SNPs not found in additional baseline annotations. "
737
398
  "Setting their values to 0."
738
399
  )
739
- additional_baseline_df = additional_baseline_df.reindex(
740
- self.snp_gene_pair_dummy.index, fill_value=0
741
- )
742
- else:
743
- additional_baseline_df = additional_baseline_df.reindex(self.snp_gene_pair_dummy.index)
744
-
745
- # Calculate LD scores for both annotation sets together
746
- self.snp_gene_weight_matrix, additional_ldscore = (
747
- calculate_ldscore_from_multiple_annotation(
748
- [self.snp_gene_pair_dummy, additional_baseline_df],
749
- chrom,
750
- self.config.bfile_root,
751
- ld_wind=self.config.ld_wind,
752
- ld_unit=self.config.ld_unit,
753
- )
400
+ additional_baseline_df = additional_baseline_df.reindex(
401
+ self.snp_gene_pair_dummy.index, fill_value=0
402
+ )
403
+
404
+ # Combine annotations into a single matrix
405
+ combined_annotations = pd.concat(
406
+ [self.snp_gene_pair_dummy, additional_baseline_df], axis=1
754
407
  )
755
408
 
756
- # Filter additional ldscore
757
- additional_ldscore = additional_ldscore.loc[self.snp_name]
409
+ # Calculate LD scores
410
+ ld_scores = plink_bed.get_ldscore(
411
+ annot_matrix=combined_annotations.values.astype(np.float32, copy=False),
412
+ ld_wind=self.config.ld_wind,
413
+ ld_unit=self.config.ld_unit,
414
+ )
415
+
416
+ # Split results
417
+ # total_cols = combined_annotations.shape[1]
418
+ gene_cols = self.snp_gene_pair_dummy.shape[1]
419
+ # baseline_cols = additional_baseline_df.shape[1]
420
+
421
+ # Create DataFrames with proper indices and columns
422
+ self.snp_gene_weight_matrix = pd.DataFrame(
423
+ ld_scores[:, :gene_cols],
424
+ index=combined_annotations.index,
425
+ columns=self.snp_gene_pair_dummy.columns,
426
+ )
427
+
428
+ additional_ldscore = pd.DataFrame(
429
+ ld_scores[:, gene_cols:],
430
+ index=combined_annotations.index,
431
+ columns=additional_baseline_df.columns,
432
+ )
433
+
434
+ # Filter by keep_snp_mask if specified
435
+ if self.keep_snp_mask is not None:
436
+ additional_ldscore = additional_ldscore[self.keep_snp_mask]
437
+ self.snp_gene_weight_matrix = self.snp_gene_weight_matrix[self.keep_snp_mask]
758
438
 
759
439
  # Save additional baseline LD scores
760
440
  ld_score_file = f"{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.ldscore.feather"
@@ -799,7 +479,7 @@ class LDScoreCalculator:
799
479
  save_path = save_dir / f"{chrom}.snp_gene_weight_matrix.feather"
800
480
  self.snp_gene_weight_matrix.reset_index().to_feather(save_path)
801
481
 
802
- def _calculate_baseline_ldscores(self, chrom: int):
482
+ def _calculate_baseline_ldscores(self, chrom: int, plink_bed):
803
483
  """
804
484
  Calculate and save baseline LD scores.
805
485
 
@@ -807,6 +487,8 @@ class LDScoreCalculator:
807
487
  ----------
808
488
  chrom : int
809
489
  Chromosome number
490
+ plink_bed : PlinkBEDFile
491
+ Initialized PlinkBEDFile object
810
492
  """
811
493
  # Create baseline scores
812
494
  baseline_mk_score = np.ones((self.snp_gene_pair_dummy.shape[1], 2))
@@ -824,7 +506,9 @@ class LDScoreCalculator:
824
506
  m_5_file = f"{self.config.ldscore_save_dir}/baseline/baseline.{chrom}.l2.M_5_50"
825
507
 
826
508
  # Calculate LD scores
827
- ldscore_chunk = self._calculate_ldscore_from_weights(baseline_df, drop_dummy_na=False)
509
+ ldscore_chunk = self._calculate_ldscore_from_weights(
510
+ baseline_df, plink_bed, drop_dummy_na=False
511
+ )
828
512
 
829
513
  # Save LD scores and M values
830
514
  self._save_ldscore_to_feather(
@@ -842,9 +526,9 @@ class LDScoreCalculator:
842
526
 
843
527
  # If keep_snp_root is not provided, use the first column of baseline ldscore as w_ld
844
528
  if not self.config.keep_snp_root:
845
- self._save_baseline_as_w_ld(chrom, ldscore_chunk)
529
+ self._save_baseline_as_w_ld(chrom, ldscore_chunk, plink_bed)
846
530
 
847
- def _save_baseline_as_w_ld(self, chrom: int, ldscore_chunk: np.ndarray):
531
+ def _save_baseline_as_w_ld(self, chrom: int, ldscore_chunk: np.ndarray, plink_bed):
848
532
  """
849
533
  Save the first column of baseline ldscore as w_ld.
850
534
 
@@ -854,6 +538,8 @@ class LDScoreCalculator:
854
538
  Chromosome number
855
539
  ldscore_chunk : np.ndarray
856
540
  Array with baseline LD scores
541
+ plink_bed : PlinkBEDFile
542
+ Initialized PlinkBEDFile object
857
543
  """
858
544
  logger.info(f"Using first column of baseline ldscore as w_ld for chr{chrom}")
859
545
 
@@ -867,23 +553,24 @@ class LDScoreCalculator:
867
553
  # Extract the first column
868
554
  w_ld_values = ldscore_chunk[:, 0]
869
555
 
870
- # Create a DataFrame
871
- bim_data = pd.read_csv(
872
- f"{self.config.bfile_root}.{chrom}.bim",
873
- sep="\t",
874
- header=None,
875
- names=["CHR", "SNP", "CM", "BP", "A1", "A2"],
556
+ # Create a DataFrame with SNP information from the BIM file
557
+ snp_indices = (
558
+ plink_bed.kept_snps
559
+ if hasattr(plink_bed, "kept_snps")
560
+ else np.arange(len(self.snp_name))
876
561
  )
562
+ bim_subset = plink_bed.bim_df.iloc[snp_indices]
563
+
877
564
  w_ld_df = pd.DataFrame(
878
565
  {
879
566
  "SNP": self.snp_name,
880
567
  "L2": w_ld_values,
568
+ "CHR": bim_subset.CHR.values[: len(self.snp_name)], # Ensure length matches
569
+ "BP": bim_subset.BP.values[: len(self.snp_name)],
570
+ "CM": bim_subset.CM.values[: len(self.snp_name)],
881
571
  }
882
572
  )
883
573
 
884
- # Add CHR, BP, and CM information
885
- w_ld_df = w_ld_df.merge(bim_data[["SNP", "CHR", "BP", "CM"]], on="SNP", how="left")
886
-
887
574
  # Reorder columns
888
575
  w_ld_df = w_ld_df[["CHR", "SNP", "BP", "CM", "L2"]]
889
576
 
@@ -891,7 +578,7 @@ class LDScoreCalculator:
891
578
 
892
579
  logger.info(f"Saved w_ld for chr{chrom} to {w_ld_file}")
893
580
 
894
- def _calculate_annotation_ldscores(self, chrom: int):
581
+ def _calculate_annotation_ldscores(self, chrom: int, plink_bed):
895
582
  """
896
583
  Calculate and save LD scores for spatial annotations.
897
584
 
@@ -899,6 +586,8 @@ class LDScoreCalculator:
899
586
  ----------
900
587
  chrom : int
901
588
  Chromosome number
589
+ plink_bed : PlinkBEDFile
590
+ Initialized PlinkBEDFile object
902
591
  """
903
592
  # Get marker scores for gene columns (excluding dummy NA column)
904
593
  mk_scores = self.mk_score_common.loc[self.snp_gene_pair_dummy.columns[:-1]]
@@ -921,7 +610,7 @@ class LDScoreCalculator:
921
610
  m_5_file = f"{self.config.ldscore_save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M_5_50"
922
611
 
923
612
  # Calculate LD scores
924
- ldscore_chunk = self._calculate_ldscore_from_weights(mk_score_chunk)
613
+ ldscore_chunk = self._calculate_ldscore_from_weights(mk_score_chunk, plink_bed)
925
614
 
926
615
  # Save LD scores based on format
927
616
  if self.config.ldscore_save_format == "feather":
@@ -930,12 +619,6 @@ class LDScoreCalculator:
930
619
  column_names=mk_score_chunk.columns,
931
620
  save_file_name=ld_score_file,
932
621
  )
933
- elif self.config.ldscore_save_format == "zarr":
934
- self._save_ldscore_chunk_to_zarr(
935
- ldscore_chunk,
936
- chrom=chrom,
937
- start_col_index=i,
938
- )
939
622
  else:
940
623
  raise ValueError(f"Invalid ldscore_save_format: {self.config.ldscore_save_format}")
941
624
 
@@ -954,7 +637,7 @@ class LDScoreCalculator:
954
637
  gc.collect()
955
638
 
956
639
  def _calculate_ldscore_from_weights(
957
- self, marker_scores: pd.DataFrame, drop_dummy_na: bool = True
640
+ self, marker_scores: pd.DataFrame, plink_bed, drop_dummy_na: bool = True
958
641
  ) -> np.ndarray:
959
642
  """
960
643
  Calculate LD scores using SNP-gene weight matrix.
@@ -963,6 +646,8 @@ class LDScoreCalculator:
963
646
  ----------
964
647
  marker_scores : pd.DataFrame
965
648
  DataFrame with marker scores
649
+ plink_bed : PlinkBEDFile
650
+ Initialized PlinkBEDFile object
966
651
  drop_dummy_na : bool, optional
967
652
  Whether to drop the dummy NA column, by default True
968
653
 
@@ -1015,37 +700,6 @@ class LDScoreCalculator:
1015
700
  df.index.name = "SNP"
1016
701
  df.reset_index().to_feather(save_file_name)
1017
702
 
1018
- def _save_ldscore_chunk_to_zarr(
1019
- self, ldscore_data: np.ndarray, chrom: int, start_col_index: int
1020
- ):
1021
- """
1022
- Save LD scores to a zarr array.
1023
-
1024
- Parameters
1025
- ----------
1026
- ldscore_data : np.ndarray
1027
- Array with LD scores
1028
- chrom : int
1029
- Chromosome number
1030
- start_col_index : int
1031
- Starting column index in the zarr array
1032
- """
1033
- # Convert to float16 for storage efficiency
1034
- ldscore_data = ldscore_data.astype(np.float16, copy=False)
1035
-
1036
- # Handle numerical overflow
1037
- ldscore_data[np.isinf(ldscore_data)] = np.finfo(np.float16).max
1038
-
1039
- # Get start and end indices for this chromosome
1040
- chrom_start = self.chrom_snp_start_point[chrom - 1]
1041
- chrom_end = self.chrom_snp_start_point[chrom]
1042
-
1043
- # Save to zarr array
1044
- self.zarr_file[
1045
- chrom_start:chrom_end,
1046
- start_col_index : start_col_index + ldscore_data.shape[1],
1047
- ] = ldscore_data
1048
-
1049
703
  def _calculate_and_save_m_values(
1050
704
  self,
1051
705
  marker_scores: pd.DataFrame,
@@ -1090,7 +744,7 @@ class LDScoreCalculator:
1090
744
  np.savetxt(m_file_path, m_values, delimiter="\t")
1091
745
  np.savetxt(m_5_file_path, m_5_values, delimiter="\t")
1092
746
 
1093
- def _get_snp_gene_dummy(self, chrom: int) -> pd.DataFrame:
747
+ def _get_snp_gene_dummy(self, chrom: int, plink_bed) -> pd.DataFrame:
1094
748
  """
1095
749
  Get dummy matrix for SNP-gene pairs.
1096
750
 
@@ -1098,6 +752,7 @@ class LDScoreCalculator:
1098
752
  ----------
1099
753
  chrom : int
1100
754
  Chromosome number
755
+ plink_bed : PlinkBEDFile
1101
756
 
1102
757
  Returns
1103
758
  -------
@@ -1107,7 +762,8 @@ class LDScoreCalculator:
1107
762
  logger.info(f"Creating SNP-gene mappings for chromosome {chrom}")
1108
763
 
1109
764
  # Load BIM file
1110
- bim, bim_pr = load_bim(self.config.bfile_root, chrom)
765
+ bim = plink_bed.bim_df
766
+ bim_pr = plink_bed.convert_bim_to_pyrange(bim)
1111
767
 
1112
768
  # Determine mapping strategy
1113
769
  if self.config.gene_window_enhancer_priority in ["gene_window_first", "enhancer_first"]: