gsMap 1.72.3-py3-none-any.whl → 1.73.1-py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
gsMap/latent_to_gene.py CHANGED
@@ -104,6 +104,7 @@ def compute_regional_mkscore(
     ranks,
     frac_whole,
     adata_X_bool,
+    pearson_residuals,
 ):
     """
     Compute gmean ranks of a region.
@@ -129,7 +130,8 @@ def compute_regional_mkscore(
     # Simultaneously consider the ratio of expression fractions and ranks
     gene_ranks_region = gene_ranks_region * frac_region

-    mkscore = np.exp(gene_ranks_region**1.5) - 1
+    mkscore = np.exp(gene_ranks_region) - 1 if not pearson_residuals else gene_ranks_region
+
     return mkscore.astype(np.float16, copy=False)


@@ -147,22 +149,50 @@ def run_latent_to_gene(config: LatentToGeneConfig):
     )

     # Homologs transformation
-    if config.homolog_file is not None:
-        logger.info(f"------Transforming the {config.species} to HUMAN_GENE_SYM...")
-        homologs = pd.read_csv(config.homolog_file, sep="\t")
-        if homologs.shape[1] != 2:
-            raise ValueError(
-                "Homologs file must have two columns: one for the species and one for the human gene symbol."
+    if config.homolog_file is not None and config.species is not None:
+        species_col_name = f"{config.species}_homolog"
+
+        # Check if homolog conversion has already been performed
+        if species_col_name in adata.var.columns:
+            logger.warning(
+                f"Column '{species_col_name}' already exists in adata.var. "
+                f"It appears gene names have already been converted to human gene symbols. "
+                f"Skipping homolog transformation."
+            )
+        else:
+            logger.info(f"------Transforming the {config.species} to HUMAN_GENE_SYM...")
+            homologs = pd.read_csv(config.homolog_file, sep="\t")
+            if homologs.shape[1] != 2:
+                raise ValueError(
+                    "Homologs file must have two columns: one for the species and one for the human gene symbol."
+                )
+
+            homologs.columns = [config.species, "HUMAN_GENE_SYM"]
+            homologs.set_index(config.species, inplace=True)
+
+            # original_gene_names = adata.var_names.copy()
+
+            # Filter genes present in homolog file
+            adata = adata[:, adata.var_names.isin(homologs.index)]
+            logger.info(f"{adata.shape[1]} genes retained after homolog transformation.")
+            if adata.shape[1] < 100:
+                raise ValueError("Too few genes retained in ST data (<100).")
+
+            # Create mapping table of original to human gene names
+            gene_mapping = pd.Series(
+                homologs.loc[adata.var_names, "HUMAN_GENE_SYM"].values, index=adata.var_names
             )

-        homologs.columns = [config.species, "HUMAN_GENE_SYM"]
-        homologs.set_index(config.species, inplace=True)
-        adata = adata[:, adata.var_names.isin(homologs.index)]
-        logger.info(f"{adata.shape[1]} genes retained after homolog transformation.")
-        if adata.shape[1] < 100:
-            raise ValueError("Too few genes retained in ST data (<100).")
-        adata.var_names = homologs.loc[adata.var_names, "HUMAN_GENE_SYM"].values
-        adata = adata[:, ~adata.var_names.duplicated()]
+            # Store original species gene names in var dataframe with the suffixed column name
+            adata.var[species_col_name] = adata.var_names.values
+
+            # Convert var_names to human gene symbols
+            adata.var_names = gene_mapping.values
+            adata.var.index.name = "HUMAN_GENE_SYM"
+
+            # Remove duplicated genes after conversion
+            adata = adata[:, ~adata.var_names.duplicated()]
+            logger.info(f"{adata.shape[1]} genes retained after removing duplicates.")

     if config.annotation is not None:
         cell_annotations = adata.obs[config.annotation].values
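
The rewritten block also preserves the original species symbols instead of discarding them. A minimal standalone sketch of the mapping step, using a made-up mouse homolog table; the real pipeline reads the table from `config.homolog_file` and applies it to `adata.var_names` as in the hunk above:

```python
import pandas as pd

# Made-up mouse-to-human homolog table; the real one comes from config.homolog_file.
homologs = pd.DataFrame(
    {"mouse": ["Gfap", "Aif1", "Actb"], "HUMAN_GENE_SYM": ["GFAP", "AIF1", "ACTB"]}
).set_index("mouse")

var_names = pd.Index(["Gfap", "Actb", "Xist"])  # stand-in for adata.var_names

# Keep only genes with a homolog, then build the original -> human mapping.
kept = var_names[var_names.isin(homologs.index)]
gene_mapping = pd.Series(homologs.loc[kept, "HUMAN_GENE_SYM"].values, index=kept)
print(gene_mapping)  # Gfap -> GFAP, Actb -> ACTB; Xist is dropped
```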
@@ -218,11 +248,18 @@ def run_latent_to_gene(config: LatentToGeneConfig):
     # Create mappings
     n_cells = adata.n_obs
     n_genes = adata.n_vars
-
+    pearson_residuals = True if "pearson_residuals" in adata.layers else False
     ranks = np.zeros((n_cells, adata.n_vars), dtype=np.float16)
-    for i in tqdm(range(n_cells), desc="Computing ranks per cell"):
-        data = adata_X[i, :].toarray().flatten()
-        ranks[i, :] = rankdata(data, method="average")
+
+    if pearson_residuals:
+        logger.info("Using pearson residuals for ranking.")
+        data = adata.layers["pearson_residuals"]
+        for i in tqdm(range(n_cells), desc="Computing ranks per cell"):
+            ranks[i, :] = rankdata(data[i, :], method="average")
+    else:
+        for i in tqdm(range(n_cells), desc="Computing ranks per cell"):
+            data = adata_X[i, :].toarray().flatten()
+            ranks[i, :] = rankdata(data, method="average")

     if gM is None:
         gM = gmean(ranks, axis=0)
@@ -252,6 +289,7 @@ def run_latent_to_gene(config: LatentToGeneConfig):
         ranks,
         frac_whole,
         adata_X_bool,
+        pearson_residuals,
     )

     logger.info("------Computing marker scores...")
gsMap/run_all_mode.py CHANGED
@@ -67,6 +67,8 @@ def run_pipeline(config: RunAllModeConfig):
         sample_name=config.sample_name,
         annotation=config.annotation,
         data_layer=config.data_layer,
+        n_comps=config.n_comps,
+        pearson_residuals=config.pearson_residuals,
     )

     # Step 1: Find latent representations
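
`run_pipeline` now forwards two new fields to the latent-representation step. A hypothetical invocation; the two new keyword arguments come from the hunk above, while the import paths and the remaining constructor arguments are assumptions, not part of this diff:

```python
from gsMap.config import RunAllModeConfig   # assumed module path
from gsMap.run_all_mode import run_pipeline

config = RunAllModeConfig(
    workdir="./gsmap_workdir",   # assumed field
    sample_name="E16.5_E1S1",    # example sample name
    annotation="annotation",
    data_layer="count",
    n_comps=300,                 # new: number of latent components
    pearson_residuals=True,      # new: enables residual-based ranking downstream
)
run_pipeline(config)
```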
@@ -1,42 +1,7 @@
-from pathlib import Path
-
 import bitarray as ba
 import numpy as np
 import pandas as pd
-from scipy.sparse import csr_matrix, load_npz, save_npz
-from tqdm import tqdm, trange
-
-
-# Define the log class
-class Logger:
-    # -
-    def __init__(self, fh):
-        self.log_fh = open(fh, "w")
-
-    # -
-    def log(self, msg):
-        """
-        Print to log file and stdout.
-        """
-        print(msg, file=self.log_fh)
-        print(msg)
-
-    # -
-    def close(self):
-        self.log_fh.close()
-
-
-# Compute ld-score using cellular annotations
-def get_compression(fh):
-    """Which sort of compression should we use with read_csv?"""
-    if fh.endswith("gz"):
-        compression = "gzip"
-    elif fh.endswith("bz2"):
-        compression = "bz2"
-    else:
-        compression = None
-    # -
-    return compression
+from tqdm import tqdm


 # Define the reading functions
@@ -67,29 +32,17 @@ def ID_List_Factory(colnames, keepcol, fname_end, header=None, usecols=None):
             end = self.fname_end
             if end and not fname.endswith(end):
                 raise ValueError(f"{end} filename must end in {end}")
-            comp = get_compression(fname)
             self.df = pd.read_csv(
-                fname, header=self.header, usecols=self.usecols, sep=r"\s+", compression=comp
+                fname,
+                header=self.header,
+                usecols=self.usecols,
+                sep=r"\s+",
             )
             if self.colnames:
                 self.df.columns = self.colnames
             if self.keepcol is not None:
                 self.IDList = self.df.iloc[:, [self.keepcol]].astype("object")

-        # -
-        def loj(self, externalDf):
-            """
-            Perform a left outer join operation with the given external DataFrame.
-            """
-            r = externalDf.columns[0]
-            l = self.IDList.columns[0]
-            merge_df = externalDf.iloc[:, [0]]
-            merge_df["keep"] = True
-            z = pd.merge(self.IDList, merge_df, how="left", left_on=l, right_on=r, sort=False)
-            ii = z["keep"]
-            return np.nonzero(ii)[0]
-
-        # -
     return IDContainer


@@ -512,257 +465,25 @@ class PlinkBEDFile(GenotypeArrayInMemory):
         return Y


-class PlinkBEDFileWithR2Cache(PlinkBEDFile):
-    def compute_r2_cache(
-        self,
-        block_left,
-        output_cache_file_dir: Path,
-        chunk_size=500_000_000,
-        c=500,
-        r2_threshold=1e-4,
-        annot=None,
-    ):
-        func = np.square
-        snp_getter = self.nextSNPs
-        data, rows, cols = [], [], []
-
-        def add_rfuncAB(rfuncAB, l_A, l_B):
-            non_zero_indices = np.nonzero(rfuncAB > r2_threshold)
-            data.extend(rfuncAB[non_zero_indices])
-            rows.extend(l_A + non_zero_indices[0])
-            cols.extend(l_B + non_zero_indices[1])
-
-        # def add_rfuncAB(rfuncAB, l_A, l_B):
-        #     # not need select non zero indices
-        #     data.extend(rfuncAB.flatten())
-        #     rows.extend(l_A + np.repeat(np.arange(rfuncAB.shape[0]), rfuncAB.shape[1]))
-        #     cols.extend(l_B + np.tile(np.arange(rfuncAB.shape[1]), rfuncAB.shape[0]))
-
-        # def add_rfuncBB(rfuncBB, l_B):
-        #     non_zero_indices = np.nonzero(rfuncBB)
-        #     data.extend(rfuncBB[non_zero_indices])
-        #     rows.extend(l_B + non_zero_indices[0])
-        #     cols.extend(l_B + non_zero_indices[1])
-
-        def add_rfuncBB(rfuncBB, l_B):
-            non_zero_indices = np.nonzero(rfuncBB > r2_threshold)
-            data.extend(rfuncBB[non_zero_indices])
-            rows.extend(l_B + non_zero_indices[0])
-            cols.extend(l_B + non_zero_indices[1])
-            if len(data) > chunk_size:
-                # save the cache
-                print(f"Start saving the cache file: {output_cache_file_dir / f'{l_B}.npz'}")
-                r2_sparse_matrix = csr_matrix(
-                    (data, (rows, cols)), shape=(self.m, self.m), dtype="float16"
-                )
-                save_npz(output_cache_file_dir / f"{l_B}.npz", r2_sparse_matrix)
-                # reset the data
-                data.clear()
-                rows.clear()
-                cols.clear()
-
-        m, n = self.m, self.n
-        block_sizes = np.array(np.arange(m) - block_left)
-        block_sizes = np.ceil(block_sizes / c) * c
-        if annot is None:
-            annot = np.ones((m, 1))
-        else:
-            annot_m = annot.shape[0]
-            if annot_m != self.m:
-                raise ValueError("Incorrect number of SNPs in annot")
-        # -
-        # n_a = annot.shape[1]  # number of annotations
-        # cor_sum = np.zeros((m, n_a))
-        # b = index of first SNP for which SNP 0 is not included in LD Score
-        b = np.nonzero(block_left > 0)
-        if np.any(b):
-            b = b[0][0]
-        else:
-            b = m
-        b = int(np.ceil(b / c) * c)  # round up to a multiple of c
-        if b > m:
-            c = 1
-            b = m
-
-        l_A = 0  # l_A := index of leftmost SNP in matrix A
-        A = snp_getter(b)
-        rfuncAB = np.zeros((b, c))
-        rfuncBB = np.zeros((c, c))
-        # chunk inside of block
-        for l_B in np.arange(0, b, c):  # l_B := index of leftmost SNP in matrix B
-            B = A[:, l_B : l_B + c]
-            # ld matrix
-            np.dot(A.T, B / n, out=rfuncAB)
-            # ld matrix square
-            rfuncAB = func(rfuncAB)
-            add_rfuncAB(rfuncAB, l_A, l_B)
-            # cor_sum[l_A:l_A + b, :] += np.dot(rfuncAB, annot[l_B:l_B + c, :])
-
-        # chunk to right of block
-        b0 = b
-        md = int(c * np.floor(m / c))
-        end = md + 1 if md != m else md
-        for l_B in trange(b0, end, c, desc=f"Compute r2 cache for {output_cache_file_dir.name}"):
-            # check if the annot matrix is all zeros for this block + chunk
-            # this happens w/ sparse categories (i.e., pathways)
-            # update the block
-            old_b = b
-            b = int(block_sizes[l_B])
-            if l_B > b0 and b > 0:
-                # block_size can't increase more than c
-                # block_size can't be less than c unless it is zero
-                # both of these things make sense
-                A = np.hstack((A[:, old_b - b + c : old_b], B))
-                l_A += old_b - b + c
-            elif l_B == b0 and b > 0:
-                A = A[:, b0 - b : b0]
-                l_A = b0 - b
-            elif b == 0:  # no SNPs to left in window, e.g., after a sequence gap
-                A = np.array(()).reshape((n, 0))
-                l_A = l_B
-            if l_B == md:
-                c = m - md
-                rfuncAB = np.zeros((b, c))
-                rfuncBB = np.zeros((c, c))
-            if b != old_b:
-                rfuncAB = np.zeros((b, c))
-            # -
-            B = snp_getter(c)
-            p1 = np.all(annot[l_A : l_A + b, :] == 0)
-            p2 = np.all(annot[l_B : l_B + c, :] == 0)
-            if p1 and p2:
-                continue
-            # -
-            np.dot(A.T, B / n, out=rfuncAB)
-            rfuncAB = func(rfuncAB)
-            # cor_sum[l_A:l_A + b, :] += np.dot(rfuncAB, annot[l_B:l_B + c, :])
-            # cor_sum[l_B:l_B + c, :] += np.dot(annot[l_A:l_A + b, :].T, rfuncAB).T
-            add_rfuncAB(rfuncAB, l_A, l_B)
-            add_rfuncAB(rfuncAB.T, l_B, l_A)
-            np.dot(B.T, B / n, out=rfuncBB)
-            rfuncBB = func(rfuncBB)
-            # cor_sum[l_B:l_B + c, :] += np.dot(rfuncBB, annot[l_B:l_B + c, :])
-            add_rfuncBB(rfuncBB, l_B)
-        if len(data) > 0:
-            # save remaining data
-            # save the cache
-            print(f"Start saving the cache file: {output_cache_file_dir / f'{l_B}.npz'}")
-            r2_sparse_matrix = csr_matrix((data, (rows, cols)), shape=(m, m), dtype="float16")
-            save_npz(output_cache_file_dir / f"{l_B}.npz", r2_sparse_matrix)
-        # combine the cache files
-        print(f"Start combining the cache files in {output_cache_file_dir}")
-        cached_r2_matrix_files = list(output_cache_file_dir.glob("*.npz"))
-        combined_r2_matrix_files = self.load_r2_matrix_from_cache_files(output_cache_file_dir)
-        # remove the cache files
-        for cached_r2_matrix_file in cached_r2_matrix_files:
-            cached_r2_matrix_file.unlink()
-        # save the combined r2 matrix
-        print(f"Start saving the combined r2 matrix in {output_cache_file_dir}")
-        combined_r2_matrix_file = output_cache_file_dir / "combined_r2_matrix.npz"
-        save_npz(combined_r2_matrix_file, combined_r2_matrix_files)
-
-    def get_ldscore_using_r2_cache(self, annot_matrix, cached_r2_matrix_dir):
-        """
-        Compute the r2 matrix multiplication with annot_matrix
-        """
-        # Compute the r2 matrix multiplication with annot_matrix
-        cached_r2_matrix_dir = Path(cached_r2_matrix_dir)
-        # iter the cached r2 matrix files
-        result_matrix = np.zeros((self.m, annot_matrix.shape[1]))
-        cached_r2_matrix_files = list(cached_r2_matrix_dir.glob("*.npz"))
-        assert len(cached_r2_matrix_files) > 0, (
-            f"No cached r2 matrix files in {cached_r2_matrix_dir}"
-            f"Please run the function compute_r2_cache first!"
-        )
-        for r2_matrix_file in tqdm(
-            cached_r2_matrix_files, desc=f"Compute ld score for {cached_r2_matrix_dir.name}"
-        ):
-            print(f"Compute r2 matrix multiplication for {r2_matrix_file}")
-            r2_matrix = load_npz(r2_matrix_file)
-            result_matrix += r2_matrix.dot(annot_matrix)
-        return result_matrix
-
-    def load_r2_matrix_from_cache_files(self, cached_r2_matrix_dir):
-        """
-        Load the r2 matrix from cache
-        """
-        cached_r2_matrix_dir = Path(cached_r2_matrix_dir)
-        # iter the cached r2 matrix files
-        cached_r2_matrix_files = list(cached_r2_matrix_dir.glob("*.npz"))
-        assert len(cached_r2_matrix_files) > 0, (
-            f"No cached r2 matrix files in {cached_r2_matrix_dir}"
-            f"Please run the function compute_r2_cache first!"
-        )
-        # load the r2 matrix
-        r2_matrix = load_npz(cached_r2_matrix_files[0])
-        for r2_matrix_file in tqdm(
-            cached_r2_matrix_files[1:], desc=f"Load r2 matrix from {cached_r2_matrix_dir.name}"
-        ):
-            print(f"Load r2 matrix from {r2_matrix_file}")
-            r2_matrix += load_npz(r2_matrix_file)
-        # to float16
-        r2_matrix = r2_matrix.astype("float16")
-        return r2_matrix
-
-    def load_combined_r2_matrix(self, cached_r2_matrix_dir):
-        """
-        Load the combined r2 matrix
-        """
-        combined_r2_matrix_file = Path(cached_r2_matrix_dir) / "combined_r2_matrix.npz"
-        assert combined_r2_matrix_file.exists(), (
-            f"No combined r2 matrix file in {cached_r2_matrix_dir}"
-            f"Should delete the cache files and run the function compute_r2_cache first!"
-        )
-        # load the r2 matrix
-        r2_matrix = load_npz(combined_r2_matrix_file)
-        # to float16
-        r2_matrix = r2_matrix.astype("float16")
-        return r2_matrix
-
-
-def load_bfile(bfile_chr_prefix):
+def load_bfile(bfile_chr_prefix, keep_snps=None, keep_indivs=None, mafMin=None):
     PlinkBIMFile = ID_List_Factory(
         ["CHR", "SNP", "CM", "BP", "A1", "A2"], 1, ".bim", usecols=[0, 1, 2, 3, 4, 5]
     )
     PlinkFAMFile = ID_List_Factory(["IID"], 0, ".fam", usecols=[1])

-    snp_file, snp_obj = bfile_chr_prefix + ".bim", PlinkBIMFile
-    array_snps = snp_obj(snp_file)
-    m = len(array_snps.IDList)
-    print(f"Read list of {m} SNPs from {snp_file}")
-    #
+    snp_file = bfile_chr_prefix + ".bim"
+    array_snps = PlinkBIMFile(snp_file)
+
     # Load fam
-    ind_file, ind_obj = bfile_chr_prefix + ".fam", PlinkFAMFile
-    array_indivs = ind_obj(ind_file)
+    ind_file = bfile_chr_prefix + ".fam"
+    array_indivs = PlinkFAMFile(ind_file)
+
     n = len(array_indivs.IDList)
-    print(f"Read list of {n} individuals from {ind_file}")

     # Load genotype array
-    array_file, array_obj = bfile_chr_prefix + ".bed", PlinkBEDFileWithR2Cache
-    geno_array = array_obj(
-        array_file, n, array_snps, keep_snps=None, keep_indivs=None, mafMin=None
+    array_file = bfile_chr_prefix + ".bed"
+    geno_array = PlinkBEDFile(
+        array_file, n, array_snps, keep_snps=keep_snps, keep_indivs=keep_indivs, mafMin=mafMin
     )

     return array_snps, array_indivs, geno_array
-
-
-def generate_r2_matrix_chr_cache(bfile_chr_prefix, ld_wind_cm, output_cache_file_dir):
-    # Load genotype array
-    array_snps, array_indivs, geno_array = load_bfile(bfile_chr_prefix)
-    # Compute block lefts
-    # block_left = getBlockLefts(geno_array.df[:, 3], ld_wind_cm)
-    # Compute LD score
-    # r2_matrix = geno_array.load_r2_matrix_from_cache(output_cache_file_dir)
-
-
-def generate_r2_matrix_cache(bfile_prefix, chromosome_list, r2_cache_dir, ld_wind_cm=1):
-    r2_cache_dir = Path(r2_cache_dir)
-
-    for chr in chromosome_list:
-        output_cache_file_prefix = r2_cache_dir / f"chr{chr}"
-        output_cache_file_prefix.mkdir(parents=True, exist_ok=True)
-        bfile_chr_prefix = bfile_prefix + "." + str(chr)
-        generate_r2_matrix_chr_cache(
-            bfile_chr_prefix, ld_wind_cm=ld_wind_cm, output_cache_file_dir=output_cache_file_prefix
-        )
-        print(f"Compute r2 matrix for chr{chr} done!")
@@ -89,37 +89,6 @@ def which_compression(fh):
     return suffix, compression


-def _read_ref_ld(ld_file):
-    suffix = ".l2.ldscore"
-    file = ld_file
-    first_fh = f"{file}1{suffix}"
-    s, compression = which_compression(first_fh)
-    #
-    ldscore_array = []
-    print(f"Reading ld score annotations from {file}[1-22]{suffix}.{compression}")
-
-    for chr in range(1, 23):
-        file_chr = f"{file}{chr}{suffix}{s}"
-        #
-        if compression == "parquet":
-            x = pd.read_parquet(file_chr)
-        elif compression == "feather":
-            x = pd.read_feather(file_chr)
-        else:
-            x = pd.read_csv(file_chr, compression=compression, sep="\t")
-
-        x = x.sort_values(by=["CHR", "BP"])  # SEs will be wrong unless sorted
-
-        columns_to_drop = ["MAF", "CM", "Gene", "TSS", "CHR", "BP"]
-        columns_to_drop = [col for col in columns_to_drop if col in x.columns]
-        x = x.drop(columns_to_drop, axis=1)
-
-        ldscore_array.append(x)
-    #
-    ref_ld = pd.concat(ldscore_array, axis=0)
-    return ref_ld
-
-
 def _read_ref_ld_v2(ld_file):
     suffix = ".l2.ldscore"
     file = ld_file
@@ -185,23 +154,6 @@ def M(fh, common=False):
     return np.array(M_array).reshape((1, len(M_array)))


-def _check_variance(M_annot, ref_ld):
-    """
-    Remove zero-variance LD Scores.
-    """
-    ii = ref_ld.iloc[:, 1:].var() == 0  # NB there is a SNP column here
-    if ii.all():
-        raise ValueError("All LD Scores have zero variance.")
-    else:
-        print("Removing partitioned LD Scores with zero variance.")
-        ii_snp = np.array([True] + list(~ii))
-        ii_m = np.array(~ii)
-        ref_ld = ref_ld.iloc[:, ii_snp]
-        M_annot = M_annot[:, ii_m]
-    # -
-    return M_annot, ref_ld, ii
-
-
 def _check_variance_v2(M_annot, ref_ld):
     ii = ref_ld.var() == 0
     if ii.all():
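
Only the `_v2` variant survives; unlike the removed `_check_variance`, it assumes `ref_ld` no longer carries a leading SNP column. A toy sketch of the zero-variance filtering it performs (data invented; the `M_annot` trimming mirrors the removed version):

```python
import numpy as np
import pandas as pd

# Two annotations; "annot1" is constant, i.e. zero variance (values invented).
ref_ld = pd.DataFrame({"base": [1.0, 1.2, 0.9], "annot1": [2.0, 2.0, 2.0]})
M_annot = np.array([[1000, 500]])

ii = ref_ld.var() == 0             # flag zero-variance LD Score columns
ref_ld = ref_ld.loc[:, ~ii]        # drop them from the LD Scores
M_annot = M_annot[:, ~ii.values]   # and from the matching M counts
print(list(ref_ld.columns), M_annot)  # ['base'] [[1000]]
```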
@@ -247,31 +199,3 @@ def _read_w_ld(w_file):
     w_ld.columns = ["SNP", "LD_weights"]

     return w_ld
-
-
-# Fun for merging
-def _merge_and_log(ld, sumstats, noun):
-    """
-    Wrap smart merge with log messages about # of SNPs.
-    """
-    sumstats = smart_merge(ld, sumstats)
-    msg = "After merging with {F}, {N} SNPs remain."
-    if len(sumstats) == 0:
-        raise ValueError(msg.format(N=len(sumstats), F=noun))
-    else:
-        print(msg.format(N=len(sumstats), F=noun))
-    # -
-    return sumstats
-
-
-def smart_merge(x, y):
-    """
-    Check if SNP columns are equal. If so, save time by using concat instead of merge.
-    """
-    if len(x) == len(y) and (x.index == y.index).all() and (x.SNP == y.SNP).all():
-        x = x.reset_index(drop=True)
-        y = y.reset_index(drop=True).drop("SNP", 1)
-        out = pd.concat([x, y], axis=1)
-    else:
-        out = pd.merge(x, y, how="inner", on="SNP")
-    return out
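
The merge helpers are gone as well. `smart_merge` saved a merge when the SNP columns already matched, but its fast path called `y.drop("SNP", 1)`, a positional `axis` form that modern pandas rejects; its general path is a plain inner join, sketched below with toy frames:

```python
import pandas as pd

ld = pd.DataFrame({"SNP": ["rs1", "rs2", "rs3"], "LD_weights": [1.0, 1.2, 0.9]})
sumstats = pd.DataFrame({"SNP": ["rs2", "rs3", "rs4"], "Z": [0.5, -1.1, 2.0]})

# Plain inner join on SNP, the general path of the removed smart_merge.
merged = pd.merge(ld, sumstats, how="inner", on="SNP")
print(f"After merging with w_ld, {len(merged)} SNPs remain.")  # 2 SNPs
```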