gwaslab 3.4.38__py3-none-any.whl → 3.4.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51)
  1. gwaslab/bd_common_data.py +6 -3
  2. gwaslab/bd_download.py +9 -9
  3. gwaslab/bd_get_hapmap3.py +43 -9
  4. gwaslab/g_Log.py +14 -5
  5. gwaslab/g_Sumstats.py +86 -18
  6. gwaslab/g_SumstatsPair.py +70 -23
  7. gwaslab/g_SumstatsT.py +2 -2
  8. gwaslab/g_version.py +10 -10
  9. gwaslab/hm_casting.py +9 -4
  10. gwaslab/hm_harmonize_sumstats.py +88 -83
  11. gwaslab/io_preformat_input.py +14 -14
  12. gwaslab/io_read_ldsc.py +49 -1
  13. gwaslab/ldsc_irwls.py +198 -0
  14. gwaslab/ldsc_jackknife.py +514 -0
  15. gwaslab/ldsc_ldscore.py +417 -0
  16. gwaslab/ldsc_parse.py +294 -0
  17. gwaslab/ldsc_regressions.py +747 -0
  18. gwaslab/ldsc_sumstats.py +629 -0
  19. gwaslab/qc_check_datatype.py +1 -1
  20. gwaslab/qc_fix_sumstats.py +163 -161
  21. gwaslab/util_ex_calculate_ldmatrix.py +2 -2
  22. gwaslab/util_ex_gwascatalog.py +24 -24
  23. gwaslab/util_ex_ldproxyfinder.py +9 -9
  24. gwaslab/util_ex_ldsc.py +189 -0
  25. gwaslab/util_in_calculate_gc.py +6 -6
  26. gwaslab/util_in_calculate_power.py +42 -43
  27. gwaslab/util_in_convert_h2.py +8 -8
  28. gwaslab/util_in_fill_data.py +28 -28
  29. gwaslab/util_in_filter_value.py +91 -52
  30. gwaslab/util_in_get_density.py +8 -8
  31. gwaslab/util_in_get_sig.py +407 -65
  32. gwaslab/viz_aux_annotate_plot.py +12 -12
  33. gwaslab/viz_aux_quickfix.py +18 -18
  34. gwaslab/viz_aux_reposition_text.py +3 -3
  35. gwaslab/viz_aux_save_figure.py +14 -5
  36. gwaslab/viz_plot_compare_af.py +29 -30
  37. gwaslab/viz_plot_compare_effect.py +63 -71
  38. gwaslab/viz_plot_miamiplot2.py +6 -6
  39. gwaslab/viz_plot_mqqplot.py +17 -3
  40. gwaslab/viz_plot_qqplot.py +1 -1
  41. gwaslab/viz_plot_regionalplot.py +33 -32
  42. gwaslab/viz_plot_rg_heatmap.py +28 -26
  43. gwaslab/viz_plot_stackedregional.py +40 -21
  44. gwaslab/viz_plot_trumpetplot.py +50 -55
  45. gwaslab-3.4.39.dist-info/LICENSE +674 -0
  46. {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/METADATA +4 -3
  47. gwaslab-3.4.39.dist-info/RECORD +80 -0
  48. gwaslab-3.4.38.dist-info/RECORD +0 -72
  49. /gwaslab-3.4.38.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
  50. {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
  51. {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
gwaslab/g_version.py CHANGED
@@ -3,10 +3,10 @@ import subprocess
 import os
 import numpy as np

-def _show_version(log=Log()):
+def _show_version(log=Log(), verbose=True):
     # show version when loading sumstats
-    log.write("GWASLab v{} https://cloufield.github.io/gwaslab/".format(gwaslab_info()["version"]))
-    log.write("(C) 2022-2024, Yunye He, Kamatani Lab, MIT License, gwaslab@gmail.com")
+    log.write("GWASLab v{} https://cloufield.github.io/gwaslab/".format(gwaslab_info()["version"]),verbose=verbose)
+    log.write("(C) 2022-2024, Yunye He, Kamatani Lab, MIT License, gwaslab@gmail.com",verbose=verbose)

 def _get_version():
     # return short version string like v3.4.33
@@ -15,12 +15,12 @@ def _get_version():
 def gwaslab_info():
     # version meta information
     dic={
-        "version":"3.4.38",
-        "release_date":"20240203"
+        "version":"3.4.39",
+        "release_date":"20240210"
     }
     return dic

-def _checking_plink_version(v=2,log=Log()):
+def _checking_plink_version(v=2,log=Log(), verbose=True):
     if v==1:
         which_plink_script = "plink --version"
     elif v==2:
@@ -29,19 +29,19 @@ def _checking_plink_version(v=2,log=Log()):
     log.write(" -PLINK version: {}".format(output.strip()))
     return log

-def _checking_r_version(r, log):
+def _checking_r_version(r, log=Log(), verbose=True):
     which_r_script = "{} --version".format(r)
     output = subprocess.check_output(which_r_script, stderr=subprocess.STDOUT, shell=True,text=True)
-    log.write(" -R version: {}".format(output.strip()))
+    log.write(" -R version: {}".format(output.strip()),verbose=verbose)
     return log

-def _check_susie_version(r,log):
+def _check_susie_version(r,log=Log(), verbose=True):
     rscript = 'print(packageVersion("susieR"))'
     temp_r = "_gwaslab_susie_temp_check_version_{}.R".format(np.random.randint(1, 99999999))
     with open(temp_r,"w") as file:
         file.write(rscript)
     which_susie_script = "{} {}".format(r, temp_r)
     output = subprocess.check_output(which_susie_script, stderr=subprocess.STDOUT, shell=True,text=True)
-    log.write(" -SuSieR version: {}".format(output.strip()))
+    log.write(" -SuSieR version: {}".format(output.strip()),verbose=verbose)
     os.remove(temp_r)
     return log
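The pattern behind most hunks in this release is visible here: instead of guarding each call with `if verbose:`, the helpers now accept a `verbose` argument and pass it straight to `log.write()`. A minimal usage sketch under stated assumptions: the `Log` import path and its `write(..., verbose=...)` keyword are inferred from the calls in this diff, not verified against the package, and `_checking_plink_version` requires PLINK2 on PATH.

# Sketch only: assumes gwaslab.g_Log.Log accepts write(*args, verbose=..., end=..., show_time=...),
# as the calls in this diff suggest, and that these helpers are importable as shown.
from gwaslab.g_Log import Log
from gwaslab.g_version import _show_version, _checking_plink_version

log = Log()
_show_version(log=log, verbose=True)                        # prints the version banner through the logger
_show_version(log=log, verbose=False)                       # same call, banner suppressed rather than wrapped in `if verbose:`
log = _checking_plink_version(v=2, log=log, verbose=True)   # logs " -PLINK version: ..." (requires plink2 on PATH) and returns the logger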
gwaslab/hm_casting.py CHANGED
@@ -5,11 +5,15 @@ from pandas.api.types import CategoricalDtype
 from gwaslab.g_vchange_status import copy_status
 from gwaslab.g_vchange_status import vchange_status
 from gwaslab.qc_fix_sumstats import flipallelestats
+from gwaslab.qc_check_datatype import check_datatype
+from gwaslab.qc_fix_sumstats import start_to
 from gwaslab.util_in_fill_data import filldata
 from Bio import SeqIO
 from itertools import combinations

-def _merge_mold_with_sumstats(mold, sumstats, ref_path=None, windowsizeb=10, log=Log(),suffixes=("_MOLD",""),verbose=True,return_not_matched_mold =False):
+def _merge_mold_with_sumstats_by_chrpos(mold, sumstats, ref_path=None, windowsizeb=10, log=Log(),suffixes=("_MOLD",""),verbose=True,return_not_matched_mold =False):
+
+
     cols_to_drop = []
     for i in sumstats.columns:
         if i in ["SNPID","rsID"]:
@@ -31,6 +35,7 @@ def _merge_mold_with_sumstats(mold, sumstats, ref_path=None, windowsizeb=10, log
     if return_not_matched_mold:
         mold["_IDENTIFIER_FOR_VARIANT"] = range(len(mold))

+    # mold sumffix + mold
     mold_sumstats = pd.merge(mold, sumstats, on=["CHR","POS"], how="inner",suffixes=suffixes)
     log.write(" -After merging by CHR and POS:{}".format(len(mold_sumstats)), verbose=verbose)

@@ -133,9 +138,9 @@ def _sort_pair_cols(molded_sumstats, verbose=True, log=Log(), order=None, stats_
         if i not in order:
             output_columns.append(i)

-    if verbose: log.write(" -Reordering columns to :", ",".join(output_columns), verbose=verbose)
+    log.write(" -Reordering columns to :", ",".join(output_columns), verbose=verbose)
     molded_sumstats = molded_sumstats[ output_columns]
-    if verbose: log.write("Finished sorting columns successfully!", verbose=verbose)
+    log.write("Finished sorting columns successfully!", verbose=verbose)

     return molded_sumstats

@@ -168,7 +173,7 @@ def _match_two_sumstats(mold,sumstats,ref_path,windowsizeb=25,verbose=True,log=L
         record_chr = int(str(record.id).strip("chrCHR").upper())

         if record_chr in chromlist:
-            if verbose: log.write(record_chr," ", end="",show_time=False)
+            log.write(record_chr," ", end="",show_time=False,verbose=verbose)
             chromlist.remove(record_chr)
         else:
             continue
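For orientation, the renamed `_merge_mold_with_sumstats_by_chrpos` helper is at its core an inner join of two summary-statistics tables on CHR and POS, with suffixes separating the mold's columns. A toy pandas illustration of that merge step (the data values are made up; only the `pd.merge` call mirrors the hunk above):

# Hypothetical data; the merge call follows the one shown in the diff.
import pandas as pd

mold = pd.DataFrame({"CHR": [1, 1, 2], "POS": [100, 200, 300], "EA": ["A", "T", "G"], "BETA": [0.1, -0.2, 0.3]})
sumstats = pd.DataFrame({"CHR": [1, 2], "POS": [100, 300], "EA": ["A", "G"], "BETA": [0.12, 0.28]})

# Inner join on CHR/POS; overlapping columns from the mold get the "_MOLD" suffix.
mold_sumstats = pd.merge(mold, sumstats, on=["CHR", "POS"], how="inner", suffixes=("_MOLD", ""))
print(len(mold_sumstats))               # 2 variants shared by CHR and POS
print(mold_sumstats.columns.tolist())   # ['CHR', 'POS', 'EA_MOLD', 'BETA_MOLD', 'EA', 'BETA']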
gwaslab/hm_harmonize_sumstats.py CHANGED
@@ -59,17 +59,17 @@ def rsidtochrpos(sumstats,
     if is_enough_info == False: return sumstats
     ############################################################################################

-    if verbose: log.write(" -rsID dictionary file: "+ path)
+    log.write(" -rsID dictionary file: "+ path,verbose=verbose)

     if ref_rsid_to_chrpos_tsv is not None:
         path = ref_rsid_to_chrpos_tsv

     if snpid in sumstats.columns and sum(sumstats[rsid].isna())>0:
-        if verbose: log.write(" -Filling na in rsID columns with SNPID...")
+        log.write(" -Filling na in rsID columns with SNPID...",verbose=verbose)
         sumstats.loc[sumstats[rsid].isna(),rsid] = sumstats.loc[sumstats[rsid].isna(),snpid]

     if sum(sumstats[rsid].isna())>0:
-        if verbose: log.write(" -Filling na in rsID columns with NA_xxx for {} variants...".format(sum(sumstats[rsid].isna())))
+        log.write(" -Filling na in rsID columns with NA_xxx for {} variants...".format(sum(sumstats[rsid].isna())),verbose=verbose)
         sumstats.loc[sumstats[rsid].isna(),rsid] = ["NA_" + str(x+1) for x in range(len(sumstats.loc[sumstats[rsid].isna(),rsid]))]

     dic_chuncks = pd.read_csv(path,sep="\t",usecols=[ref_rsid,ref_chr,ref_pos],
@@ -84,8 +84,8 @@ def rsidtochrpos(sumstats,
     if pos not in sumstats.columns:
         sumstats[pos] =pd.Series(dtype="Int64")

-    if verbose: log.write(" -Setting block size: ",chunksize)
-    if verbose: log.write(" -Loading block: ",end="")
+    log.write(" -Setting block size: ",chunksize,verbose=verbose)
+    log.write(" -Loading block: ",end="",verbose=verbose)
     for i,dic in enumerate(dic_chuncks):
         dic_to_update = dic[dic.index.notnull()]
         log.write(i," ",end=" ",show_time=False)
@@ -95,10 +95,10 @@ def rsidtochrpos(sumstats,
         sumstats.update(dic_to_update,overwrite="True")
         gc.collect()

-    if verbose: log.write("\n",end="",show_time=False)
+    log.write("\n",end="",show_time=False,verbose=verbose)
     sumstats = sumstats.reset_index()
     sumstats = sumstats.rename(columns = {'index':rsid})
-    if verbose: log.write(" -Updating CHR and POS finished.Start to re-fixing CHR and POS... ")
+    log.write(" -Updating CHR and POS finished.Start to re-fixing CHR and POS... ",verbose=verbose)
     sumstats = fixchr(sumstats,verbose=verbose)
     sumstats = fixpos(sumstats,verbose=verbose)
     sumstats = sortcolumn(sumstats,verbose=verbose)
@@ -158,17 +158,17 @@ def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None

     sumstats["rsn"] = pd.to_numeric(sumstats[rsid].str.strip("rs"),errors="coerce").astype("Int64")

-    if verbose: log.write(" -Source hdf5 file: ",path)
-    if verbose: log.write(" -Cores to use : ",n_cores)
-    if verbose: log.write(" -Blocksize (make sure it is the same as hdf5 file ): ",block_size)
+    log.write(" -Source hdf5 file: ",path,verbose=verbose)
+    log.write(" -Cores to use : ",n_cores,verbose=verbose)
+    log.write(" -Blocksize (make sure it is the same as hdf5 file ): ",block_size,verbose=verbose)

     input_columns= sumstats.columns
     sumstats_nonrs = sumstats.loc[sumstats["rsn"].isna()|sumstats["rsn"].duplicated(keep='first') ,:].copy()
     sumstats_rs = sumstats.loc[sumstats["rsn"].notnull(),:].copy()

-    if verbose: log.write(" -Non-Valid rsIDs: ",sum(sumstats["rsn"].isna()))
-    if verbose: log.write(" -Duplicated rsIDs except for the first occurrence: ",sum(sumstats.loc[~sumstats["rsn"].isna(), "rsn"].duplicated(keep='first')))
-    if verbose: log.write(" -Valid rsIDs: ", len(sumstats_rs))
+    log.write(" -Non-Valid rsIDs: ",sum(sumstats["rsn"].isna()),verbose=verbose)
+    log.write(" -Duplicated rsIDs except for the first occurrence: ",sum(sumstats.loc[~sumstats["rsn"].isna(), "rsn"].duplicated(keep='first')),verbose=verbose)
+    log.write(" -Valid rsIDs: ", len(sumstats_rs),verbose=verbose)

     del sumstats
     gc.collect()
@@ -185,16 +185,16 @@ def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None
     #
     pool = Pool(n_cores)
     if chrom not in input_columns:
-        if verbose: log.write(" -Initiating CHR ... ")
+        log.write(" -Initiating CHR ... ",verbose=verbose)
         sumstats_rs[chrom]=pd.Series(dtype="Int32")

     if pos not in input_columns:
-        if verbose: log.write(" -Initiating POS ... ")
+        log.write(" -Initiating POS ... ",verbose=verbose)
         sumstats_rs[pos]=pd.Series(dtype="Int64")

     df_split=[y for x, y in sumstats_rs.groupby('group', as_index=False)]
-    if verbose: log.write(" -Divided into groups: ",len(df_split))
-    if verbose: log.write(" -",set(sumstats_rs.loc[:,"group"].unique()))
+    log.write(" -Divided into groups: ",len(df_split),verbose=verbose)
+    log.write(" -",set(sumstats_rs.loc[:,"group"].unique()),verbose=verbose)

     # check keys
     store = pd.HDFStore(path, 'r')
@@ -202,21 +202,21 @@ def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None
     all_groups_len = len(all_groups)
     store.close()
     all_groups_max = max(map(lambda x: int(x.split("_")[1]), all_groups))
-    if verbose: log.write(" -Number of groups in HDF5: ",all_groups_len)
-    if verbose: log.write(" -Max index of groups in HDF5: ",all_groups_max)
+    log.write(" -Number of groups in HDF5: ",all_groups_len,verbose=verbose)
+    log.write(" -Max index of groups in HDF5: ",all_groups_max,verbose=verbose)

     # update CHR and POS using rsID with multiple threads
     sumstats_rs = pd.concat(pool.map(partial(merge_chrpos,all_groups_max=all_groups_max,path=path,build=build,status=status),df_split),ignore_index=True)
     sumstats_rs.loc[:,["CHR","POS"]] = sumstats_rs.loc[:,["CHR","POS"]].astype("Int64")
     del df_split
     gc.collect()
-    if verbose: log.write(" -Merging group data... ")
+    log.write(" -Merging group data... ",verbose=verbose)
     # drop group and rsn
     sumstats_rs = sumstats_rs.drop(columns=["group"])
     sumstats_nonrs = sumstats_nonrs.drop(columns=["rsn"])

     # merge back
-    if verbose: log.write(" -Append data... ")
+    log.write(" -Append data... ",verbose=verbose)
     sumstats = pd.concat([sumstats_rs,sumstats_nonrs],ignore_index=True)

     del sumstats_rs
@@ -308,8 +308,8 @@ def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="S
                               **_must_args)
     if is_enough_info == False: return sumstats
     ############################################################################################
-    if verbose: log.write(" -Reference genome FASTA file: "+ ref_path)
-    if verbose: log.write(" -Checking records: ", end="")
+    log.write(" -Reference genome FASTA file: "+ ref_path,verbose=verbose)
+    log.write(" -Checking records: ", end="",verbose=verbose)
     chromlist = get_chr_list(add_number=True)
     records = SeqIO.parse(ref_path, "fasta")
     for record in records:
@@ -321,11 +321,11 @@ def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="S
         else:
             i = record_chr
         if i in chromlist:
-            if verbose: log.write(record_chr," ", end="",show_time=False)
+            log.write(record_chr," ", end="",show_time=False,verbose=verbose)
             to_check_ref = (sumstats[chrom]==i) & (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna())
             sumstats.loc[to_check_ref,status] = sumstats.loc[to_check_ref,[pos,ea,nea,status]].apply(lambda x:check_status(x,record),axis=1)

-    if verbose: log.write("\n",end="",show_time=False)
+    log.write("\n",end="",show_time=False,verbose=verbose)

     sumstats[status] = sumstats[status].astype("string")
     available_to_check =sum( (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
@@ -337,25 +337,25 @@ def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="S
     #status_7=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[7]\w", case=False, flags=0, na=False))
     status_8=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[8]\w", case=False, flags=0, na=False))

-    if verbose: log.write(" -Variants allele on given reference sequence : ",status_0)
-    if verbose: log.write(" -Variants flipped : ",status_3)
+    log.write(" -Variants allele on given reference sequence : ",status_0,verbose=verbose)
+    log.write(" -Variants flipped : ",status_3,verbose=verbose)
     raw_matching_rate = (status_3+status_0)/available_to_check
     flip_rate = status_3/available_to_check
-    if verbose: log.write(" -Raw Matching rate : ","{:.2f}%".format(raw_matching_rate*100))
+    log.write(" -Raw Matching rate : ","{:.2f}%".format(raw_matching_rate*100),verbose=verbose)
     if raw_matching_rate <0.8:
-        if verbose: log.warning("Matching rate is low, please check if the right reference genome is used.")
+        log.warning("Matching rate is low, please check if the right reference genome is used.")
     if flip_rate > 0.85 :
-        if verbose: log.write(" -Flipping variants rate > 0.85, it is likely that the EA is aligned with REF in the original dataset.")
+        log.write(" -Flipping variants rate > 0.85, it is likely that the EA is aligned with REF in the original dataset.",verbose=verbose)

-    if verbose: log.write(" -Variants inferred reverse_complement : ",status_4)
-    if verbose: log.write(" -Variants inferred reverse_complement_flipped : ",status_5)
-    if verbose: log.write(" -Both allele on genome + unable to distinguish : ",status_6)
-    #if verbose: log.write(" -Reverse_complementary + both allele on genome + unable to distinguish: ",status_7)
-    if verbose: log.write(" -Variants not on given reference sequence : ",status_8)
+    log.write(" -Variants inferred reverse_complement : ",status_4,verbose=verbose)
+    log.write(" -Variants inferred reverse_complement_flipped : ",status_5,verbose=verbose)
+    log.write(" -Both allele on genome + unable to distinguish : ",status_6,verbose=verbose)
+    #log.write(" -Reverse_complementary + both allele on genome + unable to distinguish: ",status_7)
+    log.write(" -Variants not on given reference sequence : ",status_8,verbose=verbose)

     if remove is True:
         sumstats = sumstats.loc[~sumstats["STATUS"].str.match("\w\w\w\w\w[8]\w"),:]
-        if verbose: log.write(" -Variants not on given reference sequence were removed.")
+        log.write(" -Variants not on given reference sequence were removed.",verbose=verbose)

     finished(log, verbose, _end_line)
     return sumstats
@@ -424,7 +424,7 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
         if is_enough_info == False: return sumstats
         ############################################################################################
         chr_dict = auto_check_vcf_chr_dict(path, chr_dict, verbose, log)
-        if verbose: log.write(" -Assigning rsID based on CHR:POS and REF:ALT/ALT:REF...")
+        log.write(" -Assigning rsID based on CHR:POS and REF:ALT/ALT:REF...",verbose=verbose)
         ##############################################
         if rsid not in sumstats.columns:
             sumstats[rsid]=pd.Series(dtype="string")
@@ -458,8 +458,8 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
         ##################################################################################################################

         after_number = sum(~sumstats[rsid].isna())
-        if verbose: log.write(" -rsID Annotation for "+str(total_number - after_number) +" need to be fixed!")
-        if verbose: log.write(" -Annotated "+str(after_number - pre_number) +" rsID successfully!")
+        log.write(" -rsID Annotation for "+str(total_number - after_number) +" need to be fixed!",verbose=verbose)
+        log.write(" -Annotated "+str(after_number - pre_number) +" rsID successfully!",verbose=verbose)

         ##################################################################################################################
     elif ref_mode=="tsv":
@@ -486,7 +486,7 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
         if is_enough_info == False: return sumstats
         ############################################################################################

-        standardized_normalized = sumstats["STATUS"].str.match("\w\w\w[0][01234][0126]\w", case=False, flags=0, na=False)
+        standardized_normalized = sumstats["STATUS"].str.match("\w\w\w[0][01234]\w\w", case=False, flags=0, na=False)

         if rsid not in sumstats.columns:
             sumstats[rsid]=pd.Series(dtype="string")
@@ -500,15 +500,15 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI

         total_number= len(sumstats)
         pre_number = sum(~sumstats[rsid].isna())
-        if verbose: log.write(" -"+str(sum(to_assign)) +" rsID could be possibly fixed...")
+        log.write(" -"+str(sum(to_assign)) +" rsID could be possibly fixed...",verbose=verbose)
         if sum(to_assign)>0:
             sumstats = sumstats.set_index(snpid)
             dic_chuncks = pd.read_csv(path,sep="\t",usecols=[ref_snpid,ref_rsid],
                                       chunksize=chunksize,index_col=ref_snpid,
                                       dtype={ref_snpid:"string",ref_rsid:"string"})

-            if verbose: log.write(" -Setting block size: ",chunksize)
-            if verbose: log.write(" -Loading block: ",end="")
+            log.write(" -Setting block size: ",chunksize,verbose=verbose)
+            log.write(" -Loading block: ",end="",verbose=verbose)
             for i,dic in enumerate(dic_chuncks):
                 gc.collect()
                 log.write(i," ",end=" ",show_time=False)
@@ -517,15 +517,15 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
                 dic = dic.loc[~dic.index.duplicated(keep=False),:]
                 sumstats.update(dic,overwrite=True)

-            if verbose: log.write("\n",end="",show_time=False)
+            log.write("\n",end="",show_time=False,verbose=verbose)
             sumstats = sumstats.reset_index()
             sumstats = sumstats.rename(columns = {'index':snpid})

             after_number = sum(~sumstats[rsid].isna())
-            if verbose: log.write(" -rsID annotation for "+str(total_number - after_number) +" needed to be fixed!")
-            if verbose: log.write(" -Annotated "+str(after_number - pre_number) +" rsID successfully!")
+            log.write(" -rsID annotation for "+str(total_number - after_number) +" needed to be fixed!",verbose=verbose)
+            log.write(" -Annotated "+str(after_number - pre_number) +" rsID successfully!",verbose=verbose)
         else:
-            if verbose: log.write(" -No rsID can be fixed...skipping...")
+            log.write(" -No rsID can be fixed...skipping...",verbose=verbose)
         ################################################################################################################

     finished(log,verbose,_end_line)
@@ -652,7 +652,7 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,

     ##not palindromic : change status
     sumstats.loc[not_palindromic_snp,status] = vchange_status(sumstats.loc[not_palindromic_snp,status], 7 ,"9","0")
-    if verbose: log.write(" -Identified ", sum(palindromic)," palindromic SNPs...")
+    log.write(" -Identified ", sum(palindromic)," palindromic SNPs...",verbose=verbose)

     #palindromic but can not infer
     maf_can_infer = (sumstats[eaf] < maf_threshold) | (sumstats[eaf] > 1 - maf_threshold)
@@ -664,7 +664,7 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,

     unknow_palindromic_to_check = palindromic & maf_can_infer & unknow_palindromic

-    if verbose: log.write(" -After filtering by MAF< {} , {} palindromic SNPs with unknown strand will be inferred...".format(maf_threshold, sum(unknow_palindromic_to_check)))
+    log.write(" -After filtering by MAF< {} , {} palindromic SNPs with unknown strand will be inferred...".format(maf_threshold, sum(unknow_palindromic_to_check)),verbose=verbose)

     #########################################################################################
     if sum(unknow_palindromic_to_check)>0:
@@ -677,8 +677,10 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
         map_func = partial(check_strand,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
         status_inferred = pd.concat(pool.map(map_func,df_split))
         sumstats.loc[unknow_palindromic_to_check,status] = status_inferred.values
-        pool.close()
-        pool.join()
+        pool.close()
+        pool.join()
+    else:
+        log.warning("No palindromic variants available for checking.")
     #########################################################################################
     #0 Not palindromic SNPs
     #1 Palindromic +strand -> no need to flip
@@ -697,33 +699,33 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
     status7 = sumstats[status].str.match(r'\w\w\w\w\w\w[7]', case=False, flags=0, na=False)
     status8 = sumstats[status].str.match(r'\w\w\w\w\w[123][8]', case=False, flags=0, na=False)

-    if verbose: log.write(" -Non-palindromic : ",sum(status0))
-    if verbose: log.write(" -Palindromic SNPs on + strand: ",sum(status1))
-    if verbose: log.write(" -Palindromic SNPs on - strand and needed to be flipped:",sum(status5))
-    if verbose: log.write(" -Palindromic SNPs with MAF not available to infer : ",sum(status7))
-    if verbose: log.write(" -Palindromic SNPs with no macthes or no information : ",sum(status8))
+    log.write(" -Non-palindromic : ",sum(status0),verbose=verbose)
+    log.write(" -Palindromic SNPs on + strand: ",sum(status1),verbose=verbose)
+    log.write(" -Palindromic SNPs on - strand and needed to be flipped:",sum(status5),verbose=verbose)
+    log.write(" -Palindromic SNPs with MAF not available to infer : ",sum(status7),verbose=verbose)
+    log.write(" -Palindromic SNPs with no macthes or no information : ",sum(status8),verbose=verbose)

     if ("7" in remove_snp) and ("8" in remove_snp) :
-        if verbose: log.write(" -Palindromic SNPs with maf not available to infer and with no macthes or no information will will be removed")
+        log.write(" -Palindromic SNPs with MAF not available to infer and with no macthes or no information will will be removed",verbose=verbose)
         sumstats = sumstats.loc[~(status7 | status8),:].copy()
     elif "8" in remove_snp:
-        if verbose: log.write(" -Palindromic SNPs with no macthes or no information will be removed")
+        log.write(" -Palindromic SNPs with no macthes or no information will be removed",verbose=verbose)
         sumstats = sumstats.loc[~status8,:].copy()
     elif "7" in remove_snp:
-        if verbose: log.write(" -Palindromic SNPs with maf not available to infer will be removed")
+        log.write(" -Palindromic SNPs with MAF not available to infer will be removed",verbose=verbose)
         sumstats = sumstats.loc[~status7,:].copy()

     ### unknow_indel
     if "i" in mode:
         unknow_indel = sumstats[status].str.match(r'\w\w\w\w\w[6][89]', case=False, flags=0, na=False)
-        if verbose: log.write(" -Identified ", sum(unknow_indel)," indistinguishable Indels...")
+        log.write(" -Identified ", sum(unknow_indel)," indistinguishable Indels...",verbose=verbose)
         if sum(unknow_indel)>0:
-            if verbose: log.write(" -Indistinguishable indels will be inferred from reference vcf ref and alt...")
+            log.write(" -Indistinguishable indels will be inferred from reference vcf REF and ALT...",verbose=verbose)
            #########################################################################################
            #with maf can not infer
            #maf_can_infer = (sumstats[eaf] < maf_threshold) | (sumstats[eaf] > 1 - maf_threshold)
            #sumstats.loc[unknow_indel&(~maf_can_infer),status] = vchange_status(sumstats.loc[unknow_indel&(~maf_can_infer),status],7,"9","8")
-            if verbose: log.write(" -DAF tolerance: {}".format(daf_tolerance))
+            log.write(" -Difference in allele frequency (DAF) tolerance: {}".format(daf_tolerance),verbose=verbose)

        if sum(unknow_indel)>0:
            if sum(unknow_indel)<10000:
@@ -734,20 +736,23 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
            map_func = partial(check_indel,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict,daf_tolerance=daf_tolerance)
            status_inferred = pd.concat(pool.map(map_func,df_split))
            sumstats.loc[unknow_indel,status] = status_inferred.values
-            pool.close()
-            pool.join()
+            pool.close()
+            pool.join()
+
            #########################################################################################

            status3 = sumstats[status].str.match(r'\w\w\w\w\w\w[3]', case=False, flags=0, na=False)
            status6 = sumstats[status].str.match(r'\w\w\w\w\w\w[6]', case=False, flags=0, na=False)
            status8 = sumstats[status].str.match(r'\w\w\w\w\w[6][8]', case=False, flags=0, na=False)

-            if verbose: log.write(" -Indels ea/nea match reference : ",sum(status3))
-            if verbose: log.write(" -Indels ea/nea need to be flipped : ",sum(status6))
-            if verbose: log.write(" -Indels with no macthes or no information : ",sum(status8))
+            log.write(" -Indels ea/nea match reference : ",sum(status3),verbose=verbose)
+            log.write(" -Indels ea/nea need to be flipped : ",sum(status6),verbose=verbose)
+            log.write(" -Indels with no macthes or no information : ",sum(status8),verbose=verbose)
            if "8" in remove_indel:
-                if verbose: log.write(" -Indels with no macthes or no information will be removed")
-                sumstats = sumstats.loc[~status8,:].copy()
+                log.write(" -Indels with no macthes or no information will be removed",verbose=verbose)
+                sumstats = sumstats.loc[~status8,:].copy()
+        else:
+            log.warning("No indistinguishable indels available for checking.")

     finished(log,verbose,_end_line)
     return sumstats
@@ -804,7 +809,7 @@ def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,colu
     log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
     if not force:
         good_chrpos = sumstats[status].str.match(r'\w\w\w[0]\w\w\w', case=False, flags=0, na=False)
-        if verbose: log.write(" -Checking variants:", sum(good_chrpos))
+        log.write(" -Checking variants:", sum(good_chrpos),verbose=verbose)
     sumstats[column_name]=np.nan

     ########################
@@ -823,13 +828,13 @@ def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,colu

     #sumstats.loc[good_chrpos,"DAF"] = status_inferred.values
     #sumstats["DAF"]=sumstats["DAF"].astype("float")
-    if verbose: log.write(" - {} max:".format(column_name), np.nanmax(sumstats[column_name]))
-    if verbose: log.write(" - {} min:".format(column_name), np.nanmin(sumstats[column_name]))
-    if verbose: log.write(" - {} sd:".format(column_name), np.nanstd(sumstats[column_name]))
-    if verbose: log.write(" - abs({}) min:".format(column_name), np.nanmin(np.abs(sumstats[column_name])))
-    if verbose: log.write(" - abs({}) max:".format(column_name), np.nanmax(np.abs(sumstats[column_name])))
-    if verbose: log.write(" - abs({}) sd:".format(column_name), np.nanstd(np.abs(sumstats[column_name])))
-    if verbose: log.write("Finished allele frequency checking!")
+    log.write(" - {} max:".format(column_name), np.nanmax(sumstats[column_name]),verbose=verbose)
+    log.write(" - {} min:".format(column_name), np.nanmin(sumstats[column_name]),verbose=verbose)
+    log.write(" - {} sd:".format(column_name), np.nanstd(sumstats[column_name]),verbose=verbose)
+    log.write(" - abs({}) min:".format(column_name), np.nanmin(np.abs(sumstats[column_name])),verbose=verbose)
+    log.write(" - abs({}) max:".format(column_name), np.nanmax(np.abs(sumstats[column_name])),verbose=verbose)
+    log.write(" - abs({}) sd:".format(column_name), np.nanstd(np.abs(sumstats[column_name])),verbose=verbose)
+    log.write("Finished allele frequency checking!")
     return sumstats

 def checkaf(sumstats,ref_infer,ref_alt_freq=None,column_name="DAF",chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=None):
@@ -886,7 +891,7 @@ def paralleleinferaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",p
     log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
     if not force:
         good_chrpos = sumstats[status].str.match(r'\w\w\w[0]\w\w\w', case=False, flags=0, na=False)
-        if verbose: log.write(" -Checking variants:", sum(good_chrpos))
+        log.write(" -Checking variants:", sum(good_chrpos),verbose=verbose)

     ########################
     if sum(sumstats[eaf].isna())<10000:
@@ -901,8 +906,8 @@ def paralleleinferaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",p
     ###########################

     afternumber = sum(sumstats[eaf].isna())
-    if verbose: log.write(" -Inferred EAF for {} variants.".format(prenumber - afternumber))
-    if verbose: log.write(" -EAF is still missing for {} variants.".format(afternumber))
+    log.write(" -Inferred EAF for {} variants.".format(prenumber - afternumber),verbose=verbose)
+    log.write(" -EAF is still missing for {} variants.".format(afternumber),verbose=verbose)

     finished(log,verbose,_end_line)
     return sumstats
@@ -936,13 +941,13 @@ def infer_af(chr,start,end,ref,alt,vcf_reader,alt_freq,chr_dict=None):
 def auto_check_vcf_chr_dict(vcf_path, vcf_chr_dict, verbose, log):
     if vcf_path is not None:
         if vcf_chr_dict is None:
-            if verbose: log.write(" -Checking prefix for chromosomes in vcf files..." )
+            log.write(" -Checking prefix for chromosomes in vcf files..." ,verbose=verbose)
             prefix = check_vcf_chr_prefix(vcf_path)
             if prefix is not None:
-                if verbose: log.write(" -Prefix for chromosomes: ",prefix)
+                log.write(" -Prefix for chromosomes: ",prefix)
                 vcf_chr_dict = get_number_to_chr(prefix=prefix)
             else:
-                if verbose: log.write(" -No prefix for chromosomes in the VCF files." )
+                log.write(" -No prefix for chromosomes in the VCF files." ,verbose=verbose)
                 vcf_chr_dict = get_number_to_chr()
     return vcf_chr_dict
 
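Many of the hunks above filter variants by matching a digit at a fixed position of the seven-character STATUS code (for example `\w\w\w\w\w[8]\w`, which the log messages in checkref describe as "not on given reference sequence"). A small self-contained illustration of that filtering pattern, with made-up STATUS values:

# Hypothetical STATUS strings; only the str.match pattern mirrors the hunks above.
import pandas as pd

sumstats = pd.DataFrame({"SNPID": ["1:100_A_G", "1:200_C_T", "2:300_G_A"],
                         "STATUS": ["9960009", "9960080", "9960309"]})

# Sixth character == "8": per the checkref log messages, the variant alleles were
# not found on the given reference sequence.
not_on_ref = sumstats["STATUS"].str.match(r"\w\w\w\w\w[8]\w", na=False)
print(sumstats.loc[~not_on_ref, "SNPID"].tolist())  # ['1:100_A_G', '2:300_G_A']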
gwaslab/io_preformat_input.py CHANGED
@@ -418,17 +418,17 @@ def print_format_info(fmt,meta_data, rename_dictionary, verbose, log,output=Fals
         if type(value) is str:
             if "\n" in value:
                 value_first_line=value.split("\n")[0]
-                if verbose:log.write(" -",key," : "+value_first_line.strip()+"...")
+                log.write(" -",key," : "+value_first_line.strip()+"...",verbose=verbose)
             elif value==" ":
-                if verbose:log.write(' -',key,' : \\s ')
+                log.write(' -',key,' : \\s ',verbose=verbose)
             elif value=="\t":
-                if verbose:log.write(' -',key,' : \\t')
+                log.write(' -',key,' : \\t',verbose=verbose)
             else:
-                if verbose:log.write(" -",key," : "+value.strip())
+                log.write(" -",key," : "+value.strip(),verbose=verbose)
         elif type(value) is list:
-            if verbose:log.write(" -",key," : "+','.join(value))
+            log.write(" -",key," : "+','.join(value),verbose=verbose)
         else:
-            if verbose:log.write(" -",key," : ",value)
+            log.write(" -",key," : ",value,verbose=verbose)
     keys=[]
     values=[]
     for key,value in rename_dictionary.items():
@@ -437,21 +437,21 @@ def print_format_info(fmt,meta_data, rename_dictionary, verbose, log,output=Fals
     if fmt!="gwaslab":
         if output == False:
             if fmt!="auto":
-                if verbose:log.write(" -"+fmt+" to gwaslab format dictionary:",verbose=verbose)
-                if verbose:log.write(" - "+fmt+" keys:",",".join(keys),verbose=verbose)
-                if verbose:log.write(" - gwaslab values:",",".join(values),verbose=verbose)
+                log.write(" -"+fmt+" to gwaslab format dictionary:",verbose=verbose)
+                log.write(" - "+fmt+" keys:",",".join(keys),verbose=verbose)
+                log.write(" - gwaslab values:",",".join(values),verbose=verbose)
             else:
-                if verbose:log.write(" - Auto-detection mode. Note: auto-detection assumes A1=EA; Alt=EA and Frq=EAF...",verbose=verbose)
-                if verbose:log.write(" - Header conversion source: https://github.com/Cloufield/formatbook/blob/main/formats/auto.json",verbose=verbose)
+                log.write(" - Auto-detection mode. Note: auto-detection assumes A1=EA; Alt=EA and Frq=EAF...",verbose=verbose)
+                log.write(" - Header conversion source: https://github.com/Cloufield/formatbook/blob/main/formats/auto.json",verbose=verbose)
         else:
-            if verbose:log.write(" -gwaslab to "+fmt+" format dictionary:",verbose=verbose)
+            log.write(" -gwaslab to "+fmt+" format dictionary:",verbose=verbose)
             keys=[]
             values=[]
             for key,value in rename_dictionary.items():
                 keys.append(key)
                 values.append(value)
-            if verbose:log.write(" - gwaslab keys:", ','.join(keys),verbose=verbose)
-            if verbose:log.write(" - "+fmt+" values:" , ','.join(values),verbose=verbose)
+            log.write(" - gwaslab keys:", ','.join(keys),verbose=verbose)
+            log.write(" - "+fmt+" values:" , ','.join(values),verbose=verbose)

 def process_neaf(sumstats,log,verbose):
     log.write(" -NEAF is specified...",verbose=verbose)
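`print_format_info` only reports the header-conversion dictionary; the mapping itself is what drives renaming when a format is read or written. A hypothetical illustration of such a rename dictionary (the keys and values below are invented for the example; real mappings come from formatbook, as linked above):

# Hypothetical "<fmt> keys -> gwaslab values" mapping, mirroring how
# print_format_info prints the keys and values side by side.
rename_dictionary = {
    "variant_id": "SNPID",
    "chromosome": "CHR",
    "base_pair_location": "POS",
    "effect_allele": "EA",
    "other_allele": "NEA",
    "p_value": "P",
}

keys = list(rename_dictionary.keys())
values = list(rename_dictionary.values())
print(" - example keys:", ",".join(keys))       # what the source format calls each column
print(" - gwaslab values:", ",".join(values))   # what gwaslab renames them to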
gwaslab/io_read_ldsc.py CHANGED
@@ -195,4 +195,52 @@ def read_greml(filelist=[]):
             continue
         row = pd.DataFrame([row], columns = summary.columns)
         summary = pd.concat([summary, row], ignore_index=True)
-    return summary
+    return summary
+
+def parse_ldsc_summary(ldsc_summary):
+    summary = pd.DataFrame(columns = ['h2_obs', 'h2_se','Lambda_gc','Mean_chi2','Intercept','Intercept_se',"Ratio","Ratio_se"])
+    lines = ldsc_summary.split("\n")
+    row={}
+    try:
+        objects = re.compile('[a-zA-Z\s\d]+:|[-0-9.]+[e]?[-0-9.]+|NA').findall(lines[0])
+        row["h2_obs"]=objects[1]
+        row["h2_se"]=objects[2]
+
+        ##next line lambda gc
+
+        objects = re.compile('[a-zA-Z\s\d]+:|[-0-9.]+[e]?[-0-9.]+|NA').findall(lines[1])
+        row["Lambda_gc"] = objects[1]
+        ##next line Mean_chi2
+
+        objects = re.compile('[a-zA-Z\s\d]+:|[-0-9.]+[e]?[-0-9.]+|NA').findall(lines[2])
+        row["Mean_chi2"]=objects[1]
+        ##next line Intercept
+
+        objects = re.compile('[a-zA-Z\s\d]+:|[-0-9.]+[e]?[-0-9.]+|NA').findall(lines[3])
+        row["Intercept"]=objects[1]
+        row["Intercept_se"]=objects[2]
+        ##next line Ratio
+
+        if re.compile('NA').findall(lines[4]):
+            row["Ratio"]="NA"
+            row["Ratio_se"]="NA"
+        elif re.compile('<').findall(lines[4]):
+            row["Ratio"]="Ratio < 0"
+            row["Ratio_se"]="NA"
+        else:
+            objects = re.compile('[a-zA-Z\s\d]+:|[-0-9.]+[e]?[-0-9.]+').findall(lines[4])
+            row["Ratio"]=objects[1]
+            row["Ratio_se"]=objects[2]
+    except:
+        row["h2_obs"]="NA"
+        row["h2_se"]="NA"
+        row["Lambda_gc"] = "NA"
+        row["Mean_chi2"]="NA"
+        row["Intercept"]="NA"
+        row["Intercept_se"]="NA"
+        row["Ratio"]="NA"
+        row["Ratio_se"]="NA"
+
+    #summary = summary.append(row,ignore_index=True)
+    row = pd.DataFrame([row], columns = summary.columns)
+    return row
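A usage sketch for the new `parse_ldsc_summary` helper. The input below follows the standard LDSC heritability log layout that the line-by-line regexes expect (h2 line first, then Lambda GC, Mean chi^2, Intercept, Ratio); the import path is an assumption based on the module name.

# Sketch only: assumes parse_ldsc_summary is importable from gwaslab.io_read_ldsc
# and that the summary block has the usual five LDSC lines in this order.
from gwaslab.io_read_ldsc import parse_ldsc_summary

ldsc_summary = (
    "Total Observed scale h2: 0.0542 (0.0036)\n"
    "Lambda GC: 1.1523\n"
    "Mean chi^2: 1.2521\n"
    "Intercept: 1.0312 (0.0089)\n"
    "Ratio: 0.1238 (0.0353)"
)

row = parse_ldsc_summary(ldsc_summary)   # returns a one-row DataFrame
print(row[["h2_obs", "h2_se", "Intercept", "Ratio"]])
# Expected values (parsed as strings): 0.0542, 0.0036, 1.0312, 0.1238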