gwaslab 3.4.37__py3-none-any.whl → 3.4.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (57)
  1. gwaslab/bd_common_data.py +6 -3
  2. gwaslab/bd_download.py +9 -9
  3. gwaslab/bd_get_hapmap3.py +43 -9
  4. gwaslab/data/formatbook.json +722 -721
  5. gwaslab/g_Log.py +22 -5
  6. gwaslab/g_Sumstats.py +110 -163
  7. gwaslab/g_SumstatsPair.py +76 -25
  8. gwaslab/g_SumstatsT.py +2 -2
  9. gwaslab/g_Sumstats_summary.py +3 -3
  10. gwaslab/g_version.py +10 -10
  11. gwaslab/hm_casting.py +36 -17
  12. gwaslab/hm_harmonize_sumstats.py +354 -221
  13. gwaslab/hm_rsid_to_chrpos.py +1 -1
  14. gwaslab/io_preformat_input.py +49 -43
  15. gwaslab/io_read_ldsc.py +49 -1
  16. gwaslab/io_to_formats.py +428 -295
  17. gwaslab/ldsc_irwls.py +198 -0
  18. gwaslab/ldsc_jackknife.py +514 -0
  19. gwaslab/ldsc_ldscore.py +417 -0
  20. gwaslab/ldsc_parse.py +294 -0
  21. gwaslab/ldsc_regressions.py +747 -0
  22. gwaslab/ldsc_sumstats.py +629 -0
  23. gwaslab/qc_check_datatype.py +3 -3
  24. gwaslab/qc_fix_sumstats.py +891 -778
  25. gwaslab/util_ex_calculate_ldmatrix.py +31 -13
  26. gwaslab/util_ex_gwascatalog.py +25 -25
  27. gwaslab/util_ex_ldproxyfinder.py +10 -10
  28. gwaslab/util_ex_ldsc.py +189 -0
  29. gwaslab/util_ex_process_ref.py +3 -3
  30. gwaslab/util_ex_run_coloc.py +26 -4
  31. gwaslab/util_in_calculate_gc.py +6 -6
  32. gwaslab/util_in_calculate_power.py +42 -43
  33. gwaslab/util_in_convert_h2.py +8 -8
  34. gwaslab/util_in_fill_data.py +30 -30
  35. gwaslab/util_in_filter_value.py +201 -74
  36. gwaslab/util_in_get_density.py +10 -10
  37. gwaslab/util_in_get_sig.py +445 -71
  38. gwaslab/viz_aux_annotate_plot.py +12 -12
  39. gwaslab/viz_aux_quickfix.py +42 -37
  40. gwaslab/viz_aux_reposition_text.py +10 -7
  41. gwaslab/viz_aux_save_figure.py +18 -8
  42. gwaslab/viz_plot_compare_af.py +32 -33
  43. gwaslab/viz_plot_compare_effect.py +63 -71
  44. gwaslab/viz_plot_miamiplot2.py +34 -26
  45. gwaslab/viz_plot_mqqplot.py +126 -75
  46. gwaslab/viz_plot_qqplot.py +11 -8
  47. gwaslab/viz_plot_regionalplot.py +36 -33
  48. gwaslab/viz_plot_rg_heatmap.py +28 -26
  49. gwaslab/viz_plot_stackedregional.py +40 -21
  50. gwaslab/viz_plot_trumpetplot.py +65 -61
  51. gwaslab-3.4.39.dist-info/LICENSE +674 -0
  52. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/METADATA +5 -4
  53. gwaslab-3.4.39.dist-info/RECORD +80 -0
  54. gwaslab-3.4.37.dist-info/RECORD +0 -72
  55. /gwaslab-3.4.37.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
  56. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
  57. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
@@ -135,7 +135,7 @@ def rsID2chrpos(path,
135
135
  print("Writing to "+path+".rsid")
136
136
  sumstats = sumstats.loc[sumstats["#POS"]!=0]
137
137
 
138
- sumstats.loc[:,to_round] = sumstats.loc[:,to_round].round(4)
138
+ sumstats[to_round] = sumstats[to_round].round(4)
139
139
  sumstats.sort_values(["#CHROM","#POS"]).to_csv(path+".rsid","\t",na_rep='\.')
140
140
 
141
141
  return sumstats
@@ -67,7 +67,7 @@ def preformat(sumstats,
67
67
  #######################################################################################################################################################
68
68
  if fmt is not None:
69
69
  # loading format parameters
70
- if verbose: log.write("Start to load format from formatbook....")
70
+ log.write("Start to load format from formatbook....",verbose=verbose)
71
71
 
72
72
  # load format data
73
73
  meta_data,rename_dictionary = get_format_dict(fmt)
@@ -97,7 +97,7 @@ def preformat(sumstats,
97
97
  inpath = sumstats
98
98
  ###load sumstats by each chromosome #################################################
99
99
  if "@" in inpath:
100
- if verbose: log.write(" -Detected @ in path: load sumstats by each chromosome...")
100
+ log.write(" -Detected @ in path: load sumstats by each chromosome...",verbose=verbose)
101
101
  inpath_chr_list=[]
102
102
  inpath_chr_num_list=[]
103
103
  for chromosome in list(range(1,26))+["x","y","X","Y","MT","mt","m","M"]:
@@ -105,9 +105,10 @@ def preformat(sumstats,
105
105
  if isfile_casesensitive(inpath_chr):
106
106
  inpath_chr_num_list.append(str(chromosome))
107
107
  inpath_chr_list.append(inpath_chr)
108
- if verbose: log.write(" -Chromosomes detected:",",".join(inpath_chr_num_list))
108
+ log.write(" -Chromosomes detected:",",".join(inpath_chr_num_list),verbose=verbose)
109
109
  readargs_header = get_readargs_header(inpath = inpath_chr_list[0], readargs = readargs)
110
110
  row_one = pd.read_table(inpath_chr_list[0],**readargs_header)
111
+ # columns in the sumstats
111
112
  raw_cols = row_one.columns
112
113
  else:
113
114
  ##### loading data from tabular file#################################################
@@ -173,6 +174,7 @@ def preformat(sumstats,
173
174
  usecols.append(eaf)
174
175
  rename_dictionary[eaf]= "EAF"
175
176
  elif neaf:
177
+ # neaf will be converted to eaf
176
178
  usecols.append(neaf)
177
179
  rename_dictionary[neaf]= "EAF"
178
180
  if maf:
@@ -281,10 +283,10 @@ def preformat(sumstats,
281
283
  ## loading data from path
282
284
  inpath = sumstats
283
285
  if "@" in inpath:
284
- if verbose: log.write("Start to initiate from files with pattern :" + inpath)
286
+ log.write("Start to initialize gl.Sumstats from files with pattern :" + inpath,verbose=verbose)
285
287
  sumstats_chr_list=[]
286
288
  for i in inpath_chr_list:
287
- if verbose: log.write(" -Loading:" + i)
289
+ log.write(" -Loading:" + i)
288
290
  skip_rows = get_skip_rows(i)
289
291
  readargs["skiprows"] = skip_rows
290
292
  sumstats_chr = pd.read_table(i,
@@ -292,14 +294,14 @@ def preformat(sumstats,
292
294
  dtype=dtype_dictionary,
293
295
  **readargs)
294
296
  sumstats_chr_list.append(sumstats_chr)
295
- if verbose: log.write(" -Merging sumstats for chromosomes:",",".join(inpath_chr_num_list))
297
+ log.write(" -Merging sumstats for chromosomes:",",".join(inpath_chr_num_list),verbose=verbose)
296
298
  sumstats = pd.concat(sumstats_chr_list, axis=0, ignore_index=True)
297
299
  del(sumstats_chr_list)
298
300
  gc.collect()
299
301
  else:
300
302
  skip_rows = get_skip_rows(inpath)
301
303
  readargs["skiprows"] = skip_rows
302
- if verbose: log.write("Start to initiate from file :" + inpath)
304
+ log.write("Start to initialize gl.Sumstats from file :" + inpath,verbose=verbose)
303
305
  sumstats = pd.read_table(inpath,
304
306
  usecols=set(usecols),
305
307
  dtype=dtype_dictionary,
@@ -307,8 +309,8 @@ def preformat(sumstats,
307
309
 
308
310
  elif type(sumstats) is pd.DataFrame:
309
311
  ## loading data from dataframe
310
- if verbose: log.write("Start to initiate from pandas DataFrame ...")
311
- sumstats = sumstats.loc[:, usecols]
312
+ log.write("Start to initialize gl.Sumstats from pandas DataFrame ...",verbose=verbose)
313
+ sumstats = sumstats[usecols].copy()
312
314
  for key,value in dtype_dictionary.items():
313
315
  if key in usecols:
314
316
  sumstats[key] = sumstats[key].astype(value)
@@ -324,9 +326,9 @@ def preformat(sumstats,
324
326
  converted_columns = list(map(lambda x: rename_dictionary[x], set(usecols)))
325
327
 
326
328
  ## renaming log
327
- if verbose: log.write(" -Reading columns :", ",".join(set(usecols)))
328
- if verbose: log.write(" -Renaming columns to :", ",".join(converted_columns))
329
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
329
+ log.write(" -Reading columns :", ",".join(set(usecols)),verbose=verbose)
330
+ log.write(" -Renaming columns to :", ",".join(converted_columns),verbose=verbose)
331
+ log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns),verbose=verbose)
330
332
 
331
333
  ## renaming #####################################################################################
332
334
  sumstats = sumstats.rename(columns=rename_dictionary)
@@ -356,7 +358,7 @@ def preformat(sumstats,
356
358
  gc.collect()
357
359
  check_dataframe_memory_usage(sumstats,log=log,verbose=verbose)
358
360
 
359
- if verbose: log.write("Finished loading data successfully!")
361
+ log.write("Finished loading data successfully!",verbose=verbose)
360
362
  return sumstats
361
363
 
362
364
 
@@ -396,33 +398,37 @@ def get_skip_rows(inpath):
396
398
  return 0
397
399
 
398
400
  def parse_vcf_study(sumstats,format_cols,study,vcf_usecols,log,verbose=True):
399
- if verbose: log.write(" -Parsing based on FORMAT: ", format_cols)
400
- if verbose: log.write(" -Parsing vcf study : ", study)
401
+ log.write(" -Parsing based on FORMAT: ", format_cols,verbose=verbose)
402
+ log.write(" -Parsing vcf study : ", study,verbose=verbose)
401
403
  sumstats[format_cols] = sumstats[study].str.split(":",expand=True).values
402
404
  sumstats = sumstats.drop(["FORMAT",study],axis=1)
403
- sumstats = sumstats.loc[:, vcf_usecols]
405
+ sumstats = sumstats[ vcf_usecols]
404
406
  gc.collect()
405
407
  return sumstats
406
408
 
407
- def print_format_info(fmt,meta_data, rename_dictionary, verbose, log,output=False):
408
- if verbose: log.write(" -"+fmt+" format meta info:")
409
+ def print_format_info(fmt,meta_data, rename_dictionary, verbose, log,output=False, skip_meta_records=None):
410
+ log.write(" -"+fmt+" format meta info:",verbose=verbose)
411
+ if skip_meta_records is None:
412
+ skip_meta_records =[]
409
413
  for key,value in meta_data.items():
414
+ if key in skip_meta_records:
415
+ continue
410
416
  if value is None:
411
417
  continue
412
418
  if type(value) is str:
413
419
  if "\n" in value:
414
420
  value_first_line=value.split("\n")[0]
415
- if verbose:log.write(" -",key," : "+value_first_line.strip()+"...")
421
+ log.write(" -",key," : "+value_first_line.strip()+"...",verbose=verbose)
416
422
  elif value==" ":
417
- if verbose:log.write(' -',key,' : \\s ')
423
+ log.write(' -',key,' : \\s ',verbose=verbose)
418
424
  elif value=="\t":
419
- if verbose:log.write(' -',key,' : \\t')
425
+ log.write(' -',key,' : \\t',verbose=verbose)
420
426
  else:
421
- if verbose:log.write(" -",key," : "+value.strip())
427
+ log.write(" -",key," : "+value.strip(),verbose=verbose)
422
428
  elif type(value) is list:
423
- if verbose:log.write(" -",key," : "+','.join(value))
429
+ log.write(" -",key," : "+','.join(value),verbose=verbose)
424
430
  else:
425
- if verbose:log.write(" -",key," : ",value)
431
+ log.write(" -",key," : ",value,verbose=verbose)
426
432
  keys=[]
427
433
  values=[]
428
434
  for key,value in rename_dictionary.items():
@@ -431,32 +437,32 @@ def print_format_info(fmt,meta_data, rename_dictionary, verbose, log,output=Fals
431
437
  if fmt!="gwaslab":
432
438
  if output == False:
433
439
  if fmt!="auto":
434
- if verbose:log.write(" -"+fmt+" to gwaslab format dictionary:")
435
- if verbose:log.write(" - "+fmt+" keys:",",".join(keys))
436
- if verbose:log.write(" - gwaslab values:",",".join(values))
440
+ log.write(" -"+fmt+" to gwaslab format dictionary:",verbose=verbose)
441
+ log.write(" - "+fmt+" keys:",",".join(keys),verbose=verbose)
442
+ log.write(" - gwaslab values:",",".join(values),verbose=verbose)
437
443
  else:
438
- if verbose:log.write(" - Auto-detection mode. Note: auto-detection assumes A1=EA; Alt=EA and Frq=EAF...")
439
- if verbose:log.write(" - Header conversion source: https://github.com/Cloufield/formatbook/blob/main/formats/auto.json")
444
+ log.write(" - Auto-detection mode. Note: auto-detection assumes A1=EA; Alt=EA and Frq=EAF...",verbose=verbose)
445
+ log.write(" - Header conversion source: https://github.com/Cloufield/formatbook/blob/main/formats/auto.json",verbose=verbose)
440
446
  else:
441
- if verbose:log.write(" -gwaslab to "+fmt+" format dictionary:",)
447
+ log.write(" -gwaslab to "+fmt+" format dictionary:",verbose=verbose)
442
448
  keys=[]
443
449
  values=[]
444
450
  for key,value in rename_dictionary.items():
445
451
  keys.append(key)
446
452
  values.append(value)
447
- if verbose:log.write(" - gwaslab keys:", ','.join(keys))
448
- if verbose:log.write(" - "+fmt+" values:" , ','.join(values))
453
+ log.write(" - gwaslab keys:", ','.join(keys),verbose=verbose)
454
+ log.write(" - "+fmt+" values:" , ','.join(values),verbose=verbose)
449
455
 
450
456
  def process_neaf(sumstats,log,verbose):
451
- if verbose: log.write(" -NEAF is specified...")
457
+ log.write(" -NEAF is specified...",verbose=verbose)
452
458
  pre_number=len(sumstats)
453
- if verbose: log.write(" -Checking if 0<= NEAF <=1 ...")
454
- sumstats.loc[:,"EAF"] = pd.to_numeric(sumstats.loc[:,"EAF"], errors='coerce')
459
+ log.write(" -Checking if 0<= NEAF <=1 ...",verbose=verbose)
460
+ sumstats["EAF"] = pd.to_numeric(sumstats["EAF"], errors='coerce')
455
461
  sumstats = sumstats.loc[(sumstats["EAF"]>=0) & (sumstats["EAF"]<=1),:]
456
- sumstats.loc[:,"EAF"] = 1- sumstats.loc[:,"EAF"]
457
- if verbose: log.write(" -Converted NEAF to EAF.")
462
+ sumstats["EAF"] = 1- sumstats["EAF"]
463
+ log.write(" -Converted NEAF to EAF.",verbose=verbose)
458
464
  after_number=len(sumstats)
459
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad NEAF.")
465
+ log.write(" -Removed "+str(pre_number - after_number)+" variants with bad NEAF.",verbose=verbose)
460
466
  return sumstats
461
467
 
462
468
  def process_allele(sumstats,log,verbose):
@@ -466,17 +472,17 @@ def process_allele(sumstats,log,verbose):
466
472
  if "REF" in sumstats.columns and "ALT" in sumstats.columns:
467
473
 
468
474
  if "NEA" not in sumstats.columns:
469
- if verbose: log.write(" NEA not available: assigning REF to NEA...")
475
+ log.write(" NEA not available: assigning REF to NEA...",verbose=verbose)
470
476
  sumstats["NEA"]=sumstats["REF"]
471
477
 
472
- if verbose: log.write(" -EA,REF and ALT columns are available: assigning NEA...")
478
+ log.write(" -EA,REF and ALT columns are available: assigning NEA...",verbose=verbose)
473
479
  ea_alt = sumstats["EA"]==sumstats["ALT"]
474
480
 
475
- if verbose: log.write(" -For variants with EA == ALT : assigning REF to NEA ...")
481
+ log.write(" -For variants with EA == ALT : assigning REF to NEA ...",verbose=verbose)
476
482
  sumstats.loc[ea_alt,"NEA"] = sumstats.loc[ea_alt,"REF"]
477
483
 
478
484
  ea_not_alt = sumstats["EA"]!=sumstats["ALT"]
479
- if verbose: log.write(" -For variants with EA != ALT : assigning ALT to NEA ...")
485
+ log.write(" -For variants with EA != ALT : assigning ALT to NEA ...",verbose=verbose)
480
486
  sumstats.loc[ea_not_alt,"NEA"] = sumstats.loc[ea_not_alt,"ALT"]
481
487
 
482
488
  #sumstats = sumstats.drop(labels=["REF","ALT"],axis=1)
@@ -489,7 +495,7 @@ def process_allele(sumstats,log,verbose):
489
495
  return sumstats
490
496
 
491
497
  def process_status(sumstats,build,log,verbose):
492
- if verbose: log.write(" -Initiating a status column: STATUS ...")
498
+ log.write(" -Initiating a status column: STATUS ...",verbose=verbose)
493
499
  #sumstats["STATUS"] = int(build)*(10**5) +99999
494
500
  build = _process_build(build,log,verbose)
495
501
  sumstats["STATUS"] = build +"99999"
gwaslab/io_read_ldsc.py CHANGED
@@ -195,4 +195,52 @@ def read_greml(filelist=[]):
195
195
  continue
196
196
  row = pd.DataFrame([row], columns = summary.columns)
197
197
  summary = pd.concat([summary, row], ignore_index=True)
198
- return summary
198
+ return summary
199
+
200
+ def parse_ldsc_summary(ldsc_summary):
201
+ summary = pd.DataFrame(columns = ['h2_obs', 'h2_se','Lambda_gc','Mean_chi2','Intercept','Intercept_se',"Ratio","Ratio_se"])
202
+ lines = ldsc_summary.split("\n")
203
+ row={}
204
+ try:
205
+ objects = re.compile('[a-zA-Z\s\d]+:|[-0-9.]+[e]?[-0-9.]+|NA').findall(lines[0])
206
+ row["h2_obs"]=objects[1]
207
+ row["h2_se"]=objects[2]
208
+
209
+ ##next line lambda gc
210
+
211
+ objects = re.compile('[a-zA-Z\s\d]+:|[-0-9.]+[e]?[-0-9.]+|NA').findall(lines[1])
212
+ row["Lambda_gc"] = objects[1]
213
+ ##next line Mean_chi2
214
+
215
+ objects = re.compile('[a-zA-Z\s\d]+:|[-0-9.]+[e]?[-0-9.]+|NA').findall(lines[2])
216
+ row["Mean_chi2"]=objects[1]
217
+ ##next line Intercept
218
+
219
+ objects = re.compile('[a-zA-Z\s\d]+:|[-0-9.]+[e]?[-0-9.]+|NA').findall(lines[3])
220
+ row["Intercept"]=objects[1]
221
+ row["Intercept_se"]=objects[2]
222
+ ##next line Ratio
223
+
224
+ if re.compile('NA').findall(lines[4]):
225
+ row["Ratio"]="NA"
226
+ row["Ratio_se"]="NA"
227
+ elif re.compile('<').findall(lines[4]):
228
+ row["Ratio"]="Ratio < 0"
229
+ row["Ratio_se"]="NA"
230
+ else:
231
+ objects = re.compile('[a-zA-Z\s\d]+:|[-0-9.]+[e]?[-0-9.]+').findall(lines[4])
232
+ row["Ratio"]=objects[1]
233
+ row["Ratio_se"]=objects[2]
234
+ except:
235
+ row["h2_obs"]="NA"
236
+ row["h2_se"]="NA"
237
+ row["Lambda_gc"] = "NA"
238
+ row["Mean_chi2"]="NA"
239
+ row["Intercept"]="NA"
240
+ row["Intercept_se"]="NA"
241
+ row["Ratio"]="NA"
242
+ row["Ratio_se"]="NA"
243
+
244
+ #summary = summary.append(row,ignore_index=True)
245
+ row = pd.DataFrame([row], columns = summary.columns)
246
+ return row