gwaslab 3.4.36__py3-none-any.whl → 3.4.38__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (42):
  1. gwaslab/__init__.py +1 -1
  2. gwaslab/data/formatbook.json +722 -721
  3. gwaslab/g_Log.py +8 -0
  4. gwaslab/g_Sumstats.py +80 -178
  5. gwaslab/g_SumstatsPair.py +6 -2
  6. gwaslab/g_Sumstats_summary.py +3 -3
  7. gwaslab/g_meta.py +13 -3
  8. gwaslab/g_version.py +2 -2
  9. gwaslab/hm_casting.py +29 -15
  10. gwaslab/hm_harmonize_sumstats.py +312 -159
  11. gwaslab/hm_rsid_to_chrpos.py +1 -1
  12. gwaslab/io_preformat_input.py +46 -37
  13. gwaslab/io_to_formats.py +428 -295
  14. gwaslab/qc_check_datatype.py +15 -1
  15. gwaslab/qc_fix_sumstats.py +956 -719
  16. gwaslab/util_ex_calculate_ldmatrix.py +29 -11
  17. gwaslab/util_ex_gwascatalog.py +1 -1
  18. gwaslab/util_ex_ldproxyfinder.py +1 -1
  19. gwaslab/util_ex_process_h5.py +26 -17
  20. gwaslab/util_ex_process_ref.py +3 -3
  21. gwaslab/util_ex_run_coloc.py +26 -4
  22. gwaslab/util_in_convert_h2.py +1 -1
  23. gwaslab/util_in_fill_data.py +44 -5
  24. gwaslab/util_in_filter_value.py +122 -34
  25. gwaslab/util_in_get_density.py +2 -2
  26. gwaslab/util_in_get_sig.py +41 -9
  27. gwaslab/viz_aux_quickfix.py +26 -21
  28. gwaslab/viz_aux_reposition_text.py +7 -4
  29. gwaslab/viz_aux_save_figure.py +6 -5
  30. gwaslab/viz_plot_compare_af.py +5 -5
  31. gwaslab/viz_plot_compare_effect.py +22 -5
  32. gwaslab/viz_plot_miamiplot2.py +28 -20
  33. gwaslab/viz_plot_mqqplot.py +214 -98
  34. gwaslab/viz_plot_qqplot.py +11 -8
  35. gwaslab/viz_plot_regionalplot.py +16 -9
  36. gwaslab/viz_plot_trumpetplot.py +15 -6
  37. {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/METADATA +3 -3
  38. gwaslab-3.4.38.dist-info/RECORD +72 -0
  39. gwaslab-3.4.36.dist-info/RECORD +0 -72
  40. {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/LICENSE +0 -0
  41. {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/WHEEL +0 -0
  42. {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/top_level.txt +0 -0
@@ -135,7 +135,7 @@ def rsID2chrpos(path,
135
135
  print("Writing to "+path+".rsid")
136
136
  sumstats = sumstats.loc[sumstats["#POS"]!=0]
137
137
 
138
- sumstats.loc[:,to_round] = sumstats.loc[:,to_round].round(4)
138
+ sumstats[to_round] = sumstats[to_round].round(4)
139
139
  sumstats.sort_values(["#CHROM","#POS"]).to_csv(path+".rsid","\t",na_rep='\.')
140
140
 
141
141
  return sumstats
@@ -8,6 +8,7 @@ from gwaslab.bd_common_data import get_format_dict
8
8
  from gwaslab.qc_fix_sumstats import sortcolumn
9
9
  from gwaslab.qc_fix_sumstats import _process_build
10
10
  from gwaslab.qc_check_datatype import check_datatype
11
+ from gwaslab.qc_check_datatype import check_dataframe_memory_usage
11
12
 
12
13
  #20221030
13
14
  def preformat(sumstats,
@@ -66,7 +67,7 @@ def preformat(sumstats,
66
67
  #######################################################################################################################################################
67
68
  if fmt is not None:
68
69
  # loading format parameters
69
- if verbose: log.write("Start to load format from formatbook....")
70
+ log.write("Start to load format from formatbook....",verbose=verbose)
70
71
 
71
72
  # load format data
72
73
  meta_data,rename_dictionary = get_format_dict(fmt)
@@ -96,7 +97,7 @@ def preformat(sumstats,
96
97
  inpath = sumstats
97
98
  ###load sumstats by each chromosome #################################################
98
99
  if "@" in inpath:
99
- if verbose: log.write(" -Detected @ in path: load sumstats by each chromosome...")
100
+ log.write(" -Detected @ in path: load sumstats by each chromosome...",verbose=verbose)
100
101
  inpath_chr_list=[]
101
102
  inpath_chr_num_list=[]
102
103
  for chromosome in list(range(1,26))+["x","y","X","Y","MT","mt","m","M"]:
@@ -104,9 +105,10 @@ def preformat(sumstats,
104
105
  if isfile_casesensitive(inpath_chr):
105
106
  inpath_chr_num_list.append(str(chromosome))
106
107
  inpath_chr_list.append(inpath_chr)
107
- if verbose: log.write(" -Chromosomes detected:",",".join(inpath_chr_num_list))
108
+ log.write(" -Chromosomes detected:",",".join(inpath_chr_num_list),verbose=verbose)
108
109
  readargs_header = get_readargs_header(inpath = inpath_chr_list[0], readargs = readargs)
109
110
  row_one = pd.read_table(inpath_chr_list[0],**readargs_header)
111
+ # columns in the sumstats
110
112
  raw_cols = row_one.columns
111
113
  else:
112
114
  ##### loading data from tabular file#################################################
@@ -172,6 +174,7 @@ def preformat(sumstats,
172
174
  usecols.append(eaf)
173
175
  rename_dictionary[eaf]= "EAF"
174
176
  elif neaf:
177
+ # neaf will be converted to eaf
175
178
  usecols.append(neaf)
176
179
  rename_dictionary[neaf]= "EAF"
177
180
  if maf:
@@ -280,10 +283,10 @@ def preformat(sumstats,
280
283
  ## loading data from path
281
284
  inpath = sumstats
282
285
  if "@" in inpath:
283
- if verbose: log.write("Start to initiate from files with pattern :" + inpath)
286
+ log.write("Start to initialize gl.Sumstats from files with pattern :" + inpath,verbose=verbose)
284
287
  sumstats_chr_list=[]
285
288
  for i in inpath_chr_list:
286
- if verbose: log.write(" -Loading:" + i)
289
+ log.write(" -Loading:" + i)
287
290
  skip_rows = get_skip_rows(i)
288
291
  readargs["skiprows"] = skip_rows
289
292
  sumstats_chr = pd.read_table(i,
@@ -291,14 +294,14 @@ def preformat(sumstats,
291
294
  dtype=dtype_dictionary,
292
295
  **readargs)
293
296
  sumstats_chr_list.append(sumstats_chr)
294
- if verbose: log.write(" -Merging sumstats for chromosomes:",",".join(inpath_chr_num_list))
297
+ log.write(" -Merging sumstats for chromosomes:",",".join(inpath_chr_num_list),verbose=verbose)
295
298
  sumstats = pd.concat(sumstats_chr_list, axis=0, ignore_index=True)
296
299
  del(sumstats_chr_list)
297
300
  gc.collect()
298
301
  else:
299
302
  skip_rows = get_skip_rows(inpath)
300
303
  readargs["skiprows"] = skip_rows
301
- if verbose: log.write("Start to initiate from file :" + inpath)
304
+ log.write("Start to initialize gl.Sumstats from file :" + inpath,verbose=verbose)
302
305
  sumstats = pd.read_table(inpath,
303
306
  usecols=set(usecols),
304
307
  dtype=dtype_dictionary,
@@ -306,8 +309,8 @@ def preformat(sumstats,
306
309
 
307
310
  elif type(sumstats) is pd.DataFrame:
308
311
  ## loading data from dataframe
309
- if verbose: log.write("Start to initiate from pandas DataFrame ...")
310
- sumstats = sumstats.loc[:, usecols]
312
+ log.write("Start to initialize gl.Sumstats from pandas DataFrame ...",verbose=verbose)
313
+ sumstats = sumstats[usecols].copy()
311
314
  for key,value in dtype_dictionary.items():
312
315
  if key in usecols:
313
316
  sumstats[key] = sumstats[key].astype(value)
@@ -323,9 +326,9 @@ def preformat(sumstats,
323
326
  converted_columns = list(map(lambda x: rename_dictionary[x], set(usecols)))
324
327
 
325
328
  ## renaming log
326
- if verbose: log.write(" -Reading columns :", ",".join(set(usecols)))
327
- if verbose: log.write(" -Renaming columns to :", ",".join(converted_columns))
328
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
329
+ log.write(" -Reading columns :", ",".join(set(usecols)),verbose=verbose)
330
+ log.write(" -Renaming columns to :", ",".join(converted_columns),verbose=verbose)
331
+ log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns),verbose=verbose)
329
332
 
330
333
  ## renaming #####################################################################################
331
334
  sumstats = sumstats.rename(columns=rename_dictionary)
@@ -353,7 +356,9 @@ def preformat(sumstats,
353
356
  sumstats = sortcolumn(sumstats=sumstats,log=log,verbose=verbose)
354
357
  check_datatype(sumstats,log=log,verbose=verbose)
355
358
  gc.collect()
356
- if verbose: log.write("Finished loading data successfully!")
359
+ check_dataframe_memory_usage(sumstats,log=log,verbose=verbose)
360
+
361
+ log.write("Finished loading data successfully!",verbose=verbose)
357
362
  return sumstats
358
363
 
359
364
 
@@ -393,17 +398,21 @@ def get_skip_rows(inpath):
393
398
  return 0
394
399
 
395
400
  def parse_vcf_study(sumstats,format_cols,study,vcf_usecols,log,verbose=True):
396
- if verbose: log.write(" -Parsing based on FORMAT: ", format_cols)
397
- if verbose: log.write(" -Parsing vcf study : ", study)
401
+ log.write(" -Parsing based on FORMAT: ", format_cols,verbose=verbose)
402
+ log.write(" -Parsing vcf study : ", study,verbose=verbose)
398
403
  sumstats[format_cols] = sumstats[study].str.split(":",expand=True).values
399
404
  sumstats = sumstats.drop(["FORMAT",study],axis=1)
400
- sumstats = sumstats.loc[:, vcf_usecols]
405
+ sumstats = sumstats[ vcf_usecols]
401
406
  gc.collect()
402
407
  return sumstats
403
408
 
404
- def print_format_info(fmt,meta_data, rename_dictionary, verbose, log,output=False):
405
- if verbose: log.write(" -"+fmt+" format meta info:")
409
+ def print_format_info(fmt,meta_data, rename_dictionary, verbose, log,output=False, skip_meta_records=None):
410
+ log.write(" -"+fmt+" format meta info:",verbose=verbose)
411
+ if skip_meta_records is None:
412
+ skip_meta_records =[]
406
413
  for key,value in meta_data.items():
414
+ if key in skip_meta_records:
415
+ continue
407
416
  if value is None:
408
417
  continue
409
418
  if type(value) is str:
@@ -428,32 +437,32 @@ def print_format_info(fmt,meta_data, rename_dictionary, verbose, log,output=Fals
428
437
  if fmt!="gwaslab":
429
438
  if output == False:
430
439
  if fmt!="auto":
431
- if verbose:log.write(" -"+fmt+" to gwaslab format dictionary:")
432
- if verbose:log.write(" - "+fmt+" keys:",",".join(keys))
433
- if verbose:log.write(" - gwaslab values:",",".join(values))
440
+ if verbose:log.write(" -"+fmt+" to gwaslab format dictionary:",verbose=verbose)
441
+ if verbose:log.write(" - "+fmt+" keys:",",".join(keys),verbose=verbose)
442
+ if verbose:log.write(" - gwaslab values:",",".join(values),verbose=verbose)
434
443
  else:
435
- if verbose:log.write(" - Auto-detection mode. Note: auto-detection assumes A1=EA; Alt=EA and Frq=EAF...")
436
- if verbose:log.write(" - Header conversion source: https://github.com/Cloufield/formatbook/blob/main/formats/auto.json")
444
+ if verbose:log.write(" - Auto-detection mode. Note: auto-detection assumes A1=EA; Alt=EA and Frq=EAF...",verbose=verbose)
445
+ if verbose:log.write(" - Header conversion source: https://github.com/Cloufield/formatbook/blob/main/formats/auto.json",verbose=verbose)
437
446
  else:
438
- if verbose:log.write(" -gwaslab to "+fmt+" format dictionary:",)
447
+ if verbose:log.write(" -gwaslab to "+fmt+" format dictionary:",verbose=verbose)
439
448
  keys=[]
440
449
  values=[]
441
450
  for key,value in rename_dictionary.items():
442
451
  keys.append(key)
443
452
  values.append(value)
444
- if verbose:log.write(" - gwaslab keys:", ','.join(keys))
445
- if verbose:log.write(" - "+fmt+" values:" , ','.join(values))
453
+ if verbose:log.write(" - gwaslab keys:", ','.join(keys),verbose=verbose)
454
+ if verbose:log.write(" - "+fmt+" values:" , ','.join(values),verbose=verbose)
446
455
 
447
456
  def process_neaf(sumstats,log,verbose):
448
- if verbose: log.write(" -NEAF is specified...")
457
+ log.write(" -NEAF is specified...",verbose=verbose)
449
458
  pre_number=len(sumstats)
450
- if verbose: log.write(" -Checking if 0<= NEAF <=1 ...")
451
- sumstats.loc[:,"EAF"] = pd.to_numeric(sumstats.loc[:,"EAF"], errors='coerce')
459
+ log.write(" -Checking if 0<= NEAF <=1 ...",verbose=verbose)
460
+ sumstats["EAF"] = pd.to_numeric(sumstats["EAF"], errors='coerce')
452
461
  sumstats = sumstats.loc[(sumstats["EAF"]>=0) & (sumstats["EAF"]<=1),:]
453
- sumstats.loc[:,"EAF"] = 1- sumstats.loc[:,"EAF"]
454
- if verbose: log.write(" -Converted NEAF to EAF.")
462
+ sumstats["EAF"] = 1- sumstats["EAF"]
463
+ log.write(" -Converted NEAF to EAF.",verbose=verbose)
455
464
  after_number=len(sumstats)
456
- if verbose: log.write(" -Removed "+str(pre_number - after_number)+" variants with bad NEAF.")
465
+ log.write(" -Removed "+str(pre_number - after_number)+" variants with bad NEAF.",verbose=verbose)
457
466
  return sumstats
458
467
 
459
468
  def process_allele(sumstats,log,verbose):
@@ -463,17 +472,17 @@ def process_allele(sumstats,log,verbose):
463
472
  if "REF" in sumstats.columns and "ALT" in sumstats.columns:
464
473
 
465
474
  if "NEA" not in sumstats.columns:
466
- if verbose: log.write(" NEA not available: assigning REF to NEA...")
475
+ log.write(" NEA not available: assigning REF to NEA...",verbose=verbose)
467
476
  sumstats["NEA"]=sumstats["REF"]
468
477
 
469
- if verbose: log.write(" -EA,REF and ALT columns are available: assigning NEA...")
478
+ log.write(" -EA,REF and ALT columns are available: assigning NEA...",verbose=verbose)
470
479
  ea_alt = sumstats["EA"]==sumstats["ALT"]
471
480
 
472
- if verbose: log.write(" -For variants with EA == ALT : assigning REF to NEA ...")
481
+ log.write(" -For variants with EA == ALT : assigning REF to NEA ...",verbose=verbose)
473
482
  sumstats.loc[ea_alt,"NEA"] = sumstats.loc[ea_alt,"REF"]
474
483
 
475
484
  ea_not_alt = sumstats["EA"]!=sumstats["ALT"]
476
- if verbose: log.write(" -For variants with EA != ALT : assigning ALT to NEA ...")
485
+ log.write(" -For variants with EA != ALT : assigning ALT to NEA ...",verbose=verbose)
477
486
  sumstats.loc[ea_not_alt,"NEA"] = sumstats.loc[ea_not_alt,"ALT"]
478
487
 
479
488
  #sumstats = sumstats.drop(labels=["REF","ALT"],axis=1)
@@ -486,7 +495,7 @@ def process_allele(sumstats,log,verbose):
486
495
  return sumstats
487
496
 
488
497
  def process_status(sumstats,build,log,verbose):
489
- if verbose: log.write(" -Initiating a status column: STATUS ...")
498
+ log.write(" -Initiating a status column: STATUS ...",verbose=verbose)
490
499
  #sumstats["STATUS"] = int(build)*(10**5) +99999
491
500
  build = _process_build(build,log,verbose)
492
501
  sumstats["STATUS"] = build +"99999"