gwaslab 3.4.37__py3-none-any.whl → 3.4.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (57)
  1. gwaslab/bd_common_data.py +6 -3
  2. gwaslab/bd_download.py +9 -9
  3. gwaslab/bd_get_hapmap3.py +43 -9
  4. gwaslab/data/formatbook.json +722 -721
  5. gwaslab/g_Log.py +22 -5
  6. gwaslab/g_Sumstats.py +110 -163
  7. gwaslab/g_SumstatsPair.py +76 -25
  8. gwaslab/g_SumstatsT.py +2 -2
  9. gwaslab/g_Sumstats_summary.py +3 -3
  10. gwaslab/g_version.py +10 -10
  11. gwaslab/hm_casting.py +36 -17
  12. gwaslab/hm_harmonize_sumstats.py +354 -221
  13. gwaslab/hm_rsid_to_chrpos.py +1 -1
  14. gwaslab/io_preformat_input.py +49 -43
  15. gwaslab/io_read_ldsc.py +49 -1
  16. gwaslab/io_to_formats.py +428 -295
  17. gwaslab/ldsc_irwls.py +198 -0
  18. gwaslab/ldsc_jackknife.py +514 -0
  19. gwaslab/ldsc_ldscore.py +417 -0
  20. gwaslab/ldsc_parse.py +294 -0
  21. gwaslab/ldsc_regressions.py +747 -0
  22. gwaslab/ldsc_sumstats.py +629 -0
  23. gwaslab/qc_check_datatype.py +3 -3
  24. gwaslab/qc_fix_sumstats.py +891 -778
  25. gwaslab/util_ex_calculate_ldmatrix.py +31 -13
  26. gwaslab/util_ex_gwascatalog.py +25 -25
  27. gwaslab/util_ex_ldproxyfinder.py +10 -10
  28. gwaslab/util_ex_ldsc.py +189 -0
  29. gwaslab/util_ex_process_ref.py +3 -3
  30. gwaslab/util_ex_run_coloc.py +26 -4
  31. gwaslab/util_in_calculate_gc.py +6 -6
  32. gwaslab/util_in_calculate_power.py +42 -43
  33. gwaslab/util_in_convert_h2.py +8 -8
  34. gwaslab/util_in_fill_data.py +30 -30
  35. gwaslab/util_in_filter_value.py +201 -74
  36. gwaslab/util_in_get_density.py +10 -10
  37. gwaslab/util_in_get_sig.py +445 -71
  38. gwaslab/viz_aux_annotate_plot.py +12 -12
  39. gwaslab/viz_aux_quickfix.py +42 -37
  40. gwaslab/viz_aux_reposition_text.py +10 -7
  41. gwaslab/viz_aux_save_figure.py +18 -8
  42. gwaslab/viz_plot_compare_af.py +32 -33
  43. gwaslab/viz_plot_compare_effect.py +63 -71
  44. gwaslab/viz_plot_miamiplot2.py +34 -26
  45. gwaslab/viz_plot_mqqplot.py +126 -75
  46. gwaslab/viz_plot_qqplot.py +11 -8
  47. gwaslab/viz_plot_regionalplot.py +36 -33
  48. gwaslab/viz_plot_rg_heatmap.py +28 -26
  49. gwaslab/viz_plot_stackedregional.py +40 -21
  50. gwaslab/viz_plot_trumpetplot.py +65 -61
  51. gwaslab-3.4.39.dist-info/LICENSE +674 -0
  52. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/METADATA +5 -4
  53. gwaslab-3.4.39.dist-info/RECORD +80 -0
  54. gwaslab-3.4.37.dist-info/RECORD +0 -72
  55. /gwaslab-3.4.37.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
  56. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
  57. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
@@ -13,8 +13,9 @@ from gwaslab.bd_common_data import get_chr_to_NC
13
13
  from gwaslab.bd_common_data import gtf_to_protein_coding
14
14
  from gwaslab.bd_download import check_and_download
15
15
  from gwaslab.util_ex_gwascatalog import gwascatalog_trait
16
-
17
-
16
+ from gwaslab.qc_fix_sumstats import check_dataframe_shape
17
+ from gwaslab.qc_fix_sumstats import start_to
18
+ from gwaslab.qc_fix_sumstats import finished
18
19
  # getsig
19
20
  # closest_gene
20
21
  # annogene
@@ -39,11 +40,27 @@ def getsig(insumstats,
39
40
  """
40
41
  Extract the lead variants using a sliding window. P or MLOG10P will be used and converted to SCALEDP for sorting.
41
42
  """
43
+ ##start function with col checking##########################################################
44
+ _start_line = "extract lead variants"
45
+ _end_line = "extracting lead variants"
46
+ _start_cols = [chrom,pos]
47
+ _start_function = ".get_lead()"
48
+ _must_args ={}
49
+
50
+ is_enough_info = start_to(sumstats=insumstats,
51
+ log=log,
52
+ verbose=verbose,
53
+ start_line=_start_line,
54
+ end_line=_end_line,
55
+ start_cols=_start_cols,
56
+ start_function=_start_function,
57
+ **_must_args)
58
+ if is_enough_info == False: return None
59
+ ############################################################################################
42
60
 
43
- if verbose: log.write("Start to extract lead variants...")
44
- if verbose: log.write(" -Processing "+str(len(insumstats))+" variants...")
45
- if verbose: log.write(" -Significance threshold :", sig_level)
46
- if verbose: log.write(" -Sliding window size:", str(windowsizekb) ," kb")
61
+ log.write(" -Processing "+str(len(insumstats))+" variants...", verbose=verbose)
62
+ log.write(" -Significance threshold :", sig_level, verbose=verbose)
63
+ log.write(" -Sliding window size:", str(windowsizekb) ," kb", verbose=verbose)
47
64
 
48
65
  #load data
49
66
  sumstats=insumstats.loc[~insumstats[id].isna(),:].copy()
@@ -73,12 +90,12 @@ def getsig(insumstats,
73
90
  sumstats[p] = pd.to_numeric(sumstats[p], errors='coerce')
74
91
  sumstats_sig = sumstats.loc[sumstats[p]<sig_level,:].copy()
75
92
  sumstats_sig.loc[:,"__SCALEDP"] = pd.to_numeric(sumstats_sig[p], errors='coerce')
76
- if verbose:log.write(" -Found "+str(len(sumstats_sig))+" significant variants in total...")
93
+ log.write(" -Found "+str(len(sumstats_sig))+" significant variants in total...", verbose=verbose)
77
94
 
78
95
  #sort the coordinates
79
96
  sumstats_sig = sumstats_sig.sort_values([chrom,pos])
80
97
  if sumstats_sig is None:
81
- if verbose:log.write(" -No lead snps at given significance threshold!")
98
+ log.write(" -No lead snps at given significance threshold!", verbose=verbose)
82
99
  return None
83
100
 
84
101
  #init
@@ -131,7 +148,7 @@ def getsig(insumstats,
131
148
  sig_index_list.append(current_sig_index)
132
149
  continue
133
150
 
134
- if verbose:log.write(" -Identified "+str(len(sig_index_list))+" lead variants!")
151
+ log.write(" -Identified "+str(len(sig_index_list))+" lead variants!", verbose=verbose)
135
152
 
136
153
  # drop internal __SCALEDP
137
154
  sumstats_sig = sumstats_sig.drop("__SCALEDP",axis=1)
@@ -141,8 +158,8 @@ def getsig(insumstats,
141
158
 
142
159
  # annotate GENENAME
143
160
  if anno is True and len(output)>0:
144
- if verbose:log.write(" -Annotating variants using references:{}".format(source))
145
- if verbose:log.write(" -Annotating variants using references based on genome build:{}".format(build))
161
+ log.write(" -Annotating variants using references:{}".format(source), verbose=verbose)
162
+ log.write(" -Annotating variants using references based on genome build:{}".format(build), verbose=verbose)
146
163
 
147
164
  output = annogene(
148
165
  output,
@@ -155,11 +172,9 @@ def getsig(insumstats,
155
172
  source=source,
156
173
  verbose=verbose)
157
174
 
158
- # Finishing
159
- if verbose: log.write("Finished extracting lead variants successfully!")
160
175
  # drop internal id
161
176
  output = output.drop("__ID",axis=1)
162
- gc.collect()
177
+ finished(log,verbose,_end_line)
163
178
  return output.copy()
164
179
 
165
180
 
@@ -234,13 +249,13 @@ def annogene(
234
249
  source="ensembl",
235
250
  verbose=True):
236
251
 
237
- if verbose: log.write("Start to annotate variants with nearest gene name(s)...")
252
+ log.write("Start to annotate variants with nearest gene name(s)...", verbose=verbose)
238
253
  output = insumstats.copy()
239
254
 
240
255
  if source == "ensembl":
241
256
  if build=="19":
242
257
  #data = EnsemblRelease(75)
243
- if verbose:log.write(" -Assigning Gene name using ensembl_hg19_gtf for protein coding genes")
258
+ log.write(" -Assigning Gene name using ensembl_hg19_gtf for protein coding genes", verbose=verbose)
244
259
  #zcat Homo_sapiens.GRCh37.75.gtf.gz|
245
260
  #grep -E 'processed_transcript|protein_coding|_gene'
246
261
  #| gzip >Homo_sapiens.GRCh37.75.processed.chr.gtf.gz
@@ -260,7 +275,7 @@ def annogene(
260
275
  list(output.apply(lambda x:closest_gene(x,data=data,chrom=chrom,pos=pos,source=source), axis=1)),
261
276
  index=output.index).values
262
277
  elif build=="38":
263
- if verbose:log.write(" -Assigning Gene name using ensembl_hg38_gtf for protein coding genes")
278
+ log.write(" -Assigning Gene name using ensembl_hg38_gtf for protein coding genes", verbose=verbose)
264
279
  #gtf_path = check_and_download("ensembl_hg38_gtf_protein_coding")
265
280
  gtf_path = check_and_download("ensembl_hg38_gtf")
266
281
  gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
@@ -277,7 +292,7 @@ def annogene(
277
292
 
278
293
  if source == "refseq":
279
294
  if build=="19":
280
- if verbose:log.write(" -Assigning Gene name using NCBI refseq latest GRCh37 for protein coding genes")
295
+ log.write(" -Assigning Gene name using NCBI refseq latest GRCh37 for protein coding genes", verbose=verbose)
281
296
  #gtf_path = check_and_download("refseq_hg19_gtf_protein_coding")
282
297
  gtf_path = check_and_download("refseq_hg19_gtf")
283
298
  gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
@@ -292,7 +307,7 @@ def annogene(
292
307
  list(output.apply(lambda x:closest_gene(x,data=data,chrom=chrom,pos=pos,source=source,build=build), axis=1)),
293
308
  index=output.index).values
294
309
  elif build=="38":
295
- if verbose:log.write(" -Assigning Gene name using NCBI refseq latest GRCh38 for protein coding genes")
310
+ log.write(" -Assigning Gene name using NCBI refseq latest GRCh38 for protein coding genes", verbose=verbose)
296
311
  #gtf_path = check_and_download("refseq_hg38_gtf_protein_coding")
297
312
  gtf_path = check_and_download("refseq_hg38_gtf")
298
313
  gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
@@ -306,7 +321,7 @@ def annogene(
306
321
  output.loc[:,["LOCATION","GENE"]] = pd.DataFrame(
307
322
  list(output.apply(lambda x:closest_gene(x,data=data,chrom=chrom,pos=pos,source=source,build=build), axis=1)),
308
323
  index=output.index).values
309
- if verbose: log.write("Finished annotating variants with nearest gene name(s) successfully!")
324
+ log.write("Finished annotating variants with nearest gene name(s) successfully!", verbose=verbose)
310
325
  return output
311
326
 
312
327
  def getnovel(insumstats,
@@ -318,6 +333,8 @@ def getnovel(insumstats,
318
333
  known=False,
319
334
  efo=False,
320
335
  only_novel=False,
336
+ group_key=None,
337
+ if_get_lead = True,
321
338
  windowsizekb_for_novel=1000,
322
339
  windowsizekb=500,
323
340
  sig_level=5e-8,
@@ -329,37 +346,48 @@ def getnovel(insumstats,
329
346
  gwascatalog_source="NCBI",
330
347
  output_known=False,
331
348
  verbose=True):
332
- if verbose: log.write("Start to check if lead variants are known...")
333
- allsig = getsig(insumstats=insumstats,
334
- id=id,chrom=chrom,pos=pos,p=p,use_p=use_p,windowsizekb=windowsizekb,sig_level=sig_level,log=log,
335
- xymt=xymt,anno=anno,build=build, source=source,verbose=verbose)
336
-
337
- big_number = 1000000000
338
- for i in range(7):
339
- if insumstats["POS"].max()*10 > big_number:
340
- big_number = int(big_number * 10)
341
- else:
342
- break
349
+ ##start function with col checking##########################################################
350
+ _start_line = "check if lead variants are known"
351
+ _end_line = "checking if lead variants are known"
352
+ _start_cols = [chrom,pos]
353
+ _start_function = ".get_novel()"
354
+ _must_args ={}
343
355
 
344
- # create helper column TCHR+POS for allsig
345
- allsig["TCHR+POS"]=allsig[chrom]*big_number + allsig[pos]
356
+ is_enough_info = start_to(sumstats=insumstats,
357
+ log=log,
358
+ verbose=verbose,
359
+ start_line=_start_line,
360
+ end_line=_end_line,
361
+ start_cols=_start_cols,
362
+ start_function=_start_function,
363
+ **_must_args)
364
+ if is_enough_info == False: return None
365
+ ############################################################################################
346
366
 
367
+ if if_get_lead == True:
368
+ allsig = getsig(insumstats=insumstats,
369
+ id=id,chrom=chrom,pos=pos,p=p,use_p=use_p,windowsizekb=windowsizekb,sig_level=sig_level,log=log,
370
+ xymt=xymt,anno=anno,build=build, source=source,verbose=verbose)
371
+ else:
372
+ allsig = insumstats.copy()
373
+
374
+ ############################################################################################
347
375
  knownsig = pd.DataFrame()
348
376
  if efo != False:
349
377
  if type(efo) is not list:
350
- if verbose: log.write("Start to retrieve data using EFO: {}...".format(efo))
378
+ log.write("Start to retrieve data using EFO: {}...".format(efo), verbose=verbose)
351
379
  known_Sumstats = gwascatalog_trait(efo,source=gwascatalog_source,sig_level=sig_level,verbose=verbose,log=log)
352
380
  knownsig = known_Sumstats.data.copy()
353
381
  else:
354
382
  knownsig=pd.DataFrame()
355
- if verbose: log.write("Start to retrieve data using {} EFOs: {}...".format(len(efo),efo))
383
+ log.write("Start to retrieve data using {} EFOs: {}...".format(len(efo),efo), verbose=verbose)
356
384
  for single_efo in efo:
357
385
  known_Sumstats = gwascatalog_trait(single_efo,source=gwascatalog_source,sig_level=sig_level,verbose=verbose,log=log)
358
386
  known_Sumstats.data["EFOID"] = single_efo
359
387
  knownsig = pd.concat([known_Sumstats.data, knownsig],ignore_index=True)
360
388
  knownsig["CHR"] = knownsig["CHR"].astype("Int64")
361
389
  knownsig["POS"] = knownsig["POS"].astype("Int64")
362
- if verbose: log.write(" -Retrieved {} associations from GWAS catalog.".format(len(knownsig)))
390
+ log.write(" -Retrieved {} associations from GWAS catalog.".format(len(knownsig)), verbose=verbose)
363
391
  if type(known) is pd.DataFrame:
364
392
  knownsig_2 = known.copy()
365
393
  knownsig = pd.concat([knownsig, knownsig_2],ignore_index=True)
@@ -374,19 +402,230 @@ def getnovel(insumstats,
374
402
  knownsig["POS"] = knownsig["POS"].astype("Int64")
375
403
  if "SNPID" not in knownsig.columns:
376
404
  knownsig["SNPID"] =knownsig["CHR"].astype("string") + ":" + knownsig["POS"].astype("string")
405
+
377
406
  if len(knownsig)<1:
378
407
  raise ValueError("Please input a dataframe of known loci or valid efo code")
379
-
380
- # create helper column TCHR+POS for knownsig
381
- knownsig["TCHR+POS"]=knownsig[chrom]*big_number + knownsig[pos]
382
-
383
- if verbose: log.write(" -Lead variants in known loci:",len(knownsig))
384
- if verbose: log.write(" -Checking the minimum distance between identified lead variants and provided known variants...")
385
408
 
409
+ if group_key is not None:
410
+ if (group_key not in allsig.columns) or (group_key not in knownsig.columns):
411
+ raise ValueError("Please check if group_key is in both sumstats and list of known associations.")
412
+
413
+ # create helper column TCHR+POS for knownsig and all sig
414
+ ############################################################################################
415
+ maxpos = insumstats["POS"].max()
416
+ big_number = determine_big_number(maxpos)
417
+ knownsig = add_tchr_pos(knownsig, chrom, pos, big_number)
418
+ allsig = add_tchr_pos(allsig, chrom, pos, big_number)
419
+ ############################################################################################
386
420
  #sorting
387
421
  allsig = allsig.sort_values(by="TCHR+POS",ignore_index=True)
388
422
  knownsig = knownsig.sort_values(by="TCHR+POS",ignore_index=True)
423
+ ############################################################################################
424
+ if group_key is not None:
425
+ number_of_groups_allsig = allsig[group_key].nunique()
426
+ number_of_groups_known = knownsig[group_key].nunique()
427
+ log.write(" -Number of groups in sumstats:{}".format(number_of_groups_allsig), verbose=verbose)
428
+ log.write(" -Number of groups in reference:{}".format(number_of_groups_known), verbose=verbose)
429
+
430
+ log.write(" -Lead variants in known loci:",len(knownsig), verbose=verbose)
431
+ log.write(" -Checking the minimum distance between identified lead variants and provided known variants...", verbose=verbose)
432
+
433
+ ############################################################################################
434
+ if group_key is None:
435
+ # get distance
436
+ allsig = determine_distance(allsig, knownsig)
437
+ # get other info
438
+ allsig = fill_meta_info_for_known(allsig, knownsig)
439
+ ############################################################################################
440
+ # determine if novel
441
+ allsig = determine_novel(allsig, windowsizekb_for_novel)
442
+ # determine location
443
+ allsig = determine_location(allsig)
444
+ # if not on same chromosome, distance set to pd.NA
445
+ allsig = determine_if_same_chromosome(allsig, knownsig, maxpos)
446
+ ############################################################################################
447
+ else:
448
+ #groups1 = set(allsig[group_key].unique())
449
+ #groups2 = set(knownsig[group_key].unique())
450
+ #common_group = groups1.intersection(groups2)
451
+
452
+ #allsig_no_group = allsig.loc[~allsig[group_key].isin(common_group),:].copy()
453
+ allsig_group = pd.DataFrame()
454
+
455
+ for key in allsig[group_key].unique():
456
+ allsig_single_group = allsig.loc[allsig[group_key]==key,:].copy()
457
+ knownsig_single_group = knownsig.loc[knownsig[group_key]==key,:].copy()
458
+
459
+ #if len(allsig_single_group) >0 and len(knownsig_single_group) >0:
460
+ allsig_single_group = determine_distance(allsig_single_group, knownsig_single_group)
461
+ # get other info
462
+ allsig_single_group = fill_meta_info_for_known(allsig_single_group, knownsig_single_group)
463
+
464
+ # determine if novel
465
+ allsig_single_group = determine_novel(allsig_single_group, windowsizekb_for_novel)
466
+
467
+ # determine location
468
+ allsig_single_group = determine_location(allsig_single_group)
469
+
470
+ # if not on same chromosome, distance set to pd.NA
471
+ allsig_single_group = determine_if_same_chromosome(allsig_single_group, knownsig_single_group, maxpos)
472
+
473
+ allsig_group = pd.concat([allsig_group, allsig_single_group], ignore_index=True)
474
+
475
+ allsig = allsig_group
476
+ #pd.concat([allsig_no_group, allsig_group], ignore_index=True)
477
+
478
+ # drop helper column TCHR+POS
479
+ allsig = allsig.drop(["TCHR+POS"], axis=1)
480
+
481
+ try:
482
+ allsig = allsig.where(~pd.isna(allsig), pd.NA)
483
+ except:
484
+ pass
485
+
486
+ log.write(" -Identified ",len(allsig)-sum(allsig["NOVEL"])," known vairants in current sumstats...", verbose=verbose)
487
+ log.write(" -Identified ",sum(allsig["NOVEL"])," novel vairants in current sumstats...", verbose=verbose)
488
+
489
+ finished(log,verbose,_end_line)
490
+
491
+ # how to return
492
+ if only_novel is True:
493
+ if output_known is True:
494
+ return allsig.loc[allsig["NOVEL"],:], knownsig
495
+ else:
496
+ return allsig.loc[allsig["NOVEL"],:]
497
+ else:
498
+ if output_known is True:
499
+ return allsig, knownsig
500
+ else:
501
+ return allsig
502
+ ##################################################################################################################################################################################################
503
+
504
+
505
+ def _check_cis(insumstats,
506
+ id,
507
+ chrom,
508
+ pos,
509
+ p,
510
+ use_p=False,
511
+ known=False,
512
+ group_key=None,
513
+ if_get_lead = False,
514
+ windowsizekb=500,
515
+ sig_level=5e-8,
516
+ log=Log(),
517
+ xymt=["X","Y","MT"],
518
+ anno=False,
519
+ build="19",
520
+ source="ensembl",
521
+ verbose=True):
522
+ ##start function with col checking##########################################################
523
+ _start_line = "check if variants are in cis or trans regions"
524
+ _end_line = "checking if variants are in cis or trans regions"
525
+ _start_cols = [chrom,pos, group_key]
526
+ _start_function = ".check_cis()"
527
+ _must_args ={}
528
+
529
+ is_enough_info = start_to(sumstats=insumstats,
530
+ log=log,
531
+ verbose=verbose,
532
+ start_line=_start_line,
533
+ end_line=_end_line,
534
+ start_cols=_start_cols,
535
+ start_function=_start_function,
536
+ **_must_args)
537
+ if is_enough_info == False: return None
538
+ ############################################################################################
389
539
 
540
+ if if_get_lead == True:
541
+ allsig = getsig(insumstats=insumstats,
542
+ id=id,chrom=chrom,pos=pos,p=p,use_p=use_p,windowsizekb=windowsizekb,sig_level=sig_level,log=log,
543
+ xymt=xymt,anno=anno,build=build, source=source,verbose=verbose)
544
+ else:
545
+ allsig = insumstats.copy()
546
+
547
+ ############################################################################################
548
+ knownsig = pd.DataFrame()
549
+ if type(known) is pd.DataFrame:
550
+ knownsig_2 = known.copy()
551
+ knownsig = pd.concat([knownsig, knownsig_2],ignore_index=True)
552
+ knownsig["CHR"] = knownsig["CHR"].astype("Int64")
553
+ knownsig["START"] = knownsig["START"].astype("Int64")
554
+ knownsig["END"] = knownsig["END"].astype("Int64")
555
+ elif type(known) is str:
556
+ knownsig_2 = pd.read_csv(known,sep="\s+",dtype={"CHR":"Int64","POS":"Int64"})
557
+ knownsig = pd.concat([knownsig, knownsig_2],ignore_index=True)
558
+ knownsig["CHR"] = knownsig["CHR"].astype("Int64")
559
+ knownsig["START"] = knownsig["START"].astype("Int64")
560
+ knownsig["END"] = knownsig["END"].astype("Int64")
561
+
562
+ if len(knownsig)<1:
563
+ raise ValueError("Please input a dataframe of gene list with GENE, CHR, START, END.")
564
+
565
+ if group_key is not None:
566
+ if group_key not in knownsig.columns:
567
+ raise ValueError("Please check if group_key is in both sumstats and list of known associations.")
568
+
569
+ ############################################################################################
570
+ if group_key is not None:
571
+ number_of_groups_allsig = allsig[group_key].nunique()
572
+ number_of_groups_known = knownsig[group_key].nunique()
573
+ log.write(" -Number of groups in sumstats:{}".format(number_of_groups_allsig), verbose=verbose)
574
+ log.write(" -Number of groups in reference:{}".format(number_of_groups_known), verbose=verbose)
575
+
576
+ log.write(" -Checking if variants in cis/trans regions grouped by {}...".format(group_key), verbose=verbose)
577
+ log.write(" -Window size in kb adding to start and end: {}...".format(windowsizekb), verbose=verbose)
578
+ ############################################################################################
579
+ #convert to a dict
580
+ reference_dict = {}
581
+ for index,row in knownsig.iterrows():
582
+ reference_dict[row[group_key]] = (row["CHR"], row["START"], row["END"] )
583
+ ############################################################################################
584
+ try:
585
+ no_reference_avaialble = allsig.loc[~allsig[group_key].isin(reference_dict.keys()),group_key]
586
+ if len(no_reference_avaialble)>0:
587
+ log.write(" -Groups not in reference: {}".format( ",".join(no_reference_avaialble.unique())), verbose=verbose)
588
+ except:
589
+ pass
590
+
591
+ allsig["CIS/TRANS"] = allsig.apply(lambda x: determine_if_cis(x, group_key,windowsizekb, reference_dict), axis=1)
592
+
593
+ try:
594
+ allsig = allsig.where(~pd.isna(allsig), pd.NA)
595
+ except:
596
+ pass
597
+
598
+ try:
599
+ number_of_cis = sum(allsig["CIS/TRANS"] == "Cis")
600
+ number_of_trans = sum(allsig["CIS/TRANS"] == "Trans")
601
+ number_of_noreference = sum(allsig["CIS/TRANS"] == "NoReference")
602
+ log.write (" -Number of Cis variants: {}".format(number_of_cis),verbose=verbose)
603
+ log.write (" -Number of Trans variants: {}".format(number_of_trans),verbose=verbose)
604
+ log.write (" -Number of NoReference variants: {}".format(number_of_noreference),verbose=verbose)
605
+ except:
606
+ pass
607
+
608
+ finished(log,verbose,_end_line)
609
+
610
+ return allsig
611
+
612
+ ###################################################################################################################################################################################################
613
+
614
+
615
+ def determine_big_number(maxpos, big_number = 1000000000):
616
+ for i in range(7):
617
+ if maxpos*10 > big_number:
618
+ big_number = int(big_number * 10)
619
+ else:
620
+ break
621
+ return big_number
622
+
623
+ def add_tchr_pos(df, chrom, pos, big_number):
624
+ df["TCHR+POS"]=df[chrom]*big_number + df[pos]
625
+ return df
626
+
627
+ def fill_meta_info_for_known(allsig, knownsig):
628
+ if len(allsig)==0 or len(knownsig)==0: return allsig
390
629
  if "SNPID" in knownsig.columns:
391
630
  knownids=knownsig["SNPID"].values
392
631
  if "PUBMEDID" in knownsig.columns:
@@ -395,12 +634,7 @@ def getnovel(insumstats,
395
634
  knownauthor=knownsig["AUTHOR"].values
396
635
  if "EFOID" in knownsig.columns:
397
636
  knownefo=knownsig["EFOID"].values
398
-
399
- # get distance
400
- lambda x:np.min(np.abs(knownsig["TCHR+POS"]-x))
401
- allsig["DISTANCE_TO_KNOWN"] = allsig["TCHR+POS"].apply(lambda x:min(knownsig["TCHR+POS"]-x, key=abs))
402
-
403
- # get other info
637
+
404
638
  if "SNPID" in knownsig.columns:
405
639
  allsig["KNOWN_ID"] = allsig["TCHR+POS"].apply(lambda x:knownids[np.argmin(np.abs(knownsig["TCHR+POS"]-x))])
406
640
  if "PUBMEDID" in knownsig.columns:
@@ -408,20 +642,48 @@ def getnovel(insumstats,
408
642
  if "AUTHOR" in knownsig.columns:
409
643
  allsig["KNOWN_AUTHOR"] = allsig["TCHR+POS"].apply(lambda x:knownauthor[np.argmin(np.abs(knownsig["TCHR+POS"]-x))])
410
644
  if "EFOID" in knownsig.columns:
411
- allsig["KNOWN_EFOID"] = allsig["TCHR+POS"].apply(lambda x:knownefo[np.argmin(np.abs(knownsig["TCHR+POS"]-x))])
645
+ allsig["KNOWN_EFOID"] = allsig["TCHR+POS"].apply(lambda x:knownefo[np.argmin(np.abs(knownsig["TCHR+POS"]-x))])
646
+ return allsig
647
+
648
+ def determine_if_cis(x, group_key,windowsizekb, reference_dict):
649
+ if x[group_key] in reference_dict.keys():
650
+ is_same_chr = str(reference_dict[x[group_key]][0]) == str(x["CHR"])
651
+ is_large_than_start = int(reference_dict[x[group_key]][1]) - windowsizekb*1000 <= x["POS"]
652
+ is_smaller_than_end = int(reference_dict[x[group_key]][2]) + windowsizekb*1000 >= x["POS"]
653
+
654
+ if is_same_chr and is_large_than_start and is_smaller_than_end:
655
+ return "Cis"
656
+ else:
657
+ return "Trans"
658
+ else:
659
+ return "NoReference"
412
660
 
413
- # determine if novel
661
+ def determine_distance(allsig, knownsig):
662
+ if len(allsig)==0:
663
+ return allsig
664
+ if len(knownsig)==0:
665
+ allsig["DISTANCE_TO_KNOWN"] = pd.NA
666
+ return allsig
667
+ allsig["DISTANCE_TO_KNOWN"] = allsig["TCHR+POS"].apply(lambda x:min(knownsig["TCHR+POS"]-x, key=abs))
668
+ return allsig
669
+
670
+ def determine_novel(allsig, windowsizekb_for_novel):
671
+ if len(allsig)==0 or "DISTANCE_TO_KNOWN" not in allsig.columns:
672
+ return allsig
414
673
  allsig["NOVEL"] = allsig["DISTANCE_TO_KNOWN"].abs() > windowsizekb_for_novel*1000
415
-
416
- # determine location
417
- allsig["LOCATION_OF_KNOWN"]="Unknown"
674
+ allsig.loc[allsig["DISTANCE_TO_KNOWN"].isna(), "NOVEL"] = True
675
+ return allsig
676
+
677
+ def determine_location(allsig):
678
+ allsig["LOCATION_OF_KNOWN"]="NoReference"
418
679
  allsig.loc[ allsig["DISTANCE_TO_KNOWN"]== 0,"LOCATION_OF_KNOWN"] = "Same"
419
680
  allsig.loc[ allsig["DISTANCE_TO_KNOWN"] > 0 ,"LOCATION_OF_KNOWN"] = "Upstream"
420
681
  allsig.loc[ allsig["DISTANCE_TO_KNOWN"] < 0 ,"LOCATION_OF_KNOWN"] = "Downstream"
682
+ return allsig
421
683
 
422
- # if not on same chromosome, distance set to pd.NA
423
- if sum(allsig["DISTANCE_TO_KNOWN"].abs() > insumstats["POS"].max())>0:
424
- not_on_same_chromosome = allsig["DISTANCE_TO_KNOWN"].abs() > insumstats["POS"].max()
684
+ def determine_if_same_chromosome(allsig, knownsig, maxpos):
685
+ if sum(allsig["DISTANCE_TO_KNOWN"].abs() > maxpos)>0:
686
+ not_on_same_chromosome = allsig["DISTANCE_TO_KNOWN"].abs() > maxpos
425
687
  allsig.loc[ not_on_same_chromosome ,"DISTANCE_TO_KNOWN"] = pd.NA
426
688
  allsig.loc[ not_on_same_chromosome ,"LOCATION_OF_KNOWN"] = "NoneOnThisChr"
427
689
  if "SNPID" in knownsig.columns:
@@ -432,23 +694,135 @@ def getnovel(insumstats,
432
694
  allsig.loc[ not_on_same_chromosome ,"KNOWN_AUTHOR"] = pd.NA
433
695
  if "EFOID" in knownsig.columns:
434
696
  allsig.loc[ not_on_same_chromosome ,"KNOWN_EFOID"] = pd.NA
697
+ return allsig
435
698
 
436
- # drop helper column TCHR+POS
437
- allsig = allsig.drop(["TCHR+POS"], axis=1)
699
+ def _check_novel_set(insumstats,
700
+ id,
701
+ chrom,
702
+ pos,
703
+ p,
704
+ use_p=False,
705
+ known=False,
706
+ group_key=None,
707
+ snpset="SNPSET",
708
+ snpid="SNPID",
709
+ if_get_lead = False,
710
+ windowsizekb=500,
711
+ sig_level=5e-8,
712
+ log=Log(),
713
+ xymt=["X","Y","MT"],
714
+ anno=False,
715
+ build="19",
716
+ source="ensembl",
717
+ verbose=True):
718
+
719
+ ##start function with col checking##########################################################
720
+ _start_line = "check if variant sets are overlapping with those in reference file"
721
+ _end_line = "checking if variant sets are overlapping with those in reference file"
722
+ _start_cols = [chrom,pos, group_key]
723
+ _start_function = ".check_cis()"
724
+ _must_args ={}
438
725
 
439
- if verbose: log.write(" -Identified ",len(allsig)-sum(allsig["NOVEL"])," known vairants in current sumstats...")
440
- if verbose: log.write(" -Identified ",sum(allsig["NOVEL"])," novel vairants in current sumstats...")
441
- if verbose: log.write("Finished checking known or novel successfully!")
442
- gc.collect()
726
+ is_enough_info = start_to(sumstats=insumstats,
727
+ log=log,
728
+ verbose=verbose,
729
+ start_line=_start_line,
730
+ end_line=_end_line,
731
+ start_cols=_start_cols,
732
+ start_function=_start_function,
733
+ **_must_args)
734
+ if is_enough_info == False: return None
735
+ ############################################################################################
443
736
 
444
- # how to return
445
- if only_novel is True:
446
- if output_known is True:
447
- return allsig.loc[allsig["NOVEL"],:], knownsig
448
- else:
449
- return allsig.loc[allsig["NOVEL"],:]
737
+ if if_get_lead == True:
738
+ allsig = getsig(insumstats=insumstats,
739
+ id=id,chrom=chrom,pos=pos,p=p,use_p=use_p,windowsizekb=windowsizekb,sig_level=sig_level,log=log,
740
+ xymt=xymt,anno=anno,build=build, source=source,verbose=verbose)
450
741
  else:
451
- if output_known is True:
452
- return allsig, knownsig
742
+ allsig = insumstats.copy()
743
+
744
+ ############################################################################################
745
+ knownsig = pd.DataFrame()
746
+ if type(known) is pd.DataFrame:
747
+ knownsig_2 = known.copy()
748
+ knownsig = pd.concat([knownsig, knownsig_2],ignore_index=True)
749
+ knownsig[snpid] = knownsig[snpid].astype("string")
750
+ knownsig[snpset] = knownsig[snpset].astype("string")
751
+ knownsig[group_key] = knownsig[group_key].astype("string")
752
+ elif type(known) is str:
753
+ knownsig_2 = pd.read_csv(known,sep="\s+",dtype={"CHR":"Int64","POS":"Int64"})
754
+ knownsig = pd.concat([knownsig, knownsig_2],ignore_index=True)
755
+ knownsig[snpid] = knownsig[snpid].astype("string")
756
+ knownsig[snpset] = knownsig[snpset].astype("string")
757
+ knownsig[group_key] = knownsig[group_key].astype("string")
758
+
759
+ if len(knownsig)<1:
760
+ raise ValueError("Please input a dataframe of gene list with GENE, CHR, START, END.")
761
+
762
+ if group_key is not None:
763
+ if group_key not in knownsig.columns:
764
+ raise ValueError("Please check if group_key is in both sumstats and list of known associations.")
765
+
766
+ ############################################################################################
767
+ if group_key is not None:
768
+ number_of_groups_allsig = allsig[group_key].nunique()
769
+ number_of_groups_known = knownsig[group_key].nunique()
770
+ log.write(" -Number of groups in sumstats:{}".format(number_of_groups_allsig), verbose=verbose)
771
+ log.write(" -Number of groups in reference:{}".format(number_of_groups_known), verbose=verbose)
772
+
773
+ log.write(" -Checking if variants in cis/trans regions grouped by {}...".format(group_key), verbose=verbose)
774
+
775
+ ############################################################################################
776
+ #convert to a dict
777
+ reference_dict = {}
778
+
779
+ for index,row in knownsig.iterrows():
780
+ if row[group_key] in reference_dict.keys():
781
+ if row[snpset] in reference_dict[row[group_key]].keys():
782
+ reference_dict[row[group_key]][row[snpset]].add(row[snpid])
783
+ else:
784
+ reference_dict[row[group_key]][row[snpset]] = set([row[snpid]])
453
785
  else:
454
- return allsig
786
+ reference_dict[row[group_key]] = {row[snpset]:set([row[snpid]])}
787
+ ############################################################################################
788
+
789
+ try:
790
+ no_reference_avaialble = allsig.loc[~allsig[group_key].isin(reference_dict.keys()),group_key]
791
+ if len(no_reference_avaialble)>0:
792
+ log.write(" -Groups not in reference: {}".format( ",".join(no_reference_avaialble)), verbose=verbose)
793
+ except:
794
+ pass
795
+
796
+ log.write(" -Checking if variants are in reference variant sets...", verbose=verbose)
797
+ known_list = allsig.apply(lambda x: check_overlap(x,snpid, group_key,reference_dict), axis=1)
798
+
799
+ allsig["KNOWN_SET"] = known_list.str[0]
800
+ allsig["KNOWN_VARIANT"] = known_list.str[1]
801
+
802
+ back_dict={}
803
+ for i in allsig[group_key].unique():
804
+ back_dict[i] ={}
805
+ for j in allsig.loc[allsig[group_key]==i,snpset].unique():
806
+ back_dict[i][j] =set()
807
+ for index, row in allsig.loc[(allsig[group_key]==i) & (allsig[snpset]==j) & (~allsig["KNOWN_SET"].isna()),:].iterrows():
808
+ back_dict[i][j].add("{}-{}-{}".format(row[group_key], row["KNOWN_SET"],row["KNOWN_VARIANT"]))
809
+
810
+ allsig["KNOWN_SET_VARIANT"] = allsig.apply(lambda x: assign_set_variant(x,group_key,snpset,back_dict), axis=1)
811
+
812
+ finished(log,verbose,_end_line)
813
+
814
+ return allsig
815
+
816
+ def check_overlap(x,snpid, group_key,reference_dict):
817
+ if x[group_key] in reference_dict.keys():
818
+ for key, value in reference_dict[x[group_key]].items():
819
+ if x[snpid] in value:
820
+ return key, x[snpid]
821
+ return pd.NA, pd.NA,
822
+
823
+ def assign_set_variant(x,group_key,snpset,back_dict):
824
+ if x[group_key] in back_dict.keys():
825
+ if x[snpset] in back_dict[x[group_key]].keys():
826
+ if len(back_dict[x[group_key]][x[snpset]]) >0:
827
+ return back_dict[x[group_key]][x[snpset]]
828
+ return pd.NA