gwaslab 3.4.38__py3-none-any.whl → 3.4.39__py3-none-any.whl

This diff shows the changes between two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (51)
  1. gwaslab/bd_common_data.py +6 -3
  2. gwaslab/bd_download.py +9 -9
  3. gwaslab/bd_get_hapmap3.py +43 -9
  4. gwaslab/g_Log.py +14 -5
  5. gwaslab/g_Sumstats.py +86 -18
  6. gwaslab/g_SumstatsPair.py +70 -23
  7. gwaslab/g_SumstatsT.py +2 -2
  8. gwaslab/g_version.py +10 -10
  9. gwaslab/hm_casting.py +9 -4
  10. gwaslab/hm_harmonize_sumstats.py +88 -83
  11. gwaslab/io_preformat_input.py +14 -14
  12. gwaslab/io_read_ldsc.py +49 -1
  13. gwaslab/ldsc_irwls.py +198 -0
  14. gwaslab/ldsc_jackknife.py +514 -0
  15. gwaslab/ldsc_ldscore.py +417 -0
  16. gwaslab/ldsc_parse.py +294 -0
  17. gwaslab/ldsc_regressions.py +747 -0
  18. gwaslab/ldsc_sumstats.py +629 -0
  19. gwaslab/qc_check_datatype.py +1 -1
  20. gwaslab/qc_fix_sumstats.py +163 -161
  21. gwaslab/util_ex_calculate_ldmatrix.py +2 -2
  22. gwaslab/util_ex_gwascatalog.py +24 -24
  23. gwaslab/util_ex_ldproxyfinder.py +9 -9
  24. gwaslab/util_ex_ldsc.py +189 -0
  25. gwaslab/util_in_calculate_gc.py +6 -6
  26. gwaslab/util_in_calculate_power.py +42 -43
  27. gwaslab/util_in_convert_h2.py +8 -8
  28. gwaslab/util_in_fill_data.py +28 -28
  29. gwaslab/util_in_filter_value.py +91 -52
  30. gwaslab/util_in_get_density.py +8 -8
  31. gwaslab/util_in_get_sig.py +407 -65
  32. gwaslab/viz_aux_annotate_plot.py +12 -12
  33. gwaslab/viz_aux_quickfix.py +18 -18
  34. gwaslab/viz_aux_reposition_text.py +3 -3
  35. gwaslab/viz_aux_save_figure.py +14 -5
  36. gwaslab/viz_plot_compare_af.py +29 -30
  37. gwaslab/viz_plot_compare_effect.py +63 -71
  38. gwaslab/viz_plot_miamiplot2.py +6 -6
  39. gwaslab/viz_plot_mqqplot.py +17 -3
  40. gwaslab/viz_plot_qqplot.py +1 -1
  41. gwaslab/viz_plot_regionalplot.py +33 -32
  42. gwaslab/viz_plot_rg_heatmap.py +28 -26
  43. gwaslab/viz_plot_stackedregional.py +40 -21
  44. gwaslab/viz_plot_trumpetplot.py +50 -55
  45. gwaslab-3.4.39.dist-info/LICENSE +674 -0
  46. {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/METADATA +4 -3
  47. gwaslab-3.4.39.dist-info/RECORD +80 -0
  48. gwaslab-3.4.38.dist-info/RECORD +0 -72
  49. /gwaslab-3.4.38.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
  50. {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
  51. {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
@@ -58,9 +58,9 @@ def getsig(insumstats,
58
58
  if is_enough_info == False: return None
59
59
  ############################################################################################
60
60
 
61
- if verbose: log.write(" -Processing "+str(len(insumstats))+" variants...")
62
- if verbose: log.write(" -Significance threshold :", sig_level)
63
- if verbose: log.write(" -Sliding window size:", str(windowsizekb) ," kb")
61
+ log.write(" -Processing "+str(len(insumstats))+" variants...", verbose=verbose)
62
+ log.write(" -Significance threshold :", sig_level, verbose=verbose)
63
+ log.write(" -Sliding window size:", str(windowsizekb) ," kb", verbose=verbose)
64
64
 
65
65
  #load data
66
66
  sumstats=insumstats.loc[~insumstats[id].isna(),:].copy()
@@ -90,12 +90,12 @@ def getsig(insumstats,
90
90
  sumstats[p] = pd.to_numeric(sumstats[p], errors='coerce')
91
91
  sumstats_sig = sumstats.loc[sumstats[p]<sig_level,:].copy()
92
92
  sumstats_sig.loc[:,"__SCALEDP"] = pd.to_numeric(sumstats_sig[p], errors='coerce')
93
- if verbose:log.write(" -Found "+str(len(sumstats_sig))+" significant variants in total...")
93
+ log.write(" -Found "+str(len(sumstats_sig))+" significant variants in total...", verbose=verbose)
94
94
 
95
95
  #sort the coordinates
96
96
  sumstats_sig = sumstats_sig.sort_values([chrom,pos])
97
97
  if sumstats_sig is None:
98
- if verbose:log.write(" -No lead snps at given significance threshold!")
98
+ log.write(" -No lead snps at given significance threshold!", verbose=verbose)
99
99
  return None
100
100
 
101
101
  #init
@@ -148,7 +148,7 @@ def getsig(insumstats,
148
148
  sig_index_list.append(current_sig_index)
149
149
  continue
150
150
 
151
- if verbose:log.write(" -Identified "+str(len(sig_index_list))+" lead variants!")
151
+ log.write(" -Identified "+str(len(sig_index_list))+" lead variants!", verbose=verbose)
152
152
 
153
153
  # drop internal __SCALEDP
154
154
  sumstats_sig = sumstats_sig.drop("__SCALEDP",axis=1)
@@ -158,8 +158,8 @@ def getsig(insumstats,
158
158
 
159
159
  # annotate GENENAME
160
160
  if anno is True and len(output)>0:
161
- if verbose:log.write(" -Annotating variants using references:{}".format(source))
162
- if verbose:log.write(" -Annotating variants using references based on genome build:{}".format(build))
161
+ log.write(" -Annotating variants using references:{}".format(source), verbose=verbose)
162
+ log.write(" -Annotating variants using references based on genome build:{}".format(build), verbose=verbose)
163
163
 
164
164
  output = annogene(
165
165
  output,
@@ -249,13 +249,13 @@ def annogene(
249
249
  source="ensembl",
250
250
  verbose=True):
251
251
 
252
- if verbose: log.write("Start to annotate variants with nearest gene name(s)...")
252
+ log.write("Start to annotate variants with nearest gene name(s)...", verbose=verbose)
253
253
  output = insumstats.copy()
254
254
 
255
255
  if source == "ensembl":
256
256
  if build=="19":
257
257
  #data = EnsemblRelease(75)
258
- if verbose:log.write(" -Assigning Gene name using ensembl_hg19_gtf for protein coding genes")
258
+ log.write(" -Assigning Gene name using ensembl_hg19_gtf for protein coding genes", verbose=verbose)
259
259
  #zcat Homo_sapiens.GRCh37.75.gtf.gz|
260
260
  #grep -E 'processed_transcript|protein_coding|_gene'
261
261
  #| gzip >Homo_sapiens.GRCh37.75.processed.chr.gtf.gz
@@ -275,7 +275,7 @@ def annogene(
275
275
  list(output.apply(lambda x:closest_gene(x,data=data,chrom=chrom,pos=pos,source=source), axis=1)),
276
276
  index=output.index).values
277
277
  elif build=="38":
278
- if verbose:log.write(" -Assigning Gene name using ensembl_hg38_gtf for protein coding genes")
278
+ log.write(" -Assigning Gene name using ensembl_hg38_gtf for protein coding genes", verbose=verbose)
279
279
  #gtf_path = check_and_download("ensembl_hg38_gtf_protein_coding")
280
280
  gtf_path = check_and_download("ensembl_hg38_gtf")
281
281
  gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
@@ -292,7 +292,7 @@ def annogene(
292
292
 
293
293
  if source == "refseq":
294
294
  if build=="19":
295
- if verbose:log.write(" -Assigning Gene name using NCBI refseq latest GRCh37 for protein coding genes")
295
+ log.write(" -Assigning Gene name using NCBI refseq latest GRCh37 for protein coding genes", verbose=verbose)
296
296
  #gtf_path = check_and_download("refseq_hg19_gtf_protein_coding")
297
297
  gtf_path = check_and_download("refseq_hg19_gtf")
298
298
  gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
@@ -307,7 +307,7 @@ def annogene(
307
307
  list(output.apply(lambda x:closest_gene(x,data=data,chrom=chrom,pos=pos,source=source,build=build), axis=1)),
308
308
  index=output.index).values
309
309
  elif build=="38":
310
- if verbose:log.write(" -Assigning Gene name using NCBI refseq latest GRCh38 for protein coding genes")
310
+ log.write(" -Assigning Gene name using NCBI refseq latest GRCh38 for protein coding genes", verbose=verbose)
311
311
  #gtf_path = check_and_download("refseq_hg38_gtf_protein_coding")
312
312
  gtf_path = check_and_download("refseq_hg38_gtf")
313
313
  gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
@@ -321,7 +321,7 @@ def annogene(
321
321
  output.loc[:,["LOCATION","GENE"]] = pd.DataFrame(
322
322
  list(output.apply(lambda x:closest_gene(x,data=data,chrom=chrom,pos=pos,source=source,build=build), axis=1)),
323
323
  index=output.index).values
324
- if verbose: log.write("Finished annotating variants with nearest gene name(s) successfully!")
324
+ log.write("Finished annotating variants with nearest gene name(s) successfully!", verbose=verbose)
325
325
  return output
326
326
 
327
327
  def getnovel(insumstats,
@@ -333,6 +333,8 @@ def getnovel(insumstats,
333
333
  known=False,
334
334
  efo=False,
335
335
  only_novel=False,
336
+ group_key=None,
337
+ if_get_lead = True,
336
338
  windowsizekb_for_novel=1000,
337
339
  windowsizekb=500,
338
340
  sig_level=5e-8,
@@ -362,36 +364,30 @@ def getnovel(insumstats,
362
364
  if is_enough_info == False: return None
363
365
  ############################################################################################
364
366
 
365
- allsig = getsig(insumstats=insumstats,
366
- id=id,chrom=chrom,pos=pos,p=p,use_p=use_p,windowsizekb=windowsizekb,sig_level=sig_level,log=log,
367
- xymt=xymt,anno=anno,build=build, source=source,verbose=verbose)
368
-
369
- big_number = 1000000000
370
- for i in range(7):
371
- if insumstats["POS"].max()*10 > big_number:
372
- big_number = int(big_number * 10)
373
- else:
374
- break
367
+ if if_get_lead == True:
368
+ allsig = getsig(insumstats=insumstats,
369
+ id=id,chrom=chrom,pos=pos,p=p,use_p=use_p,windowsizekb=windowsizekb,sig_level=sig_level,log=log,
370
+ xymt=xymt,anno=anno,build=build, source=source,verbose=verbose)
371
+ else:
372
+ allsig = insumstats.copy()
375
373
 
376
- # create helper column TCHR+POS for allsig
377
- allsig["TCHR+POS"]=allsig[chrom]*big_number + allsig[pos]
378
-
374
+ ############################################################################################
379
375
  knownsig = pd.DataFrame()
380
376
  if efo != False:
381
377
  if type(efo) is not list:
382
- if verbose: log.write("Start to retrieve data using EFO: {}...".format(efo))
378
+ log.write("Start to retrieve data using EFO: {}...".format(efo), verbose=verbose)
383
379
  known_Sumstats = gwascatalog_trait(efo,source=gwascatalog_source,sig_level=sig_level,verbose=verbose,log=log)
384
380
  knownsig = known_Sumstats.data.copy()
385
381
  else:
386
382
  knownsig=pd.DataFrame()
387
- if verbose: log.write("Start to retrieve data using {} EFOs: {}...".format(len(efo),efo))
383
+ log.write("Start to retrieve data using {} EFOs: {}...".format(len(efo),efo), verbose=verbose)
388
384
  for single_efo in efo:
389
385
  known_Sumstats = gwascatalog_trait(single_efo,source=gwascatalog_source,sig_level=sig_level,verbose=verbose,log=log)
390
386
  known_Sumstats.data["EFOID"] = single_efo
391
387
  knownsig = pd.concat([known_Sumstats.data, knownsig],ignore_index=True)
392
388
  knownsig["CHR"] = knownsig["CHR"].astype("Int64")
393
389
  knownsig["POS"] = knownsig["POS"].astype("Int64")
394
- if verbose: log.write(" -Retrieved {} associations from GWAS catalog.".format(len(knownsig)))
390
+ log.write(" -Retrieved {} associations from GWAS catalog.".format(len(knownsig)), verbose=verbose)
395
391
  if type(known) is pd.DataFrame:
396
392
  knownsig_2 = known.copy()
397
393
  knownsig = pd.concat([knownsig, knownsig_2],ignore_index=True)
@@ -406,19 +402,230 @@ def getnovel(insumstats,
406
402
  knownsig["POS"] = knownsig["POS"].astype("Int64")
407
403
  if "SNPID" not in knownsig.columns:
408
404
  knownsig["SNPID"] =knownsig["CHR"].astype("string") + ":" + knownsig["POS"].astype("string")
405
+
409
406
  if len(knownsig)<1:
410
407
  raise ValueError("Please input a dataframe of known loci or valid efo code")
411
-
412
- # create helper column TCHR+POS for knownsig
413
- knownsig["TCHR+POS"]=knownsig[chrom]*big_number + knownsig[pos]
414
-
415
- if verbose: log.write(" -Lead variants in known loci:",len(knownsig))
416
- if verbose: log.write(" -Checking the minimum distance between identified lead variants and provided known variants...")
417
408
 
409
+ if group_key is not None:
410
+ if (group_key not in allsig.columns) or (group_key not in knownsig.columns):
411
+ raise ValueError("Please check if group_key is in both sumstats and list of known associations.")
412
+
413
+ # create helper column TCHR+POS for knownsig and all sig
414
+ ############################################################################################
415
+ maxpos = insumstats["POS"].max()
416
+ big_number = determine_big_number(maxpos)
417
+ knownsig = add_tchr_pos(knownsig, chrom, pos, big_number)
418
+ allsig = add_tchr_pos(allsig, chrom, pos, big_number)
419
+ ############################################################################################
418
420
  #sorting
419
421
  allsig = allsig.sort_values(by="TCHR+POS",ignore_index=True)
420
422
  knownsig = knownsig.sort_values(by="TCHR+POS",ignore_index=True)
423
+ ############################################################################################
424
+ if group_key is not None:
425
+ number_of_groups_allsig = allsig[group_key].nunique()
426
+ number_of_groups_known = knownsig[group_key].nunique()
427
+ log.write(" -Number of groups in sumstats:{}".format(number_of_groups_allsig), verbose=verbose)
428
+ log.write(" -Number of groups in reference:{}".format(number_of_groups_known), verbose=verbose)
429
+
430
+ log.write(" -Lead variants in known loci:",len(knownsig), verbose=verbose)
431
+ log.write(" -Checking the minimum distance between identified lead variants and provided known variants...", verbose=verbose)
432
+
433
+ ############################################################################################
434
+ if group_key is None:
435
+ # get distance
436
+ allsig = determine_distance(allsig, knownsig)
437
+ # get other info
438
+ allsig = fill_meta_info_for_known(allsig, knownsig)
439
+ ############################################################################################
440
+ # determine if novel
441
+ allsig = determine_novel(allsig, windowsizekb_for_novel)
442
+ # determine location
443
+ allsig = determine_location(allsig)
444
+ # if not on same chromosome, distance set to pd.NA
445
+ allsig = determine_if_same_chromosome(allsig, knownsig, maxpos)
446
+ ############################################################################################
447
+ else:
448
+ #groups1 = set(allsig[group_key].unique())
449
+ #groups2 = set(knownsig[group_key].unique())
450
+ #common_group = groups1.intersection(groups2)
451
+
452
+ #allsig_no_group = allsig.loc[~allsig[group_key].isin(common_group),:].copy()
453
+ allsig_group = pd.DataFrame()
454
+
455
+ for key in allsig[group_key].unique():
456
+ allsig_single_group = allsig.loc[allsig[group_key]==key,:].copy()
457
+ knownsig_single_group = knownsig.loc[knownsig[group_key]==key,:].copy()
458
+
459
+ #if len(allsig_single_group) >0 and len(knownsig_single_group) >0:
460
+ allsig_single_group = determine_distance(allsig_single_group, knownsig_single_group)
461
+ # get other info
462
+ allsig_single_group = fill_meta_info_for_known(allsig_single_group, knownsig_single_group)
463
+
464
+ # determine if novel
465
+ allsig_single_group = determine_novel(allsig_single_group, windowsizekb_for_novel)
466
+
467
+ # determine location
468
+ allsig_single_group = determine_location(allsig_single_group)
469
+
470
+ # if not on same chromosome, distance set to pd.NA
471
+ allsig_single_group = determine_if_same_chromosome(allsig_single_group, knownsig_single_group, maxpos)
472
+
473
+ allsig_group = pd.concat([allsig_group, allsig_single_group], ignore_index=True)
474
+
475
+ allsig = allsig_group
476
+ #pd.concat([allsig_no_group, allsig_group], ignore_index=True)
477
+
478
+ # drop helper column TCHR+POS
479
+ allsig = allsig.drop(["TCHR+POS"], axis=1)
480
+
481
+ try:
482
+ allsig = allsig.where(~pd.isna(allsig), pd.NA)
483
+ except:
484
+ pass
485
+
486
+ log.write(" -Identified ",len(allsig)-sum(allsig["NOVEL"])," known vairants in current sumstats...", verbose=verbose)
487
+ log.write(" -Identified ",sum(allsig["NOVEL"])," novel vairants in current sumstats...", verbose=verbose)
488
+
489
+ finished(log,verbose,_end_line)
490
+
491
+ # how to return
492
+ if only_novel is True:
493
+ if output_known is True:
494
+ return allsig.loc[allsig["NOVEL"],:], knownsig
495
+ else:
496
+ return allsig.loc[allsig["NOVEL"],:]
497
+ else:
498
+ if output_known is True:
499
+ return allsig, knownsig
500
+ else:
501
+ return allsig
502
+ ##################################################################################################################################################################################################
503
+
504
+
505
+ def _check_cis(insumstats,
506
+ id,
507
+ chrom,
508
+ pos,
509
+ p,
510
+ use_p=False,
511
+ known=False,
512
+ group_key=None,
513
+ if_get_lead = False,
514
+ windowsizekb=500,
515
+ sig_level=5e-8,
516
+ log=Log(),
517
+ xymt=["X","Y","MT"],
518
+ anno=False,
519
+ build="19",
520
+ source="ensembl",
521
+ verbose=True):
522
+ ##start function with col checking##########################################################
523
+ _start_line = "check if variants are in cis or trans regions"
524
+ _end_line = "checking if variants are in cis or trans regions"
525
+ _start_cols = [chrom,pos, group_key]
526
+ _start_function = ".check_cis()"
527
+ _must_args ={}
528
+
529
+ is_enough_info = start_to(sumstats=insumstats,
530
+ log=log,
531
+ verbose=verbose,
532
+ start_line=_start_line,
533
+ end_line=_end_line,
534
+ start_cols=_start_cols,
535
+ start_function=_start_function,
536
+ **_must_args)
537
+ if is_enough_info == False: return None
538
+ ############################################################################################
539
+
540
+ if if_get_lead == True:
541
+ allsig = getsig(insumstats=insumstats,
542
+ id=id,chrom=chrom,pos=pos,p=p,use_p=use_p,windowsizekb=windowsizekb,sig_level=sig_level,log=log,
543
+ xymt=xymt,anno=anno,build=build, source=source,verbose=verbose)
544
+ else:
545
+ allsig = insumstats.copy()
546
+
547
+ ############################################################################################
548
+ knownsig = pd.DataFrame()
549
+ if type(known) is pd.DataFrame:
550
+ knownsig_2 = known.copy()
551
+ knownsig = pd.concat([knownsig, knownsig_2],ignore_index=True)
552
+ knownsig["CHR"] = knownsig["CHR"].astype("Int64")
553
+ knownsig["START"] = knownsig["START"].astype("Int64")
554
+ knownsig["END"] = knownsig["END"].astype("Int64")
555
+ elif type(known) is str:
556
+ knownsig_2 = pd.read_csv(known,sep="\s+",dtype={"CHR":"Int64","POS":"Int64"})
557
+ knownsig = pd.concat([knownsig, knownsig_2],ignore_index=True)
558
+ knownsig["CHR"] = knownsig["CHR"].astype("Int64")
559
+ knownsig["START"] = knownsig["START"].astype("Int64")
560
+ knownsig["END"] = knownsig["END"].astype("Int64")
561
+
562
+ if len(knownsig)<1:
563
+ raise ValueError("Please input a dataframe of gene list with GENE, CHR, START, END.")
564
+
565
+ if group_key is not None:
566
+ if group_key not in knownsig.columns:
567
+ raise ValueError("Please check if group_key is in both sumstats and list of known associations.")
568
+
569
+ ############################################################################################
570
+ if group_key is not None:
571
+ number_of_groups_allsig = allsig[group_key].nunique()
572
+ number_of_groups_known = knownsig[group_key].nunique()
573
+ log.write(" -Number of groups in sumstats:{}".format(number_of_groups_allsig), verbose=verbose)
574
+ log.write(" -Number of groups in reference:{}".format(number_of_groups_known), verbose=verbose)
575
+
576
+ log.write(" -Checking if variants in cis/trans regions grouped by {}...".format(group_key), verbose=verbose)
577
+ log.write(" -Window size in kb adding to start and end: {}...".format(windowsizekb), verbose=verbose)
578
+ ############################################################################################
579
+ #convert to a dict
580
+ reference_dict = {}
581
+ for index,row in knownsig.iterrows():
582
+ reference_dict[row[group_key]] = (row["CHR"], row["START"], row["END"] )
583
+ ############################################################################################
584
+ try:
585
+ no_reference_avaialble = allsig.loc[~allsig[group_key].isin(reference_dict.keys()),group_key]
586
+ if len(no_reference_avaialble)>0:
587
+ log.write(" -Groups not in reference: {}".format( ",".join(no_reference_avaialble.unique())), verbose=verbose)
588
+ except:
589
+ pass
590
+
591
+ allsig["CIS/TRANS"] = allsig.apply(lambda x: determine_if_cis(x, group_key,windowsizekb, reference_dict), axis=1)
592
+
593
+ try:
594
+ allsig = allsig.where(~pd.isna(allsig), pd.NA)
595
+ except:
596
+ pass
597
+
598
+ try:
599
+ number_of_cis = sum(allsig["CIS/TRANS"] == "Cis")
600
+ number_of_trans = sum(allsig["CIS/TRANS"] == "Trans")
601
+ number_of_noreference = sum(allsig["CIS/TRANS"] == "NoReference")
602
+ log.write (" -Number of Cis variants: {}".format(number_of_cis),verbose=verbose)
603
+ log.write (" -Number of Trans variants: {}".format(number_of_trans),verbose=verbose)
604
+ log.write (" -Number of NoReference variants: {}".format(number_of_noreference),verbose=verbose)
605
+ except:
606
+ pass
607
+
608
+ finished(log,verbose,_end_line)
421
609
 
610
+ return allsig
611
+
612
+ ###################################################################################################################################################################################################
613
+
614
+
615
+ def determine_big_number(maxpos, big_number = 1000000000):
616
+ for i in range(7):
617
+ if maxpos*10 > big_number:
618
+ big_number = int(big_number * 10)
619
+ else:
620
+ break
621
+ return big_number
622
+
623
+ def add_tchr_pos(df, chrom, pos, big_number):
624
+ df["TCHR+POS"]=df[chrom]*big_number + df[pos]
625
+ return df
626
+
627
+ def fill_meta_info_for_known(allsig, knownsig):
628
+ if len(allsig)==0 or len(knownsig)==0: return allsig
422
629
  if "SNPID" in knownsig.columns:
423
630
  knownids=knownsig["SNPID"].values
424
631
  if "PUBMEDID" in knownsig.columns:
@@ -427,12 +634,7 @@ def getnovel(insumstats,
427
634
  knownauthor=knownsig["AUTHOR"].values
428
635
  if "EFOID" in knownsig.columns:
429
636
  knownefo=knownsig["EFOID"].values
430
-
431
- # get distance
432
- lambda x:np.min(np.abs(knownsig["TCHR+POS"]-x))
433
- allsig["DISTANCE_TO_KNOWN"] = allsig["TCHR+POS"].apply(lambda x:min(knownsig["TCHR+POS"]-x, key=abs))
434
-
435
- # get other info
637
+
436
638
  if "SNPID" in knownsig.columns:
437
639
  allsig["KNOWN_ID"] = allsig["TCHR+POS"].apply(lambda x:knownids[np.argmin(np.abs(knownsig["TCHR+POS"]-x))])
438
640
  if "PUBMEDID" in knownsig.columns:
@@ -440,20 +642,48 @@ def getnovel(insumstats,
440
642
  if "AUTHOR" in knownsig.columns:
441
643
  allsig["KNOWN_AUTHOR"] = allsig["TCHR+POS"].apply(lambda x:knownauthor[np.argmin(np.abs(knownsig["TCHR+POS"]-x))])
442
644
  if "EFOID" in knownsig.columns:
443
- allsig["KNOWN_EFOID"] = allsig["TCHR+POS"].apply(lambda x:knownefo[np.argmin(np.abs(knownsig["TCHR+POS"]-x))])
645
+ allsig["KNOWN_EFOID"] = allsig["TCHR+POS"].apply(lambda x:knownefo[np.argmin(np.abs(knownsig["TCHR+POS"]-x))])
646
+ return allsig
444
647
 
445
- # determine if novel
648
+ def determine_if_cis(x, group_key,windowsizekb, reference_dict):
649
+ if x[group_key] in reference_dict.keys():
650
+ is_same_chr = str(reference_dict[x[group_key]][0]) == str(x["CHR"])
651
+ is_large_than_start = int(reference_dict[x[group_key]][1]) - windowsizekb*1000 <= x["POS"]
652
+ is_smaller_than_end = int(reference_dict[x[group_key]][2]) + windowsizekb*1000 >= x["POS"]
653
+
654
+ if is_same_chr and is_large_than_start and is_smaller_than_end:
655
+ return "Cis"
656
+ else:
657
+ return "Trans"
658
+ else:
659
+ return "NoReference"
660
+
661
+ def determine_distance(allsig, knownsig):
662
+ if len(allsig)==0:
663
+ return allsig
664
+ if len(knownsig)==0:
665
+ allsig["DISTANCE_TO_KNOWN"] = pd.NA
666
+ return allsig
667
+ allsig["DISTANCE_TO_KNOWN"] = allsig["TCHR+POS"].apply(lambda x:min(knownsig["TCHR+POS"]-x, key=abs))
668
+ return allsig
669
+
670
+ def determine_novel(allsig, windowsizekb_for_novel):
671
+ if len(allsig)==0 or "DISTANCE_TO_KNOWN" not in allsig.columns:
672
+ return allsig
446
673
  allsig["NOVEL"] = allsig["DISTANCE_TO_KNOWN"].abs() > windowsizekb_for_novel*1000
447
-
448
- # determine location
449
- allsig["LOCATION_OF_KNOWN"]="Unknown"
674
+ allsig.loc[allsig["DISTANCE_TO_KNOWN"].isna(), "NOVEL"] = True
675
+ return allsig
676
+
677
+ def determine_location(allsig):
678
+ allsig["LOCATION_OF_KNOWN"]="NoReference"
450
679
  allsig.loc[ allsig["DISTANCE_TO_KNOWN"]== 0,"LOCATION_OF_KNOWN"] = "Same"
451
680
  allsig.loc[ allsig["DISTANCE_TO_KNOWN"] > 0 ,"LOCATION_OF_KNOWN"] = "Upstream"
452
681
  allsig.loc[ allsig["DISTANCE_TO_KNOWN"] < 0 ,"LOCATION_OF_KNOWN"] = "Downstream"
682
+ return allsig
453
683
 
454
- # if not on same chromosome, distance set to pd.NA
455
- if sum(allsig["DISTANCE_TO_KNOWN"].abs() > insumstats["POS"].max())>0:
456
- not_on_same_chromosome = allsig["DISTANCE_TO_KNOWN"].abs() > insumstats["POS"].max()
684
+ def determine_if_same_chromosome(allsig, knownsig, maxpos):
685
+ if sum(allsig["DISTANCE_TO_KNOWN"].abs() > maxpos)>0:
686
+ not_on_same_chromosome = allsig["DISTANCE_TO_KNOWN"].abs() > maxpos
457
687
  allsig.loc[ not_on_same_chromosome ,"DISTANCE_TO_KNOWN"] = pd.NA
458
688
  allsig.loc[ not_on_same_chromosome ,"LOCATION_OF_KNOWN"] = "NoneOnThisChr"
459
689
  if "SNPID" in knownsig.columns:
@@ -464,23 +694,135 @@ def getnovel(insumstats,
464
694
  allsig.loc[ not_on_same_chromosome ,"KNOWN_AUTHOR"] = pd.NA
465
695
  if "EFOID" in knownsig.columns:
466
696
  allsig.loc[ not_on_same_chromosome ,"KNOWN_EFOID"] = pd.NA
697
+ return allsig
467
698
 
468
- # drop helper column TCHR+POS
469
- allsig = allsig.drop(["TCHR+POS"], axis=1)
470
-
471
- if verbose: log.write(" -Identified ",len(allsig)-sum(allsig["NOVEL"])," known vairants in current sumstats...")
472
- if verbose: log.write(" -Identified ",sum(allsig["NOVEL"])," novel vairants in current sumstats...")
699
+ def _check_novel_set(insumstats,
700
+ id,
701
+ chrom,
702
+ pos,
703
+ p,
704
+ use_p=False,
705
+ known=False,
706
+ group_key=None,
707
+ snpset="SNPSET",
708
+ snpid="SNPID",
709
+ if_get_lead = False,
710
+ windowsizekb=500,
711
+ sig_level=5e-8,
712
+ log=Log(),
713
+ xymt=["X","Y","MT"],
714
+ anno=False,
715
+ build="19",
716
+ source="ensembl",
717
+ verbose=True):
473
718
 
474
- finished(log,verbose,_end_line)
719
+ ##start function with col checking##########################################################
720
+ _start_line = "check if variant sets are overlapping with those in reference file"
721
+ _end_line = "checking if variant sets are overlapping with those in reference file"
722
+ _start_cols = [chrom,pos, group_key]
723
+ _start_function = ".check_cis()"
724
+ _must_args ={}
725
+
726
+ is_enough_info = start_to(sumstats=insumstats,
727
+ log=log,
728
+ verbose=verbose,
729
+ start_line=_start_line,
730
+ end_line=_end_line,
731
+ start_cols=_start_cols,
732
+ start_function=_start_function,
733
+ **_must_args)
734
+ if is_enough_info == False: return None
735
+ ############################################################################################
475
736
 
476
- # how to return
477
- if only_novel is True:
478
- if output_known is True:
479
- return allsig.loc[allsig["NOVEL"],:], knownsig
480
- else:
481
- return allsig.loc[allsig["NOVEL"],:]
737
+ if if_get_lead == True:
738
+ allsig = getsig(insumstats=insumstats,
739
+ id=id,chrom=chrom,pos=pos,p=p,use_p=use_p,windowsizekb=windowsizekb,sig_level=sig_level,log=log,
740
+ xymt=xymt,anno=anno,build=build, source=source,verbose=verbose)
482
741
  else:
483
- if output_known is True:
484
- return allsig, knownsig
742
+ allsig = insumstats.copy()
743
+
744
+ ############################################################################################
745
+ knownsig = pd.DataFrame()
746
+ if type(known) is pd.DataFrame:
747
+ knownsig_2 = known.copy()
748
+ knownsig = pd.concat([knownsig, knownsig_2],ignore_index=True)
749
+ knownsig[snpid] = knownsig[snpid].astype("string")
750
+ knownsig[snpset] = knownsig[snpset].astype("string")
751
+ knownsig[group_key] = knownsig[group_key].astype("string")
752
+ elif type(known) is str:
753
+ knownsig_2 = pd.read_csv(known,sep="\s+",dtype={"CHR":"Int64","POS":"Int64"})
754
+ knownsig = pd.concat([knownsig, knownsig_2],ignore_index=True)
755
+ knownsig[snpid] = knownsig[snpid].astype("string")
756
+ knownsig[snpset] = knownsig[snpset].astype("string")
757
+ knownsig[group_key] = knownsig[group_key].astype("string")
758
+
759
+ if len(knownsig)<1:
760
+ raise ValueError("Please input a dataframe of gene list with GENE, CHR, START, END.")
761
+
762
+ if group_key is not None:
763
+ if group_key not in knownsig.columns:
764
+ raise ValueError("Please check if group_key is in both sumstats and list of known associations.")
765
+
766
+ ############################################################################################
767
+ if group_key is not None:
768
+ number_of_groups_allsig = allsig[group_key].nunique()
769
+ number_of_groups_known = knownsig[group_key].nunique()
770
+ log.write(" -Number of groups in sumstats:{}".format(number_of_groups_allsig), verbose=verbose)
771
+ log.write(" -Number of groups in reference:{}".format(number_of_groups_known), verbose=verbose)
772
+
773
+ log.write(" -Checking if variants in cis/trans regions grouped by {}...".format(group_key), verbose=verbose)
774
+
775
+ ############################################################################################
776
+ #convert to a dict
777
+ reference_dict = {}
778
+
779
+ for index,row in knownsig.iterrows():
780
+ if row[group_key] in reference_dict.keys():
781
+ if row[snpset] in reference_dict[row[group_key]].keys():
782
+ reference_dict[row[group_key]][row[snpset]].add(row[snpid])
783
+ else:
784
+ reference_dict[row[group_key]][row[snpset]] = set([row[snpid]])
485
785
  else:
486
- return allsig
786
+ reference_dict[row[group_key]] = {row[snpset]:set([row[snpid]])}
787
+ ############################################################################################
788
+
789
+ try:
790
+ no_reference_avaialble = allsig.loc[~allsig[group_key].isin(reference_dict.keys()),group_key]
791
+ if len(no_reference_avaialble)>0:
792
+ log.write(" -Groups not in reference: {}".format( ",".join(no_reference_avaialble)), verbose=verbose)
793
+ except:
794
+ pass
795
+
796
+ log.write(" -Checking if variants are in reference variant sets...", verbose=verbose)
797
+ known_list = allsig.apply(lambda x: check_overlap(x,snpid, group_key,reference_dict), axis=1)
798
+
799
+ allsig["KNOWN_SET"] = known_list.str[0]
800
+ allsig["KNOWN_VARIANT"] = known_list.str[1]
801
+
802
+ back_dict={}
803
+ for i in allsig[group_key].unique():
804
+ back_dict[i] ={}
805
+ for j in allsig.loc[allsig[group_key]==i,snpset].unique():
806
+ back_dict[i][j] =set()
807
+ for index, row in allsig.loc[(allsig[group_key]==i) & (allsig[snpset]==j) & (~allsig["KNOWN_SET"].isna()),:].iterrows():
808
+ back_dict[i][j].add("{}-{}-{}".format(row[group_key], row["KNOWN_SET"],row["KNOWN_VARIANT"]))
809
+
810
+ allsig["KNOWN_SET_VARIANT"] = allsig.apply(lambda x: assign_set_variant(x,group_key,snpset,back_dict), axis=1)
811
+
812
+ finished(log,verbose,_end_line)
813
+
814
+ return allsig
815
+
816
+ def check_overlap(x,snpid, group_key,reference_dict):
817
+ if x[group_key] in reference_dict.keys():
818
+ for key, value in reference_dict[x[group_key]].items():
819
+ if x[snpid] in value:
820
+ return key, x[snpid]
821
+ return pd.NA, pd.NA,
822
+
823
+ def assign_set_variant(x,group_key,snpset,back_dict):
824
+ if x[group_key] in back_dict.keys():
825
+ if x[snpset] in back_dict[x[group_key]].keys():
826
+ if len(back_dict[x[group_key]][x[snpset]]) >0:
827
+ return back_dict[x[group_key]][x[snpset]]
828
+ return pd.NA