gwaslab 3.4.48__py3-none-any.whl → 3.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

gwaslab/bd_common_data.py CHANGED
@@ -274,7 +274,9 @@ def get_gtf(chrom, build="19",source="ensembl"):
274
274
  gtf = pd.DataFrame(columns=["seqname","start","end","strand","feature","gene_biotype","gene_id","gene_name"])
275
275
  return gtf
276
276
 
277
-
277
def get_chain(from_build="19", to_build="38"):
    """Return the local path of the liftover chain file for a build pair.

    The download dictionary keys look like "19to38"; check_and_download
    presumably fetches the file when it is not cached locally — confirm.
    """
    return check_and_download("{}to{}".format(from_build, to_build))
278
280
  ####################################################################################################################
279
281
  def gtf_to_protein_coding(gtfpath,log=Log(),verbose=True):
280
282
  protein_coding_path = gtfpath[:-6]+"protein_coding.gtf.gz"
@@ -90,12 +90,20 @@
90
90
  "1kg_dbsnp151_hg38_x_md5":"48c05eeb1454c0dd4cbee3cb26382e8e",
91
91
  "recombination_hg19":"https://www.dropbox.com/s/wbesl8haxknonuc/recombination_hg19.tar.gz?dl=1",
92
92
  "recombination_hg38":"https://www.dropbox.com/s/vuo8mvqx0fpibzj/recombination_hg38.tar.gz?dl=1",
93
- "ensembl_hg19_gtf":"https://ftp.ensembl.org/pub/grch37/current/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.chr.gtf.gz",
93
+ "ensembl_hg19_gtf":"https://ftp.ensembl.org/pub/grch37/release-87/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.chr.gtf.gz",
94
94
  "ensembl_hg38_gtf":"https://ftp.ensembl.org/pub/release-109/gtf/homo_sapiens//Homo_sapiens.GRCh38.109.chr.gtf.gz",
95
95
  "refseq_hg19_gtf":"https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh37_latest/refseq_identifiers/GRCh37_latest_genomic.gtf.gz",
96
96
  "refseq_hg38_gtf":"https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz",
97
97
  "testlink":"https://www.dropbox.com/s/8u7capwge0ihshu/EAS.chr22.split_norm_af.1kgp3v5.vcf.gz?dl=1",
98
- "testlink_tbi":"https://www.dropbox.com/s/hdneg53t6u1j6ib/EAS.chr22.split_norm_af.1kgp3v5.vcf.gz.tbi?dl=1"
98
+ "testlink_tbi":"https://www.dropbox.com/s/hdneg53t6u1j6ib/EAS.chr22.split_norm_af.1kgp3v5.vcf.gz.tbi?dl=1",
99
+ "19to38":"https://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz",
100
+ "19to13":"https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/chain/v1_nflo/hg19-chm13v2.chain",
101
+ "38to19":"https://hgdownload.soe.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz",
102
+ "38to13":"https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/chain/v1_nflo/grch38-chm13v2.chain",
103
+ "13to19":"https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/chain/v1_nflo/chm13v2-hg19.chain",
104
+ "13to38":"https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/chain/v1_nflo/chm13v2-grch38.chain",
105
+ "18to19":"https://hgdownload.soe.ucsc.edu/goldenPath/hg18/liftOver/hg18ToHg19.over.chain.gz",
106
+ "18to38":"https://hgdownload.soe.ucsc.edu/goldenPath/hg18/liftOver/hg18ToHg38.over.chain.gz"
99
107
  }
100
108
 
101
109
 
gwaslab/g_Sumstats.py CHANGED
@@ -8,6 +8,8 @@ from gwaslab.io_preformat_input import preformat
8
8
  from gwaslab.io_to_formats import _to_format
9
9
  from gwaslab.g_Log import Log
10
10
  from gwaslab.qc_fix_sumstats import fixID
11
+ from gwaslab.qc_fix_sumstats import flipSNPID
12
+ from gwaslab.qc_fix_sumstats import stripSNPID
11
13
  from gwaslab.qc_fix_sumstats import removedup
12
14
  from gwaslab.qc_fix_sumstats import fixchr
13
15
  from gwaslab.qc_fix_sumstats import fixpos
@@ -79,6 +81,7 @@ from gwaslab.bd_get_hapmap3 import gethapmap3
79
81
  from gwaslab.util_abf_finemapping import abf_finemapping
80
82
  from gwaslab.util_abf_finemapping import make_cs
81
83
  import gc
84
+ from gwaslab.viz_plot_phe_heatmap import _gwheatmap
82
85
 
83
86
  #20220309
84
87
  class Sumstats():
@@ -123,6 +126,8 @@ class Sumstats():
123
126
  snpr2=None,
124
127
  status=None,
125
128
  other=[],
129
+ chrom_pat=None,
130
+ snpid_pat=None,
126
131
  usekeys=None,
127
132
  direction=None,
128
133
  verbose=True,
@@ -205,6 +210,8 @@ class Sumstats():
205
210
  status=status,
206
211
  other=other,
207
212
  usekeys=usekeys,
213
+ chrom_pat=chrom_pat,
214
+ snpid_pat=snpid_pat,
208
215
  verbose=verbose,
209
216
  readargs=readargs,
210
217
  log=self.log)
@@ -418,6 +425,10 @@ class Sumstats():
418
425
  #customizable API to build your own QC pipeline
419
426
  def fix_id(self,**kwargs):
420
427
  self.data = fixID(self.data,log=self.log,**kwargs)
428
+ def flip_snpid(self,**kwargs):
429
+ self.data = flipSNPID(self.data,log=self.log,**kwargs)
430
+ def strip_snpid(self,**kwargs):
431
+ self.data = stripSNPID(self.data,log=self.log,**kwargs)
421
432
  def fix_chr(self,**kwargs):
422
433
  self.data = fixchr(self.data,log=self.log,**kwargs)
423
434
  def fix_pos(self,**kwargs):
@@ -592,6 +603,11 @@ class Sumstats():
592
603
  def plot_daf(self, **kwargs):
593
604
  fig,outliers = plotdaf(self.data, **kwargs)
594
605
  return fig, outliers
606
+
607
+ def plot_gwheatmap(self, **kwargs):
608
+ fig = _gwheatmap(self.data, **kwargs)
609
+ return fig
610
+
595
611
  def plot_mqq(self, build=None, **kwargs):
596
612
 
597
613
  chrom="CHR"
@@ -695,7 +711,7 @@ class Sumstats():
695
711
  # return sumstats object
696
712
  return output
697
713
 
698
- def check_cis(self, **kwargs):
714
+ def check_cis(self, gls=False, **kwargs):
699
715
  if "SNPID" in self.data.columns:
700
716
  id_to_use = "SNPID"
701
717
  else:
@@ -707,7 +723,13 @@ class Sumstats():
707
723
  p="P",
708
724
  log=self.log,
709
725
  **kwargs)
710
- # return sumstats object
726
+
727
+ # return sumstats object
728
+ if gls == True:
729
+ new_Sumstats_object = copy.deepcopy(self)
730
+ new_Sumstats_object.data = output
731
+ gc.collect()
732
+ return new_Sumstats_object
711
733
  return output
712
734
 
713
735
  def check_novel_set(self, **kwargs):
@@ -1,6 +1,6 @@
1
1
  import pandas as pd
2
2
 
3
- CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
3
+ CATEGORIES = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
4
4
 
5
5
  def vchange_status(status,digit,before,after):
6
6
  dic={}
gwaslab/g_version.py CHANGED
@@ -15,8 +15,8 @@ def _get_version():
15
15
def gwaslab_info():
    """Return gwaslab release meta information (version and release date)."""
    return {
        "version": "3.5.0",
        "release_date": "20241029",
    }
22
22
 
@@ -21,6 +21,7 @@ from gwaslab.qc_check_datatype import check_dataframe_shape
21
21
  from gwaslab.bd_common_data import get_number_to_chr
22
22
  from gwaslab.bd_common_data import get_chr_list
23
23
  from gwaslab.bd_common_data import get_chr_to_number
24
+ from gwaslab.bd_common_data import get_number_to_NC
24
25
  from gwaslab.bd_common_data import _maketrans
25
26
  from gwaslab.g_vchange_status import vchange_status
26
27
  from gwaslab.g_version import _get_version
@@ -355,7 +356,7 @@ def oldcheckref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status=
355
356
 
356
357
  log.write("\n",end="",show_time=False,verbose=verbose)
357
358
 
358
- CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
359
+ CATEGORIES = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
359
360
  sumstats[status] = pd.Categorical(sumstats[status],categories=CATEGORIES)
360
361
  #sumstats[status] = sumstats[status].astype("string")
361
362
 
@@ -674,7 +675,7 @@ def checkref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="ST
674
675
  sumstats.loc[to_check_ref,status] = check_status(sumstats_to_check, all_records_dict, log=log, verbose=verbose)
675
676
  log.write(" -Finished checking records", verbose=verbose)
676
677
 
677
- CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
678
+ CATEGORIES = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
678
679
  sumstats[status] = pd.Categorical(sumstats[status],categories=CATEGORIES)
679
680
  #sumstats[status] = sumstats[status].astype("string")
680
681
 
@@ -1496,17 +1497,21 @@ def infer_af(chr,start,end,ref,alt,vcf_reader,alt_freq,chr_dict=None):
1496
1497
  def auto_check_vcf_chr_dict(vcf_path, vcf_chr_dict, verbose, log):
1497
1498
  if vcf_path is not None:
1498
1499
  if vcf_chr_dict is None:
1499
- log.write(" -Checking prefix for chromosomes in vcf files..." ,verbose=verbose)
1500
- prefix = check_vcf_chr_prefix(vcf_path)
1500
+ log.write(" -Checking chromosome notations in VCF/BCF files..." ,verbose=verbose)
1501
+ vcf_chr_dict = check_vcf_chr_NC(vcf_path, log, verbose)
1502
+ if vcf_chr_dict is not None:
1503
+ return vcf_chr_dict
1504
+ log.write(" -Checking prefix for chromosomes in VCF/BCF files..." ,verbose=verbose)
1505
+ prefix = check_vcf_chr_prefix(vcf_path, log,verbose)
1501
1506
  if prefix is not None:
1502
1507
  log.write(" -Prefix for chromosomes: ",prefix)
1503
1508
  vcf_chr_dict = get_number_to_chr(prefix=prefix)
1504
1509
  else:
1505
- log.write(" -No prefix for chromosomes in the VCF files." ,verbose=verbose)
1510
+ log.write(" -No prefix for chromosomes in the VCF/BCF files." ,verbose=verbose)
1506
1511
  vcf_chr_dict = get_number_to_chr()
1507
1512
  return vcf_chr_dict
1508
1513
 
1509
- def check_vcf_chr_prefix(vcf_bcf_path):
1514
+ def check_vcf_chr_prefix(vcf_bcf_path,log,verbose):
1510
1515
  vcf_bcf = VariantFile(vcf_bcf_path)
1511
1516
  for i in list(vcf_bcf.header.contigs):
1512
1517
  m = re.search('(chr|Chr|CHR)([0-9xXyYmM]+)', i)
@@ -1514,5 +1519,16 @@ def check_vcf_chr_prefix(vcf_bcf_path):
1514
1519
  return m.group(1)
1515
1520
  else:
1516
1521
  return None
1517
-
1522
+
1523
def check_vcf_chr_NC(vcf_bcf_path, log, verbose):
    """Detect RefSeq-style (NC_*) contig names in a VCF/BCF header.

    Scans every contig declared in the header; when one matches the hg19 or
    hg38 RefSeq accession set, returns the corresponding number->NC mapping
    from get_number_to_NC. Returns None when no RefSeq-style contig is found.

    Fix: the original had `else: return None` inside the loop, so only the
    FIRST contig was ever inspected; now all contigs are scanned.
    """
    vcf_bcf = VariantFile(vcf_bcf_path)
    # build the accession sets once instead of per contig
    nc_hg19 = set(get_number_to_NC(build="19").values())
    nc_hg38 = set(get_number_to_NC(build="38").values())
    for contig in vcf_bcf.header.contigs:
        if contig in nc_hg19:
            log.write(" -RefSeq ID detected (hg19) in VCF/BCF...",verbose=verbose)
            return get_number_to_NC(build="19")
        if contig in nc_hg38:
            log.write(" -RefSeq ID detected (hg38) in VCF/BCF...",verbose=verbose)
            return get_number_to_NC(build="38")
    return None
1518
1534
 
@@ -56,6 +56,8 @@ def preformat(sumstats,
56
56
  build=None,
57
57
  other=[],
58
58
  usekeys=None,
59
+ chrom_pat=None,
60
+ snpid_pat=None,
59
61
  verbose=False,
60
62
  readargs=None,
61
63
  log=None):
@@ -84,7 +86,10 @@ def preformat(sumstats,
84
86
  if "format_separator" in meta_data.keys():
85
87
  if "sep" not in readargs.keys():
86
88
  readargs["sep"] = meta_data["format_separator"]
87
-
89
+ else:
90
+ if readargs["sep"] != meta_data["format_separator"]:
91
+ log.write(' - format_separator will be changed to: "{}"'.format(readargs["sep"]),verbose=verbose)
92
+
88
93
  if "format_na" in meta_data.keys():
89
94
  readargs["na_values"] = meta_data["format_na"]
90
95
 
@@ -92,7 +97,7 @@ def preformat(sumstats,
92
97
  readargs["comment"] = meta_data["format_comment"]
93
98
 
94
99
  if "sep" not in readargs.keys():
95
- readargs["sep"] = "\t"
100
+ readargs["sep"] = "\t"
96
101
 
97
102
  #########################################################################################################################################################
98
103
 
@@ -323,10 +328,30 @@ def preformat(sumstats,
323
328
  skip_rows = get_skip_rows(inpath)
324
329
  readargs["skiprows"] = skip_rows
325
330
  log.write("Start to initialize gl.Sumstats from file :" + inpath,verbose=verbose)
326
- sumstats = pd.read_table(inpath,
327
- usecols=set(usecols),
328
- dtype=dtype_dictionary,
329
- **readargs)
331
+ if chrom_pat is not None:
332
+ sumstats = _load_single_chr(inpath,
333
+ usecols,
334
+ dtype_dictionary,
335
+ readargs=readargs,
336
+ rename_dictionary=rename_dictionary,
337
+ chrom_pat=chrom_pat,
338
+ log=log,
339
+ verbose=verbose)
340
+ elif snpid_pat is not None:
341
+
342
+ sumstats = _load_variants_with_pattern(inpath,
343
+ usecols,
344
+ dtype_dictionary,
345
+ readargs=readargs,
346
+ rename_dictionary=rename_dictionary,
347
+ snpid_pat=snpid_pat,
348
+ log=log,
349
+ verbose=verbose)
350
+ else:
351
+ sumstats = pd.read_table(inpath,
352
+ usecols=set(usecols),
353
+ dtype=dtype_dictionary,
354
+ **readargs)
330
355
 
331
356
  elif type(sumstats) is pd.DataFrame:
332
357
  ## loading data from dataframe
@@ -520,9 +545,49 @@ def process_status(sumstats,build,log,verbose):
520
545
  #sumstats["STATUS"] = int(build)*(10**5) +99999
521
546
  build = _process_build(build,log,verbose)
522
547
  sumstats["STATUS"] = build +"99999"
523
- categories = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
548
+ categories = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
524
549
  sumstats["STATUS"] = pd.Categorical(sumstats["STATUS"],categories=categories)
525
550
  return sumstats
526
551
 
527
552
 
528
-
553
def _load_single_chr(inpath,usecols,dtype_dictionary,readargs,rename_dictionary,chrom_pat,log,verbose):
    """Load only the rows whose chromosome column matches chrom_pat, chunk-wise.

    Reads `inpath` in 500k-row chunks to keep memory bounded and keeps rows
    whose raw chromosome column matches the (case-insensitive) regex pattern.

    Raises:
        ValueError: when no column in `usecols` maps to "CHR" (previously an
        unbound-variable NameError).
    """
    sumstats_iter = pd.read_table(inpath,
                                  usecols=set(usecols),
                                  dtype=dtype_dictionary,
                                  iterator=True,
                                  chunksize=500000,
                                  **readargs)
    # locate the raw header that the format dictionary maps to "CHR"
    chunk_chrom = None
    for k, v in rename_dictionary.items():
        if v == "CHR" and k in usecols:
            log.write(" -Columns used to filter variants: {}".format(k),verbose=verbose)
            chunk_chrom = k
            break
    if chunk_chrom is None:
        raise ValueError("No chromosome column was detected in the input file; chrom_pat cannot be applied.")

    log.write(" -Loading only variants on chromosome with pattern : {} ...".format(chrom_pat),verbose=verbose)
    sumstats_filtered = pd.concat([chunk[chunk[chunk_chrom].str.match(chrom_pat, case=False,na=False) ] for chunk in sumstats_iter])
    log.write(" -Loaded {} variants on chromosome with pattern :{} ...".format(len(sumstats_filtered), chrom_pat),verbose=verbose)
    return sumstats_filtered
573
+
574
def _load_variants_with_pattern(inpath,usecols,dtype_dictionary,readargs,rename_dictionary,snpid_pat,log,verbose):
    """Load only the rows whose SNPID column matches snpid_pat, chunk-wise.

    Reads `inpath` in 500k-row chunks to keep memory bounded and keeps rows
    whose raw SNPID column matches the (case-insensitive) regex pattern.

    Raises:
        ValueError: when no column in `usecols` maps to "SNPID" (previously an
        unbound-variable NameError).
    """
    sumstats_iter = pd.read_table(inpath,
                                  usecols=set(usecols),
                                  dtype=dtype_dictionary,
                                  iterator=True,
                                  chunksize=500000,
                                  **readargs)
    # locate the raw header that the format dictionary maps to "SNPID"
    chunk_snpid = None
    for k, v in rename_dictionary.items():
        if v == "SNPID" and k in usecols:
            log.write(" -Columns used to filter variants: {}".format(k),verbose=verbose)
            chunk_snpid = k
            break
    if chunk_snpid is None:
        raise ValueError("No SNPID column was detected in the input file; snpid_pat cannot be applied.")

    log.write(" -Loading only variants with pattern : {} ...".format(snpid_pat),verbose=verbose)
    sumstats_filtered = pd.concat([chunk[chunk[chunk_snpid].str.match(snpid_pat, case=False,na=False) ] for chunk in sumstats_iter])
    log.write(" -Loaded {} variants with pattern : {} ...".format(len(sumstats_filtered), snpid_pat),verbose=verbose)
    return sumstats_filtered
gwaslab/io_to_formats.py CHANGED
@@ -342,7 +342,7 @@ def tofmt(sumstats,
342
342
  meta_data,rename_dictionary = get_format_dict(fmt,inverse=True)
343
343
  print_format_info(fmt=fmt, meta_data=meta_data,rename_dictionary=rename_dictionary,verbose=verbose, log=log, output=True)
344
344
 
345
- ymal_path = path + "."+suffix+".tsv-meta.ymal"
345
+ yaml_path = path + "."+suffix+".tsv-meta.yaml"
346
346
  path = path + "."+suffix+".tsv.gz"
347
347
  log.write(" -Output path:",path, verbose=verbose)
348
348
 
@@ -361,7 +361,7 @@ def tofmt(sumstats,
361
361
  md5_value = calculate_md5sum_file(path)
362
362
 
363
363
  ## update ssf-style meta data and export to yaml file
364
- _configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value, ymal_path, log, verbose)
364
+ _configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value, yaml_path, log, verbose)
365
365
 
366
366
  return sumstats
367
367
 
@@ -476,7 +476,7 @@ def _configure_output_cols_and_args(sumstats, rename_dictionary, cols, no_status
476
476
  return sumstats, to_csvargs
477
477
 
478
478
 
479
- def _configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value, ymal_path, log, verbose):
479
+ def _configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value, yaml_path, log, verbose):
480
480
  ### calculate meta data
481
481
  if "EAF" in sumstats.columns:
482
482
  min_maf = sumstats["EAF"].min()
@@ -506,8 +506,8 @@ def _configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value
506
506
  sumstats_meta_copy["gwaslab"]["samples"]["sample_size_min"] = n_min
507
507
  sumstats_meta_copy["gwaslab"]["samples"]["sample_size_median"] = n_median
508
508
  sumstats_meta_copy["gwaslab"]["variants"]["variant_number"] = len(sumstats)
509
- log.write(" -Exporting SSF-style meta data to {}".format(ymal_path),verbose=verbose)
510
- with open(ymal_path, 'w') as outfile:
509
+ log.write(" -Exporting SSF-style meta data to {}".format(yaml_path),verbose=verbose)
510
+ with open(yaml_path, 'w') as outfile:
511
511
  yaml.dump(sumstats_meta_copy, outfile)
512
512
 
513
513
 
@@ -5,6 +5,7 @@ import numpy as np
5
5
  from itertools import repeat
6
6
  from multiprocessing import Pool
7
7
  from liftover import get_lifter
8
+ from liftover import ChainFile
8
9
  from functools import partial
9
10
  from gwaslab.g_vchange_status import vchange_status
10
11
  from gwaslab.g_vchange_status import status_match
@@ -19,6 +20,7 @@ from gwaslab.g_version import _get_version
19
20
  from gwaslab.util_in_fill_data import _convert_betase_to_mlog10p
20
21
  from gwaslab.util_in_fill_data import _convert_betase_to_p
21
22
  from gwaslab.util_in_fill_data import _convert_mlog10p_to_p
23
+ from gwaslab.bd_common_data import get_chain
22
24
  #process build
23
25
  #setbuild
24
26
  #fixID
@@ -43,9 +45,15 @@ def _process_build(build,log,verbose):
43
45
  if str(build).lower() in ["hg19","19","37","b37","grch37"]:
44
46
  log.write(" -Genomic coordinates are based on GRCh37/hg19...", verbose=verbose)
45
47
  final_build = "19"
48
+ elif str(build).lower() in ["hg18","18","36","b36","grch36"]:
49
+ log.write(" -Genomic coordinates are based on GRCh36/hg18...", verbose=verbose)
50
+ final_build = "18"
46
51
  elif str(build).lower() in ["hg38","38","b38","grch38"]:
47
52
  log.write(" -Genomic coordinates are based on GRCh38/hg38...", verbose=verbose)
48
53
  final_build = "38"
54
+ elif str(build).lower() in ["t2t","hs1","chm13","13"]:
55
+ log.write(" -Genomic coordinates are based on T2T-CHM13...", verbose=verbose)
56
+ final_build = "13"
49
57
  else:
50
58
  log.warning("Version of genomic coordinates is unknown...", verbose=verbose)
51
59
  final_build = "99"
@@ -358,6 +366,76 @@ def fixID(sumstats,
358
366
 
359
367
  ""
360
368
 
369
def stripSNPID(sumstats,snpid="SNPID",overwrite=False,verbose=True,log=Log()):
    '''
    Strip SNPID down to CHR:POS:Allele:Allele by removing extra leading or
    trailing fields (e.g. xxx:CHR:POS:EA:NEA:xxx -> CHR:POS:EA:NEA) and any
    "chr" prefix. Accepted separators: "-", ":", "_".
    (The previous docstring incorrectly described allele flipping.)
    '''
    ##start function with col checking##########################################################
    _start_line = "strip SNPID"
    _end_line = "stripping SNPID"
    _start_cols =["SNPID"]
    _start_function = ".strip_snpid()"
    _must_args ={}

    is_enough_info = start_to(sumstats=sumstats,
                              log=log,
                              verbose=verbose,
                              start_line=_start_line,
                              end_line=_end_line,
                              start_cols=_start_cols,
                              start_function=_start_function,
                              **_must_args)
    if is_enough_info == False: return sumstats
    log.write(" -Checking if SNPID is (xxx:)CHR:POS:ATCG_Allele:ATCG_Allele(:xxx)...(separator: - ,: , _)",verbose=verbose)
    # rows whose SNPID embeds a CHR:POS:allele:allele core (NA-safe: na=False)
    is_chrposrefalt = sumstats[snpid].str.contains(r'[:_-]?\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+[:_-]?', case=False, flags=0, na=False)

    log.write(" -Stripping {} non-NA fixable SNPIDs...".format(sum(is_chrposrefalt)),verbose=verbose)

    # keep only the CHR:POS:allele:allele core (capture group 1);
    # group 0 is an optional "chr" prefix that is dropped
    sumstats.loc[is_chrposrefalt,snpid] = \
        sumstats.loc[is_chrposrefalt,snpid].str.extract(r'[:_-]?(chr)?(\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+)[:_-]?',flags=re.IGNORECASE|re.ASCII)[1].astype("string")

    finished(log,verbose,_end_line)
    return sumstats
402
+
403
def flipSNPID(sumstats,snpid="SNPID",overwrite=False,verbose=True,log=Log()):
    '''
    Flip EA and NEA inside SNPID: CHR:POS:EA:NEA -> CHR:POS:NEA:EA.
    Only the SNPID string is rewritten; EA, NEA, STATUS and all statistics
    columns are left untouched. Accepted separators: "-", ":", "_".
    '''
    ##start function with col checking##########################################################
    _start_line = "flip SNPID"
    _end_line = "flipping SNPID"
    _start_cols =["SNPID"]
    _start_function = ".flip_snpid()"
    _must_args ={}

    is_enough_info = start_to(sumstats=sumstats,
                              log=log,
                              verbose=verbose,
                              start_line=_start_line,
                              end_line=_end_line,
                              start_cols=_start_cols,
                              start_function=_start_function,
                              **_must_args)
    if is_enough_info == False: return sumstats
    log.warning("This function only flips alleles in SNPID without changing EA, NEA, STATUS or any statistics.")
    log.write(" -Checking if SNPID is CHR:POS:ATCG_Allele:ATCG_Allele...(separator: - ,: , _)",verbose=verbose)
    is_chrposrefalt = sumstats[snpid].str.match(r'^\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+$', case=False, flags=0, na=False)

    log.write(" -Flipping {} non-NA fixable SNPIDs...".format(sum(is_chrposrefalt)),verbose=verbose)

    # single extraction instead of three redundant passes; capture groups:
    #   0: optional "chr" prefix (dropped), 1: CHR<sep>POS (original separator
    #   preserved, as before), 2: EA, 3: NEA
    extracted = sumstats.loc[is_chrposrefalt,snpid].str.extract(
        r'^(chr)?(\w+[:_-]\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',
        flags=re.IGNORECASE|re.ASCII)
    sumstats.loc[is_chrposrefalt,snpid] = \
        (extracted[1] + ":" + extracted[3] + ":" + extracted[2]).astype("string")

    finished(log,verbose,_end_line)
    return sumstats
361
439
 
362
440
  ###############################################################################################################
363
441
  # 20230128
@@ -1041,7 +1119,7 @@ def check_range(sumstats, var_range, header, coltocheck, cols_to_check, log, ver
1041
1119
  cols_to_check.append(header)
1042
1120
  if header=="STATUS":
1043
1121
  log.write(" -Checking STATUS and converting STATUS to categories....", verbose=verbose)
1044
- categories = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
1122
+ categories = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
1045
1123
  sumstats[header] = pd.Categorical(sumstats[header],categories=categories)
1046
1124
  return sumstats
1047
1125
 
@@ -1496,11 +1574,19 @@ def liftover_variant(sumstats,
1496
1574
  pos="POS",
1497
1575
  status="STATUS",
1498
1576
  from_build="19",
1499
- to_build="38"):
1577
+ to_build="38",
1578
+ chain=None):
1579
+
1500
1580
  try:
1501
- converter = get_lifter("hg"+from_build,"hg"+to_build,one_based=True)
1581
+ if chain is None:
1582
+ converter = get_lifter(from_build,to_build,one_based=True)
1583
+ else:
1584
+ converter = ChainFile(chain, one_based=True)
1502
1585
  except:
1503
- converter = get_lifter("hg"+from_build,"hg"+to_build)
1586
+ if chain is None:
1587
+ converter = get_lifter(from_build,to_build)
1588
+ else:
1589
+ converter = ChainFile(chain)
1504
1590
 
1505
1591
  dic= get_number_to_chr(in_chr=False,xymt=["X","Y","M"])
1506
1592
  dic2= get_chr_to_number(out_chr=False)
@@ -1513,7 +1599,7 @@ def liftover_variant(sumstats,
1513
1599
  sumstats.loc[variants_on_chrom_to_convert,chrom] = lifted.str[0].map(dic2).astype("Int64")
1514
1600
  return sumstats
1515
1601
 
1516
- def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_build="19", to_build="38",status="STATUS",remove=True, verbose=True,log=Log()):
1602
+ def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_build="19", to_build="38",status="STATUS",remove=True,chain=None, verbose=True,log=Log()):
1517
1603
  ##start function with col checking##########################################################
1518
1604
  _start_line = "perform liftover"
1519
1605
  _end_line = "liftover"
@@ -1532,8 +1618,21 @@ def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_b
1532
1618
  **_must_args)
1533
1619
  if is_enough_info == False: return sumstats
1534
1620
  ############################################################################################
1621
+
1622
+ lifter_from_build = _process_build(from_build,log=log,verbose=False)
1623
+ lifter_to_build = _process_build(to_build,log=log,verbose=False)
1535
1624
 
1536
- log.write(" -Creating converter : hg" + from_build +" to hg"+ to_build, verbose=verbose)
1625
+ if chain is not None:
1626
+ log.write(" -Creating converter using ChainFile: {}".format(chain), verbose=verbose)
1627
+ else:
1628
+ try:
1629
+ chain = get_chain(from_build=from_build, to_build=to_build)
1630
+ log.write(" -Creating converter using ChainFile: {}".format(chain), verbose=verbose)
1631
+ except:
1632
+ chain = None
1633
+ lifter_from_build=from_build
1634
+ lifter_to_build=to_build
1635
+ log.write(" -Creating converter : {} -> {}".format(lifter_from_build, lifter_to_build), verbose=verbose)
1537
1636
  # valid chr and pos
1538
1637
  pattern = r"\w\w\w0\w\w\w"
1539
1638
  to_lift = sumstats[status].str.match(pattern)
@@ -1549,7 +1648,7 @@ def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_b
1549
1648
  pool = Pool(n_cores)
1550
1649
  #df = pd.concat(pool.starmap(func, df_split))
1551
1650
  func=liftover_variant
1552
- sumstats[[chrom,pos,status]] = pd.concat(pool.map(partial(func,chrom=chrom,pos=pos,from_build=from_build,to_build=to_build,status=status),df_split))
1651
+ sumstats[[chrom,pos,status]] = pd.concat(pool.map(partial(func,chrom=chrom,pos=pos,from_build=from_build,to_build=to_build,status=status,chain=chain),df_split))
1553
1652
  pool.close()
1554
1653
  pool.join()
1555
1654
  ############################################################################
@@ -184,7 +184,8 @@ def fill_mlog10p(sumstats,log,verbose=True,filled_count=0):
184
184
  else:
185
185
  return 0,filled_count
186
186
  return 1,filled_count
187
- def fill_extreme_mlog10p(sumstats,log,verbose=True,filled_count=0):
187
+
188
+ def fill_extreme_mlog10p(sumstats,df,log,verbose=True,filled_count=0):
188
189
  # ref: https://stackoverflow.com/questions/46416027/how-to-compute-p-values-from-z-scores-in-r-when-the-z-score-is-large-pvalue-muc/46416222#46416222
189
190
  if "Z" in sumstats.columns:
190
191
  # P -> MLOG10P
@@ -198,6 +199,10 @@ def fill_extreme_mlog10p(sumstats,log,verbose=True,filled_count=0):
198
199
  log.write(" - Filling MLOG10P using Z column...", verbose=verbose)
199
200
  sumstats = fill_extreme_mlog10(sumstats, "Z")
200
201
  filled_count +=1
202
+ elif "CHISQ" in sumstats.columns and "DOF" in sumstats.columns:
203
+ log.write(" - Filling MLOG10P using CHISQ and DOF column...", verbose=verbose)
204
+ sumstats = fill_extreme_mlog10_chisq(sumstats, "CHISQ", df)
205
+ filled_count +=1
201
206
  else:
202
207
  return 0,filled_count
203
208
  return 1,filled_count
@@ -223,6 +228,19 @@ def fill_extreme_mlog10(sumstats, z):
223
228
  sumstats["P_EXPONENT"]= exponent
224
229
  return sumstats
225
230
 
231
def fill_extreme_mlog10_chisq(sumstats, chisq, df):
    """Fill MLOG10P (plus mantissa/exponent of P) from a chi-square statistic.

    Uses the log survival function so that P values far below float underflow
    remain representable as -log10(P) with a mantissa/exponent decomposition.
    ref: https://stackoverflow.com/a/46416222/199475
    """
    # log10(P) is negative; chi2.logsf avoids underflow for extreme statistics
    log10_pvalue = ss.chi2.logsf(sumstats[chisq], sumstats[df]) / np.log(10)
    sumstats["MLOG10P"] = -log10_pvalue
    # P == P_MANTISSA * 10**P_EXPONENT (Python % keeps the fraction in [0, 1))
    sumstats["P_MANTISSA"] = 10 ** (log10_pvalue % 1)
    sumstats["P_EXPONENT"] = log10_pvalue // 1
    return sumstats
243
+
226
244
  ####################################################################################################################
227
245
  def fill_iteratively(sumstats,raw_to_fill,log,only_sig,df,extreme,verbose,sig_level):
228
246
  to_fill = raw_to_fill.copy()
@@ -260,7 +278,7 @@ def fill_iteratively(sumstats,raw_to_fill,log,only_sig,df,extreme,verbose,sig_le
260
278
  # p to -log10(P) ###############################################################################################
261
279
  if "MLOG10P" in to_fill:
262
280
  if extreme==True:
263
- status,filled_count = fill_extreme_mlog10p(sumstats,log,verbose=verbose,filled_count=filled_count)
281
+ status,filled_count = fill_extreme_mlog10p(sumstats,df, log,verbose=verbose,filled_count=filled_count)
264
282
  filled_count +=1
265
283
  elif "P" not in sumstats.columns:
266
284
  fill_p(sumstats,log,verbose=verbose)
@@ -619,8 +619,10 @@ def _check_cis(insumstats,
619
619
  except:
620
620
  pass
621
621
 
622
- allsig["CIS/TRANS"] = allsig.apply(lambda x: determine_if_cis(x, group_key,windowsizekb, reference_dict), axis=1)
623
-
622
+ #allsig["CIS/TRANS"] = allsig.apply(lambda x: determine_if_cis(x, group_key,windowsizekb, reference_dict), axis=1)
623
+ cis_tuples = allsig.apply(lambda x: determine_if_cis2(x, group_key,windowsizekb, reference_dict), axis=1)
624
+ allsig[["CIS/TRANS","REF_CHR","REF_START","REF_END"]] = pd.DataFrame(cis_tuples.tolist(), index=allsig.index)
625
+
624
626
  try:
625
627
  allsig = allsig.where(~pd.isna(allsig), pd.NA)
626
628
  except:
@@ -689,6 +691,20 @@ def determine_if_cis(x, group_key,windowsizekb, reference_dict):
689
691
  else:
690
692
  return "NoReference"
691
693
 
694
def determine_if_cis2(x, group_key, windowsizekb, reference_dict):
    """Classify one significant variant as Cis/Trans against its reference region.

    Returns a 4-tuple (label, ref_chr, ref_start, ref_end). The variant is
    "Cis" when it lies on the reference chromosome within windowsizekb kb of
    the reference interval, otherwise "Trans". When the group key has no entry
    in reference_dict, label is "NoReference" and the coordinates are pd.NA.
    """
    if x[group_key] not in reference_dict:
        return "NoReference", pd.NA, pd.NA, pd.NA
    ref = reference_dict[x[group_key]]
    ref_chr, ref_start, ref_end = int(ref[0]), int(ref[1]), int(ref[2])
    window_bp = windowsizekb * 1000
    on_same_chr = str(ref[0]) == str(x["CHR"])
    within_window = (ref_start - window_bp <= x["POS"]) and (x["POS"] <= ref_end + window_bp)
    label = "Cis" if (on_same_chr and within_window) else "Trans"
    return label, ref_chr, ref_start, ref_end
706
+
707
+
692
708
  def determine_distance(allsig, knownsig):
693
709
  if len(allsig)==0:
694
710
  return allsig