gwaslab 3.4.43__tar.gz → 3.4.44__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- {gwaslab-3.4.43/src/gwaslab.egg-info → gwaslab-3.4.44}/PKG-INFO +3 -3
- {gwaslab-3.4.43 → gwaslab-3.4.44}/README.md +2 -2
- {gwaslab-3.4.43 → gwaslab-3.4.44}/pyproject.toml +1 -1
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/g_Sumstats.py +4 -2
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/g_version.py +2 -2
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/hm_harmonize_sumstats.py +34 -13
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/qc_fix_sumstats.py +132 -26
- {gwaslab-3.4.43 → gwaslab-3.4.44/src/gwaslab.egg-info}/PKG-INFO +3 -3
- {gwaslab-3.4.43 → gwaslab-3.4.44}/LICENSE +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/setup.cfg +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/__init__.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/bd_common_data.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/bd_config.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/bd_download.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/bd_get_hapmap3.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/cache_manager.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/data/chrx_par/chrx_par_hg19.bed.gz +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/data/chrx_par/chrx_par_hg38.bed.gz +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/data/formatbook.json +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/data/hapmap3_SNPs/hapmap3_db150_hg19.snplist.gz +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/data/hapmap3_SNPs/hapmap3_db151_hg38.snplist.gz +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/data/high_ld/high_ld_hla_hg19.bed.gz +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/data/high_ld/high_ld_hla_hg38.bed.gz +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/data/reference.json +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/g_Log.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/g_Phenotypes.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/g_SumstatsPair.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/g_SumstatsT.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/g_Sumstats_summary.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/g_meta.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/g_vchange_status.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/hm_casting.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/hm_rsid_to_chrpos.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/io_preformat_input.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/io_read_ldsc.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/io_read_tabular.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/io_to_formats.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/io_to_pickle.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/ldsc_irwls.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/ldsc_jackknife.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/ldsc_ldscore.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/ldsc_parse.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/ldsc_regressions.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/ldsc_sumstats.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/qc_check_datatype.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/run_script.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/util_ex_calculate_ldmatrix.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/util_ex_calculate_prs.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/util_ex_gwascatalog.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/util_ex_ldproxyfinder.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/util_ex_ldsc.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/util_ex_plink_filter.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/util_ex_process_h5.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/util_ex_process_ref.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/util_ex_run_2samplemr.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/util_ex_run_clumping.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/util_ex_run_coloc.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/util_ex_run_susie.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/util_in_calculate_gc.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/util_in_calculate_power.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/util_in_convert_h2.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/util_in_correct_winnerscurse.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/util_in_fill_data.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/util_in_filter_value.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/util_in_get_density.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/util_in_get_sig.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/viz_aux_annotate_plot.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/viz_aux_quickfix.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/viz_aux_reposition_text.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/viz_aux_save_figure.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/viz_plot_compare_af.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/viz_plot_compare_effect.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/viz_plot_forestplot.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/viz_plot_miamiplot.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/viz_plot_miamiplot2.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/viz_plot_mqqplot.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/viz_plot_qqplot.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/viz_plot_regionalplot.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/viz_plot_rg_heatmap.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/viz_plot_stackedregional.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/viz_plot_trumpetplot.py +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab.egg-info/SOURCES.txt +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab.egg-info/dependency_links.txt +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab.egg-info/requires.txt +0 -0
- {gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: gwaslab
|
|
3
|
-
Version: 3.4.43
|
|
3
|
+
Version: 3.4.44
|
|
4
4
|
Summary: A collection of handy tools for GWAS SumStats
|
|
5
5
|
Author-email: Yunye <yunye@gwaslab.com>
|
|
6
6
|
Project-URL: Homepage, https://cloufield.github.io/gwaslab/
|
|
@@ -51,7 +51,7 @@ Warning: Known issues of GWASLab are summarized in [https://cloufield.github.io/
|
|
|
51
51
|
### install via pip
|
|
52
52
|
|
|
53
53
|
```
|
|
54
|
-
pip install gwaslab==3.4.
|
|
54
|
+
pip install gwaslab==3.4.43
|
|
55
55
|
```
|
|
56
56
|
|
|
57
57
|
```python
|
|
@@ -90,7 +90,7 @@ Create a Python 3.9 environment and install gwaslab using pip:
|
|
|
90
90
|
```
|
|
91
91
|
conda env create -n gwaslab_test -c conda-forge python=3.9
|
|
92
92
|
conda activate gwaslab
|
|
93
|
-
pip install gwaslab==3.4.
|
|
93
|
+
pip install gwaslab==3.4.43
|
|
94
94
|
```
|
|
95
95
|
|
|
96
96
|
or create a new environment using yml file [environment_3.4.40.yml](https://github.com/Cloufield/gwaslab/blob/main/environment_3.4.40.yml)
|
|
@@ -23,7 +23,7 @@ Warning: Known issues of GWASLab are summarized in [https://cloufield.github.io/
|
|
|
23
23
|
### install via pip
|
|
24
24
|
|
|
25
25
|
```
|
|
26
|
-
pip install gwaslab==3.4.
|
|
26
|
+
pip install gwaslab==3.4.43
|
|
27
27
|
```
|
|
28
28
|
|
|
29
29
|
```python
|
|
@@ -62,7 +62,7 @@ Create a Python 3.9 environment and install gwaslab using pip:
|
|
|
62
62
|
```
|
|
63
63
|
conda env create -n gwaslab_test -c conda-forge python=3.9
|
|
64
64
|
conda activate gwaslab
|
|
65
|
-
pip install gwaslab==3.4.
|
|
65
|
+
pip install gwaslab==3.4.43
|
|
66
66
|
```
|
|
67
67
|
|
|
68
68
|
or create a new environment using yml file [environment_3.4.40.yml](https://github.com/Cloufield/gwaslab/blob/main/environment_3.4.40.yml)
|
|
@@ -356,8 +356,10 @@ class Sumstats():
|
|
|
356
356
|
if ref_seq is not None:
|
|
357
357
|
if ref_seq_mode=="v":
|
|
358
358
|
self.data = checkref(self.data,ref_seq,log=self.log,**checkref_args)
|
|
359
|
-
|
|
359
|
+
elif ref_seq_mode=="s":
|
|
360
360
|
self.data = oldcheckref(self.data,ref_seq,log=self.log,**checkref_args)
|
|
361
|
+
else:
|
|
362
|
+
raise ValueError("ref_seq_mode should be 'v' (vectorized, faster) or 's' (sequential, slower)")
|
|
361
363
|
|
|
362
364
|
self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
|
|
363
365
|
|
|
@@ -429,7 +431,7 @@ class Sumstats():
|
|
|
429
431
|
if ref_seq_mode=="v":
|
|
430
432
|
self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
|
|
431
433
|
self.data = checkref(self.data,ref_seq,log=self.log,**kwargs)
|
|
432
|
-
|
|
434
|
+
elif ref_seq_mode=="s":
|
|
433
435
|
self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
|
|
434
436
|
self.data = oldcheckref(self.data,ref_seq,log=self.log,**kwargs)
|
|
435
437
|
def infer_strand(self,ref_infer,**kwargs):
|
|
@@ -389,7 +389,10 @@ def oldcheckref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status=
|
|
|
389
389
|
return sumstats
|
|
390
390
|
|
|
391
391
|
#20240320 check if non-effect allele is aligned with reference genome
|
|
392
|
-
def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np.array):
|
|
392
|
+
def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np.array, records_len: np.array):
|
|
393
|
+
# starting_positions and records_len must be 1D arrays containing data only for the chromosomes contained in x,
|
|
394
|
+
# and these arrays must be ordered in the same way as the chromosomes in np.unique(x['CHR'].values).
|
|
395
|
+
|
|
393
396
|
# status
|
|
394
397
|
#0 / -----> match
|
|
395
398
|
#1 / -----> Flipped Fixed
|
|
@@ -431,6 +434,9 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
|
|
|
431
434
|
max_len_nea = _nea.str.len().max()
|
|
432
435
|
max_len_ea = _ea.str.len().max()
|
|
433
436
|
|
|
437
|
+
########################################## mask for variants with out of range POS
|
|
438
|
+
mask_outlier = pos > records_len[chrom]
|
|
439
|
+
#########################################
|
|
434
440
|
|
|
435
441
|
# Let's apply the same magic used for the fasta records (check build_fasta_records() for details) to convert the NEA and EA to
|
|
436
442
|
# a numpy array of integers in a very fast way.
|
|
@@ -442,7 +448,9 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
|
|
|
442
448
|
nea = _nea.str.translate(TRANSLATE_TABLE).to_numpy().astype(f'<U{max_len_nea}')
|
|
443
449
|
nea = nea.view('<u4').reshape(-1, max_len_nea).astype(np.uint8)
|
|
444
450
|
nea[nea == 0] = PADDING_VALUE # padding value
|
|
445
|
-
|
|
451
|
+
###########################################
|
|
452
|
+
|
|
453
|
+
###########################################
|
|
446
454
|
# Create a mask holding True at the position of non-padding values
|
|
447
455
|
mask_nea = nea != PADDING_VALUE
|
|
448
456
|
|
|
@@ -458,7 +466,9 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
|
|
|
458
466
|
ea = _ea.str.translate(TRANSLATE_TABLE).to_numpy().astype(f'<U{max_len_ea}')
|
|
459
467
|
ea = ea.view('<u4').reshape(-1, max_len_ea).astype(np.uint8)
|
|
460
468
|
ea[ea == 0] = PADDING_VALUE # padding value
|
|
461
|
-
|
|
469
|
+
###########################################
|
|
470
|
+
|
|
471
|
+
###########################################
|
|
462
472
|
mask_ea = ea != PADDING_VALUE
|
|
463
473
|
|
|
464
474
|
rev_ea = _ea.str.translate(TRANSLATE_TABLE_COMPL).str.pad(max_len_ea, 'left', chr(PADDING_VALUE)).to_numpy().astype(f'<U{max_len_ea}')
|
|
@@ -503,8 +513,11 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
|
|
|
503
513
|
# Index the record array using the computed indices.
|
|
504
514
|
# Since we use np.take, indices must all have the same length, and this is why we added the padding to NEA
|
|
505
515
|
# and we create the indices using max_len_nea (long story short, we can't obtain a scattered/ragged array)
|
|
506
|
-
output_nea = np.take(record, indices)
|
|
507
|
-
|
|
516
|
+
output_nea = np.take(record, indices, mode="clip")
|
|
517
|
+
##################################################################
|
|
518
|
+
output_nea[mask_outlier] = PADDING_VALUE
|
|
519
|
+
##################################################################
|
|
520
|
+
|
|
508
521
|
# Check if the NEA is equal to the reference sequence at the given position
|
|
509
522
|
# In a non-matrix way, this is equivalent (for one single element) to:
|
|
510
523
|
# nea == record[pos-1: pos+len(nea)-1]
|
|
@@ -527,7 +540,10 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
|
|
|
527
540
|
indices_range = np.arange(max_len_ea)
|
|
528
541
|
indices = pos + indices_range
|
|
529
542
|
indices = indices + modified_indices
|
|
530
|
-
output_ea = np.take(record, indices)
|
|
543
|
+
output_ea = np.take(record, indices, mode="clip")
|
|
544
|
+
##################################################################
|
|
545
|
+
output_ea[mask_outlier] = PADDING_VALUE
|
|
546
|
+
##################################################################
|
|
531
547
|
|
|
532
548
|
ea_eq_ref = np.all((ea == output_ea) + ~mask_ea, 1)
|
|
533
549
|
rev_ea_eq_ref = np.all((rev_ea == output_ea) + ~mask_ea, 1)
|
|
@@ -582,24 +598,28 @@ def check_status(sumstats: pd.DataFrame, fasta_records_dict, log=Log(), verbose=
|
|
|
582
598
|
chrom,pos,ea,nea,status = sumstats.columns
|
|
583
599
|
|
|
584
600
|
# First, convert the fasta records to a single numpy array of integers
|
|
585
|
-
record, starting_positions_dict = build_fasta_records(fasta_records_dict, pos_as_dict=True, log=log, verbose=verbose)
|
|
601
|
+
record, starting_positions_dict, records_len_dict = build_fasta_records(fasta_records_dict, pos_as_dict=True, log=log, verbose=verbose)
|
|
586
602
|
|
|
587
603
|
# In _fast_check_status(), several 2D numpy arrays are created and they are padded to have shape[1] == max_len_nea or max_len_ea
|
|
588
604
|
# Since most of the NEA and EA strings are short, we perform the check first on the records having short NEA and EA strings,
|
|
589
605
|
# and then we perform the check on the records having long NEA and EA strings. In this way we can speed up the process (since the
|
|
590
606
|
# arrays are smaller) and save memory.
|
|
591
607
|
max_len = 4 # this is a chosen value, we could compute it using some stats about the length and count of NEA and EA strings
|
|
592
|
-
condition = (sumstats[nea].str.len() <= max_len)
|
|
608
|
+
condition = (sumstats[nea].str.len() <= max_len) & (sumstats[ea].str.len() <= max_len)
|
|
593
609
|
|
|
594
610
|
log.write(f" -Checking records for ( len(NEA) <= {max_len} and len(EA) <= {max_len} )", verbose=verbose)
|
|
595
611
|
sumstats_cond = sumstats[condition]
|
|
596
|
-
|
|
597
|
-
|
|
612
|
+
unique_chrom_cond = sumstats_cond[chrom].unique()
|
|
613
|
+
starting_pos_cond = np.array([starting_positions_dict[k] for k in unique_chrom_cond])
|
|
614
|
+
records_len_cond = np.array([records_len_dict[k] for k in unique_chrom_cond])
|
|
615
|
+
sumstats.loc[condition, status] = _fast_check_status(sumstats_cond, record=record, starting_positions=starting_pos_cond, records_len=records_len_cond)
|
|
598
616
|
|
|
599
617
|
log.write(f" -Checking records for ( len(NEA) > {max_len} or len(EA) > {max_len} )", verbose=verbose)
|
|
600
618
|
sumstats_not_cond = sumstats[~condition]
|
|
601
|
-
|
|
602
|
-
|
|
619
|
+
unique_chrom_not_cond = sumstats_not_cond[chrom].unique()
|
|
620
|
+
starting_not_pos_cond = np.array([starting_positions_dict[k] for k in unique_chrom_not_cond])
|
|
621
|
+
records_len_not_cond = np.array([records_len_dict[k] for k in unique_chrom_not_cond])
|
|
622
|
+
sumstats.loc[~condition, status] = _fast_check_status(sumstats_not_cond, record=record, starting_positions=starting_not_pos_cond, records_len=records_len_not_cond)
|
|
603
623
|
|
|
604
624
|
return sumstats[status].values
|
|
605
625
|
|
|
@@ -709,10 +729,11 @@ def build_fasta_records(fasta_records_dict, pos_as_dict=True, log=Log(), verbose
|
|
|
709
729
|
starting_positions = np.cumsum(records_len) - records_len
|
|
710
730
|
if pos_as_dict:
|
|
711
731
|
starting_positions = {k: v for k, v in zip(fasta_records_dict.keys(), starting_positions)}
|
|
732
|
+
records_len_dict = {k: v for k, v in zip(fasta_records_dict.keys(), records_len)}
|
|
712
733
|
record = np.concatenate(all_r)
|
|
713
734
|
del all_r # free memory
|
|
714
735
|
|
|
715
|
-
return record, starting_positions
|
|
736
|
+
return record, starting_positions,records_len_dict
|
|
716
737
|
|
|
717
738
|
#######################################################################################################################################
|
|
718
739
|
|
|
@@ -792,7 +792,7 @@ def fixallele(sumstats,ea="EA", nea="NEA",status="STATUS",remove=False,verbose=T
|
|
|
792
792
|
###############################################################################################################
|
|
793
793
|
# 20220721
|
|
794
794
|
|
|
795
|
-
def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NEA",ea="EA" ,status="STATUS",n_cores=1,verbose=True,log=Log()):
|
|
795
|
+
def parallelnormalizeallele(sumstats,mode="s",snpid="SNPID",rsid="rsID",pos="POS",nea="NEA",ea="EA" ,status="STATUS",chunk=3000000,n_cores=1,verbose=True,log=Log()):
|
|
796
796
|
##start function with col checking##########################################################
|
|
797
797
|
_start_line = "normalize indels"
|
|
798
798
|
_end_line = "normalizing indels"
|
|
@@ -819,7 +819,51 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
|
|
|
819
819
|
log.write("Finished normalizing variants successfully!", verbose=verbose)
|
|
820
820
|
return sumstats
|
|
821
821
|
###############################################################################################################
|
|
822
|
-
if
|
|
822
|
+
if mode=="v":
|
|
823
|
+
if sum(variants_to_check)<100000:
|
|
824
|
+
n_cores=1
|
|
825
|
+
if n_cores==1:
|
|
826
|
+
normalized_pd, changed_index = fastnormalizeallele(sumstats.loc[variants_to_check,[pos,nea,ea,status]],pos=pos ,nea=nea,ea=ea,status=status,chunk=chunk, log=log, verbose=verbose)
|
|
827
|
+
else:
|
|
828
|
+
pool = Pool(n_cores)
|
|
829
|
+
map_func = partial(fastnormalizeallele,pos=pos,nea=nea,ea=ea,status=status)
|
|
830
|
+
df_split = _df_split(sumstats.loc[variants_to_check,[pos,nea,ea,status]], n_cores)
|
|
831
|
+
results = pool.map(map_func,df_split)
|
|
832
|
+
normalized_pd = pd.concat([i[0] for i in results])
|
|
833
|
+
changed_index = np.concatenate([i[1] for i in results])
|
|
834
|
+
del results
|
|
835
|
+
pool.close()
|
|
836
|
+
pool.join()
|
|
837
|
+
gc.collect()
|
|
838
|
+
###############################################################################################################
|
|
839
|
+
try:
|
|
840
|
+
example_sumstats = sumstats.loc[changed_index,:].head()
|
|
841
|
+
changed_num = len(changed_index)
|
|
842
|
+
if changed_num>0:
|
|
843
|
+
if snpid in example_sumstats.columns:
|
|
844
|
+
before_normalize_id = example_sumstats.loc[variants_to_check,snpid]
|
|
845
|
+
elif rsid in example_sumstats.columns:
|
|
846
|
+
before_normalize_id = example_sumstats.loc[variants_to_check,rsid]
|
|
847
|
+
else:
|
|
848
|
+
before_normalize_id = example_sumstats.index
|
|
849
|
+
|
|
850
|
+
log.write(" -Not normalized allele IDs:",end="", verbose=verbose)
|
|
851
|
+
for i in before_normalize_id.values:
|
|
852
|
+
log.write(i,end=" ",show_time=False)
|
|
853
|
+
log.write("... \n",end="",show_time=False, verbose=verbose)
|
|
854
|
+
|
|
855
|
+
log.write(" -Not normalized allele:",end="", verbose=verbose)
|
|
856
|
+
for i in example_sumstats[[ea,nea]].values:
|
|
857
|
+
log.write(i,end="",show_time=False, verbose=verbose)
|
|
858
|
+
log.write("... \n",end="",show_time=False, verbose=verbose)
|
|
859
|
+
log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.", verbose=verbose)
|
|
860
|
+
else:
|
|
861
|
+
log.write(" -All variants are already normalized..", verbose=verbose)
|
|
862
|
+
except:
|
|
863
|
+
pass
|
|
864
|
+
|
|
865
|
+
##########################################################################################################################################################
|
|
866
|
+
elif mode=="s":
|
|
823
867
|
if sum(variants_to_check)<10000:
|
|
824
868
|
n_cores=1
|
|
825
869
|
pool = Pool(n_cores)
|
|
@@ -829,35 +873,36 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
|
|
|
829
873
|
normalized_pd = pd.concat(pool.map(map_func,df_split))
|
|
830
874
|
pool.close()
|
|
831
875
|
pool.join()
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
876
|
+
|
|
877
|
+
before_normalize = sumstats.loc[variants_to_check,[ea,nea]]
|
|
878
|
+
changed_num = len(normalized_pd.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),:])
|
|
879
|
+
if changed_num>0:
|
|
880
|
+
if snpid in sumstats.columns:
|
|
881
|
+
before_normalize_id = sumstats.loc[variants_to_check,snpid]
|
|
882
|
+
elif rsid in sumstats.columns:
|
|
883
|
+
before_normalize_id = sumstats.loc[variants_to_check,rsid]
|
|
884
|
+
else:
|
|
885
|
+
before_normalize_id = pd.DataFrame(sumstats.index[variants_to_check],index=sumstats.index[variants_to_check])
|
|
886
|
+
|
|
887
|
+
log.write(" -Not normalized allele IDs:",end="", verbose=verbose)
|
|
888
|
+
for i in before_normalize_id.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea])].head().values:
|
|
889
|
+
log.write(i,end=" ",show_time=False)
|
|
890
|
+
log.write("... \n",end="",show_time=False, verbose=verbose)
|
|
843
891
|
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.", verbose=verbose)
|
|
854
|
-
else:
|
|
855
|
-
log.write(" -All variants are already normalized..", verbose=verbose)
|
|
856
|
-
###################################################################################################################
|
|
892
|
+
log.write(" -Not normalized allele:",end="", verbose=verbose)
|
|
893
|
+
for i in before_normalize.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),[ea,nea]].head().values:
|
|
894
|
+
log.write(i,end="",show_time=False, verbose=verbose)
|
|
895
|
+
log.write("... \n",end="",show_time=False, verbose=verbose)
|
|
896
|
+
log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.", verbose=verbose)
|
|
897
|
+
else:
|
|
898
|
+
log.write(" -All variants are already normalized..", verbose=verbose)
|
|
899
|
+
###################################################################################################################
|
|
900
|
+
|
|
857
901
|
categories = set(sumstats[ea])|set(sumstats[nea]) |set(normalized_pd.loc[:,ea]) |set(normalized_pd.loc[:,nea])
|
|
858
902
|
sumstats[ea] = pd.Categorical(sumstats[ea],categories = categories)
|
|
859
903
|
sumstats[nea] = pd.Categorical(sumstats[nea],categories = categories )
|
|
860
904
|
sumstats.loc[variants_to_check,[pos,nea,ea,status]] = normalized_pd.values
|
|
905
|
+
|
|
861
906
|
try:
|
|
862
907
|
sumstats[pos] = sumstats[pos].astype('Int64')
|
|
863
908
|
except:
|
|
@@ -873,6 +918,67 @@ def normalizeallele(sumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS"):
|
|
|
873
918
|
sumstats = pd.DataFrame(normalized.to_list(), columns=[pos,nea,ea,status],index=sumstats.index)
|
|
874
919
|
return sumstats
|
|
875
920
|
|
|
921
|
+
def fastnormalizeallele(insumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS",chunk=3000000,log=Log(),verbose=False):
|
|
922
|
+
log.write(" -Number of variants to check:{}".format(len(insumstats)), verbose=verbose)
|
|
923
|
+
log.write(" -Chunk size:{}".format(chunk), verbose=verbose)
|
|
924
|
+
log.write(" -Processing in chunks:",end="", verbose=verbose)
|
|
925
|
+
changed_index = np.array([])
|
|
926
|
+
for part_n in range(len(insumstats)//chunk+1):
|
|
927
|
+
log.write(part_n, end=" ",show_time=False, verbose=verbose)
|
|
928
|
+
insumstats["NEA"] = insumstats["NEA"].astype("string")
|
|
929
|
+
insumstats["EA"] = insumstats["EA"].astype("string")
|
|
930
|
+
insumstats.iloc[part_n*chunk:(part_n+1)*chunk,:],changed_index_single = normalizae_chunk(insumstats.iloc[part_n*chunk:(part_n+1)*chunk,:].copy())
|
|
931
|
+
changed_index = np.concatenate([changed_index,changed_index_single])
|
|
932
|
+
gc.collect()
|
|
933
|
+
log.write("\n",end="",show_time=False, verbose=verbose)
|
|
934
|
+
return insumstats, changed_index
|
|
935
|
+
|
|
936
|
+
def normalizae_chunk(sumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS"):
|
|
937
|
+
# already normalized
|
|
938
|
+
|
|
939
|
+
is_same = sumstats["NEA"] == sumstats["EA"]
|
|
940
|
+
is_normalized = ((sumstats["NEA"].str.len()==1) | (sumstats["EA"].str.len()==1) ) & (~is_same)
|
|
941
|
+
|
|
942
|
+
# a series to keep tracking of variants that are modified
|
|
943
|
+
changed = sumstats["NEA"] != sumstats["NEA"]
|
|
944
|
+
|
|
945
|
+
# right side
|
|
946
|
+
ea_len = sumstats["NEA"].str.len()
|
|
947
|
+
nea_len = sumstats["EA"].str.len()
|
|
948
|
+
max_length=max(ea_len.max(), nea_len.max())
|
|
949
|
+
|
|
950
|
+
for i in range(1, max_length):
|
|
951
|
+
is_pop = (sumstats["NEA"].str[-1] == sumstats["EA"].str[-1]) & (~is_normalized)
|
|
952
|
+
if sum(is_pop)==0:
|
|
953
|
+
break
|
|
954
|
+
if i ==1:
|
|
955
|
+
changed = changed | is_pop
|
|
956
|
+
nea_len[is_pop] = nea_len[is_pop] -1
|
|
957
|
+
ea_len[is_pop] = ea_len[is_pop] -1
|
|
958
|
+
sumstats.loc[is_pop, "NEA"] = sumstats.loc[is_pop,"NEA"].str[:-1]
|
|
959
|
+
sumstats.loc[is_pop, "EA"] = sumstats.loc[is_pop,"EA"].str[:-1]
|
|
960
|
+
is_normalized = ((sumstats["NEA"].str.len()==1) | (sumstats["EA"].str.len()==1) ) & (~is_same)
|
|
961
|
+
gc.collect()
|
|
962
|
+
|
|
963
|
+
# left side
|
|
964
|
+
max_length=max(sumstats["NEA"].str.len().max(), sumstats["EA"].str.len().max())
|
|
965
|
+
for i in range(1, max_length):
|
|
966
|
+
is_pop = (sumstats["NEA"].str[0] == sumstats["EA"].str[0]) & (~is_normalized)
|
|
967
|
+
if sum(is_pop)==0:
|
|
968
|
+
break
|
|
969
|
+
if i ==1:
|
|
970
|
+
changed = changed | is_pop
|
|
971
|
+
sumstats.loc[is_pop, "NEA"] = sumstats.loc[is_pop,"NEA"].str[1:]
|
|
972
|
+
sumstats.loc[is_pop, "EA"] = sumstats.loc[is_pop,"EA"].str[1:]
|
|
973
|
+
sumstats.loc[is_pop, "POS"] = sumstats.loc[is_pop,"POS"] + 1
|
|
974
|
+
is_normalized = ((sumstats["NEA"].str.len()==1) | (sumstats["EA"].str.len()==1) ) & (~is_same)
|
|
975
|
+
gc.collect()
|
|
976
|
+
|
|
977
|
+
sumstats.loc[is_normalized,status] = vchange_status(sumstats.loc[is_normalized, status], 5,"4","0")
|
|
978
|
+
sumstats.loc[is_same,status] = vchange_status(sumstats.loc[is_same, status], 5,"4","3")
|
|
979
|
+
changed_index = sumstats[changed].index
|
|
980
|
+
return sumstats, changed_index.values
|
|
981
|
+
|
|
876
982
|
def normalizevariant(pos,a,b,status):
|
|
877
983
|
# single record
|
|
878
984
|
# https://genome.sph.umich.edu/wiki/Variant_Normalization
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: gwaslab
|
|
3
|
-
Version: 3.4.43
|
|
3
|
+
Version: 3.4.44
|
|
4
4
|
Summary: A collection of handy tools for GWAS SumStats
|
|
5
5
|
Author-email: Yunye <yunye@gwaslab.com>
|
|
6
6
|
Project-URL: Homepage, https://cloufield.github.io/gwaslab/
|
|
@@ -51,7 +51,7 @@ Warning: Known issues of GWASLab are summarized in [https://cloufield.github.io/
|
|
|
51
51
|
### install via pip
|
|
52
52
|
|
|
53
53
|
```
|
|
54
|
-
pip install gwaslab==3.4.
|
|
54
|
+
pip install gwaslab==3.4.43
|
|
55
55
|
```
|
|
56
56
|
|
|
57
57
|
```python
|
|
@@ -90,7 +90,7 @@ Create a Python 3.9 environment and install gwaslab using pip:
|
|
|
90
90
|
```
|
|
91
91
|
conda env create -n gwaslab_test -c conda-forge python=3.9
|
|
92
92
|
conda activate gwaslab
|
|
93
|
-
pip install gwaslab==3.4.
|
|
93
|
+
pip install gwaslab==3.4.43
|
|
94
94
|
```
|
|
95
95
|
|
|
96
96
|
or create a new environment using yml file [environment_3.4.40.yml](https://github.com/Cloufield/gwaslab/blob/main/environment_3.4.40.yml)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/data/hapmap3_SNPs/hapmap3_db150_hg19.snplist.gz
RENAMED
|
File without changes
|
{gwaslab-3.4.43 → gwaslab-3.4.44}/src/gwaslab/data/hapmap3_SNPs/hapmap3_db151_hg38.snplist.gz
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|