gwaslab 3.4.43__tar.gz → 3.4.45__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (86)
  1. {gwaslab-3.4.43/src/gwaslab.egg-info → gwaslab-3.4.45}/PKG-INFO +5 -5
  2. {gwaslab-3.4.43 → gwaslab-3.4.45}/README.md +2 -2
  3. {gwaslab-3.4.43 → gwaslab-3.4.45}/pyproject.toml +3 -3
  4. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/g_Sumstats.py +4 -2
  5. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/g_SumstatsPair.py +1 -1
  6. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/g_vchange_status.py +4 -2
  7. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/g_version.py +2 -2
  8. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/hm_harmonize_sumstats.py +45 -17
  9. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/qc_fix_sumstats.py +132 -26
  10. {gwaslab-3.4.43 → gwaslab-3.4.45/src/gwaslab.egg-info}/PKG-INFO +5 -5
  11. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab.egg-info/requires.txt +1 -1
  12. {gwaslab-3.4.43 → gwaslab-3.4.45}/LICENSE +0 -0
  13. {gwaslab-3.4.43 → gwaslab-3.4.45}/LICENSE_before_v3.4.39 +0 -0
  14. {gwaslab-3.4.43 → gwaslab-3.4.45}/setup.cfg +0 -0
  15. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/__init__.py +0 -0
  16. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/bd_common_data.py +0 -0
  17. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/bd_config.py +0 -0
  18. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/bd_download.py +0 -0
  19. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/bd_get_hapmap3.py +0 -0
  20. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/cache_manager.py +0 -0
  21. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/data/chrx_par/chrx_par_hg19.bed.gz +0 -0
  22. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/data/chrx_par/chrx_par_hg38.bed.gz +0 -0
  23. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/data/formatbook.json +0 -0
  24. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/data/hapmap3_SNPs/hapmap3_db150_hg19.snplist.gz +0 -0
  25. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/data/hapmap3_SNPs/hapmap3_db151_hg38.snplist.gz +0 -0
  26. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/data/high_ld/high_ld_hla_hg19.bed.gz +0 -0
  27. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/data/high_ld/high_ld_hla_hg38.bed.gz +0 -0
  28. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/data/reference.json +0 -0
  29. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/g_Log.py +0 -0
  30. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/g_Phenotypes.py +0 -0
  31. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/g_SumstatsT.py +0 -0
  32. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/g_Sumstats_summary.py +0 -0
  33. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/g_meta.py +0 -0
  34. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/hm_casting.py +0 -0
  35. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/hm_rsid_to_chrpos.py +0 -0
  36. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/io_preformat_input.py +0 -0
  37. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/io_read_ldsc.py +0 -0
  38. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/io_read_tabular.py +0 -0
  39. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/io_to_formats.py +0 -0
  40. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/io_to_pickle.py +0 -0
  41. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/ldsc_irwls.py +0 -0
  42. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/ldsc_jackknife.py +0 -0
  43. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/ldsc_ldscore.py +0 -0
  44. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/ldsc_parse.py +0 -0
  45. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/ldsc_regressions.py +0 -0
  46. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/ldsc_sumstats.py +0 -0
  47. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/qc_check_datatype.py +0 -0
  48. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/run_script.py +0 -0
  49. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_ex_calculate_ldmatrix.py +0 -0
  50. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_ex_calculate_prs.py +0 -0
  51. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_ex_gwascatalog.py +0 -0
  52. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_ex_ldproxyfinder.py +0 -0
  53. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_ex_ldsc.py +0 -0
  54. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_ex_plink_filter.py +0 -0
  55. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_ex_process_h5.py +0 -0
  56. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_ex_process_ref.py +0 -0
  57. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_ex_run_2samplemr.py +0 -0
  58. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_ex_run_clumping.py +0 -0
  59. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_ex_run_coloc.py +0 -0
  60. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_ex_run_susie.py +0 -0
  61. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_in_calculate_gc.py +0 -0
  62. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_in_calculate_power.py +0 -0
  63. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_in_convert_h2.py +0 -0
  64. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_in_correct_winnerscurse.py +0 -0
  65. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_in_fill_data.py +0 -0
  66. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_in_filter_value.py +0 -0
  67. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_in_get_density.py +0 -0
  68. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_in_get_sig.py +0 -0
  69. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/viz_aux_annotate_plot.py +0 -0
  70. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/viz_aux_quickfix.py +0 -0
  71. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/viz_aux_reposition_text.py +0 -0
  72. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/viz_aux_save_figure.py +0 -0
  73. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/viz_plot_compare_af.py +0 -0
  74. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/viz_plot_compare_effect.py +0 -0
  75. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/viz_plot_forestplot.py +0 -0
  76. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/viz_plot_miamiplot.py +0 -0
  77. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/viz_plot_miamiplot2.py +0 -0
  78. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/viz_plot_mqqplot.py +0 -0
  79. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/viz_plot_qqplot.py +0 -0
  80. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/viz_plot_regionalplot.py +0 -0
  81. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/viz_plot_rg_heatmap.py +0 -0
  82. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/viz_plot_stackedregional.py +0 -0
  83. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/viz_plot_trumpetplot.py +0 -0
  84. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab.egg-info/SOURCES.txt +0 -0
  85. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab.egg-info/dependency_links.txt +0 -0
  86. {gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: gwaslab
3
- Version: 3.4.43
3
+ Version: 3.4.45
4
4
  Summary: A collection of handy tools for GWAS SumStats
5
5
  Author-email: Yunye <yunye@gwaslab.com>
6
6
  Project-URL: Homepage, https://cloufield.github.io/gwaslab/
@@ -8,7 +8,7 @@ Project-URL: Github, https://github.com/Cloufield/gwaslab
8
8
  Classifier: Programming Language :: Python :: 3
9
9
  Classifier: License :: OSI Approved :: MIT License
10
10
  Classifier: Operating System :: OS Independent
11
- Requires-Python: <=3.10,>=3.9
11
+ Requires-Python: <3.11,>=3.9
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
14
  License-File: LICENSE_before_v3.4.39
@@ -17,7 +17,7 @@ Requires-Dist: numpy>=1.21.2
17
17
  Requires-Dist: matplotlib!=3.7.2,>=3.5
18
18
  Requires-Dist: seaborn>=0.12
19
19
  Requires-Dist: scipy>=1.12
20
- Requires-Dist: pySAM<0.20,>=0.18.1
20
+ Requires-Dist: pySAM==0.22.1
21
21
  Requires-Dist: Biopython>=1.79
22
22
  Requires-Dist: adjustText<=0.8,>=0.7.3
23
23
  Requires-Dist: liftover>=1.1.13
@@ -51,7 +51,7 @@ Warning: Known issues of GWASLab are summarized in [https://cloufield.github.io/
51
51
  ### install via pip
52
52
 
53
53
  ```
54
- pip install gwaslab==3.4.41
54
+ pip install gwaslab==3.4.43
55
55
  ```
56
56
 
57
57
  ```python
@@ -90,7 +90,7 @@ Create a Python 3.9 environment and install gwaslab using pip:
90
90
  ```
91
91
  conda env create -n gwaslab_test -c conda-forge python=3.9
92
92
  conda activate gwaslab
93
- pip install gwaslab==3.4.41
93
+ pip install gwaslab==3.4.43
94
94
  ```
95
95
 
96
96
  or create a new environment using yml file [environment_3.4.40.yml](https://github.com/Cloufield/gwaslab/blob/main/environment_3.4.40.yml)
@@ -23,7 +23,7 @@ Warning: Known issues of GWASLab are summarized in [https://cloufield.github.io/
23
23
  ### install via pip
24
24
 
25
25
  ```
26
- pip install gwaslab==3.4.41
26
+ pip install gwaslab==3.4.43
27
27
  ```
28
28
 
29
29
  ```python
@@ -62,7 +62,7 @@ Create a Python 3.9 environment and install gwaslab using pip:
62
62
  ```
63
63
  conda env create -n gwaslab_test -c conda-forge python=3.9
64
64
  conda activate gwaslab
65
- pip install gwaslab==3.4.41
65
+ pip install gwaslab==3.4.43
66
66
  ```
67
67
 
68
68
  or create a new environment using yml file [environment_3.4.40.yml](https://github.com/Cloufield/gwaslab/blob/main/environment_3.4.40.yml)
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
7
7
 
8
8
  [project]
9
9
  name = "gwaslab"
10
- version = "3.4.43"
10
+ version = "3.4.45"
11
11
  authors = [
12
12
  { name="Yunye", email="yunye@gwaslab.com" },
13
13
  ]
@@ -21,7 +21,7 @@ dependencies = [
21
21
  "matplotlib>=3.5,!=3.7.2",
22
22
  "seaborn>=0.12",
23
23
  "scipy>=1.12",
24
- "pySAM>=0.18.1,<0.20",
24
+ "pySAM==0.22.1",
25
25
  "Biopython>=1.79",
26
26
  "adjustText>=0.7.3, <=0.8",
27
27
  "liftover>=1.1.13",
@@ -31,7 +31,7 @@ dependencies = [
31
31
  "h5py>=3.10.0"
32
32
  ]
33
33
 
34
- requires-python = ">=3.9,<=3.10"
34
+ requires-python = ">=3.9,<3.11"
35
35
  classifiers = [
36
36
  "Programming Language :: Python :: 3",
37
37
  "License :: OSI Approved :: MIT License",
@@ -356,8 +356,10 @@ class Sumstats():
356
356
  if ref_seq is not None:
357
357
  if ref_seq_mode=="v":
358
358
  self.data = checkref(self.data,ref_seq,log=self.log,**checkref_args)
359
- else:
359
+ elif ref_seq_mode=="s":
360
360
  self.data = oldcheckref(self.data,ref_seq,log=self.log,**checkref_args)
361
+ else:
362
+ raise ValueError("ref_seq_mode should be 'v' (vectorized, faster) or 's' (sequential, slower)")
361
363
 
362
364
  self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
363
365
 
@@ -429,7 +431,7 @@ class Sumstats():
429
431
  if ref_seq_mode=="v":
430
432
  self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
431
433
  self.data = checkref(self.data,ref_seq,log=self.log,**kwargs)
432
- else:
434
+ elif ref_seq_mode=="s":
433
435
  self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
434
436
  self.data = oldcheckref(self.data,ref_seq,log=self.log,**kwargs)
435
437
  def infer_strand(self,ref_infer,**kwargs):
@@ -139,7 +139,7 @@ class SumstatsPair( ):
139
139
  self.clumps["clumps"], self.clumps["plink_log"] = _clump(self.data, log=self.log, p="P_1",mlog10p="MLOG10P_1", study = self.study_name, **kwargs)
140
140
 
141
141
  def to_coloc(self,**kwargs):
142
- self.to_finemapping_file_path, self.plink_log = tofinemapping(self.data,study=self.study_name,suffixes=self.suffixes,log=self.log,**kwargs)
142
+ self.to_finemapping_file_path, output_file_list, self.plink_log = tofinemapping(self.data,study=self.study_name,suffixes=self.suffixes,log=self.log,**kwargs)
143
143
 
144
144
  def run_coloc_susie(self,**kwargs):
145
145
 
@@ -1,13 +1,15 @@
1
1
  import pandas as pd
2
2
 
3
+ CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
4
+
3
5
  def vchange_status(status,digit,before,after):
4
6
  dic={}
5
7
  for i in range(len(before)):
6
8
  dic[before[i]]=after[i]
7
9
  if digit>1:
8
- return status.str[:digit-1]+status.str[digit-1].replace(dic)+status.str[digit:]
10
+ return pd.Categorical(status.str[:digit-1]+status.str[digit-1].replace(dic)+status.str[digit:],categories=CATEGORIES)
9
11
  else:
10
- return status.str[digit-1].replace(dic)+status.str[digit:]
12
+ return pd.Categorical(status.str[digit-1].replace(dic)+status.str[digit:],categories=CATEGORIES)
11
13
 
12
14
  def copy_status(from_status,to_status, digit):
13
15
  if digit>1:
@@ -15,8 +15,8 @@ def _get_version():
15
15
  def gwaslab_info():
16
16
  # version meta information
17
17
  dic={
18
- "version":"3.4.43",
19
- "release_date":"20240403"
18
+ "version":"3.4.45",
19
+ "release_date":"20240509"
20
20
  }
21
21
  return dic
22
22
 
@@ -355,7 +355,11 @@ def oldcheckref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status=
355
355
 
356
356
  log.write("\n",end="",show_time=False,verbose=verbose)
357
357
 
358
- sumstats[status] = sumstats[status].astype("string")
358
+ CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
359
+ sumstats[status] = pd.Categorical(sumstats[status],categories=CATEGORIES)
360
+ #sumstats[status] = sumstats[status].astype("string")
361
+
362
+
359
363
  available_to_check =sum( (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
360
364
  status_0=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[0]\w", case=False, flags=0, na=False))
361
365
  status_3=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[3]\w", case=False, flags=0, na=False))
@@ -389,7 +393,10 @@ def oldcheckref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status=
389
393
  return sumstats
390
394
 
391
395
  #20240320 check if non-effect allele is aligned with reference genome
392
- def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np.array):
396
+ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np.array, records_len: np.array):
397
+ # starting_positions and records_len must be 1D arrays containing data only for the chromosomes contained in x,
398
+ # and these arrays must be ordered in the same way as the chromosomes in np.unique(x['CHR'].values).
399
+
393
400
  # status
394
401
  #0 / -----> match
395
402
  #1 / -----> Flipped Fixed
@@ -431,6 +438,9 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
431
438
  max_len_nea = _nea.str.len().max()
432
439
  max_len_ea = _ea.str.len().max()
433
440
 
441
+ ########################################## mask for variants with out of range POS
442
+ mask_outlier = pos > records_len[chrom]
443
+ #########################################
434
444
 
435
445
  # Let's apply the same magic used for the fasta records (check build_fasta_records() for details) to convert the NEA and EA to
436
446
  # a numpy array of integers in a very fast way.
@@ -442,7 +452,9 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
442
452
  nea = _nea.str.translate(TRANSLATE_TABLE).to_numpy().astype(f'<U{max_len_nea}')
443
453
  nea = nea.view('<u4').reshape(-1, max_len_nea).astype(np.uint8)
444
454
  nea[nea == 0] = PADDING_VALUE # padding value
445
-
455
+ ###########################################
456
+
457
+ ###########################################
446
458
  # Create a mask holding True at the position of non-padding values
447
459
  mask_nea = nea != PADDING_VALUE
448
460
 
@@ -458,7 +470,9 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
458
470
  ea = _ea.str.translate(TRANSLATE_TABLE).to_numpy().astype(f'<U{max_len_ea}')
459
471
  ea = ea.view('<u4').reshape(-1, max_len_ea).astype(np.uint8)
460
472
  ea[ea == 0] = PADDING_VALUE # padding value
461
-
473
+ ###########################################
474
+
475
+ ###########################################
462
476
  mask_ea = ea != PADDING_VALUE
463
477
 
464
478
  rev_ea = _ea.str.translate(TRANSLATE_TABLE_COMPL).str.pad(max_len_ea, 'left', chr(PADDING_VALUE)).to_numpy().astype(f'<U{max_len_ea}')
@@ -503,8 +517,11 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
503
517
  # Index the record array using the computed indices.
504
518
  # Since we use np.take, indices must all have the same length, and this is why we added the padding to NEA
505
519
  # and we create the indices using max_len_nea (long story short, we can't obtain a scattered/ragged array)
506
- output_nea = np.take(record, indices)
507
-
520
+ output_nea = np.take(record, indices, mode="clip")
521
+ ##################################################################
522
+ output_nea[mask_outlier] = PADDING_VALUE
523
+ ##################################################################
524
+
508
525
  # Check if the NEA is equal to the reference sequence at the given position
509
526
  # In a non-matrix way, this is equivalent (for one single element) to:
510
527
  # nea == record[pos-1: pos+len(nea)-1]
@@ -527,7 +544,10 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
527
544
  indices_range = np.arange(max_len_ea)
528
545
  indices = pos + indices_range
529
546
  indices = indices + modified_indices
530
- output_ea = np.take(record, indices)
547
+ output_ea = np.take(record, indices, mode="clip")
548
+ ##################################################################
549
+ output_ea[mask_outlier] = PADDING_VALUE
550
+ ##################################################################
531
551
 
532
552
  ea_eq_ref = np.all((ea == output_ea) + ~mask_ea, 1)
533
553
  rev_ea_eq_ref = np.all((rev_ea == output_ea) + ~mask_ea, 1)
@@ -582,24 +602,28 @@ def check_status(sumstats: pd.DataFrame, fasta_records_dict, log=Log(), verbose=
582
602
  chrom,pos,ea,nea,status = sumstats.columns
583
603
 
584
604
  # First, convert the fasta records to a single numpy array of integers
585
- record, starting_positions_dict = build_fasta_records(fasta_records_dict, pos_as_dict=True, log=log, verbose=verbose)
605
+ record, starting_positions_dict, records_len_dict = build_fasta_records(fasta_records_dict, pos_as_dict=True, log=log, verbose=verbose)
586
606
 
587
607
  # In _fast_check_status(), several 2D numpy arrays are created and they are padded to have shape[1] == max_len_nea or max_len_ea
588
608
  # Since most of the NEA and EA strings are short, we perform the check first on the records having short NEA and EA strings,
589
609
  # and then we perform the check on the records having long NEA and EA strings. In this way we can speed up the process (since the
590
610
  # arrays are smaller) and save memory.
591
611
  max_len = 4 # this is a chosen value, we could compute it using some stats about the length and count of NEA and EA strings
592
- condition = (sumstats[nea].str.len() <= max_len) * (sumstats[ea].str.len() <= max_len)
612
+ condition = (sumstats[nea].str.len() <= max_len) & (sumstats[ea].str.len() <= max_len)
593
613
 
594
614
  log.write(f" -Checking records for ( len(NEA) <= {max_len} and len(EA) <= {max_len} )", verbose=verbose)
595
615
  sumstats_cond = sumstats[condition]
596
- starting_pos_cond = np.array([starting_positions_dict[k] for k in sumstats_cond[chrom].unique()])
597
- sumstats.loc[condition, status] = _fast_check_status(sumstats_cond, record=record, starting_positions=starting_pos_cond)
616
+ unique_chrom_cond = sumstats_cond[chrom].unique()
617
+ starting_pos_cond = np.array([starting_positions_dict[k] for k in unique_chrom_cond])
618
+ records_len_cond = np.array([records_len_dict[k] for k in unique_chrom_cond])
619
+ sumstats.loc[condition, status] = _fast_check_status(sumstats_cond, record=record, starting_positions=starting_pos_cond, records_len=records_len_cond)
598
620
 
599
621
  log.write(f" -Checking records for ( len(NEA) > {max_len} or len(EA) > {max_len} )", verbose=verbose)
600
622
  sumstats_not_cond = sumstats[~condition]
601
- starting_not_pos_cond = np.array([starting_positions_dict[k] for k in sumstats_not_cond[chrom].unique()])
602
- sumstats.loc[~condition, status] = _fast_check_status(sumstats_not_cond, record=record, starting_positions=starting_not_pos_cond)
623
+ unique_chrom_not_cond = sumstats_not_cond[chrom].unique()
624
+ starting_not_pos_cond = np.array([starting_positions_dict[k] for k in unique_chrom_not_cond])
625
+ records_len_not_cond = np.array([records_len_dict[k] for k in unique_chrom_not_cond])
626
+ sumstats.loc[~condition, status] = _fast_check_status(sumstats_not_cond, record=record, starting_positions=starting_not_pos_cond, records_len=records_len_not_cond)
603
627
 
604
628
  return sumstats[status].values
605
629
 
@@ -649,9 +673,11 @@ def checkref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="ST
649
673
  sumstats_to_check = sumstats.loc[to_check_ref,[chrom,pos,ea,nea,status]]
650
674
  sumstats.loc[to_check_ref,status] = check_status(sumstats_to_check, all_records_dict, log=log, verbose=verbose)
651
675
  log.write(" -Finished checking records", verbose=verbose)
652
-
653
- sumstats[status] = sumstats[status].astype("string")
654
-
676
+
677
+ CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
678
+ sumstats[status] = pd.Categorical(sumstats[status],categories=CATEGORIES)
679
+ #sumstats[status] = sumstats[status].astype("string")
680
+
655
681
  available_to_check =sum( (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
656
682
  status_0=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[0]\w", case=False, flags=0, na=False))
657
683
  status_3=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[3]\w", case=False, flags=0, na=False))
@@ -680,6 +706,7 @@ def checkref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="ST
680
706
  if remove is True:
681
707
  sumstats = sumstats.loc[~sumstats["STATUS"].str.match("\w\w\w\w\w[8]\w"),:]
682
708
  log.write(" -Variants not on given reference sequence were removed.",verbose=verbose)
709
+
683
710
 
684
711
  finished(log, verbose, _end_line)
685
712
  return sumstats
@@ -709,10 +736,11 @@ def build_fasta_records(fasta_records_dict, pos_as_dict=True, log=Log(), verbose
709
736
  starting_positions = np.cumsum(records_len) - records_len
710
737
  if pos_as_dict:
711
738
  starting_positions = {k: v for k, v in zip(fasta_records_dict.keys(), starting_positions)}
739
+ records_len_dict = {k: v for k, v in zip(fasta_records_dict.keys(), records_len)}
712
740
  record = np.concatenate(all_r)
713
741
  del all_r # free memory
714
742
 
715
- return record, starting_positions
743
+ return record, starting_positions,records_len_dict
716
744
 
717
745
  #######################################################################################################################################
718
746
 
@@ -792,7 +792,7 @@ def fixallele(sumstats,ea="EA", nea="NEA",status="STATUS",remove=False,verbose=T
792
792
  ###############################################################################################################
793
793
  # 20220721
794
794
 
795
- def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NEA",ea="EA" ,status="STATUS",n_cores=1,verbose=True,log=Log()):
795
+ def parallelnormalizeallele(sumstats,mode="s",snpid="SNPID",rsid="rsID",pos="POS",nea="NEA",ea="EA" ,status="STATUS",chunk=3000000,n_cores=1,verbose=True,log=Log()):
796
796
  ##start function with col checking##########################################################
797
797
  _start_line = "normalize indels"
798
798
  _end_line = "normalizing indels"
@@ -819,7 +819,51 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
819
819
  log.write("Finished normalizing variants successfully!", verbose=verbose)
820
820
  return sumstats
821
821
  ###############################################################################################################
822
- if sum(variants_to_check)>0:
822
+ if mode=="v":
823
+ if sum(variants_to_check)<100000:
824
+ n_cores=1
825
+ if n_cores==1:
826
+ normalized_pd, changed_index = fastnormalizeallele(sumstats.loc[variants_to_check,[pos,nea,ea,status]],pos=pos ,nea=nea,ea=ea,status=status,chunk=chunk, log=log, verbose=verbose)
827
+ else:
828
+ pool = Pool(n_cores)
829
+ map_func = partial(fastnormalizeallele,pos=pos,nea=nea,ea=ea,status=status)
830
+ df_split = _df_split(sumstats.loc[variants_to_check,[pos,nea,ea,status]], n_cores)
831
+ results = pool.map(map_func,df_split)
832
+ normalized_pd = pd.concat([i[0] for i in results])
833
+ changed_index = np.concatenate([i[1] for i in results])
834
+ del results
835
+ pool.close()
836
+ pool.join()
837
+ gc.collect()
838
+ ###############################################################################################################
839
+ try:
840
+ example_sumstats = sumstats.loc[changed_index,:].head()
841
+ changed_num = len(changed_index)
842
+ if changed_num>0:
843
+ if snpid in example_sumstats.columns:
844
+ before_normalize_id = example_sumstats.loc[variants_to_check,snpid]
845
+ elif rsid in example_sumstats.columns:
846
+ before_normalize_id = example_sumstats.loc[variants_to_check,rsid]
847
+ else:
848
+ before_normalize_id = example_sumstats.index
849
+
850
+ log.write(" -Not normalized allele IDs:",end="", verbose=verbose)
851
+ for i in before_normalize_id.values:
852
+ log.write(i,end=" ",show_time=False)
853
+ log.write("... \n",end="",show_time=False, verbose=verbose)
854
+
855
+ log.write(" -Not normalized allele:",end="", verbose=verbose)
856
+ for i in example_sumstats[[ea,nea]].values:
857
+ log.write(i,end="",show_time=False, verbose=verbose)
858
+ log.write("... \n",end="",show_time=False, verbose=verbose)
859
+ log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.", verbose=verbose)
860
+ else:
861
+ log.write(" -All variants are already normalized..", verbose=verbose)
862
+ except:
863
+ pass
864
+
865
+ ##########################################################################################################################################################
866
+ elif mode=="s":
823
867
  if sum(variants_to_check)<10000:
824
868
  n_cores=1
825
869
  pool = Pool(n_cores)
@@ -829,35 +873,36 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
829
873
  normalized_pd = pd.concat(pool.map(map_func,df_split))
830
874
  pool.close()
831
875
  pool.join()
832
- ###############################################################################################################
833
-
834
- before_normalize = sumstats.loc[variants_to_check,[ea,nea]]
835
- changed_num = len(normalized_pd.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),:])
836
- if changed_num>0:
837
- if snpid in sumstats.columns:
838
- before_normalize_id = sumstats.loc[variants_to_check,snpid]
839
- elif rsid in sumstats.columns:
840
- before_normalize_id = sumstats.loc[variants_to_check,rsid]
841
- else:
842
- before_normalize_id = pd.DataFrame(sumstats.index[variants_to_check],index=sumstats.index[variants_to_check])
876
+
877
+ before_normalize = sumstats.loc[variants_to_check,[ea,nea]]
878
+ changed_num = len(normalized_pd.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),:])
879
+ if changed_num>0:
880
+ if snpid in sumstats.columns:
881
+ before_normalize_id = sumstats.loc[variants_to_check,snpid]
882
+ elif rsid in sumstats.columns:
883
+ before_normalize_id = sumstats.loc[variants_to_check,rsid]
884
+ else:
885
+ before_normalize_id = pd.DataFrame(sumstats.index[variants_to_check],index=sumstats.index[variants_to_check])
886
+
887
+ log.write(" -Not normalized allele IDs:",end="", verbose=verbose)
888
+ for i in before_normalize_id.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea])].head().values:
889
+ log.write(i,end=" ",show_time=False)
890
+ log.write("... \n",end="",show_time=False, verbose=verbose)
843
891
 
844
- log.write(" -Not normalized allele IDs:",end="", verbose=verbose)
845
- for i in before_normalize_id.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea])].head().values:
846
- log.write(i,end=" ",show_time=False)
847
- log.write("... \n",end="",show_time=False, verbose=verbose)
848
-
849
- log.write(" -Not normalized allele:",end="", verbose=verbose)
850
- for i in before_normalize.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),[ea,nea]].head().values:
851
- log.write(i,end="",show_time=False, verbose=verbose)
852
- log.write("... \n",end="",show_time=False, verbose=verbose)
853
- log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.", verbose=verbose)
854
- else:
855
- log.write(" -All variants are already normalized..", verbose=verbose)
856
- ###################################################################################################################
892
+ log.write(" -Not normalized allele:",end="", verbose=verbose)
893
+ for i in before_normalize.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),[ea,nea]].head().values:
894
+ log.write(i,end="",show_time=False, verbose=verbose)
895
+ log.write("... \n",end="",show_time=False, verbose=verbose)
896
+ log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.", verbose=verbose)
897
+ else:
898
+ log.write(" -All variants are already normalized..", verbose=verbose)
899
+ ###################################################################################################################
900
+
857
901
  categories = set(sumstats[ea])|set(sumstats[nea]) |set(normalized_pd.loc[:,ea]) |set(normalized_pd.loc[:,nea])
858
902
  sumstats[ea] = pd.Categorical(sumstats[ea],categories = categories)
859
903
  sumstats[nea] = pd.Categorical(sumstats[nea],categories = categories )
860
904
  sumstats.loc[variants_to_check,[pos,nea,ea,status]] = normalized_pd.values
905
+
861
906
  try:
862
907
  sumstats[pos] = sumstats[pos].astype('Int64')
863
908
  except:
@@ -873,6 +918,67 @@ def normalizeallele(sumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS"):
873
918
  sumstats = pd.DataFrame(normalized.to_list(), columns=[pos,nea,ea,status],index=sumstats.index)
874
919
  return sumstats
875
920
 
921
+ def fastnormalizeallele(insumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS",chunk=3000000,log=Log(),verbose=False):
922
+ log.write(" -Number of variants to check:{}".format(len(insumstats)), verbose=verbose)
923
+ log.write(" -Chunk size:{}".format(chunk), verbose=verbose)
924
+ log.write(" -Processing in chunks:",end="", verbose=verbose)
925
+ changed_index = np.array([])
926
+ for part_n in range(len(insumstats)//chunk+1):
927
+ log.write(part_n, end=" ",show_time=False, verbose=verbose)
928
+ insumstats["NEA"] = insumstats["NEA"].astype("string")
929
+ insumstats["EA"] = insumstats["EA"].astype("string")
930
+ insumstats.iloc[part_n*chunk:(part_n+1)*chunk,:],changed_index_single = normalizae_chunk(insumstats.iloc[part_n*chunk:(part_n+1)*chunk,:].copy())
931
+ changed_index = np.concatenate([changed_index,changed_index_single])
932
+ gc.collect()
933
+ log.write("\n",end="",show_time=False, verbose=verbose)
934
+ return insumstats, changed_index
935
+
936
+ def normalizae_chunk(sumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS"):
937
+ # already normalized
938
+
939
+ is_same = sumstats["NEA"] == sumstats["EA"]
940
+ is_normalized = ((sumstats["NEA"].str.len()==1) | (sumstats["EA"].str.len()==1) ) & (~is_same)
941
+
942
+ # a series to keep tracking of variants that are modified
943
+ changed = sumstats["NEA"] != sumstats["NEA"]
944
+
945
+ # right side
946
+ ea_len = sumstats["NEA"].str.len()
947
+ nea_len = sumstats["EA"].str.len()
948
+ max_length=max(ea_len.max(), nea_len.max())
949
+
950
+ for i in range(1, max_length):
951
+ is_pop = (sumstats["NEA"].str[-1] == sumstats["EA"].str[-1]) & (~is_normalized)
952
+ if sum(is_pop)==0:
953
+ break
954
+ if i ==1:
955
+ changed = changed | is_pop
956
+ nea_len[is_pop] = nea_len[is_pop] -1
957
+ ea_len[is_pop] = ea_len[is_pop] -1
958
+ sumstats.loc[is_pop, "NEA"] = sumstats.loc[is_pop,"NEA"].str[:-1]
959
+ sumstats.loc[is_pop, "EA"] = sumstats.loc[is_pop,"EA"].str[:-1]
960
+ is_normalized = ((sumstats["NEA"].str.len()==1) | (sumstats["EA"].str.len()==1) ) & (~is_same)
961
+ gc.collect()
962
+
963
+ # left side
964
+ max_length=max(sumstats["NEA"].str.len().max(), sumstats["EA"].str.len().max())
965
+ for i in range(1, max_length):
966
+ is_pop = (sumstats["NEA"].str[0] == sumstats["EA"].str[0]) & (~is_normalized)
967
+ if sum(is_pop)==0:
968
+ break
969
+ if i ==1:
970
+ changed = changed | is_pop
971
+ sumstats.loc[is_pop, "NEA"] = sumstats.loc[is_pop,"NEA"].str[1:]
972
+ sumstats.loc[is_pop, "EA"] = sumstats.loc[is_pop,"EA"].str[1:]
973
+ sumstats.loc[is_pop, "POS"] = sumstats.loc[is_pop,"POS"] + 1
974
+ is_normalized = ((sumstats["NEA"].str.len()==1) | (sumstats["EA"].str.len()==1) ) & (~is_same)
975
+ gc.collect()
976
+
977
+ sumstats.loc[is_normalized,status] = vchange_status(sumstats.loc[is_normalized, status], 5,"4","0")
978
+ sumstats.loc[is_same,status] = vchange_status(sumstats.loc[is_same, status], 5,"4","3")
979
+ changed_index = sumstats[changed].index
980
+ return sumstats, changed_index.values
981
+
876
982
  def normalizevariant(pos,a,b,status):
877
983
  # single record
878
984
  # https://genome.sph.umich.edu/wiki/Variant_Normalization
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: gwaslab
3
- Version: 3.4.43
3
+ Version: 3.4.45
4
4
  Summary: A collection of handy tools for GWAS SumStats
5
5
  Author-email: Yunye <yunye@gwaslab.com>
6
6
  Project-URL: Homepage, https://cloufield.github.io/gwaslab/
@@ -8,7 +8,7 @@ Project-URL: Github, https://github.com/Cloufield/gwaslab
8
8
  Classifier: Programming Language :: Python :: 3
9
9
  Classifier: License :: OSI Approved :: MIT License
10
10
  Classifier: Operating System :: OS Independent
11
- Requires-Python: <=3.10,>=3.9
11
+ Requires-Python: <3.11,>=3.9
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
14
  License-File: LICENSE_before_v3.4.39
@@ -17,7 +17,7 @@ Requires-Dist: numpy>=1.21.2
17
17
  Requires-Dist: matplotlib!=3.7.2,>=3.5
18
18
  Requires-Dist: seaborn>=0.12
19
19
  Requires-Dist: scipy>=1.12
20
- Requires-Dist: pySAM<0.20,>=0.18.1
20
+ Requires-Dist: pySAM==0.22.1
21
21
  Requires-Dist: Biopython>=1.79
22
22
  Requires-Dist: adjustText<=0.8,>=0.7.3
23
23
  Requires-Dist: liftover>=1.1.13
@@ -51,7 +51,7 @@ Warning: Known issues of GWASLab are summarized in [https://cloufield.github.io/
51
51
  ### install via pip
52
52
 
53
53
  ```
54
- pip install gwaslab==3.4.41
54
+ pip install gwaslab==3.4.43
55
55
  ```
56
56
 
57
57
  ```python
@@ -90,7 +90,7 @@ Create a Python 3.9 environment and install gwaslab using pip:
90
90
  ```
91
91
  conda env create -n gwaslab_test -c conda-forge python=3.9
92
92
  conda activate gwaslab
93
- pip install gwaslab==3.4.41
93
+ pip install gwaslab==3.4.43
94
94
  ```
95
95
 
96
96
  or create a new environment using yml file [environment_3.4.40.yml](https://github.com/Cloufield/gwaslab/blob/main/environment_3.4.40.yml)
@@ -3,7 +3,7 @@ numpy>=1.21.2
3
3
  matplotlib!=3.7.2,>=3.5
4
4
  seaborn>=0.12
5
5
  scipy>=1.12
6
- pySAM<0.20,>=0.18.1
6
+ pySAM==0.22.1
7
7
  Biopython>=1.79
8
8
  adjustText<=0.8,>=0.7.3
9
9
  liftover>=1.1.13
File without changes
File without changes
File without changes
File without changes