gwaslab 3.4.41__py3-none-any.whl → 3.4.43__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/bd_common_data.py +14 -1
- gwaslab/bd_get_hapmap3.py +7 -3
- gwaslab/cache_manager.py +687 -0
- gwaslab/g_Sumstats.py +156 -138
- gwaslab/g_SumstatsPair.py +15 -15
- gwaslab/g_version.py +2 -2
- gwaslab/hm_harmonize_sumstats.py +558 -32
- gwaslab/io_read_tabular.py +7 -7
- gwaslab/io_to_formats.py +96 -21
- gwaslab/io_to_pickle.py +1 -1
- gwaslab/ldsc_ldscore.py +1 -1
- gwaslab/qc_fix_sumstats.py +4 -11
- gwaslab/util_ex_calculate_ldmatrix.py +2 -2
- gwaslab/util_ex_calculate_prs.py +2 -2
- gwaslab/util_ex_ldsc.py +163 -110
- gwaslab/util_ex_plink_filter.py +2 -2
- gwaslab/util_ex_run_clumping.py +2 -2
- gwaslab/util_in_filter_value.py +27 -9
- gwaslab/viz_plot_mqqplot.py +12 -11
- gwaslab/viz_plot_trumpetplot.py +115 -4
- {gwaslab-3.4.41.dist-info → gwaslab-3.4.43.dist-info}/METADATA +8 -3
- {gwaslab-3.4.41.dist-info → gwaslab-3.4.43.dist-info}/RECORD +26 -25
- {gwaslab-3.4.41.dist-info → gwaslab-3.4.43.dist-info}/WHEEL +1 -1
- {gwaslab-3.4.41.dist-info → gwaslab-3.4.43.dist-info}/LICENSE +0 -0
- {gwaslab-3.4.41.dist-info → gwaslab-3.4.43.dist-info}/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.4.41.dist-info → gwaslab-3.4.43.dist-info}/top_level.txt +0 -0
gwaslab/io_read_tabular.py
CHANGED
|
@@ -3,30 +3,30 @@ from gwaslab.bd_common_data import get_formats_list
|
|
|
3
3
|
from gwaslab.g_Log import Log
|
|
4
4
|
from gwaslab.bd_common_data import get_format_dict
|
|
5
5
|
|
|
6
|
-
def _read_tabular(path, fmt, **
|
|
6
|
+
def _read_tabular(path, fmt, **kwargs):
|
|
7
7
|
|
|
8
8
|
# default
|
|
9
9
|
load_args_dict = {"sep":"\t",
|
|
10
10
|
"header":None}
|
|
11
11
|
|
|
12
12
|
# if specified by user
|
|
13
|
-
if len(
|
|
14
|
-
load_args_dict =
|
|
13
|
+
if len(kwargs)>0:
|
|
14
|
+
load_args_dict = kwargs
|
|
15
15
|
|
|
16
16
|
# load format
|
|
17
17
|
meta_data, rename_dictionary = get_format_dict(fmt)
|
|
18
18
|
|
|
19
|
-
if "format_separator" in meta_data and "sep" not in
|
|
19
|
+
if "format_separator" in meta_data and "sep" not in kwargs:
|
|
20
20
|
load_args_dict["sep"] = meta_data["format_separator"]
|
|
21
21
|
|
|
22
|
-
if "format_comment" in meta_data and "comment" not in
|
|
22
|
+
if "format_comment" in meta_data and "comment" not in kwargs:
|
|
23
23
|
if meta_data["format_comment"] is not None:
|
|
24
24
|
load_args_dict["comment"] = meta_data["format_comment"]
|
|
25
25
|
|
|
26
|
-
if "format_header" in meta_data and "header" not in
|
|
26
|
+
if "format_header" in meta_data and "header" not in kwargs:
|
|
27
27
|
load_args_dict["header"] = meta_data["format_header"]
|
|
28
28
|
|
|
29
|
-
if "format_na" in meta_data and "na_values" not in
|
|
29
|
+
if "format_na" in meta_data and "na_values" not in kwargs:
|
|
30
30
|
if meta_data["format_na"] is not None:
|
|
31
31
|
load_args_dict["na_values"] = meta_data["format_na"]
|
|
32
32
|
|
gwaslab/io_to_formats.py
CHANGED
|
@@ -2,6 +2,7 @@ import pandas as pd
|
|
|
2
2
|
import yaml
|
|
3
3
|
import hashlib
|
|
4
4
|
import copy
|
|
5
|
+
import gzip
|
|
5
6
|
from pysam import tabix_compress
|
|
6
7
|
from pysam import tabix_index
|
|
7
8
|
from datetime import datetime
|
|
@@ -306,26 +307,30 @@ def tofmt(sumstats,
|
|
|
306
307
|
vcf_header = _process_vcf_header(sumstats, meta, meta_data, build, log, verbose)
|
|
307
308
|
|
|
308
309
|
log.write(" -Writing sumstats to: {}...".format(path),verbose=verbose)
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
310
|
+
try:
|
|
311
|
+
fast_to_vcf(sumstats, path, vcf_header, output_format, meta_data, meta)
|
|
312
|
+
except:
|
|
313
|
+
log.write(f"Error in using fast_to_vcf. Falling back to original implementation.",verbose=verbose)
|
|
314
|
+
# output header
|
|
315
|
+
with open(path,"w") as file:
|
|
316
|
+
file.write(vcf_header)
|
|
317
|
+
|
|
318
|
+
with open(path,"a") as file:
|
|
319
|
+
log.write(" -Output columns:"," ".join(meta_data["format_fixed"]+[meta["gwaslab"]["study_name"]]))
|
|
320
|
+
file.write("\t".join(meta_data["format_fixed"]+[meta["gwaslab"]["study_name"]])+"\n")
|
|
321
|
+
log.write(" -Outputing data...")
|
|
322
|
+
QUAL="."
|
|
323
|
+
FILTER="PASS"
|
|
324
|
+
for index,row in sumstats.iterrows():
|
|
325
|
+
CHROM=str(row["#CHROM"])
|
|
326
|
+
POS=str(row["POS"])
|
|
327
|
+
ID=str(row["ID"])
|
|
328
|
+
REF=str(row["REF"])
|
|
329
|
+
ALT=str(row["ALT"])
|
|
330
|
+
INFO=str(row["INFO"])
|
|
331
|
+
FORMAT=":".join(output_format)
|
|
332
|
+
DATA=":".join(row[output_format].astype("string"))
|
|
333
|
+
file.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, DATA))
|
|
329
334
|
_bgzip_tabix_md5sum(path, fmt, bgzip, md5sum, tabix, tabix_indexargs, log, verbose)
|
|
330
335
|
|
|
331
336
|
####################################################################################################################
|
|
@@ -342,7 +347,11 @@ def tofmt(sumstats,
|
|
|
342
347
|
sumstats,to_csvargs = _configure_output_cols_and_args(sumstats, rename_dictionary, cols, no_status, path, meta_data, to_csvargs, log, verbose)
|
|
343
348
|
|
|
344
349
|
log.write(" -Writing sumstats to: {}...".format(path),verbose=verbose)
|
|
345
|
-
|
|
350
|
+
try:
|
|
351
|
+
fast_to_csv(sumstats, path, to_csvargs=to_csvargs, compress=True, write_in_chunks=True)
|
|
352
|
+
except:
|
|
353
|
+
log.write(f"Error in using fast_to_csv. Falling back to original implementation.",verbose=verbose)
|
|
354
|
+
sumstats.to_csv(path, index=None, **to_csvargs)
|
|
346
355
|
|
|
347
356
|
if md5sum == True:
|
|
348
357
|
md5_value = md5sum_file(path,log,verbose)
|
|
@@ -353,6 +362,72 @@ def tofmt(sumstats,
|
|
|
353
362
|
_configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value, ymal_path, log, verbose)
|
|
354
363
|
|
|
355
364
|
return sumstats
|
|
365
|
+
|
|
366
|
+
####################################################################################################################
|
|
367
|
+
def fast_to_csv(dataframe, path, to_csvargs=None, compress=True, write_in_chunks=True):
|
|
368
|
+
df_numpy = dataframe.to_numpy()
|
|
369
|
+
|
|
370
|
+
if path.endswith(".gz"):
|
|
371
|
+
path = path[:-3]
|
|
372
|
+
|
|
373
|
+
if to_csvargs is None:
|
|
374
|
+
to_csvargs = {}
|
|
375
|
+
|
|
376
|
+
if 'sep' in to_csvargs:
|
|
377
|
+
sep = to_csvargs['sep']
|
|
378
|
+
else:
|
|
379
|
+
sep = '\t'
|
|
380
|
+
|
|
381
|
+
# this operation slows the process down a bit, but it is necessary to stay consistent with the pandas.to_csv() behavior
|
|
382
|
+
if 'na_rep' in to_csvargs:
|
|
383
|
+
df_numpy[pd.isna(df_numpy)] = to_csvargs['na_rep'] # replace NaNs. We have to use pd.isna because np.isnan does not work with 'object' and 'string' dtypes
|
|
384
|
+
|
|
385
|
+
# np.savetxt() is faster than df.to_csv; however, it loops through the rows of X and formats each row individually:
|
|
386
|
+
# https://github.com/numpy/numpy/blob/d35cd07ea997f033b2d89d349734c61f5de54b0d/numpy/lib/npyio.py#L1613
|
|
387
|
+
# We can speed up the process by building the whole format string and then applying the formatting in one single call
|
|
388
|
+
out_string = sep.join(dataframe.columns) + '\n'
|
|
389
|
+
fmt = sep.join(['%s']*dataframe.shape[1]) # build formatting for one single row
|
|
390
|
+
fmt = '\n'.join([fmt]*dataframe.shape[0]) # add newline and replicate the formatting for all rows
|
|
391
|
+
out_string += fmt % tuple(df_numpy.ravel()) # flatten the array and then apply formatting
|
|
392
|
+
out_string += '\n'
|
|
393
|
+
|
|
394
|
+
if write_in_chunks:
|
|
395
|
+
chunk_size = 50000000
|
|
396
|
+
lines = [out_string[i:i+chunk_size] for i in range(0, len(out_string), chunk_size)]
|
|
397
|
+
else:
|
|
398
|
+
lines = [out_string]
|
|
399
|
+
|
|
400
|
+
if compress:
|
|
401
|
+
lines = [line.encode() for line in lines]
|
|
402
|
+
with gzip.open(path+".gz", 'wb', compresslevel=1) as f:
|
|
403
|
+
f.writelines(lines)
|
|
404
|
+
else:
|
|
405
|
+
with open(path, 'w') as f:
|
|
406
|
+
f.writelines(lines)
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
def fast_to_vcf(dataframe, path, vcf_header, output_format, meta_data, meta):
|
|
410
|
+
# Get the columns in the right order and convert to numpy
|
|
411
|
+
df_numpy = dataframe[['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'INFO'] + output_format].to_numpy()
|
|
412
|
+
|
|
413
|
+
sep = '\t'
|
|
414
|
+
QUAL = "."
|
|
415
|
+
FILTER = "PASS"
|
|
416
|
+
FORMAT = ":".join(output_format)
|
|
417
|
+
format_format = ':'.join(['%s']*len(output_format))
|
|
418
|
+
|
|
419
|
+
single_row_format = f'%s %s %s %s %s {QUAL} {FILTER} %s {FORMAT} {format_format}'
|
|
420
|
+
|
|
421
|
+
out_string = vcf_header
|
|
422
|
+
out_string += sep.join(meta_data["format_fixed"]+[meta["gwaslab"]["study_name"]]) + "\n"
|
|
423
|
+
fmt = sep.join(single_row_format.split(' ')) # build formatting for one single row
|
|
424
|
+
fmt = '\n'.join([fmt]*dataframe.shape[0]) # add newline and replicate the formatting for all rows
|
|
425
|
+
out_string += fmt % tuple(df_numpy.ravel()) # flatten the array and then apply formatting
|
|
426
|
+
out_string += '\n'
|
|
427
|
+
|
|
428
|
+
with open(path, 'w') as f:
|
|
429
|
+
f.write(out_string)
|
|
430
|
+
|
|
356
431
|
####################################################################################################################
|
|
357
432
|
def _configure_output_cols_and_args(sumstats, rename_dictionary, cols, no_status, path, meta_data, to_csvargs, log, verbose):
|
|
358
433
|
# grab format cols that exist in sumstats
|
gwaslab/io_to_pickle.py
CHANGED
|
@@ -13,7 +13,7 @@ def dump_pickle(glsumstats,path="~/mysumstats.pickle",overwrite=False):
|
|
|
13
13
|
with open(path, 'wb') as file:
|
|
14
14
|
glsumstats.log.write(" -Dump the Sumstats Object to : ", path)
|
|
15
15
|
pickle.dump(glsumstats, file)
|
|
16
|
-
|
|
16
|
+
glsumstats.log.write("Finished dumping.")
|
|
17
17
|
|
|
18
18
|
def load_pickle(path):
|
|
19
19
|
if os.path.exists(path):
|
gwaslab/ldsc_ldscore.py
CHANGED
|
@@ -296,7 +296,7 @@ class PlinkBEDFile(__GenotypeArrayInMemory__):
|
|
|
296
296
|
nru_new = n_new + e
|
|
297
297
|
nru = self.nru
|
|
298
298
|
z = ba.bitarray(m*2*nru_new, endian="little")
|
|
299
|
-
|
|
299
|
+
z.setall(0)
|
|
300
300
|
for e, i in enumerate(keep_indivs):
|
|
301
301
|
z[2*e::2*nru_new] = geno[2*i::2*nru]
|
|
302
302
|
z[2*e+1::2*nru_new] = geno[2*i+1::2*nru]
|
gwaslab/qc_fix_sumstats.py
CHANGED
|
@@ -1532,7 +1532,7 @@ def start_to(sumstats,
|
|
|
1532
1532
|
ref_fasta=None,
|
|
1533
1533
|
n_cores=None,
|
|
1534
1534
|
ref_tsv=None,
|
|
1535
|
-
**
|
|
1535
|
+
**kwargs
|
|
1536
1536
|
):
|
|
1537
1537
|
|
|
1538
1538
|
log.write("Start to {}...{}".format(start_line,_get_version()), verbose=verbose)
|
|
@@ -1557,7 +1557,7 @@ def start_to(sumstats,
|
|
|
1557
1557
|
log.write(" -Reference TSV: {}".format(ref_tsv))
|
|
1558
1558
|
|
|
1559
1559
|
is_args_valid = True
|
|
1560
|
-
for key, value in
|
|
1560
|
+
for key, value in kwargs.items():
|
|
1561
1561
|
is_args_valid = is_args_valid & check_arg(log, verbose, key, value, start_function)
|
|
1562
1562
|
is_enough_col = is_args_valid & is_enough_col
|
|
1563
1563
|
|
|
@@ -1611,12 +1611,5 @@ def check_col(df_col_names, verbose=True, log=Log(), cols=None, function=None):
|
|
|
1611
1611
|
|
|
1612
1612
|
###############################################################################################################
|
|
1613
1613
|
def _df_split(dataframe, n):
|
|
1614
|
-
|
|
1615
|
-
|
|
1616
|
-
|
|
1617
|
-
for index in range(0, dataframe.shape[0], chunk_size):
|
|
1618
|
-
chunks.append(
|
|
1619
|
-
dataframe.iloc[index:index + chunk_size]
|
|
1620
|
-
)
|
|
1621
|
-
|
|
1622
|
-
return chunks
|
|
1614
|
+
k, m = divmod(len(dataframe), n)
|
|
1615
|
+
return [dataframe.iloc[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)]
|
|
@@ -27,7 +27,7 @@ def tofinemapping(sumstats,
|
|
|
27
27
|
log=Log(),
|
|
28
28
|
suffixes=None,
|
|
29
29
|
verbose=True,
|
|
30
|
-
**
|
|
30
|
+
**kwargs):
|
|
31
31
|
##start function with col checking##########################################################
|
|
32
32
|
_start_line = "calculate LD matrix"
|
|
33
33
|
_end_line = "calculating LD matrix"
|
|
@@ -84,7 +84,7 @@ def tofinemapping(sumstats,
|
|
|
84
84
|
n_cores=n_cores,
|
|
85
85
|
log=log,
|
|
86
86
|
load_bim=True,
|
|
87
|
-
overwrite=overwrite,**
|
|
87
|
+
overwrite=overwrite,**kwargs)
|
|
88
88
|
|
|
89
89
|
## check available snps with reference file
|
|
90
90
|
matched_sumstats = _align_sumstats_with_bim(row=row,
|
gwaslab/util_ex_calculate_prs.py
CHANGED
|
@@ -18,7 +18,7 @@ def _calculate_prs(sumstats,
|
|
|
18
18
|
memory=None,
|
|
19
19
|
overwrite=False,
|
|
20
20
|
mode=None,delete=True,
|
|
21
|
-
log=Log(),**
|
|
21
|
+
log=Log(),**kwargs):
|
|
22
22
|
|
|
23
23
|
#matching_alleles
|
|
24
24
|
#read_bim
|
|
@@ -37,7 +37,7 @@ def _calculate_prs(sumstats,
|
|
|
37
37
|
n_cores=n_cores,
|
|
38
38
|
log=log,
|
|
39
39
|
load_bim=False,
|
|
40
|
-
overwrite=overwrite,**
|
|
40
|
+
overwrite=overwrite,**kwargs)
|
|
41
41
|
score_file_path_list =[]
|
|
42
42
|
for index, chrom in enumerate(chrlist):
|
|
43
43
|
chr_sumstats = sumstats.loc[sumstats["CHR"]==chrom,:].copy()
|