gwaslab 3.4.41__py3-none-any.whl → 3.4.42__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/bd_common_data.py +14 -1
- gwaslab/bd_get_hapmap3.py +7 -3
- gwaslab/g_Sumstats.py +156 -138
- gwaslab/g_SumstatsPair.py +15 -15
- gwaslab/g_version.py +2 -2
- gwaslab/hm_harmonize_sumstats.py +365 -12
- gwaslab/io_read_tabular.py +7 -7
- gwaslab/io_to_formats.py +96 -21
- gwaslab/io_to_pickle.py +1 -1
- gwaslab/ldsc_ldscore.py +1 -1
- gwaslab/qc_fix_sumstats.py +2 -2
- gwaslab/util_ex_calculate_ldmatrix.py +2 -2
- gwaslab/util_ex_calculate_prs.py +2 -2
- gwaslab/util_ex_ldsc.py +163 -110
- gwaslab/util_ex_plink_filter.py +2 -2
- gwaslab/util_ex_run_clumping.py +2 -2
- gwaslab/util_in_filter_value.py +27 -9
- gwaslab/viz_plot_trumpetplot.py +115 -4
- {gwaslab-3.4.41.dist-info → gwaslab-3.4.42.dist-info}/METADATA +6 -3
- {gwaslab-3.4.41.dist-info → gwaslab-3.4.42.dist-info}/RECORD +24 -24
- {gwaslab-3.4.41.dist-info → gwaslab-3.4.42.dist-info}/WHEEL +1 -1
- {gwaslab-3.4.41.dist-info → gwaslab-3.4.42.dist-info}/LICENSE +0 -0
- {gwaslab-3.4.41.dist-info → gwaslab-3.4.42.dist-info}/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.4.41.dist-info → gwaslab-3.4.42.dist-info}/top_level.txt +0 -0
gwaslab/io_to_formats.py
CHANGED
|
@@ -2,6 +2,7 @@ import pandas as pd
|
|
|
2
2
|
import yaml
|
|
3
3
|
import hashlib
|
|
4
4
|
import copy
|
|
5
|
+
import gzip
|
|
5
6
|
from pysam import tabix_compress
|
|
6
7
|
from pysam import tabix_index
|
|
7
8
|
from datetime import datetime
|
|
@@ -306,26 +307,30 @@ def tofmt(sumstats,
|
|
|
306
307
|
vcf_header = _process_vcf_header(sumstats, meta, meta_data, build, log, verbose)
|
|
307
308
|
|
|
308
309
|
log.write(" -Writing sumstats to: {}...".format(path),verbose=verbose)
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
310
|
+
try:
|
|
311
|
+
fast_to_vcf(sumstats, path, vcf_header, output_format, meta_data, meta)
|
|
312
|
+
except:
|
|
313
|
+
log.write(f"Error in using fast_to_vcf. Falling back to original implementation.",verbose=verbose)
|
|
314
|
+
# output header
|
|
315
|
+
with open(path,"w") as file:
|
|
316
|
+
file.write(vcf_header)
|
|
317
|
+
|
|
318
|
+
with open(path,"a") as file:
|
|
319
|
+
log.write(" -Output columns:"," ".join(meta_data["format_fixed"]+[meta["gwaslab"]["study_name"]]))
|
|
320
|
+
file.write("\t".join(meta_data["format_fixed"]+[meta["gwaslab"]["study_name"]])+"\n")
|
|
321
|
+
log.write(" -Outputing data...")
|
|
322
|
+
QUAL="."
|
|
323
|
+
FILTER="PASS"
|
|
324
|
+
for index,row in sumstats.iterrows():
|
|
325
|
+
CHROM=str(row["#CHROM"])
|
|
326
|
+
POS=str(row["POS"])
|
|
327
|
+
ID=str(row["ID"])
|
|
328
|
+
REF=str(row["REF"])
|
|
329
|
+
ALT=str(row["ALT"])
|
|
330
|
+
INFO=str(row["INFO"])
|
|
331
|
+
FORMAT=":".join(output_format)
|
|
332
|
+
DATA=":".join(row[output_format].astype("string"))
|
|
333
|
+
file.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, DATA))
|
|
329
334
|
_bgzip_tabix_md5sum(path, fmt, bgzip, md5sum, tabix, tabix_indexargs, log, verbose)
|
|
330
335
|
|
|
331
336
|
####################################################################################################################
|
|
@@ -342,7 +347,11 @@ def tofmt(sumstats,
|
|
|
342
347
|
sumstats,to_csvargs = _configure_output_cols_and_args(sumstats, rename_dictionary, cols, no_status, path, meta_data, to_csvargs, log, verbose)
|
|
343
348
|
|
|
344
349
|
log.write(" -Writing sumstats to: {}...".format(path),verbose=verbose)
|
|
345
|
-
|
|
350
|
+
try:
|
|
351
|
+
fast_to_csv(sumstats, path, to_csvargs=to_csvargs, compress=True, write_in_chunks=True)
|
|
352
|
+
except:
|
|
353
|
+
log.write(f"Error in using fast_to_csv. Falling back to original implementation.",verbose=verbose)
|
|
354
|
+
sumstats.to_csv(path, index=None, **to_csvargs)
|
|
346
355
|
|
|
347
356
|
if md5sum == True:
|
|
348
357
|
md5_value = md5sum_file(path,log,verbose)
|
|
@@ -353,6 +362,72 @@ def tofmt(sumstats,
|
|
|
353
362
|
_configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value, ymal_path, log, verbose)
|
|
354
363
|
|
|
355
364
|
return sumstats
|
|
365
|
+
|
|
366
|
+
####################################################################################################################
|
|
367
|
+
def fast_to_csv(dataframe, path, to_csvargs=None, compress=True, write_in_chunks=True):
    """Write ``dataframe`` as delimited text using one bulk ``%`` formatting call.

    Only the ``sep`` and ``na_rep`` entries of ``to_csvargs`` are honoured; any
    other entry is ignored by this fast path.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to write. It is never modified by this function.
    path : str
        Output path. A trailing ``.gz`` is stripped first; the file is then
        written to ``<stem>.gz`` when ``compress`` is true, else to ``<stem>``.
    to_csvargs : dict, optional
        ``sep`` (default: tab) is the field delimiter; ``na_rep``, when
        present, replaces every missing value.
    compress : bool
        Write gzip-compressed output (compression level 1) when true.
    write_in_chunks : bool
        Split the output text into 50,000,000-character pieces before writing.
    """
    if to_csvargs is None:
        to_csvargs = {}
    sep = to_csvargs.get('sep', '\t')

    if path.endswith(".gz"):
        path = path[:-3]

    values = dataframe.to_numpy()

    # Replacing NaNs slows things down a bit, but it is necessary to stay consistent
    # with pandas.to_csv(). pd.isna is used because np.isnan does not work with
    # 'object' and 'string' dtypes.
    if 'na_rep' in to_csvargs:
        # astype(object) always returns a new array: to_numpy() may hand back a view
        # of the caller's data, and a numeric array cannot hold a string placeholder.
        values = values.astype(object)
        values[pd.isna(values)] = to_csvargs['na_rep']

    # np.savetxt() is faster than df.to_csv, but it loops over the rows and formats
    # each one individually. Building one format string for the whole table and
    # applying it in a single call is faster still.
    n_rows, n_cols = values.shape
    out_string = sep.join(map(str, dataframe.columns)) + '\n'
    if n_rows:
        row_fmt = sep.join(['%s'] * n_cols)        # formatting for one single row
        table_fmt = '\n'.join([row_fmt] * n_rows)  # replicate it for all rows
        out_string += table_fmt % tuple(values.ravel()) + '\n'
    # An empty table yields the header line only, with no trailing blank line.

    if write_in_chunks:
        chunk_size = 50000000
        chunks = [out_string[start:start + chunk_size]
                  for start in range(0, len(out_string), chunk_size)]
    else:
        chunks = [out_string]

    if compress:
        with gzip.open(path + ".gz", 'wb', compresslevel=1) as f:
            f.writelines(chunk.encode() for chunk in chunks)
    else:
        with open(path, 'w') as f:
            f.writelines(chunks)
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
def fast_to_vcf(dataframe, path, vcf_header, output_format, meta_data, meta):
    """Write ``dataframe`` to ``path`` as uncompressed VCF text in one formatting call.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``#CHROM``, ``POS``, ``ID``, ``REF``, ``ALT``,
        ``INFO`` and every name listed in ``output_format``.
    path : str
        Destination file; it is opened in ``'w'`` mode and overwritten.
    vcf_header : str
        Text written verbatim before the column-header line.
    output_format : sequence of str
        Column names that are joined with ``:`` to form the FORMAT field and
        whose values form the per-sample field.
    meta_data : dict
        ``meta_data["format_fixed"]`` lists the fixed VCF column names.
    meta : dict
        ``meta["gwaslab"]["study_name"]`` is used as the sample column name.
    """
    output_format = list(output_format)

    # Get the columns in the right order and convert to numpy
    values = dataframe[['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'INFO'] + output_format].to_numpy()

    sep = '\t'
    QUAL = "."
    FILTER = "PASS"
    # The FORMAT keys are embedded literally in a %-template, so '%' must be escaped.
    format_keys = ":".join(output_format).replace('%', '%%')
    sample_fmt = ':'.join(['%s'] * len(output_format))

    # Build the row template from a field list instead of splitting a
    # space-separated string, so a space inside a FORMAT key is not turned into a tab.
    row_fmt = sep.join(['%s'] * 5 + [QUAL, FILTER, '%s', format_keys, sample_fmt])

    out_string = vcf_header
    out_string += sep.join(meta_data["format_fixed"] + [meta["gwaslab"]["study_name"]]) + "\n"

    n_rows = dataframe.shape[0]
    if n_rows:
        table_fmt = '\n'.join([row_fmt] * n_rows)  # replicate the row template for all rows
        out_string += table_fmt % tuple(values.ravel()) + '\n'
    # With no records, nothing follows the column-header line (no blank line).

    with open(path, 'w') as f:
        f.write(out_string)
|
|
430
|
+
|
|
356
431
|
####################################################################################################################
|
|
357
432
|
def _configure_output_cols_and_args(sumstats, rename_dictionary, cols, no_status, path, meta_data, to_csvargs, log, verbose):
|
|
358
433
|
# grab format cols that exist in sumstats
|
gwaslab/io_to_pickle.py
CHANGED
|
@@ -13,7 +13,7 @@ def dump_pickle(glsumstats,path="~/mysumstats.pickle",overwrite=False):
|
|
|
13
13
|
with open(path, 'wb') as file:
|
|
14
14
|
glsumstats.log.write(" -Dump the Sumstats Object to : ", path)
|
|
15
15
|
pickle.dump(glsumstats, file)
|
|
16
|
-
|
|
16
|
+
glsumstats.log.write("Finished dumping.")
|
|
17
17
|
|
|
18
18
|
def load_pickle(path):
|
|
19
19
|
if os.path.exists(path):
|
gwaslab/ldsc_ldscore.py
CHANGED
|
@@ -296,7 +296,7 @@ class PlinkBEDFile(__GenotypeArrayInMemory__):
|
|
|
296
296
|
nru_new = n_new + e
|
|
297
297
|
nru = self.nru
|
|
298
298
|
z = ba.bitarray(m*2*nru_new, endian="little")
|
|
299
|
-
|
|
299
|
+
z.setall(0)
|
|
300
300
|
for e, i in enumerate(keep_indivs):
|
|
301
301
|
z[2*e::2*nru_new] = geno[2*i::2*nru]
|
|
302
302
|
z[2*e+1::2*nru_new] = geno[2*i+1::2*nru]
|
gwaslab/qc_fix_sumstats.py
CHANGED
|
@@ -1532,7 +1532,7 @@ def start_to(sumstats,
|
|
|
1532
1532
|
ref_fasta=None,
|
|
1533
1533
|
n_cores=None,
|
|
1534
1534
|
ref_tsv=None,
|
|
1535
|
-
**
|
|
1535
|
+
**kwargs
|
|
1536
1536
|
):
|
|
1537
1537
|
|
|
1538
1538
|
log.write("Start to {}...{}".format(start_line,_get_version()), verbose=verbose)
|
|
@@ -1557,7 +1557,7 @@ def start_to(sumstats,
|
|
|
1557
1557
|
log.write(" -Reference TSV: {}".format(ref_tsv))
|
|
1558
1558
|
|
|
1559
1559
|
is_args_valid = True
|
|
1560
|
-
for key, value in
|
|
1560
|
+
for key, value in kwargs.items():
|
|
1561
1561
|
is_args_valid = is_args_valid & check_arg(log, verbose, key, value, start_function)
|
|
1562
1562
|
is_enough_col = is_args_valid & is_enough_col
|
|
1563
1563
|
|
|
@@ -27,7 +27,7 @@ def tofinemapping(sumstats,
|
|
|
27
27
|
log=Log(),
|
|
28
28
|
suffixes=None,
|
|
29
29
|
verbose=True,
|
|
30
|
-
**
|
|
30
|
+
**kwargs):
|
|
31
31
|
##start function with col checking##########################################################
|
|
32
32
|
_start_line = "calculate LD matrix"
|
|
33
33
|
_end_line = "calculating LD matrix"
|
|
@@ -84,7 +84,7 @@ def tofinemapping(sumstats,
|
|
|
84
84
|
n_cores=n_cores,
|
|
85
85
|
log=log,
|
|
86
86
|
load_bim=True,
|
|
87
|
-
overwrite=overwrite,**
|
|
87
|
+
overwrite=overwrite,**kwargs)
|
|
88
88
|
|
|
89
89
|
## check available snps with reference file
|
|
90
90
|
matched_sumstats = _align_sumstats_with_bim(row=row,
|
gwaslab/util_ex_calculate_prs.py
CHANGED
|
@@ -18,7 +18,7 @@ def _calculate_prs(sumstats,
|
|
|
18
18
|
memory=None,
|
|
19
19
|
overwrite=False,
|
|
20
20
|
mode=None,delete=True,
|
|
21
|
-
log=Log(),**
|
|
21
|
+
log=Log(),**kwargs):
|
|
22
22
|
|
|
23
23
|
#matching_alleles
|
|
24
24
|
#read_bim
|
|
@@ -37,7 +37,7 @@ def _calculate_prs(sumstats,
|
|
|
37
37
|
n_cores=n_cores,
|
|
38
38
|
log=log,
|
|
39
39
|
load_bim=False,
|
|
40
|
-
overwrite=overwrite,**
|
|
40
|
+
overwrite=overwrite,**kwargs)
|
|
41
41
|
score_file_path_list =[]
|
|
42
42
|
for index, chrom in enumerate(chrlist):
|
|
43
43
|
chr_sumstats = sumstats.loc[sumstats["CHR"]==chrom,:].copy()
|