gwaslab 3.4.41__py3-none-any.whl → 3.4.43__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.

@@ -3,30 +3,30 @@ from gwaslab.bd_common_data import get_formats_list
 from gwaslab.g_Log import Log
 from gwaslab.bd_common_data import get_format_dict
 
-def _read_tabular(path, fmt, **args):
+def _read_tabular(path, fmt, **kwargs):
 
     # default
     load_args_dict = {"sep":"\t",
                       "header":None}
 
     # if specified by user
-    if len(args)>0:
-        load_args_dict = args
+    if len(kwargs)>0:
+        load_args_dict = kwargs
 
     # load format
     meta_data, rename_dictionary = get_format_dict(fmt)
 
-    if "format_separator" in meta_data and "sep" not in args:
+    if "format_separator" in meta_data and "sep" not in kwargs:
         load_args_dict["sep"] = meta_data["format_separator"]
 
-    if "format_comment" in meta_data and "comment" not in args:
+    if "format_comment" in meta_data and "comment" not in kwargs:
         if meta_data["format_comment"] is not None:
             load_args_dict["comment"] = meta_data["format_comment"]
 
-    if "format_header" in meta_data and "header" not in args:
+    if "format_header" in meta_data and "header" not in kwargs:
         load_args_dict["header"] = meta_data["format_header"]
 
-    if "format_na" in meta_data and "na_values" not in args:
+    if "format_na" in meta_data and "na_values" not in kwargs:
         if meta_data["format_na"] is not None:
             load_args_dict["na_values"] = meta_data["format_na"]
 
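A note on the behavior this hunk preserves: when the caller passes any keyword arguments, _read_tabular replaces the default load_args_dict wholesale rather than merging into it. A minimal sketch of that pattern (the helper name _load is illustrative, not part of gwaslab):

def _load(path, **kwargs):
    # defaults, mirroring load_args_dict above
    load_args = {"sep": "\t", "header": None}
    if len(kwargs) > 0:
        load_args = kwargs  # replaces the defaults, does not merge
    return load_args

print(_load("sumstats.tsv"))           # {'sep': '\t', 'header': None}
print(_load("sumstats.tsv", sep=","))  # {'sep': ','} -- the 'header' default is dropped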
gwaslab/io_to_formats.py CHANGED
@@ -2,6 +2,7 @@ import pandas as pd
 import yaml
 import hashlib
 import copy
+import gzip
 from pysam import tabix_compress
 from pysam import tabix_index
 from datetime import datetime
@@ -306,26 +307,30 @@ def tofmt(sumstats,
         vcf_header = _process_vcf_header(sumstats, meta, meta_data, build, log, verbose)
 
         log.write(" -Writing sumstats to: {}...".format(path),verbose=verbose)
-        # output header
-        with open(path,"w") as file:
-            file.write(vcf_header)
-
-        with open(path,"a") as file:
-            log.write(" -Output columns:"," ".join(meta_data["format_fixed"]+[meta["gwaslab"]["study_name"]]))
-            file.write("\t".join(meta_data["format_fixed"]+[meta["gwaslab"]["study_name"]])+"\n")
-            log.write(" -Outputing data...")
-            QUAL="."
-            FILTER="PASS"
-            for index,row in sumstats.iterrows():
-                CHROM=str(row["#CHROM"])
-                POS=str(row["POS"])
-                ID=str(row["ID"])
-                REF=str(row["REF"])
-                ALT=str(row["ALT"])
-                INFO=str(row["INFO"])
-                FORMAT=":".join(output_format)
-                DATA=":".join(row[output_format].astype("string"))
-                file.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, DATA))
+        try:
+            fast_to_vcf(sumstats, path, vcf_header, output_format, meta_data, meta)
+        except:
+            log.write(f"Error in using fast_to_vcf. Falling back to original implementation.",verbose=verbose)
+            # output header
+            with open(path,"w") as file:
+                file.write(vcf_header)
+
+            with open(path,"a") as file:
+                log.write(" -Output columns:"," ".join(meta_data["format_fixed"]+[meta["gwaslab"]["study_name"]]))
+                file.write("\t".join(meta_data["format_fixed"]+[meta["gwaslab"]["study_name"]])+"\n")
+                log.write(" -Outputing data...")
+                QUAL="."
+                FILTER="PASS"
+                for index,row in sumstats.iterrows():
+                    CHROM=str(row["#CHROM"])
+                    POS=str(row["POS"])
+                    ID=str(row["ID"])
+                    REF=str(row["REF"])
+                    ALT=str(row["ALT"])
+                    INFO=str(row["INFO"])
+                    FORMAT=":".join(output_format)
+                    DATA=":".join(row[output_format].astype("string"))
+                    file.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, DATA))
 
         _bgzip_tabix_md5sum(path, fmt, bgzip, md5sum, tabix, tabix_indexargs, log, verbose)
 
     ####################################################################################################################
@@ -342,7 +347,11 @@ def tofmt(sumstats,
         sumstats,to_csvargs = _configure_output_cols_and_args(sumstats, rename_dictionary, cols, no_status, path, meta_data, to_csvargs, log, verbose)
 
         log.write(" -Writing sumstats to: {}...".format(path),verbose=verbose)
-        sumstats.to_csv(path, index=None,**to_csvargs)
+        try:
+            fast_to_csv(sumstats, path, to_csvargs=to_csvargs, compress=True, write_in_chunks=True)
+        except:
+            log.write(f"Error in using fast_to_csv. Falling back to original implementation.",verbose=verbose)
+            sumstats.to_csv(path, index=None, **to_csvargs)
 
         if md5sum == True:
             md5_value = md5sum_file(path,log,verbose)
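Both writers now share the same fast-path-with-fallback shape: attempt the optimized writer and, on any exception, log a message and fall back to pandas. A self-contained sketch of the pattern (fast_writer is a stand-in for fast_to_csv/fast_to_vcf, and the forced failure is only for demonstration):

import pandas as pd

def fast_writer(df, path):
    raise RuntimeError("simulated failure")  # stand-in for the optimized writer

def write_sumstats(df, path):
    try:
        fast_writer(df, path)               # fast path
    except Exception as e:                  # any failure triggers the fallback
        print(f"fast writer failed ({e}); falling back to pandas")
        df.to_csv(path, index=None, sep="\t")

write_sumstats(pd.DataFrame({"P": [0.5]}), "out.tsv")

Note that the released code uses a bare except:, while the sketch narrows it to Exception; a bare except also catches KeyboardInterrupt and SystemExit.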
@@ -353,6 +362,72 @@ def tofmt(sumstats,
         _configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value, ymal_path, log, verbose)
 
     return sumstats
+
+####################################################################################################################
+def fast_to_csv(dataframe, path, to_csvargs=None, compress=True, write_in_chunks=True):
+    df_numpy = dataframe.to_numpy()
+
+    if path.endswith(".gz"):
+        path = path[:-3]
+
+    if to_csvargs is None:
+        to_csvargs = {}
+
+    if 'sep' in to_csvargs:
+        sep = to_csvargs['sep']
+    else:
+        sep = '\t'
+
+    # this operation slows the process down a bit, but it is necessary to stay consistent with the pandas.to_csv() behavior
+    if 'na_rep' in to_csvargs:
+        df_numpy[pd.isna(df_numpy)] = to_csvargs['na_rep'] # replace NaNs. We have to use pd.isna because np.isnan does not work with 'object' and 'string' dtypes
+
+    # np.savetxt() is faster than df.to_csv(), but it loops through the rows of X and formats each row individually:
+    # https://github.com/numpy/numpy/blob/d35cd07ea997f033b2d89d349734c61f5de54b0d/numpy/lib/npyio.py#L1613
+    # We can speed up the process by building the whole format string and then applying the formatting in one single call
+    out_string = sep.join(dataframe.columns) + '\n'
+    fmt = sep.join(['%s']*dataframe.shape[1]) # build formatting for one single row
+    fmt = '\n'.join([fmt]*dataframe.shape[0]) # add newline and replicate the formatting for all rows
+    out_string += fmt % tuple(df_numpy.ravel()) # flatten the array and then apply formatting
+    out_string += '\n'
+
+    if write_in_chunks:
+        chunk_size = 50000000
+        lines = [out_string[i:i+chunk_size] for i in range(0, len(out_string), chunk_size)]
+    else:
+        lines = [out_string]
+
+    if compress:
+        lines = [line.encode() for line in lines]
+        with gzip.open(path+".gz", 'wb', compresslevel=1) as f:
+            f.writelines(lines)
+    else:
+        with open(path, 'w') as f:
+            f.writelines(lines)
+
+
+def fast_to_vcf(dataframe, path, vcf_header, output_format, meta_data, meta):
+    # Get the columns in the right order and convert to numpy
+    df_numpy = dataframe[['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'INFO'] + output_format].to_numpy()
+
+    sep = '\t'
+    QUAL = "."
+    FILTER = "PASS"
+    FORMAT = ":".join(output_format)
+    format_format = ':'.join(['%s']*len(output_format))
+
+    single_row_format = f'%s %s %s %s %s {QUAL} {FILTER} %s {FORMAT} {format_format}'
+
+    out_string = vcf_header
+    out_string += sep.join(meta_data["format_fixed"]+[meta["gwaslab"]["study_name"]]) + "\n"
+    fmt = sep.join(single_row_format.split(' ')) # build formatting for one single row
+    fmt = '\n'.join([fmt]*dataframe.shape[0]) # add newline and replicate the formatting for all rows
+    out_string += fmt % tuple(df_numpy.ravel()) # flatten the array and then apply formatting
+    out_string += '\n'
+
+    with open(path, 'w') as f:
+        f.write(out_string)
+
 
 ####################################################################################################################
 def _configure_output_cols_and_args(sumstats, rename_dictionary, cols, no_status, path, meta_data, to_csvargs, log, verbose):
     # grab format cols that exist in sumstats
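The core speed-up in both fast_to_csv and fast_to_vcf is avoiding per-row formatting: build a '%s' template for one row, replicate it for every row joined by newlines, and apply it in a single % call over the flattened numpy array. A self-contained sketch of the idea with toy columns (the real functions also handle NA replacement, chunked writes, and gzip):

import pandas as pd

df = pd.DataFrame({"CHR": ["1", "2"], "POS": [12345, 67890], "P": [5e-8, 0.3]})
arr = df.to_numpy()                            # object array; values keep their repr

row_fmt = "\t".join(["%s"] * df.shape[1])      # template for a single row
full_fmt = "\n".join([row_fmt] * df.shape[0])  # replicated for all rows
out = "\t".join(df.columns) + "\n" + full_fmt % tuple(arr.ravel()) + "\n"
print(out)  # CHR/POS/P header followed by two tab-separated rows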
gwaslab/io_to_pickle.py CHANGED
@@ -13,7 +13,7 @@ def dump_pickle(glsumstats,path="~/mysumstats.pickle",overwrite=False):
         with open(path, 'wb') as file:
             glsumstats.log.write(" -Dump the Sumstats Object to : ", path)
             pickle.dump(glsumstats, file)
-            Log().write("Finished dumping.")
+            glsumstats.log.write("Finished dumping.")
 
 def load_pickle(path):
     if os.path.exists(path):
gwaslab/ldsc_ldscore.py CHANGED
@@ -296,7 +296,7 @@ class PlinkBEDFile(__GenotypeArrayInMemory__):
         nru_new = n_new + e
         nru = self.nru
         z = ba.bitarray(m*2*nru_new, endian="little")
-        z.setall(0)
+        z.setall(0)
         for e, i in enumerate(keep_indivs):
             z[2*e::2*nru_new] = geno[2*i::2*nru]
             z[2*e+1::2*nru_new] = geno[2*i+1::2*nru]
@@ -1532,7 +1532,7 @@ def start_to(sumstats,
              ref_fasta=None,
              n_cores=None,
              ref_tsv=None,
-             **args
+             **kwargs
              ):
 
     log.write("Start to {}...{}".format(start_line,_get_version()), verbose=verbose)
@@ -1557,7 +1557,7 @@ def start_to(sumstats,
         log.write(" -Reference TSV: {}".format(ref_tsv))
 
     is_args_valid = True
-    for key, value in args.items():
+    for key, value in kwargs.items():
         is_args_valid = is_args_valid & check_arg(log, verbose, key, value, start_function)
     is_enough_col = is_args_valid & is_enough_col
 
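The rename also touches the keyword-validation loop in start_to: each extra keyword is checked and the boolean results are AND-ed together, so a single unrecognized argument invalidates the whole call. A hedged sketch of the accumulation pattern (this check_arg is a simplified stand-in; gwaslab's real one also takes log, verbose, and the calling function):

def check_arg(key, value, allowed=("ref_fasta", "n_cores", "ref_tsv")):
    ok = key in allowed
    if not ok:
        print(f"unrecognized argument: {key}={value!r}")
    return ok

kwargs = {"n_cores": 4, "typo_arg": True}
is_args_valid = True
for key, value in kwargs.items():
    is_args_valid = is_args_valid & check_arg(key, value)
print(is_args_valid)  # False -- 'typo_arg' is not an accepted keyword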
@@ -1611,12 +1611,5 @@ def check_col(df_col_names, verbose=True, log=Log(), cols=None, function=None):
 
 ###############################################################################################################
 def _df_split(dataframe, n):
-    chunks = []
-    chunk_size = int(dataframe.shape[0] // n)+1
-
-    for index in range(0, dataframe.shape[0], chunk_size):
-        chunks.append(
-            dataframe.iloc[index:index + chunk_size]
-        )
-
-    return chunks
+    k, m = divmod(len(dataframe), n)
+    return [dataframe.iloc[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)]
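The rewritten _df_split uses the standard divmod idiom for splitting into n near-equal parts: the first m chunks get k+1 rows and the rest get k. Unlike the old ceiling-division loop, which could return fewer than n chunks (e.g. 10 rows with n=5 gave chunks of 3, 3, 3, 1), it always returns exactly n chunks, empty ones included. A quick check (split_even mirrors the new logic):

import pandas as pd

def split_even(df, n):  # same logic as the new _df_split
    k, m = divmod(len(df), n)
    return [df.iloc[i*k + min(i, m):(i+1)*k + min(i+1, m)] for i in range(n)]

df = pd.DataFrame({"x": range(10)})
print([len(c) for c in split_even(df, 3)])  # [4, 3, 3]
print([len(c) for c in split_even(df, 5)])  # [2, 2, 2, 2, 2]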
@@ -27,7 +27,7 @@ def tofinemapping(sumstats,
                   log=Log(),
                   suffixes=None,
                   verbose=True,
-                  **args):
+                  **kwargs):
     ##start function with col checking##########################################################
     _start_line = "calculate LD matrix"
     _end_line = "calculating LD matrix"
@@ -84,7 +84,7 @@ def tofinemapping(sumstats,
                            n_cores=n_cores,
                            log=log,
                            load_bim=True,
-                           overwrite=overwrite,**args)
+                           overwrite=overwrite,**kwargs)
 
     ## check available snps with reference file
     matched_sumstats = _align_sumstats_with_bim(row=row,
@@ -18,7 +18,7 @@ def _calculate_prs(sumstats,
                    memory=None,
                    overwrite=False,
                    mode=None,delete=True,
-                   log=Log(),**args):
+                   log=Log(),**kwargs):
 
     #matching_alleles
     #read_bim
@@ -37,7 +37,7 @@ def _calculate_prs(sumstats,
                        n_cores=n_cores,
                        log=log,
                        load_bim=False,
-                       overwrite=overwrite,**args)
+                       overwrite=overwrite,**kwargs)
     score_file_path_list =[]
     for index, chrom in enumerate(chrlist):
         chr_sumstats = sumstats.loc[sumstats["CHR"]==chrom,:].copy()