gwaslab 3.4.41__py3-none-any.whl → 3.4.42__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

gwaslab/g_SumstatsPair.py CHANGED
@@ -135,46 +135,46 @@ class SumstatsPair( ):
135
135
  return molded_sumstats, sumstats1
136
136
 
137
137
 
138
- def clump(self,**args):
139
- self.clumps["clumps"], self.clumps["plink_log"] = _clump(self.data, log=self.log, p="P_1",mlog10p="MLOG10P_1", study = self.study_name, **args)
138
+ def clump(self,**kwargs):
139
+ self.clumps["clumps"], self.clumps["plink_log"] = _clump(self.data, log=self.log, p="P_1",mlog10p="MLOG10P_1", study = self.study_name, **kwargs)
140
140
 
141
- def to_coloc(self,**args):
142
- self.to_finemapping_file_path, self.plink_log = tofinemapping(self.data,study=self.study_name,suffixes=self.suffixes,log=self.log,**args)
141
+ def to_coloc(self,**kwargs):
142
+ self.to_finemapping_file_path, self.plink_log = tofinemapping(self.data,study=self.study_name,suffixes=self.suffixes,log=self.log,**kwargs)
143
143
 
144
- def run_coloc_susie(self,**args):
144
+ def run_coloc_susie(self,**kwargs):
145
145
 
146
- self.colocalization = _run_coloc_susie(self.to_finemapping_file_path,log=self.log,ncols=self.ns,**args)
146
+ self.colocalization = _run_coloc_susie(self.to_finemapping_file_path,log=self.log,ncols=self.ns,**kwargs)
147
147
 
148
- def run_two_sample_mr(self, clump=False, **args):
148
+ def run_two_sample_mr(self, clump=False, **kwargs):
149
149
  exposure1 = self.study_name.split("_")[0]
150
150
  outcome2 = self.study_name.split("_")[1]
151
- _run_two_sample_mr(self,exposure1=exposure1,outcome2=outcome2, clump=clump,**args)
151
+ _run_two_sample_mr(self,exposure1=exposure1,outcome2=outcome2, clump=clump,**kwargs)
152
152
 
153
153
  def extract_with_ld_proxy(self,**arg):
154
154
  return _extract_with_ld_proxy(common_sumstats = self.data, sumstats1=self.sumstats1, **arg)
155
155
 
156
- def filter_value(self, expr, inplace=False, **args):
156
+ def filter_value(self, expr, inplace=False, **kwargs):
157
157
  if inplace is False:
158
158
  new_Sumstats_object = copy.deepcopy(self)
159
- new_Sumstats_object.data = filtervalues(new_Sumstats_object.data,expr,log=new_Sumstats_object.log, **args)
159
+ new_Sumstats_object.data = filtervalues(new_Sumstats_object.data,expr,log=new_Sumstats_object.log, **kwargs)
160
160
  return new_Sumstats_object
161
161
  else:
162
- self.data = filtervalues(self.data, expr,log=self.log,**args)
162
+ self.data = filtervalues(self.data, expr,log=self.log,**kwargs)
163
163
  gc.collect()
164
164
 
165
165
  ## Visualization #############################################################################################################################################
166
- def plot_miami(self,**args):
166
+ def plot_miami(self,**kwargs):
167
167
 
168
168
  plot_miami2(merged_sumstats=self.data,
169
169
  suffixes=self.suffixes,
170
- **args)
170
+ **kwargs)
171
171
 
172
- def compare_af(self, **args):
172
+ def compare_af(self, **kwargs):
173
173
 
174
174
  return plotdaf( self.data,
175
175
  eaf="EAF_2",
176
176
  raf="EAF_1",
177
177
  xlabel="Effect Allele Frequency in Sumstats 1",
178
178
  ylabel="Effect Allele Frequency in Sumstats 2",
179
- **args)
179
+ **kwargs)
180
180
 
gwaslab/g_version.py CHANGED
@@ -15,8 +15,8 @@ def _get_version():
15
15
  def gwaslab_info():
16
16
  # version meta information
17
17
  dic={
18
- "version":"3.4.41",
19
- "release_date":"20240219"
18
+ "version":"3.4.42",
19
+ "release_date":"20240328"
20
20
  }
21
21
  return dic
22
22
 
@@ -21,6 +21,7 @@ from gwaslab.qc_check_datatype import check_dataframe_shape
21
21
  from gwaslab.bd_common_data import get_number_to_chr
22
22
  from gwaslab.bd_common_data import get_chr_list
23
23
  from gwaslab.bd_common_data import get_chr_to_number
24
+ from gwaslab.bd_common_data import _maketrans
24
25
  from gwaslab.g_vchange_status import vchange_status
25
26
  from gwaslab.g_version import _get_version
26
27
 
@@ -30,6 +31,34 @@ from gwaslab.g_version import _get_version
30
31
  #inferstrand
31
32
  #parallelecheckaf
32
33
 
34
### CONSTANTS AND MAPPINGS ###

# Integer written into the encoded allele arrays wherever an allele is shorter
# than the array width.
PADDING_VALUE = 100

# Every base is mapped to a one-character string whose code point is the small
# integer used for that base in the encoded arrays.
# chr(0) must never be used as a code: it is a reserved value.
# The codes start at 2 rather than 1 so that, in the future, the complementary
# base could be obtained with a single XOR (2 ^ 1 = 3, 3 ^ 1 = 2, 4 ^ 1 = 5, 5 ^ 1 = 4, ...).
MAPPING = {base: chr(code) for code, base in enumerate("ATCGN", start=2)}
assert chr(0) not in MAPPING.values(), "Mapping in the dictionary should not be equal to chr(0). This is a reserved value"

# Base -> complementary base, as plain letters.
_COMPLEMENTARY_MAPPING = dict(zip("ACGTN", "TGCAN"))
# Base -> encoded complementary base.
COMPLEMENTARY_MAPPING = {base: MAPPING[partner] for base, partner in _COMPLEMENTARY_MAPPING.items()}

# Translation tables built once at import time and reused by the encoders.
TRANSLATE_TABLE = _maketrans(MAPPING)
TRANSLATE_TABLE_COMPL = _maketrans(COMPLEMENTARY_MAPPING)
61
+
33
62
  #20220808
34
63
  #################################################################################################################
35
64
 
@@ -44,7 +73,7 @@ def rsidtochrpos(sumstats,
44
73
  ##start function with col checking##########################################################
45
74
  _start_line = "assign CHR and POS using rsIDs"
46
75
  _end_line = "assigning CHR and POS using rsIDs"
47
- _start_cols = [rsid,chrom,pos]
76
+ _start_cols = [rsid]
48
77
  _start_function = ".rsid_to_chrpos()"
49
78
  _must_args ={}
50
79
 
@@ -131,7 +160,7 @@ def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None
131
160
  ##start function with col checking##########################################################
132
161
  _start_line = "assign CHR and POS using rsIDs"
133
162
  _end_line = "assigning CHR and POS using rsIDs"
134
- _start_cols = [rsid,chrom,pos]
163
+ _start_cols = [rsid]
135
164
  _start_function = ".rsid_to_chrpos2()"
136
165
  _must_args ={}
137
166
 
@@ -186,7 +215,7 @@ def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None
186
215
  pool = Pool(n_cores)
187
216
  if chrom not in input_columns:
188
217
  log.write(" -Initiating CHR ... ",verbose=verbose)
189
- sumstats_rs[chrom]=pd.Series(dtype="Int32")
218
+ sumstats_rs[chrom]=pd.Series(dtype="Int64")
190
219
 
191
220
  if pos not in input_columns:
192
221
  log.write(" -Initiating POS ... ",verbose=verbose)
@@ -207,7 +236,7 @@ def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None
207
236
 
208
237
  # update CHR and POS using rsID with multiple threads
209
238
  sumstats_rs = pd.concat(pool.map(partial(merge_chrpos,all_groups_max=all_groups_max,path=path,build=build,status=status),df_split),ignore_index=True)
210
- sumstats_rs.loc[:,["CHR","POS"]] = sumstats_rs.loc[:,["CHR","POS"]].astype("Int64")
239
+ sumstats_rs[["CHR","POS"]] = sumstats_rs[["CHR","POS"]].astype("Int64")
211
240
  del df_split
212
241
  gc.collect()
213
242
  log.write(" -Merging group data... ",verbose=verbose)
@@ -234,8 +263,8 @@ def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None
234
263
  finished(log, verbose, _end_line)
235
264
  return sumstats
236
265
  ####################################################################################################################
237
- #20220426 check if non-effect allele is aligned with reference genome
238
- def check_status(row,record):
266
+ # old version
267
+ def _old_check_status(row,record):
239
268
  #pos,ea,nea
240
269
  # status
241
270
  #0 / -----> match
@@ -288,16 +317,14 @@ def check_status(row,record):
288
317
  return status_pre+"5"+status_end
289
318
  # ea !=ref
290
319
  return status_pre+"8"+status_end
291
-
292
320
 
293
- def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="STATUS",chr_dict=get_chr_to_number(),remove=False,verbose=True,log=Log()):
321
+ def oldcheckref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="STATUS",chr_dict=get_chr_to_number(),remove=False,verbose=True,log=Log()):
294
322
  ##start function with col checking##########################################################
295
323
  _start_line = "check if NEA is aligned with reference sequence"
296
324
  _end_line = "checking if NEA is aligned with reference sequence"
297
325
  _start_cols = [chrom,pos,ea,nea,status]
298
326
  _start_function = ".check_ref()"
299
327
  _must_args ={}
300
-
301
328
  is_enough_info = start_to(sumstats=sumstats,
302
329
  log=log,
303
330
  verbose=verbose,
@@ -308,10 +335,10 @@ def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="S
308
335
  **_must_args)
309
336
  if is_enough_info == False: return sumstats
310
337
  ############################################################################################
311
- log.write(" -Reference genome FASTA file: "+ ref_path,verbose=verbose)
338
+ log.write(" -Reference genome FASTA file: "+ ref_seq,verbose=verbose)
312
339
  log.write(" -Checking records: ", end="",verbose=verbose)
313
340
  chromlist = get_chr_list(add_number=True)
314
- records = SeqIO.parse(ref_path, "fasta")
341
+ records = SeqIO.parse(ref_seq, "fasta")
315
342
  for record in records:
316
343
  #record = next(records)
317
344
  if record is not None:
@@ -323,7 +350,7 @@ def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="S
323
350
  if i in chromlist:
324
351
  log.write(record_chr," ", end="",show_time=False,verbose=verbose)
325
352
  to_check_ref = (sumstats[chrom]==i) & (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna())
326
- sumstats.loc[to_check_ref,status] = sumstats.loc[to_check_ref,[pos,ea,nea,status]].apply(lambda x:check_status(x,record),axis=1)
353
+ sumstats.loc[to_check_ref,status] = sumstats.loc[to_check_ref,[pos,ea,nea,status]].apply(lambda x:_old_check_status(x,record),axis=1)
327
354
 
328
355
  log.write("\n",end="",show_time=False,verbose=verbose)
329
356
 
@@ -360,6 +387,332 @@ def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="S
360
387
  finished(log, verbose, _end_line)
361
388
  return sumstats
362
389
 
390
#20240320 check if non-effect allele is aligned with reference genome
def _encode_alleles(alleles: pd.Series, width: int):
    """Encode a column of allele strings as fixed-width uint8 matrices.

    Parameters
    ----------
    alleles : pd.Series
        Allele strings (one per variant).
    width : int
        Number of columns of the output matrices (the longest allele length).

    Returns
    -------
    forward : np.ndarray
        (n, width) uint8 matrix of the translated alleles, right-padded with PADDING_VALUE.
    reverse_complement : np.ndarray
        (n, width) uint8 matrix of the reverse-complemented alleles, right-padded with PADDING_VALUE.
    mask : np.ndarray
        (n, width) boolean matrix, True where `forward` holds a real (non-padding) value.
    """
    # str -> translated str -> fixed-width unicode -> 4-byte code points -> uint8.
    # The fixed-width unicode dtype right-pads shorter strings with zeros, which are
    # then replaced by the padding value. Reshaping is needed because view() flattens.
    forward = alleles.str.translate(TRANSLATE_TABLE).to_numpy().astype(f'<U{width}')
    forward = forward.view('<u4').reshape(-1, width).astype(np.uint8)
    forward[forward == 0] = PADDING_VALUE
    mask = forward != PADDING_VALUE

    # For the reverse complement the strings are explicitly left-padded first, so that
    # reversing each row yields the reversed allele followed by the padding.
    reverse_complement = (
        alleles.str.translate(TRANSLATE_TABLE_COMPL)
        .str.pad(width, 'left', chr(PADDING_VALUE))
        .to_numpy()
        .astype(f'<U{width}')
    )
    reverse_complement = reverse_complement.view('<u4').reshape(-1, width).astype(np.uint8)
    reverse_complement = reverse_complement[:, ::-1]

    return forward, reverse_complement, mask


def _fast_check_status(x: pd.DataFrame, record: np.ndarray, starting_positions: np.ndarray):
    """Vectorised check of NEA/EA against the reference sequence.

    Digit written at index 5 of each status string:
      0 -> match
      3 -> flipped
      4 -> reverse_complementary
      5 -> reverse_complementary + flipped
      6 -> both alleles on genome + unable to distinguish
      8 -> not on ref genome
    Rows where NEA and EA both match the reference and have the same length keep
    their current digit.

    Parameters
    ----------
    x : pd.DataFrame
        Columns are read by position, in this order: CHR, POS, EA, NEA, STATUS.
    record : np.ndarray
        1D array with the encoded reference sequences of all chromosomes, concatenated.
    starting_positions : np.ndarray
        Offset of each chromosome inside `record`. It is indexed by the rank of the
        chromosome among the sorted unique chromosomes of `x`, so it must be ordered
        accordingly.

    Returns
    -------
    np.ndarray
        Updated status strings, one per row of `x` (an empty array if `x` is empty).

    Raises
    ------
    ValueError
        If the status strings do not all have the same length.
    """
    if x.empty:
        return np.array([])

    # Columns are taken by position so the caller's column names do not matter.
    _chrom = x.iloc[:, 0].values
    _pos = x.iloc[:, 1]
    _ea = x.iloc[:, 2]
    _nea = x.iloc[:, 3]
    _status = x.iloc[:, 4]

    # position of the status digit that will be modified
    status_flip_idx = 5

    pos = _pos.values.astype(np.int64)  # convert to int64 because they could be of type 'object'

    # Rebase the chromosomes to 0-based ranks, e.g. ['1', '2', '4', '2'] -> [0, 1, 2, 1],
    # so that they can be used to index starting_positions.
    unique_values = np.unique(_chrom)
    chrom = np.searchsorted(unique_values, _chrom)

    max_len_nea = int(_nea.str.len().max())
    max_len_ea = int(_ea.str.len().max())

    nea, rev_nea, mask_nea = _encode_alleles(_nea, max_len_nea)
    ea, rev_ea, mask_ea = _encode_alleles(_ea, max_len_ea)

    # Convert the status strings to a 2D array of digits,
    # e.g. ["9999999", "9939999"] -> [[9, 9, 9, 9, 9, 9, 9], [9, 9, 3, 9, 9, 9, 9]].
    # This only works if every status has the same number of characters.
    if _status.str.len().nunique() != 1:
        raise ValueError("All the status strings must have the same length.")
    status_len = len(_status.iloc[0])
    table_stats = _maketrans({str(v): chr(v) for v in range(10)})
    status = _status.str.translate(table_stats).to_numpy().astype(f'<U{status_len}')
    status = status.view('<u4').reshape(-1, status_len).astype(np.uint8)

    # 0-based position of each variant inside the concatenated record, as a column vector,
    # e.g. pos [2, 21, 46] -> [[1], [20], [45]] (+ the offset of the variant's chromosome).
    first_index = (np.expand_dims(pos, axis=-1) - 1) + starting_positions[chrom][:, np.newaxis]

    # Pad the end of the record so that a window starting close to the end of the last
    # chromosome does not run out of bounds. The padding must cover the widest window
    # that is read below, i.e. the longer of the NEA and EA widths.
    record = np.pad(record, (0, max(max_len_nea, max_len_ea)), constant_values=PADDING_VALUE)

    # Reference bases under each allele: row i holds record[first_index[i] : first_index[i] + width].
    # np.take needs rectangular indices, which is why the alleles were padded to a common width.
    output_nea = np.take(record, first_index + np.arange(max_len_nea))
    output_ea = np.take(record, first_index + np.arange(max_len_ea))

    # An allele matches when every non-padding position equals the reference;
    # padding positions are ignored by OR-ing with the inverted mask.
    nea_eq_ref = np.all((nea == output_nea) | ~mask_nea, axis=1)
    rev_nea_eq_ref = np.all((rev_nea == output_nea) | ~mask_nea, axis=1)
    ea_eq_ref = np.all((ea == output_ea) | ~mask_ea, axis=1)
    rev_ea_eq_ref = np.all((rev_ea == output_ea) | ~mask_ea, axis=1)

    # len(NEA) == len(EA): compare the masks after padding them with False to a common width.
    masks_max_len = max(mask_nea.shape[1], mask_ea.shape[1])
    len_nea_eq_len_ea = np.all(
        np.pad(mask_nea, ((0, 0), (0, masks_max_len - mask_nea.shape[1])), constant_values=False) ==
        np.pad(mask_ea, ((0, 0), (0, masks_max_len - mask_ea.shape[1])), constant_values=False),
        axis=1)
    len_rev_nea_eq_rev_len_ea = len_nea_eq_len_ea

    neither_eq_ref = ~nea_eq_ref & ~ea_eq_ref

    # nea == ref && ea == ref && len(nea) != len(ea)
    status[nea_eq_ref & ea_eq_ref & ~len_nea_eq_len_ea, status_flip_idx] = 6

    # nea == ref && ea != ref
    status[nea_eq_ref & ~ea_eq_ref, status_flip_idx] = 0

    # nea != ref && ea == ref
    status[~nea_eq_ref & ea_eq_ref, status_flip_idx] = 3

    # nea != ref && ea != ref && rev_nea == ref && rev_ea == ref && len(rev_nea) != len(rev_ea)
    status[neither_eq_ref & rev_nea_eq_ref & rev_ea_eq_ref & ~len_rev_nea_eq_rev_len_ea, status_flip_idx] = 8

    # nea != ref && ea != ref && rev_nea == ref && rev_ea != ref
    status[neither_eq_ref & rev_nea_eq_ref & ~rev_ea_eq_ref, status_flip_idx] = 4

    # nea != ref && ea != ref && rev_nea != ref && rev_ea == ref
    status[neither_eq_ref & ~rev_nea_eq_ref & rev_ea_eq_ref, status_flip_idx] = 5

    # nea != ref && ea != ref && rev_nea != ref && rev_ea != ref
    status[neither_eq_ref & ~rev_nea_eq_ref & ~rev_ea_eq_ref, status_flip_idx] = 8

    # Convert the 2D digit array back to strings: build the integer represented by each
    # row (e.g. [1, 2, 3, 4, 5] -> 12345) and cast it to a fixed-width unicode dtype.
    powers = 10 ** np.arange(status_len - 1, -1, -1, dtype=np.int64)
    status_flat = np.sum(status * powers, axis=1)
    status_arr = status_flat.astype(f'<U{status_len}')
    # The integer round-trip drops leading zeros; restore them to keep the original width.
    status_arr = np.char.zfill(status_arr, status_len)

    return status_arr
577
+
578
+
579
def check_status(sumstats: pd.DataFrame, fasta_records_dict, log=Log(), verbose=True):
    """Update the status column of `sumstats` by comparing NEA/EA with the reference.

    Parameters
    ----------
    sumstats : pd.DataFrame
        Exactly five columns, in this order: chromosome, position, EA, NEA, status.
        The status column is modified in place.
    fasta_records_dict : dict
        Chromosome -> fasta record, passed to build_fasta_records().
    log : Log
        Logger used for progress messages.
    verbose : bool
        Forwarded to the logger.

    Returns
    -------
    np.ndarray
        Values of the updated status column.
    """
    chrom, _, ea, nea, status = sumstats.columns

    # First, convert the fasta records to a single numpy array of integers
    record, starting_positions_dict = build_fasta_records(fasta_records_dict, pos_as_dict=True, log=log, verbose=verbose)

    # _fast_check_status() builds 2D arrays as wide as the longest allele it receives.
    # Most alleles are short, so rows with short alleles are processed separately from
    # rows with long ones to keep the arrays small.
    max_len = 4  # this is a chosen value
    is_short = (sumstats[nea].str.len() <= max_len) & (sumstats[ea].str.len() <= max_len)

    passes = (
        (is_short, f" -Checking records for ( len(NEA) <= {max_len} and len(EA) <= {max_len} )"),
        (~is_short, f" -Checking records for ( len(NEA) > {max_len} or len(EA) > {max_len} )"),
    )
    for selected, message in passes:
        log.write(message, verbose=verbose)
        subset = sumstats[selected]
        if subset.empty:
            continue
        # _fast_check_status() indexes the starting positions by the rank of each chromosome
        # among the SORTED unique chromosomes of the subset. Series.unique() returns values in
        # order of appearance, so the chromosomes must be sorted here; otherwise variants of
        # an unsorted table would be looked up in the wrong chromosome.
        starting_positions = np.array([starting_positions_dict[k] for k in sorted(subset[chrom].unique())])
        sumstats.loc[selected, status] = _fast_check_status(subset, record=record, starting_positions=starting_positions)

    return sumstats[status].values
604
+
605
+
606
def checkref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="STATUS",chr_dict=get_chr_to_number(),remove=False,verbose=True,log=Log()):
    """Check if NEA is aligned with the reference sequence and update the status column.

    Parameters
    ----------
    sumstats : pd.DataFrame
        Summary statistics table.
    ref_seq : str
        Path of the reference genome FASTA file.
    chrom, pos, ea, nea, status : str
        Names of the chromosome, position, effect allele, non-effect allele and status columns.
    chr_dict : dict
        Maps the chromosome label of a fasta record (after stripping) to the value used in
        the chromosome column; labels that are not in the dict are used as they are.
    remove : bool
        If True, drop the variants whose status digit at index 5 is 8 (not on the reference).
    verbose : bool
        Forwarded to the logger.
    log : Log
        Logger used for progress messages.

    Returns
    -------
    pd.DataFrame
        The summary statistics with the updated status column.
    """
    ##start function with col checking##########################################################
    _start_line = "check if NEA is aligned with reference sequence"
    _end_line = "checking if NEA is aligned with reference sequence"
    _start_cols = [chrom,pos,ea,nea,status]
    _start_function = ".check_ref()"
    _must_args ={}

    is_enough_info = start_to(sumstats=sumstats,
                              log=log,
                              verbose=verbose,
                              start_line=_start_line,
                              end_line=_end_line,
                              start_cols=_start_cols,
                              start_function=_start_function,
                              **_must_args)
    if is_enough_info == False: return sumstats
    ############################################################################################
    log.write(" -Reference genome FASTA file: "+ ref_seq,verbose=verbose)
    log.write(" -Loading fasta records:",end="", verbose=verbose)
    chromlist = get_chr_list(add_number=True)
    records = SeqIO.parse(ref_seq, "fasta")

    all_records_dict = {}
    chroms_in_sumstats = sumstats[chrom].unique() # load records from Fasta file only for the chromosomes present in the sumstats
    for record in records:
        if record is None:
            continue
        record_chr = str(record.id).strip("chrCHR").upper()
        i = chr_dict.get(record_chr, record_chr)
        if (i in chromlist) and (i in chroms_in_sumstats):
            log.write(record_chr," ", end="",show_time=False,verbose=verbose)
            all_records_dict[i] = record
    log.write("",show_time=False,verbose=verbose)

    if len(all_records_dict) > 0:
        log.write(" -Checking records", verbose=verbose)
        all_records_dict = dict(sorted(all_records_dict.items())) # sort by key in case the fasta records are not already ordered by chromosome
        to_check_ref = (sumstats[chrom].isin(list(all_records_dict.keys()))) & (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna())
        sumstats_to_check = sumstats.loc[to_check_ref,[chrom,pos,ea,nea,status]]
        sumstats.loc[to_check_ref,status] = check_status(sumstats_to_check, all_records_dict, log=log, verbose=verbose)
        log.write(" -Finished checking records", verbose=verbose)

    sumstats[status] = sumstats[status].astype("string")

    def _count_status(digit):
        # Number of variants whose status has `digit` at index 5.
        # The column given by `status` is used, not a hard-coded "STATUS".
        pattern = r"\w\w\w\w\w[" + digit + r"]\w"
        return sum(sumstats[status].str.match(pattern, case=False, flags=0, na=False))

    available_to_check =sum( (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
    status_0=_count_status("0")
    status_3=_count_status("3")
    status_4=_count_status("4")
    status_5=_count_status("5")
    status_6=_count_status("6")
    status_8=_count_status("8")

    log.write(" -Variants allele on given reference sequence : ",status_0,verbose=verbose)
    log.write(" -Variants flipped : ",status_3,verbose=verbose)
    # The rates are undefined when no variant could be checked; skip them instead of dividing by zero.
    if available_to_check > 0:
        raw_matching_rate = (status_3+status_0)/available_to_check
        flip_rate = status_3/available_to_check
        log.write(" -Raw Matching rate : ","{:.2f}%".format(raw_matching_rate*100),verbose=verbose)
        if raw_matching_rate <0.8:
            log.warning("Matching rate is low, please check if the right reference genome is used.")
        if flip_rate > 0.85 :
            log.write(" -Flipping variants rate > 0.85, it is likely that the EA is aligned with REF in the original dataset.",verbose=verbose)

    log.write(" -Variants inferred reverse_complement : ",status_4,verbose=verbose)
    log.write(" -Variants inferred reverse_complement_flipped : ",status_5,verbose=verbose)
    log.write(" -Both allele on genome + unable to distinguish : ",status_6,verbose=verbose)
    log.write(" -Variants not on given reference sequence : ",status_8,verbose=verbose)

    if remove is True:
        # na=False keeps variants with a missing status instead of failing on an NA mask.
        sumstats = sumstats.loc[~sumstats[status].str.match(r"\w\w\w\w\w[8]\w", na=False),:]
        log.write(" -Variants not on given reference sequence were removed.",verbose=verbose)

    finished(log, verbose, _end_line)
    return sumstats
685
+
686
def build_fasta_records(fasta_records_dict, pos_as_dict=True, log=Log(), verbose=True):
    """Concatenate the fasta records into one encoded numpy array.

    Parameters
    ----------
    fasta_records_dict : dict
        Chromosome -> fasta record. The records are processed in the dict's order.
    pos_as_dict : bool
        If True, return the starting positions as a dict keyed like `fasta_records_dict`;
        otherwise return them as a numpy array in the same order.
    log : Log
        Logger used for progress messages.
    verbose : bool
        Forwarded to the logger.

    Returns
    -------
    record : np.ndarray
        uint8 array holding all the translated sequences, one after the other.
    starting_positions : dict or np.ndarray
        Index in `record` at which each sequence starts.
    """
    log.write(" -Building numpy fasta records from dict", verbose=verbose)

    # Per record: translate the raw sequence data with TRANSLATE_TABLE, store it as a single
    # fixed-width unicode string, reinterpret that string as 4-byte code points and finally
    # shrink them to one byte each to save memory.
    encoded = []
    for fasta_record in fasta_records_dict.values():
        translated = fasta_record.seq._data.translate(TRANSLATE_TABLE)
        as_unicode = np.array([translated], dtype=f'<U{len(translated)}')
        encoded.append(as_unicode.view('<u4').astype(np.uint8))

    # Each record starts where the previous ones end: cumulative length minus own length.
    lengths = np.array([len(chunk) for chunk in encoded])
    offsets = np.cumsum(lengths) - lengths
    if pos_as_dict:
        offsets = dict(zip(fasta_records_dict.keys(), offsets))

    record = np.concatenate(encoded)
    del encoded # free memory

    return record, offsets
715
+
363
716
  #######################################################################################################################################
364
717
 
365
718
  #20220721
@@ -3,30 +3,30 @@ from gwaslab.bd_common_data import get_formats_list
3
3
  from gwaslab.g_Log import Log
4
4
  from gwaslab.bd_common_data import get_format_dict
5
5
 
6
- def _read_tabular(path, fmt, **args):
6
+ def _read_tabular(path, fmt, **kwargs):
7
7
 
8
8
  # default
9
9
  load_args_dict = {"sep":"\t",
10
10
  "header":None}
11
11
 
12
12
  # if specified by user
13
- if len(args)>0:
14
- load_args_dict = args
13
+ if len(kwargs)>0:
14
+ load_args_dict = kwargs
15
15
 
16
16
  # load format
17
17
  meta_data, rename_dictionary = get_format_dict(fmt)
18
18
 
19
- if "format_separator" in meta_data and "sep" not in args:
19
+ if "format_separator" in meta_data and "sep" not in kwargs:
20
20
  load_args_dict["sep"] = meta_data["format_separator"]
21
21
 
22
- if "format_comment" in meta_data and "comment" not in args:
22
+ if "format_comment" in meta_data and "comment" not in kwargs:
23
23
  if meta_data["format_comment"] is not None:
24
24
  load_args_dict["comment"] = meta_data["format_comment"]
25
25
 
26
- if "format_header" in meta_data and "header" not in args:
26
+ if "format_header" in meta_data and "header" not in kwargs:
27
27
  load_args_dict["header"] = meta_data["format_header"]
28
28
 
29
- if "format_na" in meta_data and "na_values" not in args:
29
+ if "format_na" in meta_data and "na_values" not in kwargs:
30
30
  if meta_data["format_na"] is not None:
31
31
  load_args_dict["na_values"] = meta_data["format_na"]
32
32