gwaslab 3.4.45__py3-none-any.whl → 3.4.47__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

gwaslab/__init__.py CHANGED
@@ -44,4 +44,5 @@ from gwaslab.viz_plot_trumpetplot import plot_power
44
44
  from gwaslab.viz_plot_trumpetplot import plot_power_x
45
45
  from gwaslab.util_ex_process_h5 import process_vcf_to_hfd5
46
46
  from gwaslab.util_ex_run_susie import _run_susie_rss as run_susie_rss
47
- from gwaslab.io_read_tabular import _read_tabular as read_tabular
47
+ from gwaslab.io_read_tabular import _read_tabular as read_tabular
48
+ from gwaslab.util_in_meta import meta_analyze
gwaslab/bd_common_data.py CHANGED
@@ -298,6 +298,28 @@ def gtf_to_protein_coding(gtfpath,log=Log(),verbose=True):
298
298
 
299
299
  return protein_coding_path
300
300
 
301
+ def gtf_to_all_gene(gtfpath,log=Log(),verbose=True):
302
+ all_gene_path = gtfpath[:-6]+"all_genes.gtf.gz"
303
+ # if not existing, extract all gene records and output to a new file
304
+ if not path.isfile(all_gene_path):
305
+
306
+ # get gene list
307
+ log.write(" - Extracting genes from {}".format(gtfpath),verbose=verbose)
308
+ gtf = read_gtf(gtfpath,usecols=["feature","gene_biotype","gene_id","gene_name"])
309
+ gene_list = gtf.loc[gtf["feature"]=="gene","gene_id"].values
310
+ log.write(" - Loaded {} genes.".format(len(gene_list)),verbose=verbose)
311
+
312
+ # extract entry using csv
313
+ gtf_raw = pd.read_csv(gtfpath,sep="\t",header=None,comment="#",dtype="string")
314
+ gtf_raw["_gene_id"] = gtf_raw[8].str.extract(r'gene_id "([\w\.-]+)"')
315
+ gtf_raw = gtf_raw.loc[ gtf_raw["_gene_id"].isin(gene_list) ,:]
316
+ gtf_raw = gtf_raw.drop("_gene_id",axis=1)
317
+
318
+ log.write(" - Extracted records are saved to : {} ".format(all_gene_path),verbose=verbose)
319
+ gtf_raw.to_csv(all_gene_path, header=None, index=None, sep="\t")
320
+
321
+ return all_gene_path
322
+
301
323
  ####################################################################################################################
302
324
  # From BioPython: https://github.com/biopython/biopython/blob/c5a6b1374267d769b19c1022b4b45472316e78b4/Bio/Seq.py#L36
303
325
  def _maketrans(complement_mapping):
gwaslab/g_Sumstats.py CHANGED
@@ -121,6 +121,7 @@ class Sumstats():
121
121
  snpr2=None,
122
122
  status=None,
123
123
  other=[],
124
+ usekeys=None,
124
125
  direction=None,
125
126
  verbose=True,
126
127
  study="Study_1",
@@ -200,6 +201,7 @@ class Sumstats():
200
201
  trait=trait,
201
202
  status=status,
202
203
  other=other,
204
+ usekeys=usekeys,
203
205
  verbose=verbose,
204
206
  readargs=readargs,
205
207
  log=self.log)
gwaslab/g_version.py CHANGED
@@ -15,16 +15,16 @@ def _get_version():
15
15
  def gwaslab_info():
16
16
  # version meta information
17
17
  dic={
18
- "version":"3.4.45",
19
- "release_date":"20240509"
18
+ "version":"3.4.47",
19
+ "release_date":"20240703"
20
20
  }
21
21
  return dic
22
22
 
23
- def _checking_plink_version(v=2,log=Log(), verbose=True):
24
- if v==1:
25
- which_plink_script = "plink --version"
26
- elif v==2:
27
- which_plink_script = "plink2 --version"
23
+ def _checking_plink_version(plink=None,plink2=None,log=Log(), verbose=True):
24
+ if plink is not None:
25
+ which_plink_script = "{} --version".format(plink)
26
+ elif plink2 is not None:
27
+ which_plink_script = "{} --version".format(plink2)
28
28
  output = subprocess.check_output(which_plink_script, stderr=subprocess.STDOUT, shell=True,text=True)
29
29
  log.write(" -PLINK version: {}".format(output.strip()))
30
30
  return log
@@ -868,8 +868,9 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
868
868
  if is_enough_info == False: return sumstats
869
869
  ############################################################################################
870
870
 
871
- standardized_normalized = sumstats["STATUS"].str.match("\w\w\w[0][01234]\w\w", case=False, flags=0, na=False)
872
-
871
+ #standardized_normalized = sumstats["STATUS"].str.match("\w\w\w[0][01234]\w\w", case=False, flags=0, na=False)
872
+ standardized_normalized = sumstats["STATUS"] == sumstats["STATUS"]
873
+
873
874
  if rsid not in sumstats.columns:
874
875
  sumstats[rsid]=pd.Series(dtype="string")
875
876
 
@@ -55,6 +55,7 @@ def preformat(sumstats,
55
55
  trait=None,
56
56
  build=None,
57
57
  other=[],
58
+ usekeys=None,
58
59
  verbose=False,
59
60
  readargs=None,
60
61
  log=None):
@@ -65,6 +66,11 @@ def preformat(sumstats,
65
66
  dtype_dictionary ={}
66
67
 
67
68
  #######################################################################################################################################################
69
+ # workflow:
70
+ # 1. formatbook
71
+ # 2. user specified header
72
+ # 3. usekeys
73
+
68
74
  if fmt is not None:
69
75
  # loading format parameters
70
76
  log.write("Start to load format from formatbook....",verbose=verbose)
@@ -129,6 +135,8 @@ def preformat(sumstats,
129
135
 
130
136
  ################################################
131
137
  for key,value in rename_dictionary.items():
138
+ # check available keys: key -> raw header
139
+ # usecols : a list of raw headers to load from file/DataFrame
132
140
  if key in raw_cols:
133
141
  usecols.append(key)
134
142
  if value in ["EA","NEA"]:
@@ -137,7 +145,7 @@ def preformat(sumstats,
137
145
  dtype_dictionary[value]="string"
138
146
 
139
147
  except ValueError:
140
- raise ValueError("Please input a path or a pd.DataFrame, and make sure the columns you specified are in the file.")
148
+ raise ValueError("Please input a path or a pd.DataFrame, and make sure the separator is correct and the columns you specified are in the file.")
141
149
 
142
150
  ###################################################################################################################################################
143
151
  ## check columns/datatype to use
@@ -276,6 +284,19 @@ def preformat(sumstats,
276
284
  else:
277
285
  study = raw_cols[9]
278
286
  usecols = usecols + [study]
287
+
288
+ if usekeys is not None:
289
+ # extract only specified keys
290
+ usecols_new =[]
291
+ for i in usekeys:
292
+ for k, v in rename_dictionary.items():
293
+ if i == v:
294
+ usecols_new.append(k)
295
+ usecols_valid =[]
296
+ for i in usecols_new:
297
+ if i in usecols:
298
+ usecols_valid.append(i)
299
+ usecols = usecols_valid
279
300
  #loading data ##########################################################################################################
280
301
 
281
302
  try:
gwaslab/io_to_formats.py CHANGED
@@ -212,8 +212,10 @@ def tofmt(sumstats,
212
212
  log.write(" -Start outputting sumstats in "+fmt+" format...")
213
213
 
214
214
  if "CHR" in sumstats.columns:
215
+ # output X,Y,MT instead of 23,24,25
215
216
  if xymt_number is False and pd.api.types.is_integer_dtype(sumstats["CHR"]):
216
217
  sumstats["CHR"]= sumstats["CHR"].map(get_number_to_chr(xymt=xymt,prefix=chr_prefix))
218
+ # add prefix to CHR
217
219
  elif chr_prefix is not None:
218
220
  sumstats["CHR"]= chr_prefix + sumstats["CHR"].astype("string")
219
221
 
@@ -437,17 +439,20 @@ def _configure_output_cols_and_args(sumstats, rename_dictionary, cols, no_status
437
439
  ouput_cols.append(i)
438
440
 
439
441
  # + additional cols and remove duplicated
440
- ouput_cols = list(set(ouput_cols + cols))
442
+ ouput_cols_final = []
443
+ for i in ouput_cols + cols:
444
+ if i not in ouput_cols_final:
445
+ ouput_cols_final.append(i)
441
446
 
442
447
  # remove STATUS
443
448
  try:
444
449
  if no_status == True:
445
- ouput_cols.remove("STATUS")
450
+ ouput_cols_final.remove("STATUS")
446
451
  except:
447
452
  pass
448
453
 
449
454
  #filter and rename to target format headers
450
- sumstats = sumstats[ouput_cols]
455
+ sumstats = sumstats[ouput_cols_final]
451
456
  sumstats = sumstats.rename(columns=rename_dictionary)
452
457
 
453
458
  # configure target format args and reorder columns
@@ -1061,6 +1061,13 @@ def check_range(sumstats, var_range, header, coltocheck, cols_to_check, log, ver
1061
1061
  if sum(is_low_p) >0:
1062
1062
  log.warning("Extremely low P detected (P=0 or P < minimum positive value of float64) : {}".format(sum(is_low_p)))
1063
1063
  log.warning("Please consider using MLOG10P instead.")
1064
+
1065
+ if header=="INFO":
1066
+ is_high_info = sumstats["INFO"]>1
1067
+ if sum(is_high_info) >0:
1068
+ log.warning("High INFO detected (INFO>1) : {}".format(sum(is_high_info)))
1069
+ log.warning("max(INFO): {}".format(sumstats["INFO"].max()))
1070
+ log.warning("Please check if this is as expected.")
1064
1071
 
1065
1072
  if sum(~is_valid)>0:
1066
1073
  try:
@@ -1102,7 +1109,7 @@ def sanitycheckstats(sumstats,
1102
1109
  HR=(-100,100),
1103
1110
  HR_95L=(0,float("Inf")),
1104
1111
  HR_95U=(0,float("Inf")),
1105
- info=(0,1),
1112
+ info=(0,2),
1106
1113
  float_tolerence = 1e-7,
1107
1114
  verbose=True,
1108
1115
  log=Log()):
@@ -17,6 +17,8 @@ def tofinemapping(sumstats,
17
17
  vcf=None,
18
18
  loci=None,
19
19
  out="./",
20
+ plink="plink",
21
+ plink2="plink2",
20
22
  windowsizekb=1000,
21
23
  n_cores=1,
22
24
  mode="r",
@@ -56,6 +58,9 @@ def tofinemapping(sumstats,
56
58
  else:
57
59
  sig_df = sumstats.loc[sumstats["SNPID"].isin(loci),:]
58
60
 
61
+ log.write(" -plink1.9 path: {}".format(plink),verbose=verbose)
62
+ log.write(" -plink2 path: {}".format(plink2),verbose=verbose)
63
+
59
64
  # Drop duplicate!!!!
60
65
  log.write(" -Dropping duplicated SNPIDs...",verbose=verbose)
61
66
  sumstats = sumstats.drop_duplicates(subset=["SNPID"]).copy()
@@ -68,11 +73,13 @@ def tofinemapping(sumstats,
68
73
  if exclude_hla==True:
69
74
  sig_df = _exclude_hla(sig_df, log=log, verbose=verbose)
70
75
 
76
+ sig_df = sig_df.reset_index()
77
+
71
78
  ## for each lead variant
72
79
  for index, row in sig_df.iterrows():
73
80
  # extract snplist in each locus
74
81
  gc.collect()
75
-
82
+ log.write(" -Locus #{}---------------------------------------------------------------".format(index+1))
76
83
  log.write(" -Processing locus with lead variant {} at CHR {} POS {} ...".format(row["SNPID"],row["CHR"],row["POS"]))
77
84
  locus_sumstats = _extract_variants_in_locus(sumstats, windowsizekb, locus = (row["CHR"],row["POS"]))
78
85
 
@@ -84,7 +91,10 @@ def tofinemapping(sumstats,
84
91
  n_cores=n_cores,
85
92
  log=log,
86
93
  load_bim=True,
87
- overwrite=overwrite,**kwargs)
94
+ overwrite=overwrite,
95
+ plink=plink,
96
+ plink2=plink2,
97
+ **kwargs)
88
98
 
89
99
  ## check available snps with reference file
90
100
  matched_sumstats = _align_sumstats_with_bim(row=row,
@@ -114,7 +124,10 @@ def tofinemapping(sumstats,
114
124
  windowsizekb=windowsizekb,
115
125
  out=out,
116
126
  plink_log=plink_log,
117
- log=log,filetype=filetype,
127
+ log=log,
128
+ filetype=filetype,
129
+ plink=plink,
130
+ plink2=plink2,
118
131
  verbose=verbose)
119
132
 
120
133
 
@@ -143,12 +156,12 @@ def tofinemapping(sumstats,
143
156
 
144
157
 
145
158
 
146
- def _calculate_ld_r(study, matched_sumstats_snpid, row, bfile_prefix, n_cores, windowsizekb,out,plink_log,log,memory,mode,filetype,verbose=True):
159
+ def _calculate_ld_r(study, matched_sumstats_snpid, row, bfile_prefix, n_cores, windowsizekb,out,plink_log,log,memory,mode,filetype,plink,plink2,verbose=True):
147
160
  '''
148
161
  Calculate LD r matrix by calling PLINK; return file name and log
149
162
  '''
150
163
  log.write(" -Start to calculate LD r matrix...",verbose=verbose)
151
- log = _checking_plink_version(v=1, log=log)
164
+ log = _checking_plink_version(plink=plink, log=log)
152
165
  if "@" in bfile_prefix:
153
166
  bfile_to_use = bfile_prefix.replace("@",str(row["CHR"]))
154
167
  else:
@@ -165,7 +178,7 @@ def _calculate_ld_r(study, matched_sumstats_snpid, row, bfile_prefix, n_cores, w
165
178
  raise ValueError("Please use bfile instead of pfile for PLINK1.")
166
179
 
167
180
  script_vcf_to_bfile = """
168
- plink \
181
+ {} \
169
182
  --bfile {} \
170
183
  --keep-allele-order \
171
184
  --extract {} \
@@ -175,7 +188,7 @@ def _calculate_ld_r(study, matched_sumstats_snpid, row, bfile_prefix, n_cores, w
175
188
  --threads {} {}\
176
189
  --write-snplist \
177
190
  --out {}
178
- """.format(bfile_to_use, snplist_path , row["CHR"], mode, n_cores, memory_flag if memory is not None else "", output_prefix)
191
+ """.format(plink, bfile_to_use, snplist_path , row["CHR"], mode, n_cores, memory_flag if memory is not None else "", output_prefix)
179
192
 
180
193
  try:
181
194
  output = subprocess.check_output(script_vcf_to_bfile, stderr=subprocess.STDOUT, shell=True,text=True)
@@ -18,6 +18,8 @@ def _calculate_prs(sumstats,
18
18
  memory=None,
19
19
  overwrite=False,
20
20
  mode=None,delete=True,
21
+ plink="plink",
22
+ plink2="plink2",
21
23
  log=Log(),**kwargs):
22
24
 
23
25
  #matching_alleles
@@ -30,14 +32,18 @@ def _calculate_prs(sumstats,
30
32
  chrlist.sort()
31
33
  plink_log = ""
32
34
  #process reference file
33
- bfile_prefix, plink_log, ref_bim, filetype = _process_plink_input_files( chrlist=chrlist,
35
+ bfile_prefix, plink_log, ref_bim, filetype = _process_plink_input_files(
36
+ chrlist=chrlist,
34
37
  bfile=bfile,
35
38
  vcf=vcf,
36
39
  plink_log=plink_log,
37
40
  n_cores=n_cores,
38
41
  log=log,
39
42
  load_bim=False,
40
- overwrite=overwrite,**kwargs)
43
+ overwrite=overwrite,
44
+ plink=plink,
45
+ plink2=plink2,
46
+ **kwargs)
41
47
  score_file_path_list =[]
42
48
  for index, chrom in enumerate(chrlist):
43
49
  chr_sumstats = sumstats.loc[sumstats["CHR"]==chrom,:].copy()
@@ -61,7 +67,7 @@ def _calculate_prs(sumstats,
61
67
  plink_log=plink_log,
62
68
  log=log,
63
69
  memory=memory,
64
- mode=mode,filetype=filetype)
70
+ mode=mode,filetype=filetype,plink2=plink2)
65
71
  score_file_path_list.append(score_file_path)
66
72
  if delete == True:
67
73
  os.remove(model_path)
@@ -71,10 +77,10 @@ def _calculate_prs(sumstats,
71
77
 
72
78
 
73
79
 
74
- def _run_calculate_prs(study, chrom , model_path, bfile_prefix, n_cores, out, plink_log, log, memory,filetype, mode=None):
80
+ def _run_calculate_prs(study, chrom , model_path, bfile_prefix, n_cores, out, plink_log, log, memory,filetype, plink2,mode=None):
75
81
 
76
82
  log.write(" -Start to calculate PRS for Chr {}...".format(chrom))
77
- _checking_plink_version(v=2, log=log)
83
+ _checking_plink_version(plink2=plink2, log=log)
78
84
 
79
85
  if "@" in bfile_prefix:
80
86
  bpfile_to_use = bfile_prefix.replace("@",str(chrom))
@@ -92,13 +98,13 @@ def _run_calculate_prs(study, chrom , model_path, bfile_prefix, n_cores, out, pl
92
98
  memory_flag = "--memory {}".format(memory)
93
99
 
94
100
  script_vcf_to_bfile = """
95
- plink2 \
101
+ {} \
96
102
  {} \
97
103
  --score {} 1 2 3 header {} cols=+scoresums,+denom ignore-dup-ids \
98
104
  --chr {} \
99
105
  --threads {} {}\
100
106
  --out {}
101
- """.format(file_flag, model_path , mode if mode is not None else "", chrom, n_cores, memory_flag if memory is not None else "", output_prefix)
107
+ """.format(plink2, file_flag, model_path , mode if mode is not None else "", chrom, n_cores, memory_flag if memory is not None else "", output_prefix)
102
108
 
103
109
  try:
104
110
  output = subprocess.check_output(script_vcf_to_bfile, stderr=subprocess.STDOUT, shell=True,text=True)
@@ -20,7 +20,9 @@ def _process_plink_input_files(chrlist,
20
20
  bgen_mode="ref-first",
21
21
  convert="bfile",
22
22
  memory=None,
23
- load_bim=False):
23
+ load_bim=False,
24
+ plink="plink",
25
+ plink2="plink2"):
24
26
  """
25
27
  Process input files (bfile,pfile,vcf,bgen) to either PLINK1 bed/bim/fam or PLINK2 pgen/psam/pvar.
26
28
 
@@ -66,7 +68,9 @@ def _process_plink_input_files(chrlist,
66
68
  convert=convert,
67
69
  memory=memory,
68
70
  overwrite=overwrite,
69
- load_bim=load_bim)
71
+ load_bim=load_bim,
72
+ plink=plink,
73
+ plink2=plink2)
70
74
  filetype = convert
71
75
  elif filetype == "bgen":
72
76
  ref_file_prefix, plink_log, ref_bims = _process_bgen(ref_file_prefix=ref_file_prefix,
@@ -81,7 +85,9 @@ def _process_plink_input_files(chrlist,
81
85
  convert=convert,
82
86
  memory=memory,
83
87
  overwrite=overwrite,
84
- load_bim=load_bim)
88
+ load_bim=load_bim,
89
+ plink=plink,
90
+ plink2=plink2)
85
91
  filetype = convert
86
92
  return ref_file_prefix, plink_log, ref_bims, filetype
87
93
 
@@ -199,11 +205,13 @@ def _process_vcf(ref_file_prefix,
199
205
  convert="bfile",
200
206
  memory=None,
201
207
  overwrite=False,
202
- load_bim=False):
208
+ load_bim=False,
209
+ plink="plink",
210
+ plink2="plink2"):
203
211
  log.write(" -Processing VCF : {}...".format(ref_file_prefix))
204
212
 
205
213
  #check plink version
206
- log = _checking_plink_version(v=2,log=log)
214
+ log = _checking_plink_version(plink2=plink2,log=log)
207
215
 
208
216
  # file path prefix to return
209
217
  if is_wild_card==True:
@@ -243,14 +251,15 @@ def _process_vcf(ref_file_prefix,
243
251
  #if not existing or overwrite is True
244
252
  if (not is_file_exist) or overwrite:
245
253
  script_vcf_to_bfile = """
246
- plink2 \
254
+ {} \
247
255
  --vcf {} \
248
256
  --chr {} \
249
257
  {} \
250
258
  --rm-dup force-first \
251
259
  --threads {}{}\
252
260
  --out {}
253
- """.format(vcf_to_load,
261
+ """.format(plink2,
262
+ vcf_to_load,
254
263
  i,
255
264
  make_flag,
256
265
  n_cores, memory_flag,
@@ -288,11 +297,13 @@ def _process_bgen(ref_file_prefix,
288
297
  convert="bfile",
289
298
  memory=None,
290
299
  overwrite=False,
291
- load_bim=False):
300
+ load_bim=False,
301
+ plink="plink",
302
+ plink2="plink2"):
292
303
  log.write(" -Processing BGEN files : {}...".format(ref_file_prefix))
293
304
 
294
305
  #check plink version
295
- log = _checking_plink_version(v=2,log=log)
306
+ log = _checking_plink_version(log=log,plink2=plink2)
296
307
 
297
308
  # file path prefix to return
298
309
  if is_wild_card==True:
@@ -338,14 +349,14 @@ def _process_bgen(ref_file_prefix,
338
349
  #if not existing or overwrite is True
339
350
  if (not is_file_exist) or overwrite:
340
351
  script_vcf_to_bfile = """
341
- plink2 \
352
+ {} \
342
353
  --bgen {} {} {}\
343
354
  --chr {} \
344
355
  {} \
345
356
  --rm-dup force-first \
346
357
  --threads {}{}\
347
358
  --out {}
348
- """.format(bgen_to_load, bgen_mode, sample_flag,
359
+ """.format(plink2,bgen_to_load, bgen_mode, sample_flag,
349
360
  i,
350
361
  make_flag,
351
362
  n_cores, memory_flag,
@@ -10,6 +10,7 @@ from gwaslab.g_vchange_status import vchange_status
10
10
  from gwaslab.qc_fix_sumstats import sortcoordinate
11
11
  from gwaslab.qc_fix_sumstats import start_to
12
12
  from gwaslab.qc_fix_sumstats import finished
13
+ from gwaslab.qc_fix_sumstats import _process_build
13
14
  from gwaslab.hm_harmonize_sumstats import is_palindromic
14
15
 
15
16
  import gc
@@ -430,8 +431,43 @@ def _filter_snp(sumstats, mode="in", ea="EA",nea="NEA", log=Log(),verbose=True):
430
431
  log.write("Finished filtering SNPs.",verbose=verbose)
431
432
  return snp
432
433
 
433
- def _exclude_hla(sumstats, chrom="CHR", pos="POS", lower=25000000 ,upper=34000000 ,log=Log(), verbose=True):
434
-
434
+ def _exclude_hla(sumstats, chrom="CHR", pos="POS", lower=None ,upper=None, build=None, mode="xmhc", log=Log(), verbose=True):
435
+
436
+ if build is not None:
437
+ build = _process_build(build = build,log = log,verbose = verbose)
438
+ # xMHC : HIST1H2AA ~ 7.6mb ~ RPL12P1
439
+ # reference: Horton, R., Wilming, L., Rand, V., Lovering, R. C., Bruford, E. A., Khodiyar, V. K., ... & Beck, S. (2004). Gene map of the extended human MHC. Nature Reviews Genetics, 5(12), 889-899.
440
+ # hg38: 25,726,063 ~ 33,400,644
441
+ # hg19 : 25,726,291 ~ 33,368,421
442
+
443
+ # HLA : GABBR1 ~ 3.78mb ~ KIFC1
444
+ # reference: Shiina, T., Hosomichi, K., Inoko, H., & Kulski, J. K. (2009). The HLA genomic loci map: expression, interaction, diversity and disease. Journal of human genetics, 54(1), 15-39.
445
+ # hg38: 29,602,238 ~ 33,409,896
446
+ # hg19: 29,570,015 ~ 33,377,673
447
+
448
+ if build == "19":
449
+ if mode =="xmhc":
450
+ lower=25000000
451
+ upper=34000000
452
+ if mode =="hla" or mode =="mhc":
453
+ lower=29500000
454
+ upper=33500000
455
+ if build == "38":
456
+ if mode =="xmhc":
457
+ lower=25000000
458
+ upper=34000000
459
+ if mode =="hla" or mode =="mhc":
460
+ lower=29500000
461
+ upper=33500000
462
+ else:
463
+ # -> 25,000,000 ~ 34,000,000
464
+ if mode =="xmhc":
465
+ lower=25000000
466
+ upper=34000000
467
+ if mode =="hla" or mode =="mhc":
468
+ lower=29500000
469
+ upper=33500000
470
+
435
471
  raw_len = len(sumstats)
436
472
 
437
473
  if str(sumstats[chrom].dtype) == "string":
@@ -11,6 +11,7 @@ from gwaslab.bd_common_data import get_chr_to_number
11
11
  from gwaslab.bd_common_data import get_number_to_chr
12
12
  from gwaslab.bd_common_data import get_chr_to_NC
13
13
  from gwaslab.bd_common_data import gtf_to_protein_coding
14
+ from gwaslab.bd_common_data import gtf_to_all_gene
14
15
  from gwaslab.bd_download import check_and_download
15
16
  from gwaslab.util_ex_gwascatalog import gwascatalog_trait
16
17
  from gwaslab.qc_fix_sumstats import check_dataframe_shape
@@ -38,6 +39,7 @@ def getsig(insumstats,
38
39
  wc_correction=False,
39
40
  build="19",
40
41
  source="ensembl",
42
+ gtf_path=None,
41
43
  verbose=True):
42
44
  """
43
45
  Extract the lead variants using a sliding window. P or MLOG10P will be used and converted to SCALEDP for sorting.
@@ -172,6 +174,7 @@ def getsig(insumstats,
172
174
  xymt=xymt,
173
175
  build=build,
174
176
  source=source,
177
+ gtf_path=gtf_path,
175
178
  verbose=verbose)
176
179
 
177
180
  # drop internal id
@@ -253,6 +256,7 @@ def annogene(
253
256
  xymt=["X","Y","MT"],
254
257
  build="19",
255
258
  source="ensembl",
259
+ gtf_path=None,
256
260
  verbose=True):
257
261
 
258
262
  log.write("Start to annotate variants with nearest gene name(s)...", verbose=verbose)
@@ -267,8 +271,13 @@ def annogene(
267
271
  #| gzip >Homo_sapiens.GRCh37.75.processed.chr.gtf.gz
268
272
 
269
273
  #gtf_path = check_and_download("ensembl_hg19_gtf_protein_coding")
270
- gtf_path = check_and_download("ensembl_hg19_gtf")
271
- gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
274
+ if gtf_path is None:
275
+ gtf_path = check_and_download("ensembl_hg19_gtf")
276
+ gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
277
+ else:
278
+ log.write(" -Using user-provided gtf:{}".format(gtf_path))
279
+ gtf_path = gtf_to_all_gene(gtf_path,log=log,verbose=verbose)
280
+
272
281
  gtf_db_path = gtf_path[:-2]+"db"
273
282
 
274
283
  data = Genome(
@@ -283,8 +292,13 @@ def annogene(
283
292
  elif build=="38":
284
293
  log.write(" -Assigning Gene name using ensembl_hg38_gtf for protein coding genes", verbose=verbose)
285
294
  #gtf_path = check_and_download("ensembl_hg38_gtf_protein_coding")
286
- gtf_path = check_and_download("ensembl_hg38_gtf")
287
- gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
295
+ if gtf_path is None:
296
+ gtf_path = check_and_download("ensembl_hg38_gtf")
297
+ gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
298
+ else:
299
+ log.write(" -Using user-provided gtf:{}".format(gtf_path))
300
+ gtf_path = gtf_to_all_gene(gtf_path,log=log,verbose=verbose)
301
+
288
302
  gtf_db_path = gtf_path[:-2]+"db"
289
303
  data = Genome(
290
304
  reference_name='GRCh38',
@@ -300,8 +314,13 @@ def annogene(
300
314
  if build=="19":
301
315
  log.write(" -Assigning Gene name using NCBI refseq latest GRCh37 for protein coding genes", verbose=verbose)
302
316
  #gtf_path = check_and_download("refseq_hg19_gtf_protein_coding")
303
- gtf_path = check_and_download("refseq_hg19_gtf")
304
- gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
317
+ if gtf_path is None:
318
+ gtf_path = check_and_download("refseq_hg19_gtf")
319
+ gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
320
+ else:
321
+ log.write(" -Using user-provided gtf:{}".format(gtf_path))
322
+ gtf_path = gtf_to_all_gene(gtf_path,log=log,verbose=verbose)
323
+
305
324
  gtf_db_path = gtf_path[:-2]+"db"
306
325
  data = Genome(
307
326
  reference_name='GRCh37',
@@ -315,8 +334,13 @@ def annogene(
315
334
  elif build=="38":
316
335
  log.write(" -Assigning Gene name using NCBI refseq latest GRCh38 for protein coding genes", verbose=verbose)
317
336
  #gtf_path = check_and_download("refseq_hg38_gtf_protein_coding")
318
- gtf_path = check_and_download("refseq_hg38_gtf")
319
- gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
337
+ if gtf_path is None:
338
+ gtf_path = check_and_download("refseq_hg38_gtf")
339
+ gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
340
+ else:
341
+ log.write(" -Using user-provided gtf:{}".format(gtf_path))
342
+ gtf_path = gtf_to_all_gene(gtf_path,log=log,verbose=verbose)
343
+
320
344
  gtf_db_path = gtf_path[:-2]+"db"
321
345
  data = Genome(
322
346
  reference_name='GRCh38',