DBPeaks 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
DBPeaks.py
ADDED
|
@@ -0,0 +1,810 @@
|
|
|
1
|
+
#/usr/bin/python
|
|
2
|
+
|
|
3
|
+
__author__ = "Sander Granneman"
|
|
4
|
+
__copyright__ = "Copyright 2024"
|
|
5
|
+
__version__ = "0.0.3"
|
|
6
|
+
__credits__ = ["Sander Granneman"]
|
|
7
|
+
__maintainer__ = "Sander Granneman"
|
|
8
|
+
__email__ = "Sander.Granneman@ed.ac.uk"
|
|
9
|
+
__status__ = "beta"
|
|
10
|
+
|
|
11
|
+
import os
|
|
12
|
+
import re
|
|
13
|
+
import csv
|
|
14
|
+
import sys
|
|
15
|
+
import pybedtools
|
|
16
|
+
import argparse
|
|
17
|
+
import subprocess
|
|
18
|
+
import numpy as np
|
|
19
|
+
import pandas as pd
|
|
20
|
+
|
|
21
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
22
|
+
from collections import defaultdict
|
|
23
|
+
from pydeseq2.dds import DeseqDataSet
|
|
24
|
+
from pydeseq2.default_inference import DefaultInference
|
|
25
|
+
from pydeseq2.ds import DeseqStats
|
|
26
|
+
from pyCRAC.Parsers import GTF2
|
|
27
|
+
from pyCRAC.Methods import numpy_overlap
|
|
28
|
+
from pyCRAC.Classes.NGSFormatWriters import NGSFileWriter
|
|
29
|
+
|
|
30
|
+
def getGeneIDs(string):
    """Return the unique gene_id values found in a GTF attribute string.

    Every occurrence of ``gene_id "<value>"`` in ``string`` is collected.
    Duplicates are removed and the values are returned in order of first
    appearance, so the result is reproducible between runs (going through a
    plain ``set`` yields an arbitrary order for strings).

    Args:
        string: Text containing zero or more ``gene_id "..."`` attributes.

    Returns:
        A list of unique gene_id strings; empty when nothing matches.
    """
    matches = re.findall(r'gene_id\s+"([^"]+)"', string)
    # dict.fromkeys de-duplicates while keeping insertion order.
    return list(dict.fromkeys(matches))
|
|
34
|
+
|
|
35
|
+
def getGeneNames(string):
    """Return the unique gene_name values found in a GTF attribute string.

    Every occurrence of ``gene_name "<value>"`` in ``string`` is collected.
    Duplicates are removed and the values are returned in order of first
    appearance, so the result is reproducible between runs (going through a
    plain ``set`` yields an arbitrary order for strings).

    Args:
        string: Text containing zero or more ``gene_name "..."`` attributes.

    Returns:
        A list of unique gene_name strings; empty when nothing matches.
    """
    matches = re.findall(r'gene_name\s+"([^"]+)"', string)
    # dict.fromkeys de-duplicates while keeping insertion order.
    return list(dict.fromkeys(matches))
|
|
39
|
+
|
|
40
|
+
def rowToGTF(row):
    """ Formats one merged-interval record as a single tab-separated GTF line.

    ``row`` is read through the keys 'chrom', 'start', 'end', 'name',
    'gene_ids' and 'gene_names'. The source and feature columns are fixed to
    "cluster" and "interval", the value stored under 'name' is written to the
    strand column, and score and frame are written as ".". The returned string
    carries no trailing newline.
    """
    fields = (
        f"{row['chrom']}",
        "cluster",
        "interval",
        f"{row['start']}",
        f"{row['end']}",
        ".",
        f"{row['name']}",
        ".",
        f"gene_id \"{row['gene_ids']}\"; gene_name \"{row['gene_names']}\";",
    )
    return "\t".join(fields)
|
|
43
|
+
|
|
44
|
+
def mergeGTFintervals(gtf_files, reproducibility=0.9, output_file_name=None):
    """Merge the intervals of several GTF files and keep the reproducible ones.

    The files are concatenated, sorted and merged with pybedtools
    (``merge(s=True, c=[7, 9], o='collapse', delim='')``), which collapses the
    strand and attribute columns of all intervals that end up in one merged
    interval. A merged interval is kept when the length of its collapsed
    strand string is at least ``len(gtf_files) * reproducibility``.

    Args:
        gtf_files: Paths of the tab-separated GTF files to merge.
        reproducibility: Multiplied with the number of files to obtain the
            minimum number of collapsed strand characters (default 0.9).
            NOTE(review): this is used as a fraction here, while the
            command-line option is described as a percentage - confirm that
            the caller converts it.
        output_file_name: Path of the GTF file to write; "merged.gtf" when
            empty or None.

    Returns:
        True when the merged GTF file was written. False when the output file
        already exists (it is never overwritten) or when no interval passed
        the reproducibility filter.
    """
    if not output_file_name:
        output_file_name = "merged.gtf"

    if os.path.exists(output_file_name):
        sys.stderr.write(f"\tOutput file {output_file_name} already exists!\n")
        return False

    ### Concatenating the files and storing them in a pandas dataframe:
    data_frames = [pd.read_csv(i, sep='\t', comment='#', index_col=None, header=None) for i in gtf_files]
    merged_data = pd.concat(data_frames, ignore_index=True)

    ### Sorting the merged data:
    merged_data = merged_data.sort_values(by=[0, 1, 2, 3, 8])

    ### Loading the dataframe into pybedtools and merging the intervals:
    bedtools_data = pybedtools.BedTool.from_dataframe(merged_data)
    bedtools_data_merged = bedtools_data.merge(s=True, c=[7, 9], o='collapse', delim='')

    ### Converting the results back into a dataframe. The collapsed strands
    ### end up in the 'name' column and the collapsed attributes in 'score':
    bedtools_data_merged = pybedtools.BedTool.to_dataframe(bedtools_data_merged)

    ### Now only keeping peaks that were found in multiple replicates, based on the threshold:
    must_be_seen_in_reps = len(gtf_files) * float(reproducibility)
    reproducible = bedtools_data_merged['name'].apply(len) >= must_be_seen_in_reps
    # Work on a copy so that the column assignments below do not target a
    # slice of the unfiltered frame.
    bedtools_data_merged = bedtools_data_merged[reproducible].copy()

    # With no rows left, DataFrame.apply(axis=1) would hand back an empty
    # DataFrame (which has no to_list), so the failure is reported here.
    if bedtools_data_merged.empty:
        sys.stderr.write("ERROR! The data could not be merged!\n")
        return False

    ### Keep only the first character of the strand column:
    bedtools_data_merged['name'] = bedtools_data_merged['name'].str[0]

    ### Now extracting the gene_ids and gene_names, only keeping the unique ones:
    bedtools_data_merged['gene_ids'] = bedtools_data_merged['score'].apply(getGeneIDs).apply('|'.join)
    bedtools_data_merged['gene_names'] = bedtools_data_merged['score'].apply(getGeneNames).apply('|'.join)

    ### Now dropping the score column and resetting the index:
    bedtools_data_merged = bedtools_data_merged.drop(columns=['score']).reset_index()

    gtf_file_lines = bedtools_data_merged.apply(rowToGTF, axis=1).to_list()

    with open(output_file_name, 'w') as outfile:
        outfile.write("##gff-version 2\n")
        outfile.writelines(f"{gtf_line}\n" for gtf_line in gtf_file_lines)
    return True
|
|
111
|
+
|
|
112
|
+
def countReadsBam(bam_files,gtf_annotation_file,no_cpus=1,output_dir="bam_read_counts",blocks=False):
    """ Runs pyReadCounters.py (with --gtffile) on every input file, using a pool of worker threads.

    For each path in bam_files the tool is started with "--file_type sam",
    the annotation file passed via --gtf and "<output_dir>/<basename>" as the
    output prefix. When blocks is set, "--blocks --mutations nomuts" is added.
    A file is skipped (with a note on stderr) when
    "<prefix>_count_output_<reads|cDNAs>.gtf" already exists. The exit status
    of the external tool is not inspected.

    output_dir is created when missing; at most no_cpus commands run at the
    same time. Always returns True.
    """
    ### Creating the directory where the results will be stored:
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    ### The expected output file name depends on what is being counted:
    data_type = "cDNAs" if blocks else "reads"

    def launch(input_path, annotation_path, output_prefix):
        arguments = ["pyReadCounters.py", "-f", input_path,
                     "--file_type", "sam",
                     "--gtf", annotation_path,
                     "-v", "--gtffile",
                     "-o", output_prefix]
        if blocks:
            arguments += ["--blocks", "--mutations", "nomuts"]
        subprocess.run(arguments)

    ### Running pyReadCounters over multiple processors.
    with ThreadPoolExecutor(no_cpus) as pool:
        pending = []
        for bam_path in bam_files:
            # The output prefix handed to pyReadCounters:
            output_prefix = f"{output_dir}/{getFileBaseName(bam_path)}"
            # The file pyReadCounters is expected to produce from that prefix:
            expected_output = f"{output_prefix}_count_output_{data_type}.gtf"
            # If the file already exists, don't overwrite it.
            if os.path.exists(expected_output):
                sys.stderr.write(f"\tOutput file {expected_output} already exists!\n")
                continue
            pending.append(pool.submit(launch, bam_path, gtf_annotation_file, output_prefix))

        # Wait for all commands to complete (re-raises worker exceptions):
        for job in pending:
            job.result()

    return True
|
|
169
|
+
|
|
170
|
+
def countReadsGTF(bam_files,gtf_annotation_file,no_cpus=1,output_dir="peak_hittables",blocks=False):
    """ Runs pyReadCounters.py (with --hittable) on every input file, using a pool of worker threads.

    For each path in bam_files the tool is started with "--file_type sam",
    the annotation file passed via --gtf and "<output_dir>/<basename>" as the
    output prefix. When blocks is set, "--blocks --mutations nomuts" is added.
    A file is skipped (with a note on stderr) when
    "<prefix>_hittable_<reads|cDNAs>.txt" already exists. The exit status of
    the external tool is not inspected.

    output_dir is created when missing; at most no_cpus commands run at the
    same time. Nothing is returned.
    """
    ### Creating the directory where the results will be stored:
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    ### The expected output file name depends on what is being counted:
    data_type = "cDNAs" if blocks else "reads"

    def launch(input_path, annotation_path, output_prefix):
        arguments = ["pyReadCounters.py", "-f", input_path,
                     "--file_type", "sam",
                     "--gtf", annotation_path,
                     "-v", "--hittable",
                     "-o", output_prefix]
        if blocks:
            arguments += ["--blocks", "--mutations", "nomuts"]
        subprocess.run(arguments)

    ### Running pyReadCounters over multiple processors.
    with ThreadPoolExecutor(no_cpus) as pool:
        pending = []
        for bam_path in bam_files:
            # The output prefix handed to pyReadCounters:
            output_prefix = f"{output_dir}/{getFileBaseName(bam_path)}"
            # The file pyReadCounters is expected to produce from that prefix:
            expected_output = f"{output_prefix}_hittable_{data_type}.txt"
            # If the file already exists, don't overwrite it.
            if os.path.exists(expected_output):
                sys.stderr.write(f"\tOutput file {expected_output} already exists!\n")
                continue
            pending.append(pool.submit(launch, bam_path, gtf_annotation_file, output_prefix))

        # Wait for all commands to complete (re-raises worker exceptions):
        for job in pending:
            job.result()
|
|
224
|
+
|
|
225
|
+
def getPeaks(gtf_files,gtf_annotation_file,chromosome_file,no_cpus=1,min_peak_height=5,min_fdr=0.05,output_dir="peak_gtf_files"):
    """ Runs pyCalculateFDRs.py on every input GTF file, using a pool of worker threads.

    Each file is processed with the annotation file (--gtf), the chromosome
    file (-c), min_peak_height (--min) and min_fdr (-m). The result is
    requested as
    "<output_dir>/<basename>_FDR_<min_fdr>_min_peak_height_<min_peak_height>.gtf";
    a file is skipped (with a note on stderr) when that path already exists.
    The exit status of the external tool is not inspected.

    output_dir is created when missing; at most no_cpus commands run at the
    same time. Nothing is returned.
    """
    ### Creating the directory where the results will be stored:
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    def launch(input_path, annotation_path, chromosome_path, peak_height, fdr, result_path):
        arguments = ["pyCalculateFDRs.py", "-f", input_path,
                     "--gtf", annotation_path,
                     "-c", chromosome_path,
                     "--min", str(peak_height),
                     "-m", str(fdr),
                     "-v",
                     "-o", result_path]
        subprocess.run(arguments)

    ### Running pyCalculateFDRs over multiple processors.
    with ThreadPoolExecutor(no_cpus) as pool:
        pending = []
        for gtf_path in gtf_files:
            stem = getFileBaseName(gtf_path)
            result_path = f"{output_dir}/{stem}_FDR_{str(min_fdr)}_min_peak_height_{str(min_peak_height)}.gtf"
            # If the file already exists, don't overwrite it.
            if os.path.exists(result_path):
                sys.stderr.write(f"\tOutput file {result_path} already exists!\n")
                continue
            pending.append(pool.submit(launch, gtf_path, gtf_annotation_file, chromosome_file,
                                       min_peak_height, min_fdr, result_path))

        # Wait for all commands to complete (re-raises worker exceptions):
        for job in pending:
            job.result()
|
|
273
|
+
|
|
274
|
+
def filterPeaks(peak_gtf_files,by=None,output_dir="filtered_peak_files"):
    """ Filters the peaks in each GTF file by a threshold derived from that file's peak heights.

    The peak height is read from the sixth tab-separated column. Per input
    file the threshold is the mean, the median or the mean plus one standard
    deviation of all heights in that file; lines whose height is below the
    threshold are dropped, comment lines are copied unchanged.

    Args:
        peak_gtf_files: Paths of the peak GTF files.
        by: 'mean', 'median', 'mean_plus_std', or None / "None" for a
            threshold of 0. Any other value is reported on stderr and also
            results in a threshold of 0.
        output_dir: Directory (created when missing) that receives
            "<basename>_filtered_by_<by>_threshold.gtf" for each input file.

    Returns:
        Always True. Existing output files are not overwritten, and input
        files without any non-comment line produce no output file.
    """
    ### Creating the directory where the results will be stored:
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    ### Collecting the peak heights of every file:
    peak_heights = defaultdict(list)
    for file_name in peak_gtf_files:
        with open(file_name,'r') as infile:
            for line in infile:
                if not line.startswith("#"):
                    fld = line.strip().split('\t')
                    peak_heights[file_name].append(float(fld[5]))

    ### Calculating thresholds:
    data_thresholds = {}
    for file_name, heights in peak_heights.items():
        if by == 'mean':
            threshold = np.mean(heights)
        elif by == 'median':
            threshold = np.median(heights)
        elif by == 'mean_plus_std':
            threshold = np.mean(heights) + np.std(heights)
        elif by is None or by == "None":
            # The default argument is the None object, the command line
            # passes the string "None"; both mean "no filtering".
            threshold = 0
        else:
            sys.stderr.write("ERROR! Cannot figure out how you want to filter the peaks! Please use mean, median, mean_plus_std, or None\n")
            threshold = 0
        data_thresholds[file_name] = threshold

    ### Using these thresholds to remove peaks
    for file_name, threshold in data_thresholds.items():
        basename = getFileBaseName(file_name)
        output_file_name = f"{output_dir}/{basename}_filtered_by_{by}_threshold.gtf"
        if not os.path.exists(output_file_name) and os.path.exists(file_name):
            with open(file_name) as peak_file, open(output_file_name,"w") as outfile:
                for line in peak_file:
                    if line.startswith("#"):
                        outfile.write(line)
                    elif float(line.strip().split('\t')[5]) >= threshold:
                        outfile.write(line)
        else:
            sys.stderr.write(f"\tOutput file {output_file_name} already exists!\n")

    return True
|
|
336
|
+
|
|
337
|
+
def adjustPeakWidths(peak_gtf_files,chromosome_file,min_width=20,no_cpus=1,output_dir = "filtered_peak_files"):
    """ Runs pyNormalizeIntervalLengths.py on every peak GTF file, using a pool of worker threads.

    Each file is processed with the chromosome file (-c) and min_width
    (--min); the result is requested as
    "<output_dir>/<basename>_min_width_<min_width>.gtf". A file is skipped
    (with a note on stderr) when that path already exists. The exit status of
    the external tool is not inspected.

    output_dir is created when missing; at most no_cpus commands run at the
    same time. Always returns True.
    """
    ### Creating the directory where the results will be stored:
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    def launch(input_path, chromosome_path, result_path, width):
        arguments = ["pyNormalizeIntervalLengths.py", "-f", input_path,
                     "-c", chromosome_path,
                     "--min", str(width),
                     "-o", result_path]
        subprocess.run(arguments)

    ### Running pyNormalizeIntervalLengths over multiple processors.
    with ThreadPoolExecutor(no_cpus) as pool:
        pending = []
        for gtf_path in peak_gtf_files:
            stem = getFileBaseName(gtf_path)
            result_path = f"{output_dir}/{stem}_min_width_{str(min_width)}.gtf"
            # If the file already exists, don't overwrite it.
            if os.path.exists(result_path):
                sys.stderr.write(f"\tOutput file {result_path} already exists!\n")
                continue
            pending.append(pool.submit(launch, gtf_path, chromosome_file, result_path, min_width))

        # Wait for all commands to complete (re-raises worker exceptions):
        for job in pending:
            job.result()

    return True
|
|
376
|
+
|
|
377
|
+
def numberPeaks(peak_gtf_files,output_dir="filtered_peak_files"):
    """ Gives each peak a unique number to avoid a scenario where peaks end up having the same names,
    which can cause problems with downstream data analysis steps.

    For every input file, "<output_dir>/<basename>_numbered.gtf" is written in
    which "_peak_<n>" is appended to the gene_id and gene_name of each
    non-comment line; n starts at 1 for every file. Comment lines are copied
    unchanged. Lines in which gene_id or gene_name cannot be found are written
    to stderr and left out of the output.

    Args:
        peak_gtf_files: Paths of the GTF files to number. All of them are
            processed (previously the function returned after the first one).
        output_dir: Directory for the results, created when missing.

    Returns:
        True when every output file was written, False when at least one
        output file already existed and was therefore skipped.
    """
    ### Creating the directory where the results will be stored:
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    gene_id_pattern = re.compile(r'gene_id "([\(\)a-zA-Z_0-9\-,\'|]+?)";')
    gene_name_pattern = re.compile(r'gene_name "([\(\)a-zA-Z_0-9\-,\'|]+?)";')

    all_written = True
    for gtf_file in peak_gtf_files:
        basename = getFileBaseName(gtf_file)
        output_file_name = f"{output_dir}/{basename}_numbered.gtf"
        if os.path.exists(output_file_name):
            sys.stderr.write(f"\tOutput file {output_file_name} already exists!\n")
            all_written = False
            continue

        peak_number = 1
        # The input is opened first so that an unreadable input does not
        # leave an empty output file behind.
        with open(gtf_file,'r') as infile, open(output_file_name,"w") as outfile:
            for line in infile:
                if line.startswith("#"):
                    outfile.write(line)
                    continue

                id_match = gene_id_pattern.search(line)
                name_match = gene_name_pattern.search(line)
                if id_match is None or name_match is None:
                    # Cannot be numbered: report the line instead of writing it.
                    sys.stderr.write(line)
                    continue

                gene_id = id_match.group(1)
                gene_name = name_match.group(1)
                line = line.replace(f'gene_id \"{gene_id}\";',f'gene_id \"{gene_id}_peak_{peak_number}\";')
                line = line.replace(f'gene_name \"{gene_name}\";',f'gene_name \"{gene_name}_peak_{peak_number}\";')
                peak_number += 1
                outfile.write(line)

    return all_written
|
|
411
|
+
|
|
412
|
+
def getFileBaseName(file_name):
    """ Strips the directory part and the last extension from a file path. """
    leaf = os.path.basename(file_name)
    stem, _extension = os.path.splitext(leaf)
    return stem
|
|
415
|
+
|
|
416
|
+
def mergeHittables(hittables,outfile_name="merged_hittables.txt"):
    """ Merges pyReadCounters hittables into one tab-separated table.

    Each input file is read line by line: a line starting with "##" sets the
    current feature (its second whitespace-separated token), the
    "# total number of paired" / "# total number of single" lines add their
    last tab-separated field to the file's mapped read total, and every line
    starting with a letter or digit contributes a gene (first column) and its
    hit count (second column) to the current feature.

    The output starts with a "# gene" header naming the input files and a
    "# total mapped reads:" line. For every feature (sorted) it then holds a
    "## <feature>" line with the per-file sums, followed by one line per gene
    (sorted) with the per-file hits. Genes missing from a file are reported
    as 0.

    Args:
        hittables: Paths of the hittable files, in output column order.
        outfile_name: Path of the merged table to write.

    Returns:
        True when the merged table was written, False when outfile_name
        already exists (it is never overwritten).
    """
    if os.path.exists(outfile_name):
        sys.stderr.write(f"\tOutput file {outfile_name} already exists!\n")
        return False

    genes = defaultdict(set)
    data = dict()
    mappedreadsdata = dict()
    columns = [0,1]
    # Deliberately not reset per file: a file without a "##" line keeps
    # using the feature that was seen last.
    feature = str()

    for i in hittables:
        data[i] = defaultdict(lambda: defaultdict(int))
        mappedreads = 0
        with open(i,"r") as infile:
            for line in infile:
                if line.startswith("##"):
                    feature = line.strip().split()[1]
                elif line.startswith("# total number of paired"):
                    mappedreads += int(line.strip().split("\t")[-1])
                elif line.startswith("# total number of single"):
                    mappedreads += int(line.strip().split("\t")[-1])
                elif re.search("[A-Za-z0-9]",line[0]):
                    Fld = line.strip().split("\t")
                    gene,hits = Fld[columns[0]],Fld[columns[1]]
                    data[i][feature][gene] = float(hits)
                    genes[feature].add(gene)
        mappedreadsdata[i] = mappedreads

    # The context manager makes sure the table is flushed and closed before
    # the function returns.
    with open(outfile_name,"w") as outfile:
        outfile.write("# gene\t%s\n" % ("\t".join([getFileBaseName(i) for i in hittables])))
        outfile.write("# total mapped reads:\t%s\n" % ("\t".join([str(mappedreadsdata[i]) for i in hittables])))
        for feature in sorted(genes):
            sumoffeaturehits = "\t".join(
                [str(sum([data[i][feature][j] for j in genes[feature]])) for i in hittables]
            )
            outfile.write("\n## %s\t%s\n" % (feature,sumoffeaturehits))
            for gene in sorted(genes[feature]):
                hitstring = "\t".join([str(data[i][feature][gene]) for i in hittables])
                outfile.write("%s\t%s\n" % (gene,hitstring))
    return True
|
|
462
|
+
|
|
463
|
+
def runDESeq(merged_hittable,conditions,no_cpus=1,output_dir="DESeq2_results"):
    """ Runs the pydeseq2 differential analysis on a merged hittable and stores the results table.

    The merged hittable is read as a headerless, tab-separated table (comment
    lines skipped). Its first column becomes the gene index and the remaining
    columns are named after the entries of conditions, so conditions must
    hold one entry per data column. The same entries are used as the
    "Conditions" design factor.

    The results are written to
    "<output_dir>/<basename>_DESeq2_results.txt" (tab-separated); output_dir
    is created when missing. Returns True after writing, or False when that
    file already exists.
    """
    ### Creating the directory where the results will be stored:
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    ### Defining the output file:
    outfile_name = f"{output_dir}/{getFileBaseName(merged_hittable)}_DESeq2_results.txt"

    if os.path.exists(outfile_name):
        sys.stderr.write(f"\tOutput file {outfile_name} already exists!\n")
        return False

    ### Loading the counts; one column per sample, labelled by its condition:
    counts = pd.read_csv(merged_hittable,comment="#",sep="\t",header=None,index_col=None)
    counts.columns = ["gene", *conditions]
    counts = counts.set_index('gene')

    ### Sample sheet, indexed by the same labels as the count columns so
    ### that counts and metadata line up:
    sample_sheet = pd.DataFrame(counts.columns, columns=['gene'])
    sample_sheet['Conditions'] = conditions
    sample_sheet = sample_sheet.set_index('gene')

    ### Starting DESeq2
    inference = DefaultInference(n_cpus=no_cpus)
    dds = DeseqDataSet(
        counts=counts.T,
        metadata=sample_sheet,
        design_factors="Conditions",
        refit_cooks=True,
        inference=inference,
    )
    dds.deseq2()

    ### Getting and storing the final results:
    stat_res = DeseqStats(dds, inference=inference)
    stat_res.summary()
    stat_res.results_df.to_csv(outfile_name,sep="\t")
    return True
|
|
515
|
+
|
|
516
|
+
def addFoldChangeToGTF(deseq_results,merged_peaks,output_dir="./"):
    """ Adds the log2-fold changes and adjusted p-values calculated by DESeq2 to the peak GTF file.

    Every non-comment line of merged_peaks gets
    ' log2foldchange "<value>"; padj "<value>";' appended, where the values
    come from the row of the DESeq2 table whose "gene" column equals the
    line's gene_name (the first such row when there are several). Lines whose
    gene_name is not in the table, or in which no gene_name can be found, get
    "unknown" for both values. Comment lines are copied unchanged.

    Args:
        deseq_results: Tab-separated DESeq2 table with a header containing
            the columns "gene", "log2FoldChange" and "padj".
        merged_peaks: Path of the peak GTF file to annotate.
        output_dir: Directory for "<basename>_with_padj.gtf"; it is not
            created here.

    Returns:
        True when the annotated file was written, False when it already
        exists (it is never overwritten).
    """
    ### Setting the output file names:
    peak_outfile_name = f"{output_dir}/{getFileBaseName(merged_peaks)}_with_padj.gtf"

    ### If the output file already exist, then don't overwrite:
    if os.path.exists(peak_outfile_name):
        sys.stderr.write(f"\tOutput file {peak_outfile_name} already exist!\n")
        return False

    ### Loading the DESeq2 table BEFORE creating the output file, so that a
    ### failed read does not leave an empty output behind that would block
    ### every later run:
    deseq_data = pd.read_csv(deseq_results,comment="#",sep="\t",index_col=None,header=0)

    ### One lookup table instead of scanning the whole table for every peak;
    ### setdefault keeps the first row of a gene:
    stats_by_gene = {}
    for gene, log2fold_change, p_value in zip(deseq_data["gene"].values,
                                              deseq_data["log2FoldChange"].values,
                                              deseq_data["padj"].values):
        stats_by_gene.setdefault(gene, (log2fold_change, p_value))

    gene_name_pattern = re.compile(r'gene_name "([\(\)a-zA-Z_0-9\-,\'|]+?)";')

    ### Annotating the peak data:
    with open(merged_peaks,"r") as peak_file, open(peak_outfile_name,"w") as outfile:
        for line in peak_file:
            if not line.startswith("#"):
                match = gene_name_pattern.search(line)
                stats = stats_by_gene.get(match.group(1)) if match else None
                if stats is not None:
                    log2fold_change, p_value = stats
                    line = f"{line.strip()} log2foldchange \"{log2fold_change}\"; padj \"{p_value}\";\n"
                else:
                    line = f"{line.strip()} log2foldchange \"unknown\"; padj \"unknown\";\n"
            outfile.write(line)
    return True
|
|
546
|
+
|
|
547
|
+
def getSignificantPeaks(deseq_results,merged_peaks,fdr_threshold=0.05,output_dir="DESeq2_results"):
    """ Writes the DESeq2 rows with padj <= fdr_threshold and the matching peaks to new files.

    Two files are produced in output_dir:
    "<deseq basename>_FDR_<fdr_threshold>.txt" with the filtered DESeq2 table
    and "<peaks basename>_FDR_<fdr_threshold>.gtf" with the lines of
    merged_peaks whose gene_name occurs in the first column of the filtered
    table. merged_peaks is read as a headerless, nine-column, tab-separated
    file with comment lines skipped, so comment lines are not carried over.

    Only the existence of the peak output file is checked: returns False
    without writing anything when it is already there, True otherwise.
    """
    ### Setting the output file names:
    fdr_label = str(fdr_threshold)
    deseq_outfile_name = f"{output_dir}/{getFileBaseName(deseq_results)}_FDR_{fdr_label}.txt"
    peak_outfile_name = f"{output_dir}/{getFileBaseName(merged_peaks)}_FDR_{fdr_label}.gtf"

    ### If the output files already exist, then don't overwrite:
    if os.path.exists(peak_outfile_name):
        sys.stderr.write(f"\tOutput files {peak_outfile_name} and {deseq_outfile_name} already exist!\n")
        return False

    ### Filtering and storing the DESeq2 results:
    all_results = pd.read_csv(deseq_results,comment="#",sep="\t",index_col=None,header=0)
    significant = all_results.loc[all_results['padj'] <= fdr_threshold]
    significant.to_csv(deseq_outfile_name,sep="\t",index=None)

    ### DE genes (first column of the filtered table):
    de_genes = list(significant[significant.columns[0]])

    ### Opening the peak data:
    peak_data = pd.read_csv(merged_peaks,index_col=None,header=None,comment="#",sep="\t")
    peak_data.columns = ['chrom','source','feature','start','end','score','strand','frame','annotations']

    ### Selecting the peaks whose gene_name belongs to a DE gene:
    peak_gene_names = peak_data['annotations'].str.extract(r'gene_name \"(.*?)\"', expand=False)
    selected_peaks = peak_data[peak_gene_names.isin(de_genes)]
    selected_peaks.to_csv(peak_outfile_name,sep="\t",header=False,index=False,quoting=csv.QUOTE_NONE)

    return True
|
|
582
|
+
|
|
583
|
+
|
|
584
|
+
def main():
    """Run the complete DBPeaks pipeline from the command line.

    The steps, in order, are: count reads in every bam file, call peaks,
    filter the peaks by height, enforce a minimum peak width, merge the peak
    intervals across files, number the merged peaks, count the reads that
    cover each merged peak, merge the resulting hit tables, run DESeq2, add
    the DESeq2 results to the peak GTF file and finally extract the
    significant peaks.

    All results are written below a directory named after ``--jobname``.
    The parser exits with an argparse error (exit status 2) when no sample
    or no control bam files are supplied.
    """
    # argparse adds the "usage: " prefix itself, so it must not be repeated here.
    parser = argparse.ArgumentParser(
        usage="%(prog)s [options]",
        description="A tool for identifying differential RNA-binding sites in CLIP/CRAC datasets",
    )

    files = parser.add_argument_group("File input options")
    files.add_argument("--samples", dest="samples", nargs="*", metavar="clip_samples.bam",
                       help="Paths to the bam files containing the replicate clip samples you want to compare", default=None)
    files.add_argument("--controls", dest="controls", nargs="*", metavar="control_clip_samples.bam",
                       help="Paths to bam files containing replicate control clip samples.", default=None)
    files.add_argument("-c", "--chromfile", dest="chromfile", type=str,
                       help="Location of the chromosome info file. This file should have two columns: "
                            "first column is the names of the chromosomes, second column is length of the chromosomes.",
                       default=None)
    files.add_argument("--gtf", dest="gtf_annotation", type=str, metavar="yeast.gtf",
                       help="Path to GTF annotation file for your organism containing gene location information.", default=None)
    # The help text always advertised this default; it is now actually applied.
    files.add_argument("-j", "--jobname", dest="jobname", type=str, metavar="WT_vs_mutant",
                       help="provide a name for the job. Default = WT_vs_mutant", default="WT_vs_mutant")

    peaks = parser.add_argument_group("Peak calling settings")
    peaks.add_argument("-m", "--minfdr", dest="minfdr", type=float, metavar="0.05",
                       help="To set a minimal FDR threshold for filtering interval data. Default is 0.05", default=0.05)
    peaks.add_argument("--padj", dest="padj", metavar="0.05", type=float,
                       help="DESeq2 threshold for calling a DE peak. Default is 0.05", default=0.05)
    peaks.add_argument("--min", dest="min", metavar="5", type=int,
                       help="To set a minimal read coverage for a region. Regions with coverage less than "
                            "the minimum will be ignored. Default is 1", default=1)
    peaks.add_argument("--blocks", dest="blocks",
                       help="Add this flag if you want to consider reads with identical mapping coordinates once, regardless of sequence",
                       action="store_true", default=False)
    # NOTE(review): args.iterations is parsed but never used anywhere in this
    # function - confirm whether it should be forwarded to getPeaks.
    peaks.add_argument("--iterations", dest="iterations", metavar="100", type=int,
                       help="to set the number of iterations for randomization of read coordinates. Default=100", default=100)
    peaks.add_argument("-r", "--rep", dest="reproducibility", metavar="90", type=float,
                       help="To set in what percentage of the replicates the peak should be detected. Default=100", default=100.0)
    peaks.add_argument("--filter", dest="filter_peak_height", metavar="mean", type=str,
                       help="To filter the peaks in gtf files by a specific threshold. "
                            "Options are mean, median or mean plus one standard deviation (mean_plus_std) peak heights. "
                            "Default is no filtering.",
                       default="None", choices=["mean", "median", "mean_plus_std", "None"])
    peaks.add_argument("--min_peak_width", dest="min_peak_width", type=int, metavar="20",
                       help="To set the minimum width of a called peak. Default = 20", default=20)

    log = parser.add_argument_group("Logging options")
    log.add_argument("-v", "--verbose", action="store_true",
                     help="to print status messages to standard output", default=False)

    cpu = parser.add_argument_group("Number of CPUs needed for the analyses")
    cpu.add_argument("--cpu", dest="cpu", type=int, metavar="12",
                     help="The number of processors you want to use for the analyses. Default = 1", default=1)

    args = parser.parse_args()

    # Fail early, before anything is written to disk: both options default to
    # None, which would otherwise surface as a TypeError in list.extend().
    if not args.samples:
        parser.error("no sample bam files were provided (--samples)")
    if not args.controls:
        parser.error("no control bam files were provided (--controls)")

    ### Setting key parameters;

    # The data type is part of the names of the read count and hit table files.
    data_type = "cDNAs" if args.blocks else "reads"

    ### Running the code:

    # Making the directory where the results files are stored:
    storage_dir = args.jobname
    os.makedirs(storage_dir, exist_ok=True)

    # Getting read counts for all the bam files:
    if args.verbose:
        sys.stdout.write("### Getting gene counts from sample bam files....\n")

    # Samples always come first; the DESeq2 conditions below rely on this order.
    all_bam_files = [*args.samples, *args.controls]

    countReadsBam(all_bam_files,
                  args.gtf_annotation,
                  no_cpus=args.cpu,
                  output_dir=f"{storage_dir}/bam_read_counts"
                  )

    # Getting all the peak_files:
    if args.verbose:
        sys.stdout.write("### Finding peaks in the sample files....\n")

    file_basenames = [getFileBaseName(i) for i in all_bam_files]
    all_read_counters_files = [f"{storage_dir}/bam_read_counts/{i}_count_output_{data_type}.gtf" for i in file_basenames]

    getPeaks(all_read_counters_files,
             args.gtf_annotation,
             args.chromfile,
             no_cpus=args.cpu,
             min_peak_height=args.min,
             min_fdr=args.minfdr,
             output_dir=f"{storage_dir}/peak_gtf_files"
             )

    # Filtering the peaks by peak height:
    if args.verbose:
        sys.stdout.write(f"### Filtering the peaks by {args.filter_peak_height} values of peak heights....\n")

    read_counters_file_basenames = [getFileBaseName(i) for i in all_read_counters_files]
    peak_gtf_files = [f"{storage_dir}/peak_gtf_files/{i}_FDR_{str(args.minfdr)}_min_peak_height_{str(args.min)}.gtf"
                      for i in read_counters_file_basenames]

    filterPeaks(peak_gtf_files,
                by=args.filter_peak_height,
                output_dir=f"{storage_dir}/filtered_peak_files"
                )

    # Setting minimum peak widths:
    if args.verbose:
        sys.stdout.write(f"### Adjusting peak widths to a minimum of {args.min_peak_width}....\n")

    file_basenames = [getFileBaseName(i) for i in peak_gtf_files]
    filtered_peak_files = [f"{storage_dir}/filtered_peak_files/{i}_filtered_by_{args.filter_peak_height}_threshold.gtf"
                           for i in file_basenames]

    adjustPeakWidths(filtered_peak_files,
                     args.chromfile,
                     min_width=args.min_peak_width,
                     no_cpus=args.cpu,
                     output_dir=f"{storage_dir}/filtered_peak_files"
                     )

    # Merging the peak intervals for the sample files:
    if args.verbose:
        sys.stdout.write("### Merging peak intervals...\n")

    file_basenames = [getFileBaseName(i) for i in filtered_peak_files]
    filtered_peak_files = [f"{storage_dir}/filtered_peak_files/{i}_min_width_{args.min_peak_width}.gtf" for i in file_basenames]
    # The command line takes a percentage; mergeGTFintervals is given a fraction.
    reproducibility = args.reproducibility / 100.0
    merged_peak_output_file_name = f"{storage_dir}/filtered_peak_files/{args.jobname}_merged_peaks.gtf"

    mergeGTFintervals(filtered_peak_files,
                      reproducibility,
                      merged_peak_output_file_name,
                      )

    # Numbering the peaks:
    if args.verbose:
        sys.stdout.write("### Numbering the peaks....\n")

    numberPeaks([merged_peak_output_file_name],
                output_dir=f"{storage_dir}/filtered_peak_files"
                )

    # Performing pyReadCounters analysis on the bam files using the new GTF file containing the numbered peaks:
    if args.verbose:
        sys.stdout.write("### Counting peak coverage for each sample and control file....\n")

    annotation_file = f"{os.path.splitext(merged_peak_output_file_name)[0]}_numbered.gtf"

    # Making sure the input file actually exists!:
    if os.path.exists(annotation_file):
        countReadsGTF([*args.samples, *args.controls],
                      gtf_annotation_file=annotation_file,
                      no_cpus=args.cpu,
                      output_dir=f"{storage_dir}/peak_hittables"
                      )
    else:
        # This branch is reached when the numbered GTF file is missing, so the
        # message must say so (it used to claim that the file already exists).
        # NOTE(review): the run still continues after this message, exactly as
        # before - confirm whether it should stop here instead.
        sys.stderr.write(f"The file {os.path.basename(annotation_file)} does not exist!\n")

    # Merging the hittables:
    if args.verbose:
        sys.stdout.write("Merging the peak read coverage hit tables.\n")

    sample_file_base_names = [getFileBaseName(i) for i in args.samples]
    control_file_base_names = [getFileBaseName(i) for i in args.controls]

    sample_file_hittables = [f"{storage_dir}/peak_hittables/{i}_hittable_{data_type}.txt" for i in sample_file_base_names]
    control_file_hittables = [f"{storage_dir}/peak_hittables/{i}_hittable_{data_type}.txt" for i in control_file_base_names]

    tables_to_merge = [*sample_file_hittables, *control_file_hittables]
    hittable_name = f"{storage_dir}/peak_hittables/{args.jobname}_merged_hittables.txt"

    if not os.path.exists(hittable_name):
        mergeHittables(tables_to_merge, hittable_name)
    else:
        sys.stderr.write(f"\tOutput file {os.path.basename(hittable_name)} already exists!\n")

    # Running the DESeq2 analyses:
    # Setting the testing conditions:
    if args.verbose:
        sys.stdout.write("### Running the DESeq2 analyses....\n")

    # One label per hit table, in the same order as tables_to_merge.
    conditions = len(args.samples) * ["WT"] + len(args.controls) * ["control"]

    # Author's note on the sign of the fold changes: "WT" samples are listed
    # first in the conditions list, followed by the "control" samples, and
    # "WT" is expected to be the reference level. This would mean:
    # POSITIVE log2FC: Indicates higher expression in "control" relative to "WT".
    # NEGATIVE log2FC: Indicates higher expression in "WT" relative to "control".
    # NOTE(review): which level is the reference depends on how runDESeq sets
    # up the contrast - confirm against runDESeq.

    runDESeq(hittable_name,
             conditions,
             no_cpus=args.cpu,
             output_dir=f"{storage_dir}/DESeq2_results"
             )

    # Adding log2-fold changes to the peak GTF file:
    deseq_table_name = f"{storage_dir}/DESeq2_results/{getFileBaseName(hittable_name)}_DESeq2_results.txt"

    addFoldChangeToGTF(deseq_table_name,
                       annotation_file,
                       output_dir=f"{storage_dir}/DESeq2_results"
                       )

    # Extracting the significant peaks
    annotation_file_with_padj = f"{storage_dir}/DESeq2_results/{getFileBaseName(annotation_file)}_with_padj.gtf"

    if args.verbose:
        sys.stdout.write("#### Extracting significant peaks....\n")

    getSignificantPeaks(deseq_table_name,
                        annotation_file_with_padj,
                        fdr_threshold=args.padj,
                        output_dir=f"{storage_dir}/DESeq2_results"
                        )


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: DBPeaks
|
|
3
|
+
Version: 0.0.3
|
|
4
|
+
Summary: A tool for identifying differentially bound peaks in CLIP/CRAC data
|
|
5
|
+
Author-email: Sander Granneman <Sander.Granneman@ed.ac.uk>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Classifier: Development Status :: 3 - Alpha
|
|
8
|
+
Classifier: Intended Audience :: Science/Research
|
|
9
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Requires-Python: >=3.8
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
License-File: LICENSE
|
|
18
|
+
Requires-Dist: pybedtools
|
|
19
|
+
Requires-Dist: numpy
|
|
20
|
+
Requires-Dist: pandas
|
|
21
|
+
Requires-Dist: pydeseq2==0.4.9
|
|
22
|
+
Requires-Dist: pyCRAC==1.5.2
|
|
23
|
+
Dynamic: license-file
|
|
24
|
+
|
|
25
|
+
# DBPeaks: Differential RNA-Binding Site Analysis Tool
|
|
26
|
+
|
|
27
|
+
## Contents
|
|
28
|
+
|
|
29
|
+
- [Introduction](#introduction)
|
|
30
|
+
|
|
31
|
+
- [Repo Contents](#repo-contents)
|
|
32
|
+
|
|
33
|
+
- [Features](#features)
|
|
34
|
+
|
|
35
|
+
- [System Requirements](#system-requirements)
|
|
36
|
+
|
|
37
|
+
- [Installation Guide](#installation-guide)
|
|
38
|
+
|
|
39
|
+
- [License](./LICENSE)
|
|
40
|
+
|
|
41
|
+
- [Citation](#citation)
|
|
42
|
+
|
|
43
|
+
- [Contact](#contact)
|
|
44
|
+
|
|
45
|
+
## Introduction
|
|
46
|
+
|
|
47
|
+
DBPeaks is a Python-based command-line tool designed for the identification and analysis of differential RNA-binding sites in CLIP/CRAC datasets. It integrates various bioinformatics tools and methods to process sequencing data, identify peaks, and perform statistical analyses to detect significant differences in RNA-binding across conditions. It does so by first analysing the peaks in each individual file and it then checks whether peaks are found in the same regions. These peaks need to have overlapping genome mapping coordinates. All overlapping peaks will then be merged into a single peak interval and for each interval the program will then calculate the total number of reads covering that genomic interval. DESeq2 will then be used to determine if the read counts for that interval are statistically significantly different between sample and control files.
|
|
48
|
+
|
|
49
|
+
It requires CLIP/CRAC data BAM files as input as well as GTF and genome files for the model organism.
|
|
50
|
+
Make sure your GTF annotation file does not have any silly formatting mistakes, otherwise the program will not run.
|
|
51
|
+
Example genome files for yeast are available in this repository.
|
|
52
|
+
|
|
53
|
+
NOTE! DBPeaks was SPECIFICALLY designed to analyse CLIP/CRAC datasets from two different conditions or by comparing data from WT vs mutant RBPs. It was NOT designed to compare RBP CLIP datasets to control datasets that have substantially lower read counts (i.e. data from untagged strains or no UV cross-linking controls). Should you be stubborn and still decide to use DBPeaks for this purpose, you will get rubbish results!
|
|
54
|
+
|
|
55
|
+
It is really important that all bam files have a good number of reads and that there is not a huge difference in read depth between the files. This will make DESeq2 much happier and will therefore improve the results.
|
|
56
|
+
|
|
57
|
+
We have tried many different tools that do similar things. However, we were either not able to get them running on our servers or they were not able to detect clearly differentially bound (DB) peaks in our data. We have not benchmarked DBPeaks to existing tools so we do not yet know how well it performs compared to most popular peak calling methods. All I can say is that on OUR data where we removed an RBP binding site in the genome it performs better than existing tools such as MACS3 and Peakachu. DBPeaks was able to detect loss of binding in that single genomic location. The other tools we tested could not. DBPeaks is, however, slower than most existing tools. This is because it relies on pyCalculateFDRs from the pyCRAC package to call peaks. This script looks for peaks in each individual gene annotated in the genome and takes read coverage of the gene into consideration for this. So this part is rather slow if you have many features annotated in your genome file.
|
|
58
|
+
|
|
59
|
+
DBPeaks uses multiple CPUs to process the data and has the added advantage that it can also use replicates.
|
|
60
|
+
|
|
61
|
+
## Repo Contents
|
|
62
|
+
- [DBPeaks](./DBPeaks.py)
|
|
63
|
+
- [License](./LICENSE)
|
|
64
|
+
|
|
65
|
+
## Features
|
|
66
|
+
|
|
67
|
+
Comprehensive Analysis Pipeline: From reading BAM files to statistical analysis with DESeq2.
|
|
68
|
+
Parallel Processing: Utilizes multiple CPUs to speed up the analysis.
|
|
69
|
+
Flexible Input Options: Supports various configurations and customizations through command-line options.
|
|
70
|
+
Integrated Peak Calling and Filtering: Includes functionality for peak detection, filtering based on reproducibility, and adjustment of peak widths.
|
|
71
|
+
Statistical Analysis: Incorporates DESeq2 for rigorous differential analysis.
|
|
72
|
+
|
|
73
|
+
## Installation Guide
|
|
74
|
+
|
|
75
|
+
### Prerequisites
|
|
76
|
+
|
|
77
|
+
Python 3.8 or higher
|
|
78
|
+
Dependencies: pybedtools, numpy, pandas, pydeseq2, pyCRAC, and others as listed in requirements.txt.
|
|
79
|
+
Steps
|
|
80
|
+
|
|
81
|
+
Clone the repository:
|
|
82
|
+
|
|
83
|
+
```
|
|
84
|
+
git clone https://git.ecdf.ed.ac.uk/sgrannem/dbpeaks.git
|
|
85
|
+
cd dbpeaks
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### Install required Python packages:
|
|
89
|
+
|
|
90
|
+
```
|
|
91
|
+
pip install -r requirements.txt
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### Install DBPeaks:
|
|
95
|
+
|
|
96
|
+
```
|
|
97
|
+
cd dbpeaks
|
|
98
|
+
pip install -e . --user
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### Running DBPeaks
|
|
102
|
+
|
|
103
|
+
DBPeaks is run from the command line. Here is a basic example to get you started:
|
|
104
|
+
|
|
105
|
+
python DBPeaks.py --samples path/to/sample1.bam path/to/sample2.bam --controls path/to/control1.bam path/to/control2.bam --gtf path/to/annotation.gtf --chromfile path/to/chrominfo.txt --jobname ExampleAnalysis
|
|
106
|
+
|
|
107
|
+
### Command-Line Options
|
|
108
|
+
|
|
109
|
+
--samples: Specify paths to the BAM files for the sample group.
|
|
110
|
+
--controls: Specify paths to the BAM files for the control group.
|
|
111
|
+
--gtf: Path to the GTF annotation file.
|
|
112
|
+
--chromfile: Location of the chromosome info file. This file should have two columns: first column is the names of the chromosomes, second column is length of the chromosomes.
|
|
113
|
+
--jobname: A name for the job to organize output files.
|
|
114
|
+
|
|
115
|
+
### Additional options for peak calling, filtering, and statistical thresholds can be viewed using the help option:
|
|
116
|
+
|
|
117
|
+
```
|
|
118
|
+
DBPeaks.py --help
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Peak calling settings:
|
|
122
|
+
-m 0.05, --minfdr 0.05 To set a minimal FDR threshold for filtering interval data. Default is 0.05
|
|
123
|
+
|
|
124
|
+
This is a setting that is used when running pyCalculateFDRs. If you end up getting a lot of peaks in your data,
|
|
125
|
+
it is recommended to change this threshold, let's say to 0.01 as this will reduce the number of significantly enriched peaks
|
|
126
|
+
in your data.
|
|
127
|
+
|
|
128
|
+
--padj 0.05 DESeq2 threshold for calling a peak DB. Default is 0.05
|
|
129
|
+
|
|
130
|
+
If you hardly get any DB peaks, then it may be worth slightly adjusting this threshold.
|
|
131
|
+
However, in this scenario, it may also be the case that your samples just have too much variability.
|
|
132
|
+
It may then be wise to do a PCA analysis on your data to see if replicates are indeed grouped together.
|
|
133
|
+
|
|
134
|
+
--min 5 to set a minimal read coverages for a region. Regions with coverage less than minimum will be ignored
|
|
135
|
+
|
|
136
|
+
--blocks Add this flag if you want to consider reads with identical mapping coordinates once, regardless of sequence.
|
|
137
|
+
|
|
138
|
+
NOTE! This is a HUGELY important flag! Setting --blocks will remove any 'towers' in your data and collapse them into
|
|
139
|
+
one single interval. This can completely change the shape and height of the peak and the peak may no longer be detected.
|
|
140
|
+
However, if you suspect that your library is of low complexity and you see many of these blocks or towers in your genome browser, then I would recommend adding this flag as I have seen that this can improve the reliability of the final DESeq2 analyses.
|
|
141
|
+
|
|
142
|
+
--iterations 100 to set the number of iterations for randomization of read coordinates. Default=100
|
|
143
|
+
|
|
144
|
+
This is important for the peak calling analysis by the pyCalculateFDR.py script.
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
-r 90, --rep 90 To set in what percentage of the replicates the peak should be detected. Default=100
|
|
148
|
+
|
|
149
|
+
Let's say you have three sample and three control bam files and you set -r to 50, then peaks that are, for example, present in the sample files but absent in the control files will also be considered. If you, in this scenario, set -r to 100, then the tool will expect to find overlapping peaks at any given position for ALL samples! So you may miss peaks that were, for example, only present in your sample but not in the control!
|
|
150
|
+
|
|
151
|
+
--filter mean To filter the peaks in gtf files by a specific threshold. Options are mean, median or mean plus one standard deviation
|
|
152
|
+
(mean_plus_std) peak heights. Default is no filtering.
|
|
153
|
+
|
|
154
|
+
I would always recommend starting with no filtering. If you get too many DB peaks, then I would start with --filter mean and then --filter median.
|
|
155
|
+
|
|
156
|
+
--min_peak_width 20 To set the minimum width of a called peak. Default = 20
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
## Contributing to further improving DBPeaks
|
|
160
|
+
|
|
161
|
+
Contributions to DBPeaks are welcome! Please fork the repository and submit pull requests with your enhancements.
|
|
162
|
+
We will also be including some test data on the repository soon!
|
|
163
|
+
|
|
164
|
+
## License
|
|
165
|
+
|
|
166
|
+
This project is licensed under the Apache License - see the LICENSE file for details.
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
## Citation
|
|
170
|
+
|
|
171
|
+
DBPeaks was developed to analyse CRAC data for a manuscript that we are about to submit. This will be updated once the paper has been accepted or put on a preprint server.
|
|
172
|
+
|
|
173
|
+
## Contact
|
|
174
|
+
|
|
175
|
+
For support or to report issues, please contact Sander Granneman at Sander.Granneman@ed.ac.uk, University of Edinburgh.
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
DBPeaks.py,sha256=H7ZSGxxK4hvEbpN52jXz0aojXENwF1AYY3wlQfj57UM,34794
|
|
2
|
+
dbpeaks-0.0.3.dist-info/licenses/LICENSE,sha256=DU9_yiwF7Kz7teRaufuoFy-WJ8abkKbkzkn8e2wVIvw,11338
|
|
3
|
+
dbpeaks-0.0.3.dist-info/METADATA,sha256=Udp6wfLHzKEbCBZsWuyfpD83UWl677y1sWlWU6d9H9c,9356
|
|
4
|
+
dbpeaks-0.0.3.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
5
|
+
dbpeaks-0.0.3.dist-info/entry_points.txt,sha256=Aa_zwEkr9vFVgLr7YffiVUGsi3asuy3pyfzd9KIwjY4,41
|
|
6
|
+
dbpeaks-0.0.3.dist-info/top_level.txt,sha256=x9WZWRV0ZCfB8TEjokW0A5nJGNf9XoGC_T3u1c7TC3E,8
|
|
7
|
+
dbpeaks-0.0.3.dist-info/RECORD,,
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
Apache License
|
|
2
|
+
Version 2.0, January 2004
|
|
3
|
+
http://www.apache.org/licenses/
|
|
4
|
+
|
|
5
|
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
|
6
|
+
|
|
7
|
+
1. Definitions.
|
|
8
|
+
|
|
9
|
+
"License" shall mean the terms and conditions for use, reproduction,
|
|
10
|
+
and distribution as defined by Sections 1 through 9 of this document.
|
|
11
|
+
|
|
12
|
+
"Licensor" shall mean the copyright owner or entity authorized by
|
|
13
|
+
the copyright owner that is granting the License.
|
|
14
|
+
|
|
15
|
+
"Legal Entity" shall mean the union of the acting entity and all
|
|
16
|
+
other entities that control, are controlled by, or are under common
|
|
17
|
+
control with that entity. For the purposes of this definition,
|
|
18
|
+
"control" means (i) the power, direct or indirect, to cause the
|
|
19
|
+
direction or management of such entity, whether by contract or
|
|
20
|
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
|
21
|
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
|
22
|
+
|
|
23
|
+
"You" (or "Your") shall mean an individual or Legal Entity
|
|
24
|
+
exercising permissions granted by this License.
|
|
25
|
+
|
|
26
|
+
"Source" form shall mean the preferred form for making modifications,
|
|
27
|
+
including but not limited to software source code, documentation
|
|
28
|
+
source, and configuration files.
|
|
29
|
+
|
|
30
|
+
"Object" form shall mean any form resulting from mechanical
|
|
31
|
+
transformation or translation of a Source form, including but
|
|
32
|
+
not limited to compiled object code, generated documentation,
|
|
33
|
+
and conversions to other media types.
|
|
34
|
+
|
|
35
|
+
"Work" shall mean the work of authorship, whether in Source or
|
|
36
|
+
Object form, made available under the License, as indicated by a
|
|
37
|
+
copyright notice that is included in or attached to the work
|
|
38
|
+
(an example is provided in the Appendix below).
|
|
39
|
+
|
|
40
|
+
"Derivative Works" shall mean any work, whether in Source or Object
|
|
41
|
+
form, that is based on (or derived from) the Work and for which the
|
|
42
|
+
editorial revisions, annotations, elaborations, or other modifications
|
|
43
|
+
represent, as a whole, an original work of authorship. For the purposes
|
|
44
|
+
of this License, Derivative Works shall not include works that remain
|
|
45
|
+
separable from, or merely link (or bind by name) to the interfaces of,
|
|
46
|
+
the Work and Derivative Works thereof.
|
|
47
|
+
|
|
48
|
+
"Contribution" shall mean any work of authorship, including
|
|
49
|
+
the original version of the Work and any modifications or additions
|
|
50
|
+
to that Work or Derivative Works thereof, that is intentionally
|
|
51
|
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
|
52
|
+
or by an individual or Legal Entity authorized to submit on behalf of
|
|
53
|
+
the copyright owner. For the purposes of this definition, "submitted"
|
|
54
|
+
means any form of electronic, verbal, or written communication sent
|
|
55
|
+
to the Licensor or its representatives, including but not limited to
|
|
56
|
+
communication on electronic mailing lists, source code control systems,
|
|
57
|
+
and issue tracking systems that are managed by, or on behalf of, the
|
|
58
|
+
Licensor for the purpose of discussing and improving the Work, but
|
|
59
|
+
excluding communication that is conspicuously marked or otherwise
|
|
60
|
+
designated in writing by the copyright owner as "Not a Contribution."
|
|
61
|
+
|
|
62
|
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
|
63
|
+
on behalf of whom a Contribution has been received by Licensor and
|
|
64
|
+
subsequently incorporated within the Work.
|
|
65
|
+
|
|
66
|
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
|
67
|
+
this License, each Contributor hereby grants to You a perpetual,
|
|
68
|
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
|
69
|
+
copyright license to reproduce, prepare Derivative Works of,
|
|
70
|
+
publicly display, publicly perform, sublicense, and distribute the
|
|
71
|
+
Work and such Derivative Works in Source or Object form.
|
|
72
|
+
|
|
73
|
+
3. Grant of Patent License. Subject to the terms and conditions of
|
|
74
|
+
this License, each Contributor hereby grants to You a perpetual,
|
|
75
|
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
|
76
|
+
(except as stated in this section) patent license to make, have made,
|
|
77
|
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
|
78
|
+
where such license applies only to those patent claims licensable
|
|
79
|
+
by such Contributor that are necessarily infringed by their
|
|
80
|
+
Contribution(s) alone or by combination of their Contribution(s)
|
|
81
|
+
with the Work to which such Contribution(s) was submitted. If You
|
|
82
|
+
institute patent litigation against any entity (including a
|
|
83
|
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
|
84
|
+
or a Contribution incorporated within the Work constitutes direct
|
|
85
|
+
or contributory patent infringement, then any patent licenses
|
|
86
|
+
granted to You under this License for that Work shall terminate
|
|
87
|
+
as of the date such litigation is filed.
|
|
88
|
+
|
|
89
|
+
4. Redistribution. You may reproduce and distribute copies of the
|
|
90
|
+
Work or Derivative Works thereof in any medium, with or without
|
|
91
|
+
modifications, and in Source or Object form, provided that You
|
|
92
|
+
meet the following conditions:
|
|
93
|
+
|
|
94
|
+
(a) You must give any other recipients of the Work or
|
|
95
|
+
Derivative Works a copy of this License; and
|
|
96
|
+
|
|
97
|
+
(b) You must cause any modified files to carry prominent notices
|
|
98
|
+
stating that You changed the files; and
|
|
99
|
+
|
|
100
|
+
(c) You must retain, in the Source form of any Derivative Works
|
|
101
|
+
that You distribute, all copyright, patent, trademark, and
|
|
102
|
+
attribution notices from the Source form of the Work,
|
|
103
|
+
excluding those notices that do not pertain to any part of
|
|
104
|
+
the Derivative Works; and
|
|
105
|
+
|
|
106
|
+
(d) If the Work includes a "NOTICE" text file as part of its
|
|
107
|
+
distribution, then any Derivative Works that You distribute must
|
|
108
|
+
include a readable copy of the attribution notices contained
|
|
109
|
+
within such NOTICE file, excluding those notices that do not
|
|
110
|
+
pertain to any part of the Derivative Works, in at least one
|
|
111
|
+
of the following places: within a NOTICE text file distributed
|
|
112
|
+
as part of the Derivative Works; within the Source form or
|
|
113
|
+
documentation, if provided along with the Derivative Works; or,
|
|
114
|
+
within a display generated by the Derivative Works, if and
|
|
115
|
+
wherever such third-party notices normally appear. The contents
|
|
116
|
+
of the NOTICE file are for informational purposes only and
|
|
117
|
+
do not modify the License. You may add Your own attribution
|
|
118
|
+
notices within Derivative Works that You distribute, alongside
|
|
119
|
+
or as an addendum to the NOTICE text from the Work, provided
|
|
120
|
+
that such additional attribution notices cannot be construed
|
|
121
|
+
as modifying the License.
|
|
122
|
+
|
|
123
|
+
You may add Your own copyright statement to Your modifications and
|
|
124
|
+
may provide additional or different license terms and conditions
|
|
125
|
+
for use, reproduction, or distribution of Your modifications, or
|
|
126
|
+
for any such Derivative Works as a whole, provided Your use,
|
|
127
|
+
reproduction, and distribution of the Work otherwise complies with
|
|
128
|
+
the conditions stated in this License.
|
|
129
|
+
|
|
130
|
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
|
131
|
+
any Contribution intentionally submitted for inclusion in the Work
|
|
132
|
+
by You to the Licensor shall be under the terms and conditions of
|
|
133
|
+
this License, without any additional terms or conditions.
|
|
134
|
+
Notwithstanding the above, nothing herein shall supersede or modify
|
|
135
|
+
the terms of any separate license agreement you may have executed
|
|
136
|
+
with Licensor regarding such Contributions.
|
|
137
|
+
|
|
138
|
+
6. Trademarks. This License does not grant permission to use the trade
|
|
139
|
+
names, trademarks, service marks, or product names of the Licensor,
|
|
140
|
+
except as required for reasonable and customary use in describing the
|
|
141
|
+
origin of the Work and reproducing the content of the NOTICE file.
|
|
142
|
+
|
|
143
|
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
|
144
|
+
agreed to in writing, Licensor provides the Work (and each
|
|
145
|
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
|
146
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
147
|
+
implied, including, without limitation, any warranties or conditions
|
|
148
|
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
|
149
|
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
|
150
|
+
appropriateness of using or redistributing the Work and assume any
|
|
151
|
+
risks associated with Your exercise of permissions under this License.
|
|
152
|
+
|
|
153
|
+
8. Limitation of Liability. In no event and under no legal theory,
|
|
154
|
+
whether in tort (including negligence), contract, or otherwise,
|
|
155
|
+
unless required by applicable law (such as deliberate and grossly
|
|
156
|
+
negligent acts) or agreed to in writing, shall any Contributor be
|
|
157
|
+
liable to You for damages, including any direct, indirect, special,
|
|
158
|
+
incidental, or consequential damages of any character arising as a
|
|
159
|
+
result of this License or out of the use or inability to use the
|
|
160
|
+
Work (including but not limited to damages for loss of goodwill,
|
|
161
|
+
work stoppage, computer failure or malfunction, or any and all
|
|
162
|
+
other commercial damages or losses), even if such Contributor
|
|
163
|
+
has been advised of the possibility of such damages.
|
|
164
|
+
|
|
165
|
+
9. Accepting Warranty or Additional Liability. While redistributing
|
|
166
|
+
the Work or Derivative Works thereof, You may choose to offer,
|
|
167
|
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
|
168
|
+
or other liability obligations and/or rights consistent with this
|
|
169
|
+
License. However, in accepting such obligations, You may act only
|
|
170
|
+
on Your own behalf and on Your sole responsibility, not on behalf
|
|
171
|
+
of any other Contributor, and only if You agree to indemnify,
|
|
172
|
+
defend, and hold each Contributor harmless for any liability
|
|
173
|
+
incurred by, or claims asserted against, such Contributor by reason
|
|
174
|
+
of your accepting any such warranty or additional liability.
|
|
175
|
+
|
|
176
|
+
END OF TERMS AND CONDITIONS
|
|
177
|
+
|
|
178
|
+
APPENDIX: How to apply the Apache License to your work.
|
|
179
|
+
|
|
180
|
+
To apply the Apache License to your work, attach the following
|
|
181
|
+
boilerplate notice, with the fields enclosed by brackets "{}"
|
|
182
|
+
replaced with your own identifying information. (Don't include
|
|
183
|
+
the brackets!) The text should be enclosed in the appropriate
|
|
184
|
+
comment syntax for the file format. We also recommend that a
|
|
185
|
+
file or class name and description of purpose be included on the
|
|
186
|
+
same "printed page" as the copyright notice for easier
|
|
187
|
+
identification within third-party archives.
|
|
188
|
+
|
|
189
|
+
Copyright 2021 sgrannem
|
|
190
|
+
|
|
191
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
192
|
+
you may not use this file except in compliance with the License.
|
|
193
|
+
You may obtain a copy of the License at
|
|
194
|
+
|
|
195
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
196
|
+
|
|
197
|
+
Unless required by applicable law or agreed to in writing, software
|
|
198
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
199
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
200
|
+
See the License for the specific language governing permissions and
|
|
201
|
+
limitations under the License.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
DBPeaks
|