DBPeaks 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
DBPeaks.py ADDED
@@ -0,0 +1,810 @@
1
#!/usr/bin/env python3
# DBPeaks: a tool for identifying differential RNA-binding sites in CLIP/CRAC datasets.
# The interpreter line above needs the leading "#!" to be honoured when the
# file is executed directly; without the "!" it is just a comment.

# Package metadata.
__author__ = "Sander Granneman"
__copyright__ = "Copyright 2024"
__version__ = "0.0.3"
__credits__ = ["Sander Granneman"]
__maintainer__ = "Sander Granneman"
__email__ = "Sander.Granneman@ed.ac.uk"
__status__ = "beta"
10
+
11
+ import os
12
+ import re
13
+ import csv
14
+ import sys
15
+ import pybedtools
16
+ import argparse
17
+ import subprocess
18
+ import numpy as np
19
+ import pandas as pd
20
+
21
+ from concurrent.futures import ThreadPoolExecutor
22
+ from collections import defaultdict
23
+ from pydeseq2.dds import DeseqDataSet
24
+ from pydeseq2.default_inference import DefaultInference
25
+ from pydeseq2.ds import DeseqStats
26
+ from pyCRAC.Parsers import GTF2
27
+ from pyCRAC.Methods import numpy_overlap
28
+ from pyCRAC.Classes.NGSFormatWriters import NGSFileWriter
29
+
30
def getGeneIDs(string):
    """Return the unique gene_id values found in a string.

    Every ``gene_id "<value>"`` attribute in ``string`` is collected.
    Duplicates are dropped and the remaining ids keep the order in which they
    first occur, so the same input always gives the same list.

    Args:
        string: text containing zero or more ``gene_id "..."`` attributes.

    Returns:
        A list of unique gene ids (empty when none are found).
    """
    # dict.fromkeys() de-duplicates while preserving first-seen order; a plain
    # set() would return the ids in an arbitrary, run-dependent order.
    return list(dict.fromkeys(re.findall(r'gene_id\s+"([^"]+)"', string)))
34
+
35
def getGeneNames(string):
    """Return the unique gene_name values found in a string.

    Every ``gene_name "<value>"`` attribute in ``string`` is collected.
    Duplicates are dropped and the remaining names keep the order in which
    they first occur, so the same input always gives the same list.

    Args:
        string: text containing zero or more ``gene_name "..."`` attributes.

    Returns:
        A list of unique gene names (empty when none are found).
    """
    # dict.fromkeys() de-duplicates while preserving first-seen order; a plain
    # set() would return the names in an arbitrary, run-dependent order.
    return list(dict.fromkeys(re.findall(r'gene_name\s+"([^"]+)"', string)))
39
+
40
def rowToGTF(row):
    """Format one merged-interval record as a single GTF line (no newline).

    ``row`` must support lookup by the keys 'chrom', 'start', 'end', 'name',
    'gene_ids' and 'gene_names'. The 'name' value is written to the strand
    column; the source and feature columns are fixed to 'cluster' and
    'interval', and the score and frame columns are written as '.'.
    """
    columns = (
        f"{row['chrom']}",
        "cluster",
        "interval",
        f"{row['start']}",
        f"{row['end']}",
        ".",
        f"{row['name']}",
        ".",
        f"gene_id \"{row['gene_ids']}\"; gene_name \"{row['gene_names']}\";",
    )
    return "\t".join(columns)
43
+
44
def mergeGTFintervals(gtf_files, reproducibility=0.9, output_file_name=None):
    """Merge overlapping intervals from several GTF files into one GTF file.

    The input files are concatenated, sorted and merged strand-specifically
    with pybedtools. A merged interval is kept only when at least
    ``len(gtf_files) * reproducibility`` input intervals were collapsed into it.

    Args:
        gtf_files: paths to the tab-separated GTF files (one per replicate).
        reproducibility: fraction of the replicates in which an interval has
            to be present. Default is 0.9 (i.e. 90%).
        output_file_name: path of the merged GTF file. Defaults to
            "merged.gtf".

    Returns:
        True when the merged file was written. False when the output file
        already exists or when no interval could be merged/retained.
    """
    if not output_file_name:
        output_file_name = "merged.gtf"

    # Never overwrite an existing result.
    if os.path.exists(output_file_name):
        sys.stderr.write(f"\tOutput file {output_file_name} already exists!\n")
        return False

    ### Concatenating the files and storing them in a pandas dataframe.
    ### A file that only holds comment lines contains no intervals; it is
    ### skipped here but still counts as a replicate for the threshold below.
    data_frames = []
    for gtf_file in gtf_files:
        try:
            data_frames.append(pd.read_csv(gtf_file, sep='\t', comment='#', index_col=None, header=None))
        except pd.errors.EmptyDataError:
            sys.stderr.write(f"\tNo intervals found in {gtf_file}\n")

    if not data_frames:
        sys.stderr.write("ERROR! The data could not be merged!\n")
        return False

    merged_data = pd.concat(data_frames, ignore_index=True)

    ### Sorting the merged data:
    merged_data = merged_data.sort_values(by=[0, 1, 2, 3, 8])

    ### Loading the dataframe into pybedtools and merging the intervals.
    ### Columns 7 (strand) and 9 (attributes) are collapsed without delimiter.
    bedtools_data = pybedtools.BedTool.from_dataframe(merged_data)
    merged_intervals = bedtools_data.merge(s=True, c=[7, 9], o='collapse', delim='')

    ### Converting the results back into a dataframe. The collapsed strand
    ### column ends up in 'name' and the collapsed attributes in 'score'.
    merged = merged_intervals.to_dataframe()
    if merged.empty:
        sys.stderr.write("ERROR! The data could not be merged!\n")
        return False

    ### Now only keeping peaks that were found in multiple replicates.
    ### Each collapsed interval contributes one strand character to 'name',
    ### so the string length equals the number of merged intervals.
    must_be_seen_in_reps = len(gtf_files) * float(reproducibility)
    merged = merged[merged['name'].apply(len) >= must_be_seen_in_reps].copy()

    ### Keep only the first character of the collapsed strand column:
    merged['name'] = merged['name'].str[0]

    ### Now extracting the gene_ids and gene_names, only keeping the unique ones:
    merged['gene_ids'] = merged['score'].apply(getGeneIDs).apply(lambda ids: '|'.join(ids))
    merged['gene_names'] = merged['score'].apply(getGeneNames).apply(lambda names: '|'.join(names))

    ### The raw attribute strings are no longer needed:
    merged = merged.drop(columns=['score'])

    # NOTE(review): bedtools presumably reports merged intervals with a
    # 0-based start, while rowToGTF writes 'start' unchanged into the GTF
    # start column - confirm whether a +1 correction is needed.
    # Iterating the rows (instead of DataFrame.apply) also yields an empty
    # list when no interval passed the filter above.
    gtf_file_lines = [rowToGTF(row) for _, row in merged.iterrows()]

    if not gtf_file_lines:
        sys.stderr.write("ERROR! The data could not be merged!\n")
        return False

    with open(output_file_name, 'w') as outfile:
        outfile.write("##gff-version 2\n")
        outfile.writelines(f"{line}\n" for line in gtf_file_lines)
    return True
111
+
112
def countReadsBam(bam_files,gtf_annotation_file,no_cpus=1,output_dir="bam_read_counts",blocks=False):
    """Run pyReadCounters.py in ``--gtffile`` mode on every bam file.

    One external pyReadCounters.py process is started per input file, using a
    pool of ``no_cpus`` worker threads. A file is skipped when its expected
    output ``<output_dir>/<basename>_count_output_<reads|cDNAs>.gtf`` exists.

    Args:
        bam_files: paths to the alignment files to count.
        gtf_annotation_file: GTF annotation file passed via ``--gtf``.
        no_cpus: maximum number of commands run at the same time.
        output_dir: directory for the results; created when missing.
        blocks: when True, ``--blocks --mutations nomuts`` is added to the
            command and the expected output is named "cDNAs" instead of "reads".

    Returns:
        True once all submitted commands have finished.
    """
    ### Creating the directory where the results will be stored:
    os.makedirs(output_dir, exist_ok=True)

    def run_command(file_name,gtf_file,outfile_name):
        cmd = [
            "pyReadCounters.py",
            "-f",
            file_name,
            "--file_type",
            "sam",
            "--gtf",
            gtf_file,
            "-v",
            "--gtffile",
            "-o",
            outfile_name
            ]

        if blocks:
            cmd.extend(["--blocks", "--mutations", "nomuts"])

        # Report a failing run instead of silently carrying on.
        completed = subprocess.run(cmd)
        if completed.returncode != 0:
            sys.stderr.write(f"ERROR! pyReadCounters.py exited with status {completed.returncode} for {file_name}\n")

    data_type = "cDNAs" if blocks else "reads"

    ### Running pyReadCounters over multiple processors.
    with ThreadPoolExecutor(no_cpus) as executor:
        futures = []
        for file_name in bam_files:
            basename = getFileBaseName(file_name)
            # The name of the output file path that should be submitted to pyReadCounters:
            output_file_name = f"{output_dir}/{basename}"
            # The name of the output file path produced by pyReadCounters:
            output_file_path = f"{output_file_name}_count_output_{data_type}.gtf"
            # If the file already exists, don't overwrite it.
            if os.path.exists(output_file_path):
                sys.stderr.write(f"\tOutput file {output_file_path} already exists!\n")
                continue
            futures.append(executor.submit(run_command,file_name,gtf_annotation_file,output_file_name))

        # Wait for all commands to complete; result() re-raises any exception
        # raised inside a worker (e.g. when the executable cannot be started).
        for future in futures:
            future.result()

    return True
169
+
170
def countReadsGTF(bam_files,gtf_annotation_file,no_cpus=1,output_dir="peak_hittables",blocks=False):
    """Run pyReadCounters.py in ``--hittable`` mode on every bam file.

    One external pyReadCounters.py process is started per input file, using a
    pool of ``no_cpus`` worker threads. A file is skipped when its expected
    output ``<output_dir>/<basename>_hittable_<reads|cDNAs>.txt`` exists.

    Args:
        bam_files: paths to the alignment files to count.
        gtf_annotation_file: GTF file passed via ``--gtf``.
        no_cpus: maximum number of commands run at the same time.
        output_dir: directory for the results; created when missing.
        blocks: when True, ``--blocks --mutations nomuts`` is added to the
            command and the expected output is named "cDNAs" instead of "reads".

    Returns:
        None.
    """
    ### Creating the directory where the results will be stored:
    os.makedirs(output_dir, exist_ok=True)

    def run_command(file_name,gtf_file,outfile_name):
        cmd = [
            "pyReadCounters.py",
            "-f",
            file_name,
            "--file_type",
            "sam",
            "--gtf",
            gtf_file,
            "-v",
            "--hittable",
            "-o",
            outfile_name
            ]

        if blocks:
            cmd.extend(["--blocks", "--mutations", "nomuts"])

        # Report a failing run instead of silently carrying on.
        completed = subprocess.run(cmd)
        if completed.returncode != 0:
            sys.stderr.write(f"ERROR! pyReadCounters.py exited with status {completed.returncode} for {file_name}\n")

    data_type = "cDNAs" if blocks else "reads"

    ### Running pyReadCounters over multiple processors.
    with ThreadPoolExecutor(no_cpus) as executor:
        futures = []
        for file_name in bam_files:
            basename = getFileBaseName(file_name)
            # The name of the output file path that should be submitted to pyReadCounters:
            output_file_name = f"{output_dir}/{basename}"
            # The name of the output file path produced by pyReadCounters:
            output_file_path = f"{output_file_name}_hittable_{data_type}.txt"
            # If the file already exists, don't overwrite it.
            if os.path.exists(output_file_path):
                sys.stderr.write(f"\tOutput file {output_file_path} already exists!\n")
                continue
            futures.append(executor.submit(run_command, file_name, gtf_annotation_file, output_file_name))

        # Wait for all commands to complete; result() re-raises any exception
        # raised inside a worker (e.g. when the executable cannot be started).
        for future in futures:
            future.result()
224
+
225
def getPeaks(gtf_files,gtf_annotation_file,chromosome_file,no_cpus=1,min_peak_height=5,min_fdr=0.05,output_dir="peak_gtf_files"):
    """Run pyCalculateFDRs.py on every input GTF file to call peaks.

    One external pyCalculateFDRs.py process is started per input file, using
    a pool of ``no_cpus`` worker threads. A file is skipped when its output
    ``<output_dir>/<basename>_FDR_<min_fdr>_min_peak_height_<min_peak_height>.gtf``
    already exists.

    Args:
        gtf_files: paths to the GTF files to analyse (passed via ``-f``).
        gtf_annotation_file: GTF annotation file passed via ``--gtf``.
        chromosome_file: chromosome info file passed via ``-c``.
        no_cpus: maximum number of commands run at the same time.
        min_peak_height: value passed via ``--min``.
        min_fdr: value passed via ``-m``.
        output_dir: directory for the results; created when missing.

    Returns:
        None.
    """
    ### Creating the directory where the results will be stored:
    os.makedirs(output_dir, exist_ok=True)

    def run_command(file_name,gtf_annotation_file,chromosome_file,min_peak_height,min_fdr,output_file_name):
        cmd = [
            "pyCalculateFDRs.py",
            "-f",
            file_name,
            "--gtf",
            gtf_annotation_file,
            "-c",
            chromosome_file,
            "--min",
            str(min_peak_height),
            "-m",
            str(min_fdr),
            "-v",
            "-o",
            output_file_name
            ]
        # Report a failing run instead of silently carrying on.
        completed = subprocess.run(cmd)
        if completed.returncode != 0:
            sys.stderr.write(f"ERROR! pyCalculateFDRs.py exited with status {completed.returncode} for {file_name}\n")

    ### Running pyCalculateFDRs over multiple processors.
    with ThreadPoolExecutor(no_cpus) as executor:
        futures = []
        for file_name in gtf_files:
            basename = getFileBaseName(file_name)
            output_file_name = f"{output_dir}/{basename}_FDR_{str(min_fdr)}_min_peak_height_{str(min_peak_height)}.gtf"
            # If the file already exists, don't overwrite it.
            if os.path.exists(output_file_name):
                sys.stderr.write(f"\tOutput file {output_file_name} already exists!\n")
                continue
            futures.append(executor.submit(run_command,
                                           file_name,
                                           gtf_annotation_file,
                                           chromosome_file,
                                           min_peak_height,
                                           min_fdr,
                                           output_file_name))

        # Wait for all commands to complete; result() re-raises any exception
        # raised inside a worker (e.g. when the executable cannot be started).
        for future in futures:
            future.result()
273
+
274
def filterPeaks(peak_gtf_files,by=None,output_dir="filtered_peak_files"):
    """Filter the peaks in GTF files by a per-file peak height threshold.

    The peak height is read from the sixth (score) column of every
    non-comment line. For each input file a threshold is derived from its own
    peak heights and only peaks with a height greater than or equal to that
    threshold are written to
    ``<output_dir>/<basename>_filtered_by_<by>_threshold.gtf``. Comment lines
    are copied unchanged. Existing output files are not overwritten.

    Args:
        peak_gtf_files: paths to the peak GTF files.
        by: 'mean', 'median' or 'mean_plus_std' (mean plus one standard
            deviation). ``None`` or the string "None" (the default) keeps all
            peaks. Any other value is reported on stderr and also keeps all
            peaks.
        output_dir: directory for the results; created when missing.

    Returns:
        True.
    """
    ### Creating the directory where the results will be stored:
    os.makedirs(output_dir, exist_ok=True)

    ### The supported ways of calculating a threshold from the peak heights:
    threshold_functions = {
        'mean': np.mean,
        'median': np.median,
        'mean_plus_std': lambda heights: np.mean(heights) + np.std(heights),
    }

    no_filtering = by is None or by == "None"
    if not no_filtering and by not in threshold_functions:
        sys.stderr.write("ERROR! Cannot figure out how you want to filter the peaks! Please use mean, median, mean_plus_std, or None\n")

    for file_name in peak_gtf_files:
        basename = getFileBaseName(file_name)
        output_file_name = f"{output_dir}/{basename}_filtered_by_{by}_threshold.gtf"
        if os.path.exists(output_file_name):
            sys.stderr.write(f"\tOutput file {output_file_name} already exists!\n")
            continue

        with open(file_name,'r') as infile:
            lines = infile.readlines()

        peak_heights = [float(line.strip().split('\t')[5]) for line in lines if not line.startswith("#")]

        ### Calculating the threshold. A file without peaks gets a threshold
        ### of zero (the mean/median of nothing is undefined) and still
        ### produces an output file, so later steps find the file they expect.
        if peak_heights and by in threshold_functions:
            threshold = threshold_functions[by](peak_heights)
        else:
            threshold = 0

        ### Using the threshold to remove peaks:
        with open(output_file_name,"w") as outfile:
            for line in lines:
                if line.startswith("#"):
                    outfile.write(line)
                elif float(line.strip().split('\t')[5]) >= threshold:
                    outfile.write(line)

    return True
336
+
337
def adjustPeakWidths(peak_gtf_files,chromosome_file,min_width=20,no_cpus=1,output_dir = "filtered_peak_files"):
    """Run pyNormalizeIntervalLengths.py on every peak GTF file.

    One external pyNormalizeIntervalLengths.py process is started per input
    file, using a pool of ``no_cpus`` worker threads. A file is skipped when
    its output ``<output_dir>/<basename>_min_width_<min_width>.gtf`` exists.

    Args:
        peak_gtf_files: paths to the peak GTF files (passed via ``-f``).
        chromosome_file: chromosome info file passed via ``-c``.
        min_width: minimum interval width passed via ``--min``.
        no_cpus: maximum number of commands run at the same time.
        output_dir: directory for the results; created when missing.

    Returns:
        True once all submitted commands have finished.
    """
    ### Creating the directory where the results will be stored:
    os.makedirs(output_dir, exist_ok=True)

    def run_command(file_name,chromosome_file,outfile_name=None,min_width=20):
        cmd = [
            "pyNormalizeIntervalLengths.py",
            "-f",
            file_name,
            "-c",
            chromosome_file,
            "--min",
            str(min_width),
            "-o",
            outfile_name
            ]
        # Report a failing run instead of silently carrying on.
        completed = subprocess.run(cmd)
        if completed.returncode != 0:
            sys.stderr.write(f"ERROR! pyNormalizeIntervalLengths.py exited with status {completed.returncode} for {file_name}\n")

    ### Running pyNormalizeIntervalLengths over multiple processors.
    with ThreadPoolExecutor(no_cpus) as executor:
        futures = []
        for file_name in peak_gtf_files:
            basename = getFileBaseName(file_name)
            output_file_name = f"{output_dir}/{basename}_min_width_{str(min_width)}.gtf"
            # If the file already exists, don't overwrite it.
            if os.path.exists(output_file_name):
                sys.stderr.write(f"\tOutput file {output_file_name} already exists!\n")
                continue
            futures.append(executor.submit(run_command,file_name,chromosome_file,output_file_name,min_width))

        # Wait for all commands to complete; result() re-raises any exception
        # raised inside a worker (e.g. when the executable cannot be started).
        for future in futures:
            future.result()

    return True
376
+
377
def numberPeaks(peak_gtf_files,output_dir="filtered_peak_files"):
    """Give each peak a unique number to avoid peaks sharing the same name.

    For every input file the gene_id and gene_name attribute of each
    non-comment line get the suffix ``_peak_<n>``, where ``n`` starts at 1 for
    each file and increases with every numbered line. The result is written
    to ``<output_dir>/<basename>_numbered.gtf``. Comment lines are copied
    unchanged. Lines in which no gene_id or gene_name could be recognised are
    written to stderr and left out of the output. Existing output files are
    not overwritten.

    Args:
        peak_gtf_files: paths to the peak GTF files.
        output_dir: directory for the results; created when missing.

    Returns:
        True when an output file was written for every input file, False when
        at least one output file already existed.
    """
    ### Creating the directory where the results will be stored:
    os.makedirs(output_dir, exist_ok=True)

    ### Raw strings, so the escaped brackets are not invalid string escapes:
    gene_id_pattern = re.compile(r'gene_id "([\(\)a-zA-Z_0-9-,\'|]+?)";')
    gene_name_pattern = re.compile(r'gene_name "([\(\)a-zA-Z_0-9-,\'|]+?)";')

    all_written = True
    # Every file in the list is processed; the result is returned after the loop.
    for gtf_file in peak_gtf_files:
        basename = getFileBaseName(gtf_file)
        output_file_name = f"{output_dir}/{basename}_numbered.gtf"
        if os.path.exists(output_file_name):
            sys.stderr.write(f"\tOutput file {output_file_name} already exists!\n")
            all_written = False
            continue

        # The input is opened first, so a missing input file does not leave
        # an empty output file behind.
        with open(gtf_file,'r') as infile, open(output_file_name,"w") as outfile:
            peak_number = 1
            for line in infile:
                if line.startswith("#"):
                    outfile.write(line)
                    continue

                id_match = gene_id_pattern.search(line)
                name_match = gene_name_pattern.search(line)
                if id_match is None or name_match is None:
                    # Cannot number this line; report it instead.
                    sys.stderr.write(line)
                    continue

                gene_id = id_match.group(1)
                gene_name = name_match.group(1)
                line = line.replace(f'gene_id "{gene_id}";',f'gene_id "{gene_id}_peak_{peak_number}";')
                line = line.replace(f'gene_name "{gene_name}";',f'gene_name "{gene_name}_peak_{peak_number}";')
                peak_number += 1
                outfile.write(line)

    return all_written
411
+
412
def getFileBaseName(file_name):
    """ Strips the directory part and the last extension from a file path. """
    leaf = os.path.basename(file_name)
    stem, _extension = os.path.splitext(leaf)
    return stem
415
+
416
def mergeHittables(hittables,outfile_name="merged_hittables.txt"):
    """Merge pyReadCounters hittables into one tab-separated table.

    Each input file is parsed line by line:
      * a line starting with "##" sets the current feature (its second
        whitespace-separated word);
      * lines starting with "# total number of paired" or
        "# total number of single" add their last tab-separated value to the
        mapped read count of that file;
      * a line starting with a letter or digit is a data line: the first
        column is the gene and the second column the number of hits.

    The output has one column per input file (in the order given). A gene
    that is missing from a file is written as 0 for that file.

    Args:
        hittables: paths to the hittable files.
        outfile_name: path of the merged table.

    Returns:
        True when the merged table was written, False when ``outfile_name``
        already exists.
    """
    if os.path.exists(outfile_name):
        sys.stderr.write(f"\tOutput file {outfile_name} already exists!\n")
        return False

    genes = defaultdict(set)    # feature -> names of all genes seen
    data = dict()               # file -> feature -> gene -> hits
    mappedreadsdata = dict()    # file -> total number of mapped reads
    columns = [0,1]             # the gene and hits columns of a data line
    feature = str()             # not reset between the input files

    for i in hittables:
        data[i] = defaultdict(lambda: defaultdict(int))
        mappedreads = 0
        with open(i,"r") as infile:
            for line in infile:
                if line.startswith("##"):
                    feature = line.strip().split()[1]
                elif line.startswith("# total number of paired"):
                    mappedreads += int(line.strip().split("\t")[-1])
                elif line.startswith("# total number of single"):
                    mappedreads += int(line.strip().split("\t")[-1])
                elif re.search("[A-Za-z0-9]",line[0]):
                    Fld = line.strip().split("\t")
                    gene,hits = Fld[columns[0]],Fld[columns[1]]
                    data[i][feature][gene] = float(hits)
                    genes[feature].add(gene)
        mappedreadsdata[i] = mappedreads

    # The context manager closes (and flushes) the table before returning, so
    # it is complete when a later step reads it.
    with open(outfile_name,"w") as outfile:
        outfile.write("# gene\t%s\n" % ("\t".join([getFileBaseName(i) for i in hittables])))
        outfile.write("# total mapped reads:\t%s\n" % ("\t".join([str(mappedreadsdata[i]) for i in hittables])))
        for feature in sorted(genes):
            # Total number of hits per file for this feature:
            sumoffeaturehits = "\t".join([str(sum([data[i][feature][j] for j in genes[feature]])) for i in hittables])
            outfile.write("\n## %s\t%s\n" % (feature,sumoffeaturehits))
            for gene in sorted(genes[feature]):
                hitstring = "\t".join([str(data[i][feature][gene]) for i in hittables])
                outfile.write("%s\t%s\n" % (gene,hitstring))
    return True
462
+
463
def runDESeq(merged_hittable,conditions,no_cpus=1,output_dir="DESeq2_results"):
    """Run the pydeseq2 analysis on a merged hittable and store the results.

    The merged hittable is read as a headerless, tab-separated table (lines
    starting with "#" are ignored). Its first column holds the gene names and
    the remaining columns are labelled with ``conditions``, in the same order.
    The results table is written, tab-separated, to
    ``<output_dir>/<basename>_DESeq2_results.txt``.

    Args:
        merged_hittable: path to the merged hittable.
        conditions: one condition label per data column of the table.
        no_cpus: number of CPUs handed to the pydeseq2 inference object.
        output_dir: directory for the results; created when missing.

    Returns:
        True when the results were written, False when the output file
        already exists.
    """
    ### Creating the directory where the results will be stored:
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    ### Defining the results file:
    outfile_name = f"{output_dir}/{getFileBaseName(merged_hittable)}_DESeq2_results.txt"

    ### Never overwrite existing results:
    if os.path.exists(outfile_name):
        sys.stderr.write(f"\tOutput file {outfile_name} already exists!\n")
        return False

    ### Opening the merged hittable and labelling its columns:
    counts = pd.read_csv(merged_hittable,comment="#",sep="\t",header=None,index_col=None)
    counts.columns = ["gene", *conditions]
    counts = counts.set_index('gene')

    ### Creating the sample sheet. Its index is identical to the column
    ### labels of the count table:
    sample_sheet = pd.DataFrame(counts.columns, columns=['gene'])
    sample_sheet['Conditions'] = conditions
    sample_sheet = sample_sheet.set_index('gene')

    ### Starting DESeq2 (samples as rows, hence the transposed counts):
    inference = DefaultInference(n_cpus=no_cpus)
    dds = DeseqDataSet(
        counts=counts.T,
        metadata=sample_sheet,
        design_factors="Conditions",
        refit_cooks=True,
        inference=inference,
        )
    dds.deseq2()

    ### Getting and storing the final results:
    stat_res = DeseqStats(dds, inference=inference)
    stat_res.summary()
    stat_res.results_df.to_csv(outfile_name,sep="\t")
    return True
515
+
516
def addFoldChangeToGTF(deseq_results,merged_peaks,output_dir="./"):
    """Add the DESeq2 log2-fold changes and adjusted p-values to a peak GTF file.

    Every non-comment line of ``merged_peaks`` gets the attributes
    ``log2foldchange "<value>"; padj "<value>";`` appended. The values are
    looked up by the gene_name of the line in the DESeq2 results table. When
    a gene name occurs more than once in the table, its first row is used.
    Lines whose gene name is not in the table (or cannot be recognised) get
    the value "unknown". The result is written to
    ``<output_dir>/<basename of merged_peaks>_with_padj.gtf``.

    Args:
        deseq_results: path to the tab-separated DESeq2 results table with a
            header line and the columns "log2FoldChange" and "padj".
        merged_peaks: path to the peak GTF file.
        output_dir: directory in which the output file is written.

    Returns:
        True when the output file was written, False when it already exists.
    """
    ### Setting the output file name:
    peak_outfile_name = f"{output_dir}/{getFileBaseName(merged_peaks)}_with_padj.gtf"

    ### If the output file already exist, then don't overwrite:
    if os.path.exists(peak_outfile_name):
        sys.stderr.write(f"\tOutput file {peak_outfile_name} already exist!\n")
        return False

    ### Loading the DESeq2 results BEFORE the output file is created, so that
    ### a failure here does not leave an empty output file behind (which
    ### would make the next run skip this step).
    deseq_data = pd.read_csv(deseq_results,comment="#",sep="\t",index_col=None,header=0)

    ### The gene names are expected in the "gene" column; fall back to the
    ### first column when the table does not have a column with that name.
    gene_column = "gene" if "gene" in deseq_data.columns else deseq_data.columns[0]

    ### One lookup table instead of scanning the whole table for each peak.
    ### setdefault() keeps the first row of a gene that occurs more than once.
    statistics = dict()
    for gene, log2fold_change, p_value in zip(deseq_data[gene_column].to_numpy(),
                                              deseq_data["log2FoldChange"].to_numpy(),
                                              deseq_data["padj"].to_numpy()):
        statistics.setdefault(gene, (log2fold_change, p_value))

    gene_name_pattern = re.compile(r'gene_name "([\(\)a-zA-Z_0-9-,\'|]+?)";')

    ### Opening the peak data and writing the annotated lines:
    with open(merged_peaks,"r") as peak_file, open(peak_outfile_name,"w") as outfile:
        for line in peak_file:
            if not line.startswith("#"):
                match = gene_name_pattern.search(line)
                values = statistics.get(match.group(1)) if match else None
                if values is not None:
                    line = f"{line.strip()} log2foldchange \"{values[0]}\"; padj \"{values[1]}\";\n"
                else:
                    line = f"{line.strip()} log2foldchange \"unknown\"; padj \"unknown\";\n"
            # Comment lines are copied unchanged.
            outfile.write(line)
    return True
546
+
547
def getSignificantPeaks(deseq_results,merged_peaks,fdr_threshold=0.05,output_dir="DESeq2_results"):
    """Keep only the peaks whose adjusted p-value passes the FDR threshold.

    Two files are written to ``output_dir``:
      * ``<basename of deseq_results>_FDR_<threshold>.txt``: the rows of the
        DESeq2 results table with padj <= ``fdr_threshold``;
      * ``<basename of merged_peaks>_FDR_<threshold>.gtf``: the peaks whose
        gene_name is listed in the first column of those rows.

    Nothing is done when the filtered peak file already exists.

    Returns:
        True when the files were written, False otherwise.
    """
    ### Setting the output file names:
    threshold_label = str(fdr_threshold)
    deseq_outfile_name = f"{output_dir}/{getFileBaseName(deseq_results)}_FDR_{threshold_label}.txt"
    peak_outfile_name = f"{output_dir}/{getFileBaseName(merged_peaks)}_FDR_{threshold_label}.gtf"

    ### If the peak output file already exists, then don't overwrite:
    if os.path.exists(peak_outfile_name):
        sys.stderr.write(f"\tOutput files {peak_outfile_name} and {deseq_outfile_name} already exist!\n")
        return False

    ### Filtering and storing the DESeq2 results:
    results_table = pd.read_csv(deseq_results,comment="#",sep="\t",index_col=None,header=0)
    significant = results_table.loc[results_table['padj'] <= fdr_threshold]
    significant.to_csv(deseq_outfile_name,sep="\t",index=None)

    ### The DE genes are listed in the first column of the table:
    de_genes = list(significant[significant.columns[0]])

    ### Opening the peak data:
    gtf_columns = ['chrom','source','feature','start','end','score','strand','frame','annotations']
    peaks = pd.read_csv(merged_peaks,index_col=None,header=None,comment="#",sep="\t")
    peaks.columns = gtf_columns

    ### Getting the gene_name of each peak and keeping the peaks of DE genes:
    peak_gene_names = peaks['annotations'].str.extract(r'gene_name \"(.*?)\"', expand=False)
    significant_peaks = peaks[peak_gene_names.isin(de_genes)]
    significant_peaks.to_csv(peak_outfile_name,sep="\t",header=False,index=False,quoting=csv.QUOTE_NONE)

    return True
582
+
583
+
584
def main():
    """Command-line entry point: runs the complete DBPeaks pipeline.

    The steps are, in order: counting reads in all bam files, calling peaks,
    filtering the peaks by height, adjusting the peak widths, merging the
    peaks of all files, numbering the merged peaks, counting the reads
    covering each merged peak, merging those counts, running DESeq2, adding
    the fold changes to the peak GTF file and extracting the significant
    peaks. All results are stored in a directory named after the job name.
    """
    parser = argparse.ArgumentParser(usage="usage: %(prog)s [options]", description="A tool for identifying differential RNA-binding sites in CLIP/CRAC datasets")

    files = parser.add_argument_group("File input options")
    files.add_argument("--samples", dest="samples", nargs="*", metavar="clip_samples.bam",
                       help="Paths to the bam files containing the replicate clip samples you want to compare", default=None)
    files.add_argument("--controls", dest="controls", nargs="*", metavar="control_clip_samples.bam",
                       help="Paths to bam files containing replicate control clip samples.", default=None)
    files.add_argument("-c", "--chromfile", dest="chromfile", type=str,
                       help="Location of the chromosome info file. This file should have two columns: \
                       first column is the names of the chromosomes, second column is length of the chromosomes.", default=None)
    files.add_argument("--gtf", dest="gtf_annotation", type=str, metavar="yeast.gtf",
                       help="Path to GTF anotation file for your organism containing gene location information.", default=None)
    # The help text promises a default job name, so actually provide it
    # (without it the results directory was called "None").
    files.add_argument("-j","--jobname",dest="jobname",type=str,metavar="WT_vs_mutant",
                       help="provide a name for the job. Default = WT_vs_mutant",default="WT_vs_mutant")
    #files.add_argument("--log", dest="log", help="To print all the command lines used during the run to the 'command_lines.txt' file",
    #                   action="store_true",default=False)

    peaks = parser.add_argument_group("Peak calling settings")
    peaks.add_argument("-m", "--minfdr", dest="minfdr", type=float, metavar="0.05",
                       help="To set a minimal FDR threshold for filtering interval data. Default is 0.05", default=0.05)
    peaks.add_argument("--padj", dest="padj", metavar="0.05", type=float,
                       help="DESeq2 threshold for calling a DE peak. Default is 0.05", default=0.05)
    peaks.add_argument("--min", dest="min", metavar="5",
                       help="to set a minimal read coverage for a region. Regions with coverage less than the minimum will be ignored. Default is 1", type=int, default=1)
    peaks.add_argument("--blocks", dest="blocks", help="Add this flag if you want to consider reads with identical mapping coordinates once, regardless of sequence",
                       action="store_true",default=False)
    # NOTE(review): --iterations is parsed but not used by any step below.
    peaks.add_argument("--iterations", dest="iterations", metavar="100", type=int,
                       help="to set the number of iterations for randomization of read coordinates. Default=100", default=100)
    peaks.add_argument("-r","--rep",dest="reproducibility",metavar="90", type=float,
                       help="To set in what percentage of the replicates the peak should be detected. Default=100", default=100.0)
    peaks.add_argument("--filter",dest="filter_peak_height", metavar="mean", type=str,
                       help="To filter the peaks in gtf files by a specific threshold. \
                       Options are mean, median or mean plus one standard devation (mean_plus_std) peak heights. Default is no filtering.",default="None",choices=["mean","median","mean_plus_std","None"])
    peaks.add_argument("--min_peak_width",dest="min_peak_width",type=int,metavar="20"
                       ,help="To set the minimum width of a called peak. Default = 20",default=20)

    log = parser.add_argument_group("Logging options")
    log.add_argument("-v", "--verbose", action="store_true", help="to print status messages to a log file", default=False)

    cpu = parser.add_argument_group("Number of CPUs needed for the analyses")
    cpu.add_argument("--cpu", dest="cpu", type=int, metavar="12", help="The number of processors you want to use for the analyses. Default = 1", default=1)

    args = parser.parse_args()

    ### The pipeline cannot run without these inputs. Stop with a usage
    ### message before any directory is created.
    required_inputs = (
        ("--samples", args.samples),
        ("--controls", args.controls),
        ("--gtf", args.gtf_annotation),
        ("-c/--chromfile", args.chromfile),
    )
    missing = [flag for flag, value in required_inputs if not value]
    if missing:
        parser.error(f"the following arguments are required: {', '.join(missing)}")

    ### Setting key parameters. The pyReadCounters output file names contain
    ### "cDNAs" instead of "reads" when --blocks is used:
    data_type = "cDNAs" if args.blocks else "reads"

    ### Running the code:

    # Making the directory where the results files are stored:
    storage_dir = f"{args.jobname}"
    if not os.path.exists(storage_dir):
        os.makedirs(storage_dir)

    # Getting read counts for all the bam files:
    if args.verbose:
        sys.stdout.write("### Getting gene counts from sample bam files....\n")

    all_bam_files = list()
    all_bam_files.extend(args.samples)
    all_bam_files.extend(args.controls)

    # --blocks has to be passed on, otherwise the files produced here do not
    # have the names that the next steps look for.
    countReadsBam(all_bam_files,
                  args.gtf_annotation,
                  no_cpus=args.cpu,
                  output_dir=f"{storage_dir}/bam_read_counts",
                  blocks=args.blocks
                  )

    # Getting all the peak_files:
    if args.verbose:
        sys.stdout.write("### Finding peaks in the sample files....\n")

    file_basenames = [getFileBaseName(i) for i in all_bam_files]
    all_read_counters_files = [f"{storage_dir}/bam_read_counts/{i}_count_output_{data_type}.gtf" for i in file_basenames]

    getPeaks(all_read_counters_files,
             args.gtf_annotation,
             args.chromfile,
             no_cpus=args.cpu,
             min_peak_height=args.min,
             min_fdr=args.minfdr,
             output_dir=f"{storage_dir}/peak_gtf_files"
             )

    # Filtering the peaks by peak height:
    if args.verbose:
        sys.stdout.write(f"### Filtering the peaks by {args.filter_peak_height} values of peak heights....\n")

    read_counters_file_basenames = [getFileBaseName(i) for i in all_read_counters_files]
    peak_gtf_files = [f"{storage_dir}/peak_gtf_files/{i}_FDR_{str(args.minfdr)}_min_peak_height_{str(args.min)}.gtf"
                      for i in read_counters_file_basenames]

    filterPeaks(peak_gtf_files,
                by=args.filter_peak_height,
                output_dir=f"{storage_dir}/filtered_peak_files"
                )

    # Setting minimum peak widths:
    if args.verbose:
        sys.stdout.write(f"### Adjusting peak widths to a minimum of {args.min_peak_width}....\n")

    file_basenames = [getFileBaseName(i) for i in peak_gtf_files]
    filtered_peak_files = [f"{storage_dir}/filtered_peak_files/{i}_filtered_by_{args.filter_peak_height}_threshold.gtf"
                           for i in file_basenames]

    adjustPeakWidths(filtered_peak_files,
                     args.chromfile,
                     min_width=args.min_peak_width,
                     no_cpus=args.cpu,
                     output_dir=f"{storage_dir}/filtered_peak_files"
                     )

    # Merging the peak intervals for the sample files:
    if args.verbose:
        sys.stdout.write("### Merging peak intervals...\n")

    file_basenames = [getFileBaseName(i) for i in filtered_peak_files]
    filtered_peak_files = [f"{storage_dir}/filtered_peak_files/{i}_min_width_{args.min_peak_width}.gtf" for i in file_basenames]
    # The command line takes a percentage, mergeGTFintervals a fraction:
    reproducibility = args.reproducibility/100.0
    merged_peak_output_file_name = f"{storage_dir}/filtered_peak_files/{args.jobname}_merged_peaks.gtf"

    mergeGTFintervals(filtered_peak_files,
                      reproducibility,
                      merged_peak_output_file_name,
                      )

    # Numbering the peaks:
    if args.verbose:
        sys.stdout.write("### Numbering the peaks....\n")

    numberPeaks([merged_peak_output_file_name],
                output_dir=f"{storage_dir}/filtered_peak_files"
                )

    # Performing pyReadCounters analysis on the bam files using the new GTF file containing the numbered peaks:
    if args.verbose:
        sys.stdout.write("### Counting peak coverage for each sample and control file....\n")

    annotation_file = f"{os.path.splitext(merged_peak_output_file_name)[0]}_numbered.gtf"

    # Making sure the input file actually exists! None of the remaining steps
    # can work without it, so stop here with a clear message.
    if not os.path.exists(annotation_file):
        sys.stderr.write(f"ERROR! The file {os.path.basename(annotation_file)} does not exist!\n")
        sys.exit(1)

    countReadsGTF(all_bam_files,
                  gtf_annotation_file=annotation_file,
                  no_cpus=args.cpu,
                  output_dir=f"{storage_dir}/peak_hittables",
                  blocks=args.blocks
                  )

    # Merging the hittables:
    if args.verbose:
        sys.stdout.write("Merging the peak read coverage hit tables.\n")

    sample_file_base_names = [getFileBaseName(i) for i in args.samples]
    control_file_base_names = [getFileBaseName(i) for i in args.controls]

    sample_file_hittables = [f"{storage_dir}/peak_hittables/{i}_hittable_{data_type}.txt" for i in sample_file_base_names]
    control_file_hittables = [f"{storage_dir}/peak_hittables/{i}_hittable_{data_type}.txt" for i in control_file_base_names]

    # Samples first, then controls: the same order as the conditions below.
    tables_to_merge = list()
    tables_to_merge.extend(sample_file_hittables)
    tables_to_merge.extend(control_file_hittables)
    hittable_name = f"{storage_dir}/peak_hittables/{args.jobname}_merged_hittables.txt"

    if not os.path.exists(hittable_name):
        mergeHittables(tables_to_merge,hittable_name)
    else:
        sys.stderr.write(f"\tOutput file {os.path.basename(hittable_name)} already exists!\n")

    # Running the DESeq2 analyses:
    # Setting the testing conditions:
    if args.verbose:
        sys.stdout.write("### Running the DESeq2 analyses....\n")

    conditions = list()
    conditions.extend(len(args.samples)*["WT"])
    conditions.extend(len(args.controls)*["control"])

    # The samples are labelled "WT" and the controls "control".
    # According to the original author's notes, "WT" is treated as the reference level, which means:
    # POSITIVE log2FC: Indicates higher expression in "control" relative to "WT".
    # NEGATIVE log2FC: Indicates higher expression in "WT" relative to "control".
    # NOTE(review): how pydeseq2 picks the reference level (order of the
    # conditions or alphabetical order of the labels) is not visible here - confirm.

    runDESeq(hittable_name,
             conditions,
             no_cpus=args.cpu,
             output_dir=f"{storage_dir}/DESeq2_results"
             )

    # Adding log2-fold changes to the peak GTF file:
    deseq_table_name = f"{storage_dir}/DESeq2_results/{getFileBaseName(hittable_name)}_DESeq2_results.txt"

    addFoldChangeToGTF(deseq_table_name,
                       annotation_file,
                       output_dir=f"{storage_dir}/DESeq2_results"
                       )

    # Extracting the significant peaks
    annotation_file_with_padj = f"{storage_dir}/DESeq2_results/{getFileBaseName(annotation_file)}_with_padj.gtf"

    if args.verbose:
        sys.stdout.write("#### Extracting significant peaks....\n")

    getSignificantPeaks(deseq_table_name,
                        annotation_file_with_padj,
                        fdr_threshold=args.padj,
                        output_dir=f"{storage_dir}/DESeq2_results"
                        )


if __name__ == "__main__":
    main()
@@ -0,0 +1,175 @@
1
+ Metadata-Version: 2.4
2
+ Name: DBPeaks
3
+ Version: 0.0.3
4
+ Summary: A tool for identifying differentially bound peaks in CLIP/CRAC data
5
+ Author-email: Sander Granneman <Sander.Granneman@ed.ac.uk>
6
+ License-Expression: MIT
7
+ Classifier: Development Status :: 3 - Alpha
8
+ Classifier: Intended Audience :: Science/Research
9
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.8
12
+ Classifier: Programming Language :: Python :: 3.9
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Requires-Python: >=3.8
16
+ Description-Content-Type: text/markdown
17
+ License-File: LICENSE
18
+ Requires-Dist: pybedtools
19
+ Requires-Dist: numpy
20
+ Requires-Dist: pandas
21
+ Requires-Dist: pydeseq2==0.4.9
22
+ Requires-Dist: pyCRAC==1.5.2
23
+ Dynamic: license-file
24
+
25
+ # DBPeaks: Differential RNA-Binding Site Analysis Tool
26
+
27
+ ## Contents
28
+
29
+ - [Introduction](#introduction)
30
+
31
+ - [Repo Contents](#repo-contents)
32
+
33
+ - [Features](#features)
34
+
35
+ - [System Requirements](#system-requirements)
36
+
37
+ - [Installation Guide](#installation-guide)
38
+
39
+ - [License](./LICENSE)
40
+
41
+ - [Citation](#citation)
42
+
43
+ - [Contact](#contact)
44
+
45
+ ## Introduction
46
+
47
+ DBPeaks is a Python-based command-line tool designed for the identification and analysis of differential RNA-binding sites in CLIP/CRAC datasets. It integrates various bioinformatics tools and methods to process sequencing data, identify peaks, and perform statistical analyses to detect significant differences in RNA-binding across conditions. It does so by first analysing the peaks in each individual file and then checking whether peaks are found in the same regions. These peaks need to have overlapping genome mapping coordinates. All overlapping peaks will then be merged into a single peak interval and for each interval the program will then calculate the total number of reads covering that genomic interval. DESeq2 will then be used to determine if the read counts for that interval are statistically significantly different between sample and control files.
48
+
49
+ It requires CLIP/CRAC data BAM files as input as well as GTF and genome files for the model organism.
50
+ Make sure your GTF annotation file does not have any silly formatting mistakes, otherwise the program will not run.
51
+ Example genome files for yeast are available in this repository.
52
+
53
+ NOTE! DBPeaks was SPECIFICALLY designed to analyse CLIP/CRAC datasets from two different conditions or by comparing data from WT vs mutant RBPs. It was NOT designed to compare RBP CLIP datasets to control datasets that have substantially lower read counts (i.e. data from untagged strains or no UV cross-linking controls). Should you be stubborn and still decide to use DBPeaks for this purpose, you will get rubbish results!
54
+
55
+ It is really important that all bam files have a good number of reads and that there is not a huge difference in read depth between the files. This will make DESeq2 much happier and will therefore improve the results.
56
+
57
+ We have tried many different tools that do similar things. However, we were either not able to get them running on our servers or they were not able to detect clearly differentially bound (DB) peaks in our data. We have not benchmarked DBPeaks against existing tools so we do not yet know how well it performs compared to the most popular peak calling methods. All I can say is that on OUR data where we removed an RBP binding site in the genome it performs better than existing tools such as MACS3 and Peakachu. DBPeaks was able to detect loss of binding in that single genomic location. The other tools we tested could not. DBPeaks is, however, slower than most existing tools. This is because it relies on pyCalculateFDRs from the pyCRAC package to call peaks. This script looks for peaks in each individual gene annotated in the genome and takes read coverage of the gene into consideration for this. So this part is rather slow if you have many features annotated in your genome file.
58
+
59
+ DBPeaks uses multiple CPUs to process the data and has the added advantage that it can also use replicates.
60
+
61
+ ## Repo Contents
62
+ - [DBPeaks](./DBPeaks.py)
63
+ - [License](./LICENSE)
64
+
65
+ ## Features
66
+
67
+ Comprehensive Analysis Pipeline: From reading BAM files to statistical analysis with DESeq2.
68
+ Parallel Processing: Utilizes multiple CPUs to speed up the analysis.
69
+ Flexible Input Options: Supports various configurations and customizations through command-line options.
70
+ Integrated Peak Calling and Filtering: Includes functionality for peak detection, filtering based on reproducibility, and adjustment of peak widths.
71
+ Statistical Analysis: Incorporates DESeq2 for rigorous differential analysis.
72
+
73
+ ## Installation Guide
74
+
75
+ ### Prerequisites
76
+
77
+ Python 3.8 or higher
78
+ Dependencies: pybedtools, numpy, pandas, pydeseq2, pyCRAC, and others as listed in requirements.txt.
79
+ Steps
80
+
81
+ Clone the repository:
82
+
83
+ ```
84
+ git clone https://git.ecdf.ed.ac.uk/sgrannem/dbpeaks.git
85
+ cd dbpeaks
86
+ ```
87
+
88
+ ### Install required Python packages:
89
+
90
+ ```
91
+ pip install -r requirements.txt
92
+ ```
93
+
94
+ ### Install DBPeaks:
95
+
96
+ ```
97
+ cd dbpeaks
98
+ pip install -e . --user
99
+ ```
100
+
101
+ ### Running DBPeaks
102
+
103
+ DBPeaks is run from the command line. Here is a basic example to get you started:
104
+
105
+ python DBPeaks.py --samples path/to/sample1.bam path/to/sample2.bam --controls path/to/control1.bam path/to/control2.bam --gtf path/to/annotation.gtf --chromfile path/to/chrominfo.txt --jobname ExampleAnalysis
106
+
107
+ ### Command-Line Options
108
+
109
+ --samples: Specify paths to the BAM files for the sample group.
110
+ --controls: Specify paths to the BAM files for the control group.
111
+ --gtf: Path to the GTF annotation file.
112
+ --chromfile: Location of the chromosome info file. This file should have two columns: first column is the names of the chromosomes, second column is length of the chromosomes.
113
+ --jobname: A name for the job to organize output files.
114
+
115
+ ### Additional options for peak calling, filtering, and statistical thresholds can be viewed using the help option:
116
+
117
+ ```
118
+ DBPeaks.py --help
119
+ ```
120
+
121
+ ### Peak calling settings:
122
+ -m 0.05, --minfdr 0.05 To set a minimal FDR threshold for filtering interval data. Default is 0.05
123
+
124
+ This is a setting that is used when running pyCalculateFDRs. If you end up getting a lot of peaks in your data,
125
+ it is recommended to change this threshold, let's say to 0.01 as this will reduce the number of significantly enriched peaks
126
+ in your data.
127
+
128
+ --padj 0.05 DESeq2 threshold for calling a peak DB. Default is 0.05
129
+
130
+ If you hardly get any DB peaks, then it may be worth slightly adjusting this threshold.
131
+ However, in this scenario, it may also be the case that your samples just have too much variability.
132
+ It may then be wise to do a PCA analysis on your data to see if replicates are indeed grouped together.
133
+
134
+ --min 5 To set the minimal read coverage for a region. Regions with coverage below this minimum will be ignored.
135
+
136
+ --blocks Add this flag if you want to consider reads with identical mapping coordinates once, regardless of sequence.
137
+
138
+ NOTE! This is a HUGELY important flag! Setting --blocks will remove any 'towers' in your data and collapse them into
139
+ one single interval. This can completely change the shape and height of the peak and the peak may no longer be detected.
140
+ However, if you suspect that your library is of low complexity and you see many of these blocks or towers in your genome browser, then I would recommend adding this flag as I have seen that this can improve the reliability of the final DESeq2 analyses.
141
+
142
+ --iterations 100 to set the number of iterations for randomization of read coordinates. Default=100
143
+
144
+ This is important for the peak calling analysis by the pyCalculateFDRs.py script.
145
+
146
+
147
+ -r 90, --rep 90 To set in what percentage of the replicates the peak should be detected. Default=100
148
+
149
+ Let's say you have three sample and three control bam files and you set -r to 50, then peaks that are, for example, present in the sample files but absent in the control files will also be considered. If you, in this scenario, set -r to 100, then the tool will expect to find overlapping peaks at any given position for ALL samples! So you may miss peaks that were, for example, only present in your sample but not in the control!
150
+
151
+ --filter mean To filter the peaks in gtf files by a specific threshold. Options are mean, median or mean plus one standard deviation
152
+ (mean_plus_std) peak heights. Default is no filtering.
153
+
154
+ I would always recommend starting with no filtering. If you get too many DB peaks, then I would start with --filter mean and then --filter median.
155
+
156
+ --min_peak_width 20 To set the minimum width of a called peak. Default = 20
157
+
158
+
159
+ ## Contributing to further improving DBPeaks
160
+
161
+ Contributions to DBPeaks are welcome! Please fork the repository and submit pull requests with your enhancements.
162
+ We will also be including some test data on the repository soon!
163
+
164
+ ## License
165
+
166
+ This project is licensed under the Apache License - see the LICENSE file for details.
167
+
168
+
169
+ ## Citation
170
+
171
+ DBPeaks was developed to analyse CRAC data for a manuscript that we are about to submit. This will be updated once the paper has been accepted or put on a preprint server.
172
+
173
+ ## Contact
174
+
175
+ For support or to report issues, please contact Sander Granneman at Sander.Granneman@ed.ac.uk, University of Edinburgh.
@@ -0,0 +1,7 @@
1
+ DBPeaks.py,sha256=H7ZSGxxK4hvEbpN52jXz0aojXENwF1AYY3wlQfj57UM,34794
2
+ dbpeaks-0.0.3.dist-info/licenses/LICENSE,sha256=DU9_yiwF7Kz7teRaufuoFy-WJ8abkKbkzkn8e2wVIvw,11338
3
+ dbpeaks-0.0.3.dist-info/METADATA,sha256=Udp6wfLHzKEbCBZsWuyfpD83UWl677y1sWlWU6d9H9c,9356
4
+ dbpeaks-0.0.3.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
5
+ dbpeaks-0.0.3.dist-info/entry_points.txt,sha256=Aa_zwEkr9vFVgLr7YffiVUGsi3asuy3pyfzd9KIwjY4,41
6
+ dbpeaks-0.0.3.dist-info/top_level.txt,sha256=x9WZWRV0ZCfB8TEjokW0A5nJGNf9XoGC_T3u1c7TC3E,8
7
+ dbpeaks-0.0.3.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ dbpeaks = DBPeaks:main
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "{}"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2021 sgrannem
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
@@ -0,0 +1 @@
1
+ DBPeaks