bacterial-annotator 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/ba_blat +60 -0
- data/bin/ba_prodigal +45 -0
- data/bin/bacterial-annotator +131 -0
- data/lib/bacterial-annotator.rb +406 -0
- data/lib/bacterial-annotator/fasta-manip.rb +166 -0
- data/lib/bacterial-annotator/genbank-manip.rb +208 -0
- data/lib/bacterial-annotator/remote-ncbi.rb +200 -0
- data/lib/bacterial-annotator/synteny-manip.rb +188 -0
- metadata +96 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA1:
|
|
3
|
+
metadata.gz: feff352efae5fee416e2ed704a1ba20e36c391f3
|
|
4
|
+
data.tar.gz: 852b8ce7a8c6fa18bae196434ef47a8687584e32
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 794d2ee5ae969f986e0e5a3dbdf65425c6a8b9b39d72df383b8bcb09f1f51ab790355e186c6bf1081bf84f0bd3ee39cdde25e450de5939e14b70114a7a38ac69
|
|
7
|
+
data.tar.gz: 94c90664667c6d913d4396ec1e0bda356bc3fff86d3e1d4f91b91ba6fb55db16d7310e43f3e44bfc9ba8efd76b0b2e87d2f6b28d1fd08b2f222b798771cf2c3a
|
data/bin/ba_blat
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
# author: maxime déraspe
|
|
4
|
+
# email: maxime@deraspe.net
|
|
5
|
+
# review:
|
|
6
|
+
# date: 15-02-24
|
|
7
|
+
# version: 0.01
|
|
8
|
+
# licence:
|
|
9
|
+
|
|
10
|
+
require 'net/http'
|
|
11
|
+
|
|
12
|
+
ROOT_path = File.dirname(__FILE__)
|
|
13
|
+
# blat URL = "http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/blat/blat"
|
|
14
|
+
|
|
15
|
+
# Install blat on the user system
|
|
16
|
+
# Download the BLAT executable from the UCSC download server and install it
# next to this script as "blat.linux" with mode 0755.
#
# Aborts the process with an explanatory message when the download or the
# installation fails.
def installBlat

  target = "#{ROOT_path}/blat.linux"

  begin
    response = Net::HTTP.start("hgdownload.cse.ucsc.edu") do |http|
      http.get("/admin/exe/linux.x86_64/blat/blat")
    end

    # Only a 2xx answer carries the binary; a redirect or an error page must
    # never be saved as the executable.
    unless response.is_a?(Net::HTTPSuccess)
      raise "unexpected HTTP response #{response.code} #{response.message}"
    end

    # The file is written only once the whole body has been received, so a
    # failed download does not leave a bogus blat.linux behind.
    File.open(target, "wb") { |file| file.write(response.body) }
    File.chmod(0755, target)
  rescue StandardError => e
    abort "Problem installing Blat, aborting (#{e.message})"
  end

end
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# Install blat if not already installed
|
|
35
|
+
# Offer to install BLAT when the binary is not yet present next to this
# script. The user has to accept the licence first (an empty answer counts
# as "yes").
unless File.exist? "#{ROOT_path}/blat.linux"

  puts "Installing Blat the aligner.."
  puts ""
  puts "Please note that the Blat source and executables are freely available for"
  puts "academic, nonprofit and personal use. Commercial licensing information is"
  puts "available on the Kent Informatics website (http://www.kentinformatics.com/)."
  puts "See http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/blat/"
  puts ""
  puts "Do you accept the license ? Y/n"

  # gets returns nil on end-of-input (e.g. a non-interactive run); that is
  # treated as a refusal instead of crashing on nil.chomp.
  reply = $stdin.gets
  answer = reply.nil? ? "n" : reply.strip.downcase

  if answer == "y" || answer == ""
    puts "OK you accepted the licence let's install the blat binary"
    installBlat
    puts "Blat successfully installed : OK"
    puts ""
  else
    puts "Sorry bacterial-annotator rely on blat for the alignment !"
  end

end
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
# system("#{ROOT_path}/blat.linux")
|
data/bin/ba_prodigal
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
# author: maxime déraspe
|
|
4
|
+
# email: maxime@deraspe.net
|
|
5
|
+
# review:
|
|
6
|
+
# date: 15-02-24
|
|
7
|
+
# version: 0.01
|
|
8
|
+
# licence:
|
|
9
|
+
|
|
10
|
+
require 'open-uri'
|
|
11
|
+
|
|
12
|
+
ROOT_path = File.dirname(__FILE__)
|
|
13
|
+
# prodigal URL = "https://github.com/hyattpd/Prodigal/releases/download/v2.6.2/prodigal.linux"
|
|
14
|
+
|
|
15
|
+
# Install prodigal on the user system
|
|
16
|
+
# Download the Prodigal v2.6.2 Linux executable from GitHub and install it
# next to this script as "prodigal.linux" with mode 0755.
#
# Aborts the process with an explanatory message when the download or the
# installation fails.
def installProdigal

  url = "https://github.com/hyattpd/Prodigal/releases/download/v2.6.2/prodigal.linux"
  target = "#{ROOT_path}/prodigal.linux"

  begin
    # Kernel#open stopped opening URLs in Ruby 3.0; URI.open is the open-uri
    # entry point there. Older rubies without a public URI.open fall back to
    # the historical call.
    remote = URI.respond_to?(:open) ? URI.open(url) : open(url)
    begin
      payload = remote.read
    ensure
      remote.close
    end

    # The target is created only after the download succeeded, so a failure
    # does not leave an empty prodigal.linux behind.
    File.open(target, "wb") { |file| file.write(payload) }
    File.chmod(0755, target)
  rescue StandardError => e
    abort "Problem installing Prodigal, aborting (#{e.message})"
  end

end
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# Install prodigal if not already installed
|
|
32
|
+
# Install Prodigal when the binary is not yet present next to this script.
# File.exist? is used because File.exists? was removed in Ruby 3.2.
unless File.exist? "#{ROOT_path}/prodigal.linux"

  puts "Installing Prodigal the ORF finder.."
  puts "See https://github.com/hyattpd/Prodigal"
  puts "The Licence is GPLv3"
  installProdigal
  puts "Prodigal successfully installed"
  puts ""

end
|
|
42
|
+
|
|
43
|
+
# Main
|
|
44
|
+
# Dir.mkdir("Prodigal-Output") if ! File.exists? "Prodigal-Output"
|
|
45
|
+
# system("#{ROOT_path}/prodigal.linux -a Prodigal-Output/Proteins.fa -d Prodigal-Output/Genes.fa -o Prodigal-Output/Genbanks.gbk -i #{ARGV[0]}")
|
data/bin/bacterial-annotator
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
# author: maxime déraspe
|
|
4
|
+
# email: maxime@deraspe.net
|
|
5
|
+
# review:
|
|
6
|
+
# date: 15-02-24
|
|
7
|
+
# version: 0.01
|
|
8
|
+
# licence:
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
require 'bacterial-annotator'
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Usage message to print to CLI
|
|
15
|
+
# Print the command line help of bacterial-annotator to standard output.
def usage

  help_text = <<OEM

bacterial-annotator [OPTIONS]

[OPTIONS]

// IO

--input/-i <fasta_file> Provide the fasta file to annotate
--outdir/-o <outdir> Output directory [default=BAnnotation]
--force/-f Force to overwrite the output directory

// Dataset

--refgenome/-g <GBK_ID> Provide a Genbank file or a Gbk Accession ID.
--guessref Will guess the best reference genome to use for the annotation.

--remotedb <remote_database> [nr|refseq|swissprot]
Complete the annotation of remaining CDS with a remote NCBI BLAST
Can be very slow, better to use an external database !

--externaldb <proteins fasta_file>
Complete or do the annotation of remaining CDS with this database (a protein fasta file).
Fasta headers need to look similar to NCBI or EBI fasta headers, ex.:
>gi|385721352|gb|AFI72857.1| NDM-1 [Escherichia coli]
>sp|C7C422|BLAN1_KLEPN Beta-lactamase NDM-1 OS=Klebsiella pneumoniae..

// Other options

--pidentity Minimum percentage identity to incorporate a CDS annotation [default=0.7]
--minlength Minimum contig length for annotation [default=500]

--meta Better for metagenome and plasmid annotations because of disparate codon usage [default=off]
--gff Will also generate gff annotation files [off by default]

--help/-h Print this !
OEM

  print help_text

end
|
|
56
|
+
|
|
57
|
+
# Parse the Options given on the CLI
|
|
58
|
+
# Parse the command line (consumes ARGV) into an options hash.
#
# Defaults: outdir "BAnnotation", pidentity 70, minlength 500, meta 0.
# Option names are matched case-insensitively. Values are stored exactly as
# given on the command line (strings); flags are stored as 1.
#
# --help/-h prints the usage and exits. An option given without its value
# aborts with a message. Arguments that match no option are reported on
# stderr and ignored.
#
# Returns the options hash.
def parseOptions

  options = {
    outdir: "BAnnotation",
    pidentity: 70,
    minlength: 500,
    meta: 0
  }

  # options that consume the next argument as their value
  valued = {
    "--input" => :input, "-i" => :input,
    "--refgenome" => :refgenome, "-g" => :refgenome,
    "--outdir" => :outdir, "-o" => :outdir,
    "--minlength" => :minlength,
    "--pidentity" => :pidentity,
    "--remotedb" => :remote_db,
    "--externaldb" => :external_db
  }

  # options that only switch something on
  flags = {
    "--force" => :force, "-f" => :force,
    "--gff" => :gff,
    "--meta" => :meta
  }

  while (arg = ARGV.shift)

    name = arg.downcase

    if valued.has_key? name
      value = ARGV.shift
      # a trailing option without value would otherwise store nil
      abort "Missing value for option #{arg}" if value.nil?
      options[valued[name]] = value
    elsif flags.has_key? name
      options[flags[name]] = 1
    elsif name == "--help" || name == "-h"
      usage
      exit
    else
      warn "Ignoring unknown argument: #{arg}"
    end

  end

  options

end
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
# Main
|
|
103
|
+
# Entry point: with at least two CLI arguments, parse them, make sure the
# third party tools are installed and run the annotation; otherwise print
# the usage.
if ARGV.size > 1

  ROOT = File.dirname(__FILE__)
  options = parseOptions

  # Check Options
  # Without a query fasta file there is nothing to annotate, so stop here
  # instead of only printing the message and carrying on.
  abort "You didn't provide a fasta file to annotate !" unless options.has_key? :input

  if [:refgenome, :remote_db, :external_db].none? { |key| options.has_key? key }
    puts "You didn't provide a reference genome or a database for the annotation !"
  else
    puts ""
  end

  # Check for 3rd party dependencies : Prodigal and Blat
  system("ba_prodigal")
  system("ba_blat")

  abort "exiting blat is missing" unless File.exist?("#{ROOT}/blat.linux")

  bannot = BacterialAnnotator.new(options, ROOT)
  bannot.prepare_files_for_annotation
  bannot.run_annotation

else
  usage
end
|
data/lib/bacterial-annotator.rb
ADDED
|
@@ -0,0 +1,406 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
# author: maxime déraspe
|
|
3
|
+
# email: maxime@deraspe.net
|
|
4
|
+
# review:
|
|
5
|
+
# date: 15-02-24
|
|
6
|
+
# version: 0.0.1
|
|
7
|
+
# licence:
|
|
8
|
+
|
|
9
|
+
require 'bio'
|
|
10
|
+
require 'fileutils'
|
|
11
|
+
|
|
12
|
+
require 'bacterial-annotator/genbank-manip'
|
|
13
|
+
require 'bacterial-annotator/fasta-manip'
|
|
14
|
+
require 'bacterial-annotator/synteny-manip'
|
|
15
|
+
require 'bacterial-annotator/remote-ncbi'
|
|
16
|
+
|
|
17
|
+
class BacterialAnnotator
|
|
18
|
+
|
|
19
|
+
# Initialize BacterialAnnotator
|
|
20
|
+
# options[:input], options[:refgenome], ROOT, options[:outdir], options)
|
|
21
|
+
# Set up an annotation run.
#
# options : hash of CLI options; reads :outdir, :minlength, :pidentity,
#           :input, :meta and, when present, :force and :refgenome
# root    : directory passed on to the prodigal and blat runs
#
# Creates the output directory. An existing one is removed when :force is
# set; otherwise the process aborts.
def initialize options, root

  @root = root
  @options = options
  @outdir = @options[:outdir]

  @minlength = @options[:minlength].to_i
  @pidentity = @options[:pidentity].to_f
  # a value below 1 is taken as a fraction and converted to a percentage
  @pidentity = @pidentity * 100 if @pidentity < 1.00

  # File.exist? is used because File.exists? was removed in Ruby 3.2
  if File.exist?(@outdir)
    unless options.has_key? :force
      abort "Output directory already exist ! Choose another one or use -f to overwrite"
    end
    puts "Overwriting output directory #{@outdir}"
    FileUtils.remove_dir(@outdir, true)
  end
  # mkdir_p also creates missing parent directories of a nested outdir
  FileUtils.mkdir_p(@outdir)

  @fasta = FastaManip.new(@options[:input], @options[:meta])

  @with_refence_genome = false
  if @options.has_key? :refgenome
    @with_refence_genome = true
    @refgenome = GenbankManip.new(@options[:refgenome], @outdir)
  end

  @prot_synteny = nil
  @annotation_stats = {by_contigs: {},
                       annotated_cds: 0,
                       total_cds: 0,
                       foreign_contigs: [],
                       synteny_contigs: [],
                       short_contigs: []}

  @contig_foreign_cds = {}
  @contig_annotations = {}

end # end of method
|
|
61
|
+
|
|
62
|
+
# Prepare files for the annotation
|
|
63
|
+
# Will run prodigal on the query and prepare reference genome files
|
|
64
|
+
# Run prodigal on the query fasta and, when a reference genome was given,
# write its CDS to the output directory.
def prepare_files_for_annotation
  puts "\nRunning Prodigal on your genome.."
  @fasta.run_prodigal(@root, @outdir)
  puts "Prodigal done."

  return unless @with_refence_genome

  @refgenome.write_cds_to_file(@outdir)
  puts "Successfully loaded #{@refgenome.gbk.definition}"
end # end of method
|
|
73
|
+
|
|
74
|
+
# run_alignment of reference genome proteins and the query
|
|
75
|
+
# Run the whole annotation: align against the reference genome when there
# is one, finish the remaining CDS with the external/remote database, write
# the genbank files and the statistics.
def run_annotation

  foreign_cds_file =
    if @with_refence_genome # Annotation with the Reference Genome

      @prot_synteny = SyntenyManip.new(@fasta.prodigal_files[:proteins], @refgenome.cds_file, "Prot-Ref", @pidentity)
      puts "\nRunning BLAT alignment with Reference Genome.."
      @prot_synteny.run_blat(@root, @outdir)
      @prot_synteny.extract_hits(:refgenome)

      @fasta.prodigal_files[:contigs].each_with_index do |contig, contig_index|

        if @fasta.prodigal_files[:contigs_length][contig_index] < @minlength
          # short contigs are only recorded, not annotated
          @annotation_stats[:short_contigs] << contig
        else
          contig_prots = @fasta.prodigal_files[:prot_ids_by_contig][contig]

          @contig_annotations[contig] = @prot_synteny.get_annotation_for_contig(contig_prots, @refgenome.coding_seq)

          remaining_cds = cumulate_annotation_stats_reference(contig, @contig_annotations[contig])

          @contig_foreign_cds[contig] = remaining_cds unless remaining_cds.empty?
        end

      end

      # dump foreign proteins to file
      dump_cds

    else

      # no reference genome .. will process all the CDS
      @fasta.prodigal_files[:proteins]

    end

  # Finishing annotation for foreign proteins
  finish_annotation(foreign_cds_file)

  # Parse annotations to genbank files
  parsing_genbank_files

  puts "\nPrinting Statistics.."
  print_stats("#{@outdir}/Annotation-Stats.txt")

end # end of method
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
# Finishing the annotation of the remaining CDS
|
|
129
|
+
# Annotate the CDS listed in remaining_cds_file (a protein fasta file) with
# the external database (:external_db option) or, failing that, with a
# remote NCBI BLAST (:remote_db option). Does nothing when neither option
# is set. Results are stored in @contig_annotations, keyed by contig and
# then by protein id.
def finish_annotation remaining_cds_file

  if @options.has_key? :external_db # from an external DB

    db_file = @options[:external_db]
    ref_cds = extract_externaldb_prot_info db_file

    externaldb_synteny = SyntenyManip.new(remaining_cds_file, db_file, "Prot-ExternalDB", @pidentity)
    puts "\nRunning BLAT alignment with External Database.."
    externaldb_synteny.run_blat @root, @outdir
    externaldb_synteny.extract_hits :externaldb

    externaldb_synteny.aln_hits.each do |prot_id, aln|

      hit_gi = aln[:hits][0]
      db_entry = ref_cds[hit_gi]

      # a hit whose id was not found among the parsed database headers
      # cannot be described; leave that protein unannotated
      if db_entry.nil?
        puts "No database entry found for hit #{hit_gi} (#{prot_id}), skipping"
        next
      end

      note = "correspond to gi:#{hit_gi}"
      note += " from #{db_entry[:org]}" if db_entry[:org] != ""

      # protein ids are "<contig>_<n>": drop the last "_" separated field
      contig_of_protein = prot_id.split("_")[0..-2].join("_")
      @contig_annotations[contig_of_protein] ||= {}
      @contig_annotations[contig_of_protein][prot_id] = {product: db_entry[:product],
                                                         gene: nil,
                                                         locustag: nil,
                                                         note: note}

    end

  elsif @options.has_key? :remote_db # from a remote DB

    # do it by chunk to avoid NCBI CPU exceeding limit
    cds_files = split_remaining_cds_file remaining_cds_file
    @remotedb = @options[:remote_db]

    puts "\n# NCBI Blast on #{@remotedb}"

    cds_files.each do |cds_file|

      begin
        ncbiblast = RemoteNCBI.new(@remotedb,
                                   cds_file,
                                   "#{cds_file}.#{@remotedb}.xml",
                                   @pidentity)
      rescue StandardError => e
        # one failed chunk must not stop the others; report why it failed
        puts "Problem NCBI blast for foreign proteins (#{cds_file}: #{e.message})"
        next
      end

      ncbiblast.extract_blast_results
      if ! ncbiblast.aln_hits
        puts "Didn't produce the annotation for #{cds_file}"
        next
      end

      ncbiblast.aln_hits.each do |prot_id, aln|

        best_hit = aln[:hits][0]
        note = "correspond to gi:#{best_hit[:gi]}"
        note += " from #{best_hit[:org]}" if best_hit[:org] != ""

        contig_of_protein = prot_id.split("_")[0..-2].join("_")
        @contig_annotations[contig_of_protein] ||= {}
        @contig_annotations[contig_of_protein][prot_id] = {product: best_hit[:product],
                                                           gene: nil,
                                                           locustag: nil,
                                                           note: note}

      end

    end

  end

end # end of method
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
# parse all genbank files
|
|
223
|
+
# Add the collected annotations to the genbank file of every annotated contig.
def parsing_genbank_files

  puts "\nParsing annotation into genbank files.."

  @contig_annotations.each_pair do |contig_name, prot_annotations|
    gbk_dir = @fasta.prodigal_files[:gbk_path]
    contig_gbk = GenbankManip.new("#{gbk_dir}/#{contig_name}.gbk", "#{gbk_dir}")
    contig_gbk.add_annotation(prot_annotations, gbk_dir, 0)
  end

end # end of method
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
# cumulate the stats for the synteny
|
|
236
|
+
# return : unannotated cds array
|
|
237
|
+
# Cumulate the reference-genome annotation statistics of one contig.
#
# contig           : contig name
# contig_prots_ann : hash protein id => annotation (nil when the protein got
#                    no annotation); a nil hash is treated as empty
#
# Updates @annotation_stats (:total_cds, :annotated_cds and either
# :foreign_contigs or :synteny_contigs).
#
# Returns the array of protein ids left without annotation.
def cumulate_annotation_stats_reference contig, contig_prots_ann

  annotations = contig_prots_ann || {}
  contig_prots = @fasta.prodigal_files[:prot_ids_by_contig][contig]

  @annotation_stats[:total_cds] += contig_prots.length if contig_prots

  annotated, unannotated = annotations.partition { |_prot_id, ann| !ann.nil? }
  @annotation_stats[:annotated_cds] += annotated.length

  # a contig without a single entry in the annotation hash counts as foreign
  contig_kind = annotations.empty? ? :foreign_contigs : :synteny_contigs
  @annotation_stats[contig_kind] << contig

  unannotated.map { |prot_id, _ann| prot_id }

end # end of method
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
# print statistics to file
|
|
263
|
+
# Write the annotation statistics to a text file.
#
# file : path of the file to (over)write
#
# Percentages are rounded to two decimals. When there is no contig or no
# CDS at all (for instance a run without reference genome, where the CDS
# counters are never updated) the percentage is reported as 0.0 rather
# than NaN.
def print_stats file

  short_count = @annotation_stats[:short_contigs].length
  foreign_count = @annotation_stats[:foreign_contigs].length
  synteny_count = @annotation_stats[:synteny_contigs].length
  total_nb_contigs = foreign_count + synteny_count + short_count

  annotated_cds = @annotation_stats[:annotated_cds]
  total_cds = @annotation_stats[:total_cds]

  # guard the zero denominator: 0.0/0.0 is NaN
  percent = lambda do |part, whole|
    whole.zero? ? 0.0 : (part.to_f / whole.to_f * 100).round(2)
  end

  File.open(file, "w") do |fopen|
    fopen.write("#Contigs annotation based on reference genomes\n")
    fopen.write("Short Contigs (< #{@minlength}) :\t\t" + short_count.to_s + "\n")
    fopen.write("Foreign Contigs :\t\t" + foreign_count.to_s + "\n")
    fopen.write("Annotated Contigs :\t\t" + synteny_count.to_s + "\n")
    fopen.write("Total Contigs :\t\t\t" + total_nb_contigs.to_s + "\n")
    fopen.write("% Contigs annotated :\t\t" + percent.call(synteny_count, total_nb_contigs).to_s + "\n")
    fopen.write("\n")

    fopen.write("#CDS annotations based on reference genomes\n")
    fopen.write("Annotated CDS :\t\t\t" + annotated_cds.to_s + "\n")
    fopen.write("Total CDS :\t\t\t" + total_cds.to_s + "\n")
    fopen.write("% CDS annotated :\t\t" + percent.call(annotated_cds, total_cds).to_s + "\n")
    fopen.write("\n")
  end

end # end of method
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
# dump cds to file for blast
|
|
292
|
+
# Dump the proteins listed in @contig_foreign_cds (the CDS left without
# annotation) from the prodigal protein file into
# "<outdir>/Proteins-foreign.fa".
#
# Returns the path of the written fasta file.
def dump_cds

  outfile = "#{@outdir}/Proteins-foreign.fa"

  # hash lookup instead of scanning an array for every fasta header
  wanted = {}
  @contig_foreign_cds.each_value do |prot_ids|
    prot_ids.each { |prot_id| wanted[prot_id] = true }
  end

  # block form closes the output even when reading the proteins fails
  File.open(outfile, "w") do |cds_outfile|
    inprot = false
    File.foreach(@fasta.prodigal_files[:proteins]) do |l|
      if l[0] == ">"
        # the id is the first word of the header, without the ">"
        prot_id = l.chomp.split(" ")[0].delete(">")
        inprot = wanted.has_key? prot_id
      end
      cds_outfile.write(l) if inprot
    end
  end

  outfile

end # end of method
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
# extract the information on protein from an externaldb
|
|
321
|
+
# Extract the product and organism of every protein of an external
# database (a protein fasta file) from its fasta headers.
#
# db : path of the fasta file
#
# Returns a hash keyed by the second "|" separated header field (gi number
# for NCBI headers, accession for UniProt ones) with values of the form
# {product: String, org: String}. Both strings are empty when the header
# carries no recognised description.
def extract_externaldb_prot_info db

  # NCBI
  # >gi|103485499|ref|YP_615060.1| chromosomal replication initiation protein [Sphingopyxis alaskensis RB2256]
  # Swissprot
  # >sp|C7C422|BLAN1_KLEPN Beta-lactamase NDM-1 OS=Klebsiella pneumoniae GN=blaNDM-1 PE=1 SV=1
  # TrEMBL
  # >tr|E5KIY2|E5KIY2_ECOLX Beta-lactamase NDM-1 OS=Escherichia coli GN=blaNDM-1 PE=1 SV=1

  ref_cds = {}

  File.foreach(db) do |l|

    next unless l[0] == ">"

    lA = l.chomp.split("|")
    key_gi = lA[1]
    product_long = lA[-1]

    # in Swissprot/TrEMBL headers the last field starts with the entry name
    # (BLAN1_KLEPN above), which is not part of the product
    if lA[0] == ">sp" || lA[0] == ">tr"
      product_long = product_long.sub(/\A\S+\s+/, "")
    end

    organism = ""
    product = ""

    if product_long.include?(" [") && product_long.include?("]") # NCBI
      organism = product_long[/\[.*?\]/].to_s
      product = product_long.split(" [")[0].strip
    elsif product_long.include? "OS="
      product_tmp = product_long.split("OS=")
      # the organism runs up to the next two capital letter tag (GN=, PE=, ..)
      organism = (product_tmp[1].to_s.split(/[A-Z][A-Z]=/)[0] || "").strip
      product = product_tmp[0].strip
    elsif product_long =~ /[A-Z][A-Z]=/
      product = (product_long.split(/[A-Z][A-Z]=/)[0] || "").strip
    end

    org = organism.gsub("[","").gsub("]","")
    ref_cds[key_gi] = {product: product, org: org}

  end # end of file reading

  ref_cds

end # end of method
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
# split fasta file to multiple fasta
|
|
370
|
+
# Split a fasta file into several fasta files of at most chunk_size
# sequences each, written as
# "<outdir>/Protein-foreign.split/ProtForeign.<n>.fa".
#
# file       : path of the fasta file to split
# chunk_size : maximum number of sequences per output file (default 20)
#
# Returns the array of written file paths. ProtForeign.0.fa is always
# created, even for an empty input.
def split_remaining_cds_file file, chunk_size = 20

  cds_files = []
  outdir = "#{@outdir}/Protein-foreign.split"

  # Dir.exist? is used because Dir.exists? was removed in Ruby 3.2
  Dir.mkdir(outdir) unless Dir.exist? outdir

  # opens the next numbered chunk file and records its path
  open_chunk = lambda do
    path = "#{outdir}/ProtForeign.#{cds_files.length}.fa"
    cds_files << path
    File.open(path, "w")
  end

  iter = 0
  fout = nil

  begin
    fout = open_chunk.call
    File.foreach(file) do |l|
      if l[0] == ">"
        # a header starts a new sequence: switch file once the chunk is full
        if iter >= chunk_size
          fout.close
          fout = open_chunk.call
          iter = 0
        end
        iter += 1
      end
      fout.write(l)
    end
  ensure
    # never leave the current chunk open, even when reading fails
    fout.close if fout && !fout.closed?
  end

  cds_files

end # end of method
|
|
403
|
+
|
|
404
|
+
private :dump_cds, :split_remaining_cds_file
|
|
405
|
+
|
|
406
|
+
end # end of class
|