bacterial-annotator 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,166 @@
1
+ # -*- coding: utf-8 -*-
2
+ # author: maxime déraspe
3
+ # email: maxime@deraspe.net
4
+ # review:
5
+ # date: 15-02-24
6
+ # version: 0.0.1
7
+ # licence:
8
+
9
+
10
+
11
# Holds an input FASTA file, runs Prodigal gene prediction on it and splits
# the results into one FASTA file and one GenBank file per contig.
class FastaManip

  attr_reader :fasta_flat, :fasta_file, :prodigal_files

  # Initialize fasta holder.
  #
  # fasta_file : path of the FASTA file to annotate
  # meta       : truthy to run prodigal with "-p meta"
  #
  # Aborts the program when BioRuby does not detect the file as FASTA.
  def initialize(fasta_file, meta)
    @fasta_file = fasta_file
    @fasta_flat = Bio::FlatFile.auto(@fasta_file)
    @meta = meta
    @prodigal_files = nil
    @single_fasta = nil

    return if @fasta_flat.dbclass == Bio::FastaFormat

    abort "Aborting : The input sequence is not a fasta file !"
  end

  # Run prodigal on the genome to annotate.
  #
  # root   : directory containing the prodigal.linux executable
  # outdir : output directory (created when missing)
  #
  # RETURN : the @prodigal_files hash describing every produced file
  def run_prodigal(root, outdir)
    Dir.mkdir(outdir.to_s) unless Dir.exist?(outdir.to_s)

    # Argument-vector form of system: no shell is involved, so paths with
    # spaces or shell metacharacters are passed to prodigal untouched.
    cmd = ["#{root}/prodigal.linux"]
    cmd.push("-p", "meta") if @meta
    cmd.push("-i", @fasta_file.to_s,
             "-a", "#{outdir}/Proteins.fa",
             "-d", "#{outdir}/Genes.fa",
             "-o", "#{outdir}/Genbanks.gbk",
             "-q")
    unless system(*cmd)
      # Without this check a failed run would go on to parse missing or
      # stale output files.
      abort "Aborting : prodigal failed on #{@fasta_file} !"
    end

    @prodigal_files = {multiGBK: "#{outdir}/Genbanks.gbk",
                       contigs: [],
                       contigs_length: [],
                       genes: "#{outdir}/Genes.fa",
                       proteins: "#{outdir}/Proteins.fa",
                       prot_ids_by_contig: {},
                       fasta_path: "#{outdir}/single-fasta/",
                       gbk_path: "#{outdir}/single-genbank/"}
    split_fasta outdir
    split_genbank outdir, "#{outdir}/Genbanks.gbk"
    extract_cds_names
    @prodigal_files
  end


  # Split the multi-FASTA input into one file per contig under
  # "<outdir>/single-fasta/".
  #
  # Fills @prodigal_files[:contigs] and [:contigs_length], and remembers each
  # entry in @single_fasta (keyed by contig name) for split_genbank.
  def split_fasta(outdir)
    @single_fasta = {}
    fasta_dir = "#{outdir}/single-fasta"
    Dir.mkdir(fasta_dir) unless Dir.exist?(fasta_dir)

    @fasta_flat.each_entry do |seq|
      # contig name = first whitespace-delimited word of the header
      file_name = seq.definition.chomp.split(" ")[0]
      @prodigal_files[:contigs] << "#{file_name}"
      @prodigal_files[:contigs_length] << seq.seq.length
      File.open("#{fasta_dir}/#{file_name}.fasta", "w") do |fwrite|
        fwrite.write(seq)
      end
      @single_fasta[file_name] = seq
    end
  end


  # Split the multi-GenBank file written by prodigal into one file per contig
  # under "<outdir>/single-genbank/". A LOCUS line is prepended to each record
  # and the contig sequence (ORIGIN block) is appended before the terminator.
  #
  # split_fasta must have been called first: the sequences come from
  # @single_fasta.
  def split_genbank(outdir, multigbk)
    gbk_dir = "#{outdir}/single-genbank"
    Dir.mkdir(gbk_dir) unless Dir.exist?(gbk_dir)

    out = nil     # handle of the per-contig file currently being written
    origin = nil  # ORIGIN block belonging to the current record

    File.foreach(multigbk) do |l|
      if l.start_with?("DEFINITION")
        # A new record starts; make sure an unterminated one is not leaked.
        out.close if out
        # The contig name is the first word of the seqhdr="..." field, which
        # is the third ';'-separated field of the DEFINITION line.
        file_name = l.chomp.split(";")[2].gsub("seqhdr","").delete("\"").delete("=").split(" ")[0]
        origin, seq_length = print_sequence_for_gbk @single_fasta[file_name]
        spacer = " " * (20 - seq_length.to_s.length)
        # Same "DD-Mon-YYYY" text as before, without needing the date library.
        date = Time.now.strftime("%d-%b-%Y")
        locus = "LOCUS #{file_name}#{spacer}#{seq_length.to_s} bp DNA linear BCT #{date}\n"
        out = File.open("#{gbk_dir}/#{file_name}.gbk", "w")
        out.write(locus)
        out.write(l)
      elsif l.start_with?("//")
        next unless out
        out.write(origin)   # origin already ends with the "//" terminator
        out.write("\n")
        out.close
        out = nil
      elsif out
        # Lines outside of a record (no file open) are ignored.
        out.write(l)
      end
    end
  ensure
    out.close if out
  end


  # Utility function building the sequence part that ends a gbk record.
  #
  # seq : object whose #seq returns the nucleotide string
  #
  # RETURN : [text, length] where text is "ORIGIN\n", then lines of up to 60
  #          lower-case nucleotides (groups of 10, prefixed with the 1-based
  #          position of the first one), then "//" without a trailing newline;
  #          length is the sequence length.
  def print_sequence_for_gbk(seq)
    sequence = seq.seq.downcase
    outseq = "ORIGIN\n".dup
    offset = 0

    # At least one line is always emitted, even for an empty sequence.
    loop do
      chunk = sequence[offset, 60].to_s
      outseq << ("%9s " % (offset + 1))
      outseq << chunk.scan(/.{1,10}/).join(" ")
      outseq << "\n"
      offset += 60
      break if offset >= sequence.length
    end
    outseq << "//"

    return outseq, sequence.length
  end


  # Extract protein ids from the prodigal protein file and group them by
  # contig in @prodigal_files[:prot_ids_by_contig].
  #
  # An id is the first word of the entry header; its contig is the id without
  # its last "_<number>" part. Ids are ordered by that number within a contig.
  def extract_cds_names
    prot_ids = {}
    Bio::FlatFile.auto(@prodigal_files[:proteins]).each_entry do |entry|
      prot_id = entry.definition.split(" ")[0]
      contig = prot_id.split("_")[0..-2].join("_")
      (prot_ids[contig] ||= []) << prot_id
    end

    prot_ids.each_value do |ids|
      ids.sort_by! { |id| id.split("_")[-1].to_i }
    end

    @prodigal_files[:prot_ids_by_contig] = prot_ids
  end


  private :extract_cds_names # :split_fasta, :split_genbank

end
@@ -0,0 +1,208 @@
1
+ # -*- coding: utf-8 -*-
2
+ # author: maxime déraspe
3
+ # email: maxime@deraspe.net
4
+ # review:
5
+ # date: 15-02-24
6
+ # version: 0.0.1
7
+ # licence:
8
+
9
+
10
+
11
# Wraps a single GenBank record: extracts its CDS/protein information, writes
# the proteins to a FASTA file and adds annotations to prodigal-made records.
class GenbankManip

  attr_accessor :gbk, :coding_seq, :cds_file

  # Initialize the genbank file.
  #
  # gbk_file : path of a GenBank file; when no such file exists the value is
  #            used as an NCBI identifier and the record is downloaded to
  #            "<outdir>/<gbk_file>.gbk"
  # outdir   : directory receiving a downloaded record
  #
  # Aborts the program when the file is not detected as GenBank. Only the
  # first entry of the file is kept.
  def initialize(gbk_file, outdir)
    @gbk_file = gbk_file
    unless File.exist? @gbk_file
      fetch_ncbi_genome(@gbk_file, outdir)
      @gbk_file = "#{outdir}/#{gbk_file}.gbk"
    end

    flat_gbk = Bio::FlatFile.auto(@gbk_file)

    # Check if gbk is valid
    if flat_gbk.dbclass != Bio::GenBank
      abort "Aborting : The input #{@gbk_file} is not a valid genbank file !"
    end

    @gbk = flat_gbk.next_entry
    @bioseq = @gbk.to_biosequence
  end


  # Prepare CDS/proteins (computed once, then cached in @coding_seq).
  #
  # RETURN : hash keyed by protein id; each value holds :location, :locustag,
  #          :gene, :product and :bioseq (the translation as a Bio sequence).
  #          CDS flagged "pseudo" are skipped.
  def get_cds
    return @coding_seq unless @coding_seq.nil?

    @coding_seq = {}

    # Iterate over each CDS
    @gbk.each_cds do |ft|
      ftH = ft.to_hash
      next if ftH.has_key? "pseudo"

      locustag = Array(ftH["locus_tag"])[0]
      # A CDS without protein_id is keyed by its locus tag, so that several of
      # them do not overwrite each other under the same empty key.
      prot_id = ftH["protein_id"].nil? ? (locustag || "") : ftH["protein_id"][0]
      pep = ftH.has_key?("translation") ? ftH["translation"][0] : ""

      @coding_seq[prot_id] = {location: ft.locations,
                              locustag: locustag,
                              gene: Array(ftH["gene"])[0],
                              product: Array(ftH["product"])[0],
                              bioseq: Bio::Sequence.auto(pep)}
    end

    @coding_seq
  end


  # Print CDS (protein sequences) to "<outdir>/<accession>.pep" in FASTA
  # format, 60 residues per line, using the protein id as header.
  #
  # RETURN : cds_file path (also stored in @cds_file)
  def write_cds_to_file(outdir)
    cds_file = "#{@gbk.accession}.pep"
    get_cds if @coding_seq.nil?

    File.open("#{outdir}/#{cds_file}", "w") do |fwrite|
      @coding_seq.each do |prot_id, cds|
        fwrite.write(cds[:bioseq].output_fasta("#{prot_id}", 60))
      end
    end

    @cds_file = "#{outdir}/" + cds_file
  end


  # Add annotation to a genbank file produced by prodigal and write the result
  # to "<outdir>/<contig>.gbk".
  #
  # annotations : hash keyed by "<contig>_<n>" (n = 1-based rank of the CDS in
  #               the record); values may provide :locustag, :gene, :product
  #               and :note
  # outdir      : output directory
  # mode        : 0 drops the existing qualifiers of every CDS before adding
  #               the new ones; any other value keeps them
  def add_annotation(annotations, outdir, mode)
    # contig name = first word of the seqhdr="..." field of the DEFINITION
    contig = @gbk.definition.split(";")[2].
      gsub("seqhdr","").
      delete("\"").
      delete("=").
      split(" ")[0]

    cds_rank = 0
    @gbk.features.each do |cds|
      next if cds.feature != "CDS"

      qualifiers = mode == 0 ? [] : cds.qualifiers

      cds_rank += 1
      prot_id = contig + "_" + cds_rank.to_s
      hit = annotations.has_key?(prot_id) ? annotations[prot_id] : nil

      unless hit.nil?
        gene = hit[:gene]
        product = hit[:product]
        locus = hit[:locustag]
        note = hit[:note]

        qualifiers.push(Bio::Feature::Qualifier.new('gene', gene)) unless gene.nil?
        qualifiers.push(Bio::Feature::Qualifier.new('product', product)) unless product.nil?
        unless locus.nil?
          qualifiers.push(Bio::Feature::Qualifier.new('note', "correspond to #{locus} locus"))
        end
        qualifiers.push(Bio::Feature::Qualifier.new('note', note)) unless note.nil?
      end

      cds.qualifiers = qualifiers
    end

    File.open("#{outdir}/#{contig}.gbk", "w") do |f|
      f.write(@gbk.to_biosequence.output(:genbank))
    end

    # Bioruby doesn't support gff at this point
    # File.open("#{outdir}/#{contig}.gff", "w") do |f|
    #   f.write(@gbk.to_biosequence.output(:gff))
    # end
  end


  ###################
  # Private Methods #
  ###################

  # Fct: Get dna sequence of a CDS.
  # Only the first location segment is used; the sequence is
  # reverse-complemented when that segment is on the minus strand.
  def get_DNA(cds, seq)
    first_loc = cds.locations[0]
    seq_start = first_loc.from.to_i
    seq_end = first_loc.to.to_i
    fasta = Bio::Sequence::NA.new(seq.subseq(seq_start, seq_end))
    fasta.reverse_complement! if first_loc.strand == -1
    Bio::Sequence.auto(fasta)
  end


  # Fetch genbank genome from NCBI (nucleotide database, "gb" format) and
  # save it as "<outdir>/<refgenome_id>.gbk".
  def fetch_ncbi_genome(refgenome_id, outdir)
    Bio::NCBI.default_email = 'default@default.com'
    ncbi = Bio::NCBI::REST.new
    genbankstring = ncbi.efetch(refgenome_id, {"db"=>'nucleotide', "rettype"=>'gb'})
    File.open("#{outdir}/#{refgenome_id}.gbk", "w") do |f|
      f.write(genbankstring)
    end
  end

  private :fetch_ncbi_genome, :get_DNA


end # end of Class
@@ -0,0 +1,200 @@
1
+ # -*- coding: utf-8 -*-
2
+ # author: maxime déraspe
3
+ # email: maxime@deraspe.net
4
+ # review:
5
+ # date: 15-02-24
6
+ # version: 0.0.1
7
+ # licence:
8
+
9
+ require 'mechanize'
10
+ require 'open-uri'
11
+ require 'bio'
12
+
13
# Submits a protein FASTA file to the NCBI web BLAST (blastp), waits for the
# XML report and extracts the first hit passing the identity threshold for
# every query.
class RemoteNCBI

  attr_reader :aln_hits, :db, :xmloutput

  # Initialize stuff for a remote ncbi run: the search is submitted and the
  # method blocks until the report is available or the search has failed.
  #
  # db        : "swissprot", "refseq_protein" or "nr"; any other value is
  #             recorded as "bad database"
  # seq_file  : path of the FASTA file holding the query proteins
  # outfile   : path where the XML report is written
  # pidentity : minimal percentage of identity for a hit to be reported
  def initialize(db, seq_file, outfile, pidentity)

    if ! ["swissprot", "refseq_protein", "nr"].include? db
      @db = "bad database"
    else
      @db = db
    end

    # NCBI services are HTTPS only; the plain-http URL answers with a
    # redirect, which Net::HTTP.get_response does not follow.
    url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi'\
          '?PROGRAM=blastp&BLAST_PROGRAMS=blastp'\
          '&PAGE_TYPE=BlastSearch&SHOW_DEFAULTS=on'\
          '&LINK_LOC=blasthome'

    @seq_file = seq_file
    @outfile = outfile
    @pidentity = pidentity
    @xmloutput = ""
    @request_id = ""
    @resultURI = submit_blast url

    # submit_blast always returns a URI object, so success is decided on the
    # request id it recorded.
    @valid = @request_id.empty? ? false : validate_output

  end # end of method


  # Submit blast to ncbi through the web search form.
  #
  # ncbiURL : URL of the blastp search page
  #
  # RETURN : URI of the XML report for the obtained request id. The id is
  #          also stored in @request_id and stays "" when no id could be
  #          obtained after 11 attempts.
  def submit_blast(ncbiURL)

    f = @seq_file.split("/")[-1]
    seq_fasta = File.read(@seq_file)

    a = Mechanize.new { |agent|
      agent.user_agent_alias = 'Linux Firefox'
      agent.ignore_bad_chunking = true
    }

    requestID = ""
    try = 1

    while requestID == "" && try < 12

      begin
        a.get(ncbiURL) do |page|
          search = page.form_with(:name => 'searchForm') { |form|
            form.textareas[0].value = seq_fasta
            form.field_with(:name => 'DATABASE').value = @db
            form.field_with(:name => 'MAX_NUM_SEQ').value = 40
          }.submit

          # The request id is the text of the cell following "Request ID".
          cells = search.parser.css('td').map { |td| td.text }
          label_at = cells.index("Request ID")
          if label_at && cells[label_at + 1]
            requestID = cells[label_at + 1].gsub(" ","")
          end
        end
      rescue StandardError => e
        warn "POST for #{f} failed: #{e.message}"
      end

      # Every unsuccessful attempt counts, whether it raised or simply gave
      # no request id; otherwise the loop could spin forever.
      if requestID == ""
        try += 1
        puts "#{try} POST try for #{f}"
        sleep 3
      end

    end

    @request_id = requestID

    uri_parsed = "https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Get&RID=#{requestID}"
    puts "NCBI Blast for #{f}: #{uri_parsed}"

    return URI.parse("https://blast.ncbi.nlm.nih.gov/Blast.cgi?RESULTS_FILE=on&RID=#{requestID}&FORMAT_TYPE=XML&FORMAT_OBJECT=Alignment&CMD=Get")

  end # end of method


  # Poll the result URI until the XML report is returned, then store it in
  # @xmloutput and write it to @outfile.
  #
  # RETURN : true when the report was retrieved; false when the answer is
  #          neither XML nor a "Status=WAITING" page
  def validate_output

    xml = ""
    valid = true
    finish = false

    while valid && ! finish

      response = Net::HTTP.get_response(@resultURI)
      body = response.body.to_s.split("\n")

      # "<" and "?" are escaped so only a real XML declaration is accepted.
      if body[0] =~ /\A\s*<\?xml version=/
        xml = body.join("\n")
        finish = true
      else
        valid = body.any? do |l|
          l =~ /Status=/ && l.strip.gsub("Status=", "") == "WAITING"
        end
      end

      # Wait only when another request is going to be made.
      sleep poll_interval if valid && ! finish

    end

    if finish
      @xmloutput = xml
      File.open("#{@outfile}", "w") do |f|
        f.write(xml)
      end
      return finish
    end
    valid

  end # end of method

  # Extract blast results from the XML report into @aln_hits.
  #
  # @aln_hits is nil when no valid report was obtained. Otherwise it maps each
  # query id (first word of the query definition) to the first hit, in report
  # order, whose identity percentage (identities / subject length * 100) is at
  # least @pidentity. Queries without such a hit are absent.
  def extract_blast_results

    if !@valid
      @aln_hits = nil
      return
    end

    flat = Bio::FlatFile.auto("#{@outfile}")
    @aln_hits = {}

    flat.each_entry do |report|
      report.iterations.each do |query_it|
        prot_id = query_it.query_def.split(" ")[0]
        next if @aln_hits.has_key? prot_id

        hit = query_it.hits.find do |h|
          h.identity.to_f/h.target_len.to_f*100 >= @pidentity
        end
        next if hit.nil?

        # only the first description of a merged (">"-separated) hit is used
        definition_clean = hit.definition.split(">")[0].to_s
        organism = ""
        if ! definition_clean[/\[.*\]/].nil?
          organism = definition_clean[/\[.*\]/].gsub("[","").gsub("]","")
        end

        @aln_hits[prot_id] = {
          pId: hit.identity.to_f/hit.target_len.to_f*100,
          length: hit.target_len.to_i,
          evalue: hit.evalue,
          score: hit.bit_score.to_f,
          hits: [{gi: hit.hit_id.to_s.split("|")[1],
                  product: clean_product(definition_clean),
                  org: organism}]
        }
      end
    end

  end # end of method


  private

  # Seconds to wait between two polls of the result page.
  def poll_interval
    case @db
    when 'nr', 'refseq_protein'
      30
    else
      10
    end
  end

  # Cleaning product definition: drops the "MULTISPECIES: " and
  # "RecName: Full=" prefixes, the bracketed organism, and everything from the
  # first "; AltName", "; Flags:" or "; Short=" on.
  # RETURN : the product name, or nil when nothing is left.
  def clean_product(definition)
    product = definition.
      gsub("MULTISPECIES: ","").
      gsub(/ \[.*\]/,"").
      gsub("RecName: Full=","")

    # The space before "; Short=" is optional so that both spellings match.
    ["; AltName", "; Flags:", / ?; Short=/].each do |separator|
      product = product.split(separator)[0]
      return nil if product.nil?
    end
    product
  end

end # end of class