bacterial-annotator 0.8.9 → 0.9.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1dc26e2ae283ea0c8163e7c487c95bf0f0b7cffa4095d841b38e709262698a52
4
- data.tar.gz: be699876985f29d8269119852b6f027f562c541263abb6afe29b84783375ad3b
3
+ metadata.gz: b90fb0d1c0bc3d82a10706a7d6d2c0554099b94ce8ec27f63d4ebcad0ec11f04
4
+ data.tar.gz: c98732d57ad75290b6f0584ee559de98f8676ed0d94e5f4e9886f152269bf75e
5
5
  SHA512:
6
- metadata.gz: a7615dee5f7d89e8224477dc94798621d72ab8923d1b7027c9b912f46f9fca85beb71bf1b3cd7a058fa27503b07a244ecd97a71330c0816d4b3c41b4f8795e55
7
- data.tar.gz: 813dc52e3732accd812e54a9ad998fee65e01cdec8bf79c136c59262512683ed32656e88e860476f405e08441723eef6681648d972f17237ec80d88a64a8a46d
6
+ metadata.gz: eab03371ef5f9fbf75b694f58ba4108adb38f6316131a96efc38a0c938fed727f3583f80e3f97604103c5c25f0d271fa9bcb397670b16bfd1bb2b930ca47be83
7
+ data.tar.gz: 6e48a1c67bd2add06ca3772bab0f734050a28476e05a9879f23fbf6df1cd88a314aee21c469a8226623978fefcd1b0d62cd240960b4097c55802395a891d5ae9
data/bin/ba_diamond CHANGED
@@ -2,21 +2,21 @@
2
2
  # -*- coding: utf-8 -*-
3
3
  # author: maxime déraspe
4
4
  # email: maximilien1er@gmail.com
5
- # review:
5
+ # review:
6
6
  # date: 17-08-24
7
7
  # version: 0.01
8
- # licence:
8
+ # licence:
9
9
 
10
10
  require 'open-uri'
11
11
 
12
12
  ROOT_path = File.dirname(__FILE__)
13
- # diamond URL = "https://github.com/bbuchfink/diamond/releases/download/v0.9.10/diamond-linux64.tar.gz"
13
+ # diamond URL = "https://github.com/bbuchfink/diamond/releases/download/v2.0.15/diamond-linux64.tar.gz"
14
14
 
15
15
  # Install diamond on the user system
16
16
  def installDiamond
17
17
 
18
18
  begin
19
- resp = open("https://github.com/bbuchfink/diamond/releases/download/v0.9.10/diamond-linux64.tar.gz")
19
+ resp = open("https://github.com/bbuchfink/diamond/releases/download/v2.0.15/diamond-linux64.tar.gz")
20
20
  open("#{ROOT_path}/diamond-linux64.tar.gz", "wb") do |file|
21
21
  file.write(resp.read)
22
22
  end
@@ -43,4 +43,3 @@ if ! File.exists? "#{ROOT_path}/diamond.linux"
43
43
  puts ""
44
44
 
45
45
  end
46
-
@@ -2,10 +2,10 @@
2
2
  # -*- coding: utf-8 -*-
3
3
  # author: maxime déraspe
4
4
  # email: maximilien1er@gmail.com
5
- # review:
5
+ # review:
6
6
  # date: 15-02-24
7
7
  # version: 0.01
8
- # licence:
8
+ # licence:
9
9
 
10
10
 
11
11
  require 'bacterial-annotator'
@@ -160,6 +160,7 @@ compare [OPTIONS]
160
160
  --proc <nb of process> Number of process to run the comparison
161
161
 
162
162
  //Synteny
163
+ --refgenome genbank ref genome if you want to provide complete genomes not previously annotated with bacterial-annotator
163
164
  --pidentity <default 0.80> Minimal percentage identity to call a syntenic protein
164
165
  --min_cov <default 0.80> Minimal coverage for the alignment of the protein / gene
165
166
 
@@ -193,6 +194,7 @@ def parseOptions_compare
193
194
  options[:phylogeny] = 0
194
195
  options[:software] = "fasttree"
195
196
  options[:bootstrap] = 100
197
+ options[:refgenome] = ""
196
198
 
197
199
  if ARGV.length == 0
198
200
  usage_compare
@@ -212,6 +214,8 @@ def parseOptions_compare
212
214
  options[:proc] = ARGV.shift
213
215
  when "--align"
214
216
  options[:align] = ARGV.shift
217
+ when "--refgenome", "-g"
218
+ options[:refgenome] = ARGV.shift
215
219
  when "--concat"
216
220
  options[:concat] = ARGV.shift
217
221
  when "--phylogeny"
@@ -7,7 +7,7 @@
7
7
 
8
8
  require 'json'
9
9
  require 'zlib'
10
-
10
+ require 'pp'
11
11
 
12
12
  class SequenceAnnotation
13
13
 
@@ -272,7 +272,7 @@ class SequenceAnnotation
272
272
  @rna_seq = {}
273
273
  @gbk.features do |ft|
274
274
 
275
- next if ! ft.feature.to_s.include? "rRNA"
275
+ next if ! ft.feature.to_s.include? "RNA"
276
276
 
277
277
  ftH = ft.to_hash
278
278
  loc = ft.locations
@@ -285,7 +285,12 @@ class SequenceAnnotation
285
285
  # gene = ftH["gene"] if !ftH["gene"].nil?
286
286
  # protId = ftH["protein_id"][0] if !ftH["protein_id"].nil?
287
287
  product = ""
288
- product = ftH["product"][0] if !ftH["product"].nil?
288
+
289
+ if !ftH["product"].nil?
290
+ product = ftH["product"][0]
291
+ # puts ftH["product"].join(",") + "---" + ftH["product"][0]
292
+ end
293
+
289
294
  locustag = ftH["locus_tag"][0] if !ftH["locus_tag"].nil?
290
295
 
291
296
  # puts "#{@accession}\t#{seqBeg}\t#{seqEnd}\t#{strand}\t#{protId}\t#{locustag}\t#{gene[0]}\t#{product[0]}"
@@ -533,6 +538,7 @@ class SequenceAnnotation
533
538
 
534
539
  new_features = {}
535
540
  annotations_done = {}
541
+ gbk_features_len = @gbk.features.length
536
542
 
537
543
  @gbk.features.each_with_index do |ft, ft_index|
538
544
 
@@ -540,8 +546,9 @@ class SequenceAnnotation
540
546
 
541
547
  next if annotations_done.has_key? k
542
548
 
543
- if v[:query_location][0][0] < ft.locations[0].from
549
+ if v[:query_location][0][0] < ft.locations[0].from or ft_index == gbk_features_len-1
544
550
 
551
+ pp v
545
552
  if v[:subject_location][0][0] > v[:subject_location][0][1]
546
553
  location = "complement(#{v[:query_location][0][0]}..#{v[:query_location][0][1]})"
547
554
  else
@@ -550,7 +557,12 @@ class SequenceAnnotation
550
557
 
551
558
  feature = Bio::Feature.new(v[:feature][0],location)
552
559
  feature.qualifiers.push(Bio::Feature::Qualifier.new('product',v[:product][0])) if ! v[:product][0].nil? or v[:product][0] != ""
553
- new_features[ft_index] = feature
560
+ if ft_index == gbk_features_len-1
561
+ new_features[gbk_features_len] = feature
562
+ else
563
+ new_features[ft_index] = feature
564
+ end
565
+
554
566
  annotations_done[k] = 1
555
567
  break
556
568
 
@@ -1,10 +1,10 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  # author: maxime déraspe
3
3
  # email: maximilien1er@gmail.com
4
- # review:
4
+ # review:
5
5
  # date: 15-02-24
6
6
  # version: 0.0.1
7
- # licence:
7
+ # licence:
8
8
 
9
9
  require 'bio'
10
10
  require 'fileutils'
@@ -324,11 +324,19 @@ class BacterialAnnotator
324
324
  @options[:pcoverage],
325
325
  "prot")
326
326
 
327
- print "# Running BLAT alignment with External Database.."
327
+ # print "# Running BLAT alignment with External Database.."
328
+ # start_time = Time.now
329
+ # @externaldb_synteny.run_blat
330
+ # end_time = Time.now
331
+ # c_time = Helper.sec2str(end_time-start_time)
332
+ # print "done (#{c_time})\n"
333
+ # @externaldb_synteny.extract_hits :externaldb
334
+
335
+ print "# Running alignment with External DB"
328
336
  start_time = Time.now
329
- @externaldb_synteny.run_blat
337
+ @externaldb_synteny.run_diamond
330
338
  end_time = Time.now
331
- c_time = Helper.sec2str(end_time-start_time)
339
+ c_time = Helper.sec2str(end_time - start_time)
332
340
  print "done (#{c_time})\n"
333
341
  @externaldb_synteny.extract_hits :externaldb
334
342
 
@@ -1,16 +1,18 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  # author: maxime déraspe
3
3
  # email: maximilien1er@gmail.com
4
- # review:
4
+ # review:
5
5
  # date: 15-02-24
6
6
  # version: 0.0.1
7
- # licence:
7
+ # licence:
8
8
 
9
9
  require 'bio'
10
10
  require 'fileutils'
11
11
  require 'parallel'
12
12
  require 'helper'
13
13
 
14
+ require 'bacterial-annotator/sequence-annotation'
15
+
14
16
  class BacterialComparator
15
17
 
16
18
  attr_reader :genomes_list, :stats
@@ -25,6 +27,9 @@ class BacterialComparator
25
27
  @genomes_list = options[:genomes_list]
26
28
  @proc = options[:proc].to_i
27
29
  @phylo_nb_genes = options[:phylo_nb_genes]
30
+ @refgenome_file = options[:refgenome]
31
+ @refgenome = ""
32
+
28
33
  if ["fasttree","raxml"].include? options[:software]
29
34
  @software = options[:software]
30
35
  else
@@ -62,14 +67,39 @@ class BacterialComparator
62
67
 
63
68
  end
64
69
 
65
-
66
70
  def read_prot_synteny
67
71
 
68
72
  puts "# Reading genome synteny files START.."
69
73
  start_time = Time.now
70
74
  synteny = {}
71
- @genomes_list.each do |g|
75
+
76
+ # If genome is genbank (.gbk or .gb) then do syntheny first
77
+
78
+ @genomes_list.each_with_index do |g,i|
79
+
72
80
  genome_synteny = []
81
+
82
+ if g =~ /.gbk/i
83
+ new_genomes_dir = @outdir+"/new-genomes/"
84
+ Dir.mkdir(new_genomes_dir) if ! Dir.exists? new_genomes_dir
85
+ genome_dir = new_genomes_dir + "/" + File.basename(g).gsub(".gbk","")
86
+ Dir.mkdir(genome_dir) if ! Dir.exists? genome_dir
87
+ genome_to_annotate = SequenceAnnotation.new(@root,
88
+ genome_dir,
89
+ g,
90
+ "refGbk")
91
+ query_prot_file = Dir["#{genome_dir}/*.pep"][0]
92
+ query_gene_file = Dir["#{genome_dir}/*.dna"][0]
93
+
94
+ File.symlink(query_prot_file, File.dirname(query_prot_file)+"/Proteins.fa")
95
+ File.symlink(query_gene_file, File.dirname(query_gene_file)+"/Genes.fa")
96
+
97
+ run_synteny_prot @root, genome_dir, @ref_prot_file, query_prot_file
98
+
99
+ g = genome_dir
100
+ @genomes_list[i] = genome_dir
101
+ end
102
+
73
103
  file = File.open("#{g}/Prot-Synteny.tsv", "r")
74
104
  l = file.gets # skip header
75
105
  while l = file.gets
@@ -79,6 +109,7 @@ class BacterialComparator
79
109
  synteny[lA[0]] << {ref_cov: lA[3].to_f, pId: lA[4].to_f, query_prot: lA[5], query_cov: lA[7].to_f}
80
110
  genome_synteny << lA[0]
81
111
  end
112
+
82
113
  @ref_prot.each do |ref_prot|
83
114
  if ! genome_synteny.include? ref_prot
84
115
  synteny[lA[0]] << {ref_cov: "-", pId: "-", query_prot: "-", query_cov: "-"}
@@ -96,16 +127,45 @@ class BacterialComparator
96
127
  end
97
128
 
98
129
  def get_ref_prot
130
+
99
131
  ref_prot = []
100
- pep_file = Dir["#{@genomes_list[0]}/*.pep"]
101
- flatfile = Bio::FlatFile.auto("#{pep_file[0]}")
102
- flatfile.each_entry do |entry|
103
- ref_prot << entry.definition.split(" ")[0]
132
+ if File.exist?("#{@genomes_list[0]}/*.pep")
133
+ pep_file = Dir["#{@genomes_list[0]}/*.pep"]
134
+ @ref_prot_file = pep_file[0]
135
+ flatfile = Bio::FlatFile.auto("#{@ref_prot_file}")
136
+ flatfile.each_entry do |entry|
137
+ ref_prot << entry.definition.split(" ")[0]
138
+ end
139
+ flatfile.close
140
+ dna_file = Dir["#{@genomes_list[0]}/*.dna"]
141
+ @ref_dna_file = dna_file[0]
142
+ else
143
+ if @refgenome_file == ""
144
+ abort "You need to provide a reference genome to add a non-annotated genome"
145
+ elsif @refgenome == ""
146
+ new_genomes_dir = @outdir+"/new-genomes/"
147
+ Dir.mkdir(new_genomes_dir) if ! Dir.exists? new_genomes_dir
148
+ refgenome_dir = new_genomes_dir + "/" + File.basename(@refgenome_file).gsub(".gbk","")
149
+ Dir.mkdir(refgenome_dir) if ! Dir.exists? refgenome_dir
150
+ @refgenome = SequenceAnnotation.new(@root,
151
+ refgenome_dir,
152
+ @refgenome_file,
153
+ "refGbk")
154
+ pep_file = Dir["#{refgenome_dir}/*.pep"]
155
+ @ref_prot_file = pep_file[0]
156
+ flatfile = Bio::FlatFile.auto("#{@ref_prot_file}")
157
+ flatfile.each_entry do |entry|
158
+ ref_prot << entry.definition.split(" ")[0]
159
+ end
160
+ flatfile.close
161
+ dna_file = Dir["#{refgenome_dir}/*.dna"]
162
+ @ref_dna_file = dna_file[0]
163
+ end
104
164
  end
105
- flatfile.close
165
+
106
166
  ref_prot
107
- end
108
167
 
168
+ end
109
169
 
110
170
  # load all id => sequences from multifasta
111
171
  def load_genome_cds file
@@ -128,7 +188,7 @@ class BacterialComparator
128
188
 
129
189
  pep_out_dir = "#{@outdir}/align-genes-pep"
130
190
 
131
- ref_proteins = load_genome_cds(Dir["#{@genomes_list[0]}/*.pep"][0])
191
+ ref_proteins = load_genome_cds(@ref_prot_file)
132
192
  synteny_list.each do |k,v|
133
193
  pep_out = File.open(pep_out_dir+"/#{k}.pep", "w")
134
194
  pep_out.write(ref_proteins[k])
@@ -147,7 +207,7 @@ class BacterialComparator
147
207
  end
148
208
 
149
209
  dna_out_dir = "#{@outdir}/align-genes-dna"
150
- ref_genes = load_genome_cds(Dir["#{@genomes_list[0]}/*.dna"][0])
210
+ ref_genes = load_genome_cds(@ref_dna_file)
151
211
  synteny_list.each do |k,v|
152
212
  dna_out = File.open(dna_out_dir+"/#{k}.dna", "w")
153
213
  dna_out.write(ref_genes[k])
@@ -304,7 +364,7 @@ class BacterialComparator
304
364
 
305
365
  # FIXME ugly hack to find out the reference genome
306
366
  Dir.chdir(ori_dir)
307
- ref_id = Dir["#{@genomes_list[0]}/*.pep"][0].split('/')[-1].gsub(".pep","")
367
+ ref_id = @ref_prot_file.split('/')[-1].gsub(".pep","")
308
368
 
309
369
  concat_alignments "#{@outdir}/align-genes-pep.all.fasta", ref_id
310
370
 
@@ -339,7 +399,7 @@ class BacterialComparator
339
399
 
340
400
  # ugly hack to find out the reference genome FIXME
341
401
  Dir.chdir(ori_dir)
342
- ref_id = Dir["#{@genomes_list[0]}/*.pep"][0].split('/')[-1].gsub(".pep","")
402
+ ref_id = @ref_prot_file.split('/')[-1].gsub(".pep","")
343
403
 
344
404
  end_time = Time.now
345
405
  c_time = Helper.sec2str(end_time-start_time)
@@ -508,4 +568,110 @@ class BacterialComparator
508
568
 
509
569
  end
510
570
 
571
+
572
+ def get_fasta_length fasta
573
+ flatfile = Bio::FlatFile.auto(fasta)
574
+ prot_lengths = {}
575
+ flatfile.each_entry do |entry|
576
+ prot_id = entry.definition.split(" ")[0]
577
+ prot_length = entry.length
578
+ prot_lengths[prot_id] = prot_length
579
+ end
580
+ flatfile.close
581
+ prot_lengths
582
+ end
583
+
584
+
585
+ def run_synteny_prot root, outdir, ref_prot_file, query_prot_file
586
+
587
+ puts query_prot_file
588
+ puts ref_prot_file
589
+
590
+ ref_synteny_prot = SequenceSynteny.new(root,
591
+ outdir,
592
+ query_prot_file,
593
+ ref_prot_file,
594
+ "Prot-Ref",
595
+ 0.80,
596
+ 0.80,
597
+ "prot")
598
+
599
+ print "# Running alignment with Reference Genome CDS (diamond).."
600
+ start_time = Time.now
601
+ ref_synteny_prot.run_diamond
602
+ end_time = Time.now
603
+ c_time = Helper.sec2str(end_time - start_time)
604
+ print "done (#{c_time})\n"
605
+
606
+ ref_synteny_prot.extract_hits :refgenome
607
+
608
+ synteny_file = File.open("#{outdir}/Prot-Synteny.tsv","w")
609
+ synteny_file.write("RefLocusTag\tRefProtID\tRefLength\tRefCoverage\tIdentity\tQueryGene\tQueryLength\tQueryCoverage\tQueryPartial\n")
610
+ ref_annotated = {}
611
+
612
+ ref_synteny_prot.query_sequences.each do |prot, syn_val|
613
+ next if ! syn_val.has_key? :homology
614
+ next if syn_val[:homology][:assert_cutoff].inject(:+) < 3
615
+ next if ref_annotated.has_key? syn_val[:homology][:hits][0] and ref_annotated[syn_val[:homology][:hits][0]][:partial] == 0
616
+ ref_annotated[syn_val[:homology][:hits][0]] = {
617
+ key: prot,
618
+ pId: syn_val[:homology][:pId],
619
+ cov_query: syn_val[:homology][:cov_query],
620
+ cov_subject: syn_val[:homology][:cov_subject],
621
+ assert_cutoff: syn_val[:homology][:assert_cutoff],
622
+ length: syn_val[:homology][:length][0],
623
+ partial: (syn_val[:partial] ? 1 : 0)
624
+ }
625
+ # ref_annotated[syn_val[:homology][:hits][0]] = {
626
+ # key: prot,
627
+ # pId: syn_val[:homology][:pId],
628
+ # cov_query: syn_val[:homology][:cov_query],
629
+ # cov_subject: syn_val[:homology][:cov_subject],
630
+ # assert_cutoff: syn_val[:homology][:assert_cutoff],
631
+ # length: syn_val[:homology][:length][0],
632
+ # partial: (syn_val[:partial] ? 1 : 0)
633
+ # }
634
+ end
635
+
636
+ # print ref_annotated
637
+ query_lengths = get_fasta_length query_prot_file
638
+
639
+ @refgenome.coding_seq.each do |ref_k, ref_v|
640
+ gene = ""
641
+ coverage_ref = ""
642
+ coverage_query = ""
643
+ query_length = ""
644
+ pId = ""
645
+ if ref_annotated[ref_v[:protId]] != nil
646
+ gene = ref_annotated[ref_v[:protId]][:key]
647
+ coverage_ref = ref_annotated[ref_v[:protId]][:cov_subject]
648
+ query_length = query_lengths[ref_annotated[ref_v[:protId]][:key]]
649
+ coverage_query = ref_annotated[ref_v[:protId]][:cov_query]
650
+ pId = ref_annotated[ref_v[:protId]][:pId]
651
+ partial = ref_annotated[ref_v[:protId]][:partial]
652
+ end
653
+
654
+ _locus_tag = ref_v[:locustag] || ""
655
+ _seq_len = "NA"
656
+ # _seq_len = ref_v[:bioseq].seq.length.to_s if ! ref_v[:bioseq].nil?
657
+ _seq_len = ref_v[:length].to_s if ! ref_v[:length].nil?
658
+
659
+ synteny_file.write(ref_v[:protId])
660
+ synteny_file.write("\t"+_locus_tag)
661
+ synteny_file.write("\t"+_seq_len)
662
+ synteny_file.write("\t"+coverage_ref.to_s)
663
+ synteny_file.write("\t"+pId.to_s)
664
+ synteny_file.write("\t"+gene)
665
+ synteny_file.write("\t"+query_length.to_s)
666
+ synteny_file.write("\t"+coverage_query.to_s)
667
+ synteny_file.write("\t"+partial.to_s)
668
+ synteny_file.write("\n")
669
+
670
+ end
671
+
672
+ synteny_file.close
673
+
674
+ end
675
+
676
+
511
677
  end # end of Class
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bacterial-annotator
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.9
4
+ version: 0.9.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Maxime Deraspe
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-10-17 00:00:00.000000000 Z
11
+ date: 2022-09-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bio
@@ -110,7 +110,7 @@ homepage: http://rubygems.org/gems/bacterial-annotator
110
110
  licenses:
111
111
  - GPL-3.0
112
112
  metadata: {}
113
- post_install_message:
113
+ post_install_message:
114
114
  rdoc_options: []
115
115
  require_paths:
116
116
  - lib
@@ -125,9 +125,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
125
125
  - !ruby/object:Gem::Version
126
126
  version: '0'
127
127
  requirements: []
128
- rubyforge_project:
129
- rubygems_version: 2.7.7
130
- signing_key:
128
+ rubygems_version: 3.1.6
129
+ signing_key:
131
130
  specification_version: 4
132
131
  summary: Bacterial Annotator
133
132
  test_files: []