bacterial-annotator 0.8.8 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b629352a56f96c69564434da84bf58732e519ec6e064551e7cf7a0871a7050a1
4
- data.tar.gz: 1659b196791b259db68053ef83ae74acd453f92814ffba807a1b2af87026acb3
3
+ metadata.gz: c3f04aa6e41b8214a3f3d46b77afc21f8c4471326721cf29aae3c810d798b553
4
+ data.tar.gz: 959c216c280eb6c62f4b27bc01b03f59f00efef9e393d036cc04519ab7f6c54a
5
5
  SHA512:
6
- metadata.gz: 0cecffe0d0f8ac6cf1c41783927594cdcf952d90b35e5d115efad0a0e87a2d89f6b573397973bcdfd2c390d3f6e08172b3f94c9352e68d0c1d33ecbbfdd19f0f
7
- data.tar.gz: bcbddbdf09266cbda5529b60c952afa7e3d92f3b4f14eec1c878ebb2b6a66130f2f0857d54a5f0680fbab3f47dc12bdfc8872387cbe08a085ec793f166c8f6cf
6
+ metadata.gz: ef78d609d9c3bb1a0af64d4f662206038683c67cb3a55386e5dcde0e3200cbc4ecdf6d5844bb361fb5841e247b1d2e19fc4e1d68c7db963fa8cdcf6e564c9453
7
+ data.tar.gz: b58553928bf63cbbf845a84ddb30765ebb04ec83e73494bf8caaf2784b3c29a5bf17f9c587c99feb7bf80f24eb041f331618705d61d04bf4191628dd8d48af73
@@ -2,10 +2,10 @@
2
2
  # -*- coding: utf-8 -*-
3
3
  # author: maxime déraspe
4
4
  # email: maximilien1er@gmail.com
5
- # review:
5
+ # review:
6
6
  # date: 15-02-24
7
7
  # version: 0.01
8
- # licence:
8
+ # licence:
9
9
 
10
10
 
11
11
  require 'bacterial-annotator'
@@ -160,6 +160,7 @@ compare [OPTIONS]
160
160
  --proc <nb of process> Number of process to run the comparison
161
161
 
162
162
  //Synteny
163
+ --refgenome genbank ref genome if you want to provide complete genomes not previously annotated with bacterial-annotator
163
164
  --pidentity <default 0.80> Minimal percentage identity to call a syntenic protein
164
165
  --min_cov <default 0.80> Minimal coverage for the alignment of the protein / gene
165
166
 
@@ -193,6 +194,12 @@ def parseOptions_compare
193
194
  options[:phylogeny] = 0
194
195
  options[:software] = "fasttree"
195
196
  options[:bootstrap] = 100
197
+ options[:refgenome] = ""
198
+
199
+ if ARGV.length == 0
200
+ usage_compare
201
+ abort
202
+ end
196
203
 
197
204
  while x = ARGV.shift
198
205
 
@@ -207,6 +214,8 @@ def parseOptions_compare
207
214
  options[:proc] = ARGV.shift
208
215
  when "--align"
209
216
  options[:align] = ARGV.shift
217
+ when "--refgenome", "-g"
218
+ options[:refgenome] = ARGV.shift
210
219
  when "--concat"
211
220
  options[:concat] = ARGV.shift
212
221
  when "--phylogeny"
@@ -254,6 +263,11 @@ def parseOptions_identify
254
263
  options[:genome_list] = []
255
264
  options[:output] = "tsv"
256
265
 
266
+ if ARGV.length == 0
267
+ usage_identify
268
+ abort
269
+ end
270
+
257
271
  while x = ARGV.shift
258
272
 
259
273
  case x.downcase
@@ -7,7 +7,7 @@
7
7
 
8
8
  require 'json'
9
9
  require 'zlib'
10
-
10
+ require 'pp'
11
11
 
12
12
  class SequenceAnnotation
13
13
 
@@ -272,7 +272,7 @@ class SequenceAnnotation
272
272
  @rna_seq = {}
273
273
  @gbk.features do |ft|
274
274
 
275
- next if ! ft.feature.to_s.include? "rRNA"
275
+ next if ! ft.feature.to_s.include? "RNA"
276
276
 
277
277
  ftH = ft.to_hash
278
278
  loc = ft.locations
@@ -285,7 +285,12 @@ class SequenceAnnotation
285
285
  # gene = ftH["gene"] if !ftH["gene"].nil?
286
286
  # protId = ftH["protein_id"][0] if !ftH["protein_id"].nil?
287
287
  product = ""
288
- product = ftH["product"][0] if !ftH["product"].nil?
288
+
289
+ if !ftH["product"].nil?
290
+ product = ftH["product"][0]
291
+ # puts ftH["product"].join(",") + "---" + ftH["product"][0]
292
+ end
293
+
289
294
  locustag = ftH["locus_tag"][0] if !ftH["locus_tag"].nil?
290
295
 
291
296
  # puts "#{@accession}\t#{seqBeg}\t#{seqEnd}\t#{strand}\t#{protId}\t#{locustag}\t#{gene[0]}\t#{product[0]}"
@@ -533,6 +538,7 @@ class SequenceAnnotation
533
538
 
534
539
  new_features = {}
535
540
  annotations_done = {}
541
+ gbk_features_len = @gbk.features.length
536
542
 
537
543
  @gbk.features.each_with_index do |ft, ft_index|
538
544
 
@@ -540,8 +546,9 @@ class SequenceAnnotation
540
546
 
541
547
  next if annotations_done.has_key? k
542
548
 
543
- if v[:query_location][0][0] < ft.locations[0].from
549
+ if v[:query_location][0][0] < ft.locations[0].from or ft_index == gbk_features_len-1
544
550
 
551
+ pp v
545
552
  if v[:subject_location][0][0] > v[:subject_location][0][1]
546
553
  location = "complement(#{v[:query_location][0][0]}..#{v[:query_location][0][1]})"
547
554
  else
@@ -550,7 +557,12 @@ class SequenceAnnotation
550
557
 
551
558
  feature = Bio::Feature.new(v[:feature][0],location)
552
559
  feature.qualifiers.push(Bio::Feature::Qualifier.new('product',v[:product][0])) if ! v[:product][0].nil? or v[:product][0] != ""
553
- new_features[ft_index] = feature
560
+ if ft_index == gbk_features_len-1
561
+ new_features[gbk_features_len] = feature
562
+ else
563
+ new_features[ft_index] = feature
564
+ end
565
+
554
566
  annotations_done[k] = 1
555
567
  break
556
568
 
@@ -1,10 +1,10 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  # author: maxime déraspe
3
3
  # email: maximilien1er@gmail.com
4
- # review:
4
+ # review:
5
5
  # date: 15-02-24
6
6
  # version: 0.0.1
7
- # licence:
7
+ # licence:
8
8
 
9
9
  require 'bio'
10
10
  require 'fileutils'
@@ -324,11 +324,19 @@ class BacterialAnnotator
324
324
  @options[:pcoverage],
325
325
  "prot")
326
326
 
327
- print "# Running BLAT alignment with External Database.."
327
+ # print "# Running BLAT alignment with External Database.."
328
+ # start_time = Time.now
329
+ # @externaldb_synteny.run_blat
330
+ # end_time = Time.now
331
+ # c_time = Helper.sec2str(end_time-start_time)
332
+ # print "done (#{c_time})\n"
333
+ # @externaldb_synteny.extract_hits :externaldb
334
+
335
+ print "# Running alignment with External DB"
328
336
  start_time = Time.now
329
- @externaldb_synteny.run_blat
337
+ @externaldb_synteny.run_diamond
330
338
  end_time = Time.now
331
- c_time = Helper.sec2str(end_time-start_time)
339
+ c_time = Helper.sec2str(end_time - start_time)
332
340
  print "done (#{c_time})\n"
333
341
  @externaldb_synteny.extract_hits :externaldb
334
342
 
@@ -369,6 +377,20 @@ class BacterialAnnotator
369
377
  inference: inference
370
378
  }
371
379
 
380
+ @annotation_stats[:flagged_cds].each do |flag|
381
+ if flag.include? "#{k}"
382
+ if v[:homology][:assert_cutoff].inject(:+) > 2
383
+ flag.replace("#{flag}\tAnnotated by externaldb (#{v[:homology][:hits][0]}|#{v[:homology][:pId]}|#{cov_query}|#{cov_subject}))")
384
+ elsif v[:homology][:assert_cutoff] == [1,1,0]
385
+ flag.replace("#{flag}\tPossible pseudogene (coverage subject = #{cov_subject} with #{v[:homology][:hits][0]}))")
386
+ elsif v[:homology][:assert_cutoff] == [1,0,1]
387
+ flag.replace("#{flag}\tPossible pseudogene (coverage query = #{cov_query} with #{v[:homology][:hits][0]}))")
388
+ elsif v[:homology][:assert_cutoff] == [0,1,1]
389
+ flag.replace("#{flag}\tLow similarity (percent identity = #{v[:homology][:pId]} with #{v[:homology][:hits][0]}))")
390
+ end
391
+ end
392
+ end
393
+
372
394
  end
373
395
 
374
396
  end
@@ -463,6 +485,7 @@ class BacterialAnnotator
463
485
  flag += "\t#{(@prot_synteny_refgenome.query_sequences[prot][:homology][:cov_query]*100).round(2)}"
464
486
  flag += "\t#{(@prot_synteny_refgenome.query_sequences[prot][:homology][:cov_subject]*100).round(2)}"
465
487
  @annotation_stats[:flagged_cds] << flag
488
+ remaining_cds << prot
466
489
  end
467
490
 
468
491
  else
@@ -515,7 +538,7 @@ class BacterialAnnotator
515
538
 
516
539
  file_flagged_cds = file_dir + "/Prot-flagged.tsv"
517
540
  File.open(file_flagged_cds, "w") do |fopen|
518
- fopen.write("CDS locus\tAssertion-CutOff\tAA Identity\tCovQuery(%)\tCovSubject(%)\n")
541
+ fopen.write("CDS locus\tAssertion-CutOff\tAA Identity\tCovQuery(%)\tCovSubject(%)\tNote\n")
519
542
  @annotation_stats[:flagged_cds].each do |fcds|
520
543
  fopen.write("#{fcds}\n")
521
544
  end
@@ -1,16 +1,18 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  # author: maxime déraspe
3
3
  # email: maximilien1er@gmail.com
4
- # review:
4
+ # review:
5
5
  # date: 15-02-24
6
6
  # version: 0.0.1
7
- # licence:
7
+ # licence:
8
8
 
9
9
  require 'bio'
10
10
  require 'fileutils'
11
11
  require 'parallel'
12
12
  require 'helper'
13
13
 
14
+ require 'bacterial-annotator/sequence-annotation'
15
+
14
16
  class BacterialComparator
15
17
 
16
18
  attr_reader :genomes_list, :stats
@@ -25,6 +27,9 @@ class BacterialComparator
25
27
  @genomes_list = options[:genomes_list]
26
28
  @proc = options[:proc].to_i
27
29
  @phylo_nb_genes = options[:phylo_nb_genes]
30
+ @refgenome_file = options[:refgenome]
31
+ @refgenome = ""
32
+
28
33
  if ["fasttree","raxml"].include? options[:software]
29
34
  @software = options[:software]
30
35
  else
@@ -62,14 +67,39 @@ class BacterialComparator
62
67
 
63
68
  end
64
69
 
65
-
66
70
  def read_prot_synteny
67
71
 
68
72
  puts "# Reading genome synteny files START.."
69
73
  start_time = Time.now
70
74
  synteny = {}
71
- @genomes_list.each do |g|
75
+
76
+ # If the genome is a GenBank file (.gbk), annotate it and run the synteny step first
77
+
78
+ @genomes_list.each_with_index do |g,i|
79
+
72
80
  genome_synteny = []
81
+
82
+ if g =~ /.gbk/i
83
+ new_genomes_dir = @outdir+"/new-genomes/"
84
+ Dir.mkdir(new_genomes_dir) if ! Dir.exists? new_genomes_dir
85
+ genome_dir = new_genomes_dir + "/" + File.basename(g).gsub(".gbk","")
86
+ Dir.mkdir(genome_dir) if ! Dir.exists? genome_dir
87
+ genome_to_annotate = SequenceAnnotation.new(@root,
88
+ genome_dir,
89
+ g,
90
+ "refGbk")
91
+ query_prot_file = Dir["#{genome_dir}/*.pep"][0]
92
+ query_gene_file = Dir["#{genome_dir}/*.dna"][0]
93
+
94
+ File.symlink(query_prot_file, File.dirname(query_prot_file)+"/Proteins.fa")
95
+ File.symlink(query_gene_file, File.dirname(query_gene_file)+"/Genes.fa")
96
+
97
+ run_synteny_prot @root, genome_dir, @ref_prot_file, query_prot_file
98
+
99
+ g = genome_dir
100
+ @genomes_list[i] = genome_dir
101
+ end
102
+
73
103
  file = File.open("#{g}/Prot-Synteny.tsv", "r")
74
104
  l = file.gets # skip header
75
105
  while l = file.gets
@@ -79,6 +109,7 @@ class BacterialComparator
79
109
  synteny[lA[0]] << {ref_cov: lA[3].to_f, pId: lA[4].to_f, query_prot: lA[5], query_cov: lA[7].to_f}
80
110
  genome_synteny << lA[0]
81
111
  end
112
+
82
113
  @ref_prot.each do |ref_prot|
83
114
  if ! genome_synteny.include? ref_prot
84
115
  synteny[lA[0]] << {ref_cov: "-", pId: "-", query_prot: "-", query_cov: "-"}
@@ -96,16 +127,45 @@ class BacterialComparator
96
127
  end
97
128
 
98
129
  def get_ref_prot
130
+
99
131
  ref_prot = []
100
- pep_file = Dir["#{@genomes_list[0]}/*.pep"]
101
- flatfile = Bio::FlatFile.auto("#{pep_file[0]}")
102
- flatfile.each_entry do |entry|
103
- ref_prot << entry.definition.split(" ")[0]
132
+ if File.exist?("#{@genomes_list[0]}/*.pep")
133
+ pep_file = Dir["#{@genomes_list[0]}/*.pep"]
134
+ @ref_prot_file = pep_file[0]
135
+ flatfile = Bio::FlatFile.auto("#{@ref_prot_file}")
136
+ flatfile.each_entry do |entry|
137
+ ref_prot << entry.definition.split(" ")[0]
138
+ end
139
+ flatfile.close
140
+ dna_file = Dir["#{@genomes_list[0]}/*.dna"]
141
+ @ref_dna_file = dna_file[0]
142
+ else
143
+ if @refgenome_file == ""
144
+ abort "You need to provide a reference genome to add a non-annotated genome"
145
+ elsif @refgenome == ""
146
+ new_genomes_dir = @outdir+"/new-genomes/"
147
+ Dir.mkdir(new_genomes_dir) if ! Dir.exists? new_genomes_dir
148
+ refgenome_dir = new_genomes_dir + "/" + File.basename(@refgenome_file).gsub(".gbk","")
149
+ Dir.mkdir(refgenome_dir) if ! Dir.exists? refgenome_dir
150
+ @refgenome = SequenceAnnotation.new(@root,
151
+ refgenome_dir,
152
+ @refgenome_file,
153
+ "refGbk")
154
+ pep_file = Dir["#{refgenome_dir}/*.pep"]
155
+ @ref_prot_file = pep_file[0]
156
+ flatfile = Bio::FlatFile.auto("#{@ref_prot_file}")
157
+ flatfile.each_entry do |entry|
158
+ ref_prot << entry.definition.split(" ")[0]
159
+ end
160
+ flatfile.close
161
+ dna_file = Dir["#{refgenome_dir}/*.dna"]
162
+ @ref_dna_file = dna_file[0]
163
+ end
104
164
  end
105
- flatfile.close
165
+
106
166
  ref_prot
107
- end
108
167
 
168
+ end
109
169
 
110
170
  # load all id => sequences from multifasta
111
171
  def load_genome_cds file
@@ -128,7 +188,7 @@ class BacterialComparator
128
188
 
129
189
  pep_out_dir = "#{@outdir}/align-genes-pep"
130
190
 
131
- ref_proteins = load_genome_cds(Dir["#{@genomes_list[0]}/*.pep"][0])
191
+ ref_proteins = load_genome_cds(@ref_prot_file)
132
192
  synteny_list.each do |k,v|
133
193
  pep_out = File.open(pep_out_dir+"/#{k}.pep", "w")
134
194
  pep_out.write(ref_proteins[k])
@@ -147,7 +207,7 @@ class BacterialComparator
147
207
  end
148
208
 
149
209
  dna_out_dir = "#{@outdir}/align-genes-dna"
150
- ref_genes = load_genome_cds(Dir["#{@genomes_list[0]}/*.dna"][0])
210
+ ref_genes = load_genome_cds(@ref_dna_file)
151
211
  synteny_list.each do |k,v|
152
212
  dna_out = File.open(dna_out_dir+"/#{k}.dna", "w")
153
213
  dna_out.write(ref_genes[k])
@@ -304,7 +364,7 @@ class BacterialComparator
304
364
 
305
365
  # FIXME ugly hack to find out the reference genome
306
366
  Dir.chdir(ori_dir)
307
- ref_id = Dir["#{@genomes_list[0]}/*.pep"][0].split('/')[-1].gsub(".pep","")
367
+ ref_id = @ref_prot_file.split('/')[-1].gsub(".pep","")
308
368
 
309
369
  concat_alignments "#{@outdir}/align-genes-pep.all.fasta", ref_id
310
370
 
@@ -339,7 +399,7 @@ class BacterialComparator
339
399
 
340
400
  # ugly hack to find out the reference genome FIXME
341
401
  Dir.chdir(ori_dir)
342
- ref_id = Dir["#{@genomes_list[0]}/*.pep"][0].split('/')[-1].gsub(".pep","")
402
+ ref_id = @ref_prot_file.split('/')[-1].gsub(".pep","")
343
403
 
344
404
  end_time = Time.now
345
405
  c_time = Helper.sec2str(end_time-start_time)
@@ -508,4 +568,110 @@ class BacterialComparator
508
568
 
509
569
  end
510
570
 
571
+
572
+ def get_fasta_length fasta
573
+ flatfile = Bio::FlatFile.auto(fasta)
574
+ prot_lengths = {}
575
+ flatfile.each_entry do |entry|
576
+ prot_id = entry.definition.split(" ")[0]
577
+ prot_length = entry.length
578
+ prot_lengths[prot_id] = prot_length
579
+ end
580
+ flatfile.close
581
+ prot_lengths
582
+ end
583
+
584
+
585
+ def run_synteny_prot root, outdir, ref_prot_file, query_prot_file
586
+
587
+ puts query_prot_file
588
+ puts ref_prot_file
589
+
590
+ ref_synteny_prot = SequenceSynteny.new(root,
591
+ outdir,
592
+ query_prot_file,
593
+ ref_prot_file,
594
+ "Prot-Ref",
595
+ 0.80,
596
+ 0.80,
597
+ "prot")
598
+
599
+ print "# Running alignment with Reference Genome CDS (diamond).."
600
+ start_time = Time.now
601
+ ref_synteny_prot.run_diamond
602
+ end_time = Time.now
603
+ c_time = Helper.sec2str(end_time - start_time)
604
+ print "done (#{c_time})\n"
605
+
606
+ ref_synteny_prot.extract_hits :refgenome
607
+
608
+ synteny_file = File.open("#{outdir}/Prot-Synteny.tsv","w")
609
+ synteny_file.write("RefLocusTag\tRefProtID\tRefLength\tRefCoverage\tIdentity\tQueryGene\tQueryLength\tQueryCoverage\tQueryPartial\n")
610
+ ref_annotated = {}
611
+
612
+ ref_synteny_prot.query_sequences.each do |prot, syn_val|
613
+ next if ! syn_val.has_key? :homology
614
+ next if syn_val[:homology][:assert_cutoff].inject(:+) < 3
615
+ next if ref_annotated.has_key? syn_val[:homology][:hits][0] and ref_annotated[syn_val[:homology][:hits][0]][:partial] == 0
616
+ ref_annotated[syn_val[:homology][:hits][0]] = {
617
+ key: prot,
618
+ pId: syn_val[:homology][:pId],
619
+ cov_query: syn_val[:homology][:cov_query],
620
+ cov_subject: syn_val[:homology][:cov_subject],
621
+ assert_cutoff: syn_val[:homology][:assert_cutoff],
622
+ length: syn_val[:homology][:length][0],
623
+ partial: (syn_val[:partial] ? 1 : 0)
624
+ }
625
+ # ref_annotated[syn_val[:homology][:hits][0]] = {
626
+ # key: prot,
627
+ # pId: syn_val[:homology][:pId],
628
+ # cov_query: syn_val[:homology][:cov_query],
629
+ # cov_subject: syn_val[:homology][:cov_subject],
630
+ # assert_cutoff: syn_val[:homology][:assert_cutoff],
631
+ # length: syn_val[:homology][:length][0],
632
+ # partial: (syn_val[:partial] ? 1 : 0)
633
+ # }
634
+ end
635
+
636
+ # print ref_annotated
637
+ query_lengths = get_fasta_length query_prot_file
638
+
639
+ @refgenome.coding_seq.each do |ref_k, ref_v|
640
+ gene = ""
641
+ coverage_ref = ""
642
+ coverage_query = ""
643
+ query_length = ""
644
+ pId = ""
645
+ if ref_annotated[ref_v[:protId]] != nil
646
+ gene = ref_annotated[ref_v[:protId]][:key]
647
+ coverage_ref = ref_annotated[ref_v[:protId]][:cov_subject]
648
+ query_length = query_lengths[ref_annotated[ref_v[:protId]][:key]]
649
+ coverage_query = ref_annotated[ref_v[:protId]][:cov_query]
650
+ pId = ref_annotated[ref_v[:protId]][:pId]
651
+ partial = ref_annotated[ref_v[:protId]][:partial]
652
+ end
653
+
654
+ _locus_tag = ref_v[:locustag] || ""
655
+ _seq_len = "NA"
656
+ # _seq_len = ref_v[:bioseq].seq.length.to_s if ! ref_v[:bioseq].nil?
657
+ _seq_len = ref_v[:length].to_s if ! ref_v[:length].nil?
658
+
659
+ synteny_file.write(ref_v[:protId])
660
+ synteny_file.write("\t"+_locus_tag)
661
+ synteny_file.write("\t"+_seq_len)
662
+ synteny_file.write("\t"+coverage_ref.to_s)
663
+ synteny_file.write("\t"+pId.to_s)
664
+ synteny_file.write("\t"+gene)
665
+ synteny_file.write("\t"+query_length.to_s)
666
+ synteny_file.write("\t"+coverage_query.to_s)
667
+ synteny_file.write("\t"+partial.to_s)
668
+ synteny_file.write("\n")
669
+
670
+ end
671
+
672
+ synteny_file.close
673
+
674
+ end
675
+
676
+
511
677
  end # end of Class
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bacterial-annotator
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.8
4
+ version: 0.9.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Maxime Deraspe
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-08-02 00:00:00.000000000 Z
11
+ date: 2022-09-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bio
@@ -110,7 +110,7 @@ homepage: http://rubygems.org/gems/bacterial-annotator
110
110
  licenses:
111
111
  - GPL-3.0
112
112
  metadata: {}
113
- post_install_message:
113
+ post_install_message:
114
114
  rdoc_options: []
115
115
  require_paths:
116
116
  - lib
@@ -125,9 +125,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
125
125
  - !ruby/object:Gem::Version
126
126
  version: '0'
127
127
  requirements: []
128
- rubyforge_project:
129
- rubygems_version: 2.7.7
130
- signing_key:
128
+ rubygems_version: 3.1.6
129
+ signing_key:
131
130
  specification_version: 4
132
131
  summary: Bacterial Annotator
133
132
  test_files: []