bacterial-annotator 0.8.8 → 0.9.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b629352a56f96c69564434da84bf58732e519ec6e064551e7cf7a0871a7050a1
4
- data.tar.gz: 1659b196791b259db68053ef83ae74acd453f92814ffba807a1b2af87026acb3
3
+ metadata.gz: c3f04aa6e41b8214a3f3d46b77afc21f8c4471326721cf29aae3c810d798b553
4
+ data.tar.gz: 959c216c280eb6c62f4b27bc01b03f59f00efef9e393d036cc04519ab7f6c54a
5
5
  SHA512:
6
- metadata.gz: 0cecffe0d0f8ac6cf1c41783927594cdcf952d90b35e5d115efad0a0e87a2d89f6b573397973bcdfd2c390d3f6e08172b3f94c9352e68d0c1d33ecbbfdd19f0f
7
- data.tar.gz: bcbddbdf09266cbda5529b60c952afa7e3d92f3b4f14eec1c878ebb2b6a66130f2f0857d54a5f0680fbab3f47dc12bdfc8872387cbe08a085ec793f166c8f6cf
6
+ metadata.gz: ef78d609d9c3bb1a0af64d4f662206038683c67cb3a55386e5dcde0e3200cbc4ecdf6d5844bb361fb5841e247b1d2e19fc4e1d68c7db963fa8cdcf6e564c9453
7
+ data.tar.gz: b58553928bf63cbbf845a84ddb30765ebb04ec83e73494bf8caaf2784b3c29a5bf17f9c587c99feb7bf80f24eb041f331618705d61d04bf4191628dd8d48af73
@@ -2,10 +2,10 @@
2
2
  # -*- coding: utf-8 -*-
3
3
  # author: maxime déraspe
4
4
  # email: maximilien1er@gmail.com
5
- # review:
5
+ # review:
6
6
  # date: 15-02-24
7
7
  # version: 0.01
8
- # licence:
8
+ # licence:
9
9
 
10
10
 
11
11
  require 'bacterial-annotator'
@@ -160,6 +160,7 @@ compare [OPTIONS]
160
160
  --proc <nb of process> Number of process to run the comparison
161
161
 
162
162
  //Synteny
163
+ --refgenome genbank ref genome if you want to provide complete genomes not previously annotated with bacterial-annotator
163
164
  --pidentity <default 0.80> Minimal percentage identity to call a syntenic protein
164
165
  --min_cov <default 0.80> Minimal coverage for the alignment of the protein / gene
165
166
 
@@ -193,6 +194,12 @@ def parseOptions_compare
193
194
  options[:phylogeny] = 0
194
195
  options[:software] = "fasttree"
195
196
  options[:bootstrap] = 100
197
+ options[:refgenome] = ""
198
+
199
+ if ARGV.length == 0
200
+ usage_compare
201
+ abort
202
+ end
196
203
 
197
204
  while x = ARGV.shift
198
205
 
@@ -207,6 +214,8 @@ def parseOptions_compare
207
214
  options[:proc] = ARGV.shift
208
215
  when "--align"
209
216
  options[:align] = ARGV.shift
217
+ when "--refgenome", "-g"
218
+ options[:refgenome] = ARGV.shift
210
219
  when "--concat"
211
220
  options[:concat] = ARGV.shift
212
221
  when "--phylogeny"
@@ -254,6 +263,11 @@ def parseOptions_identify
254
263
  options[:genome_list] = []
255
264
  options[:output] = "tsv"
256
265
 
266
+ if ARGV.length == 0
267
+ usage_identify
268
+ abort
269
+ end
270
+
257
271
  while x = ARGV.shift
258
272
 
259
273
  case x.downcase
@@ -7,7 +7,7 @@
7
7
 
8
8
  require 'json'
9
9
  require 'zlib'
10
-
10
+ require 'pp'
11
11
 
12
12
  class SequenceAnnotation
13
13
 
@@ -272,7 +272,7 @@ class SequenceAnnotation
272
272
  @rna_seq = {}
273
273
  @gbk.features do |ft|
274
274
 
275
- next if ! ft.feature.to_s.include? "rRNA"
275
+ next if ! ft.feature.to_s.include? "RNA"
276
276
 
277
277
  ftH = ft.to_hash
278
278
  loc = ft.locations
@@ -285,7 +285,12 @@ class SequenceAnnotation
285
285
  # gene = ftH["gene"] if !ftH["gene"].nil?
286
286
  # protId = ftH["protein_id"][0] if !ftH["protein_id"].nil?
287
287
  product = ""
288
- product = ftH["product"][0] if !ftH["product"].nil?
288
+
289
+ if !ftH["product"].nil?
290
+ product = ftH["product"][0]
291
+ # puts ftH["product"].join(",") + "---" + ftH["product"][0]
292
+ end
293
+
289
294
  locustag = ftH["locus_tag"][0] if !ftH["locus_tag"].nil?
290
295
 
291
296
  # puts "#{@accession}\t#{seqBeg}\t#{seqEnd}\t#{strand}\t#{protId}\t#{locustag}\t#{gene[0]}\t#{product[0]}"
@@ -533,6 +538,7 @@ class SequenceAnnotation
533
538
 
534
539
  new_features = {}
535
540
  annotations_done = {}
541
+ gbk_features_len = @gbk.features.length
536
542
 
537
543
  @gbk.features.each_with_index do |ft, ft_index|
538
544
 
@@ -540,8 +546,9 @@ class SequenceAnnotation
540
546
 
541
547
  next if annotations_done.has_key? k
542
548
 
543
- if v[:query_location][0][0] < ft.locations[0].from
549
+ if v[:query_location][0][0] < ft.locations[0].from or ft_index == gbk_features_len-1
544
550
 
551
+ pp v
545
552
  if v[:subject_location][0][0] > v[:subject_location][0][1]
546
553
  location = "complement(#{v[:query_location][0][0]}..#{v[:query_location][0][1]})"
547
554
  else
@@ -550,7 +557,12 @@ class SequenceAnnotation
550
557
 
551
558
  feature = Bio::Feature.new(v[:feature][0],location)
552
559
  feature.qualifiers.push(Bio::Feature::Qualifier.new('product',v[:product][0])) if ! v[:product][0].nil? or v[:product][0] != ""
553
- new_features[ft_index] = feature
560
+ if ft_index == gbk_features_len-1
561
+ new_features[gbk_features_len] = feature
562
+ else
563
+ new_features[ft_index] = feature
564
+ end
565
+
554
566
  annotations_done[k] = 1
555
567
  break
556
568
 
@@ -1,10 +1,10 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  # author: maxime déraspe
3
3
  # email: maximilien1er@gmail.com
4
- # review:
4
+ # review:
5
5
  # date: 15-02-24
6
6
  # version: 0.0.1
7
- # licence:
7
+ # licence:
8
8
 
9
9
  require 'bio'
10
10
  require 'fileutils'
@@ -324,11 +324,19 @@ class BacterialAnnotator
324
324
  @options[:pcoverage],
325
325
  "prot")
326
326
 
327
- print "# Running BLAT alignment with External Database.."
327
+ # print "# Running BLAT alignment with External Database.."
328
+ # start_time = Time.now
329
+ # @externaldb_synteny.run_blat
330
+ # end_time = Time.now
331
+ # c_time = Helper.sec2str(end_time-start_time)
332
+ # print "done (#{c_time})\n"
333
+ # @externaldb_synteny.extract_hits :externaldb
334
+
335
+ print "# Running alignment with External DB"
328
336
  start_time = Time.now
329
- @externaldb_synteny.run_blat
337
+ @externaldb_synteny.run_diamond
330
338
  end_time = Time.now
331
- c_time = Helper.sec2str(end_time-start_time)
339
+ c_time = Helper.sec2str(end_time - start_time)
332
340
  print "done (#{c_time})\n"
333
341
  @externaldb_synteny.extract_hits :externaldb
334
342
 
@@ -369,6 +377,20 @@ class BacterialAnnotator
369
377
  inference: inference
370
378
  }
371
379
 
380
+ @annotation_stats[:flagged_cds].each do |flag|
381
+ if flag.include? "#{k}"
382
+ if v[:homology][:assert_cutoff].inject(:+) > 2
383
+ flag.replace("#{flag}\tAnnotated by externaldb (#{v[:homology][:hits][0]}|#{v[:homology][:pId]}|#{cov_query}|#{cov_subject}))")
384
+ elsif v[:homology][:assert_cutoff] == [1,1,0]
385
+ flag.replace("#{flag}\tPossible pseudogene (coverage subject = #{cov_subject} with #{v[:homology][:hits][0]}))")
386
+ elsif v[:homology][:assert_cutoff] == [1,0,1]
387
+ flag.replace("#{flag}\tPossible pseudogene (coverage query = #{cov_query} with #{v[:homology][:hits][0]}))")
388
+ elsif v[:homology][:assert_cutoff] == [0,1,1]
389
+ flag.replace("#{flag}\tLow similarity (percent identity = #{v[:homology][:pId]} with #{v[:homology][:hits][0]}))")
390
+ end
391
+ end
392
+ end
393
+
372
394
  end
373
395
 
374
396
  end
@@ -463,6 +485,7 @@ class BacterialAnnotator
463
485
  flag += "\t#{(@prot_synteny_refgenome.query_sequences[prot][:homology][:cov_query]*100).round(2)}"
464
486
  flag += "\t#{(@prot_synteny_refgenome.query_sequences[prot][:homology][:cov_subject]*100).round(2)}"
465
487
  @annotation_stats[:flagged_cds] << flag
488
+ remaining_cds << prot
466
489
  end
467
490
 
468
491
  else
@@ -515,7 +538,7 @@ class BacterialAnnotator
515
538
 
516
539
  file_flagged_cds = file_dir + "/Prot-flagged.tsv"
517
540
  File.open(file_flagged_cds, "w") do |fopen|
518
- fopen.write("CDS locus\tAssertion-CutOff\tAA Identity\tCovQuery(%)\tCovSubject(%)\n")
541
+ fopen.write("CDS locus\tAssertion-CutOff\tAA Identity\tCovQuery(%)\tCovSubject(%)\tNote\n")
519
542
  @annotation_stats[:flagged_cds].each do |fcds|
520
543
  fopen.write("#{fcds}\n")
521
544
  end
@@ -1,16 +1,18 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  # author: maxime déraspe
3
3
  # email: maximilien1er@gmail.com
4
- # review:
4
+ # review:
5
5
  # date: 15-02-24
6
6
  # version: 0.0.1
7
- # licence:
7
+ # licence:
8
8
 
9
9
  require 'bio'
10
10
  require 'fileutils'
11
11
  require 'parallel'
12
12
  require 'helper'
13
13
 
14
+ require 'bacterial-annotator/sequence-annotation'
15
+
14
16
  class BacterialComparator
15
17
 
16
18
  attr_reader :genomes_list, :stats
@@ -25,6 +27,9 @@ class BacterialComparator
25
27
  @genomes_list = options[:genomes_list]
26
28
  @proc = options[:proc].to_i
27
29
  @phylo_nb_genes = options[:phylo_nb_genes]
30
+ @refgenome_file = options[:refgenome]
31
+ @refgenome = ""
32
+
28
33
  if ["fasttree","raxml"].include? options[:software]
29
34
  @software = options[:software]
30
35
  else
@@ -62,14 +67,39 @@ class BacterialComparator
62
67
 
63
68
  end
64
69
 
65
-
66
70
  def read_prot_synteny
67
71
 
68
72
  puts "# Reading genome synteny files START.."
69
73
  start_time = Time.now
70
74
  synteny = {}
71
- @genomes_list.each do |g|
75
+
76
+ # If genome is genbank (.gbk or .gb) then do syntheny first
77
+
78
+ @genomes_list.each_with_index do |g,i|
79
+
72
80
  genome_synteny = []
81
+
82
+ if g =~ /.gbk/i
83
+ new_genomes_dir = @outdir+"/new-genomes/"
84
+ Dir.mkdir(new_genomes_dir) if ! Dir.exists? new_genomes_dir
85
+ genome_dir = new_genomes_dir + "/" + File.basename(g).gsub(".gbk","")
86
+ Dir.mkdir(genome_dir) if ! Dir.exists? genome_dir
87
+ genome_to_annotate = SequenceAnnotation.new(@root,
88
+ genome_dir,
89
+ g,
90
+ "refGbk")
91
+ query_prot_file = Dir["#{genome_dir}/*.pep"][0]
92
+ query_gene_file = Dir["#{genome_dir}/*.dna"][0]
93
+
94
+ File.symlink(query_prot_file, File.dirname(query_prot_file)+"/Proteins.fa")
95
+ File.symlink(query_gene_file, File.dirname(query_gene_file)+"/Genes.fa")
96
+
97
+ run_synteny_prot @root, genome_dir, @ref_prot_file, query_prot_file
98
+
99
+ g = genome_dir
100
+ @genomes_list[i] = genome_dir
101
+ end
102
+
73
103
  file = File.open("#{g}/Prot-Synteny.tsv", "r")
74
104
  l = file.gets # skip header
75
105
  while l = file.gets
@@ -79,6 +109,7 @@ class BacterialComparator
79
109
  synteny[lA[0]] << {ref_cov: lA[3].to_f, pId: lA[4].to_f, query_prot: lA[5], query_cov: lA[7].to_f}
80
110
  genome_synteny << lA[0]
81
111
  end
112
+
82
113
  @ref_prot.each do |ref_prot|
83
114
  if ! genome_synteny.include? ref_prot
84
115
  synteny[lA[0]] << {ref_cov: "-", pId: "-", query_prot: "-", query_cov: "-"}
@@ -96,16 +127,45 @@ class BacterialComparator
96
127
  end
97
128
 
98
129
  def get_ref_prot
130
+
99
131
  ref_prot = []
100
- pep_file = Dir["#{@genomes_list[0]}/*.pep"]
101
- flatfile = Bio::FlatFile.auto("#{pep_file[0]}")
102
- flatfile.each_entry do |entry|
103
- ref_prot << entry.definition.split(" ")[0]
132
+ if File.exist?("#{@genomes_list[0]}/*.pep")
133
+ pep_file = Dir["#{@genomes_list[0]}/*.pep"]
134
+ @ref_prot_file = pep_file[0]
135
+ flatfile = Bio::FlatFile.auto("#{@ref_prot_file}")
136
+ flatfile.each_entry do |entry|
137
+ ref_prot << entry.definition.split(" ")[0]
138
+ end
139
+ flatfile.close
140
+ dna_file = Dir["#{@genomes_list[0]}/*.dna"]
141
+ @ref_dna_file = dna_file[0]
142
+ else
143
+ if @refgenome_file == ""
144
+ abort "You need to provide a reference genome to add a non-annotated genome"
145
+ elsif @refgenome == ""
146
+ new_genomes_dir = @outdir+"/new-genomes/"
147
+ Dir.mkdir(new_genomes_dir) if ! Dir.exists? new_genomes_dir
148
+ refgenome_dir = new_genomes_dir + "/" + File.basename(@refgenome_file).gsub(".gbk","")
149
+ Dir.mkdir(refgenome_dir) if ! Dir.exists? refgenome_dir
150
+ @refgenome = SequenceAnnotation.new(@root,
151
+ refgenome_dir,
152
+ @refgenome_file,
153
+ "refGbk")
154
+ pep_file = Dir["#{refgenome_dir}/*.pep"]
155
+ @ref_prot_file = pep_file[0]
156
+ flatfile = Bio::FlatFile.auto("#{@ref_prot_file}")
157
+ flatfile.each_entry do |entry|
158
+ ref_prot << entry.definition.split(" ")[0]
159
+ end
160
+ flatfile.close
161
+ dna_file = Dir["#{refgenome_dir}/*.dna"]
162
+ @ref_dna_file = dna_file[0]
163
+ end
104
164
  end
105
- flatfile.close
165
+
106
166
  ref_prot
107
- end
108
167
 
168
+ end
109
169
 
110
170
  # load all id => sequences from multifasta
111
171
  def load_genome_cds file
@@ -128,7 +188,7 @@ class BacterialComparator
128
188
 
129
189
  pep_out_dir = "#{@outdir}/align-genes-pep"
130
190
 
131
- ref_proteins = load_genome_cds(Dir["#{@genomes_list[0]}/*.pep"][0])
191
+ ref_proteins = load_genome_cds(@ref_prot_file)
132
192
  synteny_list.each do |k,v|
133
193
  pep_out = File.open(pep_out_dir+"/#{k}.pep", "w")
134
194
  pep_out.write(ref_proteins[k])
@@ -147,7 +207,7 @@ class BacterialComparator
147
207
  end
148
208
 
149
209
  dna_out_dir = "#{@outdir}/align-genes-dna"
150
- ref_genes = load_genome_cds(Dir["#{@genomes_list[0]}/*.dna"][0])
210
+ ref_genes = load_genome_cds(@ref_dna_file)
151
211
  synteny_list.each do |k,v|
152
212
  dna_out = File.open(dna_out_dir+"/#{k}.dna", "w")
153
213
  dna_out.write(ref_genes[k])
@@ -304,7 +364,7 @@ class BacterialComparator
304
364
 
305
365
  # FIXME ugly hack to find out the reference genome
306
366
  Dir.chdir(ori_dir)
307
- ref_id = Dir["#{@genomes_list[0]}/*.pep"][0].split('/')[-1].gsub(".pep","")
367
+ ref_id = @ref_prot_file.split('/')[-1].gsub(".pep","")
308
368
 
309
369
  concat_alignments "#{@outdir}/align-genes-pep.all.fasta", ref_id
310
370
 
@@ -339,7 +399,7 @@ class BacterialComparator
339
399
 
340
400
  # ugly hack to find out the reference genome FIXME
341
401
  Dir.chdir(ori_dir)
342
- ref_id = Dir["#{@genomes_list[0]}/*.pep"][0].split('/')[-1].gsub(".pep","")
402
+ ref_id = @ref_prot_file.split('/')[-1].gsub(".pep","")
343
403
 
344
404
  end_time = Time.now
345
405
  c_time = Helper.sec2str(end_time-start_time)
@@ -508,4 +568,110 @@ class BacterialComparator
508
568
 
509
569
  end
510
570
 
571
+
572
+ def get_fasta_length fasta
573
+ flatfile = Bio::FlatFile.auto(fasta)
574
+ prot_lengths = {}
575
+ flatfile.each_entry do |entry|
576
+ prot_id = entry.definition.split(" ")[0]
577
+ prot_length = entry.length
578
+ prot_lengths[prot_id] = prot_length
579
+ end
580
+ flatfile.close
581
+ prot_lengths
582
+ end
583
+
584
+
585
+ def run_synteny_prot root, outdir, ref_prot_file, query_prot_file
586
+
587
+ puts query_prot_file
588
+ puts ref_prot_file
589
+
590
+ ref_synteny_prot = SequenceSynteny.new(root,
591
+ outdir,
592
+ query_prot_file,
593
+ ref_prot_file,
594
+ "Prot-Ref",
595
+ 0.80,
596
+ 0.80,
597
+ "prot")
598
+
599
+ print "# Running alignment with Reference Genome CDS (diamond).."
600
+ start_time = Time.now
601
+ ref_synteny_prot.run_diamond
602
+ end_time = Time.now
603
+ c_time = Helper.sec2str(end_time - start_time)
604
+ print "done (#{c_time})\n"
605
+
606
+ ref_synteny_prot.extract_hits :refgenome
607
+
608
+ synteny_file = File.open("#{outdir}/Prot-Synteny.tsv","w")
609
+ synteny_file.write("RefLocusTag\tRefProtID\tRefLength\tRefCoverage\tIdentity\tQueryGene\tQueryLength\tQueryCoverage\tQueryPartial\n")
610
+ ref_annotated = {}
611
+
612
+ ref_synteny_prot.query_sequences.each do |prot, syn_val|
613
+ next if ! syn_val.has_key? :homology
614
+ next if syn_val[:homology][:assert_cutoff].inject(:+) < 3
615
+ next if ref_annotated.has_key? syn_val[:homology][:hits][0] and ref_annotated[syn_val[:homology][:hits][0]][:partial] == 0
616
+ ref_annotated[syn_val[:homology][:hits][0]] = {
617
+ key: prot,
618
+ pId: syn_val[:homology][:pId],
619
+ cov_query: syn_val[:homology][:cov_query],
620
+ cov_subject: syn_val[:homology][:cov_subject],
621
+ assert_cutoff: syn_val[:homology][:assert_cutoff],
622
+ length: syn_val[:homology][:length][0],
623
+ partial: (syn_val[:partial] ? 1 : 0)
624
+ }
625
+ # ref_annotated[syn_val[:homology][:hits][0]] = {
626
+ # key: prot,
627
+ # pId: syn_val[:homology][:pId],
628
+ # cov_query: syn_val[:homology][:cov_query],
629
+ # cov_subject: syn_val[:homology][:cov_subject],
630
+ # assert_cutoff: syn_val[:homology][:assert_cutoff],
631
+ # length: syn_val[:homology][:length][0],
632
+ # partial: (syn_val[:partial] ? 1 : 0)
633
+ # }
634
+ end
635
+
636
+ # print ref_annotated
637
+ query_lengths = get_fasta_length query_prot_file
638
+
639
+ @refgenome.coding_seq.each do |ref_k, ref_v|
640
+ gene = ""
641
+ coverage_ref = ""
642
+ coverage_query = ""
643
+ query_length = ""
644
+ pId = ""
645
+ if ref_annotated[ref_v[:protId]] != nil
646
+ gene = ref_annotated[ref_v[:protId]][:key]
647
+ coverage_ref = ref_annotated[ref_v[:protId]][:cov_subject]
648
+ query_length = query_lengths[ref_annotated[ref_v[:protId]][:key]]
649
+ coverage_query = ref_annotated[ref_v[:protId]][:cov_query]
650
+ pId = ref_annotated[ref_v[:protId]][:pId]
651
+ partial = ref_annotated[ref_v[:protId]][:partial]
652
+ end
653
+
654
+ _locus_tag = ref_v[:locustag] || ""
655
+ _seq_len = "NA"
656
+ # _seq_len = ref_v[:bioseq].seq.length.to_s if ! ref_v[:bioseq].nil?
657
+ _seq_len = ref_v[:length].to_s if ! ref_v[:length].nil?
658
+
659
+ synteny_file.write(ref_v[:protId])
660
+ synteny_file.write("\t"+_locus_tag)
661
+ synteny_file.write("\t"+_seq_len)
662
+ synteny_file.write("\t"+coverage_ref.to_s)
663
+ synteny_file.write("\t"+pId.to_s)
664
+ synteny_file.write("\t"+gene)
665
+ synteny_file.write("\t"+query_length.to_s)
666
+ synteny_file.write("\t"+coverage_query.to_s)
667
+ synteny_file.write("\t"+partial.to_s)
668
+ synteny_file.write("\n")
669
+
670
+ end
671
+
672
+ synteny_file.close
673
+
674
+ end
675
+
676
+
511
677
  end # end of Class
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bacterial-annotator
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.8
4
+ version: 0.9.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Maxime Deraspe
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-08-02 00:00:00.000000000 Z
11
+ date: 2022-09-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bio
@@ -110,7 +110,7 @@ homepage: http://rubygems.org/gems/bacterial-annotator
110
110
  licenses:
111
111
  - GPL-3.0
112
112
  metadata: {}
113
- post_install_message:
113
+ post_install_message:
114
114
  rdoc_options: []
115
115
  require_paths:
116
116
  - lib
@@ -125,9 +125,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
125
125
  - !ruby/object:Gem::Version
126
126
  version: '0'
127
127
  requirements: []
128
- rubyforge_project:
129
- rubygems_version: 2.7.7
130
- signing_key:
128
+ rubygems_version: 3.1.6
129
+ signing_key:
131
130
  specification_version: 4
132
131
  summary: Bacterial Annotator
133
132
  test_files: []