bacterial-annotator 0.8.9 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1dc26e2ae283ea0c8163e7c487c95bf0f0b7cffa4095d841b38e709262698a52
4
- data.tar.gz: be699876985f29d8269119852b6f027f562c541263abb6afe29b84783375ad3b
3
+ metadata.gz: 0a9d751e0a2dd02e44e8708a57d01675ed0a8aaef29aa8ce4783d4fbbe994358
4
+ data.tar.gz: 122e95e15e7ae3c741c8a083bfbdada0b38ebec3213d35cbeb6ff0dbc131a780
5
5
  SHA512:
6
- metadata.gz: a7615dee5f7d89e8224477dc94798621d72ab8923d1b7027c9b912f46f9fca85beb71bf1b3cd7a058fa27503b07a244ecd97a71330c0816d4b3c41b4f8795e55
7
- data.tar.gz: 813dc52e3732accd812e54a9ad998fee65e01cdec8bf79c136c59262512683ed32656e88e860476f405e08441723eef6681648d972f17237ec80d88a64a8a46d
6
+ metadata.gz: 3f945bce150d6edac552d6a83c1618fed60f72d05727b440de8a2bd981e958cab9a76e1f9eeb451c82affeb464343b793c303115623eb03db8decd90fdfa0cdd
7
+ data.tar.gz: 3b9380050860bf619990a5d32c3111beb3afdc602be8161ee660830cae45cce9bb5137f8f466d97eb12c16d925af23f3739151ed2287d00f4c413562783e3251
@@ -2,10 +2,10 @@
2
2
  # -*- coding: utf-8 -*-
3
3
  # author: maxime déraspe
4
4
  # email: maximilien1er@gmail.com
5
- # review:
5
+ # review:
6
6
  # date: 15-02-24
7
7
  # version: 0.01
8
- # licence:
8
+ # licence:
9
9
 
10
10
 
11
11
  require 'bacterial-annotator'
@@ -160,6 +160,7 @@ compare [OPTIONS]
160
160
  --proc <nb of process> Number of process to run the comparison
161
161
 
162
162
  //Synteny
163
+ --refgenome genbank ref genome if you want to provide complete genomes not previously annotated with bacterial-annotator
163
164
  --pidentity <default 0.80> Minimal percentage identity to call a syntenic protein
164
165
  --min_cov <default 0.80> Minimal coverage for the alignment of the protein / gene
165
166
 
@@ -193,6 +194,7 @@ def parseOptions_compare
193
194
  options[:phylogeny] = 0
194
195
  options[:software] = "fasttree"
195
196
  options[:bootstrap] = 100
197
+ options[:refgenome] = ""
196
198
 
197
199
  if ARGV.length == 0
198
200
  usage_compare
@@ -212,6 +214,8 @@ def parseOptions_compare
212
214
  options[:proc] = ARGV.shift
213
215
  when "--align"
214
216
  options[:align] = ARGV.shift
217
+ when "--refgenome", "-g"
218
+ options[:refgenome] = ARGV.shift
215
219
  when "--concat"
216
220
  options[:concat] = ARGV.shift
217
221
  when "--phylogeny"
@@ -7,7 +7,7 @@
7
7
 
8
8
  require 'json'
9
9
  require 'zlib'
10
-
10
+ require 'pp'
11
11
 
12
12
  class SequenceAnnotation
13
13
 
@@ -272,7 +272,7 @@ class SequenceAnnotation
272
272
  @rna_seq = {}
273
273
  @gbk.features do |ft|
274
274
 
275
- next if ! ft.feature.to_s.include? "rRNA"
275
+ next if ! ft.feature.to_s.include? "RNA"
276
276
 
277
277
  ftH = ft.to_hash
278
278
  loc = ft.locations
@@ -285,7 +285,12 @@ class SequenceAnnotation
285
285
  # gene = ftH["gene"] if !ftH["gene"].nil?
286
286
  # protId = ftH["protein_id"][0] if !ftH["protein_id"].nil?
287
287
  product = ""
288
- product = ftH["product"][0] if !ftH["product"].nil?
288
+
289
+ if !ftH["product"].nil?
290
+ product = ftH["product"][0]
291
+ # puts ftH["product"].join(",") + "---" + ftH["product"][0]
292
+ end
293
+
289
294
  locustag = ftH["locus_tag"][0] if !ftH["locus_tag"].nil?
290
295
 
291
296
  # puts "#{@accession}\t#{seqBeg}\t#{seqEnd}\t#{strand}\t#{protId}\t#{locustag}\t#{gene[0]}\t#{product[0]}"
@@ -533,6 +538,7 @@ class SequenceAnnotation
533
538
 
534
539
  new_features = {}
535
540
  annotations_done = {}
541
+ gbk_features_len = @gbk.features.length
536
542
 
537
543
  @gbk.features.each_with_index do |ft, ft_index|
538
544
 
@@ -540,8 +546,9 @@ class SequenceAnnotation
540
546
 
541
547
  next if annotations_done.has_key? k
542
548
 
543
- if v[:query_location][0][0] < ft.locations[0].from
549
+ if v[:query_location][0][0] < ft.locations[0].from or ft_index == gbk_features_len-1
544
550
 
551
+ pp v
545
552
  if v[:subject_location][0][0] > v[:subject_location][0][1]
546
553
  location = "complement(#{v[:query_location][0][0]}..#{v[:query_location][0][1]})"
547
554
  else
@@ -550,7 +557,12 @@ class SequenceAnnotation
550
557
 
551
558
  feature = Bio::Feature.new(v[:feature][0],location)
552
559
  feature.qualifiers.push(Bio::Feature::Qualifier.new('product',v[:product][0])) if ! v[:product][0].nil? or v[:product][0] != ""
553
- new_features[ft_index] = feature
560
+ if ft_index == gbk_features_len-1
561
+ new_features[gbk_features_len] = feature
562
+ else
563
+ new_features[ft_index] = feature
564
+ end
565
+
554
566
  annotations_done[k] = 1
555
567
  break
556
568
 
@@ -1,10 +1,10 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  # author: maxime déraspe
3
3
  # email: maximilien1er@gmail.com
4
- # review:
4
+ # review:
5
5
  # date: 15-02-24
6
6
  # version: 0.0.1
7
- # licence:
7
+ # licence:
8
8
 
9
9
  require 'bio'
10
10
  require 'fileutils'
@@ -324,11 +324,19 @@ class BacterialAnnotator
324
324
  @options[:pcoverage],
325
325
  "prot")
326
326
 
327
- print "# Running BLAT alignment with External Database.."
327
+ # print "# Running BLAT alignment with External Database.."
328
+ # start_time = Time.now
329
+ # @externaldb_synteny.run_blat
330
+ # end_time = Time.now
331
+ # c_time = Helper.sec2str(end_time-start_time)
332
+ # print "done (#{c_time})\n"
333
+ # @externaldb_synteny.extract_hits :externaldb
334
+
335
+ print "# Running alignment with External DB"
328
336
  start_time = Time.now
329
- @externaldb_synteny.run_blat
337
+ @externaldb_synteny.run_diamond
330
338
  end_time = Time.now
331
- c_time = Helper.sec2str(end_time-start_time)
339
+ c_time = Helper.sec2str(end_time - start_time)
332
340
  print "done (#{c_time})\n"
333
341
  @externaldb_synteny.extract_hits :externaldb
334
342
 
@@ -1,16 +1,18 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  # author: maxime déraspe
3
3
  # email: maximilien1er@gmail.com
4
- # review:
4
+ # review:
5
5
  # date: 15-02-24
6
6
  # version: 0.0.1
7
- # licence:
7
+ # licence:
8
8
 
9
9
  require 'bio'
10
10
  require 'fileutils'
11
11
  require 'parallel'
12
12
  require 'helper'
13
13
 
14
+ require 'bacterial-annotator/sequence-annotation'
15
+
14
16
  class BacterialComparator
15
17
 
16
18
  attr_reader :genomes_list, :stats
@@ -25,6 +27,9 @@ class BacterialComparator
25
27
  @genomes_list = options[:genomes_list]
26
28
  @proc = options[:proc].to_i
27
29
  @phylo_nb_genes = options[:phylo_nb_genes]
30
+ @refgenome_file = options[:refgenome]
31
+ @refgenome = ""
32
+
28
33
  if ["fasttree","raxml"].include? options[:software]
29
34
  @software = options[:software]
30
35
  else
@@ -62,14 +67,40 @@ class BacterialComparator
62
67
 
63
68
  end
64
69
 
65
-
66
70
  def read_prot_synteny
67
71
 
68
72
  puts "# Reading genome synteny files START.."
69
73
  start_time = Time.now
70
74
  synteny = {}
71
- @genomes_list.each do |g|
75
+
76
+ # If genome is genbank (.gbk or .gb) then do syntheny first
77
+
78
+ @genomes_list.each_with_index do |g,i|
79
+
72
80
  genome_synteny = []
81
+
82
+ if g =~ /.gbk/i
83
+ new_genomes_dir = @outdir+"/new-genomes/"
84
+ Dir.mkdir(new_genomes_dir) if ! Dir.exists? new_genomes_dir
85
+ genome_dir = new_genomes_dir + "/" + File.basename(g).gsub(".gbk","")
86
+ Dir.mkdir(genome_dir) if ! Dir.exists? genome_dir
87
+ genome_to_annotate = SequenceAnnotation.new(@root,
88
+ genome_dir,
89
+ g,
90
+ "refGbk")
91
+ query_prot_file = Dir["#{genome_dir}/*.pep"][0]
92
+ query_gene_file = Dir["#{genome_dir}/*.dna"][0]
93
+
94
+ File.symlink(query_prot_file, File.dirname(query_prot_file)+"/Proteins.fa")
95
+ File.symlink(query_gene_file, File.dirname(query_gene_file)+"/Genes.fa")
96
+
97
+ run_synteny_prot @root, genome_dir, @ref_prot_file, query_prot_file
98
+
99
+ end
100
+
101
+ g = genome_dir
102
+ @genomes_list[i] = genome_dir
103
+
73
104
  file = File.open("#{g}/Prot-Synteny.tsv", "r")
74
105
  l = file.gets # skip header
75
106
  while l = file.gets
@@ -79,6 +110,7 @@ class BacterialComparator
79
110
  synteny[lA[0]] << {ref_cov: lA[3].to_f, pId: lA[4].to_f, query_prot: lA[5], query_cov: lA[7].to_f}
80
111
  genome_synteny << lA[0]
81
112
  end
113
+
82
114
  @ref_prot.each do |ref_prot|
83
115
  if ! genome_synteny.include? ref_prot
84
116
  synteny[lA[0]] << {ref_cov: "-", pId: "-", query_prot: "-", query_cov: "-"}
@@ -96,16 +128,45 @@ class BacterialComparator
96
128
  end
97
129
 
98
130
  def get_ref_prot
131
+
99
132
  ref_prot = []
100
- pep_file = Dir["#{@genomes_list[0]}/*.pep"]
101
- flatfile = Bio::FlatFile.auto("#{pep_file[0]}")
102
- flatfile.each_entry do |entry|
103
- ref_prot << entry.definition.split(" ")[0]
133
+ if File.exist?("#{@genomes_list[0]}/*.pep")
134
+ pep_file = Dir["#{@genomes_list[0]}/*.pep"]
135
+ @ref_prot_file = pep_file[0]
136
+ flatfile = Bio::FlatFile.auto("#{@ref_prot_file}")
137
+ flatfile.each_entry do |entry|
138
+ ref_prot << entry.definition.split(" ")[0]
139
+ end
140
+ flatfile.close
141
+ dna_file = Dir["#{@genomes_list[0]}/*.dna"]
142
+ @ref_dna_file = dna_file[0]
143
+ else
144
+ if @refgenome_file == ""
145
+ abort "You need to provide a reference genome to add a non-annotated genome"
146
+ elsif @refgenome == ""
147
+ new_genomes_dir = @outdir+"/new-genomes/"
148
+ Dir.mkdir(new_genomes_dir) if ! Dir.exists? new_genomes_dir
149
+ refgenome_dir = new_genomes_dir + "/" + File.basename(@refgenome_file).gsub(".gbk","")
150
+ Dir.mkdir(refgenome_dir) if ! Dir.exists? refgenome_dir
151
+ @refgenome = SequenceAnnotation.new(@root,
152
+ refgenome_dir,
153
+ @refgenome_file,
154
+ "refGbk")
155
+ pep_file = Dir["#{refgenome_dir}/*.pep"]
156
+ @ref_prot_file = pep_file[0]
157
+ flatfile = Bio::FlatFile.auto("#{@ref_prot_file}")
158
+ flatfile.each_entry do |entry|
159
+ ref_prot << entry.definition.split(" ")[0]
160
+ end
161
+ flatfile.close
162
+ dna_file = Dir["#{refgenome_dir}/*.dna"]
163
+ @ref_dna_file = dna_file[0]
164
+ end
104
165
  end
105
- flatfile.close
166
+
106
167
  ref_prot
107
- end
108
168
 
169
+ end
109
170
 
110
171
  # load all id => sequences from multifasta
111
172
  def load_genome_cds file
@@ -128,7 +189,7 @@ class BacterialComparator
128
189
 
129
190
  pep_out_dir = "#{@outdir}/align-genes-pep"
130
191
 
131
- ref_proteins = load_genome_cds(Dir["#{@genomes_list[0]}/*.pep"][0])
192
+ ref_proteins = load_genome_cds(@ref_prot_file)
132
193
  synteny_list.each do |k,v|
133
194
  pep_out = File.open(pep_out_dir+"/#{k}.pep", "w")
134
195
  pep_out.write(ref_proteins[k])
@@ -147,7 +208,7 @@ class BacterialComparator
147
208
  end
148
209
 
149
210
  dna_out_dir = "#{@outdir}/align-genes-dna"
150
- ref_genes = load_genome_cds(Dir["#{@genomes_list[0]}/*.dna"][0])
211
+ ref_genes = load_genome_cds(@ref_dna_file)
151
212
  synteny_list.each do |k,v|
152
213
  dna_out = File.open(dna_out_dir+"/#{k}.dna", "w")
153
214
  dna_out.write(ref_genes[k])
@@ -304,7 +365,7 @@ class BacterialComparator
304
365
 
305
366
  # FIXME ugly hack to find out the reference genome
306
367
  Dir.chdir(ori_dir)
307
- ref_id = Dir["#{@genomes_list[0]}/*.pep"][0].split('/')[-1].gsub(".pep","")
368
+ ref_id = @ref_prot_file.split('/')[-1].gsub(".pep","")
308
369
 
309
370
  concat_alignments "#{@outdir}/align-genes-pep.all.fasta", ref_id
310
371
 
@@ -339,7 +400,7 @@ class BacterialComparator
339
400
 
340
401
  # ugly hack to find out the reference genome FIXME
341
402
  Dir.chdir(ori_dir)
342
- ref_id = Dir["#{@genomes_list[0]}/*.pep"][0].split('/')[-1].gsub(".pep","")
403
+ ref_id = @ref_prot_file.split('/')[-1].gsub(".pep","")
343
404
 
344
405
  end_time = Time.now
345
406
  c_time = Helper.sec2str(end_time-start_time)
@@ -508,4 +569,110 @@ class BacterialComparator
508
569
 
509
570
  end
510
571
 
572
+
573
+ def get_fasta_length fasta
574
+ flatfile = Bio::FlatFile.auto(fasta)
575
+ prot_lengths = {}
576
+ flatfile.each_entry do |entry|
577
+ prot_id = entry.definition.split(" ")[0]
578
+ prot_length = entry.length
579
+ prot_lengths[prot_id] = prot_length
580
+ end
581
+ flatfile.close
582
+ prot_lengths
583
+ end
584
+
585
+
586
+ def run_synteny_prot root, outdir, ref_prot_file, query_prot_file
587
+
588
+ puts query_prot_file
589
+ puts ref_prot_file
590
+
591
+ ref_synteny_prot = SequenceSynteny.new(root,
592
+ outdir,
593
+ query_prot_file,
594
+ ref_prot_file,
595
+ "Prot-Ref",
596
+ 0.80,
597
+ 0.80,
598
+ "prot")
599
+
600
+ print "# Running alignment with Reference Genome CDS (diamond).."
601
+ start_time = Time.now
602
+ ref_synteny_prot.run_diamond
603
+ end_time = Time.now
604
+ c_time = Helper.sec2str(end_time - start_time)
605
+ print "done (#{c_time})\n"
606
+
607
+ ref_synteny_prot.extract_hits :refgenome
608
+
609
+ synteny_file = File.open("#{outdir}/Prot-Synteny.tsv","w")
610
+ synteny_file.write("RefLocusTag\tRefProtID\tRefLength\tRefCoverage\tIdentity\tQueryGene\tQueryLength\tQueryCoverage\tQueryPartial\n")
611
+ ref_annotated = {}
612
+
613
+ ref_synteny_prot.query_sequences.each do |prot, syn_val|
614
+ next if ! syn_val.has_key? :homology
615
+ next if syn_val[:homology][:assert_cutoff].inject(:+) < 3
616
+ next if ref_annotated.has_key? syn_val[:homology][:hits][0] and ref_annotated[syn_val[:homology][:hits][0]][:partial] == 0
617
+ ref_annotated[syn_val[:homology][:hits][0]] = {
618
+ key: prot,
619
+ pId: syn_val[:homology][:pId],
620
+ cov_query: syn_val[:homology][:cov_query],
621
+ cov_subject: syn_val[:homology][:cov_subject],
622
+ assert_cutoff: syn_val[:homology][:assert_cutoff],
623
+ length: syn_val[:homology][:length][0],
624
+ partial: (syn_val[:partial] ? 1 : 0)
625
+ }
626
+ # ref_annotated[syn_val[:homology][:hits][0]] = {
627
+ # key: prot,
628
+ # pId: syn_val[:homology][:pId],
629
+ # cov_query: syn_val[:homology][:cov_query],
630
+ # cov_subject: syn_val[:homology][:cov_subject],
631
+ # assert_cutoff: syn_val[:homology][:assert_cutoff],
632
+ # length: syn_val[:homology][:length][0],
633
+ # partial: (syn_val[:partial] ? 1 : 0)
634
+ # }
635
+ end
636
+
637
+ # print ref_annotated
638
+ query_lengths = get_fasta_length query_prot_file
639
+
640
+ @refgenome.coding_seq.each do |ref_k, ref_v|
641
+ gene = ""
642
+ coverage_ref = ""
643
+ coverage_query = ""
644
+ query_length = ""
645
+ pId = ""
646
+ if ref_annotated[ref_v[:protId]] != nil
647
+ gene = ref_annotated[ref_v[:protId]][:key]
648
+ coverage_ref = ref_annotated[ref_v[:protId]][:cov_subject]
649
+ query_length = query_lengths[ref_annotated[ref_v[:protId]][:key]]
650
+ coverage_query = ref_annotated[ref_v[:protId]][:cov_query]
651
+ pId = ref_annotated[ref_v[:protId]][:pId]
652
+ partial = ref_annotated[ref_v[:protId]][:partial]
653
+ end
654
+
655
+ _locus_tag = ref_v[:locustag] || ""
656
+ _seq_len = "NA"
657
+ # _seq_len = ref_v[:bioseq].seq.length.to_s if ! ref_v[:bioseq].nil?
658
+ _seq_len = ref_v[:length].to_s if ! ref_v[:length].nil?
659
+
660
+ synteny_file.write(ref_v[:protId])
661
+ synteny_file.write("\t"+_locus_tag)
662
+ synteny_file.write("\t"+_seq_len)
663
+ synteny_file.write("\t"+coverage_ref.to_s)
664
+ synteny_file.write("\t"+pId.to_s)
665
+ synteny_file.write("\t"+gene)
666
+ synteny_file.write("\t"+query_length.to_s)
667
+ synteny_file.write("\t"+coverage_query.to_s)
668
+ synteny_file.write("\t"+partial.to_s)
669
+ synteny_file.write("\n")
670
+
671
+ end
672
+
673
+ synteny_file.close
674
+
675
+ end
676
+
677
+
511
678
  end # end of Class
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bacterial-annotator
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.9
4
+ version: 0.9.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Maxime Deraspe
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-10-17 00:00:00.000000000 Z
11
+ date: 2022-09-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bio
@@ -110,7 +110,7 @@ homepage: http://rubygems.org/gems/bacterial-annotator
110
110
  licenses:
111
111
  - GPL-3.0
112
112
  metadata: {}
113
- post_install_message:
113
+ post_install_message:
114
114
  rdoc_options: []
115
115
  require_paths:
116
116
  - lib
@@ -125,9 +125,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
125
125
  - !ruby/object:Gem::Version
126
126
  version: '0'
127
127
  requirements: []
128
- rubyforge_project:
129
- rubygems_version: 2.7.7
130
- signing_key:
128
+ rubygems_version: 3.1.6
129
+ signing_key:
131
130
  specification_version: 4
132
131
  summary: Bacterial Annotator
133
132
  test_files: []