bacterial-annotator 0.6.6 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 471d9b65a3b536abb3ab252ddcf4e1b819f7dc12
4
- data.tar.gz: 464f51469f99b92bc5c1717c6e0913afb8241fb0
3
+ metadata.gz: 0228aafd97af13b8756df42db362e4a53a30f4f0
4
+ data.tar.gz: 858942624597354dd0f52ad98ea1e94373289174
5
5
  SHA512:
6
- metadata.gz: 7d31974ec7d7f2899366172306b33f9e456cd7827ca1f381453d3909cb16716509c66c2718af96a8e3b73f17026c6ecc0fda71b30e970e4f7aeb53fd951a7c62
7
- data.tar.gz: bf7deca317fbbb5463166b9b672219875ab2fa87dcee514a5dae8bd3006a31cb563e11351c5c929bb7f10422df878788aa09e58741c11eca409af3eb0d460e1e
6
+ metadata.gz: 82c36c4fba00b437e721991c739517b7cfeb5edaa7e1ac49849e59d3ffac2165f1ef39f9961aa756ff8ad691fec36a8b3424cf8ce4d0e1125d486fa2e2a38593
7
+ data.tar.gz: e8b569f61f2dcb7309c6587ce619f7432e2588a717f3017053adcb693327ac2f21850785883eae4477226d057dde18a308a88d85d2a0558561d567865d1348cc
data/bin/ba_mash ADDED
@@ -0,0 +1,45 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+ # author: maxime déraspe
4
+ # email: maximilien1er@gmail.com
5
+ # review:
6
+ # date: 17-10-12
7
+ # version: 0.01
8
+ # licence:
9
+
10
+ require 'open-uri'
11
+
12
+ ROOT_path = File.dirname(__FILE__)
13
+ # url = "https://github.com/marbl/Mash/releases/download/v2.0/mash-Linux64-v2.0.tar"
14
+
15
# Install Mash on the user system.
#
# Downloads the Mash v2.0 Linux release tarball from GitHub into ROOT_path,
# extracts it, copies the `mash` binary to "mash.linux" inside ROOT_path,
# makes it executable (0755) and removes the tarball and the extracted
# directory. Any failure aborts the program with the underlying reason.
def installMash

  url = "https://github.com/marbl/Mash/releases/download/v2.0/mash-Linux64-v2.0.tar"
  # Resolve once to an absolute path: ROOT_path may be relative, and a
  # relative path stops being valid as soon as the working directory changes.
  install_dir = File.expand_path(ROOT_path)

  begin
    # URI#open (provided by open-uri) is used instead of Kernel#open, which
    # no longer opens URLs on Ruby >= 3.0.
    URI.parse(url).open do |resp|
      File.open(File.join(install_dir, "mash.tar"), "wb") do |file|
        file.write(resp.read)
      end
    end

    # Block form of chdir restores the previous working directory afterwards.
    Dir.chdir(install_dir) do
      system("tar -xvf mash.tar > /dev/null 2>&1") || raise("could not extract mash.tar")
      system("cp mash-Linux64-v2.0/mash mash.linux") || raise("could not copy the mash binary")
      File.chmod(0755, "mash.linux")
      system("rm -fr mash.tar mash-Linux64-v2.0/")
    end
  rescue => e
    abort "Problem installing Mash, aborting (#{e.message})"
  end

end
33
+
34
+
35
# Install Mash if it is not already installed.
# File.exist? is used because File.exists? was removed in Ruby 3.2.
unless File.exist? "#{ROOT_path}/mash.linux"

  puts "Installing Mash the MinHash sequence comparator.."
  puts "See https://github.com/marbl/Mash/"
  puts "The Licence is Custom"
  installMash
  puts "Mash successfully installed in #{ROOT_path}/mash.linux"
  puts ""

end
@@ -10,6 +10,7 @@
10
10
 
11
11
  require 'bacterial-annotator'
12
12
  require 'bacterial-comparator'
13
+ require 'bacterial-identificator'
13
14
 
14
15
  VERSION = "0.6.6"
15
16
 
@@ -22,6 +23,7 @@ def print_version
22
23
  version += " -- fasta36 v36.3.8d\n"
23
24
  version += " -- RAxML v8.2.11\n"
24
25
  version += " -- FastTree v2.1.10\n"
26
+ version += " -- MASH v2.0\n"
25
27
  puts version
26
28
  end
27
29
 
@@ -36,6 +38,9 @@ bacterial-annotator [annotate | compare] [OPTIONS]
36
38
 
37
39
  # Choose either to annotate a genome or compare several genome annotations
38
40
 
41
+ identify [OPTIONS]
42
+ .. see identify -h for OPTIONS
43
+
39
44
  annotate [OPTIONS]
40
45
  .. see annotate -h for OPTIONS
41
46
 
@@ -64,7 +69,7 @@ annotate [OPTIONS]
64
69
  --name/-n <name> Sample name
65
70
 
66
71
  // MERGEM-based Annotation (Recommended)
67
- --db/-d <directory> MERGEM database directory
72
+ --db/-d <directory> MERGEM database directory
68
73
 
69
74
  // Reference-Based Annotation
70
75
  --refgenome/-g <GBK_ID> Provide a Genbank file or a Gbk Accession ID.
@@ -122,7 +127,6 @@ def parseOptions_annotate
122
127
  when "--externaldb"
123
128
  options[:external_db] = ARGV.shift
124
129
  when "--help", "-h"
125
-
126
130
  usage_annotate
127
131
  abort
128
132
  when "--version", "-v"
@@ -213,6 +217,58 @@ def parseOptions_compare
213
217
 
214
218
  end
215
219
 
220
+
221
# Print the help text of the `identify` sub-command to standard output.
def usage_identify

  help_lines = [
    "",
    "identify [OPTIONS] genome_1.fasta genome_2.fasta genome_x.fasta",
    "",
    "//MERGEM Database",
    "--db/-d <database directory>",
    "",
    "//IO",
    "--proc <nb of process> Number of process to run the comparison",
    ""
  ]

  # Every line, including the trailing blank one, is newline-terminated.
  print help_lines.map { |line| line + "\n" }.join

end
236
+
237
# Parse the command-line arguments of the `identify` sub-command.
#
# Consumes ARGV (every argument is shifted off). Recognized arguments,
# matched case-insensitively:
#   --db / -d <dir>    stored as options[:database]
#   --proc / -p <n>    stored as options[:proc], as given on the command line
#                      (defaults to the Integer 2)
#   --help / -h        prints the identify usage and aborts
# Any other argument must be the path of an existing file and is appended to
# options[:genomes_list]; otherwise an error and the usage are printed and
# the program aborts.
#
# Returns the options Hash.
def parseOptions_identify

  options = {}
  options[:proc] = 2
  options[:genomes_list] = []

  while x = ARGV.shift

    case x.downcase
    when "--db", "-d"
      options[:database] = ARGV.shift
    when "--proc", "-p"
      options[:proc] = ARGV.shift
    when "--help", "-h"
      usage_identify
      abort
    else
      # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
      if File.exist? x
        options[:genomes_list] << x
      else
        puts "#{x} file doesn't exist"
        usage_identify
        abort
      end
    end

  end

  options

end
268
+
269
+
270
+
271
+
216
272
  ########
217
273
  # MAIN #
218
274
  ########
@@ -230,9 +286,9 @@ if ARGV.size >= 1
230
286
  system("ba_fasta36")
231
287
  system("ba_cdhit")
232
288
  system("ba_fasttree")
289
+ system("ba_mash")
233
290
 
234
291
  options = {}
235
- genomes_list = [] # TODO multiple input genomes
236
292
 
237
293
  if ARGV[0] == "annotate"
238
294
 
@@ -240,7 +296,8 @@ if ARGV.size >= 1
240
296
  options = parseOptions_annotate
241
297
 
242
298
  if ! File.exist? ("#{ROOT}/blat.linux")
243
- abort "#exiting blat is missing"
299
+ puts "Exiting program because blat is missing"
300
+ abort
244
301
  end
245
302
 
246
303
  # Check Options
@@ -264,6 +321,17 @@ if ARGV.size >= 1
264
321
  bcomp.mafft_aln aln_opt
265
322
  bcomp.raxml_tree aln_opt, options[:bootstrap] if options[:phylogeny] == 1
266
323
 
324
+ elsif ARGV[0] == "identify"
325
+
326
+ ARGV.shift
327
+ options = parseOptions_identify
328
+ if options[:genomes_list].empty?
329
+ puts "You need at least 1 genome fasta to identify !!"
330
+ usage_identify
331
+ abort
332
+ end
333
+ bident = BacterialIdentificator.new(options, ROOT)
334
+
267
335
  elsif ARGV[0] == "--version" or ARGV[0] == "-v"
268
336
 
269
337
  print_version
@@ -273,7 +341,6 @@ if ARGV.size >= 1
273
341
  usage
274
342
  end
275
343
 
276
-
277
344
  else
278
345
  usage
279
346
  end
@@ -22,8 +22,6 @@ class SequenceFasta
22
22
  abort "Aborting : The input sequence is not a fasta file !"
23
23
  end
24
24
 
25
- # @contigs = extract_contigs(@fasta_flat)
26
-
27
25
  @meta = meta
28
26
 
29
27
  @annotation_files = nil
@@ -38,7 +36,7 @@ class SequenceFasta
38
36
 
39
37
  @annotation_files = {}
40
38
  Dir.mkdir "#{outdir}" if ! Dir.exists? "#{outdir}"
41
- if @meta
39
+ if @meta==1
42
40
  system("#{root}/prodigal.linux -p meta -i #{@fasta_file} -a #{outdir}/Proteins.fa -d #{outdir}/Genes.fa -o #{outdir}/Genbanks.gbk -q")
43
41
  else
44
42
  system("#{root}/prodigal.linux -i #{@fasta_file} -a #{outdir}/Proteins.fa -d #{outdir}/Genes.fa -o #{outdir}/Genbanks.gbk -q")
@@ -100,7 +100,7 @@ class SequenceSynteny
100
100
  # first hit for query
101
101
  if ! @query_sequences[key].has_key? :homology
102
102
  @query_sequences[key][:conserved] = true
103
- @subject_sequences[key][:conserved] = true
103
+ # @subject_sequences[key][:conserved] = true
104
104
  @query_sequences[key][:homology] = {
105
105
  pId: lA[2].to_f.round(2),
106
106
  cov_query: cov_query,
@@ -119,7 +119,7 @@ class SequenceSynteny
119
119
  # query already got at least 1 hit and new_score > last_score
120
120
  elsif lA[11].to_f > @query_sequences[key][:homology][:score]
121
121
  @query_sequences[key][:conserved] = true
122
- @subject_sequences[key][:conserved] = true
122
+ # @subject_sequences[key][:conserved] = true
123
123
  @query_sequences[key][:homology] = {
124
124
  pId: lA[2].to_f.round(2),
125
125
  cov_query: cov_query,
@@ -26,7 +26,7 @@ class BacterialAnnotator
26
26
  @minlength = @options[:minlength].to_i
27
27
  @options[:minlength] = @options[:minlength].to_i
28
28
  @options[:pidentity] = @options[:pidentity].to_f
29
- @options[:pidentispacemacs-lightty] = @options[:pidentity] * 100 if @options[:pidentity] <= 1.00
29
+ @options[:pidentity] = @options[:pidentity] * 100 if @options[:pidentity] <= 1.00
30
30
  @options[:pcoverage] = @options[:pcoverage].to_f
31
31
  @options[:pcoverage] = @options[:pcoverage] / 100 if @options[:pcoverage] > 1.00
32
32
 
@@ -200,8 +200,12 @@ class BacterialAnnotator
200
200
  "Prot-ExternalDB", @options[:pidentity],
201
201
  @options[:pcoverage], "prot")
202
202
 
203
- puts "# Running BLAT alignment with External Database.."
203
+ print "# Running BLAT alignment with External Database.."
204
+ start_time = Time.now
204
205
  @externaldb_synteny.run_blat @root, @options[:outdir]
206
+ end_time = Time.now
207
+ c_time = Helper.sec2str(end_time-start_time)
208
+ print "done (#{c_time})\n"
205
209
  @externaldb_synteny.extract_hits :externaldb
206
210
 
207
211
  @externaldb_synteny.query_sequences.each do |k, v|
@@ -214,6 +218,9 @@ class BacterialAnnotator
214
218
 
215
219
  next if ! v.has_key? :homology
216
220
 
221
+ if ! @contig_annotations_cds.has_key? contig_of_protein
222
+ @contig_annotations_cds[contig_of_protein] = []
223
+ end
217
224
  @contig_annotations_cds[contig_of_protein] << k
218
225
 
219
226
  hit_gi = v[:homology][:hits][0]
@@ -238,7 +245,6 @@ class BacterialAnnotator
238
245
  inference: inference
239
246
  }
240
247
 
241
-
242
248
  end
243
249
 
244
250
  end
@@ -256,12 +262,17 @@ class BacterialAnnotator
256
262
  gbk_path = @query_fasta.annotation_files[:gbk_path]
257
263
  gbk_to_annotate = SequenceAnnotation.new("#{gbk_path}/#{contig}.gbk", "#{gbk_path}")
258
264
 
259
- if @with_external_db
265
+ if @with_external_db and @with_refence_genome
260
266
  gbk_to_annotate.add_annotation_ref_synteny_prot(
261
267
  (@prot_synteny_refgenome.query_sequences.merge(@externaldb_synteny.query_sequences)),
262
268
  @contig_annotations_externaldb[contig].merge(@ref_genome.coding_seq),
263
269
  (File.basename @options[:refgenome]).gsub(/.gb.*/,"")
264
270
  )
271
+ elsif @with_external_db
272
+ gbk_to_annotate.add_annotation_ref_synteny_prot(
273
+ @externaldb_synteny.query_sequences,
274
+ @contig_annotations_externaldb[contig]
275
+ )
265
276
  else
266
277
  gbk_to_annotate.add_annotation_ref_synteny_prot(
267
278
  @prot_synteny_refgenome.query_sequences,
@@ -270,7 +281,7 @@ class BacterialAnnotator
270
281
  )
271
282
  end
272
283
 
273
- if @contig_annotations_rna.has_key? contig
284
+ if @contig_annotations_rna and @contig_annotations_rna.has_key? contig
274
285
  # puts "RNA annotation"
275
286
  gbk_to_annotate.add_annotations @contig_annotations_rna[contig], "new"
276
287
  end
@@ -419,6 +430,8 @@ class BacterialAnnotator
419
430
  # >sp|C7C422|BLAN1_KLEPN Beta-lactamase NDM-1 OS=Klebsiella pneumoniae GN=blaNDM-1 PE=1 SV=1
420
431
  # TrEMBL
421
432
  # >tr|E5KIY2|E5KIY2_ECOLX Beta-lactamase NDM-1 OS=Escherichia coli GN=blaNDM-1 PE=1 SV=1
433
+ # MERGEM
434
+ # >Genome_ID|location|Protein_ID|LocusTag|Gene|Protein_Product
422
435
 
423
436
  ref_cds = {}
424
437
 
@@ -436,7 +449,10 @@ class BacterialAnnotator
436
449
  product = ""
437
450
  db_source = "[DBSource]"
438
451
 
439
- if product_long.include? " [" and product_long.include? "]" # NCBI
452
+ if product_long.scan(/|/).count >= 5 # MERGEM
453
+ product = product_long
454
+ db_source = "RefSeq"
455
+ elsif product_long.include? " [" and product_long.include? "]" # NCBI
440
456
  organism = product_long[/\[.*?\]/]
441
457
  product = product_long.split(" [")[0].strip
442
458
  elsif product_long.include? "OS=" # Swissprot / TrEMBL
@@ -465,6 +481,9 @@ class BacterialAnnotator
465
481
  db_source = "UniProtKB"
466
482
  end
467
483
  prot_id = lA[1]
484
+ elsif key_gi.count("|") == 5
485
+ db_source = "RefSeq"
486
+ prot_id = lA[2]
468
487
  end
469
488
 
470
489
  ref_cds[key_gi] = {product: product, org: org, prot_id: prot_id, db_source: db_source}
@@ -153,7 +153,11 @@ class BacterialComparator
153
153
  stats = {}
154
154
  stats[:syntenic] = []
155
155
  fout = File.open("#{@outdir}/cds-synteny.tsv", "w")
156
- fout.write("Gene\t"+@genomes_list.join("\t")+"\n")
156
+ genomes_name = []
157
+ @genomes_list.each do |g|
158
+ genomes_name.push(File.basename(g))
159
+ end
160
+ fout.write("Gene\t"+genomes_name.join("\t")+"\n")
157
161
 
158
162
  to_build_multifasta = []
159
163
 
@@ -386,7 +390,6 @@ class BacterialComparator
386
390
 
387
391
  end
388
392
 
389
-
390
393
  def raxml_tree_dna bt
391
394
  print "# Genes DNA tree creation (RAXML).."
392
395
  start_time = Time.now
@@ -423,7 +426,6 @@ class BacterialComparator
423
426
  print "done (#{c_time})\n"
424
427
  end
425
428
 
426
-
427
429
  def raxml_tree aln_opt, bt
428
430
 
429
431
  if aln_opt == "both"
@@ -0,0 +1,64 @@
1
+ # -*- coding: utf-8 -*-
2
+ # author: maxime déraspe
3
+ # email: maximilien1er@gmail.com
4
+ # review:
5
+ # date: 17-10-12
6
+ # version: 0.0.1
7
+ # licence:
8
+
9
+ require 'bio'
10
+ require 'fileutils'
11
+ require 'parallel'
12
+ require 'helper'
13
+
14
# Compares genomes against a Mash sketch database ("species-sequences.msh"
# inside the database directory) and writes, for every genome, the matching
# references ordered by Mash distance.
class BacterialIdentificator

  # genomes_list: the genome paths given at construction.
  # stats: exposed for callers but never assigned by this class (always nil).
  attr_reader :genomes_list, :stats

  # Initialize BacterialIdentificator and run Mash on every listed genome.
  #
  # options - Hash; reads :database (database directory), :genomes_list
  #           (Array of genome file paths) and :proc (converted with to_i).
  # root    - directory that contains the "mash.linux" executable.
  def initialize options, root

    @root = root
    @db_path = options[:database]
    @genomes_list = options[:genomes_list]
    @proc = options[:proc].to_i

    @genomes_list.each do |g|
      mash_genome g
    end

  end


  # Run `mash dist` for one genome against the database sketch and write the
  # hits, closest first, to "<genome>.msh_dist" (tab-separated, one per line).
  #
  # genome - path of the genome file to compare.
  def mash_genome genome

    # Reference-ID, Query-ID, Mash-distance, P-value, and Matching-hashes
    # fields = ["hit","query","distance","pvalue","match"]

    # Argument-array form: no shell is involved, so paths containing spaces
    # or shell metacharacters are passed through untouched.
    command = ["#{@root}/mash.linux", "dist", "#{@db_path}/species-sequences.msh", genome.to_s]
    results_raw = IO.popen(command) { |io| io.read }

    results = []
    results_raw.to_s.split("\n").each do |l|
      lA = l.chomp.split("\t")
      next if lA.empty?                              # blank line
      next if lA[-1].to_s.split("/")[0] == '0'       # no match
      results << lA
    end

    # Distances are text (possibly in scientific notation such as "1e-05"),
    # so they have to be compared as numbers, not as strings.
    results_sorted = results.sort_by { |fields| fields[2].to_f }

    File.open("#{genome}.msh_dist", "w") do |fout|
      results_sorted.each do |f|
        fout.write(f.join("\t"))
        fout.write("\n")
      end
    end

  end

end
64
+
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bacterial-annotator
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.6
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Maxime Deraspe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-10-10 00:00:00.000000000 Z
11
+ date: 2017-11-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bio
@@ -83,6 +83,7 @@ executables:
83
83
  - ba_fasta36
84
84
  - ba_cdhit
85
85
  - ba_fasttree
86
+ - ba_mash
86
87
  extensions: []
87
88
  extra_rdoc_files: []
88
89
  files:
@@ -92,6 +93,7 @@ files:
92
93
  - bin/ba_fasta36
93
94
  - bin/ba_fasttree
94
95
  - bin/ba_mafft
96
+ - bin/ba_mash
95
97
  - bin/ba_prodigal
96
98
  - bin/ba_raxml
97
99
  - bin/bacterial-annotator
@@ -100,6 +102,7 @@ files:
100
102
  - lib/bacterial-annotator/sequence-fasta.rb
101
103
  - lib/bacterial-annotator/sequence-synteny.rb
102
104
  - lib/bacterial-comparator.rb
105
+ - lib/bacterial-identificator.rb
103
106
  - lib/helper.rb
104
107
  homepage: http://rubygems.org/gems/bacterial-annotator
105
108
  licenses: