bacterial-annotator 0.6.6 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 471d9b65a3b536abb3ab252ddcf4e1b819f7dc12
4
- data.tar.gz: 464f51469f99b92bc5c1717c6e0913afb8241fb0
3
+ metadata.gz: 0228aafd97af13b8756df42db362e4a53a30f4f0
4
+ data.tar.gz: 858942624597354dd0f52ad98ea1e94373289174
5
5
  SHA512:
6
- metadata.gz: 7d31974ec7d7f2899366172306b33f9e456cd7827ca1f381453d3909cb16716509c66c2718af96a8e3b73f17026c6ecc0fda71b30e970e4f7aeb53fd951a7c62
7
- data.tar.gz: bf7deca317fbbb5463166b9b672219875ab2fa87dcee514a5dae8bd3006a31cb563e11351c5c929bb7f10422df878788aa09e58741c11eca409af3eb0d460e1e
6
+ metadata.gz: 82c36c4fba00b437e721991c739517b7cfeb5edaa7e1ac49849e59d3ffac2165f1ef39f9961aa756ff8ad691fec36a8b3424cf8ce4d0e1125d486fa2e2a38593
7
+ data.tar.gz: e8b569f61f2dcb7309c6587ce619f7432e2588a717f3017053adcb693327ac2f21850785883eae4477226d057dde18a308a88d85d2a0558561d567865d1348cc
data/bin/ba_mash ADDED
@@ -0,0 +1,45 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+ # author: maxime déraspe
4
+ # email: maximilien1er@gmail.com
5
+ # review:
6
+ # date: 17-10-12
7
+ # version: 0.01
8
+ # licence:
9
+
10
+ require 'open-uri'
11
+
12
+ ROOT_path = File.dirname(__FILE__)
13
+ # url = "https://github.com/marbl/Mash/releases/download/v2.0/mash-Linux64-v2.0.tar"
14
+
15
# Install Mash on the user system.
#
# Downloads the Mash v2.0 Linux release tarball, extracts it next to this
# script (ROOT_path), copies the `mash` binary to `mash.linux`, marks it
# executable, and removes the tarball and the extracted directory.
#
# Aborts the process with "Problem installing Mash, aborting" if any step
# fails; the underlying cause is written to standard error first.
def installMash

  url = "https://github.com/marbl/Mash/releases/download/v2.0/mash-Linux64-v2.0.tar"
  tarball = "#{ROOT_path}/mash.tar"
  extract_dir = "#{ROOT_path}/mash-Linux64-v2.0"
  binary = "#{ROOT_path}/mash.linux"

  begin
    # URI#open is provided by open-uri on old and new Rubies alike, whereas
    # Kernel#open stopped accepting URLs in Ruby 3.0.
    URI.parse(url).open do |resp|
      File.open(tarball, "wb") do |file|
        file.write(resp.read)
      end
    end
    # -C extracts into ROOT_path without changing the working directory.
    system("tar", "-xf", tarball, "-C", ROOT_path) || raise("tar extraction failed")
    system("cp", "#{extract_dir}/mash", binary) || raise("copy of the mash binary failed")
    File.chmod(0755, binary)
  rescue => e
    warn "#{e.class}: #{e.message}"
    abort "Problem installing Mash, aborting"
  ensure
    # Always remove the intermediate files, even after a failure.
    system("rm", "-fr", tarball, extract_dir)
  end

end
33
+
34
+
35
# Install Mash if not already installed.
# File.exist? is used because File.exists? was removed in Ruby 3.2.
unless File.exist? "#{ROOT_path}/mash.linux"

  puts "Installing Mash the MinHash sequence comparator.."
  puts "See https://github.com/marbl/Mash/"
  puts "The Licence is Custom"
  installMash
  puts "Mash successfully installed in #{ROOT_path}/mash.linux"
  puts ""

end
@@ -10,6 +10,7 @@
10
10
 
11
11
  require 'bacterial-annotator'
12
12
  require 'bacterial-comparator'
13
+ require 'bacterial-identificator'
13
14
 
14
15
  VERSION = "0.6.6"
15
16
 
@@ -22,6 +23,7 @@ def print_version
22
23
  version += " -- fasta36 v36.3.8d\n"
23
24
  version += " -- RAxML v8.2.11\n"
24
25
  version += " -- FastTree v2.1.10\n"
26
+ version += " -- MASH v2.0\n"
25
27
  puts version
26
28
  end
27
29
 
@@ -36,6 +38,9 @@ bacterial-annotator [annotate | compare] [OPTIONS]
36
38
 
37
39
  # Choose either to annotate a genome or compare several genome annotations
38
40
 
41
+ identify [OPTIONS]
42
+ .. see identify -h for OPTIONS
43
+
39
44
  annotate [OPTIONS]
40
45
  .. see annotate -h for OPTIONS
41
46
 
@@ -64,7 +69,7 @@ annotate [OPTIONS]
64
69
  --name/-n <name> Sample name
65
70
 
66
71
  // MERGEM-based Annotation (Recommended)
67
- --db/-d <directory> MERGEM database directory
72
+ --db/-d <directory> MERGEM database directory
68
73
 
69
74
  // Reference-Based Annotation
70
75
  --refgenome/-g <GBK_ID> Provide a Genbank file or a Gbk Accession ID.
@@ -122,7 +127,6 @@ def parseOptions_annotate
122
127
  when "--externaldb"
123
128
  options[:external_db] = ARGV.shift
124
129
  when "--help", "-h"
125
-
126
130
  usage_annotate
127
131
  abort
128
132
  when "--version", "-v"
@@ -213,6 +217,58 @@ def parseOptions_compare
213
217
 
214
218
  end
215
219
 
220
+
221
# Print the command-line help of the `identify` sub-command to standard
# output. Lists every flag accepted by parseOptions_identify.
def usage_identify

  print <<OEM

identify [OPTIONS] genome_1.fasta genome_2.fasta genome_x.fasta

  //MERGEM Database
    --db/-d <database directory>

  //IO
    --proc/-p <nb of process>      Number of processes to run the comparison

    --help/-h                      Print this help

OEM

end
236
+
237
# Parse the command-line arguments of the `identify` sub-command.
#
# Consumes ARGV until it is empty. Option names are matched
# case-insensitively; any other argument is treated as a genome file path
# and must exist on disk.
#
# Returns a Hash with:
#   :proc         - value given to --proc/-p (as typed), 2 when not given
#   :genomes_list - Array of the existing file paths, in command-line order
#   :database     - value given to --db/-d (key absent when not given)
#
# Prints the usage and aborts on --help/-h or when a listed file is missing.
def parseOptions_identify

  options = {}
  options[:proc] = 2
  options[:genomes_list] = []

  while x = ARGV.shift

    case x.downcase
    when "--db", "-d"
      options[:database] = ARGV.shift
    when "--proc", "-p"
      options[:proc] = ARGV.shift
    when "--help", "-h"
      usage_identify
      abort
    else
      # File.exist? is used because File.exists? was removed in Ruby 3.2.
      if File.exist? x
        options[:genomes_list] << x
      else
        puts "#{x} file doesn't exist"
        usage_identify
        abort
      end
    end

  end

  options

end
268
+
269
+
270
+
271
+
216
272
  ########
217
273
  # MAIN #
218
274
  ########
@@ -230,9 +286,9 @@ if ARGV.size >= 1
230
286
  system("ba_fasta36")
231
287
  system("ba_cdhit")
232
288
  system("ba_fasttree")
289
+ system("ba_mash")
233
290
 
234
291
  options = {}
235
- genomes_list = [] # TODO multiple input genomes
236
292
 
237
293
  if ARGV[0] == "annotate"
238
294
 
@@ -240,7 +296,8 @@ if ARGV.size >= 1
240
296
  options = parseOptions_annotate
241
297
 
242
298
  if ! File.exist? ("#{ROOT}/blat.linux")
243
- abort "#exiting blat is missing"
299
+ puts "Exiting program because blat is missing"
300
+ abort
244
301
  end
245
302
 
246
303
  # Check Options
@@ -264,6 +321,17 @@ if ARGV.size >= 1
264
321
  bcomp.mafft_aln aln_opt
265
322
  bcomp.raxml_tree aln_opt, options[:bootstrap] if options[:phylogeny] == 1
266
323
 
324
+ elsif ARGV[0] == "identify"
325
+
326
+ ARGV.shift
327
+ options = parseOptions_identify
328
+ if options[:genomes_list].empty?
329
+ puts "You need at least 1 genome fasta to identify !!"
330
+ usage_identify
331
+ abort
332
+ end
333
+ bident = BacterialIdentificator.new(options, ROOT)
334
+
267
335
  elsif ARGV[0] == "--version" or ARGV[0] == "-v"
268
336
 
269
337
  print_version
@@ -273,7 +341,6 @@ if ARGV.size >= 1
273
341
  usage
274
342
  end
275
343
 
276
-
277
344
  else
278
345
  usage
279
346
  end
@@ -22,8 +22,6 @@ class SequenceFasta
22
22
  abort "Aborting : The input sequence is not a fasta file !"
23
23
  end
24
24
 
25
- # @contigs = extract_contigs(@fasta_flat)
26
-
27
25
  @meta = meta
28
26
 
29
27
  @annotation_files = nil
@@ -38,7 +36,7 @@ class SequenceFasta
38
36
 
39
37
  @annotation_files = {}
40
38
  Dir.mkdir "#{outdir}" if ! Dir.exists? "#{outdir}"
41
- if @meta
39
+ if @meta==1
42
40
  system("#{root}/prodigal.linux -p meta -i #{@fasta_file} -a #{outdir}/Proteins.fa -d #{outdir}/Genes.fa -o #{outdir}/Genbanks.gbk -q")
43
41
  else
44
42
  system("#{root}/prodigal.linux -i #{@fasta_file} -a #{outdir}/Proteins.fa -d #{outdir}/Genes.fa -o #{outdir}/Genbanks.gbk -q")
@@ -100,7 +100,7 @@ class SequenceSynteny
100
100
  # first hit for query
101
101
  if ! @query_sequences[key].has_key? :homology
102
102
  @query_sequences[key][:conserved] = true
103
- @subject_sequences[key][:conserved] = true
103
+ # @subject_sequences[key][:conserved] = true
104
104
  @query_sequences[key][:homology] = {
105
105
  pId: lA[2].to_f.round(2),
106
106
  cov_query: cov_query,
@@ -119,7 +119,7 @@ class SequenceSynteny
119
119
  # query already got at least 1 hit and new_score > last_score
120
120
  elsif lA[11].to_f > @query_sequences[key][:homology][:score]
121
121
  @query_sequences[key][:conserved] = true
122
- @subject_sequences[key][:conserved] = true
122
+ # @subject_sequences[key][:conserved] = true
123
123
  @query_sequences[key][:homology] = {
124
124
  pId: lA[2].to_f.round(2),
125
125
  cov_query: cov_query,
@@ -26,7 +26,7 @@ class BacterialAnnotator
26
26
  @minlength = @options[:minlength].to_i
27
27
  @options[:minlength] = @options[:minlength].to_i
28
28
  @options[:pidentity] = @options[:pidentity].to_f
29
- @options[:pidentispacemacs-lightty] = @options[:pidentity] * 100 if @options[:pidentity] <= 1.00
29
+ @options[:pidentity] = @options[:pidentity] * 100 if @options[:pidentity] <= 1.00
30
30
  @options[:pcoverage] = @options[:pcoverage].to_f
31
31
  @options[:pcoverage] = @options[:pcoverage] / 100 if @options[:pcoverage] > 1.00
32
32
 
@@ -200,8 +200,12 @@ class BacterialAnnotator
200
200
  "Prot-ExternalDB", @options[:pidentity],
201
201
  @options[:pcoverage], "prot")
202
202
 
203
- puts "# Running BLAT alignment with External Database.."
203
+ print "# Running BLAT alignment with External Database.."
204
+ start_time = Time.now
204
205
  @externaldb_synteny.run_blat @root, @options[:outdir]
206
+ end_time = Time.now
207
+ c_time = Helper.sec2str(end_time-start_time)
208
+ print "done (#{c_time})\n"
205
209
  @externaldb_synteny.extract_hits :externaldb
206
210
 
207
211
  @externaldb_synteny.query_sequences.each do |k, v|
@@ -214,6 +218,9 @@ class BacterialAnnotator
214
218
 
215
219
  next if ! v.has_key? :homology
216
220
 
221
+ if ! @contig_annotations_cds.has_key? contig_of_protein
222
+ @contig_annotations_cds[contig_of_protein] = []
223
+ end
217
224
  @contig_annotations_cds[contig_of_protein] << k
218
225
 
219
226
  hit_gi = v[:homology][:hits][0]
@@ -238,7 +245,6 @@ class BacterialAnnotator
238
245
  inference: inference
239
246
  }
240
247
 
241
-
242
248
  end
243
249
 
244
250
  end
@@ -256,12 +262,17 @@ class BacterialAnnotator
256
262
  gbk_path = @query_fasta.annotation_files[:gbk_path]
257
263
  gbk_to_annotate = SequenceAnnotation.new("#{gbk_path}/#{contig}.gbk", "#{gbk_path}")
258
264
 
259
- if @with_external_db
265
+ if @with_external_db and @with_refence_genome
260
266
  gbk_to_annotate.add_annotation_ref_synteny_prot(
261
267
  (@prot_synteny_refgenome.query_sequences.merge(@externaldb_synteny.query_sequences)),
262
268
  @contig_annotations_externaldb[contig].merge(@ref_genome.coding_seq),
263
269
  (File.basename @options[:refgenome]).gsub(/.gb.*/,"")
264
270
  )
271
+ elsif @with_external_db
272
+ gbk_to_annotate.add_annotation_ref_synteny_prot(
273
+ @externaldb_synteny.query_sequences,
274
+ @contig_annotations_externaldb[contig]
275
+ )
265
276
  else
266
277
  gbk_to_annotate.add_annotation_ref_synteny_prot(
267
278
  @prot_synteny_refgenome.query_sequences,
@@ -270,7 +281,7 @@ class BacterialAnnotator
270
281
  )
271
282
  end
272
283
 
273
- if @contig_annotations_rna.has_key? contig
284
+ if @contig_annotations_rna and @contig_annotations_rna.has_key? contig
274
285
  # puts "RNA annotation"
275
286
  gbk_to_annotate.add_annotations @contig_annotations_rna[contig], "new"
276
287
  end
@@ -419,6 +430,8 @@ class BacterialAnnotator
419
430
  # >sp|C7C422|BLAN1_KLEPN Beta-lactamase NDM-1 OS=Klebsiella pneumoniae GN=blaNDM-1 PE=1 SV=1
420
431
  # TrEMBL
421
432
  # >tr|E5KIY2|E5KIY2_ECOLX Beta-lactamase NDM-1 OS=Escherichia coli GN=blaNDM-1 PE=1 SV=1
433
+ # MERGEM
434
+ # >Genome_ID|location|Protein_ID|LocusTag|Gene|Protein_Product
422
435
 
423
436
  ref_cds = {}
424
437
 
@@ -436,7 +449,10 @@ class BacterialAnnotator
436
449
  product = ""
437
450
  db_source = "[DBSource]"
438
451
 
439
- if product_long.include? " [" and product_long.include? "]" # NCBI
452
+ if product_long.scan(/|/).count >= 5 # MERGEM
453
+ product = product_long
454
+ db_source = "RefSeq"
455
+ elsif product_long.include? " [" and product_long.include? "]" # NCBI
440
456
  organism = product_long[/\[.*?\]/]
441
457
  product = product_long.split(" [")[0].strip
442
458
  elsif product_long.include? "OS=" # Swissprot / TrEMBL
@@ -465,6 +481,9 @@ class BacterialAnnotator
465
481
  db_source = "UniProtKB"
466
482
  end
467
483
  prot_id = lA[1]
484
+ elsif key_gi.count("|") == 5
485
+ db_source = "RefSeq"
486
+ prot_id = lA[2]
468
487
  end
469
488
 
470
489
  ref_cds[key_gi] = {product: product, org: org, prot_id: prot_id, db_source: db_source}
@@ -153,7 +153,11 @@ class BacterialComparator
153
153
  stats = {}
154
154
  stats[:syntenic] = []
155
155
  fout = File.open("#{@outdir}/cds-synteny.tsv", "w")
156
- fout.write("Gene\t"+@genomes_list.join("\t")+"\n")
156
+ genomes_name = []
157
+ @genomes_list.each do |g|
158
+ genomes_name.push(File.basename(g))
159
+ end
160
+ fout.write("Gene\t"+genomes_name.join("\t")+"\n")
157
161
 
158
162
  to_build_multifasta = []
159
163
 
@@ -386,7 +390,6 @@ class BacterialComparator
386
390
 
387
391
  end
388
392
 
389
-
390
393
  def raxml_tree_dna bt
391
394
  print "# Genes DNA tree creation (RAXML).."
392
395
  start_time = Time.now
@@ -423,7 +426,6 @@ class BacterialComparator
423
426
  print "done (#{c_time})\n"
424
427
  end
425
428
 
426
-
427
429
  def raxml_tree aln_opt, bt
428
430
 
429
431
  if aln_opt == "both"
@@ -0,0 +1,64 @@
1
+ # -*- coding: utf-8 -*-
2
+ # author: maxime déraspe
3
+ # email: maximilien1er@gmail.com
4
+ # review:
5
+ # date: 17-10-12
6
+ # version: 0.0.1
7
+ # licence:
8
+
9
+ require 'bio'
10
+ require 'fileutils'
11
+ require 'parallel'
12
+ require 'helper'
13
+
14
# Compares genome fasta files against a Mash sketch database and writes the
# hits of each genome, closest first, to "<genome>.msh_dist".
class BacterialIdentificator

  # NOTE(review): :stats is exposed but never assigned in this class as shown.
  attr_reader :genomes_list, :stats

  # Initialize BacterialIdentificator and run Mash on every listed genome.
  #
  # options - Hash with :database (directory containing
  #           species-sequences.msh), :genomes_list (Array of fasta paths)
  #           and :proc (number of threads handed to mash).
  # root    - directory containing the mash.linux executable.
  def initialize options, root

    @root = root
    @db_path = options[:database]
    @genomes_list = options[:genomes_list]
    @proc = options[:proc].to_i

    @genomes_list.each do |g|
      mash_genome g
    end

  end


  # Run `mash dist` for one genome against the database sketch and write the
  # matching hits, sorted by increasing Mash distance, to "<genome>.msh_dist".
  #
  # genome - path of the fasta file to compare.
  def mash_genome genome

    # Reference-ID, Query-ID, Mash-distance, P-value, and Matching-hashes
    # fields = ["hit","query","distance","pvalue","match"]

    # Array form: no shell is involved, so paths with spaces or shell
    # metacharacters are passed through verbatim.
    cmd = ["#{@root}/mash.linux", "dist"]
    cmd.push("-p", @proc.to_s) if @proc > 0
    cmd.push("#{@db_path}/species-sequences.msh", genome)

    results_raw = IO.popen(cmd, &:read)
    warn "mash dist failed for #{genome}" unless $?.success?

    results = []

    results_raw.split("\n").each do |l|
      lA = l.chomp.split("\t")
      next if lA.size < 5                     # blank or malformed line
      next if lA[-1].split("/")[0] == '0'     # no match
      results << lA
    end

    # Distances may be printed in scientific notation (e.g. 2.5e-05), so they
    # must be compared as numbers, not as strings.
    results_sorted = results.sort_by { |fields| fields[2].to_f }

    File.open("#{genome}.msh_dist", "w") do |fout|
      results_sorted.each do |f|
        fout.write(f.join("\t"))
        fout.write("\n")
      end
    end

  end

end
64
+
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bacterial-annotator
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.6
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Maxime Deraspe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-10-10 00:00:00.000000000 Z
11
+ date: 2017-11-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bio
@@ -83,6 +83,7 @@ executables:
83
83
  - ba_fasta36
84
84
  - ba_cdhit
85
85
  - ba_fasttree
86
+ - ba_mash
86
87
  extensions: []
87
88
  extra_rdoc_files: []
88
89
  files:
@@ -92,6 +93,7 @@ files:
92
93
  - bin/ba_fasta36
93
94
  - bin/ba_fasttree
94
95
  - bin/ba_mafft
96
+ - bin/ba_mash
95
97
  - bin/ba_prodigal
96
98
  - bin/ba_raxml
97
99
  - bin/bacterial-annotator
@@ -100,6 +102,7 @@ files:
100
102
  - lib/bacterial-annotator/sequence-fasta.rb
101
103
  - lib/bacterial-annotator/sequence-synteny.rb
102
104
  - lib/bacterial-comparator.rb
105
+ - lib/bacterial-identificator.rb
103
106
  - lib/helper.rb
104
107
  homepage: http://rubygems.org/gems/bacterial-annotator
105
108
  licenses: