miga-base 1.0.5.0 → 1.0.5.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: bc2d6a45a59d47e3490d1e6bbe728961abfc28002c945c0fc371b24277f7abcd
4
- data.tar.gz: c20265a37c53e403858c592a72b2de801ab6ce12e48f0fe88d886cea41a94413
3
+ metadata.gz: 9867b3df06290fb7eb25e10b39dd439c9db52d33c9beb3657e76b1f8da570c47
4
+ data.tar.gz: a53428a16e8d34b48e0e889c54169f287ad34473ad935885b27d7eb74bec090e
5
5
  SHA512:
6
- metadata.gz: 91804137814911287b5f1395e79b28e2805a0584dfb5ff234ca51a2366d17d497acf8a88ee80e18a1b8dd27a1b159ab3433471f82b389fd741825a1f5da57da5
7
- data.tar.gz: 97d932379e302a22b2c0f489b5dd6cdd5efe4408c77e9cd32bb3b89efff3bc5f3787d167bc8f46720777dcc4d2efa06fb9b82f77e1903735a4b6e631e20be6f9
6
+ metadata.gz: 53305fda5e76e31d2fae77480de9c6c66b32b3780b27b5f028fba2f95bcebe6bde23635dc6f456244533c5859d732c48c1e595a9b3ddba4a4b8a6cca679ab6df
7
+ data.tar.gz: 822cd0ded5cb9508406e313abca2fdfb36899dd444e20bbc8839d28fafea2d647b4eeafc87e16e30e6918b0d3bf106ed0a46cfa4482ed6c53caa93f21a98c496
data/lib/miga/version.rb CHANGED
@@ -12,7 +12,7 @@ module MiGA
12
12
  # - String indicating release status:
13
13
  # - rc* release candidate, not released as gem
14
14
  # - [0-9]+ stable release, released as gem
15
- VERSION = [1.0, 5, 0].freeze
15
+ VERSION = [1.0, 5, 1].freeze
16
16
 
17
17
  ##
18
18
  # Nickname for the current major.minor version.
@@ -589,7 +589,17 @@
589
589
  "|",
590
590
  { "arg": "task" },
591
591
  "--blast",
592
- "/dev/stdin",
592
+ "-",
593
+ {
594
+ "name": "Output",
595
+ "opt": "--out",
596
+ "arg": "out_file",
597
+ "mandatory": true,
598
+ "description": [
599
+ "Output filtered BLAST file.",
600
+ "Supports compression with .gz extension."
601
+ ]
602
+ },
593
603
  {
594
604
  "opt": "--top",
595
605
  "arg": "integer",
@@ -599,19 +609,13 @@
599
609
  {
600
610
  "opt": "--sort-by",
601
611
  "arg": "select",
602
- "values": ["bitscore", "evalue", "identity", "length"],
612
+ "values": ["bitscore", "evalue", "identity", "length", "no"],
603
613
  "default": "bitscore",
604
614
  "description": "Parameter used to detect the 'best' hits."
605
615
  },
606
616
  {
607
617
  "opt": "--quiet",
608
618
  "description": "Run quietly."
609
- },
610
- ">",
611
- {
612
- "arg": "out_file",
613
- "mandatory": true,
614
- "description": "Output (filtered) Tabular BLAST."
615
619
  }
616
620
  ]
617
621
  },
@@ -536,8 +536,10 @@
536
536
  },
537
537
  {
538
538
  "task": "rbm.rb",
539
- "description": ["Finds the reciprocal best matches between two sets of",
540
- "sequences."],
539
+ "description": [
540
+ "Finds the reciprocal best matches between two sets of",
541
+ "sequences."
542
+ ],
541
543
  "help_arg": "--help",
542
544
  "cite":[
543
545
  ["Camacho et al, 2009, BMC Bioinf (BLAST+)",
@@ -564,11 +566,22 @@
564
566
  "mandatory": true,
565
567
  "description": "FastA file containing the genome 2."
566
568
  },
569
+ {
570
+ "name": "Output",
571
+ "opt": "--out",
572
+ "arg": "out_file",
573
+ "mandatory": true,
574
+ "description": [
575
+ "Reciprocal Best Matches in BLAST tabular format.",
576
+ "Supports compression with .gz extension."
577
+ ]
578
+ },
567
579
  {
568
580
  "name": "Nucleotides",
569
581
  "opt": "--nucl",
570
- "description": ["Sequences are assumed to be nucleotides (proteins",
571
- "by default)."]
582
+ "description": [
583
+ "Sequences are assumed to be nucleotides (proteins by default)."
584
+ ]
572
585
  },
573
586
  {
574
587
  "name": "Length",
@@ -582,8 +595,10 @@
582
595
  "opt": "--fract",
583
596
  "arg": "float",
584
597
  "default": 0.0,
585
- "description": ["Minimum alignment length (as a fraction of the",
586
- "query). If set, requires BLAST+ or Diamond (see Program)."]
598
+ "description": [
599
+ "Minimum alignment length (as a fraction of the query).",
600
+ "If set, requires BLAST+ or Diamond (see Program)."
601
+ ]
587
602
  },
588
603
  {
589
604
  "name": "Identity",
@@ -603,8 +618,9 @@
603
618
  "name": "Executables",
604
619
  "opt": "--bin",
605
620
  "arg": "in_dir",
606
- "description": ["Directory containing the binaries of the search",
607
- "program."]
621
+ "description": [
622
+ "Directory containing the binaries of the search program."
623
+ ]
608
624
  },
609
625
  {
610
626
  "opt": "--program",
@@ -612,9 +628,11 @@
612
628
  "values": ["blast+", "blast", "blat", "diamond"],
613
629
  "default": "blast+",
614
630
  "description": "Search program to be used. Default: blast+.",
615
- "note": ["Make sure that you have installed the search program you",
631
+ "note": [
632
+ "Make sure that you have installed the search program you",
616
633
  "want to use. If you have downloaded the program, but it's not",
617
- "installed, please use the Executables option above."]
634
+ "installed, please use the Executables option above."
635
+ ]
618
636
  },
619
637
  {
620
638
  "opt": "--threads",
@@ -625,12 +643,6 @@
625
643
  {
626
644
  "opt": "--quiet",
627
645
  "description": "Run quietly (no STDERR output)."
628
- },
629
- ">",
630
- {
631
- "arg": "out_file",
632
- "mandatory": true,
633
- "description": "Reciprocal Best Matches in BLAST tabular format."
634
646
  }
635
647
  ]
636
648
  }
@@ -26,10 +26,14 @@
26
26
  },
27
27
  {
28
28
  "task": "rbm.rb",
29
- "description": ["Reciprocal Best Matches between the proteomes of the",
30
- "two major HIV types (HIV-1 and HIV-2)."],
31
- "values": ["hiv1.faa","hiv2.faa",null,null,null,null,null,null,null,null,
32
- null,null,"hiv1-hiv2.rbm"]
29
+ "description": [
30
+ "Reciprocal Best Matches between the proteomes of the",
31
+ "two major HIV types (HIV-1 and HIV-2)."
32
+ ],
33
+ "values": [
34
+ "hiv1.faa", "hiv2.faa", "hiv1-hiv2.rbm", null, null, null, null, null,
35
+ null, null,null, null
36
+ ]
33
37
  },
34
38
  {
35
39
  "task": "ogs.mcl.rb",
@@ -83,10 +87,14 @@
83
87
  {
84
88
  "_": "== Examples of BLAST statistics and manipulation",
85
89
  "task": "BlastTab.topHits_sorted.rb",
86
- "description": ["Extract the best match of metagenome-derived proteins",
87
- "(from the 'A metagenome') against a Gene Ontology collection."],
88
- "values": ["sort","a_mg.cds-go.blast.tsv",null,null,null,null,1,null,null,
89
- null,"a_mg.cds-go.blast-bm.tsv"]
90
+ "description": [
91
+ "Extract the best match of metagenome-derived proteins",
92
+ "(from the 'A metagenome') against a Gene Ontology collection."
93
+ ],
94
+ "values": [
95
+ "sort","a_mg.cds-go.blast.tsv", null, null, null, null,
96
+ "a_mg.cds-go.blast-bm.tsv", 1, null, null
97
+ ]
90
98
  },
91
99
  {
92
100
  "task": "BlastTab.sumPerHit.pl",
@@ -1,101 +1,123 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- #
4
- # @author: Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
5
- # @update: Dec-11-2015
6
- # @license: artistic license 2.0
7
- #
3
+ # frozen_string_literal: true
8
4
 
9
- require "optparse"
5
+ $:.push File.expand_path('../lib', __FILE__)
6
+ require 'enveomics_rb/enveomics'
7
+ require 'enveomics_rb/match'
8
+ $VERSION = 1.0
10
9
 
11
- $opts = {n:5, sortby:"bitscore", q:false}
12
- $cols = {"bitscore"=>11, "evalue"=>10, "identity"=>2, "length"=>3}
13
- ARGV << "-h" if ARGV.size==0
10
+ o = { n: 5, sortby: :bitscore, out: '-' }
14
11
  OptionParser.new do |opts|
15
- opts.banner = "Reports the top-N best hits of a BLAST, pre-sorted by query."
16
- opts.separator ""
17
- opts.separator "Mandatory"
18
- opts.on("-i", "--blast FILE",
19
- "Path to the BLAST file."){ |v| $opts[:blast]=v }
20
- opts.separator ""
21
- opts.separator "Optional"
22
- opts.on("-n", "--top INTEGER",
23
- "Maximum number of hits to report for each query.",
24
- "By default: #{$opts[:n]}"){ |v| $opts[:n]=v.to_i }
25
- opts.on("-s", "--sort-by STRING",
26
- "Parameter used to detect the 'best' hits.",
27
- "Any of: bitscore (default), evalue, identity, length."
28
- ){ |v| $opts[:sortby]=v }
29
- opts.on("-q", "--quiet", "Run quietly."){ $opts[:q]=true }
30
- opts.on("-h", "--help", "Display this screen") do
31
- puts opts
32
- exit
33
- end
34
- opts.separator ""
12
+ opts.version = $VERSION
13
+ Enveomics.opt_banner(
14
+ opts, 'Reports the top-N best hits of a BLAST, pre-sorted by query',
15
+ "#{File.basename($0)} -i in.tsv -o out.tsv [options]"
16
+ )
17
+
18
+ opts.separator 'Mandatory'
19
+ opts.on(
20
+ '-i', '--blast FILE',
21
+ 'Path to the BLAST file',
22
+ 'Supports compression with .gz extension, use - for STDIN'
23
+ ) { |v| o[:in] = v }
24
+ opts.on(
25
+ '-o', '--out FILE',
26
+ 'Output filtered BLAST file',
27
+ 'Supports compression with .gz extension, use - for STDOUT (default)'
28
+ ) { |v| o[:out] = v }
29
+ opts.separator ''
30
+ opts.separator 'Filter Options'
31
+ opts.on(
32
+ '-n', '--top INTEGER', Integer,
33
+ 'Maximum number of hits to report for each query',
34
+ "By default: #{o[:n]}"
35
+ ) { |v| o[:n] = v }
36
+ opts.on(
37
+ '-s', '--sort-by STRING',
38
+ 'Parameter used to detect the "best" hits',
39
+ 'Any of: bitscore (default), evalue, identity, length, no (pick first)'
40
+ ) { |v| o[:sortby] = v.to_sym }
41
+ opts.separator ''
42
+ opts.separator 'Other Options'
43
+ opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { $QUIET = true }
44
+ opts.on('-h', '--help', 'Display this screen') { puts opts; exit }
45
+ opts.separator ''
35
46
  end.parse!
36
47
 
37
- abort "-i/--blast is mandatory." if $opts[:blast].nil?
38
- abort "Unrecognized value for -s/--sortby: #{$opts[:sortby]}." if
39
- $cols[ $opts[:sortby] ].nil?
40
-
41
- class Hit
42
- attr_reader :blast_line
43
- def initialize(blast_line)
44
- @blast_line = blast_line.chomp.split(/\t/)
45
- end
46
- def col(i)
47
- @blast_line[i]
48
- end
49
- def <=>(other)
50
- ans = self.col( $cols[ $opts[:sortby] ] ).to_f <=> other.col( $cols[ $opts[:sortby] ] ).to_f
51
- ans = ans * -1 unless $opts[:sortby] == "evalue"
52
- return ans
53
- end
54
- def to_s
55
- @blast_line.join("\t")
56
- end
48
+ raise Enveomics::OptionError.new('-i is mandatory') if o[:in].nil?
49
+ unless o[:sortby] == :no || Enveomics::Match.column(o[:sortby])
50
+ raise Enveomics::OptionError.new("Unrecognized value for -s: #{o[:sortby]}")
57
51
  end
58
52
 
59
- class HitSet
60
- attr_reader :query, :hits
61
- def initialize
62
- @hits = []
63
- @query = nil
64
- end
65
- def <<(hit)
66
- @query = hit.col(0) if @query.nil?
67
- raise "Inconsistent query, expecting #{self.query}" unless
68
- self.query == hit.col(0)
69
- @hits << hit
70
- end
71
- def empty?
72
- self.hits.length == 0
73
- end
74
- def filter!
75
- @hits.sort!
76
- @hits.slice!($opts[:n], @hits.length)
77
- end
78
- def to_s
79
- @hits.join("\n")
80
- end
53
+ class Enveomics::Match
54
+ attr_accessor :sortby
55
+
56
+ def <=>(other)
57
+ ans = send(sortby) <=> other.send(sortby)
58
+ sortby == :evalue ? ans : ans * -1
59
+ end
81
60
  end
82
61
 
83
- $stderr.puts "Parsing BLAST." unless $opts[:q]
84
- fh = File.open $opts[:blast], "r"
85
- hs = HitSet.new
86
- while ln=fh.gets
87
- hit = Hit.new( ln )
88
- if hs.query != hit.col(0)
89
- hs.filter!
90
- puts hs unless hs.empty?
91
- hs = HitSet.new
92
- $stderr.print "Parsing line #{$.}... \r" unless $opts[:q]
93
- end
94
- hs << hit
62
+ class Enveomics::MatchSet
63
+ attr_reader :query, :hits, :sortby
64
+
65
+ def initialize(sortby)
66
+ @hits = []
67
+ @query = nil
68
+ @sortby = sortby
69
+ end
70
+
71
+ def <<(hit)
72
+ @query ||= hit.qry
73
+ unless query == hit.qry
74
+ raise "Inconsistent query, expecting #{query}"
75
+ end
76
+
77
+ @hits << hit.tap { |i| i.sortby = sortby }
78
+ end
79
+
80
+ def empty?
81
+ hits.empty?
82
+ end
83
+
84
+ def filter!(n)
85
+ @hits.sort! unless sortby == :no
86
+ @hits.slice!(n, @hits.length)
87
+ end
88
+
89
+ def to_s
90
+ hits.join("\n")
91
+ end
95
92
  end
96
- $stderr.print "Parsed #{$.} lines. \n" unless $opts[:q]
97
- fh.close
98
93
 
99
- hs.filter!
100
- puts hs unless hs.empty?
94
+ begin
95
+ ifh = reader(o[:in])
96
+ ofh = writer(o[:out])
97
+
98
+ say 'Parsing BLAST'
99
+ hs = Enveomics::MatchSet.new(o[:sortby])
100
+ lno = 0
101
+ ifh.each do |ln|
102
+ lno += 1
103
+ hit = Enveomics::Match.new(ln)
104
+ if hs.query != hit.qry
105
+ hs.filter! o[:n]
106
+ ofh.puts hs unless hs.empty?
107
+ hs = Enveomics::MatchSet.new(o[:sortby])
108
+ say_inline("Parsing line #{lno}... \r")
109
+ end
110
+ hs << hit
111
+ end
112
+ say("Parsed #{lno} lines ")
113
+ ifh.close
114
+
115
+ hs.filter! o[:n]
116
+ ofh.puts hs unless hs.empty?
117
+ ofh.close
118
+ rescue => err
119
+ $stderr.puts "Exception: #{err}\n\n"
120
+ err.backtrace.reverse.each { |l| $stderr.puts "DEBUG: %s\n" % l }
121
+ err
122
+ end
101
123
 
@@ -316,6 +316,7 @@ Dir.mktmpdir do |dir|
316
316
  end
317
317
  res = File.open(o[:res], "w") unless o[:res].nil?
318
318
  rbm = File.open(o[:rbm], "w") unless o[:rbm].nil?
319
+ sqlite_db.execute('BEGIN TRANSACTION') unless o[:sqlite3].nil?
319
320
  [1,2].each do |i|
320
321
  qry_seen = []
321
322
  q = "#{dir}/seq#{i}.fa"
@@ -413,6 +414,7 @@ Dir.mktmpdir do |dir|
413
414
  o[:sqlite3].nil?
414
415
  puts id2/n2 if o[:auto]
415
416
  end
417
+ sqlite_db.execute('COMMIT') unless o[:sqlite3].nil?
416
418
  res.close unless o[:res].nil?
417
419
  fo.close unless o[:out].nil?
418
420
  end
@@ -4,44 +4,69 @@ module Enveomics
4
4
  # A simple object representing a sequence match from a search engine
5
5
  # supporting tabular BLAST output
6
6
  class Match
7
- attr :row
7
+ class << self
8
+ def column_types
9
+ {
10
+ qseqid: String, sseqid: String, pident: Float,
11
+ length: Integer, mismatch: Integer, gapopen: Integer,
12
+ q_start: Integer, q_end: Integer, s_start: Integer,
13
+ s_end: Integer, evalue: Float, bitscore: Float,
14
+ # Non-standard (but frequently used in Enveomics Collection):
15
+ qry_len: Integer, sbj_len: Integer
16
+ }
17
+ end
8
18
 
9
- ##
10
- # Initialize Enveomics::Match object from a tabular blast line String +ln+
11
- def initialize(ln)
12
- @row = ln.chomp.split("\t")
13
- end
19
+ def column_type(sym)
20
+ column_types[colname(sym)]
21
+ end
14
22
 
15
- def qry
16
- row[0]
17
- end
23
+ def to_column_type(sym, value)
24
+ case column_type(sym).to_s
25
+ when 'String' ; value.to_s
26
+ when 'Float' ; value.to_f
27
+ when 'Integer'; value.to_i
28
+ end
29
+ end
18
30
 
19
- def sbj
20
- row[1]
21
- end
31
+ def columns
32
+ column_types.keys
33
+ end
22
34
 
23
- def id
24
- @id ||= row[2].to_f
25
- end
35
+ def column(sym)
36
+ columns.index(colname(sym))
37
+ end
26
38
 
27
- def len
28
- @len ||= row[3].to_i
29
- end
39
+ def colsynonyms
40
+ {
41
+ qry: :qseqid, sbj: :sseqid,
42
+ id: :pident, len: :length, score: :bitscore
43
+ }
44
+ end
30
45
 
31
- def evalue
32
- @evalue ||= row[9].to_f
33
- end
46
+ def colnames
47
+ columns + colsynonyms.keys
48
+ end
34
49
 
35
- def score
36
- @score ||= row[10].to_f
50
+ def colname(sym)
51
+ s = sym.to_sym
52
+ column_types[s] ? s : colsynonyms[s]
53
+ end
37
54
  end
38
55
 
39
- def qry_len
40
- @qry_len ||= row[12].to_i
56
+ ####--- Instance Level ---###
57
+
58
+ attr :row
59
+
60
+ ##
61
+ # Initialize Enveomics::Match object from a tabular blast line String +ln+
62
+ def initialize(ln)
63
+ @row = ln.chomp.split("\t")
41
64
  end
42
65
 
43
- def sbj_len
44
- @sbj_len ||= row[13].to_i
66
+ colnames.each do |sym|
67
+ define_method sym do
68
+ self.class.to_column_type(sym, row[self.class.column(sym)])
69
+ end
45
70
  end
46
71
 
47
72
  def qry_fract
@@ -18,10 +18,11 @@ def use(gems, mandatory = true)
18
18
  end
19
19
 
20
20
  def say(*msg)
21
- return if $QUIET ||= false
21
+ $stderr.puts('[%s] %s' % [Time.now, msg.join('')]) unless $QUIET ||= false
22
+ end
22
23
 
23
- o = '[%s] %s' % [Time.now, msg.join('')]
24
- $stderr.puts(o)
24
+ def say_inline(*msg)
25
+ $stderr.print('[%s] %s' % [Time.now, msg.join('')]) unless $QUIET ||= false
25
26
  end
26
27
 
27
28
  ##
@@ -2,18 +2,19 @@
2
2
 
3
3
  # frozen_string_literal: true
4
4
 
5
- $VERSION = 1.0
5
+ $VERSION = 1.01
6
6
  $:.push File.expand_path('../lib', __FILE__)
7
7
  require 'enveomics_rb/rbm'
8
8
  require 'tmpdir'
9
9
 
10
10
  bms_dummy = Enveomics::RBM.new('1', '2').bms1
11
- o = { q: false }
11
+ o = { q: false, out: '-' }
12
12
  %i[thr len id fract score bin program nucl].each do |k|
13
13
  o[k] = bms_dummy.opt(k)
14
14
  end
15
15
 
16
16
  OptionParser.new do |opts|
17
+ opts.version = $VERSION
17
18
  cmd = File.basename($0)
18
19
  opts.banner = <<~BANNER
19
20
 
@@ -34,6 +35,11 @@ OptionParser.new do |opts|
34
35
  '-2', '--seq2 FILE',
35
36
  'Path to the FastA file containing the set 2'
36
37
  ) { |v| o[:seq2] = v }
38
+ opts.on(
39
+ '-o', '--out FILE',
40
+ 'Reciprocal Best Matches in BLAST tabular format.',
41
+ 'Supports compression with .gz extension, use - for STDOUT (default)'
42
+ ) { |v| o[:out] = v }
37
43
  opts.separator ''
38
44
  opts.separator 'Search Options'
39
45
  opts.on(
@@ -80,7 +86,7 @@ OptionParser.new do |opts|
80
86
  ) { |v| o[:thr] = v }
81
87
  opts.separator ''
82
88
  opts.separator 'Other Options'
83
- opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
89
+ opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { $QUIET = true }
84
90
  opts.on('-h', '--help', 'Display this screen') { puts opts ; exit }
85
91
  opts.separator ''
86
92
  end.parse!
@@ -90,10 +96,12 @@ raise Enveomics::OptionError.new('-2 is mandatory') if o[:seq2].nil?
90
96
  raise Enveomics::OptionError.new(
91
97
  'Argument -f/--fract requires -p blast+ or -p diamond'
92
98
  ) if o[:fract] > 0.0 && !%i[blast+ diamond].include?(o[:program])
93
- $QUIET = o[:q]
94
99
 
95
100
  rbm = Enveomics::RBM.new(o[:seq1], o[:seq2], o)
96
- rbm.each { |bm| puts bm.to_s }
101
+ ofh = writer(o[:out])
102
+ rbm.each { |bm| ofh.puts bm.to_s }
103
+ ofh.close
104
+
97
105
  say('Forward Best Matches: ', rbm.bms1.count)
98
106
  say('Reverse Best Matches: ', rbm.bms2.count)
99
107
  say('Reciprocal Best Matches: ', rbm.count)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.5.0
4
+ version: 1.0.5.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R