mgnu 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +0 -0
  3. data/README.md +31 -0
  4. data/Rakefile +33 -0
  5. data/lib/mgnu.rb +9 -0
  6. data/lib/mgnu/alignment.rb +143 -0
  7. data/lib/mgnu/common.rb +68 -0
  8. data/lib/mgnu/genbank.rb +117 -0
  9. data/lib/mgnu/genbank/feature.rb +84 -0
  10. data/lib/mgnu/genbank/location.rb +150 -0
  11. data/lib/mgnu/genbank/qualifier.rb +45 -0
  12. data/lib/mgnu/genbank/reference.rb +114 -0
  13. data/lib/mgnu/genbank/source.rb +39 -0
  14. data/lib/mgnu/loggable.rb +61 -0
  15. data/lib/mgnu/parser.rb +50 -0
  16. data/lib/mgnu/parser/blast.rb +87 -0
  17. data/lib/mgnu/parser/blast/format0.rb +290 -0
  18. data/lib/mgnu/parser/blast/format7.rb +121 -0
  19. data/lib/mgnu/parser/blast/format8.rb +120 -0
  20. data/lib/mgnu/parser/blast/hsp.rb +75 -0
  21. data/lib/mgnu/parser/blast/query.rb +45 -0
  22. data/lib/mgnu/parser/blast/sbjct.rb +62 -0
  23. data/lib/mgnu/parser/clustalw.rb +72 -0
  24. data/lib/mgnu/parser/fasta.rb +61 -0
  25. data/lib/mgnu/parser/fasta_header_index.rb +39 -0
  26. data/lib/mgnu/parser/fasta_index.rb +57 -0
  27. data/lib/mgnu/parser/fastq.rb +61 -0
  28. data/lib/mgnu/parser/genbank.rb +187 -0
  29. data/lib/mgnu/parser/gff.rb +56 -0
  30. data/lib/mgnu/parser/iprscan/hit.rb +76 -0
  31. data/lib/mgnu/parser/iprscan_file.rb +39 -0
  32. data/lib/mgnu/parser/kegg_ontology_index.rb +163 -0
  33. data/lib/mgnu/parser/pilercr.rb +102 -0
  34. data/lib/mgnu/parser/prodigal.rb +170 -0
  35. data/lib/mgnu/parser/sam.rb +115 -0
  36. data/lib/mgnu/parser/sam/alignment.rb +22 -0
  37. data/lib/mgnu/parser/sam/header.rb +23 -0
  38. data/lib/mgnu/parser/sam/pair.rb +18 -0
  39. data/lib/mgnu/sequence.rb +207 -0
  40. data/lib/mgnu/sequence/fasta.rb +79 -0
  41. data/lib/mgnu/sequence/fastq.rb +43 -0
  42. data/lib/mgnu/version.rb +16 -0
  43. data/mgnu.gemspec +39 -0
  44. data/spec/mgnu/parser/blast_format0_spec.rb +114 -0
  45. data/spec/mgnu/parser/blast_format7_spec.rb +24 -0
  46. data/spec/mgnu/parser/blast_format8_spec.rb +26 -0
  47. data/spec/mgnu/parser/blast_multihsp_spec.rb +100 -0
  48. data/spec/mgnu/parser/blast_oof_spec.rb +53 -0
  49. data/spec/mgnu/parser/clustalw_spec.rb +90 -0
  50. data/spec/mgnu/parser/fasta_header_index_tc_parser_spec.rb +25 -0
  51. data/spec/mgnu/parser/fasta_index_tc_parser_spec.rb +25 -0
  52. data/spec/mgnu/parser/fasta_parser_spec.rb +53 -0
  53. data/spec/mgnu/parser_spec.rb +22 -0
  54. data/spec/mgnu/sequence/fasta_spec.rb +60 -0
  55. data/spec/mgnu/sequence/fastq_spec.rb +31 -0
  56. data/spec/mgnu/sequence_spec.rb +81 -0
  57. data/spec/mgnu_spec.rb +7 -0
  58. data/spec/spec_helper.rb +53 -0
  59. metadata +376 -0
@@ -0,0 +1,57 @@
1
+ require 'moneta'
2
+ require 'json'
3
+
4
module MgNu
  module Parser
    # Random-access index over the records of a FASTA file.
    #
    # Each record is stored in a Moneta key/value store backed by a Tokyo
    # Cabinet hash database. The key is the record's header name; the value
    # is a JSON document with the keys 'description' and 'sequence'.
    class FastaIndex
      attr_reader :filename, :db_name, :db, :db_type

      # Create a new FastaIndex, or reopen an existing one.
      #
      # @param filename [String] path of the FASTA file to index, or the path
      #   of an index file that already ends in ".tch"
      # @param options [Hash] :db_type selects the storage backend; only
      #   :TokyoCabinet (the default) is supported
      # @raise [ArgumentError] if :db_type is not a supported backend
      def initialize(filename, options = {})
        options = { :db_type => :TokyoCabinet }.merge(options)
        @db_type = options[:db_type]
        @filename = filename

        # Fail early with a clear message instead of a NoMethodError on a nil
        # store later on.
        unless @db_type == :TokyoCabinet
          raise ArgumentError, "unsupported db_type: #{@db_type.inspect}"
        end

        # \A / \z anchor the whole string; ^ / $ would also match per line.
        existing_index = !(@filename =~ /\A.+\.tch\z/).nil?
        @db_name = existing_index ? @filename : "#{@filename}.tch"
        @db = Moneta.new(:TokyoCabinet, file: @db_name, type: :hdb)

        # When the caller passes the .tch file itself there is no FASTA file
        # to read: the database file must not be parsed as if it were FASTA.
        parse unless existing_index
      end

      # Read every record of the FASTA file named by #filename and store it
      # in the database under its header name.
      def parse
        MgNu::Parser::Fasta.new(@filename).each do |record|
          payload = {
            'description' => record.header_description,
            'sequence'    => record.sequence
          }
          @db[record.header_name] = payload.to_json
        end
      end # end of #parse

      # Look up a record by header name.
      #
      # @param name [String] header name used as the key when indexing
      # @return [MgNu::Sequence::Fasta, nil] the rebuilt record, or nil when
      #   the name is not in the index
      def [](name)
        return nil unless @db && @db.key?(name)

        stored = JSON.parse(@db[name])
        MgNu::Sequence::Fasta.new(:header => "#{name} #{stored['description']}",
                                  :sequence => stored['sequence'])
      end

      # Close the underlying store, if one was opened.
      def close
        @db.close unless @db.nil?
      end
    end # end of MgNu::Parser::FastaIndex class
  end # end of MgNu::Parser module
end # end of MgNu module
@@ -0,0 +1,61 @@
1
module MgNu
  module Parser
    # Streaming reader for FASTQ files made of four-line records
    # (header, sequence, quality header, quality string).
    class Fastq
      include Enumerable
      attr_reader :file, :filename

      # Create a new Fastq parser.
      #
      # @param filename [String] path of an existing, readable FASTQ file
      # @raise [RuntimeError] if the file does not exist or is not readable
      # @note when no filename is given a message is written to $stderr and
      #   the process exits with status 1
      def initialize(filename = nil)
        @filename = filename

        unless @filename
          $stderr.puts("MgNu::Parser::Fastq.new(): need an existing fastq file name")
          exit(1)
        end

        unless File.exist?(@filename) && File.readable?(@filename)
          raise "\n\n -- No file by that name (#{@filename}). Exiting\n\n"
        end

        @file = File.open(@filename)
      end

      # Yield one MgNu::Sequence::Fastq per record until end of file.
      #
      # A malformed sequence header or quality header is reported on $stderr
      # and terminates the process with status 1. An empty quality header is
      # replaced by the sequence header; a non-empty one that differs from
      # the sequence header is kept and a warning is emitted.
      #
      # @return [Enumerator] when called without a block
      def each
        return enum_for(:each) unless block_given?

        until @file.eof? # keep reading until EOF
          header   = @file.readline.chomp
          sequence = @file.readline.chomp
          qualhdr  = @file.readline.chomp
          quality  = @file.readline.chomp

          header_match = /^@(.*)/.match(header)
          abort_parse("Malformed header!", header) unless header_match
          header = header_match[1]

          qual_match = /^\+(.*)/.match(qualhdr)
          abort_parse("Malformed quality header!", qualhdr) unless qual_match
          qualhdr = qual_match[1]

          if header != qualhdr
            # The pattern must cover the whole string: an unanchored /\s*/
            # matches everything and would hide every real mismatch.
            if qualhdr =~ /\A\s*\z/
              qualhdr = header
            else
              warn("Sequence header and quality header don't match!")
              warn("sequence: #{header}")
              warn(" quality: #{qualhdr}")
            end
          end

          yield MgNu::Sequence::Fastq.new(:header => header,
                                          :sequence => sequence,
                                          :qualhdr => qualhdr,
                                          :quality => quality)
        end # end of until @file.eof?
      end # end of #each

      private

      # Report an unparseable record on $stderr and exit with status 1.
      def abort_parse(message, offending_line)
        $stderr.puts message
        $stderr.puts "\n#{offending_line}"
        $stderr.puts "\nExiting at line #{@file.lineno}"
        exit(1)
      end
    end # end of MgNu::Parser::Fastq class
  end # end of MgNu::Parser module
end # end of MgNu module
@@ -0,0 +1,187 @@
1
module MgNu
  module Parser
    # Parser for GenBank flat files. Each LOCUS ... // section of the file
    # becomes one MgNu::Genbank object collected in #genbank_instances.
    #
    # Relies on #parse_until, #info and #error, which are not defined in this
    # class (presumably supplied by the included MgNu::Parser and
    # MgNu::Loggable modules -- confirm against those files).
    class Genbank
      attr_reader :file
      attr_accessor :genbank_instances

      include MgNu::Loggable
      include MgNu::Parser

      InvalidGenbankFile = Class.new(StandardError)

      LOCUS_REGEX = /^LOCUS\s+(\S+)\s+(\d+)\s+bp\s+(?:(ss-|ds-|ms-))?(\S+)\s+(?:(\S+)\s+)?(\S+)\s+(\S+)$/

      # Create a new Genbank parser.
      #
      # @param filename [String] path of an existing, readable GenBank file
      # @raise [RuntimeError] if no filename is given, or the file does not
      #   exist or is not readable
      def initialize(filename)
        @genbank_instances = []

        unless filename
          error("MgNu::Parser::Genbank#parse: need a filename")
          raise "no filename given!"
        end

        unless File.exist?(filename) && File.readable?(filename)
          error("MgNu::Parser::Genbank#parse: problems with filename")
          raise "File doesn't exist or is not readable!"
        end

        @file = File.open(filename)
      end

      # Parse every section of the file.
      #
      # @param debug [Boolean] when true, progress is reported through #info
      # @return [Array] the accumulated #genbank_instances
      # @raise [InvalidGenbankFile] on a missing or malformed LOCUS line
      def parse(debug = false)
        @debug = debug
        parse_section until file.eof?
        genbank_instances
      end

      # Parse one section, starting at the current position of #file, and
      # append the resulting MgNu::Genbank to #genbank_instances. A blank
      # line in place of a LOCUS line is skipped.
      #
      # @raise [InvalidGenbankFile] if the line is neither blank nor a valid
      #   LOCUS line
      def parse_section
        locus_line = file.readline
        md = locus_line.match(LOCUS_REGEX)

        unless md
          return if locus_line =~ /^\s*$/
          raise InvalidGenbankFile, "Missing or malformed LOCUS line."
        end

        genbank = MgNu::Genbank.new
        info("found a LOCUS line") if @debug
        genbank.locus = MgNu::Genbank::Locus.new(*md.captures)
        info("LOCUS name #{genbank.locus.name}") if @debug

        buffer = parse_until(file, /^ACCESSION/)
        if buffer.join =~ /^DEFINITION\s+(.+)$/m
          # collapse the multi-line definition and drop its final character
          genbank.definition = $1.gsub(/\n/, ' ').gsub(/\s{2,}/, ' ').strip.chop
          info genbank.definition if @debug
        end

        buffer = parse_until(file, /^VERSION/)
        # parsing ACCESSION number line
        if buffer.join =~ /^ACCESSION\s+(.+)$/
          accessions = $1.strip.squeeze(' ').split(' ')
          # multiple secondary accession numbers possible
          genbank.accession = accessions.shift
          genbank.secondary_accession = accessions
        end
        info "ACCESSION: #{genbank.accession}" if @debug

        buffer = parse_until(file, /^KEYWORDS/)
        # parsing VERSION line (and optional DBLINK line)
        buffer.each do |line|
          if line =~ /^VERSION\s+(.+)$/
            $1.strip.squeeze(' ').split.each do |version|
              if version =~ /GI:(\d+)/
                genbank.geninfo_identifier = $1.to_i
              else
                genbank.version = version
              end
            end
          elsif line =~ /^DBLINK\s+(.+)$/
            genbank.dblink = $1.strip.squeeze(' ')
          end
        end

        buffer = parse_until(file, /^SOURCE/)

        # parse keywords and optional segment
        keyword_lines = []
        buffer.each do |line|
          if line =~ /^KEYWORDS\s+(.+)$/
            keyword_lines << $1.strip.squeeze(' ')
          elsif line =~ /^SEGMENT\s+(.+)$/
            genbank.segment = $1.strip.squeeze(' ')
          else
            keyword_lines << line
          end
        end
        k = keyword_lines.join
        # An empty buffer yields no keywords at all; without this guard
        # k_array.last would be nil and chop! would raise.
        unless k.empty? || k == "."
          k_array = k.split(/;\s*/) # keywords are separated by semicolons
          k_array.last.chop! unless k_array.empty? # drops the period after the last keyword
          genbank.keywords = k_array
        end

        buffer = parse_until(file, /^FEATURES/)

        ri = buffer.index { |l| l =~ /^REFERENCE/ }
        ci = buffer.index { |l| l =~ /^COMMENT/ }

        if ri && ci
          genbank.source = MgNu::Genbank::Source.parse(buffer[0..ri - 1])
          parse_references(buffer[ri..ci - 1], genbank)
          genbank.comment = extract_comment(buffer[ci..-1])
        elsif ri
          genbank.source = MgNu::Genbank::Source.parse(buffer[0..ri - 1])
          parse_references(buffer[ri..-1], genbank)
        elsif ci
          genbank.source = MgNu::Genbank::Source.parse(buffer[0..ci - 1])
          genbank.comment = extract_comment(buffer[ci..-1])
        else
          # neither references nor comment line
          genbank.source = MgNu::Genbank::Source.parse(buffer)
        end

        info genbank.source.common_name if @debug
        info genbank.source.organism if @debug
        info genbank.source.lineage if @debug

        parse_features(parse_until(file, /^ORIGIN/), genbank)
        info "features count: #{genbank.features.length}" if @debug

        parse_sequence(parse_until(file, /\/\//), genbank)
        if @debug
          # plain-Ruby replacement for ActiveSupport's Object#try
          seq = genbank.sequence
          seq_length = (seq.respond_to?(:length) && seq.length) || 0
          info "sequence length: #{seq_length}"
        end
        file.readline # consumes end of section line //
        genbank_instances << genbank
      end

      # Split the FEATURES table into individual features and append each
      # parsed MgNu::Genbank::Feature to genbank.features.
      #
      # @param buffer [Array<String>] lines of the FEATURES section
      # @param genbank [MgNu::Genbank] record being built
      def parse_features(buffer, genbank)
        buffer.shift if buffer[0] =~ /^FEATURES/
        split_at_features(buffer.join("\n")).each do |feature_str|
          genbank.features << MgNu::Genbank::Feature.parse(feature_str)
        end
      end # end parse_features

      # Split the REFERENCE lines into single references and append each
      # parsed MgNu::Genbank::Reference to genbank.references.
      #
      # @param buffer [Array<String>] lines of the reference section
      # @param genbank [MgNu::Genbank] record being built
      def parse_references(buffer, genbank)
        split_at_header_tag(buffer.join("\n")).each do |ref|
          genbank.references << MgNu::Genbank::Reference.parse(ref)
        end
      end

      # Build the record's sequence from the ORIGIN section and attach the
      # matching sub-sequence to every feature already parsed. Sets
      # genbank.sequence to nil when the section has no sequence lines.
      #
      # @param buffer [Array<String>] lines of the ORIGIN section; the first
      #   line is discarded
      # @param genbank [MgNu::Genbank] record being built
      def parse_sequence(buffer, genbank)
        buffer.shift # drop ORIGIN line
        info("inside parse_sequence") if @debug
        info("buffer is #{buffer.length}") if @debug

        if buffer.empty?
          genbank.sequence = nil
        else
          # strip the position numbers and all whitespace
          residues = buffer.join.gsub(/[\d\s]+/, "")
          genbank.sequence = MgNu::Sequence.new(:value => residues)
          genbank.features.each do |f|
            f.sequence = f.location.get_sequence(genbank.sequence.value)
          end
        end
      end

      # splits at lines beginning with capital letter and no preceding space chars
      def split_at_header_tag(str)
        sep = "\001"
        str.gsub(/\n([A-Z])/, "\n#{sep}\\1").split(sep)
      end

      # splits at lines that start with exactly five whitespace characters
      # followed by a non-whitespace character
      def split_at_features(str)
        sep = "\001"
        str.gsub(/\n(\s{5}\S)/, "\n#{sep}\\1").split(sep)
      end

      private

      # Join COMMENT lines into one string: the leading COMMENT tag and the
      # leading whitespace are removed and runs of spaces are squeezed.
      # Uses lstrip rather than lstrip!, which returns nil when there is
      # nothing to strip.
      def extract_comment(lines)
        lines.map { |line| line.gsub(/^COMMENT/, '').lstrip.squeeze(' ') }.join("\n")
      end
    end # end of MgNu::Parser::Genbank class
  end # end of MgNu::Parser module
end # end of MgNu module
186
+
187
+ __END__
@@ -0,0 +1,56 @@
1
+ module MgNu
2
+ module Parser
3
+ class GFF
4
+ include Enumerable
5
+
6
+ attr_reader :file
7
+
8
+ # create a new GFF parser
9
+ def initialize(filename = nil)
10
+ if filename
11
+ if File.exists?(filename) and File.readable?(filename)
12
+ @file = File.open(filename)
13
+ else
14
+ @file = File.new(filename, "w")
15
+ end
16
+ else
17
+ error("MgNu::Parser::GFF.new(): need a filename for an existing file")
18
+ end
19
+ end
20
+
21
+ # override enumerables
22
+ def each
23
+ @file.each_line do |line|
24
+ line.chomp!
25
+ next if line =~ /^#/
26
+ yield Record.new(line)
27
+ end
28
+ end # end of #each
29
+
30
+ # class to deal with each line (record) of data
31
+ class Record
32
+ attr_accessor :name, :source, :feature, :start, :end
33
+ attr_accessor :score, :strand, :frame, :attributes
34
+
35
+ def initialize(line)
36
+ @name, @source, @feature, @start, @end,
37
+ @score, @strand, @frame, @attributes = line.split("\t")
38
+ @attributes = parse_attributes(attributes) if attributes
39
+ end
40
+
41
+ alias :seqname :name
42
+
43
+ private
44
+
45
+ def parse_attributes(attributes)
46
+ hash = Hash.new
47
+ attributes.split(/[^\\];/).each do |atr|
48
+ key, value = atr.split(' ', 2)
49
+ hash[key] = value
50
+ end
51
+ hash
52
+ end
53
+ end # end of MgNu::Parser::GFF::Record class
54
+ end # end of MgNu::Parser::GFF class
55
+ end # end of MgNu::Parser module
56
+ end # end of MgNu module
@@ -0,0 +1,76 @@
1
module MgNu
  module Parser
    class Iprscan
      # One tab-separated line of InterProScan output.
      class Hit
        attr_accessor :query, :crc, :length, :db, :db_id, :db_description
        attr_accessor :from, :to, :evalue, :status, :date
        attr_accessor :ipr_id, :ipr_description, :go

        include MgNu::Loggable

        # Databases whose hits get the placeholder e-value "NA".
        NO_EVALUE_DBS = %w[Seg TMHMM Coil].freeze

        # Create a new Hit object.
        #
        # @param line [String, nil] one line of tab-separated output. The
        #   first eleven columns are read in order; columns twelve to
        #   fourteen (ipr_id, ipr_description, go) are optional and stay nil
        #   when absent. The argument itself is not modified.
        def initialize(line = nil)
          fields = line.to_s.chomp.split(/\t/)

          @query          = fields.shift
          @crc            = fields.shift
          @length         = fields.shift.to_i
          @db             = fields.shift
          @db_id          = fields.shift
          @db_description = fields.shift
          @from           = fields.shift.to_i
          @to             = fields.shift.to_i
          @evalue         = fields.shift.to_f
          @evalue         = "NA" if NO_EVALUE_DBS.include?(@db)
          @status         = fields.shift
          @date           = fields.shift

          # shift on an exhausted array returns nil, so missing optional
          # columns leave these attributes nil
          @ipr_id          = fields.shift
          @ipr_description = fields.shift
          @go              = fields.shift
        end

        # @return [String] the hit as a tab-separated line; the InterPro
        #   columns are appended only when ipr_id is set, and the GO column
        #   only when go is set as well
        def to_s
          str = "#{@query}\t#{@crc}\t#{@length}\t#{@db}\t#{@db_id}\t#{@db_description}\t"
          str += "#{@from}\t#{@to}\t#{@evalue}\t#{@status}\t#{@date}"
          unless @ipr_id.nil?
            str += "\t#{@ipr_id}\t#{@ipr_description}"
            str += "\t#{@go}" unless @go.nil?
          end
          str
        end

        # @return [Integer] absolute difference between from and to
        def match_length
          (@to - @from).abs
        end

        # @return [String] one-line description of the hit. The e-value is
        #   omitted for the databases in NO_EVALUE_DBS, and the InterPro
        #   fields are omitted when ipr_id is nil or "NULL".
        def summary
          string = "#{@db_description} (db=#{@db} db_id=#{@db_id}"
          string += " from=#{@from} to=#{@to}"
          string += " evalue=#{@evalue}" unless NO_EVALUE_DBS.include?(db)
          unless @ipr_id.nil? || @ipr_id == "NULL"
            string += " interpro_id=#{@ipr_id} interpro_description=#{@ipr_description}"
          end
          string += " GO=#{@go}" unless @go.nil?
          string += ")"
          string
        end
      end # end of MgNu::Parser::Iprscan::Hit class
    end # end of MgNu::Parser::Iprscan class
  end # end of MgNu::Parser module
end # end of MgNu module
75
+
76
+ __END__
@@ -0,0 +1,39 @@
1
+ require 'mgnu/parser/iprscan/hit'
2
+
3
module MgNu
  module Parser
    # Reads a whole InterProScan result file and groups its hits by query.
    class IprscanFile
      attr_reader :file, :queries

      include MgNu::Loggable

      # Open the file and parse it immediately.
      #
      # @param filename [String] path of an existing, readable result file
      # @raise [RuntimeError] if no filename is given, or the file does not
      #   exist or is not readable
      def initialize(filename = nil)
        unless filename
          error("MgNu::Parser::IprscanFile.new(): need a filename")
          raise "no filename given!"
        end

        unless File.exist?(filename) && File.readable?(filename)
          error("MgNu::Parser::IprscanFile.new(): problems with filename")
          raise "File doesn't exist or is not readable!"
        end

        @file = File.open(filename)
        @queries = {}

        parse
      end

      # Build one MgNu::Parser::Iprscan::Hit per non-blank line and append
      # it to the list stored in #queries under the hit's query name.
      def parse
        @file.each do |line|
          line.chomp!
          # a blank line would otherwise become a Hit filed under a nil key
          next if line.strip.empty?

          hit = MgNu::Parser::Iprscan::Hit.new(line)
          (@queries[hit.query] ||= []) << hit
        end
      end
    end # end of MgNu::Parser::IprscanFile class
  end # end of MgNu::Parser module
end # end of MgNu module
38
+
39
+ __END__