mgnu 2.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +0 -0
  3. data/README.md +31 -0
  4. data/Rakefile +33 -0
  5. data/lib/mgnu.rb +9 -0
  6. data/lib/mgnu/alignment.rb +143 -0
  7. data/lib/mgnu/common.rb +68 -0
  8. data/lib/mgnu/genbank.rb +117 -0
  9. data/lib/mgnu/genbank/feature.rb +84 -0
  10. data/lib/mgnu/genbank/location.rb +150 -0
  11. data/lib/mgnu/genbank/qualifier.rb +45 -0
  12. data/lib/mgnu/genbank/reference.rb +114 -0
  13. data/lib/mgnu/genbank/source.rb +39 -0
  14. data/lib/mgnu/loggable.rb +61 -0
  15. data/lib/mgnu/parser.rb +50 -0
  16. data/lib/mgnu/parser/blast.rb +87 -0
  17. data/lib/mgnu/parser/blast/format0.rb +290 -0
  18. data/lib/mgnu/parser/blast/format7.rb +121 -0
  19. data/lib/mgnu/parser/blast/format8.rb +120 -0
  20. data/lib/mgnu/parser/blast/hsp.rb +75 -0
  21. data/lib/mgnu/parser/blast/query.rb +45 -0
  22. data/lib/mgnu/parser/blast/sbjct.rb +62 -0
  23. data/lib/mgnu/parser/clustalw.rb +72 -0
  24. data/lib/mgnu/parser/fasta.rb +61 -0
  25. data/lib/mgnu/parser/fasta_header_index.rb +39 -0
  26. data/lib/mgnu/parser/fasta_index.rb +57 -0
  27. data/lib/mgnu/parser/fastq.rb +61 -0
  28. data/lib/mgnu/parser/genbank.rb +187 -0
  29. data/lib/mgnu/parser/gff.rb +56 -0
  30. data/lib/mgnu/parser/iprscan/hit.rb +76 -0
  31. data/lib/mgnu/parser/iprscan_file.rb +39 -0
  32. data/lib/mgnu/parser/kegg_ontology_index.rb +163 -0
  33. data/lib/mgnu/parser/pilercr.rb +102 -0
  34. data/lib/mgnu/parser/prodigal.rb +170 -0
  35. data/lib/mgnu/parser/sam.rb +115 -0
  36. data/lib/mgnu/parser/sam/alignment.rb +22 -0
  37. data/lib/mgnu/parser/sam/header.rb +23 -0
  38. data/lib/mgnu/parser/sam/pair.rb +18 -0
  39. data/lib/mgnu/sequence.rb +207 -0
  40. data/lib/mgnu/sequence/fasta.rb +79 -0
  41. data/lib/mgnu/sequence/fastq.rb +43 -0
  42. data/lib/mgnu/version.rb +16 -0
  43. data/mgnu.gemspec +39 -0
  44. data/spec/mgnu/parser/blast_format0_spec.rb +114 -0
  45. data/spec/mgnu/parser/blast_format7_spec.rb +24 -0
  46. data/spec/mgnu/parser/blast_format8_spec.rb +26 -0
  47. data/spec/mgnu/parser/blast_multihsp_spec.rb +100 -0
  48. data/spec/mgnu/parser/blast_oof_spec.rb +53 -0
  49. data/spec/mgnu/parser/clustalw_spec.rb +90 -0
  50. data/spec/mgnu/parser/fasta_header_index_tc_parser_spec.rb +25 -0
  51. data/spec/mgnu/parser/fasta_index_tc_parser_spec.rb +25 -0
  52. data/spec/mgnu/parser/fasta_parser_spec.rb +53 -0
  53. data/spec/mgnu/parser_spec.rb +22 -0
  54. data/spec/mgnu/sequence/fasta_spec.rb +60 -0
  55. data/spec/mgnu/sequence/fastq_spec.rb +31 -0
  56. data/spec/mgnu/sequence_spec.rb +81 -0
  57. data/spec/mgnu_spec.rb +7 -0
  58. data/spec/spec_helper.rb +53 -0
  59. metadata +376 -0
@@ -0,0 +1,57 @@
1
+ require 'moneta'
2
+ require 'json'
3
+
4
module MgNu
  module Parser
    # Key/value index over the records of a FASTA file.
    #
    # Each record yielded by MgNu::Parser::Fasta is stored in a Moneta store
    # under its header name, as a JSON document with the keys 'description'
    # and 'sequence'. Records are read back with #[].
    class FastaIndex
      # Backends this class knows how to open.
      SUPPORTED_DB_TYPES = [:TokyoCabinet].freeze

      attr_reader :filename, :db_name, :db, :db_type

      # Create a new FastaIndex, open its backing store and index +filename+.
      #
      # @param filename [String] path handed to MgNu::Parser::Fasta
      # @param options [Hash] :db_type selects the backend
      #   (default :TokyoCabinet, currently the only supported value)
      # @raise [ArgumentError] when :db_type is not supported; previously an
      #   unknown type left @db nil and failed later with NoMethodError
      def initialize(filename, options = {})
        options = { :db_type => :TokyoCabinet }.merge(options)
        @db_type = options[:db_type]

        unless SUPPORTED_DB_TYPES.include?(@db_type)
          raise ArgumentError,
                "unsupported db_type #{@db_type.inspect} (supported: #{SUPPORTED_DB_TYPES.inspect})"
        end

        @filename = filename
        # The store lives next to the input, with a .tch suffix unless the
        # given name already carries one.
        # NOTE(review): a filename ending in .tch is still passed to the FASTA
        # parser by #parse below -- confirm that is intended.
        @db_name = @filename =~ /^.+\.tch$/ ? @filename : "#{@filename}.tch"
        @db = Moneta.new(:TokyoCabinet, file: @db_name, type: :hdb)
        parse
      end

      # Read every record of the FASTA file and write it into the store.
      def parse
        MgNu::Parser::Fasta.new(@filename).each do |f|
          record = { 'description' => f.header_description, 'sequence' => f.sequence }
          @db[f.header_name] = record.to_json
        end
      end # end of #parse

      # Look up one record by header name.
      #
      # @param name [String] header name used as the key by #parse
      # @return [MgNu::Sequence::Fasta, nil] the rebuilt record, or nil when
      #   the key is not in the store
      def [](name)
        return nil unless @db.key?(name)

        record = JSON.parse(@db[name])
        MgNu::Sequence::Fasta.new(:header => "#{name} #{record['description']}",
                                  :sequence => record['sequence'])
      end

      # Close the backing store if one was opened.
      def close
        @db.close if @db
      end
    end # end of MgNu::Parser::FastaIndex class
  end # end of MgNu::Parser module
end # end of MgNu module
@@ -0,0 +1,61 @@
1
module MgNu
  module Parser
    # Streaming reader for FASTQ files laid out as four lines per record:
    # "@header", sequence, "+[header]", quality.
    class Fastq
      include Enumerable
      attr_reader :file, :filename

      # Create a new Fastq parser and open +filename+ for reading.
      #
      # @param filename [String, nil] path to an existing, readable file
      # @raise [RuntimeError] when the file does not exist or is not readable
      # @note Without a filename a message is printed to stderr and the
      #   process exits with status 1 (unchanged from the original behaviour).
      def initialize(filename = nil)
        @filename = filename

        unless @filename
          $stderr.puts("MgNu::Parser::Fastq.new(): need an existing fastq file name")
          exit(1)
        end

        # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
        unless File.exist?(@filename) && File.readable?(@filename)
          raise "\n\n -- No file by that name (#{@filename}). Exiting\n\n"
        end

        @file = File.open(@filename)
      end

      # Yield one MgNu::Sequence::Fastq per four-line record until EOF.
      #
      # A blank quality header ("+" alone) is filled in with the sequence
      # header. A non-blank quality header that differs from the sequence
      # header is kept as-is and reported with a warning. A malformed "@" or
      # "+" line prints a diagnostic to stderr and exits with status 1.
      #
      # @return [Enumerator] when called without a block
      # @raise [EOFError] from IO#readline when the file ends mid-record
      def each
        return enum_for(:each) unless block_given?

        until @file.eof? # keep reading until EOF
          header_line  = @file.readline.chomp
          sequence     = @file.readline.chomp
          qualhdr_line = @file.readline.chomp
          quality      = @file.readline.chomp

          header = header_line[/^@(.*)/, 1]
          abort_parse("Malformed header!", header_line) if header.nil?

          qualhdr = qualhdr_line[/^\+(.*)/, 1]
          abort_parse("Malformed quality header!", qualhdr_line) if qualhdr.nil?

          if header != qualhdr
            # The original test (/\s*/) matched every string, so the warning
            # below could never fire; anchor it so only blank headers match.
            if qualhdr =~ /\A\s*\z/
              qualhdr = header
            else
              warn("Sequence header and quality header don't match!")
              warn("sequence: #{header}")
              warn(" quality: #{qualhdr}")
            end
          end

          yield MgNu::Sequence::Fastq.new(:header => header, :sequence => sequence,
                                          :qualhdr => qualhdr, :quality => quality)
        end # end of until @file.eof?
      end # end of #each

      private

      # Report an unparseable line on stderr and terminate with status 1.
      # Written with $stderr.puts because this class mixes in no logger, so
      # the `error` helper used by other parsers is not defined here.
      def abort_parse(message, offending_line)
        $stderr.puts message
        $stderr.puts "\n#{offending_line}"
        $stderr.puts "\nExiting at line #{@file.lineno}"
        exit(1)
      end
    end # end of MgNu::Parser::Fastq class
  end # end of MgNu::Parser module
end # end of MgNu module
@@ -0,0 +1,187 @@
1
module MgNu
  module Parser
    # Parser for GenBank flat files. Each LOCUS ... // section becomes one
    # MgNu::Genbank object, collected in #genbank_instances.
    #
    # NOTE(review): #parse_until, #info and #error come from the included
    # MgNu::Parser / MgNu::Loggable mixins, which are not visible here. The
    # code below treats the result of parse_until as an Array of line strings
    # up to (not including) the line matching the given pattern -- confirm.
    class Genbank
      attr_reader :file
      attr_accessor :genbank_instances

      include MgNu::Loggable
      include MgNu::Parser

      InvalidGenbankFile = Class.new(StandardError)

      LOCUS_REGEX = /^LOCUS\s+(\S+)\s+(\d+)\s+bp\s+(?:(ss-|ds-|ms-))?(\S+)\s+(?:(\S+)\s+)?(\S+)\s+(\S+)$/

      # Create a new Genbank parser and open +filename+ for reading.
      #
      # @param filename [String] path to an existing, readable GenBank file
      # @raise [RuntimeError] when no filename is given, or the file is
      #   missing or unreadable
      def initialize(filename)
        @genbank_instances = []

        unless filename
          error("MgNu::Parser::Genbank#parse: need a filename")
          raise "no filename given!"
        end

        # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
        unless File.exist?(filename) && File.readable?(filename)
          error("MgNu::Parser::Genbank#parse: problems with filename")
          raise "File doesn't exist or is not readable!"
        end

        @file = File.open(filename)
      end

      # Parse every section of the file.
      #
      # @param debug [Boolean] emit progress through #info when true
      # @return [Array] the accumulated genbank_instances
      def parse(debug=false)
        @debug = debug
        parse_section until file.eof?
        genbank_instances
      end

      # Parse one LOCUS ... // section starting at the current file position
      # and append the result to genbank_instances. Blank lines between
      # sections are skipped silently.
      #
      # @raise [InvalidGenbankFile] when a non-blank line is not a LOCUS line
      def parse_section
        locus_line = file.readline
        md = locus_line.match(LOCUS_REGEX)
        unless md
          return if locus_line =~ /^\s*$/
          raise InvalidGenbankFile, "Missing or malformed LOCUS line."
        end

        genbank = MgNu::Genbank.new
        info("found a LOCUS line") if @debug
        genbank.locus = MgNu::Genbank::Locus.new(*md.captures)
        info("LOCUS name #{genbank.locus.name}") if @debug

        # DEFINITION (may span several lines; trailing period is chopped)
        buffer = parse_until(file, /^ACCESSION/)
        if buffer.join =~ /^DEFINITION\s+(.+)$/m
          genbank.definition = $1.gsub(/\n/, ' ').gsub(/\s{2,}/, ' ').strip.chop
          info genbank.definition if @debug
        end

        # ACCESSION: first token is primary, the rest are secondary accessions
        buffer = parse_until(file, /^VERSION/)
        if buffer.join =~ /^ACCESSION\s+(.+)$/
          temp = $1.strip.squeeze(' ').split(' ')
          genbank.accession, genbank.secondary_accession = temp.shift, temp
        end
        info "ACCESSION: #{genbank.accession}" if @debug

        # VERSION (with optional GI number) and optional DBLINK
        buffer = parse_until(file, /^KEYWORDS/)
        buffer.each do |line|
          if line =~ /^VERSION\s+(.+)$/
            $1.strip.squeeze(' ').split.each do |version|
              if version =~ /GI:(\d+)/
                genbank.geninfo_identifier = $1.to_i
              else
                genbank.version = version
              end
            end
          elsif line =~ /^DBLINK\s+(.+)$/
            genbank.dblink = $1.strip.squeeze(' ')
          end
        end

        # KEYWORDS (possibly continued on following lines) and optional SEGMENT
        buffer = parse_until(file, /^SOURCE/)
        keyword_lines = []
        buffer.each do |line|
          if line =~ /^KEYWORDS\s+(.+)$/
            keyword_lines << $1.strip.squeeze(' ')
          elsif line =~ /^SEGMENT\s+(.+)$/
            genbank.segment = $1.strip.squeeze(' ')
          else
            keyword_lines << line
          end
        end
        k = keyword_lines.join
        unless k == "."
          k_array = k.split(/;\s*/) # keywords are separated by semicolons
          # drop the period after the last keyword; guard against an empty
          # list, where the original called chop! on nil
          k_array[-1].chop! unless k_array.empty?
          genbank.keywords = k_array
        end

        # SOURCE, then optional REFERENCE entries, then optional COMMENT
        buffer = parse_until(file,/^FEATURES/)

        ri = buffer.index {|l| l =~ /^REFERENCE/ }
        ci = buffer.index {|l| l =~ /^COMMENT/ }

        if ri && ci
          genbank.source = MgNu::Genbank::Source.parse(buffer[0..ri-1])
          parse_references(buffer[ri..ci-1], genbank)
          genbank.comment = extract_comment(buffer[ci..-1])
        elsif ri
          genbank.source = MgNu::Genbank::Source.parse(buffer[0..ri-1])
          parse_references(buffer[ri..-1], genbank)
        elsif ci
          genbank.source = MgNu::Genbank::Source.parse(buffer[0..ci-1])
          genbank.comment = extract_comment(buffer[ci..-1])
        else
          # neither references nor comment line
          genbank.source = MgNu::Genbank::Source.parse(buffer)
        end

        info genbank.source.common_name if @debug
        info genbank.source.organism if @debug
        info genbank.source.lineage if @debug

        parse_features(parse_until(file, /^ORIGIN/), genbank)
        info "features count: #{genbank.features.length}" if @debug

        parse_sequence(parse_until(file, /\/\//), genbank)
        if @debug
          # plain-Ruby equivalent of the ActiveSupport-only `try(:length) || 0`
          seq = genbank.sequence
          seq_length = (seq.respond_to?(:length) && seq.length) || 0
          info "sequence length: #{seq_length}"
        end
        file.readline # consumes end of section line //
        genbank_instances << genbank
      end

      # Split the FEATURES block into individual features and append each
      # parsed MgNu::Genbank::Feature to genbank.features.
      def parse_features(buffer, genbank)
        buffer.shift if buffer[0] =~ /^FEATURES/
        split_at_features(buffer.join("\n")).each do |feature_str|
          genbank.features << MgNu::Genbank::Feature.parse(feature_str)
        end
      end # end parse_features

      # Split the REFERENCE lines at header tags and append each parsed
      # MgNu::Genbank::Reference to genbank.references.
      def parse_references(buffer, genbank)
        split_at_header_tag(buffer.join("\n")).each do |ref|
          genbank.references << MgNu::Genbank::Reference.parse(ref)
        end
      end

      # Build the sequence from the ORIGIN block (digits and whitespace are
      # removed) and attach to every feature the sub-sequence selected by its
      # location. Sets genbank.sequence to nil when the block has no
      # sequence lines.
      def parse_sequence(buffer, genbank)
        buffer.shift # drop ORIGIN line
        info("inside parse_sequence") if @debug
        info("buffer is #{buffer.length}") if @debug

        if buffer.empty?
          genbank.sequence = nil
        else
          seq = buffer.join.gsub(/[\d\s]+/, "")
          genbank.sequence = MgNu::Sequence.new(:value => seq)
          genbank.features.each do |f|
            f.sequence = f.location.get_sequence(genbank.sequence.value)
          end
        end
      end

      # splits at lines beginning with capital letter and no preceding space chars
      def split_at_header_tag(str)
        sep = "\001"
        str.gsub(/\n([A-Z])/, "\n#{sep}\\1").split(sep)
      end

      # splits at lines that start with exactly five whitespace characters
      # followed by a non-space character (the start of a feature key)
      def split_at_features(str)
        sep = "\001"
        str.gsub(/\n(\s{5}\S)/, "\n#{sep}\\1").split(sep)
      end

      private

      # Join COMMENT lines into one string: the COMMENT tag and leading
      # whitespace are removed and runs of spaces collapsed. Uses lstrip, not
      # lstrip!, because lstrip! returns nil when there is nothing to strip.
      def extract_comment(lines)
        lines.map { |line| line.gsub(/^COMMENT/, '').lstrip.squeeze(' ') }.join("\n")
      end
    end # end of MgNu::Parser::Genbank class
  end # end of MgNu::Parser module
end # end of MgNu module
186
+
187
+ __END__
@@ -0,0 +1,56 @@
1
module MgNu
  module Parser
    # Line-oriented reader for tab-separated GFF files. Iterating yields one
    # GFF::Record per non-comment line.
    class GFF
      include Enumerable

      attr_reader :file

      # Create a new GFF parser.
      #
      # An existing, readable file is opened for reading; any other path is
      # created/opened for writing (unchanged from the original behaviour).
      # Without a filename a message is printed to stderr and @file stays nil.
      #
      # @param filename [String, nil] path of the GFF file
      def initialize(filename = nil)
        if filename
          # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
          if File.exist?(filename) && File.readable?(filename)
            @file = File.open(filename)
          else
            @file = File.new(filename, "w")
          end
        else
          # This class mixes in no logger, so `error` is not defined here;
          # report on stderr instead of failing with NoMethodError.
          $stderr.puts("MgNu::Parser::GFF.new(): need a filename for an existing file")
        end
      end

      # Yield a Record for every line that does not start with '#'.
      #
      # @return [Enumerator] when called without a block
      def each
        return enum_for(:each) unless block_given?

        @file.each_line do |line|
          line.chomp!
          next if line =~ /^#/
          yield Record.new(line)
        end
      end # end of #each

      # class to deal with each line (record) of data
      class Record
        attr_accessor :name, :source, :feature, :start, :end
        attr_accessor :score, :strand, :frame, :attributes

        # Split a tab-separated line into its nine columns. All columns are
        # kept as strings, except the ninth, which is turned into a Hash by
        # #parse_attributes when present.
        def initialize(line)
          @name, @source, @feature, @start, @end,
            @score, @strand, @frame, @attributes = line.split("\t")
          @attributes = parse_attributes(@attributes) if @attributes
        end

        alias :seqname :name

        private

        # Turn "key value; key value" into a Hash of strings.
        #
        # Attributes are separated by semicolons that are not escaped with a
        # backslash. The lookbehind keeps the character in front of the
        # semicolon; the original pattern /[^\\];/ consumed it and so cut the
        # last character off every attribute but the final one.
        def parse_attributes(attributes)
          attributes.split(/(?<!\\);/).each_with_object({}) do |pair, parsed|
            key, value = pair.split(' ', 2)
            next if key.nil? # whitespace-only fragment, e.g. after a trailing ';'
            parsed[key] = value
          end
        end
      end # end of MgNu::Parser::GFF::Record class
    end # end of MgNu::Parser::GFF class
  end # end of MgNu::Parser module
end # end of MgNu module
@@ -0,0 +1,76 @@
1
module MgNu
  module Parser
    class Iprscan
      # One row of tab-separated iprscan output.
      #
      # Columns, in order: query, crc, length, db, db_id, db_description,
      # from, to, evalue, status, date, and optionally ipr_id,
      # ipr_description and go.
      class Hit
        attr_accessor :query, :crc, :length, :db, :db_id, :db_description
        attr_accessor :from, :to, :evalue, :status, :date
        attr_accessor :ipr_id, :ipr_description, :go

        include MgNu::Loggable

        # Databases for which evalue is stored as "NA" instead of a Float.
        NO_EVALUE_DBS = %w[Seg TMHMM Coil].freeze

        # create a new Hit object
        #
        # @param line [String, nil] one tab-separated result line; nil is
        #   treated as an empty line. The argument is not modified (the
        #   original used chomp!, which mutated the caller's string and
        #   failed on nil and on frozen strings).
        def initialize(line = nil)
          fields = line.to_s.chomp.split(/\t/)

          @query          = fields.shift
          @crc            = fields.shift
          @length         = fields.shift.to_i
          @db             = fields.shift
          @db_id          = fields.shift
          @db_description = fields.shift
          @from           = fields.shift.to_i
          @to             = fields.shift.to_i
          @evalue         = fields.shift.to_f
          @evalue         = "NA" if NO_EVALUE_DBS.include?(@db)
          @status         = fields.shift
          @date           = fields.shift
          # optional trailing columns; shift returns nil once fields run out
          @ipr_id          = fields.shift
          @ipr_description = fields.shift
          @go              = fields.shift
        end

        # Re-serialise the hit as a tab-separated line. The InterPro columns
        # are appended only when ipr_id is set, and GO only when go is set.
        def to_s
          str = "#{@query}\t#{@crc}\t#{@length}\t#{@db}\t#{@db_id}\t#{@db_description}\t"
          str += "#{@from}\t#{@to}\t#{@evalue}\t#{@status}\t#{@date}"
          unless @ipr_id.nil?
            str += "\t#{@ipr_id}\t#{@ipr_description}"
            str += "\t#{@go}" unless @go.nil?
          end
          str
        end

        # Absolute distance between the from and to coordinates.
        def match_length
          (@to - @from).abs
        end

        # One-line human-readable description of the hit.
        #
        # The InterPro part is left out when ipr_id is "NULL" or was absent
        # from the input (previously a missing id printed empty
        # "interpro_id= interpro_description=" fields).
        def summary
          string = "#{@db_description} (db=#{@db} db_id=#{@db_id}"
          string += " from=#{@from} to=#{@to}"
          # NOTE(review): "Coil" also gets evalue "NA" in #initialize but is
          # not excluded here, so it prints "evalue=NA" -- confirm intent.
          string += " evalue=#{@evalue}" unless db == "Seg" || db == "TMHMM"
          unless @ipr_id.nil? || @ipr_id == "NULL"
            string += " interpro_id=#{@ipr_id} interpro_description=#{@ipr_description}"
          end
          string += " GO=#{@go}" unless @go.nil?
          string += ")"
          string
        end
      end # end of MgNu::Parser::Iprscan::Hit class
    end # end of MgNu::Parser::Iprscan class
  end # end of MgNu::Parser module
end # end of MgNu module
75
+
76
+ __END__
@@ -0,0 +1,39 @@
1
+ require 'mgnu/parser/iprscan/hit'
2
+
3
module MgNu
  module Parser
    # Reads a whole iprscan result file and groups its hits by query name.
    class IprscanFile
      attr_reader :file, :queries

      include MgNu::Loggable

      # Open +filename+ and parse it immediately; the hits are then available
      # through #queries (query name => Array of MgNu::Parser::Iprscan::Hit).
      #
      # @param filename [String] path to an existing, readable file
      # @raise [RuntimeError] when no filename is given, or the file is
      #   missing or unreadable
      def initialize(filename = nil)
        unless filename
          error("MgNu::Parser::IprscanFile.new(): need a filename")
          raise "no filename given!"
        end

        # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
        unless File.exist?(filename) && File.readable?(filename)
          error("MgNu::Parser::IprscanFile.new(): problems with filename")
          raise "File doesn't exist or is not readable!"
        end

        @file = File.open(filename)
        @queries = {}

        parse
      end

      # Turn every non-blank line into a Hit and file it under its query.
      # Blank lines are skipped; previously they produced an empty hit stored
      # under a nil key.
      def parse
        @file.each do |line|
          line.chomp!
          next if line.strip.empty?

          hit = MgNu::Parser::Iprscan::Hit.new(line)
          (@queries[hit.query] ||= []) << hit
        end
      end
    end # end of MgNu::Parser::IprscanFile class
  end # end of MgNu::Parser module
end # end of MgNu module
38
+
39
+ __END__