swissparser 0.5.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +8 -0
- data/LICENSE +675 -0
- data/README.txt +32 -0
- data/Rakefile +23 -0
- data/examples/data/EColPositives_noTAT.bas +520 -0
- data/examples/data/kegg_enzyme_short.txt +881 -0
- data/examples/data/uniprot.txt +2855 -0
- data/examples/kegg_demo.rb +103 -0
- data/examples/signal_demo.rb +100 -0
- data/examples/uniprot_demo.rb +83 -0
- data/lib/swiss_parser.rb +214 -0
- metadata +76 -0
@@ -0,0 +1,103 @@
|
|
1
|
+
=begin
|
2
|
+
Copyright (C) 2009 Paradigmatic
|
3
|
+
|
4
|
+
This file is part of SwissParser.
|
5
|
+
|
6
|
+
SwissParser is free software: you can redistribute it and/or modify
|
7
|
+
it under the terms of the GNU General Public License as published by
|
8
|
+
the Free Software Foundation, either version 3 of the License, or
|
9
|
+
(at your option) any later version.
|
10
|
+
|
11
|
+
SwissParser is distributed in the hope that it will be useful,
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
GNU General Public License for more details.
|
15
|
+
|
16
|
+
You should have received a copy of the GNU General Public License
|
17
|
+
along with SwissParser. If not, see <http://www.gnu.org/licenses/>.
|
18
|
+
=end
|
19
|
+
|
20
|
+
require 'swiss_parser.rb'
|
21
|
+
require 'yaml'
|
22
|
+
|
23
|
+
# Plain record object with two read/write attributes: +id+ and +genes+.
class Enzyme
  attr_accessor :id
  attr_accessor :genes
end
|
28
|
+
|
29
|
+
|
30
|
+
# Parser definition built with the Swiss::Parser DSL.
# Entries are separated by "///" lines. For each entry it records the id
# matched on the ENTRY line and the numeric gene ids listed for the "HSA"
# organism under GENES. Only entries with at least one such gene are kept,
# as Enzyme objects.
enzyme_parser = Swiss::Parser.define do

  # Every entry starts as a hash with an empty gene list.
  new_entry do |_params|
    { :genes => [] }
  end

  rules do

    # Appends to entry[:genes] the digits of every whitespace-separated
    # token of +string+ that looks like "123(NAME)".
    def parse_gene_ids( string, entry )
      string.split(" ").each do |token|
        entry[:genes] << $1 if token =~ /(\d+)\(\w+\)/
      end
    end

    # Organism code whose genes are collected.
    target_organism = "HSA"

    set_separator( "///" )

    # ENTRY line: keep the dotted four-field identifier (nil when absent).
    with("ENTRY") do |content,entry|
      entry[:id] = content[/((\d+|-)\.(\d+|-)\.(\d+|-)\.(\d+|-))/, 1]
    end

    # First GENES line: "ORG: gene list". The organism is remembered so
    # that continuation lines know which organism they belong to.
    with("GENES") do |content,entry|
      match = /^([A-Z]+): (.*)/.match( content )
      organism  = match && match[1]
      gene_list = match && match[2]
      entry[:last_organism] = organism
      parse_gene_ids( gene_list, entry ) if organism == target_organism
    end

    # Key-less lines following GENES: either a new "ORG: gene list" line or
    # a continuation of the previous organism's gene list.
    with_text_after("GENES") do |content,entry|
      match = /([A-Z]+): (.*)/.match( content )
      if match
        organism, gene_list = match[1], match[2]
        entry[:last_organism] = organism
        parse_gene_ids( gene_list, entry ) if organism == target_organism
      elsif entry[:last_organism] == target_organism
        parse_gene_ids( content, entry )
      end
    end

  end

  # Convert the entry hash into an Enzyme, skipping entries without genes.
  finish_entry do |entry,container,_params|
    unless entry[:genes].empty?
      enzyme = Enzyme.new
      enzyme.id = entry[:id]
      enzyme.genes = entry[:genes]
      container << enzyme
    end
  end

end
|
89
|
+
|
90
|
+
|
91
|
+
# Script entry point: parse the file named by the first command-line
# argument and print every resulting object as YAML.
if $0 == __FILE__

  filename = ARGV.shift
  # Without this guard a missing argument surfaces as an obscure TypeError
  # raised while opening a nil filename.
  abort "Usage: ruby #{$0} FILE" if filename.nil?

  enzymes = enzyme_parser.parse_file( filename )

  enzymes.each do |e|
    puts e.to_yaml
  end

end
|
102
|
+
|
103
|
+
|
@@ -0,0 +1,100 @@
|
|
1
|
+
=begin
|
2
|
+
Copyright (C) 2009 Paradigmatic
|
3
|
+
|
4
|
+
This file is part of SwissParser.
|
5
|
+
|
6
|
+
SwissParser is free software: you can redistribute it and/or modify
|
7
|
+
it under the terms of the GNU General Public License as published by
|
8
|
+
the Free Software Foundation, either version 3 of the License, or
|
9
|
+
(at your option) any later version.
|
10
|
+
|
11
|
+
SwissParser is distributed in the hope that it will be useful,
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
GNU General Public License for more details.
|
15
|
+
|
16
|
+
You should have received a copy of the GNU General Public License
|
17
|
+
along with SwissParser. If not, see <http://www.gnu.org/licenses/>.
|
18
|
+
=end
|
19
|
+
|
20
|
+
require 'swiss_parser.rb'
|
21
|
+
require 'yaml'
|
22
|
+
|
23
|
+
# Plain record object with three read/write attributes:
# +name+, +sequence+ and +size+.
class Protein
  attr_accessor :name
  attr_accessor :sequence
  attr_accessor :size
end
|
26
|
+
|
27
|
+
# Parser definition built with the Swiss::Parser DSL.
# Entries are separated by "/" lines and each entry is a Protein filled
# from the lines keyed "N" (name), "C" (size, converted with to_i) and
# "S" (sequence).
parser = Swiss::Parser.define do

  new_entry { Protein.new }

  rules do

    set_separator '/'

    with('N') { |content,entry| entry.name = content }

    with('C') { |content,entry| entry.size = content.to_i }

    with('S') { |content,entry| entry.sequence = content }

  end

end
|
52
|
+
|
53
|
+
|
54
|
+
# Variant of +parser+ that, instead of collecting the entries, accumulates
# size statistics in a hash: :min, :max, :sum, :n and, once parsing is
# over, :average.
stat_parser = parser.extend do

  # The context is the statistics hash. :min starts as nil (no entry seen
  # yet) rather than a fixed sentinel, so the reported minimum is correct
  # whatever the magnitude of the sizes.
  before do |_params|
    { :min => nil, :max => 0, :sum => 0, :n => 0 }
  end

  # Fold the size of the finished entry into the statistics.
  finish_entry do |entry,h,_params|
    size = entry.size
    h[:min] = size if h[:min].nil? || size < h[:min]
    h[:max] = size if size > h[:max]
    h[:sum] += size
    h[:n] += 1
  end

  # Add the average and return the statistics hash as the parsing result.
  # With no entries this is 0.0 / 0, i.e. NaN.
  after do |h,_params|
    h[:average] = h[:sum].to_f / h[:n]
    h
  end

end
|
77
|
+
|
78
|
+
|
79
|
+
# Script entry point: parse the file named by the first command-line
# argument twice, first printing every entry as YAML, then printing the
# statistics computed by +stat_parser+.
if $0 == __FILE__

  filename = ARGV.shift
  # Without this guard a missing argument surfaces as an obscure TypeError
  # raised while opening a nil filename.
  abort "Usage: ruby #{$0} FILE" if filename.nil?

  entries = parser.parse_file( filename )

  entries.each do |e|
    puts e.to_yaml
  end

  puts

  results = stat_parser.parse_file( filename )

  puts "Min: #{results[:min]}"
  puts "Max: #{results[:max]}"
  puts "Average: #{results[:average]}"
  puts "Size: #{results[:n]}"

end
|
99
|
+
|
100
|
+
|
@@ -0,0 +1,83 @@
|
|
1
|
+
=begin
|
2
|
+
Copyright (C) 2009 Paradigmatic
|
3
|
+
|
4
|
+
This file is part of SwissParser.
|
5
|
+
|
6
|
+
SwissParser is free software: you can redistribute it and/or modify
|
7
|
+
it under the terms of the GNU General Public License as published by
|
8
|
+
the Free Software Foundation, either version 3 of the License, or
|
9
|
+
(at your option) any later version.
|
10
|
+
|
11
|
+
SwissParser is distributed in the hope that it will be useful,
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
GNU General Public License for more details.
|
15
|
+
|
16
|
+
You should have received a copy of the GNU General Public License
|
17
|
+
along with SwissParser. If not, see <http://www.gnu.org/licenses/>.
|
18
|
+
=end
|
19
|
+
|
20
|
+
#!/usr/bin/ruby -w
|
21
|
+
|
22
|
+
require 'yaml'
|
23
|
+
require 'swiss_parser.rb'
|
24
|
+
|
25
|
+
# Record object with read/write attributes +id+, +size+, +species+,
# +taxonomy+ and +sequence+.
class Protein

  attr_accessor :id, :size, :species
  attr_accessor :taxonomy, :sequence

  # A new instance starts with an empty taxonomy array and an empty
  # sequence string; the other attributes are left unset.
  def initialize
    @sequence = ""
    @taxonomy = []
  end

end
|
35
|
+
|
36
|
+
|
37
|
+
# Parser definition built with the Swiss::Parser DSL.
# No separator is set here, so the library default applies. Each entry is
# a Protein filled from the ID, OS and OC lines and from the key-less
# lines that follow SQ.
uniprot_parser = Swiss::Parser.define do

  new_entry { Protein.new }

  rules do

    # ID line: a capitalised word becomes the id, the first run of digits
    # after it the size (0 when the line does not match).
    with("ID") do |content,protein|
      match = /([A-Z]\w+)\D+(\d+)/.match( content )
      protein.id = match && match[1]
      protein.size = ( match && match[2] ).to_i
    end

    # OS line: keep the first two space-separated words.
    with("OS") do |content,protein|
      protein.species = content[/(\w+ \w+)/, 1]
    end

    # OC line: drop the dots and append the "; "-separated names.
    with("OC") do |content,protein|
      names = content.gsub(".","").split("; ")
      protein.taxonomy = protein.taxonomy + names
    end

    # Lines after SQ: append the text with its spaces removed.
    with_text_after("SQ") do |content,protein|
      chunk = content.strip.gsub(" ","")
      protein.sequence = protein.sequence + chunk
    end

  end

end
|
69
|
+
|
70
|
+
|
71
|
+
# Script entry point: parse the file named by the first command-line
# argument, print the number of entries, then every entry as YAML.
if $0 == __FILE__

  filename = ARGV.shift
  # Without this guard a missing argument surfaces as an obscure TypeError
  # raised while opening a nil filename.
  abort "Usage: ruby #{$0} FILE" if filename.nil?

  entries = uniprot_parser.parse_file( filename )

  puts entries.size

  entries.each do |e|
    puts e.to_yaml
  end

end
|
data/lib/swiss_parser.rb
ADDED
@@ -0,0 +1,214 @@
|
|
1
|
+
=begin
|
2
|
+
Copyright (C) 2009 Paradigmatic
|
3
|
+
|
4
|
+
This file is part of SwissParser.
|
5
|
+
|
6
|
+
SwissParser is free software: you can redistribute it and/or modify
|
7
|
+
it under the terms of the GNU General Public License as published by
|
8
|
+
the Free Software Foundation, either version 3 of the License, or
|
9
|
+
(at your option) any later version.
|
10
|
+
|
11
|
+
SwissParser is distributed in the hope that it will be useful,
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
GNU General Public License for more details.
|
15
|
+
|
16
|
+
You should have received a copy of the GNU General Public License
|
17
|
+
along with SwissParser. If not, see <http://www.gnu.org/licenses/>.
|
18
|
+
=end
|
19
|
+
|
20
|
+
|
21
|
+
|
22
|
+
# Namespace of the SwissParser library: a small DSL to define parsers for
# line-oriented flat files made of "KEY   value" lines grouped into entries
# that are terminated by a separator line.
module Swiss

  VERSION = "0.5.1"

  # This class defines parsing rules. Its methods
  # are accessible within the +rules+ section of
  # a parser definition.
  class ParsingRules

    # +separator+ is the string given to +set_separator+ (+nil+ if it was
    # never called). +actions+ maps each key to its proc; the special
    # +:text+ entry maps keys to the procs handling the key-less lines
    # that follow them.
    attr_reader :separator, :actions

    # *Do* *not* create directly this class but access it
    # through a +rules+ section in a parser definition.
    def initialize
      @actions = { :text => {} }
    end

    # Sets the entry separator line. Default: "//"
    def set_separator(string)
      @separator = string
    end

    # Defines how to parse a line starting with +key+. The +proc+
    # takes two arguments:
    # * the rest of the line
    # * the entry object
    def with( key, &proc )
      @actions[key] = proc
    end

    # Defines how to parse a line without key coming *after*
    # a specified key. The +proc+ takes two arguments:
    # * the whole line
    # * the entry object
    def with_text_after( key, &proc )
      @actions[:text][key] = proc
    end

  end


  # Parser for a typical bioinformatic flat file.
  class Parser

    # Default entry separator
    DEFAULT_SEPARATOR = "//"

    # *Do* *not* *use* this method to instantiate a parser. Use rather
    # the +define+ class method.
    #
    # Accepts either no argument (empty parser with the default separator)
    # or exactly six: actions, separator, before, new_entry, finish_entry
    # and after. The actions hash and its +:text+ sub-hash are copied so
    # that the new parser does not share them with its source.
    # Raises a RuntimeError for any other number of arguments.
    def initialize(*args)
      if args.size == 0
        @separator = DEFAULT_SEPARATOR
        @actions = {}
        @actions[:text] = {}
      elsif args.size == 6
        actions,separator,before,the_begin,the_end,after = *args
        @actions = actions.clone
        @actions[:text] = actions[:text].clone
        @separator = separator
        @before = before
        @end = the_end
        @begin = the_begin
        @after = after
      else
        raise "Wrong arg number, either 0 or 6."
      end
    end

    # Defines how to create the _entry_ _object_. The +proc+
    # takes a single argument which is a hash containing
    # parsing options. It must return a new _entry_ _object_.
    # Default:: creates an empty hash.
    def new_entry(&proc)
      @begin = proc
    end

    # Defines how to finalize an _entry_ _object_. The +proc+
    # takes three arguments:
    # * The entry object ready to be finalized
    # * The context object
    # * A hash containing parsing options.
    # Default:: Adds the entry object to the context object using +<<+ method.
    def finish_entry(&proc)
      @end = proc
    end

    # Defines how to set the context before using the parser.
    # The +proc+ takes a single argument which is a hash containing
    # parsing options. It must return a _context_ object.
    # Default:: creates an empty array
    def before(&proc)
      @before = proc
    end

    # Defines how to finalize the whole parsing.
    # The +proc+ takes two arguments:
    # * The context object
    # * A hash containing parsing options.
    # The value returned by the +proc+ is then returned by the parsing method.
    # Default:: just returns the context object.
    def after(&proc)
      @after = proc
    end

    # Defines parsing rules inside a parser definition. The ParsingRules
    # methods can then be called inside the proc.
    #
    # The keyed actions, the text-after actions and the separator are
    # merged independently of each other, so a rules section that only
    # calls +set_separator+ and/or +with_text_after+ is honoured too.
    def rules(&proc)
      r = ParsingRules.new
      r.instance_eval(&proc)
      @separator = r.separator if r.separator
      r.actions[:text].each do |key,action|
        @actions[:text][key] = action
      end
      r.actions.each do |key,action|
        # The :text sub-hash was merged above; it is not a keyed action.
        next if key == :text
        @actions[key] = action
      end
    end

    # Extends an existing parser by allowing to redefine rules. The
    # changes in the new parser simply replace the original definitions.
    # After extension, the new parser is independent of the original one,
    # i.e. a change to the original parser will not affect the derived one.
    def extend(&proc)
      derived = Parser.new( @actions, @separator, @before, @begin, @end, @after )
      derived.instance_eval( &proc )
      derived
    end

    # Defines a new parser.
    def self.define( &proc )
      PROTOTYPE.extend( &proc )
    end

    # Parses a file specified by +filename+. An optional hash
    # of arbitrary arguments (+params+) can be specified. It is
    # passed to the workflow methods blocks (+before+, +new_entry+, ...)
    # It returns the value specified in the +after+ block. By default,
    # it returns an array containing _entry_ objects.
    #
    # An entry is finalized only when its separator line is read: lines
    # found after the last separator are parsed but never finalized.
    def parse_file( filename, params={} )
      # Forget the key seen last in a previous run of this parser.
      @last_key = nil
      context = @before.call( params )
      File.open( filename, 'r' ) do |file|
        entry = @begin.call( params )
        file.each_line do |line|
          next unless parse_line( line, entry ) == :end
          @end.call( entry, context, params )
          entry = @begin.call( params )
        end
      end
      @after.call( context, params )
    end

    private

    # Parser from which every parser created by +define+ is derived; it
    # carries the default workflow blocks.
    PROTOTYPE = Parser.new
    PROTOTYPE.instance_eval do
      before { |p| [] }
      new_entry { |p| {} }
      finish_entry {|e,c,p| c << e }
      after {|c,p| c }
    end

    # Dispatches one line to the matching action with +holder+ (the current
    # entry object). Returns +:end+ when the line is the entry separator and
    # +:parsing+ otherwise.
    def parse_line( line, holder )
      # Work on a copy: the caller's string is left untouched.
      line = line.chomp
      if line == @separator
        # Key-less lines of the next entry must not be attributed to the
        # last key of the entry that just ended.
        @last_key = nil
        :end
      elsif line =~ /^(\S+)\s+(.*)$/
        key,value = $1,$2
        @last_key = key
        action = @actions[key]
        action.call( value, holder ) if action
        :parsing
      else
        # Line without a key: handled by the text action of the last key.
        action = @actions[:text][@last_key]
        action.call( line, holder ) if action
        :parsing
      end
    end

  end

end
|