swissparser 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +8 -0
- data/LICENSE +675 -0
- data/README.txt +32 -0
- data/Rakefile +23 -0
- data/examples/data/EColPositives_noTAT.bas +520 -0
- data/examples/data/kegg_enzyme_short.txt +881 -0
- data/examples/data/uniprot.txt +2855 -0
- data/examples/kegg_demo.rb +103 -0
- data/examples/signal_demo.rb +100 -0
- data/examples/uniprot_demo.rb +83 -0
- data/lib/swiss_parser.rb +214 -0
- metadata +76 -0
@@ -0,0 +1,103 @@
|
|
1
|
+
=begin
|
2
|
+
Copyright (C) 2009 Paradigmatic
|
3
|
+
|
4
|
+
This file is part of SwissParser.
|
5
|
+
|
6
|
+
SwissParser is free software: you can redistribute it and/or modify
|
7
|
+
it under the terms of the GNU General Public License as published by
|
8
|
+
the Free Software Foundation, either version 3 of the License, or
|
9
|
+
(at your option) any later version.
|
10
|
+
|
11
|
+
SwissParser is distributed in the hope that it will be useful,
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
GNU General Public License for more details.
|
15
|
+
|
16
|
+
You should have received a copy of the GNU General Public License
|
17
|
+
along with SwissParser. If not, see <http://www.gnu.org/licenses/>.
|
18
|
+
=end
|
19
|
+
|
20
|
+
require 'swiss_parser.rb'
|
21
|
+
require 'yaml'
|
22
|
+
|
23
|
+
# Simple record produced by the KEGG demo parser: one enzyme entry
# together with the gene ids that were collected for it.
class Enzyme

  # Identifier captured from the ENTRY line of the record.
  attr_accessor :id

  # Array of gene id strings gathered from the GENES section.
  attr_accessor :genes

end
|
28
|
+
|
29
|
+
|
30
|
+
enzyme_parser = Swiss::Parser.define do

  # Every entry starts as a hash holding an empty list of gene ids.
  new_entry { |params| { :genes => [] } }

  rules do

    # Appends to entry[:genes] the numeric part of every whitespace
    # separated token of +string+ that looks like "1234(NAME)".
    def parse_gene_ids( string, entry )
      string.split(" ").each do |token|
        gene_id = token[/(\d+)\(\w+\)/, 1]
        entry[:genes] << gene_id if gene_id
      end
    end

    # Organism code whose genes are kept; all other organisms are skipped.
    human = "HSA"

    set_separator( "///" )

    # ENTRY line: keep the dotted four-field identifier (digits or "-").
    with("ENTRY") do |content,entry|
      entry[:id] = content[/((\d+|-)\.(\d+|-)\.(\d+|-)\.(\d+|-))/, 1]
    end

    # First line of the GENES section: "ORG: gene list".
    # The organism is remembered (nil when the line does not match) so
    # that continuation lines know which organism they belong to.
    with("GENES") do |content,entry|
      match = /^([A-Z]+): (.*)/.match( content )
      org, genes = match ? match.captures : [nil, nil]
      entry[:last_organism] = org
      parse_gene_ids( genes, entry ) if org == human
    end

    # Keyless lines following GENES: either a new "ORG: gene list" line
    # or the continuation of the previous organism's gene list.
    with_text_after("GENES") do |content,entry|
      match = /([A-Z]+): (.*)/.match( content )
      if match
        org, genes = match.captures
        entry[:last_organism] = org
        parse_gene_ids( genes, entry ) if org == human
      elsif entry[:last_organism] == human
        parse_gene_ids( content, entry )
      end
    end

  end

  # Only entries with at least one collected gene become Enzyme objects.
  finish_entry do |entry,container,params|
    unless entry[:genes].empty?
      enzyme = Enzyme.new
      enzyme.id = entry[:id]
      enzyme.genes = entry[:genes]
      container << enzyme
    end
  end

end
|
89
|
+
|
90
|
+
|
91
|
+
# Script entry point: parses the file named by the first command line
# argument and prints every resulting Enzyme as YAML.
if $0 == __FILE__

  filename = ARGV.shift
  # Without this guard a missing argument reaches File.open(nil) inside
  # the parser and fails with an unhelpful TypeError.
  abort "Usage: #{$0} <kegg_enzyme_file>" unless filename

  enzymes = enzyme_parser.parse_file( filename )

  enzymes.each do |e|
    puts e.to_yaml
  end

end
|
102
|
+
|
103
|
+
|
@@ -0,0 +1,100 @@
|
|
1
|
+
=begin
|
2
|
+
Copyright (C) 2009 Paradigmatic
|
3
|
+
|
4
|
+
This file is part of SwissParser.
|
5
|
+
|
6
|
+
SwissParser is free software: you can redistribute it and/or modify
|
7
|
+
it under the terms of the GNU General Public License as published by
|
8
|
+
the Free Software Foundation, either version 3 of the License, or
|
9
|
+
(at your option) any later version.
|
10
|
+
|
11
|
+
SwissParser is distributed in the hope that it will be useful,
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
GNU General Public License for more details.
|
15
|
+
|
16
|
+
You should have received a copy of the GNU General Public License
|
17
|
+
along with SwissParser. If not, see <http://www.gnu.org/licenses/>.
|
18
|
+
=end
|
19
|
+
|
20
|
+
require 'swiss_parser.rb'
|
21
|
+
require 'yaml'
|
22
|
+
|
23
|
+
# Record filled by the signal demo parser from the N, C and S lines
# of one entry.
class Protein
  # Content of the entry's N line.
  attr_accessor :name
  # Content of the entry's S line.
  attr_accessor :sequence
  # Integer value of the entry's C line.
  attr_accessor :size
end
|
26
|
+
|
27
|
+
parser = Swiss::Parser.define do

  # Each entry is accumulated into a fresh Protein.
  new_entry { Protein.new }

  rules do

    # Entries in this format end with a line holding a single slash.
    set_separator( '/' )

    # N line: stored verbatim as the name.
    with('N') { |content,entry| entry.name = content }

    # C line: converted to an integer and stored as the size.
    with('C') { |content,entry| entry.size = content.to_i }

    # S line: stored verbatim as the sequence.
    with('S') { |content,entry| entry.sequence = content }

  end

end
|
52
|
+
|
53
|
+
|
54
|
+
# Variant of +parser+ that, instead of returning the entries, returns a
# hash of statistics about their sizes:
# :min, :max, :sum, :n and :average.
stat_parser = parser.extend do

  # The context is the statistics hash. :min and :max start as nil
  # ("nothing seen yet") rather than as magic numbers, so that the
  # reported extremes are always sizes that really occurred in the file.
  before do |params|
    { :min => nil, :max => nil, :sum => 0, :n => 0 }
  end

  # Folds the size of each finished entry into the statistics.
  finish_entry do |entry,h,params|
    size = entry.size
    h[:min] = size if h[:min].nil? || size < h[:min]
    h[:max] = size if h[:max].nil? || size > h[:max]
    h[:sum] += size
    h[:n] += 1
  end

  # Adds the average and returns the statistics hash as parsing result.
  # With no entries the average is left nil instead of dividing by zero.
  after do |h,params|
    h[:average] = h[:n] > 0 ? h[:sum].to_f / h[:n] : nil
    h
  end

end
|
77
|
+
|
78
|
+
|
79
|
+
# Script entry point: parses the file named by the first command line
# argument twice, first listing every entry as YAML and then printing
# the size statistics computed by +stat_parser+.
if $0 == __FILE__

  filename = ARGV.shift
  # Without this guard a missing argument reaches File.open(nil) inside
  # the parser and fails with an unhelpful TypeError.
  abort "Usage: #{$0} <signal_file>" unless filename

  entries = parser.parse_file( filename )

  entries.each do |e|
    puts e.to_yaml
  end

  puts

  results = stat_parser.parse_file( filename )

  puts "Min: #{results[:min]}"
  puts "Max: #{results[:max]}"
  puts "Average: #{results[:average]}"
  puts "Size: #{results[:n]}"

end
|
99
|
+
|
100
|
+
|
@@ -0,0 +1,83 @@
|
|
1
|
+
=begin
|
2
|
+
Copyright (C) 2009 Paradigmatic
|
3
|
+
|
4
|
+
This file is part of SwissParser.
|
5
|
+
|
6
|
+
SwissParser is free software: you can redistribute it and/or modify
|
7
|
+
it under the terms of the GNU General Public License as published by
|
8
|
+
the Free Software Foundation, either version 3 of the License, or
|
9
|
+
(at your option) any later version.
|
10
|
+
|
11
|
+
SwissParser is distributed in the hope that it will be useful,
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
GNU General Public License for more details.
|
15
|
+
|
16
|
+
You should have received a copy of the GNU General Public License
|
17
|
+
along with SwissParser. If not, see <http://www.gnu.org/licenses/>.
|
18
|
+
=end
|
19
|
+
|
20
|
+
#!/usr/bin/ruby -w
|
21
|
+
|
22
|
+
require 'yaml'
|
23
|
+
require 'swiss_parser.rb'
|
24
|
+
|
25
|
+
# Record filled by the UniProt demo parser from the ID, OS, OC and SQ
# sections of one entry.
class Protein

  # Values taken from the ID line (identifier and integer size).
  attr_accessor :id, :size
  # Value taken from the OS line.
  attr_accessor :species
  # Array of names accumulated from the OC lines.
  attr_accessor :taxonomy
  # String accumulated from the text lines following SQ.
  attr_accessor :sequence

  # Starts with an empty taxonomy list and an empty sequence so the
  # parser rules can append to both without nil checks.
  def initialize
    @taxonomy, @sequence = [], ""
  end

end
|
35
|
+
|
36
|
+
|
37
|
+
# Parser for UniProt flat files. Each entry becomes a Protein carrying
# its id, size, species, taxonomy and sequence.
uniprot_parser = Swiss::Parser.define do

  new_entry do
    Protein.new
  end

  rules do

    # ID line: first capitalised word is the id, the first number that
    # follows is the size (0 when the line does not match).
    with("ID") do |content,protein|
      content =~ /([A-Z]\w+)\D+(\d+)/
      protein.id = $1
      protein.size = $2.to_i
    end

    # OS line: keep the first two space-separated words.
    with("OS") do |content,protein|
      content =~ /(\w+ \w+)/
      protein.species = $1
    end

    # OC line: list of names separated by ";". An OC line that is
    # continued on the next line ends with ";", so splitting on "; "
    # would leave that semicolon glued to the last name. Split on ";"
    # alone, then trim each name and drop the empty trailing piece.
    with("OC") do |content,protein|
      names = content.delete(".").split(";").map { |name| name.strip }
      protein.taxonomy += names.reject { |name| name.empty? }
    end

    # Keyless lines after SQ: sequence chunks with embedded spaces.
    with_text_after("SQ") do |content,protein|
      seq = content.strip.delete(" ")
      protein.sequence += seq
    end

  end

end
|
69
|
+
|
70
|
+
|
71
|
+
# Script entry point: parses the file named by the first command line
# argument, prints the number of entries and then each entry as YAML.
if $0 == __FILE__

  filename = ARGV.shift
  # Without this guard a missing argument reaches File.open(nil) inside
  # the parser and fails with an unhelpful TypeError.
  abort "Usage: #{$0} <uniprot_file>" unless filename

  entries = uniprot_parser.parse_file( filename )

  puts entries.size

  entries.each do |e|
    puts e.to_yaml
  end

end
|
data/lib/swiss_parser.rb
ADDED
@@ -0,0 +1,214 @@
|
|
1
|
+
=begin
|
2
|
+
Copyright (C) 2009 Paradigmatic
|
3
|
+
|
4
|
+
This file is part of SwissParser.
|
5
|
+
|
6
|
+
SwissParser is free software: you can redistribute it and/or modify
|
7
|
+
it under the terms of the GNU General Public License as published by
|
8
|
+
the Free Software Foundation, either version 3 of the License, or
|
9
|
+
(at your option) any later version.
|
10
|
+
|
11
|
+
SwissParser is distributed in the hope that it will be useful,
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
GNU General Public License for more details.
|
15
|
+
|
16
|
+
You should have received a copy of the GNU General Public License
|
17
|
+
along with SwissParser. If not, see <http://www.gnu.org/licenses/>.
|
18
|
+
=end
|
19
|
+
|
20
|
+
|
21
|
+
|
22
|
+
module Swiss

  VERSION = "0.5.1"

  # This class defines parsing rules. Its methods
  # are accessible within the +rules+ section of
  # a parser definition.
  class ParsingRules

    # +separator+ is nil until +set_separator+ is called.
    # +actions+ maps each key to its proc; the special key +:text+
    # holds a nested hash mapping keys to the procs handling the
    # keyless lines that follow them.
    attr_reader :separator, :actions

    # *Do* *not* create directly this class but access it
    # through a +rules+ section in a parser definition.
    def initialize
      @actions = { :text => {} }
    end

    # Sets the entry separator line. Default: "//"
    def set_separator(string)
      @separator = string
    end

    # Defines how to parse a line starting with +key+. The +proc+
    # takes two arguments:
    # * the rest of the line
    # * the entry object
    def with( key, &proc )
      @actions[key] = proc
    end

    # Defines how to parse a line without key coming *after*
    # a specified key. The +proc+ takes two arguments:
    # * the whole line
    # * the entry object
    def with_text_after( key, &proc )
      @actions[:text][key] = proc
    end

  end


  # Parser for a typical bioinformatic flat file.
  class Parser

    # Default entry separator
    DEFAULT_SEPARATOR = "//"

    # *Do* *not* *use* this method to instantiate a parser. Use rather
    # the +define+ class method.
    #
    # Accepts either no argument (empty parser with the default
    # separator and no workflow procs) or exactly six: the actions
    # hash, the separator and the +before+, +new_entry+,
    # +finish_entry+ and +after+ procs. The actions hash and its
    # +:text+ sub-hash are copied so the new parser can be modified
    # without affecting the one it was built from.
    # Raises a RuntimeError for any other number of arguments.
    def initialize(*args)
      if args.size == 0
        @separator = DEFAULT_SEPARATOR
        @actions = {}
        @actions[:text] = {}
      elsif args.size == 6
        actions,separator,before,the_begin,the_end,after = *args
        @actions = actions.clone
        @actions[:text] = actions[:text].clone
        @separator = separator
        @before = before
        @end = the_end
        @begin = the_begin
        @after = after
      else
        raise "Wrong arg number, either 0 or 6."
      end
    end

    # Defines how to create the _entry_ _object_. The +proc+
    # takes a single argument which is a hash containing
    # parsing options. It must return a new _entry_ _object_.
    # Default:: creates an empty hash.
    def new_entry(&proc)
      @begin = proc
    end

    # Defines how to finalize an _entry_ _object_. The +proc+
    # takes three arguments:
    # * The entry object ready to be finalized
    # * The context object
    # * A hash containing parsing options.
    # Default:: Adds the entry object to the context object using +<<+ method.
    def finish_entry(&proc)
      @end = proc
    end

    # Defines how to set the context before using the parser.
    # The +proc+ takes a single argument which is a hash containing
    # parsing options. It must return a _context_ object.
    # Default:: creates an empty array
    def before(&proc)
      @before = proc
    end

    # Defines how to finalize the whole parsing.
    # The +proc+ takes two arguments:
    # * The context object
    # * A hash containing parsing options.
    # The value returned by the +proc+ is then returned by the parsing method.
    # Default:: just returns the context object.
    def after(&proc)
      @after = proc
    end

    # Defines parsing rules inside a parser definition. The ParsingRules
    # methods can then be called inside the proc.
    #
    # The keyed actions, the text actions and the separator are merged
    # independently of each other, so a +rules+ section that only
    # contains +set_separator+ and/or +with_text_after+ calls is
    # honoured as well.
    def rules(&proc)
      r = ParsingRules.new
      r.instance_eval(&proc)
      r.actions.each do |key,action|
        # :text is the nested hash of text actions, merged below.
        next if key == :text
        @actions[key] = action
      end
      r.actions[:text].each do |key,action|
        @actions[:text][key] = action
      end
      @separator = r.separator if r.separator
    end

    # Extends an existing parser by allowing to redefine rules. The
    # changes in the new parser simply replace the original definitions.
    # After extension, the new parser is independent of the original one,
    # i.e. a change to the original parser will not affect the derived one.
    def extend(&proc)
      derived = Parser.new( @actions, @separator, @before, @begin, @end, @after )
      derived.instance_eval( &proc )
      derived
    end

    # Defines a new parser.
    def self.define( &proc )
      PROTOTYPE.extend( &proc )
    end

    # Parses a file specified by +filename+. An optional hash
    # of arbitrary arguments (+params+) can be specified. It is
    # passed to the workflow methods blocks (+before+, +new_entry+, ...)
    # It returns the value specified in the +after+ block. By default,
    # it returns an array containing _entry_ objects.
    #
    # An entry is finalized only when its separator line is read:
    # lines following the last separator of the file are parsed but
    # never passed to the +finish_entry+ block.
    def parse_file( filename, params={} )
      # Forget the key left over from a previous run of this parser.
      @last_key = nil
      context = @before.call( params )
      File.open( filename, 'r' ) do |file|
        entry = @begin.call( params )
        file.each_line do |line|
          if parse_line( line, entry ) == :end
            @end.call( entry, context, params )
            entry = @begin.call( params )
          end
        end
      end
      @after.call( context, params )
    end

    # Parser holding the default workflow; every parser created with
    # +define+ is an extension of it.
    PROTOTYPE = Parser.new
    PROTOTYPE.instance_eval do
      before { |p| [] }
      new_entry { |p| {} }
      finish_entry {|e,c,p| c << e }
      after {|c,p| c }
    end

    private

    # Dispatches a single line to the matching action and returns
    # +:end+ when the line is the entry separator, +:parsing+ otherwise.
    # * A line made of a key, whitespace and a value goes to the action
    #   registered with +with+ for that key (if any).
    # * Any other line goes to the +with_text_after+ action registered
    #   for the last key seen in the current entry (if any).
    # The argument is not modified.
    def parse_line( line, holder )
      line = line.chomp
      if line == @separator
        # Keyless lines of the next entry must not be attributed to
        # the last key of the entry that just ended.
        @last_key = nil
        :end
      elsif line =~ /^(\S+)\s+(.*)$/
        key,value = $1,$2
        @last_key = key
        action = @actions[key]
        action.call( value, holder ) if action
        :parsing
      else
        action = @actions[:text][@last_key]
        action.call( line, holder ) if action
        :parsing
      end
    end

  end

end
|