swissparser 0.5.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,103 @@
1
+ =begin
2
+ Copyright (C) 2009 Paradigmatic
3
+
4
+ This file is part of SwissParser.
5
+
6
+ SwissParser is free software: you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation, either version 3 of the License, or
9
+ (at your option) any later version.
10
+
11
+ SwissParser is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with SwissParser. If not, see <http://www.gnu.org/licenses/>.
18
+ =end
19
+
20
+ require 'swiss_parser.rb'
21
+ require 'yaml'
22
+
23
# Plain record produced by the enzyme parser for every entry that has
# at least one collected gene.
class Enzyme

  # Identifier captured from the ENTRY line (four dot-separated fields,
  # each either digits or "-").
  attr_accessor :id

  # Array of gene id strings collected from the GENES section.
  attr_accessor :genes

end
28
+
29
+
30
# Parser for an enzyme flat file whose entries are separated by "///".
# For each entry it keeps the identifier found on the ENTRY line and the
# gene ids listed for the organism "HSA" in the GENES section; entries
# without such genes are discarded.
enzyme_parser = Swiss::Parser.define do

  # Every entry starts as a hash holding an empty gene list.
  new_entry { |params| { :genes => [] } }

  rules do

    human = "HSA"

    # Appends to entry[:genes] the number in front of the parenthesis
    # for every whitespace-separated token shaped like "1234(NAME)".
    collect_gene_ids = lambda do |string, entry|
      string.split(" ").each do |token|
        found = token.match(/(\d+)\(\w+\)/)
        entry[:genes] << found[1] if found
      end
    end

    set_separator( "///" )

    # ENTRY line: keep the dotted identifier, or nil when none is found.
    with("ENTRY") do |content, entry|
      found = content.match(/((\d+|-)\.(\d+|-)\.(\d+|-)\.(\d+|-))/)
      entry[:id] = found ? found[1] : nil
    end

    # First line of the GENES section: "ORG: genes...". The organism is
    # remembered (nil when the line does not match) so that continuation
    # lines know which organism they belong to.
    with("GENES") do |content, entry|
      found = content.match(/^([A-Z]+): (.*)/)
      organism = found ? found[1] : nil
      entry[:last_organism] = organism
      collect_gene_ids.call( found[2], entry ) if organism == human
    end

    # Following lines of the GENES section: either a new "ORG: genes..."
    # line, or more genes for the organism seen last.
    with_text_after("GENES") do |content, entry|
      found = content.match(/([A-Z]+): (.*)/)
      if found
        entry[:last_organism] = found[1]
        collect_gene_ids.call( found[2], entry ) if found[1] == human
      elsif entry[:last_organism] == human
        collect_gene_ids.call( content, entry )
      end
    end

  end

  # Only entries with at least one collected gene become Enzyme objects.
  finish_entry do |entry, container, params|
    unless entry[:genes].empty?
      enzyme = Enzyme.new
      enzyme.id = entry[:id]
      enzyme.genes = entry[:genes]
      container << enzyme
    end
  end

end
89
+
90
+
91
# Script entry point: parse the file named by the first command-line
# argument and print every resulting enzyme as YAML.
if __FILE__ == $0

  input_path = ARGV.shift

  enzyme_parser.parse_file( input_path ).each { |enzyme| puts enzyme.to_yaml }

end
102
+
103
+
@@ -0,0 +1,100 @@
1
+ =begin
2
+ Copyright (C) 2009 Paradigmatic
3
+
4
+ This file is part of SwissParser.
5
+
6
+ SwissParser is free software: you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation, either version 3 of the License, or
9
+ (at your option) any later version.
10
+
11
+ SwissParser is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with SwissParser. If not, see <http://www.gnu.org/licenses/>.
18
+ =end
19
+
20
+ require 'swiss_parser.rb'
21
+ require 'yaml'
22
+
23
# Entry object filled by the tutorial parser: +name+ comes from the 'N'
# line, +size+ (an Integer) from the 'C' line and +sequence+ from the
# 'S' line.
class Protein
  attr_accessor :name
  attr_accessor :sequence
  attr_accessor :size
end
26
+
27
# Tutorial parser: entries are separated by a line containing only "/"
# and each entry becomes a Protein.
parser = Swiss::Parser.define do

  # A fresh Protein per entry.
  new_entry { Protein.new }

  rules do

    set_separator '/'

    # 'N' line: protein name, stored verbatim.
    with('N') { |content, entry| entry.name = content }

    # 'C' line: size, converted with String#to_i.
    with('C') { |content, entry| entry.size = content.to_i }

    # 'S' line: sequence, stored verbatim.
    with('S') { |content, entry| entry.sequence = content }

  end

end
52
+
53
+
54
# Variant of +parser+ that, instead of returning the entries, returns a
# hash of statistics over their sizes:
# * :min, :max  - smallest / largest size seen
# * :sum, :n    - total size and number of entries
# * :average    - :sum divided by :n (NaN when no entry was parsed)
stat_parser = parser.extend do

  # The context is the statistics hash. :min starts as nil rather than
  # an arbitrary large number, so that it is always a size that was
  # really observed (it stays nil when the file has no entry).
  # NOTE(review): :max still starts at 0, which assumes sizes are never
  # negative - confirm against the input format.
  before do |params|
    { :min => nil, :max => 0, :sum => 0, :n => 0 }
  end

  # Folds one entry into the running statistics.
  finish_entry do |entry, h, params|
    if h[:min].nil? || entry.size < h[:min]
      h[:min] = entry.size
    end
    if entry.size > h[:max]
      h[:max] = entry.size
    end
    h[:sum] += entry.size
    h[:n] += 1
  end

  # Adds the average and returns the hash as the parsing result.
  after do |h, params|
    h[:average] = h[:sum].to_f / h[:n]
    h
  end

end
77
+
78
+
79
# Script entry point: dump every protein of the file given as first
# command-line argument as YAML, then print the size statistics
# computed by +stat_parser+ on the same file.
if __FILE__ == $0

  input_path = ARGV.shift

  parser.parse_file( input_path ).each { |protein| puts protein.to_yaml }

  puts

  stats = stat_parser.parse_file( input_path )

  puts "Min: #{stats[:min]}"
  puts "Max: #{stats[:max]}"
  puts "Average: #{stats[:average]}"
  puts "Size: #{stats[:n]}"

end
99
+
100
+
@@ -0,0 +1,83 @@
1
+ =begin
2
+ Copyright (C) 2009 Paradigmatic
3
+
4
+ This file is part of SwissParser.
5
+
6
+ SwissParser is free software: you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation, either version 3 of the License, or
9
+ (at your option) any later version.
10
+
11
+ SwissParser is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with SwissParser. If not, see <http://www.gnu.org/licenses/>.
18
+ =end
19
+
20
+ #!/usr/bin/ruby -w
21
+
22
+ require 'yaml'
23
+ require 'swiss_parser.rb'
24
+
25
# Entry object filled by the UniProt example parser.
class Protein

  # +id+ and +size+ come from the ID line, +species+ from the OS line,
  # +taxonomy+ accumulates the OC lines and +sequence+ the text that
  # follows the SQ line.
  attr_accessor :id, :size, :species
  attr_accessor :taxonomy, :sequence

  # Starts with an empty taxonomy list and an empty sequence so the
  # parser can append to both.
  def initialize
    @taxonomy, @sequence = [], ""
  end

end
35
+
36
+
37
# Parser turning each entry of a UniProt-style flat file into a Protein.
# It relies on the default entry separator of Swiss::Parser.
uniprot_parser = Swiss::Parser.define do

  new_entry do
    Protein.new
  end

  rules do

    # ID line: the first whitespace-delimited token is the entry name
    # and the first number after it is the size. The pattern is anchored
    # on the start of the content so that a name beginning with digits
    # (e.g. "1433B_HUMAN") is kept whole instead of being cut down to
    # the part starting at its first capital letter.
    # When the line does not match, id is nil and size is 0.
    with("ID") do |content, protein|
      found = content.match(/\A(\S+)\s+\D*?(\d+)/)
      protein.id = found ? found[1] : nil
      protein.size = found ? found[2].to_i : 0
    end

    # OS line: keep the first two space-separated words.
    with("OS") do |content, protein|
      found = content.match(/(\w+ \w+)/)
      protein.species = found ? found[1] : nil
    end

    # OC line: a ";"-separated list of taxa, possibly ending with ";"
    # (more lines follow) or "." (last line). Splitting on ";" alone and
    # stripping each item avoids leaving a trailing ";" glued to the
    # last taxon of a line.
    with("OC") do |content, protein|
      taxa = content.gsub(".", "").split(";").map { |taxon| taxon.strip }
      protein.taxonomy += taxa.reject { |taxon| taxon.empty? }
    end

    # Lines following SQ hold the sequence in blank-separated chunks.
    with_text_after("SQ") do |content, protein|
      seq = content.strip.gsub(" ", "")
      protein.sequence += seq
    end

  end

end
69
+
70
+
71
# Script entry point: parse the file given as first command-line
# argument, print how many entries it holds, then each entry as YAML.
if __FILE__ == $0

  proteins = uniprot_parser.parse_file( ARGV.shift )

  puts proteins.size

  proteins.each { |protein| puts protein.to_yaml }

end
@@ -0,0 +1,214 @@
1
+ =begin
2
+ Copyright (C) 2009 Paradigmatic
3
+
4
+ This file is part of SwissParser.
5
+
6
+ SwissParser is free software: you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation, either version 3 of the License, or
9
+ (at your option) any later version.
10
+
11
+ SwissParser is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with SwissParser. If not, see <http://www.gnu.org/licenses/>.
18
+ =end
19
+
20
+
21
+
22
module Swiss

  VERSION = "0.5.1"

  # This class defines parsing rules. Its methods
  # are accessible within the +rules+ section of
  # a parser definition.
  class ParsingRules

    # +separator+:: the entry separator given to +set_separator+,
    #               or +nil+ when none was set.
    # +actions+::   hash mapping a line key to its proc; the +:text+
    #               slot holds a nested hash with the procs registered
    #               through +with_text_after+.
    attr_reader :separator, :actions

    # *Do* *not* create this class directly but access it
    # through a +rules+ section in a parser definition.
    def initialize
      @actions = { :text => {} }
    end

    # Sets the entry separator line. Default: "//"
    def set_separator(string)
      @separator = string
    end

    # Defines how to parse a line starting with +key+. The +proc+
    # takes two arguments:
    # * the rest of the line
    # * the entry object
    def with( key, &proc )
      @actions[key] = proc
    end

    # Defines how to parse a line without key coming *after*
    # a specified key. The +proc+ takes two arguments:
    # * the whole line
    # * the entry object
    def with_text_after( key, &proc )
      @actions[:text][key] = proc
    end

  end


  # Parser for a typical bioinformatic flat file.
  class Parser

    # Default entry separator
    DEFAULT_SEPARATOR = "//"

    # *Do* *not* *use* this method to instantiate a parser. Use rather
    # the +define+ class method.
    #
    # It accepts either no argument (empty parser with the default
    # separator) or exactly six: actions, separator and the +before+,
    # +new_entry+, +finish_entry+ and +after+ procs, in that order.
    # Any other argument count raises a RuntimeError.
    def initialize(*args)
      if args.size == 0
        @separator = DEFAULT_SEPARATOR
        @actions = {}
        @actions[:text] = {}
      elsif args.size == 6
        actions, separator, before, the_begin, the_end, after = *args
        # Copy both levels of the action table so that the new parser
        # does not share rule hashes with the one it derives from.
        @actions = actions.clone
        @actions[:text] = actions[:text].clone
        @separator = separator
        @before = before
        @begin = the_begin
        @end = the_end
        @after = after
      else
        raise "Wrong arg number, either 0 or 6."
      end
    end

    # Defines how to create the _entry_ _object_. The +proc+
    # takes a single argument which is a hash containing
    # parsing options. It must return a new _entry_ _object_.
    # Default:: creates an empty hash.
    def new_entry(&proc)
      @begin = proc
    end

    # Defines how to finalize an _entry_ _object_. The +proc+
    # takes three arguments:
    # * The entry object ready to be finalized
    # * The context object
    # * A hash containing parsing options.
    # Default:: Adds the entry object to the context object using +<<+ method.
    def finish_entry(&proc)
      @end = proc
    end

    # Defines how to set the context before using the parser.
    # The +proc+ takes a single argument which is a hash containing
    # parsing options. It must return a _context_ object.
    # Default:: creates an empty array
    def before(&proc)
      @before = proc
    end

    # Defines how to finalize the whole parsing.
    # The +proc+ takes two arguments:
    # * The context object
    # * A hash containing parsing options.
    # The value returned by the +proc+ is then returned by the parsing method.
    # Default:: just returns the context object.
    def after(&proc)
      @after = proc
    end

    # Defines parsing rules inside a parser definition. The ParsingRules
    # methods can then be called inside the proc.
    #
    # Keyed actions, "text after key" actions and the separator are
    # merged independently of each other, so a +rules+ section that only
    # calls +with_text_after+ and/or +set_separator+ is honoured too.
    def rules(&proc)
      r = ParsingRules.new
      r.instance_eval(&proc)
      r.actions.each do |key, action|
        # The :text slot is a nested hash, merged separately below.
        next if key == :text
        @actions[key] = action
      end
      r.actions[:text].each do |key, action|
        @actions[:text][key] = action
      end
      @separator = r.separator if r.separator
    end

    # Extends an existing parser by allowing to redefine rules. The
    # changes in the new parser simply replace the original definitions.
    # After extension, the new parser is independent of the original one,
    # i.e. a change to the original parser will not affect the derived one.
    #
    # Note that this method takes the place of Object#extend for Parser
    # instances.
    def extend(&proc)
      copy = Parser.new( @actions, @separator, @before, @begin, @end, @after )
      copy.instance_eval( &proc )
      copy
    end

    # Defines a new parser.
    def self.define( &proc )
      PROTOTYPE.extend( &proc )
    end

    # Parses a file specified by +filename+. An optional hash
    # of arbitrary arguments (+params+) can be specified. It is
    # passed to the workflow methods blocks (+before+, +new_entry+, ...)
    # It returns the value specified in the +after+ block. By default,
    # it returns an array containing _entry_ objects.
    #
    # An entry is finalized only when its separator line is read: lines
    # found after the last separator are parsed but never handed to the
    # +finish_entry+ block.
    def parse_file( filename, params={} )
      # Do not let a key remembered by a previous run leak into this one.
      @last_key = nil
      context = @before.call( params )
      File.open( filename, 'r' ) do |file|
        entry = @begin.call( params )
        file.each_line do |line|
          next unless parse_line( line, entry ) == :end
          @end.call( entry, context, params )
          entry = @begin.call( params )
          # A key-less line opening the next entry must not be routed
          # to the text action of the previous entry's last key.
          @last_key = nil
        end
      end
      @after.call( context, params )
    end

    # Parser from which every parser created with +define+ derives. It
    # carries the default workflow blocks documented above.
    PROTOTYPE = Parser.new
    PROTOTYPE.instance_eval do
      before { |p| [] }
      new_entry { |p| {} }
      finish_entry { |e, c, p| c << e }
      after { |c, p| c }
    end

    private

    # Handles a single line for the entry object +holder+ and returns
    # +:end+ when the line is the entry separator, +:parsing+ otherwise.
    #
    # A line made of a key, whitespace and a (possibly empty) rest is
    # dispatched to the action registered for that key, if any, and the
    # key is remembered. Any other line is dispatched, whole, to the
    # "text after" action of the last remembered key, if any.
    def parse_line( line, holder )
      # Work on a copy: the caller's string is left untouched.
      line = line.chomp
      return :end if line == @separator
      if line =~ /^(\S+)\s+(.*)$/
        key, value = $1, $2
        @last_key = key
        action = @actions[key]
        action.call( value, holder ) if action
      else
        text_action = @actions[:text][@last_key]
        text_action.call( line, holder ) if text_action
      end
      :parsing
    end

  end

end