swissparser 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,103 @@
1
+ =begin
2
+ Copyright (C) 2009 Paradigmatic
3
+
4
+ This file is part of SwissParser.
5
+
6
+ SwissParser is free software: you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation, either version 3 of the License, or
9
+ (at your option) any later version.
10
+
11
+ SwissParser is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with SwissParser. If not, see <http://www.gnu.org/licenses/>.
18
+ =end
19
+
20
+ require 'swiss_parser.rb'
21
+ require 'yaml'
22
+
23
# Plain value holder for one parsed enzyme record.
class Enzyme

  # Identifier assigned by the parser's finish_entry block.
  attr_accessor :id

  # Gene ids collected for this enzyme.
  attr_accessor :genes

end
28
+
29
+
30
# Parser definition that keeps, for every "///"-separated entry, the id
# found on the ENTRY line and the gene ids listed for organism "HSA" in the
# GENES section. Entries without any such gene are not added to the result.
enzyme_parser = Swiss::Parser.define do

  # Each entry is accumulated in a hash that starts with an empty gene list.
  new_entry { |params| { :genes => [] } }

  rules do

    # Splits +string+ on spaces and, for each token containing digits
    # followed by a parenthesised word, appends those digits to
    # entry[:genes].
    def parse_gene_ids( string, entry )
      string.split(" ").each do |item|
        gene_id = item[/(\d+)\(\w+\)/, 1]
        entry[:genes] << gene_id if gene_id
      end
    end

    human = "HSA"

    # Remembers the organism of the line just seen and harvests its genes
    # when that organism is the one we are interested in.
    record_organism = lambda do |org, genes, entry|
      entry[:last_organism] = org
      parse_gene_ids( genes, entry ) if org == human
    end

    set_separator( "///" )

    with("ENTRY") do |content,entry|
      entry[:id] = content[/((\d+|-)\.(\d+|-)\.(\d+|-)\.(\d+|-))/, 1]
    end

    # First line of the GENES section: "<ORG>: <genes...>".
    with("GENES") do |content,entry|
      org = genes = nil
      org, genes = $1, $2 if content =~ /^([A-Z]+): (.*)/
      record_organism.call( org, genes, entry )
    end

    # Following key-less lines either start a new organism or continue the
    # gene list of the previous one.
    with_text_after("GENES") do |content,entry|
      if content =~ /([A-Z]+): (.*)/
        record_organism.call( $1, $2, entry )
      elsif entry[:last_organism] == human
        parse_gene_ids( content, entry )
      end
    end

  end

  # Only entries that collected at least one gene become Enzyme objects.
  finish_entry do |entry,container,params|
    next if entry[:genes].empty?
    enzyme = Enzyme.new
    enzyme.id = entry[:id]
    enzyme.genes = entry[:genes]
    container << enzyme
  end

end
89
+
90
+
91
# When run as a script: parse the file named by the first command-line
# argument and dump every resulting enzyme as YAML.
if __FILE__ == $0
  filename = ARGV.shift
  enzyme_parser.parse_file( filename ).each { |enzyme| puts enzyme.to_yaml }
end
102
+
103
+
@@ -0,0 +1,100 @@
1
+ =begin
2
+ Copyright (C) 2009 Paradigmatic
3
+
4
+ This file is part of SwissParser.
5
+
6
+ SwissParser is free software: you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation, either version 3 of the License, or
9
+ (at your option) any later version.
10
+
11
+ SwissParser is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with SwissParser. If not, see <http://www.gnu.org/licenses/>.
18
+ =end
19
+
20
+ require 'swiss_parser.rb'
21
+ require 'yaml'
22
+
23
# Plain value holder filled in by the parser rules of this example.
class Protein
  # Content of the 'N' line.
  attr_accessor :name
  # Content of the 'S' line.
  attr_accessor :sequence
  # Content of the 'C' line, converted to an integer.
  attr_accessor :size
end
26
+
27
# Parser for a minimal format: entries are separated by a "/" line and
# carry a name (N), a size (C) and a sequence (S).
parser = Swiss::Parser.define do

  # Every entry is collected into a fresh Protein.
  new_entry { Protein.new }

  rules do
    set_separator '/'

    with('N') { |content,entry| entry.name = content }
    with('C') { |content,entry| entry.size = content.to_i }
    with('S') { |content,entry| entry.sequence = content }
  end

end
52
+
53
+
54
# Variant of +parser+ that, instead of returning the entries, returns a hash
# of statistics over their sizes: :min, :max, :sum, :n and :average.
stat_parser = parser.extend do

  # The context is the statistics hash. :min and :max start as nil rather
  # than as sentinel numbers, so the first entry always initialises them
  # (a fixed seed such as 1000 would be reported as the minimum whenever
  # every entry is larger than it).
  before do |params|
    { :min => nil, :max => nil, :sum => 0, :n => 0 }
  end

  # Folds the size of the finished entry into the running statistics.
  finish_entry do |entry,h,params|
    size = entry.size
    h[:min] = size if h[:min].nil? || size < h[:min]
    h[:max] = size if h[:max].nil? || size > h[:max]
    h[:sum] += size
    h[:n] += 1
  end

  # Adds the average and returns the statistics hash as the parse result.
  # With no entries :min and :max stay nil and the float division yields NaN.
  after do |h,params|
    h[:average] = h[:sum].to_f / h[:n]
    h
  end

end
77
+
78
+
79
# When run as a script: dump every entry of the file given as first
# argument as YAML, then print the size statistics for the same file.
if __FILE__ == $0
  filename = ARGV.shift

  parser.parse_file( filename ).each { |protein| puts protein.to_yaml }

  puts

  results = stat_parser.parse_file( filename )

  puts "Min: #{results[:min]}"
  puts "Max: #{results[:max]}"
  puts "Average: #{results[:average]}"
  puts "Size: #{results[:n]}"
end
99
+
100
+
@@ -0,0 +1,83 @@
1
+ =begin
2
+ Copyright (C) 2009 Paradigmatic
3
+
4
+ This file is part of SwissParser.
5
+
6
+ SwissParser is free software: you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation, either version 3 of the License, or
9
+ (at your option) any later version.
10
+
11
+ SwissParser is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with SwissParser. If not, see <http://www.gnu.org/licenses/>.
18
+ =end
19
+
20
+ #!/usr/bin/ruby -w
21
+
22
+ require 'yaml'
23
+ require 'swiss_parser.rb'
24
+
25
# Value holder for one parsed protein entry.
class Protein

  # Set from the ID line by the parser rules.
  attr_accessor :id, :size
  # Set from the OS line by the parser rules.
  attr_accessor :species
  # Accumulated across OC lines; starts empty.
  attr_accessor :taxonomy
  # Accumulated across the lines following SQ; starts empty.
  attr_accessor :sequence

  # Starts with an empty taxonomy list and an empty sequence so the parser
  # rules can append to them.
  def initialize
    @taxonomy, @sequence = [], ""
  end

end
35
+
36
+
37
# Parser definition that fills a Protein from the ID, OS and OC lines and
# from the key-less lines that follow SQ. The default entry separator is
# used since no separator is set here.
uniprot_parser = Swiss::Parser.define do

  new_entry { Protein.new }

  rules do

    # ID line: an identifier starting with a capital letter, then the first
    # run of digits that follows it, stored as the size.
    with("ID") do |content,protein|
      match = /([A-Z]\w+)\D+(\d+)/.match( content )
      protein.id = match && match[1]
      protein.size = match ? match[2].to_i : 0
    end

    # OS line: keep the first two space-separated words.
    with("OS") do |content,protein|
      protein.species = content[/(\w+ \w+)/, 1]
    end

    # OC lines: "; "-separated names, dots removed, appended in order.
    with("OC") do |content,protein|
      protein.taxonomy = protein.taxonomy + content.delete(".").split("; ")
    end

    # Lines after SQ: strip and drop the spaces, then append to the
    # sequence collected so far.
    with_text_after("SQ") do |content,protein|
      protein.sequence = protein.sequence + content.strip.delete(" ")
    end

  end

end
69
+
70
+
71
# When run as a script: parse the file given as first argument, print the
# number of entries and then each entry as YAML.
if __FILE__ == $0
  entries = uniprot_parser.parse_file( ARGV.shift )

  puts entries.size

  entries.each { |protein| puts protein.to_yaml }
end
@@ -0,0 +1,214 @@
1
+ =begin
2
+ Copyright (C) 2009 Paradigmatic
3
+
4
+ This file is part of SwissParser.
5
+
6
+ SwissParser is free software: you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation, either version 3 of the License, or
9
+ (at your option) any later version.
10
+
11
+ SwissParser is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with SwissParser. If not, see <http://www.gnu.org/licenses/>.
18
+ =end
19
+
20
+
21
+
22
module Swiss

  VERSION = "0.5.1"

  # This class collects parsing rules. Its methods are accessible within
  # the +rules+ section of a parser definition.
  class ParsingRules

    # +separator+ is nil until +set_separator+ is called. +actions+ maps a
    # line key to its proc; the special :text entry maps a key to the proc
    # handling the key-less lines that follow it.
    attr_reader :separator, :actions

    # *Do* *not* create this class directly but access it
    # through a +rules+ section in a parser definition.
    def initialize
      @actions = { :text => {} }
    end

    # Sets the entry separator line. When it is never called the parser
    # keeps its current separator (default: "//").
    def set_separator(string)
      @separator = string
    end

    # Defines how to parse a line starting with +key+. The +proc+
    # takes two arguments:
    # * the rest of the line
    # * the entry object
    def with(key, &proc)
      @actions[key] = proc
    end

    # Defines how to parse a line without key coming *after*
    # a specified key. The +proc+ takes two arguments:
    # * the whole line
    # * the entry object
    def with_text_after(key, &proc)
      @actions[:text][key] = proc
    end

  end


  # Parser for a typical bioinformatic flat file.
  class Parser

    # Default entry separator
    DEFAULT_SEPARATOR = "//"

    # *Do* *not* *use* this method to instantiate a parser. Use rather
    # the +define+ class method.
    #
    # Accepts either no argument (empty parser with the default separator)
    # or exactly six: actions, separator, before, new_entry, finish_entry
    # and after. The actions hash and its :text sub-hash are copied so the
    # new parser does not share them with the one it was built from.
    # Any other argument count raises a RuntimeError.
    def initialize(*args)
      case args.size
      when 0
        @separator = DEFAULT_SEPARATOR
        @actions = { :text => {} }
      when 6
        actions, separator, before, the_begin, the_end, after = *args
        @actions = actions.clone
        @actions[:text] = actions[:text].clone
        @separator = separator
        @before = before
        @begin = the_begin
        @end = the_end
        @after = after
      else
        raise "Wrong arg number, either 0 or 6."
      end
    end

    # Defines how to create the _entry_ _object_. The +proc+
    # takes a single argument which is a hash containing
    # parsing options. It must return a new _entry_ _object_.
    # Default:: creates an empty hash.
    def new_entry(&proc)
      @begin = proc
    end

    # Defines how to finalize an _entry_ _object_. The +proc+
    # takes three arguments:
    # * The entry object ready to be finalized
    # * The context object
    # * A hash containing parsing options.
    # Default:: Adds the entry object to the context object using +<<+ method.
    def finish_entry(&proc)
      @end = proc
    end

    # Defines how to set the context before using the parser.
    # The +proc+ takes a single argument which is a hash containing
    # parsing options. It must return a _context_ object.
    # Default:: creates an empty array
    def before(&proc)
      @before = proc
    end

    # Defines how to finalize the whole parsing.
    # The +proc+ takes two arguments:
    # * The context object
    # * A hash containing parsing options.
    # The value returned by the +proc+ is then returned by the parsing method.
    # Default:: just returns the context object.
    def after(&proc)
      @after = proc
    end

    # Defines parsing rules inside a parser definition. The ParsingRules
    # methods can then be called inside the proc.
    #
    # Keyed rules, text-after rules and the separator are merged
    # independently of each other, so a +rules+ section that only declares
    # +with_text_after+ rules or only calls +set_separator+ takes effect
    # too. Rules for keys not mentioned in the section are left untouched.
    def rules(&proc)
      r = ParsingRules.new
      r.instance_eval(&proc)
      r.actions.each do |key, action|
        # :text holds the nested text-after rules, merged separately below.
        next if key == :text
        @actions[key] = action
      end
      @actions[:text].update( r.actions[:text] )
      @separator = r.separator if r.separator
    end

    # Extends an existing parser by allowing to redefine rules. The
    # changes in the new parser simply replace the original definitions.
    # After extension, the new parser is independent of the original one,
    # i.e. a change to the original parser will not affect the derived one.
    def extend(&proc)
      derived = Parser.new( @actions, @separator, @before, @begin, @end, @after )
      derived.instance_eval( &proc )
      derived
    end

    # Defines a new parser, starting from the default workflow blocks.
    def self.define(&proc)
      PROTOTYPE.extend( &proc )
    end

    # Parses a file specified by +filename+. An optional hash
    # of arbitrary arguments (+params+) can be specified. It is
    # passed to the workflow methods blocks (+before+, +new_entry+, ...)
    # It returns the value specified in the +after+ block. By default,
    # it returns an array containing _entry_ objects.
    #
    # An entry is finalized only when its separator line is read: lines
    # after the last separator of the file never reach +finish_entry+.
    def parse_file(filename, params={})
      # Forget the key seen last in any previously parsed file.
      @last_key = nil
      context = @before.call( params )
      File.open( filename, 'r' ) do |file|
        entry = @begin.call( params )
        file.each_line do |line|
          next unless parse_line( line, entry ) == :end
          @end.call( entry, context, params )
          entry = @begin.call( params )
        end
      end
      @after.call( context, params )
    end

    # Parser every definition starts from; it carries the documented
    # default workflow blocks. (Constants are not affected by +private+.)
    PROTOTYPE = Parser.new
    PROTOTYPE.instance_eval do
      before { |_params| [] }
      new_entry { |_params| {} }
      finish_entry { |entry, context, _params| context << entry }
      after { |context, _params| context }
    end

    private

    # Dispatches a single line to the matching rule, passing +holder+ (the
    # current entry object) along. Returns :end when the line is the entry
    # separator, :parsing otherwise.
    #
    # A line made of a key, whitespace and a value goes to the rule
    # registered for that key, if any. Any other line goes to the
    # text-after rule of the key seen last in the current entry, if any.
    def parse_line(line, holder)
      text = line.chomp
      if text == @separator
        # Key-less lines of the next entry must not be attributed to a key
        # that belongs to the entry just finished.
        @last_key = nil
        :end
      elsif text =~ /^(\S+)\s+(.*)$/
        key, value = $1, $2
        @last_key = key
        action = @actions[key]
        action.call( value, holder ) if action
        :parsing
      else
        action = @actions[:text][@last_key]
        action.call( text, holder ) if action
        :parsing
      end
    end

  end

end