rltk 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/rltk/lexer.rb ADDED
@@ -0,0 +1,298 @@
1
+ # Author: Chris Wailes <chris.wailes@gmail.com>
2
+ # Project: Ruby Language Toolkit
3
+ # Date: 2011/01/17
4
+ # Description: This file contains the base class for lexers that use RLTK.
5
+
6
+ ############
7
+ # Requires #
8
+ ############
9
+
10
+ # Standard Library
11
+ require 'strscan'
12
+
13
+ # Ruby Language Toolkit
14
+ require 'rltk/token'
15
+
16
+ #######################
17
+ # Classes and Modules #
18
+ #######################
19
+
20
+ module RLTK # :nodoc:
21
+
22
# A LexingError exception is raised when an input stream contains a
# substring that isn't matched by any of a lexer's rules.
#
# Subclasses StandardError (not Exception) so that it is caught by a
# bare `rescue` and does not swallow signals/system exits.
class LexingError < StandardError
	# @return [Integer] Offset from the start of the input stream.
	attr_reader :stream_offset

	# @return [Integer] Line on which the error occurred (1-based).
	attr_reader :line_number

	# @return [Integer] Offset from the start of the current line.
	attr_reader :line_offset

	# @return [String] The unconsumed remainder of the input.
	attr_reader :remainder

	# Record where lexing failed and what input was left over.
	def initialize(stream_offset, line_number, line_offset, remainder)
		@stream_offset = stream_offset
		@line_number   = line_number
		@line_offset   = line_offset
		@remainder     = remainder
	end

	# Append the unmatched remainder to the default message.
	def to_s()
		"#{super()}: #{@remainder}"
	end
end
36
+
37
+ # The Lexer class may be sub-classed to produce new lexers. These lexers
38
+ # have a lot of features, and are described in the main documentation.
39
+ class Lexer
40
+
41
# Called when the Lexer class is sub-classed, this method adds a
# LexerCore to the new class, and installs some needed class and
# instance methods.
def Lexer.inherited(klass)
	klass.class_exec do
		# Each subclass gets its own LexerCore so multiple lexers can
		# coexist in one program.
		@core = LexerCore.new

		# Returns this class's LexerCore object.
		def self.core
			@core
		end

		# Lexes the given string using a newly instantiated
		# environment.
		def self.lex(str)
			@core.lex(str, self::Environment.new(@core.start_state))
		end

		# Lexes the contents of the given file using a newly
		# instantiated environment.
		def self.lex_file(file_name)
			@core.lex_file(file_name, self::Environment.new(@core.start_state))
		end

		# Routes method calls to the new subclass to the LexerCore
		# object (this is how the `rule`/`r`/`start` DSL works).
		def self.method_missing(method, *args, &proc)
			@core.send(method, *args, &proc)
		end

		# Keep respond_to? consistent with the method_missing
		# delegation above (required companion to method_missing).
		def self.respond_to_missing?(method, include_private = false)
			@core.respond_to?(method, include_private) || super
		end

		# Instantiates a new lexer and creates an environment to be
		# used for subsequent calls.
		def initialize
			@env = self.class::Environment.new(self.class.core.start_state)
		end

		# Returns the environment used by an instantiated lexer.
		def env
			@env
		end

		# Lexes a string using the encapsulated environment.
		def lex(string)
			self.class.core.lex(string, @env)
		end

		# Lexes a file using the encapsulated environment.
		def lex_file(file_name)
			self.class.core.lex_file(file_name, @env)
		end
	end
end
93
+
94
+ #################
95
+ # Inner Classes #
96
+ #################
97
+
98
# The LexerCore class provides most of the functionality of the Lexer
# class. A LexerCore is instantiated for each subclass of Lexer,
# thereby allowing multiple lexers to be defined inside a single Ruby
# program.
class LexerCore
	# @return [Symbol] The state in which lexing begins.
	attr_reader :start_state

	# Instantiate a new LexerCore object.
	def initialize
		@match_type  = :longest
		# Rules grouped by state; the default block gives each state its
		# own (unshared) rule array on first access.
		@rules       = Hash.new { |h, k| h[k] = Array.new }
		@start_state = :default
	end

	# Lex _string_, using _env_ as the environment. This method will
	# return the array of tokens generated by the lexer with a token
	# of type EOS (End of Stream) appended to the end.
	#
	# Raises a LexingError when no rule matches at the current scan
	# position.
	def lex(string, env, file_name = nil)
		# Offset from start of stream.
		stream_offset = 0

		# Offset from the start of the line.
		line_offset = 0
		line_number = 1

		# Empty token list.
		tokens = Array.new

		# The scanner.
		scanner = StringScanner.new(string)

		# Start scanning the input string.
		until scanner.eos?
			match = nil

			# If the match_type is set to :longest all of the
			# rules for the current state need to be scanned
			# and the longest match returned. If the
			# match_type is :first, we only need to scan until
			# we find a match.
			@rules[env.state].each do |rule|
				# A rule only applies when every flag it requires is set.
				if (rule.flags - env.flags).empty?
					if txt = scanner.check(rule.pattern)
						if not match or match.first.length < txt.length
							match = [txt, rule]

							break if @match_type == :first
						end
					end
				end
			end

			if match
				rule = match.last

				txt = scanner.scan(rule.pattern)
				type, value = env.instance_exec(txt, &rule.action)

				# An action that returns nil consumes input without
				# producing a token (e.g. whitespace rules).
				if type
					pos = StreamPosition.new(stream_offset, line_number, line_offset, txt.length, file_name)
					tokens << Token.new(type, value, pos)
				end

				# Advance our position counters.
				stream_offset += txt.length

				if (newlines = txt.count("\n")) > 0
					line_number += newlines
					# NOTE(review): text after the final newline of a
					# multi-line match is not added to line_offset, so
					# column info may drift in that case — confirm intended.
					line_offset = 0
				else
					line_offset += txt.length()
				end
			else
				# BUG FIX: was `scanner.post_match`, which is nil until a
				# previous match has occurred; `scanner.rest` always holds
				# the unconsumed input.
				error = LexingError.new(stream_offset, line_number, line_offset, scanner.rest)
				raise(error, 'Unable to match string with any of the given rules')
			end
		end

		return tokens << Token.new(:EOS)
	end

	# A wrapper function that calls LexerCore.lex on the
	# contents of a file.
	def lex_file(file_name, env)
		# BUG FIX: the parameter was misspelled `evn`, so the `env`
		# reference below raised a NameError for every call.
		File.open(file_name, 'r') { |f| lex(f.read, env, file_name) }
	end

	# Used to tell a lexer to use the first match found instead
	# of the longest match found.
	def match_first
		@match_type = :first
	end

	# This method is used to define a new lexing rule. The
	# first argument is the regular expression used to match
	# substrings of the input. The second argument is the state
	# to which the rule belongs. Flags that need to be set for
	# the rule to be considered are specified by the third
	# argument. The last argument is a block that returns a
	# type and value to be used in constructing a Token. If no
	# block is specified the matched substring will be
	# discarded and lexing will continue.
	def rule(pattern, state = :default, flags = [], &action)
		# If no action is given we will set it to an empty
		# action.
		action ||= Proc.new() {}

		r = Rule.new(pattern, action, state, flags)

		if state == :ALL
			# NOTE(review): :ALL only reaches states that exist at this
			# point; states first mentioned by later rules won't receive
			# this rule — confirm this is the intended semantics.
			@rules.each_key { |k| @rules[k] << r }
		else
			@rules[state] << r
		end
	end

	alias :r :rule

	# Changes the starting state of the lexer.
	def start(state)
		@start_state = state
	end
end
217
+
218
# All actions passed to LexerCore.rule are evaluated inside an
# instance of the Environment class or its subclass (which must have
# the same name). This class provides functions for manipulating
# lexer state and flags.
class Environment

	# The flags currently set in this environment.
	attr_reader :flags

	# Instantiates a new Environment object whose state stack holds
	# only +start_state+ and whose flag set is empty.
	def initialize(start_state)
		@state = [start_state]
		@flags = Array.new
	end

	# Pops a state from the state stack. Always returns nil.
	def pop_state
		@state.pop
		nil
	end

	# Pushes a new state onto the state stack. Always returns nil.
	def push_state(state)
		@state.push(state)
		nil
	end

	# Sets the value on the top of the state stack. Always returns nil.
	def set_state(state)
		@state[-1] = state
		nil
	end

	# Returns the current state (the top of the state stack).
	def state
		@state.last
	end

	# Sets a flag in the current environment; a flag that is already
	# set is not duplicated. Always returns nil.
	def set_flag(flag)
		@flags.push(flag) unless @flags.include?(flag)
		nil
	end

	# Unsets a flag in the current environment. Always returns nil.
	def unset_flag(flag)
		@flags.delete(flag)
		nil
	end

	# Unsets all flags in the current environment. Always returns nil.
	def clear_flags
		@flags = Array.new
		nil
	end
end
282
+
283
# The Rule class is used simply for data encapsulation.
class Rule
	# @return [Proc] Action evaluated when the rule matches.
	attr_reader :action

	# @return [Regexp] Pattern this rule matches (typically a Regexp).
	attr_reader :pattern

	# @return [Array<Symbol>] Flags that must be set for this rule to apply.
	attr_reader :flags

	# @return [Symbol] Lexer state this rule was defined for.
	# (Consistency fix: @state was stored but had no reader, unlike the
	# other three attributes.)
	attr_reader :state

	# Instantiates a new Rule object.
	def initialize(pattern, action, state, flags)
		@pattern = pattern
		@action  = action
		@state   = state
		@flags   = flags
	end
end
297
+ end
298
+ end
@@ -0,0 +1,41 @@
1
+ # Author: Chris Wailes <chris.wailes@gmail.com>
2
+ # Project: Ruby Language Toolkit
3
+ # Date: 2011/03/04
4
+ # Description: This file contains a lexer for a simple calculator.
5
+
6
+ ############
7
+ # Requires #
8
+ ############
9
+
10
+ # Ruby Language Toolkit
11
+ require 'rltk/lexer'
12
+
13
+ #######################
14
+ # Classes and Modules #
15
+ #######################
16
+
17
+ module RLTK # :nodoc:
18
+ module Lexers # :nodoc:
19
+
20
# The Calculator lexer is a simple lexer for use with several of the
# provided parsers.
class Calculator < Lexer

	#################
	# Default State #
	#################

	# Binary arithmetic operators.
	rule(/\+/) { :PLS }
	rule(/-/)  { :SUB }
	rule(/\*/) { :MUL }
	rule(/\//) { :DIV }

	# Parentheses for grouping sub-expressions.
	rule(/\(/) { :LPAREN }
	rule(/\)/) { :RPAREN }

	# Integer literals; the token value is the matched text as an Integer.
	rule(/[0-9]+/) { |num| [:NUM, num.to_i] }

	# Whitespace is matched but discarded (no action block given).
	rule(/\s/)
end
40
+ end
41
+ end
@@ -0,0 +1,40 @@
1
+ # Author: Chris Wailes <chris.wailes@gmail.com>
2
+ # Project: Ruby Language Toolkit
3
+ # Date: 2011/01/20
4
+ # Description: This file contains a lexer for Extended Backus–Naur Form.
5
+
6
+ ############
7
+ # Requires #
8
+ ############
9
+
10
+ # Ruby Language Toolkit
11
+ require 'rltk/lexer'
12
+
13
+ #######################
14
+ # Classes and Modules #
15
+ #######################
16
+
17
+ module RLTK # :nodoc:
18
+
19
+ # The RLTK::Lexers module contains the lexers that are included as part of
20
+ # the RLTK project.
21
+ module Lexers
22
+
23
# The EBNF lexer is used by the RLTK::CFG class.
class EBNF < Lexer

	#################
	# Default State #
	#################

	# Repetition/optionality operators, tokenized as themselves.
	rule(/\*/) { :* }
	rule(/\+/) { :+ }
	rule(/\?/) { :'?' }

	# Lower-case identifiers are non-terminals; upper-case are terminals.
	# Either way the token value is the matched text as a Symbol.
	rule(/[a-z0-9_]+/) { |text| [:NONTERM, text.to_sym] }
	rule(/[A-Z0-9_]+/) { |text| [:TERM, text.to_sym] }

	# Whitespace is matched but discarded (no action block given).
	rule(/\s/)
end
39
+ end
40
+ end