riml 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
data/lib/lexer.rb ADDED
@@ -0,0 +1,276 @@
1
+ require File.expand_path('../constants', __FILE__)
2
+ require File.expand_path('../errors', __FILE__)
3
+
4
+ module Riml
5
+ class Lexer
6
+ include Riml::Constants
7
+
8
+ SINGLE_LINE_COMMENT_REGEX = /\A\s*"(.*)$/
9
+ OPERATOR_REGEX = /\A#{Regexp.union(['||', '&&', '===', '+=', '-='] + COMPARISON_OPERATORS)}/
10
+
11
+ attr_reader :tokens, :prev_token, :lineno, :chunk
12
+
13
# Sets up a lexer over +code+.
#
# The trailing line terminator is stripped from a private copy, so the
# caller's string is never modified and frozen strings are accepted.
#
# @param code [String] the source text to tokenize
def initialize(code)
  @code = code.chomp
  @i = 0 # number of characters consumed
  @token_buf = []
  @tokens = []
  @prev_token = nil
  @lineno = 1
  @current_indent = 0
  @indent_pending = false
  @dedent_pending = false
  @one_line_conditional_END_pending = false
  @splat_allowed = false
  # read by tokenize_chunk and parse_dict_vals! before it is ever assigned
  @in_function_declaration = false
end
27
+
28
# Drains the remaining input through next_token, collecting every
# non-nil token into @tokens.
#
# @return [Array] the accumulated token list
def tokenize
  while more_code_to_tokenize?
    token = next_token
    next if token.nil?
    @tokens << token
  end
  @tokens
end
35
+
36
# Produces the next token.
#
# Chunks are lexed until the token buffer has something in it or the
# input runs out. A buffered token is shifted off, remembered as
# @prev_token and returned. When nothing is buffered, the indentation
# balance is checked and nil is returned.
#
# @return [Array, nil] the next token, or nil when none is available
def next_token
  tokenize_chunk(get_new_chunk) while @token_buf.empty? && more_code_to_tokenize?

  return @prev_token = @token_buf.shift if @token_buf.any?

  check_indentation
  nil
end
46
+
47
# Lexes whatever construct sits at the very start of +chunk+.
#
# Exactly one branch fires per call. The branch advances @i past the text
# it consumed, may push tokens onto @token_buf, and updates @lineno when
# it consumes newlines. Branch order matters: earlier patterns shadow
# later ones.
#
# @param chunk [String] the unconsumed remainder of the source
# @raise [SyntaxError] on a splat outside parentheses or an unterminated
#   heredoc
def tokenize_chunk(chunk)
  @chunk = chunk

  # deal with line continuations
  if cont = chunk[/\A(\n*)\s*\\/]
    @i += cont.size
    @lineno += $1.size
    return
  end

  # all lines that start with ':' pass right through unmodified
  if (prev_token.nil? || prev_token[0] == :NEWLINE) && (ex_literal = chunk[/\A\s*:(.*)?$/])
    @i += ex_literal.size
    @token_buf << [:EX_LITERAL, $1]
    return
  end

  if splat_var = chunk[/\Aa:\d+/]
    @i += splat_var.size
    @token_buf << [:SCOPE_MODIFIER, 'a:'] << [:IDENTIFIER, splat_var[2..-1]]
  # the 'n' scope modifier is added by riml
  elsif chunk[/\A([bwtglsavn]:)[\w_]/]
    # only the two-character modifier is consumed; the name follows later
    @i += 2
    @token_buf << [:SCOPE_MODIFIER, $1]
  elsif chunk[/\A([bwtglsavn]:)/]
    @i += 2
    @token_buf << [:SCOPE_MODIFIER_LITERAL, $1]
  elsif special_var_prefix = chunk[/\A(&(\w:)?|\$|@)/]
    @token_buf << [:SPECIAL_VAR_PREFIX, special_var_prefix.strip]
    @expecting_identifier = true
    @i += special_var_prefix.size
  elsif chunk[/\A(function)\(/]
    # `function(` is a call to function(), not a definition keyword
    @token_buf << [:IDENTIFIER, $1]
    @i += $1.size
  elsif identifier = chunk[/\A[a-zA-Z_][\w#]*(\?|!)?/]
    # keyword identifiers
    if KEYWORDS.include?(identifier)
      # Keywords are rewritten to a canonical spelling. @i is pre-adjusted
      # by the length difference because it is later advanced by the size
      # of the rewritten identifier.
      if identifier.match(/\Afunction/)
        old_identifier = identifier.dup
        identifier.sub!(/function/, "def")
        @i += (old_identifier.size - identifier.size)
      elsif identifier == 'finally'
        identifier = 'ensure'
        @i += 1 # diff b/t the two string lengths
      elsif VIML_END_KEYWORDS.include? identifier
        old_identifier = identifier.dup
        identifier = 'end'
        @i += old_identifier.size - identifier.size
      end

      @in_function_declaration = true if DEFINE_KEYWORDS.include?(identifier)

      # strip '?' out of token names and replace '!' with '_bang'
      token_name = identifier.sub(/\?\Z/, "").sub(/!\Z/, "_bang").upcase

      track_indent_level(chunk, identifier)
      @token_buf << [token_name.intern, identifier]
    elsif BUILTIN_COMMANDS.include? identifier
      @token_buf << [:BUILTIN_COMMAND, identifier]
    # method names and variable names
    else
      @token_buf << [:IDENTIFIER, identifier]
    end

    @i += identifier.size

    parse_dict_vals!

    # The flag stays set only while the define keyword is the sole buffered
    # token; any other identifier clears it.
    if @in_function_declaration
      @in_function_declaration = false unless DEFINE_KEYWORDS.include?(identifier) && @token_buf.size == 1
    end
  elsif splat = chunk[/\A(\.{3}|\*[a-zA-Z_]\w*)/]
    raise SyntaxError, "unexpected splat, has to be enclosed in parentheses" unless @splat_allowed
    @token_buf << [:SPLAT, splat]
    @splat_allowed = false
    @i += splat.size
  # integer (octal)
  elsif octal = chunk[/\A0[0-7]+/]
    @token_buf << [:NUMBER, octal]
    @i += octal.size
  # integer (hex)
  elsif hex = chunk[/\A0[xX]\h+/]
    @token_buf << [:NUMBER, hex]
    @i += hex.size
  # integer or float (decimal)
  elsif decimal = chunk[/\A[0-9]+(\.[0-9]+)?/]
    @token_buf << [:NUMBER, decimal]
    @i += decimal.size
  elsif interpolation = chunk[/\A"(.*?)(\#\{(.*?)\})(.*?)"/]
    # "#{hey} guys" = "hey" . " guys"
    head, expr, tail = $1, $3, $4
    unless head.empty?
      @token_buf << [:STRING_D, head]
      @token_buf << ['.', '.']
    end
    @token_buf << [:IDENTIFIER, expr]
    unless tail.empty?
      @token_buf << ['.', '.']
      # emit the trailing text verbatim; previously its first character was
      # replaced by a space, corrupting strings such as "#{name}!"
      @token_buf << [:STRING_D, tail]
    end
    @i += interpolation.size
  elsif (comment = chunk[SINGLE_LINE_COMMENT_REGEX]) && (prev_token.nil? || prev_token[0] == :NEWLINE)
    @i += comment.size + 1 # consume next newline character
    @lineno += 1
  elsif inline_comment = chunk[/\A\s*"[^"]*?$/]
    @i += inline_comment.size # inline comment, don't consume newline character
  elsif string_double = chunk[/\A"(.*?)"/, 1]
    @token_buf << [:STRING_D, string_double]
    @i += string_double.size + 2 # +2 for the surrounding quotes
  elsif string_single = chunk[/\A'(([^']|'')*)'/, 1]
    @token_buf << [:STRING_S, string_single]
    @i += string_single.size + 2 # +2 for the surrounding quotes
  elsif newlines = chunk[/\A(\n+)/, 1]
    # push only 1 newline
    @token_buf << [:NEWLINE, "\n"] unless prev_token && prev_token[0] == :NEWLINE

    # pending indents/dedents
    if @one_line_conditional_END_pending
      @one_line_conditional_END_pending = false
    elsif @indent_pending
      @indent_pending = false
    elsif @dedent_pending
      @dedent_pending = false
    end

    @i += newlines.size
    @lineno += newlines.size
  elsif heredoc_pattern = chunk[%r{\A<<(.+?)\r?\n}]
    pattern = $1
    @i += heredoc_pattern.size
    @token_buf << [:HEREDOC, pattern]
    # The body runs from right after the header line up to the first line
    # that begins with the terminator. It is anchored and multi-line so it
    # always starts where the header ended and may span several lines.
    body = get_new_chunk[/\A(.+?\r?\n)#{Regexp.escape(pattern)}/m, 1]
    raise SyntaxError, "unterminated heredoc, expected closing #{pattern}" if body.nil?
    # consume the body plus the terminator exactly once
    @i += body.size + pattern.size
    @token_buf << [:STRING_D, body]
    # one newline for the header line plus those inside the body; the newline
    # after the terminator is left for the newline branch to count
    @lineno += 1 + body.count("\n")
  # operators of more than 1 char
  elsif operator = chunk[OPERATOR_REGEX]
    @token_buf << [operator, operator]
    @i += operator.size
  # FIXME: this doesn't work well enough
  elsif regexp = chunk[%r{\A/.*?[^\\]/}]
    @token_buf << [:REGEXP, regexp]
    @i += regexp.size
  elsif whitespaces = chunk[/\A\s+/]
    @i += whitespaces.size
  # operators and tokens of single chars, one of: ( ) , . [ ] ! + - = < > /
  else
    value = chunk[0, 1]
    if value == '|'
      # '|' is emitted as a NEWLINE token
      @token_buf << [:NEWLINE, "\n"]
    else
      @token_buf << [value, value]
    end
    @splat_allowed = true if value == '('
    @splat_allowed = false if value == ')'
    @i += 1
    # dict access may directly follow an index or a call: a[0].key, f().key
    if value == ']' || (value == ')' && chunk[1, 1] == '.')
      parse_dict_vals!
    end
  end
end
211
+
212
+ private
213
# Adjusts the indentation bookkeeping for a keyword that was just lexed.
#
# Block-opening keywords add two to @current_indent and flag a pending
# indent. `if`/`unless` do the same unless they form a one-line
# conditional (which instead flags a pending END) or act as a statement
# modifier. `end` subtracts two and flags a pending dedent, unless it
# closes a one-line conditional.
#
# @param chunk [String] the source remainder starting at the keyword
# @param identifier [String] the keyword text
def track_indent_level(chunk, identifier)
  name = identifier.to_s

  if %w[def def! defm defm! while until for try class].include?(name)
    @current_indent += 2
    @indent_pending = true
  elsif name == 'if' || name == 'unless'
    if one_line_conditional?(chunk)
      @one_line_conditional_END_pending = true
    elsif !statement_modifier?(chunk)
      @current_indent += 2
      @indent_pending = true
    end
  elsif name == 'end' && !@one_line_conditional_END_pending
    @current_indent -= 2
    @dedent_pending = true
  end
end
232
+
233
# Consumes a `.key` / `.key.other_key` accessor chain at the current
# position, if there is one.
#
# Inside a function declaration the chain is appended to the text of the
# last buffered token. Otherwise each key becomes its own :DICT_VAL token.
def parse_dict_vals!
  return unless get_new_chunk[/\A\.([\w.]+)/]

  accessors = $1
  @i += accessors.size + 1 # +1 for the leading dot

  if @in_function_declaration
    @token_buf.last[1] << ".#{accessors}"
  else
    accessors.split('.').each { |key| @token_buf << [:DICT_VAL, key] }
    nil
  end
end
248
+
249
# Verifies that every opened block has been closed.
#
# @raise [SyntaxError] when @current_indent is not zero, reporting how
#   many END keywords are missing or surplus (one per two indent units)
def check_indentation
  levels = @current_indent / 2

  if @current_indent > 0
    raise SyntaxError, "Missing #{levels} END identifier(s), "
  elsif @current_indent < 0
    raise SyntaxError, "#{levels.abs} too many END identifiers"
  end
end
253
+
254
# Tells whether the `if`/`unless` at the very start of +chunk+ is closed
# by an `end` on the same line.
#
# The pattern is anchored with \A so that only the conditional at the
# start of the chunk is examined. `.` does not cross newlines and `$`
# stops at the first line end, which confines the match to that line.
# A one-liner on a later line therefore cannot cause a multi-line
# conditional to be misclassified.
#
# @param chunk [String] the source remainder starting at the keyword
# @return [String, nil] the matched one-line conditional, or nil
def one_line_conditional?(chunk)
  chunk[/\A(if|unless).+?(else)?.+?end$/]
end
257
+
258
# Tells whether the `if`/`unless` keyword located at @i is used as a
# statement modifier, i.e. has non-blank text before it on its own line.
#
# Only the text between the start of the current line and @i is looked
# at. The backwards scan stops at index 0, so it can no longer wrap around
# to the end of the source through a negative index. Later lines are
# never consulted. @i is left untouched.
#
# @param chunk [String] unused; kept for interface compatibility
# @return [Boolean] true when the keyword follows other text on its line
def statement_modifier?(chunk)
  line_start = @i
  # walk back to just after the previous line terminator (or to index 0)
  line_start -= 1 while line_start > 0 && @code[line_start - 1] !~ /[\n\r]/
  !@code[line_start...@i].strip.empty?
end
267
+
268
# Returns the not-yet-consumed remainder of the source, i.e. everything
# from offset @i to the end of @code.
def get_new_chunk
  @code.slice(@i..-1)
end
271
+
272
# True while the read offset @i has not yet reached the end of @code.
def more_code_to_tokenize?
  @code.size > @i
end
275
+ end
276
+ end