riml 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/lexer.rb ADDED
@@ -0,0 +1,276 @@
1
+ require File.expand_path('../constants', __FILE__)
2
+ require File.expand_path('../errors', __FILE__)
3
+
4
+ module Riml
5
+ class Lexer
6
+ include Riml::Constants
7
+
8
# Matches optional leading whitespace, a double quote and then the rest of
# the line (captured), anchored at the start of the chunk.
SINGLE_LINE_COMMENT_REGEX = /\A\s*"(.*)$/

# Matches, at the start of the chunk, one of the multi-character operators
# listed here or any entry of COMPARISON_OPERATORS (not defined in this
# file; presumably supplied by the included Riml::Constants).
OPERATOR_REGEX = /\A#{Regexp.union(%w[|| && === += -=] + COMPARISON_OPERATORS)}/

# Read-only access to the lexer state.
attr_reader :tokens,
            :prev_token,
            :lineno,
            :chunk
12
+
13
# Sets up a lexer over +code+.
#
# code - String holding the source to tokenize. A chomped *copy* is kept,
#        so the caller's string is never modified and frozen strings are
#        accepted.
def initialize(code)
  @code = code.chomp
  @i = 0 # number of characters consumed
  @token_buf = []
  @tokens = []
  @prev_token = nil
  @lineno = 1
  @current_indent = 0
  @indent_pending = false
  @dedent_pending = false
  @one_line_conditional_END_pending = false
  @splat_allowed = false
  # Both flags are written by tokenize_chunk; give them a defined starting
  # value instead of relying on uninitialized instance variables.
  @in_function_declaration = false
  @expecting_identifier = false
end
27
+
28
# Lexes the whole input and returns every token in order.
#
# Pulls tokens until next_token reports exhaustion by returning nil. The
# loop deliberately does not stop as soon as the input is consumed: a
# single chunk can queue several tokens at once, and those must still be
# drained from the buffer. Running next_token to exhaustion also means its
# final indentation check always happens.
#
# Returns the Array of tokens collected in @tokens.
# May raise whatever next_token raises.
def tokenize
  while (new_token = next_token)
    @tokens << new_token
  end
  @tokens
end
35
+
36
# Returns the next token, lexing further chunks on demand.
#
# Chunks are tokenized until the buffer holds a token or the input runs
# out. When nothing is left, the indentation balance is checked and nil is
# returned; otherwise the oldest buffered token is removed, remembered in
# @prev_token and returned.
def next_token
  tokenize_chunk(get_new_chunk) while @token_buf.empty? && more_code_to_tokenize?

  if @token_buf.none?
    check_indentation
    return nil
  end

  @prev_token = @token_buf.shift
end
46
+
47
# Lexes whatever sits at the very start of +chunk+.
#
# Exactly one branch runs per call. Each branch advances @i past the text
# it recognised and may append one or more tokens to @token_buf; comments,
# whitespace and line continuations are consumed without producing a token.
# The order of the branches is significant because several patterns overlap.
#
# chunk - String, the not yet consumed remainder of the source.
#
# Raises SyntaxError for a splat outside parentheses or an unterminated
# heredoc. The return value is not meaningful.
def tokenize_chunk(chunk)
  @chunk = chunk

  # deal with line continuations
  if cont = chunk[/\A(\n*)\s*\\/]
    @i += cont.size
    @lineno += $1.size
    return
  end

  # all lines that start with ':' pass right through unmodified
  if (prev_token.nil? || prev_token[0] == :NEWLINE) && (ex_literal = chunk[/\A\s*:(.*)?$/])
    @i += ex_literal.size
    @token_buf << [:EX_LITERAL, $1]
    return
  end

  if splat_var = chunk[/\Aa:\d+/]
    @i += splat_var.size
    @token_buf << [:SCOPE_MODIFIER, 'a:'] << [:IDENTIFIER, splat_var[2..-1]]
  # the 'n' scope modifier is added by riml
  elsif chunk[/\A([bwtglsavn]:)[\w_]/]
    @i += 2
    @token_buf << [:SCOPE_MODIFIER, $1]
  elsif chunk[/\A([bwtglsavn]:)/]
    @i += 2
    @token_buf << [:SCOPE_MODIFIER_LITERAL, $1]
  elsif special_var_prefix = chunk[/\A(&(\w:)?|\$|@)/]
    @token_buf << [:SPECIAL_VAR_PREFIX, special_var_prefix.strip]
    @expecting_identifier = true
    @i += special_var_prefix.size
  # 'function(' is lexed as a plain identifier; the paren is left for the
  # next call
  elsif chunk[/\A(function)\(/]
    @token_buf << [:IDENTIFIER, $1]
    @i += $1.size
  elsif identifier = chunk[/\A[a-zA-Z_][\w#]*(\?|!)?/]
    tokenize_identifier(chunk, identifier)
  elsif splat = chunk[/\A(\.{3}|\*[a-zA-Z_]\w*)/]
    raise SyntaxError, "unexpected splat, has to be enclosed in parentheses" unless @splat_allowed
    @token_buf << [:SPLAT, splat]
    @splat_allowed = false
    @i += splat.size
  # integer (octal)
  elsif octal = chunk[/\A0[0-7]+/]
    @token_buf << [:NUMBER, octal]
    @i += octal.size
  # integer (hex)
  elsif hex = chunk[/\A0[xX]\h+/]
    @token_buf << [:NUMBER, hex]
    @i += hex.size
  # integer or float (decimal)
  elsif decimal = chunk[/\A[0-9]+(\.[0-9]+)?/]
    @token_buf << [:NUMBER, decimal]
    @i += decimal.size
  elsif interpolation = chunk[/\A"(.*?)(\#\{(.*?)\})(.*?)"/]
    tokenize_interpolation($1, $3, $4)
    @i += interpolation.size
  # a '"' comment is only a whole-line comment at the start of a line
  elsif (comment = chunk[SINGLE_LINE_COMMENT_REGEX]) && (prev_token.nil? || prev_token[0] == :NEWLINE)
    @i += comment.size + 1 # consume next newline character
    @lineno += 1
  elsif inline_comment = chunk[/\A\s*"[^"]*?$/]
    @i += inline_comment.size # inline comment, don't consume newline character
  elsif string_double = chunk[/\A"(.*?)"/, 1]
    @token_buf << [:STRING_D, string_double]
    @i += string_double.size + 2 # + 2 for the surrounding quotes
  elsif string_single = chunk[/\A'(([^']|'')*)'/, 1]
    @token_buf << [:STRING_S, string_single]
    @i += string_single.size + 2 # + 2 for the surrounding quotes
  elsif newlines = chunk[/\A(\n+)/, 1]
    # push only 1 newline
    @token_buf << [:NEWLINE, "\n"] unless prev_token && prev_token[0] == :NEWLINE

    # pending indents/dedents: at most one flag is cleared per run of newlines
    if @one_line_conditional_END_pending
      @one_line_conditional_END_pending = false
    elsif @indent_pending
      @indent_pending = false
    elsif @dedent_pending
      @dedent_pending = false
    end

    @i += newlines.size
    @lineno += newlines.size
  elsif heredoc_opening = chunk[%r{\A<<(.+?)\r?\n}]
    tokenize_heredoc(heredoc_opening, $1)
  # operators of more than 1 char
  elsif operator = chunk[OPERATOR_REGEX]
    @token_buf << [operator, operator]
    @i += operator.size
  # FIXME: this doesn't work well enough
  elsif regexp = chunk[%r{\A/.*?[^\\]/}]
    @token_buf << [:REGEXP, regexp]
    @i += regexp.size
  elsif whitespaces = chunk[/\A\s+/]
    @i += whitespaces.size
  # operators and tokens of single chars, one of: ( ) , . [ ] ! + - = < > /
  else
    value = chunk[0, 1]
    # a '|' is turned into a NEWLINE token
    @token_buf << (value == '|' ? [:NEWLINE, "\n"] : [value, value])
    @splat_allowed = true if value == '('
    @splat_allowed = false if value == ')'
    @i += 1
    # parse_dict_vals! itself only acts on a leading '.', so requiring the
    # '.' for both closers here does not change what gets lexed
    parse_dict_vals! if (value == ']' || value == ')') && chunk[1, 1] == '.'
  end
end

# Lexes an identifier-shaped word: keyword, builtin command or plain name.
#
# chunk      - the chunk being lexed (forwarded to track_indent_level).
# identifier - the word matched at the start of the chunk.
def tokenize_identifier(chunk, identifier)
  # keyword identifiers
  if KEYWORDS.include?(identifier)
    source_size = identifier.size
    identifier = translate_viml_keyword(identifier)
    # Account right away for the characters the translation dropped; the
    # size of the translated keyword is added further down.
    @i += source_size - identifier.size

    @in_function_declaration = true if DEFINE_KEYWORDS.include?(identifier)

    # strip '?' out of token names and replace '!' with '_bang'
    token_name = identifier.sub(/\?\Z/, "").sub(/!\Z/, "_bang").upcase

    track_indent_level(chunk, identifier)
    @token_buf << [token_name.intern, identifier]
  elsif BUILTIN_COMMANDS.include?(identifier)
    @token_buf << [:BUILTIN_COMMAND, identifier]
  # method names and variable names
  else
    @token_buf << [:IDENTIFIER, identifier]
  end

  @i += identifier.size

  parse_dict_vals!

  # The flag stays set only while the define keyword is the sole buffered
  # token; any other identifier clears it.
  if @in_function_declaration
    @in_function_declaration = false unless DEFINE_KEYWORDS.include?(identifier) && @token_buf.size == 1
  end
end

# Maps the keyword spellings that get rewritten to the spelling used in the
# token: 'function...' becomes 'def...', 'finally' becomes 'ensure', and
# every entry of VIML_END_KEYWORDS becomes 'end'. Other keywords are
# returned unchanged.
def translate_viml_keyword(keyword)
  if keyword.match(/\Afunction/)
    keyword.sub(/function/, "def")
  elsif keyword == 'finally'
    'ensure'
  elsif VIML_END_KEYWORDS.include?(keyword)
    'end'
  else
    keyword
  end
end

# Queues the tokens for a double-quoted string with one #{...} part, e.g.
# "#{hey} guys" = hey . " guys"
#
# leading    - text before the interpolation (may be empty).
# expression - text inside the braces, emitted as an IDENTIFIER.
# trailing   - text after the interpolation (may be empty); emitted
#              verbatim so that no character of it is lost.
def tokenize_interpolation(leading, expression, trailing)
  unless leading.empty?
    @token_buf << [:STRING_D, leading]
    @token_buf << ['.', '.']
  end
  @token_buf << [:IDENTIFIER, expression]
  unless trailing.empty?
    @token_buf << ['.', '.']
    @token_buf << [:STRING_D, trailing]
  end
end

# Lexes a heredoc: the opening "<<PATTERN" line, the body (one or more
# lines) and the closing PATTERN at the start of a line.
#
# opening - the matched opening line including its newline.
# pattern - the terminator text that followed '<<'.
#
# Raises SyntaxError when no closing pattern follows the body.
def tokenize_heredoc(opening, pattern)
  @i += opening.size
  @token_buf << [:HEREDOC, pattern]

  # /m lets the body span several lines; the lazy quantifier stops at the
  # first line that starts with the terminator.
  body = get_new_chunk[%r|\A(.+?\r?\n)#{Regexp.escape(pattern)}|m, 1]
  raise SyntaxError, "unterminated heredoc, expected closing #{pattern}" if body.nil?

  # Consume the body and the terminator exactly once each.
  @i += body.size + pattern.size
  @token_buf << [:STRING_D, body]
  # One newline ends the opening line; the rest are inside the body. The
  # newline after the terminator is counted when it is lexed.
  @lineno += 1 + body.count("\n")
end

private :tokenize_identifier, :translate_viml_keyword,
        :tokenize_interpolation, :tokenize_heredoc
211
+
212
+ private
213
# Updates the indentation bookkeeping for a keyword that was just lexed.
#
# chunk      - the chunk that starts with the keyword; handed to the
#              one-line-conditional and statement-modifier checks.
# identifier - the keyword text.
#
# Block openers add 2 to @current_indent and set @indent_pending. 'if' and
# 'unless' only do so when they are neither a one-line conditional (which
# sets @one_line_conditional_END_pending instead) nor a statement modifier.
# 'end' subtracts 2 and sets @dedent_pending unless the end of a one-line
# conditional is pending. Any other identifier changes nothing.
def track_indent_level(chunk, identifier)
  keyword = identifier.to_sym

  if [:def, :def!, :defm, :defm!, :while, :until, :for, :try, :class].include?(keyword)
    @current_indent += 2
    @indent_pending = true
  elsif [:if, :unless].include?(keyword)
    if one_line_conditional?(chunk)
      @one_line_conditional_END_pending = true
    elsif !statement_modifier?(chunk)
      @current_indent += 2
      @indent_pending = true
    end
  elsif keyword == :end
    return if @one_line_conditional_END_pending

    @current_indent -= 2
    @dedent_pending = true
  end
end
232
+
233
# Consumes a ".key" / ".key.other_key" accessor chain at the current
# position, if there is one.
#
# Inside a function declaration the chain is appended to the text of the
# last buffered token; otherwise each key becomes its own :DICT_VAL token.
# Nothing happens when the remaining source does not start with such a chain.
def parse_dict_vals!
  accessors = get_new_chunk[/\A\.([\w.]+)/, 1]
  return if accessors.nil?

  @i += accessors.size + 1 # + 1 for the leading dot

  if @in_function_declaration
    @token_buf.last[1] << ".#{accessors}"
  else
    accessors.split('.').each { |key| @token_buf << [:DICT_VAL, key] }
    nil
  end
end
248
+
249
# Verifies that every opened block was closed.
#
# Raises SyntaxError when @current_indent is positive (END keywords are
# missing) or negative (too many END keywords); each block accounts for an
# indent of 2. Returns nil when the indentation is balanced.
def check_indentation
  if @current_indent > 0
    raise SyntaxError, "Missing #{(@current_indent / 2)} END identifier(s), "
  elsif @current_indent < 0
    raise SyntaxError, "#{(@current_indent / 2).abs} too many END identifiers"
  end
end
253
+
254
# Tells whether +chunk+ begins with a conditional that is opened and closed
# on its first line, i.e. "if ... end" / "unless ... end" with the 'end' at
# the end of that line.
#
# The match is anchored with \A because the chunk holds the whole remaining
# source: a line anchor (^) would also accept a one-line conditional on any
# later line and misclassify the conditional currently being lexed.
#
# Returns the matched text (truthy) or nil.
def one_line_conditional?(chunk)
  chunk[/\A(if|unless).+?(else)?.+?end$/]
end
257
+
258
# Tells whether the 'if' / 'unless' at the current position is a statement
# modifier, i.e. whether the line it is on has non-blank text in front of an
# 'if' / 'unless' keyword that is itself followed by more text.
#
# chunk - unused; kept so the signature stays the same for callers.
#
# Only the current line is inspected. The start of the line is searched
# with a local index, so @i is never touched and the search cannot run past
# the beginning of the source (a negative index would read from the end of
# the string). The keyword must be a whole word, so identifiers that merely
# contain "if" (e.g. "notify") do not count.
#
# Returns true or false.
def statement_modifier?(chunk)
  # backtrack until the beginning of the line
  line_start = @i
  line_start -= 1 while line_start > 0 && @code[line_start - 1] !~ /[\n\r]/

  current_line = @code[line_start..-1][/\A[^\n\r]*/]
  leading = current_line[/\A(.+?)\b(?:if|unless)\b.+\z/, 1]
  return false unless leading

  !leading.strip.empty?
end
267
+
268
# Returns the part of the source that has not been consumed yet, i.e.
# everything from index @i to the end of @code.
def get_new_chunk
  @code.slice(@i..-1)
end
271
+
272
# True while the consumed-character index @i is still inside @code.
def more_code_to_tokenize?
  @code.length > @i
end
275
+ end
276
+ end