miniruby 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,325 @@
1
+ # typed: strong
2
+ # frozen_string_literal: true
3
+
4
+ module MiniRuby
5
+ # Contains the definitions of all AST (Abstract Syntax Tree) nodes.
6
+ # AST is the data structure that is returned by the parser.
7
+ module AST
8
# The text emitted once per indentation level
# when nodes are rendered (see the `indent` parameters below).
INDENT_UNIT = ' '
11
+
12
# Common abstract ancestor of all AST nodes.
#
# Concrete nodes have to implement `to_s` and `inspect`.
class Node
  extend T::Sig
  extend T::Helpers

  abstract!

  # The span attached to this node (readable and writable).
  sig { returns(Span) }
  attr_accessor :span

  # Stores the given span, `Span::ZERO` when none is passed.
  sig { params(span: Span).void }
  def initialize(span: Span::ZERO)
    @span = span
  end

  # Default equality: `other` only has to be an instance of the
  # receiver's class. The span is not taken into account.
  sig { params(other: Object).returns(T::Boolean) }
  def ==(other)
    other.is_a?(self.class)
  end

  # Renders the node as Ruby-like source code.
  # `indent` is the number of indentation levels to prepend.
  sig { abstract.params(indent: Integer).returns(String) }
  def to_s(indent = 0); end

  # Renders the node in the S-expression format.
  # `indent` is the number of indentation levels to prepend.
  sig { abstract.params(indent: Integer).returns(String) }
  def inspect(indent = 0); end
end
40
+
41
# Represents a program: an ordered list of statements.
class ProgramNode < Node
  sig { returns(T::Array[StatementNode]) }
  attr_reader :statements

  sig { params(statements: T::Array[StatementNode], span: Span).void }
  def initialize(statements:, span: Span::ZERO)
    # let the base class store the span instead of duplicating the assignment
    super(span: span)
    @statements = statements
  end

  # Two programs are equal when their statement lists are equal.
  # Spans are ignored.
  sig { params(other: Object).returns(T::Boolean) }
  def ==(other)
    return false unless other.is_a?(ProgramNode)

    @statements == other.statements
  end

  # Concatenates the Ruby-like representation of every statement,
  # each rendered with the given `indent`.
  sig { override.params(indent: Integer).returns(String) }
  def to_s(indent = 0)
    # `+''` instead of `String.new`: the latter creates an ASCII-8BIT
    # buffer, which made the rendered text binary-encoded.
    buffer = +''
    @statements.each { |stmt| buffer << stmt.to_s(indent) }
    buffer
  end

  # Renders `(program ...)` with every statement on its own line,
  # one indentation level deeper than the program itself.
  sig { override.params(indent: Integer).returns(String) }
  def inspect(indent = 0)
    buff = +"#{INDENT_UNIT * indent}(program"
    @statements.each do |stmt|
      buff << "\n" << stmt.inspect(indent + 1)
    end
    buff << ')'
  end
end
82
+
83
# Abstract parent of all statement nodes.
# A statement corresponds to a single line of code.
class StatementNode < Node
  abstract!
end
87
+
88
# Represents a statement with an expression like `2 + 3 - 5;`
class ExpressionStatementNode < StatementNode
  sig { returns(ExpressionNode) }
  attr_reader :expression

  sig { params(expression: ExpressionNode, span: Span).void }
  def initialize(expression:, span: Span::ZERO)
    # let the base class store the span instead of duplicating the assignment
    super(span: span)
    @expression = expression
  end

  # Equal when `other` is an expression statement wrapping an equal expression.
  # Spans are ignored.
  sig { params(other: Object).returns(T::Boolean) }
  def ==(other)
    return false unless other.is_a?(ExpressionStatementNode)

    @expression == other.expression
  end

  # Renders the indented expression followed by a newline.
  sig { override.params(indent: Integer).returns(String) }
  def to_s(indent = 0)
    "#{INDENT_UNIT * indent}#{@expression}\n"
  end

  # Renders `(expr_stmt ...)` with the expression on the next line,
  # one indentation level deeper.
  sig { override.params(indent: Integer).returns(String) }
  def inspect(indent = 0)
    # Built by interpolation rather than appending to `String.new`,
    # whose ASCII-8BIT encoding leaked into the result.
    "#{INDENT_UNIT * indent}(expr_stmt\n#{@expression.inspect(indent + 1)})"
  end
end
120
+
121
# Abstract parent of all expression nodes.
# An expression like `2 + 3` may be nested inside
# a larger expression or statement such as `2 + 3 - 5`.
class ExpressionNode < Node
  abstract!
end
126
+
127
# Placeholder expression wrapping a token that could not be
# turned into a valid node.
class InvalidNode < ExpressionNode
  # The token this node was created from.
  sig { returns(Token) }
  attr_reader :token

  sig { params(token: Token, span: Span).void }
  def initialize(token:, span: Span::ZERO)
    super(span: span)
    @token = token
  end

  # Equal when `other` is an invalid node holding an equal token.
  sig { params(other: Object).returns(T::Boolean) }
  def ==(other)
    other.is_a?(InvalidNode) && @token == other.token
  end

  # Ruby-like form: `<invalid: `TOKEN`>`
  sig { override.params(indent: Integer).returns(String) }
  def to_s(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}<invalid: `#{@token}`>"
  end

  # S-expression form: `(invalid TOKEN)`, using the token's `inspect`
  sig { override.params(indent: Integer).returns(String) }
  def inspect(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}(invalid #{@token.inspect})"
  end
end
155
+
156
# The `false` literal.
class FalseLiteralNode < ExpressionNode
  # Renders the indented keyword `false`.
  sig { override.params(indent: Integer).returns(String) }
  def to_s(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}false"
  end

  # The S-expression form is the same text as the Ruby-like form.
  sig { override.params(indent: Integer).returns(String) }
  def inspect(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}false"
  end
end
168
+
169
# The `true` literal.
class TrueLiteralNode < ExpressionNode
  # Renders the indented keyword `true`.
  sig { override.params(indent: Integer).returns(String) }
  def to_s(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}true"
  end

  # The S-expression form is the same text as the Ruby-like form.
  sig { override.params(indent: Integer).returns(String) }
  def inspect(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}true"
  end
end
181
+
182
# The `nil` literal.
class NilLiteralNode < ExpressionNode
  # Renders the indented keyword `nil`.
  sig { override.params(indent: Integer).returns(String) }
  def to_s(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}nil"
  end

  # The S-expression form is the same text as the Ruby-like form.
  sig { override.params(indent: Integer).returns(String) }
  def inspect(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}nil"
  end
end
194
+
195
# The `self` literal.
class SelfLiteralNode < ExpressionNode
  # Renders the indented keyword `self`.
  sig { override.params(indent: Integer).returns(String) }
  def to_s(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}self"
  end

  # The S-expression form is the same text as the Ruby-like form.
  sig { override.params(indent: Integer).returns(String) }
  def inspect(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}self"
  end
end
207
+
208
# A float literal, e.g. `123.5`.
class FloatLiteralNode < ExpressionNode
  # The literal kept as a string.
  sig { returns(String) }
  attr_reader :value

  sig { params(value: String, span: Span).void }
  def initialize(value:, span: Span::ZERO)
    super(span: span)
    @value = value
  end

  # Equal when `other` is a float literal with the same string value.
  sig { params(other: Object).returns(T::Boolean) }
  def ==(other)
    other.is_a?(FloatLiteralNode) && @value == other.value
  end

  # Renders the indented literal.
  sig { override.params(indent: Integer).returns(String) }
  def to_s(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}#{@value}"
  end

  # The S-expression form is the same text as the Ruby-like form.
  sig { override.params(indent: Integer).returns(String) }
  def inspect(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}#{@value}"
  end
end
236
+
237
# An integer literal, e.g. `123`.
class IntegerLiteralNode < ExpressionNode
  # The literal kept as a string.
  sig { returns(String) }
  attr_reader :value

  sig { params(value: String, span: Span).void }
  def initialize(value:, span: Span::ZERO)
    super(span: span)
    @value = value
  end

  # Equal when `other` is an integer literal with the same string value.
  sig { params(other: Object).returns(T::Boolean) }
  def ==(other)
    other.is_a?(IntegerLiteralNode) && @value == other.value
  end

  # Renders the indented literal.
  sig { override.params(indent: Integer).returns(String) }
  def to_s(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}#{@value}"
  end

  # The S-expression form is the same text as the Ruby-like form.
  sig { override.params(indent: Integer).returns(String) }
  def inspect(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}#{@value}"
  end
end
265
+
266
# A string literal, e.g. `"foo"`.
class StringLiteralNode < ExpressionNode
  # The content of the literal.
  sig { returns(String) }
  attr_reader :value

  sig { params(value: String, span: Span).void }
  def initialize(value:, span: Span::ZERO)
    super(span: span)
    @value = value
  end

  # Equal when `other` is a string literal with the same content.
  sig { params(other: Object).returns(T::Boolean) }
  def ==(other)
    other.is_a?(StringLiteralNode) && @value == other.value
  end

  # Renders the indented content in its `String#inspect` form.
  sig { override.params(indent: Integer).returns(String) }
  def to_s(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}#{@value.inspect}"
  end

  # The S-expression form is the same text as the Ruby-like form.
  sig { override.params(indent: Integer).returns(String) }
  def inspect(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}#{@value.inspect}"
  end
end
294
+
295
# An identifier such as `a` or `foo`.
class IdentifierNode < ExpressionNode
  # The name of the identifier.
  sig { returns(String) }
  attr_reader :value

  sig { params(value: String, span: Span).void }
  def initialize(value:, span: Span::ZERO)
    super(span: span)
    @value = value
  end

  # Equal when `other` is an identifier with the same name.
  sig { params(other: Object).returns(T::Boolean) }
  def ==(other)
    other.is_a?(IdentifierNode) && @value == other.value
  end

  # Renders the indented name.
  sig { override.params(indent: Integer).returns(String) }
  def to_s(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}#{@value}"
  end

  # The S-expression form is the same text as the Ruby-like form.
  sig { override.params(indent: Integer).returns(String) }
  def inspect(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}#{@value}"
  end
end
323
+
324
+ end
325
+ end
@@ -0,0 +1,380 @@
1
+ # typed: strict
2
+ # frozen_string_literal: true
3
+
4
+ require_relative 'token'
5
+
6
+ module MiniRuby
7
+ # A lexical analyzer (tokenizer) for MiniRuby
8
+ class Lexer
9
+ extend T::Sig
10
+ extend T::Generic
11
+ include Enumerable
12
+
13
+ # Type parameter for `Enumerable`
14
+ # Declares the type that the lexer returns for tokens
15
+ Elem = type_member { { fixed: Token } }
16
+
17
+ class << self
18
+ extend T::Sig
19
+
20
+ sig { params(source: String).returns(T::Array[Token]) }
21
+ def lex(source)
22
+ new(source).to_a
23
+ end
24
+ end
25
+
26
+ sig { params(source: String).void }
27
+ def initialize(source)
28
+ @source = source
29
+
30
+ # offset of the first character of the current lexeme
31
+ @start_cursor = T.let(0, Integer)
32
+ # offset of the next character
33
+ @cursor = T.let(0, Integer)
34
+ end
35
+
36
+ sig { returns(Token) }
37
+ def next
38
+ return Token.new(Token::END_OF_FILE, Span.new(Position.new(0), Position.new(0))) unless more_tokens?
39
+
40
+ scan_token
41
+ end
42
+
43
+ sig { override.params(block: T.nilable(T.proc.params(arg0: Token).void)).returns(T.untyped) }
44
+ def each(&block)
45
+ return enum_for(T.must(__method__)) unless block
46
+
47
+ loop do
48
+ tok = self.next
49
+ break if tok.type == Token::END_OF_FILE
50
+
51
+ block.call(tok)
52
+ end
53
+
54
+ self
55
+ end
56
+
57
+ private
58
+
59
+ sig { returns(T::Boolean) }
60
+ def more_tokens?
61
+ @cursor < @source.length
62
+ end
63
+
64
+ sig { params(type: Symbol).returns(Token) }
65
+ def token_with_consumed_value(type)
66
+ token(type, token_value)
67
+ end
68
+
69
+ sig { params(type: Symbol, value: T.nilable(String)).returns(Token) }
70
+ def token(type, value = nil)
71
+ span = Span.new(Position.new(@start_cursor), Position.new(@cursor - 1))
72
+ @start_cursor = @cursor
73
+ Token.new(type, span, value)
74
+ end
75
+
76
+ # Returns the current token value.
77
+ sig { returns(String) }
78
+ def token_value
79
+ T.must @source[@start_cursor...@cursor]
80
+ end
81
+
82
+ sig { returns([String, T::Boolean]) }
83
+ def advance_char
84
+ return '', false unless more_tokens?
85
+
86
+ char = next_char
87
+
88
+ @cursor += 1
89
+ [char, true]
90
+ end
91
+
92
+ sig { returns(String) }
93
+ def next_char
94
+ T.must @source[@cursor]
95
+ end
96
+
97
+ # Gets the next UTF-8 encoded character
98
+ # without incrementing the cursor.
99
+ sig { returns(String) }
100
+ def peek_char
101
+ return '' unless more_tokens?
102
+
103
+ char, = next_char
104
+ char
105
+ end
106
+
107
+ # Advance the next `n` characters
108
+ sig { params(n: Integer).returns(T::Boolean) }
109
+ def advance_chars(n)
110
+ n.times do
111
+ _, ok = advance_char
112
+ return false unless ok
113
+ end
114
+
115
+ true
116
+ end
117
+
118
+ # Checks if the given character matches
119
+ # the next UTF-8 encoded character in source code.
120
+ # If they match, the cursor gets incremented.
121
+ sig { params(char: String).returns(T::Boolean) }
122
+ def match_char(char)
123
+ return false unless more_tokens?
124
+
125
+ if peek_char == char
126
+ advance_char
127
+ return true
128
+ end
129
+
130
+ false
131
+ end
132
+
133
+ # Consumes the next character if it's from the valid set.
134
+ sig { params(valid_chars: String).returns(T::Boolean) }
135
+ def match_chars(valid_chars)
136
+ return false unless more_tokens?
137
+
138
+ p = peek_char
139
+ if p != '' && valid_chars.include?(p)
140
+ advance_char
141
+ return true
142
+ end
143
+
144
+ false
145
+ end
146
+
147
+ # Rewinds the cursor back n chars.
148
+ sig { params(n: Integer).void }
149
+ def backup_chars(n)
150
+ @cursor -= n
151
+ end
152
+
153
+ # Skips the current accumulated token.
154
+ sig { void }
155
+ def skip_token
156
+ @start_cursor = @cursor
157
+ end
158
+
159
+ sig { returns(Token) }
160
+ def scan_token
161
+ loop do
162
+ char, ok = advance_char
163
+ return token(Token::END_OF_FILE) unless ok
164
+
165
+ case char
166
+ when ','
167
+ return token(Token::COMMA)
168
+ when ';'
169
+ return token(Token::SEMICOLON)
170
+ when '('
171
+ return token(Token::LPAREN)
172
+ when ')'
173
+ return token(Token::RPAREN)
174
+ when '!'
175
+ return token(Token::NOT_EQUAL) if match_char('=')
176
+
177
+ return token(Token::BANG)
178
+ when '='
179
+ return token(Token::EQUAL_EQUAL) if match_char('=')
180
+
181
+ return token(Token::EQUAL)
182
+ when '>'
183
+ return token(Token::GREATER_EQUAL) if match_char('=')
184
+
185
+ return token(Token::GREATER)
186
+ when '<'
187
+ return token(Token::LESS_EQUAL) if match_char('=')
188
+
189
+ return token(Token::LESS)
190
+ when '+'
191
+ return token(Token::PLUS)
192
+ when '-'
193
+ return token(Token::MINUS)
194
+ when '*'
195
+ return token(Token::STAR)
196
+ when '/'
197
+ return token(Token::SLASH)
198
+ when '"'
199
+ return scan_string
200
+ when "\n"
201
+ return token(Token::NEWLINE)
202
+ when ' ', "\r", "\t"
203
+ skip_token
204
+ next
205
+ else
206
+ if char.match?(/[[:alpha:]]/)
207
+ return scan_identifier
208
+ elsif char.match?(/\d/)
209
+ return scan_number(char)
210
+ end
211
+
212
+ return token(Token::ERROR, "unexpected char `#{char}`")
213
+ end
214
+ end
215
+ end
216
+
217
+ sig { params(char: String).returns(T::Boolean) }
218
+ def identifier_char?(char)
219
+ char.match?(/[[:alpha:][:digit:]_]/)
220
+ end
221
+
222
+ sig { returns(Token) }
223
+ def scan_identifier
224
+ advance_char while identifier_char?(peek_char)
225
+
226
+ value = token_value
227
+ return token(value.to_sym) if Token::KEYWORDS.include?(value)
228
+
229
+ token(Token::IDENTIFIER, value)
230
+ end
231
+
232
+ sig { void }
233
+ def consume_digits
234
+ loop do
235
+ p = peek_char
236
+ break if p == '' || !Token::DIGITS.include?(peek_char)
237
+
238
+ _, ok = advance_char
239
+ break unless ok
240
+ end
241
+ end
242
+
243
+ # Checks if the next `n` characters are from the valid set.
244
+ sig { params(valid_chars: String, n: Integer).returns(T::Boolean) }
245
+ def accept_chars(valid_chars, n)
246
+ result = T.let(true, T::Boolean)
247
+ n.times do
248
+ unless match_chars(valid_chars)
249
+ result = false
250
+ break
251
+ end
252
+ end
253
+
254
+ backup_chars(n)
255
+
256
+ result
257
+ end
258
+
259
+ sig { params(init_char: String).returns(Token) }
260
+ def scan_number(init_char)
261
+ if init_char == '0'
262
+ p = peek_char
263
+ if accept_chars(Token::DIGITS, 1)
264
+ consume_digits
265
+ return token(
266
+ Token::ERROR,
267
+ 'illegal trailing zero in number literal',
268
+ )
269
+ end
270
+ end
271
+
272
+ consume_digits
273
+
274
+ is_float = false
275
+
276
+ if match_char('.')
277
+ is_float = true
278
+ p = peek_char
279
+ if p == ''
280
+ return token(
281
+ Token::ERROR,
282
+ 'unexpected EOF',
283
+ )
284
+ end
285
+
286
+ unless Token::DIGITS.include?(p)
287
+ return token(
288
+ Token::ERROR,
289
+ "unexpected char in number literal: `#{p}`",
290
+ )
291
+ end
292
+
293
+ consume_digits
294
+ end
295
+
296
+ if match_char('e') || match_char('E')
297
+ is_float = true
298
+ match_char('+') || match_char('-')
299
+ p = peek_char
300
+ if p == ''
301
+ return token(
302
+ Token::ERROR,
303
+ 'unexpected EOF',
304
+ )
305
+ end
306
+ unless Token::DIGITS.include?(p)
307
+ return token(
308
+ Token::ERROR,
309
+ "unexpected char in number literal: `#{p}`",
310
+ )
311
+ end
312
+ consume_digits
313
+ end
314
+
315
+ if is_float
316
+ return token_with_consumed_value(Token::FLOAT)
317
+ end
318
+
319
+ token_with_consumed_value(Token::INTEGER)
320
+ end
321
+
322
+ sig { void }
323
+ def swallow_rest_of_the_string
324
+ loop do
325
+ # swallow the rest of the string
326
+ ch, more_tokens = advance_char
327
+ break if !more_tokens || ch == '"'
328
+ end
329
+ end
330
+
331
+ sig { returns(Token) }
332
+ def scan_string
333
+ value_buffer = String.new
334
+ loop do
335
+ char, ok = advance_char
336
+ return token(Token::ERROR, 'unterminated string literal') unless ok
337
+ return token(Token::STRING, value_buffer) if char == '"'
338
+
339
+ if char != '\\'
340
+ value_buffer << char
341
+ next
342
+ end
343
+
344
+ char, ok = advance_char
345
+ return token(Token::ERROR, 'unterminated string literal') unless ok
346
+
347
+ case char
348
+ when '"'
349
+ value_buffer << '"'
350
+ when '\\'
351
+ value_buffer << '\\'
352
+ when '/'
353
+ value_buffer << '/'
354
+ when 'b'
355
+ value_buffer << "\b"
356
+ when 'f'
357
+ value_buffer << "\f"
358
+ when 'n'
359
+ value_buffer << "\n"
360
+ when 'r'
361
+ value_buffer << "\r"
362
+ when 't'
363
+ value_buffer << "\t"
364
+ when 'u'
365
+ unless accept_chars(Token::HEX_DIGITS, 4)
366
+ swallow_rest_of_the_string
367
+ return token(Token::ERROR, 'invalid unicode escape')
368
+ end
369
+
370
+ advance_chars(4)
371
+ last4 = T.must @source[@cursor - 4...@cursor]
372
+ value_buffer << [last4.hex].pack('U')
373
+ else
374
+ swallow_rest_of_the_string
375
+ return token(Token::ERROR, "invalid escape `\\#{char}`")
376
+ end
377
+ end
378
+ end
379
+ end
380
+ end