miniruby 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,325 @@
1
+ # typed: strong
2
+ # frozen_string_literal: true
3
+
4
+ module MiniRuby
5
+ # Contains the definitions of all AST (Abstract Syntax Tree) nodes.
6
+ # AST is the data structure that is returned by the parser.
7
+ module AST
8
+ # A string that represents a single level of indentation
9
+ # in S-expressions
10
+ INDENT_UNIT = ' '
11
+
12
# Abstract base class of every AST node.
# Concrete subclasses have to implement `to_s` and `inspect`.
class Node
  extend T::Sig
  extend T::Helpers

  abstract!

  # The `Span` attached to this node (`Span::ZERO` unless given).
  sig { returns(Span) }
  attr_accessor :span

  sig { params(span: Span).void }
  def initialize(span: Span::ZERO)
    @span = span
  end

  # A node equals any object that is an instance of the receiver's
  # class (subclasses included). The span is not compared.
  sig { params(other: Object).returns(T::Boolean) }
  def ==(other)
    node_class = self.class
    other.is_a?(node_class)
  end

  # Returns the Ruby-like representation of the node.
  # `indent` is the indentation level requested by the caller.
  sig { abstract.params(indent: Integer).returns(String) }
  def to_s(indent = 0); end

  # Returns the S-expression representation of the node.
  # `indent` is the indentation level requested by the caller.
  sig { abstract.params(indent: Integer).returns(String) }
  def inspect(indent = 0); end
end
40
+
41
# Root node of a program: an ordered list of statements.
class ProgramNode < Node
  sig { returns(T::Array[StatementNode]) }
  attr_reader :statements

  sig { params(statements: T::Array[StatementNode], span: Span).void }
  def initialize(statements:, span: Span::ZERO)
    @statements = statements
    @span = span
  end

  # Programs are equal when their statement lists are equal.
  # The span is not compared.
  sig { params(other: Object).returns(T::Boolean) }
  def ==(other)
    other.is_a?(ProgramNode) && @statements == other.statements
  end

  # Concatenates the Ruby-like representation of every statement,
  # passing the same `indent` to each of them.
  sig { override.params(indent: Integer).returns(String) }
  def to_s(indent = 0)
    @statements.each_with_object(String.new) do |stmt, source|
      source << stmt.to_s(indent)
    end
  end

  # Builds `(program ...)` with every statement on its own line,
  # inspected one indentation level deeper.
  sig { override.params(indent: Integer).returns(String) }
  def inspect(indent = 0)
    sexp = String.new
    sexp << "#{INDENT_UNIT * indent}(program"
    @statements.each { |stmt| sexp << "\n" << stmt.inspect(indent + 1) }
    sexp << ')'
  end
end
82
+
83
# Abstract parent of all statement nodes.
# A statement corresponds to a single line of code.
class StatementNode < Node
  abstract!
end
87
+
88
# A statement that consists of a single expression, e.g. `2 + 3 - 5;`
class ExpressionStatementNode < StatementNode
  sig { returns(ExpressionNode) }
  attr_reader :expression

  sig { params(expression: ExpressionNode, span: Span).void }
  def initialize(expression:, span: Span::ZERO)
    @expression = expression
    @span = span
  end

  # Equal when the wrapped expressions are equal. The span is not compared.
  sig { params(other: Object).returns(T::Boolean) }
  def ==(other)
    other.is_a?(ExpressionStatementNode) && @expression == other.expression
  end

  # The indented expression followed by a newline.
  # The expression itself is rendered with its default indentation.
  sig { override.params(indent: Integer).returns(String) }
  def to_s(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}#{@expression}\n"
  end

  # Builds `(expr_stmt ...)` with the expression on the following line,
  # inspected one indentation level deeper.
  sig { override.params(indent: Integer).returns(String) }
  def inspect(indent = 0)
    sexp = String.new
    sexp << "#{INDENT_UNIT * indent}(expr_stmt" << "\n" << @expression.inspect(indent + 1) << ')'
  end
end
120
+
121
# Abstract parent of all expression nodes.
# An expression like `2 + 3` may be nested in a larger
# expression or statement such as `2 + 3 - 5`.
class ExpressionNode < Node
  abstract!
end
126
+
127
# Placeholder expression that wraps a token which could not
# be turned into a valid node.
class InvalidNode < ExpressionNode
  sig { returns(Token) }
  attr_reader :token

  sig { params(token: Token, span: Span).void }
  def initialize(token:, span: Span::ZERO)
    @token = token
    @span = span
  end

  # Equal when the wrapped tokens are equal. The span is not compared.
  sig { params(other: Object).returns(T::Boolean) }
  def ==(other)
    other.is_a?(InvalidNode) && @token == other.token
  end

  sig { override.params(indent: Integer).returns(String) }
  def to_s(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}<invalid: `#{token}`>"
  end

  sig { override.params(indent: Integer).returns(String) }
  def inspect(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}(invalid #{token.inspect})"
  end
end
155
+
156
# The `false` literal.
# Both representations are the indented keyword.
class FalseLiteralNode < ExpressionNode
  sig { override.params(indent: Integer).returns(String) }
  def to_s(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}false"
  end

  sig { override.params(indent: Integer).returns(String) }
  def inspect(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}false"
  end
end
168
+
169
# The `true` literal.
# Both representations are the indented keyword.
class TrueLiteralNode < ExpressionNode
  sig { override.params(indent: Integer).returns(String) }
  def to_s(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}true"
  end

  sig { override.params(indent: Integer).returns(String) }
  def inspect(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}true"
  end
end
181
+
182
# The `nil` literal.
# Both representations are the indented keyword.
class NilLiteralNode < ExpressionNode
  sig { override.params(indent: Integer).returns(String) }
  def to_s(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}nil"
  end

  sig { override.params(indent: Integer).returns(String) }
  def inspect(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}nil"
  end
end
194
+
195
# The `self` literal.
# Both representations are the indented keyword.
class SelfLiteralNode < ExpressionNode
  sig { override.params(indent: Integer).returns(String) }
  def to_s(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}self"
  end

  sig { override.params(indent: Integer).returns(String) }
  def inspect(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}self"
  end
end
207
+
208
# A float literal, e.g. `123.5`.
# The value is kept as the string it was created with.
class FloatLiteralNode < ExpressionNode
  sig { returns(String) }
  attr_reader :value

  sig { params(value: String, span: Span).void }
  def initialize(value:, span: Span::ZERO)
    @value = value
    @span = span
  end

  # Equal when the value strings are equal. The span is not compared.
  sig { params(other: Object).returns(T::Boolean) }
  def ==(other)
    other.is_a?(FloatLiteralNode) && @value == other.value
  end

  sig { override.params(indent: Integer).returns(String) }
  def to_s(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}#{value}"
  end

  sig { override.params(indent: Integer).returns(String) }
  def inspect(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}#{value}"
  end
end
236
+
237
# An integer literal, e.g. `123`.
# The value is kept as the string it was created with.
class IntegerLiteralNode < ExpressionNode
  sig { returns(String) }
  attr_reader :value

  sig { params(value: String, span: Span).void }
  def initialize(value:, span: Span::ZERO)
    @value = value
    @span = span
  end

  # Equal when the value strings are equal. The span is not compared.
  sig { params(other: Object).returns(T::Boolean) }
  def ==(other)
    other.is_a?(IntegerLiteralNode) && @value == other.value
  end

  sig { override.params(indent: Integer).returns(String) }
  def to_s(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}#{value}"
  end

  sig { override.params(indent: Integer).returns(String) }
  def inspect(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}#{value}"
  end
end
265
+
266
# A string literal, e.g. `"foo"`.
# Both representations print the value through `String#inspect`,
# i.e. quoted and escaped.
class StringLiteralNode < ExpressionNode
  sig { returns(String) }
  attr_reader :value

  sig { params(value: String, span: Span).void }
  def initialize(value:, span: Span::ZERO)
    @value = value
    @span = span
  end

  # Equal when the values are equal. The span is not compared.
  sig { params(other: Object).returns(T::Boolean) }
  def ==(other)
    other.is_a?(StringLiteralNode) && @value == other.value
  end

  sig { override.params(indent: Integer).returns(String) }
  def to_s(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}#{value.inspect}"
  end

  sig { override.params(indent: Integer).returns(String) }
  def inspect(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}#{value.inspect}"
  end
end
294
+
295
# An identifier such as `a` or `foo`.
# `value` holds the identifier's name.
class IdentifierNode < ExpressionNode
  sig { returns(String) }
  attr_reader :value

  sig { params(value: String, span: Span).void }
  def initialize(value:, span: Span::ZERO)
    @value = value
    @span = span
  end

  # Equal when the names are equal. The span is not compared.
  sig { params(other: Object).returns(T::Boolean) }
  def ==(other)
    other.is_a?(IdentifierNode) && @value == other.value
  end

  sig { override.params(indent: Integer).returns(String) }
  def to_s(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}#{@value}"
  end

  sig { override.params(indent: Integer).returns(String) }
  def inspect(indent = 0)
    padding = INDENT_UNIT * indent
    "#{padding}#{@value}"
  end
end
323
+
324
+ end
325
+ end
@@ -0,0 +1,380 @@
1
+ # typed: strict
2
+ # frozen_string_literal: true
3
+
4
+ require_relative 'token'
5
+
6
+ module MiniRuby
7
+ # A lexical analyzer (tokenizer) for MiniRuby
8
+ class Lexer
9
+ extend T::Sig
10
+ extend T::Generic
11
+ include Enumerable
12
+
13
+ # Type parameter for `Enumerable`
14
+ # Declares the type that the lexer returns for tokens
15
+ Elem = type_member { { fixed: Token } }
16
+
17
class << self
  extend T::Sig

  # Lexes the whole `source` eagerly and returns its tokens as an array.
  # The array is built through `each`, which stops before yielding
  # the `END_OF_FILE` token.
  sig { params(source: String).returns(T::Array[Token]) }
  def lex(source)
    lexer = new(source)
    lexer.to_a
  end
end
25
+
26
# Creates a lexer positioned at the beginning of `source`.
sig { params(source: String).void }
def initialize(source)
  @source = source

  # offset of the next character to be read
  @cursor = T.let(0, Integer)
  # offset of the first character of the lexeme being scanned
  @start_cursor = T.let(0, Integer)
end
35
+
36
# Scans and returns the next token.
# Once the cursor has reached the end of the source, every call
# returns an `END_OF_FILE` token.
# NOTE(review): that token always spans positions 0..0 instead of
# the current cursor position; confirm this is intended.
sig { returns(Token) }
def next
  return scan_token if more_tokens?

  Token.new(Token::END_OF_FILE, Span.new(Position.new(0), Position.new(0)))
end
42
+
43
# Yields consecutive tokens to the block until an `END_OF_FILE`
# token is produced; that token is not yielded.
# Returns an enumerator when no block is given, `self` otherwise.
sig { override.params(block: T.nilable(T.proc.params(arg0: Token).void)).returns(T.untyped) }
def each(&block)
  return enum_for(T.must(__method__)) if block.nil?

  loop do
    current = self.next
    break if current.type == Token::END_OF_FILE

    block.call(current)
  end

  self
end
56
+
57
+ private
58
+
59
# True while the cursor has not reached the end of the source.
sig { returns(T::Boolean) }
def more_tokens?
  @source.length > @cursor
end
63
+
64
# Builds a token of the given type whose value is the text
# consumed since the start of the current lexeme.
sig { params(type: Symbol).returns(Token) }
def token_with_consumed_value(type)
  consumed = token_value
  token(type, consumed)
end
68
+
69
# Builds a token for the current lexeme and starts a new lexeme
# at the cursor. The span runs from the lexeme's first character
# to the last consumed character (inclusive).
sig { params(type: Symbol, value: T.nilable(String)).returns(Token) }
def token(type, value = nil)
  first = Position.new(@start_cursor)
  last = Position.new(@cursor - 1)
  lexeme_span = Span.new(first, last)

  @start_cursor = @cursor
  Token.new(type, lexeme_span, value)
end
75
+
76
# Returns the text of the current lexeme: everything between
# the lexeme start and the cursor (exclusive).
sig { returns(String) }
def token_value
  lexeme_range = @start_cursor...@cursor
  T.must(@source[lexeme_range])
end
81
+
82
# Consumes one character and moves the cursor forward.
# Returns the character and `true`, or `''` and `false`
# when the source is exhausted.
sig { returns([String, T::Boolean]) }
def advance_char
  if more_tokens?
    consumed = next_char
    @cursor += 1
    [consumed, true]
  else
    ['', false]
  end
end
91
+
92
# Returns the character under the cursor without consuming it.
# `T.must` raises when there is no character at the cursor.
sig { returns(String) }
def next_char
  T.must(@source[@cursor])
end
96
+
97
# Returns the next character without moving the cursor,
# or an empty string when the source is exhausted.
sig { returns(String) }
def peek_char
  more_tokens? ? next_char : ''
end
106
+
107
# Consumes the next `n` characters.
# Stops and returns `false` as soon as the source is exhausted;
# returns `true` when all `n` characters were consumed.
sig { params(n: Integer).returns(T::Boolean) }
def advance_chars(n)
  n.times.all? do
    _, advanced = advance_char
    advanced
  end
end
117
+
118
# Consumes the next character only when it equals `char`.
# Returns whether a character was consumed.
sig { params(char: String).returns(T::Boolean) }
def match_char(char)
  return false unless more_tokens? && peek_char == char

  advance_char
  true
end
132
+
133
# Consumes the next character only when `valid_chars` includes it.
# Returns whether a character was consumed.
sig { params(valid_chars: String).returns(T::Boolean) }
def match_chars(valid_chars)
  return false unless more_tokens?

  upcoming = peek_char
  return false if upcoming == '' || !valid_chars.include?(upcoming)

  advance_char
  true
end
146
+
147
# Moves the cursor `n` characters back.
# No lower bound is enforced on the resulting cursor.
sig { params(n: Integer).void }
def backup_chars(n)
  @cursor = @cursor - n
end
152
+
153
# Discards the text consumed so far by starting
# a new lexeme at the cursor.
sig { void }
def skip_token
  @start_cursor = @cursor
end
158
+
159
# Scans a single token starting at the cursor.
# Spaces, carriage returns and tabs are skipped; a newline is
# a token of its own. Returns `END_OF_FILE` when the source ends
# and an `ERROR` token for a character that starts no token.
sig { returns(Token) }
def scan_token
  loop do
    char, ok = advance_char
    return token(Token::END_OF_FILE) unless ok

    case char
    when ' ', "\r", "\t"
      # insignificant whitespace: drop it and scan the next character
      skip_token
    when "\n" then return token(Token::NEWLINE)
    when ',' then return token(Token::COMMA)
    when ';' then return token(Token::SEMICOLON)
    when '(' then return token(Token::LPAREN)
    when ')' then return token(Token::RPAREN)
    when '+' then return token(Token::PLUS)
    when '-' then return token(Token::MINUS)
    when '*' then return token(Token::STAR)
    when '/' then return token(Token::SLASH)
    # operators that may be followed by `=`
    when '!' then return token(match_char('=') ? Token::NOT_EQUAL : Token::BANG)
    when '=' then return token(match_char('=') ? Token::EQUAL_EQUAL : Token::EQUAL)
    when '>' then return token(match_char('=') ? Token::GREATER_EQUAL : Token::GREATER)
    when '<' then return token(match_char('=') ? Token::LESS_EQUAL : Token::LESS)
    when '"' then return scan_string
    else
      return scan_identifier if char.match?(/[[:alpha:]]/)
      return scan_number(char) if char.match?(/\d/)

      return token(Token::ERROR, "unexpected char `#{char}`")
    end
  end
end
216
+
217
# True when `char` contains a letter, a digit or an underscore.
sig { params(char: String).returns(T::Boolean) }
def identifier_char?(char)
  /[[:alpha:][:digit:]_]/.match?(char)
end
221
+
222
# Consumes the rest of an identifier and builds its token.
# A name listed in `Token::KEYWORDS` becomes a token whose type is
# the name as a symbol; any other name becomes an `IDENTIFIER`.
sig { returns(Token) }
def scan_identifier
  advance_char while identifier_char?(peek_char)

  name = token_value
  if Token::KEYWORDS.include?(name)
    token(name.to_sym)
  else
    token(Token::IDENTIFIER, name)
  end
end
231
+
232
# Consumes characters for as long as the next one
# is included in `Token::DIGITS`.
sig { void }
def consume_digits
  while (upcoming = peek_char) != '' && Token::DIGITS.include?(upcoming)
    _, advanced = advance_char
    break unless advanced
  end
end
242
+
243
# Checks if the next `n` characters are all from the valid set.
# This is a pure lookahead: the cursor is at the same position
# after the call as before it, whatever the result.
#
# `n` is expected to be non-negative.
sig { params(valid_chars: String, n: Integer).returns(T::Boolean) }
def accept_chars(valid_chars, n)
  matched = 0
  matched += 1 while matched < n && match_chars(valid_chars)

  # Rewind only the characters that were actually consumed.
  # Rewinding a fixed `n` after a partial match would move the cursor
  # in front of the lookahead start (possibly below zero), making the
  # lexer scan the same input again.
  backup_chars(matched)

  matched >= n
end
258
+
259
# Scans an integer or float literal whose first character
# `init_char` has already been consumed.
#
# Returns an `INTEGER` token, or a `FLOAT` token when the literal
# has a fractional part and/or an exponent. Returns an `ERROR` token when:
# - the literal starts with `0` followed by more digits,
# - a `.` or an exponent marker (`e`/`E` with an optional sign)
#   is not followed by a digit.
sig { params(init_char: String).returns(Token) }
def scan_number(init_char)
  if init_char == '0' && digit_ahead?
    # swallow the remaining digits so that the error spans the whole literal
    consume_digits
    return token(
      Token::ERROR,
      'illegal leading zero in number literal',
    )
  end

  consume_digits

  is_float = false

  if match_char('.')
    is_float = true
    return missing_digit_error unless digit_ahead?

    consume_digits
  end

  if match_char('e') || match_char('E')
    is_float = true
    # optional sign of the exponent
    match_char('+') || match_char('-')
    return missing_digit_error unless digit_ahead?

    consume_digits
  end

  token_with_consumed_value(is_float ? Token::FLOAT : Token::INTEGER)
end

# True when the next character is included in `Token::DIGITS`.
sig { returns(T::Boolean) }
def digit_ahead?
  upcoming = peek_char
  upcoming != '' && Token::DIGITS.include?(upcoming)
end

# Builds the error token for a position in a number literal where
# a digit is required; does not consume the offending character.
sig { returns(Token) }
def missing_digit_error
  upcoming = peek_char
  return token(Token::ERROR, 'unexpected EOF') if upcoming == ''

  token(Token::ERROR, "unexpected char in number literal: `#{upcoming}`")
end
321
+
322
# Consumes characters up to and including the next `"`,
# or until the source is exhausted.
sig { void }
def swallow_rest_of_the_string
  loop do
    consumed, ok = advance_char
    break unless ok && consumed != '"'
  end
end
330
+
331
# Scans a string literal whose opening `"` has already been consumed.
#
# Returns a `STRING` token carrying the unescaped content, or an
# `ERROR` token when the source ends before the closing `"` or
# when an escape sequence is invalid. After an invalid escape the
# rest of the literal is swallowed before the error is returned.
#
# Supported escapes: `\"`, `\\`, `\/`, `\b`, `\f`, `\n`, `\r`, `\t`
# and `\u` followed by four characters from `Token::HEX_DIGITS`.
sig { returns(Token) }
def scan_string
  value_buffer = String.new
  loop do
    char, ok = advance_char
    return token(Token::ERROR, 'unterminated string literal') unless ok
    return token(Token::STRING, value_buffer) if char == '"'

    unless char == '\\'
      # ordinary character, copied verbatim
      value_buffer << char
      next
    end

    escaped, ok = advance_char
    return token(Token::ERROR, 'unterminated string literal') unless ok

    case escaped
    when '"', '\\', '/' then value_buffer << escaped
    when 'b' then value_buffer << "\b"
    when 'f' then value_buffer << "\f"
    when 'n' then value_buffer << "\n"
    when 'r' then value_buffer << "\r"
    when 't' then value_buffer << "\t"
    when 'u'
      unless accept_chars(Token::HEX_DIGITS, 4)
        swallow_rest_of_the_string
        return token(Token::ERROR, 'invalid unicode escape')
      end

      # consume the four hex digits and append the code point they name
      advance_chars(4)
      hex_digits = T.must @source[@cursor - 4...@cursor]
      value_buffer << [hex_digits.hex].pack('U')
    else
      swallow_rest_of_the_string
      return token(Token::ERROR, "invalid escape `\\#{escaped}`")
    end
  end
end
379
+ end
380
+ end