kumi-parser 0.0.32 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,692 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kumi
4
+ module Parser
5
+ # Recursive-descent parser for declarations and a Pratt parser for
6
+ # expressions, producing kumi-core's Kumi::Syntax::* AST directly.
7
+ #
8
+ # The parser is the boundary of the parse phase: it reports *shape* errors
9
+ # (a missing `end`, an unexpected token, a malformed hash pair) with exact
10
+ # locations, and it does not attempt to resolve names, check types, or know
11
+ # anything about axes — those are semantic concerns owned by the analyzer.
12
+ class Parser
13
+ include Syntax = Kumi::Syntax
14
+
# Sets up a parser over an already-lexed token list.
#
# @param tokens [Array] tokens, read by index starting at position 0
# @param source [Object] source wrapper; asked for locations and handed to ParseError
def initialize(tokens, source)
  @source = source
  @tokens = tokens
  @pos = 0 # cursor into @tokens
  @imported_names = Set.new # names introduced by `import`, consulted by build_call
end
21
+
# Entry point: leading imports, one schema, then end of input.
#
# Names from the leading imports are recorded in @imported_names before the
# schema body is parsed, because build_call consults that set.
#
# @return [Object] the node returned by parse_schema
def parse
  skip_separators
  leading_imports = parse_imports
  leading_names = leading_imports.flat_map(&:names)
  @imported_names.merge(leading_names)

  schema = parse_schema(leading_imports)

  skip_separators
  expect(:eof, 'end of input')
  schema
end
33
+
34
+ private
35
+
36
+ # ---- cursor --------------------------------------------------------
37
+
# The token under the cursor, or nil when the cursor is outside the list.
def current
  cursor = @pos
  @tokens[cursor]
end
41
+
# Looks ahead without consuming anything.
#
# @param offset [Integer] how many tokens past the cursor to look
# @return [Object] the token at that position, or the last token when the
#   position is past the end of the list
def peek(offset = 1)
  ahead = @tokens[@pos + offset]
  ahead || @tokens.last
end
45
+
# True when the current token's kind is one of +kinds+.
def at?(*kinds)
  found = current.kind
  kinds.any? { |kind| kind == found }
end
49
+
# Returns the current token and moves the cursor past it. The cursor never
# moves past an :eof token.
def advance
  current.tap { |token| @pos += 1 unless token.kind == :eof }
end
55
+
# Consumes the current token if it has the wanted kind; otherwise reports a
# parse error that names both what was wanted and what was found.
#
# @param kind [Symbol] required token kind
# @param description [String, nil] wording for the error; falls back to describe_kind
# @return [Object] the consumed token
def expect(kind, description = nil)
  if current.kind == kind
    advance
  else
    wanted = description || describe_kind(kind)
    error("expected #{wanted}, but found #{describe_token(current)}")
  end
end
64
+
# Moves the cursor past any run of :newline and :comment tokens.
def skip_separators
  while at?(:newline, :comment)
    @pos += 1
  end
end
68
+
# Asks the source for the location of +token+, identified by its offset.
def loc(token)
  offset = token.offset
  @source.location(offset)
end
72
+
73
+ # ---- schema --------------------------------------------------------
74
+
# Parses `schema do … end`: optional imports, optional codegen directives,
# the input block, then value/let/trait declarations.
#
# @param root_imports [Array] imports that appeared before `schema`
# @return [Syntax::Root] root node, located at the `schema` keyword
def parse_schema(root_imports)
  schema_token = expect(:schema, '`schema do`')
  expect(:do, '`do` after `schema`')
  skip_separators

  nested_imports = parse_imports
  @imported_names.merge(nested_imports.flat_map(&:names))
  codegen_hints = parse_codegen_directives

  input_declarations = parse_input_block
  value_declarations, trait_declarations = parse_declarations

  expect(:end, '`end` to close the schema')

  all_imports = root_imports + nested_imports
  Syntax::Root.new(
    input_declarations,
    value_declarations,
    trait_declarations,
    all_imports,
    hints: codegen_hints,
    loc: loc(schema_token)
  )
end
98
+
# Collects consecutive `value`, `let` and `trait` declarations.
# `let` goes through parse_value with inline: true and lands in the values list.
#
# @return [Array(Array, Array)] [values, traits]
def parse_declarations
  values = []
  traits = []
  skip_separators
  loop do
    if at?(:value)
      values << parse_value(inline: false)
    elsif at?(:let)
      values << parse_value(inline: true)
    elsif at?(:trait)
      traits << parse_trait
    else
      break
    end
    skip_separators
  end
  [values, traits]
end
113
+
114
+ # ---- imports -------------------------------------------------------
115
+
# Parses zero or more consecutive `import` declarations.
#
# @return [Array] one node per `import`, in source order
def parse_imports
  skip_separators
  collected = []
  while at?(:import)
    collected.push(parse_import)
    skip_separators
  end
  collected
end
125
+
# Parses `import :a, :b, from: Some::Module`.
#
# The comma before `from:` is consumed by the name loop, which stops as soon
# as it sees the `from` label instead of another symbol.
#
# @return [Syntax::ImportDeclaration] located at the `import` keyword
def parse_import
  import_token = advance
  name_hint = 'a `:name` to import'
  imported = [expect(:symbol, name_hint).value]

  loop do
    break unless at?(:comma)

    advance
    skip_separators
    break if from_label?

    imported << expect(:symbol, name_hint).value
  end

  skip_separators
  if from_label?
    advance # consume `from:`
  else
    error('expected `from:` to name the module the import comes from')
  end
  skip_separators

  Syntax::ImportDeclaration.new(imported, parse_constant_path, loc: loc(import_token))
end
146
+
# True when the current token is the label `from:`.
def from_label?
  token = current
  token.kind == :label && token.value == 'from'
end
150
+
# Consumes a single :constant token and returns its value.
def parse_constant_path
  expect(:constant, 'a module name like `Foo::Bar`').value
end
155
+
156
+ # ---- codegen directives -------------------------------------------
157
+
# Parses zero or more `codegen …` directives. Options from repeated
# directives are merged, later ones overriding earlier ones.
#
# @return [Hash] empty, or { codegen: { … } }
def parse_codegen_directives
  hints = {}
  skip_separators
  while at?(:codegen)
    advance
    options = parse_codegen_options
    hints[:codegen] = hints.fetch(:codegen, {}).merge(options)
    skip_separators
  end
  hints
end
169
+
# Parses the comma-separated option list of one `codegen` directive.
# `streaming:` is the only option accepted; its value must be a boolean token.
#
# @return [Hash] { streaming: true | false }
def parse_codegen_options
  options = {}
  loop do
    option = expect(:label, 'a codegen option like `streaming:`')
    error("Unknown codegen option '#{option.value}'", at: option) if option.value != 'streaming'
    options[:streaming] = expect(:boolean, '`true` or `false`').value
    break unless at?(:comma)

    advance
    skip_separators
  end
  options
end
184
+
185
+ # ---- input block ---------------------------------------------------
186
+
# Parses `input do … end` and returns its declarations in source order.
def parse_input_block
  expect(:input, 'an `input do` block')
  expect(:do, '`do` after `input`')
  skip_separators
  fields = []
  while at?(:type_keyword)
    fields << parse_input_declaration
    skip_separators
  end
  expect(:end, '`end` to close the `input` block')
  fields
end
199
+
# Parses one input declaration: type keyword, `:name`, optional
# `domain:` / `index:` options, and an optional nested block of children.
#
# @return [Syntax::InputDeclaration] located at the type keyword
def parse_input_declaration
  type_token = expect(:type_keyword)
  declared_type = type_token.value
  name_token = expect(:symbol, "a `:name` for the #{declared_type} input")

  domain, index_name = parse_input_options(declared_type)
  children = parse_input_children(declared_type)

  Syntax::InputDeclaration.new(
    name_token.value, domain, declared_type, children, index_name,
    loc: loc(type_token)
  )
end
211
+
# Reads the optional `, domain: …` and `, index: :sym` options, in any order.
# A comma is only consumed when the token after it is a label.
#
# @param type [Symbol] declared input type; `index:` is rejected unless it is :array
# @return [Array] [domain, index_name], each nil when absent
def parse_input_options(type)
  domain = nil
  index_name = nil

  while at?(:comma) && peek.kind == :label
    advance # comma
    option = advance # label
    if option.value == 'domain'
      domain = parse_domain
    elsif option.value == 'index'
      error('`index:` is only valid on array declarations', at: option) unless type == :array
      index_name = expect(:symbol, 'an index name like `:i`').value
    else
      error("unknown option `#{option.value}:` on an input declaration", at: option)
    end
  end

  [domain, index_name]
end
233
+
# Parses the optional nested `do … end` block of a container input.
#
# A `do` after a non-container type is reported here, at the `do` token.
# Leaving it unconsumed would make the enclosing block fail later with a
# misleading "expected `end`" message.
#
# @param type [Symbol] declared type of the enclosing input
# @return [Array] child declarations; empty when there is no nested block
def parse_input_children(type)
  return [] unless at?(:do)

  unless Grammar::CONTAINER_TYPES.include?(type)
    error('a nested `do` block is only valid on container input declarations')
  end

  advance # consume `do`
  children = []
  skip_separators
  while at?(:type_keyword)
    children << parse_input_declaration
    skip_separators
  end
  expect(:end, '`end` to close the nested input block')
  children
end
247
+
# Domains: a bracketed list `[a, b]` or a numeric range `lo..hi`.
# List elements that are Syntax::Literal nodes are unwrapped to their raw
# value; any other element is kept as the node it parsed to.
def parse_domain
  if at?(:lbracket)
    parse_array_literal.elements.map do |element|
      element.is_a?(Syntax::Literal) ? element.value : element
    end
  elsif at?(:integer, :float)
    parse_range_domain
  else
    error('expected a domain: a list like `[1, 2]` or a range like `0..10`')
  end
end
260
+
# Parses a domain that starts with a number: `lo..hi` (inclusive),
# `lo...hi` (exclusive), or a lone number, which becomes `[lo]`.
def parse_range_domain
  lower = advance.value
  if at?(:dot_dot)
    advance
    Range.new(lower, numeric_bound)
  elsif at?(:dot_dot_dot)
    advance
    Range.new(lower, numeric_bound, true)
  else
    [lower]
  end
end
274
+
# Consumes the upper bound of a range domain: a :float token if one is
# present, otherwise an :integer token is required.
def numeric_bound
  wanted = at?(:float) ? :float : :integer
  expect(wanted, 'a number').value
end
279
+
280
+ # ---- value / let / trait ------------------------------------------
281
+
# Parses `value :name, expr`, `value :name do … end`, or the `let` spelling
# of either form.
#
# @param inline [Boolean] true for `let`; recorded as the `inline: true` hint
# @return [Syntax::ValueDeclaration] located at the keyword
def parse_value(inline:)
  keyword = advance
  label = inline ? 'let' : 'value'
  name = expect(:symbol, "a `:name` for the #{label}").value

  if at?(:do)
    expression = parse_cascade
  else
    expect(:comma, '`,` then an expression')
    expression = parse_expression
  end

  hints = {}
  hints[:inline] = true if inline
  Syntax::ValueDeclaration.new(name, expression, hints: hints, loc: loc(keyword))
end
297
+
# Parses `trait :name, expression`.
#
# @return [Syntax::TraitDeclaration] located at the `trait` keyword
def parse_trait
  keyword = advance
  trait_name = expect(:symbol, 'a `:name` for the trait').value
  expect(:comma, '`,` then a boolean expression')
  Syntax::TraitDeclaration.new(trait_name, parse_expression, loc: loc(keyword))
end
305
+
306
+ # ---- cascade -------------------------------------------------------
307
+
# Parses a `do … end` cascade body made of `on` and `base` cases.
#
# @return [Syntax::CascadeExpression] located at the opening `do`
def parse_cascade
  opener = expect(:do)
  skip_separators
  branches = []
  while at?(:on, :base)
    branches << parse_case
    skip_separators
  end
  expect(:end, '`end` to close the cascade')
  Syntax::CascadeExpression.new(branches, loc: loc(opener))
end
319
+
# Parses one cascade case.
#
# `on c1, c2, …, result` — every expression but the last is a condition and
# the last is the result. `base result` — the condition is a literal `true`.
#
# An `on` with a single expression has no result. That is a shape error and
# is reported at the `on` keyword rather than silently treating the lone
# expression as the result of an empty condition list.
#
# @return [Syntax::CaseExpression] located at the `on` / `base` keyword
def parse_case
  if at?(:on)
    on_token = advance
    exprs = [parse_expression]
    while at?(:comma)
      advance
      exprs << parse_expression
    end
    if exprs.length < 2
      error('an `on` case needs a condition and a result, like `on condition, result`', at: on_token)
    end
    result = exprs.pop
    condition = build_case_condition(exprs, on_token)
    Syntax::CaseExpression.new(condition, result, loc: loc(on_token))
  else
    base_token = advance
    result = parse_expression
    true_literal = Syntax::Literal.new(true, loc: loc(base_token))
    Syntax::CaseExpression.new(true_literal, result, loc: loc(base_token))
  end
end
338
+
# Builds the condition node of an `on` case.
#
# A lone condition that is not a declaration reference is used as-is.
# A lone declaration reference, or any other number of conditions, is wrapped
# in a `cascade_and` call located at the `on` keyword.
def build_case_condition(conditions, on_token)
  lone = conditions.length == 1
  return conditions.first if lone && !conditions.first.is_a?(Syntax::DeclarationReference)

  operands = lone ? [conditions.first] : conditions
  Syntax::CallExpression.new(:cascade_and, operands, loc: loc(on_token))
end
354
+
355
+ # ---- expressions (Pratt) ------------------------------------------
356
+
# Precedence-climbing expression parser.
#
# Parses a primary with its postfix accesses, then folds in binary operators
# whose precedence is at least +min_precedence+. The right operand is parsed
# at the operator's own precedence when Grammar reports it right-associative,
# and at one level higher otherwise. Separators are skipped after each
# operand, so an operator may start the following line.
#
# @param min_precedence [Integer] lowest operator precedence to accept
# @return [Object] the expression node
def parse_expression(min_precedence = 0)
  left = parse_postfix(parse_primary)
  skip_separators

  loop do
    kind = current.kind
    break unless Grammar.binary_operator?(kind) && Grammar.precedence(kind) >= min_precedence

    op = advance
    skip_separators
    binding_power = Grammar.precedence(op.kind)
    binding_power += 1 unless Grammar.right_associative?(op.kind)
    right = parse_expression(binding_power)
    combined = Syntax::CallExpression.new(Grammar.operator_fn(op.kind), [left, right], loc: loc(op))
    left = parse_postfix(combined)
    skip_separators
  end

  left
end
373
+
# Folds trailing `[index]` accesses into `at(base, index)` calls.
# Each call reuses the location of the node it wraps.
def parse_postfix(base)
  node = base
  skip_separators
  while at?(:lbracket)
    advance
    index = parse_expression
    expect(:rbracket, '`]` to close the index')
    node = Syntax::CallExpression.new(:at, [node, index], loc: node.loc)
    skip_separators
  end
  node
end
386
+
# Parses one primary expression, chosen by the kind of the current token:
# a literal, a known constant, a call form, an input reference, an
# identifier, a parenthesised group, an array or hash literal, or unary minus.
# Any other token is a parse error.
def parse_primary
  skip_separators
  token = current

  case token.kind
  when :function_sugar then parse_function_sugar
  when :fn then parse_fn_call
  when :input then parse_input_reference
  when :identifier then parse_identifier_expression
  when :lbracket then parse_array_literal
  when :lbrace then parse_hash_literal
  when :subtract then parse_unary_minus
  when :integer, :float, :string, :boolean, :symbol
    advance
    Syntax::Literal.new(token.value, loc: loc(token))
  when :constant
    advance
    Syntax::Literal.new(resolve_constant(token), loc: loc(token))
  when :lparen
    advance
    parse_expression.tap { expect(:rparen, '`)` to close the group') }
  else
    error("expected an expression, but found #{describe_token(token)}")
  end
end
421
+
# Lowers `-operand` to `subtract(0, operand)`. The operand is a primary plus
# its postfix accesses, so the minus applies before any binary operator.
def parse_unary_minus
  minus = advance
  operand = parse_postfix(parse_primary)
  Syntax::CallExpression.new(
    :subtract,
    [Syntax::Literal.new(0, loc: loc(minus)), operand],
    loc: loc(minus)
  )
end
428
+
# An identifier starts one of three things: an `input.…` reference, a
# `name(...)` call, or a bare reference to another declaration.
def parse_identifier_expression
  token = current
  return parse_input_reference if token.value == 'input' && peek.kind == :dot
  return parse_named_call if peek.kind == :lparen

  advance
  Syntax::DeclarationReference.new(token.value.to_sym, loc: loc(token))
end
440
+
# Parses `input.field` or `input.a.b…`.
# A single-segment path builds an InputReference; a longer path builds an
# InputElementReference carrying the whole path.
def parse_input_reference
  input_token = advance
  if input_token.kind == :identifier && input_token.value != 'input'
    error('expected `input`', at: input_token)
  end
  expect(:dot, '`.` after `input`')

  segments = [expect_field_name]
  while at?(:dot)
    advance
    segments << expect_field_name
  end

  return Syntax::InputReference.new(segments.first, loc: loc(input_token)) if segments.size == 1

  Syntax::InputElementReference.new(segments, loc: loc(input_token))
end
457
+
# Consumes one path segment after `.` and returns it as a Symbol.
# Identifiers, type keywords and keyword tokens are all accepted as names.
def expect_field_name
  token = current
  nameable = %i[identifier type_keyword].include?(token.kind) || keyword_token?(token)
  error("expected an input field name after `.`, but found #{describe_token(token)}") unless nameable

  advance
  field_name_value(token).to_sym
end
467
+
# True when the token's kind appears among the values of Grammar::KEYWORDS.
def keyword_token?(token)
  kind = token.kind
  Grammar::KEYWORDS.value?(kind)
end
471
+
# The source word for a token used as a field name. A :type_keyword token
# carries the type symbol as its value, so the word is looked up in reverse
# in Grammar::TYPE_KEYWORDS; every other token's value is returned as-is.
def field_name_value(token)
  if token.kind == :type_keyword
    Grammar::TYPE_KEYWORDS.key(token.value)
  else
    token.value
  end
end
478
+
479
+ # ---- calls ---------------------------------------------------------
480
+
# `name(args)` sugar: `select(...)`, `cross(...)`, `index(:i)`, etc.
# Keyword args here are function options (`policy: :clamp`, `axis_offset: 1`)
# and are read as raw scalars (keyword_mode: :literal).
#
# The "expected `(`" description quotes the function name in backticks, the
# same way the descriptions for `fn` and named calls do.
#
# @return [Syntax::CallExpression] located at the function-name token
def parse_function_sugar
  token = advance
  expect(:lparen, "`(` after `#{Grammar::FUNCTION_SUGAR.key(token.value)}`")
  args, opts = parse_call_arguments(keyword_mode: :literal)
  Syntax::CallExpression.new(token.value, args, opts, loc: loc(token))
end
490
+
# `fn(:name, args…)` explicit form. Keyword args are read as raw scalars.
# `fn(:name)` with nothing after the name produces empty args and options.
def parse_fn_call
  fn_token = advance
  expect(:lparen, '`(` after `fn`')
  function_name = expect(:symbol, 'a `:function_name` inside `fn(...)`').value

  if at?(:comma)
    advance
    args, opts = parse_call_arguments(keyword_mode: :literal)
  else
    expect(:rparen, '`)` to close `fn(...)`')
    args = []
    opts = {}
  end

  build_call(function_name, args, opts, fn_token)
end
506
+
# `name(...)` where `name` is a bare identifier. Keyword argument values are
# parsed as full expressions here (keyword_mode: :expression), e.g.
# `subtotal(items: input.order_items)`.
def parse_named_call
  name_token = advance
  callee = name_token.value.to_sym
  expect(:lparen, "`(` after `#{callee}`")
  positional, keywords = parse_call_arguments(keyword_mode: :expression)
  build_call(callee, positional, keywords, name_token)
end
517
+
# Chooses the node for a parsed call. An ImportCall is built only when the
# name was imported, there are no positional args, and at least one keyword
# arg is present; everything else becomes a CallExpression.
def build_call(name, args, opts, token)
  import_call = @imported_names.include?(name) && args.empty? && !opts.empty?
  return Syntax::ImportCall.new(name, opts, loc: loc(token)) if import_call

  Syntax::CallExpression.new(name, args, opts, loc: loc(token))
end
525
+
# Parses the inside of a `(...)` argument list, up to and including the
# closing paren. Entries that start with a label are keyword args; all
# others are positional expressions.
#
# @param keyword_mode [Symbol] :literal reads keyword values as raw scalars
#   (function options); any other value, in practice :expression, reads them
#   as full expressions (import input mappings)
# @return [Array(Array, Hash)] [positional args, keyword args]
def parse_call_arguments(keyword_mode:)
  positional = []
  keywords = {}
  skip_separators
  return [positional, keywords] if consume_rparen

  loop do
    if at?(:label)
      key = advance.value.to_sym
      keywords[key] =
        if keyword_mode == :literal
          parse_keyword_literal
        else
          parse_expression
        end
    else
      positional << parse_expression
    end
    skip_separators
    break unless at?(:comma)

    advance
    skip_separators
  end

  expect(:rparen, '`)` to close the argument list')
  [positional, keywords]
end
554
+
# Consumes a `)` if one is next; returns whether it did.
def consume_rparen
  closing = at?(:rparen)
  advance if closing
  closing
end
561
+
# A function option's value: a raw scalar, not an AST node. `:clamp` stays
# a symbol, `1` stays an Integer, and a `label:`-style word becomes a symbol.
#
# A leading `-` negates the scalar that follows. When that scalar is not
# numeric, the error is reported at the `-` token itself; by then the cursor
# has already moved past the operand, so the default location would point at
# an unrelated token.
#
# @return [Object] the raw value
def parse_keyword_literal
  token = current
  case token.kind
  when :integer, :float, :string, :boolean, :symbol
    advance.value
  when :label
    advance.value.to_sym
  when :subtract
    advance
    operand = parse_keyword_literal
    error('expected a number after unary `-`', at: token) unless operand.is_a?(Numeric)
    -operand
  else
    error('a function option value must be a literal or symbol, ' \
          "but found #{describe_token(token)}")
  end
end
584
+
585
+ # ---- array / hash literals ----------------------------------------
586
+
# Parses `[a, b, …]`. Elements are full expressions; separators are allowed
# around them and a trailing comma before `]` is accepted.
#
# @return [Syntax::ArrayExpression] located at the opening `[`
def parse_array_literal
  opener = expect(:lbracket)
  items = []
  skip_separators
  loop do
    break if at?(:rbracket)

    items << parse_expression
    skip_separators
    break unless at?(:comma)

    advance
    skip_separators
  end
  expect(:rbracket, '`]` to close the array')
  Syntax::ArrayExpression.new(items, loc: loc(opener))
end
602
+
# Parses `{ key: value, … }`. Each entry is read by parse_hash_pair; a
# trailing comma before `}` is accepted.
#
# @return [Syntax::HashExpression] located at the opening `{`
def parse_hash_literal
  opener = expect(:lbrace)
  entries = []
  skip_separators
  loop do
    break if at?(:rbrace)

    entries << parse_hash_pair
    skip_separators
    break unless at?(:comma)

    advance
    skip_separators
  end
  expect(:rbrace, '`}` to close the hash')
  Syntax::HashExpression.new(entries, loc: loc(opener))
end
618
+
# Parses one hash entry and returns it as [key_literal, value_expression].
#
# The key is a `name:` label (stored as a Symbol), a `:symbol`, or a
# "string". A `=>` is required after symbol and string keys and is optional
# after a label.
def parse_hash_pair
  key_token = current
  unless %i[label string symbol].include?(key_token.kind)
    error('a hash key must be a `name:` label, a `:symbol`, or a "string", ' \
          "but found #{describe_token(key_token)}")
  end
  advance

  key_value = key_token.kind == :label ? key_token.value.to_sym : key_token.value
  key = Syntax::Literal.new(key_value, loc: loc(key_token))

  skip_separators
  if at?(:arrow)
    advance
  elsif key_token.kind != :label
    error('expected `=>` after the hash key')
  end
  skip_separators

  [key, parse_expression]
end
642
+
643
+ # ---- constants -----------------------------------------------------
644
+
# The parse phase resolves only the handful of constants whose value is
# part of the language surface; everything else is left to the analyzer.
KNOWN_CONSTANTS = { 'Float::INFINITY' => Float::INFINITY }.freeze

# Returns the value of a known constant, or reports a parse error at +token+.
def resolve_constant(token)
  name = token.value
  return KNOWN_CONSTANTS[name] if KNOWN_CONSTANTS.key?(name)

  error("unknown constant `#{name}` (use an inline value instead)", at: token)
end
654
+
655
+ # ---- errors --------------------------------------------------------
656
+
# Raises a ParseError carrying the source and the offset of +at+.
#
# @param message [String]
# @param at [Object] token to blame; defaults to the current token
def error(message, at: current)
  failure = ParseError.new(message, source: @source, offset: at.offset)
  raise failure
end
660
+
# Renders a token for use in an error message. Type keywords and function
# sugar carry a symbol as their value, so their source word is looked up in
# reverse in the Grammar tables; any kind not listed is shown as its value
# in backticks.
def describe_token(token)
  value = token.value
  case token.kind
  when :eof then 'end of input'
  when :newline then 'a line break'
  when :string then value.inspect
  when :symbol then "`:#{value}`"
  when :label then "`#{value}:`"
  when :type_keyword then "`#{Grammar::TYPE_KEYWORDS.key(value)}`"
  when :function_sugar then "`#{Grammar::FUNCTION_SUGAR.key(value)}`"
  else "`#{value}`"
  end
end
674
+
# Wording for token kinds in "expected …" messages. Built once and frozen
# rather than rebuilt on every describe_kind call.
KIND_DESCRIPTIONS = {
  symbol: 'a `:name`',
  comma: '`,`',
  do: '`do`',
  end: '`end`',
  rparen: '`)`',
  rbracket: '`]`',
  rbrace: '`}`',
  lparen: '`(`',
  dot: '`.`',
  arrow: '`=>`',
  eof: 'end of input'
}.freeze

# Describes a token kind for an error message.
#
# @param kind [Symbol]
# @return [String] the listed wording, or the kind name in backticks
def describe_kind(kind)
  KIND_DESCRIPTIONS.fetch(kind) { "`#{kind}`" }
end
690
+ end
691
+ end
692
+ end