odin-foundation 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/lib/odin/diff/differ.rb +115 -0
  3. data/lib/odin/diff/patcher.rb +64 -0
  4. data/lib/odin/export.rb +330 -0
  5. data/lib/odin/parsing/parser.rb +1193 -0
  6. data/lib/odin/parsing/token.rb +26 -0
  7. data/lib/odin/parsing/token_type.rb +40 -0
  8. data/lib/odin/parsing/tokenizer.rb +825 -0
  9. data/lib/odin/parsing/value_parser.rb +322 -0
  10. data/lib/odin/resolver/import_resolver.rb +137 -0
  11. data/lib/odin/serialization/canonicalize.rb +112 -0
  12. data/lib/odin/serialization/stringify.rb +582 -0
  13. data/lib/odin/transform/format_exporters.rb +819 -0
  14. data/lib/odin/transform/source_parsers.rb +385 -0
  15. data/lib/odin/transform/transform_engine.rb +2837 -0
  16. data/lib/odin/transform/transform_parser.rb +979 -0
  17. data/lib/odin/transform/transform_types.rb +278 -0
  18. data/lib/odin/transform/verb_context.rb +87 -0
  19. data/lib/odin/transform/verbs/aggregation_verbs.rb +106 -0
  20. data/lib/odin/transform/verbs/collection_verbs.rb +640 -0
  21. data/lib/odin/transform/verbs/datetime_verbs.rb +602 -0
  22. data/lib/odin/transform/verbs/financial_verbs.rb +356 -0
  23. data/lib/odin/transform/verbs/geo_verbs.rb +125 -0
  24. data/lib/odin/transform/verbs/numeric_verbs.rb +434 -0
  25. data/lib/odin/transform/verbs/object_verbs.rb +123 -0
  26. data/lib/odin/types/array_item.rb +42 -0
  27. data/lib/odin/types/diff.rb +89 -0
  28. data/lib/odin/types/directive.rb +28 -0
  29. data/lib/odin/types/document.rb +92 -0
  30. data/lib/odin/types/document_builder.rb +67 -0
  31. data/lib/odin/types/dyn_value.rb +270 -0
  32. data/lib/odin/types/errors.rb +149 -0
  33. data/lib/odin/types/modifiers.rb +45 -0
  34. data/lib/odin/types/ordered_map.rb +79 -0
  35. data/lib/odin/types/schema.rb +262 -0
  36. data/lib/odin/types/value_type.rb +28 -0
  37. data/lib/odin/types/values.rb +618 -0
  38. data/lib/odin/types.rb +12 -0
  39. data/lib/odin/utils/format_utils.rb +186 -0
  40. data/lib/odin/utils/path_utils.rb +25 -0
  41. data/lib/odin/utils/security_limits.rb +17 -0
  42. data/lib/odin/validation/format_validators.rb +238 -0
  43. data/lib/odin/validation/redos_protection.rb +102 -0
  44. data/lib/odin/validation/schema_parser.rb +813 -0
  45. data/lib/odin/validation/schema_serializer.rb +262 -0
  46. data/lib/odin/validation/validator.rb +1061 -0
  47. data/lib/odin/version.rb +5 -0
  48. data/lib/odin.rb +90 -0
  49. metadata +160 -0
@@ -0,0 +1,825 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "strscan"
4
+
5
+ module Odin
6
+ module Parsing
7
+ class Tokenizer
8
+ MAX_DOCUMENT_SIZE = Utils::SecurityLimits::MAX_DOCUMENT_SIZE # compared against the source's bytesize in check_document_size!
9
+
10
+ # Pre-compiled regex patterns handed to StringScanner#scan by the scan_* methods (all frozen).
11
+ RE_WHITESPACE = /[ \t]+/ # horizontal whitespace only; line breaks are tokenized separately
12
+ RE_NEWLINE_CRLF = /\r\n?/ # not referenced by the methods in this class, which handle \r and \n byte by byte
13
+ RE_IDENTIFIER = /[a-zA-Z_][a-zA-Z0-9_\-]*/ # not referenced by the methods in this class
14
+ RE_IDENT_PATH = /[a-zA-Z_][a-zA-Z0-9_\-.]*/ # not referenced by the methods in this class
15
+ RE_NUMERIC = /[+\-]?[0-9eE.+\-]+/ # permissive: accepts any run of digits, e/E, dots and signs
16
+ RE_CURRENCY_VAL = /[+\-]?[0-9.]+(?:[eE][+\-]?\d+)?(?::[a-zA-Z0-9_\-]+)?/ # amount with an optional ":CODE" suffix
17
+ RE_WORD = /[a-zA-Z0-9_.\-]+/
18
+ RE_HEADER_CONTENT = /[^}\r\n]*/ # may match the empty string
19
+ RE_COMMENT_CONTENT = /[^\r\n]*/ # may match the empty string
20
+ RE_REF_PATH = /[a-zA-Z0-9_.\[\]()?\-@']*/
21
+ RE_BINARY_DATA = /[^\s;\r\n]*/
22
+ RE_BARE_VALUE = /[^\s;:\r\n]+/
23
+ RE_DATE_OR_NUM = /[0-9eE.\-:+TZ]+/
24
+ RE_DATE_PREFIX = /\A\d{4}-\d{2}-\d{2}T/ # matched against an already-scanned value, hence the \A anchor
25
+ RE_DATE_EXACT = /\A\d{4}-\d{2}-\d{2}\z/ # matched against an already-scanned value
26
+ RE_DURATION = /P[0-9YMWDTHS.]+/
27
+ RE_TIME_VAL = /T[0-9:.+\-Z]+/
28
+ RE_ARRAY_INDEX = /\[[^\]]*\]/ # needs a closing "]"; [^\]]* also matches line breaks
29
+
30
+ ESCAPE_MAP = { # same single-character escapes that scan_string handles inline; the map itself is not referenced in this class
31
+ '"' => '"',
32
+ '\\' => '\\',
33
+ 'n' => "\n",
34
+ 't' => "\t",
35
+ 'r' => "\r",
36
+ '0' => "\0",
37
+ '/' => '/'
38
+ }.freeze
39
+
40
# Set up a tokenizer over +text+.
#
# Keeps the text, wraps it in a StringScanner, starts position tracking at
# line 1 / column 1 and pre-sizes the token buffer from the text length.
#
# text - the document source (a String).
def initialize(text)
  @source = text
  @scanner = StringScanner.new(text)
  @line = 1
  @col = 1
  @token_count = 0
  # Capacity guess: one slot per ten characters plus 16 spare. The array
  # starts out filled with nil and grows on demand if the guess is too low.
  @tokens = Array.new(16 + text.length / 10)
end
48
+
49
# Tokenize the whole source and return the tokens as an Array.
#
# Runs the size check, skips a leading byte order mark, scans every token
# and finally appends an EOF token positioned at the current line/column.
# Only the filled part of the pre-sized buffer is returned.
def tokenize
  check_document_size!
  skip_bom
  scan_tokens
  emit(TokenType::EOF, "", @line, @col)

  @tokens.take(@token_count)
end
56
+
57
+ private
58
+
59
# Reject sources whose byte size exceeds MAX_DOCUMENT_SIZE.
#
# Raises Errors::ParseError (code MAXIMUM_DOCUMENT_SIZE_EXCEEDED, reported at
# line 1, column 1) when the limit is exceeded; returns nil otherwise.
def check_document_size!
  size = @source.bytesize
  return unless size > MAX_DOCUMENT_SIZE

  raise Errors::ParseError.new(
    Errors::ParseErrorCode::MAXIMUM_DOCUMENT_SIZE_EXCEEDED,
    1, 1, "Document size #{size} exceeds limit #{MAX_DOCUMENT_SIZE}"
  )
end
67
+
68
# Skip a leading UTF-8 byte order mark (bytes EF BB BF), if present.
#
# The check is done on raw bytes so it works whatever encoding the source
# string is tagged with. Comparing with String#start_with?("\uFEFF") raises
# Encoding::CompatibilityError when the source is binary-tagged and contains
# non-ASCII bytes, which are exactly the inputs a byte-level check is for.
#
# On a match the scanner is placed just past the BOM and the column is reset
# to 1, so the BOM does not count towards column numbers. Without a BOM
# nothing is changed.
def skip_bom
  return unless @source.bytesize >= 3 &&
                @source.getbyte(0) == 0xEF &&
                @source.getbyte(1) == 0xBB &&
                @source.getbyte(2) == 0xBF

  @scanner.pos = 3
  @col = 1
end
80
+
81
# Store a new Token in the next free slot of the token buffer.
#
# type, value, line, col and raw are passed straight to Token.new. The slot
# is assigned by index because the buffer is pre-sized, so appending would
# land after the unused slots. Returns the updated token count.
def emit(type, value, line, col, raw: nil)
  slot = @token_count
  @tokens[slot] = Token.new(type, value, line, col, raw: raw)
  @token_count = slot + 1
end
85
+
86
# Update @line/@col for a piece of text that has just been consumed.
#
# Every "\n" starts a new line and resets the column to 1; every other
# character advances the column by one. Counting is done per character, so
# multi-byte characters move the column by one and a newline that follows
# them is still seen (a byte-indexed walk bounded by the character count
# would stop short on such text).
#
# text - the consumed String; may be empty.
def track(text)
  newlines = text.count("\n")

  if newlines.zero?
    @col += text.length
  else
    @line += newlines
    # Column = 1 + number of characters after the last line break.
    @col = text.length - text.rindex("\n")
  end
end
100
+
101
# Move the scanner forward by n bytes and bring line/column up to date.
#
# The bytes about to be skipped are read with peek first, then handed to
# track once the scanner position has been advanced.
def skip_bytes(n)
  consumed = @scanner.peek(n)
  @scanner.pos = @scanner.pos + n
  track(consumed)
end
107
+
108
# Main scanning loop: walk the source until the end and emit tokens.
#
# Horizontal whitespace is skipped. Otherwise the next byte decides what
# happens: simple one-character tokens are emitted here, everything else is
# delegated to the matching scan_* method. Line breaks (\n, \r, \r\n) are
# each reported as a single NEWLINE token with the value "\n".
#
# A character that cannot start any token becomes an ERROR token holding
# that character. The whole character is consumed with StringScanner#getch,
# so a multi-byte character yields one ERROR token, not one per byte.
def scan_tokens
  s = @scanner

  until s.eos?
    # Skip horizontal whitespace
    if (ws = s.scan(RE_WHITESPACE))
      @col += ws.length
      next
    end

    line = @line
    col = @col
    byte = s.string.getbyte(s.pos)

    case byte
    when 10 # \n
      s.pos += 1
      emit(TokenType::NEWLINE, "\n", line, col)
      @line += 1
      @col = 1
    when 13 # \r, optionally followed by \n
      s.pos += (s.string.getbyte(s.pos + 1) == 10 ? 2 : 1)
      emit(TokenType::NEWLINE, "\n", line, col)
      @line += 1
      @col = 1
    when 59 # ; starts a comment that runs to the end of the line
      s.pos += 1
      @col += 1
      text = s.scan(RE_COMMENT_CONTENT) || ""
      emit(TokenType::COMMENT, text.strip, line, col)
      @col += text.length
    when 123 # {
      scan_header(line, col)
    when 61 # =
      s.pos += 1
      @col += 1
      emit(TokenType::EQUALS, "=", line, col)
      # Skip whitespace after =
      if (ws = s.scan(RE_WHITESPACE))
        @col += ws.length
      end
      scan_value_side
    when 124 # |
      s.pos += 1
      @col += 1
      emit(TokenType::PIPE, "|", line, col)
    when 35 # #
      scan_number_prefix(line, col)
    when 34 # "
      scan_string(line, col)
    when 63 # ? must be followed by true or false
      s.pos += 1
      @col += 1
      word = s.scan(RE_WORD) || ""
      @col += word.length
      if word == "true" || word == "false"
        emit(TokenType::BOOLEAN, word, line, col)
      else
        emit(TokenType::ERROR, "Invalid boolean: ?#{word}", line, col)
      end
    when 126 # ~
      s.pos += 1
      @col += 1
      emit(TokenType::NULL, "~", line, col)
    when 64 # @
      scan_reference(line, col)
    when 94 # ^
      scan_binary(line, col)
    when 37 # %
      scan_verb(line, col)
    when 44 # , is reported as a PATH token
      s.pos += 1
      @col += 1
      emit(TokenType::PATH, ",", line, col)
    when 33, 42 # ! *
      s.pos += 1
      @col += 1
      emit(TokenType::MODIFIER, byte == 33 ? "!" : "*", line, col)
    when 45, 46, 38 # - . &
      scan_identifier(line, col)
    when 58 # :
      scan_directive(line, col)
    when 91 # [
      scan_array_indexed_path(line, col)
    else
      if ident_start_byte?(byte)
        scan_identifier(line, col)
      elsif digit_byte?(byte)
        scan_date_or_number(line, col)
      else
        # Take the full character, not just its first byte, so the scanner
        # never ends up in the middle of a multi-byte sequence.
        bad = s.getch
        @col += 1
        emit(TokenType::ERROR, bad, line, col)
      end
    end
  end
end
214
+
215
# Scan what follows an "=" on the same line: optional modifiers, one value,
# then an optional directive and an optional trailing comment.
#
# Modifiers are any run of "!", "*" and "-", each emitted as a MODIFIER token
# and optionally followed by blanks. The value is chosen from its first byte.
# Nothing is emitted for the value when the line (or the input) ends first.
# A leading ";" is a comment and ends the value side immediately.
#
# A character that cannot start a value becomes a single ERROR token holding
# that whole character (consumed with StringScanner#getch, so a multi-byte
# character is never split), and scanning of the value side stops there.
def scan_value_side
  s = @scanner

  # Parse modifiers after =
  until s.eos?
    marker = case s.string.getbyte(s.pos)
             when 33 then "!"
             when 42 then "*"
             when 45 then "-"
             end
    break unless marker

    line = @line
    col = @col
    s.pos += 1
    @col += 1
    emit(TokenType::MODIFIER, marker, line, col)
    if (ws = s.scan(RE_WHITESPACE))
      @col += ws.length
    end
  end

  # Now scan the actual value
  return if s.eos?

  byte = s.string.getbyte(s.pos)
  return if byte == 10 || byte == 13

  line = @line
  col = @col

  case byte
  when 35 # #
    scan_number_prefix(line, col)
  when 34 # "
    scan_string(line, col)
  when 63 # ? must be followed by true or false
    s.pos += 1
    @col += 1
    word = s.scan(RE_WORD) || ""
    @col += word.length
    if word == "true" || word == "false"
      emit(TokenType::BOOLEAN, word, line, col)
    else
      emit(TokenType::ERROR, "Invalid boolean: ?#{word}", line, col)
    end
  when 126 # ~
    s.pos += 1
    @col += 1
    emit(TokenType::NULL, "~", line, col)
  when 64 # @
    scan_reference(line, col)
  when 94 # ^
    scan_binary(line, col)
  when 37 # %
    scan_verb(line, col)
  when 59 # ; comment in place of a value
    s.pos += 1
    @col += 1
    text = s.scan(RE_COMMENT_CONTENT) || ""
    emit(TokenType::COMMENT, text.strip, line, col)
    @col += text.length
    return
  else
    if digit_byte?(byte)
      scan_date_or_number(line, col)
    elsif byte == 116 || byte == 102 # t, f
      scan_bare_boolean_or_identifier(line, col)
    elsif byte == 80 # P
      scan_possible_duration(line, col)
    elsif byte == 84 # T
      scan_possible_time(line, col)
    elsif ident_start_byte?(byte)
      scan_bare_string_value(line, col)
    else
      # Take the full character, not just its first byte.
      bad = s.getch
      @col += 1
      emit(TokenType::ERROR, bad, line, col)
      return
    end
  end

  # After value, check for directives and comments
  if (ws = s.scan(RE_WHITESPACE))
    @col += ws.length
  end
  return if s.eos?

  if s.string.getbyte(s.pos) == 58 # :
    scan_directive(@line, @col)
    if (ws = s.scan(RE_WHITESPACE))
      @col += ws.length
    end
  end

  return if s.eos?
  return unless s.string.getbyte(s.pos) == 59 # ;

  comment_line = @line
  comment_col = @col
  s.pos += 1
  @col += 1
  text = s.scan(RE_COMMENT_CONTENT) || ""
  emit(TokenType::COMMENT, text.strip, comment_line, comment_col)
  @col += text.length
end
320
+
321
# Scan a "{ ... }" header starting at the opening brace.
#
# Emits HEADER_OPEN, then a PATH token with the stripped header content
# (skipped when the content is blank), then HEADER_CLOSE. When no closing
# brace follows on the same line, an "Unterminated header" ERROR is emitted
# at the position of the opening brace.
#
# line, col - position of the opening brace.
def scan_header(line, col)
  scanner = @scanner
  scanner.pos += 1 # consume {
  @col += 1
  emit(TokenType::HEADER_OPEN, "{", line, col)

  if (blanks = scanner.scan(RE_WHITESPACE))
    @col += blanks.length
  end

  # Emits HEADER_CLOSE when the next byte is "}" and reports whether it did.
  close_header = lambda do
    next false if scanner.eos? || scanner.string.getbyte(scanner.pos) != 125

    close_line = @line
    close_col = @col
    scanner.pos += 1
    @col += 1
    emit(TokenType::HEADER_CLOSE, "}", close_line, close_col)
    true
  end

  return if close_header.call # empty header

  path_line = @line
  path_col = @col
  content = scanner.scan(RE_HEADER_CONTENT) || ""
  @col += content.length
  name = content.strip
  emit(TokenType::PATH, name, path_line, path_col) unless name.empty?

  emit(TokenType::ERROR, "Unterminated header", line, col) unless close_header.call
end
350
+
351
# Scan a "#"-prefixed numeric literal.
#
# The byte after the first "#" selects the token type:
#   ##  -> INTEGER    #$ -> CURRENCY    #% -> PERCENT    anything else -> NUMBER
# The digits are then read with scan_currency_value (currency) or
# scan_numeric_value (all others). When nothing follows the prefix, or no
# digits can be read, an "Invalid numeric format" ERROR is emitted instead.
#
# line, col - position of the first "#".
def scan_number_prefix(line, col)
  s = @scanner
  s.pos += 1 # skip first #
  @col += 1

  if s.eos?
    emit(TokenType::ERROR, "Invalid numeric format", line, col)
    return
  end

  type, reader =
    case s.string.getbyte(s.pos)
    when 35 then [TokenType::INTEGER, :scan_numeric_value]   # ##
    when 36 then [TokenType::CURRENCY, :scan_currency_value] # #$
    when 37 then [TokenType::PERCENT, :scan_numeric_value]   # #%
    end

  if type
    # The second prefix character belongs to the marker, not the digits.
    s.pos += 1
    @col += 1
  else
    type = TokenType::NUMBER
    reader = :scan_numeric_value
  end

  digits = send(reader)
  if digits.empty?
    emit(TokenType::ERROR, "Invalid numeric format", line, col)
  else
    emit(type, digits, line, col)
  end
end
395
+
396
# Read a numeric body (RE_NUMERIC) at the scanner position.
#
# Returns the matched text and advances the column by its length, or returns
# "" and leaves the position untouched when nothing matches.
def scan_numeric_value
  digits = @scanner.scan(RE_NUMERIC)
  return "" unless digits

  @col += digits.length
  digits
end
401
+
402
# Read a currency body (RE_CURRENCY_VAL) at the scanner position.
#
# Returns the matched text and advances the column by its length, or returns
# "" and leaves the position untouched when nothing matches.
def scan_currency_value
  amount = @scanner.scan(RE_CURRENCY_VAL)
  return "" unless amount

  @col += amount.length
  amount
end
407
+
408
# Scan a double-quoted string starting at the opening quote.
#
# Three quotes in a row hand over to scan_multiline_string. Otherwise the
# string must close on the same line. Recognised escapes are \n \t \r \" \\
# \0 \/ plus \uXXXX and \UXXXXXXXX (decoded by scan_unicode_escape).
#
# Emits one STRING token with the unescaped text, or an ERROR token for an
# unterminated string, an escape cut off by the end of input, or an unknown
# escape character.
#
# line, col - position of the opening quote, used for every emitted token.
def scan_string(line, col)
  s = @scanner
  s.pos += 1 # skip opening "
  @col += 1

  # Two more quotes right here make it a """ multi-line string. getbyte
  # returns nil past the end, so no separate bounds check is needed.
  if s.string.getbyte(s.pos) == 34 && s.string.getbyte(s.pos + 1) == 34
    s.pos += 2
    @col += 2
    scan_multiline_string(line, col)
    return
  end

  text = +""
  until s.eos?
    # Take everything up to the next quote, backslash or line break at once.
    if (plain = s.scan(/[^"\\\r\n]+/))
      text << plain
      @col += plain.length
    end
    break if s.eos?

    case s.string.getbyte(s.pos)
    when 34 # closing "
      s.pos += 1
      @col += 1
      emit(TokenType::STRING, text, line, col)
      return
    when 10, 13 # line break before the closing quote
      emit(TokenType::ERROR, "Unterminated string", line, col)
      return
    when 92 # backslash
      s.pos += 1
      @col += 1
      if s.eos?
        emit(TokenType::ERROR, "Unterminated escape sequence", line, col)
        return
      end

      code = s.string.getbyte(s.pos)
      replacement = case code
                    when 110 then "\n" # n
                    when 116 then "\t" # t
                    when 114 then "\r" # r
                    when 34 then '"'
                    when 92 then '\\'
                    when 48 then "\0"  # 0
                    when 47 then '/'
                    end

      if replacement
        text << replacement
        s.pos += 1
        @col += 1
      elsif code == 117 || code == 85 # u takes 4 hex digits, U takes 8
        s.pos += 1
        @col += 1
        text << scan_unicode_escape(line, col, code == 117 ? 4 : 8)
      else
        # Read the actual character (may be multi-byte)
        bad = s.scan(/./) || "?"
        @col += 1
        emit(TokenType::ERROR, "Invalid escape: \\#{bad}", line, col)
        return
      end
    end
  end

  emit(TokenType::ERROR, "Unterminated string", line, col)
end
472
+
473
# Scan the body of a """ multi-line string; the opening """ has already
# been consumed by the caller.
#
# One line break directly after the opening quotes is dropped. Inside the
# body every line break (\n, \r or \r\n) is stored as "\n", and a quote that
# is not part of a closing """ is kept as a literal quote. No escape
# processing is done here.
#
# Emits a STRING token when the closing """ is found, otherwise an
# "Unterminated multi-line string" ERROR at the end of input.
#
# line, col - position of the opening quotes, used for the emitted token.
def scan_multiline_string(line, col)
  s = @scanner

  # Skip initial newline after opening """
  unless s.eos?
    byte = s.string.getbyte(s.pos)
    if byte == 10
      s.pos += 1
      @line += 1
      @col = 1
    elsif byte == 13
      s.pos += 1
      @line += 1
      @col = 1
      s.pos += 1 if !s.eos? && s.string.getbyte(s.pos) == 10
    end
  end

  result = +""
  until s.eos?
    # Check for closing """
    if s.string.getbyte(s.pos) == 34 &&
       s.pos + 2 < s.string.bytesize &&
       s.string.getbyte(s.pos + 1) == 34 &&
       s.string.getbyte(s.pos + 2) == 34
      s.pos += 3
      @col += 3
      emit(TokenType::STRING, result, line, col)
      return
    end

    byte = s.string.getbyte(s.pos)
    if byte == 13 # \r
      result << "\n"
      s.pos += 1
      @line += 1
      @col = 1
      s.pos += 1 if !s.eos? && s.string.getbyte(s.pos) == 10
    elsif byte == 10 # \n
      result << "\n"
      s.pos += 1
      @line += 1
      @col = 1
    elsif (chunk = s.scan(/[^"\r\n]+/))
      # Scan non-special chars in bulk
      result << chunk
      @col += chunk.length
    else
      # The bulk scan only stops dead on a quote, and this one is not part of
      # a closing """. Append the quote itself: the scanner position is a
      # byte offset and must not be used as a character index into the
      # source, which goes wrong as soon as multi-byte text precedes it.
      result << '"'
      s.pos += 1
      @col += 1
    end
  end

  emit(TokenType::ERROR, "Unterminated multi-line string", line, col)
end
526
+
527
# Decode the hex digits of a \u or \U escape; the backslash and the u/U have
# already been consumed by the caller.
#
# A high surrogate (D800..DBFF) must be followed by a \uXXXX low surrogate
# (DC00..DFFF); the pair is combined into one code point.
#
# Returns the decoded character as a String. On malformed input an ERROR
# token is emitted and "" is returned. That covers too few or non-hex digits,
# a missing or invalid low surrogate, and a value above U+10FFFF, which
# cannot be encoded (Array#pack("U") either raises RangeError or produces an
# invalid string for such values).
#
# line, col  - position reported on ERROR tokens.
# num_digits - number of hex digits to read (4 for \u, 8 for \U).
def scan_unicode_escape(line, col, num_digits)
  s = @scanner

  # Byte-wise check: peek may cut a multi-byte character in half, and such a
  # fragment should be rejected here, never handed to a regex.
  hex_run = lambda do |str, count|
    str.bytesize == count && str.each_byte.all? do |b|
      (b >= 48 && b <= 57) || (b >= 65 && b <= 70) || (b >= 97 && b <= 102)
    end
  end

  hex = s.peek(num_digits)
  unless hex_run.call(hex, num_digits)
    emit(TokenType::ERROR, "Invalid unicode escape", line, col)
    return ""
  end
  s.pos += num_digits
  @col += num_digits
  codepoint = hex.to_i(16)

  # Check for surrogate pair
  if codepoint >= 0xD800 && codepoint <= 0xDBFF
    unless s.string.getbyte(s.pos) == 92 && s.string.getbyte(s.pos + 1) == 117 # \u
      emit(TokenType::ERROR, "Expected low surrogate", line, col)
      return ""
    end
    s.pos += 2
    @col += 2

    low_hex = s.peek(4)
    unless hex_run.call(low_hex, 4)
      emit(TokenType::ERROR, "Invalid surrogate pair", line, col)
      return ""
    end
    s.pos += 4
    @col += 4

    low = low_hex.to_i(16)
    unless low >= 0xDC00 && low <= 0xDFFF
      emit(TokenType::ERROR, "Invalid low surrogate", line, col)
      return ""
    end
    codepoint = 0x10000 + ((codepoint - 0xD800) << 10) + (low - 0xDC00)
  end

  # Only reachable with 8-digit escapes: nothing above U+10FFFF is a valid
  # code point.
  if codepoint > 0x10FFFF
    emit(TokenType::ERROR, "Invalid unicode escape", line, col)
    return ""
  end

  [codepoint].pack("U")
end
563
+
564
# Scan an "@" reference starting at the "@".
#
# "@#" is rejected with an ERROR token whose value is "@#". Otherwise the
# path after the "@" is read (it may be empty), purely numeric array indices
# are normalised so leading zeros disappear ([007] becomes [7]), and a
# REFERENCE token is emitted.
#
# line, col - position of the "@".
def scan_reference(line, col)
  s = @scanner
  s.pos += 1 # skip @
  @col += 1

  if s.string.getbyte(s.pos) == 35 # @#
    s.pos += 1
    @col += 1
    emit(TokenType::ERROR, "@#", line, col)
    return
  end

  raw = s.scan(RE_REF_PATH) || ""
  @col += raw.length
  normalized = raw.gsub(/\[(\d+)\]/) { "[#{Regexp.last_match(1).to_i}]" }
  emit(TokenType::REFERENCE, normalized, line, col)
end
580
+
581
# Scan a "^" binary literal starting at the "^".
#
# Everything up to the next whitespace or ";" is taken as the payload (it may
# be empty) and emitted as a BINARY token.
#
# line, col - position of the "^".
def scan_binary(line, col)
  scanner = @scanner
  scanner.pos += 1 # skip ^
  @col += 1

  payload = scanner.scan(RE_BINARY_DATA) || ""
  @col += payload.length
  emit(TokenType::BINARY, payload, line, col)
end
588
+
589
# Scan a "%" verb starting at the "%".
#
# The verb name is an optional leading "&" followed by word characters. A
# "%" followed directly by whitespace, a line break or the end of input gives
# an "Empty verb name" ERROR; a "%" followed by something that yields no name
# gives an "Invalid verb" ERROR. Otherwise a VERB token is emitted and the
# verb's arguments are scanned.
#
# line, col - position of the "%".
def scan_verb(line, col)
  s = @scanner
  s.pos += 1 # skip %
  @col += 1

  lead = s.string.getbyte(s.pos) # nil at the end of input
  if lead.nil? || [32, 9, 10, 13].include?(lead)
    emit(TokenType::ERROR, "Empty verb name", line, col)
    return
  end

  name = +""
  if lead == 38 # &
    name << "&"
    s.pos += 1
    @col += 1
  end

  word = s.scan(/[a-zA-Z0-9_.\-]+/) || ""
  name << word
  @col += word.length

  if name.empty?
    emit(TokenType::ERROR, "Invalid verb", line, col)
    return
  end

  emit(TokenType::VERB, name, line, col)
  scan_verb_arguments
end
617
+
618
+ def scan_verb_arguments
619
+ s = @scanner
620
+ until s.eos?
621
+ if (ws = s.scan(RE_WHITESPACE)) then @col += ws.length end
622
+ break if s.eos?
623
+
624
+ byte = s.string.getbyte(s.pos)
625
+ break if byte == 10 || byte == 13 || byte == 59 || byte == 58 # \n \r ; :
626
+
627
+ line = @line
628
+ col = @col
629
+
630
+ case byte
631
+ when 34 then scan_string(line, col) # "
632
+ when 35 then scan_number_prefix(line, col) # #
633
+ when 63 # ?
634
+ s.pos += 1; @col += 1
635
+ word = s.scan(RE_WORD) || ""
636
+ @col += word.length
637
+ if word == "true" || word == "false"
638
+ emit(TokenType::BOOLEAN, word, line, col)
639
+ else
640
+ emit(TokenType::ERROR, "Invalid boolean: ?#{word}", line, col)
641
+ end
642
+ when 126 # ~
643
+ s.pos += 1; @col += 1
644
+ emit(TokenType::NULL, "~", line, col)
645
+ when 64 then scan_reference(line, col) # @
646
+ when 94 then scan_binary(line, col) # ^
647
+ when 37 then scan_verb(line, col) # %
648
+ when 124 # |
649
+ s.pos += 1; @col += 1
650
+ emit(TokenType::PIPE, "|", line, col)
651
+ else
652
+ if digit_byte?(byte)
653
+ scan_date_or_number(line, col)
654
+ elsif byte == 116 || byte == 102 # t, f
655
+ scan_bare_boolean_or_identifier(line, col)
656
+ elsif byte == 80 # P
657
+ scan_possible_duration(line, col)
658
+ elsif byte == 84 # T
659
+ scan_possible_time(line, col)
660
+ elsif ident_start_byte?(byte)
661
+ scan_bare_string_value(line, col)
662
+ else
663
+ break
664
+ end
665
+ end
666
+ end
667
+ end
668
+
669
# Scan a ":" directive starting at the colon.
#
# Emits a DIRECTIVE token with the directive name, or an "Empty directive"
# ERROR when no name follows the colon. After the name, blanks are skipped
# and a double-quoted string that follows on the same line is scanned as the
# directive's value.
#
# line, col - position of the colon.
def scan_directive(line, col)
  s = @scanner
  s.pos += 1 # skip :
  @col += 1

  name = s.scan(RE_WORD) || ""
  @col += name.length
  if name.empty?
    emit(TokenType::ERROR, "Empty directive", line, col)
    return
  end
  emit(TokenType::DIRECTIVE, name, line, col)

  if (gap = s.scan(RE_WHITESPACE))
    @col += gap.length
  end

  # Only an opening quote starts a directive value. End of input (getbyte
  # returns nil), a line break or ";" all simply end the directive.
  scan_string(@line, @col) if s.string.getbyte(s.pos) == 34
end
691
+
692
# Scan a path that starts with an array index, such as "[0].name[2]".
#
# Reads the leading "[...]" and then any mix of word characters, dots and
# further "[...]" groups, and emits the whole text as one PATH token.
#
# When the opening "[" has no matching "]", nothing can be read. In that
# case the "[" is consumed and an "Unterminated array index" ERROR is
# emitted. Consuming it matters: leaving the scanner where it was would make
# the caller's loop see the same "[" again and never finish.
#
# line, col - position of the opening "[".
def scan_array_indexed_path(line, col)
  s = @scanner
  word = +""

  # Read [index]
  if (idx = s.scan(RE_ARRAY_INDEX))
    word << idx
    @col += idx.length
  end

  # Continue with identifier chars, dots, and more brackets
  loop do
    if (chunk = s.scan(/[a-zA-Z0-9_.\-]+/))
      word << chunk
      @col += chunk.length
    elsif (idx = s.scan(RE_ARRAY_INDEX))
      word << idx
      @col += idx.length
    else
      break
    end
  end

  if word.empty?
    # Always make progress, even on malformed input.
    s.getch
    @col += 1
    emit(TokenType::ERROR, "Unterminated array index", line, col)
    return
  end

  emit(TokenType::PATH, word, line, col)
end
714
+
715
# Scan a path/identifier and emit it as a PATH token.
#
# An optional leading "." or "&" is accepted, followed by any mix of word
# characters (letters, digits, "_", ".", "-"), "[...]" index groups and "&".
#
# The leading character is appended from the byte that was just inspected.
# The scanner position is a byte offset and must not be used as a character
# index into the source, which picks the wrong character as soon as
# multi-byte text precedes it.
#
# line, col - position of the first character.
def scan_identifier(line, col)
  s = @scanner
  word = +""

  # Allow leading dot or &
  byte = s.string.getbyte(s.pos)
  if byte == 46 || byte == 38 # . or &
    word << (byte == 46 ? "." : "&")
    s.pos += 1
    @col += 1
  end

  # Scan identifier body with dots and brackets
  loop do
    if (chunk = s.scan(/[a-zA-Z0-9_.\-]+/))
      word << chunk
      @col += chunk.length
    elsif (idx = s.scan(RE_ARRAY_INDEX))
      word << idx
      @col += idx.length
    elsif !s.eos? && s.string.getbyte(s.pos) == 38 # &
      word << "&"
      s.pos += 1
      @col += 1
    else
      break
    end
  end

  emit(TokenType::PATH, word, line, col)
end
744
+
745
# Scan a word that starts with "t" or "f".
#
# Exactly "true" or "false" becomes a BOOLEAN token. Any other word becomes a
# STRING token marked raw: "bare"; only that single word is taken.
#
# line, col - position of the first character.
def scan_bare_boolean_or_identifier(line, col)
  word = @scanner.scan(RE_WORD) || ""
  @col += word.length

  if %w[true false].include?(word)
    emit(TokenType::BOOLEAN, word, line, col)
  else
    emit(TokenType::STRING, word, line, col, raw: "bare")
  end
end
757
+
758
# Scan a value that starts with "P": a duration if it looks like one,
# otherwise a bare string.
#
# The text counts as a duration when it matches RE_DURATION and contains at
# least one digit; it is then emitted as a DURATION token. In every other
# case the scanner is put back where it started and the value is read by
# scan_bare_string_value.
#
# line, col - position of the "P".
def scan_possible_duration(line, col)
  s = @scanner
  start = s.pos

  candidate = s.scan(RE_DURATION)
  if candidate && candidate.length > 1 && candidate.match?(/[0-9]/)
    @col += candidate.length
    emit(TokenType::DURATION, candidate, line, col)
  else
    # Line and column were not touched yet; only the position needs undoing.
    s.pos = start
    scan_bare_string_value(line, col)
  end
end
775
+
776
# Scan a value that starts with "T": a time if it looks like one, otherwise
# a bare string.
#
# Text matching RE_TIME_VAL is emitted as a TIME token. When it does not
# match, the scanner is put back where it started and the value is read by
# scan_bare_string_value.
#
# line, col - position of the "T".
def scan_possible_time(line, col)
  s = @scanner
  start = s.pos

  candidate = s.scan(RE_TIME_VAL)
  if candidate && candidate.length > 1
    @col += candidate.length
    emit(TokenType::TIME, candidate, line, col)
  else
    # Line and column were not touched yet; only the position needs undoing.
    s.pos = start
    scan_bare_string_value(line, col)
  end
end
793
+
794
# Scan a value that starts with a digit and classify it.
#
# The scanned text becomes a TIMESTAMP when it begins with "YYYY-MM-DDT", a
# DATE when it is exactly "YYYY-MM-DD", and a NUMBER otherwise.
#
# line, col - position of the first digit.
def scan_date_or_number(line, col)
  text = @scanner.scan(RE_DATE_OR_NUM) || ""
  @col += text.length

  type = if text.match?(RE_DATE_PREFIX)
           TokenType::TIMESTAMP
         elsif text.match?(RE_DATE_EXACT)
           TokenType::DATE
         else
           TokenType::NUMBER
         end
  emit(type, text, line, col)
end
807
+
808
# Scan an unquoted value: everything up to the next whitespace, ";" or ":".
#
# The text is emitted as a STRING token marked raw: "bare". When nothing
# matches, the emitted value is the empty string.
#
# line, col - position of the first character.
def scan_bare_string_value(line, col)
  text = @scanner.scan(RE_BARE_VALUE) || ""
  @col += text.length
  emit(TokenType::STRING, text, line, col, raw: "bare")
end
814
+
815
+ # Byte classification helpers (no allocation)
816
# True when byte value b can start an identifier: "_", A-Z or a-z.
def ident_start_byte?(b)
  b == 95 || b.between?(65, 90) || b.between?(97, 122)
end
819
+
820
# True when byte value b is an ASCII digit (0-9).
def digit_byte?(b)
  b.between?(48, 57)
end
823
+ end
824
+ end
825
+ end