dms-parser 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/dms/parser.rb ADDED
@@ -0,0 +1,3007 @@
1
+ # frozen_string_literal: true
2
+
3
+ # DMS parser - port of the Python/Rust reference, optimized for Ruby.
4
+ #
5
+ # Hand-written recursive-descent + stateful lexer. Errors carry
6
+ # (line, column, message). Tables are insertion-ordered Hashes.
7
+ #
8
+ # Hot-path strategy: positions are *byte* offsets into a UTF-8 source.
9
+ # Inline scanners use String#getbyte (returns Integer or nil) which is
10
+ # allocation-free, vs. String#[] which copies a 1-char String per call.
11
+ # DMS structural characters (':', '+', '-', '\n', digits, ASCII keys)
12
+ # are all ASCII, so byte-level checks are exact for the common cases.
13
+ # Multi-byte content only matters in string bodies (handled via
14
+ # byteslice + force_encoding) and for non-ASCII bare-key chars (label-
15
+ # class check has a UTF-8 multi-byte fallback).
16
+
17
+ module Dms
18
+ class Parser
19
+ # ---------- Byte constants (faster than ?x.ord at call site) ----------
20
# Whitespace and line terminators.
SP  = 0x20
TAB = 0x09
LF  = 0x0A
CR  = 0x0D

# Punctuation.
HASH       = 0x23 # '#'
SLASH      = 0x2F # '/'
STAR       = 0x2A # '*'
BANG       = 0x21 # '!' (unused)
PLUS       = 0x2B # '+'
MINUS      = 0x2D # '-'
COLON      = 0x3A # ':'
COMMA      = 0x2C # ','
LBRACK     = 0x5B # '['
RBRACK     = 0x5D # ']'
LBRACE     = 0x7B # '{'
RBRACE     = 0x7D # '}'
DOT        = 0x2E # '.'
UNDERSCORE = 0x5F # '_'
BACKSLASH  = 0x5C # '\\'
DQUOTE     = 0x22 # '"'
SQUOTE     = 0x27 # "'"

# Range endpoints for digits, letters and hex letters.
DIGIT0  = 0x30
DIGIT9  = 0x39
LOWER_A = 0x61
LOWER_F = 0x66
LOWER_Z = 0x7A
UPPER_A = 0x41
UPPER_F = 0x46
UPPER_Z = 0x5A

# Individual letters.
LOWER_X = 0x78
LOWER_O = 0x6F
LOWER_B = 0x62
LOWER_E = 0x65
UPPER_E = 0x45
LOWER_P = 0x70
LOWER_T = 0x74
LOWER_N = 0x6E
LOWER_U = 0x75
UPPER_U = 0x55
UPPER_T = 0x54
# The *_LETTER names carry the same values as UPPER_Z / LOWER_Z / LOWER_F.
UPPER_Z_LETTER = 0x5A
LOWER_Z_LETTER = 0x7A
LOWER_R        = 0x72
LOWER_F_LETTER = 0x66

HEX_DIGITS = "0123456789abcdefABCDEF"

BUILT_IN_HEREDOC_MODIFIERS = %w[_trim _fold_paragraphs].freeze

# Bare-key fast lookup: 256 entries indexed by byte value; an entry is true
# iff that byte is an ASCII bare-key char (alnum, '_', '-').
BARE_KEY_BYTE = Array.new(256) do |b|
  b.between?(DIGIT0, DIGIT9) ||
    b.between?(LOWER_A, LOWER_Z) ||
    b.between?(UPPER_A, UPPER_Z) ||
    b == UNDERSCORE ||
    b == MINUS
end.freeze

# Reserved decorator sigils (SPEC tier-0): a body line whose first
# non-whitespace byte is one of these is a parse error. The check is
# line-start only; these chars are unrestricted inside string bodies,
# comments, and heredoc bodies. Underscore is intentionally NOT in
# this set (it remains a valid identifier-leading byte).
RESERVED_SIGIL_BYTE =
  '!@$%^&*|~`.,><?;='.each_byte.with_object(Array.new(256, false)) do |sigil, table|
    table[sigil] = true
  end.freeze
90
+
91
+ # XID_Continue snapshot, frozen at Unicode 15.1 per UAX #31 §2 default
92
+ # identifier syntax (XID_Continue \ Default_Ignorable_Code_Point). Embedded
93
+ # so accept/reject is identical across Ruby/Onigmo Unicode-data versions.
94
+ # 773 sorted, non-overlapping ranges; binary-searched by #xid_continue?.
95
+ XID_CONTINUE_RANGES = [
96
+ [0x00AA, 0x00AA],
97
+ [0x00B5, 0x00B5],
98
+ [0x00B7, 0x00B7],
99
+ [0x00BA, 0x00BA],
100
+ [0x00C0, 0x00D6],
101
+ [0x00D8, 0x00F6],
102
+ [0x00F8, 0x02C1],
103
+ [0x02C6, 0x02D1],
104
+ [0x02E0, 0x02E4],
105
+ [0x02EC, 0x02EC],
106
+ [0x02EE, 0x02EE],
107
+ [0x0300, 0x034E],
108
+ [0x0350, 0x0374],
109
+ [0x0376, 0x0377],
110
+ [0x037B, 0x037D],
111
+ [0x037F, 0x037F],
112
+ [0x0386, 0x038A],
113
+ [0x038C, 0x038C],
114
+ [0x038E, 0x03A1],
115
+ [0x03A3, 0x03F5],
116
+ [0x03F7, 0x0481],
117
+ [0x0483, 0x0487],
118
+ [0x048A, 0x052F],
119
+ [0x0531, 0x0556],
120
+ [0x0559, 0x0559],
121
+ [0x0560, 0x0588],
122
+ [0x0591, 0x05BD],
123
+ [0x05BF, 0x05BF],
124
+ [0x05C1, 0x05C2],
125
+ [0x05C4, 0x05C5],
126
+ [0x05C7, 0x05C7],
127
+ [0x05D0, 0x05EA],
128
+ [0x05EF, 0x05F2],
129
+ [0x0610, 0x061A],
130
+ [0x0620, 0x0669],
131
+ [0x066E, 0x06D3],
132
+ [0x06D5, 0x06DC],
133
+ [0x06DF, 0x06E8],
134
+ [0x06EA, 0x06FC],
135
+ [0x06FF, 0x06FF],
136
+ [0x0710, 0x074A],
137
+ [0x074D, 0x07B1],
138
+ [0x07C0, 0x07F5],
139
+ [0x07FA, 0x07FA],
140
+ [0x07FD, 0x07FD],
141
+ [0x0800, 0x082D],
142
+ [0x0840, 0x085B],
143
+ [0x0860, 0x086A],
144
+ [0x0870, 0x0887],
145
+ [0x0889, 0x088E],
146
+ [0x0898, 0x08E1],
147
+ [0x08E3, 0x0963],
148
+ [0x0966, 0x096F],
149
+ [0x0971, 0x0983],
150
+ [0x0985, 0x098C],
151
+ [0x098F, 0x0990],
152
+ [0x0993, 0x09A8],
153
+ [0x09AA, 0x09B0],
154
+ [0x09B2, 0x09B2],
155
+ [0x09B6, 0x09B9],
156
+ [0x09BC, 0x09C4],
157
+ [0x09C7, 0x09C8],
158
+ [0x09CB, 0x09CE],
159
+ [0x09D7, 0x09D7],
160
+ [0x09DC, 0x09DD],
161
+ [0x09DF, 0x09E3],
162
+ [0x09E6, 0x09F1],
163
+ [0x09FC, 0x09FC],
164
+ [0x09FE, 0x09FE],
165
+ [0x0A01, 0x0A03],
166
+ [0x0A05, 0x0A0A],
167
+ [0x0A0F, 0x0A10],
168
+ [0x0A13, 0x0A28],
169
+ [0x0A2A, 0x0A30],
170
+ [0x0A32, 0x0A33],
171
+ [0x0A35, 0x0A36],
172
+ [0x0A38, 0x0A39],
173
+ [0x0A3C, 0x0A3C],
174
+ [0x0A3E, 0x0A42],
175
+ [0x0A47, 0x0A48],
176
+ [0x0A4B, 0x0A4D],
177
+ [0x0A51, 0x0A51],
178
+ [0x0A59, 0x0A5C],
179
+ [0x0A5E, 0x0A5E],
180
+ [0x0A66, 0x0A75],
181
+ [0x0A81, 0x0A83],
182
+ [0x0A85, 0x0A8D],
183
+ [0x0A8F, 0x0A91],
184
+ [0x0A93, 0x0AA8],
185
+ [0x0AAA, 0x0AB0],
186
+ [0x0AB2, 0x0AB3],
187
+ [0x0AB5, 0x0AB9],
188
+ [0x0ABC, 0x0AC5],
189
+ [0x0AC7, 0x0AC9],
190
+ [0x0ACB, 0x0ACD],
191
+ [0x0AD0, 0x0AD0],
192
+ [0x0AE0, 0x0AE3],
193
+ [0x0AE6, 0x0AEF],
194
+ [0x0AF9, 0x0AFF],
195
+ [0x0B01, 0x0B03],
196
+ [0x0B05, 0x0B0C],
197
+ [0x0B0F, 0x0B10],
198
+ [0x0B13, 0x0B28],
199
+ [0x0B2A, 0x0B30],
200
+ [0x0B32, 0x0B33],
201
+ [0x0B35, 0x0B39],
202
+ [0x0B3C, 0x0B44],
203
+ [0x0B47, 0x0B48],
204
+ [0x0B4B, 0x0B4D],
205
+ [0x0B55, 0x0B57],
206
+ [0x0B5C, 0x0B5D],
207
+ [0x0B5F, 0x0B63],
208
+ [0x0B66, 0x0B6F],
209
+ [0x0B71, 0x0B71],
210
+ [0x0B82, 0x0B83],
211
+ [0x0B85, 0x0B8A],
212
+ [0x0B8E, 0x0B90],
213
+ [0x0B92, 0x0B95],
214
+ [0x0B99, 0x0B9A],
215
+ [0x0B9C, 0x0B9C],
216
+ [0x0B9E, 0x0B9F],
217
+ [0x0BA3, 0x0BA4],
218
+ [0x0BA8, 0x0BAA],
219
+ [0x0BAE, 0x0BB9],
220
+ [0x0BBE, 0x0BC2],
221
+ [0x0BC6, 0x0BC8],
222
+ [0x0BCA, 0x0BCD],
223
+ [0x0BD0, 0x0BD0],
224
+ [0x0BD7, 0x0BD7],
225
+ [0x0BE6, 0x0BEF],
226
+ [0x0C00, 0x0C0C],
227
+ [0x0C0E, 0x0C10],
228
+ [0x0C12, 0x0C28],
229
+ [0x0C2A, 0x0C39],
230
+ [0x0C3C, 0x0C44],
231
+ [0x0C46, 0x0C48],
232
+ [0x0C4A, 0x0C4D],
233
+ [0x0C55, 0x0C56],
234
+ [0x0C58, 0x0C5A],
235
+ [0x0C5D, 0x0C5D],
236
+ [0x0C60, 0x0C63],
237
+ [0x0C66, 0x0C6F],
238
+ [0x0C80, 0x0C83],
239
+ [0x0C85, 0x0C8C],
240
+ [0x0C8E, 0x0C90],
241
+ [0x0C92, 0x0CA8],
242
+ [0x0CAA, 0x0CB3],
243
+ [0x0CB5, 0x0CB9],
244
+ [0x0CBC, 0x0CC4],
245
+ [0x0CC6, 0x0CC8],
246
+ [0x0CCA, 0x0CCD],
247
+ [0x0CD5, 0x0CD6],
248
+ [0x0CDD, 0x0CDE],
249
+ [0x0CE0, 0x0CE3],
250
+ [0x0CE6, 0x0CEF],
251
+ [0x0CF1, 0x0CF3],
252
+ [0x0D00, 0x0D0C],
253
+ [0x0D0E, 0x0D10],
254
+ [0x0D12, 0x0D44],
255
+ [0x0D46, 0x0D48],
256
+ [0x0D4A, 0x0D4E],
257
+ [0x0D54, 0x0D57],
258
+ [0x0D5F, 0x0D63],
259
+ [0x0D66, 0x0D6F],
260
+ [0x0D7A, 0x0D7F],
261
+ [0x0D81, 0x0D83],
262
+ [0x0D85, 0x0D96],
263
+ [0x0D9A, 0x0DB1],
264
+ [0x0DB3, 0x0DBB],
265
+ [0x0DBD, 0x0DBD],
266
+ [0x0DC0, 0x0DC6],
267
+ [0x0DCA, 0x0DCA],
268
+ [0x0DCF, 0x0DD4],
269
+ [0x0DD6, 0x0DD6],
270
+ [0x0DD8, 0x0DDF],
271
+ [0x0DE6, 0x0DEF],
272
+ [0x0DF2, 0x0DF3],
273
+ [0x0E01, 0x0E3A],
274
+ [0x0E40, 0x0E4E],
275
+ [0x0E50, 0x0E59],
276
+ [0x0E81, 0x0E82],
277
+ [0x0E84, 0x0E84],
278
+ [0x0E86, 0x0E8A],
279
+ [0x0E8C, 0x0EA3],
280
+ [0x0EA5, 0x0EA5],
281
+ [0x0EA7, 0x0EBD],
282
+ [0x0EC0, 0x0EC4],
283
+ [0x0EC6, 0x0EC6],
284
+ [0x0EC8, 0x0ECE],
285
+ [0x0ED0, 0x0ED9],
286
+ [0x0EDC, 0x0EDF],
287
+ [0x0F00, 0x0F00],
288
+ [0x0F18, 0x0F19],
289
+ [0x0F20, 0x0F29],
290
+ [0x0F35, 0x0F35],
291
+ [0x0F37, 0x0F37],
292
+ [0x0F39, 0x0F39],
293
+ [0x0F3E, 0x0F47],
294
+ [0x0F49, 0x0F6C],
295
+ [0x0F71, 0x0F84],
296
+ [0x0F86, 0x0F97],
297
+ [0x0F99, 0x0FBC],
298
+ [0x0FC6, 0x0FC6],
299
+ [0x1000, 0x1049],
300
+ [0x1050, 0x109D],
301
+ [0x10A0, 0x10C5],
302
+ [0x10C7, 0x10C7],
303
+ [0x10CD, 0x10CD],
304
+ [0x10D0, 0x10FA],
305
+ [0x10FC, 0x115E],
306
+ [0x1161, 0x1248],
307
+ [0x124A, 0x124D],
308
+ [0x1250, 0x1256],
309
+ [0x1258, 0x1258],
310
+ [0x125A, 0x125D],
311
+ [0x1260, 0x1288],
312
+ [0x128A, 0x128D],
313
+ [0x1290, 0x12B0],
314
+ [0x12B2, 0x12B5],
315
+ [0x12B8, 0x12BE],
316
+ [0x12C0, 0x12C0],
317
+ [0x12C2, 0x12C5],
318
+ [0x12C8, 0x12D6],
319
+ [0x12D8, 0x1310],
320
+ [0x1312, 0x1315],
321
+ [0x1318, 0x135A],
322
+ [0x135D, 0x135F],
323
+ [0x1369, 0x1371],
324
+ [0x1380, 0x138F],
325
+ [0x13A0, 0x13F5],
326
+ [0x13F8, 0x13FD],
327
+ [0x1401, 0x166C],
328
+ [0x166F, 0x167F],
329
+ [0x1681, 0x169A],
330
+ [0x16A0, 0x16EA],
331
+ [0x16EE, 0x16F8],
332
+ [0x1700, 0x1715],
333
+ [0x171F, 0x1734],
334
+ [0x1740, 0x1753],
335
+ [0x1760, 0x176C],
336
+ [0x176E, 0x1770],
337
+ [0x1772, 0x1773],
338
+ [0x1780, 0x17B3],
339
+ [0x17B6, 0x17D3],
340
+ [0x17D7, 0x17D7],
341
+ [0x17DC, 0x17DD],
342
+ [0x17E0, 0x17E9],
343
+ [0x1810, 0x1819],
344
+ [0x1820, 0x1878],
345
+ [0x1880, 0x18AA],
346
+ [0x18B0, 0x18F5],
347
+ [0x1900, 0x191E],
348
+ [0x1920, 0x192B],
349
+ [0x1930, 0x193B],
350
+ [0x1946, 0x196D],
351
+ [0x1970, 0x1974],
352
+ [0x1980, 0x19AB],
353
+ [0x19B0, 0x19C9],
354
+ [0x19D0, 0x19DA],
355
+ [0x1A00, 0x1A1B],
356
+ [0x1A20, 0x1A5E],
357
+ [0x1A60, 0x1A7C],
358
+ [0x1A7F, 0x1A89],
359
+ [0x1A90, 0x1A99],
360
+ [0x1AA7, 0x1AA7],
361
+ [0x1AB0, 0x1ABD],
362
+ [0x1ABF, 0x1ACE],
363
+ [0x1B00, 0x1B4C],
364
+ [0x1B50, 0x1B59],
365
+ [0x1B6B, 0x1B73],
366
+ [0x1B80, 0x1BF3],
367
+ [0x1C00, 0x1C37],
368
+ [0x1C40, 0x1C49],
369
+ [0x1C4D, 0x1C7D],
370
+ [0x1C80, 0x1C88],
371
+ [0x1C90, 0x1CBA],
372
+ [0x1CBD, 0x1CBF],
373
+ [0x1CD0, 0x1CD2],
374
+ [0x1CD4, 0x1CFA],
375
+ [0x1D00, 0x1F15],
376
+ [0x1F18, 0x1F1D],
377
+ [0x1F20, 0x1F45],
378
+ [0x1F48, 0x1F4D],
379
+ [0x1F50, 0x1F57],
380
+ [0x1F59, 0x1F59],
381
+ [0x1F5B, 0x1F5B],
382
+ [0x1F5D, 0x1F5D],
383
+ [0x1F5F, 0x1F7D],
384
+ [0x1F80, 0x1FB4],
385
+ [0x1FB6, 0x1FBC],
386
+ [0x1FBE, 0x1FBE],
387
+ [0x1FC2, 0x1FC4],
388
+ [0x1FC6, 0x1FCC],
389
+ [0x1FD0, 0x1FD3],
390
+ [0x1FD6, 0x1FDB],
391
+ [0x1FE0, 0x1FEC],
392
+ [0x1FF2, 0x1FF4],
393
+ [0x1FF6, 0x1FFC],
394
+ [0x203F, 0x2040],
395
+ [0x2054, 0x2054],
396
+ [0x2071, 0x2071],
397
+ [0x207F, 0x207F],
398
+ [0x2090, 0x209C],
399
+ [0x20D0, 0x20DC],
400
+ [0x20E1, 0x20E1],
401
+ [0x20E5, 0x20F0],
402
+ [0x2102, 0x2102],
403
+ [0x2107, 0x2107],
404
+ [0x210A, 0x2113],
405
+ [0x2115, 0x2115],
406
+ [0x2118, 0x211D],
407
+ [0x2124, 0x2124],
408
+ [0x2126, 0x2126],
409
+ [0x2128, 0x2128],
410
+ [0x212A, 0x2139],
411
+ [0x213C, 0x213F],
412
+ [0x2145, 0x2149],
413
+ [0x214E, 0x214E],
414
+ [0x2160, 0x2188],
415
+ [0x2C00, 0x2CE4],
416
+ [0x2CEB, 0x2CF3],
417
+ [0x2D00, 0x2D25],
418
+ [0x2D27, 0x2D27],
419
+ [0x2D2D, 0x2D2D],
420
+ [0x2D30, 0x2D67],
421
+ [0x2D6F, 0x2D6F],
422
+ [0x2D7F, 0x2D96],
423
+ [0x2DA0, 0x2DA6],
424
+ [0x2DA8, 0x2DAE],
425
+ [0x2DB0, 0x2DB6],
426
+ [0x2DB8, 0x2DBE],
427
+ [0x2DC0, 0x2DC6],
428
+ [0x2DC8, 0x2DCE],
429
+ [0x2DD0, 0x2DD6],
430
+ [0x2DD8, 0x2DDE],
431
+ [0x2DE0, 0x2DFF],
432
+ [0x3005, 0x3007],
433
+ [0x3021, 0x302F],
434
+ [0x3031, 0x3035],
435
+ [0x3038, 0x303C],
436
+ [0x3041, 0x3096],
437
+ [0x3099, 0x309A],
438
+ [0x309D, 0x309F],
439
+ [0x30A1, 0x30FF],
440
+ [0x3105, 0x312F],
441
+ [0x3131, 0x3163],
442
+ [0x3165, 0x318E],
443
+ [0x31A0, 0x31BF],
444
+ [0x31F0, 0x31FF],
445
+ [0x3400, 0x4DBF],
446
+ [0x4E00, 0xA48C],
447
+ [0xA4D0, 0xA4FD],
448
+ [0xA500, 0xA60C],
449
+ [0xA610, 0xA62B],
450
+ [0xA640, 0xA66F],
451
+ [0xA674, 0xA67D],
452
+ [0xA67F, 0xA6F1],
453
+ [0xA717, 0xA71F],
454
+ [0xA722, 0xA788],
455
+ [0xA78B, 0xA7CA],
456
+ [0xA7D0, 0xA7D1],
457
+ [0xA7D3, 0xA7D3],
458
+ [0xA7D5, 0xA7D9],
459
+ [0xA7F2, 0xA827],
460
+ [0xA82C, 0xA82C],
461
+ [0xA840, 0xA873],
462
+ [0xA880, 0xA8C5],
463
+ [0xA8D0, 0xA8D9],
464
+ [0xA8E0, 0xA8F7],
465
+ [0xA8FB, 0xA8FB],
466
+ [0xA8FD, 0xA92D],
467
+ [0xA930, 0xA953],
468
+ [0xA960, 0xA97C],
469
+ [0xA980, 0xA9C0],
470
+ [0xA9CF, 0xA9D9],
471
+ [0xA9E0, 0xA9FE],
472
+ [0xAA00, 0xAA36],
473
+ [0xAA40, 0xAA4D],
474
+ [0xAA50, 0xAA59],
475
+ [0xAA60, 0xAA76],
476
+ [0xAA7A, 0xAAC2],
477
+ [0xAADB, 0xAADD],
478
+ [0xAAE0, 0xAAEF],
479
+ [0xAAF2, 0xAAF6],
480
+ [0xAB01, 0xAB06],
481
+ [0xAB09, 0xAB0E],
482
+ [0xAB11, 0xAB16],
483
+ [0xAB20, 0xAB26],
484
+ [0xAB28, 0xAB2E],
485
+ [0xAB30, 0xAB5A],
486
+ [0xAB5C, 0xAB69],
487
+ [0xAB70, 0xABEA],
488
+ [0xABEC, 0xABED],
489
+ [0xABF0, 0xABF9],
490
+ [0xAC00, 0xD7A3],
491
+ [0xD7B0, 0xD7C6],
492
+ [0xD7CB, 0xD7FB],
493
+ [0xF900, 0xFA6D],
494
+ [0xFA70, 0xFAD9],
495
+ [0xFB00, 0xFB06],
496
+ [0xFB13, 0xFB17],
497
+ [0xFB1D, 0xFB28],
498
+ [0xFB2A, 0xFB36],
499
+ [0xFB38, 0xFB3C],
500
+ [0xFB3E, 0xFB3E],
501
+ [0xFB40, 0xFB41],
502
+ [0xFB43, 0xFB44],
503
+ [0xFB46, 0xFBB1],
504
+ [0xFBD3, 0xFC5D],
505
+ [0xFC64, 0xFD3D],
506
+ [0xFD50, 0xFD8F],
507
+ [0xFD92, 0xFDC7],
508
+ [0xFDF0, 0xFDF9],
509
+ [0xFE20, 0xFE2F],
510
+ [0xFE33, 0xFE34],
511
+ [0xFE4D, 0xFE4F],
512
+ [0xFE71, 0xFE71],
513
+ [0xFE73, 0xFE73],
514
+ [0xFE77, 0xFE77],
515
+ [0xFE79, 0xFE79],
516
+ [0xFE7B, 0xFE7B],
517
+ [0xFE7D, 0xFE7D],
518
+ [0xFE7F, 0xFEFC],
519
+ [0xFF10, 0xFF19],
520
+ [0xFF21, 0xFF3A],
521
+ [0xFF3F, 0xFF3F],
522
+ [0xFF41, 0xFF5A],
523
+ [0xFF65, 0xFF9F],
524
+ [0xFFA1, 0xFFBE],
525
+ [0xFFC2, 0xFFC7],
526
+ [0xFFCA, 0xFFCF],
527
+ [0xFFD2, 0xFFD7],
528
+ [0xFFDA, 0xFFDC],
529
+ [0x10000, 0x1000B],
530
+ [0x1000D, 0x10026],
531
+ [0x10028, 0x1003A],
532
+ [0x1003C, 0x1003D],
533
+ [0x1003F, 0x1004D],
534
+ [0x10050, 0x1005D],
535
+ [0x10080, 0x100FA],
536
+ [0x10140, 0x10174],
537
+ [0x101FD, 0x101FD],
538
+ [0x10280, 0x1029C],
539
+ [0x102A0, 0x102D0],
540
+ [0x102E0, 0x102E0],
541
+ [0x10300, 0x1031F],
542
+ [0x1032D, 0x1034A],
543
+ [0x10350, 0x1037A],
544
+ [0x10380, 0x1039D],
545
+ [0x103A0, 0x103C3],
546
+ [0x103C8, 0x103CF],
547
+ [0x103D1, 0x103D5],
548
+ [0x10400, 0x1049D],
549
+ [0x104A0, 0x104A9],
550
+ [0x104B0, 0x104D3],
551
+ [0x104D8, 0x104FB],
552
+ [0x10500, 0x10527],
553
+ [0x10530, 0x10563],
554
+ [0x10570, 0x1057A],
555
+ [0x1057C, 0x1058A],
556
+ [0x1058C, 0x10592],
557
+ [0x10594, 0x10595],
558
+ [0x10597, 0x105A1],
559
+ [0x105A3, 0x105B1],
560
+ [0x105B3, 0x105B9],
561
+ [0x105BB, 0x105BC],
562
+ [0x10600, 0x10736],
563
+ [0x10740, 0x10755],
564
+ [0x10760, 0x10767],
565
+ [0x10780, 0x10785],
566
+ [0x10787, 0x107B0],
567
+ [0x107B2, 0x107BA],
568
+ [0x10800, 0x10805],
569
+ [0x10808, 0x10808],
570
+ [0x1080A, 0x10835],
571
+ [0x10837, 0x10838],
572
+ [0x1083C, 0x1083C],
573
+ [0x1083F, 0x10855],
574
+ [0x10860, 0x10876],
575
+ [0x10880, 0x1089E],
576
+ [0x108E0, 0x108F2],
577
+ [0x108F4, 0x108F5],
578
+ [0x10900, 0x10915],
579
+ [0x10920, 0x10939],
580
+ [0x10980, 0x109B7],
581
+ [0x109BE, 0x109BF],
582
+ [0x10A00, 0x10A03],
583
+ [0x10A05, 0x10A06],
584
+ [0x10A0C, 0x10A13],
585
+ [0x10A15, 0x10A17],
586
+ [0x10A19, 0x10A35],
587
+ [0x10A38, 0x10A3A],
588
+ [0x10A3F, 0x10A3F],
589
+ [0x10A60, 0x10A7C],
590
+ [0x10A80, 0x10A9C],
591
+ [0x10AC0, 0x10AC7],
592
+ [0x10AC9, 0x10AE6],
593
+ [0x10B00, 0x10B35],
594
+ [0x10B40, 0x10B55],
595
+ [0x10B60, 0x10B72],
596
+ [0x10B80, 0x10B91],
597
+ [0x10C00, 0x10C48],
598
+ [0x10C80, 0x10CB2],
599
+ [0x10CC0, 0x10CF2],
600
+ [0x10D00, 0x10D27],
601
+ [0x10D30, 0x10D39],
602
+ [0x10E80, 0x10EA9],
603
+ [0x10EAB, 0x10EAC],
604
+ [0x10EB0, 0x10EB1],
605
+ [0x10EFD, 0x10F1C],
606
+ [0x10F27, 0x10F27],
607
+ [0x10F30, 0x10F50],
608
+ [0x10F70, 0x10F85],
609
+ [0x10FB0, 0x10FC4],
610
+ [0x10FE0, 0x10FF6],
611
+ [0x11000, 0x11046],
612
+ [0x11066, 0x11075],
613
+ [0x1107F, 0x110BA],
614
+ [0x110C2, 0x110C2],
615
+ [0x110D0, 0x110E8],
616
+ [0x110F0, 0x110F9],
617
+ [0x11100, 0x11134],
618
+ [0x11136, 0x1113F],
619
+ [0x11144, 0x11147],
620
+ [0x11150, 0x11173],
621
+ [0x11176, 0x11176],
622
+ [0x11180, 0x111C4],
623
+ [0x111C9, 0x111CC],
624
+ [0x111CE, 0x111DA],
625
+ [0x111DC, 0x111DC],
626
+ [0x11200, 0x11211],
627
+ [0x11213, 0x11237],
628
+ [0x1123E, 0x11241],
629
+ [0x11280, 0x11286],
630
+ [0x11288, 0x11288],
631
+ [0x1128A, 0x1128D],
632
+ [0x1128F, 0x1129D],
633
+ [0x1129F, 0x112A8],
634
+ [0x112B0, 0x112EA],
635
+ [0x112F0, 0x112F9],
636
+ [0x11300, 0x11303],
637
+ [0x11305, 0x1130C],
638
+ [0x1130F, 0x11310],
639
+ [0x11313, 0x11328],
640
+ [0x1132A, 0x11330],
641
+ [0x11332, 0x11333],
642
+ [0x11335, 0x11339],
643
+ [0x1133B, 0x11344],
644
+ [0x11347, 0x11348],
645
+ [0x1134B, 0x1134D],
646
+ [0x11350, 0x11350],
647
+ [0x11357, 0x11357],
648
+ [0x1135D, 0x11363],
649
+ [0x11366, 0x1136C],
650
+ [0x11370, 0x11374],
651
+ [0x11400, 0x1144A],
652
+ [0x11450, 0x11459],
653
+ [0x1145E, 0x11461],
654
+ [0x11480, 0x114C5],
655
+ [0x114C7, 0x114C7],
656
+ [0x114D0, 0x114D9],
657
+ [0x11580, 0x115B5],
658
+ [0x115B8, 0x115C0],
659
+ [0x115D8, 0x115DD],
660
+ [0x11600, 0x11640],
661
+ [0x11644, 0x11644],
662
+ [0x11650, 0x11659],
663
+ [0x11680, 0x116B8],
664
+ [0x116C0, 0x116C9],
665
+ [0x11700, 0x1171A],
666
+ [0x1171D, 0x1172B],
667
+ [0x11730, 0x11739],
668
+ [0x11740, 0x11746],
669
+ [0x11800, 0x1183A],
670
+ [0x118A0, 0x118E9],
671
+ [0x118FF, 0x11906],
672
+ [0x11909, 0x11909],
673
+ [0x1190C, 0x11913],
674
+ [0x11915, 0x11916],
675
+ [0x11918, 0x11935],
676
+ [0x11937, 0x11938],
677
+ [0x1193B, 0x11943],
678
+ [0x11950, 0x11959],
679
+ [0x119A0, 0x119A7],
680
+ [0x119AA, 0x119D7],
681
+ [0x119DA, 0x119E1],
682
+ [0x119E3, 0x119E4],
683
+ [0x11A00, 0x11A3E],
684
+ [0x11A47, 0x11A47],
685
+ [0x11A50, 0x11A99],
686
+ [0x11A9D, 0x11A9D],
687
+ [0x11AB0, 0x11AF8],
688
+ [0x11C00, 0x11C08],
689
+ [0x11C0A, 0x11C36],
690
+ [0x11C38, 0x11C40],
691
+ [0x11C50, 0x11C59],
692
+ [0x11C72, 0x11C8F],
693
+ [0x11C92, 0x11CA7],
694
+ [0x11CA9, 0x11CB6],
695
+ [0x11D00, 0x11D06],
696
+ [0x11D08, 0x11D09],
697
+ [0x11D0B, 0x11D36],
698
+ [0x11D3A, 0x11D3A],
699
+ [0x11D3C, 0x11D3D],
700
+ [0x11D3F, 0x11D47],
701
+ [0x11D50, 0x11D59],
702
+ [0x11D60, 0x11D65],
703
+ [0x11D67, 0x11D68],
704
+ [0x11D6A, 0x11D8E],
705
+ [0x11D90, 0x11D91],
706
+ [0x11D93, 0x11D98],
707
+ [0x11DA0, 0x11DA9],
708
+ [0x11EE0, 0x11EF6],
709
+ [0x11F00, 0x11F10],
710
+ [0x11F12, 0x11F3A],
711
+ [0x11F3E, 0x11F42],
712
+ [0x11F50, 0x11F59],
713
+ [0x11FB0, 0x11FB0],
714
+ [0x12000, 0x12399],
715
+ [0x12400, 0x1246E],
716
+ [0x12480, 0x12543],
717
+ [0x12F90, 0x12FF0],
718
+ [0x13000, 0x1342F],
719
+ [0x13440, 0x13455],
720
+ [0x14400, 0x14646],
721
+ [0x16800, 0x16A38],
722
+ [0x16A40, 0x16A5E],
723
+ [0x16A60, 0x16A69],
724
+ [0x16A70, 0x16ABE],
725
+ [0x16AC0, 0x16AC9],
726
+ [0x16AD0, 0x16AED],
727
+ [0x16AF0, 0x16AF4],
728
+ [0x16B00, 0x16B36],
729
+ [0x16B40, 0x16B43],
730
+ [0x16B50, 0x16B59],
731
+ [0x16B63, 0x16B77],
732
+ [0x16B7D, 0x16B8F],
733
+ [0x16E40, 0x16E7F],
734
+ [0x16F00, 0x16F4A],
735
+ [0x16F4F, 0x16F87],
736
+ [0x16F8F, 0x16F9F],
737
+ [0x16FE0, 0x16FE1],
738
+ [0x16FE3, 0x16FE4],
739
+ [0x16FF0, 0x16FF1],
740
+ [0x17000, 0x187F7],
741
+ [0x18800, 0x18CD5],
742
+ [0x18D00, 0x18D08],
743
+ [0x1AFF0, 0x1AFF3],
744
+ [0x1AFF5, 0x1AFFB],
745
+ [0x1AFFD, 0x1AFFE],
746
+ [0x1B000, 0x1B122],
747
+ [0x1B132, 0x1B132],
748
+ [0x1B150, 0x1B152],
749
+ [0x1B155, 0x1B155],
750
+ [0x1B164, 0x1B167],
751
+ [0x1B170, 0x1B2FB],
752
+ [0x1BC00, 0x1BC6A],
753
+ [0x1BC70, 0x1BC7C],
754
+ [0x1BC80, 0x1BC88],
755
+ [0x1BC90, 0x1BC99],
756
+ [0x1BC9D, 0x1BC9E],
757
+ [0x1CF00, 0x1CF2D],
758
+ [0x1CF30, 0x1CF46],
759
+ [0x1D165, 0x1D169],
760
+ [0x1D16D, 0x1D172],
761
+ [0x1D17B, 0x1D182],
762
+ [0x1D185, 0x1D18B],
763
+ [0x1D1AA, 0x1D1AD],
764
+ [0x1D242, 0x1D244],
765
+ [0x1D400, 0x1D454],
766
+ [0x1D456, 0x1D49C],
767
+ [0x1D49E, 0x1D49F],
768
+ [0x1D4A2, 0x1D4A2],
769
+ [0x1D4A5, 0x1D4A6],
770
+ [0x1D4A9, 0x1D4AC],
771
+ [0x1D4AE, 0x1D4B9],
772
+ [0x1D4BB, 0x1D4BB],
773
+ [0x1D4BD, 0x1D4C3],
774
+ [0x1D4C5, 0x1D505],
775
+ [0x1D507, 0x1D50A],
776
+ [0x1D50D, 0x1D514],
777
+ [0x1D516, 0x1D51C],
778
+ [0x1D51E, 0x1D539],
779
+ [0x1D53B, 0x1D53E],
780
+ [0x1D540, 0x1D544],
781
+ [0x1D546, 0x1D546],
782
+ [0x1D54A, 0x1D550],
783
+ [0x1D552, 0x1D6A5],
784
+ [0x1D6A8, 0x1D6C0],
785
+ [0x1D6C2, 0x1D6DA],
786
+ [0x1D6DC, 0x1D6FA],
787
+ [0x1D6FC, 0x1D714],
788
+ [0x1D716, 0x1D734],
789
+ [0x1D736, 0x1D74E],
790
+ [0x1D750, 0x1D76E],
791
+ [0x1D770, 0x1D788],
792
+ [0x1D78A, 0x1D7A8],
793
+ [0x1D7AA, 0x1D7C2],
794
+ [0x1D7C4, 0x1D7CB],
795
+ [0x1D7CE, 0x1D7FF],
796
+ [0x1DA00, 0x1DA36],
797
+ [0x1DA3B, 0x1DA6C],
798
+ [0x1DA75, 0x1DA75],
799
+ [0x1DA84, 0x1DA84],
800
+ [0x1DA9B, 0x1DA9F],
801
+ [0x1DAA1, 0x1DAAF],
802
+ [0x1DF00, 0x1DF1E],
803
+ [0x1DF25, 0x1DF2A],
804
+ [0x1E000, 0x1E006],
805
+ [0x1E008, 0x1E018],
806
+ [0x1E01B, 0x1E021],
807
+ [0x1E023, 0x1E024],
808
+ [0x1E026, 0x1E02A],
809
+ [0x1E030, 0x1E06D],
810
+ [0x1E08F, 0x1E08F],
811
+ [0x1E100, 0x1E12C],
812
+ [0x1E130, 0x1E13D],
813
+ [0x1E140, 0x1E149],
814
+ [0x1E14E, 0x1E14E],
815
+ [0x1E290, 0x1E2AE],
816
+ [0x1E2C0, 0x1E2F9],
817
+ [0x1E4D0, 0x1E4F9],
818
+ [0x1E7E0, 0x1E7E6],
819
+ [0x1E7E8, 0x1E7EB],
820
+ [0x1E7ED, 0x1E7EE],
821
+ [0x1E7F0, 0x1E7FE],
822
+ [0x1E800, 0x1E8C4],
823
+ [0x1E8D0, 0x1E8D6],
824
+ [0x1E900, 0x1E94B],
825
+ [0x1E950, 0x1E959],
826
+ [0x1EE00, 0x1EE03],
827
+ [0x1EE05, 0x1EE1F],
828
+ [0x1EE21, 0x1EE22],
829
+ [0x1EE24, 0x1EE24],
830
+ [0x1EE27, 0x1EE27],
831
+ [0x1EE29, 0x1EE32],
832
+ [0x1EE34, 0x1EE37],
833
+ [0x1EE39, 0x1EE39],
834
+ [0x1EE3B, 0x1EE3B],
835
+ [0x1EE42, 0x1EE42],
836
+ [0x1EE47, 0x1EE47],
837
+ [0x1EE49, 0x1EE49],
838
+ [0x1EE4B, 0x1EE4B],
839
+ [0x1EE4D, 0x1EE4F],
840
+ [0x1EE51, 0x1EE52],
841
+ [0x1EE54, 0x1EE54],
842
+ [0x1EE57, 0x1EE57],
843
+ [0x1EE59, 0x1EE59],
844
+ [0x1EE5B, 0x1EE5B],
845
+ [0x1EE5D, 0x1EE5D],
846
+ [0x1EE5F, 0x1EE5F],
847
+ [0x1EE61, 0x1EE62],
848
+ [0x1EE64, 0x1EE64],
849
+ [0x1EE67, 0x1EE6A],
850
+ [0x1EE6C, 0x1EE72],
851
+ [0x1EE74, 0x1EE77],
852
+ [0x1EE79, 0x1EE7C],
853
+ [0x1EE7E, 0x1EE7E],
854
+ [0x1EE80, 0x1EE89],
855
+ [0x1EE8B, 0x1EE9B],
856
+ [0x1EEA1, 0x1EEA3],
857
+ [0x1EEA5, 0x1EEA9],
858
+ [0x1EEAB, 0x1EEBB],
859
+ [0x1FBF0, 0x1FBF9],
860
+ [0x20000, 0x2A6DF],
861
+ [0x2A700, 0x2B739],
862
+ [0x2B740, 0x2B81D],
863
+ [0x2B820, 0x2CEA1],
864
+ [0x2CEB0, 0x2EBE0],
865
+ [0x2EBF0, 0x2EE5D],
866
+ [0x2F800, 0x2FA1D],
867
+ [0x30000, 0x3134A],
868
+ [0x31350, 0x323AF],
869
+ ].freeze
870
+
871
+ # Label-start (used for heredoc labels and modifier names): underscore or ASCII alpha.
872
LABEL_START_BYTE = Array.new(256) do |b|
  b == UNDERSCORE || b.between?(LOWER_A, LOWER_Z) || b.between?(UPPER_A, UPPER_Z)
end.freeze

# Label-cont: alpha + digit + underscore.
LABEL_CONT_BYTE = Array.new(256) do |b|
  b == UNDERSCORE ||
    b.between?(LOWER_A, LOWER_Z) ||
    b.between?(UPPER_A, UPPER_Z) ||
    b.between?(DIGIT0, DIGIT9)
end.freeze

# ASCII digit lookup.
DIGIT_BYTE = Array.new(256) { |b| b.between?(DIGIT0, DIGIT9) }.freeze

# Hex digit lookup (0-9, a-f, A-F).
HEX_BYTE = Array.new(256) do |b|
  b.between?(DIGIT0, DIGIT9) || b.between?(LOWER_A, LOWER_F) || b.between?(UPPER_A, UPPER_F)
end.freeze

# Value-terminator lookup (whitespace, EOL, comments, flow-end markers).
VALUE_TERMINATOR_BYTE = begin
  terminators = [SP, TAB, LF, CR, HASH, SLASH, COMMA, RBRACK, RBRACE]
  Array.new(256) { |b| terminators.include?(b) }.freeze
end
902
+
903
+ # ---------- Public entry ----------
904
+
905
def self.parse_document(src)
  # Full mode with insertion-ordered tables: both mode flags are off.
  lite = false
  unordered = false
  _parse_document_with_mode(src, lite, unordered)
end
908
+
909
+ # Lite-mode parse: same data tree, no comment AST, no original_forms.
910
+ # Not suitable for to_dms round-trip. SPEC §Parsing modes — full and lite.
911
def self.parse_lite_document(src)
  # Lite mode on, tables stay insertion-ordered.
  lite = true
  unordered = false
  _parse_document_with_mode(src, lite, unordered)
end
914
+
915
+ # Unordered full-mode parse (SPEC §"Unordered tables"). Every body
916
+ # `Hash` is replaced by an `UnorderedHash`; iteration order is
917
+ # arbitrary. Comments + original_forms are still recorded, but
918
+ # `Dms.encode` will refuse to round-trip the result — use
919
+ # `Dms.encode_lite` for canonical emit instead.
920
def self.parse_document_unordered(src)
  # Full mode, but with the ignore-order flag set.
  lite = false
  unordered = true
  _parse_document_with_mode(src, lite, unordered)
end
923
+
924
+ # Unordered lite-mode parse (SPEC §"Unordered tables"). The
925
+ # `(unordered, lite)` combo is the fastest read-only path for ports
926
+ # that ship a hash-only backing.
927
def self.parse_lite_document_unordered(src)
  # Both flags set: lite mode plus ignore-order.
  lite = true
  unordered = true
  _parse_document_with_mode(src, lite, unordered)
end
930
+
931
# Shared driver behind the public parse_* entry points.
#
# Brings `src` to UTF-8 without mutating the caller's String, rejects a
# leading BOM and any NUL, NFC-normalizes non-ASCII input, then parses the
# front matter followed by the body.
#
# @param src [String] DMS source text
# @param lite [Boolean] forwarded to the parser as `lite:`
# @param ignore_order [Boolean] forwarded to the parser as `ignore_order:`
# @return [Document] built from the front matter, body, comments and original_forms
# @raise [DecodeError] for a leading U+FEFF or any U+0000 in the source
def self._parse_document_with_mode(src, lite, ignore_order = false)
  if src.encoding == Encoding::BINARY
    # Reinterpret the raw bytes as UTF-8 on a private copy; calling
    # force_encoding on `src` itself would retag the caller's String.
    # (Encoding::ASCII_8BIT is the same object as Encoding::BINARY.)
    src = src.dup.force_encoding(Encoding::UTF_8)
  elsif src.encoding != Encoding::UTF_8
    src = src.encode(Encoding::UTF_8)
  elsif src.frozen?
    # Keep handing the parser an unfrozen String, as before.
    src = src.dup
  end
  # SPEC §"UTF-8 only, NFC-normalized": DMS source is plain UTF-8 with
  # no byte-order mark. A leading U+FEFF is not silently consumed —
  # reject it explicitly so encoding mistakes surface loudly. (BOMs
  # *inside* string/heredoc bodies are fine; this only fires at offset 0.)
  # The BOM is spelled as an escape so the check cannot degrade into
  # start_with?("") if an editor or tool strips the invisible character.
  if src.start_with?("\uFEFF")
    raise DecodeError.new(1, 1, "BOM (U+FEFF) at file start is not allowed; DMS source is plain UTF-8")
  end
  nul = src.index("\0")
  if nul
    # String#index returns a *character* offset, so the prefix must be cut
    # by characters too; a byteslice here misplaces the error on any
    # source with multi-byte characters before the NUL.
    prefix = src[0, nul]
    line = 1 + prefix.count("\n")
    last_nl = prefix.rindex("\n")
    col = last_nl ? (nul - last_nl) : (nul + 1)
    raise DecodeError.new(line, col, "U+0000 (NUL) is not allowed in DMS source")
  end
  # NFC-normalize unless ASCII-only (which is a no-op).
  src = src.unicode_normalize(:nfc) unless src.ascii_only?
  p = new(src, lite: lite, ignore_order: ignore_order)
  meta = p.parse_front_matter
  body = p.parse_body
  Document.new(meta, body, p.comments, p.original_forms)
end
960
+
961
+ # SPEC §Front-matter-only decode. Decodes the leading `+++ ... +++`
962
+ # block and stops — body bytes after the closer are not tokenized,
963
+ # so body-only errors (duplicate body keys, unterminated body
964
+ # heredoc, etc.) are not surfaced here. Front-matter validation is
965
+ # byte-identical to a full decode: open/close on their own lines,
966
+ # `_dms_tier` is type-checked, unknown reserved keys rejected,
967
+ # unterminated front matter is a parse error.
968
+ #
969
+ # Returns the front-matter Hash (possibly empty) when an opener is
970
+ # present, or nil when the document has no front matter at all.
971
+ # Always runs in lite mode — no comment AST, no original_forms.
972
def self.parse_front_matter_only(src)
  if src.encoding == Encoding::BINARY
    # Reinterpret the raw bytes as UTF-8 on a private copy; calling
    # force_encoding on `src` itself would retag the caller's String.
    # (Encoding::ASCII_8BIT is the same object as Encoding::BINARY.)
    src = src.dup.force_encoding(Encoding::UTF_8)
  elsif src.encoding != Encoding::UTF_8
    src = src.encode(Encoding::UTF_8)
  elsif src.frozen?
    # Keep handing the parser an unfrozen String, as before.
    src = src.dup
  end
  # The BOM is spelled as an escape so the check cannot degrade into
  # start_with?("") if an editor or tool strips the invisible character.
  if src.start_with?("\uFEFF")
    raise DecodeError.new(1, 1, "BOM (U+FEFF) at file start is not allowed; DMS source is plain UTF-8")
  end
  # The NUL scan covers the whole buffer, not just the front-matter
  # region: a NUL anywhere in the source is rejected here, even though
  # other body-only errors are not surfaced by this entry point.
  nul = src.index("\0")
  if nul
    # String#index returns a *character* offset, so the prefix must be cut
    # by characters too; a byteslice here misplaces the error on any
    # source with multi-byte characters before the NUL.
    prefix = src[0, nul]
    line = 1 + prefix.count("\n")
    last_nl = prefix.rindex("\n")
    col = last_nl ? (nul - last_nl) : (nul + 1)
    raise DecodeError.new(line, col, "U+0000 (NUL) is not allowed in DMS source")
  end
  # NFC-normalize unless ASCII-only (which is a no-op).
  src = src.unicode_normalize(:nfc) unless src.ascii_only?
  # Always lite and insertion-ordered; only the front matter is parsed.
  p = new(src, lite: true, ignore_order: false)
  p.parse_front_matter
end
999
+
1000
+ # ---------- Init ----------
1001
+
1002
+ attr_reader :comments, :original_forms
1003
+
1004
def initialize(src, lite: false, ignore_order: false)
  # A leading U+FEFF is rejected by the class-level entry points before a
  # parser is constructed; nothing is stripped from `src` here.
  @src = src
  # All positions are *byte* offsets into @src, so the limit is a byte count.
  @len = src.bytesize

  # Cursor: current byte offset, 1-based line number, and the byte offset
  # at which the current line begins.
  @pos = @line_start = 0
  @line = 1

  # Mode flags. Lite mode skips comment-AST and original_forms bookkeeping
  # (same grammar, same errors). ignore_order makes new_table hand out
  # UnorderedHash tables, which finalize_table shuffles at end-of-build.
  @lite = lite
  @ignore_order = ignore_order

  # Bookkeeping collected while parsing.
  @comments = []
  @pending_leading = []
  @original_forms = []
  @path = []
  @record_forms = true
end
1028
+
1029
+ # Allocate a fresh table (Hash by default, UnorderedHash when
1030
+ # `ignore_order` is set). Used for body tables, list-item tables,
1031
+ # and flow tables. Front-matter `meta` is excluded — the front-matter
1032
+ # block is always insertion-ordered regardless of body mode.
1033
def new_table
  # Unordered mode gets the dedicated table class; otherwise a plain Hash.
  return UnorderedHash.new if @ignore_order

  {}
end
1036
+
1037
+ # Shuffle an UnorderedHash in place at end-of-build so callers cannot
1038
+ # rely on insertion order. No-op for plain `Hash` or empty tables.
1039
def finalize_table(t)
  # Only multi-entry UnorderedHash tables built in unordered mode are touched.
  needs_shuffle = @ignore_order && t.is_a?(UnorderedHash) && t.size > 1
  if needs_shuffle
    # Snapshot the pairs in a random key order, then rebuild the same
    # object in place so existing references observe the new order.
    reordered = t.keys.shuffle.map { |key| [key, t[key]] }
    t.clear
    reordered.each { |(key, value)| t[key] = value }
  end
  t
end
1047
+
1048
+ # ---------- Position primitives ----------
1049
+
1050
+ # 1-based char column. For pure-ASCII lines this is identical to the
1051
+ # byte offset within the line; for mixed UTF-8 we count chars from
1052
+ # @line_start to @pos (rare, only used in error messages).
1053
def col
  offset = @pos - @line_start
  if @src.ascii_only?
    # Bytes and characters coincide, so the byte offset is the column.
    offset + 1
  else
    # Count the characters between the line start and the cursor.
    @src.byteslice(@line_start, offset).length + 1
  end
end
1058
+
1059
def err(msg)
  # Builds (does not raise) a DecodeError located at the current cursor.
  DecodeError.new(@line, col, msg)
end
1060
+
1061
def err_at(line, line_start, byte_pos, msg)
  # Start from the byte distance; for non-ASCII sources convert it to a
  # character count so the reported column is character-based.
  span = byte_pos - line_start
  span = @src.byteslice(line_start, span).length unless @src.ascii_only?
  DecodeError.new(line, span + 1, msg)
end
1071
+
1072
+ # peek: returns the byte at @pos as Integer, or nil at EOF.
1073
def peek_byte; @src.getbyte(@pos); end
1076
+
1077
+ # peek_char_byte_safe: returns the *character* at @pos as a String. Used in
1078
+ # error messages and a handful of dispatch sites where we already
1079
+ # know we're on a single-byte ASCII char.
1080
+ def peek_char_byte_safe
1081
+ b = @src.getbyte(@pos)
1082
+ return nil if b.nil?
1083
+ return b.chr if b < 128
1084
+ # Multi-byte: read full character.
1085
+ @src.byteslice(@pos, 4).force_encoding(Encoding::UTF_8)[0]
1086
+ end
1087
+
1088
# True when the bytes of `s` appear in the source starting exactly at @pos.
# The comparison is byte-for-byte; it returns false without scanning when
# fewer than `s.bytesize` bytes remain before @len.
def starts_bytes?(s)
  width = s.bytesize
  return false if @pos + width > @len

  width.times do |offset|
    return false unless @src.getbyte(@pos + offset) == s.getbyte(offset)
  end
  true
end
1099
+
1100
# True once the cursor has reached (or passed) the end of the source.
def eof?
  @pos >= @len
end

# Record that a line terminator was just consumed: bump the line counter
# and remember the current cursor as the start of the new line.
def advance_line
  @line = @line.succ
  @line_start = @pos
end
1106
+
1107
+ # ---------- Whitespace / EOL ----------
1108
+
1109
# Advance @pos past any run of spaces and tabs. Line terminators are not
# consumed, and the cursor never moves beyond @len.
def skip_inline_ws
  cursor = @pos
  while cursor < @len
    byte = @src.getbyte(cursor)
    break if byte != SP && byte != TAB

    cursor += 1
  end
  @pos = cursor
end
1120
+
1121
# Consume one line terminator at the cursor, if there is one.
# Accepts LF or CR LF; on success the cursor moves past it, the line
# bookkeeping is advanced and true is returned. Anything else (including
# a CR that is not followed by LF, or end of input) leaves the cursor
# untouched and returns false.
def consume_eol
  byte = @src.getbyte(@pos)
  width =
    if byte == LF
      1
    elsif byte == CR && @src.getbyte(@pos + 1) == LF
      2
    end
  return false unless width

  @pos += width
  advance_line
  true
end
1135
+
1136
# Skip blank lines and comments, stopping at the first line that holds
# real content. Comments met on the way are queued in @pending_leading
# (unless @lite is set). On return @pos is rewound to the position where
# the last iteration started — i.e. before any inline whitespace that
# precedes the content (or before trailing whitespace at end of input) —
# so callers can measure indentation themselves.
# Raises a DecodeError for a CR that is not followed by LF.
def skip_trivia
  loop do
    # Remember where this iteration began so we can rewind to it.
    line_start_pos = @pos
    skip_inline_ws
    b = @src.getbyte(@pos)
    if b.nil?
      # End of input: undo the whitespace skip and stop.
      @pos = line_start_pos
      return
    elsif b == LF
      # Blank line: comments queued so far are flushed as floating.
      flush_pending_as_floating
      @pos += 1; advance_line
    elsif b == CR
      if @src.getbyte(@pos + 1) != LF
        raise err("bare CR is not a valid line terminator")
      end
      flush_pending_as_floating
      @pos += 2; advance_line
    elsif b == HASH
      if starts_bytes?("###")
        # `###` opens a (possibly labelled) block comment.
        raw = read_hash_block_comment
        @pending_leading << Comment.new(raw, :block) unless @lite
      else
        # `#` line comment: read to end of line, then eat the terminator.
        raw = read_line_comment_to_eol
        consume_eol
        @pending_leading << Comment.new(raw, :line) unless @lite
      end
    elsif b == SLASH
      n2 = @src.getbyte(@pos + 1)
      if n2 == SLASH
        # `//` line comment.
        raw = read_line_comment_to_eol
        consume_eol
        @pending_leading << Comment.new(raw, :line) unless @lite
      elsif n2 == STAR
        # `/* ... */` block comment; the loop continues on the same line.
        raw = read_c_block_comment
        @pending_leading << Comment.new(raw, :block) unless @lite
      else
        # A lone '/' is not trivia: rewind and let the caller handle it.
        @pos = line_start_pos
        return
      end
    else
      # First byte of real content: rewind to the start of this iteration.
      @pos = line_start_pos
      return
    end
  end
end
1181
+
1182
+ # ---------- Pending leading flushers ----------
1183
+
1184
# Move every queued leading comment into @comments as a :floating comment
# attached to a frozen snapshot of the current @path, then reset the queue.
# Does nothing when the queue is empty.
def flush_pending_as_floating
  return if @pending_leading.empty?

  queued = @pending_leading
  @pending_leading = []
  owner_path = @path.dup.freeze
  queued.each { |comment| @comments << AttachedComment.new(comment, :floating, owner_path) }
end
1193
+
1194
# Move every queued leading comment into @comments as a :leading comment
# attached to a frozen snapshot of the current @path (the node that is
# about to be parsed), then reset the queue. No-op on an empty queue.
def flush_pending_as_leading_on_current
  return if @pending_leading.empty?

  queued = @pending_leading
  @pending_leading = []
  owner_path = @path.dup.freeze
  queued.each { |comment| @comments << AttachedComment.new(comment, :leading, owner_path) }
end
1203
+
1204
+ # ---------- Raw comment readers ----------
1205
+
1206
# Read from the cursor up to (not including) the next LF or CR, or to the
# end of input. Leaves @pos on the terminator and returns the consumed
# bytes as a UTF-8 tagged String.
def read_line_comment_to_eol
  from = @pos
  cursor = from
  while cursor < @len
    byte = @src.getbyte(cursor)
    break if byte == LF || byte == CR

    cursor += 1
  end
  @pos = cursor
  @src.byteslice(from, cursor - from).force_encoding(Encoding::UTF_8)
end
1219
+
1220
# Read a `/* ... */` comment starting at the cursor (which must sit on the
# opening `/*`). Nested `/* */` pairs are balanced. LF and CR LF inside the
# comment advance the line bookkeeping; any other byte is skipped as-is.
# Returns the raw comment text, delimiters included, as a UTF-8 String.
# Raises a DecodeError located at the opener when input ends first.
def read_c_block_comment
  open_line = @line
  open_line_start = @line_start
  open_pos = @pos
  @pos += 2
  nesting = 1
  until nesting.zero?
    if @pos >= @len
      raise err_at(open_line, open_line_start, open_pos, "unterminated /* block comment")
    end

    byte = @src.getbyte(@pos)
    following = @src.getbyte(@pos + 1)
    if byte == SLASH && following == STAR
      @pos += 2
      nesting += 1
    elsif byte == STAR && following == SLASH
      @pos += 2
      nesting -= 1
    elsif byte == LF
      @pos += 1
      advance_line
    elsif byte == CR && following == LF
      @pos += 2
      advance_line
    else
      @pos += 1
    end
  end
  @src.byteslice(open_pos, @pos - open_pos).force_encoding(Encoding::UTF_8)
end
1245
+
1246
# Read a `###` block comment starting at the cursor (which must sit on the
# opening `###`).
#
# An optional label may follow the opener directly (`###label`); it must
# start with an ASCII letter or underscore. The opener has to be alone on
# its line. The comment ends at the first subsequent line whose stripped
# text equals the label, or `###` when no label was given.
#
# Returns the raw text from the opener through the end of the terminator
# line (its line terminator excluded) as a UTF-8 String; the cursor is left
# after the terminator line's EOL.
#
# Raises a DecodeError when the label is malformed, the opener line has
# trailing content, the input ends before the terminator, or a bare CR
# (CR not followed by LF) appears inside the comment body.
def read_hash_block_comment
  sl = @line; sls = @line_start; sp = @pos
  @pos += 3
  ls = @pos
  s = @src
  n = @len
  while @pos < n
    b = s.getbyte(@pos)
    break unless LABEL_CONT_BYTE[b]
    @pos += 1
  end
  label = s.byteslice(ls, @pos - ls).force_encoding(Encoding::UTF_8)
  unless label.empty?
    first = label.getbyte(0)
    unless first == UNDERSCORE || (first >= LOWER_A && first <= LOWER_Z) || (first >= UPPER_A && first <= UPPER_Z)
      raise err_at(sl, sls, sp, "block comment label must start with a letter or underscore")
    end
  end
  terminator = label.empty? ? "###" : label
  skip_inline_ws
  unless consume_eol || eof?
    raise err("block comment opener must be on its own line")
  end
  loop do
    raise err_at(sl, sls, sp, "unterminated ### block comment") if eof?

    line_begin = @pos
    while @pos < n
      b = s.getbyte(@pos)
      break if b == LF || b == CR
      @pos += 1
    end
    line_text = s.byteslice(line_begin, @pos - line_begin).force_encoding(Encoding::UTF_8)
    line_end = @pos
    # The scan above stops on LF, CR or end of input. If consume_eol
    # declines and we are not at the end, the cursor sits on a CR without
    # LF. consume_eol does not advance in that case, so continuing would
    # loop forever on the same byte; reject it like skip_trivia does.
    unless consume_eol || eof?
      raise err("bare CR is not a valid line terminator")
    end
    if line_text.strip == terminator
      return s.byteslice(sp, line_end - sp).force_encoding(Encoding::UTF_8)
    end
  end
end
1287
+
1288
+ # ---------- Document entry ----------
1289
+
1290
# Parse an optional `+++ ... +++` front-matter block at the cursor.
#
# Returns nil (with cursor, line bookkeeping, pending comments and
# @comments restored to their state on entry) when the document does not
# start with `+++` after leading trivia. Otherwise returns the front-matter
# table as a plain insertion-ordered Hash, with reserved (`_`-prefixed)
# keys validated and removed.
#
# The lines between the two `+++` markers are collected (line terminators
# normalised to "\n") and parsed by a nested parser instance. Comments and
# original-literal records produced by that nested parse are re-homed under
# a leading "__fm__" path segment; comments attached to reserved keys become
# floating comments on ["__fm__"], and original forms of reserved keys are
# dropped.
#
# Raises a DecodeError when the opener has trailing content, the block is
# unterminated, a bare CR (CR without LF) appears inside the block, a
# reserved key is unknown, or `_dms_tier` is not 0. Errors about the block
# as a whole are reported at the opener, with a character-based column like
# every other error in this parser.
def parse_front_matter
  save_pos = @pos; save_line = @line; save_lstart = @line_start
  save_pending = @pending_leading.length
  save_comments = @comments.length
  skip_trivia
  unless starts_bytes?("+++")
    # Not front matter: roll back everything skip_trivia may have touched.
    @pos = save_pos; @line = save_line; @line_start = save_lstart
    @pending_leading.slice!(save_pending..)
    @comments.slice!(save_comments..)
    return nil
  end
  # Any trailing content on the opener line is a parse error
  # (SPEC §Front matter: "each `+++` must appear on its own line,
  # with no trailing content"). Advance past `+++` and let the
  # strict EOL check below diagnose.
  opener_line = @line; opener_lstart = @line_start; opener_pos = @pos
  # err_at converts the byte offset to a character column, so a non-ASCII
  # block comment in front of `+++` on the same line is counted correctly.
  fm_err = ->(msg) { err_at(opener_line, opener_lstart, opener_pos, msg) }
  @pos += 3
  skip_inline_ws
  unless consume_eol || eof?
    raise err("front matter opener must be on its own line")
  end
  inner_buf = +""
  inner_buf.force_encoding(Encoding::UTF_8)
  loop do
    raise fm_err.call("unterminated front matter: missing closing '+++'") if eof?

    line_begin = @pos
    while @pos < @len
      b = @src.getbyte(@pos)
      break if b == LF || b == CR
      @pos += 1
    end
    line_text = @src.byteslice(line_begin, @pos - line_begin).force_encoding(Encoding::UTF_8)
    if line_text.strip == "+++"
      consume_eol
      break
    end
    inner_buf << line_text
    if consume_eol
      inner_buf << "\n"
    elsif !eof?
      # The scan stopped on a CR that is not followed by LF. consume_eol
      # does not advance over it, so looping again would never terminate.
      raise err("bare CR is not a valid line terminator")
    end
  end
  sub = self.class.new(inner_buf, lite: @lite)
  table = sub.parse_body_as_table
  meta = {}
  table.each do |k, v|
    unless k.start_with?("_")
      meta[k] = v
      next
    end
    raise fm_err.call("unknown reserved key: #{k}") unless k == "_dms_tier"
    # Ruby booleans are not Integers, so a single type check suffices.
    raise fm_err.call("_dms_tier must be a non-negative integer") unless v.is_a?(Integer)
    raise fm_err.call("_dms_tier must be non-negative") if v < 0
    if v >= 2
      raise fm_err.call("_dms_tier: #{v} is not supported (only tier 0 and 1 are defined)")
    end
    raise fm_err.call("_dms_tier: 1 requires tier-1 decode mode (use --tier=1)") if v == 1
  end
  sub.comments.each do |ac|
    attached_to_reserved = !ac.path.empty? && ac.path[0].is_a?(String) && ac.path[0].start_with?("_")
    if attached_to_reserved
      @comments << AttachedComment.new(ac.comment, :floating, ["__fm__"].freeze)
      next
    end
    @comments << AttachedComment.new(ac.comment, ac.position, (["__fm__"] + ac.path).freeze)
  end
  sub.original_forms.each do |path, lit|
    next if !path.empty? && path[0].is_a?(String) && path[0].start_with?("_")
    @original_forms << [(["__fm__"] + path).freeze, lit]
  end
  meta
end
1370
+
1371
# Parse the whole source as a front-matter body, which must be a block
# table. An input holding only trivia yields an empty table. Raises a
# DecodeError for leading indentation, a reserved sigil, a list root, any
# non key/value first line, or content left over after the table.
def parse_body_as_table
  skip_trivia
  if eof?
    flush_pending_as_floating
    return new_table
  end

  lead = @src.getbyte(@pos)
  raise err("unexpected indentation inside front matter") if lead == SP || lead == TAB

  reject_reserved_sigil_at_line_start!
  if lead == PLUS && peek_after_plus_is_space_or_eol?
    raise err("front matter block cannot have a list root")
  end
  raise err("front matter block must be a table") unless line_starts_kvpair?

  table = parse_table_block(0)
  skip_trivia
  raise err("trailing content inside front matter") unless eof?

  table
end
1393
+
1394
# Parse the document body and return its root value.
#
# The root is chosen from the first content line: a `+` marker followed by
# whitespace/EOL starts a block list, a `key:` line starts a block table,
# anything else is parsed as a single inline value or heredoc. An input
# holding only trivia yields an empty table. Whatever the root kind, only
# trivia may follow it; remaining comments are flushed as floating.
# Raises a DecodeError for leading indentation, a reserved sigil at the
# start of the line, or trailing content after the root.
def parse_body
  skip_trivia
  if eof?
    flush_pending_as_floating
    return new_table
  end

  lead = @src.getbyte(@pos)
  raise err("unexpected indentation at document root") if lead == SP || lead == TAB

  reject_reserved_sigil_at_line_start!
  if lead == PLUS && peek_after_plus_is_space_or_eol?
    root = parse_list_block(0)
    trailing_msg = "trailing content after list root"
  elsif line_starts_kvpair?
    root = parse_table_block(0)
    trailing_msg = "trailing content after table root"
  else
    root = parse_inline_value_or_heredoc
    consume_after_value(true)
    trailing_msg = "scalar root cannot be followed by more content"
  end
  skip_trivia
  raise err(trailing_msg) unless eof?

  flush_pending_as_floating
  root
end
1424
+
1425
# True when the byte right after the cursor is a space, tab, LF, CR, or
# missing (end of input) — i.e. a `+` at the cursor would be a list marker
# rather than the sign of a number.
def peek_after_plus_is_space_or_eol?
  case @src.getbyte(@pos + 1)
  when nil, SP, TAB, LF, CR then true
  else false
  end
end
1429
+
1430
# SPEC tier-0: a body line must not begin with a reserved decorator sigil
# (! @ $ % ^ & * | ~ ` . , > < ? ; =).
# The caller has already skipped leading whitespace and trivia, so @pos is
# on the first content byte of the line. Raises a DecodeError when that
# byte is flagged in RESERVED_SIGIL_BYTE; otherwise (including at end of
# input) returns without effect. String, comment and heredoc bodies are
# consumed by their own readers and never pass through this check.
def reject_reserved_sigil_at_line_start!
  return if @pos >= @len

  sigil = @src.getbyte(@pos)
  return if sigil.nil? || !RESERVED_SIGIL_BYTE[sigil]

  raise err("reserved decorator sigil '#{sigil.chr}' at line start is not allowed")
end
1442
+
1443
# Look ahead from the cursor (without moving it) and report whether the
# text there begins a `key: value` pair.
#
# A key is a double-quoted string (backslash escapes skip the next byte),
# a single-quoted string, or a non-empty run of bare-key characters (ASCII
# bytes flagged in BARE_KEY_BYTE, or non-ASCII characters accepted by
# xid_continue?). A quoted key must close before the end of the line. The
# key must be followed by ':' and then a space, tab, LF, CR, or end of
# input.
def line_starts_kvpair?
  src = @src
  limit = @len
  cursor = @pos
  case src.getbyte(cursor)
  when DQUOTE
    cursor += 1
    while cursor < limit
      byte = src.getbyte(cursor)
      return false if byte == LF || byte == CR

      if byte == BACKSLASH
        cursor += 2
        next
      end
      cursor += 1
      break if byte == DQUOTE
    end
  when SQUOTE
    cursor += 1
    while cursor < limit
      byte = src.getbyte(cursor)
      return false if byte == LF || byte == CR

      cursor += 1
      break if byte == SQUOTE
    end
  else
    key_start = cursor
    while cursor < limit
      byte = src.getbyte(cursor)
      if byte < 128
        break unless BARE_KEY_BYTE[byte]

        cursor += 1
      else
        # Non-ASCII lead byte: decode the whole character and test it.
        width = utf8_char_len(byte)
        char = src.byteslice(cursor, width).force_encoding(Encoding::UTF_8)
        break unless xid_continue?(char.ord)

        cursor += width
      end
    end
    return false if cursor == key_start
  end
  return false if cursor >= limit || src.getbyte(cursor) != COLON

  case src.getbyte(cursor + 1)
  when nil, SP, TAB, LF, CR then true
  else false
  end
end
1499
+
1500
# Number of bytes to consume for the character whose first byte is `b`:
# 1 below 0x80, 2 below 0xE0, 3 below 0xF0, 4 otherwise.
# Note that bytes in 0x80..0xBF (UTF-8 continuation bytes) fall in the
# "2" bucket; no validation of the lead byte is performed here.
def utf8_char_len(b)
  if b < 0x80
    1
  elsif b < 0xE0
    2
  elsif b < 0xF0
    3
  else
    4
  end
end
1507
+
1508
# Frozen XID_Continue test (Unicode 15.1, UAX #31 §2) for a code point.
# Code points below 0x80 always answer false: the ASCII part of the bare-key
# alphabet is decided by BARE_KEY_BYTE before this method is consulted.
# Everything else is looked up by binary search in XID_CONTINUE_RANGES, a
# sorted table of non-overlapping [first, last] pairs.
def xid_continue?(cp)
  return false if cp < 0x80

  hit = XID_CONTINUE_RANGES.bsearch do |range|
    if cp < range[0]
      -1 # target lies in an earlier range
    elsif cp > range[1]
      1 # target lies in a later range
    else
      0
    end
  end
  !hit.nil?
end
1531
+
1532
+ # ---------- Block parsers ----------
1533
+
1534
# Count the space characters at the start of the current line (from
# @line_start). Tabs and every other byte end the count. The cursor is
# not moved.
def measure_line_indent
  cursor = @line_start
  cursor += 1 while cursor < @len && @src.getbyte(cursor) == SP
  cursor - @line_start
end
1544
+
1545
# Parse consecutive `key: value` lines that sit at exactly `indent` spaces
# and return them as a table (see new_table / finalize_table).
#
# The block ends at end of input or at the first content line indented
# less than `indent`. A line indented more than `indent`, a reserved sigil
# at the key position, or a repeated key raises a DecodeError. Comments
# still pending when the block ends are flushed as floating.
def parse_table_block(indent)
  table = new_table
  loop do
    skip_trivia
    break if @pos >= @len

    # Count the leading spaces of the current line in place.
    scan = @line_start
    scan += 1 while scan < @len && @src.getbyte(scan) == SP
    line_indent = scan - @line_start
    break if line_indent < indent

    unless line_indent == indent
      raise err_at(@line, @line_start, @line_start + indent,
                   "inconsistent indent: expected #{indent} spaces, got #{line_indent}")
    end
    @pos = @line_start + indent
    reject_reserved_sigil_at_line_start!
    key, value = parse_kvpair(indent)
    raise err("duplicate key: #{key}") if table.key?(key)

    table[key] = value
  end
  flush_pending_as_floating
  finalize_table(table)
end
1571
+
1572
# Parse consecutive `+` list items that sit at exactly `indent` spaces and
# return them as an Array.
#
# The block ends at end of input, at a content line indented less than
# `indent`, or at a line at `indent` that does not start with '+'. While an
# item is parsed its index is pushed on @path (always popped again, even on
# error) and pending comments become leading comments of that item.
#
# An item is either `+ value` (inline value or first key of an item table)
# or an empty marker — `+` followed directly by EOL, or by whitespace and
# optional /* */ comments and then EOL — whose value is the more deeply
# indented block that follows. Raises a DecodeError for over-indented
# lines, reserved sigils, a '+' not followed by whitespace/EOL, or an
# empty marker without a deeper block.
def parse_list_block(indent)
  # Value of an empty `+` marker: the nested block on the following lines.
  nested_block = lambda do
    consume_eol
    skip_trivia
    raise err("expected indented block after empty '+' marker") if @pos >= @len

    inner_indent = measure_line_indent
    raise err("expected indented block after empty '+' marker") if inner_indent <= indent

    parse_block_value(inner_indent)
  end

  items = []
  loop do
    skip_trivia
    break if @pos >= @len

    line_indent = measure_line_indent
    break if line_indent < indent

    unless line_indent == indent
      raise err_at(@line, @line_start, @line_start + indent,
                   "inconsistent indent: expected #{indent} spaces, got #{line_indent}")
    end
    @pos = @line_start + indent
    reject_reserved_sigil_at_line_start!
    break unless @src.getbyte(@pos) == PLUS

    @path.push(items.length)
    flush_pending_as_leading_on_current unless @pending_leading.empty?
    begin
      @pos += 1 # consume '+'
      after_marker = @src.getbyte(@pos)
      item =
        if after_marker == SP || after_marker == TAB
          @pos += 1
          skip_inline_ws
          capture_inner_block_comments
          head = @src.getbyte(@pos)
          if head.nil? || head == LF || head == CR
            nested_block.call
          else
            parse_list_item_value(indent)
          end
        elsif after_marker.nil? || after_marker == LF || after_marker == CR
          nested_block.call
        else
          raise err("expected space after '+'")
        end
    ensure
      @path.pop
    end
    items << item
  end
  flush_pending_as_floating
  items
end
1626
+
1627
# Parse the nested block whose first line is indented by `indent` spaces.
# The cursor is moved to that column first; a `+` marker followed by
# whitespace/EOL selects a list block, anything else a table block.
def parse_block_value(indent)
  @pos = @line_start + indent
  list_marker = @src.getbyte(@pos) == PLUS && peek_after_plus_is_space_or_eol?
  list_marker ? parse_list_block(indent) : parse_table_block(indent)
end
1634
+
1635
# Parse the content that follows `+ ` on a list-item line.
#
# When the content is not a `key:` pair it is a single inline value or
# heredoc. Otherwise the item is a table: the first pair is parsed in
# place, and following lines whose indent equals the column of that first
# key contribute sibling pairs. The table ends at end of input, at a less
# indented line, or at an aligned line that is not a key/value pair.
# Raises a DecodeError for a sibling line indented deeper than the first
# key, a reserved sigil or '+' marker at the sibling column, or a
# duplicate key.
#
# `list_indent` is accepted for interface compatibility; the visible logic
# derives everything from the first key's column.
def parse_list_item_value(list_indent)
  unless line_starts_kvpair?
    scalar = parse_inline_value_or_heredoc
    consume_after_value(false)
    return scalar
  end

  key_col = col - 1
  first_key, first_value = parse_kvpair(key_col)
  table = new_table
  table[first_key] = first_value
  loop do
    skip_trivia
    break if @pos >= @len

    line_indent = measure_line_indent
    break if line_indent < key_col

    unless line_indent == key_col
      raise err_at(@line, @line_start, @line_start + key_col,
                   "list-item table sibling key must align with first key")
    end
    @pos = @line_start + key_col
    reject_reserved_sigil_at_line_start!
    raise err("'+' marker at sibling-key column is ambiguous") if @src.getbyte(@pos) == PLUS
    break unless line_starts_kvpair?

    key, value = parse_kvpair(key_col)
    raise err("duplicate key: #{key}") if table.key?(key)

    table[key] = value
  end
  flush_pending_as_floating
  finalize_table(table)
end
1667
+
1668
+ # ---------- kvpair ----------
1669
+
1670
# Parse one `key: value` pair starting at the cursor and return
# `[key, value]`.
#
# parent_indent - indent (in spaces) of the line holding the key; a child
#                 block must be indented strictly deeper than this.
#
# The key is pushed on @path while its value is parsed (pending comments
# become leading comments of the key) and popped before each normal
# return. The value is either an inline value/heredoc on the same line, or
# — when the ':' is followed by EOL, possibly after whitespace and
# /* */ comments — the indented block on the following lines.
# Raises a DecodeError when ':' is missing, when ':' is followed by a
# non-whitespace byte, or when an expected child block is absent.
def parse_kvpair(parent_indent)
  # Inlined parse_key fast path for bare ASCII keys (common hot-loop case).
  # Only takes the fast path when the *next* byte after the key run is
  # ASCII too — otherwise the key may include trailing unicode chars
  # the slow path needs to consume.
  s = @src
  n = @len
  start = @pos
  b0 = s.getbyte(start)
  took_fast = false
  if b0 && b0 < 128 && BARE_KEY_BYTE[b0]
    p = start + 1
    while p < n
      bb = s.getbyte(p)
      break unless bb && bb < 128 && BARE_KEY_BYTE[bb]
      p += 1
    end
    # Only commit fast path if next byte is ASCII (i.e. truly key end).
    nb = (p < n) ? s.getbyte(p) : nil
    if nb.nil? || nb < 128
      @pos = p
      key = s.byteslice(start, p - start).force_encoding(Encoding::UTF_8)
      took_fast = true
    end
  end
  # Quoted keys and keys containing non-ASCII characters go through the
  # general key parser; @pos has not been moved in that case.
  key = parse_key unless took_fast
  raise err("expected ':' after key") if @src.getbyte(@pos) != COLON
  @path.push(key)
  flush_pending_as_leading_on_current unless @pending_leading.empty?
  @pos += 1 # consume ':'
  b = @src.getbyte(@pos)
  if b == SP || b == TAB
    @pos += 1
    skip_inline_ws
    # Only enter the comment-capture loop if we see '/' (cheap byte check).
    capture_inner_block_comments if @src.getbyte(@pos) == SLASH
    nb = @src.getbyte(@pos)
    if nb.nil? || nb == LF || nb == CR
      # `key: ` followed only by whitespace/comments: the value is the
      # indented block on the next content line.
      consume_eol
      skip_trivia
      raise err("expected indented child block") if @pos >= @len
      child_indent = measure_line_indent
      raise err("expected indented child block") if child_indent <= parent_indent
      v = parse_block_value(child_indent)
      @path.pop
      return [key, v]
    end
    v = parse_inline_value_or_heredoc
    # Fast path peek: consume optional inline ws, then if next byte is
    # LF we just jump past it. Anything else (including comments) falls
    # back to the full consume_after_value, which needs to see the ws.
    s2 = @src
    p2 = @pos
    while (bb = s2.getbyte(p2)) == SP || bb == TAB
      p2 += 1
    end
    if bb == LF
      @pos = p2 + 1
      advance_line
      @path.pop
      return [key, v]
    end
    if bb.nil?
      # Value runs to end of input (optionally followed by blanks).
      @pos = p2
      @path.pop
      return [key, v]
    end
    # leave @pos before the ws so trailing-comment whitespace check sees it
    consume_after_value(false)
    @path.pop
    return [key, v]
  end
  if b.nil? || b == LF || b == CR
    # `key:` immediately followed by EOL: value is the indented child block.
    consume_eol
    skip_trivia
    raise err("expected indented child block") if @pos >= @len
    child_indent = measure_line_indent
    raise err("expected indented child block") if child_indent <= parent_indent
    v = parse_block_value(child_indent)
    @path.pop
    return [key, v]
  end
  raise err("expected whitespace after ':'")
end
1754
+
1755
+ # ---------- Keys ----------
1756
+
1757
# Parse a key at the cursor and return it as a String.
#
# A key starting with `"` is read as a basic string and one starting with
# `'` as a literal string; in both cases triple-quoted forms are rejected
# and recording of original literal forms (@record_forms) is switched off
# for the duration of the read, then restored. Any other input is handed
# to parse_bare_key. Raises a DecodeError at end of input.
def parse_key
  lead = @src.getbyte(@pos)
  if lead == DQUOTE || lead == SQUOTE
    double = lead == DQUOTE
    triple = double ? '"""' : "'''"
    raise err("triple-quoted strings are not allowed as keys") if starts_bytes?(triple)

    previous = @record_forms
    @record_forms = false
    begin
      return double ? parse_basic_string_value : parse_literal_string_value
    ensure
      @record_forms = previous
    end
  end
  raise err("expected key") if lead.nil?

  parse_bare_key
end
1782
+
1783
# Consume an unquoted key at the cursor and return it as a UTF-8 String.
#
# ASCII bytes are accepted while flagged in BARE_KEY_BYTE. A non-ASCII lead
# byte is decoded into a full character (length from utf8_char_len) and
# accepted while xid_continue? approves its code point
# (SPEC §"What counts as a bare key" — UAX #31 XID_Continue).
# Raises a DecodeError when no character could be consumed.
def parse_bare_key
  from = @pos
  cursor = from
  while cursor < @len
    byte = @src.getbyte(cursor)
    if byte < 128
      break unless BARE_KEY_BYTE[byte]

      cursor += 1
    else
      width = utf8_char_len(byte)
      char = @src.byteslice(cursor, width).force_encoding(Encoding::UTF_8)
      break unless xid_continue?(char.ord)

      cursor += width
    end
  end
  raise err("expected key") if cursor == from

  @pos = cursor
  @src.byteslice(from, cursor - from).force_encoding(Encoding::UTF_8)
end
1806
+
1807
+ # ---------- Value dispatch ----------
1808
+
1809
# Consume any run of `/* ... */` comments at the cursor, skipping inline
# whitespace after each one. Unless @lite is set, every comment is recorded
# in @comments as an :inner comment on a frozen snapshot of @path.
def capture_inner_block_comments
  while @src.getbyte(@pos) == SLASH && @src.getbyte(@pos + 1) == STAR
    raw = read_c_block_comment
    unless @lite
      @comments << AttachedComment.new(Comment.new(raw, :block), :inner, @path.dup.freeze)
    end
    skip_inline_ws
  end
end
1820
+
1821
# Parse the value that starts at the cursor and return it.
#
# Dispatch is on the first byte: quotes select basic/literal strings (or
# their triple-quoted heredoc forms), '[' and '{' select flow arrays and
# tables, 't'/'f' booleans, 'i' `inf`, 'n' `nan`, and a sign or digit a
# number or date/time. Plain decimal integers of at most 18 digits that
# are directly followed by a value terminator (or end of input) are
# converted inline without going through the general number scanner.
#
# Raises a DecodeError at end of input ("expected value") or when the
# first character cannot start any value.
def parse_inline_value_or_heredoc
  b = @src.getbyte(@pos)
  # Fast path: plain decimal integer. Most hot-loop benchmarks parse
  # millions of these, so we recognize "[0-9]+ <terminator>" inline,
  # skipping number_or_datetime's full lookahead/scanner setup.
  if b && b >= DIGIT0 && b <= DIGIT9
    s = @src
    n = @len
    start = @pos
    p = start + 1
    while p < n
      bb = s.getbyte(p)
      break unless bb >= DIGIT0 && bb <= DIGIT9
      p += 1
    end
    # If next byte is a non-numeric value terminator and the token
    # length is safely within i64 (<=18 digits) and not a date/time
    # prefix, take the fast path.
    len = p - start
    if len <= 18 && (p >= n || VALUE_TERMINATOR_BYTE[s.getbyte(p)])
      # Reject leading-zero on multi-digit (e.g. "012") via slow path.
      if !(s.getbyte(start) == DIGIT0 && len > 1)
        @pos = p
        return s.byteslice(start, len).to_i
      end
    end
  end
  case b
  when DQUOTE
    return parse_heredoc_basic if starts_bytes?('"""')
    return parse_basic_string_value
  when SQUOTE
    return parse_heredoc_literal if starts_bytes?("'''")
    v = parse_literal_string_value
    record_form(OriginalLiteral.string(StringForm.literal))
    return v
  when LBRACK
    return parse_flow_array
  when LBRACE
    return parse_flow_table
  when LOWER_T, LOWER_F_LETTER
    return parse_bool_value
  when 0x69 # 'i'
    return parse_inf_value
  when LOWER_N
    return parse_nan_value
  end
  if b && (b == PLUS || b == MINUS || (b >= DIGIT0 && b <= DIGIT9))
    return parse_number_or_datetime
  end
  raise err("expected value") if b.nil?
  # Report the whole character, not just its first byte: `b.chr` on a
  # non-ASCII lead byte yields a lone binary byte, which would turn the
  # message into an ASCII-8BIT string with a broken character in it.
  raise err("unexpected character '#{peek_char_byte_safe}' in value")
end
1874
+
1875
# Parse `true` or `false` at the cursor. The keyword must be followed by a
# value terminator byte or end of input; on success the cursor moves past
# it and the Ruby boolean is returned. Raises a DecodeError otherwise.
def parse_bool_value
  here = @pos
  width =
    if @src.byteslice(here, 4) == "true"
      4
    elsif @src.byteslice(here, 5) == "false"
      5
    end
  if width
    after = @src.getbyte(here + width)
    if after.nil? || VALUE_TERMINATOR_BYTE[after]
      @pos += width
      return width == 4
    end
  end
  raise err("expected value")
end
1894
+
1895
# Parse the keyword `inf` at the cursor. It must be followed by a value
# terminator byte or end of input; on success the cursor moves past it and
# Float::INFINITY is returned. Raises a DecodeError otherwise.
def parse_inf_value
  tail = @src.getbyte(@pos + 3)
  matched = @src.byteslice(@pos, 3) == "inf" && (tail.nil? || VALUE_TERMINATOR_BYTE[tail])
  raise err("expected 'inf'") unless matched

  @pos += 3
  Float::INFINITY
end
1905
+
1906
# Parse the keyword `nan` at the cursor. It must be followed by a value
# terminator byte or end of input; on success the cursor moves past it and
# Float::NAN is returned. Raises a DecodeError otherwise.
def parse_nan_value
  tail = @src.getbyte(@pos + 3)
  matched = @src.byteslice(@pos, 3) == "nan" && (tail.nil? || VALUE_TERMINATOR_BYTE[tail])
  raise err("expected 'nan'") unless matched

  @pos += 3
  Float::NAN
end
1916
+
1917
+ # ---------- Numbers & datetimes ----------
1918
+
1919
# True when the ten bytes starting at byte offset `p` have the shape
# `DDDD-DD-DD` (D = ASCII digit). Only the shape is checked, not whether
# the digits form a valid calendar date.
def looks_like_date_prefix_at?(p)
  return false if p + 10 > @len

  10.times do |offset|
    byte = @src.getbyte(p + offset)
    if offset == 4 || offset == 7
      return false unless byte == MINUS
    else
      return false unless DIGIT_BYTE[byte]
    end
  end
  true
end
1934
+
1935
# True when the eight bytes starting at byte offset `p` have the shape
# `DD:DD:DD` (D = ASCII digit). Only the shape is checked, not whether the
# digits form a valid time of day.
def looks_like_time_prefix_at?(p)
  return false if p + 8 > @len

  8.times do |offset|
    byte = @src.getbyte(p + offset)
    matches = offset == 2 || offset == 5 ? byte == 0x3A : DIGIT_BYTE[byte]
    return false unless matches
  end
  true
end
1948
+
1949
# Parse a value that starts with a sign or a digit: a date/time, a signed
# infinity, a float, or an integer.
#
# Unsigned input shaped like `DDDD-DD-DD` or `DD:DD:DD` is handed to the
# date-time / local-time parsers. `+inf` / `-inf` followed by a value
# terminator or end of input yields the signed infinity. Otherwise the
# numeric token is delimited by scan_number_token and converted by
# parse_float_lit or parse_integer_lit; their failures are re-raised as
# DecodeErrors located at the start of the token (the cursor only advances
# after a successful conversion).
#
# For integers, when @record_forms is set and the lexeme differs from the
# canonical `to_s` form (underscores, leading '+', radix prefixes, ...),
# the original lexeme is appended to @original_forms under the current
# path.
def parse_number_or_datetime
  s = @src
  p = @pos
  first = s.getbyte(p)
  starts_sign = first == PLUS || first == MINUS
  if !starts_sign && looks_like_date_prefix_at?(p)
    return parse_datetime_value
  end
  if !starts_sign && looks_like_time_prefix_at?(p)
    return parse_local_time_value
  end
  if starts_sign && s.byteslice(p + 1, 3) == "inf"
    after = s.getbyte(p + 4)
    if after.nil? || VALUE_TERMINATOR_BYTE[after]
      neg = first == MINUS
      @pos += 4
      return neg ? -Float::INFINITY : Float::INFINITY
    end
  end
  # tok_len is the byte length of the numeric token starting at p.
  tok_len, is_float = scan_number_token
  lex = s.byteslice(p, tok_len).force_encoding(Encoding::UTF_8)
  if is_float
    f =
      begin
        parse_float_lit(lex)
      rescue StandardError => e
        raise err("invalid float: #{lex} (#{e.message})")
      end
    @pos += tok_len
    return f
  end
  n =
    begin
      parse_integer_lit(lex)
    rescue StandardError => e
      raise err(e.message)
    end
  @pos += tok_len
  # Record original lexeme only if it differs from canonical form.
  # Fast cheap test: if lex contains '_', '+', or starts with '0' followed
  # by a non-digit (hex/oct/bin prefix marker), it's non-canonical. Simple
  # decimal integers like "42" / "-7" map directly to n.to_s and need no
  # entry; skip the to_s allocation in that common case.
  if @record_forms
    bs = lex.bytesize
    first = lex.getbyte(0)
    possibly_non_canonical =
      lex.include?("_") || first == PLUS ||
      (first == DIGIT0 && bs > 1) ||
      (first == MINUS && bs > 1 && lex.getbyte(1) == DIGIT0)
    if possibly_non_canonical && lex != n.to_s
      @original_forms << [@path.dup.freeze, OriginalLiteral.integer(lex)]
    end
  end
  n
end
2005
+
2006
# Delimit the numeric token that starts at the cursor without moving it.
#
# Returns `[length_in_bytes, is_float]`. An optional leading sign is
# included in the length. A token that continues with `0x`, `0o` or `0b`
# is scanned as a prefixed literal: hex-digit bytes and underscores, at
# most one '.', and at most one 'p' exponent marker with an optional sign.
# Any other token is scanned as decimal: digits and underscores, at most
# one '.', and at most one 'e'/'E' exponent marker with an optional sign.
# `is_float` is true when a '.' or an exponent marker was seen.
#
# This is a lexical scan only: digit validity for the radix, underscore
# placement and range are checked later by the literal parsers.
def scan_number_token
  s = @src
  n = @len
  i = @pos
  start = i
  first = s.getbyte(i)
  if first == PLUS || first == MINUS
    i += 1
  end
  is_prefixed = false
  if i + 1 < n && s.getbyte(i) == DIGIT0
    nb = s.getbyte(i + 1)
    if nb == LOWER_X || nb == LOWER_O || nb == LOWER_B
      is_prefixed = true
    end
  end
  saw_dot = false; saw_p = false; saw_e = false
  if is_prefixed
    i += 2 # skip the two-byte radix prefix
    while i < n
      b = s.getbyte(i)
      if b == UNDERSCORE || HEX_BYTE[b]
        i += 1
      elsif b == DOT && !saw_dot && !saw_p
        saw_dot = true; i += 1
      elsif b == LOWER_P && !saw_p
        saw_p = true; i += 1
        # Optional sign directly after the exponent marker.
        nb = s.getbyte(i)
        if nb == PLUS || nb == MINUS
          i += 1
        end
      elsif saw_p && DIGIT_BYTE[b]
        # NOTE(review): presumably unreachable if HEX_BYTE already flags
        # the decimal digits — confirm against the table definition.
        i += 1
      else
        break
      end
    end
    return [i - start, saw_dot || saw_p]
  end
  while i < n
    b = s.getbyte(i)
    if DIGIT_BYTE[b] || b == UNDERSCORE
      i += 1
    elsif b == DOT && !saw_dot && !saw_e
      saw_dot = true; i += 1
    elsif (b == LOWER_E || b == UPPER_E) && !saw_e
      saw_e = true; i += 1
      # Optional sign directly after the exponent marker.
      nb = s.getbyte(i)
      if nb == PLUS || nb == MINUS
        i += 1
      end
    else
      break
    end
  end
  [i - start, saw_dot || saw_e]
end
2063
+
2064
+ # ---------- Numeric helpers ----------
2065
+
2066
# True when the underscores in `s` are all digit separators: none at the
# start, none at the end, and never two in a row. An empty string is
# considered valid.
def valid_underscores?(s)
  return true if s.empty?
  return false if s.start_with?("_") || s.end_with?("_")

  !s.include?("__")
end
2084
+
2085
# Bounds of a signed 64-bit integer; literals outside are rejected.
INT64_MIN = -(2**63)
INT64_MAX = 2**63 - 1

# Convert an integer lexeme to an Integer.
#
# Accepts an optional sign, decimal digits or a lowercase `0x` / `0o` /
# `0b` prefixed body, and single underscores between digits. Raises a
# RuntimeError (message only; callers wrap it with position information)
# for an empty body, an uppercase `0X` prefix, leading zeros on a decimal
# literal, misplaced underscores, digits invalid for the radix, or a value
# outside the signed 64-bit range.
def parse_integer_lit(s)
  # Fast path: optional '-' followed by 1..18 plain decimal digits. Such a
  # value always fits in 64 bits, so neither the general validation loop
  # nor the range check is needed.
  total = s.bytesize
  offset = s.getbyte(0) == MINUS ? 1 : 0
  digits = total - offset
  if digits.between?(1, 18)
    idx = offset
    idx += 1 while idx < total && (byte = s.getbyte(idx)) >= DIGIT0 && byte <= DIGIT9
    if idx == total
      if digits > 1 && s.getbyte(offset) == DIGIT0
        raise "leading zeros are not allowed on decimal integers"
      end
      return s.to_i
    end
  end

  # Slow path: signs, radix prefixes, underscores and long literals.
  sign = s.start_with?("-") ? -1 : 1
  rest = s.start_with?("-", "+") ? s[1..] : s
  raise "hex prefix must be lowercase '0x'" if rest.start_with?("0X")

  radix =
    if rest.start_with?("0x") then 16
    elsif rest.start_with?("0o") then 8
    elsif rest.start_with?("0b") then 2
    else 10
    end
  body = radix == 10 ? rest : rest[2..]
  raise "empty number" if body.empty?
  raise "underscore must be between digits" if body.start_with?("_") || body.end_with?("_")
  if radix == 10 && rest.length > 1 && rest.start_with?("0")
    raise "leading zeros are not allowed on decimal integers"
  end

  allowed = "0123456789abcdef"[0, radix]
  cleaned = +""
  after_digit = false
  body.each_char do |ch|
    if ch == "_"
      raise "underscore must be between digits" unless after_digit

      after_digit = false
    else
      raise "invalid digit '#{ch}' for base #{radix}" unless allowed.include?(ch.downcase)

      cleaned << ch
      after_digit = true
    end
  end
  raise "underscore must be between digits" unless after_digit

  value = sign * cleaned.to_i(radix)
  raise "integer out of i64 range" if value < INT64_MIN || value > INT64_MAX

  value
end
2163
+
2164
# Converts the text of a float literal into a Float.
#
# Strips one optional leading sign, delegates the unsigned remainder to
# parse_nondec_float (for "0x" / "0o" / "0b" prefixes) or parse_dec_float
# (everything else), and multiplies the result by the sign.
#
# @param s [String] literal text, optionally signed
# @return [Float]
def parse_float_lit(s)
  negative = s.start_with?("-")
  unsigned = (negative || s.start_with?("+")) ? s[1..] : s
  magnitude =
    if unsigned.start_with?("0x", "0o", "0b")
      parse_nondec_float(unsigned)
    else
      parse_dec_float(unsigned)
    end
  (negative ? -1.0 : 1.0) * magnitude
end
2180
+
2181
# Parses an unsigned decimal float literal such as "1.5", "1_000.25" or
# "6.02e+23".
#
# The mantissa must contain a '.' with at least one digit on each side;
# underscores in the mantissa are validated by valid_underscores? and then
# removed. An optional exponent follows the first 'e' / 'E': one optional
# sign, then one or more decimal digits, with no underscores.
#
# @param s [String] literal text without a leading sign
# @return [Float]
# @raise [RuntimeError] with a descriptive message for any malformed part
def parse_dec_float(s)
  e_idx = s.index(/[eE]/)
  m = e_idx ? s[0, e_idx] : s
  e = e_idx ? s[(e_idx + 1)..] : nil

  raise "decimal float requires '.'" unless m.include?(".")
  ip, fp = m.split(".", 2)
  raise "decimal float requires digit on both sides of '.'" if ip.empty? || fp.nil? || fp.empty?
  mantissa_char = ->(c) { c == "_" || (c >= "0" && c <= "9") }
  raise "invalid character in mantissa" unless ip.each_char.all?(&mantissa_char)
  raise "invalid character in mantissa" unless fp.each_char.all?(&mantissa_char)
  raise "bad underscore in mantissa" unless valid_underscores?(ip) && valid_underscores?(fp)

  full = ip.delete("_") + "." + fp.delete("_")
  if e
    exp_digits = e.sub(/\A[+-]/, "")
    raise "underscore not allowed in exponent" if exp_digits.include?("_")
    raise "empty exponent" if exp_digits.empty?
    # After removing the single leading sign only digits may remain. This
    # rejects a misplaced sign ("1+2", "+-5") with the parser's own message
    # instead of letting Float() fail with a raw ArgumentError.
    raise "invalid character in exponent" unless exp_digits.each_char.all? { |c| c >= "0" && c <= "9" }
    full = "#{full}e#{e}"
  end
  Float(full)
end
2207
+
2208
# Evaluates an unsigned binary / octal / hexadecimal float literal of the
# form <prefix><mantissa>p<exponent>, e.g. "0x1.8p3".
#
# The mantissa is written in the prefix's radix and may contain a '.' (with
# a digit on both sides) and underscores (validated by valid_underscores?).
# The exponent is a *decimal* power of two: one optional sign followed by
# decimal digits, without underscores.
#
# @param s [String] literal text starting with "0x", "0o" or "0b"
# @return [Float] mantissa * 2**exponent
# @raise [RuntimeError] with a descriptive message for any malformed part
def parse_nondec_float(s)
  if s.start_with?("0x")
    radix = 16
  elsif s.start_with?("0o")
    radix = 8
  elsif s.start_with?("0b")
    radix = 2
  else
    raise "non-decimal float prefix required"
  end
  rest = s[2..]

  p_idx = rest.index("p")
  raise "non-decimal float requires 'p' exponent" if p_idx.nil?
  mant = rest[0, p_idx]
  exp_str = rest[(p_idx + 1)..]
  raise "empty exponent" if exp_str.nil? || exp_str.empty?
  raise "underscore not allowed in exponent" if exp_str.include?("_")
  raise "invalid exponent character" unless exp_str.each_char.all? { |c| c == "+" || c == "-" || (c >= "0" && c <= "9") }
  # Only one leading sign is allowed; whatever follows it must be digits.
  exp_digits = exp_str.sub(/\A[+-]/, "")
  raise "empty exponent" if exp_digits.empty?
  raise "invalid exponent character" unless exp_digits.each_char.all? { |c| c >= "0" && c <= "9" }
  # Explicit base 10: without it Integer() treats a leading "0" as an octal
  # prefix, so "p010" would mean 8 and "p08" would raise.
  exp = Integer(exp_str, 10)

  if mant.include?(".")
    ip, fp = mant.split(".", 2)
    raise "digit required on both sides of '.'" if ip.empty? || fp.nil? || fp.empty?
  else
    ip = mant
    fp = ""
  end
  raise "bad underscore in mantissa" unless valid_underscores?(ip) && valid_underscores?(fp)
  ip_clean = ip.delete("_")
  fp_clean = fp.delete("_")
  digit_chars = "0123456789abcdef"[0, radix]
  raise "invalid digit for base #{radix}" unless ip_clean.each_char.all? { |c| digit_chars.include?(c.downcase) }
  raise "invalid digit for base #{radix}" unless fp_clean.each_char.all? { |c| digit_chars.include?(c.downcase) }

  int_val = ip_clean.empty? ? 0 : ip_clean.to_i(radix)
  # Each fractional digit contributes digit / radix**position.
  frac_val = 0.0
  div = radix.to_f
  fp_clean.each_char do |c|
    frac_val += c.to_i(radix) / div
    div *= radix
  end
  (int_val + frac_val) * (2.0 ** exp)
end
2248
+
2249
# Number of days in month +m+ of year +y+ under Gregorian leap-year rules.
#
# @param y [Integer] year
# @param m [Integer] month, 1..12
# @return [Integer] 28..31, or 0 when +m+ is not a valid month number
def days_in_month(y, m)
  return 31 if [1, 3, 5, 7, 8, 10, 12].include?(m)
  return 30 if [4, 6, 9, 11].include?(m)
  return 0 unless m == 2

  # Leap year: divisible by 4 and not by 100, or divisible by 400.
  divisible_by_400 = (y % 400).zero?
  leap = divisible_by_400 || ((y % 4).zero? && !(y % 100).zero?)
  leap ? 29 : 28
end
2259
+
2260
# Checks that +s+ is a calendar-valid "YYYY-MM-DD" date.
#
# @param s [String] exactly ten characters
# @return [nil] when the date is valid
# @raise [RuntimeError] "invalid date format", "date must be all digits",
#   "month out of range" or "day out of range"
def validate_date(s)
  shaped = s.length == 10 && s[4] == "-" && s[7] == "-"
  raise "invalid date format" unless shaped

  year_s, month_s, day_s = s[0, 4], s[5, 2], s[8, 2]
  unless "#{year_s}#{month_s}#{day_s}".each_char.all? { |ch| ch.between?("0", "9") }
    raise "date must be all digits"
  end

  month = month_s.to_i
  raise "month out of range" unless month.between?(1, 12)
  # days_in_month accounts for leap years.
  raise "day out of range" unless day_s.to_i.between?(1, days_in_month(year_s.to_i, month))
end
2270
+
2271
# Checks that +s+ is a valid "HH:MM:SS" wall-clock time.
#
# @param s [String] exactly eight characters
# @return [nil] when the time is valid
# @raise [RuntimeError] "invalid time format", "time must be all digits", or
#   an out-of-range message for hour (> 23), minute (> 59) or second (> 59)
def validate_time(s)
  shaped = s.length == 8 && s[2] == ":" && s[5] == ":"
  raise "invalid time format" unless shaped

  hour_s, minute_s, second_s = s[0, 2], s[3, 2], s[6, 2]
  unless "#{hour_s}#{minute_s}#{second_s}".each_char.all? { |ch| ch.between?("0", "9") }
    raise "time must be all digits"
  end

  raise "hour out of range" if hour_s.to_i > 23
  raise "minute out of range" if minute_s.to_i > 59
  raise "second out of range (leap seconds not supported)" if second_s.to_i > 59
end
2282
+
2283
# Parses a date or date-time starting at @pos and advances @pos past it.
#
# Recognised forms:
#   YYYY-MM-DD                                  -> LocalDate
#   YYYY-MM-DDTHH:MM:SS[.fraction]              -> LocalDateTime
#   YYYY-MM-DDTHH:MM:SS[.fraction](Z|z|±HH:MM)  -> OffsetDateTime
# The fraction is 1 to 9 digits. A lowercase 't' separator, or a space
# followed by a digit, is rejected with a dedicated message.
#
# @return [LocalDate, LocalDateTime, OffsetDateTime] built from the matched text
# @raise the error built by err(...) when the value is malformed
def parse_datetime_value
  # Look only at a bounded window instead of the whole remaining source. The
  # longest value is 35 bytes (date, 'T', time, '.', 9 digits, ±HH:MM), so
  # the first 36 bytes are always included. The window then extends to the
  # end of the current line, because the space-skipping and digit-counting
  # loops below never look past a line break. This keeps the cost
  # proportional to the line length rather than to the rest of the file.
  window_end = @pos + 36
  if window_end >= @len
    window_end = @len
  else
    while window_end < @len
      wb = @src.getbyte(window_end)
      break if wb == LF || wb == CR
      window_end += 1
    end
  end
  rest = @src.byteslice(@pos, window_end - @pos).force_encoding(Encoding::UTF_8)

  date = rest[0, 10]
  begin
    validate_date(date)
  rescue StandardError => e
    raise err(e.message)
  end

  rest2 = rest[10..]
  if !rest2.start_with?("T") && !rest2.start_with?(" ")
    # Bare date: whatever follows must be a delimiter.
    if rest2.start_with?("t")
      raise err("date and time separator must be uppercase 'T' (lowercase 't' not permitted)")
    end
    after = rest2[0]
    unless after.nil? || after == " " || after == "\t" || after == "\n" || after == "\r" ||
           after == "#" || after == "/" || after == "," || after == "]" || after == "}"
      raise err("invalid character after date")
    end
    @pos += 10
    return LocalDate.new(date)
  end

  if rest2.start_with?(" ")
    # A date followed by blanks is a bare date unless a digit comes next,
    # which would be a time written with a space instead of 'T'.
    i = 0
    while i < rest2.length && (rest2[i] == " " || rest2[i] == "\t")
      i += 1
    end
    if i < rest2.length && rest2[i] >= "0" && rest2[i] <= "9"
      raise err("date and time must be separated by 'T' (space not permitted)")
    end
    @pos += 10
    return LocalDate.new(date)
  end

  after_t = rest2[1..]
  raise err("expected HH:MM:SS after 'T'") unless looks_like_time_str?(after_t)
  time_str = after_t[0, 8]
  begin
    validate_time(time_str)
  rescue StandardError => e
    raise err(e.message)
  end

  # Everything consumed so far has been validated as ASCII, so character
  # counts and byte counts coincide.
  consumed = 10 + 1 + 8
  after_time = rest[consumed..]
  frac_len = 0
  if after_time.start_with?(".")
    k = 1
    while k < after_time.length && after_time[k] >= "0" && after_time[k] <= "9"
      k += 1
    end
    digits = k - 1
    raise err("expected fractional digits after '.'") if digits == 0
    raise err("fractional seconds limited to 9 digits (nanosecond precision)") if digits > 9
    frac_len = k
  end
  consumed += frac_len

  after_frac = rest[consumed..]
  if after_frac.start_with?("Z") || after_frac.start_with?("z")
    consumed += 1
    s = rest[0, consumed]
    @pos += consumed
    return OffsetDateTime.new(s)
  end

  if after_frac.start_with?("+") || after_frac.start_with?("-")
    if after_frac.length < 6 ||
       !(after_frac[1] >= "0" && after_frac[1] <= "9") ||
       !(after_frac[2] >= "0" && after_frac[2] <= "9") ||
       after_frac[3] != ":" ||
       !(after_frac[4] >= "0" && after_frac[4] <= "9") ||
       !(after_frac[5] >= "0" && after_frac[5] <= "9")
      raise err("invalid offset; expected ±HH:MM")
    end
    oh = after_frac[1, 2].to_i
    om = after_frac[4, 2].to_i
    raise err("offset out of range") if oh > 23 || om > 59
    consumed += 6
    s = rest[0, consumed]
    @pos += consumed
    return OffsetDateTime.new(s)
  end

  after = after_frac[0]
  unless after.nil? || after == " " || after == "\t" || after == "\n" || after == "\r" ||
         after == "#" || after == "/" || after == "," || after == "]" || after == "}"
    raise err("invalid character after datetime")
  end
  s = rest[0, consumed]
  @pos += consumed
  LocalDateTime.new(s)
end
2370
+
2371
# Reports whether +s+ begins with the shape "DD:DD:DD" (D = ASCII digit).
# Only the shape is tested; range checks are left to validate_time.
#
# @param s [String]
# @return [Boolean]
def looks_like_time_str?(s)
  return false unless s.length >= 8

  digits_ok = [0, 1, 3, 4, 6, 7].all? { |i| s[i].between?("0", "9") }
  digits_ok && s[2] == ":" && s[5] == ":"
end
2377
+
2378
# Parses a local time "HH:MM:SS[.fraction]" starting at @pos and advances
# @pos past it. The fraction is 1 to 9 digits, and the value must be
# followed by end of input or a delimiter (blank, line break, '#', '/',
# ',', ']' or '}').
#
# @return [LocalTime] built from the matched text
# @raise the error built by err(...) when the value is malformed
def parse_local_time_value
  # Look only at a bounded window instead of the whole remaining source. The
  # longest value is 18 bytes (time, '.', 9 digits) plus the delimiter after
  # it, so the first 20 bytes are always included. The window then extends
  # to the end of the current line, because the digit-counting loop never
  # looks past a line break. This keeps the cost independent of how much
  # source follows.
  window_end = @pos + 20
  if window_end >= @len
    window_end = @len
  else
    while window_end < @len
      wb = @src.getbyte(window_end)
      break if wb == LF || wb == CR
      window_end += 1
    end
  end
  rest = @src.byteslice(@pos, window_end - @pos).force_encoding(Encoding::UTF_8)

  time_str = rest[0, 8]
  begin
    validate_time(time_str)
  rescue StandardError => e
    raise err(e.message)
  end

  # The validated prefix is ASCII, so character and byte counts coincide.
  consumed = 8
  after = rest[consumed..]
  if after.start_with?(".")
    k = 1
    while k < after.length && after[k] >= "0" && after[k] <= "9"
      k += 1
    end
    digits = k - 1
    raise err("expected fractional digits after '.'") if digits == 0
    raise err("fractional seconds limited to 9 digits") if digits > 9
    consumed += k
  end

  nxt = rest[consumed..][0]
  unless nxt.nil? || nxt == " " || nxt == "\t" || nxt == "\n" || nxt == "\r" ||
         nxt == "#" || nxt == "/" || nxt == "," || nxt == "]" || nxt == "}"
    raise err("invalid character after time")
  end
  s = rest[0, consumed]
  @pos += consumed
  LocalTime.new(s)
end
2408
+
2409
+ # ---------- Strings ----------
2410
+
2411
+ def parse_basic_string_value
2412
+ sl = @line; sls = @line_start; sp = @pos
2413
+ @pos += 1 # opening "
2414
+ out = +""
2415
+ out.force_encoding(Encoding::UTF_8)
2416
+ s = @src
2417
+ n = @len
2418
+ run_start = @pos
2419
+ loop do
2420
+ b = s.getbyte(@pos)
2421
+ if b.nil?
2422
+ raise err_at(sl, sls, sp, "unterminated string")
2423
+ elsif b == LF || b == CR
2424
+ raise err("strings cannot span lines")
2425
+ elsif b == DQUOTE
2426
+ out << s.byteslice(run_start, @pos - run_start).force_encoding(Encoding::UTF_8) if @pos > run_start
2427
+ @pos += 1
2428
+ # NFC re-normalize after escape decoding (only needed if escapes used).
2429
+ return out.ascii_only? ? out : out.unicode_normalize(:nfc)
2430
+ elsif b == BACKSLASH
2431
+ out << s.byteslice(run_start, @pos - run_start).force_encoding(Encoding::UTF_8) if @pos > run_start
2432
+ @pos += 1
2433
+ esc = s.getbyte(@pos)
2434
+ @pos += 1 unless esc.nil?
2435
+ case esc
2436
+ when DQUOTE then out << '"'
2437
+ when BACKSLASH then out << "\\"
2438
+ when LOWER_N then out << "\n"
2439
+ when LOWER_T then out << "\t"
2440
+ when LOWER_R then out << "\r"
2441
+ when 0x62 then out << "\b"
2442
+ when LOWER_F_LETTER then out << "\f"
2443
+ when LOWER_U then out << read_hex_codepoint(4)
2444
+ when UPPER_U then out << read_hex_codepoint(8)
2445
+ when nil then raise err("unterminated escape")
2446
+ else raise err("invalid escape '\\#{esc.chr}'")
2447
+ end
2448
+ run_start = @pos
2449
+ else
2450
+ @pos += 1
2451
+ end
2452
+ end
2453
+ end
2454
+
2455
# Parses a single-quoted literal string starting at the opening quote (@pos)
# and leaves @pos just past the closing quote. No escapes are processed: the
# bytes between the quotes are returned verbatim.
#
# @return [String] UTF-8 text between the quotes
# @raise the error built by err / err_at when the string contains a line
#   break or is not terminated before end of input
def parse_literal_string_value
  open_line = @line
  open_line_start = @line_start
  open_pos = @pos
  @pos += 1 # opening '
  body_from = @pos
  src = @src
  limit = @len
  until @pos >= limit
    case src.getbyte(@pos)
    when SQUOTE
      text = src.byteslice(body_from, @pos - body_from).force_encoding(Encoding::UTF_8)
      @pos += 1
      return text
    when LF, CR
      raise err("strings cannot span lines")
    end
    @pos += 1
  end
  raise err_at(open_line, open_line_start, open_pos, "unterminated string")
end
2475
+
2476
# Decodes the +n+ hex digits at @pos (the payload of a \u or \U escape) into
# a one-character UTF-8 string and advances @pos past them.
#
# @param n [Integer] number of hex digits expected
# @return [String] the decoded character
# @raise the error built by err(...) when fewer than +n+ bytes remain, a
#   byte is not a hex digit, the value is zero, the value is a surrogate
#   (U+D800..U+DFFF), or the value cannot be encoded as UTF-8
def read_hex_codepoint(n)
  raise err("expected #{n} hex digits in unicode escape") if @pos + n > @len

  digits = @src.byteslice(@pos, n).force_encoding(Encoding::UTF_8)
  n.times do |offset|
    next if HEX_BYTE[@src.getbyte(@pos + offset)]

    raise err("invalid hex in unicode escape: #{digits}")
  end
  codepoint = digits.to_i(16)
  @pos += n

  # SPEC: U+0000 is forbidden anywhere in DMS source, including via escape
  # decoding. `\u0000` / `\U00000000` must not slip through.
  raise err("\\u0000 escape forbidden") if codepoint.zero?
  if codepoint.between?(0xD800, 0xDFFF)
    raise err(format("surrogate codepoint U+%04X in escape", codepoint))
  end

  begin
    codepoint.chr(Encoding::UTF_8)
  rescue RangeError
    raise err("unicode escape is not a scalar value")
  end
end
2504
+
2505
+ # ---------- Heredocs ----------
2506
+
2507
# Raw heredoc body as produced by collect_heredoc_body.
#   lines       - Array of [text, line_no, line_start] triples, one per body line
#   strip_depth - number of leading spaces on the terminator line
HBody = Struct.new(:lines, :strip_depth)
2508
+
2509
# Parses a basic (""") heredoc whose three-byte opener starts at @pos.
#
# Steps: skip the opener, read the optional label and modifier calls,
# require end of line, collect the body up to the terminator (the label, or
# """ when there is no label), reject surrogate escapes, strip indentation
# and splice line continuations, apply the modifiers, and record the
# original literal form.
#
# @return [String] processed body; NFC-normalized when it is not pure ASCII
# @raise the error built by err(...) when the opener line has trailing
#   content or a modifier fails; errors from the helpers propagate
def parse_heredoc_basic
  @pos += 3
  label = parse_heredoc_label
  modifiers = parse_heredoc_modifiers
  skip_inline_ws
  opener_closed = consume_eol || eof?
  raise err("heredoc opener must be followed by end of line") unless opener_closed

  body = collect_heredoc_body(label.empty? ? '"""' : label)
  # SPEC §basic-string escapes: surrogate codepoints (U+D800..U+DFFF) are not
  # valid Unicode scalars and are a parse error in `\uXXXX` / `\UXXXXXXXX`
  # escapes, so basic-heredoc bodies get the same rejection as basic strings.
  validate_heredoc_basic_surrogates(body)
  text = strip_indent_and_continuations(body, true)
  begin
    text = apply_modifiers(text, modifiers)
  rescue StandardError => e
    # Re-raise modifier failures through err(...).
    raise err(e.message)
  end

  calls = modifiers.map { |mod| HeredocModifierCall.new(mod[:name], mod[:args]) }
  form = StringForm.heredoc(:basic_triple, label.empty? ? nil : label, calls)
  record_form(OriginalLiteral.string(form))
  return text if text.ascii_only?

  text.unicode_normalize(:nfc)
end
2534
+
2535
# SPEC §basic-string escapes: a `\uXXXX` / `\UXXXXXXXX` escape whose decoded
# value falls in the surrogate range U+D800..U+DFFF is a parse error. Heredoc
# bodies are collected raw, so each line is scanned here.
#
# An escape only counts when it is introduced by an odd-length run of
# backslashes (an even run is made of escaped backslashes) and is followed by
# the full number of hex digits (4 for `u`, 8 for `U`).
#
# @param body [HBody] collected heredoc body
# @raise [DecodeError] at the line of the offending escape; the column is the
#   1-based byte offset of the introducing backslash within that line
def validate_heredoc_basic_surrogates(body)
  body.lines.each do |text, line_no, _line_start|
    raw = text.b
    size = raw.bytesize
    pos = 0
    while pos < size
      unless raw.getbyte(pos) == BACKSLASH
        pos += 1
        next
      end

      # Measure the whole run of backslashes, then resume after it.
      run_end = pos
      run_end += 1 while run_end < size && raw.getbyte(run_end) == BACKSLASH
      backslashes = run_end - pos
      pos = run_end
      next unless backslashes.odd? && run_end < size

      width =
        case raw.getbyte(run_end)
        when LOWER_U then 4
        when UPPER_U then 8
        end
      next if width.nil? || run_end + 1 + width > size

      digits = raw.byteslice(run_end + 1, width)
      next unless digits.each_byte.all? { |byte| HEX_BYTE[byte] }

      cp = digits.to_i(16)
      next unless cp >= 0xD800 && cp <= 0xDFFF

      # run_end is the 0-based index of the `u`/`U`, which equals the 1-based
      # column of the backslash in front of it.
      raise DecodeError.new(line_no, run_end,
                            format("surrogate codepoint U+%04X in escape", cp))
    end
  end
end
2586
+
2587
# Parses a literal (''') heredoc whose three-byte opener starts at @pos.
#
# Same pipeline as the basic heredoc, but with no surrogate-escape scan, no
# line-continuation splicing, and no normalization of the result.
#
# @return [String] processed body text
# @raise the error built by err(...) when the opener line has trailing
#   content or a modifier fails; errors from the helpers propagate
def parse_heredoc_literal
  @pos += 3
  label = parse_heredoc_label
  modifiers = parse_heredoc_modifiers
  skip_inline_ws
  opener_closed = consume_eol || eof?
  raise err("heredoc opener must be followed by end of line") unless opener_closed

  body = collect_heredoc_body(label.empty? ? "'''" : label)
  text = strip_indent_and_continuations(body, false)
  begin
    text = apply_modifiers(text, modifiers)
  rescue StandardError => e
    # Re-raise modifier failures through err(...).
    raise err(e.message)
  end

  calls = modifiers.map { |mod| HeredocModifierCall.new(mod[:name], mod[:args]) }
  form = StringForm.heredoc(:literal_triple, label.empty? ? nil : label, calls)
  record_form(OriginalLiteral.string(form))
  text
end
2607
+
2608
# Reads the optional heredoc label at @pos: one byte accepted by
# LABEL_START_BYTE followed by any number of bytes accepted by
# LABEL_CONT_BYTE. @pos ends just past the label, or is left untouched when
# there is none.
#
# @return [String] the label, or "" when no label starts at @pos
def parse_heredoc_label
  first = @src.getbyte(@pos)
  return "" unless first && LABEL_START_BYTE[first]

  from = @pos
  loop do
    byte = @src.getbyte(@pos)
    break unless byte && LABEL_CONT_BYTE[byte]

    @pos += 1
  end
  @src.byteslice(from, @pos - from).force_encoding(Encoding::UTF_8)
end
2617
+
2618
# Reads zero or more modifier calls that follow a heredoc opener or label.
# Each modifier must be separated from what precedes it by inline
# whitespace. When the next token is not a modifier, @pos is rewound to
# before the whitespace that was skipped.
#
# @return [Array<Hash>] one { name:, args: } hash per modifier, in source order
# @raise the error built by err(...) when a modifier directly abuts the
#   previous token
def parse_heredoc_modifiers
  collected = []
  loop do
    mark = @pos
    skip_inline_ws
    lead = @src.getbyte(@pos)
    unless lead && LABEL_START_BYTE[lead]
      # Not a modifier: give the skipped whitespace back to the caller.
      @pos = mark
      return collected
    end
    raise err("modifier must be preceded by whitespace") unless @pos > mark

    collected << parse_one_modifier
  end
end
2634
+
2635
# Parses one modifier call `name(args...)` starting at @pos.
#
# Original-form recording is switched off while the arguments are parsed
# and restored afterwards, even when argument parsing raises.
#
# @return [Hash] { name: String, args: Array }
# @raise the error built by err(...) when the name is not followed by '('
def parse_one_modifier
  name_from = @pos
  @pos += 1 while (byte = @src.getbyte(@pos)) && LABEL_CONT_BYTE[byte]
  name = @src.byteslice(name_from, @pos - name_from).force_encoding(Encoding::UTF_8)
  raise err("modifiers require parentheses") unless @src.getbyte(@pos) == 0x28 # '('

  @pos += 1
  previous = @record_forms
  @record_forms = false
  args =
    begin
      parse_modifier_call_args
    ensure
      @record_forms = previous
    end
  { name: name, args: args }
end
2652
+
2653
# Parses the comma-separated argument list of a modifier call. @pos is
# expected just past the '(' and ends just past the matching ')'. An empty
# list and a trailing comma before ')' are both accepted.
#
# @return [Array] parsed argument values in source order
# @raise the error built by err(...) when input ends or something other than
#   ',' or ')' follows an argument
def parse_modifier_call_args
  collected = []
  loop do
    skip_inline_ws
    case @src.getbyte(@pos)
    when 0x29 # ')'
      @pos += 1
      return collected
    when nil
      raise err("expected ',' or ')' in modifier args")
    end

    collected << parse_inline_value_or_heredoc
    skip_inline_ws
    case @src.getbyte(@pos)
    when COMMA
      @pos += 1
    when 0x29 # ')'
      @pos += 1
      return collected
    else
      raise err("expected ',' or ')' in modifier args")
    end
  end
end
2677
+
2678
# Collects raw heredoc body lines starting at @pos until a line whose
# stripped text equals +terminator+. On return @pos sits at the end of the
# terminator line, before its line break.
#
# @param terminator [String] label, or the triple-quote opener
# @return [HBody] body lines as [text, line_no, line_start] triples, plus the
#   number of leading spaces on the terminator line
# @raise the error built by err_at(...) at the position where collection
#   began when input ends before the terminator
def collect_heredoc_body(terminator)
  collected = []
  open_line = @line
  open_line_start = @line_start
  open_pos = @pos
  src = @src
  limit = @len
  loop do
    raise err_at(open_line, open_line_start, open_pos, "unterminated heredoc") if @pos >= limit

    from = @pos
    @pos += 1 while @pos < limit && (byte = src.getbyte(@pos)) != LF && byte != CR
    raw = src.byteslice(from, @pos - from).force_encoding(Encoding::UTF_8)
    # Capture the line coordinates before consume_eol moves on.
    row = [raw, @line, @line_start]

    if raw.strip == terminator
      depth = raw.each_char.take_while { |ch| ch == " " }.length
      return HBody.new(collected, depth)
    end

    consume_eol
    collected << row
  end
end
2707
+
2708
+ # ---------- Heredoc body processing ----------
2709
+
2710
# Turns a collected heredoc body into its final text.
#
# For each line: lines made only of spaces/tabs become empty; every other
# line must start with at least body.strip_depth spaces, which are removed.
# Lines are joined with "\n". With +allow_cont+, a line whose last
# non-blank character is an unescaped backslash (an odd-length trailing run)
# is spliced onto the next line: the backslash and trailing blanks are
# dropped and the next line's leading blanks are removed.
#
# @param body [HBody] collected body
# @param allow_cont [Boolean] whether backslash line continuations are honoured
# @return [String] UTF-8 text
# @raise [DecodeError] when a line is indented less than the strip depth, or
#   when the last line ends with a continuation
def strip_indent_and_continuations(body, allow_cont)
  result = +""
  result.force_encoding(Encoding::UTF_8)
  depth = body.strip_depth
  at_first = true
  joining = false # true while the previous line ended with a continuation
  last_line_no = 1

  body.lines.each do |text, line_no, _line_start|
    last_line_no = line_no
    blank = text.each_char.all? { |ch| ch == " " || ch == "\t" }
    if blank
      content = ""
    else
      indent = text.each_char.take_while { |ch| ch == " " }.length
      if indent < depth
        raise DecodeError.new(line_no, indent + 1,
                              "heredoc body line indented #{indent} spaces, less than strip depth #{depth}")
      end
      content = text[depth..]
    end

    continues = false
    if allow_cont
      tail_trimmed = content.sub(/[ \t]+\z/, "")
      # An odd-length trailing run of backslashes ends in an unescaped one.
      if tail_trimmed.end_with?("\\") && tail_trimmed[/\\+\z/].length.odd?
        content = tail_trimmed[0, tail_trimmed.length - 1]
        continues = true
      end
    end

    if at_first
      result << content
      at_first = false
    elsif joining
      result << content.sub(/\A[ \t]+/, "") unless blank
    else
      result << "\n" << content
    end
    joining = continues
  end

  if joining
    raise DecodeError.new(last_line_no, 1, "trailing line continuation has nothing to splice to")
  end
  result
end
2768
+
2769
# Folds each paragraph of +s+ onto a single line.
#
# Paragraphs are separated by a blank line ("\n\n"). Within a paragraph the
# non-empty lines are joined with single spaces, and the folded paragraphs
# are then joined with a single "\n".
#
# @param s [String]
# @return [String]
def fold_paragraphs(s)
  folded = s.split("\n\n", -1).map do |paragraph|
    kept = paragraph.split("\n").select { |line| line.length > 0 }
    kept.join(" ")
  end
  folded.join("\n")
end
2773
+
2774
# Replaces every maximal run of characters from +char_set+ in +s+ with one
# copy of +replacement+.
#
# Works in a single pass over the characters. Indexing with `s[i]` inside a
# loop walks the string from its start on every access when the text is not
# pure ASCII, which makes the whole operation quadratic.
#
# @param s [String] input text
# @param char_set [#include?] characters that form a run (e.g. an Array of
#   one-character Strings)
# @param replacement [String] text substituted for each run
# @return [String] a new string
def replace_all_runs(s, char_set, replacement)
  out = +""
  in_run = false
  s.each_char do |ch|
    if char_set.include?(ch)
      # Emit the replacement once, when the run begins.
      out << replacement unless in_run
      in_run = true
    else
      out << ch
      in_run = false
    end
  end
  out
end
2791
+
2792
# Replaces the run of +char_set+ characters at the start of +s+ with
# +replacement+.
#
# @param s [String]
# @param char_set [#include?] characters that form the run
# @param replacement [String]
# @return [String] +s+ itself when it does not start with such a character,
#   otherwise a new string
def replace_leading_run(s, char_set, replacement)
  run = s.each_char.take_while { |ch| char_set.include?(ch) }.length
  run.zero? ? s : replacement + s[run..]
end
2800
+
2801
# Replaces the run of +char_set+ characters at the end of +s+ with
# +replacement+.
#
# @param s [String]
# @param char_set [#include?] characters that form the run
# @param replacement [String]
# @return [String] +s+ itself when it does not end with such a character,
#   otherwise a new string
def replace_trailing_run(s, char_set, replacement)
  run = s.chars.reverse_each.take_while { |ch| char_set.include?(ch) }.length
  run.zero? ? s : s[0, s.length - run] + replacement
end
2809
+
2810
# Applies replace_leading_run and then replace_trailing_run to every line of
# +s+. Lines are split on "\n", and empty trailing lines are preserved.
#
# @param s [String]
# @param char_set [#include?] characters that form a run
# @param replacement [String]
# @return [String]
def per_line_edges(s, char_set, replacement)
  rows = s.split("\n", -1)
  rows.map! do |row|
    replace_trailing_run(replace_leading_run(row, char_set, replacement), char_set, replacement)
  end
  rows.join("\n")
end
2816
+
2817
# Implements the trim modifier: replaces runs of the characters in +chars+
# at the places selected by +where_s+.
#
# Flags in +where_s+:
#   "*"  every run anywhere (takes precedence over the other flags)
#   "|"  the leading and trailing run of each line
#   "<"  the leading run of the whole text
#   ">"  the trailing run of the whole text
# When several of "|", "<", ">" are present they are applied in that order.
#
# @param s [String] text to process
# @param chars [String] characters to trim; "" leaves +s+ unchanged
# @param where_s [String] flag characters; no known flag leaves +s+ unchanged
# @param replacement [String] text substituted for each run
# @return [String]
def apply_trim(s, chars, where_s, replacement)
  return s if chars.empty?

  targets = chars.chars.uniq
  everywhere, per_line, leading, trailing = ["*", "|", "<", ">"].map { |flag| where_s.include?(flag) }
  return replace_all_runs(s, targets, replacement) if everywhere
  return s unless per_line || leading || trailing

  result = s
  result = per_line_edges(result, targets, replacement) if per_line
  result = replace_leading_run(result, targets, replacement) if leading
  result = replace_trailing_run(result, targets, replacement) if trailing
  result
end
2832
+
2833
# Applies heredoc modifiers to +s+ in order, each one receiving the previous
# one's output.
#
# Known modifiers:
#   "_fold_paragraphs"  no arguments; see fold_paragraphs
#   "_trim"             (chars, where[, replacement]), all strings; see apply_trim
#
# @param s [String] heredoc text
# @param mods [Array<Hash>] { name:, args: } hashes
# @return [String]
# @raise [RuntimeError] for an unknown modifier, a wrong argument count, or a
#   non-string argument
def apply_modifiers(s, mods)
  mods.reduce(s) do |text, mod|
    args = mod[:args]
    case mod[:name]
    when "_fold_paragraphs"
      raise "fold_paragraphs() takes no arguments" unless args.empty?

      fold_paragraphs(text)
    when "_trim"
      raise "trim(chars, where, replacement = \"\") expects 2 or 3 arguments" unless args.length.between?(2, 3)

      chars, where, replacement = args
      raise "trim: first argument (chars) must be a string" unless chars.is_a?(String)
      raise "trim: second argument (where) must be a string" unless where.is_a?(String)

      if args.length == 3
        raise "trim: third argument (replacement) must be a string" unless replacement.is_a?(String)
      else
        replacement = ""
      end
      apply_trim(text, chars, where, replacement)
    else
      raise "unknown modifier: #{mod[:name]}"
    end
  end
end
2859
+
2860
+ # ---------- Flow forms ----------
2861
+
2862
# Parses a flow array `[v, v, ...]` whose '[' is at @pos and leaves @pos just
# past the matching ']'. An empty array and a trailing comma before ']' are
# both accepted. While an element is parsed its index is pushed on @path.
#
# @return [Array] parsed element values in source order
# @raise the error built by err(...) when input ends inside the array or
#   something other than ',' or ']' follows an element
def parse_flow_array
  @pos += 1 # [
  items = []
  loop do
    skip_flow_ws
    if @src.getbyte(@pos) == RBRACK
      @pos += 1
      return items
    end

    @path.push(items.length)
    begin
      v = parse_inline_value_in_flow
    ensure
      @path.pop
    end
    items << v

    skip_flow_ws
    b = @src.getbyte(@pos)
    if b == COMMA
      @pos += 1
    elsif b == RBRACK
      @pos += 1
      return items
    elsif b.nil?
      raise err("unterminated flow array")
    else
      # A byte >= 0x80 is only part of a multi-byte character. `b.chr` would
      # put a lone binary byte in the message, so show '?' instead, as
      # consume_after_value does.
      shown = b < 128 ? b.chr : "?"
      raise err("unexpected '#{shown}' in flow array; expected ',' or ']'")
    end
  end
end
2893
+
2894
# Parses a flow table `{key: v, key: v, ...}` whose '{' is at @pos and leaves
# @pos just past the matching '}'. An empty table and a trailing comma
# before '}' are both accepted. Each key must be followed directly by ':'
# and then whitespace. While a value is parsed its key is pushed on @path.
#
# @return the result of finalize_table for the table built by new_table
# @raise the error built by err(...) for a missing ':' or whitespace, a
#   duplicate key, end of input inside the table, or something other than
#   ',' or '}' after a value
def parse_flow_table
  @pos += 1 # {
  t = new_table
  loop do
    skip_flow_ws
    if @src.getbyte(@pos) == RBRACE
      @pos += 1
      return finalize_table(t)
    end

    key = parse_key
    raise err("expected ':' after flow-table key") unless @src.getbyte(@pos) == COLON
    @pos += 1
    b = @src.getbyte(@pos)
    unless b == SP || b == TAB || b == LF || b == CR
      raise err("expected whitespace after ':'")
    end
    skip_flow_ws

    @path.push(key)
    begin
      v = parse_inline_value_in_flow
    ensure
      @path.pop
    end
    raise err("duplicate key: #{key}") if t.key?(key)
    t[key] = v

    skip_flow_ws
    b = @src.getbyte(@pos)
    if b == COMMA
      @pos += 1
    elsif b == RBRACE
      @pos += 1
      return finalize_table(t)
    elsif b.nil?
      raise err("unterminated flow table")
    else
      # A byte >= 0x80 is only part of a multi-byte character. `b.chr` would
      # put a lone binary byte in the message, so show '?' instead, as
      # consume_after_value does.
      shown = b < 128 ? b.chr : "?"
      raise err("unexpected '#{shown}' in flow table; expected ',' or '}'")
    end
  end
end
2933
+
2934
# Skips whitespace inside a flow form: spaces, tabs, LF and CRLF line breaks
# (calling advance_line for each break). Stops at the first other byte or at
# end of input. Comment openers ('#', '//' and '/*') are rejected, since
# comments are not allowed inside flow forms.
#
# @return [nil]
# @raise the error built by err(...) when a comment opener is found
def skip_flow_ws
  loop do
    case @src.getbyte(@pos)
    when SP, TAB
      @pos += 1
    when LF
      @pos += 1
      advance_line
    when CR
      # Only CRLF counts as a line break; a lone CR is left for the caller.
      return unless @src.getbyte(@pos + 1) == LF

      @pos += 2
      advance_line
    when HASH
      raise err("comments not allowed inside flow forms")
    when SLASH
      follower = @src.getbyte(@pos + 1)
      return unless follower == SLASH || follower == STAR

      raise err("comments not allowed inside flow forms")
    else
      return
    end
  end
end
2952
+
2953
# Parses one value inside a flow array or flow table. Delegates to
# parse_inline_value_or_heredoc after rejecting heredoc openers (""" and
# '''), which are not permitted inside flow forms.
#
# @return the value produced by parse_inline_value_or_heredoc
# @raise the error built by err(...) when a heredoc opener starts at @pos
def parse_inline_value_in_flow
  opener = @src.getbyte(@pos)
  heredoc = (opener == DQUOTE && starts_bytes?('"""')) ||
            (opener == SQUOTE && starts_bytes?("'''"))
  raise err("heredocs are not allowed inside flow forms") if heredoc

  parse_inline_value_or_heredoc
end
2962
+
2963
+ # ---------- Post-value ----------
2964
+
2965
# Consumes whatever may legally follow a value on its line: inline
# whitespace, any number of /* ... */ block comments, an optional trailing
# '#' or '//' line comment, and finally the line break (LF or CRLF) or end
# of input.
#
# Unless @lite is set, each comment is appended to @comments as a :trailing
# AttachedComment tagged with a frozen copy of the current @path. Line
# comments must be preceded by whitespace. A '#' that starts "###" is not
# treated as a line comment here.
#
# @param allow_eof [Object] accepted but not read by this method; reaching
#   end of input always returns normally
# @return [nil]
# @raise the error built by err(...) when a line comment lacks preceding
#   whitespace, or when anything other than a line break follows
def consume_after_value(allow_eof)
  loop do
    mark = @pos
    skip_inline_ws
    spaced = @pos > mark
    lead = @src.getbyte(@pos)

    if lead == HASH
      break if starts_bytes?("###")

      raise err("expected whitespace before '#' comment") unless spaced
      text = read_line_comment_to_eol
      @comments << AttachedComment.new(Comment.new(text, :line), :trailing, @path.dup.freeze) unless @lite
      break
    end
    break unless lead == SLASH

    case @src.getbyte(@pos + 1)
    when SLASH
      raise err("expected whitespace before '//' comment") unless spaced
      text = read_line_comment_to_eol
      @comments << AttachedComment.new(Comment.new(text, :line), :trailing, @path.dup.freeze) unless @lite
      break
    when STAR
      # A block comment may be followed by more content, so keep looping.
      text = read_c_block_comment
      @comments << AttachedComment.new(Comment.new(text, :block), :trailing, @path.dup.freeze) unless @lite
    else
      break
    end
  end

  trailing = @src.getbyte(@pos)
  return if trailing.nil?

  if trailing == LF
    @pos += 1
    advance_line
    return
  end
  if trailing == CR && @src.getbyte(@pos + 1) == LF
    @pos += 2
    advance_line
    return
  end
  # Non-ASCII bytes are shown as '?' so the message stays valid text.
  raise err("unexpected character '#{trailing < 128 ? trailing.chr : '?'}' after value")
end
2999
+
3000
+ # ---------- Original-form recording ----------
3001
+
3002
# Records the original literal form of the value at the current path.
# Does nothing when @lite is set or when @record_forms is off.
#
# @param lit [Object] description of the literal as written in the source
# @return [Array, nil] @original_forms after appending [frozen path copy,
#   lit], or nil when nothing was recorded
def record_form(lit)
  return unless @record_forms && !@lite

  @original_forms << [@path.dup.freeze, lit]
end
3006
+ end
3007
+ end