dms-parser 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/dms/tier1.rb ADDED
@@ -0,0 +1,1750 @@
1
+ # frozen_string_literal: true
2
+
3
+ # DMS Tier-1: decorators, dialect imports, and the tier-1 encoder.
4
+ #
5
+ # Entry points:
6
+ # Dms::Tier1.parse(src) -> DocumentT1
7
+ #
8
+ # The tier-1 parser wraps the existing tier-0 parser, intercepting
9
+ # decorator calls at leading/inner/trailing/flow-inner positions and
10
+ # collecting them into a sidecar.
11
+
12
require "json"
require "set"
13
+
14
+ module Dms
15
+ module Tier1
16
# Reserved decorator sigil characters (tier-0 set, no underscore):
#   ! @ $ % ^ & * | ~ ` . , > < ? ; =
# Kept as a frozen Set of one-character Strings so membership checks
# (`RESERVED_SIGIL_CHARS.include?(ch)`) do not scan a list.
# Built through Set.new so the dependency on the `set` library is explicit
# (it is only autoloaded from Ruby 3.2 onwards).
RESERVED_SIGIL_CHARS = Set.new("!@$%^&*|~`.,><?;=".each_char).freeze
19
+
20
+ # ── Reserved Emoji Set helpers (frozen Unicode 15.1) ────────────────────
21
+
22
# Extended_Pictographic=Yes ranges (frozen UCD 15.1), sourced from Rust ref.
# Each entry is an inclusive [first, last] codepoint pair. Entries are sorted
# ascending and do not overlap, which the binary search below relies on.
# Both the outer array and every pair are frozen.
EXTENDED_PICTOGRAPHIC_RANGES = [
  [0x00A9, 0x00A9], [0x00AE, 0x00AE], [0x203C, 0x203C], [0x2049, 0x2049],
  [0x2122, 0x2122], [0x2139, 0x2139], [0x2194, 0x2199], [0x21A9, 0x21AA],
  [0x231A, 0x231B], [0x2328, 0x2328], [0x2388, 0x2388], [0x23CF, 0x23CF],
  [0x23E9, 0x23F3], [0x23F8, 0x23FA], [0x24C2, 0x24C2], [0x25AA, 0x25AB],
  [0x25B6, 0x25B6], [0x25C0, 0x25C0], [0x25FB, 0x25FE], [0x2600, 0x2605],
  [0x2607, 0x2612], [0x2614, 0x2685], [0x2690, 0x2705], [0x2708, 0x2712],
  [0x2714, 0x2714], [0x2716, 0x2716], [0x271D, 0x271D], [0x2721, 0x2721],
  [0x2728, 0x2728], [0x2733, 0x2734], [0x2744, 0x2744], [0x2747, 0x2747],
  [0x274C, 0x274C], [0x274E, 0x274E], [0x2753, 0x2755], [0x2757, 0x2757],
  [0x2763, 0x2767], [0x2795, 0x2797], [0x27A1, 0x27A1], [0x27B0, 0x27B0],
  [0x27BF, 0x27BF], [0x2934, 0x2935], [0x2B05, 0x2B07], [0x2B1B, 0x2B1C],
  [0x2B50, 0x2B50], [0x2B55, 0x2B55], [0x3030, 0x3030], [0x303D, 0x303D],
  [0x3297, 0x3297], [0x3299, 0x3299], [0x1F000, 0x1F0FF], [0x1F10D, 0x1F10F],
  [0x1F12F, 0x1F12F], [0x1F16C, 0x1F171], [0x1F17E, 0x1F17F], [0x1F18E, 0x1F18E],
  [0x1F191, 0x1F19A], [0x1F1AD, 0x1F1E5], [0x1F201, 0x1F20F], [0x1F21A, 0x1F21A],
  [0x1F22F, 0x1F22F], [0x1F232, 0x1F23A], [0x1F23C, 0x1F23F], [0x1F249, 0x1F3FA],
  [0x1F400, 0x1F53D], [0x1F546, 0x1F64F], [0x1F680, 0x1F6FF], [0x1F774, 0x1F77F],
  [0x1F7D5, 0x1F7FF], [0x1F80C, 0x1F80F], [0x1F848, 0x1F84F], [0x1F85A, 0x1F85F],
  [0x1F888, 0x1F88F], [0x1F8AE, 0x1F8FF], [0x1F90C, 0x1F93A], [0x1F93C, 0x1F945],
  [0x1F947, 0x1FAFF], [0x1FC00, 0x1FFFD],
].each(&:freeze).freeze

# Tell whether codepoint +cp+ falls inside one of EXTENDED_PICTOGRAPHIC_RANGES.
#
# @param cp [Integer] Unicode codepoint
# @return [Boolean]
def self.extended_pictographic?(cp)
  # Everything below U+00A9 (all of ASCII in particular) is outside the table.
  return false if cp < 0xA9

  # Find-minimum bsearch: the first range whose upper bound is >= cp is the
  # only range that can contain cp; nil means cp is above every range.
  candidate = EXTENDED_PICTOGRAPHIC_RANGES.bsearch { |range| cp <= range[1] }
  candidate ? candidate[0] <= cp : false
end
63
+
64
# Tell whether +cp+ is a reserved-emoji codepoint: a regional indicator,
# a skin-tone modifier, the keycap combiner U+20E3, or any
# Extended_Pictographic codepoint.
#
# @param cp [Integer] Unicode codepoint
# @return [Boolean]
def self.reserved_emoji_codepoint?(cp)
  regional_indicator?(cp) ||
    emoji_modifier?(cp) ||
    cp == 0x20E3 ||
    extended_pictographic?(cp)
end
70
+
71
# Tell whether +cp+ lies in U+1F1E6..U+1F1FF (regional indicator symbols).
def self.regional_indicator?(cp)
  cp.between?(0x1F1E6, 0x1F1FF)
end
74
+
75
# Tell whether +cp+ lies in U+1F3FB..U+1F3FF (skin-tone modifiers).
def self.emoji_modifier?(cp)
  cp.between?(0x1F3FB, 0x1F3FF)
end
78
+
79
# Read one extended grapheme cluster of reserved-emoji shape starting
# at byte offset `start` in `s` (UTF-8 string). Returns the exclusive
# end byte offset, or nil if no emoji cluster starts here.
# Algorithm mirrors Rust read_reserved_emoji_atom.
#
# Cluster shapes recognised:
#   * a regional indicator, optionally paired with a second one (GB12/GB13);
#   * a reserved-emoji codepoint followed by any run of skin-tone modifiers,
#     U+FE0F or U+20E3, and by ZWJ + Extended_Pictographic joins (GB9/GB11).
#
# Malformed bytes never raise: a malformed first character yields nil and a
# malformed follower simply ends the cluster.
#
# @param s [String] source text (bytes are interpreted as UTF-8)
# @param start [Integer] byte offset of the candidate cluster
# @return [Integer, nil] exclusive end byte offset, or nil
def self.read_reserved_emoji_atom(s, start)
  lead = utf8_char_at(s, start)
  return nil if lead.nil?

  lead_cp = lead.ord
  return nil unless reserved_emoji_codepoint?(lead_cp)

  end_pos = start + lead.bytesize

  # Regional-indicator pair (GB12/GB13): at most one partner is absorbed.
  if regional_indicator?(lead_cp)
    partner = utf8_char_at(s, end_pos)
    end_pos += partner.bytesize if partner && regional_indicator?(partner.ord)
    return end_pos
  end

  # GB9/GB11 loop
  loop do
    follower = utf8_char_at(s, end_pos)
    break if follower.nil?

    cp = follower.ord
    if emoji_modifier?(cp) || cp == 0xFE0F || cp == 0x20E3
      # GB9 - extending characters stay in the cluster.
      end_pos += follower.bytesize
    elsif cp == 0x200D
      # GB11 - ZWJ x Extended_Pictographic. A ZWJ that is not followed by an
      # Extended_Pictographic codepoint is left outside the cluster.
      after_zwj = end_pos + follower.bytesize
      joined = utf8_char_at(s, after_zwj)
      break unless joined && extended_pictographic?(joined.ord)

      end_pos = after_zwj + joined.bytesize
    else
      break
    end
  end
  end_pos
end

# Internal helper: decode the single character starting at byte offset `pos`.
# Only the next four bytes (the longest UTF-8 sequence) are sliced, so the
# cost does not depend on how much text follows `pos`.
#
# @api private
# @return [String, nil] the character, or nil at/after the end of `s` or
#   when the bytes at `pos` are not a well-formed UTF-8 sequence
def self.utf8_char_at(s, pos)
  return nil if pos >= s.bytesize

  window = s.byteslice(pos, 4)
  return nil if window.nil? || window.empty?

  char = window.force_encoding(Encoding::UTF_8)[0]
  char if char && char.valid_encoding?
end
141
+
142
# Returns true if the character at byte offset `pos` in UTF-8 string `s`
# starts a sigil atom (ASCII reserved char OR reserved emoji codepoint).
#
# Only the next four bytes are decoded, so the check does not depend on how
# much text follows `pos`. Bytes that are not a well-formed UTF-8 sequence
# answer false instead of raising.
#
# @param s [String] source text
# @param pos [Integer] byte offset to inspect
# @return [Boolean]
def self.sigil_atom_start_at?(s, pos)
  return false if pos >= s.bytesize

  lead = s.getbyte(pos)
  return false if lead.nil?

  # ASCII: membership in the reserved sigil set decides.
  return RESERVED_SIGIL_CHARS.include?(lead.chr) if lead < 0x80

  # Multi-byte: decode one character and check the reserved emoji set.
  char = s.byteslice(pos, 4).force_encoding(Encoding::UTF_8)[0]
  return false unless char && char.valid_encoding?

  reserved_emoji_codepoint?(char.ord)
end
159
+
160
# Measure the sigil atom beginning at byte offset `pos` of UTF-8 string `s`.
#
# @return [Integer, nil] 1 for an ASCII reserved sigil char, the byte length
#   of the emoji cluster for a multi-byte atom, or nil when no atom starts
#   at `pos`
def self.lex_sigil_atom_at(s, pos)
  return nil if pos >= s.bytesize

  lead = s.getbyte(pos)
  return nil if lead.nil?

  if lead >= 0x80
    # Multi-byte lead: delegate to the emoji cluster reader.
    cluster_end = read_reserved_emoji_atom(s, pos)
    cluster_end.nil? ? nil : cluster_end - pos
  elsif RESERVED_SIGIL_CHARS.include?(lead.chr)
    1
  end
end
176
+
177
# Validate that a sigil string consists of valid sigil atoms only.
# Returns nil on success, error message string on failure.
#
# @param sigil [String] a `bind` key taken from an import entry
# @param idx [Integer] index of that import entry (used in the message only)
# @return [String, nil]
def self.validate_sigil_atoms(sigil, idx)
  # Work on a UTF-8 view of the key. Only transcoding failures fall back to
  # the raw string; any other error is allowed to surface.
  s =
    begin
      sigil.encode("UTF-8")
    rescue EncodingError
      sigil
    end

  pos = 0
  while pos < s.bytesize
    atom_len = lex_sigil_atom_at(s, pos)
    if atom_len.nil?
      # Decode the char at pos for the error message
      sub = s.byteslice(pos, s.bytesize - pos).force_encoding("UTF-8")
      c = sub[0] || "?"
      if c == "_"
        return "_dms_imports[#{idx}].bind key \"#{sigil}\" (or containing '_') " \
               "is invalid: underscore is not in the tier-0 reserved decorator sigil set"
      end
      return "_dms_imports[#{idx}].bind key \"#{sigil}\" contains '#{c}' " \
             "which is not in the tier-0 reserved decorator sigil set " \
             "nor in the Reserved Emoji Set"
    end
    pos += atom_len
  end
  nil
end
200
+
201
+ # ── Import types ────────────────────────────────────────────────────────
202
+
203
# One validated `_dms_imports` entry. Built with keyword arguments.
#   dialect:   String (required, non-empty)
#   version:   String (plain semver)
#   ns:        String or nil
#   bind:      Hash sigil => Array of family names
#   allow:     Hash family => Array of names
#   deny:      Hash family => Array of names
#   alias_map: Hash family => Hash alias => canonical name
ImportSpec = Struct.new(
  :dialect, :version, :ns, :bind, :allow, :deny, :alias_map,
  keyword_init: true
)
205
+
206
+ # ── Sidecar types ───────────────────────────────────────────────────────
207
+
208
# Sidecar record for one document path.
#   path:     Array of {key: String} or {index: Integer} hashes (JSON-ready)
#   calls:    Hash sigil => Array of DecoratorCall
#   comments: Array (always empty in this implementation)
DecoratorEntry = Struct.new(:path, :calls, :comments)

# One decorator invocation, fields in positional order:
#   sigil:      String (the literal sigil string)
#   family:     String (canonical family name, "" if unresolved)
#   fn_name:    String (post-alias-resolution function name)
#   ns:         String or nil
#   position:   :leading, :inner, :trailing, :floating
#   params:     Array of ParamGroup
#   params_dec: Array (always empty in this implementation)
DecoratorCall = Struct.new(
  :sigil, :family, :fn_name, :ns, :position, :params, :params_dec
)

# One parameter group of a decorator call.
#   kind:  :named or :positional
#   value: Hash (for named) or Array (for positional)
ParamGroup = Struct.new(:kind, :value)
225
+
226
+ # ── Tier-1 document ─────────────────────────────────────────────────────
227
+
228
# Result of a tier-1 parse: the underlying tier-0 document plus the
# decorator sidecar, the validated imports and the tier that was observed.
class DocumentT1
  attr_reader :t0, :decorators, :imports, :observed_tier

  # @param t0 the wrapped tier-0 document
  # @param decorators sidecar entries collected while parsing
  # @param imports Array of ImportSpec
  # @param observed_tier tier number reported for the source
  def initialize(t0, decorators, imports, observed_tier)
    @t0, @decorators = t0, decorators
    @imports, @observed_tier = imports, observed_tier
  end
end
238
+
239
+ # ── Semver helpers ──────────────────────────────────────────────────────
240
+
241
# Prefixes that mark a version string as a range specifier.
RANGE_SPECIFIER_PREFIXES = %w[^ ~ >= > < <= =].freeze

# Check that `s` is MAJOR.MINOR.PATCH with an optional `-pre.release` part.
# Anything after the first '+' (build metadata) is dropped without being
# inspected.
#
# Rules enforced:
#   * exactly three core parts, each digits only, no leading zero unless "0";
#   * when a '-' is present the pre-release must be non-empty, and every
#     dot-separated identifier must be non-empty and either numeric without
#     a leading zero or made of [A-Za-z0-9-].
#
# Never raises for a String argument: inputs such as "+build" or "" answer
# false.
#
# @param s [String]
# @return [Boolean]
def self.valid_semver?(s)
  # partition always yields a String head, even when `s` starts with '+'.
  without_build = s.partition("+").first
  core_str, dash, pre_str = without_build.partition("-")

  core_parts = core_str.split(".", -1)
  return false unless core_parts.length == 3
  return false unless core_parts.all? { |part| part.match?(/\A(?:0|[1-9][0-9]*)\z/) }
  return true if dash.empty?

  # Limit -1 keeps trailing empty identifiers so "1.2.3-alpha." is rejected.
  identifiers = pre_str.split(".", -1)
  return false if identifiers.empty?

  identifiers.all? do |id|
    if id.match?(/\A[0-9]+\z/)
      id == "0" || !id.start_with?("0")
    else
      id.match?(/\A[A-Za-z0-9-]+\z/)
    end
  end
end
268
+
269
# Tell whether `s`, ignoring leading whitespace, begins with one of the
# RANGE_SPECIFIER_PREFIXES.
#
# @param s [String]
# @return [Boolean]
def self.has_range_specifier?(s)
  s.lstrip.start_with?(*RANGE_SPECIFIER_PREFIXES)
end
273
+
274
+ # ── Import extraction and validation ────────────────────────────────────
275
+
276
# Extract and validate the `_dms_imports` list of a front-matter table.
#
# Every entry must be a table with a non-empty string `dialect` and a plain
# semver `version`; `ns`, `bind`, `allow`, `deny` and `alias` are optional.
# An optional field is treated as absent only when it is nil; any other
# non-table value (including `false`) is rejected.
#
# Collision detection is keyed on the exact (sigil, ns, family) triple, so
# separators inside names cannot produce false positives, and a family that
# is repeated under the same sigil within a single import is reported as a
# DecodeError as well.
#
# @param meta [Hash] front-matter table keyed by String
# @return [Array<ImportSpec>] one spec per entry, in order ([] when the key
#   is absent)
# @raise [DecodeError] on any validation failure
def self.extract_imports(meta)
  raw_list = meta["_dms_imports"]
  return [] if raw_list.nil?

  unless raw_list.is_a?(Array)
    raise DecodeError.new(0, 0, "_dms_imports must be a list")
  end

  specs = []
  # [sigil, ns, family] => [import index, dialect, version] of the first
  # import that bound the triple. Dialect and version are stored here so the
  # collision message never has to look up a spec that is not built yet.
  seen_triples = {}

  raw_list.each_with_index do |entry, idx|
    unless entry.is_a?(Hash)
      raise DecodeError.new(0, 0, "_dms_imports[#{idx}] must be a table")
    end

    # dialect (required string)
    dialect = entry["dialect"]
    case dialect
    when nil
      raise DecodeError.new(0, 0, "_dms_imports[#{idx}] is missing required field 'dialect'")
    when String
      raise DecodeError.new(0, 0, "_dms_imports[#{idx}].dialect must be a non-empty string") if dialect.empty?
    else
      raise DecodeError.new(0, 0, "_dms_imports[#{idx}].dialect must be a string")
    end

    # version (required string, semver, no range specifiers)
    version = entry["version"]
    case version
    when nil
      raise DecodeError.new(0, 0, "_dms_imports[#{idx}] is missing required field 'version'")
    when String
      raise DecodeError.new(0, 0, "_dms_imports[#{idx}].version must be a non-empty string") if version.empty?
      if has_range_specifier?(version)
        raise DecodeError.new(0, 0,
          "range-specifier syntax in version not supported " \
          "(_dms_imports[#{idx}].version \"#{version}\"): write a plain semver string")
      end
      unless valid_semver?(version)
        raise DecodeError.new(0, 0,
          "_dms_imports[#{idx}].version \"#{version}\" is not a valid semver string " \
          "(expected MAJOR.MINOR.PATCH with optional -pre and +build)")
      end
    else
      raise DecodeError.new(0, 0, "_dms_imports[#{idx}].version must be a string")
    end

    # ns (optional string)
    ns_val = entry["ns"]
    ns = case ns_val
         when nil then nil
         when String
           raise DecodeError.new(0, 0, "_dms_imports[#{idx}].ns must be a non-empty string when present") if ns_val.empty?
           ns_val
         else
           raise DecodeError.new(0, 0, "_dms_imports[#{idx}].ns must be a string")
         end

    # bind (optional table: sigil → list of family names)
    bind = {}
    bind_val = entry["bind"]
    unless bind_val.nil?
      unless bind_val.is_a?(Hash)
        raise DecodeError.new(0, 0, "_dms_imports[#{idx}].bind must be a table")
      end
      bind_val.each do |sigil, families_val|
        if sigil.empty?
          raise DecodeError.new(0, 0, "_dms_imports[#{idx}].bind has an empty sigil key")
        end
        err = Tier1.validate_sigil_atoms(sigil, idx)
        raise DecodeError.new(0, 0, err) if err
        unless families_val.is_a?(Array)
          raise DecodeError.new(0, 0,
            "_dms_imports[#{idx}].bind[\"#{sigil}\"] must be a list " \
            "(use list form even for a single family)")
        end
        names = families_val.map do |item|
          unless item.is_a?(String)
            raise DecodeError.new(0, 0, "_dms_imports[#{idx}].bind[\"#{sigil}\"] must be a list of strings")
          end
          item
        end
        bind[sigil] = names
      end
    end

    # allow (optional table: family → list of names)
    allow_map = {}
    allow_val = entry["allow"]
    unless allow_val.nil?
      unless allow_val.is_a?(Hash)
        raise DecodeError.new(0, 0, "_dms_imports[#{idx}].allow must be a table")
      end
      allow_val.each do |family, names_val|
        allow_map[family] = extract_string_list(names_val, idx, "allow", family)
      end
    end

    # deny (optional table: family → list of names)
    deny_map = {}
    deny_val = entry["deny"]
    unless deny_val.nil?
      unless deny_val.is_a?(Hash)
        raise DecodeError.new(0, 0, "_dms_imports[#{idx}].deny must be a table")
      end
      deny_val.each do |family, names_val|
        deny_map[family] = extract_string_list(names_val, idx, "deny", family)
      end
    end

    # allow/deny mutual exclusion
    allow_map.each_key do |family|
      if deny_map.key?(family)
        raise DecodeError.new(0, 0,
          "_dms_imports[#{idx}]: family \"#{family}\" appears in both " \
          "'allow' and 'deny' — they are mutually exclusive for the same family")
      end
    end

    # alias (optional table: family → table: alias → canonical)
    alias_map = {}
    alias_val = entry["alias"]
    unless alias_val.nil?
      unless alias_val.is_a?(Hash)
        raise DecodeError.new(0, 0, "_dms_imports[#{idx}].alias must be a table")
      end
      alias_val.each do |family, inner_val|
        unless inner_val.is_a?(Hash)
          raise DecodeError.new(0, 0,
            "_dms_imports[#{idx}].alias[\"#{family}\"] must be a table (alias → canonical)")
        end
        inner_map = {}
        inner_val.each do |alias_name, canonical_val|
          unless canonical_val.is_a?(String)
            raise DecodeError.new(0, 0,
              "_dms_imports[#{idx}].alias[\"#{family}\"][\"#{alias_name}\"] must be a string")
          end
          inner_map[alias_name] = canonical_val
        end
        alias_map[family] = inner_map
      end
    end

    # Collision detection (across imports and within this one)
    ns_repr = ns || "<unset>"
    bind.each do |sigil, families|
      families.each do |family|
        triple_key = [sigil, ns, family]
        if (previous = seen_triples[triple_key])
          prev_idx, prev_dialect, prev_version = previous
          raise DecodeError.new(0, 0,
            "Decorator binding collision on (sigil='#{sigil}', ns=#{ns_repr}, " \
            "family='#{family}'): " \
            "import ##{prev_idx} dialect '#{prev_dialect}' v#{prev_version} and " \
            "import ##{idx} dialect '#{dialect}' v#{version} both bind " \
            "'#{sigil}' → '#{family}'. Resolve by remapping one.")
        end
        seen_triples[triple_key] = [idx, dialect, version]
      end
    end

    specs << ImportSpec.new(
      dialect: dialect,
      version: version,
      ns: ns,
      bind: bind,
      allow: allow_map,
      deny: deny_map,
      alias_map: alias_map
    )
  end

  specs
end
448
+
449
# Validate that `val` is a list of strings and return a copy of it.
#
# @param val the raw value found under `_dms_imports[idx].field[family]`
# @param idx [Integer] import index (error message only)
# @param field [String] "allow" or "deny" (error message only)
# @param family [String] family key (error message only)
# @return [Array<String>]
# @raise [DecodeError] when `val` is not an Array or holds a non-String
def self.extract_string_list(val, idx, field, family)
  where = "_dms_imports[#{idx}].#{field}[\"#{family}\"]"
  raise DecodeError.new(0, 0, "#{where} must be a list") unless val.is_a?(Array)

  val.map do |item|
    next item if item.is_a?(String)

    raise DecodeError.new(0, 0, "#{where} must be a list of strings")
  end
end
460
+
461
+ # ── Family resolution ────────────────────────────────────────────────────
462
+
463
# Resolve which family a decorator call belongs to.
#
# Imports are visited in order (restricted to those whose `ns` equals the
# call's namespace when one is given), and within each import the families
# bound to `sigil` are visited in order. For every family the function name
# is mapped through that family's aliases, then checked against the deny
# list and the allow list. The first family that passes wins.
#
# @param sigil [String]
# @param fn_name [String] name as written in the call (may be an alias)
# @param ns [String, nil]
# @param imports [Array<ImportSpec>]
# @return [Array(String, String)] [family_name, canonical_fn_name]
# @raise [DecodeError] unknown namespace, name denied, or name not found
def self.resolve_family(sigil, fn_name, ns, imports)
  pool = imports
  if ns
    pool = imports.select { |imp| imp.ns == ns }
    raise DecodeError.new(0, 0, "unknown namespace '#{ns}'") if pool.empty?
  end

  # First family (in visiting order) whose deny list rejected the name.
  # Only reported when nothing else accepts the call.
  first_denial = nil

  pool.each do |imp|
    (imp.bind[sigil] || []).each do |family|
      aliases = imp.alias_map[family]
      canonical = (aliases && aliases[fn_name]) || fn_name

      denied_names = imp.deny[family]
      if denied_names && denied_names.include?(canonical)
        first_denial = [family] if first_denial.nil?
        next
      end

      allowed_names = imp.allow[family]
      next if allowed_names && !allowed_names.include?(canonical)

      # Use first accepted
      return [family, canonical]
    end
  end

  if first_denial
    raise DecodeError.new(0, 0,
      "decorator '#{fn_name}' is denied by family '#{first_denial.first}' deny list")
  end

  # Either the sigil is unbound in every candidate import or the name was
  # filtered out by an allow list; both report the same error.
  raise DecodeError.new(0, 0,
    "name '#{fn_name}' not found in any family bound to sigil '#{sigil}'")
end
542
+
543
+ # ── Tier-1 parser ────────────────────────────────────────────────────────
544
+
545
# Parse a DMS source string in tier-1 mode by handing it to a fresh T1Parser.
#
# @param src [String] DMS source text
# @return [DocumentT1]
def self.parse(src)
  parser = T1Parser.new(src)
  parser.parse
end
550
+
551
+ class T1Parser
552
# Byte constants (reused from Parser)
SP         = 0x20
TAB        = 0x09
LF         = 0x0A
CR         = 0x0D
LBRACK     = 0x5B
RBRACK     = 0x5D
LBRACE     = 0x7B
RBRACE     = 0x7D
LPAREN     = 0x28 # '('
RPAREN     = 0x29 # ')'
COMMA      = 0x2C
DOT        = 0x2E
PLUS       = 0x2B
COLON      = 0x3A
DQUOTE     = 0x22
SQUOTE     = 0x27
UNDERSCORE = 0x5F
DIGIT0     = 0x30
DIGIT9     = 0x39
LOWER_A    = 0x61
LOWER_Z    = 0x7A
UPPER_A    = 0x41
UPPER_Z    = 0x5A
MINUS      = 0x2D
HASH       = 0x23

# 256-entry lookup table: true for bytes in 0-9, a-z, A-Z, '_' and '-'.
BARE_KEY_BYTE = Array.new(256) do |byte|
  byte.between?(DIGIT0, DIGIT9) ||
    byte.between?(LOWER_A, LOWER_Z) ||
    byte.between?(UPPER_A, UPPER_Z) ||
    byte == UNDERSCORE ||
    byte == MINUS
end.freeze

# 256-entry lookup table: true for the ASCII reserved sigil bytes.
RESERVED_SIGIL_BYTE = begin
  sigil_bytes = "!@$%^&*|~`.,><?;=".bytes
  Array.new(256) { |byte| sigil_bytes.include?(byte) }.freeze
end
590
+
591
# Tell whether a sigil atom starts at byte offset `pos` of @src: either an
# ASCII reserved sigil byte, or (for non-ASCII lead bytes) whatever
# Tier1.sigil_atom_start_at? answers.
def sigil_at?(pos)
  return false if pos >= @len

  lead = @src.getbyte(pos)
  return false if lead.nil?

  if RESERVED_SIGIL_BYTE[lead]
    true
  elsif lead < 0x80
    false
  else
    Tier1.sigil_atom_start_at?(@src, pos)
  end
end
601
+
602
# Remember the source text untouched and reset the parser's result state.
#
# @param src [String] DMS source text
def initialize(src)
  @src_original = src
  @observed_tier = 0
  @imports = []
  @decorators = [] # Array of DecoratorEntry (sidecar)
end
608
+
609
# Parse the source handed to the constructor.
#
# The front matter is pre-scanned for `_dms_tier` first. When the scan
# reports tier 0 the source goes to Dms::Parser.parse_document and the
# result is wrapped with empty decorators/imports; otherwise the detected
# tier is recorded and the tier-1 parser runs.
#
# @return [DocumentT1]
def parse
  detected = detect_tier(@src_original)

  if detected.zero?
    plain = Dms::Parser.parse_document(@src_original)
    DocumentT1.new(plain, [], [], 0)
  else
    @observed_tier = detected
    parse_tier1_document
  end
end
624
+
625
# Quick scan for the `_dms_tier` value without a full parse.
#
# Looks only inside a leading `+++ ... +++` block (optionally preceded by
# whitespace) and returns the integer written after `_dms_tier:` at the start
# of a line, or 0 when there is no such block or no such line.
#
# @param src [String]
# @return [Integer]
def detect_tier(src)
  text = src.dup.force_encoding("UTF-8")

  front_matter = text[/\A\s*\+\+\+[^\n]*\n(.*?)\n\+\+\+/m, 1]
  return 0 if front_matter.nil?

  declared = front_matter[/^_dms_tier\s*:\s*(\d+)/, 1]
  declared.nil? ? 0 : declared.to_i
end
637
+
638
+ private
639
+
640
# Run the tier-1 parse over @src_original and build the DocumentT1.
#
# Steps, in order: obtain a UTF-8 copy of the source (binary input is
# relabelled, any other encoding is transcoded), NFC-normalise it unless it
# is ASCII-only, reset the cursor state, read the front matter, validate the
# imports, parse the body, then build the tier-0 Document whose meta holds
# only the keys that do not start with '_' (nil when none remain).
#
# @return [DocumentT1] always reported with observed tier 1
def parse_tier1_document
  text = @src_original.dup
  source_encoding = text.encoding
  if source_encoding == Encoding::ASCII_8BIT || source_encoding == Encoding::BINARY
    text.force_encoding("UTF-8")
  elsif source_encoding != Encoding::UTF_8
    text = text.encode("UTF-8")
  end
  text = text.unicode_normalize(:nfc) unless text.ascii_only?

  @src = text
  @len = text.bytesize
  @pos = 0
  @line = 1
  @line_start = 0

  # Front matter is read by a tier-1-accepting reader; the raw table it also
  # returns is not needed here.
  _raw_table, meta_hash = parse_front_matter_accepting_t1
  @imports = Tier1.extract_imports(meta_hash)

  # Parse body with decorator awareness
  body, decorators = parse_t1_body

  public_meta = {}
  meta_hash.each do |name, value|
    next if name.start_with?("_")

    public_meta[name] = value
  end

  doc = Document.new(public_meta.empty? ? nil : public_meta, body, [], [])
  DocumentT1.new(doc, decorators, @imports, 1)
end
674
+
675
# Read a `+++ ... +++` front-matter block at the cursor, accepting the
# tier-1 reserved keys.
#
# The lines between the fences are collected verbatim, joined with "\n",
# and parsed as a table by a lite Dms::Parser. `_dms_tier` must be a
# non-negative Integer, `_dms_imports` is passed through unchecked, any
# other key starting with '_' is rejected, and ordinary keys are kept.
#
# @return [Array] [raw_table, meta_hash]; [nil, {}] when the source does not
#   start with "+++" after leading trivia
# @raise [DecodeError] malformed opener, missing closing fence, or an
#   invalid reserved key
def parse_front_matter_accepting_t1
  skip_trivia
  return [nil, {}] unless starts_bytes?("+++")

  opener_line = @line
  @pos += 3
  skip_inline_ws
  opener_alone = consume_eol || eof?
  raise DecodeError.new(@line, col, "front matter opener must be on its own line") unless opener_alone

  inner_buf = String.new(encoding: "UTF-8")
  loop do
    raise DecodeError.new(opener_line, 1, "unterminated front matter: missing closing '+++'") if eof?

    # Advance to the end of the current line (LF, CR, or end of input).
    line_begin = @pos
    until @pos >= @len
      byte = @src.getbyte(@pos)
      break if byte == LF || byte == CR

      @pos += 1
    end

    line_text = @src.byteslice(line_begin, @pos - line_begin).force_encoding("UTF-8")
    if line_text.strip == "+++"
      consume_eol
      break
    end

    inner_buf << line_text
    inner_buf << "\n" if consume_eol
  end

  # Parse inner_buf using tier-0 parser but accepting _dms_tier: 1 and _dms_imports
  table = Dms::Parser.new(inner_buf, lite: true).parse_body_as_table

  meta_hash = {}
  table.each do |name, value|
    if name == "_dms_tier"
      raise DecodeError.new(opener_line, 1, "_dms_tier must be a non-negative integer") unless value.is_a?(Integer)
      raise DecodeError.new(opener_line, 1, "_dms_tier must be non-negative") if value < 0
    elsif name != "_dms_imports" && name.start_with?("_")
      raise DecodeError.new(opener_line, 1, "unknown reserved key: #{name}")
    end
    meta_hash[name] = value
  end

  [table, meta_hash]
end
734
+
735
# Parse the document body, collecting decorator calls along the way.
#
# Resets @dec_entries and @pending_leading, gathers any decorators written
# before the first value, then parses the root as a list (when it starts
# with a '+' marker) or as a table.
#
# @return [Array] [body_value, decorator_entries]; [{}, []] for an empty body
def parse_t1_body
  @dec_entries = {}     # path_key => DecoratorEntry (path as array)
  @pending_leading = [] # decorator calls waiting for the next value

  skip_trivia
  return [{}, []] if eof?

  # Leading decorators (sigil at line start)
  if sigil_at?(@pos)
    collect_leading_decorators([])
    skip_trivia
  end
  return [{}, []] if eof?

  root =
    if @src.getbyte(@pos) == PLUS && peek_after_plus?
      parse_t1_list_block(0, [])
    else
      parse_t1_table_block(0, [])
    end

  [root, @dec_entries.values]
end
770
+
771
# Collect consecutive leading decorator calls starting at the cursor.
#
# The key they decorate is not known yet, so each call is queued on
# @pending_leading as a {sigil:, fn_name:, ns:, params:} hash. Collection
# stops at end of input or at the first position that does not start a
# sigil atom. `path_prefix` is accepted but not used.
def collect_leading_decorators(path_prefix)
  until eof? || !sigil_at?(@pos)
    sigil, fn_name, ns, params = parse_decorator_call
    @pending_leading << { sigil: sigil, fn_name: fn_name, ns: ns, params: params }
    skip_trivia_no_consume_leading
  end
end
784
+
785
# Skip inline whitespace, line breaks (LF or CRLF) and comments (`#`, `//`,
# `/* */`), stopping at the first byte that is none of these — in
# particular at a sigil — or at end of input.
def skip_trivia_no_consume_leading
  loop do
    skip_inline_ws
    here = @src.getbyte(@pos)
    following = @src.getbyte(@pos + 1)

    case here
    when LF
      @pos += 1
      advance_line
    when CR
      # A lone CR is not treated as a line break here.
      break unless following == LF

      @pos += 2
      advance_line
    when HASH
      skip_line_comment
    when 0x2F # '/'
      if following == 0x2F
        skip_line_comment
      elsif following == 0x2A # '*'
        skip_block_comment
      else
        break
      end
    else
      break
    end
  end
end
805
+
806
# Parse a block-style table whose entries sit at exactly `indent` columns.
#
# Each iteration handles one line at the block's indent:
#   * a line starting with a sigil queues leading decorator calls on
#     @pending_leading for the next key;
#   * a line starting with a '+' list marker (or end of input / a shallower
#     indent) ends the table;
#   * anything else must be `key: value`, where the value is an indented
#     child block, an inner-decorated value, or an inline value optionally
#     followed by trailing decorators.
#
# A key that reappears with a different value raises "duplicate key". The
# value is stored only after that check, so the check can actually fire.
#
# @param indent [Integer] column of this block's keys
# @param path_prefix [Array<Hash>] path of the enclosing value
# @return [Hash] key => parsed value
# @raise [DecodeError] on indentation, syntax or duplicate-key errors
def parse_t1_table_block(indent, path_prefix)
  t = {}
  loop do
    skip_trivia
    break if eof?

    # Measure indent
    li = measure_line_indent
    break if li < indent

    if li != indent
      raise DecodeError.new(@line, col, "inconsistent indent: expected #{indent} spaces, got #{li}")
    end
    @pos = @line_start + indent

    b = @src.getbyte(@pos)

    # Leading decorators (sigil at line start)
    if sigil_at?(@pos)
      # Collect leading decorators for next value
      loop do
        sigil, fn_name, ns, params = parse_decorator_call
        @pending_leading << { sigil: sigil, fn_name: fn_name, ns: ns, params: params }
        # After decorator call, consume rest of line
        # NOTE(review): if a decorator call can be followed by more content on
        # the same line, consume_eol will not advance and the rewind to
        # @line_start + indent below would land on the same sigil again —
        # confirm parse_decorator_call / consume_eol rule this out.
        skip_inline_ws
        consume_eol
        skip_trivia
        break if eof?

        li2 = measure_line_indent
        break if li2 < indent
        if li2 != indent
          raise DecodeError.new(@line, col, "inconsistent indent")
        end
        @pos = @line_start + indent
        break unless sigil_at?(@pos)
      end
      next
    end

    break if b.nil?
    break if b == PLUS && peek_after_plus?

    # Must be a key-value pair
    key = parse_key_t1
    raise DecodeError.new(@line, col, "expected ':' after key") if @src.getbyte(@pos) != COLON
    @pos += 1 # consume ':'

    current_path = path_prefix + [{ "key" => key }]
    # NOTE(review): the result is unused in this method; the call can likely
    # be dropped once path_to_key is confirmed to be side-effect free.
    path_key = path_to_key(current_path)

    # Flush pending leading decorators onto this key's path
    unless @pending_leading.empty?
      pending = @pending_leading.dup
      @pending_leading.clear
      pending.each do |dec|
        family, canonical_fn = resolve_call(dec[:sigil], dec[:fn_name], dec[:ns])
        add_decorator_call(current_path, dec[:sigil], family, canonical_fn, dec[:ns], :leading, dec[:params])
      end
    end

    b = @src.getbyte(@pos)
    if b == SP || b == TAB
      @pos += 1
      skip_inline_ws
      nb = @src.getbyte(@pos)
      if nb.nil? || nb == LF || nb == CR
        # Block value follows
        consume_eol
        skip_trivia
        raise DecodeError.new(@line, col, "expected indented child block") if eof?
        child_indent = measure_line_indent
        raise DecodeError.new(@line, col, "expected indented child block") if child_indent <= indent
        v = parse_t1_block_value(child_indent, current_path)
      elsif sigil_at?(@pos)
        # Inner decorator(s); trailing decorators are consumed by the callee.
        v, _has_value = parse_t1_inner_and_value(current_path)
        skip_inline_ws
        consume_eol
      else
        # Inline value, possibly followed by trailing decorators
        v = parse_t1_inline_value(current_path)
        skip_inline_ws
        parse_t1_trailing_decorators(current_path) if sigil_at?(@pos)
        skip_inline_ws
        consume_eol
      end
    elsif b.nil? || b == LF || b == CR
      consume_eol
      skip_trivia
      raise DecodeError.new(@line, col, "expected indented child block") if eof?
      child_indent = measure_line_indent
      raise DecodeError.new(@line, col, "expected indented child block") if child_indent <= indent
      v = parse_t1_block_value(child_indent, current_path)
    else
      raise DecodeError.new(@line, col, "expected whitespace after ':'")
    end

    # The value must not be stored before this check: comparing against an
    # entry that was just overwritten with `v` could never differ.
    # NOTE(review): a repeated key with an identical value is still accepted,
    # as the condition was written — confirm against the tier-0 rules.
    raise DecodeError.new(@line, col, "duplicate key: #{key}") if t.key?(key) && t[key] != v
    t[key] = v
  end
  t
end
918
+
919
# Parse a block-style list whose '+' markers sit at exactly `indent` columns.
#
# Every item gets the path `path_prefix + [{"index" => n}]`. After the
# marker an item is either an indented child block (marker alone on its
# line), an inner-decorated value, or whatever parse_t1_list_item_value
# reads from the rest of the line.
#
# @param indent [Integer] column of this list's '+' markers
# @param path_prefix [Array<Hash>] path of the enclosing value
# @return [Array] parsed items in order
# @raise [DecodeError] on indentation or syntax errors
def parse_t1_list_block(indent, path_prefix)
  items = []

  # Shared by both "marker alone on its line" cases.
  child_block = lambda do |item_path|
    consume_eol
    skip_trivia
    raise DecodeError.new(@line, col, "expected indented block after empty '+' marker") if eof?

    inner_indent = measure_line_indent
    raise DecodeError.new(@line, col, "expected indented block") if inner_indent <= indent

    parse_t1_block_value(inner_indent, item_path)
  end

  loop do
    skip_trivia
    break if eof?

    li = measure_line_indent
    break if li < indent
    unless li == indent
      raise DecodeError.new(@line, col, "inconsistent indent: expected #{indent} spaces, got #{li}")
    end

    @pos = @line_start + indent
    break unless @src.getbyte(@pos) == PLUS && peek_after_plus?

    item_path = path_prefix + [{ "index" => items.length }]
    @pos += 1 # consume '+'

    value =
      case @src.getbyte(@pos)
      when SP, TAB
        @pos += 1
        skip_inline_ws
        first = @src.getbyte(@pos)
        if first.nil? || first == LF || first == CR
          child_block.call(item_path)
        elsif sigil_at?(@pos)
          decorated, = parse_t1_inner_and_value(item_path)
          skip_inline_ws
          consume_eol
          decorated
        else
          parse_t1_list_item_value(indent, item_path)
        end
      when nil, LF, CR
        child_block.call(item_path)
      else
        raise DecodeError.new(@line, col, "expected space after '+'")
      end

    items << value
  end
  items
end
969
+
970
# Parse the block value that starts at column `indent` of the current line.
# Moves @pos to that column, then dispatches to the list parser when the
# line opens with a '+' marker and to the table parser otherwise.
def parse_t1_block_value(indent, path_prefix)
  @pos = @line_start + indent
  starts_list = @src.getbyte(@pos) == PLUS && peek_after_plus?
  return parse_t1_list_block(indent, path_prefix) if starts_list

  parse_t1_table_block(indent, path_prefix)
end
977
+
978
# Parse the value that follows a "+ " list marker on the same line.
#
# list_indent - column of the enclosing list's '+' markers; a child block
#               must be indented strictly deeper than this.
# path_prefix - path segments identifying this list item.
#
# When the rest of the line looks like `key: ...`, the result is a
# single-entry Hash { key => value }. Only that first pair is read here:
# the method returns right after it and does not look for sibling keys on
# following lines. Otherwise the result is the inline scalar/flow value.
#
# Raises DecodeError for a missing ':' / whitespace or a missing child block.
def parse_t1_list_item_value(list_indent, path_prefix)
  unless line_starts_kvpair_t1?
    # Scalar or flow value, optionally followed by trailing decorators.
    v = parse_t1_inline_value(path_prefix)
    skip_inline_ws
    parse_t1_trailing_decorators(path_prefix) if sigil_at?(@pos)
    skip_inline_ws
    consume_eol
    return v
  end

  key = parse_key_t1
  raise DecodeError.new(@line, col, "expected ':' after key") if @src.getbyte(@pos) != COLON
  @pos += 1
  current_path = path_prefix + [{ "key" => key }]

  b = @src.getbyte(@pos)
  if b == SP || b == TAB
    @pos += 1
    skip_inline_ws
  elsif !(b.nil? || b == LF || b == CR)
    raise DecodeError.new(@line, col, "expected whitespace after ':'")
  end

  nb = @src.getbyte(@pos)
  v =
    if nb.nil? || nb == LF || nb == CR
      # `key:` with nothing after it: the value is an indented child block.
      # The EOF guard applies whether or not whitespace followed the colon,
      # so both spellings reject a missing block the same way.
      consume_eol
      skip_trivia
      raise DecodeError.new(@line, col, "expected indented child block") if eof?
      child_indent = measure_line_indent
      raise DecodeError.new(@line, col, "expected indented child block") if child_indent <= list_indent
      parse_t1_block_value(child_indent, current_path)
    elsif sigil_at?(@pos)
      decorated, = parse_t1_inner_and_value(current_path)
      skip_inline_ws
      consume_eol
      decorated
    else
      inline = parse_t1_inline_value(current_path)
      skip_inline_ws
      parse_t1_trailing_decorators(current_path) if sigil_at?(@pos)
      skip_inline_ws
      consume_eol
      inline
    end

  { key => v }
end
1025
+
1026
# Parse one or more inner decorator calls, then the value (if any) that
# follows them on the same line.
#
# Expects @pos to sit on the first sigil. Every call is recorded against
# `path` with position :inner. Trailing decorators after an explicit value
# are consumed here as well.
#
# Returns [value, has_explicit_value]. When the line ends right after the
# decorators, value is the decoration-only placeholder and the flag is false.
def parse_t1_inner_and_value(path)
  until eof? || !sigil_at?(@pos)
    sigil, fn_name, ns, params = parse_decorator_call
    family, canonical_fn = resolve_call(sigil, fn_name, ns)
    add_decorator_call(path, sigil, family, canonical_fn, ns, :inner, params)
    skip_inline_ws
  end

  next_byte = @src.getbyte(@pos)
  if next_byte.nil? || next_byte == LF || next_byte == CR
    return [parse_decoration_only_placeholder(path), false]
  end

  value = parse_t1_inline_value(path)
  skip_inline_ws
  parse_t1_trailing_decorators(path) if sigil_at?(@pos)
  [value, true]
end
1052
+
1053
# Value used for a node that carries decorators but no explicit value.
# Always returns a fresh empty table; `path` is accepted but not inspected,
# so no "both forms present" validation happens here.
def parse_decoration_only_placeholder(path)
  Hash.new
end
1059
+
1060
# Consume every decorator call starting at @pos and record each against
# `path` with position :trailing. Inline whitespace after each call is
# skipped; parsing stops at EOF or at the first non-sigil byte.
def parse_t1_trailing_decorators(path)
  until eof? || !sigil_at?(@pos)
    sigil, fn_name, ns, params = parse_decorator_call
    family, canonical_fn = resolve_call(sigil, fn_name, ns)
    add_decorator_call(path, sigil, family, canonical_fn, ns, :trailing, params)
    skip_inline_ws
  end
end
1068
+
1069
# Parse a decorator call of the form: sigil-run name[.name] [(params)]*
#
# Returns [sigil, fn_name, ns, params]:
#   sigil   - the run of sigil atoms, as a UTF-8 String
#   fn_name - the function name (the part after '.' when a '.' is present)
#   ns      - the part before '.', or nil when the name is unqualified
#   params  - Array of ParamGroup; a call without any parenthesised group,
#             and an empty "()", each yield one empty :named group
#
# Raises DecodeError on an empty sigil, a missing name, or an unclosed group.
def parse_decorator_call
  # Sigil: as many consecutive atoms as Tier1.lex_sigil_atom_at recognises.
  sigil_from = @pos
  until eof?
    atom_len = Tier1.lex_sigil_atom_at(@src, @pos)
    break if atom_len.nil?
    @pos += atom_len
  end
  sigil = @src.byteslice(sigil_from, @pos - sigil_from).force_encoding("UTF-8")
  raise DecodeError.new(@line, col, "empty decorator sigil") if sigil.empty?

  # Name: either `name` or `ns.name`.
  first_name = parse_bare_ident
  raise DecodeError.new(@line, col, "expected decorator name after sigil '#{sigil}'") if first_name.empty?

  ns = nil
  fn_name = first_name
  if @src.getbyte(@pos) == DOT
    @pos += 1 # step over '.'
    qualified = parse_bare_ident
    if qualified.empty?
      raise DecodeError.new(@line, col, "expected name after '.' in decorator call")
    end
    ns = first_name
    fn_name = qualified
  end

  # Zero or more parenthesised parameter groups.
  params = []
  skip_inline_ws
  while @src.getbyte(@pos) == LPAREN
    @pos += 1 # step over '('
    skip_flow_ws_t1
    if @src.getbyte(@pos) == RPAREN
      @pos += 1
      params << ParamGroup.new(:named, {})
    else
      params << parse_param_group
      skip_flow_ws_t1
      raise DecodeError.new(@line, col, "expected ')' after params") unless @src.getbyte(@pos) == RPAREN
      @pos += 1
    end
    skip_inline_ws
  end

  params << ParamGroup.new(:named, {}) if params.empty?

  [sigil, fn_name, ns, params]
end
1127
+
1128
# Parse the contents of one parameter group; the opening '(' has already
# been consumed and the closing ')' is left for the caller.
#
# The group is treated as named when it starts with `ident:` (see
# looks_like_named_params?), otherwise as positional.
#
# Returns ParamGroup.new(:named, Hash) or ParamGroup.new(:positional, Array).
# Raises DecodeError on a malformed key or separator.
def parse_param_group
  skip_flow_ws_t1

  if looks_like_named_params?
    named = {}
    loop do
      skip_flow_ws_t1
      head = @src.getbyte(@pos)
      break if head.nil? || head == RPAREN

      name = parse_bare_ident
      raise DecodeError.new(@line, col, "expected key in named params") if name.empty?
      skip_flow_ws_t1
      raise DecodeError.new(@line, col, "expected ':' after param key") unless @src.getbyte(@pos) == COLON
      @pos += 1
      skip_flow_ws_t1
      named[name] = parse_t1_flow_value([])
      skip_flow_ws_t1

      sep = @src.getbyte(@pos)
      break if sep.nil? || sep == RPAREN
      unless sep == COMMA
        raise DecodeError.new(@line, col, "expected ',' or ')' in named params, got '#{sep.chr}'")
      end
      @pos += 1
    end
    ParamGroup.new(:named, named)
  else
    values = []
    loop do
      skip_flow_ws_t1
      head = @src.getbyte(@pos)
      break if head.nil? || head == RPAREN

      values << parse_t1_flow_value([])
      skip_flow_ws_t1

      sep = @src.getbyte(@pos)
      break if sep.nil? || sep == RPAREN
      raise DecodeError.new(@line, col, "expected ',' or ')' in positional params") unless sep == COMMA
      @pos += 1
    end
    ParamGroup.new(:positional, values)
  end
end
1183
+
1184
# Look ahead from @pos (without moving it) and report whether the input
# reads as `ident <spaces/tabs> :`, i.e. the start of a named parameter.
# Identifier bytes are those flagged in BARE_KEY_BYTE plus any byte >= 128.
def looks_like_named_params?
  cursor = @pos
  while cursor < @len
    byte = @src.getbyte(cursor)
    break if byte.nil? || !(BARE_KEY_BYTE[byte] || byte >= 128)
    cursor += 1
  end
  return false if cursor == @pos # nothing identifier-like here

  cursor += 1 while cursor < @len && [SP, TAB].include?(@src.getbyte(cursor))

  cursor < @len && @src.getbyte(cursor) == COLON
end
1204
+
1205
# Parse one value in flow context (decorator params, flow arrays/tables):
# '[' opens a flow array, '{' a flow table, anything else is handed to the
# raw inline-value parser.
def parse_t1_flow_value(path)
  case @src.getbyte(@pos)
  when LBRACK then parse_t1_flow_array(path)
  when LBRACE then parse_t1_flow_table(path)
  else parse_t1_inline_value_raw
  end
end
1215
+
1216
# Parse the inline value at @pos. Flow arrays and flow tables are parsed
# by the tier-1 flow parsers (with `path` as their base path); every other
# value goes through the raw inline-value parser.
def parse_t1_inline_value(path)
  opener = @src.getbyte(@pos)
  return parse_t1_flow_array(path) if opener == LBRACK
  return parse_t1_flow_table(path) if opener == LBRACE

  parse_t1_inline_value_raw
end
1226
+
1227
# Parse a flow array `[ ... ]`, recording inner decorators found in front
# of elements. Expects @pos on the opening '['.
#
# Each element is addressed as path + [{ "index" => n }]. An element made
# of decorators only (followed directly by ',', ']' or EOF) becomes an
# empty table.
#
# Returns the Array of element values. Raises DecodeError when the array is
# unterminated or an unexpected byte follows an element.
def parse_t1_flow_array(path)
  @pos += 1 # step over '['
  elements = []
  loop do
    skip_flow_ws_t1
    if @src.getbyte(@pos) == RBRACK
      @pos += 1
      return elements
    end

    element_path = path + [{ "index" => elements.length }]

    # Inner decorators preceding the element value, if any.
    decorated = false
    while sigil_at?(@pos)
      decorated = true
      sigil, fn_name, ns, params = parse_decorator_call
      family, canonical_fn = resolve_call(sigil, fn_name, ns)
      add_decorator_call(element_path, sigil, family, canonical_fn, ns, :inner, params)
      skip_inline_ws
    end

    stop = @src.getbyte(@pos)
    if decorated && (stop == COMMA || stop == RBRACK || stop.nil?)
      elements << {} # decoration-only element
    else
      elements << parse_t1_flow_value(element_path)
    end

    skip_flow_ws_t1
    case (sep = @src.getbyte(@pos))
    when COMMA
      @pos += 1
    when RBRACK
      @pos += 1
      return elements
    when nil
      raise DecodeError.new(@line, col, "unterminated flow array")
    else
      raise DecodeError.new(@line, col, "unexpected '#{sep.chr}' in flow array")
    end
  end
end
1280
+
1281
# Parse a flow table `{ key: value, ... }`. Expects @pos on the opening '{'.
#
# Each value is parsed with path + [{ "key" => key }]. A repeated key
# overwrites the earlier value.
#
# Returns the Hash of entries. Raises DecodeError on a missing ':', an
# unterminated table, or an unexpected byte after a value.
def parse_t1_flow_table(path)
  @pos += 1 # step over '{'
  table = {}
  loop do
    skip_flow_ws_t1
    if @src.getbyte(@pos) == RBRACE
      @pos += 1
      return table
    end

    key = parse_key_t1
    skip_flow_ws_t1
    raise DecodeError.new(@line, col, "expected ':' after flow-table key") if @src.getbyte(@pos) != COLON
    @pos += 1
    skip_flow_ws_t1
    table[key] = parse_t1_flow_value(path + [{ "key" => key }])

    skip_flow_ws_t1
    case (sep = @src.getbyte(@pos))
    when COMMA
      @pos += 1
    when RBRACE
      @pos += 1
      return table
    when nil
      raise DecodeError.new(@line, col, "unterminated flow table")
    else
      raise DecodeError.new(@line, col, "unexpected '#{sep.chr}' in flow table")
    end
  end
end
1313
+
1314
+ # ── Low-level helpers ────────────────────────────────────────────────
1315
+
1316
# Parse one non-flow inline value by handing the source and the current
# cursor to an InlineValueParser, then adopt the position, line number and
# line start it reports. Returns the parsed value.
def parse_t1_inline_value_raw
  sub = InlineValueParser.new(@src, @pos, @line, @line_start, self)
  sub.parse_one_value.tap do
    @pos = sub.pos
    @line = sub.line
    @line_start = sub.line_start
  end
end
1328
+
1329
# Record a decorator call in the sidecar. Calls are grouped per path (one
# DecoratorEntry per distinct path key, created on first use) and, inside
# an entry, per sigil in the order they were encountered.
def add_decorator_call(path, sigil, family, fn_name, ns, position, params)
  entry = (@dec_entries[path_to_key(path)] ||= DecoratorEntry.new(path.dup, {}, []))
  call = DecoratorCall.new(sigil, family, fn_name, ns, position, params, [])
  (entry.calls[sigil] ||= []) << call
end
1340
+
1341
# Resolve a decorator call against this document's imports by delegating
# to Tier1.resolve_family. Callers destructure the result as
# [family, canonical_fn].
def resolve_call(sigil, fn_name, ns)
  Tier1.resolve_family(
    sigil,
    fn_name,
    ns,
    @imports
  )
end
1344
+
1345
# Build the String under which a path's decorator entry is stored.
#
# Key segments render as "k:<key>", index segments as "i:<index>", joined
# with "/". Any "/" or "\" inside a key is backslash-escaped so that a key
# containing a slash can never produce the same string as a nested path
# (e.g. the single key "a/k:b" versus the keys "a" then "b"). Keys without
# those two characters render exactly as before.
def path_to_key(path)
  segments = path.map do |seg|
    if seg.key?("key")
      "k:#{seg["key"].to_s.gsub(%r{[\\/]}) { |ch| "\\#{ch}" }}"
    else
      "i:#{seg["index"]}"
    end
  end
  segments.join("/")
end
1348
+
1349
# Skip everything that carries no content: inline whitespace, LF and CRLF
# line ends, '#' and '//' line comments, and '/* ... */' block comments.
# A CR that is not followed by LF stops the scan.
def skip_trivia
  loop do
    skip_inline_ws
    lead = @src.getbyte(@pos)
    follow = @src.getbyte(@pos + 1)

    if lead == LF
      @pos += 1
      advance_line
    elsif lead == CR && follow == LF
      @pos += 2
      advance_line
    elsif lead == HASH || (lead == 0x2F && follow == 0x2F) # '#' or '//'
      skip_line_comment
    elsif lead == 0x2F && follow == 0x2A # '/*'
      skip_block_comment
    else
      break
    end
  end
end
1368
+
1369
# Advance @pos past any run of spaces and tabs (never past a line end).
def skip_inline_ws
  loop do
    byte = @src.getbyte(@pos)
    break unless byte == SP || byte == TAB
    @pos += 1
  end
end
1372
+
1373
# Consume a line comment through the end of its line. LF, CR and CRLF all
# terminate the comment and advance the line counter; hitting EOF first
# simply ends the scan.
def skip_line_comment
  while @pos < @len
    byte = @src.getbyte(@pos)
    @pos += 1
    next unless byte == LF || byte == CR

    @pos += 1 if byte == CR && @src.getbyte(@pos) == LF
    advance_line
    break
  end
end
1385
+
1386
# Consume a '/* ... */' comment; @pos must be on the opening '/*'.
# Line ends inside the comment (LF, CR, CRLF) advance the line counter.
# Raises DecodeError when EOF is reached before the closing '*/'.
def skip_block_comment
  @pos += 2 # step over '/*'
  loop do
    raise DecodeError.new(@line, col, "unterminated block comment") if eof?

    byte = @src.getbyte(@pos)
    if byte == 0x2A && @src.getbyte(@pos + 1) == 0x2F # '*/'
      @pos += 2
      break
    end

    @pos += 1
    next unless byte == LF || byte == CR

    @pos += 1 if byte == CR && @src.getbyte(@pos) == LF
    advance_line
  end
end
1406
+
1407
# Skip whitespace inside flow constructs: spaces, tabs, LF and CRLF (line
# ends advance the line counter). Comments are not skipped, and a CR
# without a following LF stops the scan.
def skip_flow_ws_t1
  loop do
    case @src.getbyte(@pos)
    when SP, TAB
      @pos += 1
    when LF
      @pos += 1
      advance_line
    when CR
      break unless @src.getbyte(@pos + 1) == LF
      @pos += 2
      advance_line
    else
      break
    end
  end
end
1421
+
1422
# Consume one line terminator (LF, CR, or CRLF) at @pos and advance the
# line counter. Returns true when a terminator was consumed, false when
# @pos is not on one (in which case nothing changes).
def consume_eol
  byte = @src.getbyte(@pos)
  return false unless byte == LF || byte == CR

  @pos += 1
  @pos += 1 if byte == CR && @src.getbyte(@pos) == LF
  advance_line
  true
end
1434
+
1435
# Register that @pos now sits at the start of a new line: bump the line
# number and remember @pos as the byte offset where the line begins.
def advance_line
  @line = @line.succ
  @line_start = @pos
end
1439
+
1440
# True once the cursor has reached (or passed) the end of the source.
def eof?
  @len <= @pos
end
1443
+
1444
# 1-based column of @pos on the current line, counted in bytes.
def col
  @pos - @line_start + 1
end
1448
+
1449
# True when the bytes at @pos equal the string `s`.
def starts_bytes?(s)
  window = @src.byteslice(@pos, s.bytesize)
  window == s
end
1452
+
1453
# Count the space characters at the start of the current line (from
# @line_start). Tabs are not counted as indentation.
def measure_line_indent
  cursor = @line_start
  cursor += 1 while cursor < @len && @src.getbyte(cursor) == SP
  cursor - @line_start
end
1461
+
1462
# With @pos on a '+', report whether it can be a list marker: the next
# byte must be a space, a tab, a line end, or EOF.
def peek_after_plus?
  [nil, SP, TAB, LF, CR].include?(@src.getbyte(@pos + 1))
end
1466
+
1467
# Parse a key at @pos, choosing the reader from its first byte:
# double-quoted, single-quoted, or bare.
def parse_key_t1
  case @src.getbyte(@pos)
  when DQUOTE then parse_dquote_key
  when SQUOTE then parse_squote_key
  else parse_bare_ident_key
  end
end
1477
+
1478
# Read a bare key: the longest run of bytes that are either flagged in
# BARE_KEY_BYTE or non-ASCII (>= 128). Returns it as a UTF-8 String and
# leaves @pos just after it. Raises DecodeError when the run is empty.
def parse_bare_ident_key
  first = @pos
  cursor = first
  while cursor < @len
    byte = @src.getbyte(cursor)
    break unless byte && (byte >= 128 || BARE_KEY_BYTE[byte])
    cursor += 1
  end
  raise DecodeError.new(@line, col, "expected key") if cursor == first

  @pos = cursor
  @src.byteslice(first, cursor - first).force_encoding("UTF-8")
end
1489
+
1490
# Read an ASCII-only bare identifier: the longest run of bytes below 128
# that are flagged in BARE_KEY_BYTE. Returns it as a UTF-8 String (possibly
# empty; no error is raised) and leaves @pos just after it.
def parse_bare_ident
  first = @pos
  cursor = first
  while cursor < @len
    byte = @src.getbyte(cursor)
    break if byte.nil? || byte >= 128 || !BARE_KEY_BYTE[byte]
    cursor += 1
  end
  @pos = cursor
  @src.byteslice(first, cursor - first).force_encoding("UTF-8")
end
1500
+
1501
# Read a double-quoted key; @pos must be on the opening '"'. Backslash
# escapes are decoded through parse_escape, every other byte is copied
# verbatim. Returns the key text and leaves @pos after the closing quote.
# Raises DecodeError when EOF or a LF is reached before the closing quote.
def parse_dquote_key
  @pos += 1 # step over the opening '"'
  key = +""
  loop do
    byte = @src.getbyte(@pos)
    raise DecodeError.new(@line, col, "unterminated quoted key") if byte.nil? || byte == LF

    case byte
    when 0x5C # backslash
      @pos += 1
      key << parse_escape
    when DQUOTE
      @pos += 1
      return key
    else
      key << @src.byteslice(@pos, 1).force_encoding("UTF-8")
      @pos += 1
    end
  end
end
1519
+
1520
# Read a single-quoted (literal) key; @pos must be on the opening "'".
# No escapes are processed. Returns the text between the quotes as UTF-8
# and leaves @pos after the closing quote. Raises DecodeError when EOF or
# a LF is reached before the closing quote.
def parse_squote_key
  @pos += 1 # step over the opening "'"
  first = @pos
  until (byte = @src.getbyte(@pos)) == SQUOTE
    raise DecodeError.new(@line, col, "unterminated literal key") if byte.nil? || byte == LF
    @pos += 1
  end
  key = @src.byteslice(first, @pos - first).force_encoding("UTF-8")
  @pos += 1
  key
end
1534
+
1535
# Decode the escape whose selector byte is at @pos (the backslash itself
# has already been consumed by the caller) and return the replacement text.
#
# Recognised selectors: b f n r t " \ . Any other byte is passed through
# unchanged together with its backslash. That pass-through is built as a
# UTF-8 string even for bytes >= 128, so the caller can keep appending the
# rest of a multi-byte character to its UTF-8 buffer.
#
# Raises DecodeError when the input ends right after the backslash.
def parse_escape
  b = @src.getbyte(@pos)
  raise DecodeError.new(@line, col, "unterminated escape sequence in quoted key") if b.nil?

  @pos += 1
  case b
  when 0x62 then "\b"
  when 0x66 then "\f"
  when 0x6E then "\n"
  when 0x72 then "\r"
  when 0x74 then "\t"
  when DQUOTE then "\""
  when 0x5C then "\\"
  else [0x5C, b].pack("C2").force_encoding("UTF-8")
  end
end
1549
+
1550
# Look ahead from @pos (without moving it) and report whether the text
# reads as `key:` followed by whitespace, a line end, or EOF.
#
# The key may be double-quoted (backslash skips the next byte), single-
# quoted, or a run of ASCII bytes flagged in BARE_KEY_BYTE. A quoted key
# that hits a line end before its closing quote yields false.
def line_starts_kvpair_t1?
  cursor = @pos

  case @src.getbyte(cursor)
  when DQUOTE
    cursor += 1
    while cursor < @len
      case @src.getbyte(cursor)
      when 0x5C # backslash: skip the escaped byte too
        cursor += 2
      when DQUOTE
        cursor += 1
        break
      when LF, CR
        return false
      else
        cursor += 1
      end
    end
  when SQUOTE
    cursor += 1
    while cursor < @len
      case @src.getbyte(cursor)
      when SQUOTE
        cursor += 1
        break
      when LF, CR
        return false
      else
        cursor += 1
      end
    end
  else
    key_start = cursor
    while cursor < @len
      byte = @src.getbyte(cursor)
      break if byte.nil? || byte >= 128 || !BARE_KEY_BYTE[byte]
      cursor += 1
    end
    return false if cursor == key_start
  end

  return false if cursor >= @len || @src.getbyte(cursor) != COLON

  [nil, SP, TAB, LF, CR].include?(@src.getbyte(cursor + 1))
end
1587
+ end
1588
+
1589
# ── InlineValueParser ────────────────────────────────────────────────────
#
# Parses exactly one inline value starting at a given byte offset of the
# source by handing the remainder of the source to a SingleValueParser,
# then translates the number of bytes it consumed back into an absolute
# position, line number, and line-start offset.

class InlineValueParser
  # pos        - absolute byte offset just past the parsed value
  # line       - line number at pos
  # line_start - absolute byte offset where that line begins
  attr_reader :pos, :line, :line_start

  # src              - the full document source
  # start_pos        - byte offset at which the value begins
  # start_line       - line number at start_pos
  # start_line_start - byte offset of the start of that line
  # outer            - the parser that created this object (stored only)
  def initialize(src, start_pos, start_line, start_line_start, outer)
    @src = src
    @pos = start_pos
    @line = start_line
    @line_start = start_line_start
    @outer = outer
    @len = src.bytesize
  end

  # Parse one value at the current position and return it. Afterwards
  # pos / line / line_start describe the location right after the value.
  def parse_one_value
    tail = @src.byteslice(@pos, @len - @pos).force_encoding("UTF-8")
    reader = SingleValueParser.new(tail)
    value = reader.parse_value_at_start
    consumed = reader.consumed

    # Line bookkeeping is derived from the consumed bytes only. The chunk
    # is viewed as binary so that rindex yields a byte offset (on a UTF-8
    # string it would be a character index), and no frozen literal is
    # mutated along the way.
    chunk = @src.byteslice(@pos, consumed).b
    newlines = chunk.count("\n")
    if newlines.positive?
      @line += newlines
      @line_start = @pos + chunk.rindex("\n") + 1
    end

    @pos += consumed
    value
  end
end
1639
+
1640
# SingleValueParser: parses one value from the beginning of a string with
# a tier-0 Dms::Parser (lite mode) and reports how many bytes were used.
class SingleValueParser
  # Number of bytes of the input consumed by the last parse (0 before any).
  attr_reader :consumed

  def initialize(src)
    @src = src
    @parser = nil
    @consumed = 0
  end

  # Parse the value at the start of the input and return it. The consumed
  # byte count is taken from the tier-0 parser's @pos once it has finished.
  def parse_value_at_start
    tier0 = Dms::Parser.new(@src, lite: true)
    tier0.parse_inline_value_or_heredoc.tap do
      @consumed = tier0.instance_variable_get(:@pos)
    end
  end
end
1664
+
1665
+ # ── JSON emission ────────────────────────────────────────────────────────
1666
+
1667
# Build the JSON-ready Hash for a tier-1 document.
#
# doc_t1 - object exposing observed_tier, imports, t0.body and decorators.
# tag_fn - callable applied to the body (and, via entry_to_json, to
#          decorator parameter values) to produce their tagged form.
#
# Returns a Hash with the keys "tier", "imports", "body", "decorators".
def self.emit_t1_json(doc_t1, tag_fn)
  imports = doc_t1.imports.map { |import| import_to_json(import) }
  body = tag_fn.call(doc_t1.t0.body)
  decorators = doc_t1.decorators.map { |dec_entry| entry_to_json(dec_entry, tag_fn) }

  result = { "tier" => doc_t1.observed_tier }
  result["imports"] = imports
  result["body"] = body
  result["decorators"] = decorators
  result
end
1679
+
1680
# Convert one import record into a JSON-ready Hash. The bind / allow /
# deny / alias mappings are copied entry by entry into fresh Hashes; the
# scalar fields (dialect, version, ns) are passed through as they are.
def self.import_to_json(imp)
  copy_pairs = lambda do |pairs|
    pairs.each_with_object({}) { |(name, value), out| out[name] = value }
  end

  bind_json = copy_pairs.call(imp.bind)
  allow_json = copy_pairs.call(imp.allow)
  deny_json = copy_pairs.call(imp.deny)
  alias_json = copy_pairs.call(imp.alias_map)

  {
    "dialect" => imp.dialect,
    "version" => imp.version,
    "ns" => imp.ns,
    "bind" => bind_json,
    "allow" => allow_json,
    "deny" => deny_json,
    "alias" => alias_json
  }
end
1705
+
1706
# Convert one decorator entry into a JSON-ready Hash with the keys "path",
# "calls" and "comments".
#
# Each path segment is reduced to either { "key" => ... } or
# { "index" => ... }. Calls stay grouped per sigil and are converted with
# call_to_json. "comments" is always emitted as an empty Array.
def self.entry_to_json(entry, tag_fn)
  path_json = entry.path.map do |seg|
    field = seg.key?("key") ? "key" : "index"
    { field => seg[field] }
  end

  calls_json = entry.calls.each_with_object({}) do |(sigil, calls), out|
    out[sigil] = calls.map { |call| call_to_json(call, tag_fn) }
  end

  { "path" => path_json, "calls" => calls_json, "comments" => [] }
end
1726
+
1727
# Convert one decorator call into a JSON-ready Hash. Parameter groups are
# converted with param_group_to_json, the position symbol is emitted as a
# String, and "params_dec" is always emitted as an empty Array.
def self.call_to_json(call, tag_fn)
  groups = call.params.map { |group| param_group_to_json(group, tag_fn) }

  json = {}
  json["family"] = call.family
  json["fn"] = call.fn_name
  json["ns"] = call.ns
  json["position"] = call.position.to_s
  json["params"] = groups
  json["params_dec"] = []
  json
end
1738
+
1739
# Convert one parameter group into a JSON-ready Hash, passing every value
# through tag_fn. A :named group keeps its keys; a :positional group keeps
# its order. Any other kind yields nil.
def self.param_group_to_json(pg, tag_fn)
  case pg.kind
  when :named
    { "kind" => "named", "value" => pg.value.transform_values { |v| tag_fn.call(v) } }
  when :positional
    tagged = pg.value.map { |v| tag_fn.call(v) }
    { "kind" => "positional", "value" => tagged }
  end
end
1749
+ end
1750
+ end