dms-parser 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,674 @@
1
+ # frozen_string_literal: true
2
+
3
+ # DMS round-trip emitter (SPEC §encode).
4
+ #
5
+ # Re-emits a parsed `Document` as DMS source. Contract:
6
+ #
7
+ # Dms.decode_document(Dms.encode(Dms.decode_document(source)))
8
+ #
9
+ # is data-equivalent to `Dms.decode_document(source)`, has the same
10
+ # comments at the same attached paths, and uses the same literal forms
11
+ # (integer base, string flavor, heredoc label + modifiers) for values
12
+ # that recorded an `OriginalLiteral`. Float formatting is shortest-
13
+ # round-trip; indentation is 2 spaces; lists and tables default to block
14
+ # form.
15
+ #
16
+ # Round-trip stability: `encode(decode_document(encode(decode_document(src))))`
17
+ # is byte-equal to `encode(decode_document(src))`.
18
+ #
19
+ # Mirrors the Python reference at dms/emitter.py and the Rust reference
20
+ # at language/rust/crates/dms/src/lib.rs.
21
+
22
+ require "dms/types"
23
+
24
+ module Dms
25
+ INDENT_STR = " "
26
+
27
# Depth-first search for an `UnorderedHash` anywhere inside `v`.
#
# `Emitter._encode_full` uses this to reject documents that contain
# unordered tables before attempting a full-mode round-trip emit.
#
# v - the value to inspect; hashes and arrays are searched recursively,
#     anything else is treated as a leaf.
#
# Returns true when `v` itself or any nested value is an `UnorderedHash`,
# false otherwise.
def self._contains_unordered_table?(v)
  return true if v.is_a?(UnorderedHash)

  children =
    if v.is_a?(Hash)
      v.each_value
    elsif v.is_a?(Array)
      v.each
    end
  return false unless children

  children.any? { |child| _contains_unordered_table?(child) }
end
44
+
45
+ class Emitter
46
+ attr_reader :out
47
+
48
+ # ----- canonical entry points (SPEC §encode) ------------------
49
+
50
# Full-mode round-trip emit (SPEC §encode).
#
# doc - the parsed Document to re-emit; its `body` is scanned for
#       `Dms::UnorderedHash` values first.
#
# Returns the emitted DMS source as a single String.
# Raises EncodeError when the body contains an unordered table, since
# such tables have no stable iteration order to round-trip.
def self._encode_full(doc)
  unordered = Dms._contains_unordered_table?(doc.body)
  if unordered
    message =
      "encode (full-mode round-trip) refuses Document with " \
      "Dms::UnorderedHash; unordered tables have arbitrary " \
      "iteration order — use Dms.encode_lite instead. " \
      "See SPEC §\"Unordered tables\"."
    raise EncodeError, message
  end

  new(doc).tap(&:emit_document).out.join
end
64
+
65
# Canonical-form emit (SPEC §encode).
#
# Builds the emitter with `lite: true`, which leaves the comment and
# original-form lookups empty: no comments are written, integers go out
# in decimal and strings in basic-quoted form, even when `doc` carries
# comments or `original_forms`. The result is therefore data-equivalent
# to `doc` on re-parse but intentionally lossy for comments and source
# forms.
#
# doc - the parsed Document to emit.
#
# Returns the emitted DMS source as a single String.
def self._encode_lite(doc)
  new(doc, lite: true).tap(&:emit_document).out.join
end
77
+
78
# Comments attached to one node, grouped by where they sit relative to
# it: leading, inner, trailing or floating.
NodeComments = Struct.new(:leading, :inner, :trailing, :floating) do
  # Returns a bucket whose four slots are fresh, independent empty arrays.
  def self.empty
    new(*Array.new(4) { [] })
  end
end
84
+
85
# Prepares an emitter for `doc`.
#
# doc  - the Document to emit.
# lite - when true (canonical-form emit) the comment and original-form
#        lookups are left empty even if `doc.comments` /
#        `doc.original_forms` are populated, so the walker writes no
#        comments and uses canonical integer/string forms.
#
# In full mode three lookups are built:
#   @comments_by_path         - path => NodeComments bucket
#   @forms_by_path            - path => original literal (first one wins)
#   @descendant_comment_paths - every comment path, scanned by
#                               `flow_safe?` to force block form
# Paths that are not already Arrays are converted with `to_a`.
def initialize(doc, lite: false)
  @doc = doc
  @out = []
  @lite = lite
  @comments_by_path = {}
  @forms_by_path = {}
  @descendant_comment_paths = []
  return if @lite

  doc.comments.each do |attached|
    key = attached.path.is_a?(Array) ? attached.path : attached.path.to_a
    @descendant_comment_paths << key
    bucket = (@comments_by_path[key] ||= NodeComments.empty)
    slot =
      case attached.position
      when :leading then bucket.leading
      when :inner then bucket.inner
      when :trailing then bucket.trailing
      else bucket.floating
      end
    slot << attached.comment
  end

  doc.original_forms.each do |path, literal|
    key = path.is_a?(Array) ? path : path.to_a
    @forms_by_path[key] ||= literal
  end
end
124
+
125
+ # ----- helpers -------------------------------------------------
126
+
127
# Appends one chunk of output text to the buffer.
#
# s - the chunk to append.
def push(s)
  @out.push(s)
end
130
+
131
# Appends `indent` levels of INDENT_STR to the buffer.
# Nothing is appended when `indent` is zero or negative.
def push_indent(indent)
  return unless indent.positive?

  @out << (INDENT_STR * indent)
end
134
+
135
# Emits the whole document into the output buffer: an optional `+++`
# front-matter block followed by the body.
#
# The front-matter block is written when `@doc.meta` is set, or (full
# mode only) when some comment's path starts with "__fm__". Lite mode
# writes no comments, so comments alone never force the block there.
#
# The body is emitted as a table block, a list block, or — for any other
# root value — a single inline value surrounded by its leading, trailing
# and floating comments.
def emit_document
  fm_comments =
    if @lite
      false
    else
      @doc.comments.any? do |attached|
        segments = attached.path.is_a?(Array) ? attached.path : attached.path.to_a
        segments.first == "__fm__"
      end
    end

  meta = @doc.meta
  if !meta.nil? || fm_comments
    push("+++\n")
    fm_path = ["__fm__"].freeze
    if meta.nil?
      # No metadata table: only the comments attached to the block remain.
      emit_floating(fm_path, 0)
    else
      emit_table_block(meta, fm_path, 0)
    end
    push("+++\n\n")
  end

  root = @doc.body
  root_path = [].freeze
  case root
  when Hash
    emit_table_block(root, root_path, 0)
  when Array
    emit_list_block(root, root_path, 0)
  else
    # Scalar root.
    bucket = @comments_by_path[root_path]
    bucket.leading.each { |c| emit_comment_line(c, 0) } if bucket
    emit_value_inline(root, root_path)
    emit_trailing_for(root_path)
    push("\n")
    bucket.floating.each { |c| emit_comment_line(c, 0) } if bucket
  end
end
174
+
175
+ # ----- block emitters ------------------------------------------
176
+
177
# Emits table `t` in block form, one `key: value` entry per line.
#
# t      - the table (Hash) to emit.
# path   - path of the table itself; used to look up comments and
#          original forms. Passed as nil (and never read) in lite mode.
# indent - nesting depth, in INDENT_STR units.
#
# Lite mode takes a fast path: the comment/form lookups are empty by
# construction, so child paths, lookups and the flow-safety check are
# skipped, and string/boolean/integer values are written directly
# without going through `emit_value_inline`.
def emit_table_block(t, path, indent)
  if @lite
    margin = INDENT_STR * indent
    t.each do |key, value|
      # Plain ASCII identifier keys need no quoting; anything else goes
      # through the general key formatter.
      label =
        if key.is_a?(String) && key =~ /\A[A-Za-z_][A-Za-z0-9_-]*\z/
          key
        else
          format_key(key)
        end

      case value
      when String
        # Only strings containing a backslash, quote or control character
        # need the escaper.
        if value =~ /[\\"\x00-\x1F]/
          @out << "#{margin}#{label}: \"#{Emitter.escape_basic(value)}\"\n"
        else
          @out << "#{margin}#{label}: \"#{value}\"\n"
        end
      when true
        @out << "#{margin}#{label}: true\n"
      when false
        @out << "#{margin}#{label}: false\n"
      when Integer
        @out << "#{margin}#{label}: #{value}\n"
      else
        push_indent(indent)
        push(label)
        push(":")
        if (value.is_a?(Hash) || value.is_a?(Array)) && !value.empty?
          # Non-empty containers always go out in block form.
          push("\n")
          if value.is_a?(Hash)
            emit_table_block(value, nil, indent + 1)
          else
            emit_list_block(value, nil, indent + 1)
          end
        else
          push(" ")
          emit_value_inline(value, nil)
          push("\n")
        end
      end
    end
    return
  end

  t.each do |key, value|
    child_path = (path + [key]).freeze
    bucket = @comments_by_path[child_path]
    bucket.leading.each { |c| emit_comment_line(c, indent) } if bucket

    trailing_present = bucket && !bucket.trailing.empty?
    inner_present = has_inner?(child_path)
    container = (value.is_a?(Hash) || value.is_a?(Array)) && !value.empty?
    # A non-empty container stays in block form unless it carries a
    # trailing comment and can safely be written in flow form.
    block_form = container && (!trailing_present || !flow_safe?(value, child_path))

    push_indent(indent)
    push(format_key(key))
    push(":")
    if block_form
      # NOTE(review): no trailing comment is emitted on this branch,
      # whereas emit_list_block emits trailing comments before descending
      # into a nested container — confirm this difference is intended.
      if inner_present
        push(" ")
        emit_inner_for(child_path)
        # emit_inner_for leaves a trailing space; trim it.
        @out.pop if @out.last == " "
      end
      push("\n")
      if value.is_a?(Hash)
        emit_table_block(value, child_path, indent + 1)
      else
        emit_list_block(value, child_path, indent + 1)
      end
    else
      push(" ")
      emit_inner_for(child_path)
      emit_value_inline(value, child_path)
      emit_trailing_for(child_path)
      push("\n")
    end
  end
  emit_floating(path, indent)
end
264
+
265
# Emits list `items` in block form, one `+` entry per line.
#
# items  - the list (Array) to emit.
# path   - path of the list itself; used for comment lookups. Passed as
#          nil (and never read) in lite mode.
# indent - nesting depth, in INDENT_STR units.
#
# A non-empty table or list item is written as a bare `+` followed by
# its own nested block; every other item is written inline after `+ `.
# In full mode each item's leading, inner and trailing comments are
# emitted around it, and the list's floating comments follow the items.
def emit_list_block(items, path, indent)
  if @lite
    items.each do |item|
      push_indent(indent)
      push("+")
      if (item.is_a?(Hash) || item.is_a?(Array)) && !item.empty?
        push("\n")
        if item.is_a?(Hash)
          emit_table_block(item, nil, indent + 1)
        else
          emit_list_block(item, nil, indent + 1)
        end
      else
        push(" ")
        emit_value_inline(item, nil)
        push("\n")
      end
    end
    return
  end

  items.each_with_index do |item, idx|
    child_path = (path + [idx]).freeze
    bucket = @comments_by_path[child_path]
    bucket.leading.each { |c| emit_comment_line(c, indent) } if bucket

    push_indent(indent)
    push("+")
    inner_present = has_inner?(child_path)
    if (item.is_a?(Hash) || item.is_a?(Array)) && !item.empty?
      if inner_present
        push(" ")
        emit_inner_for(child_path)
        # emit_inner_for leaves a trailing space; trim it.
        @out.pop if @out.last == " "
      end
      emit_trailing_for(child_path)
      push("\n")
      if item.is_a?(Hash)
        emit_table_block(item, child_path, indent + 1)
      else
        emit_list_block(item, child_path, indent + 1)
      end
    else
      push(" ")
      emit_inner_for(child_path)
      emit_value_inline(item, child_path)
      emit_trailing_for(child_path)
      push("\n")
    end
  end
  emit_floating(path, indent)
end
321
+
322
+ # ----- value / scalar emitters --------------------------------
323
+
324
# Emits `v` on the current line (scalar, or list/table in flow form).
#
# v    - the value to emit.
# path - path of `v`, used by the integer/string emitters to look up the
#        original literal form. In lite mode no child paths are built
#        and nil is passed down instead.
#
# Raises EncodeError for a value of any unsupported class.
def emit_value_inline(v, path)
  case v
  when true
    push("true")
  when false
    push("false")
  when Integer
    emit_integer(v, path)
  when Float
    emit_float(v)
  when LocalDate, LocalTime, LocalDateTime, OffsetDateTime
    push(v.value)
  when String
    emit_string(v, path)
  when Array
    if v.empty?
      push("[]")
    else
      push("[")
      v.each_with_index do |item, idx|
        push(", ") unless idx.zero?
        emit_value_inline(item, @lite ? nil : (path + [idx]).freeze)
      end
      push("]")
    end
  when Hash
    if v.empty?
      push("{}")
    else
      push("{")
      v.each_with_index do |(key, item), idx|
        push(", ") unless idx.zero?
        push(format_key(key))
        push(": ")
        emit_value_inline(item, @lite ? nil : (path + [key]).freeze)
      end
      push("}")
    end
  else
    raise EncodeError, "encode: cannot emit #{v.class.name}"
  end
end
385
+
386
# Emits integer `n`.
#
# In full mode, when an original literal of kind :integer was recorded
# for `path`, that literal text is written verbatim; otherwise (and
# always in lite mode) the decimal form `n.to_s` is written.
def emit_integer(n, path)
  recorded = @lite ? nil : @forms_by_path[path]
  text = recorded && recorded.kind == :integer ? recorded.integer_lit : n.to_s
  push(text)
end
398
+
399
# Emits float `f`: `nan`, `inf` / `-inf` for the non-finite values, and
# the `Emitter.format_float_ryu_shape` rendering for everything else.
def emit_float(f)
  text =
    if f.nan?
      "nan"
    elsif f.infinite?
      f > 0 ? "inf" : "-inf"
    else
      Emitter.format_float_ryu_shape(f)
    end
  push(text)
end
408
+
409
# Emits string `s`, reusing the original string flavor when one was
# recorded for `path`.
#
# s    - the string value.
# path - path of the value; looked up in @forms_by_path (full mode only).
#
# Form selection:
#   * lite mode, no recorded string form, or a :basic form -> "..." with
#     escapes.
#   * :literal form -> '...' verbatim, but only when `s` contains no
#     single quote, "\n" or "\r". A value holding any of those cannot be
#     written between single quotes without changing the data on
#     re-parse, so it falls back to the basic-quoted form. This is the
#     same rule `Emitter.format_key` applies to quoted keys.
#   * any other form -> heredoc with the recorded flavor, label and
#     modifiers.
def emit_string(s, path)
  recorded = @lite ? nil : @forms_by_path[path]
  form = (recorded.string_form if recorded && recorded.kind == :string)

  literal_safe = !s.include?("'") && !s.include?("\n") && !s.include?("\r")
  if form.nil? || form.kind == :basic || (form.kind == :literal && !literal_safe)
    push('"')
    push(Emitter.escape_basic(s))
    push('"')
    return
  end

  if form.kind == :literal
    push("'")
    push(s)
    push("'")
    return
  end

  # Heredoc: stored body is post-modifier. Mirror Python emitter:
  # for `_fold_paragraphs`, replace each `\n` with `\n\n` so on
  # re-parse the fold collapses paragraphs back to single lines.
  body = s
  if form.modifiers && form.modifiers.any? { |m| m.name == "_fold_paragraphs" }
    body = body.gsub("\n", "\n\n")
  end
  emit_heredoc(body, form.flavor, form.label, form.modifiers || [])
end
440
+
441
# Emits a heredoc string.
#
# body      - the text to place between opener and terminator; split on
#             "\n", with empty lines written without indentation.
# flavor    - :basic_triple selects the `"""` opener; anything else
#             selects `'''`.
# label     - optional label written right after the opener and reused
#             as the terminator; without a label the opener itself
#             terminates the heredoc.
# modifiers - objects responding to `name` and `args`, written after the
#             opener as ` name(arg, ...)`.
#
# Body and terminator are indented by the current line's leading spaces
# plus one INDENT_STR width. The terminator is not followed by a
# newline; the caller writes it.
def emit_heredoc(body, flavor, label, modifiers)
  body_indent = " " * (current_line_indent + INDENT_STR.length)
  term_indent = body_indent

  opener = flavor == :basic_triple ? '"""' : "'''"
  push(opener)
  push(label) if label
  modifiers.each do |m|
    push(" ")
    push(m.name)
    push("(")
    m.args.each_with_index do |arg, j|
      push(", ") if j > 0
      emit_modifier_arg(arg)
    end
    push(")")
  end
  push("\n")

  # A negative limit keeps trailing empty fields, so a body ending in
  # "\n" still produces its final blank line. An empty body splits into
  # no fields at all and therefore writes nothing.
  body.split("\n", -1).each do |line|
    if line == ""
      push("\n")
    else
      push(body_indent)
      push(line)
      push("\n")
    end
  end

  push(term_indent)
  push(label || opener)
end

# Counts the leading spaces of the line currently being written.
#
# Walks the buffer backwards and stops at the first chunk containing a
# newline, so only the chunks of the current line are inspected rather
# than the whole output joined into one string.
def current_line_indent
  pieces = []
  @out.reverse_each do |chunk|
    text = chunk.to_s
    cut = text.rindex("\n")
    if cut
      pieces << text[(cut + 1)..]
      break
    end
    pieces << text
  end
  pieces.reverse.join[/\A */].length
end
490
+
491
# Emits one heredoc-modifier argument `v` in its canonical inline form:
# booleans, decimal integers, floats, basic-quoted strings and date/time
# values.
#
# NOTE(review): arrays and hashes are always written as the empty `[]` /
# `{}` regardless of their contents — confirm that modifier arguments
# can never carry non-empty collections.
#
# Raises EncodeError for a value of any other class.
def emit_modifier_arg(v)
  case v
  when true then push("true")
  when false then push("false")
  when Integer then push(v.to_s)
  when Float then emit_float(v)
  when String
    push('"')
    push(Emitter.escape_basic(v))
    push('"')
  when LocalDate, LocalTime, LocalDateTime, OffsetDateTime then push(v.value)
  when Array then push("[]")
  when Hash then push("{}")
  else
    raise EncodeError, "encode: cannot emit modifier arg #{v.class.name}"
  end
end
515
+
516
+ # ----- comment emitters ----------------------------------------
517
+
518
# Emits comment `c` on its own line(s), followed by a newline.
#
# c      - comment object; its `content` is written verbatim.
# indent - nesting depth, in INDENT_STR units.
#
# Only the first line receives the indentation prefix; the remaining
# lines of a multi-line comment are written exactly as stored.
def emit_comment_line(c, indent)
  text = c.content
  push(INDENT_STR * indent)
  # Content without a newline is kept as a single segment so that an
  # empty comment still pushes its (empty) text.
  segments = text.include?("\n") ? text.split("\n", -1) : [text]
  segments.each_with_index do |segment, idx|
    push("\n") if idx.positive?
    push(segment)
  end
  push("\n")
end
538
+
539
# Appends the trailing comments attached to `path` to the current line,
# each preceded by a separator. Does nothing in lite mode or when no
# comments are recorded for `path`.
def emit_trailing_for(path)
  bucket = @lite ? nil : @comments_by_path[path]
  return unless bucket

  # NOTE(review): the separator before the first comment and the one
  # between later comments are the same single space as written here —
  # confirm they are meant to be identical.
  lead_separator = " "
  next_separator = " "
  bucket.trailing.each_with_index do |comment, idx|
    push(idx.zero? ? lead_separator : next_separator)
    push(comment.content)
  end
end
554
+
555
# Writes the inner comments attached to `path`, each followed by a
# single space. Callers that continue on a new line pop that final
# space again. Does nothing in lite mode or when no comments are
# recorded for `path`.
def emit_inner_for(path)
  bucket = @lite ? nil : @comments_by_path[path]
  return unless bucket

  bucket.inner.each do |comment|
    push(comment.content)
    push(" ")
  end
end
564
+
565
# Tells whether any inner comment is attached to `path`.
#
# Returns false in lite mode, nil when nothing is recorded for `path`,
# and otherwise true or false depending on whether the bucket's inner
# list is non-empty.
def has_inner?(path)
  if @lite
    false
  else
    bucket = @comments_by_path[path]
    bucket && !bucket.inner.empty?
  end
end
570
+
571
# Emits the floating comments attached to `path`, one comment line each,
# at nesting depth `indent`. Does nothing in lite mode or when no
# comments are recorded for `path`.
def emit_floating(path, indent)
  bucket = @lite ? nil : @comments_by_path[path]
  return unless bucket

  bucket.floating.each { |comment| emit_comment_line(comment, indent) }
end
577
+
578
+ # ----- flow-safety check ---------------------------------------
579
+
580
# Decides whether `v` (located at `path`) can be written in flow form.
#
# Flow form is refused when
#   * any recorded comment path lies strictly below `path`, or
#   * `v` is, or contains, a string whose recorded original form is a
#     heredoc.
# Lists and tables are safe only when every element is safe.
#
# Returns true or false.
def flow_safe?(v, path)
  depth = path.length
  commented_below = @descendant_comment_paths.any? do |comment_path|
    comment_path.length > depth && comment_path[0, depth] == path
  end
  return false if commented_below

  case v
  when String
    recorded = @forms_by_path[path]
    heredoc = recorded && recorded.kind == :string && recorded.string_form &&
              recorded.string_form.kind == :heredoc
    !heredoc
  when Array
    v.each_with_index.all? { |item, idx| flow_safe?(item, (path + [idx]).freeze) }
  when Hash
    v.all? { |key, item| flow_safe?(item, (path + [key]).freeze) }
  else
    true
  end
end
607
+
608
+ # ----- module-level helpers ------------------------------------
609
+
610
# Tells whether character `c` may appear in an unquoted (bare) key:
# `_`, `-`, an ASCII letter or digit, or a non-ASCII character in the
# Unicode letter (L) or number (N) categories.
def self.bare_key_char?(c)
  return true if %w[_ -].include?(c)

  case c.ord
  when 0x30..0x39, 0x41..0x5A, 0x61..0x7A
    true
  when 0..127
    # Every other ASCII character needs quoting.
    false
  else
    c.match?(/\p{L}|\p{N}/)
  end
end
621
+
622
# Renders finite float `v` starting from `Float#to_s`.
#
# Without an exponent the text is returned as is (with `.0` appended if
# it contains no decimal point). With an exponent, a `+` sign on the
# exponent and a trailing `.0` on the mantissa are dropped, so
# `1.0e+100` becomes `1e100`.
#
# NOTE(review): `Float#to_s` zero-pads small exponents (e.g. `1.5e-07`)
# and that padding is passed through unchanged — confirm this matches
# the shape produced by the reference emitters.
def self.format_float_ryu_shape(v)
  text = v.to_s
  unless text.include?("e")
    return text.include?(".") ? text : text + ".0"
  end

  mantissa, exponent = text.split("e", 2)
  # The length guard keeps a mantissa that is nothing but ".0" intact.
  mantissa = mantissa.delete_suffix(".0") if mantissa.length > 2
  "#{mantissa}e#{exponent.delete_prefix("+")}"
end
635
+
636
# Escapes `s` for use inside a basic (double-quoted) string.
#
# Backslash and double quote are backslash-escaped; newline, carriage
# return, tab, backspace and form feed use their short escapes; any
# other character below U+0020 becomes a `\uXXXX` escape. Everything
# else is copied through unchanged.
#
# Returns a new String.
def self.escape_basic(s)
  s.each_char.with_object(String.new) do |ch, buffer|
    buffer <<
      case ch
      when "\\" then "\\\\"
      when "\"" then "\\\""
      when "\n" then "\\n"
      when "\r" then "\\r"
      when "\t" then "\\t"
      when "\b" then "\\b"
      when "\f" then "\\f"
      else
        ch.ord < 0x20 ? format("\\u%04X", ch.ord) : ch
      end
  end
end
658
+
659
# Formats table key `k` for emission.
#
# A non-empty key made only of bare-key characters is returned
# unchanged. Otherwise the key is wrapped in single quotes when it
# contains no single quote, "\n" or "\r", and in double quotes with
# basic escapes when it does.
def self.format_key(k)
  bare = !k.empty? && k.each_char.all? { |ch| bare_key_char?(ch) }
  return k if bare

  needs_escapes = ["'", "\n", "\r"].any? { |bad| k.include?(bad) }
  needs_escapes ? "\"#{escape_basic(k)}\"" : "'#{k}'"
end
666
+ end
667
+
668
class Emitter
  # Instance-level forwarder to the class method `Emitter.format_key`,
  # so the emit_* instance methods can call `format_key(k)` without
  # naming the class.
  #
  # k - the table key to format.
  #
  # Returns whatever `Emitter.format_key(k)` returns.
  def format_key(k)
    Emitter.format_key(k)
  end
end
674
+ end