red_quilt 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +109 -0
  4. data/.rubocop_todo.yml +7 -0
  5. data/CHANGELOG.md +57 -0
  6. data/README.md +284 -0
  7. data/Rakefile +8 -0
  8. data/ast-spec.md +1227 -0
  9. data/docs/architecture.md +81 -0
  10. data/docs/arena-usage.md +363 -0
  11. data/docs/commonmark-conformance.md +241 -0
  12. data/exe/redquilt +7 -0
  13. data/lib/red_quilt/arena.rb +366 -0
  14. data/lib/red_quilt/block_parser.rb +724 -0
  15. data/lib/red_quilt/blockquote.rb +151 -0
  16. data/lib/red_quilt/cli.rb +182 -0
  17. data/lib/red_quilt/diagnostic.rb +47 -0
  18. data/lib/red_quilt/document.rb +126 -0
  19. data/lib/red_quilt/extended_autolink_pass.rb +185 -0
  20. data/lib/red_quilt/footnote_definition.rb +147 -0
  21. data/lib/red_quilt/footnote_pass.rb +39 -0
  22. data/lib/red_quilt/footnote_registry.rb +68 -0
  23. data/lib/red_quilt/indentation.rb +73 -0
  24. data/lib/red_quilt/inline/builder.rb +674 -0
  25. data/lib/red_quilt/inline/flanking.rb +120 -0
  26. data/lib/red_quilt/inline/html_entities.rb +2180 -0
  27. data/lib/red_quilt/inline/lexer.rb +280 -0
  28. data/lib/red_quilt/inline/link_scanner.rb +315 -0
  29. data/lib/red_quilt/inline/token_kind.rb +39 -0
  30. data/lib/red_quilt/inline/tokens.rb +73 -0
  31. data/lib/red_quilt/inline.rb +34 -0
  32. data/lib/red_quilt/inline_pass.rb +53 -0
  33. data/lib/red_quilt/line.rb +14 -0
  34. data/lib/red_quilt/lint_pass.rb +71 -0
  35. data/lib/red_quilt/list.rb +317 -0
  36. data/lib/red_quilt/node_ref.rb +114 -0
  37. data/lib/red_quilt/node_type.rb +66 -0
  38. data/lib/red_quilt/plain_text.rb +46 -0
  39. data/lib/red_quilt/reference_definition.rb +309 -0
  40. data/lib/red_quilt/renderer/html.rb +279 -0
  41. data/lib/red_quilt/renderer/mdast.rb +152 -0
  42. data/lib/red_quilt/source_map.rb +29 -0
  43. data/lib/red_quilt/source_span.rb +26 -0
  44. data/lib/red_quilt/theme.rb +28 -0
  45. data/lib/red_quilt/themes/default.css +87 -0
  46. data/lib/red_quilt/version.rb +5 -0
  47. data/lib/red_quilt.rb +86 -0
  48. data/mise.toml +2 -0
  49. data/sig/red_quilt.rbs +45 -0
  50. metadata +91 -0
@@ -0,0 +1,724 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RedQuilt
4
+ class BlockParser
5
# Sets up a parser over the source text held by `arena`.
#
# @param arena the arena whose `source` is split into lines and which
#   receives every node this parser creates
# @param footnotes optional footnote collaborator; when nil, footnote
#   definitions are not dispatched by #parse_lines
def initialize(arena, footnotes: nil)
  @arena = arena
  @footnotes = footnotes
  @references = {}
  @diagnostics = []
  @lines = build_lines(arena.source)

  # The collaborator parsers are built once here and reused for every
  # block of their type (nested ones included), so dispatching to them
  # never allocates a new parser.
  @list_parser = List::Parser.new(self)
  @blockquote_parser = Blockquote::Parser.new(self)
  @footnote_parser = FootnoteDefinition::Parser.new(self)
end
18
+
19
+ attr_reader :references, :arena, :diagnostics
20
+
21
# Parses the whole source into the arena.
#
# Creates the DOCUMENT root spanning every source byte, parses all
# lines beneath it and, when a footnote collaborator was supplied,
# asks the footnote parser to move the footnote section to the end.
#
# @return the id of the DOCUMENT root node
def parse
  total_bytes = @arena.source.bytesize
  @root_id = @arena.add_node(NodeType::DOCUMENT, source_start: 0, source_len: total_bytes)
  parse_lines(@root_id, @lines, transformed: false)
  @footnote_parser.move_section_to_end(@root_id) if @footnotes
  @root_id
end
27
+
28
# Parses `lines` as a run of block-level constructs and appends the
# resulting nodes under `parent_id`.
#
# Returns true when a blank line was seen BETWEEN two block-level
# constructs at this scope, false otherwise. The list parser relies on
# that to decide whether an item is loose: the spec calls an item loose
# when it "directly contains two block-level elements with a blank line
# between them". Constructs that add no arena child here (for example
# reference definitions) still count as block-level elements.
#
# Blank lines seen before the first non-blank line are ignored, so the
# empty marker line of a list item (e.g. `-` alone) is never mistaken
# for a blank "between" two blocks.
#
# `transformed` is passed through to heading and paragraph construction.
def parse_lines(parent_id, lines, transformed:)
  pending_blank = false
  block_seen = false
  blank_between_blocks = false
  cursor = 0

  while cursor < lines.length
    current = lines[cursor]

    if current.blank
      pending_blank ||= block_seen
      cursor += 1
      next
    end

    blank_between_blocks ||= pending_blank
    pending_blank = false
    block_seen = true

    text = current.content
    # Every branch yields the index of the next unconsumed line. The
    # first branch is a fast path: such a line cannot start any other
    # block, so the full predicate chain below is skipped.
    cursor =
      if paragraph_only_line?(text)
        parse_paragraph(parent_id, lines, cursor, transformed)
      elsif (fence = fenced_code_start(text))
        parse_fenced_code(parent_id, lines, cursor, fence)
      elsif (heading = atx_heading(text))
        append_heading(parent_id, current, heading, transformed)
        cursor + 1
      elsif thematic_break?(text)
        break_id = @arena.add_node(NodeType::THEMATIC_BREAK,
                                   source_start: current.start_byte,
                                   source_len: span_len(current))
        @arena.append_child(parent_id, break_id)
        cursor + 1
      elsif @footnotes && (footnote = FootnoteDefinition.match(text))
        @footnote_parser.parse(lines, cursor, footnote, @footnotes, @root_id)
      elsif (reference = ReferenceDefinition.consume(lines, cursor))
        store_reference(reference[:reference], reference[:source_span])
        cursor + reference[:consumed]
      elsif table_start?(lines, cursor)
        parse_table(parent_id, lines, cursor)
      elsif html_block_start?(text)
        parse_html_block(parent_id, lines, cursor)
      elsif Blockquote.match?(text)
        @blockquote_parser.parse(parent_id, lines, cursor)
      elsif List.match(text)
        @list_parser.parse(parent_id, lines, cursor)
      elsif indented_code_line?(text)
        parse_indented_code(parent_id, lines, cursor)
      else
        parse_paragraph(parent_id, lines, cursor, transformed)
      end
  end

  blank_between_blocks
end
92
+
93
+ # Methods the collaborator parsers (List::Parser, Blockquote::Parser,
94
+ # FootnoteDefinition::Parser) call back into.
95
+
96
# Called back by the collaborator parsers: reports whether the line at
# `index` ends lazy continuation because it would itself open a new
# block. The checks, in order: ATX heading, thematic break, fenced code
# opener, HTML block of any type except 7, blockquote, a list item that
# is allowed to interrupt a paragraph, and a table start.
#
# This is the same chain as paragraph_interrupt? without its
# "index > 0" guard.
#
# @return [Boolean]
def lazy_break?(lines, index)
  text = lines[index].content
  return true if atx_heading(text) || thematic_break?(text) || fenced_code_start(text)

  # HTML type 7 does not break lazy continuation.
  html_type = html_block_type(text)
  return true if html_type && html_type != 7
  return true if Blockquote.match?(text)

  item = List.match(text)
  return true if item && List.interrupts_paragraph?(item)

  table_start?(lines, index)
end
117
+
118
+ # Thematic break per CommonMark: 0-3 spaces of indent, then 3+ of
119
+ # the same character (`*`, `-`, or `_`) optionally separated by
120
+ # whitespace, and nothing else on the line. Lines indented 4+ spaces
121
+ # are indented code, not thematic breaks.
122
+ THEMATIC_BREAK_RE = /\A {0,3}(?:(?:\*[ \t]*){3,}|(?:-[ \t]*){3,}|(?:_[ \t]*){3,})\z/
123
+
124
+ private_constant :THEMATIC_BREAK_RE
125
+
126
# Reports whether `text` matches THEMATIC_BREAK_RE in full.
#
# @return [Boolean]
def thematic_break?(text)
  matched = THEMATIC_BREAK_RE.match?(text)
  matched
end
129
+
130
# Reports whether `content` may be treated as paragraph text: true only
# when none of the block-start predicates below recognises it. The
# predicates are consulted in the listed order and evaluation stops at
# the first one that matches.
#
# @return [Boolean]
def paragraph_eligible_line?(content)
  opens_other_block =
    indented_code_line?(content) ||
    fenced_code_start(content) ||
    atx_heading(content) ||
    thematic_break?(content) ||
    html_block_start?(content) ||
    List.match(content) ||
    Blockquote.match?(content)

  !opens_other_block
end
141
+
142
+ private
143
+
144
# 256-entry lookup table indexed by byte value: true for every byte that
# can begin a non-paragraph block (after 0-3 leading spaces), false for
# all others. A line whose first non-space byte maps to false can go
# straight to parse_paragraph.
#
# True entries: `#` (ATX), backtick / `~` (fences), `*` `-` `+` `_`
# (thematic breaks and list markers), `[` (reference definitions),
# `>` (blockquote), `<` (HTML block), tab (indented code when a tab
# supplies the indent) and the digits `0`-`9` (ordered lists).
BLOCK_START_BYTES = begin
  starters = "#`~*-+_[><\t0123456789".bytes
  Array.new(256) { |byte| starters.include?(byte) }.freeze
end
158
+
159
+ private_constant :BLOCK_START_BYTES
160
+
161
# Fast-path test used by parse_lines: true when `content` cannot start
# any non-paragraph block, so the slower predicate chain can be skipped.
# The check is deliberately conservative; anything ambiguous yields
# false and goes through the full dispatch.
#
# @return [Boolean]
def paragraph_only_line?(content)
  size = content.bytesize
  pos = 0
  # Skip at most 3 leading spaces; they are still block prefix.
  pos += 1 while pos < 3 && pos < size && content.getbyte(pos) == 0x20
  # Empty line, or nothing but 1-3 spaces.
  return false if pos >= size

  lead = content.getbyte(pos)
  # A fourth leading space makes this an indented-code candidate.
  return false if pos == 3 && lead == 0x20
  # The first non-space byte gates every block start we recognise.
  return false if BLOCK_START_BYTES[lead]

  # Table rows always contain a pipe, so any pipe disqualifies the line.
  !content.include?("|")
end
185
+
186
# Parses the fenced code block whose opening fence is at lines[index]
# and appends a CODE_BLOCK node under `parent_id`.
#
# `fence` is the hash produced by fenced_code_start (:char, :count,
# :info, :indent). Content runs until the first line accepted by
# fenced_code_close? or until the end of `lines` when the fence is
# never closed.
#
# @return [Integer] index of the first line after the block
def parse_fenced_code(parent_id, lines, index, fence)
  opener = lines[index]
  cursor = index + 1
  body = []
  until cursor >= lines.length || fenced_code_close?(lines[cursor].content, fence[:char], fence[:count])
    body << lines[cursor]
    cursor += 1
  end
  # Step over the closing fence when there is one.
  cursor += 1 if cursor < lines.length

  # A fence indented by N spaces strips up to N spaces (never more)
  # from every content line.
  strip_width = fence[:indent] || 0
  code = body.map { |entry| strip_leading_spaces(entry.content, strip_width) }.join("\n")
  code << "\n" unless body.empty?

  # With no content lines the node spans the opening fence line itself.
  first_line, last_line = body.empty? ? [opener, opener] : [body.first, body.last]
  span_start = first_line.start_byte
  node_id = @arena.add_node(NodeType::CODE_BLOCK,
                            source_start: span_start,
                            source_len: last_line.end_byte - span_start,
                            str1: code,
                            str2: fence[:info])
  @arena.append_child(parent_id, node_id)
  cursor
end
216
+
217
# Parses an indented code block starting at lines[index] and appends a
# CODE_BLOCK node under `parent_id`.
#
# Blank lines and lines accepted by indented_code_line? are collected;
# each has up to 4 columns of leading whitespace removed through
# Indentation.strip_columns. Trailing whitespace-only lines are then
# given back to the caller.
#
# @return [Integer] index of the first line not consumed by the block
def parse_indented_code(parent_id, lines, index)
  first = index
  cursor = index
  collected = []

  while cursor < lines.length
    candidate = lines[cursor]
    break unless candidate.blank || indented_code_line?(candidate.content)

    collected << Indentation.strip_columns(candidate.content, 4)
    cursor += 1
  end

  # Trailing blank lines are not part of the code block.
  until collected.empty? || !collected.last.strip.empty?
    collected.pop
    cursor -= 1
  end

  span_start = lines[first].start_byte
  span_end = lines[cursor - 1].end_byte
  code = collected.empty? ? "" : collected.join("\n") + "\n"

  node_id = @arena.add_node(NodeType::CODE_BLOCK,
                            source_start: span_start,
                            source_len: span_end - span_start,
                            str1: code)
  @arena.append_child(parent_id, node_id)
  cursor
end
248
+
249
+ HTML_BLOCK_FIXED_TERMINATORS = {
250
+ 2 => "-->",
251
+ 3 => "?>",
252
+ 4 => ">",
253
+ 5 => "]]>",
254
+ }.freeze
255
+
256
+ private_constant :HTML_BLOCK_FIXED_TERMINATORS
257
+
258
# Parses the HTML block that opens at lines[index] and appends an
# HTML_BLOCK node under `parent_id`. The node's literal is the raw
# content of every line in the block joined with "\n".
#
# @return [Integer] index of the first line after the block
def parse_html_block(parent_id, lines, index)
  kind = html_block_type(lines[index].content)
  last = locate_html_block_end(lines, index, kind)
  block_lines = lines[index..last]

  span_start = lines[index].start_byte
  span_end = lines[last].end_byte
  node_id = @arena.add_node(NodeType::HTML_BLOCK,
                            source_start: span_start,
                            source_len: span_end - span_start,
                            str1: block_lines.map(&:content).join("\n"))
  @arena.append_child(parent_id, node_id)
  last + 1
end
273
+
274
# Finds the index of the last line belonging to the HTML block that
# starts at `index`.
#
# When html_block_terminator yields a string for `type`, the block ends
# on the first line (the start line included) containing it; type 1 is
# compared case-insensitively. If no line contains it the block runs to
# the last line. When there is no terminator string the block ends just
# before the next blank line or at the end of input.
#
# @return [Integer]
def locate_html_block_end(lines, index, type)
  terminator = html_block_terminator(type, lines[index].content)

  unless terminator
    # Types 6 & 7: terminated by a blank line (or end of input).
    cursor = index
    cursor += 1 while cursor < lines.length && !lines[cursor].blank
    return cursor - 1
  end

  fold_case = (type == 1)
  hit = (index...lines.length).find do |i|
    text = lines[i].content
    (fold_case ? text.downcase : text).include?(terminator)
  end
  hit || (lines.length - 1)
end
293
+
294
# Returns the literal string that closes an HTML block of `type`, or nil
# when the type has no such string.
#
# Type 1 closes with the end tag of the element named on `first_line`;
# types 2-5 use the fixed entries of HTML_BLOCK_FIXED_TERMINATORS.
#
# @return [String, nil]
def html_block_terminator(type, first_line)
  return "</#{extract_closing_tag_name(first_line)}>" if type == 1

  HTML_BLOCK_FIXED_TERMINATORS[type] if (2..5).cover?(type)
end
302
+
303
# Returns the lower-cased element name (script, pre, style or textarea)
# that opens a type-1 HTML block on `text`, falling back to "script"
# when none is found.
#
# `text` is the raw first line of the block, so it may still carry the
# 0-3 spaces of indent that html_block_type tolerates. The pattern has
# to allow for them; otherwise an indented `<pre>` would fall back to
# "script" and the block would wait for a `</script>` that never comes.
#
# @param text [String] first line of the HTML block
# @return [String]
def extract_closing_tag_name(text)
  match = /\A {0,3}<(script|pre|style|textarea)/i.match(text)
  match ? match[1].downcase : "script"
end
307
+
308
# Parses the table whose header row is lines[index] and appends a TABLE
# node (with one row node per header/body row) under `parent_id`.
#
# The caller must already have checked table_start?(lines, index), which
# validates the delimiter row and the header/delimiter column counts.
# The delimiter row at index + 1 is skipped; body rows continue until a
# blank line or a line that table_row? rejects.
#
# @return [Integer] index of the first line after the table
def parse_table(parent_id, lines, index)
  header_line = lines[index]
  header_cells = split_table_row(header_line.content)

  cursor = index + 2
  body_rows = []
  while cursor < lines.length
    candidate = lines[cursor]
    break if candidate.blank || !table_row?(candidate.content)

    body_rows << candidate
    cursor += 1
  end

  # The span ends at the last body row, or at the header row when the
  # table has no body.
  last_row = body_rows.last || header_line
  table_id = @arena.add_node(NodeType::TABLE,
                             source_start: header_line.start_byte,
                             source_len: last_row.end_byte - header_line.start_byte)
  @arena.append_child(parent_id, table_id)

  append_table_row(table_id, header_line, header_cells, true)
  body_rows.each do |row|
    append_table_row(table_id, row, split_table_row(row.content), false)
  end

  cursor
end
335
+
336
# Appends one TABLE_ROW node under `table_id` and one TABLE_CELL node
# per entry of `cells` under that row.
#
# Row and cells all carry the byte span of the whole `line`; each cell
# stores its text with surrounding whitespace stripped. `int1` is 1 for
# header rows/cells and 0 otherwise.
def append_table_row(table_id, line, cells, header)
  header_flag = header ? 1 : 0
  row_start = line.start_byte
  row_len = span_len(line)

  row_id = @arena.add_node(NodeType::TABLE_ROW,
                           source_start: row_start,
                           source_len: row_len,
                           int1: header_flag)
  @arena.append_child(table_id, row_id)

  cells.each do |cell_text|
    cell_id = @arena.add_node(NodeType::TABLE_CELL,
                              source_start: row_start,
                              source_len: row_len,
                              int1: header_flag,
                              str1: cell_text.strip)
    @arena.append_child(row_id, cell_id)
  end
end
352
+
353
# Appends a HEADING node for an ATX heading under `parent_id`.
#
# `heading` is the hash produced by atx_heading (:level, :content,
# :content_start). The node spans the heading content only, after
# trailing whitespace is removed; the literal text is stored in str1
# only when `transformed` is truthy.
def append_heading(parent_id, line, heading, transformed)
  title = heading[:content].to_s.rstrip
  heading_id = @arena.add_node(NodeType::HEADING,
                               source_start: line.start_byte + heading[:content_start],
                               source_len: title.bytesize,
                               int1: heading[:level],
                               str1: (title if transformed))
  @arena.append_child(parent_id, heading_id)
end
363
+
364
# Parses a paragraph (or setext heading) starting at lines[index] and
# appends the resulting node under `parent_id`.
#
# Lines are collected until a blank line, a setext underline, or a line
# for which paragraph_interrupt? holds. Lines flagged as lazy
# continuation always extend the paragraph: the outer collector has
# already classified them as paragraph content, so they are never
# tested as underlines or interrupts.
#
# @return [Integer] index of the first line not consumed
def parse_paragraph(parent_id, lines, index, transformed)
  first = index
  cursor = index
  collected = []
  heading_level = nil

  while cursor < lines.length
    current = lines[cursor]
    break if current.blank

    unless current.lazy_continuation
      # A setext underline needs at least one paragraph line above it.
      # It is tested before paragraph_interrupt? so that "---" / "==="
      # turns the open paragraph into a heading rather than being read
      # as a thematic break.
      if collected.any? && (level = setext_underline_level(current.content))
        heading_level = level
        cursor += 1
        break
      end

      break if cursor > first && paragraph_interrupt?(lines, cursor)
    end

    # A `[label]: ...` line inside an open paragraph is plain
    # continuation text; definitions that follow a blank line are caught
    # by the dispatch in parse_lines, so no extra scan happens here.
    collected << current
    cursor += 1
  end

  # The first line may carry 0-3 spaces of indent (4+ would have been
  # indented code); continuation lines lose all leading whitespace.
  pieces = collected.each_with_index.map do |entry, position|
    position.zero? ? strip_leading_spaces(entry.content, 3) : strip_leading_whitespace(entry.content)
  end
  # Trailing whitespace on the last line is dropped (no hard break
  # without a following content line).
  pieces[-1] = pieces[-1].sub(/[ \t]+\z/, "") if pieces.any?
  whitespace_removed = pieces.each_with_index.any? do |piece, position|
    piece.length != collected[position].content.length
  end
  text = pieces.join("\n")
  span_start = collected.first.start_byte
  byte_len = collected.last.end_byte - span_start

  node_id =
    if heading_level
      @arena.add_node(NodeType::HEADING,
                      source_start: span_start,
                      source_len: byte_len,
                      int1: heading_level,
                      str1: text.strip)
    else
      # A paragraph keeps a literal only when its inline content cannot
      # be recovered from a contiguous source slice: the lines were
      # already transformed by an enclosing block, or whitespace was
      # stripped above. Otherwise str1 stays nil so later passes read
      # the real source bytes.
      keep_literal = transformed || whitespace_removed
      @arena.add_node(NodeType::PARAGRAPH,
                      source_start: span_start,
                      source_len: byte_len,
                      str1: keep_literal ? text : nil)
    end
  @arena.append_child(parent_id, node_id)
  cursor
end
441
+
442
# Classifies a setext heading underline.
#
# The line may have up to 3 leading spaces, then a run made only of `=`
# or only of `-`, then any amount of trailing spaces/tabs.
#
# @return [Integer, nil] 1 for an `=` underline (h1), 2 for a `-`
#   underline (h2), nil when `text` is not an underline
def setext_underline_level(text)
  underline = /\A {0,3}(=+|-+)[ \t]*\z/.match(text)&.captures&.first
  return unless underline

  underline[0] == "=" ? 1 : 2
end
451
+
452
# Reports whether the line at `index` interrupts an open paragraph.
#
# The first line of the input (index 0) never interrupts anything. For
# every other line the rule is the one lazy_break? implements: ATX
# heading, thematic break, fenced code opener, HTML block types 1-6
# (type 7, a bare tag on its own line, does not interrupt), blockquote,
# a list item allowed to interrupt a paragraph, or a table start.
# Delegating keeps the two predicates from drifting apart.
#
# @param lines [Array] the lines being parsed
# @param index [Integer] position of the candidate line in `lines`
# @return [Boolean]
def paragraph_interrupt?(lines, index)
  return false unless index.positive?

  lazy_break?(lines, index)
end
471
+
472
# Removes at most `max` leading space (0x20) bytes from `text`.
#
# When nothing is removed the very same string object is handed back,
# so the common no-indent case costs no allocation.
#
# @return [String]
def strip_leading_spaces(text, max)
  return text if max <= 0

  limit = [max, text.bytesize].min
  skipped = 0
  skipped += 1 while skipped < limit && text.getbyte(skipped) == 0x20

  skipped.zero? ? text : text.byteslice(skipped..)
end
487
+
488
# Removes every leading space (0x20) and tab (0x09) byte from `text`.
#
# Like strip_leading_spaces, it returns the original string object
# untouched when `text` already starts at a non-whitespace byte.
#
# @return [String]
def strip_leading_whitespace(text)
  skipped = 0
  # getbyte returns nil past the end, which stops the scan.
  while (byte = text.getbyte(skipped)) && (byte == 0x20 || byte == 0x09)
    skipped += 1
  end

  skipped.zero? ? text : text.byteslice(skipped..)
end
504
+
505
# Splits `source` on "\n" into Line records carrying the raw text, its
# start and end byte offsets and a blank flag.
#
# A trailing newline does not produce an extra empty line. A line is
# blank when it contains nothing but spaces and tabs; the test is
# /[^ \t]/ rather than /\S/ because CommonMark treats other whitespace
# (e.g. form feed) as content that continues a paragraph.
#
# @return [Array<Line>]
def build_lines(source)
  raw_lines = source.split("\n", -1)
  raw_lines.pop if source.end_with?("\n")

  next_start = 0
  raw_lines.map do |raw|
    start = next_start
    finish = start + raw.bytesize
    # Skip the newline byte that separated this line from the next.
    next_start = finish + 1
    Line.new(raw, start, finish, !raw.match?(/[^ \t]/))
  end
end
523
+
524
+ # ATX headings per CommonMark spec:
525
+ # - 0-3 spaces of indent, then 1-6 `#`s
526
+ # - either end-of-line (empty heading) or at least one space/tab
527
+ # followed by the content
528
+ # - optional trailing `#`s are only stripped when separated from the
529
+ # content by whitespace (so `# foo#` keeps the `#`)
530
+ ATX_HEADING_RE = /\A {0,3}(\#{1,6})(?:[ \t]+\#+[ \t]*|[ \t]+(.*?)(?:[ \t]+\#+)?[ \t]*|[ \t]*)\z/
531
+
532
+ private_constant :ATX_HEADING_RE
533
+
534
# Matches `text` against ATX_HEADING_RE.
#
# @param text [String] one source line
# @return [Hash, nil] nil when the line is not an ATX heading;
#   otherwise `{ level:, content:, content_start: }`, where `level` is
#   the number of opening `#`s, `content` is the captured heading text
#   ("" for an empty heading) and `content_start` is the offset of that
#   text within `text` (the end of the line for an empty heading)
def atx_heading(text)
  match = ATX_HEADING_RE.match(text)
  return unless match

  content = match[2].to_s
  # Take the offset from the capture itself. Searching the line for the
  # content string would find its first occurrence, which for content
  # made of `#`s (e.g. "# # #") lies inside the opening marker. The
  # pattern only admits spaces, tabs and `#` before the content, so this
  # character offset is also a byte offset.
  content_start = content.empty? ? text.bytesize : match.begin(2)
  { level: match[1].length, content: content, content_start: content_start }
end
542
+
543
# Recognises the opening line of a fenced code block: 0-3 spaces of
# indent, a run of 3+ backticks or 3+ tildes, then an optional info
# string.
#
# @return [Hash, nil] nil when `text` is not an opener; otherwise
#   `{ char:, count:, info:, indent: }` with the fence character, the
#   length of the fence run, the info string passed through
#   ReferenceDefinition.unescape_text, and the number of indent spaces
def fenced_code_start(text)
  opener = /\A( {0,3})(`{3,}|~{3,})[ \t]*(.*?)\s*\z/.match(text)
  return unless opener

  indent, run, info = opener.captures
  # A backtick fence cannot have backticks in its info string; they
  # would be ambiguous with the fence itself.
  return if run.start_with?("`") && info.include?("`")

  {
    char: run[0],
    count: run.length,
    info: ReferenceDefinition.unescape_text(info),
    indent: indent.length,
  }
end
559
+
560
# Reports whether `text` closes a fence opened with `count` copies of
# `char`: at most 3 spaces of indent, at least `count` fence
# characters, then nothing but spaces/tabs up to the end of the line.
#
# Implemented as a byte scan so that no regex has to be built per
# (char, count) pair for every line of a fenced block.
#
# @return [Boolean]
def fenced_code_close?(text, char, count)
  size = text.bytesize
  pos = 0
  pos += 1 while pos < 3 && pos < size && text.getbyte(pos) == 0x20

  marker = char.getbyte(0)
  run_start = pos
  pos += 1 while pos < size && text.getbyte(pos) == marker
  return false if pos - run_start < count

  # Only spaces and tabs may follow the fence run.
  (pos...size).all? do |i|
    byte = text.getbyte(i)
    byte == 0x20 || byte == 0x09
  end
end
585
+
586
# Reports whether `text` is indented far enough to be indented code:
# Indentation.leading_columns must report at least 4 columns of leading
# whitespace. Per the original note, tabs count as advancing to a tab
# stop of 4 columns.
#
# @return [Boolean]
def indented_code_line?(text)
  columns = Indentation.leading_columns(text)
  columns >= 4
end
591
+
592
# Reports whether `text` opens an HTML block of any type (1-7).
#
# HTML blocks may be indented by 0-3 spaces. A line with four or more
# leading spaces is indented code, which takes precedence, so it is
# rejected up front without consulting html_block_type.
#
# @param text [String] one source line
# @return [Boolean]
def html_block_start?(text)
  return false if text.start_with?("    ")

  !html_block_type(text).nil?
end
600
+
601
# Classifies `text` as the first line of an HTML block.
#
# Up to 3 leading spaces are skipped; the next byte must be `<`, which
# rejects most lines before any allocation happens. The types are then
# tried in order and the first match wins.
#
# @return [Integer, nil] the HTML block type 1-7, or nil when `text`
#   does not open an HTML block
def html_block_type(text)
  lead = 0
  lead += 1 while lead < 3 && lead < text.length && text.getbyte(lead) == 0x20
  return nil unless lead < text.length && text.getbyte(lead) == 0x3C

  tag_text = lead.zero? ? text : text[lead..]

  if tag_text.match?(%r{\A<(script|pre|style|textarea)(?:[ \t]|>|$)}i)
    # Type 1: the name must be followed by space, tab, `>` or line end.
    1
  elsif tag_text.start_with?("<!--")
    # Type 2: comment.
    2
  elsif tag_text.start_with?("<?")
    # Type 3: processing instruction.
    3
  elsif tag_text.match?(%r{\A<![A-Z]})
    # Type 4: `<!` followed by an uppercase ASCII letter.
    4
  elsif tag_text.start_with?("<![CDATA[")
    # Type 5: CDATA section.
    5
  elsif tag_text.match?(HTML_BLOCK_TYPE_6_RE)
    # Type 6: one of the listed block-level tag names.
    6
  elsif valid_html_tag?(tag_text)
    # Type 7: a complete open or closing tag spanning the line.
    7
  end
end
639
+
640
+ HTML_BLOCK_TYPE_6_NAMES = %w[
641
+ address article aside base basefont blockquote body caption center
642
+ col colgroup dd details dialog dir div dl dt fieldset figcaption
643
+ figure footer form frame frameset h1 h2 h3 h4 h5 h6 head header
644
+ hr html iframe legend li link main menu menuitem nav noframes ol
645
+ optgroup option p param search section summary table tbody td
646
+ tfoot th thead title tr track ul
647
+ ].freeze
648
+ HTML_BLOCK_TYPE_6_RE =
649
+ %r{\A</?(?:#{HTML_BLOCK_TYPE_6_NAMES.join('|')})(?:[ \t]|>|/>|\z)}i
650
+
651
+ private_constant :HTML_BLOCK_TYPE_6_NAMES, :HTML_BLOCK_TYPE_6_RE
652
+
653
# Reports whether a table starts at lines[index]: that line must be a
# table row and the following line a delimiter row.
#
# Per the GFM spec the delimiter row must consist of valid delimiter
# cells AND have as many cells as the header row; otherwise no table is
# recognised.
#
# @return [Boolean]
def table_start?(lines, index)
  return false unless index + 1 < lines.length

  header_text = lines[index].content
  return false unless table_row?(header_text)

  header_cells = split_table_row(header_text)
  delimiter_cells = split_table_row(lines[index + 1].content)
  return false if delimiter_cells.empty?
  return false if header_cells.length != delimiter_cells.length

  delimiter_cells.all? { |cell| cell.strip.match?(/\A:?-+:?\z/) }
end
668
+
669
# Reports whether `text` could be a table row, i.e. whether it contains
# at least one `|`.
#
# @return [Boolean]
def table_row?(text)
  !text.index("|").nil?
end
672
+
673
# Splits a table row into its raw cell texts.
#
# Surrounding whitespace is trimmed, then one leading and one trailing
# `|` are dropped, and the remainder is split on every `|`. Empty cells
# are kept; the cell texts themselves are not trimmed.
#
# @return [Array<String>]
def split_table_row(text)
  inner = text.strip.delete_prefix("|").delete_suffix("|")
  inner.split("|", -1)
end
679
+
680
+ # Type 7: a complete open or closing tag on its own line.
681
+ # Closing tags must not have attributes.
682
+ #
683
+ # HTML tag separators per CommonMark 6.6 are space, tab, or up to one
684
+ # line ending -- not the broader \s class (which would include form
685
+ # feed and vertical tab).
686
+ HTML_TYPE_7_OPEN_TAG_RE = %r{
687
+ \A
688
+ <[A-Za-z][A-Za-z0-9-]*
689
+ (?:[ \t\r\n]+[A-Za-z_:][A-Za-z0-9_.:-]*(?:[ \t\r\n]*=[ \t\r\n]*(?:"[^"\n]*"|'[^'\n]*'|[^ \t\r\n"'=<>`]+))?)*
690
+ [ \t\r\n]*/?>
691
+ \z
692
+ }x
693
+ HTML_TYPE_7_CLOSING_TAG_RE = %r{\A</[A-Za-z][A-Za-z0-9-]*[ \t\r\n]*>\z}
694
+
695
+ private_constant :HTML_TYPE_7_OPEN_TAG_RE, :HTML_TYPE_7_CLOSING_TAG_RE
696
+
697
# Reports whether `text` is, in its entirety, a single HTML open tag or
# closing tag as described by HTML_TYPE_7_OPEN_TAG_RE and
# HTML_TYPE_7_CLOSING_TAG_RE.
#
# @return [Boolean]
def valid_html_tag?(text)
  # The start_with? test is a cheap reject before either regex runs.
  text.start_with?("<") &&
    (HTML_TYPE_7_OPEN_TAG_RE.match?(text) || HTML_TYPE_7_CLOSING_TAG_RE.match?(text))
end
703
+
704
# Records a link reference definition under its label.
#
# The first definition of a label wins. A later definition with the same
# label leaves the stored entry untouched and adds a
# :duplicate_reference warning (carrying `source_span`) to the
# diagnostics instead.
def store_reference(reference, source_span)
  label = reference[:label]

  unless @references.key?(label)
    return @references[label] = {
      destination: reference[:destination],
      title: reference[:title],
    }
  end

  @diagnostics << Diagnostic.new(
    severity: :warning,
    rule: :duplicate_reference,
    message: "Duplicate reference definition #{label.inspect} — keeping the first",
    source_span: source_span,
  )
  nil
end
719
+
720
# Length of `line`: its end_byte minus its start_byte.
#
# @return [Integer]
def span_len(line)
  finish = line.end_byte
  finish - line.start_byte
end
723
+ end
724
+ end