red_quilt 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +109 -0
- data/.rubocop_todo.yml +7 -0
- data/CHANGELOG.md +57 -0
- data/README.md +284 -0
- data/Rakefile +8 -0
- data/ast-spec.md +1227 -0
- data/docs/architecture.md +81 -0
- data/docs/arena-usage.md +363 -0
- data/docs/commonmark-conformance.md +241 -0
- data/exe/redquilt +7 -0
- data/lib/red_quilt/arena.rb +366 -0
- data/lib/red_quilt/block_parser.rb +724 -0
- data/lib/red_quilt/blockquote.rb +151 -0
- data/lib/red_quilt/cli.rb +182 -0
- data/lib/red_quilt/diagnostic.rb +47 -0
- data/lib/red_quilt/document.rb +126 -0
- data/lib/red_quilt/extended_autolink_pass.rb +185 -0
- data/lib/red_quilt/footnote_definition.rb +147 -0
- data/lib/red_quilt/footnote_pass.rb +39 -0
- data/lib/red_quilt/footnote_registry.rb +68 -0
- data/lib/red_quilt/indentation.rb +73 -0
- data/lib/red_quilt/inline/builder.rb +674 -0
- data/lib/red_quilt/inline/flanking.rb +120 -0
- data/lib/red_quilt/inline/html_entities.rb +2180 -0
- data/lib/red_quilt/inline/lexer.rb +280 -0
- data/lib/red_quilt/inline/link_scanner.rb +315 -0
- data/lib/red_quilt/inline/token_kind.rb +39 -0
- data/lib/red_quilt/inline/tokens.rb +73 -0
- data/lib/red_quilt/inline.rb +34 -0
- data/lib/red_quilt/inline_pass.rb +53 -0
- data/lib/red_quilt/line.rb +14 -0
- data/lib/red_quilt/lint_pass.rb +71 -0
- data/lib/red_quilt/list.rb +317 -0
- data/lib/red_quilt/node_ref.rb +114 -0
- data/lib/red_quilt/node_type.rb +66 -0
- data/lib/red_quilt/plain_text.rb +46 -0
- data/lib/red_quilt/reference_definition.rb +309 -0
- data/lib/red_quilt/renderer/html.rb +279 -0
- data/lib/red_quilt/renderer/mdast.rb +152 -0
- data/lib/red_quilt/source_map.rb +29 -0
- data/lib/red_quilt/source_span.rb +26 -0
- data/lib/red_quilt/theme.rb +28 -0
- data/lib/red_quilt/themes/default.css +87 -0
- data/lib/red_quilt/version.rb +5 -0
- data/lib/red_quilt.rb +86 -0
- data/mise.toml +2 -0
- data/sig/red_quilt.rbs +45 -0
- metadata +91 -0
|
@@ -0,0 +1,724 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RedQuilt
|
|
4
|
+
class BlockParser
|
|
5
|
+
# Prepares a parser over the source text held by +arena+.
#
# arena     - node store; its #source is split into Line records here.
# footnotes - optional footnote store. When nil, #parse_lines never tries
#             to match footnote definitions and #parse skips the
#             footnote-section move.
def initialize(arena, footnotes: nil)
  @arena = arena
  @footnotes = footnotes
  @references = {}
  @diagnostics = []
  @lines = build_lines(arena.source)

  # One collaborator parser per nested construct, built once and shared
  # by every (possibly nested) block of that type so dispatching to them
  # allocates nothing.
  @list_parser = List::Parser.new(self)
  @blockquote_parser = Blockquote::Parser.new(self)
  @footnote_parser = FootnoteDefinition::Parser.new(self)
end
|
|
18
|
+
|
|
19
|
+
attr_reader :references, :arena, :diagnostics
|
|
20
|
+
|
|
21
|
+
# Parses every source line into the arena.
#
# Creates the DOCUMENT root spanning all source bytes, parses the
# top-level lines beneath it and, when a footnote store was supplied,
# asks the footnote parser to move its section to the end of the root.
#
# Returns the id of the root node.
def parse
  total_bytes = @arena.source.bytesize
  @root_id = @arena.add_node(NodeType::DOCUMENT, source_start: 0, source_len: total_bytes)
  parse_lines(@root_id, @lines, transformed: false)
  @footnote_parser.move_section_to_end(@root_id) if @footnotes
  @root_id
end
|
|
27
|
+
|
|
28
|
+
# Parses +lines+ into block nodes appended beneath +parent_id+.
#
# Returns true when a blank line sat BETWEEN two block-level constructs
# at this scope. List parsing relies on that answer to decide whether an
# item is loose (the spec: an item is loose when it "directly contains
# two block-level elements with a blank line between them"). Constructs
# that emit no arena child here (reference definitions, for example)
# still count as block-level elements for this purpose.
#
# Blank lines that precede the first block are ignored, so the empty
# marker line of a list item (e.g. `-` alone) is never mistaken for a
# blank "between" two blocks.
#
# transformed - forwarded to the heading and paragraph builders.
def parse_lines(parent_id, lines, transformed:)
  gap_between_blocks = false
  blank_pending = false
  block_seen = false
  cursor = 0

  while cursor < lines.length
    current = lines[cursor]
    if current.blank
      # Only remember the blank once a real block has been seen.
      blank_pending ||= block_seen
      cursor += 1
      next
    end

    gap_between_blocks ||= blank_pending
    blank_pending = false
    block_seen = true

    text = current.content
    # Every branch yields the index of the next unconsumed line. The
    # first branch is the fast path: a line that cannot start any other
    # block goes straight to the paragraph parser.
    cursor =
      if paragraph_only_line?(text)
        parse_paragraph(parent_id, lines, cursor, transformed)
      elsif (fence = fenced_code_start(text))
        parse_fenced_code(parent_id, lines, cursor, fence)
      elsif (heading = atx_heading(text))
        append_heading(parent_id, current, heading, transformed)
        cursor + 1
      elsif thematic_break?(text)
        rule_id = @arena.add_node(NodeType::THEMATIC_BREAK,
                                  source_start: current.start_byte,
                                  source_len: span_len(current))
        @arena.append_child(parent_id, rule_id)
        cursor + 1
      elsif @footnotes && (footnote = FootnoteDefinition.match(text))
        @footnote_parser.parse(lines, cursor, footnote, @footnotes, @root_id)
      elsif (reference = ReferenceDefinition.consume(lines, cursor))
        store_reference(reference[:reference], reference[:source_span])
        cursor + reference[:consumed]
      elsif table_start?(lines, cursor)
        parse_table(parent_id, lines, cursor)
      elsif html_block_start?(text)
        parse_html_block(parent_id, lines, cursor)
      elsif Blockquote.match?(text)
        @blockquote_parser.parse(parent_id, lines, cursor)
      elsif List.match(text)
        @list_parser.parse(parent_id, lines, cursor)
      elsif indented_code_line?(text)
        parse_indented_code(parent_id, lines, cursor)
      else
        parse_paragraph(parent_id, lines, cursor, transformed)
      end
  end

  gap_between_blocks
end
|
|
92
|
+
|
|
93
|
+
# Methods the collaborator parsers (List::Parser, Blockquote::Parser,
|
|
94
|
+
# FootnoteDefinition::Parser) call back into.
|
|
95
|
+
|
|
96
|
+
# Tells whether lines[index] ends a lazy continuation because it would
# itself open a new block: an ATX heading, thematic break, fenced code
# opener, HTML block of types 1-6, blockquote, a list item that is
# allowed to interrupt a paragraph, or a table header.
#
# HTML block type 7 is deliberately excluded: it does not break lazy
# continuation. This is the paragraph_interrupt? predicate without its
# "index > 0" guard.
def lazy_break?(lines, index)
  text = lines[index].content
  return true if atx_heading(text) || thematic_break?(text) || fenced_code_start(text)

  html_type = html_block_type(text)
  return true if html_type && html_type != 7
  return true if Blockquote.match?(text)

  item = List.match(text)
  return true if item && List.interrupts_paragraph?(item)
  return true if table_start?(lines, index)

  false
end
|
|
117
|
+
|
|
118
|
+
# Thematic break per CommonMark: 0-3 spaces of indent, then 3+ of
|
|
119
|
+
# the same character (`*`, `-`, or `_`) optionally separated by
|
|
120
|
+
# whitespace, and nothing else on the line. Lines indented 4+ spaces
|
|
121
|
+
# are indented code, not thematic breaks.
|
|
122
|
+
THEMATIC_BREAK_RE = /\A {0,3}(?:(?:\*[ \t]*){3,}|(?:-[ \t]*){3,}|(?:_[ \t]*){3,})\z/
|
|
123
|
+
|
|
124
|
+
private_constant :THEMATIC_BREAK_RE
|
|
125
|
+
|
|
126
|
+
# True when +text+ is a thematic break line as described by
# THEMATIC_BREAK_RE (0-3 spaces of indent, then three or more of the same
# `*`, `-` or `_` character, optionally separated by spaces or tabs).
def thematic_break?(text)
  THEMATIC_BREAK_RE.match?(text)
end
|
|
129
|
+
|
|
130
|
+
# True when +content+ does not open any of the block types checked
# below, i.e. the line may be treated as paragraph text: it is not
# indented code, a fenced code opener, an ATX heading, a thematic break,
# an HTML block start, a list item or a blockquote.
def paragraph_eligible_line?(content)
  opens_other_block =
    indented_code_line?(content) ||
    fenced_code_start(content) ||
    atx_heading(content) ||
    thematic_break?(content) ||
    html_block_start?(content) ||
    List.match(content) ||
    Blockquote.match?(content)

  !opens_other_block
end
|
|
141
|
+
|
|
142
|
+
private
|
|
143
|
+
|
|
144
|
+
# 256-entry lookup table: BLOCK_START_BYTES[b] is true when byte +b+ can
# begin a non-paragraph block (after 0-3 leading spaces). A line whose
# first non-space byte is absent from the table is sent straight to
# parse_paragraph without running the specific block-start predicates.
#
# Members: `#` (ATX), `` ` `` and `~` (fences), `*` `-` `+` `_` (thematic
# breaks and list markers), `[` (reference definition), `>` (blockquote),
# `<` (HTML block), tab (indented code when a tab provides the indent)
# and the digits `0`-`9` (ordered list).
BLOCK_START_BYTES =
  "#`~*-+_[><\t0123456789".bytes.each_with_object(Array.new(256, false)) do |byte, table|
    table[byte] = true
  end.freeze
|
|
158
|
+
|
|
159
|
+
private_constant :BLOCK_START_BYTES
|
|
160
|
+
|
|
161
|
+
# Fast pre-check used by parse_lines: true when +content+ cannot start
# any non-paragraph block, so the full predicate fan-out can be skipped.
# The check is conservative - anything ambiguous answers false and falls
# through to the full dispatch.
def paragraph_only_line?(content)
  size = content.bytesize
  pos = 0
  # Up to 3 leading spaces still belong to the block prefix.
  pos += 1 while pos < 3 && pos < size && content.getbyte(pos) == 0x20
  return false if pos >= size

  lead = content.getbyte(pos)
  # A fourth space means the line is an indented-code candidate.
  return false if pos == 3 && lead == 0x20
  # The first non-space byte gates every block start we recognise.
  return false if BLOCK_START_BYTES[lead]

  # Table rows always contain a pipe, so any `|` disqualifies the line.
  !content.include?("|")
end
|
|
185
|
+
|
|
186
|
+
# Consumes a fenced code block whose opening fence is lines[index].
#
# fence - hash from fenced_code_start (:char, :count, :info, :indent).
#
# Content runs until a matching closing fence or the end of +lines+; the
# closing fence, when present, is consumed as well. Appends a CODE_BLOCK
# node (str1 = code text, str2 = info string) to +parent_id+. The node's
# source span covers the content lines, or the opening fence line when
# the block has no content.
#
# Returns the index of the first line after the block.
def parse_fenced_code(parent_id, lines, index, fence)
  opener = lines[index]
  cursor = index + 1
  body_lines = []
  while cursor < lines.length &&
        !fenced_code_close?(lines[cursor].content, fence[:char], fence[:count])
    body_lines << lines[cursor]
    cursor += 1
  end
  # Step over the closing fence unless input ended first.
  cursor += 1 if cursor < lines.length

  # CommonMark: a fence indented by N spaces strips up to N spaces (never
  # more) from every content line.
  fence_indent = fence[:indent] || 0
  code = body_lines.map { |body| strip_leading_spaces(body.content, fence_indent) }.join("\n")
  code << "\n" unless body_lines.empty?

  first_line, last_line =
    body_lines.empty? ? [opener, opener] : [body_lines.first, body_lines.last]
  span_start = first_line.start_byte
  span_end = last_line.end_byte

  code_id = @arena.add_node(NodeType::CODE_BLOCK,
                            source_start: span_start,
                            source_len: span_end - span_start,
                            str1: code,
                            str2: fence[:info])
  @arena.append_child(parent_id, code_id)
  cursor
end
|
|
216
|
+
|
|
217
|
+
# Consumes an indented code block starting at lines[index].
#
# Collects consecutive lines that are blank or indented by 4+ columns,
# removing up to 4 columns of leading whitespace from each (tab-aware,
# via Indentation.strip_columns). Trailing blank lines are handed back
# to the caller rather than kept in the block. Appends a CODE_BLOCK node
# to +parent_id+.
#
# Returns the index of the first line after the block.
def parse_indented_code(parent_id, lines, index)
  first_index = index
  collected = []
  while index < lines.length
    candidate = lines[index]
    break if !candidate.blank && !indented_code_line?(candidate.content)

    # Blank lines go through the same stripping so any content beyond
    # column 4 is preserved verbatim.
    collected << Indentation.strip_columns(candidate.content, 4)
    index += 1
  end

  # Trailing blank lines are not part of the code block.
  until collected.empty? || !collected.last.strip.empty?
    collected.pop
    index -= 1
  end

  span_start = lines[first_index].start_byte
  span_end = lines[index - 1].end_byte
  code = collected.empty? ? "" : collected.join("\n") + "\n"

  code_id = @arena.add_node(NodeType::CODE_BLOCK,
                            source_start: span_start,
                            source_len: span_end - span_start,
                            str1: code)
  @arena.append_child(parent_id, code_id)
  index
end
|
|
248
|
+
|
|
249
|
+
HTML_BLOCK_FIXED_TERMINATORS = {
|
|
250
|
+
2 => "-->",
|
|
251
|
+
3 => "?>",
|
|
252
|
+
4 => ">",
|
|
253
|
+
5 => "]]>",
|
|
254
|
+
}.freeze
|
|
255
|
+
|
|
256
|
+
private_constant :HTML_BLOCK_FIXED_TERMINATORS
|
|
257
|
+
|
|
258
|
+
# Consumes the HTML block that opens at lines[index].
#
# The block type decides where the block ends (see
# locate_html_block_end). The raw line contents, joined with newlines,
# become the str1 of an HTML_BLOCK node appended to +parent_id+.
#
# Returns the index of the first line after the block.
def parse_html_block(parent_id, lines, index)
  kind = html_block_type(lines[index].content)
  last_index = locate_html_block_end(lines, index, kind)

  span_start = lines[index].start_byte
  span_end = lines[last_index].end_byte
  raw_html = (index..last_index).map { |i| lines[i].content }.join("\n")

  html_id = @arena.add_node(NodeType::HTML_BLOCK,
                            source_start: span_start,
                            source_len: span_end - span_start,
                            str1: raw_html)
  @arena.append_child(parent_id, html_id)
  last_index + 1
end
|
|
273
|
+
|
|
274
|
+
# Finds the index of the last line of the HTML block opening at
# lines[index].
#
# type - HTML block type (1-7) as returned by html_block_type.
#
# * Type 1 ends on the first line (the opening line included) containing
#   any of `</script>`, `</pre>`, `</style>` or `</textarea>`, matched
#   case-insensitively. Per CommonMark the end tag need not match the
#   start tag, so the opener's tag name is not consulted here.
# * Types 2-5 end on the first line containing the fixed terminator
#   supplied by html_block_terminator.
# * Types 6 and 7 end just before the next blank line.
#
# When no end condition is met the block extends to the last line.
def locate_html_block_end(lines, index, type)
  if type == 1
    closers = %w[</script> </pre> </style> </textarea>]
    while index < lines.length
      haystack = lines[index].content.downcase
      return index if closers.any? { |closer| haystack.include?(closer) }

      index += 1
    end
    return lines.length - 1
  end

  terminator = html_block_terminator(type, lines[index].content)
  if terminator
    while index < lines.length
      return index if lines[index].content.include?(terminator)

      index += 1
    end
    lines.length - 1
  else
    # Types 6 & 7: terminated by a blank line (or end of input).
    index += 1 while index < lines.length && !lines[index].blank
    index - 1
  end
end
|
|
293
|
+
|
|
294
|
+
# Returns the literal string that ends an HTML block of +type+, or nil
# when the type has no string terminator.
#
# Type 1 yields the closing tag for the name extracted from +first_line+;
# types 2-5 use the fixed entries of HTML_BLOCK_FIXED_TERMINATORS; any
# other type yields nil.
def html_block_terminator(type, first_line)
  return "</#{extract_closing_tag_name(first_line)}>" if type == 1

  HTML_BLOCK_FIXED_TERMINATORS[type] if (2..5).cover?(type)
end
|
|
302
|
+
|
|
303
|
+
# Returns the lower-cased tag name (script, pre, style or textarea) that
# opens +text+, falling back to "script" when none of them is found.
#
# Up to 3 leading spaces are skipped before the `<`, mirroring the indent
# that html_block_type accepts; without that, an indented `<pre>` opener
# would be given the "script" fallback.
def extract_closing_tag_name(text)
  match = /\A {0,3}<(script|pre|style|textarea)/i.match(text)
  match ? match[1].downcase : "script"
end
|
|
307
|
+
|
|
308
|
+
# Consumes a table whose header row is lines[index] and whose delimiter
# row is lines[index + 1].
#
# The caller must already have confirmed table_start?(lines, index),
# which checks both the delimiter pattern and that header and delimiter
# rows have the same number of cells.
#
# Body rows are the following non-blank lines that still look like table
# rows. Appends a TABLE node to +parent_id+ with one TABLE_ROW per header
# and body row; the delimiter row produces no node. The table's source
# span runs from the header line to the last body row (or to the header
# line itself when there are no body rows).
#
# Returns the index of the first line after the table.
def parse_table(parent_id, lines, index)
  header_line = lines[index]
  header_cells = split_table_row(header_line.content)

  body_rows = []
  cursor = index + 2 # skip header and delimiter rows
  while cursor < lines.length
    candidate = lines[cursor]
    break if candidate.blank || !table_row?(candidate.content)

    body_rows << candidate
    cursor += 1
  end

  final_line = body_rows.last || header_line
  table_id = @arena.add_node(NodeType::TABLE,
                             source_start: header_line.start_byte,
                             source_len: final_line.end_byte - header_line.start_byte)
  @arena.append_child(parent_id, table_id)

  append_table_row(table_id, header_line, header_cells, true)
  body_rows.each do |row|
    append_table_row(table_id, row, split_table_row(row.content), false)
  end

  cursor
end
|
|
335
|
+
|
|
336
|
+
# Appends one TABLE_ROW node to +table_id+ and one TABLE_CELL child per
# entry of +cells+.
#
# line   - source line of the row; row and cells all carry this line's
#          byte span.
# cells  - raw cell strings; each is stored whitespace-stripped in str1.
# header - truthy for the header row; stored as int1 = 1 (else 0) on the
#          row and on every cell.
def append_table_row(table_id, line, cells, header)
  header_flag = header ? 1 : 0
  row_start = line.start_byte
  row_len = span_len(line)

  row_id = @arena.add_node(NodeType::TABLE_ROW,
                           source_start: row_start,
                           source_len: row_len,
                           int1: header_flag)
  @arena.append_child(table_id, row_id)

  cells.each do |cell_text|
    cell_id = @arena.add_node(NodeType::TABLE_CELL,
                              source_start: row_start,
                              source_len: row_len,
                              int1: header_flag,
                              str1: cell_text.strip)
    @arena.append_child(row_id, cell_id)
  end
end
|
|
352
|
+
|
|
353
|
+
# Appends a HEADING node for an ATX heading found on +line+.
#
# heading     - hash from atx_heading (:level, :content, :content_start).
# transformed - when truthy the heading text is stored as a literal in
#               str1; otherwise str1 is nil and only the source span
#               (start of the content, length of the right-stripped
#               content in bytes) identifies the text.
def append_heading(parent_id, line, heading, transformed)
  text = heading[:content].to_s.rstrip
  literal = transformed ? text : nil
  node_id = @arena.add_node(NodeType::HEADING,
                            source_start: line.start_byte + heading[:content_start],
                            source_len: text.bytesize,
                            int1: heading[:level],
                            str1: literal)
  @arena.append_child(parent_id, node_id)
end
|
|
363
|
+
|
|
364
|
+
# Consumes a paragraph (or setext heading) starting at lines[index] and
# appends the resulting node to +parent_id+.
#
# Lines are collected until a blank line, a setext underline, or a line
# that interrupts the paragraph. Lines flagged as lazy continuations
# always extend the paragraph: they are exempt from both the setext and
# the interrupt checks, so they are never split off into a new block.
# A `[label]: ...` line inside an open paragraph is simply absorbed as
# paragraph text; no reference-definition scan happens here.
#
# Returns the index of the first line after the paragraph (after the
# underline, for a setext heading).
def parse_paragraph(parent_id, lines, index, transformed)
  first_index = index
  collected = []
  heading_level = nil

  while index < lines.length
    current = lines[index]
    break if current.blank

    unless current.lazy_continuation
      # A setext underline needs at least one paragraph line above it.
      # It is tested before the interrupt check so that "---" / "==="
      # turns the open paragraph into a heading rather than being read
      # as a thematic break.
      if collected.any? && (level = setext_underline_level(current.content))
        heading_level = level
        index += 1
        break
      end

      break if index > first_index && paragraph_interrupt?(lines, index)
    end

    collected << current
    index += 1
  end

  # The first line may carry 0-3 spaces of indent; continuation lines
  # lose all leading whitespace before joining.
  texts = collected.each_with_index.map do |entry, position|
    if position.zero?
      strip_leading_spaces(entry.content, 3)
    else
      strip_leading_whitespace(entry.content)
    end
  end
  # Trailing whitespace on the last line is dropped (no hard break
  # without a following content line).
  texts[-1] = texts[-1].sub(/[ \t]+\z/, "") if texts.any?
  text_altered = texts.each_with_index.any? do |piece, position|
    piece.length != collected[position].content.length
  end
  text = texts.join("\n")
  span_start = collected.first.start_byte
  span_end = collected.last.end_byte

  node_id =
    if heading_level
      @arena.add_node(NodeType::HEADING,
                      source_start: span_start,
                      source_len: span_end - span_start,
                      int1: heading_level,
                      str1: text.strip)
    else
      # A paragraph carries a literal only when its inline text cannot be
      # read back as one contiguous source slice: the lines were already
      # transformed by an enclosing container, or whitespace was removed
      # above. Otherwise str1 stays nil and the source bytes are used.
      literal = transformed || text_altered ? text : nil
      @arena.add_node(NodeType::PARAGRAPH,
                      source_start: span_start,
                      source_len: span_end - span_start,
                      str1: literal)
    end
  @arena.append_child(parent_id, node_id)
  index
end
|
|
441
|
+
|
|
442
|
+
# Classifies +text+ as a setext heading underline.
#
# Returns 1 for a run of `=` (h1), 2 for a run of `-` (h2), and nil for
# anything else. Up to 3 leading spaces and any amount of trailing
# spaces/tabs are tolerated.
def setext_underline_level(text)
  underline = /\A {0,3}(=+|-+)[ \t]*\z/.match(text)
  return nil unless underline

  underline[1][0] == "=" ? 1 : 2
end
|
|
451
|
+
|
|
452
|
+
# Tells whether lines[index] interrupts an open paragraph.
#
# The first line of the input (index 0) can never interrupt anything.
# For every other line the answer is exactly lazy_break?: the line must
# itself start a new block (ATX heading, thematic break, fenced code,
# HTML block types 1-6, blockquote, an interrupting list item, or a
# table). Delegating keeps the two predicates from drifting apart.
def paragraph_interrupt?(lines, index)
  return false unless index > 0

  lazy_break?(lines, index)
end
|
|
471
|
+
|
|
472
|
+
# Removes at most +max+ leading space (0x20) bytes from +text+.
#
# Returns +text+ itself (the same object, no allocation) when +max+ is
# not positive or when +text+ does not start with a space.
def strip_leading_spaces(text, max)
  return text if max <= 0

  skipped = 0
  # getbyte returns nil past the end of the string, which stops the scan.
  skipped += 1 while skipped < max && text.getbyte(skipped) == 0x20
  skipped.zero? ? text : text.byteslice(skipped..)
end
|
|
487
|
+
|
|
488
|
+
# Removes every leading space (0x20) and tab (0x09) byte from +text+.
#
# Returns +text+ itself (the same object, no allocation) when it already
# starts with a non-whitespace byte or is empty.
def strip_leading_whitespace(text)
  offset = 0
  loop do
    byte = text.getbyte(offset) # nil once past the end
    break unless byte == 0x20 || byte == 0x09

    offset += 1
  end
  offset.zero? ? text : text.byteslice(offset..)
end
|
|
504
|
+
|
|
505
|
+
# Splits +source+ on "\n" into Line records.
#
# Each record receives the raw line text, its start byte offset, its end
# byte offset (exclusive, newline not included) and a blank flag. A
# single trailing newline does not produce an extra empty line.
#
# The blank test is /[^ \t]/ rather than /\S/ because CommonMark defines
# a blank line as empty or containing only spaces (U+0020) or tabs
# (U+0009); other whitespace such as form feed does NOT make a line
# blank and must continue an enclosing paragraph.
def build_lines(source)
  # split with -1 keeps trailing empty fields, so every line is
  # represented; the field after a final newline is discarded below.
  segments = source.split("\n", -1)
  segments.pop if source.end_with?("\n")

  cursor = 0
  segments.map do |raw|
    start = cursor
    finish = start + raw.bytesize
    cursor = finish + 1 # step over the newline separator
    Line.new(raw, start, finish, !raw.match?(/[^ \t]/))
  end
end
|
|
523
|
+
|
|
524
|
+
# ATX headings per CommonMark spec:
|
|
525
|
+
# - 0-3 spaces of indent, then 1-6 `#`s
|
|
526
|
+
# - either end-of-line (empty heading) or at least one space/tab
|
|
527
|
+
# followed by the content
|
|
528
|
+
# - optional trailing `#`s are only stripped when separated from the
|
|
529
|
+
# content by whitespace (so `# foo#` keeps the `#`)
|
|
530
|
+
ATX_HEADING_RE = /\A {0,3}(\#{1,6})(?:[ \t]+\#+[ \t]*|[ \t]+(.*?)(?:[ \t]+\#+)?[ \t]*|[ \t]*)\z/
|
|
531
|
+
|
|
532
|
+
private_constant :ATX_HEADING_RE
|
|
533
|
+
|
|
534
|
+
# Matches +text+ against ATX_HEADING_RE.
#
# Returns nil when the line is not an ATX heading, otherwise a hash:
#   :level         - number of opening `#` characters (1-6)
#   :content       - heading text, "" for an empty heading
#   :content_start - offset of the content within +text+; the end of the
#                    line when the content is empty
#
# The offset comes from the capture group itself. Searching +text+ for
# the content string would find its first occurrence, which can sit
# inside the opening marker (for "# # #" the content "#" also occurs at
# offset 0). Everything the pattern allows before the content is ASCII
# (spaces, `#`, tabs), so the character offset reported by MatchData#begin
# equals the byte offset callers add to the line's start byte.
def atx_heading(text)
  match = ATX_HEADING_RE.match(text)
  return unless match

  content = match[2].to_s
  content_index = content.empty? ? text.bytesize : match.begin(2)
  { level: match[1].length, content: content, content_start: content_index }
end
|
|
542
|
+
|
|
543
|
+
# Recognises the opening line of a fenced code block.
#
# Returns nil when +text+ is not an opener, otherwise a hash:
#   :char   - fence character ("`" or "~")
#   :count  - length of the fence run (3 or more)
#   :info   - info string after ReferenceDefinition.unescape_text
#   :indent - number of leading spaces (0-3)
#
# A backtick fence whose info string contains a backtick is rejected, as
# CommonMark requires (it would be ambiguous with the fence itself).
def fenced_code_start(text)
  opener = /\A( {0,3})(`{3,}|~{3,})[ \t]*(.*?)\s*\z/.match(text)
  return unless opener

  indent, marker, info = opener.captures
  return if marker.start_with?("`") && info.include?("`")

  {
    char: marker[0],
    count: marker.length,
    info: ReferenceDefinition.unescape_text(info),
    indent: indent.length,
  }
end
|
|
559
|
+
|
|
560
|
+
# True when +text+ closes a fence opened with +count+ copies of +char+.
#
# Accepted shape: 0-3 leading spaces, at least +count+ repetitions of
# +char+, then only spaces or tabs to the end of the line. Implemented as
# a byte scan so no regex has to be built per (char, count) pair.
def fenced_code_close?(text, char, count)
  size = text.bytesize
  marker = char.getbyte(0)

  pos = 0
  # CommonMark: at most 3 spaces of indent.
  pos += 1 while pos < 3 && pos < size && text.getbyte(pos) == 0x20

  run_start = pos
  pos += 1 while pos < size && text.getbyte(pos) == marker
  return false if pos - run_start < count

  # Whatever follows the fence run must be spaces or tabs only.
  (pos...size).all? do |offset|
    byte = text.getbyte(offset)
    byte == 0x20 || byte == 0x09
  end
end
|
|
585
|
+
|
|
586
|
+
# True when +text+ begins with at least 4 columns of whitespace, as
# measured by Indentation.leading_columns. Per the original note, tabs
# are counted by expanding them to a tab stop of 4 columns (CommonMark).
def indented_code_line?(text)
  Indentation.leading_columns(text) >= 4
end
|
|
591
|
+
|
|
592
|
+
# True when +text+ opens an HTML block of any type (1-7).
#
# A line indented by four or more spaces is an indented code block and
# never an HTML block, so it is rejected up front. Lines indented by 0-3
# spaces are passed to html_block_type, which accepts that much indent.
def html_block_start?(text)
  # Indented code block takes precedence (4+ spaces).
  return false if text.start_with?("    ")

  !html_block_type(text).nil?
end
|
|
600
|
+
|
|
601
|
+
# Classifies +text+ as the opening line of a CommonMark HTML block.
#
# Returns the block type (1-7) or nil when the line does not open an
# HTML block. Up to 3 leading spaces are allowed before the `<`.
#
#   1 - <script, <pre, <style or <textarea (case-insensitive) followed
#       by space, tab, `>` or end of line
#   2 - <!--
#   3 - <?
#   4 - <! followed by an ASCII letter (upper or lower case, so both
#       `<!DOCTYPE html>` and `<!doctype html>` qualify)
#   5 - <![CDATA[
#   6 - one of the block-level tag names in HTML_BLOCK_TYPE_6_RE
#   7 - a complete open or closing tag spanning the line
def html_block_type(text)
  size = text.bytesize
  i = 0
  # Fast reject: every HTML block starts with `<` after 0-3 spaces, so
  # peek at the leading non-space byte before allocating anything.
  i += 1 while i < 3 && i < size && text.getbyte(i) == 0x20
  return nil unless i < size && text.getbyte(i) == 0x3C

  # The skipped prefix is ASCII spaces, so a byte slice is safe here.
  stripped = i.zero? ? text : text.byteslice(i..)

  # Type 1: the separator is restricted to space, tab, `>` or the line
  # ending (not any whitespace class).
  return 1 if stripped.match?(%r{\A<(script|pre|style|textarea)(?:[ \t]|>|$)}i)

  # Type 2: <!--
  return 2 if stripped.start_with?("<!--")

  # Type 3: <?
  return 3 if stripped.start_with?("<?")

  # Type 4: <! followed by an ASCII letter.
  return 4 if stripped.match?(%r{\A<![A-Za-z]})

  # Type 5: <![CDATA[
  return 5 if stripped.start_with?("<![CDATA[")

  # Type 6: line opens with one of the listed block-level tags.
  return 6 if stripped.match?(HTML_BLOCK_TYPE_6_RE)

  # Type 7: a complete open or closing tag spanning the line.
  return 7 if valid_html_tag?(stripped)

  nil
end
|
|
639
|
+
|
|
640
|
+
HTML_BLOCK_TYPE_6_NAMES = %w[
|
|
641
|
+
address article aside base basefont blockquote body caption center
|
|
642
|
+
col colgroup dd details dialog dir div dl dt fieldset figcaption
|
|
643
|
+
figure footer form frame frameset h1 h2 h3 h4 h5 h6 head header
|
|
644
|
+
hr html iframe legend li link main menu menuitem nav noframes ol
|
|
645
|
+
optgroup option p param search section summary table tbody td
|
|
646
|
+
tfoot th thead title tr track ul
|
|
647
|
+
].freeze
|
|
648
|
+
HTML_BLOCK_TYPE_6_RE =
|
|
649
|
+
%r{\A</?(?:#{HTML_BLOCK_TYPE_6_NAMES.join('|')})(?:[ \t]|>|/>|\z)}i
|
|
650
|
+
|
|
651
|
+
private_constant :HTML_BLOCK_TYPE_6_NAMES, :HTML_BLOCK_TYPE_6_RE
|
|
652
|
+
|
|
653
|
+
# True when lines[index] and lines[index + 1] form the header and
# delimiter rows of a table.
#
# Requirements: a following line exists; the header contains a pipe; the
# delimiter row has at least one cell and exactly as many cells as the
# header (GFM: "The header row must match the delimiter row in the
# number of cells. If not, a table will not be recognized."); and every
# delimiter cell, once stripped, is dashes with optional leading and/or
# trailing colon.
def table_start?(lines, index)
  return false unless index + 1 < lines.length

  header_text = lines[index].content
  return false unless table_row?(header_text)

  header_count = split_table_row(header_text).length
  delimiter_cells = split_table_row(lines[index + 1].content)
  return false if delimiter_cells.empty?
  return false if delimiter_cells.length != header_count

  delimiter_cells.all? { |cell| /\A:?-+:?\z/.match?(cell.strip) }
end
|
|
668
|
+
|
|
669
|
+
# True when +text+ could be a table row, i.e. it contains at least one
# pipe character.
def table_row?(text)
  text.include?("|")
end
|
|
672
|
+
|
|
673
|
+
# Splits a table row into its raw (unstripped) cell strings.
#
# Surrounding whitespace is removed from the row, then one leading and
# one trailing pipe are dropped before splitting on `|`. A row that is
# empty after that yields [].
#
# GFM allows a literal pipe inside a cell when it is escaped as `\|`.
# Such a pipe does not separate cells and is stored as a plain `|`. A
# backslash escapes only the character right after it, so `\\|` is an
# escaped backslash followed by a real separator; every backslash
# sequence other than `\|` is kept verbatim.
def split_table_row(text)
  body = text.strip
  body = body[1..] if body.start_with?("|")

  # Common case: no backslash anywhere, so every pipe is a separator.
  unless body.include?("\\")
    body = body[0...-1] if body.end_with?("|")
    return body.split("|", -1)
  end

  cells = []
  cell = String.new(encoding: body.encoding)
  escaped = false
  ended_on_separator = false
  body.each_char do |char|
    ended_on_separator = false
    if escaped
      # Keep the backslash unless it was escaping a pipe.
      cell << "\\" unless char == "|"
      cell << char
      escaped = false
    elsif char == "\\"
      escaped = true
    elsif char == "|"
      cells << cell
      cell = String.new(encoding: body.encoding)
      ended_on_separator = true
    else
      cell << char
    end
  end
  cell << "\\" if escaped # lone backslash at the end of the row
  # A closing unescaped pipe ends the row; it does not open another cell.
  cells << cell unless ended_on_separator

  cells == [""] ? [] : cells
end
|
|
679
|
+
|
|
680
|
+
# Type 7: a complete open or closing tag on its own line.
|
|
681
|
+
# Closing tags must not have attributes.
|
|
682
|
+
#
|
|
683
|
+
# HTML tag separators per CommonMark 6.6 are space, tab, or up to one
|
|
684
|
+
# line ending -- not the broader \s class (which would include form
|
|
685
|
+
# feed and vertical tab).
|
|
686
|
+
HTML_TYPE_7_OPEN_TAG_RE = %r{
|
|
687
|
+
\A
|
|
688
|
+
<[A-Za-z][A-Za-z0-9-]*
|
|
689
|
+
(?:[ \t\r\n]+[A-Za-z_:][A-Za-z0-9_.:-]*(?:[ \t\r\n]*=[ \t\r\n]*(?:"[^"\n]*"|'[^'\n]*'|[^ \t\r\n"'=<>`]+))?)*
|
|
690
|
+
[ \t\r\n]*/?>
|
|
691
|
+
\z
|
|
692
|
+
}x
|
|
693
|
+
HTML_TYPE_7_CLOSING_TAG_RE = %r{\A</[A-Za-z][A-Za-z0-9-]*[ \t\r\n]*>\z}
|
|
694
|
+
|
|
695
|
+
private_constant :HTML_TYPE_7_OPEN_TAG_RE, :HTML_TYPE_7_CLOSING_TAG_RE
|
|
696
|
+
|
|
697
|
+
# True when +text+ is, in its entirety, a single open tag or closing tag
# as described by HTML_TYPE_7_OPEN_TAG_RE / HTML_TYPE_7_CLOSING_TAG_RE.
# The `<` check is a cheap reject before either regex runs.
def valid_html_tag?(text)
  text.start_with?("<") &&
    (HTML_TYPE_7_OPEN_TAG_RE.match?(text) || HTML_TYPE_7_CLOSING_TAG_RE.match?(text))
end
|
|
703
|
+
|
|
704
|
+
# Records a link reference definition under its label.
#
# reference   - hash with :label, :destination and :title.
# source_span - span attached to the diagnostic for a duplicate.
#
# The first definition of a label wins: a later one with the same label
# is not stored and a :duplicate_reference warning is added to
# @diagnostics instead.
#
# Returns the stored { destination:, title: } hash, or nil for a
# duplicate.
def store_reference(reference, source_span)
  label = reference[:label]
  unless @references.key?(label)
    return @references[label] = {
      destination: reference[:destination],
      title: reference[:title],
    }
  end

  @diagnostics << Diagnostic.new(
    severity: :warning,
    rule: :duplicate_reference,
    message: "Duplicate reference definition #{label.inspect} — keeping the first",
    source_span: source_span,
  )
  nil
end
|
|
719
|
+
|
|
720
|
+
# Byte length of +line+: its end byte offset minus its start byte
# offset.
def span_len(line)
  line.end_byte - line.start_byte
end
|
|
723
|
+
end
|
|
724
|
+
end
|