red_quilt 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +109 -0
- data/.rubocop_todo.yml +7 -0
- data/CHANGELOG.md +57 -0
- data/README.md +284 -0
- data/Rakefile +8 -0
- data/ast-spec.md +1227 -0
- data/docs/architecture.md +81 -0
- data/docs/arena-usage.md +363 -0
- data/docs/commonmark-conformance.md +241 -0
- data/exe/redquilt +7 -0
- data/lib/red_quilt/arena.rb +366 -0
- data/lib/red_quilt/block_parser.rb +724 -0
- data/lib/red_quilt/blockquote.rb +151 -0
- data/lib/red_quilt/cli.rb +182 -0
- data/lib/red_quilt/diagnostic.rb +47 -0
- data/lib/red_quilt/document.rb +126 -0
- data/lib/red_quilt/extended_autolink_pass.rb +185 -0
- data/lib/red_quilt/footnote_definition.rb +147 -0
- data/lib/red_quilt/footnote_pass.rb +39 -0
- data/lib/red_quilt/footnote_registry.rb +68 -0
- data/lib/red_quilt/indentation.rb +73 -0
- data/lib/red_quilt/inline/builder.rb +674 -0
- data/lib/red_quilt/inline/flanking.rb +120 -0
- data/lib/red_quilt/inline/html_entities.rb +2180 -0
- data/lib/red_quilt/inline/lexer.rb +280 -0
- data/lib/red_quilt/inline/link_scanner.rb +315 -0
- data/lib/red_quilt/inline/token_kind.rb +39 -0
- data/lib/red_quilt/inline/tokens.rb +73 -0
- data/lib/red_quilt/inline.rb +34 -0
- data/lib/red_quilt/inline_pass.rb +53 -0
- data/lib/red_quilt/line.rb +14 -0
- data/lib/red_quilt/lint_pass.rb +71 -0
- data/lib/red_quilt/list.rb +317 -0
- data/lib/red_quilt/node_ref.rb +114 -0
- data/lib/red_quilt/node_type.rb +66 -0
- data/lib/red_quilt/plain_text.rb +46 -0
- data/lib/red_quilt/reference_definition.rb +309 -0
- data/lib/red_quilt/renderer/html.rb +279 -0
- data/lib/red_quilt/renderer/mdast.rb +152 -0
- data/lib/red_quilt/source_map.rb +29 -0
- data/lib/red_quilt/source_span.rb +26 -0
- data/lib/red_quilt/theme.rb +28 -0
- data/lib/red_quilt/themes/default.css +87 -0
- data/lib/red_quilt/version.rb +5 -0
- data/lib/red_quilt.rb +86 -0
- data/mise.toml +2 -0
- data/sig/red_quilt.rbs +45 -0
- metadata +91 -0
|
@@ -0,0 +1,674 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RedQuilt
|
|
4
|
+
module Inline
|
|
5
|
+
# Consumes a token stream produced by Lexer and adds inline nodes
|
|
6
|
+
# to the arena under parent_id.
|
|
7
|
+
#
|
|
8
|
+
# Processing happens in two phases:
|
|
9
|
+
# 1. linear_pass — code spans, brackets (link/image), autolinks,
|
|
10
|
+
# HTML, simple inlines. Emphasis delimiter runs are added as
|
|
11
|
+
# provisional TEXT nodes and pushed onto a delimiter stack.
|
|
12
|
+
# 2. process_emphasis — CommonMark spec 6.2 algorithm pairs up
|
|
13
|
+
# delimiter stack entries into EMPHASIS / STRONG nodes.
|
|
14
|
+
class Builder
|
|
15
|
+
SAFE_SCHEMES = %w[http https mailto ftp tel ssh].freeze
|
|
16
|
+
# Autolinks (`<scheme:...>`) are not run through the SAFE_SCHEMES
|
|
17
|
+
# allowlist: CommonMark permits arbitrary schemes there (e.g.
|
|
18
|
+
# `<made-up-scheme://x>`), and an allowlist would break that
|
|
19
|
+
# conformance. Only the schemes that execute script when the link
|
|
20
|
+
# is navigated are denied.
|
|
21
|
+
UNSAFE_AUTOLINK_SCHEMES = %w[javascript vbscript data].freeze
|
|
22
|
+
|
|
23
|
+
# `count` is the CommonMark delimiter-run length; a Delimiter is
|
|
24
|
+
# never enumerated, so shadowing Struct#count (from Enumerable) is
|
|
25
|
+
# intentional rather than a footgun.
|
|
26
|
+
Delimiter = Struct.new(:node_id, :char, :count, :can_open, :can_close) # rubocop:disable Lint/StructNewOverride
|
|
27
|
+
|
|
28
|
+
Bracket = Struct.new(:token_id, :node_id, :image, :active, :delim_stack_size)
|
|
29
|
+
|
|
30
|
+
# track_source: when true, arena nodes carry the byte ranges supplied
|
|
31
|
+
# by the lexer. When false (used for inputs whose source has been
|
|
32
|
+
# materialized into a separate string, e.g. transformed blockquote
|
|
33
|
+
# lines), source_start/source_len are not recorded; in that mode every
|
|
34
|
+
# text node carries its content in str1 so Arena#text still works.
|
|
35
|
+
#
|
|
36
|
+
# diagnostics: an optional Array the builder appends warnings to
|
|
37
|
+
# (unsafe URL schemes, missing references, ...). The caller — usually
|
|
38
|
+
# InlinePass — is expected to forward Document#diagnostics here.
|
|
39
|
+
# @param arena [Object] node store that receives the inline nodes
# @param source [String] text the token byte offsets refer to
# @param references [#[]] link reference definitions keyed by normalized label
# @param track_source [Boolean] record byte spans on created nodes
# @param diagnostics [Array, nil] sink for warnings; nil disables reporting
# @param footnotes [Object, nil] footnote registry; nil disables `[^label]`
def initialize(arena, source, references, track_source: true, diagnostics: nil, footnotes: nil)
  # Collaborators handed in by the caller.
  @arena = arena
  @references = references
  @diagnostics = diagnostics
  @footnotes = footnotes
  @track_source = track_source

  # Source text plus a binary copy of it. byteindex on a UTF-8 string
  # raises when the offset lands inside a multibyte sequence, whereas in
  # the binary copy every byte is its own character.
  @source = source
  @source_b = source.b
  @link_scanner = LinkScanner.new(source)
end
|
|
53
|
+
|
|
54
|
+
# Builds the inline subtree for one token stream beneath +parent_id+.
#
# Resets the per-run state (delimiter stack, bracket stack, provisional
# node set), runs the linear pass, then resolves whatever delimiter runs
# are still on the stack. Returns the result of that final
# process_emphasis call.
def build(parent_id, tokens)
  @parent_id = parent_id
  @tokens = tokens

  # Fresh scratch state for every run.
  @provisional_nodes = {}
  @bracket_stack = []
  @delimiter_stack = []

  linear_pass
  process_emphasis(@delimiter_stack)
end
|
|
63
|
+
|
|
64
|
+
private
|
|
65
|
+
|
|
66
|
+
# --------------------------- node helpers ---------------------------
|
|
67
|
+
|
|
68
|
+
# Creates an arena node of +type+ and returns whatever Arena#add_node
# returns (used as the node id by callers in this class).
#
# When source tracking is on, the node records the byte range
# [start_byte, end_byte); otherwise the placeholder span (-1, 0) is stored
# and the byte arguments are ignored.
def add_arena_node(type, start_byte, end_byte, str1: nil, str2: nil, int1: 0, int2: 0)
  origin, length = @track_source ? [start_byte, end_byte - start_byte] : [-1, 0]

  @arena.add_node(type,
                  source_start: origin, source_len: length,
                  str1: str1, str2: str2, int1: int1, int2: int2)
end
|
|
79
|
+
|
|
80
|
+
# Forwards a new byte range for +node_id+ to the arena. Does nothing (and
# returns nil) when source tracking is off.
def update_arena_span(node_id, start_byte, end_byte)
  @arena.update_span(node_id, start_byte, end_byte) if @track_source
end
|
|
85
|
+
|
|
86
|
+
# --------------------------- linear pass ----------------------------
|
|
87
|
+
|
|
88
|
+
# Phase 1: walks the token stream once, left to right, and dispatches on
# the token kind. Code spans and `]` may consume several tokens; their
# resolvers return the index to resume at (or nil when nothing was
# consumed beyond the current token).
def linear_pass
  total = @tokens.length
  cursor = 0

  while cursor < total
    resume_at = nil

    case @tokens.kind(cursor)
    when TokenKind::TEXT
      append_text(@tokens.start_byte(cursor), @tokens.end_byte(cursor), nil)
    when TokenKind::ENTITY, TokenKind::ESCAPED_CHAR
      # These carry their decoded text in str1.
      append_text(@tokens.start_byte(cursor), @tokens.end_byte(cursor), @tokens.str1(cursor))
    when TokenKind::LINE_ENDING
      append_line_ending(cursor)
    when TokenKind::HTML_INLINE
      append_html_inline(cursor)
    when TokenKind::AUTOLINK_URI
      uri = @tokens.str1(cursor)
      append_autolink(cursor, uri, uri)
    when TokenKind::AUTOLINK_EMAIL
      address = @tokens.str1(cursor)
      append_autolink(cursor, "mailto:#{address}", address)
    when TokenKind::CODE_DELIMITER
      resume_at = resolve_code_span(cursor)
      # No matching closer: the backtick run is literal text.
      append_text(@tokens.start_byte(cursor), @tokens.end_byte(cursor), nil) unless resume_at
    when TokenKind::LBRACKET
      push_bracket(cursor, image: false)
    when TokenKind::BANG_LBRACKET
      push_bracket(cursor, image: true)
    when TokenKind::RBRACKET
      resume_at = resolve_rbracket(cursor, cursor + 1)
    when TokenKind::DELIM_RUN
      push_delim_run(cursor)
    end

    cursor = resume_at || (cursor + 1)
  end
end
|
|
129
|
+
|
|
130
|
+
# ---------------------------- text ----------------------------------
|
|
131
|
+
|
|
132
|
+
# Adds text covering [start_byte, end_byte) under the current parent.
#
# +literal+, when given, is the text to store (entities, escapes, `]`).
# Without a literal the content is sliced from the source only when source
# tracking is off; with tracking on, str1 stays nil and only the span is
# recorded. The text is merged into the preceding TEXT child when that
# child is not provisional and can_coalesce? agrees; otherwise a new TEXT
# node is appended.
def append_text(start_byte, end_byte, literal)
  content = literal
  content ||= @source.byteslice(start_byte, end_byte - start_byte).to_s unless @track_source

  tail = @arena.raw_last_child_id(@parent_id)
  mergeable = tail != -1 &&
              @arena.type(tail) == NodeType::TEXT &&
              !@provisional_nodes[tail] &&
              can_coalesce?(tail, start_byte)
  if mergeable
    coalesce_text(tail, content, start_byte, end_byte)
    return
  end

  @arena.append_child(@parent_id, add_arena_node(NodeType::TEXT, start_byte, end_byte, str1: content))
end
|
|
149
|
+
|
|
150
|
+
# Whether new text starting at +start_byte+ may be merged into the TEXT
# node +last_id+. With source tracking the node must end exactly where the
# new text starts; without it the node only needs to hold a str1 value.
def can_coalesce?(last_id, start_byte)
  return @arena.source_end(last_id) == start_byte if @track_source

  !@arena.str1(last_id).nil?
end
|
|
157
|
+
|
|
158
|
+
# Merges incoming text into the existing TEXT node +last_id+.
#
# Without source tracking the incoming text is appended to str1. With
# tracking, two span-only pieces are merged by extending the span alone;
# as soon as either side has a literal, both sides are turned into strings,
# concatenated into str1, and the span is extended to +end_byte+.
def coalesce_text(last_id, materialized, start_byte, end_byte)
  unless @track_source
    return @arena.update_str1(last_id, @arena.str1(last_id) + materialized.to_s)
  end

  prior = @arena.str1(last_id)
  if materialized.nil? && prior.nil?
    update_arena_span(last_id, @arena.source_start(last_id), end_byte)
    return
  end

  head = prior || @arena.text(last_id).to_s
  tail = materialized || @source.byteslice(start_byte, end_byte - start_byte).to_s
  @arena.update_str1(last_id, head + tail)
  update_arena_span(last_id, @arena.source_start(last_id), end_byte)
end
|
|
173
|
+
|
|
174
|
+
# --------------------------- line endings ---------------------------
|
|
175
|
+
|
|
176
|
+
# Appends a HARDBREAK or SOFTBREAK node for the LINE_ENDING token +id+.
#
# The token's int1 is the number of trailing spaces before the line ending
# and int2 == 1 marks the backslash form. Two or more trailing spaces, or
# the backslash form, make a hard break. In both cases any trailing spaces
# are stripped from the preceding text first, so a single trailing space
# does not survive into the output.
def append_line_ending(id)
  first_byte = @tokens.start_byte(id)
  last_byte = @tokens.end_byte(id)
  padding = @tokens.int1(id)
  backslash = @tokens.int2(id) == 1

  strip_trailing_spaces(padding) if padding.positive?

  kind = padding >= 2 || backslash ? NodeType::HARDBREAK : NodeType::SOFTBREAK
  @arena.append_child(@parent_id, add_arena_node(kind, first_byte, last_byte, str1: "\n"))
end
|
|
196
|
+
|
|
197
|
+
# Removes trailing spaces from the last child of the current parent, if
# that child is a TEXT node.
#
# A run of at least +count+ spaces at the end of str1 (when present) is
# dropped. With source tracking on, the node's span is shortened by
# +count+ bytes, never below zero.
def strip_trailing_spaces(count)
  tail = @arena.raw_last_child_id(@parent_id)
  return if tail == -1
  return unless @arena.type(tail) == NodeType::TEXT

  literal = @arena.str1(tail)
  @arena.update_str1(tail, literal.sub(/ {#{count},}\z/, "")) if literal

  return unless @track_source

  kept = [@arena.source_len(tail) - count, 0].max
  origin = @arena.source_start(tail)
  @arena.update_span(tail, origin, origin + kept)
end
|
|
215
|
+
|
|
216
|
+
# --------------------------- HTML / autolink ------------------------
|
|
217
|
+
|
|
218
|
+
# Appends an HTML_INLINE node for token +id+; the token's str1 becomes the
# node's str1 and the token's byte range becomes the node's span.
def append_html_inline(id)
  html_id = add_arena_node(NodeType::HTML_INLINE,
                           @tokens.start_byte(id), @tokens.end_byte(id),
                           str1: @tokens.str1(id))
  @arena.append_child(@parent_id, html_id)
end
|
|
226
|
+
|
|
227
|
+
# Appends a LINK node for the autolink token +id+ with a single TEXT child
# showing +label+. The destination is normalized by the link scanner and
# then passed through block_unsafe_autolink, which may blank it.
def append_autolink(id, destination, label)
  href = block_unsafe_autolink(@link_scanner.normalize_uri(destination))
  link_id = add_arena_node(NodeType::LINK,
                           @tokens.start_byte(id), @tokens.end_byte(id),
                           str1: href)
  @arena.append_child(@parent_id, link_id)

  # NOTE(review): the label node is created through Arena#add_node directly,
  # so it gets the arena's default span rather than the -1/0 placeholder
  # add_arena_node uses; confirm the defaults are equivalent.
  label_id = @arena.add_node(NodeType::TEXT, str1: label)
  @arena.append_child(link_id, label_id)
end
|
|
236
|
+
|
|
237
|
+
# Returns "" (blocking the href) for autolink destinations whose
|
|
238
|
+
# scheme executes script on navigation; otherwise the destination
|
|
239
|
+
# is returned unchanged. Unlike sanitize_destination this is a
|
|
240
|
+
# denylist, to stay CommonMark-conformant for benign custom schemes.
|
|
241
|
+
# @param destination [String] autolink target
# @return [String] +destination+ unchanged, or "" when its scheme
#   (compared case-insensitively) is in UNSAFE_AUTOLINK_SCHEMES; a
#   blocked scheme is also reported as an :unsafe_url warning
def block_unsafe_autolink(destination)
  lowered = destination[%r{\A([a-zA-Z][a-zA-Z0-9+\-.]*):}, 1]&.downcase
  return destination unless lowered && UNSAFE_AUTOLINK_SCHEMES.include?(lowered)

  report_diagnostic(
    severity: :warning,
    rule: :unsafe_url,
    message: "Unsafe URL scheme #{lowered.inspect} blocked",
  )
  ""
end
|
|
253
|
+
|
|
254
|
+
# --------------------------- code spans -----------------------------
|
|
255
|
+
|
|
256
|
+
# Find the closing backtick run for a code span by scanning the
|
|
257
|
+
# source bytes directly. CommonMark: backslash escapes do not
|
|
258
|
+
# apply inside a code span, so once we're past the opener every
|
|
259
|
+
# backtick run is a real candidate (token-level ESCAPED_CHAR is
|
|
260
|
+
# ignored). byteindex jumps over non-backtick byte stretches at
|
|
261
|
+
# C speed.
|
|
262
|
+
# @param opener_id [Integer] index of the opening CODE_DELIMITER token
#   (its int1 is the backtick run length)
# @return [Integer, nil] token index to resume at after the emitted code
#   span, or nil when no backtick run of the same length follows
#
# NOTE(review): the scan runs to the end of @source, not to the end of the
# token stream; this assumes @source holds only the inline run being
# built. Confirm against the caller.
def resolve_code_span(opener_id)
  wanted = @tokens.int1(opener_id)
  limit = @source_b.bytesize
  cursor = @tokens.end_byte(opener_id)

  while cursor < limit
    tick = @source_b.byteindex(BACKTICK_BYTE, cursor)
    return nil unless tick

    # Measure the whole backtick run starting at +tick+.
    cursor = tick + 1
    cursor += 1 until cursor >= limit || @source_b.getbyte(cursor) != 0x60
    next unless cursor - tick == wanted

    emit_code_span_bytes(opener_id, tick, cursor)
    return next_token_after(cursor, opener_id + 1)
  end

  nil
end
|
|
279
|
+
|
|
280
|
+
BACKTICK_BYTE = "`".b.freeze
|
|
281
|
+
|
|
282
|
+
# Appends a CODE_SPAN node. The node spans from the start of the opener
# token to +closer_end_byte+; its str1 is the normalized text between the
# end of the opener and +closer_start_byte+.
def emit_code_span_bytes(opener_id, closer_start_byte, closer_end_byte)
  inner_from = @tokens.end_byte(opener_id)
  inner = @source.byteslice(inner_from, closer_start_byte - inner_from).to_s

  span_id = add_arena_node(NodeType::CODE_SPAN,
                           @tokens.start_byte(opener_id), closer_end_byte,
                           str1: normalize_code_span(inner))
  @arena.append_child(@parent_id, span_id)
end
|
|
292
|
+
|
|
293
|
+
# Normalizes code span content: every "\n" becomes a space, and one
# leading plus one trailing space are removed when the text both starts
# and ends with a space and is not made of spaces only.
def normalize_code_span(text)
  flat = text.tr("\n", " ")
  padded = flat.length >= 2 && flat.start_with?(" ") && flat.end_with?(" ")
  return flat unless padded && flat.match?(/[^ ]/)

  flat[1..-2]
end
|
|
300
|
+
|
|
301
|
+
# --------------------------- brackets -------------------------------
|
|
302
|
+
|
|
303
|
+
# Records an opening `[` or `![`. A provisional TEXT node holding the
# literal bracket is appended, and an active Bracket entry remembering the
# current delimiter stack depth is pushed.
def push_bracket(token_id, image:)
  literal = image ? "![" : "["
  node_id = add_arena_node(NodeType::TEXT,
                           @tokens.start_byte(token_id), @tokens.end_byte(token_id),
                           str1: literal)
  @arena.append_child(@parent_id, node_id)
  @provisional_nodes[node_id] = true

  depth = @delimiter_stack.length
  @bracket_stack << Bracket.new(token_id, node_id, image, true, depth)
end
|
|
314
|
+
|
|
315
|
+
# Handles a `]` token.
#
# Only the TOP of the bracket stack is considered (CommonMark algorithm):
# an inactive opener is popped and the `]` becomes text rather than being
# jumped over to reach an outer `[` / `![`, which would resolve
# nested-image precedence the wrong way.
#
# @return [Integer, nil] token index to resume at when a footnote
#   reference, link or image was formed; nil when `]` was emitted as text
def resolve_rbracket(rbracket_token_id, search_from_id)
  top = @bracket_stack.last
  if top.nil? || !top.active
    # Popping an empty stack is a no-op, so both cases share this path.
    @bracket_stack.pop
    append_text(@tokens.start_byte(rbracket_token_id),
                @tokens.end_byte(rbracket_token_id), "]")
    return nil
  end

  top_index = @bracket_stack.length - 1
  close_end = @tokens.end_byte(rbracket_token_id)

  # Footnote references (`[^label]`) take precedence over link forms.
  if @footnotes && !top.image
    footnote = try_footnote_reference(top, rbracket_token_id)
    if footnote
      finalize_footnote(top, top_index, footnote, close_end)
      return next_token_after(close_end, search_from_id)
    end
  end

  link = @link_scanner.inline_link(close_end) ||
         try_reference_link(top, rbracket_token_id, close_end)
  if link
    finalize_link(top, top_index, link)
    return next_token_after(link[:end_byte], search_from_id)
  end

  @bracket_stack.delete_at(top_index)
  append_text(@tokens.start_byte(rbracket_token_id),
              @tokens.end_byte(rbracket_token_id), "]")
  nil
end
|
|
357
|
+
|
|
358
|
+
# Tries to resolve the bracket pair as a reference link.
#
# Forms handled, in order:
# * full      `[text][ref]` - looked up by `ref`
# * collapsed `[text][]`    - looked up by `text`
# * shortcut  `[text]`      - looked up by `text`
#
# When a `[` follows but no link label can be scanned there (for example
# `[foo][bar` with the second bracket unclosed), the pair is still tried as
# a shortcut reference. CommonMark only rules a shortcut out when it is
# followed by `[]` or by a link label.
#
# @param opener [Bracket] the matching opener entry
# @param rbracket_token_id [Integer] index of the `]` token
# @param start_byte [Integer] byte offset just past the `]`
# @return [Hash, nil] { end_byte:, destination:, title: } or nil when no
#   definition matches or the text label is too long
def try_reference_link(opener, rbracket_token_id, start_byte)
  label_start = @tokens.end_byte(opener.token_id)
  label_end = @tokens.start_byte(rbracket_token_id)
  text_label = @source.byteslice(label_start, label_end - label_start).to_s
  return nil if ReferenceDefinition.label_too_long?(text_label)

  if start_byte < @source.bytesize && @source.getbyte(start_byte) == 0x5B
    # Assumes reference_label yields a nil end offset when no label can be
    # scanned at start_byte.
    ref_label, after_byte = @link_scanner.reference_label(start_byte)
    if after_byte
      lookup = ref_label.empty? ? text_label : ref_label
      normalized = ReferenceDefinition.normalize_label(lookup)
      ref = @references[normalized]
      unless ref
        # Full reference `[text][ref]` with a missing definition is
        # usually a typo worth surfacing.
        report_diagnostic(
          severity: :warning,
          rule: :missing_reference,
          message: "Reference #{normalized.inspect} is not defined",
        )
        return nil
      end
      return {
        end_byte: after_byte,
        destination: @link_scanner.normalize_uri(ref[:destination].to_s),
        title: ref[:title],
      }
    end
    # No scannable label after `]`: fall through to the shortcut form.
  end

  ref = @references[ReferenceDefinition.normalize_label(text_label)]
  return nil unless ref

  {
    end_byte: start_byte,
    destination: @link_scanner.normalize_uri(ref[:destination].to_s),
    title: ref[:title],
  }
end
|
|
397
|
+
|
|
398
|
+
# Turns a matched bracket pair into a LINK or IMAGE node.
#
# The new node is inserted before the opener's provisional text node,
# every sibling after the opener (up to the parent's last child) is moved
# under it, and the opener node is detached. Delimiters pushed since the
# opener are resolved as emphasis inside the link. After a (non-image)
# link, all remaining non-image openers are deactivated.
def finalize_link(opener, opener_index, match)
  kind = opener.image ? NodeType::IMAGE : NodeType::LINK
  link_id = add_arena_node(
    kind, @tokens.start_byte(opener.token_id), match[:end_byte],
    str1: sanitize_destination(match[:destination]),
    str2: match[:title],
  )
  @arena.insert_before(@parent_id, opener.node_id, link_id)

  inner_first = @arena.raw_next_sibling_id(opener.node_id)
  inner_last = @arena.raw_last_child_id(@parent_id)
  unless inner_first == -1 || inner_last == -1 || inner_first == link_id
    @arena.reparent(link_id, inner_first, inner_last)
  end

  @provisional_nodes.delete(opener.node_id)
  @arena.detach(opener.node_id)

  # Only delimiters opened inside the brackets take part here.
  process_emphasis(@delimiter_stack.slice!(opener.delim_stack_size..) || [])

  @bracket_stack.delete_at(opener_index)
  return if opener.image

  @bracket_stack.each { |bracket| bracket.active = false unless bracket.image }
end
|
|
427
|
+
|
|
428
|
+
# A footnote reference is a non-image bracket whose inner text is
|
|
429
|
+
# `^label` (label non-empty, no whitespace or `]`). Returns
|
|
430
|
+
# { label:, number:, occurrence: } when the label has a registered
|
|
431
|
+
# definition, else nil (so the bracket falls back to link logic).
|
|
432
|
+
FOOTNOTE_REF_RE = /\A\^([^\]\s]+)\z/
|
|
433
|
+
|
|
434
|
+
# Checks whether the text between +opener+ and the `]` token matches
# FOOTNOTE_REF_RE and names a footnote known to @footnotes.
#
# @return [Hash, nil] { label:, number:, occurrence: }, or nil when the
#   text does not match or @footnotes.reference yields no number
def try_footnote_reference(opener, rbracket_token_id)
  from = @tokens.end_byte(opener.token_id)
  upto = @tokens.start_byte(rbracket_token_id)
  inner = @source.byteslice(from, upto - from).to_s

  captured = FOOTNOTE_REF_RE.match(inner)
  return nil unless captured

  label = ReferenceDefinition.normalize_label(captured[1])
  number, occurrence = @footnotes.reference(label)
  number ? { label: label, number: number, occurrence: occurrence } : nil
end
|
|
446
|
+
|
|
447
|
+
# Replaces the bracketed `^label` text with a FOOTNOTE_REFERENCE node
# (str1 = label, int1 = number, int2 = occurrence) spanning from the
# opener to +rbracket_end+.
def finalize_footnote(opener, opener_index, footnote, rbracket_end)
  fn_id = add_arena_node(
    NodeType::FOOTNOTE_REFERENCE, @tokens.start_byte(opener.token_id), rbracket_end,
    str1: footnote[:label], int1: footnote[:number], int2: footnote[:occurrence],
  )
  @arena.insert_before(@parent_id, opener.node_id, fn_id)

  # The provisional `[` node and every sibling after it (the `^label`
  # text) are dropped; the footnote reference stands in for all of them.
  node = opener.node_id
  until node == -1
    following = @arena.raw_next_sibling_id(node)
    @provisional_nodes.delete(node)
    @arena.detach(node)
    node = following
  end

  # Delimiters opened inside the literal label are discarded.
  @delimiter_stack.slice!(opener.delim_stack_size..)
  @bracket_stack.delete_at(opener_index)
end
|
|
469
|
+
|
|
470
|
+
# Finds where the linear pass should resume after a construct that ended
# at +byte_offset+, searching from token index +from_id+.
#
# Returns the first token starting at or after the offset. If a token
# straddles the offset (an HTML inline or autolink token overlapping the
# end of a resolved code span / link), the bytes past the offset are
# appended as plain text so they are not lost, and the index after that
# token is returned. Returns the token count when nothing qualifies.
def next_token_after(byte_offset, from_id)
  total = @tokens.length

  (from_id...total).each do |id|
    return id if @tokens.start_byte(id) >= byte_offset

    token_end = @tokens.end_byte(id)
    next unless token_end > byte_offset

    append_text(byte_offset, token_end, nil)
    return id + 1
  end

  total
end
|
|
491
|
+
|
|
492
|
+
# Applies the SAFE_SCHEMES allowlist to a link / image destination.
#
# Scheme detection tolerates the characters browsers drop before resolving
# a URL: leading C0 control / space characters, and TAB / LF / CR anywhere
# in the scheme. Without that, a destination such as "java\tscript:..."
# would show no scheme here and be passed through untouched.
# NOTE(review): LinkScanner#normalize_uri may already percent-encode such
# characters for some callers; this check does not rely on it.
#
# @param destination [String, nil]
# @return [String] "" for nil or for a scheme outside SAFE_SCHEMES (also
#   reported as an :unsafe_url warning); otherwise +destination+ unchanged
def sanitize_destination(destination)
  return "" if destination.nil?
  return destination if destination.start_with?("/", "#")

  raw_scheme = destination[/\A[\x00-\x20]*([a-zA-Z](?:[\t\n\r]*[a-zA-Z0-9+\-.])*)[\t\n\r]*:/, 1]
  return destination if raw_scheme.nil?

  scheme = raw_scheme.delete("\t\n\r").downcase
  return destination if SAFE_SCHEMES.include?(scheme)

  report_diagnostic(
    severity: :warning,
    rule: :unsafe_url,
    message: "Unsafe URL scheme #{scheme.inspect} blocked",
  )
  ""
end
|
|
507
|
+
|
|
508
|
+
# Appends a Diagnostic built from the given fields to the diagnostics
# sink. Returns nil without doing anything when no sink was supplied.
def report_diagnostic(severity:, rule:, message:, source_span: nil)
  sink = @diagnostics
  return unless sink

  entry = Diagnostic.new(
    severity: severity, rule: rule, message: message, source_span: source_span,
  )
  sink << entry
end
|
|
515
|
+
|
|
516
|
+
# --------------------------- delim runs / emphasis ------------------
|
|
517
|
+
|
|
518
|
+
# Records a delimiter run. The token supplies the delimiter byte (int1),
# the run length (int2) and flag bits (int3: 0b10 = can open,
# 0b01 = can close). A provisional TEXT node holding the literal run is
# appended and a matching Delimiter entry is pushed for process_emphasis.
def push_delim_run(token_id)
  char = Inline::BYTE_CHR[@tokens.int1(token_id)]
  count = @tokens.int2(token_id)
  flags = @tokens.int3(token_id)

  node_id = add_arena_node(NodeType::TEXT,
                           @tokens.start_byte(token_id), @tokens.end_byte(token_id),
                           str1: char * count)
  @arena.append_child(@parent_id, node_id)
  @provisional_nodes[node_id] = true

  opens = !(flags & 0b10).zero?
  closes = !(flags & 0b01).zero?
  @delimiter_stack << Delimiter.new(node_id, char, count, opens, closes)
end
|
|
539
|
+
|
|
540
|
+
# Phase 2: pairs the delimiter runs on +stack+ into EMPHASIS / STRONG /
# STRIKETHROUGH nodes and rewrites the arena accordingly.
#
# Each entry that can close is matched against the nearest earlier entry
# that can open with the same character, subject to the "multiple of 3"
# rule. A match wraps the nodes between the two delimiter text nodes in a
# new node and consumes one or two characters from each run. `~` runs only
# pair at strength 2.
#
# @param stack [Array<Delimiter>] delimiter entries in source order; it is
#   emptied before returning
# @return [Array] the (now empty) stack
def process_emphasis(stack)
  # NB: the CommonMark spec describes an `openers_bottom`
  # optimization keyed by closer character / length / flanking
  # flags. Implementing that correctly is subtle (a single
  # per-character bottom blocks valid matches like
  # `*foo**bar**baz*`), so the implementation here just walks
  # back to the start of the stack for every closer. This is
  # O(stack^2) in the worst case but stacks are tiny in practice.
  closer_idx = 0

  while closer_idx < stack.length
    closer = stack[closer_idx]
    unless closer.can_close
      closer_idx += 1
      next
    end

    opener_idx = closer_idx - 1
    found = false
    while opener_idx >= 0
      opener = stack[opener_idx]
      if opener.can_open && opener.char == closer.char
        # "Multiple of 3" rule: when either run can both open and close,
        # the pair is rejected if the lengths sum to a multiple of 3 unless
        # both lengths are themselves multiples of 3.
        skip = false
        if (opener.can_close || closer.can_open) &&
           ((opener.count + closer.count) % 3).zero? &&
           !((opener.count % 3).zero? && (closer.count % 3).zero?)
          skip = true
        end
        unless skip
          found = true
          break
        end
      end
      opener_idx -= 1
    end

    unless found
      if closer.can_open
        # Still usable as an opener for a later closer: keep it and move on.
        closer_idx += 1
      else
        # Dead delimiter: drop it from the stack. The entry that followed
        # it now sits at closer_idx, so the cursor must NOT advance here;
        # advancing would skip that entry and it would never be tried as a
        # closer (e.g. the final `*` in `*foo_ bar*`).
        @provisional_nodes.delete(closer.node_id)
        stack.delete_at(closer_idx)
      end
      next
    end

    opener = stack[opener_idx]
    strength = [opener.count, closer.count].min >= 2 ? 2 : 1
    if closer.char == "~"
      # GFM strikethrough only forms on `~~` runs. A single `~`
      # leaves the delimiter as text; advance the cursor so future
      # `~~` pairs can still match.
      if strength < 2
        closer_idx += 1
        next
      end
      kind = NodeType::STRIKETHROUGH
    else
      kind = strength == 2 ? NodeType::STRONG : NodeType::EMPHASIS
    end

    # CommonMark spec: any delimiters strictly between this opener and
    # closer can't open or close anything in this scope, so drop them
    # from the stack before we rebuild the tree. Their arena nodes
    # stay where they are (they'll be reparented into the new emphasis
    # alongside the surrounding content), but they must no longer be
    # candidates for future iterations. Without this, the next
    # iteration would try to pair stranded delimiters that have
    # already been moved into a different parent, which corrupts the
    # sibling chain (Arena#reparent walks into @parent[-1]).
    if closer_idx > opener_idx + 1
      removed = stack.slice!((opener_idx + 1)...closer_idx)
      removed.each { |e| @provisional_nodes.delete(e.node_id) }
      closer_idx = opener_idx + 1
      closer = stack[closer_idx]
    end

    opener_node = opener.node_id
    closer_node = closer.node_id

    # The new node spans the innermost `strength` characters of each run.
    if @track_source
      opener_match_start = @arena.source_end(opener_node) - strength
      closer_match_end = @arena.source_start(closer_node) + strength
    else
      opener_match_start = -1
      closer_match_end = 0
    end
    emphasis_id = add_arena_node(kind, opener_match_start, closer_match_end)

    # Move everything between the two delimiter nodes under the new node,
    # then place the new node right before the closer.
    first_inside = @arena.raw_next_sibling_id(opener_node)
    last_inside = @arena.raw_prev_sibling_id(closer_node)
    if first_inside != -1 && last_inside != -1 &&
       first_inside != closer_node && last_inside != opener_node
      @arena.reparent(emphasis_id, first_inside, last_inside)
    end

    parent_id = @arena.raw_parent_id(opener_node)
    @arena.insert_before(parent_id, closer_node, emphasis_id)

    if opener.count == strength
      # Opener fully consumed: remove its node and stack entry.
      @provisional_nodes.delete(opener_node)
      @arena.detach(opener_node)
      stack.delete_at(opener_idx)
      closer_idx -= 1
    else
      # Opener partly consumed: trim the used characters off its end.
      opener.count -= strength
      str = @arena.str1(opener_node)
      @arena.update_str1(opener_node, str[0...-strength])
      if @track_source
        new_end = @arena.source_end(opener_node) - strength
        @arena.update_span(opener_node, @arena.source_start(opener_node), new_end)
      end
    end

    if closer.count == strength
      # Closer fully consumed; the next entry slides into closer_idx.
      @provisional_nodes.delete(closer_node)
      @arena.detach(closer_node)
      stack.delete_at(closer_idx)
    else
      # Closer partly consumed: trim the used characters off its start and
      # revisit it on the next iteration.
      closer.count -= strength
      str = @arena.str1(closer_node)
      @arena.update_str1(closer_node, str[strength..])
      if @track_source
        new_start = @arena.source_start(closer_node) + strength
        new_end = @arena.source_end(closer_node)
        @arena.update_span(closer_node, new_start, new_end)
      end
    end
  end

  # Whatever is left stays literal text and is no longer provisional.
  stack.each { |e| @provisional_nodes.delete(e.node_id) }
  stack.clear
end
|
|
672
|
+
end
|
|
673
|
+
end
|
|
674
|
+
end
|