red_quilt 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +109 -0
- data/.rubocop_todo.yml +7 -0
- data/CHANGELOG.md +57 -0
- data/README.md +284 -0
- data/Rakefile +8 -0
- data/ast-spec.md +1227 -0
- data/docs/architecture.md +81 -0
- data/docs/arena-usage.md +363 -0
- data/docs/commonmark-conformance.md +241 -0
- data/exe/redquilt +7 -0
- data/lib/red_quilt/arena.rb +366 -0
- data/lib/red_quilt/block_parser.rb +724 -0
- data/lib/red_quilt/blockquote.rb +151 -0
- data/lib/red_quilt/cli.rb +182 -0
- data/lib/red_quilt/diagnostic.rb +47 -0
- data/lib/red_quilt/document.rb +126 -0
- data/lib/red_quilt/extended_autolink_pass.rb +185 -0
- data/lib/red_quilt/footnote_definition.rb +147 -0
- data/lib/red_quilt/footnote_pass.rb +39 -0
- data/lib/red_quilt/footnote_registry.rb +68 -0
- data/lib/red_quilt/indentation.rb +73 -0
- data/lib/red_quilt/inline/builder.rb +674 -0
- data/lib/red_quilt/inline/flanking.rb +120 -0
- data/lib/red_quilt/inline/html_entities.rb +2180 -0
- data/lib/red_quilt/inline/lexer.rb +280 -0
- data/lib/red_quilt/inline/link_scanner.rb +315 -0
- data/lib/red_quilt/inline/token_kind.rb +39 -0
- data/lib/red_quilt/inline/tokens.rb +73 -0
- data/lib/red_quilt/inline.rb +34 -0
- data/lib/red_quilt/inline_pass.rb +53 -0
- data/lib/red_quilt/line.rb +14 -0
- data/lib/red_quilt/lint_pass.rb +71 -0
- data/lib/red_quilt/list.rb +317 -0
- data/lib/red_quilt/node_ref.rb +114 -0
- data/lib/red_quilt/node_type.rb +66 -0
- data/lib/red_quilt/plain_text.rb +46 -0
- data/lib/red_quilt/reference_definition.rb +309 -0
- data/lib/red_quilt/renderer/html.rb +279 -0
- data/lib/red_quilt/renderer/mdast.rb +152 -0
- data/lib/red_quilt/source_map.rb +29 -0
- data/lib/red_quilt/source_span.rb +26 -0
- data/lib/red_quilt/theme.rb +28 -0
- data/lib/red_quilt/themes/default.css +87 -0
- data/lib/red_quilt/version.rb +5 -0
- data/lib/red_quilt.rb +86 -0
- data/mise.toml +2 -0
- data/sig/red_quilt.rbs +45 -0
- metadata +91 -0
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "strscan"
|
|
4
|
+
|
|
5
|
+
require_relative "html_entities"
|
|
6
|
+
|
|
7
|
+
module RedQuilt
|
|
8
|
+
module Inline
|
|
9
|
+
# Scans a byte range of the document source and emits inline tokens
|
|
10
|
+
# into a caller-owned Tokens storage.
|
|
11
|
+
#
|
|
12
|
+
# The lexer never copies the source string; all positions are absolute
|
|
13
|
+
# byte offsets into @source. The caller is responsible for clearing the
|
|
14
|
+
# Tokens storage between invocations if it is being reused.
|
|
15
|
+
class Lexer
|
|
16
|
+
# Bytes whose appearance ends a TEXT run. Anything not in this set is
|
|
17
|
+
# plain text content. Newline is included so LINE_ENDING gets its own
|
|
18
|
+
# token.
|
|
19
|
+
SPECIAL_BYTES = begin
|
|
20
|
+
a = Array.new(256, false)
|
|
21
|
+
# *, _, `, [, ], !, <, &, \, \n, ~ (GFM strikethrough)
|
|
22
|
+
[0x2A, 0x5F, 0x60, 0x5B, 0x5D, 0x21, 0x3C, 0x26, 0x5C, 0x0A, 0x7E].each { |b| a[b] = true }
|
|
23
|
+
a.freeze
|
|
24
|
+
end
|
|
25
|
+
# Same set as SPECIAL_BYTES, for String#byteindex to jump over long
|
|
26
|
+
# plain-text runs at C speed.
|
|
27
|
+
SPECIAL_BYTE_RE = /[*_`\[\]!<&\\\n~]/
|
|
28
|
+
|
|
29
|
+
# Anchored regexes for StringScanner#scan (still used by
|
|
30
|
+
# scan_angle / scan_amp). StringScanner anchors at the current pos,
|
|
31
|
+
# so no `\G` is needed.
|
|
32
|
+
#
|
|
33
|
+
# URI autolink rejects every ASCII control char (U+0000-U+001F, U+007F)
|
|
34
|
+
# plus space (U+0020); CommonMark 6.5 forbids ASCII control characters,
|
|
35
|
+
# space, <, or >.
|
|
36
|
+
URI_AUTOLINK_RE = /<([A-Za-z][A-Za-z0-9+.-]{1,31}:[^<>\u0000-\u0020\u007F]*)>/
|
|
37
|
+
EMAIL_AUTOLINK_RE = /<([a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)>/
|
|
38
|
+
# CommonMark spec 6.6 "Raw HTML": six forms — open tag, closing tag,
|
|
39
|
+
# HTML comment, processing instruction, declaration, CDATA section.
|
|
40
|
+
# Attribute values are allowed to span lines.
|
|
41
|
+
# HTML tag separators are restricted to space/tab/CR/LF per spec --
|
|
42
|
+
# \s would also match form feed (U+000C) and vertical tab (U+000B),
|
|
43
|
+
# which CommonMark disallows.
|
|
44
|
+
HTML_OPEN_TAG_RE = %r{<[A-Za-z][A-Za-z0-9-]*(?:[ \t\r\n]+[A-Za-z_:][A-Za-z0-9_.:-]*(?:[ \t\r\n]*=[ \t\r\n]*(?:"[^"]*"|'[^']*'|[^ \t\r\n"'=<>`]+))?)*[ \t\r\n]*/?>}
|
|
45
|
+
HTML_CLOSING_TAG_RE = %r{</[A-Za-z][A-Za-z0-9-]*[ \t\r\n]*>}
|
|
46
|
+
# Comment: `<!-->`, `<!--->`, or `<!-- text -->` where text doesn't
|
|
47
|
+
# start with `>` or `->`, end with `-`, or contain `--`.
|
|
48
|
+
HTML_COMMENT_RE = %r{<!-->|<!--->|<!--(?!>)(?!->)[\s\S]*?(?<!-)-->}
|
|
49
|
+
HTML_PROC_INST_RE = %r{<\?[\s\S]*?\?>}
|
|
50
|
+
HTML_DECLARATION_RE = %r{<![A-Za-z][^>]*>}
|
|
51
|
+
HTML_CDATA_RE = %r{<!\[CDATA\[[\s\S]*?\]\]>}
|
|
52
|
+
# Entity regex and decoder live on the enclosing Inline module so
|
|
53
|
+
# the same digit-count caps and U+FFFD replacement apply across
|
|
54
|
+
# the lexer, the inline builder, and the reference-definition
|
|
55
|
+
# parser. See lib/red_quilt/inline/html_entities.rb.
|
|
56
|
+
|
|
57
|
+
# Builds a lexer over +source+. Three views of the same text are kept:
# the string itself, a binary-tagged copy, and a StringScanner.
def initialize(source)
  # Scanner used by the regex-driven helpers.
  @ss = StringScanner.new(source)
  # Binary-tagged copy for the String#byteindex fast path: byteindex on a
  # UTF-8 string raises when the offset lands inside a multibyte sequence,
  # whereas a binary string treats every byte as its own character.
  @source_b = source.b
  @source = source
end
|
|
65
|
+
|
|
66
|
+
# Scans @source[start_byte...end_byte] and emits tokens.
|
|
67
|
+
# Returns the tokens object that was passed in.
|
|
68
|
+
def lex_into(tokens, start_byte, end_byte)
  # Remember the window, park the scanner on its first byte, then run the
  # main loop; the same tokens object is handed back to the caller.
  @start, @end = start_byte, end_byte
  @ss.pos = start_byte
  tokens.tap { |sink| scan(sink) }
end
|
|
75
|
+
|
|
76
|
+
private
|
|
77
|
+
|
|
78
|
+
# Main dispatch loop over the bytes between the scanner position and @end.
#
# A local cursor drives the scan. The StringScanner position is written
# only right before scan_angle / scan_amp run (they match regexes through
# the scanner), read back afterwards, and stored once more when the loop
# finishes. Every other helper receives the cursor and returns the new one.
def scan(tokens)
  cursor = @ss.pos
  limit = @end
  until cursor >= limit
    case @source.getbyte(cursor)
    when 0x5B # [
      tokens.emit(TokenKind::LBRACKET, start_byte: cursor, end_byte: cursor + 1)
      cursor += 1
    when 0x5D # ]
      tokens.emit(TokenKind::RBRACKET, start_byte: cursor, end_byte: cursor + 1)
      cursor += 1
    when 0x2A # *
      cursor = scan_delim_run(tokens, cursor, limit, "*", 0x2A)
    when 0x5F # _
      cursor = scan_delim_run(tokens, cursor, limit, "_", 0x5F)
    when 0x7E # ~ (GFM strikethrough)
      cursor = scan_delim_run(tokens, cursor, limit, "~", 0x7E)
    when 0x60 # `
      cursor = scan_code_delimiter(tokens, cursor, limit)
    when 0x5C # backslash
      cursor = scan_backslash(tokens, cursor, limit)
    when 0x0A # \n
      cursor = scan_line_ending(tokens, cursor)
    when 0x21 # !
      cursor = scan_bang(tokens, cursor, limit)
    when 0x3C # <
      @ss.pos = cursor
      scan_angle(tokens)
      cursor = @ss.pos
    when 0x26 # &
      @ss.pos = cursor
      scan_amp(tokens)
      cursor = @ss.pos
    else
      # Plain text run. The current byte is always consumed so the loop
      # makes progress; byteindex on the binary view then jumps to the
      # next special byte, clamped to the end of the window.
      run_start = cursor
      cursor += 1
      if cursor < limit
        hit = @source_b.byteindex(SPECIAL_BYTE_RE, cursor)
        cursor = (hit && hit < limit) ? hit : limit
      end
      tokens.emit(TokenKind::TEXT, start_byte: run_start, end_byte: cursor)
    end
  end
  @ss.pos = cursor
end
|
|
132
|
+
|
|
133
|
+
# Emits a LINE_ENDING token for the newline at +pos+ and returns the byte
# offset just past it.
#
# int1 carries the number of ASCII spaces immediately before the newline;
# the builder uses this to decide softbreak vs hardbreak (>= 2 spaces).
# The look-behind stops at @start (the first byte of the window being
# lexed), so spaces that sit before the window are never counted. This
# matches scan_delim_run, whose look-behind is bounded by @start too.
def scan_line_ending(tokens, pos)
  # Fall back to the start of the source if no window start was recorded.
  floor = @start || 0
  trailing_spaces = 0
  i = pos - 1
  while i >= floor && @source.getbyte(i) == 0x20
    trailing_spaces += 1
    i -= 1
  end
  new_pos = pos + 1
  tokens.emit(TokenKind::LINE_ENDING,
              start_byte: pos, end_byte: new_pos,
              int1: trailing_spaces)
  new_pos
end
|
|
148
|
+
|
|
149
|
+
# Handles the backslash at +pos+ and returns the offset after whatever was
# consumed. Three outcomes:
#   * backslash + newline        -> LINE_ENDING (int2 = 1 marks the
#                                   backslash hardbreak form)
#   * backslash + ASCII punct    -> ESCAPED_CHAR carrying the punctuation
#   * anything else / end of run -> the backslash alone as TEXT
def scan_backslash(tokens, pos, end_pos)
  after = pos + 1
  in_window = after < end_pos
  follower = in_window ? @source.getbyte(after) : nil

  if in_window && follower == 0x0A
    tokens.emit(TokenKind::LINE_ENDING,
                start_byte: pos, end_byte: after + 1,
                int1: 0, int2: 1)
    after + 1
  elsif in_window && Inline.ascii_punct_byte?(follower)
    tokens.emit(TokenKind::ESCAPED_CHAR,
                start_byte: pos, end_byte: after + 1,
                str1: follower.chr)
    after + 1
  else
    tokens.emit(TokenKind::TEXT, start_byte: pos, end_byte: after)
    after
  end
end
|
|
173
|
+
|
|
174
|
+
# Consumes the run of backticks starting at +pos+ (never past +end_pos+),
# emits one CODE_DELIMITER token whose int1 is the run length, and returns
# the offset after the run. A plain byte loop is used because backtick
# runs are usually only 1-3 bytes long.
def scan_code_delimiter(tokens, pos, end_pos)
  stop = pos
  stop += 1 while stop < end_pos && @source.getbyte(stop) == 0x60
  tokens.emit(TokenKind::CODE_DELIMITER,
              start_byte: pos, end_byte: stop,
              int1: stop - pos)
  stop
end
|
|
186
|
+
|
|
187
|
+
# Consumes a run of identical delimiter bytes (+byte+, spelled +char+)
# starting at +pos+ and returns the offset after the run.
#
# Flanking decides from the characters on either side whether the run can
# open and/or close. A run that can do neither is emitted as plain TEXT so
# it can coalesce with neighbouring text; otherwise a DELIM_RUN is emitted
# with int1 = delimiter byte, int2 = run length, and int3 = flag bits
# (0b10 can open, 0b01 can close).
def scan_delim_run(tokens, pos, end_pos, char, byte)
  run_end = pos
  run_end += 1 while run_end < end_pos && @source.getbyte(run_end) == byte

  before = Flanking.char_before(@source, pos, @start)
  after = Flanking.char_at(@source, run_end, end_pos)
  opens, closes = Flanking.can_open_close(char, before, after)

  unless opens || closes
    tokens.emit(TokenKind::TEXT, start_byte: pos, end_byte: run_end)
    return run_end
  end

  flags = 0
  flags |= 0b10 if opens
  flags |= 0b01 if closes
  tokens.emit(TokenKind::DELIM_RUN,
              start_byte: pos, end_byte: run_end,
              int1: byte, int2: run_end - pos, int3: flags)
  run_end
end
|
|
209
|
+
|
|
210
|
+
# Handles `!` at +pos+: followed by `[` inside the window it becomes a
# two-byte BANG_LBRACKET token, otherwise a one-byte TEXT token. Returns
# the offset after the emitted token.
def scan_bang(tokens, pos, end_pos)
  image_opener = pos + 1 < end_pos && @source.getbyte(pos + 1) == 0x5B # [
  kind = image_opener ? TokenKind::BANG_LBRACKET : TokenKind::TEXT
  stop = pos + (image_opener ? 2 : 1)
  tokens.emit(kind, start_byte: pos, end_byte: stop)
  stop
end
|
|
219
|
+
|
|
220
|
+
# Handles `<` at the scanner position. Tried in order: URI autolink, email
# autolink, then the six raw-HTML forms. The first match becomes one token
# (autolinks carry capture group 1 in str1, raw HTML carries the matched
# text). If nothing matches, the `<` alone is emitted as TEXT. The scanner
# is left positioned after the emitted token.
def scan_angle(tokens)
  origin = @ss.pos

  if scan_within_end(URI_AUTOLINK_RE)
    kind = TokenKind::AUTOLINK_URI
    payload = @ss[1]
  elsif scan_within_end(EMAIL_AUTOLINK_RE)
    kind = TokenKind::AUTOLINK_EMAIL
    payload = @ss[1]
  elsif (html = scan_within_end(HTML_OPEN_TAG_RE) ||
                scan_within_end(HTML_CLOSING_TAG_RE) ||
                scan_within_end(HTML_COMMENT_RE) ||
                scan_within_end(HTML_PROC_INST_RE) ||
                scan_within_end(HTML_DECLARATION_RE) ||
                scan_within_end(HTML_CDATA_RE))
    kind = TokenKind::HTML_INLINE
    payload = html
  else
    @ss.pos += 1
    return tokens.emit(TokenKind::TEXT, start_byte: origin, end_byte: @ss.pos)
  end

  tokens.emit(kind, start_byte: origin, end_byte: @ss.pos, str1: payload)
end
|
|
244
|
+
|
|
245
|
+
# Handles `&` at the scanner position: a match of Inline::ENTITY_RE becomes
# an ENTITY token whose str1 is the decoded text; otherwise the `&` alone
# is emitted as TEXT. The scanner ends up after the emitted token.
def scan_amp(tokens)
  origin = @ss.pos
  raw = scan_within_end(Inline::ENTITY_RE)

  unless raw
    @ss.pos += 1
    return tokens.emit(TokenKind::TEXT, start_byte: origin, end_byte: @ss.pos)
  end

  tokens.emit(TokenKind::ENTITY,
              start_byte: origin, end_byte: @ss.pos,
              str1: decode_entity(raw))
end
|
|
256
|
+
|
|
257
|
+
# StringScanner#scan but constrained to @end. Returns the matched
|
|
258
|
+
# string on success (rewinding when the match extends past @end),
|
|
259
|
+
# nil otherwise.
|
|
260
|
+
def scan_within_end(regex)
  # Try the match at the current position; a match that runs past @end is
  # undone by restoring the saved position and is reported as no match.
  rewind_to = @ss.pos
  hit = @ss.scan(regex)
  if hit && @ss.pos > @end
    @ss.pos = rewind_to
    hit = nil
  end
  hit
end
|
|
271
|
+
|
|
272
|
+
# Decodes a single entity reference using the shared Inline
|
|
273
|
+
# decoder, which enforces the spec's digit limits and U+FFFD
|
|
274
|
+
# replacement for U+0000 / surrogates / out-of-range codepoints.
|
|
275
|
+
# Thin instance-level wrapper that forwards to Inline.decode_entity.
def decode_entity(raw) = Inline.decode_entity(raw)
|
|
278
|
+
end
|
|
279
|
+
end
|
|
280
|
+
end
|
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RedQuilt
|
|
4
|
+
module Inline
|
|
5
|
+
# Pure byte-level scanner for link / image tails: inline link bodies
|
|
6
|
+
# `(dest "title")`, bracketed reference labels `[label]`, and link
|
|
7
|
+
# destination URI normalization. Operates only on the document source
|
|
8
|
+
# string -- no arena, token stream, or parser state -- so it can be
|
|
9
|
+
# exercised in isolation. Inline::Builder owns one instance and feeds
|
|
10
|
+
# it absolute byte offsets.
|
|
11
|
+
class LinkScanner
|
|
12
|
+
NIL_PAIR = [nil, nil].freeze
|
|
13
|
+
# Bytes left verbatim by normalize_uri: ASCII alphanumerics plus the
|
|
14
|
+
# URL sub-delims / reserved chars that the spec keeps unencoded.
|
|
15
|
+
# Everything else is percent-encoded.
|
|
16
|
+
URL_SAFE_BYTE = begin
|
|
17
|
+
a = Array.new(256, false)
|
|
18
|
+
(0x30..0x39).each { |b| a[b] = true } # 0-9
|
|
19
|
+
(0x41..0x5A).each { |b| a[b] = true } # A-Z
|
|
20
|
+
(0x61..0x7A).each { |b| a[b] = true } # a-z
|
|
21
|
+
"-._~:/?#@!$&'()*+,;=".each_byte { |b| a[b] = true }
|
|
22
|
+
a.freeze
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
# Keeps a reference to the document source string; the scanning methods of
# this class read bytes from it at the offsets they are given.
def initialize(source)
|
|
26
|
+
@source = source
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
# Parses an inline link body `(dest "title")` starting at the byte
|
|
30
|
+
# right after the link's closing `]`. Returns a hash with
|
|
31
|
+
# `:end_byte`, `:destination`, `:title` on success, or nil if the
|
|
32
|
+
# bytes don't form a valid inline link tail.
|
|
33
|
+
def inline_link(start_byte)
  # The tail must open with `(`.
  return nil unless byte_at(start_byte) == 0x28

  cursor = skip_link_whitespace(start_byte + 1)
  return nil if cursor.nil?

  # Optional destination: present unless the next byte closes the tail or
  # is whitespace / a newline.
  dest_text = nil
  lead = byte_at(cursor)
  if lead && lead != 0x29 && !link_tail_whitespace_byte?(lead) && lead != 0x0A
    parsed = parse_link_destination(cursor)
    return nil unless parsed

    dest_text, cursor = parsed
  end

  gap_end = skip_link_whitespace(cursor)
  return nil if gap_end.nil?

  # Optional title: only considered when whitespace was actually skipped
  # and the next byte is one of the title openers `"`, `'` or `(`.
  title_text = nil
  opener = gap_end > cursor ? byte_at(gap_end) : nil
  if opener == 0x22 || opener == 0x27 || opener == 0x28
    parsed = parse_link_title(gap_end)
    return nil unless parsed

    title_text, cursor = parsed
    cursor = skip_link_whitespace(cursor)
    return nil if cursor.nil?
  else
    cursor = gap_end
  end

  # The tail must close with `)`.
  return nil unless byte_at(cursor) == 0x29

  {
    end_byte: cursor + 1,
    destination: dest_text ? normalize_uri(dest_text) : "",
    title: title_text ? decode_link_entities(title_text) : nil
  }
end
|
|
75
|
+
|
|
76
|
+
# Reads a bracketed reference label `[label]` starting at start_byte
|
|
77
|
+
# (which must point at the `[`). Returns [label, after_byte] or
|
|
78
|
+
# NIL_PAIR when the label is malformed or over-long.
|
|
79
|
+
def reference_label(start_byte)
  return NIL_PAIR unless @source.getbyte(start_byte) == 0x5B

  body_start = start_byte + 1
  cursor = body_start
  total = @source.bytesize
  while cursor < total
    case @source.getbyte(cursor)
    when 0x5D
      # Closing bracket: the label is everything between the brackets.
      text = @source.byteslice(body_start, cursor - body_start).to_s
      return ReferenceDefinition.label_too_long?(text) ? NIL_PAIR : [text, cursor + 1]
    when 0x5B
      # An unescaped `[` inside a reference label voids the form.
      return NIL_PAIR
    when 0x5C
      # Backslash: also step over the byte it precedes, when there is one.
      cursor += 1 if cursor + 1 < total
    end
    cursor += 1
  end
  NIL_PAIR
end
|
|
101
|
+
|
|
102
|
+
# Percent-encodes bytes not in the URL-safe set, decodes HTML
|
|
103
|
+
# entities first, and preserves (uppercasing) existing `%XX`.
|
|
104
|
+
def normalize_uri(raw)
  # Entities are decoded first; the rest works on raw bytes.
  octets = decode_link_entities(raw).b
  total = octets.bytesize
  out = +""
  idx = 0
  while idx < total
    byte = octets.getbyte(idx)
    hi = octets.getbyte(idx + 1)
    lo = octets.getbyte(idx + 2)
    if byte == 0x25 && idx + 2 < total && hex_byte?(hi) && hex_byte?(lo)
      # Existing %XX escape: kept, with the hex digits uppercased.
      out << "%" << hi.chr.upcase << lo.chr.upcase
      idx += 3
    else
      if URL_SAFE_BYTE[byte]
        # URL-safe bytes are all ASCII, so appending the integer codepoint
        # adds the same single character without allocating a string.
        out << byte
      else
        out << format("%%%02X", byte)
      end
      idx += 1
    end
  end
  out
end
|
|
130
|
+
|
|
131
|
+
private
|
|
132
|
+
|
|
133
|
+
# Consume ASCII whitespace starting at start_byte. Returns the
|
|
134
|
+
# position of the first non-whitespace byte, or nil if a blank line
|
|
135
|
+
# was crossed (link inner whitespace may span at most one newline).
|
|
136
|
+
def skip_link_whitespace(start_byte)
  line_endings = 0
  cursor = start_byte
  limit = @source.bytesize
  while cursor < limit
    case @source.getbyte(cursor)
    when 0x0A
      # A second newline means the whitespace run is not allowed.
      line_endings += 1
      return nil if line_endings > 1
    when 0x20, 0x09
      # space / tab: keep going
    else
      break
    end
    cursor += 1
  end
  cursor
end
|
|
151
|
+
|
|
152
|
+
# Dispatches on the first byte: `<` selects the angle-bracket form, any
# other byte the raw form. Returns whatever the chosen parser returns.
def parse_link_destination(start_byte)
  return parse_angle_bracket_destination(start_byte) if byte_at(start_byte) == 0x3C

  parse_raw_destination(start_byte)
end
|
|
159
|
+
|
|
160
|
+
# `<...>` form. Returns [string_with_backslash_escapes_applied, end_pos]
|
|
161
|
+
# or nil. Inside angles, `\` followed by ASCII punctuation escapes that
|
|
162
|
+
# punctuation; unescaped `<`, `>` or newlines bail the parse.
|
|
163
|
+
def parse_angle_bracket_destination(start_byte)
  collected = String.new
  cursor = start_byte + 1
  limit = @source.bytesize
  while cursor < limit
    byte = @source.getbyte(cursor)
    # `>` ends the destination; a bare `<` or a newline aborts it.
    return [collected, cursor + 1] if byte == 0x3E
    return nil if byte == 0x3C || byte == 0x0A

    if byte == 0x5C
      # Backslash before ASCII punctuation: keep only the punctuation.
      escaped = @source.getbyte(cursor + 1)
      if escaped && Inline.ascii_punct_byte?(escaped)
        byte = escaped
        cursor += 1
      end
    end
    collected << byte
    cursor += 1
  end
  nil
end
|
|
188
|
+
|
|
189
|
+
# Raw destination: characters until ASCII whitespace, an ASCII
|
|
190
|
+
# control char, or an unbalanced `)`. Parens are allowed if balanced
|
|
191
|
+
# or backslash-escaped.
|
|
192
|
+
def parse_raw_destination(start_byte)
  collected = String.new
  open_parens = 0
  cursor = start_byte
  limit = @source.bytesize
  while cursor < limit
    byte = @source.getbyte(cursor)
    case byte
    when 0x5C
      # Backslash before ASCII punctuation: keep only the punctuation
      # (this is how an escaped paren avoids the balance count).
      escaped = @source.getbyte(cursor + 1)
      if escaped && Inline.ascii_punct_byte?(escaped)
        byte = escaped
        cursor += 1
      end
    when 0x00..0x20, 0x7F
      # Space, tab, or any ASCII control character ends the destination.
      break
    when 0x28
      open_parens += 1
    when 0x29
      # A `)` with nothing open belongs to the enclosing link tail.
      break if open_parens.zero?

      open_parens -= 1
    end
    collected << byte
    cursor += 1
  end

  # Reject an empty destination or one with an unclosed `(`.
  return nil if cursor == start_byte || open_parens != 0

  [collected, cursor]
end
|
|
229
|
+
|
|
230
|
+
# Parses a title delimited by `"`, `'`, or `(...)`. Returns
|
|
231
|
+
# [unescaped_string, end_pos] or nil. Backslash escapes apply for
|
|
232
|
+
# ASCII punctuation; a blank line inside a title voids the match.
|
|
233
|
+
def parse_link_title(start_byte)
  opening = @source.getbyte(start_byte)
  closing =
    if opening == 0x22 || opening == 0x27
      opening
    elsif opening == 0x28
      0x29
    end
  return nil unless closing

  paren_form = opening == 0x28
  collected = String.new
  cursor = start_byte + 1
  limit = @source.bytesize
  while cursor < limit
    byte = @source.getbyte(cursor)
    if byte == 0x5C
      # Backslash before ASCII punctuation: keep only the punctuation.
      escaped = @source.getbyte(cursor + 1)
      if escaped && Inline.ascii_punct_byte?(escaped)
        byte = escaped
        cursor += 1
      end
    elsif byte == 0x0A
      # A newline followed by only spaces/tabs and another newline is a
      # blank line, which is forbidden inside a title.
      probe = cursor + 1
      probe += 1 while probe < limit && (@source.getbyte(probe) == 0x20 || @source.getbyte(probe) == 0x09)
      return nil if probe < limit && @source.getbyte(probe) == 0x0A
    elsif paren_form && byte == 0x28
      # Inside `(...)` titles, an unescaped `(` invalidates the match.
      return nil
    elsif byte == closing
      return [collected, cursor + 1]
    end
    collected << byte
    cursor += 1
  end
  nil
end
|
|
284
|
+
|
|
285
|
+
# Replaces every Inline::ENTITY_RE match in +raw+ with its decoded form and
# returns the resulting string (or +raw+ itself when nothing needs doing).
#
# The byte-level parsers in this class build their results in binary-tagged
# buffers. A binary string that holds non-ASCII bytes cannot be combined
# with a decoded entity that is itself non-ASCII, so such input is first
# re-tagged with the source's encoding. Re-tagging happens only when the
# bytes are valid in that encoding; ASCII-only input is left untouched.
def decode_link_entities(raw)
  if raw.encoding == Encoding::BINARY && !raw.ascii_only?
    retagged = raw.dup.force_encoding(@source.encoding)
    raw = retagged if retagged.valid_encoding?
  end

  # An entity reference always contains `&`; skip the regex scan and the
  # new-string allocation when there's nothing to decode.
  return raw unless raw.include?("&")

  raw.gsub(Inline::ENTITY_RE) { |m| Inline.decode_entity(m) }
end
|
|
292
|
+
|
|
293
|
+
# Bounds-checked byte read: the byte at +pos+, or nil when +pos+ is
# negative or at/after the end of the source.
def byte_at(pos)
  @source.getbyte(pos) if pos >= 0 && pos < @source.bytesize
end
|
|
298
|
+
|
|
299
|
+
# Whitespace allowed as a link-tail separator per CommonMark 6.3:
|
|
300
|
+
# "spaces, tabs, and up to one line ending". Line endings are
|
|
301
|
+
# counted by the caller, so this predicate intentionally matches
|
|
302
|
+
# only space and tab -- it must NOT match form feed (U+000C) or
|
|
303
|
+
# vertical tab (U+000B) the way the generic \s class does.
|
|
304
|
+
def link_tail_whitespace_byte?(b)
  # Only space (0x20) and tab (0x09) qualify.
  case b
  when 0x20, 0x09 then true
  else false
  end
end
|
|
307
|
+
|
|
308
|
+
# True when +b+ is the byte value of an ASCII hex digit (0-9, A-F, a-f).
def hex_byte?(b)
  b.between?(0x30, 0x39) || b.between?(0x41, 0x46) || b.between?(0x61, 0x66)
end
|
|
313
|
+
end
|
|
314
|
+
end
|
|
315
|
+
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RedQuilt
|
|
4
|
+
module Inline
|
|
5
|
+
# Integer tags for the inline token kinds, plus a lookup from tag to a
# symbolic name.
module TokenKind
  TEXT = 1
  ENTITY = 2
  ESCAPED_CHAR = 3
  LINE_ENDING = 4
  CODE_DELIMITER = 5
  DELIM_RUN = 6
  LBRACKET = 7
  BANG_LBRACKET = 8
  RBRACKET = 9
  AUTOLINK_URI = 10
  AUTOLINK_EMAIL = 11
  HTML_INLINE = 12

  NAMES = {
    TEXT => :text,
    ENTITY => :entity,
    ESCAPED_CHAR => :escaped_char,
    LINE_ENDING => :line_ending,
    CODE_DELIMITER => :code_delimiter,
    DELIM_RUN => :delim_run,
    LBRACKET => :lbracket,
    BANG_LBRACKET => :bang_lbracket,
    RBRACKET => :rbracket,
    AUTOLINK_URI => :autolink_uri,
    AUTOLINK_EMAIL => :autolink_email,
    HTML_INLINE => :html_inline,
  }.freeze

  # Returns the symbolic name for +kind+, or nil for an unknown tag.
  #
  # This singleton method shadows Module#name. So that reflective callers
  # which ask a module for its name with no argument keep working, a call
  # without arguments falls through to the built-in Module#name instead of
  # raising ArgumentError. The default-value expression records whether the
  # argument was omitted, which keeps `name(nil)` distinct from `name`.
  def self.name(kind = (kind_omitted = true; nil))
    return super() if kind_omitted

    NAMES[kind]
  end
end
|
|
38
|
+
end
|
|
39
|
+
end
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RedQuilt
|
|
4
|
+
module Inline
|
|
5
|
+
# Parallel-array storage for the inline token stream.
|
|
6
|
+
#
|
|
7
|
+
# InlineTokens is intended to be allocated once per document and reused
|
|
8
|
+
# across paragraphs by calling #clear between inline targets. Array#clear
|
|
9
|
+
# resets length to 0 while preserving internal capacity, so subsequent
|
|
10
|
+
# paragraphs avoid reallocating the underlying buffers.
|
|
11
|
+
class Tokens
  # One array per token field; index N across all seven arrays describes
  # token N.
  def initialize
    @kind, @start_byte, @end_byte, @int1, @int2, @int3, @str1 = Array.new(7) { [] }
  end

  # Appends one token and returns its id (its index in the arrays).
  # Fields that are not given default to 0 (int1..int3) or nil (str1).
  def emit(kind, start_byte:, end_byte:, int1: 0, int2: 0, int3: 0, str1: nil)
    @kind << kind
    @start_byte << start_byte
    @end_byte << end_byte
    @int1 << int1
    @int2 << int2
    @int3 << int3
    @str1 << str1
    @kind.length - 1
  end

  # Empties every field array and returns self.
  def clear
    @str1.clear
    @int3.clear
    @int2.clear
    @int1.clear
    @end_byte.clear
    @start_byte.clear
    @kind.clear
    self
  end

  # Number of tokens currently stored.
  def length
    @kind.size
  end

  def empty?
    @kind.empty?
  end

  # Per-field readers, addressed by token id.
  def kind(id) = @kind[id]
  def start_byte(id) = @start_byte[id]
  def end_byte(id) = @end_byte[id]
  def int1(id) = @int1[id]
  def int2(id) = @int2[id]
  def int3(id) = @int3[id]
  def str1(id) = @str1[id]

  # Yields each token id in ascending order; without a block, returns an
  # Enumerator. The token count is read once before iteration starts.
  def each_id
    return enum_for(:each_id) unless block_given?

    @kind.length.times { |id| yield id }
    nil
  end
end
|
|
72
|
+
end
|
|
73
|
+
end
|