red_quilt 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +109 -0
  4. data/.rubocop_todo.yml +7 -0
  5. data/CHANGELOG.md +57 -0
  6. data/README.md +284 -0
  7. data/Rakefile +8 -0
  8. data/ast-spec.md +1227 -0
  9. data/docs/architecture.md +81 -0
  10. data/docs/arena-usage.md +363 -0
  11. data/docs/commonmark-conformance.md +241 -0
  12. data/exe/redquilt +7 -0
  13. data/lib/red_quilt/arena.rb +366 -0
  14. data/lib/red_quilt/block_parser.rb +724 -0
  15. data/lib/red_quilt/blockquote.rb +151 -0
  16. data/lib/red_quilt/cli.rb +182 -0
  17. data/lib/red_quilt/diagnostic.rb +47 -0
  18. data/lib/red_quilt/document.rb +126 -0
  19. data/lib/red_quilt/extended_autolink_pass.rb +185 -0
  20. data/lib/red_quilt/footnote_definition.rb +147 -0
  21. data/lib/red_quilt/footnote_pass.rb +39 -0
  22. data/lib/red_quilt/footnote_registry.rb +68 -0
  23. data/lib/red_quilt/indentation.rb +73 -0
  24. data/lib/red_quilt/inline/builder.rb +674 -0
  25. data/lib/red_quilt/inline/flanking.rb +120 -0
  26. data/lib/red_quilt/inline/html_entities.rb +2180 -0
  27. data/lib/red_quilt/inline/lexer.rb +280 -0
  28. data/lib/red_quilt/inline/link_scanner.rb +315 -0
  29. data/lib/red_quilt/inline/token_kind.rb +39 -0
  30. data/lib/red_quilt/inline/tokens.rb +73 -0
  31. data/lib/red_quilt/inline.rb +34 -0
  32. data/lib/red_quilt/inline_pass.rb +53 -0
  33. data/lib/red_quilt/line.rb +14 -0
  34. data/lib/red_quilt/lint_pass.rb +71 -0
  35. data/lib/red_quilt/list.rb +317 -0
  36. data/lib/red_quilt/node_ref.rb +114 -0
  37. data/lib/red_quilt/node_type.rb +66 -0
  38. data/lib/red_quilt/plain_text.rb +46 -0
  39. data/lib/red_quilt/reference_definition.rb +309 -0
  40. data/lib/red_quilt/renderer/html.rb +279 -0
  41. data/lib/red_quilt/renderer/mdast.rb +152 -0
  42. data/lib/red_quilt/source_map.rb +29 -0
  43. data/lib/red_quilt/source_span.rb +26 -0
  44. data/lib/red_quilt/theme.rb +28 -0
  45. data/lib/red_quilt/themes/default.css +87 -0
  46. data/lib/red_quilt/version.rb +5 -0
  47. data/lib/red_quilt.rb +86 -0
  48. data/mise.toml +2 -0
  49. data/sig/red_quilt.rbs +45 -0
  50. metadata +91 -0
@@ -0,0 +1,280 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "strscan"
4
+
5
+ require_relative "html_entities"
6
+
7
+ module RedQuilt
8
+ module Inline
9
+ # Scans a byte range of the document source and emits inline tokens
10
+ # into a caller-owned Tokens storage.
11
+ #
12
+ # The lexer never copies the source string; all positions are absolute
13
+ # byte offsets into @source. The caller is responsible for clearing the
14
+ # Tokens storage between invocations if it is being reused.
15
class Lexer
  # Lookup table of bytes whose appearance ends a TEXT run. Anything not
  # in this set is plain text content. Newline is included so LINE_ENDING
  # gets its own token.
  SPECIAL_BYTES = begin
    a = Array.new(256, false)
    # *, _, `, [, ], !, <, &, \, \n, ~ (GFM strikethrough)
    [0x2A, 0x5F, 0x60, 0x5B, 0x5D, 0x21, 0x3C, 0x26, 0x5C, 0x0A, 0x7E].each { |b| a[b] = true }
    a.freeze
  end
  # Same set as SPECIAL_BYTES, for String#byteindex to jump over long
  # plain-text runs.
  SPECIAL_BYTE_RE = /[*_`\[\]!<&\\\n~]/

  # Regexes below are used with StringScanner#scan (via scan_within_end),
  # which anchors at the current scan position, so no `\G` is needed.
  #
  # URI autolink: a 2-32 character scheme, `:`, then any run of characters
  # that are not `<`, `>`, space, or an ASCII control character.
  URI_AUTOLINK_RE = /<([A-Za-z][A-Za-z0-9+.-]{1,31}:[^<>\u0000-\u0020\u007F]*)>/
  EMAIL_AUTOLINK_RE = /<([a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)>/
  # Raw inline HTML, six forms: open tag, closing tag, comment, processing
  # instruction, declaration, CDATA section. Attribute values may span
  # lines. Tag separators are spelled out as space/tab/CR/LF because `\s`
  # would also accept form feed (U+000C) and vertical tab (U+000B).
  HTML_OPEN_TAG_RE = %r{<[A-Za-z][A-Za-z0-9-]*(?:[ \t\r\n]+[A-Za-z_:][A-Za-z0-9_.:-]*(?:[ \t\r\n]*=[ \t\r\n]*(?:"[^"]*"|'[^']*'|[^ \t\r\n"'=<>`]+))?)*[ \t\r\n]*/?>}
  HTML_CLOSING_TAG_RE = %r{</[A-Za-z][A-Za-z0-9-]*[ \t\r\n]*>}
  # Comment: `<!-->`, `<!--->`, or `<!--` text `-->` where the text does
  # not start with `>` or `->`. The match is lazy and closes at the first
  # `-->` that is not itself preceded by `-`; an interior `--` is accepted.
  HTML_COMMENT_RE = %r{<!-->|<!--->|<!--(?!>)(?!->)[\s\S]*?(?<!-)-->}
  HTML_PROC_INST_RE = %r{<\?[\s\S]*?\?>}
  HTML_DECLARATION_RE = %r{<![A-Za-z][^>]*>}
  HTML_CDATA_RE = %r{<!\[CDATA\[[\s\S]*?\]\]>}
  # The entity regex and decoder are looked up on the Inline module
  # (Inline::ENTITY_RE / Inline.decode_entity) rather than defined here.

  # @param source [String] the full document source; token offsets are
  #   absolute byte offsets into this string.
  def initialize(source)
    @source = source
    # Binary-encoded view used for the String#byteindex fast path, so that
    # any byte offset is a valid search start regardless of multibyte
    # character boundaries in the original encoding.
    @source_b = source.b
    @ss = StringScanner.new(source)
  end

  # Scans @source[start_byte...end_byte] and emits tokens into `tokens`.
  #
  # @param tokens [#emit] token sink; it is appended to, never cleared here
  # @param start_byte [Integer] inclusive start offset
  # @param end_byte [Integer] exclusive end offset
  # @return [Object] the `tokens` object that was passed in
  def lex_into(tokens, start_byte, end_byte)
    @ss.pos = start_byte
    @start = start_byte
    @end = end_byte
    scan(tokens)
    tokens
  end

  private

  # Main dispatch loop. The local `pos` is the source of truth while
  # scanning; @ss.pos is only synced around scan_angle / scan_amp (which
  # use StringScanner for regex matching) and once at loop exit. The other
  # scan_* helpers take `pos` and return the new position.
  def scan(tokens)
    pos = @ss.pos
    end_pos = @end
    while pos < end_pos
      byte = @source.getbyte(pos)
      case byte
      when 0x0A # \n
        pos = scan_line_ending(tokens, pos)
      when 0x5C # \\ (backslash)
        pos = scan_backslash(tokens, pos, end_pos)
      when 0x60 # `
        pos = scan_code_delimiter(tokens, pos, end_pos)
      when 0x2A # *
        pos = scan_delim_run(tokens, pos, end_pos, "*", 0x2A)
      when 0x5F # _
        pos = scan_delim_run(tokens, pos, end_pos, "_", 0x5F)
      when 0x7E # ~ (GFM strikethrough)
        pos = scan_delim_run(tokens, pos, end_pos, "~", 0x7E)
      when 0x5B # [
        tokens.emit(TokenKind::LBRACKET, start_byte: pos, end_byte: pos + 1)
        pos += 1
      when 0x5D # ]
        tokens.emit(TokenKind::RBRACKET, start_byte: pos, end_byte: pos + 1)
        pos += 1
      when 0x21 # !
        pos = scan_bang(tokens, pos, end_pos)
      when 0x3C # <
        @ss.pos = pos
        scan_angle(tokens)
        pos = @ss.pos
      when 0x26 # &
        @ss.pos = pos
        scan_amp(tokens)
        pos = @ss.pos
      else
        # Plain text. Always make progress: consume the current byte,
        # then jump to the next special byte (or the end of the range)
        # using byteindex on the binary view.
        start = pos
        pos += 1
        if pos < end_pos
          next_special = @source_b.byteindex(SPECIAL_BYTE_RE, pos)
          pos = next_special.nil? || next_special >= end_pos ? end_pos : next_special
        end
        tokens.emit(TokenKind::TEXT, start_byte: start, end_byte: pos)
      end
    end
    @ss.pos = pos
  end

  # Emits a LINE_ENDING token for the newline at `pos`; int1 carries the
  # number of ASCII spaces immediately before the newline.
  #
  # The backwards walk stops at @start so that spaces lying before the
  # lexed range are never attributed to this line (scan_delim_run bounds
  # its look-behind the same way).
  #
  # @return [Integer] position just past the newline
  def scan_line_ending(tokens, pos)
    floor = @start
    trailing_spaces = 0
    i = pos - 1
    while i >= floor && @source.getbyte(i) == 0x20
      trailing_spaces += 1
      i -= 1
    end
    new_pos = pos + 1
    tokens.emit(TokenKind::LINE_ENDING,
                start_byte: pos, end_byte: new_pos,
                int1: trailing_spaces)
    new_pos
  end

  # Handles a backslash at `pos`:
  # * last byte of the range       -> TEXT for the backslash alone
  # * followed by a newline        -> LINE_ENDING covering both, int2 = 1
  # * followed by ASCII punctuation -> ESCAPED_CHAR, str1 = that character
  # * anything else                -> TEXT for the backslash alone
  #
  # @return [Integer] position after the consumed bytes
  def scan_backslash(tokens, pos, end_pos)
    nxt_pos = pos + 1
    if nxt_pos >= end_pos
      tokens.emit(TokenKind::TEXT, start_byte: pos, end_byte: nxt_pos)
      return nxt_pos
    end

    nxt = @source.getbyte(nxt_pos)
    if nxt == 0x0A
      # int2 = 1 marks the backslash form of the line ending.
      tokens.emit(TokenKind::LINE_ENDING,
                  start_byte: pos, end_byte: nxt_pos + 1,
                  int1: 0, int2: 1)
      nxt_pos + 1
    elsif Inline.ascii_punct_byte?(nxt)
      tokens.emit(TokenKind::ESCAPED_CHAR,
                  start_byte: pos, end_byte: nxt_pos + 1,
                  str1: nxt.chr)
      nxt_pos + 1
    else
      tokens.emit(TokenKind::TEXT, start_byte: pos, end_byte: nxt_pos)
      nxt_pos
    end
  end

  # Consumes a run of backticks and emits one CODE_DELIMITER token whose
  # int1 is the run length.
  #
  # @return [Integer] position after the run
  def scan_code_delimiter(tokens, pos, end_pos)
    i = pos
    i += 1 while i < end_pos && @source.getbyte(i) == 0x60
    tokens.emit(TokenKind::CODE_DELIMITER,
                start_byte: pos, end_byte: i,
                int1: i - pos)
    i
  end

  # Consumes a run of the delimiter `byte` and classifies it through
  # Flanking. A run that can neither open nor close is emitted as TEXT;
  # otherwise a DELIM_RUN is emitted with int1 = delimiter byte,
  # int2 = run length, int3 = flags (bit 1: can open, bit 0: can close).
  #
  # @return [Integer] position after the run
  def scan_delim_run(tokens, pos, end_pos, char, byte)
    i = pos
    i += 1 while i < end_pos && @source.getbyte(i) == byte
    count = i - pos
    prev_char = Flanking.char_before(@source, pos, @start)
    next_char = Flanking.char_at(@source, i, end_pos)
    can_open, can_close = Flanking.can_open_close(char, prev_char, next_char)
    if !can_open && !can_close
      tokens.emit(TokenKind::TEXT, start_byte: pos, end_byte: i)
      return i
    end
    flags = (can_open ? 0b10 : 0) | (can_close ? 0b01 : 0)
    tokens.emit(TokenKind::DELIM_RUN,
                start_byte: pos, end_byte: i,
                int1: byte, int2: count, int3: flags)
    i
  end

  # `![` inside the range becomes BANG_LBRACKET; a lone `!` is TEXT.
  #
  # @return [Integer] position after the consumed bytes
  def scan_bang(tokens, pos, end_pos)
    if pos + 1 < end_pos && @source.getbyte(pos + 1) == 0x5B # [
      tokens.emit(TokenKind::BANG_LBRACKET, start_byte: pos, end_byte: pos + 2)
      pos + 2
    else
      tokens.emit(TokenKind::TEXT, start_byte: pos, end_byte: pos + 1)
      pos + 1
    end
  end

  # Tries, in order, URI autolink, email autolink, then the raw-HTML forms
  # at @ss.pos (which must point at `<`). Falls back to a one-byte TEXT
  # token. Advances @ss.pos past whatever was emitted.
  def scan_angle(tokens)
    start = @ss.pos
    if scan_within_end(URI_AUTOLINK_RE)
      tokens.emit(TokenKind::AUTOLINK_URI,
                  start_byte: start, end_byte: @ss.pos,
                  str1: @ss[1])
    elsif scan_within_end(EMAIL_AUTOLINK_RE)
      tokens.emit(TokenKind::AUTOLINK_EMAIL,
                  start_byte: start, end_byte: @ss.pos,
                  str1: @ss[1])
    elsif (matched = scan_within_end(HTML_OPEN_TAG_RE)) ||
          (matched = scan_within_end(HTML_CLOSING_TAG_RE)) ||
          (matched = scan_within_end(HTML_COMMENT_RE)) ||
          (matched = scan_within_end(HTML_PROC_INST_RE)) ||
          (matched = scan_within_end(HTML_DECLARATION_RE)) ||
          (matched = scan_within_end(HTML_CDATA_RE))
      tokens.emit(TokenKind::HTML_INLINE,
                  start_byte: start, end_byte: @ss.pos,
                  str1: matched)
    else
      @ss.pos += 1
      tokens.emit(TokenKind::TEXT, start_byte: start, end_byte: @ss.pos)
    end
  end

  # Emits ENTITY (str1 = decoded text) when an entity reference starts at
  # @ss.pos, otherwise a one-byte TEXT token for the `&`.
  def scan_amp(tokens)
    start = @ss.pos
    if (matched = scan_within_end(Inline::ENTITY_RE))
      tokens.emit(TokenKind::ENTITY,
                  start_byte: start, end_byte: @ss.pos,
                  str1: decode_entity(matched))
    else
      @ss.pos += 1
      tokens.emit(TokenKind::TEXT, start_byte: start, end_byte: @ss.pos)
    end
  end

  # StringScanner#scan constrained to @end.
  #
  # @return [String, nil] the matched text, or nil when the regex does not
  #   match or the match would run past @end (the scanner is rewound in
  #   that case)
  def scan_within_end(regex)
    before = @ss.pos
    matched = @ss.scan(regex)
    return nil unless matched

    if @ss.pos > @end
      @ss.pos = before
      return nil
    end
    matched
  end

  # Decodes a single entity reference by delegating to Inline.decode_entity.
  def decode_entity(raw)
    Inline.decode_entity(raw)
  end
end
279
+ end
280
+ end
@@ -0,0 +1,315 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RedQuilt
4
+ module Inline
5
+ # Pure byte-level scanner for link / image tails: inline link bodies
6
+ # `(dest "title")`, bracketed reference labels `[label]`, and link
7
+ # destination URI normalization. Operates only on the document source
8
+ # string -- no arena, token stream, or parser state -- so it can be
9
+ # exercised in isolation. Inline::Builder owns one instance and feeds
10
+ # it absolute byte offsets.
11
class LinkScanner
  NIL_PAIR = [nil, nil].freeze
  # Bytes left verbatim by normalize_uri: ASCII alphanumerics plus the
  # punctuation listed below. Every other byte is percent-encoded.
  URL_SAFE_BYTE = begin
    a = Array.new(256, false)
    (0x30..0x39).each { |b| a[b] = true } # 0-9
    (0x41..0x5A).each { |b| a[b] = true } # A-Z
    (0x61..0x7A).each { |b| a[b] = true } # a-z
    "-._~:/?#@!$&'()*+,;=".each_byte { |b| a[b] = true }
    a.freeze
  end

  # @param source [String] document source; all positions passed to this
  #   object are absolute byte offsets into it.
  def initialize(source)
    @source = source
  end

  # Parses an inline link body `(dest "title")` starting at the byte right
  # after the link's closing `]`.
  #
  # @param start_byte [Integer] offset that should hold `(`
  # @return [Hash, nil] `{ end_byte:, destination:, title: }` on success
  #   (destination is "" when absent, title is nil when absent), or nil if
  #   the bytes do not form a valid inline link tail
  def inline_link(start_byte)
    return nil unless byte_at(start_byte) == 0x28

    pos = start_byte + 1
    pos = skip_link_whitespace(pos)
    return nil if pos.nil?

    raw_dest = nil
    next_byte = byte_at(pos)
    if next_byte && next_byte != 0x29 && !link_tail_whitespace_byte?(next_byte) && next_byte != 0x0A
      dest_result = parse_link_destination(pos)
      return nil unless dest_result

      raw_dest, pos = dest_result
    end

    ws_end = skip_link_whitespace(pos)
    return nil if ws_end.nil?

    raw_title = nil
    # A title is only attempted when whitespace separates it from what
    # came before.
    if ws_end > pos
      opener_byte = byte_at(ws_end)
      if opener_byte && (opener_byte == 0x22 || opener_byte == 0x27 || opener_byte == 0x28)
        title_result = parse_link_title(ws_end)
        return nil unless title_result

        raw_title, pos = title_result
        pos = skip_link_whitespace(pos)
        return nil if pos.nil?
      else
        pos = ws_end
      end
    else
      pos = ws_end
    end

    return nil unless byte_at(pos) == 0x29

    destination = raw_dest ? normalize_uri(raw_dest) : ""
    title = raw_title ? decode_link_entities(raw_title) : nil
    { end_byte: pos + 1, destination: destination, title: title }
  end

  # Reads a bracketed reference label `[label]` starting at start_byte
  # (which must point at the `[`).
  #
  # @return [Array] `[label, after_byte]`, or NIL_PAIR when there is no
  #   opening `[`, an unescaped `[` appears inside, the closing `]` is
  #   missing, or ReferenceDefinition.label_too_long? rejects the label
  def reference_label(start_byte)
    return NIL_PAIR unless @source.getbyte(start_byte) == 0x5B

    limit = @source.bytesize
    i = start_byte + 1
    while i < limit
      b = @source.getbyte(i)
      if b == 0x5D
        label = @source.byteslice(start_byte + 1, i - start_byte - 1)
        return NIL_PAIR if ReferenceDefinition.label_too_long?(label)

        return [label, i + 1]
      elsif b == 0x5B
        # An unescaped `[` inside a reference label voids the form.
        return NIL_PAIR
      elsif b == 0x5C && i + 1 < limit
        # Skip the backslash and the byte it precedes.
        i += 2
        next
      end
      i += 1
    end
    NIL_PAIR
  end

  # Decodes HTML entities, then percent-encodes every byte outside
  # URL_SAFE_BYTE. An existing `%XX` escape is kept, with its hex digits
  # uppercased; a `%` not followed by two hex digits becomes `%25`.
  #
  # @param raw [String]
  # @return [String]
  def normalize_uri(raw)
    decoded = decode_link_entities(raw)
    bytes = decoded.b
    result = +""
    i = 0
    size = bytes.bytesize
    while i < size
      b = bytes.getbyte(i)
      if b == 0x25 && i + 2 < size &&
         hex_byte?(bytes.getbyte(i + 1)) && hex_byte?(bytes.getbyte(i + 2))
        result << "%"
        result << bytes.getbyte(i + 1).chr.upcase
        result << bytes.getbyte(i + 2).chr.upcase
        i += 3
      elsif URL_SAFE_BYTE[b]
        # All URL-safe bytes are ASCII, so appending the integer
        # codepoint is equivalent to appending b.chr.
        result << b
        i += 1
      else
        result << format("%%%02X", b)
        i += 1
      end
    end
    result
  end

  private

  # Consumes spaces, tabs and newlines starting at start_byte.
  #
  # @return [Integer, nil] position of the first byte that is none of
  #   those (or the end of source), or nil once a second newline is seen
  def skip_link_whitespace(start_byte)
    pos = start_byte
    newlines = 0
    limit = @source.bytesize
    while pos < limit
      b = @source.getbyte(pos)
      if b == 0x0A
        newlines += 1
        return nil if newlines > 1
      elsif !link_tail_whitespace_byte?(b)
        break
      end
      pos += 1
    end
    pos
  end

  # Dispatches on the first byte: `<` selects the angle-bracket form,
  # anything else the raw form.
  def parse_link_destination(start_byte)
    if byte_at(start_byte) == 0x3C
      parse_angle_bracket_destination(start_byte)
    else
      parse_raw_destination(start_byte)
    end
  end

  # `<...>` form. Returns [string_with_backslash_escapes_applied, end_pos]
  # or nil. Inside angles, `\` followed by ASCII punctuation escapes that
  # punctuation; an unescaped `<` or a newline, or a missing `>`, fails.
  def parse_angle_bracket_destination(start_byte)
    pos = start_byte + 1
    result = String.new
    limit = @source.bytesize
    while pos < limit
      b = @source.getbyte(pos)
      case b
      when 0x3E
        return [in_source_encoding(result), pos + 1]
      when 0x3C, 0x0A
        return nil
      when 0x5C
        nb = @source.getbyte(pos + 1)
        if nb && Inline.ascii_punct_byte?(nb)
          result << nb
          pos += 2
          next
        end
        result << b
      else
        result << b
      end
      pos += 1
    end
    nil
  end

  # Raw destination: bytes up to a space/tab, an ASCII control byte, or a
  # `)` that is not matched by an earlier `(`. Parens are allowed when
  # balanced or backslash-escaped. Returns [string, end_pos], or nil when
  # nothing was consumed or a `(` was left unclosed.
  def parse_raw_destination(start_byte)
    pos = start_byte
    depth = 0
    result = String.new
    limit = @source.bytesize
    while pos < limit
      b = @source.getbyte(pos)
      if b == 0x5C
        nb = @source.getbyte(pos + 1)
        if nb && Inline.ascii_punct_byte?(nb)
          result << nb
          pos += 2
          next
        end
        result << b
        pos += 1
        next
      end

      break if link_tail_whitespace_byte?(b) || b < 0x20 || b == 0x7F

      if b == 0x28
        depth += 1
      elsif b == 0x29
        break if depth.zero?

        depth -= 1
      end

      result << b
      pos += 1
    end

    return nil if pos == start_byte
    return nil if depth != 0

    [in_source_encoding(result), pos]
  end

  # Parses a title delimited by `"`, `'`, or `(...)`. Returns
  # [unescaped_string, end_pos] or nil. Backslash escapes apply for ASCII
  # punctuation; a blank line inside a title voids the match.
  def parse_link_title(start_byte)
    opener = @source.getbyte(start_byte)
    closer = case opener
             when 0x22 then 0x22
             when 0x27 then 0x27
             when 0x28 then 0x29
             else return nil
             end
    balanced = opener == 0x28

    pos = start_byte + 1
    result = String.new
    limit = @source.bytesize
    while pos < limit
      b = @source.getbyte(pos)
      if b == 0x5C
        nb = @source.getbyte(pos + 1)
        if nb && Inline.ascii_punct_byte?(nb)
          result << nb
          pos += 2
          next
        end
        result << b
        pos += 1
        next
      end

      if b == 0x0A
        # Blank line (newline followed by only spaces/tabs and another
        # newline) is forbidden.
        look = pos + 1
        look += 1 while look < limit && (@source.getbyte(look) == 0x20 || @source.getbyte(look) == 0x09)
        return nil if look < limit && @source.getbyte(look) == 0x0A

        result << b
        pos += 1
        next
      end

      # Inside `(...)` titles, an unescaped opening `(` invalidates the match.
      return nil if balanced && b == 0x28

      return [in_source_encoding(result), pos + 1] if b == closer

      result << b
      pos += 1
    end
    nil
  end

  # Replaces each Inline::ENTITY_RE match with Inline.decode_entity's
  # result. Returns `raw` itself (no new string) when it contains no `&`.
  def decode_link_entities(raw)
    return raw unless raw.include?("&")

    raw.gsub(Inline::ENTITY_RE) { |m| Inline.decode_entity(m) }
  end

  # Bounds-checked getbyte: nil for negative or past-the-end positions.
  def byte_at(pos)
    return nil if pos < 0 || pos >= @source.bytesize

    @source.getbyte(pos)
  end

  # Separator whitespace inside a link tail: space and tab only. Newlines
  # are counted separately by skip_link_whitespace, and form feed
  # (U+000C) / vertical tab (U+000B) are deliberately not matched.
  def link_tail_whitespace_byte?(b)
    b == 0x20 || b == 0x09
  end

  # True for the ASCII bytes 0-9, A-F, a-f.
  def hex_byte?(b)
    (b >= 0x30 && b <= 0x39) ||
      (b >= 0x41 && b <= 0x46) ||
      (b >= 0x61 && b <= 0x66)
  end

  # The parsers accumulate raw bytes in a binary (ASCII-8BIT) buffer so
  # that `<<` with an Integer appends exactly one byte. The bytes were
  # copied from @source, so the buffer is re-tagged with the source's
  # encoding before it is handed back; otherwise non-ASCII titles would
  # leak out as binary strings and entity substitution on a destination
  # containing raw non-ASCII bytes would raise
  # Encoding::CompatibilityError.
  def in_source_encoding(buf)
    buf.force_encoding(@source.encoding)
  end
end
314
+ end
315
+ end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RedQuilt
4
+ module Inline
5
# Integer tags for inline tokens, plus a symbolic name for each.
module TokenKind
  TEXT = 1
  ENTITY = 2
  ESCAPED_CHAR = 3
  LINE_ENDING = 4
  CODE_DELIMITER = 5
  DELIM_RUN = 6
  LBRACKET = 7
  BANG_LBRACKET = 8
  RBRACKET = 9
  AUTOLINK_URI = 10
  AUTOLINK_EMAIL = 11
  HTML_INLINE = 12

  # Kind tag => symbolic name.
  NAMES = {
    TEXT => :text,
    ENTITY => :entity,
    ESCAPED_CHAR => :escaped_char,
    LINE_ENDING => :line_ending,
    CODE_DELIMITER => :code_delimiter,
    DELIM_RUN => :delim_run,
    LBRACKET => :lbracket,
    BANG_LBRACKET => :bang_lbracket,
    RBRACKET => :rbracket,
    AUTOLINK_URI => :autolink_uri,
    AUTOLINK_EMAIL => :autolink_email,
    HTML_INLINE => :html_inline,
  }.freeze

  # Sentinel that distinguishes "no argument given" from an explicit nil.
  NO_KIND = Object.new.freeze
  private_constant :NO_KIND

  # Looks up the symbolic name of a token kind.
  #
  # This singleton method shadows Module#name, so a zero-argument call is
  # forwarded to the built-in implementation; otherwise anything that asks
  # the module for its own name (`TokenKind.name`) would raise
  # ArgumentError.
  #
  # @param kind [Integer] one of the constants above
  # @return [Symbol, nil] the name, or nil for an unknown kind; with no
  #   argument, whatever Module#name returns for this module
  def self.name(kind = NO_KIND)
    return super() if kind.equal?(NO_KIND)

    NAMES[kind]
  end
end
38
+ end
39
+ end
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RedQuilt
4
+ module Inline
5
+ # Parallel-array storage for the inline token stream.
6
+ #
7
+ # Tokens is intended to be allocated once per document and reused
8
+ # across paragraphs by calling #clear between inline targets. Array#clear
9
+ # resets length to 0 while preserving internal capacity, so subsequent
10
+ # paragraphs avoid reallocating the underlying buffers.
11
# Parallel-array storage for inline tokens. A token is identified by its
# integer id, which is its index into every column.
class Tokens
  def initialize
    @kind = []
    @start_byte = []
    @end_byte = []
    @int1 = []
    @int2 = []
    @int3 = []
    @str1 = []
  end

  # Appends one token, writing a value into every column so they stay the
  # same length.
  #
  # @param kind [Integer] token kind tag
  # @param start_byte [Integer]
  # @param end_byte [Integer]
  # @param int1 [Integer] kind-specific payload (default 0)
  # @param int2 [Integer] kind-specific payload (default 0)
  # @param int3 [Integer] kind-specific payload (default 0)
  # @param str1 [String, nil] kind-specific payload (default nil)
  # @return [Integer] id of the new token
  def emit(kind, start_byte:, end_byte:, int1: 0, int2: 0, int3: 0, str1: nil)
    id = @kind.length
    @kind << kind
    @start_byte << start_byte
    @end_byte << end_byte
    @int1 << int1
    @int2 << int2
    @int3 << int3
    @str1 << str1
    id
  end

  # Empties every column in place.
  #
  # @return [Tokens] self
  def clear
    @kind.clear
    @start_byte.clear
    @end_byte.clear
    @int1.clear
    @int2.clear
    @int3.clear
    @str1.clear
    self
  end

  # @return [Integer] number of tokens stored
  def length
    @kind.length
  end

  # @return [Boolean] true when no tokens are stored
  def empty?
    @kind.empty?
  end

  # Column readers. Each returns nil for an id that was never emitted.
  def kind(id) = @kind[id]
  def start_byte(id) = @start_byte[id]
  def end_byte(id) = @end_byte[id]
  def int1(id) = @int1[id]
  def int2(id) = @int2[id]
  def int3(id) = @int3[id]
  def str1(id) = @str1[id]

  # Yields each token id in emission order. The upper bound is captured
  # before iterating, so tokens emitted from inside the block are not
  # visited.
  #
  # @return [Enumerator] when called without a block
  # @return [Tokens] self when called with a block
  def each_id
    return enum_for(:each_id) unless block_given?

    id = 0
    last = @kind.length
    while id < last
      yield id
      id += 1
    end
    self
  end
end
72
+ end
73
+ end