red_quilt 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +109 -0
  4. data/.rubocop_todo.yml +7 -0
  5. data/CHANGELOG.md +57 -0
  6. data/README.md +284 -0
  7. data/Rakefile +8 -0
  8. data/ast-spec.md +1227 -0
  9. data/docs/architecture.md +81 -0
  10. data/docs/arena-usage.md +363 -0
  11. data/docs/commonmark-conformance.md +241 -0
  12. data/exe/redquilt +7 -0
  13. data/lib/red_quilt/arena.rb +366 -0
  14. data/lib/red_quilt/block_parser.rb +724 -0
  15. data/lib/red_quilt/blockquote.rb +151 -0
  16. data/lib/red_quilt/cli.rb +182 -0
  17. data/lib/red_quilt/diagnostic.rb +47 -0
  18. data/lib/red_quilt/document.rb +126 -0
  19. data/lib/red_quilt/extended_autolink_pass.rb +185 -0
  20. data/lib/red_quilt/footnote_definition.rb +147 -0
  21. data/lib/red_quilt/footnote_pass.rb +39 -0
  22. data/lib/red_quilt/footnote_registry.rb +68 -0
  23. data/lib/red_quilt/indentation.rb +73 -0
  24. data/lib/red_quilt/inline/builder.rb +674 -0
  25. data/lib/red_quilt/inline/flanking.rb +120 -0
  26. data/lib/red_quilt/inline/html_entities.rb +2180 -0
  27. data/lib/red_quilt/inline/lexer.rb +280 -0
  28. data/lib/red_quilt/inline/link_scanner.rb +315 -0
  29. data/lib/red_quilt/inline/token_kind.rb +39 -0
  30. data/lib/red_quilt/inline/tokens.rb +73 -0
  31. data/lib/red_quilt/inline.rb +34 -0
  32. data/lib/red_quilt/inline_pass.rb +53 -0
  33. data/lib/red_quilt/line.rb +14 -0
  34. data/lib/red_quilt/lint_pass.rb +71 -0
  35. data/lib/red_quilt/list.rb +317 -0
  36. data/lib/red_quilt/node_ref.rb +114 -0
  37. data/lib/red_quilt/node_type.rb +66 -0
  38. data/lib/red_quilt/plain_text.rb +46 -0
  39. data/lib/red_quilt/reference_definition.rb +309 -0
  40. data/lib/red_quilt/renderer/html.rb +279 -0
  41. data/lib/red_quilt/renderer/mdast.rb +152 -0
  42. data/lib/red_quilt/source_map.rb +29 -0
  43. data/lib/red_quilt/source_span.rb +26 -0
  44. data/lib/red_quilt/theme.rb +28 -0
  45. data/lib/red_quilt/themes/default.css +87 -0
  46. data/lib/red_quilt/version.rb +5 -0
  47. data/lib/red_quilt.rb +86 -0
  48. data/mise.toml +2 -0
  49. data/sig/red_quilt.rbs +45 -0
  50. metadata +91 -0
@@ -0,0 +1,151 @@
1
+ # frozen_string_literal: true
2
+
3
module RedQuilt
  # Block quotes (CommonMark spec section 5.1).
  #
  # The module-level functions keep no state and serve BlockParser's
  # predicate dispatch. `Blockquote::Parser` is a collaborator that is
  # built once per BlockParser and reused for every blockquote, nested
  # ones included; `#parse` keeps all per-call state in locals, so
  # reentrant calls do not interfere with each other.
  module Blockquote
    BLOCKQUOTE_PREFIX_RE = /\A {0,3}>/

    module_function

    # True when +text+ starts with a blockquote marker: at most three
    # spaces of indent followed by `>`.
    def match?(text)
      text.match?(BLOCKQUOTE_PREFIX_RE)
    end

    # Removes the leading `>` marker, plus at most one column of the
    # whitespace that follows it, from +line+.
    #
    # Always returns a fresh Line. When +line+ carries no `>` marker the
    # new Line has the same content and byte range as the input. The
    # fourth Line argument (the blank flag) is recomputed from the
    # resulting content in both cases.
    def strip_prefix(line)
      content = line.content
      size = content.bytesize

      # Skip the indent: no more than three space bytes.
      marker_at = 0
      marker_at += 1 while marker_at < 3 && marker_at < size && content.getbyte(marker_at) == 0x20

      if marker_at >= size || content.getbyte(marker_at) != 0x3E
        return Line.new(content, line.start_byte, line.end_byte, !content.match?(/\S/))
      end

      # The indent is spaces only, so the byte index just past `>` is
      # also its absolute column.
      after_marker = marker_at + 1
      first_col = after_marker

      # Measure the whitespace run after `>` in absolute columns, so a
      # tab sitting at column 1 is worth 3 columns rather than 4.
      col = first_col
      cursor = after_marker
      while cursor < size
        case content.getbyte(cursor)
        when 0x20 then col += 1
        when 0x09 then col = ((col / 4) + 1) * 4
        else break
        end
        cursor += 1
      end
      width = col - first_col

      if width.zero?
        rest = content.byteslice(after_marker..)
        consumed = after_marker
      else
        # One column belongs to the marker; the remainder is re-emitted
        # as plain spaces in front of the first non-whitespace byte.
        rest = (" " * (width - 1)) + content.byteslice(cursor..)
        consumed = cursor
      end

      Line.new(rest, line.start_byte + consumed, line.end_byte, !rest.match?(/\S/))
    end

    # Collects the lines of one blockquote and hands them to the owning
    # BlockParser for nested block parsing.
    class Parser
      # block_parser: the BlockParser this collaborator works for; its
      # arena receives the BLOCKQUOTE node.
      def initialize(block_parser)
        @block_parser = block_parser
        @arena = block_parser.arena
      end

      # Consumes lines starting at +index+ that belong to one blockquote,
      # appends a BLOCKQUOTE node under +parent_id+, parses the collected
      # (prefix-stripped) lines as its children, and returns the index of
      # the first line that was not consumed.
      def parse(parent_id, lines, index)
        collected = []
        open_paragraph = false

        while index < lines.length
          current = lines[index]
          # A blank line that carries no `>` prefix ends the blockquote.
          break if current.blank

          if Blockquote.match?(current.content)
            inner = Blockquote.strip_prefix(current)
            # A lone `>` (or `>` followed only by whitespace) closes any
            # open paragraph. Otherwise look through nested wrapper
            # prefixes: an open innermost paragraph (as in `> > > foo`)
            # allows a later `>`-less line to continue it lazily.
            open_paragraph = !inner.content.strip.empty? &&
                             paragraph_eligible_through_blockquotes?(inner.content)
            collected << inner
          elsif open_paragraph && !@block_parser.lazy_break?(lines, index)
            # Lazy continuation: the unprefixed line joins the open
            # paragraph because it does not start a block of its own. The
            # trailing `true` marks the line as lazy so the paragraph
            # parser will not read `===` / `---` on it as a setext
            # underline.
            collected << Line.new(current.content, current.start_byte, current.end_byte, current.blank, true)
          else
            break
          end

          index += 1
        end

        first = collected.first
        quote_id = @arena.add_node(NodeType::BLOCKQUOTE,
                                   source_start: first.start_byte,
                                   source_len: collected.last.end_byte - first.start_byte)
        @arena.append_child(parent_id, quote_id)
        @block_parser.parse_lines(quote_id, collected, transformed: true)
        index
      end

      private

      # Variant of BlockParser#paragraph_eligible_line? that first peels
      # any number of leading wrapper prefixes (blockquote `>` markers
      # and list item markers) off +content+ and then asks whether what
      # remains is paragraph content. This is what lets the unprefixed
      # second line of `> > > foo\nbar` or `> 1. > foo\nbar` lazily
      # continue the deepest paragraph. Returns false as soon as a peeled
      # wrapper turns out to be empty.
      def paragraph_eligible_through_blockquotes?(content)
        rest = content
        loop do
          if Blockquote.match?(rest)
            prefix = /\A {0,3}> ?/.match(rest)
            break unless prefix

            rest = prefix.post_match
          elsif (item = List.match(rest))
            rest = item[:content]
          else
            break
          end
          return false if rest.strip.empty?
        end
        @block_parser.paragraph_eligible_line?(rest)
      end
    end
  end
end
@@ -0,0 +1,182 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "optparse"
4
+
5
module RedQuilt
  # Entry point for the `redquilt` executable. Defined as module-level
  # functions so tests can drive it without shelling out.
  #
  # CLI.run takes an argv-style array and an optional set of IO objects
  # (stdin / stdout / stderr) for testability. It returns an Integer
  # exit code: 0 on success; 1 on usage errors, when the input cannot be
  # read, or when the parsed document carries an :error diagnostic.
  module CLI
    USAGE = <<~USAGE
      Usage: redquilt [options] [file]

      Reads Markdown from FILE (or stdin if FILE is omitted) and writes the
      result to stdout.

      Options:
    USAGE

    DEFAULTS = {
      format: :html,
      allow_html: false,
      disallow_raw_html: false,
      extended_autolinks: false,
      lint: false,
      diagnostics: false,
      diagnostics_only: false,
      standalone: true,
      auto_title: false,
      title: nil,
      lang: "en",
      css: nil,
      theme: :default,
    }.freeze

    THEMES = %i[none default].freeze

    FORMATS = %i[html ast json].freeze

    # Runs the command line.
    #
    # argv:   argument array; option switches are removed from it in place.
    # stdin / stdout / stderr: IO-like objects used for input, rendered
    #         output, and messages / diagnostics respectively.
    #
    # Returns the process exit code (see the module comment).
    def self.run(argv, stdin: $stdin, stdout: $stdout, stderr: $stderr)
      options = parse_options(argv, stderr: stderr)
      # parse_options answers with an exit code when it already handled
      # the invocation (--help, --version, or a usage error).
      return options if options.is_a?(Integer)

      source = read_source(argv, stdin: stdin, stderr: stderr)
      return 1 unless source

      doc = RedQuilt.parse(source,
                           allow_html: options[:allow_html],
                           disallow_raw_html: options[:disallow_raw_html],
                           extended_autolinks: options[:extended_autolinks],
                           lint: options[:lint])

      unless options[:diagnostics_only]
        case options[:format]
        when :html
          stdout.write(render_html(doc, options))
        when :ast
          require "pp"
          PP.pp(doc.to_ast, stdout)
        when :json
          stdout.puts doc.to_json
        end
      end

      if options[:diagnostics] || options[:diagnostics_only]
        write_diagnostics(doc.diagnostics, stderr)
      end

      doc.diagnostics.any? { |d| d.severity == :error } ? 1 : 0
    end

    # Parses option switches out of +argv+ (mutating it so only
    # positional arguments remain).
    #
    # Returns the options Hash (DEFAULTS overlaid with the given
    # switches), or an Integer exit code: 0 after printing help or the
    # version, 1 after reporting an option error. All of these messages
    # go to +stderr+.
    def self.parse_options(argv, stderr:)
      options = DEFAULTS.dup
      parser = OptionParser.new do |opts|
        opts.banner = USAGE
        opts.on("--format FORMAT", FORMATS, "Output format: html (default), ast, json") do |f|
          options[:format] = f
        end
        opts.on("--allow-html", "Pass raw HTML through to the output") do
          options[:allow_html] = true
        end
        opts.on("--disallow-raw-html",
                "Filter dangerous tags (script, iframe, ...) even with --allow-html (GFM)") do
          options[:disallow_raw_html] = true
        end
        opts.on("--extended-autolinks",
                "Linkify bare URLs and email addresses (GFM)") do
          options[:extended_autolinks] = true
        end
        opts.on("--lint",
                "Emit lint-style diagnostics (empty_link, missing_alt, heading_level_skip)") do
          options[:lint] = true
        end
        opts.on("--[no-]standalone",
                "Wrap (or not) the rendered HTML in a full document (default: on)") do |v|
          options[:standalone] = v
        end
        opts.on("--auto-title",
                "Use the first heading's text as <title> (standalone only)") do
          options[:auto_title] = true
        end
        opts.on("--title TITLE", "Explicit <title> text (standalone only)") do |t|
          options[:title] = t
        end
        opts.on("--lang LANG", "html lang attribute (standalone only; default \"en\")") do |l|
          options[:lang] = l
        end
        opts.on("--css URL", "Add a stylesheet link (standalone only)") do |u|
          options[:css] = u
        end
        opts.on("--theme THEME", THEMES,
                "Embedded stylesheet: default (the default) or none (bare HTML)") do |t|
          options[:theme] = t
        end
        opts.on("--diagnostics", "Also print diagnostics to stderr") do
          options[:diagnostics] = true
        end
        opts.on("--diagnostics-only", "Print diagnostics only (suppress normal output)") do
          options[:diagnostics_only] = true
        end
        # The `return` statements below leave parse_options itself: the
        # handlers run while parser.parse! is executing inside this method.
        opts.on("-h", "--help", "Show this help") do
          stderr.puts opts
          return 0
        end
        opts.on("-v", "--version", "Show version") do
          stderr.puts "redquilt #{RedQuilt::VERSION}"
          return 0
        end
      end

      begin
        parser.parse!(argv)
      rescue OptionParser::ParseError => e
        stderr.puts "redquilt: #{e.message}"
        stderr.puts parser
        return 1
      end

      options
    end

    # Reads the Markdown source: from +stdin+ when +argv+ is empty, or
    # from the single file named in +argv+.
    #
    # Returns the source String, or nil after writing an explanation to
    # +stderr+ when the path is not a regular file, more than one path
    # was given, or the read itself fails (permission denied, file
    # removed between the check and the read, ...).
    def self.read_source(argv, stdin:, stderr:)
      return stdin.read if argv.empty?

      if argv.size > 1
        stderr.puts "redquilt: too many arguments: #{argv.inspect}"
        return nil
      end

      path = argv.first
      unless File.file?(path)
        stderr.puts "redquilt: no such file: #{path}"
        return nil
      end

      File.read(path)
    rescue SystemCallError => e
      # File.file? only proves the path existed a moment ago; the read
      # can still fail, so report it instead of letting Errno::* escape.
      stderr.puts "redquilt: cannot read #{path || '<stdin>'}: #{e.message}"
      nil
    end

    # Renders +doc+ to HTML using the standalone / title / lang / css /
    # theme entries of +options+. With --auto-title and no explicit
    # title, the first heading's text (or "" when there is none) is used.
    def self.render_html(doc, options)
      title = options[:title]
      title = doc.first_heading_text.to_s if title.nil? && options[:auto_title]
      doc.to_html(
        standalone: options[:standalone],
        title: title,
        lang: options[:lang],
        css: options[:css],
        theme: options[:theme],
      )
    end

    # Writes one `[severity] rule: message` line per diagnostic to
    # +stderr+, or a single "no diagnostics" line when the list is empty.
    def self.write_diagnostics(diagnostics, stderr)
      if diagnostics.empty?
        stderr.puts "redquilt: no diagnostics"
        return
      end
      diagnostics.each do |d|
        stderr.puts "[#{d.severity}] #{d.rule}: #{d.message}"
      end
    end
  end
end
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+
3
module RedQuilt
  # A single warning / error raised while parsing or rendering a
  # document. Diagnostics are collected on the Document and never
  # interrupt processing — every parse / render call still produces a
  # tree and HTML, even if it emitted diagnostics along the way.
  #
  # severity:    :info / :warning / :error
  # rule:        a short Symbol identifying the rule (e.g. :unsafe_url,
  #              :missing_reference) so callers can filter / silence
  # message:     human-readable explanation
  # source_span: optional SourceSpan, points at the offending byte range
  #
  # Two diagnostics with equal fields are interchangeable: they compare
  # equal with `==` and `eql?` and share a `hash`, so they collapse in
  # `Array#uniq`, `Set`, and as Hash keys.
  class Diagnostic
    SEVERITIES = %i[info warning error].freeze

    attr_reader :severity, :rule, :message, :source_span

    # Raises ArgumentError when +severity+ is not one of SEVERITIES.
    def initialize(severity:, rule:, message:, source_span: nil)
      raise ArgumentError, "unknown severity: #{severity.inspect}" unless SEVERITIES.include?(severity)

      @severity = severity
      @rule = rule
      @message = message
      @source_span = source_span
    end

    # Plain-Hash form. The span, when present, is flattened to its
    # start_byte / end_byte pair; otherwise the :source_span entry is nil.
    def to_h
      {
        severity: severity,
        rule: rule,
        message: message,
        source_span: source_span && { start_byte: source_span.start_byte, end_byte: source_span.end_byte },
      }
    end

    # Field-by-field equality against another Diagnostic.
    def ==(other)
      other.is_a?(Diagnostic) &&
        other.severity == severity &&
        other.rule == rule &&
        other.message == message &&
        other.source_span == source_span
    end
    alias eql? ==

    # Hash consistent with `==`. The span is deliberately left out: its
    # own #hash is not known to agree with its #==, and omitting a field
    # can only cause extra collisions, never a mismatch between equal
    # objects.
    def hash
      [Diagnostic, severity, rule, message].hash
    end
  end
end
@@ -0,0 +1,126 @@
1
+ # frozen_string_literal: true
2
+
3
module RedQuilt
  # A parsed document: the source text, the node arena holding the tree,
  # the id of the root node, the reference / footnote tables handed in by
  # the parser, and the HTML-handling flags used when rendering.
  class Document
    attr_reader :source, :arena, :root_id, :references, :footnotes

    def initialize(source, arena, root_id, allow_html: false, disallow_raw_html: false, references: {}, footnotes: nil)
      @source = source
      @arena = arena
      @root_id = root_id
      @references = references
      @footnotes = footnotes
      @allow_html = allow_html
      @disallow_raw_html = disallow_raw_html
    end

    # Whether raw HTML is allowed through (the allow_html: flag given to
    # the constructor).
    def allow_html?
      @allow_html
    end

    # When true, raw HTML output filters the 9 dangerous tags named by
    # GFM's "Disallowed Raw HTML" extension (title, textarea, style, xmp,
    # iframe, noembed, noframes, script, plaintext) by turning their
    # leading `<` into `&lt;`. It only matters while allow_html? is true;
    # with allow_html? false everything is escaped already.
    def disallow_raw_html?
      @disallow_raw_html
    end

    # NodeRef wrapping the root node of this document.
    def root
      NodeRef.new(self, @root_id)
    end

    # Forwards the given block to the root node's #walk.
    def walk(&block)
      root.walk(&block)
    end

    # Renders the document to HTML.
    #
    # standalone: false (the default) returns just the rendered body
    #   fragment; true wraps it in a `<!DOCTYPE html>` page with a
    #   `<head>` (charset, title, optional stylesheet) and a `<body>`.
    # title / lang / css / theme: only used for standalone output.
    # theme: name of a bundled stylesheet to inline (`:none` inlines
    #   nothing, `:default` inlines RedQuilt's default theme). `css`, a
    #   link to an external stylesheet, is independent of it and the two
    #   can be combined.
    def to_html(standalone: false, title: nil, lang: "en", css: nil, theme: :none)
      fragment = Renderer::HTML.new(self).render
      if standalone
        wrap_standalone_html(fragment, title: title.to_s, lang: lang.to_s, css: css, theme: Theme.css(theme))
      else
        fragment
      end
    end

    # Hash form of the tree, as produced by the root NodeRef's #to_h.
    def to_ast
      root.to_h
    end

    # Pretty-printed JSON of the mdast rendering. Arguments are accepted
    # and ignored.
    def to_json(*)
      require "json"
      JSON.pretty_generate(to_mdast)
    end

    # Output of the Mdast renderer for this document.
    def to_mdast
      Renderer::Mdast.new(self).render
    end

    # Plain-text content of the first HEADING in the document, or nil
    # when no heading exists. Callers such as the CLI's --auto-title use
    # it to derive a document title.
    def first_heading_text
      first_heading_text_walk(@root_id)
    end

    # Lazily built SourceMap over the source text.
    def source_map
      @source_map ||= SourceMap.new(@source)
    end

    # The array of diagnostics collected during parse / render. It is
    # mutable and shared with the parser / renderer, so entries they add
    # later show up here without another call.
    def diagnostics
      @diagnostics ||= []
    end

    private

    # Assembles the standalone page around +body+. The `<link>` is only
    # emitted when +css+ is set, the inline `<style>` only when +theme+
    # (already resolved to CSS text) is set.
    def wrap_standalone_html(body, title:, lang:, css:, theme:)
      page = +""
      page << "<!DOCTYPE html>\n" << %(<html lang="#{html_escape_attr(lang)}">\n)
      page << "<head>\n" << %(<meta charset="utf-8">\n)
      page << "<title>#{html_escape_text(title)}</title>\n"
      if css
        page << %(<link rel="stylesheet" href="#{html_escape_attr(css)}">\n)
      end
      if theme
        page << "<style>\n#{theme}</style>\n"
      end
      page << "</head>\n<body>\n" << body << "</body>\n</html>\n"
      page
    end

    # Escapes `&`, `<` and `>` (in that order) for use in element content.
    def html_escape_text(str)
      [["&", "&amp;"], ["<", "&lt;"], [">", "&gt;"]].reduce(str.to_s) do |escaped, (raw, entity)|
        escaped.gsub(raw, entity)
      end
    end

    # Text escaping plus `"`, for use inside double-quoted attributes.
    def html_escape_attr(str)
      text_safe = html_escape_text(str)
      text_safe.gsub('"', "&quot;")
    end

    # Depth-first, document-order search below +node_id+ for a HEADING.
    # A subtree whose search yields nil/false is skipped and the next
    # sibling is tried.
    def first_heading_text_walk(node_id)
      return nil if node_id == -1
      return PlainText.from(@arena, node_id) if @arena.type(node_id) == NodeType::HEADING

      cursor = @arena.raw_first_child_id(node_id)
      until cursor == -1
        found = first_heading_text_walk(cursor)
        return found if found

        cursor = @arena.raw_next_sibling_id(cursor)
      end
      nil
    end
  end
end
@@ -0,0 +1,185 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RedQuilt
4
+ # GFM Extended autolinks: rewrites bare URLs (`https://...`,
5
+ # `http://...`, `ftp://...`, `www....`) and email addresses inside
6
+ # TEXT nodes into LINK nodes. Runs as an optional pass after the
7
+ # ordinary inline pipeline, so by then all CommonMark inline structure
8
+ # (real `<...>` autolinks, code spans, links, ...) is already in place
9
+ # and protected from rewriting.
10
+ class ExtendedAutolinkPass
11
+ URL_RE = %r{
12
+ (?<![A-Za-z0-9_])
13
+ (?:https?://|ftp://|www\.)
14
+ [^\s<>]+
15
+ }x
16
+
17
+ EMAIL_RE = /
18
+ (?<![A-Za-z0-9._+-])
19
+ [A-Za-z0-9._+-]+
20
+ @
21
+ [A-Za-z0-9](?:[A-Za-z0-9\-_]{0,61}[A-Za-z0-9])?
22
+ (?:\.[A-Za-z0-9](?:[A-Za-z0-9\-_]{0,61}[A-Za-z0-9])?)+
23
+ /x
24
+
25
+ TRAILING_PUNCT_RE = /[?!.,:*_~]+\z/
26
+ TRAILING_ENTITY_RE = /&[A-Za-z0-9]+;\z/
27
+
28
+ # AST contexts whose TEXT descendants must not be auto-linkified.
29
+ SKIP_TYPES = [
30
+ NodeType::LINK,
31
+ NodeType::IMAGE,
32
+ NodeType::CODE_SPAN,
33
+ NodeType::HTML_INLINE,
34
+ NodeType::CODE_BLOCK,
35
+ NodeType::HTML_BLOCK,
36
+ ].freeze
37
+
38
# document: the Document to rewrite; its arena is cached because every
# tree operation of this pass goes through it.
def initialize(document)
  @arena = document.arena
  @document = document
end
42
+
43
# Runs the pass over the whole document, starting at its root node.
def apply
  root_id = @document.root_id
  walk(root_id)
end
46
+
47
+ private
48
+
49
# Depth-first traversal from +node_id+. Subtrees rooted at one of
# SKIP_TYPES are left alone, TEXT nodes go to #process_text, and any
# other node has its children visited in order.
def walk(node_id)
  return if node_id == -1

  kind = @arena.type(node_id)
  return if SKIP_TYPES.include?(kind)

  if kind == NodeType::TEXT
    process_text(node_id)
    return
  end

  cursor = @arena.raw_first_child_id(node_id)
  until cursor == -1
    # Fetch the sibling before descending: processing a TEXT child
    # detaches it from the tree.
    upcoming = @arena.raw_next_sibling_id(cursor)
    walk(cursor)
    cursor = upcoming
  end
end
67
+
68
+ Match = Struct.new(:start, :finish, :label, :dest)
69
+
70
# Splits the TEXT node +node_id+ around every autolink found in it:
# plain stretches become new TEXT nodes, each hit becomes a LINK node
# (destination in str1) with a TEXT child for its label, all inserted
# before the original node, which is detached at the end. Nodes without
# text or without hits are left untouched.
def process_text(node_id)
  text = @arena.text(node_id).to_s
  return if text.empty?

  hits = scan_text(text)
  return if hits.empty?

  parent = @arena.raw_parent_id(node_id)
  put_text = lambda do |chunk|
    @arena.insert_before(parent, node_id, @arena.add_node(NodeType::TEXT, str1: chunk))
  end

  consumed = 0
  hits.each do |hit|
    put_text.call(text[consumed...hit.start]) if hit.start > consumed

    link_id = @arena.add_node(NodeType::LINK, str1: hit.dest)
    label_id = @arena.add_node(NodeType::TEXT, str1: hit.label)
    @arena.append_child(link_id, label_id)
    @arena.insert_before(parent, node_id, link_id)
    consumed = hit.finish
  end
  put_text.call(text[consumed..]) if consumed < text.length

  @arena.detach(node_id)
end
96
+
97
# Finds every autolink in +text+, left to right, and returns them as
# Match structs (start / finish character offsets, label, destination).
#
# At each position the earlier of the next URL and the next email
# candidate is taken. A candidate that is empty after trimming or has
# an invalid domain is abandoned, and the search resumes one character
# after where it began.
def scan_text(text)
  found = []
  pos = 0
  while pos < text.length
    url_hit = URL_RE.match(text, pos)
    email_hit = EMAIL_RE.match(text, pos)
    hit = first_match(url_hit, email_hit)
    break unless hit

    from = hit.begin(0)
    as_email = (hit == email_hit)
    label = trim_trailing(hit[0], email: as_email)

    if label.empty? || !valid_domain?(label, email: as_email)
      pos = from + 1
    else
      upto = from + label.length
      found << Match.new(from, upto, label, build_destination(label, email: as_email))
      pos = upto
    end
  end
  found
end
122
+
123
# Domain check from the GFM spec: "If the domain name contains an
# underscore (_) in its last two segments, it is invalid." Used for
# URL hosts and email domains alike. A missing or empty domain, or one
# with fewer than two dot-separated segments, is also rejected.
def valid_domain?(candidate, email:)
  host = extract_domain(candidate, email: email)
  return false if host.nil? || host.empty?

  labels = host.split(".")
  return false if labels.length < 2

  labels.last(2).none? { |label| label.include?("_") }
end
135
+
136
# Returns the domain part of an autolink candidate, for validation by
# #valid_domain?.
#
# candidate: the trimmed autolink text.
# email:     true when +candidate+ is an email address.
#
# For an email this is everything after the first `@` (nil when there
# is no `@`). For a URL it is the host that follows the scheme (or the
# leading `www.`), ending at the first `/`, `?`, `#` or `:`, so a path,
# query, fragment or port never counts as part of the domain. Without
# that cut, `http://example.com?q=a_b.c` would be judged by the
# underscore and dots of its query string.
def extract_domain(candidate, email:)
  return candidate.split("@", 2)[1] if email

  rest =
    if candidate.start_with?("www.")
      candidate[4..]
    else
      # https://, http://, ftp://
      candidate.sub(%r{\A[a-z]+://}, "")
    end
  rest[%r{\A[^/?#:]*}]
end
148
+
149
# Of two optional MatchData objects, returns the one that begins
# earlier in the text; +a+ wins a tie. When one is missing the other is
# returned (nil when both are).
def first_match(a, b)
  return a || b unless a && b

  b.begin(0) < a.begin(0) ? b : a
end
155
+
156
# Strips everything at the end of +candidate+ that must not belong to
# the autolink, repeating until a full round removes nothing:
#   1. trailing punctuation (TRAILING_PUNCT_RE),
#   2. for URLs only, unbalanced closing parentheses,
#   3. a trailing entity-like `&name;` sequence.
# Returns the trimmed string; +candidate+ itself is not modified.
def trim_trailing(candidate, email:)
  current = candidate
  loop do
    shorter = current.sub(TRAILING_PUNCT_RE, "")
    shorter = strip_excess_close_paren(shorter) unless email
    if shorter.end_with?(";")
      entity = TRAILING_ENTITY_RE.match(shorter)
      shorter = shorter[0...entity.begin(0)] if entity
    end
    return shorter if shorter.length == current.length

    current = shorter
  end
end
167
+
168
# Drops trailing `)` characters from +s+ for as long as the string has
# more `)` than `(` overall. A `)` that is not at the very end is never
# removed.
def strip_excess_close_paren(s)
  surplus = s.count(")") - s.count("(")
  trimmed = s
  while surplus.positive? && trimmed.end_with?(")")
    trimmed = trimmed.chop
    surplus -= 1
  end
  trimmed
end
177
+
178
# Link destination for an autolink +label+: emails get a `mailto:`
# prefix, `www.` hosts get `http://`, and anything else (it already
# carries its scheme) is returned as is.
def build_destination(label, email:)
  if email
    "mailto:#{label}"
  elsif label.start_with?("www.")
    "http://#{label}"
  else
    label
  end
end
184
+ end
185
+ end