markbridge 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/lib/markbridge/all.rb +9 -0
- data/lib/markbridge/ast/align.rb +24 -0
- data/lib/markbridge/ast/attachment.rb +42 -0
- data/lib/markbridge/ast/bold.rb +13 -0
- data/lib/markbridge/ast/code.rb +27 -0
- data/lib/markbridge/ast/color.rb +25 -0
- data/lib/markbridge/ast/document.rb +27 -0
- data/lib/markbridge/ast/element.rb +47 -0
- data/lib/markbridge/ast/email.rb +27 -0
- data/lib/markbridge/ast/event.rb +59 -0
- data/lib/markbridge/ast/heading.rb +23 -0
- data/lib/markbridge/ast/horizontal_rule.rb +12 -0
- data/lib/markbridge/ast/image.rb +35 -0
- data/lib/markbridge/ast/italic.rb +13 -0
- data/lib/markbridge/ast/line_break.rb +12 -0
- data/lib/markbridge/ast/list.rb +52 -0
- data/lib/markbridge/ast/list_item.rb +13 -0
- data/lib/markbridge/ast/markdown_text.rb +37 -0
- data/lib/markbridge/ast/mention.rb +29 -0
- data/lib/markbridge/ast/node.rb +19 -0
- data/lib/markbridge/ast/paragraph.rb +13 -0
- data/lib/markbridge/ast/poll.rb +74 -0
- data/lib/markbridge/ast/quote.rb +46 -0
- data/lib/markbridge/ast/size.rb +25 -0
- data/lib/markbridge/ast/spoiler.rb +27 -0
- data/lib/markbridge/ast/strikethrough.rb +13 -0
- data/lib/markbridge/ast/subscript.rb +13 -0
- data/lib/markbridge/ast/superscript.rb +13 -0
- data/lib/markbridge/ast/text.rb +38 -0
- data/lib/markbridge/ast/underline.rb +13 -0
- data/lib/markbridge/ast/upload.rb +74 -0
- data/lib/markbridge/ast/url.rb +27 -0
- data/lib/markbridge/ast.rb +42 -0
- data/lib/markbridge/configuration.rb +11 -0
- data/lib/markbridge/gem_loader.rb +23 -0
- data/lib/markbridge/parsers/bbcode/closing_strategies/base.rb +37 -0
- data/lib/markbridge/parsers/bbcode/closing_strategies/reordering.rb +17 -0
- data/lib/markbridge/parsers/bbcode/closing_strategies/strict.rb +12 -0
- data/lib/markbridge/parsers/bbcode/closing_strategies/tag_reconciler.rb +121 -0
- data/lib/markbridge/parsers/bbcode/errors/max_depth_exceeded_error.rb +13 -0
- data/lib/markbridge/parsers/bbcode/handler_registry.rb +160 -0
- data/lib/markbridge/parsers/bbcode/handlers/align_handler.rb +26 -0
- data/lib/markbridge/parsers/bbcode/handlers/attachment_handler.rb +104 -0
- data/lib/markbridge/parsers/bbcode/handlers/base_handler.rb +44 -0
- data/lib/markbridge/parsers/bbcode/handlers/code_handler.rb +25 -0
- data/lib/markbridge/parsers/bbcode/handlers/color_handler.rb +31 -0
- data/lib/markbridge/parsers/bbcode/handlers/email_handler.rb +25 -0
- data/lib/markbridge/parsers/bbcode/handlers/image_handler.rb +51 -0
- data/lib/markbridge/parsers/bbcode/handlers/list_handler.rb +36 -0
- data/lib/markbridge/parsers/bbcode/handlers/list_item_handler.rb +26 -0
- data/lib/markbridge/parsers/bbcode/handlers/quote_handler.rb +64 -0
- data/lib/markbridge/parsers/bbcode/handlers/raw_handler.rb +48 -0
- data/lib/markbridge/parsers/bbcode/handlers/self_closing_handler.rb +28 -0
- data/lib/markbridge/parsers/bbcode/handlers/simple_handler.rb +28 -0
- data/lib/markbridge/parsers/bbcode/handlers/size_handler.rb +31 -0
- data/lib/markbridge/parsers/bbcode/handlers/spoiler_handler.rb +28 -0
- data/lib/markbridge/parsers/bbcode/handlers/url_handler.rb +24 -0
- data/lib/markbridge/parsers/bbcode/parser.rb +123 -0
- data/lib/markbridge/parsers/bbcode/parser_state.rb +93 -0
- data/lib/markbridge/parsers/bbcode/peekable_enumerator.rb +126 -0
- data/lib/markbridge/parsers/bbcode/raw_content_collector.rb +35 -0
- data/lib/markbridge/parsers/bbcode/raw_content_result.rb +25 -0
- data/lib/markbridge/parsers/bbcode/scanner.rb +231 -0
- data/lib/markbridge/parsers/bbcode/tokens/tag_end_token.rb +21 -0
- data/lib/markbridge/parsers/bbcode/tokens/tag_start_token.rb +23 -0
- data/lib/markbridge/parsers/bbcode/tokens/text_token.rb +23 -0
- data/lib/markbridge/parsers/bbcode/tokens/token.rb +16 -0
- data/lib/markbridge/parsers/bbcode.rb +56 -0
- data/lib/markbridge/parsers/html/handler_registry.rb +87 -0
- data/lib/markbridge/parsers/html/handlers/base_handler.rb +27 -0
- data/lib/markbridge/parsers/html/handlers/image_handler.rb +40 -0
- data/lib/markbridge/parsers/html/handlers/list_handler.rb +29 -0
- data/lib/markbridge/parsers/html/handlers/list_item_handler.rb +26 -0
- data/lib/markbridge/parsers/html/handlers/paragraph_handler.rb +17 -0
- data/lib/markbridge/parsers/html/handlers/quote_handler.rb +28 -0
- data/lib/markbridge/parsers/html/handlers/raw_handler.rb +33 -0
- data/lib/markbridge/parsers/html/handlers/simple_handler.rb +26 -0
- data/lib/markbridge/parsers/html/handlers/url_handler.rb +27 -0
- data/lib/markbridge/parsers/html/parser.rb +113 -0
- data/lib/markbridge/parsers/html.rb +30 -0
- data/lib/markbridge/parsers/media_wiki/inline_parser.rb +332 -0
- data/lib/markbridge/parsers/media_wiki/parser.rb +279 -0
- data/lib/markbridge/parsers/media_wiki.rb +15 -0
- data/lib/markbridge/parsers/text_formatter/handler_registry.rb +130 -0
- data/lib/markbridge/parsers/text_formatter/handlers/attachment_handler.rb +33 -0
- data/lib/markbridge/parsers/text_formatter/handlers/attribute_handler.rb +40 -0
- data/lib/markbridge/parsers/text_formatter/handlers/base_handler.rb +45 -0
- data/lib/markbridge/parsers/text_formatter/handlers/code_handler.rb +28 -0
- data/lib/markbridge/parsers/text_formatter/handlers/email_handler.rb +27 -0
- data/lib/markbridge/parsers/text_formatter/handlers/image_handler.rb +32 -0
- data/lib/markbridge/parsers/text_formatter/handlers/list_handler.rb +31 -0
- data/lib/markbridge/parsers/text_formatter/handlers/quote_handler.rb +33 -0
- data/lib/markbridge/parsers/text_formatter/handlers/simple_handler.rb +37 -0
- data/lib/markbridge/parsers/text_formatter/handlers/url_handler.rb +29 -0
- data/lib/markbridge/parsers/text_formatter/parser.rb +132 -0
- data/lib/markbridge/parsers/text_formatter.rb +31 -0
- data/lib/markbridge/processors/discourse_markdown/code_block_tracker.rb +199 -0
- data/lib/markbridge/processors/discourse_markdown/detectors/base.rb +57 -0
- data/lib/markbridge/processors/discourse_markdown/detectors/event.rb +73 -0
- data/lib/markbridge/processors/discourse_markdown/detectors/mention.rb +57 -0
- data/lib/markbridge/processors/discourse_markdown/detectors/poll.rb +90 -0
- data/lib/markbridge/processors/discourse_markdown/detectors/upload.rb +123 -0
- data/lib/markbridge/processors/discourse_markdown/scanner.rb +199 -0
- data/lib/markbridge/processors/discourse_markdown.rb +16 -0
- data/lib/markbridge/processors.rb +8 -0
- data/lib/markbridge/renderers/discourse/builders/list_item_builder.rb +83 -0
- data/lib/markbridge/renderers/discourse/markdown_escaper.rb +468 -0
- data/lib/markbridge/renderers/discourse/render_context.rb +80 -0
- data/lib/markbridge/renderers/discourse/renderer.rb +63 -0
- data/lib/markbridge/renderers/discourse/rendering_interface.rb +86 -0
- data/lib/markbridge/renderers/discourse/tag.rb +29 -0
- data/lib/markbridge/renderers/discourse/tag_library.rb +67 -0
- data/lib/markbridge/renderers/discourse/tags/align_tag.rb +24 -0
- data/lib/markbridge/renderers/discourse/tags/attachment_tag.rb +46 -0
- data/lib/markbridge/renderers/discourse/tags/bold_tag.rb +18 -0
- data/lib/markbridge/renderers/discourse/tags/code_tag.rb +54 -0
- data/lib/markbridge/renderers/discourse/tags/color_tag.rb +27 -0
- data/lib/markbridge/renderers/discourse/tags/email_tag.rb +24 -0
- data/lib/markbridge/renderers/discourse/tags/event_tag.rb +49 -0
- data/lib/markbridge/renderers/discourse/tags/heading_tag.rb +21 -0
- data/lib/markbridge/renderers/discourse/tags/horizontal_rule_tag.rb +16 -0
- data/lib/markbridge/renderers/discourse/tags/image_tag.rb +29 -0
- data/lib/markbridge/renderers/discourse/tags/italic_tag.rb +18 -0
- data/lib/markbridge/renderers/discourse/tags/line_break_tag.rb +16 -0
- data/lib/markbridge/renderers/discourse/tags/list_item_tag.rb +87 -0
- data/lib/markbridge/renderers/discourse/tags/list_tag.rb +39 -0
- data/lib/markbridge/renderers/discourse/tags/mention_tag.rb +34 -0
- data/lib/markbridge/renderers/discourse/tags/paragraph_tag.rb +21 -0
- data/lib/markbridge/renderers/discourse/tags/poll_tag.rb +51 -0
- data/lib/markbridge/renderers/discourse/tags/quote_tag.rb +32 -0
- data/lib/markbridge/renderers/discourse/tags/size_tag.rb +27 -0
- data/lib/markbridge/renderers/discourse/tags/spoiler_tag.rb +24 -0
- data/lib/markbridge/renderers/discourse/tags/strikethrough_tag.rb +18 -0
- data/lib/markbridge/renderers/discourse/tags/subscript_tag.rb +19 -0
- data/lib/markbridge/renderers/discourse/tags/superscript_tag.rb +19 -0
- data/lib/markbridge/renderers/discourse/tags/underline_tag.rb +19 -0
- data/lib/markbridge/renderers/discourse/tags/upload_tag.rb +80 -0
- data/lib/markbridge/renderers/discourse/tags/url_tag.rb +24 -0
- data/lib/markbridge/renderers/discourse.rb +50 -0
- data/lib/markbridge/version.rb +5 -0
- data/lib/markbridge.rb +201 -0
- metadata +186 -0
|
@@ -0,0 +1,468 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Markbridge
|
|
4
|
+
module Renderers
|
|
5
|
+
module Discourse
|
|
6
|
+
# Escapes text to prevent interpretation as Markdown formatting.
|
|
7
|
+
#
|
|
8
|
+
# Design principles:
|
|
9
|
+
# - No false negatives: all potentially special sequences MUST be escaped
|
|
10
|
+
# - False positives OK: over-escaping is acceptable for safety
|
|
11
|
+
# - Autolinks preserved: <https://...>, <mailto:...>, and <email@domain> remain functional
|
|
12
|
+
# - HTML escaped: tags, processing instructions, and SGML declarations are neutralized
|
|
13
|
+
# - Performance: minimal allocations, byte-level processing, early returns
|
|
14
|
+
# - Discourse-compatible: handles ndash conversion, unlimited ordered list numbers
|
|
15
|
+
#
|
|
16
|
+
# Optimized for Ruby 3.3+ with YJIT. Key optimizations:
|
|
17
|
+
# - Fast path returns original string for plain text (no allocations)
|
|
18
|
+
# - Pre-allocated result buffers with estimated capacity
|
|
19
|
+
# - Byte-level processing for inline escaping (YJIT-friendly tight loops)
|
|
20
|
+
# - Simplified escaping rules: [ breaks links, so ] doesn't need escaping
|
|
21
|
+
#
|
|
22
|
+
# @example Basic escaping
|
|
23
|
+
# escaper = Markbridge::Renderers::Discourse::MarkdownEscaper.new
|
|
24
|
+
# escaper.escape("# Heading") # => "\\# Heading"
|
|
25
|
+
# escaper.escape("*emphasis*") # => "\\*emphasis\\*"
|
|
26
|
+
# escaper.escape("foo -- bar") # => "foo \\-\\- bar"
|
|
27
|
+
#
|
|
28
|
+
# @example HTML is escaped
|
|
29
|
+
# escaper.escape("<div>content</div>") # => "\\<div>content\\</div>"
|
|
30
|
+
# escaper.escape("<?php echo 1; ?>") # => "\\<?php echo 1; ?>"
|
|
31
|
+
#
|
|
32
|
+
class MarkdownEscaper
|
|
33
|
+
# Builds an escaper.
#
# @param escape_hard_line_breaks [Boolean] when true, #escape removes the
#   spaces that sit directly before a newline so they cannot produce a
#   CommonMark hard line break (<br/>). Defaults to false because Discourse
#   has trailing-space hard line breaks disabled by default.
def initialize(escape_hard_line_breaks: false)
  # Only consulted by #escape.
  @escape_hard_line_breaks = escape_hard_line_breaks
end
|
|
40
|
+
|
|
41
|
+
# Fast-path check: any character that might need escaping
|
|
42
|
+
# Only includes characters we actually escape (removed ], {, }, ^)
|
|
43
|
+
# > is needed for blockquote detection at line start
|
|
44
|
+
MAYBE_SPECIAL = /[\\`*_\[#+\-.!<>&|~=>)]/
|
|
45
|
+
|
|
46
|
+
# Check for indented code on any line
|
|
47
|
+
# Matches: 4+ spaces, tab, or space+tab combinations that reach column 4+
|
|
48
|
+
MAYBE_INDENTED_CODE = /(?:^|\n)(?: {4}|\t| {1,3}\t)/
|
|
49
|
+
|
|
50
|
+
# Block-level patterns
|
|
51
|
+
ATX_HEADING = /\A\#{1,6}(?=[ \t]|$)/
|
|
52
|
+
BLOCK_QUOTE = /\A>/
|
|
53
|
+
# List markers followed by space, tab, or end of line
|
|
54
|
+
BULLET_LIST = /\A[-+*](?=[ \t]|$)/
|
|
55
|
+
ORDERED_LIST = /\A(\d+)([.)])(?=[ \t])/
|
|
56
|
+
THEMATIC_BREAK_DASH = /\A(?:-[ \t]*){3,}$/
|
|
57
|
+
THEMATIC_BREAK_STAR = /\A(?:\*[ \t]*){3,}$/
|
|
58
|
+
THEMATIC_BREAK_UNDERSCORE = /\A(?:_[ \t]*){3,}$/
|
|
59
|
+
FENCED_CODE_BACKTICK = /\A`{3,}[^`]*$/
|
|
60
|
+
FENCED_CODE_TILDE = /\A~{3,}/
|
|
61
|
+
SETEXT_UNDERLINE_EQUALS = /\A=+[ \t]*$/
|
|
62
|
+
SETEXT_UNDERLINE_DASH = /\A-+[ \t]*$/
|
|
63
|
+
# Indented code: 4+ spaces, tab at start, or space+tab reaching column 4+
|
|
64
|
+
INDENTED_CODE = /\A(?: {4}|\t| {1,3}\t)/
|
|
65
|
+
|
|
66
|
+
# Inline quick-check pattern (includes < for HTML tag escaping)
|
|
67
|
+
INLINE_SPECIAL = /[\\*_`\[!|<&~-]/
|
|
68
|
+
|
|
69
|
+
# Entity reference pattern (we escape these to prevent conversion)
|
|
70
|
+
ENTITY_REF = /\A&(?:\#[xX][0-9a-fA-F]{1,6}|\#[0-9]{1,7}|[a-zA-Z][a-zA-Z0-9]{0,31});/
|
|
71
|
+
|
|
72
|
+
# HTML tag pattern (we escape these, but NOT autolinks)
|
|
73
|
+
# Handles quoted attributes which can contain > characters
|
|
74
|
+
# Attribute patterns: name="value" | name='value' | name=value | name
|
|
75
|
+
HTML_ATTR = /(?:\s+[a-zA-Z_:][a-zA-Z0-9_.:-]*(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s"'=<>`]+))?)/
|
|
76
|
+
HTML_TAG = %r{\A</?[a-zA-Z][a-zA-Z0-9-]*#{HTML_ATTR}*\s*/?>}
|
|
77
|
+
|
|
78
|
+
# Autolink pattern - we pass these through entirely unchanged
|
|
79
|
+
# Matches <http://...>, <https://...>, <mailto:...>, and email addresses
|
|
80
|
+
AUTOLINK =
|
|
81
|
+
%r{\A<(?:https?://|mailto:)[^>\s]*>|\A<[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*>}i
|
|
82
|
+
|
|
83
|
+
# Match HTML-like constructs that need escaping:
|
|
84
|
+
# - Processing instructions: <?php, <?xml, etc.
|
|
85
|
+
# - SGML declarations: <!DOCTYPE, <!ELEMENT, <![CDATA[, <!--, etc.
|
|
86
|
+
# - Incomplete/multi-line HTML tags: <div followed by attributes on next line
|
|
87
|
+
# - Custom elements: <my-component>, <responsive-image>
|
|
88
|
+
# The (?:[\s/]|$) ensures we don't match comparisons like "a < b"
|
|
89
|
+
HTML_TAG_START = %r{\A<(?:[?!]|/?\s*[a-zA-Z][a-zA-Z0-9-]*(?:[\s/]|$))}
|
|
90
|
+
|
|
91
|
+
# Byte constants for inline processing
|
|
92
|
+
BACKSLASH = 92 # \
|
|
93
|
+
BANG = 33 # !
|
|
94
|
+
HASH = 35 # #
|
|
95
|
+
AMP = 38 # &
|
|
96
|
+
STAR = 42 # *
|
|
97
|
+
PLUS = 43 # +
|
|
98
|
+
DASH = 45 # -
|
|
99
|
+
LT = 60 # <
|
|
100
|
+
EQUALS = 61 # =
|
|
101
|
+
GT = 62 # >
|
|
102
|
+
BRACKET_OPEN = 91 # [
|
|
103
|
+
UNDERSCORE = 95 # _
|
|
104
|
+
BACKTICK = 96 # `
|
|
105
|
+
PIPE = 124 # |
|
|
106
|
+
TILDE = 126 # ~
|
|
107
|
+
SPACE = 32
|
|
108
|
+
TAB = 9
|
|
109
|
+
DIGIT_0 = 48
|
|
110
|
+
DIGIT_9 = 57
|
|
111
|
+
|
|
112
|
+
# Escapes Markdown-significant sequences in +text+.
#
# Covers block-level constructs (headings, lists, code blocks, HTML blocks)
# as well as inline formatting (emphasis, code spans, links, inline HTML).
# Autolinks (<https://...>, <email@domain>) are deliberately left intact.
#
# @param text [String, nil] the text to escape
# @return [String] the escaped text; an empty string for nil input, and the
#   very same string object when nothing in it can be special
# @note Multi-line HTML tags and blocks are neutralized by escaping the opening <
def escape(text)
  return "".freeze if text.nil?
  return text if text.empty?

  # Optional hard-line-break neutralization: drop every run of spaces that
  # sits directly before a newline.
  if @escape_hard_line_breaks && text.include?(" \n")
    text = text.gsub(/ +\n/, "\n")
  end

  # Fast path: plain text is handed back untouched.
  needs_escaping = MAYBE_SPECIAL.match?(text) || MAYBE_INDENTED_CODE.match?(text)
  needs_escaping ? escape_text(text) : text
end
|
|
132
|
+
|
|
133
|
+
private
|
|
134
|
+
|
|
135
|
+
# Escapes +text+ line by line and re-joins the result with "\n".
#
# Each line is escaped with the knowledge of whether the previous raw line
# counted as paragraph text (see #paragraph_line?), which #escape_line needs
# for setext-underline detection.
#
# @param text [String] text that already failed the plain-text fast path
# @return [String] the escaped text
def escape_text(text)
  lines = text.split("\n", -1) # -1 keeps trailing empty lines
  return escape_line(lines.first, false) if lines.size == 1

  # Escaping only ever grows the text; reserve roughly a third extra.
  buffer = String.new(capacity: text.bytesize + text.bytesize / 3, encoding: text.encoding)
  after_paragraph = false

  lines.each_with_index do |line, index|
    buffer << "\n" unless index.zero?
    buffer << escape_line(line, after_paragraph)
    after_paragraph = paragraph_line?(line)
  end

  buffer
end
|
|
155
|
+
|
|
156
|
+
# Escapes a single line (no newline inside).
#
# @param line [String] the raw line
# @param prev_was_paragraph [Boolean] whether the preceding line counted as
#   paragraph text; forwarded to #escape_block_level
# @return [String] the escaped line, in the encoding of +line+
def escape_line(line, prev_was_paragraph)
  return line if line.empty?

  # Lines indented far enough to form a code block get their own treatment.
  return escape_indented_code(line) if INDENTED_CODE.match?(line)

  # Measure the 0-3 leading spaces.
  indent = 0
  indent += 1 while indent < 3 && indent < line.length && line.getbyte(indent) == SPACE

  # Nothing but spaces: leave untouched.
  return line if indent >= line.length

  body = indent.zero? ? line : line[indent..]

  # Block-level escaping reports whether it already escaped the inline part.
  escaped, inline_done = escape_block_level(body, prev_was_paragraph)
  escaped = escape_inline(escaped) unless inline_done

  if indent.zero?
    # Keep the encoding of the input line.
    escaped.is_a?(String) ? escaped.force_encoding(line.encoding) : escaped
  else
    # Put the original indent back in front of the escaped body.
    String.new(encoding: line.encoding) << line[0, indent] << escaped
  end
end
|
|
188
|
+
|
|
189
|
+
# Non-breaking space - used to preserve visual indentation without
|
|
190
|
+
# triggering code blocks or block-level markdown
|
|
191
|
+
NBSP = "\u00A0"
|
|
192
|
+
|
|
193
|
+
# Neutralizes a line whose leading whitespace would start an indented code block.
#
# The leading spaces/tabs are swapped for NBSP characters (one per space,
# four per tab) and the remainder of the line is inline-escaped, since it is
# no longer shielded by a code block.
#
# @param line [String] a line beginning with spaces and/or tabs
# @return [String] the rewritten line; +line+ itself when it has no leading
#   whitespace or consists of whitespace only
def escape_indented_code(line)
  # Length of the leading run of spaces and tabs.
  ws = 0
  while ws < line.length
    byte = line.getbyte(ws)
    break unless byte == SPACE || byte == TAB

    ws += 1
  end

  # No leading whitespace, or whitespace-only line: nothing to rewrite.
  return line if ws.zero? || ws >= line.length

  padding =
    line[0, ws].each_char.each_with_object(String.new(encoding: line.encoding)) do |ch, acc|
      acc << (ch == "\t" ? (NBSP * 4) : NBSP)
    end

  "#{padding}#{escape_inline(line[ws..])}"
end
|
|
217
|
+
|
|
218
|
+
# Escapes a block-level marker at the start of +content+, if there is one.
#
# +content+ is a line with its 0-3 space indent already removed. The first
# byte selects the candidate construct; when the line really is such a
# construct the marker is backslash-escaped and the rest of the line is
# inline-escaped here.
#
# Fence lines (``` / ~~~) get every fence character escaped AND their info
# string inline-escaped: once the fence is neutralized the whole line is
# ordinary paragraph text, so emphasis, HTML, etc. in it would otherwise
# still render. A bare ordered-list marker ("1." / "1)" with nothing after
# it) is escaped as well, because CommonMark treats it as an empty list item.
#
# @param content [String] non-empty line content without leading indent
# @param prev_was_paragraph [Boolean] whether the previous line was paragraph
#   text; only then can a run of "=" or "-" be a setext underline
# @return [Array(String, Boolean)] the (possibly escaped) content and a flag
#   telling the caller that inline escaping has already been applied
def escape_block_level(content, prev_was_paragraph)
  case content.getbyte(0)
  when HASH
    return "\\##{escape_inline(content[1..])}", true if ATX_HEADING.match?(content)
  when GT
    return "\\>#{escape_inline(content[1..])}", true
  when DASH
    if THEMATIC_BREAK_DASH.match?(content) ||
         (prev_was_paragraph && SETEXT_UNDERLINE_DASH.match?(content))
      return escape_all_chars(content, DASH, "\\-"), true
    end
    return "\\-#{escape_inline(content[1..])}", true if BULLET_LIST.match?(content)
  when PLUS
    return "\\+#{escape_inline(content[1..])}", true if BULLET_LIST.match?(content)
  when STAR
    return escape_all_chars(content, STAR, "\\*"), true if THEMATIC_BREAK_STAR.match?(content)
    return "\\*#{escape_inline(content[1..])}", true if BULLET_LIST.match?(content)
  when UNDERSCORE
    if THEMATIC_BREAK_UNDERSCORE.match?(content)
      return escape_all_chars(content, UNDERSCORE, "\\_"), true
    end
  when EQUALS
    if prev_was_paragraph && SETEXT_UNDERLINE_EQUALS.match?(content)
      return escape_all_chars(content, EQUALS, "\\="), true
    end
  when BACKTICK
    # escape_inline escapes every backtick (so ```` becomes \`\`\`\`, never
    # \` + ```) and also takes care of the info string.
    return escape_inline(content), true if FENCED_CODE_BACKTICK.match?(content)
  when TILDE
    if FENCED_CODE_TILDE.match?(content)
      # Escape the whole leading run of tildes, then inline-escape the rest.
      run = 0
      run += 1 while content.getbyte(run) == TILDE
      return "#{"\\~" * run}#{escape_inline(content[run..])}", true
    end
  when BRACKET_OPEN
    return "\\[#{escape_inline(content[1..])}", true
  when PIPE
    return "\\|#{escape_inline(content[1..])}", true
  when DIGIT_0..DIGIT_9
    # Second pattern: marker directly at end of line (empty list item).
    if (m = ORDERED_LIST.match(content) || /\A(\d+)([.)])\z/.match(content))
      rest = content[m[0].length..]
      return "#{m[1]}\\#{m[2]}#{escape_inline(rest)}", true
    end
  end

  [content, false]
end
|
|
270
|
+
|
|
271
|
+
# Replaces every occurrence of one ASCII byte in +str+ with +escaped+.
#
# The copy is done on raw bytes in a binary buffer and re-tagged with the
# source encoding at the end. Appending an Integer to a UTF-8 string would
# treat it as a code point, which re-encodes each byte >= 128 of a
# multi-byte character and corrupts the text.
#
# @param str [String] the text to process
# @param byte_val [Integer] the ASCII byte to replace (e.g. 45 for "-")
# @param escaped [String] ASCII-only replacement text (e.g. "\\-")
# @return [String] a new string in the encoding of +str+
def escape_all_chars(str, byte_val, escaped)
  result = String.new(capacity: str.bytesize * 2, encoding: Encoding::BINARY)
  str.each_byte do |b|
    if b == byte_val
      result << escaped
    else
      result << b # raw byte: the buffer is binary
    end
  end
  result.force_encoding(str.encoding)
end
|
|
282
|
+
|
|
283
|
+
# Escapes inline Markdown in +content+ by walking it byte by byte.
#
# Rules applied, by leading byte:
# - backslash: doubled when followed by ASCII punctuation or at end of input
# - * _ ` [ |: always backslash-escaped
# - -: escaped only in runs of two or more (every dash of the run)
# - ~~ and ![: escaped as a pair; a lone ~ or ! is copied
# - <: autolinks are copied verbatim; complete HTML tags get a leading
#   backslash (and their backticks escaped); other tag-like starts get "\<"
# - &: entity references get a leading backslash
# Everything else is copied, multi-byte characters as whole byte slices.
#
# @param content [String] text of a single line (or part of one)
# @return [String] +content+ itself when it holds no candidate byte,
#   otherwise a new escaped string in the same encoding
def escape_inline(content)
  return content unless INLINE_SPECIAL.match?(content)

  total = content.bytesize
  out = String.new(capacity: total + total / 4, encoding: content.encoding)
  pos = 0

  while pos < total
    byte = content.getbyte(pos)

    case byte
    when BACKSLASH
      following = content.getbyte(pos + 1) # nil when the backslash is last
      # Only the backslash is consumed; the next byte is handled on its own.
      out << (following.nil? || ascii_punctuation?(following) ? "\\\\" : "\\")
      pos += 1
    when STAR, UNDERSCORE, BACKTICK, BRACKET_OPEN, PIPE
      out << BACKSLASH << byte
      pos += 1
    when DASH
      # Find the end of the dash run; runs of 2+ are escaped dash by dash
      # (Discourse ndash prevention), a single dash is copied.
      stop = pos + 1
      stop += 1 while content.getbyte(stop) == DASH
      if stop - pos > 1
        out << "\\-" * (stop - pos)
      else
        out << byte
      end
      pos = stop
    when TILDE
      if content.getbyte(pos + 1) == TILDE
        out << "\\~\\~"
        pos += 2
      else
        out << byte
        pos += 1
      end
    when BANG
      if content.getbyte(pos + 1) == BRACKET_OPEN
        out << "\\!\\["
        pos += 2
      else
        out << byte
        pos += 1
      end
    when LT
      tail = content.byteslice(pos, total - pos)
      if (link = AUTOLINK.match(tail))
        # Autolinks pass through entirely unchanged.
        out << link[0]
        pos += link[0].bytesize
      elsif (tag = HTML_TAG.match(tail))
        # Complete tag: keep it readable, but neutralize it and any backticks in it.
        out << "\\" << tag[0].gsub("`") { "\\`" }
        pos += tag[0].bytesize
      elsif HTML_TAG_START.match?(tail)
        # Processing instruction, declaration, or an unfinished tag.
        out << "\\<"
        pos += 1
      else
        out << byte
        pos += 1
      end
    when AMP
      tail = content.byteslice(pos, total - pos)
      if (entity = ENTITY_REF.match(tail))
        out << "\\" << entity[0]
        pos += entity[0].bytesize
      else
        out << byte
        pos += 1
      end
    else
      if byte < 128
        out << byte
        pos += 1
      else
        # Copy the whole multi-byte character, clamped to the end of input.
        stop = [pos + utf8_char_length(byte), total].min
        out << content.byteslice(pos, stop - pos)
        pos = stop
      end
    end
  end

  out
end
|
|
406
|
+
|
|
407
|
+
# Tells whether +byte+ is an ASCII punctuation character.
#
# @param byte [Integer] a byte value
# @return [Boolean] true for !"#$%&'()*+,-./ :;<=>?@ [\]^_` and {|}~
def ascii_punctuation?(byte)
  case byte
  when 33..47, 58..64, 91..96, 123..126 then true
  else false
  end
end
|
|
411
|
+
|
|
412
|
+
# Number of bytes to copy for a character that starts with +first_byte+,
# decided purely by thresholds on that byte.
#
# @param first_byte [Integer] the leading byte
# @return [Integer] 4 for >= 240, 3 for >= 224, 2 for >= 192, otherwise 1
def utf8_char_length(first_byte)
  return 4 if first_byte >= 240
  return 3 if first_byte >= 224
  return 2 if first_byte >= 192

  1
end
|
|
423
|
+
|
|
424
|
+
# Tells whether +line+ will be paragraph text in the escaped output, i.e.
# whether a following line of "=" or "-" could turn it into a setext heading.
#
# What matters is how the line reads AFTER escaping, not what it looked like
# before: a block marker that has been backslash-escaped ("\> quote",
# "\# title", "1\. item") and an indented line whose indent was replaced by
# NBSP are all ordinary paragraph text. Hence every line that contains
# anything besides spaces and tabs counts; only blank lines do not.
# Classifying escaped block lines as non-paragraph would leave a following
# "===" unescaped and let it render as a heading.
#
# @param line [String] the raw (unescaped) line
# @return [Boolean] true unless the line is empty or whitespace-only
def paragraph_line?(line)
  line.each_byte { |byte| return true unless byte == SPACE || byte == TAB }
  false
end
|
|
465
|
+
end
|
|
466
|
+
end
|
|
467
|
+
end
|
|
468
|
+
end
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Markbridge
|
|
4
|
+
module Renderers
|
|
5
|
+
module Discourse
|
|
6
|
+
# Immutable context for rendering that wraps the parent chain.
# Provides query methods to ask about parent elements without
# the renderer knowing about specific element types.
#
# Parents are indexed by their exact class (element.class) in a hash, so
# lookups are a hash access instead of a scan over the chain. Lookups never
# modify the index, and the array handed to the constructor is never frozen
# or otherwise modified.
class RenderContext
  attr_reader :parents, :depth

  # @param parents [Array<AST::Element>] parent chain, outermost first
  # @param parent_cache [Hash{Class => Array<AST::Element>}, nil] prebuilt
  #   index for +parents+; built from +parents+ when omitted
  def initialize(parents = [], parent_cache: nil)
    # Freeze a private copy rather than the caller's own array.
    @parents = parents.frozen? ? parents : parents.dup.freeze
    @depth = @parents.size
    @parent_cache = parent_cache || build_cache(@parents)
  end

  # Create new context with element added to parent chain.
  # The index is copied and extended instead of being rebuilt from scratch.
  # @param element [AST::Element]
  # @return [RenderContext]
  def with_parent(element)
    klass = element.class
    new_cache = @parent_cache.dup
    new_cache[klass] = [*@parent_cache.fetch(klass, nil), element].freeze

    self.class.new([*@parents, element].freeze, parent_cache: new_cache)
  end

  # Find closest parent whose class is exactly +klass+.
  # @param klass [Class]
  # @return [AST::Element, nil]
  def find_parent(klass)
    # fetch bypasses any default proc, so a miss never inserts a key.
    @parent_cache.fetch(klass, nil)&.last
  end

  # Count parents whose class is exactly +klass+.
  # @param klass [Class]
  # @return [Integer]
  def count_parents(klass)
    @parent_cache.fetch(klass, nil)&.size || 0
  end

  # Check if a parent whose class is exactly +klass+ exists.
  # @param klass [Class]
  # @return [Boolean]
  def has_parent?(klass)
    count_parents(klass).positive?
  end

  # Check if we're at the root (no parents)
  # @return [Boolean]
  def root?
    @depth.zero?
  end

  private

  # Build the index from a parents array: parents grouped by class, each
  # group in chain order.
  # @param parents [Array<AST::Element>]
  # @return [Hash{Class => Array<AST::Element>}]
  def build_cache(parents)
    parents.group_by(&:class)
  end
end
|
|
78
|
+
end
|
|
79
|
+
end
|
|
80
|
+
end
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Markbridge
|
|
4
|
+
module Renderers
|
|
5
|
+
module Discourse
|
|
6
|
+
# Renders AST to Discourse-flavored Markdown in-memory.
#
# Node classes registered in the tag library are rendered by their tag;
# everything else falls back to the built-in handling in #render.
class Renderer
  # @param tag_library [#[], nil] maps a node class to its tag; TagLibrary.default when nil
  # @param escaper [#escape, nil] escapes plain text; a new MarkdownEscaper when nil
  def initialize(tag_library: nil, escaper: nil)
    @tag_library = tag_library || TagLibrary.default
    @escaper = escaper || MarkdownEscaper.new
  end

  # Render a node to Markdown
  # @param node [AST::Node]
  # @param context [RenderContext] rendering context with parent chain
  # @return [String]
  def render(node, context: RenderContext.new)
    # The outermost call owns the interface cache and drops it when done;
    # nested calls made while rendering reuse it.
    outermost = @interface_cache.nil?
    @interface_cache = {} if outermost

    tag = @tag_library[node.class]
    return tag.render(node, interface_for(context)) if tag

    render_untagged(node, context)
  ensure
    @interface_cache = nil if outermost
  end

  # Render all children of a node
  # @param node [AST::Element]
  # @param context [RenderContext] rendering context
  # @return [String]
  def render_children(node, context:)
    rendered = node.children.map { |child| render(child, context: context) }
    rendered.join
  end

  private

  # Fallback rendering for nodes without a registered tag.
  def render_untagged(node, context)
    if node.is_a?(AST::Document) || node.is_a?(AST::Element)
      render_children(node, context: context)
    elsif node.is_a?(AST::MarkdownText)
      # Already Markdown: pass through as-is.
      node.text
    elsif node.is_a?(AST::Text)
      # Plain text is escaped unless a Code parent is on the chain.
      context.has_parent?(AST::Code) ? node.text : @escaper.escape(node.text)
    else
      ""
    end
  end

  # One RenderingInterface per context object for the duration of a render.
  def interface_for(context)
    key = context.object_id
    @interface_cache.fetch(key) do
      @interface_cache[key] = RenderingInterface.new(self, context)
    end
  end
end
|
|
61
|
+
end
|
|
62
|
+
end
|
|
63
|
+
end
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Markbridge
  module Renderers
    module Discourse
      # Interface that tags use for rendering operations.
      # Decouples tags from renderer implementation details: rendering calls
      # are forwarded to the renderer, parent-chain queries to the context.
      class RenderingInterface
        # HTML element used instead of a Markdown marker when the content
        # already contains that marker.
        HTML_FALLBACKS = { "**" => "strong", "*" => "em", "~~" => "s" }.freeze

        # @return [RenderContext] the context this interface was created for
        attr_reader :context

        # @param renderer [#render, #render_children]
        # @param context [RenderContext]
        def initialize(renderer, context)
          @renderer = renderer
          @context = context
        end

        # Render one node through the renderer.
        # @param node [AST::Node]
        # @param context [RenderContext] defaults to this interface's context
        # @return [String]
        def render_node(node, context: @context)
          @renderer.render(node, context:)
        end

        # Render all children of an element through the renderer.
        # @param element [AST::Element]
        # @param context [RenderContext] defaults to this interface's context
        # @return [String]
        def render_children(element, context: @context)
          @renderer.render_children(element, context:)
        end

        # Forwarded to the context.
        # @param element [AST::Element]
        # @return [Object] whatever the context's with_parent returns
        def with_parent(element)
          @context.with_parent(element)
        end

        # Forwarded to the context.
        # @param klass [Class]
        def find_parent(klass)
          @context.find_parent(klass)
        end

        # Forwarded to the context.
        # @param klass [Class]
        def count_parents(klass)
          @context.count_parents(klass)
        end

        # Forwarded to the context.
        # @param klass [Class]
        def has_parent?(klass)
          @context.has_parent?(klass)
        end

        # Forwarded to the context.
        def root?
          @context.root?
        end

        # Check if element should be rendered in block context.
        # Lists and horizontal rules always are; any other element only when
        # one of its direct text children contains a newline.
        # @param node [AST::Node] container node or leaf like HorizontalRule
        # @return [Boolean]
        def block_context?(node)
          return true if node.is_a?(AST::List) || node.is_a?(AST::HorizontalRule)
          return false unless node.is_a?(AST::Element)

          node.children.any? { |child| child.is_a?(AST::Text) && child.text.include?("\n") }
        end

        # Wrap inline content with markers.
        # Blank content is returned untouched. If the content already contains
        # a marker and an HTML equivalent is known, the content is wrapped in
        # that HTML element instead. Otherwise the markers are placed around
        # the content with leading and trailing whitespace kept outside them.
        # @param content [String]
        # @param open_marker [String]
        # @param close_marker [String, nil] defaults to +open_marker+
        # @return [String]
        def wrap_inline(content, open_marker, close_marker = nil)
          close_marker ||= open_marker
          return content if content.strip.empty?

          if content.include?(open_marker) || content.include?(close_marker)
            html_tag = HTML_FALLBACKS[open_marker]
            return "<#{html_tag}>#{content}</#{html_tag}>" if html_tag
          end

          # Anchor on the whole string (\A / \z): with ^ / $ only the first
          # line of multi-line content would end up between the markers.
          content.sub(/\A(\s*)(.+?)(\s*)\z/m) do
            match = Regexp.last_match
            "#{match[1]}#{open_marker}#{match[2]}#{close_marker}#{match[3]}"
          end
        end
      end
    end
  end
end
|