markbridge 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE.txt +1 -1
- data/lib/markbridge/ast/details.rb +24 -0
- data/lib/markbridge/ast/element.rb +63 -0
- data/lib/markbridge/ast.rb +1 -0
- data/lib/markbridge/conversion.rb +40 -0
- data/lib/markbridge/parse.rb +20 -0
- data/lib/markbridge/parsers/bbcode/handler_registry.rb +25 -2
- data/lib/markbridge/parsers/bbcode/handlers/raw_handler.rb +13 -2
- data/lib/markbridge/parsers/html/handler_registry.rb +97 -17
- data/lib/markbridge/parsers/html/handlers/self_closing_handler.rb +26 -0
- data/lib/markbridge/parsers/html/handlers/span_handler.rb +74 -0
- data/lib/markbridge/parsers/html/parser.rb +88 -18
- data/lib/markbridge/parsers/html.rb +2 -0
- data/lib/markbridge/parsers/media_wiki/inline_parser.rb +21 -8
- data/lib/markbridge/parsers/media_wiki/parser.rb +13 -5
- data/lib/markbridge/parsers/text_formatter/handler_registry.rb +27 -4
- data/lib/markbridge/parsers/text_formatter/handlers/attachment_handler.rb +1 -1
- data/lib/markbridge/parsers/text_formatter/handlers/attribute_handler.rb +1 -1
- data/lib/markbridge/parsers/text_formatter/handlers/base_handler.rb +1 -1
- data/lib/markbridge/parsers/text_formatter/handlers/code_handler.rb +1 -1
- data/lib/markbridge/parsers/text_formatter/handlers/email_handler.rb +1 -1
- data/lib/markbridge/parsers/text_formatter/handlers/image_handler.rb +1 -1
- data/lib/markbridge/parsers/text_formatter/handlers/list_handler.rb +1 -1
- data/lib/markbridge/parsers/text_formatter/handlers/quote_handler.rb +1 -1
- data/lib/markbridge/parsers/text_formatter/handlers/simple_handler.rb +1 -1
- data/lib/markbridge/parsers/text_formatter/handlers/table_cell_handler.rb +1 -1
- data/lib/markbridge/parsers/text_formatter/handlers/url_handler.rb +1 -1
- data/lib/markbridge/parsers/text_formatter/parser.rb +17 -3
- data/lib/markbridge/renderers/discourse/identity_escaper.rb +37 -0
- data/lib/markbridge/renderers/discourse/markdown_escaper.rb +83 -8
- data/lib/markbridge/renderers/discourse/postprocessor.rb +53 -0
- data/lib/markbridge/renderers/discourse/render_context.rb +14 -40
- data/lib/markbridge/renderers/discourse/renderer.rb +15 -5
- data/lib/markbridge/renderers/discourse/rendering_interface.rb +4 -3
- data/lib/markbridge/renderers/discourse/tag_library.rb +42 -2
- data/lib/markbridge/renderers/discourse/tags/align_tag.rb +2 -2
- data/lib/markbridge/renderers/discourse/tags/code_tag.rb +5 -3
- data/lib/markbridge/renderers/discourse/tags/details_tag.rb +46 -0
- data/lib/markbridge/renderers/discourse/tags/heading_tag.rb +1 -1
- data/lib/markbridge/renderers/discourse/tags/paragraph_tag.rb +5 -2
- data/lib/markbridge/renderers/discourse/tags/quote_tag.rb +4 -3
- data/lib/markbridge/renderers/discourse/tags/underline_tag.rb +13 -0
- data/lib/markbridge/renderers/discourse.rb +3 -0
- data/lib/markbridge/version.rb +1 -1
- data/lib/markbridge.rb +274 -110
- metadata +9 -2
- data/lib/markbridge/configuration.rb +0 -11
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 1f501e9875d69ca60aa8fcf7d1e46ef5ce83d24b69da46b192121e00b7414919
|
|
4
|
+
data.tar.gz: db9a49e5d6b0c0c5f68f84109c153f619e59d1b07fbb5e695a6443ed099b1436
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 1f5b7bac5d2ee008db7a012040de5394a615811420fcd90a8a95064349a2cf074254c3310dffb27e93f7d0c51c06f47c16b1f361015a9fad61e8fb14380c0a19
|
|
7
|
+
data.tar.gz: de4b3625ef287981cec8a5c6683ccbbf88d7593af0cd5ca7798393f377dc930a20f3f1228b225d49880f5f0323ccfa8bb7957186b644cbefd55770e153948ade
|
data/LICENSE.txt
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
The MIT License (MIT)
|
|
2
2
|
|
|
3
|
-
Copyright (c) 2025
|
|
3
|
+
Copyright (c) 2025 Civilized Discourse Construction Kit, Inc.
|
|
4
4
|
|
|
5
5
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
6
|
of this software and associated documentation files (the "Software"), to deal
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Markbridge
|
|
4
|
+
module AST
|
|
5
|
+
# Represents a Discourse +[details=…]…[/details]+ collapsible section.
|
|
6
|
+
#
|
|
7
|
+
# Carries a +title+ string (used as the +summary+ text when the
|
|
8
|
+
# block renders) and any child nodes.
|
|
9
|
+
#
|
|
10
|
+
# @example
|
|
11
|
+
# block = AST::Details.new(title: "Show more")
|
|
12
|
+
# block << AST::Text.new("Hidden body")
|
|
13
|
+
class Details < Element
|
|
14
|
+
# @return [String, nil] the summary / collapsed-state label
|
|
15
|
+
attr_reader :title
|
|
16
|
+
|
|
17
|
+
# @param title [String, nil] optional summary text
|
|
18
|
+
def initialize(title: nil)
|
|
19
|
+
super()
|
|
20
|
+
@title = title
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
end
|
|
@@ -42,6 +42,69 @@ module Markbridge
|
|
|
42
42
|
|
|
43
43
|
self
|
|
44
44
|
end
|
|
45
|
+
|
|
46
|
+
# Depth-first pre-order traversal yielding every descendant node.
|
|
47
|
+
# Returns an +Enumerator+ when called without a block so it
|
|
48
|
+
# composes through +Enumerable+:
|
|
49
|
+
#
|
|
50
|
+
# document.each_descendant.select { |n| n.is_a?(AST::Url) }
|
|
51
|
+
#
|
|
52
|
+
# Iteration semantics: each Element snapshots its own +children+
|
|
53
|
+
# array at the moment iteration enters it, so replacing a child
|
|
54
|
+
# via {#replace_child} mid-walk is safe — descent uses the
|
|
55
|
+
# pre-replacement reference. Adding or removing siblings on an
|
|
56
|
+
# Element you are currently descending into is *not* guaranteed
|
|
57
|
+
# to be visible to the current walk.
|
|
58
|
+
#
|
|
59
|
+
# @yieldparam node [Node] each descendant in document order
|
|
60
|
+
# @return [Enumerator, Element] +Enumerator+ without a block, +self+ otherwise
|
|
61
|
+
def each_descendant(&block)
|
|
62
|
+
return enum_for(:each_descendant) unless block_given?
|
|
63
|
+
|
|
64
|
+
@children.dup.each do |child|
|
|
65
|
+
yield child
|
|
66
|
+
child.each_descendant(&block) if child.is_a?(Element)
|
|
67
|
+
end
|
|
68
|
+
self
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
# Array of descendant nodes, optionally filtered by class.
|
|
72
|
+
#
|
|
73
|
+
# document.descendants # every descendant
|
|
74
|
+
# document.descendants(AST::Url) # every Url descendant
|
|
75
|
+
#
|
|
76
|
+
# @param klass [Class, nil] when given, only descendants that
|
|
77
|
+
# +is_a?(klass)+ are returned
|
|
78
|
+
# @return [Array<Node>]
|
|
79
|
+
def descendants(klass = nil)
|
|
80
|
+
result = each_descendant.to_a
|
|
81
|
+
return result if klass.nil?
|
|
82
|
+
|
|
83
|
+
result.select { |node| node.is_a?(klass) }
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
# Replace a direct child of this Element with a different Node.
|
|
87
|
+
# Preserves the child's index — useful for AST-mutation passes
|
|
88
|
+
# that need to swap one Element type for another in place
|
|
89
|
+
# (e.g. wrapping trailing paragraphs in a +Details+ block).
|
|
90
|
+
#
|
|
91
|
+
# @param old_child [Node] the child to remove (matched by +==+ via {Array#index}; this is object identity unless the node class overrides +==+)
|
|
92
|
+
# @param new_child [Node] the replacement
|
|
93
|
+
# @return [Element] +self+
|
|
94
|
+
# @raise [ArgumentError] when +old_child+ is not currently a child of this Element
|
|
95
|
+
# @raise [TypeError] when +new_child+ is not a {Node}
|
|
96
|
+
def replace_child(old_child, new_child)
|
|
97
|
+
index = @children.index(old_child)
|
|
98
|
+
raise ArgumentError, "child not found in #{self.class}" if index.nil?
|
|
99
|
+
|
|
100
|
+
unless new_child.is_a?(Node)
|
|
101
|
+
actual = new_child.nil? ? "nil" : new_child.class
|
|
102
|
+
raise TypeError, "replace_child on #{self.class} expected a #{Node}, got #{actual}"
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
@children[index] = new_child
|
|
106
|
+
self
|
|
107
|
+
end
|
|
45
108
|
end
|
|
46
109
|
end
|
|
47
110
|
end
|
data/lib/markbridge/ast.rb
CHANGED
|
@@ -10,6 +10,7 @@ require_relative "ast/attachment"
|
|
|
10
10
|
require_relative "ast/bold"
|
|
11
11
|
require_relative "ast/code"
|
|
12
12
|
require_relative "ast/color"
|
|
13
|
+
require_relative "ast/details"
|
|
13
14
|
require_relative "ast/email"
|
|
14
15
|
require_relative "ast/heading"
|
|
15
16
|
require_relative "ast/horizontal_rule"
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Markbridge
|
|
4
|
+
# Result of a *_to_markdown / convert / render call.
|
|
5
|
+
#
|
|
6
|
+
# Wraps a {Parse} (the input-side fields: +ast+, +format+,
|
|
7
|
+
# +unknown_tags+, +diagnostics+) and adds the render-side outputs:
|
|
8
|
+
# +markdown+ and +errors+. The wrapped {Parse} is reachable via
|
|
9
|
+
# {#parsed}, and each of its fields is also exposed as a delegated
|
|
10
|
+
# reader so the common usage stays ergonomic
|
|
11
|
+
# (+conversion.ast+, +conversion.unknown_tags+, …) without forcing
|
|
12
|
+
# callers to chain through +#parsed+.
|
|
13
|
+
#
|
|
14
|
+
# @!attribute [r] parsed
|
|
15
|
+
# @return [Parse] the parsed input — also reusable for a direct
|
|
16
|
+
# re-render via +Markbridge.render(conversion.parsed, …)+.
|
|
17
|
+
# @!attribute [r] markdown
|
|
18
|
+
# @return [String] the rendered Discourse-flavored Markdown
|
|
19
|
+
# @!attribute [r] errors
|
|
20
|
+
# @return [Array<StandardError>] render-time errors collected when
|
|
21
|
+
# +raise_on_error: false+ was passed; empty otherwise.
|
|
22
|
+
# @!method ast
|
|
23
|
+
# @return [AST::Document] delegated to {Parse#ast}
|
|
24
|
+
# @!method format
|
|
25
|
+
# @return [Symbol, nil] delegated to {Parse#format}
|
|
26
|
+
# @!method unknown_tags
|
|
27
|
+
# @return [Hash{String => Integer}] delegated to {Parse#unknown_tags}
|
|
28
|
+
# @!method diagnostics
|
|
29
|
+
# @return [Hash{Symbol => Object}] delegated to {Parse#diagnostics}
|
|
30
|
+
Conversion =
|
|
31
|
+
Data.define(:parsed, :markdown, :errors) do
|
|
32
|
+
def ast = parsed.ast
|
|
33
|
+
def format = parsed.format
|
|
34
|
+
def unknown_tags = parsed.unknown_tags
|
|
35
|
+
def diagnostics = parsed.diagnostics
|
|
36
|
+
|
|
37
|
+
# Allows +puts result+ and +"text: #{result}"+ to work seamlessly.
|
|
38
|
+
def to_s = markdown
|
|
39
|
+
end
|
|
40
|
+
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Markbridge
|
|
4
|
+
# Result of a parse-only call (Markbridge.parse_bbcode and friends).
|
|
5
|
+
#
|
|
6
|
+
# @!attribute [r] ast
|
|
7
|
+
# @return [AST::Document]
|
|
8
|
+
# @!attribute [r] format
|
|
9
|
+
# @return [Symbol, nil] :bbcode, :html, :text_formatter_xml, or
|
|
10
|
+
# :mediawiki. +nil+ when synthesized by {Markbridge.render} from
|
|
11
|
+
# a bare AST node — there was no source document to parse.
|
|
12
|
+
# @!attribute [r] unknown_tags
|
|
13
|
+
# @return [Hash{String => Integer}] tag-name → occurrence count.
|
|
14
|
+
# Empty for parsers that do not yet track unknown tags.
|
|
15
|
+
# @!attribute [r] diagnostics
|
|
16
|
+
# @return [Hash{Symbol => Object}] format-specific diagnostics.
|
|
17
|
+
# BBCode supplies :auto_closed_tags_count, :depth_exceeded_count,
|
|
18
|
+
# :unclosed_raw_tags. Other parsers supply an empty hash for now.
|
|
19
|
+
Parse = Data.define(:ast, :format, :unknown_tags, :diagnostics)
|
|
20
|
+
end
|
|
@@ -37,6 +37,27 @@ module Markbridge
|
|
|
37
37
|
self
|
|
38
38
|
end
|
|
39
39
|
|
|
40
|
+
# Replace the handler bound to one or more tag names by yielding
|
|
41
|
+
# the previously-bound handler (which may be +nil+) and
|
|
42
|
+
# registering whatever the block returns. Used to install a
|
|
43
|
+
# delegating handler that wraps the default.
|
|
44
|
+
#
|
|
45
|
+
# @example Wrap the default URL handler
|
|
46
|
+
# registry.overlay(%w[url link iurl]) do |default|
|
|
47
|
+
# LinkifyingUrlHandler.new(default:)
|
|
48
|
+
# end
|
|
49
|
+
#
|
|
50
|
+
# @param tag_names [String, Array<String>]
|
|
51
|
+
# @yieldparam previous [BaseHandler, nil] previously bound handler
|
|
52
|
+
# @return [self]
|
|
53
|
+
def overlay(tag_names)
|
|
54
|
+
Array(tag_names).each do |name|
|
|
55
|
+
previous = self[name]
|
|
56
|
+
register(name, yield(previous))
|
|
57
|
+
end
|
|
58
|
+
self
|
|
59
|
+
end
|
|
60
|
+
|
|
40
61
|
# Get handler for a tag name
|
|
41
62
|
# @param tag_name [String]
|
|
42
63
|
# @return [BaseHandler, nil]
|
|
@@ -69,8 +90,10 @@ module Markbridge
|
|
|
69
90
|
# Create the default handler registry with common BBCode tags.
|
|
70
91
|
#
|
|
71
92
|
# Each call returns a *fresh* instance — mutations made to one will
|
|
72
|
-
# not be visible to another.
|
|
73
|
-
#
|
|
93
|
+
# not be visible to another. Convenience methods on +Markbridge+
|
|
94
|
+
# build a fresh default registry per call when none is supplied;
|
|
95
|
+
# to share state across calls, build one once and pass it via
|
|
96
|
+
# the +handlers:+ kwarg.
|
|
74
97
|
#
|
|
75
98
|
# @param closing_strategy [Object, nil] optional closing strategy to apply, defaults to Reordering strategy
|
|
76
99
|
# @return [HandlerRegistry]
|
|
@@ -32,11 +32,22 @@ module Markbridge
|
|
|
32
32
|
private
|
|
33
33
|
|
|
34
34
|
def create_element(token:, content:)
|
|
35
|
-
|
|
36
|
-
|
|
35
|
+
element =
|
|
36
|
+
if accepts_language?
|
|
37
|
+
@element_class.new(language: token.attrs[:lang] || token.attrs[:option])
|
|
38
|
+
else
|
|
39
|
+
@element_class.new
|
|
40
|
+
end
|
|
37
41
|
element << AST::Text.new(content) unless content.empty?
|
|
38
42
|
element
|
|
39
43
|
end
|
|
44
|
+
|
|
45
|
+
def accepts_language?
|
|
46
|
+
@element_class
|
|
47
|
+
.instance_method(:initialize)
|
|
48
|
+
.parameters
|
|
49
|
+
.any? { |_kind, name| name == :language }
|
|
50
|
+
end
|
|
40
51
|
end
|
|
41
52
|
end
|
|
42
53
|
end
|
|
@@ -3,23 +3,114 @@
|
|
|
3
3
|
module Markbridge
|
|
4
4
|
module Parsers
|
|
5
5
|
module HTML
|
|
6
|
-
# Registry of HTML tag handlers
|
|
6
|
+
# Registry of HTML tag handlers and per-tag-name parser configuration.
|
|
7
|
+
#
|
|
8
|
+
# Handlers map a tag name to a handler instance. `block_level_tags` and
|
|
9
|
+
# `whitespace_preserving_tags` configure parser whitespace behavior by
|
|
10
|
+
# tag name, independent of whether a handler is registered — so unknown
|
|
11
|
+
# tags like <div> or <section> still trigger boundary collapsing and
|
|
12
|
+
# <pre>/<code> still pass through verbatim. Both sets are mutable, so
|
|
13
|
+
# downstream consumers can add or remove tags freely:
|
|
14
|
+
#
|
|
15
|
+
# registry = HandlerRegistry.default
|
|
16
|
+
# registry.block_level_tags << "my-block"
|
|
17
|
+
# registry.whitespace_preserving_tags.delete("tt")
|
|
7
18
|
class HandlerRegistry
|
|
19
|
+
# HTML5 block-level elements (per MDN). The trim-before-block rule
|
|
20
|
+
# applies to these regardless of whether a handler is registered.
|
|
21
|
+
DEFAULT_BLOCK_LEVEL_TAGS = %w[
|
|
22
|
+
address
|
|
23
|
+
article
|
|
24
|
+
aside
|
|
25
|
+
blockquote
|
|
26
|
+
canvas
|
|
27
|
+
dd
|
|
28
|
+
details
|
|
29
|
+
dialog
|
|
30
|
+
div
|
|
31
|
+
dl
|
|
32
|
+
dt
|
|
33
|
+
fieldset
|
|
34
|
+
figcaption
|
|
35
|
+
figure
|
|
36
|
+
footer
|
|
37
|
+
form
|
|
38
|
+
h1
|
|
39
|
+
h2
|
|
40
|
+
h3
|
|
41
|
+
h4
|
|
42
|
+
h5
|
|
43
|
+
h6
|
|
44
|
+
header
|
|
45
|
+
hgroup
|
|
46
|
+
hr
|
|
47
|
+
html
|
|
48
|
+
li
|
|
49
|
+
main
|
|
50
|
+
nav
|
|
51
|
+
noscript
|
|
52
|
+
ol
|
|
53
|
+
output
|
|
54
|
+
p
|
|
55
|
+
pre
|
|
56
|
+
section
|
|
57
|
+
table
|
|
58
|
+
tbody
|
|
59
|
+
td
|
|
60
|
+
tfoot
|
|
61
|
+
th
|
|
62
|
+
thead
|
|
63
|
+
tr
|
|
64
|
+
ul
|
|
65
|
+
video
|
|
66
|
+
].freeze
|
|
67
|
+
|
|
68
|
+
# Tags whose default CSS preserves source whitespace
|
|
69
|
+
# (`white-space: pre*`). Text inside these is passed through
|
|
70
|
+
# verbatim; outside, runs of ASCII whitespace (space, tab, CR, LF, FF) collapse to a single space.
|
|
71
|
+
DEFAULT_WHITESPACE_PRESERVING_TAGS = %w[pre code textarea tt].freeze
|
|
72
|
+
|
|
73
|
+
# @return [Set<String>] mutable set of tag names treated as block-level.
|
|
74
|
+
attr_reader :block_level_tags
|
|
75
|
+
|
|
76
|
+
# @return [Set<String>] mutable set of tag names whose contents
|
|
77
|
+
# preserve source whitespace.
|
|
78
|
+
attr_reader :whitespace_preserving_tags
|
|
79
|
+
|
|
8
80
|
def initialize
|
|
9
81
|
@handlers = {}
|
|
82
|
+
@block_level_tags = Set.new(DEFAULT_BLOCK_LEVEL_TAGS)
|
|
83
|
+
@whitespace_preserving_tags = Set.new(DEFAULT_WHITESPACE_PRESERVING_TAGS)
|
|
10
84
|
end
|
|
11
85
|
|
|
12
86
|
# Register a handler for one or more tag names
|
|
13
87
|
# @param tag_names [String, Array<String>] tag name(s) to register
|
|
14
|
-
# @param handler [BaseHandler
|
|
88
|
+
# @param handler [BaseHandler] the handler instance — must
|
|
89
|
+
# respond to +#process(element:, parent:)+
|
|
15
90
|
def register(tag_names, handler)
|
|
16
91
|
Array(tag_names).each { |tag_name| @handlers[tag_name.to_s.downcase] = handler }
|
|
17
92
|
self
|
|
18
93
|
end
|
|
19
94
|
|
|
95
|
+
# Replace the handler bound to one or more tag names by yielding
|
|
96
|
+
# the previously-bound handler (which may be +nil+) and
|
|
97
|
+
# registering whatever the block returns. Used to install a
|
|
98
|
+
# delegating handler that wraps the default.
|
|
99
|
+
#
|
|
100
|
+
# @param tag_names [String, Array<String>]
|
|
101
|
+
# @yieldparam previous [BaseHandler, nil] previously bound handler
|
|
102
|
+
# @return [self]
|
|
103
|
+
def overlay(tag_names)
|
|
104
|
+
Array(tag_names).each do |name|
|
|
105
|
+
previous = self[name]
|
|
106
|
+
register(name, yield(previous))
|
|
107
|
+
end
|
|
108
|
+
self
|
|
109
|
+
end
|
|
110
|
+
|
|
20
111
|
# Get handler for a tag name
|
|
21
112
|
# @param tag_name [String]
|
|
22
|
-
# @return [BaseHandler,
|
|
113
|
+
# @return [BaseHandler, nil]
|
|
23
114
|
def [](tag_name)
|
|
24
115
|
@handlers[tag_name.to_s.downcase]
|
|
25
116
|
end
|
|
@@ -38,26 +129,15 @@ module Markbridge
|
|
|
38
129
|
registry.register("a", Handlers::UrlHandler.new)
|
|
39
130
|
registry.register("img", Handlers::ImageHandler.new)
|
|
40
131
|
registry.register("blockquote", Handlers::QuoteHandler.new)
|
|
41
|
-
registry.register(
|
|
42
|
-
|
|
43
|
-
lambda do |element:, parent:|
|
|
44
|
-
parent << AST::LineBreak.new
|
|
45
|
-
nil
|
|
46
|
-
end,
|
|
47
|
-
)
|
|
48
|
-
registry.register(
|
|
49
|
-
"hr",
|
|
50
|
-
lambda do |element:, parent:|
|
|
51
|
-
parent << AST::HorizontalRule.new
|
|
52
|
-
nil
|
|
53
|
-
end,
|
|
54
|
-
)
|
|
132
|
+
registry.register("br", Handlers::SelfClosingHandler.new(AST::LineBreak))
|
|
133
|
+
registry.register("hr", Handlers::SelfClosingHandler.new(AST::HorizontalRule))
|
|
55
134
|
registry.register(%w[ul ol], Handlers::ListHandler.new)
|
|
56
135
|
registry.register("li", Handlers::ListItemHandler.new)
|
|
57
136
|
registry.register("table", Handlers::TableHandler.new)
|
|
58
137
|
registry.register("tr", Handlers::TableRowHandler.new)
|
|
59
138
|
registry.register(%w[td th], Handlers::TableCellHandler.new)
|
|
60
139
|
registry.register("p", Handlers::ParagraphHandler.new)
|
|
140
|
+
registry.register("span", Handlers::SpanHandler.new)
|
|
61
141
|
end
|
|
62
142
|
end
|
|
63
143
|
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Markbridge
|
|
4
|
+
module Parsers
|
|
5
|
+
module HTML
|
|
6
|
+
module Handlers
|
|
7
|
+
# Handler for self-closing leaf tags (br, hr, etc.). Creates
|
|
8
|
+
# an instance of +element_class+, appends it to +parent+, and
|
|
9
|
+
# returns nil so the parser does not try to recurse into
|
|
10
|
+
# children.
|
|
11
|
+
class SelfClosingHandler < BaseHandler
|
|
12
|
+
def initialize(element_class)
|
|
13
|
+
@element_class = element_class
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
def process(element:, parent:)
|
|
17
|
+
parent << @element_class.new
|
|
18
|
+
nil
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
attr_reader :element_class
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
end
|
|
26
|
+
end
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Markbridge
|
|
4
|
+
module Parsers
|
|
5
|
+
module HTML
|
|
6
|
+
module Handlers
|
|
7
|
+
# Maps recognized inline `style` declarations on `<span>` to AST
|
|
8
|
+
# formatting nodes. Supports text-decoration (underline,
|
|
9
|
+
# line-through), font-weight (bold), and font-style (italic). When
|
|
10
|
+
# multiple recognized styles are set, AST elements are nested in
|
|
11
|
+
# declaration order. Unrecognized styles are ignored; a span with
|
|
12
|
+
# no recognized styles is transparent (children processed into the
|
|
13
|
+
# parent).
|
|
14
|
+
class SpanHandler < BaseHandler
|
|
15
|
+
STYLE_DECLARATION = /([a-z-]+)\s*:\s*([^;]+)/i
|
|
16
|
+
BOLD_THRESHOLD = 600
|
|
17
|
+
|
|
18
|
+
def process(element:, parent:)
|
|
19
|
+
ast_classes_for(element["style"]).reduce(parent) do |current, klass|
|
|
20
|
+
child = klass.new
|
|
21
|
+
current << child
|
|
22
|
+
child
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
private
|
|
27
|
+
|
|
28
|
+
def ast_classes_for(style)
|
|
29
|
+
return [] if style.nil?
|
|
30
|
+
|
|
31
|
+
classes = []
|
|
32
|
+
style.scan(STYLE_DECLARATION) do |property, value|
|
|
33
|
+
classes_for_declaration(property.downcase, value.downcase.rstrip).each do |klass|
|
|
34
|
+
classes << klass unless classes.include?(klass)
|
|
35
|
+
end
|
|
36
|
+
end
|
|
37
|
+
classes
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
def classes_for_declaration(property, value)
|
|
41
|
+
case property
|
|
42
|
+
when "text-decoration"
|
|
43
|
+
text_decoration_classes(value)
|
|
44
|
+
when "font-weight"
|
|
45
|
+
bold_value?(value) ? [AST::Bold] : []
|
|
46
|
+
when "font-style"
|
|
47
|
+
italic_value?(value) ? [AST::Italic] : []
|
|
48
|
+
else
|
|
49
|
+
[]
|
|
50
|
+
end
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
def text_decoration_classes(value)
|
|
54
|
+
classes = []
|
|
55
|
+
classes << AST::Underline if value.include?("underline")
|
|
56
|
+
classes << AST::Strikethrough if value.include?("line-through")
|
|
57
|
+
classes
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
def bold_value?(value)
|
|
61
|
+
return true if %w[bold bolder].include?(value)
|
|
62
|
+
return false unless value.match?(/\A\d+\z/)
|
|
63
|
+
|
|
64
|
+
Integer(value) >= BOLD_THRESHOLD
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
def italic_value?(value)
|
|
68
|
+
%w[italic oblique].include?(value)
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
end
|
|
74
|
+
end
|
|
@@ -10,6 +10,8 @@ module Markbridge
|
|
|
10
10
|
# JavaScript, or document metadata that shouldn't appear in output.
|
|
11
11
|
IGNORED_TAGS = %w[style script head title noscript template].freeze
|
|
12
12
|
|
|
13
|
+
WHITESPACE_RUN = /[ \t\r\n\f]+/
|
|
14
|
+
|
|
13
15
|
attr_reader :unknown_tags
|
|
14
16
|
|
|
15
17
|
# Create a new parser with optional custom handlers
|
|
@@ -25,8 +27,22 @@ module Markbridge
|
|
|
25
27
|
@unknown_tags = Hash.new(0)
|
|
26
28
|
end
|
|
27
29
|
|
|
28
|
-
# Parse HTML
|
|
29
|
-
#
|
|
30
|
+
# Parse HTML into an AST.
|
|
31
|
+
#
|
|
32
|
+
# Accepts either a String of HTML source or a pre-parsed
|
|
33
|
+
# Nokogiri node (typically a +DocumentFragment+ from
|
|
34
|
+
# +Nokogiri::HTML.fragment+ or a full +Document+ from
|
|
35
|
+
# +Nokogiri::HTML.parse+). Passing a pre-parsed tree lets a
|
|
36
|
+
# caller run their own Nokogiri-driven pre-processing without
|
|
37
|
+
# forcing Markbridge to re-parse the same bytes.
|
|
38
|
+
#
|
|
39
|
+
# A +Nokogiri::HTML::Document+ is unwrapped to its +<body>+
|
|
40
|
+
# children so the +<html>+ / +<body>+ / +<head>+ wrappers
|
|
41
|
+
# don't pollute {#unknown_tags}; fragments and bare elements
|
|
42
|
+
# iterate their own children directly.
|
|
43
|
+
#
|
|
44
|
+
# @param input [String, Nokogiri::XML::Node] HTML source or
|
|
45
|
+
# pre-parsed Nokogiri tree
|
|
30
46
|
# @return [AST::Document]
|
|
31
47
|
def parse(input)
|
|
32
48
|
@unknown_tags.clear
|
|
@@ -36,13 +52,21 @@ module Markbridge
|
|
|
36
52
|
# (see sparklemotion/nokogiri#2227). Table support treats thead/tbody/tfoot
|
|
37
53
|
# as transparent, so the parse-tree difference (HTML5 auto-inserts tbody,
|
|
38
54
|
# HTML4 does not) has no effect on the AST.
|
|
39
|
-
doc =
|
|
55
|
+
doc =
|
|
56
|
+
if input.is_a?(Nokogiri::XML::Node)
|
|
57
|
+
input
|
|
58
|
+
else
|
|
59
|
+
Nokogiri::HTML.fragment(input.to_s)
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
children = doc.is_a?(Nokogiri::HTML::Document) ? body_children(doc) : doc.children
|
|
40
63
|
|
|
41
64
|
# Create root AST document
|
|
42
65
|
document = AST::Document.new
|
|
43
66
|
|
|
44
67
|
# Process all nodes
|
|
45
|
-
|
|
68
|
+
children.each { |node| process_node(node, document) }
|
|
69
|
+
trim_trailing_whitespace(document)
|
|
46
70
|
|
|
47
71
|
document
|
|
48
72
|
end
|
|
@@ -72,7 +96,19 @@ module Markbridge
|
|
|
72
96
|
# @param node [Nokogiri::XML::Text]
|
|
73
97
|
# @param parent [AST::Element]
|
|
74
98
|
def process_text_node(node, parent)
|
|
75
|
-
|
|
99
|
+
if preserves_whitespace?(node)
|
|
100
|
+
parent << AST::Text.new(node.text)
|
|
101
|
+
return
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
text = node.text.gsub(WHITESPACE_RUN, " ")
|
|
105
|
+
# Drop leading whitespace at the start of an element's content,
|
|
106
|
+
# matching the browser rule that whitespace at the beginning of a
|
|
107
|
+
# block (or before any inline content) is collapsed away.
|
|
108
|
+
text = text.lstrip if parent.children.empty?
|
|
109
|
+
return if text.empty?
|
|
110
|
+
|
|
111
|
+
parent << AST::Text.new(text)
|
|
76
112
|
end
|
|
77
113
|
|
|
78
114
|
# Process an element node
|
|
@@ -82,21 +118,24 @@ module Markbridge
|
|
|
82
118
|
tag_name = node.name
|
|
83
119
|
return if IGNORED_TAGS.include?(tag_name)
|
|
84
120
|
|
|
121
|
+
# Drop whitespace that sits between content and the start of a
|
|
122
|
+
# block-level tag, matching browser behavior where such whitespace
|
|
123
|
+
# collapses against the block boundary. Applies whether or not a
|
|
124
|
+
# handler is registered, so unknown tags like <div> or <section>
|
|
125
|
+
# still collapse the whitespace before them.
|
|
126
|
+
trim_trailing_whitespace(parent) if @handlers.block_level_tags.include?(tag_name)
|
|
127
|
+
|
|
85
128
|
handler = @handlers[tag_name]
|
|
129
|
+
return handle_unknown_tag(node, parent) unless handler
|
|
130
|
+
|
|
131
|
+
# Handler returns element if children should be processed, nil otherwise
|
|
132
|
+
ast_element = handler.process(element: node, parent:)
|
|
86
133
|
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
else
|
|
93
|
-
handler.call(element: node, parent:)
|
|
94
|
-
end
|
|
95
|
-
|
|
96
|
-
# Automatically process children if handler returned element
|
|
97
|
-
process_children(node, ast_element) if ast_element
|
|
98
|
-
else
|
|
99
|
-
handle_unknown_tag(node, parent)
|
|
134
|
+
return unless ast_element
|
|
135
|
+
|
|
136
|
+
process_children(node, ast_element)
|
|
137
|
+
unless @handlers.whitespace_preserving_tags.include?(tag_name)
|
|
138
|
+
trim_trailing_whitespace(ast_element)
|
|
100
139
|
end
|
|
101
140
|
end
|
|
102
141
|
|
|
@@ -108,6 +147,37 @@ module Markbridge
|
|
|
108
147
|
@unknown_tags[node.name] += 1
|
|
109
148
|
process_children(node, parent)
|
|
110
149
|
end
|
|
150
|
+
|
|
151
|
+
# Whether `node` is inside a tag that preserves source whitespace.
|
|
152
|
+
# @param node [Nokogiri::XML::Node]
|
|
153
|
+
# @return [Boolean]
|
|
154
|
+
def preserves_whitespace?(node)
|
|
155
|
+
node.ancestors.any? do |ancestor|
|
|
156
|
+
@handlers.whitespace_preserving_tags.include?(ancestor.name)
|
|
157
|
+
end
|
|
158
|
+
end
|
|
159
|
+
|
|
160
|
+
# Direct children of the +<body>+ element of a full HTML document,
|
|
161
|
+
# falling back to the document's own children if no +<body>+ exists
|
|
162
|
+
# (malformed input).
|
|
163
|
+
# @param doc [Nokogiri::HTML::Document]
|
|
164
|
+
# @return [Nokogiri::XML::NodeSet]
|
|
165
|
+
def body_children(doc)
|
|
166
|
+
(doc.at_css("body") || doc).children
|
|
167
|
+
end
|
|
168
|
+
|
|
169
|
+
# Strip trailing whitespace from the last Text child of `element`.
|
|
170
|
+
# Removes the child entirely if it becomes empty. No-op if the last
|
|
171
|
+
# child is not a Text node.
|
|
172
|
+
# @param element [AST::Element]
|
|
173
|
+
def trim_trailing_whitespace(element)
|
|
174
|
+
last = element.children.last
|
|
175
|
+
return unless last.instance_of?(AST::Text)
|
|
176
|
+
|
|
177
|
+
trimmed = last.text.rstrip
|
|
178
|
+
element.children.pop
|
|
179
|
+
element << AST::Text.new(trimmed) unless trimmed.empty?
|
|
180
|
+
end
|
|
111
181
|
end
|
|
112
182
|
end
|
|
113
183
|
end
|
|
@@ -10,6 +10,7 @@ require_relative "../ast"
|
|
|
10
10
|
# Handlers
|
|
11
11
|
require_relative "html/handlers/base_handler"
|
|
12
12
|
require_relative "html/handlers/simple_handler"
|
|
13
|
+
require_relative "html/handlers/self_closing_handler"
|
|
13
14
|
require_relative "html/handlers/raw_handler"
|
|
14
15
|
require_relative "html/handlers/url_handler"
|
|
15
16
|
require_relative "html/handlers/image_handler"
|
|
@@ -20,6 +21,7 @@ require_relative "html/handlers/paragraph_handler"
|
|
|
20
21
|
require_relative "html/handlers/table_handler"
|
|
21
22
|
require_relative "html/handlers/table_row_handler"
|
|
22
23
|
require_relative "html/handlers/table_cell_handler"
|
|
24
|
+
require_relative "html/handlers/span_handler"
|
|
23
25
|
|
|
24
26
|
# Parser components
|
|
25
27
|
require_relative "html/handler_registry"
|