markbridge 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. checksums.yaml +4 -4
  2. data/LICENSE.txt +1 -1
  3. data/lib/markbridge/ast/details.rb +24 -0
  4. data/lib/markbridge/ast/element.rb +63 -0
  5. data/lib/markbridge/ast.rb +1 -0
  6. data/lib/markbridge/conversion.rb +40 -0
  7. data/lib/markbridge/parse.rb +20 -0
  8. data/lib/markbridge/parsers/bbcode/handler_registry.rb +25 -2
  9. data/lib/markbridge/parsers/bbcode/handlers/raw_handler.rb +13 -2
  10. data/lib/markbridge/parsers/html/handler_registry.rb +97 -17
  11. data/lib/markbridge/parsers/html/handlers/self_closing_handler.rb +26 -0
  12. data/lib/markbridge/parsers/html/handlers/span_handler.rb +74 -0
  13. data/lib/markbridge/parsers/html/parser.rb +88 -18
  14. data/lib/markbridge/parsers/html.rb +2 -0
  15. data/lib/markbridge/parsers/media_wiki/inline_parser.rb +21 -8
  16. data/lib/markbridge/parsers/media_wiki/parser.rb +13 -5
  17. data/lib/markbridge/parsers/text_formatter/handler_registry.rb +27 -4
  18. data/lib/markbridge/parsers/text_formatter/handlers/attachment_handler.rb +1 -1
  19. data/lib/markbridge/parsers/text_formatter/handlers/attribute_handler.rb +1 -1
  20. data/lib/markbridge/parsers/text_formatter/handlers/base_handler.rb +1 -1
  21. data/lib/markbridge/parsers/text_formatter/handlers/code_handler.rb +1 -1
  22. data/lib/markbridge/parsers/text_formatter/handlers/email_handler.rb +1 -1
  23. data/lib/markbridge/parsers/text_formatter/handlers/image_handler.rb +1 -1
  24. data/lib/markbridge/parsers/text_formatter/handlers/list_handler.rb +1 -1
  25. data/lib/markbridge/parsers/text_formatter/handlers/quote_handler.rb +1 -1
  26. data/lib/markbridge/parsers/text_formatter/handlers/simple_handler.rb +1 -1
  27. data/lib/markbridge/parsers/text_formatter/handlers/table_cell_handler.rb +1 -1
  28. data/lib/markbridge/parsers/text_formatter/handlers/url_handler.rb +1 -1
  29. data/lib/markbridge/parsers/text_formatter/parser.rb +17 -3
  30. data/lib/markbridge/renderers/discourse/identity_escaper.rb +37 -0
  31. data/lib/markbridge/renderers/discourse/markdown_escaper.rb +91 -9
  32. data/lib/markbridge/renderers/discourse/postprocessor.rb +53 -0
  33. data/lib/markbridge/renderers/discourse/render_context.rb +14 -40
  34. data/lib/markbridge/renderers/discourse/renderer.rb +15 -5
  35. data/lib/markbridge/renderers/discourse/rendering_interface.rb +4 -3
  36. data/lib/markbridge/renderers/discourse/tag_library.rb +42 -2
  37. data/lib/markbridge/renderers/discourse/tags/align_tag.rb +2 -2
  38. data/lib/markbridge/renderers/discourse/tags/code_tag.rb +5 -3
  39. data/lib/markbridge/renderers/discourse/tags/details_tag.rb +46 -0
  40. data/lib/markbridge/renderers/discourse/tags/heading_tag.rb +1 -1
  41. data/lib/markbridge/renderers/discourse/tags/paragraph_tag.rb +5 -2
  42. data/lib/markbridge/renderers/discourse/tags/quote_tag.rb +4 -3
  43. data/lib/markbridge/renderers/discourse/tags/underline_tag.rb +13 -0
  44. data/lib/markbridge/renderers/discourse.rb +3 -0
  45. data/lib/markbridge/version.rb +1 -1
  46. data/lib/markbridge.rb +274 -110
  47. metadata +9 -2
  48. data/lib/markbridge/configuration.rb +0 -11
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f805bfbca5d3f0ecb098c54a27c367a3b5c44bdf79a59bd4f4f04073f3b4e3d7
4
- data.tar.gz: 77d62ba2660fc967f9475fafb768e3728e54ad0811f562bc575fdfcfbd2bfe5a
3
+ metadata.gz: 1f501e9875d69ca60aa8fcf7d1e46ef5ce83d24b69da46b192121e00b7414919
4
+ data.tar.gz: db9a49e5d6b0c0c5f68f84109c153f619e59d1b07fbb5e695a6443ed099b1436
5
5
  SHA512:
6
- metadata.gz: b65279c8b48a47f1c745e601dca97ccdaa170279830ed6ecb82aed03439cb0ed05a37d5ff3818920efde337fbe65c32c9cf3e24e354bbd072b230fe7334128ea
7
- data.tar.gz: a6139242a1f81149bf0e3eb1560ec70028979e5ee2aba12d0b0477e1b8b505e526a657e4dc5854a89f4903aad376fe687bf80a72d77b8eb3238f46f76b267cb8
6
+ metadata.gz: 1f5b7bac5d2ee008db7a012040de5394a615811420fcd90a8a95064349a2cf074254c3310dffb27e93f7d0c51c06f47c16b1f361015a9fad61e8fb14380c0a19
7
+ data.tar.gz: de4b3625ef287981cec8a5c6683ccbbf88d7593af0cd5ca7798393f377dc930a20f3f1228b225d49880f5f0323ccfa8bb7957186b644cbefd55770e153948ade
data/LICENSE.txt CHANGED
@@ -1,6 +1,6 @@
1
1
  The MIT License (MIT)
2
2
 
3
- Copyright (c) 2025 Gerhard Schlager
3
+ Copyright (c) 2025 Civilized Discourse Construction Kit, Inc.
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Markbridge
4
+ module AST
5
+ # Represents a Discourse +[details=…]…[/details]+ collapsible section.
6
+ #
7
+ # Carries a +title+ string (used as the +summary+ text when the
8
+ # block renders) and any child nodes.
9
+ #
10
+ # @example
11
+ # block = AST::Details.new(title: "Show more")
12
+ # block << AST::Text.new("Hidden body")
13
+ class Details < Element
14
+ # @return [String, nil] the summary / collapsed-state label
15
+ attr_reader :title
16
+
17
+ # @param title [String, nil] optional summary text
18
+ def initialize(title: nil)
19
+ super()
20
+ @title = title
21
+ end
22
+ end
23
+ end
24
+ end
@@ -42,6 +42,69 @@ module Markbridge
42
42
 
43
43
  self
44
44
  end
45
+
46
+ # Depth-first pre-order traversal yielding every descendant node.
47
+ # Returns an +Enumerator+ when called without a block so it
48
+ # composes through +Enumerable+:
49
+ #
50
+ # document.each_descendant.select { |n| n.is_a?(AST::Url) }
51
+ #
52
+ # Iteration semantics: each Element snapshots its own +children+
53
+ # array at the moment iteration enters it, so replacing a child
54
+ # via {#replace_child} mid-walk is safe — descent uses the
55
+ # pre-replacement reference. Adding or removing siblings on an
56
+ # Element you are currently descending into is *not* guaranteed
57
+ # to be visible to the current walk.
58
+ #
59
+ # @yieldparam node [Node] each descendant in document order
60
+ # @return [Enumerator, Element] +Enumerator+ without a block, +self+ otherwise
61
+ def each_descendant(&block)
62
+ return enum_for(:each_descendant) unless block_given?
63
+
64
+ @children.dup.each do |child|
65
+ yield child
66
+ child.each_descendant(&block) if child.is_a?(Element)
67
+ end
68
+ self
69
+ end
70
+
71
+ # Array of descendant nodes, optionally filtered by class.
72
+ #
73
+ # document.descendants # every descendant
74
+ # document.descendants(AST::Url) # every Url descendant
75
+ #
76
+ # @param klass [Class, nil] when given, only descendants that
77
+ # +is_a?(klass)+ are returned
78
+ # @return [Array<Node>]
79
+ def descendants(klass = nil)
80
+ result = each_descendant.to_a
81
+ return result if klass.nil?
82
+
83
+ result.select { |node| node.is_a?(klass) }
84
+ end
85
+
86
+ # Replace a direct child of this Element with a different Node.
87
+ # Preserves the child's index — useful for AST-mutation passes
88
+ # that need to swap one Element type for another in place
89
+ # (e.g. wrapping trailing paragraphs in a +Details+ block).
90
+ #
91
+ # @param old_child [Node] the child to remove (located via {Array#index}, which compares with +==+)
92
+ # @param new_child [Node] the replacement
93
+ # @return [Element] +self+
94
+ # @raise [ArgumentError] when +old_child+ is not currently a child of this Element
95
+ # @raise [TypeError] when +new_child+ is not a {Node}
96
+ def replace_child(old_child, new_child)
97
+ index = @children.index(old_child)
98
+ raise ArgumentError, "child not found in #{self.class}" if index.nil?
99
+
100
+ unless new_child.is_a?(Node)
101
+ actual = new_child.nil? ? "nil" : new_child.class
102
+ raise TypeError, "replace_child on #{self.class} expected a #{Node}, got #{actual}"
103
+ end
104
+
105
+ @children[index] = new_child
106
+ self
107
+ end
45
108
  end
46
109
  end
47
110
  end
@@ -10,6 +10,7 @@ require_relative "ast/attachment"
10
10
  require_relative "ast/bold"
11
11
  require_relative "ast/code"
12
12
  require_relative "ast/color"
13
+ require_relative "ast/details"
13
14
  require_relative "ast/email"
14
15
  require_relative "ast/heading"
15
16
  require_relative "ast/horizontal_rule"
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Markbridge
4
+ # Result of a *_to_markdown / convert / render call.
5
+ #
6
+ # Wraps a {Parse} (the input-side fields: +ast+, +format+,
7
+ # +unknown_tags+, +diagnostics+) and adds the render-side outputs:
8
+ # +markdown+ and +errors+. The wrapped {Parse} is reachable via
9
+ # {#parsed}, and each of its fields is also exposed as a delegated
10
+ # reader so the common usage stays ergonomic
11
+ # (+conversion.ast+, +conversion.unknown_tags+, …) without forcing
12
+ # callers to chain through +#parsed+.
13
+ #
14
+ # @!attribute [r] parsed
15
+ # @return [Parse] the parsed input — also reusable for a direct
16
+ # re-render via +Markbridge.render(conversion.parsed, …)+.
17
+ # @!attribute [r] markdown
18
+ # @return [String] the rendered Discourse-flavored Markdown
19
+ # @!attribute [r] errors
20
+ # @return [Array<StandardError>] render-time errors collected when
21
+ # +raise_on_error: false+ was passed; empty otherwise.
22
+ # @!method ast
23
+ # @return [AST::Document] delegated to {Parse#ast}
24
+ # @!method format
25
+ # @return [Symbol, nil] delegated to {Parse#format}
26
+ # @!method unknown_tags
27
+ # @return [Hash{String => Integer}] delegated to {Parse#unknown_tags}
28
+ # @!method diagnostics
29
+ # @return [Hash{Symbol => Object}] delegated to {Parse#diagnostics}
30
+ Conversion =
31
+ Data.define(:parsed, :markdown, :errors) do
32
+ def ast = parsed.ast
33
+ def format = parsed.format
34
+ def unknown_tags = parsed.unknown_tags
35
+ def diagnostics = parsed.diagnostics
36
+
37
+ # Allows +puts result+ and +"text: #{result}"+ to work seamlessly.
38
+ def to_s = markdown
39
+ end
40
+ end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Markbridge
4
+ # Result of a parse-only call (Markbridge.parse_bbcode and friends).
5
+ #
6
+ # @!attribute [r] ast
7
+ # @return [AST::Document]
8
+ # @!attribute [r] format
9
+ # @return [Symbol, nil] :bbcode, :html, :text_formatter_xml, or
10
+ # :mediawiki. +nil+ when synthesized by {Markbridge.render} from
11
+ # a bare AST node — there was no source document to parse.
12
+ # @!attribute [r] unknown_tags
13
+ # @return [Hash{String => Integer}] tag-name → occurrence count.
14
+ # Empty for parsers that do not yet track unknown tags.
15
+ # @!attribute [r] diagnostics
16
+ # @return [Hash{Symbol => Object}] format-specific diagnostics.
17
+ # BBCode supplies :auto_closed_tags_count, :depth_exceeded_count,
18
+ # :unclosed_raw_tags. Other parsers supply an empty hash for now.
19
+ Parse = Data.define(:ast, :format, :unknown_tags, :diagnostics)
20
+ end
@@ -37,6 +37,27 @@ module Markbridge
37
37
  self
38
38
  end
39
39
 
40
+ # Replace the handler bound to one or more tag names by yielding
41
+ # the previously-bound handler (which may be +nil+) and
42
+ # registering whatever the block returns. Used to install a
43
+ # delegating handler that wraps the default.
44
+ #
45
+ # @example Wrap the default URL handler
46
+ # registry.overlay(%w[url link iurl]) do |default|
47
+ # LinkifyingUrlHandler.new(default:)
48
+ # end
49
+ #
50
+ # @param tag_names [String, Array<String>]
51
+ # @yieldparam previous [BaseHandler, nil] previously bound handler
52
+ # @return [self]
53
+ def overlay(tag_names)
54
+ Array(tag_names).each do |name|
55
+ previous = self[name]
56
+ register(name, yield(previous))
57
+ end
58
+ self
59
+ end
60
+
40
61
  # Get handler for a tag name
41
62
  # @param tag_name [String]
42
63
  # @return [BaseHandler, nil]
@@ -69,8 +90,10 @@ module Markbridge
69
90
  # Create the default handler registry with common BBCode tags.
70
91
  #
71
92
  # Each call returns a *fresh* instance — mutations made to one will
72
- # not be visible to another. If you want a process-wide singleton,
73
- # use {Markbridge.default_handlers} instead, which memoizes.
93
+ # not be visible to another. Convenience methods on +Markbridge+
94
+ # build a fresh default registry per call when none is supplied;
95
+ # to share state across calls, build one once and pass it via
96
+ # the +handlers:+ kwarg.
74
97
  #
75
98
  # @param closing_strategy [Object, nil] optional closing strategy to apply, defaults to Reordering strategy
76
99
  # @return [HandlerRegistry]
@@ -32,11 +32,22 @@ module Markbridge
32
32
  private
33
33
 
34
34
  def create_element(token:, content:)
35
- language = token.attrs[:lang] || token.attrs[:option]
36
- element = @element_class.new(language:)
35
+ element =
36
+ if accepts_language?
37
+ @element_class.new(language: token.attrs[:lang] || token.attrs[:option])
38
+ else
39
+ @element_class.new
40
+ end
37
41
  element << AST::Text.new(content) unless content.empty?
38
42
  element
39
43
  end
44
+
45
+ def accepts_language?
46
+ @element_class
47
+ .instance_method(:initialize)
48
+ .parameters
49
+ .any? { |_kind, name| name == :language }
50
+ end
40
51
  end
41
52
  end
42
53
  end
@@ -3,23 +3,114 @@
3
3
  module Markbridge
4
4
  module Parsers
5
5
  module HTML
6
- # Registry of HTML tag handlers
6
+ # Registry of HTML tag handlers and per-tag-name parser configuration.
7
+ #
8
+ # Handlers map a tag name to a handler instance. `block_level_tags` and
9
+ # `whitespace_preserving_tags` configure parser whitespace behavior by
10
+ # tag name, independent of whether a handler is registered — so unknown
11
+ # tags like <div> or <section> still trigger boundary collapsing and
12
+ # <pre>/<code> still pass through verbatim. Both sets are mutable, so
13
+ # downstream consumers can add or remove tags freely:
14
+ #
15
+ # registry = HandlerRegistry.default
16
+ # registry.block_level_tags << "my-block"
17
+ # registry.whitespace_preserving_tags.delete("tt")
7
18
  class HandlerRegistry
19
+ # HTML5 block-level elements (per MDN). The trim-before-block rule
20
+ # applies to these regardless of whether a handler is registered.
21
+ DEFAULT_BLOCK_LEVEL_TAGS = %w[
22
+ address
23
+ article
24
+ aside
25
+ blockquote
26
+ canvas
27
+ dd
28
+ details
29
+ dialog
30
+ div
31
+ dl
32
+ dt
33
+ fieldset
34
+ figcaption
35
+ figure
36
+ footer
37
+ form
38
+ h1
39
+ h2
40
+ h3
41
+ h4
42
+ h5
43
+ h6
44
+ header
45
+ hgroup
46
+ hr
47
+ html
48
+ li
49
+ main
50
+ nav
51
+ noscript
52
+ ol
53
+ output
54
+ p
55
+ pre
56
+ section
57
+ table
58
+ tbody
59
+ td
60
+ tfoot
61
+ th
62
+ thead
63
+ tr
64
+ ul
65
+ video
66
+ ].freeze
67
+
68
+ # Tags whose default CSS preserves source whitespace
69
+ # (`white-space: pre*`). Text inside these is passed through
70
+ # verbatim; outside, runs of ASCII whitespace (space, tab, CR, LF, FF) collapse to a single space.
71
+ DEFAULT_WHITESPACE_PRESERVING_TAGS = %w[pre code textarea tt].freeze
72
+
73
+ # @return [Set<String>] mutable set of tag names treated as block-level.
74
+ attr_reader :block_level_tags
75
+
76
+ # @return [Set<String>] mutable set of tag names whose contents
77
+ # preserve source whitespace.
78
+ attr_reader :whitespace_preserving_tags
79
+
8
80
  def initialize
9
81
  @handlers = {}
82
+ @block_level_tags = Set.new(DEFAULT_BLOCK_LEVEL_TAGS)
83
+ @whitespace_preserving_tags = Set.new(DEFAULT_WHITESPACE_PRESERVING_TAGS)
10
84
  end
11
85
 
12
86
  # Register a handler for one or more tag names
13
87
  # @param tag_names [String, Array<String>] tag name(s) to register
14
- # @param handler [BaseHandler, Proc] the handler instance or proc
88
+ # @param handler [BaseHandler] the handler instance; it must
89
+ # respond to +#process(element:, parent:)+
15
90
  def register(tag_names, handler)
16
91
  Array(tag_names).each { |tag_name| @handlers[tag_name.to_s.downcase] = handler }
17
92
  self
18
93
  end
19
94
 
95
+ # Replace the handler bound to one or more tag names by yielding
96
+ # the previously-bound handler (which may be +nil+) and
97
+ # registering whatever the block returns. Used to install a
98
+ # delegating handler that wraps the default.
99
+ #
100
+ # @param tag_names [String, Array<String>]
101
+ # @yieldparam previous [BaseHandler, nil] previously bound handler
102
+ # @return [self]
103
+ def overlay(tag_names)
104
+ Array(tag_names).each do |name|
105
+ previous = self[name]
106
+ register(name, yield(previous))
107
+ end
108
+ self
109
+ end
110
+
20
111
  # Get handler for a tag name
21
112
  # @param tag_name [String]
22
- # @return [BaseHandler, Proc, nil]
113
+ # @return [BaseHandler, nil]
23
114
  def [](tag_name)
24
115
  @handlers[tag_name.to_s.downcase]
25
116
  end
@@ -38,26 +129,15 @@ module Markbridge
38
129
  registry.register("a", Handlers::UrlHandler.new)
39
130
  registry.register("img", Handlers::ImageHandler.new)
40
131
  registry.register("blockquote", Handlers::QuoteHandler.new)
41
- registry.register(
42
- "br",
43
- lambda do |element:, parent:|
44
- parent << AST::LineBreak.new
45
- nil
46
- end,
47
- )
48
- registry.register(
49
- "hr",
50
- lambda do |element:, parent:|
51
- parent << AST::HorizontalRule.new
52
- nil
53
- end,
54
- )
132
+ registry.register("br", Handlers::SelfClosingHandler.new(AST::LineBreak))
133
+ registry.register("hr", Handlers::SelfClosingHandler.new(AST::HorizontalRule))
55
134
  registry.register(%w[ul ol], Handlers::ListHandler.new)
56
135
  registry.register("li", Handlers::ListItemHandler.new)
57
136
  registry.register("table", Handlers::TableHandler.new)
58
137
  registry.register("tr", Handlers::TableRowHandler.new)
59
138
  registry.register(%w[td th], Handlers::TableCellHandler.new)
60
139
  registry.register("p", Handlers::ParagraphHandler.new)
140
+ registry.register("span", Handlers::SpanHandler.new)
61
141
  end
62
142
  end
63
143
 
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Markbridge
4
+ module Parsers
5
+ module HTML
6
+ module Handlers
7
+ # Handler for self-closing leaf tags (br, hr, etc.). Creates
8
+ # an instance of +element_class+, appends it to +parent+, and
9
+ # returns nil so the parser does not try to recurse into
10
+ # children.
11
+ class SelfClosingHandler < BaseHandler
12
+ def initialize(element_class)
13
+ @element_class = element_class
14
+ end
15
+
16
+ def process(element:, parent:)
17
+ parent << @element_class.new
18
+ nil
19
+ end
20
+
21
+ attr_reader :element_class
22
+ end
23
+ end
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,74 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Markbridge
4
+ module Parsers
5
+ module HTML
6
+ module Handlers
7
+ # Maps recognized inline `style` declarations on `<span>` to AST
8
+ # formatting nodes. Supports text-decoration (underline,
9
+ # line-through), font-weight (bold), and font-style (italic). When
10
+ # multiple recognized styles are set, AST elements are nested in
11
+ # declaration order. Unrecognized styles are ignored; a span with
12
+ # no recognized styles is transparent (children processed into the
13
+ # parent).
14
+ class SpanHandler < BaseHandler
15
+ STYLE_DECLARATION = /([a-z-]+)\s*:\s*([^;]+)/i
16
+ BOLD_THRESHOLD = 600
17
+
18
+ def process(element:, parent:)
19
+ ast_classes_for(element["style"]).reduce(parent) do |current, klass|
20
+ child = klass.new
21
+ current << child
22
+ child
23
+ end
24
+ end
25
+
26
+ private
27
+
28
+ def ast_classes_for(style)
29
+ return [] if style.nil?
30
+
31
+ classes = []
32
+ style.scan(STYLE_DECLARATION) do |property, value|
33
+ classes_for_declaration(property.downcase, value.downcase.rstrip).each do |klass|
34
+ classes << klass unless classes.include?(klass)
35
+ end
36
+ end
37
+ classes
38
+ end
39
+
40
+ def classes_for_declaration(property, value)
41
+ case property
42
+ when "text-decoration"
43
+ text_decoration_classes(value)
44
+ when "font-weight"
45
+ bold_value?(value) ? [AST::Bold] : []
46
+ when "font-style"
47
+ italic_value?(value) ? [AST::Italic] : []
48
+ else
49
+ []
50
+ end
51
+ end
52
+
53
+ def text_decoration_classes(value)
54
+ classes = []
55
+ classes << AST::Underline if value.include?("underline")
56
+ classes << AST::Strikethrough if value.include?("line-through")
57
+ classes
58
+ end
59
+
60
+ def bold_value?(value)
61
+ return true if %w[bold bolder].include?(value)
62
+ return false unless value.match?(/\A\d+\z/)
63
+
64
+ Integer(value) >= BOLD_THRESHOLD
65
+ end
66
+
67
+ def italic_value?(value)
68
+ %w[italic oblique].include?(value)
69
+ end
70
+ end
71
+ end
72
+ end
73
+ end
74
+ end
@@ -10,6 +10,8 @@ module Markbridge
10
10
  # JavaScript, or document metadata that shouldn't appear in output.
11
11
  IGNORED_TAGS = %w[style script head title noscript template].freeze
12
12
 
13
+ WHITESPACE_RUN = /[ \t\r\n\f]+/
14
+
13
15
  attr_reader :unknown_tags
14
16
 
15
17
  # Create a new parser with optional custom handlers
@@ -25,8 +27,22 @@ module Markbridge
25
27
  @unknown_tags = Hash.new(0)
26
28
  end
27
29
 
28
- # Parse HTML string into an AST
29
- # @param input [String] HTML source
30
+ # Parse HTML into an AST.
31
+ #
32
+ # Accepts either a String of HTML source or a pre-parsed
33
+ # Nokogiri node (typically a +DocumentFragment+ from
34
+ # +Nokogiri::HTML.fragment+ or a full +Document+ from
35
+ # +Nokogiri::HTML.parse+). Passing a pre-parsed tree lets a
36
+ # caller run their own Nokogiri-driven pre-processing without
37
+ # forcing Markbridge to re-parse the same bytes.
38
+ #
39
+ # A +Nokogiri::HTML::Document+ is unwrapped to its +<body>+
40
+ # children so the +<html>+ / +<body>+ / +<head>+ wrappers
41
+ # don't pollute {#unknown_tags}; fragments and bare elements
42
+ # iterate their own children directly.
43
+ #
44
+ # @param input [String, Nokogiri::XML::Node] HTML source or
45
+ # pre-parsed Nokogiri tree
30
46
  # @return [AST::Document]
31
47
  def parse(input)
32
48
  @unknown_tags.clear
@@ -36,13 +52,21 @@ module Markbridge
36
52
  # (see sparklemotion/nokogiri#2227). Table support treats thead/tbody/tfoot
37
53
  # as transparent, so the parse-tree difference (HTML5 auto-inserts tbody,
38
54
  # HTML4 does not) has no effect on the AST.
39
- doc = Nokogiri::HTML.fragment(input)
55
+ doc =
56
+ if input.is_a?(Nokogiri::XML::Node)
57
+ input
58
+ else
59
+ Nokogiri::HTML.fragment(input.to_s)
60
+ end
61
+
62
+ children = doc.is_a?(Nokogiri::HTML::Document) ? body_children(doc) : doc.children
40
63
 
41
64
  # Create root AST document
42
65
  document = AST::Document.new
43
66
 
44
67
  # Process all nodes
45
- doc.children.each { |node| process_node(node, document) }
68
+ children.each { |node| process_node(node, document) }
69
+ trim_trailing_whitespace(document)
46
70
 
47
71
  document
48
72
  end
@@ -72,7 +96,19 @@ module Markbridge
72
96
  # @param node [Nokogiri::XML::Text]
73
97
  # @param parent [AST::Element]
74
98
  def process_text_node(node, parent)
75
- parent << AST::Text.new(node.text)
99
+ if preserves_whitespace?(node)
100
+ parent << AST::Text.new(node.text)
101
+ return
102
+ end
103
+
104
+ text = node.text.gsub(WHITESPACE_RUN, " ")
105
+ # Drop leading whitespace at the start of an element's content,
106
+ # matching the browser rule that whitespace at the beginning of a
107
+ # block (or before any inline content) is collapsed away.
108
+ text = text.lstrip if parent.children.empty?
109
+ return if text.empty?
110
+
111
+ parent << AST::Text.new(text)
76
112
  end
77
113
 
78
114
  # Process an element node
@@ -82,21 +118,24 @@ module Markbridge
82
118
  tag_name = node.name
83
119
  return if IGNORED_TAGS.include?(tag_name)
84
120
 
121
+ # Drop whitespace that sits between content and the start of a
122
+ # block-level tag, matching browser behavior where such whitespace
123
+ # collapses against the block boundary. Applies whether or not a
124
+ # handler is registered, so unknown tags like <div> or <section>
125
+ # still collapse the whitespace before them.
126
+ trim_trailing_whitespace(parent) if @handlers.block_level_tags.include?(tag_name)
127
+
85
128
  handler = @handlers[tag_name]
129
+ return handle_unknown_tag(node, parent) unless handler
130
+
131
+ # Handler returns element if children should be processed, nil otherwise
132
+ ast_element = handler.process(element: node, parent:)
86
133
 
87
- if handler
88
- # Handler returns element if children should be processed, nil otherwise
89
- ast_element =
90
- if handler.respond_to?(:process)
91
- handler.process(element: node, parent:)
92
- else
93
- handler.call(element: node, parent:)
94
- end
95
-
96
- # Automatically process children if handler returned element
97
- process_children(node, ast_element) if ast_element
98
- else
99
- handle_unknown_tag(node, parent)
134
+ return unless ast_element
135
+
136
+ process_children(node, ast_element)
137
+ unless @handlers.whitespace_preserving_tags.include?(tag_name)
138
+ trim_trailing_whitespace(ast_element)
100
139
  end
101
140
  end
102
141
 
@@ -108,6 +147,37 @@ module Markbridge
108
147
  @unknown_tags[node.name] += 1
109
148
  process_children(node, parent)
110
149
  end
150
+
151
+ # Whether `node` is inside a tag that preserves source whitespace.
152
+ # @param node [Nokogiri::XML::Node]
153
+ # @return [Boolean]
154
+ def preserves_whitespace?(node)
155
+ node.ancestors.any? do |ancestor|
156
+ @handlers.whitespace_preserving_tags.include?(ancestor.name)
157
+ end
158
+ end
159
+
160
+ # Direct children of the +<body>+ element of a full HTML document,
161
+ # falling back to the document's own children if no +<body>+ exists
162
+ # (malformed input).
163
+ # @param doc [Nokogiri::HTML::Document]
164
+ # @return [Nokogiri::XML::NodeSet]
165
+ def body_children(doc)
166
+ (doc.at_css("body") || doc).children
167
+ end
168
+
169
+ # Strip trailing whitespace from the last Text child of `element`.
170
+ # Removes the child entirely if it becomes empty. No-op if the last
171
+ # child is not a Text node.
172
+ # @param element [AST::Element]
173
+ def trim_trailing_whitespace(element)
174
+ last = element.children.last
175
+ return unless last.instance_of?(AST::Text)
176
+
177
+ trimmed = last.text.rstrip
178
+ element.children.pop
179
+ element << AST::Text.new(trimmed) unless trimmed.empty?
180
+ end
111
181
  end
112
182
  end
113
183
  end
@@ -10,6 +10,7 @@ require_relative "../ast"
10
10
  # Handlers
11
11
  require_relative "html/handlers/base_handler"
12
12
  require_relative "html/handlers/simple_handler"
13
+ require_relative "html/handlers/self_closing_handler"
13
14
  require_relative "html/handlers/raw_handler"
14
15
  require_relative "html/handlers/url_handler"
15
16
  require_relative "html/handlers/image_handler"
@@ -20,6 +21,7 @@ require_relative "html/handlers/paragraph_handler"
20
21
  require_relative "html/handlers/table_handler"
21
22
  require_relative "html/handlers/table_row_handler"
22
23
  require_relative "html/handlers/table_cell_handler"
24
+ require_relative "html/handlers/span_handler"
23
25
 
24
26
  # Parser components
25
27
  require_relative "html/handler_registry"