html-to-markdown 2.16.0 → 2.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/benchmark.rb CHANGED
@@ -23,7 +23,8 @@ end
23
23
  options = {
24
24
  iterations: 50,
25
25
  format: 'html',
26
- scenario: 'convert-default'
26
+ scenario: 'convert-default',
27
+ visitor: nil
27
28
  }
28
29
 
29
30
  OptionParser.new do |parser|
@@ -44,6 +45,10 @@ OptionParser.new do |parser|
44
45
  parser.on('--format FORMAT', 'Fixture format (html or hocr)') do |format|
45
46
  options[:format] = format.downcase
46
47
  end
48
+
49
# --visitor selects which benchmark visitor is built later in the script.
# Only the four names listed below are accepted. Any other value is rejected
# with OptionParser::InvalidArgument instead of being dropped silently, so a
# typo cannot turn a visitor benchmark into a plain run without the user
# noticing.
parser.on('--visitor VISITOR', 'Visitor type (noop, simple, custom, complex)') do |visitor|
  unless %w[noop simple custom complex].include?(visitor)
    raise OptionParser::InvalidArgument, "#{visitor} (expected noop, simple, custom or complex)"
  end

  options[:visitor] = visitor
end
47
52
  end.parse!
48
53
 
49
54
  fixture = options.fetch(:file) do
@@ -74,6 +79,70 @@ unless supported_scenarios.include?(options[:scenario])
74
79
  exit 1
75
80
  end
76
81
 
82
# Visitor factory functions
#
# Every factory returns a new Hash whose :visit_* keys map to procs. Each proc
# returns either the string 'continue', the string 'skip', or a two-element
# array ['custom', replacement_text].

# Builds the nine pass-through callbacks shared by all visitor variants.
# Each proc ignores its arguments and returns 'continue'. A new Hash holding
# new procs is created on every call, so callers can merge overrides into the
# result without affecting other visitors.
def passthrough_visitor_callbacks
  {
    visit_text: proc { |_ctx, _text| 'continue' },
    visit_heading: proc { |_ctx, _level, _text, _id| 'continue' },
    visit_paragraph: proc { |_ctx, _text| 'continue' },
    visit_link: proc { |_ctx, _href, _text, _title| 'continue' },
    visit_image: proc { |_ctx, _src, _alt, _title| 'continue' },
    visit_strong: proc { |_ctx, _text| 'continue' },
    visit_em: proc { |_ctx, _text| 'continue' },
    visit_code: proc { |_ctx, _text| 'continue' },
    visit_br: proc { |_ctx| 'continue' }
  }
end

# Visitor whose callbacks all return 'continue'.
#
# @return [Hash{Symbol => Proc}]
def create_noop_visitor
  passthrough_visitor_callbacks
end

# Pass-through visitor that additionally carries three counter entries
# (:text_count, :link_count, :image_count), each initialised to 0 and placed
# ahead of the callbacks.
# NOTE(review): none of the callbacks built here updates these counters;
# confirm whether they are meant to be incremented.
#
# @return [Hash{Symbol => Integer, Proc}]
def create_simple_visitor
  { text_count: 0, link_count: 0, image_count: 0 }.merge(passthrough_visitor_callbacks)
end

# Visitor that replaces links and images with custom output and passes every
# other node through. Hash#merge keeps the overridden keys in their original
# positions, so key order matches the pass-through table.
#
# @return [Hash{Symbol => Proc}]
def create_custom_visitor
  passthrough_visitor_callbacks.merge(
    visit_link: proc { |_ctx, href, text, _title| ['custom', "LINK[#{text}](#{href})"] },
    visit_image: proc { |_ctx, src, alt, _title| ['custom', "![#{alt}](#{src})"] }
  )
end

# Visitor that carries four counter entries (:texts, :links, :images,
# :headings, all 0), rewrites links as "[text](href)" and returns 'skip' for
# images. Everything else passes through.
# NOTE(review): as with the simple visitor, no callback here touches the
# counters; confirm intent.
#
# @return [Hash{Symbol => Integer, Proc}]
def create_complex_visitor
  callbacks = passthrough_visitor_callbacks.merge(
    visit_link: proc { |_ctx, href, text, _title| ['custom', "[#{text}](#{href})"] },
    visit_image: proc { |_ctx, _src, _alt, _title| 'skip' }
  )
  { texts: 0, links: 0, images: 0, headings: 0 }.merge(callbacks)
end
145
+
77
146
  html = File.binread(fixture)
78
147
  html.force_encoding(Encoding::UTF_8)
79
148
  html.freeze
@@ -83,33 +152,52 @@ options_handle = if %w[convert-options inline-images-options metadata-options].i
83
152
  HtmlToMarkdown.options(conversion_options)
84
153
  end
85
154
 
155
# Build the visitor requested on the command line, if any.
# options[:visitor] holds one of the four names below or nil; any other
# value leaves visitor as nil, which selects the non-visitor path.
visitor = case options[:visitor]
          when 'noop' then create_noop_visitor
          when 'simple' then create_simple_visitor
          when 'custom' then create_custom_visitor
          when 'complex' then create_complex_visitor
          end
167
+
86
168
  SCENARIO_RUNNERS = {
87
- 'convert-default' => ->(html, _options, _handle) { HtmlToMarkdown.convert(html) },
88
- 'convert-options' => lambda do |html, _options, handle|
169
+ 'convert-default' => ->(html, _options, _handle, _visitor) { HtmlToMarkdown.convert(html) },
170
+ 'convert-options' => lambda do |html, _options, handle, _visitor|
89
171
  raise ArgumentError, 'options handle required' unless handle
90
172
 
91
173
  HtmlToMarkdown.convert_with_options(html, handle)
92
174
  end,
93
- 'inline-images-default' => ->(html, _options, _handle) { HtmlToMarkdown.convert_with_inline_images(html, nil, nil) },
94
- 'inline-images-options' => lambda do |html, _options, handle|
175
+ 'inline-images-default' => lambda { |html, _options, _handle, _visitor|
176
+ HtmlToMarkdown.convert_with_inline_images(html, nil, nil)
177
+ },
178
+ 'inline-images-options' => lambda do |html, _options, handle, _visitor|
95
179
  raise ArgumentError, 'options handle required' unless handle
96
180
 
97
181
  HtmlToMarkdown.convert_with_inline_images_handle(html, handle, nil)
98
182
  end,
99
- 'metadata-default' => ->(html, _options, _handle) { HtmlToMarkdown.convert_with_metadata(html, nil, nil) },
100
- 'metadata-options' => lambda do |html, _options, handle|
183
+ 'metadata-default' => ->(html, _options, _handle, _visitor) { HtmlToMarkdown.convert_with_metadata(html, nil, nil) },
184
+ 'metadata-options' => lambda do |html, _options, handle, _visitor|
101
185
  raise ArgumentError, 'options handle required' unless handle
102
186
 
103
187
  HtmlToMarkdown.convert_with_metadata_handle(html, handle, nil)
104
188
  end
105
189
  }.freeze
106
190
 
107
# Performs a single conversion for the benchmark.
#
# @param html [String] markup to convert
# @param scenario [String] key into SCENARIO_RUNNERS
# @param options [Object] forwarded unchanged to the scenario runner
# @param handle [Object, nil] forwarded unchanged to the scenario runner
# @param visitor [Object, nil] when truthy, the conversion goes through
#   HtmlToMarkdown.convert_with_visitor (with nil options) and the scenario
#   runner is not invoked
# @return whatever the selected conversion call returns
# @raise [ArgumentError] if scenario is not a key of SCENARIO_RUNNERS
def run_scenario(html, scenario, options, handle, visitor = nil)
  # Resolve the runner before branching so an unknown scenario is rejected on
  # the visitor path too, instead of being silently accepted there.
  runner = SCENARIO_RUNNERS.fetch(scenario) { raise ArgumentError, "Unsupported scenario: #{scenario}" }
  return HtmlToMarkdown.convert_with_visitor(html, nil, visitor) if visitor

  runner.call(html, options, handle, visitor)
end
111
199
 
112
- run_scenario(html, options[:scenario], conversion_options, options_handle)
200
+ run_scenario(html, options[:scenario], conversion_options, options_handle, visitor)
113
201
 
114
202
  profile_output = ENV.fetch('HTML_TO_MARKDOWN_PROFILE_OUTPUT', nil)
115
203
  if profile_output && HtmlToMarkdown.respond_to?(:start_profiling)
@@ -118,7 +206,7 @@ if profile_output && HtmlToMarkdown.respond_to?(:start_profiling)
118
206
  end
119
207
 
120
208
  start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
121
- iterations.times { run_scenario(html, options[:scenario], conversion_options, options_handle) }
209
+ iterations.times { run_scenario(html, options[:scenario], conversion_options, options_handle, visitor) }
122
210
  elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start
123
211
 
124
212
  HtmlToMarkdown.stop_profiling if profile_output && HtmlToMarkdown.respond_to?(:stop_profiling)
@@ -1,11 +1,11 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rb"
3
- version = "2.16.0"
3
+ version = "2.18.0"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
6
6
  license = "MIT"
7
- repository = "https://github.com/Goldziher/html-to-markdown"
8
- homepage = "https://github.com/Goldziher/html-to-markdown"
7
+ repository = "https://github.com/kreuzberg-dev/html-to-markdown"
8
+ homepage = "https://github.com/kreuzberg-dev/html-to-markdown"
9
9
  documentation = "https://docs.rs/html-to-markdown-rs"
10
10
  readme = "README.md"
11
11
  rust-version = "1.85"
@@ -18,7 +18,7 @@ name = "html_to_markdown_rb"
18
18
  crate-type = ["cdylib", "rlib"]
19
19
 
20
20
  [dependencies]
21
- html-to-markdown-rs = { version = "2.16.0", features = ["inline-images"] }
21
+ html-to-markdown-rs = { version = "2.16.1", features = ["inline-images", "visitor", "metadata"] }
22
22
  magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = ["rb-sys"] }
23
23
 
24
24
  [target.'cfg(not(target_os = "windows"))'.dependencies]
@@ -27,6 +27,8 @@ pprof = { version = "0.15", features = ["flamegraph"], optional = true }
27
27
  [dev-dependencies]
28
28
  pretty_assertions = "1.4"
29
29
  [features]
30
- default = ["metadata"]
30
+ default = ["inline-images", "metadata", "visitor"]
31
+ inline-images = ["html-to-markdown-rs/inline-images"]
31
32
  metadata = ["html-to-markdown-rs/metadata"]
33
+ visitor = ["html-to-markdown-rs/visitor"]
32
34
  profiling = ["dep:pprof"]
@@ -8,7 +8,7 @@ Blazing-fast HTML → Markdown conversion for Ruby, powered by the same Rust eng
8
8
  [![PyPI](https://img.shields.io/pypi/v/html-to-markdown.svg?logo=pypi)](https://pypi.org/project/html-to-markdown/)
9
9
  [![Packagist](https://img.shields.io/packagist/v/goldziher/html-to-markdown.svg)](https://packagist.org/packages/goldziher/html-to-markdown)
10
10
  [![RubyGems](https://badge.fury.io/rb/html-to-markdown.svg)](https://rubygems.org/gems/html-to-markdown)
11
- [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/Goldziher/html-to-markdown/blob/main/LICENSE)
11
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/kreuzberg-dev/html-to-markdown/blob/main/LICENSE)
12
12
 
13
13
  ## Features
14
14
 
@@ -21,10 +21,10 @@ Blazing-fast HTML → Markdown conversion for Ruby, powered by the same Rust eng
21
21
 
22
22
  ## Documentation & Support
23
23
 
24
- - [GitHub repository](https://github.com/Goldziher/html-to-markdown)
25
- - [Issue tracker](https://github.com/Goldziher/html-to-markdown/issues)
26
- - [Changelog](https://github.com/Goldziher/html-to-markdown/blob/main/CHANGELOG.md)
27
- - [Live demo (WASM)](https://goldziher.github.io/html-to-markdown/)
24
+ - [GitHub repository](https://github.com/kreuzberg-dev/html-to-markdown)
25
+ - [Issue tracker](https://github.com/kreuzberg-dev/html-to-markdown/issues)
26
+ - [Changelog](https://github.com/kreuzberg-dev/html-to-markdown/blob/main/CHANGELOG.md)
27
+ - [Live demo (WASM)](https://kreuzberg-dev.github.io/html-to-markdown/)
28
28
 
29
29
  ## Installation
30
30