html-to-markdown 2.16.0 → 2.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +90 -10
- data/README.md +99 -302
- data/bin/benchmark.rb +100 -12
- data/ext/html-to-markdown-rb/native/Cargo.toml +7 -5
- data/ext/html-to-markdown-rb/native/README.md +5 -5
- data/ext/html-to-markdown-rb/native/src/lib.rs +951 -0
- data/ext/html-to-markdown-rb/native/src/profiling.rs +4 -0
- data/html-to-markdown-rb.gemspec +6 -6
- data/lib/html_to_markdown/version.rb +1 -1
- data/sig/html_to_markdown.rbs +110 -0
- data/spec/visitor_spec.rb +1149 -0
- metadata +9 -8
|
@@ -175,6 +175,7 @@ mod enabled {
|
|
|
175
175
|
pub use enabled::{maybe_profile, start, stop};
|
|
176
176
|
|
|
177
177
|
#[cfg(target_os = "windows")]
|
|
178
|
+
#[allow(dead_code)]
|
|
178
179
|
pub fn start(_output_path: PathBuf, _frequency: i32) -> Result<()> {
|
|
179
180
|
Err(ConversionError::Other(
|
|
180
181
|
"Profiling is not supported on Windows".to_string(),
|
|
@@ -182,6 +183,7 @@ pub fn start(_output_path: PathBuf, _frequency: i32) -> Result<()> {
|
|
|
182
183
|
}
|
|
183
184
|
|
|
184
185
|
#[cfg(all(not(target_os = "windows"), not(feature = "profiling")))]
|
|
186
|
+
#[allow(dead_code)]
|
|
185
187
|
pub fn start(_output_path: PathBuf, _frequency: i32) -> Result<()> {
|
|
186
188
|
Err(ConversionError::Other(
|
|
187
189
|
"Profiling is disabled; rebuild with the profiling feature".to_string(),
|
|
@@ -189,6 +191,7 @@ pub fn start(_output_path: PathBuf, _frequency: i32) -> Result<()> {
|
|
|
189
191
|
}
|
|
190
192
|
|
|
191
193
|
#[cfg(target_os = "windows")]
|
|
194
|
+
#[allow(dead_code)]
|
|
192
195
|
pub fn stop() -> Result<()> {
|
|
193
196
|
Err(ConversionError::Other(
|
|
194
197
|
"Profiling is not supported on Windows".to_string(),
|
|
@@ -196,6 +199,7 @@ pub fn stop() -> Result<()> {
|
|
|
196
199
|
}
|
|
197
200
|
|
|
198
201
|
#[cfg(all(not(target_os = "windows"), not(feature = "profiling")))]
|
|
202
|
+
#[allow(dead_code)]
|
|
199
203
|
pub fn stop() -> Result<()> {
|
|
200
204
|
Err(ConversionError::Other(
|
|
201
205
|
"Profiling is disabled; rebuild with the profiling feature".to_string(),
|
data/html-to-markdown-rb.gemspec
CHANGED
|
@@ -36,7 +36,7 @@ Gem::Specification.new do |spec|
|
|
|
36
36
|
html-to-markdown is a native Ruby extension built on the shared Rust engine that powers the html-to-markdown project.
|
|
37
37
|
It delivers identical HTML-to-Markdown output across languages, exposes inline image extraction, and ships with a CLI for automation workflows.
|
|
38
38
|
DESC
|
|
39
|
-
spec.homepage = 'https://github.com/
|
|
39
|
+
spec.homepage = 'https://github.com/kreuzberg-dev/html-to-markdown'
|
|
40
40
|
spec.license = 'MIT'
|
|
41
41
|
|
|
42
42
|
spec.required_ruby_version = Gem::Requirement.new('>= 3.2')
|
|
@@ -52,9 +52,9 @@ Gem::Specification.new do |spec|
|
|
|
52
52
|
|
|
53
53
|
spec.add_dependency 'rb_sys', '>= 0.9', '< 1.0'
|
|
54
54
|
spec.metadata['rubygems_mfa_required'] = 'true'
|
|
55
|
-
spec.metadata['homepage_uri'] = 'https://github.com/
|
|
56
|
-
spec.metadata['source_code_uri'] = 'https://github.com/
|
|
57
|
-
spec.metadata['bug_tracker_uri'] = 'https://github.com/
|
|
58
|
-
spec.metadata['changelog_uri'] = 'https://github.com/
|
|
59
|
-
spec.metadata['documentation_uri'] = 'https://github.com/
|
|
55
|
+
spec.metadata['homepage_uri'] = 'https://github.com/kreuzberg-dev/html-to-markdown'
|
|
56
|
+
spec.metadata['source_code_uri'] = 'https://github.com/kreuzberg-dev/html-to-markdown'
|
|
57
|
+
spec.metadata['bug_tracker_uri'] = 'https://github.com/kreuzberg-dev/html-to-markdown/issues'
|
|
58
|
+
spec.metadata['changelog_uri'] = 'https://github.com/kreuzberg-dev/html-to-markdown/releases'
|
|
59
|
+
spec.metadata['documentation_uri'] = 'https://github.com/kreuzberg-dev/html-to-markdown/blob/main/packages/ruby/README.md'
|
|
60
60
|
end
|
data/sig/html_to_markdown.rbs
CHANGED
|
@@ -6,6 +6,32 @@ module HtmlToMarkdown
|
|
|
6
6
|
class Options
|
|
7
7
|
end
|
|
8
8
|
|
|
9
|
+
# Visitor context information passed to visitor callbacks
|
|
10
|
+
class NodeContext
|
|
11
|
+
attr_reader node_type: Symbol
|
|
12
|
+
attr_reader tag_name: String
|
|
13
|
+
attr_reader attributes: Hash[String, String]
|
|
14
|
+
attr_reader depth: Integer
|
|
15
|
+
attr_reader index_in_parent: Integer
|
|
16
|
+
attr_reader parent_tag: String | nil
|
|
17
|
+
attr_reader is_inline: bool
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
# Result of a visitor callback
|
|
21
|
+
type visitor_result = {
|
|
22
|
+
type: :continue,
|
|
23
|
+
} | {
|
|
24
|
+
type: :custom,
|
|
25
|
+
output: String,
|
|
26
|
+
} | {
|
|
27
|
+
type: :skip,
|
|
28
|
+
} | {
|
|
29
|
+
type: :preserve_html,
|
|
30
|
+
} | {
|
|
31
|
+
type: :error,
|
|
32
|
+
message: String,
|
|
33
|
+
}
|
|
34
|
+
|
|
9
35
|
type heading_style = :underlined | :atx | :atx_closed
|
|
10
36
|
type list_indent_type = :spaces | :tabs
|
|
11
37
|
type highlight_style = :double_equal | :html | :bold | :none
|
|
@@ -88,6 +114,7 @@ module HtmlToMarkdown
|
|
|
88
114
|
}
|
|
89
115
|
|
|
90
116
|
type metadata_config = {
|
|
117
|
+
extract_document?: bool,
|
|
91
118
|
extract_headers?: bool,
|
|
92
119
|
extract_links?: bool,
|
|
93
120
|
extract_images?: bool,
|
|
@@ -182,6 +209,11 @@ module HtmlToMarkdown
|
|
|
182
209
|
conversion_options? options,
|
|
183
210
|
metadata_config? metadata_config
|
|
184
211
|
) -> [String, extended_metadata]
|
|
212
|
+
def self.native_convert_with_visitor: (
|
|
213
|
+
String html,
|
|
214
|
+
conversion_options? options,
|
|
215
|
+
visitor? visitor
|
|
216
|
+
) -> String
|
|
185
217
|
|
|
186
218
|
def native_convert: (String html, conversion_options? options) -> String
|
|
187
219
|
def native_options: (conversion_options? options_hash) -> Options
|
|
@@ -206,6 +238,14 @@ module HtmlToMarkdown
|
|
|
206
238
|
conversion_options? options,
|
|
207
239
|
metadata_config? metadata_config
|
|
208
240
|
) -> [String, extended_metadata]
|
|
241
|
+
def native_convert_with_visitor: (
|
|
242
|
+
String html,
|
|
243
|
+
conversion_options? options,
|
|
244
|
+
visitor? visitor
|
|
245
|
+
) -> String
|
|
246
|
+
|
|
247
|
+
# Visitor interface for customizing conversion behavior
|
|
248
|
+
type visitor = Object
|
|
209
249
|
|
|
210
250
|
public
|
|
211
251
|
|
|
@@ -230,6 +270,75 @@ module HtmlToMarkdown
|
|
|
230
270
|
?inline_image_config image_config
|
|
231
271
|
) -> html_extraction
|
|
232
272
|
|
|
273
|
+
# Convert HTML to Markdown with a custom visitor
|
|
274
|
+
#
|
|
275
|
+
# The visitor object can implement any of the following methods:
|
|
276
|
+
# - visit_element_start(ctx) -> visitor_result
|
|
277
|
+
# - visit_element_end(ctx, output) -> visitor_result
|
|
278
|
+
# - visit_text(ctx, text) -> visitor_result
|
|
279
|
+
# - visit_link(ctx, href, text, title) -> visitor_result
|
|
280
|
+
# - visit_image(ctx, src, alt, title) -> visitor_result
|
|
281
|
+
# - visit_heading(ctx, level, text, id) -> visitor_result
|
|
282
|
+
# - visit_code_block(ctx, lang, code) -> visitor_result
|
|
283
|
+
# - visit_code_inline(ctx, code) -> visitor_result
|
|
284
|
+
# - visit_list_item(ctx, ordered, marker, text) -> visitor_result
|
|
285
|
+
# - visit_list_start(ctx, ordered) -> visitor_result
|
|
286
|
+
# - visit_list_end(ctx, ordered, output) -> visitor_result
|
|
287
|
+
# - visit_table_start(ctx) -> visitor_result
|
|
288
|
+
# - visit_table_row(ctx, cells, is_header) -> visitor_result
|
|
289
|
+
# - visit_table_end(ctx, output) -> visitor_result
|
|
290
|
+
# - visit_blockquote(ctx, content, depth) -> visitor_result
|
|
291
|
+
# - visit_strong(ctx, text) -> visitor_result
|
|
292
|
+
# - visit_emphasis(ctx, text) -> visitor_result
|
|
293
|
+
# - visit_strikethrough(ctx, text) -> visitor_result
|
|
294
|
+
# - visit_underline(ctx, text) -> visitor_result
|
|
295
|
+
# - visit_subscript(ctx, text) -> visitor_result
|
|
296
|
+
# - visit_superscript(ctx, text) -> visitor_result
|
|
297
|
+
# - visit_mark(ctx, text) -> visitor_result
|
|
298
|
+
# - visit_line_break(ctx) -> visitor_result
|
|
299
|
+
# - visit_horizontal_rule(ctx) -> visitor_result
|
|
300
|
+
# - visit_custom_element(ctx, tag_name, html) -> visitor_result
|
|
301
|
+
# - visit_definition_list_start(ctx) -> visitor_result
|
|
302
|
+
# - visit_definition_term(ctx, text) -> visitor_result
|
|
303
|
+
# - visit_definition_description(ctx, text) -> visitor_result
|
|
304
|
+
# - visit_definition_list_end(ctx, output) -> visitor_result
|
|
305
|
+
# - visit_form(ctx, action, method) -> visitor_result
|
|
306
|
+
# - visit_input(ctx, input_type, name, value) -> visitor_result
|
|
307
|
+
# - visit_button(ctx, text) -> visitor_result
|
|
308
|
+
# - visit_audio(ctx, src) -> visitor_result
|
|
309
|
+
# - visit_video(ctx, src) -> visitor_result
|
|
310
|
+
# - visit_iframe(ctx, src) -> visitor_result
|
|
311
|
+
# - visit_details(ctx, open) -> visitor_result
|
|
312
|
+
# - visit_summary(ctx, text) -> visitor_result
|
|
313
|
+
# - visit_figure_start(ctx) -> visitor_result
|
|
314
|
+
# - visit_figcaption(ctx, text) -> visitor_result
|
|
315
|
+
# - visit_figure_end(ctx, output) -> visitor_result
|
|
316
|
+
#
|
|
317
|
+
# Each method should return a Hash with at least a :type key:
|
|
318
|
+
# { type: :continue } - Continue with default behavior
|
|
319
|
+
# { type: :custom, output: "..." } - Replace with custom markdown
|
|
320
|
+
# { type: :skip } - Skip this element entirely
|
|
321
|
+
# { type: :preserve_html } - Keep original HTML
|
|
322
|
+
# { type: :error, message: "..." } - Stop conversion with error
|
|
323
|
+
#
|
|
324
|
+
# Args:
|
|
325
|
+
# html: HTML string to convert
|
|
326
|
+
# options: Optional conversion configuration
|
|
327
|
+
# visitor: Visitor object that responds to visitor callback methods
|
|
328
|
+
#
|
|
329
|
+
# Returns:
|
|
330
|
+
# markdown: String - Converted markdown output
|
|
331
|
+
#
|
|
332
|
+
# Example:
|
|
333
|
+
# class MyVisitor
|
|
334
|
+
# def visit_link(ctx, href, text, title = nil)
|
|
335
|
+
# { type: :custom, output: "[#{text}](#{href})" }
|
|
336
|
+
# end
|
|
337
|
+
# end
|
|
338
|
+
#
|
|
339
|
+
# HtmlToMarkdown.convert_with_visitor(html, visitor: MyVisitor.new)
|
|
340
|
+
def self.convert_with_visitor: (String html, ?conversion_options options, visitor: visitor) -> String
|
|
341
|
+
|
|
233
342
|
# Convert HTML to Markdown with metadata extraction
|
|
234
343
|
#
|
|
235
344
|
# Extracts comprehensive metadata (headers, links, images, structured data) during conversion.
|
|
@@ -281,6 +390,7 @@ module HtmlToMarkdown
|
|
|
281
390
|
?conversion_options options,
|
|
282
391
|
?inline_image_config image_config
|
|
283
392
|
) -> html_extraction
|
|
393
|
+
def convert_with_visitor: (String html, ?conversion_options options, visitor: visitor) -> String
|
|
284
394
|
def convert_with_metadata: (
|
|
285
395
|
String html,
|
|
286
396
|
?conversion_options options,
|