html-to-markdown 2.16.0 → 2.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -175,6 +175,7 @@ mod enabled {
175
175
  pub use enabled::{maybe_profile, start, stop};
176
176
 
177
177
  #[cfg(target_os = "windows")]
178
+ #[allow(dead_code)]
178
179
  pub fn start(_output_path: PathBuf, _frequency: i32) -> Result<()> {
179
180
  Err(ConversionError::Other(
180
181
  "Profiling is not supported on Windows".to_string(),
@@ -182,6 +183,7 @@ pub fn start(_output_path: PathBuf, _frequency: i32) -> Result<()> {
182
183
  }
183
184
 
184
185
  #[cfg(all(not(target_os = "windows"), not(feature = "profiling")))]
186
+ #[allow(dead_code)]
185
187
  pub fn start(_output_path: PathBuf, _frequency: i32) -> Result<()> {
186
188
  Err(ConversionError::Other(
187
189
  "Profiling is disabled; rebuild with the profiling feature".to_string(),
@@ -189,6 +191,7 @@ pub fn start(_output_path: PathBuf, _frequency: i32) -> Result<()> {
189
191
  }
190
192
 
191
193
  #[cfg(target_os = "windows")]
194
+ #[allow(dead_code)]
192
195
  pub fn stop() -> Result<()> {
193
196
  Err(ConversionError::Other(
194
197
  "Profiling is not supported on Windows".to_string(),
@@ -196,6 +199,7 @@ pub fn stop() -> Result<()> {
196
199
  }
197
200
 
198
201
  #[cfg(all(not(target_os = "windows"), not(feature = "profiling")))]
202
+ #[allow(dead_code)]
199
203
  pub fn stop() -> Result<()> {
200
204
  Err(ConversionError::Other(
201
205
  "Profiling is disabled; rebuild with the profiling feature".to_string(),
@@ -36,7 +36,7 @@ Gem::Specification.new do |spec|
36
36
  html-to-markdown is a native Ruby extension built on the shared Rust engine that powers the html-to-markdown project.
37
37
  It delivers identical HTML-to-Markdown output across languages, exposes inline image extraction, and ships with a CLI for automation workflows.
38
38
  DESC
39
- spec.homepage = 'https://github.com/Goldziher/html-to-markdown'
39
+ spec.homepage = 'https://github.com/kreuzberg-dev/html-to-markdown'
40
40
  spec.license = 'MIT'
41
41
 
42
42
  spec.required_ruby_version = Gem::Requirement.new('>= 3.2')
@@ -52,9 +52,9 @@ Gem::Specification.new do |spec|
52
52
 
53
53
  spec.add_dependency 'rb_sys', '>= 0.9', '< 1.0'
54
54
  spec.metadata['rubygems_mfa_required'] = 'true'
55
- spec.metadata['homepage_uri'] = 'https://github.com/Goldziher/html-to-markdown'
56
- spec.metadata['source_code_uri'] = 'https://github.com/Goldziher/html-to-markdown'
57
- spec.metadata['bug_tracker_uri'] = 'https://github.com/Goldziher/html-to-markdown/issues'
58
- spec.metadata['changelog_uri'] = 'https://github.com/Goldziher/html-to-markdown/releases'
59
- spec.metadata['documentation_uri'] = 'https://github.com/Goldziher/html-to-markdown/blob/main/packages/ruby/README.md'
55
+ spec.metadata['homepage_uri'] = 'https://github.com/kreuzberg-dev/html-to-markdown'
56
+ spec.metadata['source_code_uri'] = 'https://github.com/kreuzberg-dev/html-to-markdown'
57
+ spec.metadata['bug_tracker_uri'] = 'https://github.com/kreuzberg-dev/html-to-markdown/issues'
58
+ spec.metadata['changelog_uri'] = 'https://github.com/kreuzberg-dev/html-to-markdown/releases'
59
+ spec.metadata['documentation_uri'] = 'https://github.com/kreuzberg-dev/html-to-markdown/blob/main/packages/ruby/README.md'
60
60
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HtmlToMarkdown
4
- VERSION = '2.16.0'
4
+ VERSION = '2.18.0'
5
5
  end
@@ -6,6 +6,32 @@ module HtmlToMarkdown
6
6
  class Options
7
7
  end
8
8
 
9
+ # Visitor context information passed to visitor callbacks
10
+ class NodeContext
11
+ attr_reader node_type: Symbol
12
+ attr_reader tag_name: String
13
+ attr_reader attributes: Hash[String, String]
14
+ attr_reader depth: Integer
15
+ attr_reader index_in_parent: Integer
16
+ attr_reader parent_tag: String | nil
17
+ attr_reader is_inline: bool
18
+ end
19
+
20
+ # Result of a visitor callback
21
+ type visitor_result = {
22
+ type: :continue,
23
+ } | {
24
+ type: :custom,
25
+ output: String,
26
+ } | {
27
+ type: :skip,
28
+ } | {
29
+ type: :preserve_html,
30
+ } | {
31
+ type: :error,
32
+ message: String,
33
+ }
34
+
9
35
  type heading_style = :underlined | :atx | :atx_closed
10
36
  type list_indent_type = :spaces | :tabs
11
37
  type highlight_style = :double_equal | :html | :bold | :none
@@ -88,6 +114,7 @@ module HtmlToMarkdown
88
114
  }
89
115
 
90
116
  type metadata_config = {
117
+ extract_document?: bool,
91
118
  extract_headers?: bool,
92
119
  extract_links?: bool,
93
120
  extract_images?: bool,
@@ -182,6 +209,11 @@ module HtmlToMarkdown
182
209
  conversion_options? options,
183
210
  metadata_config? metadata_config
184
211
  ) -> [String, extended_metadata]
212
+ def self.native_convert_with_visitor: (
213
+ String html,
214
+ conversion_options? options,
215
+ visitor? visitor
216
+ ) -> String
185
217
 
186
218
  def native_convert: (String html, conversion_options? options) -> String
187
219
  def native_options: (conversion_options? options_hash) -> Options
@@ -206,6 +238,14 @@ module HtmlToMarkdown
206
238
  conversion_options? options,
207
239
  metadata_config? metadata_config
208
240
  ) -> [String, extended_metadata]
241
+ def native_convert_with_visitor: (
242
+ String html,
243
+ conversion_options? options,
244
+ visitor? visitor
245
+ ) -> String
246
+
247
+ # Visitor interface for customizing conversion behavior
248
+ type visitor = Object
209
249
 
210
250
  public
211
251
 
@@ -230,6 +270,75 @@ module HtmlToMarkdown
230
270
  ?inline_image_config image_config
231
271
  ) -> html_extraction
232
272
 
273
+ # Convert HTML to Markdown with a custom visitor
274
+ #
275
+ # The visitor object can implement any of the following methods:
276
+ # - visit_element_start(ctx) -> visitor_result
277
+ # - visit_element_end(ctx, output) -> visitor_result
278
+ # - visit_text(ctx, text) -> visitor_result
279
+ # - visit_link(ctx, href, text, title) -> visitor_result
280
+ # - visit_image(ctx, src, alt, title) -> visitor_result
281
+ # - visit_heading(ctx, level, text, id) -> visitor_result
282
+ # - visit_code_block(ctx, lang, code) -> visitor_result
283
+ # - visit_code_inline(ctx, code) -> visitor_result
284
+ # - visit_list_item(ctx, ordered, marker, text) -> visitor_result
285
+ # - visit_list_start(ctx, ordered) -> visitor_result
286
+ # - visit_list_end(ctx, ordered, output) -> visitor_result
287
+ # - visit_table_start(ctx) -> visitor_result
288
+ # - visit_table_row(ctx, cells, is_header) -> visitor_result
289
+ # - visit_table_end(ctx, output) -> visitor_result
290
+ # - visit_blockquote(ctx, content, depth) -> visitor_result
291
+ # - visit_strong(ctx, text) -> visitor_result
292
+ # - visit_emphasis(ctx, text) -> visitor_result
293
+ # - visit_strikethrough(ctx, text) -> visitor_result
294
+ # - visit_underline(ctx, text) -> visitor_result
295
+ # - visit_subscript(ctx, text) -> visitor_result
296
+ # - visit_superscript(ctx, text) -> visitor_result
297
+ # - visit_mark(ctx, text) -> visitor_result
298
+ # - visit_line_break(ctx) -> visitor_result
299
+ # - visit_horizontal_rule(ctx) -> visitor_result
300
+ # - visit_custom_element(ctx, tag_name, html) -> visitor_result
301
+ # - visit_definition_list_start(ctx) -> visitor_result
302
+ # - visit_definition_term(ctx, text) -> visitor_result
303
+ # - visit_definition_description(ctx, text) -> visitor_result
304
+ # - visit_definition_list_end(ctx, output) -> visitor_result
305
+ # - visit_form(ctx, action, method) -> visitor_result
306
+ # - visit_input(ctx, input_type, name, value) -> visitor_result
307
+ # - visit_button(ctx, text) -> visitor_result
308
+ # - visit_audio(ctx, src) -> visitor_result
309
+ # - visit_video(ctx, src) -> visitor_result
310
+ # - visit_iframe(ctx, src) -> visitor_result
311
+ # - visit_details(ctx, open) -> visitor_result
312
+ # - visit_summary(ctx, text) -> visitor_result
313
+ # - visit_figure_start(ctx) -> visitor_result
314
+ # - visit_figcaption(ctx, text) -> visitor_result
315
+ # - visit_figure_end(ctx, output) -> visitor_result
316
+ #
317
+ # Each method should return a Hash with at least :type key:
318
+ # { type: :continue } - Continue with default behavior
319
+ # { type: :custom, output: "..." } - Replace with custom markdown
320
+ # { type: :skip } - Skip this element entirely
321
+ # { type: :preserve_html } - Keep original HTML
322
+ # { type: :error, message: "..." } - Stop conversion with error
323
+ #
324
+ # Args:
325
+ # html: HTML string to convert
326
+ # options: Optional conversion configuration
327
+ # visitor: Visitor object that responds to visitor callback methods
328
+ #
329
+ # Returns:
330
+ # markdown: String - Converted markdown output
331
+ #
332
+ # Example:
333
+ # class MyVisitor
334
+ # def visit_link(ctx, href, text, title = nil)
335
+ # { type: :custom, output: "[#{text}](#{href})" }
336
+ # end
337
+ # end
338
+ #
339
+ # HtmlToMarkdown.convert_with_visitor(html, visitor: MyVisitor.new)
340
+ def self.convert_with_visitor: (String html, ?conversion_options options, visitor: visitor) -> String
341
+
233
342
  # Convert HTML to Markdown with metadata extraction
234
343
  #
235
344
  # Extracts comprehensive metadata (headers, links, images, structured data) during conversion.
@@ -281,6 +390,7 @@ module HtmlToMarkdown
281
390
  ?conversion_options options,
282
391
  ?inline_image_config image_config
283
392
  ) -> html_extraction
393
+ def convert_with_visitor: (String html, ?conversion_options options, visitor: visitor) -> String
284
394
  def convert_with_metadata: (
285
395
  String html,
286
396
  ?conversion_options options,