html-to-markdown 2.29.0-arm64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Steepfile ADDED
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Steepfile for type checking html-to-markdown Ruby gem
4
+
5
# Steep target describing how the gem's library code is type checked.
target :lib do
  # RBS signatures are read from sig/; Ruby sources are checked from lib/.
  signature 'sig'
  check 'lib'

  # Standard-library signatures made available to the checker.
  %w[pathname open3].each { |name| library name }

  # Vendored code, specs and helper scripts are excluded from checking.
  %w[vendor spec bin].each { |dir| ignore dir }

  # Internal implementation modules (not public API) are excluded as well.
  %w[
    lib/html_to_markdown/cli.rb
    lib/html_to_markdown/cli_proxy.rb
  ].each { |path| ignore path }
end
data/bin/benchmark.rb ADDED
@@ -0,0 +1,232 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require 'optparse'
5
+ require 'time'
6
+
7
+ $LOAD_PATH.unshift(File.expand_path('../lib', __dir__))
8
+ require 'html_to_markdown'
9
+
10
# Escapes +value+ so it can be placed between double quotes in a JSON document.
#
# The value is converted with #to_s first. Double quotes and backslashes are
# backslash-escaped; newline, carriage return and tab use their short escapes;
# every other control character below U+0020 is written as \u00XX, because
# JSON does not allow raw control characters inside strings.
#
# @param value [#to_s] value to escape
# @return [String] the escaped text, without surrounding quotes
def json_escape(value)
  value.to_s.gsub(/["\\\x00-\x1f]/) do |char|
    case char
    when '"', '\\' then "\\#{char}"
    when "\n" then '\\n'
    when "\r" then '\\r'
    when "\t" then '\\t'
    else format('\\u%04x', char.ord)
    end
  end
end
22
+
23
# Benchmark settings with their defaults; command-line flags override them.
# :file has no default and is only present when --file was given.
options = {
  iterations: 50,
  format: 'html',
  scenario: 'convert-default',
  visitor: nil
}

# Visitor names accepted by --visitor.
visitor_types = %w[noop simple custom complex]

OptionParser.new do |parser|
  parser.banner = 'ruby benchmark.rb --file path/to/fixture.html [--iterations 200]'

  parser.on('--file FILE', 'HTML fixture to convert repeatedly') do |file|
    options[:file] = file
  end

  # Zero or negative counts are clamped to a single iteration.
  parser.on('--iterations N', Integer, 'Number of conversion iterations (default: 50)') do |n|
    options[:iterations] = n.positive? ? n : 1
  end

  parser.on('--scenario SCENARIO', 'Scenario to benchmark') do |scenario|
    options[:scenario] = scenario
  end

  # The format is matched case-insensitively.
  parser.on('--format FORMAT', 'Fixture format (html or hocr)') do |format|
    options[:format] = format.downcase
  end

  parser.on('--visitor VISITOR', 'Visitor type (noop, simple, custom, complex)') do |visitor|
    # Reject unknown names instead of silently benchmarking without a visitor,
    # which would report numbers for a different code path than requested.
    unless visitor_types.include?(visitor)
      warn "Unsupported visitor: #{visitor}"
      exit 1
    end
    options[:visitor] = visitor
  end
end.parse!
53
+
54
# Reports +message+ on stderr and terminates the script with exit status 1.
bail = lambda do |message|
  warn message
  exit 1
end

# --file is mandatory and must point at an existing path.
fixture = options.key?(:file) ? options[:file] : bail.call('Missing --file parameter')
bail.call("Fixture not found: #{fixture}") unless File.exist?(fixture)

# Only the two known fixture formats are accepted.
bail.call("Unsupported format: #{options[:format]}") unless %w[html hocr].include?(options[:format])

# Scenario names this script knows how to run.
supported_scenarios = %w[
  convert-default
  convert-options
  inline-images-default
  inline-images-options
  metadata-default
  metadata-options
]
bail.call("Unsupported scenario: #{options[:scenario]}") unless supported_scenarios.include?(options[:scenario])
81
+
82
+ # Visitor factory functions
83
# Builds a visitor whose every callback answers 'continue'.
#
# @return [Hash{Symbol => Proc}] callbacks keyed by visit_* hook name
def create_noop_visitor
  verdict = 'continue'
  visitor = {}
  visitor[:visit_text] = proc { |_ctx, _text| verdict }
  visitor[:visit_heading] = proc { |_ctx, _level, _text, _id| verdict }
  visitor[:visit_paragraph] = proc { |_ctx, _text| verdict }
  visitor[:visit_link] = proc { |_ctx, _href, _text, _title| verdict }
  visitor[:visit_image] = proc { |_ctx, _src, _alt, _title| verdict }
  visitor[:visit_strong] = proc { |_ctx, _text| verdict }
  visitor[:visit_em] = proc { |_ctx, _text| verdict }
  visitor[:visit_code] = proc { |_ctx, _text| verdict }
  visitor[:visit_br] = proc { |_ctx| verdict }
  visitor
end
96
+
97
# Builds a visitor hash that carries three zeroed counter entries followed by
# callbacks that all answer 'continue'. The callbacks do not touch the counters.
#
# @return [Hash{Symbol => Integer, Proc}] counters first, then visit_* callbacks
def create_simple_visitor
  counters = { text_count: 0, link_count: 0, image_count: 0 }
  callbacks = {
    visit_text: proc { |_ctx, _text| 'continue' },
    visit_heading: proc { |_ctx, _level, _text, _id| 'continue' },
    visit_paragraph: proc { |_ctx, _text| 'continue' },
    visit_link: proc { |_ctx, _href, _text, _title| 'continue' },
    visit_image: proc { |_ctx, _src, _alt, _title| 'continue' },
    visit_strong: proc { |_ctx, _text| 'continue' },
    visit_em: proc { |_ctx, _text| 'continue' },
    visit_code: proc { |_ctx, _text| 'continue' },
    visit_br: proc { |_ctx| 'continue' }
  }
  counters.merge(callbacks)
end
113
+
114
# Builds a visitor that substitutes its own output for links and images and
# answers 'continue' for every other hook.
#
# Links become ['custom', "LINK[text](href)"]; images become
# ['custom', "![alt](src)"].
#
# @return [Hash{Symbol => Proc}] callbacks keyed by visit_* hook name
def create_custom_visitor
  proceed = 'continue'
  visitor = {
    visit_text: proc { |_ctx, _text| proceed },
    visit_heading: proc { |_ctx, _level, _text, _id| proceed },
    visit_paragraph: proc { |_ctx, _text| proceed }
  }
  visitor[:visit_link] = proc do |_ctx, target, label, _title|
    ['custom', "LINK[#{label}](#{target})"]
  end
  visitor[:visit_image] = proc do |_ctx, source, alt_text, _title|
    ['custom', "![#{alt_text}](#{source})"]
  end
  visitor[:visit_strong] = proc { |_ctx, _text| proceed }
  visitor[:visit_em] = proc { |_ctx, _text| proceed }
  visitor[:visit_code] = proc { |_ctx, _text| proceed }
  visitor[:visit_br] = proc { |_ctx| proceed }
  visitor
end
127
+
128
# Builds a visitor hash with four zeroed counter entries, a link callback that
# substitutes "[text](href)", an image callback that answers 'skip', and
# 'continue' callbacks for the remaining hooks. The callbacks do not update
# the counters.
#
# @return [Hash{Symbol => Integer, Proc}] counters first, then visit_* callbacks
def create_complex_visitor
  tallies = { texts: 0, links: 0, images: 0, headings: 0 }
  tallies.merge(
    visit_text: proc { |_ctx, _text| 'continue' },
    visit_heading: proc { |_ctx, _level, _text, _id| 'continue' },
    visit_paragraph: proc { |_ctx, _text| 'continue' },
    visit_link: proc { |_ctx, target, label, _title| ['custom', "[#{label}](#{target})"] },
    visit_image: proc { |_ctx, _src, _alt, _title| 'skip' },
    visit_strong: proc { |_ctx, _text| 'continue' },
    visit_em: proc { |_ctx, _text| 'continue' },
    visit_code: proc { |_ctx, _text| 'continue' },
    visit_br: proc { |_ctx| 'continue' }
  )
end
145
+
146
# Read the fixture as raw bytes, tag it as UTF-8 and freeze it so the same
# string object is reused for every iteration.
html = File.binread(fixture).force_encoding(Encoding::UTF_8).freeze
iterations = options[:iterations]

# hOCR fixtures are converted with spatial table reconstruction switched off.
conversion_options =
  if options[:format] == 'hocr'
    { hocr_spatial_tables: false }
  else
    {}
  end

# Only the "*-options" scenarios get an options handle; it stays nil otherwise.
handle_scenarios = %w[convert-options inline-images-options metadata-options]
options_handle = HtmlToMarkdown.options(conversion_options) if handle_scenarios.include?(options[:scenario])

# Build the requested visitor; nil when --visitor was not given.
visitor =
  case options[:visitor]
  when 'noop' then create_noop_visitor
  when 'simple' then create_simple_visitor
  when 'custom' then create_custom_visitor
  when 'complex' then create_complex_visitor
  end
167
+
168
# Returns +handle+, raising when the scenario was started without one.
require_handle = lambda do |handle|
  raise ArgumentError, 'options handle required' unless handle

  handle
end

# Maps every --scenario name to a lambda that performs one conversion.
# Each runner is called as runner.call(html, options, handle, visitor); the
# "*-options" runners insist on a pre-built options handle.
SCENARIO_RUNNERS = {
  'convert-default' => lambda do |html, _options, _handle, _visitor|
    HtmlToMarkdown.convert(html)
  end,
  'convert-options' => lambda do |html, _options, handle, _visitor|
    checked = require_handle.call(handle)
    HtmlToMarkdown.convert_with_options(html, checked)
  end,
  'inline-images-default' => lambda do |html, _options, _handle, _visitor|
    HtmlToMarkdown.convert_with_inline_images(html, nil, nil)
  end,
  'inline-images-options' => lambda do |html, _options, handle, _visitor|
    checked = require_handle.call(handle)
    HtmlToMarkdown.convert_with_inline_images_handle(html, checked, nil)
  end,
  'metadata-default' => lambda do |html, _options, _handle, _visitor|
    HtmlToMarkdown.convert_with_metadata(html, nil, nil)
  end,
  'metadata-options' => lambda do |html, _options, handle, _visitor|
    checked = require_handle.call(handle)
    HtmlToMarkdown.convert_with_metadata_handle(html, checked, nil)
  end
}.freeze
190
+
191
# Performs a single conversion for the benchmark.
#
# When a visitor is supplied the scenario name is not consulted: the HTML is
# converted through HtmlToMarkdown.convert_with_visitor with nil options.
# Otherwise the runner registered under +scenario+ in SCENARIO_RUNNERS is called.
#
# @param html [String] document to convert
# @param scenario [String] key into SCENARIO_RUNNERS
# @param options [Hash] conversion options forwarded to the runner
# @param handle [Object, nil] pre-built options handle forwarded to the runner
# @param visitor [Hash, nil] visitor callbacks, if any
# @return [Object] whatever the selected conversion returns
# @raise [ArgumentError] if no visitor is given and +scenario+ is unknown
def run_scenario(html, scenario, options, handle, visitor = nil)
  return HtmlToMarkdown.convert_with_visitor(html, nil, visitor) if visitor

  runner = SCENARIO_RUNNERS[scenario]
  raise ArgumentError, "Unsupported scenario: #{scenario}" if runner.nil?

  runner.call(html, options, handle, visitor)
end
199
+
200
# One untimed run before measuring (presumably a warm-up pass).
run_scenario(html, options[:scenario], conversion_options, options_handle, visitor)

# Profiling is opt-in through the environment and only used when the loaded
# HtmlToMarkdown module offers the profiling hooks.
profile_output = ENV['HTML_TO_MARKDOWN_PROFILE_OUTPUT']
if profile_output && HtmlToMarkdown.respond_to?(:start_profiling)
  freq = Integer(ENV.fetch('HTML_TO_MARKDOWN_PROFILE_FREQUENCY', '1000'), 10)
  HtmlToMarkdown.start_profiling(profile_output, freq)
end

# Timed section, measured on the monotonic clock.
start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
iterations.times do
  run_scenario(html, options[:scenario], conversion_options, options_handle, visitor)
end
elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start

HtmlToMarkdown.stop_profiling if profile_output && HtmlToMarkdown.respond_to?(:stop_profiling)

# Throughput figures derived from the fixture size and the elapsed time.
payload_size_bytes = html.bytesize
bytes_processed = payload_size_bytes * iterations
ops_per_sec = iterations / elapsed
mb_per_sec = (bytes_processed.to_f / (1024 * 1024)) / elapsed

# The result is printed as one JSON object, one member per line.
fields = [
  %("language":"ruby"),
  %("fixture":"#{json_escape(File.basename(fixture))}"),
  %("fixture_path":"#{json_escape(fixture)}"),
  %("scenario":"#{json_escape(options[:scenario])}"),
  %("iterations":#{iterations}),
  %("elapsed_seconds":#{format('%.8f', elapsed)}),
  %("ops_per_sec":#{format('%.4f', ops_per_sec)}),
  %("mb_per_sec":#{format('%.4f', mb_per_sec)}),
  %("bytes_processed":#{bytes_processed}),
  %("payload_size_bytes":#{payload_size_bytes})
]
payload = "{\n  #{fields.join(",\n  ")}\n}"

puts payload.strip
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require 'html_to_markdown/cli'
5
+
6
+ exit HtmlToMarkdown::CLI.run(ARGV)
@@ -0,0 +1,99 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'lib/html_to_markdown/version'
4
+
5
# Repository root (two directories above this gemspec) and the gem's location
# inside that repository.
repo_root = File.expand_path('../..', __dir__)
crate_prefix = 'packages/ruby/'

# Ask git for the tracked files of the gem. The command is passed as an argv
# array so the repository path is never interpreted by a shell, git's stderr
# is discarded, and a missing git executable simply yields an empty listing
# (the glob-based fallback below is used in that case).
tracked_listing =
  begin
    IO.popen(['git', '-C', repo_root, 'ls-files', '-z', crate_prefix], err: File::NULL, &:read)
  rescue SystemCallError
    ''
  end
ruby_files =
  tracked_listing.to_s.split("\x0")
                 .select { |path| path.start_with?(crate_prefix) }
                 .map { |path| path.delete_prefix(crate_prefix) }

# Files picked up from the filesystem when git reports nothing.
fallback_files = Dir.chdir(__dir__) do
  Dir.glob(
    %w[
      README.md
      ext/**/*
      exe/*
      lib/**/*.rb
      lib/bin/*
      src/**/*.rs
      spec/**/*.rb
      sig/**/*.rbs
    ]
  )
end

# Vendor files: include vendored crates and workspace Cargo.toml.
# Build output (target/) and editor/backup leftovers are skipped.
vendor_files = Dir.chdir(__dir__) do
  Dir.glob('vendor/html-to-markdown-rs/**/*', File::FNM_DOTMATCH)
     .select { |f| File.file?(f) }
     .grep_v(%r{/target/})
     .grep_v(/\.(swp|bak|tmp)$/)
end

# Include vendor/Cargo.toml (workspace definition) if it exists.
workspace_toml = if File.exist?(File.join(__dir__, 'vendor/Cargo.toml'))
                   ['vendor/Cargo.toml']
                 else
                   []
                 end

# When vendor exists, use ext/ files from filesystem (modified by vendor script)
# instead of git (which has the unmodified Cargo.toml with workspace paths).
ext_files_from_fs = Dir.chdir(__dir__) do
  Dir.glob('ext/**/*', File::FNM_DOTMATCH)
     .reject { |f| File.directory?(f) }
     .reject { |f| f.include?('/target/') }
end

# Include native artifacts (.so, .bundle, .dylib) if present (for platform gems).
native_files = Dir.chdir(__dir__) do
  Dir.glob('lib/**/*.{so,bundle,dylib}')
end

files = if vendor_files.any?
          # Vendor exists: use ext/ from filesystem (has modified Cargo.toml).
          non_ext_ruby_files = (ruby_files.empty? ? fallback_files : ruby_files)
                               .reject { |f| f.start_with?('ext/') }
          non_ext_ruby_files + ext_files_from_fs + vendor_files + workspace_toml + native_files
        else
          ruby_files.empty? ? fallback_files : ruby_files
        end

# The individual lists can overlap, so duplicates are dropped.
files = files.uniq
66
+
67
# Gem specification for html-to-markdown. The file list comes from the
# +files+ array computed earlier in this gemspec.
Gem::Specification.new do |spec|
  project_url = 'https://github.com/kreuzberg-dev/html-to-markdown'

  # Identity
  spec.name = 'html-to-markdown'
  spec.version = HtmlToMarkdown::VERSION
  spec.authors = ["Na'aman Hirschfeld"]
  spec.email = ['nhirschfeld@gmail.com']

  # Description shown by the registry
  spec.summary = 'Blazing-fast HTML to Markdown conversion for Ruby, powered by Rust.'
  spec.description = <<~DESC.strip
    html-to-markdown is a native Ruby extension built on the shared Rust engine that powers the html-to-markdown project.
    It delivers identical HTML-to-Markdown output across languages, exposes inline image extraction, and ships with a CLI for automation workflows.
  DESC
  spec.homepage = 'https://github.com/kreuzberg-dev/html-to-markdown'
  spec.license = 'MIT'

  spec.required_ruby_version = Gem::Requirement.new('>= 3.2')

  # Executable and load path
  spec.bindir = 'exe'
  spec.executables = ['html-to-markdown']
  spec.require_paths = ['lib']

  # Packaged files and the native extension entry point
  spec.files = files
  spec.extra_rdoc_files = ['README.md']
  spec.extensions = ['ext/html-to-markdown-rb/extconf.rb']

  spec.add_dependency 'rb_sys', '>= 0.9', '< 1.0'

  # Registry metadata
  {
    'rubygems_mfa_required' => 'true',
    'homepage_uri' => project_url,
    'source_code_uri' => project_url,
    'bug_tracker_uri' => "#{project_url}/issues",
    'changelog_uri' => "#{project_url}/releases",
    'documentation_uri' => "#{project_url}/blob/main/packages/ruby/README.md"
  }.each do |key, value|
    spec.metadata[key] = value
  end
end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'html_to_markdown/cli_proxy'
4
+
5
module HtmlToMarkdown
  # Command-line entry point that delegates the actual work to CLIProxy.
  module CLI
    module_function

    # Runs the CLI with +argv+ and relays its output.
    #
    # @param argv [Array<String>] command-line arguments handed to CLIProxy.call
    # @param stdout [#print] receives the CLI's standard output on success
    # @param stderr [#print, #puts] receives error output on failure
    # @return [Integer] 0 on success; the CLI's exit status (1 when it is
    #   unavailable) if the CLI failed; 1 for any other CLIProxy error
    def run(argv = ARGV, stdout: $stdout, stderr: $stderr)
      stdout.print(CLIProxy.call(argv))
      0
    rescue CLIProxy::Error => e
      case e
      when CLIProxy::CLIExecutionError
        # The CLI ran but failed: pass its stderr through verbatim.
        stderr.print(e.stderr)
        e.status || 1
      else
        # Missing binary or another proxy-level problem.
        stderr.puts(e.message)
        1
      end
    end
  end
end
@@ -0,0 +1,74 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'open3'
4
+ require 'pathname'
5
+
6
module HtmlToMarkdown
  # Finds the html-to-markdown command-line binary and runs it as a subprocess.
  module CLIProxy
    # Base class of every error raised by this module.
    class Error < StandardError
    end

    # Raised when none of the searched locations contains the CLI binary.
    class MissingBinaryError < Error
    end

    # Raised when the CLI process does not finish successfully.
    class CLIExecutionError < Error
      # @return [String] everything the CLI wrote to standard error
      # @return [Integer, nil] the CLI's exit status as reported by Process::Status#exitstatus
      attr_reader :stderr, :status

      # @param message [String] human-readable summary
      # @param stderr [String] captured standard error of the CLI
      # @param status [Integer, nil] exit status of the CLI
      def initialize(message, stderr:, status:)
        super(message)
        @stderr = stderr
        @status = status
      end
    end

    module_function

    # Runs the CLI with the given arguments and returns its standard output.
    #
    # NOTE(review): Open3.capture3 is called without stdin_data, so the child
    # appears to receive no input on standard input — confirm whether piped
    # input is expected to work through this proxy.
    #
    # @param argv [Array<#to_s>, #to_s, nil] arguments; each is converted with #to_s
    # @return [String] the CLI's standard output
    # @raise [MissingBinaryError] if the binary cannot be located
    # @raise [CLIExecutionError] if the CLI exits unsuccessfully
    def call(argv)
      executable = find_cli_binary.to_s
      args = Array(argv).map(&:to_s)
      # The [path, argv0] form makes Ruby execute the file directly. A bare
      # string would be split on whitespace or handed to a shell when argv is
      # empty, which breaks for install paths containing spaces.
      stdout, stderr, status = Open3.capture3([executable, executable], *args)
      return stdout if status.success?

      raise CLIExecutionError.new(
        "html-to-markdown CLI exited with status #{status.exitstatus}",
        stderr: stderr,
        status: status.exitstatus
      )
    end

    # Returns the first search path that is an existing regular file.
    #
    # @return [Pathname] location of the CLI binary
    # @raise [MissingBinaryError] if no candidate exists
    def find_cli_binary
      binary_name = Gem.win_platform? ? 'html-to-markdown.exe' : 'html-to-markdown'
      found = search_paths(binary_name).find(&:file?)
      return found if found

      raise MissingBinaryError, missing_binary_message
    end

    # @return [Pathname] directory two levels above this file (memoized)
    def root_path
      @root_path ||= Pathname(__dir__.to_s).join('../..').expand_path
    end

    # @return [Pathname] directory one level above this file (memoized)
    def lib_path
      @lib_path ||= Pathname(__dir__.to_s).join('..').expand_path
    end

    # Lists candidate locations of the binary, in lookup order.
    #
    # @param binary_name [String] file name of the executable
    # @return [Array<Pathname>] candidate paths (not checked for existence)
    def search_paths(binary_name)
      paths = [
        root_path.join('target', 'release', binary_name),
        lib_path.join('bin', binary_name),
        lib_path.join(binary_name)
      ]

      # Also look in target/release two directories above root_path.
      workspace_root = root_path.parent&.parent
      paths << workspace_root.join('target', 'release', binary_name) if workspace_root
      paths
    end

    # @return [String] message used when the binary cannot be found
    def missing_binary_message
      <<~MSG.strip
        html-to-markdown CLI binary not found. Build it with
        `cargo build --release --package html-to-markdown-cli`.
      MSG
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
# Namespace of the html-to-markdown gem.
module HtmlToMarkdown
  # Version of the gem; the gemspec reads this constant for spec.version.
  VERSION = '2.29.0'
end
@@ -0,0 +1,211 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'html_to_markdown/version'
4
+ require 'html_to_markdown_rb'
5
+
6
# Ruby-facing API of the gem. The module-level methods defined by the native
# extension are kept under native_* aliases and wrapped by the methods below,
# which coerce the HTML argument with #to_s before delegating.
module HtmlToMarkdown
  autoload :CLI, 'html_to_markdown/cli'
  autoload :CLIProxy, 'html_to_markdown/cli_proxy'

  class Options; end # rubocop:disable Lint/EmptyClass

  class << self
    # Preserve each native entry point as native_<name> before it is wrapped.
    %i[
      convert
      convert_with_inline_images
      convert_with_inline_images_handle
      options
      convert_with_options
      convert_with_metadata
      convert_with_metadata_handle
      convert_with_visitor
      convert_with_tables
    ].each do |entry_point|
      alias_method :"native_#{entry_point}", entry_point
    end
  end

  module_function

  # Converts HTML to Markdown.
  #
  # @param html [#to_s] HTML to convert
  # @param options [Object, nil] conversion options passed through to the native call
  # @param visitor [Object, nil] when given, the native visitor-based conversion is used
  # @return [Object] result of the native conversion
  def convert(html, options = nil, visitor = nil)
    source = html.to_s
    return native_convert_with_visitor(source, options, visitor) if visitor

    native_convert(source, options)
  end

  # Converts HTML using an options handle, e.g. one obtained from {options}.
  #
  # @param html [#to_s] HTML to convert
  # @param options_handle [Object] options handle
  # @return [Object] result of the native conversion
  def convert_with_options(html, options_handle)
    native_convert_with_options(html.to_s, options_handle)
  end

  # Converts HTML with inline image extraction.
  #
  # @param html [#to_s] HTML to convert
  # @param options [Object, nil] conversion options
  # @param image_config [Object, nil] inline image configuration
  # @param _visitor [Object, nil] accepted for API compatibility only and not
  #   used in this mode; the visitor pattern is only supported by {convert}
  # @return [Object] result of the native conversion
  def convert_with_inline_images(html, options = nil, image_config = nil, _visitor = nil)
    native_convert_with_inline_images(html.to_s, options, image_config)
  end

  # Same as {convert_with_inline_images}, but takes an options handle.
  #
  # @param html [#to_s] HTML to convert
  # @param options_handle [Object] options handle
  # @param image_config [Object, nil] inline image configuration
  # @return [Object] result of the native conversion
  def convert_with_inline_images_handle(html, options_handle, image_config = nil)
    native_convert_with_inline_images_handle(html.to_s, options_handle, image_config)
  end

  # Builds an options handle from a hash by delegating to the native extension.
  #
  # @param options_hash [Hash, nil] conversion options
  # @return [Object] options handle produced by the native extension
  def options(options_hash = nil)
    native_options(options_hash)
  end

  # Converts HTML to Markdown and extracts metadata in the same pass.
  #
  # Alongside the Markdown output this collects document metadata, headers,
  # links, images and structured data, which suits content analysis, SEO
  # workflows and document indexing.
  #
  # @param html [String] HTML string to convert. Line endings are normalized (CRLF -> LF).
  # @param options [ConversionOptions, Hash, nil] Optional conversion configuration.
  #   Hash keys should match ConversionOptions field names (symbols or strings).
  #   Common options:
  #   - :heading_style [String] "atx", "atx_closed", or "underlined" (default: "underlined")
  #   - :list_indent_type [String] "spaces" or "tabs" (default: "spaces")
  #   - :list_indent_width [Integer] Spaces per indent level (default: 4)
  #   - :wrap [true, false] Enable text wrapping (default: false)
  #   - :wrap_width [Integer] Wrap at this column width (default: 80)
  #   See the ConversionOptions documentation for the complete list.
  #
  # @param metadata_config [Hash, nil] Optional metadata extraction configuration.
  #   Keys may be symbols or strings. Supported keys:
  #   - :extract_headers [true, false] Extract h1-h6 heading elements (default: true)
  #   - :extract_links [true, false] Extract hyperlinks with type classification (default: true)
  #   - :extract_images [true, false] Extract image elements (default: true)
  #   - :extract_structured_data [true, false] Extract JSON-LD/Microdata/RDFa (default: true)
  #   - :max_structured_data_size [Integer] Size limit for structured data in bytes (default: 1_000_000)
  #
  # @param _visitor [Object, nil] accepted for API compatibility only and not
  #   used in metadata extraction mode; the visitor pattern is only supported
  #   by {convert}
  #
  # @return [Array<String, Hash>] Tuple of [markdown_string, metadata_hash]
  #   markdown_string: String - The converted Markdown output
  #
  #   metadata_hash: Hash with keys:
  #     - :document [Hash] Document-level metadata:
  #       - :title [String, nil] From <title> tag
  #       - :description [String, nil] From <meta name="description">
  #       - :keywords [Array<String>] From <meta name="keywords">
  #       - :author [String, nil] From <meta name="author">
  #       - :language [String, nil] From lang attribute (e.g., "en")
  #       - :text_direction [String, nil] "ltr", "rtl", or "auto"
  #       - :canonical_url [String, nil] From <link rel="canonical">
  #       - :base_href [String, nil] From <base href="">
  #       - :open_graph [Hash<String, String>] Open Graph properties (og:* meta tags)
  #       - :twitter_card [Hash<String, String>] Twitter Card properties (twitter:* meta tags)
  #       - :meta_tags [Hash<String, String>] Other meta tags
  #
  #     - :headers [Array<Hash>] Heading elements:
  #       - :level [Integer] 1-6
  #       - :text [String] Header text content
  #       - :id [String, nil] HTML id attribute
  #       - :depth [Integer] Tree nesting depth
  #       - :html_offset [Integer] Byte offset in original HTML
  #
  #     - :links [Array<Hash>] Hyperlinks:
  #       - :href [String] Link URL
  #       - :text [String] Link text content
  #       - :title [String, nil] Title attribute
  #       - :link_type [String] "anchor", "internal", "external", "email", "phone", or "other"
  #       - :rel [Array<String>] Rel attribute values
  #       - :attributes [Hash<String, String>] Additional HTML attributes
  #
  #     - :images [Array<Hash>] Image elements:
  #       - :src [String] Image source URL or data URI
  #       - :alt [String, nil] Alt text for accessibility
  #       - :title [String, nil] Title attribute
  #       - :dimensions [Array<Integer>, nil] [width, height] if available
  #       - :image_type [String] "data_uri", "external", "relative", or "inline_svg"
  #       - :attributes [Hash<String, String>] Additional HTML attributes
  #
  #     - :structured_data [Array<Hash>] Structured data blocks:
  #       - :data_type [String] "json_ld", "microdata", or "rdfa"
  #       - :raw_json [String] Raw JSON content
  #       - :schema_type [String, nil] Schema type (e.g., "Article", "Event")
  #
  # @raise [StandardError] If conversion fails or the configuration is invalid
  #
  # @example Basic usage
  #   html = <<~HTML
  #     <html lang="en">
  #       <head>
  #         <title>My Article</title>
  #         <meta name="description" content="A great read">
  #       </head>
  #       <body>
  #         <h1 id="intro">Introduction</h1>
  #         <p>Visit <a href="https://example.com">our site</a></p>
  #         <img src="photo.jpg" alt="Beautiful landscape">
  #       </body>
  #     </html>
  #   HTML
  #
  #   markdown, metadata = HtmlToMarkdown.convert_with_metadata(html)
  #
  #   puts metadata[:document][:title]       # => "My Article"
  #   puts metadata[:document][:language]    # => "en"
  #   puts metadata[:headers].length         # => 1
  #   puts metadata[:headers][0][:text]      # => "Introduction"
  #   puts metadata[:links].length           # => 1
  #   puts metadata[:images].length          # => 1
  #
  # @example With selective metadata extraction
  #   config = {
  #     extract_headers: true,
  #     extract_links: true,
  #     extract_images: false,           # Skip images
  #     extract_structured_data: false   # Skip structured data
  #   }
  #
  #   markdown, metadata = HtmlToMarkdown.convert_with_metadata(html, nil, config)
  #   puts metadata[:images].empty?  # => true (not extracted)
  #
  # @example With conversion options
  #   options = {
  #     heading_style: "atx",  # Use # H1, ## H2 style
  #     wrap: true,
  #     wrap_width: 80
  #   }
  #
  #   config = { extract_headers: true }
  #
  #   markdown, metadata = HtmlToMarkdown.convert_with_metadata(html, options, config)
  #   # Markdown uses ATX-style headings and wraps at 80 characters
  #
  # @see #convert Simple conversion without metadata
  # @see #convert_with_inline_images Extract inline images during conversion
  # @see ConversionOptions Detailed conversion configuration
  def convert_with_metadata(html, options = nil, metadata_config = nil, _visitor = nil)
    native_convert_with_metadata(html.to_s, options, metadata_config)
  end

  # Same as {convert_with_metadata}, but takes an options handle.
  #
  # @param html [#to_s] HTML to convert
  # @param options_handle [Object] options handle
  # @param metadata_config [Hash, nil] metadata extraction configuration
  # @return [Object] result of the native conversion
  def convert_with_metadata_handle(html, options_handle, metadata_config = nil)
    native_convert_with_metadata_handle(html.to_s, options_handle, metadata_config)
  end

  # Converts HTML to Markdown and extracts table data in the same pass.
  #
  # Structured table data (cells, Markdown rendering, header-row flags) is
  # collected while the document is converted.
  #
  # @param html [String] HTML string to convert.
  # @param options [Hash, nil] Optional conversion configuration.
  # @param metadata_config [Hash, nil] Optional metadata extraction configuration.
  #
  # @return [Hash] A hash with keys:
  #   - :content [String] The converted Markdown output
  #   - :metadata [Hash, nil] Extended metadata (if metadata extraction was configured)
  #   - :tables [Array<Hash>] Extracted tables, each with:
  #     - :cells [Array<Array<String>>] Table cells organized as rows x columns
  #     - :markdown [String] Complete rendered table in Markdown format
  #     - :is_header_row [Array<Boolean>] Per-row flag indicating header rows
  #
  # @raise [StandardError] If conversion fails or the configuration is invalid
  #
  # @example Basic usage
  #   html = '<table><thead><tr><th>Name</th></tr></thead><tbody><tr><td>Alice</td></tr></tbody></table>'
  #   result = HtmlToMarkdown.convert_with_tables(html)
  #   puts result[:tables].length        # => 1
  #   puts result[:tables][0][:cells]    # => [["Name"], ["Alice"]]
  def convert_with_tables(html, options = nil, metadata_config = nil)
    native_convert_with_tables(html.to_s, options, metadata_config)
  end
end
Binary file