html-to-markdown 2.30.0-x86_64-linux → 3.0.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ee8dd2c455a6a86421af27a2066b1e93f6c460073643222d4ac762eb1f7d0189
4
- data.tar.gz: 2d15f7312c9127aef99b419fb60fc4be7d63687dfed2dcccf00be9cb098ee3cb
3
+ metadata.gz: 07d676f000540af84276c48d2b0e84768e9f4708098cdda3de3d999520e7e716
4
+ data.tar.gz: 3b7fbe10fc72c7af0965ded5f770e0cf6fec353e39497d029f80d7c77f6c7f24
5
5
  SHA512:
6
- metadata.gz: 0cb644c37d6bac86198cfc45150b59a7dbda7ffdce7116dd153d045bfd3d182703d9d4f47edf67825d0cd67807c1b09a4d9b527dc3c453291a5bf4c93957ce5b
7
- data.tar.gz: ec204e318d2d9f01f34e5801958354d7a1266e750bb12abd72b1c07baf9ea904a49d353bfaf65f6b4a2b30d41a94da835e3755f0b5a426a9677cf88e985ffdbb
6
+ metadata.gz: 207023e2ce048eb36df739aa1166af50b2086ea8c388016250d838230b8c7dba1b5be208540c9afb93277538a9551183a83370d46aa6f71b750073bb71a8cb91
7
+ data.tar.gz: 8801822711dc240f82151044a45248bcc3656080ddf3d922ec6391c4371d49ce801227b0c85cfb5a4488827af1474a8e84c650b576272d1ae6f6c97e0d9cc1ad
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- html-to-markdown (2.30.0)
4
+ html-to-markdown (3.0.0)
5
5
  rb_sys (>= 0.9, < 1.0)
6
6
 
7
7
  GEM
@@ -22,16 +22,13 @@ GEM
22
22
  uri (>= 0.13.1)
23
23
  ast (2.4.3)
24
24
  base64 (0.3.0)
25
- bigdecimal (4.0.1)
25
+ bigdecimal (4.1.0)
26
26
  concurrent-ruby (1.3.6)
27
27
  connection_pool (3.0.2)
28
28
  csv (3.3.5)
29
29
  diff-lcs (1.6.2)
30
30
  drb (2.2.3)
31
- ffi (1.17.4-aarch64-linux-gnu)
32
31
  ffi (1.17.4-arm64-darwin)
33
- ffi (1.17.4-x64-mingw-ucrt)
34
- ffi (1.17.4-x86_64-darwin)
35
32
  ffi (1.17.4-x86_64-linux-gnu)
36
33
  fileutils (1.8.0)
37
34
  i18n (1.14.8)
@@ -129,12 +126,8 @@ GEM
129
126
  uri (1.1.1)
130
127
 
131
128
  PLATFORMS
132
- aarch64-linux
133
129
  arm64-darwin
134
- x64-mingw-ucrt
135
- x86_64-darwin
136
130
  x86_64-linux
137
- x86_64-linux-gnu
138
131
 
139
132
  DEPENDENCIES
140
133
  html-to-markdown!
@@ -150,19 +143,16 @@ CHECKSUMS
150
143
  activesupport (8.1.3) sha256=21a5e0dfbd4c3ddd9e1317ec6a4d782fa226e7867dc70b0743acda81a1dca20e
151
144
  ast (2.4.3) sha256=954615157c1d6a382bc27d690d973195e79db7f55e9765ac7c481c60bdb4d383
152
145
  base64 (0.3.0) sha256=27337aeabad6ffae05c265c450490628ef3ebd4b67be58257393227588f5a97b
153
- bigdecimal (4.0.1) sha256=8b07d3d065a9f921c80ceaea7c9d4ae596697295b584c296fe599dd0ad01c4a7
146
+ bigdecimal (4.1.0) sha256=6dc07767aa3dc456ccd48e7ae70a07b474e9afd7c5bc576f80bd6da5c8dd6cae
154
147
  concurrent-ruby (1.3.6) sha256=6b56837e1e7e5292f9864f34b69c5a2cbc75c0cf5338f1ce9903d10fa762d5ab
155
148
  connection_pool (3.0.2) sha256=33fff5ba71a12d2aa26cb72b1db8bba2a1a01823559fb01d29eb74c286e62e0a
156
149
  csv (3.3.5) sha256=6e5134ac3383ef728b7f02725d9872934f523cb40b961479f69cf3afa6c8e73f
157
150
  diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
158
151
  drb (2.2.3) sha256=0b00d6fdb50995fe4a45dea13663493c841112e4068656854646f418fda13373
159
- ffi (1.17.4-aarch64-linux-gnu) sha256=b208f06f91ffd8f5e1193da3cae3d2ccfc27fc36fba577baf698d26d91c080df
160
152
  ffi (1.17.4-arm64-darwin) sha256=19071aaf1419251b0a46852abf960e77330a3b334d13a4ab51d58b31a937001b
161
- ffi (1.17.4-x64-mingw-ucrt) sha256=f6ff9618cfccc494138bddade27aa06c74c6c7bc367a1ea1103d80c2fcb9ed35
162
- ffi (1.17.4-x86_64-darwin) sha256=aa70390523cf3235096cf64962b709b4cfbd5c082a2cb2ae714eb0fe2ccda496
163
153
  ffi (1.17.4-x86_64-linux-gnu) sha256=9d3db14c2eae074b382fa9c083fe95aec6e0a1451da249eab096c34002bc752d
164
154
  fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
165
- html-to-markdown (2.30.0)
155
+ html-to-markdown (3.0.0)
166
156
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
167
157
  json (2.19.3) sha256=289b0bb53052a1fa8c34ab33cc750b659ba14a5c45f3fcf4b18762dc67c78646
168
158
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
data/README.md CHANGED
@@ -17,8 +17,8 @@
17
17
  <a href="https://central.sonatype.com/artifact/dev.kreuzberg/html-to-markdown">
18
18
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/html-to-markdown?label=Java&color=007ec6" alt="Java">
19
19
  </a>
20
- <a href="https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v2/htmltomarkdown">
21
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/html-to-markdown?label=Go&color=007ec6&filter=v2.29.0" alt="Go">
20
+ <a href="https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v3/htmltomarkdown">
21
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/html-to-markdown?label=Go&color=007ec6&filter=v3.0.0" alt="Go">
22
22
  </a>
23
23
  <a href="https://www.nuget.org/packages/KreuzbergDev.HtmlToMarkdown/">
24
24
  <img src="https://img.shields.io/nuget/v/KreuzbergDev.HtmlToMarkdown?label=C%23&color=007ec6" alt="C#">
@@ -87,7 +87,6 @@ Apple M4 • Real Wikipedia documents • `convert()` (Ruby)
87
87
  | Mixed (Python wiki) | 656KB | 4.89ms | 134 MB/s |
88
88
 
89
89
 
90
- See [Performance Guide](../../examples/performance/) for detailed benchmarks.
91
90
 
92
91
 
93
92
  ## Quick Start
@@ -98,7 +97,8 @@ Basic conversion:
98
97
  require 'html_to_markdown'
99
98
 
100
99
  html = "<h1>Hello</h1><p>This is <strong>fast</strong>!</p>"
101
- markdown = HtmlToMarkdown.convert(html)
100
+ result = HtmlToMarkdown.convert(html)
101
+ markdown = result[:content]
102
102
  ```
103
103
 
104
104
 
@@ -109,60 +109,50 @@ With conversion options:
109
109
  require 'html_to_markdown'
110
110
 
111
111
  html = "<h1>Hello</h1><p>This is <strong>fast</strong>!</p>"
112
- markdown = HtmlToMarkdown.convert(html, heading_style: :atx, code_block_style: :fenced)
112
+ result = HtmlToMarkdown.convert(html, heading_style: :atx, code_block_style: :fenced)
113
+ markdown = result[:content]
113
114
  ```
114
115
 
115
116
 
116
117
 
117
118
 
118
-
119
-
120
119
  ## API Reference
121
120
 
122
- ### Core Functions
123
-
124
-
125
- **`convert(html, options: nil) -> String`**
121
+ ### Core Function
126
122
 
127
- Basic HTML-to-Markdown conversion. Fast and simple.
128
123
 
129
- **`convert_with_metadata(html, options: nil, config: nil) -> [String, Hash]`**
124
+ **`convert(html, options: nil, visitor: nil) -> ConversionResult`**
130
125
 
131
- Extract Markdown plus metadata (headers, links, images, structured data) in a single pass. See [Metadata Extraction Guide](../../examples/metadata-extraction/).
126
+ Converts HTML to Markdown. Returns a `ConversionResult` hash with all results in a single call.
132
127
 
133
- **`convert_with_visitor(html, visitor:, options: nil) -> String`**
134
-
135
- Customize conversion with visitor callbacks for element interception. See [Visitor Pattern Guide](../../examples/visitor-pattern/).
136
-
137
- **`convert_with_inline_images(html, config: nil) -> [String, Array, Array]`**
138
-
139
- Extract base64-encoded inline images with metadata.
140
-
141
- **`convert_with_tables(html, options: nil, config: nil) -> ConversionWithTables`**
128
+ ```ruby
129
+ require 'html_to_markdown'
142
130
 
143
- Extract structured table data (cells, headers, rendered markdown) alongside conversion.
131
+ result = HtmlToMarkdown.convert(html)
132
+ markdown = result[:content] # Converted Markdown string
133
+ metadata = result[:metadata] # Metadata (when extract_metadata: true)
134
+ tables = result[:tables] # Structured table data (when extract_tables: true)
135
+ document = result[:document] # Document structure (currently nil; not yet exposed)
136
+ images = result[:images] # Extracted images
137
+ warnings = result[:warnings] # Any conversion warnings
138
+ ```
144
139
 
145
140
 
146
141
 
147
142
  ### Options
148
143
 
149
144
  **`ConversionOptions`** – Key configuration fields:
145
+
150
146
  - `heading_style`: Heading format (`"underlined"` | `"atx"` | `"atx_closed"`) — default: `"underlined"`
151
147
  - `list_indent_width`: Spaces per indent level — default: `2`
152
148
  - `bullets`: Bullet characters cycle — default: `"*+-"`
153
149
  - `wrap`: Enable text wrapping — default: `false`
154
150
  - `wrap_width`: Wrap at column — default: `80`
155
151
  - `code_language`: Default fenced code block language — default: none
156
- - `extract_metadata`: Embed metadata as YAML frontmatter — default: `false`
152
+ - `extract_metadata`: Enable metadata extraction into `result[:metadata]` — default: `false`
153
+ - `extract_tables`: Enable structured table extraction into `result[:tables]` — default: `false`
157
154
  - `output_format`: Output markup format (`"markdown"` | `"djot"` | `"plain"`) — default: `"markdown"`
158
155
 
159
- **`MetadataConfig`** – Selective metadata extraction:
160
- - `extract_headers`: h1-h6 elements — default: `true`
161
- - `extract_links`: Hyperlinks — default: `true`
162
- - `extract_images`: Image elements — default: `true`
163
- - `extract_structured_data`: JSON-LD, Microdata, RDFa — default: `true`
164
- - `max_structured_data_size`: Size limit in bytes — default: `100KB`
165
-
166
156
 
167
157
  ## Djot Output Format
168
158
 
@@ -222,16 +212,17 @@ Plain text mode is useful for search indexing, text extraction, and feeding cont
222
212
 
223
213
  ## Metadata Extraction
224
214
 
225
- The metadata extraction feature enables comprehensive document analysis during conversion. Extract document properties, headers, links, images, and structured data in a single pass.
215
+ The metadata extraction feature enables comprehensive document analysis during conversion. Extract document properties, headers, links, images, and structured data in a single pass — all via the standard `convert()` function.
226
216
 
227
217
  **Use Cases:**
218
+
228
219
  - **SEO analysis** – Extract title, description, Open Graph tags, Twitter cards
229
220
  - **Table of contents generation** – Build structured outlines from heading hierarchy
230
221
  - **Content migration** – Document all external links and resources
231
222
  - **Accessibility audits** – Check for images without alt text, empty links, invalid heading hierarchy
232
223
  - **Link validation** – Classify and validate anchor, internal, external, email, and phone links
233
224
 
234
- **Zero Overhead When Disabled:** Metadata extraction adds negligible overhead and happens during the HTML parsing pass. Disable unused metadata types in `MetadataConfig` to optimize further.
225
+ **Zero Overhead When Disabled:** Metadata extraction adds negligible overhead and happens during the HTML parsing pass. Pass `extract_metadata: true` in `ConversionOptions` to enable it; the result is available at `result[:metadata]`.
235
226
 
236
227
  ### Example: Quick Start
237
228
 
@@ -240,27 +231,27 @@ The metadata extraction feature enables comprehensive document analysis during c
240
231
  require 'html_to_markdown'
241
232
 
242
233
  html = '<h1>Article</h1><img src="test.jpg" alt="test">'
243
- markdown, metadata = HtmlToMarkdown.convert_with_metadata(html)
244
-
245
- puts metadata[:document][:title] # Document title
246
- puts metadata[:headers] # All h1-h6 elements
247
- puts metadata[:links] # All hyperlinks
248
- puts metadata[:images] # All images with alt text
249
- puts metadata[:structured_data] # JSON-LD, Microdata, RDFa
234
+ result = HtmlToMarkdown.convert(html, extract_metadata: true)
235
+
236
+ puts result[:content] # Converted Markdown
237
+ puts result[:metadata][:document][:title] # Document title
238
+ puts result[:metadata][:headers] # All h1-h6 elements
239
+ puts result[:metadata][:links] # All hyperlinks
240
+ puts result[:metadata][:images] # All images with alt text
241
+ puts result[:metadata][:structured_data] # JSON-LD, Microdata, RDFa
250
242
  ```
251
243
 
252
244
 
253
245
 
254
- For detailed examples including SEO extraction, table-of-contents generation, link validation, and accessibility audits, see the [Metadata Extraction Guide](../../examples/metadata-extraction/).
255
-
256
246
 
257
247
 
258
248
 
259
249
  ## Visitor Pattern
260
250
 
261
- The visitor pattern enables custom HTML→Markdown conversion logic by providing callbacks for specific HTML elements during traversal. Use visitors to transform content, filter elements, validate structure, or collect analytics.
251
+ The visitor pattern enables custom HTML→Markdown conversion logic by providing callbacks for specific HTML elements during traversal. Pass a visitor to `convert()` using the `visitor:` keyword, as shown in the example below.
262
252
 
263
253
  **Use Cases:**
254
+
264
255
  - **Custom Markdown dialects** – Convert to Obsidian, Notion, or other flavors
265
256
  - **Content filtering** – Remove tracking pixels, ads, or unwanted elements
266
257
  - **URL rewriting** – Rewrite CDN URLs, add query parameters, validate links
@@ -291,20 +282,16 @@ class MyVisitor
291
282
  end
292
283
 
293
284
  html = '<a href="https://old-cdn.com/file.pdf">Download</a>'
294
- markdown = HtmlToMarkdown.convert_with_visitor(html, visitor: MyVisitor.new)
285
+ result = HtmlToMarkdown.convert(html, visitor: MyVisitor.new)
286
+ markdown = result[:content]
295
287
  ```
296
288
 
297
289
 
298
290
 
299
- For comprehensive examples including content filtering, link footnotes, accessibility validation, and asynchronous URL validation, see the [Visitor Pattern Guide](../../examples/visitor-pattern/).
300
-
301
291
 
302
292
 
303
293
  ## Examples
304
294
 
305
- - [Visitor Pattern Guide](../../examples/visitor-pattern/)
306
- - [Metadata Extraction Guide](../../examples/metadata-extraction/)
307
- - [Performance Guide](../../examples/performance/)
308
295
 
309
296
  ## Links
310
297
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HtmlToMarkdown
4
- VERSION = '2.30.0'
4
+ VERSION = '3.0.0'
5
5
  end
@@ -7,205 +7,24 @@ module HtmlToMarkdown
7
7
  autoload :CLI, 'html_to_markdown/cli'
8
8
  autoload :CLIProxy, 'html_to_markdown/cli_proxy'
9
9
 
10
- class Options; end # rubocop:disable Lint/EmptyClass
11
-
12
10
  class << self
13
11
  alias native_convert convert
14
- alias native_convert_with_inline_images convert_with_inline_images
15
- alias native_convert_with_inline_images_handle convert_with_inline_images_handle
16
- alias native_options options
17
- alias native_convert_with_options convert_with_options
18
- alias native_convert_with_metadata convert_with_metadata
19
- alias native_convert_with_metadata_handle convert_with_metadata_handle
20
- alias native_convert_with_visitor convert_with_visitor
21
- alias native_convert_with_tables convert_with_tables
22
12
  end
23
13
 
24
14
  module_function
25
15
 
26
- def convert(html, options = nil, visitor = nil)
27
- if visitor
28
- native_convert_with_visitor(html.to_s, options, visitor)
29
- else
30
- native_convert(html.to_s, options)
31
- end
32
- end
33
-
34
- def convert_with_options(html, options_handle)
35
- native_convert_with_options(html.to_s, options_handle)
36
- end
37
-
38
- def convert_with_inline_images(html, options = nil, image_config = nil, _visitor = nil)
39
- # NOTE: visitor parameter is accepted for API compatibility but not used in inline images mode
40
- # The visitor pattern is only supported in the standard convert() method
41
- native_convert_with_inline_images(html.to_s, options, image_config)
42
- end
43
-
44
- def convert_with_inline_images_handle(html, options_handle, image_config = nil)
45
- native_convert_with_inline_images_handle(html.to_s, options_handle, image_config)
46
- end
47
-
48
- def options(options_hash = nil)
49
- native_options(options_hash)
50
- end
51
-
52
- # Convert HTML to Markdown with comprehensive metadata extraction.
53
- #
54
- # Performs HTML-to-Markdown conversion while extracting document metadata, headers,
55
- # links, images, and structured data in a single pass. Ideal for content analysis,
56
- # SEO workflows, and document indexing.
57
- #
58
- # @param html [String] HTML string to convert. Line endings are normalized (CRLF -> LF).
59
- # @param options [ConversionOptions, Hash, nil] Optional conversion configuration.
60
- # When a Hash, keys should match ConversionOptions field names (as symbols or strings).
61
- # Common options:
62
- # - :heading_style [String] "atx", "atx_closed", or "underlined" (default: "underlined")
63
- # - :list_indent_type [String] "spaces" or "tabs" (default: "spaces")
64
- # - :list_indent_width [Integer] Spaces per indent level (default: 4)
65
- # - :wrap [true, false] Enable text wrapping (default: false)
66
- # - :wrap_width [Integer] Wrap at this column width (default: 80)
67
- # See ConversionOptions documentation for complete list.
68
- #
69
- # @param metadata_config [Hash, nil] Optional metadata extraction configuration.
70
- # Keys should be symbols or strings. Supported keys:
71
- # - :extract_headers [true, false] Extract h1-h6 heading elements (default: true)
72
- # - :extract_links [true, false] Extract hyperlinks with type classification (default: true)
73
- # - :extract_images [true, false] Extract image elements (default: true)
74
- # - :extract_structured_data [true, false] Extract JSON-LD/Microdata/RDFa (default: true)
75
- # - :max_structured_data_size [Integer] Size limit for structured data in bytes (default: 1_000_000)
76
- #
77
- # @return [Array<String, Hash>] Tuple of [markdown_string, metadata_hash]
78
- # markdown_string: String - The converted Markdown output
79
- #
80
- # metadata_hash: Hash with keys:
81
- # - :document [Hash] Document-level metadata:
82
- # - :title [String, nil] From <title> tag
83
- # - :description [String, nil] From <meta name="description">
84
- # - :keywords [Array<String>] From <meta name="keywords">
85
- # - :author [String, nil] From <meta name="author">
86
- # - :language [String, nil] From lang attribute (e.g., "en")
87
- # - :text_direction [String, nil] "ltr", "rtl", or "auto"
88
- # - :canonical_url [String, nil] From <link rel="canonical">
89
- # - :base_href [String, nil] From <base href="">
90
- # - :open_graph [Hash<String, String>] Open Graph properties (og:* meta tags)
91
- # - :twitter_card [Hash<String, String>] Twitter Card properties (twitter:* meta tags)
92
- # - :meta_tags [Hash<String, String>] Other meta tags
93
- #
94
- # - :headers [Array<Hash>] Heading elements:
95
- # - :level [Integer] 1-6
96
- # - :text [String] Header text content
97
- # - :id [String, nil] HTML id attribute
98
- # - :depth [Integer] Tree nesting depth
99
- # - :html_offset [Integer] Byte offset in original HTML
100
- #
101
- # - :links [Array<Hash>] Hyperlinks:
102
- # - :href [String] Link URL
103
- # - :text [String] Link text content
104
- # - :title [String, nil] Title attribute
105
- # - :link_type [String] "anchor", "internal", "external", "email", "phone", or "other"
106
- # - :rel [Array<String>] Rel attribute values
107
- # - :attributes [Hash<String, String>] Additional HTML attributes
108
- #
109
- # - :images [Array<Hash>] Image elements:
110
- # - :src [String] Image source URL or data URI
111
- # - :alt [String, nil] Alt text for accessibility
112
- # - :title [String, nil] Title attribute
113
- # - :dimensions [Array<Integer>, nil] [width, height] if available
114
- # - :image_type [String] "data_uri", "external", "relative", or "inline_svg"
115
- # - :attributes [Hash<String, String>] Additional HTML attributes
116
- #
117
- # - :structured_data [Array<Hash>] Structured data blocks:
118
- # - :data_type [String] "json_ld", "microdata", or "rdfa"
119
- # - :raw_json [String] Raw JSON content
120
- # - :schema_type [String, nil] Schema type (e.g., "Article", "Event")
121
- #
122
- # @raise [StandardError] If conversion fails or invalid configuration
123
- #
124
- # @example Basic usage
125
- # html = <<~HTML
126
- # <html lang="en">
127
- # <head>
128
- # <title>My Article</title>
129
- # <meta name="description" content="A great read">
130
- # </head>
131
- # <body>
132
- # <h1 id="intro">Introduction</h1>
133
- # <p>Visit <a href="https://example.com">our site</a></p>
134
- # <img src="photo.jpg" alt="Beautiful landscape">
135
- # </body>
136
- # </html>
137
- # HTML
138
- #
139
- # markdown, metadata = HtmlToMarkdown.convert_with_metadata(html)
140
- #
141
- # puts metadata[:document][:title] # => "My Article"
142
- # puts metadata[:document][:language] # => "en"
143
- # puts metadata[:headers].length # => 1
144
- # puts metadata[:headers][0][:text] # => "Introduction"
145
- # puts metadata[:links].length # => 1
146
- # puts metadata[:images].length # => 1
147
- #
148
- # @example With selective metadata extraction
149
- # config = {
150
- # extract_headers: true,
151
- # extract_links: true,
152
- # extract_images: false, # Skip images
153
- # extract_structured_data: false # Skip structured data
154
- # }
155
- #
156
- # markdown, metadata = HtmlToMarkdown.convert_with_metadata(html, nil, config)
157
- # puts metadata[:images].empty? # => true (not extracted)
158
- #
159
- # @example With conversion options
160
- # options = {
161
- # heading_style: "atx", # Use # H1, ## H2 style
162
- # wrap: true,
163
- # wrap_width: 80
164
- # }
165
- #
166
- # config = { extract_headers: true }
167
- #
168
- # markdown, metadata = HtmlToMarkdown.convert_with_metadata(html, options, config)
169
- # # Markdown uses ATX-style headings and wraps at 80 characters
170
- #
171
- # @see #convert Simple conversion without metadata
172
- # @see #convert_with_inline_images Extract inline images during conversion
173
- # @see ConversionOptions Detailed conversion configuration
174
- def convert_with_metadata(html, options = nil, metadata_config = nil, _visitor = nil)
175
- # NOTE: visitor parameter is accepted for API compatibility but not used in metadata extraction mode
176
- # The visitor pattern is only supported in the standard convert() method
177
- native_convert_with_metadata(html.to_s, options, metadata_config)
178
- end
179
-
180
- def convert_with_metadata_handle(html, options_handle, metadata_config = nil)
181
- native_convert_with_metadata_handle(html.to_s, options_handle, metadata_config)
182
- end
183
-
184
- # Convert HTML to Markdown with table extraction.
185
- #
186
- # Performs HTML-to-Markdown conversion while extracting structured table data
187
- # (cells, markdown representation, header row flags) in a single pass.
188
- #
189
- # @param html [String] HTML string to convert.
190
- # @param options [Hash, nil] Optional conversion configuration.
191
- # @param metadata_config [Hash, nil] Optional metadata extraction configuration.
192
- #
193
- # @return [Hash] A hash with keys:
194
- # - :content [String] The converted Markdown output
195
- # - :metadata [Hash, nil] Extended metadata (if metadata extraction was configured)
196
- # - :tables [Array<Hash>] Extracted tables, each with:
197
- # - :cells [Array<Array<String>>] Table cells organized as rows x columns
198
- # - :markdown [String] Complete rendered table in Markdown format
199
- # - :is_header_row [Array<Boolean>] Per-row flag indicating header rows
200
- #
201
- # @raise [StandardError] If conversion fails or invalid configuration
202
- #
203
- # @example Basic usage
204
- # html = '<table><thead><tr><th>Name</th></tr></thead><tbody><tr><td>Alice</td></tr></tbody></table>'
205
- # result = HtmlToMarkdown.convert_with_tables(html)
206
- # puts result[:tables].length # => 1
207
- # puts result[:tables][0][:cells] # => [["Name"], ["Alice"]]
208
- def convert_with_tables(html, options = nil, metadata_config = nil)
209
- native_convert_with_tables(html.to_s, options, metadata_config)
16
+ # Convert HTML to Markdown, returning a Hash with:
17
+ # - :content [String, nil] the converted Markdown output
18
+ # - :document [nil] document structure (not yet exposed)
19
+ # - :metadata [Hash, nil] extracted HTML metadata
20
+ # - :tables [Array<Hash>] extracted tables with :grid and :markdown
21
+ # - :images [Array<Hash>] extracted inline images
22
+ # - :warnings [Array<Hash>] processing warnings
23
+ #
24
+ # @param html [String] HTML string to convert
25
+ # @param options [Hash, nil] optional conversion options
26
+ # @return [Hash] conversion result
27
+ def convert(html, options = nil)
28
+ native_convert(html.to_s, options)
210
29
  end
211
30
  end
Binary file