html-to-markdown 2.30.0-x86_64-linux → 3.0.0-x86_64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -14
- data/README.md +37 -50
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +13 -194
- data/lib/html_to_markdown_rb.so +0 -0
- data/sig/html_to_markdown.rbs +12 -373
- metadata +2 -8
- data/bin/benchmark.rb +0 -232
- data/spec/convert_spec.rb +0 -77
- data/spec/convert_with_tables_spec.rb +0 -194
- data/spec/metadata_extraction_spec.rb +0 -437
- data/spec/visitor_issue_187_spec.rb +0 -605
- data/spec/visitor_spec.rb +0 -1149
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 07d676f000540af84276c48d2b0e84768e9f4708098cdda3de3d999520e7e716
|
|
4
|
+
data.tar.gz: 3b7fbe10fc72c7af0965ded5f770e0cf6fec353e39497d029f80d7c77f6c7f24
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 207023e2ce048eb36df739aa1166af50b2086ea8c388016250d838230b8c7dba1b5be208540c9afb93277538a9551183a83370d46aa6f71b750073bb71a8cb91
|
|
7
|
+
data.tar.gz: 8801822711dc240f82151044a45248bcc3656080ddf3d922ec6391c4371d49ce801227b0c85cfb5a4488827af1474a8e84c650b576272d1ae6f6c97e0d9cc1ad
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
html-to-markdown (
|
|
4
|
+
html-to-markdown (3.0.0)
|
|
5
5
|
rb_sys (>= 0.9, < 1.0)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
@@ -22,16 +22,13 @@ GEM
|
|
|
22
22
|
uri (>= 0.13.1)
|
|
23
23
|
ast (2.4.3)
|
|
24
24
|
base64 (0.3.0)
|
|
25
|
-
bigdecimal (4.0
|
|
25
|
+
bigdecimal (4.1.0)
|
|
26
26
|
concurrent-ruby (1.3.6)
|
|
27
27
|
connection_pool (3.0.2)
|
|
28
28
|
csv (3.3.5)
|
|
29
29
|
diff-lcs (1.6.2)
|
|
30
30
|
drb (2.2.3)
|
|
31
|
-
ffi (1.17.4-aarch64-linux-gnu)
|
|
32
31
|
ffi (1.17.4-arm64-darwin)
|
|
33
|
-
ffi (1.17.4-x64-mingw-ucrt)
|
|
34
|
-
ffi (1.17.4-x86_64-darwin)
|
|
35
32
|
ffi (1.17.4-x86_64-linux-gnu)
|
|
36
33
|
fileutils (1.8.0)
|
|
37
34
|
i18n (1.14.8)
|
|
@@ -129,12 +126,8 @@ GEM
|
|
|
129
126
|
uri (1.1.1)
|
|
130
127
|
|
|
131
128
|
PLATFORMS
|
|
132
|
-
aarch64-linux
|
|
133
129
|
arm64-darwin
|
|
134
|
-
x64-mingw-ucrt
|
|
135
|
-
x86_64-darwin
|
|
136
130
|
x86_64-linux
|
|
137
|
-
x86_64-linux-gnu
|
|
138
131
|
|
|
139
132
|
DEPENDENCIES
|
|
140
133
|
html-to-markdown!
|
|
@@ -150,19 +143,16 @@ CHECKSUMS
|
|
|
150
143
|
activesupport (8.1.3) sha256=21a5e0dfbd4c3ddd9e1317ec6a4d782fa226e7867dc70b0743acda81a1dca20e
|
|
151
144
|
ast (2.4.3) sha256=954615157c1d6a382bc27d690d973195e79db7f55e9765ac7c481c60bdb4d383
|
|
152
145
|
base64 (0.3.0) sha256=27337aeabad6ffae05c265c450490628ef3ebd4b67be58257393227588f5a97b
|
|
153
|
-
bigdecimal (4.0
|
|
146
|
+
bigdecimal (4.1.0) sha256=6dc07767aa3dc456ccd48e7ae70a07b474e9afd7c5bc576f80bd6da5c8dd6cae
|
|
154
147
|
concurrent-ruby (1.3.6) sha256=6b56837e1e7e5292f9864f34b69c5a2cbc75c0cf5338f1ce9903d10fa762d5ab
|
|
155
148
|
connection_pool (3.0.2) sha256=33fff5ba71a12d2aa26cb72b1db8bba2a1a01823559fb01d29eb74c286e62e0a
|
|
156
149
|
csv (3.3.5) sha256=6e5134ac3383ef728b7f02725d9872934f523cb40b961479f69cf3afa6c8e73f
|
|
157
150
|
diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
|
|
158
151
|
drb (2.2.3) sha256=0b00d6fdb50995fe4a45dea13663493c841112e4068656854646f418fda13373
|
|
159
|
-
ffi (1.17.4-aarch64-linux-gnu) sha256=b208f06f91ffd8f5e1193da3cae3d2ccfc27fc36fba577baf698d26d91c080df
|
|
160
152
|
ffi (1.17.4-arm64-darwin) sha256=19071aaf1419251b0a46852abf960e77330a3b334d13a4ab51d58b31a937001b
|
|
161
|
-
ffi (1.17.4-x64-mingw-ucrt) sha256=f6ff9618cfccc494138bddade27aa06c74c6c7bc367a1ea1103d80c2fcb9ed35
|
|
162
|
-
ffi (1.17.4-x86_64-darwin) sha256=aa70390523cf3235096cf64962b709b4cfbd5c082a2cb2ae714eb0fe2ccda496
|
|
163
153
|
ffi (1.17.4-x86_64-linux-gnu) sha256=9d3db14c2eae074b382fa9c083fe95aec6e0a1451da249eab096c34002bc752d
|
|
164
154
|
fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
|
|
165
|
-
html-to-markdown (
|
|
155
|
+
html-to-markdown (3.0.0)
|
|
166
156
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
167
157
|
json (2.19.3) sha256=289b0bb53052a1fa8c34ab33cc750b659ba14a5c45f3fcf4b18762dc67c78646
|
|
168
158
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
data/README.md
CHANGED
|
@@ -17,8 +17,8 @@
|
|
|
17
17
|
<a href="https://central.sonatype.com/artifact/dev.kreuzberg/html-to-markdown">
|
|
18
18
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/html-to-markdown?label=Java&color=007ec6" alt="Java">
|
|
19
19
|
</a>
|
|
20
|
-
<a href="https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/
|
|
21
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/html-to-markdown?label=Go&color=007ec6&filter=
|
|
20
|
+
<a href="https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v3/htmltomarkdown">
|
|
21
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/html-to-markdown?label=Go&color=007ec6&filter=v3.0.0" alt="Go">
|
|
22
22
|
</a>
|
|
23
23
|
<a href="https://www.nuget.org/packages/KreuzbergDev.HtmlToMarkdown/">
|
|
24
24
|
<img src="https://img.shields.io/nuget/v/KreuzbergDev.HtmlToMarkdown?label=C%23&color=007ec6" alt="C#">
|
|
@@ -87,7 +87,6 @@ Apple M4 • Real Wikipedia documents • `convert()` (Ruby)
|
|
|
87
87
|
| Mixed (Python wiki) | 656KB | 4.89ms | 134 MB/s |
|
|
88
88
|
|
|
89
89
|
|
|
90
|
-
See [Performance Guide](../../examples/performance/) for detailed benchmarks.
|
|
91
90
|
|
|
92
91
|
|
|
93
92
|
## Quick Start
|
|
@@ -98,7 +97,8 @@ Basic conversion:
|
|
|
98
97
|
require 'html_to_markdown'
|
|
99
98
|
|
|
100
99
|
html = "<h1>Hello</h1><p>This is <strong>fast</strong>!</p>"
|
|
101
|
-
|
|
100
|
+
result = HtmlToMarkdown.convert(html)
|
|
101
|
+
markdown = result[:content]
|
|
102
102
|
```
|
|
103
103
|
|
|
104
104
|
|
|
@@ -109,60 +109,50 @@ With conversion options:
|
|
|
109
109
|
require 'html_to_markdown'
|
|
110
110
|
|
|
111
111
|
html = "<h1>Hello</h1><p>This is <strong>fast</strong>!</p>"
|
|
112
|
-
|
|
112
|
+
result = HtmlToMarkdown.convert(html, heading_style: :atx, code_block_style: :fenced)
|
|
113
|
+
markdown = result[:content]
|
|
113
114
|
```
|
|
114
115
|
|
|
115
116
|
|
|
116
117
|
|
|
117
118
|
|
|
118
|
-
|
|
119
|
-
|
|
120
119
|
## API Reference
|
|
121
120
|
|
|
122
|
-
### Core
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
**`convert(html, options: nil) -> String`**
|
|
121
|
+
### Core Function
|
|
126
122
|
|
|
127
|
-
Basic HTML-to-Markdown conversion. Fast and simple.
|
|
128
123
|
|
|
129
|
-
**`
|
|
124
|
+
**`convert(html, options: nil, visitor: nil) -> ConversionResult`**
|
|
130
125
|
|
|
131
|
-
|
|
126
|
+
Converts HTML to Markdown. Returns a `ConversionResult` hash with all results in a single call.
|
|
132
127
|
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
Customize conversion with visitor callbacks for element interception. See [Visitor Pattern Guide](../../examples/visitor-pattern/).
|
|
136
|
-
|
|
137
|
-
**`convert_with_inline_images(html, config: nil) -> [String, Array, Array]`**
|
|
138
|
-
|
|
139
|
-
Extract base64-encoded inline images with metadata.
|
|
140
|
-
|
|
141
|
-
**`convert_with_tables(html, options: nil, config: nil) -> ConversionWithTables`**
|
|
128
|
+
```ruby
|
|
129
|
+
require 'html_to_markdown'
|
|
142
130
|
|
|
143
|
-
|
|
131
|
+
result = HtmlToMarkdown.convert(html)
|
|
132
|
+
markdown = result[:content] # Converted Markdown string
|
|
133
|
+
metadata = result[:metadata] # Metadata (when extract_metadata: true)
|
|
134
|
+
tables = result[:tables] # Structured table data (when extract_tables: true)
|
|
135
|
+
document = result[:document] # Document-level info
|
|
136
|
+
images = result[:images] # Extracted images
|
|
137
|
+
warnings = result[:warnings] # Any conversion warnings
|
|
138
|
+
```
|
|
144
139
|
|
|
145
140
|
|
|
146
141
|
|
|
147
142
|
### Options
|
|
148
143
|
|
|
149
144
|
**`ConversionOptions`** – Key configuration fields:
|
|
145
|
+
|
|
150
146
|
- `heading_style`: Heading format (`"underlined"` | `"atx"` | `"atx_closed"`) — default: `"underlined"`
|
|
151
147
|
- `list_indent_width`: Spaces per indent level — default: `2`
|
|
152
148
|
- `bullets`: Bullet characters cycle — default: `"*+-"`
|
|
153
149
|
- `wrap`: Enable text wrapping — default: `false`
|
|
154
150
|
- `wrap_width`: Wrap at column — default: `80`
|
|
155
151
|
- `code_language`: Default fenced code block language — default: none
|
|
156
|
-
- `extract_metadata`:
|
|
152
|
+
- `extract_metadata`: Enable metadata extraction into `result.metadata` — default: `false`
|
|
153
|
+
- `extract_tables`: Enable structured table extraction into `result.tables` — default: `false`
|
|
157
154
|
- `output_format`: Output markup format (`"markdown"` | `"djot"` | `"plain"`) — default: `"markdown"`
|
|
158
155
|
|
|
159
|
-
**`MetadataConfig`** – Selective metadata extraction:
|
|
160
|
-
- `extract_headers`: h1-h6 elements — default: `true`
|
|
161
|
-
- `extract_links`: Hyperlinks — default: `true`
|
|
162
|
-
- `extract_images`: Image elements — default: `true`
|
|
163
|
-
- `extract_structured_data`: JSON-LD, Microdata, RDFa — default: `true`
|
|
164
|
-
- `max_structured_data_size`: Size limit in bytes — default: `100KB`
|
|
165
|
-
|
|
166
156
|
|
|
167
157
|
## Djot Output Format
|
|
168
158
|
|
|
@@ -222,16 +212,17 @@ Plain text mode is useful for search indexing, text extraction, and feeding cont
|
|
|
222
212
|
|
|
223
213
|
## Metadata Extraction
|
|
224
214
|
|
|
225
|
-
The metadata extraction feature enables comprehensive document analysis during conversion. Extract document properties, headers, links, images, and structured data in a single pass.
|
|
215
|
+
The metadata extraction feature enables comprehensive document analysis during conversion. Extract document properties, headers, links, images, and structured data in a single pass — all via the standard `convert()` function.
|
|
226
216
|
|
|
227
217
|
**Use Cases:**
|
|
218
|
+
|
|
228
219
|
- **SEO analysis** – Extract title, description, Open Graph tags, Twitter cards
|
|
229
220
|
- **Table of contents generation** – Build structured outlines from heading hierarchy
|
|
230
221
|
- **Content migration** – Document all external links and resources
|
|
231
222
|
- **Accessibility audits** – Check for images without alt text, empty links, invalid heading hierarchy
|
|
232
223
|
- **Link validation** – Classify and validate anchor, internal, external, email, and phone links
|
|
233
224
|
|
|
234
|
-
**Zero Overhead When Disabled:** Metadata extraction adds negligible overhead and happens during the HTML parsing pass.
|
|
225
|
+
**Zero Overhead When Disabled:** Metadata extraction adds negligible overhead and happens during the HTML parsing pass. Pass `extract_metadata: true` in `ConversionOptions` to enable it; the result is available at `result.metadata`.
|
|
235
226
|
|
|
236
227
|
### Example: Quick Start
|
|
237
228
|
|
|
@@ -240,27 +231,27 @@ The metadata extraction feature enables comprehensive document analysis during c
|
|
|
240
231
|
require 'html_to_markdown'
|
|
241
232
|
|
|
242
233
|
html = '<h1>Article</h1><img src="test.jpg" alt="test">'
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
puts
|
|
246
|
-
puts metadata[:
|
|
247
|
-
puts metadata[:
|
|
248
|
-
puts metadata[:
|
|
249
|
-
puts metadata[:
|
|
234
|
+
result = HtmlToMarkdown.convert(html, extract_metadata: true)
|
|
235
|
+
|
|
236
|
+
puts result[:content] # Converted Markdown
|
|
237
|
+
puts result[:metadata][:document][:title] # Document title
|
|
238
|
+
puts result[:metadata][:headers] # All h1-h6 elements
|
|
239
|
+
puts result[:metadata][:links] # All hyperlinks
|
|
240
|
+
puts result[:metadata][:images] # All images with alt text
|
|
241
|
+
puts result[:metadata][:structured_data] # JSON-LD, Microdata, RDFa
|
|
250
242
|
```
|
|
251
243
|
|
|
252
244
|
|
|
253
245
|
|
|
254
|
-
For detailed examples including SEO extraction, table-of-contents generation, link validation, and accessibility audits, see the [Metadata Extraction Guide](../../examples/metadata-extraction/).
|
|
255
|
-
|
|
256
246
|
|
|
257
247
|
|
|
258
248
|
|
|
259
249
|
## Visitor Pattern
|
|
260
250
|
|
|
261
|
-
The visitor pattern enables custom HTML→Markdown conversion logic by providing callbacks for specific HTML elements during traversal.
|
|
251
|
+
The visitor pattern enables custom HTML→Markdown conversion logic by providing callbacks for specific HTML elements during traversal. Pass a visitor as the third argument to `convert()`.
|
|
262
252
|
|
|
263
253
|
**Use Cases:**
|
|
254
|
+
|
|
264
255
|
- **Custom Markdown dialects** – Convert to Obsidian, Notion, or other flavors
|
|
265
256
|
- **Content filtering** – Remove tracking pixels, ads, or unwanted elements
|
|
266
257
|
- **URL rewriting** – Rewrite CDN URLs, add query parameters, validate links
|
|
@@ -291,20 +282,16 @@ class MyVisitor
|
|
|
291
282
|
end
|
|
292
283
|
|
|
293
284
|
html = '<a href="https://old-cdn.com/file.pdf">Download</a>'
|
|
294
|
-
|
|
285
|
+
result = HtmlToMarkdown.convert(html, visitor: MyVisitor.new)
|
|
286
|
+
markdown = result[:content]
|
|
295
287
|
```
|
|
296
288
|
|
|
297
289
|
|
|
298
290
|
|
|
299
|
-
For comprehensive examples including content filtering, link footnotes, accessibility validation, and asynchronous URL validation, see the [Visitor Pattern Guide](../../examples/visitor-pattern/).
|
|
300
|
-
|
|
301
291
|
|
|
302
292
|
|
|
303
293
|
## Examples
|
|
304
294
|
|
|
305
|
-
- [Visitor Pattern Guide](../../examples/visitor-pattern/)
|
|
306
|
-
- [Metadata Extraction Guide](../../examples/metadata-extraction/)
|
|
307
|
-
- [Performance Guide](../../examples/performance/)
|
|
308
295
|
|
|
309
296
|
## Links
|
|
310
297
|
|
data/lib/html_to_markdown.rb
CHANGED
|
@@ -7,205 +7,24 @@ module HtmlToMarkdown
|
|
|
7
7
|
autoload :CLI, 'html_to_markdown/cli'
|
|
8
8
|
autoload :CLIProxy, 'html_to_markdown/cli_proxy'
|
|
9
9
|
|
|
10
|
-
class Options; end # rubocop:disable Lint/EmptyClass
|
|
11
|
-
|
|
12
10
|
class << self
|
|
13
11
|
alias native_convert convert
|
|
14
|
-
alias native_convert_with_inline_images convert_with_inline_images
|
|
15
|
-
alias native_convert_with_inline_images_handle convert_with_inline_images_handle
|
|
16
|
-
alias native_options options
|
|
17
|
-
alias native_convert_with_options convert_with_options
|
|
18
|
-
alias native_convert_with_metadata convert_with_metadata
|
|
19
|
-
alias native_convert_with_metadata_handle convert_with_metadata_handle
|
|
20
|
-
alias native_convert_with_visitor convert_with_visitor
|
|
21
|
-
alias native_convert_with_tables convert_with_tables
|
|
22
12
|
end
|
|
23
13
|
|
|
24
14
|
module_function
|
|
25
15
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
# NOTE: visitor parameter is accepted for API compatibility but not used in inline images mode
|
|
40
|
-
# The visitor pattern is only supported in the standard convert() method
|
|
41
|
-
native_convert_with_inline_images(html.to_s, options, image_config)
|
|
42
|
-
end
|
|
43
|
-
|
|
44
|
-
def convert_with_inline_images_handle(html, options_handle, image_config = nil)
|
|
45
|
-
native_convert_with_inline_images_handle(html.to_s, options_handle, image_config)
|
|
46
|
-
end
|
|
47
|
-
|
|
48
|
-
def options(options_hash = nil)
|
|
49
|
-
native_options(options_hash)
|
|
50
|
-
end
|
|
51
|
-
|
|
52
|
-
# Convert HTML to Markdown with comprehensive metadata extraction.
|
|
53
|
-
#
|
|
54
|
-
# Performs HTML-to-Markdown conversion while extracting document metadata, headers,
|
|
55
|
-
# links, images, and structured data in a single pass. Ideal for content analysis,
|
|
56
|
-
# SEO workflows, and document indexing.
|
|
57
|
-
#
|
|
58
|
-
# @param html [String] HTML string to convert. Line endings are normalized (CRLF -> LF).
|
|
59
|
-
# @param options [ConversionOptions, Hash, nil] Optional conversion configuration.
|
|
60
|
-
# When a Hash, keys should match ConversionOptions field names (as symbols or strings).
|
|
61
|
-
# Common options:
|
|
62
|
-
# - :heading_style [String] "atx", "atx_closed", or "underlined" (default: "underlined")
|
|
63
|
-
# - :list_indent_type [String] "spaces" or "tabs" (default: "spaces")
|
|
64
|
-
# - :list_indent_width [Integer] Spaces per indent level (default: 4)
|
|
65
|
-
# - :wrap [true, false] Enable text wrapping (default: false)
|
|
66
|
-
# - :wrap_width [Integer] Wrap at this column width (default: 80)
|
|
67
|
-
# See ConversionOptions documentation for complete list.
|
|
68
|
-
#
|
|
69
|
-
# @param metadata_config [Hash, nil] Optional metadata extraction configuration.
|
|
70
|
-
# Keys should be symbols or strings. Supported keys:
|
|
71
|
-
# - :extract_headers [true, false] Extract h1-h6 heading elements (default: true)
|
|
72
|
-
# - :extract_links [true, false] Extract hyperlinks with type classification (default: true)
|
|
73
|
-
# - :extract_images [true, false] Extract image elements (default: true)
|
|
74
|
-
# - :extract_structured_data [true, false] Extract JSON-LD/Microdata/RDFa (default: true)
|
|
75
|
-
# - :max_structured_data_size [Integer] Size limit for structured data in bytes (default: 1_000_000)
|
|
76
|
-
#
|
|
77
|
-
# @return [Array<String, Hash>] Tuple of [markdown_string, metadata_hash]
|
|
78
|
-
# markdown_string: String - The converted Markdown output
|
|
79
|
-
#
|
|
80
|
-
# metadata_hash: Hash with keys:
|
|
81
|
-
# - :document [Hash] Document-level metadata:
|
|
82
|
-
# - :title [String, nil] From <title> tag
|
|
83
|
-
# - :description [String, nil] From <meta name="description">
|
|
84
|
-
# - :keywords [Array<String>] From <meta name="keywords">
|
|
85
|
-
# - :author [String, nil] From <meta name="author">
|
|
86
|
-
# - :language [String, nil] From lang attribute (e.g., "en")
|
|
87
|
-
# - :text_direction [String, nil] "ltr", "rtl", or "auto"
|
|
88
|
-
# - :canonical_url [String, nil] From <link rel="canonical">
|
|
89
|
-
# - :base_href [String, nil] From <base href="">
|
|
90
|
-
# - :open_graph [Hash<String, String>] Open Graph properties (og:* meta tags)
|
|
91
|
-
# - :twitter_card [Hash<String, String>] Twitter Card properties (twitter:* meta tags)
|
|
92
|
-
# - :meta_tags [Hash<String, String>] Other meta tags
|
|
93
|
-
#
|
|
94
|
-
# - :headers [Array<Hash>] Heading elements:
|
|
95
|
-
# - :level [Integer] 1-6
|
|
96
|
-
# - :text [String] Header text content
|
|
97
|
-
# - :id [String, nil] HTML id attribute
|
|
98
|
-
# - :depth [Integer] Tree nesting depth
|
|
99
|
-
# - :html_offset [Integer] Byte offset in original HTML
|
|
100
|
-
#
|
|
101
|
-
# - :links [Array<Hash>] Hyperlinks:
|
|
102
|
-
# - :href [String] Link URL
|
|
103
|
-
# - :text [String] Link text content
|
|
104
|
-
# - :title [String, nil] Title attribute
|
|
105
|
-
# - :link_type [String] "anchor", "internal", "external", "email", "phone", or "other"
|
|
106
|
-
# - :rel [Array<String>] Rel attribute values
|
|
107
|
-
# - :attributes [Hash<String, String>] Additional HTML attributes
|
|
108
|
-
#
|
|
109
|
-
# - :images [Array<Hash>] Image elements:
|
|
110
|
-
# - :src [String] Image source URL or data URI
|
|
111
|
-
# - :alt [String, nil] Alt text for accessibility
|
|
112
|
-
# - :title [String, nil] Title attribute
|
|
113
|
-
# - :dimensions [Array<Integer>, nil] [width, height] if available
|
|
114
|
-
# - :image_type [String] "data_uri", "external", "relative", or "inline_svg"
|
|
115
|
-
# - :attributes [Hash<String, String>] Additional HTML attributes
|
|
116
|
-
#
|
|
117
|
-
# - :structured_data [Array<Hash>] Structured data blocks:
|
|
118
|
-
# - :data_type [String] "json_ld", "microdata", or "rdfa"
|
|
119
|
-
# - :raw_json [String] Raw JSON content
|
|
120
|
-
# - :schema_type [String, nil] Schema type (e.g., "Article", "Event")
|
|
121
|
-
#
|
|
122
|
-
# @raise [StandardError] If conversion fails or invalid configuration
|
|
123
|
-
#
|
|
124
|
-
# @example Basic usage
|
|
125
|
-
# html = <<~HTML
|
|
126
|
-
# <html lang="en">
|
|
127
|
-
# <head>
|
|
128
|
-
# <title>My Article</title>
|
|
129
|
-
# <meta name="description" content="A great read">
|
|
130
|
-
# </head>
|
|
131
|
-
# <body>
|
|
132
|
-
# <h1 id="intro">Introduction</h1>
|
|
133
|
-
# <p>Visit <a href="https://example.com">our site</a></p>
|
|
134
|
-
# <img src="photo.jpg" alt="Beautiful landscape">
|
|
135
|
-
# </body>
|
|
136
|
-
# </html>
|
|
137
|
-
# HTML
|
|
138
|
-
#
|
|
139
|
-
# markdown, metadata = HtmlToMarkdown.convert_with_metadata(html)
|
|
140
|
-
#
|
|
141
|
-
# puts metadata[:document][:title] # => "My Article"
|
|
142
|
-
# puts metadata[:document][:language] # => "en"
|
|
143
|
-
# puts metadata[:headers].length # => 1
|
|
144
|
-
# puts metadata[:headers][0][:text] # => "Introduction"
|
|
145
|
-
# puts metadata[:links].length # => 1
|
|
146
|
-
# puts metadata[:images].length # => 1
|
|
147
|
-
#
|
|
148
|
-
# @example With selective metadata extraction
|
|
149
|
-
# config = {
|
|
150
|
-
# extract_headers: true,
|
|
151
|
-
# extract_links: true,
|
|
152
|
-
# extract_images: false, # Skip images
|
|
153
|
-
# extract_structured_data: false # Skip structured data
|
|
154
|
-
# }
|
|
155
|
-
#
|
|
156
|
-
# markdown, metadata = HtmlToMarkdown.convert_with_metadata(html, nil, config)
|
|
157
|
-
# puts metadata[:images].empty? # => true (not extracted)
|
|
158
|
-
#
|
|
159
|
-
# @example With conversion options
|
|
160
|
-
# options = {
|
|
161
|
-
# heading_style: "atx", # Use # H1, ## H2 style
|
|
162
|
-
# wrap: true,
|
|
163
|
-
# wrap_width: 80
|
|
164
|
-
# }
|
|
165
|
-
#
|
|
166
|
-
# config = { extract_headers: true }
|
|
167
|
-
#
|
|
168
|
-
# markdown, metadata = HtmlToMarkdown.convert_with_metadata(html, options, config)
|
|
169
|
-
# # Markdown uses ATX-style headings and wraps at 80 characters
|
|
170
|
-
#
|
|
171
|
-
# @see #convert Simple conversion without metadata
|
|
172
|
-
# @see #convert_with_inline_images Extract inline images during conversion
|
|
173
|
-
# @see ConversionOptions Detailed conversion configuration
|
|
174
|
-
def convert_with_metadata(html, options = nil, metadata_config = nil, _visitor = nil)
|
|
175
|
-
# NOTE: visitor parameter is accepted for API compatibility but not used in metadata extraction mode
|
|
176
|
-
# The visitor pattern is only supported in the standard convert() method
|
|
177
|
-
native_convert_with_metadata(html.to_s, options, metadata_config)
|
|
178
|
-
end
|
|
179
|
-
|
|
180
|
-
def convert_with_metadata_handle(html, options_handle, metadata_config = nil)
|
|
181
|
-
native_convert_with_metadata_handle(html.to_s, options_handle, metadata_config)
|
|
182
|
-
end
|
|
183
|
-
|
|
184
|
-
# Convert HTML to Markdown with table extraction.
|
|
185
|
-
#
|
|
186
|
-
# Performs HTML-to-Markdown conversion while extracting structured table data
|
|
187
|
-
# (cells, markdown representation, header row flags) in a single pass.
|
|
188
|
-
#
|
|
189
|
-
# @param html [String] HTML string to convert.
|
|
190
|
-
# @param options [Hash, nil] Optional conversion configuration.
|
|
191
|
-
# @param metadata_config [Hash, nil] Optional metadata extraction configuration.
|
|
192
|
-
#
|
|
193
|
-
# @return [Hash] A hash with keys:
|
|
194
|
-
# - :content [String] The converted Markdown output
|
|
195
|
-
# - :metadata [Hash, nil] Extended metadata (if metadata extraction was configured)
|
|
196
|
-
# - :tables [Array<Hash>] Extracted tables, each with:
|
|
197
|
-
# - :cells [Array<Array<String>>] Table cells organized as rows x columns
|
|
198
|
-
# - :markdown [String] Complete rendered table in Markdown format
|
|
199
|
-
# - :is_header_row [Array<Boolean>] Per-row flag indicating header rows
|
|
200
|
-
#
|
|
201
|
-
# @raise [StandardError] If conversion fails or invalid configuration
|
|
202
|
-
#
|
|
203
|
-
# @example Basic usage
|
|
204
|
-
# html = '<table><thead><tr><th>Name</th></tr></thead><tbody><tr><td>Alice</td></tr></tbody></table>'
|
|
205
|
-
# result = HtmlToMarkdown.convert_with_tables(html)
|
|
206
|
-
# puts result[:tables].length # => 1
|
|
207
|
-
# puts result[:tables][0][:cells] # => [["Name"], ["Alice"]]
|
|
208
|
-
def convert_with_tables(html, options = nil, metadata_config = nil)
|
|
209
|
-
native_convert_with_tables(html.to_s, options, metadata_config)
|
|
16
|
+
# Convert HTML to Markdown, returning a Hash with:
|
|
17
|
+
# - :content [String, nil] the converted Markdown output
|
|
18
|
+
# - :document [nil] document structure (not yet exposed)
|
|
19
|
+
# - :metadata [Hash, nil] extracted HTML metadata
|
|
20
|
+
# - :tables [Array<Hash>] extracted tables with :grid and :markdown
|
|
21
|
+
# - :images [Array<Hash>] extracted inline images
|
|
22
|
+
# - :warnings [Array<Hash>] processing warnings
|
|
23
|
+
#
|
|
24
|
+
# @param html [String] HTML string to convert
|
|
25
|
+
# @param options [Hash, nil] optional conversion options
|
|
26
|
+
# @return [Hash] conversion result
|
|
27
|
+
def convert(html, options = nil)
|
|
28
|
+
native_convert(html.to_s, options)
|
|
210
29
|
end
|
|
211
30
|
end
|
data/lib/html_to_markdown_rb.so
CHANGED
|
Binary file
|