html-to-markdown 2.13.0 → 2.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +200 -3
- data/ext/html-to-markdown-rb/native/Cargo.toml +2 -2
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +121 -5
- metadata +2 -3
- data/METADATA.md +0 -227
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: dd0a9378e87b5c10c4500389c1c5e9c25a3b7b7ea1a930839b20ec0c9e00745b
|
|
4
|
+
data.tar.gz: 81b1626a43403390709c9c1fdecc377648ef8cda554fafafda6c0cfa2f841bcb
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 30e15347d844c106f7e0538f499e9162c3d14903b7af3dff932649150e050b68e04bf4775a20377aaacc8f7c3c290a9128cb290047f9247f251a6214b793e043
|
|
7
|
+
data.tar.gz: d966ff4c9395461196e6f81f087ba7be193b9ea2f071e74053f9ae7feed10c5ffeca73f54cd01c53ba9ba8548634b3820dfa70712b040f5ce2d7d3ceefdff5b4
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
|
@@ -184,7 +184,37 @@ result.inline_images.each do |img|
|
|
|
184
184
|
end
|
|
185
185
|
```
|
|
186
186
|
|
|
187
|
-
### Metadata
|
|
187
|
+
### Metadata Extraction
|
|
188
|
+
|
|
189
|
+
Extract comprehensive metadata alongside Markdown conversion: document properties (title, description, author, language), social metadata (Open Graph, Twitter cards), heading hierarchy, link analysis (type classification, rel attributes), image metadata (dimensions, type detection), and structured data (JSON-LD, Microdata, RDFa).
|
|
190
|
+
|
|
191
|
+
#### Basic Usage
|
|
192
|
+
|
|
193
|
+
```ruby
|
|
194
|
+
require 'html_to_markdown'
|
|
195
|
+
|
|
196
|
+
html = '<html lang="en"><head><title>Test</title></head><body><h1>Hello</h1></body></html>'
|
|
197
|
+
markdown, metadata = HtmlToMarkdown.convert_with_metadata(html)
|
|
198
|
+
|
|
199
|
+
puts markdown
|
|
200
|
+
puts metadata[:document][:title] # "Test"
|
|
201
|
+
puts metadata[:headers].length # 1
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
#### With Conversion Options
|
|
205
|
+
|
|
206
|
+
```ruby
|
|
207
|
+
conv_opts = { heading_style: :atx_closed }
|
|
208
|
+
metadata_opts = { extract_headers: true, extract_links: false }
|
|
209
|
+
|
|
210
|
+
markdown, metadata = HtmlToMarkdown.convert_with_metadata(
|
|
211
|
+
html,
|
|
212
|
+
conv_opts,
|
|
213
|
+
metadata_opts
|
|
214
|
+
)
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
#### Full Example
|
|
188
218
|
|
|
189
219
|
```ruby
|
|
190
220
|
require 'html_to_markdown'
|
|
@@ -195,11 +225,16 @@ html = <<~HTML
|
|
|
195
225
|
<title>Example</title>
|
|
196
226
|
<meta name="description" content="Demo page">
|
|
197
227
|
<link rel="canonical" href="https://example.com/page">
|
|
228
|
+
<meta property="og:image" content="https://example.com/og.jpg">
|
|
229
|
+
<meta name="twitter:card" content="summary_large_image">
|
|
198
230
|
</head>
|
|
199
231
|
<body>
|
|
200
232
|
<h1 id="welcome">Welcome</h1>
|
|
201
233
|
<a href="https://example.com" rel="nofollow external">Example link</a>
|
|
202
234
|
<img src="https://example.com/image.jpg" alt="Hero" width="640" height="480">
|
|
235
|
+
<script type="application/ld+json">
|
|
236
|
+
{"@context": "https://schema.org", "@type": "Article"}
|
|
237
|
+
</script>
|
|
203
238
|
</body>
|
|
204
239
|
</html>
|
|
205
240
|
HTML
|
|
@@ -207,16 +242,178 @@ HTML
|
|
|
207
242
|
markdown, metadata = HtmlToMarkdown.convert_with_metadata(
|
|
208
243
|
html,
|
|
209
244
|
{ heading_style: :atx },
|
|
210
|
-
{ extract_links: true, extract_images: true, extract_headers: true }
|
|
245
|
+
{ extract_links: true, extract_images: true, extract_headers: true, extract_structured_data: true }
|
|
211
246
|
)
|
|
212
247
|
|
|
213
248
|
puts markdown
|
|
214
249
|
puts metadata[:document][:title] # "Example"
|
|
250
|
+
puts metadata[:document][:description] # "Demo page"
|
|
251
|
+
puts metadata[:document][:open_graph] # {"og:image" => "https://example.com/og.jpg"}
|
|
215
252
|
puts metadata[:links].first[:rel] # ["nofollow", "external"]
|
|
216
253
|
puts metadata[:images].first[:dimensions] # [640, 480]
|
|
254
|
+
puts metadata[:headers].first[:id] # "welcome"
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
#### Return Value Structure
|
|
258
|
+
|
|
259
|
+
Returns a 2-element array: `[markdown_string, metadata_hash]`
|
|
260
|
+
|
|
261
|
+
The metadata hash contains:
|
|
262
|
+
|
|
263
|
+
```ruby
|
|
264
|
+
{
|
|
265
|
+
document: {
|
|
266
|
+
title: String?,
|
|
267
|
+
description: String?,
|
|
268
|
+
keywords: Array[String],
|
|
269
|
+
author: String?,
|
|
270
|
+
canonical_url: String?,
|
|
271
|
+
base_href: String?,
|
|
272
|
+
language: String?,
|
|
273
|
+
text_direction: "ltr" | "rtl" | "auto" | nil,
|
|
274
|
+
open_graph: Hash[String, String],
|
|
275
|
+
twitter_card: Hash[String, String],
|
|
276
|
+
meta_tags: Hash[String, String]
|
|
277
|
+
},
|
|
278
|
+
headers: [
|
|
279
|
+
{
|
|
280
|
+
level: Integer, # 1-6
|
|
281
|
+
text: String,
|
|
282
|
+
id: String?,
|
|
283
|
+
depth: Integer,
|
|
284
|
+
html_offset: Integer
|
|
285
|
+
}
|
|
286
|
+
],
|
|
287
|
+
links: [
|
|
288
|
+
{
|
|
289
|
+
href: String,
|
|
290
|
+
text: String,
|
|
291
|
+
title: String?,
|
|
292
|
+
link_type: "anchor" | "internal" | "external" | "email" | "phone" | "other",
|
|
293
|
+
rel: Array[String],
|
|
294
|
+
attributes: Hash[String, String]
|
|
295
|
+
}
|
|
296
|
+
],
|
|
297
|
+
images: [
|
|
298
|
+
{
|
|
299
|
+
src: String,
|
|
300
|
+
alt: String?,
|
|
301
|
+
title: String?,
|
|
302
|
+
dimensions: [Integer, Integer]?,
|
|
303
|
+
image_type: "data_uri" | "inline_svg" | "external" | "relative",
|
|
304
|
+
attributes: Hash[String, String]
|
|
305
|
+
}
|
|
306
|
+
],
|
|
307
|
+
structured_data: [
|
|
308
|
+
{
|
|
309
|
+
data_type: "json_ld" | "microdata" | "rdfa",
|
|
310
|
+
raw_json: String,
|
|
311
|
+
schema_type: String?
|
|
312
|
+
}
|
|
313
|
+
]
|
|
314
|
+
}
|
|
315
|
+
```
|
|
316
|
+
|
|
317
|
+
#### Metadata Configuration
|
|
318
|
+
|
|
319
|
+
Pass a hash with the following options to control which metadata types are extracted:
|
|
320
|
+
|
|
321
|
+
```ruby
|
|
322
|
+
config = {
|
|
323
|
+
extract_headers: true, # Extract h1-h6 elements (default: true)
|
|
324
|
+
extract_links: true, # Extract <a> elements (default: true)
|
|
325
|
+
extract_images: true, # Extract <img> elements (default: true)
|
|
326
|
+
extract_structured_data: true, # Extract JSON-LD/Microdata/RDFa (default: true)
|
|
327
|
+
max_structured_data_size: 1_000_000 # Max bytes for structured data (default: 1MB)
|
|
328
|
+
}
|
|
329
|
+
|
|
330
|
+
markdown, metadata = HtmlToMarkdown.convert_with_metadata(html, nil, config)
|
|
331
|
+
```
|
|
332
|
+
|
|
333
|
+
#### Features
|
|
334
|
+
|
|
335
|
+
The Ruby binding provides comprehensive metadata extraction during HTML-to-Markdown conversion:
|
|
336
|
+
|
|
337
|
+
- **Document Metadata**: title, description, keywords, author, canonical URL, language, text direction
|
|
338
|
+
- **Open Graph & Twitter Card**: social media metadata extraction
|
|
339
|
+
- **Headers**: h1-h6 extraction with hierarchy, ids, and depth tracking
|
|
340
|
+
- **Links**: hyperlink extraction with type classification (anchor, internal, external, email, phone)
|
|
341
|
+
- **Images**: image extraction with source type (data_uri, inline_svg, external, relative) and dimensions
|
|
342
|
+
- **Structured Data**: JSON-LD, Microdata, and RDFa extraction
|
|
343
|
+
|
|
344
|
+
#### Type Safety with RBS
|
|
345
|
+
|
|
346
|
+
All types are defined in RBS format in `sig/html_to_markdown.rbs`:
|
|
347
|
+
|
|
348
|
+
- `document_metadata` - Document-level metadata structure
|
|
349
|
+
- `header_metadata` - Individual header element
|
|
350
|
+
- `link_metadata` - Individual link element
|
|
351
|
+
- `image_metadata` - Individual image element
|
|
352
|
+
- `structured_data` - Structured data block
|
|
353
|
+
- `extended_metadata` - Complete metadata extraction result
|
|
354
|
+
|
|
355
|
+
Uses strict RBS type checking with Steep for full type safety:
|
|
356
|
+
|
|
357
|
+
```bash
|
|
358
|
+
steep check
|
|
359
|
+
```
|
|
360
|
+
|
|
361
|
+
#### Implementation Architecture
|
|
362
|
+
|
|
363
|
+
The Rust implementation uses a single-pass collector pattern for efficient metadata extraction:
|
|
364
|
+
|
|
365
|
+
1. **No duplication**: Core logic lives in Rust (`crates/html-to-markdown/src/metadata.rs`)
|
|
366
|
+
2. **Minimal wrapper layer**: Ruby binding in `crates/html-to-markdown-rb/src/lib.rs`
|
|
367
|
+
3. **Type translation**: Rust types → Ruby hashes with proper Magnus bindings
|
|
368
|
+
4. **Hash conversion**: Uses Magnus `RHash` API for efficient Ruby hash construction
|
|
369
|
+
|
|
370
|
+
The metadata feature is gated by a Cargo feature in `Cargo.toml`:
|
|
371
|
+
|
|
372
|
+
```toml
|
|
373
|
+
[features]
|
|
374
|
+
metadata = ["html-to-markdown-rs/metadata"]
|
|
375
|
+
```
|
|
376
|
+
|
|
377
|
+
This ensures:
|
|
378
|
+
- Zero overhead when metadata is not needed
|
|
379
|
+
- Clean integration with feature flag detection
|
|
380
|
+
- Consistent with Python binding implementation
|
|
381
|
+
|
|
382
|
+
#### Language Parity
|
|
383
|
+
|
|
384
|
+
Implements the same API as the Python binding:
|
|
385
|
+
|
|
386
|
+
- Same method signature: `convert_with_metadata(html, options, metadata_config)`
|
|
387
|
+
- Same return type: `[markdown, metadata_dict]`
|
|
388
|
+
- Same metadata structures and field names
|
|
389
|
+
- Same enum values (link_type, image_type, data_type, text_direction)
|
|
390
|
+
|
|
391
|
+
Enables seamless migration and multi-language development.
|
|
392
|
+
|
|
393
|
+
#### Performance
|
|
394
|
+
|
|
395
|
+
Single-pass collection during tree traversal:
|
|
396
|
+
- No additional parsing passes
|
|
397
|
+
- Minimal memory overhead
|
|
398
|
+
- Configurable extraction granularity
|
|
399
|
+
- Built-in size limits for safety
|
|
400
|
+
|
|
401
|
+
#### Testing
|
|
402
|
+
|
|
403
|
+
Comprehensive RSpec test suite in `spec/metadata_extraction_spec.rb`:
|
|
404
|
+
|
|
405
|
+
```bash
|
|
406
|
+
cd packages/ruby
|
|
407
|
+
bundle exec rake compile -- --release --features metadata
|
|
408
|
+
bundle exec rspec spec/metadata_extraction_spec.rb
|
|
217
409
|
```
|
|
218
410
|
|
|
219
|
-
|
|
411
|
+
Tests cover:
|
|
412
|
+
- All metadata types extraction
|
|
413
|
+
- Configuration flags
|
|
414
|
+
- Edge cases (empty HTML, malformed input, special characters)
|
|
415
|
+
- Return value structure validation
|
|
416
|
+
- Integration with conversion options
|
|
220
417
|
|
|
221
418
|
## CLI
|
|
222
419
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "html-to-markdown-rb"
|
|
3
|
-
version = "2.13.0"
|
|
3
|
+
version = "2.14.1"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
6
6
|
license = "MIT"
|
|
@@ -22,7 +22,7 @@ default = ["metadata"]
|
|
|
22
22
|
metadata = ["html-to-markdown-rs/metadata"]
|
|
23
23
|
|
|
24
24
|
[dependencies]
|
|
25
|
-
html-to-markdown-rs = { version = "2.13.0", features = ["inline-images"] }
|
|
25
|
+
html-to-markdown-rs = { version = "2.14.1", features = ["inline-images"] }
|
|
26
26
|
magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = ["rb-sys"] }
|
|
27
27
|
|
|
28
28
|
[dev-dependencies]
|
data/lib/html_to_markdown.rb
CHANGED
|
@@ -35,12 +35,128 @@ module HtmlToMarkdown
|
|
|
35
35
|
native_options(options_hash)
|
|
36
36
|
end
|
|
37
37
|
|
|
38
|
-
# Convert HTML to Markdown with comprehensive metadata extraction
|
|
38
|
+
# Convert HTML to Markdown with comprehensive metadata extraction.
|
|
39
39
|
#
|
|
40
|
-
#
|
|
41
|
-
#
|
|
42
|
-
#
|
|
43
|
-
#
|
|
40
|
+
# Performs HTML-to-Markdown conversion while extracting document metadata, headers,
|
|
41
|
+
# links, images, and structured data in a single pass. Ideal for content analysis,
|
|
42
|
+
# SEO workflows, and document indexing.
|
|
43
|
+
#
|
|
44
|
+
# @param html [String] HTML string to convert. Line endings are normalized (CRLF -> LF).
|
|
45
|
+
# @param options [ConversionOptions, Hash, nil] Optional conversion configuration.
|
|
46
|
+
# When a Hash, keys should match ConversionOptions field names (as symbols or strings).
|
|
47
|
+
# Common options:
|
|
48
|
+
# - :heading_style [String] "atx", "atx_closed", or "underlined" (default: "underlined")
|
|
49
|
+
# - :list_indent_type [String] "spaces" or "tabs" (default: "spaces")
|
|
50
|
+
# - :list_indent_width [Integer] Spaces per indent level (default: 4)
|
|
51
|
+
# - :wrap [true, false] Enable text wrapping (default: false)
|
|
52
|
+
# - :wrap_width [Integer] Wrap at this column width (default: 80)
|
|
53
|
+
# See ConversionOptions documentation for complete list.
|
|
54
|
+
#
|
|
55
|
+
# @param metadata_config [Hash, nil] Optional metadata extraction configuration.
|
|
56
|
+
# Keys should be symbols or strings. Supported keys:
|
|
57
|
+
# - :extract_headers [true, false] Extract h1-h6 heading elements (default: true)
|
|
58
|
+
# - :extract_links [true, false] Extract hyperlinks with type classification (default: true)
|
|
59
|
+
# - :extract_images [true, false] Extract image elements (default: true)
|
|
60
|
+
# - :extract_structured_data [true, false] Extract JSON-LD/Microdata/RDFa (default: true)
|
|
61
|
+
# - :max_structured_data_size [Integer] Size limit for structured data in bytes (default: 1_000_000)
|
|
62
|
+
#
|
|
63
|
+
# @return [Array(String, Hash)] Two-element tuple of [markdown_string, metadata_hash]
|
|
64
|
+
# markdown_string: String - The converted Markdown output
|
|
65
|
+
#
|
|
66
|
+
# metadata_hash: Hash with keys:
|
|
67
|
+
# - :document [Hash] Document-level metadata:
|
|
68
|
+
# - :title [String, nil] From <title> tag
|
|
69
|
+
# - :description [String, nil] From <meta name="description">
|
|
70
|
+
# - :keywords [Array<String>] From <meta name="keywords">
|
|
71
|
+
# - :author [String, nil] From <meta name="author">
|
|
72
|
+
# - :language [String, nil] From lang attribute (e.g., "en")
|
|
73
|
+
# - :text_direction [String, nil] "ltr", "rtl", or "auto"
|
|
74
|
+
# - :canonical_url [String, nil] From <link rel="canonical">
|
|
75
|
+
# - :base_href [String, nil] From <base href="">
|
|
76
|
+
# - :open_graph [Hash<String, String>] Open Graph properties (og:* meta tags)
|
|
77
|
+
# - :twitter_card [Hash<String, String>] Twitter Card properties (twitter:* meta tags)
|
|
78
|
+
# - :meta_tags [Hash<String, String>] Other meta tags
|
|
79
|
+
#
|
|
80
|
+
# - :headers [Array<Hash>] Heading elements:
|
|
81
|
+
# - :level [Integer] 1-6
|
|
82
|
+
# - :text [String] Header text content
|
|
83
|
+
# - :id [String, nil] HTML id attribute
|
|
84
|
+
# - :depth [Integer] Tree nesting depth
|
|
85
|
+
# - :html_offset [Integer] Byte offset in original HTML
|
|
86
|
+
#
|
|
87
|
+
# - :links [Array<Hash>] Hyperlinks:
|
|
88
|
+
# - :href [String] Link URL
|
|
89
|
+
# - :text [String] Link text content
|
|
90
|
+
# - :title [String, nil] Title attribute
|
|
91
|
+
# - :link_type [String] "anchor", "internal", "external", "email", "phone", or "other"
|
|
92
|
+
# - :rel [Array<String>] Rel attribute values
|
|
93
|
+
# - :attributes [Hash<String, String>] Additional HTML attributes
|
|
94
|
+
#
|
|
95
|
+
# - :images [Array<Hash>] Image elements:
|
|
96
|
+
# - :src [String] Image source URL or data URI
|
|
97
|
+
# - :alt [String, nil] Alt text for accessibility
|
|
98
|
+
# - :title [String, nil] Title attribute
|
|
99
|
+
# - :dimensions [Array<Integer>, nil] [width, height] if available
|
|
100
|
+
# - :image_type [String] "data_uri", "external", "relative", or "inline_svg"
|
|
101
|
+
# - :attributes [Hash<String, String>] Additional HTML attributes
|
|
102
|
+
#
|
|
103
|
+
# - :structured_data [Array<Hash>] Structured data blocks:
|
|
104
|
+
# - :data_type [String] "json_ld", "microdata", or "rdfa"
|
|
105
|
+
# - :raw_json [String] Raw JSON content
|
|
106
|
+
# - :schema_type [String, nil] Schema type (e.g., "Article", "Event")
|
|
107
|
+
#
|
|
108
|
+
# @raise [StandardError] If conversion fails or the configuration is invalid
|
|
109
|
+
#
|
|
110
|
+
# @example Basic usage
|
|
111
|
+
# html = <<~HTML
|
|
112
|
+
# <html lang="en">
|
|
113
|
+
# <head>
|
|
114
|
+
# <title>My Article</title>
|
|
115
|
+
# <meta name="description" content="A great read">
|
|
116
|
+
# </head>
|
|
117
|
+
# <body>
|
|
118
|
+
# <h1 id="intro">Introduction</h1>
|
|
119
|
+
# <p>Visit <a href="https://example.com">our site</a></p>
|
|
120
|
+
# <img src="photo.jpg" alt="Beautiful landscape">
|
|
121
|
+
# </body>
|
|
122
|
+
# </html>
|
|
123
|
+
# HTML
|
|
124
|
+
#
|
|
125
|
+
# markdown, metadata = HtmlToMarkdown.convert_with_metadata(html)
|
|
126
|
+
#
|
|
127
|
+
# puts metadata[:document][:title] # => "My Article"
|
|
128
|
+
# puts metadata[:document][:language] # => "en"
|
|
129
|
+
# puts metadata[:headers].length # => 1
|
|
130
|
+
# puts metadata[:headers][0][:text] # => "Introduction"
|
|
131
|
+
# puts metadata[:links].length # => 1
|
|
132
|
+
# puts metadata[:images].length # => 1
|
|
133
|
+
#
|
|
134
|
+
# @example With selective metadata extraction
|
|
135
|
+
# config = {
|
|
136
|
+
# extract_headers: true,
|
|
137
|
+
# extract_links: true,
|
|
138
|
+
# extract_images: false, # Skip images
|
|
139
|
+
# extract_structured_data: false # Skip structured data
|
|
140
|
+
# }
|
|
141
|
+
#
|
|
142
|
+
# markdown, metadata = HtmlToMarkdown.convert_with_metadata(html, nil, config)
|
|
143
|
+
# puts metadata[:images].empty? # => true (not extracted)
|
|
144
|
+
#
|
|
145
|
+
# @example With conversion options
|
|
146
|
+
# options = {
|
|
147
|
+
# heading_style: "atx", # Use # H1, ## H2 style
|
|
148
|
+
# wrap: true,
|
|
149
|
+
# wrap_width: 80
|
|
150
|
+
# }
|
|
151
|
+
#
|
|
152
|
+
# config = { extract_headers: true }
|
|
153
|
+
#
|
|
154
|
+
# markdown, metadata = HtmlToMarkdown.convert_with_metadata(html, options, config)
|
|
155
|
+
# # Markdown uses ATX-style headings and wraps at 80 characters
|
|
156
|
+
#
|
|
157
|
+
# @see #convert Simple conversion without metadata
|
|
158
|
+
# @see #convert_with_inline_images Extract inline images during conversion
|
|
159
|
+
# @see ConversionOptions Detailed conversion configuration
|
|
44
160
|
def convert_with_metadata(html, options = nil, metadata_config = nil)
|
|
45
161
|
native_convert_with_metadata(html.to_s, options, metadata_config)
|
|
46
162
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html-to-markdown
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.13.0
|
|
4
|
+
version: 2.14.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2025-12-
|
|
11
|
+
date: 2025-12-12 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
@@ -46,7 +46,6 @@ files:
|
|
|
46
46
|
- ".rubocop.yml"
|
|
47
47
|
- Gemfile
|
|
48
48
|
- Gemfile.lock
|
|
49
|
-
- METADATA.md
|
|
50
49
|
- README.md
|
|
51
50
|
- Rakefile
|
|
52
51
|
- Steepfile
|
data/METADATA.md
DELETED
|
@@ -1,227 +0,0 @@
|
|
|
1
|
-
# Metadata Extraction for Ruby Bindings
|
|
2
|
-
|
|
3
|
-
Complete Ruby Magnus binding implementation for HTML-to-Markdown metadata extraction with full RBS type signatures.
|
|
4
|
-
|
|
5
|
-
## Features
|
|
6
|
-
|
|
7
|
-
The Ruby binding provides comprehensive metadata extraction during HTML-to-Markdown conversion:
|
|
8
|
-
|
|
9
|
-
- **Document Metadata**: title, description, keywords, author, canonical URL, language, text direction
|
|
10
|
-
- **Open Graph & Twitter Card**: social media metadata extraction
|
|
11
|
-
- **Headers**: h1-h6 extraction with hierarchy, ids, and depth tracking
|
|
12
|
-
- **Links**: hyperlink extraction with type classification (anchor, internal, external, email, phone)
|
|
13
|
-
- **Images**: image extraction with source type (data_uri, inline_svg, external, relative) and dimensions
|
|
14
|
-
- **Structured Data**: JSON-LD, Microdata, and RDFa extraction
|
|
15
|
-
|
|
16
|
-
## API
|
|
17
|
-
|
|
18
|
-
### Basic Usage
|
|
19
|
-
|
|
20
|
-
```ruby
|
|
21
|
-
require 'html_to_markdown'
|
|
22
|
-
|
|
23
|
-
html = '<html lang="en"><head><title>Test</title></head><body><h1>Hello</h1></body></html>'
|
|
24
|
-
markdown, metadata = HtmlToMarkdown.convert_with_metadata(html)
|
|
25
|
-
|
|
26
|
-
puts markdown
|
|
27
|
-
puts metadata[:document][:title] # "Test"
|
|
28
|
-
puts metadata[:headers].length # 1
|
|
29
|
-
```
|
|
30
|
-
|
|
31
|
-
### With Conversion Options
|
|
32
|
-
|
|
33
|
-
```ruby
|
|
34
|
-
conv_opts = { heading_style: :atx_closed }
|
|
35
|
-
metadata_opts = { extract_headers: true, extract_links: false }
|
|
36
|
-
|
|
37
|
-
markdown, metadata = HtmlToMarkdown.convert_with_metadata(
|
|
38
|
-
html,
|
|
39
|
-
conv_opts,
|
|
40
|
-
metadata_opts
|
|
41
|
-
)
|
|
42
|
-
```
|
|
43
|
-
|
|
44
|
-
### Return Value
|
|
45
|
-
|
|
46
|
-
Returns a 2-element array: `[markdown_string, metadata_hash]`
|
|
47
|
-
|
|
48
|
-
The metadata hash contains:
|
|
49
|
-
|
|
50
|
-
```ruby
|
|
51
|
-
{
|
|
52
|
-
document: {
|
|
53
|
-
title: String?,
|
|
54
|
-
description: String?,
|
|
55
|
-
keywords: Array[String],
|
|
56
|
-
author: String?,
|
|
57
|
-
canonical_url: String?,
|
|
58
|
-
base_href: String?,
|
|
59
|
-
language: String?,
|
|
60
|
-
text_direction: "ltr" | "rtl" | "auto" | nil,
|
|
61
|
-
open_graph: Hash[String, String],
|
|
62
|
-
twitter_card: Hash[String, String],
|
|
63
|
-
meta_tags: Hash[String, String]
|
|
64
|
-
},
|
|
65
|
-
headers: [
|
|
66
|
-
{
|
|
67
|
-
level: Integer, # 1-6
|
|
68
|
-
text: String,
|
|
69
|
-
id: String?,
|
|
70
|
-
depth: Integer,
|
|
71
|
-
html_offset: Integer
|
|
72
|
-
}
|
|
73
|
-
],
|
|
74
|
-
links: [
|
|
75
|
-
{
|
|
76
|
-
href: String,
|
|
77
|
-
text: String,
|
|
78
|
-
title: String?,
|
|
79
|
-
link_type: "anchor" | "internal" | "external" | "email" | "phone" | "other",
|
|
80
|
-
rel: Array[String],
|
|
81
|
-
attributes: Hash[String, String]
|
|
82
|
-
}
|
|
83
|
-
],
|
|
84
|
-
images: [
|
|
85
|
-
{
|
|
86
|
-
src: String,
|
|
87
|
-
alt: String?,
|
|
88
|
-
title: String?,
|
|
89
|
-
dimensions: [Integer, Integer]?,
|
|
90
|
-
image_type: "data_uri" | "inline_svg" | "external" | "relative",
|
|
91
|
-
attributes: Hash[String, String]
|
|
92
|
-
}
|
|
93
|
-
],
|
|
94
|
-
structured_data: [
|
|
95
|
-
{
|
|
96
|
-
data_type: "json_ld" | "microdata" | "rdfa",
|
|
97
|
-
raw_json: String,
|
|
98
|
-
schema_type: String?
|
|
99
|
-
}
|
|
100
|
-
]
|
|
101
|
-
}
|
|
102
|
-
```
|
|
103
|
-
|
|
104
|
-
## Metadata Configuration
|
|
105
|
-
|
|
106
|
-
Pass a hash with the following options to control which metadata types are extracted:
|
|
107
|
-
|
|
108
|
-
```ruby
|
|
109
|
-
config = {
|
|
110
|
-
extract_headers: true, # Extract h1-h6 elements (default: true)
|
|
111
|
-
extract_links: true, # Extract <a> elements (default: true)
|
|
112
|
-
extract_images: true, # Extract <img> elements (default: true)
|
|
113
|
-
extract_structured_data: true, # Extract JSON-LD/Microdata/RDFa (default: true)
|
|
114
|
-
max_structured_data_size: 1_000_000 # Max bytes for structured data (default: 1MB)
|
|
115
|
-
}
|
|
116
|
-
|
|
117
|
-
markdown, metadata = HtmlToMarkdown.convert_with_metadata(html, nil, config)
|
|
118
|
-
```
|
|
119
|
-
|
|
120
|
-
## Type Signatures
|
|
121
|
-
|
|
122
|
-
All types are defined in RBS format in `sig/html_to_markdown.rbs`:
|
|
123
|
-
|
|
124
|
-
- `document_metadata` - Document-level metadata structure
|
|
125
|
-
- `header_metadata` - Individual header element
|
|
126
|
-
- `link_metadata` - Individual link element
|
|
127
|
-
- `image_metadata` - Individual image element
|
|
128
|
-
- `structured_data` - Structured data block
|
|
129
|
-
- `extended_metadata` - Complete metadata extraction result
|
|
130
|
-
|
|
131
|
-
Uses strict RBS type checking with Steep for full type safety.
|
|
132
|
-
|
|
133
|
-
## Implementation Details
|
|
134
|
-
|
|
135
|
-
### Architecture
|
|
136
|
-
|
|
137
|
-
The Rust implementation uses a single-pass collector pattern for efficient metadata extraction:
|
|
138
|
-
|
|
139
|
-
1. **No duplication**: Core logic lives in Rust (`crates/html-to-markdown/src/metadata.rs`)
|
|
140
|
-
2. **Minimal wrapper layer**: Ruby binding in `crates/html-to-markdown-rb/src/lib.rs`
|
|
141
|
-
3. **Type translation**: Rust types → Ruby hashes with proper Magnus bindings
|
|
142
|
-
4. **Hash conversion**: Uses Magnus `RHash` API for efficient Ruby hash construction
|
|
143
|
-
|
|
144
|
-
### Hash Conversion Pattern
|
|
145
|
-
|
|
146
|
-
Following the inline_images pattern:
|
|
147
|
-
|
|
148
|
-
```rust
|
|
149
|
-
fn document_metadata_to_ruby(ruby: &Ruby, doc: RustDocumentMetadata) -> Result<Value, Error> {
|
|
150
|
-
let hash = ruby.hash_new();
|
|
151
|
-
hash.aset(ruby.intern("title"), opt_string_to_ruby(ruby, doc.title)?)?;
|
|
152
|
-
hash.aset(ruby.intern("keywords"), keywords_array)?;
|
|
153
|
-
// ... more fields
|
|
154
|
-
Ok(hash.as_value())
|
|
155
|
-
}
|
|
156
|
-
```
|
|
157
|
-
|
|
158
|
-
### Feature Flag
|
|
159
|
-
|
|
160
|
-
The metadata feature is gated by a Cargo feature in `Cargo.toml`:
|
|
161
|
-
|
|
162
|
-
```toml
|
|
163
|
-
[features]
|
|
164
|
-
metadata = ["html-to-markdown-rs/metadata"]
|
|
165
|
-
```
|
|
166
|
-
|
|
167
|
-
This ensures:
|
|
168
|
-
- Zero overhead when metadata is not needed
|
|
169
|
-
- Clean integration with feature flag detection
|
|
170
|
-
- Consistent with Python binding implementation
|
|
171
|
-
|
|
172
|
-
## Tests
|
|
173
|
-
|
|
174
|
-
Comprehensive RSpec test suite in `spec/metadata_extraction_spec.rb`:
|
|
175
|
-
|
|
176
|
-
```bash
|
|
177
|
-
cd packages/ruby
|
|
178
|
-
bundle exec rake compile
|
|
179
|
-
bundle exec rspec spec/metadata_extraction_spec.rb
|
|
180
|
-
```
|
|
181
|
-
|
|
182
|
-
Tests cover:
|
|
183
|
-
- All metadata types extraction
|
|
184
|
-
- Configuration flags
|
|
185
|
-
- Edge cases (empty HTML, malformed input, special characters)
|
|
186
|
-
- Return value structure validation
|
|
187
|
-
- Integration with conversion options
|
|
188
|
-
|
|
189
|
-
## Language Parity
|
|
190
|
-
|
|
191
|
-
Implements the same API as the Python binding:
|
|
192
|
-
|
|
193
|
-
- Same method signature: `convert_with_metadata(html, options, metadata_config)`
|
|
194
|
-
- Same return type: `[markdown, metadata_dict]`
|
|
195
|
-
- Same metadata structures and field names
|
|
196
|
-
- Same enum values (link_type, image_type, data_type, text_direction)
|
|
197
|
-
|
|
198
|
-
Enables seamless migration and multi-language development.
|
|
199
|
-
|
|
200
|
-
## Performance
|
|
201
|
-
|
|
202
|
-
Single-pass collection during tree traversal:
|
|
203
|
-
- No additional parsing passes
|
|
204
|
-
- Minimal memory overhead
|
|
205
|
-
- Configurable extraction granularity
|
|
206
|
-
- Built-in size limits for safety
|
|
207
|
-
|
|
208
|
-
## Building and Testing
|
|
209
|
-
|
|
210
|
-
Build the extension with metadata support:
|
|
211
|
-
|
|
212
|
-
```bash
|
|
213
|
-
cd packages/ruby
|
|
214
|
-
bundle exec rake compile -- --release --features metadata
|
|
215
|
-
```
|
|
216
|
-
|
|
217
|
-
Run type checking:
|
|
218
|
-
|
|
219
|
-
```bash
|
|
220
|
-
steep check
|
|
221
|
-
```
|
|
222
|
-
|
|
223
|
-
Run tests:
|
|
224
|
-
|
|
225
|
-
```bash
|
|
226
|
-
bundle exec rspec spec/metadata_extraction_spec.rb
|
|
227
|
-
```
|