html-to-markdown 2.12.1 → 2.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +231 -0
- data/ext/html-to-markdown-rb/native/Cargo.toml +4 -3
- data/ext/html-to-markdown-rb/native/src/lib.rs +269 -0
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +127 -0
- data/sig/html_to_markdown.rbs +123 -8
- data/spec/metadata_extraction_spec.rb +440 -0
- metadata +3 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: dd0a9378e87b5c10c4500389c1c5e9c25a3b7b7ea1a930839b20ec0c9e00745b
|
|
4
|
+
data.tar.gz: 81b1626a43403390709c9c1fdecc377648ef8cda554fafafda6c0cfa2f841bcb
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 30e15347d844c106f7e0538f499e9162c3d14903b7af3dff932649150e050b68e04bf4775a20377aaacc8f7c3c290a9128cb290047f9247f251a6214b793e043
|
|
7
|
+
data.tar.gz: d966ff4c9395461196e6f81f087ba7be193b9ea2f071e74053f9ae7feed10c5ffeca73f54cd01c53ba9ba8548634b3820dfa70712b040f5ce2d7d3ceefdff5b4
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
|
@@ -184,6 +184,237 @@ result.inline_images.each do |img|
|
|
|
184
184
|
end
|
|
185
185
|
```
|
|
186
186
|
|
|
187
|
+
### Metadata Extraction
|
|
188
|
+
|
|
189
|
+
Extract comprehensive metadata alongside Markdown conversion: document properties (title, description, author, language), social metadata (Open Graph, Twitter cards), heading hierarchy, link analysis (type classification, rel attributes), image metadata (dimensions, type detection), and structured data (JSON-LD, Microdata, RDFa).
|
|
190
|
+
|
|
191
|
+
#### Basic Usage
|
|
192
|
+
|
|
193
|
+
```ruby
|
|
194
|
+
require 'html_to_markdown'
|
|
195
|
+
|
|
196
|
+
html = '<html lang="en"><head><title>Test</title></head><body><h1>Hello</h1></body></html>'
|
|
197
|
+
markdown, metadata = HtmlToMarkdown.convert_with_metadata(html)
|
|
198
|
+
|
|
199
|
+
puts markdown
|
|
200
|
+
puts metadata[:document][:title] # "Test"
|
|
201
|
+
puts metadata[:headers].length # 1
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
#### With Conversion Options
|
|
205
|
+
|
|
206
|
+
```ruby
|
|
207
|
+
conv_opts = { heading_style: :atx_closed }
|
|
208
|
+
metadata_opts = { extract_headers: true, extract_links: false }
|
|
209
|
+
|
|
210
|
+
markdown, metadata = HtmlToMarkdown.convert_with_metadata(
|
|
211
|
+
html,
|
|
212
|
+
conv_opts,
|
|
213
|
+
metadata_opts
|
|
214
|
+
)
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
#### Full Example
|
|
218
|
+
|
|
219
|
+
```ruby
|
|
220
|
+
require 'html_to_markdown'
|
|
221
|
+
|
|
222
|
+
html = <<~HTML
|
|
223
|
+
<html>
|
|
224
|
+
<head>
|
|
225
|
+
<title>Example</title>
|
|
226
|
+
<meta name="description" content="Demo page">
|
|
227
|
+
<link rel="canonical" href="https://example.com/page">
|
|
228
|
+
<meta property="og:image" content="https://example.com/og.jpg">
|
|
229
|
+
<meta name="twitter:card" content="summary_large_image">
|
|
230
|
+
</head>
|
|
231
|
+
<body>
|
|
232
|
+
<h1 id="welcome">Welcome</h1>
|
|
233
|
+
<a href="https://example.com" rel="nofollow external">Example link</a>
|
|
234
|
+
<img src="https://example.com/image.jpg" alt="Hero" width="640" height="480">
|
|
235
|
+
<script type="application/ld+json">
|
|
236
|
+
{"@context": "https://schema.org", "@type": "Article"}
|
|
237
|
+
</script>
|
|
238
|
+
</body>
|
|
239
|
+
</html>
|
|
240
|
+
HTML
|
|
241
|
+
|
|
242
|
+
markdown, metadata = HtmlToMarkdown.convert_with_metadata(
|
|
243
|
+
html,
|
|
244
|
+
{ heading_style: :atx },
|
|
245
|
+
{ extract_links: true, extract_images: true, extract_headers: true, extract_structured_data: true }
|
|
246
|
+
)
|
|
247
|
+
|
|
248
|
+
puts markdown
|
|
249
|
+
puts metadata[:document][:title] # "Example"
|
|
250
|
+
puts metadata[:document][:description] # "Demo page"
|
|
251
|
+
puts metadata[:document][:open_graph] # {"og:image" => "https://example.com/og.jpg"}
|
|
252
|
+
puts metadata[:links].first[:rel] # ["nofollow", "external"]
|
|
253
|
+
puts metadata[:images].first[:dimensions] # [640, 480]
|
|
254
|
+
puts metadata[:headers].first[:id] # "welcome"
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
#### Return Value Structure
|
|
258
|
+
|
|
259
|
+
Returns a 2-element array: `[markdown_string, metadata_hash]`
|
|
260
|
+
|
|
261
|
+
The metadata hash contains:
|
|
262
|
+
|
|
263
|
+
```ruby
|
|
264
|
+
{
|
|
265
|
+
document: {
|
|
266
|
+
title: String?,
|
|
267
|
+
description: String?,
|
|
268
|
+
keywords: Array[String],
|
|
269
|
+
author: String?,
|
|
270
|
+
canonical_url: String?,
|
|
271
|
+
base_href: String?,
|
|
272
|
+
language: String?,
|
|
273
|
+
text_direction: "ltr" | "rtl" | "auto" | nil,
|
|
274
|
+
open_graph: Hash[String, String],
|
|
275
|
+
twitter_card: Hash[String, String],
|
|
276
|
+
meta_tags: Hash[String, String]
|
|
277
|
+
},
|
|
278
|
+
headers: [
|
|
279
|
+
{
|
|
280
|
+
level: Integer, # 1-6
|
|
281
|
+
text: String,
|
|
282
|
+
id: String?,
|
|
283
|
+
depth: Integer,
|
|
284
|
+
html_offset: Integer
|
|
285
|
+
}
|
|
286
|
+
],
|
|
287
|
+
links: [
|
|
288
|
+
{
|
|
289
|
+
href: String,
|
|
290
|
+
text: String,
|
|
291
|
+
title: String?,
|
|
292
|
+
link_type: "anchor" | "internal" | "external" | "email" | "phone" | "other",
|
|
293
|
+
rel: Array[String],
|
|
294
|
+
attributes: Hash[String, String]
|
|
295
|
+
}
|
|
296
|
+
],
|
|
297
|
+
images: [
|
|
298
|
+
{
|
|
299
|
+
src: String,
|
|
300
|
+
alt: String?,
|
|
301
|
+
title: String?,
|
|
302
|
+
dimensions: [Integer, Integer]?,
|
|
303
|
+
image_type: "data_uri" | "inline_svg" | "external" | "relative",
|
|
304
|
+
attributes: Hash[String, String]
|
|
305
|
+
}
|
|
306
|
+
],
|
|
307
|
+
structured_data: [
|
|
308
|
+
{
|
|
309
|
+
data_type: "json_ld" | "microdata" | "rdfa",
|
|
310
|
+
raw_json: String,
|
|
311
|
+
schema_type: String?
|
|
312
|
+
}
|
|
313
|
+
]
|
|
314
|
+
}
|
|
315
|
+
```
|
|
316
|
+
|
|
317
|
+
#### Metadata Configuration
|
|
318
|
+
|
|
319
|
+
Pass a hash with the following options to control which metadata types are extracted:
|
|
320
|
+
|
|
321
|
+
```ruby
|
|
322
|
+
config = {
|
|
323
|
+
extract_headers: true, # Extract h1-h6 elements (default: true)
|
|
324
|
+
extract_links: true, # Extract <a> elements (default: true)
|
|
325
|
+
extract_images: true, # Extract <img> elements (default: true)
|
|
326
|
+
extract_structured_data: true, # Extract JSON-LD/Microdata/RDFa (default: true)
|
|
327
|
+
max_structured_data_size: 1_000_000 # Max bytes for structured data (default: 1MB)
|
|
328
|
+
}
|
|
329
|
+
|
|
330
|
+
markdown, metadata = HtmlToMarkdown.convert_with_metadata(html, nil, config)
|
|
331
|
+
```
|
|
332
|
+
|
|
333
|
+
#### Features
|
|
334
|
+
|
|
335
|
+
The Ruby binding provides comprehensive metadata extraction during HTML-to-Markdown conversion:
|
|
336
|
+
|
|
337
|
+
- **Document Metadata**: title, description, keywords, author, canonical URL, language, text direction
|
|
338
|
+
- **Open Graph & Twitter Card**: social media metadata extraction
|
|
339
|
+
- **Headers**: h1-h6 extraction with hierarchy, ids, and depth tracking
|
|
340
|
+
- **Links**: hyperlink extraction with type classification (anchor, internal, external, email, phone)
|
|
341
|
+
- **Images**: image extraction with source type (data_uri, inline_svg, external, relative) and dimensions
|
|
342
|
+
- **Structured Data**: JSON-LD, Microdata, and RDFa extraction
|
|
343
|
+
|
|
344
|
+
#### Type Safety with RBS
|
|
345
|
+
|
|
346
|
+
All types are defined in RBS format in `sig/html_to_markdown.rbs`:
|
|
347
|
+
|
|
348
|
+
- `document_metadata` - Document-level metadata structure
|
|
349
|
+
- `header_metadata` - Individual header element
|
|
350
|
+
- `link_metadata` - Individual link element
|
|
351
|
+
- `image_metadata` - Individual image element
|
|
352
|
+
- `structured_data` - Structured data block
|
|
353
|
+
- `extended_metadata` - Complete metadata extraction result
|
|
354
|
+
|
|
355
|
+
The gem uses strict RBS type checking with Steep for full type safety:
|
|
356
|
+
|
|
357
|
+
```bash
|
|
358
|
+
steep check
|
|
359
|
+
```
|
|
360
|
+
|
|
361
|
+
#### Implementation Architecture
|
|
362
|
+
|
|
363
|
+
The Rust implementation uses a single-pass collector pattern for efficient metadata extraction:
|
|
364
|
+
|
|
365
|
+
1. **No duplication**: Core logic lives in Rust (`crates/html-to-markdown/src/metadata.rs`)
|
|
366
|
+
2. **Minimal wrapper layer**: Ruby binding in `crates/html-to-markdown-rb/src/lib.rs`
|
|
367
|
+
3. **Type translation**: Rust types → Ruby hashes with proper Magnus bindings
|
|
368
|
+
4. **Hash conversion**: Uses Magnus `RHash` API for efficient Ruby hash construction
|
|
369
|
+
|
|
370
|
+
The metadata feature is gated by a Cargo feature in `Cargo.toml`:
|
|
371
|
+
|
|
372
|
+
```toml
|
|
373
|
+
[features]
|
|
374
|
+
metadata = ["html-to-markdown-rs/metadata"]
|
|
375
|
+
```
|
|
376
|
+
|
|
377
|
+
This ensures:
|
|
378
|
+
- Zero overhead when metadata is not needed
|
|
379
|
+
- Clean integration with feature flag detection
|
|
380
|
+
- Consistent with Python binding implementation
|
|
381
|
+
|
|
382
|
+
#### Language Parity
|
|
383
|
+
|
|
384
|
+
The Ruby binding implements the same API as the Python binding:
|
|
385
|
+
|
|
386
|
+
- Same method signature: `convert_with_metadata(html, options, metadata_config)`
|
|
387
|
+
- Same return type: `[markdown, metadata_dict]`
|
|
388
|
+
- Same metadata structures and field names
|
|
389
|
+
- Same enum values (link_type, image_type, data_type, text_direction)
|
|
390
|
+
|
|
391
|
+
This parity enables seamless migration and multi-language development.
|
|
392
|
+
|
|
393
|
+
#### Performance
|
|
394
|
+
|
|
395
|
+
Single-pass collection during tree traversal:
|
|
396
|
+
- No additional parsing passes
|
|
397
|
+
- Minimal memory overhead
|
|
398
|
+
- Configurable extraction granularity
|
|
399
|
+
- Built-in size limits for safety
|
|
400
|
+
|
|
401
|
+
#### Testing
|
|
402
|
+
|
|
403
|
+
Comprehensive RSpec test suite in `spec/metadata_extraction_spec.rb`:
|
|
404
|
+
|
|
405
|
+
```bash
|
|
406
|
+
cd packages/ruby
|
|
407
|
+
bundle exec rake compile -- --release --features metadata
|
|
408
|
+
bundle exec rspec spec/metadata_extraction_spec.rb
|
|
409
|
+
```
|
|
410
|
+
|
|
411
|
+
Tests cover:
|
|
412
|
+
- All metadata types extraction
|
|
413
|
+
- Configuration flags
|
|
414
|
+
- Edge cases (empty HTML, malformed input, special characters)
|
|
415
|
+
- Return value structure validation
|
|
416
|
+
- Integration with conversion options
|
|
417
|
+
|
|
187
418
|
## CLI
|
|
188
419
|
|
|
189
420
|
The gem bundles a small proxy for the Rust CLI binary. Use it when you need parity with the standalone `html-to-markdown` executable.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "html-to-markdown-rb"
|
|
3
|
-
version = "2.12.1"
|
|
3
|
+
version = "2.14.1"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
6
6
|
license = "MIT"
|
|
@@ -18,10 +18,11 @@ name = "html_to_markdown_rb"
|
|
|
18
18
|
crate-type = ["cdylib", "rlib"]
|
|
19
19
|
|
|
20
20
|
[features]
|
|
21
|
-
default = []
|
|
21
|
+
default = ["metadata"]
|
|
22
|
+
metadata = ["html-to-markdown-rs/metadata"]
|
|
22
23
|
|
|
23
24
|
[dependencies]
|
|
24
|
-
html-to-markdown-rs = { version = "2.12.1", features = ["inline-images"] }
|
|
25
|
+
html-to-markdown-rs = { version = "2.14.1", features = ["inline-images"] }
|
|
25
26
|
magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = ["rb-sys"] }
|
|
26
27
|
|
|
27
28
|
[dev-dependencies]
|
|
@@ -4,6 +4,17 @@ use html_to_markdown_rs::{
|
|
|
4
4
|
PreprocessingPreset, WhitespaceMode, convert as convert_inner,
|
|
5
5
|
convert_with_inline_images as convert_with_inline_images_inner, error::ConversionError, safety::guard_panic,
|
|
6
6
|
};
|
|
7
|
+
|
|
8
|
+
#[cfg(feature = "metadata")]
|
|
9
|
+
use html_to_markdown_rs::convert_with_metadata as convert_with_metadata_inner;
|
|
10
|
+
#[cfg(feature = "metadata")]
|
|
11
|
+
use html_to_markdown_rs::metadata::{
|
|
12
|
+
DocumentMetadata as RustDocumentMetadata, ExtendedMetadata as RustExtendedMetadata,
|
|
13
|
+
HeaderMetadata as RustHeaderMetadata, ImageMetadata as RustImageMetadata, ImageType as RustImageType,
|
|
14
|
+
LinkMetadata as RustLinkMetadata, LinkType as RustLinkType, MetadataConfig as RustMetadataConfig,
|
|
15
|
+
StructuredData as RustStructuredData, StructuredDataType as RustStructuredDataType,
|
|
16
|
+
TextDirection as RustTextDirection,
|
|
17
|
+
};
|
|
7
18
|
use magnus::prelude::*;
|
|
8
19
|
use magnus::r_hash::ForEach;
|
|
9
20
|
use magnus::{Error, RArray, RHash, Ruby, Symbol, TryConvert, Value, function, scan_args::scan_args};
|
|
@@ -423,6 +434,261 @@ fn convert_with_inline_images_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, E
|
|
|
423
434
|
extraction_to_value(ruby, extraction)
|
|
424
435
|
}
|
|
425
436
|
|
|
437
|
+
#[cfg(feature = "metadata")]
|
|
438
|
+
fn build_metadata_config(_ruby: &Ruby, config: Option<Value>) -> Result<RustMetadataConfig, Error> {
|
|
439
|
+
let mut cfg = RustMetadataConfig::default();
|
|
440
|
+
|
|
441
|
+
let Some(config) = config else {
|
|
442
|
+
return Ok(cfg);
|
|
443
|
+
};
|
|
444
|
+
|
|
445
|
+
if config.is_nil() {
|
|
446
|
+
return Ok(cfg);
|
|
447
|
+
}
|
|
448
|
+
|
|
449
|
+
let hash = RHash::from_value(config).ok_or_else(|| arg_error("metadata_config must be provided as a Hash"))?;
|
|
450
|
+
|
|
451
|
+
hash.foreach(|key: Value, val: Value| {
|
|
452
|
+
let key_name = symbol_to_string(key)?;
|
|
453
|
+
match key_name.as_str() {
|
|
454
|
+
"extract_headers" => {
|
|
455
|
+
cfg.extract_headers = bool::try_convert(val)?;
|
|
456
|
+
}
|
|
457
|
+
"extract_links" => {
|
|
458
|
+
cfg.extract_links = bool::try_convert(val)?;
|
|
459
|
+
}
|
|
460
|
+
"extract_images" => {
|
|
461
|
+
cfg.extract_images = bool::try_convert(val)?;
|
|
462
|
+
}
|
|
463
|
+
"extract_structured_data" => {
|
|
464
|
+
cfg.extract_structured_data = bool::try_convert(val)?;
|
|
465
|
+
}
|
|
466
|
+
"max_structured_data_size" => {
|
|
467
|
+
cfg.max_structured_data_size = usize::try_convert(val)?;
|
|
468
|
+
}
|
|
469
|
+
_ => {}
|
|
470
|
+
}
|
|
471
|
+
Ok(ForEach::Continue)
|
|
472
|
+
})?;
|
|
473
|
+
|
|
474
|
+
Ok(cfg)
|
|
475
|
+
}
|
|
476
|
+
|
|
477
|
+
#[cfg(feature = "metadata")]
|
|
478
|
+
fn opt_string_to_ruby(ruby: &Ruby, opt: Option<String>) -> Result<Value, Error> {
|
|
479
|
+
match opt {
|
|
480
|
+
Some(val) => Ok(ruby.str_from_slice(val.as_bytes()).as_value()),
|
|
481
|
+
None => Ok(ruby.qnil().as_value()),
|
|
482
|
+
}
|
|
483
|
+
}
|
|
484
|
+
|
|
485
|
+
#[cfg(feature = "metadata")]
|
|
486
|
+
fn btreemap_to_ruby_hash(ruby: &Ruby, map: std::collections::BTreeMap<String, String>) -> Result<Value, Error> {
|
|
487
|
+
let hash = ruby.hash_new();
|
|
488
|
+
for (k, v) in map {
|
|
489
|
+
hash.aset(k, v)?;
|
|
490
|
+
}
|
|
491
|
+
Ok(hash.as_value())
|
|
492
|
+
}
|
|
493
|
+
|
|
494
|
+
#[cfg(feature = "metadata")]
|
|
495
|
+
fn text_direction_to_string(text_direction: Option<RustTextDirection>) -> Option<&'static str> {
|
|
496
|
+
match text_direction {
|
|
497
|
+
Some(RustTextDirection::LeftToRight) => Some("ltr"),
|
|
498
|
+
Some(RustTextDirection::RightToLeft) => Some("rtl"),
|
|
499
|
+
Some(RustTextDirection::Auto) => Some("auto"),
|
|
500
|
+
None => None,
|
|
501
|
+
}
|
|
502
|
+
}
|
|
503
|
+
|
|
504
|
+
#[cfg(feature = "metadata")]
|
|
505
|
+
fn link_type_to_string(link_type: &RustLinkType) -> &'static str {
|
|
506
|
+
match link_type {
|
|
507
|
+
RustLinkType::Anchor => "anchor",
|
|
508
|
+
RustLinkType::Internal => "internal",
|
|
509
|
+
RustLinkType::External => "external",
|
|
510
|
+
RustLinkType::Email => "email",
|
|
511
|
+
RustLinkType::Phone => "phone",
|
|
512
|
+
RustLinkType::Other => "other",
|
|
513
|
+
}
|
|
514
|
+
}
|
|
515
|
+
|
|
516
|
+
#[cfg(feature = "metadata")]
|
|
517
|
+
fn image_type_to_string(image_type: &RustImageType) -> &'static str {
|
|
518
|
+
match image_type {
|
|
519
|
+
RustImageType::DataUri => "data_uri",
|
|
520
|
+
RustImageType::InlineSvg => "inline_svg",
|
|
521
|
+
RustImageType::External => "external",
|
|
522
|
+
RustImageType::Relative => "relative",
|
|
523
|
+
}
|
|
524
|
+
}
|
|
525
|
+
|
|
526
|
+
#[cfg(feature = "metadata")]
|
|
527
|
+
fn structured_data_type_to_string(data_type: &RustStructuredDataType) -> &'static str {
|
|
528
|
+
match data_type {
|
|
529
|
+
RustStructuredDataType::JsonLd => "json_ld",
|
|
530
|
+
RustStructuredDataType::Microdata => "microdata",
|
|
531
|
+
RustStructuredDataType::RDFa => "rdfa",
|
|
532
|
+
}
|
|
533
|
+
}
|
|
534
|
+
|
|
535
|
+
#[cfg(feature = "metadata")]
|
|
536
|
+
fn document_metadata_to_ruby(ruby: &Ruby, doc: RustDocumentMetadata) -> Result<Value, Error> {
|
|
537
|
+
let hash = ruby.hash_new();
|
|
538
|
+
|
|
539
|
+
hash.aset(ruby.intern("title"), opt_string_to_ruby(ruby, doc.title)?)?;
|
|
540
|
+
hash.aset(ruby.intern("description"), opt_string_to_ruby(ruby, doc.description)?)?;
|
|
541
|
+
|
|
542
|
+
let keywords = ruby.ary_new();
|
|
543
|
+
for keyword in doc.keywords {
|
|
544
|
+
keywords.push(keyword)?;
|
|
545
|
+
}
|
|
546
|
+
hash.aset(ruby.intern("keywords"), keywords)?;
|
|
547
|
+
|
|
548
|
+
hash.aset(ruby.intern("author"), opt_string_to_ruby(ruby, doc.author)?)?;
|
|
549
|
+
hash.aset(
|
|
550
|
+
ruby.intern("canonical_url"),
|
|
551
|
+
opt_string_to_ruby(ruby, doc.canonical_url)?,
|
|
552
|
+
)?;
|
|
553
|
+
hash.aset(ruby.intern("base_href"), opt_string_to_ruby(ruby, doc.base_href)?)?;
|
|
554
|
+
hash.aset(ruby.intern("language"), opt_string_to_ruby(ruby, doc.language)?)?;
|
|
555
|
+
|
|
556
|
+
match text_direction_to_string(doc.text_direction) {
|
|
557
|
+
Some(dir) => hash.aset(ruby.intern("text_direction"), dir)?,
|
|
558
|
+
None => hash.aset(ruby.intern("text_direction"), ruby.qnil())?,
|
|
559
|
+
}
|
|
560
|
+
|
|
561
|
+
hash.aset(ruby.intern("open_graph"), btreemap_to_ruby_hash(ruby, doc.open_graph)?)?;
|
|
562
|
+
hash.aset(
|
|
563
|
+
ruby.intern("twitter_card"),
|
|
564
|
+
btreemap_to_ruby_hash(ruby, doc.twitter_card)?,
|
|
565
|
+
)?;
|
|
566
|
+
hash.aset(ruby.intern("meta_tags"), btreemap_to_ruby_hash(ruby, doc.meta_tags)?)?;
|
|
567
|
+
|
|
568
|
+
Ok(hash.as_value())
|
|
569
|
+
}
|
|
570
|
+
|
|
571
|
+
#[cfg(feature = "metadata")]
|
|
572
|
+
fn headers_to_ruby(ruby: &Ruby, headers: Vec<RustHeaderMetadata>) -> Result<Value, Error> {
|
|
573
|
+
let array = ruby.ary_new();
|
|
574
|
+
for header in headers {
|
|
575
|
+
let hash = ruby.hash_new();
|
|
576
|
+
hash.aset(ruby.intern("level"), header.level)?;
|
|
577
|
+
hash.aset(ruby.intern("text"), header.text)?;
|
|
578
|
+
hash.aset(ruby.intern("id"), opt_string_to_ruby(ruby, header.id)?)?;
|
|
579
|
+
hash.aset(ruby.intern("depth"), header.depth as i64)?;
|
|
580
|
+
hash.aset(ruby.intern("html_offset"), header.html_offset as i64)?;
|
|
581
|
+
array.push(hash)?;
|
|
582
|
+
}
|
|
583
|
+
Ok(array.as_value())
|
|
584
|
+
}
|
|
585
|
+
|
|
586
|
+
#[cfg(feature = "metadata")]
|
|
587
|
+
fn links_to_ruby(ruby: &Ruby, links: Vec<RustLinkMetadata>) -> Result<Value, Error> {
|
|
588
|
+
let array = ruby.ary_new();
|
|
589
|
+
for link in links {
|
|
590
|
+
let hash = ruby.hash_new();
|
|
591
|
+
hash.aset(ruby.intern("href"), link.href)?;
|
|
592
|
+
hash.aset(ruby.intern("text"), link.text)?;
|
|
593
|
+
hash.aset(ruby.intern("title"), opt_string_to_ruby(ruby, link.title)?)?;
|
|
594
|
+
hash.aset(ruby.intern("link_type"), link_type_to_string(&link.link_type))?;
|
|
595
|
+
|
|
596
|
+
let rel_array = ruby.ary_new();
|
|
597
|
+
for r in link.rel {
|
|
598
|
+
rel_array.push(r)?;
|
|
599
|
+
}
|
|
600
|
+
hash.aset(ruby.intern("rel"), rel_array)?;
|
|
601
|
+
|
|
602
|
+
hash.aset(ruby.intern("attributes"), btreemap_to_ruby_hash(ruby, link.attributes)?)?;
|
|
603
|
+
array.push(hash)?;
|
|
604
|
+
}
|
|
605
|
+
Ok(array.as_value())
|
|
606
|
+
}
|
|
607
|
+
|
|
608
|
+
#[cfg(feature = "metadata")]
|
|
609
|
+
fn images_to_ruby(ruby: &Ruby, images: Vec<RustImageMetadata>) -> Result<Value, Error> {
|
|
610
|
+
let array = ruby.ary_new();
|
|
611
|
+
for image in images {
|
|
612
|
+
let hash = ruby.hash_new();
|
|
613
|
+
hash.aset(ruby.intern("src"), image.src)?;
|
|
614
|
+
hash.aset(ruby.intern("alt"), opt_string_to_ruby(ruby, image.alt)?)?;
|
|
615
|
+
hash.aset(ruby.intern("title"), opt_string_to_ruby(ruby, image.title)?)?;
|
|
616
|
+
|
|
617
|
+
match image.dimensions {
|
|
618
|
+
Some((width, height)) => {
|
|
619
|
+
let dims = ruby.ary_new();
|
|
620
|
+
dims.push(width as i64)?;
|
|
621
|
+
dims.push(height as i64)?;
|
|
622
|
+
hash.aset(ruby.intern("dimensions"), dims)?;
|
|
623
|
+
}
|
|
624
|
+
None => {
|
|
625
|
+
hash.aset(ruby.intern("dimensions"), ruby.qnil())?;
|
|
626
|
+
}
|
|
627
|
+
}
|
|
628
|
+
|
|
629
|
+
hash.aset(ruby.intern("image_type"), image_type_to_string(&image.image_type))?;
|
|
630
|
+
hash.aset(
|
|
631
|
+
ruby.intern("attributes"),
|
|
632
|
+
btreemap_to_ruby_hash(ruby, image.attributes)?,
|
|
633
|
+
)?;
|
|
634
|
+
array.push(hash)?;
|
|
635
|
+
}
|
|
636
|
+
Ok(array.as_value())
|
|
637
|
+
}
|
|
638
|
+
|
|
639
|
+
#[cfg(feature = "metadata")]
|
|
640
|
+
fn structured_data_to_ruby(ruby: &Ruby, data: Vec<RustStructuredData>) -> Result<Value, Error> {
|
|
641
|
+
let array = ruby.ary_new();
|
|
642
|
+
for item in data {
|
|
643
|
+
let hash = ruby.hash_new();
|
|
644
|
+
hash.aset(
|
|
645
|
+
ruby.intern("data_type"),
|
|
646
|
+
structured_data_type_to_string(&item.data_type),
|
|
647
|
+
)?;
|
|
648
|
+
hash.aset(ruby.intern("raw_json"), item.raw_json)?;
|
|
649
|
+
hash.aset(ruby.intern("schema_type"), opt_string_to_ruby(ruby, item.schema_type)?)?;
|
|
650
|
+
array.push(hash)?;
|
|
651
|
+
}
|
|
652
|
+
Ok(array.as_value())
|
|
653
|
+
}
|
|
654
|
+
|
|
655
|
+
#[cfg(feature = "metadata")]
|
|
656
|
+
fn extended_metadata_to_ruby(ruby: &Ruby, metadata: RustExtendedMetadata) -> Result<Value, Error> {
|
|
657
|
+
let hash = ruby.hash_new();
|
|
658
|
+
|
|
659
|
+
hash.aset(
|
|
660
|
+
ruby.intern("document"),
|
|
661
|
+
document_metadata_to_ruby(ruby, metadata.document)?,
|
|
662
|
+
)?;
|
|
663
|
+
hash.aset(ruby.intern("headers"), headers_to_ruby(ruby, metadata.headers)?)?;
|
|
664
|
+
hash.aset(ruby.intern("links"), links_to_ruby(ruby, metadata.links)?)?;
|
|
665
|
+
hash.aset(ruby.intern("images"), images_to_ruby(ruby, metadata.images)?)?;
|
|
666
|
+
hash.aset(
|
|
667
|
+
ruby.intern("structured_data"),
|
|
668
|
+
structured_data_to_ruby(ruby, metadata.structured_data)?,
|
|
669
|
+
)?;
|
|
670
|
+
|
|
671
|
+
Ok(hash.as_value())
|
|
672
|
+
}
|
|
673
|
+
|
|
674
|
+
#[cfg(feature = "metadata")]
|
|
675
|
+
fn convert_with_metadata_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, Error> {
|
|
676
|
+
let parsed = scan_args::<(String,), (Option<Value>, Option<Value>), (), (), (), ()>(args)?;
|
|
677
|
+
let html = parsed.required.0;
|
|
678
|
+
let options = build_conversion_options(ruby, parsed.optional.0)?;
|
|
679
|
+
let metadata_config = build_metadata_config(ruby, parsed.optional.1)?;
|
|
680
|
+
|
|
681
|
+
let (markdown, metadata) =
|
|
682
|
+
guard_panic(|| convert_with_metadata_inner(&html, Some(options), metadata_config)).map_err(conversion_error)?;
|
|
683
|
+
|
|
684
|
+
// Convert to Ruby array [markdown, metadata_hash]
|
|
685
|
+
let array = ruby.ary_new();
|
|
686
|
+
array.push(markdown)?;
|
|
687
|
+
array.push(extended_metadata_to_ruby(ruby, metadata)?)?;
|
|
688
|
+
|
|
689
|
+
Ok(array.as_value())
|
|
690
|
+
}
|
|
691
|
+
|
|
426
692
|
#[magnus::init]
|
|
427
693
|
fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
428
694
|
let module = ruby.define_module("HtmlToMarkdown")?;
|
|
@@ -434,5 +700,8 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
|
434
700
|
function!(convert_with_inline_images_fn, -1),
|
|
435
701
|
)?;
|
|
436
702
|
|
|
703
|
+
#[cfg(feature = "metadata")]
|
|
704
|
+
module.define_singleton_method("convert_with_metadata", function!(convert_with_metadata_fn, -1))?;
|
|
705
|
+
|
|
437
706
|
Ok(())
|
|
438
707
|
}
|
data/lib/html_to_markdown.rb
CHANGED
|
@@ -14,6 +14,7 @@ module HtmlToMarkdown
|
|
|
14
14
|
alias native_convert_with_inline_images convert_with_inline_images
|
|
15
15
|
alias native_options options
|
|
16
16
|
alias native_convert_with_options convert_with_options
|
|
17
|
+
alias native_convert_with_metadata convert_with_metadata
|
|
17
18
|
end
|
|
18
19
|
|
|
19
20
|
module_function
|
|
@@ -33,4 +34,130 @@ module HtmlToMarkdown
|
|
|
33
34
|
def options(options_hash = nil)
|
|
34
35
|
native_options(options_hash)
|
|
35
36
|
end
|
|
37
|
+
|
|
38
|
+
# Convert HTML to Markdown with comprehensive metadata extraction.
|
|
39
|
+
#
|
|
40
|
+
# Performs HTML-to-Markdown conversion while extracting document metadata, headers,
|
|
41
|
+
# links, images, and structured data in a single pass. Ideal for content analysis,
|
|
42
|
+
# SEO workflows, and document indexing.
|
|
43
|
+
#
|
|
44
|
+
# @param html [String] HTML string to convert. Line endings are normalized (CRLF -> LF).
|
|
45
|
+
# @param options [ConversionOptions, Hash, nil] Optional conversion configuration.
|
|
46
|
+
# When a Hash, keys should match ConversionOptions field names (as symbols or strings).
|
|
47
|
+
# Common options:
|
|
48
|
+
# - :heading_style [String] "atx", "atx_closed", or "underlined" (default: "underlined")
|
|
49
|
+
# - :list_indent_type [String] "spaces" or "tabs" (default: "spaces")
|
|
50
|
+
# - :list_indent_width [Integer] Spaces per indent level (default: 4)
|
|
51
|
+
# - :wrap [true, false] Enable text wrapping (default: false)
|
|
52
|
+
# - :wrap_width [Integer] Wrap at this column width (default: 80)
|
|
53
|
+
# See ConversionOptions documentation for complete list.
|
|
54
|
+
#
|
|
55
|
+
# @param metadata_config [Hash, nil] Optional metadata extraction configuration.
|
|
56
|
+
# Keys should be symbols or strings. Supported keys:
|
|
57
|
+
# - :extract_headers [true, false] Extract h1-h6 heading elements (default: true)
|
|
58
|
+
# - :extract_links [true, false] Extract hyperlinks with type classification (default: true)
|
|
59
|
+
# - :extract_images [true, false] Extract image elements (default: true)
|
|
60
|
+
# - :extract_structured_data [true, false] Extract JSON-LD/Microdata/RDFa (default: true)
|
|
61
|
+
# - :max_structured_data_size [Integer] Size limit for structured data in bytes (default: 1_000_000)
|
|
62
|
+
#
|
|
63
|
+
# @return [Array(String, Hash)] Tuple of [markdown_string, metadata_hash]
|
|
64
|
+
# markdown_string: String - The converted Markdown output
|
|
65
|
+
#
|
|
66
|
+
# metadata_hash: Hash with keys:
|
|
67
|
+
# - :document [Hash] Document-level metadata:
|
|
68
|
+
# - :title [String, nil] From <title> tag
|
|
69
|
+
# - :description [String, nil] From <meta name="description">
|
|
70
|
+
# - :keywords [Array<String>] From <meta name="keywords">
|
|
71
|
+
# - :author [String, nil] From <meta name="author">
|
|
72
|
+
# - :language [String, nil] From lang attribute (e.g., "en")
|
|
73
|
+
# - :text_direction [String, nil] "ltr", "rtl", or "auto"
|
|
74
|
+
# - :canonical_url [String, nil] From <link rel="canonical">
|
|
75
|
+
# - :base_href [String, nil] From <base href="">
|
|
76
|
+
# - :open_graph [Hash<String, String>] Open Graph properties (og:* meta tags)
|
|
77
|
+
# - :twitter_card [Hash<String, String>] Twitter Card properties (twitter:* meta tags)
|
|
78
|
+
# - :meta_tags [Hash<String, String>] Other meta tags
|
|
79
|
+
#
|
|
80
|
+
# - :headers [Array<Hash>] Heading elements:
|
|
81
|
+
# - :level [Integer] 1-6
|
|
82
|
+
# - :text [String] Header text content
|
|
83
|
+
# - :id [String, nil] HTML id attribute
|
|
84
|
+
# - :depth [Integer] Tree nesting depth
|
|
85
|
+
# - :html_offset [Integer] Byte offset in original HTML
|
|
86
|
+
#
|
|
87
|
+
# - :links [Array<Hash>] Hyperlinks:
|
|
88
|
+
# - :href [String] Link URL
|
|
89
|
+
# - :text [String] Link text content
|
|
90
|
+
# - :title [String, nil] Title attribute
|
|
91
|
+
# - :link_type [String] "anchor", "internal", "external", "email", "phone", or "other"
|
|
92
|
+
# - :rel [Array<String>] Rel attribute values
|
|
93
|
+
# - :attributes [Hash<String, String>] Additional HTML attributes
|
|
94
|
+
#
|
|
95
|
+
# - :images [Array<Hash>] Image elements:
|
|
96
|
+
# - :src [String] Image source URL or data URI
|
|
97
|
+
# - :alt [String, nil] Alt text for accessibility
|
|
98
|
+
# - :title [String, nil] Title attribute
|
|
99
|
+
# - :dimensions [Array<Integer>, nil] [width, height] if available
|
|
100
|
+
# - :image_type [String] "data_uri", "external", "relative", or "inline_svg"
|
|
101
|
+
# - :attributes [Hash<String, String>] Additional HTML attributes
|
|
102
|
+
#
|
|
103
|
+
# - :structured_data [Array<Hash>] Structured data blocks:
|
|
104
|
+
# - :data_type [String] "json_ld", "microdata", or "rdfa"
|
|
105
|
+
# - :raw_json [String] Raw JSON content
|
|
106
|
+
# - :schema_type [String, nil] Schema type (e.g., "Article", "Event")
|
|
107
|
+
#
|
|
108
|
+
# @raise [StandardError] If conversion fails or invalid configuration
|
|
109
|
+
#
|
|
110
|
+
# @example Basic usage
|
|
111
|
+
# html = <<~HTML
|
|
112
|
+
# <html lang="en">
|
|
113
|
+
# <head>
|
|
114
|
+
# <title>My Article</title>
|
|
115
|
+
# <meta name="description" content="A great read">
|
|
116
|
+
# </head>
|
|
117
|
+
# <body>
|
|
118
|
+
# <h1 id="intro">Introduction</h1>
|
|
119
|
+
# <p>Visit <a href="https://example.com">our site</a></p>
|
|
120
|
+
# <img src="photo.jpg" alt="Beautiful landscape">
|
|
121
|
+
# </body>
|
|
122
|
+
# </html>
|
|
123
|
+
# HTML
|
|
124
|
+
#
|
|
125
|
+
# markdown, metadata = HtmlToMarkdown.convert_with_metadata(html)
|
|
126
|
+
#
|
|
127
|
+
# puts metadata[:document][:title] # => "My Article"
|
|
128
|
+
# puts metadata[:document][:language] # => "en"
|
|
129
|
+
# puts metadata[:headers].length # => 1
|
|
130
|
+
# puts metadata[:headers][0][:text] # => "Introduction"
|
|
131
|
+
# puts metadata[:links].length # => 1
|
|
132
|
+
# puts metadata[:images].length # => 1
|
|
133
|
+
#
|
|
134
|
+
# @example With selective metadata extraction
|
|
135
|
+
# config = {
|
|
136
|
+
# extract_headers: true,
|
|
137
|
+
# extract_links: true,
|
|
138
|
+
# extract_images: false, # Skip images
|
|
139
|
+
# extract_structured_data: false # Skip structured data
|
|
140
|
+
# }
|
|
141
|
+
#
|
|
142
|
+
# markdown, metadata = HtmlToMarkdown.convert_with_metadata(html, nil, config)
|
|
143
|
+
# puts metadata[:images].empty? # => true (not extracted)
|
|
144
|
+
#
|
|
145
|
+
# @example With conversion options
|
|
146
|
+
# options = {
|
|
147
|
+
# heading_style: "atx", # Use # H1, ## H2 style
|
|
148
|
+
# wrap: true,
|
|
149
|
+
# wrap_width: 80
|
|
150
|
+
# }
|
|
151
|
+
#
|
|
152
|
+
# config = { extract_headers: true }
|
|
153
|
+
#
|
|
154
|
+
# markdown, metadata = HtmlToMarkdown.convert_with_metadata(html, options, config)
|
|
155
|
+
# # Markdown uses ATX-style headings and wraps at 80 characters
|
|
156
|
+
#
|
|
157
|
+
# @see #convert Simple conversion without metadata
|
|
158
|
+
# @see #convert_with_inline_images Extract inline images during conversion
|
|
159
|
+
# @see ConversionOptions Detailed conversion configuration
|
|
160
|
+
def convert_with_metadata(html, options = nil, metadata_config = nil)
|
|
161
|
+
native_convert_with_metadata(html.to_s, options, metadata_config)
|
|
162
|
+
end
|
|
36
163
|
end
|
data/sig/html_to_markdown.rbs
CHANGED
|
@@ -87,6 +87,74 @@ module HtmlToMarkdown
|
|
|
87
87
|
warnings: Array[inline_image_warning]
|
|
88
88
|
}
|
|
89
89
|
|
|
90
|
+
type metadata_config = {
|
|
91
|
+
extract_headers?: bool,
|
|
92
|
+
extract_links?: bool,
|
|
93
|
+
extract_images?: bool,
|
|
94
|
+
extract_structured_data?: bool,
|
|
95
|
+
max_structured_data_size?: Integer
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
type text_direction = "ltr" | "rtl" | "auto" | nil
|
|
99
|
+
|
|
100
|
+
type document_metadata = {
|
|
101
|
+
title: String?,
|
|
102
|
+
description: String?,
|
|
103
|
+
keywords: Array[String],
|
|
104
|
+
author: String?,
|
|
105
|
+
canonical_url: String?,
|
|
106
|
+
base_href: String?,
|
|
107
|
+
language: String?,
|
|
108
|
+
text_direction: text_direction,
|
|
109
|
+
open_graph: Hash[String, String],
|
|
110
|
+
twitter_card: Hash[String, String],
|
|
111
|
+
meta_tags: Hash[String, String]
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
type header_metadata = {
|
|
115
|
+
level: Integer,
|
|
116
|
+
text: String,
|
|
117
|
+
id: String?,
|
|
118
|
+
depth: Integer,
|
|
119
|
+
html_offset: Integer
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
type link_type = "anchor" | "internal" | "external" | "email" | "phone" | "other"
|
|
123
|
+
|
|
124
|
+
type link_metadata = {
|
|
125
|
+
href: String,
|
|
126
|
+
text: String,
|
|
127
|
+
title: String?,
|
|
128
|
+
link_type: link_type,
|
|
129
|
+
rel: Array[String],
|
|
130
|
+
attributes: Hash[String, String]
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
type image_type = "data_uri" | "inline_svg" | "external" | "relative"
|
|
134
|
+
|
|
135
|
+
type image_metadata = {
|
|
136
|
+
src: String,
|
|
137
|
+
alt: String?,
|
|
138
|
+
title: String?,
|
|
139
|
+
dimensions: [Integer, Integer]?,
|
|
140
|
+
image_type: image_type,
|
|
141
|
+
attributes: Hash[String, String]
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
type structured_data = {
|
|
145
|
+
data_type: "json_ld" | "microdata" | "rdfa",
|
|
146
|
+
raw_json: String,
|
|
147
|
+
schema_type: String?
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
type extended_metadata = {
|
|
151
|
+
document: document_metadata,
|
|
152
|
+
headers: Array[header_metadata],
|
|
153
|
+
links: Array[link_metadata],
|
|
154
|
+
images: Array[image_metadata],
|
|
155
|
+
structured_data: Array[structured_data]
|
|
156
|
+
}
|
|
157
|
+
|
|
90
158
|
# Native methods (implemented in Rust via Magnus/rb-sys)
|
|
91
159
|
# These are aliased from the Rust extension and available as both module and instance methods
|
|
92
160
|
private
|
|
@@ -99,6 +167,11 @@ module HtmlToMarkdown
|
|
|
99
167
|
conversion_options? options,
|
|
100
168
|
inline_image_config? image_config
|
|
101
169
|
) -> html_extraction
|
|
170
|
+
def self.native_convert_with_metadata: (
|
|
171
|
+
String html,
|
|
172
|
+
conversion_options? options,
|
|
173
|
+
metadata_config? metadata_config
|
|
174
|
+
) -> [String, extended_metadata]
|
|
102
175
|
|
|
103
176
|
def native_convert: (String html, conversion_options? options) -> String
|
|
104
177
|
def native_options: (conversion_options? options_hash) -> Options
|
|
@@ -108,14 +181,19 @@ module HtmlToMarkdown
|
|
|
108
181
|
conversion_options? options,
|
|
109
182
|
inline_image_config? image_config
|
|
110
183
|
) -> html_extraction
|
|
184
|
+
def native_convert_with_metadata: (
|
|
185
|
+
String html,
|
|
186
|
+
conversion_options? options,
|
|
187
|
+
metadata_config? metadata_config
|
|
188
|
+
) -> [String, extended_metadata]
|
|
111
189
|
|
|
112
190
|
public
|
|
113
191
|
|
|
114
192
|
# Convert HTML to Markdown with optional configuration
|
|
115
|
-
def self.convert: (String html, ?conversion_options
|
|
193
|
+
def self.convert: (String html, ?conversion_options options) -> String
|
|
116
194
|
|
|
117
195
|
# Create a reusable options handle for performance
|
|
118
|
-
def self.options: (?conversion_options
|
|
196
|
+
def self.options: (?conversion_options options_hash) -> Options
|
|
119
197
|
|
|
120
198
|
# Convert HTML using a pre-built options handle
|
|
121
199
|
def self.convert_with_options: (String html, Options options_handle) -> String
|
|
@@ -123,17 +201,54 @@ module HtmlToMarkdown
|
|
|
123
201
|
# Convert HTML with inline image extraction
|
|
124
202
|
def self.convert_with_inline_images: (
|
|
125
203
|
String html,
|
|
126
|
-
?conversion_options
|
|
127
|
-
?inline_image_config
|
|
204
|
+
?conversion_options options,
|
|
205
|
+
?inline_image_config image_config
|
|
128
206
|
) -> html_extraction
|
|
129
207
|
|
|
208
|
+
# Convert HTML to Markdown with metadata extraction
|
|
209
|
+
#
|
|
210
|
+
# Extracts comprehensive metadata (headers, links, images, structured data) during conversion.
|
|
211
|
+
#
|
|
212
|
+
# Args:
|
|
213
|
+
# html: HTML string to convert
|
|
214
|
+
# options: Optional conversion configuration
|
|
215
|
+
# metadata_config: Optional metadata extraction configuration
|
|
216
|
+
#
|
|
217
|
+
# Returns:
|
|
218
|
+
# Array containing:
|
|
219
|
+
# - [0] markdown: String - Converted markdown output
|
|
220
|
+
# - [1] metadata: Hash - Extracted metadata with document, headers, links, images, structured_data
|
|
221
|
+
#
|
|
222
|
+
# The metadata hash contains:
|
|
223
|
+
# - document: Document-level metadata (title, description, language, etc.)
|
|
224
|
+
# - headers: List of header elements with hierarchy
|
|
225
|
+
# - links: List of extracted hyperlinks with classification
|
|
226
|
+
# - images: List of extracted images with metadata
|
|
227
|
+
# - structured_data: List of JSON-LD, Microdata, or RDFa blocks
|
|
228
|
+
#
|
|
229
|
+
# Example:
|
|
230
|
+
# html = '<html lang="en"><head><title>Test</title></head><body><h1>Hello</h1></body></html>'
|
|
231
|
+
# markdown, metadata = HtmlToMarkdown.convert_with_metadata(html)
|
|
232
|
+
# puts "Title: #{metadata[:document][:title]}"
|
|
233
|
+
# puts "Headers: #{metadata[:headers].length}"
|
|
234
|
+
def self.convert_with_metadata: (
|
|
235
|
+
String html,
|
|
236
|
+
?conversion_options options,
|
|
237
|
+
?metadata_config metadata_config
|
|
238
|
+
) -> [String, extended_metadata]
|
|
239
|
+
|
|
130
240
|
# Instance method versions (created by module_function)
|
|
131
|
-
def convert: (String html, ?conversion_options
|
|
132
|
-
def options: (?conversion_options
|
|
241
|
+
def convert: (String html, ?conversion_options options) -> String
|
|
242
|
+
def options: (?conversion_options options_hash) -> Options
|
|
133
243
|
def convert_with_options: (String html, Options options_handle) -> String
|
|
134
244
|
def convert_with_inline_images: (
|
|
135
245
|
String html,
|
|
136
|
-
?conversion_options
|
|
137
|
-
?inline_image_config
|
|
246
|
+
?conversion_options options,
|
|
247
|
+
?inline_image_config image_config
|
|
138
248
|
) -> html_extraction
|
|
249
|
+
def convert_with_metadata: (
|
|
250
|
+
String html,
|
|
251
|
+
?conversion_options options,
|
|
252
|
+
?metadata_config metadata_config
|
|
253
|
+
) -> [String, extended_metadata]
|
|
139
254
|
end
|
|
@@ -0,0 +1,440 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'spec_helper'
|
|
4
|
+
|
|
5
|
+
RSpec.describe HtmlToMarkdown do
|
|
6
|
+
describe '.convert_with_metadata' do
|
|
7
|
+
it 'returns array with markdown and metadata' do
|
|
8
|
+
html = '<html><head><title>Test</title></head><body><p>Content</p></body></html>'
|
|
9
|
+
result = described_class.convert_with_metadata(html)
|
|
10
|
+
|
|
11
|
+
expect(result).to be_an(Array)
|
|
12
|
+
expect(result.length).to eq(2)
|
|
13
|
+
expect(result[0]).to be_a(String)
|
|
14
|
+
expect(result[1]).to be_a(Hash)
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
context 'when extracting document metadata' do
|
|
18
|
+
it 'extracts title' do
|
|
19
|
+
html = '<html><head><title>My Page Title</title></head><body><p>Content</p></body></html>'
|
|
20
|
+
_, metadata = described_class.convert_with_metadata(html)
|
|
21
|
+
|
|
22
|
+
expect(metadata[:document][:title]).to eq('My Page Title')
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
it 'extracts description' do
|
|
26
|
+
html = <<~HTML
|
|
27
|
+
<html>
|
|
28
|
+
<head><meta name="description" content="Page description"></head>
|
|
29
|
+
<body><p>Content</p></body>
|
|
30
|
+
</html>
|
|
31
|
+
HTML
|
|
32
|
+
_, metadata = described_class.convert_with_metadata(html)
|
|
33
|
+
|
|
34
|
+
expect(metadata[:document][:description]).to eq('Page description')
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
it 'extracts keywords' do
|
|
38
|
+
html = <<~HTML
|
|
39
|
+
<html>
|
|
40
|
+
<head><meta name="keywords" content="keyword1, keyword2, keyword3"></head>
|
|
41
|
+
<body><p>Content</p></body>
|
|
42
|
+
</html>
|
|
43
|
+
HTML
|
|
44
|
+
_, metadata = described_class.convert_with_metadata(html)
|
|
45
|
+
|
|
46
|
+
expect(metadata[:document][:keywords]).to include('keyword1', 'keyword2', 'keyword3')
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
it 'extracts author' do
|
|
50
|
+
html = '<html><head><meta name="author" content="John Doe"></head><body><p>Content</p></body></html>'
|
|
51
|
+
_, metadata = described_class.convert_with_metadata(html)
|
|
52
|
+
|
|
53
|
+
expect(metadata[:document][:author]).to eq('John Doe')
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
it 'extracts base href' do
|
|
57
|
+
html = '<html><head><base href="https://example.com/"></head><body><p>Content</p></body></html>'
|
|
58
|
+
_, metadata = described_class.convert_with_metadata(html)
|
|
59
|
+
|
|
60
|
+
expect(metadata[:document][:base_href]).to eq('https://example.com/')
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
it 'extracts canonical URL' do
|
|
64
|
+
html = '<html><head><link rel="canonical" href="https://example.com/page"></head><body><p>Content</p></body></html>'
|
|
65
|
+
_, metadata = described_class.convert_with_metadata(html)
|
|
66
|
+
|
|
67
|
+
expect(metadata[:document][:canonical_url]).to eq('https://example.com/page')
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
it 'extracts language' do
|
|
71
|
+
html = '<html lang="en"><head></head><body><p>Content</p></body></html>'
|
|
72
|
+
_, metadata = described_class.convert_with_metadata(html)
|
|
73
|
+
|
|
74
|
+
expect(metadata[:document][:language]).to eq('en')
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
it 'extracts text direction' do
|
|
78
|
+
html = '<html dir="ltr"><head></head><body><p>Content</p></body></html>'
|
|
79
|
+
_, metadata = described_class.convert_with_metadata(html)
|
|
80
|
+
|
|
81
|
+
expect(metadata[:document][:text_direction]).to eq('ltr')
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
it 'extracts open graph metadata' do
|
|
85
|
+
html = <<~HTML
|
|
86
|
+
<html>
|
|
87
|
+
<head>
|
|
88
|
+
<meta property="og:title" content="OG Title">
|
|
89
|
+
<meta property="og:description" content="OG Description">
|
|
90
|
+
<meta property="og:image" content="https://example.com/image.jpg">
|
|
91
|
+
</head>
|
|
92
|
+
<body><p>Content</p></body>
|
|
93
|
+
</html>
|
|
94
|
+
HTML
|
|
95
|
+
_, metadata = described_class.convert_with_metadata(html)
|
|
96
|
+
|
|
97
|
+
expect(metadata[:document][:open_graph]).to include(
|
|
98
|
+
'title' => 'OG Title',
|
|
99
|
+
'description' => 'OG Description',
|
|
100
|
+
'image' => 'https://example.com/image.jpg'
|
|
101
|
+
)
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
it 'extracts twitter card metadata' do
|
|
105
|
+
html = <<~HTML
|
|
106
|
+
<html>
|
|
107
|
+
<head>
|
|
108
|
+
<meta name="twitter:card" content="summary_large_image">
|
|
109
|
+
<meta name="twitter:title" content="Twitter Title">
|
|
110
|
+
</head>
|
|
111
|
+
<body><p>Content</p></body>
|
|
112
|
+
</html>
|
|
113
|
+
HTML
|
|
114
|
+
_, metadata = described_class.convert_with_metadata(html)
|
|
115
|
+
|
|
116
|
+
expect(metadata[:document][:twitter_card]).to include(
|
|
117
|
+
'card' => 'summary_large_image',
|
|
118
|
+
'title' => 'Twitter Title'
|
|
119
|
+
)
|
|
120
|
+
end
|
|
121
|
+
|
|
122
|
+
it 'returns empty arrays and hashes for missing metadata' do
|
|
123
|
+
html = '<p>Content</p>'
|
|
124
|
+
_, metadata = described_class.convert_with_metadata(html)
|
|
125
|
+
|
|
126
|
+
expect(metadata[:document][:title]).to be_nil
|
|
127
|
+
expect(metadata[:document][:description]).to be_nil
|
|
128
|
+
expect(metadata[:document][:keywords]).to eq([])
|
|
129
|
+
expect(metadata[:document][:open_graph]).to eq({})
|
|
130
|
+
expect(metadata[:document][:twitter_card]).to eq({})
|
|
131
|
+
expect(metadata[:document][:meta_tags]).to eq({})
|
|
132
|
+
end
|
|
133
|
+
end
|
|
134
|
+
|
|
135
|
+
context 'when extracting header metadata' do
|
|
136
|
+
it 'extracts headers with hierarchy' do
|
|
137
|
+
html = <<~HTML
|
|
138
|
+
<html>
|
|
139
|
+
<body>
|
|
140
|
+
<h1>Main Title</h1>
|
|
141
|
+
<h2>Section</h2>
|
|
142
|
+
<h3>Subsection</h3>
|
|
143
|
+
</body>
|
|
144
|
+
</html>
|
|
145
|
+
HTML
|
|
146
|
+
_, metadata = described_class.convert_with_metadata(html)
|
|
147
|
+
|
|
148
|
+
expect(metadata[:headers].length).to eq(3)
|
|
149
|
+
expect(metadata[:headers][0][:level]).to eq(1)
|
|
150
|
+
expect(metadata[:headers][0][:text]).to eq('Main Title')
|
|
151
|
+
expect(metadata[:headers][1][:level]).to eq(2)
|
|
152
|
+
expect(metadata[:headers][1][:text]).to eq('Section')
|
|
153
|
+
expect(metadata[:headers][2][:level]).to eq(3)
|
|
154
|
+
expect(metadata[:headers][2][:text]).to eq('Subsection')
|
|
155
|
+
end
|
|
156
|
+
|
|
157
|
+
it 'includes header id' do
|
|
158
|
+
html = '<html><body><h1 id="main-title">Title</h1></body></html>'
|
|
159
|
+
_, metadata = described_class.convert_with_metadata(html)
|
|
160
|
+
|
|
161
|
+
expect(metadata[:headers][0][:id]).to eq('main-title')
|
|
162
|
+
end
|
|
163
|
+
|
|
164
|
+
it 'includes depth and html_offset' do
|
|
165
|
+
html = '<html><body><h1>Title</h1></body></html>'
|
|
166
|
+
_, metadata = described_class.convert_with_metadata(html)
|
|
167
|
+
|
|
168
|
+
header = metadata[:headers][0]
|
|
169
|
+
expect(header).to include(:depth, :html_offset)
|
|
170
|
+
expect(header[:depth]).to be_a(Integer)
|
|
171
|
+
expect(header[:html_offset]).to be_a(Integer)
|
|
172
|
+
end
|
|
173
|
+
end
|
|
174
|
+
|
|
175
|
+
context 'when extracting link metadata' do
|
|
176
|
+
it 'extracts links with classification' do
|
|
177
|
+
html = <<~HTML
|
|
178
|
+
<html>
|
|
179
|
+
<body>
|
|
180
|
+
<a href="#section">Anchor</a>
|
|
181
|
+
<a href="https://example.com">External</a>
|
|
182
|
+
<a href="/page">Internal</a>
|
|
183
|
+
<a href="mailto:test@example.com">Email</a>
|
|
184
|
+
<a href="tel:+1234567890">Phone</a>
|
|
185
|
+
</body>
|
|
186
|
+
</html>
|
|
187
|
+
HTML
|
|
188
|
+
_, metadata = described_class.convert_with_metadata(html)
|
|
189
|
+
|
|
190
|
+
links = metadata[:links]
|
|
191
|
+
expect(links.length).to eq(5)
|
|
192
|
+
|
|
193
|
+
expect(links[0][:link_type]).to eq('anchor')
|
|
194
|
+
expect(links[1][:link_type]).to eq('external')
|
|
195
|
+
expect(links[2][:link_type]).to eq('internal')
|
|
196
|
+
expect(links[3][:link_type]).to eq('email')
|
|
197
|
+
expect(links[4][:link_type]).to eq('phone')
|
|
198
|
+
end
|
|
199
|
+
|
|
200
|
+
it 'includes link text and href' do
|
|
201
|
+
html = '<html><body><a href="https://example.com">Click here</a></body></html>'
|
|
202
|
+
_, metadata = described_class.convert_with_metadata(html)
|
|
203
|
+
|
|
204
|
+
link = metadata[:links][0]
|
|
205
|
+
expect(link[:href]).to eq('https://example.com')
|
|
206
|
+
expect(link[:text]).to eq('Click here')
|
|
207
|
+
end
|
|
208
|
+
|
|
209
|
+
it 'includes link title attribute' do
|
|
210
|
+
html = '<html><body><a href="https://example.com" title="Example Site">Link</a></body></html>'
|
|
211
|
+
_, metadata = described_class.convert_with_metadata(html)
|
|
212
|
+
|
|
213
|
+
link = metadata[:links][0]
|
|
214
|
+
expect(link[:title]).to eq('Example Site')
|
|
215
|
+
end
|
|
216
|
+
|
|
217
|
+
it 'includes link rel attributes' do
|
|
218
|
+
html = '<html><body><a href="https://example.com" rel="nofollow external">Link</a></body></html>'
|
|
219
|
+
_, metadata = described_class.convert_with_metadata(html)
|
|
220
|
+
|
|
221
|
+
link = metadata[:links][0]
|
|
222
|
+
expect(link[:rel]).to include('nofollow', 'external')
|
|
223
|
+
end
|
|
224
|
+
|
|
225
|
+
it 'includes link attributes' do
|
|
226
|
+
html = '<html><body><a href="https://example.com" data-custom="value">Link</a></body></html>'
|
|
227
|
+
_, metadata = described_class.convert_with_metadata(html)
|
|
228
|
+
|
|
229
|
+
link = metadata[:links][0]
|
|
230
|
+
expect(link[:attributes]).to include('data-custom' => 'value')
|
|
231
|
+
end
|
|
232
|
+
end
|
|
233
|
+
|
|
234
|
+
context 'when extracting image metadata' do
|
|
235
|
+
it 'extracts images with source type' do
|
|
236
|
+
html = <<~HTML
|
|
237
|
+
<html>
|
|
238
|
+
<body>
|
|
239
|
+
<img src="https://example.com/image.jpg" alt="External">
|
|
240
|
+
<img src="/images/local.jpg" alt="Relative">
|
|
241
|
+
<img src="data:image/png;base64,..." alt="Data URI">
|
|
242
|
+
</body>
|
|
243
|
+
</html>
|
|
244
|
+
HTML
|
|
245
|
+
_, metadata = described_class.convert_with_metadata(html)
|
|
246
|
+
|
|
247
|
+
images = metadata[:images]
|
|
248
|
+
expect(images.length).to eq(3)
|
|
249
|
+
|
|
250
|
+
expect(images[0][:image_type]).to eq('external')
|
|
251
|
+
expect(images[1][:image_type]).to eq('relative')
|
|
252
|
+
expect(images[2][:image_type]).to eq('data_uri')
|
|
253
|
+
end
|
|
254
|
+
|
|
255
|
+
it 'includes image alt and title' do
|
|
256
|
+
html = '<html><body><img src="image.jpg" alt="Alt text" title="Image title"></body></html>'
|
|
257
|
+
_, metadata = described_class.convert_with_metadata(html)
|
|
258
|
+
|
|
259
|
+
image = metadata[:images][0]
|
|
260
|
+
expect(image[:alt]).to eq('Alt text')
|
|
261
|
+
expect(image[:title]).to eq('Image title')
|
|
262
|
+
end
|
|
263
|
+
|
|
264
|
+
it 'includes image dimensions' do
|
|
265
|
+
html = '<html><body><img src="image.jpg" width="800" height="600"></body></html>'
|
|
266
|
+
_, metadata = described_class.convert_with_metadata(html)
|
|
267
|
+
|
|
268
|
+
image = metadata[:images][0]
|
|
269
|
+
expect(image[:dimensions]).to be_an(Array)
|
|
270
|
+
expect(image[:dimensions].length).to eq(2)
|
|
271
|
+
end
|
|
272
|
+
|
|
273
|
+
it 'handles missing image attributes' do
|
|
274
|
+
html = '<html><body><img src="image.jpg"></body></html>'
|
|
275
|
+
_, metadata = described_class.convert_with_metadata(html)
|
|
276
|
+
|
|
277
|
+
image = metadata[:images][0]
|
|
278
|
+
expect(image[:alt]).to be_nil
|
|
279
|
+
expect(image[:title]).to be_nil
|
|
280
|
+
end
|
|
281
|
+
end
|
|
282
|
+
|
|
283
|
+
context 'with metadata configuration flags' do
|
|
284
|
+
it 'respects extract_headers flag' do
|
|
285
|
+
html = '<html><body><h1>Title</h1><p>Content</p></body></html>'
|
|
286
|
+
config = { extract_headers: false }
|
|
287
|
+
_, metadata = described_class.convert_with_metadata(html, nil, config)
|
|
288
|
+
|
|
289
|
+
expect(metadata[:headers]).to eq([])
|
|
290
|
+
end
|
|
291
|
+
|
|
292
|
+
it 'respects extract_links flag' do
|
|
293
|
+
html = '<html><body><a href="https://example.com">Link</a></body></html>'
|
|
294
|
+
config = { extract_links: false }
|
|
295
|
+
_, metadata = described_class.convert_with_metadata(html, nil, config)
|
|
296
|
+
|
|
297
|
+
expect(metadata[:links]).to eq([])
|
|
298
|
+
end
|
|
299
|
+
|
|
300
|
+
it 'respects extract_images flag' do
|
|
301
|
+
html = '<html><body><img src="image.jpg" alt="test"></body></html>'
|
|
302
|
+
config = { extract_images: false }
|
|
303
|
+
_, metadata = described_class.convert_with_metadata(html, nil, config)
|
|
304
|
+
|
|
305
|
+
expect(metadata[:images]).to eq([])
|
|
306
|
+
end
|
|
307
|
+
|
|
308
|
+
it 'respects extract_structured_data flag' do
|
|
309
|
+
html = '<html><body><script type="application/ld+json">{"@type":"Article"}</script></body></html>'
|
|
310
|
+
config = { extract_structured_data: false }
|
|
311
|
+
_, metadata = described_class.convert_with_metadata(html, nil, config)
|
|
312
|
+
|
|
313
|
+
expect(metadata[:structured_data]).to eq([])
|
|
314
|
+
end
|
|
315
|
+
end
|
|
316
|
+
|
|
317
|
+
context 'with conversion options and metadata config' do
|
|
318
|
+
it 'accepts both conversion options and metadata config' do
|
|
319
|
+
html = '<html><head><title>Test</title></head><body><h1>Heading</h1></body></html>'
|
|
320
|
+
conv_opts = { heading_style: :atx_closed }
|
|
321
|
+
meta_opts = { extract_headers: true }
|
|
322
|
+
|
|
323
|
+
markdown, metadata = described_class.convert_with_metadata(html, conv_opts, meta_opts)
|
|
324
|
+
|
|
325
|
+
expect(markdown).to include('# Heading #')
|
|
326
|
+
expect(metadata[:headers].length).to eq(1)
|
|
327
|
+
end
|
|
328
|
+
|
|
329
|
+
it 'works with nil options' do
|
|
330
|
+
html = '<html><head><title>Test</title></head><body><p>Content</p></body></html>'
|
|
331
|
+
result = described_class.convert_with_metadata(html, nil, nil)
|
|
332
|
+
|
|
333
|
+
expect(result).to be_an(Array)
|
|
334
|
+
expect(result.length).to eq(2)
|
|
335
|
+
end
|
|
336
|
+
end
|
|
337
|
+
|
|
338
|
+
context 'when extracting structured data' do
|
|
339
|
+
it 'extracts JSON-LD blocks' do
|
|
340
|
+
html = <<~HTML
|
|
341
|
+
<html>
|
|
342
|
+
<head>
|
|
343
|
+
<script type="application/ld+json">
|
|
344
|
+
{"@context":"https://schema.org","@type":"Article","headline":"Test"}
|
|
345
|
+
</script>
|
|
346
|
+
</head>
|
|
347
|
+
<body><p>Content</p></body>
|
|
348
|
+
</html>
|
|
349
|
+
HTML
|
|
350
|
+
_, metadata = described_class.convert_with_metadata(html)
|
|
351
|
+
|
|
352
|
+
# Structured data extraction may vary by implementation
|
|
353
|
+
expect(metadata[:structured_data]).to be_an(Array)
|
|
354
|
+
end
|
|
355
|
+
end
|
|
356
|
+
|
|
357
|
+
context 'with edge cases' do
|
|
358
|
+
it 'handles empty HTML' do
|
|
359
|
+
html = ''
|
|
360
|
+
markdown, metadata = described_class.convert_with_metadata(html)
|
|
361
|
+
|
|
362
|
+
expect(markdown).to be_a(String)
|
|
363
|
+
expect(metadata).to be_a(Hash)
|
|
364
|
+
end
|
|
365
|
+
|
|
366
|
+
it 'handles malformed HTML' do
|
|
367
|
+
html = '<html><head><title>Unclosed'
|
|
368
|
+
markdown, metadata = described_class.convert_with_metadata(html)
|
|
369
|
+
|
|
370
|
+
expect(markdown).to be_a(String)
|
|
371
|
+
expect(metadata).to be_a(Hash)
|
|
372
|
+
end
|
|
373
|
+
|
|
374
|
+
it 'handles special characters in metadata' do
|
|
375
|
+
html = '<html><head><title>Title with "quotes" & <brackets></title></head><body><p>Content</p></body></html>'
|
|
376
|
+
_, metadata = described_class.convert_with_metadata(html)
|
|
377
|
+
|
|
378
|
+
expect(metadata[:document][:title]).to be_a(String)
|
|
379
|
+
end
|
|
380
|
+
|
|
381
|
+
it 'handles whitespace in metadata' do
|
|
382
|
+
html = '<html><head><title> Title with spaces </title></head><body><p>Content</p></body></html>'
|
|
383
|
+
_, metadata = described_class.convert_with_metadata(html)
|
|
384
|
+
|
|
385
|
+
# Whitespace may be normalized
|
|
386
|
+
expect(metadata[:document][:title]).to match(/Title.*spaces/)
|
|
387
|
+
end
|
|
388
|
+
|
|
389
|
+
it 'handles multiple values for same metadata key' do
|
|
390
|
+
html = <<~HTML
|
|
391
|
+
<html>
|
|
392
|
+
<head>
|
|
393
|
+
<meta name="author" content="Author 1">
|
|
394
|
+
<meta name="author" content="Author 2">
|
|
395
|
+
</head>
|
|
396
|
+
<body><p>Content</p></body>
|
|
397
|
+
</html>
|
|
398
|
+
HTML
|
|
399
|
+
_, metadata = described_class.convert_with_metadata(html)
|
|
400
|
+
|
|
401
|
+
# Last value typically wins, but implementation may vary
|
|
402
|
+
expect(metadata[:document][:author]).to be_a(String)
|
|
403
|
+
end
|
|
404
|
+
end
|
|
405
|
+
|
|
406
|
+
context 'when returning value structure' do
|
|
407
|
+
it 'returns proper metadata hash structure' do
|
|
408
|
+
html = <<~HTML
|
|
409
|
+
<html>
|
|
410
|
+
<head><title>Test</title><base href="https://example.com"></head>
|
|
411
|
+
<body><h1>H1</h1><a href="link">Link</a><img src="img.jpg"></body>
|
|
412
|
+
</html>
|
|
413
|
+
HTML
|
|
414
|
+
_, metadata = described_class.convert_with_metadata(html)
|
|
415
|
+
|
|
416
|
+
expect(metadata).to include(
|
|
417
|
+
:document,
|
|
418
|
+
:headers,
|
|
419
|
+
:links,
|
|
420
|
+
:images,
|
|
421
|
+
:structured_data
|
|
422
|
+
)
|
|
423
|
+
|
|
424
|
+
expect(metadata[:document]).to include(
|
|
425
|
+
:title,
|
|
426
|
+
:description,
|
|
427
|
+
:keywords,
|
|
428
|
+
:author,
|
|
429
|
+
:canonical_url,
|
|
430
|
+
:base_href,
|
|
431
|
+
:language,
|
|
432
|
+
:text_direction,
|
|
433
|
+
:open_graph,
|
|
434
|
+
:twitter_card,
|
|
435
|
+
:meta_tags
|
|
436
|
+
)
|
|
437
|
+
end
|
|
438
|
+
end
|
|
439
|
+
end
|
|
440
|
+
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html-to-markdown
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.
|
|
4
|
+
version: 2.14.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2025-12-
|
|
11
|
+
date: 2025-12-12 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
@@ -67,6 +67,7 @@ files:
|
|
|
67
67
|
- sig/open3.rbs
|
|
68
68
|
- spec/cli_proxy_spec.rb
|
|
69
69
|
- spec/convert_spec.rb
|
|
70
|
+
- spec/metadata_extraction_spec.rb
|
|
70
71
|
- spec/spec_helper.rb
|
|
71
72
|
homepage: https://github.com/Goldziher/html-to-markdown
|
|
72
73
|
licenses:
|