html-to-markdown 2.12.1 → 2.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3adb662037df2da8df76e80cc7ee63c0ab325a64b8a70d6238c5d2c6b1918e4c
4
- data.tar.gz: fdb9ee8c561d21fc22df149d852455447d4991e67a8166b575c8eaed1f356c3b
3
+ metadata.gz: dd0a9378e87b5c10c4500389c1c5e9c25a3b7b7ea1a930839b20ec0c9e00745b
4
+ data.tar.gz: 81b1626a43403390709c9c1fdecc377648ef8cda554fafafda6c0cfa2f841bcb
5
5
  SHA512:
6
- metadata.gz: 1b6b756de803db35705031ccbc0611cdaf86ff2446a01be6713d58684c422562628b735495689f0e81dc77ebaf39d8ce1bb04894013fdb1d9925bcffb14cb6e8
7
- data.tar.gz: f93cc1cb4a3caf2097aa8caa685c55b9eb9b48821ba49dbe2622ab2b0967954364ab0709b8e5d7bb5afa032de5e5c10c6bf0726d6b1c794e571f6784f6aceb79
6
+ metadata.gz: 30e15347d844c106f7e0538f499e9162c3d14903b7af3dff932649150e050b68e04bf4775a20377aaacc8f7c3c290a9128cb290047f9247f251a6214b793e043
7
+ data.tar.gz: d966ff4c9395461196e6f81f087ba7be193b9ea2f071e74053f9ae7feed10c5ffeca73f54cd01c53ba9ba8548634b3820dfa70712b040f5ce2d7d3ceefdff5b4
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- html-to-markdown (2.12.1)
4
+ html-to-markdown (2.14.1)
5
5
  rb_sys (>= 0.9, < 1.0)
6
6
 
7
7
  GEM
data/README.md CHANGED
@@ -184,6 +184,237 @@ result.inline_images.each do |img|
184
184
  end
185
185
  ```
186
186
 
187
+ ### Metadata Extraction
188
+
189
+ Extract comprehensive metadata alongside Markdown conversion: document properties (title, description, author, language), social metadata (Open Graph, Twitter cards), heading hierarchy, link analysis (type classification, rel attributes), image metadata (dimensions, type detection), and structured data (JSON-LD, Microdata, RDFa).
190
+
191
+ #### Basic Usage
192
+
193
+ ```ruby
194
+ require 'html_to_markdown'
195
+
196
+ html = '<html lang="en"><head><title>Test</title></head><body><h1>Hello</h1></body></html>'
197
+ markdown, metadata = HtmlToMarkdown.convert_with_metadata(html)
198
+
199
+ puts markdown
200
+ puts metadata[:document][:title] # "Test"
201
+ puts metadata[:headers].length # 1
202
+ ```
203
+
204
+ #### With Conversion Options
205
+
206
+ ```ruby
207
+ conv_opts = { heading_style: :atx_closed }
208
+ metadata_opts = { extract_headers: true, extract_links: false }
209
+
210
+ markdown, metadata = HtmlToMarkdown.convert_with_metadata(
211
+ html,
212
+ conv_opts,
213
+ metadata_opts
214
+ )
215
+ ```
216
+
217
+ #### Full Example
218
+
219
+ ```ruby
220
+ require 'html_to_markdown'
221
+
222
+ html = <<~HTML
223
+ <html>
224
+ <head>
225
+ <title>Example</title>
226
+ <meta name="description" content="Demo page">
227
+ <link rel="canonical" href="https://example.com/page">
228
+ <meta property="og:image" content="https://example.com/og.jpg">
229
+ <meta name="twitter:card" content="summary_large_image">
230
+ </head>
231
+ <body>
232
+ <h1 id="welcome">Welcome</h1>
233
+ <a href="https://example.com" rel="nofollow external">Example link</a>
234
+ <img src="https://example.com/image.jpg" alt="Hero" width="640" height="480">
235
+ <script type="application/ld+json">
236
+ {"@context": "https://schema.org", "@type": "Article"}
237
+ </script>
238
+ </body>
239
+ </html>
240
+ HTML
241
+
242
+ markdown, metadata = HtmlToMarkdown.convert_with_metadata(
243
+ html,
244
+ { heading_style: :atx },
245
+ { extract_links: true, extract_images: true, extract_headers: true, extract_structured_data: true }
246
+ )
247
+
248
+ puts markdown
249
+ puts metadata[:document][:title] # "Example"
250
+ puts metadata[:document][:description] # "Demo page"
251
+ puts metadata[:document][:open_graph] # {"og:image" => "https://example.com/og.jpg"}
252
+ puts metadata[:links].first[:rel] # ["nofollow", "external"]
253
+ puts metadata[:images].first[:dimensions] # [640, 480]
254
+ puts metadata[:headers].first[:id] # "welcome"
255
+ ```
256
+
257
+ #### Return Value Structure
258
+
259
+ Returns a 2-element array: `[markdown_string, metadata_hash]`
260
+
261
+ The metadata hash contains:
262
+
263
+ ```ruby
264
+ {
265
+ document: {
266
+ title: String?,
267
+ description: String?,
268
+ keywords: Array[String],
269
+ author: String?,
270
+ canonical_url: String?,
271
+ base_href: String?,
272
+ language: String?,
273
+ text_direction: "ltr" | "rtl" | "auto" | nil,
274
+ open_graph: Hash[String, String],
275
+ twitter_card: Hash[String, String],
276
+ meta_tags: Hash[String, String]
277
+ },
278
+ headers: [
279
+ {
280
+ level: Integer, # 1-6
281
+ text: String,
282
+ id: String?,
283
+ depth: Integer,
284
+ html_offset: Integer
285
+ }
286
+ ],
287
+ links: [
288
+ {
289
+ href: String,
290
+ text: String,
291
+ title: String?,
292
+ link_type: "anchor" | "internal" | "external" | "email" | "phone" | "other",
293
+ rel: Array[String],
294
+ attributes: Hash[String, String]
295
+ }
296
+ ],
297
+ images: [
298
+ {
299
+ src: String,
300
+ alt: String?,
301
+ title: String?,
302
+ dimensions: [Integer, Integer]?,
303
+ image_type: "data_uri" | "inline_svg" | "external" | "relative",
304
+ attributes: Hash[String, String]
305
+ }
306
+ ],
307
+ structured_data: [
308
+ {
309
+ data_type: "json_ld" | "microdata" | "rdfa",
310
+ raw_json: String,
311
+ schema_type: String?
312
+ }
313
+ ]
314
+ }
315
+ ```
316
+
317
+ #### Metadata Configuration
318
+
319
+ Pass a hash with the following options to control which metadata types are extracted:
320
+
321
+ ```ruby
322
+ config = {
323
+ extract_headers: true, # Extract h1-h6 elements (default: true)
324
+ extract_links: true, # Extract <a> elements (default: true)
325
+ extract_images: true, # Extract <img> elements (default: true)
326
+ extract_structured_data: true, # Extract JSON-LD/Microdata/RDFa (default: true)
327
+ max_structured_data_size: 1_000_000 # Max bytes for structured data (default: 1MB)
328
+ }
329
+
330
+ markdown, metadata = HtmlToMarkdown.convert_with_metadata(html, nil, config)
331
+ ```
332
+
333
+ #### Features
334
+
335
+ The Ruby binding provides comprehensive metadata extraction during HTML-to-Markdown conversion:
336
+
337
+ - **Document Metadata**: title, description, keywords, author, canonical URL, base href, language, text direction
338
+ - **Open Graph & Twitter Card**: social media metadata extraction
339
+ - **Headers**: h1-h6 extraction with hierarchy, ids, and depth tracking
340
+ - **Links**: hyperlink extraction with type classification (anchor, internal, external, email, phone, other)
341
+ - **Images**: image extraction with source type (data_uri, inline_svg, external, relative) and dimensions
342
+ - **Structured Data**: JSON-LD, Microdata, and RDFa extraction
343
+
344
+ #### Type Safety with RBS
345
+
346
+ All types are defined in RBS format in `sig/html_to_markdown.rbs`:
347
+
348
+ - `document_metadata` - Document-level metadata structure
349
+ - `header_metadata` - Individual header element
350
+ - `link_metadata` - Individual link element
351
+ - `image_metadata` - Individual image element
352
+ - `structured_data` - Structured data block
353
+ - `extended_metadata` - Complete metadata extraction result
354
+
355
+ The gem uses strict RBS type checking with Steep for full type safety:
356
+
357
+ ```bash
358
+ steep check
359
+ ```
360
+
361
+ #### Implementation Architecture
362
+
363
+ The Rust implementation uses a single-pass collector pattern for efficient metadata extraction:
364
+
365
+ 1. **No duplication**: Core logic lives in Rust (`crates/html-to-markdown/src/metadata.rs`)
366
+ 2. **Minimal wrapper layer**: Ruby binding in `crates/html-to-markdown-rb/src/lib.rs`
367
+ 3. **Type translation**: Rust types → Ruby hashes with proper Magnus bindings
368
+ 4. **Hash conversion**: Uses Magnus `RHash` API for efficient Ruby hash construction
369
+
370
+ The metadata feature is gated by a Cargo feature in `Cargo.toml`:
371
+
372
+ ```toml
373
+ [features]
374
+ metadata = ["html-to-markdown-rs/metadata"]
375
+ ```
376
+
377
+ This ensures:
378
+ - Zero overhead when metadata is not needed
379
+ - Clean integration with feature flag detection
380
+ - Consistency with the Python binding implementation
381
+
382
+ #### Language Parity
383
+
384
+ The Ruby binding implements the same API as the Python binding:
385
+
386
+ - Same method signature: `convert_with_metadata(html, options, metadata_config)`
387
+ - Same return type: `[markdown, metadata_dict]`
388
+ - Same metadata structures and field names
389
+ - Same enum values (link_type, image_type, data_type, text_direction)
390
+
391
+ This parity enables seamless migration and multi-language development.
392
+
393
+ #### Performance
394
+
395
+ Single-pass collection during tree traversal:
396
+ - No additional parsing passes
397
+ - Minimal memory overhead
398
+ - Configurable extraction granularity
399
+ - Built-in size limits for safety
400
+
401
+ #### Testing
402
+
403
+ Comprehensive RSpec test suite in `spec/metadata_extraction_spec.rb`:
404
+
405
+ ```bash
406
+ cd packages/ruby
407
+ bundle exec rake compile -- --release --features metadata
408
+ bundle exec rspec spec/metadata_extraction_spec.rb
409
+ ```
410
+
411
+ Tests cover:
412
+ - All metadata types extraction
413
+ - Configuration flags
414
+ - Edge cases (empty HTML, malformed input, special characters)
415
+ - Return value structure validation
416
+ - Integration with conversion options
417
+
187
418
  ## CLI
188
419
 
189
420
  The gem bundles a small proxy for the Rust CLI binary. Use it when you need parity with the standalone `html-to-markdown` executable.
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rb"
3
- version = "2.12.1"
3
+ version = "2.14.1"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
6
6
  license = "MIT"
@@ -18,10 +18,11 @@ name = "html_to_markdown_rb"
18
18
  crate-type = ["cdylib", "rlib"]
19
19
 
20
20
  [features]
21
- default = []
21
+ default = ["metadata"]
22
+ metadata = ["html-to-markdown-rs/metadata"]
22
23
 
23
24
  [dependencies]
24
- html-to-markdown-rs = { version = "2.12.1", features = ["inline-images"] }
25
+ html-to-markdown-rs = { version = "2.14.1", features = ["inline-images"] }
25
26
  magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = ["rb-sys"] }
26
27
 
27
28
  [dev-dependencies]
@@ -4,6 +4,17 @@ use html_to_markdown_rs::{
4
4
  PreprocessingPreset, WhitespaceMode, convert as convert_inner,
5
5
  convert_with_inline_images as convert_with_inline_images_inner, error::ConversionError, safety::guard_panic,
6
6
  };
7
+
8
+ #[cfg(feature = "metadata")]
9
+ use html_to_markdown_rs::convert_with_metadata as convert_with_metadata_inner;
10
+ #[cfg(feature = "metadata")]
11
+ use html_to_markdown_rs::metadata::{
12
+ DocumentMetadata as RustDocumentMetadata, ExtendedMetadata as RustExtendedMetadata,
13
+ HeaderMetadata as RustHeaderMetadata, ImageMetadata as RustImageMetadata, ImageType as RustImageType,
14
+ LinkMetadata as RustLinkMetadata, LinkType as RustLinkType, MetadataConfig as RustMetadataConfig,
15
+ StructuredData as RustStructuredData, StructuredDataType as RustStructuredDataType,
16
+ TextDirection as RustTextDirection,
17
+ };
7
18
  use magnus::prelude::*;
8
19
  use magnus::r_hash::ForEach;
9
20
  use magnus::{Error, RArray, RHash, Ruby, Symbol, TryConvert, Value, function, scan_args::scan_args};
@@ -423,6 +434,261 @@ fn convert_with_inline_images_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, E
423
434
  extraction_to_value(ruby, extraction)
424
435
  }
425
436
 
437
+ #[cfg(feature = "metadata")]
438
+ fn build_metadata_config(_ruby: &Ruby, config: Option<Value>) -> Result<RustMetadataConfig, Error> {
439
+ let mut cfg = RustMetadataConfig::default();
440
+
441
+ let Some(config) = config else {
442
+ return Ok(cfg);
443
+ };
444
+
445
+ if config.is_nil() {
446
+ return Ok(cfg);
447
+ }
448
+
449
+ let hash = RHash::from_value(config).ok_or_else(|| arg_error("metadata_config must be provided as a Hash"))?;
450
+
451
+ hash.foreach(|key: Value, val: Value| {
452
+ let key_name = symbol_to_string(key)?;
453
+ match key_name.as_str() {
454
+ "extract_headers" => {
455
+ cfg.extract_headers = bool::try_convert(val)?;
456
+ }
457
+ "extract_links" => {
458
+ cfg.extract_links = bool::try_convert(val)?;
459
+ }
460
+ "extract_images" => {
461
+ cfg.extract_images = bool::try_convert(val)?;
462
+ }
463
+ "extract_structured_data" => {
464
+ cfg.extract_structured_data = bool::try_convert(val)?;
465
+ }
466
+ "max_structured_data_size" => {
467
+ cfg.max_structured_data_size = usize::try_convert(val)?;
468
+ }
469
+ _ => {}
470
+ }
471
+ Ok(ForEach::Continue)
472
+ })?;
473
+
474
+ Ok(cfg)
475
+ }
476
+
477
+ #[cfg(feature = "metadata")]
478
+ fn opt_string_to_ruby(ruby: &Ruby, opt: Option<String>) -> Result<Value, Error> {
479
+ match opt {
480
+ Some(val) => Ok(ruby.str_from_slice(val.as_bytes()).as_value()),
481
+ None => Ok(ruby.qnil().as_value()),
482
+ }
483
+ }
484
+
485
+ #[cfg(feature = "metadata")]
486
+ fn btreemap_to_ruby_hash(ruby: &Ruby, map: std::collections::BTreeMap<String, String>) -> Result<Value, Error> {
487
+ let hash = ruby.hash_new();
488
+ for (k, v) in map {
489
+ hash.aset(k, v)?;
490
+ }
491
+ Ok(hash.as_value())
492
+ }
493
+
494
+ #[cfg(feature = "metadata")]
495
+ fn text_direction_to_string(text_direction: Option<RustTextDirection>) -> Option<&'static str> {
496
+ match text_direction {
497
+ Some(RustTextDirection::LeftToRight) => Some("ltr"),
498
+ Some(RustTextDirection::RightToLeft) => Some("rtl"),
499
+ Some(RustTextDirection::Auto) => Some("auto"),
500
+ None => None,
501
+ }
502
+ }
503
+
504
+ #[cfg(feature = "metadata")]
505
+ fn link_type_to_string(link_type: &RustLinkType) -> &'static str {
506
+ match link_type {
507
+ RustLinkType::Anchor => "anchor",
508
+ RustLinkType::Internal => "internal",
509
+ RustLinkType::External => "external",
510
+ RustLinkType::Email => "email",
511
+ RustLinkType::Phone => "phone",
512
+ RustLinkType::Other => "other",
513
+ }
514
+ }
515
+
516
+ #[cfg(feature = "metadata")]
517
+ fn image_type_to_string(image_type: &RustImageType) -> &'static str {
518
+ match image_type {
519
+ RustImageType::DataUri => "data_uri",
520
+ RustImageType::InlineSvg => "inline_svg",
521
+ RustImageType::External => "external",
522
+ RustImageType::Relative => "relative",
523
+ }
524
+ }
525
+
526
+ #[cfg(feature = "metadata")]
527
+ fn structured_data_type_to_string(data_type: &RustStructuredDataType) -> &'static str {
528
+ match data_type {
529
+ RustStructuredDataType::JsonLd => "json_ld",
530
+ RustStructuredDataType::Microdata => "microdata",
531
+ RustStructuredDataType::RDFa => "rdfa",
532
+ }
533
+ }
534
+
535
+ #[cfg(feature = "metadata")]
536
+ fn document_metadata_to_ruby(ruby: &Ruby, doc: RustDocumentMetadata) -> Result<Value, Error> {
537
+ let hash = ruby.hash_new();
538
+
539
+ hash.aset(ruby.intern("title"), opt_string_to_ruby(ruby, doc.title)?)?;
540
+ hash.aset(ruby.intern("description"), opt_string_to_ruby(ruby, doc.description)?)?;
541
+
542
+ let keywords = ruby.ary_new();
543
+ for keyword in doc.keywords {
544
+ keywords.push(keyword)?;
545
+ }
546
+ hash.aset(ruby.intern("keywords"), keywords)?;
547
+
548
+ hash.aset(ruby.intern("author"), opt_string_to_ruby(ruby, doc.author)?)?;
549
+ hash.aset(
550
+ ruby.intern("canonical_url"),
551
+ opt_string_to_ruby(ruby, doc.canonical_url)?,
552
+ )?;
553
+ hash.aset(ruby.intern("base_href"), opt_string_to_ruby(ruby, doc.base_href)?)?;
554
+ hash.aset(ruby.intern("language"), opt_string_to_ruby(ruby, doc.language)?)?;
555
+
556
+ match text_direction_to_string(doc.text_direction) {
557
+ Some(dir) => hash.aset(ruby.intern("text_direction"), dir)?,
558
+ None => hash.aset(ruby.intern("text_direction"), ruby.qnil())?,
559
+ }
560
+
561
+ hash.aset(ruby.intern("open_graph"), btreemap_to_ruby_hash(ruby, doc.open_graph)?)?;
562
+ hash.aset(
563
+ ruby.intern("twitter_card"),
564
+ btreemap_to_ruby_hash(ruby, doc.twitter_card)?,
565
+ )?;
566
+ hash.aset(ruby.intern("meta_tags"), btreemap_to_ruby_hash(ruby, doc.meta_tags)?)?;
567
+
568
+ Ok(hash.as_value())
569
+ }
570
+
571
+ #[cfg(feature = "metadata")]
572
+ fn headers_to_ruby(ruby: &Ruby, headers: Vec<RustHeaderMetadata>) -> Result<Value, Error> {
573
+ let array = ruby.ary_new();
574
+ for header in headers {
575
+ let hash = ruby.hash_new();
576
+ hash.aset(ruby.intern("level"), header.level)?;
577
+ hash.aset(ruby.intern("text"), header.text)?;
578
+ hash.aset(ruby.intern("id"), opt_string_to_ruby(ruby, header.id)?)?;
579
+ hash.aset(ruby.intern("depth"), header.depth as i64)?;
580
+ hash.aset(ruby.intern("html_offset"), header.html_offset as i64)?;
581
+ array.push(hash)?;
582
+ }
583
+ Ok(array.as_value())
584
+ }
585
+
586
+ #[cfg(feature = "metadata")]
587
+ fn links_to_ruby(ruby: &Ruby, links: Vec<RustLinkMetadata>) -> Result<Value, Error> {
588
+ let array = ruby.ary_new();
589
+ for link in links {
590
+ let hash = ruby.hash_new();
591
+ hash.aset(ruby.intern("href"), link.href)?;
592
+ hash.aset(ruby.intern("text"), link.text)?;
593
+ hash.aset(ruby.intern("title"), opt_string_to_ruby(ruby, link.title)?)?;
594
+ hash.aset(ruby.intern("link_type"), link_type_to_string(&link.link_type))?;
595
+
596
+ let rel_array = ruby.ary_new();
597
+ for r in link.rel {
598
+ rel_array.push(r)?;
599
+ }
600
+ hash.aset(ruby.intern("rel"), rel_array)?;
601
+
602
+ hash.aset(ruby.intern("attributes"), btreemap_to_ruby_hash(ruby, link.attributes)?)?;
603
+ array.push(hash)?;
604
+ }
605
+ Ok(array.as_value())
606
+ }
607
+
608
+ #[cfg(feature = "metadata")]
609
+ fn images_to_ruby(ruby: &Ruby, images: Vec<RustImageMetadata>) -> Result<Value, Error> {
610
+ let array = ruby.ary_new();
611
+ for image in images {
612
+ let hash = ruby.hash_new();
613
+ hash.aset(ruby.intern("src"), image.src)?;
614
+ hash.aset(ruby.intern("alt"), opt_string_to_ruby(ruby, image.alt)?)?;
615
+ hash.aset(ruby.intern("title"), opt_string_to_ruby(ruby, image.title)?)?;
616
+
617
+ match image.dimensions {
618
+ Some((width, height)) => {
619
+ let dims = ruby.ary_new();
620
+ dims.push(width as i64)?;
621
+ dims.push(height as i64)?;
622
+ hash.aset(ruby.intern("dimensions"), dims)?;
623
+ }
624
+ None => {
625
+ hash.aset(ruby.intern("dimensions"), ruby.qnil())?;
626
+ }
627
+ }
628
+
629
+ hash.aset(ruby.intern("image_type"), image_type_to_string(&image.image_type))?;
630
+ hash.aset(
631
+ ruby.intern("attributes"),
632
+ btreemap_to_ruby_hash(ruby, image.attributes)?,
633
+ )?;
634
+ array.push(hash)?;
635
+ }
636
+ Ok(array.as_value())
637
+ }
638
+
639
+ #[cfg(feature = "metadata")]
640
+ fn structured_data_to_ruby(ruby: &Ruby, data: Vec<RustStructuredData>) -> Result<Value, Error> {
641
+ let array = ruby.ary_new();
642
+ for item in data {
643
+ let hash = ruby.hash_new();
644
+ hash.aset(
645
+ ruby.intern("data_type"),
646
+ structured_data_type_to_string(&item.data_type),
647
+ )?;
648
+ hash.aset(ruby.intern("raw_json"), item.raw_json)?;
649
+ hash.aset(ruby.intern("schema_type"), opt_string_to_ruby(ruby, item.schema_type)?)?;
650
+ array.push(hash)?;
651
+ }
652
+ Ok(array.as_value())
653
+ }
654
+
655
+ #[cfg(feature = "metadata")]
656
+ fn extended_metadata_to_ruby(ruby: &Ruby, metadata: RustExtendedMetadata) -> Result<Value, Error> {
657
+ let hash = ruby.hash_new();
658
+
659
+ hash.aset(
660
+ ruby.intern("document"),
661
+ document_metadata_to_ruby(ruby, metadata.document)?,
662
+ )?;
663
+ hash.aset(ruby.intern("headers"), headers_to_ruby(ruby, metadata.headers)?)?;
664
+ hash.aset(ruby.intern("links"), links_to_ruby(ruby, metadata.links)?)?;
665
+ hash.aset(ruby.intern("images"), images_to_ruby(ruby, metadata.images)?)?;
666
+ hash.aset(
667
+ ruby.intern("structured_data"),
668
+ structured_data_to_ruby(ruby, metadata.structured_data)?,
669
+ )?;
670
+
671
+ Ok(hash.as_value())
672
+ }
673
+
674
+ #[cfg(feature = "metadata")]
675
+ fn convert_with_metadata_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, Error> {
676
+ let parsed = scan_args::<(String,), (Option<Value>, Option<Value>), (), (), (), ()>(args)?;
677
+ let html = parsed.required.0;
678
+ let options = build_conversion_options(ruby, parsed.optional.0)?;
679
+ let metadata_config = build_metadata_config(ruby, parsed.optional.1)?;
680
+
681
+ let (markdown, metadata) =
682
+ guard_panic(|| convert_with_metadata_inner(&html, Some(options), metadata_config)).map_err(conversion_error)?;
683
+
684
+ // Convert to Ruby array [markdown, metadata_hash]
685
+ let array = ruby.ary_new();
686
+ array.push(markdown)?;
687
+ array.push(extended_metadata_to_ruby(ruby, metadata)?)?;
688
+
689
+ Ok(array.as_value())
690
+ }
691
+
426
692
  #[magnus::init]
427
693
  fn init(ruby: &Ruby) -> Result<(), Error> {
428
694
  let module = ruby.define_module("HtmlToMarkdown")?;
@@ -434,5 +700,8 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
434
700
  function!(convert_with_inline_images_fn, -1),
435
701
  )?;
436
702
 
703
+ #[cfg(feature = "metadata")]
704
+ module.define_singleton_method("convert_with_metadata", function!(convert_with_metadata_fn, -1))?;
705
+
437
706
  Ok(())
438
707
  }
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HtmlToMarkdown
4
- VERSION = '2.12.1'
4
+ VERSION = '2.14.1'
5
5
  end
@@ -14,6 +14,7 @@ module HtmlToMarkdown
14
14
  alias native_convert_with_inline_images convert_with_inline_images
15
15
  alias native_options options
16
16
  alias native_convert_with_options convert_with_options
17
+ alias native_convert_with_metadata convert_with_metadata
17
18
  end
18
19
 
19
20
  module_function
@@ -33,4 +34,130 @@ module HtmlToMarkdown
33
34
  def options(options_hash = nil)
34
35
  native_options(options_hash)
35
36
  end
37
+
38
+ # Convert HTML to Markdown with comprehensive metadata extraction.
39
+ #
40
+ # Performs HTML-to-Markdown conversion while extracting document metadata, headers,
41
+ # links, images, and structured data in a single pass. Ideal for content analysis,
42
+ # SEO workflows, and document indexing.
43
+ #
44
+ # @param html [String] HTML string to convert. Line endings are normalized (CRLF -> LF).
45
+ # @param options [ConversionOptions, Hash, nil] Optional conversion configuration.
46
+ # When a Hash, keys should match ConversionOptions field names (as symbols or strings).
47
+ # Common options:
48
+ # - :heading_style [String] "atx", "atx_closed", or "underlined" (default: "underlined")
49
+ # - :list_indent_type [String] "spaces" or "tabs" (default: "spaces")
50
+ # - :list_indent_width [Integer] Spaces per indent level (default: 4)
51
+ # - :wrap [true, false] Enable text wrapping (default: false)
52
+ # - :wrap_width [Integer] Wrap at this column width (default: 80)
53
+ # See ConversionOptions documentation for complete list.
54
+ #
55
+ # @param metadata_config [Hash, nil] Optional metadata extraction configuration.
56
+ # Keys should be symbols or strings. Supported keys:
57
+ # - :extract_headers [true, false] Extract h1-h6 heading elements (default: true)
58
+ # - :extract_links [true, false] Extract hyperlinks with type classification (default: true)
59
+ # - :extract_images [true, false] Extract image elements (default: true)
60
+ # - :extract_structured_data [true, false] Extract JSON-LD/Microdata/RDFa (default: true)
61
+ # - :max_structured_data_size [Integer] Size limit for structured data in bytes (default: 1_000_000)
62
+ #
63
+ # @return [Array(String, Hash)] Tuple of [markdown_string, metadata_hash]
64
+ # markdown_string: String - The converted Markdown output
65
+ #
66
+ # metadata_hash: Hash with keys:
67
+ # - :document [Hash] Document-level metadata:
68
+ # - :title [String, nil] From <title> tag
69
+ # - :description [String, nil] From <meta name="description">
70
+ # - :keywords [Array<String>] From <meta name="keywords">
71
+ # - :author [String, nil] From <meta name="author">
72
+ # - :language [String, nil] From lang attribute (e.g., "en")
73
+ # - :text_direction [String, nil] "ltr", "rtl", or "auto"
74
+ # - :canonical_url [String, nil] From <link rel="canonical">
75
+ # - :base_href [String, nil] From <base href="">
76
+ # - :open_graph [Hash<String, String>] Open Graph properties (og:* meta tags)
77
+ # - :twitter_card [Hash<String, String>] Twitter Card properties (twitter:* meta tags)
78
+ # - :meta_tags [Hash<String, String>] Other meta tags
79
+ #
80
+ # - :headers [Array<Hash>] Heading elements:
81
+ # - :level [Integer] 1-6
82
+ # - :text [String] Header text content
83
+ # - :id [String, nil] HTML id attribute
84
+ # - :depth [Integer] Tree nesting depth
85
+ # - :html_offset [Integer] Byte offset in original HTML
86
+ #
87
+ # - :links [Array<Hash>] Hyperlinks:
88
+ # - :href [String] Link URL
89
+ # - :text [String] Link text content
90
+ # - :title [String, nil] Title attribute
91
+ # - :link_type [String] "anchor", "internal", "external", "email", "phone", or "other"
92
+ # - :rel [Array<String>] Rel attribute values
93
+ # - :attributes [Hash<String, String>] Additional HTML attributes
94
+ #
95
+ # - :images [Array<Hash>] Image elements:
96
+ # - :src [String] Image source URL or data URI
97
+ # - :alt [String, nil] Alt text for accessibility
98
+ # - :title [String, nil] Title attribute
99
+ # - :dimensions [Array<Integer>, nil] [width, height] if available
100
+ # - :image_type [String] "data_uri", "external", "relative", or "inline_svg"
101
+ # - :attributes [Hash<String, String>] Additional HTML attributes
102
+ #
103
+ # - :structured_data [Array<Hash>] Structured data blocks:
104
+ # - :data_type [String] "json_ld", "microdata", or "rdfa"
105
+ # - :raw_json [String] Raw JSON content
106
+ # - :schema_type [String, nil] Schema type (e.g., "Article", "Event")
107
+ #
108
+ # @raise [StandardError] If conversion fails or the configuration is invalid
109
+ #
110
+ # @example Basic usage
111
+ # html = <<~HTML
112
+ # <html lang="en">
113
+ # <head>
114
+ # <title>My Article</title>
115
+ # <meta name="description" content="A great read">
116
+ # </head>
117
+ # <body>
118
+ # <h1 id="intro">Introduction</h1>
119
+ # <p>Visit <a href="https://example.com">our site</a></p>
120
+ # <img src="photo.jpg" alt="Beautiful landscape">
121
+ # </body>
122
+ # </html>
123
+ # HTML
124
+ #
125
+ # markdown, metadata = HtmlToMarkdown.convert_with_metadata(html)
126
+ #
127
+ # puts metadata[:document][:title] # => "My Article"
128
+ # puts metadata[:document][:language] # => "en"
129
+ # puts metadata[:headers].length # => 1
130
+ # puts metadata[:headers][0][:text] # => "Introduction"
131
+ # puts metadata[:links].length # => 1
132
+ # puts metadata[:images].length # => 1
133
+ #
134
+ # @example With selective metadata extraction
135
+ # config = {
136
+ # extract_headers: true,
137
+ # extract_links: true,
138
+ # extract_images: false, # Skip images
139
+ # extract_structured_data: false # Skip structured data
140
+ # }
141
+ #
142
+ # markdown, metadata = HtmlToMarkdown.convert_with_metadata(html, nil, config)
143
+ # puts metadata[:images].empty? # => true (not extracted)
144
+ #
145
+ # @example With conversion options
146
+ # options = {
147
+ # heading_style: "atx", # Use # H1, ## H2 style
148
+ # wrap: true,
149
+ # wrap_width: 80
150
+ # }
151
+ #
152
+ # config = { extract_headers: true }
153
+ #
154
+ # markdown, metadata = HtmlToMarkdown.convert_with_metadata(html, options, config)
155
+ # # Markdown uses ATX-style headings and wraps at 80 characters
156
+ #
157
+ # @see #convert Simple conversion without metadata
158
+ # @see #convert_with_inline_images Extract inline images during conversion
159
+ # @see ConversionOptions Detailed conversion configuration
160
+ def convert_with_metadata(html, options = nil, metadata_config = nil)
161
+ native_convert_with_metadata(html.to_s, options, metadata_config)
162
+ end
36
163
  end
@@ -87,6 +87,74 @@ module HtmlToMarkdown
87
87
  warnings: Array[inline_image_warning]
88
88
  }
89
89
 
90
+ type metadata_config = {
91
+ extract_headers?: bool,
92
+ extract_links?: bool,
93
+ extract_images?: bool,
94
+ extract_structured_data?: bool,
95
+ max_structured_data_size?: Integer
96
+ }
97
+
98
+ type text_direction = "ltr" | "rtl" | "auto" | nil
99
+
100
+ type document_metadata = {
101
+ title: String?,
102
+ description: String?,
103
+ keywords: Array[String],
104
+ author: String?,
105
+ canonical_url: String?,
106
+ base_href: String?,
107
+ language: String?,
108
+ text_direction: text_direction,
109
+ open_graph: Hash[String, String],
110
+ twitter_card: Hash[String, String],
111
+ meta_tags: Hash[String, String]
112
+ }
113
+
114
+ type header_metadata = {
115
+ level: Integer,
116
+ text: String,
117
+ id: String?,
118
+ depth: Integer,
119
+ html_offset: Integer
120
+ }
121
+
122
+ type link_type = "anchor" | "internal" | "external" | "email" | "phone" | "other"
123
+
124
+ type link_metadata = {
125
+ href: String,
126
+ text: String,
127
+ title: String?,
128
+ link_type: link_type,
129
+ rel: Array[String],
130
+ attributes: Hash[String, String]
131
+ }
132
+
133
+ type image_type = "data_uri" | "inline_svg" | "external" | "relative"
134
+
135
+ type image_metadata = {
136
+ src: String,
137
+ alt: String?,
138
+ title: String?,
139
+ dimensions: [Integer, Integer]?,
140
+ image_type: image_type,
141
+ attributes: Hash[String, String]
142
+ }
143
+
144
+ type structured_data = {
145
+ data_type: "json_ld" | "microdata" | "rdfa",
146
+ raw_json: String,
147
+ schema_type: String?
148
+ }
149
+
150
+ type extended_metadata = {
151
+ document: document_metadata,
152
+ headers: Array[header_metadata],
153
+ links: Array[link_metadata],
154
+ images: Array[image_metadata],
155
+ structured_data: Array[structured_data]
156
+ }
157
+
90
158
  # Native methods (implemented in Rust via Magnus/rb-sys)
91
159
  # These are aliased from the Rust extension and available as both module and instance methods
92
160
  private
@@ -99,6 +167,11 @@ module HtmlToMarkdown
99
167
  conversion_options? options,
100
168
  inline_image_config? image_config
101
169
  ) -> html_extraction
170
+ def self.native_convert_with_metadata: (
171
+ String html,
172
+ conversion_options? options,
173
+ metadata_config? metadata_config
174
+ ) -> [String, extended_metadata]
102
175
 
103
176
  def native_convert: (String html, conversion_options? options) -> String
104
177
  def native_options: (conversion_options? options_hash) -> Options
@@ -108,14 +181,19 @@ module HtmlToMarkdown
108
181
  conversion_options? options,
109
182
  inline_image_config? image_config
110
183
  ) -> html_extraction
184
+ def native_convert_with_metadata: (
185
+ String html,
186
+ conversion_options? options,
187
+ metadata_config? metadata_config
188
+ ) -> [String, extended_metadata]
111
189
 
112
190
  public
113
191
 
114
192
  # Convert HTML to Markdown with optional configuration
115
- def self.convert: (String html, ?conversion_options? options) -> String
193
+ def self.convert: (String html, ?conversion_options options) -> String
116
194
 
117
195
  # Create a reusable options handle for performance
118
- def self.options: (?conversion_options? options_hash) -> Options
196
+ def self.options: (?conversion_options options_hash) -> Options
119
197
 
120
198
  # Convert HTML using a pre-built options handle
121
199
  def self.convert_with_options: (String html, Options options_handle) -> String
@@ -123,17 +201,54 @@ module HtmlToMarkdown
123
201
  # Convert HTML with inline image extraction
124
202
  def self.convert_with_inline_images: (
125
203
  String html,
126
- ?conversion_options? options,
127
- ?inline_image_config? image_config
204
+ ?conversion_options options,
205
+ ?inline_image_config image_config
128
206
  ) -> html_extraction
129
207
 
208
+ # Convert HTML to Markdown with metadata extraction
209
+ #
210
+ # Extracts comprehensive metadata (headers, links, images, structured data) during conversion.
211
+ #
212
+ # Args:
213
+ # html: HTML string to convert
214
+ # options: Optional conversion configuration (may be nil)
215
+ # metadata_config: Optional metadata extraction configuration (may be nil)
216
+ #
217
+ # Returns:
218
+ # Array containing:
219
+ # - [0] markdown: String - Converted markdown output
220
+ # - [1] metadata: Hash - Extracted metadata with document, headers, links, images, structured_data
221
+ #
222
+ # The metadata hash contains:
223
+ # - document: Document-level metadata (title, description, lang, etc.)
224
+ # - headers: List of header elements with hierarchy
225
+ # - links: List of extracted hyperlinks with classification
226
+ # - images: List of extracted images with metadata
227
+ # - structured_data: List of JSON-LD, Microdata, or RDFa blocks
228
+ #
229
+ # Example:
230
+ # html = '<html lang="en"><head><title>Test</title></head><body><h1>Hello</h1></body></html>'
231
+ # markdown, metadata = HtmlToMarkdown.convert_with_metadata(html)
232
+ # puts "Title: #{metadata[:document][:title]}"
233
+ # puts "Headers: #{metadata[:headers].length}"
234
+ def self.convert_with_metadata: (
235
+ String html,
236
+ ?conversion_options options,
237
+ ?metadata_config metadata_config
238
+ ) -> [String, extended_metadata]
239
+
130
240
  # Instance method versions (created by module_function)
131
- def convert: (String html, ?conversion_options? options) -> String
132
- def options: (?conversion_options? options_hash) -> Options
241
+ def convert: (String html, ?conversion_options options) -> String
242
+ def options: (?conversion_options options_hash) -> Options
133
243
  def convert_with_options: (String html, Options options_handle) -> String
134
244
  def convert_with_inline_images: (
135
245
  String html,
136
- ?conversion_options? options,
137
- ?inline_image_config? image_config
246
+ ?conversion_options options,
247
+ ?inline_image_config image_config
138
248
  ) -> html_extraction
249
+ def convert_with_metadata: (
250
+ String html,
251
+ ?conversion_options options,
252
+ ?metadata_config metadata_config
253
+ ) -> [String, extended_metadata]
139
254
  end
@@ -0,0 +1,440 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+
5
+ RSpec.describe HtmlToMarkdown do
6
+ describe '.convert_with_metadata' do
7
+ it 'returns array with markdown and metadata' do
8
+ html = '<html><head><title>Test</title></head><body><p>Content</p></body></html>'
9
+ result = described_class.convert_with_metadata(html)
10
+
11
+ expect(result).to be_an(Array)
12
+ expect(result.length).to eq(2)
13
+ expect(result[0]).to be_a(String)
14
+ expect(result[1]).to be_a(Hash)
15
+ end
16
+
17
+ context 'when extracting document metadata' do
18
+ it 'extracts title' do
19
+ html = '<html><head><title>My Page Title</title></head><body><p>Content</p></body></html>'
20
+ _, metadata = described_class.convert_with_metadata(html)
21
+
22
+ expect(metadata[:document][:title]).to eq('My Page Title')
23
+ end
24
+
25
+ it 'extracts description' do
26
+ html = <<~HTML
27
+ <html>
28
+ <head><meta name="description" content="Page description"></head>
29
+ <body><p>Content</p></body>
30
+ </html>
31
+ HTML
32
+ _, metadata = described_class.convert_with_metadata(html)
33
+
34
+ expect(metadata[:document][:description]).to eq('Page description')
35
+ end
36
+
37
+ it 'extracts keywords' do
38
+ html = <<~HTML
39
+ <html>
40
+ <head><meta name="keywords" content="keyword1, keyword2, keyword3"></head>
41
+ <body><p>Content</p></body>
42
+ </html>
43
+ HTML
44
+ _, metadata = described_class.convert_with_metadata(html)
45
+
46
+ expect(metadata[:document][:keywords]).to include('keyword1', 'keyword2', 'keyword3')
47
+ end
48
+
49
+ it 'extracts author' do
50
+ html = '<html><head><meta name="author" content="John Doe"></head><body><p>Content</p></body></html>'
51
+ _, metadata = described_class.convert_with_metadata(html)
52
+
53
+ expect(metadata[:document][:author]).to eq('John Doe')
54
+ end
55
+
56
+ it 'extracts base href' do
57
+ html = '<html><head><base href="https://example.com/"></head><body><p>Content</p></body></html>'
58
+ _, metadata = described_class.convert_with_metadata(html)
59
+
60
+ expect(metadata[:document][:base_href]).to eq('https://example.com/')
61
+ end
62
+
63
+ it 'extracts canonical URL' do
64
+ html = '<html><head><link rel="canonical" href="https://example.com/page"></head><body><p>Content</p></body></html>'
65
+ _, metadata = described_class.convert_with_metadata(html)
66
+
67
+ expect(metadata[:document][:canonical_url]).to eq('https://example.com/page')
68
+ end
69
+
70
+ it 'extracts language' do
71
+ html = '<html lang="en"><head></head><body><p>Content</p></body></html>'
72
+ _, metadata = described_class.convert_with_metadata(html)
73
+
74
+ expect(metadata[:document][:language]).to eq('en')
75
+ end
76
+
77
+ it 'extracts text direction' do
78
+ html = '<html dir="ltr"><head></head><body><p>Content</p></body></html>'
79
+ _, metadata = described_class.convert_with_metadata(html)
80
+
81
+ expect(metadata[:document][:text_direction]).to eq('ltr')
82
+ end
83
+
84
+ it 'extracts open graph metadata' do
85
+ html = <<~HTML
86
+ <html>
87
+ <head>
88
+ <meta property="og:title" content="OG Title">
89
+ <meta property="og:description" content="OG Description">
90
+ <meta property="og:image" content="https://example.com/image.jpg">
91
+ </head>
92
+ <body><p>Content</p></body>
93
+ </html>
94
+ HTML
95
+ _, metadata = described_class.convert_with_metadata(html)
96
+
97
+ expect(metadata[:document][:open_graph]).to include(
98
+ 'title' => 'OG Title',
99
+ 'description' => 'OG Description',
100
+ 'image' => 'https://example.com/image.jpg'
101
+ )
102
+ end
103
+
104
+ it 'extracts twitter card metadata' do
105
+ html = <<~HTML
106
+ <html>
107
+ <head>
108
+ <meta name="twitter:card" content="summary_large_image">
109
+ <meta name="twitter:title" content="Twitter Title">
110
+ </head>
111
+ <body><p>Content</p></body>
112
+ </html>
113
+ HTML
114
+ _, metadata = described_class.convert_with_metadata(html)
115
+
116
+ expect(metadata[:document][:twitter_card]).to include(
117
+ 'card' => 'summary_large_image',
118
+ 'title' => 'Twitter Title'
119
+ )
120
+ end
121
+
122
+ it 'returns empty arrays and hashes for missing metadata' do
123
+ html = '<p>Content</p>'
124
+ _, metadata = described_class.convert_with_metadata(html)
125
+
126
+ expect(metadata[:document][:title]).to be_nil
127
+ expect(metadata[:document][:description]).to be_nil
128
+ expect(metadata[:document][:keywords]).to eq([])
129
+ expect(metadata[:document][:open_graph]).to eq({})
130
+ expect(metadata[:document][:twitter_card]).to eq({})
131
+ expect(metadata[:document][:meta_tags]).to eq({})
132
+ end
133
+ end
134
+
135
+ context 'when extracting header metadata' do
136
+ it 'extracts headers with hierarchy' do
137
+ html = <<~HTML
138
+ <html>
139
+ <body>
140
+ <h1>Main Title</h1>
141
+ <h2>Section</h2>
142
+ <h3>Subsection</h3>
143
+ </body>
144
+ </html>
145
+ HTML
146
+ _, metadata = described_class.convert_with_metadata(html)
147
+
148
+ expect(metadata[:headers].length).to eq(3)
149
+ expect(metadata[:headers][0][:level]).to eq(1)
150
+ expect(metadata[:headers][0][:text]).to eq('Main Title')
151
+ expect(metadata[:headers][1][:level]).to eq(2)
152
+ expect(metadata[:headers][1][:text]).to eq('Section')
153
+ expect(metadata[:headers][2][:level]).to eq(3)
154
+ expect(metadata[:headers][2][:text]).to eq('Subsection')
155
+ end
156
+
157
+ it 'includes header id' do
158
+ html = '<html><body><h1 id="main-title">Title</h1></body></html>'
159
+ _, metadata = described_class.convert_with_metadata(html)
160
+
161
+ expect(metadata[:headers][0][:id]).to eq('main-title')
162
+ end
163
+
164
+ it 'includes depth and html_offset' do
165
+ html = '<html><body><h1>Title</h1></body></html>'
166
+ _, metadata = described_class.convert_with_metadata(html)
167
+
168
+ header = metadata[:headers][0]
169
+ expect(header).to include(:depth, :html_offset)
170
+ expect(header[:depth]).to be_a(Integer)
171
+ expect(header[:html_offset]).to be_a(Integer)
172
+ end
173
+ end
174
+
175
+ context 'when extracting link metadata' do
176
+ it 'extracts links with classification' do
177
+ html = <<~HTML
178
+ <html>
179
+ <body>
180
+ <a href="#section">Anchor</a>
181
+ <a href="https://example.com">External</a>
182
+ <a href="/page">Internal</a>
183
+ <a href="mailto:test@example.com">Email</a>
184
+ <a href="tel:+1234567890">Phone</a>
185
+ </body>
186
+ </html>
187
+ HTML
188
+ _, metadata = described_class.convert_with_metadata(html)
189
+
190
+ links = metadata[:links]
191
+ expect(links.length).to eq(5)
192
+
193
+ expect(links[0][:link_type]).to eq('anchor')
194
+ expect(links[1][:link_type]).to eq('external')
195
+ expect(links[2][:link_type]).to eq('internal')
196
+ expect(links[3][:link_type]).to eq('email')
197
+ expect(links[4][:link_type]).to eq('phone')
198
+ end
199
+
200
+ it 'includes link text and href' do
201
+ html = '<html><body><a href="https://example.com">Click here</a></body></html>'
202
+ _, metadata = described_class.convert_with_metadata(html)
203
+
204
+ link = metadata[:links][0]
205
+ expect(link[:href]).to eq('https://example.com')
206
+ expect(link[:text]).to eq('Click here')
207
+ end
208
+
209
+ it 'includes link title attribute' do
210
+ html = '<html><body><a href="https://example.com" title="Example Site">Link</a></body></html>'
211
+ _, metadata = described_class.convert_with_metadata(html)
212
+
213
+ link = metadata[:links][0]
214
+ expect(link[:title]).to eq('Example Site')
215
+ end
216
+
217
+ it 'includes link rel attributes' do
218
+ html = '<html><body><a href="https://example.com" rel="nofollow external">Link</a></body></html>'
219
+ _, metadata = described_class.convert_with_metadata(html)
220
+
221
+ link = metadata[:links][0]
222
+ expect(link[:rel]).to include('nofollow', 'external')
223
+ end
224
+
225
+ it 'includes link attributes' do
226
+ html = '<html><body><a href="https://example.com" data-custom="value">Link</a></body></html>'
227
+ _, metadata = described_class.convert_with_metadata(html)
228
+
229
+ link = metadata[:links][0]
230
+ expect(link[:attributes]).to include('data-custom' => 'value')
231
+ end
232
+ end
233
+
234
+ context 'when extracting image metadata' do
235
+ it 'extracts images with source type' do
236
+ html = <<~HTML
237
+ <html>
238
+ <body>
239
+ <img src="https://example.com/image.jpg" alt="External">
240
+ <img src="/images/local.jpg" alt="Relative">
241
+ <img src="data:image/png;base64,..." alt="Data URI">
242
+ </body>
243
+ </html>
244
+ HTML
245
+ _, metadata = described_class.convert_with_metadata(html)
246
+
247
+ images = metadata[:images]
248
+ expect(images.length).to eq(3)
249
+
250
+ expect(images[0][:image_type]).to eq('external')
251
+ expect(images[1][:image_type]).to eq('relative')
252
+ expect(images[2][:image_type]).to eq('data_uri')
253
+ end
254
+
255
+ it 'includes image alt and title' do
256
+ html = '<html><body><img src="image.jpg" alt="Alt text" title="Image title"></body></html>'
257
+ _, metadata = described_class.convert_with_metadata(html)
258
+
259
+ image = metadata[:images][0]
260
+ expect(image[:alt]).to eq('Alt text')
261
+ expect(image[:title]).to eq('Image title')
262
+ end
263
+
264
+ it 'includes image dimensions' do
265
+ html = '<html><body><img src="image.jpg" width="800" height="600"></body></html>'
266
+ _, metadata = described_class.convert_with_metadata(html)
267
+
268
+ image = metadata[:images][0]
269
+ expect(image[:dimensions]).to be_an(Array)
270
+ expect(image[:dimensions].length).to eq(2)
271
+ end
272
+
273
+ it 'handles missing image attributes' do
274
+ html = '<html><body><img src="image.jpg"></body></html>'
275
+ _, metadata = described_class.convert_with_metadata(html)
276
+
277
+ image = metadata[:images][0]
278
+ expect(image[:alt]).to be_nil
279
+ expect(image[:title]).to be_nil
280
+ end
281
+ end
282
+
283
+ context 'with metadata configuration flags' do
284
+ it 'respects extract_headers flag' do
285
+ html = '<html><body><h1>Title</h1><p>Content</p></body></html>'
286
+ config = { extract_headers: false }
287
+ _, metadata = described_class.convert_with_metadata(html, nil, config)
288
+
289
+ expect(metadata[:headers]).to eq([])
290
+ end
291
+
292
+ it 'respects extract_links flag' do
293
+ html = '<html><body><a href="https://example.com">Link</a></body></html>'
294
+ config = { extract_links: false }
295
+ _, metadata = described_class.convert_with_metadata(html, nil, config)
296
+
297
+ expect(metadata[:links]).to eq([])
298
+ end
299
+
300
+ it 'respects extract_images flag' do
301
+ html = '<html><body><img src="image.jpg" alt="test"></body></html>'
302
+ config = { extract_images: false }
303
+ _, metadata = described_class.convert_with_metadata(html, nil, config)
304
+
305
+ expect(metadata[:images]).to eq([])
306
+ end
307
+
308
+ it 'respects extract_structured_data flag' do
309
+ html = '<html><body><script type="application/ld+json">{"@type":"Article"}</script></body></html>'
310
+ config = { extract_structured_data: false }
311
+ _, metadata = described_class.convert_with_metadata(html, nil, config)
312
+
313
+ expect(metadata[:structured_data]).to eq([])
314
+ end
315
+ end
316
+
317
+ context 'with conversion options and metadata config' do
318
+ it 'accepts both conversion options and metadata config' do
319
+ html = '<html><head><title>Test</title></head><body><h1>Heading</h1></body></html>'
320
+ conv_opts = { heading_style: :atx_closed }
321
+ meta_opts = { extract_headers: true }
322
+
323
+ markdown, metadata = described_class.convert_with_metadata(html, conv_opts, meta_opts)
324
+
325
+ expect(markdown).to include('# Heading #')
326
+ expect(metadata[:headers].length).to eq(1)
327
+ end
328
+
329
+ it 'works with nil options' do
330
+ html = '<html><head><title>Test</title></head><body><p>Content</p></body></html>'
331
+ result = described_class.convert_with_metadata(html, nil, nil)
332
+
333
+ expect(result).to be_an(Array)
334
+ expect(result.length).to eq(2)
335
+ end
336
+ end
337
+
338
+ context 'when extracting structured data' do
339
+ it 'extracts JSON-LD blocks' do
340
+ html = <<~HTML
341
+ <html>
342
+ <head>
343
+ <script type="application/ld+json">
344
+ {"@context":"https://schema.org","@type":"Article","headline":"Test"}
345
+ </script>
346
+ </head>
347
+ <body><p>Content</p></body>
348
+ </html>
349
+ HTML
350
+ _, metadata = described_class.convert_with_metadata(html)
351
+
352
+ # Structured data extraction may vary by implementation
353
+ expect(metadata[:structured_data]).to be_an(Array)
354
+ end
355
+ end
356
+
357
+ context 'with edge cases' do
358
+ it 'handles empty HTML' do
359
+ html = ''
360
+ markdown, metadata = described_class.convert_with_metadata(html)
361
+
362
+ expect(markdown).to be_a(String)
363
+ expect(metadata).to be_a(Hash)
364
+ end
365
+
366
+ it 'handles malformed HTML' do
367
+ html = '<html><head><title>Unclosed'
368
+ markdown, metadata = described_class.convert_with_metadata(html)
369
+
370
+ expect(markdown).to be_a(String)
371
+ expect(metadata).to be_a(Hash)
372
+ end
373
+
374
+ it 'handles special characters in metadata' do
375
+ html = '<html><head><title>Title with "quotes" & <brackets></title></head><body><p>Content</p></body></html>'
376
+ _, metadata = described_class.convert_with_metadata(html)
377
+
378
+ expect(metadata[:document][:title]).to be_a(String)
379
+ end
380
+
381
+ it 'handles whitespace in metadata' do
382
+ html = '<html><head><title> Title with spaces </title></head><body><p>Content</p></body></html>'
383
+ _, metadata = described_class.convert_with_metadata(html)
384
+
385
+ # Whitespace may be normalized
386
+ expect(metadata[:document][:title]).to match(/Title.*spaces/)
387
+ end
388
+
389
+ it 'handles multiple values for same metadata key' do
390
+ html = <<~HTML
391
+ <html>
392
+ <head>
393
+ <meta name="author" content="Author 1">
394
+ <meta name="author" content="Author 2">
395
+ </head>
396
+ <body><p>Content</p></body>
397
+ </html>
398
+ HTML
399
+ _, metadata = described_class.convert_with_metadata(html)
400
+
401
+ # Last value typically wins, but implementation may vary
402
+ expect(metadata[:document][:author]).to be_a(String)
403
+ end
404
+ end
405
+
406
+ context 'when returning value structure' do
407
+ it 'returns proper metadata hash structure' do
408
+ html = <<~HTML
409
+ <html>
410
+ <head><title>Test</title><base href="https://example.com"></head>
411
+ <body><h1>H1</h1><a href="link">Link</a><img src="img.jpg"></body>
412
+ </html>
413
+ HTML
414
+ _, metadata = described_class.convert_with_metadata(html)
415
+
416
+ expect(metadata).to include(
417
+ :document,
418
+ :headers,
419
+ :links,
420
+ :images,
421
+ :structured_data
422
+ )
423
+
424
+ expect(metadata[:document]).to include(
425
+ :title,
426
+ :description,
427
+ :keywords,
428
+ :author,
429
+ :canonical_url,
430
+ :base_href,
431
+ :language,
432
+ :text_direction,
433
+ :open_graph,
434
+ :twitter_card,
435
+ :meta_tags
436
+ )
437
+ end
438
+ end
439
+ end
440
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html-to-markdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.12.1
4
+ version: 2.14.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-12-09 00:00:00.000000000 Z
11
+ date: 2025-12-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -67,6 +67,7 @@ files:
67
67
  - sig/open3.rbs
68
68
  - spec/cli_proxy_spec.rb
69
69
  - spec/convert_spec.rb
70
+ - spec/metadata_extraction_spec.rb
70
71
  - spec/spec_helper.rb
71
72
  homepage: https://github.com/Goldziher/html-to-markdown
72
73
  licenses: