html-to-markdown 2.30.0 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +6 -19
  3. data/README.md +37 -50
  4. data/ext/html-to-markdown-rb/native/Cargo.lock +13 -701
  5. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
  6. data/ext/html-to-markdown-rb/native/README.md +4 -13
  7. data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
  8. data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
  9. data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
  10. data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
  11. data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
  12. data/lib/html_to_markdown/version.rb +1 -1
  13. data/lib/html_to_markdown.rb +13 -194
  14. data/sig/html_to_markdown.rbs +12 -373
  15. data/vendor/Cargo.toml +6 -3
  16. data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
  17. data/vendor/html-to-markdown-rs/README.md +126 -52
  18. data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
  19. data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
  20. data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
  21. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
  22. data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
  23. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
  24. data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
  25. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
  26. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
  27. data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
  28. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
  29. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
  30. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
  31. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
  32. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
  33. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
  34. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
  35. data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
  36. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
  37. data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
  38. data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
  39. data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
  40. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
  41. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
  42. data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
  43. data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
  44. data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
  45. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
  46. data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
  47. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
  48. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
  49. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
  50. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
  51. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
  52. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
  53. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
  54. data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
  55. data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
  56. data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -68
  57. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
  58. data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
  59. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
  60. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
  61. data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
  62. data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
  63. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
  64. data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
  65. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
  66. data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
  67. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
  68. data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
  69. data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
  70. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
  71. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
  72. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
  73. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
  74. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
  75. data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
  76. data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
  77. data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
  78. data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
  79. data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
  80. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
  81. data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
  82. data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -323
  83. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
  84. data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
  85. data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
  86. data/vendor/html-to-markdown-rs/src/text.rs +25 -14
  87. data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
  88. data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
  89. data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
  90. data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
  91. data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
  92. data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
  93. data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
  94. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
  95. data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
  96. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
  97. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
  98. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
  99. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
  100. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
  101. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
  102. data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
  103. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
  104. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
  105. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
  106. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
  107. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
  108. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
  109. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
  110. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
  111. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
  112. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
  113. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
  114. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
  115. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
  116. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
  117. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
  118. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
  119. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
  120. data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
  121. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
  122. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
  123. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
  124. data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
  125. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
  126. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
  127. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
  128. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
  129. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
  130. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
  131. metadata +9 -37
  132. data/bin/benchmark.rb +0 -232
  133. data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
  134. data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
  135. data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
  136. data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
  137. data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
  138. data/spec/convert_spec.rb +0 -77
  139. data/spec/convert_with_tables_spec.rb +0 -194
  140. data/spec/metadata_extraction_spec.rb +0 -437
  141. data/spec/visitor_issue_187_spec.rb +0 -605
  142. data/spec/visitor_spec.rb +0 -1149
  143. data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
  144. data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
  145. data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
  146. data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
  147. data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
  148. data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
  149. data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
  150. data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
  151. data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
  152. data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -42
  153. data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
  154. data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
  155. data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
  156. data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
  157. data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
  158. data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
  159. data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
  160. data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
  161. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
  162. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
  163. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
  164. data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
  165. data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
  166. data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
@@ -1,100 +0,0 @@
1
- #![allow(missing_docs)]
2
-
3
- //! Regression tests: `convert_with_metadata` must never prepend YAML frontmatter
4
- //! to the markdown string. Metadata is returned as a struct, so embedding it
5
- //! in the content is redundant and pollutes the output.
6
-
7
- use html_to_markdown_rs::ConversionOptions;
8
- use html_to_markdown_rs::metadata::MetadataConfig;
9
-
10
- #[test]
11
- fn convert_with_metadata_omits_yaml_frontmatter_default_options() {
12
- let html = r#"
13
- <html>
14
- <head>
15
- <title>My Page Title</title>
16
- <meta name="description" content="A page description">
17
- <meta name="author" content="Jane Doe">
18
- </head>
19
- <body><p>Hello world</p></body>
20
- </html>
21
- "#;
22
-
23
- let (markdown, metadata) = html_to_markdown_rs::convert_with_metadata(html, None, MetadataConfig::default(), None)
24
- .expect("convert_with_metadata failed");
25
-
26
- // Metadata struct should contain the extracted data
27
- assert!(
28
- metadata.document.title.is_some(),
29
- "metadata.document.title should be populated"
30
- );
31
-
32
- // Markdown output must NOT contain YAML frontmatter delimiters
33
- assert!(
34
- !markdown.contains("---"),
35
- "markdown should not contain YAML frontmatter delimiters, got:\n{markdown}"
36
- );
37
- assert!(
38
- !markdown.starts_with("---\n"),
39
- "markdown should not start with YAML frontmatter, got:\n{markdown}"
40
- );
41
- }
42
-
43
- #[test]
44
- fn convert_with_metadata_omits_frontmatter_even_when_extract_metadata_is_true() {
45
- let html = r#"
46
- <html>
47
- <head>
48
- <title>Test Title</title>
49
- <meta name="description" content="Test description">
50
- </head>
51
- <body><p>Content here</p></body>
52
- </html>
53
- "#;
54
-
55
- // Explicitly pass extract_metadata: true — convert_with_metadata should override it
56
- let options = ConversionOptions {
57
- extract_metadata: true,
58
- ..Default::default()
59
- };
60
-
61
- let (markdown, metadata) =
62
- html_to_markdown_rs::convert_with_metadata(html, Some(options), MetadataConfig::default(), None)
63
- .expect("convert_with_metadata failed");
64
-
65
- assert!(
66
- metadata.document.title.is_some(),
67
- "metadata struct should still contain title"
68
- );
69
- assert!(
70
- !markdown.contains("---"),
71
- "YAML frontmatter must not appear even when extract_metadata was explicitly true, got:\n{markdown}"
72
- );
73
- }
74
-
75
- #[test]
76
- fn convert_with_metadata_body_content_is_clean() {
77
- let html = r#"
78
- <html>
79
- <head>
80
- <title>Page</title>
81
- <meta name="keywords" content="rust, html, markdown">
82
- </head>
83
- <body><h1>Heading</h1><p>Paragraph text.</p></body>
84
- </html>
85
- "#;
86
-
87
- let (markdown, _metadata) = html_to_markdown_rs::convert_with_metadata(html, None, MetadataConfig::default(), None)
88
- .expect("convert_with_metadata failed");
89
-
90
- // The output should start with the body content, not frontmatter
91
- let trimmed = markdown.trim();
92
- assert!(
93
- trimmed.starts_with('#') || trimmed.starts_with("Heading"),
94
- "markdown should start with body content, got:\n{markdown}"
95
- );
96
- assert!(
97
- trimmed.contains("Paragraph text."),
98
- "markdown should contain paragraph text, got:\n{markdown}"
99
- );
100
- }
@@ -1,509 +0,0 @@
1
- #![allow(missing_docs, clippy::float_cmp, clippy::items_after_statements, deprecated)]
2
-
3
- //! hOCR 1.2 compliance integration tests
4
- //!
5
- //! Tests full hOCR specification support across all element types and properties.
6
-
7
- use html_to_markdown_rs::hocr::{HocrElement, HocrElementType, convert_to_markdown, extract_hocr_document};
8
-
9
- #[test]
10
- fn test_full_hocr_document_structure() {
11
- let hocr = r#"<!DOCTYPE html>
12
- <html>
13
- <head>
14
- <meta name="ocr-system" content="tesseract 5.0" />
15
- <meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
16
- <meta name="ocr-number-of-pages" content="1" />
17
- </head>
18
- <body>
19
- <div class="ocr_page" title="bbox 0 0 1000 1000">
20
- <div class="ocr_carea" title="bbox 0 0 1000 100">
21
- <h1 class="ocr_title" title="bbox 0 0 500 50">Document Title</h1>
22
- </div>
23
- <div class="ocr_carea" title="bbox 0 100 1000 500">
24
- <h2 class="ocr_chapter" title="bbox 0 100 300 130">Chapter 1</h2>
25
- <p class="ocr_par" title="bbox 0 150 900 250">
26
- <span class="ocr_line" title="bbox 0 150 800 180">
27
- <span class="ocrx_word" title="bbox 0 150 50 180; x_wconf 95">This</span>
28
- <span class="ocrx_word" title="bbox 60 150 90 180; x_wconf 92">is</span>
29
- <span class="ocrx_word" title="bbox 100 150 150 180; x_wconf 98">text</span>
30
- </span>
31
- </p>
32
- </div>
33
- </div>
34
- </body>
35
- </html>"#;
36
-
37
- let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
38
- let (elements, metadata) = extract_hocr_document(&dom);
39
-
40
- assert_eq!(metadata.ocr_system, Some("tesseract 5.0".to_string()));
41
- assert_eq!(metadata.ocr_number_of_pages, Some(1));
42
- assert!(metadata.ocr_capabilities.contains(&"ocr_page".to_string()));
43
-
44
- assert!(!elements.is_empty());
45
-
46
- let markdown = convert_to_markdown(&elements, true);
47
- assert!(markdown.contains("Document Title"));
48
- assert!(markdown.contains("Chapter 1"));
49
- assert!(markdown.contains("This is text"));
50
- }
51
-
52
- #[test]
53
- fn test_advanced_properties() {
54
- let hocr = r#"<div class="ocr_page" title="bbox 0 0 1000 1000">
55
- <span class="ocr_line" title="bbox 100 50 500 80; baseline 0.015 -18; x_font &quot;Arial&quot;; x_fsize 12">
56
- <span class="ocrx_word" title="bbox 100 50 150 80; x_wconf 95; textangle 2.5">Word</span>
57
- </span>
58
- </div>"#;
59
-
60
- let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
61
- let (elements, _) = extract_hocr_document(&dom);
62
-
63
- fn find_line(elements: &[HocrElement]) -> Option<&HocrElement> {
64
- for elem in elements {
65
- if matches!(elem.element_type, HocrElementType::OcrLine) {
66
- return Some(elem);
67
- }
68
- if let Some(found) = find_line(&elem.children) {
69
- return Some(found);
70
- }
71
- }
72
- None
73
- }
74
-
75
- let line = find_line(&elements).expect("Should find ocr_line");
76
-
77
- assert!(line.properties.baseline.is_some());
78
- assert_eq!(line.properties.baseline.unwrap().slope, 0.015);
79
- assert_eq!(line.properties.baseline.unwrap().constant, -18);
80
- assert_eq!(line.properties.x_font, Some("Arial".to_string()));
81
- assert_eq!(line.properties.x_fsize, Some(12));
82
-
83
- assert_eq!(line.children.len(), 1);
84
- assert_eq!(line.children[0].properties.textangle, Some(2.5));
85
- }
86
-
87
- #[test]
88
- fn test_all_logical_elements() {
89
- let hocr = r#"<div class="ocr_document">
90
- <div class="ocr_part"><span class="ocrx_word" title="bbox 0 0 50 20">Part</span></div>
91
- <div class="ocr_chapter"><span class="ocrx_word" title="bbox 0 0 50 20">Chapter</span></div>
92
- <div class="ocr_section"><span class="ocrx_word" title="bbox 0 0 50 20">Section</span></div>
93
- <div class="ocr_subsection"><span class="ocrx_word" title="bbox 0 0 50 20">Subsection</span></div>
94
- <p class="ocr_par"><span class="ocrx_word" title="bbox 0 0 50 20">Paragraph</span></p>
95
- <blockquote class="ocr_blockquote"><span class="ocrx_word" title="bbox 0 0 50 20">Quote</span></blockquote>
96
- <div class="ocr_caption"><span class="ocrx_word" title="bbox 0 0 50 20">Caption</span></div>
97
- </div>"#;
98
-
99
- let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
100
- let (elements, _) = extract_hocr_document(&dom);
101
-
102
- let markdown = convert_to_markdown(&elements, true);
103
-
104
- assert!(markdown.contains("# Part") || markdown.contains("# Chapter"));
105
- assert!(markdown.contains("## Section"));
106
- assert!(markdown.contains("### Subsection"));
107
- assert!(markdown.contains("Paragraph"));
108
- assert!(markdown.contains("> Quote"));
109
- assert!(markdown.contains("*Caption*"));
110
- }
111
-
112
- #[test]
113
- fn test_float_elements() {
114
- let hocr = r#"<div class="ocr_page" title="bbox 0 0 1000 1000">
115
- <div class="ocr_header" title="bbox 0 0 1000 50"><span class="ocrx_word" title="bbox 0 0 50 20">Header</span></div>
116
- <div class="ocr_footer" title="bbox 0 950 1000 1000"><span class="ocrx_word" title="bbox 0 0 50 20">Footer</span></div>
117
- <div class="ocr_table" title="bbox 100 100 500 300">
118
- <span class="ocrx_word" title="bbox 100 100 150 120">Table</span>
119
- </div>
120
- </div>"#;
121
-
122
- let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
123
- let (elements, _) = extract_hocr_document(&dom);
124
-
125
- let markdown = convert_to_markdown(&elements, true);
126
-
127
- assert!(markdown.contains("*Header*"));
128
- assert!(markdown.contains("*Footer*"));
129
- }
130
-
131
- #[test]
132
- fn test_character_level_properties() {
133
- let hocr = r#"<span class="ocr_cinfo" title="x_confs 95.3 87.2 92.1; x_bboxes 0 0 10 20 10 0 20 20 20 0 30 20">
134
- <span class="ocrx_word" title="bbox 0 0 30 20">ABC</span>
135
- </span>"#;
136
-
137
- let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
138
- let (elements, _) = extract_hocr_document(&dom);
139
-
140
- fn find_cinfo(elements: &[HocrElement]) -> Option<&HocrElement> {
141
- for elem in elements {
142
- if matches!(elem.element_type, HocrElementType::OcrCinfo) {
143
- return Some(elem);
144
- }
145
- if let Some(found) = find_cinfo(&elem.children) {
146
- return Some(found);
147
- }
148
- }
149
- None
150
- }
151
-
152
- if let Some(cinfo) = find_cinfo(&elements) {
153
- assert_eq!(cinfo.properties.x_confs, vec![95.3, 87.2, 92.1]);
154
-
155
- assert_eq!(cinfo.properties.x_bboxes.len(), 3);
156
- assert_eq!(cinfo.properties.x_bboxes[0].x1, 0);
157
- assert_eq!(cinfo.properties.x_bboxes[1].x1, 10);
158
- assert_eq!(cinfo.properties.x_bboxes[2].x1, 20);
159
- }
160
- }
161
-
162
- #[test]
163
- fn test_page_properties() {
164
- let hocr = r#"<div class="ocr_page" title="bbox 0 0 2480 3508; image &quot;/path/to/image.png&quot;; ppageno 5; lpageno &quot;V&quot;; scan_res 300 300">
165
- <p class="ocr_par">Content</p>
166
- </div>"#;
167
-
168
- let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
169
- let (elements, _) = extract_hocr_document(&dom);
170
-
171
- fn find_page(elements: &[HocrElement]) -> Option<&HocrElement> {
172
- for elem in elements {
173
- if matches!(elem.element_type, HocrElementType::OcrPage) {
174
- return Some(elem);
175
- }
176
- if let Some(found) = find_page(&elem.children) {
177
- return Some(found);
178
- }
179
- }
180
- None
181
- }
182
-
183
- let page = find_page(&elements).expect("Should find ocr_page");
184
-
185
- assert_eq!(page.properties.image, Some("/path/to/image.png".to_string()));
186
- assert_eq!(page.properties.ppageno, Some(5));
187
- assert_eq!(page.properties.lpageno, Some("V".to_string()));
188
- assert_eq!(page.properties.scan_res, Some((300, 300)));
189
- }
190
-
191
- #[test]
192
- fn test_content_flow_and_order() {
193
- let hocr = r#"<div class="ocr_linear" title="order 1; cflow &quot;main-flow&quot;">
194
- <p class="ocr_par" title="order 2"><span class="ocrx_word" title="bbox 0 0 50 20">Second</span></p>
195
- <p class="ocr_par" title="order 1"><span class="ocrx_word" title="bbox 0 0 50 20">First</span></p>
196
- </div>"#;
197
-
198
- let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
199
- let (elements, _) = extract_hocr_document(&dom);
200
-
201
- fn find_linear(elements: &[HocrElement]) -> Option<&HocrElement> {
202
- for elem in elements {
203
- if matches!(elem.element_type, HocrElementType::OcrLinear) {
204
- return Some(elem);
205
- }
206
- if let Some(found) = find_linear(&elem.children) {
207
- return Some(found);
208
- }
209
- }
210
- None
211
- }
212
-
213
- let linear = find_linear(&elements).expect("Should find ocr_linear");
214
-
215
- assert_eq!(linear.properties.order, Some(1));
216
- assert_eq!(linear.properties.cflow, Some("main-flow".to_string()));
217
-
218
- assert_eq!(linear.children[0].properties.order, Some(2));
219
- assert_eq!(linear.children[1].properties.order, Some(1));
220
- }
221
-
222
- #[test]
223
- fn test_abstract_and_author() {
224
- let hocr = r#"<div class="ocr_document">
225
- <div class="ocr_abstract"><span class="ocrx_word" title="bbox 0 0 50 20">This is an abstract</span></div>
226
- <div class="ocr_author"><span class="ocrx_word" title="bbox 0 0 50 20">John Doe</span></div>
227
- </div>"#;
228
-
229
- let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
230
- let (elements, _) = extract_hocr_document(&dom);
231
- let markdown = convert_to_markdown(&elements, true);
232
-
233
- assert!(markdown.contains("**Abstract**"));
234
- assert!(markdown.contains("This is an abstract"));
235
- assert!(markdown.contains("*John Doe*"));
236
- }
237
-
238
- #[test]
239
- fn test_separator() {
240
- let hocr = r#"<div class="ocr_page">
241
- <p class="ocr_par"><span class="ocrx_word" title="bbox 0 0 50 20">Text before</span></p>
242
- <div class="ocr_separator"></div>
243
- <p class="ocr_par"><span class="ocrx_word" title="bbox 0 0 50 20">Text after</span></p>
244
- </div>"#;
245
-
246
- let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
247
- let (elements, _) = extract_hocr_document(&dom);
248
- let markdown = convert_to_markdown(&elements, true);
249
-
250
- assert!(markdown.contains("Text before"));
251
- assert!(markdown.contains("---"));
252
- assert!(markdown.contains("Text after"));
253
- }
254
-
255
- #[test]
256
- fn test_image_elements() {
257
- let hocr = r#"<div class="ocr_page">
258
- <div class="ocr_image" title="image &quot;/path/to/image.png&quot;"><span class="ocrx_word" title="bbox 0 0 50 20">Alt text</span></div>
259
- <div class="ocr_photo"><span class="ocrx_word" title="bbox 0 0 50 20">Photo caption</span></div>
260
- <div class="ocr_linedrawing"><span class="ocrx_word" title="bbox 0 0 50 20">Drawing caption</span></div>
261
- </div>"#;
262
-
263
- let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
264
- let (elements, _) = extract_hocr_document(&dom);
265
- let markdown = convert_to_markdown(&elements, true);
266
-
267
- assert!(markdown.contains("![Alt text](/path/to/image.png)"));
268
- assert!(markdown.contains("![Image]"));
269
- }
270
-
271
- #[test]
272
- fn test_math_and_chem() {
273
- let hocr = r#"<div class="ocr_page">
274
- <span class="ocr_math"><span class="ocrx_word" title="bbox 0 0 50 20">E=mc^2</span></span>
275
- <span class="ocr_chem"><span class="ocrx_word" title="bbox 0 0 50 20">H2O</span></span>
276
- <div class="ocr_display"><span class="ocrx_word" title="bbox 0 0 50 20">x^2 + y^2 = z^2</span></div>
277
- </div>"#;
278
-
279
- let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
280
- let (elements, _) = extract_hocr_document(&dom);
281
- let markdown = convert_to_markdown(&elements, true);
282
-
283
- assert!(markdown.contains("`E=mc^2`"));
284
- assert!(markdown.contains("`H2O`"));
285
- assert!(markdown.contains("```"));
286
- assert!(markdown.contains("x^2 + y^2 = z^2"));
287
- }
288
-
289
- #[test]
290
- fn test_dropcap_and_glyphs() {
291
- let hocr = r#"<div class="ocr_page">
292
- <p class="ocr_par">
293
- <span class="ocr_dropcap"><span class="ocrx_word" title="bbox 0 0 50 20">T</span></span>
294
- <span class="ocrx_word" title="bbox 0 0 50 20">his is text</span>
295
- </p>
296
- <span class="ocr_glyph" title="bbox 0 0 10 20">A</span>
297
- <span class="ocr_glyphs">
298
- <span class="ocr_glyph" title="bbox 0 0 10 20">B</span>
299
- <span class="ocr_glyph" title="bbox 0 0 10 20">C</span>
300
- </span>
301
- </div>"#;
302
-
303
- let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
304
- let (elements, _) = extract_hocr_document(&dom);
305
- let markdown = convert_to_markdown(&elements, true);
306
-
307
- assert!(markdown.contains("**T**"));
308
- assert!(markdown.contains("his is text"));
309
- assert!(markdown.contains('A'));
310
- assert!(markdown.contains('B'));
311
- assert!(markdown.contains('C'));
312
- }
313
-
314
- #[test]
315
- fn test_float_elements_comprehensive() {
316
- let hocr = r#"<div class="ocr_page">
317
- <div class="ocr_float"><span class="ocrx_word" title="bbox 0 0 50 20">Float content</span></div>
318
- <div class="ocr_textfloat"><span class="ocrx_word" title="bbox 0 0 50 20">Text float</span></div>
319
- <div class="ocr_textimage"><span class="ocrx_word" title="bbox 0 0 50 20">Text with image</span></div>
320
- </div>"#;
321
-
322
- let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
323
- let (elements, _) = extract_hocr_document(&dom);
324
- let markdown = convert_to_markdown(&elements, true);
325
-
326
- assert!(markdown.contains("Float content"));
327
- assert!(markdown.contains("Text float"));
328
- assert!(markdown.contains("Text with image"));
329
- }
330
-
331
- #[test]
332
- fn test_container_elements() {
333
- let hocr = r#"<div class="ocr_document">
334
- <div class="ocr_column">
335
- <p class="ocr_par"><span class="ocrx_word" title="bbox 0 0 50 20">Column text</span></p>
336
- </div>
337
- <div class="ocr_xycut">
338
- <p class="ocr_par"><span class="ocrx_word" title="bbox 0 0 50 20">Layout analysis</span></p>
339
- </div>
340
- <div class="ocrx_block">
341
- <span class="ocrx_word" title="bbox 0 0 50 20">Block content</span>
342
- </div>
343
- </div>"#;
344
-
345
- let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
346
- let (elements, _) = extract_hocr_document(&dom);
347
- let markdown = convert_to_markdown(&elements, true);
348
-
349
- assert!(markdown.contains("Column text"));
350
- assert!(markdown.contains("Layout analysis"));
351
- assert!(markdown.contains("Block content"));
352
- }
353
-
354
- #[test]
355
- fn test_ocr_header_renders_as_italic_not_heading() {
356
- // OcrHeader is a "page running header" (repeated at top of pages),
357
- // NOT a section heading. It must render as italic (*text*), not as # heading.
358
- let hocr = r#"<div class="ocr_page" title="bbox 0 0 1000 1000">
359
- <div class="ocr_header" title="bbox 0 0 1000 50">
360
- <span class="ocr_line" title="bbox 0 0 500 30">
361
- <span class="ocrx_word" title="bbox 0 0 100 30; x_wconf 95">Chapter</span>
362
- <span class="ocrx_word" title="bbox 110 0 200 30; x_wconf 95">One</span>
363
- </span>
364
- </div>
365
- <p class="ocr_par" title="bbox 0 100 900 200">
366
- <span class="ocr_line" title="bbox 0 100 800 130">
367
- <span class="ocrx_word" title="bbox 0 100 50 130; x_wconf 95">Some</span>
368
- <span class="ocrx_word" title="bbox 60 100 120 130; x_wconf 95">body</span>
369
- <span class="ocrx_word" title="bbox 130 100 180 130; x_wconf 95">text</span>
370
- <span class="ocrx_word" title="bbox 190 100 240 130; x_wconf 95">here</span>
371
- </span>
372
- </p>
373
- </div>"#;
374
-
375
- let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
376
- let (elements, _) = extract_hocr_document(&dom);
377
- let markdown = convert_to_markdown(&elements, true);
378
-
379
- // OcrHeader must render as italic
380
- assert!(
381
- markdown.contains("*Chapter One*"),
382
- "OcrHeader should render as italic (*text*), got: {markdown}"
383
- );
384
- // It must NOT render as a markdown heading
385
- assert!(
386
- !markdown.contains("# Chapter One"),
387
- "OcrHeader must NOT render as a markdown heading, got: {markdown}"
388
- );
389
- }
390
-
391
- #[test]
392
- fn test_heading_detection_with_x_fsize_on_line_child() {
393
- // A paragraph containing a single ocr_line child with x_fsize 18 (large font)
394
- // and short capitalized text should be detected as a heading.
395
- let hocr = r#"<div class="ocr_page" title="bbox 0 0 1000 1000">
396
- <div class="ocr_carea" title="bbox 0 0 1000 500">
397
- <p class="ocr_par" title="bbox 0 0 500 40">
398
- <span class="ocr_line" title="bbox 0 0 500 30; x_fsize 18">
399
- <span class="ocrx_word" title="bbox 0 0 120 30; x_wconf 95">Important</span>
400
- <span class="ocrx_word" title="bbox 130 0 250 30; x_wconf 95">Section</span>
401
- <span class="ocrx_word" title="bbox 260 0 350 30; x_wconf 95">Title</span>
402
- </span>
403
- </p>
404
- <p class="ocr_par" title="bbox 0 60 900 200">
405
- <span class="ocr_line" title="bbox 0 60 800 90; x_fsize 12">
406
- <span class="ocrx_word" title="bbox 0 60 50 90; x_wconf 95">This</span>
407
- <span class="ocrx_word" title="bbox 60 60 90 90; x_wconf 92">is</span>
408
- <span class="ocrx_word" title="bbox 100 60 200 90; x_wconf 98">regular</span>
409
- <span class="ocrx_word" title="bbox 210 60 280 90; x_wconf 98">body</span>
410
- <span class="ocrx_word" title="bbox 290 60 340 90; x_wconf 98">text</span>
411
- <span class="ocrx_word" title="bbox 350 60 430 90; x_wconf 98">content.</span>
412
- </span>
413
- </p>
414
- </div>
415
- </div>"#;
416
-
417
- let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
418
- let (elements, _) = extract_hocr_document(&dom);
419
- let markdown = convert_to_markdown(&elements, true);
420
-
421
- // The large-font paragraph should be detected as a heading
422
- assert!(
423
- markdown.contains("# Important Section Title"),
424
- "Large font paragraph should be detected as heading, got: {markdown}"
425
- );
426
- }
427
-
428
- #[test]
429
- fn test_single_word_heading_with_large_font() {
430
- // A single-word paragraph with large font size should be detected as a heading.
431
- // Without font size awareness, single-word paragraphs are rejected.
432
- let hocr = r#"<div class="ocr_page" title="bbox 0 0 1000 1000">
433
- <div class="ocr_carea" title="bbox 0 0 1000 500">
434
- <p class="ocr_par" title="bbox 0 0 300 40">
435
- <span class="ocr_line" title="bbox 0 0 300 30; x_fsize 24">
436
- <span class="ocrx_word" title="bbox 0 0 200 30; x_wconf 95">Introduction</span>
437
- </span>
438
- </p>
439
- <p class="ocr_par" title="bbox 0 60 900 200">
440
- <span class="ocr_line" title="bbox 0 60 800 90; x_fsize 12">
441
- <span class="ocrx_word" title="bbox 0 60 50 90; x_wconf 95">Some</span>
442
- <span class="ocrx_word" title="bbox 60 60 120 90; x_wconf 92">body</span>
443
- <span class="ocrx_word" title="bbox 130 60 180 90; x_wconf 98">text</span>
444
- <span class="ocrx_word" title="bbox 190 60 280 90; x_wconf 98">follows.</span>
445
- </span>
446
- </p>
447
- </div>
448
- </div>"#;
449
-
450
- let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
451
- let (elements, _) = extract_hocr_document(&dom);
452
- let markdown = convert_to_markdown(&elements, true);
453
-
454
- // Single word with large font should be detected as heading
455
- assert!(
456
- markdown.contains("# Introduction"),
457
- "Single word with large font should be detected as heading, got: {markdown}"
458
- );
459
- }
460
-
461
- #[test]
462
- fn test_single_word_without_large_font_not_heading() {
463
- // A single-word paragraph without large font should NOT be detected as heading.
464
- // This ensures we haven't broken the existing behavior.
465
- let hocr = r#"<div class="ocr_page" title="bbox 0 0 1000 1000">
466
- <div class="ocr_carea" title="bbox 0 0 1000 500">
467
- <p class="ocr_par" title="bbox 0 0 300 20">
468
- <span class="ocr_line" title="bbox 0 0 300 12; x_fsize 10">
469
- <span class="ocrx_word" title="bbox 0 0 100 12; x_wconf 95">Word</span>
470
- </span>
471
- </p>
472
- </div>
473
- </div>"#;
474
-
475
- let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
476
- let (elements, _) = extract_hocr_document(&dom);
477
- let markdown = convert_to_markdown(&elements, true);
478
-
479
- // Single word with small font should NOT be a heading
480
- assert!(
481
- !markdown.contains("# Word"),
482
- "Single word with small font should not be detected as heading, got: {markdown}"
483
- );
484
- }
485
-
486
- #[test]
487
- fn test_heading_detection_with_bbox_height_proxy() {
488
- // When x_fsize is absent, bbox height should serve as a font-size proxy.
489
- // A bbox height of 30 pixels (>= 14) indicates large text.
490
- let hocr = r#"<div class="ocr_page" title="bbox 0 0 1000 1000">
491
- <div class="ocr_carea" title="bbox 0 0 1000 500">
492
- <p class="ocr_par" title="bbox 0 0 500 40">
493
- <span class="ocr_line" title="bbox 0 0 500 30">
494
- <span class="ocrx_word" title="bbox 0 0 200 30; x_wconf 95">Summary</span>
495
- </span>
496
- </p>
497
- </div>
498
- </div>"#;
499
-
500
- let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
501
- let (elements, _) = extract_hocr_document(&dom);
502
- let markdown = convert_to_markdown(&elements, true);
503
-
504
- // bbox height of 30 (y2=30 - y1=0) should serve as proxy for large font
505
- assert!(
506
- markdown.contains("# Summary"),
507
- "Single word with tall bbox (height=30) should be detected as heading via bbox proxy, got: {markdown}"
508
- );
509
- }