html-to-markdown 2.30.0 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +6 -19
  3. data/README.md +37 -50
  4. data/ext/html-to-markdown-rb/native/Cargo.lock +13 -701
  5. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
  6. data/ext/html-to-markdown-rb/native/README.md +4 -13
  7. data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
  8. data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
  9. data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
  10. data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
  11. data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
  12. data/lib/html_to_markdown/version.rb +1 -1
  13. data/lib/html_to_markdown.rb +13 -194
  14. data/sig/html_to_markdown.rbs +12 -373
  15. data/vendor/Cargo.toml +6 -3
  16. data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
  17. data/vendor/html-to-markdown-rs/README.md +126 -52
  18. data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
  19. data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
  20. data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
  21. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
  22. data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
  23. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
  24. data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
  25. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
  26. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
  27. data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
  28. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
  29. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
  30. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
  31. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
  32. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
  33. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
  34. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
  35. data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
  36. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
  37. data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
  38. data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
  39. data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
  40. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
  41. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
  42. data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
  43. data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
  44. data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
  45. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
  46. data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
  47. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
  48. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
  49. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
  50. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
  51. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
  52. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
  53. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
  54. data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
  55. data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
  56. data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -68
  57. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
  58. data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
  59. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
  60. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
  61. data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
  62. data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
  63. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
  64. data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
  65. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
  66. data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
  67. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
  68. data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
  69. data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
  70. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
  71. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
  72. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
  73. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
  74. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
  75. data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
  76. data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
  77. data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
  78. data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
  79. data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
  80. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
  81. data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
  82. data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -323
  83. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
  84. data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
  85. data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
  86. data/vendor/html-to-markdown-rs/src/text.rs +25 -14
  87. data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
  88. data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
  89. data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
  90. data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
  91. data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
  92. data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
  93. data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
  94. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
  95. data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
  96. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
  97. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
  98. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
  99. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
  100. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
  101. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
  102. data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
  103. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
  104. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
  105. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
  106. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
  107. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
  108. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
  109. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
  110. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
  111. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
  112. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
  113. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
  114. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
  115. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
  116. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
  117. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
  118. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
  119. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
  120. data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
  121. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
  122. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
  123. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
  124. data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
  125. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
  126. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
  127. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
  128. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
  129. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
  130. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
  131. metadata +9 -37
  132. data/bin/benchmark.rb +0 -232
  133. data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
  134. data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
  135. data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
  136. data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
  137. data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
  138. data/spec/convert_spec.rb +0 -77
  139. data/spec/convert_with_tables_spec.rb +0 -194
  140. data/spec/metadata_extraction_spec.rb +0 -437
  141. data/spec/visitor_issue_187_spec.rb +0 -605
  142. data/spec/visitor_spec.rb +0 -1149
  143. data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
  144. data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
  145. data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
  146. data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
  147. data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
  148. data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
  149. data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
  150. data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
  151. data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
  152. data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -42
  153. data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
  154. data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
  155. data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
  156. data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
  157. data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
  158. data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
  159. data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
  160. data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
  161. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
  162. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
  163. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
  164. data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
  165. data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
  166. data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
@@ -42,7 +42,10 @@ impl PreprocessingPreset {
42
42
  any(feature = "serde", feature = "metadata"),
43
43
  derive(serde::Serialize, serde::Deserialize)
44
44
  )]
45
- #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(rename_all = "camelCase"))]
45
+ #[cfg_attr(
46
+ any(feature = "serde", feature = "metadata"),
47
+ serde(rename_all = "camelCase", deny_unknown_fields)
48
+ )]
46
49
  pub struct PreprocessingOptions {
47
50
  /// Enable HTML preprocessing globally
48
51
  pub enabled: bool,
@@ -67,7 +70,10 @@ pub struct PreprocessingOptions {
67
70
  any(feature = "serde", feature = "metadata"),
68
71
  derive(serde::Serialize, serde::Deserialize)
69
72
  )]
70
- #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(rename_all = "camelCase"))]
73
+ #[cfg_attr(
74
+ any(feature = "serde", feature = "metadata"),
75
+ serde(rename_all = "camelCase", deny_unknown_fields)
76
+ )]
71
77
  pub struct PreprocessingOptionsUpdate {
72
78
  /// Optional global preprocessing enablement override
73
79
  pub enabled: Option<bool>,
@@ -9,18 +9,4 @@
9
9
  pub use crate::convert;
10
10
  pub use crate::error::{ConversionError, Result};
11
11
  pub use crate::options::{ConversionOptions, HeadingStyle};
12
-
13
- #[cfg(feature = "inline-images")]
14
- pub use crate::convert_with_inline_images;
15
-
16
- #[cfg(feature = "metadata")]
17
- pub use crate::convert_with_metadata;
18
-
19
- #[cfg(feature = "visitor")]
20
- pub use crate::convert_with_visitor;
21
-
22
- #[cfg(feature = "visitor")]
23
- pub use crate::{ConversionWithTables, TableData, convert_with_tables};
24
-
25
- #[cfg(feature = "async-visitor")]
26
- pub use crate::convert_with_async_visitor;
12
+ pub use crate::types::ConversionResult;
@@ -67,7 +67,10 @@ pub enum NodeData {
67
67
  /// [dtd wiki]: https://en.wikipedia.org/wiki/Document_type_declaration
68
68
  Doctype {
69
69
  name: StrTendril,
70
+ // Fields required by html5ever's DOM model; not accessed during conversion.
71
+ #[allow(dead_code)]
70
72
  public_id: StrTendril,
73
+ #[allow(dead_code)]
71
74
  system_id: StrTendril,
72
75
  },
73
76
 
@@ -386,7 +389,10 @@ impl TreeSink for RcDom {
386
389
  let previous_parent = child.parent.replace(Some(Rc::downgrade(new_parent)));
387
390
  assert!(Rc::ptr_eq(
388
391
  node,
389
- &previous_parent.unwrap().upgrade().expect("dangling weak")
392
+ &previous_parent
393
+ .expect("invariant: child must have a parent during reparenting")
394
+ .upgrade()
395
+ .expect("dangling weak")
390
396
  ))
391
397
  }
392
398
  new_children.extend(mem::take(&mut *children));
@@ -6,15 +6,17 @@ use std::borrow::Cow;
6
6
  use std::sync::LazyLock;
7
7
 
8
8
  /// Regex for escaping miscellaneous characters
9
- static ESCAPE_MISC_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"([\\&<`\[\]>~#=+|\-])").unwrap());
9
+ static ESCAPE_MISC_RE: LazyLock<Regex> =
10
+ LazyLock::new(|| Regex::new(r"([\\&<`\[\]>~#=+|\-])").expect("valid regex pattern"));
10
11
 
11
12
  /// Regex for escaping numbered lists
12
- static ESCAPE_NUMBERED_LIST_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"([0-9])([.)])").unwrap());
13
+ static ESCAPE_NUMBERED_LIST_RE: LazyLock<Regex> =
14
+ LazyLock::new(|| Regex::new(r"([0-9])([.)])").expect("valid regex pattern"));
13
15
 
14
16
  /// Regex for escaping ASCII punctuation (CommonMark spec example 12)
15
17
  /// Matches: `! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ \` { | } ~`
16
18
  static ESCAPE_ASCII_RE: LazyLock<Regex> =
17
- LazyLock::new(|| Regex::new(r"([!\x22#$%&\x27()*+,\-./:;<=>?@\[\\\]^_`{|}~])").unwrap());
19
+ LazyLock::new(|| Regex::new(r"([!\x22#$%&\x27()*+,\-./:;<=>?@\[\\\]^_`{|}~])").expect("valid regex pattern"));
18
20
 
19
21
  /// Escape Markdown special characters in text.
20
22
  ///
@@ -99,28 +101,37 @@ pub fn escape(
99
101
  }
100
102
  }
101
103
 
102
- let mut result = text.to_string();
104
+ let mut result: Cow<'_, str> = Cow::Borrowed(text);
103
105
 
104
106
  if escape_ascii {
105
- result = ESCAPE_ASCII_RE.replace_all(&result, r"\$1").to_string();
106
- return Cow::Owned(result);
107
+ result = match ESCAPE_ASCII_RE.replace_all(result.as_ref(), r"\$1") {
108
+ Cow::Borrowed(_) => result,
109
+ Cow::Owned(s) => Cow::Owned(s),
110
+ };
111
+ return result;
107
112
  }
108
113
 
109
114
  if escape_misc {
110
- result = ESCAPE_MISC_RE.replace_all(&result, r"\$1").to_string();
111
-
112
- result = ESCAPE_NUMBERED_LIST_RE.replace_all(&result, r"$1\$2").to_string();
115
+ result = match ESCAPE_MISC_RE.replace_all(result.as_ref(), r"\$1") {
116
+ Cow::Borrowed(_) => result,
117
+ Cow::Owned(s) => Cow::Owned(s),
118
+ };
119
+
120
+ result = match ESCAPE_NUMBERED_LIST_RE.replace_all(result.as_ref(), r"$1\$2") {
121
+ Cow::Borrowed(_) => result,
122
+ Cow::Owned(s) => Cow::Owned(s),
123
+ };
113
124
  }
114
125
 
115
- if escape_asterisks {
116
- result = result.replace('*', r"\*");
126
+ if escape_asterisks && result.contains('*') {
127
+ result = Cow::Owned(result.replace('*', r"\*"));
117
128
  }
118
129
 
119
- if escape_underscores {
120
- result = result.replace('_', r"\_");
130
+ if escape_underscores && result.contains('_') {
131
+ result = Cow::Owned(result.replace('_', r"\_"));
121
132
  }
122
133
 
123
- Cow::Owned(result)
134
+ result
124
135
  }
125
136
 
126
137
  /// Extract boundary whitespace from text (chomp).
@@ -0,0 +1,175 @@
1
+ //! Structured document tree types aligned with kreuzberg's `DocumentStructure`.
2
+
3
+ use std::collections::HashMap;
4
+
5
+ use serde::{Deserialize, Serialize};
6
+
7
+ use super::tables::TableGrid;
8
+
9
+ /// A structured document tree representing the semantic content of an HTML document.
10
+ ///
11
+ /// Uses a flat node array with index-based parent/child references for efficient traversal.
12
+ #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
13
+ pub struct DocumentStructure {
14
+ /// All nodes in document reading order.
15
+ pub nodes: Vec<DocumentNode>,
16
+ /// The source format (always "html" for this crate).
17
+ #[serde(skip_serializing_if = "Option::is_none")]
18
+ pub source_format: Option<String>,
19
+ }
20
+
21
+ /// A single node in the document tree.
22
+ #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
23
+ pub struct DocumentNode {
24
+ /// Deterministic node identifier.
25
+ pub id: String,
26
+ /// The semantic content of this node.
27
+ pub content: NodeContent,
28
+ /// Index of the parent node (None for root nodes).
29
+ #[serde(skip_serializing_if = "Option::is_none")]
30
+ pub parent: Option<u32>,
31
+ /// Indices of child nodes in reading order.
32
+ #[serde(skip_serializing_if = "Vec::is_empty", default)]
33
+ pub children: Vec<u32>,
34
+ /// Inline formatting annotations (bold, italic, links, etc.) with byte offsets into the text.
35
+ #[serde(skip_serializing_if = "Vec::is_empty", default)]
36
+ pub annotations: Vec<TextAnnotation>,
37
+ /// Format-specific attributes (e.g. class, id, data-* attributes).
38
+ #[serde(skip_serializing_if = "Option::is_none")]
39
+ pub attributes: Option<HashMap<String, String>>,
40
+ }
41
+
42
+ /// The semantic content type of a document node.
43
+ ///
44
+ /// Uses internally tagged representation (`"node_type": "heading"`) for JSON serialization.
45
+ #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
46
+ #[serde(tag = "node_type", rename_all = "snake_case")]
47
+ pub enum NodeContent {
48
+ /// A heading element (h1-h6).
49
+ Heading {
50
+ /// Heading level (1-6).
51
+ level: u8,
52
+ /// The heading text content.
53
+ text: String,
54
+ },
55
+ /// A paragraph of text.
56
+ Paragraph {
57
+ /// The paragraph text content.
58
+ text: String,
59
+ },
60
+ /// A list container (ordered or unordered). Children are `ListItem` nodes.
61
+ List {
62
+ /// Whether this is an ordered list.
63
+ ordered: bool,
64
+ },
65
+ /// A single list item.
66
+ ListItem {
67
+ /// The list item text content.
68
+ text: String,
69
+ },
70
+ /// A table with structured cell data.
71
+ Table {
72
+ /// The table grid structure.
73
+ grid: TableGrid,
74
+ },
75
+ /// An image element.
76
+ Image {
77
+ /// Alt text or caption.
78
+ #[serde(skip_serializing_if = "Option::is_none")]
79
+ description: Option<String>,
80
+ /// Image source URL.
81
+ #[serde(skip_serializing_if = "Option::is_none")]
82
+ src: Option<String>,
83
+ /// Index into `ConversionResult.images` when image extraction is enabled.
84
+ #[serde(skip_serializing_if = "Option::is_none")]
85
+ image_index: Option<u32>,
86
+ },
87
+ /// A code block or inline code.
88
+ Code {
89
+ /// The code text content.
90
+ text: String,
91
+ /// Programming language (from class="language-*" or similar).
92
+ #[serde(skip_serializing_if = "Option::is_none")]
93
+ language: Option<String>,
94
+ },
95
+ /// A block quote container.
96
+ Quote,
97
+ /// A definition list container.
98
+ DefinitionList,
99
+ /// A definition list entry with term and description.
100
+ DefinitionItem {
101
+ /// The term being defined.
102
+ term: String,
103
+ /// The definition text.
104
+ definition: String,
105
+ },
106
+ /// A raw block preserved as-is (e.g. `<script>`, `<style>` content).
107
+ RawBlock {
108
+ /// The format of the raw content (e.g. "html", "css", "javascript").
109
+ format: String,
110
+ /// The raw content.
111
+ content: String,
112
+ },
113
+ /// A block of key-value metadata pairs (from `<head>` meta tags).
114
+ MetadataBlock {
115
+ /// Key-value metadata pairs.
116
+ entries: Vec<(String, String)>,
117
+ },
118
+ /// A section grouping container (auto-generated from heading hierarchy).
119
+ Group {
120
+ /// Optional section label.
121
+ #[serde(skip_serializing_if = "Option::is_none")]
122
+ label: Option<String>,
123
+ /// The heading level that created this group.
124
+ #[serde(skip_serializing_if = "Option::is_none")]
125
+ heading_level: Option<u8>,
126
+ /// The heading text that created this group.
127
+ #[serde(skip_serializing_if = "Option::is_none")]
128
+ heading_text: Option<String>,
129
+ },
130
+ }
131
+
132
+ /// An inline text annotation with byte-range offsets.
133
+ ///
134
+ /// Annotations describe formatting (bold, italic, etc.) and links within a node's text content.
135
+ #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
136
+ pub struct TextAnnotation {
137
+ /// Start byte offset (inclusive) into the parent node's text.
138
+ pub start: u32,
139
+ /// End byte offset (exclusive) into the parent node's text.
140
+ pub end: u32,
141
+ /// The type of annotation.
142
+ pub kind: AnnotationKind,
143
+ }
144
+
145
+ /// The type of an inline text annotation.
146
+ ///
147
+ /// Uses internally tagged representation (`"annotation_type": "bold"`) for JSON serialization.
148
+ #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
149
+ #[serde(tag = "annotation_type", rename_all = "snake_case")]
150
+ pub enum AnnotationKind {
151
+ /// Bold / strong emphasis.
152
+ Bold,
153
+ /// Italic / emphasis.
154
+ Italic,
155
+ /// Underline.
156
+ Underline,
157
+ /// Strikethrough / deleted text.
158
+ Strikethrough,
159
+ /// Inline code.
160
+ Code,
161
+ /// Subscript text.
162
+ Subscript,
163
+ /// Superscript text.
164
+ Superscript,
165
+ /// Highlighted / marked text.
166
+ Highlight,
167
+ /// A hyperlink.
168
+ Link {
169
+ /// The link URL.
170
+ url: String,
171
+ /// Optional link title attribute.
172
+ #[serde(skip_serializing_if = "Option::is_none")]
173
+ title: Option<String>,
174
+ },
175
+ }
@@ -0,0 +1,17 @@
1
+ //! Core types for structured HTML extraction results.
2
+ //!
3
+ //! These types are aligned with kreuzberg's `DocumentStructure` model for seamless integration.
4
+
5
+ mod document;
6
+ mod result;
7
+ pub mod structure_builder;
8
+ pub mod structure_collector;
9
+ mod tables;
10
+ mod warnings;
11
+
12
+ pub use document::{AnnotationKind, DocumentNode, DocumentStructure, NodeContent, TextAnnotation};
13
+ pub use result::ConversionResult;
14
+ pub use structure_builder::build_document_structure;
15
+ pub use structure_collector::{StructureCollector, StructureCollectorHandle};
16
+ pub use tables::{GridCell, TableData, TableGrid};
17
+ pub use warnings::{ProcessingWarning, WarningKind};
@@ -0,0 +1,49 @@
1
+ //! The primary result type for HTML conversion and extraction.
2
+
3
+ use super::document::DocumentStructure;
4
+ use super::tables::TableData;
5
+ use super::warnings::ProcessingWarning;
6
+
7
+ /// The primary result of HTML conversion and extraction.
8
+ ///
9
+ /// Contains the converted text output, optional structured document tree,
10
+ /// metadata, extracted tables, images, and processing warnings.
11
+ ///
12
+ /// # Example
13
+ ///
14
+ /// ```rust,ignore
15
+ /// use html_to_markdown_rs::{convert, ConversionOptions};
16
+ ///
17
+ /// let result = convert("<h1>Hello</h1><p>World</p>", None)?;
18
+ /// assert!(result.content.is_some());
19
+ /// assert!(result.warnings.is_empty());
20
+ /// ```
21
+ #[derive(Debug, Clone, Default)]
22
+ pub struct ConversionResult {
23
+ /// Converted text output (markdown, djot, or plain text).
24
+ ///
25
+ /// `None` when `output_format` is set to `OutputFormat::None`,
26
+ /// indicating extraction-only mode.
27
+ pub content: Option<String>,
28
+
29
+ /// Structured document tree with semantic elements.
30
+ ///
31
+ /// Populated when `include_document_structure` is `true` in options.
32
+ pub document: Option<DocumentStructure>,
33
+
34
+ /// Extracted HTML metadata (title, OG, links, images, structured data).
35
+ #[cfg(feature = "metadata")]
36
+ pub metadata: crate::metadata::HtmlMetadata,
37
+
38
+ /// Extracted tables with structured cell data and markdown representation.
39
+ pub tables: Vec<TableData>,
40
+
41
+ /// Extracted inline images (data URIs and SVGs).
42
+ ///
43
+ /// Populated when `extract_images` is `true` in options.
44
+ #[cfg(feature = "inline-images")]
45
+ pub images: Vec<crate::inline_images::InlineImage>,
46
+
47
+ /// Non-fatal processing warnings.
48
+ pub warnings: Vec<ProcessingWarning>,
49
+ }