html-to-markdown 3.4.0 → 3.6.0.pre.rc.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE +21 -0
  3. data/README.md +347 -0
  4. data/Steepfile +10 -2
  5. data/ext/html_to_markdown_rb/Cargo.toml +3 -2
  6. data/ext/html_to_markdown_rb/extconf.rb +5 -5
  7. data/ext/html_to_markdown_rb/native/Cargo.lock +962 -0
  8. data/ext/html_to_markdown_rb/native/Cargo.toml +6 -11
  9. data/ext/html_to_markdown_rb/native/extconf.rb +14 -0
  10. data/ext/html_to_markdown_rb/src/lib.rs +1715 -646
  11. data/lib/html_to_markdown/native.rb +913 -37
  12. data/lib/html_to_markdown/version.rb +3 -3
  13. data/lib/html_to_markdown.rb +9 -4
  14. data/lib/html_to_markdown_rb.so +0 -0
  15. data/sig/types.rbs +59 -292
  16. metadata +32 -179
  17. data/ext/html_to_markdown_rb/Makefile +0 -592
  18. data/lib/bin/html-to-markdown +0 -0
  19. data/vendor/Cargo.toml +0 -33
  20. data/vendor/html-to-markdown-rs/Cargo.toml +0 -54
  21. data/vendor/html-to-markdown-rs/README.md +0 -278
  22. data/vendor/html-to-markdown-rs/examples/basic.rs +0 -24
  23. data/vendor/html-to-markdown-rs/examples/table.rs +0 -25
  24. data/vendor/html-to-markdown-rs/examples/test_deser.rs +0 -12
  25. data/vendor/html-to-markdown-rs/examples/test_escape.rs +0 -58
  26. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +0 -113
  27. data/vendor/html-to-markdown-rs/examples/test_lists.rs +0 -39
  28. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +0 -89
  29. data/vendor/html-to-markdown-rs/examples/test_tables.rs +0 -100
  30. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +0 -61
  31. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +0 -34
  32. data/vendor/html-to-markdown-rs/src/convert_api.rs +0 -349
  33. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +0 -178
  34. data/vendor/html-to-markdown-rs/src/converter/block/container.rs +0 -114
  35. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +0 -149
  36. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +0 -428
  37. data/vendor/html-to-markdown-rs/src/converter/block/horizontal_rule.rs +0 -103
  38. data/vendor/html-to-markdown-rs/src/converter/block/line_break.rs +0 -89
  39. data/vendor/html-to-markdown-rs/src/converter/block/mod.rs +0 -10
  40. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +0 -140
  41. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +0 -298
  42. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +0 -453
  43. data/vendor/html-to-markdown-rs/src/converter/block/table/caption.rs +0 -44
  44. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +0 -276
  45. data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +0 -336
  46. data/vendor/html-to-markdown-rs/src/converter/block/table/layout.rs +0 -58
  47. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +0 -266
  48. data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +0 -146
  49. data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +0 -34
  50. data/vendor/html-to-markdown-rs/src/converter/block/unknown.rs +0 -138
  51. data/vendor/html-to-markdown-rs/src/converter/context.rs +0 -208
  52. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +0 -337
  53. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +0 -770
  54. data/vendor/html-to-markdown-rs/src/converter/form/mod.rs +0 -82
  55. data/vendor/html-to-markdown-rs/src/converter/format/djot.rs +0 -64
  56. data/vendor/html-to-markdown-rs/src/converter/format/markdown.rs +0 -59
  57. data/vendor/html-to-markdown-rs/src/converter/format/mod.rs +0 -43
  58. data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +0 -173
  59. data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +0 -434
  60. data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +0 -234
  61. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +0 -282
  62. data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +0 -316
  63. data/vendor/html-to-markdown-rs/src/converter/handlers/mod.rs +0 -26
  64. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +0 -306
  65. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +0 -345
  66. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +0 -428
  67. data/vendor/html-to-markdown-rs/src/converter/inline/mod.rs +0 -237
  68. data/vendor/html-to-markdown-rs/src/converter/inline/ruby.rs +0 -337
  69. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +0 -566
  70. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/mod.rs +0 -86
  71. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/typography.rs +0 -558
  72. data/vendor/html-to-markdown-rs/src/converter/list/definition.rs +0 -232
  73. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +0 -332
  74. data/vendor/html-to-markdown-rs/src/converter/list/mod.rs +0 -70
  75. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +0 -201
  76. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +0 -195
  77. data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +0 -314
  78. data/vendor/html-to-markdown-rs/src/converter/main.rs +0 -710
  79. data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +0 -452
  80. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +0 -393
  81. data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +0 -4
  82. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -183
  83. data/vendor/html-to-markdown-rs/src/converter/media/mod.rs +0 -87
  84. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +0 -280
  85. data/vendor/html-to-markdown-rs/src/converter/metadata.rs +0 -220
  86. data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -156
  87. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +0 -516
  88. data/vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs +0 -201
  89. data/vendor/html-to-markdown-rs/src/converter/reference_collector.rs +0 -69
  90. data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +0 -269
  91. data/vendor/html-to-markdown-rs/src/converter/semantic/definition_list.rs +0 -266
  92. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +0 -391
  93. data/vendor/html-to-markdown-rs/src/converter/semantic/mod.rs +0 -112
  94. data/vendor/html-to-markdown-rs/src/converter/semantic/sectioning.rs +0 -85
  95. data/vendor/html-to-markdown-rs/src/converter/semantic/summary.rs +0 -324
  96. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +0 -8
  97. data/vendor/html-to-markdown-rs/src/converter/text/processing.rs +0 -56
  98. data/vendor/html-to-markdown-rs/src/converter/text_node.rs +0 -269
  99. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -151
  100. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +0 -74
  101. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +0 -271
  102. data/vendor/html-to-markdown-rs/src/converter/utility/mod.rs +0 -17
  103. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +0 -1002
  104. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +0 -126
  105. data/vendor/html-to-markdown-rs/src/converter/utility/siblings.rs +0 -97
  106. data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +0 -189
  107. data/vendor/html-to-markdown-rs/src/error.rs +0 -43
  108. data/vendor/html-to-markdown-rs/src/exports.rs +0 -24
  109. data/vendor/html-to-markdown-rs/src/inline_images.rs +0 -336
  110. data/vendor/html-to-markdown-rs/src/lib.rs +0 -139
  111. data/vendor/html-to-markdown-rs/src/metadata/collector.rs +0 -457
  112. data/vendor/html-to-markdown-rs/src/metadata/config.rs +0 -394
  113. data/vendor/html-to-markdown-rs/src/metadata/extraction.rs +0 -398
  114. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +0 -288
  115. data/vendor/html-to-markdown-rs/src/metadata/types.rs +0 -477
  116. data/vendor/html-to-markdown-rs/src/options/conversion.rs +0 -559
  117. data/vendor/html-to-markdown-rs/src/options/inline_image.rs +0 -111
  118. data/vendor/html-to-markdown-rs/src/options/mod.rs +0 -20
  119. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +0 -201
  120. data/vendor/html-to-markdown-rs/src/options/validation.rs +0 -416
  121. data/vendor/html-to-markdown-rs/src/prelude.rs +0 -1
  122. data/vendor/html-to-markdown-rs/src/rcdom.rs +0 -487
  123. data/vendor/html-to-markdown-rs/src/text.rs +0 -358
  124. data/vendor/html-to-markdown-rs/src/types/document.rs +0 -191
  125. data/vendor/html-to-markdown-rs/src/types/mod.rs +0 -17
  126. data/vendor/html-to-markdown-rs/src/types/result.rs +0 -54
  127. data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +0 -791
  128. data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +0 -483
  129. data/vendor/html-to-markdown-rs/src/types/tables.rs +0 -52
  130. data/vendor/html-to-markdown-rs/src/types/warnings.rs +0 -33
  131. data/vendor/html-to-markdown-rs/src/validation.rs +0 -158
  132. data/vendor/html-to-markdown-rs/src/visitor/default_impl.rs +0 -63
  133. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -41
  134. data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -370
  135. data/vendor/html-to-markdown-rs/src/visitor/types.rs +0 -319
  136. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +0 -1
  137. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/content.rs +0 -126
  138. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -27
  139. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/state.rs +0 -110
  140. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/traversal.rs +0 -250
  141. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +0 -597
  142. data/vendor/html-to-markdown-rs/src/wrapper/sync.rs +0 -413
  143. data/vendor/html-to-markdown-rs/src/wrapper/utils.rs +0 -290
  144. data/vendor/html-to-markdown-rs/src/wrapper.rs +0 -9
  145. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +0 -87
  146. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +0 -297
  147. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +0 -153
  148. data/vendor/html-to-markdown-rs/tests/exclude_selectors_test.rs +0 -132
  149. data/vendor/html-to-markdown-rs/tests/integration_test.rs +0 -631
  150. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +0 -49
  151. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +0 -58
  152. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +0 -17
  153. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +0 -41
  154. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +0 -40
  155. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +0 -26
  156. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +0 -185
  157. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +0 -100
  158. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +0 -133
  159. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +0 -144
  160. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +0 -62
  161. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +0 -128
  162. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +0 -20
  163. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +0 -62
  164. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +0 -68
  165. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +0 -87
  166. data/vendor/html-to-markdown-rs/tests/issue_336_regressions.rs +0 -74
  167. data/vendor/html-to-markdown-rs/tests/issue_339_regressions.rs +0 -92
  168. data/vendor/html-to-markdown-rs/tests/issue_347_regressions.rs +0 -154
  169. data/vendor/html-to-markdown-rs/tests/issue_348_visitor_plain.rs +0 -93
  170. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +0 -44
  171. data/vendor/html-to-markdown-rs/tests/lists_test.rs +0 -199
  172. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +0 -273
  173. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +0 -61
  174. data/vendor/html-to-markdown-rs/tests/reference_links_test.rs +0 -169
  175. data/vendor/html-to-markdown-rs/tests/sectioning_elements_test.rs +0 -137
  176. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +0 -522
  177. data/vendor/html-to-markdown-rs/tests/tables_test.rs +0 -743
  178. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +0 -41
  179. data/vendor/html-to-markdown-rs/tests/test_issue_187.rs +0 -204
  180. data/vendor/html-to-markdown-rs/tests/test_issue_218.rs +0 -68
  181. data/vendor/html-to-markdown-rs/tests/test_issue_277.rs +0 -77
  182. data/vendor/html-to-markdown-rs/tests/test_max_depth.rs +0 -82
  183. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +0 -45
  184. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +0 -396
  185. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +0 -34
  186. data/vendor/html-to-markdown-rs/tests/visitor_code_integration_test.rs +0 -121
  187. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +0 -1190
  188. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +0 -372
@@ -1,280 +0,0 @@
1
- //! SVG and MathML element handling with serialization and base64 encoding.
2
-
3
- use crate::converter::main_helpers::tag_name_eq;
4
- use crate::converter::utility::content::normalized_tag_name;
5
- #[allow(unused_imports)]
6
- use std::collections::BTreeMap;
7
- use tl::{NodeHandle, Parser};
8
-
9
- #[cfg(feature = "inline-images")]
10
- use crate::inline_images::{InlineImageCollector, InlineImageFormat, InlineImageSource};
11
-
12
- #[cfg(feature = "inline-images")]
13
- type InlineCollectorHandle = std::rc::Rc<std::cell::RefCell<InlineImageCollector>>;
14
-
15
/// Capture an inline `<svg>` element into the inline-image collector.
///
/// The element is serialized back to markup, checked against the collector's
/// maximum decoded size, and handed to the collector as an SVG image together
/// with the attributes supplied by the caller. Every rejection path records a
/// skip warning on the collector and returns.
///
/// # Arguments
/// - `collector_ref`: shared collector that receives either the image or a warning
/// - `node_handle` / `parser`: identify the SVG element to serialize
/// - `title_opt`: fallback description, used when `aria-label` is absent or blank
/// - `attributes`: forwarded to the collected image; also consulted for
///   `aria-label`, `data-filename`, `filename` and `data-name`
#[cfg(feature = "inline-images")]
#[allow(clippy::trivially_copy_pass_by_ref)]
#[allow(clippy::needless_pass_by_value)]
#[allow(clippy::option_if_let_else)]
pub fn handle_inline_svg(
    collector_ref: &InlineCollectorHandle,
    node_handle: &NodeHandle,
    parser: &Parser,
    title_opt: Option<String>,
    attributes: BTreeMap<String, String>,
) {
    // Read the settings under a short-lived shared borrow so that it is
    // released before the mutable borrow below is taken.
    let size_limit = {
        let settings = collector_ref.borrow();
        if !settings.capture_svg() {
            return;
        }
        settings.max_decoded_size()
    };

    // An index is reserved for every SVG that reaches this point, including
    // the ones that end up being skipped.
    let mut sink = collector_ref.borrow_mut();
    let slot = sink.next_index();

    if size_limit == 0 {
        sink.warn_skip(slot, "max SVG payload size is zero");
        return;
    }

    let markup = serialize_element(node_handle, parser);
    if markup.is_empty() {
        sink.warn_skip(slot, "unable to serialize SVG element");
        return;
    }

    let payload = markup.into_bytes();
    let payload_len = payload.len();
    if payload_len as u64 > size_limit {
        sink.warn_skip(
            slot,
            format!(
                "serialized SVG payload ({} bytes) exceeds configured max ({})",
                payload_len, size_limit
            ),
        );
        return;
    }

    // Prefer a non-blank `aria-label`; otherwise fall back to a non-blank title.
    let description = attributes
        .get("aria-label")
        .map(String::as_str)
        .and_then(non_empty_trimmed)
        .or_else(|| title_opt.as_deref().and_then(non_empty_trimmed));

    // The first attribute that is present wins, in this priority order.
    let filename_candidate = ["data-filename", "filename", "data-name"]
        .iter()
        .find_map(|name| attributes.get(*name).cloned());

    let image = sink.build_image(
        payload,
        InlineImageFormat::Svg,
        filename_candidate,
        description,
        None,
        InlineImageSource::SvgElement,
        attributes,
    );

    sink.push_image(slot, image);
}
93
-
94
- /// Serialize an element to HTML string (for SVG and Math elements).
95
- #[allow(clippy::trivially_copy_pass_by_ref)]
96
- pub fn serialize_element(node_handle: &NodeHandle, parser: &Parser) -> String {
97
- if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
98
- let tag_name = normalized_tag_name(tag.name().as_utf8_str());
99
- let mut html = String::with_capacity(256);
100
- html.push('<');
101
- html.push_str(&tag_name);
102
-
103
- for (key, value_opt) in tag.attributes().iter() {
104
- html.push(' ');
105
- html.push_str(&key);
106
- if let Some(value) = value_opt {
107
- html.push_str("=\"");
108
- html.push_str(&value);
109
- html.push('"');
110
- }
111
- }
112
-
113
- let has_children = !tag.children().top().is_empty();
114
- if has_children {
115
- html.push('>');
116
- let children = tag.children();
117
- {
118
- for child_handle in children.top().iter() {
119
- html.push_str(&serialize_node(child_handle, parser));
120
- }
121
- }
122
- html.push_str("</");
123
- html.push_str(&tag_name);
124
- html.push('>');
125
- } else {
126
- html.push_str(" />");
127
- }
128
- return html;
129
- }
130
- String::new()
131
- }
132
-
133
- /// Serialize a node to HTML string.
134
- #[allow(clippy::trivially_copy_pass_by_ref)]
135
- pub fn serialize_node(node_handle: &NodeHandle, parser: &Parser) -> String {
136
- if let Some(node) = node_handle.get(parser) {
137
- match node {
138
- tl::Node::Raw(bytes) => bytes.as_utf8_str().to_string(),
139
- tl::Node::Tag(_) => serialize_element(node_handle, parser),
140
- _ => String::new(),
141
- }
142
- } else {
143
- String::new()
144
- }
145
- }
146
-
147
/// Trim `value` and return it as an owned string, or `None` when nothing
/// but whitespace remains.
#[cfg(feature = "inline-images")]
fn non_empty_trimmed(value: &str) -> Option<String> {
    Some(value.trim())
        .filter(|candidate| !candidate.is_empty())
        .map(str::to_owned)
}
157
-
158
- /// Handle SVG element conversion to Markdown.
159
- ///
160
- /// Extracts title from child elements, handles inline image collection,
161
- /// and outputs either the title text (in inline mode) or a base64-encoded image.
162
- #[allow(clippy::too_many_arguments)]
163
- pub fn handle_svg(
164
- node_handle: &NodeHandle,
165
- tag: &tl::HTMLTag,
166
- parser: &Parser,
167
- output: &mut String,
168
- options: &crate::options::ConversionOptions,
169
- ctx: &super::Context,
170
- _depth: usize,
171
- dom_ctx: &super::DomContext,
172
- ) {
173
- use crate::converter::utility::content::get_text_content;
174
-
175
- let mut title = String::from("SVG Image");
176
- let children = tag.children();
177
- for child_handle in children.top().iter() {
178
- if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
179
- if tag_name_eq(child_tag.name().as_utf8_str(), "title") {
180
- title = get_text_content(child_handle, parser, dom_ctx).trim().to_string();
181
- break;
182
- }
183
- }
184
- }
185
-
186
- #[cfg(feature = "inline-images")]
187
- if let Some(ref collector_ref) = ctx.inline_collector {
188
- let title_opt = if title == "SVG Image" {
189
- None
190
- } else {
191
- Some(title.clone())
192
- };
193
- let mut attributes_map = BTreeMap::new();
194
- for (key, value_opt) in tag.attributes().iter() {
195
- let key_str = key.to_string();
196
- let keep = key_str == "width"
197
- || key_str == "height"
198
- || key_str == "filename"
199
- || key_str == "aria-label"
200
- || key_str.starts_with("data-");
201
- if keep {
202
- let value = value_opt.map(|value| value.to_string()).unwrap_or_default();
203
- attributes_map.insert(key_str, value);
204
- }
205
- }
206
- handle_inline_svg(collector_ref, node_handle, parser, title_opt, attributes_map);
207
- }
208
-
209
- if options.skip_images {
210
- return;
211
- }
212
-
213
- if ctx.convert_as_inline {
214
- output.push_str(&title);
215
- } else {
216
- use base64::{Engine as _, engine::general_purpose::STANDARD};
217
-
218
- let svg_html = serialize_element(node_handle, parser);
219
- let base64_svg = STANDARD.encode(svg_html.as_bytes());
220
-
221
- output.push_str("![");
222
- output.push_str(&title);
223
- output.push_str("](data:image/svg+xml;base64,");
224
- output.push_str(&base64_svg);
225
- output.push(')');
226
- }
227
- }
228
-
229
- /// Handle MathML element conversion to Markdown.
230
- ///
231
- /// Serializes MathML to HTML comment and outputs text content with escaping.
232
- #[allow(clippy::too_many_arguments)]
233
- pub fn handle_math(
234
- node_handle: &NodeHandle,
235
- tag: &tl::HTMLTag,
236
- parser: &Parser,
237
- output: &mut String,
238
- options: &crate::options::ConversionOptions,
239
- ctx: &super::Context,
240
- _depth: usize,
241
- dom_ctx: &super::DomContext,
242
- ) {
243
- use crate::converter::utility::content::get_text_content;
244
- use crate::text;
245
-
246
- let text_content = get_text_content(node_handle, parser, dom_ctx).trim().to_string();
247
-
248
- if text_content.is_empty() {
249
- return;
250
- }
251
-
252
- let math_html = serialize_element(node_handle, parser);
253
-
254
- let escaped_text = text::escape(
255
- &text_content,
256
- options.escape_misc,
257
- options.escape_asterisks,
258
- options.escape_underscores,
259
- options.escape_ascii,
260
- );
261
-
262
- let is_display_block = tag
263
- .attributes()
264
- .get("display")
265
- .flatten()
266
- .is_some_and(|v| v.as_utf8_str() == "block");
267
-
268
- if is_display_block && !ctx.in_paragraph && !ctx.convert_as_inline {
269
- output.push_str("\n\n");
270
- }
271
-
272
- output.push_str("<!-- MathML: ");
273
- output.push_str(&math_html);
274
- output.push_str(" --> ");
275
- output.push_str(&escaped_text);
276
-
277
- if is_display_block && !ctx.in_paragraph && !ctx.convert_as_inline {
278
- output.push_str("\n\n");
279
- }
280
- }
@@ -1,220 +0,0 @@
1
- //! Handler for metadata and script elements (head, script, style, math).
2
- //!
3
- //! Converts various metadata-related elements:
4
- //! - **head**: Document metadata container; processes script[type="application/ld+json"]
5
- //! - **script**: Script elements; extracts JSON-LD structured data when appropriate
6
- //! - **style**: CSS stylesheet elements; skipped in conversion
7
- //! - **math**: MathML elements with serialization and HTML comments for preservation
8
-
9
- use crate::converter::media::svg::serialize_element;
10
- use crate::options::ConversionOptions;
11
- #[cfg(feature = "metadata")]
12
- use crate::text::decode_html_entities;
13
- use crate::text::escape;
14
- use tl::{NodeHandle, Parser};
15
-
16
- // Type aliases for Context and DomContext to avoid circular imports
17
- type Context = crate::converter::Context;
18
- type DomContext = crate::converter::DomContext;
19
-
20
- /// Handles metadata elements: head, script, style, math.
21
- ///
22
- /// Processes various metadata-related elements:
23
- /// - head: Scans for structured data in script[type="application/ld+json"]
24
- /// - script: Extracts JSON-LD for structured data collection
25
- /// - style: Skipped (CSS not relevant in markdown)
26
- /// - math: Preserves MathML as HTML comments with text content
27
- pub fn handle(
28
- tag_name: &str,
29
- node_handle: &NodeHandle,
30
- parser: &Parser,
31
- output: &mut String,
32
- options: &ConversionOptions,
33
- ctx: &Context,
34
- depth: usize,
35
- dom_ctx: &DomContext,
36
- ) {
37
- match tag_name {
38
- "head" => {
39
- handle_head(node_handle, parser, output, options, ctx, depth, dom_ctx);
40
- }
41
- "script" => {
42
- handle_script(node_handle, parser, output, options, ctx);
43
- }
44
- "style" => {
45
- // Style elements are skipped - no output
46
- }
47
- "math" => {
48
- handle_math(node_handle, parser, output, options, ctx, dom_ctx);
49
- }
50
- _ => {}
51
- }
52
- }
53
-
54
- /// Handle head element.
55
- ///
56
- /// Head elements contain metadata. We process them to extract structured data from
57
- /// nested script[type="application/ld+json"] elements if metadata collection is enabled.
58
- fn handle_head(
59
- node_handle: &NodeHandle,
60
- parser: &Parser,
61
- output: &mut String,
62
- options: &ConversionOptions,
63
- ctx: &Context,
64
- depth: usize,
65
- dom_ctx: &DomContext,
66
- ) {
67
- use crate::converter::walk_node;
68
-
69
- let Some(node) = node_handle.get(parser) else { return };
70
-
71
- let tag = match node {
72
- tl::Node::Tag(tag) => tag,
73
- _ => return,
74
- };
75
-
76
- let children = tag.children();
77
- let has_body_like = children.top().iter().any(|child_handle| {
78
- if let Some(child_name) = dom_ctx.tag_name_for(*child_handle, parser) {
79
- matches!(
80
- child_name.as_ref(),
81
- "body" | "main" | "article" | "section" | "div" | "p"
82
- )
83
- } else {
84
- false
85
- }
86
- });
87
-
88
- #[cfg(feature = "metadata")]
89
- if ctx.metadata_wants_structured_data {
90
- if let Some(ref collector) = ctx.metadata_collector {
91
- for child_handle in children.top().iter() {
92
- if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
93
- let child_name = dom_ctx
94
- .tag_name_for(*child_handle, parser)
95
- .unwrap_or_else(|| crate::converter::normalized_tag_name(child_tag.name().as_utf8_str()));
96
- if child_name.as_ref() == "script" {
97
- if let Some(type_attr) = child_tag.attributes().get("type").flatten() {
98
- let type_value = type_attr.as_utf8_str();
99
- let type_value = type_value.as_ref();
100
- let type_value = type_value.split(';').next().unwrap_or(type_value);
101
- if type_value.trim().eq_ignore_ascii_case("application/ld+json") {
102
- let json = child_tag.inner_text(parser);
103
- let json = json.trim();
104
- if !json.is_empty() {
105
- let json = decode_html_entities(json);
106
- if !json.is_empty() {
107
- collector.borrow_mut().add_json_ld(json);
108
- }
109
- }
110
- }
111
- }
112
- }
113
- }
114
- }
115
- }
116
- }
117
-
118
- // If head contains body-like elements (malformed HTML), process them
119
- if has_body_like {
120
- for child_handle in children.top().iter() {
121
- walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
122
- }
123
- }
124
- }
125
-
126
- /// Handle script element.
127
- ///
128
- /// Script elements are processed to extract JSON-LD structured data when
129
- /// the type is "application/ld+json" and metadata collection is enabled.
130
- #[cfg_attr(not(feature = "metadata"), allow(unused_variables))]
131
- fn handle_script(
132
- node_handle: &NodeHandle,
133
- parser: &Parser,
134
- _output: &mut String,
135
- _options: &ConversionOptions,
136
- ctx: &Context,
137
- ) {
138
- let Some(node) = node_handle.get(parser) else { return };
139
-
140
- let tag = match node {
141
- tl::Node::Tag(tag) => tag,
142
- _ => return,
143
- };
144
-
145
- #[cfg(feature = "metadata")]
146
- if let Some(type_attr) = tag.attributes().get("type").flatten() {
147
- let type_value = type_attr.as_utf8_str();
148
- let type_value = type_value.as_ref();
149
- let type_value = type_value.split(';').next().unwrap_or(type_value);
150
- if type_value.trim().eq_ignore_ascii_case("application/ld+json") && ctx.metadata_wants_structured_data {
151
- if let Some(ref collector) = ctx.metadata_collector {
152
- let json = tag.inner_text(parser);
153
- let json = json.trim();
154
- if !json.is_empty() {
155
- let json = decode_html_entities(json);
156
- if !json.is_empty() {
157
- collector.borrow_mut().add_json_ld(json);
158
- }
159
- }
160
- }
161
- }
162
- }
163
- }
164
-
165
- /// Handle math element.
166
- ///
167
- /// MathML elements are serialized to HTML and wrapped in a comment to preserve them.
168
- /// The text content of the element is also output as plain text.
169
- fn handle_math(
170
- node_handle: &NodeHandle,
171
- parser: &Parser,
172
- output: &mut String,
173
- options: &ConversionOptions,
174
- ctx: &Context,
175
- dom_ctx: &DomContext,
176
- ) {
177
- let text_content = crate::converter::get_text_content(node_handle, parser, dom_ctx)
178
- .trim()
179
- .to_string();
180
-
181
- if text_content.is_empty() {
182
- return;
183
- }
184
-
185
- let math_html = serialize_element(node_handle, parser);
186
-
187
- let escaped_text = escape(
188
- &text_content,
189
- options.escape_misc,
190
- options.escape_asterisks,
191
- options.escape_underscores,
192
- options.escape_ascii,
193
- );
194
-
195
- let Some(node) = node_handle.get(parser) else { return };
196
-
197
- let tag = match node {
198
- tl::Node::Tag(tag) => tag,
199
- _ => return,
200
- };
201
-
202
- let is_display_block = tag
203
- .attributes()
204
- .get("display")
205
- .flatten()
206
- .is_some_and(|v| v.as_utf8_str() == "block");
207
-
208
- if is_display_block && !ctx.in_paragraph && !ctx.convert_as_inline {
209
- output.push_str("\n\n");
210
- }
211
-
212
- output.push_str("<!-- MathML: ");
213
- output.push_str(&math_html);
214
- output.push_str(" --> ");
215
- output.push_str(&escaped_text);
216
-
217
- if is_display_block && !ctx.in_paragraph && !ctx.convert_as_inline {
218
- output.push_str("\n\n");
219
- }
220
- }
@@ -1,156 +0,0 @@
1
- //! HTML to Markdown conversion engine with modular architecture.
2
- //!
3
- //! This module provides the complete conversion pipeline for transforming HTML documents
4
- //! into Markdown format. It follows a modular, type-safe design where HTML element handling
5
- //! is organized by semantic category (block, inline, list, table, etc.) with dispatch functions
6
- //! routing elements to their specialized handlers.
7
- //!
8
- //! # Module Organization
9
- //!
10
- //! The converter module is organized into semantic categories:
11
- //!
12
- //! - **[block]**: Block-level elements (headings, paragraphs, blockquotes, preformatted text, tables)
13
- //! - **[inline]**: Inline formatting (emphasis, links, code, semantic formatting)
14
- //! - **[list]**: List structures (ordered, unordered, definition lists)
15
- //! - **[table]**: Accessible via `block::table` submodule
16
- //! - **[media]**: Media elements (images, video, audio, embedded content, SVG)
17
- //! - **[semantic]**: Semantic HTML5 elements (sectioning, figures, interactive elements)
18
- //! - **[form]**: Form elements (inputs, selects, buttons, fieldsets)
19
- //! - **[utility]**: Helper functions (DOM traversal, caching, serialization, attributes)
20
- //! - **[text]**: Text processing and escaping (via crate::text module)
21
- //!
22
- //! # Public Types
23
- //!
24
- //! The main context types used across the conversion pipeline:
25
- //!
26
- //! - **[Context]**: Stateful conversion context tracking (e.g., list nesting, code blocks, in_heading)
27
- //! - **[DomContext]**: DOM relationship cache for efficient tree navigation
28
- //!
29
- //! # Conversion Flow
30
- //!
31
- //! The conversion process follows these steps:
32
- //!
33
- //! 1. **Parse HTML**: Input HTML is parsed into a DOM tree using the astral-tl parser
34
- //! 2. **Walk Tree**: Recursive tree walk starting from the root document node
35
- //! 3. **Dispatch**: Each element is dispatched to its handler based on tag name
36
- //! 4. **Convert**: Handler transforms the element to Markdown representation
37
- //! 5. **Post-process**: Text escaping and whitespace normalization
38
- //!
39
- //! # Handler Pattern
40
- //!
41
- //! Each submodule (block, inline, list, etc.) follows a consistent pattern:
42
- //!
43
- //! ```text
44
- //! // Module declares handlers for specific element types
45
- //! pub fn dispatch_<category>_handler(
46
- //! tag_name: &str,
47
- //! node_handle: &NodeHandle,
48
- //! parser: &Parser,
49
- //! output: &mut String,
50
- //! options: &ConversionOptions,
51
- //! ctx: &Context,
52
- //! depth: usize,
53
- //! dom_ctx: &DomContext,
54
- //! ) -> bool {
55
- //! // Route to appropriate handler, return true if handled
56
- //! }
57
- //! ```
58
- //!
59
- //! # Visibility Rules
60
- //!
61
- //! - **Context & DomContext**: Public types for external module coordination
62
- //! - **Dispatch functions**: Public for main walk_node caller
63
- //! - **Individual handlers**: Typically pub for direct access if needed
64
- //! - **Internal utilities**: pub(crate) or pub(super) for module-internal use
65
- //!
66
- //! # Feature Support
67
- //!
68
- //! - Inline image extraction (`inline-images` feature)
69
- //! - Metadata collection (`metadata` feature)
70
- //! - Custom visitor callbacks (`visitor` feature)
71
- //!
72
- //! # Example Integration
73
- //!
74
- //! Once `converter.rs` is refactored to use `converter/main.rs`, the walk_node function
75
- //! will use dispatch functions like:
76
- //!
77
- //! ```text
78
- //! use crate::converter::{block, inline, list, media, semantic, form};
79
- //!
80
- //! fn walk_node(...) {
81
- //! // Try each dispatcher in order
82
- //! if block::dispatch_block_handler(&tag, ...) { return; }
83
- //! if inline::dispatch_inline_handler(&tag, ...) { return; }
84
- //! if list::dispatch_list_handler(&tag, ...) { return; }
85
- //! if media::dispatch_media_handler(&tag, ...) { return; }
86
- //! if semantic::dispatch_semantic_handler(&tag, ...) { return; }
87
- //! if form::dispatch_form_handler(&tag, ...) { return; }
88
- //! // Default handling for unrecognized tags
89
- //! }
90
- //! ```
91
-
92
- pub mod block;
93
- pub mod context;
94
- pub mod dom_context;
95
- pub mod form;
96
- pub mod format;
97
- pub mod handlers;
98
- pub mod inline;
99
- pub mod list;
100
- pub mod main;
101
- mod main_helpers;
102
- pub mod media;
103
- mod metadata;
104
- pub mod plain_text;
105
- pub mod preprocessing_helpers;
106
- pub mod reference_collector;
107
- pub mod semantic;
108
- pub mod text;
109
- mod text_node;
110
- pub mod utility;
111
-
112
- #[cfg(feature = "visitor")]
113
- pub mod visitor_hooks;
114
-
115
- // Import and re-export public types and functions from the main module
116
- pub use self::context::Context;
117
- pub use self::dom_context::DomContext;
118
-
119
- // Import the tree walker and utility functions from main and main_helpers
120
- pub use self::main::{convert_html_impl, walk_node};
121
- pub use self::main_helpers::trim_trailing_whitespace;
122
-
123
- // Re-export helper functions from utility modules (migrated from converter_legacy)
124
- pub use crate::converter::utility::content::{chomp_inline, get_text_content, normalized_tag_name};
125
- #[allow(unused_imports)]
126
- pub use crate::converter::utility::serialization::{serialize_node, serialize_node_to_html};
127
-
128
- // Helper functions migrated to utility modules
129
- pub use crate::converter::utility::siblings::append_inline_suffix;
130
-
131
- // Caching functions migrated to utility/caching
132
-
133
- // Content functions migrated to utility/content
134
-
135
- // Heading functions migrated to block/heading
136
- pub use crate::converter::block::heading::find_single_heading_child;
137
-
138
- // Link functions migrated to inline/link
139
-
140
- // Re-export dispatch functions for routing elements to handlers
141
- // Media module doesn't have a dispatcher - it exports utility functions
142
-
143
- // Re-export utility submodules for public access to their types
144
- // NOTE: utility::preprocessing is deliberately not re-exported to avoid naming conflict
145
- // with preprocessing_helpers module. Users should access utility::preprocessing directly.
146
-
147
- // Re-export format renderer types
148
-
149
- // Block and inline handlers are internal - only dispatchers are exposed
150
- // Individual handlers are pub(crate) and not meant to be part of the public API
151
-
152
- // Re-export media utilities for internal use (crate-private)
153
-
154
- // Re-export list utilities for internal use (crate-private)
155
-
156
- // Semantic and form handlers are also internal (pub(crate))