html-to-markdown 3.2.3 → 3.4.0.pre.rc.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. checksums.yaml +4 -4
  2. data/Steepfile +6 -0
  3. data/ext/html_to_markdown_rb/Cargo.toml +2 -2
  4. data/ext/html_to_markdown_rb/native/Cargo.toml +28 -0
  5. data/ext/html_to_markdown_rb/src/html-to-markdown/version.rb +10 -0
  6. data/ext/html_to_markdown_rb/src/html-to-markdown.rb +13 -0
  7. data/ext/html_to_markdown_rb/src/lib.rs +2088 -268
  8. data/lib/bin/html-to-markdown +0 -0
  9. data/lib/html_to_markdown/version.rb +1 -1
  10. data/lib/html_to_markdown.rb +5 -3
  11. data/sig/types.rbs +769 -0
  12. data/vendor/Cargo.toml +2 -2
  13. data/vendor/html-to-markdown-rs/Cargo.toml +1 -1
  14. data/vendor/html-to-markdown-rs/examples/basic.rs +1 -1
  15. data/vendor/html-to-markdown-rs/examples/table.rs +1 -1
  16. data/vendor/html-to-markdown-rs/examples/test_deser.rs +1 -1
  17. data/vendor/html-to-markdown-rs/examples/test_escape.rs +1 -1
  18. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +1 -1
  19. data/vendor/html-to-markdown-rs/examples/test_lists.rs +1 -1
  20. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +1 -1
  21. data/vendor/html-to-markdown-rs/examples/test_tables.rs +1 -1
  22. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +1 -1
  23. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +1 -1
  24. data/vendor/html-to-markdown-rs/src/convert_api.rs +15 -25
  25. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +1 -1
  26. data/vendor/html-to-markdown-rs/src/converter/block/container.rs +3 -3
  27. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -1
  28. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +6 -7
  29. data/vendor/html-to-markdown-rs/src/converter/block/horizontal_rule.rs +1 -1
  30. data/vendor/html-to-markdown-rs/src/converter/block/line_break.rs +1 -1
  31. data/vendor/html-to-markdown-rs/src/converter/block/mod.rs +0 -108
  32. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +1 -1
  33. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +1 -1
  34. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +1 -1
  35. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +1 -1
  36. data/vendor/html-to-markdown-rs/src/converter/block/table/layout.rs +1 -1
  37. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +2 -4
  38. data/vendor/html-to-markdown-rs/src/converter/block/unknown.rs +1 -1
  39. data/vendor/html-to-markdown-rs/src/converter/context.rs +10 -0
  40. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -1
  41. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
  42. data/vendor/html-to-markdown-rs/src/converter/form/mod.rs +1 -1
  43. data/vendor/html-to-markdown-rs/src/converter/format/mod.rs +0 -3
  44. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +1 -1
  45. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +1 -1
  46. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +2 -2
  47. data/vendor/html-to-markdown-rs/src/converter/inline/mod.rs +0 -1
  48. data/vendor/html-to-markdown-rs/src/converter/inline/ruby.rs +1 -1
  49. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/mod.rs +1 -1
  50. data/vendor/html-to-markdown-rs/src/converter/list/definition.rs +3 -3
  51. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +1 -1
  52. data/vendor/html-to-markdown-rs/src/converter/list/mod.rs +0 -1
  53. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +2 -2
  54. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +2 -2
  55. data/vendor/html-to-markdown-rs/src/converter/main.rs +57 -31
  56. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +8 -8
  57. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +1 -1
  58. data/vendor/html-to-markdown-rs/src/converter/media/mod.rs +1 -1
  59. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +5 -5
  60. data/vendor/html-to-markdown-rs/src/converter/mod.rs +6 -17
  61. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +64 -11
  62. data/vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs +80 -22
  63. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +1 -1
  64. data/vendor/html-to-markdown-rs/src/converter/semantic/mod.rs +1 -1
  65. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +0 -4
  66. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +5 -9
  67. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +3 -3
  68. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +10 -10
  69. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +13 -13
  70. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +4 -4
  71. data/vendor/html-to-markdown-rs/src/converter/utility/siblings.rs +6 -14
  72. data/vendor/html-to-markdown-rs/src/inline_images.rs +6 -0
  73. data/vendor/html-to-markdown-rs/src/lib.rs +17 -18
  74. data/vendor/html-to-markdown-rs/src/options/conversion.rs +31 -0
  75. data/vendor/html-to-markdown-rs/src/prelude.rs +1 -12
  76. data/vendor/html-to-markdown-rs/src/text.rs +0 -44
  77. data/vendor/html-to-markdown-rs/src/types/warnings.rs +2 -0
  78. data/vendor/html-to-markdown-rs/src/visitor/types.rs +5 -1
  79. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +4 -1
  80. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +1 -1
  81. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +1 -1
  82. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +1 -1
  83. data/vendor/html-to-markdown-rs/tests/exclude_selectors_test.rs +136 -0
  84. data/vendor/html-to-markdown-rs/tests/integration_test.rs +1 -1
  85. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +1 -1
  86. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +1 -1
  87. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +1 -1
  88. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +1 -1
  89. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +1 -1
  90. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +1 -1
  91. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +1 -1
  92. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +1 -1
  93. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +1 -1
  94. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +1 -1
  95. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +2 -2
  96. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +1 -1
  97. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +1 -1
  98. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +1 -1
  99. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +1 -1
  100. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +1 -1
  101. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +2 -2
  102. data/vendor/html-to-markdown-rs/tests/lists_test.rs +1 -1
  103. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +1 -1
  104. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +1 -1
  105. data/vendor/html-to-markdown-rs/tests/reference_links_test.rs +1 -1
  106. data/vendor/html-to-markdown-rs/tests/sectioning_elements_test.rs +137 -0
  107. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +1 -1
  108. data/vendor/html-to-markdown-rs/tests/tables_test.rs +2 -2
  109. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +1 -1
  110. data/vendor/html-to-markdown-rs/tests/test_issue_187.rs +5 -2
  111. data/vendor/html-to-markdown-rs/tests/test_issue_218.rs +4 -4
  112. data/vendor/html-to-markdown-rs/tests/test_issue_277.rs +77 -0
  113. data/vendor/html-to-markdown-rs/tests/test_max_depth.rs +82 -0
  114. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +1 -1
  115. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +4 -4
  116. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +1 -1
  117. data/vendor/html-to-markdown-rs/tests/visitor_code_integration_test.rs +6 -6
  118. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +103 -35
  119. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +1 -1
  120. metadata +21 -43
  121. data/.bundle/config +0 -2
  122. data/.gitignore +0 -3
  123. data/.rubocop.yml +0 -59
  124. data/Gemfile +0 -18
  125. data/Gemfile.lock +0 -173
  126. data/README.md +0 -331
  127. data/Rakefile +0 -26
  128. data/exe/html-to-markdown +0 -6
  129. data/ext/html_to_markdown_rb/src/html_to_markdown_rs/version.rb +0 -6
  130. data/ext/html_to_markdown_rb/src/html_to_markdown_rs.rb +0 -9
  131. data/html-to-markdown-rb.gemspec +0 -99
  132. data/lib/html_to_markdown_rs.rb +0 -3
  133. data/sig/html_to_markdown.rbs +0 -149
  134. data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +0 -94
  135. data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -86
  136. data/vendor/html-to-markdown-rs/src/safety.rs +0 -70
data/sig/types.rbs ADDED
@@ -0,0 +1,769 @@
1
+ # This file is auto-generated by alef — DO NOT EDIT.
2
+ # alef:hash:fa557708df795d5b42dd32042603884cf4e9e96a2609974ffb238997cf8b32b3
3
+ # To regenerate: alef generate
4
+ # To verify freshness: alef verify --exit-code
5
+ # Issues & docs: https://github.com/kreuzberg-dev/alef
6
+
7
+ module HtmlToMarkdown
8
+
9
+ VERSION: String
10
+
11
+ class DocumentMetadata
12
+ # Document-level metadata extracted from `<head>` and top-level elements.
13
+ #
14
+ # Contains all metadata typically used by search engines, social media platforms,
15
+ # and browsers for document indexing and presentation. Attributes whose keyword is optional in `initialize` are typed nilable.
16
+ #
17
+ # # Examples (Rust)
18
+ #
19
+ # ```rust
20
+ # # use html_to_markdown_rs::metadata::DocumentMetadata;
21
+ # let doc = DocumentMetadata {
22
+ # title: Some("My Article".to_string()),
23
+ # description: Some("A great article about Rust".to_string()),
24
+ # keywords: vec!["rust".to_string(), "programming".to_string()],
25
+ # ..Default::default()
26
+ # };
27
+ #
28
+ # assert_eq!(doc.title, Some("My Article".to_string()));
29
+ # ```
30
+
31
+ attr_accessor title: String?
32
+ attr_accessor description: String?
33
+ attr_accessor keywords: Array[String]
34
+ attr_accessor author: String?
35
+ attr_accessor canonical_url: String?
36
+ attr_accessor base_href: String?
37
+ attr_accessor language: String?
38
+ attr_accessor text_direction: TextDirection?
39
+ attr_accessor open_graph: Hash[String, String]
40
+ attr_accessor twitter_card: Hash[String, String]
41
+ attr_accessor meta_tags: Hash[String, String]
42
+
43
+ def initialize: (?title: String, ?description: String, keywords: Array[String], ?author: String, ?canonical_url: String, ?base_href: String, ?language: String, ?text_direction: TextDirection, open_graph: Hash[String, String], twitter_card: Hash[String, String], meta_tags: Hash[String, String]) -> void
44
+ end
45
+
46
+ class HeaderMetadata
47
+ # Header element metadata with hierarchy tracking.
48
+ #
49
+ # Captures heading elements (h1-h6) with their text content, identifiers,
50
+ # and position in the document structure. `id` is typed nilable because its keyword is optional in `initialize`.
51
+ #
52
+ # # Examples (Rust)
53
+ #
54
+ # ```rust
55
+ # # use html_to_markdown_rs::metadata::HeaderMetadata;
56
+ # let header = HeaderMetadata {
57
+ # level: 1,
58
+ # text: "Main Title".to_string(),
59
+ # id: Some("main-title".to_string()),
60
+ # depth: 0,
61
+ # html_offset: 145,
62
+ # };
63
+ #
64
+ # assert_eq!(header.level, 1);
65
+ # assert!(header.is_valid());
66
+ # ```
67
+
68
+ attr_reader level: Integer
69
+ attr_reader text: String
70
+ attr_reader id: String?
71
+ attr_reader depth: Integer
72
+ attr_reader html_offset: Integer
73
+
74
+ def initialize: (level: Integer, text: String, ?id: String, depth: Integer, html_offset: Integer) -> void
75
+ def is_valid: () -> bool
76
+ end
77
+
78
+ class LinkMetadata
79
+ # Hyperlink metadata with categorization and attributes.
80
+ #
81
+ # Represents `<a>` elements with parsed href values, text content, and link type classification. `title` is typed nilable because its keyword is optional in `initialize`.
82
+ #
83
+ # # Examples (Rust)
84
+ #
85
+ # ```rust
86
+ # # use html_to_markdown_rs::metadata::{LinkMetadata, LinkType};
87
+ # let link = LinkMetadata {
88
+ # href: "https://example.com".to_string(),
89
+ # text: "Example".to_string(),
90
+ # title: Some("Visit Example".to_string()),
91
+ # link_type: LinkType::External,
92
+ # rel: vec!["nofollow".to_string()],
93
+ # attributes: Default::default(),
94
+ # };
95
+ #
96
+ # assert_eq!(link.link_type, LinkType::External);
97
+ # assert_eq!(link.text, "Example");
98
+ # ```
99
+
100
+ attr_reader href: String
101
+ attr_reader text: String
102
+ attr_reader title: String?
103
+ attr_reader link_type: LinkType
104
+ attr_reader rel: Array[String]
105
+ attr_reader attributes: Hash[String, String]
106
+
107
+ def initialize: (href: String, text: String, ?title: String, link_type: LinkType, rel: Array[String], attributes: Hash[String, String]) -> void
108
+ def self.classify_link: (String href) -> LinkType
109
+ end
110
+
111
+ class ImageMetadata
112
+ # Image metadata with source and dimensions.
113
+ #
114
+ # Captures `<img>` elements and inline `<svg>` elements with metadata
115
+ # for image analysis and optimization. `alt`, `title` and `dimensions` are typed nilable because their keywords are optional in `initialize`.
116
+ #
117
+ # # Examples (Rust)
118
+ #
119
+ # ```rust
120
+ # # use html_to_markdown_rs::metadata::{ImageMetadata, ImageType};
121
+ # let img = ImageMetadata {
122
+ # src: "https://example.com/image.jpg".to_string(),
123
+ # alt: Some("An example image".to_string()),
124
+ # title: Some("Example".to_string()),
125
+ # dimensions: Some((800, 600)),
126
+ # image_type: ImageType::External,
127
+ # attributes: Default::default(),
128
+ # };
129
+ #
130
+ # assert_eq!(img.image_type, ImageType::External);
131
+ # ```
132
+
133
+ attr_reader src: String
134
+ attr_reader alt: String?
135
+ attr_reader title: String?
136
+ attr_reader dimensions: Array[Integer]?
137
+ attr_reader image_type: ImageType
138
+ attr_reader attributes: Hash[String, String]
139
+
140
+ def initialize: (src: String, ?alt: String, ?title: String, ?dimensions: Array[Integer], image_type: ImageType, attributes: Hash[String, String]) -> void
141
+ end
142
+
143
+ class StructuredData
144
+ # Structured data block (JSON-LD, Microdata, or RDFa).
145
+ #
146
+ # Represents machine-readable structured data found in the document.
147
+ # JSON-LD blocks are collected as raw JSON strings for flexibility. `schema_type` is typed nilable because its keyword is optional in `initialize`.
148
+ #
149
+ # # Examples (Rust)
150
+ #
151
+ # ```rust
152
+ # # use html_to_markdown_rs::metadata::{StructuredData, StructuredDataType};
153
+ # let schema = StructuredData {
154
+ # data_type: StructuredDataType::JsonLd,
155
+ # raw_json: r#"{"@context":"https://schema.org","@type":"Article"}"#.to_string(),
156
+ # schema_type: Some("Article".to_string()),
157
+ # };
158
+ #
159
+ # assert_eq!(schema.data_type, StructuredDataType::JsonLd);
160
+ # ```
161
+
162
+ attr_reader data_type: StructuredDataType
163
+ attr_reader raw_json: String
164
+ attr_reader schema_type: String?
165
+
166
+ def initialize: (data_type: StructuredDataType, raw_json: String, ?schema_type: String) -> void
167
+ end
168
+
169
+ class HtmlMetadata
170
+ # Comprehensive metadata extraction result from HTML document.
171
+ #
172
+ # Contains all extracted metadata types in a single structure,
173
+ # suitable for serialization and transmission across language boundaries.
174
+ #
175
+ # # Examples (Rust)
176
+ #
177
+ # ```rust
178
+ # # use html_to_markdown_rs::metadata::HtmlMetadata;
179
+ # let metadata = HtmlMetadata {
180
+ # document: Default::default(),
181
+ # headers: Vec::new(),
182
+ # links: Vec::new(),
183
+ # images: Vec::new(),
184
+ # structured_data: Vec::new(),
185
+ # };
186
+ #
187
+ # assert!(metadata.headers.is_empty());
188
+ # ```
189
+
190
+ attr_accessor document: DocumentMetadata
191
+ attr_accessor headers: Array[HeaderMetadata]
192
+ attr_accessor links: Array[LinkMetadata]
193
+ attr_accessor images: Array[ImageMetadata]
194
+ attr_accessor structured_data: Array[StructuredData]
195
+
196
+ def initialize: (document: DocumentMetadata, headers: Array[HeaderMetadata], links: Array[LinkMetadata], images: Array[ImageMetadata], structured_data: Array[StructuredData]) -> void
197
+ end
198
+
199
+ class ConversionOptions
200
+ # Main conversion options for HTML to Markdown conversion.
201
+ #
202
+ # Use `ConversionOptions.builder` to construct, or `ConversionOptions.default` for defaults. `max_depth` is typed nilable because its keyword is optional in `initialize`.
203
+ #
204
+ # # Example (Rust)
205
+ #
206
+ # ```text
207
+ # use html_to_markdown_rs::ConversionOptions;
208
+ #
209
+ # let options = ConversionOptions::builder()
210
+ # .heading_style(HeadingStyle::Atx)
211
+ # .wrap(true)
212
+ # .wrap_width(100)
213
+ # .build();
214
+ # ```
215
+
216
+ attr_accessor heading_style: HeadingStyle
217
+ attr_accessor list_indent_type: ListIndentType
218
+ attr_accessor list_indent_width: Integer
219
+ attr_accessor bullets: String
220
+ attr_accessor strong_em_symbol: String
221
+ attr_accessor escape_asterisks: bool
222
+ attr_accessor escape_underscores: bool
223
+ attr_accessor escape_misc: bool
224
+ attr_accessor escape_ascii: bool
225
+ attr_accessor code_language: String
226
+ attr_accessor autolinks: bool
227
+ attr_accessor default_title: bool
228
+ attr_accessor br_in_tables: bool
229
+ attr_accessor highlight_style: HighlightStyle
230
+ attr_accessor extract_metadata: bool
231
+ attr_accessor whitespace_mode: WhitespaceMode
232
+ attr_accessor strip_newlines: bool
233
+ attr_accessor wrap: bool
234
+ attr_accessor wrap_width: Integer
235
+ attr_accessor convert_as_inline: bool
236
+ attr_accessor sub_symbol: String
237
+ attr_accessor sup_symbol: String
238
+ attr_accessor newline_style: NewlineStyle
239
+ attr_accessor code_block_style: CodeBlockStyle
240
+ attr_accessor keep_inline_images_in: Array[String]
241
+ attr_accessor preprocessing: PreprocessingOptions
242
+ attr_accessor encoding: String
243
+ attr_accessor debug: bool
244
+ attr_accessor strip_tags: Array[String]
245
+ attr_accessor preserve_tags: Array[String]
246
+ attr_accessor skip_images: bool
247
+ attr_accessor link_style: LinkStyle
248
+ attr_accessor output_format: OutputFormat
249
+ attr_accessor include_document_structure: bool
250
+ attr_accessor extract_images: bool
251
+ attr_accessor max_image_size: Integer
252
+ attr_accessor capture_svg: bool
253
+ attr_accessor infer_dimensions: bool
254
+ attr_accessor max_depth: Integer?
255
+ attr_accessor exclude_selectors: Array[String]
256
+
257
+ def initialize: (heading_style: HeadingStyle, list_indent_type: ListIndentType, list_indent_width: Integer, bullets: String, strong_em_symbol: String, escape_asterisks: bool, escape_underscores: bool, escape_misc: bool, escape_ascii: bool, code_language: String, autolinks: bool, default_title: bool, br_in_tables: bool, highlight_style: HighlightStyle, extract_metadata: bool, whitespace_mode: WhitespaceMode, strip_newlines: bool, wrap: bool, wrap_width: Integer, convert_as_inline: bool, sub_symbol: String, sup_symbol: String, newline_style: NewlineStyle, code_block_style: CodeBlockStyle, keep_inline_images_in: Array[String], preprocessing: PreprocessingOptions, encoding: String, debug: bool, strip_tags: Array[String], preserve_tags: Array[String], skip_images: bool, link_style: LinkStyle, output_format: OutputFormat, include_document_structure: bool, extract_images: bool, max_image_size: Integer, capture_svg: bool, infer_dimensions: bool, ?max_depth: Integer, exclude_selectors: Array[String]) -> void
258
+ def apply_update: (ConversionOptionsUpdate update) -> void
259
+ def self.default: () -> ConversionOptions
260
+ def self.builder: () -> ConversionOptionsBuilder
261
+ def self.from_update: (ConversionOptionsUpdate update) -> ConversionOptions
262
+ def self.from: (ConversionOptionsUpdate update) -> ConversionOptions
263
+ end
264
+
265
+ class ConversionOptionsBuilder
266
+ # Builder for `ConversionOptions`.
267
+ #
268
+ # All fields start with default values. Each setter declared here returns a `ConversionOptionsBuilder`; call `build` to produce the final `ConversionOptions`.
269
+
270
+ def strip_tags: (Array[String] tags) -> ConversionOptionsBuilder
271
+ def preserve_tags: (Array[String] tags) -> ConversionOptionsBuilder
272
+ def keep_inline_images_in: (Array[String] tags) -> ConversionOptionsBuilder
273
+ def exclude_selectors: (Array[String] selectors) -> ConversionOptionsBuilder
274
+ def preprocessing: (PreprocessingOptions preprocessing) -> ConversionOptionsBuilder
275
+ def build: () -> ConversionOptions
276
+ end
277
+
278
+ class ConversionOptionsUpdate
279
+ # Partial update for `ConversionOptions`.
280
+ #
281
+ # Uses `Option<T>` fields for selective updates. Bindings use this to construct
282
+ # options from language-native types. Prefer [`ConversionOptionsBuilder`] for Rust code. Every attribute is typed nilable to match the all-optional `initialize` keywords.
283
+
284
+ attr_accessor heading_style: HeadingStyle?
285
+ attr_accessor list_indent_type: ListIndentType?
286
+ attr_accessor list_indent_width: Integer?
287
+ attr_accessor bullets: String?
288
+ attr_accessor strong_em_symbol: String?
289
+ attr_accessor escape_asterisks: bool?
290
+ attr_accessor escape_underscores: bool?
291
+ attr_accessor escape_misc: bool?
292
+ attr_accessor escape_ascii: bool?
293
+ attr_accessor code_language: String?
294
+ attr_accessor autolinks: bool?
295
+ attr_accessor default_title: bool?
296
+ attr_accessor br_in_tables: bool?
297
+ attr_accessor highlight_style: HighlightStyle?
298
+ attr_accessor extract_metadata: bool?
299
+ attr_accessor whitespace_mode: WhitespaceMode?
300
+ attr_accessor strip_newlines: bool?
301
+ attr_accessor wrap: bool?
302
+ attr_accessor wrap_width: Integer?
303
+ attr_accessor convert_as_inline: bool?
304
+ attr_accessor sub_symbol: String?
305
+ attr_accessor sup_symbol: String?
306
+ attr_accessor newline_style: NewlineStyle?
307
+ attr_accessor code_block_style: CodeBlockStyle?
308
+ attr_accessor keep_inline_images_in: Array[String]?
309
+ attr_accessor preprocessing: PreprocessingOptionsUpdate?
310
+ attr_accessor encoding: String?
311
+ attr_accessor debug: bool?
312
+ attr_accessor strip_tags: Array[String]?
313
+ attr_accessor preserve_tags: Array[String]?
314
+ attr_accessor skip_images: bool?
315
+ attr_accessor link_style: LinkStyle?
316
+ attr_accessor output_format: OutputFormat?
317
+ attr_accessor include_document_structure: bool?
318
+ attr_accessor extract_images: bool?
319
+ attr_accessor max_image_size: Integer?
320
+ attr_accessor capture_svg: bool?
321
+ attr_accessor infer_dimensions: bool?
322
+ attr_accessor max_depth: Integer?
323
+ attr_accessor exclude_selectors: Array[String]?
324
+
325
+ def initialize: (?heading_style: HeadingStyle, ?list_indent_type: ListIndentType, ?list_indent_width: Integer, ?bullets: String, ?strong_em_symbol: String, ?escape_asterisks: bool, ?escape_underscores: bool, ?escape_misc: bool, ?escape_ascii: bool, ?code_language: String, ?autolinks: bool, ?default_title: bool, ?br_in_tables: bool, ?highlight_style: HighlightStyle, ?extract_metadata: bool, ?whitespace_mode: WhitespaceMode, ?strip_newlines: bool, ?wrap: bool, ?wrap_width: Integer, ?convert_as_inline: bool, ?sub_symbol: String, ?sup_symbol: String, ?newline_style: NewlineStyle, ?code_block_style: CodeBlockStyle, ?keep_inline_images_in: Array[String], ?preprocessing: PreprocessingOptionsUpdate, ?encoding: String, ?debug: bool, ?strip_tags: Array[String], ?preserve_tags: Array[String], ?skip_images: bool, ?link_style: LinkStyle, ?output_format: OutputFormat, ?include_document_structure: bool, ?extract_images: bool, ?max_image_size: Integer, ?capture_svg: bool, ?infer_dimensions: bool, ?max_depth: Integer?, ?exclude_selectors: Array[String]) -> void
326
+ end
327
+
328
+ class PreprocessingOptions
329
+ # HTML preprocessing options for document cleanup before conversion. Declared constructors: `initialize`, `PreprocessingOptions.default`, and `from_update` / `from` (both take a `PreprocessingOptionsUpdate`).
330
+
331
+ attr_accessor enabled: bool
332
+ attr_accessor preset: PreprocessingPreset
333
+ attr_accessor remove_navigation: bool
334
+ attr_accessor remove_forms: bool
335
+
336
+ def initialize: (enabled: bool, preset: PreprocessingPreset, remove_navigation: bool, remove_forms: bool) -> void
337
+ def apply_update: (PreprocessingOptionsUpdate update) -> void
338
+ def self.default: () -> PreprocessingOptions
339
+ def self.from_update: (PreprocessingOptionsUpdate update) -> PreprocessingOptions
340
+ def self.from: (PreprocessingOptionsUpdate update) -> PreprocessingOptions
341
+ end
342
+
343
+ class PreprocessingOptionsUpdate
344
+ # Partial update for `PreprocessingOptions`.
345
+ #
346
+ # This struct uses `Option<T>` to represent optional fields that can be selectively updated.
347
+ # Only specified fields (Some values) will override existing options; None values leave the
348
+ # corresponding fields unchanged when applied via `PreprocessingOptions#apply_update`. Every attribute is typed nilable to match the all-optional `initialize` keywords.
349
+
350
+ attr_accessor enabled: bool?
351
+ attr_accessor preset: PreprocessingPreset?
352
+ attr_accessor remove_navigation: bool?
353
+ attr_accessor remove_forms: bool?
354
+
355
+ def initialize: (?enabled: bool, ?preset: PreprocessingPreset, ?remove_navigation: bool, ?remove_forms: bool) -> void
356
+ end
357
+
358
+ class DocumentStructure
359
+ # A structured document tree representing the semantic content of an HTML document.
360
+ #
361
+ # Uses a flat node array with index-based parent/child references for efficient traversal. `source_format` is typed nilable because its keyword is optional in `initialize`.
362
+
363
+ attr_reader nodes: Array[DocumentNode]
364
+ attr_reader source_format: String?
365
+
366
+ def initialize: (nodes: Array[DocumentNode], ?source_format: String) -> void
367
+ end
368
+
369
+ class DocumentNode
370
+ # A single node in the document tree. `parent` and `attributes` are typed nilable because their keywords are optional in `initialize`.
371
+
372
+ attr_reader id: String
373
+ attr_reader content: NodeContent
374
+ attr_reader parent: Integer?
375
+ attr_reader children: Array[Integer]
376
+ attr_reader annotations: Array[TextAnnotation]
377
+ attr_reader attributes: Hash[String, String]?
378
+
379
+ def initialize: (id: String, content: NodeContent, ?parent: Integer, children: Array[Integer], annotations: Array[TextAnnotation], ?attributes: Hash[String, String]) -> void
380
+ end
381
+
382
+ class TextAnnotation
383
+ # An inline text annotation with byte-range offsets.
384
+ #
385
+ # Annotations describe formatting (bold, italic, etc.) and links within a node's text content; `start` and `end` are the Integer offsets of the range (whether `end` is exclusive is not stated here — confirm).
386
+
387
+ attr_reader start: Integer
388
+ attr_reader end: Integer
389
+ attr_reader kind: AnnotationKind
390
+
391
+ def initialize: (start: Integer, end: Integer, kind: AnnotationKind) -> void
392
+ end
393
+
394
+ class ConversionResult
395
+ # The primary result of HTML conversion and extraction.
396
+ #
397
+ # Contains the converted text output, optional structured document tree,
398
+ # metadata, extracted tables, images, and processing warnings. `content` and `document` are typed nilable because their keywords are optional in `initialize`.
399
+ #
400
+ # # Example (Rust)
401
+ #
402
+ # ```text
403
+ # use html_to_markdown_rs::{convert, ConversionOptions};
404
+ #
405
+ # let result = convert("<h1>Hello</h1><p>World</p>", None)?;
406
+ # assert!(result.content.is_some());
407
+ # assert!(result.warnings.is_empty());
408
+ # ```
409
+
410
+ attr_accessor content: String?
411
+ attr_accessor document: DocumentStructure?
412
+ attr_accessor metadata: HtmlMetadata
413
+ attr_accessor tables: Array[TableData]
414
+ attr_accessor images: Array[String]
415
+ attr_accessor warnings: Array[ProcessingWarning]
416
+
417
+ def initialize: (?content: String, ?document: DocumentStructure, metadata: HtmlMetadata, tables: Array[TableData], images: Array[String], warnings: Array[ProcessingWarning]) -> void
418
+ end
419
+
420
+ class TableGrid
421
+ # A structured table grid with cell-level data including spans. `cells` holds the `GridCell` entries; `rows` and `cols` are presumably the grid dimensions — confirm.
422
+
423
+ attr_accessor rows: Integer
424
+ attr_accessor cols: Integer
425
+ attr_accessor cells: Array[GridCell]
426
+
427
+ def initialize: (rows: Integer, cols: Integer, cells: Array[GridCell]) -> void
428
+ end
429
+
430
+ class GridCell
431
+ # A single cell in a table grid: its text `content`, `row`/`col` position, `row_span`/`col_span`, and an `is_header` flag. All attributes are read-only.
432
+
433
+ attr_reader content: String
434
+ attr_reader row: Integer
435
+ attr_reader col: Integer
436
+ attr_reader row_span: Integer
437
+ attr_reader col_span: Integer
438
+ attr_reader is_header: bool
439
+
440
+ def initialize: (content: String, row: Integer, col: Integer, row_span: Integer, col_span: Integer, is_header: bool) -> void
441
+ end
442
+
443
+ class TableData
444
+ # A top-level extracted table with both structured data (`grid`, a `TableGrid`) and its markdown representation (`markdown`, a String).
445
+
446
+ attr_reader grid: TableGrid
447
+ attr_reader markdown: String
448
+
449
+ def initialize: (grid: TableGrid, markdown: String) -> void
450
+ end
451
+
452
+ class ProcessingWarning
453
+ # A non-fatal warning generated during HTML processing, carrying a `message` String and a `WarningKind` in `kind`.
454
+
455
+ attr_reader message: String
456
+ attr_reader kind: WarningKind
457
+
458
+ def initialize: (message: String, kind: WarningKind) -> void
459
+ end
460
+
461
+ class NodeContext
462
+ # Context information passed to all visitor methods.
463
+ #
464
+ # Provides comprehensive metadata about the current node being visited,
465
+ # including its type, attributes, position in the DOM tree, and parent context. `parent_tag` is typed nilable because its keyword is optional in `initialize`.
466
+
467
+ attr_reader node_type: NodeType
468
+ attr_reader tag_name: String
469
+ attr_reader attributes: Hash[String, String]
470
+ attr_reader depth: Integer
471
+ attr_reader index_in_parent: Integer
472
+ attr_reader parent_tag: String?
473
+ attr_reader is_inline: bool
474
+
475
+ def initialize: (node_type: NodeType, tag_name: String, attributes: Hash[String, String], depth: Integer, index_in_parent: Integer, ?parent_tag: String, is_inline: bool) -> void
476
+ end
477
+
478
+ class TextDirection
479
+ # Text directionality of document content.
480
+ # Corresponds to the HTML `dir` attribute and `bdi` element directionality.
481
+ # NOTE(review): variants are typed `Integer` here, but `DocumentMetadata#text_direction` is typed `TextDirection` — confirm which the binding returns.
482
+
483
+ LeftToRight: Integer
484
+ RightToLeft: Integer
485
+ Auto: Integer
486
+ end
487
+
488
+ class LinkType
489
+ # Link classification based on href value and document context.
490
+ # Used to categorize links during extraction for filtering and analysis.
491
+ # NOTE(review): variants are typed `Integer` here, but `LinkMetadata#link_type` is typed `LinkType` — confirm which the binding returns.
492
+
493
+ Anchor: Integer
494
+ Internal: Integer
495
+ External: Integer
496
+ Email: Integer
497
+ Phone: Integer
498
+ Other: Integer
499
+ end
500
+
501
+ class ImageType
502
+ # Image source classification for proper handling and processing.
503
+ # Determines whether an image is embedded (data URI), inline SVG, external, or relative.
504
+ # NOTE(review): variants are typed `Integer` here, but `ImageMetadata#image_type` is typed `ImageType` — confirm which the binding returns.
505
+
506
+ DataUri: Integer
507
+ InlineSvg: Integer
508
+ External: Integer
509
+ Relative: Integer
510
+ end
511
+
512
+ class StructuredDataType
513
+ # Structured data format type.
514
+ # Identifies the schema/format used for structured data markup.
515
+ # NOTE(review): variants are typed `Integer` here, but `StructuredData#data_type` is typed `StructuredDataType` — confirm which the binding returns.
516
+
517
+ JsonLd: Integer
518
+ Microdata: Integer
519
+ RDFa: Integer
520
+ end
521
+
522
+ class PreprocessingPreset
523
+ # HTML preprocessing aggressiveness level.
524
+ # Controls the extent of cleanup performed before conversion. Higher levels remove more elements.
525
+ # NOTE(review): variants are typed `Integer` here, but `PreprocessingOptions#preset` is typed `PreprocessingPreset` — confirm which the binding returns.
526
+
527
+ Minimal: Integer
528
+ Standard: Integer
529
+ Aggressive: Integer
530
+ end
531
+
532
+ class HeadingStyle
533
+ # Heading style options for Markdown output.
534
+ # Controls how headings (h1-h6) are rendered in the output Markdown.
535
+ # NOTE(review): variants are typed `Integer` here, but `ConversionOptions#heading_style` is typed `HeadingStyle` — confirm which the binding returns.
536
+
537
+ Underlined: Integer
538
+ Atx: Integer
539
+ AtxClosed: Integer
540
+ end
541
+
542
+ class ListIndentType
543
+ # List indentation character type.
544
+ # Controls whether list items are indented with spaces or tabs.
545
+ # NOTE(review): variants are typed `Integer` here, but `ConversionOptions#list_indent_type` is typed `ListIndentType` — confirm which the binding returns.
546
+
547
+ Spaces: Integer
548
+ Tabs: Integer
549
+ end
550
+
551
+ class WhitespaceMode
552
+ # Whitespace handling strategy during conversion.
553
+ # Determines how sequences of whitespace characters (spaces, tabs, newlines) are processed.
554
+ # NOTE(review): variants are typed `Integer` here, but `ConversionOptions#whitespace_mode` is typed `WhitespaceMode` — confirm which the binding returns.
555
+
556
+ Normalized: Integer
557
+ Strict: Integer
558
+ end
559
+
560
+ class NewlineStyle
561
+ # Line break syntax in Markdown output.
562
+ #
563
+ # Controls how soft line breaks (from `<br>` or line breaks in source) are rendered.
564
+
565
+ Spaces: Integer
566
+ Backslash: Integer
567
+ end
568
+
569
+ class CodeBlockStyle
570
+ # Code block fence style in Markdown output.
571
+ #
572
+ # Determines how code blocks (`<pre><code>`) are rendered in Markdown.
573
+
574
+ Indented: Integer
575
+ Backticks: Integer
576
+ Tildes: Integer
577
+ end
578
+
579
+ class HighlightStyle
580
+ # Highlight rendering style for `<mark>` elements.
581
+ #
582
+ # Controls how highlighted text is rendered in Markdown output.
583
+
584
+ DoubleEqual: Integer
585
+ Html: Integer
586
+ Bold: Integer
587
+ None: Integer
588
+ end
589
+
590
+ class LinkStyle
591
+ # Link rendering style in Markdown output.
592
+ #
593
+ # Controls whether links and images use inline `[text](url)` syntax or
594
+ # reference-style `[text][1]` syntax with definitions collected at the end.
595
+
596
+ Inline: Integer
597
+ Reference: Integer
598
+ end
599
+
600
+ class OutputFormat
601
+ # Output format for conversion.
602
+ #
603
+ # Specifies the target markup language format for the conversion output.
604
+
605
+ Markdown: Integer
606
+ Djot: Integer
607
+ Plain: Integer
608
+ end
609
+
610
+ class NodeContent
611
+ # The semantic content type of a document node.
612
+ #
613
+ # Uses internally tagged representation (`"node_type": "heading"`) for JSON serialization.
614
+
615
+ Heading: Integer
616
+ Paragraph: Integer
617
+ List: Integer
618
+ ListItem: Integer
619
+ Table: Integer
620
+ Image: Integer
621
+ Code: Integer
622
+ Quote: Integer
623
+ DefinitionList: Integer
624
+ DefinitionItem: Integer
625
+ RawBlock: Integer
626
+ MetadataBlock: Integer
627
+ Group: Integer
628
+ end
629
+
630
+ class AnnotationKind
631
+ # The type of an inline text annotation.
632
+ #
633
+ # Uses internally tagged representation (`"annotation_type": "bold"`) for JSON serialization.
634
+
635
+ Bold: Integer
636
+ Italic: Integer
637
+ Underline: Integer
638
+ Strikethrough: Integer
639
+ Code: Integer
640
+ Subscript: Integer
641
+ Superscript: Integer
642
+ Highlight: Integer
643
+ Link: Integer
644
+ end
645
+
646
+ class WarningKind
647
+ # Categories of processing warnings.
648
+
649
+ ImageExtractionFailed: Integer
650
+ EncodingFallback: Integer
651
+ TruncatedInput: Integer
652
+ MalformedHtml: Integer
653
+ SanitizationApplied: Integer
654
+ DepthLimitExceeded: Integer
655
+ end
656
+
657
+ class NodeType
658
+ # Node type enumeration covering all HTML element types.
659
+ #
660
+ # This enum categorizes all HTML elements that the converter recognizes,
661
+ # providing a coarse-grained classification for visitor dispatch.
662
+
663
+ Text: Integer
664
+ Element: Integer
665
+ Heading: Integer
666
+ Paragraph: Integer
667
+ Div: Integer
668
+ Blockquote: Integer
669
+ Pre: Integer
670
+ Hr: Integer
671
+ List: Integer
672
+ ListItem: Integer
673
+ DefinitionList: Integer
674
+ DefinitionTerm: Integer
675
+ DefinitionDescription: Integer
676
+ Table: Integer
677
+ TableRow: Integer
678
+ TableCell: Integer
679
+ TableHeader: Integer
680
+ TableBody: Integer
681
+ TableHead: Integer
682
+ TableFoot: Integer
683
+ Link: Integer
684
+ Image: Integer
685
+ Strong: Integer
686
+ Em: Integer
687
+ Code: Integer
688
+ Strikethrough: Integer
689
+ Underline: Integer
690
+ Subscript: Integer
691
+ Superscript: Integer
692
+ Mark: Integer
693
+ Small: Integer
694
+ Br: Integer
695
+ Span: Integer
696
+ Article: Integer
697
+ Section: Integer
698
+ Nav: Integer
699
+ Aside: Integer
700
+ Header: Integer
701
+ Footer: Integer
702
+ Main: Integer
703
+ Figure: Integer
704
+ Figcaption: Integer
705
+ Time: Integer
706
+ Details: Integer
707
+ Summary: Integer
708
+ Form: Integer
709
+ Input: Integer
710
+ Select: Integer
711
+ Option: Integer
712
+ Button: Integer
713
+ Textarea: Integer
714
+ Label: Integer
715
+ Fieldset: Integer
716
+ Legend: Integer
717
+ Audio: Integer
718
+ Video: Integer
719
+ Picture: Integer
720
+ Source: Integer
721
+ Iframe: Integer
722
+ Svg: Integer
723
+ Canvas: Integer
724
+ Ruby: Integer
725
+ Rt: Integer
726
+ Rp: Integer
727
+ Abbr: Integer
728
+ Kbd: Integer
729
+ Samp: Integer
730
+ Var: Integer
731
+ Cite: Integer
732
+ Q: Integer
733
+ Del: Integer
734
+ Ins: Integer
735
+ Data: Integer
736
+ Meter: Integer
737
+ Progress: Integer
738
+ Output: Integer
739
+ Template: Integer
740
+ Slot: Integer
741
+ Html: Integer
742
+ Head: Integer
743
+ Body: Integer
744
+ Title: Integer
745
+ Meta: Integer
746
+ LinkTag: Integer
747
+ Style: Integer
748
+ Script: Integer
749
+ Base: Integer
750
+ Custom: Integer
751
+ end
752
+
753
+ class VisitResult
754
+ # Result of a visitor callback.
755
+ #
756
+ # Allows visitors to control the conversion flow by either proceeding
757
+ # with default behavior, providing custom output, skipping elements,
758
+ # preserving HTML, or signaling errors.
759
+
760
+ Continue: Integer
761
+ Custom: Integer
762
+ Skip: Integer
763
+ PreserveHtml: Integer
764
+ Error: Integer
765
+ end
766
+
767
+ def self.convert: (String html, ?ConversionOptions options, ?String visitor) -> ConversionResult
768
+
769
+ end