html-to-markdown 3.4.0 → 3.6.0.pre.rc.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE +21 -0
  3. data/README.md +347 -0
  4. data/Steepfile +10 -2
  5. data/ext/html_to_markdown_rb/Cargo.toml +3 -2
  6. data/ext/html_to_markdown_rb/extconf.rb +5 -5
  7. data/ext/html_to_markdown_rb/native/Cargo.lock +962 -0
  8. data/ext/html_to_markdown_rb/native/Cargo.toml +6 -11
  9. data/ext/html_to_markdown_rb/native/extconf.rb +14 -0
  10. data/ext/html_to_markdown_rb/src/lib.rs +1715 -646
  11. data/lib/html_to_markdown/native.rb +913 -37
  12. data/lib/html_to_markdown/version.rb +3 -3
  13. data/lib/html_to_markdown.rb +9 -4
  14. data/lib/html_to_markdown_rb.so +0 -0
  15. data/sig/types.rbs +59 -292
  16. metadata +32 -179
  17. data/ext/html_to_markdown_rb/Makefile +0 -592
  18. data/lib/bin/html-to-markdown +0 -0
  19. data/vendor/Cargo.toml +0 -33
  20. data/vendor/html-to-markdown-rs/Cargo.toml +0 -54
  21. data/vendor/html-to-markdown-rs/README.md +0 -278
  22. data/vendor/html-to-markdown-rs/examples/basic.rs +0 -24
  23. data/vendor/html-to-markdown-rs/examples/table.rs +0 -25
  24. data/vendor/html-to-markdown-rs/examples/test_deser.rs +0 -12
  25. data/vendor/html-to-markdown-rs/examples/test_escape.rs +0 -58
  26. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +0 -113
  27. data/vendor/html-to-markdown-rs/examples/test_lists.rs +0 -39
  28. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +0 -89
  29. data/vendor/html-to-markdown-rs/examples/test_tables.rs +0 -100
  30. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +0 -61
  31. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +0 -34
  32. data/vendor/html-to-markdown-rs/src/convert_api.rs +0 -349
  33. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +0 -178
  34. data/vendor/html-to-markdown-rs/src/converter/block/container.rs +0 -114
  35. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +0 -149
  36. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +0 -428
  37. data/vendor/html-to-markdown-rs/src/converter/block/horizontal_rule.rs +0 -103
  38. data/vendor/html-to-markdown-rs/src/converter/block/line_break.rs +0 -89
  39. data/vendor/html-to-markdown-rs/src/converter/block/mod.rs +0 -10
  40. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +0 -140
  41. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +0 -298
  42. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +0 -453
  43. data/vendor/html-to-markdown-rs/src/converter/block/table/caption.rs +0 -44
  44. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +0 -276
  45. data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +0 -336
  46. data/vendor/html-to-markdown-rs/src/converter/block/table/layout.rs +0 -58
  47. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +0 -266
  48. data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +0 -146
  49. data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +0 -34
  50. data/vendor/html-to-markdown-rs/src/converter/block/unknown.rs +0 -138
  51. data/vendor/html-to-markdown-rs/src/converter/context.rs +0 -208
  52. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +0 -337
  53. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +0 -770
  54. data/vendor/html-to-markdown-rs/src/converter/form/mod.rs +0 -82
  55. data/vendor/html-to-markdown-rs/src/converter/format/djot.rs +0 -64
  56. data/vendor/html-to-markdown-rs/src/converter/format/markdown.rs +0 -59
  57. data/vendor/html-to-markdown-rs/src/converter/format/mod.rs +0 -43
  58. data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +0 -173
  59. data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +0 -434
  60. data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +0 -234
  61. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +0 -282
  62. data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +0 -316
  63. data/vendor/html-to-markdown-rs/src/converter/handlers/mod.rs +0 -26
  64. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +0 -306
  65. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +0 -345
  66. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +0 -428
  67. data/vendor/html-to-markdown-rs/src/converter/inline/mod.rs +0 -237
  68. data/vendor/html-to-markdown-rs/src/converter/inline/ruby.rs +0 -337
  69. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +0 -566
  70. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/mod.rs +0 -86
  71. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/typography.rs +0 -558
  72. data/vendor/html-to-markdown-rs/src/converter/list/definition.rs +0 -232
  73. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +0 -332
  74. data/vendor/html-to-markdown-rs/src/converter/list/mod.rs +0 -70
  75. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +0 -201
  76. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +0 -195
  77. data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +0 -314
  78. data/vendor/html-to-markdown-rs/src/converter/main.rs +0 -710
  79. data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +0 -452
  80. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +0 -393
  81. data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +0 -4
  82. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -183
  83. data/vendor/html-to-markdown-rs/src/converter/media/mod.rs +0 -87
  84. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +0 -280
  85. data/vendor/html-to-markdown-rs/src/converter/metadata.rs +0 -220
  86. data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -156
  87. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +0 -516
  88. data/vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs +0 -201
  89. data/vendor/html-to-markdown-rs/src/converter/reference_collector.rs +0 -69
  90. data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +0 -269
  91. data/vendor/html-to-markdown-rs/src/converter/semantic/definition_list.rs +0 -266
  92. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +0 -391
  93. data/vendor/html-to-markdown-rs/src/converter/semantic/mod.rs +0 -112
  94. data/vendor/html-to-markdown-rs/src/converter/semantic/sectioning.rs +0 -85
  95. data/vendor/html-to-markdown-rs/src/converter/semantic/summary.rs +0 -324
  96. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +0 -8
  97. data/vendor/html-to-markdown-rs/src/converter/text/processing.rs +0 -56
  98. data/vendor/html-to-markdown-rs/src/converter/text_node.rs +0 -269
  99. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -151
  100. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +0 -74
  101. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +0 -271
  102. data/vendor/html-to-markdown-rs/src/converter/utility/mod.rs +0 -17
  103. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +0 -1002
  104. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +0 -126
  105. data/vendor/html-to-markdown-rs/src/converter/utility/siblings.rs +0 -97
  106. data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +0 -189
  107. data/vendor/html-to-markdown-rs/src/error.rs +0 -43
  108. data/vendor/html-to-markdown-rs/src/exports.rs +0 -24
  109. data/vendor/html-to-markdown-rs/src/inline_images.rs +0 -336
  110. data/vendor/html-to-markdown-rs/src/lib.rs +0 -139
  111. data/vendor/html-to-markdown-rs/src/metadata/collector.rs +0 -457
  112. data/vendor/html-to-markdown-rs/src/metadata/config.rs +0 -394
  113. data/vendor/html-to-markdown-rs/src/metadata/extraction.rs +0 -398
  114. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +0 -288
  115. data/vendor/html-to-markdown-rs/src/metadata/types.rs +0 -477
  116. data/vendor/html-to-markdown-rs/src/options/conversion.rs +0 -559
  117. data/vendor/html-to-markdown-rs/src/options/inline_image.rs +0 -111
  118. data/vendor/html-to-markdown-rs/src/options/mod.rs +0 -20
  119. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +0 -201
  120. data/vendor/html-to-markdown-rs/src/options/validation.rs +0 -416
  121. data/vendor/html-to-markdown-rs/src/prelude.rs +0 -1
  122. data/vendor/html-to-markdown-rs/src/rcdom.rs +0 -487
  123. data/vendor/html-to-markdown-rs/src/text.rs +0 -358
  124. data/vendor/html-to-markdown-rs/src/types/document.rs +0 -191
  125. data/vendor/html-to-markdown-rs/src/types/mod.rs +0 -17
  126. data/vendor/html-to-markdown-rs/src/types/result.rs +0 -54
  127. data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +0 -791
  128. data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +0 -483
  129. data/vendor/html-to-markdown-rs/src/types/tables.rs +0 -52
  130. data/vendor/html-to-markdown-rs/src/types/warnings.rs +0 -33
  131. data/vendor/html-to-markdown-rs/src/validation.rs +0 -158
  132. data/vendor/html-to-markdown-rs/src/visitor/default_impl.rs +0 -63
  133. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -41
  134. data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -370
  135. data/vendor/html-to-markdown-rs/src/visitor/types.rs +0 -319
  136. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +0 -1
  137. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/content.rs +0 -126
  138. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -27
  139. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/state.rs +0 -110
  140. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/traversal.rs +0 -250
  141. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +0 -597
  142. data/vendor/html-to-markdown-rs/src/wrapper/sync.rs +0 -413
  143. data/vendor/html-to-markdown-rs/src/wrapper/utils.rs +0 -290
  144. data/vendor/html-to-markdown-rs/src/wrapper.rs +0 -9
  145. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +0 -87
  146. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +0 -297
  147. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +0 -153
  148. data/vendor/html-to-markdown-rs/tests/exclude_selectors_test.rs +0 -132
  149. data/vendor/html-to-markdown-rs/tests/integration_test.rs +0 -631
  150. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +0 -49
  151. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +0 -58
  152. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +0 -17
  153. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +0 -41
  154. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +0 -40
  155. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +0 -26
  156. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +0 -185
  157. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +0 -100
  158. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +0 -133
  159. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +0 -144
  160. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +0 -62
  161. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +0 -128
  162. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +0 -20
  163. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +0 -62
  164. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +0 -68
  165. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +0 -87
  166. data/vendor/html-to-markdown-rs/tests/issue_336_regressions.rs +0 -74
  167. data/vendor/html-to-markdown-rs/tests/issue_339_regressions.rs +0 -92
  168. data/vendor/html-to-markdown-rs/tests/issue_347_regressions.rs +0 -154
  169. data/vendor/html-to-markdown-rs/tests/issue_348_visitor_plain.rs +0 -93
  170. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +0 -44
  171. data/vendor/html-to-markdown-rs/tests/lists_test.rs +0 -199
  172. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +0 -273
  173. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +0 -61
  174. data/vendor/html-to-markdown-rs/tests/reference_links_test.rs +0 -169
  175. data/vendor/html-to-markdown-rs/tests/sectioning_elements_test.rs +0 -137
  176. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +0 -522
  177. data/vendor/html-to-markdown-rs/tests/tables_test.rs +0 -743
  178. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +0 -41
  179. data/vendor/html-to-markdown-rs/tests/test_issue_187.rs +0 -204
  180. data/vendor/html-to-markdown-rs/tests/test_issue_218.rs +0 -68
  181. data/vendor/html-to-markdown-rs/tests/test_issue_277.rs +0 -77
  182. data/vendor/html-to-markdown-rs/tests/test_max_depth.rs +0 -82
  183. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +0 -45
  184. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +0 -396
  185. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +0 -34
  186. data/vendor/html-to-markdown-rs/tests/visitor_code_integration_test.rs +0 -121
  187. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +0 -1190
  188. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +0 -372
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e1296a51b6ca0f757a95487c567e9d1e0d2acda80a46656cd14002322a4b850b
4
- data.tar.gz: dca253ecef2f4818aa9e39e56d48ca5bb9ad35f92759580ee893d17eb455d494
3
+ metadata.gz: 99d668eec369d0e03a53d5ddf4192c1e896ad5940d669b598a73ed65b0329c82
4
+ data.tar.gz: 91e04cc771c7a7725a705258d4268eb16eda7541d7b591d480dfa53da1480015
5
5
  SHA512:
6
- metadata.gz: 989e6b723c3670894b9e87ad87c1d752c27e3917f5cc60f0ba8f40175ee97c35ef4a2a40da53ab77908c6febb44abc0fe66064470494e73be3a5aae61d8498ae
7
- data.tar.gz: 6275bc085c86f521114dbfd860e856dfdeac43ae92b755d81960e22e11a6d73ec4f25c89c837c37072d5500fd22af92f9507ace1673337708cac1a5376db926c
6
+ metadata.gz: 8b85249b540aab795dafc749a17b1bd00b1d58e4557cb4b524e833c0f1fc86c7b5a141956cf5a96a6fe8a14aaaea6aa44e9fd102d1e1337b2045dff829c4f302
7
+ data.tar.gz: d7c532c8de1ab8f7f5a0313f56c409d5f51ad4618552be841fe6a1e4b39b69a28d9c25788d423e52490130682a6c26f5d352b2db4d617fc7bb31ae2a9dc85c01
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright 2024-2025 Na'aman Hirschfeld
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,347 @@
1
+ # html-to-markdown
2
+
3
+ <div align="center" style="display: flex; flex-wrap: wrap; gap: 8px; justify-content: center; margin: 20px 0;">
4
+ <a href="https://github.com/kreuzberg-dev/alef">
5
+ <img src="https://img.shields.io/badge/Bindings-alef%20%D7%90-007ec6" alt="Bindings">
6
+ </a>
7
+ <!-- Language Bindings -->
8
+ <a href="https://crates.io/crates/html-to-markdown-rs">
9
+ <img src="https://img.shields.io/crates/v/html-to-markdown-rs?label=Rust&color=007ec6" alt="Rust">
10
+ </a>
11
+ <a href="https://pypi.org/project/html-to-markdown/">
12
+ <img src="https://img.shields.io/pypi/v/html-to-markdown?label=Python&color=007ec6" alt="Python">
13
+ </a>
14
+ <a href="https://www.npmjs.com/package/@kreuzberg/html-to-markdown-node">
15
+ <img src="https://img.shields.io/npm/v/@kreuzberg/html-to-markdown-node?label=Node.js&color=007ec6" alt="Node.js">
16
+ </a>
17
+ <a href="https://www.npmjs.com/package/@kreuzberg/html-to-markdown-wasm">
18
+ <img src="https://img.shields.io/npm/v/@kreuzberg/html-to-markdown-wasm?label=WASM&color=007ec6" alt="WASM">
19
+ </a>
20
+ <a href="https://central.sonatype.com/artifact/dev.kreuzberg/html-to-markdown">
21
+ <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/html-to-markdown?label=Java&color=007ec6" alt="Java">
22
+ </a>
23
+ <a href="https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v3/htmltomarkdown">
24
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/html-to-markdown?label=Go&color=007ec6&filter=v3*" alt="Go">
25
+ </a>
26
+ <a href="https://www.nuget.org/packages/KreuzbergDev.HtmlToMarkdown/">
27
+ <img src="https://img.shields.io/nuget/v/KreuzbergDev.HtmlToMarkdown?label=C%23&color=007ec6" alt="C#">
28
+ </a>
29
+ <a href="https://packagist.org/packages/kreuzberg-dev/html-to-markdown">
30
+ <img src="https://img.shields.io/packagist/v/kreuzberg-dev/html-to-markdown?label=PHP&color=007ec6" alt="PHP">
31
+ </a>
32
+ <a href="https://rubygems.org/gems/html-to-markdown">
33
+ <img src="https://img.shields.io/gem/v/html-to-markdown?label=Ruby&color=007ec6" alt="Ruby">
34
+ </a>
35
+ <a href="https://hex.pm/packages/html_to_markdown">
36
+ <img src="https://img.shields.io/hexpm/v/html_to_markdown?label=Elixir&color=007ec6" alt="Elixir">
37
+ </a>
38
+ <a href="https://kreuzberg-dev.r-universe.dev/htmltomarkdown">
39
+ <img src="https://img.shields.io/badge/R-htmltomarkdown-007ec6" alt="R">
40
+ </a>
41
+ <a href="https://pub.dev/packages/h2m">
42
+ <img src="https://img.shields.io/pub/v/h2m?label=Dart&color=007ec6" alt="Dart">
43
+ </a>
44
+ <a href="https://central.sonatype.com/artifact/dev.kreuzberg/html-to-markdown-android">
45
+ <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/html-to-markdown-android?label=Kotlin&color=007ec6" alt="Kotlin">
46
+ </a>
47
+ <a href="https://github.com/kreuzberg-dev/html-to-markdown/tree/main/packages/swift">
48
+ <img src="https://img.shields.io/badge/Swift-SPM-007ec6" alt="Swift">
49
+ </a>
50
+ <a href="https://github.com/kreuzberg-dev/html-to-markdown/tree/main/packages/zig">
51
+ <img src="https://img.shields.io/badge/Zig-package-007ec6" alt="Zig">
52
+ </a>
53
+ <a href="https://github.com/kreuzberg-dev/html-to-markdown/releases">
54
+ <img src="https://img.shields.io/badge/C-FFI-007ec6" alt="C FFI">
55
+ </a>
56
+
57
+ <!-- Project Info -->
58
+ <a href="https://github.com/kreuzberg-dev/html-to-markdown/blob/main/LICENSE">
59
+ <img src="https://img.shields.io/badge/License-MIT-007ec6" alt="License">
60
+ </a>
61
+ <a href="https://docs.html-to-markdown.kreuzberg.dev">
62
+ <img src="https://img.shields.io/badge/Docs-html--to--markdown-007ec6" alt="Documentation">
63
+ </a>
64
+ </div>
65
+
66
+ <div align="center" style="margin: 24px 0 0;">
67
+ <a href="https://kreuzberg.dev">
68
+ <img alt="html-to-markdown" src="https://github.com/user-attachments/assets/478a83da-237b-446b-b3a8-e564c13e00a8" />
69
+ </a>
70
+ </div>
71
+
72
+ <div align="center" style="display: flex; flex-wrap: wrap; gap: 12px; justify-content: center; margin: 28px 0 24px;">
73
+ <a href="https://discord.gg/xt9WY3GnKR">
74
+ <img height="22" src="https://img.shields.io/badge/Discord-Chat-007ec6?logo=discord&logoColor=white" alt="Join Discord">
75
+ </a>
76
+ <a href="https://docs.html-to-markdown.kreuzberg.dev/demo/">
77
+ <img height="22" src="https://img.shields.io/badge/Live%20Demo-Open-007ec6?logo=webassembly&logoColor=white" alt="Live Demo">
78
+ </a>
79
+ </div>
80
+
81
+ Blazing-fast HTML to Markdown conversion for Ruby, powered by the same Rust engine used by our Python, Node.js, WebAssembly, and PHP packages.
82
+ Ship identical Markdown across every runtime while enjoying native extension performance with Magnus bindings.
83
+
84
+ ## What This Package Provides
85
+
86
+ - **Same renderer as every binding** — output matches Rust, Python, Node.js, Ruby, PHP, Go, Java, .NET, Elixir, R, Dart, Swift, Zig, C FFI, and WASM.
87
+ - **Structured conversion result** — Markdown plus metadata, links, headings, images, tables, and warnings where the binding exposes them.
88
+ - **Production defaults** — HTML is parsed with the Rust core, sanitized by default, and rendered without runtime-specific Markdown drift.
89
+ - **Ruby extension** — Magnus-backed native extension with idiomatic Ruby objects and no separate service process.
90
+
91
+ ## Installation
92
+
93
+ ```bash
94
+ gem install html-to-markdown
95
+ ```
96
+
97
+ Requires Ruby 3.2+. The gem uses a native extension built with Magnus and is published for Linux and macOS.
98
+
99
+ ## Performance Snapshot
100
+
101
+ **Apple M4** · `convert()` · Real Wikipedia documents
102
+
103
+ | Document | Size | Latency | Throughput |
104
+ |----------|------|---------|------------|
105
+ | Lists (Timeline) | 129KB | 0.71ms | 182 MB/s |
106
+ | Tables (Countries) | 360KB | 2.15ms | 167 MB/s |
107
+ | Mixed (Python wiki) | 656KB | 4.89ms | 134 MB/s |
108
+
109
+ ## Quick Start
110
+
111
+ Basic conversion:
112
+
113
+ ```ruby
114
+ require 'html_to_markdown'
115
+
116
+ html = "<h1>Hello</h1><p>This is <strong>fast</strong>!</p>"
117
+ result = HtmlToMarkdown.convert(html)
118
+ markdown = result[:content]
119
+ ```
120
+
121
+ With conversion options:
122
+
123
+ ```ruby
124
+ require 'html_to_markdown'
125
+
126
+ html = "<h1>Hello</h1><p>This is <strong>fast</strong>!</p>"
127
+ result = HtmlToMarkdown.convert(html, heading_style: :atx, code_block_style: :fenced)
128
+ markdown = result[:content]
129
+ ```
130
+
131
+ ## Architecture
132
+
133
+ The converter routes each input through one of three tiers based on a fast prescan of the byte stream:
134
+
135
+ 1. **Tier-1 — single-pass byte scanner.** Handles 110+ HTML tags directly. Bails on any construct it cannot prove byte-equivalent to Tier-2.
136
+ 2. **Tier-2 — DOM walker.** Picks up Tier-1 bails and inputs the classifier rejected up front.
137
+ 3. **Tier-3 — standards-conformant parser.** Engaged for malformed HTML requiring full HTML5 repair.
138
+
139
+ The dispatcher is invisible to the caller. Output is byte-identical across tiers — enforced by a 116-snapshot oracle.
140
+
141
+ ## Capabilities
142
+
143
+ - **16 languages, one Rust core.** Rust, Python, Node.js, WASM, Java, Go, C#, PHP, Ruby, Elixir, R, Dart, Kotlin (Android), Swift, Zig, C ABI.
144
+ - **CommonMark-compatible Markdown** with GFM-style tables.
145
+ - **Djot output**: set `output_format = "djot"` (see Djot Output Format section below).
146
+ - **Real-HTML robust**: unclosed tags, CDATA, custom elements, malformed entities, nested tables, mixed encodings handled without losing content.
147
+ - **Metadata extraction**, **visitor API**, **inline images**, **configurable preprocessing presets**.
148
+ - **Per-group regression gates in CI**: every PR runs the bench harness against per-group thresholds.
149
+
150
+ ## API Reference
151
+
152
+ ### Core Function
153
+
154
+ **`convert(html, options: nil, visitor: nil) -> ConversionResult`**
155
+
156
+ Converts HTML to Markdown. Returns a `ConversionResult` hash with all results in a single call.
157
+
158
+ ```ruby
159
+ require 'html_to_markdown'
160
+
161
+ result = HtmlToMarkdown.convert(html)
162
+ markdown = result[:content] # Converted Markdown string
163
+ metadata = result[:metadata] # Metadata (when extract_metadata: true)
164
+ tables = result[:tables] # Structured table data (when extract_tables: true)
165
+ document = result[:document] # Document-level info
166
+ images = result[:images] # Extracted images
167
+ warnings = result[:warnings] # Any conversion warnings
168
+ ```
169
+
170
+ ### Options
171
+
172
+ **`ConversionOptions`** – Key configuration fields:
173
+
174
+ - `heading_style`: Heading format (`"underlined"` | `"atx"` | `"atx_closed"`) — default: `"underlined"`
175
+ - `list_indent_width`: Spaces per indent level — default: `2`
176
+ - `bullets`: Bullet characters cycle — default: `"*+-"`
177
+ - `wrap`: Enable text wrapping — default: `false`
178
+ - `wrap_width`: Wrap at column — default: `80`
179
+ - `code_language`: Default fenced code block language — default: none
180
+ - `extract_metadata`: Enable metadata extraction into `result[:metadata]` — default: `false`
181
+ - `extract_tables`: Enable structured table extraction into `result[:tables]` — default: `false`
182
+ - `output_format`: Output markup format (`"markdown"` | `"djot"` | `"plain"`) — default: `"markdown"`
183
+
184
+ ## Djot Output Format
185
+
186
+ The library supports converting HTML to [Djot](https://djot.net/), a lightweight markup language similar to Markdown but with a different syntax for some elements. Set `output_format` to `"djot"` to use this format.
187
+
188
+ ### Syntax Differences
189
+
190
+ | Element | Markdown | Djot |
191
+ | -------------- | ---------- | ---------- |
192
+ | Strong | `**text**` | `*text*` |
193
+ | Emphasis | `*text*` | `_text_` |
194
+ | Strikethrough | `~~text~~` | `{-text-}` |
195
+ | Inserted/Added | N/A | `{+text+}` |
196
+ | Highlighted | N/A | `{=text=}` |
197
+ | Subscript | N/A | `~text~` |
198
+ | Superscript | N/A | `^text^` |
199
+
200
+ ### Example Usage
201
+
202
+ ```ruby
203
+ require 'html_to_markdown'
204
+
205
+ html = "<p>This is <strong>bold</strong> and <em>italic</em> text.</p>"
206
+
207
+ # Default Markdown output
208
+ markdown = HtmlToMarkdown.convert(html)[:content]
209
+ # Result: "This is **bold** and *italic* text."
210
+
211
+ # Djot output
212
+ djot = HtmlToMarkdown.convert(html, output_format: 'djot')[:content]
213
+ # Result: "This is *bold* and _italic_ text."
214
+ ```
215
+
216
+ Djot's extended syntax allows you to express more semantic meaning in lightweight text, making it useful for documents that require strikethrough, insertion tracking, or mathematical notation.
217
+
218
+ ## Plain Text Output
219
+
220
+ Set `output_format` to `"plain"` to strip all markup and return only visible text. This bypasses the Markdown conversion pipeline entirely for maximum speed.
221
+
222
+ ```ruby
223
+ require 'html_to_markdown'
224
+
225
+ html = "<h1>Title</h1><p>This is <strong>bold</strong> and <em>italic</em> text.</p>"
226
+
227
+ plain = HtmlToMarkdown.convert(html, output_format: 'plain')[:content]
228
+ # Result: "Title\n\nThis is bold and italic text."
229
+ ```
230
+
231
+ Plain text mode is useful for search indexing, text extraction, and feeding content to LLMs.
232
+
233
+ ## Metadata Extraction
234
+
235
+ The metadata extraction feature enables comprehensive document analysis during conversion. Extract document properties, headers, links, images, and structured data in a single pass — all via the standard `convert()` function.
236
+
237
+ **Use Cases:**
238
+
239
+ - **SEO analysis** – Extract title, description, Open Graph tags, Twitter cards
240
+ - **Table of contents generation** – Build structured outlines from heading hierarchy
241
+ - **Content migration** – Document all external links and resources
242
+ - **Accessibility audits** – Check for images without alt text, empty links, invalid heading hierarchy
243
+ - **Link validation** – Classify and validate anchor, internal, external, email, and phone links
244
+
245
+ **Zero Overhead When Disabled:** Metadata extraction is opt-in. When enabled, it runs during the HTML parsing pass, so it adds negligible overhead. Pass `extract_metadata: true` in `ConversionOptions` to enable it; the result is available at `result[:metadata]`.
246
+
247
+ ### Example: Quick Start
248
+
249
+ ```ruby
250
+ require 'html_to_markdown'
251
+
252
+ html = '<h1>Article</h1><img src="test.jpg" alt="test">'
253
+ result = HtmlToMarkdown.convert(html, extract_metadata: true)
254
+
255
+ puts result[:content] # Converted Markdown
256
+ puts result[:metadata][:document][:title] # Document title
257
+ puts result[:metadata][:headers] # All h1-h6 elements
258
+ puts result[:metadata][:links] # All hyperlinks
259
+ puts result[:metadata][:images] # All images with alt text
260
+ puts result[:metadata][:structured_data] # JSON-LD, Microdata, RDFa
261
+ ```
262
+
263
+ ## Visitor Pattern
264
+
265
+ The visitor pattern enables custom HTML→Markdown conversion logic by providing callbacks for specific HTML elements during traversal. Pass a visitor to `convert()` via the `visitor:` keyword argument.
266
+
267
+ **Use Cases:**
268
+
269
+ - **Custom Markdown dialects** – Convert to Obsidian, Notion, or other flavors
270
+ - **Content filtering** – Remove tracking pixels, ads, or unwanted elements
271
+ - **URL rewriting** – Rewrite CDN URLs, add query parameters, validate links
272
+ - **Accessibility validation** – Check alt text, heading hierarchy, link text
273
+ - **Analytics** – Track element usage, link destinations, image sources
274
+
275
+ **Supported Visitor Methods:** 40+ callbacks for text, inline elements, links, images, headings, lists, blocks, and tables.
276
+
277
+ ### Example: Quick Start
278
+
279
+ ```ruby
280
+ require 'html_to_markdown'
281
+
282
+ class MyVisitor
283
+ def visit_link(ctx, href, text, title = nil)
284
+ # Rewrite CDN URLs
285
+ if href.start_with?('https://old-cdn.com')
286
+ href = href.sub('https://old-cdn.com', 'https://new-cdn.com')
287
+ end
288
+ { type: :custom, output: "[#{text}](#{href})" }
289
+ end
290
+
291
+ def visit_image(ctx, src, alt = nil, title = nil)
292
+ # Skip tracking pixels
293
+ src.include?('tracking') ? { type: :skip } : { type: :continue }
294
+ end
295
+ end
296
+
297
+ html = '<a href="https://old-cdn.com/file.pdf">Download</a>'
298
+ result = HtmlToMarkdown.convert(html, visitor: MyVisitor.new)
299
+ markdown = result[:content]
300
+ ```
301
+
302
+ ## Examples
303
+
304
+ ## Links
305
+
306
+ - **GitHub:** [github.com/kreuzberg-dev/html-to-markdown](https://github.com/kreuzberg-dev/html-to-markdown)
307
+ - **RubyGems:** [rubygems.org/gems/html-to-markdown](https://rubygems.org/gems/html-to-markdown)
308
+ - **Discord:** [discord.gg/xt9WY3GnKR](https://discord.gg/xt9WY3GnKR)
309
+
310
+ ## Part of Kreuzberg.dev
311
+
312
+ - [Kreuzberg](https://github.com/kreuzberg-dev/kreuzberg) — document intelligence: text, tables, metadata from 90+ formats with optional OCR.
313
+ - [Kreuzberg Cloud](https://github.com/kreuzberg-dev/kreuzberg-cloud) — managed extraction API with SDKs, dashboards, and observability.
314
+ - [kreuzcrawl](https://github.com/kreuzberg-dev/kreuzcrawl) — web crawling and scraping with HTML→Markdown and headless-Chrome fallback.
315
+ - [liter-llm](https://github.com/kreuzberg-dev/liter-llm) — universal LLM API client with native bindings for 14 languages and 143 providers.
316
+ - [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — tree-sitter grammars and code-intelligence primitives.
317
+ - [alef](https://github.com/kreuzberg-dev/alef) — the polyglot binding generator that produces every per-language binding across the 5 polyglot repos.
318
+ - [Discord](https://discord.gg/xt9WY3GnKR) — community, roadmap, announcements.
319
+
320
+ ## Contributing
321
+
322
+ We welcome contributions! Please see our [Contributing Guide](https://github.com/kreuzberg-dev/html-to-markdown/blob/main/CONTRIBUTING.md) for details on:
323
+
324
+ - Setting up the development environment
325
+ - Running tests locally
326
+ - Submitting pull requests
327
+ - Reporting issues
328
+
329
+ All contributions must follow our code quality standards (enforced via pre-commit hooks):
330
+
331
+ - Proper test coverage (Rust 95%+, language bindings 80%+)
332
+ - Formatting and linting checks
333
+ - Documentation for public APIs
334
+
335
+ ## License
336
+
337
+ MIT License – see [LICENSE](https://github.com/kreuzberg-dev/html-to-markdown/blob/main/LICENSE). Copyright © Kreuzberg, Inc.
338
+
339
+ ## Support
340
+
341
+ If you find this library useful, consider [sponsoring the project](https://github.com/sponsors/kreuzberg-dev).
342
+
343
+ Have questions or run into issues? We're here to help:
344
+
345
+ - **GitHub Issues:** [github.com/kreuzberg-dev/html-to-markdown/issues](https://github.com/kreuzberg-dev/html-to-markdown/issues)
346
+ - **Issues:** [github.com/kreuzberg-dev/html-to-markdown/issues](https://github.com/kreuzberg-dev/html-to-markdown/issues)
347
+ - **Discord Community:** [discord.gg/xt9WY3GnKR](https://discord.gg/xt9WY3GnKR)
data/Steepfile CHANGED
@@ -1,6 +1,14 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  target :lib do
4
- signature 'sig'
5
- check 'lib'
4
+ signature "sig"
5
+ check "lib"
6
+ # The generated `lib/html_to_markdown/native.rb` carries inline Sorbet
7
+ # `sig { ... }` blocks on tagged-enum variant Data classes. Sorbet's runtime
8
+ # provides those via `extend T::Sig`, but Steep does not understand the
9
+ # extension (it relies on RBS, not Sorbet sigs) and reports
10
+ # `Type `self` does not have method `sig`` on every block. RBS coverage
11
+ # for the same surface lives in `sig/types.rbs`, so we steer Steep to the
12
+ # RBS file by ignoring the .rb.
13
+ ignore "lib/html_to_markdown/native.rb"
6
14
  end
@@ -1,7 +1,7 @@
1
1
 
2
2
  [package]
3
3
  name = "html-to-markdown-rb"
4
- version = "3.4.0"
4
+ version = "3.6.0-rc.23"
5
5
  edition = "2024"
6
6
  license = "MIT"
7
7
  [workspace]
@@ -11,12 +11,13 @@ exclude = ["native"]
11
11
  crate-type = ["cdylib"]
12
12
 
13
13
  [dependencies]
14
- html-to-markdown-rs = { path = "../../vendor/html-to-markdown-rs", features = [
14
+ html-to-markdown-rs = { path = "../../../../crates/html-to-markdown", features = [
15
15
  "full",
16
16
  "metadata",
17
17
  "visitor",
18
18
  "serde",
19
19
  "inline-images",
20
+ "testkit",
20
21
  ] }
21
22
  magnus = "0.8"
22
23
  serde = { version = "1", features = ["derive"] }
@@ -1,11 +1,11 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'mkmf'
4
- require 'rb_sys/mkmf'
3
+ require "mkmf"
4
+ require "rb_sys/mkmf"
5
5
 
6
- default_profile = ENV.fetch('CARGO_PROFILE', 'release')
6
+ default_profile = ENV.fetch("CARGO_PROFILE", "release")
7
7
 
8
- create_rust_makefile('html_to_markdown_rb') do |config|
8
+ create_rust_makefile("html_to_markdown_rb") do |config|
9
9
  config.profile = default_profile.to_sym
10
- config.ext_dir = 'native'
10
+ config.ext_dir = "native"
11
11
  end