html-to-markdown 3.2.3 → 3.4.0.pre.rc.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. checksums.yaml +4 -4
  2. data/Steepfile +6 -0
  3. data/ext/html_to_markdown_rb/Cargo.toml +2 -2
  4. data/ext/html_to_markdown_rb/native/Cargo.toml +28 -0
  5. data/ext/html_to_markdown_rb/src/html-to-markdown/version.rb +10 -0
  6. data/ext/html_to_markdown_rb/src/html-to-markdown.rb +13 -0
  7. data/ext/html_to_markdown_rb/src/lib.rs +2088 -268
  8. data/lib/bin/html-to-markdown +0 -0
  9. data/lib/html_to_markdown/version.rb +1 -1
  10. data/lib/html_to_markdown.rb +5 -3
  11. data/sig/types.rbs +769 -0
  12. data/vendor/Cargo.toml +2 -2
  13. data/vendor/html-to-markdown-rs/Cargo.toml +1 -1
  14. data/vendor/html-to-markdown-rs/examples/basic.rs +1 -1
  15. data/vendor/html-to-markdown-rs/examples/table.rs +1 -1
  16. data/vendor/html-to-markdown-rs/examples/test_deser.rs +1 -1
  17. data/vendor/html-to-markdown-rs/examples/test_escape.rs +1 -1
  18. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +1 -1
  19. data/vendor/html-to-markdown-rs/examples/test_lists.rs +1 -1
  20. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +1 -1
  21. data/vendor/html-to-markdown-rs/examples/test_tables.rs +1 -1
  22. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +1 -1
  23. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +1 -1
  24. data/vendor/html-to-markdown-rs/src/convert_api.rs +15 -25
  25. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +1 -1
  26. data/vendor/html-to-markdown-rs/src/converter/block/container.rs +3 -3
  27. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -1
  28. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +6 -7
  29. data/vendor/html-to-markdown-rs/src/converter/block/horizontal_rule.rs +1 -1
  30. data/vendor/html-to-markdown-rs/src/converter/block/line_break.rs +1 -1
  31. data/vendor/html-to-markdown-rs/src/converter/block/mod.rs +0 -108
  32. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +1 -1
  33. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +1 -1
  34. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +1 -1
  35. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +1 -1
  36. data/vendor/html-to-markdown-rs/src/converter/block/table/layout.rs +1 -1
  37. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +2 -4
  38. data/vendor/html-to-markdown-rs/src/converter/block/unknown.rs +1 -1
  39. data/vendor/html-to-markdown-rs/src/converter/context.rs +10 -0
  40. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -1
  41. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
  42. data/vendor/html-to-markdown-rs/src/converter/form/mod.rs +1 -1
  43. data/vendor/html-to-markdown-rs/src/converter/format/mod.rs +0 -3
  44. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +1 -1
  45. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +1 -1
  46. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +2 -2
  47. data/vendor/html-to-markdown-rs/src/converter/inline/mod.rs +0 -1
  48. data/vendor/html-to-markdown-rs/src/converter/inline/ruby.rs +1 -1
  49. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/mod.rs +1 -1
  50. data/vendor/html-to-markdown-rs/src/converter/list/definition.rs +3 -3
  51. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +1 -1
  52. data/vendor/html-to-markdown-rs/src/converter/list/mod.rs +0 -1
  53. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +2 -2
  54. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +2 -2
  55. data/vendor/html-to-markdown-rs/src/converter/main.rs +57 -31
  56. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +8 -8
  57. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +1 -1
  58. data/vendor/html-to-markdown-rs/src/converter/media/mod.rs +1 -1
  59. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +5 -5
  60. data/vendor/html-to-markdown-rs/src/converter/mod.rs +6 -17
  61. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +64 -11
  62. data/vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs +80 -22
  63. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +1 -1
  64. data/vendor/html-to-markdown-rs/src/converter/semantic/mod.rs +1 -1
  65. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +0 -4
  66. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +5 -9
  67. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +3 -3
  68. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +10 -10
  69. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +13 -13
  70. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +4 -4
  71. data/vendor/html-to-markdown-rs/src/converter/utility/siblings.rs +6 -14
  72. data/vendor/html-to-markdown-rs/src/inline_images.rs +6 -0
  73. data/vendor/html-to-markdown-rs/src/lib.rs +17 -18
  74. data/vendor/html-to-markdown-rs/src/options/conversion.rs +31 -0
  75. data/vendor/html-to-markdown-rs/src/prelude.rs +1 -12
  76. data/vendor/html-to-markdown-rs/src/text.rs +0 -44
  77. data/vendor/html-to-markdown-rs/src/types/warnings.rs +2 -0
  78. data/vendor/html-to-markdown-rs/src/visitor/types.rs +5 -1
  79. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +4 -1
  80. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +1 -1
  81. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +1 -1
  82. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +1 -1
  83. data/vendor/html-to-markdown-rs/tests/exclude_selectors_test.rs +136 -0
  84. data/vendor/html-to-markdown-rs/tests/integration_test.rs +1 -1
  85. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +1 -1
  86. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +1 -1
  87. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +1 -1
  88. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +1 -1
  89. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +1 -1
  90. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +1 -1
  91. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +1 -1
  92. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +1 -1
  93. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +1 -1
  94. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +1 -1
  95. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +2 -2
  96. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +1 -1
  97. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +1 -1
  98. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +1 -1
  99. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +1 -1
  100. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +1 -1
  101. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +2 -2
  102. data/vendor/html-to-markdown-rs/tests/lists_test.rs +1 -1
  103. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +1 -1
  104. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +1 -1
  105. data/vendor/html-to-markdown-rs/tests/reference_links_test.rs +1 -1
  106. data/vendor/html-to-markdown-rs/tests/sectioning_elements_test.rs +137 -0
  107. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +1 -1
  108. data/vendor/html-to-markdown-rs/tests/tables_test.rs +2 -2
  109. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +1 -1
  110. data/vendor/html-to-markdown-rs/tests/test_issue_187.rs +5 -2
  111. data/vendor/html-to-markdown-rs/tests/test_issue_218.rs +4 -4
  112. data/vendor/html-to-markdown-rs/tests/test_issue_277.rs +77 -0
  113. data/vendor/html-to-markdown-rs/tests/test_max_depth.rs +82 -0
  114. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +1 -1
  115. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +4 -4
  116. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +1 -1
  117. data/vendor/html-to-markdown-rs/tests/visitor_code_integration_test.rs +6 -6
  118. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +103 -35
  119. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +1 -1
  120. metadata +21 -43
  121. data/.bundle/config +0 -2
  122. data/.gitignore +0 -3
  123. data/.rubocop.yml +0 -59
  124. data/Gemfile +0 -18
  125. data/Gemfile.lock +0 -173
  126. data/README.md +0 -331
  127. data/Rakefile +0 -26
  128. data/exe/html-to-markdown +0 -6
  129. data/ext/html_to_markdown_rb/src/html_to_markdown_rs/version.rb +0 -6
  130. data/ext/html_to_markdown_rb/src/html_to_markdown_rs.rb +0 -9
  131. data/html-to-markdown-rb.gemspec +0 -99
  132. data/lib/html_to_markdown_rs.rb +0 -3
  133. data/sig/html_to_markdown.rbs +0 -149
  134. data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +0 -94
  135. data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -86
  136. data/vendor/html-to-markdown-rs/src/safety.rs +0 -70
data/Gemfile.lock DELETED
@@ -1,173 +0,0 @@
1
- PATH
2
- remote: .
3
- specs:
4
- html-to-markdown (3.2.3)
5
- rb_sys (>= 0.9, < 1.0)
6
-
7
- GEM
8
- remote: https://rubygems.org/
9
- specs:
10
- ast (2.4.3)
11
- concurrent-ruby (1.3.6)
12
- csv (3.3.5)
13
- diff-lcs (1.6.2)
14
- ffi (1.17.4-arm64-darwin)
15
- ffi (1.17.4-x86_64-linux-gnu)
16
- fileutils (1.8.0)
17
- json (2.19.3)
18
- language_server-protocol (3.17.0.5)
19
- lint_roller (1.1.0)
20
- listen (3.10.0)
21
- logger
22
- rb-fsevent (~> 0.10, >= 0.10.3)
23
- rb-inotify (~> 0.9, >= 0.9.10)
24
- logger (1.7.0)
25
- parallel (2.0.1)
26
- parser (3.3.11.1)
27
- ast (~> 2.4.1)
28
- racc
29
- prism (1.9.0)
30
- racc (1.8.1)
31
- rainbow (3.1.1)
32
- rake (13.4.2)
33
- rake-compiler (1.3.1)
34
- rake
35
- rake-compiler-dock (1.11.0)
36
- rb-fsevent (0.11.2)
37
- rb-inotify (0.11.1)
38
- ffi (~> 1.0)
39
- rb_sys (0.9.126)
40
- json (>= 2)
41
- rake-compiler-dock (= 1.11.0)
42
- rbs (4.0.2)
43
- logger
44
- prism (>= 1.6.0)
45
- tsort
46
- regexp_parser (2.12.0)
47
- rspec (3.13.2)
48
- rspec-core (~> 3.13.0)
49
- rspec-expectations (~> 3.13.0)
50
- rspec-mocks (~> 3.13.0)
51
- rspec-core (3.13.6)
52
- rspec-support (~> 3.13.0)
53
- rspec-expectations (3.13.5)
54
- diff-lcs (>= 1.2.0, < 2.0)
55
- rspec-support (~> 3.13.0)
56
- rspec-mocks (3.13.8)
57
- diff-lcs (>= 1.2.0, < 2.0)
58
- rspec-support (~> 3.13.0)
59
- rspec-support (3.13.7)
60
- rubocop (1.86.1)
61
- json (~> 2.3)
62
- language_server-protocol (~> 3.17.0.2)
63
- lint_roller (~> 1.1.0)
64
- parallel (>= 1.10)
65
- parser (>= 3.3.0.2)
66
- rainbow (>= 2.2.2, < 4.0)
67
- regexp_parser (>= 2.9.3, < 3.0)
68
- rubocop-ast (>= 1.49.0, < 2.0)
69
- ruby-progressbar (~> 1.7)
70
- unicode-display_width (>= 2.4.0, < 4.0)
71
- rubocop-ast (1.49.1)
72
- parser (>= 3.3.7.2)
73
- prism (~> 1.7)
74
- rubocop-performance (1.26.1)
75
- lint_roller (~> 1.1)
76
- rubocop (>= 1.75.0, < 2.0)
77
- rubocop-ast (>= 1.47.1, < 2.0)
78
- rubocop-rspec (3.9.0)
79
- lint_roller (~> 1.1)
80
- rubocop (~> 1.81)
81
- ruby-progressbar (1.13.0)
82
- securerandom (0.4.1)
83
- steep (2.0.0)
84
- concurrent-ruby (>= 1.1.10)
85
- csv (>= 3.0.9)
86
- fileutils (>= 1.1.0)
87
- json (>= 2.1.0)
88
- language_server-protocol (>= 3.17.0.4, < 4.0)
89
- listen (~> 3.0)
90
- logger (>= 1.3.0)
91
- parser (>= 3.2)
92
- prism (>= 0.25.0)
93
- rainbow (>= 2.2.2, < 4.0)
94
- rbs (~> 4.0)
95
- securerandom (>= 0.1)
96
- strscan (>= 1.0.0)
97
- terminal-table (>= 2, < 5)
98
- uri (>= 0.12.0)
99
- strscan (3.1.8)
100
- terminal-table (4.0.0)
101
- unicode-display_width (>= 1.1.1, < 4)
102
- tsort (0.2.0)
103
- unicode-display_width (3.2.0)
104
- unicode-emoji (~> 4.1)
105
- unicode-emoji (4.2.0)
106
- uri (1.1.1)
107
-
108
- PLATFORMS
109
- arm64-darwin
110
- x86_64-linux
111
-
112
- DEPENDENCIES
113
- html-to-markdown!
114
- rake-compiler
115
- rb_sys
116
- rbs
117
- rspec
118
- rubocop
119
- rubocop-performance
120
- rubocop-rspec
121
- steep
122
-
123
- CHECKSUMS
124
- ast (2.4.3) sha256=954615157c1d6a382bc27d690d973195e79db7f55e9765ac7c481c60bdb4d383
125
- concurrent-ruby (1.3.6) sha256=6b56837e1e7e5292f9864f34b69c5a2cbc75c0cf5338f1ce9903d10fa762d5ab
126
- csv (3.3.5) sha256=6e5134ac3383ef728b7f02725d9872934f523cb40b961479f69cf3afa6c8e73f
127
- diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
128
- ffi (1.17.4-arm64-darwin) sha256=19071aaf1419251b0a46852abf960e77330a3b334d13a4ab51d58b31a937001b
129
- ffi (1.17.4-x86_64-linux-gnu) sha256=9d3db14c2eae074b382fa9c083fe95aec6e0a1451da249eab096c34002bc752d
130
- fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
131
- html-to-markdown (3.2.3)
132
- json (2.19.3) sha256=289b0bb53052a1fa8c34ab33cc750b659ba14a5c45f3fcf4b18762dc67c78646
133
- language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
134
- lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
135
- listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
136
- logger (1.7.0) sha256=196edec7cc44b66cfb40f9755ce11b392f21f7967696af15d274dde7edff0203
137
- parallel (2.0.1) sha256=337782d3e39f4121e67563bf91dd8ece67f48923d90698614773a0ec9a5b2c7d
138
- parser (3.3.11.1) sha256=d17ace7aabe3e72c3cc94043714be27cc6f852f104d81aa284c2281aecc65d54
139
- prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
140
- racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f
141
- rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a
142
- rake (13.4.2) sha256=cb825b2bd5f1f8e91ca37bddb4b9aaf345551b4731da62949be002fa89283701
143
- rake-compiler (1.3.1) sha256=6b351612b6e2d73ddd5563ee799bb58685176e05363db6758504bd11573d670a
144
- rake-compiler-dock (1.11.0) sha256=eab51f2cd533eb35cea6b624a75281f047123e70a64c58b607471bb49428f8c2
145
- rb-fsevent (0.11.2) sha256=43900b972e7301d6570f64b850a5aa67833ee7d87b458ee92805d56b7318aefe
146
- rb-inotify (0.11.1) sha256=a0a700441239b0ff18eb65e3866236cd78613d6b9f78fea1f9ac47a85e47be6e
147
- rb_sys (0.9.126) sha256=ba958e0b8b4b89eeae0b3d24b64c809eb2c37e0ab0773a49e9b1c2e22c95aef8
148
- rbs (4.0.2) sha256=af75671e66cd03434cc546622741ebf83f6197ec4328375805306330bf78ef25
149
- regexp_parser (2.12.0) sha256=35a916a1d63190ab5c9009457136ae5f3c0c7512d60291d0d1378ba18ce08ebb
150
- rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
151
- rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
152
- rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
153
- rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47
154
- rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c
155
- rubocop (1.86.1) sha256=44415f3f01d01a21e01132248d2fd0867572475b566ca188a0a42133a08d4531
156
- rubocop-ast (1.49.1) sha256=4412f3ee70f6fe4546cc489548e0f6fcf76cafcfa80fa03af67098ffed755035
157
- rubocop-performance (1.26.1) sha256=cd19b936ff196df85829d264b522fd4f98b6c89ad271fa52744a8c11b8f71834
158
- rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
159
- ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
160
- securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
161
- steep (2.0.0) sha256=6eb0ecc09637bbb54f0a5f2cf63daea6d3208ccace64b4f1107d976333605c30
162
- strscan (3.1.8) sha256=aae2db611a225559f21ffbb71765c9a4e60fd262534a9ea84f4f11c7f32f679e
163
- terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
164
- tsort (0.2.0) sha256=9650a793f6859a43b6641671278f79cfead60ac714148aabe4e3f0060480089f
165
- unicode-display_width (3.2.0) sha256=0cdd96b5681a5949cdbc2c55e7b420facae74c4aaf9a9815eee1087cb1853c42
166
- unicode-emoji (4.2.0) sha256=519e69150f75652e40bf736106cfbc8f0f73aa3fb6a65afe62fefa7f80b0f80f
167
- uri (1.1.1) sha256=379fa58d27ffb1387eaada68c749d1426738bd0f654d812fcc07e7568f5c57c6
168
-
169
- RUBY VERSION
170
- ruby 3.4.8
171
-
172
- BUNDLED WITH
173
- 4.0.3
data/README.md DELETED
@@ -1,331 +0,0 @@
1
- # html-to-markdown
2
-
3
- <div align="center" style="display: flex; flex-wrap: wrap; gap: 8px; justify-content: center; margin: 20px 0;">
4
- <!-- Language Bindings -->
5
- <a href="https://crates.io/crates/html-to-markdown-rs">
6
- <img src="https://img.shields.io/crates/v/html-to-markdown-rs?label=Rust&color=007ec6" alt="Rust">
7
- </a>
8
- <a href="https://pypi.org/project/html-to-markdown/">
9
- <img src="https://img.shields.io/pypi/v/html-to-markdown?label=Python&color=007ec6" alt="Python">
10
- </a>
11
- <a href="https://www.npmjs.com/package/@kreuzberg/html-to-markdown-node">
12
- <img src="https://img.shields.io/npm/v/@kreuzberg/html-to-markdown-node?label=Node.js&color=007ec6" alt="Node.js">
13
- </a>
14
- <a href="https://www.npmjs.com/package/@kreuzberg/html-to-markdown-wasm">
15
- <img src="https://img.shields.io/npm/v/@kreuzberg/html-to-markdown-wasm?label=WASM&color=007ec6" alt="WASM">
16
- </a>
17
- <a href="https://central.sonatype.com/artifact/dev.kreuzberg/html-to-markdown">
18
- <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/html-to-markdown?label=Java&color=007ec6" alt="Java">
19
- </a>
20
- <a href="https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v3/htmltomarkdown">
21
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/html-to-markdown?label=Go&color=007ec6&filter=v3.2.0" alt="Go">
22
- </a>
23
- <a href="https://www.nuget.org/packages/KreuzbergDev.HtmlToMarkdown/">
24
- <img src="https://img.shields.io/nuget/v/KreuzbergDev.HtmlToMarkdown?label=C%23&color=007ec6" alt="C#">
25
- </a>
26
- <a href="https://packagist.org/packages/kreuzberg-dev/html-to-markdown">
27
- <img src="https://img.shields.io/packagist/v/kreuzberg-dev/html-to-markdown?label=PHP&color=007ec6" alt="PHP">
28
- </a>
29
- <a href="https://rubygems.org/gems/html-to-markdown">
30
- <img src="https://img.shields.io/gem/v/html-to-markdown?label=Ruby&color=007ec6" alt="Ruby">
31
- </a>
32
- <a href="https://hex.pm/packages/html_to_markdown">
33
- <img src="https://img.shields.io/hexpm/v/html_to_markdown?label=Elixir&color=007ec6" alt="Elixir">
34
- </a>
35
- <a href="https://kreuzberg-dev.r-universe.dev/htmltomarkdown">
36
- <img src="https://img.shields.io/cran/v/htmltomarkdown?label=R&color=007ec6" alt="R">
37
- </a>
38
- <a href="https://github.com/kreuzberg-dev/html-to-markdown/releases">
39
- <img src="https://img.shields.io/badge/C-FFI-007ec6" alt="C">
40
- </a>
41
-
42
- <!-- Project Info -->
43
- <a href="https://docs.html-to-markdown.kreuzberg.dev">
44
- <img src="https://img.shields.io/badge/Docs-kreuzberg.dev-007ec6" alt="Documentation">
45
- </a>
46
- <a href="https://github.com/kreuzberg-dev/html-to-markdown/blob/main/LICENSE">
47
- <img src="https://img.shields.io/badge/License-MIT-blue.svg" alt="License">
48
- </a>
49
- </div>
50
-
51
- <img width="1128" height="191" alt="html-to-markdown" src="https://github.com/user-attachments/assets/419fc06c-8313-4324-b159-4b4d3cfce5c0" />
52
-
53
- <div align="center" style="margin-top: 20px;">
54
- <a href="https://discord.gg/pXxagNK2zN">
55
- <img height="22" src="https://img.shields.io/badge/Discord-Join%20our%20community-7289da?logo=discord&logoColor=white" alt="Discord">
56
- </a>
57
- </div>
58
-
59
- Blazing-fast HTML to Markdown conversion for Ruby, powered by the same Rust engine used by our Python, Node.js, WebAssembly, and PHP packages.
60
- Ship identical Markdown across every runtime while enjoying native extension performance with Magnus bindings.
61
-
62
-
63
- ## Installation
64
-
65
- ```bash
66
- gem install html-to-markdown
67
- ```
68
-
69
-
70
-
71
- Requires Ruby 3.2+ with Magnus native extension bindings. Published for Linux, macOS.
72
-
73
-
74
-
75
-
76
-
77
-
78
- ## Performance Snapshot
79
-
80
- **Apple M4** · `convert()` · Real Wikipedia documents
81
-
82
- | Document | Size | Latency | Throughput |
83
- |----------|------|---------|------------|
84
- | Lists (Timeline) | 129KB | 0.71ms | 182 MB/s |
85
- | Tables (Countries) | 360KB | 2.15ms | 167 MB/s |
86
- | Mixed (Python wiki) | 656KB | 4.89ms | 134 MB/s |
87
-
88
-
89
-
90
-
91
- ## Quick Start
92
-
93
- Basic conversion:
94
-
95
- ```ruby
96
- require 'html_to_markdown'
97
-
98
- html = "<h1>Hello</h1><p>This is <strong>fast</strong>!</p>"
99
- result = HtmlToMarkdown.convert(html)
100
- markdown = result[:content]
101
- ```
102
-
103
-
104
- With conversion options:
105
-
106
- ```ruby
107
- require 'html_to_markdown'
108
-
109
- html = "<h1>Hello</h1><p>This is <strong>fast</strong>!</p>"
110
- result = HtmlToMarkdown.convert(html, heading_style: :atx, code_block_style: :fenced)
111
- markdown = result[:content]
112
- ```
113
-
114
-
115
- ## API Reference
116
-
117
- ### Core Function
118
-
119
-
120
- **`convert(html, options: nil, visitor: nil) -> ConversionResult`**
121
-
122
- Converts HTML to Markdown. Returns a `ConversionResult` hash with all results in a single call.
123
-
124
- ```ruby
125
- require 'html_to_markdown'
126
-
127
- result = HtmlToMarkdown.convert(html)
128
- markdown = result[:content] # Converted Markdown string
129
- metadata = result[:metadata] # Metadata (when extract_metadata: true)
130
- tables = result[:tables] # Structured table data (when extract_tables: true)
131
- document = result[:document] # Document-level info
132
- images = result[:images] # Extracted images
133
- warnings = result[:warnings] # Any conversion warnings
134
- ```
135
-
136
-
137
-
138
- ### Options
139
-
140
- **`ConversionOptions`** – Key configuration fields:
141
-
142
- - `heading_style`: Heading format (`"underlined"` | `"atx"` | `"atx_closed"`) — default: `"underlined"`
143
- - `list_indent_width`: Spaces per indent level — default: `2`
144
- - `bullets`: Bullet characters cycle — default: `"*+-"`
145
- - `wrap`: Enable text wrapping — default: `false`
146
- - `wrap_width`: Wrap at column — default: `80`
147
- - `code_language`: Default fenced code block language — default: none
148
- - `extract_metadata`: Enable metadata extraction into `result.metadata` — default: `false`
149
- - `extract_tables`: Enable structured table extraction into `result.tables` — default: `false`
150
- - `output_format`: Output markup format (`"markdown"` | `"djot"` | `"plain"`) — default: `"markdown"`
151
-
152
- ## Djot Output Format
153
-
154
- The library supports converting HTML to [Djot](https://djot.net/), a lightweight markup language similar to Markdown but with a different syntax for some elements. Set `output_format` to `"djot"` to use this format.
155
-
156
- ### Syntax Differences
157
-
158
- | Element | Markdown | Djot |
159
- |---------|----------|------|
160
- | Strong | `**text**` | `*text*` |
161
- | Emphasis | `*text*` | `_text_` |
162
- | Strikethrough | `~~text~~` | `{-text-}` |
163
- | Inserted/Added | N/A | `{+text+}` |
164
- | Highlighted | N/A | `{=text=}` |
165
- | Subscript | N/A | `~text~` |
166
- | Superscript | N/A | `^text^` |
167
-
168
- ### Example Usage
169
-
170
-
171
-
172
- ```ruby
173
- require 'html_to_markdown'
174
-
175
- html = "<p>This is <strong>bold</strong> and <em>italic</em> text.</p>"
176
-
177
- # Default Markdown output
178
- markdown = HtmlToMarkdown.convert(html)
179
- # Result: "This is **bold** and *italic* text."
180
-
181
- # Djot output
182
- djot = HtmlToMarkdown.convert(html, output_format: 'djot')
183
- # Result: "This is *bold* and _italic_ text."
184
- ```
185
-
186
-
187
-
188
- Djot's extended syntax allows you to express more semantic meaning in lightweight text, making it useful for documents that require strikethrough, insertion tracking, or mathematical notation.
189
-
190
- ## Plain Text Output
191
-
192
- Set `output_format` to `"plain"` to strip all markup and return only visible text. This bypasses the Markdown conversion pipeline entirely for maximum speed.
193
-
194
-
195
-
196
- ```ruby
197
- require 'html_to_markdown'
198
-
199
- html = "<h1>Title</h1><p>This is <strong>bold</strong> and <em>italic</em> text.</p>"
200
-
201
- plain = HtmlToMarkdown.convert(html, output_format: 'plain')
202
- # Result: "Title\n\nThis is bold and italic text."
203
- ```
204
-
205
-
206
-
207
- Plain text mode is useful for search indexing, text extraction, and feeding content to LLMs.
208
-
209
-
210
-
211
- ## Metadata Extraction
212
-
213
- The metadata extraction feature enables comprehensive document analysis during conversion. Extract document properties, headers, links, images, and structured data in a single pass — all via the standard `convert()` function.
214
-
215
- **Use Cases:**
216
-
217
- - **SEO analysis** – Extract title, description, Open Graph tags, Twitter cards
218
- - **Table of contents generation** – Build structured outlines from heading hierarchy
219
- - **Content migration** – Document all external links and resources
220
- - **Accessibility audits** – Check for images without alt text, empty links, invalid heading hierarchy
221
- - **Link validation** – Classify and validate anchor, internal, external, email, and phone links
222
-
223
- **Zero Overhead When Disabled:** Metadata extraction adds negligible overhead and happens during the HTML parsing pass. Pass `extract_metadata: true` in `ConversionOptions` to enable it; the result is available at `result.metadata`.
224
-
225
- ### Example: Quick Start
226
-
227
-
228
-
229
- ```ruby
230
- require 'html_to_markdown'
231
-
232
- html = '<h1>Article</h1><img src="test.jpg" alt="test">'
233
- result = HtmlToMarkdown.convert(html, extract_metadata: true)
234
-
235
- puts result[:content] # Converted Markdown
236
- puts result[:metadata][:document][:title] # Document title
237
- puts result[:metadata][:headers] # All h1-h6 elements
238
- puts result[:metadata][:links] # All hyperlinks
239
- puts result[:metadata][:images] # All images with alt text
240
- puts result[:metadata][:structured_data] # JSON-LD, Microdata, RDFa
241
- ```
242
-
243
-
244
-
245
-
246
-
247
-
248
- ## Visitor Pattern
249
-
250
- The visitor pattern enables custom HTML→Markdown conversion logic by providing callbacks for specific HTML elements during traversal. Pass a visitor as the third argument to `convert()`.
251
-
252
- **Use Cases:**
253
-
254
- - **Custom Markdown dialects** – Convert to Obsidian, Notion, or other flavors
255
- - **Content filtering** – Remove tracking pixels, ads, or unwanted elements
256
- - **URL rewriting** – Rewrite CDN URLs, add query parameters, validate links
257
- - **Accessibility validation** – Check alt text, heading hierarchy, link text
258
- - **Analytics** – Track element usage, link destinations, image sources
259
-
260
- **Supported Visitor Methods:** 40+ callbacks for text, inline elements, links, images, headings, lists, blocks, and tables.
261
-
262
- ### Example: Quick Start
263
-
264
-
265
-
266
- ```ruby
267
- require 'html_to_markdown'
268
-
269
- class MyVisitor
270
- def visit_link(ctx, href, text, title = nil)
271
- # Rewrite CDN URLs
272
- if href.start_with?('https://old-cdn.com')
273
- href = href.sub('https://old-cdn.com', 'https://new-cdn.com')
274
- end
275
- { type: :custom, output: "[#{text}](#{href})" }
276
- end
277
-
278
- def visit_image(ctx, src, alt = nil, title = nil)
279
- # Skip tracking pixels
280
- src.include?('tracking') ? { type: :skip } : { type: :continue }
281
- end
282
- end
283
-
284
- html = '<a href="https://old-cdn.com/file.pdf">Download</a>'
285
- result = HtmlToMarkdown.convert(html, visitor: MyVisitor.new)
286
- markdown = result[:content]
287
- ```
288
-
289
-
290
-
291
-
292
- ## Examples
293
-
294
-
295
- ## Links
296
-
297
- - **GitHub:** [github.com/kreuzberg-dev/html-to-markdown](https://github.com/kreuzberg-dev/html-to-markdown)
298
-
299
- - **RubyGems:** [rubygems.org/gems/html-to-markdown](https://rubygems.org/gems/html-to-markdown)
300
-
301
- - **Kreuzberg Ecosystem:** [kreuzberg.dev](https://kreuzberg.dev)
302
- - **Discord:** [discord.gg/pXxagNK2zN](https://discord.gg/pXxagNK2zN)
303
-
304
- ## Contributing
305
-
306
- We welcome contributions! Please see our [Contributing Guide](https://github.com/kreuzberg-dev/html-to-markdown/blob/main/CONTRIBUTING.md) for details on:
307
-
308
- - Setting up the development environment
309
- - Running tests locally
310
- - Submitting pull requests
311
- - Reporting issues
312
-
313
- All contributions must follow our code quality standards (enforced via pre-commit hooks):
314
-
315
- - Proper test coverage (Rust 95%+, language bindings 80%+)
316
- - Formatting and linting checks
317
- - Documentation for public APIs
318
-
319
- ## License
320
-
321
- MIT License – see [LICENSE](https://github.com/kreuzberg-dev/html-to-markdown/blob/main/LICENSE).
322
-
323
- ## Support
324
-
325
- If you find this library useful, consider [sponsoring the project](https://github.com/sponsors/kreuzberg-dev).
326
-
327
- Have questions or run into issues? We're here to help:
328
-
329
- - **GitHub Issues:** [github.com/kreuzberg-dev/html-to-markdown/issues](https://github.com/kreuzberg-dev/html-to-markdown/issues)
330
- - **Discussions:** [github.com/kreuzberg-dev/html-to-markdown/discussions](https://github.com/kreuzberg-dev/html-to-markdown/discussions)
331
- - **Discord Community:** [discord.gg/pXxagNK2zN](https://discord.gg/pXxagNK2zN)
data/Rakefile DELETED
@@ -1,26 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'bundler/gem_tasks'
4
- require 'rake/extensiontask'
5
- require 'rspec/core/rake_task'
6
-
7
- GEMSPEC = Gem::Specification.load(File.expand_path('html_to_markdown_rs.gemspec', __dir__))
8
-
9
- Rake::ExtensionTask.new('html_to_markdown_rb', GEMSPEC) do |ext|
10
- ext.lib_dir = 'lib'
11
- ext.ext_dir = 'ext/html_to_markdown_rb'
12
- ext.cross_compile = true
13
- ext.cross_platform = %w[
14
- x86_64-linux
15
- aarch64-linux
16
- x86_64-darwin
17
- arm64-darwin
18
- x64-mingw32
19
- x64-mingw-ucrt
20
- ]
21
- end
22
-
23
- RSpec::Core::RakeTask.new(:spec)
24
-
25
- task spec: :compile
26
- task default: :spec
data/exe/html-to-markdown DELETED
@@ -1,6 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # frozen_string_literal: true
3
-
4
- require 'html_to_markdown/cli'
5
-
6
- exit HtmlToMarkdown::CLI.run(ARGV)
@@ -1,6 +0,0 @@
1
- # This file is auto-generated by alef. DO NOT EDIT.
2
- # frozen_string_literal: true
3
-
4
- module Html_to_markdown_rs
5
- VERSION = "3.2.0"
6
- end
@@ -1,9 +0,0 @@
1
- # This file is auto-generated by alef. DO NOT EDIT.
2
- # frozen_string_literal: true
3
-
4
- require_relative 'html_to_markdown_rs/version'
5
- require_relative 'html_to_markdown_rs/native'
6
-
7
- module Html_to_markdown_rs
8
- # Re-export all types and functions from native extension
9
- end
@@ -1,99 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require_relative 'lib/html_to_markdown/version'
4
-
5
- repo_root = File.expand_path('../..', __dir__)
6
- crate_prefix = 'packages/ruby/'
7
- git_cmd = %(git -C "#{repo_root}" ls-files -z #{crate_prefix})
8
- ruby_files =
9
- `#{git_cmd}`.split("\x0")
10
- .select { |path| path.start_with?(crate_prefix) }
11
- .map { |path| path.delete_prefix(crate_prefix) }
12
-
13
- fallback_files = Dir.chdir(__dir__) do
14
- Dir.glob(
15
- %w[
16
- README.md
17
- ext/**/*
18
- exe/*
19
- lib/**/*.rb
20
- lib/bin/*
21
- src/**/*.rs
22
- spec/**/*.rb
23
- sig/**/*.rbs
24
- ]
25
- )
26
- end
27
-
28
- # Vendor files: include vendored crates and workspace Cargo.toml
29
- vendor_files = Dir.chdir(__dir__) do
30
- Dir.glob('vendor/html-to-markdown-rs/**/*', File::FNM_DOTMATCH)
31
- .select { |f| File.file?(f) }
32
- .grep_v(%r{/target/})
33
- .grep_v(/\.(swp|bak|tmp)$/)
34
- end
35
-
36
- # Include vendor/Cargo.toml (workspace definition) if it exists
37
- workspace_toml = if File.exist?(File.join(__dir__, 'vendor/Cargo.toml'))
38
- ['vendor/Cargo.toml']
39
- else
40
- []
41
- end
42
-
43
- # When vendor exists, use ext/ files from filesystem (modified by vendor script)
44
- # instead of git (which has the unmodified Cargo.toml with workspace paths)
45
- ext_files_from_fs = Dir.chdir(__dir__) do
46
- Dir.glob('ext/**/*', File::FNM_DOTMATCH)
47
- .reject { |f| File.directory?(f) }
48
- .reject { |f| f.include?('/target/') }
49
- end
50
-
51
- # Include native artifacts (.so, .bundle, .dylib) if present (for platform gems)
52
- native_files = Dir.chdir(__dir__) do
53
- Dir.glob('lib/**/*.{so,bundle,dylib}')
54
- end
55
-
56
- files = if vendor_files.any?
57
- # Vendor exists: use ext/ from filesystem (has modified Cargo.toml)
58
- non_ext_ruby_files = (ruby_files.empty? ? fallback_files : ruby_files)
59
- .reject { |f| f.start_with?('ext/') }
60
- non_ext_ruby_files + ext_files_from_fs + vendor_files + workspace_toml + native_files
61
- else
62
- ruby_files.empty? ? fallback_files : ruby_files
63
- end
64
-
65
- files = files.uniq
66
-
67
- Gem::Specification.new do |spec|
68
- spec.name = 'html-to-markdown'
69
- spec.version = HtmlToMarkdown::VERSION
70
- spec.authors = ["Na'aman Hirschfeld"]
71
- spec.email = ['nhirschfeld@gmail.com']
72
-
73
- spec.summary = 'Blazing-fast HTML to Markdown conversion for Ruby, powered by Rust.'
74
- spec.description = <<~DESC.strip
75
- html-to-markdown is a native Ruby extension built on the shared Rust engine that powers the html-to-markdown project.
76
- It delivers identical HTML-to-Markdown output across languages, exposes inline image extraction, and ships with a CLI for automation workflows.
77
- DESC
78
- spec.homepage = 'https://github.com/kreuzberg-dev/html-to-markdown'
79
- spec.license = 'MIT'
80
-
81
- spec.required_ruby_version = Gem::Requirement.new('>= 3.2')
82
-
83
- spec.bindir = 'exe'
84
- spec.executables = ['html-to-markdown']
85
- spec.require_paths = ['lib']
86
-
87
- spec.files = files
88
- spec.extra_rdoc_files = ['README.md']
89
-
90
- spec.extensions = ['ext/html_to_markdown_rb/extconf.rb']
91
-
92
- spec.add_dependency 'rb_sys', '>= 0.9', '< 1.0'
93
- spec.metadata['rubygems_mfa_required'] = 'true'
94
- spec.metadata['homepage_uri'] = 'https://github.com/kreuzberg-dev/html-to-markdown'
95
- spec.metadata['source_code_uri'] = 'https://github.com/kreuzberg-dev/html-to-markdown'
96
- spec.metadata['bug_tracker_uri'] = 'https://github.com/kreuzberg-dev/html-to-markdown/issues'
97
- spec.metadata['changelog_uri'] = 'https://github.com/kreuzberg-dev/html-to-markdown/releases'
98
- spec.metadata['documentation_uri'] = 'https://github.com/kreuzberg-dev/html-to-markdown/blob/main/packages/ruby/README.md'
99
- end
@@ -1,3 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'html_to_markdown_rb'