html-to-markdown 2.27.2 → 2.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +8 -8
  3. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -1
  4. data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +6 -0
  5. data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +71 -0
  6. data/ext/html-to-markdown-rb/native/src/lib.rs +27 -1
  7. data/lib/html_to_markdown/version.rb +1 -1
  8. data/lib/html_to_markdown.rb +29 -0
  9. data/rust-vendor/getrandom/.cargo-checksum.json +1 -1
  10. data/rust-vendor/getrandom/.cargo_vcs_info.json +1 -1
  11. data/rust-vendor/getrandom/CHANGELOG.md +62 -43
  12. data/rust-vendor/getrandom/Cargo.lock +49 -56
  13. data/rust-vendor/getrandom/Cargo.toml +2 -2
  14. data/rust-vendor/getrandom/Cargo.toml.orig +2 -2
  15. data/rust-vendor/getrandom/src/backends/efi_rng.rs +8 -10
  16. data/rust-vendor/getrandom/src/backends/getentropy.rs +13 -4
  17. data/rust-vendor/getrandom/src/backends/linux_android_with_fallback.rs +10 -25
  18. data/rust-vendor/getrandom/src/backends/netbsd.rs +17 -25
  19. data/rust-vendor/getrandom/src/backends/rdrand.rs +15 -9
  20. data/rust-vendor/getrandom/src/backends/rndr.rs +2 -1
  21. data/rust-vendor/getrandom/src/backends/vxworks.rs +7 -3
  22. data/rust-vendor/getrandom/src/backends/windows.rs +21 -5
  23. data/rust-vendor/getrandom/src/utils/lazy_bool.rs +39 -0
  24. data/rust-vendor/getrandom/src/utils/lazy_ptr.rs +57 -0
  25. data/rust-vendor/html-to-markdown-rs/Cargo.toml +2 -2
  26. data/rust-vendor/html-to-markdown-rs/README.md +29 -0
  27. data/rust-vendor/html-to-markdown-rs/src/convert_api.rs +368 -0
  28. data/rust-vendor/html-to-markdown-rs/src/converter/main.rs +10 -5
  29. data/rust-vendor/html-to-markdown-rs/src/converter/text_node.rs +2 -1
  30. data/rust-vendor/html-to-markdown-rs/src/lib.rs +3 -0
  31. data/rust-vendor/html-to-markdown-rs/src/prelude.rs +3 -0
  32. data/rust-vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +82 -0
  33. data/rust-vendor/quote/.cargo-checksum.json +1 -1
  34. data/rust-vendor/quote/.cargo_vcs_info.json +1 -1
  35. data/rust-vendor/quote/.github/workflows/ci.yml +2 -2
  36. data/rust-vendor/quote/Cargo.lock +21 -21
  37. data/rust-vendor/quote/Cargo.toml +2 -2
  38. data/rust-vendor/quote/Cargo.toml.orig +2 -2
  39. data/rust-vendor/quote/README.md +0 -1
  40. data/rust-vendor/quote/src/lib.rs +1 -1
  41. data/rust-vendor/quote/src/to_tokens.rs +7 -0
  42. data/rust-vendor/quote/tests/ui/not-quotable.stderr +1 -1
  43. data/rust-vendor/quote/tests/ui/not-repeatable.stderr +3 -11
  44. data/rust-vendor/r-efi/.cargo-checksum.json +1 -1
  45. data/rust-vendor/r-efi/.cargo_vcs_info.json +1 -1
  46. data/rust-vendor/r-efi/AUTHORS +1 -0
  47. data/rust-vendor/r-efi/Cargo.lock +1 -1
  48. data/rust-vendor/r-efi/Cargo.toml +1 -3
  49. data/rust-vendor/r-efi/Cargo.toml.orig +1 -5
  50. data/rust-vendor/r-efi/NEWS.md +16 -0
  51. data/rust-vendor/r-efi/src/base.rs +1 -1
  52. data/rust-vendor/r-efi/src/lib.rs +27 -12
  53. data/rust-vendor/r-efi/src/protocols/absolute_pointer.rs +4 -4
  54. data/rust-vendor/r-efi/src/protocols/block_io.rs +8 -8
  55. data/rust-vendor/r-efi/src/protocols/bus_specific_driver_override.rs +2 -2
  56. data/rust-vendor/r-efi/src/protocols/debug_support.rs +10 -10
  57. data/rust-vendor/r-efi/src/protocols/debugport.rs +8 -8
  58. data/rust-vendor/r-efi/src/protocols/decompress.rs +4 -4
  59. data/rust-vendor/r-efi/src/protocols/device_path_from_text.rs +4 -4
  60. data/rust-vendor/r-efi/src/protocols/device_path_to_text.rs +4 -4
  61. data/rust-vendor/r-efi/src/protocols/device_path_utilities.rs +16 -16
  62. data/rust-vendor/r-efi/src/protocols/disk_io.rs +4 -4
  63. data/rust-vendor/r-efi/src/protocols/disk_io2.rs +8 -8
  64. data/rust-vendor/r-efi/src/protocols/driver_binding.rs +6 -6
  65. data/rust-vendor/r-efi/src/protocols/driver_diagnostics2.rs +2 -2
  66. data/rust-vendor/r-efi/src/protocols/driver_family_override.rs +2 -2
  67. data/rust-vendor/r-efi/src/protocols/file.rs +28 -28
  68. data/rust-vendor/r-efi/src/protocols/graphics_output.rs +6 -6
  69. data/rust-vendor/r-efi/src/protocols/hii_database.rs +24 -24
  70. data/rust-vendor/r-efi/src/protocols/hii_font.rs +8 -8
  71. data/rust-vendor/r-efi/src/protocols/hii_font_ex.rs +10 -10
  72. data/rust-vendor/r-efi/src/protocols/hii_string.rs +10 -10
  73. data/rust-vendor/r-efi/src/protocols/ip4.rs +16 -16
  74. data/rust-vendor/r-efi/src/protocols/ip6.rs +18 -18
  75. data/rust-vendor/r-efi/src/protocols/load_file.rs +2 -2
  76. data/rust-vendor/r-efi/src/protocols/loaded_image.rs +2 -2
  77. data/rust-vendor/r-efi/src/protocols/managed_network.rs +16 -16
  78. data/rust-vendor/r-efi/src/protocols/memory_attribute.rs +6 -6
  79. data/rust-vendor/r-efi/src/protocols/mp_services.rs +15 -15
  80. data/rust-vendor/r-efi/src/protocols/pci_io.rs +26 -26
  81. data/rust-vendor/r-efi/src/protocols/platform_driver_override.rs +6 -6
  82. data/rust-vendor/r-efi/src/protocols/rng.rs +4 -4
  83. data/rust-vendor/r-efi/src/protocols/service_binding.rs +4 -4
  84. data/rust-vendor/r-efi/src/protocols/shell.rs +81 -81
  85. data/rust-vendor/r-efi/src/protocols/shell_dynamic_command.rs +4 -4
  86. data/rust-vendor/r-efi/src/protocols/simple_file_system.rs +2 -2
  87. data/rust-vendor/r-efi/src/protocols/simple_network.rs +26 -26
  88. data/rust-vendor/r-efi/src/protocols/simple_text_input.rs +4 -4
  89. data/rust-vendor/r-efi/src/protocols/simple_text_input_ex.rs +11 -11
  90. data/rust-vendor/r-efi/src/protocols/simple_text_output.rs +18 -18
  91. data/rust-vendor/r-efi/src/protocols/tcp4.rs +20 -20
  92. data/rust-vendor/r-efi/src/protocols/tcp6.rs +18 -18
  93. data/rust-vendor/r-efi/src/protocols/timestamp.rs +3 -3
  94. data/rust-vendor/r-efi/src/protocols/udp4.rs +16 -16
  95. data/rust-vendor/r-efi/src/protocols/udp6.rs +14 -14
  96. data/rust-vendor/r-efi/src/system.rs +115 -115
  97. data/rust-vendor/r-efi/src/vendor/intel/console_control.rs +6 -6
  98. data/rust-vendor/r-efi-5.3.0/.cargo-checksum.json +1 -0
  99. data/rust-vendor/r-efi-5.3.0/.cargo_vcs_info.json +6 -0
  100. data/rust-vendor/r-efi-5.3.0/.github/workflows/publish.yml +39 -0
  101. data/rust-vendor/r-efi-5.3.0/.github/workflows/rust-tests.yml +125 -0
  102. data/rust-vendor/r-efi-5.3.0/AUTHORS +74 -0
  103. data/rust-vendor/r-efi-5.3.0/Cargo.lock +16 -0
  104. data/rust-vendor/r-efi-5.3.0/Cargo.toml +70 -0
  105. data/rust-vendor/r-efi-5.3.0/Cargo.toml.orig +51 -0
  106. data/rust-vendor/r-efi-5.3.0/Makefile +85 -0
  107. data/rust-vendor/r-efi-5.3.0/NEWS.md +301 -0
  108. data/rust-vendor/r-efi-5.3.0/README.md +99 -0
  109. data/rust-vendor/r-efi-5.3.0/examples/freestanding.rs +34 -0
  110. data/rust-vendor/r-efi-5.3.0/examples/gop-query.rs +188 -0
  111. data/rust-vendor/r-efi-5.3.0/examples/hello-world.rs +55 -0
  112. data/rust-vendor/r-efi-5.3.0/src/base.rs +993 -0
  113. data/rust-vendor/r-efi-5.3.0/src/hii.rs +1300 -0
  114. data/rust-vendor/r-efi-5.3.0/src/lib.rs +182 -0
  115. data/rust-vendor/r-efi-5.3.0/src/protocols/absolute_pointer.rs +69 -0
  116. data/rust-vendor/r-efi-5.3.0/src/protocols/block_io.rs +70 -0
  117. data/rust-vendor/r-efi-5.3.0/src/protocols/bus_specific_driver_override.rs +32 -0
  118. data/rust-vendor/r-efi-5.3.0/src/protocols/debug_support.rs +835 -0
  119. data/rust-vendor/r-efi-5.3.0/src/protocols/debugport.rs +42 -0
  120. data/rust-vendor/r-efi-5.3.0/src/protocols/decompress.rs +37 -0
  121. data/rust-vendor/r-efi-5.3.0/src/protocols/device_path.rs +82 -0
  122. data/rust-vendor/r-efi-5.3.0/src/protocols/device_path_from_text.rs +26 -0
  123. data/rust-vendor/r-efi-5.3.0/src/protocols/device_path_to_text.rs +30 -0
  124. data/rust-vendor/r-efi-5.3.0/src/protocols/device_path_utilities.rs +63 -0
  125. data/rust-vendor/r-efi-5.3.0/src/protocols/disk_io.rs +40 -0
  126. data/rust-vendor/r-efi-5.3.0/src/protocols/disk_io2.rs +58 -0
  127. data/rust-vendor/r-efi-5.3.0/src/protocols/driver_binding.rs +42 -0
  128. data/rust-vendor/r-efi-5.3.0/src/protocols/driver_diagnostics2.rs +38 -0
  129. data/rust-vendor/r-efi-5.3.0/src/protocols/driver_family_override.rs +23 -0
  130. data/rust-vendor/r-efi-5.3.0/src/protocols/file.rs +183 -0
  131. data/rust-vendor/r-efi-5.3.0/src/protocols/graphics_output.rs +103 -0
  132. data/rust-vendor/r-efi-5.3.0/src/protocols/hii_database.rs +299 -0
  133. data/rust-vendor/r-efi-5.3.0/src/protocols/hii_font.rs +87 -0
  134. data/rust-vendor/r-efi-5.3.0/src/protocols/hii_font_ex.rs +107 -0
  135. data/rust-vendor/r-efi-5.3.0/src/protocols/hii_package_list.rs +14 -0
  136. data/rust-vendor/r-efi-5.3.0/src/protocols/hii_string.rs +71 -0
  137. data/rust-vendor/r-efi-5.3.0/src/protocols/ip4.rs +202 -0
  138. data/rust-vendor/r-efi-5.3.0/src/protocols/ip6.rs +264 -0
  139. data/rust-vendor/r-efi-5.3.0/src/protocols/load_file.rs +26 -0
  140. data/rust-vendor/r-efi-5.3.0/src/protocols/load_file2.rs +15 -0
  141. data/rust-vendor/r-efi-5.3.0/src/protocols/loaded_image.rs +39 -0
  142. data/rust-vendor/r-efi-5.3.0/src/protocols/loaded_image_device_path.rs +13 -0
  143. data/rust-vendor/r-efi-5.3.0/src/protocols/managed_network.rs +147 -0
  144. data/rust-vendor/r-efi-5.3.0/src/protocols/memory_attribute.rs +40 -0
  145. data/rust-vendor/r-efi-5.3.0/src/protocols/mp_services.rs +121 -0
  146. data/rust-vendor/r-efi-5.3.0/src/protocols/pci_io.rs +203 -0
  147. data/rust-vendor/r-efi-5.3.0/src/protocols/platform_driver_override.rs +46 -0
  148. data/rust-vendor/r-efi-5.3.0/src/protocols/rng.rs +83 -0
  149. data/rust-vendor/r-efi-5.3.0/src/protocols/service_binding.rs +20 -0
  150. data/rust-vendor/r-efi-5.3.0/src/protocols/shell.rs +295 -0
  151. data/rust-vendor/r-efi-5.3.0/src/protocols/shell_dynamic_command.rs +33 -0
  152. data/rust-vendor/r-efi-5.3.0/src/protocols/shell_parameters.rs +23 -0
  153. data/rust-vendor/r-efi-5.3.0/src/protocols/simple_file_system.rs +26 -0
  154. data/rust-vendor/r-efi-5.3.0/src/protocols/simple_network.rs +196 -0
  155. data/rust-vendor/r-efi-5.3.0/src/protocols/simple_text_input.rs +38 -0
  156. data/rust-vendor/r-efi-5.3.0/src/protocols/simple_text_input_ex.rs +85 -0
  157. data/rust-vendor/r-efi-5.3.0/src/protocols/simple_text_output.rs +86 -0
  158. data/rust-vendor/r-efi-5.3.0/src/protocols/tcp4.rs +224 -0
  159. data/rust-vendor/r-efi-5.3.0/src/protocols/tcp6.rs +202 -0
  160. data/rust-vendor/r-efi-5.3.0/src/protocols/timestamp.rs +32 -0
  161. data/rust-vendor/r-efi-5.3.0/src/protocols/udp4.rs +151 -0
  162. data/rust-vendor/r-efi-5.3.0/src/protocols/udp6.rs +137 -0
  163. data/rust-vendor/r-efi-5.3.0/src/protocols.rs +54 -0
  164. data/rust-vendor/r-efi-5.3.0/src/system.rs +1130 -0
  165. data/rust-vendor/r-efi-5.3.0/src/vendor/intel/console_control.rs +37 -0
  166. data/rust-vendor/r-efi-5.3.0/src/vendor.rs +10 -0
  167. data/rust-vendor/tokio/.cargo-checksum.json +1 -1
  168. data/rust-vendor/tokio/.cargo_vcs_info.json +1 -1
  169. data/rust-vendor/tokio/CHANGELOG.md +94 -0
  170. data/rust-vendor/tokio/Cargo.lock +1549 -0
  171. data/rust-vendor/tokio/Cargo.toml +96 -83
  172. data/rust-vendor/tokio/Cargo.toml.orig +7 -7
  173. data/rust-vendor/tokio/README.md +1 -1
  174. data/rust-vendor/tokio/src/fs/open_options.rs +4 -1
  175. data/rust-vendor/tokio/src/fs/read.rs +4 -1
  176. data/rust-vendor/tokio/src/fs/write.rs +4 -1
  177. data/rust-vendor/tokio/src/io/async_write.rs +3 -4
  178. data/rust-vendor/tokio/src/io/poll_evented.rs +23 -1
  179. data/rust-vendor/tokio/src/io/stderr.rs +15 -1
  180. data/rust-vendor/tokio/src/io/stdout.rs +14 -0
  181. data/rust-vendor/tokio/src/io/util/async_write_ext.rs +2 -2
  182. data/rust-vendor/tokio/src/io/util/write_buf.rs +11 -2
  183. data/rust-vendor/tokio/src/lib.rs +12 -28
  184. data/rust-vendor/tokio/src/macros/select.rs +6 -8
  185. data/rust-vendor/tokio/src/net/tcp/socket.rs +25 -1
  186. data/rust-vendor/tokio/src/net/tcp/stream.rs +40 -1
  187. data/rust-vendor/tokio/src/process/unix/pidfd_reaper.rs +1 -41
  188. data/rust-vendor/tokio/src/runtime/blocking/pool.rs +18 -14
  189. data/rust-vendor/tokio/src/runtime/builder.rs +10 -4
  190. data/rust-vendor/tokio/src/runtime/handle.rs +3 -2
  191. data/rust-vendor/tokio/src/runtime/io/driver/uring.rs +49 -61
  192. data/rust-vendor/tokio/src/runtime/io/driver.rs +6 -5
  193. data/rust-vendor/tokio/src/runtime/mod.rs +20 -1
  194. data/rust-vendor/tokio/src/runtime/runtime.rs +71 -1
  195. data/rust-vendor/tokio/src/runtime/scheduler/current_thread/mod.rs +24 -8
  196. data/rust-vendor/tokio/src/runtime/scheduler/multi_thread/worker.rs +5 -0
  197. data/rust-vendor/tokio/src/runtime/task/core.rs +1 -0
  198. data/rust-vendor/tokio/src/runtime/task/join.rs +7 -3
  199. data/rust-vendor/tokio/src/runtime/task/list.rs +5 -3
  200. data/rust-vendor/tokio/src/runtime/task/mod.rs +0 -5
  201. data/rust-vendor/tokio/src/runtime/tests/loom_blocking.rs +39 -1
  202. data/rust-vendor/tokio/src/signal/mod.rs +6 -17
  203. data/rust-vendor/tokio/src/signal/registry.rs +1 -1
  204. data/rust-vendor/tokio/src/signal/unix.rs +24 -44
  205. data/rust-vendor/tokio/src/signal/windows/sys.rs +52 -64
  206. data/rust-vendor/tokio/src/signal/windows.rs +35 -23
  207. data/rust-vendor/tokio/src/sync/mpsc/mod.rs +3 -1
  208. data/rust-vendor/tokio/src/sync/oneshot.rs +13 -0
  209. data/rust-vendor/tokio/src/sync/rwlock.rs +4 -5
  210. data/rust-vendor/tokio/src/sync/tests/loom_oneshot.rs +27 -1
  211. data/rust-vendor/tokio/src/task/blocking.rs +16 -1
  212. data/rust-vendor/tokio/src/task/builder.rs +2 -2
  213. data/rust-vendor/tokio/src/task/mod.rs +1 -1
  214. data/rust-vendor/tokio/src/task/spawn.rs +8 -3
  215. data/rust-vendor/tokio/src/task/yield_now.rs +13 -23
  216. data/rust-vendor/tokio/src/time/clock.rs +62 -0
  217. data/rust-vendor/tokio/src/util/memchr.rs +32 -4
  218. data/rust-vendor/tokio/src/util/sharded_list.rs +6 -4
  219. data/rust-vendor/tokio/tests/fs_link.rs +54 -0
  220. data/rust-vendor/tokio/tests/io_async_fd_memory_leak.rs +209 -0
  221. data/rust-vendor/tokio/tests/io_write_buf.rs +56 -0
  222. data/rust-vendor/tokio/tests/process_issue_7144.rs +8 -0
  223. data/rust-vendor/tokio/tests/rt_basic.rs +41 -0
  224. data/rust-vendor/tokio/tests/rt_common_before_park.rs +92 -0
  225. data/rust-vendor/tokio/tests/rt_metrics.rs +1 -1
  226. data/rust-vendor/tokio/tests/rt_panic.rs +12 -0
  227. data/rust-vendor/tokio/tests/rt_shutdown_err.rs +82 -0
  228. data/rust-vendor/tokio/tests/rt_threaded.rs +49 -1
  229. data/rust-vendor/tokio/tests/rt_unstable_metrics.rs +32 -0
  230. data/rust-vendor/tokio/tests/tcp_connect.rs +2 -3
  231. data/rust-vendor/tokio/tests/tcp_shutdown.rs +1 -3
  232. data/rust-vendor/tokio/tests/tcp_socket.rs +3 -4
  233. data/rust-vendor/tokio/tests/tcp_stream.rs +3 -0
  234. data/sig/html_to_markdown.rbs +46 -0
  235. data/spec/convert_with_tables_spec.rb +194 -0
  236. metadata +80 -3
  237. data/rust-vendor/getrandom/src/utils/lazy.rs +0 -64
@@ -49,9 +49,25 @@ const TRUE: BOOL = 1;
49
49
  #[inline]
50
50
  pub fn fill_inner(dest: &mut [MaybeUninit<u8>]) -> Result<(), Error> {
51
51
  let result = unsafe { ProcessPrng(dest.as_mut_ptr().cast::<u8>(), dest.len()) };
52
- // `ProcessPrng` is documented to always return TRUE. All potential errors are handled
53
- // during loading of `BCryptPrimitive.dll`. See the "Process base PRNG" section
54
- // in the aforementioned Windows RNG whitepaper for more information.
55
- debug_assert!(result == TRUE);
56
- Ok(())
52
+ // On Windows 10 and later, `ProcessPrng` is documented to always return
53
+ // TRUE. All potential errors are handled during loading of
54
+ // `BCryptPrimitive.dll`. See the "Process base PRNG" section in the
55
+ // aforementioned Windows RNG whitepaper for more information.
56
+ //
57
+ // The Zig project found that Windows 8 implements `ProcessPrng` in a way
58
+ // that may fail and return a value other than `TRUE`. Although recent
59
+ // versions of the Rust toolchain do not support Windows 8, we cannot rule
60
+ // out this backend being used in an executable that will run on Windows 8
61
+ // (e.g. a fork of this crate backported to have an MSRV lower than 1.76,
62
+ // or a fork of the Rust toolchain to support older Windows versions, or
63
+ // other build hacks).
64
+ //
65
+ // Further, Wine's implementation of `ProcessPrng` CAN fail, in every
66
+ // version through Wine 11.2, and this may be the case for any other Windows
67
+ // emulation layers.
68
+ if result == TRUE {
69
+ Ok(())
70
+ } else {
71
+ Err(Error::UNEXPECTED)
72
+ }
57
73
  }
@@ -0,0 +1,39 @@
1
+ use core::sync::atomic::{AtomicU8, Ordering::Relaxed};
2
+
3
+ /// Lazily caches a `bool` in an `AtomicU8`.
4
+ ///
5
+ /// Initialization is intentionally unsynchronized: concurrent callers may race
6
+ /// and run `init` more than once. Once a value is produced, it is cached and
7
+ /// reused by subsequent calls.
8
+ ///
9
+ /// Uses `Relaxed` ordering because this helper only publishes the cached
10
+ /// value itself.
11
+ pub(crate) struct LazyBool(AtomicU8);
12
+
13
+ impl LazyBool {
14
+ const UNINIT: u8 = u8::MAX;
15
+
16
+ /// Create new `LazyBool`.
17
+ pub const fn new() -> Self {
18
+ Self(AtomicU8::new(Self::UNINIT))
19
+ }
20
+
21
+ /// Call the `init` closure and return the result after caching it.
22
+ #[cold]
23
+ fn cold_init(&self, init: impl FnOnce() -> bool) -> bool {
24
+ let val = u8::from(init());
25
+ self.0.store(val, Relaxed);
26
+ val != 0
27
+ }
28
+
29
+ /// Retrieve the cached value if it was already initialized or call the `init` closure
30
+ /// and return the result after caching it.
31
+ #[inline]
32
+ pub fn unsync_init(&self, init: impl FnOnce() -> bool) -> bool {
33
+ let val = self.0.load(Relaxed);
34
+ if val == Self::UNINIT {
35
+ return self.cold_init(init);
36
+ }
37
+ val != 0
38
+ }
39
+ }
@@ -0,0 +1,57 @@
1
+ use core::{
2
+ convert::Infallible,
3
+ ptr::{self, NonNull},
4
+ sync::atomic::{AtomicPtr, Ordering::Relaxed},
5
+ };
6
+
7
+ /// Lazily caches a non-null pointer in an `AtomicPtr`.
8
+ ///
9
+ /// Initialization is intentionally unsynchronized: concurrent callers may race
10
+ /// and run `init` more than once. Once a value is produced, it is cached and
11
+ /// reused by subsequent calls.
12
+ ///
13
+ /// For fallible initialization (`try_unsync_init`), only successful values are
14
+ /// cached; errors are returned to the caller and are not cached.
15
+ ///
16
+ /// Uses `Ordering::Relaxed` because this helper only publishes the cached
17
+ /// pointer value. Callers must not rely on this mechanism to synchronize
18
+ /// unrelated memory side effects performed by `init`.
19
+ pub(crate) struct LazyPtr<T>(AtomicPtr<T>);
20
+
21
+ impl<T> LazyPtr<T> {
22
+ /// Create new `LazyPtr`.
23
+ pub const fn new() -> Self {
24
+ Self(AtomicPtr::new(ptr::null_mut()))
25
+ }
26
+
27
+ /// Call the `init` closure and return the result after caching it in the case of success.
28
+ #[cold]
29
+ fn cold_init<E>(&self, init: impl FnOnce() -> Result<NonNull<T>, E>) -> Result<NonNull<T>, E> {
30
+ let val = init()?;
31
+ self.0.store(val.as_ptr(), Relaxed);
32
+ Ok(val)
33
+ }
34
+
35
+ /// Retrieve the cached value if it was already initialized or call the potentially fallible
36
+ /// `init` closure and return the result after caching it in the case of success.
37
+ #[inline]
38
+ pub fn try_unsync_init<E>(
39
+ &self,
40
+ init: impl FnOnce() -> Result<NonNull<T>, E>,
41
+ ) -> Result<NonNull<T>, E> {
42
+ let p = self.0.load(Relaxed);
43
+ match NonNull::new(p) {
44
+ Some(val) => Ok(val),
45
+ None => self.cold_init(init),
46
+ }
47
+ }
48
+
49
+ /// Retrieve the cached value if it was already initialized or call the `init` closure
50
+ /// and return the result after caching it.
51
+ #[inline]
52
+ #[allow(dead_code, reason = "Some modules use only `try_unsync_init`")]
53
+ pub fn unsync_init(&self, init: impl FnOnce() -> NonNull<T>) -> NonNull<T> {
54
+ let Ok(p): Result<_, Infallible> = self.try_unsync_init(|| Ok(init()));
55
+ p
56
+ }
57
+ }
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rs"
3
- version = "2.27.2"
3
+ version = "2.28.0"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
6
6
  license = "MIT"
@@ -40,7 +40,7 @@ serde = { version = "1.0", features = ["derive"], optional = true }
40
40
  serde_json = { version = "1.0", optional = true }
41
41
  async-trait = { version = "0.1", optional = true }
42
42
  futures = { version = "0.3", optional = true }
43
- tokio = { version = "1.49", features = ["rt-multi-thread", "sync"], optional = true }
43
+ tokio = { version = "1.50", features = ["rt-multi-thread", "sync"], optional = true }
44
44
 
45
45
  [dev-dependencies]
46
46
  serde = { version = "1.0", features = ["derive"] }
@@ -148,6 +148,35 @@ for (i, img) in extraction.inline_images.iter().enumerate() {
148
148
  }
149
149
  ```
150
150
 
151
+ ## Table Extraction
152
+
153
+ Extract structured table data alongside the Markdown conversion. Each table found in the HTML is returned with its cell contents, header row flags, and rendered Markdown output.
154
+
155
+ Requires the `visitor` feature.
156
+
157
+ ```rust
158
+ use html_to_markdown_rs::convert_with_tables;
159
+
160
+ let html = r#"
161
+ <table>
162
+ <tr><th>Name</th><th>Age</th></tr>
163
+ <tr><td>Alice</td><td>30</td></tr>
164
+ <tr><td>Bob</td><td>25</td></tr>
165
+ </table>
166
+ "#;
167
+
168
+ let result = convert_with_tables(html, None, None)?;
169
+
170
+ println!("{}", result.content);
171
+ for table in &result.tables {
172
+ println!("Table with {} rows:", table.cells.len());
173
+ for (i, row) in table.cells.iter().enumerate() {
174
+ let prefix = if table.is_header_row[i] { "Header" } else { "Row" };
175
+ println!(" {}: {:?}", prefix, row);
176
+ }
177
+ }
178
+ ```
179
+
151
180
  ## Other Language Bindings
152
181
 
153
182
  This is the core Rust library. For other languages:
@@ -681,3 +681,371 @@ pub fn metadata_config_from_json(json: &str) -> Result<MetadataConfig> {
681
681
  let update: crate::MetadataConfigUpdate = parse_json(json)?;
682
682
  Ok(MetadataConfig::from(update))
683
683
  }
684
+
685
+ // ============================================================================
686
+ // Table Extraction API (requires visitor feature)
687
+ // ============================================================================
688
+
689
+ /// Extracted table data from HTML conversion.
690
+ ///
691
+ /// Each instance represents a single `<table>` element found during conversion.
692
+ /// Tables are collected in document order.
693
+ #[cfg(feature = "visitor")]
694
+ #[derive(Debug, Clone)]
695
+ #[cfg_attr(
696
+ any(feature = "serde", feature = "metadata"),
697
+ derive(serde::Serialize, serde::Deserialize)
698
+ )]
699
+ pub struct TableData {
700
+ /// Table cells organized as rows x columns. Cell contents are already
701
+ /// converted to the target output format (markdown/djot/plain).
702
+ pub cells: Vec<Vec<String>>,
703
+ /// Complete rendered table in the target output format.
704
+ pub markdown: String,
705
+ /// Per-row flag indicating whether the row was inside `<thead>`.
706
+ pub is_header_row: Vec<bool>,
707
+ }
708
+
709
+ /// Result of HTML-to-markdown conversion with extracted table data.
710
+ #[cfg(feature = "visitor")]
711
+ #[derive(Debug, Clone)]
712
+ #[cfg_attr(
713
+ any(feature = "serde", feature = "metadata"),
714
+ derive(serde::Serialize, serde::Deserialize)
715
+ )]
716
+ pub struct ConversionWithTables {
717
+ /// Converted markdown/djot/plain text content.
718
+ pub content: String,
719
+ /// Extended metadata (if metadata extraction was requested).
720
+ #[cfg(feature = "metadata")]
721
+ pub metadata: Option<ExtendedMetadata>,
722
+ /// All tables found in the HTML, in document order.
723
+ pub tables: Vec<TableData>,
724
+ }
725
+
726
+ #[cfg(feature = "visitor")]
727
+ #[derive(Debug)]
728
+ struct TableCollector {
729
+ tables: Vec<TableData>,
730
+ current_rows: Vec<Vec<String>>,
731
+ current_is_header: Vec<bool>,
732
+ }
733
+
734
+ #[cfg(feature = "visitor")]
735
+ impl TableCollector {
736
+ fn new() -> Self {
737
+ Self {
738
+ tables: Vec::new(),
739
+ current_rows: Vec::new(),
740
+ current_is_header: Vec::new(),
741
+ }
742
+ }
743
+ }
744
+
745
+ #[cfg(feature = "visitor")]
746
+ impl visitor::HtmlVisitor for TableCollector {
747
+ fn visit_table_start(&mut self, _ctx: &visitor::NodeContext) -> visitor::VisitResult {
748
+ self.current_rows.clear();
749
+ self.current_is_header.clear();
750
+ visitor::VisitResult::Continue
751
+ }
752
+
753
+ fn visit_table_row(
754
+ &mut self,
755
+ _ctx: &visitor::NodeContext,
756
+ cells: &[String],
757
+ is_header: bool,
758
+ ) -> visitor::VisitResult {
759
+ self.current_rows.push(cells.to_vec());
760
+ self.current_is_header.push(is_header);
761
+ visitor::VisitResult::Continue
762
+ }
763
+
764
+ fn visit_table_end(&mut self, _ctx: &visitor::NodeContext, output: &str) -> visitor::VisitResult {
765
+ if !self.current_rows.is_empty() {
766
+ self.tables.push(TableData {
767
+ cells: std::mem::take(&mut self.current_rows),
768
+ markdown: output.to_string(),
769
+ is_header_row: std::mem::take(&mut self.current_is_header),
770
+ });
771
+ }
772
+ visitor::VisitResult::Continue
773
+ }
774
+ }
775
+
776
+ /// Convert HTML to markdown/djot/plain text with structured table extraction.
777
+ ///
778
+ /// Combines conversion, optional metadata extraction, and table data collection
779
+ /// in a single DOM walk. Each table found in the HTML is returned with its
780
+ /// cell contents (already converted to the target format) and rendered output.
781
+ ///
782
+ /// # Arguments
783
+ ///
784
+ /// * `html` - The HTML string to convert
785
+ /// * `options` - Optional conversion options (defaults to `ConversionOptions::default()`)
786
+ /// * `metadata_cfg` - Optional metadata extraction configuration (requires `metadata` feature)
787
+ ///
788
+ /// # Example
789
+ ///
790
+ /// ```ignore
791
+ /// use html_to_markdown_rs::convert_with_tables;
792
+ ///
793
+ /// let html = r#"<table><tr><th>Name</th><th>Age</th></tr><tr><td>Alice</td><td>30</td></tr></table>"#;
794
+ /// let result = convert_with_tables(html, None, None).unwrap();
795
+ /// assert_eq!(result.tables.len(), 1);
796
+ /// assert_eq!(result.tables[0].cells[0], vec!["Name", "Age"]);
797
+ /// ```
798
+ ///
799
+ /// # Errors
800
+ ///
801
+ /// Returns an error if HTML parsing fails or if the input contains invalid UTF-8.
802
+ #[cfg(feature = "visitor")]
803
+ pub fn convert_with_tables(
804
+ html: &str,
805
+ options: Option<ConversionOptions>,
806
+ #[cfg(feature = "metadata")] metadata_cfg: Option<MetadataConfig>,
807
+ #[cfg(not(feature = "metadata"))] _metadata_cfg: Option<()>,
808
+ ) -> Result<ConversionWithTables> {
809
+ use std::cell::RefCell;
810
+ use std::rc::Rc;
811
+
812
+ let collector = Rc::new(RefCell::new(TableCollector::new()));
813
+ let visitor_handle: visitor::VisitorHandle = Rc::clone(&collector) as visitor::VisitorHandle;
814
+
815
+ #[cfg(feature = "metadata")]
816
+ let result = {
817
+ let metadata_config = metadata_cfg.unwrap_or_default();
818
+ let (content, metadata) = convert_with_metadata(html, options, metadata_config, Some(visitor_handle))?;
819
+ let tables = Rc::try_unwrap(collector)
820
+ .map_err(|_| ConversionError::Other("failed to recover table collector state".into()))?
821
+ .into_inner()
822
+ .tables;
823
+ ConversionWithTables {
824
+ content,
825
+ metadata: Some(metadata),
826
+ tables,
827
+ }
828
+ };
829
+
830
+ #[cfg(not(feature = "metadata"))]
831
+ let result = {
832
+ let content = convert_with_visitor(html, options, Some(visitor_handle))?;
833
+ let tables = Rc::try_unwrap(collector)
834
+ .map_err(|_| ConversionError::Other("failed to recover table collector state".into()))?
835
+ .into_inner()
836
+ .tables;
837
+ ConversionWithTables { content, tables }
838
+ };
839
+
840
+ Ok(result)
841
+ }
842
+
843
+ #[cfg(test)]
844
+ #[cfg(feature = "visitor")]
845
+ mod table_extraction_tests {
846
+ use super::*;
847
+
848
+ fn tables_from_html(html: &str) -> ConversionWithTables {
849
+ convert_with_tables(
850
+ html,
851
+ None,
852
+ #[cfg(feature = "metadata")]
853
+ None,
854
+ #[cfg(not(feature = "metadata"))]
855
+ None,
856
+ )
857
+ .unwrap()
858
+ }
859
+
860
+ #[test]
861
+ fn test_convert_with_tables_basic() {
862
+ let html = r#"<table><tr><th>Name</th><th>Age</th></tr><tr><td>Alice</td><td>30</td></tr></table>"#;
863
+ let result = tables_from_html(html);
864
+ assert_eq!(result.tables.len(), 1);
865
+ assert_eq!(result.tables[0].cells.len(), 2);
866
+ assert_eq!(result.tables[0].cells[0], vec!["Name", "Age"]);
867
+ assert_eq!(result.tables[0].cells[1], vec!["Alice", "30"]);
868
+ assert!(result.tables[0].is_header_row[0]);
869
+ assert!(!result.tables[0].is_header_row[1]);
870
+ assert!(result.tables[0].markdown.contains('|'));
871
+ }
872
+
873
+ #[test]
874
+ fn test_convert_with_tables_nested() {
875
+ let html = r#"
876
+ <table>
877
+ <tr><th>Category</th><th>Details</th></tr>
878
+ <tr>
879
+ <td>Project Alpha</td>
880
+ <td>
881
+ <table>
882
+ <tr><th>Task</th><th>Status</th></tr>
883
+ <tr><td>001</td><td>Done</td></tr>
884
+ </table>
885
+ </td>
886
+ </tr>
887
+ </table>"#;
888
+ let result = tables_from_html(html);
889
+ assert!(
890
+ result.tables.len() >= 2,
891
+ "Expected at least 2 tables (outer + nested), got {}",
892
+ result.tables.len()
893
+ );
894
+ }
895
+
896
+ #[test]
897
+ fn test_convert_with_tables_no_tables() {
898
+ let html = "<p>No tables here</p>";
899
+ let result = tables_from_html(html);
900
+ assert!(result.tables.is_empty());
901
+ assert!(result.content.contains("No tables here"));
902
+ }
903
+
904
+ #[test]
905
+ fn test_convert_with_tables_empty_table() {
906
+ let result = tables_from_html("<table></table>");
907
+ assert!(result.tables.is_empty(), "Empty table should not produce TableData");
908
+ }
909
+
910
+ #[test]
911
+ fn test_convert_with_tables_headers_only() {
912
+ let html = r#"<table><thead><tr><th>A</th><th>B</th></tr></thead></table>"#;
913
+ let result = tables_from_html(html);
914
+ assert_eq!(result.tables.len(), 1);
915
+ assert!(result.tables[0].is_header_row[0]);
916
+ assert_eq!(result.tables[0].cells[0], vec!["A", "B"]);
917
+ }
918
+
919
+ #[test]
920
+ fn test_convert_with_tables_thead_tbody_tfoot() {
921
+ let html = r#"
922
+ <table>
923
+ <thead><tr><th>H1</th></tr></thead>
924
+ <tbody><tr><td>B1</td></tr></tbody>
925
+ <tfoot><tr><td>F1</td></tr></tfoot>
926
+ </table>"#;
927
+ let result = tables_from_html(html);
928
+ assert_eq!(result.tables.len(), 1);
929
+ let t = &result.tables[0];
930
+ assert!(t.is_header_row[0], "thead row should be header");
931
+ assert!(!t.is_header_row[1], "tbody row should not be header");
932
+ assert_eq!(t.cells[0], vec!["H1"]);
933
+ assert_eq!(t.cells[1], vec!["B1"]);
934
+ }
935
+
936
+ #[test]
937
+ fn test_convert_with_tables_multiple_separate() {
938
+ let html = r#"
939
+ <table><tr><td>T1</td></tr></table>
940
+ <p>Between tables</p>
941
+ <table><tr><td>T2</td></tr></table>"#;
942
+ let result = tables_from_html(html);
943
+ assert_eq!(result.tables.len(), 2, "Should find 2 separate tables");
944
+ }
945
+
946
+ #[test]
947
+ fn test_convert_with_tables_special_chars() {
948
+ let html = r#"<table><tr><td>a | b</td><td>c*d</td></tr></table>"#;
949
+ let result = tables_from_html(html);
950
+ assert_eq!(result.tables.len(), 1);
951
+ assert!(!result.tables[0].cells[0].is_empty());
952
+ }
953
+
954
+ #[test]
955
+ fn test_convert_with_tables_single_cell() {
956
+ let html = r#"<table><tr><td>Only cell</td></tr></table>"#;
957
+ let result = tables_from_html(html);
958
+ assert_eq!(result.tables.len(), 1);
959
+ assert_eq!(result.tables[0].cells.len(), 1);
960
+ assert_eq!(result.tables[0].cells[0], vec!["Only cell"]);
961
+ }
962
+
963
+ #[test]
964
+ fn test_convert_with_tables_content_preserved() {
965
+ let html = r#"<p>Before</p><table><tr><td>Cell</td></tr></table><p>After</p>"#;
966
+ let result = tables_from_html(html);
967
+ assert!(result.content.contains("Before"));
968
+ assert!(result.content.contains("After"));
969
+ assert!(result.content.contains('|'), "Markdown table should appear in content");
970
+ }
971
+
972
+ #[test]
973
+ fn test_convert_with_tables_with_options() {
974
+ let options = ConversionOptions {
975
+ heading_style: crate::options::HeadingStyle::Underlined,
976
+ ..ConversionOptions::default()
977
+ };
978
+ let html = r#"<h1>Title</h1><table><tr><td>Cell</td></tr></table>"#;
979
+ let result = convert_with_tables(
980
+ html,
981
+ Some(options),
982
+ #[cfg(feature = "metadata")]
983
+ None,
984
+ #[cfg(not(feature = "metadata"))]
985
+ None,
986
+ )
987
+ .unwrap();
988
+ assert_eq!(result.tables.len(), 1);
989
+ assert!(result.content.contains("Title"));
990
+ }
991
+
992
+ #[test]
993
+ fn test_convert_with_tables_plain_text_format() {
994
+ let options = ConversionOptions {
995
+ output_format: crate::options::OutputFormat::Plain,
996
+ ..ConversionOptions::default()
997
+ };
998
+ let html = r#"<table><tr><th>Name</th></tr><tr><td>Alice</td></tr></table>"#;
999
+ let result = convert_with_tables(
1000
+ html,
1001
+ Some(options),
1002
+ #[cfg(feature = "metadata")]
1003
+ None,
1004
+ #[cfg(not(feature = "metadata"))]
1005
+ None,
1006
+ )
1007
+ .unwrap();
1008
+ assert!(
1009
+ !result.tables.is_empty(),
1010
+ "Tables should be populated even with plain text output format"
1011
+ );
1012
+ assert_eq!(result.tables[0].cells[0], vec!["Name"]);
1013
+ }
1014
+
1015
+ #[cfg(feature = "metadata")]
1016
+ #[test]
1017
+ fn test_convert_with_tables_metadata_integration() {
1018
+ let html = r#"<html lang="en"><head><title>Test</title></head><body>
1019
+ <table><tr><th>Col</th></tr><tr><td>Val</td></tr></table>
1020
+ </body></html>"#;
1021
+ let config = MetadataConfig::default();
1022
+ let result = convert_with_tables(html, None, Some(config)).unwrap();
1023
+ assert_eq!(result.tables.len(), 1);
1024
+ let meta = result.metadata.as_ref().expect("metadata should be present");
1025
+ assert_eq!(meta.document.language, Some("en".to_string()));
1026
+ }
1027
+
1028
+ #[cfg(feature = "metadata")]
1029
+ #[test]
1030
+ fn test_convert_with_tables_plain_text_metadata() {
1031
+ let options = ConversionOptions {
1032
+ output_format: crate::options::OutputFormat::Plain,
1033
+ ..ConversionOptions::default()
1034
+ };
1035
+ let html = r#"<html lang="fr"><body>
1036
+ <table><tr><td>Cell</td></tr></table>
1037
+ </body></html>"#;
1038
+ let config = MetadataConfig::default();
1039
+ let result = convert_with_tables(html, Some(options), Some(config)).unwrap();
1040
+ assert!(
1041
+ !result.tables.is_empty(),
1042
+ "Tables should be populated in plain text mode"
1043
+ );
1044
+ let meta = result.metadata.as_ref().expect("metadata should be present");
1045
+ assert_eq!(
1046
+ meta.document.language,
1047
+ Some("fr".to_string()),
1048
+ "Metadata should be populated in plain text mode"
1049
+ );
1050
+ }
1051
+ }
@@ -136,11 +136,9 @@ pub(crate) fn convert_html_impl(
136
136
  }
137
137
  }
138
138
 
139
- // Fast path for plain text output: skip the full conversion pipeline
140
- if options.output_format == OutputFormat::Plain {
141
- let plain = extract_plain_text(&dom, parser, options);
142
- return Ok(plain);
143
- }
139
+ // Plain text output: run the full pipeline (for metadata + visitor callbacks),
140
+ // then return plain text instead of markdown.
141
+ let is_plain_text = options.output_format == OutputFormat::Plain;
144
142
 
145
143
  let wants_frontmatter = options.extract_metadata && !options.convert_as_inline;
146
144
  #[cfg(feature = "metadata")]
@@ -230,6 +228,13 @@ pub(crate) fn convert_html_impl(
230
228
  return Err(crate::error::ConversionError::Visitor(err.clone()));
231
229
  }
232
230
 
231
+ // If plain text was requested, discard the markdown output and return plain text.
232
+ // The full pipeline was still run above so that metadata + visitor callbacks fire.
233
+ if is_plain_text {
234
+ let plain = extract_plain_text(&dom, parser, options);
235
+ return Ok(plain);
236
+ }
237
+
233
238
  trim_line_end_whitespace(&mut output);
234
239
  let trimmed = output.trim_end_matches('\n');
235
240
  if trimmed.is_empty() {
@@ -188,7 +188,8 @@ pub fn process_text_node(
188
188
  // the current block's content, not from a previous block's closing.
189
189
  // Without this distinction, the second paragraph after a "\n\n" boundary
190
190
  // would incorrectly suppress the trailing space before inline elements.
191
- let current_block_output = &output[ctx.block_content_start..];
191
+ let safe_start = ctx.block_content_start.min(output.len());
192
+ let current_block_output = &output[safe_start..];
192
193
  let at_paragraph_break = current_block_output.ends_with("\n\n");
193
194
  if !at_paragraph_break {
194
195
  if has_double_newline {
@@ -98,6 +98,9 @@ pub use convert_api::convert_with_metadata;
98
98
  #[cfg(feature = "visitor")]
99
99
  pub use convert_api::convert_with_visitor;
100
100
 
101
+ #[cfg(feature = "visitor")]
102
+ pub use convert_api::{ConversionWithTables, TableData, convert_with_tables};
103
+
101
104
  #[cfg(feature = "async-visitor")]
102
105
  pub use convert_api::convert_with_async_visitor;
103
106
 
@@ -19,5 +19,8 @@ pub use crate::convert_with_metadata;
19
19
  #[cfg(feature = "visitor")]
20
20
  pub use crate::convert_with_visitor;
21
21
 
22
+ #[cfg(feature = "visitor")]
23
+ pub use crate::{ConversionWithTables, TableData, convert_with_tables};
24
+
22
25
  #[cfg(feature = "async-visitor")]
23
26
  pub use crate::convert_with_async_visitor;