html-to-markdown 2.27.2 → 2.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +8 -8
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -1
- data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +6 -0
- data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +71 -0
- data/ext/html-to-markdown-rb/native/src/lib.rs +27 -1
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +29 -0
- data/rust-vendor/getrandom/.cargo-checksum.json +1 -1
- data/rust-vendor/getrandom/.cargo_vcs_info.json +1 -1
- data/rust-vendor/getrandom/CHANGELOG.md +62 -43
- data/rust-vendor/getrandom/Cargo.lock +49 -56
- data/rust-vendor/getrandom/Cargo.toml +2 -2
- data/rust-vendor/getrandom/Cargo.toml.orig +2 -2
- data/rust-vendor/getrandom/src/backends/efi_rng.rs +8 -10
- data/rust-vendor/getrandom/src/backends/getentropy.rs +13 -4
- data/rust-vendor/getrandom/src/backends/linux_android_with_fallback.rs +10 -25
- data/rust-vendor/getrandom/src/backends/netbsd.rs +17 -25
- data/rust-vendor/getrandom/src/backends/rdrand.rs +15 -9
- data/rust-vendor/getrandom/src/backends/rndr.rs +2 -1
- data/rust-vendor/getrandom/src/backends/vxworks.rs +7 -3
- data/rust-vendor/getrandom/src/backends/windows.rs +21 -5
- data/rust-vendor/getrandom/src/utils/lazy_bool.rs +39 -0
- data/rust-vendor/getrandom/src/utils/lazy_ptr.rs +57 -0
- data/rust-vendor/html-to-markdown-rs/Cargo.toml +2 -2
- data/rust-vendor/html-to-markdown-rs/README.md +29 -0
- data/rust-vendor/html-to-markdown-rs/src/convert_api.rs +368 -0
- data/rust-vendor/html-to-markdown-rs/src/converter/main.rs +10 -5
- data/rust-vendor/html-to-markdown-rs/src/converter/text_node.rs +2 -1
- data/rust-vendor/html-to-markdown-rs/src/lib.rs +3 -0
- data/rust-vendor/html-to-markdown-rs/src/prelude.rs +3 -0
- data/rust-vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +82 -0
- data/rust-vendor/quote/.cargo-checksum.json +1 -1
- data/rust-vendor/quote/.cargo_vcs_info.json +1 -1
- data/rust-vendor/quote/.github/workflows/ci.yml +2 -2
- data/rust-vendor/quote/Cargo.lock +21 -21
- data/rust-vendor/quote/Cargo.toml +2 -2
- data/rust-vendor/quote/Cargo.toml.orig +2 -2
- data/rust-vendor/quote/README.md +0 -1
- data/rust-vendor/quote/src/lib.rs +1 -1
- data/rust-vendor/quote/src/to_tokens.rs +7 -0
- data/rust-vendor/quote/tests/ui/not-quotable.stderr +1 -1
- data/rust-vendor/quote/tests/ui/not-repeatable.stderr +3 -11
- data/rust-vendor/r-efi/.cargo-checksum.json +1 -1
- data/rust-vendor/r-efi/.cargo_vcs_info.json +1 -1
- data/rust-vendor/r-efi/AUTHORS +1 -0
- data/rust-vendor/r-efi/Cargo.lock +1 -1
- data/rust-vendor/r-efi/Cargo.toml +1 -3
- data/rust-vendor/r-efi/Cargo.toml.orig +1 -5
- data/rust-vendor/r-efi/NEWS.md +16 -0
- data/rust-vendor/r-efi/src/base.rs +1 -1
- data/rust-vendor/r-efi/src/lib.rs +27 -12
- data/rust-vendor/r-efi/src/protocols/absolute_pointer.rs +4 -4
- data/rust-vendor/r-efi/src/protocols/block_io.rs +8 -8
- data/rust-vendor/r-efi/src/protocols/bus_specific_driver_override.rs +2 -2
- data/rust-vendor/r-efi/src/protocols/debug_support.rs +10 -10
- data/rust-vendor/r-efi/src/protocols/debugport.rs +8 -8
- data/rust-vendor/r-efi/src/protocols/decompress.rs +4 -4
- data/rust-vendor/r-efi/src/protocols/device_path_from_text.rs +4 -4
- data/rust-vendor/r-efi/src/protocols/device_path_to_text.rs +4 -4
- data/rust-vendor/r-efi/src/protocols/device_path_utilities.rs +16 -16
- data/rust-vendor/r-efi/src/protocols/disk_io.rs +4 -4
- data/rust-vendor/r-efi/src/protocols/disk_io2.rs +8 -8
- data/rust-vendor/r-efi/src/protocols/driver_binding.rs +6 -6
- data/rust-vendor/r-efi/src/protocols/driver_diagnostics2.rs +2 -2
- data/rust-vendor/r-efi/src/protocols/driver_family_override.rs +2 -2
- data/rust-vendor/r-efi/src/protocols/file.rs +28 -28
- data/rust-vendor/r-efi/src/protocols/graphics_output.rs +6 -6
- data/rust-vendor/r-efi/src/protocols/hii_database.rs +24 -24
- data/rust-vendor/r-efi/src/protocols/hii_font.rs +8 -8
- data/rust-vendor/r-efi/src/protocols/hii_font_ex.rs +10 -10
- data/rust-vendor/r-efi/src/protocols/hii_string.rs +10 -10
- data/rust-vendor/r-efi/src/protocols/ip4.rs +16 -16
- data/rust-vendor/r-efi/src/protocols/ip6.rs +18 -18
- data/rust-vendor/r-efi/src/protocols/load_file.rs +2 -2
- data/rust-vendor/r-efi/src/protocols/loaded_image.rs +2 -2
- data/rust-vendor/r-efi/src/protocols/managed_network.rs +16 -16
- data/rust-vendor/r-efi/src/protocols/memory_attribute.rs +6 -6
- data/rust-vendor/r-efi/src/protocols/mp_services.rs +15 -15
- data/rust-vendor/r-efi/src/protocols/pci_io.rs +26 -26
- data/rust-vendor/r-efi/src/protocols/platform_driver_override.rs +6 -6
- data/rust-vendor/r-efi/src/protocols/rng.rs +4 -4
- data/rust-vendor/r-efi/src/protocols/service_binding.rs +4 -4
- data/rust-vendor/r-efi/src/protocols/shell.rs +81 -81
- data/rust-vendor/r-efi/src/protocols/shell_dynamic_command.rs +4 -4
- data/rust-vendor/r-efi/src/protocols/simple_file_system.rs +2 -2
- data/rust-vendor/r-efi/src/protocols/simple_network.rs +26 -26
- data/rust-vendor/r-efi/src/protocols/simple_text_input.rs +4 -4
- data/rust-vendor/r-efi/src/protocols/simple_text_input_ex.rs +11 -11
- data/rust-vendor/r-efi/src/protocols/simple_text_output.rs +18 -18
- data/rust-vendor/r-efi/src/protocols/tcp4.rs +20 -20
- data/rust-vendor/r-efi/src/protocols/tcp6.rs +18 -18
- data/rust-vendor/r-efi/src/protocols/timestamp.rs +3 -3
- data/rust-vendor/r-efi/src/protocols/udp4.rs +16 -16
- data/rust-vendor/r-efi/src/protocols/udp6.rs +14 -14
- data/rust-vendor/r-efi/src/system.rs +115 -115
- data/rust-vendor/r-efi/src/vendor/intel/console_control.rs +6 -6
- data/rust-vendor/r-efi-5.3.0/.cargo-checksum.json +1 -0
- data/rust-vendor/r-efi-5.3.0/.cargo_vcs_info.json +6 -0
- data/rust-vendor/r-efi-5.3.0/.github/workflows/publish.yml +39 -0
- data/rust-vendor/r-efi-5.3.0/.github/workflows/rust-tests.yml +125 -0
- data/rust-vendor/r-efi-5.3.0/AUTHORS +74 -0
- data/rust-vendor/r-efi-5.3.0/Cargo.lock +16 -0
- data/rust-vendor/r-efi-5.3.0/Cargo.toml +70 -0
- data/rust-vendor/r-efi-5.3.0/Cargo.toml.orig +51 -0
- data/rust-vendor/r-efi-5.3.0/Makefile +85 -0
- data/rust-vendor/r-efi-5.3.0/NEWS.md +301 -0
- data/rust-vendor/r-efi-5.3.0/README.md +99 -0
- data/rust-vendor/r-efi-5.3.0/examples/freestanding.rs +34 -0
- data/rust-vendor/r-efi-5.3.0/examples/gop-query.rs +188 -0
- data/rust-vendor/r-efi-5.3.0/examples/hello-world.rs +55 -0
- data/rust-vendor/r-efi-5.3.0/src/base.rs +993 -0
- data/rust-vendor/r-efi-5.3.0/src/hii.rs +1300 -0
- data/rust-vendor/r-efi-5.3.0/src/lib.rs +182 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/absolute_pointer.rs +69 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/block_io.rs +70 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/bus_specific_driver_override.rs +32 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/debug_support.rs +835 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/debugport.rs +42 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/decompress.rs +37 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/device_path.rs +82 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/device_path_from_text.rs +26 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/device_path_to_text.rs +30 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/device_path_utilities.rs +63 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/disk_io.rs +40 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/disk_io2.rs +58 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/driver_binding.rs +42 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/driver_diagnostics2.rs +38 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/driver_family_override.rs +23 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/file.rs +183 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/graphics_output.rs +103 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/hii_database.rs +299 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/hii_font.rs +87 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/hii_font_ex.rs +107 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/hii_package_list.rs +14 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/hii_string.rs +71 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/ip4.rs +202 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/ip6.rs +264 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/load_file.rs +26 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/load_file2.rs +15 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/loaded_image.rs +39 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/loaded_image_device_path.rs +13 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/managed_network.rs +147 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/memory_attribute.rs +40 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/mp_services.rs +121 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/pci_io.rs +203 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/platform_driver_override.rs +46 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/rng.rs +83 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/service_binding.rs +20 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/shell.rs +295 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/shell_dynamic_command.rs +33 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/shell_parameters.rs +23 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/simple_file_system.rs +26 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/simple_network.rs +196 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/simple_text_input.rs +38 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/simple_text_input_ex.rs +85 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/simple_text_output.rs +86 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/tcp4.rs +224 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/tcp6.rs +202 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/timestamp.rs +32 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/udp4.rs +151 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols/udp6.rs +137 -0
- data/rust-vendor/r-efi-5.3.0/src/protocols.rs +54 -0
- data/rust-vendor/r-efi-5.3.0/src/system.rs +1130 -0
- data/rust-vendor/r-efi-5.3.0/src/vendor/intel/console_control.rs +37 -0
- data/rust-vendor/r-efi-5.3.0/src/vendor.rs +10 -0
- data/rust-vendor/tokio/.cargo-checksum.json +1 -1
- data/rust-vendor/tokio/.cargo_vcs_info.json +1 -1
- data/rust-vendor/tokio/CHANGELOG.md +94 -0
- data/rust-vendor/tokio/Cargo.lock +1549 -0
- data/rust-vendor/tokio/Cargo.toml +96 -83
- data/rust-vendor/tokio/Cargo.toml.orig +7 -7
- data/rust-vendor/tokio/README.md +1 -1
- data/rust-vendor/tokio/src/fs/open_options.rs +4 -1
- data/rust-vendor/tokio/src/fs/read.rs +4 -1
- data/rust-vendor/tokio/src/fs/write.rs +4 -1
- data/rust-vendor/tokio/src/io/async_write.rs +3 -4
- data/rust-vendor/tokio/src/io/poll_evented.rs +23 -1
- data/rust-vendor/tokio/src/io/stderr.rs +15 -1
- data/rust-vendor/tokio/src/io/stdout.rs +14 -0
- data/rust-vendor/tokio/src/io/util/async_write_ext.rs +2 -2
- data/rust-vendor/tokio/src/io/util/write_buf.rs +11 -2
- data/rust-vendor/tokio/src/lib.rs +12 -28
- data/rust-vendor/tokio/src/macros/select.rs +6 -8
- data/rust-vendor/tokio/src/net/tcp/socket.rs +25 -1
- data/rust-vendor/tokio/src/net/tcp/stream.rs +40 -1
- data/rust-vendor/tokio/src/process/unix/pidfd_reaper.rs +1 -41
- data/rust-vendor/tokio/src/runtime/blocking/pool.rs +18 -14
- data/rust-vendor/tokio/src/runtime/builder.rs +10 -4
- data/rust-vendor/tokio/src/runtime/handle.rs +3 -2
- data/rust-vendor/tokio/src/runtime/io/driver/uring.rs +49 -61
- data/rust-vendor/tokio/src/runtime/io/driver.rs +6 -5
- data/rust-vendor/tokio/src/runtime/mod.rs +20 -1
- data/rust-vendor/tokio/src/runtime/runtime.rs +71 -1
- data/rust-vendor/tokio/src/runtime/scheduler/current_thread/mod.rs +24 -8
- data/rust-vendor/tokio/src/runtime/scheduler/multi_thread/worker.rs +5 -0
- data/rust-vendor/tokio/src/runtime/task/core.rs +1 -0
- data/rust-vendor/tokio/src/runtime/task/join.rs +7 -3
- data/rust-vendor/tokio/src/runtime/task/list.rs +5 -3
- data/rust-vendor/tokio/src/runtime/task/mod.rs +0 -5
- data/rust-vendor/tokio/src/runtime/tests/loom_blocking.rs +39 -1
- data/rust-vendor/tokio/src/signal/mod.rs +6 -17
- data/rust-vendor/tokio/src/signal/registry.rs +1 -1
- data/rust-vendor/tokio/src/signal/unix.rs +24 -44
- data/rust-vendor/tokio/src/signal/windows/sys.rs +52 -64
- data/rust-vendor/tokio/src/signal/windows.rs +35 -23
- data/rust-vendor/tokio/src/sync/mpsc/mod.rs +3 -1
- data/rust-vendor/tokio/src/sync/oneshot.rs +13 -0
- data/rust-vendor/tokio/src/sync/rwlock.rs +4 -5
- data/rust-vendor/tokio/src/sync/tests/loom_oneshot.rs +27 -1
- data/rust-vendor/tokio/src/task/blocking.rs +16 -1
- data/rust-vendor/tokio/src/task/builder.rs +2 -2
- data/rust-vendor/tokio/src/task/mod.rs +1 -1
- data/rust-vendor/tokio/src/task/spawn.rs +8 -3
- data/rust-vendor/tokio/src/task/yield_now.rs +13 -23
- data/rust-vendor/tokio/src/time/clock.rs +62 -0
- data/rust-vendor/tokio/src/util/memchr.rs +32 -4
- data/rust-vendor/tokio/src/util/sharded_list.rs +6 -4
- data/rust-vendor/tokio/tests/fs_link.rs +54 -0
- data/rust-vendor/tokio/tests/io_async_fd_memory_leak.rs +209 -0
- data/rust-vendor/tokio/tests/io_write_buf.rs +56 -0
- data/rust-vendor/tokio/tests/process_issue_7144.rs +8 -0
- data/rust-vendor/tokio/tests/rt_basic.rs +41 -0
- data/rust-vendor/tokio/tests/rt_common_before_park.rs +92 -0
- data/rust-vendor/tokio/tests/rt_metrics.rs +1 -1
- data/rust-vendor/tokio/tests/rt_panic.rs +12 -0
- data/rust-vendor/tokio/tests/rt_shutdown_err.rs +82 -0
- data/rust-vendor/tokio/tests/rt_threaded.rs +49 -1
- data/rust-vendor/tokio/tests/rt_unstable_metrics.rs +32 -0
- data/rust-vendor/tokio/tests/tcp_connect.rs +2 -3
- data/rust-vendor/tokio/tests/tcp_shutdown.rs +1 -3
- data/rust-vendor/tokio/tests/tcp_socket.rs +3 -4
- data/rust-vendor/tokio/tests/tcp_stream.rs +3 -0
- data/sig/html_to_markdown.rbs +46 -0
- data/spec/convert_with_tables_spec.rb +194 -0
- metadata +80 -3
- data/rust-vendor/getrandom/src/utils/lazy.rs +0 -64
|
@@ -49,9 +49,25 @@ const TRUE: BOOL = 1;
|
|
|
49
49
|
#[inline]
|
|
50
50
|
pub fn fill_inner(dest: &mut [MaybeUninit<u8>]) -> Result<(), Error> {
|
|
51
51
|
let result = unsafe { ProcessPrng(dest.as_mut_ptr().cast::<u8>(), dest.len()) };
|
|
52
|
-
// `ProcessPrng` is documented to always return
|
|
53
|
-
//
|
|
54
|
-
//
|
|
55
|
-
|
|
56
|
-
|
|
52
|
+
// On Windows 10 and later, `ProcessPrng` is documented to always return
|
|
53
|
+
// TRUE. All potential errors are handled during loading of
|
|
54
|
+
// `BCryptPrimitive.dll`. See the "Process base PRNG" section in the
|
|
55
|
+
// aforementioned Windows RNG whitepaper for more information.
|
|
56
|
+
//
|
|
57
|
+
// The Zig project found that Windows 8 implements `ProcessPrng` in a way
|
|
58
|
+
// that may fail and return a value other than `TRUE`. Although recent
|
|
59
|
+
// versions of the Rust toolchain do not support Windows 8, we cannot rule
|
|
60
|
+
// out this backend being used in an executable that will run on Windows 8
|
|
61
|
+
// (e.g. a fork of this crate backported to have an MSRV lower than 1.76,
|
|
62
|
+
// or a fork of the Rust toolchain to support older Windows versions, or
|
|
63
|
+
// other build hacks).
|
|
64
|
+
//
|
|
65
|
+
// Further, Wine's implementation of `ProcessPrng` CAN fail, in every
|
|
66
|
+
// version through Wine 11.2, and this may be the case for any other Windows
|
|
67
|
+
// emulation layers.
|
|
68
|
+
if result == TRUE {
|
|
69
|
+
Ok(())
|
|
70
|
+
} else {
|
|
71
|
+
Err(Error::UNEXPECTED)
|
|
72
|
+
}
|
|
57
73
|
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
use core::sync::atomic::{AtomicU8, Ordering::Relaxed};
|
|
2
|
+
|
|
3
|
+
/// A `bool` that is computed on first use and memoized inside an `AtomicU8`.
///
/// No synchronization is performed around initialization: several callers may
/// observe the empty state at once and each run its own `init` closure. After
/// any of them stores a result, later calls return the stored value without
/// invoking `init`.
///
/// All atomic accesses use `Relaxed` ordering, since the only thing being
/// published is the cached value itself.
pub(crate) struct LazyBool(AtomicU8);

impl LazyBool {
    /// Sentinel meaning "nothing has been cached yet".
    const UNINIT: u8 = u8::MAX;

    /// Create a new, not yet initialized `LazyBool`.
    pub const fn new() -> Self {
        Self(AtomicU8::new(Self::UNINIT))
    }

    /// Slow path: run `init`, remember its answer, and hand the answer back.
    #[cold]
    fn cold_init(&self, init: impl FnOnce() -> bool) -> bool {
        let encoded = u8::from(init());
        self.0.store(encoded, Relaxed);
        encoded != 0
    }

    /// Return the cached value when one exists; otherwise evaluate `init`,
    /// cache its result, and return it.
    #[inline]
    pub fn unsync_init(&self, init: impl FnOnce() -> bool) -> bool {
        let cached = self.0.load(Relaxed);
        if cached != Self::UNINIT {
            cached != 0
        } else {
            self.cold_init(init)
        }
    }
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
use core::{
|
|
2
|
+
convert::Infallible,
|
|
3
|
+
ptr::{self, NonNull},
|
|
4
|
+
sync::atomic::{AtomicPtr, Ordering::Relaxed},
|
|
5
|
+
};
|
|
6
|
+
|
|
7
|
+
/// A non-null pointer that is computed on first use and memoized inside an
/// `AtomicPtr`; a null pointer marks the "not yet initialized" state.
///
/// No synchronization is performed around initialization: several callers may
/// observe the empty state at once and each run its own `init` closure. After
/// any of them stores a pointer, later calls return the stored pointer without
/// invoking `init`.
///
/// With the fallible entry point (`try_unsync_init`) only successful results
/// are stored; an error is handed back to the caller and nothing is cached.
///
/// All atomic accesses use `Ordering::Relaxed`, since the only thing being
/// published is the pointer value. Callers must not rely on this mechanism to
/// synchronize unrelated memory side effects performed by `init`.
pub(crate) struct LazyPtr<T>(AtomicPtr<T>);

impl<T> LazyPtr<T> {
    /// Create a new, not yet initialized `LazyPtr`.
    pub const fn new() -> Self {
        Self(AtomicPtr::new(ptr::null_mut()))
    }

    /// Slow path: run `init` and, if it succeeds, store the pointer before
    /// returning it. A failure is returned untouched and leaves the cache empty.
    #[cold]
    fn cold_init<E>(&self, init: impl FnOnce() -> Result<NonNull<T>, E>) -> Result<NonNull<T>, E> {
        init().map(|fresh| {
            self.0.store(fresh.as_ptr(), Relaxed);
            fresh
        })
    }

    /// Return the cached pointer when one exists; otherwise evaluate the
    /// potentially fallible `init`, caching and returning its pointer on success.
    #[inline]
    pub fn try_unsync_init<E>(
        &self,
        init: impl FnOnce() -> Result<NonNull<T>, E>,
    ) -> Result<NonNull<T>, E> {
        if let Some(cached) = NonNull::new(self.0.load(Relaxed)) {
            return Ok(cached);
        }
        self.cold_init(init)
    }

    /// Return the cached pointer when one exists; otherwise evaluate `init`,
    /// cache its pointer, and return it.
    #[inline]
    #[allow(dead_code, reason = "Some modules use only `try_unsync_init`")]
    pub fn unsync_init(&self, init: impl FnOnce() -> NonNull<T>) -> NonNull<T> {
        match self.try_unsync_init(|| Ok::<_, Infallible>(init())) {
            Ok(cached) => cached,
            // `Infallible` has no values, so this arm can never execute.
            Err(never) => match never {},
        }
    }
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "html-to-markdown-rs"
|
|
3
|
-
version = "2.27.2"
|
|
3
|
+
version = "2.28.0"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
6
6
|
license = "MIT"
|
|
@@ -40,7 +40,7 @@ serde = { version = "1.0", features = ["derive"], optional = true }
|
|
|
40
40
|
serde_json = { version = "1.0", optional = true }
|
|
41
41
|
async-trait = { version = "0.1", optional = true }
|
|
42
42
|
futures = { version = "0.3", optional = true }
|
|
43
|
-
tokio = { version = "1.
|
|
43
|
+
tokio = { version = "1.50", features = ["rt-multi-thread", "sync"], optional = true }
|
|
44
44
|
|
|
45
45
|
[dev-dependencies]
|
|
46
46
|
serde = { version = "1.0", features = ["derive"] }
|
|
@@ -148,6 +148,35 @@ for (i, img) in extraction.inline_images.iter().enumerate() {
|
|
|
148
148
|
}
|
|
149
149
|
```
|
|
150
150
|
|
|
151
|
+
## Table Extraction
|
|
152
|
+
|
|
153
|
+
Extract structured table data alongside the Markdown conversion. Each table found in the HTML is returned with its cell contents, header row flags, and rendered Markdown output.
|
|
154
|
+
|
|
155
|
+
Requires the `visitor` feature.
|
|
156
|
+
|
|
157
|
+
```rust
|
|
158
|
+
use html_to_markdown_rs::convert_with_tables;
|
|
159
|
+
|
|
160
|
+
let html = r#"
|
|
161
|
+
<table>
|
|
162
|
+
<tr><th>Name</th><th>Age</th></tr>
|
|
163
|
+
<tr><td>Alice</td><td>30</td></tr>
|
|
164
|
+
<tr><td>Bob</td><td>25</td></tr>
|
|
165
|
+
</table>
|
|
166
|
+
"#;
|
|
167
|
+
|
|
168
|
+
let result = convert_with_tables(html, None, None)?;
|
|
169
|
+
|
|
170
|
+
println!("{}", result.content);
|
|
171
|
+
for table in &result.tables {
|
|
172
|
+
println!("Table with {} rows:", table.cells.len());
|
|
173
|
+
for (i, row) in table.cells.iter().enumerate() {
|
|
174
|
+
let prefix = if table.is_header_row[i] { "Header" } else { "Row" };
|
|
175
|
+
println!(" {}: {:?}", prefix, row);
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
```
|
|
179
|
+
|
|
151
180
|
## Other Language Bindings
|
|
152
181
|
|
|
153
182
|
This is the core Rust library. For other languages:
|
|
@@ -681,3 +681,371 @@ pub fn metadata_config_from_json(json: &str) -> Result<MetadataConfig> {
|
|
|
681
681
|
let update: crate::MetadataConfigUpdate = parse_json(json)?;
|
|
682
682
|
Ok(MetadataConfig::from(update))
|
|
683
683
|
}
|
|
684
|
+
|
|
685
|
+
// ============================================================================
|
|
686
|
+
// Table Extraction API (requires visitor feature)
|
|
687
|
+
// ============================================================================
|
|
688
|
+
|
|
689
|
+
/// Structured data captured for one `<table>` element during conversion.
///
/// One value is produced per table that yielded at least one row.
#[cfg(feature = "visitor")]
#[derive(Debug, Clone)]
#[cfg_attr(
    any(feature = "serde", feature = "metadata"),
    derive(serde::Serialize, serde::Deserialize)
)]
pub struct TableData {
    /// Cell text laid out as rows, each row being a list of columns. The
    /// contents have already been converted to the target output format
    /// (markdown/djot/plain).
    pub cells: Vec<Vec<String>>,
    /// The whole table as rendered in the target output format.
    pub markdown: String,
    /// One flag per row, parallel to `cells`: `is_header_row[i]` tells whether
    /// `cells[i]` was reported as a header row. Rows inside `<thead>` are
    /// flagged; the crate's own tests also expect a leading row of `<th>` cells
    /// without a `<thead>` wrapper to be flagged.
    pub is_header_row: Vec<bool>,
}
|
|
708
|
+
|
|
709
|
+
/// Output of `convert_with_tables`: the converted document together with the
/// structured data of every table that was collected.
#[cfg(feature = "visitor")]
#[derive(Debug, Clone)]
#[cfg_attr(
    any(feature = "serde", feature = "metadata"),
    derive(serde::Serialize, serde::Deserialize)
)]
pub struct ConversionWithTables {
    /// The converted document text (markdown/djot/plain).
    pub content: String,
    /// Extended metadata gathered during conversion. The field only exists
    /// when the `metadata` feature is enabled.
    #[cfg(feature = "metadata")]
    pub metadata: Option<ExtendedMetadata>,
    /// Every table collected from the HTML, in the order the tables were
    /// completed.
    pub tables: Vec<TableData>,
}
|
|
725
|
+
|
|
726
|
+
/// Rows gathered so far for one `<table>` whose end has not been visited yet.
#[cfg(feature = "visitor")]
#[derive(Debug, Default)]
struct PendingTable {
    /// Converted cell text, one inner vector per row.
    rows: Vec<Vec<String>>,
    /// Header flag for each entry of `rows`, kept in lockstep with it.
    is_header: Vec<bool>,
}

/// Visitor that records the rows of every table it is shown and turns each
/// finished table into a `TableData`.
///
/// In-progress tables are kept on a stack so that a table nested inside a cell
/// of another table does not discard the rows already recorded for the
/// enclosing table.
#[cfg(feature = "visitor")]
#[derive(Debug)]
struct TableCollector {
    /// Finished tables, in the order their end was visited.
    tables: Vec<TableData>,
    /// Tables that have started but not yet ended; the innermost one is last.
    open_tables: Vec<PendingTable>,
}

#[cfg(feature = "visitor")]
impl TableCollector {
    /// Create a collector with no finished and no in-progress tables.
    fn new() -> Self {
        Self {
            tables: Vec::new(),
            open_tables: Vec::new(),
        }
    }
}

#[cfg(feature = "visitor")]
impl visitor::HtmlVisitor for TableCollector {
    /// Open a fresh row buffer for the table that is starting.
    fn visit_table_start(&mut self, _ctx: &visitor::NodeContext) -> visitor::VisitResult {
        self.open_tables.push(PendingTable::default());
        visitor::VisitResult::Continue
    }

    /// Append one row and its header flag to the innermost open table.
    fn visit_table_row(
        &mut self,
        _ctx: &visitor::NodeContext,
        cells: &[String],
        is_header: bool,
    ) -> visitor::VisitResult {
        // A row reported without a preceding table start is still recorded,
        // so it is not silently dropped.
        if self.open_tables.is_empty() {
            self.open_tables.push(PendingTable::default());
        }
        if let Some(pending) = self.open_tables.last_mut() {
            pending.rows.push(cells.to_vec());
            pending.is_header.push(is_header);
        }
        visitor::VisitResult::Continue
    }

    /// Close the innermost open table and store it together with its rendered
    /// `output`. Tables that produced no rows are not stored.
    fn visit_table_end(&mut self, _ctx: &visitor::NodeContext, output: &str) -> visitor::VisitResult {
        match self.open_tables.pop() {
            Some(finished) if !finished.rows.is_empty() => {
                self.tables.push(TableData {
                    cells: finished.rows,
                    markdown: output.to_string(),
                    is_header_row: finished.is_header,
                });
            }
            _ => {}
        }
        visitor::VisitResult::Continue
    }
}
|
|
775
|
+
|
|
776
|
+
/// Convert HTML to markdown/djot/plain text and return structured data for the
/// tables encountered along the way.
///
/// A table-collecting visitor is attached to the conversion, so the converted
/// text, the optional metadata, and the table data all come out of one
/// conversion call. Each collected table carries its cell contents (already
/// converted to the target format) and its rendered output.
///
/// # Arguments
///
/// * `html` - The HTML string to convert
/// * `options` - Optional conversion options (defaults to `ConversionOptions::default()`)
/// * `metadata_cfg` - Optional metadata extraction configuration (requires `metadata` feature);
///   the default configuration is used when `None` is given
///
/// # Example
///
/// ```ignore
/// use html_to_markdown_rs::convert_with_tables;
///
/// let html = r#"<table><tr><th>Name</th><th>Age</th></tr><tr><td>Alice</td><td>30</td></tr></table>"#;
/// let result = convert_with_tables(html, None, None).unwrap();
/// assert_eq!(result.tables.len(), 1);
/// assert_eq!(result.tables[0].cells[0], vec!["Name", "Age"]);
/// ```
///
/// # Errors
///
/// Propagates any error reported by the underlying conversion. Also returns
/// `ConversionError::Other` if the table collector is still shared once the
/// conversion has finished and its state therefore cannot be taken back.
#[cfg(feature = "visitor")]
pub fn convert_with_tables(
    html: &str,
    options: Option<ConversionOptions>,
    #[cfg(feature = "metadata")] metadata_cfg: Option<MetadataConfig>,
    #[cfg(not(feature = "metadata"))] _metadata_cfg: Option<()>,
) -> Result<ConversionWithTables> {
    use std::cell::RefCell;
    use std::rc::Rc;

    // The collector is shared with the converter through a visitor handle and
    // reclaimed afterwards to read out what it gathered.
    let shared = Rc::new(RefCell::new(TableCollector::new()));
    let handle: visitor::VisitorHandle = Rc::clone(&shared) as visitor::VisitorHandle;

    #[cfg(feature = "metadata")]
    let (content, metadata) = {
        let metadata_config = metadata_cfg.unwrap_or_default();
        convert_with_metadata(html, options, metadata_config, Some(handle))?
    };

    #[cfg(not(feature = "metadata"))]
    let content = convert_with_visitor(html, options, Some(handle))?;

    // Taking the collector back only works once no other handle to it remains.
    let collected = Rc::try_unwrap(shared)
        .map_err(|_| ConversionError::Other("failed to recover table collector state".into()))?
        .into_inner();

    Ok(ConversionWithTables {
        content,
        #[cfg(feature = "metadata")]
        metadata: Some(metadata),
        tables: collected.tables,
    })
}
|
|
842
|
+
|
|
843
|
+
#[cfg(test)]
|
|
844
|
+
#[cfg(feature = "visitor")]
|
|
845
|
+
mod table_extraction_tests {
|
|
846
|
+
use super::*;
|
|
847
|
+
|
|
848
|
+
fn tables_from_html(html: &str) -> ConversionWithTables {
|
|
849
|
+
convert_with_tables(
|
|
850
|
+
html,
|
|
851
|
+
None,
|
|
852
|
+
#[cfg(feature = "metadata")]
|
|
853
|
+
None,
|
|
854
|
+
#[cfg(not(feature = "metadata"))]
|
|
855
|
+
None,
|
|
856
|
+
)
|
|
857
|
+
.unwrap()
|
|
858
|
+
}
|
|
859
|
+
|
|
860
|
+
#[test]
|
|
861
|
+
fn test_convert_with_tables_basic() {
|
|
862
|
+
let html = r#"<table><tr><th>Name</th><th>Age</th></tr><tr><td>Alice</td><td>30</td></tr></table>"#;
|
|
863
|
+
let result = tables_from_html(html);
|
|
864
|
+
assert_eq!(result.tables.len(), 1);
|
|
865
|
+
assert_eq!(result.tables[0].cells.len(), 2);
|
|
866
|
+
assert_eq!(result.tables[0].cells[0], vec!["Name", "Age"]);
|
|
867
|
+
assert_eq!(result.tables[0].cells[1], vec!["Alice", "30"]);
|
|
868
|
+
assert!(result.tables[0].is_header_row[0]);
|
|
869
|
+
assert!(!result.tables[0].is_header_row[1]);
|
|
870
|
+
assert!(result.tables[0].markdown.contains('|'));
|
|
871
|
+
}
|
|
872
|
+
|
|
873
|
+
#[test]
|
|
874
|
+
fn test_convert_with_tables_nested() {
|
|
875
|
+
let html = r#"
|
|
876
|
+
<table>
|
|
877
|
+
<tr><th>Category</th><th>Details</th></tr>
|
|
878
|
+
<tr>
|
|
879
|
+
<td>Project Alpha</td>
|
|
880
|
+
<td>
|
|
881
|
+
<table>
|
|
882
|
+
<tr><th>Task</th><th>Status</th></tr>
|
|
883
|
+
<tr><td>001</td><td>Done</td></tr>
|
|
884
|
+
</table>
|
|
885
|
+
</td>
|
|
886
|
+
</tr>
|
|
887
|
+
</table>"#;
|
|
888
|
+
let result = tables_from_html(html);
|
|
889
|
+
assert!(
|
|
890
|
+
result.tables.len() >= 2,
|
|
891
|
+
"Expected at least 2 tables (outer + nested), got {}",
|
|
892
|
+
result.tables.len()
|
|
893
|
+
);
|
|
894
|
+
}
|
|
895
|
+
|
|
896
|
+
#[test]
|
|
897
|
+
fn test_convert_with_tables_no_tables() {
|
|
898
|
+
let html = "<p>No tables here</p>";
|
|
899
|
+
let result = tables_from_html(html);
|
|
900
|
+
assert!(result.tables.is_empty());
|
|
901
|
+
assert!(result.content.contains("No tables here"));
|
|
902
|
+
}
|
|
903
|
+
|
|
904
|
+
#[test]
|
|
905
|
+
fn test_convert_with_tables_empty_table() {
|
|
906
|
+
let result = tables_from_html("<table></table>");
|
|
907
|
+
assert!(result.tables.is_empty(), "Empty table should not produce TableData");
|
|
908
|
+
}
|
|
909
|
+
|
|
910
|
+
#[test]
|
|
911
|
+
fn test_convert_with_tables_headers_only() {
|
|
912
|
+
let html = r#"<table><thead><tr><th>A</th><th>B</th></tr></thead></table>"#;
|
|
913
|
+
let result = tables_from_html(html);
|
|
914
|
+
assert_eq!(result.tables.len(), 1);
|
|
915
|
+
assert!(result.tables[0].is_header_row[0]);
|
|
916
|
+
assert_eq!(result.tables[0].cells[0], vec!["A", "B"]);
|
|
917
|
+
}
|
|
918
|
+
|
|
919
|
+
#[test]
|
|
920
|
+
fn test_convert_with_tables_thead_tbody_tfoot() {
|
|
921
|
+
let html = r#"
|
|
922
|
+
<table>
|
|
923
|
+
<thead><tr><th>H1</th></tr></thead>
|
|
924
|
+
<tbody><tr><td>B1</td></tr></tbody>
|
|
925
|
+
<tfoot><tr><td>F1</td></tr></tfoot>
|
|
926
|
+
</table>"#;
|
|
927
|
+
let result = tables_from_html(html);
|
|
928
|
+
assert_eq!(result.tables.len(), 1);
|
|
929
|
+
let t = &result.tables[0];
|
|
930
|
+
assert!(t.is_header_row[0], "thead row should be header");
|
|
931
|
+
assert!(!t.is_header_row[1], "tbody row should not be header");
|
|
932
|
+
assert_eq!(t.cells[0], vec!["H1"]);
|
|
933
|
+
assert_eq!(t.cells[1], vec!["B1"]);
|
|
934
|
+
}
|
|
935
|
+
|
|
936
|
+
#[test]
|
|
937
|
+
fn test_convert_with_tables_multiple_separate() {
|
|
938
|
+
let html = r#"
|
|
939
|
+
<table><tr><td>T1</td></tr></table>
|
|
940
|
+
<p>Between tables</p>
|
|
941
|
+
<table><tr><td>T2</td></tr></table>"#;
|
|
942
|
+
let result = tables_from_html(html);
|
|
943
|
+
assert_eq!(result.tables.len(), 2, "Should find 2 separate tables");
|
|
944
|
+
}
|
|
945
|
+
|
|
946
|
+
#[test]
|
|
947
|
+
fn test_convert_with_tables_special_chars() {
|
|
948
|
+
let html = r#"<table><tr><td>a | b</td><td>c*d</td></tr></table>"#;
|
|
949
|
+
let result = tables_from_html(html);
|
|
950
|
+
assert_eq!(result.tables.len(), 1);
|
|
951
|
+
assert!(!result.tables[0].cells[0].is_empty());
|
|
952
|
+
}
|
|
953
|
+
|
|
954
|
+
#[test]
|
|
955
|
+
fn test_convert_with_tables_single_cell() {
|
|
956
|
+
let html = r#"<table><tr><td>Only cell</td></tr></table>"#;
|
|
957
|
+
let result = tables_from_html(html);
|
|
958
|
+
assert_eq!(result.tables.len(), 1);
|
|
959
|
+
assert_eq!(result.tables[0].cells.len(), 1);
|
|
960
|
+
assert_eq!(result.tables[0].cells[0], vec!["Only cell"]);
|
|
961
|
+
}
|
|
962
|
+
|
|
963
|
+
#[test]
|
|
964
|
+
fn test_convert_with_tables_content_preserved() {
|
|
965
|
+
let html = r#"<p>Before</p><table><tr><td>Cell</td></tr></table><p>After</p>"#;
|
|
966
|
+
let result = tables_from_html(html);
|
|
967
|
+
assert!(result.content.contains("Before"));
|
|
968
|
+
assert!(result.content.contains("After"));
|
|
969
|
+
assert!(result.content.contains('|'), "Markdown table should appear in content");
|
|
970
|
+
}
|
|
971
|
+
|
|
972
|
+
#[test]
|
|
973
|
+
fn test_convert_with_tables_with_options() {
|
|
974
|
+
let options = ConversionOptions {
|
|
975
|
+
heading_style: crate::options::HeadingStyle::Underlined,
|
|
976
|
+
..ConversionOptions::default()
|
|
977
|
+
};
|
|
978
|
+
let html = r#"<h1>Title</h1><table><tr><td>Cell</td></tr></table>"#;
|
|
979
|
+
let result = convert_with_tables(
|
|
980
|
+
html,
|
|
981
|
+
Some(options),
|
|
982
|
+
#[cfg(feature = "metadata")]
|
|
983
|
+
None,
|
|
984
|
+
#[cfg(not(feature = "metadata"))]
|
|
985
|
+
None,
|
|
986
|
+
)
|
|
987
|
+
.unwrap();
|
|
988
|
+
assert_eq!(result.tables.len(), 1);
|
|
989
|
+
assert!(result.content.contains("Title"));
|
|
990
|
+
}
|
|
991
|
+
|
|
992
|
+
#[test]
|
|
993
|
+
fn test_convert_with_tables_plain_text_format() {
|
|
994
|
+
let options = ConversionOptions {
|
|
995
|
+
output_format: crate::options::OutputFormat::Plain,
|
|
996
|
+
..ConversionOptions::default()
|
|
997
|
+
};
|
|
998
|
+
let html = r#"<table><tr><th>Name</th></tr><tr><td>Alice</td></tr></table>"#;
|
|
999
|
+
let result = convert_with_tables(
|
|
1000
|
+
html,
|
|
1001
|
+
Some(options),
|
|
1002
|
+
#[cfg(feature = "metadata")]
|
|
1003
|
+
None,
|
|
1004
|
+
#[cfg(not(feature = "metadata"))]
|
|
1005
|
+
None,
|
|
1006
|
+
)
|
|
1007
|
+
.unwrap();
|
|
1008
|
+
assert!(
|
|
1009
|
+
!result.tables.is_empty(),
|
|
1010
|
+
"Tables should be populated even with plain text output format"
|
|
1011
|
+
);
|
|
1012
|
+
assert_eq!(result.tables[0].cells[0], vec!["Name"]);
|
|
1013
|
+
}
|
|
1014
|
+
|
|
1015
|
+
#[cfg(feature = "metadata")]
|
|
1016
|
+
#[test]
|
|
1017
|
+
fn test_convert_with_tables_metadata_integration() {
|
|
1018
|
+
let html = r#"<html lang="en"><head><title>Test</title></head><body>
|
|
1019
|
+
<table><tr><th>Col</th></tr><tr><td>Val</td></tr></table>
|
|
1020
|
+
</body></html>"#;
|
|
1021
|
+
let config = MetadataConfig::default();
|
|
1022
|
+
let result = convert_with_tables(html, None, Some(config)).unwrap();
|
|
1023
|
+
assert_eq!(result.tables.len(), 1);
|
|
1024
|
+
let meta = result.metadata.as_ref().expect("metadata should be present");
|
|
1025
|
+
assert_eq!(meta.document.language, Some("en".to_string()));
|
|
1026
|
+
}
|
|
1027
|
+
|
|
1028
|
+
#[cfg(feature = "metadata")]
|
|
1029
|
+
#[test]
|
|
1030
|
+
fn test_convert_with_tables_plain_text_metadata() {
|
|
1031
|
+
let options = ConversionOptions {
|
|
1032
|
+
output_format: crate::options::OutputFormat::Plain,
|
|
1033
|
+
..ConversionOptions::default()
|
|
1034
|
+
};
|
|
1035
|
+
let html = r#"<html lang="fr"><body>
|
|
1036
|
+
<table><tr><td>Cell</td></tr></table>
|
|
1037
|
+
</body></html>"#;
|
|
1038
|
+
let config = MetadataConfig::default();
|
|
1039
|
+
let result = convert_with_tables(html, Some(options), Some(config)).unwrap();
|
|
1040
|
+
assert!(
|
|
1041
|
+
!result.tables.is_empty(),
|
|
1042
|
+
"Tables should be populated in plain text mode"
|
|
1043
|
+
);
|
|
1044
|
+
let meta = result.metadata.as_ref().expect("metadata should be present");
|
|
1045
|
+
assert_eq!(
|
|
1046
|
+
meta.document.language,
|
|
1047
|
+
Some("fr".to_string()),
|
|
1048
|
+
"Metadata should be populated in plain text mode"
|
|
1049
|
+
);
|
|
1050
|
+
}
|
|
1051
|
+
}
|
|
@@ -136,11 +136,9 @@ pub(crate) fn convert_html_impl(
|
|
|
136
136
|
}
|
|
137
137
|
}
|
|
138
138
|
|
|
139
|
-
//
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
return Ok(plain);
|
|
143
|
-
}
|
|
139
|
+
// Plain text output: run the full pipeline (for metadata + visitor callbacks),
|
|
140
|
+
// then return plain text instead of markdown.
|
|
141
|
+
let is_plain_text = options.output_format == OutputFormat::Plain;
|
|
144
142
|
|
|
145
143
|
let wants_frontmatter = options.extract_metadata && !options.convert_as_inline;
|
|
146
144
|
#[cfg(feature = "metadata")]
|
|
@@ -230,6 +228,13 @@ pub(crate) fn convert_html_impl(
|
|
|
230
228
|
return Err(crate::error::ConversionError::Visitor(err.clone()));
|
|
231
229
|
}
|
|
232
230
|
|
|
231
|
+
// If plain text was requested, discard the markdown output and return plain text.
|
|
232
|
+
// The full pipeline was still run above so that metadata + visitor callbacks fire.
|
|
233
|
+
if is_plain_text {
|
|
234
|
+
let plain = extract_plain_text(&dom, parser, options);
|
|
235
|
+
return Ok(plain);
|
|
236
|
+
}
|
|
237
|
+
|
|
233
238
|
trim_line_end_whitespace(&mut output);
|
|
234
239
|
let trimmed = output.trim_end_matches('\n');
|
|
235
240
|
if trimmed.is_empty() {
|
|
@@ -188,7 +188,8 @@ pub fn process_text_node(
|
|
|
188
188
|
// the current block's content, not from a previous block's closing.
|
|
189
189
|
// Without this distinction, the second paragraph after a "\n\n" boundary
|
|
190
190
|
// would incorrectly suppress the trailing space before inline elements.
|
|
191
|
-
let
|
|
191
|
+
let safe_start = ctx.block_content_start.min(output.len());
|
|
192
|
+
let current_block_output = &output[safe_start..];
|
|
192
193
|
let at_paragraph_break = current_block_output.ends_with("\n\n");
|
|
193
194
|
if !at_paragraph_break {
|
|
194
195
|
if has_double_newline {
|
|
@@ -98,6 +98,9 @@ pub use convert_api::convert_with_metadata;
|
|
|
98
98
|
#[cfg(feature = "visitor")]
|
|
99
99
|
pub use convert_api::convert_with_visitor;
|
|
100
100
|
|
|
101
|
+
#[cfg(feature = "visitor")]
|
|
102
|
+
pub use convert_api::{ConversionWithTables, TableData, convert_with_tables};
|
|
103
|
+
|
|
101
104
|
#[cfg(feature = "async-visitor")]
|
|
102
105
|
pub use convert_api::convert_with_async_visitor;
|
|
103
106
|
|
|
@@ -19,5 +19,8 @@ pub use crate::convert_with_metadata;
|
|
|
19
19
|
#[cfg(feature = "visitor")]
|
|
20
20
|
pub use crate::convert_with_visitor;
|
|
21
21
|
|
|
22
|
+
#[cfg(feature = "visitor")]
|
|
23
|
+
pub use crate::{ConversionWithTables, TableData, convert_with_tables};
|
|
24
|
+
|
|
22
25
|
#[cfg(feature = "async-visitor")]
|
|
23
26
|
pub use crate::convert_with_async_visitor;
|