html-to-markdown 2.24.6 → 2.25.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/html-to-markdown-rb/native/Cargo.lock +9 -32
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -1
- data/lib/html_to_markdown/version.rb +1 -1
- data/rust-vendor/html-to-markdown-rs/Cargo.toml +0 -1
- data/rust-vendor/html-to-markdown-rs/src/converter/main_helpers.rs +1 -1
- data/rust-vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +20 -5
- data/rust-vendor/html-to-markdown-rs/src/lib.rs +1 -0
- data/rust-vendor/{markup5ever_rcdom/lib.rs → html-to-markdown-rs/src/rcdom.rs} +56 -91
- data/rust-vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +157 -0
- data/rust-vendor/memmap2/.cargo-checksum.json +1 -1
- data/rust-vendor/memmap2/.cargo_vcs_info.json +1 -1
- data/rust-vendor/memmap2/CHANGELOG.md +8 -0
- data/rust-vendor/memmap2/Cargo.lock +1 -1
- data/rust-vendor/memmap2/Cargo.toml +2 -1
- data/rust-vendor/memmap2/Cargo.toml.orig +2 -1
- data/rust-vendor/memmap2/src/lib.rs +25 -1
- data/rust-vendor/memmap2/src/stub.rs +1 -4
- data/rust-vendor/memmap2/src/unix.rs +14 -1
- data/rust-vendor/png/.cargo-checksum.json +1 -1
- data/rust-vendor/png/.cargo_vcs_info.json +1 -1
- data/rust-vendor/png/CHANGES.md +44 -0
- data/rust-vendor/png/Cargo.lock +124 -171
- data/rust-vendor/png/Cargo.toml +1 -1
- data/rust-vendor/png/Cargo.toml.orig +1 -1
- data/rust-vendor/png/benches/expand_paletted.rs +5 -5
- data/rust-vendor/png/benches/unfilter.rs +3 -3
- data/rust-vendor/png/src/adam7.rs +17 -10
- data/rust-vendor/png/src/common.rs +8 -8
- data/rust-vendor/png/src/decoder/mod.rs +53 -20
- data/rust-vendor/png/src/decoder/stream.rs +263 -78
- data/rust-vendor/png/src/decoder/unfiltering_buffer.rs +210 -53
- data/rust-vendor/png/src/decoder/zlib.rs +130 -90
- data/rust-vendor/png/src/encoder.rs +4 -2
- data/rust-vendor/png/src/{filter.rs → filter/mod.rs} +100 -367
- data/rust-vendor/png/src/filter/optimization-notes.md +104 -0
- data/rust-vendor/png/src/filter/paeth.rs +398 -0
- data/rust-vendor/png/src/filter/simd.rs +308 -0
- data/rust-vendor/png/src/lib.rs +1 -0
- data/rust-vendor/syn/.cargo-checksum.json +1 -1
- data/rust-vendor/syn/.cargo_vcs_info.json +1 -1
- data/rust-vendor/syn/Cargo.lock +40 -41
- data/rust-vendor/syn/Cargo.toml +1 -1
- data/rust-vendor/syn/Cargo.toml.orig +1 -1
- data/rust-vendor/syn/src/item.rs +61 -40
- data/rust-vendor/syn/src/lib.rs +2 -1
- data/rust-vendor/syn/tests/test_item.rs +54 -0
- data/rust-vendor/unicode-ident/.cargo-checksum.json +1 -1
- data/rust-vendor/unicode-ident/.cargo_vcs_info.json +1 -1
- data/rust-vendor/unicode-ident/Cargo.lock +21 -21
- data/rust-vendor/unicode-ident/Cargo.toml +1 -1
- data/rust-vendor/unicode-ident/Cargo.toml.orig +1 -1
- data/rust-vendor/unicode-ident/src/lib.rs +1 -1
- data/rust-vendor/unicode-ident/src/tables.rs +87 -97
- data/rust-vendor/unicode-ident/tests/static_size.rs +1 -1
- metadata +7 -177
- data/rust-vendor/markup5ever_rcdom/.cargo-checksum.json +0 -1
- data/rust-vendor/markup5ever_rcdom/.cargo_vcs_info.json +0 -7
- data/rust-vendor/markup5ever_rcdom/Cargo.lock +0 -658
- data/rust-vendor/markup5ever_rcdom/Cargo.toml +0 -109
- data/rust-vendor/markup5ever_rcdom/Cargo.toml.orig +0 -42
- data/rust-vendor/markup5ever_rcdom/LICENSE-APACHE +0 -201
- data/rust-vendor/markup5ever_rcdom/LICENSE-MIT +0 -25
- data/rust-vendor/markup5ever_rcdom/README.md +0 -7
- data/rust-vendor/markup5ever_rcdom/custom-html5lib-tokenizer-tests/regression.test +0 -69
- data/rust-vendor/markup5ever_rcdom/data/test/ignore +0 -1
- data/rust-vendor/markup5ever_rcdom/examples/hello_xml.rs +0 -39
- data/rust-vendor/markup5ever_rcdom/examples/html2html.rs +0 -51
- data/rust-vendor/markup5ever_rcdom/examples/print-rcdom.rs +0 -78
- data/rust-vendor/markup5ever_rcdom/examples/xml_tree_printer.rs +0 -67
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/.gitattributes +0 -2
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/.github/workflows/downstream.yml +0 -76
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/.github/workflows/lint.yml +0 -25
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/.gitignore +0 -79
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/AUTHORS.rst +0 -34
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/LICENSE +0 -21
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/chardet/test_big5.txt +0 -51
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/scripted/tests1.dat +0 -5
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/test-yahoo-jp.dat +0 -10
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/tests1.dat +0 -388
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/tests2.dat +0 -115
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint +0 -6
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/__init__.py +0 -0
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/__init__.py +0 -0
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/LICENSE +0 -18
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/__init__.py +0 -0
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/lexer.py +0 -211
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/lexer.pyi +0 -34
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/parser.py +0 -872
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/parser.pyi +0 -83
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/py.typed +0 -0
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/util.py +0 -72
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/util.pyi +0 -7
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/vendor.txt +0 -1
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor-patches/funcparserlib.patch +0 -24
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/lint.py +0 -280
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/parser.py +0 -177
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/pyproject.toml +0 -7
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/core.test +0 -125
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/injectmeta.test +0 -66
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/optionaltags.test +0 -965
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/options.test +0 -60
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/whitespace.test +0 -51
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/README.md +0 -107
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/contentModelFlags.test +0 -93
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/domjs.test +0 -335
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/entities.test +0 -542
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/escapeFlag.test +0 -36
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/namedEntities.test +0 -42422
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/numericEntities.test +0 -1677
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/pendingSpecChanges.test +0 -9
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test1.test +0 -353
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test2.test +0 -275
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test3.test +0 -11233
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test4.test +0 -532
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/unicodeChars.test +0 -1577
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/unicodeCharsProblematic.test +0 -41
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/xmlViolation.test +0 -20
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/README.md +0 -108
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/adoption01.dat +0 -354
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/adoption02.dat +0 -39
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/blocks.dat +0 -695
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/comments01.dat +0 -217
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/doctype01.dat +0 -474
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/domjs-unsafe.dat +0 -0
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/entities01.dat +0 -943
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/entities02.dat +0 -309
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/foreign-fragment.dat +0 -645
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/html5test-com.dat +0 -301
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/inbody01.dat +0 -54
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/isindex.dat +0 -49
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/main-element.dat +0 -46
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/math.dat +0 -104
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/menuitem-element.dat +0 -240
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/namespace-sensitivity.dat +0 -22
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/noscript01.dat +0 -237
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/pending-spec-changes-plain-text-unsafe.dat +0 -0
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/pending-spec-changes.dat +0 -46
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/plain-text-unsafe.dat +0 -0
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/quirks01.dat +0 -53
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/ruby.dat +0 -302
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scriptdata01.dat +0 -372
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scripted/adoption01.dat +0 -16
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scripted/ark.dat +0 -27
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scripted/webkit01.dat +0 -30
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/search-element.dat +0 -46
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/svg.dat +0 -104
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tables01.dat +0 -322
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/template.dat +0 -1673
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests1.dat +0 -1956
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests10.dat +0 -849
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests11.dat +0 -523
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests12.dat +0 -62
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests14.dat +0 -75
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests15.dat +0 -216
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests16.dat +0 -2602
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests17.dat +0 -179
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests18.dat +0 -558
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests19.dat +0 -1398
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests2.dat +0 -831
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests20.dat +0 -842
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests21.dat +0 -306
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests22.dat +0 -190
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests23.dat +0 -168
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests24.dat +0 -79
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests25.dat +0 -288
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests26.dat +0 -453
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests3.dat +0 -305
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests4.dat +0 -74
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests5.dat +0 -210
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests6.dat +0 -663
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests7.dat +0 -453
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests8.dat +0 -165
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests9.dat +0 -472
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests_innerHTML_1.dat +0 -843
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tricky01.dat +0 -336
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/webkit01.dat +0 -785
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/webkit02.dat +0 -554
- data/rust-vendor/markup5ever_rcdom/tests/foreach_html5lib_test/mod.rs +0 -41
- data/rust-vendor/markup5ever_rcdom/tests/html-driver.rs +0 -29
- data/rust-vendor/markup5ever_rcdom/tests/html-serializer.rs +0 -265
- data/rust-vendor/markup5ever_rcdom/tests/html-tokenizer.rs +0 -487
- data/rust-vendor/markup5ever_rcdom/tests/html-tree-builder.rs +0 -298
- data/rust-vendor/markup5ever_rcdom/tests/html-tree-sink.rs +0 -141
- data/rust-vendor/markup5ever_rcdom/tests/util/find_tests.rs +0 -34
- data/rust-vendor/markup5ever_rcdom/tests/util/runner.rs +0 -48
- data/rust-vendor/markup5ever_rcdom/tests/xml-driver.rs +0 -101
- data/rust-vendor/markup5ever_rcdom/tests/xml-tokenizer.rs +0 -374
- data/rust-vendor/markup5ever_rcdom/tests/xml-tree-builder.rs +0 -237
- data/rust-vendor/markup5ever_rcdom/xml5lib-tests/AUTHORS.rst +0 -9
- data/rust-vendor/markup5ever_rcdom/xml5lib-tests/LICENSE +0 -21
- data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/README.md +0 -92
- data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/comments.test +0 -274
- data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/doctype.test +0 -3232
- data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/entities.test +0 -283
- data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/eof.test +0 -113
- data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/namedEntities.test +0 -42210
- data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/numericEntities.test +0 -1349
- data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/test1.test +0 -162
- data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/test2.test +0 -64
- data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/unicodeChars.test +0 -1295
- data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tree-construction/README.md +0 -104
- data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tree-construction/namespace.dat +0 -119
- data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tree-construction/test1.dat +0 -124
- data/rust-vendor/xml5ever/.cargo-checksum.json +0 -1
- data/rust-vendor/xml5ever/.cargo_vcs_info.json +0 -6
- data/rust-vendor/xml5ever/Cargo.lock +0 -752
- data/rust-vendor/xml5ever/Cargo.toml +0 -69
- data/rust-vendor/xml5ever/Cargo.toml.orig +0 -29
- data/rust-vendor/xml5ever/LICENSE-APACHE +0 -201
- data/rust-vendor/xml5ever/LICENSE-MIT +0 -25
- data/rust-vendor/xml5ever/README.md +0 -72
- data/rust-vendor/xml5ever/benches/xml5ever.rs +0 -77
- data/rust-vendor/xml5ever/data/bench/strong.xml +0 -1
- data/rust-vendor/xml5ever/examples/README.md +0 -223
- data/rust-vendor/xml5ever/examples/example.xml +0 -3
- data/rust-vendor/xml5ever/examples/simple_xml_tokenizer.rs +0 -81
- data/rust-vendor/xml5ever/examples/xml_tokenizer.rs +0 -115
- data/rust-vendor/xml5ever/src/driver.rs +0 -90
- data/rust-vendor/xml5ever/src/lib.rs +0 -47
- data/rust-vendor/xml5ever/src/macros.rs +0 -18
- data/rust-vendor/xml5ever/src/serialize/mod.rs +0 -216
- data/rust-vendor/xml5ever/src/tokenizer/char_ref/mod.rs +0 -456
- data/rust-vendor/xml5ever/src/tokenizer/interface.rs +0 -116
- data/rust-vendor/xml5ever/src/tokenizer/mod.rs +0 -1344
- data/rust-vendor/xml5ever/src/tokenizer/qname.rs +0 -84
- data/rust-vendor/xml5ever/src/tokenizer/states.rs +0 -167
- data/rust-vendor/xml5ever/src/tree_builder/mod.rs +0 -774
- data/rust-vendor/xml5ever/src/tree_builder/types.rs +0 -37
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: deb434d4fa161dcd1df3b836eac2ce0133bac0124f88a890757042044501acf2
|
|
4
|
+
data.tar.gz: 76a70b3e1a1abafa465b0d076526d1b58d50f3a39453e406bc2d4b4c8b669ce6
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 59404b56deed91b2f0c65bf358fd05b685042804b30a2ee92ebb5cdb220a6b89c9f3dbc62d5576ef7d818c6372c42c1f8a2cc8bfbd2b8db16c8ecdbcdd60d427
|
|
7
|
+
data.tar.gz: 30f12a84b9b2b32aa6a1c1c3499ff8777e22994c84a38a22d4127eb91274ba4c9923270c2e8583e56b6cf758202581c58b38315a38fbd8c28ec737251ae32288
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
html-to-markdown (2.24.6)
|
|
4
|
+
html-to-markdown (2.25.1)
|
|
5
5
|
rb_sys (>= 0.9, < 1.0)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
@@ -161,7 +161,7 @@ CHECKSUMS
|
|
|
161
161
|
ffi (1.17.3-x86_64-darwin) sha256=1f211811eb5cfaa25998322cdd92ab104bfbd26d1c4c08471599c511f2c00bb5
|
|
162
162
|
ffi (1.17.3-x86_64-linux-gnu) sha256=3746b01f677aae7b16dc1acb7cb3cc17b3e35bdae7676a3f568153fb0e2c887f
|
|
163
163
|
fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
|
|
164
|
-
html-to-markdown (2.24.6)
|
|
164
|
+
html-to-markdown (2.25.1)
|
|
165
165
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
166
166
|
json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
|
|
167
167
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
data/README.md
CHANGED
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/html-to-markdown?label=Java&color=007ec6" alt="Java">
|
|
19
19
|
</a>
|
|
20
20
|
<a href="https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v2/htmltomarkdown">
|
|
21
|
-
<img src="https://img.shields.io/badge/Go-v2.24.6-007ec6" alt="Go">
|
|
21
|
+
<img src="https://img.shields.io/badge/Go-v2.25.1-007ec6" alt="Go">
|
|
22
22
|
</a>
|
|
23
23
|
<a href="https://www.nuget.org/packages/KreuzbergDev.HtmlToMarkdown/">
|
|
24
24
|
<img src="https://img.shields.io/nuget/v/KreuzbergDev.HtmlToMarkdown?label=C%23&color=007ec6" alt="C#">
|
|
@@ -424,7 +424,7 @@ dependencies = [
|
|
|
424
424
|
|
|
425
425
|
[[package]]
|
|
426
426
|
name = "html-to-markdown-rb"
|
|
427
|
-
version = "2.24.6"
|
|
427
|
+
version = "2.25.1"
|
|
428
428
|
dependencies = [
|
|
429
429
|
"html-to-markdown-rs",
|
|
430
430
|
"magnus",
|
|
@@ -443,7 +443,6 @@ dependencies = [
|
|
|
443
443
|
"html5ever",
|
|
444
444
|
"image",
|
|
445
445
|
"lru",
|
|
446
|
-
"markup5ever_rcdom",
|
|
447
446
|
"once_cell",
|
|
448
447
|
"regex",
|
|
449
448
|
"serde",
|
|
@@ -663,18 +662,6 @@ dependencies = [
|
|
|
663
662
|
"web_atoms",
|
|
664
663
|
]
|
|
665
664
|
|
|
666
|
-
[[package]]
|
|
667
|
-
name = "markup5ever_rcdom"
|
|
668
|
-
version = "0.36.0+unofficial"
|
|
669
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
670
|
-
checksum = "3e5fc8802e8797c0dfdd2ce5c21aa0aee21abbc7b3b18559100651b3352a7b63"
|
|
671
|
-
dependencies = [
|
|
672
|
-
"html5ever",
|
|
673
|
-
"markup5ever",
|
|
674
|
-
"tendril",
|
|
675
|
-
"xml5ever",
|
|
676
|
-
]
|
|
677
|
-
|
|
678
665
|
[[package]]
|
|
679
666
|
name = "memchr"
|
|
680
667
|
version = "2.8.0"
|
|
@@ -683,9 +670,9 @@ checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
|
|
|
683
670
|
|
|
684
671
|
[[package]]
|
|
685
672
|
name = "memmap2"
|
|
686
|
-
version = "0.9.
|
|
673
|
+
version = "0.9.10"
|
|
687
674
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
688
|
-
checksum = "
|
|
675
|
+
checksum = "714098028fe011992e1c3962653c96b2d578c4b4bce9036e15ff220319b1e0e3"
|
|
689
676
|
dependencies = [
|
|
690
677
|
"libc",
|
|
691
678
|
]
|
|
@@ -841,9 +828,9 @@ dependencies = [
|
|
|
841
828
|
|
|
842
829
|
[[package]]
|
|
843
830
|
name = "png"
|
|
844
|
-
version = "0.18.
|
|
831
|
+
version = "0.18.1"
|
|
845
832
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
846
|
-
checksum = "
|
|
833
|
+
checksum = "60769b8b31b2a9f263dae2776c37b1b28ae246943cf719eb6946a1db05128a61"
|
|
847
834
|
dependencies = [
|
|
848
835
|
"bitflags 2.11.0",
|
|
849
836
|
"crc32fast",
|
|
@@ -1217,9 +1204,9 @@ dependencies = [
|
|
|
1217
1204
|
|
|
1218
1205
|
[[package]]
|
|
1219
1206
|
name = "syn"
|
|
1220
|
-
version = "2.0.
|
|
1207
|
+
version = "2.0.116"
|
|
1221
1208
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1222
|
-
checksum = "
|
|
1209
|
+
checksum = "3df424c70518695237746f84cede799c9c58fcb37450d7b23716568cc8bc69cb"
|
|
1223
1210
|
dependencies = [
|
|
1224
1211
|
"proc-macro2",
|
|
1225
1212
|
"quote",
|
|
@@ -1272,9 +1259,9 @@ dependencies = [
|
|
|
1272
1259
|
|
|
1273
1260
|
[[package]]
|
|
1274
1261
|
name = "unicode-ident"
|
|
1275
|
-
version = "1.0.
|
|
1262
|
+
version = "1.0.24"
|
|
1276
1263
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1277
|
-
checksum = "
|
|
1264
|
+
checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
|
|
1278
1265
|
|
|
1279
1266
|
[[package]]
|
|
1280
1267
|
name = "unicode-xid"
|
|
@@ -1556,16 +1543,6 @@ dependencies = [
|
|
|
1556
1543
|
"wasmparser",
|
|
1557
1544
|
]
|
|
1558
1545
|
|
|
1559
|
-
[[package]]
|
|
1560
|
-
name = "xml5ever"
|
|
1561
|
-
version = "0.36.1"
|
|
1562
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1563
|
-
checksum = "f57dd51b88a4b9f99f9b55b136abb86210629d61c48117ddb87f567e51e66be7"
|
|
1564
|
-
dependencies = [
|
|
1565
|
-
"log",
|
|
1566
|
-
"markup5ever",
|
|
1567
|
-
]
|
|
1568
|
-
|
|
1569
1546
|
[[package]]
|
|
1570
1547
|
name = "yansi"
|
|
1571
1548
|
version = "1.0.1"
|
|
@@ -35,7 +35,6 @@ ahash = "0.8"
|
|
|
35
35
|
html-escape = "0.2.13"
|
|
36
36
|
image = { version = "0.25", default-features = false, features = ["gif", "jpeg", "png", "bmp", "webp"], optional = true }
|
|
37
37
|
html5ever = "0.36"
|
|
38
|
-
markup5ever_rcdom = "0.36"
|
|
39
38
|
lru = "0.16"
|
|
40
39
|
serde = { version = "1.0", features = ["derive"], optional = true }
|
|
41
40
|
serde_json = { version = "1.0", optional = true }
|
|
@@ -101,9 +101,9 @@ pub fn has_custom_element_tags(html: &str) -> bool {
|
|
|
101
101
|
///
|
|
102
102
|
/// Returns Some(repaired_html) if repair was successful, None otherwise.
|
|
103
103
|
pub fn repair_with_html5ever(input: &str) -> Option<String> {
|
|
104
|
+
use crate::rcdom::{RcDom, SerializableHandle};
|
|
104
105
|
use html5ever::serialize::{SerializeOpts, serialize};
|
|
105
106
|
use html5ever::tendril::TendrilSink;
|
|
106
|
-
use markup5ever_rcdom::{RcDom, SerializableHandle};
|
|
107
107
|
|
|
108
108
|
let dom = html5ever::parse_document(RcDom::default(), Default::default())
|
|
109
109
|
.from_utf8()
|
|
@@ -57,17 +57,30 @@ pub fn detect_heading_paragraph(element: &HocrElement, text: &str) -> Option<Str
|
|
|
57
57
|
return None;
|
|
58
58
|
}
|
|
59
59
|
|
|
60
|
-
let
|
|
60
|
+
let line_children: Vec<&HocrElement> = element
|
|
61
61
|
.children
|
|
62
62
|
.iter()
|
|
63
63
|
.filter(|child| matches!(child.element_type, HocrElementType::OcrLine | HocrElementType::OcrxLine))
|
|
64
|
-
.
|
|
64
|
+
.collect();
|
|
65
65
|
|
|
66
|
-
if
|
|
66
|
+
if line_children.len() != 1 {
|
|
67
67
|
return None;
|
|
68
68
|
}
|
|
69
69
|
|
|
70
|
-
|
|
70
|
+
// Determine effective font size from child line elements.
|
|
71
|
+
// First check x_fsize, then fall back to bbox height as a proxy.
|
|
72
|
+
let font_size = line_children.iter().find_map(|child| {
|
|
73
|
+
child
|
|
74
|
+
.properties
|
|
75
|
+
.x_fsize
|
|
76
|
+
.or_else(|| child.properties.bbox.map(|b| b.height()))
|
|
77
|
+
});
|
|
78
|
+
|
|
79
|
+
let has_large_font = font_size.is_some_and(|size| size >= 14);
|
|
80
|
+
|
|
81
|
+
let char_limit = if has_large_font { 80 } else { 60 };
|
|
82
|
+
|
|
83
|
+
if text.is_empty() || text.len() > char_limit || text.contains(':') || text.contains('\n') {
|
|
71
84
|
return None;
|
|
72
85
|
}
|
|
73
86
|
|
|
@@ -83,7 +96,9 @@ pub fn detect_heading_paragraph(element: &HocrElement, text: &str) -> Option<Str
|
|
|
83
96
|
}
|
|
84
97
|
}
|
|
85
98
|
|
|
86
|
-
|
|
99
|
+
// Allow single-word headings when font size is large
|
|
100
|
+
let min_words = if has_large_font { 1 } else { 2 };
|
|
101
|
+
if word_count < min_words {
|
|
87
102
|
return None;
|
|
88
103
|
}
|
|
89
104
|
|
|
@@ -1,43 +1,39 @@
|
|
|
1
|
-
//
|
|
2
|
-
//
|
|
1
|
+
// Vendored from markup5ever_rcdom v0.36.0+unofficial
|
|
2
|
+
// Original source: https://github.com/servo/html5ever (rcdom/)
|
|
3
|
+
// Copyright (c) 2014 The html5ever Project Developers
|
|
4
|
+
// Licensed under MIT OR Apache-2.0 (see ATTRIBUTIONS.md)
|
|
3
5
|
//
|
|
4
|
-
//
|
|
5
|
-
//
|
|
6
|
-
//
|
|
7
|
-
//
|
|
8
|
-
//
|
|
6
|
+
// Vendored to:
|
|
7
|
+
// - Remove unused xml5ever transitive dependency
|
|
8
|
+
// - Eliminate pinned external dependency on "+unofficial" crate
|
|
9
|
+
// - Gain full control over this small, critical module
|
|
10
|
+
//
|
|
11
|
+
// Changes from upstream:
|
|
12
|
+
// - Replaced `extern crate markup5ever` / `extern crate tendril` with
|
|
13
|
+
// `use` imports through `html5ever` (edition 2024 compatibility)
|
|
14
|
+
// - Added module-level clippy allows for vendored code style
|
|
15
|
+
|
|
16
|
+
#![allow(
|
|
17
|
+
clippy::panic,
|
|
18
|
+
clippy::expect_used,
|
|
19
|
+
clippy::missing_panics_doc,
|
|
20
|
+
clippy::must_use_candidate,
|
|
21
|
+
clippy::return_self_not_must_use,
|
|
22
|
+
clippy::module_name_repetitions,
|
|
23
|
+
clippy::redundant_else,
|
|
24
|
+
clippy::match_wildcard_for_single_variants,
|
|
25
|
+
clippy::similar_names,
|
|
26
|
+
clippy::items_after_statements,
|
|
27
|
+
clippy::use_self,
|
|
28
|
+
clippy::missing_fields_in_debug,
|
|
29
|
+
clippy::semicolon_if_nothing_returned,
|
|
30
|
+
missing_docs
|
|
31
|
+
)]
|
|
9
32
|
|
|
10
33
|
//! A simple reference-counted DOM.
|
|
11
34
|
//!
|
|
12
35
|
//! This is sufficient as a static parse tree, but don't build a
|
|
13
36
|
//! web browser using it. :)
|
|
14
|
-
//!
|
|
15
|
-
//! A DOM is a [tree structure] with ordered children that can be represented in an XML-like
|
|
16
|
-
//! format. For example, the following graph
|
|
17
|
-
//!
|
|
18
|
-
//! ```text
|
|
19
|
-
//! div
|
|
20
|
-
//! +- "text node"
|
|
21
|
-
//! +- span
|
|
22
|
-
//! ```
|
|
23
|
-
//! in HTML would be serialized as
|
|
24
|
-
//!
|
|
25
|
-
//! ```html
|
|
26
|
-
//! <div>text node<span></span></div>
|
|
27
|
-
//! ```
|
|
28
|
-
//!
|
|
29
|
-
//! See the [document object model article on wikipedia][dom wiki] for more information.
|
|
30
|
-
//!
|
|
31
|
-
//! This implementation stores the information associated with each node once, and then hands out
|
|
32
|
-
//! refs to children. The nodes themselves are reference-counted to avoid copying - you can create
|
|
33
|
-
//! a new ref and then a node will outlive the document. Nodes own their children, but only have
|
|
34
|
-
//! weak references to their parents.
|
|
35
|
-
//!
|
|
36
|
-
//! [tree structure]: https://en.wikipedia.org/wiki/Tree_(data_structure)
|
|
37
|
-
//! [dom wiki]: https://en.wikipedia.org/wiki/Document_Object_Model
|
|
38
|
-
|
|
39
|
-
extern crate markup5ever;
|
|
40
|
-
extern crate tendril;
|
|
41
37
|
|
|
42
38
|
use std::borrow::Cow;
|
|
43
39
|
use std::cell::{Cell, RefCell};
|
|
@@ -48,16 +44,16 @@ use std::io;
|
|
|
48
44
|
use std::mem;
|
|
49
45
|
use std::rc::{Rc, Weak};
|
|
50
46
|
|
|
51
|
-
use tendril::StrTendril;
|
|
47
|
+
use html5ever::tendril::StrTendril;
|
|
52
48
|
|
|
53
|
-
use
|
|
54
|
-
use
|
|
55
|
-
use
|
|
56
|
-
use
|
|
57
|
-
use
|
|
58
|
-
use
|
|
59
|
-
use
|
|
60
|
-
use
|
|
49
|
+
use html5ever::Attribute;
|
|
50
|
+
use html5ever::ExpandedName;
|
|
51
|
+
use html5ever::QualName;
|
|
52
|
+
use html5ever::interface::tree_builder;
|
|
53
|
+
use html5ever::interface::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink};
|
|
54
|
+
use html5ever::serialize::TraversalScope;
|
|
55
|
+
use html5ever::serialize::TraversalScope::{ChildrenOnly, IncludeNode};
|
|
56
|
+
use html5ever::serialize::{Serialize, Serializer};
|
|
61
57
|
|
|
62
58
|
/// The different kinds of nodes in the DOM.
|
|
63
59
|
#[derive(Debug)]
|
|
@@ -98,10 +94,7 @@ pub enum NodeData {
|
|
|
98
94
|
},
|
|
99
95
|
|
|
100
96
|
/// A Processing instruction.
|
|
101
|
-
ProcessingInstruction {
|
|
102
|
-
target: StrTendril,
|
|
103
|
-
contents: StrTendril,
|
|
104
|
-
},
|
|
97
|
+
ProcessingInstruction { target: StrTendril, contents: StrTendril },
|
|
105
98
|
}
|
|
106
99
|
|
|
107
100
|
/// A DOM node.
|
|
@@ -130,10 +123,9 @@ impl Drop for Node {
|
|
|
130
123
|
let mut nodes = mem::take(&mut *self.children.borrow_mut());
|
|
131
124
|
while let Some(node) = nodes.pop() {
|
|
132
125
|
let children = mem::take(&mut *node.children.borrow_mut());
|
|
133
|
-
nodes.extend(children
|
|
126
|
+
nodes.extend(children);
|
|
134
127
|
if let NodeData::Element {
|
|
135
|
-
ref template_contents,
|
|
136
|
-
..
|
|
128
|
+
ref template_contents, ..
|
|
137
129
|
} = node.data
|
|
138
130
|
{
|
|
139
131
|
if let Some(template_contents) = template_contents.borrow_mut().take() {
|
|
@@ -193,7 +185,7 @@ fn append_to_existing_text(prev: &Handle, text: &str) -> bool {
|
|
|
193
185
|
NodeData::Text { ref contents } => {
|
|
194
186
|
contents.borrow_mut().push_slice(text);
|
|
195
187
|
true
|
|
196
|
-
}
|
|
188
|
+
}
|
|
197
189
|
_ => false,
|
|
198
190
|
}
|
|
199
191
|
}
|
|
@@ -240,8 +232,7 @@ impl TreeSink for RcDom {
|
|
|
240
232
|
|
|
241
233
|
fn get_template_contents(&self, target: &Handle) -> Handle {
|
|
242
234
|
if let NodeData::Element {
|
|
243
|
-
ref template_contents,
|
|
244
|
-
..
|
|
235
|
+
ref template_contents, ..
|
|
245
236
|
} = target.data
|
|
246
237
|
{
|
|
247
238
|
template_contents
|
|
@@ -287,10 +278,7 @@ impl TreeSink for RcDom {
|
|
|
287
278
|
}
|
|
288
279
|
|
|
289
280
|
fn create_pi(&self, target: StrTendril, data: StrTendril) -> Handle {
|
|
290
|
-
Node::new(NodeData::ProcessingInstruction {
|
|
291
|
-
target,
|
|
292
|
-
contents: data,
|
|
293
|
-
})
|
|
281
|
+
Node::new(NodeData::ProcessingInstruction { target, contents: data })
|
|
294
282
|
}
|
|
295
283
|
|
|
296
284
|
fn append(&self, parent: &Handle, child: NodeOrText<Handle>) {
|
|
@@ -315,8 +303,7 @@ impl TreeSink for RcDom {
|
|
|
315
303
|
}
|
|
316
304
|
|
|
317
305
|
fn append_before_sibling(&self, sibling: &Handle, child: NodeOrText<Handle>) {
|
|
318
|
-
let (parent, i) = get_parent_and_index(sibling)
|
|
319
|
-
.expect("append_before_sibling called on node without parent");
|
|
306
|
+
let (parent, i) = get_parent_and_index(sibling).expect("append_before_sibling called on node without parent");
|
|
320
307
|
|
|
321
308
|
let child = match (child, i) {
|
|
322
309
|
// No previous node.
|
|
@@ -334,7 +321,7 @@ impl TreeSink for RcDom {
|
|
|
334
321
|
Node::new(NodeData::Text {
|
|
335
322
|
contents: RefCell::new(text),
|
|
336
323
|
})
|
|
337
|
-
},
|
|
324
|
+
}
|
|
338
325
|
|
|
339
326
|
// The tree builder promises we won't have a text node after
|
|
340
327
|
// the insertion point.
|
|
@@ -366,12 +353,7 @@ impl TreeSink for RcDom {
|
|
|
366
353
|
}
|
|
367
354
|
}
|
|
368
355
|
|
|
369
|
-
fn append_doctype_to_document(
|
|
370
|
-
&self,
|
|
371
|
-
name: StrTendril,
|
|
372
|
-
public_id: StrTendril,
|
|
373
|
-
system_id: StrTendril,
|
|
374
|
-
) {
|
|
356
|
+
fn append_doctype_to_document(&self, name: StrTendril, public_id: StrTendril, system_id: StrTendril) {
|
|
375
357
|
append(
|
|
376
358
|
&self.document,
|
|
377
359
|
Node::new(NodeData::Doctype {
|
|
@@ -389,15 +371,8 @@ impl TreeSink for RcDom {
|
|
|
389
371
|
panic!("not an element")
|
|
390
372
|
};
|
|
391
373
|
|
|
392
|
-
let existing_names = existing
|
|
393
|
-
.iter()
|
|
394
|
-
.map(|e| e.name.clone())
|
|
395
|
-
.collect::<HashSet<_>>();
|
|
396
|
-
existing.extend(
|
|
397
|
-
attrs
|
|
398
|
-
.into_iter()
|
|
399
|
-
.filter(|attr| !existing_names.contains(&attr.name)),
|
|
400
|
-
);
|
|
374
|
+
let existing_names = existing.iter().map(|e| e.name.clone()).collect::<HashSet<_>>();
|
|
375
|
+
existing.extend(attrs.into_iter().filter(|attr| !existing_names.contains(&attr.name)));
|
|
401
376
|
}
|
|
402
377
|
|
|
403
378
|
fn remove_from_parent(&self, target: &Handle) {
|
|
@@ -461,27 +436,17 @@ impl Serialize for SerializableHandle {
|
|
|
461
436
|
let mut ops = VecDeque::new();
|
|
462
437
|
match traversal_scope {
|
|
463
438
|
IncludeNode => ops.push_back(SerializeOp::Open(self.0.clone())),
|
|
464
|
-
ChildrenOnly(_) => ops.extend(
|
|
465
|
-
self.0
|
|
466
|
-
.children
|
|
467
|
-
.borrow()
|
|
468
|
-
.iter()
|
|
469
|
-
.map(|h| SerializeOp::Open(h.clone())),
|
|
470
|
-
),
|
|
439
|
+
ChildrenOnly(_) => ops.extend(self.0.children.borrow().iter().map(|h| SerializeOp::Open(h.clone()))),
|
|
471
440
|
}
|
|
472
441
|
|
|
473
442
|
while let Some(op) = ops.pop_front() {
|
|
474
443
|
match op {
|
|
475
444
|
SerializeOp::Open(handle) => match handle.data {
|
|
476
445
|
NodeData::Element {
|
|
477
|
-
ref name,
|
|
478
|
-
ref attrs,
|
|
479
|
-
..
|
|
446
|
+
ref name, ref attrs, ..
|
|
480
447
|
} => {
|
|
481
|
-
serializer.start_elem(
|
|
482
|
-
name.clone(),
|
|
483
|
-
attrs.borrow().iter().map(|at| (&at.name, &at.value[..])),
|
|
484
|
-
)?;
|
|
448
|
+
serializer
|
|
449
|
+
.start_elem(name.clone(), attrs.borrow().iter().map(|at| (&at.name, &at.value[..])))?;
|
|
485
450
|
|
|
486
451
|
ops.reserve(1 + handle.children.borrow().len());
|
|
487
452
|
ops.push_front(SerializeOp::Close(name.clone()));
|
|
@@ -489,7 +454,7 @@ impl Serialize for SerializableHandle {
|
|
|
489
454
|
for child in handle.children.borrow().iter().rev() {
|
|
490
455
|
ops.push_front(SerializeOp::Open(child.clone()));
|
|
491
456
|
}
|
|
492
|
-
},
|
|
457
|
+
}
|
|
493
458
|
|
|
494
459
|
NodeData::Doctype { ref name, .. } => serializer.write_doctype(name)?,
|
|
495
460
|
|
|
@@ -507,7 +472,7 @@ impl Serialize for SerializableHandle {
|
|
|
507
472
|
|
|
508
473
|
SerializeOp::Close(name) => {
|
|
509
474
|
serializer.end_elem(name)?;
|
|
510
|
-
},
|
|
475
|
+
}
|
|
511
476
|
}
|
|
512
477
|
}
|
|
513
478
|
|
|
@@ -350,3 +350,160 @@ fn test_container_elements() {
|
|
|
350
350
|
assert!(markdown.contains("Layout analysis"));
|
|
351
351
|
assert!(markdown.contains("Block content"));
|
|
352
352
|
}
|
|
353
|
+
|
|
354
|
+
#[test]
|
|
355
|
+
fn test_ocr_header_renders_as_italic_not_heading() {
|
|
356
|
+
// OcrHeader is a "page running header" (repeated at top of pages),
|
|
357
|
+
// NOT a section heading. It must render as italic (*text*), not as # heading.
|
|
358
|
+
let hocr = r#"<div class="ocr_page" title="bbox 0 0 1000 1000">
|
|
359
|
+
<div class="ocr_header" title="bbox 0 0 1000 50">
|
|
360
|
+
<span class="ocr_line" title="bbox 0 0 500 30">
|
|
361
|
+
<span class="ocrx_word" title="bbox 0 0 100 30; x_wconf 95">Chapter</span>
|
|
362
|
+
<span class="ocrx_word" title="bbox 110 0 200 30; x_wconf 95">One</span>
|
|
363
|
+
</span>
|
|
364
|
+
</div>
|
|
365
|
+
<p class="ocr_par" title="bbox 0 100 900 200">
|
|
366
|
+
<span class="ocr_line" title="bbox 0 100 800 130">
|
|
367
|
+
<span class="ocrx_word" title="bbox 0 100 50 130; x_wconf 95">Some</span>
|
|
368
|
+
<span class="ocrx_word" title="bbox 60 100 120 130; x_wconf 95">body</span>
|
|
369
|
+
<span class="ocrx_word" title="bbox 130 100 180 130; x_wconf 95">text</span>
|
|
370
|
+
<span class="ocrx_word" title="bbox 190 100 240 130; x_wconf 95">here</span>
|
|
371
|
+
</span>
|
|
372
|
+
</p>
|
|
373
|
+
</div>"#;
|
|
374
|
+
|
|
375
|
+
let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
|
|
376
|
+
let (elements, _) = extract_hocr_document(&dom);
|
|
377
|
+
let markdown = convert_to_markdown(&elements, true);
|
|
378
|
+
|
|
379
|
+
// OcrHeader must render as italic
|
|
380
|
+
assert!(
|
|
381
|
+
markdown.contains("*Chapter One*"),
|
|
382
|
+
"OcrHeader should render as italic (*text*), got: {markdown}"
|
|
383
|
+
);
|
|
384
|
+
// It must NOT render as a markdown heading
|
|
385
|
+
assert!(
|
|
386
|
+
!markdown.contains("# Chapter One"),
|
|
387
|
+
"OcrHeader must NOT render as a markdown heading, got: {markdown}"
|
|
388
|
+
);
|
|
389
|
+
}
|
|
390
|
+
|
|
391
|
+
#[test]
|
|
392
|
+
fn test_heading_detection_with_x_fsize_on_line_child() {
|
|
393
|
+
// A paragraph containing a single ocr_line child with x_fsize 18 (large font)
|
|
394
|
+
// and short capitalized text should be detected as a heading.
|
|
395
|
+
let hocr = r#"<div class="ocr_page" title="bbox 0 0 1000 1000">
|
|
396
|
+
<div class="ocr_carea" title="bbox 0 0 1000 500">
|
|
397
|
+
<p class="ocr_par" title="bbox 0 0 500 40">
|
|
398
|
+
<span class="ocr_line" title="bbox 0 0 500 30; x_fsize 18">
|
|
399
|
+
<span class="ocrx_word" title="bbox 0 0 120 30; x_wconf 95">Important</span>
|
|
400
|
+
<span class="ocrx_word" title="bbox 130 0 250 30; x_wconf 95">Section</span>
|
|
401
|
+
<span class="ocrx_word" title="bbox 260 0 350 30; x_wconf 95">Title</span>
|
|
402
|
+
</span>
|
|
403
|
+
</p>
|
|
404
|
+
<p class="ocr_par" title="bbox 0 60 900 200">
|
|
405
|
+
<span class="ocr_line" title="bbox 0 60 800 90; x_fsize 12">
|
|
406
|
+
<span class="ocrx_word" title="bbox 0 60 50 90; x_wconf 95">This</span>
|
|
407
|
+
<span class="ocrx_word" title="bbox 60 60 90 90; x_wconf 92">is</span>
|
|
408
|
+
<span class="ocrx_word" title="bbox 100 60 200 90; x_wconf 98">regular</span>
|
|
409
|
+
<span class="ocrx_word" title="bbox 210 60 280 90; x_wconf 98">body</span>
|
|
410
|
+
<span class="ocrx_word" title="bbox 290 60 340 90; x_wconf 98">text</span>
|
|
411
|
+
<span class="ocrx_word" title="bbox 350 60 430 90; x_wconf 98">content.</span>
|
|
412
|
+
</span>
|
|
413
|
+
</p>
|
|
414
|
+
</div>
|
|
415
|
+
</div>"#;
|
|
416
|
+
|
|
417
|
+
let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
|
|
418
|
+
let (elements, _) = extract_hocr_document(&dom);
|
|
419
|
+
let markdown = convert_to_markdown(&elements, true);
|
|
420
|
+
|
|
421
|
+
// The large-font paragraph should be detected as a heading
|
|
422
|
+
assert!(
|
|
423
|
+
markdown.contains("# Important Section Title"),
|
|
424
|
+
"Large font paragraph should be detected as heading, got: {markdown}"
|
|
425
|
+
);
|
|
426
|
+
}
|
|
427
|
+
|
|
428
|
+
#[test]
|
|
429
|
+
fn test_single_word_heading_with_large_font() {
|
|
430
|
+
// A single-word paragraph with large font size should be detected as a heading.
|
|
431
|
+
// Without font size awareness, single-word paragraphs are rejected.
|
|
432
|
+
let hocr = r#"<div class="ocr_page" title="bbox 0 0 1000 1000">
|
|
433
|
+
<div class="ocr_carea" title="bbox 0 0 1000 500">
|
|
434
|
+
<p class="ocr_par" title="bbox 0 0 300 40">
|
|
435
|
+
<span class="ocr_line" title="bbox 0 0 300 30; x_fsize 24">
|
|
436
|
+
<span class="ocrx_word" title="bbox 0 0 200 30; x_wconf 95">Introduction</span>
|
|
437
|
+
</span>
|
|
438
|
+
</p>
|
|
439
|
+
<p class="ocr_par" title="bbox 0 60 900 200">
|
|
440
|
+
<span class="ocr_line" title="bbox 0 60 800 90; x_fsize 12">
|
|
441
|
+
<span class="ocrx_word" title="bbox 0 60 50 90; x_wconf 95">Some</span>
|
|
442
|
+
<span class="ocrx_word" title="bbox 60 60 120 90; x_wconf 92">body</span>
|
|
443
|
+
<span class="ocrx_word" title="bbox 130 60 180 90; x_wconf 98">text</span>
|
|
444
|
+
<span class="ocrx_word" title="bbox 190 60 280 90; x_wconf 98">follows.</span>
|
|
445
|
+
</span>
|
|
446
|
+
</p>
|
|
447
|
+
</div>
|
|
448
|
+
</div>"#;
|
|
449
|
+
|
|
450
|
+
let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
|
|
451
|
+
let (elements, _) = extract_hocr_document(&dom);
|
|
452
|
+
let markdown = convert_to_markdown(&elements, true);
|
|
453
|
+
|
|
454
|
+
// Single word with large font should be detected as heading
|
|
455
|
+
assert!(
|
|
456
|
+
markdown.contains("# Introduction"),
|
|
457
|
+
"Single word with large font should be detected as heading, got: {markdown}"
|
|
458
|
+
);
|
|
459
|
+
}
|
|
460
|
+
|
|
461
|
+
#[test]
|
|
462
|
+
fn test_single_word_without_large_font_not_heading() {
|
|
463
|
+
// A single-word paragraph without large font should NOT be detected as heading.
|
|
464
|
+
// This ensures we haven't broken the existing behavior.
|
|
465
|
+
let hocr = r#"<div class="ocr_page" title="bbox 0 0 1000 1000">
|
|
466
|
+
<div class="ocr_carea" title="bbox 0 0 1000 500">
|
|
467
|
+
<p class="ocr_par" title="bbox 0 0 300 20">
|
|
468
|
+
<span class="ocr_line" title="bbox 0 0 300 12; x_fsize 10">
|
|
469
|
+
<span class="ocrx_word" title="bbox 0 0 100 12; x_wconf 95">Word</span>
|
|
470
|
+
</span>
|
|
471
|
+
</p>
|
|
472
|
+
</div>
|
|
473
|
+
</div>"#;
|
|
474
|
+
|
|
475
|
+
let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
|
|
476
|
+
let (elements, _) = extract_hocr_document(&dom);
|
|
477
|
+
let markdown = convert_to_markdown(&elements, true);
|
|
478
|
+
|
|
479
|
+
// Single word with small font should NOT be a heading
|
|
480
|
+
assert!(
|
|
481
|
+
!markdown.contains("# Word"),
|
|
482
|
+
"Single word with small font should not be detected as heading, got: {markdown}"
|
|
483
|
+
);
|
|
484
|
+
}
|
|
485
|
+
|
|
486
|
+
#[test]
|
|
487
|
+
fn test_heading_detection_with_bbox_height_proxy() {
|
|
488
|
+
// When x_fsize is absent, bbox height should serve as a font-size proxy.
|
|
489
|
+
// A bbox height of 30 pixels (>= 14) indicates large text.
|
|
490
|
+
let hocr = r#"<div class="ocr_page" title="bbox 0 0 1000 1000">
|
|
491
|
+
<div class="ocr_carea" title="bbox 0 0 1000 500">
|
|
492
|
+
<p class="ocr_par" title="bbox 0 0 500 40">
|
|
493
|
+
<span class="ocr_line" title="bbox 0 0 500 30">
|
|
494
|
+
<span class="ocrx_word" title="bbox 0 0 200 30; x_wconf 95">Summary</span>
|
|
495
|
+
</span>
|
|
496
|
+
</p>
|
|
497
|
+
</div>
|
|
498
|
+
</div>"#;
|
|
499
|
+
|
|
500
|
+
let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
|
|
501
|
+
let (elements, _) = extract_hocr_document(&dom);
|
|
502
|
+
let markdown = convert_to_markdown(&elements, true);
|
|
503
|
+
|
|
504
|
+
// bbox height of 30 (y2=30 - y1=0) should serve as proxy for large font
|
|
505
|
+
assert!(
|
|
506
|
+
markdown.contains("# Summary"),
|
|
507
|
+
"Single word with tall bbox (height=30) should be detected as heading via bbox proxy, got: {markdown}"
|
|
508
|
+
);
|
|
509
|
+
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"files":{".cargo_vcs_info.json":"
|
|
1
|
+
{"files":{".cargo_vcs_info.json":"57a15fc6b2ddb894def64e09a11e5cc52e575be77740a6fc0aa3c451c8c0fa49",".github/workflows/main.yml":"e2b3d5678a31325a616bae0c1ccb223f9ed2f5b43d39a134c5c45858e4fbf4ca","CHANGELOG.md":"832a95917c80f443c113ede708a5602550d081bd79cc33f4152365357ab36bb8","Cargo.lock":"6ddd1fa91a86a6e8bbbc17161703f21799213d823ea26d7f7ce86181295a3fc8","Cargo.toml":"01ff6425d680ddac0a8efb1b41e6b0503d0153388fadcc4bf584ca38c4dca221","Cargo.toml.orig":"507519ce5facd6b78a80e3cfab1437c7b6b5a1fc99c65e24552525c884d4ed3e","LICENSE-APACHE":"04ea4849dba9dcae07113850c6f1b1a69052c625210639914eee352023f750ad","LICENSE-MIT":"0d25d03b5ab49576178ad0cae7a2648d12c17ad0452fe49c07e55e4b59aa5257","README.md":"e3388f55065d69e076d90871c0a91dc97420bd0d07b4f154b08e40ac47b115eb","examples/cat.rs":"594b9457ca6eb4ce9b840133da5076fa7b96334953df03f894233169564622f6","src/advice.rs":"a4c023982a598a77c23b5a4e524de581329d42287d639be88e2ffda3bd929511","src/lib.rs":"d93f73dd80b5bfdecc10836a7ebcd04c124f6283f9a104686fe48a18d34764ab","src/stub.rs":"beccccb0233903df5de1773674b2dcd9b0991889a10c23719f5aee8f7496f958","src/unix.rs":"fea7c7c21a6082bc77052e5e40a2bff1311103a19c2cd281ff383604fa799b35","src/windows.rs":"0fbd1efc122a7e83defb5d0a401e973a3876e90c6a1f14f2ac1976462633dd79"},"package":"714098028fe011992e1c3962653c96b2d578c4b4bce9036e15ff220319b1e0e3"}
|