html-to-markdown 2.24.6 → 2.25.1

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
Files changed (231)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/html-to-markdown-rb/native/Cargo.lock +9 -32
  5. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -1
  6. data/lib/html_to_markdown/version.rb +1 -1
  7. data/rust-vendor/html-to-markdown-rs/Cargo.toml +0 -1
  8. data/rust-vendor/html-to-markdown-rs/src/converter/main_helpers.rs +1 -1
  9. data/rust-vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +20 -5
  10. data/rust-vendor/html-to-markdown-rs/src/lib.rs +1 -0
  11. data/rust-vendor/{markup5ever_rcdom/lib.rs → html-to-markdown-rs/src/rcdom.rs} +56 -91
  12. data/rust-vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +157 -0
  13. data/rust-vendor/memmap2/.cargo-checksum.json +1 -1
  14. data/rust-vendor/memmap2/.cargo_vcs_info.json +1 -1
  15. data/rust-vendor/memmap2/CHANGELOG.md +8 -0
  16. data/rust-vendor/memmap2/Cargo.lock +1 -1
  17. data/rust-vendor/memmap2/Cargo.toml +2 -1
  18. data/rust-vendor/memmap2/Cargo.toml.orig +2 -1
  19. data/rust-vendor/memmap2/src/lib.rs +25 -1
  20. data/rust-vendor/memmap2/src/stub.rs +1 -4
  21. data/rust-vendor/memmap2/src/unix.rs +14 -1
  22. data/rust-vendor/png/.cargo-checksum.json +1 -1
  23. data/rust-vendor/png/.cargo_vcs_info.json +1 -1
  24. data/rust-vendor/png/CHANGES.md +44 -0
  25. data/rust-vendor/png/Cargo.lock +124 -171
  26. data/rust-vendor/png/Cargo.toml +1 -1
  27. data/rust-vendor/png/Cargo.toml.orig +1 -1
  28. data/rust-vendor/png/benches/expand_paletted.rs +5 -5
  29. data/rust-vendor/png/benches/unfilter.rs +3 -3
  30. data/rust-vendor/png/src/adam7.rs +17 -10
  31. data/rust-vendor/png/src/common.rs +8 -8
  32. data/rust-vendor/png/src/decoder/mod.rs +53 -20
  33. data/rust-vendor/png/src/decoder/stream.rs +263 -78
  34. data/rust-vendor/png/src/decoder/unfiltering_buffer.rs +210 -53
  35. data/rust-vendor/png/src/decoder/zlib.rs +130 -90
  36. data/rust-vendor/png/src/encoder.rs +4 -2
  37. data/rust-vendor/png/src/{filter.rs → filter/mod.rs} +100 -367
  38. data/rust-vendor/png/src/filter/optimization-notes.md +104 -0
  39. data/rust-vendor/png/src/filter/paeth.rs +398 -0
  40. data/rust-vendor/png/src/filter/simd.rs +308 -0
  41. data/rust-vendor/png/src/lib.rs +1 -0
  42. data/rust-vendor/syn/.cargo-checksum.json +1 -1
  43. data/rust-vendor/syn/.cargo_vcs_info.json +1 -1
  44. data/rust-vendor/syn/Cargo.lock +40 -41
  45. data/rust-vendor/syn/Cargo.toml +1 -1
  46. data/rust-vendor/syn/Cargo.toml.orig +1 -1
  47. data/rust-vendor/syn/src/item.rs +61 -40
  48. data/rust-vendor/syn/src/lib.rs +2 -1
  49. data/rust-vendor/syn/tests/test_item.rs +54 -0
  50. data/rust-vendor/unicode-ident/.cargo-checksum.json +1 -1
  51. data/rust-vendor/unicode-ident/.cargo_vcs_info.json +1 -1
  52. data/rust-vendor/unicode-ident/Cargo.lock +21 -21
  53. data/rust-vendor/unicode-ident/Cargo.toml +1 -1
  54. data/rust-vendor/unicode-ident/Cargo.toml.orig +1 -1
  55. data/rust-vendor/unicode-ident/src/lib.rs +1 -1
  56. data/rust-vendor/unicode-ident/src/tables.rs +87 -97
  57. data/rust-vendor/unicode-ident/tests/static_size.rs +1 -1
  58. metadata +7 -177
  59. data/rust-vendor/markup5ever_rcdom/.cargo-checksum.json +0 -1
  60. data/rust-vendor/markup5ever_rcdom/.cargo_vcs_info.json +0 -7
  61. data/rust-vendor/markup5ever_rcdom/Cargo.lock +0 -658
  62. data/rust-vendor/markup5ever_rcdom/Cargo.toml +0 -109
  63. data/rust-vendor/markup5ever_rcdom/Cargo.toml.orig +0 -42
  64. data/rust-vendor/markup5ever_rcdom/LICENSE-APACHE +0 -201
  65. data/rust-vendor/markup5ever_rcdom/LICENSE-MIT +0 -25
  66. data/rust-vendor/markup5ever_rcdom/README.md +0 -7
  67. data/rust-vendor/markup5ever_rcdom/custom-html5lib-tokenizer-tests/regression.test +0 -69
  68. data/rust-vendor/markup5ever_rcdom/data/test/ignore +0 -1
  69. data/rust-vendor/markup5ever_rcdom/examples/hello_xml.rs +0 -39
  70. data/rust-vendor/markup5ever_rcdom/examples/html2html.rs +0 -51
  71. data/rust-vendor/markup5ever_rcdom/examples/print-rcdom.rs +0 -78
  72. data/rust-vendor/markup5ever_rcdom/examples/xml_tree_printer.rs +0 -67
  73. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.gitattributes +0 -2
  74. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.github/workflows/downstream.yml +0 -76
  75. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.github/workflows/lint.yml +0 -25
  76. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.gitignore +0 -79
  77. data/rust-vendor/markup5ever_rcdom/html5lib-tests/AUTHORS.rst +0 -34
  78. data/rust-vendor/markup5ever_rcdom/html5lib-tests/LICENSE +0 -21
  79. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/chardet/test_big5.txt +0 -51
  80. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/scripted/tests1.dat +0 -5
  81. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/test-yahoo-jp.dat +0 -10
  82. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/tests1.dat +0 -388
  83. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/tests2.dat +0 -115
  84. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint +0 -6
  85. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/__init__.py +0 -0
  86. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/__init__.py +0 -0
  87. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/LICENSE +0 -18
  88. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/__init__.py +0 -0
  89. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/lexer.py +0 -211
  90. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/lexer.pyi +0 -34
  91. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/parser.py +0 -872
  92. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/parser.pyi +0 -83
  93. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/py.typed +0 -0
  94. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/util.py +0 -72
  95. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/util.pyi +0 -7
  96. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/vendor.txt +0 -1
  97. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor-patches/funcparserlib.patch +0 -24
  98. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/lint.py +0 -280
  99. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/parser.py +0 -177
  100. data/rust-vendor/markup5ever_rcdom/html5lib-tests/pyproject.toml +0 -7
  101. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/core.test +0 -125
  102. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/injectmeta.test +0 -66
  103. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/optionaltags.test +0 -965
  104. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/options.test +0 -60
  105. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/whitespace.test +0 -51
  106. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/README.md +0 -107
  107. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/contentModelFlags.test +0 -93
  108. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/domjs.test +0 -335
  109. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/entities.test +0 -542
  110. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/escapeFlag.test +0 -36
  111. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/namedEntities.test +0 -42422
  112. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/numericEntities.test +0 -1677
  113. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/pendingSpecChanges.test +0 -9
  114. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test1.test +0 -353
  115. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test2.test +0 -275
  116. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test3.test +0 -11233
  117. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test4.test +0 -532
  118. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/unicodeChars.test +0 -1577
  119. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/unicodeCharsProblematic.test +0 -41
  120. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/xmlViolation.test +0 -20
  121. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/README.md +0 -108
  122. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/adoption01.dat +0 -354
  123. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/adoption02.dat +0 -39
  124. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/blocks.dat +0 -695
  125. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/comments01.dat +0 -217
  126. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/doctype01.dat +0 -474
  127. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/domjs-unsafe.dat +0 -0
  128. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/entities01.dat +0 -943
  129. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/entities02.dat +0 -309
  130. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/foreign-fragment.dat +0 -645
  131. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/html5test-com.dat +0 -301
  132. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/inbody01.dat +0 -54
  133. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/isindex.dat +0 -49
  134. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/main-element.dat +0 -46
  135. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/math.dat +0 -104
  136. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/menuitem-element.dat +0 -240
  137. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/namespace-sensitivity.dat +0 -22
  138. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/noscript01.dat +0 -237
  139. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/pending-spec-changes-plain-text-unsafe.dat +0 -0
  140. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/pending-spec-changes.dat +0 -46
  141. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/plain-text-unsafe.dat +0 -0
  142. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/quirks01.dat +0 -53
  143. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/ruby.dat +0 -302
  144. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scriptdata01.dat +0 -372
  145. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scripted/adoption01.dat +0 -16
  146. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scripted/ark.dat +0 -27
  147. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scripted/webkit01.dat +0 -30
  148. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/search-element.dat +0 -46
  149. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/svg.dat +0 -104
  150. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tables01.dat +0 -322
  151. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/template.dat +0 -1673
  152. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests1.dat +0 -1956
  153. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests10.dat +0 -849
  154. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests11.dat +0 -523
  155. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests12.dat +0 -62
  156. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests14.dat +0 -75
  157. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests15.dat +0 -216
  158. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests16.dat +0 -2602
  159. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests17.dat +0 -179
  160. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests18.dat +0 -558
  161. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests19.dat +0 -1398
  162. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests2.dat +0 -831
  163. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests20.dat +0 -842
  164. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests21.dat +0 -306
  165. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests22.dat +0 -190
  166. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests23.dat +0 -168
  167. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests24.dat +0 -79
  168. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests25.dat +0 -288
  169. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests26.dat +0 -453
  170. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests3.dat +0 -305
  171. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests4.dat +0 -74
  172. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests5.dat +0 -210
  173. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests6.dat +0 -663
  174. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests7.dat +0 -453
  175. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests8.dat +0 -165
  176. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests9.dat +0 -472
  177. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests_innerHTML_1.dat +0 -843
  178. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tricky01.dat +0 -336
  179. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/webkit01.dat +0 -785
  180. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/webkit02.dat +0 -554
  181. data/rust-vendor/markup5ever_rcdom/tests/foreach_html5lib_test/mod.rs +0 -41
  182. data/rust-vendor/markup5ever_rcdom/tests/html-driver.rs +0 -29
  183. data/rust-vendor/markup5ever_rcdom/tests/html-serializer.rs +0 -265
  184. data/rust-vendor/markup5ever_rcdom/tests/html-tokenizer.rs +0 -487
  185. data/rust-vendor/markup5ever_rcdom/tests/html-tree-builder.rs +0 -298
  186. data/rust-vendor/markup5ever_rcdom/tests/html-tree-sink.rs +0 -141
  187. data/rust-vendor/markup5ever_rcdom/tests/util/find_tests.rs +0 -34
  188. data/rust-vendor/markup5ever_rcdom/tests/util/runner.rs +0 -48
  189. data/rust-vendor/markup5ever_rcdom/tests/xml-driver.rs +0 -101
  190. data/rust-vendor/markup5ever_rcdom/tests/xml-tokenizer.rs +0 -374
  191. data/rust-vendor/markup5ever_rcdom/tests/xml-tree-builder.rs +0 -237
  192. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/AUTHORS.rst +0 -9
  193. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/LICENSE +0 -21
  194. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/README.md +0 -92
  195. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/comments.test +0 -274
  196. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/doctype.test +0 -3232
  197. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/entities.test +0 -283
  198. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/eof.test +0 -113
  199. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/namedEntities.test +0 -42210
  200. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/numericEntities.test +0 -1349
  201. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/test1.test +0 -162
  202. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/test2.test +0 -64
  203. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/unicodeChars.test +0 -1295
  204. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tree-construction/README.md +0 -104
  205. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tree-construction/namespace.dat +0 -119
  206. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tree-construction/test1.dat +0 -124
  207. data/rust-vendor/xml5ever/.cargo-checksum.json +0 -1
  208. data/rust-vendor/xml5ever/.cargo_vcs_info.json +0 -6
  209. data/rust-vendor/xml5ever/Cargo.lock +0 -752
  210. data/rust-vendor/xml5ever/Cargo.toml +0 -69
  211. data/rust-vendor/xml5ever/Cargo.toml.orig +0 -29
  212. data/rust-vendor/xml5ever/LICENSE-APACHE +0 -201
  213. data/rust-vendor/xml5ever/LICENSE-MIT +0 -25
  214. data/rust-vendor/xml5ever/README.md +0 -72
  215. data/rust-vendor/xml5ever/benches/xml5ever.rs +0 -77
  216. data/rust-vendor/xml5ever/data/bench/strong.xml +0 -1
  217. data/rust-vendor/xml5ever/examples/README.md +0 -223
  218. data/rust-vendor/xml5ever/examples/example.xml +0 -3
  219. data/rust-vendor/xml5ever/examples/simple_xml_tokenizer.rs +0 -81
  220. data/rust-vendor/xml5ever/examples/xml_tokenizer.rs +0 -115
  221. data/rust-vendor/xml5ever/src/driver.rs +0 -90
  222. data/rust-vendor/xml5ever/src/lib.rs +0 -47
  223. data/rust-vendor/xml5ever/src/macros.rs +0 -18
  224. data/rust-vendor/xml5ever/src/serialize/mod.rs +0 -216
  225. data/rust-vendor/xml5ever/src/tokenizer/char_ref/mod.rs +0 -456
  226. data/rust-vendor/xml5ever/src/tokenizer/interface.rs +0 -116
  227. data/rust-vendor/xml5ever/src/tokenizer/mod.rs +0 -1344
  228. data/rust-vendor/xml5ever/src/tokenizer/qname.rs +0 -84
  229. data/rust-vendor/xml5ever/src/tokenizer/states.rs +0 -167
  230. data/rust-vendor/xml5ever/src/tree_builder/mod.rs +0 -774
  231. data/rust-vendor/xml5ever/src/tree_builder/types.rs +0 -37
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0cc6e5553ec13a221908d7529f53e0caa8d190df56523e7dab5b5852d229c9ef
4
- data.tar.gz: 9543fca98f17d95285d97e5421565d604ee6c3e29ff1007a9af1692af5742ffc
3
+ metadata.gz: deb434d4fa161dcd1df3b836eac2ce0133bac0124f88a890757042044501acf2
4
+ data.tar.gz: 76a70b3e1a1abafa465b0d076526d1b58d50f3a39453e406bc2d4b4c8b669ce6
5
5
  SHA512:
6
- metadata.gz: 948a72a00eceee87691aa3a20965d8fdde6cefb21ad465b46faed0fbec148915cad1573d98689de75c9d183528d90f1a118969d1857940d886884948195f6b4b
7
- data.tar.gz: 91425e85e5df7fc997e6b1a091103166da1270a55d700ecfdaa1c4273d7029f98d06501d593eac594b6fd1420fd55fd331b37a62e299dc330805c3c0e1fa031a
6
+ metadata.gz: 59404b56deed91b2f0c65bf358fd05b685042804b30a2ee92ebb5cdb220a6b89c9f3dbc62d5576ef7d818c6372c42c1f8a2cc8bfbd2b8db16c8ecdbcdd60d427
7
+ data.tar.gz: 30f12a84b9b2b32aa6a1c1c3499ff8777e22994c84a38a22d4127eb91274ba4c9923270c2e8583e56b6cf758202581c58b38315a38fbd8c28ec737251ae32288
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- html-to-markdown (2.24.6)
4
+ html-to-markdown (2.25.1)
5
5
  rb_sys (>= 0.9, < 1.0)
6
6
 
7
7
  GEM
@@ -161,7 +161,7 @@ CHECKSUMS
161
161
  ffi (1.17.3-x86_64-darwin) sha256=1f211811eb5cfaa25998322cdd92ab104bfbd26d1c4c08471599c511f2c00bb5
162
162
  ffi (1.17.3-x86_64-linux-gnu) sha256=3746b01f677aae7b16dc1acb7cb3cc17b3e35bdae7676a3f568153fb0e2c887f
163
163
  fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
164
- html-to-markdown (2.24.6)
164
+ html-to-markdown (2.25.1)
165
165
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
166
166
  json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
167
167
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
data/README.md CHANGED
@@ -18,7 +18,7 @@
18
18
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/html-to-markdown?label=Java&color=007ec6" alt="Java">
19
19
  </a>
20
20
  <a href="https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v2/htmltomarkdown">
21
- <img src="https://img.shields.io/badge/Go-v2.24.6-007ec6" alt="Go">
21
+ <img src="https://img.shields.io/badge/Go-v2.25.1-007ec6" alt="Go">
22
22
  </a>
23
23
  <a href="https://www.nuget.org/packages/KreuzbergDev.HtmlToMarkdown/">
24
24
  <img src="https://img.shields.io/nuget/v/KreuzbergDev.HtmlToMarkdown?label=C%23&color=007ec6" alt="C#">
@@ -424,7 +424,7 @@ dependencies = [
424
424
 
425
425
  [[package]]
426
426
  name = "html-to-markdown-rb"
427
- version = "2.24.6"
427
+ version = "2.25.1"
428
428
  dependencies = [
429
429
  "html-to-markdown-rs",
430
430
  "magnus",
@@ -443,7 +443,6 @@ dependencies = [
443
443
  "html5ever",
444
444
  "image",
445
445
  "lru",
446
- "markup5ever_rcdom",
447
446
  "once_cell",
448
447
  "regex",
449
448
  "serde",
@@ -663,18 +662,6 @@ dependencies = [
663
662
  "web_atoms",
664
663
  ]
665
664
 
666
- [[package]]
667
- name = "markup5ever_rcdom"
668
- version = "0.36.0+unofficial"
669
- source = "registry+https://github.com/rust-lang/crates.io-index"
670
- checksum = "3e5fc8802e8797c0dfdd2ce5c21aa0aee21abbc7b3b18559100651b3352a7b63"
671
- dependencies = [
672
- "html5ever",
673
- "markup5ever",
674
- "tendril",
675
- "xml5ever",
676
- ]
677
-
678
665
  [[package]]
679
666
  name = "memchr"
680
667
  version = "2.8.0"
@@ -683,9 +670,9 @@ checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
683
670
 
684
671
  [[package]]
685
672
  name = "memmap2"
686
- version = "0.9.9"
673
+ version = "0.9.10"
687
674
  source = "registry+https://github.com/rust-lang/crates.io-index"
688
- checksum = "744133e4a0e0a658e1374cf3bf8e415c4052a15a111acd372764c55b4177d490"
675
+ checksum = "714098028fe011992e1c3962653c96b2d578c4b4bce9036e15ff220319b1e0e3"
689
676
  dependencies = [
690
677
  "libc",
691
678
  ]
@@ -841,9 +828,9 @@ dependencies = [
841
828
 
842
829
  [[package]]
843
830
  name = "png"
844
- version = "0.18.0"
831
+ version = "0.18.1"
845
832
  source = "registry+https://github.com/rust-lang/crates.io-index"
846
- checksum = "97baced388464909d42d89643fe4361939af9b7ce7a31ee32a168f832a70f2a0"
833
+ checksum = "60769b8b31b2a9f263dae2776c37b1b28ae246943cf719eb6946a1db05128a61"
847
834
  dependencies = [
848
835
  "bitflags 2.11.0",
849
836
  "crc32fast",
@@ -1217,9 +1204,9 @@ dependencies = [
1217
1204
 
1218
1205
  [[package]]
1219
1206
  name = "syn"
1220
- version = "2.0.115"
1207
+ version = "2.0.116"
1221
1208
  source = "registry+https://github.com/rust-lang/crates.io-index"
1222
- checksum = "6e614ed320ac28113fa64972c4262d5dbc89deacdfd00c34a3e4cea073243c12"
1209
+ checksum = "3df424c70518695237746f84cede799c9c58fcb37450d7b23716568cc8bc69cb"
1223
1210
  dependencies = [
1224
1211
  "proc-macro2",
1225
1212
  "quote",
@@ -1272,9 +1259,9 @@ dependencies = [
1272
1259
 
1273
1260
  [[package]]
1274
1261
  name = "unicode-ident"
1275
- version = "1.0.23"
1262
+ version = "1.0.24"
1276
1263
  source = "registry+https://github.com/rust-lang/crates.io-index"
1277
- checksum = "537dd038a89878be9b64dd4bd1b260315c1bb94f4d784956b81e27a088d9a09e"
1264
+ checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
1278
1265
 
1279
1266
  [[package]]
1280
1267
  name = "unicode-xid"
@@ -1556,16 +1543,6 @@ dependencies = [
1556
1543
  "wasmparser",
1557
1544
  ]
1558
1545
 
1559
- [[package]]
1560
- name = "xml5ever"
1561
- version = "0.36.1"
1562
- source = "registry+https://github.com/rust-lang/crates.io-index"
1563
- checksum = "f57dd51b88a4b9f99f9b55b136abb86210629d61c48117ddb87f567e51e66be7"
1564
- dependencies = [
1565
- "log",
1566
- "markup5ever",
1567
- ]
1568
-
1569
1546
  [[package]]
1570
1547
  name = "yansi"
1571
1548
  version = "1.0.1"
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rb"
3
- version ="2.24.6"
3
+ version ="2.25.1"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
6
6
  license = "MIT"
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HtmlToMarkdown
4
- VERSION = '2.24.6'
4
+ VERSION = '2.25.1'
5
5
  end
@@ -35,7 +35,6 @@ ahash = "0.8"
35
35
  html-escape = "0.2.13"
36
36
  image = { version = "0.25", default-features = false, features = ["gif", "jpeg", "png", "bmp", "webp"], optional = true }
37
37
  html5ever = "0.36"
38
- markup5ever_rcdom = "0.36"
39
38
  lru = "0.16"
40
39
  serde = { version = "1.0", features = ["derive"], optional = true }
41
40
  serde_json = { version = "1.0", optional = true }
@@ -101,9 +101,9 @@ pub fn has_custom_element_tags(html: &str) -> bool {
101
101
  ///
102
102
  /// Returns Some(repaired_html) if repair was successful, None otherwise.
103
103
  pub fn repair_with_html5ever(input: &str) -> Option<String> {
104
+ use crate::rcdom::{RcDom, SerializableHandle};
104
105
  use html5ever::serialize::{SerializeOpts, serialize};
105
106
  use html5ever::tendril::TendrilSink;
106
- use markup5ever_rcdom::{RcDom, SerializableHandle};
107
107
 
108
108
  let dom = html5ever::parse_document(RcDom::default(), Default::default())
109
109
  .from_utf8()
@@ -57,17 +57,30 @@ pub fn detect_heading_paragraph(element: &HocrElement, text: &str) -> Option<Str
57
57
  return None;
58
58
  }
59
59
 
60
- let line_count = element
60
+ let line_children: Vec<&HocrElement> = element
61
61
  .children
62
62
  .iter()
63
63
  .filter(|child| matches!(child.element_type, HocrElementType::OcrLine | HocrElementType::OcrxLine))
64
- .count();
64
+ .collect();
65
65
 
66
- if line_count != 1 {
66
+ if line_children.len() != 1 {
67
67
  return None;
68
68
  }
69
69
 
70
- if text.is_empty() || text.len() > 60 || text.contains(':') || text.contains('\n') {
70
+ // Determine effective font size from child line elements.
71
+ // First check x_fsize, then fall back to bbox height as a proxy.
72
+ let font_size = line_children.iter().find_map(|child| {
73
+ child
74
+ .properties
75
+ .x_fsize
76
+ .or_else(|| child.properties.bbox.map(|b| b.height()))
77
+ });
78
+
79
+ let has_large_font = font_size.is_some_and(|size| size >= 14);
80
+
81
+ let char_limit = if has_large_font { 80 } else { 60 };
82
+
83
+ if text.is_empty() || text.len() > char_limit || text.contains(':') || text.contains('\n') {
71
84
  return None;
72
85
  }
73
86
 
@@ -83,7 +96,9 @@ pub fn detect_heading_paragraph(element: &HocrElement, text: &str) -> Option<Str
83
96
  }
84
97
  }
85
98
 
86
- if word_count < 2 {
99
+ // Allow single-word headings when font size is large
100
+ let min_words = if has_large_font { 1 } else { 2 };
101
+ if word_count < min_words {
87
102
  return None;
88
103
  }
89
104
 
@@ -68,6 +68,7 @@ pub mod wrapper;
68
68
  mod convert_api;
69
69
  mod exports;
70
70
  pub mod prelude;
71
+ mod rcdom;
71
72
  mod validation;
72
73
 
73
74
  // ============================================================================
@@ -1,43 +1,39 @@
1
- // Copyright 2014-2017 The html5ever Project Developers. See the
2
- // COPYRIGHT file at the top-level directory of this distribution.
1
+ // Vendored from markup5ever_rcdom v0.36.0+unofficial
2
+ // Original source: https://github.com/servo/html5ever (rcdom/)
3
+ // Copyright (c) 2014 The html5ever Project Developers
4
+ // Licensed under MIT OR Apache-2.0 (see ATTRIBUTIONS.md)
3
5
  //
4
- // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5
- // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6
- // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7
- // option. This file may not be copied, modified, or distributed
8
- // except according to those terms.
6
+ // Vendored to:
7
+ // - Remove unused xml5ever transitive dependency
8
+ // - Eliminate pinned external dependency on "+unofficial" crate
9
+ // - Gain full control over this small, critical module
10
+ //
11
+ // Changes from upstream:
12
+ // - Replaced `extern crate markup5ever` / `extern crate tendril` with
13
+ // `use` imports through `html5ever` (edition 2024 compatibility)
14
+ // - Added module-level clippy allows for vendored code style
15
+
16
+ #![allow(
17
+ clippy::panic,
18
+ clippy::expect_used,
19
+ clippy::missing_panics_doc,
20
+ clippy::must_use_candidate,
21
+ clippy::return_self_not_must_use,
22
+ clippy::module_name_repetitions,
23
+ clippy::redundant_else,
24
+ clippy::match_wildcard_for_single_variants,
25
+ clippy::similar_names,
26
+ clippy::items_after_statements,
27
+ clippy::use_self,
28
+ clippy::missing_fields_in_debug,
29
+ clippy::semicolon_if_nothing_returned,
30
+ missing_docs
31
+ )]
9
32
 
10
33
  //! A simple reference-counted DOM.
11
34
  //!
12
35
  //! This is sufficient as a static parse tree, but don't build a
13
36
  //! web browser using it. :)
14
- //!
15
- //! A DOM is a [tree structure] with ordered children that can be represented in an XML-like
16
- //! format. For example, the following graph
17
- //!
18
- //! ```text
19
- //! div
20
- //! +- "text node"
21
- //! +- span
22
- //! ```
23
- //! in HTML would be serialized as
24
- //!
25
- //! ```html
26
- //! <div>text node<span></span></div>
27
- //! ```
28
- //!
29
- //! See the [document object model article on wikipedia][dom wiki] for more information.
30
- //!
31
- //! This implementation stores the information associated with each node once, and then hands out
32
- //! refs to children. The nodes themselves are reference-counted to avoid copying - you can create
33
- //! a new ref and then a node will outlive the document. Nodes own their children, but only have
34
- //! weak references to their parents.
35
- //!
36
- //! [tree structure]: https://en.wikipedia.org/wiki/Tree_(data_structure)
37
- //! [dom wiki]: https://en.wikipedia.org/wiki/Document_Object_Model
38
-
39
- extern crate markup5ever;
40
- extern crate tendril;
41
37
 
42
38
  use std::borrow::Cow;
43
39
  use std::cell::{Cell, RefCell};
@@ -48,16 +44,16 @@ use std::io;
48
44
  use std::mem;
49
45
  use std::rc::{Rc, Weak};
50
46
 
51
- use tendril::StrTendril;
47
+ use html5ever::tendril::StrTendril;
52
48
 
53
- use markup5ever::interface::tree_builder;
54
- use markup5ever::interface::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink};
55
- use markup5ever::serialize::TraversalScope;
56
- use markup5ever::serialize::TraversalScope::{ChildrenOnly, IncludeNode};
57
- use markup5ever::serialize::{Serialize, Serializer};
58
- use markup5ever::Attribute;
59
- use markup5ever::ExpandedName;
60
- use markup5ever::QualName;
49
+ use html5ever::Attribute;
50
+ use html5ever::ExpandedName;
51
+ use html5ever::QualName;
52
+ use html5ever::interface::tree_builder;
53
+ use html5ever::interface::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink};
54
+ use html5ever::serialize::TraversalScope;
55
+ use html5ever::serialize::TraversalScope::{ChildrenOnly, IncludeNode};
56
+ use html5ever::serialize::{Serialize, Serializer};
61
57
 
62
58
  /// The different kinds of nodes in the DOM.
63
59
  #[derive(Debug)]
@@ -98,10 +94,7 @@ pub enum NodeData {
98
94
  },
99
95
 
100
96
  /// A Processing instruction.
101
- ProcessingInstruction {
102
- target: StrTendril,
103
- contents: StrTendril,
104
- },
97
+ ProcessingInstruction { target: StrTendril, contents: StrTendril },
105
98
  }
106
99
 
107
100
  /// A DOM node.
@@ -130,10 +123,9 @@ impl Drop for Node {
130
123
  let mut nodes = mem::take(&mut *self.children.borrow_mut());
131
124
  while let Some(node) = nodes.pop() {
132
125
  let children = mem::take(&mut *node.children.borrow_mut());
133
- nodes.extend(children.into_iter());
126
+ nodes.extend(children);
134
127
  if let NodeData::Element {
135
- ref template_contents,
136
- ..
128
+ ref template_contents, ..
137
129
  } = node.data
138
130
  {
139
131
  if let Some(template_contents) = template_contents.borrow_mut().take() {
@@ -193,7 +185,7 @@ fn append_to_existing_text(prev: &Handle, text: &str) -> bool {
193
185
  NodeData::Text { ref contents } => {
194
186
  contents.borrow_mut().push_slice(text);
195
187
  true
196
- },
188
+ }
197
189
  _ => false,
198
190
  }
199
191
  }
@@ -240,8 +232,7 @@ impl TreeSink for RcDom {
240
232
 
241
233
  fn get_template_contents(&self, target: &Handle) -> Handle {
242
234
  if let NodeData::Element {
243
- ref template_contents,
244
- ..
235
+ ref template_contents, ..
245
236
  } = target.data
246
237
  {
247
238
  template_contents
@@ -287,10 +278,7 @@ impl TreeSink for RcDom {
287
278
  }
288
279
 
289
280
  fn create_pi(&self, target: StrTendril, data: StrTendril) -> Handle {
290
- Node::new(NodeData::ProcessingInstruction {
291
- target,
292
- contents: data,
293
- })
281
+ Node::new(NodeData::ProcessingInstruction { target, contents: data })
294
282
  }
295
283
 
296
284
  fn append(&self, parent: &Handle, child: NodeOrText<Handle>) {
@@ -315,8 +303,7 @@ impl TreeSink for RcDom {
315
303
  }
316
304
 
317
305
  fn append_before_sibling(&self, sibling: &Handle, child: NodeOrText<Handle>) {
318
- let (parent, i) = get_parent_and_index(sibling)
319
- .expect("append_before_sibling called on node without parent");
306
+ let (parent, i) = get_parent_and_index(sibling).expect("append_before_sibling called on node without parent");
320
307
 
321
308
  let child = match (child, i) {
322
309
  // No previous node.
@@ -334,7 +321,7 @@ impl TreeSink for RcDom {
334
321
  Node::new(NodeData::Text {
335
322
  contents: RefCell::new(text),
336
323
  })
337
- },
324
+ }
338
325
 
339
326
  // The tree builder promises we won't have a text node after
340
327
  // the insertion point.
@@ -366,12 +353,7 @@ impl TreeSink for RcDom {
366
353
  }
367
354
  }
368
355
 
369
- fn append_doctype_to_document(
370
- &self,
371
- name: StrTendril,
372
- public_id: StrTendril,
373
- system_id: StrTendril,
374
- ) {
356
+ fn append_doctype_to_document(&self, name: StrTendril, public_id: StrTendril, system_id: StrTendril) {
375
357
  append(
376
358
  &self.document,
377
359
  Node::new(NodeData::Doctype {
@@ -389,15 +371,8 @@ impl TreeSink for RcDom {
389
371
  panic!("not an element")
390
372
  };
391
373
 
392
- let existing_names = existing
393
- .iter()
394
- .map(|e| e.name.clone())
395
- .collect::<HashSet<_>>();
396
- existing.extend(
397
- attrs
398
- .into_iter()
399
- .filter(|attr| !existing_names.contains(&attr.name)),
400
- );
374
+ let existing_names = existing.iter().map(|e| e.name.clone()).collect::<HashSet<_>>();
375
+ existing.extend(attrs.into_iter().filter(|attr| !existing_names.contains(&attr.name)));
401
376
  }
402
377
 
403
378
  fn remove_from_parent(&self, target: &Handle) {
@@ -461,27 +436,17 @@ impl Serialize for SerializableHandle {
461
436
  let mut ops = VecDeque::new();
462
437
  match traversal_scope {
463
438
  IncludeNode => ops.push_back(SerializeOp::Open(self.0.clone())),
464
- ChildrenOnly(_) => ops.extend(
465
- self.0
466
- .children
467
- .borrow()
468
- .iter()
469
- .map(|h| SerializeOp::Open(h.clone())),
470
- ),
439
+ ChildrenOnly(_) => ops.extend(self.0.children.borrow().iter().map(|h| SerializeOp::Open(h.clone()))),
471
440
  }
472
441
 
473
442
  while let Some(op) = ops.pop_front() {
474
443
  match op {
475
444
  SerializeOp::Open(handle) => match handle.data {
476
445
  NodeData::Element {
477
- ref name,
478
- ref attrs,
479
- ..
446
+ ref name, ref attrs, ..
480
447
  } => {
481
- serializer.start_elem(
482
- name.clone(),
483
- attrs.borrow().iter().map(|at| (&at.name, &at.value[..])),
484
- )?;
448
+ serializer
449
+ .start_elem(name.clone(), attrs.borrow().iter().map(|at| (&at.name, &at.value[..])))?;
485
450
 
486
451
  ops.reserve(1 + handle.children.borrow().len());
487
452
  ops.push_front(SerializeOp::Close(name.clone()));
@@ -489,7 +454,7 @@ impl Serialize for SerializableHandle {
489
454
  for child in handle.children.borrow().iter().rev() {
490
455
  ops.push_front(SerializeOp::Open(child.clone()));
491
456
  }
492
- },
457
+ }
493
458
 
494
459
  NodeData::Doctype { ref name, .. } => serializer.write_doctype(name)?,
495
460
 
@@ -507,7 +472,7 @@ impl Serialize for SerializableHandle {
507
472
 
508
473
  SerializeOp::Close(name) => {
509
474
  serializer.end_elem(name)?;
510
- },
475
+ }
511
476
  }
512
477
  }
513
478
 
@@ -350,3 +350,160 @@ fn test_container_elements() {
350
350
  assert!(markdown.contains("Layout analysis"));
351
351
  assert!(markdown.contains("Block content"));
352
352
  }
353
+
354
+ #[test]
355
+ fn test_ocr_header_renders_as_italic_not_heading() {
356
+ // OcrHeader is a "page running header" (repeated at top of pages),
357
+ // NOT a section heading. It must render as italic (*text*), not as # heading.
358
+ let hocr = r#"<div class="ocr_page" title="bbox 0 0 1000 1000">
359
+ <div class="ocr_header" title="bbox 0 0 1000 50">
360
+ <span class="ocr_line" title="bbox 0 0 500 30">
361
+ <span class="ocrx_word" title="bbox 0 0 100 30; x_wconf 95">Chapter</span>
362
+ <span class="ocrx_word" title="bbox 110 0 200 30; x_wconf 95">One</span>
363
+ </span>
364
+ </div>
365
+ <p class="ocr_par" title="bbox 0 100 900 200">
366
+ <span class="ocr_line" title="bbox 0 100 800 130">
367
+ <span class="ocrx_word" title="bbox 0 100 50 130; x_wconf 95">Some</span>
368
+ <span class="ocrx_word" title="bbox 60 100 120 130; x_wconf 95">body</span>
369
+ <span class="ocrx_word" title="bbox 130 100 180 130; x_wconf 95">text</span>
370
+ <span class="ocrx_word" title="bbox 190 100 240 130; x_wconf 95">here</span>
371
+ </span>
372
+ </p>
373
+ </div>"#;
374
+
375
+ let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
376
+ let (elements, _) = extract_hocr_document(&dom);
377
+ let markdown = convert_to_markdown(&elements, true);
378
+
379
+ // OcrHeader must render as italic
380
+ assert!(
381
+ markdown.contains("*Chapter One*"),
382
+ "OcrHeader should render as italic (*text*), got: {markdown}"
383
+ );
384
+ // It must NOT render as a markdown heading
385
+ assert!(
386
+ !markdown.contains("# Chapter One"),
387
+ "OcrHeader must NOT render as a markdown heading, got: {markdown}"
388
+ );
389
+ }
390
+
391
+ #[test]
392
+ fn test_heading_detection_with_x_fsize_on_line_child() {
393
+ // A paragraph containing a single ocr_line child with x_fsize 18 (large font)
394
+ // and short capitalized text should be detected as a heading.
395
+ let hocr = r#"<div class="ocr_page" title="bbox 0 0 1000 1000">
396
+ <div class="ocr_carea" title="bbox 0 0 1000 500">
397
+ <p class="ocr_par" title="bbox 0 0 500 40">
398
+ <span class="ocr_line" title="bbox 0 0 500 30; x_fsize 18">
399
+ <span class="ocrx_word" title="bbox 0 0 120 30; x_wconf 95">Important</span>
400
+ <span class="ocrx_word" title="bbox 130 0 250 30; x_wconf 95">Section</span>
401
+ <span class="ocrx_word" title="bbox 260 0 350 30; x_wconf 95">Title</span>
402
+ </span>
403
+ </p>
404
+ <p class="ocr_par" title="bbox 0 60 900 200">
405
+ <span class="ocr_line" title="bbox 0 60 800 90; x_fsize 12">
406
+ <span class="ocrx_word" title="bbox 0 60 50 90; x_wconf 95">This</span>
407
+ <span class="ocrx_word" title="bbox 60 60 90 90; x_wconf 92">is</span>
408
+ <span class="ocrx_word" title="bbox 100 60 200 90; x_wconf 98">regular</span>
409
+ <span class="ocrx_word" title="bbox 210 60 280 90; x_wconf 98">body</span>
410
+ <span class="ocrx_word" title="bbox 290 60 340 90; x_wconf 98">text</span>
411
+ <span class="ocrx_word" title="bbox 350 60 430 90; x_wconf 98">content.</span>
412
+ </span>
413
+ </p>
414
+ </div>
415
+ </div>"#;
416
+
417
+ let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
418
+ let (elements, _) = extract_hocr_document(&dom);
419
+ let markdown = convert_to_markdown(&elements, true);
420
+
421
+ // The large-font paragraph should be detected as a heading
422
+ assert!(
423
+ markdown.contains("# Important Section Title"),
424
+ "Large font paragraph should be detected as heading, got: {markdown}"
425
+ );
426
+ }
427
+
428
+ #[test]
429
+ fn test_single_word_heading_with_large_font() {
430
+ // A single-word paragraph with large font size should be detected as a heading.
431
+ // Without font size awareness, single-word paragraphs are rejected.
432
+ let hocr = r#"<div class="ocr_page" title="bbox 0 0 1000 1000">
433
+ <div class="ocr_carea" title="bbox 0 0 1000 500">
434
+ <p class="ocr_par" title="bbox 0 0 300 40">
435
+ <span class="ocr_line" title="bbox 0 0 300 30; x_fsize 24">
436
+ <span class="ocrx_word" title="bbox 0 0 200 30; x_wconf 95">Introduction</span>
437
+ </span>
438
+ </p>
439
+ <p class="ocr_par" title="bbox 0 60 900 200">
440
+ <span class="ocr_line" title="bbox 0 60 800 90; x_fsize 12">
441
+ <span class="ocrx_word" title="bbox 0 60 50 90; x_wconf 95">Some</span>
442
+ <span class="ocrx_word" title="bbox 60 60 120 90; x_wconf 92">body</span>
443
+ <span class="ocrx_word" title="bbox 130 60 180 90; x_wconf 98">text</span>
444
+ <span class="ocrx_word" title="bbox 190 60 280 90; x_wconf 98">follows.</span>
445
+ </span>
446
+ </p>
447
+ </div>
448
+ </div>"#;
449
+
450
+ let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
451
+ let (elements, _) = extract_hocr_document(&dom);
452
+ let markdown = convert_to_markdown(&elements, true);
453
+
454
+ // Single word with large font should be detected as heading
455
+ assert!(
456
+ markdown.contains("# Introduction"),
457
+ "Single word with large font should be detected as heading, got: {markdown}"
458
+ );
459
+ }
460
+
461
+ #[test]
462
+ fn test_single_word_without_large_font_not_heading() {
463
+ // A single-word paragraph without large font should NOT be detected as heading.
464
+ // This ensures we haven't broken the existing behavior.
465
+ let hocr = r#"<div class="ocr_page" title="bbox 0 0 1000 1000">
466
+ <div class="ocr_carea" title="bbox 0 0 1000 500">
467
+ <p class="ocr_par" title="bbox 0 0 300 20">
468
+ <span class="ocr_line" title="bbox 0 0 300 12; x_fsize 10">
469
+ <span class="ocrx_word" title="bbox 0 0 100 12; x_wconf 95">Word</span>
470
+ </span>
471
+ </p>
472
+ </div>
473
+ </div>"#;
474
+
475
+ let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
476
+ let (elements, _) = extract_hocr_document(&dom);
477
+ let markdown = convert_to_markdown(&elements, true);
478
+
479
+ // Single word with small font should NOT be a heading
480
+ assert!(
481
+ !markdown.contains("# Word"),
482
+ "Single word with small font should not be detected as heading, got: {markdown}"
483
+ );
484
+ }
485
+
486
+ #[test]
487
+ fn test_heading_detection_with_bbox_height_proxy() {
488
+ // When x_fsize is absent, bbox height should serve as a font-size proxy.
489
+ // A bbox height of 30 pixels (>= 14) indicates large text.
490
+ let hocr = r#"<div class="ocr_page" title="bbox 0 0 1000 1000">
491
+ <div class="ocr_carea" title="bbox 0 0 1000 500">
492
+ <p class="ocr_par" title="bbox 0 0 500 40">
493
+ <span class="ocr_line" title="bbox 0 0 500 30">
494
+ <span class="ocrx_word" title="bbox 0 0 200 30; x_wconf 95">Summary</span>
495
+ </span>
496
+ </p>
497
+ </div>
498
+ </div>"#;
499
+
500
+ let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
501
+ let (elements, _) = extract_hocr_document(&dom);
502
+ let markdown = convert_to_markdown(&elements, true);
503
+
504
+ // bbox height of 30 (y2=30 - y1=0) should serve as proxy for large font
505
+ assert!(
506
+ markdown.contains("# Summary"),
507
+ "Single word with tall bbox (height=30) should be detected as heading via bbox proxy, got: {markdown}"
508
+ );
509
+ }
@@ -1 +1 @@
1
- {"files":{".cargo_vcs_info.json":"175df596d2f8c352eaca47603bf5b23a0eab080ffb910b914fa2e3a8ad191159",".github/workflows/main.yml":"e2b3d5678a31325a616bae0c1ccb223f9ed2f5b43d39a134c5c45858e4fbf4ca","CHANGELOG.md":"63d0899dd592ddfedaf38832478ff0342726ab951021fb9b7bc237ce7be4066a","Cargo.lock":"ab24cb6544ffc0a82625e5bde17c6da152490577a0659851a1c77e767fdcd413","Cargo.toml":"c50a23ef2999b6a5ca3fd96d72b8932ebfc2f940acfed78a881c73f7eeaa91dd","Cargo.toml.orig":"11b7a3b11bea46362dd87d4371a2495f6ca0f1f7957011e8802d4c6d20fe4c4a","LICENSE-APACHE":"04ea4849dba9dcae07113850c6f1b1a69052c625210639914eee352023f750ad","LICENSE-MIT":"0d25d03b5ab49576178ad0cae7a2648d12c17ad0452fe49c07e55e4b59aa5257","README.md":"e3388f55065d69e076d90871c0a91dc97420bd0d07b4f154b08e40ac47b115eb","examples/cat.rs":"594b9457ca6eb4ce9b840133da5076fa7b96334953df03f894233169564622f6","src/advice.rs":"a4c023982a598a77c23b5a4e524de581329d42287d639be88e2ffda3bd929511","src/lib.rs":"50f44261aec0353976962d1314d2d885e9a34afda6215bb5b67d3afb336519e1","src/stub.rs":"656088d19012fce05de58420c8b84d59f5f1d10c3bb79a950c7f7394bca5fae1","src/unix.rs":"6133da835aff4aa7bf29f17e43d501c6409a6b343b00232fd082f4a0db035cdb","src/windows.rs":"0fbd1efc122a7e83defb5d0a401e973a3876e90c6a1f14f2ac1976462633dd79"},"package":"744133e4a0e0a658e1374cf3bf8e415c4052a15a111acd372764c55b4177d490"}
1
+ {"files":{".cargo_vcs_info.json":"57a15fc6b2ddb894def64e09a11e5cc52e575be77740a6fc0aa3c451c8c0fa49",".github/workflows/main.yml":"e2b3d5678a31325a616bae0c1ccb223f9ed2f5b43d39a134c5c45858e4fbf4ca","CHANGELOG.md":"832a95917c80f443c113ede708a5602550d081bd79cc33f4152365357ab36bb8","Cargo.lock":"6ddd1fa91a86a6e8bbbc17161703f21799213d823ea26d7f7ce86181295a3fc8","Cargo.toml":"01ff6425d680ddac0a8efb1b41e6b0503d0153388fadcc4bf584ca38c4dca221","Cargo.toml.orig":"507519ce5facd6b78a80e3cfab1437c7b6b5a1fc99c65e24552525c884d4ed3e","LICENSE-APACHE":"04ea4849dba9dcae07113850c6f1b1a69052c625210639914eee352023f750ad","LICENSE-MIT":"0d25d03b5ab49576178ad0cae7a2648d12c17ad0452fe49c07e55e4b59aa5257","README.md":"e3388f55065d69e076d90871c0a91dc97420bd0d07b4f154b08e40ac47b115eb","examples/cat.rs":"594b9457ca6eb4ce9b840133da5076fa7b96334953df03f894233169564622f6","src/advice.rs":"a4c023982a598a77c23b5a4e524de581329d42287d639be88e2ffda3bd929511","src/lib.rs":"d93f73dd80b5bfdecc10836a7ebcd04c124f6283f9a104686fe48a18d34764ab","src/stub.rs":"beccccb0233903df5de1773674b2dcd9b0991889a10c23719f5aee8f7496f958","src/unix.rs":"fea7c7c21a6082bc77052e5e40a2bff1311103a19c2cd281ff383604fa799b35","src/windows.rs":"0fbd1efc122a7e83defb5d0a401e973a3876e90c6a1f14f2ac1976462633dd79"},"package":"714098028fe011992e1c3962653c96b2d578c4b4bce9036e15ff220319b1e0e3"}
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "git": {
3
- "sha1": "cc94666eabb82a3a1647f8dc1d949a9ffc6babff"
3
+ "sha1": "1f0196ab31cf1e6f4eb4b3633c51a4d3ba920788"
4
4
  },
5
5
  "path_in_vcs": ""
6
6
  }