html-to-markdown 2.24.6 → 2.25.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/html-to-markdown-rb/native/Cargo.lock +9 -32
  5. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -1
  6. data/lib/html_to_markdown/version.rb +1 -1
  7. data/rust-vendor/html-to-markdown-rs/Cargo.toml +0 -1
  8. data/rust-vendor/html-to-markdown-rs/src/converter/main_helpers.rs +1 -1
  9. data/rust-vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +20 -5
  10. data/rust-vendor/html-to-markdown-rs/src/lib.rs +1 -0
  11. data/rust-vendor/{markup5ever_rcdom/lib.rs → html-to-markdown-rs/src/rcdom.rs} +56 -91
  12. data/rust-vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +157 -0
  13. data/rust-vendor/memmap2/.cargo-checksum.json +1 -1
  14. data/rust-vendor/memmap2/.cargo_vcs_info.json +1 -1
  15. data/rust-vendor/memmap2/CHANGELOG.md +8 -0
  16. data/rust-vendor/memmap2/Cargo.lock +1 -1
  17. data/rust-vendor/memmap2/Cargo.toml +2 -1
  18. data/rust-vendor/memmap2/Cargo.toml.orig +2 -1
  19. data/rust-vendor/memmap2/src/lib.rs +25 -1
  20. data/rust-vendor/memmap2/src/stub.rs +1 -4
  21. data/rust-vendor/memmap2/src/unix.rs +14 -1
  22. data/rust-vendor/png/.cargo-checksum.json +1 -1
  23. data/rust-vendor/png/.cargo_vcs_info.json +1 -1
  24. data/rust-vendor/png/CHANGES.md +44 -0
  25. data/rust-vendor/png/Cargo.lock +124 -171
  26. data/rust-vendor/png/Cargo.toml +1 -1
  27. data/rust-vendor/png/Cargo.toml.orig +1 -1
  28. data/rust-vendor/png/benches/expand_paletted.rs +5 -5
  29. data/rust-vendor/png/benches/unfilter.rs +3 -3
  30. data/rust-vendor/png/src/adam7.rs +17 -10
  31. data/rust-vendor/png/src/common.rs +8 -8
  32. data/rust-vendor/png/src/decoder/mod.rs +53 -20
  33. data/rust-vendor/png/src/decoder/stream.rs +263 -78
  34. data/rust-vendor/png/src/decoder/unfiltering_buffer.rs +210 -53
  35. data/rust-vendor/png/src/decoder/zlib.rs +130 -90
  36. data/rust-vendor/png/src/encoder.rs +4 -2
  37. data/rust-vendor/png/src/{filter.rs → filter/mod.rs} +100 -367
  38. data/rust-vendor/png/src/filter/optimization-notes.md +104 -0
  39. data/rust-vendor/png/src/filter/paeth.rs +398 -0
  40. data/rust-vendor/png/src/filter/simd.rs +308 -0
  41. data/rust-vendor/png/src/lib.rs +1 -0
  42. data/rust-vendor/syn/.cargo-checksum.json +1 -1
  43. data/rust-vendor/syn/.cargo_vcs_info.json +1 -1
  44. data/rust-vendor/syn/Cargo.lock +40 -41
  45. data/rust-vendor/syn/Cargo.toml +1 -1
  46. data/rust-vendor/syn/Cargo.toml.orig +1 -1
  47. data/rust-vendor/syn/src/item.rs +61 -40
  48. data/rust-vendor/syn/src/lib.rs +2 -1
  49. data/rust-vendor/syn/tests/test_item.rs +54 -0
  50. data/rust-vendor/unicode-ident/.cargo-checksum.json +1 -1
  51. data/rust-vendor/unicode-ident/.cargo_vcs_info.json +1 -1
  52. data/rust-vendor/unicode-ident/Cargo.lock +21 -21
  53. data/rust-vendor/unicode-ident/Cargo.toml +1 -1
  54. data/rust-vendor/unicode-ident/Cargo.toml.orig +1 -1
  55. data/rust-vendor/unicode-ident/src/lib.rs +1 -1
  56. data/rust-vendor/unicode-ident/src/tables.rs +87 -97
  57. data/rust-vendor/unicode-ident/tests/static_size.rs +1 -1
  58. metadata +7 -177
  59. data/rust-vendor/markup5ever_rcdom/.cargo-checksum.json +0 -1
  60. data/rust-vendor/markup5ever_rcdom/.cargo_vcs_info.json +0 -7
  61. data/rust-vendor/markup5ever_rcdom/Cargo.lock +0 -658
  62. data/rust-vendor/markup5ever_rcdom/Cargo.toml +0 -109
  63. data/rust-vendor/markup5ever_rcdom/Cargo.toml.orig +0 -42
  64. data/rust-vendor/markup5ever_rcdom/LICENSE-APACHE +0 -201
  65. data/rust-vendor/markup5ever_rcdom/LICENSE-MIT +0 -25
  66. data/rust-vendor/markup5ever_rcdom/README.md +0 -7
  67. data/rust-vendor/markup5ever_rcdom/custom-html5lib-tokenizer-tests/regression.test +0 -69
  68. data/rust-vendor/markup5ever_rcdom/data/test/ignore +0 -1
  69. data/rust-vendor/markup5ever_rcdom/examples/hello_xml.rs +0 -39
  70. data/rust-vendor/markup5ever_rcdom/examples/html2html.rs +0 -51
  71. data/rust-vendor/markup5ever_rcdom/examples/print-rcdom.rs +0 -78
  72. data/rust-vendor/markup5ever_rcdom/examples/xml_tree_printer.rs +0 -67
  73. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.gitattributes +0 -2
  74. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.github/workflows/downstream.yml +0 -76
  75. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.github/workflows/lint.yml +0 -25
  76. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.gitignore +0 -79
  77. data/rust-vendor/markup5ever_rcdom/html5lib-tests/AUTHORS.rst +0 -34
  78. data/rust-vendor/markup5ever_rcdom/html5lib-tests/LICENSE +0 -21
  79. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/chardet/test_big5.txt +0 -51
  80. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/scripted/tests1.dat +0 -5
  81. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/test-yahoo-jp.dat +0 -10
  82. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/tests1.dat +0 -388
  83. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/tests2.dat +0 -115
  84. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint +0 -6
  85. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/__init__.py +0 -0
  86. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/__init__.py +0 -0
  87. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/LICENSE +0 -18
  88. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/__init__.py +0 -0
  89. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/lexer.py +0 -211
  90. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/lexer.pyi +0 -34
  91. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/parser.py +0 -872
  92. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/parser.pyi +0 -83
  93. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/py.typed +0 -0
  94. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/util.py +0 -72
  95. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/util.pyi +0 -7
  96. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/vendor.txt +0 -1
  97. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor-patches/funcparserlib.patch +0 -24
  98. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/lint.py +0 -280
  99. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/parser.py +0 -177
  100. data/rust-vendor/markup5ever_rcdom/html5lib-tests/pyproject.toml +0 -7
  101. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/core.test +0 -125
  102. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/injectmeta.test +0 -66
  103. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/optionaltags.test +0 -965
  104. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/options.test +0 -60
  105. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/whitespace.test +0 -51
  106. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/README.md +0 -107
  107. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/contentModelFlags.test +0 -93
  108. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/domjs.test +0 -335
  109. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/entities.test +0 -542
  110. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/escapeFlag.test +0 -36
  111. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/namedEntities.test +0 -42422
  112. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/numericEntities.test +0 -1677
  113. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/pendingSpecChanges.test +0 -9
  114. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test1.test +0 -353
  115. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test2.test +0 -275
  116. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test3.test +0 -11233
  117. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test4.test +0 -532
  118. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/unicodeChars.test +0 -1577
  119. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/unicodeCharsProblematic.test +0 -41
  120. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/xmlViolation.test +0 -20
  121. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/README.md +0 -108
  122. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/adoption01.dat +0 -354
  123. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/adoption02.dat +0 -39
  124. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/blocks.dat +0 -695
  125. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/comments01.dat +0 -217
  126. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/doctype01.dat +0 -474
  127. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/domjs-unsafe.dat +0 -0
  128. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/entities01.dat +0 -943
  129. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/entities02.dat +0 -309
  130. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/foreign-fragment.dat +0 -645
  131. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/html5test-com.dat +0 -301
  132. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/inbody01.dat +0 -54
  133. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/isindex.dat +0 -49
  134. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/main-element.dat +0 -46
  135. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/math.dat +0 -104
  136. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/menuitem-element.dat +0 -240
  137. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/namespace-sensitivity.dat +0 -22
  138. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/noscript01.dat +0 -237
  139. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/pending-spec-changes-plain-text-unsafe.dat +0 -0
  140. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/pending-spec-changes.dat +0 -46
  141. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/plain-text-unsafe.dat +0 -0
  142. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/quirks01.dat +0 -53
  143. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/ruby.dat +0 -302
  144. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scriptdata01.dat +0 -372
  145. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scripted/adoption01.dat +0 -16
  146. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scripted/ark.dat +0 -27
  147. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scripted/webkit01.dat +0 -30
  148. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/search-element.dat +0 -46
  149. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/svg.dat +0 -104
  150. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tables01.dat +0 -322
  151. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/template.dat +0 -1673
  152. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests1.dat +0 -1956
  153. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests10.dat +0 -849
  154. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests11.dat +0 -523
  155. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests12.dat +0 -62
  156. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests14.dat +0 -75
  157. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests15.dat +0 -216
  158. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests16.dat +0 -2602
  159. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests17.dat +0 -179
  160. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests18.dat +0 -558
  161. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests19.dat +0 -1398
  162. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests2.dat +0 -831
  163. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests20.dat +0 -842
  164. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests21.dat +0 -306
  165. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests22.dat +0 -190
  166. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests23.dat +0 -168
  167. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests24.dat +0 -79
  168. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests25.dat +0 -288
  169. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests26.dat +0 -453
  170. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests3.dat +0 -305
  171. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests4.dat +0 -74
  172. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests5.dat +0 -210
  173. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests6.dat +0 -663
  174. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests7.dat +0 -453
  175. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests8.dat +0 -165
  176. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests9.dat +0 -472
  177. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests_innerHTML_1.dat +0 -843
  178. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tricky01.dat +0 -336
  179. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/webkit01.dat +0 -785
  180. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/webkit02.dat +0 -554
  181. data/rust-vendor/markup5ever_rcdom/tests/foreach_html5lib_test/mod.rs +0 -41
  182. data/rust-vendor/markup5ever_rcdom/tests/html-driver.rs +0 -29
  183. data/rust-vendor/markup5ever_rcdom/tests/html-serializer.rs +0 -265
  184. data/rust-vendor/markup5ever_rcdom/tests/html-tokenizer.rs +0 -487
  185. data/rust-vendor/markup5ever_rcdom/tests/html-tree-builder.rs +0 -298
  186. data/rust-vendor/markup5ever_rcdom/tests/html-tree-sink.rs +0 -141
  187. data/rust-vendor/markup5ever_rcdom/tests/util/find_tests.rs +0 -34
  188. data/rust-vendor/markup5ever_rcdom/tests/util/runner.rs +0 -48
  189. data/rust-vendor/markup5ever_rcdom/tests/xml-driver.rs +0 -101
  190. data/rust-vendor/markup5ever_rcdom/tests/xml-tokenizer.rs +0 -374
  191. data/rust-vendor/markup5ever_rcdom/tests/xml-tree-builder.rs +0 -237
  192. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/AUTHORS.rst +0 -9
  193. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/LICENSE +0 -21
  194. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/README.md +0 -92
  195. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/comments.test +0 -274
  196. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/doctype.test +0 -3232
  197. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/entities.test +0 -283
  198. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/eof.test +0 -113
  199. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/namedEntities.test +0 -42210
  200. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/numericEntities.test +0 -1349
  201. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/test1.test +0 -162
  202. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/test2.test +0 -64
  203. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/unicodeChars.test +0 -1295
  204. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tree-construction/README.md +0 -104
  205. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tree-construction/namespace.dat +0 -119
  206. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tree-construction/test1.dat +0 -124
  207. data/rust-vendor/xml5ever/.cargo-checksum.json +0 -1
  208. data/rust-vendor/xml5ever/.cargo_vcs_info.json +0 -6
  209. data/rust-vendor/xml5ever/Cargo.lock +0 -752
  210. data/rust-vendor/xml5ever/Cargo.toml +0 -69
  211. data/rust-vendor/xml5ever/Cargo.toml.orig +0 -29
  212. data/rust-vendor/xml5ever/LICENSE-APACHE +0 -201
  213. data/rust-vendor/xml5ever/LICENSE-MIT +0 -25
  214. data/rust-vendor/xml5ever/README.md +0 -72
  215. data/rust-vendor/xml5ever/benches/xml5ever.rs +0 -77
  216. data/rust-vendor/xml5ever/data/bench/strong.xml +0 -1
  217. data/rust-vendor/xml5ever/examples/README.md +0 -223
  218. data/rust-vendor/xml5ever/examples/example.xml +0 -3
  219. data/rust-vendor/xml5ever/examples/simple_xml_tokenizer.rs +0 -81
  220. data/rust-vendor/xml5ever/examples/xml_tokenizer.rs +0 -115
  221. data/rust-vendor/xml5ever/src/driver.rs +0 -90
  222. data/rust-vendor/xml5ever/src/lib.rs +0 -47
  223. data/rust-vendor/xml5ever/src/macros.rs +0 -18
  224. data/rust-vendor/xml5ever/src/serialize/mod.rs +0 -216
  225. data/rust-vendor/xml5ever/src/tokenizer/char_ref/mod.rs +0 -456
  226. data/rust-vendor/xml5ever/src/tokenizer/interface.rs +0 -116
  227. data/rust-vendor/xml5ever/src/tokenizer/mod.rs +0 -1344
  228. data/rust-vendor/xml5ever/src/tokenizer/qname.rs +0 -84
  229. data/rust-vendor/xml5ever/src/tokenizer/states.rs +0 -167
  230. data/rust-vendor/xml5ever/src/tree_builder/mod.rs +0 -774
  231. data/rust-vendor/xml5ever/src/tree_builder/types.rs +0 -37
@@ -1,487 +0,0 @@
1
- // Copyright 2014-2017 The html5ever Project Developers. See the
2
- // COPYRIGHT file at the top-level directory of this distribution.
3
- //
4
- // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5
- // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6
- // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7
- // option. This file may not be copied, modified, or distributed
8
- // except according to those terms.
9
-
10
- mod foreach_html5lib_test;
11
-
12
- use foreach_html5lib_test::foreach_html5lib_test;
13
- use html5ever::tendril::*;
14
- use html5ever::tokenizer::states::{
15
- CdataSection, Data, Plaintext, RawData, Rawtext, Rcdata, ScriptData,
16
- };
17
- use html5ever::tokenizer::BufferQueue;
18
- use html5ever::tokenizer::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
19
- use html5ever::tokenizer::{CommentToken, DoctypeToken, TagToken, Token};
20
- use html5ever::tokenizer::{Doctype, EndTag, StartTag, Tag};
21
- use html5ever::tokenizer::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
22
- use html5ever::TokenizerResult;
23
- use html5ever::{ns, Attribute, LocalName, QualName};
24
- use serde_json::{Map, Value};
25
- use std::cell::RefCell;
26
- use std::char;
27
- use std::ffi::OsStr;
28
- use std::fs::File;
29
- use std::io::Read;
30
- use std::path::Path;
31
-
32
- use util::runner::{run_all, Test};
33
-
34
- mod util {
35
- pub mod runner;
36
- }
37
-
38
- #[derive(Debug)]
39
- struct TestError;
40
-
41
- impl PartialEq for TestError {
42
- fn eq(&self, _: &TestError) -> bool {
43
- // TODO: actually match exact error messages
44
- true
45
- }
46
- }
47
-
48
- // some large testcases hang forever without an upper-bound of splits to generate
49
- const MAX_SPLITS: usize = 1000;
50
-
51
- // Return all ways of splitting the string into at most n
52
- // possibly-empty pieces.
53
- fn splits(s: &str, n: usize) -> Vec<Vec<StrTendril>> {
54
- if n == 1 {
55
- return vec![vec![s.to_tendril()]];
56
- }
57
-
58
- let mut out = vec![];
59
- for p in s.char_indices().map(|(n, _)| n).chain(Some(s.len())) {
60
- let y = &s[p..];
61
- for mut x in splits(&s[..p], n - 1).into_iter() {
62
- x.push(y.to_tendril());
63
- out.push(x);
64
- }
65
- }
66
-
67
- out.extend(splits(s, n - 1));
68
- out.truncate(MAX_SPLITS);
69
- out
70
- }
71
-
72
- struct TokenLogger {
73
- tokens: RefCell<Vec<Token>>,
74
- errors: RefCell<Vec<TestError>>,
75
- current_str: RefCell<StrTendril>,
76
- exact_errors: bool,
77
- }
78
-
79
- impl TokenLogger {
80
- fn new(exact_errors: bool) -> TokenLogger {
81
- TokenLogger {
82
- tokens: RefCell::new(vec![]),
83
- errors: RefCell::new(vec![]),
84
- current_str: RefCell::new(StrTendril::new()),
85
- exact_errors,
86
- }
87
- }
88
-
89
- // Push anything other than character tokens
90
- fn push(&self, token: Token) {
91
- self.finish_str();
92
- self.tokens.borrow_mut().push(token);
93
- }
94
-
95
- fn finish_str(&self) {
96
- if !self.current_str.borrow().is_empty() {
97
- let s = self.current_str.take();
98
- self.tokens.borrow_mut().push(CharacterTokens(s));
99
- }
100
- }
101
-
102
- fn get_tokens(self) -> (Vec<Token>, Vec<TestError>) {
103
- self.finish_str();
104
- (self.tokens.take(), self.errors.take())
105
- }
106
- }
107
-
108
- impl TokenSink for TokenLogger {
109
- type Handle = ();
110
-
111
- fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
112
- match token {
113
- CharacterTokens(b) => {
114
- self.current_str.borrow_mut().push_slice(&b);
115
- },
116
-
117
- NullCharacterToken => {
118
- self.current_str.borrow_mut().push_char('\0');
119
- },
120
-
121
- ParseError(_) => {
122
- if self.exact_errors {
123
- self.errors.borrow_mut().push(TestError);
124
- }
125
- },
126
-
127
- TagToken(mut t) => {
128
- // The spec seems to indicate that one can emit
129
- // erroneous end tags with attrs, but the test
130
- // cases don't contain them.
131
- match t.kind {
132
- EndTag => {
133
- t.self_closing = false;
134
- t.attrs = vec![];
135
- },
136
- _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)),
137
- }
138
- self.push(TagToken(t));
139
- },
140
-
141
- EOFToken => (),
142
-
143
- _ => self.push(token),
144
- }
145
- TokenSinkResult::Continue
146
- }
147
- }
148
-
149
- fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> (Vec<Token>, Vec<TestError>) {
150
- let sink = TokenLogger::new(opts.exact_errors);
151
- let tokenizer = Tokenizer::new(sink, opts);
152
-
153
- let buffer = BufferQueue::default();
154
- for chunk in input.into_iter() {
155
- buffer.push_back(chunk);
156
- }
157
-
158
- while tokenizer.feed(&buffer) != TokenizerResult::Done {
159
- // Ignore any script tags...
160
- }
161
-
162
- tokenizer.end();
163
- tokenizer.sink.get_tokens()
164
- }
165
-
166
- trait JsonExt: Sized {
167
- fn get_str(&self) -> String;
168
- fn get_tendril(&self) -> StrTendril;
169
- fn get_nullable_tendril(&self) -> Option<StrTendril>;
170
- fn get_bool(&self) -> bool;
171
- fn get_obj(&self) -> &Map<String, Self>;
172
- fn get_list(&self) -> &Vec<Self>;
173
- fn find(&self, key: &str) -> &Self;
174
- }
175
-
176
- impl JsonExt for Value {
177
- fn get_str(&self) -> String {
178
- match *self {
179
- Value::String(ref s) => s.to_string(),
180
- _ => panic!("Value::get_str: not a String"),
181
- }
182
- }
183
-
184
- fn get_tendril(&self) -> StrTendril {
185
- match *self {
186
- Value::String(ref s) => s.to_tendril(),
187
- _ => panic!("Value::get_tendril: not a String"),
188
- }
189
- }
190
-
191
- fn get_nullable_tendril(&self) -> Option<StrTendril> {
192
- match *self {
193
- Value::Null => None,
194
- Value::String(ref s) => Some(s.to_tendril()),
195
- _ => panic!("Value::get_nullable_tendril: not a String"),
196
- }
197
- }
198
-
199
- fn get_bool(&self) -> bool {
200
- match *self {
201
- Value::Bool(b) => b,
202
- _ => panic!("Value::get_bool: not a Bool"),
203
- }
204
- }
205
-
206
- fn get_obj(&self) -> &Map<String, Value> {
207
- match self {
208
- Value::Object(m) => m,
209
- _ => panic!("Value::get_obj: not an Object"),
210
- }
211
- }
212
-
213
- fn get_list(&self) -> &Vec<Value> {
214
- match self {
215
- Value::Array(m) => m,
216
- _ => panic!("Value::get_list: not an Array"),
217
- }
218
- }
219
-
220
- fn find(&self, key: &str) -> &Value {
221
- self.get_obj().get(key).unwrap()
222
- }
223
- }
224
-
225
- // Parse a JSON object (other than "ParseError") to a token.
226
- fn json_to_token(js: &Value) -> Token {
227
- let parts = js.get_list();
228
- // Collect refs here so we don't have to use "ref" in all the patterns below.
229
- let args: Vec<&Value> = parts[1..].iter().collect();
230
- match &*parts[0].get_str() {
231
- "DOCTYPE" => DoctypeToken(Doctype {
232
- name: args[0].get_nullable_tendril(),
233
- public_id: args[1].get_nullable_tendril(),
234
- system_id: args[2].get_nullable_tendril(),
235
- force_quirks: !args[3].get_bool(),
236
- }),
237
-
238
- "StartTag" => TagToken(Tag {
239
- kind: StartTag,
240
- name: LocalName::from(&*args[0].get_str()),
241
- attrs: args[1]
242
- .get_obj()
243
- .iter()
244
- .map(|(k, v)| Attribute {
245
- name: QualName::new(None, ns!(), LocalName::from(&**k)),
246
- value: v.get_tendril(),
247
- })
248
- .collect(),
249
- self_closing: match args.get(2) {
250
- Some(b) => b.get_bool(),
251
- None => false,
252
- },
253
- }),
254
-
255
- "EndTag" => TagToken(Tag {
256
- kind: EndTag,
257
- name: LocalName::from(&*args[0].get_str()),
258
- attrs: vec![],
259
- self_closing: false,
260
- }),
261
-
262
- "Comment" => CommentToken(args[0].get_tendril()),
263
-
264
- "Character" => CharacterTokens(args[0].get_tendril()),
265
-
266
- // We don't need to produce NullCharacterToken because
267
- // the TokenLogger will convert them to CharacterTokens.
268
- _ => panic!("don't understand token {parts:?}"),
269
- }
270
- }
271
-
272
- // Parse the "output" field of the test case into a vector of tokens.
273
- fn json_to_tokens(
274
- js_tokens: &Value,
275
- js_errors: &[Value],
276
- exact_errors: bool,
277
- ) -> (Vec<Token>, Vec<TestError>) {
278
- // Use a TokenLogger so that we combine character tokens separated
279
- // by an ignored error.
280
- let sink = TokenLogger::new(exact_errors);
281
- for tok in js_tokens.get_list().iter() {
282
- assert_eq!(
283
- sink.process_token(json_to_token(tok), 0),
284
- TokenSinkResult::Continue
285
- );
286
- }
287
-
288
- for err in js_errors {
289
- assert_eq!(
290
- sink.process_token(ParseError(err.find("code").get_str().into()), 0),
291
- TokenSinkResult::Continue
292
- );
293
- }
294
-
295
- sink.get_tokens()
296
- }
297
-
298
- // Undo the escaping in "doubleEscaped" tests.
299
- fn unescape(s: &str) -> Option<String> {
300
- let mut out = String::with_capacity(s.len());
301
- let mut it = s.chars().peekable();
302
- loop {
303
- match it.next() {
304
- None => return Some(out),
305
- Some('\\') => {
306
- if it.peek() != Some(&'u') {
307
- panic!("can't understand escape");
308
- }
309
- let _ = it.next();
310
- let hex: String = it.by_ref().take(4).collect();
311
- match u32::from_str_radix(&hex, 16).ok().and_then(char::from_u32) {
312
- // Some of the tests use lone surrogates, but we have no
313
- // way to represent them in the UTF-8 input to our parser.
314
- // Since these can only come from script, we will catch
315
- // them there.
316
- None => return None,
317
- Some(c) => out.push(c),
318
- }
319
- },
320
- Some(c) => out.push(c),
321
- }
322
- }
323
- }
324
-
325
- fn unescape_json(js: &Value) -> Value {
326
- match js {
327
- // unwrap is OK here because the spec'd *output* of the tokenizer never
328
- // contains a lone surrogate.
329
- Value::String(s) => Value::String(unescape(s).unwrap()),
330
- Value::Array(xs) => Value::Array(xs.iter().map(unescape_json).collect()),
331
- Value::Object(obj) => {
332
- let mut new_obj = Map::new();
333
- for (k, v) in obj.iter() {
334
- new_obj.insert(k.clone(), unescape_json(v));
335
- }
336
- Value::Object(new_obj)
337
- },
338
- _ => js.clone(),
339
- }
340
- }
341
-
342
- fn mk_test(
343
- desc: String,
344
- input: String,
345
- expect: Value,
346
- expect_errors: Vec<Value>,
347
- opts: TokenizerOpts,
348
- ) -> Test {
349
- Test {
350
- name: desc,
351
- skip: false,
352
- test: Box::new(move || {
353
- // Split up the input at different points to test incremental tokenization.
354
- let insplits = splits(&input, 3);
355
- for input in insplits.into_iter() {
356
- // Clone 'input' so we have it for the failure message.
357
- // Also clone opts. If we don't, we get the wrong
358
- // result but the compiler doesn't catch it!
359
- // Possibly mozilla/rust#12223.
360
- let output = tokenize(input.clone(), opts.clone());
361
- let expect_toks = json_to_tokens(&expect, &expect_errors, opts.exact_errors);
362
- if output != expect_toks {
363
- panic!("\ninput: {input:?}\ngot: {output:?}\nexpected: {expect_toks:?}");
364
- }
365
- }
366
- }),
367
- }
368
- }
369
-
370
- fn mk_tests(tests: &mut Vec<Test>, filename: &str, js: &Value) {
371
- let obj = js.get_obj();
372
- let mut input = js.find("input").get_str();
373
- let mut expect = js.find("output").clone();
374
- let expect_errors = js
375
- .get("errors")
376
- .map(JsonExt::get_list)
377
- .map(Vec::as_slice)
378
- .unwrap_or_default();
379
- let desc = format!("tok: {}: {}", filename, js.find("description").get_str());
380
-
381
- // "Double-escaped" tests require additional processing of
382
- // the input and output.
383
- if obj
384
- .get(&"doubleEscaped".to_string())
385
- .is_some_and(|j| j.get_bool())
386
- {
387
- match unescape(&input) {
388
- None => return,
389
- Some(i) => input = i,
390
- }
391
- expect = unescape_json(&expect);
392
- }
393
-
394
- // Some tests have a last start tag name.
395
- let start_tag = obj.get(&"lastStartTag".to_string()).map(|s| s.get_str());
396
-
397
- // Some tests want to start in a state other than Data.
398
- let state_overrides = match obj.get(&"initialStates".to_string()) {
399
- Some(Value::Array(xs)) => xs
400
- .iter()
401
- .map(|s| {
402
- Some(match &s.get_str()[..] {
403
- "PLAINTEXT state" => Plaintext,
404
- "RAWTEXT state" => RawData(Rawtext),
405
- "RCDATA state" => RawData(Rcdata),
406
- "Script data state" => RawData(ScriptData),
407
- "CDATA section state" => CdataSection,
408
- "Data state" => Data,
409
- s => panic!("don't know state {s}"),
410
- })
411
- })
412
- .collect(),
413
- None => vec![None],
414
- _ => panic!("don't understand initialStates value"),
415
- };
416
-
417
- // Build the tests.
418
- for state in state_overrides.into_iter() {
419
- for &exact_errors in [false, true].iter() {
420
- let mut newdesc = desc.clone();
421
- if let Some(s) = state {
422
- newdesc = format!("{newdesc} (in state {s:?})")
423
- };
424
- if exact_errors {
425
- newdesc = format!("{newdesc} (exact errors)");
426
- }
427
-
428
- tests.push(mk_test(
429
- newdesc,
430
- input.clone(),
431
- expect.clone(),
432
- expect_errors.to_owned(),
433
- TokenizerOpts {
434
- exact_errors,
435
- initial_state: state,
436
- last_start_tag_name: start_tag.clone(),
437
-
438
- // Not discarding a BOM is what the test suite expects; see
439
- // https://github.com/html5lib/html5lib-tests/issues/2
440
- discard_bom: false,
441
-
442
- ..Default::default()
443
- },
444
- ));
445
- }
446
- }
447
- }
448
-
449
- fn tests(src_dir: &Path) -> Vec<Test> {
450
- let mut tests = vec![];
451
-
452
- let mut add_test = |path: &Path, mut file: File| {
453
- let mut s = String::new();
454
- file.read_to_string(&mut s).expect("file reading error");
455
- let js: Value = serde_json::from_str(&s).expect("json parse error");
456
-
457
- if let Some(Value::Array(lst)) = js.get_obj().get("tests") {
458
- for test in lst.iter() {
459
- mk_tests(
460
- &mut tests,
461
- path.file_name().unwrap().to_str().unwrap(),
462
- test,
463
- )
464
- }
465
- }
466
- };
467
-
468
- foreach_html5lib_test(
469
- src_dir,
470
- "html5lib-tests/tokenizer",
471
- OsStr::new("test"),
472
- &mut add_test,
473
- );
474
-
475
- foreach_html5lib_test(
476
- src_dir,
477
- "custom-html5lib-tokenizer-tests",
478
- OsStr::new("test"),
479
- &mut add_test,
480
- );
481
-
482
- tests
483
- }
484
-
485
- fn main() {
486
- run_all(tests(Path::new("./")));
487
- }