html-to-markdown 2.24.6 → 2.25.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/html-to-markdown-rb/native/Cargo.lock +9 -32
  5. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -1
  6. data/lib/html_to_markdown/version.rb +1 -1
  7. data/rust-vendor/html-to-markdown-rs/Cargo.toml +0 -1
  8. data/rust-vendor/html-to-markdown-rs/src/converter/main_helpers.rs +1 -1
  9. data/rust-vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +20 -5
  10. data/rust-vendor/html-to-markdown-rs/src/lib.rs +1 -0
  11. data/rust-vendor/{markup5ever_rcdom/lib.rs → html-to-markdown-rs/src/rcdom.rs} +56 -91
  12. data/rust-vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +157 -0
  13. data/rust-vendor/memmap2/.cargo-checksum.json +1 -1
  14. data/rust-vendor/memmap2/.cargo_vcs_info.json +1 -1
  15. data/rust-vendor/memmap2/CHANGELOG.md +8 -0
  16. data/rust-vendor/memmap2/Cargo.lock +1 -1
  17. data/rust-vendor/memmap2/Cargo.toml +2 -1
  18. data/rust-vendor/memmap2/Cargo.toml.orig +2 -1
  19. data/rust-vendor/memmap2/src/lib.rs +25 -1
  20. data/rust-vendor/memmap2/src/stub.rs +1 -4
  21. data/rust-vendor/memmap2/src/unix.rs +14 -1
  22. data/rust-vendor/png/.cargo-checksum.json +1 -1
  23. data/rust-vendor/png/.cargo_vcs_info.json +1 -1
  24. data/rust-vendor/png/CHANGES.md +44 -0
  25. data/rust-vendor/png/Cargo.lock +124 -171
  26. data/rust-vendor/png/Cargo.toml +1 -1
  27. data/rust-vendor/png/Cargo.toml.orig +1 -1
  28. data/rust-vendor/png/benches/expand_paletted.rs +5 -5
  29. data/rust-vendor/png/benches/unfilter.rs +3 -3
  30. data/rust-vendor/png/src/adam7.rs +17 -10
  31. data/rust-vendor/png/src/common.rs +8 -8
  32. data/rust-vendor/png/src/decoder/mod.rs +53 -20
  33. data/rust-vendor/png/src/decoder/stream.rs +263 -78
  34. data/rust-vendor/png/src/decoder/unfiltering_buffer.rs +210 -53
  35. data/rust-vendor/png/src/decoder/zlib.rs +130 -90
  36. data/rust-vendor/png/src/encoder.rs +4 -2
  37. data/rust-vendor/png/src/{filter.rs → filter/mod.rs} +100 -367
  38. data/rust-vendor/png/src/filter/optimization-notes.md +104 -0
  39. data/rust-vendor/png/src/filter/paeth.rs +398 -0
  40. data/rust-vendor/png/src/filter/simd.rs +308 -0
  41. data/rust-vendor/png/src/lib.rs +1 -0
  42. data/rust-vendor/syn/.cargo-checksum.json +1 -1
  43. data/rust-vendor/syn/.cargo_vcs_info.json +1 -1
  44. data/rust-vendor/syn/Cargo.lock +40 -41
  45. data/rust-vendor/syn/Cargo.toml +1 -1
  46. data/rust-vendor/syn/Cargo.toml.orig +1 -1
  47. data/rust-vendor/syn/src/item.rs +61 -40
  48. data/rust-vendor/syn/src/lib.rs +2 -1
  49. data/rust-vendor/syn/tests/test_item.rs +54 -0
  50. data/rust-vendor/unicode-ident/.cargo-checksum.json +1 -1
  51. data/rust-vendor/unicode-ident/.cargo_vcs_info.json +1 -1
  52. data/rust-vendor/unicode-ident/Cargo.lock +21 -21
  53. data/rust-vendor/unicode-ident/Cargo.toml +1 -1
  54. data/rust-vendor/unicode-ident/Cargo.toml.orig +1 -1
  55. data/rust-vendor/unicode-ident/src/lib.rs +1 -1
  56. data/rust-vendor/unicode-ident/src/tables.rs +87 -97
  57. data/rust-vendor/unicode-ident/tests/static_size.rs +1 -1
  58. metadata +7 -177
  59. data/rust-vendor/markup5ever_rcdom/.cargo-checksum.json +0 -1
  60. data/rust-vendor/markup5ever_rcdom/.cargo_vcs_info.json +0 -7
  61. data/rust-vendor/markup5ever_rcdom/Cargo.lock +0 -658
  62. data/rust-vendor/markup5ever_rcdom/Cargo.toml +0 -109
  63. data/rust-vendor/markup5ever_rcdom/Cargo.toml.orig +0 -42
  64. data/rust-vendor/markup5ever_rcdom/LICENSE-APACHE +0 -201
  65. data/rust-vendor/markup5ever_rcdom/LICENSE-MIT +0 -25
  66. data/rust-vendor/markup5ever_rcdom/README.md +0 -7
  67. data/rust-vendor/markup5ever_rcdom/custom-html5lib-tokenizer-tests/regression.test +0 -69
  68. data/rust-vendor/markup5ever_rcdom/data/test/ignore +0 -1
  69. data/rust-vendor/markup5ever_rcdom/examples/hello_xml.rs +0 -39
  70. data/rust-vendor/markup5ever_rcdom/examples/html2html.rs +0 -51
  71. data/rust-vendor/markup5ever_rcdom/examples/print-rcdom.rs +0 -78
  72. data/rust-vendor/markup5ever_rcdom/examples/xml_tree_printer.rs +0 -67
  73. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.gitattributes +0 -2
  74. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.github/workflows/downstream.yml +0 -76
  75. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.github/workflows/lint.yml +0 -25
  76. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.gitignore +0 -79
  77. data/rust-vendor/markup5ever_rcdom/html5lib-tests/AUTHORS.rst +0 -34
  78. data/rust-vendor/markup5ever_rcdom/html5lib-tests/LICENSE +0 -21
  79. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/chardet/test_big5.txt +0 -51
  80. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/scripted/tests1.dat +0 -5
  81. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/test-yahoo-jp.dat +0 -10
  82. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/tests1.dat +0 -388
  83. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/tests2.dat +0 -115
  84. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint +0 -6
  85. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/__init__.py +0 -0
  86. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/__init__.py +0 -0
  87. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/LICENSE +0 -18
  88. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/__init__.py +0 -0
  89. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/lexer.py +0 -211
  90. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/lexer.pyi +0 -34
  91. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/parser.py +0 -872
  92. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/parser.pyi +0 -83
  93. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/py.typed +0 -0
  94. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/util.py +0 -72
  95. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/util.pyi +0 -7
  96. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/vendor.txt +0 -1
  97. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor-patches/funcparserlib.patch +0 -24
  98. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/lint.py +0 -280
  99. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/parser.py +0 -177
  100. data/rust-vendor/markup5ever_rcdom/html5lib-tests/pyproject.toml +0 -7
  101. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/core.test +0 -125
  102. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/injectmeta.test +0 -66
  103. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/optionaltags.test +0 -965
  104. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/options.test +0 -60
  105. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/whitespace.test +0 -51
  106. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/README.md +0 -107
  107. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/contentModelFlags.test +0 -93
  108. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/domjs.test +0 -335
  109. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/entities.test +0 -542
  110. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/escapeFlag.test +0 -36
  111. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/namedEntities.test +0 -42422
  112. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/numericEntities.test +0 -1677
  113. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/pendingSpecChanges.test +0 -9
  114. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test1.test +0 -353
  115. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test2.test +0 -275
  116. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test3.test +0 -11233
  117. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test4.test +0 -532
  118. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/unicodeChars.test +0 -1577
  119. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/unicodeCharsProblematic.test +0 -41
  120. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/xmlViolation.test +0 -20
  121. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/README.md +0 -108
  122. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/adoption01.dat +0 -354
  123. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/adoption02.dat +0 -39
  124. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/blocks.dat +0 -695
  125. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/comments01.dat +0 -217
  126. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/doctype01.dat +0 -474
  127. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/domjs-unsafe.dat +0 -0
  128. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/entities01.dat +0 -943
  129. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/entities02.dat +0 -309
  130. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/foreign-fragment.dat +0 -645
  131. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/html5test-com.dat +0 -301
  132. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/inbody01.dat +0 -54
  133. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/isindex.dat +0 -49
  134. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/main-element.dat +0 -46
  135. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/math.dat +0 -104
  136. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/menuitem-element.dat +0 -240
  137. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/namespace-sensitivity.dat +0 -22
  138. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/noscript01.dat +0 -237
  139. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/pending-spec-changes-plain-text-unsafe.dat +0 -0
  140. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/pending-spec-changes.dat +0 -46
  141. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/plain-text-unsafe.dat +0 -0
  142. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/quirks01.dat +0 -53
  143. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/ruby.dat +0 -302
  144. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scriptdata01.dat +0 -372
  145. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scripted/adoption01.dat +0 -16
  146. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scripted/ark.dat +0 -27
  147. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scripted/webkit01.dat +0 -30
  148. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/search-element.dat +0 -46
  149. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/svg.dat +0 -104
  150. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tables01.dat +0 -322
  151. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/template.dat +0 -1673
  152. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests1.dat +0 -1956
  153. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests10.dat +0 -849
  154. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests11.dat +0 -523
  155. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests12.dat +0 -62
  156. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests14.dat +0 -75
  157. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests15.dat +0 -216
  158. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests16.dat +0 -2602
  159. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests17.dat +0 -179
  160. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests18.dat +0 -558
  161. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests19.dat +0 -1398
  162. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests2.dat +0 -831
  163. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests20.dat +0 -842
  164. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests21.dat +0 -306
  165. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests22.dat +0 -190
  166. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests23.dat +0 -168
  167. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests24.dat +0 -79
  168. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests25.dat +0 -288
  169. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests26.dat +0 -453
  170. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests3.dat +0 -305
  171. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests4.dat +0 -74
  172. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests5.dat +0 -210
  173. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests6.dat +0 -663
  174. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests7.dat +0 -453
  175. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests8.dat +0 -165
  176. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests9.dat +0 -472
  177. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests_innerHTML_1.dat +0 -843
  178. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tricky01.dat +0 -336
  179. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/webkit01.dat +0 -785
  180. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/webkit02.dat +0 -554
  181. data/rust-vendor/markup5ever_rcdom/tests/foreach_html5lib_test/mod.rs +0 -41
  182. data/rust-vendor/markup5ever_rcdom/tests/html-driver.rs +0 -29
  183. data/rust-vendor/markup5ever_rcdom/tests/html-serializer.rs +0 -265
  184. data/rust-vendor/markup5ever_rcdom/tests/html-tokenizer.rs +0 -487
  185. data/rust-vendor/markup5ever_rcdom/tests/html-tree-builder.rs +0 -298
  186. data/rust-vendor/markup5ever_rcdom/tests/html-tree-sink.rs +0 -141
  187. data/rust-vendor/markup5ever_rcdom/tests/util/find_tests.rs +0 -34
  188. data/rust-vendor/markup5ever_rcdom/tests/util/runner.rs +0 -48
  189. data/rust-vendor/markup5ever_rcdom/tests/xml-driver.rs +0 -101
  190. data/rust-vendor/markup5ever_rcdom/tests/xml-tokenizer.rs +0 -374
  191. data/rust-vendor/markup5ever_rcdom/tests/xml-tree-builder.rs +0 -237
  192. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/AUTHORS.rst +0 -9
  193. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/LICENSE +0 -21
  194. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/README.md +0 -92
  195. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/comments.test +0 -274
  196. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/doctype.test +0 -3232
  197. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/entities.test +0 -283
  198. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/eof.test +0 -113
  199. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/namedEntities.test +0 -42210
  200. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/numericEntities.test +0 -1349
  201. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/test1.test +0 -162
  202. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/test2.test +0 -64
  203. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/unicodeChars.test +0 -1295
  204. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tree-construction/README.md +0 -104
  205. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tree-construction/namespace.dat +0 -119
  206. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tree-construction/test1.dat +0 -124
  207. data/rust-vendor/xml5ever/.cargo-checksum.json +0 -1
  208. data/rust-vendor/xml5ever/.cargo_vcs_info.json +0 -6
  209. data/rust-vendor/xml5ever/Cargo.lock +0 -752
  210. data/rust-vendor/xml5ever/Cargo.toml +0 -69
  211. data/rust-vendor/xml5ever/Cargo.toml.orig +0 -29
  212. data/rust-vendor/xml5ever/LICENSE-APACHE +0 -201
  213. data/rust-vendor/xml5ever/LICENSE-MIT +0 -25
  214. data/rust-vendor/xml5ever/README.md +0 -72
  215. data/rust-vendor/xml5ever/benches/xml5ever.rs +0 -77
  216. data/rust-vendor/xml5ever/data/bench/strong.xml +0 -1
  217. data/rust-vendor/xml5ever/examples/README.md +0 -223
  218. data/rust-vendor/xml5ever/examples/example.xml +0 -3
  219. data/rust-vendor/xml5ever/examples/simple_xml_tokenizer.rs +0 -81
  220. data/rust-vendor/xml5ever/examples/xml_tokenizer.rs +0 -115
  221. data/rust-vendor/xml5ever/src/driver.rs +0 -90
  222. data/rust-vendor/xml5ever/src/lib.rs +0 -47
  223. data/rust-vendor/xml5ever/src/macros.rs +0 -18
  224. data/rust-vendor/xml5ever/src/serialize/mod.rs +0 -216
  225. data/rust-vendor/xml5ever/src/tokenizer/char_ref/mod.rs +0 -456
  226. data/rust-vendor/xml5ever/src/tokenizer/interface.rs +0 -116
  227. data/rust-vendor/xml5ever/src/tokenizer/mod.rs +0 -1344
  228. data/rust-vendor/xml5ever/src/tokenizer/qname.rs +0 -84
  229. data/rust-vendor/xml5ever/src/tokenizer/states.rs +0 -167
  230. data/rust-vendor/xml5ever/src/tree_builder/mod.rs +0 -774
  231. data/rust-vendor/xml5ever/src/tree_builder/types.rs +0 -37
@@ -1,216 +0,0 @@
1
- // Copyright 2014-2017 The html5ever Project Developers. See the
2
- // COPYRIGHT file at the top-level directory of this distribution.
3
- //
4
- // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5
- // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6
- // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7
- // option. This file may not be copied, modified, or distributed
8
- // except according to those terms.
9
-
10
- use crate::tree_builder::NamespaceMap;
11
- use crate::QualName;
12
- pub use markup5ever::serialize::{AttrRef, Serialize, Serializer, TraversalScope};
13
- use std::io::{self, Write};
14
-
15
- #[derive(Clone)]
16
- /// Struct for setting serializer options.
17
- pub struct SerializeOpts {
18
- /// Serialize the root node? Default: ChildrenOnly
19
- pub traversal_scope: TraversalScope,
20
- }
21
-
22
- impl Default for SerializeOpts {
23
- fn default() -> SerializeOpts {
24
- SerializeOpts {
25
- traversal_scope: TraversalScope::ChildrenOnly(None),
26
- }
27
- }
28
- }
29
-
30
- /// Method for serializing generic node to a given writer.
31
- pub fn serialize<Wr, T>(writer: Wr, node: &T, opts: SerializeOpts) -> io::Result<()>
32
- where
33
- Wr: Write,
34
- T: Serialize,
35
- {
36
- let mut ser = XmlSerializer::new(writer);
37
- node.serialize(&mut ser, opts.traversal_scope)
38
- }
39
-
40
- /// Struct used for serializing nodes into a text that other XML
41
- /// parses can read.
42
- ///
43
- /// Serializer contains a set of functions (start_elem, end_elem...)
44
- /// that make parsing nodes easier.
45
- pub struct XmlSerializer<Wr> {
46
- writer: Wr,
47
- namespace_stack: NamespaceMapStack,
48
- }
49
-
50
- #[derive(Debug)]
51
- struct NamespaceMapStack(Vec<NamespaceMap>);
52
-
53
- impl NamespaceMapStack {
54
- fn new() -> NamespaceMapStack {
55
- NamespaceMapStack(vec![])
56
- }
57
-
58
- fn push(&mut self, namespace: NamespaceMap) {
59
- self.0.push(namespace);
60
- }
61
-
62
- fn pop(&mut self) {
63
- self.0.pop();
64
- }
65
- }
66
-
67
- /// Writes given text into the Serializer, escaping it,
68
- /// depending on where the text is written inside the tag or attribute value.
69
- ///
70
- /// For example
71
- ///```text
72
- /// <tag>'&-quotes'</tag> becomes <tag>'&amp;-quotes'</tag>
73
- /// <tag = "'&-quotes'"> becomes <tag = "&apos;&amp;-quotes&apos;"
74
- ///```
75
- fn write_to_buf_escaped<W: Write>(writer: &mut W, text: &str, attr_mode: bool) -> io::Result<()> {
76
- for c in text.chars() {
77
- match c {
78
- '&' => writer.write_all(b"&amp;"),
79
- '\'' if attr_mode => writer.write_all(b"&apos;"),
80
- '"' if attr_mode => writer.write_all(b"&quot;"),
81
- '<' if !attr_mode => writer.write_all(b"&lt;"),
82
- '>' if !attr_mode => writer.write_all(b"&gt;"),
83
- c => writer.write_fmt(format_args!("{c}")),
84
- }?;
85
- }
86
- Ok(())
87
- }
88
-
89
- #[inline]
90
- fn write_qual_name<W: Write>(writer: &mut W, name: &QualName) -> io::Result<()> {
91
- if let Some(ref prefix) = name.prefix {
92
- writer.write_all(prefix.as_bytes())?;
93
- writer.write_all(b":")?;
94
- }
95
-
96
- writer.write_all(name.local.as_bytes())?;
97
- Ok(())
98
- }
99
-
100
- impl<Wr: Write> XmlSerializer<Wr> {
101
- /// Creates a new Serializier from a writer and given serialization options.
102
- pub fn new(writer: Wr) -> Self {
103
- XmlSerializer {
104
- writer,
105
- namespace_stack: NamespaceMapStack::new(),
106
- }
107
- }
108
-
109
- #[inline(always)]
110
- fn qual_name(&mut self, name: &QualName) -> io::Result<()> {
111
- self.find_or_insert_ns(name);
112
- write_qual_name(&mut self.writer, name)
113
- }
114
-
115
- #[inline(always)]
116
- fn qual_attr_name(&mut self, name: &QualName) -> io::Result<()> {
117
- self.find_or_insert_ns(name);
118
- write_qual_name(&mut self.writer, name)
119
- }
120
-
121
- fn find_uri(&self, name: &QualName) -> bool {
122
- let mut found = false;
123
- for stack in self.namespace_stack.0.iter().rev() {
124
- if let Some(Some(el)) = stack.get(&name.prefix) {
125
- found = *el == name.ns;
126
- break;
127
- }
128
- }
129
- found
130
- }
131
-
132
- fn find_or_insert_ns(&mut self, name: &QualName) {
133
- if (name.prefix.is_some() || !name.ns.is_empty()) && !self.find_uri(name) {
134
- if let Some(last_ns) = self.namespace_stack.0.last_mut() {
135
- last_ns.insert(name);
136
- }
137
- }
138
- }
139
- }
140
-
141
- impl<Wr: Write> Serializer for XmlSerializer<Wr> {
142
- /// Serializes given start element into text. Start element contains
143
- /// qualified name and an attributes iterator.
144
- fn start_elem<'a, AttrIter>(&mut self, name: QualName, attrs: AttrIter) -> io::Result<()>
145
- where
146
- AttrIter: Iterator<Item = AttrRef<'a>>,
147
- {
148
- self.namespace_stack.push(NamespaceMap::empty());
149
-
150
- self.writer.write_all(b"<")?;
151
- self.qual_name(&name)?;
152
- if let Some(current_namespace) = self.namespace_stack.0.last() {
153
- for (prefix, url_opt) in current_namespace.get_scope_iter() {
154
- self.writer.write_all(b" xmlns")?;
155
- if let Some(ref prefix) = *prefix {
156
- self.writer.write_all(b":")?;
157
- self.writer.write_all(prefix.as_bytes())?;
158
- }
159
-
160
- self.writer.write_all(b"=\"")?;
161
- let url = if let Some(ref a) = *url_opt {
162
- a.as_bytes()
163
- } else {
164
- b""
165
- };
166
- self.writer.write_all(url)?;
167
- self.writer.write_all(b"\"")?;
168
- }
169
- }
170
- for (name, value) in attrs {
171
- self.writer.write_all(b" ")?;
172
- self.qual_attr_name(name)?;
173
- self.writer.write_all(b"=\"")?;
174
- write_to_buf_escaped(&mut self.writer, value, true)?;
175
- self.writer.write_all(b"\"")?;
176
- }
177
- self.writer.write_all(b">")?;
178
- Ok(())
179
- }
180
-
181
- /// Serializes given end element into text.
182
- fn end_elem(&mut self, name: QualName) -> io::Result<()> {
183
- self.namespace_stack.pop();
184
- self.writer.write_all(b"</")?;
185
- self.qual_name(&name)?;
186
- self.writer.write_all(b">")
187
- }
188
-
189
- /// Serializes comment into text.
190
- fn write_comment(&mut self, text: &str) -> io::Result<()> {
191
- self.writer.write_all(b"<!--")?;
192
- self.writer.write_all(text.as_bytes())?;
193
- self.writer.write_all(b"-->")
194
- }
195
-
196
- /// Serializes given doctype
197
- fn write_doctype(&mut self, name: &str) -> io::Result<()> {
198
- self.writer.write_all(b"<!DOCTYPE ")?;
199
- self.writer.write_all(name.as_bytes())?;
200
- self.writer.write_all(b">")
201
- }
202
-
203
- /// Serializes text for a node or an attributes.
204
- fn write_text(&mut self, text: &str) -> io::Result<()> {
205
- write_to_buf_escaped(&mut self.writer, text, false)
206
- }
207
-
208
- /// Serializes given processing instruction.
209
- fn write_processing_instruction(&mut self, target: &str, data: &str) -> io::Result<()> {
210
- self.writer.write_all(b"<?")?;
211
- self.writer.write_all(target.as_bytes())?;
212
- self.writer.write_all(b" ")?;
213
- self.writer.write_all(data.as_bytes())?;
214
- self.writer.write_all(b"?>")
215
- }
216
- }
@@ -1,456 +0,0 @@
1
- // Copyright 2014-2017 The html5ever Project Developers. See the
2
- // COPYRIGHT file at the top-level directory of this distribution.
3
- //
4
- // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5
- // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6
- // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7
- // option. This file may not be copied, modified, or distributed
8
- // except according to those terms.
9
-
10
- use super::{TokenSink, XmlTokenizer};
11
- use crate::data;
12
- use crate::tendril::StrTendril;
13
- use log::debug;
14
- use markup5ever::buffer_queue::BufferQueue;
15
- use std::borrow::Cow::{self, Borrowed};
16
- use std::char::from_u32;
17
-
18
- use self::State::*;
19
- pub use self::Status::*;
20
-
21
- //§ tokenizing-character-references
22
- pub struct CharRef {
23
- /// The resulting character(s)
24
- pub chars: [char; 2],
25
-
26
- /// How many slots in `chars` are valid?
27
- pub num_chars: u8,
28
- }
29
-
30
- pub enum Status {
31
- Stuck,
32
- Progress,
33
- Done,
34
- }
35
-
36
- #[derive(Debug)]
37
- enum State {
38
- Begin,
39
- Octothorpe,
40
- Numeric(u32), // base
41
- NumericSemicolon,
42
- Named,
43
- BogusName,
44
- }
45
-
46
- pub struct CharRefTokenizer {
47
- state: State,
48
- addnl_allowed: Option<char>,
49
- result: Option<CharRef>,
50
-
51
- num: u32,
52
- num_too_big: bool,
53
- seen_digit: bool,
54
- hex_marker: Option<char>,
55
-
56
- name_buf_opt: Option<StrTendril>,
57
- name_match: Option<(u32, u32)>,
58
- name_len: usize,
59
- }
60
-
61
- impl CharRefTokenizer {
62
- // NB: We assume that we have an additional allowed character iff we're
63
- // tokenizing in an attribute value.
64
- pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer {
65
- CharRefTokenizer {
66
- state: Begin,
67
- addnl_allowed,
68
- result: None,
69
- num: 0,
70
- num_too_big: false,
71
- seen_digit: false,
72
- hex_marker: None,
73
- name_buf_opt: None,
74
- name_match: None,
75
- name_len: 0,
76
- }
77
- }
78
-
79
- // A CharRefTokenizer can only tokenize one character reference,
80
- // so this method consumes the tokenizer.
81
- pub fn get_result(self) -> CharRef {
82
- self.result.expect("get_result called before done")
83
- }
84
-
85
- fn name_buf(&self) -> &StrTendril {
86
- self.name_buf_opt
87
- .as_ref()
88
- .expect("name_buf missing in named character reference")
89
- }
90
-
91
- fn name_buf_mut(&mut self) -> &mut StrTendril {
92
- self.name_buf_opt
93
- .as_mut()
94
- .expect("name_buf missing in named character reference")
95
- }
96
-
97
- fn finish_none(&mut self) -> Status {
98
- self.result = Some(CharRef {
99
- chars: ['\0', '\0'],
100
- num_chars: 0,
101
- });
102
- Done
103
- }
104
-
105
- fn finish_one(&mut self, c: char) -> Status {
106
- self.result = Some(CharRef {
107
- chars: [c, '\0'],
108
- num_chars: 1,
109
- });
110
- Done
111
- }
112
- }
113
-
114
- impl CharRefTokenizer {
115
- pub fn step<Sink: TokenSink>(
116
- &mut self,
117
- tokenizer: &XmlTokenizer<Sink>,
118
- input: &BufferQueue,
119
- ) -> Status {
120
- if self.result.is_some() {
121
- return Done;
122
- }
123
-
124
- debug!("char ref tokenizer stepping in state {:?}", self.state);
125
- match self.state {
126
- Begin => self.do_begin(tokenizer, input),
127
- Octothorpe => self.do_octothorpe(tokenizer, input),
128
- Numeric(base) => self.do_numeric(tokenizer, base, input),
129
- NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
130
- Named => self.do_named(tokenizer, input),
131
- BogusName => self.do_bogus_name(tokenizer, input),
132
- }
133
- }
134
-
135
- fn do_begin<Sink: TokenSink>(
136
- &mut self,
137
- tokenizer: &XmlTokenizer<Sink>,
138
- input: &BufferQueue,
139
- ) -> Status {
140
- match tokenizer.peek(input) {
141
- Some('\t' | '\n' | '\x0C' | ' ' | '<' | '&') => self.finish_none(),
142
- Some(c) if Some(c) == self.addnl_allowed => self.finish_none(),
143
- Some('#') => {
144
- tokenizer.discard_char(input);
145
- self.state = Octothorpe;
146
- Progress
147
- },
148
- Some(_) => {
149
- self.state = Named;
150
- self.name_buf_opt = Some(StrTendril::new());
151
- Progress
152
- },
153
- None => Stuck,
154
- }
155
- }
156
-
157
- fn do_octothorpe<Sink: TokenSink>(
158
- &mut self,
159
- tokenizer: &XmlTokenizer<Sink>,
160
- input: &BufferQueue,
161
- ) -> Status {
162
- match tokenizer.peek(input) {
163
- Some(c @ ('x' | 'X')) => {
164
- tokenizer.discard_char(input);
165
- self.hex_marker = Some(c);
166
- self.state = Numeric(16);
167
- },
168
- Some(_) => {
169
- self.hex_marker = None;
170
- self.state = Numeric(10);
171
- },
172
- None => return Stuck,
173
- }
174
- Progress
175
- }
176
-
177
- fn do_numeric<Sink: TokenSink>(
178
- &mut self,
179
- tokenizer: &XmlTokenizer<Sink>,
180
- base: u32,
181
- input: &BufferQueue,
182
- ) -> Status {
183
- let Some(c) = tokenizer.peek(input) else {
184
- return Stuck;
185
- };
186
- match c.to_digit(base) {
187
- Some(n) => {
188
- tokenizer.discard_char(input);
189
- self.num = self.num.wrapping_mul(base);
190
- if self.num > 0x10FFFF {
191
- // We might overflow, and the character is definitely invalid.
192
- // We still parse digits and semicolon, but don't use the result.
193
- self.num_too_big = true;
194
- }
195
- self.num = self.num.wrapping_add(n);
196
- self.seen_digit = true;
197
- Progress
198
- },
199
-
200
- None if !self.seen_digit => self.unconsume_numeric(tokenizer, input),
201
-
202
- None => {
203
- self.state = NumericSemicolon;
204
- Progress
205
- },
206
- }
207
- }
208
-
209
- fn do_numeric_semicolon<Sink: TokenSink>(
210
- &mut self,
211
- tokenizer: &XmlTokenizer<Sink>,
212
- input: &BufferQueue,
213
- ) -> Status {
214
- match tokenizer.peek(input) {
215
- Some(';') => tokenizer.discard_char(input),
216
- Some(_) => tokenizer.emit_error(Borrowed(
217
- "Semicolon missing after numeric character reference",
218
- )),
219
- None => return Stuck,
220
- };
221
- self.finish_numeric(tokenizer)
222
- }
223
-
224
- fn unconsume_numeric<Sink: TokenSink>(
225
- &mut self,
226
- tokenizer: &XmlTokenizer<Sink>,
227
- input: &BufferQueue,
228
- ) -> Status {
229
- let mut unconsume = StrTendril::from_char('#');
230
- if let Some(c) = self.hex_marker {
231
- unconsume.push_char(c);
232
- }
233
-
234
- tokenizer.unconsume(input, unconsume);
235
- tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
236
- self.finish_none()
237
- }
238
-
239
- fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &XmlTokenizer<Sink>) -> Status {
240
- fn conv(n: u32) -> char {
241
- from_u32(n).expect("invalid char missed by error handling cases")
242
- }
243
-
244
- let (c, error) = match self.num {
245
- n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),
246
- 0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true),
247
-
248
- 0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] {
249
- Some(c) => (c, true),
250
- None => (conv(self.num), true),
251
- },
252
-
253
- 0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true),
254
-
255
- n if (n & 0xFFFE) == 0xFFFE => (conv(n), true),
256
-
257
- n => (conv(n), false),
258
- };
259
-
260
- if error {
261
- let msg = if tokenizer.opts.exact_errors {
262
- Cow::from(format!(
263
- "Invalid numeric character reference value 0x{:06X}",
264
- self.num
265
- ))
266
- } else {
267
- Cow::from("Invalid numeric character reference")
268
- };
269
- tokenizer.emit_error(msg);
270
- }
271
-
272
- self.finish_one(c)
273
- }
274
-
275
- fn do_named<Sink: TokenSink>(
276
- &mut self,
277
- tokenizer: &XmlTokenizer<Sink>,
278
- input: &BufferQueue,
279
- ) -> Status {
280
- let Some(c) = tokenizer.get_char(input) else {
281
- return Stuck;
282
- };
283
- self.name_buf_mut().push_char(c);
284
- match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
285
- // We have either a full match or a prefix of one.
286
- Some(&m) => {
287
- if m.0 != 0 {
288
- // We have a full match, but there might be a longer one to come.
289
- self.name_match = Some(m);
290
- self.name_len = self.name_buf().len();
291
- }
292
- // Otherwise we just have a prefix match.
293
- Progress
294
- },
295
-
296
- // Can't continue the match.
297
- None => self.finish_named(tokenizer, Some(c), input),
298
- }
299
- }
300
-
301
- fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &XmlTokenizer<Sink>) {
302
- let msg = if tokenizer.opts.exact_errors {
303
- Cow::from(format!("Invalid character reference &{}", self.name_buf()))
304
- } else {
305
- Cow::from("Invalid character reference")
306
- };
307
- tokenizer.emit_error(msg);
308
- }
309
-
310
- fn unconsume_name<Sink: TokenSink>(
311
- &mut self,
312
- tokenizer: &XmlTokenizer<Sink>,
313
- input: &BufferQueue,
314
- ) {
315
- tokenizer.unconsume(input, self.name_buf_opt.take().unwrap());
316
- }
317
-
318
- fn finish_named<Sink: TokenSink>(
319
- &mut self,
320
- tokenizer: &XmlTokenizer<Sink>,
321
- end_char: Option<char>,
322
- input: &BufferQueue,
323
- ) -> Status {
324
- match self.name_match {
325
- None => {
326
- match end_char {
327
- Some(c) if c.is_ascii_alphanumeric() => {
328
- // Keep looking for a semicolon, to determine whether
329
- // we emit a parse error.
330
- self.state = BogusName;
331
- return Progress;
332
- },
333
-
334
- // Check length because &; is not a parse error.
335
- Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer),
336
-
337
- _ => (),
338
- }
339
- self.unconsume_name(tokenizer, input);
340
- self.finish_none()
341
- },
342
-
343
- Some((c1, c2)) => {
344
- // We have a complete match, but we may have consumed
345
- // additional characters into self.name_buf. Usually
346
- // at least one, but several in cases like
347
- //
348
- // &not => match for U+00AC
349
- // &noti => valid prefix for &notin
350
- // &notit => can't continue match
351
-
352
- let name_len = self.name_len;
353
- assert!(name_len > 0);
354
- let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap();
355
-
356
- // There might not be a next character after the match, if
357
- // we had a full match and then hit EOF.
358
- let next_after = if name_len == self.name_buf().len() {
359
- None
360
- } else {
361
- Some(self.name_buf()[name_len..].chars().next().unwrap())
362
- };
363
-
364
- // "If the character reference is being consumed as part of an
365
- // attribute, and the last character matched is not a U+003B
366
- // SEMICOLON character (;), and the next character is either a
367
- // U+003D EQUALS SIGN character (=) or an alphanumeric ASCII
368
- // character, then, for historical reasons, all the characters
369
- // that were matched after the U+0026 AMPERSAND character (&)
370
- // must be unconsumed, and nothing is returned. However, if
371
- // this next character is in fact a U+003D EQUALS SIGN
372
- // character (=), then this is a parse error"
373
-
374
- let unconsume_all = match (self.addnl_allowed, last_matched, next_after) {
375
- (_, ';', _) => false,
376
- (Some(_), _, Some('=')) => {
377
- tokenizer.emit_error(Borrowed(
378
- "Equals sign after character reference in attribute",
379
- ));
380
- true
381
- },
382
- (Some(_), _, Some(c)) if c.is_ascii_alphanumeric() => true,
383
- _ => {
384
- tokenizer.emit_error(Borrowed(
385
- "Character reference does not end with semicolon",
386
- ));
387
- false
388
- },
389
- };
390
-
391
- if unconsume_all {
392
- self.unconsume_name(tokenizer, input);
393
- self.finish_none()
394
- } else {
395
- tokenizer
396
- .unconsume(input, StrTendril::from_slice(&self.name_buf()[name_len..]));
397
- self.result = Some(CharRef {
398
- chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
399
- num_chars: if c2 == 0 { 1 } else { 2 },
400
- });
401
- Done
402
- }
403
- },
404
- }
405
- }
406
-
407
- fn do_bogus_name<Sink: TokenSink>(
408
- &mut self,
409
- tokenizer: &XmlTokenizer<Sink>,
410
- input: &BufferQueue,
411
- ) -> Status {
412
- let Some(c) = tokenizer.get_char(input) else {
413
- return Stuck;
414
- };
415
- self.name_buf_mut().push_char(c);
416
- match c {
417
- _ if c.is_ascii_alphanumeric() => return Progress,
418
- ';' => self.emit_name_error(tokenizer),
419
- _ => (),
420
- }
421
- self.unconsume_name(tokenizer, input);
422
- self.finish_none()
423
- }
424
-
425
- pub fn end_of_file<Sink: TokenSink>(
426
- &mut self,
427
- tokenizer: &XmlTokenizer<Sink>,
428
- input: &BufferQueue,
429
- ) {
430
- while self.result.is_none() {
431
- match self.state {
432
- Begin => drop(self.finish_none()),
433
-
434
- Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)),
435
-
436
- Numeric(_) | NumericSemicolon => {
437
- tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
438
- self.finish_numeric(tokenizer);
439
- },
440
-
441
- Named => drop(self.finish_named(tokenizer, None, input)),
442
-
443
- BogusName => {
444
- self.unconsume_name(tokenizer, input);
445
- self.finish_none();
446
- },
447
-
448
- Octothorpe => {
449
- tokenizer.unconsume(input, StrTendril::from_slice("#"));
450
- tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
451
- self.finish_none();
452
- },
453
- }
454
- }
455
- }
456
- }