html-to-markdown 2.24.6 → 2.25.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/html-to-markdown-rb/native/Cargo.lock +9 -32
  5. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -1
  6. data/lib/html_to_markdown/version.rb +1 -1
  7. data/rust-vendor/html-to-markdown-rs/Cargo.toml +0 -1
  8. data/rust-vendor/html-to-markdown-rs/src/converter/main_helpers.rs +1 -1
  9. data/rust-vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +20 -5
  10. data/rust-vendor/html-to-markdown-rs/src/lib.rs +1 -0
  11. data/rust-vendor/{markup5ever_rcdom/lib.rs → html-to-markdown-rs/src/rcdom.rs} +56 -91
  12. data/rust-vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +157 -0
  13. data/rust-vendor/memmap2/.cargo-checksum.json +1 -1
  14. data/rust-vendor/memmap2/.cargo_vcs_info.json +1 -1
  15. data/rust-vendor/memmap2/CHANGELOG.md +8 -0
  16. data/rust-vendor/memmap2/Cargo.lock +1 -1
  17. data/rust-vendor/memmap2/Cargo.toml +2 -1
  18. data/rust-vendor/memmap2/Cargo.toml.orig +2 -1
  19. data/rust-vendor/memmap2/src/lib.rs +25 -1
  20. data/rust-vendor/memmap2/src/stub.rs +1 -4
  21. data/rust-vendor/memmap2/src/unix.rs +14 -1
  22. data/rust-vendor/png/.cargo-checksum.json +1 -1
  23. data/rust-vendor/png/.cargo_vcs_info.json +1 -1
  24. data/rust-vendor/png/CHANGES.md +44 -0
  25. data/rust-vendor/png/Cargo.lock +124 -171
  26. data/rust-vendor/png/Cargo.toml +1 -1
  27. data/rust-vendor/png/Cargo.toml.orig +1 -1
  28. data/rust-vendor/png/benches/expand_paletted.rs +5 -5
  29. data/rust-vendor/png/benches/unfilter.rs +3 -3
  30. data/rust-vendor/png/src/adam7.rs +17 -10
  31. data/rust-vendor/png/src/common.rs +8 -8
  32. data/rust-vendor/png/src/decoder/mod.rs +53 -20
  33. data/rust-vendor/png/src/decoder/stream.rs +263 -78
  34. data/rust-vendor/png/src/decoder/unfiltering_buffer.rs +210 -53
  35. data/rust-vendor/png/src/decoder/zlib.rs +130 -90
  36. data/rust-vendor/png/src/encoder.rs +4 -2
  37. data/rust-vendor/png/src/{filter.rs → filter/mod.rs} +100 -367
  38. data/rust-vendor/png/src/filter/optimization-notes.md +104 -0
  39. data/rust-vendor/png/src/filter/paeth.rs +398 -0
  40. data/rust-vendor/png/src/filter/simd.rs +308 -0
  41. data/rust-vendor/png/src/lib.rs +1 -0
  42. data/rust-vendor/syn/.cargo-checksum.json +1 -1
  43. data/rust-vendor/syn/.cargo_vcs_info.json +1 -1
  44. data/rust-vendor/syn/Cargo.lock +40 -41
  45. data/rust-vendor/syn/Cargo.toml +1 -1
  46. data/rust-vendor/syn/Cargo.toml.orig +1 -1
  47. data/rust-vendor/syn/src/item.rs +61 -40
  48. data/rust-vendor/syn/src/lib.rs +2 -1
  49. data/rust-vendor/syn/tests/test_item.rs +54 -0
  50. data/rust-vendor/unicode-ident/.cargo-checksum.json +1 -1
  51. data/rust-vendor/unicode-ident/.cargo_vcs_info.json +1 -1
  52. data/rust-vendor/unicode-ident/Cargo.lock +21 -21
  53. data/rust-vendor/unicode-ident/Cargo.toml +1 -1
  54. data/rust-vendor/unicode-ident/Cargo.toml.orig +1 -1
  55. data/rust-vendor/unicode-ident/src/lib.rs +1 -1
  56. data/rust-vendor/unicode-ident/src/tables.rs +87 -97
  57. data/rust-vendor/unicode-ident/tests/static_size.rs +1 -1
  58. metadata +7 -177
  59. data/rust-vendor/markup5ever_rcdom/.cargo-checksum.json +0 -1
  60. data/rust-vendor/markup5ever_rcdom/.cargo_vcs_info.json +0 -7
  61. data/rust-vendor/markup5ever_rcdom/Cargo.lock +0 -658
  62. data/rust-vendor/markup5ever_rcdom/Cargo.toml +0 -109
  63. data/rust-vendor/markup5ever_rcdom/Cargo.toml.orig +0 -42
  64. data/rust-vendor/markup5ever_rcdom/LICENSE-APACHE +0 -201
  65. data/rust-vendor/markup5ever_rcdom/LICENSE-MIT +0 -25
  66. data/rust-vendor/markup5ever_rcdom/README.md +0 -7
  67. data/rust-vendor/markup5ever_rcdom/custom-html5lib-tokenizer-tests/regression.test +0 -69
  68. data/rust-vendor/markup5ever_rcdom/data/test/ignore +0 -1
  69. data/rust-vendor/markup5ever_rcdom/examples/hello_xml.rs +0 -39
  70. data/rust-vendor/markup5ever_rcdom/examples/html2html.rs +0 -51
  71. data/rust-vendor/markup5ever_rcdom/examples/print-rcdom.rs +0 -78
  72. data/rust-vendor/markup5ever_rcdom/examples/xml_tree_printer.rs +0 -67
  73. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.gitattributes +0 -2
  74. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.github/workflows/downstream.yml +0 -76
  75. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.github/workflows/lint.yml +0 -25
  76. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.gitignore +0 -79
  77. data/rust-vendor/markup5ever_rcdom/html5lib-tests/AUTHORS.rst +0 -34
  78. data/rust-vendor/markup5ever_rcdom/html5lib-tests/LICENSE +0 -21
  79. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/chardet/test_big5.txt +0 -51
  80. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/scripted/tests1.dat +0 -5
  81. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/test-yahoo-jp.dat +0 -10
  82. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/tests1.dat +0 -388
  83. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/tests2.dat +0 -115
  84. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint +0 -6
  85. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/__init__.py +0 -0
  86. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/__init__.py +0 -0
  87. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/LICENSE +0 -18
  88. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/__init__.py +0 -0
  89. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/lexer.py +0 -211
  90. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/lexer.pyi +0 -34
  91. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/parser.py +0 -872
  92. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/parser.pyi +0 -83
  93. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/py.typed +0 -0
  94. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/util.py +0 -72
  95. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/util.pyi +0 -7
  96. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/vendor.txt +0 -1
  97. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor-patches/funcparserlib.patch +0 -24
  98. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/lint.py +0 -280
  99. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/parser.py +0 -177
  100. data/rust-vendor/markup5ever_rcdom/html5lib-tests/pyproject.toml +0 -7
  101. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/core.test +0 -125
  102. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/injectmeta.test +0 -66
  103. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/optionaltags.test +0 -965
  104. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/options.test +0 -60
  105. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/whitespace.test +0 -51
  106. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/README.md +0 -107
  107. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/contentModelFlags.test +0 -93
  108. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/domjs.test +0 -335
  109. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/entities.test +0 -542
  110. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/escapeFlag.test +0 -36
  111. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/namedEntities.test +0 -42422
  112. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/numericEntities.test +0 -1677
  113. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/pendingSpecChanges.test +0 -9
  114. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test1.test +0 -353
  115. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test2.test +0 -275
  116. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test3.test +0 -11233
  117. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test4.test +0 -532
  118. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/unicodeChars.test +0 -1577
  119. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/unicodeCharsProblematic.test +0 -41
  120. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/xmlViolation.test +0 -20
  121. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/README.md +0 -108
  122. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/adoption01.dat +0 -354
  123. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/adoption02.dat +0 -39
  124. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/blocks.dat +0 -695
  125. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/comments01.dat +0 -217
  126. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/doctype01.dat +0 -474
  127. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/domjs-unsafe.dat +0 -0
  128. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/entities01.dat +0 -943
  129. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/entities02.dat +0 -309
  130. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/foreign-fragment.dat +0 -645
  131. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/html5test-com.dat +0 -301
  132. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/inbody01.dat +0 -54
  133. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/isindex.dat +0 -49
  134. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/main-element.dat +0 -46
  135. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/math.dat +0 -104
  136. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/menuitem-element.dat +0 -240
  137. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/namespace-sensitivity.dat +0 -22
  138. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/noscript01.dat +0 -237
  139. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/pending-spec-changes-plain-text-unsafe.dat +0 -0
  140. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/pending-spec-changes.dat +0 -46
  141. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/plain-text-unsafe.dat +0 -0
  142. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/quirks01.dat +0 -53
  143. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/ruby.dat +0 -302
  144. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scriptdata01.dat +0 -372
  145. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scripted/adoption01.dat +0 -16
  146. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scripted/ark.dat +0 -27
  147. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scripted/webkit01.dat +0 -30
  148. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/search-element.dat +0 -46
  149. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/svg.dat +0 -104
  150. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tables01.dat +0 -322
  151. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/template.dat +0 -1673
  152. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests1.dat +0 -1956
  153. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests10.dat +0 -849
  154. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests11.dat +0 -523
  155. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests12.dat +0 -62
  156. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests14.dat +0 -75
  157. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests15.dat +0 -216
  158. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests16.dat +0 -2602
  159. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests17.dat +0 -179
  160. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests18.dat +0 -558
  161. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests19.dat +0 -1398
  162. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests2.dat +0 -831
  163. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests20.dat +0 -842
  164. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests21.dat +0 -306
  165. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests22.dat +0 -190
  166. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests23.dat +0 -168
  167. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests24.dat +0 -79
  168. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests25.dat +0 -288
  169. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests26.dat +0 -453
  170. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests3.dat +0 -305
  171. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests4.dat +0 -74
  172. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests5.dat +0 -210
  173. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests6.dat +0 -663
  174. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests7.dat +0 -453
  175. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests8.dat +0 -165
  176. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests9.dat +0 -472
  177. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests_innerHTML_1.dat +0 -843
  178. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tricky01.dat +0 -336
  179. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/webkit01.dat +0 -785
  180. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/webkit02.dat +0 -554
  181. data/rust-vendor/markup5ever_rcdom/tests/foreach_html5lib_test/mod.rs +0 -41
  182. data/rust-vendor/markup5ever_rcdom/tests/html-driver.rs +0 -29
  183. data/rust-vendor/markup5ever_rcdom/tests/html-serializer.rs +0 -265
  184. data/rust-vendor/markup5ever_rcdom/tests/html-tokenizer.rs +0 -487
  185. data/rust-vendor/markup5ever_rcdom/tests/html-tree-builder.rs +0 -298
  186. data/rust-vendor/markup5ever_rcdom/tests/html-tree-sink.rs +0 -141
  187. data/rust-vendor/markup5ever_rcdom/tests/util/find_tests.rs +0 -34
  188. data/rust-vendor/markup5ever_rcdom/tests/util/runner.rs +0 -48
  189. data/rust-vendor/markup5ever_rcdom/tests/xml-driver.rs +0 -101
  190. data/rust-vendor/markup5ever_rcdom/tests/xml-tokenizer.rs +0 -374
  191. data/rust-vendor/markup5ever_rcdom/tests/xml-tree-builder.rs +0 -237
  192. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/AUTHORS.rst +0 -9
  193. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/LICENSE +0 -21
  194. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/README.md +0 -92
  195. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/comments.test +0 -274
  196. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/doctype.test +0 -3232
  197. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/entities.test +0 -283
  198. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/eof.test +0 -113
  199. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/namedEntities.test +0 -42210
  200. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/numericEntities.test +0 -1349
  201. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/test1.test +0 -162
  202. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/test2.test +0 -64
  203. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/unicodeChars.test +0 -1295
  204. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tree-construction/README.md +0 -104
  205. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tree-construction/namespace.dat +0 -119
  206. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tree-construction/test1.dat +0 -124
  207. data/rust-vendor/xml5ever/.cargo-checksum.json +0 -1
  208. data/rust-vendor/xml5ever/.cargo_vcs_info.json +0 -6
  209. data/rust-vendor/xml5ever/Cargo.lock +0 -752
  210. data/rust-vendor/xml5ever/Cargo.toml +0 -69
  211. data/rust-vendor/xml5ever/Cargo.toml.orig +0 -29
  212. data/rust-vendor/xml5ever/LICENSE-APACHE +0 -201
  213. data/rust-vendor/xml5ever/LICENSE-MIT +0 -25
  214. data/rust-vendor/xml5ever/README.md +0 -72
  215. data/rust-vendor/xml5ever/benches/xml5ever.rs +0 -77
  216. data/rust-vendor/xml5ever/data/bench/strong.xml +0 -1
  217. data/rust-vendor/xml5ever/examples/README.md +0 -223
  218. data/rust-vendor/xml5ever/examples/example.xml +0 -3
  219. data/rust-vendor/xml5ever/examples/simple_xml_tokenizer.rs +0 -81
  220. data/rust-vendor/xml5ever/examples/xml_tokenizer.rs +0 -115
  221. data/rust-vendor/xml5ever/src/driver.rs +0 -90
  222. data/rust-vendor/xml5ever/src/lib.rs +0 -47
  223. data/rust-vendor/xml5ever/src/macros.rs +0 -18
  224. data/rust-vendor/xml5ever/src/serialize/mod.rs +0 -216
  225. data/rust-vendor/xml5ever/src/tokenizer/char_ref/mod.rs +0 -456
  226. data/rust-vendor/xml5ever/src/tokenizer/interface.rs +0 -116
  227. data/rust-vendor/xml5ever/src/tokenizer/mod.rs +0 -1344
  228. data/rust-vendor/xml5ever/src/tokenizer/qname.rs +0 -84
  229. data/rust-vendor/xml5ever/src/tokenizer/states.rs +0 -167
  230. data/rust-vendor/xml5ever/src/tree_builder/mod.rs +0 -774
  231. data/rust-vendor/xml5ever/src/tree_builder/types.rs +0 -37
@@ -1,1344 +0,0 @@
1
- // Copyright 2014-2017 The html5ever Project Developers. See the
2
- // COPYRIGHT file at the top-level directory of this distribution.
3
- //
4
- // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5
- // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6
- // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7
- // option. This file may not be copied, modified, or distributed
8
- // except according to those terms.
9
-
10
- mod char_ref;
11
- mod interface;
12
- mod qname;
13
- pub mod states;
14
-
15
- pub use self::interface::{
16
- Doctype, EmptyTag, EndTag, Pi, ShortTag, StartTag, Tag, TagKind, Token, TokenSink,
17
- };
18
- pub use crate::{LocalName, Namespace, Prefix};
19
-
20
- use crate::macros::time;
21
- use crate::tendril::StrTendril;
22
- use crate::{buffer_queue, Attribute, QualName, SmallCharSet};
23
- use log::debug;
24
- use markup5ever::{local_name, namespace_prefix, ns, small_char_set, TokenizerResult};
25
- use std::borrow::Cow::{self, Borrowed};
26
- use std::cell::{Cell, RefCell, RefMut};
27
- use std::collections::BTreeMap;
28
- use std::mem::replace;
29
-
30
- use buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
31
- use char_ref::{CharRef, CharRefTokenizer};
32
- use qname::QualNameTokenizer;
33
- use states::{AttrValueKind::*, DoctypeKind, DoctypeKind::*, XmlState};
34
-
35
- /// Copy of Tokenizer options, with an impl for `Default`.
36
- #[derive(Copy, Clone)]
37
- pub struct XmlTokenizerOpts {
38
- /// Report all parse errors described in the spec, at some
39
- /// performance penalty? Default: false
40
- pub exact_errors: bool,
41
-
42
- /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning
43
- /// of the stream? Default: true
44
- pub discard_bom: bool,
45
-
46
- /// Keep a record of how long we spent in each state? Printed
47
- /// when `end()` is called. Default: false
48
- pub profile: bool,
49
-
50
- /// Initial state override. Only the test runner should use
51
- /// a non-`None` value!
52
- pub initial_state: Option<XmlState>,
53
- }
54
-
55
- fn process_qname(tag_name: StrTendril) -> QualName {
56
- // If tag name can't possibly contain full namespace, skip qualified name
57
- // parsing altogether. For a tag to have namespace it must look like:
58
- // a:b
59
- // Since StrTendril are UTF-8, we know that minimal size in bytes must be
60
- // three bytes minimum.
61
- let split = if (*tag_name).len() < 3 {
62
- None
63
- } else {
64
- QualNameTokenizer::new((*tag_name).as_bytes()).run()
65
- };
66
-
67
- match split {
68
- None => QualName::new(None, ns!(), LocalName::from(&*tag_name)),
69
- Some(col) => {
70
- let len = (*tag_name).len() as u32;
71
- let prefix = tag_name.subtendril(0, col);
72
- let local = tag_name.subtendril(col + 1, len - col - 1);
73
- let ns = ns!(); // Actual namespace URL set in XmlTreeBuilder::bind_qname
74
- QualName::new(Some(Prefix::from(&*prefix)), ns, LocalName::from(&*local))
75
- },
76
- }
77
- }
78
-
79
- fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
80
- match *opt_str {
81
- Some(ref mut s) => s.push_char(c),
82
- None => *opt_str = Some(StrTendril::from_char(c)),
83
- }
84
- }
85
-
86
- impl Default for XmlTokenizerOpts {
87
- fn default() -> XmlTokenizerOpts {
88
- XmlTokenizerOpts {
89
- exact_errors: false,
90
- discard_bom: true,
91
- profile: false,
92
- initial_state: None,
93
- }
94
- }
95
- }
96
- /// The Xml tokenizer.
97
- pub struct XmlTokenizer<Sink> {
98
- /// Options controlling the behavior of the tokenizer.
99
- opts: XmlTokenizerOpts,
100
-
101
- /// Destination for tokens we emit.
102
- pub sink: Sink,
103
-
104
- /// The abstract machine state as described in the spec.
105
- state: Cell<XmlState>,
106
-
107
- /// Are we at the end of the file, once buffers have been processed
108
- /// completely? This affects whether we will wait for lookahead or not.
109
- at_eof: Cell<bool>,
110
-
111
- /// Tokenizer for character references, if we're tokenizing
112
- /// one at the moment.
113
- char_ref_tokenizer: RefCell<Option<Box<CharRefTokenizer>>>,
114
-
115
- /// Current input character. Just consumed, may reconsume.
116
- current_char: Cell<char>,
117
-
118
- /// Should we reconsume the current input character?
119
- reconsume: Cell<bool>,
120
-
121
- /// Did we just consume \r, translating it to \n? In that case we need
122
- /// to ignore the next character if it's \n.
123
- ignore_lf: Cell<bool>,
124
-
125
- /// Discard a U+FEFF BYTE ORDER MARK if we see one? Only done at the
126
- /// beginning of the stream.
127
- discard_bom: Cell<bool>,
128
-
129
- /// Temporary buffer
130
- temp_buf: RefCell<StrTendril>,
131
-
132
- /// Current tag kind.
133
- current_tag_kind: Cell<TagKind>,
134
-
135
- /// Current tag name.
136
- current_tag_name: RefCell<StrTendril>,
137
-
138
- /// Current tag attributes.
139
- current_tag_attrs: RefCell<Vec<Attribute>>,
140
-
141
- /// Current attribute name.
142
- current_attr_name: RefCell<StrTendril>,
143
-
144
- /// Current attribute value.
145
- current_attr_value: RefCell<StrTendril>,
146
-
147
- current_doctype: RefCell<Doctype>,
148
-
149
- /// Current comment.
150
- current_comment: RefCell<StrTendril>,
151
-
152
- /// Current processing instruction target.
153
- current_pi_target: RefCell<StrTendril>,
154
-
155
- /// Current processing instruction value.
156
- current_pi_data: RefCell<StrTendril>,
157
-
158
- /// Record of how many ns we spent in each state, if profiling is enabled.
159
- state_profile: RefCell<BTreeMap<XmlState, u64>>,
160
-
161
- /// Record of how many ns we spent in the token sink.
162
- time_in_sink: Cell<u64>,
163
- }
164
-
165
- impl<Sink: TokenSink> XmlTokenizer<Sink> {
166
- /// Create a new tokenizer which feeds tokens to a particular `TokenSink`.
167
- pub fn new(sink: Sink, opts: XmlTokenizerOpts) -> XmlTokenizer<Sink> {
168
- if opts.profile && cfg!(for_c) {
169
- panic!("Can't profile tokenizer when built as a C library");
170
- }
171
-
172
- let state = *opts.initial_state.as_ref().unwrap_or(&XmlState::Data);
173
- let discard_bom = opts.discard_bom;
174
- XmlTokenizer {
175
- opts,
176
- sink,
177
- state: Cell::new(state),
178
- char_ref_tokenizer: RefCell::new(None),
179
- at_eof: Cell::new(false),
180
- current_char: Cell::new('\0'),
181
- reconsume: Cell::new(false),
182
- ignore_lf: Cell::new(false),
183
- temp_buf: RefCell::new(StrTendril::new()),
184
- discard_bom: Cell::new(discard_bom),
185
- current_tag_kind: Cell::new(StartTag),
186
- current_tag_name: RefCell::new(StrTendril::new()),
187
- current_tag_attrs: RefCell::new(vec![]),
188
- current_attr_name: RefCell::new(StrTendril::new()),
189
- current_attr_value: RefCell::new(StrTendril::new()),
190
- current_comment: RefCell::new(StrTendril::new()),
191
- current_pi_data: RefCell::new(StrTendril::new()),
192
- current_pi_target: RefCell::new(StrTendril::new()),
193
- current_doctype: RefCell::new(Doctype::default()),
194
- state_profile: RefCell::new(BTreeMap::new()),
195
- time_in_sink: Cell::new(0),
196
- }
197
- }
198
-
199
- /// Feed an input string into the tokenizer.
200
- pub fn feed(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
201
- if input.is_empty() {
202
- return TokenizerResult::Done;
203
- }
204
-
205
- if self.discard_bom.get() {
206
- if let Some(c) = input.peek() {
207
- if c == '\u{feff}' {
208
- input.next();
209
- }
210
- } else {
211
- return TokenizerResult::Done;
212
- }
213
- };
214
-
215
- self.run(input)
216
- }
217
-
218
- fn process_token(&self, token: Token) -> ProcessResult<Sink::Handle> {
219
- if self.opts.profile {
220
- let (result, dt) = time!(self.sink.process_token(token));
221
- self.time_in_sink.set(self.time_in_sink.get() + dt);
222
- result
223
- } else {
224
- self.sink.process_token(token)
225
- }
226
- }
227
-
228
- // Get the next input character, which might be the character
229
- // 'c' that we already consumed from the buffers.
230
- fn get_preprocessed_char(&self, mut c: char, input: &BufferQueue) -> Option<char> {
231
- if self.ignore_lf.get() {
232
- self.ignore_lf.set(false);
233
- if c == '\n' {
234
- c = input.next()?;
235
- }
236
- }
237
-
238
- if c == '\r' {
239
- self.ignore_lf.set(true);
240
- c = '\n';
241
- }
242
-
243
- // Normalize \x00 into \uFFFD
244
- if c == '\x00' {
245
- c = '\u{FFFD}'
246
- }
247
-
248
- // Exclude forbidden Unicode characters
249
- if self.opts.exact_errors
250
- && match c as u32 {
251
- 0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true,
252
- n if (n & 0xFFFE) == 0xFFFE => true,
253
- _ => false,
254
- }
255
- {
256
- let msg = format!("Bad character {c}");
257
- self.emit_error(Cow::Owned(msg));
258
- }
259
-
260
- debug!("got character {c}");
261
- self.current_char.set(c);
262
- Some(c)
263
- }
264
-
265
- fn bad_eof_error(&self) {
266
- let msg = if self.opts.exact_errors {
267
- Cow::from(format!("Saw EOF in state {:?}", self.state))
268
- } else {
269
- Cow::from("Unexpected EOF")
270
- };
271
- self.emit_error(msg);
272
- }
273
-
274
- fn pop_except_from(&self, input: &BufferQueue, set: SmallCharSet) -> Option<SetResult> {
275
- // Bail to the slow path for various corner cases.
276
- // This means that `FromSet` can contain characters not in the set!
277
- // It shouldn't matter because the fallback `FromSet` case should
278
- // always do the same thing as the `NotFromSet` case.
279
- if self.opts.exact_errors || self.reconsume.get() || self.ignore_lf.get() {
280
- return self.get_char(input).map(FromSet);
281
- }
282
-
283
- let d = input.pop_except_from(set);
284
- debug!("got characters {d:?}");
285
- match d {
286
- Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(FromSet),
287
-
288
- // NB: We don't set self.current_char for a run of characters not
289
- // in the set. It shouldn't matter for the codepaths that use
290
- // this.
291
- _ => d,
292
- }
293
- }
294
-
295
- // Check if the next characters are an ASCII case-insensitive match. See
296
- // BufferQueue::eat.
297
- //
298
- // NB: this doesn't do input stream preprocessing or set the current input
299
- // character.
300
- fn eat(&self, input: &BufferQueue, pat: &str) -> Option<bool> {
301
- input.push_front(replace(&mut *self.temp_buf.borrow_mut(), StrTendril::new()));
302
- match input.eat(pat, u8::eq_ignore_ascii_case) {
303
- None if self.at_eof.get() => Some(false),
304
- None => {
305
- let mut temp_buf = self.temp_buf.borrow_mut();
306
- while let Some(data) = input.next() {
307
- temp_buf.push_char(data);
308
- }
309
- None
310
- },
311
- Some(matched) => Some(matched),
312
- }
313
- }
314
-
315
- /// Run the state machine for as long as we can.
316
- pub fn run(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
317
- if self.opts.profile {
318
- loop {
319
- let state = self.state.get();
320
- let old_sink = self.time_in_sink.get();
321
- let (run, mut dt) = time!(self.step(input));
322
- dt -= self.time_in_sink.get() - old_sink;
323
- let new = match self.state_profile.borrow_mut().get_mut(&state) {
324
- Some(x) => {
325
- *x += dt;
326
- false
327
- },
328
- None => true,
329
- };
330
- if new {
331
- // do this here because of borrow shenanigans
332
- self.state_profile.borrow_mut().insert(state, dt);
333
- }
334
- match run {
335
- ProcessResult::Continue => continue,
336
- ProcessResult::Done => return TokenizerResult::Done,
337
- ProcessResult::Script(handle) => return TokenizerResult::Script(handle),
338
- }
339
- }
340
- } else {
341
- loop {
342
- match self.step(input) {
343
- ProcessResult::Continue => continue,
344
- ProcessResult::Done => return TokenizerResult::Done,
345
- ProcessResult::Script(handle) => return TokenizerResult::Script(handle),
346
- }
347
- }
348
- }
349
- }
350
-
351
- //§ tokenization
352
- // Get the next input character, if one is available.
353
- fn get_char(&self, input: &BufferQueue) -> Option<char> {
354
- if self.reconsume.get() {
355
- self.reconsume.set(false);
356
- Some(self.current_char.get())
357
- } else {
358
- input
359
- .next()
360
- .and_then(|c| self.get_preprocessed_char(c, input))
361
- }
362
- }
363
-
364
- fn bad_char_error(&self) {
365
- let msg = if self.opts.exact_errors {
366
- let c = self.current_char.get();
367
- let state = self.state.get();
368
- Cow::from(format!("Saw {c} in state {state:?}"))
369
- } else {
370
- Cow::from("Bad character")
371
- };
372
- self.emit_error(msg);
373
- }
374
-
375
- fn discard_tag(&self) {
376
- *self.current_tag_name.borrow_mut() = StrTendril::new();
377
- *self.current_tag_attrs.borrow_mut() = Vec::new();
378
- }
379
-
380
- fn create_tag(&self, kind: TagKind, c: char) {
381
- self.discard_tag();
382
- self.current_tag_name.borrow_mut().push_char(c);
383
- self.current_tag_kind.set(kind);
384
- }
385
-
386
- // This method creates a PI token and
387
- // sets its target to given char
388
- fn create_pi(&self, c: char) {
389
- *self.current_pi_target.borrow_mut() = StrTendril::new();
390
- *self.current_pi_data.borrow_mut() = StrTendril::new();
391
- self.current_pi_target.borrow_mut().push_char(c);
392
- }
393
-
394
- fn emit_char(&self, c: char) {
395
- self.process_token(Token::Characters(StrTendril::from_char(match c {
396
- '\0' => '\u{FFFD}',
397
- c => c,
398
- })));
399
- }
400
-
401
- fn emit_short_tag(&self) -> ProcessResult<Sink::Handle> {
402
- self.current_tag_kind.set(ShortTag);
403
- *self.current_tag_name.borrow_mut() = StrTendril::new();
404
- self.emit_current_tag()
405
- }
406
-
407
- fn emit_empty_tag(&self) -> ProcessResult<Sink::Handle> {
408
- self.current_tag_kind.set(EmptyTag);
409
- self.emit_current_tag()
410
- }
411
-
412
- fn set_empty_tag(&self) {
413
- self.current_tag_kind.set(EmptyTag);
414
- }
415
-
416
- fn emit_start_tag(&self) -> ProcessResult<Sink::Handle> {
417
- self.current_tag_kind.set(StartTag);
418
- self.emit_current_tag()
419
- }
420
-
421
- fn emit_current_tag(&self) -> ProcessResult<Sink::Handle> {
422
- self.finish_attribute();
423
-
424
- let qname = process_qname(replace(
425
- &mut *self.current_tag_name.borrow_mut(),
426
- StrTendril::new(),
427
- ));
428
-
429
- match self.current_tag_kind.get() {
430
- StartTag | EmptyTag => {},
431
- EndTag => {
432
- if !self.current_tag_attrs.borrow().is_empty() {
433
- self.emit_error(Borrowed("Attributes on an end tag"));
434
- }
435
- },
436
- ShortTag => {
437
- if !self.current_tag_attrs.borrow().is_empty() {
438
- self.emit_error(Borrowed("Attributes on a short tag"));
439
- }
440
- },
441
- }
442
-
443
- let token = Token::Tag(Tag {
444
- kind: self.current_tag_kind.get(),
445
- name: qname,
446
- attrs: self.current_tag_attrs.take(),
447
- });
448
-
449
- self.process_token(token)
450
- }
451
-
452
- // The string must not contain '\0'!
453
- fn emit_chars(&self, b: StrTendril) {
454
- self.process_token(Token::Characters(b));
455
- }
456
-
457
- // Emits the current Processing Instruction
458
- fn emit_pi(&self) -> ProcessResult<<Sink as TokenSink>::Handle> {
459
- let token = Token::ProcessingInstruction(Pi {
460
- target: replace(&mut *self.current_pi_target.borrow_mut(), StrTendril::new()),
461
- data: replace(&mut *self.current_pi_data.borrow_mut(), StrTendril::new()),
462
- });
463
- self.process_token(token)
464
- }
465
-
466
- fn consume_char_ref(&self, addnl_allowed: Option<char>) {
467
- // NB: The char ref tokenizer assumes we have an additional allowed
468
- // character iff we're tokenizing in an attribute value.
469
- *self.char_ref_tokenizer.borrow_mut() =
470
- Some(Box::new(CharRefTokenizer::new(addnl_allowed)));
471
- }
472
-
473
- fn emit_eof(&self) {
474
- self.process_token(Token::EndOfFile);
475
- }
476
-
477
- fn emit_error(&self, error: Cow<'static, str>) {
478
- self.process_token(Token::ParseError(error));
479
- }
480
-
481
- fn emit_current_comment(&self) {
482
- let comment = self.current_comment.take();
483
- self.process_token(Token::Comment(comment));
484
- }
485
-
486
- fn emit_current_doctype(&self) {
487
- let doctype = self.current_doctype.take();
488
- self.process_token(Token::Doctype(doctype));
489
- }
490
-
491
- fn doctype_id(&self, kind: DoctypeKind) -> RefMut<'_, Option<StrTendril>> {
492
- let current_doctype = self.current_doctype.borrow_mut();
493
- match kind {
494
- DoctypeKind::Public => RefMut::map(current_doctype, |d| &mut d.public_id),
495
- DoctypeKind::System => RefMut::map(current_doctype, |d| &mut d.system_id),
496
- }
497
- }
498
-
499
- fn clear_doctype_id(&self, kind: DoctypeKind) {
500
- let mut id = self.doctype_id(kind);
501
- match *id {
502
- Some(ref mut s) => s.clear(),
503
- None => *id = Some(StrTendril::new()),
504
- }
505
- }
506
-
507
- fn peek(&self, input: &BufferQueue) -> Option<char> {
508
- if self.reconsume.get() {
509
- Some(self.current_char.get())
510
- } else {
511
- input.peek()
512
- }
513
- }
514
-
515
- fn discard_char(&self, input: &BufferQueue) {
516
- let c = self.get_char(input);
517
- assert!(c.is_some());
518
- }
519
-
520
- fn unconsume(&self, input: &BufferQueue, buf: StrTendril) {
521
- input.push_front(buf);
522
- }
523
- }
524
-
525
- // Shorthand for common state machine behaviors.
526
- macro_rules! shorthand (
527
- ( $me:ident : emit $c:expr ) => ( $me.emit_char($c) );
528
- ( $me:ident : create_tag $kind:ident $c:expr ) => ( $me.create_tag($kind, $c) );
529
- ( $me:ident : push_tag $c:expr ) => ( $me.current_tag_name.borrow_mut().push_char($c) );
530
- ( $me:ident : discard_tag $input:expr ) => ( $me.discard_tag($input) );
531
- ( $me:ident : discard_char ) => ( $me.discard_char() );
532
- ( $me:ident : push_temp $c:expr ) => ( $me.temp_buf.borrow_mut().push_char($c) );
533
- ( $me:ident : emit_temp ) => ( $me.emit_temp_buf() );
534
- ( $me:ident : clear_temp ) => ( $me.clear_temp_buf() );
535
- ( $me:ident : create_attr $c:expr ) => ( $me.create_attribute($c) );
536
- ( $me:ident : push_name $c:expr ) => ( $me.current_attr_name.borrow_mut().push_char($c) );
537
- ( $me:ident : push_value $c:expr ) => ( $me.current_attr_value.borrow_mut().push_char($c) );
538
- ( $me:ident : append_value $c:expr ) => ( $me.current_attr_value.borrow_mut().push_tendril($c));
539
- ( $me:ident : push_comment $c:expr ) => ( $me.current_comment.borrow_mut().push_char($c) );
540
- ( $me:ident : append_comment $c:expr ) => ( $me.current_comment.borrow_mut().push_slice($c) );
541
- ( $me:ident : emit_comment ) => ( $me.emit_current_comment() );
542
- ( $me:ident : clear_comment ) => ( $me.current_comment.borrow_mut().clear() );
543
- ( $me:ident : create_doctype ) => ( *$me.current_doctype.borrow_mut() = Doctype::default() );
544
- ( $me:ident : push_doctype_name $c:expr ) => ( option_push(&mut $me.current_doctype.borrow_mut().name, $c) );
545
- ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push(&mut $me.doctype_id($k), $c) );
546
- ( $me:ident : clear_doctype_id $k:ident ) => ( $me.clear_doctype_id($k) );
547
- ( $me:ident : emit_doctype ) => ( $me.emit_current_doctype() );
548
- ( $me:ident : error ) => ( $me.bad_char_error() );
549
- ( $me:ident : error_eof ) => ( $me.bad_eof_error() );
550
- ( $me:ident : create_pi $c:expr ) => ( $me.create_pi($c) );
551
- ( $me:ident : push_pi_target $c:expr ) => ( $me.current_pi_target.borrow_mut().push_char($c) );
552
- ( $me:ident : push_pi_data $c:expr ) => ( $me.current_pi_data.borrow_mut().push_char($c) );
553
- ( $me:ident : set_empty_tag ) => ( $me.set_empty_tag() );
554
- );
555
-
556
- // Tracing of tokenizer actions. This adds significant bloat and compile time,
557
- // so it's behind a cfg flag.
558
- #[cfg(feature = "trace_tokenizer")]
559
- macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
560
- debug!(" {:?}", stringify!($($cmds)*));
561
- shorthand!($me : $($cmds)*);
562
- }));
563
-
564
- #[cfg(not(feature = "trace_tokenizer"))]
565
- macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) );
566
-
567
- // A little DSL for sequencing shorthand actions.
568
- macro_rules! go (
569
- // A pattern like $($cmd:tt)* ; $($rest:tt)* causes parse ambiguity.
570
- // We have to tell the parser how much lookahead we need.
571
-
572
- ( $me:ident : $a:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a); go!($me: $($rest)*); });
573
- ( $me:ident : $a:tt $b:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b); go!($me: $($rest)*); });
574
- ( $me:ident : $a:tt $b:tt $c:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c); go!($me: $($rest)*); });
575
- ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); });
576
-
577
- // These can only come at the end.
578
-
579
- ( $me:ident : to $s:ident ) => ({ $me.state.set(XmlState::$s); return ProcessResult::Continue; });
580
- ( $me:ident : to $s:ident $k1:expr ) => ({ $me.state.set(XmlState::$s($k1)); return ProcessResult::Continue; });
581
- ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state.set(XmlState::$s($k1($k2))); return ProcessResult::Continue; });
582
-
583
- ( $me:ident : reconsume $s:ident ) => ({ $me.reconsume.set(true); go!($me: to $s); });
584
- ( $me:ident : reconsume $s:ident $k1:expr ) => ({ $me.reconsume.set(true); go!($me: to $s $k1); });
585
- ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume.set(true); go!($me: to $s $k1 $k2); });
586
-
587
- ( $me:ident : consume_char_ref ) => ({ $me.consume_char_ref(None); return ProcessResult::Continue; });
588
- ( $me:ident : consume_char_ref $addnl:expr ) => ({ $me.consume_char_ref(Some($addnl)); return ProcessResult::Continue; });
589
-
590
- // We have a default next state after emitting a tag, but the sink can override.
591
- ( $me:ident : emit_tag $s:ident ) => ({
592
- $me.state.set(XmlState::$s);
593
- return $me.emit_current_tag();
594
- });
595
-
596
- // We have a special when dealing with empty and short tags in Xml
597
- ( $me:ident : emit_short_tag $s:ident ) => ({
598
- $me.state.set(XmlState::$s);
599
- return $me.emit_short_tag();
600
- });
601
-
602
- ( $me:ident : emit_empty_tag $s:ident ) => ({
603
- $me.state.set(XmlState::$s);
604
- return $me.emit_empty_tag();
605
- });
606
-
607
- ( $me:ident : emit_start_tag $s:ident ) => ({
608
- $me.state.set(XmlState::$s);
609
- return $me.emit_start_tag();
610
- });
611
-
612
- ( $me:ident : emit_pi $s:ident ) => ({
613
- $me.state.set(XmlState::$s);
614
- return $me.emit_pi();
615
- });
616
-
617
- ( $me:ident : eof ) => ({ $me.emit_eof(); return ProcessResult::Done; });
618
-
619
- // If nothing else matched, it's a single command
620
- ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+) );
621
-
622
- // or nothing.
623
- ( $me:ident : ) => (());
624
- );
625
-
626
- // This is a macro because it can cause early return
627
- // from the function where it is used.
628
- macro_rules! get_char ( ($me:expr, $input:expr) => {{
629
- let Some(character) = $me.get_char($input) else {
630
- return ProcessResult::Done;
631
- };
632
- character
633
- }});
634
-
635
- macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => {{
636
- let Some(popped_element) = $me.pop_except_from($input, $set) else {
637
- return ProcessResult::Done;
638
- };
639
- popped_element
640
- }});
641
-
642
- macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => {{
643
- let Some(value) = $me.eat($input, $pat) else {
644
- return ProcessResult::Done;
645
- };
646
- value
647
- }});
648
-
649
- /// The result of a single tokenization operation
650
- pub enum ProcessResult<Handle> {
651
- /// The tokenizer needs more input before it can continue
652
- Done,
653
- /// The tokenizer can be invoked again immediately
654
- Continue,
655
- /// The tokenizer encountered a script element that must be executed
656
- /// before tokenization can continue
657
- Script(Handle),
658
- }
659
-
660
- impl<Sink: TokenSink> XmlTokenizer<Sink> {
661
- // Run the state machine for a while.
662
- #[allow(clippy::never_loop)]
663
- fn step(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
664
- if self.char_ref_tokenizer.borrow().is_some() {
665
- return self.step_char_ref_tokenizer(input);
666
- }
667
-
668
- debug!("processing in state {:?}", self.state);
669
- match self.state.get() {
670
- //§ data-state
671
- XmlState::Data => loop {
672
- match pop_except_from!(self, input, small_char_set!('\r' '&' '<')) {
673
- FromSet('&') => go!(self: consume_char_ref),
674
- FromSet('<') => go!(self: to TagState),
675
- FromSet(c) => go!(self: emit c),
676
- NotFromSet(b) => self.emit_chars(b),
677
- }
678
- },
679
- //§ tag-state
680
- XmlState::TagState => loop {
681
- match get_char!(self, input) {
682
- '!' => go!(self: to MarkupDecl),
683
- '/' => go!(self: to EndTagState),
684
- '?' => go!(self: to Pi),
685
- '\t' | '\n' | ' ' | ':' | '<' | '>' => {
686
- go!(self: error; emit '<'; reconsume Data)
687
- },
688
- cl => go!(self: create_tag StartTag cl; to TagName),
689
- }
690
- },
691
- //§ end-tag-state
692
- XmlState::EndTagState => loop {
693
- match get_char!(self, input) {
694
- '>' => go!(self: emit_short_tag Data),
695
- '\t' | '\n' | ' ' | '<' | ':' => {
696
- go!(self: error; emit '<'; emit '/'; reconsume Data)
697
- },
698
- cl => go!(self: create_tag EndTag cl; to EndTagName),
699
- }
700
- },
701
- //§ end-tag-name-state
702
- XmlState::EndTagName => loop {
703
- match get_char!(self, input) {
704
- '\t' | '\n' | ' ' => go!(self: to EndTagNameAfter),
705
- '/' => go!(self: error; to EndTagNameAfter),
706
- '>' => go!(self: emit_tag Data),
707
- cl => go!(self: push_tag cl),
708
- }
709
- },
710
- //§ end-tag-name-after-state
711
- XmlState::EndTagNameAfter => loop {
712
- match get_char!(self, input) {
713
- '>' => go!(self: emit_tag Data),
714
- '\t' | '\n' | ' ' => (),
715
- _ => self.emit_error(Borrowed("Unexpected element in tag name")),
716
- }
717
- },
718
- //§ pi-state
719
- XmlState::Pi => loop {
720
- match get_char!(self, input) {
721
- '\t' | '\n' | ' ' => go!(self: error; reconsume BogusComment),
722
- cl => go!(self: create_pi cl; to PiTarget),
723
- }
724
- },
725
- //§ pi-target-state
726
- XmlState::PiTarget => loop {
727
- match get_char!(self, input) {
728
- '\t' | '\n' | ' ' => go!(self: to PiTargetAfter),
729
- '?' => go!(self: to PiAfter),
730
- cl => go!(self: push_pi_target cl),
731
- }
732
- },
733
- //§ pi-target-after-state
734
- XmlState::PiTargetAfter => loop {
735
- match get_char!(self, input) {
736
- '\t' | '\n' | ' ' => (),
737
- _ => go!(self: reconsume PiData),
738
- }
739
- },
740
- //§ pi-data-state
741
- XmlState::PiData => loop {
742
- match get_char!(self, input) {
743
- '?' => go!(self: to PiAfter),
744
- cl => go!(self: push_pi_data cl),
745
- }
746
- },
747
- //§ pi-after-state
748
- XmlState::PiAfter => loop {
749
- match get_char!(self, input) {
750
- '>' => go!(self: emit_pi Data),
751
- '?' => go!(self: to PiAfter),
752
- cl => go!(self: push_pi_data cl),
753
- }
754
- },
755
- //§ markup-declaration-state
756
- XmlState::MarkupDecl => loop {
757
- if eat!(self, input, "--") {
758
- go!(self: clear_comment; to CommentStart);
759
- } else if eat!(self, input, "[CDATA[") {
760
- go!(self: to Cdata);
761
- } else if eat!(self, input, "DOCTYPE") {
762
- go!(self: to Doctype);
763
- } else {
764
- // FIXME: 'error' gives wrong message
765
- go!(self: error; to BogusComment);
766
- }
767
- },
768
- //§ comment-start-state
769
- XmlState::CommentStart => loop {
770
- match get_char!(self, input) {
771
- '-' => go!(self: to CommentStartDash),
772
- '>' => go!(self: error; emit_comment; to Data),
773
- _ => go!(self: reconsume Comment),
774
- }
775
- },
776
- //§ comment-start-dash-state
777
- XmlState::CommentStartDash => loop {
778
- match get_char!(self, input) {
779
- '-' => go!(self: to CommentEnd),
780
- '>' => go!(self: error; emit_comment; to Data),
781
- _ => go!(self: push_comment '-'; reconsume Comment),
782
- }
783
- },
784
- //§ comment-state
785
- XmlState::Comment => loop {
786
- match get_char!(self, input) {
787
- '<' => go!(self: push_comment '<'; to CommentLessThan),
788
- '-' => go!(self: to CommentEndDash),
789
- c => go!(self: push_comment c),
790
- }
791
- },
792
- //§ comment-less-than-sign-state
793
- XmlState::CommentLessThan => loop {
794
- match get_char!(self, input) {
795
- '!' => go!(self: push_comment '!';to CommentLessThanBang),
796
- '<' => go!(self: push_comment '<'),
797
- _ => go!(self: reconsume Comment),
798
- }
799
- },
800
- //§ comment-less-than-sign-bang-state
801
- XmlState::CommentLessThanBang => loop {
802
- match get_char!(self, input) {
803
- '-' => go!(self: to CommentLessThanBangDash),
804
- _ => go!(self: reconsume Comment),
805
- }
806
- },
807
- //§ comment-less-than-sign-bang-dash-state
808
- XmlState::CommentLessThanBangDash => loop {
809
- match get_char!(self, input) {
810
- '-' => go!(self: to CommentLessThanBangDashDash),
811
- _ => go!(self: reconsume CommentEndDash),
812
- }
813
- },
814
- //§ comment-less-than-sign-bang-dash-dash-state
815
- XmlState::CommentLessThanBangDashDash => loop {
816
- match get_char!(self, input) {
817
- '>' => go!(self: reconsume CommentEnd),
818
- _ => go!(self: error; reconsume CommentEnd),
819
- }
820
- },
821
- //§ comment-end-dash-state
822
- XmlState::CommentEndDash => loop {
823
- match get_char!(self, input) {
824
- '-' => go!(self: to CommentEnd),
825
- _ => go!(self: push_comment '-'; reconsume Comment),
826
- }
827
- },
828
- //§ comment-end-state
829
- XmlState::CommentEnd => loop {
830
- match get_char!(self, input) {
831
- '>' => go!(self: emit_comment; to Data),
832
- '!' => go!(self: to CommentEndBang),
833
- '-' => go!(self: push_comment '-'),
834
- _ => go!(self: append_comment "--"; reconsume Comment),
835
- }
836
- },
837
- //§ comment-end-bang-state
838
- XmlState::CommentEndBang => loop {
839
- match get_char!(self, input) {
840
- '-' => go!(self: append_comment "--!"; to CommentEndDash),
841
- '>' => go!(self: error; emit_comment; to Data),
842
- _ => go!(self: append_comment "--!"; reconsume Comment),
843
- }
844
- },
845
- //§ bogus-comment-state
846
- XmlState::BogusComment => loop {
847
- match get_char!(self, input) {
848
- '>' => go!(self: emit_comment; to Data),
849
- c => go!(self: push_comment c),
850
- }
851
- },
852
- //§ cdata-state
853
- XmlState::Cdata => loop {
854
- match get_char!(self, input) {
855
- ']' => go!(self: to CdataBracket),
856
- cl => go!(self: emit cl),
857
- }
858
- },
859
- //§ cdata-bracket-state
860
- XmlState::CdataBracket => loop {
861
- match get_char!(self, input) {
862
- ']' => go!(self: to CdataEnd),
863
- cl => go!(self: emit ']'; emit cl; to Cdata),
864
- }
865
- },
866
- //§ cdata-end-state
867
- XmlState::CdataEnd => loop {
868
- match get_char!(self, input) {
869
- '>' => go!(self: to Data),
870
- ']' => go!(self: emit ']'),
871
- cl => go!(self: emit ']'; emit ']'; emit cl; to Cdata),
872
- }
873
- },
874
- //§ tag-name-state
875
- XmlState::TagName => loop {
876
- match get_char!(self, input) {
877
- '\t' | '\n' | ' ' => go!(self: to TagAttrNameBefore),
878
- '>' => go!(self: emit_tag Data),
879
- '/' => go!(self: set_empty_tag; to TagEmpty),
880
- cl => go!(self: push_tag cl),
881
- }
882
- },
883
- //§ empty-tag-state
884
- XmlState::TagEmpty => loop {
885
- match get_char!(self, input) {
886
- '>' => go!(self: emit_empty_tag Data),
887
- _ => go!(self: reconsume TagAttrValueBefore),
888
- }
889
- },
890
- //§ tag-attribute-name-before-state
891
- XmlState::TagAttrNameBefore => loop {
892
- match get_char!(self, input) {
893
- '\t' | '\n' | ' ' => (),
894
- '>' => go!(self: emit_tag Data),
895
- '/' => go!(self: set_empty_tag; to TagEmpty),
896
- ':' => go!(self: error),
897
- cl => go!(self: create_attr cl; to TagAttrName),
898
- }
899
- },
900
- //§ tag-attribute-name-state
901
- XmlState::TagAttrName => loop {
902
- match get_char!(self, input) {
903
- '=' => go!(self: to TagAttrValueBefore),
904
- '>' => go!(self: emit_tag Data),
905
- '\t' | '\n' | ' ' => go!(self: to TagAttrNameAfter),
906
- '/' => go!(self: set_empty_tag; to TagEmpty),
907
- cl => go!(self: push_name cl),
908
- }
909
- },
910
- //§ tag-attribute-name-after-state
911
- XmlState::TagAttrNameAfter => loop {
912
- match get_char!(self, input) {
913
- '\t' | '\n' | ' ' => (),
914
- '=' => go!(self: to TagAttrValueBefore),
915
- '>' => go!(self: emit_tag Data),
916
- '/' => go!(self: set_empty_tag; to TagEmpty),
917
- cl => go!(self: create_attr cl; to TagAttrName),
918
- }
919
- },
920
- //§ tag-attribute-value-before-state
921
- XmlState::TagAttrValueBefore => loop {
922
- match get_char!(self, input) {
923
- '\t' | '\n' | ' ' => (),
924
- '"' => go!(self: to TagAttrValue DoubleQuoted),
925
- '\'' => go!(self: to TagAttrValue SingleQuoted),
926
- '&' => go!(self: reconsume TagAttrValue(Unquoted)),
927
- '>' => go!(self: emit_tag Data),
928
- cl => go!(self: push_value cl; to TagAttrValue(Unquoted)),
929
- }
930
- },
931
- //§ tag-attribute-value-double-quoted-state
932
- XmlState::TagAttrValue(DoubleQuoted) => loop {
933
- match pop_except_from!(self, input, small_char_set!('\n' '"' '&')) {
934
- FromSet('"') => go!(self: to TagAttrNameBefore),
935
- FromSet('&') => go!(self: consume_char_ref '"' ),
936
- FromSet(c) => go!(self: push_value c),
937
- NotFromSet(ref b) => go!(self: append_value b),
938
- }
939
- },
940
- //§ tag-attribute-value-single-quoted-state
941
- XmlState::TagAttrValue(SingleQuoted) => loop {
942
- match pop_except_from!(self, input, small_char_set!('\n' '\'' '&')) {
943
- FromSet('\'') => go!(self: to TagAttrNameBefore),
944
- FromSet('&') => go!(self: consume_char_ref '\''),
945
- FromSet(c) => go!(self: push_value c),
946
- NotFromSet(ref b) => go!(self: append_value b),
947
- }
948
- },
949
- //§ tag-attribute-value-double-quoted-state
950
- XmlState::TagAttrValue(Unquoted) => loop {
951
- match pop_except_from!(self, input, small_char_set!('\n' '\t' ' ' '&' '>')) {
952
- FromSet('\t') | FromSet('\n') | FromSet(' ') => go!(self: to TagAttrNameBefore),
953
- FromSet('&') => go!(self: consume_char_ref),
954
- FromSet('>') => go!(self: emit_tag Data),
955
- FromSet(c) => go!(self: push_value c),
956
- NotFromSet(ref b) => go!(self: append_value b),
957
- }
958
- },
959
-
960
- //§ doctype-state
961
- XmlState::Doctype => loop {
962
- match get_char!(self, input) {
963
- '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName),
964
- _ => go!(self: error; reconsume BeforeDoctypeName),
965
- }
966
- },
967
- //§ before-doctype-name-state
968
- XmlState::BeforeDoctypeName => loop {
969
- match get_char!(self, input) {
970
- '\t' | '\n' | '\x0C' | ' ' => (),
971
- '>' => go!(self: error; emit_doctype; to Data),
972
- c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
973
- to DoctypeName),
974
- }
975
- },
976
- //§ doctype-name-state
977
- XmlState::DoctypeName => loop {
978
- match get_char!(self, input) {
979
- '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterDoctypeName),
980
- '>' => go!(self: emit_doctype; to Data),
981
- c => go!(self: push_doctype_name (c.to_ascii_lowercase());
982
- to DoctypeName),
983
- }
984
- },
985
- //§ after-doctype-name-state
986
- XmlState::AfterDoctypeName => loop {
987
- if eat!(self, input, "public") {
988
- go!(self: to AfterDoctypeKeyword Public);
989
- } else if eat!(self, input, "system") {
990
- go!(self: to AfterDoctypeKeyword System);
991
- } else {
992
- match get_char!(self, input) {
993
- '\t' | '\n' | '\x0C' | ' ' => (),
994
- '>' => go!(self: emit_doctype; to Data),
995
- _ => go!(self: error; to BogusDoctype),
996
- }
997
- }
998
- },
999
- //§ after-doctype-public-keyword-state
1000
- XmlState::AfterDoctypeKeyword(Public) => loop {
1001
- match get_char!(self, input) {
1002
- '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier Public),
1003
- '"' => {
1004
- go!(self: error; clear_doctype_id Public; to DoctypeIdentifierDoubleQuoted Public)
1005
- },
1006
- '\'' => {
1007
- go!(self: error; clear_doctype_id Public; to DoctypeIdentifierSingleQuoted Public)
1008
- },
1009
- '>' => go!(self: error; emit_doctype; to Data),
1010
- _ => go!(self: error; to BogusDoctype),
1011
- }
1012
- },
1013
- //§ after-doctype-system-keyword-state
1014
- XmlState::AfterDoctypeKeyword(System) => loop {
1015
- match get_char!(self, input) {
1016
- '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier System),
1017
- '"' => {
1018
- go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1019
- },
1020
- '\'' => {
1021
- go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1022
- },
1023
- '>' => go!(self: error; emit_doctype; to Data),
1024
- _ => go!(self: error; to BogusDoctype),
1025
- }
1026
- },
1027
- //§ before_doctype_public_identifier_state before_doctype_system_identifier_state
1028
- XmlState::BeforeDoctypeIdentifier(kind) => loop {
1029
- match get_char!(self, input) {
1030
- '\t' | '\n' | '\x0C' | ' ' => (),
1031
- '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
1032
- '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
1033
- '>' => go!(self: error; emit_doctype; to Data),
1034
- _ => go!(self: error; to BogusDoctype),
1035
- }
1036
- },
1037
- //§ doctype_public_identifier_double_quoted_state doctype_system_identifier_double_quoted_state
1038
- XmlState::DoctypeIdentifierDoubleQuoted(kind) => loop {
1039
- match get_char!(self, input) {
1040
- '"' => go!(self: to AfterDoctypeIdentifier kind),
1041
- '>' => go!(self: error; emit_doctype; to Data),
1042
- c => go!(self: push_doctype_id kind c),
1043
- }
1044
- },
1045
- //§ doctype_public_identifier_single_quoted_state doctype_system_identifier_single_quoted_state
1046
- XmlState::DoctypeIdentifierSingleQuoted(kind) => loop {
1047
- match get_char!(self, input) {
1048
- '\'' => go!(self: to AfterDoctypeIdentifier kind),
1049
- '>' => go!(self: error; emit_doctype; to Data),
1050
- c => go!(self: push_doctype_id kind c),
1051
- }
1052
- },
1053
- //§ doctype_public_identifier_single_quoted_state
1054
- XmlState::AfterDoctypeIdentifier(Public) => loop {
1055
- match get_char!(self, input) {
1056
- '\t' | '\n' | '\x0C' | ' ' => {
1057
- go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
1058
- },
1059
- '\'' => {
1060
- go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted(System))
1061
- },
1062
- '"' => {
1063
- go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted(System))
1064
- },
1065
- '>' => go!(self: emit_doctype; to Data),
1066
- _ => go!(self: error; to BogusDoctype),
1067
- }
1068
- },
1069
- //§ doctype_system_identifier_single_quoted_state
1070
- XmlState::AfterDoctypeIdentifier(System) => loop {
1071
- match get_char!(self, input) {
1072
- '\t' | '\n' | '\x0C' | ' ' => (),
1073
- '>' => go!(self: emit_doctype; to Data),
1074
- _ => go!(self: error; to BogusDoctype),
1075
- }
1076
- },
1077
- //§ between_doctype_public_and_system_identifier_state
1078
- XmlState::BetweenDoctypePublicAndSystemIdentifiers => loop {
1079
- match get_char!(self, input) {
1080
- '\t' | '\n' | '\x0C' | ' ' => (),
1081
- '>' => go!(self: emit_doctype; to Data),
1082
- '\'' => go!(self: to DoctypeIdentifierSingleQuoted System),
1083
- '"' => go!(self: to DoctypeIdentifierDoubleQuoted System),
1084
- _ => go!(self: error; to BogusDoctype),
1085
- }
1086
- },
1087
- //§ bogus_doctype_state
1088
- XmlState::BogusDoctype => loop {
1089
- if get_char!(self, input) == '>' {
1090
- go!(self: emit_doctype; to Data);
1091
- }
1092
- },
1093
- }
1094
- }
1095
-
1096
- /// Indicate that we have reached the end of the input.
1097
- pub fn end(&self) {
1098
- // Handle EOF in the char ref sub-tokenizer, if there is one.
1099
- // Do this first because it might un-consume stuff.
1100
- let input = BufferQueue::default();
1101
- match self.char_ref_tokenizer.take() {
1102
- None => (),
1103
- Some(mut tok) => {
1104
- tok.end_of_file(self, &input);
1105
- self.process_char_ref(tok.get_result());
1106
- },
1107
- }
1108
-
1109
- // Process all remaining buffered input.
1110
- // If we're waiting for lookahead, we're not gonna get it.
1111
- self.at_eof.set(true);
1112
- let _ = self.run(&input);
1113
-
1114
- loop {
1115
- if !matches!(self.eof_step(), ProcessResult::Continue) {
1116
- break;
1117
- }
1118
- }
1119
-
1120
- self.sink.end();
1121
-
1122
- if self.opts.profile {
1123
- self.dump_profile();
1124
- }
1125
- }
1126
-
1127
- #[cfg(for_c)]
1128
- fn dump_profile(&self) {
1129
- unreachable!();
1130
- }
1131
-
1132
- #[cfg(not(for_c))]
1133
- fn dump_profile(&self) {
1134
- let mut results: Vec<(XmlState, u64)> = self
1135
- .state_profile
1136
- .borrow()
1137
- .iter()
1138
- .map(|(s, t)| (*s, *t))
1139
- .collect();
1140
- results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));
1141
-
1142
- let total: u64 = results
1143
- .iter()
1144
- .map(|&(_, t)| t)
1145
- .fold(0, ::std::ops::Add::add);
1146
- debug!("\nTokenizer profile, in nanoseconds");
1147
- debug!(
1148
- "\n{:12} total in token sink",
1149
- self.time_in_sink.get()
1150
- );
1151
- debug!("\n{total:12} total in tokenizer");
1152
-
1153
- for (k, v) in results.into_iter() {
1154
- let pct = 100.0 * (v as f64) / (total as f64);
1155
- debug!("{v:12} {pct:4.1}% {k:?}");
1156
- }
1157
- }
1158
-
1159
- fn eof_step(&self) -> ProcessResult<Sink::Handle> {
1160
- debug!("processing EOF in state {:?}", self.state.get());
1161
- match self.state.get() {
1162
- XmlState::Data => go!(self: eof),
1163
- XmlState::CommentStart | XmlState::CommentLessThan | XmlState::CommentLessThanBang => {
1164
- go!(self: reconsume Comment)
1165
- },
1166
- XmlState::CommentLessThanBangDash => go!(self: reconsume CommentEndDash),
1167
- XmlState::CommentLessThanBangDashDash => go!(self: reconsume CommentEnd),
1168
- XmlState::CommentStartDash
1169
- | XmlState::Comment
1170
- | XmlState::CommentEndDash
1171
- | XmlState::CommentEnd
1172
- | XmlState::CommentEndBang => go!(self: error_eof; emit_comment; eof),
1173
- XmlState::TagState => go!(self: error_eof; emit '<'; to Data),
1174
- XmlState::EndTagState => go!(self: error_eof; emit '<'; emit '/'; to Data),
1175
- XmlState::TagEmpty => go!(self: error_eof; to TagAttrNameBefore),
1176
- XmlState::Cdata | XmlState::CdataBracket | XmlState::CdataEnd => {
1177
- go!(self: error_eof; to Data)
1178
- },
1179
- XmlState::Pi => go!(self: error_eof; to BogusComment),
1180
- XmlState::PiTargetAfter | XmlState::PiAfter => go!(self: reconsume PiData),
1181
- XmlState::MarkupDecl => go!(self: error_eof; to BogusComment),
1182
- XmlState::TagName
1183
- | XmlState::TagAttrNameBefore
1184
- | XmlState::EndTagName
1185
- | XmlState::TagAttrNameAfter
1186
- | XmlState::EndTagNameAfter
1187
- | XmlState::TagAttrValueBefore
1188
- | XmlState::TagAttrValue(_) => go!(self: error_eof; emit_tag Data),
1189
- XmlState::PiData | XmlState::PiTarget => go!(self: error_eof; emit_pi Data),
1190
- XmlState::TagAttrName => go!(self: error_eof; emit_start_tag Data),
1191
- XmlState::BeforeDoctypeName
1192
- | XmlState::Doctype
1193
- | XmlState::DoctypeName
1194
- | XmlState::AfterDoctypeName
1195
- | XmlState::AfterDoctypeKeyword(_)
1196
- | XmlState::BeforeDoctypeIdentifier(_)
1197
- | XmlState::AfterDoctypeIdentifier(_)
1198
- | XmlState::DoctypeIdentifierSingleQuoted(_)
1199
- | XmlState::DoctypeIdentifierDoubleQuoted(_)
1200
- | XmlState::BetweenDoctypePublicAndSystemIdentifiers => {
1201
- go!(self: error_eof; emit_doctype; to Data)
1202
- },
1203
- XmlState::BogusDoctype => go!(self: emit_doctype; to Data),
1204
- XmlState::BogusComment => go!(self: emit_comment; to Data),
1205
- }
1206
- }
1207
-
1208
- fn process_char_ref(&self, char_ref: CharRef) {
1209
- let CharRef {
1210
- mut chars,
1211
- mut num_chars,
1212
- } = char_ref;
1213
-
1214
- if num_chars == 0 {
1215
- chars[0] = '&';
1216
- num_chars = 1;
1217
- }
1218
-
1219
- for i in 0..num_chars {
1220
- let c = chars[i as usize];
1221
- match self.state.get() {
1222
- XmlState::Data | XmlState::Cdata => go!(self: emit c),
1223
-
1224
- XmlState::TagAttrValue(_) => go!(self: push_value c),
1225
-
1226
- _ => panic!(
1227
- "state {:?} should not be reachable in process_char_ref",
1228
- self.state.get()
1229
- ),
1230
- }
1231
- }
1232
- }
1233
-
1234
- fn step_char_ref_tokenizer(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
1235
- let mut tok = self.char_ref_tokenizer.take().unwrap();
1236
- let outcome = tok.step(self, input);
1237
-
1238
- let progress = match outcome {
1239
- char_ref::Done => {
1240
- self.process_char_ref(tok.get_result());
1241
- return ProcessResult::Continue;
1242
- },
1243
-
1244
- char_ref::Stuck => ProcessResult::Done,
1245
- char_ref::Progress => ProcessResult::Continue,
1246
- };
1247
-
1248
- *self.char_ref_tokenizer.borrow_mut() = Some(tok);
1249
- progress
1250
- }
1251
-
1252
- fn finish_attribute(&self) {
1253
- if self.current_attr_name.borrow().is_empty() {
1254
- return;
1255
- }
1256
-
1257
- // Check for a duplicate attribute.
1258
- // FIXME: the spec says we should error as soon as the name is finished.
1259
- // FIXME: linear time search, do we care?
1260
- let dup = {
1261
- let current_attr_name = self.current_attr_name.borrow();
1262
- let name = &current_attr_name[..];
1263
- self.current_tag_attrs
1264
- .borrow()
1265
- .iter()
1266
- .any(|a| &*a.name.local == name)
1267
- };
1268
-
1269
- if dup {
1270
- self.emit_error(Borrowed("Duplicate attribute"));
1271
- self.current_attr_name.borrow_mut().clear();
1272
- self.current_attr_value.borrow_mut().clear();
1273
- } else {
1274
- let qname = process_qname(replace(
1275
- &mut self.current_attr_name.borrow_mut(),
1276
- StrTendril::new(),
1277
- ));
1278
- let attr = Attribute {
1279
- name: qname.clone(),
1280
- value: replace(&mut self.current_attr_value.borrow_mut(), StrTendril::new()),
1281
- };
1282
-
1283
- if qname.local == local_name!("xmlns")
1284
- || qname.prefix == Some(namespace_prefix!("xmlns"))
1285
- {
1286
- self.current_tag_attrs.borrow_mut().insert(0, attr);
1287
- } else {
1288
- self.current_tag_attrs.borrow_mut().push(attr);
1289
- }
1290
- }
1291
- }
1292
-
1293
- fn create_attribute(&self, c: char) {
1294
- self.finish_attribute();
1295
-
1296
- self.current_attr_name.borrow_mut().push_char(c);
1297
- }
1298
- }
1299
-
1300
#[cfg(test)]
mod test {
    //! Unit tests for splitting raw attribute/tag names into prefix and
    //! local parts with `process_qname`.

    use super::process_qname;
    use crate::tendril::SliceExt;
    use crate::{LocalName, Prefix};

    /// A name with exactly one interior colon splits into prefix and local part.
    #[test]
    fn simple_namespace() {
        let cases: &[(&str, &str, &str)] = &[("prefix:local", "prefix", "local"), ("a:b", "a", "b")];

        for &(raw, prefix, local) in cases {
            let qname = process_qname(raw.to_tendril());
            assert_eq!(qname.prefix, Some(Prefix::from(prefix)));
            assert_eq!(qname.local, LocalName::from(local));
        }
    }

    /// Malformed colon usage yields no prefix and keeps the whole input as
    /// the local name.
    #[test]
    fn wrong_namespaces() {
        let cases: &[&str] = &[":local", "::local", "a::local", "fake::", ":::", ":a:b:"];

        for &raw in cases {
            let qname = process_qname(raw.to_tendril());
            assert_eq!(qname.prefix, None);
            assert_eq!(qname.local, LocalName::from(raw));
        }
    }
}