html-to-markdown 2.24.6 → 2.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (203) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/ext/html-to-markdown-rb/native/Cargo.lock +3 -26
  4. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -1
  5. data/lib/html_to_markdown/version.rb +1 -1
  6. data/rust-vendor/html-to-markdown-rs/Cargo.toml +0 -1
  7. data/rust-vendor/html-to-markdown-rs/src/converter/main_helpers.rs +1 -1
  8. data/rust-vendor/html-to-markdown-rs/src/lib.rs +1 -0
  9. data/rust-vendor/{markup5ever_rcdom/lib.rs → html-to-markdown-rs/src/rcdom.rs} +53 -91
  10. data/rust-vendor/png/.cargo-checksum.json +1 -1
  11. data/rust-vendor/png/.cargo_vcs_info.json +1 -1
  12. data/rust-vendor/png/CHANGES.md +44 -0
  13. data/rust-vendor/png/Cargo.lock +124 -171
  14. data/rust-vendor/png/Cargo.toml +1 -1
  15. data/rust-vendor/png/Cargo.toml.orig +1 -1
  16. data/rust-vendor/png/benches/expand_paletted.rs +5 -5
  17. data/rust-vendor/png/benches/unfilter.rs +3 -3
  18. data/rust-vendor/png/src/adam7.rs +17 -10
  19. data/rust-vendor/png/src/common.rs +8 -8
  20. data/rust-vendor/png/src/decoder/mod.rs +53 -20
  21. data/rust-vendor/png/src/decoder/stream.rs +263 -78
  22. data/rust-vendor/png/src/decoder/unfiltering_buffer.rs +210 -53
  23. data/rust-vendor/png/src/decoder/zlib.rs +130 -90
  24. data/rust-vendor/png/src/encoder.rs +4 -2
  25. data/rust-vendor/png/src/{filter.rs → filter/mod.rs} +100 -367
  26. data/rust-vendor/png/src/filter/optimization-notes.md +104 -0
  27. data/rust-vendor/png/src/filter/paeth.rs +398 -0
  28. data/rust-vendor/png/src/filter/simd.rs +308 -0
  29. data/rust-vendor/png/src/lib.rs +1 -0
  30. metadata +7 -177
  31. data/rust-vendor/markup5ever_rcdom/.cargo-checksum.json +0 -1
  32. data/rust-vendor/markup5ever_rcdom/.cargo_vcs_info.json +0 -7
  33. data/rust-vendor/markup5ever_rcdom/Cargo.lock +0 -658
  34. data/rust-vendor/markup5ever_rcdom/Cargo.toml +0 -109
  35. data/rust-vendor/markup5ever_rcdom/Cargo.toml.orig +0 -42
  36. data/rust-vendor/markup5ever_rcdom/LICENSE-APACHE +0 -201
  37. data/rust-vendor/markup5ever_rcdom/LICENSE-MIT +0 -25
  38. data/rust-vendor/markup5ever_rcdom/README.md +0 -7
  39. data/rust-vendor/markup5ever_rcdom/custom-html5lib-tokenizer-tests/regression.test +0 -69
  40. data/rust-vendor/markup5ever_rcdom/data/test/ignore +0 -1
  41. data/rust-vendor/markup5ever_rcdom/examples/hello_xml.rs +0 -39
  42. data/rust-vendor/markup5ever_rcdom/examples/html2html.rs +0 -51
  43. data/rust-vendor/markup5ever_rcdom/examples/print-rcdom.rs +0 -78
  44. data/rust-vendor/markup5ever_rcdom/examples/xml_tree_printer.rs +0 -67
  45. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.gitattributes +0 -2
  46. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.github/workflows/downstream.yml +0 -76
  47. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.github/workflows/lint.yml +0 -25
  48. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.gitignore +0 -79
  49. data/rust-vendor/markup5ever_rcdom/html5lib-tests/AUTHORS.rst +0 -34
  50. data/rust-vendor/markup5ever_rcdom/html5lib-tests/LICENSE +0 -21
  51. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/chardet/test_big5.txt +0 -51
  52. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/scripted/tests1.dat +0 -5
  53. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/test-yahoo-jp.dat +0 -10
  54. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/tests1.dat +0 -388
  55. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/tests2.dat +0 -115
  56. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint +0 -6
  57. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/__init__.py +0 -0
  58. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/__init__.py +0 -0
  59. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/LICENSE +0 -18
  60. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/__init__.py +0 -0
  61. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/lexer.py +0 -211
  62. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/lexer.pyi +0 -34
  63. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/parser.py +0 -872
  64. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/parser.pyi +0 -83
  65. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/py.typed +0 -0
  66. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/util.py +0 -72
  67. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/util.pyi +0 -7
  68. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/vendor.txt +0 -1
  69. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor-patches/funcparserlib.patch +0 -24
  70. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/lint.py +0 -280
  71. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/parser.py +0 -177
  72. data/rust-vendor/markup5ever_rcdom/html5lib-tests/pyproject.toml +0 -7
  73. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/core.test +0 -125
  74. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/injectmeta.test +0 -66
  75. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/optionaltags.test +0 -965
  76. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/options.test +0 -60
  77. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/whitespace.test +0 -51
  78. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/README.md +0 -107
  79. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/contentModelFlags.test +0 -93
  80. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/domjs.test +0 -335
  81. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/entities.test +0 -542
  82. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/escapeFlag.test +0 -36
  83. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/namedEntities.test +0 -42422
  84. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/numericEntities.test +0 -1677
  85. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/pendingSpecChanges.test +0 -9
  86. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test1.test +0 -353
  87. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test2.test +0 -275
  88. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test3.test +0 -11233
  89. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test4.test +0 -532
  90. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/unicodeChars.test +0 -1577
  91. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/unicodeCharsProblematic.test +0 -41
  92. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/xmlViolation.test +0 -20
  93. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/README.md +0 -108
  94. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/adoption01.dat +0 -354
  95. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/adoption02.dat +0 -39
  96. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/blocks.dat +0 -695
  97. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/comments01.dat +0 -217
  98. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/doctype01.dat +0 -474
  99. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/domjs-unsafe.dat +0 -0
  100. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/entities01.dat +0 -943
  101. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/entities02.dat +0 -309
  102. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/foreign-fragment.dat +0 -645
  103. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/html5test-com.dat +0 -301
  104. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/inbody01.dat +0 -54
  105. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/isindex.dat +0 -49
  106. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/main-element.dat +0 -46
  107. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/math.dat +0 -104
  108. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/menuitem-element.dat +0 -240
  109. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/namespace-sensitivity.dat +0 -22
  110. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/noscript01.dat +0 -237
  111. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/pending-spec-changes-plain-text-unsafe.dat +0 -0
  112. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/pending-spec-changes.dat +0 -46
  113. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/plain-text-unsafe.dat +0 -0
  114. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/quirks01.dat +0 -53
  115. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/ruby.dat +0 -302
  116. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scriptdata01.dat +0 -372
  117. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scripted/adoption01.dat +0 -16
  118. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scripted/ark.dat +0 -27
  119. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scripted/webkit01.dat +0 -30
  120. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/search-element.dat +0 -46
  121. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/svg.dat +0 -104
  122. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tables01.dat +0 -322
  123. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/template.dat +0 -1673
  124. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests1.dat +0 -1956
  125. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests10.dat +0 -849
  126. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests11.dat +0 -523
  127. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests12.dat +0 -62
  128. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests14.dat +0 -75
  129. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests15.dat +0 -216
  130. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests16.dat +0 -2602
  131. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests17.dat +0 -179
  132. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests18.dat +0 -558
  133. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests19.dat +0 -1398
  134. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests2.dat +0 -831
  135. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests20.dat +0 -842
  136. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests21.dat +0 -306
  137. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests22.dat +0 -190
  138. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests23.dat +0 -168
  139. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests24.dat +0 -79
  140. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests25.dat +0 -288
  141. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests26.dat +0 -453
  142. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests3.dat +0 -305
  143. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests4.dat +0 -74
  144. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests5.dat +0 -210
  145. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests6.dat +0 -663
  146. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests7.dat +0 -453
  147. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests8.dat +0 -165
  148. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests9.dat +0 -472
  149. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests_innerHTML_1.dat +0 -843
  150. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tricky01.dat +0 -336
  151. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/webkit01.dat +0 -785
  152. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/webkit02.dat +0 -554
  153. data/rust-vendor/markup5ever_rcdom/tests/foreach_html5lib_test/mod.rs +0 -41
  154. data/rust-vendor/markup5ever_rcdom/tests/html-driver.rs +0 -29
  155. data/rust-vendor/markup5ever_rcdom/tests/html-serializer.rs +0 -265
  156. data/rust-vendor/markup5ever_rcdom/tests/html-tokenizer.rs +0 -487
  157. data/rust-vendor/markup5ever_rcdom/tests/html-tree-builder.rs +0 -298
  158. data/rust-vendor/markup5ever_rcdom/tests/html-tree-sink.rs +0 -141
  159. data/rust-vendor/markup5ever_rcdom/tests/util/find_tests.rs +0 -34
  160. data/rust-vendor/markup5ever_rcdom/tests/util/runner.rs +0 -48
  161. data/rust-vendor/markup5ever_rcdom/tests/xml-driver.rs +0 -101
  162. data/rust-vendor/markup5ever_rcdom/tests/xml-tokenizer.rs +0 -374
  163. data/rust-vendor/markup5ever_rcdom/tests/xml-tree-builder.rs +0 -237
  164. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/AUTHORS.rst +0 -9
  165. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/LICENSE +0 -21
  166. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/README.md +0 -92
  167. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/comments.test +0 -274
  168. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/doctype.test +0 -3232
  169. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/entities.test +0 -283
  170. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/eof.test +0 -113
  171. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/namedEntities.test +0 -42210
  172. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/numericEntities.test +0 -1349
  173. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/test1.test +0 -162
  174. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/test2.test +0 -64
  175. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/unicodeChars.test +0 -1295
  176. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tree-construction/README.md +0 -104
  177. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tree-construction/namespace.dat +0 -119
  178. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tree-construction/test1.dat +0 -124
  179. data/rust-vendor/xml5ever/.cargo-checksum.json +0 -1
  180. data/rust-vendor/xml5ever/.cargo_vcs_info.json +0 -6
  181. data/rust-vendor/xml5ever/Cargo.lock +0 -752
  182. data/rust-vendor/xml5ever/Cargo.toml +0 -69
  183. data/rust-vendor/xml5ever/Cargo.toml.orig +0 -29
  184. data/rust-vendor/xml5ever/LICENSE-APACHE +0 -201
  185. data/rust-vendor/xml5ever/LICENSE-MIT +0 -25
  186. data/rust-vendor/xml5ever/README.md +0 -72
  187. data/rust-vendor/xml5ever/benches/xml5ever.rs +0 -77
  188. data/rust-vendor/xml5ever/data/bench/strong.xml +0 -1
  189. data/rust-vendor/xml5ever/examples/README.md +0 -223
  190. data/rust-vendor/xml5ever/examples/example.xml +0 -3
  191. data/rust-vendor/xml5ever/examples/simple_xml_tokenizer.rs +0 -81
  192. data/rust-vendor/xml5ever/examples/xml_tokenizer.rs +0 -115
  193. data/rust-vendor/xml5ever/src/driver.rs +0 -90
  194. data/rust-vendor/xml5ever/src/lib.rs +0 -47
  195. data/rust-vendor/xml5ever/src/macros.rs +0 -18
  196. data/rust-vendor/xml5ever/src/serialize/mod.rs +0 -216
  197. data/rust-vendor/xml5ever/src/tokenizer/char_ref/mod.rs +0 -456
  198. data/rust-vendor/xml5ever/src/tokenizer/interface.rs +0 -116
  199. data/rust-vendor/xml5ever/src/tokenizer/mod.rs +0 -1344
  200. data/rust-vendor/xml5ever/src/tokenizer/qname.rs +0 -84
  201. data/rust-vendor/xml5ever/src/tokenizer/states.rs +0 -167
  202. data/rust-vendor/xml5ever/src/tree_builder/mod.rs +0 -774
  203. data/rust-vendor/xml5ever/src/tree_builder/types.rs +0 -37
@@ -1,223 +0,0 @@
1
- # Examples
2
-
3
- The examples have been designed with [`cargo-script`](https://github.com/DanielKeep/cargo-script) in mind.
4
-
5
- Here I'll just give a broad overview of how to install [`cargo script`] for Rust 1.5. For more details, check out the [cargo-script repository](https://github.com/DanielKeep/cargo-script).
6
-
7
- cargo install cargo-script
8
-
9
-
10
- # Token printer
11
-
12
- The basis of xml5ever is its tokenizer and tree builder. Roughly speaking, the tokenizer
13
- takes input and returns a set of tokens like comment, processing instruction, start
14
- tag, end tag, etc.
15
-
16
- First let's define our dependencies:
17
-
18
- ```toml
19
- [dependencies]
20
- xml5ever = "0.2.0"
21
- tendril = "0.1.3"
22
- ```
23
-
24
- With dependencies declared, we can now make a simple tokenizer sink. First step is to
25
- define a [`TokenSink`](https://docs.rs/xml5ever/latest/xml5ever/tokenizer/trait.TokenSink.html). [`TokenSink`](https://docs.rs/xml5ever/latest/xml5ever/tokenizer/trait.TokenSink.html) is a trait whose implementors receive a stream of [`Tokens`](https://docs.rs/xml5ever/latest/xml5ever/tokenizer/enum.Token.html).
26
-
27
- In our case we'll define a unit struct (i.e. a struct without any fields).
28
-
29
- ```rust
30
- struct SimpleTokenPrinter;
31
- ```
32
-
33
- To make `SimpleTokenPrinter` a [`TokenSink`](https://docs.rs/xml5ever/latest/xml5ever/tokenizer/trait.TokenSink.html), we need to implement the [`process_token`](https://docs.rs/xml5ever/latest/xml5ever/tokenizer/trait.TokenSink.html#tymethod.process_token) method.
34
-
35
- ```rust
36
- impl TokenSink for SimpleTokenPrinter {
37
- fn process_token(&mut self, token: Token) {
38
- match token {
39
- CharacterTokens(b) => {
40
- println!("TEXT: {}", &*b);
41
- },
42
- NullCharacterToken => print!("NULL"),
43
- TagToken(tag) => {
44
- println!("{:?} {} ", tag.kind, &*tag.name.local);
45
- },
46
- ParseError(err) => {
47
- println!("ERROR: {}", err);
48
- },
49
- PIToken(Pi{ref target, ref data}) => {
50
- println!("PI : <?{} {}?>", &*target, &*data);
51
- },
52
- CommentToken(ref comment) => {
53
- println!("<!--{:?}-->", &*comment);
54
- },
55
- EOFToken => {
56
- println!("EOF");
57
- },
58
- DoctypeToken(Doctype{ref name, ref public_id, ..}) => {
59
- println!("<!DOCTYPE {:?} {:?}>", &*name, &*public_id);
60
- }
61
- }
62
- }
63
- }
64
- ```
65
-
66
- Now, we need some input to process. For input we'll use `stdin`. However, xml5ever's `tokenize_to` method only takes a `StrTendril`. So we need to construct a
67
- [`ByteTendril`](https://docs.rs/tendril/latest/tendril/type.ByteTendril.html) using `ByteTendril::new()`, then read the `stdin` using [`read_to_tendril`](https://docs.rs/tendril/latest/tendril/trait.ReadExt.html#tymethod.read_to_tendril) extension.
68
-
69
- Once that is set, to make `SimpleTokenPrinter` parse the input, call,
70
- `tokenize_to` with it as the first parameter, input wrapped in Option for second parameter and XmlToke.
71
-
72
- ```rust
73
- fn main() {
74
- let sink = SimpleTokenPrinter;
75
-
76
- // We need a ByteTendril to read a file
77
- let mut input = ByteTendril::new();
78
- // Using ReadExt::read_to_tendril we read stdin
79
- io::stdin().read_to_tendril(&mut input).unwrap();
80
- // For xml5ever we need StrTendril, so we reinterpret it
81
- // into StrTendril.
82
- //
83
- // You might wonder, how does `try_reinterpret` know we
84
- // need StrTendril and the answer is type inference based
85
- // on `tokenize_xml_to` signature.
86
- let input = input.try_reinterpret().unwrap();
87
- // Here we create and run tokenizer
88
- let mut tok = XmlTokenizer::new(sink, Default::default());
89
- // We pass the input to be parsed.
90
- tok.feed(input);
91
-
92
- // tok.end must be invoked for final bytes to be processed.
93
- tok.end();
94
- }
95
- ```
96
-
97
- NOTE: `unwrap` causes panic, it's only OK to use in simple examples.
98
-
99
- For full source code check out: [`examples/simple_xml_tokenizer.rs`](https://github.com/servo/html5ever/blob/main/xml5ever/examples/simple_xml_tokenizer.rs)
100
-
101
- Once we have successfully compiled the example we run the example with inline
102
- xml
103
-
104
- ```bash
105
- cargo script simple_xml_tokenizer.rs <<< "<xml>Text with <b>bold words</b>!</xml>"
106
- ```
107
-
108
- or by sending an [`examples/example.xml`](https://github.com/servo/html5ever/blob/main/xml5ever/examples/example.xml) located in same folder as examples.
109
-
110
- ```bash
111
- cargo script simple_xml_tokenizer.rs < example.xml
112
- ```
113
-
114
- # Tree printer
115
-
116
- To actually get an XML document tree from the xml5ever, you need to use a `TreeSink`.
117
- `TreeSink` is in many ways similar to the TokenSink. Basically, TokenSink takes data
118
- and returns list of tokens, while TreeSink takes tokens and returns a tree of parsed
119
- XML document. Do note that this is a simplified explanation; consult the
120
- documentation for more info.
121
-
122
- Ok, with that in mind, let's build us a TreePrinter. For example if we get an XML
123
- file like:
124
-
125
- ```xml
126
- <student>
127
- <first-name>Bobby</first-name>
128
- <last-name>Tables</last-name>
129
- </student>
130
- ```
131
-
132
- We'd want a structure similar to this:
133
-
134
- ```
135
- #document
136
- student
137
- first-name
138
- #text Bobby
139
- last-name
140
- #text Tables
141
-
142
- ```
143
- We won't print anything other than element names and text fields. So comments,
144
- doctypes and other such elements are ignored.
145
-
146
- First part is similar to making SimpleTokenPrinter:
147
-
148
- ```rust
149
- // We need to allocate an input tendril for xml5ever
150
- let mut input = ByteTendril::new();
151
- // Using ReadExt::read_to_tendril we can read stdin
152
- io::stdin().read_to_tendril(&mut input).unwrap();
153
- let input = input.try_reinterpret().unwrap();
154
- ```
155
-
156
- This time, we need an implementation of [`TreeSink`](https://docs.rs/xml5ever/latest/xml5ever/tree_builder/trait.TreeSink.html). xml5ever comes with a
157
- built-in `TreeSink` implementation called [`RcDom`](https://docs.rs/markup5ever_rcdom/latest/markup5ever_rcdom/struct.RcDom.html). To process input into
158
- a `TreeSink` we use the following line:
159
-
160
- ```rust
161
- let dom: RcDom = parse(one_input(input), Default::default());
162
- ```
163
-
164
- Let's analyze it a bit. First there is `let dom: RcDom`. We need this part,
165
- because the type inferencer can't infer which TreeSink implementation we mean
166
- in this scenario.
167
-
168
- Function [`one_input`](https://ygg01.github.io/docs/xml5ever/xml5ever/fn.one_input.html) is a convenience function that turns any value into an iterator. In this case
169
- it converts a StrTendril into an Iterator over itself.
170
-
171
- Ok, so now that we have parsed our tree, what do we do with it? Well, for that we might need some
172
- kind of function that will help us traverse it. We shall call that function `walk`.
173
-
174
- ```rust
175
- fn walk(prefix: &str, handle: Handle) {
176
- let node = handle.borrow();
177
-
178
- // We print out the prefix before we start
179
- print!("{}", prefix);
180
- // We are only interested in following nodes:
181
- // Document, Text and Element, so our match
182
- // reflects that.
183
- match node.node {
184
- Document
185
- => println!("#document"),
186
-
187
- Text(ref text) => {
188
- println!("#text {}", text.escape_default())
189
- },
190
-
191
- Element(ref name, _) => {
192
- println!("{}", name.local);
193
- },
194
-
195
- _ => {},
196
-
197
- }
198
-
199
- // We increase indent in child nodes
200
- let new_indent = {
201
- let mut temp = String::new();
202
- temp.push_str(prefix);
203
- temp.push_str(" ");
204
- temp
205
- };
206
-
207
- for child in node.children.iter()
208
- // In order to avoid weird indentation, we filter
209
- // only Text/Element nodes.
210
- // We don't need to filter Document since its guaranteed
211
- // child elements don't contain documents
212
- .filter(|child| match child.borrow().node {
213
- Text(_) | Element (_, _) => true,
214
- _ => false,
215
- }
216
- ) {
217
- // Recursion - Yay!
218
- walk(&new_indent, child.clone());
219
- }
220
- }
221
- ```
222
-
223
- For full source code check out: [`examples/xml_tree_printer.rs`](https://github.com/servo/html5ever/blob/main/rcdom/examples/xml_tree_printer.rs)
@@ -1,3 +0,0 @@
1
- <?xml version="1.0" encoding="UTF-8"?>
2
- <!DOCTYPE xml PUBLIC "http://www.wc3.org/">
3
- <student><first-name>Bobby</first-name><last-name>Tables</last-name></student>
@@ -1,81 +0,0 @@
1
- #!/usr/bin/env run-cargo-script
2
- //! This is a regular crate doc comment, but it also contains a partial
3
- //! Cargo manifest. Note the use of a *fenced* code block, and the
4
- //! `cargo` "language".
5
- //!
6
- //! ```cargo
7
- //! [dependencies]
8
- //! xml5ever = "0.1.1"
9
- //! tendril = "0.1.3"
10
- //! markup5ever = "0.7.4"
11
- //! ```
12
- extern crate markup5ever;
13
- extern crate xml5ever;
14
-
15
- use std::io;
16
-
17
- use markup5ever::buffer_queue::BufferQueue;
18
- use xml5ever::tendril::{ByteTendril, ReadExt};
19
- use xml5ever::tokenizer::{Doctype, Pi, ProcessResult, Token, TokenSink, XmlTokenizer};
20
-
21
- struct SimpleTokenPrinter;
22
-
23
- impl TokenSink for SimpleTokenPrinter {
24
- type Handle = ();
25
-
26
- fn process_token(&self, token: Token) -> ProcessResult<()> {
27
- match token {
28
- Token::Characters(b) => {
29
- println!("TEXT: {}", &*b);
30
- },
31
- Token::NullCharacter => print!("NULL"),
32
- Token::Tag(tag) => {
33
- println!("{:?} {} ", tag.kind, &*tag.name.local);
34
- },
35
- Token::ParseError(err) => {
36
- println!("ERROR: {err}");
37
- },
38
- Token::ProcessingInstruction(Pi {
39
- ref target,
40
- ref data,
41
- }) => {
42
- println!("PI : <?{target} {data}?>");
43
- },
44
- Token::Comment(ref comment) => {
45
- println!("<!--{comment:?}-->");
46
- },
47
- Token::EndOfFile => {
48
- println!("EOF");
49
- },
50
- Token::Doctype(Doctype {
51
- ref name,
52
- ref public_id,
53
- ..
54
- }) => {
55
- println!("<!DOCTYPE {name:?} {public_id:?}>");
56
- },
57
- };
58
- ProcessResult::Continue
59
- }
60
- }
61
-
62
- fn main() {
63
- // Our implementation of TokenSink
64
- let sink = SimpleTokenPrinter;
65
-
66
- // We need a ByteTendril to read a file
67
- let mut input = ByteTendril::new();
68
-
69
- // Using ReadExt::read_to_tendril we can read stdin
70
- io::stdin().read_to_tendril(&mut input).unwrap();
71
- // For xml5ever we need StrTendril, so we reinterpret it
72
- // into StrTendril.
73
-
74
- // Load input into BufferQueue
75
- let input_buffer = BufferQueue::default();
76
- input_buffer.push_back(input.try_reinterpret().unwrap());
77
- // Here we create and run tokenizer
78
- let tok = XmlTokenizer::new(sink, Default::default());
79
- let _ = tok.feed(&input_buffer);
80
- tok.end();
81
- }
@@ -1,115 +0,0 @@
1
- #!/usr/bin/env run-cargo-script
2
- //! This is a regular crate doc comment, but it also contains a partial
3
- //! Cargo manifest. Note the use of a *fenced* code block, and the
4
- //! `cargo` "language".
5
- //!
6
- //! ```cargo
7
- //! [dependencies]
8
- //! xml5ever = "0.2.0"
9
- //! tendril = "0.1.3"
10
- //! markup5ever = "0.7.4"
11
- //! ```
12
- extern crate markup5ever;
13
- extern crate xml5ever;
14
-
15
- use std::cell::Cell;
16
- use std::io;
17
-
18
- use markup5ever::buffer_queue::BufferQueue;
19
- use xml5ever::tendril::{ByteTendril, ReadExt};
20
- use xml5ever::tokenizer::{
21
- EmptyTag, EndTag, Pi, ProcessResult, ShortTag, StartTag, Token, TokenSink, XmlTokenizer,
22
- XmlTokenizerOpts,
23
- };
24
-
25
- #[derive(Clone)]
26
- struct TokenPrinter {
27
- in_char_run: Cell<bool>,
28
- }
29
-
30
- impl TokenPrinter {
31
- fn is_char(&self, is_char: bool) {
32
- match (self.in_char_run.get(), is_char) {
33
- (false, true) => print!("CHAR : \""),
34
- (true, false) => println!("\""),
35
- _ => (),
36
- }
37
- self.in_char_run.set(is_char);
38
- }
39
-
40
- fn do_char(&self, c: char) {
41
- self.is_char(true);
42
- print!("{}", c.escape_default().collect::<String>());
43
- }
44
- }
45
-
46
- impl TokenSink for TokenPrinter {
47
- type Handle = ();
48
-
49
- fn process_token(&self, token: Token) -> ProcessResult<()> {
50
- match token {
51
- Token::Characters(b) => {
52
- for c in b.chars() {
53
- self.do_char(c);
54
- }
55
- },
56
- Token::NullCharacter => self.do_char('\0'),
57
- Token::Tag(tag) => {
58
- self.is_char(false);
59
- // This is not proper HTML serialization, of course.
60
- match tag.kind {
61
- StartTag => print!("TAG : <\x1b[32m{}\x1b[0m", tag.name.local),
62
- EndTag => print!("END TAG : <\x1b[31m/{}\x1b[0m", tag.name.local),
63
- ShortTag => print!("Short TAG : <\x1b[31m/{}\x1b[0m", tag.name.local),
64
- EmptyTag => print!("Empty TAG : <\x1b[31m{}\x1b[0m", tag.name.local),
65
- }
66
- for attr in tag.attrs.iter() {
67
- print!(
68
- " \x1b[36m{}\x1b[0m='\x1b[34m{}\x1b[0m'",
69
- attr.name.local, attr.value
70
- );
71
- }
72
- if tag.kind == EmptyTag {
73
- print!("/");
74
- }
75
- println!(">");
76
- },
77
- Token::ParseError(err) => {
78
- self.is_char(false);
79
- println!("ERROR: {err}");
80
- },
81
- Token::ProcessingInstruction(Pi { target, data }) => {
82
- self.is_char(false);
83
- println!("PI : <?{target:?} {data:?}?>");
84
- },
85
- _ => {
86
- self.is_char(false);
87
- println!("OTHER: {token:?}");
88
- },
89
- };
90
-
91
- ProcessResult::Continue
92
- }
93
- }
94
-
95
- fn main() {
96
- let sink = TokenPrinter {
97
- in_char_run: Cell::new(false),
98
- };
99
- let mut input = ByteTendril::new();
100
- io::stdin().read_to_tendril(&mut input).unwrap();
101
- let input_buffer = BufferQueue::default();
102
- input_buffer.push_back(input.try_reinterpret().unwrap());
103
-
104
- let tok = XmlTokenizer::new(
105
- sink,
106
- XmlTokenizerOpts {
107
- profile: true,
108
- exact_errors: true,
109
- ..Default::default()
110
- },
111
- );
112
- let _ = tok.feed(&input_buffer);
113
- tok.end();
114
- tok.sink.is_char(false);
115
- }
@@ -1,90 +0,0 @@
1
- // Copyright 2014-2017 The html5ever Project Developers. See the
2
- // COPYRIGHT file at the top-level directory of this distribution.
3
- //
4
- // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5
- // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6
- // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7
- // option. This file may not be copied, modified, or distributed
8
- // except according to those terms.
9
-
10
- use crate::tokenizer::{XmlTokenizer, XmlTokenizerOpts};
11
- use crate::tree_builder::{TreeSink, XmlTreeBuilder, XmlTreeBuilderOpts};
12
-
13
- use std::borrow::Cow;
14
-
15
- use crate::tendril;
16
- use crate::tendril::stream::{TendrilSink, Utf8LossyDecoder};
17
- use crate::tendril::StrTendril;
18
- use markup5ever::buffer_queue::BufferQueue;
19
-
20
- /// All-encompasing parser setting structure.
21
- #[derive(Clone, Default)]
22
- pub struct XmlParseOpts {
23
- /// Xml tokenizer options.
24
- pub tokenizer: XmlTokenizerOpts,
25
- /// Xml tree builder .
26
- pub tree_builder: XmlTreeBuilderOpts,
27
- }
28
-
29
- /// Parse and send results to a `TreeSink`.
30
- ///
31
- /// ## Example
32
- ///
33
- /// ```ignore
34
- /// let mut sink = MySink;
35
- /// parse_document(&mut sink, iter::once(my_str), Default::default());
36
- /// ```
37
- pub fn parse_document<Sink>(sink: Sink, opts: XmlParseOpts) -> XmlParser<Sink>
38
- where
39
- Sink: TreeSink,
40
- {
41
- let tb = XmlTreeBuilder::new(sink, opts.tree_builder);
42
- let tok = XmlTokenizer::new(tb, opts.tokenizer);
43
- XmlParser {
44
- tokenizer: tok,
45
- input_buffer: BufferQueue::default(),
46
- }
47
- }
48
-
49
- /// An XML parser,
50
- /// ready to receive Unicode input through the `tendril::TendrilSink` trait’s methods.
51
- pub struct XmlParser<Sink>
52
- where
53
- Sink: TreeSink,
54
- {
55
- /// Tokenizer used by XmlParser.
56
- pub tokenizer: XmlTokenizer<XmlTreeBuilder<Sink::Handle, Sink>>,
57
- /// Input used by XmlParser.
58
- pub input_buffer: BufferQueue,
59
- }
60
-
61
- impl<Sink: TreeSink> TendrilSink<tendril::fmt::UTF8> for XmlParser<Sink> {
62
- type Output = Sink::Output;
63
-
64
- fn process(&mut self, t: StrTendril) {
65
- self.input_buffer.push_back(t);
66
- // FIXME: Properly support </script> somehow.
67
- let _ = self.tokenizer.feed(&self.input_buffer);
68
- }
69
-
70
- // FIXME: Is it too noisy to report every character decoding error?
71
- fn error(&mut self, desc: Cow<'static, str>) {
72
- self.tokenizer.sink.sink.parse_error(desc)
73
- }
74
-
75
- fn finish(self) -> Self::Output {
76
- self.tokenizer.end();
77
- self.tokenizer.sink.sink.finish()
78
- }
79
- }
80
-
81
- impl<Sink: TreeSink> XmlParser<Sink> {
82
- /// Wrap this parser into a `TendrilSink` that accepts UTF-8 bytes.
83
- ///
84
- /// Use this when your input is bytes that are known to be in the UTF-8 encoding.
85
- /// Decoding is lossy, like `String::from_utf8_lossy`.
86
- #[allow(clippy::wrong_self_convention)]
87
- pub fn from_utf8(self) -> Utf8LossyDecoder<Self> {
88
- Utf8LossyDecoder::new(self)
89
- }
90
- }
@@ -1,47 +0,0 @@
1
- // Copyright 2014-2017 The html5ever Project Developers. See the
2
- // COPYRIGHT file at the top-level directory of this distribution.
3
- //
4
- // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5
- // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6
- // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7
- // option. This file may not be copied, modified, or distributed
8
- // except according to those terms.
9
-
10
- //! This crate provides a push based XML parser library that
11
- //! adheres to XML5 specification. In other words this library
12
- //! trades well-formedness for error recovery.
13
- //!
14
- //! The idea behind this, was to minimize number of errors from
15
- //! tools that generate XML (e.g. `&#83` won't just return `&#83`
16
- //! as text, but will parse it into `S` ).
17
- //! You can check out full specification [here](https://ygg01.github.io/xml5_draft/).
18
- //!
19
- //! What this library provides is a solid XML parser that can:
20
- //!
21
- //! * Parse somewhat erroneous XML input
22
- //! * Provide support for [Numeric character references](https://en.wikipedia.org/wiki/Numeric_character_reference).
23
- //! * Provide partial [XML namespace](http://www.w3.org/TR/xml-names11/) support.
24
- //! * Provide full set of SVG/MathML entities
25
- //!
26
- //! What isn't in scope for this library:
27
- //!
28
- //! * Document Type Definition parsing - this is pretty hard to do right and nowadays, its used
29
- //!
30
-
31
- #![crate_name = "xml5ever"]
32
- #![crate_type = "dylib"]
33
- #![allow(unexpected_cfgs)]
34
- #![deny(missing_docs)]
35
-
36
- pub use markup5ever::*;
37
-
38
- pub(crate) mod macros;
39
-
40
- /// Driver
41
- pub mod driver;
42
- /// Serializer for XML5.
43
- pub mod serialize;
44
- /// XML5 tokenizer - converts input into tokens
45
- pub mod tokenizer;
46
- /// XML5 tree builder - converts tokens into a tree like structure
47
- pub mod tree_builder;
@@ -1,18 +0,0 @@
1
- // Copyright 2014-2017 The html5ever Project Developers. See the
2
- // COPYRIGHT file at the top-level directory of this distribution.
3
- //
4
- // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5
- // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6
- // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7
- // option. This file may not be copied, modified, or distributed
8
- // except according to those terms.
9
-
10
- macro_rules! time {
11
- ($e:expr) => {{
12
- let t0 = ::std::time::Instant::now();
13
- let result = $e;
14
- let dt = t0.elapsed().as_nanos() as u64;
15
- (result, dt)
16
- }};
17
- }
18
- pub(crate) use time;