html-to-markdown 2.24.6 → 2.25.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/html-to-markdown-rb/native/Cargo.lock +9 -32
  5. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -1
  6. data/lib/html_to_markdown/version.rb +1 -1
  7. data/rust-vendor/html-to-markdown-rs/Cargo.toml +0 -1
  8. data/rust-vendor/html-to-markdown-rs/src/converter/main_helpers.rs +1 -1
  9. data/rust-vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +20 -5
  10. data/rust-vendor/html-to-markdown-rs/src/lib.rs +1 -0
  11. data/rust-vendor/{markup5ever_rcdom/lib.rs → html-to-markdown-rs/src/rcdom.rs} +56 -91
  12. data/rust-vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +157 -0
  13. data/rust-vendor/memmap2/.cargo-checksum.json +1 -1
  14. data/rust-vendor/memmap2/.cargo_vcs_info.json +1 -1
  15. data/rust-vendor/memmap2/CHANGELOG.md +8 -0
  16. data/rust-vendor/memmap2/Cargo.lock +1 -1
  17. data/rust-vendor/memmap2/Cargo.toml +2 -1
  18. data/rust-vendor/memmap2/Cargo.toml.orig +2 -1
  19. data/rust-vendor/memmap2/src/lib.rs +25 -1
  20. data/rust-vendor/memmap2/src/stub.rs +1 -4
  21. data/rust-vendor/memmap2/src/unix.rs +14 -1
  22. data/rust-vendor/png/.cargo-checksum.json +1 -1
  23. data/rust-vendor/png/.cargo_vcs_info.json +1 -1
  24. data/rust-vendor/png/CHANGES.md +44 -0
  25. data/rust-vendor/png/Cargo.lock +124 -171
  26. data/rust-vendor/png/Cargo.toml +1 -1
  27. data/rust-vendor/png/Cargo.toml.orig +1 -1
  28. data/rust-vendor/png/benches/expand_paletted.rs +5 -5
  29. data/rust-vendor/png/benches/unfilter.rs +3 -3
  30. data/rust-vendor/png/src/adam7.rs +17 -10
  31. data/rust-vendor/png/src/common.rs +8 -8
  32. data/rust-vendor/png/src/decoder/mod.rs +53 -20
  33. data/rust-vendor/png/src/decoder/stream.rs +263 -78
  34. data/rust-vendor/png/src/decoder/unfiltering_buffer.rs +210 -53
  35. data/rust-vendor/png/src/decoder/zlib.rs +130 -90
  36. data/rust-vendor/png/src/encoder.rs +4 -2
  37. data/rust-vendor/png/src/{filter.rs → filter/mod.rs} +100 -367
  38. data/rust-vendor/png/src/filter/optimization-notes.md +104 -0
  39. data/rust-vendor/png/src/filter/paeth.rs +398 -0
  40. data/rust-vendor/png/src/filter/simd.rs +308 -0
  41. data/rust-vendor/png/src/lib.rs +1 -0
  42. data/rust-vendor/syn/.cargo-checksum.json +1 -1
  43. data/rust-vendor/syn/.cargo_vcs_info.json +1 -1
  44. data/rust-vendor/syn/Cargo.lock +40 -41
  45. data/rust-vendor/syn/Cargo.toml +1 -1
  46. data/rust-vendor/syn/Cargo.toml.orig +1 -1
  47. data/rust-vendor/syn/src/item.rs +61 -40
  48. data/rust-vendor/syn/src/lib.rs +2 -1
  49. data/rust-vendor/syn/tests/test_item.rs +54 -0
  50. data/rust-vendor/unicode-ident/.cargo-checksum.json +1 -1
  51. data/rust-vendor/unicode-ident/.cargo_vcs_info.json +1 -1
  52. data/rust-vendor/unicode-ident/Cargo.lock +21 -21
  53. data/rust-vendor/unicode-ident/Cargo.toml +1 -1
  54. data/rust-vendor/unicode-ident/Cargo.toml.orig +1 -1
  55. data/rust-vendor/unicode-ident/src/lib.rs +1 -1
  56. data/rust-vendor/unicode-ident/src/tables.rs +87 -97
  57. data/rust-vendor/unicode-ident/tests/static_size.rs +1 -1
  58. metadata +7 -177
  59. data/rust-vendor/markup5ever_rcdom/.cargo-checksum.json +0 -1
  60. data/rust-vendor/markup5ever_rcdom/.cargo_vcs_info.json +0 -7
  61. data/rust-vendor/markup5ever_rcdom/Cargo.lock +0 -658
  62. data/rust-vendor/markup5ever_rcdom/Cargo.toml +0 -109
  63. data/rust-vendor/markup5ever_rcdom/Cargo.toml.orig +0 -42
  64. data/rust-vendor/markup5ever_rcdom/LICENSE-APACHE +0 -201
  65. data/rust-vendor/markup5ever_rcdom/LICENSE-MIT +0 -25
  66. data/rust-vendor/markup5ever_rcdom/README.md +0 -7
  67. data/rust-vendor/markup5ever_rcdom/custom-html5lib-tokenizer-tests/regression.test +0 -69
  68. data/rust-vendor/markup5ever_rcdom/data/test/ignore +0 -1
  69. data/rust-vendor/markup5ever_rcdom/examples/hello_xml.rs +0 -39
  70. data/rust-vendor/markup5ever_rcdom/examples/html2html.rs +0 -51
  71. data/rust-vendor/markup5ever_rcdom/examples/print-rcdom.rs +0 -78
  72. data/rust-vendor/markup5ever_rcdom/examples/xml_tree_printer.rs +0 -67
  73. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.gitattributes +0 -2
  74. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.github/workflows/downstream.yml +0 -76
  75. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.github/workflows/lint.yml +0 -25
  76. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.gitignore +0 -79
  77. data/rust-vendor/markup5ever_rcdom/html5lib-tests/AUTHORS.rst +0 -34
  78. data/rust-vendor/markup5ever_rcdom/html5lib-tests/LICENSE +0 -21
  79. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/chardet/test_big5.txt +0 -51
  80. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/scripted/tests1.dat +0 -5
  81. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/test-yahoo-jp.dat +0 -10
  82. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/tests1.dat +0 -388
  83. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/tests2.dat +0 -115
  84. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint +0 -6
  85. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/__init__.py +0 -0
  86. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/__init__.py +0 -0
  87. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/LICENSE +0 -18
  88. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/__init__.py +0 -0
  89. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/lexer.py +0 -211
  90. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/lexer.pyi +0 -34
  91. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/parser.py +0 -872
  92. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/parser.pyi +0 -83
  93. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/py.typed +0 -0
  94. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/util.py +0 -72
  95. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/util.pyi +0 -7
  96. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/vendor.txt +0 -1
  97. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor-patches/funcparserlib.patch +0 -24
  98. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/lint.py +0 -280
  99. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/parser.py +0 -177
  100. data/rust-vendor/markup5ever_rcdom/html5lib-tests/pyproject.toml +0 -7
  101. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/core.test +0 -125
  102. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/injectmeta.test +0 -66
  103. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/optionaltags.test +0 -965
  104. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/options.test +0 -60
  105. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/whitespace.test +0 -51
  106. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/README.md +0 -107
  107. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/contentModelFlags.test +0 -93
  108. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/domjs.test +0 -335
  109. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/entities.test +0 -542
  110. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/escapeFlag.test +0 -36
  111. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/namedEntities.test +0 -42422
  112. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/numericEntities.test +0 -1677
  113. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/pendingSpecChanges.test +0 -9
  114. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test1.test +0 -353
  115. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test2.test +0 -275
  116. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test3.test +0 -11233
  117. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test4.test +0 -532
  118. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/unicodeChars.test +0 -1577
  119. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/unicodeCharsProblematic.test +0 -41
  120. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/xmlViolation.test +0 -20
  121. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/README.md +0 -108
  122. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/adoption01.dat +0 -354
  123. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/adoption02.dat +0 -39
  124. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/blocks.dat +0 -695
  125. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/comments01.dat +0 -217
  126. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/doctype01.dat +0 -474
  127. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/domjs-unsafe.dat +0 -0
  128. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/entities01.dat +0 -943
  129. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/entities02.dat +0 -309
  130. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/foreign-fragment.dat +0 -645
  131. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/html5test-com.dat +0 -301
  132. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/inbody01.dat +0 -54
  133. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/isindex.dat +0 -49
  134. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/main-element.dat +0 -46
  135. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/math.dat +0 -104
  136. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/menuitem-element.dat +0 -240
  137. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/namespace-sensitivity.dat +0 -22
  138. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/noscript01.dat +0 -237
  139. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/pending-spec-changes-plain-text-unsafe.dat +0 -0
  140. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/pending-spec-changes.dat +0 -46
  141. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/plain-text-unsafe.dat +0 -0
  142. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/quirks01.dat +0 -53
  143. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/ruby.dat +0 -302
  144. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scriptdata01.dat +0 -372
  145. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scripted/adoption01.dat +0 -16
  146. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scripted/ark.dat +0 -27
  147. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scripted/webkit01.dat +0 -30
  148. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/search-element.dat +0 -46
  149. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/svg.dat +0 -104
  150. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tables01.dat +0 -322
  151. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/template.dat +0 -1673
  152. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests1.dat +0 -1956
  153. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests10.dat +0 -849
  154. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests11.dat +0 -523
  155. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests12.dat +0 -62
  156. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests14.dat +0 -75
  157. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests15.dat +0 -216
  158. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests16.dat +0 -2602
  159. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests17.dat +0 -179
  160. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests18.dat +0 -558
  161. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests19.dat +0 -1398
  162. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests2.dat +0 -831
  163. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests20.dat +0 -842
  164. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests21.dat +0 -306
  165. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests22.dat +0 -190
  166. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests23.dat +0 -168
  167. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests24.dat +0 -79
  168. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests25.dat +0 -288
  169. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests26.dat +0 -453
  170. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests3.dat +0 -305
  171. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests4.dat +0 -74
  172. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests5.dat +0 -210
  173. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests6.dat +0 -663
  174. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests7.dat +0 -453
  175. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests8.dat +0 -165
  176. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests9.dat +0 -472
  177. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests_innerHTML_1.dat +0 -843
  178. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tricky01.dat +0 -336
  179. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/webkit01.dat +0 -785
  180. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/webkit02.dat +0 -554
  181. data/rust-vendor/markup5ever_rcdom/tests/foreach_html5lib_test/mod.rs +0 -41
  182. data/rust-vendor/markup5ever_rcdom/tests/html-driver.rs +0 -29
  183. data/rust-vendor/markup5ever_rcdom/tests/html-serializer.rs +0 -265
  184. data/rust-vendor/markup5ever_rcdom/tests/html-tokenizer.rs +0 -487
  185. data/rust-vendor/markup5ever_rcdom/tests/html-tree-builder.rs +0 -298
  186. data/rust-vendor/markup5ever_rcdom/tests/html-tree-sink.rs +0 -141
  187. data/rust-vendor/markup5ever_rcdom/tests/util/find_tests.rs +0 -34
  188. data/rust-vendor/markup5ever_rcdom/tests/util/runner.rs +0 -48
  189. data/rust-vendor/markup5ever_rcdom/tests/xml-driver.rs +0 -101
  190. data/rust-vendor/markup5ever_rcdom/tests/xml-tokenizer.rs +0 -374
  191. data/rust-vendor/markup5ever_rcdom/tests/xml-tree-builder.rs +0 -237
  192. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/AUTHORS.rst +0 -9
  193. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/LICENSE +0 -21
  194. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/README.md +0 -92
  195. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/comments.test +0 -274
  196. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/doctype.test +0 -3232
  197. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/entities.test +0 -283
  198. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/eof.test +0 -113
  199. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/namedEntities.test +0 -42210
  200. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/numericEntities.test +0 -1349
  201. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/test1.test +0 -162
  202. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/test2.test +0 -64
  203. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/unicodeChars.test +0 -1295
  204. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tree-construction/README.md +0 -104
  205. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tree-construction/namespace.dat +0 -119
  206. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tree-construction/test1.dat +0 -124
  207. data/rust-vendor/xml5ever/.cargo-checksum.json +0 -1
  208. data/rust-vendor/xml5ever/.cargo_vcs_info.json +0 -6
  209. data/rust-vendor/xml5ever/Cargo.lock +0 -752
  210. data/rust-vendor/xml5ever/Cargo.toml +0 -69
  211. data/rust-vendor/xml5ever/Cargo.toml.orig +0 -29
  212. data/rust-vendor/xml5ever/LICENSE-APACHE +0 -201
  213. data/rust-vendor/xml5ever/LICENSE-MIT +0 -25
  214. data/rust-vendor/xml5ever/README.md +0 -72
  215. data/rust-vendor/xml5ever/benches/xml5ever.rs +0 -77
  216. data/rust-vendor/xml5ever/data/bench/strong.xml +0 -1
  217. data/rust-vendor/xml5ever/examples/README.md +0 -223
  218. data/rust-vendor/xml5ever/examples/example.xml +0 -3
  219. data/rust-vendor/xml5ever/examples/simple_xml_tokenizer.rs +0 -81
  220. data/rust-vendor/xml5ever/examples/xml_tokenizer.rs +0 -115
  221. data/rust-vendor/xml5ever/src/driver.rs +0 -90
  222. data/rust-vendor/xml5ever/src/lib.rs +0 -47
  223. data/rust-vendor/xml5ever/src/macros.rs +0 -18
  224. data/rust-vendor/xml5ever/src/serialize/mod.rs +0 -216
  225. data/rust-vendor/xml5ever/src/tokenizer/char_ref/mod.rs +0 -456
  226. data/rust-vendor/xml5ever/src/tokenizer/interface.rs +0 -116
  227. data/rust-vendor/xml5ever/src/tokenizer/mod.rs +0 -1344
  228. data/rust-vendor/xml5ever/src/tokenizer/qname.rs +0 -84
  229. data/rust-vendor/xml5ever/src/tokenizer/states.rs +0 -167
  230. data/rust-vendor/xml5ever/src/tree_builder/mod.rs +0 -774
  231. data/rust-vendor/xml5ever/src/tree_builder/types.rs +0 -37
@@ -1,223 +0,0 @@
1
- # Examples
2
-
3
- The examples have been designed with [`cargo-script`](https://github.com/DanielKeep/cargo-script) in mind.
4
-
5
- Here I'll just give a broad overview of how to install [`cargo script`] for Rust 1.5. For more details, check out the [cargo-script repository](https://github.com/DanielKeep/cargo-script).
6
-
7
- cargo install cargo-script
8
-
9
-
10
- # Token printer
11
-
12
- The basis of xml5ever is its tokenizer and tree builder. Roughly speaking tokenizer
13
- takes input and returns a set of tokens like comment, processing instruction, start
14
- tag, end tag, etc.
15
-
16
- First let's define our dependencies:
17
-
18
- ```toml
19
- [dependencies]
20
- xml5ever = "0.2.0"
21
- tendril = "0.1.3"
22
- ```
23
-
24
- With dependencies declared, we can now make a simple tokenizer sink. First step is to
25
- define a [`TokenSink`](https://docs.rs/xml5ever/latest/xml5ever/tokenizer/trait.TokenSink.html). A [`TokenSink`](https://docs.rs/xml5ever/latest/xml5ever/tokenizer/trait.TokenSink.html) is a trait that receives a stream of [`Tokens`](https://docs.rs/xml5ever/latest/xml5ever/tokenizer/enum.Token.html).
26
-
27
- In our case we'll define a unit struct (i.e. a struct without any fields).
28
-
29
- ```rust
30
- struct SimpleTokenPrinter;
31
- ```
32
-
33
- To make `SimpleTokenPrinter` a [`TokenSink`](https://docs.rs/xml5ever/latest/xml5ever/tokenizer/trait.TokenSink.html), we need to implement [process_token](https://docs.rs/xml5ever/latest/xml5ever/tokenizer/trait.TokenSink.html#tymethod.process_token) method.
34
-
35
- ```rust
36
- impl TokenSink for SimpleTokenPrinter {
37
- fn process_token(&mut self, token: Token) {
38
- match token {
39
- CharacterTokens(b) => {
40
- println!("TEXT: {}", &*b);
41
- },
42
- NullCharacterToken => print!("NULL"),
43
- TagToken(tag) => {
44
- println!("{:?} {} ", tag.kind, &*tag.name.local);
45
- },
46
- ParseError(err) => {
47
- println!("ERROR: {}", err);
48
- },
49
- PIToken(Pi{ref target, ref data}) => {
50
- println!("PI : <?{} {}?>", &*target, &*data);
51
- },
52
- CommentToken(ref comment) => {
53
- println!("<!--{:?}-->", &*comment);
54
- },
55
- EOFToken => {
56
- println!("EOF");
57
- },
58
- DoctypeToken(Doctype{ref name, ref public_id, ..}) => {
59
- println!("<!DOCTYPE {:?} {:?}>", &*name, &*public_id);
60
- }
61
- }
62
- }
63
- }
64
- ```
65
-
66
- Now, we need some input to process. For input we'll use `stdin`. However, xml5ever `tokenize_to` method only takes `StrTendril`. So we need to construct a
67
- [`ByteTendril`](https://docs.rs/tendril/latest/tendril/type.ByteTendril.html) using `ByteTendril::new()`, then read the `stdin` using [`read_to_tendril`](https://docs.rs/tendril/latest/tendril/trait.ReadExt.html#tymethod.read_to_tendril) extension.
68
-
69
- Once that is set, to make `SimpleTokenPrinter` parse the input, call,
70
- `tokenize_to` with it as the first parameter, the input wrapped in `Option` as the second parameter, and `XmlTokenizerOpts` as the third.
71
-
72
- ```rust
73
- fn main() {
74
- let sink = SimpleTokenPrinter;
75
-
76
- // We need a ByteTendril to read a file
77
- let mut input = ByteTendril::new();
78
- // Using SliceExt.read_to_tendril we read stdin
79
- io::stdin().read_to_tendril(&mut input).unwrap();
80
- // For xml5ever we need StrTendril, so we reinterpret it
81
- // into StrTendril.
82
- //
83
- // You might wonder, how does `try_reinterpret` know we
84
- // need StrTendril and the answer is type inference based
85
- // on `tokenize_xml_to` signature.
86
- let input = input.try_reinterpret().unwrap();
87
- // Here we create and run tokenizer
88
- let mut tok = XmlTokenizer::new(sink, Default::default());
89
- // We pass the input to the tokenizer.
90
- tok.feed(input);
91
-
92
- // tok.end must be invoked for final bytes to be processed.
93
- tok.end();
94
- }
95
- ```
96
-
97
- NOTE: `unwrap` causes panic, it's only OK to use in simple examples.
98
-
99
- For full source code check out: [`examples/simple_xml_tokenizer.rs`](https://github.com/servo/html5ever/blob/main/xml5ever/examples/simple_xml_tokenizer.rs)
100
-
101
- Once we have successfully compiled the example we run the example with inline
102
- xml
103
-
104
- ```bash
105
- cargo script simple_xml_tokenizer.rs <<< "<xml>Text with <b>bold words</b>!</xml>"
106
- ```
107
-
108
- or by sending an [`examples/example.xml`](https://github.com/servo/html5ever/blob/main/xml5ever/examples/example.xml) located in same folder as examples.
109
-
110
- ```bash
111
- cargo script simple_xml_tokenizer.rs < example.xml
112
- ```
113
-
114
- # Tree printer
115
-
116
- To actually get an XML document tree from the xml5ever, you need to use a `TreeSink`.
117
- `TreeSink` is in many ways similar to the TokenSink. Basically, TokenSink takes data
118
- and returns list of tokens, while TreeSink takes tokens and returns a tree of parsed
119
- XML document. Do note that this is a simplified explanation; consult the
120
- documentation for more info.
121
-
122
- Ok, with that in mind, let's build us a TreePrinter. For example if we get an XML
123
- file like:
124
-
125
- ```xml
126
- <student>
127
- <first-name>Bobby</first-name>
128
- <last-name>Tables</last-name>
129
- </student>
130
- ```
131
-
132
- We'd want a structure similar to this:
133
-
134
- ```
135
- #document
136
- student
137
- first-name
138
- #text Bobby
139
- last-name
140
- #text Tables
141
-
142
- ```
143
- We won't print anything other than element names and text fields. So comments,
144
- doctypes and other such elements are ignored.
145
-
146
- First part is similar to making SimpleTokenPrinter:
147
-
148
- ```rust
149
- // We need to allocate an input tendril for xml5ever
150
- let mut input = ByteTendril::new();
151
- // Using SliceExt.read_to_tendril functions we can read stdin
152
- io::stdin().read_to_tendril(&mut input).unwrap();
153
- let input = input.try_reinterpret().unwrap();
154
- ```
155
-
156
- This time, we need an implementation of [`TreeSink`](https://docs.rs/xml5ever/latest/xml5ever/tree_builder/trait.TreeSink.html). xml5ever comes with a
157
- built-in `TreeSink` implementation called [`RcDom`](https://docs.rs/markup5ever_rcdom/latest/markup5ever_rcdom/struct.RcDom.html). To process input into
158
- a `TreeSink` we use the following line:
159
-
160
- ```rust
161
- let dom: RcDom = parse(one_input(input), Default::default());
162
- ```
163
-
164
- Let's analyze it a bit. First there is `let dom: RcDom`. We need this part,
165
- because the type inferencer can't infer which TreeSink implementation we mean
166
- in this scenario.
167
-
168
- Function [`one_input`](https://ygg01.github.io/docs/xml5ever/xml5ever/fn.one_input.html) is a convenience function that turns any value into an iterator. In this case
169
- it converts a StrTendril into an Iterator over itself.
170
-
171
- Ok, so now that we have parsed our tree, what do we do with it? Well, for that we might need some
172
- kind of function that will help us traverse it. We shall call that function `walk`.
173
-
174
- ```rust
175
- fn walk(prefix: &str, handle: Handle) {
176
- let node = handle.borrow();
177
-
178
- // We print out the prefix before we start
179
- print!("{}", prefix);
180
- // We are only interested in following nodes:
181
- // Document, Text and Element, so our match
182
- // reflects that.
183
- match node.node {
184
- Document
185
- => println!("#document"),
186
-
187
- Text(ref text) => {
188
- println!("#text {}", text.escape_default())
189
- },
190
-
191
- Element(ref name, _) => {
192
- println!("{}", name.local);
193
- },
194
-
195
- _ => {},
196
-
197
- }
198
-
199
- // We increase indent in child nodes
200
- let new_indent = {
201
- let mut temp = String::new();
202
- temp.push_str(prefix);
203
- temp.push_str(" ");
204
- temp
205
- };
206
-
207
- for child in node.children.iter()
208
- // In order to avoid weird indentation, we filter
209
- // only Text/Element nodes.
210
- // We don't need to filter Document since it's guaranteed
211
- // child elements don't contain documents
212
- .filter(|child| match child.borrow().node {
213
- Text(_) | Element (_, _) => true,
214
- _ => false,
215
- }
216
- ) {
217
- // Recursion - Yay!
218
- walk(&new_indent, child.clone());
219
- }
220
- }
221
- ```
222
-
223
- For full source code check out: [`examples/xml_tree_printer.rs`](https://github.com/servo/html5ever/blob/main/rcdom/examples/xml_tree_printer.rs)
@@ -1,3 +0,0 @@
1
- <?xml version="1.0" encoding="UTF-8"?>
2
- <!DOCTYPE xml PUBLIC "http://www.wc3.org/">
3
- <student><first-name>Bobby</first-name><last-name>Tables</last-name></student>
@@ -1,81 +0,0 @@
1
- #!/usr/bin/env run-cargo-script
2
- //! This is a regular crate doc comment, but it also contains a partial
3
- //! Cargo manifest. Note the use of a *fenced* code block, and the
4
- //! `cargo` "language".
5
- //!
6
- //! ```cargo
7
- //! [dependencies]
8
- //! xml5ever = "0.1.1"
9
- //! tendril = "0.1.3"
10
- //! markup5ever = "0.7.4"
11
- //! ```
12
- extern crate markup5ever;
13
- extern crate xml5ever;
14
-
15
- use std::io;
16
-
17
- use markup5ever::buffer_queue::BufferQueue;
18
- use xml5ever::tendril::{ByteTendril, ReadExt};
19
- use xml5ever::tokenizer::{Doctype, Pi, ProcessResult, Token, TokenSink, XmlTokenizer};
20
-
21
- struct SimpleTokenPrinter;
22
-
23
- impl TokenSink for SimpleTokenPrinter {
24
- type Handle = ();
25
-
26
- fn process_token(&self, token: Token) -> ProcessResult<()> {
27
- match token {
28
- Token::Characters(b) => {
29
- println!("TEXT: {}", &*b);
30
- },
31
- Token::NullCharacter => print!("NULL"),
32
- Token::Tag(tag) => {
33
- println!("{:?} {} ", tag.kind, &*tag.name.local);
34
- },
35
- Token::ParseError(err) => {
36
- println!("ERROR: {err}");
37
- },
38
- Token::ProcessingInstruction(Pi {
39
- ref target,
40
- ref data,
41
- }) => {
42
- println!("PI : <?{target} {data}?>");
43
- },
44
- Token::Comment(ref comment) => {
45
- println!("<!--{comment:?}-->");
46
- },
47
- Token::EndOfFile => {
48
- println!("EOF");
49
- },
50
- Token::Doctype(Doctype {
51
- ref name,
52
- ref public_id,
53
- ..
54
- }) => {
55
- println!("<!DOCTYPE {name:?} {public_id:?}>");
56
- },
57
- };
58
- ProcessResult::Continue
59
- }
60
- }
61
-
62
- fn main() {
63
- // Our implementation of TokenSink
64
- let sink = SimpleTokenPrinter;
65
-
66
- // We need a ByteTendril to read a file
67
- let mut input = ByteTendril::new();
68
-
69
- // Using SliceExt.read_to_tendril we can read stdin
70
- io::stdin().read_to_tendril(&mut input).unwrap();
71
- // For xml5ever we need StrTendril, so we reinterpret it
72
- // into StrTendril.
73
-
74
- // Load input into BufferQueue
75
- let input_buffer = BufferQueue::default();
76
- input_buffer.push_back(input.try_reinterpret().unwrap());
77
- // Here we create and run tokenizer
78
- let tok = XmlTokenizer::new(sink, Default::default());
79
- let _ = tok.feed(&input_buffer);
80
- tok.end();
81
- }
@@ -1,115 +0,0 @@
1
- #!/usr/bin/env run-cargo-script
2
- //! This is a regular crate doc comment, but it also contains a partial
3
- //! Cargo manifest. Note the use of a *fenced* code block, and the
4
- //! `cargo` "language".
5
- //!
6
- //! ```cargo
7
- //! [dependencies]
8
- //! xml5ever = "0.2.0"
9
- //! tendril = "0.1.3"
10
- //! markup5ever = "0.7.4"
11
- //! ```
12
- extern crate markup5ever;
13
- extern crate xml5ever;
14
-
15
- use std::cell::Cell;
16
- use std::io;
17
-
18
- use markup5ever::buffer_queue::BufferQueue;
19
- use xml5ever::tendril::{ByteTendril, ReadExt};
20
- use xml5ever::tokenizer::{
21
- EmptyTag, EndTag, Pi, ProcessResult, ShortTag, StartTag, Token, TokenSink, XmlTokenizer,
22
- XmlTokenizerOpts,
23
- };
24
-
25
- #[derive(Clone)]
26
- struct TokenPrinter {
27
- in_char_run: Cell<bool>,
28
- }
29
-
30
- impl TokenPrinter {
31
- fn is_char(&self, is_char: bool) {
32
- match (self.in_char_run.get(), is_char) {
33
- (false, true) => print!("CHAR : \""),
34
- (true, false) => println!("\""),
35
- _ => (),
36
- }
37
- self.in_char_run.set(is_char);
38
- }
39
-
40
- fn do_char(&self, c: char) {
41
- self.is_char(true);
42
- print!("{}", c.escape_default().collect::<String>());
43
- }
44
- }
45
-
46
- impl TokenSink for TokenPrinter {
47
- type Handle = ();
48
-
49
- fn process_token(&self, token: Token) -> ProcessResult<()> {
50
- match token {
51
- Token::Characters(b) => {
52
- for c in b.chars() {
53
- self.do_char(c);
54
- }
55
- },
56
- Token::NullCharacter => self.do_char('\0'),
57
- Token::Tag(tag) => {
58
- self.is_char(false);
59
- // This is not proper HTML serialization, of course.
60
- match tag.kind {
61
- StartTag => print!("TAG : <\x1b[32m{}\x1b[0m", tag.name.local),
62
- EndTag => print!("END TAG : <\x1b[31m/{}\x1b[0m", tag.name.local),
63
- ShortTag => print!("Short TAG : <\x1b[31m/{}\x1b[0m", tag.name.local),
64
- EmptyTag => print!("Empty TAG : <\x1b[31m{}\x1b[0m", tag.name.local),
65
- }
66
- for attr in tag.attrs.iter() {
67
- print!(
68
- " \x1b[36m{}\x1b[0m='\x1b[34m{}\x1b[0m'",
69
- attr.name.local, attr.value
70
- );
71
- }
72
- if tag.kind == EmptyTag {
73
- print!("/");
74
- }
75
- println!(">");
76
- },
77
- Token::ParseError(err) => {
78
- self.is_char(false);
79
- println!("ERROR: {err}");
80
- },
81
- Token::ProcessingInstruction(Pi { target, data }) => {
82
- self.is_char(false);
83
- println!("PI : <?{target:?} {data:?}?>");
84
- },
85
- _ => {
86
- self.is_char(false);
87
- println!("OTHER: {token:?}");
88
- },
89
- };
90
-
91
- ProcessResult::Continue
92
- }
93
- }
94
-
95
- fn main() {
96
- let sink = TokenPrinter {
97
- in_char_run: Cell::new(false),
98
- };
99
- let mut input = ByteTendril::new();
100
- io::stdin().read_to_tendril(&mut input).unwrap();
101
- let input_buffer = BufferQueue::default();
102
- input_buffer.push_back(input.try_reinterpret().unwrap());
103
-
104
- let tok = XmlTokenizer::new(
105
- sink,
106
- XmlTokenizerOpts {
107
- profile: true,
108
- exact_errors: true,
109
- ..Default::default()
110
- },
111
- );
112
- let _ = tok.feed(&input_buffer);
113
- tok.end();
114
- tok.sink.is_char(false);
115
- }
@@ -1,90 +0,0 @@
1
- // Copyright 2014-2017 The html5ever Project Developers. See the
2
- // COPYRIGHT file at the top-level directory of this distribution.
3
- //
4
- // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5
- // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6
- // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7
- // option. This file may not be copied, modified, or distributed
8
- // except according to those terms.
9
-
10
- use crate::tokenizer::{XmlTokenizer, XmlTokenizerOpts};
11
- use crate::tree_builder::{TreeSink, XmlTreeBuilder, XmlTreeBuilderOpts};
12
-
13
- use std::borrow::Cow;
14
-
15
- use crate::tendril;
16
- use crate::tendril::stream::{TendrilSink, Utf8LossyDecoder};
17
- use crate::tendril::StrTendril;
18
- use markup5ever::buffer_queue::BufferQueue;
19
-
20
- /// All-encompasing parser setting structure.
21
- #[derive(Clone, Default)]
22
- pub struct XmlParseOpts {
23
- /// Xml tokenizer options.
24
- pub tokenizer: XmlTokenizerOpts,
25
- /// Xml tree builder .
26
- pub tree_builder: XmlTreeBuilderOpts,
27
- }
28
-
29
- /// Parse and send results to a `TreeSink`.
30
- ///
31
- /// ## Example
32
- ///
33
- /// ```ignore
34
- /// let mut sink = MySink;
35
- /// parse_document(&mut sink, iter::once(my_str), Default::default());
36
- /// ```
37
- pub fn parse_document<Sink>(sink: Sink, opts: XmlParseOpts) -> XmlParser<Sink>
38
- where
39
- Sink: TreeSink,
40
- {
41
- let tb = XmlTreeBuilder::new(sink, opts.tree_builder);
42
- let tok = XmlTokenizer::new(tb, opts.tokenizer);
43
- XmlParser {
44
- tokenizer: tok,
45
- input_buffer: BufferQueue::default(),
46
- }
47
- }
48
-
49
- /// An XML parser,
50
- /// ready to receive Unicode input through the `tendril::TendrilSink` trait’s methods.
51
- pub struct XmlParser<Sink>
52
- where
53
- Sink: TreeSink,
54
- {
55
- /// Tokenizer used by XmlParser.
56
- pub tokenizer: XmlTokenizer<XmlTreeBuilder<Sink::Handle, Sink>>,
57
- /// Input used by XmlParser.
58
- pub input_buffer: BufferQueue,
59
- }
60
-
61
- impl<Sink: TreeSink> TendrilSink<tendril::fmt::UTF8> for XmlParser<Sink> {
62
- type Output = Sink::Output;
63
-
64
- fn process(&mut self, t: StrTendril) {
65
- self.input_buffer.push_back(t);
66
- // FIXME: Properly support </script> somehow.
67
- let _ = self.tokenizer.feed(&self.input_buffer);
68
- }
69
-
70
- // FIXME: Is it too noisy to report every character decoding error?
71
- fn error(&mut self, desc: Cow<'static, str>) {
72
- self.tokenizer.sink.sink.parse_error(desc)
73
- }
74
-
75
- fn finish(self) -> Self::Output {
76
- self.tokenizer.end();
77
- self.tokenizer.sink.sink.finish()
78
- }
79
- }
80
-
81
- impl<Sink: TreeSink> XmlParser<Sink> {
82
- /// Wrap this parser into a `TendrilSink` that accepts UTF-8 bytes.
83
- ///
84
- /// Use this when your input is bytes that are known to be in the UTF-8 encoding.
85
- /// Decoding is lossy, like `String::from_utf8_lossy`.
86
- #[allow(clippy::wrong_self_convention)]
87
- pub fn from_utf8(self) -> Utf8LossyDecoder<Self> {
88
- Utf8LossyDecoder::new(self)
89
- }
90
- }
@@ -1,47 +0,0 @@
1
- // Copyright 2014-2017 The html5ever Project Developers. See the
2
- // COPYRIGHT file at the top-level directory of this distribution.
3
- //
4
- // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5
- // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6
- // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7
- // option. This file may not be copied, modified, or distributed
8
- // except according to those terms.
9
-
10
- //! This crate provides a push based XML parser library that
11
- //! adheres to XML5 specification. In other words this library
12
- //! trades well-formedness for error recovery.
13
- //!
14
- //! The idea behind this, was to minimize number of errors from
15
- //! tools that generate XML (e.g. `&#83` won't just return `&#83`
16
- //! as text, but will parse it into `S` ).
17
- //! You can check out full specification [here](https://ygg01.github.io/xml5_draft/).
18
- //!
19
- //! What this library provides is a solid XML parser that can:
20
- //!
21
- //! * Parse somewhat erroneous XML input
22
- //! * Provide support for [Numeric character references](https://en.wikipedia.org/wiki/Numeric_character_reference).
23
- //! * Provide partial [XML namespace](http://www.w3.org/TR/xml-names11/) support.
24
- //! * Provide full set of SVG/MathML entities
25
- //!
26
- //! What isn't in scope for this library:
27
- //!
28
- //! * Document Type Definition parsing - this is pretty hard to do right and nowadays, its used
29
- //!
30
-
31
- #![crate_name = "xml5ever"]
32
- #![crate_type = "dylib"]
33
- #![allow(unexpected_cfgs)]
34
- #![deny(missing_docs)]
35
-
36
- pub use markup5ever::*;
37
-
38
- pub(crate) mod macros;
39
-
40
- /// Driver
41
- pub mod driver;
42
- /// Serializer for XML5.
43
- pub mod serialize;
44
- /// XML5 tokenizer - converts input into tokens
45
- pub mod tokenizer;
46
- /// XML5 tree builder - converts tokens into a tree like structure
47
- pub mod tree_builder;
@@ -1,18 +0,0 @@
1
- // Copyright 2014-2017 The html5ever Project Developers. See the
2
- // COPYRIGHT file at the top-level directory of this distribution.
3
- //
4
- // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5
- // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6
- // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7
- // option. This file may not be copied, modified, or distributed
8
- // except according to those terms.
9
-
10
- macro_rules! time {
11
- ($e:expr) => {{
12
- let t0 = ::std::time::Instant::now();
13
- let result = $e;
14
- let dt = t0.elapsed().as_nanos() as u64;
15
- (result, dt)
16
- }};
17
- }
18
- pub(crate) use time;