html-to-markdown 2.24.6 → 2.25.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/html-to-markdown-rb/native/Cargo.lock +9 -32
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -1
- data/lib/html_to_markdown/version.rb +1 -1
- data/rust-vendor/html-to-markdown-rs/Cargo.toml +0 -1
- data/rust-vendor/html-to-markdown-rs/src/converter/main_helpers.rs +1 -1
- data/rust-vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +20 -5
- data/rust-vendor/html-to-markdown-rs/src/lib.rs +1 -0
- data/rust-vendor/{markup5ever_rcdom/lib.rs → html-to-markdown-rs/src/rcdom.rs} +56 -91
- data/rust-vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +157 -0
- data/rust-vendor/memmap2/.cargo-checksum.json +1 -1
- data/rust-vendor/memmap2/.cargo_vcs_info.json +1 -1
- data/rust-vendor/memmap2/CHANGELOG.md +8 -0
- data/rust-vendor/memmap2/Cargo.lock +1 -1
- data/rust-vendor/memmap2/Cargo.toml +2 -1
- data/rust-vendor/memmap2/Cargo.toml.orig +2 -1
- data/rust-vendor/memmap2/src/lib.rs +25 -1
- data/rust-vendor/memmap2/src/stub.rs +1 -4
- data/rust-vendor/memmap2/src/unix.rs +14 -1
- data/rust-vendor/png/.cargo-checksum.json +1 -1
- data/rust-vendor/png/.cargo_vcs_info.json +1 -1
- data/rust-vendor/png/CHANGES.md +44 -0
- data/rust-vendor/png/Cargo.lock +124 -171
- data/rust-vendor/png/Cargo.toml +1 -1
- data/rust-vendor/png/Cargo.toml.orig +1 -1
- data/rust-vendor/png/benches/expand_paletted.rs +5 -5
- data/rust-vendor/png/benches/unfilter.rs +3 -3
- data/rust-vendor/png/src/adam7.rs +17 -10
- data/rust-vendor/png/src/common.rs +8 -8
- data/rust-vendor/png/src/decoder/mod.rs +53 -20
- data/rust-vendor/png/src/decoder/stream.rs +263 -78
- data/rust-vendor/png/src/decoder/unfiltering_buffer.rs +210 -53
- data/rust-vendor/png/src/decoder/zlib.rs +130 -90
- data/rust-vendor/png/src/encoder.rs +4 -2
- data/rust-vendor/png/src/{filter.rs → filter/mod.rs} +100 -367
- data/rust-vendor/png/src/filter/optimization-notes.md +104 -0
- data/rust-vendor/png/src/filter/paeth.rs +398 -0
- data/rust-vendor/png/src/filter/simd.rs +308 -0
- data/rust-vendor/png/src/lib.rs +1 -0
- data/rust-vendor/syn/.cargo-checksum.json +1 -1
- data/rust-vendor/syn/.cargo_vcs_info.json +1 -1
- data/rust-vendor/syn/Cargo.lock +40 -41
- data/rust-vendor/syn/Cargo.toml +1 -1
- data/rust-vendor/syn/Cargo.toml.orig +1 -1
- data/rust-vendor/syn/src/item.rs +61 -40
- data/rust-vendor/syn/src/lib.rs +2 -1
- data/rust-vendor/syn/tests/test_item.rs +54 -0
- data/rust-vendor/unicode-ident/.cargo-checksum.json +1 -1
- data/rust-vendor/unicode-ident/.cargo_vcs_info.json +1 -1
- data/rust-vendor/unicode-ident/Cargo.lock +21 -21
- data/rust-vendor/unicode-ident/Cargo.toml +1 -1
- data/rust-vendor/unicode-ident/Cargo.toml.orig +1 -1
- data/rust-vendor/unicode-ident/src/lib.rs +1 -1
- data/rust-vendor/unicode-ident/src/tables.rs +87 -97
- data/rust-vendor/unicode-ident/tests/static_size.rs +1 -1
- metadata +7 -177
- data/rust-vendor/markup5ever_rcdom/.cargo-checksum.json +0 -1
- data/rust-vendor/markup5ever_rcdom/.cargo_vcs_info.json +0 -7
- data/rust-vendor/markup5ever_rcdom/Cargo.lock +0 -658
- data/rust-vendor/markup5ever_rcdom/Cargo.toml +0 -109
- data/rust-vendor/markup5ever_rcdom/Cargo.toml.orig +0 -42
- data/rust-vendor/markup5ever_rcdom/LICENSE-APACHE +0 -201
- data/rust-vendor/markup5ever_rcdom/LICENSE-MIT +0 -25
- data/rust-vendor/markup5ever_rcdom/README.md +0 -7
- data/rust-vendor/markup5ever_rcdom/custom-html5lib-tokenizer-tests/regression.test +0 -69
- data/rust-vendor/markup5ever_rcdom/data/test/ignore +0 -1
- data/rust-vendor/markup5ever_rcdom/examples/hello_xml.rs +0 -39
- data/rust-vendor/markup5ever_rcdom/examples/html2html.rs +0 -51
- data/rust-vendor/markup5ever_rcdom/examples/print-rcdom.rs +0 -78
- data/rust-vendor/markup5ever_rcdom/examples/xml_tree_printer.rs +0 -67
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/.gitattributes +0 -2
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/.github/workflows/downstream.yml +0 -76
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/.github/workflows/lint.yml +0 -25
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/.gitignore +0 -79
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/AUTHORS.rst +0 -34
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/LICENSE +0 -21
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/chardet/test_big5.txt +0 -51
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/scripted/tests1.dat +0 -5
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/test-yahoo-jp.dat +0 -10
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/tests1.dat +0 -388
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/tests2.dat +0 -115
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint +0 -6
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/__init__.py +0 -0
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/__init__.py +0 -0
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/LICENSE +0 -18
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/__init__.py +0 -0
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/lexer.py +0 -211
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/lexer.pyi +0 -34
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/parser.py +0 -872
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/parser.pyi +0 -83
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/py.typed +0 -0
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/util.py +0 -72
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/util.pyi +0 -7
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/vendor.txt +0 -1
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor-patches/funcparserlib.patch +0 -24
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/lint.py +0 -280
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/parser.py +0 -177
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/pyproject.toml +0 -7
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/core.test +0 -125
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/injectmeta.test +0 -66
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/optionaltags.test +0 -965
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/options.test +0 -60
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/whitespace.test +0 -51
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/README.md +0 -107
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/contentModelFlags.test +0 -93
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/domjs.test +0 -335
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/entities.test +0 -542
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/escapeFlag.test +0 -36
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/namedEntities.test +0 -42422
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/numericEntities.test +0 -1677
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/pendingSpecChanges.test +0 -9
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test1.test +0 -353
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test2.test +0 -275
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test3.test +0 -11233
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test4.test +0 -532
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/unicodeChars.test +0 -1577
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/unicodeCharsProblematic.test +0 -41
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/xmlViolation.test +0 -20
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/README.md +0 -108
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/adoption01.dat +0 -354
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/adoption02.dat +0 -39
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/blocks.dat +0 -695
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/comments01.dat +0 -217
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/doctype01.dat +0 -474
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/domjs-unsafe.dat +0 -0
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/entities01.dat +0 -943
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/entities02.dat +0 -309
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/foreign-fragment.dat +0 -645
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/html5test-com.dat +0 -301
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/inbody01.dat +0 -54
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/isindex.dat +0 -49
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/main-element.dat +0 -46
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/math.dat +0 -104
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/menuitem-element.dat +0 -240
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/namespace-sensitivity.dat +0 -22
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/noscript01.dat +0 -237
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/pending-spec-changes-plain-text-unsafe.dat +0 -0
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/pending-spec-changes.dat +0 -46
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/plain-text-unsafe.dat +0 -0
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/quirks01.dat +0 -53
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/ruby.dat +0 -302
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scriptdata01.dat +0 -372
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scripted/adoption01.dat +0 -16
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scripted/ark.dat +0 -27
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scripted/webkit01.dat +0 -30
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/search-element.dat +0 -46
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/svg.dat +0 -104
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tables01.dat +0 -322
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/template.dat +0 -1673
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests1.dat +0 -1956
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests10.dat +0 -849
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests11.dat +0 -523
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests12.dat +0 -62
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests14.dat +0 -75
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests15.dat +0 -216
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests16.dat +0 -2602
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests17.dat +0 -179
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests18.dat +0 -558
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests19.dat +0 -1398
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests2.dat +0 -831
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests20.dat +0 -842
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests21.dat +0 -306
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests22.dat +0 -190
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests23.dat +0 -168
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests24.dat +0 -79
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests25.dat +0 -288
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests26.dat +0 -453
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests3.dat +0 -305
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests4.dat +0 -74
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests5.dat +0 -210
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests6.dat +0 -663
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests7.dat +0 -453
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests8.dat +0 -165
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests9.dat +0 -472
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests_innerHTML_1.dat +0 -843
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tricky01.dat +0 -336
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/webkit01.dat +0 -785
- data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/webkit02.dat +0 -554
- data/rust-vendor/markup5ever_rcdom/tests/foreach_html5lib_test/mod.rs +0 -41
- data/rust-vendor/markup5ever_rcdom/tests/html-driver.rs +0 -29
- data/rust-vendor/markup5ever_rcdom/tests/html-serializer.rs +0 -265
- data/rust-vendor/markup5ever_rcdom/tests/html-tokenizer.rs +0 -487
- data/rust-vendor/markup5ever_rcdom/tests/html-tree-builder.rs +0 -298
- data/rust-vendor/markup5ever_rcdom/tests/html-tree-sink.rs +0 -141
- data/rust-vendor/markup5ever_rcdom/tests/util/find_tests.rs +0 -34
- data/rust-vendor/markup5ever_rcdom/tests/util/runner.rs +0 -48
- data/rust-vendor/markup5ever_rcdom/tests/xml-driver.rs +0 -101
- data/rust-vendor/markup5ever_rcdom/tests/xml-tokenizer.rs +0 -374
- data/rust-vendor/markup5ever_rcdom/tests/xml-tree-builder.rs +0 -237
- data/rust-vendor/markup5ever_rcdom/xml5lib-tests/AUTHORS.rst +0 -9
- data/rust-vendor/markup5ever_rcdom/xml5lib-tests/LICENSE +0 -21
- data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/README.md +0 -92
- data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/comments.test +0 -274
- data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/doctype.test +0 -3232
- data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/entities.test +0 -283
- data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/eof.test +0 -113
- data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/namedEntities.test +0 -42210
- data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/numericEntities.test +0 -1349
- data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/test1.test +0 -162
- data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/test2.test +0 -64
- data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/unicodeChars.test +0 -1295
- data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tree-construction/README.md +0 -104
- data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tree-construction/namespace.dat +0 -119
- data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tree-construction/test1.dat +0 -124
- data/rust-vendor/xml5ever/.cargo-checksum.json +0 -1
- data/rust-vendor/xml5ever/.cargo_vcs_info.json +0 -6
- data/rust-vendor/xml5ever/Cargo.lock +0 -752
- data/rust-vendor/xml5ever/Cargo.toml +0 -69
- data/rust-vendor/xml5ever/Cargo.toml.orig +0 -29
- data/rust-vendor/xml5ever/LICENSE-APACHE +0 -201
- data/rust-vendor/xml5ever/LICENSE-MIT +0 -25
- data/rust-vendor/xml5ever/README.md +0 -72
- data/rust-vendor/xml5ever/benches/xml5ever.rs +0 -77
- data/rust-vendor/xml5ever/data/bench/strong.xml +0 -1
- data/rust-vendor/xml5ever/examples/README.md +0 -223
- data/rust-vendor/xml5ever/examples/example.xml +0 -3
- data/rust-vendor/xml5ever/examples/simple_xml_tokenizer.rs +0 -81
- data/rust-vendor/xml5ever/examples/xml_tokenizer.rs +0 -115
- data/rust-vendor/xml5ever/src/driver.rs +0 -90
- data/rust-vendor/xml5ever/src/lib.rs +0 -47
- data/rust-vendor/xml5ever/src/macros.rs +0 -18
- data/rust-vendor/xml5ever/src/serialize/mod.rs +0 -216
- data/rust-vendor/xml5ever/src/tokenizer/char_ref/mod.rs +0 -456
- data/rust-vendor/xml5ever/src/tokenizer/interface.rs +0 -116
- data/rust-vendor/xml5ever/src/tokenizer/mod.rs +0 -1344
- data/rust-vendor/xml5ever/src/tokenizer/qname.rs +0 -84
- data/rust-vendor/xml5ever/src/tokenizer/states.rs +0 -167
- data/rust-vendor/xml5ever/src/tree_builder/mod.rs +0 -774
- data/rust-vendor/xml5ever/src/tree_builder/types.rs +0 -37
|
@@ -1,223 +0,0 @@
|
|
|
1
|
-
# Examples
|
|
2
|
-
|
|
3
|
-
The examples have been designed with [`cargo-script`](https://github.com/DanielKeep/cargo-script) in mind.
|
|
4
|
-
|
|
5
|
-
Here I'll just give a broad overview of how to install [`cargo script`] for Rust 1.5. For more details, check out the [cargo-script repository](https://github.com/DanielKeep/cargo-script).
|
|
6
|
-
|
|
7
|
-
cargo install cargo-script
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
# Token printer
|
|
11
|
-
|
|
12
|
-
The basis of xml5ever is its tokenizer and tree builder. Roughly speaking, the tokenizer
|
|
13
|
-
takes input and returns a set of tokens like comment, processing instruction, start
|
|
14
|
-
tag, end tag, etc.
|
|
15
|
-
|
|
16
|
-
First let's define our dependencies:
|
|
17
|
-
|
|
18
|
-
```toml
|
|
19
|
-
[dependencies]
|
|
20
|
-
xml5ever = "0.2.0"
|
|
21
|
-
tendril = "0.1.3"
|
|
22
|
-
```
|
|
23
|
-
|
|
24
|
-
With dependencies declared, we can now make a simple tokenizer sink. First step is to
|
|
25
|
-
define a [`TokenSink`](https://docs.rs/xml5ever/latest/xml5ever/tokenizer/trait.TokenSink.html). A [`TokenSink`](https://docs.rs/xml5ever/latest/xml5ever/tokenizer/trait.TokenSink.html) is a trait that receives a stream of [`Tokens`](https://docs.rs/xml5ever/latest/xml5ever/tokenizer/enum.Token.html).
|
|
26
|
-
|
|
27
|
-
In our case we'll define a unit struct (i.e. a struct without any fields).
|
|
28
|
-
|
|
29
|
-
```rust
|
|
30
|
-
struct SimpleTokenPrinter;
|
|
31
|
-
```
|
|
32
|
-
|
|
33
|
-
To make `SimpleTokenPrinter` a [`TokenSink`](https://docs.rs/xml5ever/latest/xml5ever/tokenizer/trait.TokenSink.html), we need to implement the [`process_token`](https://docs.rs/xml5ever/latest/xml5ever/tokenizer/trait.TokenSink.html#tymethod.process_token) method.
|
|
34
|
-
|
|
35
|
-
```rust
|
|
36
|
-
impl TokenSink for SimpleTokenPrinter {
|
|
37
|
-
fn process_token(&mut self, token: Token) {
|
|
38
|
-
match token {
|
|
39
|
-
CharacterTokens(b) => {
|
|
40
|
-
println!("TEXT: {}", &*b);
|
|
41
|
-
},
|
|
42
|
-
NullCharacterToken => print!("NULL"),
|
|
43
|
-
TagToken(tag) => {
|
|
44
|
-
println!("{:?} {} ", tag.kind, &*tag.name.local);
|
|
45
|
-
},
|
|
46
|
-
ParseError(err) => {
|
|
47
|
-
println!("ERROR: {}", err);
|
|
48
|
-
},
|
|
49
|
-
PIToken(Pi{ref target, ref data}) => {
|
|
50
|
-
println!("PI : <?{} {}?>", &*target, &*data);
|
|
51
|
-
},
|
|
52
|
-
CommentToken(ref comment) => {
|
|
53
|
-
println!("<!--{:?}-->", &*comment);
|
|
54
|
-
},
|
|
55
|
-
EOFToken => {
|
|
56
|
-
println!("EOF");
|
|
57
|
-
},
|
|
58
|
-
DoctypeToken(Doctype{ref name, ref public_id, ..}) => {
|
|
59
|
-
println!("<!DOCTYPE {:?} {:?}>", &*name, &*public_id);
|
|
60
|
-
}
|
|
61
|
-
}
|
|
62
|
-
}
|
|
63
|
-
}
|
|
64
|
-
```
|
|
65
|
-
|
|
66
|
-
Now, we need some input to process. For input we'll use `stdin`. However, xml5ever's `tokenize_to` method only takes a `StrTendril`. So we need to construct a
|
|
67
|
-
[`ByteTendril`](https://docs.rs/tendril/latest/tendril/type.ByteTendril.html) using `ByteTendril::new()`, then read the `stdin` using [`read_to_tendril`](https://docs.rs/tendril/latest/tendril/trait.ReadExt.html#tymethod.read_to_tendril) extension.
|
|
68
|
-
|
|
69
|
-
Once that is set, to make `SimpleTokenPrinter` parse the input, call,
|
|
70
|
-
`tokenize_to` with it as the first parameter, the input wrapped in `Option` as the second parameter, and `XmlTokenizerOpts` as the third.
|
|
71
|
-
|
|
72
|
-
```rust
|
|
73
|
-
fn main() {
|
|
74
|
-
let sink = SimpleTokenPrinter;
|
|
75
|
-
|
|
76
|
-
// We need a ByteTendril to read a file
|
|
77
|
-
let mut input = ByteTendril::new();
|
|
78
|
-
// Using SliceExt.read_to_tendril we read stdin
|
|
79
|
-
io::stdin().read_to_tendril(&mut input).unwrap();
|
|
80
|
-
// For xml5ever we need StrTendril, so we reinterpret it
|
|
81
|
-
// into StrTendril.
|
|
82
|
-
//
|
|
83
|
-
// You might wonder, how does `try_reinterpret` know we
|
|
84
|
-
// need StrTendril and the answer is type inference based
|
|
85
|
-
// on `tokenize_xml_to` signature.
|
|
86
|
-
let input = input.try_reinterpret().unwrap();
|
|
87
|
-
// Here we create and run tokenizer
|
|
88
|
-
let mut tok = XmlTokenizer::new(sink, Default::default());
|
|
89
|
-
// We pass input to parsed.
|
|
90
|
-
tok.feed(input);
|
|
91
|
-
|
|
92
|
-
// tok.end must be invoked for final bytes to be processed.
|
|
93
|
-
tok.end();
|
|
94
|
-
}
|
|
95
|
-
```
|
|
96
|
-
|
|
97
|
-
NOTE: `unwrap` panics on failure; it's only OK to use in simple examples.
|
|
98
|
-
|
|
99
|
-
For full source code check out: [`examples/simple_xml_tokenizer.rs`](https://github.com/servo/html5ever/blob/main/xml5ever/examples/simple_xml_tokenizer.rs)
|
|
100
|
-
|
|
101
|
-
Once we have successfully compiled the example we run the example with inline
|
|
102
|
-
xml
|
|
103
|
-
|
|
104
|
-
```bash
|
|
105
|
-
cargo script simple_xml_tokenizer.rs <<< "<xml>Text with <b>bold words</b>!</xml>"
|
|
106
|
-
```
|
|
107
|
-
|
|
108
|
-
or by sending the [`examples/example.xml`](https://github.com/servo/html5ever/blob/main/xml5ever/examples/example.xml) file located in the same folder as the examples.
|
|
109
|
-
|
|
110
|
-
```bash
|
|
111
|
-
cargo script simple_xml_tokenizer.rs < example.xml
|
|
112
|
-
```
|
|
113
|
-
|
|
114
|
-
# Tree printer
|
|
115
|
-
|
|
116
|
-
To actually get an XML document tree from xml5ever, you need to use a `TreeSink`.
|
|
117
|
-
`TreeSink` is in many ways similar to `TokenSink`. Basically, `TokenSink` takes data
|
|
118
|
-
and returns list of tokens, while TreeSink takes tokens and returns a tree of parsed
|
|
119
|
-
XML document. Do note that this is a simplified explanation; consult the
|
|
120
|
-
documentation for more info.
|
|
121
|
-
|
|
122
|
-
Ok, with that in mind, let's build us a TreePrinter. For example if we get an XML
|
|
123
|
-
file like:
|
|
124
|
-
|
|
125
|
-
```xml
|
|
126
|
-
<student>
|
|
127
|
-
<first-name>Bobby</first-name>
|
|
128
|
-
<last-name>Tables</last-name>
|
|
129
|
-
</student>
|
|
130
|
-
```
|
|
131
|
-
|
|
132
|
-
We'd want a structure similar to this:
|
|
133
|
-
|
|
134
|
-
```
|
|
135
|
-
#document
|
|
136
|
-
student
|
|
137
|
-
first-name
|
|
138
|
-
#text Bobby
|
|
139
|
-
last-name
|
|
140
|
-
#text Tables
|
|
141
|
-
|
|
142
|
-
```
|
|
143
|
-
We won't print anything other than element names and text fields. So comments,
|
|
144
|
-
doctypes and other such elements are ignored.
|
|
145
|
-
|
|
146
|
-
First part is similar to making SimpleTokenPrinter:
|
|
147
|
-
|
|
148
|
-
```rust
|
|
149
|
-
// We need to allocate an input tendril for xml5ever
|
|
150
|
-
let mut input = ByteTendril::new();
|
|
151
|
-
// Using SliceExt.read_to_tendril functions we can read stdin
|
|
152
|
-
io::stdin().read_to_tendril(&mut input).unwrap();
|
|
153
|
-
let input = input.try_reinterpret().unwrap();
|
|
154
|
-
```
|
|
155
|
-
|
|
156
|
-
This time, we need an implementation of [`TreeSink`](https://docs.rs/xml5ever/latest/xml5ever/tree_builder/trait.TreeSink.html). xml5ever comes with a
|
|
157
|
-
built-in `TreeSink` implementation called [`RcDom`](https://docs.rs/markup5ever_rcdom/latest/markup5ever_rcdom/struct.RcDom.html). To process input into
|
|
158
|
-
a `TreeSink` we use the following line:
|
|
159
|
-
|
|
160
|
-
```rust
|
|
161
|
-
let dom: RcDom = parse(one_input(input), Default::default());
|
|
162
|
-
```
|
|
163
|
-
|
|
164
|
-
Let's analyze it a bit. First there is `let dom: RcDom`. We need this part,
|
|
165
|
-
because the type inferencer can't infer which TreeSink implementation we mean
|
|
166
|
-
in this scenario.
|
|
167
|
-
|
|
168
|
-
Function [`one_input`](https://ygg01.github.io/docs/xml5ever/xml5ever/fn.one_input.html) is a convenience function that turns any value into an iterator. In this case
|
|
169
|
-
it converts a StrTendril into an Iterator over itself.
|
|
170
|
-
|
|
171
|
-
Ok, so now that we have parsed our tree, what do we do with it? Well, for that we might need some
|
|
172
|
-
kind of function that will help us traverse it. We shall call that function `walk`.
|
|
173
|
-
|
|
174
|
-
```rust
|
|
175
|
-
fn walk(prefix: &str, handle: Handle) {
|
|
176
|
-
let node = handle.borrow();
|
|
177
|
-
|
|
178
|
-
// We print out the prefix before we start
|
|
179
|
-
print!("{}", prefix);
|
|
180
|
-
// We are only interested in following nodes:
|
|
181
|
-
// Document, Text and Element, so our match
|
|
182
|
-
// reflects that.
|
|
183
|
-
match node.node {
|
|
184
|
-
Document
|
|
185
|
-
=> println!("#document"),
|
|
186
|
-
|
|
187
|
-
Text(ref text) => {
|
|
188
|
-
println!("#text {}", text.escape_default())
|
|
189
|
-
},
|
|
190
|
-
|
|
191
|
-
Element(ref name, _) => {
|
|
192
|
-
println!("{}", name.local);
|
|
193
|
-
},
|
|
194
|
-
|
|
195
|
-
_ => {},
|
|
196
|
-
|
|
197
|
-
}
|
|
198
|
-
|
|
199
|
-
// We increase indent in child nodes
|
|
200
|
-
let new_indent = {
|
|
201
|
-
let mut temp = String::new();
|
|
202
|
-
temp.push_str(prefix);
|
|
203
|
-
temp.push_str(" ");
|
|
204
|
-
temp
|
|
205
|
-
};
|
|
206
|
-
|
|
207
|
-
for child in node.children.iter()
|
|
208
|
-
// In order to avoid weird indentation, we filter
|
|
209
|
-
// only Text/Element nodes.
|
|
210
|
-
// We don't need to filter Document since its guaranteed
|
|
211
|
-
// child elements don't contain documents
|
|
212
|
-
.filter(|child| match child.borrow().node {
|
|
213
|
-
Text(_) | Element (_, _) => true,
|
|
214
|
-
_ => false,
|
|
215
|
-
}
|
|
216
|
-
) {
|
|
217
|
-
// Recursion - Yay!
|
|
218
|
-
walk(&new_indent, child.clone());
|
|
219
|
-
}
|
|
220
|
-
}
|
|
221
|
-
```
|
|
222
|
-
|
|
223
|
-
For full source code check out: [`examples/xml_tree_printer.rs`](https://github.com/servo/html5ever/blob/main/rcdom/examples/xml_tree_printer.rs)
|
|
@@ -1,81 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env run-cargo-script
|
|
2
|
-
//! This is a regular crate doc comment, but it also contains a partial
|
|
3
|
-
//! Cargo manifest. Note the use of a *fenced* code block, and the
|
|
4
|
-
//! `cargo` "language".
|
|
5
|
-
//!
|
|
6
|
-
//! ```cargo
|
|
7
|
-
//! [dependencies]
|
|
8
|
-
//! xml5ever = "0.1.1"
|
|
9
|
-
//! tendril = "0.1.3"
|
|
10
|
-
//! markup5ever = "0.7.4"
|
|
11
|
-
//! ```
|
|
12
|
-
extern crate markup5ever;
|
|
13
|
-
extern crate xml5ever;
|
|
14
|
-
|
|
15
|
-
use std::io;
|
|
16
|
-
|
|
17
|
-
use markup5ever::buffer_queue::BufferQueue;
|
|
18
|
-
use xml5ever::tendril::{ByteTendril, ReadExt};
|
|
19
|
-
use xml5ever::tokenizer::{Doctype, Pi, ProcessResult, Token, TokenSink, XmlTokenizer};
|
|
20
|
-
|
|
21
|
-
struct SimpleTokenPrinter;
|
|
22
|
-
|
|
23
|
-
impl TokenSink for SimpleTokenPrinter {
|
|
24
|
-
type Handle = ();
|
|
25
|
-
|
|
26
|
-
fn process_token(&self, token: Token) -> ProcessResult<()> {
|
|
27
|
-
match token {
|
|
28
|
-
Token::Characters(b) => {
|
|
29
|
-
println!("TEXT: {}", &*b);
|
|
30
|
-
},
|
|
31
|
-
Token::NullCharacter => print!("NULL"),
|
|
32
|
-
Token::Tag(tag) => {
|
|
33
|
-
println!("{:?} {} ", tag.kind, &*tag.name.local);
|
|
34
|
-
},
|
|
35
|
-
Token::ParseError(err) => {
|
|
36
|
-
println!("ERROR: {err}");
|
|
37
|
-
},
|
|
38
|
-
Token::ProcessingInstruction(Pi {
|
|
39
|
-
ref target,
|
|
40
|
-
ref data,
|
|
41
|
-
}) => {
|
|
42
|
-
println!("PI : <?{target} {data}?>");
|
|
43
|
-
},
|
|
44
|
-
Token::Comment(ref comment) => {
|
|
45
|
-
println!("<!--{comment:?}-->");
|
|
46
|
-
},
|
|
47
|
-
Token::EndOfFile => {
|
|
48
|
-
println!("EOF");
|
|
49
|
-
},
|
|
50
|
-
Token::Doctype(Doctype {
|
|
51
|
-
ref name,
|
|
52
|
-
ref public_id,
|
|
53
|
-
..
|
|
54
|
-
}) => {
|
|
55
|
-
println!("<!DOCTYPE {name:?} {public_id:?}>");
|
|
56
|
-
},
|
|
57
|
-
};
|
|
58
|
-
ProcessResult::Continue
|
|
59
|
-
}
|
|
60
|
-
}
|
|
61
|
-
|
|
62
|
-
fn main() {
|
|
63
|
-
// Our implementation of TokenSink
|
|
64
|
-
let sink = SimpleTokenPrinter;
|
|
65
|
-
|
|
66
|
-
// We need a ByteTendril to read a file
|
|
67
|
-
let mut input = ByteTendril::new();
|
|
68
|
-
|
|
69
|
-
// Using SliceExt.read_to_tendril we can read stdin
|
|
70
|
-
io::stdin().read_to_tendril(&mut input).unwrap();
|
|
71
|
-
// For xml5ever we need StrTendril, so we reinterpret it
|
|
72
|
-
// into StrTendril.
|
|
73
|
-
|
|
74
|
-
// Load input into BufferQueue
|
|
75
|
-
let input_buffer = BufferQueue::default();
|
|
76
|
-
input_buffer.push_back(input.try_reinterpret().unwrap());
|
|
77
|
-
// Here we create and run tokenizer
|
|
78
|
-
let tok = XmlTokenizer::new(sink, Default::default());
|
|
79
|
-
let _ = tok.feed(&input_buffer);
|
|
80
|
-
tok.end();
|
|
81
|
-
}
|
|
@@ -1,115 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env run-cargo-script
|
|
2
|
-
//! This is a regular crate doc comment, but it also contains a partial
|
|
3
|
-
//! Cargo manifest. Note the use of a *fenced* code block, and the
|
|
4
|
-
//! `cargo` "language".
|
|
5
|
-
//!
|
|
6
|
-
//! ```cargo
|
|
7
|
-
//! [dependencies]
|
|
8
|
-
//! xml5ever = "0.2.0"
|
|
9
|
-
//! tendril = "0.1.3"
|
|
10
|
-
//! markup5ever = "0.7.4"
|
|
11
|
-
//! ```
|
|
12
|
-
extern crate markup5ever;
|
|
13
|
-
extern crate xml5ever;
|
|
14
|
-
|
|
15
|
-
use std::cell::Cell;
|
|
16
|
-
use std::io;
|
|
17
|
-
|
|
18
|
-
use markup5ever::buffer_queue::BufferQueue;
|
|
19
|
-
use xml5ever::tendril::{ByteTendril, ReadExt};
|
|
20
|
-
use xml5ever::tokenizer::{
|
|
21
|
-
EmptyTag, EndTag, Pi, ProcessResult, ShortTag, StartTag, Token, TokenSink, XmlTokenizer,
|
|
22
|
-
XmlTokenizerOpts,
|
|
23
|
-
};
|
|
24
|
-
|
|
25
|
-
#[derive(Clone)]
|
|
26
|
-
struct TokenPrinter {
|
|
27
|
-
in_char_run: Cell<bool>,
|
|
28
|
-
}
|
|
29
|
-
|
|
30
|
-
impl TokenPrinter {
|
|
31
|
-
fn is_char(&self, is_char: bool) {
|
|
32
|
-
match (self.in_char_run.get(), is_char) {
|
|
33
|
-
(false, true) => print!("CHAR : \""),
|
|
34
|
-
(true, false) => println!("\""),
|
|
35
|
-
_ => (),
|
|
36
|
-
}
|
|
37
|
-
self.in_char_run.set(is_char);
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
fn do_char(&self, c: char) {
|
|
41
|
-
self.is_char(true);
|
|
42
|
-
print!("{}", c.escape_default().collect::<String>());
|
|
43
|
-
}
|
|
44
|
-
}
|
|
45
|
-
|
|
46
|
-
impl TokenSink for TokenPrinter {
|
|
47
|
-
type Handle = ();
|
|
48
|
-
|
|
49
|
-
fn process_token(&self, token: Token) -> ProcessResult<()> {
|
|
50
|
-
match token {
|
|
51
|
-
Token::Characters(b) => {
|
|
52
|
-
for c in b.chars() {
|
|
53
|
-
self.do_char(c);
|
|
54
|
-
}
|
|
55
|
-
},
|
|
56
|
-
Token::NullCharacter => self.do_char('\0'),
|
|
57
|
-
Token::Tag(tag) => {
|
|
58
|
-
self.is_char(false);
|
|
59
|
-
// This is not proper HTML serialization, of course.
|
|
60
|
-
match tag.kind {
|
|
61
|
-
StartTag => print!("TAG : <\x1b[32m{}\x1b[0m", tag.name.local),
|
|
62
|
-
EndTag => print!("END TAG : <\x1b[31m/{}\x1b[0m", tag.name.local),
|
|
63
|
-
ShortTag => print!("Short TAG : <\x1b[31m/{}\x1b[0m", tag.name.local),
|
|
64
|
-
EmptyTag => print!("Empty TAG : <\x1b[31m{}\x1b[0m", tag.name.local),
|
|
65
|
-
}
|
|
66
|
-
for attr in tag.attrs.iter() {
|
|
67
|
-
print!(
|
|
68
|
-
" \x1b[36m{}\x1b[0m='\x1b[34m{}\x1b[0m'",
|
|
69
|
-
attr.name.local, attr.value
|
|
70
|
-
);
|
|
71
|
-
}
|
|
72
|
-
if tag.kind == EmptyTag {
|
|
73
|
-
print!("/");
|
|
74
|
-
}
|
|
75
|
-
println!(">");
|
|
76
|
-
},
|
|
77
|
-
Token::ParseError(err) => {
|
|
78
|
-
self.is_char(false);
|
|
79
|
-
println!("ERROR: {err}");
|
|
80
|
-
},
|
|
81
|
-
Token::ProcessingInstruction(Pi { target, data }) => {
|
|
82
|
-
self.is_char(false);
|
|
83
|
-
println!("PI : <?{target:?} {data:?}?>");
|
|
84
|
-
},
|
|
85
|
-
_ => {
|
|
86
|
-
self.is_char(false);
|
|
87
|
-
println!("OTHER: {token:?}");
|
|
88
|
-
},
|
|
89
|
-
};
|
|
90
|
-
|
|
91
|
-
ProcessResult::Continue
|
|
92
|
-
}
|
|
93
|
-
}
|
|
94
|
-
|
|
95
|
-
fn main() {
|
|
96
|
-
let sink = TokenPrinter {
|
|
97
|
-
in_char_run: Cell::new(false),
|
|
98
|
-
};
|
|
99
|
-
let mut input = ByteTendril::new();
|
|
100
|
-
io::stdin().read_to_tendril(&mut input).unwrap();
|
|
101
|
-
let input_buffer = BufferQueue::default();
|
|
102
|
-
input_buffer.push_back(input.try_reinterpret().unwrap());
|
|
103
|
-
|
|
104
|
-
let tok = XmlTokenizer::new(
|
|
105
|
-
sink,
|
|
106
|
-
XmlTokenizerOpts {
|
|
107
|
-
profile: true,
|
|
108
|
-
exact_errors: true,
|
|
109
|
-
..Default::default()
|
|
110
|
-
},
|
|
111
|
-
);
|
|
112
|
-
let _ = tok.feed(&input_buffer);
|
|
113
|
-
tok.end();
|
|
114
|
-
tok.sink.is_char(false);
|
|
115
|
-
}
|
|
@@ -1,90 +0,0 @@
|
|
|
1
|
-
// Copyright 2014-2017 The html5ever Project Developers. See the
|
|
2
|
-
// COPYRIGHT file at the top-level directory of this distribution.
|
|
3
|
-
//
|
|
4
|
-
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
|
5
|
-
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
|
6
|
-
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
|
7
|
-
// option. This file may not be copied, modified, or distributed
|
|
8
|
-
// except according to those terms.
|
|
9
|
-
|
|
10
|
-
use crate::tokenizer::{XmlTokenizer, XmlTokenizerOpts};
|
|
11
|
-
use crate::tree_builder::{TreeSink, XmlTreeBuilder, XmlTreeBuilderOpts};
|
|
12
|
-
|
|
13
|
-
use std::borrow::Cow;
|
|
14
|
-
|
|
15
|
-
use crate::tendril;
|
|
16
|
-
use crate::tendril::stream::{TendrilSink, Utf8LossyDecoder};
|
|
17
|
-
use crate::tendril::StrTendril;
|
|
18
|
-
use markup5ever::buffer_queue::BufferQueue;
|
|
19
|
-
|
|
20
|
-
/// All-encompasing parser setting structure.
|
|
21
|
-
#[derive(Clone, Default)]
|
|
22
|
-
pub struct XmlParseOpts {
|
|
23
|
-
/// Xml tokenizer options.
|
|
24
|
-
pub tokenizer: XmlTokenizerOpts,
|
|
25
|
-
/// Xml tree builder .
|
|
26
|
-
pub tree_builder: XmlTreeBuilderOpts,
|
|
27
|
-
}
|
|
28
|
-
|
|
29
|
-
/// Parse and send results to a `TreeSink`.
|
|
30
|
-
///
|
|
31
|
-
/// ## Example
|
|
32
|
-
///
|
|
33
|
-
/// ```ignore
|
|
34
|
-
/// let mut sink = MySink;
|
|
35
|
-
/// parse_document(&mut sink, iter::once(my_str), Default::default());
|
|
36
|
-
/// ```
|
|
37
|
-
pub fn parse_document<Sink>(sink: Sink, opts: XmlParseOpts) -> XmlParser<Sink>
|
|
38
|
-
where
|
|
39
|
-
Sink: TreeSink,
|
|
40
|
-
{
|
|
41
|
-
let tb = XmlTreeBuilder::new(sink, opts.tree_builder);
|
|
42
|
-
let tok = XmlTokenizer::new(tb, opts.tokenizer);
|
|
43
|
-
XmlParser {
|
|
44
|
-
tokenizer: tok,
|
|
45
|
-
input_buffer: BufferQueue::default(),
|
|
46
|
-
}
|
|
47
|
-
}
|
|
48
|
-
|
|
49
|
-
/// An XML parser,
|
|
50
|
-
/// ready to receive Unicode input through the `tendril::TendrilSink` trait’s methods.
|
|
51
|
-
pub struct XmlParser<Sink>
|
|
52
|
-
where
|
|
53
|
-
Sink: TreeSink,
|
|
54
|
-
{
|
|
55
|
-
/// Tokenizer used by XmlParser.
|
|
56
|
-
pub tokenizer: XmlTokenizer<XmlTreeBuilder<Sink::Handle, Sink>>,
|
|
57
|
-
/// Input used by XmlParser.
|
|
58
|
-
pub input_buffer: BufferQueue,
|
|
59
|
-
}
|
|
60
|
-
|
|
61
|
-
impl<Sink: TreeSink> TendrilSink<tendril::fmt::UTF8> for XmlParser<Sink> {
|
|
62
|
-
type Output = Sink::Output;
|
|
63
|
-
|
|
64
|
-
fn process(&mut self, t: StrTendril) {
|
|
65
|
-
self.input_buffer.push_back(t);
|
|
66
|
-
// FIXME: Properly support </script> somehow.
|
|
67
|
-
let _ = self.tokenizer.feed(&self.input_buffer);
|
|
68
|
-
}
|
|
69
|
-
|
|
70
|
-
// FIXME: Is it too noisy to report every character decoding error?
|
|
71
|
-
fn error(&mut self, desc: Cow<'static, str>) {
|
|
72
|
-
self.tokenizer.sink.sink.parse_error(desc)
|
|
73
|
-
}
|
|
74
|
-
|
|
75
|
-
fn finish(self) -> Self::Output {
|
|
76
|
-
self.tokenizer.end();
|
|
77
|
-
self.tokenizer.sink.sink.finish()
|
|
78
|
-
}
|
|
79
|
-
}
|
|
80
|
-
|
|
81
|
-
impl<Sink: TreeSink> XmlParser<Sink> {
|
|
82
|
-
/// Wrap this parser into a `TendrilSink` that accepts UTF-8 bytes.
|
|
83
|
-
///
|
|
84
|
-
/// Use this when your input is bytes that are known to be in the UTF-8 encoding.
|
|
85
|
-
/// Decoding is lossy, like `String::from_utf8_lossy`.
|
|
86
|
-
#[allow(clippy::wrong_self_convention)]
|
|
87
|
-
pub fn from_utf8(self) -> Utf8LossyDecoder<Self> {
|
|
88
|
-
Utf8LossyDecoder::new(self)
|
|
89
|
-
}
|
|
90
|
-
}
|
|
@@ -1,47 +0,0 @@
|
|
|
1
|
-
// Copyright 2014-2017 The html5ever Project Developers. See the
|
|
2
|
-
// COPYRIGHT file at the top-level directory of this distribution.
|
|
3
|
-
//
|
|
4
|
-
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
|
5
|
-
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
|
6
|
-
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
|
7
|
-
// option. This file may not be copied, modified, or distributed
|
|
8
|
-
// except according to those terms.
|
|
9
|
-
|
|
10
|
-
//! This crate provides a push based XML parser library that
|
|
11
|
-
//! adheres to XML5 specification. In other words this library
|
|
12
|
-
//! trades well-formedness for error recovery.
|
|
13
|
-
//!
|
|
14
|
-
//! The idea behind this, was to minimize number of errors from
|
|
15
|
-
//! tools that generate XML (e.g. `S` won't just return `S`
|
|
16
|
-
//! as text, but will parse it into `S` ).
|
|
17
|
-
//! You can check out full specification [here](https://ygg01.github.io/xml5_draft/).
|
|
18
|
-
//!
|
|
19
|
-
//! What this library provides is a solid XML parser that can:
|
|
20
|
-
//!
|
|
21
|
-
//! * Parse somewhat erroneous XML input
|
|
22
|
-
//! * Provide support for [Numeric character references](https://en.wikipedia.org/wiki/Numeric_character_reference).
|
|
23
|
-
//! * Provide partial [XML namespace](http://www.w3.org/TR/xml-names11/) support.
|
|
24
|
-
//! * Provide full set of SVG/MathML entities
|
|
25
|
-
//!
|
|
26
|
-
//! What isn't in scope for this library:
|
|
27
|
-
//!
|
|
28
|
-
//! * Document Type Definition parsing - this is pretty hard to do right and nowadays, its used
|
|
29
|
-
//!
|
|
30
|
-
|
|
31
|
-
#![crate_name = "xml5ever"]
|
|
32
|
-
#![crate_type = "dylib"]
|
|
33
|
-
#![allow(unexpected_cfgs)]
|
|
34
|
-
#![deny(missing_docs)]
|
|
35
|
-
|
|
36
|
-
pub use markup5ever::*;
|
|
37
|
-
|
|
38
|
-
pub(crate) mod macros;
|
|
39
|
-
|
|
40
|
-
/// Driver
|
|
41
|
-
pub mod driver;
|
|
42
|
-
/// Serializer for XML5.
|
|
43
|
-
pub mod serialize;
|
|
44
|
-
/// XML5 tokenizer - converts input into tokens
|
|
45
|
-
pub mod tokenizer;
|
|
46
|
-
/// XML5 tree builder - converts tokens into a tree like structure
|
|
47
|
-
pub mod tree_builder;
|
|
@@ -1,18 +0,0 @@
|
|
|
1
|
-
// Copyright 2014-2017 The html5ever Project Developers. See the
|
|
2
|
-
// COPYRIGHT file at the top-level directory of this distribution.
|
|
3
|
-
//
|
|
4
|
-
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
|
5
|
-
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
|
6
|
-
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
|
7
|
-
// option. This file may not be copied, modified, or distributed
|
|
8
|
-
// except according to those terms.
|
|
9
|
-
|
|
10
|
-
macro_rules! time {
|
|
11
|
-
($e:expr) => {{
|
|
12
|
-
let t0 = ::std::time::Instant::now();
|
|
13
|
-
let result = $e;
|
|
14
|
-
let dt = t0.elapsed().as_nanos() as u64;
|
|
15
|
-
(result, dt)
|
|
16
|
-
}};
|
|
17
|
-
}
|
|
18
|
-
pub(crate) use time;
|