gammo 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.travis.yml +6 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +27 -0
- data/LICENSE.txt +21 -0
- data/README.md +177 -0
- data/Rakefile +25 -0
- data/gammo.gemspec +23 -0
- data/lib/gammo.rb +15 -0
- data/lib/gammo/attribute.rb +17 -0
- data/lib/gammo/fragment_parser.rb +65 -0
- data/lib/gammo/node.rb +157 -0
- data/lib/gammo/parser.rb +524 -0
- data/lib/gammo/parser/constants.rb +94 -0
- data/lib/gammo/parser/foreign.rb +307 -0
- data/lib/gammo/parser/insertion_mode.rb +74 -0
- data/lib/gammo/parser/insertion_mode/after_after_body.rb +36 -0
- data/lib/gammo/parser/insertion_mode/after_after_frameset.rb +32 -0
- data/lib/gammo/parser/insertion_mode/after_body.rb +46 -0
- data/lib/gammo/parser/insertion_mode/after_frameset.rb +39 -0
- data/lib/gammo/parser/insertion_mode/after_head.rb +70 -0
- data/lib/gammo/parser/insertion_mode/before_head.rb +49 -0
- data/lib/gammo/parser/insertion_mode/before_html.rb +45 -0
- data/lib/gammo/parser/insertion_mode/in_body.rb +463 -0
- data/lib/gammo/parser/insertion_mode/in_caption.rb +47 -0
- data/lib/gammo/parser/insertion_mode/in_cell.rb +46 -0
- data/lib/gammo/parser/insertion_mode/in_column_group.rb +66 -0
- data/lib/gammo/parser/insertion_mode/in_frameset.rb +48 -0
- data/lib/gammo/parser/insertion_mode/in_head.rb +98 -0
- data/lib/gammo/parser/insertion_mode/in_head_noscript.rb +52 -0
- data/lib/gammo/parser/insertion_mode/in_row.rb +53 -0
- data/lib/gammo/parser/insertion_mode/in_select.rb +77 -0
- data/lib/gammo/parser/insertion_mode/in_select_in_table.rb +46 -0
- data/lib/gammo/parser/insertion_mode/in_table.rb +114 -0
- data/lib/gammo/parser/insertion_mode/in_table_body.rb +55 -0
- data/lib/gammo/parser/insertion_mode/in_template.rb +80 -0
- data/lib/gammo/parser/insertion_mode/initial.rb +152 -0
- data/lib/gammo/parser/insertion_mode/text.rb +32 -0
- data/lib/gammo/parser/insertion_mode_stack.rb +8 -0
- data/lib/gammo/parser/node_stack.rb +24 -0
- data/lib/gammo/tags.rb +9 -0
- data/lib/gammo/tags/table.rb +744 -0
- data/lib/gammo/tokenizer.rb +373 -0
- data/lib/gammo/tokenizer/debug.rb +34 -0
- data/lib/gammo/tokenizer/entity.rb +2240 -0
- data/lib/gammo/tokenizer/escape.rb +174 -0
- data/lib/gammo/tokenizer/script_scanner.rb +229 -0
- data/lib/gammo/tokenizer/tokens.rb +66 -0
- data/lib/gammo/version.rb +3 -0
- data/misc/html.yaml +384 -0
- data/misc/table.erubi +14 -0
- metadata +97 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: '009d6d5682151d83fe688e67ba57541bcccf5542b2865d8b85be77fae8156178'
|
4
|
+
data.tar.gz: 31a2f1d37e01a3c9e47db2b034965c75b0bc7ddd4d2f86826ae08fd37d199788
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: c77bcb2f3cc9b25ac7400eff41819289980b1ff9a53481cca51b1444bca24dbdd114ead1c90f6cec219b0c844b0e63775338a7a7f74910b24c0ab6b0a00e2d54
|
7
|
+
data.tar.gz: a907e000dd8d4c01bcdb3f834ec17fbc20bdddcc91c38b77add44d3b56116eef8b7a64acdbd967fff65b1076df65871899816493ca012b98947266cc1ba51d8c
|
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
gammo (0.1.0)
|
5
|
+
|
6
|
+
GEM
|
7
|
+
remote: https://rubygems.org/
|
8
|
+
specs:
|
9
|
+
erubi (1.9.0)
|
10
|
+
power_assert (1.1.5)
|
11
|
+
rake (12.3.3)
|
12
|
+
test-unit (3.3.5)
|
13
|
+
power_assert
|
14
|
+
yard (0.9.20)
|
15
|
+
|
16
|
+
PLATFORMS
|
17
|
+
ruby
|
18
|
+
|
19
|
+
DEPENDENCIES
|
20
|
+
erubi
|
21
|
+
gammo!
|
22
|
+
rake (~> 12.0)
|
23
|
+
test-unit (~> 3.3.5)
|
24
|
+
yard
|
25
|
+
|
26
|
+
BUNDLED WITH
|
27
|
+
2.0.2
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2020 namusyaka
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,177 @@
|
|
1
|
+
# Gammo - A pure-Ruby HTML5 parser
|
2
|
+
|
3
|
+
[![Build Status](https://travis-ci.org/namusyaka/gammo.svg?branch=master)](https://travis-ci.org/namusyaka/gammo)
|
4
|
+
|
5
|
+
Gammo is an implementation of the HTML5 parsing algorithm which conforms to [the WHATWG specification](https://html.spec.whatwg.org/multipage/parsing.html), without any dependencies. Given an HTML string, Gammo parses it and builds a DOM tree based on the tokenization and tree-construction algorithms defined in the WHATWG parsing algorithm.
|
6
|
+
|
7
|
+
Gammo, its naming is inspired by [Gumbo](https://github.com/google/gumbo-parser). But Gammo is a fried tofu fritter made with vegetables.
|
8
|
+
|
9
|
+
```ruby
|
10
|
+
require 'gammo'
|
11
|
+
require 'open-uri'
|
12
|
+
|
13
|
+
parser = Gammo.new(open('https://google.com'))
|
14
|
+
parser.parse #=> #<Gammo::Node::Document>
|
15
|
+
```
|
16
|
+
|
17
|
+
## Overview
|
18
|
+
|
19
|
+
### Features
|
20
|
+
|
21
|
+
- [Tokenization](#tokenization): Gammo has a tokenizer for implementing [the tokenization algorithm](https://html.spec.whatwg.org/multipage/parsing.html#tokenization).
|
22
|
+
- [Parsing](#parsing): Gammo provides a parser which implements the parsing algorithm by the above tokenization and [the tree-construction algorithm](https://html.spec.whatwg.org/multipage/parsing.html#tree-construction).
|
23
|
+
- [Node](#node): Gammo provides the nodes which implement [WHATWG DOM specification](https://dom.spec.whatwg.org/) partially.
|
24
|
+
- [Performance](#performance): Gammo does not prioritize performance, and there are a few potential performance notes.
|
25
|
+
|
26
|
+
## Tokenization
|
27
|
+
|
28
|
+
`Gammo::Tokenizer` implements the tokenization algorithm in WHATWG. You can get tokens in order by calling `Gammo::Tokenizer#next_token`.
|
29
|
+
|
30
|
+
Here is a simple example for performing only the tokenizer.
|
31
|
+
|
32
|
+
```ruby
|
33
|
+
def dump_for(token)
|
34
|
+
puts "data: #{token.data}, class: #{token.class}"
|
35
|
+
end
|
36
|
+
|
37
|
+
tokenizer = Gammo::Tokenizer.new('<!doctype html><input type="button"><frameset>')
|
38
|
+
dump_for tokenizer.next_token #=> data: html, class: Gammo::Tokenizer::DoctypeToken
|
39
|
+
dump_for tokenizer.next_token #=> data: input, class: Gammo::Tokenizer::StartTagToken
|
40
|
+
dump_for tokenizer.next_token #=> data: frameset, class: Gammo::Tokenizer::StartTagToken
|
41
|
+
dump_for tokenizer.next_token #=> data: end of string, class: Gammo::Tokenizer::ErrorToken
|
42
|
+
```
|
43
|
+
|
44
|
+
The parser described below depends on this tokenizer; it applies the WHATWG parsing algorithm, in order, to the tokens extracted by this tokenization.
|
45
|
+
|
46
|
+
### Token types
|
47
|
+
|
48
|
+
The tokens generated by the tokenizer will be categorized into one of the following types:
|
49
|
+
|
50
|
+
<table>
|
51
|
+
<thead>
|
52
|
+
<tr>
|
53
|
+
<th>Token type</th>
|
54
|
+
<th>Description</th>
|
55
|
+
</tr>
|
56
|
+
</thead>
|
57
|
+
<tbody>
|
58
|
+
<tr>
|
59
|
+
<td><code>Gammo::Tokenizer::ErrorToken</code></td>
|
60
|
+
<td>Represents an error token, it usually means end-of-string.</td>
|
61
|
+
</tr>
|
62
|
+
<tr>
|
63
|
+
<td><code>Gammo::Tokenizer::TextToken</code></td>
|
64
|
+
<td>Represents a text token like "foo" which is inner text of elements.</td>
|
65
|
+
</tr>
|
66
|
+
<tr>
|
67
|
+
<td><code>Gammo::Tokenizer::StartTagToken</code></td>
|
68
|
+
<td>Represents a start tag token like <code>&lt;a&gt;</code>.</td>
|
69
|
+
</tr>
|
70
|
+
<tr>
|
71
|
+
<td><code>Gammo::Tokenizer::EndTagToken</code></td>
|
72
|
+
<td>Represents an end tag token like <code>&lt;/a&gt;</code>.</td>
|
73
|
+
</tr>
|
74
|
+
<tr>
|
75
|
+
<td><code>Gammo::Tokenizer::SelfClosingTagToken</code></td>
|
76
|
+
<td>Represents a self closing tag token like <code>&lt;img /&gt;</code>.</td>
|
77
|
+
</tr>
|
78
|
+
<tr>
|
79
|
+
<td><code>Gammo::Tokenizer::CommentToken</code></td>
|
80
|
+
<td>Represents a comment token like <code>&lt;!-- comment --&gt;</code>.</td>
|
81
|
+
</tr>
|
82
|
+
<tr>
|
83
|
+
<td><code>Gammo::Tokenizer::DoctypeToken</code></td>
|
84
|
+
<td>Represents a doctype token like <code>&lt;!doctype html&gt;</code>.</td>
|
85
|
+
</tr>
|
86
|
+
</tbody>
|
87
|
+
</table>
|
88
|
+
|
89
|
+
## Parsing
|
90
|
+
|
91
|
+
`Gammo::Parser` implements processing in [the tree-construction stage](https://html.spec.whatwg.org/multipage/parsing.html#tree-construction) based on the tokenization described above.
|
92
|
+
|
93
|
+
A successfully parsed parser has the `document` accessor as the root document (this is the same as the return value of the `Gammo::Parser#parse`). From the `document` accessor, you can traverse the DOM tree constructed by the parser.
|
94
|
+
|
95
|
+
```ruby
|
96
|
+
require 'gammo'
|
97
|
+
require 'pp'
|
98
|
+
|
99
|
+
document = Gammo.new('<!doctype html><input type="button">').parse
|
100
|
+
|
101
|
+
def dump_for(node, strm)
|
102
|
+
strm << node.to_h
|
103
|
+
return unless node && (child = node.first_child)
|
104
|
+
while child
|
105
|
+
dump_for(child, (strm.last[:children] ||= []))
|
106
|
+
child = child.next_sibling
|
107
|
+
end
|
108
|
+
strm
|
109
|
+
end
|
110
|
+
|
111
|
+
pp dump_for(document, [])
|
112
|
+
```
|
113
|
+
|
114
|
+
### Notes
|
115
|
+
|
116
|
+
Currently, it's not possible to traverse the DOM tree with css selector or xpath like [Nokogiri](https://nokogiri.org/).
|
117
|
+
However, Gammo plans to implement these features in the future.
|
118
|
+
|
119
|
+
## Node
|
120
|
+
|
121
|
+
The nodes generated by the parser will be categorized into one of the following types:
|
122
|
+
|
123
|
+
<table>
|
124
|
+
<thead>
|
125
|
+
<tr>
|
126
|
+
<th>Node type</th>
|
127
|
+
<th>Description</th>
|
128
|
+
</tr>
|
129
|
+
</thead>
|
130
|
+
<tbody>
|
131
|
+
<tr>
|
132
|
+
<td><code>Gammo::Node::Error</code></td>
|
133
|
+
<td>Represents error node, it usually means end-of-string.</td>
|
134
|
+
</tr>
|
135
|
+
<tr>
|
136
|
+
<td><code>Gammo::Node::Text</code></td>
|
137
|
+
<td>Represents the text node like "foo" which is inner text of elements.</td>
|
138
|
+
</tr>
|
139
|
+
<tr>
|
140
|
+
<td><code>Gammo::Node::Document</code></td>
|
141
|
+
<td>Represents the root document type. It's always returned by <code>Gammo::Parser#document</code>.</td>
|
142
|
+
</tr>
|
143
|
+
<tr>
|
144
|
+
<td><code>Gammo::Node::Element</code></td>
|
145
|
+
<td>Represents any elements of HTML like <code>&lt;p&gt;</code>.</td>
|
146
|
+
</tr>
|
147
|
+
<tr>
|
148
|
+
<td><code>Gammo::Node::Comment</code></td>
|
149
|
+
<td>Represents comments like <code>&lt;!-- foo --&gt;</code>.</td>
|
150
|
+
</tr>
|
151
|
+
<tr>
|
152
|
+
<td><code>Gammo::Node::Doctype</code></td>
|
153
|
+
<td>Represents doctype like <code>&lt;!doctype html&gt;</code>.</td>
|
154
|
+
</tr>
|
155
|
+
</tbody>
|
156
|
+
</table>
|
157
|
+
|
158
|
+
Some nodes, such as `Gammo::Node::Element` and `Gammo::Node::Document`, hold pointers to related nodes, which can be reached through methods such as `Gammo::Node#next_sibling` or `Gammo::Node#first_child`. In addition, APIs such as `Gammo::Node#append_child` and `Gammo::Node#remove_child`, which perform operations defined in the DOM living standard, are also provided.
|
159
|
+
|
160
|
+
## Performance
|
161
|
+
|
162
|
+
As mentioned in the features at the beginning, Gammo doesn't prioritize its performance.
|
163
|
+
Thus, for example, Gammo is not suitable for very performance-sensitive applications (e.g. performing Gammo parsing synchronously from an incoming request from an end user).
|
164
|
+
Instead, the goal is to work well with batch processing such as crawlers.
|
165
|
+
Gammo places the highest priority on making it easy to parse HTML by performing it without depending on native-extensions and external gems.
|
166
|
+
|
167
|
+
## References
|
168
|
+
|
169
|
+
This was developed with reference to the following software.
|
170
|
+
|
171
|
+
- [x/net/html](https://godoc.org/golang.org/x/net/html): I've been working on this package, it gave me strong reason to make this happen.
|
172
|
+
- [Blink](https://www.chromium.org/blink): Blink gave me great impression about tree construction.
|
173
|
+
- [html5lib-tests](https://github.com/html5lib/html5lib-tests): Gammo relies on this test.
|
174
|
+
|
175
|
+
## License
|
176
|
+
|
177
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
2
|
+
require "rake/testtask"
|
3
|
+
require 'yaml'
|
4
|
+
require 'erubi'
|
5
|
+
|
6
|
+
Rake::TestTask.new(:test) do |t|
|
7
|
+
t.libs << "test"
|
8
|
+
t.libs << "lib"
|
9
|
+
t.test_files = FileList["test/**/*_test.rb"]
|
10
|
+
end
|
11
|
+
|
12
|
+
task default: :test
|
13
|
+
|
14
|
+
# Converts a lower-case, hyphen-separated name (such as an HTML tag name)
# into a CamelCase identifier, e.g. "annotation-xml" => "AnnotationXml".
#
# Every hyphen-delimited segment is capitalized and the hyphens are dropped,
# so names with more than one hyphen ("font-face-src" => "FontFaceSrc") are
# converted completely rather than only up to the first hyphen.
#
# @param [String] str
# @return [String]
def camelize(str)
  # The leading run of lower-case letters/digits always matches at position 0
  # (possibly as an empty match), so only the head of the string is touched.
  head_capitalized = str.sub(/\A[a-z\d]*/) { |head| head.capitalize }
  # gsub (not sub) so that every "-segment" is rewritten, not just the first.
  head_capitalized.gsub(/-[a-z]*/) { |segment| segment[1..-1].capitalize }
end
|
17
|
+
|
18
|
+
task default: :test
|
19
|
+
|
20
|
+
task :generate do
|
21
|
+
data = YAML.load(File.read('misc/html.yaml'), symbolize_names: true)
|
22
|
+
@tags = data.each_value.inject(:+).uniq
|
23
|
+
table = eval(Erubi::Engine.new(File.read('misc/table.erubi')).src, binding)
|
24
|
+
File.write('lib/gammo/tags/table.rb', table)
|
25
|
+
end
|
data/gammo.gemspec
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
require_relative 'lib/gammo/version'
|
2
|
+
|
3
|
+
Gem::Specification.new do |spec|
|
4
|
+
spec.name = "gammo"
|
5
|
+
spec.version = Gammo::VERSION
|
6
|
+
spec.authors = ["namusyaka"]
|
7
|
+
spec.email = ["namusyaka@gmail.com"]
|
8
|
+
|
9
|
+
spec.summary = %q{An HTML parser which implements WHATWG parsing algorithm.}
|
10
|
+
spec.description = %q{Gammo is an implementation of the HTML5 parsing algorithm which conforms the WHATWG specification with pure Ruby.}
|
11
|
+
spec.homepage = "https://github.com/namusyaka/gammo"
|
12
|
+
spec.license = "MIT"
|
13
|
+
spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
|
14
|
+
|
15
|
+
spec.metadata["homepage_uri"] = spec.homepage
|
16
|
+
spec.metadata["source_code_uri"] = "https://github.com/namusyaka/gammo"
|
17
|
+
spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
|
18
|
+
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
19
|
+
end
|
20
|
+
spec.bindir = "exe"
|
21
|
+
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
22
|
+
spec.require_paths = ["lib"]
|
23
|
+
end
|
data/lib/gammo.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
require "gammo/version"
|
2
|
+
require "gammo/parser"
|
3
|
+
require "gammo/fragment_parser"
|
4
|
+
|
5
|
+
module Gammo
  # Builds a parser for the given input.
  #
  # When +fragment+ is truthy a {Gammo::FragmentParser} is instantiated,
  # otherwise a {Gammo::Parser}. All remaining keyword options are forwarded
  # untouched to the chosen parser's constructor.
  #
  # @param [String] input
  # @param [TrueClass, FalseClass] fragment
  # @param [Hash] options
  # @return [Gammo::Parser]
  def self.new(input, fragment: false, **options)
    parser_class = fragment ? FragmentParser : Parser
    parser_class.new(input, **options)
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Gammo
  # Value holder for a single attribute: a key, its value and an optional
  # namespace. All three properties are readable and writable.
  class Attribute
    attr_accessor :key
    attr_accessor :value
    attr_accessor :namespace

    # Builds an attribute from the given key-value pair.
    #
    # @param [String] key
    # @param [String] value
    # @param [String, NilClass] namespace defaults to nil when omitted
    # @return [Attribute]
    def initialize(key:, value:, namespace: nil)
      @key, @value, @namespace = key, value, namespace
    end
  end
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require 'gammo/parser'
|
2
|
+
|
3
|
+
module Gammo
  # Parser specialised for parsing a fragment of HTML relative to a context
  # element, producing the list of parsed top-level nodes.
  class FragmentParser < ::Gammo::Parser
    # Sets up a parser for the fragment parsing algorithm.
    # @see https://html.spec.whatwg.org/multipage/parsing.html#html-fragment-parsing-algorithm
    # @param [String] input
    # @param [Gammo::Node] context
    # @raise [Gammo::ParseError] raises if context is not valid.
    # @return [Gammo::FragmentParser]
    def initialize(input, context:, **options)
      validate_context(context)
      super(input, context: context, **options)

      html_tag = Tags::Html
      @root = Node::Element.new(tag: html_tag, data: html_tag.to_s)
      # The tokenizer only receives the context tag name when the context
      # element carries no namespace; otherwise it receives `false`.
      @tokenizer = Tokenizer.new(input, context: !context.namespace && context.tag.to_s)
      @open_elements = NodeStack.new([@root])
      document.append_child(@root)
      template_stack << InTemplate if context.tag == Tags::Template
      reset_insertion_mode

      # Walk from the context node up through its ancestors and remember the
      # nearest form element, if there is one.
      ancestor = context
      while ancestor && !(ancestor.instance_of?(Node::Element) && ancestor.tag == Tags::Form)
        ancestor = ancestor.parent
      end
      @form = ancestor if ancestor
    end

    # Parses the fragment and returns the resulting nodes, detached from the
    # internal root they were built under.
    # @raise [Gammo::ParseError] Raised if the parser gets error while parsing.
    # @return [Array<Gammo::Node>]
    def parse
      super
      parent = context ? @root : document
      collection = []
      # Removing the first child promotes its next sibling to first child,
      # so this drains the children in document order.
      while (child = parent.first_child)
        parent.remove_child(child)
        collection << child
      end
      collection
    end

    # Always returns true.
    # @return [TrueClass]
    # @!visibility private
    def fragment?
      true
    end

    # Checks the given context. Raises {Gammo::ParseError} unless the context
    # is exactly a {Gammo::Node::Element} whose tag matches the tag looked up
    # from its data.
    # @param [Gammo::Node] context
    # @raise [Gammo::ParseError]
    def validate_context(context)
      fail ParseError, 'given non-element node in "context"' unless context.instance_of?(Node::Element)
      return if context.tag == Tags.lookup(context.data)

      fail ParseError, "inconsistent context node, tag = #{context.tag}, data = #{Tags.lookup(context.data)}"
    end
  end
end
|
data/lib/gammo/node.rb
ADDED
@@ -0,0 +1,157 @@
|
|
1
|
+
module Gammo
  # Class for representing a node of the tree built by the parser.
  # Each node holds references to its parent, its first and last children and
  # its previous and next siblings, plus the properties describing the node.
  # https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
  class Node
    # Represents the error node.
    Error = Class.new(Node)

    # Represents the text node.
    Text = Class.new(Node)

    # Represents the root document node.
    Document = Class.new(Node)

    # Represents the element node.
    Element = Class.new(Node)

    # Represents the comment node like "<!-- foo -->".
    Comment = Class.new(Node)

    # Represents the document type node.
    Doctype = Class.new(Node)

    # Represents the marker used in the list of active formatting elements.
    # https://html.spec.whatwg.org/multipage/parsing.html#the-list-of-active-formatting-elements
    ScopeMarker = Class.new(Node)

    # Default scope marker is inserted when entering applet,
    # object, marquee, template, td, th, and caption elements, and are used
    # to prevent formatting from "leaking" into applet, object, marquee,
    # template, td, th, and caption elements.
    DEFAULT_SCOPE_MARKER = Node::ScopeMarker.new

    # Raised if uncaught node is given for particular operations.
    # @!visibility private
    UncaughtTypeError = Class.new(ArgumentError)

    # Raised if anything goes wrong on hierarchy while node operations.
    # @!visibility private
    HierarchyRequestError = Class.new(ArgumentError)

    # `parent` is the pointer for the parent node.
    attr_accessor :parent

    # `first_child` and `last_child` are pointers for the first and the last nodes.
    attr_accessor :first_child, :last_child

    # `previous_sibling` and `next_sibling` are pointers for the previous and next sibling nodes.
    attr_accessor :previous_sibling, :next_sibling

    # Properties required to represent node.
    attr_accessor :tag, :data, :namespace, :attributes

    # Constructs a node.
    # @param [Object, NilClass] tag tag identifier of the node
    # @param [String, NilClass] data
    # @param [String, NilClass] namespace
    # @param [Array] attributes list of attributes (presumably
    #   {Gammo::Attribute} instances -- confirm against the parser)
    # @return [Gammo::Node]
    def initialize(tag: nil, data: nil, namespace: nil, attributes: [])
      @tag = tag
      @data = data
      @namespace = namespace
      @attributes = attributes
    end

    # Inserts a node before a reference node as a child of the self node.
    # When `ref` is nil the node is inserted after the current last child.
    # @param [Gammo::Node] node
    # @param [Gammo::Node, NilClass] ref
    # @raise [HierarchyRequestError] Raised if given node already has a parent or siblings.
    # @return [Gammo::Node] A node inserted before the reference node.
    def insert_before(node, ref)
      raise HierarchyRequestError,
        'insert_before called for an attached child node' if attached?(node)
      # Neighbours the new node will sit between; either may be nil.
      before = ref ? ref.previous_sibling : last_child
      after = ref
      if before
        before.next_sibling = node
      else
        @first_child = node
      end
      if after
        after.previous_sibling = node
      else
        @last_child = node
      end
      node.parent = self
      node.previous_sibling = before
      node.next_sibling = after
      node
    end

    # Appends given `child` into self node.
    # @param [Gammo::Node] child
    # @raise [HierarchyRequestError] Raised if given node already has a parent or siblings.
    # @return [Gammo::Node] A node appended into the self node.
    def append_child(child)
      raise HierarchyRequestError,
        'append_child called for an attached child node' if attached?(child)
      last = last_child
      if last
        last.next_sibling = child
      else
        @first_child = child
      end
      @last_child = child
      child.parent = self
      child.previous_sibling = last
      child
    end

    # Removes given `child` from self node.
    # @param [Gammo::Node] child
    # @raise [UncaughtTypeError] Raised unless given node is a child of the self node.
    # @return [Gammo::Node] A node removed from the self node.
    def remove_child(child)
      raise UncaughtTypeError,
        'remove_child called for a non-child node' unless child?(child)
      before = child.previous_sibling
      after = child.next_sibling
      @first_child = after if first_child == child
      after.previous_sibling = before if after
      @last_child = before if last_child == child
      before.next_sibling = after if before
      # Fully detach the removed node so it can be attached elsewhere.
      child.parent = child.previous_sibling = child.next_sibling = nil
      child
    end

    # Clones self into a new, detached node of the same class. The tag, data
    # and namespace are carried over and the attribute list is copied
    # shallowly; parent, sibling and child pointers are not copied.
    # @return [Gammo::Node]
    # @!visibility private
    def clone
      self.class.new(tag: tag, data: data, namespace: namespace, attributes: attributes.dup)
    end

    # @!visibility private
    def to_h
      {
        tag: tag,
        data: data,
        attributes: attributes,
        type: self.class
      }
    end

    private

    # Truthy when the node has a parent or any sibling.
    # @!visibility private
    def attached?(node)
      node.parent || node.previous_sibling || node.next_sibling
    end

    # True when the node's parent is the self node.
    # @!visibility private
    def child?(node)
      node.parent == self
    end
  end
end
|