gammo 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.travis.yml +6 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +27 -0
- data/LICENSE.txt +21 -0
- data/README.md +177 -0
- data/Rakefile +25 -0
- data/gammo.gemspec +23 -0
- data/lib/gammo.rb +15 -0
- data/lib/gammo/attribute.rb +17 -0
- data/lib/gammo/fragment_parser.rb +65 -0
- data/lib/gammo/node.rb +157 -0
- data/lib/gammo/parser.rb +524 -0
- data/lib/gammo/parser/constants.rb +94 -0
- data/lib/gammo/parser/foreign.rb +307 -0
- data/lib/gammo/parser/insertion_mode.rb +74 -0
- data/lib/gammo/parser/insertion_mode/after_after_body.rb +36 -0
- data/lib/gammo/parser/insertion_mode/after_after_frameset.rb +32 -0
- data/lib/gammo/parser/insertion_mode/after_body.rb +46 -0
- data/lib/gammo/parser/insertion_mode/after_frameset.rb +39 -0
- data/lib/gammo/parser/insertion_mode/after_head.rb +70 -0
- data/lib/gammo/parser/insertion_mode/before_head.rb +49 -0
- data/lib/gammo/parser/insertion_mode/before_html.rb +45 -0
- data/lib/gammo/parser/insertion_mode/in_body.rb +463 -0
- data/lib/gammo/parser/insertion_mode/in_caption.rb +47 -0
- data/lib/gammo/parser/insertion_mode/in_cell.rb +46 -0
- data/lib/gammo/parser/insertion_mode/in_column_group.rb +66 -0
- data/lib/gammo/parser/insertion_mode/in_frameset.rb +48 -0
- data/lib/gammo/parser/insertion_mode/in_head.rb +98 -0
- data/lib/gammo/parser/insertion_mode/in_head_noscript.rb +52 -0
- data/lib/gammo/parser/insertion_mode/in_row.rb +53 -0
- data/lib/gammo/parser/insertion_mode/in_select.rb +77 -0
- data/lib/gammo/parser/insertion_mode/in_select_in_table.rb +46 -0
- data/lib/gammo/parser/insertion_mode/in_table.rb +114 -0
- data/lib/gammo/parser/insertion_mode/in_table_body.rb +55 -0
- data/lib/gammo/parser/insertion_mode/in_template.rb +80 -0
- data/lib/gammo/parser/insertion_mode/initial.rb +152 -0
- data/lib/gammo/parser/insertion_mode/text.rb +32 -0
- data/lib/gammo/parser/insertion_mode_stack.rb +8 -0
- data/lib/gammo/parser/node_stack.rb +24 -0
- data/lib/gammo/tags.rb +9 -0
- data/lib/gammo/tags/table.rb +744 -0
- data/lib/gammo/tokenizer.rb +373 -0
- data/lib/gammo/tokenizer/debug.rb +34 -0
- data/lib/gammo/tokenizer/entity.rb +2240 -0
- data/lib/gammo/tokenizer/escape.rb +174 -0
- data/lib/gammo/tokenizer/script_scanner.rb +229 -0
- data/lib/gammo/tokenizer/tokens.rb +66 -0
- data/lib/gammo/version.rb +3 -0
- data/misc/html.yaml +384 -0
- data/misc/table.erubi +14 -0
- metadata +97 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: '009d6d5682151d83fe688e67ba57541bcccf5542b2865d8b85be77fae8156178'
|
4
|
+
data.tar.gz: 31a2f1d37e01a3c9e47db2b034965c75b0bc7ddd4d2f86826ae08fd37d199788
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: c77bcb2f3cc9b25ac7400eff41819289980b1ff9a53481cca51b1444bca24dbdd114ead1c90f6cec219b0c844b0e63775338a7a7f74910b24c0ab6b0a00e2d54
|
7
|
+
data.tar.gz: a907e000dd8d4c01bcdb3f834ec17fbc20bdddcc91c38b77add44d3b56116eef8b7a64acdbd967fff65b1076df65871899816493ca012b98947266cc1ba51d8c
|
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
gammo (0.1.0)
|
5
|
+
|
6
|
+
GEM
|
7
|
+
remote: https://rubygems.org/
|
8
|
+
specs:
|
9
|
+
erubi (1.9.0)
|
10
|
+
power_assert (1.1.5)
|
11
|
+
rake (12.3.3)
|
12
|
+
test-unit (3.3.5)
|
13
|
+
power_assert
|
14
|
+
yard (0.9.20)
|
15
|
+
|
16
|
+
PLATFORMS
|
17
|
+
ruby
|
18
|
+
|
19
|
+
DEPENDENCIES
|
20
|
+
erubi
|
21
|
+
gammo!
|
22
|
+
rake (~> 12.0)
|
23
|
+
test-unit (~> 3.3.5)
|
24
|
+
yard
|
25
|
+
|
26
|
+
BUNDLED WITH
|
27
|
+
2.0.2
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2020 namusyaka
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,177 @@
|
|
1
|
+
# Gammo - A pure-Ruby HTML5 parser
|
2
|
+
|
3
|
+
[![Build Status](https://travis-ci.org/namusyaka/gammo.svg?branch=master)](https://travis-ci.org/namusyaka/gammo)
|
4
|
+
|
5
|
+
Gammo is an implementation of the HTML5 parsing algorithm which conforms to [the WHATWG specification](https://html.spec.whatwg.org/multipage/parsing.html), without any dependencies. Given an HTML string, Gammo parses it and builds a DOM tree based on the tokenization and tree-construction algorithms defined in the WHATWG parsing algorithm.
|
6
|
+
|
7
|
+
The name Gammo is inspired by [Gumbo](https://github.com/google/gumbo-parser), but Gammo is a fried tofu fritter made with vegetables.
|
8
|
+
|
9
|
+
```ruby
|
10
|
+
require 'gammo'
|
11
|
+
require 'open-uri'
|
12
|
+
|
13
|
+
parser = Gammo.new(open('https://google.com'))
|
14
|
+
parser.parse #=> #<Gammo::Node::Document>
|
15
|
+
```
|
16
|
+
|
17
|
+
## Overview
|
18
|
+
|
19
|
+
### Features
|
20
|
+
|
21
|
+
- [Tokenization](#tokenization): Gammo has a tokenizer for implementing [the tokenization algorithm](https://html.spec.whatwg.org/multipage/parsing.html#tokenization).
|
22
|
+
- [Parsing](#parsing): Gammo provides a parser which implements the parsing algorithm by the above tokenization and [the tree-construction algorithm](https://html.spec.whatwg.org/multipage/parsing.html#tree-construction).
|
23
|
+
- [Node](#node): Gammo provides the nodes which implement [WHATWG DOM specification](https://dom.spec.whatwg.org/) partially.
|
24
|
+
- [Performance](#performance): Gammo does not prioritize performance, and there are a few potential performance notes.
|
25
|
+
|
26
|
+
## Tokenization
|
27
|
+
|
28
|
+
`Gammo::Tokenizer` implements the tokenization algorithm in WHATWG. You can get tokens in order by calling `Gammo::Tokenizer#next_token`.
|
29
|
+
|
30
|
+
Here is a simple example for performing only the tokenizer.
|
31
|
+
|
32
|
+
```ruby
|
33
|
+
def dump_for(token)
|
34
|
+
puts "data: #{token.data}, class: #{token.class}"
|
35
|
+
end
|
36
|
+
|
37
|
+
tokenizer = Gammo::Tokenizer.new('<!doctype html><input type="button"><frameset>')
|
38
|
+
dump_for tokenizer.next_token #=> data: html, class: Gammo::Tokenizer::DoctypeToken
|
39
|
+
dump_for tokenizer.next_token #=> data: input, class: Gammo::Tokenizer::StartTagToken
|
40
|
+
dump_for tokenizer.next_token #=> data: frameset, class: Gammo::Tokenizer::StartTagToken
|
41
|
+
dump_for tokenizer.next_token #=> data: end of string, class: Gammo::Tokenizer::ErrorToken
|
42
|
+
```
|
43
|
+
|
44
|
+
The parser described below depends on this tokenizer; it applies the WHATWG parsing algorithm, in order, to the tokens extracted by this tokenizer.
|
45
|
+
|
46
|
+
### Token types
|
47
|
+
|
48
|
+
The tokens generated by the tokenizer will be categorized into one of the following types:
|
49
|
+
|
50
|
+
<table>
|
51
|
+
<thead>
|
52
|
+
<tr>
|
53
|
+
<th>Token type</th>
|
54
|
+
<th>Description</th>
|
55
|
+
</tr>
|
56
|
+
</thead>
|
57
|
+
<tbody>
|
58
|
+
<tr>
|
59
|
+
<td><code>Gammo::Tokenizer::ErrorToken</code></td>
|
60
|
+
<td>Represents an error token, it usually means end-of-string.</td>
|
61
|
+
</tr>
|
62
|
+
<tr>
|
63
|
+
<td><code>Gammo::Tokenizer::TextToken</code></td>
|
64
|
+
<td>Represents a text token like "foo" which is inner text of elements.</td>
|
65
|
+
</tr>
|
66
|
+
<tr>
|
67
|
+
<td><code>Gammo::Tokenizer::StartTagToken</code></td>
|
68
|
+
<td>Represents a start tag token like <code>&lt;a&gt;</code>.</td>
|
69
|
+
</tr>
|
70
|
+
<tr>
|
71
|
+
<td><code>Gammo::Tokenizer::EndTagToken</code></td>
|
72
|
+
<td>Represents an end tag token like <code>&lt;/a&gt;</code>.</td>
|
73
|
+
</tr>
|
74
|
+
<tr>
|
75
|
+
<td><code>Gammo::Tokenizer::SelfClosingTagToken</code></td>
|
76
|
+
<td>Represents a self closing tag token like <code>&lt;img /&gt;</code></td>
|
77
|
+
</tr>
|
78
|
+
<tr>
|
79
|
+
<td><code>Gammo::Tokenizer::CommentToken</code></td>
|
80
|
+
<td>Represents a comment token like <code>&lt;!-- comment --&gt;</code>.</td>
|
81
|
+
</tr>
|
82
|
+
<tr>
|
83
|
+
<td><code>Gammo::Tokenizer::DoctypeToken</code></td>
|
84
|
+
<td>Represents a doctype token like <code>&lt;!doctype html&gt;</code>.</td>
|
85
|
+
</tr>
|
86
|
+
</tbody>
|
87
|
+
</table>
|
88
|
+
|
89
|
+
## Parsing
|
90
|
+
|
91
|
+
`Gammo::Parser` implements processing in [the tree-construction stage](https://html.spec.whatwg.org/multipage/parsing.html#tree-construction) based on the tokenization described above.
|
92
|
+
|
93
|
+
A successfully parsed parser has the `document` accessor as the root document (this is the same as the return value of the `Gammo::Parser#parse`). From the `document` accessor, you can traverse the DOM tree constructed by the parser.
|
94
|
+
|
95
|
+
```ruby
|
96
|
+
require 'gammo'
|
97
|
+
require 'pp'
|
98
|
+
|
99
|
+
document = Gammo.new('<!doctype html><input type="button">').parse
|
100
|
+
|
101
|
+
def dump_for(node, strm)
|
102
|
+
strm << node.to_h
|
103
|
+
return unless node && (child = node.first_child)
|
104
|
+
while child
|
105
|
+
dump_for(child, (strm.last[:children] ||= []))
|
106
|
+
child = child.next_sibling
|
107
|
+
end
|
108
|
+
strm
|
109
|
+
end
|
110
|
+
|
111
|
+
pp dump_for(document, [])
|
112
|
+
```
|
113
|
+
|
114
|
+
### Notes
|
115
|
+
|
116
|
+
Currently, it's not possible to traverse the DOM tree with CSS selectors or XPath as in [Nokogiri](https://nokogiri.org/).
|
117
|
+
However, Gammo plans to implement these features in the future.
|
118
|
+
|
119
|
+
## Node
|
120
|
+
|
121
|
+
The nodes generated by the parser will be categorized into one of the following types:
|
122
|
+
|
123
|
+
<table>
|
124
|
+
<thead>
|
125
|
+
<tr>
|
126
|
+
<th>Node type</th>
|
127
|
+
<th>Description</th>
|
128
|
+
</tr>
|
129
|
+
</thead>
|
130
|
+
<tbody>
|
131
|
+
<tr>
|
132
|
+
<td><code>Gammo::Node::Error</code></td>
|
133
|
+
<td>Represents error node, it usually means end-of-string.</td>
|
134
|
+
</tr>
|
135
|
+
<tr>
|
136
|
+
<td><code>Gammo::Node::Text</code></td>
|
137
|
+
<td>Represents the text node like "foo" which is inner text of elements.</td>
|
138
|
+
</tr>
|
139
|
+
<tr>
|
140
|
+
<td><code>Gammo::Node::Document</code></td>
|
141
|
+
<td>Represents the root document type. It's always returned by <code>Gammo::Parser#document</code>.</td>
|
142
|
+
</tr>
|
143
|
+
<tr>
|
144
|
+
<td><code>Gammo::Node::Element</code></td>
|
145
|
+
<td>Represents any elements of HTML like <code>&lt;p&gt;</code>.</td>
|
146
|
+
</tr>
|
147
|
+
<tr>
|
148
|
+
<td><code>Gammo::Node::Comment</code></td>
|
149
|
+
<td>Represents comments like <code>&lt;!-- foo --&gt;</code></td>
|
150
|
+
</tr>
|
151
|
+
<tr>
|
152
|
+
<td><code>Gammo::Node::Doctype</code></td>
|
153
|
+
<td>Represents doctype like <code>&lt;!doctype html&gt;</code></td>
|
154
|
+
</tr>
|
155
|
+
</tbody>
|
156
|
+
</table>
|
157
|
+
|
158
|
+
Some nodes, such as `Gammo::Node::Element` and `Gammo::Node::Document`, contain pointers to the nodes related to them, which can be accessed through methods such as `Gammo::Node#next_sibling` or `Gammo::Node#first_child`. In addition, APIs such as `Gammo::Node#append_child` and `Gammo::Node#remove_child`, which perform operations defined in the DOM living standard, are also provided.
|
159
|
+
|
160
|
+
## Performance
|
161
|
+
|
162
|
+
As mentioned in the features at the beginning, Gammo doesn't prioritize its performance.
|
163
|
+
Thus, for example, Gammo is not suitable for very performance-sensitive applications (e.g. performing Gammo parsing synchronously from an incoming request from an end user).
|
164
|
+
Instead, the goal is to work well with batch processing such as crawlers.
|
165
|
+
Gammo places the highest priority on making it easy to parse HTML by performing it without depending on native extensions and external gems.
|
166
|
+
|
167
|
+
## References
|
168
|
+
|
169
|
+
This was developed with reference to the following software.
|
170
|
+
|
171
|
+
- [x/net/html](https://godoc.org/golang.org/x/net/html): I've been working on this package; it gave me a strong reason to make this happen.
|
172
|
+
- [Blink](https://www.chromium.org/blink): Blink gave me a great impression of tree construction.
|
173
|
+
- [html5lib-tests](https://github.com/html5lib/html5lib-tests): Gammo relies on this test.
|
174
|
+
|
175
|
+
## License
|
176
|
+
|
177
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
2
|
+
require "rake/testtask"
|
3
|
+
require 'yaml'
|
4
|
+
require 'erubi'
|
5
|
+
|
6
|
+
# Defines the `test` task: adds "test" and "lib" to the task's libs and runs
# every file matching test/**/*_test.rb.
Rake::TestTask.new(:test) do |test_task|
  test_task.libs.push("test", "lib")
  test_task.test_files = FileList["test/**/*_test.rb"]
end
|
11
|
+
|
12
|
+
task default: :test
|
13
|
+
|
14
|
+
# Converts a lower-case, hyphen-separated name into a CamelCase identifier,
# e.g. "annotation-xml" becomes "AnnotationXml".
#
# Every hyphenated segment is capitalized and its hyphen dropped, so names
# with more than one hyphen (e.g. "font-face-src" => "FontFaceSrc") also
# produce an identifier without hyphens.
#
# @param [String] str
# @return [String]
def camelize(str)
  # Capitalize the leading run of lower-case letters and digits.
  capitalized = str.sub(/^[a-z\d]*/) { |leading| leading.capitalize }
  # gsub (not sub) so that each "-segment" is rewritten, not only the first.
  capitalized.gsub(/\-[a-z]*/) { |segment| segment.slice(1..-1).capitalize }
end
|
17
|
+
|
18
|
+
task default: :test
|
19
|
+
|
20
|
+
# Regenerates lib/gammo/tags/table.rb from misc/html.yaml and the
# misc/table.erubi template.
task :generate do
  yaml_source = File.read('misc/html.yaml')
  data = YAML.load(yaml_source, symbolize_names: true)
  # Concatenate every value of the YAML mapping and drop duplicates.
  @tags = data.each_value.inject(:+).uniq
  engine = Erubi::Engine.new(File.read('misc/table.erubi'))
  # The compiled template is evaluated against this binding; the template is
  # not visible here, so it presumably reads @tags (and may read `data`).
  table = eval(engine.src, binding)
  File.write('lib/gammo/tags/table.rb', table)
end
|
data/gammo.gemspec
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
require_relative 'lib/gammo/version'
|
2
|
+
|
3
|
+
# Gem specification for gammo: metadata, packaged files and load paths.
Gem::Specification.new do |gem|
  gem.name    = "gammo"
  gem.version = Gammo::VERSION
  gem.authors = ["namusyaka"]
  gem.email   = ["namusyaka@gmail.com"]

  gem.summary     = %q{An HTML parser which implements WHATWG parsing algorithm.}
  gem.description = %q{Gammo is an implementation of the HTML5 parsing algorithm which conforms the WHATWG specification with pure Ruby.}
  gem.homepage    = "https://github.com/namusyaka/gammo"
  gem.license     = "MIT"
  gem.required_ruby_version = Gem::Requirement.new(">= 2.3.0")

  gem.metadata["homepage_uri"]    = gem.homepage
  gem.metadata["source_code_uri"] = "https://github.com/namusyaka/gammo"

  # Package every file tracked by git except those under test/, spec/ or
  # features/. The listing is taken from the directory holding this file.
  project_root = File.expand_path('..', __FILE__)
  gem.files = Dir.chdir(project_root) do
    tracked = `git ls-files -z`.split("\x0")
    tracked.reject { |path| path.match(%r{^(test|spec|features)/}) }
  end

  gem.bindir        = "exe"
  gem.executables   = gem.files.grep(%r{^exe/}) { |path| File.basename(path) }
  gem.require_paths = ["lib"]
end
|
data/lib/gammo.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
require "gammo/version"
|
2
|
+
require "gammo/parser"
|
3
|
+
require "gammo/fragment_parser"
|
4
|
+
|
5
|
+
module Gammo
  # Builds a parser for the given input.
  #
  # A {Gammo::FragmentParser} is built when `fragment` is truthy, a
  # {Gammo::Parser} otherwise. Any remaining keyword options are passed on to
  # the chosen parser's constructor.
  #
  # @param [String] input
  # @param [TrueClass, FalseClass] fragment
  # @param [Hash] options
  # @return [Gammo::Parser]
  def self.new(input, fragment: false, **options)
    parser_class = fragment ? FragmentParser : Parser
    parser_class.new(input, **options)
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Gammo
  # Represents a single attribute as a key-value pair, optionally qualified
  # by a namespace.
  class Attribute
    attr_accessor :key
    attr_accessor :value
    attr_accessor :namespace

    # Builds an attribute from the given key-value pair.
    # @param [String] key
    # @param [String] value
    # @param [String, NilClass] namespace
    # @return [Attribute]
    def initialize(key:, value:, namespace: nil)
      @key, @value, @namespace = key, value, namespace
    end
  end
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require 'gammo/parser'
|
2
|
+
|
3
|
+
module Gammo
  # Parser that parses a fragment of HTML input in the context of a given
  # element and returns the resulting nodes.
  class FragmentParser < ::Gammo::Parser
    # Builds a parser instance for the HTML fragment parsing algorithm.
    # @see https://html.spec.whatwg.org/multipage/parsing.html#html-fragment-parsing-algorithm
    # @param [String] input
    # @param [Gammo::Node] context
    # @raise [Gammo::ParseError] raised if context is not valid.
    # @return [Gammo::FragmentParser]
    def initialize(input, context:, **options)
      validate_context(context)
      super(input, context: context, **options)

      # A synthetic <html> element is the root and the only open element.
      @root = Node::Element.new(tag: Tags::Html, data: Tags::Html.to_s)
      # The tokenizer receives the context tag name only when the context
      # element has no namespace; otherwise it receives `false`.
      tokenizer_context = context.namespace ? false : context.tag.to_s
      @tokenizer = Tokenizer.new(input, context: tokenizer_context)
      @open_elements = NodeStack.new([@root])
      document.append_child(@root)
      template_stack << InTemplate if context.tag == Tags::Template
      reset_insertion_mode

      # Walk from the context node up through its ancestors and remember the
      # nearest <form> element, if there is one.
      ancestor = context
      while ancestor && !(ancestor.instance_of?(Node::Element) && ancestor.tag == Tags::Form)
        ancestor = ancestor.parent
      end
      @form = ancestor if ancestor
    end

    # Parses the fragment and returns the nodes built from it. The returned
    # nodes are detached from the node that held them during parsing.
    # @raise [Gammo::ParseError] Raised if the parser gets error while parsing.
    # @return [Array<Gammo::Node>]
    def parse
      super
      container = context ? @root : document
      nodes = []
      current = container.first_child
      while current
        # Remember the sibling first: remove_child clears the sibling links.
        following = current.next_sibling
        container.remove_child(current)
        nodes << current
        current = following
      end
      nodes
    end

    # Always returns true.
    # @return [TrueClass]
    # @!visibility private
    def fragment?
      true
    end

    # Validates the given context. Raises {Gammo::ParseError} unless the
    # context is a {Gammo::Node::Element} whose tag is the one looked up from
    # its data.
    # @param [Gammo::Node] context
    # @raise [Gammo::ParseError]
    def validate_context(context)
      unless context.instance_of?(Node::Element)
        fail ParseError, 'given non-element node in "context"'
      end
      expected_tag = Tags.lookup(context.data)
      return if context.tag == expected_tag
      fail ParseError, "inconsistent context node, tag = #{context.tag}, data = #{expected_tag}"
    end
  end
end
|
data/lib/gammo/node.rb
ADDED
@@ -0,0 +1,157 @@
|
|
1
|
+
module Gammo
  # Class for representing a node of the tree built by the parser. Nodes are
  # linked to each other through parent, child and sibling pointers.
  # https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
  class Node
    # Represents the error node.
    Error = Class.new(Node)

    # Represents the text node.
    Text = Class.new(Node)

    # Represents the root document node.
    Document = Class.new(Node)

    # Represents the element node.
    Element = Class.new(Node)

    # Represents the comment node like "<!-- foo -->".
    Comment = Class.new(Node)

    # Represents the document type node.
    Doctype = Class.new(Node)

    # Represents the marker used in the list of active formatting elements.
    # https://html.spec.whatwg.org/multipage/parsing.html#the-list-of-active-formatting-elements
    ScopeMarker = Class.new(Node)

    # Default scope marker is inserted when entering applet,
    # object, marquee, template, td, th, and caption elements, and is used
    # to prevent formatting from "leaking" into applet, object, marquee,
    # template, td, th, and caption elements.
    DEFAULT_SCOPE_MARKER = Node::ScopeMarker.new

    # Raised if uncaught node is given for particular operations.
    # @!visibility private
    UncaughtTypeError = Class.new(ArgumentError)

    # Raised if anything goes wrong on hierarchy while node operations.
    # @!visibility private
    HierarchyRequestError = Class.new(ArgumentError)

    # `parent` is the pointer for the parent node.
    attr_accessor :parent

    # `first_child` and `last_child` are pointers for the first and the last nodes.
    attr_accessor :first_child, :last_child

    # `previous_sibling` and `next_sibling` are pointers for the previous and next sibling nodes.
    attr_accessor :previous_sibling, :next_sibling

    # Properties required to represent node.
    attr_accessor :tag, :data, :namespace, :attributes

    # Constructs a node.
    # @param [Object, NilClass] tag
    # @param [String, NilClass] data
    # @param [String, NilClass] namespace
    # @param [Array] attributes list of the node's attributes
    # @return [Gammo::Node]
    def initialize(tag: nil, data: nil, namespace: nil, attributes: [])
      @tag        = tag
      @data       = data
      @namespace  = namespace
      @attributes = attributes
    end

    # Inserts a node before a reference node as a child of the self node.
    # When `ref` is nil, the node is inserted after the current last child.
    # @param [Gammo::Node] node
    # @param [Gammo::Node, NilClass] ref
    # @raise [HierarchyRequestError] Raised if given node is already attached to a node.
    # @return [Gammo::Node] A node inserted before the reference node.
    def insert_before(node, ref)
      raise HierarchyRequestError,
        'insert_before called for an attached child node' if attached?(node)
      # Locals are named differently from the accessors on purpose, so that
      # the links of the self node are never confused with the new neighbours.
      if ref
        before, after = ref.previous_sibling, ref
      else
        before, after = last_child, nil
      end
      if before
        before.next_sibling = node
      else
        @first_child = node
      end
      if after
        after.previous_sibling = node
      else
        @last_child = node
      end
      node.parent = self
      node.previous_sibling = before
      node.next_sibling = after
      node
    end

    # Appends given `child` into self node.
    # @param [Gammo::Node] child
    # @raise [HierarchyRequestError] Raised if given node is already attached to a node.
    # @return [Gammo::Node] A node appended into the self node.
    def append_child(child)
      raise HierarchyRequestError,
        'append_child called for an attached child node' if attached?(child)
      if last = last_child
        last.next_sibling = child
      else
        @first_child = child
      end
      @last_child = child
      child.parent = self
      child.previous_sibling = last
      child
    end

    # Removes given `child` from self node.
    # @param [Gammo::Node] child
    # @raise [UncaughtTypeError] Raised if given node is not a child of the self node.
    # @return [Gammo::Node] A node removed from the self node.
    def remove_child(child)
      raise UncaughtTypeError,
        'remove_child called for a non-child node' unless child?(child)
      @first_child = child.next_sibling if first_child == child
      child.next_sibling.previous_sibling = child.previous_sibling if child.next_sibling
      @last_child = child.previous_sibling if last_child == child
      child.previous_sibling.next_sibling = child.next_sibling if child.previous_sibling
      child.parent = child.previous_sibling = child.next_sibling = nil
      child
    end

    # Clones self into a new, detached node of the same class. The tag, data
    # and namespace are carried over, and every attribute is duplicated so
    # that changing an attribute of the clone never affects the original.
    # Parent, child and sibling pointers are not copied.
    # @return [Gammo::Node]
    # @!visibility private
    def clone
      self.class.new(
        tag: tag,
        data: data,
        namespace: namespace,
        attributes: attributes && attributes.map(&:dup)
      )
    end

    # Returns the tag, data, attributes and class of the node as a hash.
    # @return [Hash]
    # @!visibility private
    def to_h
      {
        tag: tag,
        data: data,
        attributes: attributes,
        type: self.class
      }
    end

    private

    # Returns a truthy value if the given node has a parent or any sibling.
    # @!visibility private
    def attached?(node)
      node.parent || node.previous_sibling || node.next_sibling
    end

    # Returns true if the parent of the given node is the self node.
    # @!visibility private
    def child?(node)
      node.parent == self
    end
  end
end
|