nokogumbo 1.5.0 → 2.0.0.pre.alpha
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +56 -0
- data/README.md +146 -22
- data/ext/nokogumbo/extconf.rb +116 -0
- data/ext/{nokogumboc → nokogumbo}/nokogumbo.c +174 -71
- data/gumbo-parser/src/ascii.c +33 -0
- data/gumbo-parser/src/ascii.h +31 -0
- data/gumbo-parser/src/attribute.c +26 -28
- data/gumbo-parser/src/attribute.h +3 -23
- data/gumbo-parser/src/char_ref.c +135 -2351
- data/gumbo-parser/src/char_ref.h +13 -29
- data/gumbo-parser/src/error.c +215 -133
- data/gumbo-parser/src/error.h +34 -49
- data/gumbo-parser/src/foreign_attrs.c +104 -0
- data/gumbo-parser/src/gumbo.h +506 -304
- data/gumbo-parser/src/insertion_mode.h +4 -28
- data/gumbo-parser/src/macros.h +91 -0
- data/gumbo-parser/src/parser.c +1989 -1431
- data/gumbo-parser/src/parser.h +6 -22
- data/gumbo-parser/src/replacement.h +33 -0
- data/gumbo-parser/src/string_buffer.c +43 -50
- data/gumbo-parser/src/string_buffer.h +24 -40
- data/gumbo-parser/src/string_piece.c +39 -39
- data/gumbo-parser/src/svg_attrs.c +174 -0
- data/gumbo-parser/src/svg_tags.c +137 -0
- data/gumbo-parser/src/tag.c +186 -59
- data/gumbo-parser/src/tag_lookup.c +382 -0
- data/gumbo-parser/src/tag_lookup.h +13 -0
- data/gumbo-parser/src/token_type.h +1 -25
- data/gumbo-parser/src/tokenizer.c +899 -495
- data/gumbo-parser/src/tokenizer.h +37 -37
- data/gumbo-parser/src/tokenizer_states.h +6 -22
- data/gumbo-parser/src/utf8.c +103 -86
- data/gumbo-parser/src/utf8.h +37 -41
- data/gumbo-parser/src/util.c +48 -38
- data/gumbo-parser/src/util.h +10 -40
- data/gumbo-parser/src/vector.c +45 -57
- data/gumbo-parser/src/vector.h +17 -39
- data/lib/nokogumbo.rb +10 -174
- data/lib/nokogumbo/html5.rb +250 -0
- data/lib/nokogumbo/html5/document.rb +37 -0
- data/lib/nokogumbo/html5/document_fragment.rb +46 -0
- data/lib/nokogumbo/version.rb +3 -0
- data/lib/nokogumbo/xml/node.rb +57 -0
- metadata +32 -19
- data/ext/nokogumboc/extconf.rb +0 -60
- data/gumbo-parser/src/char_ref.rl +0 -2554
- data/gumbo-parser/src/string_piece.h +0 -38
- data/gumbo-parser/src/tag.in +0 -150
- data/gumbo-parser/src/tag_enum.h +0 -153
- data/gumbo-parser/src/tag_gperf.h +0 -105
- data/gumbo-parser/src/tag_sizes.h +0 -4
- data/gumbo-parser/src/tag_strings.h +0 -153
- data/gumbo-parser/visualc/include/strings.h +0 -4
- data/test-nokogumbo.rb +0 -190
@@ -1,4 +0,0 @@
|
|
1
|
-
// Generated via `gentags.py src/tag.in`.
|
2
|
-
// Do not edit; edit src/tag.in instead.
|
3
|
-
// clang-format off
|
4
|
-
4, 4, 5, 4, 4, 4, 5, 6, 8, 8, 4, 7, 7, 3, 5, 2, 2, 2, 2, 2, 2, 6, 6, 6, 7, 1, 2, 3, 10, 2, 2, 2, 2, 2, 2, 6, 10, 4, 3, 1, 2, 6, 5, 1, 4, 1, 3, 4, 4, 4, 4, 3, 4, 3, 3, 3, 1, 1, 1, 4, 4, 2, 2, 3, 3, 4, 2, 3, 3, 3, 5, 3, 6, 5, 6, 5, 5, 5, 6, 5, 6, 3, 4, 4, 2, 2, 2, 2, 5, 6, 10, 14, 3, 13, 4, 5, 7, 8, 3, 5, 5, 5, 2, 2, 2, 4, 8, 6, 5, 5, 6, 6, 8, 8, 6, 8, 6, 6, 8, 5, 7, 7, 4, 8, 6, 7, 7, 3, 5, 8, 8, 7, 7, 3, 6, 7, 9, 2, 6, 8, 3, 5, 6, 4, 7, 8, 4, 6, 2, 3,
|
@@ -1,153 +0,0 @@
|
|
1
|
-
// Generated via `gentags.py src/tag.in`.
|
2
|
-
// Do not edit; edit src/tag.in instead.
|
3
|
-
// clang-format off
|
4
|
-
"html",
|
5
|
-
"head",
|
6
|
-
"title",
|
7
|
-
"base",
|
8
|
-
"link",
|
9
|
-
"meta",
|
10
|
-
"style",
|
11
|
-
"script",
|
12
|
-
"noscript",
|
13
|
-
"template",
|
14
|
-
"body",
|
15
|
-
"article",
|
16
|
-
"section",
|
17
|
-
"nav",
|
18
|
-
"aside",
|
19
|
-
"h1",
|
20
|
-
"h2",
|
21
|
-
"h3",
|
22
|
-
"h4",
|
23
|
-
"h5",
|
24
|
-
"h6",
|
25
|
-
"hgroup",
|
26
|
-
"header",
|
27
|
-
"footer",
|
28
|
-
"address",
|
29
|
-
"p",
|
30
|
-
"hr",
|
31
|
-
"pre",
|
32
|
-
"blockquote",
|
33
|
-
"ol",
|
34
|
-
"ul",
|
35
|
-
"li",
|
36
|
-
"dl",
|
37
|
-
"dt",
|
38
|
-
"dd",
|
39
|
-
"figure",
|
40
|
-
"figcaption",
|
41
|
-
"main",
|
42
|
-
"div",
|
43
|
-
"a",
|
44
|
-
"em",
|
45
|
-
"strong",
|
46
|
-
"small",
|
47
|
-
"s",
|
48
|
-
"cite",
|
49
|
-
"q",
|
50
|
-
"dfn",
|
51
|
-
"abbr",
|
52
|
-
"data",
|
53
|
-
"time",
|
54
|
-
"code",
|
55
|
-
"var",
|
56
|
-
"samp",
|
57
|
-
"kbd",
|
58
|
-
"sub",
|
59
|
-
"sup",
|
60
|
-
"i",
|
61
|
-
"b",
|
62
|
-
"u",
|
63
|
-
"mark",
|
64
|
-
"ruby",
|
65
|
-
"rt",
|
66
|
-
"rp",
|
67
|
-
"bdi",
|
68
|
-
"bdo",
|
69
|
-
"span",
|
70
|
-
"br",
|
71
|
-
"wbr",
|
72
|
-
"ins",
|
73
|
-
"del",
|
74
|
-
"image",
|
75
|
-
"img",
|
76
|
-
"iframe",
|
77
|
-
"embed",
|
78
|
-
"object",
|
79
|
-
"param",
|
80
|
-
"video",
|
81
|
-
"audio",
|
82
|
-
"source",
|
83
|
-
"track",
|
84
|
-
"canvas",
|
85
|
-
"map",
|
86
|
-
"area",
|
87
|
-
"math",
|
88
|
-
"mi",
|
89
|
-
"mo",
|
90
|
-
"mn",
|
91
|
-
"ms",
|
92
|
-
"mtext",
|
93
|
-
"mglyph",
|
94
|
-
"malignmark",
|
95
|
-
"annotation-xml",
|
96
|
-
"svg",
|
97
|
-
"foreignobject",
|
98
|
-
"desc",
|
99
|
-
"table",
|
100
|
-
"caption",
|
101
|
-
"colgroup",
|
102
|
-
"col",
|
103
|
-
"tbody",
|
104
|
-
"thead",
|
105
|
-
"tfoot",
|
106
|
-
"tr",
|
107
|
-
"td",
|
108
|
-
"th",
|
109
|
-
"form",
|
110
|
-
"fieldset",
|
111
|
-
"legend",
|
112
|
-
"label",
|
113
|
-
"input",
|
114
|
-
"button",
|
115
|
-
"select",
|
116
|
-
"datalist",
|
117
|
-
"optgroup",
|
118
|
-
"option",
|
119
|
-
"textarea",
|
120
|
-
"keygen",
|
121
|
-
"output",
|
122
|
-
"progress",
|
123
|
-
"meter",
|
124
|
-
"details",
|
125
|
-
"summary",
|
126
|
-
"menu",
|
127
|
-
"menuitem",
|
128
|
-
"applet",
|
129
|
-
"acronym",
|
130
|
-
"bgsound",
|
131
|
-
"dir",
|
132
|
-
"frame",
|
133
|
-
"frameset",
|
134
|
-
"noframes",
|
135
|
-
"isindex",
|
136
|
-
"listing",
|
137
|
-
"xmp",
|
138
|
-
"nextid",
|
139
|
-
"noembed",
|
140
|
-
"plaintext",
|
141
|
-
"rb",
|
142
|
-
"strike",
|
143
|
-
"basefont",
|
144
|
-
"big",
|
145
|
-
"blink",
|
146
|
-
"center",
|
147
|
-
"font",
|
148
|
-
"marquee",
|
149
|
-
"multicol",
|
150
|
-
"nobr",
|
151
|
-
"spacer",
|
152
|
-
"tt",
|
153
|
-
"rtc",
|
data/test-nokogumbo.rb
DELETED
@@ -1,190 +0,0 @@
|
|
1
|
-
$:.unshift('lib')
|
2
|
-
$:.unshift('ext/nokogumboc')
|
3
|
-
|
4
|
-
gem 'minitest'
|
5
|
-
|
6
|
-
require 'nokogumbo'
|
7
|
-
require 'minitest/autorun'
|
8
|
-
|
9
|
-
class TestNokogumbo < Minitest::Test
|
10
|
-
def test_element_text
|
11
|
-
doc = Nokogiri::HTML5(buffer)
|
12
|
-
assert_equal "content", doc.at('span').text
|
13
|
-
end
|
14
|
-
|
15
|
-
def test_element_cdata_textarea
|
16
|
-
doc = Nokogiri::HTML5(buffer)
|
17
|
-
assert_equal "foo<x>bar", doc.at('textarea').text.strip
|
18
|
-
end
|
19
|
-
|
20
|
-
def test_element_cdata_script
|
21
|
-
doc = Nokogiri::HTML5.fragment(buffer)
|
22
|
-
assert_equal true, doc.document.html?
|
23
|
-
assert_equal "<script> if (a < b) alert(1) </script>", doc.at('script').to_s
|
24
|
-
end
|
25
|
-
|
26
|
-
def test_attr_value
|
27
|
-
doc = Nokogiri::HTML5(buffer)
|
28
|
-
assert_equal "utf-8", doc.at('meta')['charset']
|
29
|
-
end
|
30
|
-
|
31
|
-
def test_comment
|
32
|
-
doc = Nokogiri::HTML5(buffer)
|
33
|
-
assert_equal " test comment ", doc.xpath('//comment()').text
|
34
|
-
end
|
35
|
-
|
36
|
-
def test_unknown_element
|
37
|
-
doc = Nokogiri::HTML5(buffer)
|
38
|
-
assert_equal "main", doc.at('main').name
|
39
|
-
end
|
40
|
-
|
41
|
-
def test_IO
|
42
|
-
require 'stringio'
|
43
|
-
doc = Nokogiri::HTML5(StringIO.new(buffer))
|
44
|
-
assert_equal 'textarea', doc.at('form').element_children.first.name
|
45
|
-
end
|
46
|
-
|
47
|
-
def test_nil
|
48
|
-
doc = Nokogiri::HTML5(nil)
|
49
|
-
assert_equal 1, doc.search('body').count
|
50
|
-
end
|
51
|
-
|
52
|
-
if ''.respond_to? 'encoding'
|
53
|
-
def test_macroman_encoding
|
54
|
-
mac="<span>\xCA</span>".force_encoding('macroman')
|
55
|
-
doc = Nokogiri::HTML5(mac)
|
56
|
-
assert_equal '<span> </span>', doc.at('span').to_xml
|
57
|
-
end
|
58
|
-
|
59
|
-
def test_iso8859_encoding
|
60
|
-
iso8859="<span>Se\xF1or</span>".force_encoding(Encoding::ASCII_8BIT)
|
61
|
-
doc = Nokogiri::HTML5(iso8859)
|
62
|
-
assert_equal '<span>Señor</span>', doc.at('span').to_xml
|
63
|
-
end
|
64
|
-
|
65
|
-
def test_charset_encoding
|
66
|
-
utf8="<meta charset='utf-8'><span>Se\xC3\xB1or</span>".
|
67
|
-
force_encoding(Encoding::ASCII_8BIT)
|
68
|
-
doc = Nokogiri::HTML5(utf8)
|
69
|
-
assert_equal '<span>Señor</span>', doc.at('span').to_xml
|
70
|
-
end
|
71
|
-
|
72
|
-
def test_bogus_encoding
|
73
|
-
bogus="<meta charset='bogus'><span>Se\xF1or</span>".
|
74
|
-
force_encoding(Encoding::ASCII_8BIT)
|
75
|
-
doc = Nokogiri::HTML5(bogus)
|
76
|
-
assert_equal '<span>Señor</span>', doc.at('span').to_xml
|
77
|
-
end
|
78
|
-
end
|
79
|
-
|
80
|
-
def test_html5_doctype
|
81
|
-
doc = Nokogiri::HTML5.parse("<!DOCTYPE html><html></html>")
|
82
|
-
assert_match /<!DOCTYPE html>/, doc.to_html
|
83
|
-
end
|
84
|
-
|
85
|
-
def test_fragment_head
|
86
|
-
doc = Nokogiri::HTML5.fragment(buffer[/<head>(.*?)<\/head>/m, 1])
|
87
|
-
assert_equal "hello world", doc.xpath('title').text
|
88
|
-
assert_equal "utf-8", doc.xpath('meta').first['charset']
|
89
|
-
end
|
90
|
-
|
91
|
-
def test_fragment_body
|
92
|
-
doc = Nokogiri::HTML5.fragment(buffer[/<body>(.*?)<\/body>/m, 1])
|
93
|
-
assert_equal '<span>content</span>', doc.xpath('main/span').to_xml
|
94
|
-
assert_equal " test comment ", doc.xpath('comment()').text
|
95
|
-
end
|
96
|
-
|
97
|
-
def test_xlink_attribute
|
98
|
-
source = <<-EOF.gsub(/^ {6}/, '')
|
99
|
-
<svg xmlns="http://www.w3.org/2000/svg">
|
100
|
-
<a xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#s1"/>
|
101
|
-
</svg>
|
102
|
-
EOF
|
103
|
-
doc = Nokogiri::HTML5.fragment(source)
|
104
|
-
a = doc.at('a')
|
105
|
-
assert_equal ["xlink:href", "xmlns:xlink"], a.attributes.keys.sort
|
106
|
-
end
|
107
|
-
|
108
|
-
def test_template
|
109
|
-
source = <<-EOF.gsub(/^ {6}/, '')
|
110
|
-
<template id="productrow">
|
111
|
-
<tr>
|
112
|
-
<td class="record"></td>
|
113
|
-
<td></td>
|
114
|
-
</tr>
|
115
|
-
</template>
|
116
|
-
EOF
|
117
|
-
doc = Nokogiri::HTML5.fragment(source)
|
118
|
-
template = doc.at('template')
|
119
|
-
assert_equal "productrow", template['id']
|
120
|
-
assert_equal "record", template.at('td')['class']
|
121
|
-
end
|
122
|
-
|
123
|
-
def test_root_comments
|
124
|
-
doc = Nokogiri::HTML5("<!DOCTYPE html><!-- start --><html></html><!-- -->")
|
125
|
-
assert_equal ["html", "comment", "html", "comment"], doc.children.map(&:name)
|
126
|
-
end
|
127
|
-
|
128
|
-
def test_parse_errors
|
129
|
-
doc = Nokogiri::HTML5("<!DOCTYPE html><html><!-- -- --></a>", max_parse_errors: 10)
|
130
|
-
assert_equal doc.errors.length, 2
|
131
|
-
doc = Nokogiri::HTML5("<!DOCTYPE html><html>", max_parse_errors: 10)
|
132
|
-
assert_empty doc.errors
|
133
|
-
end
|
134
|
-
|
135
|
-
def test_max_parse_errors
|
136
|
-
# This document contains 2 parse errors, but we force limit to 1.
|
137
|
-
doc = Nokogiri::HTML5("<!DOCTYPE html><html><!-- -- --></a>", max_parse_errors: 1)
|
138
|
-
assert_equal 1, doc.errors.length
|
139
|
-
doc = Nokogiri::HTML5("<!DOCTYPE html><html>", max_parse_errors: 1)
|
140
|
-
assert_empty doc.errors
|
141
|
-
end
|
142
|
-
|
143
|
-
def test_default_max_parse_errors
|
144
|
-
# This document contains 200 parse errors, but default limit is 0.
|
145
|
-
doc = Nokogiri::HTML5("<!DOCTYPE html><html>" + "</p>" * 200)
|
146
|
-
assert_equal 0, doc.errors.length
|
147
|
-
end
|
148
|
-
|
149
|
-
def test_parse_fragment_errors
|
150
|
-
doc = Nokogiri::HTML5.fragment("<\r\n", max_parse_errors: 10)
|
151
|
-
refute_empty doc.errors
|
152
|
-
end
|
153
|
-
|
154
|
-
def test_fragment_max_parse_errors
|
155
|
-
# This fragment contains 3 parse errors, but we force limit to 1.
|
156
|
-
doc = Nokogiri::HTML5.fragment("<!-- -- --></a>", max_parse_errors: 1)
|
157
|
-
assert_equal 1, doc.errors.length
|
158
|
-
end
|
159
|
-
|
160
|
-
def test_fragment_default_max_parse_errors
|
161
|
-
# This fragment contains 201 parse errors, but default limit is 0.
|
162
|
-
doc = Nokogiri::HTML5.fragment("</p>" * 200)
|
163
|
-
assert_equal 0, doc.errors.length
|
164
|
-
end
|
165
|
-
|
166
|
-
private
|
167
|
-
|
168
|
-
def buffer
|
169
|
-
<<-EOF.gsub(/^ /, '')
|
170
|
-
<html>
|
171
|
-
<head>
|
172
|
-
<meta charset="utf-8"/>
|
173
|
-
<title>hello world</title>
|
174
|
-
<script> if (a < b) alert(1) </script>
|
175
|
-
</head>
|
176
|
-
<body>
|
177
|
-
<h1>hello world</h1>
|
178
|
-
<main>
|
179
|
-
<span>content</span>
|
180
|
-
</main>
|
181
|
-
<!-- test comment -->
|
182
|
-
<form>
|
183
|
-
<textarea>foo<x>bar</textarea>
|
184
|
-
</form>
|
185
|
-
</body>
|
186
|
-
</html>
|
187
|
-
EOF
|
188
|
-
end
|
189
|
-
|
190
|
-
end
|