nokogumbo 1.5.0 → 2.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +237 -26
- data/ext/nokogumbo/extconf.rb +121 -0
- data/ext/nokogumbo/nokogumbo.c +793 -0
- data/gumbo-parser/src/ascii.c +75 -0
- data/gumbo-parser/src/ascii.h +115 -0
- data/gumbo-parser/src/attribute.c +26 -28
- data/gumbo-parser/src/attribute.h +3 -23
- data/gumbo-parser/src/char_ref.c +5972 -6816
- data/gumbo-parser/src/char_ref.h +14 -45
- data/gumbo-parser/src/error.c +510 -163
- data/gumbo-parser/src/error.h +70 -147
- data/gumbo-parser/src/foreign_attrs.c +104 -0
- data/gumbo-parser/src/gumbo.h +577 -305
- data/gumbo-parser/src/insertion_mode.h +4 -28
- data/gumbo-parser/src/macros.h +91 -0
- data/gumbo-parser/src/parser.c +2922 -2228
- data/gumbo-parser/src/parser.h +6 -22
- data/gumbo-parser/src/replacement.h +33 -0
- data/gumbo-parser/src/string_buffer.c +43 -50
- data/gumbo-parser/src/string_buffer.h +24 -40
- data/gumbo-parser/src/string_piece.c +39 -39
- data/gumbo-parser/src/svg_attrs.c +174 -0
- data/gumbo-parser/src/svg_tags.c +137 -0
- data/gumbo-parser/src/tag.c +186 -59
- data/gumbo-parser/src/tag_lookup.c +382 -0
- data/gumbo-parser/src/tag_lookup.h +13 -0
- data/gumbo-parser/src/token_buffer.c +79 -0
- data/gumbo-parser/src/token_buffer.h +71 -0
- data/gumbo-parser/src/token_type.h +1 -25
- data/gumbo-parser/src/tokenizer.c +2127 -1561
- data/gumbo-parser/src/tokenizer.h +41 -52
- data/gumbo-parser/src/tokenizer_states.h +281 -45
- data/gumbo-parser/src/utf8.c +98 -123
- data/gumbo-parser/src/utf8.h +84 -52
- data/gumbo-parser/src/util.c +48 -38
- data/gumbo-parser/src/util.h +10 -40
- data/gumbo-parser/src/vector.c +45 -57
- data/gumbo-parser/src/vector.h +17 -39
- data/lib/nokogumbo.rb +11 -173
- data/lib/nokogumbo/html5.rb +252 -0
- data/lib/nokogumbo/html5/document.rb +53 -0
- data/lib/nokogumbo/html5/document_fragment.rb +62 -0
- data/lib/nokogumbo/html5/node.rb +72 -0
- data/lib/nokogumbo/version.rb +3 -0
- metadata +43 -24
- data/ext/nokogumboc/extconf.rb +0 -60
- data/ext/nokogumboc/nokogumbo.c +0 -295
- data/gumbo-parser/src/char_ref.rl +0 -2554
- data/gumbo-parser/src/string_piece.h +0 -38
- data/gumbo-parser/src/tag.in +0 -150
- data/gumbo-parser/src/tag_enum.h +0 -153
- data/gumbo-parser/src/tag_gperf.h +0 -105
- data/gumbo-parser/src/tag_sizes.h +0 -4
- data/gumbo-parser/src/tag_strings.h +0 -153
- data/gumbo-parser/visualc/include/strings.h +0 -4
- data/test-nokogumbo.rb +0 -190
@@ -1,153 +0,0 @@
|
|
1
|
-
// Generated via `gentags.py src/tag.in`.
|
2
|
-
// Do not edit; edit src/tag.in instead.
|
3
|
-
// clang-format off
|
4
|
-
"html",
|
5
|
-
"head",
|
6
|
-
"title",
|
7
|
-
"base",
|
8
|
-
"link",
|
9
|
-
"meta",
|
10
|
-
"style",
|
11
|
-
"script",
|
12
|
-
"noscript",
|
13
|
-
"template",
|
14
|
-
"body",
|
15
|
-
"article",
|
16
|
-
"section",
|
17
|
-
"nav",
|
18
|
-
"aside",
|
19
|
-
"h1",
|
20
|
-
"h2",
|
21
|
-
"h3",
|
22
|
-
"h4",
|
23
|
-
"h5",
|
24
|
-
"h6",
|
25
|
-
"hgroup",
|
26
|
-
"header",
|
27
|
-
"footer",
|
28
|
-
"address",
|
29
|
-
"p",
|
30
|
-
"hr",
|
31
|
-
"pre",
|
32
|
-
"blockquote",
|
33
|
-
"ol",
|
34
|
-
"ul",
|
35
|
-
"li",
|
36
|
-
"dl",
|
37
|
-
"dt",
|
38
|
-
"dd",
|
39
|
-
"figure",
|
40
|
-
"figcaption",
|
41
|
-
"main",
|
42
|
-
"div",
|
43
|
-
"a",
|
44
|
-
"em",
|
45
|
-
"strong",
|
46
|
-
"small",
|
47
|
-
"s",
|
48
|
-
"cite",
|
49
|
-
"q",
|
50
|
-
"dfn",
|
51
|
-
"abbr",
|
52
|
-
"data",
|
53
|
-
"time",
|
54
|
-
"code",
|
55
|
-
"var",
|
56
|
-
"samp",
|
57
|
-
"kbd",
|
58
|
-
"sub",
|
59
|
-
"sup",
|
60
|
-
"i",
|
61
|
-
"b",
|
62
|
-
"u",
|
63
|
-
"mark",
|
64
|
-
"ruby",
|
65
|
-
"rt",
|
66
|
-
"rp",
|
67
|
-
"bdi",
|
68
|
-
"bdo",
|
69
|
-
"span",
|
70
|
-
"br",
|
71
|
-
"wbr",
|
72
|
-
"ins",
|
73
|
-
"del",
|
74
|
-
"image",
|
75
|
-
"img",
|
76
|
-
"iframe",
|
77
|
-
"embed",
|
78
|
-
"object",
|
79
|
-
"param",
|
80
|
-
"video",
|
81
|
-
"audio",
|
82
|
-
"source",
|
83
|
-
"track",
|
84
|
-
"canvas",
|
85
|
-
"map",
|
86
|
-
"area",
|
87
|
-
"math",
|
88
|
-
"mi",
|
89
|
-
"mo",
|
90
|
-
"mn",
|
91
|
-
"ms",
|
92
|
-
"mtext",
|
93
|
-
"mglyph",
|
94
|
-
"malignmark",
|
95
|
-
"annotation-xml",
|
96
|
-
"svg",
|
97
|
-
"foreignobject",
|
98
|
-
"desc",
|
99
|
-
"table",
|
100
|
-
"caption",
|
101
|
-
"colgroup",
|
102
|
-
"col",
|
103
|
-
"tbody",
|
104
|
-
"thead",
|
105
|
-
"tfoot",
|
106
|
-
"tr",
|
107
|
-
"td",
|
108
|
-
"th",
|
109
|
-
"form",
|
110
|
-
"fieldset",
|
111
|
-
"legend",
|
112
|
-
"label",
|
113
|
-
"input",
|
114
|
-
"button",
|
115
|
-
"select",
|
116
|
-
"datalist",
|
117
|
-
"optgroup",
|
118
|
-
"option",
|
119
|
-
"textarea",
|
120
|
-
"keygen",
|
121
|
-
"output",
|
122
|
-
"progress",
|
123
|
-
"meter",
|
124
|
-
"details",
|
125
|
-
"summary",
|
126
|
-
"menu",
|
127
|
-
"menuitem",
|
128
|
-
"applet",
|
129
|
-
"acronym",
|
130
|
-
"bgsound",
|
131
|
-
"dir",
|
132
|
-
"frame",
|
133
|
-
"frameset",
|
134
|
-
"noframes",
|
135
|
-
"isindex",
|
136
|
-
"listing",
|
137
|
-
"xmp",
|
138
|
-
"nextid",
|
139
|
-
"noembed",
|
140
|
-
"plaintext",
|
141
|
-
"rb",
|
142
|
-
"strike",
|
143
|
-
"basefont",
|
144
|
-
"big",
|
145
|
-
"blink",
|
146
|
-
"center",
|
147
|
-
"font",
|
148
|
-
"marquee",
|
149
|
-
"multicol",
|
150
|
-
"nobr",
|
151
|
-
"spacer",
|
152
|
-
"tt",
|
153
|
-
"rtc",
|
data/test-nokogumbo.rb
DELETED
@@ -1,190 +0,0 @@
|
|
1
|
-
$:.unshift('lib')
|
2
|
-
$:.unshift('ext/nokogumboc')
|
3
|
-
|
4
|
-
gem 'minitest'
|
5
|
-
|
6
|
-
require 'nokogumbo'
|
7
|
-
require 'minitest/autorun'
|
8
|
-
|
9
|
-
class TestNokogumbo < Minitest::Test
|
10
|
-
def test_element_text
|
11
|
-
doc = Nokogiri::HTML5(buffer)
|
12
|
-
assert_equal "content", doc.at('span').text
|
13
|
-
end
|
14
|
-
|
15
|
-
def test_element_cdata_textarea
|
16
|
-
doc = Nokogiri::HTML5(buffer)
|
17
|
-
assert_equal "foo<x>bar", doc.at('textarea').text.strip
|
18
|
-
end
|
19
|
-
|
20
|
-
def test_element_cdata_script
|
21
|
-
doc = Nokogiri::HTML5.fragment(buffer)
|
22
|
-
assert_equal true, doc.document.html?
|
23
|
-
assert_equal "<script> if (a < b) alert(1) </script>", doc.at('script').to_s
|
24
|
-
end
|
25
|
-
|
26
|
-
def test_attr_value
|
27
|
-
doc = Nokogiri::HTML5(buffer)
|
28
|
-
assert_equal "utf-8", doc.at('meta')['charset']
|
29
|
-
end
|
30
|
-
|
31
|
-
def test_comment
|
32
|
-
doc = Nokogiri::HTML5(buffer)
|
33
|
-
assert_equal " test comment ", doc.xpath('//comment()').text
|
34
|
-
end
|
35
|
-
|
36
|
-
def test_unknown_element
|
37
|
-
doc = Nokogiri::HTML5(buffer)
|
38
|
-
assert_equal "main", doc.at('main').name
|
39
|
-
end
|
40
|
-
|
41
|
-
def test_IO
|
42
|
-
require 'stringio'
|
43
|
-
doc = Nokogiri::HTML5(StringIO.new(buffer))
|
44
|
-
assert_equal 'textarea', doc.at('form').element_children.first.name
|
45
|
-
end
|
46
|
-
|
47
|
-
def test_nil
|
48
|
-
doc = Nokogiri::HTML5(nil)
|
49
|
-
assert_equal 1, doc.search('body').count
|
50
|
-
end
|
51
|
-
|
52
|
-
if ''.respond_to? 'encoding'
|
53
|
-
def test_macroman_encoding
|
54
|
-
mac="<span>\xCA</span>".force_encoding('macroman')
|
55
|
-
doc = Nokogiri::HTML5(mac)
|
56
|
-
assert_equal '<span> </span>', doc.at('span').to_xml
|
57
|
-
end
|
58
|
-
|
59
|
-
def test_iso8859_encoding
|
60
|
-
iso8859="<span>Se\xF1or</span>".force_encoding(Encoding::ASCII_8BIT)
|
61
|
-
doc = Nokogiri::HTML5(iso8859)
|
62
|
-
assert_equal '<span>Señor</span>', doc.at('span').to_xml
|
63
|
-
end
|
64
|
-
|
65
|
-
def test_charset_encoding
|
66
|
-
utf8="<meta charset='utf-8'><span>Se\xC3\xB1or</span>".
|
67
|
-
force_encoding(Encoding::ASCII_8BIT)
|
68
|
-
doc = Nokogiri::HTML5(utf8)
|
69
|
-
assert_equal '<span>Señor</span>', doc.at('span').to_xml
|
70
|
-
end
|
71
|
-
|
72
|
-
def test_bogus_encoding
|
73
|
-
bogus="<meta charset='bogus'><span>Se\xF1or</span>".
|
74
|
-
force_encoding(Encoding::ASCII_8BIT)
|
75
|
-
doc = Nokogiri::HTML5(bogus)
|
76
|
-
assert_equal '<span>Señor</span>', doc.at('span').to_xml
|
77
|
-
end
|
78
|
-
end
|
79
|
-
|
80
|
-
def test_html5_doctype
|
81
|
-
doc = Nokogiri::HTML5.parse("<!DOCTYPE html><html></html>")
|
82
|
-
assert_match /<!DOCTYPE html>/, doc.to_html
|
83
|
-
end
|
84
|
-
|
85
|
-
def test_fragment_head
|
86
|
-
doc = Nokogiri::HTML5.fragment(buffer[/<head>(.*?)<\/head>/m, 1])
|
87
|
-
assert_equal "hello world", doc.xpath('title').text
|
88
|
-
assert_equal "utf-8", doc.xpath('meta').first['charset']
|
89
|
-
end
|
90
|
-
|
91
|
-
def test_fragment_body
|
92
|
-
doc = Nokogiri::HTML5.fragment(buffer[/<body>(.*?)<\/body>/m, 1])
|
93
|
-
assert_equal '<span>content</span>', doc.xpath('main/span').to_xml
|
94
|
-
assert_equal " test comment ", doc.xpath('comment()').text
|
95
|
-
end
|
96
|
-
|
97
|
-
def test_xlink_attribute
|
98
|
-
source = <<-EOF.gsub(/^ {6}/, '')
|
99
|
-
<svg xmlns="http://www.w3.org/2000/svg">
|
100
|
-
<a xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#s1"/>
|
101
|
-
</svg>
|
102
|
-
EOF
|
103
|
-
doc = Nokogiri::HTML5.fragment(source)
|
104
|
-
a = doc.at('a')
|
105
|
-
assert_equal ["xlink:href", "xmlns:xlink"], a.attributes.keys.sort
|
106
|
-
end
|
107
|
-
|
108
|
-
def test_template
|
109
|
-
source = <<-EOF.gsub(/^ {6}/, '')
|
110
|
-
<template id="productrow">
|
111
|
-
<tr>
|
112
|
-
<td class="record"></td>
|
113
|
-
<td></td>
|
114
|
-
</tr>
|
115
|
-
</template>
|
116
|
-
EOF
|
117
|
-
doc = Nokogiri::HTML5.fragment(source)
|
118
|
-
template = doc.at('template')
|
119
|
-
assert_equal "productrow", template['id']
|
120
|
-
assert_equal "record", template.at('td')['class']
|
121
|
-
end
|
122
|
-
|
123
|
-
def test_root_comments
|
124
|
-
doc = Nokogiri::HTML5("<!DOCTYPE html><!-- start --><html></html><!-- -->")
|
125
|
-
assert_equal ["html", "comment", "html", "comment"], doc.children.map(&:name)
|
126
|
-
end
|
127
|
-
|
128
|
-
def test_parse_errors
|
129
|
-
doc = Nokogiri::HTML5("<!DOCTYPE html><html><!-- -- --></a>", max_parse_errors: 10)
|
130
|
-
assert_equal doc.errors.length, 2
|
131
|
-
doc = Nokogiri::HTML5("<!DOCTYPE html><html>", max_parse_errors: 10)
|
132
|
-
assert_empty doc.errors
|
133
|
-
end
|
134
|
-
|
135
|
-
def test_max_parse_errors
|
136
|
-
# This document contains 2 parse errors, but we force limit to 1.
|
137
|
-
doc = Nokogiri::HTML5("<!DOCTYPE html><html><!-- -- --></a>", max_parse_errors: 1)
|
138
|
-
assert_equal 1, doc.errors.length
|
139
|
-
doc = Nokogiri::HTML5("<!DOCTYPE html><html>", max_parse_errors: 1)
|
140
|
-
assert_empty doc.errors
|
141
|
-
end
|
142
|
-
|
143
|
-
def test_default_max_parse_errors
|
144
|
-
# This document contains 200 parse errors, but default limit is 0.
|
145
|
-
doc = Nokogiri::HTML5("<!DOCTYPE html><html>" + "</p>" * 200)
|
146
|
-
assert_equal 0, doc.errors.length
|
147
|
-
end
|
148
|
-
|
149
|
-
def test_parse_fragment_errors
|
150
|
-
doc = Nokogiri::HTML5.fragment("<\r\n", max_parse_errors: 10)
|
151
|
-
refute_empty doc.errors
|
152
|
-
end
|
153
|
-
|
154
|
-
def test_fragment_max_parse_errors
|
155
|
-
# This fragment contains 3 parse errors, but we force limit to 1.
|
156
|
-
doc = Nokogiri::HTML5.fragment("<!-- -- --></a>", max_parse_errors: 1)
|
157
|
-
assert_equal 1, doc.errors.length
|
158
|
-
end
|
159
|
-
|
160
|
-
def test_fragment_default_max_parse_errors
|
161
|
-
# This fragment contains 201 parse errors, but default limit is 0.
|
162
|
-
doc = Nokogiri::HTML5.fragment("</p>" * 200)
|
163
|
-
assert_equal 0, doc.errors.length
|
164
|
-
end
|
165
|
-
|
166
|
-
private
|
167
|
-
|
168
|
-
def buffer
|
169
|
-
<<-EOF.gsub(/^ /, '')
|
170
|
-
<html>
|
171
|
-
<head>
|
172
|
-
<meta charset="utf-8"/>
|
173
|
-
<title>hello world</title>
|
174
|
-
<script> if (a < b) alert(1) </script>
|
175
|
-
</head>
|
176
|
-
<body>
|
177
|
-
<h1>hello world</h1>
|
178
|
-
<main>
|
179
|
-
<span>content</span>
|
180
|
-
</main>
|
181
|
-
<!-- test comment -->
|
182
|
-
<form>
|
183
|
-
<textarea>foo<x>bar</textarea>
|
184
|
-
</form>
|
185
|
-
</body>
|
186
|
-
</html>
|
187
|
-
EOF
|
188
|
-
end
|
189
|
-
|
190
|
-
end
|