nokogumbo 1.5.0 → 2.0.0.pre.alpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +56 -0
  3. data/README.md +146 -22
  4. data/ext/nokogumbo/extconf.rb +116 -0
  5. data/ext/{nokogumboc → nokogumbo}/nokogumbo.c +174 -71
  6. data/gumbo-parser/src/ascii.c +33 -0
  7. data/gumbo-parser/src/ascii.h +31 -0
  8. data/gumbo-parser/src/attribute.c +26 -28
  9. data/gumbo-parser/src/attribute.h +3 -23
  10. data/gumbo-parser/src/char_ref.c +135 -2351
  11. data/gumbo-parser/src/char_ref.h +13 -29
  12. data/gumbo-parser/src/error.c +215 -133
  13. data/gumbo-parser/src/error.h +34 -49
  14. data/gumbo-parser/src/foreign_attrs.c +104 -0
  15. data/gumbo-parser/src/gumbo.h +506 -304
  16. data/gumbo-parser/src/insertion_mode.h +4 -28
  17. data/gumbo-parser/src/macros.h +91 -0
  18. data/gumbo-parser/src/parser.c +1989 -1431
  19. data/gumbo-parser/src/parser.h +6 -22
  20. data/gumbo-parser/src/replacement.h +33 -0
  21. data/gumbo-parser/src/string_buffer.c +43 -50
  22. data/gumbo-parser/src/string_buffer.h +24 -40
  23. data/gumbo-parser/src/string_piece.c +39 -39
  24. data/gumbo-parser/src/svg_attrs.c +174 -0
  25. data/gumbo-parser/src/svg_tags.c +137 -0
  26. data/gumbo-parser/src/tag.c +186 -59
  27. data/gumbo-parser/src/tag_lookup.c +382 -0
  28. data/gumbo-parser/src/tag_lookup.h +13 -0
  29. data/gumbo-parser/src/token_type.h +1 -25
  30. data/gumbo-parser/src/tokenizer.c +899 -495
  31. data/gumbo-parser/src/tokenizer.h +37 -37
  32. data/gumbo-parser/src/tokenizer_states.h +6 -22
  33. data/gumbo-parser/src/utf8.c +103 -86
  34. data/gumbo-parser/src/utf8.h +37 -41
  35. data/gumbo-parser/src/util.c +48 -38
  36. data/gumbo-parser/src/util.h +10 -40
  37. data/gumbo-parser/src/vector.c +45 -57
  38. data/gumbo-parser/src/vector.h +17 -39
  39. data/lib/nokogumbo.rb +10 -174
  40. data/lib/nokogumbo/html5.rb +250 -0
  41. data/lib/nokogumbo/html5/document.rb +37 -0
  42. data/lib/nokogumbo/html5/document_fragment.rb +46 -0
  43. data/lib/nokogumbo/version.rb +3 -0
  44. data/lib/nokogumbo/xml/node.rb +57 -0
  45. metadata +32 -19
  46. data/ext/nokogumboc/extconf.rb +0 -60
  47. data/gumbo-parser/src/char_ref.rl +0 -2554
  48. data/gumbo-parser/src/string_piece.h +0 -38
  49. data/gumbo-parser/src/tag.in +0 -150
  50. data/gumbo-parser/src/tag_enum.h +0 -153
  51. data/gumbo-parser/src/tag_gperf.h +0 -105
  52. data/gumbo-parser/src/tag_sizes.h +0 -4
  53. data/gumbo-parser/src/tag_strings.h +0 -153
  54. data/gumbo-parser/visualc/include/strings.h +0 -4
  55. data/test-nokogumbo.rb +0 -190
data/gumbo-parser/src/tag_sizes.h DELETED
@@ -1,4 +0,0 @@
1
- // Generated via `gentags.py src/tag.in`.
2
- // Do not edit; edit src/tag.in instead.
3
- // clang-format off
4
- 4, 4, 5, 4, 4, 4, 5, 6, 8, 8, 4, 7, 7, 3, 5, 2, 2, 2, 2, 2, 2, 6, 6, 6, 7, 1, 2, 3, 10, 2, 2, 2, 2, 2, 2, 6, 10, 4, 3, 1, 2, 6, 5, 1, 4, 1, 3, 4, 4, 4, 4, 3, 4, 3, 3, 3, 1, 1, 1, 4, 4, 2, 2, 3, 3, 4, 2, 3, 3, 3, 5, 3, 6, 5, 6, 5, 5, 5, 6, 5, 6, 3, 4, 4, 2, 2, 2, 2, 5, 6, 10, 14, 3, 13, 4, 5, 7, 8, 3, 5, 5, 5, 2, 2, 2, 4, 8, 6, 5, 5, 6, 6, 8, 8, 6, 8, 6, 6, 8, 5, 7, 7, 4, 8, 6, 7, 7, 3, 5, 8, 8, 7, 7, 3, 6, 7, 9, 2, 6, 8, 3, 5, 6, 4, 7, 8, 4, 6, 2, 3,
data/gumbo-parser/src/tag_strings.h DELETED
@@ -1,153 +0,0 @@
1
- // Generated via `gentags.py src/tag.in`.
2
- // Do not edit; edit src/tag.in instead.
3
- // clang-format off
4
- "html",
5
- "head",
6
- "title",
7
- "base",
8
- "link",
9
- "meta",
10
- "style",
11
- "script",
12
- "noscript",
13
- "template",
14
- "body",
15
- "article",
16
- "section",
17
- "nav",
18
- "aside",
19
- "h1",
20
- "h2",
21
- "h3",
22
- "h4",
23
- "h5",
24
- "h6",
25
- "hgroup",
26
- "header",
27
- "footer",
28
- "address",
29
- "p",
30
- "hr",
31
- "pre",
32
- "blockquote",
33
- "ol",
34
- "ul",
35
- "li",
36
- "dl",
37
- "dt",
38
- "dd",
39
- "figure",
40
- "figcaption",
41
- "main",
42
- "div",
43
- "a",
44
- "em",
45
- "strong",
46
- "small",
47
- "s",
48
- "cite",
49
- "q",
50
- "dfn",
51
- "abbr",
52
- "data",
53
- "time",
54
- "code",
55
- "var",
56
- "samp",
57
- "kbd",
58
- "sub",
59
- "sup",
60
- "i",
61
- "b",
62
- "u",
63
- "mark",
64
- "ruby",
65
- "rt",
66
- "rp",
67
- "bdi",
68
- "bdo",
69
- "span",
70
- "br",
71
- "wbr",
72
- "ins",
73
- "del",
74
- "image",
75
- "img",
76
- "iframe",
77
- "embed",
78
- "object",
79
- "param",
80
- "video",
81
- "audio",
82
- "source",
83
- "track",
84
- "canvas",
85
- "map",
86
- "area",
87
- "math",
88
- "mi",
89
- "mo",
90
- "mn",
91
- "ms",
92
- "mtext",
93
- "mglyph",
94
- "malignmark",
95
- "annotation-xml",
96
- "svg",
97
- "foreignobject",
98
- "desc",
99
- "table",
100
- "caption",
101
- "colgroup",
102
- "col",
103
- "tbody",
104
- "thead",
105
- "tfoot",
106
- "tr",
107
- "td",
108
- "th",
109
- "form",
110
- "fieldset",
111
- "legend",
112
- "label",
113
- "input",
114
- "button",
115
- "select",
116
- "datalist",
117
- "optgroup",
118
- "option",
119
- "textarea",
120
- "keygen",
121
- "output",
122
- "progress",
123
- "meter",
124
- "details",
125
- "summary",
126
- "menu",
127
- "menuitem",
128
- "applet",
129
- "acronym",
130
- "bgsound",
131
- "dir",
132
- "frame",
133
- "frameset",
134
- "noframes",
135
- "isindex",
136
- "listing",
137
- "xmp",
138
- "nextid",
139
- "noembed",
140
- "plaintext",
141
- "rb",
142
- "strike",
143
- "basefont",
144
- "big",
145
- "blink",
146
- "center",
147
- "font",
148
- "marquee",
149
- "multicol",
150
- "nobr",
151
- "spacer",
152
- "tt",
153
- "rtc",
data/gumbo-parser/visualc/include/strings.h DELETED
@@ -1,4 +0,0 @@
1
- /*Dummy file to satisfy source file dependencies on Windows platform*/
2
- #define strcasecmp _stricmp
3
- #define strncasecmp _strnicmp
4
- #define inline __inline
data/test-nokogumbo.rb DELETED
@@ -1,190 +0,0 @@
1
- $:.unshift('lib')
2
- $:.unshift('ext/nokogumboc')
3
-
4
- gem 'minitest'
5
-
6
- require 'nokogumbo'
7
- require 'minitest/autorun'
8
-
9
- class TestNokogumbo < Minitest::Test
10
- def test_element_text
11
- doc = Nokogiri::HTML5(buffer)
12
- assert_equal "content", doc.at('span').text
13
- end
14
-
15
- def test_element_cdata_textarea
16
- doc = Nokogiri::HTML5(buffer)
17
- assert_equal "foo<x>bar", doc.at('textarea').text.strip
18
- end
19
-
20
- def test_element_cdata_script
21
- doc = Nokogiri::HTML5.fragment(buffer)
22
- assert_equal true, doc.document.html?
23
- assert_equal "<script> if (a < b) alert(1) </script>", doc.at('script').to_s
24
- end
25
-
26
- def test_attr_value
27
- doc = Nokogiri::HTML5(buffer)
28
- assert_equal "utf-8", doc.at('meta')['charset']
29
- end
30
-
31
- def test_comment
32
- doc = Nokogiri::HTML5(buffer)
33
- assert_equal " test comment ", doc.xpath('//comment()').text
34
- end
35
-
36
- def test_unknown_element
37
- doc = Nokogiri::HTML5(buffer)
38
- assert_equal "main", doc.at('main').name
39
- end
40
-
41
- def test_IO
42
- require 'stringio'
43
- doc = Nokogiri::HTML5(StringIO.new(buffer))
44
- assert_equal 'textarea', doc.at('form').element_children.first.name
45
- end
46
-
47
- def test_nil
48
- doc = Nokogiri::HTML5(nil)
49
- assert_equal 1, doc.search('body').count
50
- end
51
-
52
- if ''.respond_to? 'encoding'
53
- def test_macroman_encoding
54
- mac="<span>\xCA</span>".force_encoding('macroman')
55
- doc = Nokogiri::HTML5(mac)
56
- assert_equal '<span>&#xA0;</span>', doc.at('span').to_xml
57
- end
58
-
59
- def test_iso8859_encoding
60
- iso8859="<span>Se\xF1or</span>".force_encoding(Encoding::ASCII_8BIT)
61
- doc = Nokogiri::HTML5(iso8859)
62
- assert_equal '<span>Se&#xF1;or</span>', doc.at('span').to_xml
63
- end
64
-
65
- def test_charset_encoding
66
- utf8="<meta charset='utf-8'><span>Se\xC3\xB1or</span>".
67
- force_encoding(Encoding::ASCII_8BIT)
68
- doc = Nokogiri::HTML5(utf8)
69
- assert_equal '<span>Se&#xF1;or</span>', doc.at('span').to_xml
70
- end
71
-
72
- def test_bogus_encoding
73
- bogus="<meta charset='bogus'><span>Se\xF1or</span>".
74
- force_encoding(Encoding::ASCII_8BIT)
75
- doc = Nokogiri::HTML5(bogus)
76
- assert_equal '<span>Se&#xF1;or</span>', doc.at('span').to_xml
77
- end
78
- end
79
-
80
- def test_html5_doctype
81
- doc = Nokogiri::HTML5.parse("<!DOCTYPE html><html></html>")
82
- assert_match /<!DOCTYPE html>/, doc.to_html
83
- end
84
-
85
- def test_fragment_head
86
- doc = Nokogiri::HTML5.fragment(buffer[/<head>(.*?)<\/head>/m, 1])
87
- assert_equal "hello world", doc.xpath('title').text
88
- assert_equal "utf-8", doc.xpath('meta').first['charset']
89
- end
90
-
91
- def test_fragment_body
92
- doc = Nokogiri::HTML5.fragment(buffer[/<body>(.*?)<\/body>/m, 1])
93
- assert_equal '<span>content</span>', doc.xpath('main/span').to_xml
94
- assert_equal " test comment ", doc.xpath('comment()').text
95
- end
96
-
97
- def test_xlink_attribute
98
- source = <<-EOF.gsub(/^ {6}/, '')
99
- <svg xmlns="http://www.w3.org/2000/svg">
100
- <a xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#s1"/>
101
- </svg>
102
- EOF
103
- doc = Nokogiri::HTML5.fragment(source)
104
- a = doc.at('a')
105
- assert_equal ["xlink:href", "xmlns:xlink"], a.attributes.keys.sort
106
- end
107
-
108
- def test_template
109
- source = <<-EOF.gsub(/^ {6}/, '')
110
- <template id="productrow">
111
- <tr>
112
- <td class="record"></td>
113
- <td></td>
114
- </tr>
115
- </template>
116
- EOF
117
- doc = Nokogiri::HTML5.fragment(source)
118
- template = doc.at('template')
119
- assert_equal "productrow", template['id']
120
- assert_equal "record", template.at('td')['class']
121
- end
122
-
123
- def test_root_comments
124
- doc = Nokogiri::HTML5("<!DOCTYPE html><!-- start --><html></html><!-- -->")
125
- assert_equal ["html", "comment", "html", "comment"], doc.children.map(&:name)
126
- end
127
-
128
- def test_parse_errors
129
- doc = Nokogiri::HTML5("<!DOCTYPE html><html><!-- -- --></a>", max_parse_errors: 10)
130
- assert_equal doc.errors.length, 2
131
- doc = Nokogiri::HTML5("<!DOCTYPE html><html>", max_parse_errors: 10)
132
- assert_empty doc.errors
133
- end
134
-
135
- def test_max_parse_errors
136
- # This document contains 2 parse errors, but we force limit to 1.
137
- doc = Nokogiri::HTML5("<!DOCTYPE html><html><!-- -- --></a>", max_parse_errors: 1)
138
- assert_equal 1, doc.errors.length
139
- doc = Nokogiri::HTML5("<!DOCTYPE html><html>", max_parse_errors: 1)
140
- assert_empty doc.errors
141
- end
142
-
143
- def test_default_max_parse_errors
144
- # This document contains 200 parse errors, but default limit is 0.
145
- doc = Nokogiri::HTML5("<!DOCTYPE html><html>" + "</p>" * 200)
146
- assert_equal 0, doc.errors.length
147
- end
148
-
149
- def test_parse_fragment_errors
150
- doc = Nokogiri::HTML5.fragment("<\r\n", max_parse_errors: 10)
151
- refute_empty doc.errors
152
- end
153
-
154
- def test_fragment_max_parse_errors
155
- # This fragment contains 3 parse errors, but we force limit to 1.
156
- doc = Nokogiri::HTML5.fragment("<!-- -- --></a>", max_parse_errors: 1)
157
- assert_equal 1, doc.errors.length
158
- end
159
-
160
- def test_fragment_default_max_parse_errors
161
- # This fragment contains 201 parse errors, but default limit is 0.
162
- doc = Nokogiri::HTML5.fragment("</p>" * 200)
163
- assert_equal 0, doc.errors.length
164
- end
165
-
166
- private
167
-
168
- def buffer
169
- <<-EOF.gsub(/^ /, '')
170
- <html>
171
- <head>
172
- <meta charset="utf-8"/>
173
- <title>hello world</title>
174
- <script> if (a < b) alert(1) </script>
175
- </head>
176
- <body>
177
- <h1>hello world</h1>
178
- <main>
179
- <span>content</span>
180
- </main>
181
- <!-- test comment -->
182
- <form>
183
- <textarea>foo<x>bar</textarea>
184
- </form>
185
- </body>
186
- </html>
187
- EOF
188
- end
189
-
190
- end