nokogumbo 1.5.0 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. checksums.yaml +4 -4
  2. data/README.md +237 -26
  3. data/ext/nokogumbo/extconf.rb +121 -0
  4. data/ext/nokogumbo/nokogumbo.c +793 -0
  5. data/gumbo-parser/src/ascii.c +75 -0
  6. data/gumbo-parser/src/ascii.h +115 -0
  7. data/gumbo-parser/src/attribute.c +26 -28
  8. data/gumbo-parser/src/attribute.h +3 -23
  9. data/gumbo-parser/src/char_ref.c +5972 -6816
  10. data/gumbo-parser/src/char_ref.h +14 -45
  11. data/gumbo-parser/src/error.c +510 -163
  12. data/gumbo-parser/src/error.h +70 -147
  13. data/gumbo-parser/src/foreign_attrs.c +104 -0
  14. data/gumbo-parser/src/gumbo.h +577 -305
  15. data/gumbo-parser/src/insertion_mode.h +4 -28
  16. data/gumbo-parser/src/macros.h +91 -0
  17. data/gumbo-parser/src/parser.c +2922 -2228
  18. data/gumbo-parser/src/parser.h +6 -22
  19. data/gumbo-parser/src/replacement.h +33 -0
  20. data/gumbo-parser/src/string_buffer.c +43 -50
  21. data/gumbo-parser/src/string_buffer.h +24 -40
  22. data/gumbo-parser/src/string_piece.c +39 -39
  23. data/gumbo-parser/src/svg_attrs.c +174 -0
  24. data/gumbo-parser/src/svg_tags.c +137 -0
  25. data/gumbo-parser/src/tag.c +186 -59
  26. data/gumbo-parser/src/tag_lookup.c +382 -0
  27. data/gumbo-parser/src/tag_lookup.h +13 -0
  28. data/gumbo-parser/src/token_buffer.c +79 -0
  29. data/gumbo-parser/src/token_buffer.h +71 -0
  30. data/gumbo-parser/src/token_type.h +1 -25
  31. data/gumbo-parser/src/tokenizer.c +2127 -1561
  32. data/gumbo-parser/src/tokenizer.h +41 -52
  33. data/gumbo-parser/src/tokenizer_states.h +281 -45
  34. data/gumbo-parser/src/utf8.c +98 -123
  35. data/gumbo-parser/src/utf8.h +84 -52
  36. data/gumbo-parser/src/util.c +48 -38
  37. data/gumbo-parser/src/util.h +10 -40
  38. data/gumbo-parser/src/vector.c +45 -57
  39. data/gumbo-parser/src/vector.h +17 -39
  40. data/lib/nokogumbo.rb +11 -173
  41. data/lib/nokogumbo/html5.rb +252 -0
  42. data/lib/nokogumbo/html5/document.rb +53 -0
  43. data/lib/nokogumbo/html5/document_fragment.rb +62 -0
  44. data/lib/nokogumbo/html5/node.rb +72 -0
  45. data/lib/nokogumbo/version.rb +3 -0
  46. metadata +43 -24
  47. data/ext/nokogumboc/extconf.rb +0 -60
  48. data/ext/nokogumboc/nokogumbo.c +0 -295
  49. data/gumbo-parser/src/char_ref.rl +0 -2554
  50. data/gumbo-parser/src/string_piece.h +0 -38
  51. data/gumbo-parser/src/tag.in +0 -150
  52. data/gumbo-parser/src/tag_enum.h +0 -153
  53. data/gumbo-parser/src/tag_gperf.h +0 -105
  54. data/gumbo-parser/src/tag_sizes.h +0 -4
  55. data/gumbo-parser/src/tag_strings.h +0 -153
  56. data/gumbo-parser/visualc/include/strings.h +0 -4
  57. data/test-nokogumbo.rb +0 -190
@@ -1,153 +0,0 @@
1
- // Generated via `gentags.py src/tag.in`.
2
- // Do not edit; edit src/tag.in instead.
3
- // clang-format off
4
- "html",
5
- "head",
6
- "title",
7
- "base",
8
- "link",
9
- "meta",
10
- "style",
11
- "script",
12
- "noscript",
13
- "template",
14
- "body",
15
- "article",
16
- "section",
17
- "nav",
18
- "aside",
19
- "h1",
20
- "h2",
21
- "h3",
22
- "h4",
23
- "h5",
24
- "h6",
25
- "hgroup",
26
- "header",
27
- "footer",
28
- "address",
29
- "p",
30
- "hr",
31
- "pre",
32
- "blockquote",
33
- "ol",
34
- "ul",
35
- "li",
36
- "dl",
37
- "dt",
38
- "dd",
39
- "figure",
40
- "figcaption",
41
- "main",
42
- "div",
43
- "a",
44
- "em",
45
- "strong",
46
- "small",
47
- "s",
48
- "cite",
49
- "q",
50
- "dfn",
51
- "abbr",
52
- "data",
53
- "time",
54
- "code",
55
- "var",
56
- "samp",
57
- "kbd",
58
- "sub",
59
- "sup",
60
- "i",
61
- "b",
62
- "u",
63
- "mark",
64
- "ruby",
65
- "rt",
66
- "rp",
67
- "bdi",
68
- "bdo",
69
- "span",
70
- "br",
71
- "wbr",
72
- "ins",
73
- "del",
74
- "image",
75
- "img",
76
- "iframe",
77
- "embed",
78
- "object",
79
- "param",
80
- "video",
81
- "audio",
82
- "source",
83
- "track",
84
- "canvas",
85
- "map",
86
- "area",
87
- "math",
88
- "mi",
89
- "mo",
90
- "mn",
91
- "ms",
92
- "mtext",
93
- "mglyph",
94
- "malignmark",
95
- "annotation-xml",
96
- "svg",
97
- "foreignobject",
98
- "desc",
99
- "table",
100
- "caption",
101
- "colgroup",
102
- "col",
103
- "tbody",
104
- "thead",
105
- "tfoot",
106
- "tr",
107
- "td",
108
- "th",
109
- "form",
110
- "fieldset",
111
- "legend",
112
- "label",
113
- "input",
114
- "button",
115
- "select",
116
- "datalist",
117
- "optgroup",
118
- "option",
119
- "textarea",
120
- "keygen",
121
- "output",
122
- "progress",
123
- "meter",
124
- "details",
125
- "summary",
126
- "menu",
127
- "menuitem",
128
- "applet",
129
- "acronym",
130
- "bgsound",
131
- "dir",
132
- "frame",
133
- "frameset",
134
- "noframes",
135
- "isindex",
136
- "listing",
137
- "xmp",
138
- "nextid",
139
- "noembed",
140
- "plaintext",
141
- "rb",
142
- "strike",
143
- "basefont",
144
- "big",
145
- "blink",
146
- "center",
147
- "font",
148
- "marquee",
149
- "multicol",
150
- "nobr",
151
- "spacer",
152
- "tt",
153
- "rtc",
@@ -1,4 +0,0 @@
1
- /*Dummy file to satisfy source file dependencies on Windows platform*/
2
- #define strcasecmp _stricmp
3
- #define strncasecmp _strnicmp
4
- #define inline __inline
@@ -1,190 +0,0 @@
1
- $:.unshift('lib')
2
- $:.unshift('ext/nokogumboc')
3
-
4
- gem 'minitest'
5
-
6
- require 'nokogumbo'
7
- require 'minitest/autorun'
8
-
9
- class TestNokogumbo < Minitest::Test
10
- def test_element_text
11
- doc = Nokogiri::HTML5(buffer)
12
- assert_equal "content", doc.at('span').text
13
- end
14
-
15
- def test_element_cdata_textarea
16
- doc = Nokogiri::HTML5(buffer)
17
- assert_equal "foo<x>bar", doc.at('textarea').text.strip
18
- end
19
-
20
- def test_element_cdata_script
21
- doc = Nokogiri::HTML5.fragment(buffer)
22
- assert_equal true, doc.document.html?
23
- assert_equal "<script> if (a < b) alert(1) </script>", doc.at('script').to_s
24
- end
25
-
26
- def test_attr_value
27
- doc = Nokogiri::HTML5(buffer)
28
- assert_equal "utf-8", doc.at('meta')['charset']
29
- end
30
-
31
- def test_comment
32
- doc = Nokogiri::HTML5(buffer)
33
- assert_equal " test comment ", doc.xpath('//comment()').text
34
- end
35
-
36
- def test_unknown_element
37
- doc = Nokogiri::HTML5(buffer)
38
- assert_equal "main", doc.at('main').name
39
- end
40
-
41
- def test_IO
42
- require 'stringio'
43
- doc = Nokogiri::HTML5(StringIO.new(buffer))
44
- assert_equal 'textarea', doc.at('form').element_children.first.name
45
- end
46
-
47
- def test_nil
48
- doc = Nokogiri::HTML5(nil)
49
- assert_equal 1, doc.search('body').count
50
- end
51
-
52
- if ''.respond_to? 'encoding'
53
- def test_macroman_encoding
54
- mac="<span>\xCA</span>".force_encoding('macroman')
55
- doc = Nokogiri::HTML5(mac)
56
- assert_equal '<span>&#xA0;</span>', doc.at('span').to_xml
57
- end
58
-
59
- def test_iso8859_encoding
60
- iso8859="<span>Se\xF1or</span>".force_encoding(Encoding::ASCII_8BIT)
61
- doc = Nokogiri::HTML5(iso8859)
62
- assert_equal '<span>Se&#xF1;or</span>', doc.at('span').to_xml
63
- end
64
-
65
- def test_charset_encoding
66
- utf8="<meta charset='utf-8'><span>Se\xC3\xB1or</span>".
67
- force_encoding(Encoding::ASCII_8BIT)
68
- doc = Nokogiri::HTML5(utf8)
69
- assert_equal '<span>Se&#xF1;or</span>', doc.at('span').to_xml
70
- end
71
-
72
- def test_bogus_encoding
73
- bogus="<meta charset='bogus'><span>Se\xF1or</span>".
74
- force_encoding(Encoding::ASCII_8BIT)
75
- doc = Nokogiri::HTML5(bogus)
76
- assert_equal '<span>Se&#xF1;or</span>', doc.at('span').to_xml
77
- end
78
- end
79
-
80
- def test_html5_doctype
81
- doc = Nokogiri::HTML5.parse("<!DOCTYPE html><html></html>")
82
- assert_match /<!DOCTYPE html>/, doc.to_html
83
- end
84
-
85
- def test_fragment_head
86
- doc = Nokogiri::HTML5.fragment(buffer[/<head>(.*?)<\/head>/m, 1])
87
- assert_equal "hello world", doc.xpath('title').text
88
- assert_equal "utf-8", doc.xpath('meta').first['charset']
89
- end
90
-
91
- def test_fragment_body
92
- doc = Nokogiri::HTML5.fragment(buffer[/<body>(.*?)<\/body>/m, 1])
93
- assert_equal '<span>content</span>', doc.xpath('main/span').to_xml
94
- assert_equal " test comment ", doc.xpath('comment()').text
95
- end
96
-
97
- def test_xlink_attribute
98
- source = <<-EOF.gsub(/^ {6}/, '')
99
- <svg xmlns="http://www.w3.org/2000/svg">
100
- <a xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#s1"/>
101
- </svg>
102
- EOF
103
- doc = Nokogiri::HTML5.fragment(source)
104
- a = doc.at('a')
105
- assert_equal ["xlink:href", "xmlns:xlink"], a.attributes.keys.sort
106
- end
107
-
108
- def test_template
109
- source = <<-EOF.gsub(/^ {6}/, '')
110
- <template id="productrow">
111
- <tr>
112
- <td class="record"></td>
113
- <td></td>
114
- </tr>
115
- </template>
116
- EOF
117
- doc = Nokogiri::HTML5.fragment(source)
118
- template = doc.at('template')
119
- assert_equal "productrow", template['id']
120
- assert_equal "record", template.at('td')['class']
121
- end
122
-
123
- def test_root_comments
124
- doc = Nokogiri::HTML5("<!DOCTYPE html><!-- start --><html></html><!-- -->")
125
- assert_equal ["html", "comment", "html", "comment"], doc.children.map(&:name)
126
- end
127
-
128
- def test_parse_errors
129
- doc = Nokogiri::HTML5("<!DOCTYPE html><html><!-- -- --></a>", max_parse_errors: 10)
130
- assert_equal doc.errors.length, 2
131
- doc = Nokogiri::HTML5("<!DOCTYPE html><html>", max_parse_errors: 10)
132
- assert_empty doc.errors
133
- end
134
-
135
- def test_max_parse_errors
136
- # This document contains 2 parse errors, but we force limit to 1.
137
- doc = Nokogiri::HTML5("<!DOCTYPE html><html><!-- -- --></a>", max_parse_errors: 1)
138
- assert_equal 1, doc.errors.length
139
- doc = Nokogiri::HTML5("<!DOCTYPE html><html>", max_parse_errors: 1)
140
- assert_empty doc.errors
141
- end
142
-
143
- def test_default_max_parse_errors
144
- # This document contains 200 parse errors, but default limit is 0.
145
- doc = Nokogiri::HTML5("<!DOCTYPE html><html>" + "</p>" * 200)
146
- assert_equal 0, doc.errors.length
147
- end
148
-
149
- def test_parse_fragment_errors
150
- doc = Nokogiri::HTML5.fragment("<\r\n", max_parse_errors: 10)
151
- refute_empty doc.errors
152
- end
153
-
154
- def test_fragment_max_parse_errors
155
- # This fragment contains 3 parse errors, but we force limit to 1.
156
- doc = Nokogiri::HTML5.fragment("<!-- -- --></a>", max_parse_errors: 1)
157
- assert_equal 1, doc.errors.length
158
- end
159
-
160
- def test_fragment_default_max_parse_errors
161
- # This fragment contains 201 parse errors, but default limit is 0.
162
- doc = Nokogiri::HTML5.fragment("</p>" * 200)
163
- assert_equal 0, doc.errors.length
164
- end
165
-
166
- private
167
-
168
- def buffer
169
- <<-EOF.gsub(/^ /, '')
170
- <html>
171
- <head>
172
- <meta charset="utf-8"/>
173
- <title>hello world</title>
174
- <script> if (a < b) alert(1) </script>
175
- </head>
176
- <body>
177
- <h1>hello world</h1>
178
- <main>
179
- <span>content</span>
180
- </main>
181
- <!-- test comment -->
182
- <form>
183
- <textarea>foo<x>bar</textarea>
184
- </form>
185
- </body>
186
- </html>
187
- EOF
188
- end
189
-
190
- end