nokogumbo 1.5.0 → 2.0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +237 -26
  3. data/ext/nokogumbo/extconf.rb +121 -0
  4. data/ext/nokogumbo/nokogumbo.c +793 -0
  5. data/gumbo-parser/src/ascii.c +75 -0
  6. data/gumbo-parser/src/ascii.h +115 -0
  7. data/gumbo-parser/src/attribute.c +26 -28
  8. data/gumbo-parser/src/attribute.h +3 -23
  9. data/gumbo-parser/src/char_ref.c +5972 -6816
  10. data/gumbo-parser/src/char_ref.h +14 -45
  11. data/gumbo-parser/src/error.c +510 -163
  12. data/gumbo-parser/src/error.h +70 -147
  13. data/gumbo-parser/src/foreign_attrs.c +104 -0
  14. data/gumbo-parser/src/gumbo.h +577 -305
  15. data/gumbo-parser/src/insertion_mode.h +4 -28
  16. data/gumbo-parser/src/macros.h +91 -0
  17. data/gumbo-parser/src/parser.c +2922 -2228
  18. data/gumbo-parser/src/parser.h +6 -22
  19. data/gumbo-parser/src/replacement.h +33 -0
  20. data/gumbo-parser/src/string_buffer.c +43 -50
  21. data/gumbo-parser/src/string_buffer.h +24 -40
  22. data/gumbo-parser/src/string_piece.c +39 -39
  23. data/gumbo-parser/src/svg_attrs.c +174 -0
  24. data/gumbo-parser/src/svg_tags.c +137 -0
  25. data/gumbo-parser/src/tag.c +186 -59
  26. data/gumbo-parser/src/tag_lookup.c +382 -0
  27. data/gumbo-parser/src/tag_lookup.h +13 -0
  28. data/gumbo-parser/src/token_buffer.c +79 -0
  29. data/gumbo-parser/src/token_buffer.h +71 -0
  30. data/gumbo-parser/src/token_type.h +1 -25
  31. data/gumbo-parser/src/tokenizer.c +2127 -1561
  32. data/gumbo-parser/src/tokenizer.h +41 -52
  33. data/gumbo-parser/src/tokenizer_states.h +281 -45
  34. data/gumbo-parser/src/utf8.c +98 -123
  35. data/gumbo-parser/src/utf8.h +84 -52
  36. data/gumbo-parser/src/util.c +48 -38
  37. data/gumbo-parser/src/util.h +10 -40
  38. data/gumbo-parser/src/vector.c +45 -57
  39. data/gumbo-parser/src/vector.h +17 -39
  40. data/lib/nokogumbo.rb +11 -173
  41. data/lib/nokogumbo/html5.rb +252 -0
  42. data/lib/nokogumbo/html5/document.rb +53 -0
  43. data/lib/nokogumbo/html5/document_fragment.rb +62 -0
  44. data/lib/nokogumbo/html5/node.rb +72 -0
  45. data/lib/nokogumbo/version.rb +3 -0
  46. metadata +43 -24
  47. data/ext/nokogumboc/extconf.rb +0 -60
  48. data/ext/nokogumboc/nokogumbo.c +0 -295
  49. data/gumbo-parser/src/char_ref.rl +0 -2554
  50. data/gumbo-parser/src/string_piece.h +0 -38
  51. data/gumbo-parser/src/tag.in +0 -150
  52. data/gumbo-parser/src/tag_enum.h +0 -153
  53. data/gumbo-parser/src/tag_gperf.h +0 -105
  54. data/gumbo-parser/src/tag_sizes.h +0 -4
  55. data/gumbo-parser/src/tag_strings.h +0 -153
  56. data/gumbo-parser/visualc/include/strings.h +0 -4
  57. data/test-nokogumbo.rb +0 -190
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 96fa61565f78d5491e0b6d5b505cf936524745eb848b8b6584fc15e20c7ae35b
4
- data.tar.gz: e5416f71bbe90323f04b8aad4dc48b28947e43a9eb46f446f8ca1444f519a07b
3
+ metadata.gz: e4694cf3eefbeee2a55cd4bb355b7ec6159c64eac4454dff02b1fbf7e5e8375a
4
+ data.tar.gz: 67832a7c26148f59755360758fcc0b0c1969949bf1e5a1b27f5cabe4b9e8b40b
5
5
  SHA512:
6
- metadata.gz: 676bf3585d38cd4ad5c72b8b3afd4952e248c747683ae1072dd43f6ce1ccd279177e4d0c75a9821ed76d32806333128152231349d8d113ae5d81279580b13004
7
- data.tar.gz: 3459078d96977399e75551c4a3ee5623091f48569984b771e540ec111125f5af91e39a8d78cbd3ce9280326b1b9395dc4a0b0d7f0a72294876682cb9fe35e3d9
6
+ metadata.gz: 3a415817caaf0c3c03037664bda8ed8aa17cc14419e75672dcaa2e2a7dd6d9a20e6ab59095a2295f90da5e45de2c3d72f9a25557533836d55dc67966fe8c7a14
7
+ data.tar.gz: 8dc8f9f2d55936a63097301dc5eb6fb54ed1e4c274b03cdcd6f45e2b4ac2cdc911a54e8e5838ce468820ebade731a62f7cfc167817528fd0adb415087ce924b6
data/README.md CHANGED
@@ -1,23 +1,21 @@
1
- Nokogumbo - a Nokogiri interface to the Gumbo HTML5 parser.
2
- ===========
1
+ # Nokogumbo - a Nokogiri interface to the Gumbo HTML5 parser.
3
2
 
4
- Nokogumbo provides the ability for a Ruby program to invoke the
5
- [Gumbo HTML5 parser](https://github.com/google/gumbo-parser#readme)
3
+ Nokogumbo provides the ability for a Ruby program to invoke
4
+ [our version of the Gumbo HTML5 parser](https://github.com/rubys/nokogumbo/tree/master/gumbo-parser/src)
6
5
  and to access the result as a
7
6
  [Nokogiri::HTML::Document](http://rdoc.info/github/sparklemotion/nokogiri/Nokogiri/HTML/Document).
8
7
 
9
- [![Build Status](https://travis-ci.org/rubys/nokogumbo.svg)](https://travis-ci.org/rubys/nokogumbo)
8
+ [![Travis-CI Build Status](https://travis-ci.org/rubys/nokogumbo.svg)](https://travis-ci.org/rubys/nokogumbo)
9
+ [![Appveyor Build Status](https://ci.appveyor.com/api/projects/status/github/rubys/nokogumbo)](https://ci.appveyor.com/project/rubys/nokogumbo/branch/master)
10
10
 
11
- Usage
12
- -----
11
+ ## Usage
13
12
 
14
13
  ```ruby
15
14
  require 'nokogumbo'
16
- doc = Nokogiri::HTML5(string)
15
+ doc = Nokogiri.HTML5(string)
17
16
  ```
18
17
 
19
- An experimental _fragment_ method is also provided. While not HTML5
20
- compliant, it may be useful:
18
+ To parse an HTML fragment, a `fragment` method is provided.
21
19
 
22
20
  ```ruby
23
21
  require 'nokogumbo'
@@ -32,21 +30,207 @@ require 'nokogumbo'
32
30
  doc = Nokogiri::HTML5.get(uri)
33
31
  ```
34
32
 
35
- Example
36
- -----
33
+ ## Parsing options
34
+ The document and fragment parsing methods,
35
+ - `Nokogiri.HTML5(html, url = nil, encoding = nil, options = {})`
36
+ - `Nokogiri::HTML5.parse(html, url = nil, encoding = nil, options = {})`
37
+ - `Nokogiri::HTML5::Document.parse(html, url = nil, encoding = nil, options = {})`
38
+ - `Nokogiri::HTML5.fragment(html, encoding = nil, options = {})`
39
+ - `Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, options = {})`
40
+ support options that are different from Nokogiri's.
41
+
42
+ The currently supported options are `:max_errors`, `:max_tree_depth`, and `:max_attributes`,
43
+ described below.
44
+
45
+ ### Error reporting
46
+ Nokogumbo contains an experimental parse error reporting facility. By default,
47
+ no parse errors are reported but this can be configured by passing the
48
+ `:max_errors` option to `::parse` or `::fragment`.
49
+
37
50
  ```ruby
38
51
  require 'nokogumbo'
39
- puts Nokogiri::HTML5.get('http://nokogiri.org').search('ol li')[2].text
52
+ doc = Nokogiri::HTML5.parse('<span/>Hi there!</span foo=bar />', max_errors: 10)
53
+ doc.errors.each do |err|
54
+ puts(err)
55
+ end
40
56
  ```
41
57
 
42
- Use `.to_html` instead of `.to_s` when parsing and serializing multiple times
58
+ This prints the following.
43
59
  ```
60
+ 1:1: ERROR: Expected a doctype token
61
+ <span/>Hi there!</span foo=bar />
62
+ ^
63
+ 1:1: ERROR: Start tag of nonvoid HTML element ends with '/>', use '>'.
64
+ <span/>Hi there!</span foo=bar />
65
+ ^
66
+ 1:17: ERROR: End tag ends with '/>', use '>'.
67
+ <span/>Hi there!</span foo=bar />
68
+ ^
69
+ 1:17: ERROR: End tag contains attributes.
70
+ <span/>Hi there!</span foo=bar />
71
+ ^
72
+ ```
73
+
74
+ Using `max_errors: -1` results in an unlimited number of errors being
75
+ returned.
76
+
77
+ The errors returned by `#errors` are instances of
78
+ [`Nokogiri::XML::SyntaxError`](https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/XML/SyntaxError).
79
+
80
+ The [HTML
81
+ standard](https://html.spec.whatwg.org/multipage/parsing.html#parse-errors)
82
+ defines a number of standard parse error codes. These error codes only cover
83
+ the "tokenization" stage of parsing HTML. The parse errors in the
84
+ "tree construction" stage do not have standardized error codes (yet).
85
+
86
+ As a convenience to Nokogumbo users, the defined error codes are available
87
+ via the
88
+ [`Nokogiri::XML::SyntaxError#str1`](https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/XML/SyntaxError#str1-instance_method)
89
+ method.
90
+
91
+ ```ruby
44
92
  require 'nokogumbo'
45
- Nokogiri::HTML5.parse(Nokogiri::HTML5.parse('<div></div> a').to_html).to_html
93
+ doc = Nokogiri::HTML5.parse('<span/>Hi there!</span foo=bar />', max_errors: 10)
94
+ doc.errors.each do |err|
95
+ puts("#{err.line}:#{err.column}: #{err.str1}")
96
+ end
97
+ ```
98
+
99
+ This prints the following.
100
+ ```
101
+ 1:1: generic-parser
102
+ 1:1: non-void-html-element-start-tag-with-trailing-solidus
103
+ 1:17: end-tag-with-trailing-solidus
104
+ 1:17: end-tag-with-attributes
105
+ ```
106
+
107
+ Note that the first error is `generic-parser` because it's an error from the
108
+ tree construction stage and doesn't have a standardized error code.
109
+
110
+ For the purposes of semantic versioning, the error messages, error locations,
111
+ and error codes are not part of Nokogumbo's public API. That is, these are
112
+ subject to change without Nokogumbo's major version number changing. These may
113
+ be stabilized in the future.
114
+
115
+ ### Maximum tree depth
116
+ The maximum depth of the DOM tree parsed by the various parsing methods is
117
+ configurable by the `:max_tree_depth` option. If the depth of the tree would
118
+ exceed this limit, then an
119
+ [ArgumentError](https://ruby-doc.org/core-2.5.0/ArgumentError.html) is raised.
120
+
121
+ This limit (which defaults to `Nokogumbo::DEFAULT_MAX_TREE_DEPTH = 400`) can
122
+ be removed by giving the option `max_tree_depth: -1`.
123
+
124
+ ``` ruby
125
+ html = '<!DOCTYPE html>' + '<div>' * 1000
126
+ doc = Nokogiri.HTML5(html)
127
+ # raises ArgumentError: Document tree depth limit exceeded
128
+ doc = Nokogiri.HTML5(html, max_tree_depth: -1)
46
129
  ```
47
130
 
48
- Notes
49
- -----
131
+ ### Attribute limit per element
132
+ The maximum number of attributes per DOM element is configurable by the
133
+ `:max_attributes` option. If a given element would exceed this limit, then an
134
+ [ArgumentError](https://ruby-doc.org/core-2.5.0/ArgumentError.html) is raised.
135
+
136
+ This limit (which defaults to `Nokogumbo::DEFAULT_MAX_ATTRIBUTES = 400`) can
137
+ be removed by giving the option `max_attributes: -1`.
138
+
139
+ ``` ruby
140
+ html = '<!DOCTYPE html><div ' + (1..1000).map { |x| "attr-#{x}" }.join(' ') + '>'
141
+ # "<!DOCTYPE html><div attr-1 attr-2 attr-3 ... attr-1000>"
142
+ doc = Nokogiri.HTML5(html)
143
+ # raises ArgumentError: Attributes per element limit exceeded
144
+ doc = Nokogiri.HTML5(html, max_attributes: -1)
145
+ ```
146
+
147
+ ## HTML Serialization
148
+
149
+ After parsing HTML, it may be serialized using any of the Nokogiri
150
+ [serialization
151
+ methods](https://www.rubydoc.info/gems/nokogiri/Nokogiri/XML/Node). In
152
+ particular, `#serialize`, `#to_html`, and `#to_s` will serialize a given node
153
+ and its children. (This is the equivalent of JavaScript's
154
+ `Element.outerHTML`.) Similarly, `#inner_html` will serialize the children of
155
+ a given node. (This is the equivalent of JavaScript's `Element.innerHTML`.)
156
+
157
+ ``` ruby
158
+ doc = Nokogiri::HTML5("<!DOCTYPE html><span>Hello world!</span>")
159
+ puts doc.serialize
160
+ # Prints: <!DOCTYPE html><html><head></head><body><span>Hello world!</span></body></html>
161
+ ```
162
+
163
+ Due to quirks in how HTML is parsed and serialized, it's possible for a DOM
164
+ tree to be serialized and then re-parsed, resulting in a different DOM.
165
+ Mostly, this happens with DOMs produced from invalid HTML. Unfortunately, even
166
+ valid HTML may not survive serialization and re-parsing.
167
+
168
+ In particular, a newline at the start of `pre`, `listing`, and `textarea`
169
+ elements is ignored by the parser.
170
+
171
+ ``` ruby
172
+ doc = Nokogiri::HTML5(<<-EOF)
173
+ <!DOCTYPE html>
174
+ <pre>
175
+ Content</pre>
176
+ EOF
177
+ puts doc.at('/html/body/pre').serialize
178
+ # Prints: <pre>Content</pre>
179
+ ```
180
+
181
+ In this case, the original HTML is semantically equivalent to the serialized
182
+ version. If the `pre`, `listing`, or `textarea` content starts with two
183
+ newlines, the first newline will be stripped on the first parse and the second
184
+ newline will be stripped on the second, leading to semantically different
185
+ DOMs. Passing the parameter `preserve_newline: true` will cause two or more
186
+ newlines to be preserved. (A single leading newline will still be removed.)
187
+
188
+ ``` ruby
189
+ doc = Nokogiri::HTML5(<<-EOF)
190
+ <!DOCTYPE html>
191
+ <listing>
192
+
193
+ Content</listing>
194
+ EOF
195
+ puts doc.at('/html/body/listing').serialize(preserve_newline: true)
196
+ # Prints: <listing>
197
+ #
198
+ # Content</listing>
199
+ ```
200
+
201
+ ## Encodings
202
+ Nokogumbo always parses HTML using
203
+ [UTF-8](https://en.wikipedia.org/wiki/UTF-8); however, the encoding of the
204
+ input can be explicitly selected via the optional `encoding` parameter. This
205
+ is most useful when the input comes not from a string but from an IO object.
206
+
207
+ When serializing a document or node, the encoding of the output string can be
208
+ specified via the `:encoding` option. Characters that cannot be encoded in
209
+ the selected encoding will be encoded as [HTML numeric
210
+ entities](https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references).
211
+
212
+ ``` ruby
213
+ frag = Nokogiri::HTML5.fragment('<span>아는 길도 물어가라</span>')
214
+ html = frag.serialize(encoding: 'US-ASCII')
215
+ puts html
216
+ # Prints: <span>&#xc544;&#xb294; &#xae38;&#xb3c4; &#xbb3c;&#xc5b4;&#xac00;&#xb77c;</span>
217
+ frag = Nokogiri::HTML5.fragment(html)
218
+ puts frag.serialize
219
+ # Prints: <span>아는 길도 물어가라</span>
220
+ ```
221
+
222
+ (There's a [bug](https://bugs.ruby-lang.org/issues/15033) in all current
223
+ versions of Ruby that can cause the entity encoding to fail. Of the mandated
224
+ supported encodings for HTML, the only encoding I'm aware of that has this bug
225
+ is `'ISO-2022-JP'`. I recommend avoiding this encoding.)
226
+
227
+ ## Examples
228
+ ```ruby
229
+ require 'nokogumbo'
230
+ puts Nokogiri::HTML5.get('http://nokogiri.org').search('ol li')[2].text
231
+ ```
232
+
233
+ ## Notes
50
234
 
51
235
  * The `Nokogiri::HTML5.fragment` function takes a string and parses it
52
236
  as an HTML5 document. The `<html>`, `<head>`, and `<body>` elements are
@@ -74,20 +258,47 @@ rules defined in the HTML5 specification for doing so.
74
258
  * Instead of returning `unknown` as the element name for unknown tags, the
75
259
  original tag name is returned verbatim.
76
260
 
77
- * If the Gumbo HTML5 parser is not already installed, the source for the
78
- parser will be downloaded and compiled into the Gem itself.
261
+ # Flavors of Nokogumbo
262
+ Nokogumbo uses libxml2, the XML library underlying Nokogiri, to speed up
263
+ parsing. If the libxml2 headers are not available, then Nokogumbo resorts to
264
+ using Nokogiri's Ruby API to construct the DOM tree.
265
+
266
+ Nokogiri can be configured to either use the system library version of libxml2
267
+ or use a bundled version. By default (as of Nokogiri version 1.8.4), Nokogiri
268
+ will use a bundled version.
269
+
270
+ To prevent differences between versions of libxml2, Nokogumbo will only use
271
+ libxml2 if the build process can find the exact same version used by Nokogiri.
272
+ This leads to three possibilities:
273
+
274
+ 1. Nokogiri is compiled with the bundled libxml2. In this case, Nokogumbo will
275
+ (by default) use the same version of libxml2.
276
+ 2. Nokogiri is compiled with the system libxml2. In this case, if the libxml2
277
+ headers are available, then Nokogumbo will (by default) use the system
278
+ version and headers.
279
+ 3. Nokogiri is compiled with the system libxml2 but its headers aren't
280
+ available at build time for Nokogumbo. In this case, Nokogumbo will use the
281
+ slower Ruby API.
282
+
283
+ Using libxml2 can be required by passing `-- --with-libxml2` to `bundle exec
284
+ rake` or to `gem install`. Using libxml2 can be prohibited by instead passing
285
+ `-- --without-libxml2`.
286
+
287
+ Functionally, the only difference between using libxml2 or not is in the
288
+ behavior of `Nokogiri::XML::Node#line`. If it is used, then `#line` will
289
+ return the line number of the corresponding node. Otherwise, it will return 0.
79
290
 
80
- Installation
81
- ============
291
+ # Installation
82
292
 
83
- git clone --recursive https://github.com/rubys/nokogumbo.git
293
+ git clone https://github.com/rubys/nokogumbo.git
84
294
  cd nokogumbo
85
295
  bundle install
86
296
  rake gem
87
297
  gem install pkg/nokogumbo*.gem
88
298
 
89
- Related efforts
90
- ============
299
+ # Related efforts
91
300
 
92
- * [ruby-gumbo](https://github.com/nevir/ruby-gumbo#readme) - a ruby binding
93
- for the Gumbo HTML5 parser.
301
+ * [ruby-gumbo](https://github.com/nevir/ruby-gumbo#readme) -- a ruby binding
302
+ for the Gumbo HTML5 parser.
303
+ * [lua-gumbo](https://gitlab.com/craigbarnes/lua-gumbo) -- a lua binding for
304
+ the Gumbo HTML5 parser.
@@ -0,0 +1,121 @@
1
+ require 'rubygems'
2
+ require 'fileutils'
3
+ require 'mkmf'
4
+ require 'nokogiri'
5
+
6
+ $CFLAGS += " -std=c99"
7
+ $LDFLAGS.gsub!('-Wl,--no-undefined', '')
8
+ $warnflags = CONFIG['warnflags'] = '-Wall'
9
+
10
+ NG_SPEC = Gem::Specification.find_by_name('nokogiri', "= #{Nokogiri::VERSION}")
11
+
12
+ def download_headers
13
+ begin
14
+ require 'yaml'
15
+
16
+ dependencies = YAML.load_file(File.join(NG_SPEC.gem_dir, 'dependencies.yml'))
17
+ version = dependencies['libxml2']['version']
18
+ host = RbConfig::CONFIG["host_alias"].empty? ? RbConfig::CONFIG["host"] : RbConfig::CONFIG["host_alias"]
19
+ path = File.join('ports', host, 'libxml2', version, 'include/libxml2')
20
+ return path if File.directory?(path)
21
+
22
+ # Make sure we're using the same version Nokogiri uses
23
+ dep_index = NG_SPEC.dependencies.index { |dep| dep.name == 'mini_portile2' and dep.type == :runtime }
24
+ return nil if dep_index.nil?
25
+ requirement = NG_SPEC.dependencies[dep_index].requirement.to_s
26
+
27
+ gem 'mini_portile2', requirement
28
+ require 'mini_portile2'
29
+ p = MiniPortile::new('libxml2', version).tap do |r|
30
+ r.host = RbConfig::CONFIG["host_alias"].empty? ? RbConfig::CONFIG["host"] : RbConfig::CONFIG["host_alias"]
31
+ r.files = [{
32
+ url: "http://xmlsoft.org/sources/libxml2-#{r.version}.tar.gz",
33
+ sha256: dependencies['libxml2']['sha256']
34
+ }]
35
+ r.configure_options += [
36
+ "--without-python",
37
+ "--without-readline",
38
+ "--with-c14n",
39
+ "--with-debug",
40
+ "--with-threads"
41
+ ]
42
+ end
43
+ p.download unless p.downloaded?
44
+ p.extract
45
+ p.configure unless p.configured?
46
+ system('make', '-C', "tmp/#{p.host}/ports/libxml2/#{version}/libxml2-#{version}/include/libxml", 'install-xmlincHEADERS')
47
+ path
48
+ rescue
49
+ puts 'failed to download/install headers'
50
+ nil
51
+ end
52
+ end
53
+
54
+ required = arg_config('--with-libxml2')
55
+ prohibited = arg_config('--without-libxml2')
56
+ if required and prohibited
57
+ abort "cannot use both --with-libxml2 and --without-libxml2"
58
+ end
59
+
60
+ have_libxml2 = false
61
+ have_ng = false
62
+
63
+ if !prohibited
64
+ if Nokogiri::VERSION_INFO.include?('libxml') and
65
+ Nokogiri::VERSION_INFO['libxml']['source'] == 'packaged'
66
+ # Nokogiri has libxml2 built in. Find the headers.
67
+ libxml2_path = File.join(Nokogiri::VERSION_INFO['libxml']['libxml2_path'],
68
+ 'include/libxml2')
69
+ if find_header('libxml/tree.h', libxml2_path)
70
+ have_libxml2 = true
71
+ else
72
+ # Unfortunately, some versions of Nokogiri delete these files.
73
+ # https://github.com/sparklemotion/nokogiri/pull/1788
74
+ # Try to download them
75
+ libxml2_path = download_headers
76
+ unless libxml2_path.nil?
77
+ have_libxml2 = find_header('libxml/tree.h', libxml2_path)
78
+ end
79
+ end
80
+ else
81
+ # Nokogiri is compiled with system headers.
82
+ # Hack to work around broken mkmf on macOS
83
+ # (https://bugs.ruby-lang.org/issues/14992 fixed now)
84
+ if RbConfig::MAKEFILE_CONFIG['LIBPATHENV'] == 'DYLD_LIBRARY_PATH'
85
+ RbConfig::MAKEFILE_CONFIG['LIBPATHENV'] = 'DYLD_FALLBACK_LIBRARY_PATH'
86
+ end
87
+
88
+ pkg_config('libxml-2.0')
89
+ have_libxml2 = have_library('xml2', 'xmlNewDoc')
90
+ end
91
+ if required and !have_libxml2
92
+ abort "libxml2 required but could not be located"
93
+ end
94
+
95
+ if have_libxml2
96
+ # Find nokogiri.h
97
+ have_ng = find_header('nokogiri.h', File.join(NG_SPEC.gem_dir, 'ext/nokogiri'))
98
+ end
99
+ end
100
+
101
+ if have_libxml2 and have_ng
102
+ $CFLAGS += " -DNGLIB=1"
103
+ end
104
+
105
+ # Collect the gumbo-parser sources and headers (no symlinks; found via $VPATH/$INCFLAGS).
106
+ ext_dir = File.dirname(__FILE__)
107
+ gumbo_src = File.join(ext_dir, 'gumbo_src')
108
+
109
+ Dir.chdir(ext_dir) do
110
+ $srcs = Dir['*.c', '../../gumbo-parser/src/*.c']
111
+ $hdrs = Dir['*.h', '../../gumbo-parser/src/*.h']
112
+ end
113
+ $INCFLAGS << ' -I$(srcdir)/../../gumbo-parser/src'
114
+ $VPATH << '$(srcdir)/../../gumbo-parser/src'
115
+
116
+ create_makefile('nokogumbo/nokogumbo') do |conf|
117
+ conf.map! do |chunk|
118
+ chunk.gsub(/^HDRS = .*$/, "HDRS = #{$hdrs.map { |h| File.join('$(srcdir)', h)}.join(' ')}")
119
+ end
120
+ end
121
+ # vim: set sw=2 sts=2 ts=8 et:
@@ -0,0 +1,793 @@
1
+ //
2
+ // nokogumbo.c defines the following:
3
+ //
4
+ // class Nokogumbo
5
+ // def parse(utf8_string) # returns Nokogiri::HTML5::Document
6
+ // end
7
+ //
8
+ // Processing starts by calling gumbo_parse_with_options. The resulting
9
+ // document tree is then walked:
10
+ //
11
+ // * if Nokogiri and libxml2 headers are available at compile time,
12
+ // (if NGLIB) then a parallel libxml2 tree is constructed, and the
13
+ // final document is then wrapped using Nokogiri_wrap_xml_document.
14
+ // This approach reduces memory and CPU requirements as Ruby objects
15
+ // are only built when necessary.
16
+ //
17
+ // * if the necessary headers are not available at compile time, Nokogiri
18
+ // methods are called instead, producing the equivalent functionality.
19
+ //
20
+
21
+ #include <assert.h>
22
+ #include <ruby.h>
23
+ #include <ruby/version.h>
24
+
25
+ #include "gumbo.h"
26
+
27
+ // class constants
28
+ static VALUE Document;
29
+
30
+ // Interned symbols
31
+ static ID internal_subset;
32
+ static ID parent;
33
+
34
+ /* Backwards compatibility to Ruby 2.1.0 */
35
+ #if RUBY_API_VERSION_CODE < 20200
36
+ #define ONIG_ESCAPE_UCHAR_COLLISION 1
37
+ #include <ruby/encoding.h>
38
+
39
+ static VALUE rb_utf8_str_new(const char *str, long length) {
40
+ return rb_enc_str_new(str, length, rb_utf8_encoding());
41
+ }
42
+
43
+ static VALUE rb_utf8_str_new_cstr(const char *str) {
44
+ return rb_enc_str_new_cstr(str, rb_utf8_encoding());
45
+ }
46
+
47
+ static VALUE rb_utf8_str_new_static(const char *str, long length) {
48
+ return rb_enc_str_new(str, length, rb_utf8_encoding());
49
+ }
50
+ #endif
51
+
52
+ #if NGLIB
53
+ #include <nokogiri.h>
54
+ #include <libxml/tree.h>
55
+ #include <libxml/HTMLtree.h>
56
+
57
+ #define NIL NULL
58
+ #else
59
+ #define NIL Qnil
60
+
61
+ // These are defined by nokogiri.h
62
+ static VALUE cNokogiriXmlSyntaxError;
63
+ static VALUE cNokogiriXmlElement;
64
+ static VALUE cNokogiriXmlText;
65
+ static VALUE cNokogiriXmlCData;
66
+ static VALUE cNokogiriXmlComment;
67
+
68
+ // Interned symbols.
69
+ static ID new;
70
+ static ID node_name_;
71
+
72
+ // Map libxml2 types to Ruby VALUE.
73
+ typedef VALUE xmlNodePtr;
74
+ typedef VALUE xmlDocPtr;
75
+ typedef VALUE xmlNsPtr;
76
+ typedef VALUE xmlDtdPtr;
77
+ typedef char xmlChar;
78
+ #define BAD_CAST
79
+
80
+ // Redefine libxml2 API as Ruby function calls.
81
+ static xmlNodePtr xmlNewDocNode(xmlDocPtr doc, xmlNsPtr ns, const xmlChar *name, const xmlChar *content) {
82
+ assert(ns == NIL && content == NULL);
83
+ return rb_funcall(cNokogiriXmlElement, new, 2, rb_utf8_str_new_cstr(name), doc);
84
+ }
85
+
86
+ static xmlNodePtr xmlNewDocText(xmlDocPtr doc, const xmlChar *content) {
87
+ VALUE str = rb_utf8_str_new_cstr(content);
88
+ return rb_funcall(cNokogiriXmlText, new, 2, str, doc);
89
+ }
90
+
91
+ static xmlNodePtr xmlNewCDataBlock(xmlDocPtr doc, const xmlChar *content, int len) {
92
+ VALUE str = rb_utf8_str_new(content, len);
93
+ // CDATA.new takes arguments in the opposite order from Text.new.
94
+ return rb_funcall(cNokogiriXmlCData, new, 2, doc, str);
95
+ }
96
+
97
+ static xmlNodePtr xmlNewDocComment(xmlDocPtr doc, const xmlChar *content) {
98
+ VALUE str = rb_utf8_str_new_cstr(content);
99
+ return rb_funcall(cNokogiriXmlComment, new, 2, doc, str);
100
+ }
101
+
102
+ static xmlNodePtr xmlAddChild(xmlNodePtr parent, xmlNodePtr cur) {
103
+ ID add_child;
104
+ CONST_ID(add_child, "add_child");
105
+ return rb_funcall(parent, add_child, 1, cur);
106
+ }
107
+
108
+ static void xmlSetNs(xmlNodePtr node, xmlNsPtr ns) {
109
+ ID namespace_;
110
+ CONST_ID(namespace_, "namespace=");
111
+ rb_funcall(node, namespace_, 1, ns);
112
+ }
113
+
114
+ static void xmlFreeDoc(xmlDocPtr doc) { }
115
+
116
+ static VALUE Nokogiri_wrap_xml_document(VALUE klass, xmlDocPtr doc) {
117
+ return doc;
118
+ }
119
+
120
+ static VALUE find_dummy_key(VALUE collection) {
121
+ VALUE r_dummy = Qnil;
122
+ char dummy[5] = "a";
123
+ size_t len = 1;
124
+ ID key_;
125
+ CONST_ID(key_, "key?");
126
+ while (len < sizeof dummy) {
127
+ r_dummy = rb_utf8_str_new(dummy, len);
128
+ if (rb_funcall(collection, key_, 1, r_dummy) == Qfalse)
129
+ return r_dummy;
130
+ for (size_t i = 0; ; ++i) {
131
+ if (dummy[i] == 0) {
132
+ dummy[i] = 'a';
133
+ ++len;
134
+ break;
135
+ }
136
+ if (dummy[i] == 'z')
137
+ dummy[i] = 'a';
138
+ else {
139
+ ++dummy[i];
140
+ break;
141
+ }
142
+ }
143
+ }
144
+ // This collection has 475254 elements?? Give up.
145
+ rb_raise(rb_eArgError, "Failed to find a dummy key.");
146
+ }
147
+
148
+ // This should return an xmlAttrPtr, but we don't need it and it's easier to
149
+ // not get the result.
150
+ static void xmlNewNsProp (
151
+ xmlNodePtr node,
152
+ xmlNsPtr ns,
153
+ const xmlChar *name,
154
+ const xmlChar *value
155
+ ) {
156
+ ID set_attribute;
157
+ CONST_ID(set_attribute, "set_attribute");
158
+
159
+ VALUE rvalue = rb_utf8_str_new_cstr(value);
160
+
161
+ if (RTEST(ns)) {
162
+ // This is an easy case, we have a namespace so it's enough to do
163
+ // node["#{ns.prefix}:#{name}"] = value
164
+ ID prefix;
165
+ CONST_ID(prefix, "prefix");
166
+ VALUE ns_prefix = rb_funcall(ns, prefix, 0);
167
+ VALUE qname = rb_sprintf("%" PRIsVALUE ":%s", ns_prefix, name);
168
+ rb_funcall(node, set_attribute, 2, qname, rvalue);
169
+ return;
170
+ }
171
+
172
+ size_t len = strlen(name);
173
+ VALUE rname = rb_utf8_str_new(name, len);
174
+ if (memchr(name, ':', len) == NULL) {
175
+ // This is the easiest case. There's no colon so we can do
176
+ // node[name] = value.
177
+ rb_funcall(node, set_attribute, 2, rname, rvalue);
178
+ return;
179
+ }
180
+
181
+ // Nokogiri::XML::Node#set_attribute calls xmlSetProp(node, name, value)
182
+ // which behaves roughly as
183
+ // if name is a QName prefix:local
184
+ // if node->doc has a namespace ns corresponding to prefix
185
+ // return xmlSetNsProp(node, ns, local, value)
186
+ // return xmlSetNsProp(node, NULL, name, value)
187
+ //
188
+ // If the prefix is "xml", then the namespace lookup will create it.
189
+ //
190
+ // By contrast, xmlNewNsProp does not do this parsing and creates an attribute
191
+ // with the name and value exactly as given. This is the behavior that we
192
+ // want.
193
+ //
194
+ // Thus, for attribute names like "xml:lang", #set_attribute will create an
195
+ // attribute with namespace "xml" and name "lang". This is incorrect for
196
+ // html elements (but correct for foreign elements).
197
+ //
198
+ // Work around this by inserting a dummy attribute and then changing the
199
+ // name, if needed.
200
+
201
+ // Find a dummy attribute string that doesn't already exist.
202
+ VALUE dummy = find_dummy_key(node);
203
+ // Add the dummy attribute.
204
+ rb_funcall(node, set_attribute, 2, dummy, rvalue);
205
+
206
+ // Remove the old attribute, if it exists.
207
+ ID remove_attribute;
208
+ CONST_ID(remove_attribute, "remove_attribute");
209
+ rb_funcall(node, remove_attribute, 1, rname);
210
+
211
+ // Rename the dummy
212
+ ID attribute;
213
+ CONST_ID(attribute, "attribute");
214
+ VALUE attr = rb_funcall(node, attribute, 1, dummy);
215
+ rb_funcall(attr, node_name_, 1, rname);
216
+ }
217
+ #endif
218
+
219
+ // URI = system id
220
+ // external id = public id
221
+ static xmlDocPtr new_html_doc(const char *dtd_name, const char *system, const char *public)
222
+ {
223
+ #if NGLIB
224
+ // These two libxml2 functions take the public and system ids in
225
+ // opposite orders.
226
+ htmlDocPtr doc = htmlNewDocNoDtD(/* URI */ NULL, /* ExternalID */NULL);
227
+ assert(doc);
228
+ if (dtd_name)
229
+ xmlCreateIntSubset(doc, BAD_CAST dtd_name, BAD_CAST public, BAD_CAST system);
230
+ return doc;
231
+ #else
232
+ // remove internal subset from newly created documents
233
+ VALUE doc;
234
+ // If system and public are both NULL, Document#new is going to set default
235
+ // values for them so we're going to have to remove the internal subset
236
+ // which seems to leak memory in Nokogiri, so leak as little as possible.
237
+ if (system == NULL && public == NULL) {
238
+ ID remove;
239
+ CONST_ID(remove, "remove");
240
+ doc = rb_funcall(Document, new, 2, /* URI */ Qnil, /* external_id */ rb_utf8_str_new_static("", 0));
241
+ rb_funcall(rb_funcall(doc, internal_subset, 0), remove, 0);
242
+ if (dtd_name) {
243
+ // We need to create an internal subset now.
244
+ ID create_internal_subset;
245
+ CONST_ID(create_internal_subset, "create_internal_subset");
246
+ rb_funcall(doc, create_internal_subset, 3, rb_utf8_str_new_cstr(dtd_name), Qnil, Qnil);
247
+ }
248
+ } else {
249
+ assert(dtd_name);
250
+ // Rather than removing and creating the internal subset as we did above,
251
+ // just create and then rename one.
252
+ VALUE r_system = system ? rb_utf8_str_new_cstr(system) : Qnil;
253
+ VALUE r_public = public ? rb_utf8_str_new_cstr(public) : Qnil;
254
+ doc = rb_funcall(Document, new, 2, r_system, r_public);
255
+ rb_funcall(rb_funcall(doc, internal_subset, 0), node_name_, 1, rb_utf8_str_new_cstr(dtd_name));
256
+ }
257
+ return doc;
258
+ #endif
259
+ }
260
+
261
+ static xmlNodePtr get_parent(xmlNodePtr node) {
262
+ #if NGLIB
263
+ return node->parent;
264
+ #else
265
+ if (!rb_respond_to(node, parent))
266
+ return Qnil;
267
+ return rb_funcall(node, parent, 0);
268
+ #endif
269
+ }
270
+
271
+ static GumboOutput *perform_parse(const GumboOptions *options, VALUE input) {
272
+ assert(RTEST(input));
273
+ Check_Type(input, T_STRING);
274
+ GumboOutput *output = gumbo_parse_with_options (
275
+ options,
276
+ RSTRING_PTR(input),
277
+ RSTRING_LEN(input)
278
+ );
279
+
280
+ const char *status_string = gumbo_status_to_string(output->status);
281
+ switch (output->status) {
282
+ case GUMBO_STATUS_OK:
283
+ break;
284
+ case GUMBO_STATUS_TOO_MANY_ATTRIBUTES:
285
+ case GUMBO_STATUS_TREE_TOO_DEEP:
286
+ gumbo_destroy_output(output);
287
+ rb_raise(rb_eArgError, "%s", status_string);
288
+ case GUMBO_STATUS_OUT_OF_MEMORY:
289
+ gumbo_destroy_output(output);
290
+ rb_raise(rb_eNoMemError, "%s", status_string);
291
+ }
292
+ return output;
293
+ }
294
+
295
+ static xmlNsPtr lookup_or_add_ns (
296
+ xmlDocPtr doc,
297
+ xmlNodePtr root,
298
+ const char *href,
299
+ const char *prefix
300
+ ) {
301
+ #if NGLIB
302
+ xmlNsPtr ns = xmlSearchNs(doc, root, BAD_CAST prefix);
303
+ if (ns)
304
+ return ns;
305
+ return xmlNewNs(root, BAD_CAST href, BAD_CAST prefix);
306
+ #else
307
+ ID add_namespace_definition;
308
+ CONST_ID(add_namespace_definition, "add_namespace_definition");
309
+ VALUE rprefix = rb_utf8_str_new_cstr(prefix);
310
+ VALUE rhref = rb_utf8_str_new_cstr(href);
311
+ return rb_funcall(root, add_namespace_definition, 2, rprefix, rhref);
312
+ #endif
313
+ }
314
+
315
+ static void set_line(xmlNodePtr node, size_t line) {
316
+ #if NGLIB
317
+ // libxml2 uses 65535 to mean look elsewhere for the line number on some
318
+ // nodes.
319
+ if (line < 65535)
320
+ node->line = (unsigned short)line;
321
+ #else
322
+ // XXX: If Nokogiri gets a `#line=` method, we'll use that.
323
+ #endif
324
+ }
325
+
326
+ // Construct an XML tree rooted at xml_output_node from the Gumbo tree rooted
327
+ // at gumbo_node.
328
+ static void build_tree (
329
+ xmlDocPtr doc,
330
+ xmlNodePtr xml_output_node,
331
+ const GumboNode *gumbo_node
332
+ ) {
333
+ xmlNodePtr xml_root = NIL;
334
+ xmlNodePtr xml_node = xml_output_node;
335
+ size_t child_index = 0;
336
+
337
+ while (true) {
338
+ assert(gumbo_node != NULL);
339
+ const GumboVector *children = gumbo_node->type == GUMBO_NODE_DOCUMENT?
340
+ &gumbo_node->v.document.children : &gumbo_node->v.element.children;
341
+ if (child_index >= children->length) {
342
+ // Move up the tree and to the next child.
343
+ if (xml_node == xml_output_node) {
344
+ // We've built as much of the tree as we can.
345
+ return;
346
+ }
347
+ child_index = gumbo_node->index_within_parent + 1;
348
+ gumbo_node = gumbo_node->parent;
349
+ xml_node = get_parent(xml_node);
350
+ // Children of fragments don't share the same root, so reset it and
351
+ // it'll be set below. In the non-fragment case, this will only happen
352
+ // after the html element has been finished at which point there are no
353
+ // further elements.
354
+ if (xml_node == xml_output_node)
355
+ xml_root = NIL;
356
+ continue;
357
+ }
358
+ const GumboNode *gumbo_child = children->data[child_index++];
359
+ xmlNodePtr xml_child;
360
+
361
+ switch (gumbo_child->type) {
362
+ case GUMBO_NODE_DOCUMENT:
363
+ abort(); // Bug in Gumbo.
364
+
365
+ case GUMBO_NODE_TEXT:
366
+ case GUMBO_NODE_WHITESPACE:
367
+ xml_child = xmlNewDocText(doc, BAD_CAST gumbo_child->v.text.text);
368
+ set_line(xml_child, gumbo_child->v.text.start_pos.line);
369
+ xmlAddChild(xml_node, xml_child);
370
+ break;
371
+
372
+ case GUMBO_NODE_CDATA:
373
+ xml_child = xmlNewCDataBlock(doc, BAD_CAST gumbo_child->v.text.text,
374
+ (int) strlen(gumbo_child->v.text.text));
375
+ set_line(xml_child, gumbo_child->v.text.start_pos.line);
376
+ xmlAddChild(xml_node, xml_child);
377
+ break;
378
+
379
+ case GUMBO_NODE_COMMENT:
380
+ xml_child = xmlNewDocComment(doc, BAD_CAST gumbo_child->v.text.text);
381
+ set_line(xml_child, gumbo_child->v.text.start_pos.line);
382
+ xmlAddChild(xml_node, xml_child);
383
+ break;
384
+
385
+ case GUMBO_NODE_TEMPLATE:
386
+ // XXX: Should create a template element and a new DocumentFragment
387
+ case GUMBO_NODE_ELEMENT:
388
+ {
389
+ xml_child = xmlNewDocNode(doc, NIL, BAD_CAST gumbo_child->v.element.name, NULL);
390
+ set_line(xml_child, gumbo_child->v.element.start_pos.line);
391
+ if (xml_root == NIL)
392
+ xml_root = xml_child;
393
+ xmlNsPtr ns = NIL;
394
+ switch (gumbo_child->v.element.tag_namespace) {
395
+ case GUMBO_NAMESPACE_HTML:
396
+ break;
397
+ case GUMBO_NAMESPACE_SVG:
398
+ ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/2000/svg", "svg");
399
+ break;
400
+ case GUMBO_NAMESPACE_MATHML:
401
+ ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/1998/Math/MathML", "math");
402
+ break;
403
+ }
404
+ if (ns != NIL)
405
+ xmlSetNs(xml_child, ns);
406
+ xmlAddChild(xml_node, xml_child);
407
+
408
+ // Add the attributes.
409
+ const GumboVector* attrs = &gumbo_child->v.element.attributes;
410
+ for (size_t i=0; i < attrs->length; i++) {
411
+ const GumboAttribute *attr = attrs->data[i];
412
+
413
+ switch (attr->attr_namespace) {
414
+ case GUMBO_ATTR_NAMESPACE_XLINK:
415
+ ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/1999/xlink", "xlink");
416
+ break;
417
+
418
+ case GUMBO_ATTR_NAMESPACE_XML:
419
+ ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/XML/1998/namespace", "xml");
420
+ break;
421
+
422
+ case GUMBO_ATTR_NAMESPACE_XMLNS:
423
+ ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/2000/xmlns/", "xmlns");
424
+ break;
425
+
426
+ default:
427
+ ns = NIL;
428
+ }
429
+ xmlNewNsProp(xml_child, ns, BAD_CAST attr->name, BAD_CAST attr->value);
430
+ }
431
+
432
+ // Add children for this element.
433
+ child_index = 0;
434
+ gumbo_node = gumbo_child;
435
+ xml_node = xml_child;
436
+ }
437
+ }
438
+ }
439
+ }
440
+
441
+ static void add_errors(const GumboOutput *output, VALUE rdoc, VALUE input, VALUE url) {
442
+ const char *input_str = RSTRING_PTR(input);
443
+ size_t input_len = RSTRING_LEN(input);
444
+
445
+ // Add parse errors to rdoc.
446
+ if (output->errors.length) {
447
+ const GumboVector *errors = &output->errors;
448
+ VALUE rerrors = rb_ary_new2(errors->length);
449
+
450
+ for (size_t i=0; i < errors->length; i++) {
451
+ GumboError *err = errors->data[i];
452
+ GumboSourcePosition position = gumbo_error_position(err);
453
+ char *msg;
454
+ size_t size = gumbo_caret_diagnostic_to_string(err, input_str, input_len, &msg);
455
+ VALUE err_str = rb_utf8_str_new(msg, size);
456
+ free(msg);
457
+ VALUE syntax_error = rb_class_new_instance(1, &err_str, cNokogiriXmlSyntaxError);
458
+ const char *error_code = gumbo_error_code(err);
459
+ VALUE str1 = error_code? rb_utf8_str_new_static(error_code, strlen(error_code)) : Qnil;
460
+ rb_iv_set(syntax_error, "@domain", INT2NUM(1)); // XML_FROM_PARSER
461
+ rb_iv_set(syntax_error, "@code", INT2NUM(1)); // XML_ERR_INTERNAL_ERROR
462
+ rb_iv_set(syntax_error, "@level", INT2NUM(2)); // XML_ERR_ERROR
463
+ rb_iv_set(syntax_error, "@file", url);
464
+ rb_iv_set(syntax_error, "@line", INT2NUM(position.line));
465
+ rb_iv_set(syntax_error, "@str1", str1);
466
+ rb_iv_set(syntax_error, "@str2", Qnil);
467
+ rb_iv_set(syntax_error, "@str3", Qnil);
468
+ rb_iv_set(syntax_error, "@int1", INT2NUM(0));
469
+ rb_iv_set(syntax_error, "@column", INT2NUM(position.column));
470
+ rb_ary_push(rerrors, syntax_error);
471
+ }
472
+ rb_iv_set(rdoc, "@errors", rerrors);
473
+ }
474
+ }
475
+
476
+ typedef struct {
477
+ GumboOutput *output;
478
+ VALUE input;
479
+ VALUE url_or_frag;
480
+ xmlDocPtr doc;
481
+ } ParseArgs;
482
+
483
+ static void parse_args_mark(void *parse_args) {
484
+ ParseArgs *args = parse_args;
485
+ rb_gc_mark_maybe(args->input);
486
+ rb_gc_mark_maybe(args->url_or_frag);
487
+ }
488
+
489
+ // Wrap a ParseArgs pointer. The underlying ParseArgs must outlive the
490
+ // wrapper.
491
+ static VALUE wrap_parse_args(ParseArgs *args) {
492
+ return Data_Wrap_Struct(rb_cData, parse_args_mark, RUBY_NEVER_FREE, args);
493
+ }
494
+
495
+ // Returnsd the underlying ParseArgs wrapped by wrap_parse_args.
496
+ static ParseArgs *unwrap_parse_args(VALUE obj) {
497
+ ParseArgs *args;
498
+ Data_Get_Struct(obj, ParseArgs, args);
499
+ return args;
500
+ }
501
+
502
+ static VALUE parse_cleanup(VALUE parse_args) {
503
+ ParseArgs *args = unwrap_parse_args(parse_args);
504
+ gumbo_destroy_output(args->output);
505
+ // Make sure garbage collection doesn't mark the objects as being live based
506
+ // on references from the ParseArgs. This may be unnecessary.
507
+ args->input = Qnil;
508
+ args->url_or_frag = Qnil;
509
+ if (args->doc != NIL)
510
+ xmlFreeDoc(args->doc);
511
+ return Qnil;
512
+ }
513
+
514
+ static VALUE parse_continue(VALUE parse_args);
515
+
516
+ // Parse a string using gumbo_parse into a Nokogiri document
517
+ static VALUE parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth) {
518
+ GumboOptions options = kGumboDefaultOptions;
519
+ options.max_attributes = NUM2INT(max_attributes);
520
+ options.max_errors = NUM2INT(max_errors);
521
+ options.max_tree_depth = NUM2INT(max_depth);
522
+
523
+ GumboOutput *output = perform_parse(&options, input);
524
+ ParseArgs args = {
525
+ .output = output,
526
+ .input = input,
527
+ .url_or_frag = url,
528
+ .doc = NIL,
529
+ };
530
+ VALUE parse_args = wrap_parse_args(&args);
531
+
532
+ return rb_ensure(parse_continue, parse_args, parse_cleanup, parse_args);
533
+ }
534
+
535
+ static VALUE parse_continue(VALUE parse_args) {
536
+ ParseArgs *args = unwrap_parse_args(parse_args);
537
+ GumboOutput *output = args->output;
538
+ xmlDocPtr doc;
539
+ if (output->document->v.document.has_doctype) {
540
+ const char *name = output->document->v.document.name;
541
+ const char *public = output->document->v.document.public_identifier;
542
+ const char *system = output->document->v.document.system_identifier;
543
+ public = public[0] ? public : NULL;
544
+ system = system[0] ? system : NULL;
545
+ doc = new_html_doc(name, system, public);
546
+ } else {
547
+ doc = new_html_doc(NULL, NULL, NULL);
548
+ }
549
+ args->doc = doc; // Make sure doc gets cleaned up if an error is thrown.
550
+ build_tree(doc, (xmlNodePtr)doc, output->document);
551
+ VALUE rdoc = Nokogiri_wrap_xml_document(Document, doc);
552
+ args->doc = NIL; // The Ruby runtime now owns doc so don't delete it.
553
+ add_errors(output, rdoc, args->input, args->url_or_frag);
554
+ return rdoc;
555
+ }
556
+
557
+ static int lookup_namespace(VALUE node, bool require_known_ns) {
558
+ ID namespace, href;
559
+ CONST_ID(namespace, "namespace");
560
+ CONST_ID(href, "href");
561
+ VALUE ns = rb_funcall(node, namespace, 0);
562
+
563
+ if (NIL_P(ns))
564
+ return GUMBO_NAMESPACE_HTML;
565
+ ns = rb_funcall(ns, href, 0);
566
+ assert(RTEST(ns));
567
+ Check_Type(ns, T_STRING);
568
+
569
+ const char *href_ptr = RSTRING_PTR(ns);
570
+ size_t href_len = RSTRING_LEN(ns);
571
+ #define NAMESPACE_P(uri) (href_len == sizeof uri - 1 && !memcmp(href_ptr, uri, href_len))
572
+ if (NAMESPACE_P("http://www.w3.org/1999/xhtml"))
573
+ return GUMBO_NAMESPACE_HTML;
574
+ if (NAMESPACE_P("http://www.w3.org/1998/Math/MathML"))
575
+ return GUMBO_NAMESPACE_MATHML;
576
+ if (NAMESPACE_P("http://www.w3.org/2000/svg"))
577
+ return GUMBO_NAMESPACE_SVG;
578
+ #undef NAMESPACE_P
579
+ if (require_known_ns)
580
+ rb_raise(rb_eArgError, "Unexpected namespace URI \"%*s\"", (int)href_len, href_ptr);
581
+ return -1;
582
+ }
583
+
584
+ static xmlNodePtr extract_xml_node(VALUE node) {
585
+ #if NGLIB
586
+ xmlNodePtr xml_node;
587
+ Data_Get_Struct(node, xmlNode, xml_node);
588
+ return xml_node;
589
+ #else
590
+ return node;
591
+ #endif
592
+ }
593
+
594
+ static VALUE fragment_continue(VALUE parse_args);
595
+
596
+ static VALUE fragment (
597
+ VALUE self,
598
+ VALUE doc_fragment,
599
+ VALUE tags,
600
+ VALUE ctx,
601
+ VALUE max_attributes,
602
+ VALUE max_errors,
603
+ VALUE max_depth
604
+ ) {
605
+ ID name = rb_intern_const("name");
606
+ const char *ctx_tag;
607
+ GumboNamespaceEnum ctx_ns;
608
+ GumboQuirksModeEnum quirks_mode;
609
+ bool form = false;
610
+ const char *encoding = NULL;
611
+
612
+ if (NIL_P(ctx)) {
613
+ ctx_tag = "body";
614
+ ctx_ns = GUMBO_NAMESPACE_HTML;
615
+ } else if (TYPE(ctx) == T_STRING) {
616
+ ctx_tag = StringValueCStr(ctx);
617
+ ctx_ns = GUMBO_NAMESPACE_HTML;
618
+ size_t len = RSTRING_LEN(ctx);
619
+ const char *colon = memchr(ctx_tag, ':', len);
620
+ if (colon) {
621
+ switch (colon - ctx_tag) {
622
+ case 3:
623
+ if (st_strncasecmp(ctx_tag, "svg", 3) != 0)
624
+ goto error;
625
+ ctx_ns = GUMBO_NAMESPACE_SVG;
626
+ break;
627
+ case 4:
628
+ if (st_strncasecmp(ctx_tag, "html", 4) == 0)
629
+ ctx_ns = GUMBO_NAMESPACE_HTML;
630
+ else if (st_strncasecmp(ctx_tag, "math", 4) == 0)
631
+ ctx_ns = GUMBO_NAMESPACE_MATHML;
632
+ else
633
+ goto error;
634
+ break;
635
+ default:
636
+ error:
637
+ rb_raise(rb_eArgError, "Invalid context namespace '%*s'", (int)(colon - ctx_tag), ctx_tag);
638
+ }
639
+ ctx_tag = colon+1;
640
+ } else {
641
+ // For convenience, put 'svg' and 'math' in their namespaces.
642
+ if (len == 3 && st_strncasecmp(ctx_tag, "svg", 3) == 0)
643
+ ctx_ns = GUMBO_NAMESPACE_SVG;
644
+ else if (len == 4 && st_strncasecmp(ctx_tag, "math", 4) == 0)
645
+ ctx_ns = GUMBO_NAMESPACE_MATHML;
646
+ }
647
+
648
+ // Check if it's a form.
649
+ form = ctx_ns == GUMBO_NAMESPACE_HTML && st_strcasecmp(ctx_tag, "form") == 0;
650
+ } else {
651
+ ID element_ = rb_intern_const("element?");
652
+
653
+ // Context fragment name.
654
+ VALUE tag_name = rb_funcall(ctx, name, 0);
655
+ assert(RTEST(tag_name));
656
+ Check_Type(tag_name, T_STRING);
657
+ ctx_tag = StringValueCStr(tag_name);
658
+
659
+ // Context fragment namespace.
660
+ ctx_ns = lookup_namespace(ctx, true);
661
+
662
+ // Check for a form ancestor, including self.
663
+ for (VALUE node = ctx;
664
+ !NIL_P(node);
665
+ node = rb_respond_to(node, parent) ? rb_funcall(node, parent, 0) : Qnil) {
666
+ if (!RTEST(rb_funcall(node, element_, 0)))
667
+ continue;
668
+ VALUE element_name = rb_funcall(node, name, 0);
669
+ if (RSTRING_LEN(element_name) == 4
670
+ && !st_strcasecmp(RSTRING_PTR(element_name), "form")
671
+ && lookup_namespace(node, false) == GUMBO_NAMESPACE_HTML) {
672
+ form = true;
673
+ break;
674
+ }
675
+ }
676
+
677
+ // Encoding.
678
+ if (RSTRING_LEN(tag_name) == 14
679
+ && !st_strcasecmp(ctx_tag, "annotation-xml")) {
680
+ VALUE enc = rb_funcall(ctx, rb_intern_const("[]"),
681
+ rb_utf8_str_new_static("encoding", 8));
682
+ if (RTEST(enc)) {
683
+ Check_Type(enc, T_STRING);
684
+ encoding = StringValueCStr(enc);
685
+ }
686
+ }
687
+ }
688
+
689
+ // Quirks mode.
690
+ VALUE doc = rb_funcall(doc_fragment, rb_intern_const("document"), 0);
691
+ VALUE dtd = rb_funcall(doc, internal_subset, 0);
692
+ if (NIL_P(dtd)) {
693
+ quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS;
694
+ } else {
695
+ VALUE dtd_name = rb_funcall(dtd, name, 0);
696
+ VALUE pubid = rb_funcall(dtd, rb_intern_const("external_id"), 0);
697
+ VALUE sysid = rb_funcall(dtd, rb_intern_const("system_id"), 0);
698
+ quirks_mode = gumbo_compute_quirks_mode (
699
+ NIL_P(dtd_name)? NULL:StringValueCStr(dtd_name),
700
+ NIL_P(pubid)? NULL:StringValueCStr(pubid),
701
+ NIL_P(sysid)? NULL:StringValueCStr(sysid)
702
+ );
703
+ }
704
+
705
+ // Perform a fragment parse.
706
+ int depth = NUM2INT(max_depth);
707
+ GumboOptions options = kGumboDefaultOptions;
708
+ options.max_attributes = NUM2INT(max_attributes);
709
+ options.max_errors = NUM2INT(max_errors);
710
+ // Add one to account for the HTML element.
711
+ options.max_tree_depth = depth < 0 ? -1 : (depth + 1);
712
+ options.fragment_context = ctx_tag;
713
+ options.fragment_namespace = ctx_ns;
714
+ options.fragment_encoding = encoding;
715
+ options.quirks_mode = quirks_mode;
716
+ options.fragment_context_has_form_ancestor = form;
717
+
718
+ GumboOutput *output = perform_parse(&options, tags);
719
+ ParseArgs args = {
720
+ .output = output,
721
+ .input = tags,
722
+ .url_or_frag = doc_fragment,
723
+ .doc = (xmlDocPtr)extract_xml_node(doc),
724
+ };
725
+ VALUE parse_args = wrap_parse_args(&args);
726
+ rb_ensure(fragment_continue, parse_args, parse_cleanup, parse_args);
727
+ return Qnil;
728
+ }
729
+
730
+ static VALUE fragment_continue(VALUE parse_args) {
731
+ ParseArgs *args = unwrap_parse_args(parse_args);
732
+ GumboOutput *output = args->output;
733
+ VALUE doc_fragment = args->url_or_frag;
734
+ xmlDocPtr xml_doc = args->doc;
735
+
736
+ args->doc = NIL; // The Ruby runtime owns doc so make sure we don't delete it.
737
+ xmlNodePtr xml_frag = extract_xml_node(doc_fragment);
738
+ build_tree(xml_doc, xml_frag, output->root);
739
+ add_errors(output, doc_fragment, args->input, rb_utf8_str_new_static("#fragment", 9));
740
+ return Qnil;
741
+ }
742
+
743
+ // Initialize the Nokogumbo class and fetch constants we will use later.
744
+ void Init_nokogumbo() {
745
+ rb_funcall(rb_mKernel, rb_intern_const("gem"), 1, rb_utf8_str_new_static("nokogiri", 8));
746
+ rb_require("nokogiri");
747
+
748
+ VALUE line_supported = Qtrue;
749
+
750
+ #if !NGLIB
751
+ // Class constants.
752
+ VALUE mNokogiri = rb_const_get(rb_cObject, rb_intern_const("Nokogiri"));
753
+ VALUE mNokogiriXml = rb_const_get(mNokogiri, rb_intern_const("XML"));
754
+ cNokogiriXmlSyntaxError = rb_const_get(mNokogiriXml, rb_intern_const("SyntaxError"));
755
+ rb_gc_register_mark_object(cNokogiriXmlSyntaxError);
756
+ cNokogiriXmlElement = rb_const_get(mNokogiriXml, rb_intern_const("Element"));
757
+ rb_gc_register_mark_object(cNokogiriXmlElement);
758
+ cNokogiriXmlText = rb_const_get(mNokogiriXml, rb_intern_const("Text"));
759
+ rb_gc_register_mark_object(cNokogiriXmlText);
760
+ cNokogiriXmlCData = rb_const_get(mNokogiriXml, rb_intern_const("CDATA"));
761
+ rb_gc_register_mark_object(cNokogiriXmlCData);
762
+ cNokogiriXmlComment = rb_const_get(mNokogiriXml, rb_intern_const("Comment"));
763
+ rb_gc_register_mark_object(cNokogiriXmlComment);
764
+
765
+ // Interned symbols.
766
+ new = rb_intern_const("new");
767
+ node_name_ = rb_intern_const("node_name=");
768
+
769
+ // #line is not supported (returns 0)
770
+ line_supported = Qfalse;
771
+ #endif
772
+
773
+ // Class constants.
774
+ VALUE HTML5 = rb_const_get(mNokogiri, rb_intern_const("HTML5"));
775
+ Document = rb_const_get(HTML5, rb_intern_const("Document"));
776
+ rb_gc_register_mark_object(Document);
777
+
778
+ // Interned symbols.
779
+ internal_subset = rb_intern_const("internal_subset");
780
+ parent = rb_intern_const("parent");
781
+
782
+ // Define Nokogumbo module with parse and fragment methods.
783
+ VALUE Gumbo = rb_define_module("Nokogumbo");
784
+ rb_define_singleton_method(Gumbo, "parse", parse, 5);
785
+ rb_define_singleton_method(Gumbo, "fragment", fragment, 6);
786
+
787
+ // Add private constant for testing.
788
+ rb_define_const(Gumbo, "LINE_SUPPORTED", line_supported);
789
+ rb_funcall(Gumbo, rb_intern_const("private_constant"), 1,
790
+ rb_utf8_str_new_cstr("LINE_SUPPORTED"));
791
+ }
792
+
793
+ // vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab: