nokogumbo 1.5.0 → 2.0.0.pre.alpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +56 -0
  3. data/README.md +146 -22
  4. data/ext/nokogumbo/extconf.rb +116 -0
  5. data/ext/{nokogumboc → nokogumbo}/nokogumbo.c +174 -71
  6. data/gumbo-parser/src/ascii.c +33 -0
  7. data/gumbo-parser/src/ascii.h +31 -0
  8. data/gumbo-parser/src/attribute.c +26 -28
  9. data/gumbo-parser/src/attribute.h +3 -23
  10. data/gumbo-parser/src/char_ref.c +135 -2351
  11. data/gumbo-parser/src/char_ref.h +13 -29
  12. data/gumbo-parser/src/error.c +215 -133
  13. data/gumbo-parser/src/error.h +34 -49
  14. data/gumbo-parser/src/foreign_attrs.c +104 -0
  15. data/gumbo-parser/src/gumbo.h +506 -304
  16. data/gumbo-parser/src/insertion_mode.h +4 -28
  17. data/gumbo-parser/src/macros.h +91 -0
  18. data/gumbo-parser/src/parser.c +1989 -1431
  19. data/gumbo-parser/src/parser.h +6 -22
  20. data/gumbo-parser/src/replacement.h +33 -0
  21. data/gumbo-parser/src/string_buffer.c +43 -50
  22. data/gumbo-parser/src/string_buffer.h +24 -40
  23. data/gumbo-parser/src/string_piece.c +39 -39
  24. data/gumbo-parser/src/svg_attrs.c +174 -0
  25. data/gumbo-parser/src/svg_tags.c +137 -0
  26. data/gumbo-parser/src/tag.c +186 -59
  27. data/gumbo-parser/src/tag_lookup.c +382 -0
  28. data/gumbo-parser/src/tag_lookup.h +13 -0
  29. data/gumbo-parser/src/token_type.h +1 -25
  30. data/gumbo-parser/src/tokenizer.c +899 -495
  31. data/gumbo-parser/src/tokenizer.h +37 -37
  32. data/gumbo-parser/src/tokenizer_states.h +6 -22
  33. data/gumbo-parser/src/utf8.c +103 -86
  34. data/gumbo-parser/src/utf8.h +37 -41
  35. data/gumbo-parser/src/util.c +48 -38
  36. data/gumbo-parser/src/util.h +10 -40
  37. data/gumbo-parser/src/vector.c +45 -57
  38. data/gumbo-parser/src/vector.h +17 -39
  39. data/lib/nokogumbo.rb +10 -174
  40. data/lib/nokogumbo/html5.rb +250 -0
  41. data/lib/nokogumbo/html5/document.rb +37 -0
  42. data/lib/nokogumbo/html5/document_fragment.rb +46 -0
  43. data/lib/nokogumbo/version.rb +3 -0
  44. data/lib/nokogumbo/xml/node.rb +57 -0
  45. metadata +32 -19
  46. data/ext/nokogumboc/extconf.rb +0 -60
  47. data/gumbo-parser/src/char_ref.rl +0 -2554
  48. data/gumbo-parser/src/string_piece.h +0 -38
  49. data/gumbo-parser/src/tag.in +0 -150
  50. data/gumbo-parser/src/tag_enum.h +0 -153
  51. data/gumbo-parser/src/tag_gperf.h +0 -105
  52. data/gumbo-parser/src/tag_sizes.h +0 -4
  53. data/gumbo-parser/src/tag_strings.h +0 -153
  54. data/gumbo-parser/visualc/include/strings.h +0 -4
  55. data/test-nokogumbo.rb +0 -190
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 96fa61565f78d5491e0b6d5b505cf936524745eb848b8b6584fc15e20c7ae35b
4
- data.tar.gz: e5416f71bbe90323f04b8aad4dc48b28947e43a9eb46f446f8ca1444f519a07b
3
+ metadata.gz: e0d434c0749d7922ba8f084c15ed7219ccbf0e07b715368ae846bc38e64aad17
4
+ data.tar.gz: 2770648e3e9e82d0ffb1877f1c06edc537688cf6a8405bc52dbdf5a6bb69bc1a
5
5
  SHA512:
6
- metadata.gz: 676bf3585d38cd4ad5c72b8b3afd4952e248c747683ae1072dd43f6ce1ccd279177e4d0c75a9821ed76d32806333128152231349d8d113ae5d81279580b13004
7
- data.tar.gz: 3459078d96977399e75551c4a3ee5623091f48569984b771e540ec111125f5af91e39a8d78cbd3ce9280326b1b9395dc4a0b0d7f0a72294876682cb9fe35e3d9
6
+ metadata.gz: e6c3de49495bf55ccaa250e2a3275b6796b0f0565da2a930e3333d2a153f2a16312eb77cb28ca3e03c17720127c2ecc27a1f71cfd6acfd15407295c29973e9fb
7
+ data.tar.gz: e8ce6c80cb2327d2327f03c7e829156c1f0074ba4d6fce2b0d59305b80112b8fd5edc0932fad1fca13cb5f4bb6f2652fe52a2f090110aa76d06e1afbdebc334f
data/CHANGELOG.md ADDED
@@ -0,0 +1,56 @@
1
+ # Changelog
2
+
3
+ All notable changes to Nokogumbo will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
6
+ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
9
+ ### Added
10
+ - Experimental support for errors (it was supported in 1.5.0 but
11
+ undocumented).
12
+ - Added proper HTML5 serialization.
13
+ - Added option `:max_tree_depth` to control the maximum parse tree depth.
14
+
15
+ ### Changed
16
+ - Integrated [Gumbo parser](https://github.com/google/gumbo-parser) into
17
+ Nokogumbo. A system version will not be used.
18
+ - The undocumented (but publicly mentioned) `:max_parse_errors` was renamed to `:max_errors`;
19
+ `:max_parse_errors` is deprecated and will go away.
20
+ - The various `#parse` and `#fragment` (and `Nokogiri.HTML5`) methods return
21
+ `Nokogiri::HTML5::Document` and `Nokogiri::HTML5::DocumentFragment` classes
22
+ rather than `Nokogiri::HTML::Document` and
23
+ `Nokogiri::HTML::DocumentFragment`.
24
+ - Changed the top-level API to more closely match Nokogiri's while maintaining
25
+ backwards compatibility. The new APIs are
26
+ * `Nokogiri::HTML5(html, url = nil, encoding = nil, **options, &block)`
27
+ * `Nokogiri::HTML5.parse(html, url = nil, encoding = nil, **options, &block)`
28
+ * `Nokogiri::HTML5::Document.parse(html, url = nil, encoding = nil, **options, &block)`
29
+ * `Nokogiri::HTML5.fragment(html, encoding = nil, **options)`
30
+ * `Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, **options)`
31
+ In all cases, `html` can be a string or an `IO` object (something that
32
+ responds to `#read`). The `url` parameter is entirely for error reporting,
33
+ as in Nokogiri. The `encoding` parameter only signals what encoding `html`
34
+ should have on input; the output `Document` or `DocumentFragment` will be in
35
+ UTF-8. Currently, the only option supported is `:max_errors`, which controls
36
+ the maximum number of errors reported by `#errors`.
37
+
38
+ ### Deprecated
39
+ - `:max_parse_errors`; use `:max_errors`
40
+
41
+ ### Removed
42
+
43
+ ### Fixed
44
+ - Fixed documents failing to serialize (via `to_html`) if they contain certain
45
+ `meta` elements that set the `charset`.
46
+ - Documents are now properly marked as UTF-8 after parsing.
47
+ - Fixed `Nokogiri::HTML5.fragment` reporting an error due to a missing
48
+ `<!DOCTYPE html>`.
49
+ - Fixed crash when input contains U+0000 NULL bytes and error reporting is
50
+ enabled.
51
+
52
+ ### Security
53
+ - The most recent, released version of Gumbo has a [potential security
54
+ issue](https://github.com/google/gumbo-parser/pull/375) that could result in
55
+ a cross-site scripting vulnerability. This has been fixed by integrating
56
+ Gumbo into Nokogumbo.
data/README.md CHANGED
@@ -1,5 +1,4 @@
1
- Nokogumbo - a Nokogiri interface to the Gumbo HTML5 parser.
2
- ===========
1
+ # Nokogumbo - a Nokogiri interface to the Gumbo HTML5 parser.
3
2
 
4
3
  Nokogumbo provides the ability for a Ruby program to invoke the
5
4
  [Gumbo HTML5 parser](https://github.com/google/gumbo-parser#readme)
@@ -8,12 +7,11 @@ and to access the result as a
8
7
 
9
8
  [![Build Status](https://travis-ci.org/rubys/nokogumbo.svg)](https://travis-ci.org/rubys/nokogumbo)
10
9
 
11
- Usage
12
- -----
10
+ ## Usage
13
11
 
14
12
  ```ruby
15
13
  require 'nokogumbo'
16
- doc = Nokogiri::HTML5(string)
14
+ doc = Nokogiri.HTML5(string)
17
15
  ```
18
16
 
19
17
  An experimental _fragment_ method is also provided. While not HTML5
@@ -32,21 +30,150 @@ require 'nokogumbo'
32
30
  doc = Nokogiri::HTML5.get(uri)
33
31
  ```
34
32
 
35
- Example
36
- -----
33
+ ## Parsing options
34
+ The document and fragment parsing methods,
35
+ - `Nokogiri.HTML5(html, url = nil, encoding = nil, options = {})`
36
+ - `Nokogiri::HTML5.parse(html, url = nil, encoding = nil, options = {})`
37
+ - `Nokogiri::HTML5::Document.parse(html, url = nil, encoding = nil, options = {})`
38
+ - `Nokogiri::HTML5.fragment(html, encoding = nil, options = {})`
39
+ - `Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, options = {})`
40
+ support options that are different from Nokogiri's.
41
+
42
+ The two currently supported options are `:max_errors` and `:max_tree_depth`,
43
+ described below.
44
+
45
+ ### Error reporting
46
+ Nokogumbo contains an experimental parse error reporting facility. By default,
47
+ no parse errors are reported but this can be configured by passing the
48
+ `:max_errors` option to `::parse` or `::fragment`.
49
+
37
50
  ```ruby
38
51
  require 'nokogumbo'
39
- puts Nokogiri::HTML5.get('http://nokogiri.org').search('ol li')[2].text
52
+ doc = Nokogiri::HTML5.parse('Hi there!<body>', max_errors: 10)
53
+ doc.errors.each do |err|
54
+ puts err
55
+ end
56
+ ```
57
+
58
+ This prints the following.
59
+ ```
60
+ 1:1: ERROR: @1:1: The doctype must be the first token in the document.
61
+ Hi there!<body>
62
+ ^
63
+ 1:10: ERROR: @1:10: That tag isn't allowed here Currently open tags: html, body..
64
+ Hi there!<body>
65
+ ^
66
+ ```
67
+
68
+ Using `max_errors: -1` results in an unlimited number of errors being
69
+ returned.
70
+
71
+ The errors returned by `#errors` are instances of
72
+ [`Nokogiri::XML::SyntaxError`](https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/XML/SyntaxError).
73
+
74
+ ### Maximum tree depth
75
+ The maximum depth of the DOM tree parsed by the various parsing methods is
76
+ configurable by the `:max_tree_depth` option. If the depth of the tree would
77
+ exceed this limit, then an
78
+ [ArgumentError](https://ruby-doc.org/core-2.5.0/ArgumentError.html) is raised.
79
+
80
+ This limit (which defaults to `Nokogumbo::DEFAULT_MAX_TREE_DEPTH = 400`) can
81
+ be removed by giving the option `max_tree_depth: -1`.
82
+
83
+ ``` ruby
84
+ html = '<!DOCTYPE html>' + '<div>' * 1000
85
+ doc = Nokogiri.HTML5(html)
86
+ # raises ArgumentError: Document tree depth limit exceeded
87
+ doc = Nokogiri.HTML5(html, max_tree_depth: -1)
40
88
  ```
41
89
 
42
- Use `.to_html` instead of `.to_s` when parsing and serializing multiple times
90
+ ## HTML Serialization
91
+
92
+ After parsing HTML, it may be serialized using any of the Nokogiri
93
+ [serialization
94
+ methods](https://www.rubydoc.info/gems/nokogiri/Nokogiri/XML/Node). In
95
+ particular, `#serialize`, `#to_html`, and `#to_s` will serialize a given node
96
+ and its children. (This is the equivalent of JavaScript's
97
+ `Element.outerHTML`.) Similarly, `#inner_html` will serialize the children of
98
+ a given node. (This is the equivalent of JavaScript's `Element.innerHTML`.)
99
+
100
+ ``` ruby
101
+ doc = Nokogiri::HTML5("<!DOCTYPE html><span>Hello world!</span>")
102
+ puts doc.serialize
103
+ # Prints: <!DOCTYPE html><html><head></head><body><span>Hello world!</span></body></html>
104
+ ```
105
+
106
+ Due to quirks in how HTML is parsed and serialized, it's possible for a DOM
107
+ tree to be serialized and then re-parsed, resulting in a different DOM.
108
+ Mostly, this happens with DOMs produced from invalid HTML. Unfortunately, even
109
+ valid HTML may not survive serialization and re-parsing.
110
+
111
+ In particular, a newline at the start of `pre`, `listing`, and `textarea`
112
+ elements is ignored by the parser.
113
+
114
+ ``` ruby
115
+ doc = Nokogiri::HTML5(<<-EOF)
116
+ <!DOCTYPE html>
117
+ <pre>
118
+ Content</pre>
119
+ EOF
120
+ puts doc.at('/html/body/pre').serialize
121
+ # Prints: <pre>Content</pre>
43
122
  ```
123
+
124
+ In this case, the original HTML is semantically equivalent to the serialized
125
+ version. If the `pre`, `listing`, or `textarea` content starts with two
126
+ newlines, the first newline will be stripped on the first parse and the second
127
+ newline will be stripped on the second, leading to semantically different
128
+ DOMs. Passing the parameter `preserve_newline: true` will cause two or more
129
+ newlines to be preserved. (A single leading newline will still be removed.)
130
+
131
+ ``` ruby
132
+ doc = Nokogiri::HTML5(<<-EOF)
133
+ <!DOCTYPE html>
134
+ <listing>
135
+
136
+ Content</listing>
137
+ EOF
138
+ puts doc.at('/html/body/listing').serialize(preserve_newline: true)
139
+ # Prints: <listing>
140
+ #
141
+ # Content</listing>
142
+ ```
143
+
144
+ ## Encodings
145
+ Nokogumbo always parses HTML using
146
+ [UTF-8](https://en.wikipedia.org/wiki/UTF-8); however, the encoding of the
147
+ input can be explicitly selected via the optional `encoding` parameter. This
148
+ is most useful when the input comes not from a string but from an IO object.
149
+
150
+ When serializing a document or node, the encoding of the output string can be
151
+ specified via the `:encoding` option. Characters that cannot be encoded in
152
+ the selected encoding will be encoded as [HTML numeric
153
+ entities](https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references).
154
+
155
+ ``` ruby
156
+ frag = Nokogiri::HTML5.fragment('<span>아는 길도 물어가라</span>')
157
+ html = frag.serialize(encoding: 'US-ASCII')
158
+ puts html
159
+ # Prints: <span>&#xc544;&#xb294; &#xae38;&#xb3c4; &#xbb3c;&#xc5b4;&#xac00;&#xb77c;</span>
160
+ frag = Nokogiri::HTML5.fragment(html)
161
+ puts frag.serialize
162
+ # Prints: <span>아는 길도 물어가라</span>
163
+ ```
164
+
165
+ (There's a [bug](https://bugs.ruby-lang.org/issues/15033) in all current
166
+ versions of Ruby that can cause the entity encoding to fail. Of the mandated
167
+ supported encodings for HTML, the only encoding I'm aware of that has this bug
168
+ is `'ISO-2022-JP'`. I recommend avoiding this encoding.)
169
+
170
+ ## Examples
171
+ ```ruby
44
172
  require 'nokogumbo'
45
- Nokogiri::HTML5.parse(Nokogiri::HTML5.parse('<div></div> a').to_html).to_html
173
+ puts Nokogiri::HTML5.get('http://nokogiri.org').search('ol li')[2].text
46
174
  ```
47
175
 
48
- Notes
49
- -----
176
+ ## Notes
50
177
 
51
178
  * The `Nokogiri::HTML5.fragment` function takes a string and parses it
52
179
  as an HTML5 document. The `<html>`, `<head>`, and `<body>` elements are
@@ -74,20 +201,17 @@ rules defined in the HTML5 specification for doing so.
74
201
  * Instead of returning `unknown` as the element name for unknown tags, the
75
202
  original tag name is returned verbatim.
76
203
 
77
- * If the Gumbo HTML5 parser is not already installed, the source for the
78
- parser will be downloaded and compiled into the Gem itself.
79
-
80
- Installation
81
- ============
204
+ # Installation
82
205
 
83
- git clone --recursive https://github.com/rubys/nokogumbo.git
206
+ git clone https://github.com/rubys/nokogumbo.git
84
207
  cd nokogumbo
85
208
  bundle install
86
209
  rake gem
87
210
  gem install pkg/nokogumbo*.gem
88
211
 
89
- Related efforts
90
- ============
212
+ # Related efforts
91
213
 
92
- * [ruby-gumbo](https://github.com/nevir/ruby-gumbo#readme) - a ruby binding
93
- for the Gumbo HTML5 parser.
214
+ * [ruby-gumbo](https://github.com/nevir/ruby-gumbo#readme) -- a ruby binding
215
+ for the Gumbo HTML5 parser.
216
+ * [lua-gumbo](https://gitlab.com/craigbarnes/lua-gumbo) -- a lua binding for
217
+ the Gumbo HTML5 parser.
@@ -0,0 +1,116 @@
1
+ require 'fileutils'
2
+ require 'mkmf'
3
+ require 'nokogiri'
4
+
5
+ $CFLAGS += " -std=c99"
6
+ $LDFLAGS.gsub!('-Wl,--no-undefined', '')
7
+ $warnflags = CONFIG['warnflags'] = '-Wall'
8
+
9
+ NG_SPEC = Gem::Specification.find_by_name('nokogiri', "= #{Nokogiri::VERSION}")
10
+
11
+ def download_headers
12
+ begin
13
+ require 'yaml'
14
+
15
+ dependencies = YAML.load_file(File.join(NG_SPEC.gem_dir, 'dependencies.yml'))
16
+ version = dependencies['libxml2']['version']
17
+ host = RbConfig::CONFIG["host_alias"].empty? ? RbConfig::CONFIG["host"] : RbConfig::CONFIG["host_alias"]
18
+ path = File.join('ports', host, 'libxml2', version, 'include/libxml2')
19
+ return path if File.directory?(path)
20
+
21
+ # Make sure we're using the same version Nokogiri uses
22
+ dep_index = NG_SPEC.dependencies.index { |dep| dep.name == 'mini_portile2' and dep.type == :runtime }
23
+ return nil if dep_index.nil?
24
+ requirement = NG_SPEC.dependencies[dep_index].requirement.to_s
25
+
26
+ require 'rubygems'
27
+ gem 'mini_portile2', requirement
28
+ require 'mini_portile2'
29
+ p = MiniPortile::new('libxml2', version).tap do |r|
30
+ r.host = RbConfig::CONFIG["host_alias"].empty? ? RbConfig::CONFIG["host"] : RbConfig::CONFIG["host_alias"]
31
+ r.files = [{
32
+ url: "http://xmlsoft.org/sources/libxml2-#{r.version}.tar.gz",
33
+ sha256: dependencies['libxml2']['sha256']
34
+ }]
35
+ r.configure_options += [
36
+ "--without-python",
37
+ "--without-readline",
38
+ "--with-c14n",
39
+ "--with-debug",
40
+ "--with-threads"
41
+ ]
42
+ end
43
+ p.download unless p.downloaded?
44
+ p.extract
45
+ p.configure unless p.configured?
46
+ system('make', '-C', "tmp/#{p.host}/ports/libxml2/#{version}/libxml2-#{version}/include/libxml", 'install-xmlincHEADERS')
47
+ path
48
+ rescue
49
+ puts 'failed to download/install headers'
50
+ nil
51
+ end
52
+ end
53
+
54
+ required = arg_config('--with-libxml2')
55
+ prohibited = arg_config('--without-libxml2')
56
+ if required and prohibited
57
+ abort "cannot use both --with-libxml2 and --without-libxml2"
58
+ end
59
+
60
+ have_libxml2 = false
61
+ have_ng = false
62
+
63
+ if !prohibited
64
+ if Nokogiri::VERSION_INFO.include?('libxml') and
65
+ Nokogiri::VERSION_INFO['libxml']['source'] == 'packaged'
66
+ # Nokogiri has libxml2 built in. Find the headers.
67
+ libxml2_path = File.join(Nokogiri::VERSION_INFO['libxml']['libxml2_path'],
68
+ 'include/libxml2')
69
+ if find_header('libxml/tree.h', libxml2_path)
70
+ have_libxml2 = true
71
+ else
72
+ # Unfortunately, some versions of Nokogiri delete these files.
73
+ # https://github.com/sparklemotion/nokogiri/pull/1788
74
+ # Try to download them
75
+ libxml2_path = download_headers
76
+ unless libxml2_path.nil?
77
+ have_libxml2 = find_header('libxml/tree.h', libxml2_path)
78
+ end
79
+ end
80
+ else
81
+ # Nokogiri is compiled with system headers.
82
+ # Hack to work around broken mkmf on macOS
83
+ # (https://bugs.ruby-lang.org/issues/14992 fixed now)
84
+ if RbConfig::MAKEFILE_CONFIG['LIBPATHENV'] == 'DYLD_LIBRARY_PATH'
85
+ RbConfig::MAKEFILE_CONFIG['LIBPATHENV'] = 'DYLD_FALLBACK_LIBRARY_PATH'
86
+ end
87
+
88
+ pkg_config('libxml-2.0')
89
+ have_libxml2 = have_library('xml2', 'xmlNewDoc')
90
+ end
91
+ if required and !have_libxml2
92
+ abort "libxml2 required but could not be located"
93
+ end
94
+
95
+ if have_libxml2
96
+ # Find nokogiri.h
97
+ have_ng = find_header('nokogiri.h', File.join(NG_SPEC.gem_dir, 'ext/nokogiri'))
98
+ end
99
+ end
100
+
101
+ if have_libxml2 and have_ng
102
+ $CFLAGS += " -DNGLIB=1"
103
+ end
104
+
105
+ # Symlink gumbo-parser source files.
106
+ ext_dir = File.dirname(__FILE__)
107
+ gumbo_src = File.join(ext_dir, 'gumbo_src')
108
+
109
+ Dir.chdir(ext_dir) do
110
+ $srcs = Dir['*.c', '../../gumbo-parser/src/*.c']
111
+ end
112
+ $INCFLAGS << ' -I$(srcdir)/../../gumbo-parser/src'
113
+ $VPATH << '$(srcdir)/../../gumbo-parser/src'
114
+
115
+ create_makefile('nokogumbo/nokogumbo')
116
+ # vim: set sw=2 sts=2 ts=8 et:
@@ -2,7 +2,7 @@
2
2
  // nokogumbo.c defines the following:
3
3
  //
4
4
  // class Nokogumbo
5
- // def parse(utf8_string) # returns Nokogiri::HTML::Document
5
+ // def parse(utf8_string) # returns Nokogiri::HTML5::Document
6
6
  // end
7
7
  //
8
8
  // Processing starts by calling gumbo_parse_with_options. The resulting
@@ -18,26 +18,29 @@
18
18
  // methods are called instead, producing the equivalent functionality.
19
19
  //
20
20
 
21
+ #include <assert.h>
21
22
  #include <ruby.h>
22
23
  #include "gumbo.h"
23
24
  #include "error.h"
24
- #include "parser.h"
25
25
 
26
26
  // class constants
27
27
  static VALUE Document;
28
- static VALUE XMLSyntaxError;
29
28
 
30
29
  #ifdef NGLIB
31
30
  #include <nokogiri.h>
31
+ #include <xml_syntax_error.h>
32
32
  #include <libxml/tree.h>
33
+ #include <libxml/HTMLtree.h>
33
34
 
34
35
  #define NIL NULL
35
36
  #define CONST_CAST (xmlChar const*)
36
37
  #else
37
- #define NIL 0
38
+ #define NIL Qnil
38
39
  #define CONST_CAST
39
40
 
40
41
  // more class constants
42
+ static VALUE cNokogiriXmlSyntaxError;
43
+
41
44
  static VALUE Element;
42
45
  static VALUE Text;
43
46
  static VALUE CDATA;
@@ -45,11 +48,15 @@ static VALUE Comment;
45
48
 
46
49
  // interned symbols
47
50
  static VALUE new;
51
+ static VALUE attribute;
48
52
  static VALUE set_attribute;
53
+ static VALUE remove_attribute;
49
54
  static VALUE add_child;
50
55
  static VALUE internal_subset;
51
56
  static VALUE remove_;
52
57
  static VALUE create_internal_subset;
58
+ static VALUE key_;
59
+ static VALUE node_name_;
53
60
 
54
61
  // map libxml2 types to Ruby VALUE
55
62
  #define xmlNodePtr VALUE
@@ -58,12 +65,10 @@ static VALUE create_internal_subset;
58
65
  // redefine libxml2 API as Ruby function calls
59
66
  #define xmlNewDocNode(doc, ns, name, content) \
60
67
  rb_funcall(Element, new, 2, rb_str_new2(name), doc)
61
- #define xmlNewProp(element, name, value) \
62
- rb_funcall(element, set_attribute, 2, rb_str_new2(name), rb_str_new2(value))
63
68
  #define xmlNewDocText(doc, text) \
64
69
  rb_funcall(Text, new, 2, rb_str_new2(text), doc)
65
70
  #define xmlNewCDataBlock(doc, content, length) \
66
- rb_funcall(CDATA, new, 2, rb_str_new(content, length), doc)
71
+ rb_funcall(CDATA, new, 2, doc, rb_str_new(content, length))
67
72
  #define xmlNewDocComment(doc, text) \
68
73
  rb_funcall(Comment, new, 2, doc, rb_str_new2(text))
69
74
  #define xmlAddChild(element, node) \
@@ -77,11 +82,76 @@ static VALUE create_internal_subset;
77
82
  #define Nokogiri_wrap_xml_document(klass, doc) \
78
83
  doc
79
84
 
80
- // remove internal subset from newly created documents
81
- static VALUE xmlNewDoc(char* version) {
82
- VALUE doc = rb_funcall(Document, new, 0);
83
- rb_funcall(rb_funcall(doc, internal_subset, 0), remove_, 0);
84
- return doc;
85
+ static VALUE find_dummy_key(VALUE collection) {
86
+ VALUE r_dummy = Qnil;
87
+ char dummy[5] = "a";
88
+ size_t len = 1;
89
+ while (len < sizeof dummy) {
90
+ r_dummy = rb_str_new(dummy, len);
91
+ if (rb_funcall(collection, key_, 1, r_dummy) == Qfalse)
92
+ return r_dummy;
93
+ for (size_t i = 0; ; ++i) {
94
+ if (dummy[i] == 0) {
95
+ dummy[i] = 'a';
96
+ ++len;
97
+ break;
98
+ }
99
+ if (dummy[i] == 'z')
100
+ dummy[i] = 'a';
101
+ else {
102
+ ++dummy[i];
103
+ break;
104
+ }
105
+ }
106
+ }
107
+ // This collection has 475254 elements?? Give up.
108
+ return Qnil;
109
+ }
110
+
111
+ static xmlNodePtr xmlNewProp(xmlNodePtr node, const char *name, const char *value) {
112
+ // Nokogiri::XML::Node#set_attribute calls xmlSetProp(node, name, value)
113
+ // which behaves roughly as
114
+ // if name is a QName prefix:local
115
+ // if node->doc has a namespace ns corresponding to prefix
116
+ // return xmlSetNsProp(node, ns, local, value)
117
+ // return xmlSetNsProp(node, NULL, name, value)
118
+ //
119
+ // If the prefix is "xml", then the namespace lookup will create it.
120
+ //
121
+ // By contrast, xmlNewProp does not do this parsing and creates an attribute
122
+ // with the name and value exactly as given. This is the behavior that we
123
+ // want.
124
+ //
125
+ // Thus, for attribute names like "xml:lang", #set_attribute will create an
126
+ // attribute with namespace "xml" and name "lang". This is incorrect for
127
+ // html elements (but correct for foreign elements).
128
+ //
129
+ // Work around this by inserting a dummy attribute and then changing the
130
+ // name, if needed.
131
+
132
+ // Can't use strchr since it's locale-sensitive.
133
+ size_t len = strlen(name);
134
+ VALUE r_name = rb_str_new(name, len);
135
+ if (memchr(name, ':', len) == NULL) {
136
+ // No colon.
137
+ return rb_funcall(node, set_attribute, 2, r_name, rb_str_new2(value));
138
+ }
139
+ // Find a dummy attribute string that doesn't already exist.
140
+ VALUE dummy = find_dummy_key(node);
141
+ if (dummy == Qnil)
142
+ return Qnil;
143
+ // Add the dummy attribute.
144
+ VALUE r_value = rb_funcall(node, set_attribute, 2, dummy, rb_str_new2(value));
145
+ if (r_value == Qnil)
146
+ return Qnil;
147
+ // Remove the old attribute, if it exists.
148
+ rb_funcall(node, remove_attribute, 1, r_name);
149
+ // Rename the dummy
150
+ VALUE attr = rb_funcall(node, attribute, 1, dummy);
151
+ if (attr == Qnil)
152
+ return Qnil;
153
+ rb_funcall(attr, node_name_, 1, r_name);
154
+ return attr;
85
155
  }
86
156
  #endif
87
157
 
@@ -90,30 +160,15 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node);
90
160
 
91
161
  // Build a xmlNodePtr for a given GumboElement (recursively)
92
162
  static xmlNodePtr walk_element(xmlDocPtr document, GumboElement *node) {
93
- // determine tag name for a given node
94
- xmlNodePtr element;
95
- if (node->tag != GUMBO_TAG_UNKNOWN) {
96
- element = xmlNewDocNode(document, NIL,
97
- CONST_CAST gumbo_normalized_tagname(node->tag), NIL);
98
- } else {
99
- GumboStringPiece tag = node->original_tag;
100
- gumbo_tag_from_original_text(&tag);
101
- #ifdef _MSC_VER
102
- char* name = alloca(tag.length+1);
103
- #else
104
- char name[tag.length+1];
105
- #endif
106
- strncpy(name, tag.data, tag.length);
107
- name[tag.length] = '\0';
108
- element = xmlNewDocNode(document, NIL, CONST_CAST name, NIL);
109
- }
163
+ // create the given element
164
+ xmlNodePtr element = xmlNewDocNode(document, NIL, CONST_CAST node->name, NIL);
110
165
 
111
166
  // add in the attributes
112
167
  GumboVector* attrs = &node->attributes;
113
168
  char *name = NULL;
114
- int namelen = 0;
115
- char *ns;
116
- for (int i=0; i < attrs->length; i++) {
169
+ size_t namelen = 0;
170
+ const char *ns;
171
+ for (size_t i=0; i < attrs->length; i++) {
117
172
  GumboAttribute *attr = attrs->data[i];
118
173
 
119
174
  switch (attr->attr_namespace) {
@@ -156,7 +211,7 @@ static xmlNodePtr walk_element(xmlDocPtr document, GumboElement *node) {
156
211
 
157
212
  // add in the children
158
213
  GumboVector* children = &node->children;
159
- for (int i=0; i < children->length; i++) {
214
+ for (size_t i=0; i < children->length; i++) {
160
215
  xmlNodePtr node = walk_tree(document, children->data[i]);
161
216
  if (node) xmlAddChild(element, node);
162
217
  }
@@ -176,37 +231,89 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node) {
176
231
  return xmlNewDocText(document, CONST_CAST node->v.text.text);
177
232
  case GUMBO_NODE_CDATA:
178
233
  return xmlNewCDataBlock(document,
179
- CONST_CAST node->v.text.original_text.data,
180
- (int) node->v.text.original_text.length);
234
+ CONST_CAST node->v.text.text,
235
+ (int) strlen(node->v.text.text));
181
236
  case GUMBO_NODE_COMMENT:
182
237
  return xmlNewDocComment(document, CONST_CAST node->v.text.text);
183
238
  }
184
239
  }
185
240
 
241
+ // URI = system id
242
+ // external id = public id
243
+ #if NGLIB
244
+ static htmlDocPtr new_html_doc(const char *dtd_name, const char *system, const char *public)
245
+ {
246
+ // These two libxml2 functions take the public and system ids in
247
+ // opposite orders.
248
+ htmlDocPtr doc = htmlNewDocNoDtD(/* URI */ NULL, /* ExternalID */NULL);
249
+ assert(doc);
250
+ if (dtd_name)
251
+ xmlCreateIntSubset(doc, CONST_CAST dtd_name, CONST_CAST public, CONST_CAST system);
252
+ return doc;
253
+ }
254
+ #else
255
+ // remove internal subset from newly created documents
256
+ static VALUE new_html_doc(const char *dtd_name, const char *system, const char *public) {
257
+ VALUE doc;
258
+ // If system and public are both NULL, Document#new is going to set default
259
+ // values for them so we're going to have to remove the internal subset
260
+ // which seems to leak memory in Nokogiri, so leak as little as possible.
261
+ if (system == NULL && public == NULL) {
262
+ doc = rb_funcall(Document, new, 2, /* URI */ Qnil, /* external_id */ rb_str_new("", 0));
263
+ rb_funcall(rb_funcall(doc, internal_subset, 0), remove_, 0);
264
+ if (dtd_name) {
265
+ // We need to create an internal subset now.
266
+ rb_funcall(doc, create_internal_subset, 3, rb_str_new2(dtd_name), Qnil, Qnil);
267
+ }
268
+ } else {
269
+ assert(dtd_name);
270
+ // Rather than removing and creating the internal subset as we did above,
271
+ // just create and then rename one.
272
+ VALUE r_system = system ? rb_str_new2(system) : Qnil;
273
+ VALUE r_public = public ? rb_str_new2(public) : Qnil;
274
+ doc = rb_funcall(Document, new, 2, r_system, r_public);
275
+ rb_funcall(rb_funcall(doc, internal_subset, 0), node_name_, 1, rb_str_new2(dtd_name));
276
+ }
277
+ return doc;
278
+ }
279
+ #endif
280
+
186
281
  // Parse a string using gumbo_parse into a Nokogiri document
187
- static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) {
188
- GumboOptions options;
189
- memcpy(&options, &kGumboDefaultOptions, sizeof options);
190
- options.max_errors = NUM2INT(max_parse_errors);
282
+ static VALUE parse(VALUE self, VALUE string, VALUE url, VALUE max_errors, VALUE max_depth) {
283
+ GumboOptions options = kGumboDefaultOptions;
284
+ options.max_errors = NUM2INT(max_errors);
285
+ options.max_tree_depth = NUM2INT(max_depth);
191
286
 
192
287
  const char *input = RSTRING_PTR(string);
193
288
  size_t input_len = RSTRING_LEN(string);
194
289
  GumboOutput *output = gumbo_parse_with_options(&options, input, input_len);
195
- xmlDocPtr doc = xmlNewDoc(CONST_CAST "1.0");
196
- #ifdef NGLIB
197
- doc->type = XML_HTML_DOCUMENT_NODE;
198
- #endif
290
+
291
+ const char *status_string = gumbo_status_to_string(output->status);
292
+ switch (output->status) {
293
+ case GUMBO_STATUS_OK:
294
+ break;
295
+ case GUMBO_STATUS_TREE_TOO_DEEP:
296
+ gumbo_destroy_output(output);
297
+ rb_raise(rb_eArgError, "%s", status_string);
298
+ case GUMBO_STATUS_OUT_OF_MEMORY:
299
+ gumbo_destroy_output(output);
300
+ rb_raise(rb_eNoMemError, "%s", status_string);
301
+ }
302
+
303
+ xmlDocPtr doc;
199
304
  if (output->document->v.document.has_doctype) {
200
305
  const char *name = output->document->v.document.name;
201
306
  const char *public = output->document->v.document.public_identifier;
202
307
  const char *system = output->document->v.document.system_identifier;
203
- xmlCreateIntSubset(doc, CONST_CAST name,
204
- (public[0] ? CONST_CAST public : NIL),
205
- (system[0] ? CONST_CAST system : NIL));
308
+ public = public[0] ? public : NULL;
309
+ system = system[0] ? system : NULL;
310
+ doc = new_html_doc(name, system, public);
311
+ } else {
312
+ doc = new_html_doc(NULL, NULL, NULL);
206
313
  }
207
314
 
208
315
  GumboVector *children = &output->document->v.document.children;
209
- for (int i=0; i < children->length; i++) {
316
+ for (size_t i=0; i < children->length; i++) {
210
317
  GumboNode *child = children->data[i];
211
318
  xmlNodePtr node = walk_tree(doc, child);
212
319
  if (node) {
@@ -222,28 +329,20 @@ static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) {
222
329
  // Add parse errors to rdoc.
223
330
  if (output->errors.length) {
224
331
  GumboVector *errors = &output->errors;
225
- GumboParser parser = { ._options = &options };
226
332
  GumboStringBuffer msg;
227
333
  VALUE rerrors = rb_ary_new2(errors->length);
228
334
 
229
- gumbo_string_buffer_init(&parser, &msg);
230
- for (int i=0; i < errors->length; i++) {
335
+ gumbo_string_buffer_init(&msg);
336
+ for (size_t i=0; i < errors->length; i++) {
231
337
  GumboError *err = errors->data[i];
232
- gumbo_string_buffer_clear(&parser, &msg);
233
- // Work around bug in gumbo_caret_diagnostic_to_string.
234
- // See https://github.com/google/gumbo-parser/pull/371
235
- // The bug occurs when the error starts with a newline (unless it's the
236
- // first character in the input--but that shouldn't cause an error in
237
- // the first place.
238
- if (*err->original_text == '\n' && err->original_text != input)
239
- --err->original_text;
240
- gumbo_caret_diagnostic_to_string(&parser, err, input, &msg);
338
+ gumbo_string_buffer_clear(&msg);
339
+ gumbo_caret_diagnostic_to_string(err, input, input_len, &msg);
241
340
  VALUE err_str = rb_str_new(msg.data, msg.length);
242
- VALUE syntax_error = rb_class_new_instance(1, &err_str, XMLSyntaxError);
341
+ VALUE syntax_error = rb_class_new_instance(1, &err_str, cNokogiriXmlSyntaxError);
243
342
  rb_iv_set(syntax_error, "@domain", INT2NUM(1)); // XML_FROM_PARSER
244
343
  rb_iv_set(syntax_error, "@code", INT2NUM(1)); // XML_ERR_INTERNAL_ERROR
245
344
  rb_iv_set(syntax_error, "@level", INT2NUM(2)); // XML_ERR_ERROR
246
- rb_iv_set(syntax_error, "@file", Qnil);
345
+ rb_iv_set(syntax_error, "@file", url);
247
346
  rb_iv_set(syntax_error, "@line", INT2NUM(err->position.line));
248
347
  rb_iv_set(syntax_error, "@str1", Qnil);
249
348
  rb_iv_set(syntax_error, "@str2", Qnil);
@@ -253,28 +352,28 @@ static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) {
253
352
  rb_ary_push(rerrors, syntax_error);
254
353
  }
255
354
  rb_iv_set(rdoc, "@errors", rerrors);
256
- gumbo_string_buffer_destroy(&parser, &msg);
355
+ gumbo_string_buffer_destroy(&msg);
257
356
  }
258
357
 
259
- gumbo_destroy_output(&options, output);
358
+ gumbo_destroy_output(output);
260
359
 
261
360
  return rdoc;
262
361
  }
263
362
 
264
363
  // Initialize the Nokogumbo class and fetch constants we will use later
265
- void Init_nokogumboc() {
364
+ void Init_nokogumbo() {
266
365
  rb_funcall(rb_mKernel, rb_intern("gem"), 1, rb_str_new2("nokogiri"));
267
366
  rb_require("nokogiri");
268
367
 
269
368
  // class constants
270
369
  VALUE Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri"));
271
- VALUE HTML = rb_const_get(Nokogiri, rb_intern("HTML"));
272
- Document = rb_const_get(HTML, rb_intern("Document"));
273
- VALUE XML = rb_const_get(Nokogiri, rb_intern("XML"));
274
- XMLSyntaxError = rb_const_get(XML, rb_intern("SyntaxError"));
370
+ VALUE HTML5 = rb_const_get(Nokogiri, rb_intern("HTML5"));
371
+ Document = rb_const_get(HTML5, rb_intern("Document"));
275
372
 
276
373
  #ifndef NGLIB
277
374
  // more class constants
375
+ VALUE XML = rb_const_get(Nokogiri, rb_intern("XML"));
376
+ cNokogiriXmlSyntaxError = rb_const_get(XML, rb_intern("SyntaxError"));
278
377
  Element = rb_const_get(XML, rb_intern("Element"));
279
378
  Text = rb_const_get(XML, rb_intern("Text"));
280
379
  CDATA = rb_const_get(XML, rb_intern("CDATA"));
@@ -282,14 +381,18 @@ void Init_nokogumboc() {
282
381
 
283
382
  // interned symbols
284
383
  new = rb_intern("new");
384
+ attribute = rb_intern("attribute");
285
385
  set_attribute = rb_intern("set_attribute");
386
+ remove_attribute = rb_intern("remove_attribute");
286
387
  add_child = rb_intern("add_child_node_and_reparent_attrs");
287
388
  internal_subset = rb_intern("internal_subset");
288
389
  remove_ = rb_intern("remove");
289
390
  create_internal_subset = rb_intern("create_internal_subset");
391
+ key_ = rb_intern("key?");
392
+ node_name_ = rb_intern("node_name=");
290
393
  #endif
291
394
 
292
- // define Nokogumbo class with a singleton parse method
293
- VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject);
294
- rb_define_singleton_method(Gumbo, "parse", parse, 2);
395
+ // define Nokogumbo module with a parse method
396
+ VALUE Gumbo = rb_define_module("Nokogumbo");
397
+ rb_define_singleton_method(Gumbo, "parse", parse, 4);
295
398
  }