nokogumbo 1.5.0 → 2.0.0.pre.alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +56 -0
  3. data/README.md +146 -22
  4. data/ext/nokogumbo/extconf.rb +116 -0
  5. data/ext/{nokogumboc → nokogumbo}/nokogumbo.c +174 -71
  6. data/gumbo-parser/src/ascii.c +33 -0
  7. data/gumbo-parser/src/ascii.h +31 -0
  8. data/gumbo-parser/src/attribute.c +26 -28
  9. data/gumbo-parser/src/attribute.h +3 -23
  10. data/gumbo-parser/src/char_ref.c +135 -2351
  11. data/gumbo-parser/src/char_ref.h +13 -29
  12. data/gumbo-parser/src/error.c +215 -133
  13. data/gumbo-parser/src/error.h +34 -49
  14. data/gumbo-parser/src/foreign_attrs.c +104 -0
  15. data/gumbo-parser/src/gumbo.h +506 -304
  16. data/gumbo-parser/src/insertion_mode.h +4 -28
  17. data/gumbo-parser/src/macros.h +91 -0
  18. data/gumbo-parser/src/parser.c +1989 -1431
  19. data/gumbo-parser/src/parser.h +6 -22
  20. data/gumbo-parser/src/replacement.h +33 -0
  21. data/gumbo-parser/src/string_buffer.c +43 -50
  22. data/gumbo-parser/src/string_buffer.h +24 -40
  23. data/gumbo-parser/src/string_piece.c +39 -39
  24. data/gumbo-parser/src/svg_attrs.c +174 -0
  25. data/gumbo-parser/src/svg_tags.c +137 -0
  26. data/gumbo-parser/src/tag.c +186 -59
  27. data/gumbo-parser/src/tag_lookup.c +382 -0
  28. data/gumbo-parser/src/tag_lookup.h +13 -0
  29. data/gumbo-parser/src/token_type.h +1 -25
  30. data/gumbo-parser/src/tokenizer.c +899 -495
  31. data/gumbo-parser/src/tokenizer.h +37 -37
  32. data/gumbo-parser/src/tokenizer_states.h +6 -22
  33. data/gumbo-parser/src/utf8.c +103 -86
  34. data/gumbo-parser/src/utf8.h +37 -41
  35. data/gumbo-parser/src/util.c +48 -38
  36. data/gumbo-parser/src/util.h +10 -40
  37. data/gumbo-parser/src/vector.c +45 -57
  38. data/gumbo-parser/src/vector.h +17 -39
  39. data/lib/nokogumbo.rb +10 -174
  40. data/lib/nokogumbo/html5.rb +250 -0
  41. data/lib/nokogumbo/html5/document.rb +37 -0
  42. data/lib/nokogumbo/html5/document_fragment.rb +46 -0
  43. data/lib/nokogumbo/version.rb +3 -0
  44. data/lib/nokogumbo/xml/node.rb +57 -0
  45. metadata +32 -19
  46. data/ext/nokogumboc/extconf.rb +0 -60
  47. data/gumbo-parser/src/char_ref.rl +0 -2554
  48. data/gumbo-parser/src/string_piece.h +0 -38
  49. data/gumbo-parser/src/tag.in +0 -150
  50. data/gumbo-parser/src/tag_enum.h +0 -153
  51. data/gumbo-parser/src/tag_gperf.h +0 -105
  52. data/gumbo-parser/src/tag_sizes.h +0 -4
  53. data/gumbo-parser/src/tag_strings.h +0 -153
  54. data/gumbo-parser/visualc/include/strings.h +0 -4
  55. data/test-nokogumbo.rb +0 -190
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 96fa61565f78d5491e0b6d5b505cf936524745eb848b8b6584fc15e20c7ae35b
4
- data.tar.gz: e5416f71bbe90323f04b8aad4dc48b28947e43a9eb46f446f8ca1444f519a07b
3
+ metadata.gz: e0d434c0749d7922ba8f084c15ed7219ccbf0e07b715368ae846bc38e64aad17
4
+ data.tar.gz: 2770648e3e9e82d0ffb1877f1c06edc537688cf6a8405bc52dbdf5a6bb69bc1a
5
5
  SHA512:
6
- metadata.gz: 676bf3585d38cd4ad5c72b8b3afd4952e248c747683ae1072dd43f6ce1ccd279177e4d0c75a9821ed76d32806333128152231349d8d113ae5d81279580b13004
7
- data.tar.gz: 3459078d96977399e75551c4a3ee5623091f48569984b771e540ec111125f5af91e39a8d78cbd3ce9280326b1b9395dc4a0b0d7f0a72294876682cb9fe35e3d9
6
+ metadata.gz: e6c3de49495bf55ccaa250e2a3275b6796b0f0565da2a930e3333d2a153f2a16312eb77cb28ca3e03c17720127c2ecc27a1f71cfd6acfd15407295c29973e9fb
7
+ data.tar.gz: e8ce6c80cb2327d2327f03c7e829156c1f0074ba4d6fce2b0d59305b80112b8fd5edc0932fad1fca13cb5f4bb6f2652fe52a2f090110aa76d06e1afbdebc334f
data/CHANGELOG.md ADDED
@@ -0,0 +1,56 @@
1
+ # Changelog
2
+
3
+ All notable changes to Nokogumbo will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
6
+ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
9
+ ### Added
10
+ - Experimental support for errors (it was supported in 1.5.0 but
11
+ undocumented).
12
+ - Added proper HTML5 serialization.
13
+ - Added option `:max_tree_depth` to control the maximum parse tree depth.
14
+
15
+ ### Changed
16
+ - Integrated [Gumbo parser](https://github.com/google/gumbo-parser) into
17
+ Nokogumbo. A system version will not be used.
18
+ - The undocumented (but publicly mentioned) `:max_parse_errors` renamed to `:max_errors`;
19
+ `:max_parse_errors` is deprecated and will go away.
20
+ - The various `#parse` and `#fragment` (and `Nokogiri.HTML5`) methods return
21
+ `Nokogiri::HTML5::Document` and `Nokogiri::HTML5::DocumentFragment` classes
22
+ rather than `Nokogiri::HTML::Document` and
23
+ `Nokogiri::HTML::DocumentFragment`.
24
+ - Changed the top-level API to more closely match Nokogiri's while maintaining
25
+ backwards compatibility. The new APIs are
26
+ * `Nokogiri::HTML5(html, url = nil, encoding = nil, **options, &block)`
27
+ * `Nokogiri::HTML5.parse(html, url = nil, encoding = nil, **options, &block)`
28
+ * `Nokogiri::HTML5::Document.parse(html, url = nil, encoding = nil, **options, &block)`
29
+ * `Nokogiri::HTML5.fragment(html, encoding = nil, **options)`
30
+ * `Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, **options)`
31
+ In all cases, `html` can be a string or an `IO` object (something that
32
+ responds to `#read`). The `url` parameter is entirely for error reporting,
33
+ as in Nokogiri. The `encoding` parameter only signals what encoding `html`
34
+ should have on input; the output `Document` or `DocumentFragment` will be in
35
+ UTF-8. Currently, the only option supported is `:max_errors`, which controls
36
+ the maximum number of errors reported by `#errors`.
37
+
38
+ ### Deprecated
39
+ - `:max_parse_errors`; use `:max_errors`
40
+
41
+ ### Removed
42
+
43
+ ### Fixed
44
+ - Fixed documents failing to serialize (via `to_html`) if they contain certain
45
+ `meta` elements that set the `charset`.
46
+ - Documents are now properly marked as UTF-8 after parsing.
47
+ - Fixed `Nokogiri::HTML5.fragment` reporting an error due to a missing
48
+ `<!DOCTYPE html>`.
49
+ - Fixed crash when input contains U+0000 NULL bytes and error reporting is
50
+ enabled.
51
+
52
+ ### Security
53
+ - The most recent, released version of Gumbo has a [potential security
54
+ issue](https://github.com/google/gumbo-parser/pull/375) that could result in
55
+ a cross-site scripting vulnerability. This has been fixed by integrating
56
+ Gumbo into Nokogumbo.
data/README.md CHANGED
@@ -1,5 +1,4 @@
1
- Nokogumbo - a Nokogiri interface to the Gumbo HTML5 parser.
2
- ===========
1
+ # Nokogumbo - a Nokogiri interface to the Gumbo HTML5 parser.
3
2
 
4
3
  Nokogumbo provides the ability for a Ruby program to invoke the
5
4
  [Gumbo HTML5 parser](https://github.com/google/gumbo-parser#readme)
@@ -8,12 +7,11 @@ and to access the result as a
8
7
 
9
8
  [![Build Status](https://travis-ci.org/rubys/nokogumbo.svg)](https://travis-ci.org/rubys/nokogumbo)
10
9
 
11
- Usage
12
- -----
10
+ ## Usage
13
11
 
14
12
  ```ruby
15
13
  require 'nokogumbo'
16
- doc = Nokogiri::HTML5(string)
14
+ doc = Nokogiri.HTML5(string)
17
15
  ```
18
16
 
19
17
  An experimental _fragment_ method is also provided. While not HTML5
@@ -32,21 +30,150 @@ require 'nokogumbo'
32
30
  doc = Nokogiri::HTML5.get(uri)
33
31
  ```
34
32
 
35
- Example
36
- -----
33
+ ## Parsing options
34
+ The document and fragment parsing methods,
35
+ - `Nokogiri.HTML5(html, url = nil, encoding = nil, options = {})`
36
+ - `Nokogiri::HTML5.parse(html, url = nil, encoding = nil, options = {})`
37
+ - `Nokogiri::HTML5::Document.parse(html, url = nil, encoding = nil, options = {})`
38
+ - `Nokogiri::HTML5.fragment(html, encoding = nil, options = {})`
39
+ - `Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, options = {})`
40
+ support options that are different from Nokogiri's.
41
+
42
+ The two currently supported options are `:max_errors` and `:max_tree_depth`,
43
+ described below.
44
+
45
+ ### Error reporting
46
+ Nokogumbo contains an experimental parse error reporting facility. By default,
47
+ no parse errors are reported but this can be configured by passing the
48
+ `:max_errors` option to `::parse` or `::fragment`.
49
+
37
50
  ```ruby
38
51
  require 'nokogumbo'
39
- puts Nokogiri::HTML5.get('http://nokogiri.org').search('ol li')[2].text
52
+ doc = Nokogiri::HTML5.parse('Hi there!<body>', max_errors: 10)
53
+ doc.errors.each do |err|
54
+ puts err
55
+ end
56
+ ```
57
+
58
+ This prints the following.
59
+ ```
60
+ 1:1: ERROR: @1:1: The doctype must be the first token in the document.
61
+ Hi there!<body>
62
+ ^
63
+ 1:10: ERROR: @1:10: That tag isn't allowed here Currently open tags: html, body..
64
+ Hi there!<body>
65
+ ^
66
+ ```
67
+
68
+ Using `max_errors: -1` results in an unlimited number of errors being
69
+ returned.
70
+
71
+ The errors returned by `#errors` are instances of
72
+ [`Nokogiri::XML::SyntaxError`](https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/XML/SyntaxError).
73
+
74
+ ### Maximum tree depth
75
+ The maximum depth of the DOM tree parsed by the various parsing methods is
76
+ configurable by the `:max_tree_depth` option. If the depth of the tree would
77
+ exceed this limit, then an
78
+ [ArgumentError](https://ruby-doc.org/core-2.5.0/ArgumentError.html) is thrown.
79
+
80
+ This limit (which defaults to `Nokogumbo::DEFAULT_MAX_TREE_DEPTH = 400`) can
81
+ be removed by giving the option `max_tree_depth: -1`.
82
+
83
+ ``` ruby
84
+ html = '<!DOCTYPE html>' + '<div>' * 1000
85
+ doc = Nokogiri.HTML5(html)
86
+ # raises ArgumentError: Document tree depth limit exceeded
87
+ doc = Nokogiri.HTML5(html, max_tree_depth: -1)
40
88
  ```
41
89
 
42
- Use `.to_html` instead of `.to_s` when parsing and serializing multiple times
90
+ ## HTML Serialization
91
+
92
+ After parsing HTML, it may be serialized using any of the Nokogiri
93
+ [serialization
94
+ methods](https://www.rubydoc.info/gems/nokogiri/Nokogiri/XML/Node). In
95
+ particular, `#serialize`, `#to_html`, and `#to_s` will serialize a given node
96
+ and its children. (This is the equivalent of JavaScript's
97
+ `Element.outerHTML`.) Similarly, `#inner_html` will serialize the children of
98
+ a given node. (This is the equivalent of JavaScript's `Element.innerHTML`.)
99
+
100
+ ``` ruby
101
+ doc = Nokogiri::HTML5("<!DOCTYPE html><span>Hello world!</span>")
102
+ puts doc.serialize
103
+ # Prints: <!DOCTYPE html><html><head></head><body><span>Hello world!</span></body></html>
104
+ ```
105
+
106
+ Due to quirks in how HTML is parsed and serialized, it's possible for a DOM
107
+ tree to be serialized and then re-parsed, resulting in a different DOM.
108
+ Mostly, this happens with DOMs produced from invalid HTML. Unfortunately, even
109
+ valid HTML may not survive serialization and re-parsing.
110
+
111
+ In particular, a newline at the start of `pre`, `listing`, and `textarea`
112
+ elements is ignored by the parser.
113
+
114
+ ``` ruby
115
+ doc = Nokogiri::HTML5(<<-EOF)
116
+ <!DOCTYPE html>
117
+ <pre>
118
+ Content</pre>
119
+ EOF
120
+ puts doc.at('/html/body/pre').serialize
121
+ # Prints: <pre>Content</pre>
43
122
  ```
123
+
124
+ In this case, the original HTML is semantically equivalent to the serialized
125
+ version. If the `pre`, `listing`, or `textarea` content starts with two
126
+ newlines, the first newline will be stripped on the first parse and the second
127
+ newline will be stripped on the second, leading to semantically different
128
+ DOMs. Passing the parameter `preserve_newline: true` will cause two or more
129
+ newlines to be preserved. (A single leading newline will still be removed.)
130
+
131
+ ``` ruby
132
+ doc = Nokogiri::HTML5(<<-EOF)
133
+ <!DOCTYPE html>
134
+ <listing>
135
+
136
+ Content</listing>
137
+ EOF
138
+ puts doc.at('/html/body/listing').serialize(preserve_newline: true)
139
+ # Prints: <listing>
140
+ #
141
+ # Content</listing>
142
+ ```
143
+
144
+ ## Encodings
145
+ Nokogumbo always parses HTML using
146
+ [UTF-8](https://en.wikipedia.org/wiki/UTF-8); however, the encoding of the
147
+ input can be explicitly selected via the optional `encoding` parameter. This
148
+ is most useful when the input comes not from a string but from an IO object.
149
+
150
+ When serializing a document or node, the encoding of the output string can be
151
+ specified via the `:encoding` options. Characters that cannot be encoded in
152
+ the selected encoding will be encoded as [HTML numeric
153
+ entities](https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references).
154
+
155
+ ``` ruby
156
+ frag = Nokogiri::HTML5.fragment('<span>아는 길도 물어가라</span>')
157
+ html = frag.serialize(encoding: 'US-ASCII')
158
+ puts html
159
+ # Prints: <span>&#xc544;&#xb294; &#xae38;&#xb3c4; &#xbb3c;&#xc5b4;&#xac00;&#xb77c;</span>
160
+ frag = Nokogiri::HTML5.fragment(html)
161
+ puts frag.serialize
162
+ # Prints: <span>아는 길도 물어가라</span>
163
+ ```
164
+
165
+ (There's a [bug](https://bugs.ruby-lang.org/issues/15033) in all current
166
+ versions of Ruby that can cause the entity encoding to fail. Of the mandated
167
+ supported encodings for HTML, the only encoding I'm aware of that has this bug
168
+ is `'ISO-2022-JP'`. I recommend avoiding this encoding.)
169
+
170
+ ## Examples
171
+ ```ruby
44
172
  require 'nokogumbo'
45
- Nokogiri::HTML5.parse(Nokogiri::HTML5.parse('<div></div> a').to_html).to_html
173
+ puts Nokogiri::HTML5.get('http://nokogiri.org').search('ol li')[2].text
46
174
  ```
47
175
 
48
- Notes
49
- -----
176
+ ## Notes
50
177
 
51
178
  * The `Nokogiri::HTML5.fragment` function takes a string and parses it
52
179
  as an HTML5 document. The `<html>`, `<head>`, and `<body>` elements are
@@ -74,20 +201,17 @@ rules defined in the HTML5 specification for doing so.
74
201
  * Instead of returning `unknown` as the element name for unknown tags, the
75
202
  original tag name is returned verbatim.
76
203
 
77
- * If the Gumbo HTML5 parser is not already installed, the source for the
78
- parser will be downloaded and compiled into the Gem itself.
79
-
80
- Installation
81
- ============
204
+ # Installation
82
205
 
83
- git clone --recursive https://github.com/rubys/nokogumbo.git
206
+ git clone https://github.com/rubys/nokogumbo.git
84
207
  cd nokogumbo
85
208
  bundle install
86
209
  rake gem
87
210
  gem install pkg/nokogumbo*.gem
88
211
 
89
- Related efforts
90
- ============
212
+ # Related efforts
91
213
 
92
- * [ruby-gumbo](https://github.com/nevir/ruby-gumbo#readme) - a ruby binding
93
- for the Gumbo HTML5 parser.
214
+ * [ruby-gumbo](https://github.com/nevir/ruby-gumbo#readme) -- a ruby binding
215
+ for the Gumbo HTML5 parser.
216
+ * [lua-gumbo](https://gitlab.com/craigbarnes/lua-gumbo) -- a lua binding for
217
+ the Gumbo HTML5 parser.
@@ -0,0 +1,116 @@
1
+ require 'fileutils'
2
+ require 'mkmf'
3
+ require 'nokogiri'
4
+
5
+ $CFLAGS += " -std=c99"
6
+ $LDFLAGS.gsub!('-Wl,--no-undefined', '')
7
+ $warnflags = CONFIG['warnflags'] = '-Wall'
8
+
9
+ NG_SPEC = Gem::Specification.find_by_name('nokogiri', "= #{Nokogiri::VERSION}")
10
+
11
+ def download_headers
12
+ begin
13
+ require 'yaml'
14
+
15
+ dependencies = YAML.load_file(File.join(NG_SPEC.gem_dir, 'dependencies.yml'))
16
+ version = dependencies['libxml2']['version']
17
+ host = RbConfig::CONFIG["host_alias"].empty? ? RbConfig::CONFIG["host"] : RbConfig::CONFIG["host_alias"]
18
+ path = File.join('ports', host, 'libxml2', version, 'include/libxml2')
19
+ return path if File.directory?(path)
20
+
21
+ # Make sure we're using the same version Nokogiri uses
22
+ dep_index = NG_SPEC.dependencies.index { |dep| dep.name == 'mini_portile2' and dep.type == :runtime }
23
+ return nil if dep_index.nil?
24
+ requirement = NG_SPEC.dependencies[dep_index].requirement.to_s
25
+
26
+ require 'rubygems'
27
+ gem 'mini_portile2', requirement
28
+ require 'mini_portile2'
29
+ p = MiniPortile::new('libxml2', version).tap do |r|
30
+ r.host = RbConfig::CONFIG["host_alias"].empty? ? RbConfig::CONFIG["host"] : RbConfig::CONFIG["host_alias"]
31
+ r.files = [{
32
+ url: "http://xmlsoft.org/sources/libxml2-#{r.version}.tar.gz",
33
+ sha256: dependencies['libxml2']['sha256']
34
+ }]
35
+ r.configure_options += [
36
+ "--without-python",
37
+ "--without-readline",
38
+ "--with-c14n",
39
+ "--with-debug",
40
+ "--with-threads"
41
+ ]
42
+ end
43
+ p.download unless p.downloaded?
44
+ p.extract
45
+ p.configure unless p.configured?
46
+ system('make', '-C', "tmp/#{p.host}/ports/libxml2/#{version}/libxml2-#{version}/include/libxml", 'install-xmlincHEADERS')
47
+ path
48
+ rescue
49
+ puts 'failed to download/install headers'
50
+ nil
51
+ end
52
+ end
53
+
54
+ required = arg_config('--with-libxml2')
55
+ prohibited = arg_config('--without-libxml2')
56
+ if required and prohibited
57
+ abort "cannot use both --with-libxml2 and --without-libxml2"
58
+ end
59
+
60
+ have_libxml2 = false
61
+ have_ng = false
62
+
63
+ if !prohibited
64
+ if Nokogiri::VERSION_INFO.include?('libxml') and
65
+ Nokogiri::VERSION_INFO['libxml']['source'] == 'packaged'
66
+ # Nokogiri has libxml2 built in. Find the headers.
67
+ libxml2_path = File.join(Nokogiri::VERSION_INFO['libxml']['libxml2_path'],
68
+ 'include/libxml2')
69
+ if find_header('libxml/tree.h', libxml2_path)
70
+ have_libxml2 = true
71
+ else
72
+ # Unfortunately, some versions of Nokogiri delete these files.
73
+ # https://github.com/sparklemotion/nokogiri/pull/1788
74
+ # Try to download them
75
+ libxml2_path = download_headers
76
+ unless libxml2_path.nil?
77
+ have_libxml2 = find_header('libxml/tree.h', libxml2_path)
78
+ end
79
+ end
80
+ else
81
+ # Nokogiri is compiled with system headers.
82
+ # Hack to work around broken mkmf on macOS
83
+ # (https://bugs.ruby-lang.org/issues/14992 fixed now)
84
+ if RbConfig::MAKEFILE_CONFIG['LIBPATHENV'] == 'DYLD_LIBRARY_PATH'
85
+ RbConfig::MAKEFILE_CONFIG['LIBPATHENV'] = 'DYLD_FALLBACK_LIBRARY_PATH'
86
+ end
87
+
88
+ pkg_config('libxml-2.0')
89
+ have_libxml2 = have_library('xml2', 'xmlNewDoc')
90
+ end
91
+ if required and !have_libxml2
92
+ abort "libxml2 required but could not be located"
93
+ end
94
+
95
+ if have_libxml2
96
+ # Find nokogiri.h
97
+ have_ng = find_header('nokogiri.h', File.join(NG_SPEC.gem_dir, 'ext/nokogiri'))
98
+ end
99
+ end
100
+
101
+ if have_libxml2 and have_ng
102
+ $CFLAGS += " -DNGLIB=1"
103
+ end
104
+
105
+ # Symlink gumbo-parser source files.
106
+ ext_dir = File.dirname(__FILE__)
107
+ gumbo_src = File.join(ext_dir, 'gumbo_src')
108
+
109
+ Dir.chdir(ext_dir) do
110
+ $srcs = Dir['*.c', '../../gumbo-parser/src/*.c']
111
+ end
112
+ $INCFLAGS << ' -I$(srcdir)/../../gumbo-parser/src'
113
+ $VPATH << '$(srcdir)/../../gumbo-parser/src'
114
+
115
+ create_makefile('nokogumbo/nokogumbo')
116
+ # vim: set sw=2 sts=2 ts=8 et:
@@ -2,7 +2,7 @@
2
2
  // nokogumbo.c defines the following:
3
3
  //
4
4
  // class Nokogumbo
5
- // def parse(utf8_string) # returns Nokogiri::HTML::Document
5
+ // def parse(utf8_string) # returns Nokogiri::HTML5::Document
6
6
  // end
7
7
  //
8
8
  // Processing starts by calling gumbo_parse_with_options. The resulting
@@ -18,26 +18,29 @@
18
18
  // methods are called instead, producing the equivalent functionality.
19
19
  //
20
20
 
21
+ #include <assert.h>
21
22
  #include <ruby.h>
22
23
  #include "gumbo.h"
23
24
  #include "error.h"
24
- #include "parser.h"
25
25
 
26
26
  // class constants
27
27
  static VALUE Document;
28
- static VALUE XMLSyntaxError;
29
28
 
30
29
  #ifdef NGLIB
31
30
  #include <nokogiri.h>
31
+ #include <xml_syntax_error.h>
32
32
  #include <libxml/tree.h>
33
+ #include <libxml/HTMLtree.h>
33
34
 
34
35
  #define NIL NULL
35
36
  #define CONST_CAST (xmlChar const*)
36
37
  #else
37
- #define NIL 0
38
+ #define NIL Qnil
38
39
  #define CONST_CAST
39
40
 
40
41
  // more class constants
42
+ static VALUE cNokogiriXmlSyntaxError;
43
+
41
44
  static VALUE Element;
42
45
  static VALUE Text;
43
46
  static VALUE CDATA;
@@ -45,11 +48,15 @@ static VALUE Comment;
45
48
 
46
49
  // interned symbols
47
50
  static VALUE new;
51
+ static VALUE attribute;
48
52
  static VALUE set_attribute;
53
+ static VALUE remove_attribute;
49
54
  static VALUE add_child;
50
55
  static VALUE internal_subset;
51
56
  static VALUE remove_;
52
57
  static VALUE create_internal_subset;
58
+ static VALUE key_;
59
+ static VALUE node_name_;
53
60
 
54
61
  // map libxml2 types to Ruby VALUE
55
62
  #define xmlNodePtr VALUE
@@ -58,12 +65,10 @@ static VALUE create_internal_subset;
58
65
  // redefine libxml2 API as Ruby function calls
59
66
  #define xmlNewDocNode(doc, ns, name, content) \
60
67
  rb_funcall(Element, new, 2, rb_str_new2(name), doc)
61
- #define xmlNewProp(element, name, value) \
62
- rb_funcall(element, set_attribute, 2, rb_str_new2(name), rb_str_new2(value))
63
68
  #define xmlNewDocText(doc, text) \
64
69
  rb_funcall(Text, new, 2, rb_str_new2(text), doc)
65
70
  #define xmlNewCDataBlock(doc, content, length) \
66
- rb_funcall(CDATA, new, 2, rb_str_new(content, length), doc)
71
+ rb_funcall(CDATA, new, 2, doc, rb_str_new(content, length))
67
72
  #define xmlNewDocComment(doc, text) \
68
73
  rb_funcall(Comment, new, 2, doc, rb_str_new2(text))
69
74
  #define xmlAddChild(element, node) \
@@ -77,11 +82,76 @@ static VALUE create_internal_subset;
77
82
  #define Nokogiri_wrap_xml_document(klass, doc) \
78
83
  doc
79
84
 
80
- // remove internal subset from newly created documents
81
- static VALUE xmlNewDoc(char* version) {
82
- VALUE doc = rb_funcall(Document, new, 0);
83
- rb_funcall(rb_funcall(doc, internal_subset, 0), remove_, 0);
84
- return doc;
85
+ static VALUE find_dummy_key(VALUE collection) {
86
+ VALUE r_dummy = Qnil;
87
+ char dummy[5] = "a";
88
+ size_t len = 1;
89
+ while (len < sizeof dummy) {
90
+ r_dummy = rb_str_new(dummy, len);
91
+ if (rb_funcall(collection, key_, 1, r_dummy) == Qfalse)
92
+ return r_dummy;
93
+ for (size_t i = 0; ; ++i) {
94
+ if (dummy[i] == 0) {
95
+ dummy[i] = 'a';
96
+ ++len;
97
+ break;
98
+ }
99
+ if (dummy[i] == 'z')
100
+ dummy[i] = 'a';
101
+ else {
102
+ ++dummy[i];
103
+ break;
104
+ }
105
+ }
106
+ }
107
+ // This collection has 475254 elements?? Give up.
108
+ return Qnil;
109
+ }
110
+
111
+ static xmlNodePtr xmlNewProp(xmlNodePtr node, const char *name, const char *value) {
112
+ // Nokogiri::XML::Node#set_attribute calls xmlSetProp(node, name, value)
113
+ // which behaves roughly as
114
+ // if name is a QName prefix:local
115
+ // if node->doc has a namespace ns corresponding to prefix
116
+ // return xmlSetNsProp(node, ns, local, value)
117
+ // return xmlSetNsProp(node, NULL, name, value)
118
+ //
119
+ // If the prefix is "xml", then the namespace lookup will create it.
120
+ //
121
+ // By contrast, xmlNewProp does not do this parsing and creates an attribute
122
+ // with the name and value exactly as given. This is the behavior that we
123
+ // want.
124
+ //
125
+ // Thus, for attribute names like "xml:lang", #set_attribute will create an
126
+ // attribute with namespace "xml" and name "lang". This is incorrect for
127
+ // html elements (but correct for foreign elements).
128
+ //
129
+ // Work around this by inserting a dummy attribute and then changing the
130
+ // name, if needed.
131
+
132
+ // Can't use strchr since it's locale-sensitive.
133
+ size_t len = strlen(name);
134
+ VALUE r_name = rb_str_new(name, len);
135
+ if (memchr(name, ':', len) == NULL) {
136
+ // No colon.
137
+ return rb_funcall(node, set_attribute, 2, r_name, rb_str_new2(value));
138
+ }
139
+ // Find a dummy attribute string that doesn't already exist.
140
+ VALUE dummy = find_dummy_key(node);
141
+ if (dummy == Qnil)
142
+ return Qnil;
143
+ // Add the dummy attribute.
144
+ VALUE r_value = rb_funcall(node, set_attribute, 2, dummy, rb_str_new2(value));
145
+ if (r_value == Qnil)
146
+ return Qnil;
147
+ // Remove the old attribute, if it exists.
148
+ rb_funcall(node, remove_attribute, 1, r_name);
149
+ // Rename the dummy
150
+ VALUE attr = rb_funcall(node, attribute, 1, dummy);
151
+ if (attr == Qnil)
152
+ return Qnil;
153
+ rb_funcall(attr, node_name_, 1, r_name);
154
+ return attr;
85
155
  }
86
156
  #endif
87
157
 
@@ -90,30 +160,15 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node);
90
160
 
91
161
  // Build a xmlNodePtr for a given GumboElement (recursively)
92
162
  static xmlNodePtr walk_element(xmlDocPtr document, GumboElement *node) {
93
- // determine tag name for a given node
94
- xmlNodePtr element;
95
- if (node->tag != GUMBO_TAG_UNKNOWN) {
96
- element = xmlNewDocNode(document, NIL,
97
- CONST_CAST gumbo_normalized_tagname(node->tag), NIL);
98
- } else {
99
- GumboStringPiece tag = node->original_tag;
100
- gumbo_tag_from_original_text(&tag);
101
- #ifdef _MSC_VER
102
- char* name = alloca(tag.length+1);
103
- #else
104
- char name[tag.length+1];
105
- #endif
106
- strncpy(name, tag.data, tag.length);
107
- name[tag.length] = '\0';
108
- element = xmlNewDocNode(document, NIL, CONST_CAST name, NIL);
109
- }
163
+ // create the given element
164
+ xmlNodePtr element = xmlNewDocNode(document, NIL, CONST_CAST node->name, NIL);
110
165
 
111
166
  // add in the attributes
112
167
  GumboVector* attrs = &node->attributes;
113
168
  char *name = NULL;
114
- int namelen = 0;
115
- char *ns;
116
- for (int i=0; i < attrs->length; i++) {
169
+ size_t namelen = 0;
170
+ const char *ns;
171
+ for (size_t i=0; i < attrs->length; i++) {
117
172
  GumboAttribute *attr = attrs->data[i];
118
173
 
119
174
  switch (attr->attr_namespace) {
@@ -156,7 +211,7 @@ static xmlNodePtr walk_element(xmlDocPtr document, GumboElement *node) {
156
211
 
157
212
  // add in the children
158
213
  GumboVector* children = &node->children;
159
- for (int i=0; i < children->length; i++) {
214
+ for (size_t i=0; i < children->length; i++) {
160
215
  xmlNodePtr node = walk_tree(document, children->data[i]);
161
216
  if (node) xmlAddChild(element, node);
162
217
  }
@@ -176,37 +231,89 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node) {
176
231
  return xmlNewDocText(document, CONST_CAST node->v.text.text);
177
232
  case GUMBO_NODE_CDATA:
178
233
  return xmlNewCDataBlock(document,
179
- CONST_CAST node->v.text.original_text.data,
180
- (int) node->v.text.original_text.length);
234
+ CONST_CAST node->v.text.text,
235
+ (int) strlen(node->v.text.text));
181
236
  case GUMBO_NODE_COMMENT:
182
237
  return xmlNewDocComment(document, CONST_CAST node->v.text.text);
183
238
  }
184
239
  }
185
240
 
241
+ // URI = system id
242
+ // external id = public id
243
+ #if NGLIB
244
+ static htmlDocPtr new_html_doc(const char *dtd_name, const char *system, const char *public)
245
+ {
246
+ // These two libxml2 functions take the public and system ids in
247
+ // opposite orders.
248
+ htmlDocPtr doc = htmlNewDocNoDtD(/* URI */ NULL, /* ExternalID */NULL);
249
+ assert(doc);
250
+ if (dtd_name)
251
+ xmlCreateIntSubset(doc, CONST_CAST dtd_name, CONST_CAST public, CONST_CAST system);
252
+ return doc;
253
+ }
254
+ #else
255
+ // remove internal subset from newly created documents
256
+ static VALUE new_html_doc(const char *dtd_name, const char *system, const char *public) {
257
+ VALUE doc;
258
+ // If system and public are both NULL, Document#new is going to set default
259
+ // values for them so we're going to have to remove the internal subset
260
+ // which seems to leak memory in Nokogiri, so leak as little as possible.
261
+ if (system == NULL && public == NULL) {
262
+ doc = rb_funcall(Document, new, 2, /* URI */ Qnil, /* external_id */ rb_str_new("", 0));
263
+ rb_funcall(rb_funcall(doc, internal_subset, 0), remove_, 0);
264
+ if (dtd_name) {
265
+ // We need to create an internal subset now.
266
+ rb_funcall(doc, create_internal_subset, 3, rb_str_new2(dtd_name), Qnil, Qnil);
267
+ }
268
+ } else {
269
+ assert(dtd_name);
270
+ // Rather than removing and creating the internal subset as we did above,
271
+ // just create and then rename one.
272
+ VALUE r_system = system ? rb_str_new2(system) : Qnil;
273
+ VALUE r_public = public ? rb_str_new2(public) : Qnil;
274
+ doc = rb_funcall(Document, new, 2, r_system, r_public);
275
+ rb_funcall(rb_funcall(doc, internal_subset, 0), node_name_, 1, rb_str_new2(dtd_name));
276
+ }
277
+ return doc;
278
+ }
279
+ #endif
280
+
186
281
  // Parse a string using gumbo_parse into a Nokogiri document
187
- static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) {
188
- GumboOptions options;
189
- memcpy(&options, &kGumboDefaultOptions, sizeof options);
190
- options.max_errors = NUM2INT(max_parse_errors);
282
+ static VALUE parse(VALUE self, VALUE string, VALUE url, VALUE max_errors, VALUE max_depth) {
283
+ GumboOptions options = kGumboDefaultOptions;
284
+ options.max_errors = NUM2INT(max_errors);
285
+ options.max_tree_depth = NUM2INT(max_depth);
191
286
 
192
287
  const char *input = RSTRING_PTR(string);
193
288
  size_t input_len = RSTRING_LEN(string);
194
289
  GumboOutput *output = gumbo_parse_with_options(&options, input, input_len);
195
- xmlDocPtr doc = xmlNewDoc(CONST_CAST "1.0");
196
- #ifdef NGLIB
197
- doc->type = XML_HTML_DOCUMENT_NODE;
198
- #endif
290
+
291
+ const char *status_string = gumbo_status_to_string(output->status);
292
+ switch (output->status) {
293
+ case GUMBO_STATUS_OK:
294
+ break;
295
+ case GUMBO_STATUS_TREE_TOO_DEEP:
296
+ gumbo_destroy_output(output);
297
+ rb_raise(rb_eArgError, "%s", status_string);
298
+ case GUMBO_STATUS_OUT_OF_MEMORY:
299
+ gumbo_destroy_output(output);
300
+ rb_raise(rb_eNoMemError, "%s", status_string);
301
+ }
302
+
303
+ xmlDocPtr doc;
199
304
  if (output->document->v.document.has_doctype) {
200
305
  const char *name = output->document->v.document.name;
201
306
  const char *public = output->document->v.document.public_identifier;
202
307
  const char *system = output->document->v.document.system_identifier;
203
- xmlCreateIntSubset(doc, CONST_CAST name,
204
- (public[0] ? CONST_CAST public : NIL),
205
- (system[0] ? CONST_CAST system : NIL));
308
+ public = public[0] ? public : NULL;
309
+ system = system[0] ? system : NULL;
310
+ doc = new_html_doc(name, system, public);
311
+ } else {
312
+ doc = new_html_doc(NULL, NULL, NULL);
206
313
  }
207
314
 
208
315
  GumboVector *children = &output->document->v.document.children;
209
- for (int i=0; i < children->length; i++) {
316
+ for (size_t i=0; i < children->length; i++) {
210
317
  GumboNode *child = children->data[i];
211
318
  xmlNodePtr node = walk_tree(doc, child);
212
319
  if (node) {
@@ -222,28 +329,20 @@ static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) {
222
329
  // Add parse errors to rdoc.
223
330
  if (output->errors.length) {
224
331
  GumboVector *errors = &output->errors;
225
- GumboParser parser = { ._options = &options };
226
332
  GumboStringBuffer msg;
227
333
  VALUE rerrors = rb_ary_new2(errors->length);
228
334
 
229
- gumbo_string_buffer_init(&parser, &msg);
230
- for (int i=0; i < errors->length; i++) {
335
+ gumbo_string_buffer_init(&msg);
336
+ for (size_t i=0; i < errors->length; i++) {
231
337
  GumboError *err = errors->data[i];
232
- gumbo_string_buffer_clear(&parser, &msg);
233
- // Work around bug in gumbo_caret_diagnostic_to_string.
234
- // See https://github.com/google/gumbo-parser/pull/371
235
- // The bug occurs when the error starts with a newline (unless it's the
236
- // first character in the input--but that shouldn't cause an error in
237
- // the first place.
238
- if (*err->original_text == '\n' && err->original_text != input)
239
- --err->original_text;
240
- gumbo_caret_diagnostic_to_string(&parser, err, input, &msg);
338
+ gumbo_string_buffer_clear(&msg);
339
+ gumbo_caret_diagnostic_to_string(err, input, input_len, &msg);
241
340
  VALUE err_str = rb_str_new(msg.data, msg.length);
242
- VALUE syntax_error = rb_class_new_instance(1, &err_str, XMLSyntaxError);
341
+ VALUE syntax_error = rb_class_new_instance(1, &err_str, cNokogiriXmlSyntaxError);
243
342
  rb_iv_set(syntax_error, "@domain", INT2NUM(1)); // XML_FROM_PARSER
244
343
  rb_iv_set(syntax_error, "@code", INT2NUM(1)); // XML_ERR_INTERNAL_ERROR
245
344
  rb_iv_set(syntax_error, "@level", INT2NUM(2)); // XML_ERR_ERROR
246
- rb_iv_set(syntax_error, "@file", Qnil);
345
+ rb_iv_set(syntax_error, "@file", url);
247
346
  rb_iv_set(syntax_error, "@line", INT2NUM(err->position.line));
248
347
  rb_iv_set(syntax_error, "@str1", Qnil);
249
348
  rb_iv_set(syntax_error, "@str2", Qnil);
@@ -253,28 +352,28 @@ static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) {
253
352
  rb_ary_push(rerrors, syntax_error);
254
353
  }
255
354
  rb_iv_set(rdoc, "@errors", rerrors);
256
- gumbo_string_buffer_destroy(&parser, &msg);
355
+ gumbo_string_buffer_destroy(&msg);
257
356
  }
258
357
 
259
- gumbo_destroy_output(&options, output);
358
+ gumbo_destroy_output(output);
260
359
 
261
360
  return rdoc;
262
361
  }
263
362
 
264
363
  // Initialize the Nokogumbo class and fetch constants we will use later
265
- void Init_nokogumboc() {
364
+ void Init_nokogumbo() {
266
365
  rb_funcall(rb_mKernel, rb_intern("gem"), 1, rb_str_new2("nokogiri"));
267
366
  rb_require("nokogiri");
268
367
 
269
368
  // class constants
270
369
  VALUE Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri"));
271
- VALUE HTML = rb_const_get(Nokogiri, rb_intern("HTML"));
272
- Document = rb_const_get(HTML, rb_intern("Document"));
273
- VALUE XML = rb_const_get(Nokogiri, rb_intern("XML"));
274
- XMLSyntaxError = rb_const_get(XML, rb_intern("SyntaxError"));
370
+ VALUE HTML5 = rb_const_get(Nokogiri, rb_intern("HTML5"));
371
+ Document = rb_const_get(HTML5, rb_intern("Document"));
275
372
 
276
373
  #ifndef NGLIB
277
374
  // more class constants
375
+ VALUE XML = rb_const_get(Nokogiri, rb_intern("XML"));
376
+ cNokogiriXmlSyntaxError = rb_const_get(XML, rb_intern("SyntaxError"));
278
377
  Element = rb_const_get(XML, rb_intern("Element"));
279
378
  Text = rb_const_get(XML, rb_intern("Text"));
280
379
  CDATA = rb_const_get(XML, rb_intern("CDATA"));
@@ -282,14 +381,18 @@ void Init_nokogumboc() {
282
381
 
283
382
  // interned symbols
284
383
  new = rb_intern("new");
384
+ attribute = rb_intern("attribute");
285
385
  set_attribute = rb_intern("set_attribute");
386
+ remove_attribute = rb_intern("remove_attribute");
286
387
  add_child = rb_intern("add_child_node_and_reparent_attrs");
287
388
  internal_subset = rb_intern("internal_subset");
288
389
  remove_ = rb_intern("remove");
289
390
  create_internal_subset = rb_intern("create_internal_subset");
391
+ key_ = rb_intern("key?");
392
+ node_name_ = rb_intern("node_name=");
290
393
  #endif
291
394
 
292
- // define Nokogumbo class with a singleton parse method
293
- VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject);
294
- rb_define_singleton_method(Gumbo, "parse", parse, 2);
395
+ // define Nokogumbo module with a parse method
396
+ VALUE Gumbo = rb_define_module("Nokogumbo");
397
+ rb_define_singleton_method(Gumbo, "parse", parse, 4);
295
398
  }