nokogiri 1.11.0.rc3 → 1.11.0.rc4

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (37)
  1. checksums.yaml +4 -4
  2. data/LICENSE-DEPENDENCIES.md +1015 -947
  3. data/README.md +1 -1
  4. data/ext/nokogiri/depend +476 -357
  5. data/ext/nokogiri/extconf.rb +441 -321
  6. data/ext/nokogiri/html_document.c +79 -78
  7. data/ext/nokogiri/html_sax_parser_context.c +2 -2
  8. data/ext/nokogiri/nokogiri.c +34 -46
  9. data/ext/nokogiri/nokogiri.h +22 -26
  10. data/ext/nokogiri/xml_document.c +2 -2
  11. data/ext/nokogiri/xml_node.c +1 -1
  12. data/ext/nokogiri/xml_node_set.c +1 -1
  13. data/ext/nokogiri/xml_relax_ng.c +29 -11
  14. data/ext/nokogiri/xml_sax_parser.c +2 -7
  15. data/ext/nokogiri/xml_sax_parser_context.c +2 -2
  16. data/ext/nokogiri/xml_schema.c +55 -13
  17. data/ext/nokogiri/xml_xpath_context.c +80 -4
  18. data/ext/nokogiri/xslt_stylesheet.c +1 -4
  19. data/lib/nokogiri.rb +1 -1
  20. data/lib/nokogiri/css/parser.rb +3 -3
  21. data/lib/nokogiri/css/parser.y +2 -2
  22. data/lib/nokogiri/css/xpath_visitor.rb +70 -42
  23. data/lib/nokogiri/html/document.rb +12 -26
  24. data/lib/nokogiri/version.rb +2 -149
  25. data/lib/nokogiri/version/constant.rb +5 -0
  26. data/lib/nokogiri/version/info.rb +182 -0
  27. data/lib/nokogiri/xml/document.rb +17 -7
  28. data/lib/nokogiri/xml/document_fragment.rb +4 -6
  29. data/lib/nokogiri/xml/node.rb +50 -27
  30. data/lib/nokogiri/xml/parse_options.rb +6 -0
  31. data/lib/nokogiri/xml/relax_ng.rb +6 -2
  32. data/lib/nokogiri/xml/schema.rb +12 -4
  33. data/lib/nokogiri/xml/searchable.rb +3 -1
  34. data/patches/libxml2/0006-htmlParseComment-treat-as-if-it-closed-the-comment.patch +73 -0
  35. data/patches/libxml2/0007-use-new-htmlParseLookupCommentEnd-to-find-comment-en.patch +103 -0
  36. data/patches/libxml2/0008-use-glibc-strlen.patch +53 -0
  37. metadata +34 -22
@@ -210,8 +210,10 @@ module Nokogiri
210
210
  end
211
211
 
212
212
  def xpath_query_from_css_rule(rule, ns)
213
+ visitor = Nokogiri::CSS::XPathVisitorOptimallyUseBuiltins.new
213
214
  self.class::IMPLIED_XPATH_CONTEXTS.map do |implied_xpath_context|
214
- CSS.xpath_for(rule.to_s, :prefix => implied_xpath_context, :ns => ns)
215
+ CSS.xpath_for(rule.to_s, {:prefix => implied_xpath_context, :ns => ns,
216
+ :visitor => visitor})
215
217
  end.join(" | ")
216
218
  end
217
219
 
@@ -0,0 +1,73 @@
1
+ From 4f51a6d2b1755ce5b36c524c215aad70d864ac1d Mon Sep 17 00:00:00 2001
2
+ From: Mike Dalessio <mike.dalessio@gmail.com>
3
+ Date: Mon, 3 Aug 2020 17:36:05 -0400
4
+ Subject: [PATCH 1/2] htmlParseComment: treat `--!>` as if it closed the
5
+ comment
6
+
7
+ See guidance provided on incorrectly-closed comments here:
8
+
9
+ https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment
10
+ ---
11
+ HTMLparser.c | 28 ++++++++++++++++++++--------
12
+ 1 file changed, 20 insertions(+), 8 deletions(-)
13
+
14
+ diff --git a/HTMLparser.c b/HTMLparser.c
15
+ index 7b6d689..4d43479 100644
16
+ --- a/HTMLparser.c
17
+ +++ b/HTMLparser.c
18
+ @@ -3300,6 +3300,7 @@ htmlParseComment(htmlParserCtxtPtr ctxt) {
19
+ int q, ql;
20
+ int r, rl;
21
+ int cur, l;
22
+ + int next, nl;
23
+ xmlParserInputState state;
24
+
25
+ /*
26
+ @@ -3332,6 +3333,21 @@ htmlParseComment(htmlParserCtxtPtr ctxt) {
27
+ while (IS_CHAR(cur) &&
28
+ ((cur != '>') ||
29
+ (r != '-') || (q != '-'))) {
30
+ + NEXTL(l);
31
+ + next = CUR_CHAR(nl);
32
+ + if (next == 0) {
33
+ + SHRINK;
34
+ + GROW;
35
+ + next = CUR_CHAR(nl);
36
+ + }
37
+ +
38
+ + if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
39
+ + htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
40
+ + "Comment incorrectly closed by '--!>'", NULL, NULL);
41
+ + cur = '>';
42
+ + break;
43
+ + }
44
+ +
45
+ if (len + 5 >= size) {
46
+ xmlChar *tmp;
47
+
48
+ @@ -3345,18 +3361,14 @@ htmlParseComment(htmlParserCtxtPtr ctxt) {
49
+ }
50
+ buf = tmp;
51
+ }
52
+ - COPY_BUF(ql,buf,len,q);
53
+ + COPY_BUF(ql,buf,len,q);
54
+ +
55
+ q = r;
56
+ ql = rl;
57
+ r = cur;
58
+ rl = l;
59
+ - NEXTL(l);
60
+ - cur = CUR_CHAR(l);
61
+ - if (cur == 0) {
62
+ - SHRINK;
63
+ - GROW;
64
+ - cur = CUR_CHAR(l);
65
+ - }
66
+ + cur = next;
67
+ + l = nl;
68
+ }
69
+ buf[len] = 0;
70
+ if (IS_CHAR(cur)) {
71
+ --
72
+ 2.25.1
73
+
@@ -0,0 +1,103 @@
1
+ From b20d746fa7cbb74716171bc49d836af99927e41e Mon Sep 17 00:00:00 2001
2
+ From: Mike Dalessio <mike.dalessio@gmail.com>
3
+ Date: Sun, 11 Oct 2020 14:15:37 -0400
4
+ Subject: [PATCH 2/2] use new htmlParseLookupCommentEnd to find comment ends
5
+
6
+ Note that the caret in error messages generated during comment parsing
7
+ may have moved by one byte.
8
+
9
+ See guidance provided on incorrectly-closed comments here:
10
+
11
+ https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment
12
+ ---
13
+ HTMLparser.c | 46 +++++++++++++++++++++++++++++++++++++---------
14
+ 1 file changed, 37 insertions(+), 9 deletions(-)
15
+
16
+ diff --git a/HTMLparser.c b/HTMLparser.c
17
+ index 4d43479..000dc3d 100644
18
+ --- a/HTMLparser.c
19
+ +++ b/HTMLparser.c
20
+ @@ -5331,6 +5331,39 @@ htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
21
+ return (-1);
22
+ }
23
+
24
+ +/**
25
+ + * htmlParseLookupCommentEnd:
26
+ + * @ctxt: an HTML parser context
27
+ + *
28
+ + * Try to find a comment end tag in the input stream
29
+ + * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
30
+ + * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
31
+ + * This function has a side effect of (possibly) incrementing ctxt->checkIndex
32
+ + * to avoid rescanning sequences of bytes, it DOES change the state of the
33
+ + * parser, do not use liberally.
34
+ + * This wraps to htmlParseLookupSequence()
35
+ + *
36
+ + * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
37
+ + */
38
+ +static int
39
+ +htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
40
+ +{
41
+ + int mark = 0;
42
+ + int cur = CUR_PTR - BASE_PTR;
43
+ +
44
+ + while (mark >= 0) {
45
+ + mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 1, 1);
46
+ + if ((mark < 0) ||
47
+ + (NXT(mark+2) == '>') ||
48
+ + ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
49
+ + return mark;
50
+ + }
51
+ + ctxt->checkIndex = cur + mark + 1;
52
+ + }
53
+ + return mark;
54
+ +}
55
+ +
56
+ +
57
+ /**
58
+ * htmlParseTryOrFinish:
59
+ * @ctxt: an HTML parser context
60
+ @@ -5507,8 +5540,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
61
+ cur = in->cur[0];
62
+ if ((cur == '<') && (next == '!') &&
63
+ (in->cur[2] == '-') && (in->cur[3] == '-')) {
64
+ - if ((!terminate) &&
65
+ - (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
66
+ + if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
67
+ goto done;
68
+ #ifdef DEBUG_PUSH
69
+ xmlGenericError(xmlGenericErrorContext,
70
+ @@ -5567,8 +5599,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
71
+ next = in->cur[1];
72
+ if ((cur == '<') && (next == '!') &&
73
+ (in->cur[2] == '-') && (in->cur[3] == '-')) {
74
+ - if ((!terminate) &&
75
+ - (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
76
+ + if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
77
+ goto done;
78
+ #ifdef DEBUG_PUSH
79
+ xmlGenericError(xmlGenericErrorContext,
80
+ @@ -5614,8 +5645,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
81
+ next = in->cur[1];
82
+ if ((cur == '<') && (next == '!') &&
83
+ (in->cur[2] == '-') && (in->cur[3] == '-')) {
84
+ - if ((!terminate) &&
85
+ - (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
86
+ + if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
87
+ goto done;
88
+ #ifdef DEBUG_PUSH
89
+ xmlGenericError(xmlGenericErrorContext,
90
+ @@ -5871,9 +5901,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
91
+ htmlParseDocTypeDecl(ctxt);
92
+ } else if ((cur == '<') && (next == '!') &&
93
+ (in->cur[2] == '-') && (in->cur[3] == '-')) {
94
+ - if ((!terminate) &&
95
+ - (htmlParseLookupSequence(
96
+ - ctxt, '-', '-', '>', 1, 1) < 0))
97
+ + if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
98
+ goto done;
99
+ #ifdef DEBUG_PUSH
100
+ xmlGenericError(xmlGenericErrorContext,
101
+ --
102
+ 2.25.1
103
+
@@ -0,0 +1,53 @@
1
+ From c94172d2a4451368530db2186190d70be8a1d9e5 Mon Sep 17 00:00:00 2001
2
+ From: Ilya Zub <ilya@serpapi.com>
3
+ Date: Wed, 23 Dec 2020 12:45:29 +0200
4
+ Subject: Use glibc strlen to speed up xmlStrlen
5
+ MIME-Version: 1.0
6
+ Content-Type: text/plain; charset=UTF-8
7
+ Content-Transfer-Encoding: 8bit
8
+
9
+ xmlStrlen (entire HTML file): 926171.936981 μs
10
+ glibc_xmlStrlen (entire HTML file): 36905.903992 μs
11
+ delta (xmlStrlen ÷ glibc_xmlStrlen): 25.094584 times
12
+
13
+ xmlStrlen (average string): 57479.204010 μs
14
+ glibc_xmlStrlen (average string): 5802.069000 μs
15
+ delta (xmlStrlen ÷ glibc_xmlStrlen): 9.905937 times
16
+
17
+ xmlStrlen (bigger string): 388056.315979 μs
18
+ glibc_xmlStrlen (bigger string): 12797.856995 μs
19
+ delta (xmlStrlen ÷ glibc_xmlStrlen): 30.318382 times
20
+
21
+ xmlStrlen (smallest string): 15870.046021 μs
22
+ glibc_xmlStrlen (smallest string): 6282.208984 μs
23
+ delta (xmlStrlen ÷ glibc_xmlStrlen): 2.527903 times
24
+
25
+ See https://gitlab.gnome.org/GNOME/libxml2/-/issues/212 for reference.
26
+ ---
27
+ xmlstring.c | 9 ++-------
28
+ 1 file changed, 2 insertions(+), 7 deletions(-)
29
+
30
+ diff --git a/xmlstring.c b/xmlstring.c
31
+ index e8a1e45d..df247dff 100644
32
+ --- a/xmlstring.c
33
+ +++ b/xmlstring.c
34
+ @@ -423,14 +423,9 @@ xmlStrsub(const xmlChar *str, int start, int len) {
35
+
36
+ int
37
+ xmlStrlen(const xmlChar *str) {
38
+ - int len = 0;
39
+ -
40
+ if (str == NULL) return(0);
41
+ - while (*str != 0) { /* non input consuming */
42
+ - str++;
43
+ - len++;
44
+ - }
45
+ - return(len);
46
+ +
47
+ + return strlen((const char*)str);
48
+ }
49
+
50
+ /**
51
+ --
52
+ 2.29.2
53
+
metadata CHANGED
@@ -1,21 +1,35 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nokogiri
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.11.0.rc3
4
+ version: 1.11.0.rc4
5
5
  platform: ruby
6
6
  authors:
7
- - Aaron Patterson
8
7
  - Mike Dalessio
8
+ - Aaron Patterson
9
+ - John Shahid
9
10
  - Yoko Harada
10
- - Tim Elliott
11
11
  - Akinori MUSHA
12
- - John Shahid
13
12
  - Lars Kanis
13
+ - Tim Elliott
14
14
  autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
- date: 2020-09-08 00:00:00.000000000 Z
17
+ date: 2020-12-29 00:00:00.000000000 Z
18
18
  dependencies:
19
+ - !ruby/object:Gem::Dependency
20
+ name: racc
21
+ requirement: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - "~>"
24
+ - !ruby/object:Gem::Version
25
+ version: '1.4'
26
+ type: :runtime
27
+ prerelease: false
28
+ version_requirements: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - "~>"
31
+ - !ruby/object:Gem::Version
32
+ version: '1.4'
19
33
  - !ruby/object:Gem::Dependency
20
34
  name: mini_portile2
21
35
  requirement: !ruby/object:Gem::Requirement
@@ -36,14 +50,14 @@ dependencies:
36
50
  requirements:
37
51
  - - "~>"
38
52
  - !ruby/object:Gem::Version
39
- version: '0.34'
53
+ version: '0.41'
40
54
  type: :development
41
55
  prerelease: false
42
56
  version_requirements: !ruby/object:Gem::Requirement
43
57
  requirements:
44
58
  - - "~>"
45
59
  - !ruby/object:Gem::Version
46
- version: '0.34'
60
+ version: '0.41'
47
61
  - !ruby/object:Gem::Dependency
48
62
  name: hoe
49
63
  requirement: !ruby/object:Gem::Requirement
@@ -149,19 +163,19 @@ dependencies:
149
163
  - !ruby/object:Gem::Version
150
164
  version: '5.8'
151
165
  - !ruby/object:Gem::Dependency
152
- name: racc
166
+ name: minitest-reporters
153
167
  requirement: !ruby/object:Gem::Requirement
154
168
  requirements:
155
169
  - - "~>"
156
170
  - !ruby/object:Gem::Version
157
- version: 1.4.14
171
+ version: '1.4'
158
172
  type: :development
159
173
  prerelease: false
160
174
  version_requirements: !ruby/object:Gem::Requirement
161
175
  requirements:
162
176
  - - "~>"
163
177
  - !ruby/object:Gem::Version
164
- version: 1.4.14
178
+ version: '1.4'
165
179
  - !ruby/object:Gem::Dependency
166
180
  name: rake
167
181
  requirement: !ruby/object:Gem::Requirement
@@ -196,14 +210,14 @@ dependencies:
196
210
  requirements:
197
211
  - - "~>"
198
212
  - !ruby/object:Gem::Version
199
- version: '1.0'
213
+ version: '1.1'
200
214
  type: :development
201
215
  prerelease: false
202
216
  version_requirements: !ruby/object:Gem::Requirement
203
217
  requirements:
204
218
  - - "~>"
205
219
  - !ruby/object:Gem::Version
206
- version: '1.0'
220
+ version: '1.1'
207
221
  - !ruby/object:Gem::Dependency
208
222
  name: rexical
209
223
  requirement: !ruby/object:Gem::Requirement
@@ -270,14 +284,7 @@ description: |-
270
284
  Nokogiri (鋸) is an HTML, XML, SAX, and Reader parser. Among
271
285
  Nokogiri's many features is the ability to search documents via XPath
272
286
  or CSS3 selectors.
273
- email:
274
- - aaronp@rubyforge.org
275
- - mike.dalessio@gmail.com
276
- - yokolet@gmail.com
277
- - tle@holymonkey.com
278
- - knu@idaemons.org
279
- - jvshahid@gmail.com
280
- - lars@greiz-reinsdorf.de
287
+ email: nokogiri-talk@googlegroups.com
281
288
  executables:
282
289
  - nokogiri
283
290
  extensions:
@@ -420,6 +427,8 @@ files:
420
427
  - lib/nokogiri/jruby/dependencies.rb
421
428
  - lib/nokogiri/syntax_error.rb
422
429
  - lib/nokogiri/version.rb
430
+ - lib/nokogiri/version/constant.rb
431
+ - lib/nokogiri/version/info.rb
423
432
  - lib/nokogiri/xml.rb
424
433
  - lib/nokogiri/xml/attr.rb
425
434
  - lib/nokogiri/xml/attribute_decl.rb
@@ -465,6 +474,9 @@ files:
465
474
  - patches/libxml2/0003-Update-entities-to-remove-handling-of-ssi.patch
466
475
  - patches/libxml2/0004-libxml2.la-is-in-top_builddir.patch
467
476
  - patches/libxml2/0005-Fix-infinite-loop-in-xmlStringLenDecodeEntities.patch
477
+ - patches/libxml2/0006-htmlParseComment-treat-as-if-it-closed-the-comment.patch
478
+ - patches/libxml2/0007-use-new-htmlParseLookupCommentEnd-to-find-comment-en.patch
479
+ - patches/libxml2/0008-use-glibc-strlen.patch
468
480
  - ports/archives/libxml2-2.9.10.tar.gz
469
481
  - ports/archives/libxslt-1.1.34.tar.gz
470
482
  homepage: https://nokogiri.org
@@ -486,14 +498,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
486
498
  requirements:
487
499
  - - ">="
488
500
  - !ruby/object:Gem::Version
489
- version: 2.4.0
501
+ version: 2.5.0
490
502
  required_rubygems_version: !ruby/object:Gem::Requirement
491
503
  requirements:
492
504
  - - ">"
493
505
  - !ruby/object:Gem::Version
494
506
  version: 1.3.1
495
507
  requirements: []
496
- rubygems_version: 3.1.2
508
+ rubygems_version: 3.1.4
497
509
  signing_key:
498
510
  specification_version: 4
499
511
  summary: Nokogiri (鋸) is an HTML, XML, SAX, and Reader parser