nokogiri 1.13.6 → 1.14.2
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/Gemfile +39 -0
- data/LICENSE-DEPENDENCIES.md +830 -509
- data/LICENSE.md +1 -1
- data/README.md +18 -11
- data/dependencies.yml +33 -15
- data/ext/nokogiri/extconf.rb +100 -24
- data/ext/nokogiri/gumbo.c +21 -11
- data/ext/nokogiri/html4_document.c +2 -2
- data/ext/nokogiri/html4_element_description.c +1 -1
- data/ext/nokogiri/html4_entity_lookup.c +2 -2
- data/ext/nokogiri/html4_sax_parser_context.c +1 -6
- data/ext/nokogiri/html4_sax_push_parser.c +1 -1
- data/ext/nokogiri/nokogiri.c +38 -51
- data/ext/nokogiri/nokogiri.h +26 -14
- data/ext/nokogiri/test_global_handlers.c +1 -1
- data/ext/nokogiri/xml_attr.c +3 -3
- data/ext/nokogiri/xml_attribute_decl.c +5 -5
- data/ext/nokogiri/xml_cdata.c +3 -3
- data/ext/nokogiri/xml_comment.c +1 -1
- data/ext/nokogiri/xml_document.c +23 -14
- data/ext/nokogiri/xml_document_fragment.c +1 -1
- data/ext/nokogiri/xml_dtd.c +9 -9
- data/ext/nokogiri/xml_element_content.c +3 -3
- data/ext/nokogiri/xml_element_decl.c +5 -5
- data/ext/nokogiri/xml_encoding_handler.c +3 -3
- data/ext/nokogiri/xml_entity_decl.c +6 -6
- data/ext/nokogiri/xml_entity_reference.c +1 -1
- data/ext/nokogiri/xml_namespace.c +80 -14
- data/ext/nokogiri/xml_node.c +363 -82
- data/ext/nokogiri/xml_node_set.c +4 -6
- data/ext/nokogiri/xml_processing_instruction.c +1 -1
- data/ext/nokogiri/xml_reader.c +97 -22
- data/ext/nokogiri/xml_relax_ng.c +1 -3
- data/ext/nokogiri/xml_sax_parser.c +23 -17
- data/ext/nokogiri/xml_sax_parser_context.c +1 -6
- data/ext/nokogiri/xml_sax_push_parser.c +1 -3
- data/ext/nokogiri/xml_schema.c +4 -6
- data/ext/nokogiri/xml_syntax_error.c +1 -1
- data/ext/nokogiri/xml_text.c +2 -2
- data/ext/nokogiri/xml_xpath_context.c +91 -84
- data/ext/nokogiri/xslt_stylesheet.c +15 -14
- data/gumbo-parser/Makefile +10 -0
- data/gumbo-parser/src/attribute.h +1 -1
- data/gumbo-parser/src/error.c +2 -2
- data/gumbo-parser/src/error.h +1 -1
- data/gumbo-parser/src/foreign_attrs.c +2 -2
- data/gumbo-parser/src/{gumbo.h → nokogiri_gumbo.h} +1 -0
- data/gumbo-parser/src/parser.c +8 -5
- data/gumbo-parser/src/replacement.h +1 -1
- data/gumbo-parser/src/string_buffer.h +1 -1
- data/gumbo-parser/src/string_piece.c +1 -1
- data/gumbo-parser/src/svg_attrs.c +2 -2
- data/gumbo-parser/src/svg_tags.c +2 -2
- data/gumbo-parser/src/tag.c +2 -1
- data/gumbo-parser/src/tag_lookup.c +7 -7
- data/gumbo-parser/src/tag_lookup.gperf +1 -0
- data/gumbo-parser/src/tag_lookup.h +1 -1
- data/gumbo-parser/src/token_buffer.h +1 -1
- data/gumbo-parser/src/tokenizer.c +1 -1
- data/gumbo-parser/src/tokenizer.h +1 -1
- data/gumbo-parser/src/utf8.c +1 -1
- data/gumbo-parser/src/utf8.h +1 -1
- data/gumbo-parser/src/util.c +1 -3
- data/gumbo-parser/src/util.h +4 -0
- data/gumbo-parser/src/vector.h +1 -1
- data/lib/nokogiri/css/node.rb +2 -2
- data/lib/nokogiri/css/xpath_visitor.rb +5 -3
- data/lib/nokogiri/css.rb +6 -0
- data/lib/nokogiri/decorators/slop.rb +1 -1
- data/lib/nokogiri/encoding_handler.rb +57 -0
- data/lib/nokogiri/extension.rb +3 -2
- data/lib/nokogiri/html4/document.rb +2 -121
- data/lib/nokogiri/html4/element_description_defaults.rb +6 -12
- data/lib/nokogiri/html4/encoding_reader.rb +121 -0
- data/lib/nokogiri/html4.rb +1 -0
- data/lib/nokogiri/html5/document.rb +113 -36
- data/lib/nokogiri/html5/document_fragment.rb +9 -2
- data/lib/nokogiri/html5/node.rb +3 -5
- data/lib/nokogiri/html5.rb +127 -216
- data/lib/nokogiri/jruby/dependencies.rb +1 -19
- data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
- data/lib/nokogiri/version/constant.rb +1 -1
- data/lib/nokogiri/version/info.rb +11 -10
- data/lib/nokogiri/xml/attr.rb +49 -0
- data/lib/nokogiri/xml/builder.rb +1 -1
- data/lib/nokogiri/xml/document.rb +103 -55
- data/lib/nokogiri/xml/document_fragment.rb +49 -6
- data/lib/nokogiri/xml/namespace.rb +42 -0
- data/lib/nokogiri/xml/node/save_options.rb +6 -4
- data/lib/nokogiri/xml/node.rb +190 -35
- data/lib/nokogiri/xml/node_set.rb +88 -9
- data/lib/nokogiri/xml/parse_options.rb +129 -50
- data/lib/nokogiri/xml/pp/node.rb +6 -4
- data/lib/nokogiri/xml/processing_instruction.rb +2 -1
- data/lib/nokogiri/xml/reader.rb +6 -8
- data/lib/nokogiri/xml/sax/parser.rb +2 -3
- data/lib/nokogiri/xslt.rb +1 -1
- data/lib/nokogiri.rb +3 -11
- data/lib/xsd/xmlparser/nokogiri.rb +3 -1
- data/ports/archives/libxml2-2.10.3.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.37.tar.xz +0 -0
- metadata +11 -242
- data/patches/libxml2/0004-use-glibc-strlen.patch +0 -53
- data/patches/libxml2/0005-avoid-isnan-isinf.patch +0 -81
- data/patches/libxml2/0006-update-automake-files-for-arm64.patch +0 -3040
- data/patches/libxml2/0008-htmlParseComment-handle-abruptly-closed-comments.patch +0 -61
- data/ports/archives/libxml2-2.9.14.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.35.tar.xz +0 -0
data/LICENSE.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
The MIT License
|
2
2
|
|
3
|
-
Copyright 2008 --
|
3
|
+
Copyright 2008 -- 2023 by Mike Dalessio, Aaron Patterson, Yoko Harada, Akinori MUSHA, John Shahid, Karol Bucek, Sam Ruby, Craig Barnes, Stephen Checkoway, Lars Kanis, Sergio Arbeo, Timothy Elliott, Nobuyoshi Nakada, Charles Nutter, Patrick Mahoney.
|
4
4
|
|
5
5
|
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
6
6
|
|
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
# Nokogiri
|
4
4
|
|
5
|
-
Nokogiri (鋸) makes it easy and painless to work with XML and HTML from Ruby. It provides a sensible, easy-to-understand API for [reading](https://nokogiri.org/tutorials/parsing_an_html_xml_document.html), writing, [modifying](https://nokogiri.org/tutorials/modifying_an_html_xml_document.html), and [querying](https://nokogiri.org/tutorials/searching_a_xml_html_document.html) documents. It is fast and standards-compliant by relying on native parsers like libxml2
|
5
|
+
Nokogiri (鋸) makes it easy and painless to work with XML and HTML from Ruby. It provides a sensible, easy-to-understand API for [reading](https://nokogiri.org/tutorials/parsing_an_html_xml_document.html), writing, [modifying](https://nokogiri.org/tutorials/modifying_an_html_xml_document.html), and [querying](https://nokogiri.org/tutorials/searching_a_xml_html_document.html) documents. It is fast and standards-compliant by relying on native parsers like libxml2, libgumbo, and xerces.
|
6
6
|
|
7
7
|
## Guiding Principles
|
8
8
|
|
@@ -40,10 +40,6 @@ Some guiding principles Nokogiri tries to follow:
|
|
40
40
|
|
41
41
|
All official documentation is posted at https://nokogiri.org (the source for which is at https://github.com/sparklemotion/nokogiri.org/, and we welcome contributions).
|
42
42
|
|
43
|
-
Consider subscribing to [Tidelift][tidelift] which provides license assurances and timely security notifications for your open source dependencies, including Nokogiri. [Tidelift][tidelift] subscriptions also help the Nokogiri maintainers fund our [automated testing](https://ci.nokogiri.org) which in turn allows us to ship releases, bugfixes, and security updates more often.
|
44
|
-
|
45
|
-
[tidelift]: https://tidelift.com/subscription/pkg/rubygems-nokogiri?utm_source=rubygems-nokogiri&utm_medium=referral&utm_campaign=readme
|
46
|
-
|
47
43
|
### Reading
|
48
44
|
|
49
45
|
Your first stops for learning more about Nokogiri should be:
|
@@ -57,7 +53,6 @@ Your first stops for learning more about Nokogiri should be:
|
|
57
53
|
|
58
54
|
There are a few ways to ask exploratory questions:
|
59
55
|
|
60
|
-
- The Ruby Discord chat server is active at https://discord.gg/UyQnKrT
|
61
56
|
- The Nokogiri mailing list is active at https://groups.google.com/group/nokogiri-talk
|
62
57
|
- Open an issue using the "Help Request" template at https://github.com/sparklemotion/nokogiri/issues
|
63
58
|
|
@@ -103,12 +98,21 @@ We bump `Major.Minor.Patch` versions following this guidance:
|
|
103
98
|
- Updating packaged libraries for security-related reasons.
|
104
99
|
|
105
100
|
|
101
|
+
### Sponsorship
|
102
|
+
|
103
|
+
You can help sponsor the maintainers of this software through one of these organizations:
|
104
|
+
|
105
|
+
- [github.com/sponsors/flavorjones](https://github.com/sponsors/flavorjones)
|
106
|
+
- [opencollective.com/nokogiri](https://opencollective.com/nokogiri)
|
107
|
+
- [tidelift.com/subscription/pkg/rubygems-nokogiri](https://tidelift.com/subscription/pkg/rubygems-nokogiri?utm_source=rubygems-nokogiri&utm_medium=referral&utm_campaign=readme)
|
108
|
+
|
109
|
+
|
106
110
|
## Installation
|
107
111
|
|
108
112
|
Requirements:
|
109
113
|
|
110
|
-
- Ruby >= 2.
|
111
|
-
- JRuby >= 9.
|
114
|
+
- Ruby >= 2.7
|
115
|
+
- JRuby >= 9.4.0.0
|
112
116
|
|
113
117
|
|
114
118
|
### Native Gems: Faster, more reliable installation
|
@@ -119,10 +123,13 @@ Requirements:
|
|
119
123
|
|
120
124
|
Nokogiri ships pre-compiled, "native" gems for the following platforms:
|
121
125
|
|
122
|
-
- Linux:
|
126
|
+
- Linux:
|
127
|
+
- `x86-linux` and `x86_64-linux` (req: `glibc >= 2.17`)
|
128
|
+
- `aarch64-linux` and `arm-linux` (req: `glibc >= 2.29`)
|
129
|
+
- Note that musl platforms like Alpine **are** supported
|
123
130
|
- Darwin/MacOS: `x86_64-darwin` and `arm64-darwin`
|
124
|
-
- Windows: `x86-mingw32` and `x64-
|
125
|
-
- Java: any platform running JRuby 9.
|
131
|
+
- Windows: `x86-mingw32`, `x64-mingw32`, and `x64-mingw-ucrt`
|
132
|
+
- Java: any platform running JRuby 9.4 or higher
|
126
133
|
|
127
134
|
To determine whether your system supports one of these gems, look at the output of `bundle platform` or `ruby -e 'puts Gem::Platform.local.to_s'`.
|
128
135
|
|
data/dependencies.yml
CHANGED
@@ -1,23 +1,41 @@
|
|
1
1
|
libxml2:
|
2
|
-
version: "2.
|
3
|
-
sha256: "
|
4
|
-
# sha-256 hash provided in https://download.gnome.org/sources/libxml2/2.
|
2
|
+
version: "2.10.3"
|
3
|
+
sha256: "5d2cc3d78bec3dbe212a9d7fa629ada25a7da928af432c93060ff5c17ee28a9c"
|
4
|
+
# sha-256 hash provided in https://download.gnome.org/sources/libxml2/2.10/libxml2-2.10.3.sha256sum
|
5
5
|
|
6
6
|
libxslt:
|
7
|
-
version: "1.1.
|
8
|
-
sha256: "
|
9
|
-
# sha-256 hash provided in https://download.gnome.org/sources/libxslt/1.1/libxslt-1.1.
|
7
|
+
version: "1.1.37"
|
8
|
+
sha256: "3a4b27dc8027ccd6146725950336f1ec520928f320f144eb5fa7990ae6123ab4"
|
9
|
+
# sha-256 hash provided in https://download.gnome.org/sources/libxslt/1.1/libxslt-1.1.37.sha256sum
|
10
10
|
|
11
11
|
zlib:
|
12
|
-
version: "1.2.
|
13
|
-
sha256: "
|
12
|
+
version: "1.2.13"
|
13
|
+
sha256: "b3a24de97a8fdbc835b9833169501030b8977031bcb54b3b3ac13740f846ab30"
|
14
14
|
# SHA-256 hash provided on http://zlib.net/
|
15
15
|
|
16
16
|
libiconv:
|
17
|
-
version: "1.
|
18
|
-
sha256: "
|
19
|
-
#
|
20
|
-
#
|
21
|
-
#
|
22
|
-
#
|
23
|
-
#
|
17
|
+
version: "1.17"
|
18
|
+
sha256: "8f74213b56238c85a50a5329f77e06198771e70dd9a739779f4c02f65d971313"
|
19
|
+
# signature verified by following this path:
|
20
|
+
# - release announced at https://savannah.gnu.org/forum/forum.php?forum_id=10175
|
21
|
+
# - which links to https://savannah.gnu.org/users/haible as the releaser
|
22
|
+
# - which links to https://savannah.gnu.org/people/viewgpg.php?user_id=1871 as the gpg key
|
23
|
+
#
|
24
|
+
# So:
|
25
|
+
# - wget -q -O - https://savannah.gnu.org/people/viewgpg.php?user_id=1871 | gpg --import
|
26
|
+
# gpg: key F5BE8B267C6A406D: 1 signature not checked due to a missing key
|
27
|
+
# gpg: key F5BE8B267C6A406D: public key "Bruno Haible (Open Source Development) <bruno@clisp.org>" imported
|
28
|
+
# gpg: Total number processed: 1
|
29
|
+
# gpg: imported: 1
|
30
|
+
# gpg: marginals needed: 3 completes needed: 1 trust model: pgp
|
31
|
+
# gpg: depth: 0 valid: 4 signed: 0 trust: 0-, 0q, 0n, 0m, 0f, 4u
|
32
|
+
# gpg: next trustdb check due at 2024-05-09
|
33
|
+
# - gpg --verify libiconv-1.17.tar.gz.sig ports/archives/libiconv-1.17.tar.gz
|
34
|
+
# gpg: Signature made Sun 15 May 2022 11:26:42 AM EDT
|
35
|
+
# gpg: using RSA key 9001B85AF9E1B83DF1BDA942F5BE8B267C6A406D
|
36
|
+
# gpg: Good signature from "Bruno Haible (Open Source Development) <bruno@clisp.org>" [unknown]
|
37
|
+
# gpg: WARNING: This key is not certified with a trusted signature!
|
38
|
+
# gpg: There is no indication that the signature belongs to the owner.
|
39
|
+
# Primary key fingerprint: 9001 B85A F9E1 B83D F1BD A942 F5BE 8B26 7C6A 406D
|
40
|
+
#
|
41
|
+
# And this sha256sum is calculated from that verified tarball.
|
data/ext/nokogiri/extconf.rb
CHANGED
@@ -200,7 +200,7 @@ def nix?
|
|
200
200
|
end
|
201
201
|
|
202
202
|
def truffle?
|
203
|
-
|
203
|
+
RUBY_ENGINE == "truffleruby"
|
204
204
|
end
|
205
205
|
|
206
206
|
def concat_flags(*args)
|
@@ -211,6 +211,16 @@ def local_have_library(lib, func = nil, headers = nil)
|
|
211
211
|
have_library(lib, func, headers) || have_library("lib#{lib}", func, headers)
|
212
212
|
end
|
213
213
|
|
214
|
+
def zlib_source(version_string)
|
215
|
+
# As of 2022-12, I'm starting to see failed downloads often enough from zlib.net that I want to
|
216
|
+
# change the default to github.
|
217
|
+
if ENV["NOKOGIRI_USE_CANONICAL_ZLIB_SOURCE"]
|
218
|
+
"https://zlib.net/fossils/zlib-#{version_string}.tar.gz"
|
219
|
+
else
|
220
|
+
"https://github.com/madler/zlib/releases/download/v#{version_string}/zlib-#{version_string}.tar.gz"
|
221
|
+
end
|
222
|
+
end
|
223
|
+
|
214
224
|
def gnome_source
|
215
225
|
# As of 2022-02-20, some mirrors have expired SSL certificates. I'm able to retrieve from my home,
|
216
226
|
# but whatever host is resolved on the github actions workers see an expired cert.
|
@@ -400,8 +410,8 @@ def iconv_configure_flags
|
|
400
410
|
return ["--with-iconv=yes"]
|
401
411
|
end
|
402
412
|
|
403
|
-
config = preserving_globals {
|
404
|
-
if config && try_link_iconv("pkg-config libiconv") {
|
413
|
+
config = preserving_globals { pkg_config("libiconv") }
|
414
|
+
if config && try_link_iconv("pkg-config libiconv") { pkg_config("libiconv") }
|
405
415
|
cflags, ldflags, libs = config
|
406
416
|
|
407
417
|
return [
|
@@ -430,10 +440,12 @@ def process_recipe(name, version, static_p, cross_p, cacheable_p = true)
|
|
430
440
|
"#{@target}/#{RUBY_PLATFORM}/#{@name}/#{@version}"
|
431
441
|
end
|
432
442
|
|
433
|
-
|
434
|
-
#
|
435
|
-
# use host if not set.
|
443
|
+
# We use 'host' to set compiler prefix for cross-compiling. Prefer host_alias over host. And
|
444
|
+
# prefer i686 (what external dev tools use) to i386 (what ruby's configure.ac emits).
|
436
445
|
recipe.host = RbConfig::CONFIG["host_alias"].empty? ? RbConfig::CONFIG["host"] : RbConfig::CONFIG["host_alias"]
|
446
|
+
recipe.host = recipe.host.gsub(/i386/, "i686")
|
447
|
+
|
448
|
+
recipe.target = File.join(PACKAGE_ROOT_DIR, "ports") if cacheable_p
|
437
449
|
recipe.configure_options << "--libdir=#{File.join(recipe.path, "lib")}"
|
438
450
|
|
439
451
|
yield recipe
|
@@ -525,7 +537,6 @@ def process_recipe(name, version, static_p, cross_p, cacheable_p = true)
|
|
525
537
|
|
526
538
|
EOM
|
527
539
|
|
528
|
-
pp(recipe.files)
|
529
540
|
chdir_for_build { recipe.cook }
|
530
541
|
FileUtils.touch(checkpoint)
|
531
542
|
end
|
@@ -573,6 +584,34 @@ def do_clean
|
|
573
584
|
exit!(0)
|
574
585
|
end
|
575
586
|
|
587
|
+
# In ruby 3.2, symbol resolution changed on Darwin, to introduce the `-bundle_loader` flag to
|
588
|
+
# resolve symbols against the ruby binary.
|
589
|
+
#
|
590
|
+
# This makes it challenging to build a single extension that works with both a ruby with
|
591
|
+
# `--enable-shared` and one with `--disable-shared`. To work around that, we choose to add
|
592
|
+
# `-flat_namespace` to the link line (later in this file).
|
593
|
+
#
|
594
|
+
# The `-flat_namespace` line introduces its own behavior change, which is that (similar to on
|
595
|
+
# Linux), any symbols in the extension that are exported may now be resolved by shared libraries
|
596
|
+
# loaded by the Ruby process. Specifically, that means that libxml2 and libxslt, which are
|
597
|
+
# statically linked into the nokogiri bundle, will resolve (at runtime) to a system libxml2 loaded
|
598
|
+
# by Ruby on Darwin. And it appears that often Ruby on Darwin does indeed load the system libxml2,
|
599
|
+
# and that messes with our assumptions about whether we're running with a patched libxml2 or a
|
600
|
+
# vanilla libxml2.
|
601
|
+
#
|
602
|
+
# We choose to use `-load_hidden` in this case to prevent exporting those symbols from libxml2 and
|
603
|
+
# libxslt, which ensures that they will be resolved to the static libraries in the bundle. In other
|
604
|
+
# words, when we use `load_hidden`, what happens in the extension stays in the extension.
|
605
|
+
#
|
606
|
+
# See https://github.com/rake-compiler/rake-compiler-dock/issues/87 for more info.
|
607
|
+
#
|
608
|
+
# Anyway, this method is the logical bit to tell us when to turn on these workarounds.
|
609
|
+
def needs_darwin_linker_hack
|
610
|
+
config_cross_build? &&
|
611
|
+
darwin? &&
|
612
|
+
Gem::Requirement.new("~> 3.2").satisfied_by?(Gem::Version.new(RbConfig::CONFIG["ruby_version"].split("+").first))
|
613
|
+
end
|
614
|
+
|
576
615
|
#
|
577
616
|
# main
|
578
617
|
#
|
@@ -580,7 +619,7 @@ do_help if arg_config("--help")
|
|
580
619
|
do_clean if arg_config("--clean")
|
581
620
|
|
582
621
|
if openbsd? && !config_system_libraries?
|
583
|
-
|
622
|
+
unless %x(#{ENV["CC"] || "/usr/bin/cc"} -v 2>&1).include?("clang")
|
584
623
|
(ENV["CC"] ||= find_executable("egcc")) ||
|
585
624
|
abort("Please install gcc 4.9+ from ports using `pkg_add -v gcc`")
|
586
625
|
end
|
@@ -616,6 +655,9 @@ $LIBS = concat_flags($LIBS, ENV["LIBS"])
|
|
616
655
|
# errors/warnings. see #2302
|
617
656
|
append_cflags(["-std=c99", "-Wno-declaration-after-statement"])
|
618
657
|
|
658
|
+
# gumbo html5 serialization is slower with O3, let's make sure we use O2
|
659
|
+
append_cflags("-O2")
|
660
|
+
|
619
661
|
# always include debugging information
|
620
662
|
append_cflags("-g")
|
621
663
|
|
@@ -625,8 +667,18 @@ append_cflags("-Winline")
|
|
625
667
|
# good to have no matter what Ruby was compiled with
|
626
668
|
append_cflags("-Wmissing-noreturn")
|
627
669
|
|
670
|
+
# check integer loss of precision
|
671
|
+
if darwin?
|
672
|
+
append_cflags("-Wshorten-64-to-32")
|
673
|
+
else
|
674
|
+
append_cflags("-Wconversion -Wno-sign-conversion")
|
675
|
+
end
|
676
|
+
|
628
677
|
# handle clang variations, see #1101
|
629
|
-
|
678
|
+
if darwin?
|
679
|
+
append_cflags("-Wno-error=unused-command-line-argument-hard-error-in-future")
|
680
|
+
append_cflags("-Wno-unknown-warning-option")
|
681
|
+
end
|
630
682
|
|
631
683
|
# these tend to be noisy, but on occasion useful during development
|
632
684
|
# append_cflags(["-Wcast-qual", "-Wwrite-strings"])
|
@@ -666,6 +718,10 @@ else
|
|
666
718
|
cross_build_p = config_cross_build?
|
667
719
|
message "Cross build is #{cross_build_p ? "enabled" : "disabled"}.\n"
|
668
720
|
|
721
|
+
if needs_darwin_linker_hack
|
722
|
+
append_ldflags("-Wl,-flat_namespace")
|
723
|
+
end
|
724
|
+
|
669
725
|
require "yaml"
|
670
726
|
dependencies = YAML.load_file(File.join(PACKAGE_ROOT_DIR, "dependencies.yml"))
|
671
727
|
|
@@ -674,7 +730,7 @@ else
|
|
674
730
|
if cross_build_p || windows?
|
675
731
|
zlib_recipe = process_recipe("zlib", dependencies["zlib"]["version"], static_p, cross_build_p) do |recipe|
|
676
732
|
recipe.files = [{
|
677
|
-
url:
|
733
|
+
url: zlib_source(recipe.version),
|
678
734
|
sha256: dependencies["zlib"]["sha256"],
|
679
735
|
}]
|
680
736
|
if windows?
|
@@ -712,9 +768,17 @@ else
|
|
712
768
|
else
|
713
769
|
class << recipe
|
714
770
|
def configure
|
715
|
-
|
716
|
-
|
717
|
-
|
771
|
+
env = {}
|
772
|
+
env["CFLAGS"] = concat_flags(ENV["CFLAGS"], "-fPIC", "-g")
|
773
|
+
env["CHOST"] = host
|
774
|
+
execute("configure", ["./configure", "--static", configure_prefix], { env: env })
|
775
|
+
if darwin?
|
776
|
+
# needed as of zlib 1.2.13
|
777
|
+
Dir.chdir(work_path) do
|
778
|
+
makefile = File.read("Makefile").gsub(/^AR=.*$/, "AR=#{host}-libtool")
|
779
|
+
File.open("Makefile", "w") { |m| m.write(makefile) }
|
780
|
+
end
|
781
|
+
end
|
718
782
|
end
|
719
783
|
end
|
720
784
|
end
|
@@ -839,6 +903,11 @@ else
|
|
839
903
|
recipe.configure_options += ["RANLIB=/usr/bin/ranlib", "AR=/usr/bin/ar"]
|
840
904
|
end
|
841
905
|
|
906
|
+
if windows?
|
907
|
+
cflags = concat_flags(cflags, "-ULIBXSLT_STATIC", "-DIN_LIBXSLT")
|
908
|
+
cflags = concat_flags(cflags, "-ULIBEXSLT_STATIC", "-DIN_LIBEXSLT")
|
909
|
+
end
|
910
|
+
|
842
911
|
recipe.configure_options << if source_dir
|
843
912
|
"--config-cache"
|
844
913
|
else
|
@@ -860,9 +929,13 @@ else
|
|
860
929
|
$libs = $libs.shellsplit.tap do |libs|
|
861
930
|
[libxml2_recipe, libxslt_recipe].each do |recipe|
|
862
931
|
libname = recipe.name[/\Alib(.+)\z/, 1]
|
863
|
-
|
932
|
+
config_basename = "#{libname}-config"
|
933
|
+
File.join(recipe.path, "bin", config_basename).tap do |config|
|
864
934
|
# call config scripts explicitly with 'sh' for compat with Windows
|
865
|
-
|
935
|
+
cflags = %x(sh #{config} --cflags).strip
|
936
|
+
message("#{config_basename} cflags: #{cflags}\n")
|
937
|
+
$CPPFLAGS = concat_flags(cflags, $CPPFLAGS) # prepend
|
938
|
+
|
866
939
|
%x(sh #{config} --libs).strip.shellsplit.each do |arg|
|
867
940
|
case arg
|
868
941
|
when /\A-L(.+)\z/
|
@@ -881,7 +954,7 @@ else
|
|
881
954
|
end
|
882
955
|
|
883
956
|
patches_string = recipe.patch_files.map { |path| File.basename(path) }.join(" ")
|
884
|
-
append_cppflags(%[-DNOKOGIRI_#{recipe.name.upcase}_PATCHES="
|
957
|
+
append_cppflags(%[-DNOKOGIRI_#{recipe.name.upcase}_PATCHES="\\"#{patches_string}\\""])
|
885
958
|
|
886
959
|
case libname
|
887
960
|
when "xml2"
|
@@ -900,16 +973,17 @@ else
|
|
900
973
|
end.shelljoin
|
901
974
|
|
902
975
|
if static_p
|
976
|
+
static_archive_ld_flag = needs_darwin_linker_hack ? ["-load_hidden"] : []
|
903
977
|
$libs = $libs.shellsplit.map do |arg|
|
904
978
|
case arg
|
905
979
|
when "-lxml2"
|
906
|
-
File.join(libxml2_recipe.path, "lib", libflag_to_filename(arg))
|
980
|
+
static_archive_ld_flag + [File.join(libxml2_recipe.path, "lib", libflag_to_filename(arg))]
|
907
981
|
when "-lxslt", "-lexslt"
|
908
|
-
File.join(libxslt_recipe.path, "lib", libflag_to_filename(arg))
|
982
|
+
static_archive_ld_flag + [File.join(libxslt_recipe.path, "lib", libflag_to_filename(arg))]
|
909
983
|
else
|
910
984
|
arg
|
911
985
|
end
|
912
|
-
end.shelljoin
|
986
|
+
end.flatten.shelljoin
|
913
987
|
end
|
914
988
|
|
915
989
|
ensure_func("xmlParseDoc", "libxml/parser.h")
|
@@ -945,7 +1019,7 @@ libgumbo_recipe = process_recipe("libgumbo", "1.0.0-nokogiri", static_p, cross_b
|
|
945
1019
|
end
|
946
1020
|
|
947
1021
|
def compile
|
948
|
-
cflags = concat_flags(ENV["CFLAGS"], "-fPIC", "-g")
|
1022
|
+
cflags = concat_flags(ENV["CFLAGS"], "-fPIC", "-O2", "-g")
|
949
1023
|
|
950
1024
|
env = { "CC" => gcc_cmd, "CFLAGS" => cflags }
|
951
1025
|
if config_cross_build?
|
@@ -965,7 +1039,7 @@ end
|
|
965
1039
|
append_cppflags("-I#{File.join(libgumbo_recipe.path, "include")}")
|
966
1040
|
$libs = $libs + " " + File.join(libgumbo_recipe.path, "lib", "libgumbo.a")
|
967
1041
|
$LIBPATH = $LIBPATH | [File.join(libgumbo_recipe.path, "lib")]
|
968
|
-
ensure_func("gumbo_parse_with_options", "
|
1042
|
+
ensure_func("gumbo_parse_with_options", "nokogiri_gumbo.h")
|
969
1043
|
|
970
1044
|
have_func("xmlHasFeature") || abort("xmlHasFeature() is missing.") # introduced in libxml 2.6.21
|
971
1045
|
have_func("xmlFirstElementChild") # introduced in libxml 2.7.3
|
@@ -973,11 +1047,11 @@ have_func("xmlRelaxNGSetParserStructuredErrors") # introduced in libxml 2.6.24
|
|
973
1047
|
have_func("xmlRelaxNGSetValidStructuredErrors") # introduced in libxml 2.6.21
|
974
1048
|
have_func("xmlSchemaSetValidStructuredErrors") # introduced in libxml 2.6.23
|
975
1049
|
have_func("xmlSchemaSetParserStructuredErrors") # introduced in libxml 2.6.23
|
976
|
-
|
977
|
-
have_func("
|
1050
|
+
have_func("rb_gc_location") # introduced in Ruby 2.7
|
1051
|
+
have_func("rb_category_warning") # introduced in Ruby 3.0
|
978
1052
|
|
979
1053
|
other_library_versions_string = OTHER_LIBRARY_VERSIONS.map { |k, v| [k, v].join(":") }.join(",")
|
980
|
-
append_cppflags(%[-DNOKOGIRI_OTHER_LIBRARY_VERSIONS="
|
1054
|
+
append_cppflags(%[-DNOKOGIRI_OTHER_LIBRARY_VERSIONS="\\"#{other_library_versions_string}\\""])
|
981
1055
|
|
982
1056
|
unless config_system_libraries?
|
983
1057
|
if cross_build_p
|
@@ -1006,3 +1080,5 @@ if config_clean?
|
|
1006
1080
|
EOF
|
1007
1081
|
end
|
1008
1082
|
end
|
1083
|
+
|
1084
|
+
# rubocop:enable Style/GlobalVars
|
data/ext/nokogiri/gumbo.c
CHANGED
@@ -23,13 +23,13 @@
|
|
23
23
|
//
|
24
24
|
// Processing starts by calling gumbo_parse_with_options. The resulting document tree
|
25
25
|
// is then walked, a parallel libxml2 tree is constructed, and the final document is
|
26
|
-
// then wrapped using
|
26
|
+
// then wrapped using noko_xml_document_wrap. This approach reduces memory and CPU
|
27
27
|
// requirements as Ruby objects are only built when necessary.
|
28
28
|
//
|
29
29
|
|
30
30
|
#include <nokogiri.h>
|
31
31
|
|
32
|
-
#include "
|
32
|
+
#include "nokogiri_gumbo.h"
|
33
33
|
|
34
34
|
VALUE cNokogiriHtml5Document;
|
35
35
|
|
@@ -281,12 +281,12 @@ add_errors(const GumboOutput *output, VALUE rdoc, VALUE input, VALUE url)
|
|
281
281
|
rb_iv_set(syntax_error, "@code", INT2NUM(1)); // XML_ERR_INTERNAL_ERROR
|
282
282
|
rb_iv_set(syntax_error, "@level", INT2NUM(2)); // XML_ERR_ERROR
|
283
283
|
rb_iv_set(syntax_error, "@file", url);
|
284
|
-
rb_iv_set(syntax_error, "@line",
|
284
|
+
rb_iv_set(syntax_error, "@line", SIZET2NUM(position.line));
|
285
285
|
rb_iv_set(syntax_error, "@str1", str1);
|
286
286
|
rb_iv_set(syntax_error, "@str2", Qnil);
|
287
287
|
rb_iv_set(syntax_error, "@str3", Qnil);
|
288
288
|
rb_iv_set(syntax_error, "@int1", INT2NUM(0));
|
289
|
-
rb_iv_set(syntax_error, "@column",
|
289
|
+
rb_iv_set(syntax_error, "@column", SIZET2NUM(position.column));
|
290
290
|
rb_ary_push(rerrors, syntax_error);
|
291
291
|
}
|
292
292
|
rb_iv_set(rdoc, "@errors", rerrors);
|
@@ -297,6 +297,7 @@ typedef struct {
|
|
297
297
|
GumboOutput *output;
|
298
298
|
VALUE input;
|
299
299
|
VALUE url_or_frag;
|
300
|
+
VALUE klass;
|
300
301
|
xmlDocPtr doc;
|
301
302
|
} ParseArgs;
|
302
303
|
|
@@ -321,7 +322,7 @@ static VALUE parse_continue(VALUE parse_args);
|
|
321
322
|
* @!visibility protected
|
322
323
|
*/
|
323
324
|
static VALUE
|
324
|
-
parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth)
|
325
|
+
parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth, VALUE klass)
|
325
326
|
{
|
326
327
|
GumboOptions options = kGumboDefaultOptions;
|
327
328
|
options.max_attributes = NUM2INT(max_attributes);
|
@@ -333,6 +334,7 @@ parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors
|
|
333
334
|
.output = output,
|
334
335
|
.input = input,
|
335
336
|
.url_or_frag = url,
|
337
|
+
.klass = klass,
|
336
338
|
.doc = NULL,
|
337
339
|
};
|
338
340
|
|
@@ -357,7 +359,9 @@ parse_continue(VALUE parse_args)
|
|
357
359
|
}
|
358
360
|
args->doc = doc; // Make sure doc gets cleaned up if an error is thrown.
|
359
361
|
build_tree(doc, (xmlNodePtr)doc, output->document);
|
360
|
-
VALUE rdoc =
|
362
|
+
VALUE rdoc = noko_xml_document_wrap(args->klass, doc);
|
363
|
+
rb_iv_set(rdoc, "@url", args->url_or_frag);
|
364
|
+
rb_iv_set(rdoc, "@quirks_mode", INT2NUM(output->document->v.document.doc_type_quirks_mode));
|
361
365
|
args->doc = NULL; // The Ruby runtime now owns doc so don't delete it.
|
362
366
|
add_errors(output, rdoc, args->input, args->url_or_frag);
|
363
367
|
return rdoc;
|
@@ -401,7 +405,7 @@ static xmlNodePtr
|
|
401
405
|
extract_xml_node(VALUE node)
|
402
406
|
{
|
403
407
|
xmlNodePtr xml_node;
|
404
|
-
|
408
|
+
Noko_Node_Get_Struct(node, xmlNode, xml_node);
|
405
409
|
return xml_node;
|
406
410
|
}
|
407
411
|
|
@@ -498,9 +502,11 @@ error:
|
|
498
502
|
}
|
499
503
|
|
500
504
|
// Encoding.
|
501
|
-
if (
|
505
|
+
if (ctx_ns == GUMBO_NAMESPACE_MATHML
|
506
|
+
&& RSTRING_LEN(tag_name) == 14
|
502
507
|
&& !st_strcasecmp(ctx_tag, "annotation-xml")) {
|
503
508
|
VALUE enc = rb_funcall(ctx, rb_intern_const("[]"),
|
509
|
+
1,
|
504
510
|
rb_utf8_str_new_static("encoding", 8));
|
505
511
|
if (RTEST(enc)) {
|
506
512
|
Check_Type(enc, T_STRING);
|
@@ -512,8 +518,11 @@ error:
|
|
512
518
|
// Quirks mode.
|
513
519
|
VALUE doc = rb_funcall(doc_fragment, rb_intern_const("document"), 0);
|
514
520
|
VALUE dtd = rb_funcall(doc, internal_subset, 0);
|
515
|
-
|
521
|
+
VALUE doc_quirks_mode = rb_iv_get(doc, "@quirks_mode");
|
522
|
+
if (NIL_P(ctx) || NIL_P(doc_quirks_mode)) {
|
516
523
|
quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS;
|
524
|
+
} else if (NIL_P(dtd)) {
|
525
|
+
quirks_mode = GUMBO_DOCTYPE_QUIRKS;
|
517
526
|
} else {
|
518
527
|
VALUE dtd_name = rb_funcall(dtd, name, 0);
|
519
528
|
VALUE pubid = rb_funcall(dtd, rb_intern_const("external_id"), 0);
|
@@ -560,13 +569,14 @@ fragment_continue(VALUE parse_args)
|
|
560
569
|
args->doc = NULL; // The Ruby runtime owns doc so make sure we don't delete it.
|
561
570
|
xmlNodePtr xml_frag = extract_xml_node(doc_fragment);
|
562
571
|
build_tree(xml_doc, xml_frag, output->root);
|
572
|
+
rb_iv_set(doc_fragment, "@quirks_mode", INT2NUM(output->document->v.document.doc_type_quirks_mode));
|
563
573
|
add_errors(output, doc_fragment, args->input, rb_utf8_str_new_static("#fragment", 9));
|
564
574
|
return Qnil;
|
565
575
|
}
|
566
576
|
|
567
577
|
// Initialize the Nokogumbo class and fetch constants we will use later.
|
568
578
|
void
|
569
|
-
noko_init_gumbo()
|
579
|
+
noko_init_gumbo(void)
|
570
580
|
{
|
571
581
|
// Class constants.
|
572
582
|
cNokogiriHtml5Document = rb_define_class_under(mNokogiriHtml5, "Document", cNokogiriHtml4Document);
|
@@ -577,7 +587,7 @@ noko_init_gumbo()
|
|
577
587
|
parent = rb_intern_const("parent");
|
578
588
|
|
579
589
|
// Define Nokogumbo module with parse and fragment methods.
|
580
|
-
rb_define_singleton_method(mNokogiriGumbo, "parse", parse,
|
590
|
+
rb_define_singleton_method(mNokogiriGumbo, "parse", parse, 6);
|
581
591
|
rb_define_singleton_method(mNokogiriGumbo, "fragment", fragment, 6);
|
582
592
|
}
|
583
593
|
|
@@ -146,11 +146,11 @@ rb_html_document_type(VALUE self)
|
|
146
146
|
{
|
147
147
|
htmlDocPtr doc;
|
148
148
|
Data_Get_Struct(self, xmlDoc, doc);
|
149
|
-
return INT2NUM(
|
149
|
+
return INT2NUM(doc->type);
|
150
150
|
}
|
151
151
|
|
152
152
|
void
|
153
|
-
noko_init_html_document()
|
153
|
+
noko_init_html_document(void)
|
154
154
|
{
|
155
155
|
assert(cNokogiriXmlDocument);
|
156
156
|
cNokogiriHtml4Document = rb_define_class_under(mNokogiriHtml4, "Document", cNokogiriXmlDocument);
|
@@ -270,7 +270,7 @@ get_description(VALUE klass, VALUE tag_name)
|
|
270
270
|
}
|
271
271
|
|
272
272
|
void
|
273
|
-
noko_init_html_element_description()
|
273
|
+
noko_init_html_element_description(void)
|
274
274
|
{
|
275
275
|
cNokogiriHtml4ElementDescription = rb_define_class_under(mNokogiriHtml4, "ElementDescription", rb_cObject);
|
276
276
|
|
@@ -20,7 +20,7 @@ get(VALUE _, VALUE rb_entity_name)
|
|
20
20
|
return Qnil;
|
21
21
|
}
|
22
22
|
|
23
|
-
rb_constructor_args[0] =
|
23
|
+
rb_constructor_args[0] = UINT2NUM(c_entity_desc->value);
|
24
24
|
rb_constructor_args[1] = NOKOGIRI_STR_NEW2(c_entity_desc->name);
|
25
25
|
rb_constructor_args[2] = NOKOGIRI_STR_NEW2(c_entity_desc->desc);
|
26
26
|
|
@@ -29,7 +29,7 @@ get(VALUE _, VALUE rb_entity_name)
|
|
29
29
|
}
|
30
30
|
|
31
31
|
void
|
32
|
-
noko_init_html_entity_lookup()
|
32
|
+
noko_init_html_entity_lookup(void)
|
33
33
|
{
|
34
34
|
cNokogiriHtml4EntityLookup = rb_define_class_under(mNokogiriHtml4, "EntityLookup", rb_cObject);
|
35
35
|
|
@@ -5,13 +5,8 @@ VALUE cNokogiriHtml4SaxParserContext ;
|
|
5
5
|
static void
|
6
6
|
deallocate(xmlParserCtxtPtr ctxt)
|
7
7
|
{
|
8
|
-
NOKOGIRI_DEBUG_START(ctxt);
|
9
|
-
|
10
8
|
ctxt->sax = NULL;
|
11
|
-
|
12
9
|
htmlFreeParserCtxt(ctxt);
|
13
|
-
|
14
|
-
NOKOGIRI_DEBUG_END(ctxt);
|
15
10
|
}
|
16
11
|
|
17
12
|
static VALUE
|
@@ -106,7 +101,7 @@ parse_with(VALUE self, VALUE sax_handler)
|
|
106
101
|
}
|
107
102
|
|
108
103
|
void
|
109
|
-
noko_init_html_sax_parser_context()
|
104
|
+
noko_init_html_sax_parser_context(void)
|
110
105
|
{
|
111
106
|
assert(cNokogiriXmlSaxParserContext);
|
112
107
|
cNokogiriHtml4SaxParserContext = rb_define_class_under(mNokogiriHtml4Sax, "ParserContext",
|
@@ -85,7 +85,7 @@ initialize_native(VALUE self, VALUE _xml_sax, VALUE _filename,
|
|
85
85
|
}
|
86
86
|
|
87
87
|
void
|
88
|
-
noko_init_html_sax_push_parser()
|
88
|
+
noko_init_html_sax_push_parser(void)
|
89
89
|
{
|
90
90
|
assert(cNokogiriXmlSaxPushParser);
|
91
91
|
cNokogiriHtml4SaxPushParser = rb_define_class_under(mNokogiriHtml4Sax, "PushParser", cNokogiriXmlSaxPushParser);
|