nokogiri 1.13.10-x86_64-darwin → 1.14.0-x86_64-darwin
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/Gemfile +33 -0
- data/LICENSE-DEPENDENCIES.md +830 -509
- data/LICENSE.md +1 -1
- data/README.md +18 -11
- data/dependencies.yml +25 -7
- data/ext/nokogiri/extconf.rb +80 -21
- data/ext/nokogiri/gumbo.c +19 -9
- data/ext/nokogiri/html4_document.c +1 -1
- data/ext/nokogiri/html4_entity_lookup.c +1 -1
- data/ext/nokogiri/html4_sax_parser_context.c +0 -5
- data/ext/nokogiri/nokogiri.c +33 -51
- data/ext/nokogiri/nokogiri.h +17 -14
- data/ext/nokogiri/xml_attribute_decl.c +1 -1
- data/ext/nokogiri/xml_cdata.c +1 -1
- data/ext/nokogiri/xml_document.c +16 -11
- data/ext/nokogiri/xml_element_content.c +2 -2
- data/ext/nokogiri/xml_element_decl.c +1 -1
- data/ext/nokogiri/xml_encoding_handler.c +2 -2
- data/ext/nokogiri/xml_namespace.c +38 -8
- data/ext/nokogiri/xml_node.c +286 -26
- data/ext/nokogiri/xml_node_set.c +0 -2
- data/ext/nokogiri/xml_reader.c +40 -20
- data/ext/nokogiri/xml_relax_ng.c +0 -2
- data/ext/nokogiri/xml_sax_parser.c +22 -16
- data/ext/nokogiri/xml_sax_parser_context.c +0 -5
- data/ext/nokogiri/xml_sax_push_parser.c +0 -2
- data/ext/nokogiri/xml_schema.c +0 -2
- data/ext/nokogiri/xml_xpath_context.c +87 -83
- data/ext/nokogiri/xslt_stylesheet.c +14 -13
- data/gumbo-parser/Makefile +10 -0
- data/lib/nokogiri/2.7/nokogiri.bundle +0 -0
- data/lib/nokogiri/3.0/nokogiri.bundle +0 -0
- data/lib/nokogiri/3.1/nokogiri.bundle +0 -0
- data/lib/nokogiri/{2.6 → 3.2}/nokogiri.bundle +0 -0
- data/lib/nokogiri/css/node.rb +2 -2
- data/lib/nokogiri/css/xpath_visitor.rb +5 -3
- data/lib/nokogiri/css.rb +6 -0
- data/lib/nokogiri/encoding_handler.rb +57 -0
- data/lib/nokogiri/extension.rb +3 -2
- data/lib/nokogiri/html4/document.rb +2 -121
- data/lib/nokogiri/html4/element_description_defaults.rb +6 -12
- data/lib/nokogiri/html4/encoding_reader.rb +121 -0
- data/lib/nokogiri/html4.rb +1 -0
- data/lib/nokogiri/html5/document.rb +113 -36
- data/lib/nokogiri/html5/document_fragment.rb +9 -2
- data/lib/nokogiri/html5/node.rb +3 -5
- data/lib/nokogiri/html5.rb +127 -216
- data/lib/nokogiri/jruby/dependencies.rb +1 -19
- data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
- data/lib/nokogiri/version/constant.rb +1 -1
- data/lib/nokogiri/version/info.rb +11 -10
- data/lib/nokogiri/xml/attr.rb +49 -0
- data/lib/nokogiri/xml/builder.rb +1 -1
- data/lib/nokogiri/xml/document.rb +102 -54
- data/lib/nokogiri/xml/document_fragment.rb +49 -6
- data/lib/nokogiri/xml/namespace.rb +42 -0
- data/lib/nokogiri/xml/node/save_options.rb +6 -4
- data/lib/nokogiri/xml/node.rb +190 -35
- data/lib/nokogiri/xml/node_set.rb +87 -9
- data/lib/nokogiri/xml/parse_options.rb +129 -50
- data/lib/nokogiri/xml/pp/node.rb +6 -4
- data/lib/nokogiri/xml/processing_instruction.rb +2 -1
- data/lib/nokogiri/xml/sax/parser.rb +2 -3
- data/lib/nokogiri/xslt.rb +1 -1
- data/lib/nokogiri.rb +3 -11
- data/lib/xsd/xmlparser/nokogiri.rb +3 -1
- metadata +13 -248
data/LICENSE.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
The MIT License
|
2
2
|
|
3
|
-
Copyright 2008 --
|
3
|
+
Copyright 2008 -- 2023 by Mike Dalessio, Aaron Patterson, Yoko Harada, Akinori MUSHA, John Shahid, Karol Bucek, Sam Ruby, Craig Barnes, Stephen Checkoway, Lars Kanis, Sergio Arbeo, Timothy Elliott, Nobuyoshi Nakada, Charles Nutter, Patrick Mahoney.
|
4
4
|
|
5
5
|
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
6
6
|
|
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
# Nokogiri
|
4
4
|
|
5
|
-
Nokogiri (鋸) makes it easy and painless to work with XML and HTML from Ruby. It provides a sensible, easy-to-understand API for [reading](https://nokogiri.org/tutorials/parsing_an_html_xml_document.html), writing, [modifying](https://nokogiri.org/tutorials/modifying_an_html_xml_document.html), and [querying](https://nokogiri.org/tutorials/searching_a_xml_html_document.html) documents. It is fast and standards-compliant by relying on native parsers like libxml2
|
5
|
+
Nokogiri (鋸) makes it easy and painless to work with XML and HTML from Ruby. It provides a sensible, easy-to-understand API for [reading](https://nokogiri.org/tutorials/parsing_an_html_xml_document.html), writing, [modifying](https://nokogiri.org/tutorials/modifying_an_html_xml_document.html), and [querying](https://nokogiri.org/tutorials/searching_a_xml_html_document.html) documents. It is fast and standards-compliant by relying on native parsers like libxml2, libgumbo, and xerces.
|
6
6
|
|
7
7
|
## Guiding Principles
|
8
8
|
|
@@ -40,10 +40,6 @@ Some guiding principles Nokogiri tries to follow:
|
|
40
40
|
|
41
41
|
All official documentation is posted at https://nokogiri.org (the source for which is at https://github.com/sparklemotion/nokogiri.org/, and we welcome contributions).
|
42
42
|
|
43
|
-
Consider subscribing to [Tidelift][tidelift] which provides license assurances and timely security notifications for your open source dependencies, including Nokogiri. [Tidelift][tidelift] subscriptions also help the Nokogiri maintainers fund our [automated testing](https://ci.nokogiri.org) which in turn allows us to ship releases, bugfixes, and security updates more often.
|
44
|
-
|
45
|
-
[tidelift]: https://tidelift.com/subscription/pkg/rubygems-nokogiri?utm_source=rubygems-nokogiri&utm_medium=referral&utm_campaign=readme
|
46
|
-
|
47
43
|
### Reading
|
48
44
|
|
49
45
|
Your first stops for learning more about Nokogiri should be:
|
@@ -57,7 +53,6 @@ Your first stops for learning more about Nokogiri should be:
|
|
57
53
|
|
58
54
|
There are a few ways to ask exploratory questions:
|
59
55
|
|
60
|
-
- The Ruby Discord chat server is active at https://discord.gg/UyQnKrT
|
61
56
|
- The Nokogiri mailing list is active at https://groups.google.com/group/nokogiri-talk
|
62
57
|
- Open an issue using the "Help Request" template at https://github.com/sparklemotion/nokogiri/issues
|
63
58
|
|
@@ -103,12 +98,21 @@ We bump `Major.Minor.Patch` versions following this guidance:
|
|
103
98
|
- Updating packaged libraries for security-related reasons.
|
104
99
|
|
105
100
|
|
101
|
+
### Sponsorship
|
102
|
+
|
103
|
+
You can help sponsor the maintainers of this software through one of these organizations:
|
104
|
+
|
105
|
+
- [github.com/sponsors/flavorjones](https://github.com/sponsors/flavorjones)
|
106
|
+
- [opencollective.com/nokogiri](https://opencollective.com/nokogiri)
|
107
|
+
- [tidelift.com/subscription/pkg/rubygems-nokogiri](https://tidelift.com/subscription/pkg/rubygems-nokogiri?utm_source=rubygems-nokogiri&utm_medium=referral&utm_campaign=readme)
|
108
|
+
|
109
|
+
|
106
110
|
## Installation
|
107
111
|
|
108
112
|
Requirements:
|
109
113
|
|
110
|
-
- Ruby >= 2.
|
111
|
-
- JRuby >= 9.
|
114
|
+
- Ruby >= 2.7
|
115
|
+
- JRuby >= 9.4.0.0
|
112
116
|
|
113
117
|
|
114
118
|
### Native Gems: Faster, more reliable installation
|
@@ -119,10 +123,13 @@ Requirements:
|
|
119
123
|
|
120
124
|
Nokogiri ships pre-compiled, "native" gems for the following platforms:
|
121
125
|
|
122
|
-
- Linux:
|
126
|
+
- Linux:
|
127
|
+
- `x86-linux` and `x86_64-linux` (req: `glibc >= 2.17`)
|
128
|
+
- `aarch64-linux` and `arm-linux` (req: `glibc >= 2.29`)
|
129
|
+
- Note that musl platforms like Alpine **are** supported
|
123
130
|
- Darwin/MacOS: `x86_64-darwin` and `arm64-darwin`
|
124
|
-
- Windows: `x86-mingw32` and `x64-
|
125
|
-
- Java: any platform running JRuby 9.
|
131
|
+
- Windows: `x86-mingw32`, `x64-mingw32`, and `x64-mingw-ucrt`
|
132
|
+
- Java: any platform running JRuby 9.4 or higher
|
126
133
|
|
127
134
|
To determine whether your system supports one of these gems, look at the output of `bundle platform` or `ruby -e 'puts Gem::Platform.local.to_s'`.
|
128
135
|
|
data/dependencies.yml
CHANGED
@@ -14,10 +14,28 @@ zlib:
|
|
14
14
|
# SHA-256 hash provided on http://zlib.net/
|
15
15
|
|
16
16
|
libiconv:
|
17
|
-
version: "1.
|
18
|
-
sha256: "
|
19
|
-
#
|
20
|
-
#
|
21
|
-
#
|
22
|
-
#
|
23
|
-
#
|
17
|
+
version: "1.17"
|
18
|
+
sha256: "8f74213b56238c85a50a5329f77e06198771e70dd9a739779f4c02f65d971313"
|
19
|
+
# signature verified by following this path:
|
20
|
+
# - release announced at https://savannah.gnu.org/forum/forum.php?forum_id=10175
|
21
|
+
# - which links to https://savannah.gnu.org/users/haible as the releaser
|
22
|
+
# - which links to https://savannah.gnu.org/people/viewgpg.php?user_id=1871 as the gpg key
|
23
|
+
#
|
24
|
+
# So:
|
25
|
+
# - wget -q -O - https://savannah.gnu.org/people/viewgpg.php?user_id=1871 | gpg --import
|
26
|
+
# gpg: key F5BE8B267C6A406D: 1 signature not checked due to a missing key
|
27
|
+
# gpg: key F5BE8B267C6A406D: public key "Bruno Haible (Open Source Development) <bruno@clisp.org>" imported
|
28
|
+
# gpg: Total number processed: 1
|
29
|
+
# gpg: imported: 1
|
30
|
+
# gpg: marginals needed: 3 completes needed: 1 trust model: pgp
|
31
|
+
# gpg: depth: 0 valid: 4 signed: 0 trust: 0-, 0q, 0n, 0m, 0f, 4u
|
32
|
+
# gpg: next trustdb check due at 2024-05-09
|
33
|
+
# - gpg --verify libiconv-1.17.tar.gz.sig ports/archives/libiconv-1.17.tar.gz
|
34
|
+
# gpg: Signature made Sun 15 May 2022 11:26:42 AM EDT
|
35
|
+
# gpg: using RSA key 9001B85AF9E1B83DF1BDA942F5BE8B267C6A406D
|
36
|
+
# gpg: Good signature from "Bruno Haible (Open Source Development) <bruno@clisp.org>" [unknown]
|
37
|
+
# gpg: WARNING: This key is not certified with a trusted signature!
|
38
|
+
# gpg: There is no indication that the signature belongs to the owner.
|
39
|
+
# Primary key fingerprint: 9001 B85A F9E1 B83D F1BD A942 F5BE 8B26 7C6A 406D
|
40
|
+
#
|
41
|
+
# And this sha256sum is calculated from that verified tarball.
|
data/ext/nokogiri/extconf.rb
CHANGED
@@ -200,7 +200,7 @@ def nix?
|
|
200
200
|
end
|
201
201
|
|
202
202
|
def truffle?
|
203
|
-
|
203
|
+
RUBY_ENGINE == "truffleruby"
|
204
204
|
end
|
205
205
|
|
206
206
|
def concat_flags(*args)
|
@@ -211,6 +211,16 @@ def local_have_library(lib, func = nil, headers = nil)
|
|
211
211
|
have_library(lib, func, headers) || have_library("lib#{lib}", func, headers)
|
212
212
|
end
|
213
213
|
|
214
|
+
def zlib_source(version_string)
|
215
|
+
# As of 2022-12, I'm starting to see failed downloads often enough from zlib.net that I want to
|
216
|
+
# change the default to github.
|
217
|
+
if ENV["NOKOGIRI_USE_CANONICAL_ZLIB_SOURCE"]
|
218
|
+
"https://zlib.net/fossils/zlib-#{version_string}.tar.gz"
|
219
|
+
else
|
220
|
+
"https://github.com/madler/zlib/releases/download/v#{version_string}/zlib-#{version_string}.tar.gz"
|
221
|
+
end
|
222
|
+
end
|
223
|
+
|
214
224
|
def gnome_source
|
215
225
|
# As of 2022-02-20, some mirrors have expired SSL certificates. I'm able to retrieve from my home,
|
216
226
|
# but the github actions workers, whatever host they resolve, see an expired cert.
|
@@ -400,8 +410,8 @@ def iconv_configure_flags
|
|
400
410
|
return ["--with-iconv=yes"]
|
401
411
|
end
|
402
412
|
|
403
|
-
config = preserving_globals {
|
404
|
-
if config && try_link_iconv("pkg-config libiconv") {
|
413
|
+
config = preserving_globals { pkg_config("libiconv") }
|
414
|
+
if config && try_link_iconv("pkg-config libiconv") { pkg_config("libiconv") }
|
405
415
|
cflags, ldflags, libs = config
|
406
416
|
|
407
417
|
return [
|
@@ -430,10 +440,12 @@ def process_recipe(name, version, static_p, cross_p, cacheable_p = true)
|
|
430
440
|
"#{@target}/#{RUBY_PLATFORM}/#{@name}/#{@version}"
|
431
441
|
end
|
432
442
|
|
433
|
-
|
434
|
-
#
|
435
|
-
# use host if not set.
|
443
|
+
# We use 'host' to set compiler prefix for cross-compiling. Prefer host_alias over host. And
|
444
|
+
# prefer i686 (what external dev tools use) to i386 (what ruby's configure.ac emits).
|
436
445
|
recipe.host = RbConfig::CONFIG["host_alias"].empty? ? RbConfig::CONFIG["host"] : RbConfig::CONFIG["host_alias"]
|
446
|
+
recipe.host = recipe.host.gsub(/i386/, "i686")
|
447
|
+
|
448
|
+
recipe.target = File.join(PACKAGE_ROOT_DIR, "ports") if cacheable_p
|
437
449
|
recipe.configure_options << "--libdir=#{File.join(recipe.path, "lib")}"
|
438
450
|
|
439
451
|
yield recipe
|
@@ -525,7 +537,6 @@ def process_recipe(name, version, static_p, cross_p, cacheable_p = true)
|
|
525
537
|
|
526
538
|
EOM
|
527
539
|
|
528
|
-
pp(recipe.files)
|
529
540
|
chdir_for_build { recipe.cook }
|
530
541
|
FileUtils.touch(checkpoint)
|
531
542
|
end
|
@@ -573,6 +584,34 @@ def do_clean
|
|
573
584
|
exit!(0)
|
574
585
|
end
|
575
586
|
|
587
|
+
# In ruby 3.2, symbol resolution changed on Darwin, to introduce the `-bundle_loader` flag to
|
588
|
+
# resolve symbols against the ruby binary.
|
589
|
+
#
|
590
|
+
# This makes it challenging to build a single extension that works with both a ruby with
|
591
|
+
# `--enable-shared` and one with `--disable-shared`. To work around that, we choose to add
|
592
|
+
# `-flat_namespace` to the link line (later in this file).
|
593
|
+
#
|
594
|
+
# The `-flat_namespace` line introduces its own behavior change, which is that (similar to on
|
595
|
+
# Linux), any symbols in the extension that are exported may now be resolved by shared libraries
|
596
|
+
# loaded by the Ruby process. Specifically, that means that libxml2 and libxslt, which are
|
597
|
+
# statically linked into the nokogiri bundle, will resolve (at runtime) to a system libxml2 loaded
|
598
|
+
# by Ruby on Darwin. And it appears that often Ruby on Darwin does indeed load the system libxml2,
|
599
|
+
# and that messes with our assumptions about whether we're running with a patched libxml2 or a
|
600
|
+
# vanilla libxml2.
|
601
|
+
#
|
602
|
+
# We choose to use `-load_hidden` in this case to prevent exporting those symbols from libxml2 and
|
603
|
+
# libxslt, which ensures that they will be resolved to the static libraries in the bundle. In other
|
604
|
+
# words, when we use `-load_hidden`, what happens in the extension stays in the extension.
|
605
|
+
#
|
606
|
+
# See https://github.com/rake-compiler/rake-compiler-dock/issues/87 for more info.
|
607
|
+
#
|
608
|
+
# Anyway, this method is the logical bit to tell us when to turn on these workarounds.
|
609
|
+
def needs_darwin_linker_hack
|
610
|
+
config_cross_build? &&
|
611
|
+
darwin? &&
|
612
|
+
Gem::Requirement.new("~> 3.2").satisfied_by?(Gem::Version.new(RbConfig::CONFIG["ruby_version"].split("+").first))
|
613
|
+
end
|
614
|
+
|
576
615
|
#
|
577
616
|
# main
|
578
617
|
#
|
@@ -580,7 +619,7 @@ do_help if arg_config("--help")
|
|
580
619
|
do_clean if arg_config("--clean")
|
581
620
|
|
582
621
|
if openbsd? && !config_system_libraries?
|
583
|
-
|
622
|
+
unless %x(#{ENV["CC"] || "/usr/bin/cc"} -v 2>&1).include?("clang")
|
584
623
|
(ENV["CC"] ||= find_executable("egcc")) ||
|
585
624
|
abort("Please install gcc 4.9+ from ports using `pkg_add -v gcc`")
|
586
625
|
end
|
@@ -616,6 +655,9 @@ $LIBS = concat_flags($LIBS, ENV["LIBS"])
|
|
616
655
|
# errors/warnings. see #2302
|
617
656
|
append_cflags(["-std=c99", "-Wno-declaration-after-statement"])
|
618
657
|
|
658
|
+
# gumbo html5 serialization is slower with O3, let's make sure we use O2
|
659
|
+
append_cflags("-O2")
|
660
|
+
|
619
661
|
# always include debugging information
|
620
662
|
append_cflags("-g")
|
621
663
|
|
@@ -625,8 +667,18 @@ append_cflags("-Winline")
|
|
625
667
|
# good to have no matter what Ruby was compiled with
|
626
668
|
append_cflags("-Wmissing-noreturn")
|
627
669
|
|
670
|
+
# check integer loss of precision
|
671
|
+
if darwin?
|
672
|
+
append_cflags("-Wshorten-64-to-32")
|
673
|
+
else
|
674
|
+
append_cflags("-Wconversion -Wno-sign-conversion")
|
675
|
+
end
|
676
|
+
|
628
677
|
# handle clang variations, see #1101
|
629
|
-
|
678
|
+
if darwin?
|
679
|
+
append_cflags("-Wno-error=unused-command-line-argument-hard-error-in-future")
|
680
|
+
append_cflags("-Wno-unknown-warning-option")
|
681
|
+
end
|
630
682
|
|
631
683
|
# these tend to be noisy, but on occasion useful during development
|
632
684
|
# append_cflags(["-Wcast-qual", "-Wwrite-strings"])
|
@@ -666,6 +718,10 @@ else
|
|
666
718
|
cross_build_p = config_cross_build?
|
667
719
|
message "Cross build is #{cross_build_p ? "enabled" : "disabled"}.\n"
|
668
720
|
|
721
|
+
if needs_darwin_linker_hack
|
722
|
+
append_ldflags("-Wl,-flat_namespace")
|
723
|
+
end
|
724
|
+
|
669
725
|
require "yaml"
|
670
726
|
dependencies = YAML.load_file(File.join(PACKAGE_ROOT_DIR, "dependencies.yml"))
|
671
727
|
|
@@ -674,7 +730,7 @@ else
|
|
674
730
|
if cross_build_p || windows?
|
675
731
|
zlib_recipe = process_recipe("zlib", dependencies["zlib"]["version"], static_p, cross_build_p) do |recipe|
|
676
732
|
recipe.files = [{
|
677
|
-
url:
|
733
|
+
url: zlib_source(recipe.version),
|
678
734
|
sha256: dependencies["zlib"]["sha256"],
|
679
735
|
}]
|
680
736
|
if windows?
|
@@ -873,9 +929,13 @@ else
|
|
873
929
|
$libs = $libs.shellsplit.tap do |libs|
|
874
930
|
[libxml2_recipe, libxslt_recipe].each do |recipe|
|
875
931
|
libname = recipe.name[/\Alib(.+)\z/, 1]
|
876
|
-
|
932
|
+
config_basename = "#{libname}-config"
|
933
|
+
File.join(recipe.path, "bin", config_basename).tap do |config|
|
877
934
|
# call config scripts explicitly with 'sh' for compat with Windows
|
878
|
-
|
935
|
+
cflags = %x(sh #{config} --cflags).strip
|
936
|
+
message("#{config_basename} cflags: #{cflags}\n")
|
937
|
+
$CPPFLAGS = concat_flags(cflags, $CPPFLAGS) # prepend
|
938
|
+
|
879
939
|
%x(sh #{config} --libs).strip.shellsplit.each do |arg|
|
880
940
|
case arg
|
881
941
|
when /\A-L(.+)\z/
|
@@ -894,7 +954,7 @@ else
|
|
894
954
|
end
|
895
955
|
|
896
956
|
patches_string = recipe.patch_files.map { |path| File.basename(path) }.join(" ")
|
897
|
-
append_cppflags(%[-DNOKOGIRI_#{recipe.name.upcase}_PATCHES="
|
957
|
+
append_cppflags(%[-DNOKOGIRI_#{recipe.name.upcase}_PATCHES="\\"#{patches_string}\\""])
|
898
958
|
|
899
959
|
case libname
|
900
960
|
when "xml2"
|
@@ -913,16 +973,17 @@ else
|
|
913
973
|
end.shelljoin
|
914
974
|
|
915
975
|
if static_p
|
976
|
+
static_archive_ld_flag = needs_darwin_linker_hack ? ["-load_hidden"] : []
|
916
977
|
$libs = $libs.shellsplit.map do |arg|
|
917
978
|
case arg
|
918
979
|
when "-lxml2"
|
919
|
-
File.join(libxml2_recipe.path, "lib", libflag_to_filename(arg))
|
980
|
+
static_archive_ld_flag + [File.join(libxml2_recipe.path, "lib", libflag_to_filename(arg))]
|
920
981
|
when "-lxslt", "-lexslt"
|
921
|
-
File.join(libxslt_recipe.path, "lib", libflag_to_filename(arg))
|
982
|
+
static_archive_ld_flag + [File.join(libxslt_recipe.path, "lib", libflag_to_filename(arg))]
|
922
983
|
else
|
923
984
|
arg
|
924
985
|
end
|
925
|
-
end.shelljoin
|
986
|
+
end.flatten.shelljoin
|
926
987
|
end
|
927
988
|
|
928
989
|
ensure_func("xmlParseDoc", "libxml/parser.h")
|
@@ -958,7 +1019,7 @@ libgumbo_recipe = process_recipe("libgumbo", "1.0.0-nokogiri", static_p, cross_b
|
|
958
1019
|
end
|
959
1020
|
|
960
1021
|
def compile
|
961
|
-
cflags = concat_flags(ENV["CFLAGS"], "-fPIC", "-g")
|
1022
|
+
cflags = concat_flags(ENV["CFLAGS"], "-fPIC", "-O2", "-g")
|
962
1023
|
|
963
1024
|
env = { "CC" => gcc_cmd, "CFLAGS" => cflags }
|
964
1025
|
if config_cross_build?
|
@@ -978,7 +1039,7 @@ end
|
|
978
1039
|
append_cppflags("-I#{File.join(libgumbo_recipe.path, "include")}")
|
979
1040
|
$libs = $libs + " " + File.join(libgumbo_recipe.path, "lib", "libgumbo.a")
|
980
1041
|
$LIBPATH = $LIBPATH | [File.join(libgumbo_recipe.path, "lib")]
|
981
|
-
ensure_func("gumbo_parse_with_options", "
|
1042
|
+
ensure_func("gumbo_parse_with_options", "nokogiri_gumbo.h")
|
982
1043
|
|
983
1044
|
have_func("xmlHasFeature") || abort("xmlHasFeature() is missing.") # introduced in libxml 2.6.21
|
984
1045
|
have_func("xmlFirstElementChild") # introduced in libxml 2.7.3
|
@@ -989,10 +1050,8 @@ have_func("xmlSchemaSetParserStructuredErrors") # introduced in libxml 2.6.23
|
|
989
1050
|
have_func("rb_gc_location") # introduced in Ruby 2.7
|
990
1051
|
have_func("rb_category_warning") # introduced in Ruby 3.0
|
991
1052
|
|
992
|
-
have_func("vasprintf")
|
993
|
-
|
994
1053
|
other_library_versions_string = OTHER_LIBRARY_VERSIONS.map { |k, v| [k, v].join(":") }.join(",")
|
995
|
-
append_cppflags(%[-DNOKOGIRI_OTHER_LIBRARY_VERSIONS="
|
1054
|
+
append_cppflags(%[-DNOKOGIRI_OTHER_LIBRARY_VERSIONS="\\"#{other_library_versions_string}\\""])
|
996
1055
|
|
997
1056
|
unless config_system_libraries?
|
998
1057
|
if cross_build_p
|
data/ext/nokogiri/gumbo.c
CHANGED
@@ -23,13 +23,13 @@
|
|
23
23
|
//
|
24
24
|
// Processing starts by calling gumbo_parse_with_options. The resulting document tree
|
25
25
|
// is then walked, a parallel libxml2 tree is constructed, and the final document is
|
26
|
-
// then wrapped using
|
26
|
+
// then wrapped using noko_xml_document_wrap. This approach reduces memory and CPU
|
27
27
|
// requirements as Ruby objects are only built when necessary.
|
28
28
|
//
|
29
29
|
|
30
30
|
#include <nokogiri.h>
|
31
31
|
|
32
|
-
#include "
|
32
|
+
#include "nokogiri_gumbo.h"
|
33
33
|
|
34
34
|
VALUE cNokogiriHtml5Document;
|
35
35
|
|
@@ -281,12 +281,12 @@ add_errors(const GumboOutput *output, VALUE rdoc, VALUE input, VALUE url)
|
|
281
281
|
rb_iv_set(syntax_error, "@code", INT2NUM(1)); // XML_ERR_INTERNAL_ERROR
|
282
282
|
rb_iv_set(syntax_error, "@level", INT2NUM(2)); // XML_ERR_ERROR
|
283
283
|
rb_iv_set(syntax_error, "@file", url);
|
284
|
-
rb_iv_set(syntax_error, "@line",
|
284
|
+
rb_iv_set(syntax_error, "@line", SIZET2NUM(position.line));
|
285
285
|
rb_iv_set(syntax_error, "@str1", str1);
|
286
286
|
rb_iv_set(syntax_error, "@str2", Qnil);
|
287
287
|
rb_iv_set(syntax_error, "@str3", Qnil);
|
288
288
|
rb_iv_set(syntax_error, "@int1", INT2NUM(0));
|
289
|
-
rb_iv_set(syntax_error, "@column",
|
289
|
+
rb_iv_set(syntax_error, "@column", SIZET2NUM(position.column));
|
290
290
|
rb_ary_push(rerrors, syntax_error);
|
291
291
|
}
|
292
292
|
rb_iv_set(rdoc, "@errors", rerrors);
|
@@ -297,6 +297,7 @@ typedef struct {
|
|
297
297
|
GumboOutput *output;
|
298
298
|
VALUE input;
|
299
299
|
VALUE url_or_frag;
|
300
|
+
VALUE klass;
|
300
301
|
xmlDocPtr doc;
|
301
302
|
} ParseArgs;
|
302
303
|
|
@@ -321,7 +322,7 @@ static VALUE parse_continue(VALUE parse_args);
|
|
321
322
|
* @!visibility protected
|
322
323
|
*/
|
323
324
|
static VALUE
|
324
|
-
parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth)
|
325
|
+
parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth, VALUE klass)
|
325
326
|
{
|
326
327
|
GumboOptions options = kGumboDefaultOptions;
|
327
328
|
options.max_attributes = NUM2INT(max_attributes);
|
@@ -333,6 +334,7 @@ parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors
|
|
333
334
|
.output = output,
|
334
335
|
.input = input,
|
335
336
|
.url_or_frag = url,
|
337
|
+
.klass = klass,
|
336
338
|
.doc = NULL,
|
337
339
|
};
|
338
340
|
|
@@ -357,7 +359,9 @@ parse_continue(VALUE parse_args)
|
|
357
359
|
}
|
358
360
|
args->doc = doc; // Make sure doc gets cleaned up if an error is thrown.
|
359
361
|
build_tree(doc, (xmlNodePtr)doc, output->document);
|
360
|
-
VALUE rdoc =
|
362
|
+
VALUE rdoc = noko_xml_document_wrap(args->klass, doc);
|
363
|
+
rb_iv_set(rdoc, "@url", args->url_or_frag);
|
364
|
+
rb_iv_set(rdoc, "@quirks_mode", INT2NUM(output->document->v.document.doc_type_quirks_mode));
|
361
365
|
args->doc = NULL; // The Ruby runtime now owns doc so don't delete it.
|
362
366
|
add_errors(output, rdoc, args->input, args->url_or_frag);
|
363
367
|
return rdoc;
|
@@ -498,9 +502,11 @@ error:
|
|
498
502
|
}
|
499
503
|
|
500
504
|
// Encoding.
|
501
|
-
if (
|
505
|
+
if (ctx_ns == GUMBO_NAMESPACE_MATHML
|
506
|
+
&& RSTRING_LEN(tag_name) == 14
|
502
507
|
&& !st_strcasecmp(ctx_tag, "annotation-xml")) {
|
503
508
|
VALUE enc = rb_funcall(ctx, rb_intern_const("[]"),
|
509
|
+
1,
|
504
510
|
rb_utf8_str_new_static("encoding", 8));
|
505
511
|
if (RTEST(enc)) {
|
506
512
|
Check_Type(enc, T_STRING);
|
@@ -512,8 +518,11 @@ error:
|
|
512
518
|
// Quirks mode.
|
513
519
|
VALUE doc = rb_funcall(doc_fragment, rb_intern_const("document"), 0);
|
514
520
|
VALUE dtd = rb_funcall(doc, internal_subset, 0);
|
515
|
-
|
521
|
+
VALUE doc_quirks_mode = rb_iv_get(doc, "@quirks_mode");
|
522
|
+
if (NIL_P(ctx) || NIL_P(doc_quirks_mode)) {
|
516
523
|
quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS;
|
524
|
+
} else if (NIL_P(dtd)) {
|
525
|
+
quirks_mode = GUMBO_DOCTYPE_QUIRKS;
|
517
526
|
} else {
|
518
527
|
VALUE dtd_name = rb_funcall(dtd, name, 0);
|
519
528
|
VALUE pubid = rb_funcall(dtd, rb_intern_const("external_id"), 0);
|
@@ -560,6 +569,7 @@ fragment_continue(VALUE parse_args)
|
|
560
569
|
args->doc = NULL; // The Ruby runtime owns doc so make sure we don't delete it.
|
561
570
|
xmlNodePtr xml_frag = extract_xml_node(doc_fragment);
|
562
571
|
build_tree(xml_doc, xml_frag, output->root);
|
572
|
+
rb_iv_set(doc_fragment, "@quirks_mode", INT2NUM(output->document->v.document.doc_type_quirks_mode));
|
563
573
|
add_errors(output, doc_fragment, args->input, rb_utf8_str_new_static("#fragment", 9));
|
564
574
|
return Qnil;
|
565
575
|
}
|
@@ -577,7 +587,7 @@ noko_init_gumbo()
|
|
577
587
|
parent = rb_intern_const("parent");
|
578
588
|
|
579
589
|
// Define Nokogumbo module with parse and fragment methods.
|
580
|
-
rb_define_singleton_method(mNokogiriGumbo, "parse", parse,
|
590
|
+
rb_define_singleton_method(mNokogiriGumbo, "parse", parse, 6);
|
581
591
|
rb_define_singleton_method(mNokogiriGumbo, "fragment", fragment, 6);
|
582
592
|
}
|
583
593
|
|
@@ -20,7 +20,7 @@ get(VALUE _, VALUE rb_entity_name)
|
|
20
20
|
return Qnil;
|
21
21
|
}
|
22
22
|
|
23
|
-
rb_constructor_args[0] =
|
23
|
+
rb_constructor_args[0] = UINT2NUM(c_entity_desc->value);
|
24
24
|
rb_constructor_args[1] = NOKOGIRI_STR_NEW2(c_entity_desc->name);
|
25
25
|
rb_constructor_args[2] = NOKOGIRI_STR_NEW2(c_entity_desc->desc);
|
26
26
|
|
data/ext/nokogiri/nokogiri.c
CHANGED
@@ -49,34 +49,11 @@ void noko_init_html_sax_push_parser(void);
|
|
49
49
|
void noko_init_gumbo(void);
|
50
50
|
void noko_init_test_global_handlers(void);
|
51
51
|
|
52
|
-
static ID id_read, id_write;
|
53
|
-
|
54
|
-
|
55
|
-
#ifndef HAVE_VASPRINTF
|
56
|
-
/*
|
57
|
-
* Thank you Geoffroy Couprie for this implementation of vasprintf!
|
58
|
-
*/
|
59
|
-
int
|
60
|
-
vasprintf(char **strp, const char *fmt, va_list ap)
|
61
|
-
{
|
62
|
-
/* Mingw32/64 have a broken vsnprintf implementation that fails when
|
63
|
-
* using a zero-byte limit in order to retrieve the required size for malloc.
|
64
|
-
* So we use a one byte buffer instead.
|
65
|
-
*/
|
66
|
-
char tmp[1];
|
67
|
-
int len = vsnprintf(tmp, 1, fmt, ap) + 1;
|
68
|
-
char *res = (char *)malloc((unsigned int)len);
|
69
|
-
if (res == NULL) {
|
70
|
-
return -1;
|
71
|
-
}
|
72
|
-
*strp = res;
|
73
|
-
return vsnprintf(res, (unsigned int)len, fmt, ap);
|
74
|
-
}
|
75
|
-
#endif
|
52
|
+
static ID id_read, id_write, id_external_encoding;
|
76
53
|
|
77
54
|
|
78
55
|
static VALUE
|
79
|
-
|
56
|
+
noko_io_read_check(VALUE val)
|
80
57
|
{
|
81
58
|
VALUE *args = (VALUE *)val;
|
82
59
|
return rb_funcall(args[0], id_read, 1, args[1]);
|
@@ -84,68 +61,72 @@ read_check(VALUE val)
|
|
84
61
|
|
85
62
|
|
86
63
|
static VALUE
|
87
|
-
|
64
|
+
noko_io_read_failed(VALUE arg, VALUE exc)
|
88
65
|
{
|
89
66
|
return Qundef;
|
90
67
|
}
|
91
68
|
|
92
69
|
|
93
70
|
int
|
94
|
-
noko_io_read(void *
|
71
|
+
noko_io_read(void *io, char *c_buffer, int c_buffer_len)
|
95
72
|
{
|
96
|
-
VALUE
|
97
|
-
|
73
|
+
VALUE rb_io = (VALUE)io;
|
74
|
+
VALUE rb_read_string, rb_args[2];
|
75
|
+
size_t n_bytes_read, safe_len;
|
98
76
|
|
99
|
-
|
100
|
-
|
77
|
+
rb_args[0] = rb_io;
|
78
|
+
rb_args[1] = INT2NUM(c_buffer_len);
|
101
79
|
|
102
|
-
|
80
|
+
rb_read_string = rb_rescue(noko_io_read_check, (VALUE)rb_args, noko_io_read_failed, 0);
|
103
81
|
|
104
|
-
if (NIL_P(
|
105
|
-
if (
|
106
|
-
if (TYPE(
|
82
|
+
if (NIL_P(rb_read_string)) { return 0; }
|
83
|
+
if (rb_read_string == Qundef) { return -1; }
|
84
|
+
if (TYPE(rb_read_string) != T_STRING) { return -1; }
|
107
85
|
|
108
|
-
|
109
|
-
safe_len =
|
110
|
-
memcpy(
|
86
|
+
n_bytes_read = (size_t)RSTRING_LEN(rb_read_string);
|
87
|
+
safe_len = (n_bytes_read > (size_t)c_buffer_len) ? (size_t)c_buffer_len : n_bytes_read;
|
88
|
+
memcpy(c_buffer, StringValuePtr(rb_read_string), safe_len);
|
111
89
|
|
112
90
|
return (int)safe_len;
|
113
91
|
}
|
114
92
|
|
115
93
|
|
116
94
|
static VALUE
|
117
|
-
|
95
|
+
noko_io_write_check(VALUE rb_args)
|
118
96
|
{
|
119
|
-
VALUE
|
120
|
-
|
97
|
+
VALUE rb_io = ((VALUE *)rb_args)[0];
|
98
|
+
VALUE rb_output = ((VALUE *)rb_args)[1];
|
99
|
+
return rb_funcall(rb_io, id_write, 1, rb_output);
|
121
100
|
}
|
122
101
|
|
123
102
|
|
124
103
|
static VALUE
|
125
|
-
|
104
|
+
noko_io_write_failed(VALUE arg, VALUE exc)
|
126
105
|
{
|
127
106
|
return Qundef;
|
128
107
|
}
|
129
108
|
|
130
109
|
|
131
110
|
int
|
132
|
-
noko_io_write(void *
|
111
|
+
noko_io_write(void *io, char *c_buffer, int c_buffer_len)
|
133
112
|
{
|
134
|
-
VALUE
|
135
|
-
|
136
|
-
|
137
|
-
|
113
|
+
VALUE rb_args[2], rb_n_bytes_written;
|
114
|
+
VALUE rb_io = (VALUE)io;
|
115
|
+
VALUE rb_enc = rb_funcall(rb_io, id_external_encoding, 0);
|
116
|
+
rb_encoding *io_encoding = RB_NIL_P(rb_enc) ? rb_ascii8bit_encoding() : rb_to_encoding(rb_enc);
|
138
117
|
|
139
|
-
|
118
|
+
rb_args[0] = rb_io;
|
119
|
+
rb_args[1] = rb_enc_str_new(c_buffer, (long)c_buffer_len, io_encoding);
|
140
120
|
|
141
|
-
|
121
|
+
rb_n_bytes_written = rb_rescue(noko_io_write_check, (VALUE)rb_args, noko_io_write_failed, 0);
|
122
|
+
if (rb_n_bytes_written == Qundef) { return -1; }
|
142
123
|
|
143
|
-
return NUM2INT(
|
124
|
+
return NUM2INT(rb_n_bytes_written);
|
144
125
|
}
|
145
126
|
|
146
127
|
|
147
128
|
int
|
148
|
-
noko_io_close(void *
|
129
|
+
noko_io_close(void *io)
|
149
130
|
{
|
150
131
|
return 0;
|
151
132
|
}
|
@@ -275,4 +256,5 @@ Init_nokogiri()
|
|
275
256
|
|
276
257
|
id_read = rb_intern("read");
|
277
258
|
id_write = rb_intern("write");
|
259
|
+
id_external_encoding = rb_intern("external_encoding");
|
278
260
|
}
|