nokogiri 1.13.10-java → 1.14.0.rc1-java
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/Gemfile +33 -0
- data/LICENSE-DEPENDENCIES.md +830 -509
- data/README.md +18 -11
- data/dependencies.yml +25 -7
- data/ext/java/nokogiri/Html4Document.java +2 -0
- data/ext/java/nokogiri/Html4ElementDescription.java +9 -9
- data/ext/java/nokogiri/Html4EntityLookup.java +14 -3
- data/ext/java/nokogiri/Html4SaxParserContext.java +2 -2
- data/ext/java/nokogiri/Html4SaxPushParser.java +3 -0
- data/ext/java/nokogiri/NokogiriService.java +1 -24
- data/ext/java/nokogiri/XmlAttr.java +1 -1
- data/ext/java/nokogiri/XmlAttributeDecl.java +2 -1
- data/ext/java/nokogiri/XmlCdata.java +2 -1
- data/ext/java/nokogiri/XmlComment.java +2 -1
- data/ext/java/nokogiri/XmlDocument.java +5 -6
- data/ext/java/nokogiri/XmlDocumentFragment.java +2 -1
- data/ext/java/nokogiri/XmlDtd.java +4 -3
- data/ext/java/nokogiri/XmlElement.java +1 -0
- data/ext/java/nokogiri/XmlElementContent.java +4 -1
- data/ext/java/nokogiri/XmlElementDecl.java +3 -1
- data/ext/java/nokogiri/XmlEntityDecl.java +2 -0
- data/ext/java/nokogiri/XmlEntityReference.java +1 -0
- data/ext/java/nokogiri/XmlNamespace.java +2 -0
- data/ext/java/nokogiri/XmlNode.java +39 -24
- data/ext/java/nokogiri/XmlNodeSet.java +10 -7
- data/ext/java/nokogiri/XmlProcessingInstruction.java +1 -0
- data/ext/java/nokogiri/XmlReader.java +4 -3
- data/ext/java/nokogiri/XmlRelaxng.java +1 -0
- data/ext/java/nokogiri/XmlSaxParserContext.java +1 -0
- data/ext/java/nokogiri/XmlSaxPushParser.java +3 -0
- data/ext/java/nokogiri/XmlSchema.java +4 -2
- data/ext/java/nokogiri/XmlSyntaxError.java +1 -0
- data/ext/java/nokogiri/XmlText.java +1 -0
- data/ext/java/nokogiri/XmlXpathContext.java +2 -0
- data/ext/java/nokogiri/XsltStylesheet.java +16 -13
- data/ext/java/nokogiri/internals/HtmlDomParserContext.java +3 -2
- data/ext/java/nokogiri/internals/NokogiriHandler.java +2 -2
- data/ext/java/nokogiri/internals/NokogiriHelpers.java +4 -5
- data/ext/java/nokogiri/internals/NokogiriXPathFunction.java +3 -3
- data/ext/java/nokogiri/internals/ParserContext.java +2 -0
- data/ext/java/nokogiri/internals/ReaderNode.java +1 -1
- data/ext/java/nokogiri/internals/SaveContextVisitor.java +4 -2
- data/ext/java/nokogiri/internals/SchemaErrorHandler.java +2 -2
- data/ext/java/nokogiri/internals/XmlDomParserContext.java +2 -1
- data/ext/java/nokogiri/internals/c14n/Canonicalizer.java +1 -0
- data/ext/java/nokogiri/internals/dom2dtm/DOM2DTM.java +5 -4
- data/ext/nokogiri/extconf.rb +79 -20
- data/ext/nokogiri/gumbo.c +19 -9
- data/ext/nokogiri/html4_document.c +1 -1
- data/ext/nokogiri/html4_entity_lookup.c +1 -1
- data/ext/nokogiri/html4_sax_parser_context.c +0 -5
- data/ext/nokogiri/nokogiri.c +32 -51
- data/ext/nokogiri/xml_attribute_decl.c +1 -1
- data/ext/nokogiri/xml_cdata.c +1 -1
- data/ext/nokogiri/xml_document.c +16 -11
- data/ext/nokogiri/xml_element_content.c +2 -2
- data/ext/nokogiri/xml_element_decl.c +1 -1
- data/ext/nokogiri/xml_encoding_handler.c +2 -2
- data/ext/nokogiri/xml_namespace.c +38 -8
- data/ext/nokogiri/xml_node.c +286 -26
- data/ext/nokogiri/xml_node_set.c +0 -2
- data/ext/nokogiri/xml_reader.c +40 -20
- data/ext/nokogiri/xml_relax_ng.c +0 -2
- data/ext/nokogiri/xml_sax_parser.c +22 -16
- data/ext/nokogiri/xml_sax_parser_context.c +0 -5
- data/ext/nokogiri/xml_sax_push_parser.c +0 -2
- data/ext/nokogiri/xml_schema.c +0 -2
- data/ext/nokogiri/xml_xpath_context.c +87 -83
- data/ext/nokogiri/xslt_stylesheet.c +14 -13
- data/gumbo-parser/Makefile +10 -0
- data/lib/nokogiri/css/node.rb +2 -2
- data/lib/nokogiri/css/xpath_visitor.rb +3 -1
- data/lib/nokogiri/css.rb +6 -0
- data/lib/nokogiri/encoding_handler.rb +57 -0
- data/lib/nokogiri/extension.rb +3 -2
- data/lib/nokogiri/html4/document.rb +2 -121
- data/lib/nokogiri/html4/element_description_defaults.rb +6 -12
- data/lib/nokogiri/html4/encoding_reader.rb +121 -0
- data/lib/nokogiri/html4.rb +1 -0
- data/lib/nokogiri/html5/document.rb +113 -36
- data/lib/nokogiri/html5/document_fragment.rb +9 -2
- data/lib/nokogiri/html5/node.rb +3 -5
- data/lib/nokogiri/html5.rb +127 -216
- data/lib/nokogiri/jruby/dependencies.rb +1 -19
- data/lib/{isorelax.jar → nokogiri/jruby/isorelax/isorelax/20030108/isorelax-20030108.jar} +0 -0
- data/lib/nokogiri/jruby/net/sf/saxon/Saxon-HE/9.6.0-4/Saxon-HE-9.6.0-4.jar +0 -0
- data/lib/nokogiri/jruby/net/sourceforge/htmlunit/neko-htmlunit/2.63.0/neko-htmlunit-2.63.0.jar +0 -0
- data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
- data/lib/nokogiri/jruby/nu/validator/jing/20200702VNU/jing-20200702VNU.jar +0 -0
- data/lib/nokogiri/jruby/org/nokogiri/nekodtd/0.1.11.noko1/nekodtd-0.1.11.noko1.jar +0 -0
- data/lib/{serializer.jar → nokogiri/jruby/xalan/serializer/2.7.2/serializer-2.7.2.jar} +0 -0
- data/lib/{xalan.jar → nokogiri/jruby/xalan/xalan/2.7.2/xalan-2.7.2.jar} +0 -0
- data/lib/{xercesImpl.jar → nokogiri/jruby/xerces/xercesImpl/2.12.2/xercesImpl-2.12.2.jar} +0 -0
- data/lib/{xml-apis.jar → nokogiri/jruby/xml-apis/xml-apis/1.4.01/xml-apis-1.4.01.jar} +0 -0
- data/lib/nokogiri/nokogiri.jar +0 -0
- data/lib/nokogiri/version/constant.rb +1 -1
- data/lib/nokogiri/version/info.rb +11 -10
- data/lib/nokogiri/xml/attr.rb +49 -0
- data/lib/nokogiri/xml/builder.rb +1 -1
- data/lib/nokogiri/xml/document.rb +102 -54
- data/lib/nokogiri/xml/document_fragment.rb +49 -6
- data/lib/nokogiri/xml/namespace.rb +42 -0
- data/lib/nokogiri/xml/node/save_options.rb +4 -2
- data/lib/nokogiri/xml/node.rb +190 -35
- data/lib/nokogiri/xml/node_set.rb +87 -9
- data/lib/nokogiri/xml/parse_options.rb +127 -48
- data/lib/nokogiri/xml/pp/node.rb +6 -4
- data/lib/nokogiri/xml/processing_instruction.rb +2 -1
- data/lib/nokogiri/xml/sax/parser.rb +2 -3
- data/lib/nokogiri/xslt.rb +1 -1
- data/lib/nokogiri.rb +3 -11
- metadata +62 -274
- data/ext/java/nokogiri/EncodingHandler.java +0 -111
- data/lib/jing.jar +0 -0
- data/lib/nekodtd.jar +0 -0
- data/lib/nekohtml.jar +0 -0
data/ext/nokogiri/extconf.rb
CHANGED
@@ -200,7 +200,7 @@ def nix?
|
|
200
200
|
end
|
201
201
|
|
202
202
|
def truffle?
|
203
|
-
|
203
|
+
RUBY_ENGINE == "truffleruby"
|
204
204
|
end
|
205
205
|
|
206
206
|
def concat_flags(*args)
|
@@ -211,6 +211,16 @@ def local_have_library(lib, func = nil, headers = nil)
|
|
211
211
|
have_library(lib, func, headers) || have_library("lib#{lib}", func, headers)
|
212
212
|
end
|
213
213
|
|
214
|
+
def zlib_source(version_string)
|
215
|
+
# As of 2022-12, I'm starting to see failed downloads often enough from zlib.net that I want to
|
216
|
+
# change the default to github.
|
217
|
+
if ENV["NOKOGIRI_USE_CANONICAL_ZLIB_SOURCE"]
|
218
|
+
"https://zlib.net/fossils/zlib-#{version_string}.tar.gz"
|
219
|
+
else
|
220
|
+
"https://github.com/madler/zlib/releases/download/v#{version_string}/zlib-#{version_string}.tar.gz"
|
221
|
+
end
|
222
|
+
end
|
223
|
+
|
214
224
|
def gnome_source
|
215
225
|
# As of 2022-02-20, some mirrors have expired SSL certificates. I'm able to retrieve from my home,
|
216
226
|
# but whatever host is resolved on the github actions workers see an expired cert.
|
@@ -400,8 +410,8 @@ def iconv_configure_flags
|
|
400
410
|
return ["--with-iconv=yes"]
|
401
411
|
end
|
402
412
|
|
403
|
-
config = preserving_globals {
|
404
|
-
if config && try_link_iconv("pkg-config libiconv") {
|
413
|
+
config = preserving_globals { pkg_config("libiconv") }
|
414
|
+
if config && try_link_iconv("pkg-config libiconv") { pkg_config("libiconv") }
|
405
415
|
cflags, ldflags, libs = config
|
406
416
|
|
407
417
|
return [
|
@@ -430,10 +440,12 @@ def process_recipe(name, version, static_p, cross_p, cacheable_p = true)
|
|
430
440
|
"#{@target}/#{RUBY_PLATFORM}/#{@name}/#{@version}"
|
431
441
|
end
|
432
442
|
|
433
|
-
|
434
|
-
#
|
435
|
-
# use host if not set.
|
443
|
+
# We use 'host' to set compiler prefix for cross-compiling. Prefer host_alias over host. And
|
444
|
+
# prefer i686 (what external dev tools use) to i386 (what ruby's configure.ac emits).
|
436
445
|
recipe.host = RbConfig::CONFIG["host_alias"].empty? ? RbConfig::CONFIG["host"] : RbConfig::CONFIG["host_alias"]
|
446
|
+
recipe.host = recipe.host.gsub(/i386/, "i686")
|
447
|
+
|
448
|
+
recipe.target = File.join(PACKAGE_ROOT_DIR, "ports") if cacheable_p
|
437
449
|
recipe.configure_options << "--libdir=#{File.join(recipe.path, "lib")}"
|
438
450
|
|
439
451
|
yield recipe
|
@@ -525,7 +537,6 @@ def process_recipe(name, version, static_p, cross_p, cacheable_p = true)
|
|
525
537
|
|
526
538
|
EOM
|
527
539
|
|
528
|
-
pp(recipe.files)
|
529
540
|
chdir_for_build { recipe.cook }
|
530
541
|
FileUtils.touch(checkpoint)
|
531
542
|
end
|
@@ -573,6 +584,34 @@ def do_clean
|
|
573
584
|
exit!(0)
|
574
585
|
end
|
575
586
|
|
587
|
+
# In ruby 3.2, symbol resolution changed on Darwin, to introduce the `-bundle_loader` flag to
|
588
|
+
# resolve symbols against the ruby binary.
|
589
|
+
#
|
590
|
+
# This makes it challenging to build a single extension that works with both a ruby with
|
591
|
+
# `--enable-shared` and one with `--disable-shared. To work around that, we choose to add
|
592
|
+
# `-flat_namespace` to the link line (later in this file).
|
593
|
+
#
|
594
|
+
# The `-flat_namespace` line introduces its own behavior change, which is that (similar to on
|
595
|
+
# Linux), any symbols in the extension that are exported may now be resolved by shared libraries
|
596
|
+
# loaded by the Ruby process. Specifically, that means that libxml2 and libxslt, which are
|
597
|
+
# statically linked into the nokogiri bundle, will resolve (at runtime) to a system libxml2 loaded
|
598
|
+
# by Ruby on Darwin. And it appears that often Ruby on Darwin does indeed load the system libxml2,
|
599
|
+
# and that messes with our assumptions about whether we're running with a patched libxml2 or a
|
600
|
+
# vanilla libxml2.
|
601
|
+
#
|
602
|
+
# We choose to use `-load_hidden` in this case to prevent exporting those symbols from libxml2 and
|
603
|
+
# libxslt, which ensures that they will be resolved to the static libraries in the bundle. In other
|
604
|
+
# words, when we use `load_hidden`, what happens in the extension stays in the extension.
|
605
|
+
#
|
606
|
+
# See https://github.com/rake-compiler/rake-compiler-dock/issues/87 for more info.
|
607
|
+
#
|
608
|
+
# Anyway, this method is the logical bit to tell us when to turn on these workarounds.
|
609
|
+
def needs_darwin_linker_hack
|
610
|
+
config_cross_build? &&
|
611
|
+
darwin? &&
|
612
|
+
Gem::Requirement.new("~> 3.2").satisfied_by?(Gem::Version.new(RbConfig::CONFIG["ruby_version"].split("+").first))
|
613
|
+
end
|
614
|
+
|
576
615
|
#
|
577
616
|
# main
|
578
617
|
#
|
@@ -616,6 +655,9 @@ $LIBS = concat_flags($LIBS, ENV["LIBS"])
|
|
616
655
|
# errors/warnings. see #2302
|
617
656
|
append_cflags(["-std=c99", "-Wno-declaration-after-statement"])
|
618
657
|
|
658
|
+
# gumbo html5 serialization is slower with O3, let's make sure we use O2
|
659
|
+
append_cflags("-O2")
|
660
|
+
|
619
661
|
# always include debugging information
|
620
662
|
append_cflags("-g")
|
621
663
|
|
@@ -625,8 +667,18 @@ append_cflags("-Winline")
|
|
625
667
|
# good to have no matter what Ruby was compiled with
|
626
668
|
append_cflags("-Wmissing-noreturn")
|
627
669
|
|
670
|
+
# check integer loss of precision
|
671
|
+
if darwin?
|
672
|
+
append_cflags("-Wshorten-64-to-32")
|
673
|
+
else
|
674
|
+
append_cflags("-Wconversion -Wno-sign-conversion")
|
675
|
+
end
|
676
|
+
|
628
677
|
# handle clang variations, see #1101
|
629
|
-
|
678
|
+
if darwin?
|
679
|
+
append_cflags("-Wno-error=unused-command-line-argument-hard-error-in-future")
|
680
|
+
append_cflags("-Wno-unknown-warning-option")
|
681
|
+
end
|
630
682
|
|
631
683
|
# these tend to be noisy, but on occasion useful during development
|
632
684
|
# append_cflags(["-Wcast-qual", "-Wwrite-strings"])
|
@@ -666,6 +718,10 @@ else
|
|
666
718
|
cross_build_p = config_cross_build?
|
667
719
|
message "Cross build is #{cross_build_p ? "enabled" : "disabled"}.\n"
|
668
720
|
|
721
|
+
if needs_darwin_linker_hack
|
722
|
+
append_ldflags("-Wl,-flat_namespace")
|
723
|
+
end
|
724
|
+
|
669
725
|
require "yaml"
|
670
726
|
dependencies = YAML.load_file(File.join(PACKAGE_ROOT_DIR, "dependencies.yml"))
|
671
727
|
|
@@ -674,7 +730,7 @@ else
|
|
674
730
|
if cross_build_p || windows?
|
675
731
|
zlib_recipe = process_recipe("zlib", dependencies["zlib"]["version"], static_p, cross_build_p) do |recipe|
|
676
732
|
recipe.files = [{
|
677
|
-
url:
|
733
|
+
url: zlib_source(recipe.version),
|
678
734
|
sha256: dependencies["zlib"]["sha256"],
|
679
735
|
}]
|
680
736
|
if windows?
|
@@ -873,9 +929,13 @@ else
|
|
873
929
|
$libs = $libs.shellsplit.tap do |libs|
|
874
930
|
[libxml2_recipe, libxslt_recipe].each do |recipe|
|
875
931
|
libname = recipe.name[/\Alib(.+)\z/, 1]
|
876
|
-
|
932
|
+
config_basename = "#{libname}-config"
|
933
|
+
File.join(recipe.path, "bin", config_basename).tap do |config|
|
877
934
|
# call config scripts explicit with 'sh' for compat with Windows
|
878
|
-
|
935
|
+
cflags = %x(sh #{config} --cflags).strip
|
936
|
+
message("#{config_basename} cflags: #{cflags}\n")
|
937
|
+
$CPPFLAGS = concat_flags(cflags, $CPPFLAGS) # prepend
|
938
|
+
|
879
939
|
%x(sh #{config} --libs).strip.shellsplit.each do |arg|
|
880
940
|
case arg
|
881
941
|
when /\A-L(.+)\z/
|
@@ -894,7 +954,7 @@ else
|
|
894
954
|
end
|
895
955
|
|
896
956
|
patches_string = recipe.patch_files.map { |path| File.basename(path) }.join(" ")
|
897
|
-
append_cppflags(%[-DNOKOGIRI_#{recipe.name.upcase}_PATCHES="
|
957
|
+
append_cppflags(%[-DNOKOGIRI_#{recipe.name.upcase}_PATCHES="\\"#{patches_string}\\""])
|
898
958
|
|
899
959
|
case libname
|
900
960
|
when "xml2"
|
@@ -913,16 +973,17 @@ else
|
|
913
973
|
end.shelljoin
|
914
974
|
|
915
975
|
if static_p
|
976
|
+
static_archive_ld_flag = needs_darwin_linker_hack ? ["-load_hidden"] : []
|
916
977
|
$libs = $libs.shellsplit.map do |arg|
|
917
978
|
case arg
|
918
979
|
when "-lxml2"
|
919
|
-
File.join(libxml2_recipe.path, "lib", libflag_to_filename(arg))
|
980
|
+
static_archive_ld_flag + [File.join(libxml2_recipe.path, "lib", libflag_to_filename(arg))]
|
920
981
|
when "-lxslt", "-lexslt"
|
921
|
-
File.join(libxslt_recipe.path, "lib", libflag_to_filename(arg))
|
982
|
+
static_archive_ld_flag + [File.join(libxslt_recipe.path, "lib", libflag_to_filename(arg))]
|
922
983
|
else
|
923
984
|
arg
|
924
985
|
end
|
925
|
-
end.shelljoin
|
986
|
+
end.flatten.shelljoin
|
926
987
|
end
|
927
988
|
|
928
989
|
ensure_func("xmlParseDoc", "libxml/parser.h")
|
@@ -958,7 +1019,7 @@ libgumbo_recipe = process_recipe("libgumbo", "1.0.0-nokogiri", static_p, cross_b
|
|
958
1019
|
end
|
959
1020
|
|
960
1021
|
def compile
|
961
|
-
cflags = concat_flags(ENV["CFLAGS"], "-fPIC", "-g")
|
1022
|
+
cflags = concat_flags(ENV["CFLAGS"], "-fPIC", "-O2", "-g")
|
962
1023
|
|
963
1024
|
env = { "CC" => gcc_cmd, "CFLAGS" => cflags }
|
964
1025
|
if config_cross_build?
|
@@ -978,7 +1039,7 @@ end
|
|
978
1039
|
append_cppflags("-I#{File.join(libgumbo_recipe.path, "include")}")
|
979
1040
|
$libs = $libs + " " + File.join(libgumbo_recipe.path, "lib", "libgumbo.a")
|
980
1041
|
$LIBPATH = $LIBPATH | [File.join(libgumbo_recipe.path, "lib")]
|
981
|
-
ensure_func("gumbo_parse_with_options", "
|
1042
|
+
ensure_func("gumbo_parse_with_options", "nokogiri_gumbo.h")
|
982
1043
|
|
983
1044
|
have_func("xmlHasFeature") || abort("xmlHasFeature() is missing.") # introduced in libxml 2.6.21
|
984
1045
|
have_func("xmlFirstElementChild") # introduced in libxml 2.7.3
|
@@ -989,10 +1050,8 @@ have_func("xmlSchemaSetParserStructuredErrors") # introduced in libxml 2.6.23
|
|
989
1050
|
have_func("rb_gc_location") # introduced in Ruby 2.7
|
990
1051
|
have_func("rb_category_warning") # introduced in Ruby 3.0
|
991
1052
|
|
992
|
-
have_func("vasprintf")
|
993
|
-
|
994
1053
|
other_library_versions_string = OTHER_LIBRARY_VERSIONS.map { |k, v| [k, v].join(":") }.join(",")
|
995
|
-
append_cppflags(%[-DNOKOGIRI_OTHER_LIBRARY_VERSIONS="
|
1054
|
+
append_cppflags(%[-DNOKOGIRI_OTHER_LIBRARY_VERSIONS="\\"#{other_library_versions_string}\\""])
|
996
1055
|
|
997
1056
|
unless config_system_libraries?
|
998
1057
|
if cross_build_p
|
data/ext/nokogiri/gumbo.c
CHANGED
@@ -23,13 +23,13 @@
|
|
23
23
|
//
|
24
24
|
// Processing starts by calling gumbo_parse_with_options. The resulting document tree
|
25
25
|
// is then walked, a parallel libxml2 tree is constructed, and the final document is
|
26
|
-
// then wrapped using
|
26
|
+
// then wrapped using noko_xml_document_wrap. This approach reduces memory and CPU
|
27
27
|
// requirements as Ruby objects are only built when necessary.
|
28
28
|
//
|
29
29
|
|
30
30
|
#include <nokogiri.h>
|
31
31
|
|
32
|
-
#include "
|
32
|
+
#include "nokogiri_gumbo.h"
|
33
33
|
|
34
34
|
VALUE cNokogiriHtml5Document;
|
35
35
|
|
@@ -281,12 +281,12 @@ add_errors(const GumboOutput *output, VALUE rdoc, VALUE input, VALUE url)
|
|
281
281
|
rb_iv_set(syntax_error, "@code", INT2NUM(1)); // XML_ERR_INTERNAL_ERROR
|
282
282
|
rb_iv_set(syntax_error, "@level", INT2NUM(2)); // XML_ERR_ERROR
|
283
283
|
rb_iv_set(syntax_error, "@file", url);
|
284
|
-
rb_iv_set(syntax_error, "@line",
|
284
|
+
rb_iv_set(syntax_error, "@line", SIZET2NUM(position.line));
|
285
285
|
rb_iv_set(syntax_error, "@str1", str1);
|
286
286
|
rb_iv_set(syntax_error, "@str2", Qnil);
|
287
287
|
rb_iv_set(syntax_error, "@str3", Qnil);
|
288
288
|
rb_iv_set(syntax_error, "@int1", INT2NUM(0));
|
289
|
-
rb_iv_set(syntax_error, "@column",
|
289
|
+
rb_iv_set(syntax_error, "@column", SIZET2NUM(position.column));
|
290
290
|
rb_ary_push(rerrors, syntax_error);
|
291
291
|
}
|
292
292
|
rb_iv_set(rdoc, "@errors", rerrors);
|
@@ -297,6 +297,7 @@ typedef struct {
|
|
297
297
|
GumboOutput *output;
|
298
298
|
VALUE input;
|
299
299
|
VALUE url_or_frag;
|
300
|
+
VALUE klass;
|
300
301
|
xmlDocPtr doc;
|
301
302
|
} ParseArgs;
|
302
303
|
|
@@ -321,7 +322,7 @@ static VALUE parse_continue(VALUE parse_args);
|
|
321
322
|
* @!visibility protected
|
322
323
|
*/
|
323
324
|
static VALUE
|
324
|
-
parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth)
|
325
|
+
parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth, VALUE klass)
|
325
326
|
{
|
326
327
|
GumboOptions options = kGumboDefaultOptions;
|
327
328
|
options.max_attributes = NUM2INT(max_attributes);
|
@@ -333,6 +334,7 @@ parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors
|
|
333
334
|
.output = output,
|
334
335
|
.input = input,
|
335
336
|
.url_or_frag = url,
|
337
|
+
.klass = klass,
|
336
338
|
.doc = NULL,
|
337
339
|
};
|
338
340
|
|
@@ -357,7 +359,9 @@ parse_continue(VALUE parse_args)
|
|
357
359
|
}
|
358
360
|
args->doc = doc; // Make sure doc gets cleaned up if an error is thrown.
|
359
361
|
build_tree(doc, (xmlNodePtr)doc, output->document);
|
360
|
-
VALUE rdoc =
|
362
|
+
VALUE rdoc = noko_xml_document_wrap(args->klass, doc);
|
363
|
+
rb_iv_set(rdoc, "@url", args->url_or_frag);
|
364
|
+
rb_iv_set(rdoc, "@quirks_mode", INT2NUM(output->document->v.document.doc_type_quirks_mode));
|
361
365
|
args->doc = NULL; // The Ruby runtime now owns doc so don't delete it.
|
362
366
|
add_errors(output, rdoc, args->input, args->url_or_frag);
|
363
367
|
return rdoc;
|
@@ -498,9 +502,11 @@ error:
|
|
498
502
|
}
|
499
503
|
|
500
504
|
// Encoding.
|
501
|
-
if (
|
505
|
+
if (ctx_ns == GUMBO_NAMESPACE_MATHML
|
506
|
+
&& RSTRING_LEN(tag_name) == 14
|
502
507
|
&& !st_strcasecmp(ctx_tag, "annotation-xml")) {
|
503
508
|
VALUE enc = rb_funcall(ctx, rb_intern_const("[]"),
|
509
|
+
1,
|
504
510
|
rb_utf8_str_new_static("encoding", 8));
|
505
511
|
if (RTEST(enc)) {
|
506
512
|
Check_Type(enc, T_STRING);
|
@@ -512,8 +518,11 @@ error:
|
|
512
518
|
// Quirks mode.
|
513
519
|
VALUE doc = rb_funcall(doc_fragment, rb_intern_const("document"), 0);
|
514
520
|
VALUE dtd = rb_funcall(doc, internal_subset, 0);
|
515
|
-
|
521
|
+
VALUE doc_quirks_mode = rb_iv_get(doc, "@quirks_mode");
|
522
|
+
if (NIL_P(ctx) || NIL_P(doc_quirks_mode)) {
|
516
523
|
quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS;
|
524
|
+
} else if (NIL_P(dtd)) {
|
525
|
+
quirks_mode = GUMBO_DOCTYPE_QUIRKS;
|
517
526
|
} else {
|
518
527
|
VALUE dtd_name = rb_funcall(dtd, name, 0);
|
519
528
|
VALUE pubid = rb_funcall(dtd, rb_intern_const("external_id"), 0);
|
@@ -560,6 +569,7 @@ fragment_continue(VALUE parse_args)
|
|
560
569
|
args->doc = NULL; // The Ruby runtime owns doc so make sure we don't delete it.
|
561
570
|
xmlNodePtr xml_frag = extract_xml_node(doc_fragment);
|
562
571
|
build_tree(xml_doc, xml_frag, output->root);
|
572
|
+
rb_iv_set(doc_fragment, "@quirks_mode", INT2NUM(output->document->v.document.doc_type_quirks_mode));
|
563
573
|
add_errors(output, doc_fragment, args->input, rb_utf8_str_new_static("#fragment", 9));
|
564
574
|
return Qnil;
|
565
575
|
}
|
@@ -577,7 +587,7 @@ noko_init_gumbo()
|
|
577
587
|
parent = rb_intern_const("parent");
|
578
588
|
|
579
589
|
// Define Nokogumbo module with parse and fragment methods.
|
580
|
-
rb_define_singleton_method(mNokogiriGumbo, "parse", parse,
|
590
|
+
rb_define_singleton_method(mNokogiriGumbo, "parse", parse, 6);
|
581
591
|
rb_define_singleton_method(mNokogiriGumbo, "fragment", fragment, 6);
|
582
592
|
}
|
583
593
|
|
@@ -20,7 +20,7 @@ get(VALUE _, VALUE rb_entity_name)
|
|
20
20
|
return Qnil;
|
21
21
|
}
|
22
22
|
|
23
|
-
rb_constructor_args[0] =
|
23
|
+
rb_constructor_args[0] = UINT2NUM(c_entity_desc->value);
|
24
24
|
rb_constructor_args[1] = NOKOGIRI_STR_NEW2(c_entity_desc->name);
|
25
25
|
rb_constructor_args[2] = NOKOGIRI_STR_NEW2(c_entity_desc->desc);
|
26
26
|
|
data/ext/nokogiri/nokogiri.c
CHANGED
@@ -49,34 +49,11 @@ void noko_init_html_sax_push_parser(void);
|
|
49
49
|
void noko_init_gumbo(void);
|
50
50
|
void noko_init_test_global_handlers(void);
|
51
51
|
|
52
|
-
static ID id_read, id_write;
|
53
|
-
|
54
|
-
|
55
|
-
#ifndef HAVE_VASPRINTF
|
56
|
-
/*
|
57
|
-
* Thank you Geoffroy Couprie for this implementation of vasprintf!
|
58
|
-
*/
|
59
|
-
int
|
60
|
-
vasprintf(char **strp, const char *fmt, va_list ap)
|
61
|
-
{
|
62
|
-
/* Mingw32/64 have a broken vsnprintf implementation that fails when
|
63
|
-
* using a zero-byte limit in order to retrieve the required size for malloc.
|
64
|
-
* So we use a one byte buffer instead.
|
65
|
-
*/
|
66
|
-
char tmp[1];
|
67
|
-
int len = vsnprintf(tmp, 1, fmt, ap) + 1;
|
68
|
-
char *res = (char *)malloc((unsigned int)len);
|
69
|
-
if (res == NULL) {
|
70
|
-
return -1;
|
71
|
-
}
|
72
|
-
*strp = res;
|
73
|
-
return vsnprintf(res, (unsigned int)len, fmt, ap);
|
74
|
-
}
|
75
|
-
#endif
|
52
|
+
static ID id_read, id_write, id_external_encoding;
|
76
53
|
|
77
54
|
|
78
55
|
static VALUE
|
79
|
-
|
56
|
+
noko_io_read_check(VALUE val)
|
80
57
|
{
|
81
58
|
VALUE *args = (VALUE *)val;
|
82
59
|
return rb_funcall(args[0], id_read, 1, args[1]);
|
@@ -84,68 +61,71 @@ read_check(VALUE val)
|
|
84
61
|
|
85
62
|
|
86
63
|
static VALUE
|
87
|
-
|
64
|
+
noko_io_read_failed(VALUE arg, VALUE exc)
|
88
65
|
{
|
89
66
|
return Qundef;
|
90
67
|
}
|
91
68
|
|
92
69
|
|
93
70
|
int
|
94
|
-
noko_io_read(void *
|
71
|
+
noko_io_read(void *io, char *c_buffer, int c_buffer_len)
|
95
72
|
{
|
96
|
-
VALUE
|
97
|
-
|
73
|
+
VALUE rb_io = (VALUE)io;
|
74
|
+
VALUE rb_read_string, rb_args[2];
|
75
|
+
size_t n_bytes_read, safe_len;
|
98
76
|
|
99
|
-
|
100
|
-
|
77
|
+
rb_args[0] = rb_io;
|
78
|
+
rb_args[1] = INT2NUM(c_buffer_len);
|
101
79
|
|
102
|
-
|
80
|
+
rb_read_string = rb_rescue(noko_io_read_check, (VALUE)rb_args, noko_io_read_failed, 0);
|
103
81
|
|
104
|
-
if (NIL_P(
|
105
|
-
if (
|
106
|
-
if (TYPE(
|
82
|
+
if (NIL_P(rb_read_string)) { return 0; }
|
83
|
+
if (rb_read_string == Qundef) { return -1; }
|
84
|
+
if (TYPE(rb_read_string) != T_STRING) { return -1; }
|
107
85
|
|
108
|
-
|
109
|
-
safe_len =
|
110
|
-
memcpy(
|
86
|
+
n_bytes_read = (size_t)RSTRING_LEN(rb_read_string);
|
87
|
+
safe_len = (n_bytes_read > (size_t)c_buffer_len) ? (size_t)c_buffer_len : n_bytes_read;
|
88
|
+
memcpy(c_buffer, StringValuePtr(rb_read_string), safe_len);
|
111
89
|
|
112
90
|
return (int)safe_len;
|
113
91
|
}
|
114
92
|
|
115
93
|
|
116
94
|
static VALUE
|
117
|
-
|
95
|
+
noko_io_write_check(VALUE rb_args)
|
118
96
|
{
|
119
|
-
VALUE
|
120
|
-
|
97
|
+
VALUE rb_io = ((VALUE *)rb_args)[0];
|
98
|
+
VALUE rb_output = ((VALUE *)rb_args)[1];
|
99
|
+
return rb_funcall(rb_io, id_write, 1, rb_output);
|
121
100
|
}
|
122
101
|
|
123
102
|
|
124
103
|
static VALUE
|
125
|
-
|
104
|
+
noko_io_write_failed(VALUE arg, VALUE exc)
|
126
105
|
{
|
127
106
|
return Qundef;
|
128
107
|
}
|
129
108
|
|
130
109
|
|
131
110
|
int
|
132
|
-
noko_io_write(void *
|
111
|
+
noko_io_write(void *io, char *c_buffer, int c_buffer_len)
|
133
112
|
{
|
134
|
-
VALUE
|
135
|
-
|
136
|
-
|
137
|
-
args[1] = rb_str_new(buffer, (long)len);
|
113
|
+
VALUE rb_args[2], rb_n_bytes_written;
|
114
|
+
VALUE rb_io = (VALUE)io;
|
115
|
+
rb_encoding *io_encoding = rb_to_encoding(rb_funcall(rb_io, id_external_encoding, 0));
|
138
116
|
|
139
|
-
|
117
|
+
rb_args[0] = rb_io;
|
118
|
+
rb_args[1] = rb_enc_str_new(c_buffer, (long)c_buffer_len, io_encoding);
|
140
119
|
|
141
|
-
|
120
|
+
rb_n_bytes_written = rb_rescue(noko_io_write_check, (VALUE)rb_args, noko_io_write_failed, 0);
|
121
|
+
if (rb_n_bytes_written == Qundef) { return -1; }
|
142
122
|
|
143
|
-
return NUM2INT(
|
123
|
+
return NUM2INT(rb_n_bytes_written);
|
144
124
|
}
|
145
125
|
|
146
126
|
|
147
127
|
int
|
148
|
-
noko_io_close(void *
|
128
|
+
noko_io_close(void *io)
|
149
129
|
{
|
150
130
|
return 0;
|
151
131
|
}
|
@@ -275,4 +255,5 @@ Init_nokogiri()
|
|
275
255
|
|
276
256
|
id_read = rb_intern("read");
|
277
257
|
id_write = rb_intern("write");
|
258
|
+
id_external_encoding = rb_intern("external_encoding");
|
278
259
|
}
|
data/ext/nokogiri/xml_cdata.c
CHANGED
@@ -29,7 +29,7 @@ new (int argc, VALUE *argv, VALUE klass)
|
|
29
29
|
|
30
30
|
if (!NIL_P(content)) {
|
31
31
|
content_str = (xmlChar *)StringValuePtr(content);
|
32
|
-
content_str_len =
|
32
|
+
content_str_len = RSTRING_LENINT(content);
|
33
33
|
}
|
34
34
|
|
35
35
|
node = xmlNewCDataBlock(xml_doc->doc, content_str, content_str_len);
|
data/ext/nokogiri/xml_document.c
CHANGED
@@ -65,14 +65,12 @@ dealloc(xmlDocPtr doc)
|
|
65
65
|
{
|
66
66
|
st_table *node_hash;
|
67
67
|
|
68
|
-
NOKOGIRI_DEBUG_START(doc);
|
69
|
-
|
70
68
|
node_hash = DOC_UNLINKED_NODE_HASH(doc);
|
71
69
|
|
72
70
|
st_foreach(node_hash, dealloc_node_i, (st_data_t)doc);
|
73
71
|
st_free_table(node_hash);
|
74
72
|
|
75
|
-
|
73
|
+
ruby_xfree(doc->_private);
|
76
74
|
|
77
75
|
/* When both Nokogiri and libxml-ruby are loaded, make sure that all nodes
|
78
76
|
* have their _private pointers cleared. This is to avoid libxml-ruby's
|
@@ -84,8 +82,6 @@ dealloc(xmlDocPtr doc)
|
|
84
82
|
}
|
85
83
|
|
86
84
|
xmlFreeDoc(doc);
|
87
|
-
|
88
|
-
NOKOGIRI_DEBUG_END(doc);
|
89
85
|
}
|
90
86
|
|
91
87
|
static void
|
@@ -540,6 +536,7 @@ rb_xml_document_canonicalize(int argc, VALUE *argv, VALUE self)
|
|
540
536
|
VALUE rb_mode;
|
541
537
|
VALUE rb_namespaces;
|
542
538
|
VALUE rb_comments_p;
|
539
|
+
int c_mode = 0;
|
543
540
|
xmlChar **c_namespaces;
|
544
541
|
|
545
542
|
xmlDocPtr c_doc;
|
@@ -551,8 +548,16 @@ rb_xml_document_canonicalize(int argc, VALUE *argv, VALUE self)
|
|
551
548
|
VALUE rb_io;
|
552
549
|
|
553
550
|
rb_scan_args(argc, argv, "03", &rb_mode, &rb_namespaces, &rb_comments_p);
|
554
|
-
if (!NIL_P(rb_mode)) {
|
555
|
-
|
551
|
+
if (!NIL_P(rb_mode)) {
|
552
|
+
Check_Type(rb_mode, T_FIXNUM);
|
553
|
+
c_mode = NUM2INT(rb_mode);
|
554
|
+
}
|
555
|
+
if (!NIL_P(rb_namespaces)) {
|
556
|
+
Check_Type(rb_namespaces, T_ARRAY);
|
557
|
+
if (c_mode == XML_C14N_1_0 || c_mode == XML_C14N_1_1) {
|
558
|
+
rb_raise(rb_eRuntimeError, "This canonicalizer does not support this operation");
|
559
|
+
}
|
560
|
+
}
|
556
561
|
|
557
562
|
Data_Get_Struct(self, xmlDoc, c_doc);
|
558
563
|
|
@@ -573,7 +578,7 @@ rb_xml_document_canonicalize(int argc, VALUE *argv, VALUE self)
|
|
573
578
|
c_namespaces = NULL;
|
574
579
|
} else {
|
575
580
|
long ns_len = RARRAY_LEN(rb_namespaces);
|
576
|
-
c_namespaces =
|
581
|
+
c_namespaces = ruby_xcalloc((size_t)ns_len + 1, sizeof(xmlChar *));
|
577
582
|
for (int j = 0 ; j < ns_len ; j++) {
|
578
583
|
VALUE entry = rb_ary_entry(rb_namespaces, j);
|
579
584
|
c_namespaces[j] = (xmlChar *)StringValueCStr(entry);
|
@@ -581,12 +586,12 @@ rb_xml_document_canonicalize(int argc, VALUE *argv, VALUE self)
|
|
581
586
|
}
|
582
587
|
|
583
588
|
xmlC14NExecute(c_doc, c_callback_wrapper, rb_callback,
|
584
|
-
|
589
|
+
c_mode,
|
585
590
|
c_namespaces,
|
586
591
|
(int)RTEST(rb_comments_p),
|
587
592
|
c_obuf);
|
588
593
|
|
589
|
-
|
594
|
+
ruby_xfree(c_namespaces);
|
590
595
|
xmlOutputBufferClose(c_obuf);
|
591
596
|
|
592
597
|
return rb_funcall(rb_io, rb_intern("string"), 0);
|
@@ -604,7 +609,7 @@ noko_xml_document_wrap_with_init_args(VALUE klass, xmlDocPtr c_document, int arg
|
|
604
609
|
|
605
610
|
rb_document = Data_Wrap_Struct(klass, mark, dealloc, c_document);
|
606
611
|
|
607
|
-
tuple = (nokogiriTuplePtr)
|
612
|
+
tuple = (nokogiriTuplePtr)ruby_xmalloc(sizeof(nokogiriTuple));
|
608
613
|
tuple->doc = rb_document;
|
609
614
|
tuple->unlinkedNodes = st_init_numtable_with_size(128);
|
610
615
|
tuple->node_cache = rb_ary_new();
|
@@ -31,7 +31,7 @@ get_type(VALUE self)
|
|
31
31
|
xmlElementContentPtr elem;
|
32
32
|
Data_Get_Struct(self, xmlElementContent, elem);
|
33
33
|
|
34
|
-
return INT2NUM(
|
34
|
+
return INT2NUM(elem->type);
|
35
35
|
}
|
36
36
|
|
37
37
|
/*
|
@@ -79,7 +79,7 @@ get_occur(VALUE self)
|
|
79
79
|
xmlElementContentPtr elem;
|
80
80
|
Data_Get_Struct(self, xmlElementContent, elem);
|
81
81
|
|
82
|
-
return INT2NUM(
|
82
|
+
return INT2NUM(elem->ocur);
|
83
83
|
}
|
84
84
|
|
85
85
|
/*
|
@@ -45,9 +45,9 @@ rb_xml_encoding_handler_s_delete(VALUE klass, VALUE name)
|
|
45
45
|
|
46
46
|
|
47
47
|
/*
|
48
|
-
* call-seq: Nokogiri::EncodingHandler.alias(
|
48
|
+
* call-seq: Nokogiri::EncodingHandler.alias(real_name, alias_name)
|
49
49
|
*
|
50
|
-
* Alias encoding handler with name +
|
50
|
+
* Alias encoding handler with name +real_name+ to name +alias_name+
|
51
51
|
*/
|
52
52
|
static VALUE
|
53
53
|
rb_xml_encoding_handler_s_alias(VALUE klass, VALUE from, VALUE to)
|