nokogiri 1.13.10-java → 1.14.0-java

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (119) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +33 -0
  3. data/LICENSE-DEPENDENCIES.md +830 -509
  4. data/LICENSE.md +1 -1
  5. data/README.md +18 -11
  6. data/dependencies.yml +25 -7
  7. data/ext/java/nokogiri/Html4Document.java +2 -0
  8. data/ext/java/nokogiri/Html4ElementDescription.java +9 -9
  9. data/ext/java/nokogiri/Html4EntityLookup.java +14 -3
  10. data/ext/java/nokogiri/Html4SaxParserContext.java +2 -2
  11. data/ext/java/nokogiri/Html4SaxPushParser.java +3 -0
  12. data/ext/java/nokogiri/NokogiriService.java +1 -24
  13. data/ext/java/nokogiri/XmlAttr.java +1 -1
  14. data/ext/java/nokogiri/XmlAttributeDecl.java +2 -1
  15. data/ext/java/nokogiri/XmlCdata.java +2 -1
  16. data/ext/java/nokogiri/XmlComment.java +2 -1
  17. data/ext/java/nokogiri/XmlDocument.java +5 -6
  18. data/ext/java/nokogiri/XmlDocumentFragment.java +2 -1
  19. data/ext/java/nokogiri/XmlDtd.java +4 -3
  20. data/ext/java/nokogiri/XmlElement.java +1 -0
  21. data/ext/java/nokogiri/XmlElementContent.java +4 -1
  22. data/ext/java/nokogiri/XmlElementDecl.java +3 -1
  23. data/ext/java/nokogiri/XmlEntityDecl.java +2 -0
  24. data/ext/java/nokogiri/XmlEntityReference.java +1 -0
  25. data/ext/java/nokogiri/XmlNamespace.java +2 -0
  26. data/ext/java/nokogiri/XmlNode.java +39 -24
  27. data/ext/java/nokogiri/XmlNodeSet.java +10 -7
  28. data/ext/java/nokogiri/XmlProcessingInstruction.java +1 -0
  29. data/ext/java/nokogiri/XmlReader.java +4 -3
  30. data/ext/java/nokogiri/XmlRelaxng.java +1 -0
  31. data/ext/java/nokogiri/XmlSaxParserContext.java +1 -0
  32. data/ext/java/nokogiri/XmlSaxPushParser.java +3 -0
  33. data/ext/java/nokogiri/XmlSchema.java +4 -2
  34. data/ext/java/nokogiri/XmlSyntaxError.java +1 -0
  35. data/ext/java/nokogiri/XmlText.java +1 -0
  36. data/ext/java/nokogiri/XmlXpathContext.java +2 -0
  37. data/ext/java/nokogiri/XsltStylesheet.java +16 -13
  38. data/ext/java/nokogiri/internals/HtmlDomParserContext.java +3 -2
  39. data/ext/java/nokogiri/internals/NokogiriHandler.java +2 -2
  40. data/ext/java/nokogiri/internals/NokogiriHelpers.java +4 -5
  41. data/ext/java/nokogiri/internals/NokogiriXPathFunction.java +3 -3
  42. data/ext/java/nokogiri/internals/ParserContext.java +2 -0
  43. data/ext/java/nokogiri/internals/ReaderNode.java +1 -1
  44. data/ext/java/nokogiri/internals/SaveContextVisitor.java +4 -2
  45. data/ext/java/nokogiri/internals/SchemaErrorHandler.java +2 -2
  46. data/ext/java/nokogiri/internals/XmlDomParserContext.java +2 -1
  47. data/ext/java/nokogiri/internals/c14n/Canonicalizer.java +1 -0
  48. data/ext/java/nokogiri/internals/dom2dtm/DOM2DTM.java +5 -4
  49. data/ext/nokogiri/extconf.rb +80 -21
  50. data/ext/nokogiri/gumbo.c +19 -9
  51. data/ext/nokogiri/html4_document.c +1 -1
  52. data/ext/nokogiri/html4_entity_lookup.c +1 -1
  53. data/ext/nokogiri/html4_sax_parser_context.c +0 -5
  54. data/ext/nokogiri/nokogiri.c +33 -51
  55. data/ext/nokogiri/xml_attribute_decl.c +1 -1
  56. data/ext/nokogiri/xml_cdata.c +1 -1
  57. data/ext/nokogiri/xml_document.c +16 -11
  58. data/ext/nokogiri/xml_element_content.c +2 -2
  59. data/ext/nokogiri/xml_element_decl.c +1 -1
  60. data/ext/nokogiri/xml_encoding_handler.c +2 -2
  61. data/ext/nokogiri/xml_namespace.c +38 -8
  62. data/ext/nokogiri/xml_node.c +286 -26
  63. data/ext/nokogiri/xml_node_set.c +0 -2
  64. data/ext/nokogiri/xml_reader.c +40 -20
  65. data/ext/nokogiri/xml_relax_ng.c +0 -2
  66. data/ext/nokogiri/xml_sax_parser.c +22 -16
  67. data/ext/nokogiri/xml_sax_parser_context.c +0 -5
  68. data/ext/nokogiri/xml_sax_push_parser.c +0 -2
  69. data/ext/nokogiri/xml_schema.c +0 -2
  70. data/ext/nokogiri/xml_xpath_context.c +87 -83
  71. data/ext/nokogiri/xslt_stylesheet.c +14 -13
  72. data/gumbo-parser/Makefile +10 -0
  73. data/lib/nokogiri/css/node.rb +2 -2
  74. data/lib/nokogiri/css/xpath_visitor.rb +5 -3
  75. data/lib/nokogiri/css.rb +6 -0
  76. data/lib/nokogiri/encoding_handler.rb +57 -0
  77. data/lib/nokogiri/extension.rb +3 -2
  78. data/lib/nokogiri/html4/document.rb +2 -121
  79. data/lib/nokogiri/html4/element_description_defaults.rb +6 -12
  80. data/lib/nokogiri/html4/encoding_reader.rb +121 -0
  81. data/lib/nokogiri/html4.rb +1 -0
  82. data/lib/nokogiri/html5/document.rb +113 -36
  83. data/lib/nokogiri/html5/document_fragment.rb +9 -2
  84. data/lib/nokogiri/html5/node.rb +3 -5
  85. data/lib/nokogiri/html5.rb +127 -216
  86. data/lib/nokogiri/jruby/dependencies.rb +1 -19
  87. data/lib/{isorelax.jar → nokogiri/jruby/isorelax/isorelax/20030108/isorelax-20030108.jar} +0 -0
  88. data/lib/nokogiri/jruby/net/sf/saxon/Saxon-HE/9.6.0-4/Saxon-HE-9.6.0-4.jar +0 -0
  89. data/lib/nokogiri/jruby/net/sourceforge/htmlunit/neko-htmlunit/2.63.0/neko-htmlunit-2.63.0.jar +0 -0
  90. data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
  91. data/lib/nokogiri/jruby/nu/validator/jing/20200702VNU/jing-20200702VNU.jar +0 -0
  92. data/lib/nokogiri/jruby/org/nokogiri/nekodtd/0.1.11.noko1/nekodtd-0.1.11.noko1.jar +0 -0
  93. data/lib/{serializer.jar → nokogiri/jruby/xalan/serializer/2.7.2/serializer-2.7.2.jar} +0 -0
  94. data/lib/{xalan.jar → nokogiri/jruby/xalan/xalan/2.7.2/xalan-2.7.2.jar} +0 -0
  95. data/lib/{xercesImpl.jar → nokogiri/jruby/xerces/xercesImpl/2.12.2/xercesImpl-2.12.2.jar} +0 -0
  96. data/lib/{xml-apis.jar → nokogiri/jruby/xml-apis/xml-apis/1.4.01/xml-apis-1.4.01.jar} +0 -0
  97. data/lib/nokogiri/nokogiri.jar +0 -0
  98. data/lib/nokogiri/version/constant.rb +1 -1
  99. data/lib/nokogiri/version/info.rb +11 -10
  100. data/lib/nokogiri/xml/attr.rb +49 -0
  101. data/lib/nokogiri/xml/builder.rb +1 -1
  102. data/lib/nokogiri/xml/document.rb +102 -54
  103. data/lib/nokogiri/xml/document_fragment.rb +49 -6
  104. data/lib/nokogiri/xml/namespace.rb +42 -0
  105. data/lib/nokogiri/xml/node/save_options.rb +6 -4
  106. data/lib/nokogiri/xml/node.rb +190 -35
  107. data/lib/nokogiri/xml/node_set.rb +87 -9
  108. data/lib/nokogiri/xml/parse_options.rb +129 -50
  109. data/lib/nokogiri/xml/pp/node.rb +6 -4
  110. data/lib/nokogiri/xml/processing_instruction.rb +2 -1
  111. data/lib/nokogiri/xml/sax/parser.rb +2 -3
  112. data/lib/nokogiri/xslt.rb +1 -1
  113. data/lib/nokogiri.rb +3 -11
  114. data/lib/xsd/xmlparser/nokogiri.rb +3 -1
  115. metadata +60 -272
  116. data/ext/java/nokogiri/EncodingHandler.java +0 -111
  117. data/lib/jing.jar +0 -0
  118. data/lib/nekodtd.jar +0 -0
  119. data/lib/nekohtml.jar +0 -0
@@ -200,7 +200,7 @@ def nix?
200
200
  end
201
201
 
202
202
  def truffle?
203
- ::RUBY_ENGINE == "truffleruby"
203
+ RUBY_ENGINE == "truffleruby"
204
204
  end
205
205
 
206
206
  def concat_flags(*args)
@@ -211,6 +211,16 @@ def local_have_library(lib, func = nil, headers = nil)
211
211
  have_library(lib, func, headers) || have_library("lib#{lib}", func, headers)
212
212
  end
213
213
 
214
+ def zlib_source(version_string)
215
+ # As of 2022-12, I'm starting to see failed downloads often enough from zlib.net that I want to
216
+ # change the default to github.
217
+ if ENV["NOKOGIRI_USE_CANONICAL_ZLIB_SOURCE"]
218
+ "https://zlib.net/fossils/zlib-#{version_string}.tar.gz"
219
+ else
220
+ "https://github.com/madler/zlib/releases/download/v#{version_string}/zlib-#{version_string}.tar.gz"
221
+ end
222
+ end
223
+
214
224
  def gnome_source
215
225
  # As of 2022-02-20, some mirrors have expired SSL certificates. I'm able to retrieve from my home,
216
226
  # but whatever host is resolved on the github actions workers see an expired cert.
@@ -400,8 +410,8 @@ def iconv_configure_flags
400
410
  return ["--with-iconv=yes"]
401
411
  end
402
412
 
403
- config = preserving_globals { have_package_configuration("libiconv") }
404
- if config && try_link_iconv("pkg-config libiconv") { have_package_configuration("libiconv") }
413
+ config = preserving_globals { pkg_config("libiconv") }
414
+ if config && try_link_iconv("pkg-config libiconv") { pkg_config("libiconv") }
405
415
  cflags, ldflags, libs = config
406
416
 
407
417
  return [
@@ -430,10 +440,12 @@ def process_recipe(name, version, static_p, cross_p, cacheable_p = true)
430
440
  "#{@target}/#{RUBY_PLATFORM}/#{@name}/#{@version}"
431
441
  end
432
442
 
433
- recipe.target = File.join(PACKAGE_ROOT_DIR, "ports") if cacheable_p
434
- # Prefer host_alias over host in order to use the correct compiler prefix for cross build, but
435
- # use host if not set.
443
+ # We use 'host' to set compiler prefix for cross-compiling. Prefer host_alias over host. And
444
+ # prefer i686 (what external dev tools use) to i386 (what ruby's configure.ac emits).
436
445
  recipe.host = RbConfig::CONFIG["host_alias"].empty? ? RbConfig::CONFIG["host"] : RbConfig::CONFIG["host_alias"]
446
+ recipe.host = recipe.host.gsub(/i386/, "i686")
447
+
448
+ recipe.target = File.join(PACKAGE_ROOT_DIR, "ports") if cacheable_p
437
449
  recipe.configure_options << "--libdir=#{File.join(recipe.path, "lib")}"
438
450
 
439
451
  yield recipe
@@ -525,7 +537,6 @@ def process_recipe(name, version, static_p, cross_p, cacheable_p = true)
525
537
 
526
538
  EOM
527
539
 
528
- pp(recipe.files)
529
540
  chdir_for_build { recipe.cook }
530
541
  FileUtils.touch(checkpoint)
531
542
  end
@@ -573,6 +584,34 @@ def do_clean
573
584
  exit!(0)
574
585
  end
575
586
 
587
+ # In ruby 3.2, symbol resolution changed on Darwin, to introduce the `-bundle_loader` flag to
588
+ # resolve symbols against the ruby binary.
589
+ #
590
+ # This makes it challenging to build a single extension that works with both a ruby with
591
+ # `--enable-shared` and one with `--disable-shared`. To work around that, we choose to add
592
+ # `-flat_namespace` to the link line (later in this file).
593
+ #
594
+ # The `-flat_namespace` line introduces its own behavior change, which is that (similar to on
595
+ # Linux), any symbols in the extension that are exported may now be resolved by shared libraries
596
+ # loaded by the Ruby process. Specifically, that means that libxml2 and libxslt, which are
597
+ # statically linked into the nokogiri bundle, will resolve (at runtime) to a system libxml2 loaded
598
+ # by Ruby on Darwin. And it appears that often Ruby on Darwin does indeed load the system libxml2,
599
+ # and that messes with our assumptions about whether we're running with a patched libxml2 or a
600
+ # vanilla libxml2.
601
+ #
602
+ # We choose to use `-load_hidden` in this case to prevent exporting those symbols from libxml2 and
603
+ # libxslt, which ensures that they will be resolved to the static libraries in the bundle. In other
604
+ # words, when we use `-load_hidden`, what happens in the extension stays in the extension.
605
+ #
606
+ # See https://github.com/rake-compiler/rake-compiler-dock/issues/87 for more info.
607
+ #
608
+ # Anyway, this method is the logical bit to tell us when to turn on these workarounds.
609
+ def needs_darwin_linker_hack
610
+ config_cross_build? &&
611
+ darwin? &&
612
+ Gem::Requirement.new("~> 3.2").satisfied_by?(Gem::Version.new(RbConfig::CONFIG["ruby_version"].split("+").first))
613
+ end
614
+
576
615
  #
577
616
  # main
578
617
  #
@@ -580,7 +619,7 @@ do_help if arg_config("--help")
580
619
  do_clean if arg_config("--clean")
581
620
 
582
621
  if openbsd? && !config_system_libraries?
583
- if %x(#{ENV["CC"] || "/usr/bin/cc"} -v 2>&1) !~ /clang/
622
+ unless %x(#{ENV["CC"] || "/usr/bin/cc"} -v 2>&1).include?("clang")
584
623
  (ENV["CC"] ||= find_executable("egcc")) ||
585
624
  abort("Please install gcc 4.9+ from ports using `pkg_add -v gcc`")
586
625
  end
@@ -616,6 +655,9 @@ $LIBS = concat_flags($LIBS, ENV["LIBS"])
616
655
  # errors/warnings. see #2302
617
656
  append_cflags(["-std=c99", "-Wno-declaration-after-statement"])
618
657
 
658
+ # gumbo html5 serialization is slower with O3, let's make sure we use O2
659
+ append_cflags("-O2")
660
+
619
661
  # always include debugging information
620
662
  append_cflags("-g")
621
663
 
@@ -625,8 +667,18 @@ append_cflags("-Winline")
625
667
  # good to have no matter what Ruby was compiled with
626
668
  append_cflags("-Wmissing-noreturn")
627
669
 
670
+ # check integer loss of precision
671
+ if darwin?
672
+ append_cflags("-Wshorten-64-to-32")
673
+ else
674
+ append_cflags("-Wconversion -Wno-sign-conversion")
675
+ end
676
+
628
677
  # handle clang variations, see #1101
629
- append_cflags("-Wno-error=unused-command-line-argument-hard-error-in-future") if darwin?
678
+ if darwin?
679
+ append_cflags("-Wno-error=unused-command-line-argument-hard-error-in-future")
680
+ append_cflags("-Wno-unknown-warning-option")
681
+ end
630
682
 
631
683
  # these tend to be noisy, but on occasion useful during development
632
684
  # append_cflags(["-Wcast-qual", "-Wwrite-strings"])
@@ -666,6 +718,10 @@ else
666
718
  cross_build_p = config_cross_build?
667
719
  message "Cross build is #{cross_build_p ? "enabled" : "disabled"}.\n"
668
720
 
721
+ if needs_darwin_linker_hack
722
+ append_ldflags("-Wl,-flat_namespace")
723
+ end
724
+
669
725
  require "yaml"
670
726
  dependencies = YAML.load_file(File.join(PACKAGE_ROOT_DIR, "dependencies.yml"))
671
727
 
@@ -674,7 +730,7 @@ else
674
730
  if cross_build_p || windows?
675
731
  zlib_recipe = process_recipe("zlib", dependencies["zlib"]["version"], static_p, cross_build_p) do |recipe|
676
732
  recipe.files = [{
677
- url: "https://zlib.net/fossils/#{recipe.name}-#{recipe.version}.tar.gz",
733
+ url: zlib_source(recipe.version),
678
734
  sha256: dependencies["zlib"]["sha256"],
679
735
  }]
680
736
  if windows?
@@ -873,9 +929,13 @@ else
873
929
  $libs = $libs.shellsplit.tap do |libs|
874
930
  [libxml2_recipe, libxslt_recipe].each do |recipe|
875
931
  libname = recipe.name[/\Alib(.+)\z/, 1]
876
- File.join(recipe.path, "bin", "#{libname}-config").tap do |config|
932
+ config_basename = "#{libname}-config"
933
+ File.join(recipe.path, "bin", config_basename).tap do |config|
877
934
  # call config scripts explicit with 'sh' for compat with Windows
878
- $CPPFLAGS = %x(sh #{config} --cflags).strip << " " << $CPPFLAGS
935
+ cflags = %x(sh #{config} --cflags).strip
936
+ message("#{config_basename} cflags: #{cflags}\n")
937
+ $CPPFLAGS = concat_flags(cflags, $CPPFLAGS) # prepend
938
+
879
939
  %x(sh #{config} --libs).strip.shellsplit.each do |arg|
880
940
  case arg
881
941
  when /\A-L(.+)\z/
@@ -894,7 +954,7 @@ else
894
954
  end
895
955
 
896
956
  patches_string = recipe.patch_files.map { |path| File.basename(path) }.join(" ")
897
- append_cppflags(%[-DNOKOGIRI_#{recipe.name.upcase}_PATCHES="\\\"#{patches_string}\\\""])
957
+ append_cppflags(%[-DNOKOGIRI_#{recipe.name.upcase}_PATCHES="\\"#{patches_string}\\""])
898
958
 
899
959
  case libname
900
960
  when "xml2"
@@ -913,16 +973,17 @@ else
913
973
  end.shelljoin
914
974
 
915
975
  if static_p
976
+ static_archive_ld_flag = needs_darwin_linker_hack ? ["-load_hidden"] : []
916
977
  $libs = $libs.shellsplit.map do |arg|
917
978
  case arg
918
979
  when "-lxml2"
919
- File.join(libxml2_recipe.path, "lib", libflag_to_filename(arg))
980
+ static_archive_ld_flag + [File.join(libxml2_recipe.path, "lib", libflag_to_filename(arg))]
920
981
  when "-lxslt", "-lexslt"
921
- File.join(libxslt_recipe.path, "lib", libflag_to_filename(arg))
982
+ static_archive_ld_flag + [File.join(libxslt_recipe.path, "lib", libflag_to_filename(arg))]
922
983
  else
923
984
  arg
924
985
  end
925
- end.shelljoin
986
+ end.flatten.shelljoin
926
987
  end
927
988
 
928
989
  ensure_func("xmlParseDoc", "libxml/parser.h")
@@ -958,7 +1019,7 @@ libgumbo_recipe = process_recipe("libgumbo", "1.0.0-nokogiri", static_p, cross_b
958
1019
  end
959
1020
 
960
1021
  def compile
961
- cflags = concat_flags(ENV["CFLAGS"], "-fPIC", "-g")
1022
+ cflags = concat_flags(ENV["CFLAGS"], "-fPIC", "-O2", "-g")
962
1023
 
963
1024
  env = { "CC" => gcc_cmd, "CFLAGS" => cflags }
964
1025
  if config_cross_build?
@@ -978,7 +1039,7 @@ end
978
1039
  append_cppflags("-I#{File.join(libgumbo_recipe.path, "include")}")
979
1040
  $libs = $libs + " " + File.join(libgumbo_recipe.path, "lib", "libgumbo.a")
980
1041
  $LIBPATH = $LIBPATH | [File.join(libgumbo_recipe.path, "lib")]
981
- ensure_func("gumbo_parse_with_options", "gumbo.h")
1042
+ ensure_func("gumbo_parse_with_options", "nokogiri_gumbo.h")
982
1043
 
983
1044
  have_func("xmlHasFeature") || abort("xmlHasFeature() is missing.") # introduced in libxml 2.6.21
984
1045
  have_func("xmlFirstElementChild") # introduced in libxml 2.7.3
@@ -989,10 +1050,8 @@ have_func("xmlSchemaSetParserStructuredErrors") # introduced in libxml 2.6.23
989
1050
  have_func("rb_gc_location") # introduced in Ruby 2.7
990
1051
  have_func("rb_category_warning") # introduced in Ruby 3.0
991
1052
 
992
- have_func("vasprintf")
993
-
994
1053
  other_library_versions_string = OTHER_LIBRARY_VERSIONS.map { |k, v| [k, v].join(":") }.join(",")
995
- append_cppflags(%[-DNOKOGIRI_OTHER_LIBRARY_VERSIONS="\\\"#{other_library_versions_string}\\\""])
1054
+ append_cppflags(%[-DNOKOGIRI_OTHER_LIBRARY_VERSIONS="\\"#{other_library_versions_string}\\""])
996
1055
 
997
1056
  unless config_system_libraries?
998
1057
  if cross_build_p
data/ext/nokogiri/gumbo.c CHANGED
@@ -23,13 +23,13 @@
23
23
  //
24
24
  // Processing starts by calling gumbo_parse_with_options. The resulting document tree
25
25
  // is then walked, a parallel libxml2 tree is constructed, and the final document is
26
- // then wrapped using Nokogiri_wrap_xml_document. This approach reduces memory and CPU
26
+ // then wrapped using noko_xml_document_wrap. This approach reduces memory and CPU
27
27
  // requirements as Ruby objects are only built when necessary.
28
28
  //
29
29
 
30
30
  #include <nokogiri.h>
31
31
 
32
- #include "gumbo.h"
32
+ #include "nokogiri_gumbo.h"
33
33
 
34
34
  VALUE cNokogiriHtml5Document;
35
35
 
@@ -281,12 +281,12 @@ add_errors(const GumboOutput *output, VALUE rdoc, VALUE input, VALUE url)
281
281
  rb_iv_set(syntax_error, "@code", INT2NUM(1)); // XML_ERR_INTERNAL_ERROR
282
282
  rb_iv_set(syntax_error, "@level", INT2NUM(2)); // XML_ERR_ERROR
283
283
  rb_iv_set(syntax_error, "@file", url);
284
- rb_iv_set(syntax_error, "@line", INT2NUM(position.line));
284
+ rb_iv_set(syntax_error, "@line", SIZET2NUM(position.line));
285
285
  rb_iv_set(syntax_error, "@str1", str1);
286
286
  rb_iv_set(syntax_error, "@str2", Qnil);
287
287
  rb_iv_set(syntax_error, "@str3", Qnil);
288
288
  rb_iv_set(syntax_error, "@int1", INT2NUM(0));
289
- rb_iv_set(syntax_error, "@column", INT2NUM(position.column));
289
+ rb_iv_set(syntax_error, "@column", SIZET2NUM(position.column));
290
290
  rb_ary_push(rerrors, syntax_error);
291
291
  }
292
292
  rb_iv_set(rdoc, "@errors", rerrors);
@@ -297,6 +297,7 @@ typedef struct {
297
297
  GumboOutput *output;
298
298
  VALUE input;
299
299
  VALUE url_or_frag;
300
+ VALUE klass;
300
301
  xmlDocPtr doc;
301
302
  } ParseArgs;
302
303
 
@@ -321,7 +322,7 @@ static VALUE parse_continue(VALUE parse_args);
321
322
  * @!visibility protected
322
323
  */
323
324
  static VALUE
324
- parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth)
325
+ parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth, VALUE klass)
325
326
  {
326
327
  GumboOptions options = kGumboDefaultOptions;
327
328
  options.max_attributes = NUM2INT(max_attributes);
@@ -333,6 +334,7 @@ parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors
333
334
  .output = output,
334
335
  .input = input,
335
336
  .url_or_frag = url,
337
+ .klass = klass,
336
338
  .doc = NULL,
337
339
  };
338
340
 
@@ -357,7 +359,9 @@ parse_continue(VALUE parse_args)
357
359
  }
358
360
  args->doc = doc; // Make sure doc gets cleaned up if an error is thrown.
359
361
  build_tree(doc, (xmlNodePtr)doc, output->document);
360
- VALUE rdoc = Nokogiri_wrap_xml_document(cNokogiriHtml5Document, doc);
362
+ VALUE rdoc = noko_xml_document_wrap(args->klass, doc);
363
+ rb_iv_set(rdoc, "@url", args->url_or_frag);
364
+ rb_iv_set(rdoc, "@quirks_mode", INT2NUM(output->document->v.document.doc_type_quirks_mode));
361
365
  args->doc = NULL; // The Ruby runtime now owns doc so don't delete it.
362
366
  add_errors(output, rdoc, args->input, args->url_or_frag);
363
367
  return rdoc;
@@ -498,9 +502,11 @@ error:
498
502
  }
499
503
 
500
504
  // Encoding.
501
- if (RSTRING_LEN(tag_name) == 14
505
+ if (ctx_ns == GUMBO_NAMESPACE_MATHML
506
+ && RSTRING_LEN(tag_name) == 14
502
507
  && !st_strcasecmp(ctx_tag, "annotation-xml")) {
503
508
  VALUE enc = rb_funcall(ctx, rb_intern_const("[]"),
509
+ 1,
504
510
  rb_utf8_str_new_static("encoding", 8));
505
511
  if (RTEST(enc)) {
506
512
  Check_Type(enc, T_STRING);
@@ -512,8 +518,11 @@ error:
512
518
  // Quirks mode.
513
519
  VALUE doc = rb_funcall(doc_fragment, rb_intern_const("document"), 0);
514
520
  VALUE dtd = rb_funcall(doc, internal_subset, 0);
515
- if (NIL_P(dtd)) {
521
+ VALUE doc_quirks_mode = rb_iv_get(doc, "@quirks_mode");
522
+ if (NIL_P(ctx) || NIL_P(doc_quirks_mode)) {
516
523
  quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS;
524
+ } else if (NIL_P(dtd)) {
525
+ quirks_mode = GUMBO_DOCTYPE_QUIRKS;
517
526
  } else {
518
527
  VALUE dtd_name = rb_funcall(dtd, name, 0);
519
528
  VALUE pubid = rb_funcall(dtd, rb_intern_const("external_id"), 0);
@@ -560,6 +569,7 @@ fragment_continue(VALUE parse_args)
560
569
  args->doc = NULL; // The Ruby runtime owns doc so make sure we don't delete it.
561
570
  xmlNodePtr xml_frag = extract_xml_node(doc_fragment);
562
571
  build_tree(xml_doc, xml_frag, output->root);
572
+ rb_iv_set(doc_fragment, "@quirks_mode", INT2NUM(output->document->v.document.doc_type_quirks_mode));
563
573
  add_errors(output, doc_fragment, args->input, rb_utf8_str_new_static("#fragment", 9));
564
574
  return Qnil;
565
575
  }
@@ -577,7 +587,7 @@ noko_init_gumbo()
577
587
  parent = rb_intern_const("parent");
578
588
 
579
589
  // Define Nokogumbo module with parse and fragment methods.
580
- rb_define_singleton_method(mNokogiriGumbo, "parse", parse, 5);
590
+ rb_define_singleton_method(mNokogiriGumbo, "parse", parse, 6);
581
591
  rb_define_singleton_method(mNokogiriGumbo, "fragment", fragment, 6);
582
592
  }
583
593
 
@@ -146,7 +146,7 @@ rb_html_document_type(VALUE self)
146
146
  {
147
147
  htmlDocPtr doc;
148
148
  Data_Get_Struct(self, xmlDoc, doc);
149
- return INT2NUM((long)doc->type);
149
+ return INT2NUM(doc->type);
150
150
  }
151
151
 
152
152
  void
@@ -20,7 +20,7 @@ get(VALUE _, VALUE rb_entity_name)
20
20
  return Qnil;
21
21
  }
22
22
 
23
- rb_constructor_args[0] = INT2NUM((long)c_entity_desc->value);
23
+ rb_constructor_args[0] = UINT2NUM(c_entity_desc->value);
24
24
  rb_constructor_args[1] = NOKOGIRI_STR_NEW2(c_entity_desc->name);
25
25
  rb_constructor_args[2] = NOKOGIRI_STR_NEW2(c_entity_desc->desc);
26
26
 
@@ -5,13 +5,8 @@ VALUE cNokogiriHtml4SaxParserContext ;
5
5
  static void
6
6
  deallocate(xmlParserCtxtPtr ctxt)
7
7
  {
8
- NOKOGIRI_DEBUG_START(ctxt);
9
-
10
8
  ctxt->sax = NULL;
11
-
12
9
  htmlFreeParserCtxt(ctxt);
13
-
14
- NOKOGIRI_DEBUG_END(ctxt);
15
10
  }
16
11
 
17
12
  static VALUE
@@ -49,34 +49,11 @@ void noko_init_html_sax_push_parser(void);
49
49
  void noko_init_gumbo(void);
50
50
  void noko_init_test_global_handlers(void);
51
51
 
52
- static ID id_read, id_write;
53
-
54
-
55
- #ifndef HAVE_VASPRINTF
56
- /*
57
- * Thank you Geoffroy Couprie for this implementation of vasprintf!
58
- */
59
- int
60
- vasprintf(char **strp, const char *fmt, va_list ap)
61
- {
62
- /* Mingw32/64 have a broken vsnprintf implementation that fails when
63
- * using a zero-byte limit in order to retrieve the required size for malloc.
64
- * So we use a one byte buffer instead.
65
- */
66
- char tmp[1];
67
- int len = vsnprintf(tmp, 1, fmt, ap) + 1;
68
- char *res = (char *)malloc((unsigned int)len);
69
- if (res == NULL) {
70
- return -1;
71
- }
72
- *strp = res;
73
- return vsnprintf(res, (unsigned int)len, fmt, ap);
74
- }
75
- #endif
52
+ static ID id_read, id_write, id_external_encoding;
76
53
 
77
54
 
78
55
  static VALUE
79
- read_check(VALUE val)
56
+ noko_io_read_check(VALUE val)
80
57
  {
81
58
  VALUE *args = (VALUE *)val;
82
59
  return rb_funcall(args[0], id_read, 1, args[1]);
@@ -84,68 +61,72 @@ read_check(VALUE val)
84
61
 
85
62
 
86
63
  static VALUE
87
- read_failed(VALUE arg, VALUE exc)
64
+ noko_io_read_failed(VALUE arg, VALUE exc)
88
65
  {
89
66
  return Qundef;
90
67
  }
91
68
 
92
69
 
93
70
  int
94
- noko_io_read(void *ctx, char *buffer, int len)
71
+ noko_io_read(void *io, char *c_buffer, int c_buffer_len)
95
72
  {
96
- VALUE string, args[2];
97
- size_t str_len, safe_len;
73
+ VALUE rb_io = (VALUE)io;
74
+ VALUE rb_read_string, rb_args[2];
75
+ size_t n_bytes_read, safe_len;
98
76
 
99
- args[0] = (VALUE)ctx;
100
- args[1] = INT2NUM(len);
77
+ rb_args[0] = rb_io;
78
+ rb_args[1] = INT2NUM(c_buffer_len);
101
79
 
102
- string = rb_rescue(read_check, (VALUE)args, read_failed, 0);
80
+ rb_read_string = rb_rescue(noko_io_read_check, (VALUE)rb_args, noko_io_read_failed, 0);
103
81
 
104
- if (NIL_P(string)) { return 0; }
105
- if (string == Qundef) { return -1; }
106
- if (TYPE(string) != T_STRING) { return -1; }
82
+ if (NIL_P(rb_read_string)) { return 0; }
83
+ if (rb_read_string == Qundef) { return -1; }
84
+ if (TYPE(rb_read_string) != T_STRING) { return -1; }
107
85
 
108
- str_len = (size_t)RSTRING_LEN(string);
109
- safe_len = str_len > (size_t)len ? (size_t)len : str_len;
110
- memcpy(buffer, StringValuePtr(string), safe_len);
86
+ n_bytes_read = (size_t)RSTRING_LEN(rb_read_string);
87
+ safe_len = (n_bytes_read > (size_t)c_buffer_len) ? (size_t)c_buffer_len : n_bytes_read;
88
+ memcpy(c_buffer, StringValuePtr(rb_read_string), safe_len);
111
89
 
112
90
  return (int)safe_len;
113
91
  }
114
92
 
115
93
 
116
94
  static VALUE
117
- write_check(VALUE val)
95
+ noko_io_write_check(VALUE rb_args)
118
96
  {
119
- VALUE *args = (VALUE *)val;
120
- return rb_funcall(args[0], id_write, 1, args[1]);
97
+ VALUE rb_io = ((VALUE *)rb_args)[0];
98
+ VALUE rb_output = ((VALUE *)rb_args)[1];
99
+ return rb_funcall(rb_io, id_write, 1, rb_output);
121
100
  }
122
101
 
123
102
 
124
103
  static VALUE
125
- write_failed(VALUE arg, VALUE exc)
104
+ noko_io_write_failed(VALUE arg, VALUE exc)
126
105
  {
127
106
  return Qundef;
128
107
  }
129
108
 
130
109
 
131
110
  int
132
- noko_io_write(void *ctx, char *buffer, int len)
111
+ noko_io_write(void *io, char *c_buffer, int c_buffer_len)
133
112
  {
134
- VALUE args[2], size;
135
-
136
- args[0] = (VALUE)ctx;
137
- args[1] = rb_str_new(buffer, (long)len);
113
+ VALUE rb_args[2], rb_n_bytes_written;
114
+ VALUE rb_io = (VALUE)io;
115
+ VALUE rb_enc = rb_funcall(rb_io, id_external_encoding, 0);
116
+ rb_encoding *io_encoding = RB_NIL_P(rb_enc) ? rb_ascii8bit_encoding() : rb_to_encoding(rb_enc);
138
117
 
139
- size = rb_rescue(write_check, (VALUE)args, write_failed, 0);
118
+ rb_args[0] = rb_io;
119
+ rb_args[1] = rb_enc_str_new(c_buffer, (long)c_buffer_len, io_encoding);
140
120
 
141
- if (size == Qundef) { return -1; }
121
+ rb_n_bytes_written = rb_rescue(noko_io_write_check, (VALUE)rb_args, noko_io_write_failed, 0);
122
+ if (rb_n_bytes_written == Qundef) { return -1; }
142
123
 
143
- return NUM2INT(size);
124
+ return NUM2INT(rb_n_bytes_written);
144
125
  }
145
126
 
146
127
 
147
128
  int
148
- noko_io_close(void *ctx)
129
+ noko_io_close(void *io)
149
130
  {
150
131
  return 0;
151
132
  }
@@ -275,4 +256,5 @@ Init_nokogiri()
275
256
 
276
257
  id_read = rb_intern("read");
277
258
  id_write = rb_intern("write");
259
+ id_external_encoding = rb_intern("external_encoding");
278
260
  }
@@ -13,7 +13,7 @@ attribute_type(VALUE self)
13
13
  {
14
14
  xmlAttributePtr node;
15
15
  Noko_Node_Get_Struct(self, xmlAttribute, node);
16
- return INT2NUM((long)node->atype);
16
+ return INT2NUM(node->atype);
17
17
  }
18
18
 
19
19
  /*
@@ -29,7 +29,7 @@ new (int argc, VALUE *argv, VALUE klass)
29
29
 
30
30
  if (!NIL_P(content)) {
31
31
  content_str = (xmlChar *)StringValuePtr(content);
32
- content_str_len = RSTRING_LEN(content);
32
+ content_str_len = RSTRING_LENINT(content);
33
33
  }
34
34
 
35
35
  node = xmlNewCDataBlock(xml_doc->doc, content_str, content_str_len);
@@ -65,14 +65,12 @@ dealloc(xmlDocPtr doc)
65
65
  {
66
66
  st_table *node_hash;
67
67
 
68
- NOKOGIRI_DEBUG_START(doc);
69
-
70
68
  node_hash = DOC_UNLINKED_NODE_HASH(doc);
71
69
 
72
70
  st_foreach(node_hash, dealloc_node_i, (st_data_t)doc);
73
71
  st_free_table(node_hash);
74
72
 
75
- free(doc->_private);
73
+ ruby_xfree(doc->_private);
76
74
 
77
75
  /* When both Nokogiri and libxml-ruby are loaded, make sure that all nodes
78
76
  * have their _private pointers cleared. This is to avoid libxml-ruby's
@@ -84,8 +82,6 @@ dealloc(xmlDocPtr doc)
84
82
  }
85
83
 
86
84
  xmlFreeDoc(doc);
87
-
88
- NOKOGIRI_DEBUG_END(doc);
89
85
  }
90
86
 
91
87
  static void
@@ -540,6 +536,7 @@ rb_xml_document_canonicalize(int argc, VALUE *argv, VALUE self)
540
536
  VALUE rb_mode;
541
537
  VALUE rb_namespaces;
542
538
  VALUE rb_comments_p;
539
+ int c_mode = 0;
543
540
  xmlChar **c_namespaces;
544
541
 
545
542
  xmlDocPtr c_doc;
@@ -551,8 +548,16 @@ rb_xml_document_canonicalize(int argc, VALUE *argv, VALUE self)
551
548
  VALUE rb_io;
552
549
 
553
550
  rb_scan_args(argc, argv, "03", &rb_mode, &rb_namespaces, &rb_comments_p);
554
- if (!NIL_P(rb_mode)) { Check_Type(rb_mode, T_FIXNUM); }
555
- if (!NIL_P(rb_namespaces)) { Check_Type(rb_namespaces, T_ARRAY); }
551
+ if (!NIL_P(rb_mode)) {
552
+ Check_Type(rb_mode, T_FIXNUM);
553
+ c_mode = NUM2INT(rb_mode);
554
+ }
555
+ if (!NIL_P(rb_namespaces)) {
556
+ Check_Type(rb_namespaces, T_ARRAY);
557
+ if (c_mode == XML_C14N_1_0 || c_mode == XML_C14N_1_1) {
558
+ rb_raise(rb_eRuntimeError, "This canonicalizer does not support this operation");
559
+ }
560
+ }
556
561
 
557
562
  Data_Get_Struct(self, xmlDoc, c_doc);
558
563
 
@@ -573,7 +578,7 @@ rb_xml_document_canonicalize(int argc, VALUE *argv, VALUE self)
573
578
  c_namespaces = NULL;
574
579
  } else {
575
580
  long ns_len = RARRAY_LEN(rb_namespaces);
576
- c_namespaces = calloc((size_t)ns_len + 1, sizeof(xmlChar *));
581
+ c_namespaces = ruby_xcalloc((size_t)ns_len + 1, sizeof(xmlChar *));
577
582
  for (int j = 0 ; j < ns_len ; j++) {
578
583
  VALUE entry = rb_ary_entry(rb_namespaces, j);
579
584
  c_namespaces[j] = (xmlChar *)StringValueCStr(entry);
@@ -581,12 +586,12 @@ rb_xml_document_canonicalize(int argc, VALUE *argv, VALUE self)
581
586
  }
582
587
 
583
588
  xmlC14NExecute(c_doc, c_callback_wrapper, rb_callback,
584
- (int)(NIL_P(rb_mode) ? 0 : NUM2INT(rb_mode)),
589
+ c_mode,
585
590
  c_namespaces,
586
591
  (int)RTEST(rb_comments_p),
587
592
  c_obuf);
588
593
 
589
- free(c_namespaces);
594
+ ruby_xfree(c_namespaces);
590
595
  xmlOutputBufferClose(c_obuf);
591
596
 
592
597
  return rb_funcall(rb_io, rb_intern("string"), 0);
@@ -604,7 +609,7 @@ noko_xml_document_wrap_with_init_args(VALUE klass, xmlDocPtr c_document, int arg
604
609
 
605
610
  rb_document = Data_Wrap_Struct(klass, mark, dealloc, c_document);
606
611
 
607
- tuple = (nokogiriTuplePtr)malloc(sizeof(nokogiriTuple));
612
+ tuple = (nokogiriTuplePtr)ruby_xmalloc(sizeof(nokogiriTuple));
608
613
  tuple->doc = rb_document;
609
614
  tuple->unlinkedNodes = st_init_numtable_with_size(128);
610
615
  tuple->node_cache = rb_ary_new();
@@ -31,7 +31,7 @@ get_type(VALUE self)
31
31
  xmlElementContentPtr elem;
32
32
  Data_Get_Struct(self, xmlElementContent, elem);
33
33
 
34
- return INT2NUM((long)elem->type);
34
+ return INT2NUM(elem->type);
35
35
  }
36
36
 
37
37
  /*
@@ -79,7 +79,7 @@ get_occur(VALUE self)
79
79
  xmlElementContentPtr elem;
80
80
  Data_Get_Struct(self, xmlElementContent, elem);
81
81
 
82
- return INT2NUM((long)elem->ocur);
82
+ return INT2NUM(elem->ocur);
83
83
  }
84
84
 
85
85
  /*
@@ -15,7 +15,7 @@ element_type(VALUE self)
15
15
  {
16
16
  xmlElementPtr node;
17
17
  Noko_Node_Get_Struct(self, xmlElement, node);
18
- return INT2NUM((long)node->etype);
18
+ return INT2NUM(node->etype);
19
19
  }
20
20
 
21
21
  /*
@@ -45,9 +45,9 @@ rb_xml_encoding_handler_s_delete(VALUE klass, VALUE name)
45
45
 
46
46
 
47
47
  /*
48
- * call-seq: Nokogiri::EncodingHandler.alias(from, to)
48
+ * call-seq: Nokogiri::EncodingHandler.alias(real_name, alias_name)
49
49
  *
50
- * Alias encoding handler with name +from+ to name +to+
50
+ * Alias encoding handler with name +real_name+ to name +alias_name+
51
51
  */
52
52
  static VALUE
53
53
  rb_xml_encoding_handler_s_alias(VALUE klass, VALUE from, VALUE to)