nokogiri 1.13.10 → 1.14.0.rc1

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (86) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +33 -0
  3. data/LICENSE-DEPENDENCIES.md +830 -509
  4. data/README.md +18 -11
  5. data/dependencies.yml +25 -7
  6. data/ext/nokogiri/extconf.rb +79 -20
  7. data/ext/nokogiri/gumbo.c +19 -9
  8. data/ext/nokogiri/html4_document.c +1 -1
  9. data/ext/nokogiri/html4_entity_lookup.c +1 -1
  10. data/ext/nokogiri/html4_sax_parser_context.c +0 -5
  11. data/ext/nokogiri/nokogiri.c +32 -51
  12. data/ext/nokogiri/nokogiri.h +17 -14
  13. data/ext/nokogiri/xml_attribute_decl.c +1 -1
  14. data/ext/nokogiri/xml_cdata.c +1 -1
  15. data/ext/nokogiri/xml_document.c +16 -11
  16. data/ext/nokogiri/xml_element_content.c +2 -2
  17. data/ext/nokogiri/xml_element_decl.c +1 -1
  18. data/ext/nokogiri/xml_encoding_handler.c +2 -2
  19. data/ext/nokogiri/xml_namespace.c +38 -8
  20. data/ext/nokogiri/xml_node.c +286 -26
  21. data/ext/nokogiri/xml_node_set.c +0 -2
  22. data/ext/nokogiri/xml_reader.c +40 -20
  23. data/ext/nokogiri/xml_relax_ng.c +0 -2
  24. data/ext/nokogiri/xml_sax_parser.c +22 -16
  25. data/ext/nokogiri/xml_sax_parser_context.c +0 -5
  26. data/ext/nokogiri/xml_sax_push_parser.c +0 -2
  27. data/ext/nokogiri/xml_schema.c +0 -2
  28. data/ext/nokogiri/xml_xpath_context.c +87 -83
  29. data/ext/nokogiri/xslt_stylesheet.c +14 -13
  30. data/gumbo-parser/Makefile +10 -0
  31. data/gumbo-parser/src/attribute.h +1 -1
  32. data/gumbo-parser/src/error.c +1 -1
  33. data/gumbo-parser/src/error.h +1 -1
  34. data/gumbo-parser/src/foreign_attrs.c +2 -2
  35. data/gumbo-parser/src/{gumbo.h → nokogiri_gumbo.h} +1 -0
  36. data/gumbo-parser/src/parser.c +7 -4
  37. data/gumbo-parser/src/replacement.h +1 -1
  38. data/gumbo-parser/src/string_buffer.h +1 -1
  39. data/gumbo-parser/src/string_piece.c +1 -1
  40. data/gumbo-parser/src/svg_attrs.c +2 -2
  41. data/gumbo-parser/src/svg_tags.c +2 -2
  42. data/gumbo-parser/src/tag.c +2 -1
  43. data/gumbo-parser/src/tag_lookup.c +7 -7
  44. data/gumbo-parser/src/tag_lookup.gperf +1 -0
  45. data/gumbo-parser/src/tag_lookup.h +1 -1
  46. data/gumbo-parser/src/token_buffer.h +1 -1
  47. data/gumbo-parser/src/tokenizer.c +1 -1
  48. data/gumbo-parser/src/tokenizer.h +1 -1
  49. data/gumbo-parser/src/utf8.c +1 -1
  50. data/gumbo-parser/src/utf8.h +1 -1
  51. data/gumbo-parser/src/util.c +1 -3
  52. data/gumbo-parser/src/util.h +4 -0
  53. data/gumbo-parser/src/vector.h +1 -1
  54. data/lib/nokogiri/css/node.rb +2 -2
  55. data/lib/nokogiri/css/xpath_visitor.rb +3 -1
  56. data/lib/nokogiri/css.rb +6 -0
  57. data/lib/nokogiri/encoding_handler.rb +57 -0
  58. data/lib/nokogiri/extension.rb +3 -2
  59. data/lib/nokogiri/html4/document.rb +2 -121
  60. data/lib/nokogiri/html4/element_description_defaults.rb +6 -12
  61. data/lib/nokogiri/html4/encoding_reader.rb +121 -0
  62. data/lib/nokogiri/html4.rb +1 -0
  63. data/lib/nokogiri/html5/document.rb +113 -36
  64. data/lib/nokogiri/html5/document_fragment.rb +9 -2
  65. data/lib/nokogiri/html5/node.rb +3 -5
  66. data/lib/nokogiri/html5.rb +127 -216
  67. data/lib/nokogiri/jruby/dependencies.rb +1 -19
  68. data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
  69. data/lib/nokogiri/version/constant.rb +1 -1
  70. data/lib/nokogiri/version/info.rb +11 -10
  71. data/lib/nokogiri/xml/attr.rb +49 -0
  72. data/lib/nokogiri/xml/builder.rb +1 -1
  73. data/lib/nokogiri/xml/document.rb +102 -54
  74. data/lib/nokogiri/xml/document_fragment.rb +49 -6
  75. data/lib/nokogiri/xml/namespace.rb +42 -0
  76. data/lib/nokogiri/xml/node/save_options.rb +4 -2
  77. data/lib/nokogiri/xml/node.rb +190 -35
  78. data/lib/nokogiri/xml/node_set.rb +87 -9
  79. data/lib/nokogiri/xml/parse_options.rb +127 -48
  80. data/lib/nokogiri/xml/pp/node.rb +6 -4
  81. data/lib/nokogiri/xml/processing_instruction.rb +2 -1
  82. data/lib/nokogiri/xml/sax/parser.rb +2 -3
  83. data/lib/nokogiri/xslt.rb +1 -1
  84. data/lib/nokogiri.rb +3 -11
  85. metadata +11 -247
  86. data/patches/libxml2/0005-avoid-isnan-isinf.patch +0 -81
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  # Nokogiri
4
4
 
5
- Nokogiri (鋸) makes it easy and painless to work with XML and HTML from Ruby. It provides a sensible, easy-to-understand API for [reading](https://nokogiri.org/tutorials/parsing_an_html_xml_document.html), writing, [modifying](https://nokogiri.org/tutorials/modifying_an_html_xml_document.html), and [querying](https://nokogiri.org/tutorials/searching_a_xml_html_document.html) documents. It is fast and standards-compliant by relying on native parsers like libxml2 (CRuby) and xerces (JRuby).
5
+ Nokogiri (鋸) makes it easy and painless to work with XML and HTML from Ruby. It provides a sensible, easy-to-understand API for [reading](https://nokogiri.org/tutorials/parsing_an_html_xml_document.html), writing, [modifying](https://nokogiri.org/tutorials/modifying_an_html_xml_document.html), and [querying](https://nokogiri.org/tutorials/searching_a_xml_html_document.html) documents. It is fast and standards-compliant by relying on native parsers like libxml2, libgumbo, and xerces.
6
6
 
7
7
  ## Guiding Principles
8
8
 
@@ -40,10 +40,6 @@ Some guiding principles Nokogiri tries to follow:
40
40
 
41
41
  All official documentation is posted at https://nokogiri.org (the source for which is at https://github.com/sparklemotion/nokogiri.org/, and we welcome contributions).
42
42
 
43
- Consider subscribing to [Tidelift][tidelift] which provides license assurances and timely security notifications for your open source dependencies, including Nokogiri. [Tidelift][tidelift] subscriptions also help the Nokogiri maintainers fund our [automated testing](https://ci.nokogiri.org) which in turn allows us to ship releases, bugfixes, and security updates more often.
44
-
45
- [tidelift]: https://tidelift.com/subscription/pkg/rubygems-nokogiri?utm_source=rubygems-nokogiri&utm_medium=referral&utm_campaign=readme
46
-
47
43
  ### Reading
48
44
 
49
45
  Your first stops for learning more about Nokogiri should be:
@@ -57,7 +53,6 @@ Your first stops for learning more about Nokogiri should be:
57
53
 
58
54
  There are a few ways to ask exploratory questions:
59
55
 
60
- - The Ruby Discord chat server is active at https://discord.gg/UyQnKrT
61
56
  - The Nokogiri mailing list is active at https://groups.google.com/group/nokogiri-talk
62
57
  - Open an issue using the "Help Request" template at https://github.com/sparklemotion/nokogiri/issues
63
58
 
@@ -103,12 +98,21 @@ We bump `Major.Minor.Patch` versions following this guidance:
103
98
  - Updating packaged libraries for security-related reasons.
104
99
 
105
100
 
101
+ ### Sponsorship
102
+
103
+ You can help sponsor the maintainers of this software through one of these organizations:
104
+
105
+ - [github.com/sponsors/flavorjones](https://github.com/sponsors/flavorjones)
106
+ - [opencollective.com/nokogiri](https://opencollective.com/nokogiri)
107
+ - [tidelift.com/subscription/pkg/rubygems-nokogiri](https://tidelift.com/subscription/pkg/rubygems-nokogiri?utm_source=rubygems-nokogiri&utm_medium=referral&utm_campaign=readme)
108
+
109
+
106
110
  ## Installation
107
111
 
108
112
  Requirements:
109
113
 
110
- - Ruby >= 2.6
111
- - JRuby >= 9.3.0.0
114
+ - Ruby >= 2.7
115
+ - JRuby >= 9.4.0.0
112
116
 
113
117
 
114
118
  ### Native Gems: Faster, more reliable installation
@@ -119,10 +123,13 @@ Requirements:
119
123
 
120
124
  Nokogiri ships pre-compiled, "native" gems for the following platforms:
121
125
 
122
- - Linux: `x86-linux` and `x86_64-linux` (req: `glibc >= 2.17`), including musl platforms like Alpine
126
+ - Linux:
127
+ - `x86-linux` and `x86_64-linux` (req: `glibc >= 2.17`)
128
+ - `aarch64-linux` and `arm-linux` (req: `glibc >= 2.29`)
129
+ - Note that musl platforms like Alpine **are** supported
123
130
  - Darwin/MacOS: `x86_64-darwin` and `arm64-darwin`
124
- - Windows: `x86-mingw32` and `x64-mingw32`
125
- - Java: any platform running JRuby 9.3 or higher
131
+ - Windows: `x86-mingw32`, `x64-mingw32`, and `x64-mingw-ucrt`
132
+ - Java: any platform running JRuby 9.4 or higher
126
133
 
127
134
  To determine whether your system supports one of these gems, look at the output of `bundle platform` or `ruby -e 'puts Gem::Platform.local.to_s'`.
128
135
 
data/dependencies.yml CHANGED
@@ -14,10 +14,28 @@ zlib:
14
14
  # SHA-256 hash provided on http://zlib.net/
15
15
 
16
16
  libiconv:
17
- version: "1.16"
18
- sha256: "e6a1b1b589654277ee790cce3734f07876ac4ccfaecbee8afa0b649cf529cc04"
19
- # gpg: Signature made Fri 26 Apr 2019 03:36:38 PM EDT
20
- # gpg: using RSA key 4F494A942E4616C2
21
- # gpg: Good signature from "Bruno Haible (Open Source Development) <bruno@clisp.org>" [expired]
22
- # gpg: Note: This key has expired!
23
- # Primary key fingerprint: 68D9 4D8A AEEA D48A E7DC 5B90 4F49 4A94 2E46 16C2
17
+ version: "1.17"
18
+ sha256: "8f74213b56238c85a50a5329f77e06198771e70dd9a739779f4c02f65d971313"
19
+ # signature verified by following this path:
20
+ # - release announced at https://savannah.gnu.org/forum/forum.php?forum_id=10175
21
+ # - which links to https://savannah.gnu.org/users/haible as the releaser
22
+ # - which links to https://savannah.gnu.org/people/viewgpg.php?user_id=1871 as the gpg key
23
+ #
24
+ # So:
25
+ # - wget -q -O - https://savannah.gnu.org/people/viewgpg.php?user_id=1871 | gpg --import
26
+ # gpg: key F5BE8B267C6A406D: 1 signature not checked due to a missing key
27
+ # gpg: key F5BE8B267C6A406D: public key "Bruno Haible (Open Source Development) <bruno@clisp.org>" imported
28
+ # gpg: Total number processed: 1
29
+ # gpg: imported: 1
30
+ # gpg: marginals needed: 3 completes needed: 1 trust model: pgp
31
+ # gpg: depth: 0 valid: 4 signed: 0 trust: 0-, 0q, 0n, 0m, 0f, 4u
32
+ # gpg: next trustdb check due at 2024-05-09
33
+ # - gpg --verify libiconv-1.17.tar.gz.sig ports/archives/libiconv-1.17.tar.gz
34
+ # gpg: Signature made Sun 15 May 2022 11:26:42 AM EDT
35
+ # gpg: using RSA key 9001B85AF9E1B83DF1BDA942F5BE8B267C6A406D
36
+ # gpg: Good signature from "Bruno Haible (Open Source Development) <bruno@clisp.org>" [unknown]
37
+ # gpg: WARNING: This key is not certified with a trusted signature!
38
+ # gpg: There is no indication that the signature belongs to the owner.
39
+ # Primary key fingerprint: 9001 B85A F9E1 B83D F1BD A942 F5BE 8B26 7C6A 406D
40
+ #
41
+ # And this sha256sum is calculated from that verified tarball.
@@ -200,7 +200,7 @@ def nix?
200
200
  end
201
201
 
202
202
  def truffle?
203
- ::RUBY_ENGINE == "truffleruby"
203
+ RUBY_ENGINE == "truffleruby"
204
204
  end
205
205
 
206
206
  def concat_flags(*args)
@@ -211,6 +211,16 @@ def local_have_library(lib, func = nil, headers = nil)
211
211
  have_library(lib, func, headers) || have_library("lib#{lib}", func, headers)
212
212
  end
213
213
 
214
+ def zlib_source(version_string)
215
+ # As of 2022-12, I'm starting to see failed downloads often enough from zlib.net that I want to
216
+ # change the default to github.
217
+ if ENV["NOKOGIRI_USE_CANONICAL_ZLIB_SOURCE"]
218
+ "https://zlib.net/fossils/zlib-#{version_string}.tar.gz"
219
+ else
220
+ "https://github.com/madler/zlib/releases/download/v#{version_string}/zlib-#{version_string}.tar.gz"
221
+ end
222
+ end
223
+
214
224
  def gnome_source
215
225
  # As of 2022-02-20, some mirrors have expired SSL certificates. I'm able to retrieve from my home,
216
226
  # but whatever host is resolved on the github actions workers see an expired cert.
@@ -400,8 +410,8 @@ def iconv_configure_flags
400
410
  return ["--with-iconv=yes"]
401
411
  end
402
412
 
403
- config = preserving_globals { have_package_configuration("libiconv") }
404
- if config && try_link_iconv("pkg-config libiconv") { have_package_configuration("libiconv") }
413
+ config = preserving_globals { pkg_config("libiconv") }
414
+ if config && try_link_iconv("pkg-config libiconv") { pkg_config("libiconv") }
405
415
  cflags, ldflags, libs = config
406
416
 
407
417
  return [
@@ -430,10 +440,12 @@ def process_recipe(name, version, static_p, cross_p, cacheable_p = true)
430
440
  "#{@target}/#{RUBY_PLATFORM}/#{@name}/#{@version}"
431
441
  end
432
442
 
433
- recipe.target = File.join(PACKAGE_ROOT_DIR, "ports") if cacheable_p
434
- # Prefer host_alias over host in order to use the correct compiler prefix for cross build, but
435
- # use host if not set.
443
+ # We use 'host' to set compiler prefix for cross-compiling. Prefer host_alias over host. And
444
+ # prefer i686 (what external dev tools use) to i386 (what ruby's configure.ac emits).
436
445
  recipe.host = RbConfig::CONFIG["host_alias"].empty? ? RbConfig::CONFIG["host"] : RbConfig::CONFIG["host_alias"]
446
+ recipe.host = recipe.host.gsub(/i386/, "i686")
447
+
448
+ recipe.target = File.join(PACKAGE_ROOT_DIR, "ports") if cacheable_p
437
449
  recipe.configure_options << "--libdir=#{File.join(recipe.path, "lib")}"
438
450
 
439
451
  yield recipe
@@ -525,7 +537,6 @@ def process_recipe(name, version, static_p, cross_p, cacheable_p = true)
525
537
 
526
538
  EOM
527
539
 
528
- pp(recipe.files)
529
540
  chdir_for_build { recipe.cook }
530
541
  FileUtils.touch(checkpoint)
531
542
  end
@@ -573,6 +584,34 @@ def do_clean
573
584
  exit!(0)
574
585
  end
575
586
 
587
+ # In ruby 3.2, symbol resolution changed on Darwin, to introduce the `-bundle_loader` flag to
588
+ # resolve symbols against the ruby binary.
589
+ #
590
+ # This makes it challenging to build a single extension that works with both a ruby with
591
+ # `--enable-shared` and one with `--disable-shared`. To work around that, we choose to add
592
+ # `-flat_namespace` to the link line (later in this file).
593
+ #
594
+ # The `-flat_namespace` line introduces its own behavior change, which is that (similar to on
595
+ # Linux), any symbols in the extension that are exported may now be resolved by shared libraries
596
+ # loaded by the Ruby process. Specifically, that means that libxml2 and libxslt, which are
597
+ # statically linked into the nokogiri bundle, will resolve (at runtime) to a system libxml2 loaded
598
+ # by Ruby on Darwin. And it appears that often Ruby on Darwin does indeed load the system libxml2,
599
+ # and that messes with our assumptions about whether we're running with a patched libxml2 or a
600
+ # vanilla libxml2.
601
+ #
602
+ # We choose to use `-load_hidden` in this case to prevent exporting those symbols from libxml2 and
603
+ # libxslt, which ensures that they will be resolved to the static libraries in the bundle. In other
604
+ # words, when we use `-load_hidden`, what happens in the extension stays in the extension.
605
+ #
606
+ # See https://github.com/rake-compiler/rake-compiler-dock/issues/87 for more info.
607
+ #
608
+ # Anyway, this method is the logical bit to tell us when to turn on these workarounds.
609
+ def needs_darwin_linker_hack
610
+ config_cross_build? &&
611
+ darwin? &&
612
+ Gem::Requirement.new("~> 3.2").satisfied_by?(Gem::Version.new(RbConfig::CONFIG["ruby_version"].split("+").first))
613
+ end
614
+
576
615
  #
577
616
  # main
578
617
  #
@@ -616,6 +655,9 @@ $LIBS = concat_flags($LIBS, ENV["LIBS"])
616
655
  # errors/warnings. see #2302
617
656
  append_cflags(["-std=c99", "-Wno-declaration-after-statement"])
618
657
 
658
+ # gumbo html5 serialization is slower with O3, let's make sure we use O2
659
+ append_cflags("-O2")
660
+
619
661
  # always include debugging information
620
662
  append_cflags("-g")
621
663
 
@@ -625,8 +667,18 @@ append_cflags("-Winline")
625
667
  # good to have no matter what Ruby was compiled with
626
668
  append_cflags("-Wmissing-noreturn")
627
669
 
670
+ # check integer loss of precision
671
+ if darwin?
672
+ append_cflags("-Wshorten-64-to-32")
673
+ else
674
+ append_cflags("-Wconversion -Wno-sign-conversion")
675
+ end
676
+
628
677
  # handle clang variations, see #1101
629
- append_cflags("-Wno-error=unused-command-line-argument-hard-error-in-future") if darwin?
678
+ if darwin?
679
+ append_cflags("-Wno-error=unused-command-line-argument-hard-error-in-future")
680
+ append_cflags("-Wno-unknown-warning-option")
681
+ end
630
682
 
631
683
  # these tend to be noisy, but on occasion useful during development
632
684
  # append_cflags(["-Wcast-qual", "-Wwrite-strings"])
@@ -666,6 +718,10 @@ else
666
718
  cross_build_p = config_cross_build?
667
719
  message "Cross build is #{cross_build_p ? "enabled" : "disabled"}.\n"
668
720
 
721
+ if needs_darwin_linker_hack
722
+ append_ldflags("-Wl,-flat_namespace")
723
+ end
724
+
669
725
  require "yaml"
670
726
  dependencies = YAML.load_file(File.join(PACKAGE_ROOT_DIR, "dependencies.yml"))
671
727
 
@@ -674,7 +730,7 @@ else
674
730
  if cross_build_p || windows?
675
731
  zlib_recipe = process_recipe("zlib", dependencies["zlib"]["version"], static_p, cross_build_p) do |recipe|
676
732
  recipe.files = [{
677
- url: "https://zlib.net/fossils/#{recipe.name}-#{recipe.version}.tar.gz",
733
+ url: zlib_source(recipe.version),
678
734
  sha256: dependencies["zlib"]["sha256"],
679
735
  }]
680
736
  if windows?
@@ -873,9 +929,13 @@ else
873
929
  $libs = $libs.shellsplit.tap do |libs|
874
930
  [libxml2_recipe, libxslt_recipe].each do |recipe|
875
931
  libname = recipe.name[/\Alib(.+)\z/, 1]
876
- File.join(recipe.path, "bin", "#{libname}-config").tap do |config|
932
+ config_basename = "#{libname}-config"
933
+ File.join(recipe.path, "bin", config_basename).tap do |config|
877
934
  # call config scripts explicitly with 'sh' for compat with Windows
878
- $CPPFLAGS = %x(sh #{config} --cflags).strip << " " << $CPPFLAGS
935
+ cflags = %x(sh #{config} --cflags).strip
936
+ message("#{config_basename} cflags: #{cflags}\n")
937
+ $CPPFLAGS = concat_flags(cflags, $CPPFLAGS) # prepend
938
+
879
939
  %x(sh #{config} --libs).strip.shellsplit.each do |arg|
880
940
  case arg
881
941
  when /\A-L(.+)\z/
@@ -894,7 +954,7 @@ else
894
954
  end
895
955
 
896
956
  patches_string = recipe.patch_files.map { |path| File.basename(path) }.join(" ")
897
- append_cppflags(%[-DNOKOGIRI_#{recipe.name.upcase}_PATCHES="\\\"#{patches_string}\\\""])
957
+ append_cppflags(%[-DNOKOGIRI_#{recipe.name.upcase}_PATCHES="\\"#{patches_string}\\""])
898
958
 
899
959
  case libname
900
960
  when "xml2"
@@ -913,16 +973,17 @@ else
913
973
  end.shelljoin
914
974
 
915
975
  if static_p
976
+ static_archive_ld_flag = needs_darwin_linker_hack ? ["-load_hidden"] : []
916
977
  $libs = $libs.shellsplit.map do |arg|
917
978
  case arg
918
979
  when "-lxml2"
919
- File.join(libxml2_recipe.path, "lib", libflag_to_filename(arg))
980
+ static_archive_ld_flag + [File.join(libxml2_recipe.path, "lib", libflag_to_filename(arg))]
920
981
  when "-lxslt", "-lexslt"
921
- File.join(libxslt_recipe.path, "lib", libflag_to_filename(arg))
982
+ static_archive_ld_flag + [File.join(libxslt_recipe.path, "lib", libflag_to_filename(arg))]
922
983
  else
923
984
  arg
924
985
  end
925
- end.shelljoin
986
+ end.flatten.shelljoin
926
987
  end
927
988
 
928
989
  ensure_func("xmlParseDoc", "libxml/parser.h")
@@ -958,7 +1019,7 @@ libgumbo_recipe = process_recipe("libgumbo", "1.0.0-nokogiri", static_p, cross_b
958
1019
  end
959
1020
 
960
1021
  def compile
961
- cflags = concat_flags(ENV["CFLAGS"], "-fPIC", "-g")
1022
+ cflags = concat_flags(ENV["CFLAGS"], "-fPIC", "-O2", "-g")
962
1023
 
963
1024
  env = { "CC" => gcc_cmd, "CFLAGS" => cflags }
964
1025
  if config_cross_build?
@@ -978,7 +1039,7 @@ end
978
1039
  append_cppflags("-I#{File.join(libgumbo_recipe.path, "include")}")
979
1040
  $libs = $libs + " " + File.join(libgumbo_recipe.path, "lib", "libgumbo.a")
980
1041
  $LIBPATH = $LIBPATH | [File.join(libgumbo_recipe.path, "lib")]
981
- ensure_func("gumbo_parse_with_options", "gumbo.h")
1042
+ ensure_func("gumbo_parse_with_options", "nokogiri_gumbo.h")
982
1043
 
983
1044
  have_func("xmlHasFeature") || abort("xmlHasFeature() is missing.") # introduced in libxml 2.6.21
984
1045
  have_func("xmlFirstElementChild") # introduced in libxml 2.7.3
@@ -989,10 +1050,8 @@ have_func("xmlSchemaSetParserStructuredErrors") # introduced in libxml 2.6.23
989
1050
  have_func("rb_gc_location") # introduced in Ruby 2.7
990
1051
  have_func("rb_category_warning") # introduced in Ruby 3.0
991
1052
 
992
- have_func("vasprintf")
993
-
994
1053
  other_library_versions_string = OTHER_LIBRARY_VERSIONS.map { |k, v| [k, v].join(":") }.join(",")
995
- append_cppflags(%[-DNOKOGIRI_OTHER_LIBRARY_VERSIONS="\\\"#{other_library_versions_string}\\\""])
1054
+ append_cppflags(%[-DNOKOGIRI_OTHER_LIBRARY_VERSIONS="\\"#{other_library_versions_string}\\""])
996
1055
 
997
1056
  unless config_system_libraries?
998
1057
  if cross_build_p
data/ext/nokogiri/gumbo.c CHANGED
@@ -23,13 +23,13 @@
23
23
  //
24
24
  // Processing starts by calling gumbo_parse_with_options. The resulting document tree
25
25
  // is then walked, a parallel libxml2 tree is constructed, and the final document is
26
- // then wrapped using Nokogiri_wrap_xml_document. This approach reduces memory and CPU
26
+ // then wrapped using noko_xml_document_wrap. This approach reduces memory and CPU
27
27
  // requirements as Ruby objects are only built when necessary.
28
28
  //
29
29
 
30
30
  #include <nokogiri.h>
31
31
 
32
- #include "gumbo.h"
32
+ #include "nokogiri_gumbo.h"
33
33
 
34
34
  VALUE cNokogiriHtml5Document;
35
35
 
@@ -281,12 +281,12 @@ add_errors(const GumboOutput *output, VALUE rdoc, VALUE input, VALUE url)
281
281
  rb_iv_set(syntax_error, "@code", INT2NUM(1)); // XML_ERR_INTERNAL_ERROR
282
282
  rb_iv_set(syntax_error, "@level", INT2NUM(2)); // XML_ERR_ERROR
283
283
  rb_iv_set(syntax_error, "@file", url);
284
- rb_iv_set(syntax_error, "@line", INT2NUM(position.line));
284
+ rb_iv_set(syntax_error, "@line", SIZET2NUM(position.line));
285
285
  rb_iv_set(syntax_error, "@str1", str1);
286
286
  rb_iv_set(syntax_error, "@str2", Qnil);
287
287
  rb_iv_set(syntax_error, "@str3", Qnil);
288
288
  rb_iv_set(syntax_error, "@int1", INT2NUM(0));
289
- rb_iv_set(syntax_error, "@column", INT2NUM(position.column));
289
+ rb_iv_set(syntax_error, "@column", SIZET2NUM(position.column));
290
290
  rb_ary_push(rerrors, syntax_error);
291
291
  }
292
292
  rb_iv_set(rdoc, "@errors", rerrors);
@@ -297,6 +297,7 @@ typedef struct {
297
297
  GumboOutput *output;
298
298
  VALUE input;
299
299
  VALUE url_or_frag;
300
+ VALUE klass;
300
301
  xmlDocPtr doc;
301
302
  } ParseArgs;
302
303
 
@@ -321,7 +322,7 @@ static VALUE parse_continue(VALUE parse_args);
321
322
  * @!visibility protected
322
323
  */
323
324
  static VALUE
324
- parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth)
325
+ parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth, VALUE klass)
325
326
  {
326
327
  GumboOptions options = kGumboDefaultOptions;
327
328
  options.max_attributes = NUM2INT(max_attributes);
@@ -333,6 +334,7 @@ parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors
333
334
  .output = output,
334
335
  .input = input,
335
336
  .url_or_frag = url,
337
+ .klass = klass,
336
338
  .doc = NULL,
337
339
  };
338
340
 
@@ -357,7 +359,9 @@ parse_continue(VALUE parse_args)
357
359
  }
358
360
  args->doc = doc; // Make sure doc gets cleaned up if an error is thrown.
359
361
  build_tree(doc, (xmlNodePtr)doc, output->document);
360
- VALUE rdoc = Nokogiri_wrap_xml_document(cNokogiriHtml5Document, doc);
362
+ VALUE rdoc = noko_xml_document_wrap(args->klass, doc);
363
+ rb_iv_set(rdoc, "@url", args->url_or_frag);
364
+ rb_iv_set(rdoc, "@quirks_mode", INT2NUM(output->document->v.document.doc_type_quirks_mode));
361
365
  args->doc = NULL; // The Ruby runtime now owns doc so don't delete it.
362
366
  add_errors(output, rdoc, args->input, args->url_or_frag);
363
367
  return rdoc;
@@ -498,9 +502,11 @@ error:
498
502
  }
499
503
 
500
504
  // Encoding.
501
- if (RSTRING_LEN(tag_name) == 14
505
+ if (ctx_ns == GUMBO_NAMESPACE_MATHML
506
+ && RSTRING_LEN(tag_name) == 14
502
507
  && !st_strcasecmp(ctx_tag, "annotation-xml")) {
503
508
  VALUE enc = rb_funcall(ctx, rb_intern_const("[]"),
509
+ 1,
504
510
  rb_utf8_str_new_static("encoding", 8));
505
511
  if (RTEST(enc)) {
506
512
  Check_Type(enc, T_STRING);
@@ -512,8 +518,11 @@ error:
512
518
  // Quirks mode.
513
519
  VALUE doc = rb_funcall(doc_fragment, rb_intern_const("document"), 0);
514
520
  VALUE dtd = rb_funcall(doc, internal_subset, 0);
515
- if (NIL_P(dtd)) {
521
+ VALUE doc_quirks_mode = rb_iv_get(doc, "@quirks_mode");
522
+ if (NIL_P(ctx) || NIL_P(doc_quirks_mode)) {
516
523
  quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS;
524
+ } else if (NIL_P(dtd)) {
525
+ quirks_mode = GUMBO_DOCTYPE_QUIRKS;
517
526
  } else {
518
527
  VALUE dtd_name = rb_funcall(dtd, name, 0);
519
528
  VALUE pubid = rb_funcall(dtd, rb_intern_const("external_id"), 0);
@@ -560,6 +569,7 @@ fragment_continue(VALUE parse_args)
560
569
  args->doc = NULL; // The Ruby runtime owns doc so make sure we don't delete it.
561
570
  xmlNodePtr xml_frag = extract_xml_node(doc_fragment);
562
571
  build_tree(xml_doc, xml_frag, output->root);
572
+ rb_iv_set(doc_fragment, "@quirks_mode", INT2NUM(output->document->v.document.doc_type_quirks_mode));
563
573
  add_errors(output, doc_fragment, args->input, rb_utf8_str_new_static("#fragment", 9));
564
574
  return Qnil;
565
575
  }
@@ -577,7 +587,7 @@ noko_init_gumbo()
577
587
  parent = rb_intern_const("parent");
578
588
 
579
589
  // Define Nokogumbo module with parse and fragment methods.
580
- rb_define_singleton_method(mNokogiriGumbo, "parse", parse, 5);
590
+ rb_define_singleton_method(mNokogiriGumbo, "parse", parse, 6);
581
591
  rb_define_singleton_method(mNokogiriGumbo, "fragment", fragment, 6);
582
592
  }
583
593
 
@@ -146,7 +146,7 @@ rb_html_document_type(VALUE self)
146
146
  {
147
147
  htmlDocPtr doc;
148
148
  Data_Get_Struct(self, xmlDoc, doc);
149
- return INT2NUM((long)doc->type);
149
+ return INT2NUM(doc->type);
150
150
  }
151
151
 
152
152
  void
@@ -20,7 +20,7 @@ get(VALUE _, VALUE rb_entity_name)
20
20
  return Qnil;
21
21
  }
22
22
 
23
- rb_constructor_args[0] = INT2NUM((long)c_entity_desc->value);
23
+ rb_constructor_args[0] = UINT2NUM(c_entity_desc->value);
24
24
  rb_constructor_args[1] = NOKOGIRI_STR_NEW2(c_entity_desc->name);
25
25
  rb_constructor_args[2] = NOKOGIRI_STR_NEW2(c_entity_desc->desc);
26
26
 
@@ -5,13 +5,8 @@ VALUE cNokogiriHtml4SaxParserContext ;
5
5
  static void
6
6
  deallocate(xmlParserCtxtPtr ctxt)
7
7
  {
8
- NOKOGIRI_DEBUG_START(ctxt);
9
-
10
8
  ctxt->sax = NULL;
11
-
12
9
  htmlFreeParserCtxt(ctxt);
13
-
14
- NOKOGIRI_DEBUG_END(ctxt);
15
10
  }
16
11
 
17
12
  static VALUE
@@ -49,34 +49,11 @@ void noko_init_html_sax_push_parser(void);
49
49
  void noko_init_gumbo(void);
50
50
  void noko_init_test_global_handlers(void);
51
51
 
52
- static ID id_read, id_write;
53
-
54
-
55
- #ifndef HAVE_VASPRINTF
56
- /*
57
- * Thank you Geoffroy Couprie for this implementation of vasprintf!
58
- */
59
- int
60
- vasprintf(char **strp, const char *fmt, va_list ap)
61
- {
62
- /* Mingw32/64 have a broken vsnprintf implementation that fails when
63
- * using a zero-byte limit in order to retrieve the required size for malloc.
64
- * So we use a one byte buffer instead.
65
- */
66
- char tmp[1];
67
- int len = vsnprintf(tmp, 1, fmt, ap) + 1;
68
- char *res = (char *)malloc((unsigned int)len);
69
- if (res == NULL) {
70
- return -1;
71
- }
72
- *strp = res;
73
- return vsnprintf(res, (unsigned int)len, fmt, ap);
74
- }
75
- #endif
52
+ static ID id_read, id_write, id_external_encoding;
76
53
 
77
54
 
78
55
  static VALUE
79
- read_check(VALUE val)
56
+ noko_io_read_check(VALUE val)
80
57
  {
81
58
  VALUE *args = (VALUE *)val;
82
59
  return rb_funcall(args[0], id_read, 1, args[1]);
@@ -84,68 +61,71 @@ read_check(VALUE val)
84
61
 
85
62
 
86
63
  static VALUE
87
- read_failed(VALUE arg, VALUE exc)
64
+ noko_io_read_failed(VALUE arg, VALUE exc)
88
65
  {
89
66
  return Qundef;
90
67
  }
91
68
 
92
69
 
93
70
  int
94
- noko_io_read(void *ctx, char *buffer, int len)
71
+ noko_io_read(void *io, char *c_buffer, int c_buffer_len)
95
72
  {
96
- VALUE string, args[2];
97
- size_t str_len, safe_len;
73
+ VALUE rb_io = (VALUE)io;
74
+ VALUE rb_read_string, rb_args[2];
75
+ size_t n_bytes_read, safe_len;
98
76
 
99
- args[0] = (VALUE)ctx;
100
- args[1] = INT2NUM(len);
77
+ rb_args[0] = rb_io;
78
+ rb_args[1] = INT2NUM(c_buffer_len);
101
79
 
102
- string = rb_rescue(read_check, (VALUE)args, read_failed, 0);
80
+ rb_read_string = rb_rescue(noko_io_read_check, (VALUE)rb_args, noko_io_read_failed, 0);
103
81
 
104
- if (NIL_P(string)) { return 0; }
105
- if (string == Qundef) { return -1; }
106
- if (TYPE(string) != T_STRING) { return -1; }
82
+ if (NIL_P(rb_read_string)) { return 0; }
83
+ if (rb_read_string == Qundef) { return -1; }
84
+ if (TYPE(rb_read_string) != T_STRING) { return -1; }
107
85
 
108
- str_len = (size_t)RSTRING_LEN(string);
109
- safe_len = str_len > (size_t)len ? (size_t)len : str_len;
110
- memcpy(buffer, StringValuePtr(string), safe_len);
86
+ n_bytes_read = (size_t)RSTRING_LEN(rb_read_string);
87
+ safe_len = (n_bytes_read > (size_t)c_buffer_len) ? (size_t)c_buffer_len : n_bytes_read;
88
+ memcpy(c_buffer, StringValuePtr(rb_read_string), safe_len);
111
89
 
112
90
  return (int)safe_len;
113
91
  }
114
92
 
115
93
 
116
94
  static VALUE
117
- write_check(VALUE val)
95
+ noko_io_write_check(VALUE rb_args)
118
96
  {
119
- VALUE *args = (VALUE *)val;
120
- return rb_funcall(args[0], id_write, 1, args[1]);
97
+ VALUE rb_io = ((VALUE *)rb_args)[0];
98
+ VALUE rb_output = ((VALUE *)rb_args)[1];
99
+ return rb_funcall(rb_io, id_write, 1, rb_output);
121
100
  }
122
101
 
123
102
 
124
103
  static VALUE
125
- write_failed(VALUE arg, VALUE exc)
104
+ noko_io_write_failed(VALUE arg, VALUE exc)
126
105
  {
127
106
  return Qundef;
128
107
  }
129
108
 
130
109
 
131
110
  int
132
- noko_io_write(void *ctx, char *buffer, int len)
111
+ noko_io_write(void *io, char *c_buffer, int c_buffer_len)
133
112
  {
134
- VALUE args[2], size;
135
-
136
- args[0] = (VALUE)ctx;
137
- args[1] = rb_str_new(buffer, (long)len);
113
+ VALUE rb_args[2], rb_n_bytes_written;
114
+ VALUE rb_io = (VALUE)io;
115
+ rb_encoding *io_encoding = rb_to_encoding(rb_funcall(rb_io, id_external_encoding, 0));
138
116
 
139
- size = rb_rescue(write_check, (VALUE)args, write_failed, 0);
117
+ rb_args[0] = rb_io;
118
+ rb_args[1] = rb_enc_str_new(c_buffer, (long)c_buffer_len, io_encoding);
140
119
 
141
- if (size == Qundef) { return -1; }
120
+ rb_n_bytes_written = rb_rescue(noko_io_write_check, (VALUE)rb_args, noko_io_write_failed, 0);
121
+ if (rb_n_bytes_written == Qundef) { return -1; }
142
122
 
143
- return NUM2INT(size);
123
+ return NUM2INT(rb_n_bytes_written);
144
124
  }
145
125
 
146
126
 
147
127
  int
148
- noko_io_close(void *ctx)
128
+ noko_io_close(void *io)
149
129
  {
150
130
  return 0;
151
131
  }
@@ -275,4 +255,5 @@ Init_nokogiri()
275
255
 
276
256
  id_read = rb_intern("read");
277
257
  id_write = rb_intern("write");
258
+ id_external_encoding = rb_intern("external_encoding");
278
259
  }