nokogiri 1.10.9 → 1.18.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (230)
  1. checksums.yaml +4 -4
  2. data/Gemfile +38 -0
  3. data/LICENSE-DEPENDENCIES.md +1632 -1022
  4. data/LICENSE.md +1 -1
  5. data/README.md +190 -95
  6. data/bin/nokogiri +63 -50
  7. data/dependencies.yml +34 -66
  8. data/ext/nokogiri/depend +38 -358
  9. data/ext/nokogiri/extconf.rb +909 -422
  10. data/ext/nokogiri/gumbo.c +610 -0
  11. data/ext/nokogiri/html4_document.c +171 -0
  12. data/ext/nokogiri/html4_element_description.c +299 -0
  13. data/ext/nokogiri/html4_entity_lookup.c +37 -0
  14. data/ext/nokogiri/html4_sax_parser.c +40 -0
  15. data/ext/nokogiri/html4_sax_parser_context.c +98 -0
  16. data/ext/nokogiri/html4_sax_push_parser.c +96 -0
  17. data/ext/nokogiri/libxml2_polyfill.c +114 -0
  18. data/ext/nokogiri/nokogiri.c +258 -105
  19. data/ext/nokogiri/nokogiri.h +207 -90
  20. data/ext/nokogiri/test_global_handlers.c +40 -0
  21. data/ext/nokogiri/xml_attr.c +18 -18
  22. data/ext/nokogiri/xml_attribute_decl.c +22 -22
  23. data/ext/nokogiri/xml_cdata.c +33 -33
  24. data/ext/nokogiri/xml_comment.c +19 -31
  25. data/ext/nokogiri/xml_document.c +499 -323
  26. data/ext/nokogiri/xml_document_fragment.c +17 -36
  27. data/ext/nokogiri/xml_dtd.c +65 -59
  28. data/ext/nokogiri/xml_element_content.c +63 -55
  29. data/ext/nokogiri/xml_element_decl.c +31 -31
  30. data/ext/nokogiri/xml_encoding_handler.c +54 -21
  31. data/ext/nokogiri/xml_entity_decl.c +37 -35
  32. data/ext/nokogiri/xml_entity_reference.c +17 -19
  33. data/ext/nokogiri/xml_namespace.c +131 -61
  34. data/ext/nokogiri/xml_node.c +1429 -723
  35. data/ext/nokogiri/xml_node_set.c +257 -225
  36. data/ext/nokogiri/xml_processing_instruction.c +18 -20
  37. data/ext/nokogiri/xml_reader.c +340 -231
  38. data/ext/nokogiri/xml_relax_ng.c +87 -99
  39. data/ext/nokogiri/xml_sax_parser.c +269 -176
  40. data/ext/nokogiri/xml_sax_parser_context.c +286 -152
  41. data/ext/nokogiri/xml_sax_push_parser.c +111 -64
  42. data/ext/nokogiri/xml_schema.c +132 -140
  43. data/ext/nokogiri/xml_syntax_error.c +52 -23
  44. data/ext/nokogiri/xml_text.c +37 -30
  45. data/ext/nokogiri/xml_xpath_context.c +373 -185
  46. data/ext/nokogiri/xslt_stylesheet.c +342 -191
  47. data/gumbo-parser/CHANGES.md +63 -0
  48. data/gumbo-parser/Makefile +129 -0
  49. data/gumbo-parser/THANKS +27 -0
  50. data/gumbo-parser/src/Makefile +34 -0
  51. data/gumbo-parser/src/README.md +41 -0
  52. data/gumbo-parser/src/ascii.c +75 -0
  53. data/gumbo-parser/src/ascii.h +115 -0
  54. data/gumbo-parser/src/attribute.c +42 -0
  55. data/gumbo-parser/src/attribute.h +17 -0
  56. data/gumbo-parser/src/char_ref.c +22225 -0
  57. data/gumbo-parser/src/char_ref.h +29 -0
  58. data/gumbo-parser/src/char_ref.rl +2154 -0
  59. data/gumbo-parser/src/error.c +658 -0
  60. data/gumbo-parser/src/error.h +152 -0
  61. data/gumbo-parser/src/foreign_attrs.c +103 -0
  62. data/gumbo-parser/src/foreign_attrs.gperf +27 -0
  63. data/gumbo-parser/src/insertion_mode.h +33 -0
  64. data/gumbo-parser/src/macros.h +91 -0
  65. data/gumbo-parser/src/nokogiri_gumbo.h +953 -0
  66. data/gumbo-parser/src/parser.c +4932 -0
  67. data/gumbo-parser/src/parser.h +41 -0
  68. data/gumbo-parser/src/replacement.h +33 -0
  69. data/gumbo-parser/src/string_buffer.c +103 -0
  70. data/gumbo-parser/src/string_buffer.h +68 -0
  71. data/gumbo-parser/src/string_piece.c +48 -0
  72. data/gumbo-parser/src/svg_attrs.c +174 -0
  73. data/gumbo-parser/src/svg_attrs.gperf +77 -0
  74. data/gumbo-parser/src/svg_tags.c +137 -0
  75. data/gumbo-parser/src/svg_tags.gperf +55 -0
  76. data/gumbo-parser/src/tag.c +223 -0
  77. data/gumbo-parser/src/tag_lookup.c +382 -0
  78. data/gumbo-parser/src/tag_lookup.gperf +170 -0
  79. data/gumbo-parser/src/tag_lookup.h +13 -0
  80. data/gumbo-parser/src/token_buffer.c +79 -0
  81. data/gumbo-parser/src/token_buffer.h +71 -0
  82. data/gumbo-parser/src/token_type.h +17 -0
  83. data/gumbo-parser/src/tokenizer.c +3464 -0
  84. data/gumbo-parser/src/tokenizer.h +112 -0
  85. data/gumbo-parser/src/tokenizer_states.h +339 -0
  86. data/gumbo-parser/src/utf8.c +245 -0
  87. data/gumbo-parser/src/utf8.h +164 -0
  88. data/gumbo-parser/src/util.c +66 -0
  89. data/gumbo-parser/src/util.h +34 -0
  90. data/gumbo-parser/src/vector.c +111 -0
  91. data/gumbo-parser/src/vector.h +45 -0
  92. data/lib/nokogiri/class_resolver.rb +67 -0
  93. data/lib/nokogiri/css/node.rb +14 -8
  94. data/lib/nokogiri/css/parser.rb +399 -377
  95. data/lib/nokogiri/css/parser.y +250 -245
  96. data/lib/nokogiri/css/parser_extras.rb +16 -71
  97. data/lib/nokogiri/css/selector_cache.rb +38 -0
  98. data/lib/nokogiri/css/syntax_error.rb +3 -1
  99. data/lib/nokogiri/css/tokenizer.rb +7 -5
  100. data/lib/nokogiri/css/tokenizer.rex +11 -9
  101. data/lib/nokogiri/css/xpath_visitor.rb +242 -96
  102. data/lib/nokogiri/css.rb +122 -17
  103. data/lib/nokogiri/decorators/slop.rb +11 -11
  104. data/lib/nokogiri/encoding_handler.rb +57 -0
  105. data/lib/nokogiri/extension.rb +32 -0
  106. data/lib/nokogiri/gumbo.rb +15 -0
  107. data/lib/nokogiri/html.rb +38 -27
  108. data/lib/nokogiri/{html → html4}/builder.rb +4 -2
  109. data/lib/nokogiri/html4/document.rb +235 -0
  110. data/lib/nokogiri/html4/document_fragment.rb +166 -0
  111. data/lib/nokogiri/{html → html4}/element_description.rb +3 -1
  112. data/lib/nokogiri/html4/element_description_defaults.rb +2040 -0
  113. data/lib/nokogiri/html4/encoding_reader.rb +121 -0
  114. data/lib/nokogiri/{html → html4}/entity_lookup.rb +4 -2
  115. data/lib/nokogiri/html4/sax/parser.rb +48 -0
  116. data/lib/nokogiri/html4/sax/parser_context.rb +15 -0
  117. data/lib/nokogiri/{html → html4}/sax/push_parser.rb +12 -11
  118. data/lib/nokogiri/html4.rb +42 -0
  119. data/lib/nokogiri/html5/builder.rb +40 -0
  120. data/lib/nokogiri/html5/document.rb +199 -0
  121. data/lib/nokogiri/html5/document_fragment.rb +200 -0
  122. data/lib/nokogiri/html5/node.rb +103 -0
  123. data/lib/nokogiri/html5.rb +368 -0
  124. data/lib/nokogiri/jruby/dependencies.rb +3 -0
  125. data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
  126. data/lib/nokogiri/syntax_error.rb +2 -0
  127. data/lib/nokogiri/version/constant.rb +6 -0
  128. data/lib/nokogiri/version/info.rb +224 -0
  129. data/lib/nokogiri/version.rb +3 -108
  130. data/lib/nokogiri/xml/attr.rb +55 -3
  131. data/lib/nokogiri/xml/attribute_decl.rb +6 -2
  132. data/lib/nokogiri/xml/builder.rb +83 -35
  133. data/lib/nokogiri/xml/cdata.rb +3 -1
  134. data/lib/nokogiri/xml/character_data.rb +2 -0
  135. data/lib/nokogiri/xml/document.rb +359 -130
  136. data/lib/nokogiri/xml/document_fragment.rb +170 -54
  137. data/lib/nokogiri/xml/dtd.rb +4 -2
  138. data/lib/nokogiri/xml/element_content.rb +12 -2
  139. data/lib/nokogiri/xml/element_decl.rb +6 -2
  140. data/lib/nokogiri/xml/entity_decl.rb +7 -3
  141. data/lib/nokogiri/xml/entity_reference.rb +2 -0
  142. data/lib/nokogiri/xml/namespace.rb +44 -0
  143. data/lib/nokogiri/xml/node/save_options.rb +23 -8
  144. data/lib/nokogiri/xml/node.rb +1168 -420
  145. data/lib/nokogiri/xml/node_set.rb +145 -67
  146. data/lib/nokogiri/xml/notation.rb +13 -0
  147. data/lib/nokogiri/xml/parse_options.rb +145 -52
  148. data/lib/nokogiri/xml/pp/character_data.rb +9 -6
  149. data/lib/nokogiri/xml/pp/node.rb +47 -30
  150. data/lib/nokogiri/xml/pp.rb +4 -2
  151. data/lib/nokogiri/xml/processing_instruction.rb +4 -1
  152. data/lib/nokogiri/xml/reader.rb +68 -41
  153. data/lib/nokogiri/xml/relax_ng.rb +60 -17
  154. data/lib/nokogiri/xml/sax/document.rb +198 -111
  155. data/lib/nokogiri/xml/sax/parser.rb +144 -67
  156. data/lib/nokogiri/xml/sax/parser_context.rb +119 -6
  157. data/lib/nokogiri/xml/sax/push_parser.rb +9 -5
  158. data/lib/nokogiri/xml/sax.rb +54 -4
  159. data/lib/nokogiri/xml/schema.rb +116 -39
  160. data/lib/nokogiri/xml/searchable.rb +139 -95
  161. data/lib/nokogiri/xml/syntax_error.rb +29 -5
  162. data/lib/nokogiri/xml/text.rb +2 -0
  163. data/lib/nokogiri/xml/xpath/syntax_error.rb +4 -2
  164. data/lib/nokogiri/xml/xpath.rb +15 -4
  165. data/lib/nokogiri/xml/xpath_context.rb +15 -4
  166. data/lib/nokogiri/xml.rb +45 -55
  167. data/lib/nokogiri/xslt/stylesheet.rb +32 -8
  168. data/lib/nokogiri/xslt.rb +103 -30
  169. data/lib/nokogiri.rb +59 -75
  170. data/lib/xsd/xmlparser/nokogiri.rb +32 -29
  171. data/patches/libxml2/0009-allow-wildcard-namespaces.patch +77 -0
  172. data/patches/libxml2/0010-update-config.guess-and-config.sub-for-libxml2.patch +224 -0
  173. data/patches/libxml2/0011-rip-out-libxml2-s-libc_single_threaded-support.patch +30 -0
  174. data/patches/libxml2/0019-xpath-Use-separate-static-hash-table-for-standard-fu.patch +244 -0
  175. data/patches/libxslt/0001-update-config.guess-and-config.sub-for-libxslt.patch +224 -0
  176. data/ports/archives/libxml2-2.13.6.tar.xz +0 -0
  177. data/ports/archives/libxslt-1.1.42.tar.xz +0 -0
  178. metadata +123 -295
  179. data/ext/nokogiri/html_document.c +0 -170
  180. data/ext/nokogiri/html_document.h +0 -10
  181. data/ext/nokogiri/html_element_description.c +0 -279
  182. data/ext/nokogiri/html_element_description.h +0 -10
  183. data/ext/nokogiri/html_entity_lookup.c +0 -32
  184. data/ext/nokogiri/html_entity_lookup.h +0 -8
  185. data/ext/nokogiri/html_sax_parser_context.c +0 -116
  186. data/ext/nokogiri/html_sax_parser_context.h +0 -11
  187. data/ext/nokogiri/html_sax_push_parser.c +0 -87
  188. data/ext/nokogiri/html_sax_push_parser.h +0 -9
  189. data/ext/nokogiri/xml_attr.h +0 -9
  190. data/ext/nokogiri/xml_attribute_decl.h +0 -9
  191. data/ext/nokogiri/xml_cdata.h +0 -9
  192. data/ext/nokogiri/xml_comment.h +0 -9
  193. data/ext/nokogiri/xml_document.h +0 -23
  194. data/ext/nokogiri/xml_document_fragment.h +0 -10
  195. data/ext/nokogiri/xml_dtd.h +0 -10
  196. data/ext/nokogiri/xml_element_content.h +0 -10
  197. data/ext/nokogiri/xml_element_decl.h +0 -9
  198. data/ext/nokogiri/xml_encoding_handler.h +0 -8
  199. data/ext/nokogiri/xml_entity_decl.h +0 -10
  200. data/ext/nokogiri/xml_entity_reference.h +0 -9
  201. data/ext/nokogiri/xml_io.c +0 -61
  202. data/ext/nokogiri/xml_io.h +0 -11
  203. data/ext/nokogiri/xml_libxml2_hacks.c +0 -112
  204. data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
  205. data/ext/nokogiri/xml_namespace.h +0 -14
  206. data/ext/nokogiri/xml_node.h +0 -13
  207. data/ext/nokogiri/xml_node_set.h +0 -12
  208. data/ext/nokogiri/xml_processing_instruction.h +0 -9
  209. data/ext/nokogiri/xml_reader.h +0 -10
  210. data/ext/nokogiri/xml_relax_ng.h +0 -9
  211. data/ext/nokogiri/xml_sax_parser.h +0 -39
  212. data/ext/nokogiri/xml_sax_parser_context.h +0 -10
  213. data/ext/nokogiri/xml_sax_push_parser.h +0 -9
  214. data/ext/nokogiri/xml_schema.h +0 -9
  215. data/ext/nokogiri/xml_syntax_error.h +0 -13
  216. data/ext/nokogiri/xml_text.h +0 -9
  217. data/ext/nokogiri/xml_xpath_context.h +0 -10
  218. data/ext/nokogiri/xslt_stylesheet.h +0 -14
  219. data/lib/nokogiri/html/document.rb +0 -335
  220. data/lib/nokogiri/html/document_fragment.rb +0 -49
  221. data/lib/nokogiri/html/element_description_defaults.rb +0 -671
  222. data/lib/nokogiri/html/sax/parser.rb +0 -62
  223. data/lib/nokogiri/html/sax/parser_context.rb +0 -16
  224. data/patches/libxml2/0001-Revert-Do-not-URI-escape-in-server-side-includes.patch +0 -78
  225. data/patches/libxml2/0004-libxml2.la-is-in-top_builddir.patch +0 -25
  226. data/patches/libxml2/0005-Fix-infinite-loop-in-xmlStringLenDecodeEntities.patch +0 -32
  227. data/ports/archives/libxml2-2.9.10.tar.gz +0 -0
  228. data/ports/archives/libxslt-1.1.34.tar.gz +0 -0
  229. data/patches/libxml2/{0002-Remove-script-macro-support.patch → 0001-Remove-script-macro-support.patch} +0 -0
  230. data/patches/libxml2/{0003-Update-entities-to-remove-handling-of-ssi.patch → 0002-Update-entities-to-remove-handling-of-ssi.patch} +0 -0
@@ -1,98 +1,166 @@
1
- #include <xml_sax_parser_context.h>
1
+ #include <nokogiri.h>
2
2
 
3
3
  VALUE cNokogiriXmlSaxParserContext ;
4
4
 
5
- static void deallocate(xmlParserCtxtPtr ctxt)
6
- {
7
- NOKOGIRI_DEBUG_START(handler);
5
+ static ID id_read;
8
6
 
7
+ static void
8
+ xml_sax_parser_context_type_free(void *data)
9
+ {
10
+ xmlParserCtxtPtr ctxt = data;
9
11
  ctxt->sax = NULL;
10
-
11
- xmlFreeParserCtxt(ctxt);
12
-
13
- NOKOGIRI_DEBUG_END(handler);
12
+ if (ctxt->myDoc) {
13
+ xmlFreeDoc(ctxt->myDoc);
14
+ }
15
+ if (ctxt) {
16
+ xmlFreeParserCtxt(ctxt);
17
+ }
14
18
  }
15
19
 
16
20
  /*
17
- * call-seq:
18
- * parse_io(io, encoding)
19
- *
20
- * Parse +io+ object with +encoding+
21
+ * note that htmlParserCtxtPtr == xmlParserCtxtPtr and xmlFreeParserCtxt() == htmlFreeParserCtxt()
22
+ * so we use this type for both XML::SAX::ParserContext and HTML::SAX::ParserContext
21
23
  */
22
- static VALUE
23
- parse_io(VALUE klass, VALUE io, VALUE encoding)
24
+ static const rb_data_type_t xml_sax_parser_context_type = {
25
+ .wrap_struct_name = "xmlParserCtxt",
26
+ .function = {
27
+ .dfree = xml_sax_parser_context_type_free,
28
+ },
29
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED,
30
+ };
31
+
32
+ xmlParserCtxtPtr
33
+ noko_xml_sax_parser_context_unwrap(VALUE rb_context)
24
34
  {
25
- xmlParserCtxtPtr ctxt;
26
- xmlCharEncoding enc = (xmlCharEncoding)NUM2INT(encoding);
27
-
28
- ctxt = xmlCreateIOParserCtxt(NULL, NULL,
29
- (xmlInputReadCallback)io_read_callback,
30
- (xmlInputCloseCallback)io_close_callback,
31
- (void *)io, enc);
32
- if (ctxt->sax) {
33
- xmlFree(ctxt->sax);
34
- ctxt->sax = NULL;
35
- }
36
-
37
- return Data_Wrap_Struct(klass, NULL, deallocate, ctxt);
35
+ xmlParserCtxtPtr c_context;
36
+ TypedData_Get_Struct(rb_context, xmlParserCtxt, &xml_sax_parser_context_type, c_context);
37
+ return c_context;
38
38
  }
39
39
 
40
- /*
41
- * call-seq:
42
- * parse_file(filename)
43
- *
44
- * Parse file given +filename+
45
- */
46
- static VALUE parse_file(VALUE klass, VALUE filename)
40
+ VALUE
41
+ noko_xml_sax_parser_context_wrap(VALUE klass, xmlParserCtxtPtr c_context)
47
42
  {
48
- xmlParserCtxtPtr ctxt = xmlCreateFileParserCtxt(StringValueCStr(filename));
49
- return Data_Wrap_Struct(klass, NULL, deallocate, ctxt);
43
+ return TypedData_Wrap_Struct(klass, &xml_sax_parser_context_type, c_context);
50
44
  }
51
45
 
52
- /*
53
- * call-seq:
54
- * parse_memory(data)
55
- *
56
- * Parse the XML stored in memory in +data+
57
- */
58
- static VALUE
59
- parse_memory(VALUE klass, VALUE data)
46
+ void
47
+ noko_xml_sax_parser_context_set_encoding(xmlParserCtxtPtr c_context, VALUE rb_encoding)
60
48
  {
61
- xmlParserCtxtPtr ctxt;
62
-
63
- if (NIL_P(data))
64
- rb_raise(rb_eArgError, "data cannot be nil");
65
- if (!(int)RSTRING_LEN(data))
66
- rb_raise(rb_eRuntimeError, "data cannot be empty");
67
-
68
- ctxt = xmlCreateMemoryParserCtxt(StringValuePtr(data),
69
- (int)RSTRING_LEN(data));
70
- if (ctxt->sax) {
71
- xmlFree(ctxt->sax);
72
- ctxt->sax = NULL;
49
+ if (!NIL_P(rb_encoding)) {
50
+ VALUE rb_encoding_name = rb_funcall(rb_encoding, rb_intern("name"), 0);
51
+
52
+ char *encoding_name = StringValueCStr(rb_encoding_name);
53
+ if (encoding_name) {
54
+ libxmlStructuredErrorHandlerState handler_state;
55
+ VALUE rb_errors = rb_ary_new();
56
+
57
+ noko__structured_error_func_save_and_set(&handler_state, (void *)rb_errors, noko__error_array_pusher);
58
+
59
+ int result = xmlSwitchEncodingName(c_context, encoding_name);
60
+
61
+ noko__structured_error_func_restore(&handler_state);
62
+
63
+ if (result != 0) {
64
+ xmlFreeParserCtxt(c_context);
65
+
66
+ VALUE exception = rb_funcall(cNokogiriXmlSyntaxError, rb_intern("aggregate"), 1, rb_errors);
67
+ if (!NIL_P(exception)) {
68
+ rb_exc_raise(exception);
69
+ } else {
70
+ rb_raise(rb_eRuntimeError, "could not set encoding");
71
+ }
72
+ }
73
73
  }
74
+ }
75
+ }
74
76
 
75
- return Data_Wrap_Struct(klass, NULL, deallocate, ctxt);
77
+ /* :nodoc: */
78
+ static VALUE
79
+ noko_xml_sax_parser_context_s_native_io(VALUE rb_class, VALUE rb_io, VALUE rb_encoding)
80
+ {
81
+ if (!rb_respond_to(rb_io, id_read)) {
82
+ rb_raise(rb_eTypeError, "argument expected to respond to :read");
83
+ }
84
+
85
+ if (!NIL_P(rb_encoding) && !rb_obj_is_kind_of(rb_encoding, rb_cEncoding)) {
86
+ rb_raise(rb_eTypeError, "argument must be an Encoding object");
87
+ }
88
+
89
+ xmlParserCtxtPtr c_context =
90
+ xmlCreateIOParserCtxt(NULL, NULL,
91
+ (xmlInputReadCallback)noko_io_read,
92
+ (xmlInputCloseCallback)noko_io_close,
93
+ (void *)rb_io, XML_CHAR_ENCODING_NONE);
94
+ if (!c_context) {
95
+ rb_raise(rb_eRuntimeError, "failed to create xml sax parser context");
96
+ }
97
+
98
+ noko_xml_sax_parser_context_set_encoding(c_context, rb_encoding);
99
+
100
+ if (c_context->sax) {
101
+ xmlFree(c_context->sax);
102
+ c_context->sax = NULL;
103
+ }
104
+
105
+ VALUE rb_context = noko_xml_sax_parser_context_wrap(rb_class, c_context);
106
+ rb_iv_set(rb_context, "@input", rb_io);
107
+
108
+ return rb_context;
76
109
  }
77
110
 
111
+ /* :nodoc: */
78
112
  static VALUE
79
- parse_doc(VALUE ctxt_val)
113
+ noko_xml_sax_parser_context_s_native_file(VALUE rb_class, VALUE rb_path, VALUE rb_encoding)
80
114
  {
81
- xmlParserCtxtPtr ctxt = (xmlParserCtxtPtr)ctxt_val;
82
- xmlParseDocument(ctxt);
83
- return Qnil;
115
+ if (!NIL_P(rb_encoding) && !rb_obj_is_kind_of(rb_encoding, rb_cEncoding)) {
116
+ rb_raise(rb_eTypeError, "argument must be an Encoding object");
117
+ }
118
+
119
+ xmlParserCtxtPtr c_context = xmlCreateFileParserCtxt(StringValueCStr(rb_path));
120
+ if (!c_context) {
121
+ rb_raise(rb_eRuntimeError, "failed to create xml sax parser context");
122
+ }
123
+
124
+ noko_xml_sax_parser_context_set_encoding(c_context, rb_encoding);
125
+
126
+ if (c_context->sax) {
127
+ xmlFree(c_context->sax);
128
+ c_context->sax = NULL;
129
+ }
130
+
131
+ return noko_xml_sax_parser_context_wrap(rb_class, c_context);
84
132
  }
85
133
 
134
+ /* :nodoc: */
86
135
  static VALUE
87
- parse_doc_finalize(VALUE ctxt_val)
136
+ noko_xml_sax_parser_context_s_native_memory(VALUE rb_class, VALUE rb_input, VALUE rb_encoding)
88
137
  {
89
- xmlParserCtxtPtr ctxt = (xmlParserCtxtPtr)ctxt_val;
138
+ Check_Type(rb_input, T_STRING);
139
+ if (!(int)RSTRING_LEN(rb_input)) {
140
+ rb_raise(rb_eRuntimeError, "input string cannot be empty");
141
+ }
142
+
143
+ if (!NIL_P(rb_encoding) && !rb_obj_is_kind_of(rb_encoding, rb_cEncoding)) {
144
+ rb_raise(rb_eTypeError, "argument must be an Encoding object");
145
+ }
90
146
 
91
- if (NULL != ctxt->myDoc)
92
- xmlFreeDoc(ctxt->myDoc);
147
+ xmlParserCtxtPtr c_context =
148
+ xmlCreateMemoryParserCtxt(StringValuePtr(rb_input), (int)RSTRING_LEN(rb_input));
149
+ if (!c_context) {
150
+ rb_raise(rb_eRuntimeError, "failed to create xml sax parser context");
151
+ }
93
152
 
94
- NOKOGIRI_SAX_TUPLE_DESTROY(ctxt->userData);
95
- return Qnil;
153
+ noko_xml_sax_parser_context_set_encoding(c_context, rb_encoding);
154
+
155
+ if (c_context->sax) {
156
+ xmlFree(c_context->sax);
157
+ c_context->sax = NULL;
158
+ }
159
+
160
+ VALUE rb_context = noko_xml_sax_parser_context_wrap(rb_class, c_context);
161
+ rb_iv_set(rb_context, "@input", rb_input);
162
+
163
+ return rb_context;
96
164
  }
97
165
 
98
166
  /*
@@ -100,84 +168,118 @@ parse_doc_finalize(VALUE ctxt_val)
100
168
  * parse_with(sax_handler)
101
169
  *
102
170
  * Use +sax_handler+ and parse the current document
171
+ *
172
+ * 💡 Calling this method directly is discouraged. Use Nokogiri::XML::SAX::Parser methods which are
173
+ * more convenient for most use cases.
103
174
  */
104
175
  static VALUE
105
- parse_with(VALUE self, VALUE sax_handler)
176
+ noko_xml_sax_parser_context__parse_with(VALUE rb_context, VALUE rb_sax_parser)
106
177
  {
107
- xmlParserCtxtPtr ctxt;
108
- xmlSAXHandlerPtr sax;
178
+ xmlParserCtxtPtr c_context;
179
+ xmlSAXHandlerPtr sax;
109
180
 
110
- if (!rb_obj_is_kind_of(sax_handler, cNokogiriXmlSaxParser))
111
- rb_raise(rb_eArgError, "argument must be a Nokogiri::XML::SAX::Parser");
181
+ if (!rb_obj_is_kind_of(rb_sax_parser, cNokogiriXmlSaxParser)) {
182
+ rb_raise(rb_eArgError, "argument must be a Nokogiri::XML::SAX::Parser");
183
+ }
112
184
 
113
- Data_Get_Struct(self, xmlParserCtxt, ctxt);
114
- Data_Get_Struct(sax_handler, xmlSAXHandler, sax);
185
+ c_context = noko_xml_sax_parser_context_unwrap(rb_context);
186
+ sax = noko_xml_sax_parser_unwrap(rb_sax_parser);
115
187
 
116
- /* Free the sax handler since we'll assign our own */
117
- if (ctxt->sax && ctxt->sax != (xmlSAXHandlerPtr)&xmlDefaultSAXHandler)
118
- xmlFree(ctxt->sax);
188
+ c_context->sax = sax;
189
+ c_context->userData = c_context; /* so we can use libxml2/SAX2.c handlers if we want to */
190
+ c_context->_private = (void *)rb_sax_parser;
119
191
 
120
- ctxt->sax = sax;
121
- ctxt->userData = (void *)NOKOGIRI_SAX_TUPLE_NEW(ctxt, sax_handler);
192
+ xmlSetStructuredErrorFunc(NULL, NULL);
122
193
 
123
- rb_ensure(parse_doc, (VALUE)ctxt, parse_doc_finalize, (VALUE)ctxt);
194
+ /* although we're calling back into Ruby here, we don't need to worry about exceptions, because we
195
+ * don't have any cleanup to do. The only memory we need to free is handled by
196
+ * xml_sax_parser_context_type_free */
197
+ xmlParseDocument(c_context);
124
198
 
125
- return Qnil;
199
+ return Qnil;
126
200
  }
127
201
 
128
202
  /*
129
203
  * call-seq:
130
- * replace_entities=(boolean)
204
+ * replace_entities=(value)
205
+ *
206
+ * See Document@Entity+Handling for an explanation of the behavior controlled by this flag.
207
+ *
208
+ * [Parameters]
209
+ * - +value+ (Boolean) Whether external parsed entities will be resolved.
210
+ *
211
+ * ⚠ <b>It is UNSAFE to set this option to +true+</b> when parsing untrusted documents. The option
212
+ * defaults to +false+ for this reason.
131
213
  *
132
- * Should this parser replace entities? &amp; will get converted to '&' if
133
- * set to true
214
+ * This option is perhaps misnamed by the libxml2 author, since it controls resolution and not
215
+ * replacement.
216
+ *
217
+ * [Example]
218
+ * Because this class is generally not instantiated directly, you would typically set this option
219
+ * via the block argument to Nokogiri::XML::SAX::Parser.parse et al:
220
+ *
221
+ * parser = Nokogiri::XML::SAX::Parser.new(document_handler)
222
+ * parser.parse(xml) do |ctx|
223
+ * ctx.replace_entities = true # this is UNSAFE for untrusted documents!
224
+ * end
134
225
  */
135
- static VALUE set_replace_entities(VALUE self, VALUE value)
226
+ static VALUE
227
+ noko_xml_sax_parser_context__replace_entities_set(VALUE rb_context, VALUE rb_value)
136
228
  {
137
- xmlParserCtxtPtr ctxt;
138
- Data_Get_Struct(self, xmlParserCtxt, ctxt);
229
+ int error;
230
+ xmlParserCtxtPtr ctxt = noko_xml_sax_parser_context_unwrap(rb_context);
231
+
232
+ if (RB_TEST(rb_value)) {
233
+ error = xmlCtxtSetOptions(ctxt, xmlCtxtGetOptions(ctxt) | XML_PARSE_NOENT);
234
+ } else {
235
+ error = xmlCtxtSetOptions(ctxt, xmlCtxtGetOptions(ctxt) & ~XML_PARSE_NOENT);
236
+ }
139
237
 
140
- if(Qfalse == value)
141
- ctxt->replaceEntities = 0;
142
- else
143
- ctxt->replaceEntities = 1;
238
+ if (error) {
239
+ rb_raise(rb_eRuntimeError, "failed to set parser context options (%x)", error);
240
+ }
144
241
 
145
- return value;
242
+ return rb_value;
146
243
  }
147
244
 
148
245
  /*
149
246
  * call-seq:
150
- * replace_entities
247
+ * replace_entities
151
248
  *
152
- * Should this parser replace entities? &amp; will get converted to '&' if
153
- * set to true
249
+ * See Document@Entity+Handling for an explanation of the behavior controlled by this flag.
250
+ *
251
+ * [Returns] (Boolean) Value of the parse option. (Default +false+)
252
+ *
253
+ * This option is perhaps misnamed by the libxml2 author, since it controls resolution and not
254
+ * replacement.
154
255
  */
155
- static VALUE get_replace_entities(VALUE self)
256
+ static VALUE
257
+ noko_xml_sax_parser_context__replace_entities_get(VALUE rb_context)
156
258
  {
157
- xmlParserCtxtPtr ctxt;
158
- Data_Get_Struct(self, xmlParserCtxt, ctxt);
259
+ xmlParserCtxtPtr ctxt = noko_xml_sax_parser_context_unwrap(rb_context);
159
260
 
160
- if(0 == ctxt->replaceEntities)
161
- return Qfalse;
162
- else
261
+ if (xmlCtxtGetOptions(ctxt) & XML_PARSE_NOENT) {
163
262
  return Qtrue;
263
+ } else {
264
+ return Qfalse;
265
+ }
164
266
  }
165
267
 
166
268
  /*
167
269
  * call-seq: line
168
270
  *
169
- * Get the current line the parser context is processing.
271
+ * [Returns] (Integer) the line number of the line being currently parsed.
170
272
  */
171
- static VALUE line(VALUE self)
273
+ static VALUE
274
+ noko_xml_sax_parser_context__line(VALUE rb_context)
172
275
  {
173
- xmlParserCtxtPtr ctxt;
174
276
  xmlParserInputPtr io;
175
-
176
- Data_Get_Struct(self, xmlParserCtxt, ctxt);
277
+ xmlParserCtxtPtr ctxt = noko_xml_sax_parser_context_unwrap(rb_context);
177
278
 
178
279
  io = ctxt->input;
179
- if(io)
280
+ if (io) {
180
281
  return INT2NUM(io->line);
282
+ }
181
283
 
182
284
  return Qnil;
183
285
  }
@@ -185,78 +287,110 @@ static VALUE line(VALUE self)
185
287
  /*
186
288
  * call-seq: column
187
289
  *
188
- * Get the current column the parser context is processing.
290
+ * [Returns] (Integer) the column number of the column being currently parsed.
189
291
  */
190
- static VALUE column(VALUE self)
292
+ static VALUE
293
+ noko_xml_sax_parser_context__column(VALUE rb_context)
191
294
  {
192
- xmlParserCtxtPtr ctxt;
295
+ xmlParserCtxtPtr ctxt = noko_xml_sax_parser_context_unwrap(rb_context);
193
296
  xmlParserInputPtr io;
194
297
 
195
- Data_Get_Struct(self, xmlParserCtxt, ctxt);
196
-
197
298
  io = ctxt->input;
198
- if(io)
299
+ if (io) {
199
300
  return INT2NUM(io->col);
301
+ }
200
302
 
201
303
  return Qnil;
202
304
  }
203
305
 
204
306
  /*
205
307
  * call-seq:
206
- * recovery=(boolean)
308
+ * recovery=(value)
309
+ *
310
+ * Controls whether this parser will recover from parsing errors. If set to +true+, the parser will
311
+ * invoke the SAX::Document#error callback and continue processing the file. If set to +false+, the
312
+ * parser will stop processing the file on the first parsing error.
313
+ *
314
+ * [Parameters]
315
+ * - +value+ (Boolean) Recover from parsing errors. (Default is +false+ for XML and +true+ for HTML.)
316
+ *
317
+ * [Returns] (Boolean) The passed +value+.
207
318
  *
208
- * Should this parser recover from structural errors? It will not stop processing
209
- * file on structural errors if set to true
319
+ * [Example]
320
+ * Because this class is generally not instantiated directly, you would typically set this option
321
+ * via the block argument to Nokogiri::XML::SAX::Parser.parse et al:
322
+ *
323
+ * parser = Nokogiri::XML::SAX::Parser.new(document_handler)
324
+ * parser.parse(xml) do |ctx|
325
+ * ctx.recovery = true
326
+ * end
210
327
  */
211
- static VALUE set_recovery(VALUE self, VALUE value)
328
+ static VALUE
329
+ noko_xml_sax_parser_context__recovery_set(VALUE rb_context, VALUE rb_value)
212
330
  {
213
- xmlParserCtxtPtr ctxt;
214
- Data_Get_Struct(self, xmlParserCtxt, ctxt);
331
+ int error;
332
+ xmlParserCtxtPtr ctxt = noko_xml_sax_parser_context_unwrap(rb_context);
215
333
 
216
- if(value == Qfalse)
217
- ctxt->recovery = 0;
218
- else
219
- ctxt->recovery = 1;
334
+ if (RB_TEST(rb_value)) {
335
+ error = xmlCtxtSetOptions(ctxt, xmlCtxtGetOptions(ctxt) | XML_PARSE_RECOVER);
336
+ } else {
337
+ error = xmlCtxtSetOptions(ctxt, xmlCtxtGetOptions(ctxt) & ~XML_PARSE_RECOVER);
338
+ }
220
339
 
221
- return value;
340
+ if (error) {
341
+ rb_raise(rb_eRuntimeError, "failed to set parser context options (%x)", error);
342
+ }
343
+
344
+ return rb_value;
222
345
  }
223
346
 
224
347
  /*
225
348
  * call-seq:
226
- * recovery
349
+ * recovery
350
+ *
351
+ * Inspect whether this parser will recover from parsing errors. If set to +true+, the parser will
352
+ * invoke the SAX::Document#error callback and continue processing the file. If set to +false+, the
353
+ * parser will stop processing the file on the first parsing error.
227
354
  *
228
- * Should this parser recover from structural errors? It will not stop processing
229
- * file on structural errors if set to true
355
+ * [Returns] (Boolean) Whether this parser will recover from parsing errors.
356
+ *
357
+ * Default is +false+ for XML and +true+ for HTML.
230
358
  */
231
- static VALUE get_recovery(VALUE self)
359
+ static VALUE
360
+ noko_xml_sax_parser_context__recovery_get(VALUE rb_context)
232
361
  {
233
- xmlParserCtxtPtr ctxt;
234
- Data_Get_Struct(self, xmlParserCtxt, ctxt);
362
+ xmlParserCtxtPtr ctxt = noko_xml_sax_parser_context_unwrap(rb_context);
235
363
 
236
- if(ctxt->recovery == 0)
237
- return Qfalse;
238
- else
364
+ if (xmlCtxtGetOptions(ctxt) & XML_PARSE_RECOVER) {
239
365
  return Qtrue;
366
+ } else {
367
+ return Qfalse;
368
+ }
240
369
  }
241
370
 
242
- void init_xml_sax_parser_context()
371
+ void
372
+ noko_init_xml_sax_parser_context(void)
243
373
  {
244
- VALUE nokogiri = rb_define_module("Nokogiri");
245
- VALUE xml = rb_define_module_under(nokogiri, "XML");
246
- VALUE sax = rb_define_module_under(xml, "SAX");
247
- VALUE klass = rb_define_class_under(sax, "ParserContext", rb_cObject);
248
-
249
- cNokogiriXmlSaxParserContext = klass;
250
-
251
- rb_define_singleton_method(klass, "io", parse_io, 2);
252
- rb_define_singleton_method(klass, "memory", parse_memory, 1);
253
- rb_define_singleton_method(klass, "file", parse_file, 1);
254
-
255
- rb_define_method(klass, "parse_with", parse_with, 1);
256
- rb_define_method(klass, "replace_entities=", set_replace_entities, 1);
257
- rb_define_method(klass, "replace_entities", get_replace_entities, 0);
258
- rb_define_method(klass, "recovery=", set_recovery, 1);
259
- rb_define_method(klass, "recovery", get_recovery, 0);
260
- rb_define_method(klass, "line", line, 0);
261
- rb_define_method(klass, "column", column, 0);
374
+ cNokogiriXmlSaxParserContext = rb_define_class_under(mNokogiriXmlSax, "ParserContext", rb_cObject);
375
+
376
+ rb_undef_alloc_func(cNokogiriXmlSaxParserContext);
377
+
378
+ rb_define_singleton_method(cNokogiriXmlSaxParserContext, "native_io",
379
+ noko_xml_sax_parser_context_s_native_io, 2);
380
+ rb_define_singleton_method(cNokogiriXmlSaxParserContext, "native_memory",
381
+ noko_xml_sax_parser_context_s_native_memory, 2);
382
+ rb_define_singleton_method(cNokogiriXmlSaxParserContext, "native_file",
383
+ noko_xml_sax_parser_context_s_native_file, 2);
384
+
385
+ rb_define_method(cNokogiriXmlSaxParserContext, "parse_with", noko_xml_sax_parser_context__parse_with, 1);
386
+ rb_define_method(cNokogiriXmlSaxParserContext, "replace_entities=",
387
+ noko_xml_sax_parser_context__replace_entities_set, 1);
388
+ rb_define_method(cNokogiriXmlSaxParserContext, "replace_entities",
389
+ noko_xml_sax_parser_context__replace_entities_get, 0);
390
+ rb_define_method(cNokogiriXmlSaxParserContext, "recovery=", noko_xml_sax_parser_context__recovery_set, 1);
391
+ rb_define_method(cNokogiriXmlSaxParserContext, "recovery", noko_xml_sax_parser_context__recovery_get, 0);
392
+ rb_define_method(cNokogiriXmlSaxParserContext, "line", noko_xml_sax_parser_context__line, 0);
393
+ rb_define_method(cNokogiriXmlSaxParserContext, "column", noko_xml_sax_parser_context__column, 0);
394
+
395
+ id_read = rb_intern("read");
262
396
  }