nokogiri 1.11.1 → 1.12.0.rc1

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (179) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE-DEPENDENCIES.md +232 -11
  3. data/LICENSE.md +1 -1
  4. data/README.md +27 -21
  5. data/dependencies.yml +12 -12
  6. data/ext/nokogiri/depend +35 -474
  7. data/ext/nokogiri/extconf.rb +391 -243
  8. data/ext/nokogiri/gumbo.c +611 -0
  9. data/ext/nokogiri/{html_document.c → html4_document.c} +18 -23
  10. data/ext/nokogiri/html4_element_description.c +294 -0
  11. data/ext/nokogiri/html4_entity_lookup.c +37 -0
  12. data/ext/nokogiri/html4_sax_parser_context.c +119 -0
  13. data/ext/nokogiri/{html_sax_push_parser.c → html4_sax_push_parser.c} +29 -27
  14. data/ext/nokogiri/libxml2_backwards_compat.c +121 -0
  15. data/ext/nokogiri/nokogiri.c +206 -66
  16. data/ext/nokogiri/nokogiri.h +166 -76
  17. data/ext/nokogiri/test_global_handlers.c +3 -4
  18. data/ext/nokogiri/xml_attr.c +15 -15
  19. data/ext/nokogiri/xml_attribute_decl.c +18 -18
  20. data/ext/nokogiri/xml_cdata.c +13 -18
  21. data/ext/nokogiri/xml_comment.c +19 -26
  22. data/ext/nokogiri/xml_document.c +258 -200
  23. data/ext/nokogiri/xml_document_fragment.c +13 -15
  24. data/ext/nokogiri/xml_dtd.c +54 -48
  25. data/ext/nokogiri/xml_element_content.c +31 -26
  26. data/ext/nokogiri/xml_element_decl.c +22 -22
  27. data/ext/nokogiri/xml_encoding_handler.c +28 -17
  28. data/ext/nokogiri/xml_entity_decl.c +32 -30
  29. data/ext/nokogiri/xml_entity_reference.c +16 -18
  30. data/ext/nokogiri/xml_namespace.c +58 -49
  31. data/ext/nokogiri/xml_node.c +473 -414
  32. data/ext/nokogiri/xml_node_set.c +174 -162
  33. data/ext/nokogiri/xml_processing_instruction.c +17 -19
  34. data/ext/nokogiri/xml_reader.c +193 -157
  35. data/ext/nokogiri/xml_relax_ng.c +29 -23
  36. data/ext/nokogiri/xml_sax_parser.c +111 -106
  37. data/ext/nokogiri/xml_sax_parser_context.c +102 -85
  38. data/ext/nokogiri/xml_sax_push_parser.c +34 -27
  39. data/ext/nokogiri/xml_schema.c +49 -41
  40. data/ext/nokogiri/xml_syntax_error.c +21 -23
  41. data/ext/nokogiri/xml_text.c +13 -17
  42. data/ext/nokogiri/xml_xpath_context.c +86 -77
  43. data/ext/nokogiri/xslt_stylesheet.c +157 -156
  44. data/gumbo-parser/CHANGES.md +63 -0
  45. data/gumbo-parser/Makefile +101 -0
  46. data/gumbo-parser/THANKS +27 -0
  47. data/gumbo-parser/src/Makefile +17 -0
  48. data/gumbo-parser/src/README.md +41 -0
  49. data/gumbo-parser/src/ascii.c +75 -0
  50. data/gumbo-parser/src/ascii.h +115 -0
  51. data/gumbo-parser/src/attribute.c +42 -0
  52. data/gumbo-parser/src/attribute.h +17 -0
  53. data/gumbo-parser/src/char_ref.c +22225 -0
  54. data/gumbo-parser/src/char_ref.h +29 -0
  55. data/gumbo-parser/src/char_ref.rl +2154 -0
  56. data/gumbo-parser/src/error.c +626 -0
  57. data/gumbo-parser/src/error.h +148 -0
  58. data/gumbo-parser/src/foreign_attrs.c +104 -0
  59. data/gumbo-parser/src/foreign_attrs.gperf +27 -0
  60. data/gumbo-parser/src/gumbo.h +943 -0
  61. data/gumbo-parser/src/insertion_mode.h +33 -0
  62. data/gumbo-parser/src/macros.h +91 -0
  63. data/gumbo-parser/src/parser.c +4886 -0
  64. data/gumbo-parser/src/parser.h +41 -0
  65. data/gumbo-parser/src/replacement.h +33 -0
  66. data/gumbo-parser/src/string_buffer.c +103 -0
  67. data/gumbo-parser/src/string_buffer.h +68 -0
  68. data/gumbo-parser/src/string_piece.c +48 -0
  69. data/gumbo-parser/src/svg_attrs.c +174 -0
  70. data/gumbo-parser/src/svg_attrs.gperf +77 -0
  71. data/gumbo-parser/src/svg_tags.c +137 -0
  72. data/gumbo-parser/src/svg_tags.gperf +55 -0
  73. data/gumbo-parser/src/tag.c +222 -0
  74. data/gumbo-parser/src/tag_lookup.c +382 -0
  75. data/gumbo-parser/src/tag_lookup.gperf +169 -0
  76. data/gumbo-parser/src/tag_lookup.h +13 -0
  77. data/gumbo-parser/src/token_buffer.c +79 -0
  78. data/gumbo-parser/src/token_buffer.h +71 -0
  79. data/gumbo-parser/src/token_type.h +17 -0
  80. data/gumbo-parser/src/tokenizer.c +3463 -0
  81. data/gumbo-parser/src/tokenizer.h +112 -0
  82. data/gumbo-parser/src/tokenizer_states.h +339 -0
  83. data/gumbo-parser/src/utf8.c +245 -0
  84. data/gumbo-parser/src/utf8.h +164 -0
  85. data/gumbo-parser/src/util.c +68 -0
  86. data/gumbo-parser/src/util.h +30 -0
  87. data/gumbo-parser/src/vector.c +111 -0
  88. data/gumbo-parser/src/vector.h +45 -0
  89. data/lib/nokogiri.rb +31 -50
  90. data/lib/nokogiri/css.rb +14 -14
  91. data/lib/nokogiri/css/parser.rb +2 -2
  92. data/lib/nokogiri/css/parser.y +1 -1
  93. data/lib/nokogiri/css/syntax_error.rb +1 -1
  94. data/lib/nokogiri/extension.rb +26 -0
  95. data/lib/nokogiri/gumbo.rb +14 -0
  96. data/lib/nokogiri/html.rb +31 -27
  97. data/lib/nokogiri/html4.rb +40 -0
  98. data/lib/nokogiri/{html → html4}/builder.rb +2 -2
  99. data/lib/nokogiri/{html → html4}/document.rb +4 -4
  100. data/lib/nokogiri/{html → html4}/document_fragment.rb +17 -17
  101. data/lib/nokogiri/{html → html4}/element_description.rb +1 -1
  102. data/lib/nokogiri/{html → html4}/element_description_defaults.rb +1 -1
  103. data/lib/nokogiri/{html → html4}/entity_lookup.rb +1 -1
  104. data/lib/nokogiri/{html → html4}/sax/parser.rb +11 -14
  105. data/lib/nokogiri/html4/sax/parser_context.rb +19 -0
  106. data/lib/nokogiri/{html → html4}/sax/push_parser.rb +5 -5
  107. data/lib/nokogiri/html5.rb +473 -0
  108. data/lib/nokogiri/html5/document.rb +74 -0
  109. data/lib/nokogiri/html5/document_fragment.rb +80 -0
  110. data/lib/nokogiri/html5/node.rb +93 -0
  111. data/lib/nokogiri/version/constant.rb +1 -1
  112. data/lib/nokogiri/version/info.rb +42 -9
  113. data/lib/nokogiri/xml.rb +35 -36
  114. data/lib/nokogiri/xml/document.rb +74 -28
  115. data/lib/nokogiri/xml/node.rb +45 -47
  116. data/lib/nokogiri/xml/parse_options.rb +2 -0
  117. data/lib/nokogiri/xml/pp.rb +2 -2
  118. data/lib/nokogiri/xml/reader.rb +2 -9
  119. data/lib/nokogiri/xml/sax.rb +4 -4
  120. data/lib/nokogiri/xml/sax/document.rb +24 -30
  121. data/lib/nokogiri/xml/xpath.rb +3 -5
  122. data/lib/nokogiri/xml/xpath/syntax_error.rb +1 -1
  123. data/lib/nokogiri/xslt.rb +16 -16
  124. data/lib/nokogiri/xslt/stylesheet.rb +1 -1
  125. data/patches/libxml2/{0002-Remove-script-macro-support.patch → 0001-Remove-script-macro-support.patch} +0 -0
  126. data/patches/libxml2/{0003-Update-entities-to-remove-handling-of-ssi.patch → 0002-Update-entities-to-remove-handling-of-ssi.patch} +0 -0
  127. data/patches/libxml2/{0004-libxml2.la-is-in-top_builddir.patch → 0003-libxml2.la-is-in-top_builddir.patch} +1 -1
  128. data/patches/libxml2/{0008-use-glibc-strlen.patch → 0004-use-glibc-strlen.patch} +0 -0
  129. data/patches/libxml2/{0009-avoid-isnan-isinf.patch → 0005-avoid-isnan-isinf.patch} +4 -4
  130. data/patches/libxml2/0006-update-automake-files-for-arm64.patch +2511 -0
  131. data/patches/libxml2/0007-Fix-XPath-recursion-limit.patch +31 -0
  132. data/patches/libxslt/0001-update-automake-files-for-arm64.patch +2511 -0
  133. data/patches/libxslt/0002-Fix-xml2-config-check-in-configure-script.patch +19 -0
  134. data/ports/archives/libxml2-2.9.12.tar.gz +0 -0
  135. metadata +117 -109
  136. data/ext/nokogiri/html_document.h +0 -10
  137. data/ext/nokogiri/html_element_description.c +0 -279
  138. data/ext/nokogiri/html_element_description.h +0 -10
  139. data/ext/nokogiri/html_entity_lookup.c +0 -32
  140. data/ext/nokogiri/html_entity_lookup.h +0 -8
  141. data/ext/nokogiri/html_sax_parser_context.c +0 -118
  142. data/ext/nokogiri/html_sax_parser_context.h +0 -11
  143. data/ext/nokogiri/html_sax_push_parser.h +0 -9
  144. data/ext/nokogiri/xml_attr.h +0 -9
  145. data/ext/nokogiri/xml_attribute_decl.h +0 -9
  146. data/ext/nokogiri/xml_cdata.h +0 -9
  147. data/ext/nokogiri/xml_comment.h +0 -9
  148. data/ext/nokogiri/xml_document.h +0 -23
  149. data/ext/nokogiri/xml_document_fragment.h +0 -10
  150. data/ext/nokogiri/xml_dtd.h +0 -10
  151. data/ext/nokogiri/xml_element_content.h +0 -10
  152. data/ext/nokogiri/xml_element_decl.h +0 -9
  153. data/ext/nokogiri/xml_encoding_handler.h +0 -8
  154. data/ext/nokogiri/xml_entity_decl.h +0 -10
  155. data/ext/nokogiri/xml_entity_reference.h +0 -9
  156. data/ext/nokogiri/xml_io.c +0 -63
  157. data/ext/nokogiri/xml_io.h +0 -11
  158. data/ext/nokogiri/xml_libxml2_hacks.c +0 -112
  159. data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
  160. data/ext/nokogiri/xml_namespace.h +0 -14
  161. data/ext/nokogiri/xml_node.h +0 -13
  162. data/ext/nokogiri/xml_node_set.h +0 -12
  163. data/ext/nokogiri/xml_processing_instruction.h +0 -9
  164. data/ext/nokogiri/xml_reader.h +0 -10
  165. data/ext/nokogiri/xml_relax_ng.h +0 -9
  166. data/ext/nokogiri/xml_sax_parser.h +0 -39
  167. data/ext/nokogiri/xml_sax_parser_context.h +0 -10
  168. data/ext/nokogiri/xml_sax_push_parser.h +0 -9
  169. data/ext/nokogiri/xml_schema.h +0 -9
  170. data/ext/nokogiri/xml_syntax_error.h +0 -25
  171. data/ext/nokogiri/xml_text.h +0 -9
  172. data/ext/nokogiri/xml_xpath_context.h +0 -10
  173. data/ext/nokogiri/xslt_stylesheet.h +0 -14
  174. data/lib/nokogiri/html/sax/parser_context.rb +0 -17
  175. data/patches/libxml2/0001-Revert-Do-not-URI-escape-in-server-side-includes.patch +0 -78
  176. data/patches/libxml2/0005-Fix-infinite-loop-in-xmlStringLenDecodeEntities.patch +0 -32
  177. data/patches/libxml2/0006-htmlParseComment-treat-as-if-it-closed-the-comment.patch +0 -73
  178. data/patches/libxml2/0007-use-new-htmlParseLookupCommentEnd-to-find-comment-en.patch +0 -103
  179. data/ports/archives/libxml2-2.9.10.tar.gz +0 -0
@@ -1,4 +1,6 @@
1
- #include <html_document.h>
1
+ #include <nokogiri.h>
2
+
3
+ VALUE cNokogiriHtml4Document ;
2
4
 
3
5
  static ID id_encoding_found;
4
6
  static ID id_to_s;
@@ -23,8 +25,7 @@ rb_html_document_s_new(int argc, VALUE *argv, VALUE klass)
23
25
  RTEST(uri) ? (const xmlChar *)StringValueCStr(uri) : NULL,
24
26
  RTEST(external_id) ? (const xmlChar *)StringValueCStr(external_id) : NULL
25
27
  );
26
- rb_doc = Nokogiri_wrap_xml_document(klass, doc);
27
- rb_obj_call_init(rb_doc, argc, argv);
28
+ rb_doc = noko_xml_document_wrap_with_init_args(klass, doc, argc, argv);
28
29
  return rb_doc ;
29
30
  }
30
31
 
@@ -33,7 +34,7 @@ rb_html_document_s_new(int argc, VALUE *argv, VALUE klass)
33
34
  * read_io(io, url, encoding, options)
34
35
  *
35
36
  * Read the HTML document from +io+ with given +url+, +encoding+,
36
- * and +options+. See Nokogiri::HTML.parse
37
+ * and +options+. See Nokogiri::HTML4.parse
37
38
  */
38
39
  static VALUE
39
40
  rb_html_document_s_read_io(VALUE klass, VALUE rb_io, VALUE rb_url, VALUE rb_encoding, VALUE rb_options)
@@ -47,7 +48,7 @@ rb_html_document_s_read_io(VALUE klass, VALUE rb_io, VALUE rb_url, VALUE rb_enco
47
48
 
48
49
  xmlSetStructuredErrorFunc((void *)rb_error_list, Nokogiri_error_array_pusher);
49
50
 
50
- c_doc = htmlReadIO(io_read_callback, io_close_callback, (void *)rb_io, c_url, c_encoding, options);
51
+ c_doc = htmlReadIO(noko_io_read, noko_io_close, (void *)rb_io, c_url, c_encoding, options);
51
52
 
52
53
  xmlSetStructuredErrorFunc(NULL, NULL);
53
54
 
@@ -81,7 +82,7 @@ rb_html_document_s_read_io(VALUE klass, VALUE rb_io, VALUE rb_url, VALUE rb_enco
81
82
  return Qnil;
82
83
  }
83
84
 
84
- rb_doc = Nokogiri_wrap_xml_document(klass, c_doc);
85
+ rb_doc = noko_xml_document_wrap(klass, c_doc);
85
86
  rb_iv_set(rb_doc, "@errors", rb_error_list);
86
87
  return rb_doc;
87
88
  }
@@ -91,7 +92,7 @@ rb_html_document_s_read_io(VALUE klass, VALUE rb_io, VALUE rb_url, VALUE rb_enco
91
92
  * read_memory(string, url, encoding, options)
92
93
  *
93
94
  * Read the HTML document contained in +string+ with given +url+, +encoding+,
94
- * and +options+. See Nokogiri::HTML.parse
95
+ * and +options+. See Nokogiri::HTML4.parse
95
96
  */
96
97
  static VALUE
97
98
  rb_html_document_s_read_memory(VALUE klass, VALUE rb_html, VALUE rb_url, VALUE rb_encoding, VALUE rb_options)
@@ -129,7 +130,7 @@ rb_html_document_s_read_memory(VALUE klass, VALUE rb_html, VALUE rb_url, VALUE r
129
130
  return Qnil;
130
131
  }
131
132
 
132
- rb_doc = Nokogiri_wrap_xml_document(klass, c_doc);
133
+ rb_doc = noko_xml_document_wrap(klass, c_doc);
133
134
  rb_iv_set(rb_doc, "@errors", rb_error_list);
134
135
  return rb_doc;
135
136
  }
@@ -148,23 +149,17 @@ rb_html_document_type(VALUE self)
148
149
  return INT2NUM((long)doc->type);
149
150
  }
150
151
 
151
- VALUE cNokogiriHtmlDocument ;
152
-
153
152
  void
154
- init_html_document()
153
+ noko_init_html_document()
155
154
  {
156
- VALUE nokogiri = rb_define_module("Nokogiri");
157
- VALUE nokogiri_xml = rb_define_module_under(nokogiri, "XML");
158
- VALUE nokogiri_xml_node = rb_define_class_under(nokogiri_xml, "Node", rb_cObject);
159
- VALUE nokogiri_xml_document = rb_define_class_under(nokogiri_xml, "Document", nokogiri_xml_node);
160
- VALUE nokogiri_html = rb_define_module_under(nokogiri, "HTML");
161
- cNokogiriHtmlDocument = rb_define_class_under(nokogiri_html, "Document", nokogiri_xml_document);
162
-
163
- rb_define_singleton_method(cNokogiriHtmlDocument, "read_memory", rb_html_document_s_read_memory, 4);
164
- rb_define_singleton_method(cNokogiriHtmlDocument, "read_io", rb_html_document_s_read_io, 4);
165
- rb_define_singleton_method(cNokogiriHtmlDocument, "new", rb_html_document_s_new, -1);
166
-
167
- rb_define_method(cNokogiriHtmlDocument, "type", rb_html_document_type, 0);
155
+ assert(cNokogiriXmlDocument);
156
+ cNokogiriHtml4Document = rb_define_class_under(mNokogiriHtml4, "Document", cNokogiriXmlDocument);
157
+
158
+ rb_define_singleton_method(cNokogiriHtml4Document, "read_memory", rb_html_document_s_read_memory, 4);
159
+ rb_define_singleton_method(cNokogiriHtml4Document, "read_io", rb_html_document_s_read_io, 4);
160
+ rb_define_singleton_method(cNokogiriHtml4Document, "new", rb_html_document_s_new, -1);
161
+
162
+ rb_define_method(cNokogiriHtml4Document, "type", rb_html_document_type, 0);
168
163
 
169
164
  id_encoding_found = rb_intern("encoding_found");
170
165
  id_to_s = rb_intern("to_s");
@@ -0,0 +1,294 @@
1
+ #include <nokogiri.h>
2
+
3
+ VALUE cNokogiriHtml4ElementDescription ;
4
+
5
+ /*
6
+ * call-seq:
7
+ * required_attributes
8
+ *
9
+ * A list of required attributes for this element
10
+ */
11
+ static VALUE
12
+ required_attributes(VALUE self)
13
+ {
14
+ const htmlElemDesc *description;
15
+ VALUE list;
16
+ int i;
17
+
18
+ Data_Get_Struct(self, htmlElemDesc, description);
19
+
20
+ list = rb_ary_new();
21
+
22
+ if (NULL == description->attrs_req) { return list; }
23
+
24
+ for (i = 0; description->attrs_depr[i]; i++) {
25
+ rb_ary_push(list, NOKOGIRI_STR_NEW2(description->attrs_req[i]));
26
+ }
27
+
28
+ return list;
29
+ }
30
+
31
+ /*
32
+ * call-seq:
33
+ * deprecated_attributes
34
+ *
35
+ * A list of deprecated attributes for this element
36
+ */
37
+ static VALUE
38
+ deprecated_attributes(VALUE self)
39
+ {
40
+ const htmlElemDesc *description;
41
+ VALUE list;
42
+ int i;
43
+
44
+ Data_Get_Struct(self, htmlElemDesc, description);
45
+
46
+ list = rb_ary_new();
47
+
48
+ if (NULL == description->attrs_depr) { return list; }
49
+
50
+ for (i = 0; description->attrs_depr[i]; i++) {
51
+ rb_ary_push(list, NOKOGIRI_STR_NEW2(description->attrs_depr[i]));
52
+ }
53
+
54
+ return list;
55
+ }
56
+
57
+ /*
58
+ * call-seq:
59
+ * optional_attributes
60
+ *
61
+ * A list of optional attributes for this element
62
+ */
63
+ static VALUE
64
+ optional_attributes(VALUE self)
65
+ {
66
+ const htmlElemDesc *description;
67
+ VALUE list;
68
+ int i;
69
+
70
+ Data_Get_Struct(self, htmlElemDesc, description);
71
+
72
+ list = rb_ary_new();
73
+
74
+ if (NULL == description->attrs_opt) { return list; }
75
+
76
+ for (i = 0; description->attrs_opt[i]; i++) {
77
+ rb_ary_push(list, NOKOGIRI_STR_NEW2(description->attrs_opt[i]));
78
+ }
79
+
80
+ return list;
81
+ }
82
+
83
+ /*
84
+ * call-seq:
85
+ * default_sub_element
86
+ *
87
+ * The default sub element for this element
88
+ */
89
+ static VALUE
90
+ default_sub_element(VALUE self)
91
+ {
92
+ const htmlElemDesc *description;
93
+ Data_Get_Struct(self, htmlElemDesc, description);
94
+
95
+ if (description->defaultsubelt) {
96
+ return NOKOGIRI_STR_NEW2(description->defaultsubelt);
97
+ }
98
+
99
+ return Qnil;
100
+ }
101
+
102
+ /*
103
+ * call-seq:
104
+ * sub_elements
105
+ *
106
+ * A list of allowed sub elements for this element.
107
+ */
108
+ static VALUE
109
+ sub_elements(VALUE self)
110
+ {
111
+ const htmlElemDesc *description;
112
+ VALUE list;
113
+ int i;
114
+
115
+ Data_Get_Struct(self, htmlElemDesc, description);
116
+
117
+ list = rb_ary_new();
118
+
119
+ if (NULL == description->subelts) { return list; }
120
+
121
+ for (i = 0; description->subelts[i]; i++) {
122
+ rb_ary_push(list, NOKOGIRI_STR_NEW2(description->subelts[i]));
123
+ }
124
+
125
+ return list;
126
+ }
127
+
128
+ /*
129
+ * call-seq:
130
+ * description
131
+ *
132
+ * The description for this element
133
+ */
134
+ static VALUE
135
+ description(VALUE self)
136
+ {
137
+ const htmlElemDesc *description;
138
+ Data_Get_Struct(self, htmlElemDesc, description);
139
+
140
+ return NOKOGIRI_STR_NEW2(description->desc);
141
+ }
142
+
143
+ /*
144
+ * call-seq:
145
+ * inline?
146
+ *
147
+ * Is this element an inline element?
148
+ */
149
+ static VALUE
150
+ inline_eh(VALUE self)
151
+ {
152
+ const htmlElemDesc *description;
153
+ Data_Get_Struct(self, htmlElemDesc, description);
154
+
155
+ if (description->isinline) { return Qtrue; }
156
+ return Qfalse;
157
+ }
158
+
159
+ /*
160
+ * call-seq:
161
+ * deprecated?
162
+ *
163
+ * Is this element deprecated?
164
+ */
165
+ static VALUE
166
+ deprecated_eh(VALUE self)
167
+ {
168
+ const htmlElemDesc *description;
169
+ Data_Get_Struct(self, htmlElemDesc, description);
170
+
171
+ if (description->depr) { return Qtrue; }
172
+ return Qfalse;
173
+ }
174
+
175
+ /*
176
+ * call-seq:
177
+ * empty?
178
+ *
179
+ * Is this an empty element?
180
+ */
181
+ static VALUE
182
+ empty_eh(VALUE self)
183
+ {
184
+ const htmlElemDesc *description;
185
+ Data_Get_Struct(self, htmlElemDesc, description);
186
+
187
+ if (description->empty) { return Qtrue; }
188
+ return Qfalse;
189
+ }
190
+
191
+ /*
192
+ * call-seq:
193
+ * save_end_tag?
194
+ *
195
+ * Should the end tag be saved?
196
+ */
197
+ static VALUE
198
+ save_end_tag_eh(VALUE self)
199
+ {
200
+ const htmlElemDesc *description;
201
+ Data_Get_Struct(self, htmlElemDesc, description);
202
+
203
+ if (description->saveEndTag) { return Qtrue; }
204
+ return Qfalse;
205
+ }
206
+
207
+ /*
208
+ * call-seq:
209
+ * implied_end_tag?
210
+ *
211
+ * Can the end tag be implied for this tag?
212
+ */
213
+ static VALUE
214
+ implied_end_tag_eh(VALUE self)
215
+ {
216
+ const htmlElemDesc *description;
217
+ Data_Get_Struct(self, htmlElemDesc, description);
218
+
219
+ if (description->endTag) { return Qtrue; }
220
+ return Qfalse;
221
+ }
222
+
223
+ /*
224
+ * call-seq:
225
+ * implied_start_tag?
226
+ *
227
+ * Can the start tag be implied for this tag?
228
+ */
229
+ static VALUE
230
+ implied_start_tag_eh(VALUE self)
231
+ {
232
+ const htmlElemDesc *description;
233
+ Data_Get_Struct(self, htmlElemDesc, description);
234
+
235
+ if (description->startTag) { return Qtrue; }
236
+ return Qfalse;
237
+ }
238
+
239
+ /*
240
+ * call-seq:
241
+ * name
242
+ *
243
+ * Get the tag name for this ElemementDescription
244
+ */
245
+ static VALUE
246
+ name(VALUE self)
247
+ {
248
+ const htmlElemDesc *description;
249
+ Data_Get_Struct(self, htmlElemDesc, description);
250
+
251
+ if (NULL == description->name) { return Qnil; }
252
+ return NOKOGIRI_STR_NEW2(description->name);
253
+ }
254
+
255
+ /*
256
+ * call-seq:
257
+ * [](tag_name)
258
+ *
259
+ * Get ElemementDescription for +tag_name+
260
+ */
261
+ static VALUE
262
+ get_description(VALUE klass, VALUE tag_name)
263
+ {
264
+ const htmlElemDesc *description = htmlTagLookup(
265
+ (const xmlChar *)StringValueCStr(tag_name)
266
+ );
267
+
268
+ if (NULL == description) { return Qnil; }
269
+ return Data_Wrap_Struct(klass, 0, 0, (void *)(uintptr_t)description);
270
+ }
271
+
272
+ void
273
+ noko_init_html_element_description()
274
+ {
275
+ cNokogiriHtml4ElementDescription = rb_define_class_under(mNokogiriHtml4, "ElementDescription", rb_cObject);
276
+
277
+ rb_undef_alloc_func(cNokogiriHtml4ElementDescription);
278
+
279
+ rb_define_singleton_method(cNokogiriHtml4ElementDescription, "[]", get_description, 1);
280
+
281
+ rb_define_method(cNokogiriHtml4ElementDescription, "name", name, 0);
282
+ rb_define_method(cNokogiriHtml4ElementDescription, "implied_start_tag?", implied_start_tag_eh, 0);
283
+ rb_define_method(cNokogiriHtml4ElementDescription, "implied_end_tag?", implied_end_tag_eh, 0);
284
+ rb_define_method(cNokogiriHtml4ElementDescription, "save_end_tag?", save_end_tag_eh, 0);
285
+ rb_define_method(cNokogiriHtml4ElementDescription, "empty?", empty_eh, 0);
286
+ rb_define_method(cNokogiriHtml4ElementDescription, "deprecated?", deprecated_eh, 0);
287
+ rb_define_method(cNokogiriHtml4ElementDescription, "inline?", inline_eh, 0);
288
+ rb_define_method(cNokogiriHtml4ElementDescription, "description", description, 0);
289
+ rb_define_method(cNokogiriHtml4ElementDescription, "sub_elements", sub_elements, 0);
290
+ rb_define_method(cNokogiriHtml4ElementDescription, "default_sub_element", default_sub_element, 0);
291
+ rb_define_method(cNokogiriHtml4ElementDescription, "optional_attributes", optional_attributes, 0);
292
+ rb_define_method(cNokogiriHtml4ElementDescription, "deprecated_attributes", deprecated_attributes, 0);
293
+ rb_define_method(cNokogiriHtml4ElementDescription, "required_attributes", required_attributes, 0);
294
+ }
@@ -0,0 +1,37 @@
1
+ #include <nokogiri.h>
2
+
3
+ static VALUE cNokogiriHtml4EntityLookup;
4
+
5
+ /*
6
+ * call-seq:
7
+ * get(key)
8
+ *
9
+ * Get the HTML4::EntityDescription for +key+
10
+ */
11
+ static VALUE
12
+ get(VALUE _, VALUE rb_entity_name)
13
+ {
14
+ VALUE cNokogiriHtml4EntityDescription;
15
+ const htmlEntityDesc *c_entity_desc;
16
+ VALUE rb_constructor_args[3];
17
+
18
+ c_entity_desc = htmlEntityLookup((const xmlChar *)StringValueCStr(rb_entity_name));
19
+ if (NULL == c_entity_desc) {
20
+ return Qnil;
21
+ }
22
+
23
+ rb_constructor_args[0] = INT2NUM((long)c_entity_desc->value);
24
+ rb_constructor_args[1] = NOKOGIRI_STR_NEW2(c_entity_desc->name);
25
+ rb_constructor_args[2] = NOKOGIRI_STR_NEW2(c_entity_desc->desc);
26
+
27
+ cNokogiriHtml4EntityDescription = rb_const_get_at(mNokogiriHtml4, rb_intern("EntityDescription"));
28
+ return rb_class_new_instance(3, rb_constructor_args, cNokogiriHtml4EntityDescription);
29
+ }
30
+
31
+ void
32
+ noko_init_html_entity_lookup()
33
+ {
34
+ cNokogiriHtml4EntityLookup = rb_define_class_under(mNokogiriHtml4, "EntityLookup", rb_cObject);
35
+
36
+ rb_define_method(cNokogiriHtml4EntityLookup, "get", get, 1);
37
+ }
@@ -0,0 +1,119 @@
1
+ #include <nokogiri.h>
2
+
3
+ VALUE cNokogiriHtml4SaxParserContext ;
4
+
5
+ static void
6
+ deallocate(xmlParserCtxtPtr ctxt)
7
+ {
8
+ NOKOGIRI_DEBUG_START(ctxt);
9
+
10
+ ctxt->sax = NULL;
11
+
12
+ htmlFreeParserCtxt(ctxt);
13
+
14
+ NOKOGIRI_DEBUG_END(ctxt);
15
+ }
16
+
17
+ static VALUE
18
+ parse_memory(VALUE klass, VALUE data, VALUE encoding)
19
+ {
20
+ htmlParserCtxtPtr ctxt;
21
+
22
+ if (NIL_P(data)) {
23
+ rb_raise(rb_eArgError, "data cannot be nil");
24
+ }
25
+ if (!(int)RSTRING_LEN(data)) {
26
+ rb_raise(rb_eRuntimeError, "data cannot be empty");
27
+ }
28
+
29
+ ctxt = htmlCreateMemoryParserCtxt(StringValuePtr(data),
30
+ (int)RSTRING_LEN(data));
31
+ if (ctxt->sax) {
32
+ xmlFree(ctxt->sax);
33
+ ctxt->sax = NULL;
34
+ }
35
+
36
+ if (RTEST(encoding)) {
37
+ xmlCharEncodingHandlerPtr enc = xmlFindCharEncodingHandler(StringValueCStr(encoding));
38
+ if (enc != NULL) {
39
+ xmlSwitchToEncoding(ctxt, enc);
40
+ if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
41
+ rb_raise(rb_eRuntimeError, "Unsupported encoding %s",
42
+ StringValueCStr(encoding));
43
+ }
44
+ }
45
+ }
46
+
47
+ return Data_Wrap_Struct(klass, NULL, deallocate, ctxt);
48
+ }
49
+
50
+ static VALUE
51
+ parse_file(VALUE klass, VALUE filename, VALUE encoding)
52
+ {
53
+ htmlParserCtxtPtr ctxt = htmlCreateFileParserCtxt(
54
+ StringValueCStr(filename),
55
+ StringValueCStr(encoding)
56
+ );
57
+ return Data_Wrap_Struct(klass, NULL, deallocate, ctxt);
58
+ }
59
+
60
+ static VALUE
61
+ parse_doc(VALUE ctxt_val)
62
+ {
63
+ htmlParserCtxtPtr ctxt = (htmlParserCtxtPtr)ctxt_val;
64
+ htmlParseDocument(ctxt);
65
+ return Qnil;
66
+ }
67
+
68
+ static VALUE
69
+ parse_doc_finalize(VALUE ctxt_val)
70
+ {
71
+ htmlParserCtxtPtr ctxt = (htmlParserCtxtPtr)ctxt_val;
72
+
73
+ if (ctxt->myDoc) {
74
+ xmlFreeDoc(ctxt->myDoc);
75
+ }
76
+
77
+ NOKOGIRI_SAX_TUPLE_DESTROY(ctxt->userData);
78
+ return Qnil;
79
+ }
80
+
81
+ static VALUE
82
+ parse_with(VALUE self, VALUE sax_handler)
83
+ {
84
+ htmlParserCtxtPtr ctxt;
85
+ htmlSAXHandlerPtr sax;
86
+
87
+ if (!rb_obj_is_kind_of(sax_handler, cNokogiriXmlSaxParser)) {
88
+ rb_raise(rb_eArgError, "argument must be a Nokogiri::XML::SAX::Parser");
89
+ }
90
+
91
+ Data_Get_Struct(self, htmlParserCtxt, ctxt);
92
+ Data_Get_Struct(sax_handler, htmlSAXHandler, sax);
93
+
94
+ /* Free the sax handler since we'll assign our own */
95
+ if (ctxt->sax && ctxt->sax != (xmlSAXHandlerPtr)&xmlDefaultSAXHandler) {
96
+ xmlFree(ctxt->sax);
97
+ }
98
+
99
+ ctxt->sax = sax;
100
+ ctxt->userData = (void *)NOKOGIRI_SAX_TUPLE_NEW(ctxt, sax_handler);
101
+
102
+ xmlSetStructuredErrorFunc(NULL, NULL);
103
+
104
+ rb_ensure(parse_doc, (VALUE)ctxt, parse_doc_finalize, (VALUE)ctxt);
105
+
106
+ return self;
107
+ }
108
+
109
+ void
110
+ noko_init_html_sax_parser_context()
111
+ {
112
+ assert(cNokogiriXmlSaxParserContext);
113
+ cNokogiriHtml4SaxParserContext = rb_define_class_under(mNokogiriHtml4Sax, "ParserContext", cNokogiriXmlSaxParserContext);
114
+
115
+ rb_define_singleton_method(cNokogiriHtml4SaxParserContext, "memory", parse_memory, 2);
116
+ rb_define_singleton_method(cNokogiriHtml4SaxParserContext, "file", parse_file, 2);
117
+
118
+ rb_define_method(cNokogiriHtml4SaxParserContext, "parse_with", parse_with, 1);
119
+ }