nokogiri 1.6.0 → 1.13.2

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (340) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +3 -19
  3. data/LICENSE-DEPENDENCIES.md +1903 -0
  4. data/LICENSE.md +9 -0
  5. data/README.md +280 -0
  6. data/bin/nokogiri +84 -31
  7. data/dependencies.yml +23 -4
  8. data/ext/nokogiri/depend +38 -358
  9. data/ext/nokogiri/extconf.rb +952 -132
  10. data/ext/nokogiri/gumbo.c +584 -0
  11. data/ext/nokogiri/html4_document.c +166 -0
  12. data/ext/nokogiri/html4_element_description.c +294 -0
  13. data/ext/nokogiri/html4_entity_lookup.c +37 -0
  14. data/ext/nokogiri/html4_sax_parser_context.c +120 -0
  15. data/ext/nokogiri/html4_sax_push_parser.c +95 -0
  16. data/ext/nokogiri/libxml2_backwards_compat.c +121 -0
  17. data/ext/nokogiri/nokogiri.c +231 -96
  18. data/ext/nokogiri/nokogiri.h +188 -129
  19. data/ext/nokogiri/test_global_handlers.c +40 -0
  20. data/ext/nokogiri/xml_attr.c +49 -40
  21. data/ext/nokogiri/xml_attribute_decl.c +18 -18
  22. data/ext/nokogiri/xml_cdata.c +24 -23
  23. data/ext/nokogiri/xml_comment.c +29 -21
  24. data/ext/nokogiri/xml_document.c +327 -223
  25. data/ext/nokogiri/xml_document_fragment.c +12 -16
  26. data/ext/nokogiri/xml_dtd.c +56 -50
  27. data/ext/nokogiri/xml_element_content.c +31 -26
  28. data/ext/nokogiri/xml_element_decl.c +22 -22
  29. data/ext/nokogiri/xml_encoding_handler.c +45 -20
  30. data/ext/nokogiri/xml_entity_decl.c +32 -30
  31. data/ext/nokogiri/xml_entity_reference.c +16 -18
  32. data/ext/nokogiri/xml_namespace.c +74 -32
  33. data/ext/nokogiri/xml_node.c +1290 -680
  34. data/ext/nokogiri/xml_node_set.c +239 -208
  35. data/ext/nokogiri/xml_processing_instruction.c +17 -19
  36. data/ext/nokogiri/xml_reader.c +227 -189
  37. data/ext/nokogiri/xml_relax_ng.c +52 -28
  38. data/ext/nokogiri/xml_sax_parser.c +123 -125
  39. data/ext/nokogiri/xml_sax_parser_context.c +138 -79
  40. data/ext/nokogiri/xml_sax_push_parser.c +88 -35
  41. data/ext/nokogiri/xml_schema.c +112 -33
  42. data/ext/nokogiri/xml_syntax_error.c +50 -23
  43. data/ext/nokogiri/xml_text.c +14 -18
  44. data/ext/nokogiri/xml_xpath_context.c +227 -140
  45. data/ext/nokogiri/xslt_stylesheet.c +269 -177
  46. data/gumbo-parser/CHANGES.md +63 -0
  47. data/gumbo-parser/Makefile +101 -0
  48. data/gumbo-parser/THANKS +27 -0
  49. data/gumbo-parser/src/Makefile +34 -0
  50. data/gumbo-parser/src/README.md +41 -0
  51. data/gumbo-parser/src/ascii.c +75 -0
  52. data/gumbo-parser/src/ascii.h +115 -0
  53. data/gumbo-parser/src/attribute.c +42 -0
  54. data/gumbo-parser/src/attribute.h +17 -0
  55. data/gumbo-parser/src/char_ref.c +22225 -0
  56. data/gumbo-parser/src/char_ref.h +29 -0
  57. data/gumbo-parser/src/char_ref.rl +2154 -0
  58. data/gumbo-parser/src/error.c +626 -0
  59. data/gumbo-parser/src/error.h +148 -0
  60. data/gumbo-parser/src/foreign_attrs.c +104 -0
  61. data/gumbo-parser/src/foreign_attrs.gperf +27 -0
  62. data/gumbo-parser/src/gumbo.h +943 -0
  63. data/gumbo-parser/src/insertion_mode.h +33 -0
  64. data/gumbo-parser/src/macros.h +91 -0
  65. data/gumbo-parser/src/parser.c +4875 -0
  66. data/gumbo-parser/src/parser.h +41 -0
  67. data/gumbo-parser/src/replacement.h +33 -0
  68. data/gumbo-parser/src/string_buffer.c +103 -0
  69. data/gumbo-parser/src/string_buffer.h +68 -0
  70. data/gumbo-parser/src/string_piece.c +48 -0
  71. data/gumbo-parser/src/svg_attrs.c +174 -0
  72. data/gumbo-parser/src/svg_attrs.gperf +77 -0
  73. data/gumbo-parser/src/svg_tags.c +137 -0
  74. data/gumbo-parser/src/svg_tags.gperf +55 -0
  75. data/gumbo-parser/src/tag.c +222 -0
  76. data/gumbo-parser/src/tag_lookup.c +382 -0
  77. data/gumbo-parser/src/tag_lookup.gperf +169 -0
  78. data/gumbo-parser/src/tag_lookup.h +13 -0
  79. data/gumbo-parser/src/token_buffer.c +79 -0
  80. data/gumbo-parser/src/token_buffer.h +71 -0
  81. data/gumbo-parser/src/token_type.h +17 -0
  82. data/gumbo-parser/src/tokenizer.c +3463 -0
  83. data/gumbo-parser/src/tokenizer.h +112 -0
  84. data/gumbo-parser/src/tokenizer_states.h +339 -0
  85. data/gumbo-parser/src/utf8.c +245 -0
  86. data/gumbo-parser/src/utf8.h +164 -0
  87. data/gumbo-parser/src/util.c +68 -0
  88. data/gumbo-parser/src/util.h +30 -0
  89. data/gumbo-parser/src/vector.c +111 -0
  90. data/gumbo-parser/src/vector.h +45 -0
  91. data/lib/nokogiri/class_resolver.rb +67 -0
  92. data/lib/nokogiri/css/node.rb +10 -58
  93. data/lib/nokogiri/css/parser.rb +407 -357
  94. data/lib/nokogiri/css/parser.y +265 -246
  95. data/lib/nokogiri/css/parser_extras.rb +52 -49
  96. data/lib/nokogiri/css/syntax_error.rb +3 -1
  97. data/lib/nokogiri/css/tokenizer.rb +107 -104
  98. data/lib/nokogiri/css/tokenizer.rex +8 -7
  99. data/lib/nokogiri/css/xpath_visitor.rb +266 -80
  100. data/lib/nokogiri/css.rb +50 -17
  101. data/lib/nokogiri/decorators/slop.rb +17 -8
  102. data/lib/nokogiri/extension.rb +31 -0
  103. data/lib/nokogiri/gumbo.rb +15 -0
  104. data/lib/nokogiri/html.rb +38 -27
  105. data/lib/nokogiri/{html → html4}/builder.rb +4 -2
  106. data/lib/nokogiri/html4/document.rb +331 -0
  107. data/lib/nokogiri/html4/document_fragment.rb +54 -0
  108. data/lib/nokogiri/{html → html4}/element_description.rb +3 -1
  109. data/lib/nokogiri/html4/element_description_defaults.rb +578 -0
  110. data/lib/nokogiri/{html → html4}/entity_lookup.rb +4 -2
  111. data/lib/nokogiri/{html → html4}/sax/parser.rb +24 -15
  112. data/lib/nokogiri/html4/sax/parser_context.rb +20 -0
  113. data/lib/nokogiri/html4/sax/push_parser.rb +37 -0
  114. data/lib/nokogiri/html4.rb +46 -0
  115. data/lib/nokogiri/html5/document.rb +88 -0
  116. data/lib/nokogiri/html5/document_fragment.rb +83 -0
  117. data/lib/nokogiri/html5/node.rb +96 -0
  118. data/lib/nokogiri/html5.rb +477 -0
  119. data/lib/nokogiri/jruby/dependencies.rb +21 -0
  120. data/lib/nokogiri/syntax_error.rb +2 -0
  121. data/lib/nokogiri/version/constant.rb +6 -0
  122. data/lib/nokogiri/version/info.rb +221 -0
  123. data/lib/nokogiri/version.rb +3 -105
  124. data/lib/nokogiri/xml/attr.rb +6 -3
  125. data/lib/nokogiri/xml/attribute_decl.rb +3 -1
  126. data/lib/nokogiri/xml/builder.rb +96 -54
  127. data/lib/nokogiri/xml/cdata.rb +3 -1
  128. data/lib/nokogiri/xml/character_data.rb +2 -0
  129. data/lib/nokogiri/xml/document.rb +234 -95
  130. data/lib/nokogiri/xml/document_fragment.rb +86 -36
  131. data/lib/nokogiri/xml/dtd.rb +16 -4
  132. data/lib/nokogiri/xml/element_content.rb +2 -0
  133. data/lib/nokogiri/xml/element_decl.rb +3 -1
  134. data/lib/nokogiri/xml/entity_decl.rb +4 -2
  135. data/lib/nokogiri/xml/entity_reference.rb +20 -0
  136. data/lib/nokogiri/xml/namespace.rb +3 -0
  137. data/lib/nokogiri/xml/node/save_options.rb +8 -4
  138. data/lib/nokogiri/xml/node.rb +947 -502
  139. data/lib/nokogiri/xml/node_set.rb +168 -159
  140. data/lib/nokogiri/xml/notation.rb +13 -0
  141. data/lib/nokogiri/xml/parse_options.rb +40 -5
  142. data/lib/nokogiri/xml/pp/character_data.rb +9 -6
  143. data/lib/nokogiri/xml/pp/node.rb +25 -26
  144. data/lib/nokogiri/xml/pp.rb +4 -2
  145. data/lib/nokogiri/xml/processing_instruction.rb +3 -1
  146. data/lib/nokogiri/xml/reader.rb +23 -28
  147. data/lib/nokogiri/xml/relax_ng.rb +8 -2
  148. data/lib/nokogiri/xml/sax/document.rb +45 -49
  149. data/lib/nokogiri/xml/sax/parser.rb +43 -41
  150. data/lib/nokogiri/xml/sax/parser_context.rb +8 -3
  151. data/lib/nokogiri/xml/sax/push_parser.rb +6 -5
  152. data/lib/nokogiri/xml/sax.rb +6 -4
  153. data/lib/nokogiri/xml/schema.rb +19 -9
  154. data/lib/nokogiri/xml/searchable.rb +270 -0
  155. data/lib/nokogiri/xml/syntax_error.rb +25 -1
  156. data/lib/nokogiri/xml/text.rb +2 -0
  157. data/lib/nokogiri/xml/xpath/syntax_error.rb +4 -2
  158. data/lib/nokogiri/xml/xpath.rb +15 -4
  159. data/lib/nokogiri/xml/xpath_context.rb +3 -3
  160. data/lib/nokogiri/xml.rb +38 -36
  161. data/lib/nokogiri/xslt/stylesheet.rb +3 -1
  162. data/lib/nokogiri/xslt.rb +29 -20
  163. data/lib/nokogiri.rb +69 -69
  164. data/lib/xsd/xmlparser/nokogiri.rb +26 -24
  165. data/patches/libxml2/0001-Remove-script-macro-support.patch +40 -0
  166. data/patches/libxml2/0002-Update-entities-to-remove-handling-of-ssi.patch +44 -0
  167. data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +25 -0
  168. data/patches/libxml2/0004-use-glibc-strlen.patch +53 -0
  169. data/patches/libxml2/0005-avoid-isnan-isinf.patch +81 -0
  170. data/patches/libxml2/0006-update-automake-files-for-arm64.patch +3040 -0
  171. data/patches/libxml2/0008-htmlParseComment-handle-abruptly-closed-comments.patch +61 -0
  172. data/patches/libxml2/0009-allow-wildcard-namespaces.patch +77 -0
  173. data/patches/libxslt/0001-update-automake-files-for-arm64.patch +3037 -0
  174. data/ports/archives/libxml2-2.9.13.tar.xz +0 -0
  175. data/ports/archives/libxslt-1.1.35.tar.xz +0 -0
  176. metadata +278 -362
  177. data/.autotest +0 -26
  178. data/.gemtest +0 -0
  179. data/.travis.yml +0 -27
  180. data/CHANGELOG.ja.rdoc +0 -819
  181. data/CHANGELOG.rdoc +0 -819
  182. data/C_CODING_STYLE.rdoc +0 -33
  183. data/Manifest.txt +0 -315
  184. data/README.ja.rdoc +0 -106
  185. data/README.rdoc +0 -175
  186. data/ROADMAP.md +0 -90
  187. data/Rakefile +0 -246
  188. data/STANDARD_RESPONSES.md +0 -47
  189. data/Y_U_NO_GEMSPEC.md +0 -155
  190. data/build_all +0 -105
  191. data/ext/nokogiri/html_document.c +0 -170
  192. data/ext/nokogiri/html_document.h +0 -10
  193. data/ext/nokogiri/html_element_description.c +0 -279
  194. data/ext/nokogiri/html_element_description.h +0 -10
  195. data/ext/nokogiri/html_entity_lookup.c +0 -32
  196. data/ext/nokogiri/html_entity_lookup.h +0 -8
  197. data/ext/nokogiri/html_sax_parser_context.c +0 -116
  198. data/ext/nokogiri/html_sax_parser_context.h +0 -11
  199. data/ext/nokogiri/html_sax_push_parser.c +0 -87
  200. data/ext/nokogiri/html_sax_push_parser.h +0 -9
  201. data/ext/nokogiri/xml_attr.h +0 -9
  202. data/ext/nokogiri/xml_attribute_decl.h +0 -9
  203. data/ext/nokogiri/xml_cdata.h +0 -9
  204. data/ext/nokogiri/xml_comment.h +0 -9
  205. data/ext/nokogiri/xml_document.h +0 -23
  206. data/ext/nokogiri/xml_document_fragment.h +0 -10
  207. data/ext/nokogiri/xml_dtd.h +0 -10
  208. data/ext/nokogiri/xml_element_content.h +0 -10
  209. data/ext/nokogiri/xml_element_decl.h +0 -9
  210. data/ext/nokogiri/xml_encoding_handler.h +0 -8
  211. data/ext/nokogiri/xml_entity_decl.h +0 -10
  212. data/ext/nokogiri/xml_entity_reference.h +0 -9
  213. data/ext/nokogiri/xml_io.c +0 -56
  214. data/ext/nokogiri/xml_io.h +0 -11
  215. data/ext/nokogiri/xml_libxml2_hacks.c +0 -112
  216. data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
  217. data/ext/nokogiri/xml_namespace.h +0 -13
  218. data/ext/nokogiri/xml_node.h +0 -13
  219. data/ext/nokogiri/xml_node_set.h +0 -14
  220. data/ext/nokogiri/xml_processing_instruction.h +0 -9
  221. data/ext/nokogiri/xml_reader.h +0 -10
  222. data/ext/nokogiri/xml_relax_ng.h +0 -9
  223. data/ext/nokogiri/xml_sax_parser.h +0 -39
  224. data/ext/nokogiri/xml_sax_parser_context.h +0 -10
  225. data/ext/nokogiri/xml_sax_push_parser.h +0 -9
  226. data/ext/nokogiri/xml_schema.h +0 -9
  227. data/ext/nokogiri/xml_syntax_error.h +0 -13
  228. data/ext/nokogiri/xml_text.h +0 -9
  229. data/ext/nokogiri/xml_xpath_context.h +0 -10
  230. data/ext/nokogiri/xslt_stylesheet.h +0 -14
  231. data/lib/nokogiri/html/document.rb +0 -254
  232. data/lib/nokogiri/html/document_fragment.rb +0 -41
  233. data/lib/nokogiri/html/element_description_defaults.rb +0 -671
  234. data/lib/nokogiri/html/sax/parser_context.rb +0 -16
  235. data/lib/nokogiri/html/sax/push_parser.rb +0 -16
  236. data/ports/archives/libxml2-2.8.0.tar.gz +0 -0
  237. data/ports/archives/libxslt-1.1.26.tar.gz +0 -0
  238. data/tasks/cross_compile.rb +0 -132
  239. data/tasks/nokogiri.org.rb +0 -24
  240. data/tasks/test.rb +0 -95
  241. data/test/css/test_nthiness.rb +0 -159
  242. data/test/css/test_parser.rb +0 -341
  243. data/test/css/test_tokenizer.rb +0 -198
  244. data/test/css/test_xpath_visitor.rb +0 -91
  245. data/test/decorators/test_slop.rb +0 -16
  246. data/test/files/2ch.html +0 -108
  247. data/test/files/address_book.rlx +0 -12
  248. data/test/files/address_book.xml +0 -10
  249. data/test/files/bar/bar.xsd +0 -4
  250. data/test/files/bogus.xml +0 -0
  251. data/test/files/dont_hurt_em_why.xml +0 -422
  252. data/test/files/encoding.html +0 -82
  253. data/test/files/encoding.xhtml +0 -84
  254. data/test/files/exslt.xml +0 -8
  255. data/test/files/exslt.xslt +0 -35
  256. data/test/files/foo/foo.xsd +0 -4
  257. data/test/files/metacharset.html +0 -10
  258. data/test/files/noencoding.html +0 -47
  259. data/test/files/po.xml +0 -32
  260. data/test/files/po.xsd +0 -66
  261. data/test/files/saml/saml20assertion_schema.xsd +0 -283
  262. data/test/files/saml/saml20protocol_schema.xsd +0 -302
  263. data/test/files/saml/xenc_schema.xsd +0 -146
  264. data/test/files/saml/xmldsig_schema.xsd +0 -318
  265. data/test/files/shift_jis.html +0 -10
  266. data/test/files/shift_jis.xml +0 -5
  267. data/test/files/snuggles.xml +0 -3
  268. data/test/files/staff.dtd +0 -10
  269. data/test/files/staff.xml +0 -59
  270. data/test/files/staff.xslt +0 -32
  271. data/test/files/test_document_url/bar.xml +0 -2
  272. data/test/files/test_document_url/document.dtd +0 -4
  273. data/test/files/test_document_url/document.xml +0 -6
  274. data/test/files/tlm.html +0 -850
  275. data/test/files/to_be_xincluded.xml +0 -2
  276. data/test/files/valid_bar.xml +0 -2
  277. data/test/files/xinclude.xml +0 -4
  278. data/test/helper.rb +0 -154
  279. data/test/html/sax/test_parser.rb +0 -141
  280. data/test/html/sax/test_parser_context.rb +0 -46
  281. data/test/html/test_builder.rb +0 -164
  282. data/test/html/test_document.rb +0 -552
  283. data/test/html/test_document_encoding.rb +0 -138
  284. data/test/html/test_document_fragment.rb +0 -261
  285. data/test/html/test_element_description.rb +0 -105
  286. data/test/html/test_named_characters.rb +0 -14
  287. data/test/html/test_node.rb +0 -196
  288. data/test/html/test_node_encoding.rb +0 -27
  289. data/test/namespaces/test_additional_namespaces_in_builder_doc.rb +0 -14
  290. data/test/namespaces/test_namespaces_in_builder_doc.rb +0 -75
  291. data/test/namespaces/test_namespaces_in_created_doc.rb +0 -75
  292. data/test/namespaces/test_namespaces_in_parsed_doc.rb +0 -66
  293. data/test/test_convert_xpath.rb +0 -135
  294. data/test/test_css_cache.rb +0 -45
  295. data/test/test_encoding_handler.rb +0 -46
  296. data/test/test_memory_leak.rb +0 -156
  297. data/test/test_nokogiri.rb +0 -132
  298. data/test/test_reader.rb +0 -555
  299. data/test/test_soap4r_sax.rb +0 -52
  300. data/test/test_xslt_transforms.rb +0 -254
  301. data/test/xml/node/test_save_options.rb +0 -28
  302. data/test/xml/node/test_subclass.rb +0 -44
  303. data/test/xml/sax/test_parser.rb +0 -366
  304. data/test/xml/sax/test_parser_context.rb +0 -106
  305. data/test/xml/sax/test_push_parser.rb +0 -157
  306. data/test/xml/test_attr.rb +0 -64
  307. data/test/xml/test_attribute_decl.rb +0 -86
  308. data/test/xml/test_builder.rb +0 -306
  309. data/test/xml/test_c14n.rb +0 -151
  310. data/test/xml/test_cdata.rb +0 -48
  311. data/test/xml/test_comment.rb +0 -29
  312. data/test/xml/test_document.rb +0 -828
  313. data/test/xml/test_document_encoding.rb +0 -28
  314. data/test/xml/test_document_fragment.rb +0 -223
  315. data/test/xml/test_dtd.rb +0 -103
  316. data/test/xml/test_dtd_encoding.rb +0 -33
  317. data/test/xml/test_element_content.rb +0 -56
  318. data/test/xml/test_element_decl.rb +0 -73
  319. data/test/xml/test_entity_decl.rb +0 -122
  320. data/test/xml/test_entity_reference.rb +0 -245
  321. data/test/xml/test_namespace.rb +0 -95
  322. data/test/xml/test_node.rb +0 -1137
  323. data/test/xml/test_node_attributes.rb +0 -96
  324. data/test/xml/test_node_encoding.rb +0 -107
  325. data/test/xml/test_node_inheritance.rb +0 -32
  326. data/test/xml/test_node_reparenting.rb +0 -374
  327. data/test/xml/test_node_set.rb +0 -755
  328. data/test/xml/test_parse_options.rb +0 -64
  329. data/test/xml/test_processing_instruction.rb +0 -30
  330. data/test/xml/test_reader_encoding.rb +0 -142
  331. data/test/xml/test_relax_ng.rb +0 -60
  332. data/test/xml/test_schema.rb +0 -103
  333. data/test/xml/test_syntax_error.rb +0 -12
  334. data/test/xml/test_text.rb +0 -45
  335. data/test/xml/test_unparented_node.rb +0 -422
  336. data/test/xml/test_xinclude.rb +0 -83
  337. data/test/xml/test_xpath.rb +0 -295
  338. data/test/xslt/test_custom_functions.rb +0 -133
  339. data/test/xslt/test_exception_handling.rb +0 -37
  340. data/test_all +0 -81
@@ -0,0 +1,112 @@
1
+ #ifndef GUMBO_TOKENIZER_H_
2
+ #define GUMBO_TOKENIZER_H_
3
+
4
+ // This contains an implementation of a tokenizer for HTML5. It consumes a
5
+ // buffer of UTF-8 characters, and then emits a stream of tokens.
6
+
7
+ #include <stdbool.h>
8
+ #include <stddef.h>
9
+
10
+ #include "gumbo.h"
11
+ #include "token_type.h"
12
+ #include "tokenizer_states.h"
13
+
14
+ #ifdef __cplusplus
15
+ extern "C" {
16
+ #endif
17
+
18
+ struct GumboInternalParser;
19
+
20
+ // Struct containing all information pertaining to doctype tokens.
21
+ typedef struct GumboInternalTokenDocType {
22
+ const char* name;
23
+ const char* public_identifier;
24
+ const char* system_identifier;
25
+ bool force_quirks;
26
+ // There's no way to tell a 0-length public or system ID apart from the
27
+ // absence of a public or system ID, but they're handled differently by the
28
+ // spec, so we need bool flags for them.
29
+ bool has_public_identifier;
30
+ bool has_system_identifier;
31
+ } GumboTokenDocType;
32
+
33
+ // Struct containing all information pertaining to start tag tokens.
34
+ typedef struct GumboInternalTokenStartTag {
35
+ GumboTag tag;
36
+ // NULL unless tag is GUMBO_TAG_UNKNOWN
37
+ char *name;
38
+ GumboVector /* GumboAttribute */ attributes;
39
+ bool is_self_closing;
40
+ } GumboTokenStartTag;
41
+
42
+ // Struct containing all information pertaining to end tag tokens.
43
+ typedef struct GumboInternalTokenEndTag {
44
+ GumboTag tag;
45
+ // NULL unless tag is GUMBO_TAG_UNKNOWN
46
+ char *name;
47
+ } GumboTokenEndTag;
48
+
49
+ // A data structure representing a single token in the input stream. This
50
+ // contains an enum for the type, the source position, a GumboStringPiece
51
+ // pointing to the original text, and then a union for any parsed data.
52
+ typedef struct GumboInternalToken {
53
+ GumboTokenType type;
54
+ GumboSourcePosition position;
55
+ GumboStringPiece original_text;
56
+ union {
57
+ GumboTokenDocType doc_type;
58
+ GumboTokenStartTag start_tag;
59
+ GumboTokenEndTag end_tag;
60
+ const char* text; // For comments.
61
+ int character; // For character, whitespace, null, and EOF tokens.
62
+ } v;
63
+ } GumboToken;
64
+
65
+ // Initializes the tokenizer state within the GumboParser object, setting up a
66
+ // parse of the specified text.
67
+ void gumbo_tokenizer_state_init (
68
+ struct GumboInternalParser* parser,
69
+ const char* text,
70
+ size_t text_length
71
+ );
72
+
73
+ // Destroys the tokenizer state within the GumboParser object, freeing any
74
+ // dynamically-allocated structures within it.
75
+ void gumbo_tokenizer_state_destroy(struct GumboInternalParser* parser);
76
+
77
+ // Sets the tokenizer state to the specified value. This is needed by some
78
+ // parser states, which alter the state of the tokenizer in response to tags
79
+ // seen.
80
+ void gumbo_tokenizer_set_state (
81
+ struct GumboInternalParser* parser,
82
+ GumboTokenizerEnum state
83
+ );
84
+
85
+ // Flags whether the adjusted current node is a foreign content element. This
86
+ // is necessary for the markup declaration open state, where the tokenizer
87
+ // must be aware of the state of the parser to properly tokenize bad comment
88
+ // tags.
89
+ // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
90
+ void gumbo_tokenizer_set_is_adjusted_current_node_foreign (
91
+ struct GumboInternalParser* parser,
92
+ bool is_foreign
93
+ );
94
+
95
+ // Lexes a single token from the specified buffer, filling the output with the
96
+ // parsed GumboToken data structure.
97
+ void gumbo_lex(struct GumboInternalParser* parser, GumboToken* output);
98
+
99
+ // Frees the internally-allocated pointers within a GumboToken. Note that this
100
+ // doesn't free the token itself, since oftentimes it will be allocated on the
101
+ // stack.
102
+ //
103
+ // Note that if you are handing over ownership of the internal strings to some
104
+ // other data structure - for example, a parse tree - these do not need to be
105
+ // freed.
106
+ void gumbo_token_destroy(GumboToken* token);
107
+
108
+ #ifdef __cplusplus
109
+ }
110
+ #endif
111
+
112
+ #endif // GUMBO_TOKENIZER_H_
@@ -0,0 +1,339 @@
1
+ #ifndef GUMBO_TOKENIZER_STATES_H_
2
+ #define GUMBO_TOKENIZER_STATES_H_
3
+
4
+ // This contains the list of states used in the tokenizer. Although at first
5
+ // glance it seems like these could be kept internal to the tokenizer, several
6
+ // of the actions in the parser require that it reach into the tokenizer and
7
+ // reset the tokenizer state. For that to work, it needs to have the
8
+ // definitions of individual states available.
9
+ //
10
+ // This may also be useful for providing more detailed error messages for parse
11
+ // errors, as we can match up states and inputs in a table without having to
12
+ // clutter the tokenizer code with lots of precise error messages.
13
+
14
+ // The ordering of this enum is also used to build the dispatch table for the
15
+ // tokenizer state machine, so if it is changed, be sure to update that too.
16
+ typedef enum {
17
+ // 12.2.5.1 Data state
18
+ // https://html.spec.whatwg.org/multipage/parsing.html#data-state
19
+ GUMBO_LEX_DATA,
20
+
21
+ // 12.2.5.2 RCDATA state
22
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
23
+ GUMBO_LEX_RCDATA,
24
+
25
+ // 12.2.5.3 RAWTEXT state
26
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
27
+ GUMBO_LEX_RAWTEXT,
28
+
29
+ // 12.2.5.4 Script data state
30
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
31
+ GUMBO_LEX_SCRIPT_DATA,
32
+
33
+ // 12.2.5.5 PLAINTEXT state
34
+ // https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state
35
+ GUMBO_LEX_PLAINTEXT,
36
+
37
+ // 12.2.5.6 Tag open state
38
+ // https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
39
+ GUMBO_LEX_TAG_OPEN,
40
+
41
+ // 12.2.5.7 End tag open state
42
+ // https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
43
+ GUMBO_LEX_END_TAG_OPEN,
44
+
45
+ // 12.2.5.8 Tag name state
46
+ // https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
47
+ GUMBO_LEX_TAG_NAME,
48
+
49
+ // 12.2.5.9 RCDATA less-than sign state
50
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-less-than-sign-state
51
+ GUMBO_LEX_RCDATA_LT,
52
+
53
+ // 12.2.5.10 RCDATA end tag open state
54
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state
55
+ GUMBO_LEX_RCDATA_END_TAG_OPEN,
56
+
57
+ // 12.2.5.11 RCDATA end tag name state
58
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state
59
+ GUMBO_LEX_RCDATA_END_TAG_NAME,
60
+
61
+ // 12.2.5.12 RAWTEXT less-than sign state
62
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state
63
+ GUMBO_LEX_RAWTEXT_LT,
64
+
65
+ // 12.2.5.13 RAWTEXT end tag open state
66
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state
67
+ GUMBO_LEX_RAWTEXT_END_TAG_OPEN,
68
+
69
+ // 12.2.5.14 RAWTEXT end tag name state
70
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-name-state
71
+ GUMBO_LEX_RAWTEXT_END_TAG_NAME,
72
+
73
+ // 12.2.5.15 Script data less-than sign state
74
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
75
+ GUMBO_LEX_SCRIPT_DATA_LT,
76
+
77
+ // 12.2.5.16 Script data end tag open state
78
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state
79
+ GUMBO_LEX_SCRIPT_DATA_END_TAG_OPEN,
80
+
81
+ // 12.2.5.17 Script data end tag name state
82
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state
83
+ GUMBO_LEX_SCRIPT_DATA_END_TAG_NAME,
84
+
85
+ // 12.2.5.18 Script data escape start state
86
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state
87
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_START,
88
+
89
+ // 12.2.5.19 Script data escape start dash state
90
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state
91
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_START_DASH,
92
+
93
+ // 12.2.5.20 Script data escaped state
94
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
95
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED,
96
+
97
+ // 12.2.5.21 Script data escaped dash state
98
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state
99
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH,
100
+
101
+ // 12.2.5.22 Script data escaped dash dash state
102
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state
103
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH_DASH,
104
+
105
+ // 12.2.5.23 Script data escaped less-than sign state
106
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state
107
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT,
108
+
109
+ // 12.2.5.24 Script data escaped end tag open state
110
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state
111
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_OPEN,
112
+
113
+ // 12.2.5.25 Script data escaped end tag name state
114
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state
115
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_NAME,
116
+
117
+ // 12.2.5.26 Script data double escape start state
118
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state
119
+ GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_START,
120
+
121
+ // 12.2.5.27 Script data double escaped state
122
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
123
+ GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED,
124
+
125
+ // 12.2.5.28 Script data double escaped dash state
126
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state
127
+ GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH,
128
+
129
+ // 12.2.5.29 Script data double escaped dash dash state
130
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state
131
+ GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH,
132
+
133
+ // 12.2.5.30 Script data double escaped less-than sign state
134
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state
135
+ GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT,
136
+
137
+ // 12.2.5.31 Script data double escape end state (XXX: spec bug with the
138
+ // name?)
139
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state
140
+ GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_END,
141
+
142
+ // 12.2.5.32 Before attribute name state
143
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
144
+ GUMBO_LEX_BEFORE_ATTR_NAME,
145
+
146
+ // 12.2.5.33 Attribute name state
147
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
148
+ GUMBO_LEX_ATTR_NAME,
149
+
150
+ // 12.2.5.34 After attribute name state
151
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
152
+ GUMBO_LEX_AFTER_ATTR_NAME,
153
+
154
+ // 12.2.5.35 Before attribute value state
155
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state
156
+ GUMBO_LEX_BEFORE_ATTR_VALUE,
157
+
158
+ // 12.2.5.36 Attribute value (double-quoted) state
159
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(double-quoted)-state
160
+ GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED,
161
+
162
+ // 12.2.5.37 Attribute value (single-quoted) state
163
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(single-quoted)-state
164
+ GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED,
165
+
166
+ // 12.2.5.38 Attribute value (unquoted) state
167
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(unquoted)-state
168
+ GUMBO_LEX_ATTR_VALUE_UNQUOTED,
169
+
170
+ // 12.2.5.39 After attribute value (quoted) state
171
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-(quoted)-state
172
+ GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED,
173
+
174
+ // 12.2.5.40 Self-closing start tag state
175
+ // https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state
176
+ GUMBO_LEX_SELF_CLOSING_START_TAG,
177
+
178
+ // 12.2.5.41 Bogus comment state
179
+ // https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
180
+ GUMBO_LEX_BOGUS_COMMENT,
181
+
182
+ // 12.2.5.42 Markup declaration open state
183
+ // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
184
+ GUMBO_LEX_MARKUP_DECLARATION_OPEN,
185
+
186
+ // 12.2.5.43 Comment start state
187
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
188
+ GUMBO_LEX_COMMENT_START,
189
+
190
+ // 12.2.5.44 Comment start dash state
191
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-start-dash-state
192
+ GUMBO_LEX_COMMENT_START_DASH,
193
+
194
+ // 12.2.5.45 Comment state
195
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-state
196
+ GUMBO_LEX_COMMENT,
197
+
198
+ // 12.2.5.46 Comment less-than sign state
199
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-state
200
+ GUMBO_LEX_COMMENT_LT,
201
+
202
+ // 12.2.5.47 Comment less-than sign bang state
203
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-state
204
+ GUMBO_LEX_COMMENT_LT_BANG,
205
+
206
+ // 12.2.5.48 Comment less-than sign bang dash state
207
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-state
208
+ GUMBO_LEX_COMMENT_LT_BANG_DASH,
209
+
210
+ // 12.2.5.49 Comment less-than sign bang dash dash state
211
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-dash-state
212
+ GUMBO_LEX_COMMENT_LT_BANG_DASH_DASH,
213
+
214
+ // 12.2.5.50 Comment end dash state
215
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-dash-state
216
+ GUMBO_LEX_COMMENT_END_DASH,
217
+
218
+ // 12.2.5.51 Comment end state
219
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
220
+ GUMBO_LEX_COMMENT_END,
221
+
222
+ // 12.2.5.52 Comment end bang state
223
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state
224
+ GUMBO_LEX_COMMENT_END_BANG,
225
+
226
+ // 12.2.5.53 DOCTYPE state
227
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-state
228
+ GUMBO_LEX_DOCTYPE,
229
+
230
+ // 12.2.5.54 Before DOCTYPE name state
231
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-name-state
232
+ GUMBO_LEX_BEFORE_DOCTYPE_NAME,
233
+
234
+ // 12.2.5.55 DOCTYPE name state
235
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-name-state
236
+ GUMBO_LEX_DOCTYPE_NAME,
237
+
238
+ // 12.2.5.56 After DOCTYPE name state
239
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-name-state
240
+ GUMBO_LEX_AFTER_DOCTYPE_NAME,
241
+
242
+ // 12.2.5.57 After DOCTYPE public keyword state
243
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-keyword-state
244
+ GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD,
245
+
246
+ // 12.2.5.58 Before DOCTYPE public identifier state
247
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-public-identifier-state
248
+ GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID,
249
+
250
+ // 12.2.5.59 DOCTYPE public identifier (double-quoted) state
251
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(double-quoted)-state
252
+ GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED,
253
+
254
+ // 12.2.5.60 DOCTYPE public identifier (single-quoted) state
255
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(single-quoted)-state
256
+ GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED,
257
+
258
+ // 12.2.5.61 After DOCTYPE public identifier state
259
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-identifier-state
260
+ GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID,
261
+
262
+ // 12.2.5.62 Between DOCTYPE public and system identifiers state
263
+ // https://html.spec.whatwg.org/multipage/parsing.html#between-doctype-public-and-system-identifiers-state
264
+ GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID,
265
+
266
+ // 12.2.5.63 After DOCTYPE system keyword state
267
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-keyword-state
268
+ GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD,
269
+
270
+ // 12.2.5.64 Before DOCTYPE system identifier state
271
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-system-identifier-state
272
+ GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID,
273
+
274
+ // 12.2.5.65 DOCTYPE system identifier (double-quoted) state
275
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(double-quoted)-state
276
+ GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED,
277
+
278
+ // 12.2.5.66 DOCTYPE system identifier (single-quoted) state
279
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(single-quoted)-state
280
+ GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED,
281
+
282
+ // 12.2.5.67 After DOCTYPE system identifier state
283
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-identifier-state
284
+ GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID,
285
+
286
+ // 12.2.5.68 Bogus DOCTYPE state
287
+ // https://html.spec.whatwg.org/multipage/parsing.html#bogus-doctype-state
288
+ GUMBO_LEX_BOGUS_DOCTYPE,
289
+
290
+ // 12.2.5.69 CDATA section state
291
+ // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state
292
+ GUMBO_LEX_CDATA_SECTION,
293
+
294
+ // 12.2.5.70 CDATA section bracket state
295
+ // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-bracket-state
296
+ GUMBO_LEX_CDATA_SECTION_BRACKET,
297
+
298
+ // 12.2.5.71 CDATA section end state
299
+ // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-end-state
300
+ GUMBO_LEX_CDATA_SECTION_END,
301
+
302
+ // 12.2.5.72 Character reference state
303
+ // https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
304
+ GUMBO_LEX_CHARACTER_REFERENCE,
305
+
306
+ // 12.2.5.73 Named character reference state
307
+ // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
308
+ GUMBO_LEX_NAMED_CHARACTER_REFERENCE,
309
+
310
+ // 12.2.5.74 Ambiguous ampersand state
311
+ // https://html.spec.whatwg.org/multipage/parsing.html#ambiguous-ampersand-state
312
+ GUMBO_LEX_AMBIGUOUS_AMPERSAND,
313
+
314
+ // 12.2.5.75 Numeric character reference state
315
+ // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-state
316
+ GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE,
317
+
318
+ // 12.2.5.76 Hexadecimal character reference start state
319
+ // https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-start-state
320
+ GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE_START,
321
+
322
+ // 12.2.5.77 Decimal character reference start state
323
+ // https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state
324
+ GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE_START,
325
+
326
+ // 12.2.5.78 Hexadecimal character reference state
327
+ // https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-state
328
+ GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE,
329
+
330
+ // 12.2.5.79 Decimal character reference state
331
+ // https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-state
332
+ GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE,
333
+
334
+ // 12.2.5.80 Numeric character reference end state
335
+ // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
336
+ GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END
337
+ } GumboTokenizerEnum;
338
+
339
+ #endif // GUMBO_TOKENIZER_STATES_H_