nokogiri 1.5.10 → 1.13.0

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (334) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +5 -0
  3. data/LICENSE-DEPENDENCIES.md +1903 -0
  4. data/LICENSE.md +9 -0
  5. data/README.md +280 -0
  6. data/bin/nokogiri +84 -31
  7. data/dependencies.yml +73 -0
  8. data/ext/nokogiri/depend +38 -358
  9. data/ext/nokogiri/extconf.rb +956 -100
  10. data/ext/nokogiri/gumbo.c +584 -0
  11. data/ext/nokogiri/html4_document.c +166 -0
  12. data/ext/nokogiri/html4_element_description.c +294 -0
  13. data/ext/nokogiri/html4_entity_lookup.c +37 -0
  14. data/ext/nokogiri/html4_sax_parser_context.c +120 -0
  15. data/ext/nokogiri/html4_sax_push_parser.c +95 -0
  16. data/ext/nokogiri/libxml2_backwards_compat.c +121 -0
  17. data/ext/nokogiri/nokogiri.c +232 -87
  18. data/ext/nokogiri/nokogiri.h +188 -129
  19. data/ext/nokogiri/test_global_handlers.c +40 -0
  20. data/ext/nokogiri/xml_attr.c +49 -40
  21. data/ext/nokogiri/xml_attribute_decl.c +18 -18
  22. data/ext/nokogiri/xml_cdata.c +24 -23
  23. data/ext/nokogiri/xml_comment.c +29 -21
  24. data/ext/nokogiri/xml_document.c +327 -223
  25. data/ext/nokogiri/xml_document_fragment.c +12 -16
  26. data/ext/nokogiri/xml_dtd.c +56 -50
  27. data/ext/nokogiri/xml_element_content.c +31 -26
  28. data/ext/nokogiri/xml_element_decl.c +22 -22
  29. data/ext/nokogiri/xml_encoding_handler.c +45 -20
  30. data/ext/nokogiri/xml_entity_decl.c +32 -30
  31. data/ext/nokogiri/xml_entity_reference.c +16 -18
  32. data/ext/nokogiri/xml_namespace.c +74 -32
  33. data/ext/nokogiri/xml_node.c +1290 -680
  34. data/ext/nokogiri/xml_node_set.c +239 -208
  35. data/ext/nokogiri/xml_processing_instruction.c +17 -19
  36. data/ext/nokogiri/xml_reader.c +227 -189
  37. data/ext/nokogiri/xml_relax_ng.c +52 -28
  38. data/ext/nokogiri/xml_sax_parser.c +123 -125
  39. data/ext/nokogiri/xml_sax_parser_context.c +138 -79
  40. data/ext/nokogiri/xml_sax_push_parser.c +88 -35
  41. data/ext/nokogiri/xml_schema.c +112 -33
  42. data/ext/nokogiri/xml_syntax_error.c +50 -23
  43. data/ext/nokogiri/xml_text.c +14 -18
  44. data/ext/nokogiri/xml_xpath_context.c +227 -140
  45. data/ext/nokogiri/xslt_stylesheet.c +162 -168
  46. data/gumbo-parser/CHANGES.md +63 -0
  47. data/gumbo-parser/Makefile +101 -0
  48. data/gumbo-parser/THANKS +27 -0
  49. data/gumbo-parser/src/Makefile +34 -0
  50. data/gumbo-parser/src/README.md +41 -0
  51. data/gumbo-parser/src/ascii.c +75 -0
  52. data/gumbo-parser/src/ascii.h +115 -0
  53. data/gumbo-parser/src/attribute.c +42 -0
  54. data/gumbo-parser/src/attribute.h +17 -0
  55. data/gumbo-parser/src/char_ref.c +22225 -0
  56. data/gumbo-parser/src/char_ref.h +29 -0
  57. data/gumbo-parser/src/char_ref.rl +2154 -0
  58. data/gumbo-parser/src/error.c +626 -0
  59. data/gumbo-parser/src/error.h +148 -0
  60. data/gumbo-parser/src/foreign_attrs.c +104 -0
  61. data/gumbo-parser/src/foreign_attrs.gperf +27 -0
  62. data/gumbo-parser/src/gumbo.h +943 -0
  63. data/gumbo-parser/src/insertion_mode.h +33 -0
  64. data/gumbo-parser/src/macros.h +91 -0
  65. data/gumbo-parser/src/parser.c +4875 -0
  66. data/gumbo-parser/src/parser.h +41 -0
  67. data/gumbo-parser/src/replacement.h +33 -0
  68. data/gumbo-parser/src/string_buffer.c +103 -0
  69. data/gumbo-parser/src/string_buffer.h +68 -0
  70. data/gumbo-parser/src/string_piece.c +48 -0
  71. data/gumbo-parser/src/svg_attrs.c +174 -0
  72. data/gumbo-parser/src/svg_attrs.gperf +77 -0
  73. data/gumbo-parser/src/svg_tags.c +137 -0
  74. data/gumbo-parser/src/svg_tags.gperf +55 -0
  75. data/gumbo-parser/src/tag.c +222 -0
  76. data/gumbo-parser/src/tag_lookup.c +382 -0
  77. data/gumbo-parser/src/tag_lookup.gperf +169 -0
  78. data/gumbo-parser/src/tag_lookup.h +13 -0
  79. data/gumbo-parser/src/token_buffer.c +79 -0
  80. data/gumbo-parser/src/token_buffer.h +71 -0
  81. data/gumbo-parser/src/token_type.h +17 -0
  82. data/gumbo-parser/src/tokenizer.c +3463 -0
  83. data/gumbo-parser/src/tokenizer.h +112 -0
  84. data/gumbo-parser/src/tokenizer_states.h +339 -0
  85. data/gumbo-parser/src/utf8.c +245 -0
  86. data/gumbo-parser/src/utf8.h +164 -0
  87. data/gumbo-parser/src/util.c +68 -0
  88. data/gumbo-parser/src/util.h +30 -0
  89. data/gumbo-parser/src/vector.c +111 -0
  90. data/gumbo-parser/src/vector.h +45 -0
  91. data/lib/nokogiri/class_resolver.rb +67 -0
  92. data/lib/nokogiri/css/node.rb +10 -58
  93. data/lib/nokogiri/css/parser.rb +327 -288
  94. data/lib/nokogiri/css/parser.y +67 -45
  95. data/lib/nokogiri/css/parser_extras.rb +52 -49
  96. data/lib/nokogiri/css/syntax_error.rb +3 -1
  97. data/lib/nokogiri/css/tokenizer.rb +107 -104
  98. data/lib/nokogiri/css/tokenizer.rex +7 -6
  99. data/lib/nokogiri/css/xpath_visitor.rb +263 -75
  100. data/lib/nokogiri/css.rb +50 -17
  101. data/lib/nokogiri/decorators/slop.rb +17 -8
  102. data/lib/nokogiri/extension.rb +31 -0
  103. data/lib/nokogiri/gumbo.rb +15 -0
  104. data/lib/nokogiri/html.rb +38 -27
  105. data/lib/nokogiri/{html → html4}/builder.rb +4 -2
  106. data/lib/nokogiri/html4/document.rb +331 -0
  107. data/lib/nokogiri/html4/document_fragment.rb +54 -0
  108. data/lib/nokogiri/{html → html4}/element_description.rb +3 -1
  109. data/lib/nokogiri/html4/element_description_defaults.rb +578 -0
  110. data/lib/nokogiri/{html → html4}/entity_lookup.rb +4 -2
  111. data/lib/nokogiri/{html → html4}/sax/parser.rb +24 -15
  112. data/lib/nokogiri/html4/sax/parser_context.rb +20 -0
  113. data/lib/nokogiri/html4/sax/push_parser.rb +37 -0
  114. data/lib/nokogiri/html4.rb +46 -0
  115. data/lib/nokogiri/html5/document.rb +88 -0
  116. data/lib/nokogiri/html5/document_fragment.rb +83 -0
  117. data/lib/nokogiri/html5/node.rb +96 -0
  118. data/lib/nokogiri/html5.rb +477 -0
  119. data/lib/nokogiri/jruby/dependencies.rb +21 -0
  120. data/lib/nokogiri/syntax_error.rb +2 -0
  121. data/lib/nokogiri/version/constant.rb +6 -0
  122. data/lib/nokogiri/version/info.rb +221 -0
  123. data/lib/nokogiri/version.rb +3 -90
  124. data/lib/nokogiri/xml/attr.rb +6 -3
  125. data/lib/nokogiri/xml/attribute_decl.rb +3 -1
  126. data/lib/nokogiri/xml/builder.rb +96 -54
  127. data/lib/nokogiri/xml/cdata.rb +3 -1
  128. data/lib/nokogiri/xml/character_data.rb +2 -0
  129. data/lib/nokogiri/xml/document.rb +234 -95
  130. data/lib/nokogiri/xml/document_fragment.rb +86 -36
  131. data/lib/nokogiri/xml/dtd.rb +16 -4
  132. data/lib/nokogiri/xml/element_content.rb +2 -0
  133. data/lib/nokogiri/xml/element_decl.rb +3 -1
  134. data/lib/nokogiri/xml/entity_decl.rb +4 -2
  135. data/lib/nokogiri/xml/entity_reference.rb +20 -0
  136. data/lib/nokogiri/xml/namespace.rb +3 -0
  137. data/lib/nokogiri/xml/node/save_options.rb +8 -4
  138. data/lib/nokogiri/xml/node.rb +947 -502
  139. data/lib/nokogiri/xml/node_set.rb +168 -159
  140. data/lib/nokogiri/xml/notation.rb +13 -0
  141. data/lib/nokogiri/xml/parse_options.rb +40 -5
  142. data/lib/nokogiri/xml/pp/character_data.rb +9 -6
  143. data/lib/nokogiri/xml/pp/node.rb +25 -26
  144. data/lib/nokogiri/xml/pp.rb +4 -2
  145. data/lib/nokogiri/xml/processing_instruction.rb +3 -1
  146. data/lib/nokogiri/xml/reader.rb +23 -28
  147. data/lib/nokogiri/xml/relax_ng.rb +8 -2
  148. data/lib/nokogiri/xml/sax/document.rb +45 -49
  149. data/lib/nokogiri/xml/sax/parser.rb +43 -41
  150. data/lib/nokogiri/xml/sax/parser_context.rb +8 -3
  151. data/lib/nokogiri/xml/sax/push_parser.rb +6 -5
  152. data/lib/nokogiri/xml/sax.rb +6 -4
  153. data/lib/nokogiri/xml/schema.rb +19 -9
  154. data/lib/nokogiri/xml/searchable.rb +259 -0
  155. data/lib/nokogiri/xml/syntax_error.rb +25 -1
  156. data/lib/nokogiri/xml/text.rb +2 -0
  157. data/lib/nokogiri/xml/xpath/syntax_error.rb +4 -2
  158. data/lib/nokogiri/xml/xpath.rb +15 -4
  159. data/lib/nokogiri/xml/xpath_context.rb +3 -3
  160. data/lib/nokogiri/xml.rb +38 -36
  161. data/lib/nokogiri/xslt/stylesheet.rb +3 -1
  162. data/lib/nokogiri/xslt.rb +18 -16
  163. data/lib/nokogiri.rb +69 -69
  164. data/lib/xsd/xmlparser/nokogiri.rb +26 -24
  165. data/patches/libxml2/0001-Remove-script-macro-support.patch +40 -0
  166. data/patches/libxml2/0002-Update-entities-to-remove-handling-of-ssi.patch +44 -0
  167. data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +25 -0
  168. data/patches/libxml2/0004-use-glibc-strlen.patch +53 -0
  169. data/patches/libxml2/0005-avoid-isnan-isinf.patch +81 -0
  170. data/patches/libxml2/0006-update-automake-files-for-arm64.patch +2511 -0
  171. data/patches/libxml2/0007-Fix-XPath-recursion-limit.patch +31 -0
  172. data/patches/libxml2/0008-htmlParseComment-handle-abruptly-closed-comments.patch +61 -0
  173. data/patches/libxml2/0009-allow-wildcard-namespaces.patch +77 -0
  174. data/patches/libxslt/0001-update-automake-files-for-arm64.patch +2511 -0
  175. data/patches/libxslt/0002-Fix-xml2-config-check-in-configure-script.patch +19 -0
  176. data/ports/archives/libxml2-2.9.12.tar.gz +0 -0
  177. data/ports/archives/libxslt-1.1.34.tar.gz +0 -0
  178. metadata +382 -460
  179. data/.autotest +0 -26
  180. data/.gemtest +0 -0
  181. data/CHANGELOG.ja.rdoc +0 -785
  182. data/CHANGELOG.rdoc +0 -783
  183. data/C_CODING_STYLE.rdoc +0 -33
  184. data/Manifest.txt +0 -303
  185. data/README.ja.rdoc +0 -106
  186. data/README.rdoc +0 -175
  187. data/ROADMAP.md +0 -90
  188. data/Rakefile +0 -228
  189. data/STANDARD_RESPONSES.md +0 -47
  190. data/Y_U_NO_GEMSPEC.md +0 -155
  191. data/build_all +0 -105
  192. data/ext/nokogiri/html_document.c +0 -170
  193. data/ext/nokogiri/html_document.h +0 -10
  194. data/ext/nokogiri/html_element_description.c +0 -279
  195. data/ext/nokogiri/html_element_description.h +0 -10
  196. data/ext/nokogiri/html_entity_lookup.c +0 -32
  197. data/ext/nokogiri/html_entity_lookup.h +0 -8
  198. data/ext/nokogiri/html_sax_parser_context.c +0 -116
  199. data/ext/nokogiri/html_sax_parser_context.h +0 -11
  200. data/ext/nokogiri/html_sax_push_parser.c +0 -87
  201. data/ext/nokogiri/html_sax_push_parser.h +0 -9
  202. data/ext/nokogiri/xml_attr.h +0 -9
  203. data/ext/nokogiri/xml_attribute_decl.h +0 -9
  204. data/ext/nokogiri/xml_cdata.h +0 -9
  205. data/ext/nokogiri/xml_comment.h +0 -9
  206. data/ext/nokogiri/xml_document.h +0 -23
  207. data/ext/nokogiri/xml_document_fragment.h +0 -10
  208. data/ext/nokogiri/xml_dtd.h +0 -10
  209. data/ext/nokogiri/xml_element_content.h +0 -10
  210. data/ext/nokogiri/xml_element_decl.h +0 -9
  211. data/ext/nokogiri/xml_encoding_handler.h +0 -8
  212. data/ext/nokogiri/xml_entity_decl.h +0 -10
  213. data/ext/nokogiri/xml_entity_reference.h +0 -9
  214. data/ext/nokogiri/xml_io.c +0 -56
  215. data/ext/nokogiri/xml_io.h +0 -11
  216. data/ext/nokogiri/xml_libxml2_hacks.c +0 -112
  217. data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
  218. data/ext/nokogiri/xml_namespace.h +0 -13
  219. data/ext/nokogiri/xml_node.h +0 -13
  220. data/ext/nokogiri/xml_node_set.h +0 -14
  221. data/ext/nokogiri/xml_processing_instruction.h +0 -9
  222. data/ext/nokogiri/xml_reader.h +0 -10
  223. data/ext/nokogiri/xml_relax_ng.h +0 -9
  224. data/ext/nokogiri/xml_sax_parser.h +0 -39
  225. data/ext/nokogiri/xml_sax_parser_context.h +0 -10
  226. data/ext/nokogiri/xml_sax_push_parser.h +0 -9
  227. data/ext/nokogiri/xml_schema.h +0 -9
  228. data/ext/nokogiri/xml_syntax_error.h +0 -13
  229. data/ext/nokogiri/xml_text.h +0 -9
  230. data/ext/nokogiri/xml_xpath_context.h +0 -10
  231. data/ext/nokogiri/xslt_stylesheet.h +0 -14
  232. data/lib/nokogiri/html/document.rb +0 -254
  233. data/lib/nokogiri/html/document_fragment.rb +0 -41
  234. data/lib/nokogiri/html/element_description_defaults.rb +0 -671
  235. data/lib/nokogiri/html/sax/parser_context.rb +0 -16
  236. data/lib/nokogiri/html/sax/push_parser.rb +0 -16
  237. data/tasks/cross_compile.rb +0 -150
  238. data/tasks/nokogiri.org.rb +0 -24
  239. data/tasks/test.rb +0 -95
  240. data/test/css/test_nthiness.rb +0 -159
  241. data/test/css/test_parser.rb +0 -341
  242. data/test/css/test_tokenizer.rb +0 -198
  243. data/test/css/test_xpath_visitor.rb +0 -91
  244. data/test/decorators/test_slop.rb +0 -16
  245. data/test/files/2ch.html +0 -108
  246. data/test/files/address_book.rlx +0 -12
  247. data/test/files/address_book.xml +0 -10
  248. data/test/files/bar/bar.xsd +0 -4
  249. data/test/files/dont_hurt_em_why.xml +0 -422
  250. data/test/files/encoding.html +0 -82
  251. data/test/files/encoding.xhtml +0 -84
  252. data/test/files/exslt.xml +0 -8
  253. data/test/files/exslt.xslt +0 -35
  254. data/test/files/foo/foo.xsd +0 -4
  255. data/test/files/metacharset.html +0 -10
  256. data/test/files/noencoding.html +0 -47
  257. data/test/files/po.xml +0 -32
  258. data/test/files/po.xsd +0 -66
  259. data/test/files/shift_jis.html +0 -10
  260. data/test/files/shift_jis.xml +0 -5
  261. data/test/files/snuggles.xml +0 -3
  262. data/test/files/staff.dtd +0 -10
  263. data/test/files/staff.xml +0 -59
  264. data/test/files/staff.xslt +0 -32
  265. data/test/files/test_document_url/bar.xml +0 -2
  266. data/test/files/test_document_url/document.dtd +0 -4
  267. data/test/files/test_document_url/document.xml +0 -6
  268. data/test/files/tlm.html +0 -850
  269. data/test/files/to_be_xincluded.xml +0 -2
  270. data/test/files/valid_bar.xml +0 -2
  271. data/test/files/xinclude.xml +0 -4
  272. data/test/helper.rb +0 -154
  273. data/test/html/sax/test_parser.rb +0 -141
  274. data/test/html/sax/test_parser_context.rb +0 -46
  275. data/test/html/test_builder.rb +0 -164
  276. data/test/html/test_document.rb +0 -552
  277. data/test/html/test_document_encoding.rb +0 -138
  278. data/test/html/test_document_fragment.rb +0 -261
  279. data/test/html/test_element_description.rb +0 -105
  280. data/test/html/test_named_characters.rb +0 -14
  281. data/test/html/test_node.rb +0 -196
  282. data/test/html/test_node_encoding.rb +0 -27
  283. data/test/namespaces/test_additional_namespaces_in_builder_doc.rb +0 -14
  284. data/test/namespaces/test_namespaces_in_builder_doc.rb +0 -75
  285. data/test/namespaces/test_namespaces_in_created_doc.rb +0 -75
  286. data/test/namespaces/test_namespaces_in_parsed_doc.rb +0 -66
  287. data/test/test_convert_xpath.rb +0 -135
  288. data/test/test_css_cache.rb +0 -45
  289. data/test/test_encoding_handler.rb +0 -46
  290. data/test/test_memory_leak.rb +0 -156
  291. data/test/test_nokogiri.rb +0 -132
  292. data/test/test_reader.rb +0 -555
  293. data/test/test_soap4r_sax.rb +0 -52
  294. data/test/test_xslt_transforms.rb +0 -254
  295. data/test/xml/node/test_save_options.rb +0 -28
  296. data/test/xml/node/test_subclass.rb +0 -44
  297. data/test/xml/sax/test_parser.rb +0 -366
  298. data/test/xml/sax/test_parser_context.rb +0 -106
  299. data/test/xml/sax/test_push_parser.rb +0 -157
  300. data/test/xml/test_attr.rb +0 -64
  301. data/test/xml/test_attribute_decl.rb +0 -86
  302. data/test/xml/test_builder.rb +0 -306
  303. data/test/xml/test_c14n.rb +0 -151
  304. data/test/xml/test_cdata.rb +0 -48
  305. data/test/xml/test_comment.rb +0 -29
  306. data/test/xml/test_document.rb +0 -828
  307. data/test/xml/test_document_encoding.rb +0 -28
  308. data/test/xml/test_document_fragment.rb +0 -223
  309. data/test/xml/test_dtd.rb +0 -103
  310. data/test/xml/test_dtd_encoding.rb +0 -33
  311. data/test/xml/test_element_content.rb +0 -56
  312. data/test/xml/test_element_decl.rb +0 -73
  313. data/test/xml/test_entity_decl.rb +0 -122
  314. data/test/xml/test_entity_reference.rb +0 -245
  315. data/test/xml/test_namespace.rb +0 -95
  316. data/test/xml/test_node.rb +0 -1137
  317. data/test/xml/test_node_attributes.rb +0 -96
  318. data/test/xml/test_node_encoding.rb +0 -107
  319. data/test/xml/test_node_inheritance.rb +0 -32
  320. data/test/xml/test_node_reparenting.rb +0 -374
  321. data/test/xml/test_node_set.rb +0 -755
  322. data/test/xml/test_parse_options.rb +0 -64
  323. data/test/xml/test_processing_instruction.rb +0 -30
  324. data/test/xml/test_reader_encoding.rb +0 -142
  325. data/test/xml/test_relax_ng.rb +0 -60
  326. data/test/xml/test_schema.rb +0 -103
  327. data/test/xml/test_syntax_error.rb +0 -12
  328. data/test/xml/test_text.rb +0 -45
  329. data/test/xml/test_unparented_node.rb +0 -422
  330. data/test/xml/test_xinclude.rb +0 -83
  331. data/test/xml/test_xpath.rb +0 -295
  332. data/test/xslt/test_custom_functions.rb +0 -133
  333. data/test/xslt/test_exception_handling.rb +0 -37
  334. data/test_all +0 -81
@@ -0,0 +1,112 @@
1
+ #ifndef GUMBO_TOKENIZER_H_
2
+ #define GUMBO_TOKENIZER_H_
3
+
4
+ // This contains an implementation of a tokenizer for HTML5. It consumes a
5
+ // buffer of UTF-8 characters, and then emits a stream of tokens.
6
+
7
+ #include <stdbool.h>
8
+ #include <stddef.h>
9
+
10
+ #include "gumbo.h"
11
+ #include "token_type.h"
12
+ #include "tokenizer_states.h"
13
+
14
+ #ifdef __cplusplus
15
+ extern "C" {
16
+ #endif
17
+
18
+ struct GumboInternalParser;
19
+
20
+ // Struct containing all information pertaining to doctype tokens.
21
+ typedef struct GumboInternalTokenDocType {
22
+ const char* name;
23
+ const char* public_identifier;
24
+ const char* system_identifier;
25
+ bool force_quirks;
26
+ // There's no way to tell a 0-length public or system ID apart from the
27
+ // absence of a public or system ID, but they're handled differently by the
28
+ // spec, so we need bool flags for them.
29
+ bool has_public_identifier;
30
+ bool has_system_identifier;
31
+ } GumboTokenDocType;
32
+
33
+ // Struct containing all information pertaining to start tag tokens.
34
+ typedef struct GumboInternalTokenStartTag {
35
+ GumboTag tag;
36
+ // NULL unless tag is GUMBO_TAG_UNKNOWN
37
+ char *name;
38
+ GumboVector /* GumboAttribute */ attributes;
39
+ bool is_self_closing;
40
+ } GumboTokenStartTag;
41
+
42
+ // Struct containing all information pertaining to end tag tokens.
43
+ typedef struct GumboInternalTokenEndTag {
44
+ GumboTag tag;
45
+ // NULL unless tag is GUMBO_TAG_UNKNOWN
46
+ char *name;
47
+ } GumboTokenEndTag;
48
+
49
+ // A data structure representing a single token in the input stream. This
50
+ // contains an enum for the type, the source position, a GumboStringPiece
51
+ // pointing to the original text, and then a union for any parsed data.
52
+ typedef struct GumboInternalToken {
53
+ GumboTokenType type;
54
+ GumboSourcePosition position;
55
+ GumboStringPiece original_text;
56
+ union {
57
+ GumboTokenDocType doc_type;
58
+ GumboTokenStartTag start_tag;
59
+ GumboTokenEndTag end_tag;
60
+ const char* text; // For comments.
61
+ int character; // For character, whitespace, null, and EOF tokens.
62
+ } v;
63
+ } GumboToken;
64
+
65
+ // Initializes the tokenizer state within the GumboParser object, setting up a
66
+ // parse of the specified text.
67
+ void gumbo_tokenizer_state_init (
68
+ struct GumboInternalParser* parser,
69
+ const char* text,
70
+ size_t text_length
71
+ );
72
+
73
+ // Destroys the tokenizer state within the GumboParser object, freeing any
74
+ // dynamically-allocated structures within it.
75
+ void gumbo_tokenizer_state_destroy(struct GumboInternalParser* parser);
76
+
77
+ // Sets the tokenizer state to the specified value. This is needed by some
78
+ // parser states, which alter the state of the tokenizer in response to tags
79
+ // seen.
80
+ void gumbo_tokenizer_set_state (
81
+ struct GumboInternalParser* parser,
82
+ GumboTokenizerEnum state
83
+ );
84
+
85
+ // Flags whether the adjusted current node is a foreign content element. This
86
+ // is necessary for the markup declaration open state, where the tokenizer
87
+ // must be aware of the state of the parser to properly tokenize bad comment
88
+ // tags.
89
+ // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
90
+ void gumbo_tokenizer_set_is_adjusted_current_node_foreign (
91
+ struct GumboInternalParser* parser,
92
+ bool is_foreign
93
+ );
94
+
95
+ // Lexes a single token from the specified buffer, filling the output with the
96
+ // parsed GumboToken data structure.
97
+ void gumbo_lex(struct GumboInternalParser* parser, GumboToken* output);
98
+
99
+ // Frees the internally-allocated pointers within a GumboToken. Note that this
100
+ // doesn't free the token itself, since oftentimes it will be allocated on the
101
+ // stack.
102
+ //
103
+ // Note that if you are handing over ownership of the internal strings to some
104
+ // other data structure - for example, a parse tree - these do not need to be
105
+ // freed.
106
+ void gumbo_token_destroy(GumboToken* token);
107
+
108
+ #ifdef __cplusplus
109
+ }
110
+ #endif
111
+
112
+ #endif // GUMBO_TOKENIZER_H_
@@ -0,0 +1,339 @@
1
+ #ifndef GUMBO_TOKENIZER_STATES_H_
2
+ #define GUMBO_TOKENIZER_STATES_H_
3
+
4
+ // This contains the list of states used in the tokenizer. Although at first
5
+ // glance it seems like these could be kept internal to the tokenizer, several
6
+ // of the actions in the parser require that it reach into the tokenizer and
7
+ // reset the tokenizer state. For that to work, it needs to have the
8
+ // definitions of individual states available.
9
+ //
10
+ // This may also be useful for providing more detailed error messages for parse
11
+ // errors, as we can match up states and inputs in a table without having to
12
+ // clutter the tokenizer code with lots of precise error messages.
13
+
14
+ // The ordering of this enum is also used to build the dispatch table for the
15
+ // tokenizer state machine, so if it is changed, be sure to update that too.
16
+ typedef enum {
17
+ // 12.2.5.1 Data state
18
+ // https://html.spec.whatwg.org/multipage/parsing.html#data-state
19
+ GUMBO_LEX_DATA,
20
+
21
+ // 12.2.5.2 RCDATA state
22
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
23
+ GUMBO_LEX_RCDATA,
24
+
25
+ // 12.2.5.3 RAWTEXT state
26
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
27
+ GUMBO_LEX_RAWTEXT,
28
+
29
+ // 12.2.5.4 Script data state
30
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
31
+ GUMBO_LEX_SCRIPT_DATA,
32
+
33
+ // 12.2.5.5 PLAINTEXT state
34
+ // https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state
35
+ GUMBO_LEX_PLAINTEXT,
36
+
37
+ // 12.2.5.6 Tag open state
38
+ // https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
39
+ GUMBO_LEX_TAG_OPEN,
40
+
41
+ // 12.2.5.7 End tag open state
42
+ // https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
43
+ GUMBO_LEX_END_TAG_OPEN,
44
+
45
+ // 12.2.5.8 Tag name state
46
+ // https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
47
+ GUMBO_LEX_TAG_NAME,
48
+
49
+ // 12.2.5.9 RCDATA less-than sign state
50
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-less-than-sign-state
51
+ GUMBO_LEX_RCDATA_LT,
52
+
53
+ // 12.2.5.10 RCDATA end tag open state
54
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state
55
+ GUMBO_LEX_RCDATA_END_TAG_OPEN,
56
+
57
+ // 12.2.5.11 RCDATA end tag name state
58
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state
59
+ GUMBO_LEX_RCDATA_END_TAG_NAME,
60
+
61
+ // 12.2.5.12 RAWTEXT less-than sign state
62
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state
63
+ GUMBO_LEX_RAWTEXT_LT,
64
+
65
+ // 12.2.5.13 RAWTEXT end tag open state
66
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state
67
+ GUMBO_LEX_RAWTEXT_END_TAG_OPEN,
68
+
69
+ // 12.2.5.14 RAWTEXT end tag name state
70
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-name-state
71
+ GUMBO_LEX_RAWTEXT_END_TAG_NAME,
72
+
73
+ // 12.2.5.15 Script data less-than sign state
74
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
75
+ GUMBO_LEX_SCRIPT_DATA_LT,
76
+
77
+ // 12.2.5.16 Script data end tag open state
78
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state
79
+ GUMBO_LEX_SCRIPT_DATA_END_TAG_OPEN,
80
+
81
+ // 12.2.5.17 Script data end tag name state
82
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state
83
+ GUMBO_LEX_SCRIPT_DATA_END_TAG_NAME,
84
+
85
+ // 12.2.5.18 Script data escape start state
86
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state
87
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_START,
88
+
89
+ // 12.2.5.19 Script data escape start dash state
90
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state
91
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_START_DASH,
92
+
93
+ // 12.2.5.20 Script data escaped state
94
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
95
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED,
96
+
97
+ // 12.2.5.21 Script data escaped dash state
98
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state
99
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH,
100
+
101
+ // 12.2.5.22 Script data escaped dash dash state
102
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state
103
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH_DASH,
104
+
105
+ // 12.2.5.23 Script data escaped less-than sign state
106
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state
107
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT,
108
+
109
+ // 12.2.5.24 Script data escaped end tag open state
110
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state
111
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_OPEN,
112
+
113
+ // 12.2.5.25 Script data escaped end tag name state
114
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state
115
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_NAME,
116
+
117
+ // 12.2.5.26 Script data double escape start state
118
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state
119
+ GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_START,
120
+
121
+ // 12.2.5.27 Script data double escaped state
122
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
123
+ GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED,
124
+
125
+ // 12.2.5.28 Script data double escaped dash state
126
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state
127
+ GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH,
128
+
129
+ // 12.2.5.29 Script data double escaped dash dash state
130
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state
131
+ GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH,
132
+
133
+ // 12.2.5.30 Script data double escaped less-than sign state
134
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state
135
+ GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT,
136
+
137
+ // 12.2.5.31 Script data double escape end state (XXX: spec bug with the
138
+ // name?)
139
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state
140
+ GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_END,
141
+
142
+ // 12.2.5.32 Before attribute name state
143
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
144
+ GUMBO_LEX_BEFORE_ATTR_NAME,
145
+
146
+ // 12.2.5.33 Attribute name state
147
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
148
+ GUMBO_LEX_ATTR_NAME,
149
+
150
+ // 12.2.5.34 After attribute name state
151
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
152
+ GUMBO_LEX_AFTER_ATTR_NAME,
153
+
154
+ // 12.2.5.35 Before attribute value state
155
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state
156
+ GUMBO_LEX_BEFORE_ATTR_VALUE,
157
+
158
+ // 12.2.5.36 Attribute value (double-quoted) state
159
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(double-quoted)-state
160
+ GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED,
161
+
162
+ // 12.2.5.37 Attribute value (single-quoted) state
163
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(single-quoted)-state
164
+ GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED,
165
+
166
+ // 12.2.5.38 Attribute value (unquoted) state
167
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(unquoted)-state
168
+ GUMBO_LEX_ATTR_VALUE_UNQUOTED,
169
+
170
+ // 12.2.5.39 After attribute value (quoted) state
171
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-(quoted)-state
172
+ GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED,
173
+
174
+ // 12.2.5.40 Self-closing start tag state
175
+ // https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state
176
+ GUMBO_LEX_SELF_CLOSING_START_TAG,
177
+
178
+ // 12.2.5.41 Bogus comment state
179
+ // https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
180
+ GUMBO_LEX_BOGUS_COMMENT,
181
+
182
+ // 12.2.5.42 Markup declaration open state
183
+ // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
184
+ GUMBO_LEX_MARKUP_DECLARATION_OPEN,
185
+
186
+ // 12.2.5.43 Comment start state
187
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
188
+ GUMBO_LEX_COMMENT_START,
189
+
190
+ // 12.2.5.44 Comment start dash state
191
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-start-dash-state
192
+ GUMBO_LEX_COMMENT_START_DASH,
193
+
194
+ // 12.2.5.45 Comment state
195
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-state
196
+ GUMBO_LEX_COMMENT,
197
+
198
+ // 12.2.5.46 Comment less-than sign state
199
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-state
200
+ GUMBO_LEX_COMMENT_LT,
201
+
202
+ // 12.2.5.47 Comment less-than sign bang state
203
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-state
204
+ GUMBO_LEX_COMMENT_LT_BANG,
205
+
206
+ // 12.2.5.48 Comment less-than sign bang dash state
207
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-state
208
+ GUMBO_LEX_COMMENT_LT_BANG_DASH,
209
+
210
+ // 12.2.5.49 Comment less-than sign bang dash dash state
211
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-dash-state
212
+ GUMBO_LEX_COMMENT_LT_BANG_DASH_DASH,
213
+
214
+ // 12.2.5.50 Comment end dash state
215
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-dash-state
216
+ GUMBO_LEX_COMMENT_END_DASH,
217
+
218
+ // 12.2.5.51 Comment end state
219
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
220
+ GUMBO_LEX_COMMENT_END,
221
+
222
+ // 12.2.5.52 Comment end bang state
223
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state
224
+ GUMBO_LEX_COMMENT_END_BANG,
225
+
226
+ // 12.2.5.53 DOCTYPE state
227
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-state
228
+ GUMBO_LEX_DOCTYPE,
229
+
230
+ // 12.2.5.54 Before DOCTYPE name state
231
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-name-state
232
+ GUMBO_LEX_BEFORE_DOCTYPE_NAME,
233
+
234
+ // 12.2.5.55 DOCTYPE name state
235
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-name-state
236
+ GUMBO_LEX_DOCTYPE_NAME,
237
+
238
+ // 12.2.5.56 After DOCTYPE name state
239
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-name-state
240
+ GUMBO_LEX_AFTER_DOCTYPE_NAME,
241
+
242
+ // 12.2.5.57 After DOCTYPE public keyword state
243
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-keyword-state
244
+ GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD,
245
+
246
+ // 12.2.5.58 Before DOCTYPE public identifier state
247
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-public-identifier-state
248
+ GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID,
249
+
250
+ // 12.2.5.59 DOCTYPE public identifier (double-quoted) state
251
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(double-quoted)-state
252
+ GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED,
253
+
254
+ // 12.2.5.60 DOCTYPE public identifier (single-quoted) state
255
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(single-quoted)-state
256
+ GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED,
257
+
258
+ // 12.2.5.61 After DOCTYPE public identifier state
259
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-identifier-state
260
+ GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID,
261
+
262
+ // 12.2.5.62 Between DOCTYPE public and system identifiers state
263
+ // https://html.spec.whatwg.org/multipage/parsing.html#between-doctype-public-and-system-identifiers-state
264
+ GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID,
265
+
266
+ // 12.2.5.63 After DOCTYPE system keyword state
267
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-keyword-state
268
+ GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD,
269
+
270
+ // 12.2.5.64 Before DOCTYPE system identifier state
271
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-system-identifier-state
272
+ GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID,
273
+
274
+ // 12.2.5.65 DOCTYPE system identifier (double-quoted) state
275
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(double-quoted)-state
276
+ GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED,
277
+
278
+ // 12.2.5.66 DOCTYPE system identifier (single-quoted) state
279
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(single-quoted)-state
280
+ GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED,
281
+
282
+ // 12.2.5.67 After DOCTYPE system identifier state
283
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-identifier-state
284
+ GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID,
285
+
286
+ // 12.2.5.68 Bogus DOCTYPE state
287
+ // https://html.spec.whatwg.org/multipage/parsing.html#bogus-doctype-state
288
+ GUMBO_LEX_BOGUS_DOCTYPE,
289
+
290
+ // 12.2.5.69 CDATA section state
291
+ // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state
292
+ GUMBO_LEX_CDATA_SECTION,
293
+
294
+ // 12.2.5.70 CDATA section bracket state
295
+ // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-bracket-state
296
+ GUMBO_LEX_CDATA_SECTION_BRACKET,
297
+
298
+ // 12.2.5.71 CDATA section end state
299
+ // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-end-state
300
+ GUMBO_LEX_CDATA_SECTION_END,
301
+
302
+ // 12.2.5.72 Character reference state
303
+ // https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
304
+ GUMBO_LEX_CHARACTER_REFERENCE,
305
+
306
+ // 12.2.5.73 Named character reference state
307
+ // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
308
+ GUMBO_LEX_NAMED_CHARACTER_REFERENCE,
309
+
310
+ // 12.2.5.74 Ambiguous ampersand state
311
+ // https://html.spec.whatwg.org/multipage/parsing.html#ambiguous-ampersand-state
312
+ GUMBO_LEX_AMBIGUOUS_AMPERSAND,
313
+
314
+ // 12.2.5.75 Numeric character reference state
315
+ // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-state
316
+ GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE,
317
+
318
+ // 12.2.5.76 Hexadecimal character reference start state
319
+ // https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-start-state
320
+ GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE_START,
321
+
322
+ // 12.2.5.77 Decimal character reference start state
323
+ // https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state
324
+ GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE_START,
325
+
326
+ // 12.2.5.78 Hexadecimal character reference state
327
+ // https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-state
328
+ GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE,
329
+
330
+ // 12.2.5.79 Decimal character reference state
331
+ // https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-state
332
+ GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE,
333
+
334
+ // 12.2.5.80 Numeric character reference end state
335
+ // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
336
+ GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END
337
+ } GumboTokenizerEnum;
338
+
339
+ #endif // GUMBO_TOKENIZER_STATES_H_