nokolexbor 0.2.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (486)
  1. checksums.yaml +7 -0
  2. data/ext/nokolexbor/config.h +186 -0
  3. data/ext/nokolexbor/extconf.rb +131 -0
  4. data/ext/nokolexbor/libxml/HTMLparser.h +320 -0
  5. data/ext/nokolexbor/libxml/SAX2.h +173 -0
  6. data/ext/nokolexbor/libxml/chvalid.h +230 -0
  7. data/ext/nokolexbor/libxml/debugXML.h +217 -0
  8. data/ext/nokolexbor/libxml/dict.h +81 -0
  9. data/ext/nokolexbor/libxml/encoding.h +232 -0
  10. data/ext/nokolexbor/libxml/entities.h +153 -0
  11. data/ext/nokolexbor/libxml/globals.h +529 -0
  12. data/ext/nokolexbor/libxml/hash.h +236 -0
  13. data/ext/nokolexbor/libxml/list.h +137 -0
  14. data/ext/nokolexbor/libxml/parser.h +1264 -0
  15. data/ext/nokolexbor/libxml/parserInternals.h +641 -0
  16. data/ext/nokolexbor/libxml/pattern.h +100 -0
  17. data/ext/nokolexbor/libxml/threads.h +94 -0
  18. data/ext/nokolexbor/libxml/tree.h +1315 -0
  19. data/ext/nokolexbor/libxml/uri.h +94 -0
  20. data/ext/nokolexbor/libxml/valid.h +448 -0
  21. data/ext/nokolexbor/libxml/xmlIO.h +369 -0
  22. data/ext/nokolexbor/libxml/xmlautomata.h +146 -0
  23. data/ext/nokolexbor/libxml/xmlerror.h +919 -0
  24. data/ext/nokolexbor/libxml/xmlexports.h +79 -0
  25. data/ext/nokolexbor/libxml/xmlmemory.h +226 -0
  26. data/ext/nokolexbor/libxml/xmlregexp.h +222 -0
  27. data/ext/nokolexbor/libxml/xmlstring.h +140 -0
  28. data/ext/nokolexbor/libxml/xmlversion.h +526 -0
  29. data/ext/nokolexbor/libxml/xpath.h +575 -0
  30. data/ext/nokolexbor/libxml/xpathInternals.h +632 -0
  31. data/ext/nokolexbor/libxml/xpointer.h +137 -0
  32. data/ext/nokolexbor/libxml.h +76 -0
  33. data/ext/nokolexbor/memory.c +39 -0
  34. data/ext/nokolexbor/nl_document.c +51 -0
  35. data/ext/nokolexbor/nl_node.c +790 -0
  36. data/ext/nokolexbor/nl_node_set.c +368 -0
  37. data/ext/nokolexbor/nl_xpath_context.c +200 -0
  38. data/ext/nokolexbor/nokolexbor.c +63 -0
  39. data/ext/nokolexbor/nokolexbor.h +37 -0
  40. data/ext/nokolexbor/private/buf.h +70 -0
  41. data/ext/nokolexbor/private/dict.h +11 -0
  42. data/ext/nokolexbor/private/enc.h +17 -0
  43. data/ext/nokolexbor/private/error.h +21 -0
  44. data/ext/nokolexbor/private/globals.h +9 -0
  45. data/ext/nokolexbor/private/memory.h +9 -0
  46. data/ext/nokolexbor/private/parser.h +27 -0
  47. data/ext/nokolexbor/private/string.h +9 -0
  48. data/ext/nokolexbor/private/threads.h +50 -0
  49. data/ext/nokolexbor/private/tree.h +18 -0
  50. data/ext/nokolexbor/private/xpath.h +7 -0
  51. data/ext/nokolexbor/timsort.h +601 -0
  52. data/ext/nokolexbor/xml_SAX2.c +80 -0
  53. data/ext/nokolexbor/xml_buf.c +363 -0
  54. data/ext/nokolexbor/xml_chvalid.c +334 -0
  55. data/ext/nokolexbor/xml_dict.c +1264 -0
  56. data/ext/nokolexbor/xml_encoding.c +124 -0
  57. data/ext/nokolexbor/xml_error.c +134 -0
  58. data/ext/nokolexbor/xml_globals.c +1085 -0
  59. data/ext/nokolexbor/xml_hash.c +1141 -0
  60. data/ext/nokolexbor/xml_memory.c +203 -0
  61. data/ext/nokolexbor/xml_parser.c +127 -0
  62. data/ext/nokolexbor/xml_parserInternals.c +338 -0
  63. data/ext/nokolexbor/xml_pattern.c +2375 -0
  64. data/ext/nokolexbor/xml_string.c +1051 -0
  65. data/ext/nokolexbor/xml_threads.c +881 -0
  66. data/ext/nokolexbor/xml_tree.c +148 -0
  67. data/ext/nokolexbor/xml_xpath.c +14743 -0
  68. data/lib/nokolexbor/attribute.rb +18 -0
  69. data/lib/nokolexbor/document.rb +6 -0
  70. data/lib/nokolexbor/node.rb +264 -0
  71. data/lib/nokolexbor/node_set.rb +124 -0
  72. data/lib/nokolexbor/version.rb +5 -0
  73. data/lib/nokolexbor/xpath_context.rb +14 -0
  74. data/lib/nokolexbor.rb +17 -0
  75. data/patches/0001-lexbor-support-text-pseudo-element.patch +137 -0
  76. data/patches/0002-lexbor-match-id-class-case-sensitive.patch +22 -0
  77. data/patches/0003-lexbor-attach-template-content-to-self.patch +13 -0
  78. data/vendor/lexbor/CMakeLists.txt +331 -0
  79. data/vendor/lexbor/config.cmake +890 -0
  80. data/vendor/lexbor/feature.cmake +134 -0
  81. data/vendor/lexbor/source/lexbor/core/array.c +208 -0
  82. data/vendor/lexbor/source/lexbor/core/array.h +100 -0
  83. data/vendor/lexbor/source/lexbor/core/array_obj.c +216 -0
  84. data/vendor/lexbor/source/lexbor/core/array_obj.h +134 -0
  85. data/vendor/lexbor/source/lexbor/core/avl.c +442 -0
  86. data/vendor/lexbor/source/lexbor/core/avl.h +82 -0
  87. data/vendor/lexbor/source/lexbor/core/base.h +86 -0
  88. data/vendor/lexbor/source/lexbor/core/bst.c +468 -0
  89. data/vendor/lexbor/source/lexbor/core/bst.h +108 -0
  90. data/vendor/lexbor/source/lexbor/core/bst_map.c +238 -0
  91. data/vendor/lexbor/source/lexbor/core/bst_map.h +87 -0
  92. data/vendor/lexbor/source/lexbor/core/config.cmake +12 -0
  93. data/vendor/lexbor/source/lexbor/core/conv.c +203 -0
  94. data/vendor/lexbor/source/lexbor/core/conv.h +53 -0
  95. data/vendor/lexbor/source/lexbor/core/core.h +35 -0
  96. data/vendor/lexbor/source/lexbor/core/def.h +57 -0
  97. data/vendor/lexbor/source/lexbor/core/diyfp.c +153 -0
  98. data/vendor/lexbor/source/lexbor/core/diyfp.h +258 -0
  99. data/vendor/lexbor/source/lexbor/core/dobject.c +187 -0
  100. data/vendor/lexbor/source/lexbor/core/dobject.h +92 -0
  101. data/vendor/lexbor/source/lexbor/core/dtoa.c +404 -0
  102. data/vendor/lexbor/source/lexbor/core/dtoa.h +28 -0
  103. data/vendor/lexbor/source/lexbor/core/fs.h +60 -0
  104. data/vendor/lexbor/source/lexbor/core/hash.c +476 -0
  105. data/vendor/lexbor/source/lexbor/core/hash.h +218 -0
  106. data/vendor/lexbor/source/lexbor/core/in.c +267 -0
  107. data/vendor/lexbor/source/lexbor/core/in.h +172 -0
  108. data/vendor/lexbor/source/lexbor/core/lexbor.h +35 -0
  109. data/vendor/lexbor/source/lexbor/core/mem.c +228 -0
  110. data/vendor/lexbor/source/lexbor/core/mem.h +141 -0
  111. data/vendor/lexbor/source/lexbor/core/mraw.c +428 -0
  112. data/vendor/lexbor/source/lexbor/core/mraw.h +114 -0
  113. data/vendor/lexbor/source/lexbor/core/perf.h +45 -0
  114. data/vendor/lexbor/source/lexbor/core/plog.c +73 -0
  115. data/vendor/lexbor/source/lexbor/core/plog.h +102 -0
  116. data/vendor/lexbor/source/lexbor/core/print.c +168 -0
  117. data/vendor/lexbor/source/lexbor/core/print.h +39 -0
  118. data/vendor/lexbor/source/lexbor/core/sbst.h +59 -0
  119. data/vendor/lexbor/source/lexbor/core/serialize.c +27 -0
  120. data/vendor/lexbor/source/lexbor/core/serialize.h +32 -0
  121. data/vendor/lexbor/source/lexbor/core/shs.c +118 -0
  122. data/vendor/lexbor/source/lexbor/core/shs.h +82 -0
  123. data/vendor/lexbor/source/lexbor/core/str.c +617 -0
  124. data/vendor/lexbor/source/lexbor/core/str.h +247 -0
  125. data/vendor/lexbor/source/lexbor/core/str_res.h +369 -0
  126. data/vendor/lexbor/source/lexbor/core/strtod.c +326 -0
  127. data/vendor/lexbor/source/lexbor/core/strtod.h +28 -0
  128. data/vendor/lexbor/source/lexbor/core/types.h +39 -0
  129. data/vendor/lexbor/source/lexbor/core/utils.c +43 -0
  130. data/vendor/lexbor/source/lexbor/core/utils.h +36 -0
  131. data/vendor/lexbor/source/lexbor/css/base.h +44 -0
  132. data/vendor/lexbor/source/lexbor/css/config.cmake +2 -0
  133. data/vendor/lexbor/source/lexbor/css/css.h +25 -0
  134. data/vendor/lexbor/source/lexbor/css/log.c +336 -0
  135. data/vendor/lexbor/source/lexbor/css/log.h +103 -0
  136. data/vendor/lexbor/source/lexbor/css/node.h +29 -0
  137. data/vendor/lexbor/source/lexbor/css/parser.c +473 -0
  138. data/vendor/lexbor/source/lexbor/css/parser.h +368 -0
  139. data/vendor/lexbor/source/lexbor/css/selectors/base.h +48 -0
  140. data/vendor/lexbor/source/lexbor/css/selectors/pseudo.c +91 -0
  141. data/vendor/lexbor/source/lexbor/css/selectors/pseudo.h +66 -0
  142. data/vendor/lexbor/source/lexbor/css/selectors/pseudo_const.h +109 -0
  143. data/vendor/lexbor/source/lexbor/css/selectors/pseudo_res.h +302 -0
  144. data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +279 -0
  145. data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.h +85 -0
  146. data/vendor/lexbor/source/lexbor/css/selectors/selector.c +927 -0
  147. data/vendor/lexbor/source/lexbor/css/selectors/selector.h +200 -0
  148. data/vendor/lexbor/source/lexbor/css/selectors/selectors.c +340 -0
  149. data/vendor/lexbor/source/lexbor/css/selectors/selectors.h +137 -0
  150. data/vendor/lexbor/source/lexbor/css/selectors/state.c +1718 -0
  151. data/vendor/lexbor/source/lexbor/css/selectors/state.h +79 -0
  152. data/vendor/lexbor/source/lexbor/css/stylesheet.h +37 -0
  153. data/vendor/lexbor/source/lexbor/css/syntax/anb.c +443 -0
  154. data/vendor/lexbor/source/lexbor/css/syntax/anb.h +45 -0
  155. data/vendor/lexbor/source/lexbor/css/syntax/base.h +33 -0
  156. data/vendor/lexbor/source/lexbor/css/syntax/parser.c +9 -0
  157. data/vendor/lexbor/source/lexbor/css/syntax/parser.h +25 -0
  158. data/vendor/lexbor/source/lexbor/css/syntax/res.h +48 -0
  159. data/vendor/lexbor/source/lexbor/css/syntax/state.c +2603 -0
  160. data/vendor/lexbor/source/lexbor/css/syntax/state.h +140 -0
  161. data/vendor/lexbor/source/lexbor/css/syntax/state_res.h +273 -0
  162. data/vendor/lexbor/source/lexbor/css/syntax/syntax.c +67 -0
  163. data/vendor/lexbor/source/lexbor/css/syntax/token.c +618 -0
  164. data/vendor/lexbor/source/lexbor/css/syntax/token.h +298 -0
  165. data/vendor/lexbor/source/lexbor/css/syntax/token_res.h +68 -0
  166. data/vendor/lexbor/source/lexbor/css/syntax/tokenizer/error.c +30 -0
  167. data/vendor/lexbor/source/lexbor/css/syntax/tokenizer/error.h +58 -0
  168. data/vendor/lexbor/source/lexbor/css/syntax/tokenizer.c +278 -0
  169. data/vendor/lexbor/source/lexbor/css/syntax/tokenizer.h +121 -0
  170. data/vendor/lexbor/source/lexbor/dom/base.h +32 -0
  171. data/vendor/lexbor/source/lexbor/dom/collection.c +97 -0
  172. data/vendor/lexbor/source/lexbor/dom/collection.h +112 -0
  173. data/vendor/lexbor/source/lexbor/dom/config.cmake +3 -0
  174. data/vendor/lexbor/source/lexbor/dom/dom.h +29 -0
  175. data/vendor/lexbor/source/lexbor/dom/exception.c +18 -0
  176. data/vendor/lexbor/source/lexbor/dom/exception.h +73 -0
  177. data/vendor/lexbor/source/lexbor/dom/interface.c +110 -0
  178. data/vendor/lexbor/source/lexbor/dom/interface.h +88 -0
  179. data/vendor/lexbor/source/lexbor/dom/interfaces/attr.c +445 -0
  180. data/vendor/lexbor/source/lexbor/dom/interfaces/attr.h +152 -0
  181. data/vendor/lexbor/source/lexbor/dom/interfaces/attr_const.h +62 -0
  182. data/vendor/lexbor/source/lexbor/dom/interfaces/attr_res.h +143 -0
  183. data/vendor/lexbor/source/lexbor/dom/interfaces/cdata_section.c +55 -0
  184. data/vendor/lexbor/source/lexbor/dom/interfaces/cdata_section.h +38 -0
  185. data/vendor/lexbor/source/lexbor/dom/interfaces/character_data.c +110 -0
  186. data/vendor/lexbor/source/lexbor/dom/interfaces/character_data.h +51 -0
  187. data/vendor/lexbor/source/lexbor/dom/interfaces/comment.c +64 -0
  188. data/vendor/lexbor/source/lexbor/dom/interfaces/comment.h +42 -0
  189. data/vendor/lexbor/source/lexbor/dom/interfaces/document.c +536 -0
  190. data/vendor/lexbor/source/lexbor/dom/interfaces/document.h +243 -0
  191. data/vendor/lexbor/source/lexbor/dom/interfaces/document_fragment.c +36 -0
  192. data/vendor/lexbor/source/lexbor/dom/interfaces/document_fragment.h +36 -0
  193. data/vendor/lexbor/source/lexbor/dom/interfaces/document_type.c +125 -0
  194. data/vendor/lexbor/source/lexbor/dom/interfaces/document_type.h +108 -0
  195. data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +1411 -0
  196. data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +319 -0
  197. data/vendor/lexbor/source/lexbor/dom/interfaces/event_target.c +32 -0
  198. data/vendor/lexbor/source/lexbor/dom/interfaces/event_target.h +34 -0
  199. data/vendor/lexbor/source/lexbor/dom/interfaces/node.c +661 -0
  200. data/vendor/lexbor/source/lexbor/dom/interfaces/node.h +192 -0
  201. data/vendor/lexbor/source/lexbor/dom/interfaces/processing_instruction.c +87 -0
  202. data/vendor/lexbor/source/lexbor/dom/interfaces/processing_instruction.h +66 -0
  203. data/vendor/lexbor/source/lexbor/dom/interfaces/shadow_root.c +36 -0
  204. data/vendor/lexbor/source/lexbor/dom/interfaces/shadow_root.h +44 -0
  205. data/vendor/lexbor/source/lexbor/dom/interfaces/text.c +63 -0
  206. data/vendor/lexbor/source/lexbor/dom/interfaces/text.h +42 -0
  207. data/vendor/lexbor/source/lexbor/encoding/base.h +218 -0
  208. data/vendor/lexbor/source/lexbor/encoding/big5.c +42839 -0
  209. data/vendor/lexbor/source/lexbor/encoding/config.cmake +12 -0
  210. data/vendor/lexbor/source/lexbor/encoding/const.h +65 -0
  211. data/vendor/lexbor/source/lexbor/encoding/decode.c +3193 -0
  212. data/vendor/lexbor/source/lexbor/encoding/decode.h +370 -0
  213. data/vendor/lexbor/source/lexbor/encoding/encode.c +1931 -0
  214. data/vendor/lexbor/source/lexbor/encoding/encode.h +377 -0
  215. data/vendor/lexbor/source/lexbor/encoding/encoding.c +252 -0
  216. data/vendor/lexbor/source/lexbor/encoding/encoding.h +475 -0
  217. data/vendor/lexbor/source/lexbor/encoding/euc_kr.c +53883 -0
  218. data/vendor/lexbor/source/lexbor/encoding/gb18030.c +47905 -0
  219. data/vendor/lexbor/source/lexbor/encoding/iso_2022_jp_katakana.c +159 -0
  220. data/vendor/lexbor/source/lexbor/encoding/jis0208.c +22477 -0
  221. data/vendor/lexbor/source/lexbor/encoding/jis0212.c +15787 -0
  222. data/vendor/lexbor/source/lexbor/encoding/multi.h +53 -0
  223. data/vendor/lexbor/source/lexbor/encoding/range.c +71 -0
  224. data/vendor/lexbor/source/lexbor/encoding/range.h +34 -0
  225. data/vendor/lexbor/source/lexbor/encoding/res.c +222 -0
  226. data/vendor/lexbor/source/lexbor/encoding/res.h +34 -0
  227. data/vendor/lexbor/source/lexbor/encoding/single.c +13748 -0
  228. data/vendor/lexbor/source/lexbor/encoding/single.h +116 -0
  229. data/vendor/lexbor/source/lexbor/html/base.h +44 -0
  230. data/vendor/lexbor/source/lexbor/html/config.cmake +3 -0
  231. data/vendor/lexbor/source/lexbor/html/encoding.c +574 -0
  232. data/vendor/lexbor/source/lexbor/html/encoding.h +106 -0
  233. data/vendor/lexbor/source/lexbor/html/html.h +107 -0
  234. data/vendor/lexbor/source/lexbor/html/interface.c +165 -0
  235. data/vendor/lexbor/source/lexbor/html/interface.h +186 -0
  236. data/vendor/lexbor/source/lexbor/html/interface_res.h +4449 -0
  237. data/vendor/lexbor/source/lexbor/html/interfaces/anchor_element.c +36 -0
  238. data/vendor/lexbor/source/lexbor/html/interfaces/anchor_element.h +34 -0
  239. data/vendor/lexbor/source/lexbor/html/interfaces/area_element.c +36 -0
  240. data/vendor/lexbor/source/lexbor/html/interfaces/area_element.h +34 -0
  241. data/vendor/lexbor/source/lexbor/html/interfaces/audio_element.c +36 -0
  242. data/vendor/lexbor/source/lexbor/html/interfaces/audio_element.h +34 -0
  243. data/vendor/lexbor/source/lexbor/html/interfaces/base_element.c +36 -0
  244. data/vendor/lexbor/source/lexbor/html/interfaces/base_element.h +34 -0
  245. data/vendor/lexbor/source/lexbor/html/interfaces/body_element.c +36 -0
  246. data/vendor/lexbor/source/lexbor/html/interfaces/body_element.h +34 -0
  247. data/vendor/lexbor/source/lexbor/html/interfaces/br_element.c +36 -0
  248. data/vendor/lexbor/source/lexbor/html/interfaces/br_element.h +34 -0
  249. data/vendor/lexbor/source/lexbor/html/interfaces/button_element.c +36 -0
  250. data/vendor/lexbor/source/lexbor/html/interfaces/button_element.h +34 -0
  251. data/vendor/lexbor/source/lexbor/html/interfaces/canvas_element.c +36 -0
  252. data/vendor/lexbor/source/lexbor/html/interfaces/canvas_element.h +34 -0
  253. data/vendor/lexbor/source/lexbor/html/interfaces/d_list_element.c +36 -0
  254. data/vendor/lexbor/source/lexbor/html/interfaces/d_list_element.h +34 -0
  255. data/vendor/lexbor/source/lexbor/html/interfaces/data_element.c +36 -0
  256. data/vendor/lexbor/source/lexbor/html/interfaces/data_element.h +34 -0
  257. data/vendor/lexbor/source/lexbor/html/interfaces/data_list_element.c +36 -0
  258. data/vendor/lexbor/source/lexbor/html/interfaces/data_list_element.h +34 -0
  259. data/vendor/lexbor/source/lexbor/html/interfaces/details_element.c +36 -0
  260. data/vendor/lexbor/source/lexbor/html/interfaces/details_element.h +34 -0
  261. data/vendor/lexbor/source/lexbor/html/interfaces/dialog_element.c +36 -0
  262. data/vendor/lexbor/source/lexbor/html/interfaces/dialog_element.h +34 -0
  263. data/vendor/lexbor/source/lexbor/html/interfaces/directory_element.c +36 -0
  264. data/vendor/lexbor/source/lexbor/html/interfaces/directory_element.h +34 -0
  265. data/vendor/lexbor/source/lexbor/html/interfaces/div_element.c +36 -0
  266. data/vendor/lexbor/source/lexbor/html/interfaces/div_element.h +34 -0
  267. data/vendor/lexbor/source/lexbor/html/interfaces/document.c +444 -0
  268. data/vendor/lexbor/source/lexbor/html/interfaces/document.h +256 -0
  269. data/vendor/lexbor/source/lexbor/html/interfaces/element.c +64 -0
  270. data/vendor/lexbor/source/lexbor/html/interfaces/element.h +54 -0
  271. data/vendor/lexbor/source/lexbor/html/interfaces/embed_element.c +36 -0
  272. data/vendor/lexbor/source/lexbor/html/interfaces/embed_element.h +34 -0
  273. data/vendor/lexbor/source/lexbor/html/interfaces/field_set_element.c +36 -0
  274. data/vendor/lexbor/source/lexbor/html/interfaces/field_set_element.h +34 -0
  275. data/vendor/lexbor/source/lexbor/html/interfaces/font_element.c +36 -0
  276. data/vendor/lexbor/source/lexbor/html/interfaces/font_element.h +34 -0
  277. data/vendor/lexbor/source/lexbor/html/interfaces/form_element.c +36 -0
  278. data/vendor/lexbor/source/lexbor/html/interfaces/form_element.h +34 -0
  279. data/vendor/lexbor/source/lexbor/html/interfaces/frame_element.c +36 -0
  280. data/vendor/lexbor/source/lexbor/html/interfaces/frame_element.h +34 -0
  281. data/vendor/lexbor/source/lexbor/html/interfaces/frame_set_element.c +36 -0
  282. data/vendor/lexbor/source/lexbor/html/interfaces/frame_set_element.h +34 -0
  283. data/vendor/lexbor/source/lexbor/html/interfaces/head_element.c +36 -0
  284. data/vendor/lexbor/source/lexbor/html/interfaces/head_element.h +34 -0
  285. data/vendor/lexbor/source/lexbor/html/interfaces/heading_element.c +36 -0
  286. data/vendor/lexbor/source/lexbor/html/interfaces/heading_element.h +34 -0
  287. data/vendor/lexbor/source/lexbor/html/interfaces/hr_element.c +36 -0
  288. data/vendor/lexbor/source/lexbor/html/interfaces/hr_element.h +34 -0
  289. data/vendor/lexbor/source/lexbor/html/interfaces/html_element.c +36 -0
  290. data/vendor/lexbor/source/lexbor/html/interfaces/html_element.h +34 -0
  291. data/vendor/lexbor/source/lexbor/html/interfaces/iframe_element.c +36 -0
  292. data/vendor/lexbor/source/lexbor/html/interfaces/iframe_element.h +34 -0
  293. data/vendor/lexbor/source/lexbor/html/interfaces/image_element.c +36 -0
  294. data/vendor/lexbor/source/lexbor/html/interfaces/image_element.h +34 -0
  295. data/vendor/lexbor/source/lexbor/html/interfaces/input_element.c +36 -0
  296. data/vendor/lexbor/source/lexbor/html/interfaces/input_element.h +34 -0
  297. data/vendor/lexbor/source/lexbor/html/interfaces/label_element.c +36 -0
  298. data/vendor/lexbor/source/lexbor/html/interfaces/label_element.h +34 -0
  299. data/vendor/lexbor/source/lexbor/html/interfaces/legend_element.c +36 -0
  300. data/vendor/lexbor/source/lexbor/html/interfaces/legend_element.h +34 -0
  301. data/vendor/lexbor/source/lexbor/html/interfaces/li_element.c +36 -0
  302. data/vendor/lexbor/source/lexbor/html/interfaces/li_element.h +34 -0
  303. data/vendor/lexbor/source/lexbor/html/interfaces/link_element.c +36 -0
  304. data/vendor/lexbor/source/lexbor/html/interfaces/link_element.h +34 -0
  305. data/vendor/lexbor/source/lexbor/html/interfaces/map_element.c +36 -0
  306. data/vendor/lexbor/source/lexbor/html/interfaces/map_element.h +34 -0
  307. data/vendor/lexbor/source/lexbor/html/interfaces/marquee_element.c +36 -0
  308. data/vendor/lexbor/source/lexbor/html/interfaces/marquee_element.h +34 -0
  309. data/vendor/lexbor/source/lexbor/html/interfaces/media_element.c +36 -0
  310. data/vendor/lexbor/source/lexbor/html/interfaces/media_element.h +34 -0
  311. data/vendor/lexbor/source/lexbor/html/interfaces/menu_element.c +36 -0
  312. data/vendor/lexbor/source/lexbor/html/interfaces/menu_element.h +34 -0
  313. data/vendor/lexbor/source/lexbor/html/interfaces/meta_element.c +36 -0
  314. data/vendor/lexbor/source/lexbor/html/interfaces/meta_element.h +34 -0
  315. data/vendor/lexbor/source/lexbor/html/interfaces/meter_element.c +36 -0
  316. data/vendor/lexbor/source/lexbor/html/interfaces/meter_element.h +34 -0
  317. data/vendor/lexbor/source/lexbor/html/interfaces/mod_element.c +36 -0
  318. data/vendor/lexbor/source/lexbor/html/interfaces/mod_element.h +34 -0
  319. data/vendor/lexbor/source/lexbor/html/interfaces/o_list_element.c +36 -0
  320. data/vendor/lexbor/source/lexbor/html/interfaces/o_list_element.h +34 -0
  321. data/vendor/lexbor/source/lexbor/html/interfaces/object_element.c +36 -0
  322. data/vendor/lexbor/source/lexbor/html/interfaces/object_element.h +34 -0
  323. data/vendor/lexbor/source/lexbor/html/interfaces/opt_group_element.c +36 -0
  324. data/vendor/lexbor/source/lexbor/html/interfaces/opt_group_element.h +34 -0
  325. data/vendor/lexbor/source/lexbor/html/interfaces/option_element.c +36 -0
  326. data/vendor/lexbor/source/lexbor/html/interfaces/option_element.h +34 -0
  327. data/vendor/lexbor/source/lexbor/html/interfaces/output_element.c +36 -0
  328. data/vendor/lexbor/source/lexbor/html/interfaces/output_element.h +34 -0
  329. data/vendor/lexbor/source/lexbor/html/interfaces/paragraph_element.c +36 -0
  330. data/vendor/lexbor/source/lexbor/html/interfaces/paragraph_element.h +34 -0
  331. data/vendor/lexbor/source/lexbor/html/interfaces/param_element.c +36 -0
  332. data/vendor/lexbor/source/lexbor/html/interfaces/param_element.h +34 -0
  333. data/vendor/lexbor/source/lexbor/html/interfaces/picture_element.c +36 -0
  334. data/vendor/lexbor/source/lexbor/html/interfaces/picture_element.h +34 -0
  335. data/vendor/lexbor/source/lexbor/html/interfaces/pre_element.c +36 -0
  336. data/vendor/lexbor/source/lexbor/html/interfaces/pre_element.h +34 -0
  337. data/vendor/lexbor/source/lexbor/html/interfaces/progress_element.c +36 -0
  338. data/vendor/lexbor/source/lexbor/html/interfaces/progress_element.h +34 -0
  339. data/vendor/lexbor/source/lexbor/html/interfaces/quote_element.c +36 -0
  340. data/vendor/lexbor/source/lexbor/html/interfaces/quote_element.h +34 -0
  341. data/vendor/lexbor/source/lexbor/html/interfaces/script_element.c +36 -0
  342. data/vendor/lexbor/source/lexbor/html/interfaces/script_element.h +34 -0
  343. data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +36 -0
  344. data/vendor/lexbor/source/lexbor/html/interfaces/select_element.h +34 -0
  345. data/vendor/lexbor/source/lexbor/html/interfaces/slot_element.c +36 -0
  346. data/vendor/lexbor/source/lexbor/html/interfaces/slot_element.h +34 -0
  347. data/vendor/lexbor/source/lexbor/html/interfaces/source_element.c +36 -0
  348. data/vendor/lexbor/source/lexbor/html/interfaces/source_element.h +34 -0
  349. data/vendor/lexbor/source/lexbor/html/interfaces/span_element.c +36 -0
  350. data/vendor/lexbor/source/lexbor/html/interfaces/span_element.h +34 -0
  351. data/vendor/lexbor/source/lexbor/html/interfaces/style_element.c +36 -0
  352. data/vendor/lexbor/source/lexbor/html/interfaces/style_element.h +34 -0
  353. data/vendor/lexbor/source/lexbor/html/interfaces/table_caption_element.c +36 -0
  354. data/vendor/lexbor/source/lexbor/html/interfaces/table_caption_element.h +34 -0
  355. data/vendor/lexbor/source/lexbor/html/interfaces/table_cell_element.c +36 -0
  356. data/vendor/lexbor/source/lexbor/html/interfaces/table_cell_element.h +34 -0
  357. data/vendor/lexbor/source/lexbor/html/interfaces/table_col_element.c +36 -0
  358. data/vendor/lexbor/source/lexbor/html/interfaces/table_col_element.h +34 -0
  359. data/vendor/lexbor/source/lexbor/html/interfaces/table_element.c +36 -0
  360. data/vendor/lexbor/source/lexbor/html/interfaces/table_element.h +34 -0
  361. data/vendor/lexbor/source/lexbor/html/interfaces/table_row_element.c +36 -0
  362. data/vendor/lexbor/source/lexbor/html/interfaces/table_row_element.h +34 -0
  363. data/vendor/lexbor/source/lexbor/html/interfaces/table_section_element.c +36 -0
  364. data/vendor/lexbor/source/lexbor/html/interfaces/table_section_element.h +34 -0
  365. data/vendor/lexbor/source/lexbor/html/interfaces/template_element.c +46 -0
  366. data/vendor/lexbor/source/lexbor/html/interfaces/template_element.h +38 -0
  367. data/vendor/lexbor/source/lexbor/html/interfaces/text_area_element.c +36 -0
  368. data/vendor/lexbor/source/lexbor/html/interfaces/text_area_element.h +34 -0
  369. data/vendor/lexbor/source/lexbor/html/interfaces/time_element.c +36 -0
  370. data/vendor/lexbor/source/lexbor/html/interfaces/time_element.h +34 -0
  371. data/vendor/lexbor/source/lexbor/html/interfaces/title_element.c +133 -0
  372. data/vendor/lexbor/source/lexbor/html/interfaces/title_element.h +42 -0
  373. data/vendor/lexbor/source/lexbor/html/interfaces/track_element.c +36 -0
  374. data/vendor/lexbor/source/lexbor/html/interfaces/track_element.h +34 -0
  375. data/vendor/lexbor/source/lexbor/html/interfaces/u_list_element.c +36 -0
  376. data/vendor/lexbor/source/lexbor/html/interfaces/u_list_element.h +34 -0
  377. data/vendor/lexbor/source/lexbor/html/interfaces/unknown_element.c +36 -0
  378. data/vendor/lexbor/source/lexbor/html/interfaces/unknown_element.h +34 -0
  379. data/vendor/lexbor/source/lexbor/html/interfaces/video_element.c +36 -0
  380. data/vendor/lexbor/source/lexbor/html/interfaces/video_element.h +34 -0
  381. data/vendor/lexbor/source/lexbor/html/interfaces/window.c +36 -0
  382. data/vendor/lexbor/source/lexbor/html/interfaces/window.h +34 -0
  383. data/vendor/lexbor/source/lexbor/html/node.c +14 -0
  384. data/vendor/lexbor/source/lexbor/html/node.h +67 -0
  385. data/vendor/lexbor/source/lexbor/html/parser.c +469 -0
  386. data/vendor/lexbor/source/lexbor/html/parser.h +170 -0
  387. data/vendor/lexbor/source/lexbor/html/serialize.c +1510 -0
  388. data/vendor/lexbor/source/lexbor/html/serialize.h +93 -0
  389. data/vendor/lexbor/source/lexbor/html/tag.h +103 -0
  390. data/vendor/lexbor/source/lexbor/html/tag_res.h +2262 -0
  391. data/vendor/lexbor/source/lexbor/html/token.c +386 -0
  392. data/vendor/lexbor/source/lexbor/html/token.h +130 -0
  393. data/vendor/lexbor/source/lexbor/html/token_attr.c +44 -0
  394. data/vendor/lexbor/source/lexbor/html/token_attr.h +67 -0
  395. data/vendor/lexbor/source/lexbor/html/tokenizer/error.c +28 -0
  396. data/vendor/lexbor/source/lexbor/html/tokenizer/error.h +141 -0
  397. data/vendor/lexbor/source/lexbor/html/tokenizer/res.h +4956 -0
  398. data/vendor/lexbor/source/lexbor/html/tokenizer/state.c +2171 -0
  399. data/vendor/lexbor/source/lexbor/html/tokenizer/state.h +225 -0
  400. data/vendor/lexbor/source/lexbor/html/tokenizer/state_comment.c +489 -0
  401. data/vendor/lexbor/source/lexbor/html/tokenizer/state_comment.h +27 -0
  402. data/vendor/lexbor/source/lexbor/html/tokenizer/state_doctype.c +1654 -0
  403. data/vendor/lexbor/source/lexbor/html/tokenizer/state_doctype.h +27 -0
  404. data/vendor/lexbor/source/lexbor/html/tokenizer/state_rawtext.c +303 -0
  405. data/vendor/lexbor/source/lexbor/html/tokenizer/state_rawtext.h +32 -0
  406. data/vendor/lexbor/source/lexbor/html/tokenizer/state_rcdata.c +311 -0
  407. data/vendor/lexbor/source/lexbor/html/tokenizer/state_rcdata.h +32 -0
  408. data/vendor/lexbor/source/lexbor/html/tokenizer/state_script.c +1209 -0
  409. data/vendor/lexbor/source/lexbor/html/tokenizer/state_script.h +32 -0
  410. data/vendor/lexbor/source/lexbor/html/tokenizer.c +499 -0
  411. data/vendor/lexbor/source/lexbor/html/tokenizer.h +343 -0
  412. data/vendor/lexbor/source/lexbor/html/tree/active_formatting.c +241 -0
  413. data/vendor/lexbor/source/lexbor/html/tree/active_formatting.h +117 -0
  414. data/vendor/lexbor/source/lexbor/html/tree/error.c +26 -0
  415. data/vendor/lexbor/source/lexbor/html/tree/error.h +114 -0
  416. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/after_after_body.c +62 -0
  417. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/after_after_frameset.c +63 -0
  418. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/after_body.c +82 -0
  419. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/after_frameset.c +88 -0
  420. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/after_head.c +222 -0
  421. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/before_head.c +144 -0
  422. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/before_html.c +166 -0
  423. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/foreign_content.c +358 -0
  424. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1974 -0
  425. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_caption.c +158 -0
  426. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_cell.c +187 -0
  427. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_column_group.c +194 -0
  428. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_frameset.c +149 -0
  429. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_head.c +374 -0
  430. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_head_noscript.c +121 -0
  431. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_row.c +211 -0
  432. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_select.c +341 -0
  433. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_select_in_table.c +115 -0
  434. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_table.c +451 -0
  435. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_table_body.c +208 -0
  436. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_table_text.c +127 -0
  437. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_template.c +189 -0
  438. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/initial.c +411 -0
  439. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/text.c +61 -0
  440. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode.h +135 -0
  441. data/vendor/lexbor/source/lexbor/html/tree/open_elements.c +251 -0
  442. data/vendor/lexbor/source/lexbor/html/tree/open_elements.h +105 -0
  443. data/vendor/lexbor/source/lexbor/html/tree/template_insertion.c +10 -0
  444. data/vendor/lexbor/source/lexbor/html/tree/template_insertion.h +100 -0
  445. data/vendor/lexbor/source/lexbor/html/tree.c +1726 -0
  446. data/vendor/lexbor/source/lexbor/html/tree.h +431 -0
  447. data/vendor/lexbor/source/lexbor/html/tree_res.h +111 -0
  448. data/vendor/lexbor/source/lexbor/ns/base.h +32 -0
  449. data/vendor/lexbor/source/lexbor/ns/config.cmake +2 -0
  450. data/vendor/lexbor/source/lexbor/ns/const.h +37 -0
  451. data/vendor/lexbor/source/lexbor/ns/ns.c +154 -0
  452. data/vendor/lexbor/source/lexbor/ns/ns.h +66 -0
  453. data/vendor/lexbor/source/lexbor/ns/res.h +97 -0
  454. data/vendor/lexbor/source/lexbor/ports/posix/config.cmake +11 -0
  455. data/vendor/lexbor/source/lexbor/ports/posix/lexbor/core/fs.c +236 -0
  456. data/vendor/lexbor/source/lexbor/ports/posix/lexbor/core/memory.c +33 -0
  457. data/vendor/lexbor/source/lexbor/ports/posix/lexbor/core/perf.c +158 -0
  458. data/vendor/lexbor/source/lexbor/ports/windows_nt/config.cmake +18 -0
  459. data/vendor/lexbor/source/lexbor/ports/windows_nt/lexbor/core/fs.c +239 -0
  460. data/vendor/lexbor/source/lexbor/ports/windows_nt/lexbor/core/memory.c +33 -0
  461. data/vendor/lexbor/source/lexbor/ports/windows_nt/lexbor/core/perf.c +81 -0
  462. data/vendor/lexbor/source/lexbor/selectors/base.h +30 -0
  463. data/vendor/lexbor/source/lexbor/selectors/config.cmake +2 -0
  464. data/vendor/lexbor/source/lexbor/selectors/selectors.c +1591 -0
  465. data/vendor/lexbor/source/lexbor/selectors/selectors.h +71 -0
  466. data/vendor/lexbor/source/lexbor/tag/base.h +32 -0
  467. data/vendor/lexbor/source/lexbor/tag/config.cmake +2 -0
  468. data/vendor/lexbor/source/lexbor/tag/const.h +225 -0
  469. data/vendor/lexbor/source/lexbor/tag/res.h +562 -0
  470. data/vendor/lexbor/source/lexbor/tag/tag.c +144 -0
  471. data/vendor/lexbor/source/lexbor/tag/tag.h +123 -0
  472. data/vendor/lexbor/source/lexbor/utils/base.h +32 -0
  473. data/vendor/lexbor/source/lexbor/utils/config.cmake +2 -0
  474. data/vendor/lexbor/source/lexbor/utils/http.c +534 -0
  475. data/vendor/lexbor/source/lexbor/utils/http.h +90 -0
  476. data/vendor/lexbor/source/lexbor/utils/utils.h +15 -0
  477. data/vendor/lexbor/source/lexbor/utils/warc.c +817 -0
  478. data/vendor/lexbor/source/lexbor/utils/warc.h +126 -0
  479. data/vendor/lexbor/utils/lexbor/css/selectors/pseudo.py +231 -0
  480. data/vendor/lexbor/utils/lexbor/css/selectors/tmp/const.h +21 -0
  481. data/vendor/lexbor/utils/lexbor/css/selectors/tmp/res.h +26 -0
  482. data/vendor/lexbor/utils/lexbor/css/syntax/definitions.py +49 -0
  483. data/vendor/lexbor/utils/lexbor/css/syntax/token_res.py +54 -0
  484. data/vendor/lexbor/utils/lexbor/css/syntax/tokenizer_code_map.py +36 -0
  485. data/vendor/lexbor/version +1 -0
  486. metadata +542 -0
@@ -0,0 +1,2171 @@
1
+ /*
2
+ * Copyright (C) 2018-2020 Alexander Borisov
3
+ *
4
+ * Author: Alexander Borisov <borisov@lexbor.com>
5
+ */
6
+
7
+ #include "lexbor/html/tokenizer/state.h"
8
+ #include "lexbor/html/tokenizer/state_comment.h"
9
+ #include "lexbor/html/tokenizer/state_doctype.h"
10
+
11
+ #define LEXBOR_STR_RES_ANSI_REPLACEMENT_CHARACTER
12
+ #define LEXBOR_STR_RES_ALPHANUMERIC_CHARACTER
13
+ #define LEXBOR_STR_RES_REPLACEMENT_CHARACTER
14
+ #define LEXBOR_STR_RES_ALPHA_CHARACTER
15
+ #define LEXBOR_STR_RES_MAP_HEX
16
+ #define LEXBOR_STR_RES_MAP_NUM
17
+ #include "lexbor/core/str_res.h"
18
+
19
+ #define LXB_HTML_TOKENIZER_RES_ENTITIES_SBST
20
+ #include "lexbor/html/tokenizer/res.h"
21
+
22
+
23
+ const lxb_tag_data_t *
24
+ lxb_tag_append_lower(lexbor_hash_t *hash,
25
+ const lxb_char_t *name, size_t length);
26
+
27
+ lxb_dom_attr_data_t *
28
+ lxb_dom_attr_local_name_append(lexbor_hash_t *hash,
29
+ const lxb_char_t *name, size_t length);
30
+
31
+
32
+ static const lxb_char_t *
33
+ lxb_html_tokenizer_state_data(lxb_html_tokenizer_t *tkz,
34
+ const lxb_char_t *data, const lxb_char_t *end);
35
+
36
+ static const lxb_char_t *
37
+ lxb_html_tokenizer_state_plaintext(lxb_html_tokenizer_t *tkz,
38
+ const lxb_char_t *data,
39
+ const lxb_char_t *end);
40
+
41
+ /* Tag */
42
+ static const lxb_char_t *
43
+ lxb_html_tokenizer_state_tag_open(lxb_html_tokenizer_t *tkz,
44
+ const lxb_char_t *data,
45
+ const lxb_char_t *end);
46
+
47
+ static const lxb_char_t *
48
+ lxb_html_tokenizer_state_end_tag_open(lxb_html_tokenizer_t *tkz,
49
+ const lxb_char_t *data,
50
+ const lxb_char_t *end);
51
+
52
+ static const lxb_char_t *
53
+ lxb_html_tokenizer_state_tag_name(lxb_html_tokenizer_t *tkz,
54
+ const lxb_char_t *data,
55
+ const lxb_char_t *end);
56
+
57
+ /* Attribute */
58
+ static const lxb_char_t *
59
+ lxb_html_tokenizer_state_attribute_name(lxb_html_tokenizer_t *tkz,
60
+ const lxb_char_t *data,
61
+ const lxb_char_t *end);
62
+
63
+ static const lxb_char_t *
64
+ lxb_html_tokenizer_state_after_attribute_name(lxb_html_tokenizer_t *tkz,
65
+ const lxb_char_t *data,
66
+ const lxb_char_t *end);
67
+
68
+ static const lxb_char_t *
69
+ lxb_html_tokenizer_state_before_attribute_value(lxb_html_tokenizer_t *tkz,
70
+ const lxb_char_t *data,
71
+ const lxb_char_t *end);
72
+
73
+ static const lxb_char_t *
74
+ lxb_html_tokenizer_state_attribute_value_double_quoted(lxb_html_tokenizer_t *tkz,
75
+ const lxb_char_t *data,
76
+ const lxb_char_t *end);
77
+
78
+ static const lxb_char_t *
79
+ lxb_html_tokenizer_state_attribute_value_single_quoted(lxb_html_tokenizer_t *tkz,
80
+ const lxb_char_t *data,
81
+ const lxb_char_t *end);
82
+
83
+ static const lxb_char_t *
84
+ lxb_html_tokenizer_state_attribute_value_unquoted(lxb_html_tokenizer_t *tkz,
85
+ const lxb_char_t *data,
86
+ const lxb_char_t *end);
87
+
88
+ static const lxb_char_t *
89
+ lxb_html_tokenizer_state_after_attribute_value_quoted(lxb_html_tokenizer_t *tkz,
90
+ const lxb_char_t *data,
91
+ const lxb_char_t *end);
92
+
93
+ static const lxb_char_t *
94
+ lxb_html_tokenizer_state_bogus_comment_before(lxb_html_tokenizer_t *tkz,
95
+ const lxb_char_t *data,
96
+ const lxb_char_t *end);
97
+
98
+ static const lxb_char_t *
99
+ lxb_html_tokenizer_state_bogus_comment(lxb_html_tokenizer_t *tkz,
100
+ const lxb_char_t *data,
101
+ const lxb_char_t *end);
102
+
103
+ /* Markup declaration */
104
+ static const lxb_char_t *
105
+ lxb_html_tokenizer_state_markup_declaration_open(lxb_html_tokenizer_t *tkz,
106
+ const lxb_char_t *data,
107
+ const lxb_char_t *end);
108
+
109
+ static const lxb_char_t *
110
+ lxb_html_tokenizer_state_markup_declaration_comment(lxb_html_tokenizer_t *tkz,
111
+ const lxb_char_t *data,
112
+ const lxb_char_t *end);
113
+
114
+ static const lxb_char_t *
115
+ lxb_html_tokenizer_state_markup_declaration_doctype(lxb_html_tokenizer_t *tkz,
116
+ const lxb_char_t *data,
117
+ const lxb_char_t *end);
118
+
119
+ static const lxb_char_t *
120
+ lxb_html_tokenizer_state_markup_declaration_cdata(lxb_html_tokenizer_t *tkz,
121
+ const lxb_char_t *data,
122
+ const lxb_char_t *end);
123
+
124
+ /* CDATA Section */
125
+ static const lxb_char_t *
126
+ lxb_html_tokenizer_state_cdata_section_before(lxb_html_tokenizer_t *tkz,
127
+ const lxb_char_t *data,
128
+ const lxb_char_t *end);
129
+
130
+ static const lxb_char_t *
131
+ lxb_html_tokenizer_state_cdata_section(lxb_html_tokenizer_t *tkz,
132
+ const lxb_char_t *data,
133
+ const lxb_char_t *end);
134
+
135
+ static const lxb_char_t *
136
+ lxb_html_tokenizer_state_cdata_section_bracket(lxb_html_tokenizer_t *tkz,
137
+ const lxb_char_t *data,
138
+ const lxb_char_t *end);
139
+
140
+ static const lxb_char_t *
141
+ lxb_html_tokenizer_state_cdata_section_end(lxb_html_tokenizer_t *tkz,
142
+ const lxb_char_t *data,
143
+ const lxb_char_t *end);
144
+
145
+ static const lxb_char_t *
146
+ lxb_html_tokenizer_state_char_ref_attr(lxb_html_tokenizer_t *tkz,
147
+ const lxb_char_t *data,
148
+ const lxb_char_t *end);
149
+
150
+ static const lxb_char_t *
151
+ _lxb_html_tokenizer_state_char_ref(lxb_html_tokenizer_t *tkz,
152
+ const lxb_char_t *data,
153
+ const lxb_char_t *end);
154
+
155
+ static const lxb_char_t *
156
+ lxb_html_tokenizer_state_char_ref_named(lxb_html_tokenizer_t *tkz,
157
+ const lxb_char_t *data,
158
+ const lxb_char_t *end);
159
+
160
+ static const lxb_char_t *
161
+ lxb_html_tokenizer_state_char_ref_ambiguous_ampersand(lxb_html_tokenizer_t *tkz,
162
+ const lxb_char_t *data,
163
+ const lxb_char_t *end);
164
+
165
+ static const lxb_char_t *
166
+ lxb_html_tokenizer_state_char_ref_numeric(lxb_html_tokenizer_t *tkz,
167
+ const lxb_char_t *data,
168
+ const lxb_char_t *end);
169
+
170
+ static const lxb_char_t *
171
+ lxb_html_tokenizer_state_char_ref_hexademical_start(lxb_html_tokenizer_t *tkz,
172
+ const lxb_char_t *data,
173
+ const lxb_char_t *end);
174
+
175
+ static const lxb_char_t *
176
+ lxb_html_tokenizer_state_char_ref_decimal_start(lxb_html_tokenizer_t *tkz,
177
+ const lxb_char_t *data,
178
+ const lxb_char_t *end);
179
+
180
+ static const lxb_char_t *
181
+ lxb_html_tokenizer_state_char_ref_hexademical(lxb_html_tokenizer_t *tkz,
182
+ const lxb_char_t *data,
183
+ const lxb_char_t *end);
184
+
185
+ static const lxb_char_t *
186
+ lxb_html_tokenizer_state_char_ref_decimal(lxb_html_tokenizer_t *tkz,
187
+ const lxb_char_t *data,
188
+ const lxb_char_t *end);
189
+
190
+ static const lxb_char_t *
191
+ lxb_html_tokenizer_state_char_ref_numeric_end(lxb_html_tokenizer_t *tkz,
192
+ const lxb_char_t *data,
193
+ const lxb_char_t *end);
194
+
195
+ static size_t
196
+ lxb_html_tokenizer_state_to_ascii_utf_8(size_t codepoint, lxb_char_t *data);
197
+
198
+
199
+ /*
200
+ * Helper function. Not in the specification. Entry point for the 12.2.5.1 Data state.
+ *
+ * Unless the tokenizer is at EOF, marks `data` as the beginning of the
+ * current token. Then switches tkz->state to the Data state and returns
+ * `data` unchanged, i.e. no input is consumed by this helper.
201
+ */
202
+ const lxb_char_t *
203
+ lxb_html_tokenizer_state_data_before(lxb_html_tokenizer_t *tkz,
204
+ const lxb_char_t *data,
205
+ const lxb_char_t *end)
206
+ {
207
+ if (tkz->is_eof == false) {
208
+ lxb_html_tokenizer_state_token_set_begin(tkz, data);
209
+ }
210
+
211
+ /*
212
+ * The text node init parameters are set right before the token is emitted.
213
+ */
214
+
215
+ tkz->state = lxb_html_tokenizer_state_data;
216
+
217
+ return data;
218
+ }
219
+
220
+ /*
221
+ * 12.2.5.1 Data state
+ *
+ * Scans [data, end) and reacts to the first significant byte:
+ *   '<'  - appends the pending bytes, records the token end and switches to
+ *          the Tag open state; returns the position after '<'.
+ *   '&'  - switches to the character reference state with the Data state as
+ *          tkz->state_return; returns the position after '&'.
+ *   CR   - if CR is the last byte of the chunk, defers to
+ *          lxb_html_tokenizer_state_cr; otherwise the last appended byte is
+ *          overwritten with LF (tkz->pos[-1] = 0x0A) and a directly following
+ *          LF is skipped by moving the begin mark past it.
+ *   NUL  - at EOF a non-empty pending token is emitted as LXB_TAG__TEXT and
+ *          `end` is returned; otherwise token->null_count is incremented
+ *          (guarded against SIZE_MAX overflow) and an UNNUCH parse error is
+ *          recorded.
+ * When the chunk is exhausted the pending bytes are appended and `end` is
+ * returned with the state unchanged.
+ *
+ * NOTE(review): the *_m helpers are macros defined outside this file and
+ * presumably return early on failure - confirm in state.h.
222
+ */
223
+ static const lxb_char_t *
224
+ lxb_html_tokenizer_state_data(lxb_html_tokenizer_t *tkz,
225
+ const lxb_char_t *data, const lxb_char_t *end)
226
+ {
227
+ lxb_html_tokenizer_state_begin_set(tkz, data);
228
+
229
+ while (data != end) {
230
+ switch (*data) {
231
+ /* U+003C LESS-THAN SIGN (<) */
232
+ case 0x3C:
233
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
234
+ lxb_html_tokenizer_state_token_set_end(tkz, data);
235
+
236
+ tkz->state = lxb_html_tokenizer_state_tag_open;
237
+ return (data + 1);
238
+
239
+ /* U+0026 AMPERSAND (&) */
240
+ case 0x26:
241
+ lxb_html_tokenizer_state_append_data_m(tkz, data + 1);
242
+
243
+ tkz->state = lxb_html_tokenizer_state_char_ref;
244
+ tkz->state_return = lxb_html_tokenizer_state_data;
245
+
246
+ return data + 1;
247
+
248
+ /* U+000D CARRIAGE RETURN (CR) */
249
+ case 0x0D:
250
+ if (++data >= end) {
251
+ lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
252
+
253
+ tkz->state = lxb_html_tokenizer_state_cr;
254
+ tkz->state_return = lxb_html_tokenizer_state_data;
255
+
256
+ return data;
257
+ }
258
+
259
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
260
+ tkz->pos[-1] = 0x0A;
261
+
262
+ lxb_html_tokenizer_state_begin_set(tkz, data + 1);
263
+
264
+ if (*data != 0x0A) {
265
+ lxb_html_tokenizer_state_begin_set(tkz, data);
266
+ data--;
267
+ }
268
+
269
+ break;
270
+
271
+ /*
272
+ * U+0000 NULL
273
+ * EOF
274
+ */
275
+ case 0x00:
276
+ if (tkz->is_eof) {
277
+ /* Emit TEXT node if not empty */
278
+ if (tkz->token->begin != NULL) {
279
+ lxb_html_tokenizer_state_token_set_end_oef(tkz);
280
+ }
281
+
282
+ if (tkz->token->begin != tkz->token->end) {
283
+ tkz->token->tag_id = LXB_TAG__TEXT;
284
+
285
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
286
+
287
+ lxb_html_tokenizer_state_set_text(tkz);
288
+ lxb_html_tokenizer_state_token_done_wo_check_m(tkz,end);
289
+ }
290
+
291
+ return end;
292
+ }
293
+
294
+ if (SIZE_MAX - tkz->token->null_count < 1) {
295
+ tkz->status = LXB_STATUS_ERROR_OVERFLOW;
296
+ return end;
297
+ }
298
+
299
+ tkz->token->null_count++;
300
+
301
+ lxb_html_tokenizer_error_add(tkz->parse_errors, data,
302
+ LXB_HTML_TOKENIZER_ERROR_UNNUCH);
303
+ break;
304
+ }
305
+
306
+ data++;
307
+ }
308
+
309
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
310
+
311
+ return data;
312
+ }
313
+
314
+ /*
315
+ * Helper function. Not in the specification. Entry point for the 12.2.5.5 PLAINTEXT state.
+ *
+ * Unless the tokenizer is at EOF, marks `data` as the beginning of the
+ * current token. Tags the token as LXB_TAG__TEXT, switches tkz->state to the
+ * PLAINTEXT state and returns `data` unchanged (no input is consumed).
316
+ */
317
+ const lxb_char_t *
318
+ lxb_html_tokenizer_state_plaintext_before(lxb_html_tokenizer_t *tkz,
319
+ const lxb_char_t *data,
320
+ const lxb_char_t *end)
321
+ {
322
+ if (tkz->is_eof == false) {
323
+ lxb_html_tokenizer_state_token_set_begin(tkz, data);
324
+ }
325
+
326
+ tkz->token->tag_id = LXB_TAG__TEXT;
327
+
328
+ tkz->state = lxb_html_tokenizer_state_plaintext;
329
+
330
+ return data;
331
+ }
332
+
333
+ /*
334
+ * 12.2.5.5 PLAINTEXT state
+ *
+ * Consumes everything up to `end` as text; the state never changes except
+ * for the CR-at-end-of-chunk case, which defers to
+ * lxb_html_tokenizer_state_cr with this state as tkz->state_return.
+ *   CR   - the last appended byte is overwritten with LF and a directly
+ *          following LF is skipped (same handling as in the Data state).
+ *   NUL  - at EOF the token end is set (if a begin was recorded), the text
+ *          is attached and the token is finished, returning `end`;
+ *          otherwise the replacement sequence is appended in place of the
+ *          NUL and an UNNUCH parse error is recorded.
+ * Returns `end` once the chunk is exhausted.
335
+ */
336
+ static const lxb_char_t *
337
+ lxb_html_tokenizer_state_plaintext(lxb_html_tokenizer_t *tkz,
338
+ const lxb_char_t *data,
339
+ const lxb_char_t *end)
340
+ {
341
+ lxb_html_tokenizer_state_begin_set(tkz, data);
342
+
343
+ while (data != end) {
344
+ switch (*data) {
345
+ /* U+000D CARRIAGE RETURN (CR) */
346
+ case 0x0D:
347
+ if (++data >= end) {
348
+ lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
349
+
350
+ tkz->state = lxb_html_tokenizer_state_cr;
351
+ tkz->state_return = lxb_html_tokenizer_state_plaintext;
352
+
353
+ return data;
354
+ }
355
+
356
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
357
+ tkz->pos[-1] = 0x0A;
358
+
359
+ lxb_html_tokenizer_state_begin_set(tkz, data + 1);
360
+
361
+ if (*data != 0x0A) {
362
+ lxb_html_tokenizer_state_begin_set(tkz, data);
363
+ data--;
364
+ }
365
+
366
+ break;
367
+
368
+ /*
369
+ * U+0000 NULL
370
+ * EOF
371
+ */
372
+ case 0x00:
373
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
374
+
375
+ if (tkz->is_eof) {
376
+ if (tkz->token->begin != NULL) {
377
+ lxb_html_tokenizer_state_token_set_end_oef(tkz);
378
+ }
379
+
380
+ lxb_html_tokenizer_state_set_text(tkz);
381
+ lxb_html_tokenizer_state_token_done_m(tkz, end);
382
+
383
+ return end;
384
+ }
385
+
386
+ lxb_html_tokenizer_state_begin_set(tkz, data + 1);
387
+ lxb_html_tokenizer_state_append_replace_m(tkz);
388
+
389
+ lxb_html_tokenizer_error_add(tkz->parse_errors, data,
390
+ LXB_HTML_TOKENIZER_ERROR_UNNUCH);
391
+ break;
392
+ }
393
+
394
+ data++;
395
+ }
396
+
397
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
398
+
399
+ return data;
400
+ }
401
+
402
+ /*
403
+ * 12.2.5.6 Tag open state
+ *
+ * Inspects the single byte at `data` (the byte following '<'):
+ *   ASCII alpha - emits pending non-empty text, starts a new token at `data`
+ *                 and switches to the Tag name state without consuming.
+ *   '/'         - switches to the End tag open state, consuming the byte.
+ *   '!'         - emits pending text and switches to the Markup declaration
+ *                 open state, consuming the byte.
+ *   '?'         - emits pending text, starts a token at `data`, records an
+ *                 UNQUMAINOFTANA parse error and switches to the bogus
+ *                 comment (before) state without consuming.
+ *   NUL at EOF  - appends a literal "<" to the text, emits it, records an
+ *                 EOBETANA parse error and returns `end`.
+ * Anything else: a literal "<" is appended to the text, an INFICHOFTANA
+ * parse error is recorded and the byte is reprocessed in the Data state.
+ *
+ * `data` is dereferenced unconditionally, so the caller must pass
+ * data != end.
404
+ */
405
+ static const lxb_char_t *
406
+ lxb_html_tokenizer_state_tag_open(lxb_html_tokenizer_t *tkz,
407
+ const lxb_char_t *data, const lxb_char_t *end)
408
+ {
409
+ /* ASCII alpha */
410
+ if (lexbor_str_res_alpha_character[ *data ] != LEXBOR_STR_RES_SLIP) {
411
+ tkz->state = lxb_html_tokenizer_state_tag_name;
412
+
413
+ lxb_html_tokenizer_state_token_emit_text_not_empty_m(tkz, end);
414
+ lxb_html_tokenizer_state_token_set_begin(tkz, data);
415
+
416
+ return data;
417
+ }
418
+
419
+ /* U+002F SOLIDUS (/) */
420
+ else if (*data == 0x2F) {
421
+ tkz->state = lxb_html_tokenizer_state_end_tag_open;
422
+
423
+ return (data + 1);
424
+ }
425
+
426
+ /* U+0021 EXCLAMATION MARK (!) */
427
+ else if (*data == 0x21) {
428
+ tkz->state = lxb_html_tokenizer_state_markup_declaration_open;
429
+
430
+ lxb_html_tokenizer_state_token_emit_text_not_empty_m(tkz, end);
431
+
432
+ return (data + 1);
433
+ }
434
+
435
+ /* U+003F QUESTION MARK (?) */
436
+ else if (*data == 0x3F) {
437
+ tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
438
+
439
+ lxb_html_tokenizer_state_token_emit_text_not_empty_m(tkz, end);
440
+ lxb_html_tokenizer_state_token_set_begin(tkz, data);
441
+
442
+ lxb_html_tokenizer_error_add(tkz->parse_errors, data,
443
+ LXB_HTML_TOKENIZER_ERROR_UNQUMAINOFTANA);
444
+
445
+ return data;
446
+ }
447
+
448
+ /* EOF */
449
+ else if (*data == 0x00) {
450
+ if (tkz->is_eof) {
451
+ lxb_html_tokenizer_state_append_m(tkz, "<", 1);
452
+
453
+ lxb_html_tokenizer_state_token_set_end_oef(tkz);
454
+ lxb_html_tokenizer_state_token_emit_text_not_empty_m(tkz, end);
455
+
456
+ lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->token->end,
457
+ LXB_HTML_TOKENIZER_ERROR_EOBETANA);
458
+
459
+ return end;
460
+ }
461
+ }
462
+
463
+ lxb_html_tokenizer_state_append_m(tkz, "<", 1);
464
+
465
+ lxb_html_tokenizer_error_add(tkz->parse_errors, data,
466
+ LXB_HTML_TOKENIZER_ERROR_INFICHOFTANA);
467
+
468
+ tkz->state = lxb_html_tokenizer_state_data;
469
+
470
+ return data;
471
+ }
472
+
473
+ /*
474
+ * 12.2.5.7 End tag open state
+ *
+ * Inspects the single byte at `data` (the byte following "</"):
+ *   ASCII alpha - emits pending non-empty text, starts a new token at
+ *                 `data`, flags it LXB_HTML_TOKEN_TYPE_CLOSE and switches to
+ *                 the Tag name state without consuming.
+ *   '>'         - records a MIENTANA parse error and returns to the Data
+ *                 state, consuming the byte.
+ *   NUL at EOF  - appends a literal "</" to the text, emits it, records an
+ *                 EOBETANA parse error and returns `end`.
+ * Anything else: records an INFICHOFTANA parse error, emits pending text,
+ * starts a token at `data` and reprocesses the byte in the bogus comment
+ * (before) state.
+ *
+ * `data` is dereferenced unconditionally, so the caller must pass
+ * data != end.
475
+ */
476
+ static const lxb_char_t *
477
+ lxb_html_tokenizer_state_end_tag_open(lxb_html_tokenizer_t *tkz,
478
+ const lxb_char_t *data,
479
+ const lxb_char_t *end)
480
+ {
481
+ /* ASCII alpha */
482
+ if (lexbor_str_res_alpha_character[ *data ] != LEXBOR_STR_RES_SLIP) {
483
+ tkz->state = lxb_html_tokenizer_state_tag_name;
484
+
485
+ lxb_html_tokenizer_state_token_emit_text_not_empty_m(tkz, end);
486
+ lxb_html_tokenizer_state_token_set_begin(tkz, data);
487
+
488
+ tkz->token->type |= LXB_HTML_TOKEN_TYPE_CLOSE;
489
+
490
+ return data;
491
+ }
492
+
493
+ /* U+003E GREATER-THAN SIGN (>) */
494
+ else if (*data == 0x3E) {
495
+ tkz->state = lxb_html_tokenizer_state_data;
496
+
497
+ lxb_html_tokenizer_error_add(tkz->parse_errors, data,
498
+ LXB_HTML_TOKENIZER_ERROR_MIENTANA);
499
+
500
+ return (data + 1);
501
+ }
502
+
503
+ /* Fake EOF */
504
+ else if (*data == 0x00) {
505
+ if (tkz->is_eof) {
506
+ lxb_html_tokenizer_state_append_m(tkz, "</", 2);
507
+
508
+ lxb_html_tokenizer_state_token_set_end_oef(tkz);
509
+ lxb_html_tokenizer_state_token_emit_text_not_empty_m(tkz, end);
510
+
511
+ lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->token->end,
512
+ LXB_HTML_TOKENIZER_ERROR_EOBETANA);
513
+
514
+ return end;
515
+ }
516
+ }
517
+
518
+ tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
519
+
520
+ lxb_html_tokenizer_error_add(tkz->parse_errors, data,
521
+ LXB_HTML_TOKENIZER_ERROR_INFICHOFTANA);
522
+
523
+ lxb_html_tokenizer_state_token_emit_text_not_empty_m(tkz, end);
524
+ lxb_html_tokenizer_state_token_set_begin(tkz, data);
525
+
526
+ return data;
527
+ }
528
+
529
+ /*
530
+ * 12.2.5.8 Tag name state
+ *
+ * Scans the tag name until a terminator is found:
+ *   tab, LF, FF, CR, space - finishes the name (set_tag_m over
+ *                            tkz->start..tkz->pos), records the token end
+ *                            and moves to the Before attribute name state.
+ *   '/'                    - same, then moves to the Self-closing start tag
+ *                            state.
+ *   '>'                    - same, then finishes the token and moves to the
+ *                            data_before helper state.
+ *   NUL                    - at EOF records the token end and an EOINTA
+ *                            parse error and returns `end`; otherwise the
+ *                            replacement sequence is appended in place of
+ *                            the NUL and an UNNUCH parse error is recorded.
+ * Each terminator is consumed (data + 1 is returned). If the chunk ends
+ * first, the bytes seen so far are appended and `end` is returned with the
+ * state unchanged.
531
+ */
532
+ static const lxb_char_t *
533
+ lxb_html_tokenizer_state_tag_name(lxb_html_tokenizer_t *tkz,
534
+ const lxb_char_t *data, const lxb_char_t *end)
535
+ {
536
+ lxb_html_tokenizer_state_begin_set(tkz, data);
537
+
538
+ while (data != end) {
539
+ switch (*data) {
540
+ /*
541
+ * U+0009 CHARACTER TABULATION (tab)
542
+ * U+000A LINE FEED (LF)
543
+ * U+000C FORM FEED (FF)
544
+ * U+000D CARRIAGE RETURN (CR)
545
+ * U+0020 SPACE
546
+ */
547
+ case 0x09:
548
+ case 0x0A:
549
+ case 0x0C:
550
+ case 0x0D:
551
+ case 0x20:
552
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
553
+ lxb_html_tokenizer_state_set_tag_m(tkz, tkz->start, tkz->pos);
554
+ lxb_html_tokenizer_state_token_set_end(tkz, data);
555
+
556
+ tkz->state = lxb_html_tokenizer_state_before_attribute_name;
557
+ return (data + 1);
558
+
559
+ /* U+002F SOLIDUS (/) */
560
+ case 0x2F:
561
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
562
+ lxb_html_tokenizer_state_set_tag_m(tkz, tkz->start, tkz->pos);
563
+ lxb_html_tokenizer_state_token_set_end(tkz, data);
564
+
565
+ tkz->state = lxb_html_tokenizer_state_self_closing_start_tag;
566
+ return (data + 1);
567
+
568
+ /* U+003E GREATER-THAN SIGN (>) */
569
+ case 0x3E:
570
+ tkz->state = lxb_html_tokenizer_state_data_before;
571
+
572
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
573
+ lxb_html_tokenizer_state_set_tag_m(tkz, tkz->start, tkz->pos);
574
+ lxb_html_tokenizer_state_token_set_end(tkz, data);
575
+ lxb_html_tokenizer_state_token_done_m(tkz, end);
576
+
577
+ return (data + 1);
578
+
579
+ /* U+0000 NULL */
580
+ case 0x00:
581
+ if (tkz->is_eof) {
582
+ lxb_html_tokenizer_state_token_set_end_oef(tkz);
583
+
584
+ lxb_html_tokenizer_error_add(tkz->parse_errors,
585
+ tkz->token->end,
586
+ LXB_HTML_TOKENIZER_ERROR_EOINTA);
587
+ return end;
588
+ }
589
+
590
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
591
+ lxb_html_tokenizer_state_begin_set(tkz, data + 1);
592
+ lxb_html_tokenizer_state_append_replace_m(tkz);
593
+
594
+ lxb_html_tokenizer_error_add(tkz->parse_errors, data,
595
+ LXB_HTML_TOKENIZER_ERROR_UNNUCH);
596
+ break;
597
+
598
+ default:
599
+ break;
600
+ }
601
+
602
+ data++;
603
+ }
604
+
605
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
606
+
607
+ return data;
608
+ }
609
+
610
+ /*
611
+ * 12.2.5.32 Before attribute name state
+ *
+ * Skips tab, LF, FF, CR and space, then dispatches on the first other byte:
+ *   '='          - adds a new attribute whose name begins at `data`, appends
+ *                  the '=' to the name, records an UNEQSIBEATNA parse error
+ *                  and moves to the Attribute name state, consuming the byte.
+ *   '/' or '>'   - reprocessed in the After attribute name state.
+ *   NUL at EOF   - reprocessed in the After attribute name state.
+ *   anything else (including a NUL that is not EOF) - adds a new attribute
+ *                  whose name begins at `data` and reprocesses the byte in
+ *                  the Attribute name state.
+ * Returns `end` with the state unchanged if only whitespace was seen.
612
+ */
613
+ const lxb_char_t *
614
+ lxb_html_tokenizer_state_before_attribute_name(lxb_html_tokenizer_t *tkz,
615
+ const lxb_char_t *data,
616
+ const lxb_char_t *end)
617
+ {
618
+ lxb_html_token_attr_t *attr;
619
+
620
+ while (data != end) {
621
+ switch (*data) {
622
+ /*
623
+ * U+0009 CHARACTER TABULATION (tab)
624
+ * U+000A LINE FEED (LF)
625
+ * U+000C FORM FEED (FF)
626
+ * U+000D CARRIAGE RETURN (CR)
627
+ * U+0020 SPACE
628
+ */
629
+ case 0x09:
630
+ case 0x0A:
631
+ case 0x0C:
632
+ case 0x0D:
633
+ case 0x20:
634
+ break;
635
+
636
+ /* U+003D EQUALS SIGN (=) */
637
+ case 0x3D:
638
+ lxb_html_tokenizer_state_token_attr_add_m(tkz, attr, end);
639
+ lxb_html_tokenizer_state_token_attr_set_name_begin(tkz, data);
640
+
641
+ lxb_html_tokenizer_state_append_m(tkz, data, 1);
642
+
643
+ lxb_html_tokenizer_error_add(tkz->parse_errors, data,
644
+ LXB_HTML_TOKENIZER_ERROR_UNEQSIBEATNA);
645
+
646
+ tkz->state = lxb_html_tokenizer_state_attribute_name;
647
+ return (data + 1);
648
+
649
+ /*
650
+ * U+002F SOLIDUS (/)
651
+ * U+003E GREATER-THAN SIGN (>)
652
+ */
653
+ case 0x2F:
654
+ case 0x3E:
655
+ tkz->state = lxb_html_tokenizer_state_after_attribute_name;
656
+ return data;
657
+
658
+ /* EOF */
659
+ case 0x00:
660
+ if (tkz->is_eof) {
661
+ tkz->state = lxb_html_tokenizer_state_after_attribute_name;
662
+ return data;
663
+ }
664
+ /* fall through */
665
+
666
+ /* Anything else */
667
+ default:
668
+ lxb_html_tokenizer_state_token_attr_add_m(tkz, attr, end);
669
+ lxb_html_tokenizer_state_token_attr_set_name_begin(tkz, data);
670
+
671
+ tkz->state = lxb_html_tokenizer_state_attribute_name;
672
+ return data;
673
+ }
674
+
675
+ data++;
676
+ }
677
+
678
+ return data;
679
+ }
680
+
681
+ /*
682
+ * 12.2.5.33 Attribute name state
+ *
+ * Scans the attribute name until a terminator is found:
+ *   tab, LF, FF, CR, space, '/', '>' - finishes the name, records the name
+ *            end and reprocesses the byte in the After attribute name state.
+ *   '='    - finishes the name, records the name end and moves to the Before
+ *            attribute value state, consuming the byte.
+ *   NUL    - at EOF records the name end and reprocesses in the After
+ *            attribute name state; otherwise the replacement sequence is
+ *            appended in place of the NUL and an UNNUCH parse error is
+ *            recorded.
+ *   '"', ''', '<' - records an UNCHINATNA parse error at the offending byte;
+ *            the byte still becomes part of the name.
+ * If the chunk ends first, the bytes seen so far are appended and `end` is
+ * returned with the state unchanged.
683
+ */
684
+ static const lxb_char_t *
685
+ lxb_html_tokenizer_state_attribute_name(lxb_html_tokenizer_t *tkz,
686
+ const lxb_char_t *data,
687
+ const lxb_char_t *end)
688
+ {
689
+ lxb_html_tokenizer_state_begin_set(tkz, data);
690
+
691
+ while (data != end) {
692
+ switch (*data) {
693
+ /*
694
+ * U+0009 CHARACTER TABULATION (tab)
695
+ * U+000A LINE FEED (LF)
696
+ * U+000C FORM FEED (FF)
697
+ * U+000D CARRIAGE RETURN (CR)
698
+ * U+0020 SPACE
699
+ * U+002F SOLIDUS (/)
700
+ * U+003E GREATER-THAN SIGN (>)
701
+ */
702
+ case 0x09:
703
+ case 0x0A:
704
+ case 0x0C:
705
+ case 0x0D:
706
+ case 0x20:
707
+ case 0x2F:
708
+ case 0x3E:
709
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
710
+ lxb_html_tokenizer_state_set_name_m(tkz);
711
+ lxb_html_tokenizer_state_token_attr_set_name_end(tkz, data);
712
+
713
+ tkz->state = lxb_html_tokenizer_state_after_attribute_name;
714
+ return data;
715
+
716
+ /*
717
+ * U+0000 NULL
718
+ * EOF
719
+ */
720
+ case 0x00:
721
+ if (tkz->is_eof) {
722
+ lxb_html_tokenizer_state_token_attr_set_name_end_oef(tkz);
723
+
724
+ tkz->state = lxb_html_tokenizer_state_after_attribute_name;
725
+ return data;
726
+ }
727
+
728
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
729
+ lxb_html_tokenizer_state_begin_set(tkz, data + 1);
730
+ lxb_html_tokenizer_state_append_replace_m(tkz);
731
+
732
+ lxb_html_tokenizer_error_add(tkz->parse_errors, data,
733
+ LXB_HTML_TOKENIZER_ERROR_UNNUCH);
734
+ break;
735
+
736
+ /* U+003D EQUALS SIGN (=) */
737
+ case 0x3D:
738
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
739
+ lxb_html_tokenizer_state_set_name_m(tkz);
740
+ lxb_html_tokenizer_state_token_attr_set_name_end(tkz, data);
741
+
742
+ tkz->state = lxb_html_tokenizer_state_before_attribute_value;
743
+ return (data + 1);
744
+
745
+ /*
746
+ * U+0022 QUOTATION MARK (")
747
+ * U+0027 APOSTROPHE (')
748
+ * U+003C LESS-THAN SIGN (<)
749
+ */
750
+ case 0x22:
751
+ case 0x27:
752
+ case 0x3C:
753
+ lxb_html_tokenizer_error_add(tkz->parse_errors, data,
754
+ LXB_HTML_TOKENIZER_ERROR_UNCHINATNA);
755
+ break;
756
+
757
+ default:
758
+ break;
759
+ }
760
+
761
+ data++;
762
+ }
763
+
764
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
765
+
766
+ return data;
767
+ }
768
+
769
+ /*
770
+ * 12.2.5.34 After attribute name state
+ *
+ * Skips tab, LF, FF, CR and space, then dispatches on the first other byte:
+ *   '/'        - Self-closing start tag state, consuming the byte.
+ *   '='        - Before attribute value state, consuming the byte.
+ *   '>'        - finishes the token and moves to the data_before helper
+ *                state, consuming the byte.
+ *   NUL at EOF - records an EOINTA parse error at tkz->last and returns
+ *                `end`.
+ *   anything else (including a NUL that is not EOF) - adds a new attribute
+ *                whose name begins at `data` and reprocesses the byte in the
+ *                Attribute name state.
+ * Returns `end` with the state unchanged if only whitespace was seen.
771
+ */
772
+ static const lxb_char_t *
773
+ lxb_html_tokenizer_state_after_attribute_name(lxb_html_tokenizer_t *tkz,
774
+ const lxb_char_t *data,
775
+ const lxb_char_t *end)
776
+ {
777
+ lxb_html_token_attr_t *attr;
778
+
779
+ while (data != end) {
780
+ switch (*data) {
781
+ /*
782
+ * U+0009 CHARACTER TABULATION (tab)
783
+ * U+000A LINE FEED (LF)
784
+ * U+000C FORM FEED (FF)
785
+ * U+000D CARRIAGE RETURN (CR)
786
+ * U+0020 SPACE
787
+ */
788
+ case 0x09:
789
+ case 0x0A:
790
+ case 0x0C:
791
+ case 0x0D:
792
+ case 0x20:
793
+ break;
794
+
795
+ /* U+002F SOLIDUS (/) */
796
+ case 0x2F:
797
+ tkz->state = lxb_html_tokenizer_state_self_closing_start_tag;
798
+ return (data + 1);
799
+
800
+ /* U+003D EQUALS SIGN (=) */
801
+ case 0x3D:
802
+ tkz->state = lxb_html_tokenizer_state_before_attribute_value;
803
+ return (data + 1);
804
+
805
+ /* U+003E GREATER-THAN SIGN (>) */
806
+ case 0x3E:
807
+ tkz->state = lxb_html_tokenizer_state_data_before;
808
+
809
+ lxb_html_tokenizer_state_token_done_m(tkz, end);
810
+
811
+ return (data + 1);
812
+
813
+ case 0x00:
814
+ if (tkz->is_eof) {
815
+ lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
816
+ LXB_HTML_TOKENIZER_ERROR_EOINTA);
817
+ return end;
818
+ }
819
+ /* fall through */
820
+
821
+ default:
822
+ lxb_html_tokenizer_state_token_attr_add_m(tkz, attr, end);
823
+ lxb_html_tokenizer_state_token_attr_set_name_begin(tkz, data);
824
+
825
+ tkz->state = lxb_html_tokenizer_state_attribute_name;
826
+ return data;
827
+ }
828
+
829
+ data++;
830
+ }
831
+
832
+ return data;
833
+ }
834
+
835
+ /*
836
+ * 12.2.5.35 Before attribute value state
+ *
+ * Skips tab, LF, FF, CR and space, then dispatches on the first other byte:
+ *   '"'  - Attribute value (double-quoted) state, consuming the quote.
+ *   '''  - Attribute value (single-quoted) state, consuming the quote.
+ *   '>'  - records a MIATVA parse error, finishes the token and moves to the
+ *          data_before helper state, consuming the byte.
+ *   anything else (NUL/EOF included) - reprocessed in the Attribute value
+ *          (unquoted) state.
+ * Returns `end` with the state unchanged if only whitespace was seen.
837
+ */
838
+ static const lxb_char_t *
839
+ lxb_html_tokenizer_state_before_attribute_value(lxb_html_tokenizer_t *tkz,
840
+ const lxb_char_t *data,
841
+ const lxb_char_t *end)
842
+ {
843
+ while (data != end) {
844
+ switch (*data) {
845
+ /*
846
+ * U+0009 CHARACTER TABULATION (tab)
847
+ * U+000A LINE FEED (LF)
848
+ * U+000C FORM FEED (FF)
849
+ * U+000D CARRIAGE RETURN (CR)
850
+ * U+0020 SPACE
851
+ */
852
+ case 0x09:
853
+ case 0x0A:
854
+ case 0x0C:
855
+ case 0x0D:
856
+ case 0x20:
857
+ break;
858
+
859
+ /* U+0022 QUOTATION MARK (") */
860
+ case 0x22:
861
+ tkz->state =
862
+ lxb_html_tokenizer_state_attribute_value_double_quoted;
863
+
864
+ return (data + 1);
865
+
866
+ /* U+0027 APOSTROPHE (') */
867
+ case 0x27:
868
+ tkz->state =
869
+ lxb_html_tokenizer_state_attribute_value_single_quoted;
870
+
871
+ return (data + 1);
872
+
873
+ /* U+003E GREATER-THAN SIGN (>) */
874
+ case 0x3E:
875
+ tkz->state = lxb_html_tokenizer_state_data_before;
876
+
877
+ lxb_html_tokenizer_error_add(tkz->parse_errors, data,
878
+ LXB_HTML_TOKENIZER_ERROR_MIATVA);
879
+
880
+ lxb_html_tokenizer_state_token_done_m(tkz, end);
881
+
882
+ return (data + 1);
883
+
884
+ default:
885
+ tkz->state = lxb_html_tokenizer_state_attribute_value_unquoted;
886
+ return data;
887
+ }
888
+
889
+ data++;
890
+ }
891
+
892
+ return data;
893
+ }
894
+
895
+ /*
896
+ * 12.2.5.36 Attribute value (double-quoted) state
+ *
+ * On first entry for an attribute (value_begin still NULL, not at EOF) the
+ * value begin is recorded at `data`. The value is then scanned:
+ *   '"'  - finishes the value, records its end and moves to the After
+ *          attribute value (quoted) state, consuming the quote.
+ *   '&'  - switches to the attribute character reference state with this
+ *          state as tkz->state_return; returns the position after '&'.
+ *   CR   - if CR is the last byte of the chunk, defers to
+ *          lxb_html_tokenizer_state_cr; otherwise the last appended byte is
+ *          overwritten with LF and a directly following LF is skipped.
+ *   NUL  - at EOF records the value end (if a begin exists) and an EOINTA
+ *          parse error at tkz->last, returning `end`; otherwise the
+ *          replacement sequence is appended in place of the NUL and an
+ *          UNNUCH parse error is recorded.
+ * If the chunk ends first, the bytes seen so far are appended and `end` is
+ * returned with the state unchanged.
897
+ */
898
+ static const lxb_char_t *
899
+ lxb_html_tokenizer_state_attribute_value_double_quoted(lxb_html_tokenizer_t *tkz,
900
+ const lxb_char_t *data,
901
+ const lxb_char_t *end)
902
+ {
903
+ if (tkz->token->attr_last->value_begin == NULL && tkz->is_eof == false) {
904
+ lxb_html_tokenizer_state_token_attr_set_value_begin(tkz, data);
905
+ }
906
+
907
+ lxb_html_tokenizer_state_begin_set(tkz, data);
908
+
909
+ while (data != end) {
910
+ switch (*data) {
911
+ /* U+0022 QUOTATION MARK (") */
912
+ case 0x22:
913
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
914
+ lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
915
+ lxb_html_tokenizer_state_set_value_m(tkz);
916
+
917
+ tkz->state =
918
+ lxb_html_tokenizer_state_after_attribute_value_quoted;
919
+
920
+ return (data + 1);
921
+
922
+ /* U+0026 AMPERSAND (&) */
923
+ case 0x26:
924
+ lxb_html_tokenizer_state_append_data_m(tkz, data + 1);
925
+
926
+ tkz->state = lxb_html_tokenizer_state_char_ref_attr;
927
+ tkz->state_return = lxb_html_tokenizer_state_attribute_value_double_quoted;
928
+
929
+ return data + 1;
930
+
931
+ /* U+000D CARRIAGE RETURN (CR) */
932
+ case 0x0D:
933
+ if (++data >= end) {
934
+ lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
935
+
936
+ tkz->state = lxb_html_tokenizer_state_cr;
937
+ tkz->state_return = lxb_html_tokenizer_state_attribute_value_double_quoted;
938
+
939
+ return data;
940
+ }
941
+
942
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
943
+ tkz->pos[-1] = 0x0A;
944
+
945
+ lxb_html_tokenizer_state_begin_set(tkz, data + 1);
946
+
947
+ if (*data != 0x0A) {
948
+ lxb_html_tokenizer_state_begin_set(tkz, data);
949
+ data--;
950
+ }
951
+
952
+ break;
953
+
954
+ /*
955
+ * U+0000 NULL
956
+ * EOF
957
+ */
958
+ case 0x00:
959
+ if (tkz->is_eof) {
960
+ if (tkz->token->attr_last->value_begin != NULL) {
961
+ lxb_html_tokenizer_state_token_attr_set_value_end_oef(tkz);
962
+ }
963
+
964
+ lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
965
+ LXB_HTML_TOKENIZER_ERROR_EOINTA);
966
+ return end;
967
+ }
968
+
969
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
970
+ lxb_html_tokenizer_state_begin_set(tkz, data + 1);
971
+ lxb_html_tokenizer_state_append_replace_m(tkz);
972
+
973
+ lxb_html_tokenizer_error_add(tkz->parse_errors, data,
974
+ LXB_HTML_TOKENIZER_ERROR_UNNUCH);
975
+ break;
976
+
977
+ default:
978
+ break;
979
+ }
980
+
981
+ data++;
982
+ }
983
+
984
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
985
+
986
+ return data;
987
+ }
988
+
989
+ /*
990
+ * 12.2.5.37 Attribute value (single-quoted) state
+ *
+ * Mirror of the double-quoted state with ''' as the closing delimiter.
+ * On first entry for an attribute (value_begin still NULL, not at EOF) the
+ * value begin is recorded at `data`. The value is then scanned:
+ *   '''  - finishes the value, records its end and moves to the After
+ *          attribute value (quoted) state, consuming the quote.
+ *   '&'  - switches to the attribute character reference state with this
+ *          state as tkz->state_return; returns the position after '&'.
+ *   CR   - if CR is the last byte of the chunk, defers to
+ *          lxb_html_tokenizer_state_cr; otherwise the last appended byte is
+ *          overwritten with LF and a directly following LF is skipped.
+ *   NUL  - at EOF records the value end (if a begin exists) and an EOINTA
+ *          parse error at tkz->last, returning `end`; otherwise the
+ *          replacement sequence is appended in place of the NUL and an
+ *          UNNUCH parse error is recorded.
+ * If the chunk ends first, the bytes seen so far are appended and `end` is
+ * returned with the state unchanged.
991
+ */
992
+ static const lxb_char_t *
993
+ lxb_html_tokenizer_state_attribute_value_single_quoted(lxb_html_tokenizer_t *tkz,
994
+ const lxb_char_t *data,
995
+ const lxb_char_t *end)
996
+ {
997
+ if (tkz->token->attr_last->value_begin == NULL && tkz->is_eof == false) {
998
+ lxb_html_tokenizer_state_token_attr_set_value_begin(tkz, data);
999
+ }
1000
+
1001
+ lxb_html_tokenizer_state_begin_set(tkz, data);
1002
+
1003
+ while (data != end) {
1004
+ switch (*data) {
1005
+ /* U+0027 APOSTROPHE (') */
1006
+ case 0x27:
1007
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
1008
+ lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
1009
+ lxb_html_tokenizer_state_set_value_m(tkz);
1010
+
1011
+ tkz->state =
1012
+ lxb_html_tokenizer_state_after_attribute_value_quoted;
1013
+
1014
+ return (data + 1);
1015
+
1016
+ /* U+0026 AMPERSAND (&) */
1017
+ case 0x26:
1018
+ lxb_html_tokenizer_state_append_data_m(tkz, data + 1);
1019
+
1020
+ tkz->state = lxb_html_tokenizer_state_char_ref_attr;
1021
+ tkz->state_return = lxb_html_tokenizer_state_attribute_value_single_quoted;
1022
+
1023
+ return data + 1;
1024
+
1025
+ /* U+000D CARRIAGE RETURN (CR) */
1026
+ case 0x0D:
1027
+ if (++data >= end) {
1028
+ lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
1029
+
1030
+ tkz->state = lxb_html_tokenizer_state_cr;
1031
+ tkz->state_return = lxb_html_tokenizer_state_attribute_value_single_quoted;
1032
+
1033
+ return data;
1034
+ }
1035
+
1036
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
1037
+ tkz->pos[-1] = 0x0A;
1038
+
1039
+ lxb_html_tokenizer_state_begin_set(tkz, data + 1);
1040
+
1041
+ if (*data != 0x0A) {
1042
+ lxb_html_tokenizer_state_begin_set(tkz, data);
1043
+ data--;
1044
+ }
1045
+
1046
+ break;
1047
+
1048
+ /*
1049
+ * U+0000 NULL
1050
+ * EOF
1051
+ */
1052
+ case 0x00:
1053
+ if (tkz->is_eof) {
1054
+ if (tkz->token->attr_last->value_begin != NULL) {
1055
+ lxb_html_tokenizer_state_token_attr_set_value_end_oef(tkz);
1056
+ }
1057
+
1058
+ lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
1059
+ LXB_HTML_TOKENIZER_ERROR_EOINTA);
1060
+ return end;
1061
+ }
1062
+
1063
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
1064
+ lxb_html_tokenizer_state_begin_set(tkz, data + 1);
1065
+ lxb_html_tokenizer_state_append_replace_m(tkz);
1066
+
1067
+ lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1068
+ LXB_HTML_TOKENIZER_ERROR_UNNUCH);
1069
+ break;
1070
+
1071
+ default:
1072
+ break;
1073
+ }
1074
+
1075
+ data++;
1076
+ }
1077
+
1078
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
1079
+
1080
+ return data;
1081
+ }
1082
+
1083
+ /*
1084
+ * 12.2.5.38 Attribute value (unquoted) state
+ *
+ * On first entry for an attribute (value_begin still NULL, not at EOF) the
+ * value begin is recorded at `data`. The value is then scanned:
+ *   tab, LF, FF, CR, space - finishes the value, records its end and moves
+ *          to the Before attribute name state, consuming the byte.
+ *   '&'  - switches to the attribute character reference state with this
+ *          state as tkz->state_return; returns the position after '&'.
+ *   '>'  - finishes the value and the token and moves to the data_before
+ *          helper state, consuming the byte.
+ *   NUL  - at EOF records the value end (if a begin exists) and an EOINTA
+ *          parse error at tkz->last, returning `end`; otherwise the
+ *          replacement sequence is appended in place of the NUL and an
+ *          UNNUCH parse error is recorded.
+ *   '"', ''', '<', '=', '`' - records an UNCHINUNATVA parse error; the byte
+ *          still becomes part of the value.
+ * If the chunk ends first, the bytes seen so far are appended and `end` is
+ * returned with the state unchanged.
1085
+ */
1086
+ static const lxb_char_t *
1087
+ lxb_html_tokenizer_state_attribute_value_unquoted(lxb_html_tokenizer_t *tkz,
1088
+ const lxb_char_t *data,
1089
+ const lxb_char_t *end)
1090
+ {
1091
+ if (tkz->token->attr_last->value_begin == NULL && tkz->is_eof == false) {
1092
+ lxb_html_tokenizer_state_token_attr_set_value_begin(tkz, data);
1093
+ }
1094
+
1095
+ lxb_html_tokenizer_state_begin_set(tkz, data);
1096
+
1097
+ while (data != end) {
1098
+ switch (*data) {
1099
+ /*
1100
+ * U+0009 CHARACTER TABULATION (tab)
1101
+ * U+000A LINE FEED (LF)
1102
+ * U+000C FORM FEED (FF)
1103
+ * U+000D CARRIAGE RETURN (CR)
1104
+ * U+0020 SPACE
1105
+ */
1106
+ case 0x09:
1107
+ case 0x0A:
1108
+ case 0x0C:
1109
+ case 0x0D:
1110
+ case 0x20:
1111
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
1112
+ lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
1113
+ lxb_html_tokenizer_state_set_value_m(tkz);
1114
+
1115
+ tkz->state = lxb_html_tokenizer_state_before_attribute_name;
1116
+ return (data + 1);
1117
+
1118
+ /* U+0026 AMPERSAND (&) */
1119
+ case 0x26:
1120
+ lxb_html_tokenizer_state_append_data_m(tkz, data + 1);
1121
+
1122
+ tkz->state = lxb_html_tokenizer_state_char_ref_attr;
1123
+ tkz->state_return = lxb_html_tokenizer_state_attribute_value_unquoted;
1124
+
1125
+ return data + 1;
1126
+
1127
+ /* U+003E GREATER-THAN SIGN (>) */
1128
+ case 0x3E:
1129
+ tkz->state = lxb_html_tokenizer_state_data_before;
1130
+
1131
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
1132
+ lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
1133
+ lxb_html_tokenizer_state_set_value_m(tkz);
1134
+
1135
+ lxb_html_tokenizer_state_token_done_m(tkz, end);
1136
+
1137
+ return (data + 1);
1138
+
1139
+ /*
1140
+ * U+0000 NULL
1141
+ * EOF
1142
+ */
1143
+ case 0x00:
1144
+ if (tkz->is_eof) {
1145
+ if (tkz->token->attr_last->value_begin != NULL) {
1146
+ lxb_html_tokenizer_state_token_attr_set_value_end_oef(tkz);
1147
+ }
1148
+
1149
+ lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
1150
+ LXB_HTML_TOKENIZER_ERROR_EOINTA);
1151
+ return end;
1152
+ }
1153
+
1154
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
1155
+ lxb_html_tokenizer_state_begin_set(tkz, data + 1);
1156
+ lxb_html_tokenizer_state_append_replace_m(tkz);
1157
+
1158
+ lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1159
+ LXB_HTML_TOKENIZER_ERROR_UNNUCH);
1160
+ break;
1161
+
1162
+ /*
1163
+ * U+0022 QUOTATION MARK (")
1164
+ * U+0027 APOSTROPHE (')
1165
+ * U+003C LESS-THAN SIGN (<)
1166
+ * U+003D EQUALS SIGN (=)
1167
+ * U+0060 GRAVE ACCENT (`)
+ *
+ * The error is reported at the offending byte itself (`data`), matching
+ * the Attribute name state. tkz->token->end is not set by this state,
+ * so it cannot be used as the error position here.
1168
+ */
1169
+ case 0x22:
1170
+ case 0x27:
1171
+ case 0x3C:
1172
+ case 0x3D:
1173
+ case 0x60:
1174
+ lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1175
+ LXB_HTML_TOKENIZER_ERROR_UNCHINUNATVA);
1176
+ break;
1177
+
1178
+ default:
1179
+ break;
1180
+ }
1181
+
1182
+ data++;
1183
+ }
1184
+
1185
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
1186
+
1187
+ return data;
1188
+ }
1189
+
1190
+ /*
1191
+ * 12.2.5.39 After attribute value (quoted) state
1192
+ */
1193
+ static const lxb_char_t *
1194
+ lxb_html_tokenizer_state_after_attribute_value_quoted(lxb_html_tokenizer_t *tkz,
1195
+ const lxb_char_t *data,
1196
+ const lxb_char_t *end)
1197
+ {
1198
+ switch (*data) {
1199
+ /*
1200
+ * U+0009 CHARACTER TABULATION (tab)
1201
+ * U+000A LINE FEED (LF)
1202
+ * U+000C FORM FEED (FF)
1203
+ * U+000D CARRIAGE RETURN (CR)
1204
+ * U+0020 SPACE
1205
+ */
1206
+ case 0x09:
1207
+ case 0x0A:
1208
+ case 0x0C:
1209
+ case 0x0D:
1210
+ case 0x20:
1211
+ tkz->state = lxb_html_tokenizer_state_before_attribute_name;
1212
+
1213
+ return (data + 1);
1214
+
1215
+ /* U+002F SOLIDUS (/) */
1216
+ case 0x2F:
1217
+ tkz->state = lxb_html_tokenizer_state_self_closing_start_tag;
1218
+
1219
+ return (data + 1);
1220
+
1221
+ /* U+003E GREATER-THAN SIGN (>) */
1222
+ case 0x3E:
1223
+ tkz->state = lxb_html_tokenizer_state_data_before;
1224
+
1225
+ lxb_html_tokenizer_state_token_done_m(tkz, end);
1226
+
1227
+ return (data + 1);
1228
+
1229
+ /* EOF */
1230
+ case 0x00:
1231
+ if (tkz->is_eof) {
1232
+ lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
1233
+ LXB_HTML_TOKENIZER_ERROR_EOINTA);
1234
+ return end;
1235
+ }
1236
+ /* fall through */
1237
+
1238
+ default:
1239
+ lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1240
+ LXB_HTML_TOKENIZER_ERROR_MIWHBEAT);
1241
+
1242
+ tkz->state = lxb_html_tokenizer_state_before_attribute_name;
1243
+
1244
+ return data;
1245
+ }
1246
+
1247
+ return data;
1248
+ }
1249
+
1250
+
1251
+ const lxb_char_t *
1252
+ lxb_html_tokenizer_state_cr(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
1253
+ const lxb_char_t *end)
1254
+ {
1255
+ lxb_html_tokenizer_state_append_m(tkz, "\n", 1);
1256
+
1257
+ if (*data == 0x0A) {
1258
+ data++;
1259
+ }
1260
+
1261
+ tkz->state = tkz->state_return;
1262
+
1263
+ return data;
1264
+ }
1265
+
1266
+ /*
1267
+ * 12.2.5.40 Self-closing start tag state
1268
+ */
1269
+ const lxb_char_t *
1270
+ lxb_html_tokenizer_state_self_closing_start_tag(lxb_html_tokenizer_t *tkz,
1271
+ const lxb_char_t *data,
1272
+ const lxb_char_t *end)
1273
+ {
1274
+ switch (*data) {
1275
+ /* U+003E GREATER-THAN SIGN (>) */
1276
+ case 0x3E:
1277
+ tkz->state = lxb_html_tokenizer_state_data_before;
1278
+ tkz->token->type |= LXB_HTML_TOKEN_TYPE_CLOSE_SELF;
1279
+
1280
+ lxb_html_tokenizer_state_token_done_m(tkz, end);
1281
+
1282
+ return (data + 1);
1283
+
1284
+ /* EOF */
1285
+ case 0x00:
1286
+ if (tkz->is_eof) {
1287
+ lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->token->end,
1288
+ LXB_HTML_TOKENIZER_ERROR_EOINTA);
1289
+ return end;
1290
+ }
1291
+ /* fall through */
1292
+
1293
+ default:
1294
+ lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1295
+ LXB_HTML_TOKENIZER_ERROR_UNSOINTA);
1296
+
1297
+ tkz->state = lxb_html_tokenizer_state_before_attribute_name;
1298
+
1299
+ return data;
1300
+ }
1301
+
1302
+ return data;
1303
+ }
1304
+
1305
+ /*
1306
+ * Helper function. No in the specification. For 12.2.5.41 Bogus comment state
1307
+ */
1308
+ static const lxb_char_t *
1309
+ lxb_html_tokenizer_state_bogus_comment_before(lxb_html_tokenizer_t *tkz,
1310
+ const lxb_char_t *data,
1311
+ const lxb_char_t *end)
1312
+ {
1313
+ tkz->token->tag_id = LXB_TAG__EM_COMMENT;
1314
+
1315
+ tkz->state = lxb_html_tokenizer_state_bogus_comment;
1316
+
1317
+ return data;
1318
+ }
1319
+
1320
+ /*
1321
+ * 12.2.5.41 Bogus comment state
1322
+ */
1323
+ static const lxb_char_t *
1324
+ lxb_html_tokenizer_state_bogus_comment(lxb_html_tokenizer_t *tkz,
1325
+ const lxb_char_t *data,
1326
+ const lxb_char_t *end)
1327
+ {
1328
+ lxb_html_tokenizer_state_begin_set(tkz, data);
1329
+
1330
+ while (data != end) {
1331
+ switch (*data) {
1332
+ /* U+003E GREATER-THAN SIGN (>) */
1333
+ case 0x3E:
1334
+ tkz->state = lxb_html_tokenizer_state_data_before;
1335
+
1336
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
1337
+ lxb_html_tokenizer_state_token_set_end(tkz, data);
1338
+ lxb_html_tokenizer_state_set_text(tkz);
1339
+ lxb_html_tokenizer_state_token_done_wo_check_m(tkz, end);
1340
+
1341
+ return (data + 1);
1342
+
1343
+ /* U+000D CARRIAGE RETURN (CR) */
1344
+ case 0x0D:
1345
+ if (++data >= end) {
1346
+ lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
1347
+
1348
+ tkz->state = lxb_html_tokenizer_state_cr;
1349
+ tkz->state_return = lxb_html_tokenizer_state_bogus_comment;
1350
+
1351
+ return data;
1352
+ }
1353
+
1354
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
1355
+ tkz->pos[-1] = 0x0A;
1356
+
1357
+ lxb_html_tokenizer_state_begin_set(tkz, data + 1);
1358
+
1359
+ if (*data != 0x0A) {
1360
+ lxb_html_tokenizer_state_begin_set(tkz, data);
1361
+ data--;
1362
+ }
1363
+
1364
+ break;
1365
+
1366
+ /*
1367
+ * EOF
1368
+ * U+0000 NULL
1369
+ */
1370
+ case 0x00:
1371
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
1372
+
1373
+ if (tkz->is_eof) {
1374
+ if (tkz->token->begin != NULL) {
1375
+ lxb_html_tokenizer_state_token_set_end_oef(tkz);
1376
+ }
1377
+
1378
+ lxb_html_tokenizer_state_set_text(tkz);
1379
+ lxb_html_tokenizer_state_token_done_wo_check_m(tkz, end);
1380
+
1381
+ return end;
1382
+ }
1383
+
1384
+ lxb_html_tokenizer_state_append_replace_m(tkz);
1385
+ lxb_html_tokenizer_state_begin_set(tkz, data + 1);
1386
+
1387
+ lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1388
+ LXB_HTML_TOKENIZER_ERROR_UNNUCH);
1389
+ break;
1390
+ }
1391
+
1392
+ data++;
1393
+ }
1394
+
1395
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
1396
+
1397
+ return data;
1398
+ }
1399
+
1400
+ /*
1401
+ * 12.2.5.42 Markup declaration open state
1402
+ */
1403
+ static const lxb_char_t *
1404
+ lxb_html_tokenizer_state_markup_declaration_open(lxb_html_tokenizer_t *tkz,
1405
+ const lxb_char_t *data,
1406
+ const lxb_char_t *end)
1407
+ {
1408
+ /* Check first char for change parse state */
1409
+ if (tkz->is_eof == false) {
1410
+ lxb_html_tokenizer_state_token_set_begin(tkz, data);
1411
+ }
1412
+
1413
+ /* U+002D HYPHEN-MINUS characters (-) */
1414
+ if (*data == 0x2D) {
1415
+ if ((end - data) < 2) {
1416
+ tkz->state = lxb_html_tokenizer_state_markup_declaration_comment;
1417
+ return (data + 1);
1418
+ }
1419
+
1420
+ if (data[1] == 0x2D) {
1421
+ tkz->state = lxb_html_tokenizer_state_comment_before_start;
1422
+ return (data + 2);
1423
+ }
1424
+ }
1425
+ /*
1426
+ * ASCII case-insensitive match for the word "DOCTYPE"
1427
+ * U+0044 character (D) or U+0064 character (d)
1428
+ */
1429
+ else if (*data == 0x44 || *data == 0x64) {
1430
+ if ((end - data) < 7) {
1431
+ tkz->markup = (lxb_char_t *) "doctype";
1432
+
1433
+ tkz->state = lxb_html_tokenizer_state_markup_declaration_doctype;
1434
+ return data;
1435
+ }
1436
+
1437
+ if (lexbor_str_data_ncasecmp((lxb_char_t *) "doctype", data, 7)) {
1438
+ tkz->state = lxb_html_tokenizer_state_doctype_before;
1439
+ return (data + 7);
1440
+ }
1441
+ }
1442
+ /* Case-sensitive match for the string "[CDATA["
1443
+ * (the five uppercase letters "CDATA" with a U+005B LEFT SQUARE BRACKET
1444
+ * character before and after)
1445
+ */
1446
+ else if (*data == 0x5B) {
1447
+ if ((end - data) < 7) {
1448
+ tkz->markup = (lxb_char_t *) "[CDATA[";
1449
+
1450
+ tkz->state = lxb_html_tokenizer_state_markup_declaration_cdata;
1451
+ return data;
1452
+ }
1453
+
1454
+ if (lexbor_str_data_ncmp((lxb_char_t *) "[CDATA[", data, 7)) {
1455
+ lxb_ns_id_t ns = lxb_html_tokenizer_current_namespace(tkz);
1456
+
1457
+ if (ns != LXB_NS_HTML && ns != LXB_NS__UNDEF) {
1458
+ data += 7;
1459
+
1460
+ lxb_html_tokenizer_state_token_set_begin(tkz, data);
1461
+
1462
+ tkz->state = lxb_html_tokenizer_state_cdata_section_before;
1463
+
1464
+ return data;
1465
+ }
1466
+
1467
+ tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
1468
+
1469
+ return data;
1470
+ }
1471
+ }
1472
+
1473
+ if (tkz->is_eof) {
1474
+ lxb_html_tokenizer_state_token_set_end_oef(tkz);
1475
+
1476
+ tkz->token->begin = tkz->token->end;
1477
+ }
1478
+
1479
+ lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1480
+ LXB_HTML_TOKENIZER_ERROR_INOPCO);
1481
+
1482
+ tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
1483
+
1484
+ return data;
1485
+ }
1486
+
1487
+ /*
1488
+ * Helper function. No in the specification. For 12.2.5.42
1489
+ * For a comment tag <!--
1490
+ */
1491
+ static const lxb_char_t *
1492
+ lxb_html_tokenizer_state_markup_declaration_comment(lxb_html_tokenizer_t *tkz,
1493
+ const lxb_char_t *data,
1494
+ const lxb_char_t *end)
1495
+ {
1496
+ /* U+002D HYPHEN-MINUS characters (-) */
1497
+ if (*data == 0x2D) {
1498
+ tkz->state = lxb_html_tokenizer_state_comment_before_start;
1499
+ return (data + 1);
1500
+ }
1501
+
1502
+ lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1503
+ LXB_HTML_TOKENIZER_ERROR_INOPCO);
1504
+
1505
+ tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
1506
+ return data;
1507
+ }
1508
+
1509
+ /*
1510
+ * Helper function. No in the specification. For 12.2.5.42
1511
+ * For a DOCTYPE tag <!DOCTYPE
1512
+ */
1513
+ static const lxb_char_t *
1514
+ lxb_html_tokenizer_state_markup_declaration_doctype(lxb_html_tokenizer_t *tkz,
1515
+ const lxb_char_t *data,
1516
+ const lxb_char_t *end)
1517
+ {
1518
+ const lxb_char_t *pos;
1519
+ pos = lexbor_str_data_ncasecmp_first(tkz->markup, data, (end - data));
1520
+
1521
+ if (pos == NULL) {
1522
+ lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1523
+ LXB_HTML_TOKENIZER_ERROR_INOPCO);
1524
+
1525
+ tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
1526
+ return data;
1527
+ }
1528
+
1529
+ if (*pos == '\0') {
1530
+ data = (data + (pos - tkz->markup));
1531
+
1532
+ tkz->state = lxb_html_tokenizer_state_doctype_before;
1533
+ return data;
1534
+ }
1535
+
1536
+ tkz->markup = pos;
1537
+
1538
+ return end;
1539
+ }
1540
+
1541
+ /*
1542
+ * Helper function. No in the specification. For 12.2.5.42
1543
+ * For a CDATA tag <![CDATA[
1544
+ */
1545
+ static const lxb_char_t *
1546
+ lxb_html_tokenizer_state_markup_declaration_cdata(lxb_html_tokenizer_t *tkz,
1547
+ const lxb_char_t *data,
1548
+ const lxb_char_t *end)
1549
+ {
1550
+ const lxb_char_t *pos;
1551
+ pos = lexbor_str_data_ncasecmp_first(tkz->markup, data, (end - data));
1552
+
1553
+ if (pos == NULL) {
1554
+ lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1555
+ LXB_HTML_TOKENIZER_ERROR_INOPCO);
1556
+
1557
+ tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
1558
+ return data;
1559
+ }
1560
+
1561
+ if (*pos == '\0') {
1562
+ lxb_ns_id_t ns = lxb_html_tokenizer_current_namespace(tkz);
1563
+
1564
+ if (ns != LXB_NS_HTML && ns != LXB_NS__UNDEF) {
1565
+ data = (data + (pos - tkz->markup));
1566
+
1567
+ tkz->state = lxb_html_tokenizer_state_cdata_section_before;
1568
+ return data;
1569
+ }
1570
+
1571
+ lxb_html_tokenizer_state_append_m(tkz, "[CDATA", 6);
1572
+
1573
+ tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
1574
+ return data;
1575
+ }
1576
+
1577
+ tkz->markup = pos;
1578
+
1579
+ return end;
1580
+ }
1581
+
1582
+ /*
1583
+ * Helper function. No in the specification. For 12.2.5.69
1584
+ */
1585
+ static const lxb_char_t *
1586
+ lxb_html_tokenizer_state_cdata_section_before(lxb_html_tokenizer_t *tkz,
1587
+ const lxb_char_t *data,
1588
+ const lxb_char_t *end)
1589
+ {
1590
+ if (tkz->is_eof == false) {
1591
+ lxb_html_tokenizer_state_token_set_begin(tkz, data);
1592
+ }
1593
+ else {
1594
+ lxb_html_tokenizer_state_token_set_begin(tkz, tkz->last);
1595
+ }
1596
+
1597
+ tkz->token->tag_id = LXB_TAG__TEXT;
1598
+
1599
+ tkz->state = lxb_html_tokenizer_state_cdata_section;
1600
+
1601
+ return data;
1602
+ }
1603
+
1604
+ /*
1605
+ * 12.2.5.69 CDATA section state
1606
+ */
1607
+ static const lxb_char_t *
1608
+ lxb_html_tokenizer_state_cdata_section(lxb_html_tokenizer_t *tkz,
1609
+ const lxb_char_t *data,
1610
+ const lxb_char_t *end)
1611
+ {
1612
+ lxb_html_tokenizer_state_begin_set(tkz, data);
1613
+
1614
+ while (data != end) {
1615
+ switch (*data) {
1616
+ /* U+005D RIGHT SQUARE BRACKET (]) */
1617
+ case 0x5D:
1618
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
1619
+ lxb_html_tokenizer_state_token_set_end(tkz, data);
1620
+
1621
+ tkz->state = lxb_html_tokenizer_state_cdata_section_bracket;
1622
+ return (data + 1);
1623
+
1624
+ /* U+000D CARRIAGE RETURN (CR) */
1625
+ case 0x0D:
1626
+ if (++data >= end) {
1627
+ lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
1628
+
1629
+ tkz->state = lxb_html_tokenizer_state_cr;
1630
+ tkz->state_return = lxb_html_tokenizer_state_cdata_section;
1631
+
1632
+ return data;
1633
+ }
1634
+
1635
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
1636
+ tkz->pos[-1] = 0x0A;
1637
+
1638
+ lxb_html_tokenizer_state_begin_set(tkz, data + 1);
1639
+
1640
+ if (*data != 0x0A) {
1641
+ lxb_html_tokenizer_state_begin_set(tkz, data);
1642
+ data--;
1643
+ }
1644
+
1645
+ break;
1646
+
1647
+ /* EOF */
1648
+ case 0x00:
1649
+ if (tkz->is_eof) {
1650
+ lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
1651
+ LXB_HTML_TOKENIZER_ERROR_EOINCD);
1652
+
1653
+ if (tkz->token->begin != NULL) {
1654
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
1655
+ lxb_html_tokenizer_state_token_set_end_oef(tkz);
1656
+ }
1657
+
1658
+ lxb_html_tokenizer_state_set_text(tkz);
1659
+ lxb_html_tokenizer_state_token_done_m(tkz, end);
1660
+
1661
+ return end;
1662
+ }
1663
+
1664
+ if (SIZE_MAX - tkz->token->null_count < 1) {
1665
+ tkz->status = LXB_STATUS_ERROR_OVERFLOW;
1666
+ return end;
1667
+ }
1668
+
1669
+ tkz->token->null_count++;
1670
+
1671
+ break;
1672
+
1673
+ default:
1674
+ break;
1675
+ }
1676
+
1677
+ data++;
1678
+ }
1679
+
1680
+ lxb_html_tokenizer_state_append_data_m(tkz, data);
1681
+
1682
+ return data;
1683
+ }
1684
+
1685
+ /*
1686
+ * 12.2.5.70 CDATA section bracket state
1687
+ */
1688
+ static const lxb_char_t *
1689
+ lxb_html_tokenizer_state_cdata_section_bracket(lxb_html_tokenizer_t *tkz,
1690
+ const lxb_char_t *data,
1691
+ const lxb_char_t *end)
1692
+ {
1693
+ /* U+005D RIGHT SQUARE BRACKET (]) */
1694
+ if (*data == 0x5D) {
1695
+ tkz->state = lxb_html_tokenizer_state_cdata_section_end;
1696
+ return (data + 1);
1697
+ }
1698
+
1699
+ lxb_html_tokenizer_state_append_m(tkz, "]", 1);
1700
+
1701
+ tkz->state = lxb_html_tokenizer_state_cdata_section;
1702
+
1703
+ return data;
1704
+ }
1705
+
1706
+ /*
1707
+ * 12.2.5.71 CDATA section end state
1708
+ */
1709
+ static const lxb_char_t *
1710
+ lxb_html_tokenizer_state_cdata_section_end(lxb_html_tokenizer_t *tkz,
1711
+ const lxb_char_t *data,
1712
+ const lxb_char_t *end)
1713
+ {
1714
+ /* U+005D RIGHT SQUARE BRACKET (]) */
1715
+ if (*data == 0x5D) {
1716
+ lxb_html_tokenizer_state_append_m(tkz, data, 1);
1717
+ return (data + 1);
1718
+ }
1719
+ /* U+003E GREATER-THAN SIGN character */
1720
+ else if (*data == 0x3E) {
1721
+ tkz->state = lxb_html_tokenizer_state_data_before;
1722
+
1723
+ lxb_html_tokenizer_state_set_text(tkz);
1724
+ lxb_html_tokenizer_state_token_done_m(tkz, end);
1725
+
1726
+ return (data + 1);
1727
+ }
1728
+
1729
+ lxb_html_tokenizer_state_append_m(tkz, "]]", 2);
1730
+
1731
+ tkz->state = lxb_html_tokenizer_state_cdata_section;
1732
+
1733
+ return data;
1734
+ }
1735
+
1736
+ /*
1737
+ * 12.2.5.72 Character reference state
1738
+ */
1739
+ const lxb_char_t *
1740
+ lxb_html_tokenizer_state_char_ref(lxb_html_tokenizer_t *tkz,
1741
+ const lxb_char_t *data, const lxb_char_t *end)
1742
+ {
1743
+ tkz->is_attribute = false;
1744
+
1745
+ return _lxb_html_tokenizer_state_char_ref(tkz, data, end);
1746
+ }
1747
+
1748
+ static const lxb_char_t *
1749
+ lxb_html_tokenizer_state_char_ref_attr(lxb_html_tokenizer_t *tkz,
1750
+ const lxb_char_t *data,
1751
+ const lxb_char_t *end)
1752
+ {
1753
+ tkz->is_attribute = true;
1754
+
1755
+ return _lxb_html_tokenizer_state_char_ref(tkz, data, end);
1756
+ }
1757
+
1758
+ static const lxb_char_t *
1759
+ _lxb_html_tokenizer_state_char_ref(lxb_html_tokenizer_t *tkz,
1760
+ const lxb_char_t *data,
1761
+ const lxb_char_t *end)
1762
+ {
1763
+ /* ASCII alphanumeric */
1764
+ if (lexbor_str_res_alphanumeric_character[ *data ] != LEXBOR_STR_RES_SLIP) {
1765
+ tkz->entity = &lxb_html_tokenizer_res_entities_sbst[1];
1766
+ tkz->entity_match = NULL;
1767
+ tkz->entity_start = (tkz->pos - 1) - tkz->start;
1768
+
1769
+ tkz->state = lxb_html_tokenizer_state_char_ref_named;
1770
+
1771
+ return data;
1772
+ }
1773
+ /* U+0023 NUMBER SIGN (#) */
1774
+ else if (*data == 0x23) {
1775
+ tkz->markup = data;
1776
+ tkz->entity_start = (tkz->pos - 1) - tkz->start;
1777
+
1778
+ lxb_html_tokenizer_state_append_m(tkz, data, 1);
1779
+
1780
+ tkz->state = lxb_html_tokenizer_state_char_ref_numeric;
1781
+
1782
+ return (data + 1);
1783
+ }
1784
+ else {
1785
+ tkz->state = tkz->state_return;
1786
+ }
1787
+
1788
+ return data;
1789
+ }
1790
+
1791
+ /*
1792
+ * 12.2.5.73 Named character reference state
1793
+ *
1794
+ * The slowest part in HTML parsing!!!
1795
+ *
1796
+ * This option works correctly and passes all tests (stream parsing too).
1797
+ * We must seriously think about how to accelerate this part.
1798
+ */
1799
+ static const lxb_char_t *
1800
+ lxb_html_tokenizer_state_char_ref_named(lxb_html_tokenizer_t *tkz,
1801
+ const lxb_char_t *data,
1802
+ const lxb_char_t *end)
1803
+ {
1804
+ size_t size, tail_size;
1805
+ lxb_char_t *start;
1806
+ const lexbor_sbst_entry_static_t *entry = tkz->entity;
1807
+
1808
+ const lxb_char_t *begin = data;
1809
+
1810
+ while (data < end) {
1811
+ entry = lexbor_sbst_entry_static_find(lxb_html_tokenizer_res_entities_sbst,
1812
+ entry, *data);
1813
+ if (entry == NULL) {
1814
+ lxb_html_tokenizer_state_append_m(tkz, begin, (data - begin));
1815
+ goto done;
1816
+ }
1817
+
1818
+ if (entry->value != NULL) {
1819
+ tkz->entity_end = (tkz->pos + (data - begin)) - tkz->start;
1820
+ tkz->entity_match = entry;
1821
+ }
1822
+
1823
+ entry = &lxb_html_tokenizer_res_entities_sbst[ entry->next ];
1824
+
1825
+ data++;
1826
+ }
1827
+
1828
+ /* If entry not NULL and buffer empty, then wait next buffer. */
1829
+ tkz->entity = entry;
1830
+
1831
+ lxb_html_tokenizer_state_append_m(tkz, begin, (end - begin));
1832
+ return data;
1833
+
1834
+ done:
1835
+
1836
+ /* If we have bad entity */
1837
+ if (tkz->entity_match == NULL) {
1838
+ tkz->state = lxb_html_tokenizer_state_char_ref_ambiguous_ampersand;
1839
+
1840
+ return data;
1841
+ }
1842
+
1843
+ tkz->state = tkz->state_return;
1844
+
1845
+ /*
1846
+ * If the character reference was consumed as part of an attribute,
1847
+ * and the last character matched is not a U+003B SEMICOLON character (;),
1848
+ * and the next input character is either a U+003D EQUALS SIGN character (=)
1849
+ * or an ASCII alphanumeric, then, for historical reasons,
1850
+ * flush code points consumed as a character reference
1851
+ * and switch to the return state.
1852
+ */
1853
+ /* U+003B SEMICOLON character (;) */
1854
+ if (tkz->is_attribute && tkz->entity_match->key != 0x3B) {
1855
+ /* U+003D EQUALS SIGN character (=) or ASCII alphanumeric */
1856
+ if (*data == 0x3D
1857
+ || lexbor_str_res_alphanumeric_character[*data] != LEXBOR_STR_RES_SLIP)
1858
+ {
1859
+ return data;
1860
+ }
1861
+ }
1862
+
1863
+ if (tkz->entity_match->key != 0x3B) {
1864
+ lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1865
+ LXB_HTML_TOKENIZER_ERROR_MISEAFCHRE);
1866
+ }
1867
+
1868
+ start = &tkz->start[tkz->entity_start];
1869
+
1870
+ size = tkz->pos - start;
1871
+ tail_size = tkz->pos - &tkz->start[tkz->entity_end] - 1;
1872
+
1873
+ if (tail_size != 0) {
1874
+ if ((size + tail_size) + start > tkz->end) {
1875
+ if (lxb_html_tokenizer_temp_realloc(tkz, size) != LXB_STATUS_OK) {
1876
+ return end;
1877
+ }
1878
+ start = &tkz->start[tkz->entity_start];
1879
+ }
1880
+
1881
+ memmove(start + tkz->entity_match->value_len,
1882
+ tkz->pos - tail_size, tail_size);
1883
+ }
1884
+
1885
+ memcpy(start, tkz->entity_match->value, tkz->entity_match->value_len);
1886
+
1887
+ tkz->pos = start + (tkz->entity_match->value_len + tail_size);
1888
+
1889
+ return data;
1890
+ }
1891
+
1892
+ /*
1893
+ * 12.2.5.74 Ambiguous ampersand state
1894
+ */
1895
+ static const lxb_char_t *
1896
+ lxb_html_tokenizer_state_char_ref_ambiguous_ampersand(lxb_html_tokenizer_t *tkz,
1897
+ const lxb_char_t *data,
1898
+ const lxb_char_t *end)
1899
+ {
1900
+ /* ASCII alphanumeric */
1901
+ /* Skipped, not need */
1902
+
1903
+ /* U+003B SEMICOLON (;) */
1904
+ if (*data == 0x3B) {
1905
+ lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1906
+ LXB_HTML_TOKENIZER_ERROR_UNNACHRE);
1907
+ }
1908
+
1909
+ tkz->state = tkz->state_return;
1910
+
1911
+ return data;
1912
+ }
1913
+
1914
+ /*
1915
+ * 12.2.5.75 Numeric character reference state
1916
+ */
1917
+ static const lxb_char_t *
1918
+ lxb_html_tokenizer_state_char_ref_numeric(lxb_html_tokenizer_t *tkz,
1919
+ const lxb_char_t *data,
1920
+ const lxb_char_t *end)
1921
+ {
1922
+ tkz->entity_number = 0;
1923
+
1924
+ /*
1925
+ * U+0078 LATIN SMALL LETTER X
1926
+ * U+0058 LATIN CAPITAL LETTER X
1927
+ */
1928
+ if (*data == 0x78 || *data == 0x58) {
1929
+ lxb_html_tokenizer_state_append_m(tkz, data, 1);
1930
+
1931
+ tkz->state = lxb_html_tokenizer_state_char_ref_hexademical_start;
1932
+
1933
+ return (data + 1);
1934
+ }
1935
+
1936
+ tkz->state = lxb_html_tokenizer_state_char_ref_decimal_start;
1937
+
1938
+ return data;
1939
+ }
1940
+
1941
+ /*
1942
+ * 12.2.5.76 Hexademical character reference start state
1943
+ */
1944
+ static const lxb_char_t *
1945
+ lxb_html_tokenizer_state_char_ref_hexademical_start(lxb_html_tokenizer_t *tkz,
1946
+ const lxb_char_t *data,
1947
+ const lxb_char_t *end)
1948
+ {
1949
+ /* ASCII hex digit */
1950
+ if (lexbor_str_res_map_hex[ *data ] != LEXBOR_STR_RES_SLIP) {
1951
+ tkz->state = lxb_html_tokenizer_state_char_ref_hexademical;
1952
+ }
1953
+ else {
1954
+ lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1955
+ LXB_HTML_TOKENIZER_ERROR_ABOFDIINNUCHRE);
1956
+
1957
+ tkz->state = tkz->state_return;
1958
+ }
1959
+
1960
+ return data;
1961
+ }
1962
+
1963
+ /*
1964
+ * 12.2.5.77 Decimal character reference start state
1965
+ */
1966
+ static const lxb_char_t *
1967
+ lxb_html_tokenizer_state_char_ref_decimal_start(lxb_html_tokenizer_t *tkz,
1968
+ const lxb_char_t *data,
1969
+ const lxb_char_t *end)
1970
+ {
1971
+ /* ASCII digit */
1972
+ if (lexbor_str_res_map_num[ *data ] != LEXBOR_STR_RES_SLIP) {
1973
+ tkz->state = lxb_html_tokenizer_state_char_ref_decimal;
1974
+ }
1975
+ else {
1976
+ lxb_html_tokenizer_error_add(tkz->parse_errors, data,
1977
+ LXB_HTML_TOKENIZER_ERROR_ABOFDIINNUCHRE);
1978
+
1979
+ tkz->state = tkz->state_return;
1980
+ }
1981
+
1982
+ return data;
1983
+ }
1984
+
1985
+ /*
1986
+ * 12.2.5.78 Hexademical character reference state
1987
+ */
1988
+ static const lxb_char_t *
1989
+ lxb_html_tokenizer_state_char_ref_hexademical(lxb_html_tokenizer_t *tkz,
1990
+ const lxb_char_t *data,
1991
+ const lxb_char_t *end)
1992
+ {
1993
+ while (data != end) {
1994
+ if (lexbor_str_res_map_hex[ *data ] == LEXBOR_STR_RES_SLIP) {
1995
+ tkz->state = tkz->state_return;
1996
+
1997
+ if (*data == ';') {
1998
+ data++;
1999
+ }
2000
+
2001
+ return lxb_html_tokenizer_state_char_ref_numeric_end(tkz, data, end);
2002
+ }
2003
+
2004
+ if (tkz->entity_number <= 0x10FFFF) {
2005
+ tkz->entity_number <<= 4;
2006
+ tkz->entity_number |= lexbor_str_res_map_hex[ *data ];
2007
+ }
2008
+
2009
+ data++;
2010
+ }
2011
+
2012
+ return data;
2013
+ }
2014
+
2015
+ /*
2016
+ * 12.2.5.79 Decimal character reference state
2017
+ */
2018
+ static const lxb_char_t *
2019
+ lxb_html_tokenizer_state_char_ref_decimal(lxb_html_tokenizer_t *tkz,
2020
+ const lxb_char_t *data,
2021
+ const lxb_char_t *end)
2022
+ {
2023
+ while (data != end) {
2024
+ if (lexbor_str_res_map_num[ *data ] == LEXBOR_STR_RES_SLIP) {
2025
+ tkz->state = tkz->state_return;
2026
+
2027
+ if (*data == ';') {
2028
+ data++;
2029
+ }
2030
+
2031
+ return lxb_html_tokenizer_state_char_ref_numeric_end(tkz, data, end);
2032
+ }
2033
+
2034
+ if (tkz->entity_number <= 0x10FFFF) {
2035
+ tkz->entity_number = lexbor_str_res_map_num[ *data ]
2036
+ + tkz->entity_number * 10;
2037
+ }
2038
+
2039
+ data++;
2040
+ }
2041
+
2042
+ return data;
2043
+ }
2044
+
2045
+ /*
2046
+ * 12.2.5.80 Numeric character reference end state
2047
+ */
2048
+ static const lxb_char_t *
2049
+ lxb_html_tokenizer_state_char_ref_numeric_end(lxb_html_tokenizer_t *tkz,
2050
+ const lxb_char_t *data,
2051
+ const lxb_char_t *end)
2052
+ {
2053
+ lxb_char_t *start = &tkz->start[tkz->entity_start];
2054
+
2055
+ if ((start + 4) > tkz->end) {
2056
+ if(lxb_html_tokenizer_temp_realloc(tkz, 4)) {
2057
+ return end;
2058
+ }
2059
+
2060
+ start = &tkz->start[tkz->entity_start];
2061
+ }
2062
+
2063
+ if (tkz->entity_number == 0x00) {
2064
+ lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->markup,
2065
+ LXB_HTML_TOKENIZER_ERROR_NUCHRE);
2066
+
2067
+ goto xFFFD;
2068
+ }
2069
+ else if (tkz->entity_number > 0x10FFFF) {
2070
+ lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->markup,
2071
+ LXB_HTML_TOKENIZER_ERROR_CHREOUUNRA);
2072
+
2073
+ goto xFFFD;
2074
+ }
2075
+ else if (tkz->entity_number >= 0xD800 && tkz->entity_number <= 0xDFFF) {
2076
+ lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->markup,
2077
+ LXB_HTML_TOKENIZER_ERROR_SUCHRE);
2078
+
2079
+ goto xFFFD;
2080
+ }
2081
+ else if (tkz->entity_number >= 0xFDD0 && tkz->entity_number <= 0xFDEF) {
2082
+ lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->markup,
2083
+ LXB_HTML_TOKENIZER_ERROR_NOCHRE);
2084
+ }
2085
+
2086
+ switch (tkz->entity_number) {
2087
+ case 0xFFFE: case 0xFFFF: case 0x1FFFE: case 0x1FFFF: case 0x2FFFE:
2088
+ case 0x2FFFF: case 0x3FFFE: case 0x3FFFF: case 0x4FFFE: case 0x4FFFF:
2089
+ case 0x5FFFE: case 0x5FFFF: case 0x6FFFE: case 0x6FFFF: case 0x7FFFE:
2090
+ case 0x7FFFF: case 0x8FFFE: case 0x8FFFF: case 0x9FFFE: case 0x9FFFF:
2091
+ case 0xAFFFE: case 0xAFFFF: case 0xBFFFE: case 0xBFFFF: case 0xCFFFE:
2092
+ case 0xCFFFF: case 0xDFFFE: case 0xDFFFF: case 0xEFFFE: case 0xEFFFF:
2093
+ case 0xFFFFE: case 0xFFFFF:
2094
+ case 0x10FFFE:
2095
+ case 0x10FFFF:
2096
+ lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->markup,
2097
+ LXB_HTML_TOKENIZER_ERROR_NOCHRE);
2098
+ break;
2099
+
2100
+ default:
2101
+ break;
2102
+ }
2103
+
2104
+ if (tkz->entity_number <= 0x1F
2105
+ || (tkz->entity_number >= 0x7F && tkz->entity_number <= 0x9F))
2106
+ {
2107
+ lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->markup,
2108
+ LXB_HTML_TOKENIZER_ERROR_COCHRE);
2109
+ }
2110
+
2111
+ if (tkz->entity_number <= 0x9F) {
2112
+ tkz->entity_number = (uint32_t) lexbor_str_res_replacement_character[tkz->entity_number];
2113
+ }
2114
+
2115
+ start += lxb_html_tokenizer_state_to_ascii_utf_8(tkz->entity_number, start);
2116
+
2117
+ tkz->pos = start;
2118
+
2119
+ return data;
2120
+
2121
+ xFFFD:
2122
+
2123
+ memcpy(start, lexbor_str_res_ansi_replacement_character,
2124
+ sizeof(lexbor_str_res_ansi_replacement_character) - 1);
2125
+
2126
+ tkz->pos = start + sizeof(lexbor_str_res_ansi_replacement_character) - 1;
2127
+
2128
+ return data;
2129
+ }
2130
+
2131
/*
 * Encodes `codepoint` as UTF-8 into `data`.
 *
 * `data` must have room for up to four bytes. Returns the number of bytes
 * written (1..4), or 0 when the value is above 0x1FFFFF and cannot be
 * expressed in four bytes; nothing is written in that case. No check for
 * surrogates or for the Unicode upper bound is made here.
 *
 * Bytes are produced as lxb_char_t values directly: going through `char`
 * would be an implementation-defined conversion for values >= 0x80 on
 * platforms where `char` is signed.
 */
static size_t
lxb_html_tokenizer_state_to_ascii_utf_8(size_t codepoint, lxb_char_t *data)
{
    /* 0x80 -- 10xxxxxx */
    /* 0xC0 -- 110xxxxx */
    /* 0xE0 -- 1110xxxx */
    /* 0xF0 -- 11110xxx */

    if (codepoint <= 0x0000007F) {
        /* 0xxxxxxx */
        data[0] = (lxb_char_t) codepoint;

        return 1;
    }

    if (codepoint <= 0x000007FF) {
        /* 110xxxxx 10xxxxxx */
        data[0] = (lxb_char_t) (0xC0 | (codepoint >> 6));
        data[1] = (lxb_char_t) (0x80 | (codepoint & 0x3F));

        return 2;
    }

    if (codepoint <= 0x0000FFFF) {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        data[0] = (lxb_char_t) (0xE0 | (codepoint >> 12));
        data[1] = (lxb_char_t) (0x80 | ((codepoint >> 6) & 0x3F));
        data[2] = (lxb_char_t) (0x80 | (codepoint & 0x3F));

        return 3;
    }

    if (codepoint <= 0x001FFFFF) {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        data[0] = (lxb_char_t) (0xF0 | (codepoint >> 18));
        data[1] = (lxb_char_t) (0x80 | ((codepoint >> 12) & 0x3F));
        data[2] = (lxb_char_t) (0x80 | ((codepoint >> 6) & 0x3F));
        data[3] = (lxb_char_t) (0x80 | (codepoint & 0x3F));

        return 4;
    }

    return 0;
}