nokolexbor 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (486) hide show
  1. checksums.yaml +7 -0
  2. data/ext/nokolexbor/config.h +186 -0
  3. data/ext/nokolexbor/extconf.rb +131 -0
  4. data/ext/nokolexbor/libxml/HTMLparser.h +320 -0
  5. data/ext/nokolexbor/libxml/SAX2.h +173 -0
  6. data/ext/nokolexbor/libxml/chvalid.h +230 -0
  7. data/ext/nokolexbor/libxml/debugXML.h +217 -0
  8. data/ext/nokolexbor/libxml/dict.h +81 -0
  9. data/ext/nokolexbor/libxml/encoding.h +232 -0
  10. data/ext/nokolexbor/libxml/entities.h +153 -0
  11. data/ext/nokolexbor/libxml/globals.h +529 -0
  12. data/ext/nokolexbor/libxml/hash.h +236 -0
  13. data/ext/nokolexbor/libxml/list.h +137 -0
  14. data/ext/nokolexbor/libxml/parser.h +1264 -0
  15. data/ext/nokolexbor/libxml/parserInternals.h +641 -0
  16. data/ext/nokolexbor/libxml/pattern.h +100 -0
  17. data/ext/nokolexbor/libxml/threads.h +94 -0
  18. data/ext/nokolexbor/libxml/tree.h +1315 -0
  19. data/ext/nokolexbor/libxml/uri.h +94 -0
  20. data/ext/nokolexbor/libxml/valid.h +448 -0
  21. data/ext/nokolexbor/libxml/xmlIO.h +369 -0
  22. data/ext/nokolexbor/libxml/xmlautomata.h +146 -0
  23. data/ext/nokolexbor/libxml/xmlerror.h +919 -0
  24. data/ext/nokolexbor/libxml/xmlexports.h +79 -0
  25. data/ext/nokolexbor/libxml/xmlmemory.h +226 -0
  26. data/ext/nokolexbor/libxml/xmlregexp.h +222 -0
  27. data/ext/nokolexbor/libxml/xmlstring.h +140 -0
  28. data/ext/nokolexbor/libxml/xmlversion.h +526 -0
  29. data/ext/nokolexbor/libxml/xpath.h +575 -0
  30. data/ext/nokolexbor/libxml/xpathInternals.h +632 -0
  31. data/ext/nokolexbor/libxml/xpointer.h +137 -0
  32. data/ext/nokolexbor/libxml.h +76 -0
  33. data/ext/nokolexbor/memory.c +39 -0
  34. data/ext/nokolexbor/nl_document.c +51 -0
  35. data/ext/nokolexbor/nl_node.c +790 -0
  36. data/ext/nokolexbor/nl_node_set.c +368 -0
  37. data/ext/nokolexbor/nl_xpath_context.c +200 -0
  38. data/ext/nokolexbor/nokolexbor.c +63 -0
  39. data/ext/nokolexbor/nokolexbor.h +37 -0
  40. data/ext/nokolexbor/private/buf.h +70 -0
  41. data/ext/nokolexbor/private/dict.h +11 -0
  42. data/ext/nokolexbor/private/enc.h +17 -0
  43. data/ext/nokolexbor/private/error.h +21 -0
  44. data/ext/nokolexbor/private/globals.h +9 -0
  45. data/ext/nokolexbor/private/memory.h +9 -0
  46. data/ext/nokolexbor/private/parser.h +27 -0
  47. data/ext/nokolexbor/private/string.h +9 -0
  48. data/ext/nokolexbor/private/threads.h +50 -0
  49. data/ext/nokolexbor/private/tree.h +18 -0
  50. data/ext/nokolexbor/private/xpath.h +7 -0
  51. data/ext/nokolexbor/timsort.h +601 -0
  52. data/ext/nokolexbor/xml_SAX2.c +80 -0
  53. data/ext/nokolexbor/xml_buf.c +363 -0
  54. data/ext/nokolexbor/xml_chvalid.c +334 -0
  55. data/ext/nokolexbor/xml_dict.c +1264 -0
  56. data/ext/nokolexbor/xml_encoding.c +124 -0
  57. data/ext/nokolexbor/xml_error.c +134 -0
  58. data/ext/nokolexbor/xml_globals.c +1085 -0
  59. data/ext/nokolexbor/xml_hash.c +1141 -0
  60. data/ext/nokolexbor/xml_memory.c +203 -0
  61. data/ext/nokolexbor/xml_parser.c +127 -0
  62. data/ext/nokolexbor/xml_parserInternals.c +338 -0
  63. data/ext/nokolexbor/xml_pattern.c +2375 -0
  64. data/ext/nokolexbor/xml_string.c +1051 -0
  65. data/ext/nokolexbor/xml_threads.c +881 -0
  66. data/ext/nokolexbor/xml_tree.c +148 -0
  67. data/ext/nokolexbor/xml_xpath.c +14743 -0
  68. data/lib/nokolexbor/attribute.rb +18 -0
  69. data/lib/nokolexbor/document.rb +6 -0
  70. data/lib/nokolexbor/node.rb +264 -0
  71. data/lib/nokolexbor/node_set.rb +124 -0
  72. data/lib/nokolexbor/version.rb +5 -0
  73. data/lib/nokolexbor/xpath_context.rb +14 -0
  74. data/lib/nokolexbor.rb +17 -0
  75. data/patches/0001-lexbor-support-text-pseudo-element.patch +137 -0
  76. data/patches/0002-lexbor-match-id-class-case-sensitive.patch +22 -0
  77. data/patches/0003-lexbor-attach-template-content-to-self.patch +13 -0
  78. data/vendor/lexbor/CMakeLists.txt +331 -0
  79. data/vendor/lexbor/config.cmake +890 -0
  80. data/vendor/lexbor/feature.cmake +134 -0
  81. data/vendor/lexbor/source/lexbor/core/array.c +208 -0
  82. data/vendor/lexbor/source/lexbor/core/array.h +100 -0
  83. data/vendor/lexbor/source/lexbor/core/array_obj.c +216 -0
  84. data/vendor/lexbor/source/lexbor/core/array_obj.h +134 -0
  85. data/vendor/lexbor/source/lexbor/core/avl.c +442 -0
  86. data/vendor/lexbor/source/lexbor/core/avl.h +82 -0
  87. data/vendor/lexbor/source/lexbor/core/base.h +86 -0
  88. data/vendor/lexbor/source/lexbor/core/bst.c +468 -0
  89. data/vendor/lexbor/source/lexbor/core/bst.h +108 -0
  90. data/vendor/lexbor/source/lexbor/core/bst_map.c +238 -0
  91. data/vendor/lexbor/source/lexbor/core/bst_map.h +87 -0
  92. data/vendor/lexbor/source/lexbor/core/config.cmake +12 -0
  93. data/vendor/lexbor/source/lexbor/core/conv.c +203 -0
  94. data/vendor/lexbor/source/lexbor/core/conv.h +53 -0
  95. data/vendor/lexbor/source/lexbor/core/core.h +35 -0
  96. data/vendor/lexbor/source/lexbor/core/def.h +57 -0
  97. data/vendor/lexbor/source/lexbor/core/diyfp.c +153 -0
  98. data/vendor/lexbor/source/lexbor/core/diyfp.h +258 -0
  99. data/vendor/lexbor/source/lexbor/core/dobject.c +187 -0
  100. data/vendor/lexbor/source/lexbor/core/dobject.h +92 -0
  101. data/vendor/lexbor/source/lexbor/core/dtoa.c +404 -0
  102. data/vendor/lexbor/source/lexbor/core/dtoa.h +28 -0
  103. data/vendor/lexbor/source/lexbor/core/fs.h +60 -0
  104. data/vendor/lexbor/source/lexbor/core/hash.c +476 -0
  105. data/vendor/lexbor/source/lexbor/core/hash.h +218 -0
  106. data/vendor/lexbor/source/lexbor/core/in.c +267 -0
  107. data/vendor/lexbor/source/lexbor/core/in.h +172 -0
  108. data/vendor/lexbor/source/lexbor/core/lexbor.h +35 -0
  109. data/vendor/lexbor/source/lexbor/core/mem.c +228 -0
  110. data/vendor/lexbor/source/lexbor/core/mem.h +141 -0
  111. data/vendor/lexbor/source/lexbor/core/mraw.c +428 -0
  112. data/vendor/lexbor/source/lexbor/core/mraw.h +114 -0
  113. data/vendor/lexbor/source/lexbor/core/perf.h +45 -0
  114. data/vendor/lexbor/source/lexbor/core/plog.c +73 -0
  115. data/vendor/lexbor/source/lexbor/core/plog.h +102 -0
  116. data/vendor/lexbor/source/lexbor/core/print.c +168 -0
  117. data/vendor/lexbor/source/lexbor/core/print.h +39 -0
  118. data/vendor/lexbor/source/lexbor/core/sbst.h +59 -0
  119. data/vendor/lexbor/source/lexbor/core/serialize.c +27 -0
  120. data/vendor/lexbor/source/lexbor/core/serialize.h +32 -0
  121. data/vendor/lexbor/source/lexbor/core/shs.c +118 -0
  122. data/vendor/lexbor/source/lexbor/core/shs.h +82 -0
  123. data/vendor/lexbor/source/lexbor/core/str.c +617 -0
  124. data/vendor/lexbor/source/lexbor/core/str.h +247 -0
  125. data/vendor/lexbor/source/lexbor/core/str_res.h +369 -0
  126. data/vendor/lexbor/source/lexbor/core/strtod.c +326 -0
  127. data/vendor/lexbor/source/lexbor/core/strtod.h +28 -0
  128. data/vendor/lexbor/source/lexbor/core/types.h +39 -0
  129. data/vendor/lexbor/source/lexbor/core/utils.c +43 -0
  130. data/vendor/lexbor/source/lexbor/core/utils.h +36 -0
  131. data/vendor/lexbor/source/lexbor/css/base.h +44 -0
  132. data/vendor/lexbor/source/lexbor/css/config.cmake +2 -0
  133. data/vendor/lexbor/source/lexbor/css/css.h +25 -0
  134. data/vendor/lexbor/source/lexbor/css/log.c +336 -0
  135. data/vendor/lexbor/source/lexbor/css/log.h +103 -0
  136. data/vendor/lexbor/source/lexbor/css/node.h +29 -0
  137. data/vendor/lexbor/source/lexbor/css/parser.c +473 -0
  138. data/vendor/lexbor/source/lexbor/css/parser.h +368 -0
  139. data/vendor/lexbor/source/lexbor/css/selectors/base.h +48 -0
  140. data/vendor/lexbor/source/lexbor/css/selectors/pseudo.c +91 -0
  141. data/vendor/lexbor/source/lexbor/css/selectors/pseudo.h +66 -0
  142. data/vendor/lexbor/source/lexbor/css/selectors/pseudo_const.h +109 -0
  143. data/vendor/lexbor/source/lexbor/css/selectors/pseudo_res.h +302 -0
  144. data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +279 -0
  145. data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.h +85 -0
  146. data/vendor/lexbor/source/lexbor/css/selectors/selector.c +927 -0
  147. data/vendor/lexbor/source/lexbor/css/selectors/selector.h +200 -0
  148. data/vendor/lexbor/source/lexbor/css/selectors/selectors.c +340 -0
  149. data/vendor/lexbor/source/lexbor/css/selectors/selectors.h +137 -0
  150. data/vendor/lexbor/source/lexbor/css/selectors/state.c +1718 -0
  151. data/vendor/lexbor/source/lexbor/css/selectors/state.h +79 -0
  152. data/vendor/lexbor/source/lexbor/css/stylesheet.h +37 -0
  153. data/vendor/lexbor/source/lexbor/css/syntax/anb.c +443 -0
  154. data/vendor/lexbor/source/lexbor/css/syntax/anb.h +45 -0
  155. data/vendor/lexbor/source/lexbor/css/syntax/base.h +33 -0
  156. data/vendor/lexbor/source/lexbor/css/syntax/parser.c +9 -0
  157. data/vendor/lexbor/source/lexbor/css/syntax/parser.h +25 -0
  158. data/vendor/lexbor/source/lexbor/css/syntax/res.h +48 -0
  159. data/vendor/lexbor/source/lexbor/css/syntax/state.c +2603 -0
  160. data/vendor/lexbor/source/lexbor/css/syntax/state.h +140 -0
  161. data/vendor/lexbor/source/lexbor/css/syntax/state_res.h +273 -0
  162. data/vendor/lexbor/source/lexbor/css/syntax/syntax.c +67 -0
  163. data/vendor/lexbor/source/lexbor/css/syntax/token.c +618 -0
  164. data/vendor/lexbor/source/lexbor/css/syntax/token.h +298 -0
  165. data/vendor/lexbor/source/lexbor/css/syntax/token_res.h +68 -0
  166. data/vendor/lexbor/source/lexbor/css/syntax/tokenizer/error.c +30 -0
  167. data/vendor/lexbor/source/lexbor/css/syntax/tokenizer/error.h +58 -0
  168. data/vendor/lexbor/source/lexbor/css/syntax/tokenizer.c +278 -0
  169. data/vendor/lexbor/source/lexbor/css/syntax/tokenizer.h +121 -0
  170. data/vendor/lexbor/source/lexbor/dom/base.h +32 -0
  171. data/vendor/lexbor/source/lexbor/dom/collection.c +97 -0
  172. data/vendor/lexbor/source/lexbor/dom/collection.h +112 -0
  173. data/vendor/lexbor/source/lexbor/dom/config.cmake +3 -0
  174. data/vendor/lexbor/source/lexbor/dom/dom.h +29 -0
  175. data/vendor/lexbor/source/lexbor/dom/exception.c +18 -0
  176. data/vendor/lexbor/source/lexbor/dom/exception.h +73 -0
  177. data/vendor/lexbor/source/lexbor/dom/interface.c +110 -0
  178. data/vendor/lexbor/source/lexbor/dom/interface.h +88 -0
  179. data/vendor/lexbor/source/lexbor/dom/interfaces/attr.c +445 -0
  180. data/vendor/lexbor/source/lexbor/dom/interfaces/attr.h +152 -0
  181. data/vendor/lexbor/source/lexbor/dom/interfaces/attr_const.h +62 -0
  182. data/vendor/lexbor/source/lexbor/dom/interfaces/attr_res.h +143 -0
  183. data/vendor/lexbor/source/lexbor/dom/interfaces/cdata_section.c +55 -0
  184. data/vendor/lexbor/source/lexbor/dom/interfaces/cdata_section.h +38 -0
  185. data/vendor/lexbor/source/lexbor/dom/interfaces/character_data.c +110 -0
  186. data/vendor/lexbor/source/lexbor/dom/interfaces/character_data.h +51 -0
  187. data/vendor/lexbor/source/lexbor/dom/interfaces/comment.c +64 -0
  188. data/vendor/lexbor/source/lexbor/dom/interfaces/comment.h +42 -0
  189. data/vendor/lexbor/source/lexbor/dom/interfaces/document.c +536 -0
  190. data/vendor/lexbor/source/lexbor/dom/interfaces/document.h +243 -0
  191. data/vendor/lexbor/source/lexbor/dom/interfaces/document_fragment.c +36 -0
  192. data/vendor/lexbor/source/lexbor/dom/interfaces/document_fragment.h +36 -0
  193. data/vendor/lexbor/source/lexbor/dom/interfaces/document_type.c +125 -0
  194. data/vendor/lexbor/source/lexbor/dom/interfaces/document_type.h +108 -0
  195. data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +1411 -0
  196. data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +319 -0
  197. data/vendor/lexbor/source/lexbor/dom/interfaces/event_target.c +32 -0
  198. data/vendor/lexbor/source/lexbor/dom/interfaces/event_target.h +34 -0
  199. data/vendor/lexbor/source/lexbor/dom/interfaces/node.c +661 -0
  200. data/vendor/lexbor/source/lexbor/dom/interfaces/node.h +192 -0
  201. data/vendor/lexbor/source/lexbor/dom/interfaces/processing_instruction.c +87 -0
  202. data/vendor/lexbor/source/lexbor/dom/interfaces/processing_instruction.h +66 -0
  203. data/vendor/lexbor/source/lexbor/dom/interfaces/shadow_root.c +36 -0
  204. data/vendor/lexbor/source/lexbor/dom/interfaces/shadow_root.h +44 -0
  205. data/vendor/lexbor/source/lexbor/dom/interfaces/text.c +63 -0
  206. data/vendor/lexbor/source/lexbor/dom/interfaces/text.h +42 -0
  207. data/vendor/lexbor/source/lexbor/encoding/base.h +218 -0
  208. data/vendor/lexbor/source/lexbor/encoding/big5.c +42839 -0
  209. data/vendor/lexbor/source/lexbor/encoding/config.cmake +12 -0
  210. data/vendor/lexbor/source/lexbor/encoding/const.h +65 -0
  211. data/vendor/lexbor/source/lexbor/encoding/decode.c +3193 -0
  212. data/vendor/lexbor/source/lexbor/encoding/decode.h +370 -0
  213. data/vendor/lexbor/source/lexbor/encoding/encode.c +1931 -0
  214. data/vendor/lexbor/source/lexbor/encoding/encode.h +377 -0
  215. data/vendor/lexbor/source/lexbor/encoding/encoding.c +252 -0
  216. data/vendor/lexbor/source/lexbor/encoding/encoding.h +475 -0
  217. data/vendor/lexbor/source/lexbor/encoding/euc_kr.c +53883 -0
  218. data/vendor/lexbor/source/lexbor/encoding/gb18030.c +47905 -0
  219. data/vendor/lexbor/source/lexbor/encoding/iso_2022_jp_katakana.c +159 -0
  220. data/vendor/lexbor/source/lexbor/encoding/jis0208.c +22477 -0
  221. data/vendor/lexbor/source/lexbor/encoding/jis0212.c +15787 -0
  222. data/vendor/lexbor/source/lexbor/encoding/multi.h +53 -0
  223. data/vendor/lexbor/source/lexbor/encoding/range.c +71 -0
  224. data/vendor/lexbor/source/lexbor/encoding/range.h +34 -0
  225. data/vendor/lexbor/source/lexbor/encoding/res.c +222 -0
  226. data/vendor/lexbor/source/lexbor/encoding/res.h +34 -0
  227. data/vendor/lexbor/source/lexbor/encoding/single.c +13748 -0
  228. data/vendor/lexbor/source/lexbor/encoding/single.h +116 -0
  229. data/vendor/lexbor/source/lexbor/html/base.h +44 -0
  230. data/vendor/lexbor/source/lexbor/html/config.cmake +3 -0
  231. data/vendor/lexbor/source/lexbor/html/encoding.c +574 -0
  232. data/vendor/lexbor/source/lexbor/html/encoding.h +106 -0
  233. data/vendor/lexbor/source/lexbor/html/html.h +107 -0
  234. data/vendor/lexbor/source/lexbor/html/interface.c +165 -0
  235. data/vendor/lexbor/source/lexbor/html/interface.h +186 -0
  236. data/vendor/lexbor/source/lexbor/html/interface_res.h +4449 -0
  237. data/vendor/lexbor/source/lexbor/html/interfaces/anchor_element.c +36 -0
  238. data/vendor/lexbor/source/lexbor/html/interfaces/anchor_element.h +34 -0
  239. data/vendor/lexbor/source/lexbor/html/interfaces/area_element.c +36 -0
  240. data/vendor/lexbor/source/lexbor/html/interfaces/area_element.h +34 -0
  241. data/vendor/lexbor/source/lexbor/html/interfaces/audio_element.c +36 -0
  242. data/vendor/lexbor/source/lexbor/html/interfaces/audio_element.h +34 -0
  243. data/vendor/lexbor/source/lexbor/html/interfaces/base_element.c +36 -0
  244. data/vendor/lexbor/source/lexbor/html/interfaces/base_element.h +34 -0
  245. data/vendor/lexbor/source/lexbor/html/interfaces/body_element.c +36 -0
  246. data/vendor/lexbor/source/lexbor/html/interfaces/body_element.h +34 -0
  247. data/vendor/lexbor/source/lexbor/html/interfaces/br_element.c +36 -0
  248. data/vendor/lexbor/source/lexbor/html/interfaces/br_element.h +34 -0
  249. data/vendor/lexbor/source/lexbor/html/interfaces/button_element.c +36 -0
  250. data/vendor/lexbor/source/lexbor/html/interfaces/button_element.h +34 -0
  251. data/vendor/lexbor/source/lexbor/html/interfaces/canvas_element.c +36 -0
  252. data/vendor/lexbor/source/lexbor/html/interfaces/canvas_element.h +34 -0
  253. data/vendor/lexbor/source/lexbor/html/interfaces/d_list_element.c +36 -0
  254. data/vendor/lexbor/source/lexbor/html/interfaces/d_list_element.h +34 -0
  255. data/vendor/lexbor/source/lexbor/html/interfaces/data_element.c +36 -0
  256. data/vendor/lexbor/source/lexbor/html/interfaces/data_element.h +34 -0
  257. data/vendor/lexbor/source/lexbor/html/interfaces/data_list_element.c +36 -0
  258. data/vendor/lexbor/source/lexbor/html/interfaces/data_list_element.h +34 -0
  259. data/vendor/lexbor/source/lexbor/html/interfaces/details_element.c +36 -0
  260. data/vendor/lexbor/source/lexbor/html/interfaces/details_element.h +34 -0
  261. data/vendor/lexbor/source/lexbor/html/interfaces/dialog_element.c +36 -0
  262. data/vendor/lexbor/source/lexbor/html/interfaces/dialog_element.h +34 -0
  263. data/vendor/lexbor/source/lexbor/html/interfaces/directory_element.c +36 -0
  264. data/vendor/lexbor/source/lexbor/html/interfaces/directory_element.h +34 -0
  265. data/vendor/lexbor/source/lexbor/html/interfaces/div_element.c +36 -0
  266. data/vendor/lexbor/source/lexbor/html/interfaces/div_element.h +34 -0
  267. data/vendor/lexbor/source/lexbor/html/interfaces/document.c +444 -0
  268. data/vendor/lexbor/source/lexbor/html/interfaces/document.h +256 -0
  269. data/vendor/lexbor/source/lexbor/html/interfaces/element.c +64 -0
  270. data/vendor/lexbor/source/lexbor/html/interfaces/element.h +54 -0
  271. data/vendor/lexbor/source/lexbor/html/interfaces/embed_element.c +36 -0
  272. data/vendor/lexbor/source/lexbor/html/interfaces/embed_element.h +34 -0
  273. data/vendor/lexbor/source/lexbor/html/interfaces/field_set_element.c +36 -0
  274. data/vendor/lexbor/source/lexbor/html/interfaces/field_set_element.h +34 -0
  275. data/vendor/lexbor/source/lexbor/html/interfaces/font_element.c +36 -0
  276. data/vendor/lexbor/source/lexbor/html/interfaces/font_element.h +34 -0
  277. data/vendor/lexbor/source/lexbor/html/interfaces/form_element.c +36 -0
  278. data/vendor/lexbor/source/lexbor/html/interfaces/form_element.h +34 -0
  279. data/vendor/lexbor/source/lexbor/html/interfaces/frame_element.c +36 -0
  280. data/vendor/lexbor/source/lexbor/html/interfaces/frame_element.h +34 -0
  281. data/vendor/lexbor/source/lexbor/html/interfaces/frame_set_element.c +36 -0
  282. data/vendor/lexbor/source/lexbor/html/interfaces/frame_set_element.h +34 -0
  283. data/vendor/lexbor/source/lexbor/html/interfaces/head_element.c +36 -0
  284. data/vendor/lexbor/source/lexbor/html/interfaces/head_element.h +34 -0
  285. data/vendor/lexbor/source/lexbor/html/interfaces/heading_element.c +36 -0
  286. data/vendor/lexbor/source/lexbor/html/interfaces/heading_element.h +34 -0
  287. data/vendor/lexbor/source/lexbor/html/interfaces/hr_element.c +36 -0
  288. data/vendor/lexbor/source/lexbor/html/interfaces/hr_element.h +34 -0
  289. data/vendor/lexbor/source/lexbor/html/interfaces/html_element.c +36 -0
  290. data/vendor/lexbor/source/lexbor/html/interfaces/html_element.h +34 -0
  291. data/vendor/lexbor/source/lexbor/html/interfaces/iframe_element.c +36 -0
  292. data/vendor/lexbor/source/lexbor/html/interfaces/iframe_element.h +34 -0
  293. data/vendor/lexbor/source/lexbor/html/interfaces/image_element.c +36 -0
  294. data/vendor/lexbor/source/lexbor/html/interfaces/image_element.h +34 -0
  295. data/vendor/lexbor/source/lexbor/html/interfaces/input_element.c +36 -0
  296. data/vendor/lexbor/source/lexbor/html/interfaces/input_element.h +34 -0
  297. data/vendor/lexbor/source/lexbor/html/interfaces/label_element.c +36 -0
  298. data/vendor/lexbor/source/lexbor/html/interfaces/label_element.h +34 -0
  299. data/vendor/lexbor/source/lexbor/html/interfaces/legend_element.c +36 -0
  300. data/vendor/lexbor/source/lexbor/html/interfaces/legend_element.h +34 -0
  301. data/vendor/lexbor/source/lexbor/html/interfaces/li_element.c +36 -0
  302. data/vendor/lexbor/source/lexbor/html/interfaces/li_element.h +34 -0
  303. data/vendor/lexbor/source/lexbor/html/interfaces/link_element.c +36 -0
  304. data/vendor/lexbor/source/lexbor/html/interfaces/link_element.h +34 -0
  305. data/vendor/lexbor/source/lexbor/html/interfaces/map_element.c +36 -0
  306. data/vendor/lexbor/source/lexbor/html/interfaces/map_element.h +34 -0
  307. data/vendor/lexbor/source/lexbor/html/interfaces/marquee_element.c +36 -0
  308. data/vendor/lexbor/source/lexbor/html/interfaces/marquee_element.h +34 -0
  309. data/vendor/lexbor/source/lexbor/html/interfaces/media_element.c +36 -0
  310. data/vendor/lexbor/source/lexbor/html/interfaces/media_element.h +34 -0
  311. data/vendor/lexbor/source/lexbor/html/interfaces/menu_element.c +36 -0
  312. data/vendor/lexbor/source/lexbor/html/interfaces/menu_element.h +34 -0
  313. data/vendor/lexbor/source/lexbor/html/interfaces/meta_element.c +36 -0
  314. data/vendor/lexbor/source/lexbor/html/interfaces/meta_element.h +34 -0
  315. data/vendor/lexbor/source/lexbor/html/interfaces/meter_element.c +36 -0
  316. data/vendor/lexbor/source/lexbor/html/interfaces/meter_element.h +34 -0
  317. data/vendor/lexbor/source/lexbor/html/interfaces/mod_element.c +36 -0
  318. data/vendor/lexbor/source/lexbor/html/interfaces/mod_element.h +34 -0
  319. data/vendor/lexbor/source/lexbor/html/interfaces/o_list_element.c +36 -0
  320. data/vendor/lexbor/source/lexbor/html/interfaces/o_list_element.h +34 -0
  321. data/vendor/lexbor/source/lexbor/html/interfaces/object_element.c +36 -0
  322. data/vendor/lexbor/source/lexbor/html/interfaces/object_element.h +34 -0
  323. data/vendor/lexbor/source/lexbor/html/interfaces/opt_group_element.c +36 -0
  324. data/vendor/lexbor/source/lexbor/html/interfaces/opt_group_element.h +34 -0
  325. data/vendor/lexbor/source/lexbor/html/interfaces/option_element.c +36 -0
  326. data/vendor/lexbor/source/lexbor/html/interfaces/option_element.h +34 -0
  327. data/vendor/lexbor/source/lexbor/html/interfaces/output_element.c +36 -0
  328. data/vendor/lexbor/source/lexbor/html/interfaces/output_element.h +34 -0
  329. data/vendor/lexbor/source/lexbor/html/interfaces/paragraph_element.c +36 -0
  330. data/vendor/lexbor/source/lexbor/html/interfaces/paragraph_element.h +34 -0
  331. data/vendor/lexbor/source/lexbor/html/interfaces/param_element.c +36 -0
  332. data/vendor/lexbor/source/lexbor/html/interfaces/param_element.h +34 -0
  333. data/vendor/lexbor/source/lexbor/html/interfaces/picture_element.c +36 -0
  334. data/vendor/lexbor/source/lexbor/html/interfaces/picture_element.h +34 -0
  335. data/vendor/lexbor/source/lexbor/html/interfaces/pre_element.c +36 -0
  336. data/vendor/lexbor/source/lexbor/html/interfaces/pre_element.h +34 -0
  337. data/vendor/lexbor/source/lexbor/html/interfaces/progress_element.c +36 -0
  338. data/vendor/lexbor/source/lexbor/html/interfaces/progress_element.h +34 -0
  339. data/vendor/lexbor/source/lexbor/html/interfaces/quote_element.c +36 -0
  340. data/vendor/lexbor/source/lexbor/html/interfaces/quote_element.h +34 -0
  341. data/vendor/lexbor/source/lexbor/html/interfaces/script_element.c +36 -0
  342. data/vendor/lexbor/source/lexbor/html/interfaces/script_element.h +34 -0
  343. data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +36 -0
  344. data/vendor/lexbor/source/lexbor/html/interfaces/select_element.h +34 -0
  345. data/vendor/lexbor/source/lexbor/html/interfaces/slot_element.c +36 -0
  346. data/vendor/lexbor/source/lexbor/html/interfaces/slot_element.h +34 -0
  347. data/vendor/lexbor/source/lexbor/html/interfaces/source_element.c +36 -0
  348. data/vendor/lexbor/source/lexbor/html/interfaces/source_element.h +34 -0
  349. data/vendor/lexbor/source/lexbor/html/interfaces/span_element.c +36 -0
  350. data/vendor/lexbor/source/lexbor/html/interfaces/span_element.h +34 -0
  351. data/vendor/lexbor/source/lexbor/html/interfaces/style_element.c +36 -0
  352. data/vendor/lexbor/source/lexbor/html/interfaces/style_element.h +34 -0
  353. data/vendor/lexbor/source/lexbor/html/interfaces/table_caption_element.c +36 -0
  354. data/vendor/lexbor/source/lexbor/html/interfaces/table_caption_element.h +34 -0
  355. data/vendor/lexbor/source/lexbor/html/interfaces/table_cell_element.c +36 -0
  356. data/vendor/lexbor/source/lexbor/html/interfaces/table_cell_element.h +34 -0
  357. data/vendor/lexbor/source/lexbor/html/interfaces/table_col_element.c +36 -0
  358. data/vendor/lexbor/source/lexbor/html/interfaces/table_col_element.h +34 -0
  359. data/vendor/lexbor/source/lexbor/html/interfaces/table_element.c +36 -0
  360. data/vendor/lexbor/source/lexbor/html/interfaces/table_element.h +34 -0
  361. data/vendor/lexbor/source/lexbor/html/interfaces/table_row_element.c +36 -0
  362. data/vendor/lexbor/source/lexbor/html/interfaces/table_row_element.h +34 -0
  363. data/vendor/lexbor/source/lexbor/html/interfaces/table_section_element.c +36 -0
  364. data/vendor/lexbor/source/lexbor/html/interfaces/table_section_element.h +34 -0
  365. data/vendor/lexbor/source/lexbor/html/interfaces/template_element.c +46 -0
  366. data/vendor/lexbor/source/lexbor/html/interfaces/template_element.h +38 -0
  367. data/vendor/lexbor/source/lexbor/html/interfaces/text_area_element.c +36 -0
  368. data/vendor/lexbor/source/lexbor/html/interfaces/text_area_element.h +34 -0
  369. data/vendor/lexbor/source/lexbor/html/interfaces/time_element.c +36 -0
  370. data/vendor/lexbor/source/lexbor/html/interfaces/time_element.h +34 -0
  371. data/vendor/lexbor/source/lexbor/html/interfaces/title_element.c +133 -0
  372. data/vendor/lexbor/source/lexbor/html/interfaces/title_element.h +42 -0
  373. data/vendor/lexbor/source/lexbor/html/interfaces/track_element.c +36 -0
  374. data/vendor/lexbor/source/lexbor/html/interfaces/track_element.h +34 -0
  375. data/vendor/lexbor/source/lexbor/html/interfaces/u_list_element.c +36 -0
  376. data/vendor/lexbor/source/lexbor/html/interfaces/u_list_element.h +34 -0
  377. data/vendor/lexbor/source/lexbor/html/interfaces/unknown_element.c +36 -0
  378. data/vendor/lexbor/source/lexbor/html/interfaces/unknown_element.h +34 -0
  379. data/vendor/lexbor/source/lexbor/html/interfaces/video_element.c +36 -0
  380. data/vendor/lexbor/source/lexbor/html/interfaces/video_element.h +34 -0
  381. data/vendor/lexbor/source/lexbor/html/interfaces/window.c +36 -0
  382. data/vendor/lexbor/source/lexbor/html/interfaces/window.h +34 -0
  383. data/vendor/lexbor/source/lexbor/html/node.c +14 -0
  384. data/vendor/lexbor/source/lexbor/html/node.h +67 -0
  385. data/vendor/lexbor/source/lexbor/html/parser.c +469 -0
  386. data/vendor/lexbor/source/lexbor/html/parser.h +170 -0
  387. data/vendor/lexbor/source/lexbor/html/serialize.c +1510 -0
  388. data/vendor/lexbor/source/lexbor/html/serialize.h +93 -0
  389. data/vendor/lexbor/source/lexbor/html/tag.h +103 -0
  390. data/vendor/lexbor/source/lexbor/html/tag_res.h +2262 -0
  391. data/vendor/lexbor/source/lexbor/html/token.c +386 -0
  392. data/vendor/lexbor/source/lexbor/html/token.h +130 -0
  393. data/vendor/lexbor/source/lexbor/html/token_attr.c +44 -0
  394. data/vendor/lexbor/source/lexbor/html/token_attr.h +67 -0
  395. data/vendor/lexbor/source/lexbor/html/tokenizer/error.c +28 -0
  396. data/vendor/lexbor/source/lexbor/html/tokenizer/error.h +141 -0
  397. data/vendor/lexbor/source/lexbor/html/tokenizer/res.h +4956 -0
  398. data/vendor/lexbor/source/lexbor/html/tokenizer/state.c +2171 -0
  399. data/vendor/lexbor/source/lexbor/html/tokenizer/state.h +225 -0
  400. data/vendor/lexbor/source/lexbor/html/tokenizer/state_comment.c +489 -0
  401. data/vendor/lexbor/source/lexbor/html/tokenizer/state_comment.h +27 -0
  402. data/vendor/lexbor/source/lexbor/html/tokenizer/state_doctype.c +1654 -0
  403. data/vendor/lexbor/source/lexbor/html/tokenizer/state_doctype.h +27 -0
  404. data/vendor/lexbor/source/lexbor/html/tokenizer/state_rawtext.c +303 -0
  405. data/vendor/lexbor/source/lexbor/html/tokenizer/state_rawtext.h +32 -0
  406. data/vendor/lexbor/source/lexbor/html/tokenizer/state_rcdata.c +311 -0
  407. data/vendor/lexbor/source/lexbor/html/tokenizer/state_rcdata.h +32 -0
  408. data/vendor/lexbor/source/lexbor/html/tokenizer/state_script.c +1209 -0
  409. data/vendor/lexbor/source/lexbor/html/tokenizer/state_script.h +32 -0
  410. data/vendor/lexbor/source/lexbor/html/tokenizer.c +499 -0
  411. data/vendor/lexbor/source/lexbor/html/tokenizer.h +343 -0
  412. data/vendor/lexbor/source/lexbor/html/tree/active_formatting.c +241 -0
  413. data/vendor/lexbor/source/lexbor/html/tree/active_formatting.h +117 -0
  414. data/vendor/lexbor/source/lexbor/html/tree/error.c +26 -0
  415. data/vendor/lexbor/source/lexbor/html/tree/error.h +114 -0
  416. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/after_after_body.c +62 -0
  417. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/after_after_frameset.c +63 -0
  418. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/after_body.c +82 -0
  419. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/after_frameset.c +88 -0
  420. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/after_head.c +222 -0
  421. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/before_head.c +144 -0
  422. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/before_html.c +166 -0
  423. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/foreign_content.c +358 -0
  424. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1974 -0
  425. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_caption.c +158 -0
  426. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_cell.c +187 -0
  427. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_column_group.c +194 -0
  428. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_frameset.c +149 -0
  429. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_head.c +374 -0
  430. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_head_noscript.c +121 -0
  431. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_row.c +211 -0
  432. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_select.c +341 -0
  433. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_select_in_table.c +115 -0
  434. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_table.c +451 -0
  435. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_table_body.c +208 -0
  436. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_table_text.c +127 -0
  437. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_template.c +189 -0
  438. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/initial.c +411 -0
  439. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/text.c +61 -0
  440. data/vendor/lexbor/source/lexbor/html/tree/insertion_mode.h +135 -0
  441. data/vendor/lexbor/source/lexbor/html/tree/open_elements.c +251 -0
  442. data/vendor/lexbor/source/lexbor/html/tree/open_elements.h +105 -0
  443. data/vendor/lexbor/source/lexbor/html/tree/template_insertion.c +10 -0
  444. data/vendor/lexbor/source/lexbor/html/tree/template_insertion.h +100 -0
  445. data/vendor/lexbor/source/lexbor/html/tree.c +1726 -0
  446. data/vendor/lexbor/source/lexbor/html/tree.h +431 -0
  447. data/vendor/lexbor/source/lexbor/html/tree_res.h +111 -0
  448. data/vendor/lexbor/source/lexbor/ns/base.h +32 -0
  449. data/vendor/lexbor/source/lexbor/ns/config.cmake +2 -0
  450. data/vendor/lexbor/source/lexbor/ns/const.h +37 -0
  451. data/vendor/lexbor/source/lexbor/ns/ns.c +154 -0
  452. data/vendor/lexbor/source/lexbor/ns/ns.h +66 -0
  453. data/vendor/lexbor/source/lexbor/ns/res.h +97 -0
  454. data/vendor/lexbor/source/lexbor/ports/posix/config.cmake +11 -0
  455. data/vendor/lexbor/source/lexbor/ports/posix/lexbor/core/fs.c +236 -0
  456. data/vendor/lexbor/source/lexbor/ports/posix/lexbor/core/memory.c +33 -0
  457. data/vendor/lexbor/source/lexbor/ports/posix/lexbor/core/perf.c +158 -0
  458. data/vendor/lexbor/source/lexbor/ports/windows_nt/config.cmake +18 -0
  459. data/vendor/lexbor/source/lexbor/ports/windows_nt/lexbor/core/fs.c +239 -0
  460. data/vendor/lexbor/source/lexbor/ports/windows_nt/lexbor/core/memory.c +33 -0
  461. data/vendor/lexbor/source/lexbor/ports/windows_nt/lexbor/core/perf.c +81 -0
  462. data/vendor/lexbor/source/lexbor/selectors/base.h +30 -0
  463. data/vendor/lexbor/source/lexbor/selectors/config.cmake +2 -0
  464. data/vendor/lexbor/source/lexbor/selectors/selectors.c +1591 -0
  465. data/vendor/lexbor/source/lexbor/selectors/selectors.h +71 -0
  466. data/vendor/lexbor/source/lexbor/tag/base.h +32 -0
  467. data/vendor/lexbor/source/lexbor/tag/config.cmake +2 -0
  468. data/vendor/lexbor/source/lexbor/tag/const.h +225 -0
  469. data/vendor/lexbor/source/lexbor/tag/res.h +562 -0
  470. data/vendor/lexbor/source/lexbor/tag/tag.c +144 -0
  471. data/vendor/lexbor/source/lexbor/tag/tag.h +123 -0
  472. data/vendor/lexbor/source/lexbor/utils/base.h +32 -0
  473. data/vendor/lexbor/source/lexbor/utils/config.cmake +2 -0
  474. data/vendor/lexbor/source/lexbor/utils/http.c +534 -0
  475. data/vendor/lexbor/source/lexbor/utils/http.h +90 -0
  476. data/vendor/lexbor/source/lexbor/utils/utils.h +15 -0
  477. data/vendor/lexbor/source/lexbor/utils/warc.c +817 -0
  478. data/vendor/lexbor/source/lexbor/utils/warc.h +126 -0
  479. data/vendor/lexbor/utils/lexbor/css/selectors/pseudo.py +231 -0
  480. data/vendor/lexbor/utils/lexbor/css/selectors/tmp/const.h +21 -0
  481. data/vendor/lexbor/utils/lexbor/css/selectors/tmp/res.h +26 -0
  482. data/vendor/lexbor/utils/lexbor/css/syntax/definitions.py +49 -0
  483. data/vendor/lexbor/utils/lexbor/css/syntax/token_res.py +54 -0
  484. data/vendor/lexbor/utils/lexbor/css/syntax/tokenizer_code_map.py +36 -0
  485. data/vendor/lexbor/version +1 -0
  486. metadata +542 -0
@@ -0,0 +1,3193 @@
1
+ /*
2
+ * Copyright (C) 2019 Alexander Borisov
3
+ *
4
+ * Author: Alexander Borisov <borisov@lexbor.com>
5
+ */
6
+
7
+ #include "lexbor/encoding/decode.h"
8
+ #include "lexbor/encoding/single.h"
9
+ #include "lexbor/encoding/multi.h"
10
+ #include "lexbor/encoding/range.h"
11
+
12
+
13
/*
 * Consume one UTF-8 continuation byte at `p`.
 *
 * Expects `ch`, `p`, `need`, `data` and `ctx` in the enclosing scope.
 * A byte inside [lo_bound, hi_bound] is folded into ctx->codepoint (six
 * payload bits) and `p`/`need` are advanced.  Any other byte resets the
 * UTF-8 state, emits the replacement sequence and then executes `on_fail`
 * (a statement supplied by the caller).  If the replacement does not fit,
 * *data is rewound to `p`, have_error is raised and the enclosing function
 * returns.
 *
 * This is a bare compound statement, not a do/while, so `on_fail` may be
 * a `continue` or `break` aimed at the caller's own loop.
 */
#define LXB_ENCODING_DECODE_UTF_8_BOUNDARY(lo_bound, hi_bound, on_fail)        \
    {                                                                          \
        ch = *p;                                                               \
                                                                               \
        if (ch >= lo_bound && ch <= hi_bound) {                                \
            p++;                                                               \
            need--;                                                            \
            ctx->codepoint = (ctx->codepoint << 6) | (ch & 0x3F);              \
        }                                                                      \
        else {                                                                 \
            ctx->u.utf_8.lower = 0x00;                                         \
            ctx->u.utf_8.need = 0;                                             \
                                                                               \
            LXB_ENCODING_DECODE_ERROR_BEGIN {                                  \
                *data = p;                                                     \
                ctx->have_error = true;                                        \
            }                                                                  \
            LXB_ENCODING_DECODE_ERROR_END();                                   \
                                                                               \
            on_fail;                                                           \
        }                                                                      \
    }

/*
 * Narrow the range allowed for the next continuation byte, depending on
 * the lead byte held in `ch`: lead_a raises the lower bound to a_lower,
 * lead_b lowers the upper bound to b_upper.  Other leads leave the stored
 * bounds untouched.
 */
#define LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SET(lead_a, lead_b, a_lower,        \
                                               b_upper)                        \
    do {                                                                       \
        if (ch == lead_a) {                                                    \
            ctx->u.utf_8.lower = a_lower;                                      \
            ctx->u.utf_8.upper = 0xBF;                                         \
        }                                                                      \
        else if (ch == lead_b) {                                               \
            ctx->u.utf_8.lower = 0x80;                                         \
            ctx->u.utf_8.upper = b_upper;                                      \
        }                                                                      \
    }                                                                          \
    while (0)
48
+
49
/* Store `value` in the next output slot; the caller guarantees room. */
#define LXB_ENCODING_DECODE_APPEND_WO_CHECK(dec, value)                        \
    do {                                                                       \
        (dec)->buffer_out[(dec)->buffer_used++] = (value);                     \
    }                                                                          \
    while (0)

/*
 * Store `value` in the next output slot, or make the enclosing function
 * return LXB_STATUS_SMALL_BUFFER when the output buffer is already full.
 */
#define LXB_ENCODING_DECODE_APPEND(dec, value)                                 \
    do {                                                                       \
        if ((dec)->buffer_length <= (dec)->buffer_used) {                      \
            return LXB_STATUS_SMALL_BUFFER;                                    \
        }                                                                      \
                                                                               \
        (dec)->buffer_out[(dec)->buffer_used++] = (value);                     \
    }                                                                          \
    while (0)

/*
 * Same as LXB_ENCODING_DECODE_APPEND, but publishes the local cursor `p`
 * through `*data` before bailing out on a full buffer.
 */
#define LXB_ENCODING_DECODE_APPEND_P(dec, value)                               \
    do {                                                                       \
        if ((dec)->buffer_length <= (dec)->buffer_used) {                      \
            *data = p;                                                         \
            return LXB_STATUS_SMALL_BUFFER;                                    \
        }                                                                      \
                                                                               \
        (dec)->buffer_out[(dec)->buffer_used++] = (value);                     \
    }                                                                          \
    while (0)

/* Return LXB_STATUS_SMALL_BUFFER unless at least one output slot is free. */
#define LXB_ENCODING_DECODE_CHECK_OUT(dec)                                     \
    do {                                                                       \
        if ((dec)->buffer_length <= (dec)->buffer_used) {                      \
            return LXB_STATUS_SMALL_BUFFER;                                    \
        }                                                                      \
    }                                                                          \
    while (0)
83
+
84
/*
 * ERROR_BEGIN / ERROR_END bracket a caller-supplied `{ ... }` block:
 *
 *     LXB_ENCODING_DECODE_ERROR_BEGIN {
 *         ...state to save when the replacement does not fit...
 *     }
 *     LXB_ENCODING_DECODE_ERROR_END();
 *
 * Expansion, in order:
 *   1. no replacement configured (ctx->replace_to == NULL): the enclosing
 *      function returns LXB_STATUS_ERROR;
 *   2. replacement does not fit in the output buffer: the caller's block
 *      runs once, then the enclosing function returns
 *      LXB_STATUS_SMALL_BUFFER;
 *   3. otherwise the replacement code points are copied to the output and
 *      the caller's block is skipped.
 *
 * The caller's block is the body of an inner do/while(0), so a `break` or
 * `continue` inside it only leaves that block.
 */
#define LXB_ENCODING_DECODE_ERROR_BEGIN                                        \
    do {                                                                       \
        if (ctx->replace_to == NULL) {                                         \
            return LXB_STATUS_ERROR;                                           \
        }                                                                      \
                                                                               \
        if (ctx->buffer_length < (ctx->buffer_used + ctx->replace_len)) {      \
            do

#define LXB_ENCODING_DECODE_ERROR_END()                                        \
            while (0);                                                         \
                                                                               \
            return LXB_STATUS_SMALL_BUFFER;                                    \
        }                                                                      \
                                                                               \
        memcpy(ctx->buffer_out + ctx->buffer_used, ctx->replace_to,            \
               ctx->replace_len * sizeof(lxb_codepoint_t));                    \
                                                                               \
        ctx->buffer_used += ctx->replace_len;                                  \
    }                                                                          \
    while (0)

/*
 * Emit the replacement with nothing extra to save.  The parameter is not
 * substituted into the expansion; the macro always operates on the
 * variable literally named `ctx` in the enclosing scope.
 */
#define LXB_ENCODING_DECODE_ERROR(ctx)                                         \
    do {                                                                       \
        LXB_ENCODING_DECODE_ERROR_BEGIN {                                      \
        }                                                                      \
        LXB_ENCODING_DECODE_ERROR_END();                                       \
    }                                                                          \
    while (0)

/*
 * Error path of the multi-byte decoders.  Expects `byte`, `data` and
 * `ctx` in scope.  A trail byte below 0x80 is pushed back so that it is
 * decoded again on its own.  If the replacement does not fit, have_error
 * is raised and `ident` (the decoder's lead slot) is set to 0x01 so the
 * next call re-enters its pending-error branch.
 */
#define LXB_ENCODING_DECODE_FAILED(ident)                                      \
    do {                                                                       \
        if (byte < 0x80) {                                                     \
            (*data)--;                                                         \
        }                                                                      \
                                                                               \
        LXB_ENCODING_DECODE_ERROR_BEGIN {                                      \
            ctx->have_error = true;                                            \
            (ident) = 0x01;                                                    \
        }                                                                      \
        LXB_ENCODING_DECODE_ERROR_END();                                       \
    }                                                                          \
    while (0)
126
+
127
/*
 * Body shared by every single-byte decoder.  Expects `ctx`, `data` and
 * `end` in the enclosing function.
 *
 * Bytes below 0x80 are emitted unchanged; other bytes are looked up in
 * `decode_map` (indexed by byte - 0x80).  An entry equal to
 * LXB_ENCODING_ERROR_CODEPOINT emits the replacement sequence instead.
 *
 * The cursor is advanced only after the output for the current byte has
 * been stored, and *data is refreshed after every byte.  Consequently:
 *   - when the output buffer is full, *data points at the byte that still
 *     has to be decoded, so nothing is dropped when the caller resumes;
 *   - an invalid byte at the very end of the input is reported as
 *     consumed once its replacement has been written.
 */
#define LXB_ENCODING_DECODE_SINGLE(decode_map)                                 \
    do {                                                                       \
        const lxb_char_t *p = *data;                                           \
                                                                               \
        while (p < end) {                                                      \
            if (*p < 0x80) {                                                   \
                LXB_ENCODING_DECODE_APPEND_P(ctx, *p);                         \
            }                                                                  \
            else {                                                             \
                ctx->codepoint = decode_map[*p - 0x80].codepoint;              \
                                                                               \
                if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {          \
                    LXB_ENCODING_DECODE_ERROR_BEGIN {                          \
                        *data = p;                                             \
                    }                                                          \
                    LXB_ENCODING_DECODE_ERROR_END();                           \
                }                                                              \
                else {                                                         \
                    LXB_ENCODING_DECODE_APPEND_P(ctx, ctx->codepoint);         \
                }                                                              \
            }                                                                  \
                                                                               \
            *data = ++p;                                                       \
        }                                                                      \
    }                                                                          \
    while (0)
152
+
153
/*
 * Consume one UTF-8 continuation byte straight from *data.
 *
 * Expects `ch`, `needed`, `data`, `ctx` and a `failed` label in the
 * enclosing function.  A byte outside [lower, upper] jumps to `failed`
 * without being consumed; otherwise its six payload bits are appended to
 * ctx->codepoint.
 */
#define LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SINGLE(lower, upper)                \
    do {                                                                       \
        ch = **data;                                                           \
                                                                               \
        if (ch < lower || ch > upper) {                                        \
            goto failed;                                                       \
        }                                                                      \
                                                                               \
        ctx->codepoint = (ctx->codepoint << 6) | (ch & 0x3F);                  \
        needed--;                                                              \
        (*data)++;                                                             \
    }                                                                          \
    while (0)

/*
 * Same bound selection as LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SET (the two
 * bodies are identical), so simply forward to it.
 */
#define LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SET_SINGLE(first, two, f_lower,     \
                                                      s_upper)                 \
    LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SET(first, two, f_lower, s_upper)
180
+
181
+
182
+ lxb_status_t
183
+ lxb_encoding_decode_default(lxb_encoding_decode_t *ctx,
184
+ const lxb_char_t **data, const lxb_char_t *end)
185
+ {
186
+ return lxb_encoding_decode_utf_8(ctx, data, end);
187
+ }
188
+
189
+ lxb_status_t
190
+ lxb_encoding_decode_auto(lxb_encoding_decode_t *ctx,
191
+ const lxb_char_t **data, const lxb_char_t *end)
192
+ {
193
+ *data = end;
194
+ return LXB_STATUS_ERROR;
195
+ }
196
+
197
+ lxb_status_t
198
+ lxb_encoding_decode_undefined(lxb_encoding_decode_t *ctx,
199
+ const lxb_char_t **data, const lxb_char_t *end)
200
+ {
201
+ *data = end;
202
+ return LXB_STATUS_ERROR;
203
+ }
204
+
205
+ lxb_status_t
206
+ lxb_encoding_decode_big5(lxb_encoding_decode_t *ctx,
207
+ const lxb_char_t **data, const lxb_char_t *end)
208
+ {
209
+ uint32_t index;
210
+ lxb_char_t lead, byte;
211
+
212
+ ctx->status = LXB_STATUS_OK;
213
+
214
+ if (ctx->u.lead != 0x00) {
215
+ if (ctx->have_error) {
216
+ ctx->u.lead = 0x00;
217
+ ctx->have_error = false;
218
+
219
+ LXB_ENCODING_DECODE_ERROR_BEGIN {
220
+ ctx->u.lead = 0x01;
221
+ ctx->have_error = true;
222
+ } LXB_ENCODING_DECODE_ERROR_END();
223
+ }
224
+ else if (ctx->second_codepoint != 0x0000) {
225
+ if ((ctx->buffer_used + 2) > ctx->buffer_length) {
226
+ return LXB_STATUS_SMALL_BUFFER;
227
+ }
228
+
229
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->u.lead);
230
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->second_codepoint);
231
+
232
+ ctx->u.lead = 0x00;
233
+ ctx->second_codepoint = 0x0000;
234
+ }
235
+ else {
236
+ if (*data >= end) {
237
+ ctx->status = LXB_STATUS_CONTINUE;
238
+
239
+ return LXB_STATUS_CONTINUE;
240
+ }
241
+
242
+ LXB_ENCODING_DECODE_CHECK_OUT(ctx);
243
+
244
+ lead = (lxb_char_t) ctx->u.lead;
245
+ ctx->u.lead = 0x00;
246
+
247
+ goto lead_state;
248
+ }
249
+ }
250
+
251
+ while (*data < end) {
252
+ LXB_ENCODING_DECODE_CHECK_OUT(ctx);
253
+
254
+ lead = *(*data)++;
255
+
256
+ if (lead < 0x80) {
257
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, lead);
258
+ continue;
259
+ }
260
+
261
+ if ((unsigned) (lead - 0x81) > (0xFE - 0x81)) {
262
+ LXB_ENCODING_DECODE_ERROR_BEGIN {
263
+ (*data)--;
264
+ }
265
+ LXB_ENCODING_DECODE_ERROR_END();
266
+
267
+ continue;
268
+ }
269
+
270
+ if (*data >= end) {
271
+ ctx->u.lead = lead;
272
+ ctx->status = LXB_STATUS_CONTINUE;
273
+
274
+ return LXB_STATUS_CONTINUE;
275
+ }
276
+
277
+ lead_state:
278
+
279
+ index = 0;
280
+ byte = *(*data)++;
281
+
282
+ if ((unsigned) (byte - 0x40) <= (0x7E - 0x40)
283
+ || (unsigned) (byte - 0xA1) <= (0xFE - 0xA1))
284
+ {
285
+ if (byte < 0x7F) {
286
+ /* Max index == (0xFE - 0x81) * 157 + (0x7E - 0x62) == 19653 */
287
+ index = (lead - 0x81) * 157 + (byte - 0x40);
288
+ }
289
+ else {
290
+ /* Max index == (0xFE - 0x81) * 157 + (0xFE - 0x62) == 19781 */
291
+ index = (lead - 0x81) * 157 + (byte - 0x62);
292
+ }
293
+ }
294
+
295
+ /*
296
+ * 1133 U+00CA U+0304 Ê̄ (LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND MACRON)
297
+ * 1135 U+00CA U+030C Ê̌ (LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND CARON)
298
+ * 1164 U+00EA U+0304 ê̄ (LATIN SMALL LETTER E WITH CIRCUMFLEX AND MACRON)
299
+ * 1166 U+00EA U+030C ê̌ (LATIN SMALL LETTER E WITH CIRCUMFLEX AND CARON)
300
+ */
301
+ switch (index) {
302
+ case 1133:
303
+ if ((ctx->buffer_used + 2) > ctx->buffer_length) {
304
+ ctx->u.lead = 0x00CA;
305
+ ctx->second_codepoint = 0x0304;
306
+
307
+ return LXB_STATUS_SMALL_BUFFER;
308
+ }
309
+
310
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x00CA);
311
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x0304);
312
+
313
+ continue;
314
+
315
+ case 1135:
316
+ if ((ctx->buffer_used + 2) > ctx->buffer_length) {
317
+ ctx->u.lead = 0x00CA;
318
+ ctx->second_codepoint = 0x030C;
319
+
320
+ return LXB_STATUS_SMALL_BUFFER;
321
+ }
322
+
323
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x00CA);
324
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x030C);
325
+
326
+ continue;
327
+
328
+ case 1164:
329
+ if ((ctx->buffer_used + 2) > ctx->buffer_length) {
330
+ ctx->u.lead = 0x00EA;
331
+ ctx->second_codepoint = 0x0304;
332
+
333
+ return LXB_STATUS_SMALL_BUFFER;
334
+ }
335
+
336
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x00EA);
337
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x0304);
338
+
339
+ continue;
340
+
341
+ case 1166:
342
+ if ((ctx->buffer_used + 2) > ctx->buffer_length) {
343
+ ctx->u.lead = 0x00EA;
344
+ ctx->second_codepoint = 0x030C;
345
+
346
+ return LXB_STATUS_SMALL_BUFFER;
347
+ }
348
+
349
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x00EA);
350
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x030C);
351
+
352
+ continue;
353
+
354
+ case 0:
355
+ LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
356
+ continue;
357
+ }
358
+
359
+ ctx->codepoint = lxb_encoding_multi_index_big5[index].codepoint;
360
+ if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
361
+ LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
362
+ continue;
363
+ }
364
+
365
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
366
+ }
367
+
368
+ return LXB_STATUS_OK;
369
+ }
370
+
371
+ lxb_status_t
372
+ lxb_encoding_decode_euc_jp(lxb_encoding_decode_t *ctx,
373
+ const lxb_char_t **data, const lxb_char_t *end)
374
+ {
375
+ bool is_jis0212;
376
+ lxb_char_t byte, lead;
377
+
378
+ ctx->status = LXB_STATUS_OK;
379
+
380
+ if (ctx->u.euc_jp.lead != 0x00) {
381
+ if (ctx->have_error) {
382
+ ctx->have_error = false;
383
+ ctx->u.euc_jp.lead = 0x00;
384
+
385
+ LXB_ENCODING_DECODE_ERROR_BEGIN {
386
+ ctx->have_error = true;
387
+ ctx->u.euc_jp.lead = 0x01;
388
+ } LXB_ENCODING_DECODE_ERROR_END();
389
+ }
390
+ else {
391
+ if (*data >= end) {
392
+ ctx->status = LXB_STATUS_CONTINUE;
393
+
394
+ return LXB_STATUS_CONTINUE;
395
+ }
396
+
397
+ LXB_ENCODING_DECODE_CHECK_OUT(ctx);
398
+
399
+ lead = ctx->u.euc_jp.lead;
400
+ byte = *(*data)++;
401
+
402
+ ctx->u.euc_jp.lead = 0x00;
403
+
404
+ if (ctx->u.euc_jp.is_jis0212) {
405
+ is_jis0212 = true;
406
+ ctx->u.euc_jp.is_jis0212 = false;
407
+
408
+ goto lead_jis_state;
409
+ }
410
+
411
+ goto lead_state;
412
+ }
413
+ }
414
+
415
+ while (*data < end) {
416
+ LXB_ENCODING_DECODE_CHECK_OUT(ctx);
417
+
418
+ lead = *(*data)++;
419
+
420
+ if (lead < 0x80) {
421
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, lead);
422
+ continue;
423
+ }
424
+
425
+ if ((unsigned) (lead - 0xA1) > (0xFE - 0xA1)
426
+ && (lead != 0x8E && lead != 0x8F))
427
+ {
428
+ LXB_ENCODING_DECODE_ERROR_BEGIN {
429
+ (*data)--;
430
+ }
431
+ LXB_ENCODING_DECODE_ERROR_END();
432
+
433
+ continue;
434
+ }
435
+
436
+ if (*data >= end) {
437
+ ctx->u.euc_jp.lead = lead;
438
+ ctx->status = LXB_STATUS_CONTINUE;
439
+
440
+ return LXB_STATUS_CONTINUE;
441
+ }
442
+
443
+ byte = *(*data)++;
444
+
445
+ lead_state:
446
+
447
+ if (lead == 0x8E && (unsigned) (byte - 0xA1) <= (0xDF - 0xA1)) {
448
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0xFF61 - 0xA1 + byte);
449
+ continue;
450
+ }
451
+
452
+ is_jis0212 = false;
453
+
454
+ if (lead == 0x8F && (unsigned) (byte - 0xA1) <= (0xFE - 0xA1)) {
455
+ if (*data >= end) {
456
+ ctx->u.euc_jp.lead = byte;
457
+ ctx->u.euc_jp.is_jis0212 = true;
458
+
459
+ ctx->status = LXB_STATUS_CONTINUE;
460
+
461
+ return LXB_STATUS_CONTINUE;
462
+ }
463
+
464
+ lead = byte;
465
+ byte = *(*data)++;
466
+ is_jis0212 = true;
467
+ }
468
+
469
+ lead_jis_state:
470
+
471
+ if ((unsigned) (lead - 0xA1) > (0xFE - 0xA1)
472
+ || (unsigned) (byte - 0xA1) > (0xFE - 0xA1))
473
+ {
474
+ LXB_ENCODING_DECODE_FAILED(ctx->u.euc_jp.lead);
475
+ continue;
476
+ }
477
+
478
+ /* Max index == (0xFE - 0xA1) * 94 + 0xFE - 0xA1 == 8835 */
479
+ ctx->codepoint = (lead - 0xA1) * 94 + byte - 0xA1;
480
+
481
+ if (is_jis0212) {
482
+ if ((sizeof(lxb_encoding_multi_index_jis0212)
483
+ / sizeof(lxb_encoding_multi_index_t)) <= ctx->codepoint)
484
+ {
485
+ LXB_ENCODING_DECODE_FAILED(ctx->u.euc_jp.lead);
486
+ continue;
487
+ }
488
+
489
+ ctx->codepoint = lxb_encoding_multi_index_jis0212[ctx->codepoint].codepoint;
490
+ }
491
+ else {
492
+ if ((sizeof(lxb_encoding_multi_index_jis0208)
493
+ / sizeof(lxb_encoding_multi_index_t)) <= ctx->codepoint)
494
+ {
495
+ LXB_ENCODING_DECODE_FAILED(ctx->u.euc_jp.lead);
496
+ continue;
497
+ }
498
+
499
+ ctx->codepoint = lxb_encoding_multi_index_jis0208[ctx->codepoint].codepoint;
500
+ }
501
+
502
+ if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
503
+ LXB_ENCODING_DECODE_FAILED(ctx->u.euc_jp.lead);
504
+ continue;
505
+ }
506
+
507
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
508
+ }
509
+
510
+ return LXB_STATUS_OK;
511
+ }
512
+
513
+ lxb_status_t
514
+ lxb_encoding_decode_euc_kr(lxb_encoding_decode_t *ctx,
515
+ const lxb_char_t **data, const lxb_char_t *end)
516
+ {
517
+ lxb_char_t lead, byte;
518
+
519
+ ctx->status = LXB_STATUS_OK;
520
+
521
+ if (ctx->u.lead != 0x00) {
522
+ if (ctx->have_error) {
523
+ ctx->have_error = false;
524
+ ctx->u.lead = 0x00;
525
+
526
+ LXB_ENCODING_DECODE_ERROR_BEGIN {
527
+ ctx->have_error = true;
528
+ ctx->u.lead = 0x01;
529
+ } LXB_ENCODING_DECODE_ERROR_END();
530
+ }
531
+ else {
532
+ if (*data >= end) {
533
+ ctx->status = LXB_STATUS_CONTINUE;
534
+
535
+ return LXB_STATUS_CONTINUE;
536
+ }
537
+
538
+ LXB_ENCODING_DECODE_CHECK_OUT(ctx);
539
+
540
+ lead = (lxb_char_t) ctx->u.lead;
541
+ ctx->u.lead = 0x00;
542
+
543
+ goto lead_state;
544
+ }
545
+ }
546
+
547
+ while (*data < end) {
548
+ LXB_ENCODING_DECODE_CHECK_OUT(ctx);
549
+
550
+ lead = *(*data)++;
551
+
552
+ if (lead < 0x80) {
553
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, lead);
554
+ continue;
555
+ }
556
+
557
+ if ((unsigned) (lead - 0x81) > (0xFE - 0x81)) {
558
+ LXB_ENCODING_DECODE_ERROR_BEGIN {
559
+ (*data)--;
560
+ }
561
+ LXB_ENCODING_DECODE_ERROR_END();
562
+
563
+ continue;
564
+ }
565
+
566
+ if (*data == end) {
567
+ ctx->u.lead = lead;
568
+ ctx->status = LXB_STATUS_CONTINUE;
569
+
570
+ return LXB_STATUS_CONTINUE;
571
+ }
572
+
573
+ lead_state:
574
+
575
+ byte = *(*data)++;
576
+
577
+ if ((unsigned) (byte - 0x41) > (0xFE - 0x41)) {
578
+ LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
579
+ continue;
580
+ }
581
+
582
+ /* Max index == (0xFE - 0x81) * 190 + (0xFE - 0x41) == 23939 */
583
+ ctx->codepoint = (lead - 0x81) * 190 + (byte - 0x41);
584
+
585
+ if (ctx->codepoint >= sizeof(lxb_encoding_multi_index_euc_kr)
586
+ / sizeof(lxb_encoding_multi_index_t))
587
+ {
588
+ LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
589
+ continue;
590
+ }
591
+
592
+ ctx->codepoint = lxb_encoding_multi_index_euc_kr[ctx->codepoint].codepoint;
593
+ if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
594
+ LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
595
+ continue;
596
+ }
597
+
598
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
599
+ }
600
+
601
+ return LXB_STATUS_OK;
602
+ }
603
+
604
+ lxb_status_t
605
+ lxb_encoding_decode_gbk(lxb_encoding_decode_t *ctx,
606
+ const lxb_char_t **data, const lxb_char_t *end)
607
+ {
608
+ return lxb_encoding_decode_gb18030(ctx, data, end);
609
+ }
610
+
611
+ lxb_status_t
612
+ lxb_encoding_decode_ibm866(lxb_encoding_decode_t *ctx,
613
+ const lxb_char_t **data, const lxb_char_t *end)
614
+ {
615
+ LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_ibm866);
616
+
617
+ return LXB_STATUS_OK;
618
+ }
619
+
620
+ lxb_status_t
621
+ lxb_encoding_decode_iso_2022_jp(lxb_encoding_decode_t *ctx,
622
+ const lxb_char_t **data, const lxb_char_t *end)
623
+ {
624
+ #define LXB_ENCODING_DECODE_ISO_2022_JP_OK() \
625
+ do { \
626
+ if (*data >= end) { \
627
+ return LXB_STATUS_OK; \
628
+ } \
629
+ } \
630
+ while (0)
631
+
632
+ #define LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE() \
633
+ do { \
634
+ if (*data >= end) { \
635
+ ctx->status = LXB_STATUS_CONTINUE; \
636
+ return LXB_STATUS_CONTINUE; \
637
+ } \
638
+ } \
639
+ while (0)
640
+
641
+
642
+ lxb_char_t byte;
643
+ lxb_encoding_ctx_2022_jp_t *iso = &ctx->u.iso_2022_jp;
644
+
645
+ ctx->status = LXB_STATUS_OK;
646
+
647
+ if (ctx->have_error) {
648
+ ctx->have_error = false;
649
+
650
+ LXB_ENCODING_DECODE_ERROR_BEGIN {
651
+ ctx->have_error = true;
652
+ }
653
+ LXB_ENCODING_DECODE_ERROR_END();
654
+ }
655
+
656
+ if (iso->prepand != 0x00) {
657
+ if (*data >= end) {
658
+ ctx->status = LXB_STATUS_CONTINUE;
659
+
660
+ return LXB_STATUS_CONTINUE;
661
+ }
662
+
663
+ LXB_ENCODING_DECODE_CHECK_OUT(ctx);
664
+
665
+ byte = iso->prepand;
666
+ iso->prepand = 0x00;
667
+
668
+ goto prepand;
669
+ }
670
+
671
+ if (*data >= end) {
672
+ return LXB_STATUS_OK;
673
+ }
674
+
675
+ do {
676
+ LXB_ENCODING_DECODE_CHECK_OUT(ctx);
677
+
678
+ byte = *(*data)++;
679
+
680
+ prepand:
681
+
682
+ switch (iso->state) {
683
+ case LXB_ENCODING_DECODE_2022_JP_ASCII:
684
+ if (byte == 0x1B) {
685
+ iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
686
+
687
+ LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE();
688
+ break;
689
+ }
690
+
691
+ /* 0x00 to 0x7F, excluding 0x0E, 0x0F, and 0x1B */
692
+ if ((unsigned) (byte - 0x00) <= (0x7F - 0x00)
693
+ && byte != 0x0E && byte != 0x0F)
694
+ {
695
+ iso->out_flag = false;
696
+
697
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, byte);
698
+ LXB_ENCODING_DECODE_ISO_2022_JP_OK();
699
+ break;
700
+ }
701
+
702
+ iso->out_flag = false;
703
+
704
+ LXB_ENCODING_DECODE_ERROR_BEGIN {
705
+ ctx->have_error = true;
706
+ }
707
+ LXB_ENCODING_DECODE_ERROR_END();
708
+
709
+ LXB_ENCODING_DECODE_ISO_2022_JP_OK();
710
+ break;
711
+
712
+ case LXB_ENCODING_DECODE_2022_JP_ROMAN:
713
+ switch (byte) {
714
+ case 0x1B:
715
+ iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
716
+
717
+ LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE();
718
+ continue;
719
+
720
+ case 0x5C:
721
+ iso->out_flag = false;
722
+
723
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x00A5);
724
+ LXB_ENCODING_DECODE_ISO_2022_JP_OK();
725
+
726
+ continue;
727
+
728
+ case 0x7E:
729
+ iso->out_flag = false;
730
+
731
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x203E);
732
+ LXB_ENCODING_DECODE_ISO_2022_JP_OK();
733
+
734
+ continue;
735
+
736
+ case 0x0E:
737
+ case 0x0F:
738
+ break;
739
+
740
+ default:
741
+ /* 0x00 to 0x7F */
742
+ if ((unsigned) (byte - 0x00) <= (0x7F - 0x00)) {
743
+ iso->out_flag = false;
744
+
745
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, byte);
746
+ LXB_ENCODING_DECODE_ISO_2022_JP_OK();
747
+
748
+ continue;
749
+ }
750
+
751
+ break;
752
+ }
753
+
754
+ iso->out_flag = false;
755
+
756
+ LXB_ENCODING_DECODE_ERROR_BEGIN {
757
+ ctx->have_error = true;
758
+ }
759
+ LXB_ENCODING_DECODE_ERROR_END();
760
+
761
+ LXB_ENCODING_DECODE_ISO_2022_JP_OK();
762
+ break;
763
+
764
+ case LXB_ENCODING_DECODE_2022_JP_KATAKANA:
765
+ if (byte == 0x1B) {
766
+ iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
767
+
768
+ LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE();
769
+ break;
770
+ }
771
+
772
+ /* 0x21 to 0x5F */
773
+ if ((unsigned) (byte - 0x21) <= (0x5F - 0x21)) {
774
+ iso->out_flag = false;
775
+
776
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx,
777
+ 0xFF61 - 0x21 + byte);
778
+ LXB_ENCODING_DECODE_ISO_2022_JP_OK();
779
+ break;
780
+ }
781
+
782
+ iso->out_flag = false;
783
+
784
+ LXB_ENCODING_DECODE_ERROR_BEGIN {
785
+ ctx->have_error = true;
786
+ }
787
+ LXB_ENCODING_DECODE_ERROR_END();
788
+
789
+ LXB_ENCODING_DECODE_ISO_2022_JP_OK();
790
+ break;
791
+
792
+ case LXB_ENCODING_DECODE_2022_JP_LEAD:
793
+ if (byte == 0x1B) {
794
+ iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
795
+
796
+ LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE();
797
+ break;
798
+ }
799
+
800
+ /* 0x21 to 0x7E */
801
+ if ((unsigned) (byte - 0x21) <= (0x7E - 0x21)) {
802
+ iso->out_flag = false;
803
+ iso->lead = byte;
804
+ iso->state = LXB_ENCODING_DECODE_2022_JP_TRAIL;
805
+
806
+ LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE();
807
+ break;
808
+ }
809
+
810
+ iso->out_flag = false;
811
+
812
+ LXB_ENCODING_DECODE_ERROR_BEGIN {
813
+ ctx->have_error = true;
814
+ }
815
+ LXB_ENCODING_DECODE_ERROR_END();
816
+
817
+ LXB_ENCODING_DECODE_ISO_2022_JP_OK();
818
+ break;
819
+
820
+ case LXB_ENCODING_DECODE_2022_JP_TRAIL:
821
+ if (byte == 0x1B) {
822
+ iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
823
+
824
+ LXB_ENCODING_DECODE_ERROR_BEGIN {
825
+ ctx->have_error = true;
826
+ }
827
+ LXB_ENCODING_DECODE_ERROR_END();
828
+
829
+ LXB_ENCODING_DECODE_ISO_2022_JP_OK();
830
+ break;
831
+ }
832
+
833
+ iso->state = LXB_ENCODING_DECODE_2022_JP_LEAD;
834
+
835
+ /* 0x21 to 0x7E */
836
+ if ((unsigned) (byte - 0x21) <= (0x7E - 0x21)) {
837
+ /* Max index == (0x7E - 0x21) * 94 + 0x7E - 0x21 == 8835 */
838
+ ctx->codepoint = (iso->lead - 0x21) * 94 + byte - 0x21;
839
+
840
+ ctx->codepoint = lxb_encoding_multi_index_jis0208[ctx->codepoint].codepoint;
841
+
842
+ if (ctx->codepoint != LXB_ENCODING_ERROR_CODEPOINT) {
843
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
844
+ LXB_ENCODING_DECODE_ISO_2022_JP_OK();
845
+
846
+ break;
847
+ }
848
+ }
849
+
850
+ LXB_ENCODING_DECODE_ERROR_BEGIN {
851
+ iso->prepand = 0x01;
852
+ ctx->have_error = true;
853
+ }
854
+ LXB_ENCODING_DECODE_ERROR_END();
855
+
856
+ LXB_ENCODING_DECODE_ISO_2022_JP_OK();
857
+ break;
858
+
859
+ case LXB_ENCODING_DECODE_2022_JP_ESCAPE_START:
860
+ if (byte == 0x24 || byte == 0x28) {
861
+ iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE;
862
+ iso->lead = byte;
863
+
864
+ LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE();
865
+ break;
866
+ }
867
+
868
+ (*data)--;
869
+
870
+ iso->out_flag = false;
871
+ iso->state = ctx->u.iso_2022_jp.out_state;
872
+
873
+ LXB_ENCODING_DECODE_ERROR_BEGIN {
874
+ iso->prepand = 0x01;
875
+ ctx->have_error = true;
876
+ }
877
+ LXB_ENCODING_DECODE_ERROR_END();
878
+
879
+ break;
880
+
881
+ case LXB_ENCODING_DECODE_2022_JP_ESCAPE:
882
+ iso->state = LXB_ENCODING_DECODE_2022_JP_UNSET;
883
+
884
+ if (iso->lead == 0x28) {
885
+ if (byte == 0x42) {
886
+ iso->state = LXB_ENCODING_DECODE_2022_JP_ASCII;
887
+ }
888
+ else if (byte == 0x4A) {
889
+ iso->state = LXB_ENCODING_DECODE_2022_JP_ROMAN;
890
+ }
891
+ else if (byte == 0x49) {
892
+ iso->state = LXB_ENCODING_DECODE_2022_JP_KATAKANA;
893
+ }
894
+ }
895
+ else if (iso->lead == 0x24) {
896
+ if (byte == 0x40 || byte == 0x42) {
897
+ iso->state = LXB_ENCODING_DECODE_2022_JP_LEAD;
898
+ }
899
+ }
900
+
901
+ if (iso->state == LXB_ENCODING_DECODE_2022_JP_UNSET) {
902
+ (*data)--;
903
+
904
+ iso->out_flag = false;
905
+ iso->state = iso->out_state;
906
+
907
+ LXB_ENCODING_DECODE_ERROR_BEGIN {
908
+ iso->prepand = iso->lead;
909
+ iso->lead = 0x00;
910
+
911
+ ctx->have_error = true;
912
+ }
913
+ LXB_ENCODING_DECODE_ERROR_END();
914
+
915
+ byte = iso->lead;
916
+ iso->lead = 0x00;
917
+
918
+ goto prepand;
919
+ }
920
+
921
+ iso->lead = 0x00;
922
+ iso->out_state = iso->state;
923
+
924
+ if (iso->out_flag) {
925
+ LXB_ENCODING_DECODE_ERROR_BEGIN {
926
+ ctx->have_error = true;
927
+ }
928
+ LXB_ENCODING_DECODE_ERROR_END();
929
+
930
+ LXB_ENCODING_DECODE_ISO_2022_JP_OK();
931
+ break;
932
+ }
933
+
934
+ iso->out_flag = true;
935
+
936
+ LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE();
937
+ break;
938
+ }
939
+ }
940
+ while (true);
941
+
942
+ return LXB_STATUS_OK;
943
+
944
+ #undef LXB_ENCODING_DECODE_ISO_2022_JP_OK
945
+ #undef LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE
946
+ }
947
+
948
+ lxb_status_t
949
+ lxb_encoding_decode_iso_8859_10(lxb_encoding_decode_t *ctx,
950
+ const lxb_char_t **data, const lxb_char_t *end)
951
+ {
952
+ LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_10);
953
+
954
+ return LXB_STATUS_OK;
955
+ }
956
+
957
+ lxb_status_t
958
+ lxb_encoding_decode_iso_8859_13(lxb_encoding_decode_t *ctx,
959
+ const lxb_char_t **data, const lxb_char_t *end)
960
+ {
961
+ LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_13);
962
+
963
+ return LXB_STATUS_OK;
964
+ }
965
+
966
+ lxb_status_t
967
+ lxb_encoding_decode_iso_8859_14(lxb_encoding_decode_t *ctx,
968
+ const lxb_char_t **data, const lxb_char_t *end)
969
+ {
970
+ LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_14);
971
+
972
+ return LXB_STATUS_OK;
973
+ }
974
+
975
+ lxb_status_t
976
+ lxb_encoding_decode_iso_8859_15(lxb_encoding_decode_t *ctx,
977
+ const lxb_char_t **data, const lxb_char_t *end)
978
+ {
979
+ LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_15);
980
+
981
+ return LXB_STATUS_OK;
982
+ }
983
+
984
+ lxb_status_t
985
+ lxb_encoding_decode_iso_8859_16(lxb_encoding_decode_t *ctx,
986
+ const lxb_char_t **data, const lxb_char_t *end)
987
+ {
988
+ LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_16);
989
+
990
+ return LXB_STATUS_OK;
991
+ }
992
+
993
+ lxb_status_t
994
+ lxb_encoding_decode_iso_8859_2(lxb_encoding_decode_t *ctx,
995
+ const lxb_char_t **data, const lxb_char_t *end)
996
+ {
997
+ LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_2);
998
+
999
+ return LXB_STATUS_OK;
1000
+ }
1001
+
1002
+ lxb_status_t
1003
+ lxb_encoding_decode_iso_8859_3(lxb_encoding_decode_t *ctx,
1004
+ const lxb_char_t **data, const lxb_char_t *end)
1005
+ {
1006
+ LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_3);
1007
+
1008
+ return LXB_STATUS_OK;
1009
+ }
1010
+
1011
+ lxb_status_t
1012
+ lxb_encoding_decode_iso_8859_4(lxb_encoding_decode_t *ctx,
1013
+ const lxb_char_t **data, const lxb_char_t *end)
1014
+ {
1015
+ LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_4);
1016
+
1017
+ return LXB_STATUS_OK;
1018
+ }
1019
+
1020
+ lxb_status_t
1021
+ lxb_encoding_decode_iso_8859_5(lxb_encoding_decode_t *ctx,
1022
+ const lxb_char_t **data, const lxb_char_t *end)
1023
+ {
1024
+ LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_5);
1025
+
1026
+ return LXB_STATUS_OK;
1027
+ }
1028
+
1029
+ lxb_status_t
1030
+ lxb_encoding_decode_iso_8859_6(lxb_encoding_decode_t *ctx,
1031
+ const lxb_char_t **data, const lxb_char_t *end)
1032
+ {
1033
+ LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_6);
1034
+
1035
+ return LXB_STATUS_OK;
1036
+ }
1037
+
1038
+ lxb_status_t
1039
+ lxb_encoding_decode_iso_8859_7(lxb_encoding_decode_t *ctx,
1040
+ const lxb_char_t **data, const lxb_char_t *end)
1041
+ {
1042
+ LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_7);
1043
+
1044
+ return LXB_STATUS_OK;
1045
+ }
1046
+
1047
+ lxb_status_t
1048
+ lxb_encoding_decode_iso_8859_8(lxb_encoding_decode_t *ctx,
1049
+ const lxb_char_t **data, const lxb_char_t *end)
1050
+ {
1051
+ LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_8);
1052
+
1053
+ return LXB_STATUS_OK;
1054
+ }
1055
+
1056
+ lxb_status_t
1057
+ lxb_encoding_decode_iso_8859_8_i(lxb_encoding_decode_t *ctx,
1058
+ const lxb_char_t **data, const lxb_char_t *end)
1059
+ {
1060
+ LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_8);
1061
+
1062
+ return LXB_STATUS_OK;
1063
+ }
1064
+
1065
+ lxb_status_t
1066
+ lxb_encoding_decode_koi8_r(lxb_encoding_decode_t *ctx,
1067
+ const lxb_char_t **data, const lxb_char_t *end)
1068
+ {
1069
+ LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_koi8_r);
1070
+
1071
+ return LXB_STATUS_OK;
1072
+ }
1073
+
1074
+ lxb_status_t
1075
+ lxb_encoding_decode_koi8_u(lxb_encoding_decode_t *ctx,
1076
+ const lxb_char_t **data, const lxb_char_t *end)
1077
+ {
1078
+ LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_koi8_u);
1079
+
1080
+ return LXB_STATUS_OK;
1081
+ }
1082
+
1083
+ lxb_status_t
1084
+ lxb_encoding_decode_shift_jis(lxb_encoding_decode_t *ctx,
1085
+ const lxb_char_t **data, const lxb_char_t *end)
1086
+ {
1087
+ lxb_char_t byte, lead;
1088
+
1089
+ ctx->status = LXB_STATUS_OK;
1090
+
1091
+ if (ctx->u.lead != 0x00) {
1092
+ if (ctx->have_error) {
1093
+ ctx->have_error = false;
1094
+ ctx->u.lead = 0x00;
1095
+
1096
+ LXB_ENCODING_DECODE_ERROR_BEGIN {
1097
+ ctx->have_error = true;
1098
+ ctx->u.lead = 0x01;
1099
+ } LXB_ENCODING_DECODE_ERROR_END();
1100
+ }
1101
+ else {
1102
+ if (*data >= end) {
1103
+ ctx->status = LXB_STATUS_CONTINUE;
1104
+
1105
+ return LXB_STATUS_CONTINUE;
1106
+ }
1107
+
1108
+ LXB_ENCODING_DECODE_CHECK_OUT(ctx);
1109
+
1110
+ lead = (lxb_char_t) ctx->u.lead;
1111
+ ctx->u.lead = 0x00;
1112
+
1113
+ goto lead_state;
1114
+ }
1115
+ }
1116
+
1117
+ while (*data < end) {
1118
+ LXB_ENCODING_DECODE_CHECK_OUT(ctx);
1119
+
1120
+ lead = *(*data)++;
1121
+
1122
+ if (lead <= 0x80) {
1123
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, lead);
1124
+ continue;
1125
+ }
1126
+
1127
+ if ((unsigned) (lead - 0xA1) <= (0xDF - 0xA1)) {
1128
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0xFF61 - 0xA1 + lead);
1129
+ continue;
1130
+ }
1131
+
1132
+ if ((unsigned) (lead - 0x81) > (0x9F - 0x81)
1133
+ && lead != 0xE0 && lead != 0xFC)
1134
+ {
1135
+ LXB_ENCODING_DECODE_ERROR_BEGIN {
1136
+ (*data)--;
1137
+ }
1138
+ LXB_ENCODING_DECODE_ERROR_END();
1139
+
1140
+ continue;
1141
+ }
1142
+
1143
+ if (*data >= end) {
1144
+ ctx->u.lead = lead;
1145
+ ctx->status = LXB_STATUS_CONTINUE;
1146
+
1147
+ return LXB_STATUS_CONTINUE;
1148
+ }
1149
+
1150
+ lead_state:
1151
+
1152
+ byte = *(*data)++;
1153
+
1154
+ if (byte < 0x7F) {
1155
+ ctx->codepoint = 0x40;
1156
+ }
1157
+ else {
1158
+ ctx->codepoint = 0x41;
1159
+ }
1160
+
1161
+ if (lead < 0xA0) {
1162
+ ctx->second_codepoint = 0x81;
1163
+ }
1164
+ else {
1165
+ ctx->second_codepoint = 0xC1;
1166
+ }
1167
+
1168
+ if ((unsigned) (byte - 0x40) > (0x7E - 0x40)
1169
+ && (unsigned) (byte - 0x80) > (0xFC - 0x80))
1170
+ {
1171
+ LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
1172
+ continue;
1173
+ }
1174
+
1175
+ /* Max index == (0xFC - 0xC1) * 188 + 0xFC - 0x41 = 11279 */
1176
+ ctx->codepoint = (lead - ctx->second_codepoint) * 188
1177
+ + byte - ctx->codepoint;
1178
+
1179
+ if (ctx->codepoint >= (sizeof(lxb_encoding_multi_index_jis0208)
1180
+ / sizeof(lxb_encoding_multi_index_t)))
1181
+ {
1182
+ LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
1183
+ continue;
1184
+ }
1185
+
1186
+ if ((unsigned) (ctx->codepoint - 8836) <= (10715 - 8836)) {
1187
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0xE000 - 8836 + ctx->codepoint);
1188
+ continue;
1189
+ }
1190
+
1191
+ ctx->codepoint = lxb_encoding_multi_index_jis0208[ctx->codepoint].codepoint;
1192
+ if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
1193
+ LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
1194
+ continue;
1195
+ }
1196
+
1197
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
1198
+ }
1199
+
1200
+ return LXB_STATUS_OK;
1201
+ }
1202
+
1203
/*
 * Streaming UTF-16 decoder shared by the big-endian and little-endian
 * entry points.
 *
 * Consumes bytes from *data (advancing it) up to end, builds 16-bit units
 * in the byte order selected by is_be, and joins surrogate pairs into one
 * code point.
 *
 * State kept in ctx between calls:
 *   - u.lead:           a pending first byte of a unit, stored as byte + 1
 *                       so that 0 means "nothing pending";
 *   - second_codepoint: a pending high surrogate (0xD800..0xDBFF);
 *   - have_error:       set inside the error-macro blocks below.
 *
 * Returns LXB_STATUS_CONTINUE (also stored in ctx->status) when the input
 * ends in the middle of a unit or after a high surrogate, and LXB_STATUS_OK
 * once the loop has consumed everything.
 *
 * NOTE(review): the LXB_ENCODING_DECODE_* macros are defined outside this
 * chunk and may return from this function on their own (e.g. when the output
 * buffer is full) - confirm against their definitions.
 */
+ lxb_inline lxb_status_t
1204
+ lxb_encoding_decode_utf_16(lxb_encoding_decode_t *ctx, bool is_be,
1205
+ const lxb_char_t **data, const lxb_char_t *end)
1206
+ {
1207
+ unsigned lead;
1208
+ lxb_codepoint_t unit;
1209
+
1210
+ ctx->status = LXB_STATUS_OK;
1211
+
1212
/*
 * An error was left flagged by an earlier call. Clear the flag and run the
 * error macros again; their block re-sets the flag.
 * NOTE(review): presumably the block only runs when the error still cannot
 * be emitted - confirm against the macro definition.
 */
+ if (ctx->have_error) {
1213
+ ctx->have_error = false;
1214
+
1215
+ LXB_ENCODING_DECODE_ERROR_BEGIN {
1216
+ ctx->have_error = true;
1217
+ }
1218
+ LXB_ENCODING_DECODE_ERROR_END();
1219
+ }
1220
+
1221
/* Resume a unit whose first byte was saved by a previous call. */
+ if (ctx->u.lead != 0x00) {
1222
+ if (*data >= end) {
1223
+ ctx->status = LXB_STATUS_CONTINUE;
1224
+
1225
+ return LXB_STATUS_CONTINUE;
1226
+ }
1227
+
1228
+ LXB_ENCODING_DECODE_CHECK_OUT(ctx);
1229
+
1230
/* Undo the +1 bias applied when the byte was saved. */
+ lead = ctx->u.lead - 0x01;
1231
+ ctx->u.lead = 0x00;
1232
+
1233
+ goto lead_state;
1234
+ }
1235
+
1236
+ while (*data < end) {
1237
+ LXB_ENCODING_DECODE_CHECK_OUT(ctx);
1238
+
1239
+ pair_state:
1240
+
1241
+ lead = *(*data)++;
1242
+
1243
/* Only half a unit is available: save the byte (biased by +1) and stop. */
+ if (*data >= end) {
1244
+ ctx->u.lead = lead + 0x01;
1245
+ ctx->status = LXB_STATUS_CONTINUE;
1246
+
1247
+ return LXB_STATUS_CONTINUE;
1248
+ }
1249
+
1250
+ lead_state:
1251
+
1252
+ /* For UTF-16BE or UTF-16LE */
1253
+ if (is_be) {
1254
+ unit = (lead << 8) + *(*data)++;
1255
+ }
1256
+ else {
1257
+ unit = (*(*data)++ << 8) + lead;
1258
+ }
1259
+
1260
/* A high surrogate is pending: this unit must be a low surrogate. */
+ if (ctx->second_codepoint != 0x00) {
1261
+ if ((unsigned) (unit - 0xDC00) <= (0xDFFF - 0xDC00)) {
1262
+ ctx->codepoint = 0x10000 + ((ctx->second_codepoint - 0xD800) << 10)
1263
+ + (unit - 0xDC00);
1264
+
1265
+ ctx->second_codepoint = 0x00;
1266
+
1267
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
1268
+ continue;
1269
+ }
1270
+
1271
/*
 * Not a low surrogate: step back over the second byte, drop the pending
 * high surrogate, run the error macros, then decode the same unit again
 * from lead_state as an ordinary unit.
 */
+ (*data)--;
1272
+
1273
+ ctx->second_codepoint = 0x00;
1274
+
1275
+ LXB_ENCODING_DECODE_ERROR_BEGIN {
1276
+ ctx->have_error = true;
1277
+
1278
+ ctx->u.lead = lead + 0x01;
1279
+ }
1280
+ LXB_ENCODING_DECODE_ERROR_END();
1281
+
1282
+ goto lead_state;
1283
+ }
1284
+
1285
+ /* Surrogate pair */
1286
+ if ((unsigned) (unit - 0xD800) <= (0xDFFF - 0xD800)) {
1287
/* A low surrogate with no preceding high surrogate is an error. */
+ if ((unsigned) (unit - 0xDC00) <= (0xDFFF - 0xDC00)) {
1288
+ LXB_ENCODING_DECODE_ERROR_BEGIN {
1289
+ ctx->have_error = true;
1290
+ }
1291
+ LXB_ENCODING_DECODE_ERROR_END();
1292
+
1293
+ continue;
1294
+ }
1295
+
1296
/* High surrogate: remember it and go read the next unit. */
+ ctx->second_codepoint = unit;
1297
+
1298
+ if (*data >= end) {
1299
+ ctx->status = LXB_STATUS_CONTINUE;
1300
+
1301
+ return LXB_STATUS_CONTINUE;
1302
+ }
1303
+
1304
+ goto pair_state;
1305
+ }
1306
+
1307
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, unit);
1308
+ }
1309
+
1310
+ return LXB_STATUS_OK;
1311
+ }
1312
+
1313
/*
 * UTF-16BE entry point: forwards to lxb_encoding_decode_utf_16() with
 * is_be set to true and returns its status unchanged.
 */
+ lxb_status_t
1314
+ lxb_encoding_decode_utf_16be(lxb_encoding_decode_t *ctx,
1315
+ const lxb_char_t **data, const lxb_char_t *end)
1316
+ {
1317
+ return lxb_encoding_decode_utf_16(ctx, true, data, end);
1318
+ }
1319
+
1320
/*
 * UTF-16LE entry point: forwards to lxb_encoding_decode_utf_16() with
 * is_be set to false and returns its status unchanged.
 */
+ lxb_status_t
1321
+ lxb_encoding_decode_utf_16le(lxb_encoding_decode_t *ctx,
1322
+ const lxb_char_t **data, const lxb_char_t *end)
1323
+ {
1324
+ return lxb_encoding_decode_utf_16(ctx, false, data, end);
1325
+ }
1326
+
1327
/*
 * Streaming UTF-8 decoder.
 *
 * Walks a local cursor p from *data towards end and writes p back to *data
 * at the exits below. `need` is the number of continuation bytes still
 * expected for the current sequence; ctx->codepoint accumulates its bits.
 *
 * Lead-byte classes handled in the main loop:
 *   < 0x80        single byte, appended as is;
 *   0x80..0xC1    error;
 *   0xC2..0xDF    one continuation byte;
 *   0xE0..0xEF    two continuation bytes;
 *   0xF0..0xF4    three continuation bytes;
 *   >= 0xF5       error.
 * For 0xE0, 0xED, 0xF0 and 0xF4 the first continuation byte is additionally
 * range-checked through LXB_ENCODING_DECODE_UTF_8_BOUNDARY.
 *
 * Returns LXB_STATUS_SMALL_BUFFER when ctx->buffer_used has reached
 * ctx->buffer_length, LXB_STATUS_CONTINUE (with ctx->u.utf_8.need saved) when
 * the input ends inside a sequence, and LXB_STATUS_OK otherwise.
 *
 * NOTE(review): the LXB_ENCODING_DECODE_* macros are defined outside this
 * chunk; they may return early and may read or modify the locals p, ch and
 * need - confirm before renaming or reordering anything here.
 */
+ lxb_status_t
1328
+ lxb_encoding_decode_utf_8(lxb_encoding_decode_t *ctx,
1329
+ const lxb_char_t **data, const lxb_char_t *end)
1330
+ {
1331
+ unsigned need;
1332
+ lxb_char_t ch;
1333
+ const lxb_char_t *p = *data;
1334
+
1335
+ ctx->status = LXB_STATUS_OK;
1336
+
1337
/* Same pending-error handling as in the other streaming decoders. */
+ if (ctx->have_error) {
1338
+ ctx->have_error = false;
1339
+
1340
+ LXB_ENCODING_DECODE_ERROR_BEGIN {
1341
+ ctx->have_error = true;
1342
+ }
1343
+ LXB_ENCODING_DECODE_ERROR_END();
1344
+ }
1345
+
1346
/* Resume a multi-byte sequence that was cut off by the end of input. */
+ if (ctx->u.utf_8.need != 0) {
1347
+ if (p >= end) {
1348
+ ctx->status = LXB_STATUS_CONTINUE;
1349
+
1350
+ return LXB_STATUS_CONTINUE;
1351
+ }
1352
+
1353
+ LXB_ENCODING_DECODE_CHECK_OUT(ctx);
1354
+
1355
+ need = ctx->u.utf_8.need;
1356
+ ctx->u.utf_8.need = 0;
1357
+
1358
/* A saved lower/upper pair means the boundary check is still owed. */
+ if (ctx->u.utf_8.lower != 0x00) {
1359
+ LXB_ENCODING_DECODE_UTF_8_BOUNDARY(ctx->u.utf_8.lower,
1360
+ ctx->u.utf_8.upper, goto begin);
1361
+ ctx->u.utf_8.lower = 0x00;
1362
+ }
1363
+
1364
+ goto decode;
1365
+ }
1366
+
1367
+ begin:
1368
+
1369
+ while (p < end) {
1370
/* No room for another code point: report how far we got and stop. */
+ if (ctx->buffer_used >= ctx->buffer_length) {
1371
+ *data = p;
1372
+
1373
+ return LXB_STATUS_SMALL_BUFFER;
1374
+ }
1375
+
1376
+ ch = *p++;
1377
+
1378
+ if (ch < 0x80) {
1379
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ch);
1380
+ continue;
1381
+ }
1382
+ else if (ch <= 0xDF) {
1383
/* 0x80..0xC1 cannot start a sequence. */
+ if (ch < 0xC2) {
1384
+ LXB_ENCODING_DECODE_ERROR_BEGIN {
1385
+ *data = p - 1;
1386
+ }
1387
+ LXB_ENCODING_DECODE_ERROR_END();
1388
+
1389
+ continue;
1390
+ }
1391
+
1392
+ need = 1;
1393
+ ctx->codepoint = ch & 0x1F;
1394
+ }
1395
+ else if (ch < 0xF0) {
1396
+ need = 2;
1397
+ ctx->codepoint = ch & 0x0F;
1398
+
1399
/*
 * Input ends right after the lead byte: record the boundary for the next
 * call (via the BOUNDARY_SET macro) and ask for more data.
 */
+ if (p == end) {
1400
+ LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SET(0xE0, 0xED, 0xA0, 0x9F);
1401
+
1402
+ *data = p;
1403
+
1404
+ ctx->u.utf_8.need = need;
1405
+ ctx->status = LXB_STATUS_CONTINUE;
1406
+
1407
+ return LXB_STATUS_CONTINUE;
1408
+ }
1409
+
1410
/* After 0xE0 the next byte must be 0xA0..0xBF; after 0xED, 0x80..0x9F. */
+ if (ch == 0xE0) {
1411
+ LXB_ENCODING_DECODE_UTF_8_BOUNDARY(0xA0, 0xBF, continue);
1412
+ }
1413
+ else if (ch == 0xED) {
1414
+ LXB_ENCODING_DECODE_UTF_8_BOUNDARY(0x80, 0x9F, continue);
1415
+ }
1416
+ }
1417
+ else if (ch < 0xF5) {
1418
+ need = 3;
1419
+ ctx->codepoint = ch & 0x07;
1420
+
1421
+ if (p == end) {
1422
+ LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SET(0xF0, 0xF4, 0x90, 0x8F);
1423
+
1424
+ *data = p;
1425
+
1426
+ ctx->u.utf_8.need = need;
1427
+ ctx->status = LXB_STATUS_CONTINUE;
1428
+
1429
+ return LXB_STATUS_CONTINUE;
1430
+ }
1431
+
1432
/* After 0xF0 the next byte must be 0x90..0xBF; after 0xF4, 0x80..0x8F. */
+ if (ch == 0xF0) {
1433
+ LXB_ENCODING_DECODE_UTF_8_BOUNDARY(0x90, 0xBF, continue);
1434
+ }
1435
+ else if (ch == 0xF4) {
1436
+ LXB_ENCODING_DECODE_UTF_8_BOUNDARY(0x80, 0x8F, continue);
1437
+ }
1438
+ }
1439
+ else {
1440
+ LXB_ENCODING_DECODE_ERROR_BEGIN {
1441
+ *data = p - 1;
1442
+ }
1443
+ LXB_ENCODING_DECODE_ERROR_END();
1444
+
1445
+ continue;
1446
+ }
1447
+
1448
+ decode:
1449
+
1450
/* Consume continuation bytes until `need` drops to zero. */
+ do {
1451
+ if (p >= end) {
1452
+ *data = p;
1453
+
1454
+ ctx->u.utf_8.need = need;
1455
+ ctx->status = LXB_STATUS_CONTINUE;
1456
+
1457
+ return LXB_STATUS_CONTINUE;
1458
+ }
1459
+
1460
+ ch = *p++;
1461
+
1462
/*
 * Not a continuation byte (0x80..0xBF): un-read it so the outer loop
 * treats it as a fresh lead byte, and abandon the current sequence.
 */
+ if (ch < 0x80 || ch > 0xBF) {
1463
+ p--;
1464
+
1465
+ ctx->u.utf_8.need = 0;
1466
+
1467
+ LXB_ENCODING_DECODE_ERROR_BEGIN {
1468
+ *data = p;
1469
+ ctx->have_error = true;
1470
+ }
1471
+ LXB_ENCODING_DECODE_ERROR_END();
1472
+
1473
+ break;
1474
+ }
1475
+
1476
+ ctx->codepoint = (ctx->codepoint << 6) | (ch & 0x3F);
1477
+
1478
+ if (--need == 0) {
1479
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
1480
+
1481
+ break;
1482
+ }
1483
+ }
1484
+ while (true);
1485
+ }
1486
+
1487
+ *data = p;
1488
+
1489
+ return LXB_STATUS_OK;
1490
+ }
1491
+
1492
+ lxb_inline lxb_codepoint_t
1493
+ lxb_encoding_decode_gb18030_range(uint32_t index)
1494
+ {
1495
+ size_t mid, left, right;
1496
+ const lxb_encoding_range_index_t *range;
1497
+
1498
+ /*
1499
+ * Pointer greater than 39419 and less than 189000,
1500
+ * or pointer is greater than 1237575
1501
+ */
1502
+ if ((unsigned) (index - 39419) < (189000 - 39419)
1503
+ || index > 1237575)
1504
+ {
1505
+ return LXB_ENCODING_ERROR_CODEPOINT;
1506
+ }
1507
+
1508
+ if (index == 7457) {
1509
+ return 0xE7C7;
1510
+ }
1511
+
1512
+ left = 0;
1513
+ right = LXB_ENCODING_RANGE_INDEX_GB18030_SIZE;
1514
+ range = lxb_encoding_range_index_gb18030;
1515
+
1516
+ /* Some compilers say about uninitialized mid */
1517
+ mid = 0;
1518
+
1519
+ while (left < right) {
1520
+ mid = left + (right - left) / 2;
1521
+
1522
+ if (range[mid].index < index) {
1523
+ left = mid + 1;
1524
+
1525
+ if (left < right && range[ left ].index > index) {
1526
+ break;
1527
+ }
1528
+ }
1529
+ else if (range[mid].index > index) {
1530
+ right = mid - 1;
1531
+
1532
+ if (right > 0 && range[right].index <= index) {
1533
+ mid = right;
1534
+ break;
1535
+ }
1536
+ }
1537
+ else {
1538
+ break;
1539
+ }
1540
+ }
1541
+
1542
+ return range[mid].codepoint + index - range[mid].index;
1543
+ }
1544
+
1545
/*
 * Streaming gb18030 decoder.
 *
 * Consumes bytes from *data (advancing it) up to end. A sequence is one byte
 * (below 0x80, or 0x80 which maps to U+20AC), two bytes (looked up in
 * lxb_encoding_multi_index_gb18030) or four bytes (resolved through
 * lxb_encoding_decode_gb18030_range()).
 *
 * State kept in ctx between calls:
 *   - u.gb18030.first/second/third: bytes of a sequence cut off by the end
 *     of input, or bytes that still have to be re-processed after an error;
 *   - prepend: the saved byte(s) must be fed back in as if freshly read;
 *   - have_error: set inside the error-macro blocks.
 *
 * Returns LXB_STATUS_CONTINUE when the input ends inside a sequence,
 * LXB_STATUS_SMALL_BUFFER where shown below, and LXB_STATUS_OK after the
 * loop.
 *
 * NOTE(review): the LXB_ENCODING_DECODE_* macros are defined outside this
 * chunk and may return early on their own - confirm against their
 * definitions.
 */
+ lxb_status_t
1546
+ lxb_encoding_decode_gb18030(lxb_encoding_decode_t *ctx,
1547
+ const lxb_char_t **data, const lxb_char_t *end)
1548
+ {
1549
+ uint32_t pointer;
1550
+ lxb_char_t first, second, third, offset;
1551
+
1552
+ /* Make compiler happy */
1553
+ second = 0x00;
1554
+
1555
+ ctx->status = LXB_STATUS_OK;
1556
+
1557
+ if (ctx->have_error) {
1558
+ ctx->have_error = false;
1559
+
1560
+ LXB_ENCODING_DECODE_ERROR_BEGIN {
1561
+ ctx->have_error = true;
1562
+ }
1563
+ LXB_ENCODING_DECODE_ERROR_END();
1564
+ }
1565
+
1566
/* Saved state from a previous call: pick the matching re-entry point. */
+ if (ctx->u.gb18030.first != 0) {
1567
+ if (*data >= end) {
1568
+ ctx->status = LXB_STATUS_CONTINUE;
1569
+
1570
+ return LXB_STATUS_CONTINUE;
1571
+ }
1572
+
1573
+ LXB_ENCODING_DECODE_CHECK_OUT(ctx);
1574
+
1575
/* Three bytes were saved. */
+ if (ctx->u.gb18030.third != 0x00) {
1576
+ first = ctx->u.gb18030.first;
1577
+ second = ctx->u.gb18030.second;
1578
+ third = ctx->u.gb18030.third;
1579
+
1580
+ memset(&ctx->u.gb18030, 0, sizeof(lxb_encoding_ctx_gb18030_t));
1581
+
1582
/*
 * Error recovery in progress: emit the saved second byte as is, then
 * re-process the saved third byte as a new first byte.
 */
+ if (ctx->prepend) {
1583
+ /* The first is always < 0x80 */
1584
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, second);
1585
+
1586
+ if (ctx->buffer_used == ctx->buffer_length) {
1587
+ ctx->u.gb18030.first = third;
1588
+
1589
+ return LXB_STATUS_SMALL_BUFFER;
1590
+ }
1591
+
1592
+ first = third;
1593
+ ctx->prepend = false;
1594
+
1595
+ goto prepend_first;
1596
+ }
1597
+
1598
+ goto third_state;
1599
+ }
1600
/* Two bytes were saved: continue with the third byte. */
+ else if (ctx->u.gb18030.second != 0x00) {
1601
+ first = ctx->u.gb18030.first;
1602
+ second = ctx->u.gb18030.second;
1603
+
1604
+ memset(&ctx->u.gb18030, 0, sizeof(lxb_encoding_ctx_gb18030_t));
1605
+
1606
+ goto second_state;
1607
+ }
1608
+
1609
/* Only the first byte was saved. */
+ first = ctx->u.gb18030.first;
1610
+ ctx->u.gb18030.first = 0x00;
1611
+
1612
+ if (ctx->prepend) {
1613
+ ctx->prepend = false;
1614
+ goto prepend_first;
1615
+ }
1616
+
1617
+ goto first_state;
1618
+ }
1619
+
1620
+ while (*data < end) {
1621
+ LXB_ENCODING_DECODE_CHECK_OUT(ctx);
1622
+
1623
+ first = *(*data)++;
1624
+
1625
+ prepend_first:
1626
+
1627
+ if (first < 0x80) {
1628
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, first);
1629
+ continue;
1630
+ }
1631
+
1632
/* Byte 0x80 decodes to U+20AC. */
+ if (first == 0x80) {
1633
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x20AC);
1634
+ continue;
1635
+ }
1636
+
1637
+ /* Range 0x81 to 0xFE, inclusive */
1638
+ if ((unsigned) (first - 0x81) > (0xFE - 0x81)) {
1639
+ LXB_ENCODING_DECODE_ERROR_BEGIN {
1640
+ (*data)--;
1641
+ }
1642
+ LXB_ENCODING_DECODE_ERROR_END();
1643
+
1644
+ continue;
1645
+ }
1646
+
1647
+ if (*data == end) {
1648
+ ctx->u.gb18030.first = first;
1649
+ ctx->status = LXB_STATUS_CONTINUE;
1650
+
1651
+ return LXB_STATUS_CONTINUE;
1652
+ }
1653
+
1654
+ /* First */
1655
+ first_state:
1656
+
1657
+ second = *(*data)++;
1658
+
1659
/* A second byte outside 0x30..0x39 means a two-byte sequence. */
+ /* Range 0x30 to 0x39, inclusive */
1660
+ if ((unsigned) (second - 0x30) > (0x39 - 0x30)) {
1661
+ offset = (second < 0x7F) ? 0x40 : 0x41;
1662
+
1663
+ /* Range 0x40 to 0x7E, inclusive, or 0x80 to 0xFE, inclusive */
1664
+ if ((unsigned) (second - 0x40) <= (0x7E - 0x40)
1665
+ || (unsigned) (second - 0x80) <= (0xFE - 0x80))
1666
+ {
1667
+ pointer = (first - 0x81) * 190 + (second - offset);
1668
+ }
1669
+ else {
1670
/* Invalid second byte: an ASCII byte is un-read so it is decoded next. */
+ if (second < 0x80) {
1671
+ (*data)--;
1672
+ }
1673
+
1674
+ LXB_ENCODING_DECODE_ERROR_BEGIN {
1675
+ ctx->have_error = true;
1676
+ }
1677
+ LXB_ENCODING_DECODE_ERROR_END();
1678
+
1679
+ continue;
1680
+ }
1681
+
1682
+ /* Max pointer value == (0xFE - 0x81) * 190 + (0xFE - 0x41) == 23939 */
1683
+ ctx->codepoint = lxb_encoding_multi_index_gb18030[pointer].codepoint;
1684
+ if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
1685
+ if (second < 0x80) {
1686
+ (*data)--;
1687
+ }
1688
+
1689
+ LXB_ENCODING_DECODE_ERROR_BEGIN {
1690
+ ctx->have_error = true;
1691
+ }
1692
+ LXB_ENCODING_DECODE_ERROR_END();
1693
+
1694
+ continue;
1695
+ }
1696
+
1697
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
1698
+ continue;
1699
+ }
1700
+
1701
/* Second byte is 0x30..0x39: this is a four-byte sequence. */
+ if (*data == end) {
1702
+ ctx->u.gb18030.first = first;
1703
+ ctx->u.gb18030.second = second;
1704
+
1705
+ ctx->status = LXB_STATUS_CONTINUE;
1706
+
1707
+ return LXB_STATUS_CONTINUE;
1708
+ }
1709
+
1710
+ /* Second */
1711
+ second_state:
1712
+
1713
+ third = *(*data)++;
1714
+
1715
+ /* Range 0x81 to 0xFE, inclusive */
1716
+ if ((unsigned) (third - 0x81) > (0xFE - 0x81)) {
1717
/*
 * Invalid third byte: un-read it, run the error macros, then re-process
 * the second byte as a new first byte.
 */
+ (*data)--;
1718
+
1719
+ LXB_ENCODING_DECODE_ERROR_BEGIN {
1720
+ ctx->prepend = true;
1721
+ ctx->have_error = true;
1722
+ ctx->u.gb18030.first = second;
1723
+ }
1724
+ LXB_ENCODING_DECODE_ERROR_END();
1725
+
1726
+ first = second;
1727
+
1728
+ goto prepend_first;
1729
+ }
1730
+
1731
+ if (*data == end) {
1732
+ ctx->u.gb18030.first = first;
1733
+ ctx->u.gb18030.second = second;
1734
+ ctx->u.gb18030.third = third;
1735
+
1736
+ ctx->status = LXB_STATUS_CONTINUE;
1737
+
1738
+ return LXB_STATUS_CONTINUE;
1739
+ }
1740
+
1741
+ /* Third */
1742
+ third_state:
1743
+
1744
/* The fourth byte is only peeked here; it is consumed further below. */
+ /* Range 0x30 to 0x39, inclusive */
1745
+ if ((unsigned) (**data - 0x30) > (0x39 - 0x30)) {
1746
+ ctx->prepend = true;
1747
+
1748
+ LXB_ENCODING_DECODE_ERROR_BEGIN {
1749
+ ctx->prepend = true;
1750
+ ctx->have_error = true;
1751
+
1752
+ /* First is a fake for trigger */
1753
+ ctx->u.gb18030.first = 0x01;
1754
+ ctx->u.gb18030.second = second;
1755
+ ctx->u.gb18030.third = third;
1756
+ }
1757
+ LXB_ENCODING_DECODE_ERROR_END();
1758
+
1759
/* The second byte (0x30..0x39) is emitted as is. */
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, second);
1760
+
1761
+ if (ctx->buffer_used == ctx->buffer_length) {
1762
+ ctx->prepend = true;
1763
+ ctx->have_error = true;
1764
+
1765
+ /* First is a fake for trigger */
1766
+ ctx->u.gb18030.first = 0x01;
1767
+ ctx->u.gb18030.second = second;
1768
+ ctx->u.gb18030.third = third;
1769
+
1770
+ return LXB_STATUS_SMALL_BUFFER;
1771
+ }
1772
+
1773
+ first = third;
1774
+
1775
+ goto prepend_first;
1776
+ }
1777
+
1778
/* Four-byte pointer; this expression also consumes the fourth byte. */
+ pointer = ((first - 0x81) * (10 * 126 * 10))
1779
+ + ((second - 0x30) * (10 * 126))
1780
+ + ((third - 0x81) * 10) + (*(*data)++) - 0x30;
1781
+
1782
+ ctx->codepoint = lxb_encoding_decode_gb18030_range(pointer);
1783
+
1784
+ if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
1785
+ LXB_ENCODING_DECODE_ERROR_BEGIN {}
1786
+ LXB_ENCODING_DECODE_ERROR_END();
1787
+
1788
+ continue;
1789
+ }
1790
+
1791
+ LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
1792
+ }
1793
+
1794
+ return LXB_STATUS_OK;
1795
+ }
1796
+
1797
/*
 * macintosh decoder: expands LXB_ENCODING_DECODE_SINGLE with the
 * lxb_encoding_single_index_macintosh table and returns LXB_STATUS_OK when
 * the macro body falls through.
 * NOTE(review): the macro is defined outside this chunk and may return
 * early on its own - confirm.
 */
+ lxb_status_t
1798
+ lxb_encoding_decode_macintosh(lxb_encoding_decode_t *ctx,
1799
+ const lxb_char_t **data, const lxb_char_t *end)
1800
+ {
1801
+ LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_macintosh);
1802
+
1803
+ return LXB_STATUS_OK;
1804
+ }
1805
+
1806
/*
 * replacement decoder: skips all remaining input by moving *data to end
 * and always returns LXB_STATUS_ERROR. ctx is not used.
 */
+ lxb_status_t
1807
+ lxb_encoding_decode_replacement(lxb_encoding_decode_t *ctx,
1808
+ const lxb_char_t **data, const lxb_char_t *end)
1809
+ {
1810
+ *data = end;
1811
+ return LXB_STATUS_ERROR;
1812
+ }
1813
+
1814
/*
 * windows-1250 decoder: expands LXB_ENCODING_DECODE_SINGLE with the
 * lxb_encoding_single_index_windows_1250 table and returns LXB_STATUS_OK
 * when the macro body falls through.
 * NOTE(review): the macro is defined outside this chunk and may return
 * early on its own - confirm.
 */
+ lxb_status_t
1815
+ lxb_encoding_decode_windows_1250(lxb_encoding_decode_t *ctx,
1816
+ const lxb_char_t **data, const lxb_char_t *end)
1817
+ {
1818
+ LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1250);
1819
+
1820
+ return LXB_STATUS_OK;
1821
+ }
1822
+
1823
/*
 * windows-1251 decoder: expands LXB_ENCODING_DECODE_SINGLE with the
 * lxb_encoding_single_index_windows_1251 table and returns LXB_STATUS_OK
 * when the macro body falls through.
 * NOTE(review): the macro is defined outside this chunk and may return
 * early on its own - confirm.
 */
+ lxb_status_t
1824
+ lxb_encoding_decode_windows_1251(lxb_encoding_decode_t *ctx,
1825
+ const lxb_char_t **data, const lxb_char_t *end)
1826
+ {
1827
+ LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1251);
1828
+
1829
+ return LXB_STATUS_OK;
1830
+ }
1831
+
1832
/*
 * windows-1252 decoder: expands LXB_ENCODING_DECODE_SINGLE with the
 * lxb_encoding_single_index_windows_1252 table and returns LXB_STATUS_OK
 * when the macro body falls through.
 * NOTE(review): the macro is defined outside this chunk and may return
 * early on its own - confirm.
 */
+ lxb_status_t
1833
+ lxb_encoding_decode_windows_1252(lxb_encoding_decode_t *ctx,
1834
+ const lxb_char_t **data, const lxb_char_t *end)
1835
+ {
1836
+ LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1252);
1837
+
1838
+ return LXB_STATUS_OK;
1839
+ }
1840
+
1841
/*
 * windows-1253 decoder: expands LXB_ENCODING_DECODE_SINGLE with the
 * lxb_encoding_single_index_windows_1253 table and returns LXB_STATUS_OK
 * when the macro body falls through.
 * NOTE(review): the macro is defined outside this chunk and may return
 * early on its own - confirm.
 */
+ lxb_status_t
1842
+ lxb_encoding_decode_windows_1253(lxb_encoding_decode_t *ctx,
1843
+ const lxb_char_t **data, const lxb_char_t *end)
1844
+ {
1845
+ LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1253);
1846
+
1847
+ return LXB_STATUS_OK;
1848
+ }
1849
+
1850
/*
 * windows-1254 decoder: expands LXB_ENCODING_DECODE_SINGLE with the
 * lxb_encoding_single_index_windows_1254 table and returns LXB_STATUS_OK
 * when the macro body falls through.
 * NOTE(review): the macro is defined outside this chunk and may return
 * early on its own - confirm.
 */
+ lxb_status_t
1851
+ lxb_encoding_decode_windows_1254(lxb_encoding_decode_t *ctx,
1852
+ const lxb_char_t **data, const lxb_char_t *end)
1853
+ {
1854
+ LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1254);
1855
+
1856
+ return LXB_STATUS_OK;
1857
+ }
1858
+
1859
/*
 * windows-1255 decoder: expands LXB_ENCODING_DECODE_SINGLE with the
 * lxb_encoding_single_index_windows_1255 table and returns LXB_STATUS_OK
 * when the macro body falls through.
 * NOTE(review): the macro is defined outside this chunk and may return
 * early on its own - confirm.
 */
+ lxb_status_t
1860
+ lxb_encoding_decode_windows_1255(lxb_encoding_decode_t *ctx,
1861
+ const lxb_char_t **data, const lxb_char_t *end)
1862
+ {
1863
+ LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1255);
1864
+
1865
+ return LXB_STATUS_OK;
1866
+ }
1867
+
1868
/*
 * windows-1256 decoder: expands LXB_ENCODING_DECODE_SINGLE with the
 * lxb_encoding_single_index_windows_1256 table and returns LXB_STATUS_OK
 * when the macro body falls through.
 * NOTE(review): the macro is defined outside this chunk and may return
 * early on its own - confirm.
 */
+ lxb_status_t
1869
+ lxb_encoding_decode_windows_1256(lxb_encoding_decode_t *ctx,
1870
+ const lxb_char_t **data, const lxb_char_t *end)
1871
+ {
1872
+ LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1256);
1873
+
1874
+ return LXB_STATUS_OK;
1875
+ }
1876
+
1877
/*
 * windows-1257 decoder: expands LXB_ENCODING_DECODE_SINGLE with the
 * lxb_encoding_single_index_windows_1257 table and returns LXB_STATUS_OK
 * when the macro body falls through.
 * NOTE(review): the macro is defined outside this chunk and may return
 * early on its own - confirm.
 */
+ lxb_status_t
1878
+ lxb_encoding_decode_windows_1257(lxb_encoding_decode_t *ctx,
1879
+ const lxb_char_t **data, const lxb_char_t *end)
1880
+ {
1881
+ LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1257);
1882
+
1883
+ return LXB_STATUS_OK;
1884
+ }
1885
+
1886
/*
 * windows-1258 decoder: expands LXB_ENCODING_DECODE_SINGLE with the
 * lxb_encoding_single_index_windows_1258 table and returns LXB_STATUS_OK
 * when the macro body falls through.
 * NOTE(review): the macro is defined outside this chunk and may return
 * early on its own - confirm.
 */
+ lxb_status_t
1887
+ lxb_encoding_decode_windows_1258(lxb_encoding_decode_t *ctx,
1888
+ const lxb_char_t **data, const lxb_char_t *end)
1889
+ {
1890
+ LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1258);
1891
+
1892
+ return LXB_STATUS_OK;
1893
+ }
1894
+
1895
/*
 * windows-874 decoder: expands LXB_ENCODING_DECODE_SINGLE with the
 * lxb_encoding_single_index_windows_874 table and returns LXB_STATUS_OK
 * when the macro body falls through.
 * NOTE(review): the macro is defined outside this chunk and may return
 * early on its own - confirm.
 */
+ lxb_status_t
1896
+ lxb_encoding_decode_windows_874(lxb_encoding_decode_t *ctx,
1897
+ const lxb_char_t **data, const lxb_char_t *end)
1898
+ {
1899
+ LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_874);
1900
+
1901
+ return LXB_STATUS_OK;
1902
+ }
1903
+
1904
/*
 * x-mac-cyrillic decoder: expands LXB_ENCODING_DECODE_SINGLE with the
 * lxb_encoding_single_index_x_mac_cyrillic table and returns LXB_STATUS_OK
 * when the macro body falls through.
 * NOTE(review): the macro is defined outside this chunk and may return
 * early on its own - confirm.
 */
+ lxb_status_t
1905
+ lxb_encoding_decode_x_mac_cyrillic(lxb_encoding_decode_t *ctx,
1906
+ const lxb_char_t **data, const lxb_char_t *end)
1907
+ {
1908
+ LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_x_mac_cyrillic);
1909
+
1910
+ return LXB_STATUS_OK;
1911
+ }
1912
+
1913
/*
 * x-user-defined decoder.
 *
 * Bytes below 0x80 are appended unchanged; bytes 0x80..0xFF are appended as
 * 0xF780 + (byte - 0x80). Returns LXB_STATUS_OK once *data reaches end.
 *
 * The byte is only peeked in the condition; the read that advances *data
 * sits inside the LXB_ENCODING_DECODE_APPEND argument.
 * NOTE(review): that macro is defined outside this chunk and presumably
 * returns early when the output buffer is full, without evaluating its
 * argument - confirm before moving the increment out of the macro call.
 */
+ lxb_status_t
1914
+ lxb_encoding_decode_x_user_defined(lxb_encoding_decode_t *ctx,
1915
+ const lxb_char_t **data, const lxb_char_t *end)
1916
+ {
1917
+ while (*data < end) {
1918
+ if (**data < 0x80) {
1919
+ LXB_ENCODING_DECODE_APPEND(ctx, *(*data)++);
1920
+ }
1921
+ else {
1922
+ LXB_ENCODING_DECODE_APPEND(ctx, 0xF780 + (*(*data)++) - 0x80);
1923
+ }
1924
+ }
1925
+
1926
+ return LXB_STATUS_OK;
1927
+ }
1928
+
1929
+ /*
1930
+ * Single
1931
+ */
1932
/*
 * Default single-code-point decoder: forwards to
 * lxb_encoding_decode_utf_8_single() and returns its result unchanged.
 */
+ lxb_codepoint_t
1933
+ lxb_encoding_decode_default_single(lxb_encoding_decode_t *ctx,
1934
+ const lxb_char_t **data, const lxb_char_t *end)
1935
+ {
1936
+ return lxb_encoding_decode_utf_8_single(ctx, data, end);
1937
+ }
1938
+
1939
/*
 * Single-code-point decoder for the "auto" encoding: reads no input and
 * always returns LXB_ENCODING_DECODE_ERROR.
 */
+ lxb_codepoint_t
1940
+ lxb_encoding_decode_auto_single(lxb_encoding_decode_t *ctx,
1941
+ const lxb_char_t **data, const lxb_char_t *end)
1942
+ {
1943
+ return LXB_ENCODING_DECODE_ERROR;
1944
+ }
1945
+
1946
/*
 * Single-code-point decoder for the "undefined" encoding: reads no input
 * and always returns LXB_ENCODING_DECODE_ERROR.
 */
+ lxb_codepoint_t
1947
+ lxb_encoding_decode_undefined_single(lxb_encoding_decode_t *ctx,
1948
+ const lxb_char_t **data, const lxb_char_t *end)
1949
+ {
1950
+ return LXB_ENCODING_DECODE_ERROR;
1951
+ }
1952
+
1953
+ lxb_codepoint_t
1954
+ lxb_encoding_decode_big5_single(lxb_encoding_decode_t *ctx,
1955
+ const lxb_char_t **data, const lxb_char_t *end)
1956
+ {
1957
+ uint32_t index;
1958
+ lxb_char_t lead, byte;
1959
+
1960
+ if (ctx->u.lead != 0x00) {
1961
+ if (ctx->second_codepoint != 0x00) {
1962
+ (*data)++;
1963
+
1964
+ ctx->u.lead = 0x00;
1965
+
1966
+ ctx->codepoint = ctx->second_codepoint;
1967
+ ctx->second_codepoint = 0x00;
1968
+
1969
+ return ctx->codepoint;
1970
+ }
1971
+
1972
+ lead = (lxb_char_t) ctx->u.lead;
1973
+ ctx->u.lead = 0x00;
1974
+
1975
+ goto lead_state;
1976
+ }
1977
+
1978
+ lead = *(*data)++;
1979
+
1980
+ if (lead < 0x80) {
1981
+ return lead;
1982
+ }
1983
+
1984
+ if ((unsigned) (lead - 0x81) > (0xFE - 0x81)) {
1985
+ return LXB_ENCODING_DECODE_ERROR;
1986
+ }
1987
+
1988
+ if (*data >= end) {
1989
+ ctx->u.lead = lead;
1990
+
1991
+ return LXB_ENCODING_DECODE_CONTINUE;
1992
+ }
1993
+
1994
+ lead_state:
1995
+
1996
+ index = 0;
1997
+ byte = **data;
1998
+
1999
+ if ((unsigned) (byte - 0x40) <= (0x7E - 0x40)
2000
+ || (unsigned) (byte - 0xA1) <= (0xFE - 0xA1))
2001
+ {
2002
+ if (byte < 0x7F) {
2003
+ /* Max index == (0xFE - 0x81) * 157 + (0x7E - 0x62) == 19653 */
2004
+ index = (lead - 0x81) * 157 + (byte - 0x40);
2005
+ }
2006
+ else {
2007
+ /* Max index == (0xFE - 0x81) * 157 + (0xFE - 0x62) == 19781 */
2008
+ index = (lead - 0x81) * 157 + (byte - 0x62);
2009
+ }
2010
+ }
2011
+
2012
+ /*
2013
+ * 1133 U+00CA U+0304 Ê̄ (LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND MACRON)
2014
+ * 1135 U+00CA U+030C Ê̌ (LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND CARON)
2015
+ * 1164 U+00EA U+0304 ê̄ (LATIN SMALL LETTER E WITH CIRCUMFLEX AND MACRON)
2016
+ * 1166 U+00EA U+030C ê̌ (LATIN SMALL LETTER E WITH CIRCUMFLEX AND CARON)
2017
+ */
2018
+ switch (index) {
2019
+ case 1133:
2020
+ ctx->u.lead = lead;
2021
+ ctx->second_codepoint = 0x0304;
2022
+ return 0x00CA;
2023
+
2024
+ case 1135:
2025
+ ctx->u.lead = lead;
2026
+ ctx->second_codepoint = 0x030C;
2027
+ return 0x00CA;
2028
+
2029
+ case 1164:
2030
+ ctx->u.lead = lead;
2031
+ ctx->second_codepoint = 0x0304;
2032
+ return 0x00EA;
2033
+
2034
+ case 1166:
2035
+ ctx->u.lead = lead;
2036
+ ctx->second_codepoint = 0x030C;
2037
+ return 0x00EA;
2038
+
2039
+ case 0:
2040
+ goto failed;
2041
+ }
2042
+
2043
+ ctx->codepoint = lxb_encoding_multi_index_big5[index].codepoint;
2044
+ if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
2045
+ goto failed;
2046
+ }
2047
+
2048
+ (*data)++;
2049
+
2050
+ return ctx->codepoint;
2051
+
2052
+ failed:
2053
+
2054
+ if (byte >= 0x80) {
2055
+ (*data)++;
2056
+ }
2057
+
2058
+ return LXB_ENCODING_DECODE_ERROR;
2059
+ }
2060
+
2061
+ lxb_codepoint_t
2062
+ lxb_encoding_decode_euc_jp_single(lxb_encoding_decode_t *ctx,
2063
+ const lxb_char_t **data, const lxb_char_t *end)
2064
+ {
2065
+ bool is_jis0212;
2066
+ lxb_char_t byte, lead;
2067
+
2068
+ if (ctx->u.euc_jp.lead != 0x00) {
2069
+ lead = ctx->u.euc_jp.lead;
2070
+ byte = *(*data)++;
2071
+
2072
+ ctx->u.euc_jp.lead = 0x00;
2073
+
2074
+ if (ctx->u.euc_jp.is_jis0212) {
2075
+ is_jis0212 = true;
2076
+ ctx->u.euc_jp.is_jis0212 = false;
2077
+
2078
+ goto lead_jis_state;
2079
+ }
2080
+
2081
+ goto lead_state;
2082
+ }
2083
+
2084
+ lead = *(*data)++;
2085
+
2086
+ if (lead < 0x80) {
2087
+ return lead;
2088
+ }
2089
+
2090
+ if ((unsigned) (lead - 0xA1) > (0xFE - 0xA1)
2091
+ && (lead != 0x8E && lead != 0x8F))
2092
+ {
2093
+ return LXB_ENCODING_DECODE_ERROR;
2094
+ }
2095
+
2096
+ if (*data >= end) {
2097
+ ctx->u.euc_jp.lead = lead;
2098
+ return LXB_ENCODING_DECODE_CONTINUE;
2099
+ }
2100
+
2101
+ byte = *(*data)++;
2102
+
2103
+ lead_state:
2104
+
2105
+ if (lead == 0x8E && (unsigned) (byte - 0xA1) <= (0xDF - 0xA1)) {
2106
+ return 0xFF61 - 0xA1 + byte;
2107
+ }
2108
+
2109
+ is_jis0212 = false;
2110
+
2111
+ if (lead == 0x8F && (unsigned) (byte - 0xA1) <= (0xFE - 0xA1)) {
2112
+ if (*data >= end) {
2113
+ ctx->u.euc_jp.lead = byte;
2114
+ ctx->u.euc_jp.is_jis0212 = true;
2115
+
2116
+ return LXB_ENCODING_DECODE_CONTINUE;
2117
+ }
2118
+
2119
+ lead = byte;
2120
+ byte = *(*data)++;
2121
+ is_jis0212 = true;
2122
+ }
2123
+
2124
+ lead_jis_state:
2125
+
2126
+ if ((unsigned) (lead - 0xA1) > (0xFE - 0xA1)
2127
+ || (unsigned) (byte - 0xA1) > (0xFE - 0xA1))
2128
+ {
2129
+ goto failed;
2130
+ }
2131
+
2132
+ /* Max index == (0xFE - 0xA1) * 94 + 0xFE - 0xA1 == 8835 */
2133
+ ctx->codepoint = (lead - 0xA1) * 94 + byte - 0xA1;
2134
+
2135
+ if (is_jis0212) {
2136
+ if ((sizeof(lxb_encoding_multi_index_jis0212)
2137
+ / sizeof(lxb_encoding_multi_index_t)) <= ctx->codepoint)
2138
+ {
2139
+ goto failed;
2140
+ }
2141
+
2142
+ ctx->codepoint = lxb_encoding_multi_index_jis0212[ctx->codepoint].codepoint;
2143
+ }
2144
+ else {
2145
+ if ((sizeof(lxb_encoding_multi_index_jis0208)
2146
+ / sizeof(lxb_encoding_multi_index_t)) <= ctx->codepoint)
2147
+ {
2148
+ goto failed;
2149
+ }
2150
+
2151
+ ctx->codepoint = lxb_encoding_multi_index_jis0208[ctx->codepoint].codepoint;
2152
+ }
2153
+
2154
+ if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
2155
+ goto failed;
2156
+ }
2157
+
2158
+ return ctx->codepoint;
2159
+
2160
+ failed:
2161
+
2162
+ if (byte < 0x80) {
2163
+ (*data)--;
2164
+ }
2165
+
2166
+ return LXB_ENCODING_DECODE_ERROR;
2167
+ }
2168
+
2169
+ lxb_codepoint_t
2170
+ lxb_encoding_decode_euc_kr_single(lxb_encoding_decode_t *ctx,
2171
+ const lxb_char_t **data, const lxb_char_t *end)
2172
+ {
2173
+ lxb_char_t lead, byte;
2174
+
2175
+ if (ctx->u.lead != 0x00) {
2176
+ lead = (lxb_char_t) ctx->u.lead;
2177
+ ctx->u.lead = 0x00;
2178
+
2179
+ goto lead_state;
2180
+ }
2181
+
2182
+ lead = *(*data)++;
2183
+
2184
+ if (lead < 0x80) {
2185
+ return lead;
2186
+ }
2187
+
2188
+ if ((unsigned) (lead - 0x81) > (0xFE - 0x81)) {
2189
+ return LXB_ENCODING_DECODE_ERROR;
2190
+ }
2191
+
2192
+ if (*data == end) {
2193
+ ctx->u.lead = lead;
2194
+ return LXB_ENCODING_DECODE_CONTINUE;
2195
+ }
2196
+
2197
+ lead_state:
2198
+
2199
+ byte = *(*data)++;
2200
+
2201
+ if ((unsigned) (byte - 0x41) > (0xFE - 0x41)) {
2202
+ goto failed;
2203
+ }
2204
+
2205
+ /* Max index == (0xFE - 0x81) * 190 + (0xFE - 0x41) == 23939 */
2206
+ ctx->codepoint = (lead - 0x81) * 190 + (byte - 0x41);
2207
+
2208
+ if (ctx->codepoint >= sizeof(lxb_encoding_multi_index_euc_kr)
2209
+ / sizeof(lxb_encoding_multi_index_t))
2210
+ {
2211
+ goto failed;
2212
+ }
2213
+
2214
+ ctx->codepoint = lxb_encoding_multi_index_euc_kr[ctx->codepoint].codepoint;
2215
+ if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
2216
+ goto failed;
2217
+ }
2218
+
2219
+ return ctx->codepoint;
2220
+
2221
+ failed:
2222
+
2223
+ if (byte < 0x80) {
2224
+ (*data)--;
2225
+ }
2226
+
2227
+ return LXB_ENCODING_DECODE_ERROR;
2228
+ }
2229
+
2230
/*
 * GBK single-code-point decoder: forwards to
 * lxb_encoding_decode_gb18030_single() and returns its result unchanged.
 */
+ lxb_codepoint_t
2231
+ lxb_encoding_decode_gbk_single(lxb_encoding_decode_t *ctx,
2232
+ const lxb_char_t **data, const lxb_char_t *end)
2233
+ {
2234
+ return lxb_encoding_decode_gb18030_single(ctx, data, end);
2235
+ }
2236
+
2237
+ lxb_codepoint_t
2238
+ lxb_encoding_decode_ibm866_single(lxb_encoding_decode_t *ctx,
2239
+ const lxb_char_t **data, const lxb_char_t *end)
2240
+ {
2241
+ if (**data < 0x80) {
2242
+ return *(*data)++;
2243
+ }
2244
+
2245
+ return lxb_encoding_single_index_ibm866[*(*data)++ - 0x80].codepoint;
2246
+ }
2247
+
2248
/*
 * Decodes one ISO-2022-JP code point starting at *data.
 *
 * The decoder is a state machine driven by ctx->u.iso_2022_jp (aliased as
 * iso below):
 *   - state:     current decoder state (ASCII, ROMAN, KATAKANA, LEAD, TRAIL,
 *                ESCAPE_START, ESCAPE);
 *   - out_state: the state to fall back to when an escape sequence turns
 *                out to be invalid;
 *   - out_flag:  set when an escape sequence has just switched state and
 *                cleared when a byte is handled in a non-escape state;
 *   - lead:      first byte of a two-byte character, or the first byte
 *                after ESC while an escape sequence is being read;
 *   - prepand:   a byte saved by a previous call that must be processed
 *                before any new input is read.
 *
 * Returns a code point, LXB_ENCODING_DECODE_ERROR, or
 * LXB_ENCODING_DECODE_CONTINUE when the input is exhausted before a code
 * point was produced.
 */
+ lxb_codepoint_t
2249
+ lxb_encoding_decode_iso_2022_jp_single(lxb_encoding_decode_t *ctx,
2250
+ const lxb_char_t **data, const lxb_char_t *end)
2251
+ {
2252
+ lxb_char_t byte;
2253
+ lxb_encoding_ctx_2022_jp_t *iso = &ctx->u.iso_2022_jp;
2254
+
2255
/* A byte was pushed back by an earlier call: process it first. */
+ if (iso->prepand != 0x00) {
2256
+ byte = iso->prepand;
2257
+ iso->prepand = 0x00;
2258
+
2259
+ goto prepand;
2260
+ }
2261
+
2262
+ do {
2263
+ byte = *(*data)++;
2264
+
2265
+ prepand:
2266
+
2267
+ switch (iso->state) {
2268
+ case LXB_ENCODING_DECODE_2022_JP_ASCII:
2269
+ if (byte == 0x1B) {
2270
+ iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
2271
+
2272
+ break;
2273
+ }
2274
+
2275
+ /* 0x00 to 0x7F, excluding 0x0E, 0x0F, and 0x1B */
2276
+ if ((unsigned) (byte - 0x00) <= (0x7F - 0x00)
2277
+ && byte != 0x0E && byte != 0x0F)
2278
+ {
2279
+ iso->out_flag = false;
2280
+
2281
+ return byte;
2282
+ }
2283
+
2284
+ iso->out_flag = false;
2285
+
2286
+ return LXB_ENCODING_DECODE_ERROR;
2287
+
2288
+ case LXB_ENCODING_DECODE_2022_JP_ROMAN:
2289
+ switch (byte) {
2290
+ case 0x1B:
2291
+ iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
2292
+
2293
/* `continue` applies to the do/while: it jumps to the loop condition. */
+ continue;
2294
+
2295
/* In the Roman state 0x5C decodes to U+00A5 and 0x7E to U+203E. */
+ case 0x5C:
2296
+ iso->out_flag = false;
2297
+
2298
+ return 0x00A5;
2299
+
2300
+ case 0x7E:
2301
+ iso->out_flag = false;
2302
+
2303
+ return 0x203E;
2304
+
2305
/* 0x0E and 0x0F fall out of the inner switch to the error return below. */
+ case 0x0E:
2306
+ case 0x0F:
2307
+ break;
2308
+
2309
+ default:
2310
+ /* 0x00 to 0x7F */
2311
+ if ((unsigned) (byte - 0x00) <= (0x7F - 0x00)) {
2312
+ iso->out_flag = false;
2313
+
2314
+ return byte;
2315
+ }
2316
+
2317
+ break;
2318
+ }
2319
+
2320
+ iso->out_flag = false;
2321
+
2322
+ return LXB_ENCODING_DECODE_ERROR;
2323
+
2324
+ case LXB_ENCODING_DECODE_2022_JP_KATAKANA:
2325
+ if (byte == 0x1B) {
2326
+ iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
2327
+
2328
+ break;
2329
+ }
2330
+
2331
+ /* 0x21 to 0x5F */
2332
+ if ((unsigned) (byte - 0x21) <= (0x5F - 0x21)) {
2333
+ iso->out_flag = false;
2334
+
2335
+ return 0xFF61 - 0x21 + byte;
2336
+ }
2337
+
2338
+ iso->out_flag = false;
2339
+
2340
+ return LXB_ENCODING_DECODE_ERROR;
2341
+
2342
+ case LXB_ENCODING_DECODE_2022_JP_LEAD:
2343
+ if (byte == 0x1B) {
2344
+ iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
2345
+
2346
+ break;
2347
+ }
2348
+
2349
/* First byte of a two-byte character: remember it and wait for the trail. */
+ /* 0x21 to 0x7E */
2350
+ if ((unsigned) (byte - 0x21) <= (0x7E - 0x21)) {
2351
+ iso->out_flag = false;
2352
+ iso->lead = byte;
2353
+ iso->state = LXB_ENCODING_DECODE_2022_JP_TRAIL;
2354
+
2355
+ break;
2356
+ }
2357
+
2358
+ iso->out_flag = false;
2359
+
2360
+ return LXB_ENCODING_DECODE_ERROR;
2361
+
2362
+ case LXB_ENCODING_DECODE_2022_JP_TRAIL:
2363
+ if (byte == 0x1B) {
2364
+ iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
2365
+
2366
+ return LXB_ENCODING_DECODE_ERROR;
2367
+ }
2368
+
2369
+ iso->state = LXB_ENCODING_DECODE_2022_JP_LEAD;
2370
+
2371
+ /* 0x21 to 0x7E */
2372
+ if ((unsigned) (byte - 0x21) <= (0x7E - 0x21)) {
2373
+ /* Max index == (0x7E - 0x21) * 94 + 0x7E - 0x21 == 8835 */
2374
+ ctx->codepoint = (iso->lead - 0x21) * 94 + byte - 0x21;
2375
+
2376
/* The table entry is returned as is, without an error-code-point check. */
+ return lxb_encoding_multi_index_jis0208[ctx->codepoint].codepoint;
2377
+ }
2378
+
2379
+ return LXB_ENCODING_DECODE_ERROR;
2380
+
2381
+ case LXB_ENCODING_DECODE_2022_JP_ESCAPE_START:
2382
/* After ESC only 0x24 or 0x28 may follow. */
+ if (byte == 0x24 || byte == 0x28) {
2383
+ iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE;
2384
+ iso->lead = byte;
2385
+
2386
+ break;
2387
+ }
2388
+
2389
/* Invalid escape: un-read the byte and return to the previous state. */
+ (*data)--;
2390
+
2391
+ iso->out_flag = false;
2392
+ iso->state = ctx->u.iso_2022_jp.out_state;
2393
+
2394
+ return LXB_ENCODING_DECODE_ERROR;
2395
+
2396
+ case LXB_ENCODING_DECODE_2022_JP_ESCAPE:
2397
+ iso->state = LXB_ENCODING_DECODE_2022_JP_UNSET;
2398
+
2399
/*
 * Recognized sequences after ESC:
 *   0x28 0x42 -> ASCII, 0x28 0x4A -> ROMAN, 0x28 0x49 -> KATAKANA,
 *   0x24 0x40 or 0x24 0x42 -> LEAD.
 */
+ if (iso->lead == 0x28) {
2400
+ if (byte == 0x42) {
2401
+ iso->state = LXB_ENCODING_DECODE_2022_JP_ASCII;
2402
+ }
2403
+ else if (byte == 0x4A) {
2404
+ iso->state = LXB_ENCODING_DECODE_2022_JP_ROMAN;
2405
+ }
2406
+ else if (byte == 0x49) {
2407
+ iso->state = LXB_ENCODING_DECODE_2022_JP_KATAKANA;
2408
+ }
2409
+ }
2410
+ else if (iso->lead == 0x24) {
2411
+ if (byte == 0x40 || byte == 0x42) {
2412
+ iso->state = LXB_ENCODING_DECODE_2022_JP_LEAD;
2413
+ }
2414
+ }
2415
+
2416
/*
 * Unrecognized sequence: save the byte after ESC for re-processing on the
 * next call, un-read the current byte, and return to the previous state.
 */
+ if (iso->state == LXB_ENCODING_DECODE_2022_JP_UNSET) {
2417
+ iso->prepand = iso->lead;
2418
+ iso->lead = 0x00;
2419
+
2420
+ (*data)--;
2421
+
2422
+ iso->out_flag = false;
2423
+ iso->state = iso->out_state;
2424
+
2425
+ return LXB_ENCODING_DECODE_ERROR;
2426
+ }
2427
+
2428
+ iso->lead = 0x00;
2429
+ iso->out_state = iso->state;
2430
+
2431
/* An escape sequence directly after another one is reported as an error. */
+ if (iso->out_flag) {
2432
+ return LXB_ENCODING_DECODE_ERROR;
2433
+ }
2434
+
2435
+ iso->out_flag = true;
2436
+
2437
+ break;
2438
+ }
2439
+ }
2440
+ while (*data < end);
2441
+
2442
+ return LXB_ENCODING_DECODE_CONTINUE;
2443
+ }
2444
+
2445
+ lxb_codepoint_t
2446
+ lxb_encoding_decode_iso_8859_10_single(lxb_encoding_decode_t *ctx,
2447
+ const lxb_char_t **data, const lxb_char_t *end)
2448
+ {
2449
+ if (**data < 0x80) {
2450
+ return *(*data)++;
2451
+ }
2452
+
2453
+ return lxb_encoding_single_index_iso_8859_10[*(*data)++ - 0x80].codepoint;
2454
+ }
2455
+
2456
+ lxb_codepoint_t
2457
+ lxb_encoding_decode_iso_8859_13_single(lxb_encoding_decode_t *ctx,
2458
+ const lxb_char_t **data, const lxb_char_t *end)
2459
+ {
2460
+ if (**data < 0x80) {
2461
+ return *(*data)++;
2462
+ }
2463
+
2464
+ return lxb_encoding_single_index_iso_8859_13[*(*data)++ - 0x80].codepoint;
2465
+ }
2466
+
2467
+ lxb_codepoint_t
2468
+ lxb_encoding_decode_iso_8859_14_single(lxb_encoding_decode_t *ctx,
2469
+ const lxb_char_t **data, const lxb_char_t *end)
2470
+ {
2471
+ if (**data < 0x80) {
2472
+ return *(*data)++;
2473
+ }
2474
+
2475
+ return lxb_encoding_single_index_iso_8859_14[*(*data)++ - 0x80].codepoint;
2476
+ }
2477
+
2478
+ lxb_codepoint_t
2479
+ lxb_encoding_decode_iso_8859_15_single(lxb_encoding_decode_t *ctx,
2480
+ const lxb_char_t **data, const lxb_char_t *end)
2481
+ {
2482
+ if (**data < 0x80) {
2483
+ return *(*data)++;
2484
+ }
2485
+
2486
+ return lxb_encoding_single_index_iso_8859_15[*(*data)++ - 0x80].codepoint;
2487
+ }
2488
+
2489
+ lxb_codepoint_t
2490
+ lxb_encoding_decode_iso_8859_16_single(lxb_encoding_decode_t *ctx,
2491
+ const lxb_char_t **data, const lxb_char_t *end)
2492
+ {
2493
+ if (**data < 0x80) {
2494
+ return *(*data)++;
2495
+ }
2496
+
2497
+ return lxb_encoding_single_index_iso_8859_16[*(*data)++ - 0x80].codepoint;
2498
+ }
2499
+
2500
+ lxb_codepoint_t
2501
+ lxb_encoding_decode_iso_8859_2_single(lxb_encoding_decode_t *ctx,
2502
+ const lxb_char_t **data, const lxb_char_t *end)
2503
+ {
2504
+ if (**data < 0x80) {
2505
+ return *(*data)++;
2506
+ }
2507
+
2508
+ return lxb_encoding_single_index_iso_8859_2[*(*data)++ - 0x80].codepoint;
2509
+ }
2510
+
2511
+ lxb_codepoint_t
2512
+ lxb_encoding_decode_iso_8859_3_single(lxb_encoding_decode_t *ctx,
2513
+ const lxb_char_t **data, const lxb_char_t *end)
2514
+ {
2515
+ if (**data < 0x80) {
2516
+ return *(*data)++;
2517
+ }
2518
+
2519
+ return lxb_encoding_single_index_iso_8859_3[*(*data)++ - 0x80].codepoint;
2520
+ }
2521
+
2522
+ lxb_codepoint_t
2523
+ lxb_encoding_decode_iso_8859_4_single(lxb_encoding_decode_t *ctx,
2524
+ const lxb_char_t **data, const lxb_char_t *end)
2525
+ {
2526
+ if (**data < 0x80) {
2527
+ return *(*data)++;
2528
+ }
2529
+
2530
+ return lxb_encoding_single_index_iso_8859_4[*(*data)++ - 0x80].codepoint;
2531
+ }
2532
+
2533
+ lxb_codepoint_t
2534
+ lxb_encoding_decode_iso_8859_5_single(lxb_encoding_decode_t *ctx,
2535
+ const lxb_char_t **data, const lxb_char_t *end)
2536
+ {
2537
+ if (**data < 0x80) {
2538
+ return *(*data)++;
2539
+ }
2540
+
2541
+ return lxb_encoding_single_index_iso_8859_5[*(*data)++ - 0x80].codepoint;
2542
+ }
2543
+
2544
+ lxb_codepoint_t
2545
+ lxb_encoding_decode_iso_8859_6_single(lxb_encoding_decode_t *ctx,
2546
+ const lxb_char_t **data, const lxb_char_t *end)
2547
+ {
2548
+ if (**data < 0x80) {
2549
+ return *(*data)++;
2550
+ }
2551
+
2552
+ return lxb_encoding_single_index_iso_8859_6[*(*data)++ - 0x80].codepoint;
2553
+ }
2554
+
2555
+ lxb_codepoint_t
2556
+ lxb_encoding_decode_iso_8859_7_single(lxb_encoding_decode_t *ctx,
2557
+ const lxb_char_t **data, const lxb_char_t *end)
2558
+ {
2559
+ if (**data < 0x80) {
2560
+ return *(*data)++;
2561
+ }
2562
+
2563
+ return lxb_encoding_single_index_iso_8859_7[*(*data)++ - 0x80].codepoint;
2564
+ }
2565
+
2566
+ lxb_codepoint_t
2567
+ lxb_encoding_decode_iso_8859_8_single(lxb_encoding_decode_t *ctx,
2568
+ const lxb_char_t **data, const lxb_char_t *end)
2569
+ {
2570
+ if (**data < 0x80) {
2571
+ return *(*data)++;
2572
+ }
2573
+
2574
+ return lxb_encoding_single_index_iso_8859_8[*(*data)++ - 0x80].codepoint;
2575
+ }
2576
+
2577
+ lxb_codepoint_t
2578
+ lxb_encoding_decode_iso_8859_8_i_single(lxb_encoding_decode_t *ctx,
2579
+ const lxb_char_t **data, const lxb_char_t *end)
2580
+ {
2581
+ if (**data < 0x80) {
2582
+ return *(*data)++;
2583
+ }
2584
+
2585
+ return lxb_encoding_single_index_iso_8859_8[*(*data)++ - 0x80].codepoint;
2586
+ }
2587
+
2588
+ lxb_codepoint_t
2589
+ lxb_encoding_decode_koi8_r_single(lxb_encoding_decode_t *ctx,
2590
+ const lxb_char_t **data, const lxb_char_t *end)
2591
+ {
2592
+ if (**data < 0x80) {
2593
+ return *(*data)++;
2594
+ }
2595
+
2596
+ return lxb_encoding_single_index_koi8_r[*(*data)++ - 0x80].codepoint;
2597
+ }
2598
+
2599
+ lxb_codepoint_t
2600
+ lxb_encoding_decode_koi8_u_single(lxb_encoding_decode_t *ctx,
2601
+ const lxb_char_t **data, const lxb_char_t *end)
2602
+ {
2603
+ if (**data < 0x80) {
2604
+ return *(*data)++;
2605
+ }
2606
+
2607
+ return lxb_encoding_single_index_koi8_u[*(*data)++ - 0x80].codepoint;
2608
+ }
2609
+
2610
+ lxb_codepoint_t
2611
+ lxb_encoding_decode_shift_jis_single(lxb_encoding_decode_t *ctx,
2612
+ const lxb_char_t **data, const lxb_char_t *end)
2613
+ {
2614
+ lxb_char_t byte, lead;
2615
+
2616
+ if (ctx->u.lead != 0x00) {
2617
+ lead = (lxb_char_t) ctx->u.lead;
2618
+ ctx->u.lead = 0x00;
2619
+
2620
+ goto lead_state;
2621
+ }
2622
+
2623
+ lead = *(*data)++;
2624
+
2625
+ if (lead <= 0x80) {
2626
+ return lead;
2627
+ }
2628
+
2629
+ if ((unsigned) (lead - 0xA1) <= (0xDF - 0xA1)) {
2630
+ return 0xFF61 - 0xA1 + lead;
2631
+ }
2632
+
2633
+ if ((unsigned) (lead - 0x81) > (0x9F - 0x81)
2634
+ && lead != 0xE0 && lead != 0xFC)
2635
+ {
2636
+ return LXB_ENCODING_DECODE_ERROR;
2637
+ }
2638
+
2639
+ if (*data >= end) {
2640
+ ctx->u.lead = lead;
2641
+
2642
+ return LXB_ENCODING_DECODE_CONTINUE;
2643
+ }
2644
+
2645
+ lead_state:
2646
+
2647
+ byte = *(*data)++;
2648
+
2649
+ if (byte < 0x7F) {
2650
+ ctx->codepoint = 0x40;
2651
+ }
2652
+ else {
2653
+ ctx->codepoint = 0x41;
2654
+ }
2655
+
2656
+ if (lead < 0xA0) {
2657
+ ctx->second_codepoint = 0x81;
2658
+ }
2659
+ else {
2660
+ ctx->second_codepoint = 0xC1;
2661
+ }
2662
+
2663
+ if ((unsigned) (byte - 0x40) <= (0x7E - 0x40)
2664
+ || (unsigned) (byte - 0x80) <= (0xFC - 0x80))
2665
+ {
2666
+ /* Max index == (0xFC - 0xC1) * 188 + 0xFC - 0x41 = 11279 */
2667
+ ctx->codepoint = (lead - ctx->second_codepoint) * 188
2668
+ + byte - ctx->codepoint;
2669
+
2670
+ if (ctx->codepoint >= (sizeof(lxb_encoding_multi_index_jis0208)
2671
+ / sizeof(lxb_encoding_multi_index_t)))
2672
+ {
2673
+ goto failed;
2674
+ }
2675
+
2676
+ if ((unsigned) (ctx->codepoint - 8836) <= (10715 - 8836)) {
2677
+ return 0xE000 - 8836 + ctx->codepoint;
2678
+ }
2679
+
2680
+ ctx->codepoint = lxb_encoding_multi_index_jis0208[ctx->codepoint].codepoint;
2681
+ if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
2682
+ goto failed;
2683
+ }
2684
+
2685
+ return ctx->codepoint;
2686
+ }
2687
+
2688
+ failed:
2689
+
2690
+ if (byte < 0x80) {
2691
+ (*data)--;
2692
+ }
2693
+
2694
+ return LXB_ENCODING_DECODE_ERROR;
2695
+ }
2696
+
2697
+ lxb_inline lxb_codepoint_t
2698
+ lxb_encoding_decode_utf_16_single(lxb_encoding_decode_t *ctx, bool is_be,
2699
+ const lxb_char_t **data, const lxb_char_t *end)
2700
+ {
2701
+ unsigned lead;
2702
+ lxb_codepoint_t unit;
2703
+
2704
+ if (ctx->u.lead != 0x00) {
2705
+ lead = ctx->u.lead - 0x01;
2706
+ ctx->u.lead = 0x00;
2707
+
2708
+ goto lead_state;
2709
+ }
2710
+
2711
+ pair_state:
2712
+
2713
+ lead = *(*data)++;
2714
+
2715
+ if (*data >= end) {
2716
+ ctx->u.lead = lead + 0x01;
2717
+ return LXB_ENCODING_DECODE_CONTINUE;
2718
+ }
2719
+
2720
+ lead_state:
2721
+
2722
+ /* For UTF-16BE or UTF-16LE */
2723
+ if (is_be) {
2724
+ unit = (lead << 8) + *(*data)++;
2725
+ }
2726
+ else {
2727
+ unit = (*(*data)++ << 8) + lead;
2728
+ }
2729
+
2730
+ if (ctx->second_codepoint != 0x00) {
2731
+ if ((unsigned) (unit - 0xDC00) <= (0xDFFF - 0xDC00)) {
2732
+ ctx->codepoint = 0x10000 + ((ctx->second_codepoint - 0xD800) << 10)
2733
+ + (unit - 0xDC00);
2734
+
2735
+ ctx->second_codepoint = 0x00;
2736
+ return ctx->codepoint;
2737
+ }
2738
+
2739
+ (*data)--;
2740
+
2741
+ ctx->u.lead = lead + 0x01;
2742
+ ctx->second_codepoint = 0x00;
2743
+
2744
+ return LXB_ENCODING_DECODE_ERROR;
2745
+ }
2746
+
2747
+ /* Surrogate pair */
2748
+ if ((unsigned) (unit - 0xD800) <= (0xDFFF - 0xD800)) {
2749
+ if ((unsigned) (unit - 0xDC00) <= (0xDFFF - 0xDC00)) {
2750
+ return LXB_ENCODING_DECODE_ERROR;
2751
+ }
2752
+
2753
+ ctx->second_codepoint = unit;
2754
+
2755
+ if (*data >= end) {
2756
+ return LXB_ENCODING_DECODE_CONTINUE;
2757
+ }
2758
+
2759
+ goto pair_state;
2760
+ }
2761
+
2762
+ return unit;
2763
+ }
2764
+
2765
+ lxb_codepoint_t
2766
+ lxb_encoding_decode_utf_16be_single(lxb_encoding_decode_t *ctx,
2767
+ const lxb_char_t **data, const lxb_char_t *end)
2768
+ {
2769
+ return lxb_encoding_decode_utf_16_single(ctx, true, data, end);
2770
+ }
2771
+
2772
+ lxb_codepoint_t
2773
+ lxb_encoding_decode_utf_16le_single(lxb_encoding_decode_t *ctx,
2774
+ const lxb_char_t **data, const lxb_char_t *end)
2775
+ {
2776
+ return lxb_encoding_decode_utf_16_single(ctx, false, data, end);
2777
+ }
2778
+
2779
+ lxb_codepoint_t
2780
+ lxb_encoding_decode_utf_8_single(lxb_encoding_decode_t *ctx,
2781
+ const lxb_char_t **data, const lxb_char_t *end)
2782
+ {
2783
+ unsigned needed;
2784
+ lxb_char_t ch;
2785
+ const lxb_char_t *p;
2786
+
2787
+ if (ctx->u.utf_8.need != 0) {
2788
+ needed = ctx->u.utf_8.need;
2789
+ ctx->u.utf_8.need = 0;
2790
+
2791
+ if (ctx->u.utf_8.lower != 0x00) {
2792
+ LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SINGLE(ctx->u.utf_8.lower,
2793
+ ctx->u.utf_8.upper);
2794
+ ctx->u.utf_8.lower = 0x00;
2795
+ }
2796
+
2797
+ goto decode;
2798
+ }
2799
+
2800
+ ch = *(*data)++;
2801
+
2802
+ if (ch < 0x80) {
2803
+ return ch;
2804
+ }
2805
+ else if (ch <= 0xDF) {
2806
+ if (ch < 0xC2) {
2807
+ return LXB_ENCODING_DECODE_ERROR;
2808
+ }
2809
+
2810
+ needed = 1;
2811
+ ctx->codepoint = ch & 0x1F;
2812
+ }
2813
+ else if (ch < 0xF0) {
2814
+ needed = 2;
2815
+ ctx->codepoint = ch & 0x0F;
2816
+
2817
+ if (*data == end) {
2818
+ LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SET_SINGLE(0xE0, 0xED,
2819
+ 0xA0, 0x9F);
2820
+ goto next;
2821
+ }
2822
+
2823
+ if (ch == 0xE0) {
2824
+ LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SINGLE(0xA0, 0xBF);
2825
+ }
2826
+ else if (ch == 0xED) {
2827
+ LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SINGLE(0x80, 0x9F);
2828
+ }
2829
+ }
2830
+ else if (ch < 0xF5) {
2831
+ needed = 3;
2832
+ ctx->codepoint = ch & 0x07;
2833
+
2834
+ if (*data == end) {
2835
+ LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SET_SINGLE(0xF0, 0xF4,
2836
+ 0x90, 0x8F);
2837
+
2838
+ goto next;
2839
+ }
2840
+
2841
+ if (ch == 0xF0) {
2842
+ LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SINGLE(0x90, 0xBF);
2843
+ }
2844
+ else if (ch == 0xF4) {
2845
+ LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SINGLE(0x80, 0x8F);
2846
+ }
2847
+ }
2848
+ else {
2849
+ return LXB_ENCODING_DECODE_ERROR;
2850
+ }
2851
+
2852
+ decode:
2853
+
2854
+ for (p = *data; p < end; p++) {
2855
+ ch = *p;
2856
+
2857
+ if (ch < 0x80 || ch > 0xBF) {
2858
+ *data = p;
2859
+
2860
+ goto failed;
2861
+ }
2862
+
2863
+ ctx->codepoint = (ctx->codepoint << 6) | (ch & 0x3F);
2864
+
2865
+ if (--needed == 0) {
2866
+ *data = p + 1;
2867
+
2868
+ return ctx->codepoint;
2869
+ }
2870
+ }
2871
+
2872
+ *data = p;
2873
+
2874
+ next:
2875
+
2876
+ ctx->u.utf_8.need = needed;
2877
+
2878
+ return LXB_ENCODING_DECODE_CONTINUE;
2879
+
2880
+ failed:
2881
+
2882
+ ctx->u.utf_8.lower = 0x00;
2883
+ ctx->u.utf_8.need = 0;
2884
+
2885
+ return LXB_ENCODING_DECODE_ERROR;
2886
+ }
2887
+
2888
+ lxb_codepoint_t
2889
+ lxb_encoding_decode_gb18030_single(lxb_encoding_decode_t *ctx,
2890
+ const lxb_char_t **data, const lxb_char_t *end)
2891
+ {
2892
+ uint32_t pointer;
2893
+ lxb_char_t first, second, third, offset;
2894
+
2895
+ /* Make compiler happy */
2896
+ second = 0x00;
2897
+
2898
+ if (ctx->u.gb18030.first != 0) {
2899
+ if (ctx->u.gb18030.third != 0x00) {
2900
+ first = ctx->u.gb18030.first;
2901
+ second = ctx->u.gb18030.second;
2902
+ third = ctx->u.gb18030.third;
2903
+
2904
+ memset(&ctx->u.gb18030, 0, sizeof(lxb_encoding_ctx_gb18030_t));
2905
+
2906
+ if (ctx->prepend) {
2907
+ /* The first is always < 0x80 */
2908
+ ctx->u.gb18030.first = third;
2909
+
2910
+ return second;
2911
+ }
2912
+
2913
+ goto third_state;
2914
+ }
2915
+ else if (ctx->u.gb18030.second != 0x00) {
2916
+ first = ctx->u.gb18030.first;
2917
+ second = ctx->u.gb18030.second;
2918
+
2919
+ memset(&ctx->u.gb18030, 0, sizeof(lxb_encoding_ctx_gb18030_t));
2920
+
2921
+ goto second_state;
2922
+ }
2923
+
2924
+ first = ctx->u.gb18030.first;
2925
+ ctx->u.gb18030.first = 0x00;
2926
+
2927
+ if (ctx->prepend) {
2928
+ ctx->prepend = false;
2929
+ goto prepend_first;
2930
+ }
2931
+
2932
+ goto first_state;
2933
+ }
2934
+
2935
+ first = *(*data)++;
2936
+
2937
+ prepend_first:
2938
+
2939
+ if (first < 0x80) {
2940
+ return first;
2941
+ }
2942
+
2943
+ if (first == 0x80) {
2944
+ return 0x20AC;
2945
+ }
2946
+
2947
+ /* Range 0x81 to 0xFE, inclusive */
2948
+ if ((unsigned) (first - 0x81) > (0xFE - 0x81)) {
2949
+ return LXB_ENCODING_DECODE_ERROR;
2950
+ }
2951
+
2952
+ if (*data == end) {
2953
+ ctx->u.gb18030.first = first;
2954
+ return LXB_ENCODING_DECODE_CONTINUE;
2955
+ }
2956
+
2957
+ /* First */
2958
+ first_state:
2959
+
2960
+ second = *(*data)++;
2961
+
2962
+ /* Range 0x30 to 0x39, inclusive */
2963
+ if ((unsigned) (second - 0x30) > (0x39 - 0x30)) {
2964
+ offset = (second < 0x7F) ? 0x40 : 0x41;
2965
+
2966
+ /* Range 0x40 to 0x7E, inclusive, or 0x80 to 0xFE, inclusive */
2967
+ if ((unsigned) (second - 0x40) <= (0x7E - 0x40)
2968
+ || (unsigned) (second - 0x80) <= (0xFE - 0x80))
2969
+ {
2970
+ pointer = (first - 0x81) * 190 + (second - offset);
2971
+ }
2972
+ else {
2973
+ goto failed;
2974
+ }
2975
+
2976
+ /* Max pointer value == (0xFE - 0x81) * 190 + (0xFE - 0x41) == 23939 */
2977
+ ctx->codepoint = lxb_encoding_multi_index_gb18030[pointer].codepoint;
2978
+ if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
2979
+ goto failed;
2980
+ }
2981
+
2982
+ return ctx->codepoint;
2983
+ }
2984
+
2985
+ if (*data == end) {
2986
+ ctx->u.gb18030.first = first;
2987
+ ctx->u.gb18030.second = second;
2988
+
2989
+ return LXB_ENCODING_DECODE_CONTINUE;
2990
+ }
2991
+
2992
+ /* Second */
2993
+ second_state:
2994
+
2995
+ third = *(*data)++;
2996
+
2997
+ /* Range 0x81 to 0xFE, inclusive */
2998
+ if ((unsigned) (third - 0x81) > (0xFE - 0x81)) {
2999
+ (*data)--;
3000
+
3001
+ ctx->prepend = true;
3002
+ ctx->u.gb18030.first = second;
3003
+
3004
+ return LXB_ENCODING_DECODE_ERROR;
3005
+ }
3006
+
3007
+ if (*data == end) {
3008
+ ctx->u.gb18030.first = first;
3009
+ ctx->u.gb18030.second = second;
3010
+ ctx->u.gb18030.third = third;
3011
+
3012
+ return LXB_ENCODING_DECODE_CONTINUE;
3013
+ }
3014
+
3015
+ /* Third */
3016
+ third_state:
3017
+
3018
+ /* Range 0x30 to 0x39, inclusive */
3019
+ if ((unsigned) (**data - 0x30) > (0x39 - 0x30)) {
3020
+ ctx->prepend = true;
3021
+
3022
+ /* First is a fake for trigger */
3023
+ ctx->u.gb18030.first = 0x01;
3024
+ ctx->u.gb18030.second = second;
3025
+ ctx->u.gb18030.third = third;
3026
+
3027
+ return LXB_ENCODING_DECODE_ERROR;
3028
+ }
3029
+
3030
+ pointer = ((first - 0x81) * (10 * 126 * 10))
3031
+ + ((second - 0x30) * (10 * 126))
3032
+ + ((third - 0x81) * 10) + (*(*data)++) - 0x30;
3033
+
3034
+ return lxb_encoding_decode_gb18030_range(pointer);
3035
+
3036
+ failed:
3037
+
3038
+ if (second < 0x80) {
3039
+ (*data)--;
3040
+ }
3041
+
3042
+ return LXB_ENCODING_DECODE_ERROR;
3043
+ }
3044
+
3045
+ lxb_codepoint_t
3046
+ lxb_encoding_decode_macintosh_single(lxb_encoding_decode_t *ctx,
3047
+ const lxb_char_t **data, const lxb_char_t *end)
3048
+ {
3049
+ if (**data < 0x80) {
3050
+ return *(*data)++;
3051
+ }
3052
+
3053
+ return lxb_encoding_single_index_macintosh[*(*data)++ - 0x80].codepoint;
3054
+ }
3055
+
3056
+ lxb_codepoint_t
3057
+ lxb_encoding_decode_replacement_single(lxb_encoding_decode_t *ctx,
3058
+ const lxb_char_t **data, const lxb_char_t *end)
3059
+ {
3060
+ return LXB_ENCODING_DECODE_ERROR;
3061
+ }
3062
+
3063
+ lxb_codepoint_t
3064
+ lxb_encoding_decode_windows_1250_single(lxb_encoding_decode_t *ctx,
3065
+ const lxb_char_t **data, const lxb_char_t *end)
3066
+ {
3067
+ if (**data < 0x80) {
3068
+ return *(*data)++;
3069
+ }
3070
+
3071
+ return lxb_encoding_single_index_windows_1250[*(*data)++ - 0x80].codepoint;
3072
+ }
3073
+
3074
+ lxb_codepoint_t
3075
+ lxb_encoding_decode_windows_1251_single(lxb_encoding_decode_t *ctx,
3076
+ const lxb_char_t **data, const lxb_char_t *end)
3077
+ {
3078
+ if (**data < 0x80) {
3079
+ return *(*data)++;
3080
+ }
3081
+
3082
+ return lxb_encoding_single_index_windows_1251[*(*data)++ - 0x80].codepoint;
3083
+ }
3084
+
3085
+ lxb_codepoint_t
3086
+ lxb_encoding_decode_windows_1252_single(lxb_encoding_decode_t *ctx,
3087
+ const lxb_char_t **data, const lxb_char_t *end)
3088
+ {
3089
+ if (**data < 0x80) {
3090
+ return *(*data)++;
3091
+ }
3092
+
3093
+ return lxb_encoding_single_index_windows_1252[*(*data)++ - 0x80].codepoint;
3094
+ }
3095
+
3096
+ lxb_codepoint_t
3097
+ lxb_encoding_decode_windows_1253_single(lxb_encoding_decode_t *ctx,
3098
+ const lxb_char_t **data, const lxb_char_t *end)
3099
+ {
3100
+ if (**data < 0x80) {
3101
+ return *(*data)++;
3102
+ }
3103
+
3104
+ return lxb_encoding_single_index_windows_1253[*(*data)++ - 0x80].codepoint;
3105
+ }
3106
+
3107
+ lxb_codepoint_t
3108
+ lxb_encoding_decode_windows_1254_single(lxb_encoding_decode_t *ctx,
3109
+ const lxb_char_t **data, const lxb_char_t *end)
3110
+ {
3111
+ if (**data < 0x80) {
3112
+ return *(*data)++;
3113
+ }
3114
+
3115
+ return lxb_encoding_single_index_windows_1254[*(*data)++ - 0x80].codepoint;
3116
+ }
3117
+
3118
+ lxb_codepoint_t
3119
+ lxb_encoding_decode_windows_1255_single(lxb_encoding_decode_t *ctx,
3120
+ const lxb_char_t **data, const lxb_char_t *end)
3121
+ {
3122
+ if (**data < 0x80) {
3123
+ return *(*data)++;
3124
+ }
3125
+
3126
+ return lxb_encoding_single_index_windows_1255[*(*data)++ - 0x80].codepoint;
3127
+ }
3128
+
3129
+ lxb_codepoint_t
3130
+ lxb_encoding_decode_windows_1256_single(lxb_encoding_decode_t *ctx,
3131
+ const lxb_char_t **data, const lxb_char_t *end)
3132
+ {
3133
+ if (**data < 0x80) {
3134
+ return *(*data)++;
3135
+ }
3136
+
3137
+ return lxb_encoding_single_index_windows_1256[*(*data)++ - 0x80].codepoint;
3138
+ }
3139
+
3140
+ lxb_codepoint_t
3141
+ lxb_encoding_decode_windows_1257_single(lxb_encoding_decode_t *ctx,
3142
+ const lxb_char_t **data, const lxb_char_t *end)
3143
+ {
3144
+ if (**data < 0x80) {
3145
+ return *(*data)++;
3146
+ }
3147
+
3148
+ return lxb_encoding_single_index_windows_1257[*(*data)++ - 0x80].codepoint;
3149
+ }
3150
+
3151
+ lxb_codepoint_t
3152
+ lxb_encoding_decode_windows_1258_single(lxb_encoding_decode_t *ctx,
3153
+ const lxb_char_t **data, const lxb_char_t *end)
3154
+ {
3155
+ if (**data < 0x80) {
3156
+ return *(*data)++;
3157
+ }
3158
+
3159
+ return lxb_encoding_single_index_windows_1258[*(*data)++ - 0x80].codepoint;
3160
+ }
3161
+
3162
+ lxb_codepoint_t
3163
+ lxb_encoding_decode_windows_874_single(lxb_encoding_decode_t *ctx,
3164
+ const lxb_char_t **data, const lxb_char_t *end)
3165
+ {
3166
+ if (**data < 0x80) {
3167
+ return *(*data)++;
3168
+ }
3169
+
3170
+ return lxb_encoding_single_index_windows_874[*(*data)++ - 0x80].codepoint;
3171
+ }
3172
+
3173
+ lxb_codepoint_t
3174
+ lxb_encoding_decode_x_mac_cyrillic_single(lxb_encoding_decode_t *ctx,
3175
+ const lxb_char_t **data, const lxb_char_t *end)
3176
+ {
3177
+ if (**data < 0x80) {
3178
+ return *(*data)++;
3179
+ }
3180
+
3181
+ return lxb_encoding_single_index_x_mac_cyrillic[*(*data)++ - 0x80].codepoint;
3182
+ }
3183
+
3184
+ lxb_codepoint_t
3185
+ lxb_encoding_decode_x_user_defined_single(lxb_encoding_decode_t *ctx,
3186
+ const lxb_char_t **data, const lxb_char_t *end)
3187
+ {
3188
+ if (**data < 0x80) {
3189
+ return *(*data)++;
3190
+ }
3191
+
3192
+ return 0xF780 + (*(*data)++) - 0x80;
3193
+ }