nokolexbor 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/ext/nokolexbor/config.h +186 -0
- data/ext/nokolexbor/extconf.rb +131 -0
- data/ext/nokolexbor/libxml/HTMLparser.h +320 -0
- data/ext/nokolexbor/libxml/SAX2.h +173 -0
- data/ext/nokolexbor/libxml/chvalid.h +230 -0
- data/ext/nokolexbor/libxml/debugXML.h +217 -0
- data/ext/nokolexbor/libxml/dict.h +81 -0
- data/ext/nokolexbor/libxml/encoding.h +232 -0
- data/ext/nokolexbor/libxml/entities.h +153 -0
- data/ext/nokolexbor/libxml/globals.h +529 -0
- data/ext/nokolexbor/libxml/hash.h +236 -0
- data/ext/nokolexbor/libxml/list.h +137 -0
- data/ext/nokolexbor/libxml/parser.h +1264 -0
- data/ext/nokolexbor/libxml/parserInternals.h +641 -0
- data/ext/nokolexbor/libxml/pattern.h +100 -0
- data/ext/nokolexbor/libxml/threads.h +94 -0
- data/ext/nokolexbor/libxml/tree.h +1315 -0
- data/ext/nokolexbor/libxml/uri.h +94 -0
- data/ext/nokolexbor/libxml/valid.h +448 -0
- data/ext/nokolexbor/libxml/xmlIO.h +369 -0
- data/ext/nokolexbor/libxml/xmlautomata.h +146 -0
- data/ext/nokolexbor/libxml/xmlerror.h +919 -0
- data/ext/nokolexbor/libxml/xmlexports.h +79 -0
- data/ext/nokolexbor/libxml/xmlmemory.h +226 -0
- data/ext/nokolexbor/libxml/xmlregexp.h +222 -0
- data/ext/nokolexbor/libxml/xmlstring.h +140 -0
- data/ext/nokolexbor/libxml/xmlversion.h +526 -0
- data/ext/nokolexbor/libxml/xpath.h +575 -0
- data/ext/nokolexbor/libxml/xpathInternals.h +632 -0
- data/ext/nokolexbor/libxml/xpointer.h +137 -0
- data/ext/nokolexbor/libxml.h +76 -0
- data/ext/nokolexbor/memory.c +39 -0
- data/ext/nokolexbor/nl_document.c +51 -0
- data/ext/nokolexbor/nl_node.c +790 -0
- data/ext/nokolexbor/nl_node_set.c +368 -0
- data/ext/nokolexbor/nl_xpath_context.c +200 -0
- data/ext/nokolexbor/nokolexbor.c +63 -0
- data/ext/nokolexbor/nokolexbor.h +37 -0
- data/ext/nokolexbor/private/buf.h +70 -0
- data/ext/nokolexbor/private/dict.h +11 -0
- data/ext/nokolexbor/private/enc.h +17 -0
- data/ext/nokolexbor/private/error.h +21 -0
- data/ext/nokolexbor/private/globals.h +9 -0
- data/ext/nokolexbor/private/memory.h +9 -0
- data/ext/nokolexbor/private/parser.h +27 -0
- data/ext/nokolexbor/private/string.h +9 -0
- data/ext/nokolexbor/private/threads.h +50 -0
- data/ext/nokolexbor/private/tree.h +18 -0
- data/ext/nokolexbor/private/xpath.h +7 -0
- data/ext/nokolexbor/timsort.h +601 -0
- data/ext/nokolexbor/xml_SAX2.c +80 -0
- data/ext/nokolexbor/xml_buf.c +363 -0
- data/ext/nokolexbor/xml_chvalid.c +334 -0
- data/ext/nokolexbor/xml_dict.c +1264 -0
- data/ext/nokolexbor/xml_encoding.c +124 -0
- data/ext/nokolexbor/xml_error.c +134 -0
- data/ext/nokolexbor/xml_globals.c +1085 -0
- data/ext/nokolexbor/xml_hash.c +1141 -0
- data/ext/nokolexbor/xml_memory.c +203 -0
- data/ext/nokolexbor/xml_parser.c +127 -0
- data/ext/nokolexbor/xml_parserInternals.c +338 -0
- data/ext/nokolexbor/xml_pattern.c +2375 -0
- data/ext/nokolexbor/xml_string.c +1051 -0
- data/ext/nokolexbor/xml_threads.c +881 -0
- data/ext/nokolexbor/xml_tree.c +148 -0
- data/ext/nokolexbor/xml_xpath.c +14743 -0
- data/lib/nokolexbor/attribute.rb +18 -0
- data/lib/nokolexbor/document.rb +6 -0
- data/lib/nokolexbor/node.rb +264 -0
- data/lib/nokolexbor/node_set.rb +124 -0
- data/lib/nokolexbor/version.rb +5 -0
- data/lib/nokolexbor/xpath_context.rb +14 -0
- data/lib/nokolexbor.rb +17 -0
- data/patches/0001-lexbor-support-text-pseudo-element.patch +137 -0
- data/patches/0002-lexbor-match-id-class-case-sensitive.patch +22 -0
- data/patches/0003-lexbor-attach-template-content-to-self.patch +13 -0
- data/vendor/lexbor/CMakeLists.txt +331 -0
- data/vendor/lexbor/config.cmake +890 -0
- data/vendor/lexbor/feature.cmake +134 -0
- data/vendor/lexbor/source/lexbor/core/array.c +208 -0
- data/vendor/lexbor/source/lexbor/core/array.h +100 -0
- data/vendor/lexbor/source/lexbor/core/array_obj.c +216 -0
- data/vendor/lexbor/source/lexbor/core/array_obj.h +134 -0
- data/vendor/lexbor/source/lexbor/core/avl.c +442 -0
- data/vendor/lexbor/source/lexbor/core/avl.h +82 -0
- data/vendor/lexbor/source/lexbor/core/base.h +86 -0
- data/vendor/lexbor/source/lexbor/core/bst.c +468 -0
- data/vendor/lexbor/source/lexbor/core/bst.h +108 -0
- data/vendor/lexbor/source/lexbor/core/bst_map.c +238 -0
- data/vendor/lexbor/source/lexbor/core/bst_map.h +87 -0
- data/vendor/lexbor/source/lexbor/core/config.cmake +12 -0
- data/vendor/lexbor/source/lexbor/core/conv.c +203 -0
- data/vendor/lexbor/source/lexbor/core/conv.h +53 -0
- data/vendor/lexbor/source/lexbor/core/core.h +35 -0
- data/vendor/lexbor/source/lexbor/core/def.h +57 -0
- data/vendor/lexbor/source/lexbor/core/diyfp.c +153 -0
- data/vendor/lexbor/source/lexbor/core/diyfp.h +258 -0
- data/vendor/lexbor/source/lexbor/core/dobject.c +187 -0
- data/vendor/lexbor/source/lexbor/core/dobject.h +92 -0
- data/vendor/lexbor/source/lexbor/core/dtoa.c +404 -0
- data/vendor/lexbor/source/lexbor/core/dtoa.h +28 -0
- data/vendor/lexbor/source/lexbor/core/fs.h +60 -0
- data/vendor/lexbor/source/lexbor/core/hash.c +476 -0
- data/vendor/lexbor/source/lexbor/core/hash.h +218 -0
- data/vendor/lexbor/source/lexbor/core/in.c +267 -0
- data/vendor/lexbor/source/lexbor/core/in.h +172 -0
- data/vendor/lexbor/source/lexbor/core/lexbor.h +35 -0
- data/vendor/lexbor/source/lexbor/core/mem.c +228 -0
- data/vendor/lexbor/source/lexbor/core/mem.h +141 -0
- data/vendor/lexbor/source/lexbor/core/mraw.c +428 -0
- data/vendor/lexbor/source/lexbor/core/mraw.h +114 -0
- data/vendor/lexbor/source/lexbor/core/perf.h +45 -0
- data/vendor/lexbor/source/lexbor/core/plog.c +73 -0
- data/vendor/lexbor/source/lexbor/core/plog.h +102 -0
- data/vendor/lexbor/source/lexbor/core/print.c +168 -0
- data/vendor/lexbor/source/lexbor/core/print.h +39 -0
- data/vendor/lexbor/source/lexbor/core/sbst.h +59 -0
- data/vendor/lexbor/source/lexbor/core/serialize.c +27 -0
- data/vendor/lexbor/source/lexbor/core/serialize.h +32 -0
- data/vendor/lexbor/source/lexbor/core/shs.c +118 -0
- data/vendor/lexbor/source/lexbor/core/shs.h +82 -0
- data/vendor/lexbor/source/lexbor/core/str.c +617 -0
- data/vendor/lexbor/source/lexbor/core/str.h +247 -0
- data/vendor/lexbor/source/lexbor/core/str_res.h +369 -0
- data/vendor/lexbor/source/lexbor/core/strtod.c +326 -0
- data/vendor/lexbor/source/lexbor/core/strtod.h +28 -0
- data/vendor/lexbor/source/lexbor/core/types.h +39 -0
- data/vendor/lexbor/source/lexbor/core/utils.c +43 -0
- data/vendor/lexbor/source/lexbor/core/utils.h +36 -0
- data/vendor/lexbor/source/lexbor/css/base.h +44 -0
- data/vendor/lexbor/source/lexbor/css/config.cmake +2 -0
- data/vendor/lexbor/source/lexbor/css/css.h +25 -0
- data/vendor/lexbor/source/lexbor/css/log.c +336 -0
- data/vendor/lexbor/source/lexbor/css/log.h +103 -0
- data/vendor/lexbor/source/lexbor/css/node.h +29 -0
- data/vendor/lexbor/source/lexbor/css/parser.c +473 -0
- data/vendor/lexbor/source/lexbor/css/parser.h +368 -0
- data/vendor/lexbor/source/lexbor/css/selectors/base.h +48 -0
- data/vendor/lexbor/source/lexbor/css/selectors/pseudo.c +91 -0
- data/vendor/lexbor/source/lexbor/css/selectors/pseudo.h +66 -0
- data/vendor/lexbor/source/lexbor/css/selectors/pseudo_const.h +109 -0
- data/vendor/lexbor/source/lexbor/css/selectors/pseudo_res.h +302 -0
- data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.c +279 -0
- data/vendor/lexbor/source/lexbor/css/selectors/pseudo_state.h +85 -0
- data/vendor/lexbor/source/lexbor/css/selectors/selector.c +927 -0
- data/vendor/lexbor/source/lexbor/css/selectors/selector.h +200 -0
- data/vendor/lexbor/source/lexbor/css/selectors/selectors.c +340 -0
- data/vendor/lexbor/source/lexbor/css/selectors/selectors.h +137 -0
- data/vendor/lexbor/source/lexbor/css/selectors/state.c +1718 -0
- data/vendor/lexbor/source/lexbor/css/selectors/state.h +79 -0
- data/vendor/lexbor/source/lexbor/css/stylesheet.h +37 -0
- data/vendor/lexbor/source/lexbor/css/syntax/anb.c +443 -0
- data/vendor/lexbor/source/lexbor/css/syntax/anb.h +45 -0
- data/vendor/lexbor/source/lexbor/css/syntax/base.h +33 -0
- data/vendor/lexbor/source/lexbor/css/syntax/parser.c +9 -0
- data/vendor/lexbor/source/lexbor/css/syntax/parser.h +25 -0
- data/vendor/lexbor/source/lexbor/css/syntax/res.h +48 -0
- data/vendor/lexbor/source/lexbor/css/syntax/state.c +2603 -0
- data/vendor/lexbor/source/lexbor/css/syntax/state.h +140 -0
- data/vendor/lexbor/source/lexbor/css/syntax/state_res.h +273 -0
- data/vendor/lexbor/source/lexbor/css/syntax/syntax.c +67 -0
- data/vendor/lexbor/source/lexbor/css/syntax/token.c +618 -0
- data/vendor/lexbor/source/lexbor/css/syntax/token.h +298 -0
- data/vendor/lexbor/source/lexbor/css/syntax/token_res.h +68 -0
- data/vendor/lexbor/source/lexbor/css/syntax/tokenizer/error.c +30 -0
- data/vendor/lexbor/source/lexbor/css/syntax/tokenizer/error.h +58 -0
- data/vendor/lexbor/source/lexbor/css/syntax/tokenizer.c +278 -0
- data/vendor/lexbor/source/lexbor/css/syntax/tokenizer.h +121 -0
- data/vendor/lexbor/source/lexbor/dom/base.h +32 -0
- data/vendor/lexbor/source/lexbor/dom/collection.c +97 -0
- data/vendor/lexbor/source/lexbor/dom/collection.h +112 -0
- data/vendor/lexbor/source/lexbor/dom/config.cmake +3 -0
- data/vendor/lexbor/source/lexbor/dom/dom.h +29 -0
- data/vendor/lexbor/source/lexbor/dom/exception.c +18 -0
- data/vendor/lexbor/source/lexbor/dom/exception.h +73 -0
- data/vendor/lexbor/source/lexbor/dom/interface.c +110 -0
- data/vendor/lexbor/source/lexbor/dom/interface.h +88 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/attr.c +445 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/attr.h +152 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/attr_const.h +62 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/attr_res.h +143 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/cdata_section.c +55 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/cdata_section.h +38 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/character_data.c +110 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/character_data.h +51 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/comment.c +64 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/comment.h +42 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/document.c +536 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/document.h +243 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/document_fragment.c +36 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/document_fragment.h +36 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/document_type.c +125 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/document_type.h +108 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/element.c +1411 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/element.h +319 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/event_target.c +32 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/event_target.h +34 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/node.c +661 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/node.h +192 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/processing_instruction.c +87 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/processing_instruction.h +66 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/shadow_root.c +36 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/shadow_root.h +44 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/text.c +63 -0
- data/vendor/lexbor/source/lexbor/dom/interfaces/text.h +42 -0
- data/vendor/lexbor/source/lexbor/encoding/base.h +218 -0
- data/vendor/lexbor/source/lexbor/encoding/big5.c +42839 -0
- data/vendor/lexbor/source/lexbor/encoding/config.cmake +12 -0
- data/vendor/lexbor/source/lexbor/encoding/const.h +65 -0
- data/vendor/lexbor/source/lexbor/encoding/decode.c +3193 -0
- data/vendor/lexbor/source/lexbor/encoding/decode.h +370 -0
- data/vendor/lexbor/source/lexbor/encoding/encode.c +1931 -0
- data/vendor/lexbor/source/lexbor/encoding/encode.h +377 -0
- data/vendor/lexbor/source/lexbor/encoding/encoding.c +252 -0
- data/vendor/lexbor/source/lexbor/encoding/encoding.h +475 -0
- data/vendor/lexbor/source/lexbor/encoding/euc_kr.c +53883 -0
- data/vendor/lexbor/source/lexbor/encoding/gb18030.c +47905 -0
- data/vendor/lexbor/source/lexbor/encoding/iso_2022_jp_katakana.c +159 -0
- data/vendor/lexbor/source/lexbor/encoding/jis0208.c +22477 -0
- data/vendor/lexbor/source/lexbor/encoding/jis0212.c +15787 -0
- data/vendor/lexbor/source/lexbor/encoding/multi.h +53 -0
- data/vendor/lexbor/source/lexbor/encoding/range.c +71 -0
- data/vendor/lexbor/source/lexbor/encoding/range.h +34 -0
- data/vendor/lexbor/source/lexbor/encoding/res.c +222 -0
- data/vendor/lexbor/source/lexbor/encoding/res.h +34 -0
- data/vendor/lexbor/source/lexbor/encoding/single.c +13748 -0
- data/vendor/lexbor/source/lexbor/encoding/single.h +116 -0
- data/vendor/lexbor/source/lexbor/html/base.h +44 -0
- data/vendor/lexbor/source/lexbor/html/config.cmake +3 -0
- data/vendor/lexbor/source/lexbor/html/encoding.c +574 -0
- data/vendor/lexbor/source/lexbor/html/encoding.h +106 -0
- data/vendor/lexbor/source/lexbor/html/html.h +107 -0
- data/vendor/lexbor/source/lexbor/html/interface.c +165 -0
- data/vendor/lexbor/source/lexbor/html/interface.h +186 -0
- data/vendor/lexbor/source/lexbor/html/interface_res.h +4449 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/anchor_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/anchor_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/area_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/area_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/audio_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/audio_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/base_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/base_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/body_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/body_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/br_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/br_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/button_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/button_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/canvas_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/canvas_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/d_list_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/d_list_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/data_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/data_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/data_list_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/data_list_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/details_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/details_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/dialog_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/dialog_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/directory_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/directory_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/div_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/div_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/document.c +444 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/document.h +256 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/element.c +64 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/element.h +54 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/embed_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/embed_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/field_set_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/field_set_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/font_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/font_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/form_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/form_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/frame_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/frame_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/frame_set_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/frame_set_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/head_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/head_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/heading_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/heading_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/hr_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/hr_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/html_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/html_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/iframe_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/iframe_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/image_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/image_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/input_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/input_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/label_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/label_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/legend_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/legend_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/li_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/li_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/link_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/link_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/map_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/map_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/marquee_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/marquee_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/media_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/media_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/menu_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/menu_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/meta_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/meta_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/meter_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/meter_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/mod_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/mod_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/o_list_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/o_list_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/object_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/object_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/opt_group_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/opt_group_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/option_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/option_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/output_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/output_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/paragraph_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/paragraph_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/param_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/param_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/picture_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/picture_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/pre_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/pre_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/progress_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/progress_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/quote_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/quote_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/script_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/script_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/select_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/select_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/slot_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/slot_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/source_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/source_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/span_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/span_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/style_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/style_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/table_caption_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/table_caption_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/table_cell_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/table_cell_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/table_col_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/table_col_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/table_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/table_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/table_row_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/table_row_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/table_section_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/table_section_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/template_element.c +46 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/template_element.h +38 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/text_area_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/text_area_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/time_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/time_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/title_element.c +133 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/title_element.h +42 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/track_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/track_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/u_list_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/u_list_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/unknown_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/unknown_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/video_element.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/video_element.h +34 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/window.c +36 -0
- data/vendor/lexbor/source/lexbor/html/interfaces/window.h +34 -0
- data/vendor/lexbor/source/lexbor/html/node.c +14 -0
- data/vendor/lexbor/source/lexbor/html/node.h +67 -0
- data/vendor/lexbor/source/lexbor/html/parser.c +469 -0
- data/vendor/lexbor/source/lexbor/html/parser.h +170 -0
- data/vendor/lexbor/source/lexbor/html/serialize.c +1510 -0
- data/vendor/lexbor/source/lexbor/html/serialize.h +93 -0
- data/vendor/lexbor/source/lexbor/html/tag.h +103 -0
- data/vendor/lexbor/source/lexbor/html/tag_res.h +2262 -0
- data/vendor/lexbor/source/lexbor/html/token.c +386 -0
- data/vendor/lexbor/source/lexbor/html/token.h +130 -0
- data/vendor/lexbor/source/lexbor/html/token_attr.c +44 -0
- data/vendor/lexbor/source/lexbor/html/token_attr.h +67 -0
- data/vendor/lexbor/source/lexbor/html/tokenizer/error.c +28 -0
- data/vendor/lexbor/source/lexbor/html/tokenizer/error.h +141 -0
- data/vendor/lexbor/source/lexbor/html/tokenizer/res.h +4956 -0
- data/vendor/lexbor/source/lexbor/html/tokenizer/state.c +2171 -0
- data/vendor/lexbor/source/lexbor/html/tokenizer/state.h +225 -0
- data/vendor/lexbor/source/lexbor/html/tokenizer/state_comment.c +489 -0
- data/vendor/lexbor/source/lexbor/html/tokenizer/state_comment.h +27 -0
- data/vendor/lexbor/source/lexbor/html/tokenizer/state_doctype.c +1654 -0
- data/vendor/lexbor/source/lexbor/html/tokenizer/state_doctype.h +27 -0
- data/vendor/lexbor/source/lexbor/html/tokenizer/state_rawtext.c +303 -0
- data/vendor/lexbor/source/lexbor/html/tokenizer/state_rawtext.h +32 -0
- data/vendor/lexbor/source/lexbor/html/tokenizer/state_rcdata.c +311 -0
- data/vendor/lexbor/source/lexbor/html/tokenizer/state_rcdata.h +32 -0
- data/vendor/lexbor/source/lexbor/html/tokenizer/state_script.c +1209 -0
- data/vendor/lexbor/source/lexbor/html/tokenizer/state_script.h +32 -0
- data/vendor/lexbor/source/lexbor/html/tokenizer.c +499 -0
- data/vendor/lexbor/source/lexbor/html/tokenizer.h +343 -0
- data/vendor/lexbor/source/lexbor/html/tree/active_formatting.c +241 -0
- data/vendor/lexbor/source/lexbor/html/tree/active_formatting.h +117 -0
- data/vendor/lexbor/source/lexbor/html/tree/error.c +26 -0
- data/vendor/lexbor/source/lexbor/html/tree/error.h +114 -0
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/after_after_body.c +62 -0
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/after_after_frameset.c +63 -0
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/after_body.c +82 -0
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/after_frameset.c +88 -0
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/after_head.c +222 -0
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/before_head.c +144 -0
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/before_html.c +166 -0
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/foreign_content.c +358 -0
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_body.c +1974 -0
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_caption.c +158 -0
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_cell.c +187 -0
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_column_group.c +194 -0
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_frameset.c +149 -0
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_head.c +374 -0
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_head_noscript.c +121 -0
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_row.c +211 -0
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_select.c +341 -0
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_select_in_table.c +115 -0
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_table.c +451 -0
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_table_body.c +208 -0
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_table_text.c +127 -0
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/in_template.c +189 -0
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/initial.c +411 -0
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode/text.c +61 -0
- data/vendor/lexbor/source/lexbor/html/tree/insertion_mode.h +135 -0
- data/vendor/lexbor/source/lexbor/html/tree/open_elements.c +251 -0
- data/vendor/lexbor/source/lexbor/html/tree/open_elements.h +105 -0
- data/vendor/lexbor/source/lexbor/html/tree/template_insertion.c +10 -0
- data/vendor/lexbor/source/lexbor/html/tree/template_insertion.h +100 -0
- data/vendor/lexbor/source/lexbor/html/tree.c +1726 -0
- data/vendor/lexbor/source/lexbor/html/tree.h +431 -0
- data/vendor/lexbor/source/lexbor/html/tree_res.h +111 -0
- data/vendor/lexbor/source/lexbor/ns/base.h +32 -0
- data/vendor/lexbor/source/lexbor/ns/config.cmake +2 -0
- data/vendor/lexbor/source/lexbor/ns/const.h +37 -0
- data/vendor/lexbor/source/lexbor/ns/ns.c +154 -0
- data/vendor/lexbor/source/lexbor/ns/ns.h +66 -0
- data/vendor/lexbor/source/lexbor/ns/res.h +97 -0
- data/vendor/lexbor/source/lexbor/ports/posix/config.cmake +11 -0
- data/vendor/lexbor/source/lexbor/ports/posix/lexbor/core/fs.c +236 -0
- data/vendor/lexbor/source/lexbor/ports/posix/lexbor/core/memory.c +33 -0
- data/vendor/lexbor/source/lexbor/ports/posix/lexbor/core/perf.c +158 -0
- data/vendor/lexbor/source/lexbor/ports/windows_nt/config.cmake +18 -0
- data/vendor/lexbor/source/lexbor/ports/windows_nt/lexbor/core/fs.c +239 -0
- data/vendor/lexbor/source/lexbor/ports/windows_nt/lexbor/core/memory.c +33 -0
- data/vendor/lexbor/source/lexbor/ports/windows_nt/lexbor/core/perf.c +81 -0
- data/vendor/lexbor/source/lexbor/selectors/base.h +30 -0
- data/vendor/lexbor/source/lexbor/selectors/config.cmake +2 -0
- data/vendor/lexbor/source/lexbor/selectors/selectors.c +1591 -0
- data/vendor/lexbor/source/lexbor/selectors/selectors.h +71 -0
- data/vendor/lexbor/source/lexbor/tag/base.h +32 -0
- data/vendor/lexbor/source/lexbor/tag/config.cmake +2 -0
- data/vendor/lexbor/source/lexbor/tag/const.h +225 -0
- data/vendor/lexbor/source/lexbor/tag/res.h +562 -0
- data/vendor/lexbor/source/lexbor/tag/tag.c +144 -0
- data/vendor/lexbor/source/lexbor/tag/tag.h +123 -0
- data/vendor/lexbor/source/lexbor/utils/base.h +32 -0
- data/vendor/lexbor/source/lexbor/utils/config.cmake +2 -0
- data/vendor/lexbor/source/lexbor/utils/http.c +534 -0
- data/vendor/lexbor/source/lexbor/utils/http.h +90 -0
- data/vendor/lexbor/source/lexbor/utils/utils.h +15 -0
- data/vendor/lexbor/source/lexbor/utils/warc.c +817 -0
- data/vendor/lexbor/source/lexbor/utils/warc.h +126 -0
- data/vendor/lexbor/utils/lexbor/css/selectors/pseudo.py +231 -0
- data/vendor/lexbor/utils/lexbor/css/selectors/tmp/const.h +21 -0
- data/vendor/lexbor/utils/lexbor/css/selectors/tmp/res.h +26 -0
- data/vendor/lexbor/utils/lexbor/css/syntax/definitions.py +49 -0
- data/vendor/lexbor/utils/lexbor/css/syntax/token_res.py +54 -0
- data/vendor/lexbor/utils/lexbor/css/syntax/tokenizer_code_map.py +36 -0
- data/vendor/lexbor/version +1 -0
- metadata +542 -0
|
@@ -0,0 +1,2171 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (C) 2018-2020 Alexander Borisov
|
|
3
|
+
*
|
|
4
|
+
* Author: Alexander Borisov <borisov@lexbor.com>
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
#include "lexbor/html/tokenizer/state.h"
|
|
8
|
+
#include "lexbor/html/tokenizer/state_comment.h"
|
|
9
|
+
#include "lexbor/html/tokenizer/state_doctype.h"
|
|
10
|
+
|
|
11
|
+
#define LEXBOR_STR_RES_ANSI_REPLACEMENT_CHARACTER
|
|
12
|
+
#define LEXBOR_STR_RES_ALPHANUMERIC_CHARACTER
|
|
13
|
+
#define LEXBOR_STR_RES_REPLACEMENT_CHARACTER
|
|
14
|
+
#define LEXBOR_STR_RES_ALPHA_CHARACTER
|
|
15
|
+
#define LEXBOR_STR_RES_MAP_HEX
|
|
16
|
+
#define LEXBOR_STR_RES_MAP_NUM
|
|
17
|
+
#include "lexbor/core/str_res.h"
|
|
18
|
+
|
|
19
|
+
#define LXB_HTML_TOKENIZER_RES_ENTITIES_SBST
|
|
20
|
+
#include "lexbor/html/tokenizer/res.h"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
/*
 * Prototypes for two functions with external linkage that are defined
 * outside this file. They are declared locally here instead of coming from
 * a header.
 * NOTE(review): presumably used by the tag/attribute name helpers further
 * down in this file -- confirm against the callers.
 */
const lxb_tag_data_t *
lxb_tag_append_lower(lexbor_hash_t *hash,
                     const lxb_char_t *name, size_t length);

lxb_dom_attr_data_t *
lxb_dom_attr_local_name_append(lexbor_hash_t *hash,
                               const lxb_char_t *name, size_t length);
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
/*
 * Forward declarations of the file-local tokenizer state handlers.
 *
 * All handlers share one signature: the tokenizer, the current read
 * position and the end of the input chunk, returning a
 * `const lxb_char_t *`. They are declared up front because handlers assign
 * each other to `tkz->state` before the assigned handler is defined.
 */
static const lxb_char_t *
lxb_html_tokenizer_state_data(lxb_html_tokenizer_t *tkz,
                              const lxb_char_t *data, const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_plaintext(lxb_html_tokenizer_t *tkz,
                                   const lxb_char_t *data,
                                   const lxb_char_t *end);

/* Tag */
static const lxb_char_t *
lxb_html_tokenizer_state_tag_open(lxb_html_tokenizer_t *tkz,
                                  const lxb_char_t *data,
                                  const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_end_tag_open(lxb_html_tokenizer_t *tkz,
                                      const lxb_char_t *data,
                                      const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_tag_name(lxb_html_tokenizer_t *tkz,
                                  const lxb_char_t *data,
                                  const lxb_char_t *end);

/* Attribute */
static const lxb_char_t *
lxb_html_tokenizer_state_attribute_name(lxb_html_tokenizer_t *tkz,
                                        const lxb_char_t *data,
                                        const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_after_attribute_name(lxb_html_tokenizer_t *tkz,
                                              const lxb_char_t *data,
                                              const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_before_attribute_value(lxb_html_tokenizer_t *tkz,
                                                const lxb_char_t *data,
                                                const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_attribute_value_double_quoted(lxb_html_tokenizer_t *tkz,
                                                       const lxb_char_t *data,
                                                       const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_attribute_value_single_quoted(lxb_html_tokenizer_t *tkz,
                                                       const lxb_char_t *data,
                                                       const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_attribute_value_unquoted(lxb_html_tokenizer_t *tkz,
                                                  const lxb_char_t *data,
                                                  const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_after_attribute_value_quoted(lxb_html_tokenizer_t *tkz,
                                                      const lxb_char_t *data,
                                                      const lxb_char_t *end);

/* Bogus comment */
static const lxb_char_t *
lxb_html_tokenizer_state_bogus_comment_before(lxb_html_tokenizer_t *tkz,
                                              const lxb_char_t *data,
                                              const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_bogus_comment(lxb_html_tokenizer_t *tkz,
                                       const lxb_char_t *data,
                                       const lxb_char_t *end);

/* Markup declaration */
static const lxb_char_t *
lxb_html_tokenizer_state_markup_declaration_open(lxb_html_tokenizer_t *tkz,
                                                 const lxb_char_t *data,
                                                 const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_markup_declaration_comment(lxb_html_tokenizer_t *tkz,
                                                    const lxb_char_t *data,
                                                    const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_markup_declaration_doctype(lxb_html_tokenizer_t *tkz,
                                                    const lxb_char_t *data,
                                                    const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_markup_declaration_cdata(lxb_html_tokenizer_t *tkz,
                                                  const lxb_char_t *data,
                                                  const lxb_char_t *end);

/* CDATA Section */
static const lxb_char_t *
lxb_html_tokenizer_state_cdata_section_before(lxb_html_tokenizer_t *tkz,
                                              const lxb_char_t *data,
                                              const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_cdata_section(lxb_html_tokenizer_t *tkz,
                                       const lxb_char_t *data,
                                       const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_cdata_section_bracket(lxb_html_tokenizer_t *tkz,
                                               const lxb_char_t *data,
                                               const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_cdata_section_end(lxb_html_tokenizer_t *tkz,
                                           const lxb_char_t *data,
                                           const lxb_char_t *end);

/* Character reference */
static const lxb_char_t *
lxb_html_tokenizer_state_char_ref_attr(lxb_html_tokenizer_t *tkz,
                                       const lxb_char_t *data,
                                       const lxb_char_t *end);

static const lxb_char_t *
_lxb_html_tokenizer_state_char_ref(lxb_html_tokenizer_t *tkz,
                                   const lxb_char_t *data,
                                   const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_char_ref_named(lxb_html_tokenizer_t *tkz,
                                        const lxb_char_t *data,
                                        const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_char_ref_ambiguous_ampersand(lxb_html_tokenizer_t *tkz,
                                                      const lxb_char_t *data,
                                                      const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_char_ref_numeric(lxb_html_tokenizer_t *tkz,
                                          const lxb_char_t *data,
                                          const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_char_ref_hexademical_start(lxb_html_tokenizer_t *tkz,
                                                    const lxb_char_t *data,
                                                    const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_char_ref_decimal_start(lxb_html_tokenizer_t *tkz,
                                                const lxb_char_t *data,
                                                const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_char_ref_hexademical(lxb_html_tokenizer_t *tkz,
                                              const lxb_char_t *data,
                                              const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_char_ref_decimal(lxb_html_tokenizer_t *tkz,
                                          const lxb_char_t *data,
                                          const lxb_char_t *end);

static const lxb_char_t *
lxb_html_tokenizer_state_char_ref_numeric_end(lxb_html_tokenizer_t *tkz,
                                              const lxb_char_t *data,
                                              const lxb_char_t *end);

/* Takes a code point and an output buffer, returns a size_t. */
static size_t
lxb_html_tokenizer_state_to_ascii_utf_8(size_t codepoint, lxb_char_t *data);
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
/*
|
|
200
|
+
* Helper function. No in the specification. For 12.2.5.1 Data state
|
|
201
|
+
*/
|
|
202
|
+
const lxb_char_t *
|
|
203
|
+
lxb_html_tokenizer_state_data_before(lxb_html_tokenizer_t *tkz,
|
|
204
|
+
const lxb_char_t *data,
|
|
205
|
+
const lxb_char_t *end)
|
|
206
|
+
{
|
|
207
|
+
if (tkz->is_eof == false) {
|
|
208
|
+
lxb_html_tokenizer_state_token_set_begin(tkz, data);
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
/*
|
|
212
|
+
* Text node init param sets before emit token.
|
|
213
|
+
*/
|
|
214
|
+
|
|
215
|
+
tkz->state = lxb_html_tokenizer_state_data;
|
|
216
|
+
|
|
217
|
+
return data;
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
/*
|
|
221
|
+
* 12.2.5.1 Data state
|
|
222
|
+
*/
|
|
223
|
+
static const lxb_char_t *
|
|
224
|
+
lxb_html_tokenizer_state_data(lxb_html_tokenizer_t *tkz,
|
|
225
|
+
const lxb_char_t *data, const lxb_char_t *end)
|
|
226
|
+
{
|
|
227
|
+
lxb_html_tokenizer_state_begin_set(tkz, data);
|
|
228
|
+
|
|
229
|
+
while (data != end) {
|
|
230
|
+
switch (*data) {
|
|
231
|
+
/* U+003C LESS-THAN SIGN (<) */
|
|
232
|
+
case 0x3C:
|
|
233
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
234
|
+
lxb_html_tokenizer_state_token_set_end(tkz, data);
|
|
235
|
+
|
|
236
|
+
tkz->state = lxb_html_tokenizer_state_tag_open;
|
|
237
|
+
return (data + 1);
|
|
238
|
+
|
|
239
|
+
/* U+0026 AMPERSAND (&) */
|
|
240
|
+
case 0x26:
|
|
241
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data + 1);
|
|
242
|
+
|
|
243
|
+
tkz->state = lxb_html_tokenizer_state_char_ref;
|
|
244
|
+
tkz->state_return = lxb_html_tokenizer_state_data;
|
|
245
|
+
|
|
246
|
+
return data + 1;
|
|
247
|
+
|
|
248
|
+
/* U+000D CARRIAGE RETURN (CR) */
|
|
249
|
+
case 0x0D:
|
|
250
|
+
if (++data >= end) {
|
|
251
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
|
|
252
|
+
|
|
253
|
+
tkz->state = lxb_html_tokenizer_state_cr;
|
|
254
|
+
tkz->state_return = lxb_html_tokenizer_state_data;
|
|
255
|
+
|
|
256
|
+
return data;
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
260
|
+
tkz->pos[-1] = 0x0A;
|
|
261
|
+
|
|
262
|
+
lxb_html_tokenizer_state_begin_set(tkz, data + 1);
|
|
263
|
+
|
|
264
|
+
if (*data != 0x0A) {
|
|
265
|
+
lxb_html_tokenizer_state_begin_set(tkz, data);
|
|
266
|
+
data--;
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
break;
|
|
270
|
+
|
|
271
|
+
/*
|
|
272
|
+
* U+0000 NULL
|
|
273
|
+
* EOF
|
|
274
|
+
*/
|
|
275
|
+
case 0x00:
|
|
276
|
+
if (tkz->is_eof) {
|
|
277
|
+
/* Emit TEXT node if not empty */
|
|
278
|
+
if (tkz->token->begin != NULL) {
|
|
279
|
+
lxb_html_tokenizer_state_token_set_end_oef(tkz);
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
if (tkz->token->begin != tkz->token->end) {
|
|
283
|
+
tkz->token->tag_id = LXB_TAG__TEXT;
|
|
284
|
+
|
|
285
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
286
|
+
|
|
287
|
+
lxb_html_tokenizer_state_set_text(tkz);
|
|
288
|
+
lxb_html_tokenizer_state_token_done_wo_check_m(tkz,end);
|
|
289
|
+
}
|
|
290
|
+
|
|
291
|
+
return end;
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
if (SIZE_MAX - tkz->token->null_count < 1) {
|
|
295
|
+
tkz->status = LXB_STATUS_ERROR_OVERFLOW;
|
|
296
|
+
return end;
|
|
297
|
+
}
|
|
298
|
+
|
|
299
|
+
tkz->token->null_count++;
|
|
300
|
+
|
|
301
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, data,
|
|
302
|
+
LXB_HTML_TOKENIZER_ERROR_UNNUCH);
|
|
303
|
+
break;
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
data++;
|
|
307
|
+
}
|
|
308
|
+
|
|
309
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
310
|
+
|
|
311
|
+
return data;
|
|
312
|
+
}
|
|
313
|
+
|
|
314
|
+
/*
|
|
315
|
+
* Helper function. No in the specification. For 12.2.5.5 PLAINTEXT state
|
|
316
|
+
*/
|
|
317
|
+
const lxb_char_t *
|
|
318
|
+
lxb_html_tokenizer_state_plaintext_before(lxb_html_tokenizer_t *tkz,
|
|
319
|
+
const lxb_char_t *data,
|
|
320
|
+
const lxb_char_t *end)
|
|
321
|
+
{
|
|
322
|
+
if (tkz->is_eof == false) {
|
|
323
|
+
lxb_html_tokenizer_state_token_set_begin(tkz, data);
|
|
324
|
+
}
|
|
325
|
+
|
|
326
|
+
tkz->token->tag_id = LXB_TAG__TEXT;
|
|
327
|
+
|
|
328
|
+
tkz->state = lxb_html_tokenizer_state_plaintext;
|
|
329
|
+
|
|
330
|
+
return data;
|
|
331
|
+
}
|
|
332
|
+
|
|
333
|
+
/*
|
|
334
|
+
* 12.2.5.5 PLAINTEXT state
|
|
335
|
+
*/
|
|
336
|
+
static const lxb_char_t *
|
|
337
|
+
lxb_html_tokenizer_state_plaintext(lxb_html_tokenizer_t *tkz,
|
|
338
|
+
const lxb_char_t *data,
|
|
339
|
+
const lxb_char_t *end)
|
|
340
|
+
{
|
|
341
|
+
lxb_html_tokenizer_state_begin_set(tkz, data);
|
|
342
|
+
|
|
343
|
+
while (data != end) {
|
|
344
|
+
switch (*data) {
|
|
345
|
+
/* U+000D CARRIAGE RETURN (CR) */
|
|
346
|
+
case 0x0D:
|
|
347
|
+
if (++data >= end) {
|
|
348
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
|
|
349
|
+
|
|
350
|
+
tkz->state = lxb_html_tokenizer_state_cr;
|
|
351
|
+
tkz->state_return = lxb_html_tokenizer_state_plaintext;
|
|
352
|
+
|
|
353
|
+
return data;
|
|
354
|
+
}
|
|
355
|
+
|
|
356
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
357
|
+
tkz->pos[-1] = 0x0A;
|
|
358
|
+
|
|
359
|
+
lxb_html_tokenizer_state_begin_set(tkz, data + 1);
|
|
360
|
+
|
|
361
|
+
if (*data != 0x0A) {
|
|
362
|
+
lxb_html_tokenizer_state_begin_set(tkz, data);
|
|
363
|
+
data--;
|
|
364
|
+
}
|
|
365
|
+
|
|
366
|
+
break;
|
|
367
|
+
|
|
368
|
+
/*
|
|
369
|
+
* U+0000 NULL
|
|
370
|
+
* EOF
|
|
371
|
+
*/
|
|
372
|
+
case 0x00:
|
|
373
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
374
|
+
|
|
375
|
+
if (tkz->is_eof) {
|
|
376
|
+
if (tkz->token->begin != NULL) {
|
|
377
|
+
lxb_html_tokenizer_state_token_set_end_oef(tkz);
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
lxb_html_tokenizer_state_set_text(tkz);
|
|
381
|
+
lxb_html_tokenizer_state_token_done_m(tkz, end);
|
|
382
|
+
|
|
383
|
+
return end;
|
|
384
|
+
}
|
|
385
|
+
|
|
386
|
+
lxb_html_tokenizer_state_begin_set(tkz, data + 1);
|
|
387
|
+
lxb_html_tokenizer_state_append_replace_m(tkz);
|
|
388
|
+
|
|
389
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, data,
|
|
390
|
+
LXB_HTML_TOKENIZER_ERROR_UNNUCH);
|
|
391
|
+
break;
|
|
392
|
+
}
|
|
393
|
+
|
|
394
|
+
data++;
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
398
|
+
|
|
399
|
+
return data;
|
|
400
|
+
}
|
|
401
|
+
|
|
402
|
+
/*
|
|
403
|
+
* 12.2.5.6 Tag open state
|
|
404
|
+
*/
|
|
405
|
+
static const lxb_char_t *
|
|
406
|
+
lxb_html_tokenizer_state_tag_open(lxb_html_tokenizer_t *tkz,
|
|
407
|
+
const lxb_char_t *data, const lxb_char_t *end)
|
|
408
|
+
{
|
|
409
|
+
/* ASCII alpha */
|
|
410
|
+
if (lexbor_str_res_alpha_character[ *data ] != LEXBOR_STR_RES_SLIP) {
|
|
411
|
+
tkz->state = lxb_html_tokenizer_state_tag_name;
|
|
412
|
+
|
|
413
|
+
lxb_html_tokenizer_state_token_emit_text_not_empty_m(tkz, end);
|
|
414
|
+
lxb_html_tokenizer_state_token_set_begin(tkz, data);
|
|
415
|
+
|
|
416
|
+
return data;
|
|
417
|
+
}
|
|
418
|
+
|
|
419
|
+
/* U+002F SOLIDUS (/) */
|
|
420
|
+
else if (*data == 0x2F) {
|
|
421
|
+
tkz->state = lxb_html_tokenizer_state_end_tag_open;
|
|
422
|
+
|
|
423
|
+
return (data + 1);
|
|
424
|
+
}
|
|
425
|
+
|
|
426
|
+
/* U+0021 EXCLAMATION MARK (!) */
|
|
427
|
+
else if (*data == 0x21) {
|
|
428
|
+
tkz->state = lxb_html_tokenizer_state_markup_declaration_open;
|
|
429
|
+
|
|
430
|
+
lxb_html_tokenizer_state_token_emit_text_not_empty_m(tkz, end);
|
|
431
|
+
|
|
432
|
+
return (data + 1);
|
|
433
|
+
}
|
|
434
|
+
|
|
435
|
+
/* U+003F QUESTION MARK (?) */
|
|
436
|
+
else if (*data == 0x3F) {
|
|
437
|
+
tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
|
|
438
|
+
|
|
439
|
+
lxb_html_tokenizer_state_token_emit_text_not_empty_m(tkz, end);
|
|
440
|
+
lxb_html_tokenizer_state_token_set_begin(tkz, data);
|
|
441
|
+
|
|
442
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, data,
|
|
443
|
+
LXB_HTML_TOKENIZER_ERROR_UNQUMAINOFTANA);
|
|
444
|
+
|
|
445
|
+
return data;
|
|
446
|
+
}
|
|
447
|
+
|
|
448
|
+
/* EOF */
|
|
449
|
+
else if (*data == 0x00) {
|
|
450
|
+
if (tkz->is_eof) {
|
|
451
|
+
lxb_html_tokenizer_state_append_m(tkz, "<", 1);
|
|
452
|
+
|
|
453
|
+
lxb_html_tokenizer_state_token_set_end_oef(tkz);
|
|
454
|
+
lxb_html_tokenizer_state_token_emit_text_not_empty_m(tkz, end);
|
|
455
|
+
|
|
456
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->token->end,
|
|
457
|
+
LXB_HTML_TOKENIZER_ERROR_EOBETANA);
|
|
458
|
+
|
|
459
|
+
return end;
|
|
460
|
+
}
|
|
461
|
+
}
|
|
462
|
+
|
|
463
|
+
lxb_html_tokenizer_state_append_m(tkz, "<", 1);
|
|
464
|
+
|
|
465
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, data,
|
|
466
|
+
LXB_HTML_TOKENIZER_ERROR_INFICHOFTANA);
|
|
467
|
+
|
|
468
|
+
tkz->state = lxb_html_tokenizer_state_data;
|
|
469
|
+
|
|
470
|
+
return data;
|
|
471
|
+
}
|
|
472
|
+
|
|
473
|
+
/*
|
|
474
|
+
* 12.2.5.7 End tag open state
|
|
475
|
+
*/
|
|
476
|
+
static const lxb_char_t *
|
|
477
|
+
lxb_html_tokenizer_state_end_tag_open(lxb_html_tokenizer_t *tkz,
|
|
478
|
+
const lxb_char_t *data,
|
|
479
|
+
const lxb_char_t *end)
|
|
480
|
+
{
|
|
481
|
+
/* ASCII alpha */
|
|
482
|
+
if (lexbor_str_res_alpha_character[ *data ] != LEXBOR_STR_RES_SLIP) {
|
|
483
|
+
tkz->state = lxb_html_tokenizer_state_tag_name;
|
|
484
|
+
|
|
485
|
+
lxb_html_tokenizer_state_token_emit_text_not_empty_m(tkz, end);
|
|
486
|
+
lxb_html_tokenizer_state_token_set_begin(tkz, data);
|
|
487
|
+
|
|
488
|
+
tkz->token->type |= LXB_HTML_TOKEN_TYPE_CLOSE;
|
|
489
|
+
|
|
490
|
+
return data;
|
|
491
|
+
}
|
|
492
|
+
|
|
493
|
+
/* U+003E GREATER-THAN SIGN (>) */
|
|
494
|
+
else if (*data == 0x3E) {
|
|
495
|
+
tkz->state = lxb_html_tokenizer_state_data;
|
|
496
|
+
|
|
497
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, data,
|
|
498
|
+
LXB_HTML_TOKENIZER_ERROR_MIENTANA);
|
|
499
|
+
|
|
500
|
+
return (data + 1);
|
|
501
|
+
}
|
|
502
|
+
|
|
503
|
+
/* Fake EOF */
|
|
504
|
+
else if (*data == 0x00) {
|
|
505
|
+
if (tkz->is_eof) {
|
|
506
|
+
lxb_html_tokenizer_state_append_m(tkz, "</", 2);
|
|
507
|
+
|
|
508
|
+
lxb_html_tokenizer_state_token_set_end_oef(tkz);
|
|
509
|
+
lxb_html_tokenizer_state_token_emit_text_not_empty_m(tkz, end);
|
|
510
|
+
|
|
511
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->token->end,
|
|
512
|
+
LXB_HTML_TOKENIZER_ERROR_EOBETANA);
|
|
513
|
+
|
|
514
|
+
return end;
|
|
515
|
+
}
|
|
516
|
+
}
|
|
517
|
+
|
|
518
|
+
tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
|
|
519
|
+
|
|
520
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, data,
|
|
521
|
+
LXB_HTML_TOKENIZER_ERROR_INFICHOFTANA);
|
|
522
|
+
|
|
523
|
+
lxb_html_tokenizer_state_token_emit_text_not_empty_m(tkz, end);
|
|
524
|
+
lxb_html_tokenizer_state_token_set_begin(tkz, data);
|
|
525
|
+
|
|
526
|
+
return data;
|
|
527
|
+
}
|
|
528
|
+
|
|
529
|
+
/*
|
|
530
|
+
* 12.2.5.8 Tag name state
|
|
531
|
+
*/
|
|
532
|
+
static const lxb_char_t *
|
|
533
|
+
lxb_html_tokenizer_state_tag_name(lxb_html_tokenizer_t *tkz,
|
|
534
|
+
const lxb_char_t *data, const lxb_char_t *end)
|
|
535
|
+
{
|
|
536
|
+
lxb_html_tokenizer_state_begin_set(tkz, data);
|
|
537
|
+
|
|
538
|
+
while (data != end) {
|
|
539
|
+
switch (*data) {
|
|
540
|
+
/*
|
|
541
|
+
* U+0009 CHARACTER TABULATION (tab)
|
|
542
|
+
* U+000A LINE FEED (LF)
|
|
543
|
+
* U+000C FORM FEED (FF)
|
|
544
|
+
* U+000D CARRIAGE RETURN (CR)
|
|
545
|
+
* U+0020 SPACE
|
|
546
|
+
*/
|
|
547
|
+
case 0x09:
|
|
548
|
+
case 0x0A:
|
|
549
|
+
case 0x0C:
|
|
550
|
+
case 0x0D:
|
|
551
|
+
case 0x20:
|
|
552
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
553
|
+
lxb_html_tokenizer_state_set_tag_m(tkz, tkz->start, tkz->pos);
|
|
554
|
+
lxb_html_tokenizer_state_token_set_end(tkz, data);
|
|
555
|
+
|
|
556
|
+
tkz->state = lxb_html_tokenizer_state_before_attribute_name;
|
|
557
|
+
return (data + 1);
|
|
558
|
+
|
|
559
|
+
/* U+002F SOLIDUS (/) */
|
|
560
|
+
case 0x2F:
|
|
561
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
562
|
+
lxb_html_tokenizer_state_set_tag_m(tkz, tkz->start, tkz->pos);
|
|
563
|
+
lxb_html_tokenizer_state_token_set_end(tkz, data);
|
|
564
|
+
|
|
565
|
+
tkz->state = lxb_html_tokenizer_state_self_closing_start_tag;
|
|
566
|
+
return (data + 1);
|
|
567
|
+
|
|
568
|
+
/* U+003E GREATER-THAN SIGN (>) */
|
|
569
|
+
case 0x3E:
|
|
570
|
+
tkz->state = lxb_html_tokenizer_state_data_before;
|
|
571
|
+
|
|
572
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
573
|
+
lxb_html_tokenizer_state_set_tag_m(tkz, tkz->start, tkz->pos);
|
|
574
|
+
lxb_html_tokenizer_state_token_set_end(tkz, data);
|
|
575
|
+
lxb_html_tokenizer_state_token_done_m(tkz, end);
|
|
576
|
+
|
|
577
|
+
return (data + 1);
|
|
578
|
+
|
|
579
|
+
/* U+0000 NULL */
|
|
580
|
+
case 0x00:
|
|
581
|
+
if (tkz->is_eof) {
|
|
582
|
+
lxb_html_tokenizer_state_token_set_end_oef(tkz);
|
|
583
|
+
|
|
584
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors,
|
|
585
|
+
tkz->token->end,
|
|
586
|
+
LXB_HTML_TOKENIZER_ERROR_EOINTA);
|
|
587
|
+
return end;
|
|
588
|
+
}
|
|
589
|
+
|
|
590
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
591
|
+
lxb_html_tokenizer_state_begin_set(tkz, data + 1);
|
|
592
|
+
lxb_html_tokenizer_state_append_replace_m(tkz);
|
|
593
|
+
|
|
594
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, data,
|
|
595
|
+
LXB_HTML_TOKENIZER_ERROR_UNNUCH);
|
|
596
|
+
break;
|
|
597
|
+
|
|
598
|
+
default:
|
|
599
|
+
break;
|
|
600
|
+
}
|
|
601
|
+
|
|
602
|
+
data++;
|
|
603
|
+
}
|
|
604
|
+
|
|
605
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
606
|
+
|
|
607
|
+
return data;
|
|
608
|
+
}
|
|
609
|
+
|
|
610
|
+
/*
|
|
611
|
+
* 12.2.5.32 Before attribute name state
|
|
612
|
+
*/
|
|
613
|
+
const lxb_char_t *
|
|
614
|
+
lxb_html_tokenizer_state_before_attribute_name(lxb_html_tokenizer_t *tkz,
|
|
615
|
+
const lxb_char_t *data,
|
|
616
|
+
const lxb_char_t *end)
|
|
617
|
+
{
|
|
618
|
+
lxb_html_token_attr_t *attr;
|
|
619
|
+
|
|
620
|
+
while (data != end) {
|
|
621
|
+
switch (*data) {
|
|
622
|
+
/*
|
|
623
|
+
* U+0009 CHARACTER TABULATION (tab)
|
|
624
|
+
* U+000A LINE FEED (LF)
|
|
625
|
+
* U+000C FORM FEED (FF)
|
|
626
|
+
* U+000D CARRIAGE RETURN (CR)
|
|
627
|
+
* U+0020 SPACE
|
|
628
|
+
*/
|
|
629
|
+
case 0x09:
|
|
630
|
+
case 0x0A:
|
|
631
|
+
case 0x0C:
|
|
632
|
+
case 0x0D:
|
|
633
|
+
case 0x20:
|
|
634
|
+
break;
|
|
635
|
+
|
|
636
|
+
/* U+003D EQUALS SIGN (=) */
|
|
637
|
+
case 0x3D:
|
|
638
|
+
lxb_html_tokenizer_state_token_attr_add_m(tkz, attr, end);
|
|
639
|
+
lxb_html_tokenizer_state_token_attr_set_name_begin(tkz, data);
|
|
640
|
+
|
|
641
|
+
lxb_html_tokenizer_state_append_m(tkz, data, 1);
|
|
642
|
+
|
|
643
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, data,
|
|
644
|
+
LXB_HTML_TOKENIZER_ERROR_UNEQSIBEATNA);
|
|
645
|
+
|
|
646
|
+
tkz->state = lxb_html_tokenizer_state_attribute_name;
|
|
647
|
+
return (data + 1);
|
|
648
|
+
|
|
649
|
+
/*
|
|
650
|
+
* U+002F SOLIDUS (/)
|
|
651
|
+
* U+003E GREATER-THAN SIGN (>)
|
|
652
|
+
*/
|
|
653
|
+
case 0x2F:
|
|
654
|
+
case 0x3E:
|
|
655
|
+
tkz->state = lxb_html_tokenizer_state_after_attribute_name;
|
|
656
|
+
return data;
|
|
657
|
+
|
|
658
|
+
/* EOF */
|
|
659
|
+
case 0x00:
|
|
660
|
+
if (tkz->is_eof) {
|
|
661
|
+
tkz->state = lxb_html_tokenizer_state_after_attribute_name;
|
|
662
|
+
return data;
|
|
663
|
+
}
|
|
664
|
+
/* fall through */
|
|
665
|
+
|
|
666
|
+
/* Anything else */
|
|
667
|
+
default:
|
|
668
|
+
lxb_html_tokenizer_state_token_attr_add_m(tkz, attr, end);
|
|
669
|
+
lxb_html_tokenizer_state_token_attr_set_name_begin(tkz, data);
|
|
670
|
+
|
|
671
|
+
tkz->state = lxb_html_tokenizer_state_attribute_name;
|
|
672
|
+
return data;
|
|
673
|
+
}
|
|
674
|
+
|
|
675
|
+
data++;
|
|
676
|
+
}
|
|
677
|
+
|
|
678
|
+
return data;
|
|
679
|
+
}
|
|
680
|
+
|
|
681
|
+
/*
|
|
682
|
+
* 12.2.5.33 Attribute name state
|
|
683
|
+
*/
|
|
684
|
+
static const lxb_char_t *
|
|
685
|
+
lxb_html_tokenizer_state_attribute_name(lxb_html_tokenizer_t *tkz,
|
|
686
|
+
const lxb_char_t *data,
|
|
687
|
+
const lxb_char_t *end)
|
|
688
|
+
{
|
|
689
|
+
lxb_html_tokenizer_state_begin_set(tkz, data);
|
|
690
|
+
|
|
691
|
+
while (data != end) {
|
|
692
|
+
switch (*data) {
|
|
693
|
+
/*
|
|
694
|
+
* U+0009 CHARACTER TABULATION (tab)
|
|
695
|
+
* U+000A LINE FEED (LF)
|
|
696
|
+
* U+000C FORM FEED (FF)
|
|
697
|
+
* U+000D CARRIAGE RETURN (CR)
|
|
698
|
+
* U+0020 SPACE
|
|
699
|
+
* U+002F SOLIDUS (/)
|
|
700
|
+
* U+003E GREATER-THAN SIGN (>)
|
|
701
|
+
*/
|
|
702
|
+
case 0x09:
|
|
703
|
+
case 0x0A:
|
|
704
|
+
case 0x0C:
|
|
705
|
+
case 0x0D:
|
|
706
|
+
case 0x20:
|
|
707
|
+
case 0x2F:
|
|
708
|
+
case 0x3E:
|
|
709
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
710
|
+
lxb_html_tokenizer_state_set_name_m(tkz);
|
|
711
|
+
lxb_html_tokenizer_state_token_attr_set_name_end(tkz, data);
|
|
712
|
+
|
|
713
|
+
tkz->state = lxb_html_tokenizer_state_after_attribute_name;
|
|
714
|
+
return data;
|
|
715
|
+
|
|
716
|
+
/*
|
|
717
|
+
* U+0000 NULL
|
|
718
|
+
* EOF
|
|
719
|
+
*/
|
|
720
|
+
case 0x00:
|
|
721
|
+
if (tkz->is_eof) {
|
|
722
|
+
lxb_html_tokenizer_state_token_attr_set_name_end_oef(tkz);
|
|
723
|
+
|
|
724
|
+
tkz->state = lxb_html_tokenizer_state_after_attribute_name;
|
|
725
|
+
return data;
|
|
726
|
+
}
|
|
727
|
+
|
|
728
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
729
|
+
lxb_html_tokenizer_state_begin_set(tkz, data + 1);
|
|
730
|
+
lxb_html_tokenizer_state_append_replace_m(tkz);
|
|
731
|
+
|
|
732
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, data,
|
|
733
|
+
LXB_HTML_TOKENIZER_ERROR_UNNUCH);
|
|
734
|
+
break;
|
|
735
|
+
|
|
736
|
+
/* U+003D EQUALS SIGN (=) */
|
|
737
|
+
case 0x3D:
|
|
738
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
739
|
+
lxb_html_tokenizer_state_set_name_m(tkz);
|
|
740
|
+
lxb_html_tokenizer_state_token_attr_set_name_end(tkz, data);
|
|
741
|
+
|
|
742
|
+
tkz->state = lxb_html_tokenizer_state_before_attribute_value;
|
|
743
|
+
return (data + 1);
|
|
744
|
+
|
|
745
|
+
/*
|
|
746
|
+
* U+0022 QUOTATION MARK (")
|
|
747
|
+
* U+0027 APOSTROPHE (')
|
|
748
|
+
* U+003C LESS-THAN SIGN (<)
|
|
749
|
+
*/
|
|
750
|
+
case 0x22:
|
|
751
|
+
case 0x27:
|
|
752
|
+
case 0x3C:
|
|
753
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, data,
|
|
754
|
+
LXB_HTML_TOKENIZER_ERROR_UNCHINATNA);
|
|
755
|
+
break;
|
|
756
|
+
|
|
757
|
+
default:
|
|
758
|
+
break;
|
|
759
|
+
}
|
|
760
|
+
|
|
761
|
+
data++;
|
|
762
|
+
}
|
|
763
|
+
|
|
764
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
765
|
+
|
|
766
|
+
return data;
|
|
767
|
+
}
|
|
768
|
+
|
|
769
|
+
/*
|
|
770
|
+
* 12.2.5.34 After attribute name state
|
|
771
|
+
*/
|
|
772
|
+
static const lxb_char_t *
|
|
773
|
+
lxb_html_tokenizer_state_after_attribute_name(lxb_html_tokenizer_t *tkz,
|
|
774
|
+
const lxb_char_t *data,
|
|
775
|
+
const lxb_char_t *end)
|
|
776
|
+
{
|
|
777
|
+
lxb_html_token_attr_t *attr;
|
|
778
|
+
|
|
779
|
+
while (data != end) {
|
|
780
|
+
switch (*data) {
|
|
781
|
+
/*
|
|
782
|
+
* U+0009 CHARACTER TABULATION (tab)
|
|
783
|
+
* U+000A LINE FEED (LF)
|
|
784
|
+
* U+000C FORM FEED (FF)
|
|
785
|
+
* U+000D CARRIAGE RETURN (CR)
|
|
786
|
+
* U+0020 SPACE
|
|
787
|
+
*/
|
|
788
|
+
case 0x09:
|
|
789
|
+
case 0x0A:
|
|
790
|
+
case 0x0C:
|
|
791
|
+
case 0x0D:
|
|
792
|
+
case 0x20:
|
|
793
|
+
break;
|
|
794
|
+
|
|
795
|
+
/* U+002F SOLIDUS (/) */
|
|
796
|
+
case 0x2F:
|
|
797
|
+
tkz->state = lxb_html_tokenizer_state_self_closing_start_tag;
|
|
798
|
+
return (data + 1);
|
|
799
|
+
|
|
800
|
+
/* U+003D EQUALS SIGN (=) */
|
|
801
|
+
case 0x3D:
|
|
802
|
+
tkz->state = lxb_html_tokenizer_state_before_attribute_value;
|
|
803
|
+
return (data + 1);
|
|
804
|
+
|
|
805
|
+
/* U+003E GREATER-THAN SIGN (>) */
|
|
806
|
+
case 0x3E:
|
|
807
|
+
tkz->state = lxb_html_tokenizer_state_data_before;
|
|
808
|
+
|
|
809
|
+
lxb_html_tokenizer_state_token_done_m(tkz, end);
|
|
810
|
+
|
|
811
|
+
return (data + 1);
|
|
812
|
+
|
|
813
|
+
case 0x00:
|
|
814
|
+
if (tkz->is_eof) {
|
|
815
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
|
|
816
|
+
LXB_HTML_TOKENIZER_ERROR_EOINTA);
|
|
817
|
+
return end;
|
|
818
|
+
}
|
|
819
|
+
/* fall through */
|
|
820
|
+
|
|
821
|
+
default:
|
|
822
|
+
lxb_html_tokenizer_state_token_attr_add_m(tkz, attr, end);
|
|
823
|
+
lxb_html_tokenizer_state_token_attr_set_name_begin(tkz, data);
|
|
824
|
+
|
|
825
|
+
tkz->state = lxb_html_tokenizer_state_attribute_name;
|
|
826
|
+
return data;
|
|
827
|
+
}
|
|
828
|
+
|
|
829
|
+
data++;
|
|
830
|
+
}
|
|
831
|
+
|
|
832
|
+
return data;
|
|
833
|
+
}
|
|
834
|
+
|
|
835
|
+
/*
|
|
836
|
+
* 12.2.5.35 Before attribute value state
|
|
837
|
+
*/
|
|
838
|
+
static const lxb_char_t *
|
|
839
|
+
lxb_html_tokenizer_state_before_attribute_value(lxb_html_tokenizer_t *tkz,
|
|
840
|
+
const lxb_char_t *data,
|
|
841
|
+
const lxb_char_t *end)
|
|
842
|
+
{
|
|
843
|
+
while (data != end) {
|
|
844
|
+
switch (*data) {
|
|
845
|
+
/*
|
|
846
|
+
* U+0009 CHARACTER TABULATION (tab)
|
|
847
|
+
* U+000A LINE FEED (LF)
|
|
848
|
+
* U+000C FORM FEED (FF)
|
|
849
|
+
* U+000D CARRIAGE RETURN (CR)
|
|
850
|
+
* U+0020 SPACE
|
|
851
|
+
*/
|
|
852
|
+
case 0x09:
|
|
853
|
+
case 0x0A:
|
|
854
|
+
case 0x0C:
|
|
855
|
+
case 0x0D:
|
|
856
|
+
case 0x20:
|
|
857
|
+
break;
|
|
858
|
+
|
|
859
|
+
/* U+0022 QUOTATION MARK (") */
|
|
860
|
+
case 0x22:
|
|
861
|
+
tkz->state =
|
|
862
|
+
lxb_html_tokenizer_state_attribute_value_double_quoted;
|
|
863
|
+
|
|
864
|
+
return (data + 1);
|
|
865
|
+
|
|
866
|
+
/* U+0027 APOSTROPHE (') */
|
|
867
|
+
case 0x27:
|
|
868
|
+
tkz->state =
|
|
869
|
+
lxb_html_tokenizer_state_attribute_value_single_quoted;
|
|
870
|
+
|
|
871
|
+
return (data + 1);
|
|
872
|
+
|
|
873
|
+
/* U+003E GREATER-THAN SIGN (>) */
|
|
874
|
+
case 0x3E:
|
|
875
|
+
tkz->state = lxb_html_tokenizer_state_data_before;
|
|
876
|
+
|
|
877
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, data,
|
|
878
|
+
LXB_HTML_TOKENIZER_ERROR_MIATVA);
|
|
879
|
+
|
|
880
|
+
lxb_html_tokenizer_state_token_done_m(tkz, end);
|
|
881
|
+
|
|
882
|
+
return (data + 1);
|
|
883
|
+
|
|
884
|
+
default:
|
|
885
|
+
tkz->state = lxb_html_tokenizer_state_attribute_value_unquoted;
|
|
886
|
+
return data;
|
|
887
|
+
}
|
|
888
|
+
|
|
889
|
+
data++;
|
|
890
|
+
}
|
|
891
|
+
|
|
892
|
+
return data;
|
|
893
|
+
}
|
|
894
|
+
|
|
895
|
+
/*
|
|
896
|
+
* 12.2.5.36 Attribute value (double-quoted) state
|
|
897
|
+
*/
|
|
898
|
+
static const lxb_char_t *
|
|
899
|
+
lxb_html_tokenizer_state_attribute_value_double_quoted(lxb_html_tokenizer_t *tkz,
|
|
900
|
+
const lxb_char_t *data,
|
|
901
|
+
const lxb_char_t *end)
|
|
902
|
+
{
|
|
903
|
+
if (tkz->token->attr_last->value_begin == NULL && tkz->is_eof == false) {
|
|
904
|
+
lxb_html_tokenizer_state_token_attr_set_value_begin(tkz, data);
|
|
905
|
+
}
|
|
906
|
+
|
|
907
|
+
lxb_html_tokenizer_state_begin_set(tkz, data);
|
|
908
|
+
|
|
909
|
+
while (data != end) {
|
|
910
|
+
switch (*data) {
|
|
911
|
+
/* U+0022 QUOTATION MARK (") */
|
|
912
|
+
case 0x22:
|
|
913
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
914
|
+
lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
|
|
915
|
+
lxb_html_tokenizer_state_set_value_m(tkz);
|
|
916
|
+
|
|
917
|
+
tkz->state =
|
|
918
|
+
lxb_html_tokenizer_state_after_attribute_value_quoted;
|
|
919
|
+
|
|
920
|
+
return (data + 1);
|
|
921
|
+
|
|
922
|
+
/* U+0026 AMPERSAND (&) */
|
|
923
|
+
case 0x26:
|
|
924
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data + 1);
|
|
925
|
+
|
|
926
|
+
tkz->state = lxb_html_tokenizer_state_char_ref_attr;
|
|
927
|
+
tkz->state_return = lxb_html_tokenizer_state_attribute_value_double_quoted;
|
|
928
|
+
|
|
929
|
+
return data + 1;
|
|
930
|
+
|
|
931
|
+
/* U+000D CARRIAGE RETURN (CR) */
|
|
932
|
+
case 0x0D:
|
|
933
|
+
if (++data >= end) {
|
|
934
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
|
|
935
|
+
|
|
936
|
+
tkz->state = lxb_html_tokenizer_state_cr;
|
|
937
|
+
tkz->state_return = lxb_html_tokenizer_state_attribute_value_double_quoted;
|
|
938
|
+
|
|
939
|
+
return data;
|
|
940
|
+
}
|
|
941
|
+
|
|
942
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
943
|
+
tkz->pos[-1] = 0x0A;
|
|
944
|
+
|
|
945
|
+
lxb_html_tokenizer_state_begin_set(tkz, data + 1);
|
|
946
|
+
|
|
947
|
+
if (*data != 0x0A) {
|
|
948
|
+
lxb_html_tokenizer_state_begin_set(tkz, data);
|
|
949
|
+
data--;
|
|
950
|
+
}
|
|
951
|
+
|
|
952
|
+
break;
|
|
953
|
+
|
|
954
|
+
/*
|
|
955
|
+
* U+0000 NULL
|
|
956
|
+
* EOF
|
|
957
|
+
*/
|
|
958
|
+
case 0x00:
|
|
959
|
+
if (tkz->is_eof) {
|
|
960
|
+
if (tkz->token->attr_last->value_begin != NULL) {
|
|
961
|
+
lxb_html_tokenizer_state_token_attr_set_value_end_oef(tkz);
|
|
962
|
+
}
|
|
963
|
+
|
|
964
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
|
|
965
|
+
LXB_HTML_TOKENIZER_ERROR_EOINTA);
|
|
966
|
+
return end;
|
|
967
|
+
}
|
|
968
|
+
|
|
969
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
970
|
+
lxb_html_tokenizer_state_begin_set(tkz, data + 1);
|
|
971
|
+
lxb_html_tokenizer_state_append_replace_m(tkz);
|
|
972
|
+
|
|
973
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, data,
|
|
974
|
+
LXB_HTML_TOKENIZER_ERROR_UNNUCH);
|
|
975
|
+
break;
|
|
976
|
+
|
|
977
|
+
default:
|
|
978
|
+
break;
|
|
979
|
+
}
|
|
980
|
+
|
|
981
|
+
data++;
|
|
982
|
+
}
|
|
983
|
+
|
|
984
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
985
|
+
|
|
986
|
+
return data;
|
|
987
|
+
}
|
|
988
|
+
|
|
989
|
+
/*
|
|
990
|
+
* 12.2.5.37 Attribute value (single-quoted) state
|
|
991
|
+
*/
|
|
992
|
+
static const lxb_char_t *
|
|
993
|
+
lxb_html_tokenizer_state_attribute_value_single_quoted(lxb_html_tokenizer_t *tkz,
|
|
994
|
+
const lxb_char_t *data,
|
|
995
|
+
const lxb_char_t *end)
|
|
996
|
+
{
|
|
997
|
+
if (tkz->token->attr_last->value_begin == NULL && tkz->is_eof == false) {
|
|
998
|
+
lxb_html_tokenizer_state_token_attr_set_value_begin(tkz, data);
|
|
999
|
+
}
|
|
1000
|
+
|
|
1001
|
+
lxb_html_tokenizer_state_begin_set(tkz, data);
|
|
1002
|
+
|
|
1003
|
+
while (data != end) {
|
|
1004
|
+
switch (*data) {
|
|
1005
|
+
/* U+0027 APOSTROPHE (') */
|
|
1006
|
+
case 0x27:
|
|
1007
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
1008
|
+
lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
|
|
1009
|
+
lxb_html_tokenizer_state_set_value_m(tkz);
|
|
1010
|
+
|
|
1011
|
+
tkz->state =
|
|
1012
|
+
lxb_html_tokenizer_state_after_attribute_value_quoted;
|
|
1013
|
+
|
|
1014
|
+
return (data + 1);
|
|
1015
|
+
|
|
1016
|
+
/* U+0026 AMPERSAND (&) */
|
|
1017
|
+
case 0x26:
|
|
1018
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data + 1);
|
|
1019
|
+
|
|
1020
|
+
tkz->state = lxb_html_tokenizer_state_char_ref_attr;
|
|
1021
|
+
tkz->state_return = lxb_html_tokenizer_state_attribute_value_single_quoted;
|
|
1022
|
+
|
|
1023
|
+
return data + 1;
|
|
1024
|
+
|
|
1025
|
+
/* U+000D CARRIAGE RETURN (CR) */
|
|
1026
|
+
case 0x0D:
|
|
1027
|
+
if (++data >= end) {
|
|
1028
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
|
|
1029
|
+
|
|
1030
|
+
tkz->state = lxb_html_tokenizer_state_cr;
|
|
1031
|
+
tkz->state_return = lxb_html_tokenizer_state_attribute_value_single_quoted;
|
|
1032
|
+
|
|
1033
|
+
return data;
|
|
1034
|
+
}
|
|
1035
|
+
|
|
1036
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
1037
|
+
tkz->pos[-1] = 0x0A;
|
|
1038
|
+
|
|
1039
|
+
lxb_html_tokenizer_state_begin_set(tkz, data + 1);
|
|
1040
|
+
|
|
1041
|
+
if (*data != 0x0A) {
|
|
1042
|
+
lxb_html_tokenizer_state_begin_set(tkz, data);
|
|
1043
|
+
data--;
|
|
1044
|
+
}
|
|
1045
|
+
|
|
1046
|
+
break;
|
|
1047
|
+
|
|
1048
|
+
/*
|
|
1049
|
+
* U+0000 NULL
|
|
1050
|
+
* EOF
|
|
1051
|
+
*/
|
|
1052
|
+
case 0x00:
|
|
1053
|
+
if (tkz->is_eof) {
|
|
1054
|
+
if (tkz->token->attr_last->value_begin != NULL) {
|
|
1055
|
+
lxb_html_tokenizer_state_token_attr_set_value_end_oef(tkz);
|
|
1056
|
+
}
|
|
1057
|
+
|
|
1058
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
|
|
1059
|
+
LXB_HTML_TOKENIZER_ERROR_EOINTA);
|
|
1060
|
+
return end;
|
|
1061
|
+
}
|
|
1062
|
+
|
|
1063
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
1064
|
+
lxb_html_tokenizer_state_begin_set(tkz, data + 1);
|
|
1065
|
+
lxb_html_tokenizer_state_append_replace_m(tkz);
|
|
1066
|
+
|
|
1067
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, data,
|
|
1068
|
+
LXB_HTML_TOKENIZER_ERROR_UNNUCH);
|
|
1069
|
+
break;
|
|
1070
|
+
|
|
1071
|
+
default:
|
|
1072
|
+
break;
|
|
1073
|
+
}
|
|
1074
|
+
|
|
1075
|
+
data++;
|
|
1076
|
+
}
|
|
1077
|
+
|
|
1078
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
1079
|
+
|
|
1080
|
+
return data;
|
|
1081
|
+
}
|
|
1082
|
+
|
|
1083
|
+
/*
|
|
1084
|
+
* 12.2.5.38 Attribute value (unquoted) state
|
|
1085
|
+
*/
|
|
1086
|
+
static const lxb_char_t *
|
|
1087
|
+
lxb_html_tokenizer_state_attribute_value_unquoted(lxb_html_tokenizer_t *tkz,
|
|
1088
|
+
const lxb_char_t *data,
|
|
1089
|
+
const lxb_char_t *end)
|
|
1090
|
+
{
|
|
1091
|
+
if (tkz->token->attr_last->value_begin == NULL && tkz->is_eof == false) {
|
|
1092
|
+
lxb_html_tokenizer_state_token_attr_set_value_begin(tkz, data);
|
|
1093
|
+
}
|
|
1094
|
+
|
|
1095
|
+
lxb_html_tokenizer_state_begin_set(tkz, data);
|
|
1096
|
+
|
|
1097
|
+
while (data != end) {
|
|
1098
|
+
switch (*data) {
|
|
1099
|
+
/*
|
|
1100
|
+
* U+0009 CHARACTER TABULATION (tab)
|
|
1101
|
+
* U+000A LINE FEED (LF)
|
|
1102
|
+
* U+000C FORM FEED (FF)
|
|
1103
|
+
* U+000D CARRIAGE RETURN (CR)
|
|
1104
|
+
* U+0020 SPACE
|
|
1105
|
+
*/
|
|
1106
|
+
case 0x09:
|
|
1107
|
+
case 0x0A:
|
|
1108
|
+
case 0x0C:
|
|
1109
|
+
case 0x0D:
|
|
1110
|
+
case 0x20:
|
|
1111
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
1112
|
+
lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
|
|
1113
|
+
lxb_html_tokenizer_state_set_value_m(tkz);
|
|
1114
|
+
|
|
1115
|
+
tkz->state = lxb_html_tokenizer_state_before_attribute_name;
|
|
1116
|
+
return (data + 1);
|
|
1117
|
+
|
|
1118
|
+
/* U+0026 AMPERSAND (&) */
|
|
1119
|
+
case 0x26:
|
|
1120
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data + 1);
|
|
1121
|
+
|
|
1122
|
+
tkz->state = lxb_html_tokenizer_state_char_ref_attr;
|
|
1123
|
+
tkz->state_return = lxb_html_tokenizer_state_attribute_value_unquoted;
|
|
1124
|
+
|
|
1125
|
+
return data + 1;
|
|
1126
|
+
|
|
1127
|
+
/* U+003E GREATER-THAN SIGN (>) */
|
|
1128
|
+
case 0x3E:
|
|
1129
|
+
tkz->state = lxb_html_tokenizer_state_data_before;
|
|
1130
|
+
|
|
1131
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
1132
|
+
lxb_html_tokenizer_state_token_attr_set_value_end(tkz, data);
|
|
1133
|
+
lxb_html_tokenizer_state_set_value_m(tkz);
|
|
1134
|
+
|
|
1135
|
+
lxb_html_tokenizer_state_token_done_m(tkz, end);
|
|
1136
|
+
|
|
1137
|
+
return (data + 1);
|
|
1138
|
+
|
|
1139
|
+
/*
|
|
1140
|
+
* U+0000 NULL
|
|
1141
|
+
* EOF
|
|
1142
|
+
*/
|
|
1143
|
+
case 0x00:
|
|
1144
|
+
if (tkz->is_eof) {
|
|
1145
|
+
if (tkz->token->attr_last->value_begin != NULL) {
|
|
1146
|
+
lxb_html_tokenizer_state_token_attr_set_value_end_oef(tkz);
|
|
1147
|
+
}
|
|
1148
|
+
|
|
1149
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
|
|
1150
|
+
LXB_HTML_TOKENIZER_ERROR_EOINTA);
|
|
1151
|
+
return end;
|
|
1152
|
+
}
|
|
1153
|
+
|
|
1154
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
1155
|
+
lxb_html_tokenizer_state_begin_set(tkz, data + 1);
|
|
1156
|
+
lxb_html_tokenizer_state_append_replace_m(tkz);
|
|
1157
|
+
|
|
1158
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, data,
|
|
1159
|
+
LXB_HTML_TOKENIZER_ERROR_UNNUCH);
|
|
1160
|
+
break;
|
|
1161
|
+
|
|
1162
|
+
/*
|
|
1163
|
+
* U+0022 QUOTATION MARK (")
|
|
1164
|
+
* U+0027 APOSTROPHE (')
|
|
1165
|
+
* U+003C LESS-THAN SIGN (<)
|
|
1166
|
+
* U+003D EQUALS SIGN (=)
|
|
1167
|
+
* U+0060 GRAVE ACCENT (`)
|
|
1168
|
+
*/
|
|
1169
|
+
case 0x22:
|
|
1170
|
+
case 0x27:
|
|
1171
|
+
case 0x3C:
|
|
1172
|
+
case 0x3D:
|
|
1173
|
+
case 0x60:
|
|
1174
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->token->end,
|
|
1175
|
+
LXB_HTML_TOKENIZER_ERROR_UNCHINUNATVA);
|
|
1176
|
+
break;
|
|
1177
|
+
|
|
1178
|
+
default:
|
|
1179
|
+
break;
|
|
1180
|
+
}
|
|
1181
|
+
|
|
1182
|
+
data++;
|
|
1183
|
+
}
|
|
1184
|
+
|
|
1185
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
1186
|
+
|
|
1187
|
+
return data;
|
|
1188
|
+
}
|
|
1189
|
+
|
|
1190
|
+
/*
|
|
1191
|
+
* 12.2.5.39 After attribute value (quoted) state
|
|
1192
|
+
*/
|
|
1193
|
+
static const lxb_char_t *
|
|
1194
|
+
lxb_html_tokenizer_state_after_attribute_value_quoted(lxb_html_tokenizer_t *tkz,
|
|
1195
|
+
const lxb_char_t *data,
|
|
1196
|
+
const lxb_char_t *end)
|
|
1197
|
+
{
|
|
1198
|
+
switch (*data) {
|
|
1199
|
+
/*
|
|
1200
|
+
* U+0009 CHARACTER TABULATION (tab)
|
|
1201
|
+
* U+000A LINE FEED (LF)
|
|
1202
|
+
* U+000C FORM FEED (FF)
|
|
1203
|
+
* U+000D CARRIAGE RETURN (CR)
|
|
1204
|
+
* U+0020 SPACE
|
|
1205
|
+
*/
|
|
1206
|
+
case 0x09:
|
|
1207
|
+
case 0x0A:
|
|
1208
|
+
case 0x0C:
|
|
1209
|
+
case 0x0D:
|
|
1210
|
+
case 0x20:
|
|
1211
|
+
tkz->state = lxb_html_tokenizer_state_before_attribute_name;
|
|
1212
|
+
|
|
1213
|
+
return (data + 1);
|
|
1214
|
+
|
|
1215
|
+
/* U+002F SOLIDUS (/) */
|
|
1216
|
+
case 0x2F:
|
|
1217
|
+
tkz->state = lxb_html_tokenizer_state_self_closing_start_tag;
|
|
1218
|
+
|
|
1219
|
+
return (data + 1);
|
|
1220
|
+
|
|
1221
|
+
/* U+003E GREATER-THAN SIGN (>) */
|
|
1222
|
+
case 0x3E:
|
|
1223
|
+
tkz->state = lxb_html_tokenizer_state_data_before;
|
|
1224
|
+
|
|
1225
|
+
lxb_html_tokenizer_state_token_done_m(tkz, end);
|
|
1226
|
+
|
|
1227
|
+
return (data + 1);
|
|
1228
|
+
|
|
1229
|
+
/* EOF */
|
|
1230
|
+
case 0x00:
|
|
1231
|
+
if (tkz->is_eof) {
|
|
1232
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
|
|
1233
|
+
LXB_HTML_TOKENIZER_ERROR_EOINTA);
|
|
1234
|
+
return end;
|
|
1235
|
+
}
|
|
1236
|
+
/* fall through */
|
|
1237
|
+
|
|
1238
|
+
default:
|
|
1239
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, data,
|
|
1240
|
+
LXB_HTML_TOKENIZER_ERROR_MIWHBEAT);
|
|
1241
|
+
|
|
1242
|
+
tkz->state = lxb_html_tokenizer_state_before_attribute_name;
|
|
1243
|
+
|
|
1244
|
+
return data;
|
|
1245
|
+
}
|
|
1246
|
+
|
|
1247
|
+
return data;
|
|
1248
|
+
}
|
|
1249
|
+
|
|
1250
|
+
|
|
1251
|
+
const lxb_char_t *
|
|
1252
|
+
lxb_html_tokenizer_state_cr(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
|
|
1253
|
+
const lxb_char_t *end)
|
|
1254
|
+
{
|
|
1255
|
+
lxb_html_tokenizer_state_append_m(tkz, "\n", 1);
|
|
1256
|
+
|
|
1257
|
+
if (*data == 0x0A) {
|
|
1258
|
+
data++;
|
|
1259
|
+
}
|
|
1260
|
+
|
|
1261
|
+
tkz->state = tkz->state_return;
|
|
1262
|
+
|
|
1263
|
+
return data;
|
|
1264
|
+
}
|
|
1265
|
+
|
|
1266
|
+
/*
|
|
1267
|
+
* 12.2.5.40 Self-closing start tag state
|
|
1268
|
+
*/
|
|
1269
|
+
const lxb_char_t *
|
|
1270
|
+
lxb_html_tokenizer_state_self_closing_start_tag(lxb_html_tokenizer_t *tkz,
|
|
1271
|
+
const lxb_char_t *data,
|
|
1272
|
+
const lxb_char_t *end)
|
|
1273
|
+
{
|
|
1274
|
+
switch (*data) {
|
|
1275
|
+
/* U+003E GREATER-THAN SIGN (>) */
|
|
1276
|
+
case 0x3E:
|
|
1277
|
+
tkz->state = lxb_html_tokenizer_state_data_before;
|
|
1278
|
+
tkz->token->type |= LXB_HTML_TOKEN_TYPE_CLOSE_SELF;
|
|
1279
|
+
|
|
1280
|
+
lxb_html_tokenizer_state_token_done_m(tkz, end);
|
|
1281
|
+
|
|
1282
|
+
return (data + 1);
|
|
1283
|
+
|
|
1284
|
+
/* EOF */
|
|
1285
|
+
case 0x00:
|
|
1286
|
+
if (tkz->is_eof) {
|
|
1287
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->token->end,
|
|
1288
|
+
LXB_HTML_TOKENIZER_ERROR_EOINTA);
|
|
1289
|
+
return end;
|
|
1290
|
+
}
|
|
1291
|
+
/* fall through */
|
|
1292
|
+
|
|
1293
|
+
default:
|
|
1294
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, data,
|
|
1295
|
+
LXB_HTML_TOKENIZER_ERROR_UNSOINTA);
|
|
1296
|
+
|
|
1297
|
+
tkz->state = lxb_html_tokenizer_state_before_attribute_name;
|
|
1298
|
+
|
|
1299
|
+
return data;
|
|
1300
|
+
}
|
|
1301
|
+
|
|
1302
|
+
return data;
|
|
1303
|
+
}
|
|
1304
|
+
|
|
1305
|
+
/*
|
|
1306
|
+
* Helper function. No in the specification. For 12.2.5.41 Bogus comment state
|
|
1307
|
+
*/
|
|
1308
|
+
static const lxb_char_t *
|
|
1309
|
+
lxb_html_tokenizer_state_bogus_comment_before(lxb_html_tokenizer_t *tkz,
|
|
1310
|
+
const lxb_char_t *data,
|
|
1311
|
+
const lxb_char_t *end)
|
|
1312
|
+
{
|
|
1313
|
+
tkz->token->tag_id = LXB_TAG__EM_COMMENT;
|
|
1314
|
+
|
|
1315
|
+
tkz->state = lxb_html_tokenizer_state_bogus_comment;
|
|
1316
|
+
|
|
1317
|
+
return data;
|
|
1318
|
+
}
|
|
1319
|
+
|
|
1320
|
+
/*
|
|
1321
|
+
* 12.2.5.41 Bogus comment state
|
|
1322
|
+
*/
|
|
1323
|
+
static const lxb_char_t *
|
|
1324
|
+
lxb_html_tokenizer_state_bogus_comment(lxb_html_tokenizer_t *tkz,
|
|
1325
|
+
const lxb_char_t *data,
|
|
1326
|
+
const lxb_char_t *end)
|
|
1327
|
+
{
|
|
1328
|
+
lxb_html_tokenizer_state_begin_set(tkz, data);
|
|
1329
|
+
|
|
1330
|
+
while (data != end) {
|
|
1331
|
+
switch (*data) {
|
|
1332
|
+
/* U+003E GREATER-THAN SIGN (>) */
|
|
1333
|
+
case 0x3E:
|
|
1334
|
+
tkz->state = lxb_html_tokenizer_state_data_before;
|
|
1335
|
+
|
|
1336
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
1337
|
+
lxb_html_tokenizer_state_token_set_end(tkz, data);
|
|
1338
|
+
lxb_html_tokenizer_state_set_text(tkz);
|
|
1339
|
+
lxb_html_tokenizer_state_token_done_wo_check_m(tkz, end);
|
|
1340
|
+
|
|
1341
|
+
return (data + 1);
|
|
1342
|
+
|
|
1343
|
+
/* U+000D CARRIAGE RETURN (CR) */
|
|
1344
|
+
case 0x0D:
|
|
1345
|
+
if (++data >= end) {
|
|
1346
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
|
|
1347
|
+
|
|
1348
|
+
tkz->state = lxb_html_tokenizer_state_cr;
|
|
1349
|
+
tkz->state_return = lxb_html_tokenizer_state_bogus_comment;
|
|
1350
|
+
|
|
1351
|
+
return data;
|
|
1352
|
+
}
|
|
1353
|
+
|
|
1354
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
1355
|
+
tkz->pos[-1] = 0x0A;
|
|
1356
|
+
|
|
1357
|
+
lxb_html_tokenizer_state_begin_set(tkz, data + 1);
|
|
1358
|
+
|
|
1359
|
+
if (*data != 0x0A) {
|
|
1360
|
+
lxb_html_tokenizer_state_begin_set(tkz, data);
|
|
1361
|
+
data--;
|
|
1362
|
+
}
|
|
1363
|
+
|
|
1364
|
+
break;
|
|
1365
|
+
|
|
1366
|
+
/*
|
|
1367
|
+
* EOF
|
|
1368
|
+
* U+0000 NULL
|
|
1369
|
+
*/
|
|
1370
|
+
case 0x00:
|
|
1371
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
1372
|
+
|
|
1373
|
+
if (tkz->is_eof) {
|
|
1374
|
+
if (tkz->token->begin != NULL) {
|
|
1375
|
+
lxb_html_tokenizer_state_token_set_end_oef(tkz);
|
|
1376
|
+
}
|
|
1377
|
+
|
|
1378
|
+
lxb_html_tokenizer_state_set_text(tkz);
|
|
1379
|
+
lxb_html_tokenizer_state_token_done_wo_check_m(tkz, end);
|
|
1380
|
+
|
|
1381
|
+
return end;
|
|
1382
|
+
}
|
|
1383
|
+
|
|
1384
|
+
lxb_html_tokenizer_state_append_replace_m(tkz);
|
|
1385
|
+
lxb_html_tokenizer_state_begin_set(tkz, data + 1);
|
|
1386
|
+
|
|
1387
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, data,
|
|
1388
|
+
LXB_HTML_TOKENIZER_ERROR_UNNUCH);
|
|
1389
|
+
break;
|
|
1390
|
+
}
|
|
1391
|
+
|
|
1392
|
+
data++;
|
|
1393
|
+
}
|
|
1394
|
+
|
|
1395
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
1396
|
+
|
|
1397
|
+
return data;
|
|
1398
|
+
}
|
|
1399
|
+
|
|
1400
|
+
/*
|
|
1401
|
+
* 12.2.5.42 Markup declaration open state
|
|
1402
|
+
*/
|
|
1403
|
+
static const lxb_char_t *
|
|
1404
|
+
lxb_html_tokenizer_state_markup_declaration_open(lxb_html_tokenizer_t *tkz,
|
|
1405
|
+
const lxb_char_t *data,
|
|
1406
|
+
const lxb_char_t *end)
|
|
1407
|
+
{
|
|
1408
|
+
/* Check first char for change parse state */
|
|
1409
|
+
if (tkz->is_eof == false) {
|
|
1410
|
+
lxb_html_tokenizer_state_token_set_begin(tkz, data);
|
|
1411
|
+
}
|
|
1412
|
+
|
|
1413
|
+
/* U+002D HYPHEN-MINUS characters (-) */
|
|
1414
|
+
if (*data == 0x2D) {
|
|
1415
|
+
if ((end - data) < 2) {
|
|
1416
|
+
tkz->state = lxb_html_tokenizer_state_markup_declaration_comment;
|
|
1417
|
+
return (data + 1);
|
|
1418
|
+
}
|
|
1419
|
+
|
|
1420
|
+
if (data[1] == 0x2D) {
|
|
1421
|
+
tkz->state = lxb_html_tokenizer_state_comment_before_start;
|
|
1422
|
+
return (data + 2);
|
|
1423
|
+
}
|
|
1424
|
+
}
|
|
1425
|
+
/*
|
|
1426
|
+
* ASCII case-insensitive match for the word "DOCTYPE"
|
|
1427
|
+
* U+0044 character (D) or U+0064 character (d)
|
|
1428
|
+
*/
|
|
1429
|
+
else if (*data == 0x44 || *data == 0x64) {
|
|
1430
|
+
if ((end - data) < 7) {
|
|
1431
|
+
tkz->markup = (lxb_char_t *) "doctype";
|
|
1432
|
+
|
|
1433
|
+
tkz->state = lxb_html_tokenizer_state_markup_declaration_doctype;
|
|
1434
|
+
return data;
|
|
1435
|
+
}
|
|
1436
|
+
|
|
1437
|
+
if (lexbor_str_data_ncasecmp((lxb_char_t *) "doctype", data, 7)) {
|
|
1438
|
+
tkz->state = lxb_html_tokenizer_state_doctype_before;
|
|
1439
|
+
return (data + 7);
|
|
1440
|
+
}
|
|
1441
|
+
}
|
|
1442
|
+
/* Case-sensitive match for the string "[CDATA["
|
|
1443
|
+
* (the five uppercase letters "CDATA" with a U+005B LEFT SQUARE BRACKET
|
|
1444
|
+
* character before and after)
|
|
1445
|
+
*/
|
|
1446
|
+
else if (*data == 0x5B) {
|
|
1447
|
+
if ((end - data) < 7) {
|
|
1448
|
+
tkz->markup = (lxb_char_t *) "[CDATA[";
|
|
1449
|
+
|
|
1450
|
+
tkz->state = lxb_html_tokenizer_state_markup_declaration_cdata;
|
|
1451
|
+
return data;
|
|
1452
|
+
}
|
|
1453
|
+
|
|
1454
|
+
if (lexbor_str_data_ncmp((lxb_char_t *) "[CDATA[", data, 7)) {
|
|
1455
|
+
lxb_ns_id_t ns = lxb_html_tokenizer_current_namespace(tkz);
|
|
1456
|
+
|
|
1457
|
+
if (ns != LXB_NS_HTML && ns != LXB_NS__UNDEF) {
|
|
1458
|
+
data += 7;
|
|
1459
|
+
|
|
1460
|
+
lxb_html_tokenizer_state_token_set_begin(tkz, data);
|
|
1461
|
+
|
|
1462
|
+
tkz->state = lxb_html_tokenizer_state_cdata_section_before;
|
|
1463
|
+
|
|
1464
|
+
return data;
|
|
1465
|
+
}
|
|
1466
|
+
|
|
1467
|
+
tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
|
|
1468
|
+
|
|
1469
|
+
return data;
|
|
1470
|
+
}
|
|
1471
|
+
}
|
|
1472
|
+
|
|
1473
|
+
if (tkz->is_eof) {
|
|
1474
|
+
lxb_html_tokenizer_state_token_set_end_oef(tkz);
|
|
1475
|
+
|
|
1476
|
+
tkz->token->begin = tkz->token->end;
|
|
1477
|
+
}
|
|
1478
|
+
|
|
1479
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, data,
|
|
1480
|
+
LXB_HTML_TOKENIZER_ERROR_INOPCO);
|
|
1481
|
+
|
|
1482
|
+
tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
|
|
1483
|
+
|
|
1484
|
+
return data;
|
|
1485
|
+
}
|
|
1486
|
+
|
|
1487
|
+
/*
|
|
1488
|
+
* Helper function. No in the specification. For 12.2.5.42
|
|
1489
|
+
* For a comment tag <!--
|
|
1490
|
+
*/
|
|
1491
|
+
static const lxb_char_t *
|
|
1492
|
+
lxb_html_tokenizer_state_markup_declaration_comment(lxb_html_tokenizer_t *tkz,
|
|
1493
|
+
const lxb_char_t *data,
|
|
1494
|
+
const lxb_char_t *end)
|
|
1495
|
+
{
|
|
1496
|
+
/* U+002D HYPHEN-MINUS characters (-) */
|
|
1497
|
+
if (*data == 0x2D) {
|
|
1498
|
+
tkz->state = lxb_html_tokenizer_state_comment_before_start;
|
|
1499
|
+
return (data + 1);
|
|
1500
|
+
}
|
|
1501
|
+
|
|
1502
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, data,
|
|
1503
|
+
LXB_HTML_TOKENIZER_ERROR_INOPCO);
|
|
1504
|
+
|
|
1505
|
+
tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
|
|
1506
|
+
return data;
|
|
1507
|
+
}
|
|
1508
|
+
|
|
1509
|
+
/*
|
|
1510
|
+
* Helper function. No in the specification. For 12.2.5.42
|
|
1511
|
+
* For a DOCTYPE tag <!DOCTYPE
|
|
1512
|
+
*/
|
|
1513
|
+
static const lxb_char_t *
|
|
1514
|
+
lxb_html_tokenizer_state_markup_declaration_doctype(lxb_html_tokenizer_t *tkz,
|
|
1515
|
+
const lxb_char_t *data,
|
|
1516
|
+
const lxb_char_t *end)
|
|
1517
|
+
{
|
|
1518
|
+
const lxb_char_t *pos;
|
|
1519
|
+
pos = lexbor_str_data_ncasecmp_first(tkz->markup, data, (end - data));
|
|
1520
|
+
|
|
1521
|
+
if (pos == NULL) {
|
|
1522
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, data,
|
|
1523
|
+
LXB_HTML_TOKENIZER_ERROR_INOPCO);
|
|
1524
|
+
|
|
1525
|
+
tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
|
|
1526
|
+
return data;
|
|
1527
|
+
}
|
|
1528
|
+
|
|
1529
|
+
if (*pos == '\0') {
|
|
1530
|
+
data = (data + (pos - tkz->markup));
|
|
1531
|
+
|
|
1532
|
+
tkz->state = lxb_html_tokenizer_state_doctype_before;
|
|
1533
|
+
return data;
|
|
1534
|
+
}
|
|
1535
|
+
|
|
1536
|
+
tkz->markup = pos;
|
|
1537
|
+
|
|
1538
|
+
return end;
|
|
1539
|
+
}
|
|
1540
|
+
|
|
1541
|
+
/*
|
|
1542
|
+
* Helper function. No in the specification. For 12.2.5.42
|
|
1543
|
+
* For a CDATA tag <![CDATA[
|
|
1544
|
+
*/
|
|
1545
|
+
static const lxb_char_t *
|
|
1546
|
+
lxb_html_tokenizer_state_markup_declaration_cdata(lxb_html_tokenizer_t *tkz,
|
|
1547
|
+
const lxb_char_t *data,
|
|
1548
|
+
const lxb_char_t *end)
|
|
1549
|
+
{
|
|
1550
|
+
const lxb_char_t *pos;
|
|
1551
|
+
pos = lexbor_str_data_ncasecmp_first(tkz->markup, data, (end - data));
|
|
1552
|
+
|
|
1553
|
+
if (pos == NULL) {
|
|
1554
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, data,
|
|
1555
|
+
LXB_HTML_TOKENIZER_ERROR_INOPCO);
|
|
1556
|
+
|
|
1557
|
+
tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
|
|
1558
|
+
return data;
|
|
1559
|
+
}
|
|
1560
|
+
|
|
1561
|
+
if (*pos == '\0') {
|
|
1562
|
+
lxb_ns_id_t ns = lxb_html_tokenizer_current_namespace(tkz);
|
|
1563
|
+
|
|
1564
|
+
if (ns != LXB_NS_HTML && ns != LXB_NS__UNDEF) {
|
|
1565
|
+
data = (data + (pos - tkz->markup));
|
|
1566
|
+
|
|
1567
|
+
tkz->state = lxb_html_tokenizer_state_cdata_section_before;
|
|
1568
|
+
return data;
|
|
1569
|
+
}
|
|
1570
|
+
|
|
1571
|
+
lxb_html_tokenizer_state_append_m(tkz, "[CDATA", 6);
|
|
1572
|
+
|
|
1573
|
+
tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
|
|
1574
|
+
return data;
|
|
1575
|
+
}
|
|
1576
|
+
|
|
1577
|
+
tkz->markup = pos;
|
|
1578
|
+
|
|
1579
|
+
return end;
|
|
1580
|
+
}
|
|
1581
|
+
|
|
1582
|
+
/*
|
|
1583
|
+
* Helper function. No in the specification. For 12.2.5.69
|
|
1584
|
+
*/
|
|
1585
|
+
static const lxb_char_t *
|
|
1586
|
+
lxb_html_tokenizer_state_cdata_section_before(lxb_html_tokenizer_t *tkz,
|
|
1587
|
+
const lxb_char_t *data,
|
|
1588
|
+
const lxb_char_t *end)
|
|
1589
|
+
{
|
|
1590
|
+
if (tkz->is_eof == false) {
|
|
1591
|
+
lxb_html_tokenizer_state_token_set_begin(tkz, data);
|
|
1592
|
+
}
|
|
1593
|
+
else {
|
|
1594
|
+
lxb_html_tokenizer_state_token_set_begin(tkz, tkz->last);
|
|
1595
|
+
}
|
|
1596
|
+
|
|
1597
|
+
tkz->token->tag_id = LXB_TAG__TEXT;
|
|
1598
|
+
|
|
1599
|
+
tkz->state = lxb_html_tokenizer_state_cdata_section;
|
|
1600
|
+
|
|
1601
|
+
return data;
|
|
1602
|
+
}
|
|
1603
|
+
|
|
1604
|
+
/*
|
|
1605
|
+
* 12.2.5.69 CDATA section state
|
|
1606
|
+
*/
|
|
1607
|
+
static const lxb_char_t *
|
|
1608
|
+
lxb_html_tokenizer_state_cdata_section(lxb_html_tokenizer_t *tkz,
|
|
1609
|
+
const lxb_char_t *data,
|
|
1610
|
+
const lxb_char_t *end)
|
|
1611
|
+
{
|
|
1612
|
+
lxb_html_tokenizer_state_begin_set(tkz, data);
|
|
1613
|
+
|
|
1614
|
+
while (data != end) {
|
|
1615
|
+
switch (*data) {
|
|
1616
|
+
/* U+005D RIGHT SQUARE BRACKET (]) */
|
|
1617
|
+
case 0x5D:
|
|
1618
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
1619
|
+
lxb_html_tokenizer_state_token_set_end(tkz, data);
|
|
1620
|
+
|
|
1621
|
+
tkz->state = lxb_html_tokenizer_state_cdata_section_bracket;
|
|
1622
|
+
return (data + 1);
|
|
1623
|
+
|
|
1624
|
+
/* U+000D CARRIAGE RETURN (CR) */
|
|
1625
|
+
case 0x0D:
|
|
1626
|
+
if (++data >= end) {
|
|
1627
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data - 1);
|
|
1628
|
+
|
|
1629
|
+
tkz->state = lxb_html_tokenizer_state_cr;
|
|
1630
|
+
tkz->state_return = lxb_html_tokenizer_state_cdata_section;
|
|
1631
|
+
|
|
1632
|
+
return data;
|
|
1633
|
+
}
|
|
1634
|
+
|
|
1635
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
1636
|
+
tkz->pos[-1] = 0x0A;
|
|
1637
|
+
|
|
1638
|
+
lxb_html_tokenizer_state_begin_set(tkz, data + 1);
|
|
1639
|
+
|
|
1640
|
+
if (*data != 0x0A) {
|
|
1641
|
+
lxb_html_tokenizer_state_begin_set(tkz, data);
|
|
1642
|
+
data--;
|
|
1643
|
+
}
|
|
1644
|
+
|
|
1645
|
+
break;
|
|
1646
|
+
|
|
1647
|
+
/* EOF */
|
|
1648
|
+
case 0x00:
|
|
1649
|
+
if (tkz->is_eof) {
|
|
1650
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->last,
|
|
1651
|
+
LXB_HTML_TOKENIZER_ERROR_EOINCD);
|
|
1652
|
+
|
|
1653
|
+
if (tkz->token->begin != NULL) {
|
|
1654
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
1655
|
+
lxb_html_tokenizer_state_token_set_end_oef(tkz);
|
|
1656
|
+
}
|
|
1657
|
+
|
|
1658
|
+
lxb_html_tokenizer_state_set_text(tkz);
|
|
1659
|
+
lxb_html_tokenizer_state_token_done_m(tkz, end);
|
|
1660
|
+
|
|
1661
|
+
return end;
|
|
1662
|
+
}
|
|
1663
|
+
|
|
1664
|
+
if (SIZE_MAX - tkz->token->null_count < 1) {
|
|
1665
|
+
tkz->status = LXB_STATUS_ERROR_OVERFLOW;
|
|
1666
|
+
return end;
|
|
1667
|
+
}
|
|
1668
|
+
|
|
1669
|
+
tkz->token->null_count++;
|
|
1670
|
+
|
|
1671
|
+
break;
|
|
1672
|
+
|
|
1673
|
+
default:
|
|
1674
|
+
break;
|
|
1675
|
+
}
|
|
1676
|
+
|
|
1677
|
+
data++;
|
|
1678
|
+
}
|
|
1679
|
+
|
|
1680
|
+
lxb_html_tokenizer_state_append_data_m(tkz, data);
|
|
1681
|
+
|
|
1682
|
+
return data;
|
|
1683
|
+
}
|
|
1684
|
+
|
|
1685
|
+
/*
|
|
1686
|
+
* 12.2.5.70 CDATA section bracket state
|
|
1687
|
+
*/
|
|
1688
|
+
static const lxb_char_t *
|
|
1689
|
+
lxb_html_tokenizer_state_cdata_section_bracket(lxb_html_tokenizer_t *tkz,
|
|
1690
|
+
const lxb_char_t *data,
|
|
1691
|
+
const lxb_char_t *end)
|
|
1692
|
+
{
|
|
1693
|
+
/* U+005D RIGHT SQUARE BRACKET (]) */
|
|
1694
|
+
if (*data == 0x5D) {
|
|
1695
|
+
tkz->state = lxb_html_tokenizer_state_cdata_section_end;
|
|
1696
|
+
return (data + 1);
|
|
1697
|
+
}
|
|
1698
|
+
|
|
1699
|
+
lxb_html_tokenizer_state_append_m(tkz, "]", 1);
|
|
1700
|
+
|
|
1701
|
+
tkz->state = lxb_html_tokenizer_state_cdata_section;
|
|
1702
|
+
|
|
1703
|
+
return data;
|
|
1704
|
+
}
|
|
1705
|
+
|
|
1706
|
+
/*
|
|
1707
|
+
* 12.2.5.71 CDATA section end state
|
|
1708
|
+
*/
|
|
1709
|
+
static const lxb_char_t *
|
|
1710
|
+
lxb_html_tokenizer_state_cdata_section_end(lxb_html_tokenizer_t *tkz,
|
|
1711
|
+
const lxb_char_t *data,
|
|
1712
|
+
const lxb_char_t *end)
|
|
1713
|
+
{
|
|
1714
|
+
/* U+005D RIGHT SQUARE BRACKET (]) */
|
|
1715
|
+
if (*data == 0x5D) {
|
|
1716
|
+
lxb_html_tokenizer_state_append_m(tkz, data, 1);
|
|
1717
|
+
return (data + 1);
|
|
1718
|
+
}
|
|
1719
|
+
/* U+003E GREATER-THAN SIGN character */
|
|
1720
|
+
else if (*data == 0x3E) {
|
|
1721
|
+
tkz->state = lxb_html_tokenizer_state_data_before;
|
|
1722
|
+
|
|
1723
|
+
lxb_html_tokenizer_state_set_text(tkz);
|
|
1724
|
+
lxb_html_tokenizer_state_token_done_m(tkz, end);
|
|
1725
|
+
|
|
1726
|
+
return (data + 1);
|
|
1727
|
+
}
|
|
1728
|
+
|
|
1729
|
+
lxb_html_tokenizer_state_append_m(tkz, "]]", 2);
|
|
1730
|
+
|
|
1731
|
+
tkz->state = lxb_html_tokenizer_state_cdata_section;
|
|
1732
|
+
|
|
1733
|
+
return data;
|
|
1734
|
+
}
|
|
1735
|
+
|
|
1736
|
+
/*
|
|
1737
|
+
* 12.2.5.72 Character reference state
|
|
1738
|
+
*/
|
|
1739
|
+
const lxb_char_t *
|
|
1740
|
+
lxb_html_tokenizer_state_char_ref(lxb_html_tokenizer_t *tkz,
|
|
1741
|
+
const lxb_char_t *data, const lxb_char_t *end)
|
|
1742
|
+
{
|
|
1743
|
+
tkz->is_attribute = false;
|
|
1744
|
+
|
|
1745
|
+
return _lxb_html_tokenizer_state_char_ref(tkz, data, end);
|
|
1746
|
+
}
|
|
1747
|
+
|
|
1748
|
+
static const lxb_char_t *
|
|
1749
|
+
lxb_html_tokenizer_state_char_ref_attr(lxb_html_tokenizer_t *tkz,
|
|
1750
|
+
const lxb_char_t *data,
|
|
1751
|
+
const lxb_char_t *end)
|
|
1752
|
+
{
|
|
1753
|
+
tkz->is_attribute = true;
|
|
1754
|
+
|
|
1755
|
+
return _lxb_html_tokenizer_state_char_ref(tkz, data, end);
|
|
1756
|
+
}
|
|
1757
|
+
|
|
1758
|
+
static const lxb_char_t *
|
|
1759
|
+
_lxb_html_tokenizer_state_char_ref(lxb_html_tokenizer_t *tkz,
|
|
1760
|
+
const lxb_char_t *data,
|
|
1761
|
+
const lxb_char_t *end)
|
|
1762
|
+
{
|
|
1763
|
+
/* ASCII alphanumeric */
|
|
1764
|
+
if (lexbor_str_res_alphanumeric_character[ *data ] != LEXBOR_STR_RES_SLIP) {
|
|
1765
|
+
tkz->entity = &lxb_html_tokenizer_res_entities_sbst[1];
|
|
1766
|
+
tkz->entity_match = NULL;
|
|
1767
|
+
tkz->entity_start = (tkz->pos - 1) - tkz->start;
|
|
1768
|
+
|
|
1769
|
+
tkz->state = lxb_html_tokenizer_state_char_ref_named;
|
|
1770
|
+
|
|
1771
|
+
return data;
|
|
1772
|
+
}
|
|
1773
|
+
/* U+0023 NUMBER SIGN (#) */
|
|
1774
|
+
else if (*data == 0x23) {
|
|
1775
|
+
tkz->markup = data;
|
|
1776
|
+
tkz->entity_start = (tkz->pos - 1) - tkz->start;
|
|
1777
|
+
|
|
1778
|
+
lxb_html_tokenizer_state_append_m(tkz, data, 1);
|
|
1779
|
+
|
|
1780
|
+
tkz->state = lxb_html_tokenizer_state_char_ref_numeric;
|
|
1781
|
+
|
|
1782
|
+
return (data + 1);
|
|
1783
|
+
}
|
|
1784
|
+
else {
|
|
1785
|
+
tkz->state = tkz->state_return;
|
|
1786
|
+
}
|
|
1787
|
+
|
|
1788
|
+
return data;
|
|
1789
|
+
}
|
|
1790
|
+
|
|
1791
|
+
/*
|
|
1792
|
+
* 12.2.5.73 Named character reference state
|
|
1793
|
+
*
|
|
1794
|
+
* The slowest part in HTML parsing!!!
|
|
1795
|
+
*
|
|
1796
|
+
* This option works correctly and passes all tests (stream parsing too).
|
|
1797
|
+
* We must seriously think about how to accelerate this part.
|
|
1798
|
+
*/
|
|
1799
|
+
static const lxb_char_t *
|
|
1800
|
+
lxb_html_tokenizer_state_char_ref_named(lxb_html_tokenizer_t *tkz,
|
|
1801
|
+
const lxb_char_t *data,
|
|
1802
|
+
const lxb_char_t *end)
|
|
1803
|
+
{
|
|
1804
|
+
size_t size, tail_size;
|
|
1805
|
+
lxb_char_t *start;
|
|
1806
|
+
const lexbor_sbst_entry_static_t *entry = tkz->entity;
|
|
1807
|
+
|
|
1808
|
+
const lxb_char_t *begin = data;
|
|
1809
|
+
|
|
1810
|
+
while (data < end) {
|
|
1811
|
+
entry = lexbor_sbst_entry_static_find(lxb_html_tokenizer_res_entities_sbst,
|
|
1812
|
+
entry, *data);
|
|
1813
|
+
if (entry == NULL) {
|
|
1814
|
+
lxb_html_tokenizer_state_append_m(tkz, begin, (data - begin));
|
|
1815
|
+
goto done;
|
|
1816
|
+
}
|
|
1817
|
+
|
|
1818
|
+
if (entry->value != NULL) {
|
|
1819
|
+
tkz->entity_end = (tkz->pos + (data - begin)) - tkz->start;
|
|
1820
|
+
tkz->entity_match = entry;
|
|
1821
|
+
}
|
|
1822
|
+
|
|
1823
|
+
entry = &lxb_html_tokenizer_res_entities_sbst[ entry->next ];
|
|
1824
|
+
|
|
1825
|
+
data++;
|
|
1826
|
+
}
|
|
1827
|
+
|
|
1828
|
+
/* If entry not NULL and buffer empty, then wait next buffer. */
|
|
1829
|
+
tkz->entity = entry;
|
|
1830
|
+
|
|
1831
|
+
lxb_html_tokenizer_state_append_m(tkz, begin, (end - begin));
|
|
1832
|
+
return data;
|
|
1833
|
+
|
|
1834
|
+
done:
|
|
1835
|
+
|
|
1836
|
+
/* If we have bad entity */
|
|
1837
|
+
if (tkz->entity_match == NULL) {
|
|
1838
|
+
tkz->state = lxb_html_tokenizer_state_char_ref_ambiguous_ampersand;
|
|
1839
|
+
|
|
1840
|
+
return data;
|
|
1841
|
+
}
|
|
1842
|
+
|
|
1843
|
+
tkz->state = tkz->state_return;
|
|
1844
|
+
|
|
1845
|
+
/*
|
|
1846
|
+
* If the character reference was consumed as part of an attribute,
|
|
1847
|
+
* and the last character matched is not a U+003B SEMICOLON character (;),
|
|
1848
|
+
* and the next input character is either a U+003D EQUALS SIGN character (=)
|
|
1849
|
+
* or an ASCII alphanumeric, then, for historical reasons,
|
|
1850
|
+
* flush code points consumed as a character reference
|
|
1851
|
+
* and switch to the return state.
|
|
1852
|
+
*/
|
|
1853
|
+
/* U+003B SEMICOLON character (;) */
|
|
1854
|
+
if (tkz->is_attribute && tkz->entity_match->key != 0x3B) {
|
|
1855
|
+
/* U+003D EQUALS SIGN character (=) or ASCII alphanumeric */
|
|
1856
|
+
if (*data == 0x3D
|
|
1857
|
+
|| lexbor_str_res_alphanumeric_character[*data] != LEXBOR_STR_RES_SLIP)
|
|
1858
|
+
{
|
|
1859
|
+
return data;
|
|
1860
|
+
}
|
|
1861
|
+
}
|
|
1862
|
+
|
|
1863
|
+
if (tkz->entity_match->key != 0x3B) {
|
|
1864
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, data,
|
|
1865
|
+
LXB_HTML_TOKENIZER_ERROR_MISEAFCHRE);
|
|
1866
|
+
}
|
|
1867
|
+
|
|
1868
|
+
start = &tkz->start[tkz->entity_start];
|
|
1869
|
+
|
|
1870
|
+
size = tkz->pos - start;
|
|
1871
|
+
tail_size = tkz->pos - &tkz->start[tkz->entity_end] - 1;
|
|
1872
|
+
|
|
1873
|
+
if (tail_size != 0) {
|
|
1874
|
+
if ((size + tail_size) + start > tkz->end) {
|
|
1875
|
+
if (lxb_html_tokenizer_temp_realloc(tkz, size) != LXB_STATUS_OK) {
|
|
1876
|
+
return end;
|
|
1877
|
+
}
|
|
1878
|
+
start = &tkz->start[tkz->entity_start];
|
|
1879
|
+
}
|
|
1880
|
+
|
|
1881
|
+
memmove(start + tkz->entity_match->value_len,
|
|
1882
|
+
tkz->pos - tail_size, tail_size);
|
|
1883
|
+
}
|
|
1884
|
+
|
|
1885
|
+
memcpy(start, tkz->entity_match->value, tkz->entity_match->value_len);
|
|
1886
|
+
|
|
1887
|
+
tkz->pos = start + (tkz->entity_match->value_len + tail_size);
|
|
1888
|
+
|
|
1889
|
+
return data;
|
|
1890
|
+
}
|
|
1891
|
+
|
|
1892
|
+
/*
|
|
1893
|
+
* 12.2.5.74 Ambiguous ampersand state
|
|
1894
|
+
*/
|
|
1895
|
+
static const lxb_char_t *
|
|
1896
|
+
lxb_html_tokenizer_state_char_ref_ambiguous_ampersand(lxb_html_tokenizer_t *tkz,
|
|
1897
|
+
const lxb_char_t *data,
|
|
1898
|
+
const lxb_char_t *end)
|
|
1899
|
+
{
|
|
1900
|
+
/* ASCII alphanumeric */
|
|
1901
|
+
/* Skipped, not need */
|
|
1902
|
+
|
|
1903
|
+
/* U+003B SEMICOLON (;) */
|
|
1904
|
+
if (*data == 0x3B) {
|
|
1905
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, data,
|
|
1906
|
+
LXB_HTML_TOKENIZER_ERROR_UNNACHRE);
|
|
1907
|
+
}
|
|
1908
|
+
|
|
1909
|
+
tkz->state = tkz->state_return;
|
|
1910
|
+
|
|
1911
|
+
return data;
|
|
1912
|
+
}
|
|
1913
|
+
|
|
1914
|
+
/*
|
|
1915
|
+
* 12.2.5.75 Numeric character reference state
|
|
1916
|
+
*/
|
|
1917
|
+
static const lxb_char_t *
|
|
1918
|
+
lxb_html_tokenizer_state_char_ref_numeric(lxb_html_tokenizer_t *tkz,
|
|
1919
|
+
const lxb_char_t *data,
|
|
1920
|
+
const lxb_char_t *end)
|
|
1921
|
+
{
|
|
1922
|
+
tkz->entity_number = 0;
|
|
1923
|
+
|
|
1924
|
+
/*
|
|
1925
|
+
* U+0078 LATIN SMALL LETTER X
|
|
1926
|
+
* U+0058 LATIN CAPITAL LETTER X
|
|
1927
|
+
*/
|
|
1928
|
+
if (*data == 0x78 || *data == 0x58) {
|
|
1929
|
+
lxb_html_tokenizer_state_append_m(tkz, data, 1);
|
|
1930
|
+
|
|
1931
|
+
tkz->state = lxb_html_tokenizer_state_char_ref_hexademical_start;
|
|
1932
|
+
|
|
1933
|
+
return (data + 1);
|
|
1934
|
+
}
|
|
1935
|
+
|
|
1936
|
+
tkz->state = lxb_html_tokenizer_state_char_ref_decimal_start;
|
|
1937
|
+
|
|
1938
|
+
return data;
|
|
1939
|
+
}
|
|
1940
|
+
|
|
1941
|
+
/*
|
|
1942
|
+
* 12.2.5.76 Hexadecimal character reference start state
|
|
1943
|
+
*/
|
|
1944
|
+
static const lxb_char_t *
|
|
1945
|
+
lxb_html_tokenizer_state_char_ref_hexademical_start(lxb_html_tokenizer_t *tkz,
|
|
1946
|
+
const lxb_char_t *data,
|
|
1947
|
+
const lxb_char_t *end)
|
|
1948
|
+
{
|
|
1949
|
+
/* ASCII hex digit */
|
|
1950
|
+
if (lexbor_str_res_map_hex[ *data ] != LEXBOR_STR_RES_SLIP) {
|
|
1951
|
+
tkz->state = lxb_html_tokenizer_state_char_ref_hexademical;
|
|
1952
|
+
}
|
|
1953
|
+
else {
|
|
1954
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, data,
|
|
1955
|
+
LXB_HTML_TOKENIZER_ERROR_ABOFDIINNUCHRE);
|
|
1956
|
+
|
|
1957
|
+
tkz->state = tkz->state_return;
|
|
1958
|
+
}
|
|
1959
|
+
|
|
1960
|
+
return data;
|
|
1961
|
+
}
|
|
1962
|
+
|
|
1963
|
+
/*
|
|
1964
|
+
* 12.2.5.77 Decimal character reference start state
|
|
1965
|
+
*/
|
|
1966
|
+
static const lxb_char_t *
|
|
1967
|
+
lxb_html_tokenizer_state_char_ref_decimal_start(lxb_html_tokenizer_t *tkz,
|
|
1968
|
+
const lxb_char_t *data,
|
|
1969
|
+
const lxb_char_t *end)
|
|
1970
|
+
{
|
|
1971
|
+
/* ASCII digit */
|
|
1972
|
+
if (lexbor_str_res_map_num[ *data ] != LEXBOR_STR_RES_SLIP) {
|
|
1973
|
+
tkz->state = lxb_html_tokenizer_state_char_ref_decimal;
|
|
1974
|
+
}
|
|
1975
|
+
else {
|
|
1976
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, data,
|
|
1977
|
+
LXB_HTML_TOKENIZER_ERROR_ABOFDIINNUCHRE);
|
|
1978
|
+
|
|
1979
|
+
tkz->state = tkz->state_return;
|
|
1980
|
+
}
|
|
1981
|
+
|
|
1982
|
+
return data;
|
|
1983
|
+
}
|
|
1984
|
+
|
|
1985
|
+
/*
|
|
1986
|
+
* 12.2.5.78 Hexadecimal character reference state
|
|
1987
|
+
*/
|
|
1988
|
+
static const lxb_char_t *
|
|
1989
|
+
lxb_html_tokenizer_state_char_ref_hexademical(lxb_html_tokenizer_t *tkz,
|
|
1990
|
+
const lxb_char_t *data,
|
|
1991
|
+
const lxb_char_t *end)
|
|
1992
|
+
{
|
|
1993
|
+
while (data != end) {
|
|
1994
|
+
if (lexbor_str_res_map_hex[ *data ] == LEXBOR_STR_RES_SLIP) {
|
|
1995
|
+
tkz->state = tkz->state_return;
|
|
1996
|
+
|
|
1997
|
+
if (*data == ';') {
|
|
1998
|
+
data++;
|
|
1999
|
+
}
|
|
2000
|
+
|
|
2001
|
+
return lxb_html_tokenizer_state_char_ref_numeric_end(tkz, data, end);
|
|
2002
|
+
}
|
|
2003
|
+
|
|
2004
|
+
if (tkz->entity_number <= 0x10FFFF) {
|
|
2005
|
+
tkz->entity_number <<= 4;
|
|
2006
|
+
tkz->entity_number |= lexbor_str_res_map_hex[ *data ];
|
|
2007
|
+
}
|
|
2008
|
+
|
|
2009
|
+
data++;
|
|
2010
|
+
}
|
|
2011
|
+
|
|
2012
|
+
return data;
|
|
2013
|
+
}
|
|
2014
|
+
|
|
2015
|
+
/*
|
|
2016
|
+
* 12.2.5.79 Decimal character reference state
|
|
2017
|
+
*/
|
|
2018
|
+
static const lxb_char_t *
|
|
2019
|
+
lxb_html_tokenizer_state_char_ref_decimal(lxb_html_tokenizer_t *tkz,
|
|
2020
|
+
const lxb_char_t *data,
|
|
2021
|
+
const lxb_char_t *end)
|
|
2022
|
+
{
|
|
2023
|
+
while (data != end) {
|
|
2024
|
+
if (lexbor_str_res_map_num[ *data ] == LEXBOR_STR_RES_SLIP) {
|
|
2025
|
+
tkz->state = tkz->state_return;
|
|
2026
|
+
|
|
2027
|
+
if (*data == ';') {
|
|
2028
|
+
data++;
|
|
2029
|
+
}
|
|
2030
|
+
|
|
2031
|
+
return lxb_html_tokenizer_state_char_ref_numeric_end(tkz, data, end);
|
|
2032
|
+
}
|
|
2033
|
+
|
|
2034
|
+
if (tkz->entity_number <= 0x10FFFF) {
|
|
2035
|
+
tkz->entity_number = lexbor_str_res_map_num[ *data ]
|
|
2036
|
+
+ tkz->entity_number * 10;
|
|
2037
|
+
}
|
|
2038
|
+
|
|
2039
|
+
data++;
|
|
2040
|
+
}
|
|
2041
|
+
|
|
2042
|
+
return data;
|
|
2043
|
+
}
|
|
2044
|
+
|
|
2045
|
+
/*
|
|
2046
|
+
* 12.2.5.80 Numeric character reference end state
|
|
2047
|
+
*/
|
|
2048
|
+
static const lxb_char_t *
|
|
2049
|
+
lxb_html_tokenizer_state_char_ref_numeric_end(lxb_html_tokenizer_t *tkz,
|
|
2050
|
+
const lxb_char_t *data,
|
|
2051
|
+
const lxb_char_t *end)
|
|
2052
|
+
{
|
|
2053
|
+
lxb_char_t *start = &tkz->start[tkz->entity_start];
|
|
2054
|
+
|
|
2055
|
+
if ((start + 4) > tkz->end) {
|
|
2056
|
+
if(lxb_html_tokenizer_temp_realloc(tkz, 4)) {
|
|
2057
|
+
return end;
|
|
2058
|
+
}
|
|
2059
|
+
|
|
2060
|
+
start = &tkz->start[tkz->entity_start];
|
|
2061
|
+
}
|
|
2062
|
+
|
|
2063
|
+
if (tkz->entity_number == 0x00) {
|
|
2064
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->markup,
|
|
2065
|
+
LXB_HTML_TOKENIZER_ERROR_NUCHRE);
|
|
2066
|
+
|
|
2067
|
+
goto xFFFD;
|
|
2068
|
+
}
|
|
2069
|
+
else if (tkz->entity_number > 0x10FFFF) {
|
|
2070
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->markup,
|
|
2071
|
+
LXB_HTML_TOKENIZER_ERROR_CHREOUUNRA);
|
|
2072
|
+
|
|
2073
|
+
goto xFFFD;
|
|
2074
|
+
}
|
|
2075
|
+
else if (tkz->entity_number >= 0xD800 && tkz->entity_number <= 0xDFFF) {
|
|
2076
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->markup,
|
|
2077
|
+
LXB_HTML_TOKENIZER_ERROR_SUCHRE);
|
|
2078
|
+
|
|
2079
|
+
goto xFFFD;
|
|
2080
|
+
}
|
|
2081
|
+
else if (tkz->entity_number >= 0xFDD0 && tkz->entity_number <= 0xFDEF) {
|
|
2082
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->markup,
|
|
2083
|
+
LXB_HTML_TOKENIZER_ERROR_NOCHRE);
|
|
2084
|
+
}
|
|
2085
|
+
|
|
2086
|
+
switch (tkz->entity_number) {
|
|
2087
|
+
case 0xFFFE: case 0xFFFF: case 0x1FFFE: case 0x1FFFF: case 0x2FFFE:
|
|
2088
|
+
case 0x2FFFF: case 0x3FFFE: case 0x3FFFF: case 0x4FFFE: case 0x4FFFF:
|
|
2089
|
+
case 0x5FFFE: case 0x5FFFF: case 0x6FFFE: case 0x6FFFF: case 0x7FFFE:
|
|
2090
|
+
case 0x7FFFF: case 0x8FFFE: case 0x8FFFF: case 0x9FFFE: case 0x9FFFF:
|
|
2091
|
+
case 0xAFFFE: case 0xAFFFF: case 0xBFFFE: case 0xBFFFF: case 0xCFFFE:
|
|
2092
|
+
case 0xCFFFF: case 0xDFFFE: case 0xDFFFF: case 0xEFFFE: case 0xEFFFF:
|
|
2093
|
+
case 0xFFFFE: case 0xFFFFF:
|
|
2094
|
+
case 0x10FFFE:
|
|
2095
|
+
case 0x10FFFF:
|
|
2096
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->markup,
|
|
2097
|
+
LXB_HTML_TOKENIZER_ERROR_NOCHRE);
|
|
2098
|
+
break;
|
|
2099
|
+
|
|
2100
|
+
default:
|
|
2101
|
+
break;
|
|
2102
|
+
}
|
|
2103
|
+
|
|
2104
|
+
if (tkz->entity_number <= 0x1F
|
|
2105
|
+
|| (tkz->entity_number >= 0x7F && tkz->entity_number <= 0x9F))
|
|
2106
|
+
{
|
|
2107
|
+
lxb_html_tokenizer_error_add(tkz->parse_errors, tkz->markup,
|
|
2108
|
+
LXB_HTML_TOKENIZER_ERROR_COCHRE);
|
|
2109
|
+
}
|
|
2110
|
+
|
|
2111
|
+
if (tkz->entity_number <= 0x9F) {
|
|
2112
|
+
tkz->entity_number = (uint32_t) lexbor_str_res_replacement_character[tkz->entity_number];
|
|
2113
|
+
}
|
|
2114
|
+
|
|
2115
|
+
start += lxb_html_tokenizer_state_to_ascii_utf_8(tkz->entity_number, start);
|
|
2116
|
+
|
|
2117
|
+
tkz->pos = start;
|
|
2118
|
+
|
|
2119
|
+
return data;
|
|
2120
|
+
|
|
2121
|
+
xFFFD:
|
|
2122
|
+
|
|
2123
|
+
memcpy(start, lexbor_str_res_ansi_replacement_character,
|
|
2124
|
+
sizeof(lexbor_str_res_ansi_replacement_character) - 1);
|
|
2125
|
+
|
|
2126
|
+
tkz->pos = start + sizeof(lexbor_str_res_ansi_replacement_character) - 1;
|
|
2127
|
+
|
|
2128
|
+
return data;
|
|
2129
|
+
}
|
|
2130
|
+
|
|
2131
|
+
static size_t
|
|
2132
|
+
lxb_html_tokenizer_state_to_ascii_utf_8(size_t codepoint, lxb_char_t *data)
|
|
2133
|
+
{
|
|
2134
|
+
/* 0x80 -- 10xxxxxx */
|
|
2135
|
+
/* 0xC0 -- 110xxxxx */
|
|
2136
|
+
/* 0xE0 -- 1110xxxx */
|
|
2137
|
+
/* 0xF0 -- 11110xxx */
|
|
2138
|
+
|
|
2139
|
+
if (codepoint <= 0x0000007F) {
|
|
2140
|
+
/* 0xxxxxxx */
|
|
2141
|
+
data[0] = (char) codepoint;
|
|
2142
|
+
|
|
2143
|
+
return 1;
|
|
2144
|
+
}
|
|
2145
|
+
else if (codepoint <= 0x000007FF) {
|
|
2146
|
+
/* 110xxxxx 10xxxxxx */
|
|
2147
|
+
data[0] = (char) (0xC0 | (codepoint >> 6 ));
|
|
2148
|
+
data[1] = (char) (0x80 | (codepoint & 0x3F));
|
|
2149
|
+
|
|
2150
|
+
return 2;
|
|
2151
|
+
}
|
|
2152
|
+
else if (codepoint <= 0x0000FFFF) {
|
|
2153
|
+
/* 1110xxxx 10xxxxxx 10xxxxxx */
|
|
2154
|
+
data[0] = (char) (0xE0 | ((codepoint >> 12)));
|
|
2155
|
+
data[1] = (char) (0x80 | ((codepoint >> 6 ) & 0x3F));
|
|
2156
|
+
data[2] = (char) (0x80 | ( codepoint & 0x3F));
|
|
2157
|
+
|
|
2158
|
+
return 3;
|
|
2159
|
+
}
|
|
2160
|
+
else if (codepoint <= 0x001FFFFF) {
|
|
2161
|
+
/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
|
|
2162
|
+
data[0] = (char) (0xF0 | ( codepoint >> 18));
|
|
2163
|
+
data[1] = (char) (0x80 | ((codepoint >> 12) & 0x3F));
|
|
2164
|
+
data[2] = (char) (0x80 | ((codepoint >> 6 ) & 0x3F));
|
|
2165
|
+
data[3] = (char) (0x80 | ( codepoint & 0x3F));
|
|
2166
|
+
|
|
2167
|
+
return 4;
|
|
2168
|
+
}
|
|
2169
|
+
|
|
2170
|
+
return 0;
|
|
2171
|
+
}
|