com.googler.python 1.0.7 → 1.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (354)
  1. package/package.json +4 -2
  2. package/python3.4.2/lib/python3.4/site-packages/pip/__init__.py +1 -277
  3. package/python3.4.2/lib/python3.4/site-packages/pip/__main__.py +19 -7
  4. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/__init__.py +246 -0
  5. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/basecommand.py +373 -0
  6. package/python3.4.2/lib/python3.4/site-packages/pip/{baseparser.py → _internal/baseparser.py} +240 -224
  7. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/build_env.py +92 -0
  8. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/cache.py +202 -0
  9. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/cmdoptions.py +609 -0
  10. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/commands/__init__.py +79 -0
  11. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/commands/check.py +42 -0
  12. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/commands/completion.py +94 -0
  13. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/commands/configuration.py +227 -0
  14. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/commands/download.py +233 -0
  15. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/commands/freeze.py +96 -0
  16. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/commands/hash.py +57 -0
  17. package/python3.4.2/lib/python3.4/site-packages/pip/{commands → _internal/commands}/help.py +36 -33
  18. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/commands/install.py +477 -0
  19. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/commands/list.py +343 -0
  20. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/commands/search.py +135 -0
  21. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/commands/show.py +164 -0
  22. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/commands/uninstall.py +71 -0
  23. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/commands/wheel.py +179 -0
  24. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/compat.py +235 -0
  25. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/configuration.py +378 -0
  26. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/download.py +922 -0
  27. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/exceptions.py +249 -0
  28. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/index.py +1117 -0
  29. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/locations.py +194 -0
  30. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/models/__init__.py +4 -0
  31. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/models/index.py +15 -0
  32. package/python3.4.2/lib/python3.4/site-packages/pip/{_vendor/requests/packages/urllib3/contrib → _internal/operations}/__init__.py +0 -0
  33. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/operations/check.py +106 -0
  34. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/operations/freeze.py +252 -0
  35. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/operations/prepare.py +378 -0
  36. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/pep425tags.py +317 -0
  37. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/req/__init__.py +69 -0
  38. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/req/req_file.py +338 -0
  39. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/req/req_install.py +1115 -0
  40. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/req/req_set.py +164 -0
  41. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/req/req_uninstall.py +455 -0
  42. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/resolve.py +354 -0
  43. package/python3.4.2/lib/python3.4/site-packages/pip/{status_codes.py → _internal/status_codes.py} +8 -6
  44. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/utils/__init__.py +0 -0
  45. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/utils/appdirs.py +258 -0
  46. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/utils/deprecation.py +77 -0
  47. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/utils/encoding.py +33 -0
  48. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/utils/filesystem.py +28 -0
  49. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/utils/glibc.py +84 -0
  50. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/utils/hashes.py +94 -0
  51. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/utils/logging.py +132 -0
  52. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/utils/misc.py +851 -0
  53. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/utils/outdated.py +163 -0
  54. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/utils/packaging.py +70 -0
  55. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/utils/setuptools_build.py +8 -0
  56. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/utils/temp_dir.py +82 -0
  57. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/utils/typing.py +29 -0
  58. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/utils/ui.py +421 -0
  59. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/vcs/__init__.py +471 -0
  60. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/vcs/bazaar.py +113 -0
  61. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/vcs/git.py +311 -0
  62. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/vcs/mercurial.py +105 -0
  63. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/vcs/subversion.py +271 -0
  64. package/python3.4.2/lib/python3.4/site-packages/pip/_internal/wheel.py +817 -0
  65. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/__init__.py +109 -8
  66. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/appdirs.py +604 -0
  67. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/cachecontrol/__init__.py +11 -0
  68. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/cachecontrol/_cmd.py +60 -0
  69. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/cachecontrol/adapter.py +134 -0
  70. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/cachecontrol/cache.py +39 -0
  71. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/cachecontrol/caches/__init__.py +2 -0
  72. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/cachecontrol/caches/file_cache.py +133 -0
  73. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/cachecontrol/caches/redis_cache.py +43 -0
  74. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/cachecontrol/compat.py +29 -0
  75. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/cachecontrol/controller.py +373 -0
  76. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/cachecontrol/filewrapper.py +78 -0
  77. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/cachecontrol/heuristics.py +138 -0
  78. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/cachecontrol/serialize.py +194 -0
  79. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/cachecontrol/wrapper.py +27 -0
  80. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/certifi/__init__.py +3 -0
  81. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/certifi/__main__.py +2 -0
  82. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests → certifi}/cacert.pem +1765 -2358
  83. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/certifi/core.py +37 -0
  84. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/chardet → chardet}/__init__.py +39 -32
  85. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/chardet/big5freq.py +386 -0
  86. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/chardet → chardet}/big5prober.py +47 -42
  87. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/chardet → chardet}/chardistribution.py +233 -231
  88. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/chardet/charsetgroupprober.py +106 -0
  89. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/chardet/charsetprober.py +145 -0
  90. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/chardet/cli/__init__.py +1 -0
  91. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/chardet/cli/chardetect.py +85 -0
  92. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/chardet/codingstatemachine.py +88 -0
  93. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/chardet → chardet}/compat.py +34 -34
  94. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/chardet → chardet}/cp949prober.py +49 -44
  95. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/chardet/enums.py +76 -0
  96. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/chardet/escprober.py +101 -0
  97. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/chardet/escsm.py +246 -0
  98. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/chardet/eucjpprober.py +92 -0
  99. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/chardet/euckrfreq.py +195 -0
  100. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/chardet → chardet}/euckrprober.py +47 -42
  101. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/chardet → chardet}/euctwfreq.py +387 -428
  102. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/chardet → chardet}/euctwprober.py +46 -41
  103. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/chardet → chardet}/gb2312freq.py +283 -472
  104. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/chardet → chardet}/gb2312prober.py +46 -41
  105. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/chardet → chardet}/hebrewprober.py +292 -283
  106. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/chardet → chardet}/jisfreq.py +325 -569
  107. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/chardet → chardet}/jpcntx.py +233 -219
  108. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/chardet → chardet}/langbulgarianmodel.py +228 -229
  109. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/chardet → chardet}/langcyrillicmodel.py +333 -329
  110. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/chardet → chardet}/langgreekmodel.py +225 -225
  111. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/chardet → chardet}/langhebrewmodel.py +200 -201
  112. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/chardet → chardet}/langhungarianmodel.py +225 -225
  113. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/chardet → chardet}/langthaimodel.py +199 -200
  114. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/chardet/langturkishmodel.py +193 -0
  115. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/chardet → chardet}/latin1prober.py +145 -139
  116. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/chardet/mbcharsetprober.py +91 -0
  117. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/chardet → chardet}/mbcsgroupprober.py +54 -54
  118. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/chardet/mbcssm.py +572 -0
  119. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/chardet/sbcharsetprober.py +132 -0
  120. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/chardet → chardet}/sbcsgroupprober.py +73 -69
  121. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/chardet/sjisprober.py +92 -0
  122. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/chardet/universaldetector.py +286 -0
  123. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/chardet → chardet}/utf8prober.py +82 -76
  124. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/chardet/version.py +9 -0
  125. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/colorama/__init__.py +7 -7
  126. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/colorama/ansi.py +102 -50
  127. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/colorama/ansitowin32.py +236 -190
  128. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/colorama/initialise.py +82 -56
  129. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/colorama/win32.py +156 -137
  130. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/colorama/winterm.py +162 -120
  131. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/distlib/__init__.py +23 -23
  132. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/distlib/_backport/__init__.py +6 -6
  133. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/distlib/_backport/misc.py +41 -41
  134. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/distlib/_backport/shutil.py +761 -761
  135. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/distlib/_backport/sysconfig.cfg +84 -84
  136. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/distlib/_backport/sysconfig.py +788 -788
  137. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/distlib/_backport/tarfile.py +2607 -2607
  138. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/distlib/compat.py +1117 -1064
  139. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/distlib/database.py +1318 -1301
  140. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/distlib/index.py +516 -488
  141. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/distlib/locators.py +1292 -1194
  142. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/distlib/manifest.py +393 -364
  143. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/distlib/markers.py +131 -190
  144. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/distlib/metadata.py +1068 -1026
  145. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/distlib/resources.py +355 -317
  146. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/distlib/scripts.py +415 -323
  147. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/distlib/t32.exe +0 -0
  148. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/distlib/t64.exe +0 -0
  149. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/distlib/util.py +1755 -1575
  150. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/distlib/version.py +736 -721
  151. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/distlib/w32.exe +0 -0
  152. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/distlib/w64.exe +0 -0
  153. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/distlib/wheel.py +984 -958
  154. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/distro.py +1104 -0
  155. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/__init__.py +35 -23
  156. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/{ihatexml.py → _ihatexml.py} +288 -285
  157. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/{inputstream.py → _inputstream.py} +923 -881
  158. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/{tokenizer.py → _tokenizer.py} +1721 -1731
  159. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/{trie → _trie}/__init__.py +14 -12
  160. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/{trie → _trie}/_base.py +37 -37
  161. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/{trie → _trie}/datrie.py +44 -44
  162. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/{trie → _trie}/py.py +67 -67
  163. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/{utils.py → _utils.py} +124 -82
  164. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/constants.py +2947 -3104
  165. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/filters/alphabeticalattributes.py +29 -20
  166. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/filters/{_base.py → base.py} +12 -12
  167. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/filters/inject_meta_charset.py +73 -65
  168. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/filters/lint.py +93 -93
  169. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/filters/optionaltags.py +207 -205
  170. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/filters/sanitizer.py +896 -12
  171. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/filters/whitespace.py +38 -38
  172. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/html5parser.py +2791 -2713
  173. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/serializer.py +409 -0
  174. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/treeadapters/__init__.py +30 -0
  175. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/treeadapters/genshi.py +54 -0
  176. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/treeadapters/sax.py +50 -44
  177. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/treebuilders/__init__.py +88 -76
  178. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/treebuilders/{_base.py → base.py} +417 -377
  179. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/treebuilders/dom.py +236 -227
  180. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/treebuilders/etree.py +340 -337
  181. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/treebuilders/etree_lxml.py +366 -369
  182. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/treewalkers/__init__.py +154 -57
  183. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/treewalkers/{_base.py → base.py} +252 -200
  184. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/treewalkers/dom.py +43 -46
  185. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/treewalkers/etree.py +130 -138
  186. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/treewalkers/{lxmletree.py → etree_lxml.py} +213 -208
  187. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/treewalkers/{genshistream.py → genshi.py} +69 -69
  188. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/idna/__init__.py +2 -0
  189. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/idna/codec.py +118 -0
  190. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/idna/compat.py +12 -0
  191. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/idna/core.py +387 -0
  192. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/idna/idnadata.py +1585 -0
  193. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/idna/intranges.py +53 -0
  194. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/idna/package_data.py +2 -0
  195. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/idna/uts46data.py +7634 -0
  196. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/ipaddress.py +2419 -0
  197. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/lockfile/__init__.py +347 -0
  198. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/lockfile/linklockfile.py +73 -0
  199. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/lockfile/mkdirlockfile.py +84 -0
  200. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/lockfile/pidlockfile.py +190 -0
  201. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/lockfile/sqlitelockfile.py +156 -0
  202. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/lockfile/symlinklockfile.py +70 -0
  203. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/msgpack/__init__.py +66 -0
  204. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/msgpack/_version.py +1 -0
  205. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/msgpack/exceptions.py +41 -0
  206. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/msgpack/fallback.py +971 -0
  207. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/packaging/__about__.py +21 -0
  208. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/packaging/__init__.py +14 -0
  209. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/packaging/_compat.py +30 -0
  210. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/packaging/_structures.py +70 -0
  211. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/packaging/markers.py +301 -0
  212. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/packaging/requirements.py +130 -0
  213. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/packaging/specifiers.py +774 -0
  214. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/packaging/utils.py +63 -0
  215. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/packaging/version.py +441 -0
  216. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{pkg_resources.py → pkg_resources/__init__.py} +3125 -2762
  217. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/pkg_resources/py31compat.py +22 -0
  218. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/progress/__init__.py +127 -0
  219. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/progress/bar.py +88 -0
  220. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/progress/counter.py +48 -0
  221. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/progress/helpers.py +91 -0
  222. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/progress/spinner.py +44 -0
  223. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/pyparsing.py +5720 -0
  224. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/pytoml/__init__.py +3 -0
  225. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/pytoml/core.py +13 -0
  226. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/pytoml/parser.py +374 -0
  227. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/pytoml/writer.py +127 -0
  228. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/__init__.py +123 -77
  229. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/__version__.py +14 -0
  230. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/_internal_utils.py +42 -0
  231. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/adapters.py +525 -388
  232. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/api.py +152 -120
  233. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/auth.py +293 -193
  234. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/certs.py +18 -24
  235. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/compat.py +73 -115
  236. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/cookies.py +542 -454
  237. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/exceptions.py +122 -75
  238. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/help.py +120 -0
  239. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/hooks.py +34 -45
  240. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/models.py +948 -803
  241. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/packages.py +16 -0
  242. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/sessions.py +737 -637
  243. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/status_codes.py +91 -88
  244. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/structures.py +105 -127
  245. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/utils.py +904 -673
  246. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/retrying.py +267 -0
  247. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/six.py +891 -646
  248. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/urllib3/__init__.py +97 -0
  249. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/urllib3/_collections.py +319 -0
  250. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/urllib3/connection.py +373 -0
  251. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/urllib3 → urllib3}/connectionpool.py +905 -710
  252. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/urllib3/contrib/__init__.py +0 -0
  253. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/urllib3/contrib/_securetransport/__init__.py +0 -0
  254. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/urllib3/contrib/_securetransport/bindings.py +593 -0
  255. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/urllib3/contrib/_securetransport/low_level.py +343 -0
  256. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/urllib3/contrib/appengine.py +296 -0
  257. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/urllib3 → urllib3}/contrib/ntlmpool.py +112 -120
  258. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/urllib3/contrib/pyopenssl.py +455 -0
  259. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/urllib3/contrib/securetransport.py +810 -0
  260. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/urllib3/contrib/socks.py +188 -0
  261. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/urllib3/exceptions.py +246 -0
  262. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/urllib3 → urllib3}/fields.py +178 -177
  263. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/urllib3 → urllib3}/filepost.py +94 -100
  264. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/urllib3 → urllib3}/packages/__init__.py +5 -4
  265. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/urllib3/packages/backports/__init__.py +0 -0
  266. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/urllib3/packages/backports/makefile.py +53 -0
  267. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/urllib3 → urllib3}/packages/ordered_dict.py +259 -260
  268. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/urllib3/packages/six.py +868 -0
  269. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/urllib3 → urllib3}/packages/ssl_match_hostname/__init__.py +19 -13
  270. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/urllib3 → urllib3}/packages/ssl_match_hostname/_implementation.py +157 -105
  271. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/urllib3/poolmanager.py +440 -0
  272. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/urllib3 → urllib3}/request.py +148 -141
  273. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/urllib3/response.py +626 -0
  274. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/urllib3/util/__init__.py +54 -0
  275. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/urllib3/util/connection.py +130 -0
  276. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/urllib3/util/request.py +118 -0
  277. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/urllib3/util/response.py +81 -0
  278. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/urllib3/util/retry.py +401 -0
  279. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/urllib3/util/selectors.py +581 -0
  280. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/urllib3/util/ssl_.py +341 -0
  281. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/urllib3 → urllib3}/util/timeout.py +242 -234
  282. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/{requests/packages/urllib3 → urllib3}/util/url.py +230 -162
  283. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/urllib3/util/wait.py +40 -0
  284. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/webencodings/__init__.py +342 -0
  285. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/webencodings/labels.py +231 -0
  286. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/webencodings/mklabels.py +59 -0
  287. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/webencodings/tests.py +153 -0
  288. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/webencodings/x_user_defined.py +325 -0
  289. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/_markerlib/__init__.py +0 -16
  290. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/_markerlib/markers.py +0 -119
  291. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/sanitizer.py +0 -271
  292. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/serializer/__init__.py +0 -16
  293. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/serializer/htmlserializer.py +0 -320
  294. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/html5lib/treewalkers/pulldom.py +0 -63
  295. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/re-vendor.py +0 -34
  296. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/packages/__init__.py +0 -3
  297. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/packages/chardet/big5freq.py +0 -925
  298. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/packages/chardet/chardetect.py +0 -46
  299. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/packages/chardet/charsetgroupprober.py +0 -106
  300. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/packages/chardet/charsetprober.py +0 -62
  301. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/packages/chardet/codingstatemachine.py +0 -61
  302. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/packages/chardet/constants.py +0 -39
  303. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/packages/chardet/escprober.py +0 -86
  304. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/packages/chardet/escsm.py +0 -242
  305. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/packages/chardet/eucjpprober.py +0 -90
  306. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/packages/chardet/euckrfreq.py +0 -596
  307. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/packages/chardet/mbcharsetprober.py +0 -86
  308. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/packages/chardet/mbcssm.py +0 -575
  309. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/packages/chardet/sbcharsetprober.py +0 -120
  310. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/packages/chardet/sjisprober.py +0 -91
  311. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/packages/chardet/universaldetector.py +0 -170
  312. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/packages/urllib3/__init__.py +0 -58
  313. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/packages/urllib3/_collections.py +0 -205
  314. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/packages/urllib3/connection.py +0 -204
  315. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/packages/urllib3/contrib/pyopenssl.py +0 -422
  316. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/packages/urllib3/exceptions.py +0 -126
  317. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/packages/urllib3/packages/six.py +0 -385
  318. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/packages/urllib3/poolmanager.py +0 -258
  319. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/packages/urllib3/response.py +0 -308
  320. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/packages/urllib3/util/__init__.py +0 -27
  321. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/packages/urllib3/util/connection.py +0 -45
  322. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/packages/urllib3/util/request.py +0 -68
  323. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/packages/urllib3/util/response.py +0 -13
  324. package/python3.4.2/lib/python3.4/site-packages/pip/_vendor/requests/packages/urllib3/util/ssl_.py +0 -133
  325. package/python3.4.2/lib/python3.4/site-packages/pip/backwardcompat/__init__.py +0 -138
  326. package/python3.4.2/lib/python3.4/site-packages/pip/basecommand.py +0 -201
  327. package/python3.4.2/lib/python3.4/site-packages/pip/cmdoptions.py +0 -371
  328. package/python3.4.2/lib/python3.4/site-packages/pip/commands/__init__.py +0 -88
  329. package/python3.4.2/lib/python3.4/site-packages/pip/commands/bundle.py +0 -42
  330. package/python3.4.2/lib/python3.4/site-packages/pip/commands/completion.py +0 -59
  331. package/python3.4.2/lib/python3.4/site-packages/pip/commands/freeze.py +0 -114
  332. package/python3.4.2/lib/python3.4/site-packages/pip/commands/install.py +0 -314
  333. package/python3.4.2/lib/python3.4/site-packages/pip/commands/list.py +0 -162
  334. package/python3.4.2/lib/python3.4/site-packages/pip/commands/search.py +0 -132
  335. package/python3.4.2/lib/python3.4/site-packages/pip/commands/show.py +0 -80
  336. package/python3.4.2/lib/python3.4/site-packages/pip/commands/uninstall.py +0 -59
  337. package/python3.4.2/lib/python3.4/site-packages/pip/commands/unzip.py +0 -7
  338. package/python3.4.2/lib/python3.4/site-packages/pip/commands/wheel.py +0 -195
  339. package/python3.4.2/lib/python3.4/site-packages/pip/commands/zip.py +0 -351
  340. package/python3.4.2/lib/python3.4/site-packages/pip/download.py +0 -644
  341. package/python3.4.2/lib/python3.4/site-packages/pip/exceptions.py +0 -46
  342. package/python3.4.2/lib/python3.4/site-packages/pip/index.py +0 -990
  343. package/python3.4.2/lib/python3.4/site-packages/pip/locations.py +0 -171
  344. package/python3.4.2/lib/python3.4/site-packages/pip/log.py +0 -276
  345. package/python3.4.2/lib/python3.4/site-packages/pip/pep425tags.py +0 -102
  346. package/python3.4.2/lib/python3.4/site-packages/pip/req.py +0 -1931
  347. package/python3.4.2/lib/python3.4/site-packages/pip/runner.py +0 -18
  348. package/python3.4.2/lib/python3.4/site-packages/pip/util.py +0 -720
  349. package/python3.4.2/lib/python3.4/site-packages/pip/vcs/__init__.py +0 -251
  350. package/python3.4.2/lib/python3.4/site-packages/pip/vcs/bazaar.py +0 -131
  351. package/python3.4.2/lib/python3.4/site-packages/pip/vcs/git.py +0 -194
  352. package/python3.4.2/lib/python3.4/site-packages/pip/vcs/mercurial.py +0 -151
  353. package/python3.4.2/lib/python3.4/site-packages/pip/vcs/subversion.py +0 -273
  354. package/python3.4.2/lib/python3.4/site-packages/pip/wheel.py +0 -560
@@ -1,1731 +1,1721 @@
1
- from __future__ import absolute_import, division, unicode_literals
2
-
3
- try:
4
- chr = unichr # flake8: noqa
5
- except NameError:
6
- pass
7
-
8
- from collections import deque
9
-
10
- from .constants import spaceCharacters
11
- from .constants import entities
12
- from .constants import asciiLetters, asciiUpper2Lower
13
- from .constants import digits, hexDigits, EOF
14
- from .constants import tokenTypes, tagTokenTypes
15
- from .constants import replacementCharacters
16
-
17
- from .inputstream import HTMLInputStream
18
-
19
- from .trie import Trie
20
-
21
- entitiesTrie = Trie(entities)
22
-
23
-
24
- class HTMLTokenizer(object):
25
- """ This class takes care of tokenizing HTML.
26
-
27
- * self.currentToken
28
- Holds the token that is currently being processed.
29
-
30
- * self.state
31
- Holds a reference to the method to be invoked... XXX
32
-
33
- * self.stream
34
- Points to HTMLInputStream object.
35
- """
36
-
37
- def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
38
- lowercaseElementName=True, lowercaseAttrName=True, parser=None):
39
-
40
- self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet)
41
- self.parser = parser
42
-
43
- # Perform case conversions?
44
- self.lowercaseElementName = lowercaseElementName
45
- self.lowercaseAttrName = lowercaseAttrName
46
-
47
- # Setup the initial tokenizer state
48
- self.escapeFlag = False
49
- self.lastFourChars = []
50
- self.state = self.dataState
51
- self.escape = False
52
-
53
- # The current token being created
54
- self.currentToken = None
55
- super(HTMLTokenizer, self).__init__()
56
-
57
- def __iter__(self):
58
- """ This is where the magic happens.
59
-
60
- We do our usually processing through the states and when we have a token
61
- to return we yield the token which pauses processing until the next token
62
- is requested.
63
- """
64
- self.tokenQueue = deque([])
65
- # Start processing. When EOF is reached self.state will return False
66
- # instead of True and the loop will terminate.
67
- while self.state():
68
- while self.stream.errors:
69
- yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)}
70
- while self.tokenQueue:
71
- yield self.tokenQueue.popleft()
72
-
73
- def consumeNumberEntity(self, isHex):
74
- """This function returns either U+FFFD or the character based on the
75
- decimal or hexadecimal representation. It also discards ";" if present.
76
- If not present self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked.
77
- """
78
-
79
- allowed = digits
80
- radix = 10
81
- if isHex:
82
- allowed = hexDigits
83
- radix = 16
84
-
85
- charStack = []
86
-
87
- # Consume all the characters that are in range while making sure we
88
- # don't hit an EOF.
89
- c = self.stream.char()
90
- while c in allowed and c is not EOF:
91
- charStack.append(c)
92
- c = self.stream.char()
93
-
94
- # Convert the set of characters consumed to an int.
95
- charAsInt = int("".join(charStack), radix)
96
-
97
- # Certain characters get replaced with others
98
- if charAsInt in replacementCharacters:
99
- char = replacementCharacters[charAsInt]
100
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
101
- "illegal-codepoint-for-numeric-entity",
102
- "datavars": {"charAsInt": charAsInt}})
103
- elif ((0xD800 <= charAsInt <= 0xDFFF) or
104
- (charAsInt > 0x10FFFF)):
105
- char = "\uFFFD"
106
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
107
- "illegal-codepoint-for-numeric-entity",
108
- "datavars": {"charAsInt": charAsInt}})
109
- else:
110
- # Should speed up this check somehow (e.g. move the set to a constant)
111
- if ((0x0001 <= charAsInt <= 0x0008) or
112
- (0x000E <= charAsInt <= 0x001F) or
113
- (0x007F <= charAsInt <= 0x009F) or
114
- (0xFDD0 <= charAsInt <= 0xFDEF) or
115
- charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
116
- 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
117
- 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
118
- 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
119
- 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
120
- 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
121
- 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
122
- 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
123
- 0xFFFFF, 0x10FFFE, 0x10FFFF])):
124
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
125
- "data":
126
- "illegal-codepoint-for-numeric-entity",
127
- "datavars": {"charAsInt": charAsInt}})
128
- try:
129
- # Try/except needed as UCS-2 Python builds' unichar only works
130
- # within the BMP.
131
- char = chr(charAsInt)
132
- except ValueError:
133
- v = charAsInt - 0x10000
134
- char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF))
135
-
136
- # Discard the ; if present. Otherwise, put it back on the queue and
137
- # invoke parseError on parser.
138
- if c != ";":
139
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
140
- "numeric-entity-without-semicolon"})
141
- self.stream.unget(c)
142
-
143
- return char
144
-
145
- def consumeEntity(self, allowedChar=None, fromAttribute=False):
146
- # Initialise to the default output for when no entity is matched
147
- output = "&"
148
-
149
- charStack = [self.stream.char()]
150
- if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&")
151
- or (allowedChar is not None and allowedChar == charStack[0])):
152
- self.stream.unget(charStack[0])
153
-
154
- elif charStack[0] == "#":
155
- # Read the next character to see if it's hex or decimal
156
- hex = False
157
- charStack.append(self.stream.char())
158
- if charStack[-1] in ("x", "X"):
159
- hex = True
160
- charStack.append(self.stream.char())
161
-
162
- # charStack[-1] should be the first digit
163
- if (hex and charStack[-1] in hexDigits) \
164
- or (not hex and charStack[-1] in digits):
165
- # At least one digit found, so consume the whole number
166
- self.stream.unget(charStack[-1])
167
- output = self.consumeNumberEntity(hex)
168
- else:
169
- # No digits found
170
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
171
- "data": "expected-numeric-entity"})
172
- self.stream.unget(charStack.pop())
173
- output = "&" + "".join(charStack)
174
-
175
- else:
176
- # At this point in the process might have named entity. Entities
177
- # are stored in the global variable "entities".
178
- #
179
- # Consume characters and compare to these to a substring of the
180
- # entity names in the list until the substring no longer matches.
181
- while (charStack[-1] is not EOF):
182
- if not entitiesTrie.has_keys_with_prefix("".join(charStack)):
183
- break
184
- charStack.append(self.stream.char())
185
-
186
- # At this point we have a string that starts with some characters
187
- # that may match an entity
188
- # Try to find the longest entity the string will match to take care
189
- # of &noti for instance.
190
- try:
191
- entityName = entitiesTrie.longest_prefix("".join(charStack[:-1]))
192
- entityLength = len(entityName)
193
- except KeyError:
194
- entityName = None
195
-
196
- if entityName is not None:
197
- if entityName[-1] != ";":
198
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
199
- "named-entity-without-semicolon"})
200
- if (entityName[-1] != ";" and fromAttribute and
201
- (charStack[entityLength] in asciiLetters or
202
- charStack[entityLength] in digits or
203
- charStack[entityLength] == "=")):
204
- self.stream.unget(charStack.pop())
205
- output = "&" + "".join(charStack)
206
- else:
207
- output = entities[entityName]
208
- self.stream.unget(charStack.pop())
209
- output += "".join(charStack[entityLength:])
210
- else:
211
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
212
- "expected-named-entity"})
213
- self.stream.unget(charStack.pop())
214
- output = "&" + "".join(charStack)
215
-
216
- if fromAttribute:
217
- self.currentToken["data"][-1][1] += output
218
- else:
219
- if output in spaceCharacters:
220
- tokenType = "SpaceCharacters"
221
- else:
222
- tokenType = "Characters"
223
- self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output})
224
-
225
- def processEntityInAttribute(self, allowedChar):
226
- """This method replaces the need for "entityInAttributeValueState".
227
- """
228
- self.consumeEntity(allowedChar=allowedChar, fromAttribute=True)
229
-
230
- def emitCurrentToken(self):
231
- """This method is a generic handler for emitting the tags. It also sets
232
- the state to "data" because that's what's needed after a token has been
233
- emitted.
234
- """
235
- token = self.currentToken
236
- # Add token to the queue to be yielded
237
- if (token["type"] in tagTokenTypes):
238
- if self.lowercaseElementName:
239
- token["name"] = token["name"].translate(asciiUpper2Lower)
240
- if token["type"] == tokenTypes["EndTag"]:
241
- if token["data"]:
242
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
243
- "data": "attributes-in-end-tag"})
244
- if token["selfClosing"]:
245
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
246
- "data": "self-closing-flag-on-end-tag"})
247
- self.tokenQueue.append(token)
248
- self.state = self.dataState
249
-
250
- # Below are the various tokenizer states worked out.
251
- def dataState(self):
252
- data = self.stream.char()
253
- if data == "&":
254
- self.state = self.entityDataState
255
- elif data == "<":
256
- self.state = self.tagOpenState
257
- elif data == "\u0000":
258
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
259
- "data": "invalid-codepoint"})
260
- self.tokenQueue.append({"type": tokenTypes["Characters"],
261
- "data": "\u0000"})
262
- elif data is EOF:
263
- # Tokenization ends.
264
- return False
265
- elif data in spaceCharacters:
266
- # Directly after emitting a token you switch back to the "data
267
- # state". At that point spaceCharacters are important so they are
268
- # emitted separately.
269
- self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
270
- data + self.stream.charsUntil(spaceCharacters, True)})
271
- # No need to update lastFourChars here, since the first space will
272
- # have already been appended to lastFourChars and will have broken
273
- # any <!-- or --> sequences
274
- else:
275
- chars = self.stream.charsUntil(("&", "<", "\u0000"))
276
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
277
- data + chars})
278
- return True
279
-
280
- def entityDataState(self):
281
- self.consumeEntity()
282
- self.state = self.dataState
283
- return True
284
-
285
- def rcdataState(self):
286
- data = self.stream.char()
287
- if data == "&":
288
- self.state = self.characterReferenceInRcdata
289
- elif data == "<":
290
- self.state = self.rcdataLessThanSignState
291
- elif data == EOF:
292
- # Tokenization ends.
293
- return False
294
- elif data == "\u0000":
295
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
296
- "data": "invalid-codepoint"})
297
- self.tokenQueue.append({"type": tokenTypes["Characters"],
298
- "data": "\uFFFD"})
299
- elif data in spaceCharacters:
300
- # Directly after emitting a token you switch back to the "data
301
- # state". At that point spaceCharacters are important so they are
302
- # emitted separately.
303
- self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
304
- data + self.stream.charsUntil(spaceCharacters, True)})
305
- # No need to update lastFourChars here, since the first space will
306
- # have already been appended to lastFourChars and will have broken
307
- # any <!-- or --> sequences
308
- else:
309
- chars = self.stream.charsUntil(("&", "<", "\u0000"))
310
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
311
- data + chars})
312
- return True
313
-
314
- def characterReferenceInRcdata(self):
315
- self.consumeEntity()
316
- self.state = self.rcdataState
317
- return True
318
-
319
- def rawtextState(self):
320
- data = self.stream.char()
321
- if data == "<":
322
- self.state = self.rawtextLessThanSignState
323
- elif data == "\u0000":
324
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
325
- "data": "invalid-codepoint"})
326
- self.tokenQueue.append({"type": tokenTypes["Characters"],
327
- "data": "\uFFFD"})
328
- elif data == EOF:
329
- # Tokenization ends.
330
- return False
331
- else:
332
- chars = self.stream.charsUntil(("<", "\u0000"))
333
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
334
- data + chars})
335
- return True
336
-
337
- def scriptDataState(self):
338
- data = self.stream.char()
339
- if data == "<":
340
- self.state = self.scriptDataLessThanSignState
341
- elif data == "\u0000":
342
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
343
- "data": "invalid-codepoint"})
344
- self.tokenQueue.append({"type": tokenTypes["Characters"],
345
- "data": "\uFFFD"})
346
- elif data == EOF:
347
- # Tokenization ends.
348
- return False
349
- else:
350
- chars = self.stream.charsUntil(("<", "\u0000"))
351
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
352
- data + chars})
353
- return True
354
-
355
- def plaintextState(self):
356
- data = self.stream.char()
357
- if data == EOF:
358
- # Tokenization ends.
359
- return False
360
- elif data == "\u0000":
361
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
362
- "data": "invalid-codepoint"})
363
- self.tokenQueue.append({"type": tokenTypes["Characters"],
364
- "data": "\uFFFD"})
365
- else:
366
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
367
- data + self.stream.charsUntil("\u0000")})
368
- return True
369
-
370
- def tagOpenState(self):
371
- data = self.stream.char()
372
- if data == "!":
373
- self.state = self.markupDeclarationOpenState
374
- elif data == "/":
375
- self.state = self.closeTagOpenState
376
- elif data in asciiLetters:
377
- self.currentToken = {"type": tokenTypes["StartTag"],
378
- "name": data, "data": [],
379
- "selfClosing": False,
380
- "selfClosingAcknowledged": False}
381
- self.state = self.tagNameState
382
- elif data == ">":
383
- # XXX In theory it could be something besides a tag name. But
384
- # do we really care?
385
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
386
- "expected-tag-name-but-got-right-bracket"})
387
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"})
388
- self.state = self.dataState
389
- elif data == "?":
390
- # XXX In theory it could be something besides a tag name. But
391
- # do we really care?
392
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
393
- "expected-tag-name-but-got-question-mark"})
394
- self.stream.unget(data)
395
- self.state = self.bogusCommentState
396
- else:
397
- # XXX
398
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
399
- "expected-tag-name"})
400
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
401
- self.stream.unget(data)
402
- self.state = self.dataState
403
- return True
404
-
405
- def closeTagOpenState(self):
406
- data = self.stream.char()
407
- if data in asciiLetters:
408
- self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
409
- "data": [], "selfClosing": False}
410
- self.state = self.tagNameState
411
- elif data == ">":
412
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
413
- "expected-closing-tag-but-got-right-bracket"})
414
- self.state = self.dataState
415
- elif data is EOF:
416
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
417
- "expected-closing-tag-but-got-eof"})
418
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
419
- self.state = self.dataState
420
- else:
421
- # XXX data can be _'_...
422
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
423
- "expected-closing-tag-but-got-char",
424
- "datavars": {"data": data}})
425
- self.stream.unget(data)
426
- self.state = self.bogusCommentState
427
- return True
428
-
429
- def tagNameState(self):
430
- data = self.stream.char()
431
- if data in spaceCharacters:
432
- self.state = self.beforeAttributeNameState
433
- elif data == ">":
434
- self.emitCurrentToken()
435
- elif data is EOF:
436
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
437
- "eof-in-tag-name"})
438
- self.state = self.dataState
439
- elif data == "/":
440
- self.state = self.selfClosingStartTagState
441
- elif data == "\u0000":
442
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
443
- "data": "invalid-codepoint"})
444
- self.currentToken["name"] += "\uFFFD"
445
- else:
446
- self.currentToken["name"] += data
447
- # (Don't use charsUntil here, because tag names are
448
- # very short and it's faster to not do anything fancy)
449
- return True
450
-
451
- def rcdataLessThanSignState(self):
452
- data = self.stream.char()
453
- if data == "/":
454
- self.temporaryBuffer = ""
455
- self.state = self.rcdataEndTagOpenState
456
- else:
457
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
458
- self.stream.unget(data)
459
- self.state = self.rcdataState
460
- return True
461
-
462
- def rcdataEndTagOpenState(self):
463
- data = self.stream.char()
464
- if data in asciiLetters:
465
- self.temporaryBuffer += data
466
- self.state = self.rcdataEndTagNameState
467
- else:
468
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
469
- self.stream.unget(data)
470
- self.state = self.rcdataState
471
- return True
472
-
473
- def rcdataEndTagNameState(self):
474
- appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
475
- data = self.stream.char()
476
- if data in spaceCharacters and appropriate:
477
- self.currentToken = {"type": tokenTypes["EndTag"],
478
- "name": self.temporaryBuffer,
479
- "data": [], "selfClosing": False}
480
- self.state = self.beforeAttributeNameState
481
- elif data == "/" and appropriate:
482
- self.currentToken = {"type": tokenTypes["EndTag"],
483
- "name": self.temporaryBuffer,
484
- "data": [], "selfClosing": False}
485
- self.state = self.selfClosingStartTagState
486
- elif data == ">" and appropriate:
487
- self.currentToken = {"type": tokenTypes["EndTag"],
488
- "name": self.temporaryBuffer,
489
- "data": [], "selfClosing": False}
490
- self.emitCurrentToken()
491
- self.state = self.dataState
492
- elif data in asciiLetters:
493
- self.temporaryBuffer += data
494
- else:
495
- self.tokenQueue.append({"type": tokenTypes["Characters"],
496
- "data": "</" + self.temporaryBuffer})
497
- self.stream.unget(data)
498
- self.state = self.rcdataState
499
- return True
500
-
501
- def rawtextLessThanSignState(self):
502
- data = self.stream.char()
503
- if data == "/":
504
- self.temporaryBuffer = ""
505
- self.state = self.rawtextEndTagOpenState
506
- else:
507
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
508
- self.stream.unget(data)
509
- self.state = self.rawtextState
510
- return True
511
-
512
- def rawtextEndTagOpenState(self):
513
- data = self.stream.char()
514
- if data in asciiLetters:
515
- self.temporaryBuffer += data
516
- self.state = self.rawtextEndTagNameState
517
- else:
518
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
519
- self.stream.unget(data)
520
- self.state = self.rawtextState
521
- return True
522
-
523
- def rawtextEndTagNameState(self):
524
- appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
525
- data = self.stream.char()
526
- if data in spaceCharacters and appropriate:
527
- self.currentToken = {"type": tokenTypes["EndTag"],
528
- "name": self.temporaryBuffer,
529
- "data": [], "selfClosing": False}
530
- self.state = self.beforeAttributeNameState
531
- elif data == "/" and appropriate:
532
- self.currentToken = {"type": tokenTypes["EndTag"],
533
- "name": self.temporaryBuffer,
534
- "data": [], "selfClosing": False}
535
- self.state = self.selfClosingStartTagState
536
- elif data == ">" and appropriate:
537
- self.currentToken = {"type": tokenTypes["EndTag"],
538
- "name": self.temporaryBuffer,
539
- "data": [], "selfClosing": False}
540
- self.emitCurrentToken()
541
- self.state = self.dataState
542
- elif data in asciiLetters:
543
- self.temporaryBuffer += data
544
- else:
545
- self.tokenQueue.append({"type": tokenTypes["Characters"],
546
- "data": "</" + self.temporaryBuffer})
547
- self.stream.unget(data)
548
- self.state = self.rawtextState
549
- return True
550
-
551
- def scriptDataLessThanSignState(self):
552
- data = self.stream.char()
553
- if data == "/":
554
- self.temporaryBuffer = ""
555
- self.state = self.scriptDataEndTagOpenState
556
- elif data == "!":
557
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<!"})
558
- self.state = self.scriptDataEscapeStartState
559
- else:
560
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
561
- self.stream.unget(data)
562
- self.state = self.scriptDataState
563
- return True
564
-
565
- def scriptDataEndTagOpenState(self):
566
- data = self.stream.char()
567
- if data in asciiLetters:
568
- self.temporaryBuffer += data
569
- self.state = self.scriptDataEndTagNameState
570
- else:
571
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
572
- self.stream.unget(data)
573
- self.state = self.scriptDataState
574
- return True
575
-
576
- def scriptDataEndTagNameState(self):
577
- appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
578
- data = self.stream.char()
579
- if data in spaceCharacters and appropriate:
580
- self.currentToken = {"type": tokenTypes["EndTag"],
581
- "name": self.temporaryBuffer,
582
- "data": [], "selfClosing": False}
583
- self.state = self.beforeAttributeNameState
584
- elif data == "/" and appropriate:
585
- self.currentToken = {"type": tokenTypes["EndTag"],
586
- "name": self.temporaryBuffer,
587
- "data": [], "selfClosing": False}
588
- self.state = self.selfClosingStartTagState
589
- elif data == ">" and appropriate:
590
- self.currentToken = {"type": tokenTypes["EndTag"],
591
- "name": self.temporaryBuffer,
592
- "data": [], "selfClosing": False}
593
- self.emitCurrentToken()
594
- self.state = self.dataState
595
- elif data in asciiLetters:
596
- self.temporaryBuffer += data
597
- else:
598
- self.tokenQueue.append({"type": tokenTypes["Characters"],
599
- "data": "</" + self.temporaryBuffer})
600
- self.stream.unget(data)
601
- self.state = self.scriptDataState
602
- return True
603
-
604
- def scriptDataEscapeStartState(self):
605
- data = self.stream.char()
606
- if data == "-":
607
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
608
- self.state = self.scriptDataEscapeStartDashState
609
- else:
610
- self.stream.unget(data)
611
- self.state = self.scriptDataState
612
- return True
613
-
614
- def scriptDataEscapeStartDashState(self):
615
- data = self.stream.char()
616
- if data == "-":
617
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
618
- self.state = self.scriptDataEscapedDashDashState
619
- else:
620
- self.stream.unget(data)
621
- self.state = self.scriptDataState
622
- return True
623
-
624
- def scriptDataEscapedState(self):
625
- data = self.stream.char()
626
- if data == "-":
627
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
628
- self.state = self.scriptDataEscapedDashState
629
- elif data == "<":
630
- self.state = self.scriptDataEscapedLessThanSignState
631
- elif data == "\u0000":
632
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
633
- "data": "invalid-codepoint"})
634
- self.tokenQueue.append({"type": tokenTypes["Characters"],
635
- "data": "\uFFFD"})
636
- elif data == EOF:
637
- self.state = self.dataState
638
- else:
639
- chars = self.stream.charsUntil(("<", "-", "\u0000"))
640
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
641
- data + chars})
642
- return True
643
-
644
- def scriptDataEscapedDashState(self):
645
- data = self.stream.char()
646
- if data == "-":
647
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
648
- self.state = self.scriptDataEscapedDashDashState
649
- elif data == "<":
650
- self.state = self.scriptDataEscapedLessThanSignState
651
- elif data == "\u0000":
652
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
653
- "data": "invalid-codepoint"})
654
- self.tokenQueue.append({"type": tokenTypes["Characters"],
655
- "data": "\uFFFD"})
656
- self.state = self.scriptDataEscapedState
657
- elif data == EOF:
658
- self.state = self.dataState
659
- else:
660
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
661
- self.state = self.scriptDataEscapedState
662
- return True
663
-
664
- def scriptDataEscapedDashDashState(self):
665
- data = self.stream.char()
666
- if data == "-":
667
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
668
- elif data == "<":
669
- self.state = self.scriptDataEscapedLessThanSignState
670
- elif data == ">":
671
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
672
- self.state = self.scriptDataState
673
- elif data == "\u0000":
674
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
675
- "data": "invalid-codepoint"})
676
- self.tokenQueue.append({"type": tokenTypes["Characters"],
677
- "data": "\uFFFD"})
678
- self.state = self.scriptDataEscapedState
679
- elif data == EOF:
680
- self.state = self.dataState
681
- else:
682
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
683
- self.state = self.scriptDataEscapedState
684
- return True
685
-
686
- def scriptDataEscapedLessThanSignState(self):
687
- data = self.stream.char()
688
- if data == "/":
689
- self.temporaryBuffer = ""
690
- self.state = self.scriptDataEscapedEndTagOpenState
691
- elif data in asciiLetters:
692
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data})
693
- self.temporaryBuffer = data
694
- self.state = self.scriptDataDoubleEscapeStartState
695
- else:
696
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
697
- self.stream.unget(data)
698
- self.state = self.scriptDataEscapedState
699
- return True
700
-
701
- def scriptDataEscapedEndTagOpenState(self):
702
- data = self.stream.char()
703
- if data in asciiLetters:
704
- self.temporaryBuffer = data
705
- self.state = self.scriptDataEscapedEndTagNameState
706
- else:
707
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
708
- self.stream.unget(data)
709
- self.state = self.scriptDataEscapedState
710
- return True
711
-
712
- def scriptDataEscapedEndTagNameState(self):
713
- appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
714
- data = self.stream.char()
715
- if data in spaceCharacters and appropriate:
716
- self.currentToken = {"type": tokenTypes["EndTag"],
717
- "name": self.temporaryBuffer,
718
- "data": [], "selfClosing": False}
719
- self.state = self.beforeAttributeNameState
720
- elif data == "/" and appropriate:
721
- self.currentToken = {"type": tokenTypes["EndTag"],
722
- "name": self.temporaryBuffer,
723
- "data": [], "selfClosing": False}
724
- self.state = self.selfClosingStartTagState
725
- elif data == ">" and appropriate:
726
- self.currentToken = {"type": tokenTypes["EndTag"],
727
- "name": self.temporaryBuffer,
728
- "data": [], "selfClosing": False}
729
- self.emitCurrentToken()
730
- self.state = self.dataState
731
- elif data in asciiLetters:
732
- self.temporaryBuffer += data
733
- else:
734
- self.tokenQueue.append({"type": tokenTypes["Characters"],
735
- "data": "</" + self.temporaryBuffer})
736
- self.stream.unget(data)
737
- self.state = self.scriptDataEscapedState
738
- return True
739
-
740
- def scriptDataDoubleEscapeStartState(self):
741
- data = self.stream.char()
742
- if data in (spaceCharacters | frozenset(("/", ">"))):
743
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
744
- if self.temporaryBuffer.lower() == "script":
745
- self.state = self.scriptDataDoubleEscapedState
746
- else:
747
- self.state = self.scriptDataEscapedState
748
- elif data in asciiLetters:
749
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
750
- self.temporaryBuffer += data
751
- else:
752
- self.stream.unget(data)
753
- self.state = self.scriptDataEscapedState
754
- return True
755
-
756
- def scriptDataDoubleEscapedState(self):
757
- data = self.stream.char()
758
- if data == "-":
759
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
760
- self.state = self.scriptDataDoubleEscapedDashState
761
- elif data == "<":
762
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
763
- self.state = self.scriptDataDoubleEscapedLessThanSignState
764
- elif data == "\u0000":
765
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
766
- "data": "invalid-codepoint"})
767
- self.tokenQueue.append({"type": tokenTypes["Characters"],
768
- "data": "\uFFFD"})
769
- elif data == EOF:
770
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
771
- "eof-in-script-in-script"})
772
- self.state = self.dataState
773
- else:
774
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
775
- return True
776
-
777
- def scriptDataDoubleEscapedDashState(self):
778
- data = self.stream.char()
779
- if data == "-":
780
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
781
- self.state = self.scriptDataDoubleEscapedDashDashState
782
- elif data == "<":
783
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
784
- self.state = self.scriptDataDoubleEscapedLessThanSignState
785
- elif data == "\u0000":
786
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
787
- "data": "invalid-codepoint"})
788
- self.tokenQueue.append({"type": tokenTypes["Characters"],
789
- "data": "\uFFFD"})
790
- self.state = self.scriptDataDoubleEscapedState
791
- elif data == EOF:
792
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
793
- "eof-in-script-in-script"})
794
- self.state = self.dataState
795
- else:
796
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
797
- self.state = self.scriptDataDoubleEscapedState
798
- return True
799
-
800
- def scriptDataDoubleEscapedDashDashState(self):
801
- data = self.stream.char()
802
- if data == "-":
803
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
804
- elif data == "<":
805
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
806
- self.state = self.scriptDataDoubleEscapedLessThanSignState
807
- elif data == ">":
808
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
809
- self.state = self.scriptDataState
810
- elif data == "\u0000":
811
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
812
- "data": "invalid-codepoint"})
813
- self.tokenQueue.append({"type": tokenTypes["Characters"],
814
- "data": "\uFFFD"})
815
- self.state = self.scriptDataDoubleEscapedState
816
- elif data == EOF:
817
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
818
- "eof-in-script-in-script"})
819
- self.state = self.dataState
820
- else:
821
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
822
- self.state = self.scriptDataDoubleEscapedState
823
- return True
824
-
825
- def scriptDataDoubleEscapedLessThanSignState(self):
826
- data = self.stream.char()
827
- if data == "/":
828
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"})
829
- self.temporaryBuffer = ""
830
- self.state = self.scriptDataDoubleEscapeEndState
831
- else:
832
- self.stream.unget(data)
833
- self.state = self.scriptDataDoubleEscapedState
834
- return True
835
-
836
- def scriptDataDoubleEscapeEndState(self):
837
- data = self.stream.char()
838
- if data in (spaceCharacters | frozenset(("/", ">"))):
839
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
840
- if self.temporaryBuffer.lower() == "script":
841
- self.state = self.scriptDataEscapedState
842
- else:
843
- self.state = self.scriptDataDoubleEscapedState
844
- elif data in asciiLetters:
845
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
846
- self.temporaryBuffer += data
847
- else:
848
- self.stream.unget(data)
849
- self.state = self.scriptDataDoubleEscapedState
850
- return True
851
-
852
- def beforeAttributeNameState(self):
853
- data = self.stream.char()
854
- if data in spaceCharacters:
855
- self.stream.charsUntil(spaceCharacters, True)
856
- elif data in asciiLetters:
857
- self.currentToken["data"].append([data, ""])
858
- self.state = self.attributeNameState
859
- elif data == ">":
860
- self.emitCurrentToken()
861
- elif data == "/":
862
- self.state = self.selfClosingStartTagState
863
- elif data in ("'", '"', "=", "<"):
864
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
865
- "invalid-character-in-attribute-name"})
866
- self.currentToken["data"].append([data, ""])
867
- self.state = self.attributeNameState
868
- elif data == "\u0000":
869
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
870
- "data": "invalid-codepoint"})
871
- self.currentToken["data"].append(["\uFFFD", ""])
872
- self.state = self.attributeNameState
873
- elif data is EOF:
874
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
875
- "expected-attribute-name-but-got-eof"})
876
- self.state = self.dataState
877
- else:
878
- self.currentToken["data"].append([data, ""])
879
- self.state = self.attributeNameState
880
- return True
881
-
882
- def attributeNameState(self):
883
- data = self.stream.char()
884
- leavingThisState = True
885
- emitToken = False
886
- if data == "=":
887
- self.state = self.beforeAttributeValueState
888
- elif data in asciiLetters:
889
- self.currentToken["data"][-1][0] += data +\
890
- self.stream.charsUntil(asciiLetters, True)
891
- leavingThisState = False
892
- elif data == ">":
893
- # XXX If we emit here the attributes are converted to a dict
894
- # without being checked and when the code below runs we error
895
- # because data is a dict not a list
896
- emitToken = True
897
- elif data in spaceCharacters:
898
- self.state = self.afterAttributeNameState
899
- elif data == "/":
900
- self.state = self.selfClosingStartTagState
901
- elif data == "\u0000":
902
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
903
- "data": "invalid-codepoint"})
904
- self.currentToken["data"][-1][0] += "\uFFFD"
905
- leavingThisState = False
906
- elif data in ("'", '"', "<"):
907
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
908
- "data":
909
- "invalid-character-in-attribute-name"})
910
- self.currentToken["data"][-1][0] += data
911
- leavingThisState = False
912
- elif data is EOF:
913
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
914
- "data": "eof-in-attribute-name"})
915
- self.state = self.dataState
916
- else:
917
- self.currentToken["data"][-1][0] += data
918
- leavingThisState = False
919
-
920
- if leavingThisState:
921
- # Attributes are not dropped at this stage. That happens when the
922
- # start tag token is emitted so values can still be safely appended
923
- # to attributes, but we do want to report the parse error in time.
924
- if self.lowercaseAttrName:
925
- self.currentToken["data"][-1][0] = (
926
- self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
927
- for name, value in self.currentToken["data"][:-1]:
928
- if self.currentToken["data"][-1][0] == name:
929
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
930
- "duplicate-attribute"})
931
- break
932
- # XXX Fix for above XXX
933
- if emitToken:
934
- self.emitCurrentToken()
935
- return True
936
-
937
- def afterAttributeNameState(self):
938
- data = self.stream.char()
939
- if data in spaceCharacters:
940
- self.stream.charsUntil(spaceCharacters, True)
941
- elif data == "=":
942
- self.state = self.beforeAttributeValueState
943
- elif data == ">":
944
- self.emitCurrentToken()
945
- elif data in asciiLetters:
946
- self.currentToken["data"].append([data, ""])
947
- self.state = self.attributeNameState
948
- elif data == "/":
949
- self.state = self.selfClosingStartTagState
950
- elif data == "\u0000":
951
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
952
- "data": "invalid-codepoint"})
953
- self.currentToken["data"].append(["\uFFFD", ""])
954
- self.state = self.attributeNameState
955
- elif data in ("'", '"', "<"):
956
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
957
- "invalid-character-after-attribute-name"})
958
- self.currentToken["data"].append([data, ""])
959
- self.state = self.attributeNameState
960
- elif data is EOF:
961
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
962
- "expected-end-of-tag-but-got-eof"})
963
- self.state = self.dataState
964
- else:
965
- self.currentToken["data"].append([data, ""])
966
- self.state = self.attributeNameState
967
- return True
968
-
969
- def beforeAttributeValueState(self):
970
- data = self.stream.char()
971
- if data in spaceCharacters:
972
- self.stream.charsUntil(spaceCharacters, True)
973
- elif data == "\"":
974
- self.state = self.attributeValueDoubleQuotedState
975
- elif data == "&":
976
- self.state = self.attributeValueUnQuotedState
977
- self.stream.unget(data)
978
- elif data == "'":
979
- self.state = self.attributeValueSingleQuotedState
980
- elif data == ">":
981
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
982
- "expected-attribute-value-but-got-right-bracket"})
983
- self.emitCurrentToken()
984
- elif data == "\u0000":
985
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
986
- "data": "invalid-codepoint"})
987
- self.currentToken["data"][-1][1] += "\uFFFD"
988
- self.state = self.attributeValueUnQuotedState
989
- elif data in ("=", "<", "`"):
990
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
991
- "equals-in-unquoted-attribute-value"})
992
- self.currentToken["data"][-1][1] += data
993
- self.state = self.attributeValueUnQuotedState
994
- elif data is EOF:
995
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
996
- "expected-attribute-value-but-got-eof"})
997
- self.state = self.dataState
998
- else:
999
- self.currentToken["data"][-1][1] += data
1000
- self.state = self.attributeValueUnQuotedState
1001
- return True
1002
-
1003
- def attributeValueDoubleQuotedState(self):
1004
- data = self.stream.char()
1005
- if data == "\"":
1006
- self.state = self.afterAttributeValueState
1007
- elif data == "&":
1008
- self.processEntityInAttribute('"')
1009
- elif data == "\u0000":
1010
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
1011
- "data": "invalid-codepoint"})
1012
- self.currentToken["data"][-1][1] += "\uFFFD"
1013
- elif data is EOF:
1014
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1015
- "eof-in-attribute-value-double-quote"})
1016
- self.state = self.dataState
1017
- else:
1018
- self.currentToken["data"][-1][1] += data +\
1019
- self.stream.charsUntil(("\"", "&", "\u0000"))
1020
- return True
1021
-
1022
- def attributeValueSingleQuotedState(self):
1023
- data = self.stream.char()
1024
- if data == "'":
1025
- self.state = self.afterAttributeValueState
1026
- elif data == "&":
1027
- self.processEntityInAttribute("'")
1028
- elif data == "\u0000":
1029
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
1030
- "data": "invalid-codepoint"})
1031
- self.currentToken["data"][-1][1] += "\uFFFD"
1032
- elif data is EOF:
1033
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1034
- "eof-in-attribute-value-single-quote"})
1035
- self.state = self.dataState
1036
- else:
1037
- self.currentToken["data"][-1][1] += data +\
1038
- self.stream.charsUntil(("'", "&", "\u0000"))
1039
- return True
1040
-
1041
- def attributeValueUnQuotedState(self):
1042
- data = self.stream.char()
1043
- if data in spaceCharacters:
1044
- self.state = self.beforeAttributeNameState
1045
- elif data == "&":
1046
- self.processEntityInAttribute(">")
1047
- elif data == ">":
1048
- self.emitCurrentToken()
1049
- elif data in ('"', "'", "=", "<", "`"):
1050
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1051
- "unexpected-character-in-unquoted-attribute-value"})
1052
- self.currentToken["data"][-1][1] += data
1053
- elif data == "\u0000":
1054
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
1055
- "data": "invalid-codepoint"})
1056
- self.currentToken["data"][-1][1] += "\uFFFD"
1057
- elif data is EOF:
1058
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1059
- "eof-in-attribute-value-no-quotes"})
1060
- self.state = self.dataState
1061
- else:
1062
- self.currentToken["data"][-1][1] += data + self.stream.charsUntil(
1063
- frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters)
1064
- return True
1065
-
1066
- def afterAttributeValueState(self):
1067
- data = self.stream.char()
1068
- if data in spaceCharacters:
1069
- self.state = self.beforeAttributeNameState
1070
- elif data == ">":
1071
- self.emitCurrentToken()
1072
- elif data == "/":
1073
- self.state = self.selfClosingStartTagState
1074
- elif data is EOF:
1075
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1076
- "unexpected-EOF-after-attribute-value"})
1077
- self.stream.unget(data)
1078
- self.state = self.dataState
1079
- else:
1080
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1081
- "unexpected-character-after-attribute-value"})
1082
- self.stream.unget(data)
1083
- self.state = self.beforeAttributeNameState
1084
- return True
1085
-
1086
- def selfClosingStartTagState(self):
1087
- data = self.stream.char()
1088
- if data == ">":
1089
- self.currentToken["selfClosing"] = True
1090
- self.emitCurrentToken()
1091
- elif data is EOF:
1092
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
1093
- "data":
1094
- "unexpected-EOF-after-solidus-in-tag"})
1095
- self.stream.unget(data)
1096
- self.state = self.dataState
1097
- else:
1098
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1099
- "unexpected-character-after-solidus-in-tag"})
1100
- self.stream.unget(data)
1101
- self.state = self.beforeAttributeNameState
1102
- return True
1103
-
1104
- def bogusCommentState(self):
1105
- # Make a new comment token and give it as value all the characters
1106
- # until the first > or EOF (charsUntil checks for EOF automatically)
1107
- # and emit it.
1108
- data = self.stream.charsUntil(">")
1109
- data = data.replace("\u0000", "\uFFFD")
1110
- self.tokenQueue.append(
1111
- {"type": tokenTypes["Comment"], "data": data})
1112
-
1113
- # Eat the character directly after the bogus comment which is either a
1114
- # ">" or an EOF.
1115
- self.stream.char()
1116
- self.state = self.dataState
1117
- return True
1118
-
1119
- def markupDeclarationOpenState(self):
1120
- charStack = [self.stream.char()]
1121
- if charStack[-1] == "-":
1122
- charStack.append(self.stream.char())
1123
- if charStack[-1] == "-":
1124
- self.currentToken = {"type": tokenTypes["Comment"], "data": ""}
1125
- self.state = self.commentStartState
1126
- return True
1127
- elif charStack[-1] in ('d', 'D'):
1128
- matched = True
1129
- for expected in (('o', 'O'), ('c', 'C'), ('t', 'T'),
1130
- ('y', 'Y'), ('p', 'P'), ('e', 'E')):
1131
- charStack.append(self.stream.char())
1132
- if charStack[-1] not in expected:
1133
- matched = False
1134
- break
1135
- if matched:
1136
- self.currentToken = {"type": tokenTypes["Doctype"],
1137
- "name": "",
1138
- "publicId": None, "systemId": None,
1139
- "correct": True}
1140
- self.state = self.doctypeState
1141
- return True
1142
- elif (charStack[-1] == "[" and
1143
- self.parser is not None and
1144
- self.parser.tree.openElements and
1145
- self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
1146
- matched = True
1147
- for expected in ["C", "D", "A", "T", "A", "["]:
1148
- charStack.append(self.stream.char())
1149
- if charStack[-1] != expected:
1150
- matched = False
1151
- break
1152
- if matched:
1153
- self.state = self.cdataSectionState
1154
- return True
1155
-
1156
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1157
- "expected-dashes-or-doctype"})
1158
-
1159
- while charStack:
1160
- self.stream.unget(charStack.pop())
1161
- self.state = self.bogusCommentState
1162
- return True
1163
-
1164
- def commentStartState(self):
1165
- data = self.stream.char()
1166
- if data == "-":
1167
- self.state = self.commentStartDashState
1168
- elif data == "\u0000":
1169
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
1170
- "data": "invalid-codepoint"})
1171
- self.currentToken["data"] += "\uFFFD"
1172
- elif data == ">":
1173
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1174
- "incorrect-comment"})
1175
- self.tokenQueue.append(self.currentToken)
1176
- self.state = self.dataState
1177
- elif data is EOF:
1178
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1179
- "eof-in-comment"})
1180
- self.tokenQueue.append(self.currentToken)
1181
- self.state = self.dataState
1182
- else:
1183
- self.currentToken["data"] += data
1184
- self.state = self.commentState
1185
- return True
1186
-
1187
- def commentStartDashState(self):
1188
- data = self.stream.char()
1189
- if data == "-":
1190
- self.state = self.commentEndState
1191
- elif data == "\u0000":
1192
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
1193
- "data": "invalid-codepoint"})
1194
- self.currentToken["data"] += "-\uFFFD"
1195
- elif data == ">":
1196
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1197
- "incorrect-comment"})
1198
- self.tokenQueue.append(self.currentToken)
1199
- self.state = self.dataState
1200
- elif data is EOF:
1201
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1202
- "eof-in-comment"})
1203
- self.tokenQueue.append(self.currentToken)
1204
- self.state = self.dataState
1205
- else:
1206
- self.currentToken["data"] += "-" + data
1207
- self.state = self.commentState
1208
- return True
1209
-
1210
- def commentState(self):
1211
- data = self.stream.char()
1212
- if data == "-":
1213
- self.state = self.commentEndDashState
1214
- elif data == "\u0000":
1215
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
1216
- "data": "invalid-codepoint"})
1217
- self.currentToken["data"] += "\uFFFD"
1218
- elif data is EOF:
1219
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
1220
- "data": "eof-in-comment"})
1221
- self.tokenQueue.append(self.currentToken)
1222
- self.state = self.dataState
1223
- else:
1224
- self.currentToken["data"] += data + \
1225
- self.stream.charsUntil(("-", "\u0000"))
1226
- return True
1227
-
1228
- def commentEndDashState(self):
1229
- data = self.stream.char()
1230
- if data == "-":
1231
- self.state = self.commentEndState
1232
- elif data == "\u0000":
1233
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
1234
- "data": "invalid-codepoint"})
1235
- self.currentToken["data"] += "-\uFFFD"
1236
- self.state = self.commentState
1237
- elif data is EOF:
1238
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1239
- "eof-in-comment-end-dash"})
1240
- self.tokenQueue.append(self.currentToken)
1241
- self.state = self.dataState
1242
- else:
1243
- self.currentToken["data"] += "-" + data
1244
- self.state = self.commentState
1245
- return True
1246
-
1247
- def commentEndState(self):
1248
- data = self.stream.char()
1249
- if data == ">":
1250
- self.tokenQueue.append(self.currentToken)
1251
- self.state = self.dataState
1252
- elif data == "\u0000":
1253
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
1254
- "data": "invalid-codepoint"})
1255
- self.currentToken["data"] += "--\uFFFD"
1256
- self.state = self.commentState
1257
- elif data == "!":
1258
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1259
- "unexpected-bang-after-double-dash-in-comment"})
1260
- self.state = self.commentEndBangState
1261
- elif data == "-":
1262
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1263
- "unexpected-dash-after-double-dash-in-comment"})
1264
- self.currentToken["data"] += data
1265
- elif data is EOF:
1266
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1267
- "eof-in-comment-double-dash"})
1268
- self.tokenQueue.append(self.currentToken)
1269
- self.state = self.dataState
1270
- else:
1271
- # XXX
1272
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1273
- "unexpected-char-in-comment"})
1274
- self.currentToken["data"] += "--" + data
1275
- self.state = self.commentState
1276
- return True
1277
-
1278
- def commentEndBangState(self):
1279
- data = self.stream.char()
1280
- if data == ">":
1281
- self.tokenQueue.append(self.currentToken)
1282
- self.state = self.dataState
1283
- elif data == "-":
1284
- self.currentToken["data"] += "--!"
1285
- self.state = self.commentEndDashState
1286
- elif data == "\u0000":
1287
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
1288
- "data": "invalid-codepoint"})
1289
- self.currentToken["data"] += "--!\uFFFD"
1290
- self.state = self.commentState
1291
- elif data is EOF:
1292
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1293
- "eof-in-comment-end-bang-state"})
1294
- self.tokenQueue.append(self.currentToken)
1295
- self.state = self.dataState
1296
- else:
1297
- self.currentToken["data"] += "--!" + data
1298
- self.state = self.commentState
1299
- return True
1300
-
1301
- def doctypeState(self):
1302
- data = self.stream.char()
1303
- if data in spaceCharacters:
1304
- self.state = self.beforeDoctypeNameState
1305
- elif data is EOF:
1306
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1307
- "expected-doctype-name-but-got-eof"})
1308
- self.currentToken["correct"] = False
1309
- self.tokenQueue.append(self.currentToken)
1310
- self.state = self.dataState
1311
- else:
1312
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1313
- "need-space-after-doctype"})
1314
- self.stream.unget(data)
1315
- self.state = self.beforeDoctypeNameState
1316
- return True
1317
-
1318
- def beforeDoctypeNameState(self):
1319
- data = self.stream.char()
1320
- if data in spaceCharacters:
1321
- pass
1322
- elif data == ">":
1323
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1324
- "expected-doctype-name-but-got-right-bracket"})
1325
- self.currentToken["correct"] = False
1326
- self.tokenQueue.append(self.currentToken)
1327
- self.state = self.dataState
1328
- elif data == "\u0000":
1329
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
1330
- "data": "invalid-codepoint"})
1331
- self.currentToken["name"] = "\uFFFD"
1332
- self.state = self.doctypeNameState
1333
- elif data is EOF:
1334
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1335
- "expected-doctype-name-but-got-eof"})
1336
- self.currentToken["correct"] = False
1337
- self.tokenQueue.append(self.currentToken)
1338
- self.state = self.dataState
1339
- else:
1340
- self.currentToken["name"] = data
1341
- self.state = self.doctypeNameState
1342
- return True
1343
-
1344
- def doctypeNameState(self):
1345
- data = self.stream.char()
1346
- if data in spaceCharacters:
1347
- self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
1348
- self.state = self.afterDoctypeNameState
1349
- elif data == ">":
1350
- self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
1351
- self.tokenQueue.append(self.currentToken)
1352
- self.state = self.dataState
1353
- elif data == "\u0000":
1354
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
1355
- "data": "invalid-codepoint"})
1356
- self.currentToken["name"] += "\uFFFD"
1357
- self.state = self.doctypeNameState
1358
- elif data is EOF:
1359
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1360
- "eof-in-doctype-name"})
1361
- self.currentToken["correct"] = False
1362
- self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
1363
- self.tokenQueue.append(self.currentToken)
1364
- self.state = self.dataState
1365
- else:
1366
- self.currentToken["name"] += data
1367
- return True
1368
-
1369
- def afterDoctypeNameState(self):
1370
- data = self.stream.char()
1371
- if data in spaceCharacters:
1372
- pass
1373
- elif data == ">":
1374
- self.tokenQueue.append(self.currentToken)
1375
- self.state = self.dataState
1376
- elif data is EOF:
1377
- self.currentToken["correct"] = False
1378
- self.stream.unget(data)
1379
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1380
- "eof-in-doctype"})
1381
- self.tokenQueue.append(self.currentToken)
1382
- self.state = self.dataState
1383
- else:
1384
- if data in ("p", "P"):
1385
- matched = True
1386
- for expected in (("u", "U"), ("b", "B"), ("l", "L"),
1387
- ("i", "I"), ("c", "C")):
1388
- data = self.stream.char()
1389
- if data not in expected:
1390
- matched = False
1391
- break
1392
- if matched:
1393
- self.state = self.afterDoctypePublicKeywordState
1394
- return True
1395
- elif data in ("s", "S"):
1396
- matched = True
1397
- for expected in (("y", "Y"), ("s", "S"), ("t", "T"),
1398
- ("e", "E"), ("m", "M")):
1399
- data = self.stream.char()
1400
- if data not in expected:
1401
- matched = False
1402
- break
1403
- if matched:
1404
- self.state = self.afterDoctypeSystemKeywordState
1405
- return True
1406
-
1407
- # All the characters read before the current 'data' will be
1408
- # [a-zA-Z], so they're garbage in the bogus doctype and can be
1409
- # discarded; only the latest character might be '>' or EOF
1410
- # and needs to be ungetted
1411
- self.stream.unget(data)
1412
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1413
- "expected-space-or-right-bracket-in-doctype", "datavars":
1414
- {"data": data}})
1415
- self.currentToken["correct"] = False
1416
- self.state = self.bogusDoctypeState
1417
-
1418
- return True
1419
-
1420
- def afterDoctypePublicKeywordState(self):
1421
- data = self.stream.char()
1422
- if data in spaceCharacters:
1423
- self.state = self.beforeDoctypePublicIdentifierState
1424
- elif data in ("'", '"'):
1425
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1426
- "unexpected-char-in-doctype"})
1427
- self.stream.unget(data)
1428
- self.state = self.beforeDoctypePublicIdentifierState
1429
- elif data is EOF:
1430
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1431
- "eof-in-doctype"})
1432
- self.currentToken["correct"] = False
1433
- self.tokenQueue.append(self.currentToken)
1434
- self.state = self.dataState
1435
- else:
1436
- self.stream.unget(data)
1437
- self.state = self.beforeDoctypePublicIdentifierState
1438
- return True
1439
-
1440
- def beforeDoctypePublicIdentifierState(self):
1441
- data = self.stream.char()
1442
- if data in spaceCharacters:
1443
- pass
1444
- elif data == "\"":
1445
- self.currentToken["publicId"] = ""
1446
- self.state = self.doctypePublicIdentifierDoubleQuotedState
1447
- elif data == "'":
1448
- self.currentToken["publicId"] = ""
1449
- self.state = self.doctypePublicIdentifierSingleQuotedState
1450
- elif data == ">":
1451
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1452
- "unexpected-end-of-doctype"})
1453
- self.currentToken["correct"] = False
1454
- self.tokenQueue.append(self.currentToken)
1455
- self.state = self.dataState
1456
- elif data is EOF:
1457
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1458
- "eof-in-doctype"})
1459
- self.currentToken["correct"] = False
1460
- self.tokenQueue.append(self.currentToken)
1461
- self.state = self.dataState
1462
- else:
1463
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1464
- "unexpected-char-in-doctype"})
1465
- self.currentToken["correct"] = False
1466
- self.state = self.bogusDoctypeState
1467
- return True
1468
-
1469
- def doctypePublicIdentifierDoubleQuotedState(self):
1470
- data = self.stream.char()
1471
- if data == "\"":
1472
- self.state = self.afterDoctypePublicIdentifierState
1473
- elif data == "\u0000":
1474
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
1475
- "data": "invalid-codepoint"})
1476
- self.currentToken["publicId"] += "\uFFFD"
1477
- elif data == ">":
1478
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1479
- "unexpected-end-of-doctype"})
1480
- self.currentToken["correct"] = False
1481
- self.tokenQueue.append(self.currentToken)
1482
- self.state = self.dataState
1483
- elif data is EOF:
1484
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1485
- "eof-in-doctype"})
1486
- self.currentToken["correct"] = False
1487
- self.tokenQueue.append(self.currentToken)
1488
- self.state = self.dataState
1489
- else:
1490
- self.currentToken["publicId"] += data
1491
- return True
1492
-
1493
- def doctypePublicIdentifierSingleQuotedState(self):
1494
- data = self.stream.char()
1495
- if data == "'":
1496
- self.state = self.afterDoctypePublicIdentifierState
1497
- elif data == "\u0000":
1498
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
1499
- "data": "invalid-codepoint"})
1500
- self.currentToken["publicId"] += "\uFFFD"
1501
- elif data == ">":
1502
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1503
- "unexpected-end-of-doctype"})
1504
- self.currentToken["correct"] = False
1505
- self.tokenQueue.append(self.currentToken)
1506
- self.state = self.dataState
1507
- elif data is EOF:
1508
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1509
- "eof-in-doctype"})
1510
- self.currentToken["correct"] = False
1511
- self.tokenQueue.append(self.currentToken)
1512
- self.state = self.dataState
1513
- else:
1514
- self.currentToken["publicId"] += data
1515
- return True
1516
-
1517
- def afterDoctypePublicIdentifierState(self):
1518
- data = self.stream.char()
1519
- if data in spaceCharacters:
1520
- self.state = self.betweenDoctypePublicAndSystemIdentifiersState
1521
- elif data == ">":
1522
- self.tokenQueue.append(self.currentToken)
1523
- self.state = self.dataState
1524
- elif data == '"':
1525
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1526
- "unexpected-char-in-doctype"})
1527
- self.currentToken["systemId"] = ""
1528
- self.state = self.doctypeSystemIdentifierDoubleQuotedState
1529
- elif data == "'":
1530
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1531
- "unexpected-char-in-doctype"})
1532
- self.currentToken["systemId"] = ""
1533
- self.state = self.doctypeSystemIdentifierSingleQuotedState
1534
- elif data is EOF:
1535
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1536
- "eof-in-doctype"})
1537
- self.currentToken["correct"] = False
1538
- self.tokenQueue.append(self.currentToken)
1539
- self.state = self.dataState
1540
- else:
1541
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1542
- "unexpected-char-in-doctype"})
1543
- self.currentToken["correct"] = False
1544
- self.state = self.bogusDoctypeState
1545
- return True
1546
-
1547
- def betweenDoctypePublicAndSystemIdentifiersState(self):
1548
- data = self.stream.char()
1549
- if data in spaceCharacters:
1550
- pass
1551
- elif data == ">":
1552
- self.tokenQueue.append(self.currentToken)
1553
- self.state = self.dataState
1554
- elif data == '"':
1555
- self.currentToken["systemId"] = ""
1556
- self.state = self.doctypeSystemIdentifierDoubleQuotedState
1557
- elif data == "'":
1558
- self.currentToken["systemId"] = ""
1559
- self.state = self.doctypeSystemIdentifierSingleQuotedState
1560
- elif data == EOF:
1561
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1562
- "eof-in-doctype"})
1563
- self.currentToken["correct"] = False
1564
- self.tokenQueue.append(self.currentToken)
1565
- self.state = self.dataState
1566
- else:
1567
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1568
- "unexpected-char-in-doctype"})
1569
- self.currentToken["correct"] = False
1570
- self.state = self.bogusDoctypeState
1571
- return True
1572
-
1573
- def afterDoctypeSystemKeywordState(self):
1574
- data = self.stream.char()
1575
- if data in spaceCharacters:
1576
- self.state = self.beforeDoctypeSystemIdentifierState
1577
- elif data in ("'", '"'):
1578
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1579
- "unexpected-char-in-doctype"})
1580
- self.stream.unget(data)
1581
- self.state = self.beforeDoctypeSystemIdentifierState
1582
- elif data is EOF:
1583
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1584
- "eof-in-doctype"})
1585
- self.currentToken["correct"] = False
1586
- self.tokenQueue.append(self.currentToken)
1587
- self.state = self.dataState
1588
- else:
1589
- self.stream.unget(data)
1590
- self.state = self.beforeDoctypeSystemIdentifierState
1591
- return True
1592
-
1593
- def beforeDoctypeSystemIdentifierState(self):
1594
- data = self.stream.char()
1595
- if data in spaceCharacters:
1596
- pass
1597
- elif data == "\"":
1598
- self.currentToken["systemId"] = ""
1599
- self.state = self.doctypeSystemIdentifierDoubleQuotedState
1600
- elif data == "'":
1601
- self.currentToken["systemId"] = ""
1602
- self.state = self.doctypeSystemIdentifierSingleQuotedState
1603
- elif data == ">":
1604
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1605
- "unexpected-char-in-doctype"})
1606
- self.currentToken["correct"] = False
1607
- self.tokenQueue.append(self.currentToken)
1608
- self.state = self.dataState
1609
- elif data is EOF:
1610
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1611
- "eof-in-doctype"})
1612
- self.currentToken["correct"] = False
1613
- self.tokenQueue.append(self.currentToken)
1614
- self.state = self.dataState
1615
- else:
1616
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1617
- "unexpected-char-in-doctype"})
1618
- self.currentToken["correct"] = False
1619
- self.state = self.bogusDoctypeState
1620
- return True
1621
-
1622
- def doctypeSystemIdentifierDoubleQuotedState(self):
1623
- data = self.stream.char()
1624
- if data == "\"":
1625
- self.state = self.afterDoctypeSystemIdentifierState
1626
- elif data == "\u0000":
1627
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
1628
- "data": "invalid-codepoint"})
1629
- self.currentToken["systemId"] += "\uFFFD"
1630
- elif data == ">":
1631
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1632
- "unexpected-end-of-doctype"})
1633
- self.currentToken["correct"] = False
1634
- self.tokenQueue.append(self.currentToken)
1635
- self.state = self.dataState
1636
- elif data is EOF:
1637
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1638
- "eof-in-doctype"})
1639
- self.currentToken["correct"] = False
1640
- self.tokenQueue.append(self.currentToken)
1641
- self.state = self.dataState
1642
- else:
1643
- self.currentToken["systemId"] += data
1644
- return True
1645
-
1646
- def doctypeSystemIdentifierSingleQuotedState(self):
1647
- data = self.stream.char()
1648
- if data == "'":
1649
- self.state = self.afterDoctypeSystemIdentifierState
1650
- elif data == "\u0000":
1651
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
1652
- "data": "invalid-codepoint"})
1653
- self.currentToken["systemId"] += "\uFFFD"
1654
- elif data == ">":
1655
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1656
- "unexpected-end-of-doctype"})
1657
- self.currentToken["correct"] = False
1658
- self.tokenQueue.append(self.currentToken)
1659
- self.state = self.dataState
1660
- elif data is EOF:
1661
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1662
- "eof-in-doctype"})
1663
- self.currentToken["correct"] = False
1664
- self.tokenQueue.append(self.currentToken)
1665
- self.state = self.dataState
1666
- else:
1667
- self.currentToken["systemId"] += data
1668
- return True
1669
-
1670
- def afterDoctypeSystemIdentifierState(self):
1671
- data = self.stream.char()
1672
- if data in spaceCharacters:
1673
- pass
1674
- elif data == ">":
1675
- self.tokenQueue.append(self.currentToken)
1676
- self.state = self.dataState
1677
- elif data is EOF:
1678
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1679
- "eof-in-doctype"})
1680
- self.currentToken["correct"] = False
1681
- self.tokenQueue.append(self.currentToken)
1682
- self.state = self.dataState
1683
- else:
1684
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1685
- "unexpected-char-in-doctype"})
1686
- self.state = self.bogusDoctypeState
1687
- return True
1688
-
1689
- def bogusDoctypeState(self):
1690
- data = self.stream.char()
1691
- if data == ">":
1692
- self.tokenQueue.append(self.currentToken)
1693
- self.state = self.dataState
1694
- elif data is EOF:
1695
- # XXX EMIT
1696
- self.stream.unget(data)
1697
- self.tokenQueue.append(self.currentToken)
1698
- self.state = self.dataState
1699
- else:
1700
- pass
1701
- return True
1702
-
1703
- def cdataSectionState(self):
1704
- data = []
1705
- while True:
1706
- data.append(self.stream.charsUntil("]"))
1707
- data.append(self.stream.charsUntil(">"))
1708
- char = self.stream.char()
1709
- if char == EOF:
1710
- break
1711
- else:
1712
- assert char == ">"
1713
- if data[-1][-2:] == "]]":
1714
- data[-1] = data[-1][:-2]
1715
- break
1716
- else:
1717
- data.append(char)
1718
-
1719
- data = "".join(data)
1720
- # Deal with null here rather than in the parser
1721
- nullCount = data.count("\u0000")
1722
- if nullCount > 0:
1723
- for i in range(nullCount):
1724
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
1725
- "data": "invalid-codepoint"})
1726
- data = data.replace("\u0000", "\uFFFD")
1727
- if data:
1728
- self.tokenQueue.append({"type": tokenTypes["Characters"],
1729
- "data": data})
1730
- self.state = self.dataState
1731
- return True
1
+ from __future__ import absolute_import, division, unicode_literals
2
+
3
+ from pip._vendor.six import unichr as chr
4
+
5
+ from collections import deque
6
+
7
+ from .constants import spaceCharacters
8
+ from .constants import entities
9
+ from .constants import asciiLetters, asciiUpper2Lower
10
+ from .constants import digits, hexDigits, EOF
11
+ from .constants import tokenTypes, tagTokenTypes
12
+ from .constants import replacementCharacters
13
+
14
+ from ._inputstream import HTMLInputStream
15
+
16
+ from ._trie import Trie
17
+
18
+ entitiesTrie = Trie(entities)
19
+
20
+
21
+ class HTMLTokenizer(object):
22
+ """ This class takes care of tokenizing HTML.
23
+
24
+ * self.currentToken
25
+ Holds the token that is currently being processed.
26
+
27
+ * self.state
28
+ Holds a reference to the method to be invoked... XXX
29
+
30
+ * self.stream
31
+ Points to HTMLInputStream object.
32
+ """
33
+
34
+ def __init__(self, stream, parser=None, **kwargs):
35
+
36
+ self.stream = HTMLInputStream(stream, **kwargs)
37
+ self.parser = parser
38
+
39
+ # Setup the initial tokenizer state
40
+ self.escapeFlag = False
41
+ self.lastFourChars = []
42
+ self.state = self.dataState
43
+ self.escape = False
44
+
45
+ # The current token being created
46
+ self.currentToken = None
47
+ super(HTMLTokenizer, self).__init__()
48
+
49
+ def __iter__(self):
50
+ """ This is where the magic happens.
51
+
52
+ We do our usually processing through the states and when we have a token
53
+ to return we yield the token which pauses processing until the next token
54
+ is requested.
55
+ """
56
+ self.tokenQueue = deque([])
57
+ # Start processing. When EOF is reached self.state will return False
58
+ # instead of True and the loop will terminate.
59
+ while self.state():
60
+ while self.stream.errors:
61
+ yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)}
62
+ while self.tokenQueue:
63
+ yield self.tokenQueue.popleft()
64
+
65
+ def consumeNumberEntity(self, isHex):
66
+ """This function returns either U+FFFD or the character based on the
67
+ decimal or hexadecimal representation. It also discards ";" if present.
68
+ If not present self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked.
69
+ """
70
+
71
+ allowed = digits
72
+ radix = 10
73
+ if isHex:
74
+ allowed = hexDigits
75
+ radix = 16
76
+
77
+ charStack = []
78
+
79
+ # Consume all the characters that are in range while making sure we
80
+ # don't hit an EOF.
81
+ c = self.stream.char()
82
+ while c in allowed and c is not EOF:
83
+ charStack.append(c)
84
+ c = self.stream.char()
85
+
86
+ # Convert the set of characters consumed to an int.
87
+ charAsInt = int("".join(charStack), radix)
88
+
89
+ # Certain characters get replaced with others
90
+ if charAsInt in replacementCharacters:
91
+ char = replacementCharacters[charAsInt]
92
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
93
+ "illegal-codepoint-for-numeric-entity",
94
+ "datavars": {"charAsInt": charAsInt}})
95
+ elif ((0xD800 <= charAsInt <= 0xDFFF) or
96
+ (charAsInt > 0x10FFFF)):
97
+ char = "\uFFFD"
98
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
99
+ "illegal-codepoint-for-numeric-entity",
100
+ "datavars": {"charAsInt": charAsInt}})
101
+ else:
102
+ # Should speed up this check somehow (e.g. move the set to a constant)
103
+ if ((0x0001 <= charAsInt <= 0x0008) or
104
+ (0x000E <= charAsInt <= 0x001F) or
105
+ (0x007F <= charAsInt <= 0x009F) or
106
+ (0xFDD0 <= charAsInt <= 0xFDEF) or
107
+ charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
108
+ 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
109
+ 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
110
+ 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
111
+ 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
112
+ 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
113
+ 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
114
+ 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
115
+ 0xFFFFF, 0x10FFFE, 0x10FFFF])):
116
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
117
+ "data":
118
+ "illegal-codepoint-for-numeric-entity",
119
+ "datavars": {"charAsInt": charAsInt}})
120
+ try:
121
+ # Try/except needed as UCS-2 Python builds' unichar only works
122
+ # within the BMP.
123
+ char = chr(charAsInt)
124
+ except ValueError:
125
+ v = charAsInt - 0x10000
126
+ char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF))
127
+
128
+ # Discard the ; if present. Otherwise, put it back on the queue and
129
+ # invoke parseError on parser.
130
+ if c != ";":
131
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
132
+ "numeric-entity-without-semicolon"})
133
+ self.stream.unget(c)
134
+
135
+ return char
136
+
137
+ def consumeEntity(self, allowedChar=None, fromAttribute=False):
138
+ # Initialise to the default output for when no entity is matched
139
+ output = "&"
140
+
141
+ charStack = [self.stream.char()]
142
+ if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or
143
+ (allowedChar is not None and allowedChar == charStack[0])):
144
+ self.stream.unget(charStack[0])
145
+
146
+ elif charStack[0] == "#":
147
+ # Read the next character to see if it's hex or decimal
148
+ hex = False
149
+ charStack.append(self.stream.char())
150
+ if charStack[-1] in ("x", "X"):
151
+ hex = True
152
+ charStack.append(self.stream.char())
153
+
154
+ # charStack[-1] should be the first digit
155
+ if (hex and charStack[-1] in hexDigits) \
156
+ or (not hex and charStack[-1] in digits):
157
+ # At least one digit found, so consume the whole number
158
+ self.stream.unget(charStack[-1])
159
+ output = self.consumeNumberEntity(hex)
160
+ else:
161
+ # No digits found
162
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
163
+ "data": "expected-numeric-entity"})
164
+ self.stream.unget(charStack.pop())
165
+ output = "&" + "".join(charStack)
166
+
167
+ else:
168
+ # At this point in the process might have named entity. Entities
169
+ # are stored in the global variable "entities".
170
+ #
171
+ # Consume characters and compare to these to a substring of the
172
+ # entity names in the list until the substring no longer matches.
173
+ while (charStack[-1] is not EOF):
174
+ if not entitiesTrie.has_keys_with_prefix("".join(charStack)):
175
+ break
176
+ charStack.append(self.stream.char())
177
+
178
+ # At this point we have a string that starts with some characters
179
+ # that may match an entity
180
+ # Try to find the longest entity the string will match to take care
181
+ # of &noti for instance.
182
+ try:
183
+ entityName = entitiesTrie.longest_prefix("".join(charStack[:-1]))
184
+ entityLength = len(entityName)
185
+ except KeyError:
186
+ entityName = None
187
+
188
+ if entityName is not None:
189
+ if entityName[-1] != ";":
190
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
191
+ "named-entity-without-semicolon"})
192
+ if (entityName[-1] != ";" and fromAttribute and
193
+ (charStack[entityLength] in asciiLetters or
194
+ charStack[entityLength] in digits or
195
+ charStack[entityLength] == "=")):
196
+ self.stream.unget(charStack.pop())
197
+ output = "&" + "".join(charStack)
198
+ else:
199
+ output = entities[entityName]
200
+ self.stream.unget(charStack.pop())
201
+ output += "".join(charStack[entityLength:])
202
+ else:
203
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
204
+ "expected-named-entity"})
205
+ self.stream.unget(charStack.pop())
206
+ output = "&" + "".join(charStack)
207
+
208
+ if fromAttribute:
209
+ self.currentToken["data"][-1][1] += output
210
+ else:
211
+ if output in spaceCharacters:
212
+ tokenType = "SpaceCharacters"
213
+ else:
214
+ tokenType = "Characters"
215
+ self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output})
216
+
217
+ def processEntityInAttribute(self, allowedChar):
218
+ """This method replaces the need for "entityInAttributeValueState".
219
+ """
220
+ self.consumeEntity(allowedChar=allowedChar, fromAttribute=True)
221
+
222
+ def emitCurrentToken(self):
223
+ """This method is a generic handler for emitting the tags. It also sets
224
+ the state to "data" because that's what's needed after a token has been
225
+ emitted.
226
+ """
227
+ token = self.currentToken
228
+ # Add token to the queue to be yielded
229
+ if (token["type"] in tagTokenTypes):
230
+ token["name"] = token["name"].translate(asciiUpper2Lower)
231
+ if token["type"] == tokenTypes["EndTag"]:
232
+ if token["data"]:
233
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
234
+ "data": "attributes-in-end-tag"})
235
+ if token["selfClosing"]:
236
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
237
+ "data": "self-closing-flag-on-end-tag"})
238
+ self.tokenQueue.append(token)
239
+ self.state = self.dataState
240
+
241
+ # Below are the various tokenizer states worked out.
242
+ def dataState(self):
243
+ data = self.stream.char()
244
+ if data == "&":
245
+ self.state = self.entityDataState
246
+ elif data == "<":
247
+ self.state = self.tagOpenState
248
+ elif data == "\u0000":
249
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
250
+ "data": "invalid-codepoint"})
251
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
252
+ "data": "\u0000"})
253
+ elif data is EOF:
254
+ # Tokenization ends.
255
+ return False
256
+ elif data in spaceCharacters:
257
+ # Directly after emitting a token you switch back to the "data
258
+ # state". At that point spaceCharacters are important so they are
259
+ # emitted separately.
260
+ self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
261
+ data + self.stream.charsUntil(spaceCharacters, True)})
262
+ # No need to update lastFourChars here, since the first space will
263
+ # have already been appended to lastFourChars and will have broken
264
+ # any <!-- or --> sequences
265
+ else:
266
+ chars = self.stream.charsUntil(("&", "<", "\u0000"))
267
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
268
+ data + chars})
269
+ return True
270
+
271
+ def entityDataState(self):
272
+ self.consumeEntity()
273
+ self.state = self.dataState
274
+ return True
275
+
276
+ def rcdataState(self):
277
+ data = self.stream.char()
278
+ if data == "&":
279
+ self.state = self.characterReferenceInRcdata
280
+ elif data == "<":
281
+ self.state = self.rcdataLessThanSignState
282
+ elif data == EOF:
283
+ # Tokenization ends.
284
+ return False
285
+ elif data == "\u0000":
286
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
287
+ "data": "invalid-codepoint"})
288
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
289
+ "data": "\uFFFD"})
290
+ elif data in spaceCharacters:
291
+ # Directly after emitting a token you switch back to the "data
292
+ # state". At that point spaceCharacters are important so they are
293
+ # emitted separately.
294
+ self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
295
+ data + self.stream.charsUntil(spaceCharacters, True)})
296
+ # No need to update lastFourChars here, since the first space will
297
+ # have already been appended to lastFourChars and will have broken
298
+ # any <!-- or --> sequences
299
+ else:
300
+ chars = self.stream.charsUntil(("&", "<", "\u0000"))
301
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
302
+ data + chars})
303
+ return True
304
+
305
+ def characterReferenceInRcdata(self):
306
+ self.consumeEntity()
307
+ self.state = self.rcdataState
308
+ return True
309
+
310
+ def rawtextState(self):
311
+ data = self.stream.char()
312
+ if data == "<":
313
+ self.state = self.rawtextLessThanSignState
314
+ elif data == "\u0000":
315
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
316
+ "data": "invalid-codepoint"})
317
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
318
+ "data": "\uFFFD"})
319
+ elif data == EOF:
320
+ # Tokenization ends.
321
+ return False
322
+ else:
323
+ chars = self.stream.charsUntil(("<", "\u0000"))
324
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
325
+ data + chars})
326
+ return True
327
+
328
+ def scriptDataState(self):
329
+ data = self.stream.char()
330
+ if data == "<":
331
+ self.state = self.scriptDataLessThanSignState
332
+ elif data == "\u0000":
333
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
334
+ "data": "invalid-codepoint"})
335
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
336
+ "data": "\uFFFD"})
337
+ elif data == EOF:
338
+ # Tokenization ends.
339
+ return False
340
+ else:
341
+ chars = self.stream.charsUntil(("<", "\u0000"))
342
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
343
+ data + chars})
344
+ return True
345
+
346
+ def plaintextState(self):
347
+ data = self.stream.char()
348
+ if data == EOF:
349
+ # Tokenization ends.
350
+ return False
351
+ elif data == "\u0000":
352
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
353
+ "data": "invalid-codepoint"})
354
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
355
+ "data": "\uFFFD"})
356
+ else:
357
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
358
+ data + self.stream.charsUntil("\u0000")})
359
+ return True
360
+
361
+ def tagOpenState(self):
362
+ data = self.stream.char()
363
+ if data == "!":
364
+ self.state = self.markupDeclarationOpenState
365
+ elif data == "/":
366
+ self.state = self.closeTagOpenState
367
+ elif data in asciiLetters:
368
+ self.currentToken = {"type": tokenTypes["StartTag"],
369
+ "name": data, "data": [],
370
+ "selfClosing": False,
371
+ "selfClosingAcknowledged": False}
372
+ self.state = self.tagNameState
373
+ elif data == ">":
374
+ # XXX In theory it could be something besides a tag name. But
375
+ # do we really care?
376
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
377
+ "expected-tag-name-but-got-right-bracket"})
378
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"})
379
+ self.state = self.dataState
380
+ elif data == "?":
381
+ # XXX In theory it could be something besides a tag name. But
382
+ # do we really care?
383
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
384
+ "expected-tag-name-but-got-question-mark"})
385
+ self.stream.unget(data)
386
+ self.state = self.bogusCommentState
387
+ else:
388
+ # XXX
389
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
390
+ "expected-tag-name"})
391
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
392
+ self.stream.unget(data)
393
+ self.state = self.dataState
394
+ return True
395
+
396
+ def closeTagOpenState(self):
397
+ data = self.stream.char()
398
+ if data in asciiLetters:
399
+ self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
400
+ "data": [], "selfClosing": False}
401
+ self.state = self.tagNameState
402
+ elif data == ">":
403
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
404
+ "expected-closing-tag-but-got-right-bracket"})
405
+ self.state = self.dataState
406
+ elif data is EOF:
407
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
408
+ "expected-closing-tag-but-got-eof"})
409
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
410
+ self.state = self.dataState
411
+ else:
412
+ # XXX data can be _'_...
413
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
414
+ "expected-closing-tag-but-got-char",
415
+ "datavars": {"data": data}})
416
+ self.stream.unget(data)
417
+ self.state = self.bogusCommentState
418
+ return True
419
+
420
+ def tagNameState(self):
421
+ data = self.stream.char()
422
+ if data in spaceCharacters:
423
+ self.state = self.beforeAttributeNameState
424
+ elif data == ">":
425
+ self.emitCurrentToken()
426
+ elif data is EOF:
427
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
428
+ "eof-in-tag-name"})
429
+ self.state = self.dataState
430
+ elif data == "/":
431
+ self.state = self.selfClosingStartTagState
432
+ elif data == "\u0000":
433
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
434
+ "data": "invalid-codepoint"})
435
+ self.currentToken["name"] += "\uFFFD"
436
+ else:
437
+ self.currentToken["name"] += data
438
+ # (Don't use charsUntil here, because tag names are
439
+ # very short and it's faster to not do anything fancy)
440
+ return True
441
+
442
+ def rcdataLessThanSignState(self):
443
+ data = self.stream.char()
444
+ if data == "/":
445
+ self.temporaryBuffer = ""
446
+ self.state = self.rcdataEndTagOpenState
447
+ else:
448
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
449
+ self.stream.unget(data)
450
+ self.state = self.rcdataState
451
+ return True
452
+
453
+ def rcdataEndTagOpenState(self):
454
+ data = self.stream.char()
455
+ if data in asciiLetters:
456
+ self.temporaryBuffer += data
457
+ self.state = self.rcdataEndTagNameState
458
+ else:
459
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
460
+ self.stream.unget(data)
461
+ self.state = self.rcdataState
462
+ return True
463
+
464
+ def rcdataEndTagNameState(self):
465
+ appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
466
+ data = self.stream.char()
467
+ if data in spaceCharacters and appropriate:
468
+ self.currentToken = {"type": tokenTypes["EndTag"],
469
+ "name": self.temporaryBuffer,
470
+ "data": [], "selfClosing": False}
471
+ self.state = self.beforeAttributeNameState
472
+ elif data == "/" and appropriate:
473
+ self.currentToken = {"type": tokenTypes["EndTag"],
474
+ "name": self.temporaryBuffer,
475
+ "data": [], "selfClosing": False}
476
+ self.state = self.selfClosingStartTagState
477
+ elif data == ">" and appropriate:
478
+ self.currentToken = {"type": tokenTypes["EndTag"],
479
+ "name": self.temporaryBuffer,
480
+ "data": [], "selfClosing": False}
481
+ self.emitCurrentToken()
482
+ self.state = self.dataState
483
+ elif data in asciiLetters:
484
+ self.temporaryBuffer += data
485
+ else:
486
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
487
+ "data": "</" + self.temporaryBuffer})
488
+ self.stream.unget(data)
489
+ self.state = self.rcdataState
490
+ return True
491
+
492
+ def rawtextLessThanSignState(self):
493
+ data = self.stream.char()
494
+ if data == "/":
495
+ self.temporaryBuffer = ""
496
+ self.state = self.rawtextEndTagOpenState
497
+ else:
498
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
499
+ self.stream.unget(data)
500
+ self.state = self.rawtextState
501
+ return True
502
+
503
+ def rawtextEndTagOpenState(self):
504
+ data = self.stream.char()
505
+ if data in asciiLetters:
506
+ self.temporaryBuffer += data
507
+ self.state = self.rawtextEndTagNameState
508
+ else:
509
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
510
+ self.stream.unget(data)
511
+ self.state = self.rawtextState
512
+ return True
513
+
514
+ def rawtextEndTagNameState(self):
515
+ appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
516
+ data = self.stream.char()
517
+ if data in spaceCharacters and appropriate:
518
+ self.currentToken = {"type": tokenTypes["EndTag"],
519
+ "name": self.temporaryBuffer,
520
+ "data": [], "selfClosing": False}
521
+ self.state = self.beforeAttributeNameState
522
+ elif data == "/" and appropriate:
523
+ self.currentToken = {"type": tokenTypes["EndTag"],
524
+ "name": self.temporaryBuffer,
525
+ "data": [], "selfClosing": False}
526
+ self.state = self.selfClosingStartTagState
527
+ elif data == ">" and appropriate:
528
+ self.currentToken = {"type": tokenTypes["EndTag"],
529
+ "name": self.temporaryBuffer,
530
+ "data": [], "selfClosing": False}
531
+ self.emitCurrentToken()
532
+ self.state = self.dataState
533
+ elif data in asciiLetters:
534
+ self.temporaryBuffer += data
535
+ else:
536
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
537
+ "data": "</" + self.temporaryBuffer})
538
+ self.stream.unget(data)
539
+ self.state = self.rawtextState
540
+ return True
541
+
542
+ def scriptDataLessThanSignState(self):
543
+ data = self.stream.char()
544
+ if data == "/":
545
+ self.temporaryBuffer = ""
546
+ self.state = self.scriptDataEndTagOpenState
547
+ elif data == "!":
548
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<!"})
549
+ self.state = self.scriptDataEscapeStartState
550
+ else:
551
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
552
+ self.stream.unget(data)
553
+ self.state = self.scriptDataState
554
+ return True
555
+
556
+ def scriptDataEndTagOpenState(self):
557
+ data = self.stream.char()
558
+ if data in asciiLetters:
559
+ self.temporaryBuffer += data
560
+ self.state = self.scriptDataEndTagNameState
561
+ else:
562
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
563
+ self.stream.unget(data)
564
+ self.state = self.scriptDataState
565
+ return True
566
+
567
+ def scriptDataEndTagNameState(self):
568
+ appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
569
+ data = self.stream.char()
570
+ if data in spaceCharacters and appropriate:
571
+ self.currentToken = {"type": tokenTypes["EndTag"],
572
+ "name": self.temporaryBuffer,
573
+ "data": [], "selfClosing": False}
574
+ self.state = self.beforeAttributeNameState
575
+ elif data == "/" and appropriate:
576
+ self.currentToken = {"type": tokenTypes["EndTag"],
577
+ "name": self.temporaryBuffer,
578
+ "data": [], "selfClosing": False}
579
+ self.state = self.selfClosingStartTagState
580
+ elif data == ">" and appropriate:
581
+ self.currentToken = {"type": tokenTypes["EndTag"],
582
+ "name": self.temporaryBuffer,
583
+ "data": [], "selfClosing": False}
584
+ self.emitCurrentToken()
585
+ self.state = self.dataState
586
+ elif data in asciiLetters:
587
+ self.temporaryBuffer += data
588
+ else:
589
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
590
+ "data": "</" + self.temporaryBuffer})
591
+ self.stream.unget(data)
592
+ self.state = self.scriptDataState
593
+ return True
594
+
595
+ def scriptDataEscapeStartState(self):
596
+ data = self.stream.char()
597
+ if data == "-":
598
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
599
+ self.state = self.scriptDataEscapeStartDashState
600
+ else:
601
+ self.stream.unget(data)
602
+ self.state = self.scriptDataState
603
+ return True
604
+
605
+ def scriptDataEscapeStartDashState(self):
606
+ data = self.stream.char()
607
+ if data == "-":
608
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
609
+ self.state = self.scriptDataEscapedDashDashState
610
+ else:
611
+ self.stream.unget(data)
612
+ self.state = self.scriptDataState
613
+ return True
614
+
615
+ def scriptDataEscapedState(self):
616
+ data = self.stream.char()
617
+ if data == "-":
618
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
619
+ self.state = self.scriptDataEscapedDashState
620
+ elif data == "<":
621
+ self.state = self.scriptDataEscapedLessThanSignState
622
+ elif data == "\u0000":
623
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
624
+ "data": "invalid-codepoint"})
625
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
626
+ "data": "\uFFFD"})
627
+ elif data == EOF:
628
+ self.state = self.dataState
629
+ else:
630
+ chars = self.stream.charsUntil(("<", "-", "\u0000"))
631
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
632
+ data + chars})
633
+ return True
634
+
635
+ def scriptDataEscapedDashState(self):
636
+ data = self.stream.char()
637
+ if data == "-":
638
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
639
+ self.state = self.scriptDataEscapedDashDashState
640
+ elif data == "<":
641
+ self.state = self.scriptDataEscapedLessThanSignState
642
+ elif data == "\u0000":
643
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
644
+ "data": "invalid-codepoint"})
645
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
646
+ "data": "\uFFFD"})
647
+ self.state = self.scriptDataEscapedState
648
+ elif data == EOF:
649
+ self.state = self.dataState
650
+ else:
651
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
652
+ self.state = self.scriptDataEscapedState
653
+ return True
654
+
655
+ def scriptDataEscapedDashDashState(self):
656
+ data = self.stream.char()
657
+ if data == "-":
658
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
659
+ elif data == "<":
660
+ self.state = self.scriptDataEscapedLessThanSignState
661
+ elif data == ">":
662
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
663
+ self.state = self.scriptDataState
664
+ elif data == "\u0000":
665
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
666
+ "data": "invalid-codepoint"})
667
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
668
+ "data": "\uFFFD"})
669
+ self.state = self.scriptDataEscapedState
670
+ elif data == EOF:
671
+ self.state = self.dataState
672
+ else:
673
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
674
+ self.state = self.scriptDataEscapedState
675
+ return True
676
+
677
    def scriptDataEscapedLessThanSignState(self):
        """Tokenizer state: 'script data escaped less-than sign'.

        Decides whether "<" begins an end tag (</), a possible
        double-escape start (<s...), or is just character data.
        """
        data = self.stream.char()
        if data == "/":
            # "</" — may be the start of an appropriate end tag.
            self.temporaryBuffer = ""
            self.state = self.scriptDataEscapedEndTagOpenState
        elif data in asciiLetters:
            # "<x" — candidate for a nested <script> (double escaping).
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data})
            self.temporaryBuffer = data
            self.state = self.scriptDataDoubleEscapeStartState
        else:
            # Plain "<"; reprocess the current character in the escaped state.
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.stream.unget(data)
            self.state = self.scriptDataEscapedState
        return True
691
+
692
    def scriptDataEscapedEndTagOpenState(self):
        """Tokenizer state: 'script data escaped end tag open'.

        After "</" in escaped script data: a letter starts collecting a
        potential end-tag name; anything else emits "</" as characters.
        """
        data = self.stream.char()
        if data in asciiLetters:
            self.temporaryBuffer = data
            self.state = self.scriptDataEscapedEndTagNameState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
            self.stream.unget(data)
            self.state = self.scriptDataEscapedState
        return True
702
+
703
    def scriptDataEscapedEndTagNameState(self):
        """Tokenizer state: 'script data escaped end tag name'.

        Accumulates an end-tag name after "</" in escaped script data. The
        buffered name only becomes a real EndTag token if it is
        "appropriate", i.e. it matches the last emitted start tag's name
        (case-insensitive); otherwise the text is emitted as characters.
        """
        # Compare against the current (last emitted start) tag name.
        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
        data = self.stream.char()
        if data in spaceCharacters and appropriate:
            # "</name " — start an end tag and move to attribute parsing.
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.beforeAttributeNameState
        elif data == "/" and appropriate:
            # "</name/" — self-closing-start-tag handling.
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.selfClosingStartTagState
        elif data == ">" and appropriate:
            # "</name>" — emit the end tag and return to the data state.
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.emitCurrentToken()
            self.state = self.dataState
        elif data in asciiLetters:
            # Keep collecting the candidate tag name.
            self.temporaryBuffer += data
        else:
            # Not an appropriate end tag: flush "</" + buffer as characters
            # and reprocess the current character.
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "</" + self.temporaryBuffer})
            self.stream.unget(data)
            self.state = self.scriptDataEscapedState
        return True
730
+
731
    def scriptDataDoubleEscapeStartState(self):
        """Tokenizer state: 'script data double escape start'.

        Checks whether the tag name buffered after "<" spells "script";
        if so the tokenizer enters the double-escaped mode (a <script>
        nested inside a comment inside <script>).
        """
        data = self.stream.char()
        if data in (spaceCharacters | frozenset(("/", ">"))):
            # Name delimiter reached: decide based on the buffered name.
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            if self.temporaryBuffer.lower() == "script":
                self.state = self.scriptDataDoubleEscapedState
            else:
                self.state = self.scriptDataEscapedState
        elif data in asciiLetters:
            # Letters are both emitted as characters and buffered.
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.temporaryBuffer += data
        else:
            self.stream.unget(data)
            self.state = self.scriptDataEscapedState
        return True
746
+
747
    def scriptDataDoubleEscapedState(self):
        """Tokenizer state: 'script data double escaped'.

        Inside a nested <script> within an escaped section; everything is
        emitted as character data, with special handling for "-", "<",
        NUL, and EOF.
        """
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataDoubleEscapedDashState
        elif data == "<":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.state = self.scriptDataDoubleEscapedLessThanSignState
        elif data == "\u0000":
            # NUL: parse error, replaced by U+FFFD; stay in this state.
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
        elif data == EOF:
            # EOF inside nested script is a parse error.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-script-in-script"})
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
        return True
767
+
768
    def scriptDataDoubleEscapedDashState(self):
        """Tokenizer state: 'script data double escaped dash'.

        Entered after a single "-" in double-escaped script data; a second
        dash advances toward a possible "-->" terminator.
        """
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataDoubleEscapedDashDashState
        elif data == "<":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.state = self.scriptDataDoubleEscapedLessThanSignState
        elif data == "\u0000":
            # NUL: parse error, replaced by U+FFFD.
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
            self.state = self.scriptDataDoubleEscapedState
        elif data == EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-script-in-script"})
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.state = self.scriptDataDoubleEscapedState
        return True
790
+
791
    def scriptDataDoubleEscapedDashDashState(self):
        """Tokenizer state: 'script data double escaped dash dash'.

        After "--" in double-escaped script data; ">" here ("-->") ends
        the comment and returns to plain script data.
        """
        data = self.stream.char()
        if data == "-":
            # Further dashes keep us in this state.
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
        elif data == "<":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.state = self.scriptDataDoubleEscapedLessThanSignState
        elif data == ">":
            # "-->" — comment closed; back to the script data state.
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
            self.state = self.scriptDataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
            self.state = self.scriptDataDoubleEscapedState
        elif data == EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-script-in-script"})
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.state = self.scriptDataDoubleEscapedState
        return True
815
+
816
    def scriptDataDoubleEscapedLessThanSignState(self):
        """Tokenizer state: 'script data double escaped less-than sign'.

        Only "</" is significant here — it may close the nested <script>;
        anything else is reprocessed as double-escaped data.
        """
        data = self.stream.char()
        if data == "/":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"})
            self.temporaryBuffer = ""
            self.state = self.scriptDataDoubleEscapeEndState
        else:
            self.stream.unget(data)
            self.state = self.scriptDataDoubleEscapedState
        return True
826
+
827
    def scriptDataDoubleEscapeEndState(self):
        """Tokenizer state: 'script data double escape end'.

        Mirror of the double-escape-start state: if the buffered end-tag
        name spells "script", the nested script is closed and we drop back
        to the (singly) escaped state.
        """
        data = self.stream.char()
        if data in (spaceCharacters | frozenset(("/", ">"))):
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            if self.temporaryBuffer.lower() == "script":
                self.state = self.scriptDataEscapedState
            else:
                self.state = self.scriptDataDoubleEscapedState
        elif data in asciiLetters:
            # Letters are emitted as characters and buffered for matching.
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.temporaryBuffer += data
        else:
            self.stream.unget(data)
            self.state = self.scriptDataDoubleEscapedState
        return True
842
+
843
    def beforeAttributeNameState(self):
        """Tokenizer state: 'before attribute name'.

        Skips whitespace inside a tag and starts a new attribute (stored
        on the current token as a [name, value] pair) when a name
        character arrives.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            # Consume the whole run of whitespace in one call.
            self.stream.charsUntil(spaceCharacters, True)
        elif data in asciiLetters:
            # Begin a new attribute with an empty value.
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data == ">":
            self.emitCurrentToken()
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data in ("'", '"', "=", "<"):
            # Invalid name-start character: report but still start the
            # attribute with it, per the spec's error recovery.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "invalid-character-in-attribute-name"})
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"].append(["\uFFFD", ""])
            self.state = self.attributeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-attribute-name-but-got-eof"})
            self.state = self.dataState
        else:
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        return True
872
+
873
    def attributeNameState(self):
        """Tokenizer state: 'attribute name'.

        Accumulates the current attribute's name. On leaving the state the
        name is lowercased and checked against earlier attributes so a
        duplicate-attribute parse error can be reported promptly.
        """
        data = self.stream.char()
        leavingThisState = True  # most branches transition away
        emitToken = False
        if data == "=":
            self.state = self.beforeAttributeValueState
        elif data in asciiLetters:
            # Fast path: grab the whole run of letters at once.
            self.currentToken["data"][-1][0] += data +\
                self.stream.charsUntil(asciiLetters, True)
            leavingThisState = False
        elif data == ">":
            # XXX If we emit here the attributes are converted to a dict
            # without being checked and when the code below runs we error
            # because data is a dict not a list
            emitToken = True
        elif data in spaceCharacters:
            self.state = self.afterAttributeNameState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][0] += "\uFFFD"
            leavingThisState = False
        elif data in ("'", '"', "<"):
            # Invalid in a name but still appended (spec error recovery).
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data":
                                    "invalid-character-in-attribute-name"})
            self.currentToken["data"][-1][0] += data
            leavingThisState = False
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-attribute-name"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][0] += data
            leavingThisState = False

        if leavingThisState:
            # Attributes are not dropped at this stage. That happens when the
            # start tag token is emitted so values can still be safely appended
            # to attributes, but we do want to report the parse error in time.
            self.currentToken["data"][-1][0] = (
                self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
            for name, _ in self.currentToken["data"][:-1]:
                if self.currentToken["data"][-1][0] == name:
                    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                            "duplicate-attribute"})
                    break
            # XXX Fix for above XXX
            if emitToken:
                self.emitCurrentToken()
        return True
926
+
927
    def afterAttributeNameState(self):
        """Tokenizer state: 'after attribute name'.

        After a completed attribute name plus whitespace: "=" introduces a
        value, another name character starts a new attribute, ">" emits
        the tag.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            self.stream.charsUntil(spaceCharacters, True)
        elif data == "=":
            self.state = self.beforeAttributeValueState
        elif data == ">":
            self.emitCurrentToken()
        elif data in asciiLetters:
            # A new attribute begins (the previous one has no value).
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"].append(["\uFFFD", ""])
            self.state = self.attributeNameState
        elif data in ("'", '"', "<"):
            # Invalid here, but starts a new attribute per error recovery.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "invalid-character-after-attribute-name"})
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-end-of-tag-but-got-eof"})
            self.state = self.dataState
        else:
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        return True
958
+
959
    def beforeAttributeValueState(self):
        """Tokenizer state: 'before attribute value'.

        After "name=": dispatches on the quoting style of the value
        (double-quoted, single-quoted, or unquoted).
        """
        data = self.stream.char()
        if data in spaceCharacters:
            self.stream.charsUntil(spaceCharacters, True)
        elif data == "\"":
            self.state = self.attributeValueDoubleQuotedState
        elif data == "&":
            # "&" belongs to an unquoted value; reprocess it there so the
            # entity handling runs in that state.
            self.state = self.attributeValueUnQuotedState
            self.stream.unget(data)
        elif data == "'":
            self.state = self.attributeValueSingleQuotedState
        elif data == ">":
            # "name=>" — missing value; emit the tag anyway.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-attribute-value-but-got-right-bracket"})
            self.emitCurrentToken()
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][1] += "\uFFFD"
            self.state = self.attributeValueUnQuotedState
        elif data in ("=", "<", "`"):
            # Invalid value-start characters; kept in the value per spec.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "equals-in-unquoted-attribute-value"})
            self.currentToken["data"][-1][1] += data
            self.state = self.attributeValueUnQuotedState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-attribute-value-but-got-eof"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][1] += data
            self.state = self.attributeValueUnQuotedState
        return True
992
+
993
    def attributeValueDoubleQuotedState(self):
        """Tokenizer state: 'attribute value (double-quoted)'.

        Accumulates the value until the closing double quote, expanding
        character references and replacing NUL with U+FFFD.
        """
        data = self.stream.char()
        if data == "\"":
            self.state = self.afterAttributeValueState
        elif data == "&":
            # Expand the entity with '"' as the allowed additional char.
            self.processEntityInAttribute('"')
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][1] += "\uFFFD"
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-attribute-value-double-quote"})
            self.state = self.dataState
        else:
            # Bulk-consume ordinary characters up to a special one.
            self.currentToken["data"][-1][1] += data +\
                self.stream.charsUntil(("\"", "&", "\u0000"))
        return True
1011
+
1012
    def attributeValueSingleQuotedState(self):
        """Tokenizer state: 'attribute value (single-quoted)'.

        Same as the double-quoted state, with "'" as the terminator.
        """
        data = self.stream.char()
        if data == "'":
            self.state = self.afterAttributeValueState
        elif data == "&":
            self.processEntityInAttribute("'")
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][1] += "\uFFFD"
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-attribute-value-single-quote"})
            self.state = self.dataState
        else:
            # Bulk-consume ordinary characters up to a special one.
            self.currentToken["data"][-1][1] += data +\
                self.stream.charsUntil(("'", "&", "\u0000"))
        return True
1030
+
1031
    def attributeValueUnQuotedState(self):
        """Tokenizer state: 'attribute value (unquoted)'.

        Accumulates an unquoted value; whitespace ends the attribute and
        ">" ends the whole tag.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeAttributeNameState
        elif data == "&":
            # ">" is the additional allowed character for unquoted values.
            self.processEntityInAttribute(">")
        elif data == ">":
            self.emitCurrentToken()
        elif data in ('"', "'", "=", "<", "`"):
            # Discouraged characters in unquoted values: error, but keep them.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-character-in-unquoted-attribute-value"})
            self.currentToken["data"][-1][1] += data
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][1] += "\uFFFD"
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-attribute-value-no-quotes"})
            self.state = self.dataState
        else:
            # Bulk-consume up to any character with special meaning here.
            self.currentToken["data"][-1][1] += data + self.stream.charsUntil(
                frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters)
        return True
1055
+
1056
    def afterAttributeValueState(self):
        """Tokenizer state: 'after attribute value (quoted)'.

        Immediately after a closing quote: only whitespace, ">", or "/"
        are valid; anything else is an error and is reprocessed.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeAttributeNameState
        elif data == ">":
            self.emitCurrentToken()
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-EOF-after-attribute-value"})
            self.stream.unget(data)
            self.state = self.dataState
        else:
            # Reprocess the unexpected character as a new attribute start.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-character-after-attribute-value"})
            self.stream.unget(data)
            self.state = self.beforeAttributeNameState
        return True
1075
+
1076
    def selfClosingStartTagState(self):
        """Tokenizer state: 'self-closing start tag'.

        After "/" inside a tag: ">" marks the token self-closing and emits
        it; anything else is an error and is reprocessed as an attribute.
        """
        data = self.stream.char()
        if data == ">":
            self.currentToken["selfClosing"] = True
            self.emitCurrentToken()
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data":
                                    "unexpected-EOF-after-solidus-in-tag"})
            self.stream.unget(data)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-character-after-solidus-in-tag"})
            self.stream.unget(data)
            self.state = self.beforeAttributeNameState
        return True
1093
+
1094
    def bogusCommentState(self):
        """Tokenizer state: 'bogus comment'.

        Error recovery for malformed markup declarations: everything up to
        the next ">" (or EOF) becomes a Comment token.
        """
        # Make a new comment token and give it as value all the characters
        # until the first > or EOF (charsUntil checks for EOF automatically)
        # and emit it.
        data = self.stream.charsUntil(">")
        data = data.replace("\u0000", "\uFFFD")
        self.tokenQueue.append(
            {"type": tokenTypes["Comment"], "data": data})

        # Eat the character directly after the bogus comment which is either a
        # ">" or an EOF.
        self.stream.char()
        self.state = self.dataState
        return True
1108
+
1109
    def markupDeclarationOpenState(self):
        """Tokenizer state: 'markup declaration open'.

        After "<!": recognizes "--" (comment), a case-insensitive
        "DOCTYPE", or "[CDATA[" (only in foreign content); anything else
        falls through to the bogus-comment error recovery, ungetting every
        character read so far.
        """
        charStack = [self.stream.char()]
        if charStack[-1] == "-":
            charStack.append(self.stream.char())
            if charStack[-1] == "-":
                # "<!--" — start a comment token.
                self.currentToken = {"type": tokenTypes["Comment"], "data": ""}
                self.state = self.commentStartState
                return True
        elif charStack[-1] in ('d', 'D'):
            # Try to match the remaining letters of "DOCTYPE" case-insensitively.
            matched = True
            for expected in (('o', 'O'), ('c', 'C'), ('t', 'T'),
                             ('y', 'Y'), ('p', 'P'), ('e', 'E')):
                charStack.append(self.stream.char())
                if charStack[-1] not in expected:
                    matched = False
                    break
            if matched:
                self.currentToken = {"type": tokenTypes["Doctype"],
                                     "name": "",
                                     "publicId": None, "systemId": None,
                                     "correct": True}
                self.state = self.doctypeState
                return True
        elif (charStack[-1] == "[" and
              self.parser is not None and
              self.parser.tree.openElements and
              self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
            # "<![CDATA[" is only valid inside foreign (SVG/MathML) content;
            # the match here is case-sensitive.
            matched = True
            for expected in ["C", "D", "A", "T", "A", "["]:
                charStack.append(self.stream.char())
                if charStack[-1] != expected:
                    matched = False
                    break
            if matched:
                self.state = self.cdataSectionState
                return True

        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-dashes-or-doctype"})

        # Push back everything consumed so the bogus-comment state sees it.
        while charStack:
            self.stream.unget(charStack.pop())
        self.state = self.bogusCommentState
        return True
1153
+
1154
    def commentStartState(self):
        """Tokenizer state: 'comment start'.

        First character after "<!--": a dash may begin the terminator, an
        immediate ">" is the degenerate comment "<!-->".
        """
        data = self.stream.char()
        if data == "-":
            self.state = self.commentStartDashState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "\uFFFD"
        elif data == ">":
            # "<!-->" — abruptly closed comment; emit it empty.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "incorrect-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += data
            self.state = self.commentState
        return True
1176
+
1177
    def commentStartDashState(self):
        """Tokenizer state: 'comment start dash'.

        After "<!---": a second dash jumps to comment-end handling; other
        input re-adds the pending "-" to the comment data.
        """
        data = self.stream.char()
        if data == "-":
            self.state = self.commentEndState
        elif data == "\u0000":
            # The pending dash is flushed along with the U+FFFD replacement.
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "-\uFFFD"
        elif data == ">":
            # "<!--->" — abruptly closed comment.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "incorrect-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += "-" + data
            self.state = self.commentState
        return True
1199
+
1200
    def commentState(self):
        """Tokenizer state: 'comment'.

        Accumulates comment data until a "-" (possible terminator), NUL,
        or EOF.
        """
        data = self.stream.char()
        if data == "-":
            self.state = self.commentEndDashState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "\uFFFD"
        elif data is EOF:
            # Unterminated comment: emit what we have.
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            # Bulk-consume ordinary characters up to "-" or NUL.
            self.currentToken["data"] += data + \
                self.stream.charsUntil(("-", "\u0000"))
        return True
1217
+
1218
    def commentEndDashState(self):
        """Tokenizer state: 'comment end dash'.

        One "-" seen inside a comment: a second one may start "-->";
        otherwise the pending dash is folded back into the data.
        """
        data = self.stream.char()
        if data == "-":
            self.state = self.commentEndState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "-\uFFFD"
            self.state = self.commentState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment-end-dash"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += "-" + data
            self.state = self.commentState
        return True
1236
+
1237
    def commentEndState(self):
        """Tokenizer state: 'comment end'.

        "--" seen: ">" finishes the comment; other characters are parse
        errors with spec-defined recovery (the pending "--" is folded back
        into the data where applicable).
        """
        data = self.stream.char()
        if data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "--\uFFFD"
            self.state = self.commentState
        elif data == "!":
            # "--!" — handled by a dedicated state per the spec.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-bang-after-double-dash-in-comment"})
            self.state = self.commentEndBangState
        elif data == "-":
            # "---": keep one dash pending; emit this one into the data.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-dash-after-double-dash-in-comment"})
            self.currentToken["data"] += data
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment-double-dash"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            # XXX
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-comment"})
            self.currentToken["data"] += "--" + data
            self.state = self.commentState
        return True
1267
+
1268
    def commentEndBangState(self):
        """Tokenizer state: 'comment end bang'.

        After "--!": ">" closes the comment; anything else restores the
        literal "--!" into the comment data and continues.
        """
        data = self.stream.char()
        if data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "-":
            self.currentToken["data"] += "--!"
            self.state = self.commentEndDashState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "--!\uFFFD"
            self.state = self.commentState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment-end-bang-state"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += "--!" + data
            self.state = self.commentState
        return True
1290
+
1291
    def doctypeState(self):
        """Tokenizer state: 'DOCTYPE'.

        Immediately after the "DOCTYPE" keyword: a space is required
        before the name; anything else is an error but is reprocessed.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeDoctypeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-doctype-name-but-got-eof"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            # Missing space: report, then let the next state handle `data`.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "need-space-after-doctype"})
            self.stream.unget(data)
            self.state = self.beforeDoctypeNameState
        return True
1307
+
1308
    def beforeDoctypeNameState(self):
        """Tokenizer state: 'before DOCTYPE name'.

        Skips whitespace, then starts collecting the doctype name; ">" or
        EOF here produce a doctype flagged as not "correct" (triggers
        quirks mode downstream).
        """
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-doctype-name-but-got-right-bracket"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["name"] = "\uFFFD"
            self.state = self.doctypeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-doctype-name-but-got-eof"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["name"] = data
            self.state = self.doctypeNameState
        return True
1333
+
1334
    def doctypeNameState(self):
        """Tokenizer state: 'DOCTYPE name'.

        Accumulates the doctype name; it is ASCII-lowercased whenever the
        name is completed (whitespace, ">", or EOF).
        """
        data = self.stream.char()
        if data in spaceCharacters:
            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
            self.state = self.afterDoctypeNameState
        elif data == ">":
            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["name"] += "\uFFFD"
            self.state = self.doctypeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype-name"})
            self.currentToken["correct"] = False
            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["name"] += data
        return True
1358
+
1359
    def afterDoctypeNameState(self):
        """Tokenizer state: 'after DOCTYPE name'.

        Looks for a case-insensitive "PUBLIC" or "SYSTEM" keyword after
        the doctype name; anything else makes the doctype bogus.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.currentToken["correct"] = False
            self.stream.unget(data)
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            if data in ("p", "P"):
                # Try to match the rest of "PUBLIC" case-insensitively.
                matched = True
                for expected in (("u", "U"), ("b", "B"), ("l", "L"),
                                 ("i", "I"), ("c", "C")):
                    data = self.stream.char()
                    if data not in expected:
                        matched = False
                        break
                if matched:
                    self.state = self.afterDoctypePublicKeywordState
                    return True
            elif data in ("s", "S"):
                # Try to match the rest of "SYSTEM" case-insensitively.
                matched = True
                for expected in (("y", "Y"), ("s", "S"), ("t", "T"),
                                 ("e", "E"), ("m", "M")):
                    data = self.stream.char()
                    if data not in expected:
                        matched = False
                        break
                if matched:
                    self.state = self.afterDoctypeSystemKeywordState
                    return True

            # All the characters read before the current 'data' will be
            # [a-zA-Z], so they're garbage in the bogus doctype and can be
            # discarded; only the latest character might be '>' or EOF
            # and needs to be ungetted
            self.stream.unget(data)
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-space-or-right-bracket-in-doctype", "datavars":
                                    {"data": data}})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState

        return True
1409
+
1410
    def afterDoctypePublicKeywordState(self):
        """Tokenizer state: 'after DOCTYPE public keyword'.

        A space is expected before the public identifier; a quote here is
        an error but the quote is reprocessed so the identifier still
        parses.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeDoctypePublicIdentifierState
        elif data in ("'", '"'):
            # Missing space before the quoted identifier.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.stream.unget(data)
            self.state = self.beforeDoctypePublicIdentifierState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.stream.unget(data)
            self.state = self.beforeDoctypePublicIdentifierState
        return True
1429
+
1430
    def beforeDoctypePublicIdentifierState(self):
        """Tokenizer state: 'before DOCTYPE public identifier'.

        Expects the opening quote of the public identifier; unquoted input
        makes the doctype bogus.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == "\"":
            self.currentToken["publicId"] = ""
            self.state = self.doctypePublicIdentifierDoubleQuotedState
        elif data == "'":
            self.currentToken["publicId"] = ""
            self.state = self.doctypePublicIdentifierSingleQuotedState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState
        return True
1458
+
1459
    def doctypePublicIdentifierDoubleQuotedState(self):
        """Tokenizer state: 'DOCTYPE public identifier (double-quoted)'.

        Accumulates the public identifier until the closing double quote;
        ">" or EOF terminate the doctype early as incorrect.
        """
        data = self.stream.char()
        if data == "\"":
            self.state = self.afterDoctypePublicIdentifierState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["publicId"] += "\uFFFD"
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["publicId"] += data
        return True
1482
+
1483
    def doctypePublicIdentifierSingleQuotedState(self):
        """Tokenizer state: 'DOCTYPE public identifier (single-quoted)'.

        Same as the double-quoted variant, with "'" as the terminator.
        """
        data = self.stream.char()
        if data == "'":
            self.state = self.afterDoctypePublicIdentifierState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["publicId"] += "\uFFFD"
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["publicId"] += data
        return True
1506
+
1507
+ def afterDoctypePublicIdentifierState(self):
1508
+ data = self.stream.char()
1509
+ if data in spaceCharacters:
1510
+ self.state = self.betweenDoctypePublicAndSystemIdentifiersState
1511
+ elif data == ">":
1512
+ self.tokenQueue.append(self.currentToken)
1513
+ self.state = self.dataState
1514
+ elif data == '"':
1515
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1516
+ "unexpected-char-in-doctype"})
1517
+ self.currentToken["systemId"] = ""
1518
+ self.state = self.doctypeSystemIdentifierDoubleQuotedState
1519
+ elif data == "'":
1520
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1521
+ "unexpected-char-in-doctype"})
1522
+ self.currentToken["systemId"] = ""
1523
+ self.state = self.doctypeSystemIdentifierSingleQuotedState
1524
+ elif data is EOF:
1525
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1526
+ "eof-in-doctype"})
1527
+ self.currentToken["correct"] = False
1528
+ self.tokenQueue.append(self.currentToken)
1529
+ self.state = self.dataState
1530
+ else:
1531
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1532
+ "unexpected-char-in-doctype"})
1533
+ self.currentToken["correct"] = False
1534
+ self.state = self.bogusDoctypeState
1535
+ return True
1536
+
1537
+ def betweenDoctypePublicAndSystemIdentifiersState(self):
1538
+ data = self.stream.char()
1539
+ if data in spaceCharacters:
1540
+ pass
1541
+ elif data == ">":
1542
+ self.tokenQueue.append(self.currentToken)
1543
+ self.state = self.dataState
1544
+ elif data == '"':
1545
+ self.currentToken["systemId"] = ""
1546
+ self.state = self.doctypeSystemIdentifierDoubleQuotedState
1547
+ elif data == "'":
1548
+ self.currentToken["systemId"] = ""
1549
+ self.state = self.doctypeSystemIdentifierSingleQuotedState
1550
+ elif data == EOF:
1551
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1552
+ "eof-in-doctype"})
1553
+ self.currentToken["correct"] = False
1554
+ self.tokenQueue.append(self.currentToken)
1555
+ self.state = self.dataState
1556
+ else:
1557
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1558
+ "unexpected-char-in-doctype"})
1559
+ self.currentToken["correct"] = False
1560
+ self.state = self.bogusDoctypeState
1561
+ return True
1562
+
1563
+ def afterDoctypeSystemKeywordState(self):
1564
+ data = self.stream.char()
1565
+ if data in spaceCharacters:
1566
+ self.state = self.beforeDoctypeSystemIdentifierState
1567
+ elif data in ("'", '"'):
1568
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1569
+ "unexpected-char-in-doctype"})
1570
+ self.stream.unget(data)
1571
+ self.state = self.beforeDoctypeSystemIdentifierState
1572
+ elif data is EOF:
1573
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1574
+ "eof-in-doctype"})
1575
+ self.currentToken["correct"] = False
1576
+ self.tokenQueue.append(self.currentToken)
1577
+ self.state = self.dataState
1578
+ else:
1579
+ self.stream.unget(data)
1580
+ self.state = self.beforeDoctypeSystemIdentifierState
1581
+ return True
1582
+
1583
+ def beforeDoctypeSystemIdentifierState(self):
1584
+ data = self.stream.char()
1585
+ if data in spaceCharacters:
1586
+ pass
1587
+ elif data == "\"":
1588
+ self.currentToken["systemId"] = ""
1589
+ self.state = self.doctypeSystemIdentifierDoubleQuotedState
1590
+ elif data == "'":
1591
+ self.currentToken["systemId"] = ""
1592
+ self.state = self.doctypeSystemIdentifierSingleQuotedState
1593
+ elif data == ">":
1594
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1595
+ "unexpected-char-in-doctype"})
1596
+ self.currentToken["correct"] = False
1597
+ self.tokenQueue.append(self.currentToken)
1598
+ self.state = self.dataState
1599
+ elif data is EOF:
1600
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1601
+ "eof-in-doctype"})
1602
+ self.currentToken["correct"] = False
1603
+ self.tokenQueue.append(self.currentToken)
1604
+ self.state = self.dataState
1605
+ else:
1606
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1607
+ "unexpected-char-in-doctype"})
1608
+ self.currentToken["correct"] = False
1609
+ self.state = self.bogusDoctypeState
1610
+ return True
1611
+
1612
+ def doctypeSystemIdentifierDoubleQuotedState(self):
1613
+ data = self.stream.char()
1614
+ if data == "\"":
1615
+ self.state = self.afterDoctypeSystemIdentifierState
1616
+ elif data == "\u0000":
1617
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
1618
+ "data": "invalid-codepoint"})
1619
+ self.currentToken["systemId"] += "\uFFFD"
1620
+ elif data == ">":
1621
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1622
+ "unexpected-end-of-doctype"})
1623
+ self.currentToken["correct"] = False
1624
+ self.tokenQueue.append(self.currentToken)
1625
+ self.state = self.dataState
1626
+ elif data is EOF:
1627
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1628
+ "eof-in-doctype"})
1629
+ self.currentToken["correct"] = False
1630
+ self.tokenQueue.append(self.currentToken)
1631
+ self.state = self.dataState
1632
+ else:
1633
+ self.currentToken["systemId"] += data
1634
+ return True
1635
+
1636
+ def doctypeSystemIdentifierSingleQuotedState(self):
1637
+ data = self.stream.char()
1638
+ if data == "'":
1639
+ self.state = self.afterDoctypeSystemIdentifierState
1640
+ elif data == "\u0000":
1641
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
1642
+ "data": "invalid-codepoint"})
1643
+ self.currentToken["systemId"] += "\uFFFD"
1644
+ elif data == ">":
1645
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1646
+ "unexpected-end-of-doctype"})
1647
+ self.currentToken["correct"] = False
1648
+ self.tokenQueue.append(self.currentToken)
1649
+ self.state = self.dataState
1650
+ elif data is EOF:
1651
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1652
+ "eof-in-doctype"})
1653
+ self.currentToken["correct"] = False
1654
+ self.tokenQueue.append(self.currentToken)
1655
+ self.state = self.dataState
1656
+ else:
1657
+ self.currentToken["systemId"] += data
1658
+ return True
1659
+
1660
+ def afterDoctypeSystemIdentifierState(self):
1661
+ data = self.stream.char()
1662
+ if data in spaceCharacters:
1663
+ pass
1664
+ elif data == ">":
1665
+ self.tokenQueue.append(self.currentToken)
1666
+ self.state = self.dataState
1667
+ elif data is EOF:
1668
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1669
+ "eof-in-doctype"})
1670
+ self.currentToken["correct"] = False
1671
+ self.tokenQueue.append(self.currentToken)
1672
+ self.state = self.dataState
1673
+ else:
1674
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1675
+ "unexpected-char-in-doctype"})
1676
+ self.state = self.bogusDoctypeState
1677
+ return True
1678
+
1679
+ def bogusDoctypeState(self):
1680
+ data = self.stream.char()
1681
+ if data == ">":
1682
+ self.tokenQueue.append(self.currentToken)
1683
+ self.state = self.dataState
1684
+ elif data is EOF:
1685
+ # XXX EMIT
1686
+ self.stream.unget(data)
1687
+ self.tokenQueue.append(self.currentToken)
1688
+ self.state = self.dataState
1689
+ else:
1690
+ pass
1691
+ return True
1692
+
1693
+ def cdataSectionState(self):
1694
+ data = []
1695
+ while True:
1696
+ data.append(self.stream.charsUntil("]"))
1697
+ data.append(self.stream.charsUntil(">"))
1698
+ char = self.stream.char()
1699
+ if char == EOF:
1700
+ break
1701
+ else:
1702
+ assert char == ">"
1703
+ if data[-1][-2:] == "]]":
1704
+ data[-1] = data[-1][:-2]
1705
+ break
1706
+ else:
1707
+ data.append(char)
1708
+
1709
+ data = "".join(data) # pylint:disable=redefined-variable-type
1710
+ # Deal with null here rather than in the parser
1711
+ nullCount = data.count("\u0000")
1712
+ if nullCount > 0:
1713
+ for _ in range(nullCount):
1714
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
1715
+ "data": "invalid-codepoint"})
1716
+ data = data.replace("\u0000", "\uFFFD")
1717
+ if data:
1718
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
1719
+ "data": data})
1720
+ self.state = self.dataState
1721
+ return True