feedtools 0.2.26 → 0.2.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. data/CHANGELOG +232 -216
  2. data/db/migration.rb +2 -0
  3. data/db/schema.mysql.sql +2 -0
  4. data/db/schema.postgresql.sql +3 -1
  5. data/db/schema.sqlite.sql +3 -1
  6. data/lib/feed_tools.rb +37 -14
  7. data/lib/feed_tools/database_feed_cache.rb +13 -2
  8. data/lib/feed_tools/feed.rb +430 -104
  9. data/lib/feed_tools/feed_item.rb +533 -268
  10. data/lib/feed_tools/helpers/generic_helper.rb +1 -1
  11. data/lib/feed_tools/helpers/html_helper.rb +78 -116
  12. data/lib/feed_tools/helpers/retrieval_helper.rb +33 -3
  13. data/lib/feed_tools/helpers/uri_helper.rb +46 -54
  14. data/lib/feed_tools/monkey_patch.rb +27 -1
  15. data/lib/feed_tools/vendor/html5/History.txt +10 -0
  16. data/lib/feed_tools/vendor/html5/Manifest.txt +117 -0
  17. data/lib/feed_tools/vendor/html5/README +45 -0
  18. data/lib/feed_tools/vendor/html5/Rakefile.rb +33 -0
  19. data/lib/feed_tools/vendor/html5/bin/html5 +217 -0
  20. data/lib/feed_tools/vendor/html5/lib/core_ext/string.rb +17 -0
  21. data/lib/feed_tools/vendor/html5/lib/html5.rb +13 -0
  22. data/lib/feed_tools/vendor/html5/lib/html5/constants.rb +1046 -0
  23. data/lib/feed_tools/vendor/html5/lib/html5/filters/base.rb +10 -0
  24. data/lib/feed_tools/vendor/html5/lib/html5/filters/inject_meta_charset.rb +82 -0
  25. data/lib/feed_tools/vendor/html5/lib/html5/filters/iso639codes.rb +752 -0
  26. data/lib/feed_tools/vendor/html5/lib/html5/filters/optionaltags.rb +198 -0
  27. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc2046.rb +30 -0
  28. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc3987.rb +89 -0
  29. data/lib/feed_tools/vendor/html5/lib/html5/filters/sanitizer.rb +15 -0
  30. data/lib/feed_tools/vendor/html5/lib/html5/filters/validator.rb +830 -0
  31. data/lib/feed_tools/vendor/html5/lib/html5/filters/whitespace.rb +36 -0
  32. data/lib/feed_tools/vendor/html5/lib/html5/html5parser.rb +248 -0
  33. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_body_phase.rb +46 -0
  34. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  35. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_head_phase.rb +50 -0
  36. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/before_head_phase.rb +41 -0
  37. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_body_phase.rb +613 -0
  38. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_caption_phase.rb +69 -0
  39. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_cell_phase.rb +78 -0
  40. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  41. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  42. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_head_phase.rb +138 -0
  43. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_row_phase.rb +89 -0
  44. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_select_phase.rb +85 -0
  45. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_body_phase.rb +86 -0
  46. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_phase.rb +115 -0
  47. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/initial_phase.rb +133 -0
  48. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/phase.rb +154 -0
  49. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/root_element_phase.rb +41 -0
  50. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  51. data/lib/feed_tools/vendor/html5/lib/html5/inputstream.rb +648 -0
  52. data/lib/feed_tools/vendor/html5/lib/html5/liberalxmlparser.rb +158 -0
  53. data/lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb +188 -0
  54. data/lib/feed_tools/vendor/html5/lib/html5/serializer.rb +2 -0
  55. data/lib/feed_tools/vendor/html5/lib/html5/serializer/htmlserializer.rb +179 -0
  56. data/lib/feed_tools/vendor/html5/lib/html5/serializer/xhtmlserializer.rb +20 -0
  57. data/lib/feed_tools/vendor/html5/lib/html5/sniffer.rb +45 -0
  58. data/lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb +966 -0
  59. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders.rb +24 -0
  60. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/base.rb +334 -0
  61. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/hpricot.rb +231 -0
  62. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/rexml.rb +209 -0
  63. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/simpletree.rb +185 -0
  64. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers.rb +26 -0
  65. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/base.rb +162 -0
  66. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/hpricot.rb +48 -0
  67. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/rexml.rb +48 -0
  68. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/simpletree.rb +48 -0
  69. data/lib/feed_tools/vendor/html5/lib/html5/version.rb +3 -0
  70. data/lib/feed_tools/vendor/html5/testdata/encoding/chardet/test_big5.txt +51 -0
  71. data/lib/feed_tools/vendor/html5/testdata/encoding/test-yahoo-jp.dat +10 -0
  72. data/lib/feed_tools/vendor/html5/testdata/encoding/tests1.dat +394 -0
  73. data/lib/feed_tools/vendor/html5/testdata/encoding/tests2.dat +81 -0
  74. data/lib/feed_tools/vendor/html5/testdata/sanitizer/tests1.dat +416 -0
  75. data/lib/feed_tools/vendor/html5/testdata/serializer/core.test +104 -0
  76. data/lib/feed_tools/vendor/html5/testdata/serializer/injectmeta.test +65 -0
  77. data/lib/feed_tools/vendor/html5/testdata/serializer/optionaltags.test +900 -0
  78. data/lib/feed_tools/vendor/html5/testdata/serializer/options.test +60 -0
  79. data/lib/feed_tools/vendor/html5/testdata/serializer/whitespace.test +51 -0
  80. data/lib/feed_tools/vendor/html5/testdata/sites/google-results.htm +1 -0
  81. data/lib/feed_tools/vendor/html5/testdata/sites/python-ref-import.htm +1 -0
  82. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps-old.htm +1 -0
  83. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps.htm +34275 -0
  84. data/lib/feed_tools/vendor/html5/testdata/sniffer/htmlOrFeed.json +43 -0
  85. data/lib/feed_tools/vendor/html5/testdata/tokenizer/contentModelFlags.test +48 -0
  86. data/lib/feed_tools/vendor/html5/testdata/tokenizer/entities.test +2339 -0
  87. data/lib/feed_tools/vendor/html5/testdata/tokenizer/escapeFlag.test +21 -0
  88. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test1.test +172 -0
  89. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test2.test +129 -0
  90. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test3.test +367 -0
  91. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test4.test +198 -0
  92. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests1.dat +1950 -0
  93. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests2.dat +773 -0
  94. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests3.dat +270 -0
  95. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests4.dat +60 -0
  96. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests5.dat +175 -0
  97. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests6.dat +196 -0
  98. data/lib/feed_tools/vendor/html5/testdata/validator/attributes.test +1035 -0
  99. data/lib/feed_tools/vendor/html5/testdata/validator/base-href-attribute.test +787 -0
  100. data/lib/feed_tools/vendor/html5/testdata/validator/base-target-attribute.test +35 -0
  101. data/lib/feed_tools/vendor/html5/testdata/validator/blockquote-cite-attribute.test +7 -0
  102. data/lib/feed_tools/vendor/html5/testdata/validator/classattribute.test +152 -0
  103. data/lib/feed_tools/vendor/html5/testdata/validator/contenteditableattribute.test +59 -0
  104. data/lib/feed_tools/vendor/html5/testdata/validator/contextmenuattribute.test +115 -0
  105. data/lib/feed_tools/vendor/html5/testdata/validator/dirattribute.test +59 -0
  106. data/lib/feed_tools/vendor/html5/testdata/validator/draggableattribute.test +63 -0
  107. data/lib/feed_tools/vendor/html5/testdata/validator/html-xmlns-attribute.test +23 -0
  108. data/lib/feed_tools/vendor/html5/testdata/validator/idattribute.test +115 -0
  109. data/lib/feed_tools/vendor/html5/testdata/validator/inputattributes.test +2795 -0
  110. data/lib/feed_tools/vendor/html5/testdata/validator/irrelevantattribute.test +63 -0
  111. data/lib/feed_tools/vendor/html5/testdata/validator/langattribute.test +5579 -0
  112. data/lib/feed_tools/vendor/html5/testdata/validator/li-value-attribute.test +7 -0
  113. data/lib/feed_tools/vendor/html5/testdata/validator/link-href-attribute.test +7 -0
  114. data/lib/feed_tools/vendor/html5/testdata/validator/link-hreflang-attribute.test +7 -0
  115. data/lib/feed_tools/vendor/html5/testdata/validator/link-rel-attribute.test +271 -0
  116. data/lib/feed_tools/vendor/html5/testdata/validator/ol-start-attribute.test +7 -0
  117. data/lib/feed_tools/vendor/html5/testdata/validator/starttags.test +375 -0
  118. data/lib/feed_tools/vendor/html5/testdata/validator/style-scoped-attribute.test +7 -0
  119. data/lib/feed_tools/vendor/html5/testdata/validator/tabindexattribute.test +79 -0
  120. data/lib/feed_tools/vendor/html5/tests/preamble.rb +72 -0
  121. data/lib/feed_tools/vendor/html5/tests/test_encoding.rb +35 -0
  122. data/lib/feed_tools/vendor/html5/tests/test_lxp.rb +279 -0
  123. data/lib/feed_tools/vendor/html5/tests/test_parser.rb +68 -0
  124. data/lib/feed_tools/vendor/html5/tests/test_sanitizer.rb +142 -0
  125. data/lib/feed_tools/vendor/html5/tests/test_serializer.rb +68 -0
  126. data/lib/feed_tools/vendor/html5/tests/test_sniffer.rb +27 -0
  127. data/lib/feed_tools/vendor/html5/tests/test_stream.rb +62 -0
  128. data/lib/feed_tools/vendor/html5/tests/test_tokenizer.rb +94 -0
  129. data/lib/feed_tools/vendor/html5/tests/test_treewalkers.rb +135 -0
  130. data/lib/feed_tools/vendor/html5/tests/test_validator.rb +31 -0
  131. data/lib/feed_tools/vendor/html5/tests/tokenizer_test_parser.rb +63 -0
  132. data/lib/feed_tools/vendor/uri.rb +781 -0
  133. data/lib/feed_tools/version.rb +1 -1
  134. data/rakefile +27 -6
  135. data/test/unit/atom_test.rb +298 -210
  136. data/test/unit/helper_test.rb +7 -12
  137. data/test/unit/rdf_test.rb +51 -1
  138. data/test/unit/rss_test.rb +13 -3
  139. metadata +239 -116
  140. data/lib/feed_tools/vendor/htree.rb +0 -97
  141. data/lib/feed_tools/vendor/htree/container.rb +0 -10
  142. data/lib/feed_tools/vendor/htree/context.rb +0 -67
  143. data/lib/feed_tools/vendor/htree/display.rb +0 -27
  144. data/lib/feed_tools/vendor/htree/doc.rb +0 -149
  145. data/lib/feed_tools/vendor/htree/elem.rb +0 -262
  146. data/lib/feed_tools/vendor/htree/encoder.rb +0 -163
  147. data/lib/feed_tools/vendor/htree/equality.rb +0 -218
  148. data/lib/feed_tools/vendor/htree/extract_text.rb +0 -37
  149. data/lib/feed_tools/vendor/htree/fstr.rb +0 -33
  150. data/lib/feed_tools/vendor/htree/gencode.rb +0 -97
  151. data/lib/feed_tools/vendor/htree/htmlinfo.rb +0 -672
  152. data/lib/feed_tools/vendor/htree/inspect.rb +0 -108
  153. data/lib/feed_tools/vendor/htree/leaf.rb +0 -94
  154. data/lib/feed_tools/vendor/htree/loc.rb +0 -367
  155. data/lib/feed_tools/vendor/htree/modules.rb +0 -48
  156. data/lib/feed_tools/vendor/htree/name.rb +0 -124
  157. data/lib/feed_tools/vendor/htree/output.rb +0 -207
  158. data/lib/feed_tools/vendor/htree/parse.rb +0 -409
  159. data/lib/feed_tools/vendor/htree/raw_string.rb +0 -124
  160. data/lib/feed_tools/vendor/htree/regexp-util.rb +0 -15
  161. data/lib/feed_tools/vendor/htree/rexml.rb +0 -130
  162. data/lib/feed_tools/vendor/htree/scan.rb +0 -166
  163. data/lib/feed_tools/vendor/htree/tag.rb +0 -111
  164. data/lib/feed_tools/vendor/htree/template.rb +0 -909
  165. data/lib/feed_tools/vendor/htree/text.rb +0 -115
  166. data/lib/feed_tools/vendor/htree/traverse.rb +0 -465
@@ -0,0 +1,20 @@
1
+ require 'html5/serializer/htmlserializer'
2
+
3
+ module HTML5
4
+
5
+ class XHTMLSerializer < HTMLSerializer
6
+ DEFAULTS = {
7
+ :quote_attr_values => true,
8
+ :minimize_boolean_attributes => false,
9
+ :use_trailing_solidus => true,
10
+ :escape_lt_in_attrs => true,
11
+ :omit_optional_tags => false,
12
+ :escape_rcdata => true
13
+ }
14
+
15
+ def initialize(options={})
16
+ super(DEFAULTS.clone.update(options))
17
+ end
18
+ end
19
+
20
+ end
@@ -0,0 +1,45 @@
1
+ module HTML5
2
+ module Sniffer
3
+ # 4.7.4
4
+ def html_or_feed str
5
+ s = str[0, 512] # steps 1, 2
6
+ pos = 0
7
+
8
+ while pos < s.length
9
+ case s[pos]
10
+ when 0x09, 0x20, 0x0A, 0x0D # tab, space, LF, CR
11
+ pos += 1
12
+ when 0x3C # "<"
13
+ pos += 1
14
+ if s[pos..pos+2] == "!--" # [0x21, 0x2D, 0x2D]
15
+ pos += 3
16
+ until s[pos..pos+2] == "-->" or pos >= s.length
17
+ pos += 1
18
+ end
19
+ pos += 3
20
+ elsif s[pos] == 0x21 # "!"
21
+ pos += 1
22
+ until s[pos] == 0x3E or pos >= s.length # ">"
23
+ pos += 1
24
+ end
25
+ pos += 1
26
+ elsif s[pos] == 0x3F # "?"
27
+ until s[pos..pos+1] == "?>" or pos >= s.length # [0x3F, 0x3E]
28
+ pos += 1
29
+ end
30
+ pos += 2
31
+ elsif s[pos..pos+2] == "rss" # [0x72, 0x73, 0x73]
32
+ return "application/rss+xml"
33
+ elsif s[pos..pos+3] == "feed" # [0x66, 0x65, 0x65, 0x64]
34
+ return "application/atom+xml"
35
+ elsif s[pos..pos+6] == "rdf:RDF" # [0x72, 0x64, 0x66, 0x3A, 0x52, 0x44, 0x46]
36
+ raise NotImplementedError
37
+ end
38
+ else
39
+ break
40
+ end
41
+ end
42
+ "text/html"
43
+ end
44
+ end
45
+ end
@@ -0,0 +1,966 @@
1
+ require 'html5/constants'
2
+ require 'html5/inputstream'
3
+
4
+ module HTML5
5
+
6
+ # This class takes care of tokenizing HTML.
7
+ #
8
+ # * @current_token
9
+ # Holds the token that is currently being processed.
10
+ #
11
+ # * @state
12
+ # Holds a reference to the method to be invoked... XXX
13
+ #
14
+ # * @states
15
+ # Holds a mapping between states and methods that implement the state.
16
+ #
17
+ # * @stream
18
+ # Points to HTMLInputStream object.
19
+
20
+ class HTMLTokenizer
21
+ attr_accessor :content_model_flag, :current_token
22
+ attr_reader :stream
23
+
24
+ # XXX need to fix documentation
25
+
26
+ def initialize(stream, options = {})
27
+ @stream = HTMLInputStream.new(stream, options)
28
+
29
+ # Setup the initial tokenizer state
30
+ @content_model_flag = :PCDATA
31
+ @state = :data_state
32
+ @escapeFlag = false
33
+ @lastFourChars = []
34
+
35
+ # The current token being created
36
+ @current_token = nil
37
+
38
+ # Tokens to be processed.
39
+ @token_queue = []
40
+ @lowercase_element_name = options[:lowercase_element_name] != false
41
+ @lowercase_attr_name = options[:lowercase_attr_name] != false
42
+ end
43
+
44
+ # This is where the magic happens.
45
+ #
46
+ # We do our usually processing through the states and when we have a token
47
+ # to return we yield the token which pauses processing until the next token
48
+ # is requested.
49
+ def each
50
+ @token_queue = []
51
+ # Start processing. When EOF is reached @state will return false
52
+ # instead of true and the loop will terminate.
53
+ while send @state
54
+ yield :type => :ParseError, :data => @stream.errors.shift until @stream.errors.empty?
55
+ yield @token_queue.shift until @token_queue.empty?
56
+ end
57
+ end
58
+
59
+ # Below are various helper functions the tokenizer states use worked out.
60
+
61
+ # If the next character is a '>', convert the current_token into
62
+ # an EmptyTag
63
+
64
+ def process_solidus_in_tag
65
+
66
+ # We need to consume another character to make sure it's a ">"
67
+ data = @stream.char
68
+
69
+ if @current_token[:type] == :StartTag and data == ">"
70
+ @current_token[:type] = :EmptyTag
71
+ else
72
+ @token_queue << {:type => :ParseError, :data => "incorrectly-placed-solidus"}
73
+ end
74
+
75
+ # The character we just consumed need to be put back on the stack so it
76
+ # doesn't get lost...
77
+ @stream.unget(data)
78
+ end
79
+
80
+ # This function returns either U+FFFD or the character based on the
81
+ # decimal or hexadecimal representation. It also discards ";" if present.
82
+ # If not present @token_queue << {:type => :ParseError}" is invoked.
83
+
84
+ def consume_number_entity(isHex)
85
+
86
+ # XXX More need to be done here. For instance, #13 should prolly be
87
+ # converted to #10 so we don't get \r (#13 is \r right?) in the DOM and
88
+ # such. Thoughts on this appreciated.
89
+ allowed = DIGITS
90
+ radix = 10
91
+ if isHex
92
+ allowed = HEX_DIGITS
93
+ radix = 16
94
+ end
95
+
96
+ char_stack = []
97
+
98
+ # Consume all the characters that are in range while making sure we
99
+ # don't hit an EOF.
100
+ c = @stream.char
101
+ while allowed.include?(c) and c != :EOF
102
+ char_stack.push(c)
103
+ c = @stream.char
104
+ end
105
+
106
+ # Convert the set of characters consumed to an int.
107
+ charAsInt = char_stack.join('').to_i(radix)
108
+
109
+ if charAsInt == 13
110
+ @token_queue << {:type => :ParseError, :data => "incorrect-cr-newline-entity"}
111
+ charAsInt = 10
112
+ elsif (128..159).include? charAsInt
113
+ # If the integer is between 127 and 160 (so 128 and bigger and 159
114
+ # and smaller) we need to do the "windows trick".
115
+ @token_queue << {:type => :ParseError, :data => "illegal-windows-1252-entity"}
116
+
117
+ charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128]
118
+ end
119
+
120
+ if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343)
121
+ char = [charAsInt].pack('U')
122
+ else
123
+ char = [0xFFFD].pack('U')
124
+ @token_queue << {:type => :ParseError, :data => "cant-convert-numeric-entity", :datavars => {"charAsInt" => charAsInt}}
125
+ end
126
+
127
+ # Discard the ; if present. Otherwise, put it back on the queue and
128
+ # invoke parse_error on parser.
129
+ if c != ";"
130
+ @token_queue << {:type => :ParseError, :data => "numeric-entity-without-semicolon"}
131
+ @stream.unget(c)
132
+ end
133
+
134
+ return char
135
+ end
136
+
137
+ def consume_entity(from_attribute=false)
138
+ char = nil
139
+ char_stack = [@stream.char]
140
+ if SPACE_CHARACTERS.include?(char_stack[0]) or [:EOF, '<', '&'].include?(char_stack[0])
141
+ @stream.unget(char_stack)
142
+ elsif char_stack[0] == '#'
143
+ # We might have a number entity here.
144
+ char_stack += [@stream.char, @stream.char]
145
+ if char_stack[0 .. 1].include? :EOF
146
+ # If we reach the end of the file put everything up to :EOF
147
+ # back in the queue
148
+ char_stack = char_stack[0...char_stack.index(:EOF)]
149
+ @stream.unget(char_stack)
150
+ @token_queue << {:type => :ParseError, :data => "expected-numeric-entity-but-got-eof"}
151
+ else
152
+ if char_stack[1].downcase == "x" and HEX_DIGITS.include? char_stack[2]
153
+ # Hexadecimal entity detected.
154
+ @stream.unget(char_stack[2])
155
+ char = consume_number_entity(true)
156
+ elsif DIGITS.include? char_stack[1]
157
+ # Decimal entity detected.
158
+ @stream.unget(char_stack[1..-1])
159
+ char = consume_number_entity(false)
160
+ else
161
+ # No number entity detected.
162
+ @stream.unget(char_stack)
163
+ @token_queue << {:type => :ParseError, :data => "expected-numeric-entity"}
164
+ end
165
+ end
166
+ else
167
+ # At this point in the process might have named entity. Entities
168
+ # are stored in the global variable "entities".
169
+ #
170
+ # Consume characters and compare to these to a substring of the
171
+ # entity names in the list until the substring no longer matches.
172
+ filteredEntityList = ENTITIES.keys
173
+ filteredEntityList.reject! {|e| e[0].chr != char_stack[0]}
174
+ entityName = nil
175
+
176
+ # Try to find the longest entity the string will match to take care
177
+ # of &noti for instance.
178
+ while char_stack.last != :EOF
179
+ name = char_stack.join('')
180
+ if filteredEntityList.any? {|e| e[0...name.length] == name}
181
+ filteredEntityList.reject! {|e| e[0...name.length] != name}
182
+ char_stack.push(@stream.char)
183
+ else
184
+ break
185
+ end
186
+
187
+ if ENTITIES.include? name
188
+ entityName = name
189
+ break if entityName[-1] == ';'
190
+ end
191
+ end
192
+
193
+ if entityName != nil
194
+ char = ENTITIES[entityName]
195
+
196
+ # Check whether or not the last character returned can be
197
+ # discarded or needs to be put back.
198
+ if entityName[-1] != ?;
199
+ @token_queue << {:type => :ParseError, :data => "named-entity-without-semicolon"}
200
+ end
201
+
202
+ if entityName[-1] != ";" and from_attribute and
203
+ (ASCII_LETTERS.include?(char_stack[entityName.length]) or
204
+ DIGITS.include?(char_stack[entityName.length]))
205
+ @stream.unget(char_stack)
206
+ char = '&'
207
+ else
208
+ @stream.unget(char_stack[entityName.length..-1])
209
+ end
210
+ else
211
+ @token_queue << {:type => :ParseError, :data => "expected-named-entity"}
212
+ @stream.unget(char_stack)
213
+ end
214
+ end
215
+ return char
216
+ end
217
+
218
+ # This method replaces the need for "entityInAttributeValueState".
219
+ def process_entity_in_attribute
220
+ entity = consume_entity()
221
+ if entity
222
+ @current_token[:data][-1][1] += entity
223
+ else
224
+ @current_token[:data][-1][1] += "&"
225
+ end
226
+ end
227
+
228
+ # This method is a generic handler for emitting the tags. It also sets
229
+ # the state to "data" because that's what's needed after a token has been
230
+ # emitted.
231
+ def emit_current_token
232
+ # Add token to the queue to be yielded
233
+ token = @current_token
234
+ if [:StartTag, :EndTag, :EmptyTag].include?(token[:type])
235
+ if @lowercase_element_name
236
+ token[:name] = token[:name].downcase
237
+ end
238
+ @token_queue << token
239
+ @state = :data_state
240
+ end
241
+
242
+ end
243
+
244
+ # Below are the various tokenizer states worked out.
245
+
246
+ # XXX AT Perhaps we should have Hixie run some evaluation on billions of
247
+ # documents to figure out what the order of the various if and elsif
248
+ # statements should be.
249
+ def data_state
250
+ data = @stream.char
251
+
252
+ if @content_model_flag == :CDATA or @content_model_flag == :RCDATA
253
+ @lastFourChars << data
254
+ @lastFourChars.shift if @lastFourChars.length > 4
255
+ end
256
+
257
+ if data == "&" and [:PCDATA,:RCDATA].include?(@content_model_flag) and !@escapeFlag
258
+ @state = :entity_data_state
259
+ elsif data == "-" && [:CDATA, :RCDATA].include?(@content_model_flag) && !@escapeFlag && @lastFourChars.join('') == "<!--"
260
+ @escapeFlag = true
261
+ @token_queue << {:type => :Characters, :data => data}
262
+ elsif data == "<" and !@escapeFlag and
263
+ [:PCDATA,:CDATA,:RCDATA].include?(@content_model_flag)
264
+ @state = :tag_open_state
265
+ elsif data == ">" and @escapeFlag and
266
+ [:CDATA,:RCDATA].include?(@content_model_flag) and
267
+ @lastFourChars[1..-1].join('') == "-->"
268
+ @escapeFlag = false
269
+ @token_queue << {:type => :Characters, :data => data}
270
+
271
+ elsif data == :EOF
272
+ # Tokenization ends.
273
+ return false
274
+
275
+ elsif SPACE_CHARACTERS.include? data
276
+ # Directly after emitting a token you switch back to the "data
277
+ # state". At that point SPACE_CHARACTERS are important so they are
278
+ # emitted separately.
279
+ # XXX need to check if we don't need a special "spaces" flag on
280
+ # characters.
281
+ @token_queue << {:type => :SpaceCharacters, :data => data + @stream.chars_until(SPACE_CHARACTERS, true)}
282
+ else
283
+ @token_queue << {:type => :Characters, :data => data + @stream.chars_until(%w[& < > -])}
284
+ end
285
+ return true
286
+ end
287
+
288
+ def entity_data_state
289
+ entity = consume_entity
290
+ if entity
291
+ @token_queue << {:type => :Characters, :data => entity}
292
+ else
293
+ @token_queue << {:type => :Characters, :data => "&"}
294
+ end
295
+ @state = :data_state
296
+ return true
297
+ end
298
+
299
+ def tag_open_state
300
+ data = @stream.char
301
+ if @content_model_flag == :PCDATA
302
+ if data == "!"
303
+ @state = :markup_declaration_open_state
304
+ elsif data == "/"
305
+ @state = :close_tag_open_state
306
+ elsif data != :EOF and ASCII_LETTERS.include? data
307
+ @current_token = {:type => :StartTag, :name => data, :data => []}
308
+ @state = :tag_name_state
309
+ elsif data == ">"
310
+ # XXX In theory it could be something besides a tag name. But
311
+ # do we really care?
312
+ @token_queue << {:type => :ParseError, :data => "expected-tag-name-but-got-right-bracket"}
313
+ @token_queue << {:type => :Characters, :data => "<>"}
314
+ @state = :data_state
315
+ elsif data == "?"
316
+ # XXX In theory it could be something besides a tag name. But
317
+ # do we really care?
318
+ @token_queue.push({:type => :ParseError, :data => "expected-tag-name-but-got-question-mark"})
319
+ @stream.unget(data)
320
+ @state = :bogus_comment_state
321
+ else
322
+ # XXX
323
+ @token_queue << {:type => :ParseError, :data => "expected-tag-name"}
324
+ @token_queue << {:type => :Characters, :data => "<"}
325
+ @stream.unget(data)
326
+ @state = :data_state
327
+ end
328
+ else
329
+ # We know the content model flag is set to either RCDATA or CDATA
330
+ # now because this state can never be entered with the PLAINTEXT
331
+ # flag.
332
+ if data == "/"
333
+ @state = :close_tag_open_state
334
+ else
335
+ @token_queue << {:type => :Characters, :data => "<"}
336
+ @stream.unget(data)
337
+ @state = :data_state
338
+ end
339
+ end
340
+ return true
341
+ end
342
+
343
+ def close_tag_open_state
344
+ if (@content_model_flag == :RCDATA or @content_model_flag == :CDATA)
345
+ if @current_token
346
+ char_stack = []
347
+
348
+ # So far we know that "</" has been consumed. We now need to know
349
+ # whether the next few characters match the name of last emitted
350
+ # start tag which also happens to be the current_token. We also need
351
+ # to have the character directly after the characters that could
352
+ # match the start tag name.
353
+ (@current_token[:name].length + 1).times do
354
+ char_stack.push(@stream.char)
355
+ # Make sure we don't get hit by :EOF
356
+ break if char_stack[-1] == :EOF
357
+ end
358
+
359
+ # Since this is just for checking. We put the characters back on
360
+ # the stack.
361
+ @stream.unget(char_stack)
362
+ end
363
+
364
+ if @current_token and
365
+ @current_token[:name].downcase ==
366
+ char_stack[0...-1].join('').downcase and
367
+ (SPACE_CHARACTERS + [">", "/", "<", :EOF]).include? char_stack[-1]
368
+ # Because the characters are correct we can safely switch to
369
+ # PCDATA mode now. This also means we don't have to do it when
370
+ # emitting the end tag token.
371
+ @content_model_flag = :PCDATA
372
+ else
373
+ @token_queue << {:type => :Characters, :data => "</"}
374
+ @state = :data_state
375
+
376
+ # Need to return here since we don't want the rest of the
377
+ # method to be walked through.
378
+ return true
379
+ end
380
+ end
381
+
382
+ data = @stream.char
383
+ if data == :EOF
384
+ @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-eof"}
385
+ @token_queue << {:type => :Characters, :data => "</"}
386
+ @state = :data_state
387
+ elsif ASCII_LETTERS.include? data
388
+ @current_token = {:type => :EndTag, :name => data, :data => []}
389
+ @state = :tag_name_state
390
+ elsif data == ">"
391
+ @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-right-bracket"}
392
+ @state = :data_state
393
+ else
394
+ # XXX data can be _'_...
395
+ @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-char", :datavars => {:data => data}}
396
+ @stream.unget(data)
397
+ @state = :bogus_comment_state
398
+ end
399
+
400
+ return true
401
+ end
402
+
403
+ def tag_name_state
404
+ data = @stream.char
405
+ if SPACE_CHARACTERS.include? data
406
+ @state = :before_attribute_name_state
407
+ elsif data == :EOF
408
+ @token_queue << {:type => :ParseError, :data => "eof-in-tag-name"}
409
+ emit_current_token
410
+ elsif ASCII_LETTERS.include? data
411
+ @current_token[:name] += data + @stream.chars_until(ASCII_LETTERS, true)
412
+ elsif data == ">"
413
+ emit_current_token
414
+ elsif data == "/"
415
+ process_solidus_in_tag
416
+ @state = :before_attribute_name_state
417
+ else
418
+ @current_token[:name] += data
419
+ end
420
+ return true
421
+ end
422
+
423
+ def before_attribute_name_state
424
+ data = @stream.char
425
+ if SPACE_CHARACTERS.include? data
426
+ @stream.chars_until(SPACE_CHARACTERS, true)
427
+ elsif data == :EOF
428
+ @token_queue << {:type => :ParseError, :data => "expected-attribute-name-but-got-eof"}
429
+ emit_current_token
430
+ elsif ASCII_LETTERS.include? data
431
+ @current_token[:data].push([data, ""])
432
+ @state = :attribute_name_state
433
+ elsif data == ">"
434
+ emit_current_token
435
+ elsif data == "/"
436
+ process_solidus_in_tag
437
+ else
438
+ @current_token[:data].push([data, ""])
439
+ @state = :attribute_name_state
440
+ end
441
+ return true
442
+ end
443
+
444
+ def attribute_name_state
445
+ data = @stream.char
446
+ leavingThisState = true
447
+ emitToken = false
448
+ if data == "="
449
+ @state = :before_attribute_value_state
450
+ elsif data == :EOF
451
+ @token_queue << {:type => :ParseError, :data => "eof-in-attribute-name"}
452
+ @state = :data_state
453
+ emitToken = true
454
+ elsif ASCII_LETTERS.include? data
455
+ @current_token[:data][-1][0] += data + @stream.chars_until(ASCII_LETTERS, true)
456
+ leavingThisState = false
457
+ elsif data == ">"
458
+ # XXX If we emit here the attributes are converted to a dict
459
+ # without being checked and when the code below runs we error
460
+ # because data is a dict not a list
461
+ emitToken = true
462
+ elsif SPACE_CHARACTERS.include? data
463
+ @state = :after_attribute_name_state
464
+ elsif data == "/"
465
+ process_solidus_in_tag
466
+ @state = :before_attribute_name_state
467
+ else
468
+ @current_token[:data][-1][0] += data
469
+ leavingThisState = false
470
+ end
471
+
472
+ if leavingThisState
473
+ # Attributes are not dropped at this stage. That happens when the
474
+ # start tag token is emitted so values can still be safely appended
475
+ # to attributes, but we do want to report the parse error in time.
476
+ if @lowercase_attr_name
477
+ @current_token[:data][-1][0] = @current_token[:data].last.first.downcase
478
+ end
479
+ @current_token[:data][0...-1].each {|name,value|
480
+ if @current_token[:data].last.first == name
481
+ @token_queue << {:type => :ParseError, :data => "duplicate-attribute"}
482
+ break # don't report an error more than once
483
+ end
484
+ }
485
+ # XXX Fix for above XXX
486
+ emit_current_token if emitToken
487
+ end
488
+ return true
489
+ end
490
+
491
+ def after_attribute_name_state
492
+ data = @stream.char
493
+ if SPACE_CHARACTERS.include? data
494
+ @stream.chars_until(SPACE_CHARACTERS, true)
495
+ elsif data == "="
496
+ @state = :before_attribute_value_state
497
+ elsif data == ">"
498
+ emit_current_token
499
+ elsif data == :EOF
500
+ @token_queue << {:type => :ParseError, :data => "expected-end-of-tag-but-got-eof"}
501
+ emit_current_token
502
+ elsif ASCII_LETTERS.include? data
503
+ @current_token[:data].push([data, ""])
504
+ @state = :attribute_name_state
505
+ elsif data == "/"
506
+ process_solidus_in_tag
507
+ @state = :before_attribute_name_state
508
+ else
509
+ @current_token[:data].push([data, ""])
510
+ @state = :attribute_name_state
511
+ end
512
+ return true
513
+ end
514
+
515
# Tokenizer state: after "=", waiting for an attribute value to start.
def before_attribute_value_state
  c = @stream.char
  case c
  when *SPACE_CHARACTERS
    @stream.chars_until(SPACE_CHARACTERS, true)
  when "\""
    @state = :attribute_value_double_quoted_state
  when "&"
    # The entity belongs to an unquoted value; push the "&" back so the
    # unquoted state re-reads and processes it.
    @state = :attribute_value_unquoted_state
    @stream.unget(c)
  when "'"
    @state = :attribute_value_single_quoted_state
  when ">"
    emit_current_token
  when :EOF
    @token_queue << {:type => :ParseError, :data => "expected-attribute-value-but-got-eof"}
    emit_current_token
  else
    @current_token[:data][-1][1] += c
    @state = :attribute_value_unquoted_state
  end
  true
end
537
+
538
# Tokenizer state: inside a double-quoted attribute value.
def attribute_value_double_quoted_state
  c = @stream.char
  case c
  when "\""
    @state = :before_attribute_name_state
  when "&"
    process_entity_in_attribute
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-double-quote"}
    emit_current_token
  else
    # Bulk-append everything up to the next quote or entity start.
    @current_token[:data][-1][1] += c + @stream.chars_until(["\"", "&"])
  end
  true
end
552
+
553
# Tokenizer state: inside a single-quoted attribute value.
def attribute_value_single_quoted_state
  c = @stream.char
  case c
  when "'"
    @state = :before_attribute_name_state
  when "&"
    process_entity_in_attribute
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-single-quote"}
    emit_current_token
  else
    # Bulk-append everything up to the next quote or entity start.
    @current_token[:data][-1][1] += c + @stream.chars_until(["'", "&"])
  end
  true
end
568
+
569
# Tokenizer state: inside an unquoted attribute value; whitespace or ">"
# terminates it.
def attribute_value_unquoted_state
  c = @stream.char
  case c
  when *SPACE_CHARACTERS
    @state = :before_attribute_name_state
  when "&"
    process_entity_in_attribute
  when ">"
    emit_current_token
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-no-quotes"}
    emit_current_token
  else
    # Bulk-append up to the next delimiter, entity, or whitespace.
    @current_token[:data][-1][1] += c + @stream.chars_until(["&", ">", "<"] + SPACE_CHARACTERS)
  end
  true
end
585
+
586
# Swallow everything up to the next ">" (or EOF; chars_until stops at EOF
# on its own) and emit it as a Comment token.
def bogus_comment_state
  @token_queue << {:type => :Comment, :data => @stream.chars_until(">")}
  # Drop the terminating ">" (or the EOF marker) itself.
  @stream.char
  @state = :data_state
  true
end
598
+
599
# After "<!": decide between a comment ("--"), a DOCTYPE declaration,
# or a bogus comment for anything else.
def markup_declaration_open_state
  seen = [@stream.char, @stream.char]
  if seen == ["-", "-"]
    @current_token = {:type => :Comment, :data => ""}
    @state = :comment_start_state
  else
    5.times { seen.push(@stream.char) }
    # Explicit EOF guard before the 7-character keyword comparison.
    if !seen.include?(:EOF) && seen.join("").upcase == "DOCTYPE"
      @current_token = {:type => :Doctype, :name => "", :publicId => nil, :systemId => nil, :correct => true}
      @state = :doctype_state
    else
      @token_queue << {:type => :ParseError, :data => "expected-dashes-or-doctype"}
      @stream.unget(seen)
      @state = :bogus_comment_state
    end
  end
  true
end
618
+
619
# Tokenizer state: right after "<!--".
def comment_start_state
  c = @stream.char
  case c
  when "-"
    @state = :comment_start_dash_state
  when ">"
    # "<!-->" — a comment closed immediately.
    @token_queue << {:type => :ParseError, :data => "incorrect-comment"}
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:data] += c + @stream.chars_until("-")
    @state = :comment_state
  end
  true
end
637
+
638
# Tokenizer state: "<!--" followed by a single "-".
def comment_start_dash_state
  c = @stream.char
  case c
  when "-"
    @state = :comment_end_state
  when ">"
    # "<!--->" — a comment closed immediately.
    @token_queue << {:type => :ParseError, :data => "incorrect-comment"}
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
    @token_queue << @current_token
    @state = :data_state
  else
    # The lone dash is comment content; keep it and continue.
    @current_token[:data] += '-' + c + @stream.chars_until("-")
    @state = :comment_state
  end
  true
end
656
+
657
# Tokenizer state: inside a comment body; accumulates text until a dash.
def comment_state
  c = @stream.char
  case c
  when "-"
    @state = :comment_end_dash_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
    @token_queue << @current_token
    @state = :data_state
  else
    # Bulk-append comment text up to the next dash.
    @current_token[:data] += c + @stream.chars_until("-")
  end
  true
end
670
+
671
# Tokenizer state: saw one "-" inside a comment; a second one means the
# comment may be ending.
def comment_end_dash_state
  c = @stream.char
  case c
  when "-"
    @state = :comment_end_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-comment-end-dash"}
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:data] += "-" + c + @stream.chars_until("-")
    # Also consume the "-" (or EOF) that chars_until stopped at, so a
    # following "-" takes us straight to the comment-end state without a
    # spurious ParseError there.
    @stream.char
  end
  true
end
689
+
690
# Tokenizer state: saw "--" inside a comment; ">" closes it.
def comment_end_state
  c = @stream.char
  case c
  when ">"
    @token_queue << @current_token
    @state = :data_state
  when "-"
    # "--- " style run: report, keep the extra dash, stay in this state.
    @token_queue << {:type => :ParseError, :data => "unexpected-dash-after-double-dash-in-comment"}
    @current_token[:data] += c
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-comment-double-dash"}
    @token_queue << @current_token
    @state = :data_state
  else
    # The "--" did not end the comment; restore it and keep scanning.
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-comment"}
    @current_token[:data] += "--" + c
    @state = :comment_state
  end
  true
end
710
+
711
# Tokenizer state: right after the DOCTYPE keyword; a space must follow.
def doctype_state
  c = @stream.char
  unless SPACE_CHARACTERS.include?(c)
    # Missing space: report and push the character back for reprocessing.
    @token_queue << {:type => :ParseError, :data => "need-space-after-doctype"}
    @stream.unget(c)
  end
  @state = :before_doctype_name_state
  true
end
722
+
723
# Tokenizer state: waiting for the first character of the DOCTYPE name.
def before_doctype_name_state
  c = @stream.char
  case c
  when *SPACE_CHARACTERS
    # skip leading whitespace
  when ">"
    @token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-right-bracket"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-eof"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:name] = c
    @state = :doctype_name_state
  end
  true
end
742
+
743
# Tokenizer state: accumulating the DOCTYPE name one character at a time.
def doctype_name_state
  c = @stream.char
  case c
  when *SPACE_CHARACTERS
    @state = :after_doctype_name_state
  when ">"
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype-name"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:name] += c
  end
  true
end
761
+
762
# Tokenizer state: after the DOCTYPE name. Looks for the end of the tag,
# or the (case-insensitive) PUBLIC/SYSTEM keyword introducing an
# identifier; anything else degrades to a bogus doctype.
def after_doctype_name_state
  data = @stream.char
  if SPACE_CHARACTERS.include? data
  elsif data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @current_token[:correct] = false
    @stream.unget(data)
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @token_queue << @current_token
    @state = :data_state
  else
    # Read five more characters to form a six-character candidate keyword.
    char_stack = [data]
    # BUG FIX: was `stream.char` — an undefined local (NameError on any
    # "<!DOCTYPE x PUBLIC ...>" input); the input stream lives in @stream.
    5.times { char_stack << @stream.char }
    token = char_stack.join('').tr(ASCII_UPPERCASE, ASCII_LOWERCASE)
    if token == "public" and !char_stack.include?(:EOF)
      @state = :before_doctype_public_identifier_state
    elsif token == "system" and !char_stack.include?(:EOF)
      @state = :before_doctype_system_identifier_state
    else
      # Not a recognized keyword: push everything back and report.
      @stream.unget(char_stack)
      @token_queue << {:type => :ParseError, :data => "expected-space-or-right-bracket-in-doctype", "datavars" => {"data" => data}}
      @state = :bogus_doctype_state
    end
  end
  return true
end
790
+
791
# Tokenizer state: after the PUBLIC keyword; expects a quoted identifier.
def before_doctype_public_identifier_state
  c = @stream.char
  case c
  when *SPACE_CHARACTERS
    # skip whitespace before the public identifier
  when "\""
    @current_token[:publicId] = ""
    @state = :doctype_public_identifier_double_quoted_state
  when "'"
    @current_token[:publicId] = ""
    @state = :doctype_public_identifier_single_quoted_state
  when ">"
    @token_queue << {:type => :ParseError, :data => "unexpected-end-of-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @state = :bogus_doctype_state
  end
  true
end
818
+
819
# Tokenizer state: inside a double-quoted DOCTYPE public identifier.
def doctype_public_identifier_double_quoted_state
  c = @stream.char
  case c
  when "\""
    @state = :after_doctype_public_identifier_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:publicId] += c
  end
  true
end
833
+
834
# Tokenizer state: inside a single-quoted DOCTYPE public identifier.
def doctype_public_identifier_single_quoted_state
  c = @stream.char
  case c
  when "'"
    @state = :after_doctype_public_identifier_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:publicId] += c
  end
  true
end
848
+
849
# Tokenizer state: after a complete public identifier; a system identifier
# may follow, or the doctype may close.
def after_doctype_public_identifier_state
  data = @stream.char
  if SPACE_CHARACTERS.include?(data)
  elsif data == "\""
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_double_quoted_state
  elsif data == "'"
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_single_quoted_state
  elsif data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    # BUG FIX: this branch handles a stray character, not EOF; the error
    # code was "eof-in-doctype" — html5lib reports "unexpected-char-in-doctype".
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @state = :bogus_doctype_state
  end
  return true
end
872
+
873
# Tokenizer state: after the SYSTEM keyword; expects a quoted identifier.
def before_doctype_system_identifier_state
  c = @stream.char
  case c
  when *SPACE_CHARACTERS
    # skip whitespace before the system identifier
  when "\""
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_double_quoted_state
  when "'"
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_single_quoted_state
  when ">"
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @state = :bogus_doctype_state
  end
  true
end
898
+
899
# Tokenizer state: inside a double-quoted DOCTYPE system identifier.
def doctype_system_identifier_double_quoted_state
  c = @stream.char
  case c
  when "\""
    @state = :after_doctype_system_identifier_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:systemId] += c
  end
  true
end
913
+
914
# Tokenizer state: inside a single-quoted DOCTYPE system identifier.
def doctype_system_identifier_single_quoted_state
  c = @stream.char
  case c
  when "'"
    @state = :after_doctype_system_identifier_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:systemId] += c
  end
  true
end
928
+
929
# Tokenizer state: after a complete system identifier; only whitespace or
# ">" is valid before the doctype closes.
def after_doctype_system_identifier_state
  data = @stream.char
  if SPACE_CHARACTERS.include?(data)
  elsif data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    # BUG FIX: this branch handles a stray character, not EOF; the error
    # code was "eof-in-doctype" — html5lib reports "unexpected-char-in-doctype".
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @state = :bogus_doctype_state
  end
  return true
end
946
+
947
# Tokenizer state: malformed doctype; skip characters until ">" or EOF,
# marking the token as incorrect along the way.
def bogus_doctype_state
  c = @stream.char
  @current_token[:correct] = false
  case c
  when ">"
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    # XXX EMIT
    @stream.unget(c)
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  end
  true
end
963
+
964
+ end
965
+
966
+ end