feedtools 0.2.26 → 0.2.27

Files changed (166)
  1. data/CHANGELOG +232 -216
  2. data/db/migration.rb +2 -0
  3. data/db/schema.mysql.sql +2 -0
  4. data/db/schema.postgresql.sql +3 -1
  5. data/db/schema.sqlite.sql +3 -1
  6. data/lib/feed_tools.rb +37 -14
  7. data/lib/feed_tools/database_feed_cache.rb +13 -2
  8. data/lib/feed_tools/feed.rb +430 -104
  9. data/lib/feed_tools/feed_item.rb +533 -268
  10. data/lib/feed_tools/helpers/generic_helper.rb +1 -1
  11. data/lib/feed_tools/helpers/html_helper.rb +78 -116
  12. data/lib/feed_tools/helpers/retrieval_helper.rb +33 -3
  13. data/lib/feed_tools/helpers/uri_helper.rb +46 -54
  14. data/lib/feed_tools/monkey_patch.rb +27 -1
  15. data/lib/feed_tools/vendor/html5/History.txt +10 -0
  16. data/lib/feed_tools/vendor/html5/Manifest.txt +117 -0
  17. data/lib/feed_tools/vendor/html5/README +45 -0
  18. data/lib/feed_tools/vendor/html5/Rakefile.rb +33 -0
  19. data/lib/feed_tools/vendor/html5/bin/html5 +217 -0
  20. data/lib/feed_tools/vendor/html5/lib/core_ext/string.rb +17 -0
  21. data/lib/feed_tools/vendor/html5/lib/html5.rb +13 -0
  22. data/lib/feed_tools/vendor/html5/lib/html5/constants.rb +1046 -0
  23. data/lib/feed_tools/vendor/html5/lib/html5/filters/base.rb +10 -0
  24. data/lib/feed_tools/vendor/html5/lib/html5/filters/inject_meta_charset.rb +82 -0
  25. data/lib/feed_tools/vendor/html5/lib/html5/filters/iso639codes.rb +752 -0
  26. data/lib/feed_tools/vendor/html5/lib/html5/filters/optionaltags.rb +198 -0
  27. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc2046.rb +30 -0
  28. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc3987.rb +89 -0
  29. data/lib/feed_tools/vendor/html5/lib/html5/filters/sanitizer.rb +15 -0
  30. data/lib/feed_tools/vendor/html5/lib/html5/filters/validator.rb +830 -0
  31. data/lib/feed_tools/vendor/html5/lib/html5/filters/whitespace.rb +36 -0
  32. data/lib/feed_tools/vendor/html5/lib/html5/html5parser.rb +248 -0
  33. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_body_phase.rb +46 -0
  34. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  35. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_head_phase.rb +50 -0
  36. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/before_head_phase.rb +41 -0
  37. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_body_phase.rb +613 -0
  38. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_caption_phase.rb +69 -0
  39. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_cell_phase.rb +78 -0
  40. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  41. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  42. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_head_phase.rb +138 -0
  43. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_row_phase.rb +89 -0
  44. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_select_phase.rb +85 -0
  45. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_body_phase.rb +86 -0
  46. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_phase.rb +115 -0
  47. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/initial_phase.rb +133 -0
  48. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/phase.rb +154 -0
  49. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/root_element_phase.rb +41 -0
  50. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  51. data/lib/feed_tools/vendor/html5/lib/html5/inputstream.rb +648 -0
  52. data/lib/feed_tools/vendor/html5/lib/html5/liberalxmlparser.rb +158 -0
  53. data/lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb +188 -0
  54. data/lib/feed_tools/vendor/html5/lib/html5/serializer.rb +2 -0
  55. data/lib/feed_tools/vendor/html5/lib/html5/serializer/htmlserializer.rb +179 -0
  56. data/lib/feed_tools/vendor/html5/lib/html5/serializer/xhtmlserializer.rb +20 -0
  57. data/lib/feed_tools/vendor/html5/lib/html5/sniffer.rb +45 -0
  58. data/lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb +966 -0
  59. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders.rb +24 -0
  60. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/base.rb +334 -0
  61. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/hpricot.rb +231 -0
  62. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/rexml.rb +209 -0
  63. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/simpletree.rb +185 -0
  64. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers.rb +26 -0
  65. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/base.rb +162 -0
  66. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/hpricot.rb +48 -0
  67. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/rexml.rb +48 -0
  68. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/simpletree.rb +48 -0
  69. data/lib/feed_tools/vendor/html5/lib/html5/version.rb +3 -0
  70. data/lib/feed_tools/vendor/html5/testdata/encoding/chardet/test_big5.txt +51 -0
  71. data/lib/feed_tools/vendor/html5/testdata/encoding/test-yahoo-jp.dat +10 -0
  72. data/lib/feed_tools/vendor/html5/testdata/encoding/tests1.dat +394 -0
  73. data/lib/feed_tools/vendor/html5/testdata/encoding/tests2.dat +81 -0
  74. data/lib/feed_tools/vendor/html5/testdata/sanitizer/tests1.dat +416 -0
  75. data/lib/feed_tools/vendor/html5/testdata/serializer/core.test +104 -0
  76. data/lib/feed_tools/vendor/html5/testdata/serializer/injectmeta.test +65 -0
  77. data/lib/feed_tools/vendor/html5/testdata/serializer/optionaltags.test +900 -0
  78. data/lib/feed_tools/vendor/html5/testdata/serializer/options.test +60 -0
  79. data/lib/feed_tools/vendor/html5/testdata/serializer/whitespace.test +51 -0
  80. data/lib/feed_tools/vendor/html5/testdata/sites/google-results.htm +1 -0
  81. data/lib/feed_tools/vendor/html5/testdata/sites/python-ref-import.htm +1 -0
  82. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps-old.htm +1 -0
  83. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps.htm +34275 -0
  84. data/lib/feed_tools/vendor/html5/testdata/sniffer/htmlOrFeed.json +43 -0
  85. data/lib/feed_tools/vendor/html5/testdata/tokenizer/contentModelFlags.test +48 -0
  86. data/lib/feed_tools/vendor/html5/testdata/tokenizer/entities.test +2339 -0
  87. data/lib/feed_tools/vendor/html5/testdata/tokenizer/escapeFlag.test +21 -0
  88. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test1.test +172 -0
  89. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test2.test +129 -0
  90. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test3.test +367 -0
  91. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test4.test +198 -0
  92. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests1.dat +1950 -0
  93. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests2.dat +773 -0
  94. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests3.dat +270 -0
  95. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests4.dat +60 -0
  96. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests5.dat +175 -0
  97. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests6.dat +196 -0
  98. data/lib/feed_tools/vendor/html5/testdata/validator/attributes.test +1035 -0
  99. data/lib/feed_tools/vendor/html5/testdata/validator/base-href-attribute.test +787 -0
  100. data/lib/feed_tools/vendor/html5/testdata/validator/base-target-attribute.test +35 -0
  101. data/lib/feed_tools/vendor/html5/testdata/validator/blockquote-cite-attribute.test +7 -0
  102. data/lib/feed_tools/vendor/html5/testdata/validator/classattribute.test +152 -0
  103. data/lib/feed_tools/vendor/html5/testdata/validator/contenteditableattribute.test +59 -0
  104. data/lib/feed_tools/vendor/html5/testdata/validator/contextmenuattribute.test +115 -0
  105. data/lib/feed_tools/vendor/html5/testdata/validator/dirattribute.test +59 -0
  106. data/lib/feed_tools/vendor/html5/testdata/validator/draggableattribute.test +63 -0
  107. data/lib/feed_tools/vendor/html5/testdata/validator/html-xmlns-attribute.test +23 -0
  108. data/lib/feed_tools/vendor/html5/testdata/validator/idattribute.test +115 -0
  109. data/lib/feed_tools/vendor/html5/testdata/validator/inputattributes.test +2795 -0
  110. data/lib/feed_tools/vendor/html5/testdata/validator/irrelevantattribute.test +63 -0
  111. data/lib/feed_tools/vendor/html5/testdata/validator/langattribute.test +5579 -0
  112. data/lib/feed_tools/vendor/html5/testdata/validator/li-value-attribute.test +7 -0
  113. data/lib/feed_tools/vendor/html5/testdata/validator/link-href-attribute.test +7 -0
  114. data/lib/feed_tools/vendor/html5/testdata/validator/link-hreflang-attribute.test +7 -0
  115. data/lib/feed_tools/vendor/html5/testdata/validator/link-rel-attribute.test +271 -0
  116. data/lib/feed_tools/vendor/html5/testdata/validator/ol-start-attribute.test +7 -0
  117. data/lib/feed_tools/vendor/html5/testdata/validator/starttags.test +375 -0
  118. data/lib/feed_tools/vendor/html5/testdata/validator/style-scoped-attribute.test +7 -0
  119. data/lib/feed_tools/vendor/html5/testdata/validator/tabindexattribute.test +79 -0
  120. data/lib/feed_tools/vendor/html5/tests/preamble.rb +72 -0
  121. data/lib/feed_tools/vendor/html5/tests/test_encoding.rb +35 -0
  122. data/lib/feed_tools/vendor/html5/tests/test_lxp.rb +279 -0
  123. data/lib/feed_tools/vendor/html5/tests/test_parser.rb +68 -0
  124. data/lib/feed_tools/vendor/html5/tests/test_sanitizer.rb +142 -0
  125. data/lib/feed_tools/vendor/html5/tests/test_serializer.rb +68 -0
  126. data/lib/feed_tools/vendor/html5/tests/test_sniffer.rb +27 -0
  127. data/lib/feed_tools/vendor/html5/tests/test_stream.rb +62 -0
  128. data/lib/feed_tools/vendor/html5/tests/test_tokenizer.rb +94 -0
  129. data/lib/feed_tools/vendor/html5/tests/test_treewalkers.rb +135 -0
  130. data/lib/feed_tools/vendor/html5/tests/test_validator.rb +31 -0
  131. data/lib/feed_tools/vendor/html5/tests/tokenizer_test_parser.rb +63 -0
  132. data/lib/feed_tools/vendor/uri.rb +781 -0
  133. data/lib/feed_tools/version.rb +1 -1
  134. data/rakefile +27 -6
  135. data/test/unit/atom_test.rb +298 -210
  136. data/test/unit/helper_test.rb +7 -12
  137. data/test/unit/rdf_test.rb +51 -1
  138. data/test/unit/rss_test.rb +13 -3
  139. metadata +239 -116
  140. data/lib/feed_tools/vendor/htree.rb +0 -97
  141. data/lib/feed_tools/vendor/htree/container.rb +0 -10
  142. data/lib/feed_tools/vendor/htree/context.rb +0 -67
  143. data/lib/feed_tools/vendor/htree/display.rb +0 -27
  144. data/lib/feed_tools/vendor/htree/doc.rb +0 -149
  145. data/lib/feed_tools/vendor/htree/elem.rb +0 -262
  146. data/lib/feed_tools/vendor/htree/encoder.rb +0 -163
  147. data/lib/feed_tools/vendor/htree/equality.rb +0 -218
  148. data/lib/feed_tools/vendor/htree/extract_text.rb +0 -37
  149. data/lib/feed_tools/vendor/htree/fstr.rb +0 -33
  150. data/lib/feed_tools/vendor/htree/gencode.rb +0 -97
  151. data/lib/feed_tools/vendor/htree/htmlinfo.rb +0 -672
  152. data/lib/feed_tools/vendor/htree/inspect.rb +0 -108
  153. data/lib/feed_tools/vendor/htree/leaf.rb +0 -94
  154. data/lib/feed_tools/vendor/htree/loc.rb +0 -367
  155. data/lib/feed_tools/vendor/htree/modules.rb +0 -48
  156. data/lib/feed_tools/vendor/htree/name.rb +0 -124
  157. data/lib/feed_tools/vendor/htree/output.rb +0 -207
  158. data/lib/feed_tools/vendor/htree/parse.rb +0 -409
  159. data/lib/feed_tools/vendor/htree/raw_string.rb +0 -124
  160. data/lib/feed_tools/vendor/htree/regexp-util.rb +0 -15
  161. data/lib/feed_tools/vendor/htree/rexml.rb +0 -130
  162. data/lib/feed_tools/vendor/htree/scan.rb +0 -166
  163. data/lib/feed_tools/vendor/htree/tag.rb +0 -111
  164. data/lib/feed_tools/vendor/htree/template.rb +0 -909
  165. data/lib/feed_tools/vendor/htree/text.rb +0 -115
  166. data/lib/feed_tools/vendor/htree/traverse.rb +0 -465
data/lib/feed_tools/vendor/html5/lib/html5/serializer/xhtmlserializer.rb
@@ -0,0 +1,20 @@
+ require 'html5/serializer/htmlserializer'
+
+ module HTML5
+
+ class XHTMLSerializer < HTMLSerializer
+ DEFAULTS = {
+ :quote_attr_values => true,
+ :minimize_boolean_attributes => false,
+ :use_trailing_solidus => true,
+ :escape_lt_in_attrs => true,
+ :omit_optional_tags => false,
+ :escape_rcdata => true
+ }
+
+ def initialize(options={})
+ super(DEFAULTS.clone.update(options))
+ end
+ end
+
+ end
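
The defaults above give the serializer XML-style output: attribute values are always quoted, boolean attributes are never minimized, and void elements get a trailing solidus. A minimal standalone sketch of how the option merging in XHTMLSerializer#initialize behaves (plain Ruby, constant and option names copied from the diff, no gem required):

  DEFAULTS = {
    :quote_attr_values => true,
    :minimize_boolean_attributes => false,
    :use_trailing_solidus => true
  }

  caller_options = { :use_trailing_solidus => false }

  # Hash#update is an alias for Hash#merge!, so caller options win over defaults.
  merged = DEFAULTS.clone.update(caller_options)
  # => {:quote_attr_values=>true, :minimize_boolean_attributes=>false, :use_trailing_solidus=>false}

Cloning first keeps the DEFAULTS constant itself unmodified across serializer instances.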
data/lib/feed_tools/vendor/html5/lib/html5/sniffer.rb
@@ -0,0 +1,45 @@
+ module HTML5
+ module Sniffer
+ # 4.7.4
+ def html_or_feed str
+ s = str[0, 512] # steps 1, 2
+ pos = 0
+
+ while pos < s.length
+ case s[pos]
+ when 0x09, 0x20, 0x0A, 0x0D # tab, space, LF, CR
+ pos += 1
+ when 0x3C # "<"
+ pos += 1
+ if s[pos..pos+2] == "!--" # [0x21, 0x2D, 0x2D]
+ pos += 3
+ until s[pos..pos+2] == "-->" or pos >= s.length
+ pos += 1
+ end
+ pos += 3
+ elsif s[pos] == 0x21 # "!"
+ pos += 1
+ until s[pos] == 0x3E or pos >= s.length # ">"
+ pos += 1
+ end
+ pos += 1
+ elsif s[pos] == 0x3F # "?"
+ until s[pos..pos+1] == "?>" or pos >= s.length # [0x3F, 0x3E]
+ pos += 1
+ end
+ pos += 2
+ elsif s[pos..pos+2] == "rss" # [0x72, 0x73, 0x73]
+ return "application/rss+xml"
+ elsif s[pos..pos+3] == "feed" # [0x66, 0x65, 0x65, 0x64]
+ return "application/atom+xml"
+ elsif s[pos..pos+6] == "rdf:RDF" # [0x72, 0x64, 0x66, 0x3A, 0x52, 0x44, 0x46]
+ raise NotImplementedError
+ end
+ else
+ break
+ end
+ end
+ "text/html"
+ end
+ end
+ end
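
The sniffer is written as a mix-in: any class that includes HTML5::Sniffer picks up html_or_feed, which skips leading whitespace, comments, "!" declarations, and processing instructions before deciding between feed and HTML media types. A rough usage sketch, assuming the vendored html5 lib directory is on the load path; note the byte comparisons (0x3C and friends) rely on Ruby 1.8 semantics, where String#[] with an integer index returns a character code:

  require 'html5/sniffer'

  class ContentTypeGuesser
    include HTML5::Sniffer   # mixes in html_or_feed
  end

  guesser = ContentTypeGuesser.new
  guesser.html_or_feed('<!-- a comment --><rss version="2.0">')
  # => "application/rss+xml"
  guesser.html_or_feed('<?xml version="1.0"?><feed xmlns="http://www.w3.org/2005/Atom">')
  # => "application/atom+xml"
  guesser.html_or_feed('<html><body>plain page</body></html>')
  # => "text/html"

Anything that does not start with an rss, feed, or rdf:RDF root element falls through to "text/html", and rdf:RDF deliberately raises NotImplementedError.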
data/lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb
@@ -0,0 +1,966 @@
+ require 'html5/constants'
+ require 'html5/inputstream'
+
+ module HTML5
+
+ # This class takes care of tokenizing HTML.
+ #
+ # * @current_token
+ # Holds the token that is currently being processed.
+ #
+ # * @state
+ # Holds a reference to the method to be invoked... XXX
+ #
+ # * @states
+ # Holds a mapping between states and methods that implement the state.
+ #
+ # * @stream
+ # Points to HTMLInputStream object.
+
+ class HTMLTokenizer
+ attr_accessor :content_model_flag, :current_token
+ attr_reader :stream
+
+ # XXX need to fix documentation
+
+ def initialize(stream, options = {})
+ @stream = HTMLInputStream.new(stream, options)
+
+ # Setup the initial tokenizer state
+ @content_model_flag = :PCDATA
+ @state = :data_state
+ @escapeFlag = false
+ @lastFourChars = []
+
+ # The current token being created
+ @current_token = nil
+
+ # Tokens to be processed.
+ @token_queue = []
+ @lowercase_element_name = options[:lowercase_element_name] != false
+ @lowercase_attr_name = options[:lowercase_attr_name] != false
+ end
+
+ # This is where the magic happens.
+ #
+ # We do our usually processing through the states and when we have a token
+ # to return we yield the token which pauses processing until the next token
+ # is requested.
+ def each
+ @token_queue = []
+ # Start processing. When EOF is reached @state will return false
+ # instead of true and the loop will terminate.
+ while send @state
+ yield :type => :ParseError, :data => @stream.errors.shift until @stream.errors.empty?
+ yield @token_queue.shift until @token_queue.empty?
+ end
+ end
+
+ # Below are various helper functions the tokenizer states use worked out.
+
+ # If the next character is a '>', convert the current_token into
+ # an EmptyTag
+
+ def process_solidus_in_tag
+
+ # We need to consume another character to make sure it's a ">"
+ data = @stream.char
+
+ if @current_token[:type] == :StartTag and data == ">"
+ @current_token[:type] = :EmptyTag
+ else
+ @token_queue << {:type => :ParseError, :data => "incorrectly-placed-solidus"}
+ end
+
+ # The character we just consumed need to be put back on the stack so it
+ # doesn't get lost...
+ @stream.unget(data)
+ end
+
+ # This function returns either U+FFFD or the character based on the
+ # decimal or hexadecimal representation. It also discards ";" if present.
+ # If not present @token_queue << {:type => :ParseError}" is invoked.
+
+ def consume_number_entity(isHex)
+
+ # XXX More need to be done here. For instance, #13 should prolly be
+ # converted to #10 so we don't get \r (#13 is \r right?) in the DOM and
+ # such. Thoughts on this appreciated.
+ allowed = DIGITS
+ radix = 10
+ if isHex
+ allowed = HEX_DIGITS
+ radix = 16
+ end
+
+ char_stack = []
+
+ # Consume all the characters that are in range while making sure we
+ # don't hit an EOF.
+ c = @stream.char
+ while allowed.include?(c) and c != :EOF
+ char_stack.push(c)
+ c = @stream.char
+ end
+
+ # Convert the set of characters consumed to an int.
+ charAsInt = char_stack.join('').to_i(radix)
+
+ if charAsInt == 13
+ @token_queue << {:type => :ParseError, :data => "incorrect-cr-newline-entity"}
+ charAsInt = 10
+ elsif (128..159).include? charAsInt
+ # If the integer is between 127 and 160 (so 128 and bigger and 159
+ # and smaller) we need to do the "windows trick".
+ @token_queue << {:type => :ParseError, :data => "illegal-windows-1252-entity"}
+
+ charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128]
+ end
+
+ if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343)
+ char = [charAsInt].pack('U')
+ else
+ char = [0xFFFD].pack('U')
+ @token_queue << {:type => :ParseError, :data => "cant-convert-numeric-entity", :datavars => {"charAsInt" => charAsInt}}
+ end
+
+ # Discard the ; if present. Otherwise, put it back on the queue and
+ # invoke parse_error on parser.
+ if c != ";"
+ @token_queue << {:type => :ParseError, :data => "numeric-entity-without-semicolon"}
+ @stream.unget(c)
+ end
+
+ return char
+ end
+
+ def consume_entity(from_attribute=false)
+ char = nil
+ char_stack = [@stream.char]
+ if SPACE_CHARACTERS.include?(char_stack[0]) or [:EOF, '<', '&'].include?(char_stack[0])
+ @stream.unget(char_stack)
+ elsif char_stack[0] == '#'
+ # We might have a number entity here.
+ char_stack += [@stream.char, @stream.char]
+ if char_stack[0 .. 1].include? :EOF
+ # If we reach the end of the file put everything up to :EOF
+ # back in the queue
+ char_stack = char_stack[0...char_stack.index(:EOF)]
+ @stream.unget(char_stack)
+ @token_queue << {:type => :ParseError, :data => "expected-numeric-entity-but-got-eof"}
+ else
+ if char_stack[1].downcase == "x" and HEX_DIGITS.include? char_stack[2]
+ # Hexadecimal entity detected.
+ @stream.unget(char_stack[2])
+ char = consume_number_entity(true)
+ elsif DIGITS.include? char_stack[1]
+ # Decimal entity detected.
+ @stream.unget(char_stack[1..-1])
+ char = consume_number_entity(false)
+ else
+ # No number entity detected.
+ @stream.unget(char_stack)
+ @token_queue << {:type => :ParseError, :data => "expected-numeric-entity"}
+ end
+ end
+ else
+ # At this point in the process might have named entity. Entities
+ # are stored in the global variable "entities".
+ #
+ # Consume characters and compare to these to a substring of the
+ # entity names in the list until the substring no longer matches.
+ filteredEntityList = ENTITIES.keys
+ filteredEntityList.reject! {|e| e[0].chr != char_stack[0]}
+ entityName = nil
+
+ # Try to find the longest entity the string will match to take care
+ # of &noti for instance.
+ while char_stack.last != :EOF
+ name = char_stack.join('')
+ if filteredEntityList.any? {|e| e[0...name.length] == name}
+ filteredEntityList.reject! {|e| e[0...name.length] != name}
+ char_stack.push(@stream.char)
+ else
+ break
+ end
+
+ if ENTITIES.include? name
+ entityName = name
+ break if entityName[-1] == ';'
+ end
+ end
+
+ if entityName != nil
+ char = ENTITIES[entityName]
+
+ # Check whether or not the last character returned can be
+ # discarded or needs to be put back.
+ if entityName[-1] != ?;
+ @token_queue << {:type => :ParseError, :data => "named-entity-without-semicolon"}
+ end
+
+ if entityName[-1] != ";" and from_attribute and
+ (ASCII_LETTERS.include?(char_stack[entityName.length]) or
+ DIGITS.include?(char_stack[entityName.length]))
+ @stream.unget(char_stack)
+ char = '&'
+ else
+ @stream.unget(char_stack[entityName.length..-1])
+ end
+ else
+ @token_queue << {:type => :ParseError, :data => "expected-named-entity"}
+ @stream.unget(char_stack)
+ end
+ end
+ return char
+ end
+
+ # This method replaces the need for "entityInAttributeValueState".
+ def process_entity_in_attribute
+ entity = consume_entity()
+ if entity
+ @current_token[:data][-1][1] += entity
+ else
+ @current_token[:data][-1][1] += "&"
+ end
+ end
+
+ # This method is a generic handler for emitting the tags. It also sets
+ # the state to "data" because that's what's needed after a token has been
+ # emitted.
+ def emit_current_token
+ # Add token to the queue to be yielded
+ token = @current_token
+ if [:StartTag, :EndTag, :EmptyTag].include?(token[:type])
+ if @lowercase_element_name
+ token[:name] = token[:name].downcase
+ end
+ @token_queue << token
+ @state = :data_state
+ end
+
+ end
+
+ # Below are the various tokenizer states worked out.
+
+ # XXX AT Perhaps we should have Hixie run some evaluation on billions of
+ # documents to figure out what the order of the various if and elsif
+ # statements should be.
+ def data_state
+ data = @stream.char
+
+ if @content_model_flag == :CDATA or @content_model_flag == :RCDATA
+ @lastFourChars << data
+ @lastFourChars.shift if @lastFourChars.length > 4
+ end
+
+ if data == "&" and [:PCDATA,:RCDATA].include?(@content_model_flag) and !@escapeFlag
+ @state = :entity_data_state
+ elsif data == "-" && [:CDATA, :RCDATA].include?(@content_model_flag) && !@escapeFlag && @lastFourChars.join('') == "<!--"
+ @escapeFlag = true
+ @token_queue << {:type => :Characters, :data => data}
+ elsif data == "<" and !@escapeFlag and
+ [:PCDATA,:CDATA,:RCDATA].include?(@content_model_flag)
+ @state = :tag_open_state
+ elsif data == ">" and @escapeFlag and
+ [:CDATA,:RCDATA].include?(@content_model_flag) and
+ @lastFourChars[1..-1].join('') == "-->"
+ @escapeFlag = false
+ @token_queue << {:type => :Characters, :data => data}
+
+ elsif data == :EOF
+ # Tokenization ends.
+ return false
+
+ elsif SPACE_CHARACTERS.include? data
+ # Directly after emitting a token you switch back to the "data
+ # state". At that point SPACE_CHARACTERS are important so they are
+ # emitted separately.
+ # XXX need to check if we don't need a special "spaces" flag on
+ # characters.
+ @token_queue << {:type => :SpaceCharacters, :data => data + @stream.chars_until(SPACE_CHARACTERS, true)}
+ else
+ @token_queue << {:type => :Characters, :data => data + @stream.chars_until(%w[& < > -])}
+ end
+ return true
+ end
+
+ def entity_data_state
+ entity = consume_entity
+ if entity
+ @token_queue << {:type => :Characters, :data => entity}
+ else
+ @token_queue << {:type => :Characters, :data => "&"}
+ end
+ @state = :data_state
+ return true
+ end
+
+ def tag_open_state
+ data = @stream.char
+ if @content_model_flag == :PCDATA
+ if data == "!"
+ @state = :markup_declaration_open_state
+ elsif data == "/"
+ @state = :close_tag_open_state
+ elsif data != :EOF and ASCII_LETTERS.include? data
+ @current_token = {:type => :StartTag, :name => data, :data => []}
+ @state = :tag_name_state
+ elsif data == ">"
+ # XXX In theory it could be something besides a tag name. But
+ # do we really care?
+ @token_queue << {:type => :ParseError, :data => "expected-tag-name-but-got-right-bracket"}
+ @token_queue << {:type => :Characters, :data => "<>"}
+ @state = :data_state
+ elsif data == "?"
+ # XXX In theory it could be something besides a tag name. But
+ # do we really care?
+ @token_queue.push({:type => :ParseError, :data => "expected-tag-name-but-got-question-mark"})
+ @stream.unget(data)
+ @state = :bogus_comment_state
+ else
+ # XXX
+ @token_queue << {:type => :ParseError, :data => "expected-tag-name"}
+ @token_queue << {:type => :Characters, :data => "<"}
+ @stream.unget(data)
+ @state = :data_state
+ end
+ else
+ # We know the content model flag is set to either RCDATA or CDATA
+ # now because this state can never be entered with the PLAINTEXT
+ # flag.
+ if data == "/"
+ @state = :close_tag_open_state
+ else
+ @token_queue << {:type => :Characters, :data => "<"}
+ @stream.unget(data)
+ @state = :data_state
+ end
+ end
+ return true
+ end
+
+ def close_tag_open_state
+ if (@content_model_flag == :RCDATA or @content_model_flag == :CDATA)
+ if @current_token
+ char_stack = []
+
+ # So far we know that "</" has been consumed. We now need to know
+ # whether the next few characters match the name of last emitted
+ # start tag which also happens to be the current_token. We also need
+ # to have the character directly after the characters that could
+ # match the start tag name.
+ (@current_token[:name].length + 1).times do
+ char_stack.push(@stream.char)
+ # Make sure we don't get hit by :EOF
+ break if char_stack[-1] == :EOF
+ end
+
+ # Since this is just for checking. We put the characters back on
+ # the stack.
+ @stream.unget(char_stack)
+ end
+
+ if @current_token and
+ @current_token[:name].downcase ==
+ char_stack[0...-1].join('').downcase and
+ (SPACE_CHARACTERS + [">", "/", "<", :EOF]).include? char_stack[-1]
+ # Because the characters are correct we can safely switch to
+ # PCDATA mode now. This also means we don't have to do it when
+ # emitting the end tag token.
+ @content_model_flag = :PCDATA
+ else
+ @token_queue << {:type => :Characters, :data => "</"}
+ @state = :data_state
+
+ # Need to return here since we don't want the rest of the
+ # method to be walked through.
+ return true
+ end
+ end
+
+ data = @stream.char
+ if data == :EOF
+ @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-eof"}
+ @token_queue << {:type => :Characters, :data => "</"}
+ @state = :data_state
+ elsif ASCII_LETTERS.include? data
+ @current_token = {:type => :EndTag, :name => data, :data => []}
+ @state = :tag_name_state
+ elsif data == ">"
+ @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-right-bracket"}
+ @state = :data_state
+ else
+ # XXX data can be _'_...
+ @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-char", :datavars => {:data => data}}
+ @stream.unget(data)
+ @state = :bogus_comment_state
+ end
+
+ return true
+ end
+
+ def tag_name_state
+ data = @stream.char
+ if SPACE_CHARACTERS.include? data
+ @state = :before_attribute_name_state
+ elsif data == :EOF
+ @token_queue << {:type => :ParseError, :data => "eof-in-tag-name"}
+ emit_current_token
+ elsif ASCII_LETTERS.include? data
+ @current_token[:name] += data + @stream.chars_until(ASCII_LETTERS, true)
+ elsif data == ">"
+ emit_current_token
+ elsif data == "/"
+ process_solidus_in_tag
+ @state = :before_attribute_name_state
+ else
+ @current_token[:name] += data
+ end
+ return true
+ end
+
+ def before_attribute_name_state
+ data = @stream.char
+ if SPACE_CHARACTERS.include? data
+ @stream.chars_until(SPACE_CHARACTERS, true)
+ elsif data == :EOF
+ @token_queue << {:type => :ParseError, :data => "expected-attribute-name-but-got-eof"}
+ emit_current_token
+ elsif ASCII_LETTERS.include? data
+ @current_token[:data].push([data, ""])
+ @state = :attribute_name_state
+ elsif data == ">"
+ emit_current_token
+ elsif data == "/"
+ process_solidus_in_tag
+ else
+ @current_token[:data].push([data, ""])
+ @state = :attribute_name_state
+ end
+ return true
+ end
+
+ def attribute_name_state
+ data = @stream.char
+ leavingThisState = true
+ emitToken = false
+ if data == "="
+ @state = :before_attribute_value_state
+ elsif data == :EOF
+ @token_queue << {:type => :ParseError, :data => "eof-in-attribute-name"}
+ @state = :data_state
+ emitToken = true
+ elsif ASCII_LETTERS.include? data
+ @current_token[:data][-1][0] += data + @stream.chars_until(ASCII_LETTERS, true)
+ leavingThisState = false
+ elsif data == ">"
+ # XXX If we emit here the attributes are converted to a dict
+ # without being checked and when the code below runs we error
+ # because data is a dict not a list
+ emitToken = true
+ elsif SPACE_CHARACTERS.include? data
+ @state = :after_attribute_name_state
+ elsif data == "/"
+ process_solidus_in_tag
+ @state = :before_attribute_name_state
+ else
+ @current_token[:data][-1][0] += data
+ leavingThisState = false
+ end
+
+ if leavingThisState
+ # Attributes are not dropped at this stage. That happens when the
+ # start tag token is emitted so values can still be safely appended
+ # to attributes, but we do want to report the parse error in time.
+ if @lowercase_attr_name
+ @current_token[:data][-1][0] = @current_token[:data].last.first.downcase
+ end
+ @current_token[:data][0...-1].each {|name,value|
+ if @current_token[:data].last.first == name
+ @token_queue << {:type => :ParseError, :data => "duplicate-attribute"}
+ break # don't report an error more than once
+ end
+ }
+ # XXX Fix for above XXX
+ emit_current_token if emitToken
+ end
+ return true
+ end
+
+ def after_attribute_name_state
+ data = @stream.char
+ if SPACE_CHARACTERS.include? data
+ @stream.chars_until(SPACE_CHARACTERS, true)
+ elsif data == "="
+ @state = :before_attribute_value_state
+ elsif data == ">"
+ emit_current_token
+ elsif data == :EOF
+ @token_queue << {:type => :ParseError, :data => "expected-end-of-tag-but-got-eof"}
+ emit_current_token
+ elsif ASCII_LETTERS.include? data
+ @current_token[:data].push([data, ""])
+ @state = :attribute_name_state
+ elsif data == "/"
+ process_solidus_in_tag
+ @state = :before_attribute_name_state
+ else
+ @current_token[:data].push([data, ""])
+ @state = :attribute_name_state
+ end
+ return true
+ end
+
+ def before_attribute_value_state
+ data = @stream.char
+ if SPACE_CHARACTERS.include? data
+ @stream.chars_until(SPACE_CHARACTERS, true)
+ elsif data == "\""
+ @state = :attribute_value_double_quoted_state
+ elsif data == "&"
+ @state = :attribute_value_unquoted_state
+ @stream.unget(data);
+ elsif data == "'"
+ @state = :attribute_value_single_quoted_state
+ elsif data == ">"
+ emit_current_token
+ elsif data == :EOF
+ @token_queue << {:type => :ParseError, :data => "expected-attribute-value-but-got-eof"}
+ emit_current_token
+ else
+ @current_token[:data][-1][1] += data
+ @state = :attribute_value_unquoted_state
+ end
+ return true
+ end
+
+ def attribute_value_double_quoted_state
+ data = @stream.char
+ if data == "\""
+ @state = :before_attribute_name_state
+ elsif data == "&"
+ process_entity_in_attribute
+ elsif data == :EOF
+ @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-double-quote"}
+ emit_current_token
+ else
+ @current_token[:data][-1][1] += data + @stream.chars_until(["\"", "&"])
+ end
+ return true
+ end
+
+ def attribute_value_single_quoted_state
+ data = @stream.char
+ if data == "'"
+ @state = :before_attribute_name_state
+ elsif data == "&"
+ process_entity_in_attribute
+ elsif data == :EOF
+ @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-single-quote"}
+ emit_current_token
+ else
+ @current_token[:data][-1][1] += data +\
+ @stream.chars_until(["'", "&"])
+ end
+ return true
+ end
+
+ def attribute_value_unquoted_state
+ data = @stream.char
+ if SPACE_CHARACTERS.include? data
+ @state = :before_attribute_name_state
+ elsif data == "&"
+ process_entity_in_attribute
+ elsif data == ">"
+ emit_current_token
+ elsif data == :EOF
+ @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-no-quotes"}
+ emit_current_token
+ else
+ @current_token[:data][-1][1] += data + @stream.chars_until(["&", ">","<"] + SPACE_CHARACTERS)
+ end
+ return true
+ end
+
+ def bogus_comment_state
+ # Make a new comment token and give it as value all the characters
+ # until the first > or :EOF (chars_until checks for :EOF automatically)
+ # and emit it.
+ @token_queue << {:type => :Comment, :data => @stream.chars_until((">"))}
+
+ # Eat the character directly after the bogus comment which is either a
+ # ">" or an :EOF.
+ @stream.char
+ @state = :data_state
+ return true
+ end
+
+ def markup_declaration_open_state
+ char_stack = [@stream.char, @stream.char]
+ if char_stack == ["-", "-"]
+ @current_token = {:type => :Comment, :data => ""}
+ @state = :comment_start_state
+ else
+ 5.times { char_stack.push(@stream.char) }
+ # Put in explicit :EOF check
+ if !char_stack.include?(:EOF) && char_stack.join("").upcase == "DOCTYPE"
+ @current_token = {:type => :Doctype, :name => "", :publicId => nil, :systemId => nil, :correct => true}
+ @state = :doctype_state
+ else
+ @token_queue << {:type => :ParseError, :data => "expected-dashes-or-doctype"}
+ @stream.unget(char_stack)
+ @state = :bogus_comment_state
+ end
+ end
+ return true
+ end
+
+ def comment_start_state
+ data = @stream.char
+ if data == "-"
+ @state = :comment_start_dash_state
+ elsif data == ">"
+ @token_queue << {:type => :ParseError, :data => "incorrect-comment"}
+ @token_queue << @current_token
+ @state = :data_state
+ elsif data == :EOF
+ @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
+ @token_queue << @current_token
+ @state = :data_state
+ else
+ @current_token[:data] += data + @stream.chars_until("-")
+ @state = :comment_state
+ end
+ return true
+ end
+
+ def comment_start_dash_state
+ data = @stream.char
+ if data == "-"
+ @state = :comment_end_state
+ elsif data == ">"
+ @token_queue << {:type => :ParseError, :data => "incorrect-comment"}
+ @token_queue << @current_token
+ @state = :data_state
+ elsif data == :EOF
+ @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
+ @token_queue << @current_token
+ @state = :data_state
+ else
+ @current_token[:data] += '-' + data + @stream.chars_until("-")
+ @state = :comment_state
+ end
+ return true
+ end
+
+ def comment_state
+ data = @stream.char
+ if data == "-"
+ @state = :comment_end_dash_state
+ elsif data == :EOF
+ @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
+ @token_queue << @current_token
+ @state = :data_state
+ else
+ @current_token[:data] += data + @stream.chars_until("-")
+ end
+ return true
+ end
+
+ def comment_end_dash_state
+ data = @stream.char
+ if data == "-"
+ @state = :comment_end_state
+ elsif data == :EOF
+ @token_queue << {:type => :ParseError, :data => "eof-in-comment-end-dash"}
+ @token_queue << @current_token
+ @state = :data_state
+ else
+ @current_token[:data] += "-" + data +\
+ @stream.chars_until("-")
+ # Consume the next character which is either a "-" or an :EOF as
+ # well so if there's a "-" directly after the "-" we go nicely to
+ # the "comment end state" without emitting a ParseError there.
+ @stream.char
+ end
+ return true
+ end
+
+ def comment_end_state
+ data = @stream.char
+ if data == ">"
+ @token_queue << @current_token
+ @state = :data_state
+ elsif data == "-"
+ @token_queue << {:type => :ParseError, :data => "unexpected-dash-after-double-dash-in-comment"}
+ @current_token[:data] += data
+ elsif data == :EOF
+ @token_queue << {:type => :ParseError, :data => "eof-in-comment-double-dash"}
+ @token_queue << @current_token
+ @state = :data_state
+ else
+ # XXX
+ @token_queue << {:type => :ParseError, :data => "unexpected-char-in-comment"}
+ @current_token[:data] += "--" + data
+ @state = :comment_state
+ end
+ return true
+ end
+
+ def doctype_state
+ data = @stream.char
+ if SPACE_CHARACTERS.include? data
+ @state = :before_doctype_name_state
+ else
+ @token_queue << {:type => :ParseError, :data => "need-space-after-doctype"}
+ @stream.unget(data)
+ @state = :before_doctype_name_state
+ end
+ return true
+ end
+
+ def before_doctype_name_state
+ data = @stream.char
+ if SPACE_CHARACTERS.include? data
+ elsif data == ">"
+ @token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-right-bracket"}
+ @current_token[:correct] = false
+ @token_queue << @current_token
+ @state = :data_state
+ elsif data == :EOF
+ @token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-eof"}
+ @current_token[:correct] = false
+ @token_queue << @current_token
+ @state = :data_state
+ else
+ @current_token[:name] = data
+ @state = :doctype_name_state
+ end
+ return true
+ end
+
+ def doctype_name_state
+ data = @stream.char
+ if SPACE_CHARACTERS.include? data
+ @state = :after_doctype_name_state
+ elsif data == ">"
+ @token_queue << @current_token
+ @state = :data_state
+ elsif data == :EOF
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype-name"}
+ @current_token[:correct] = false
+ @token_queue << @current_token
+ @state = :data_state
+ else
+ @current_token[:name] += data
+ end
+
+ return true
+ end
+
+ def after_doctype_name_state
+ data = @stream.char
+ if SPACE_CHARACTERS.include? data
+ elsif data == ">"
+ @token_queue << @current_token
+ @state = :data_state
+ elsif data == :EOF
+ @current_token[:correct] = false
+ @stream.unget(data)
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
+ @token_queue << @current_token
+ @state = :data_state
+ else
+ char_stack = [data]
+ 5.times { char_stack << stream.char }
+ token = char_stack.join('').tr(ASCII_UPPERCASE,ASCII_LOWERCASE)
+ if token == "public" and !char_stack.include?(:EOF)
+ @state = :before_doctype_public_identifier_state
+ elsif token == "system" and !char_stack.include?(:EOF)
+ @state = :before_doctype_system_identifier_state
+ else
+ @stream.unget(char_stack)
+ @token_queue << {:type => :ParseError, :data => "expected-space-or-right-bracket-in-doctype", "datavars" => {"data" => data}}
+ @state = :bogus_doctype_state
+ end
+ end
+ return true
+ end
+
+ def before_doctype_public_identifier_state
+ data = @stream.char
+
+ if SPACE_CHARACTERS.include?(data)
+ elsif data == "\""
+ @current_token[:publicId] = ""
+ @state = :doctype_public_identifier_double_quoted_state
+ elsif data == "'"
+ @current_token[:publicId] = ""
+ @state = :doctype_public_identifier_single_quoted_state
+ elsif data == ">"
+ @token_queue << {:type => :ParseError, :data => "unexpected-end-of-doctype"}
+ @current_token[:correct] = false
+ @token_queue << @current_token
+ @state = :data_state
+ elsif data == :EOF
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
+ @current_token[:correct] = false
+ @token_queue << @current_token
+ @state = :data_state
+ else
+ @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
+ @state = :bogus_doctype_state
+ end
+
+ return true
+ end
+
+ def doctype_public_identifier_double_quoted_state
+ data = @stream.char
+ if data == "\""
+ @state = :after_doctype_public_identifier_state
+ elsif data == :EOF
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
+ @current_token[:correct] = false
+ @token_queue << @current_token
+ @state = :data_state
+ else
+ @current_token[:publicId] += data
+ end
+ return true
+ end
+
+ def doctype_public_identifier_single_quoted_state
+ data = @stream.char
+ if data == "'"
+ @state = :after_doctype_public_identifier_state
+ elsif data == :EOF
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
+ @current_token[:correct] = false
+ @token_queue << @current_token
+ @state = :data_state
+ else
+ @current_token[:publicId] += data
+ end
+ return true
+ end
+
+ def after_doctype_public_identifier_state
+ data = @stream.char
+ if SPACE_CHARACTERS.include?(data)
+ elsif data == "\""
+ @current_token[:systemId] = ""
+ @state = :doctype_system_identifier_double_quoted_state
+ elsif data == "'"
+ @current_token[:systemId] = ""
+ @state = :doctype_system_identifier_single_quoted_state
+ elsif data == ">"
+ @token_queue << @current_token
+ @state = :data_state
+ elsif data == :EOF
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
+ @current_token[:correct] = false
+ @token_queue << @current_token
+ @state = :data_state
+ else
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
+ @state = :bogus_doctype_state
+ end
+ return true
+ end
+
+ def before_doctype_system_identifier_state
+ data = @stream.char
+ if SPACE_CHARACTERS.include?(data)
+ elsif data == "\""
+ @current_token[:systemId] = ""
+ @state = :doctype_system_identifier_double_quoted_state
+ elsif data == "'"
+ @current_token[:systemId] = ""
+ @state = :doctype_system_identifier_single_quoted_state
+ elsif data == ">"
+ @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
+ @current_token[:correct] = false
+ @token_queue << @current_token
+ @state = :data_state
+ elsif data == :EOF
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
+ @current_token[:correct] = false
+ @token_queue << @current_token
+ @state = :data_state
+ else
+ @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
+ @state = :bogus_doctype_state
+ end
+ return true
+ end
+
+ def doctype_system_identifier_double_quoted_state
+ data = @stream.char
+ if data == "\""
+ @state = :after_doctype_system_identifier_state
+ elsif data == :EOF
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
+ @current_token[:correct] = false
+ @token_queue << @current_token
+ @state = :data_state
+ else
+ @current_token[:systemId] += data
+ end
+ return true
+ end
+
+ def doctype_system_identifier_single_quoted_state
+ data = @stream.char
+ if data == "'"
+ @state = :after_doctype_system_identifier_state
+ elsif data == :EOF
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
+ @current_token[:correct] = false
+ @token_queue << @current_token
+ @state = :data_state
+ else
+ @current_token[:systemId] += data
+ end
+ return true
+ end
+
+ def after_doctype_system_identifier_state
+ data = @stream.char
+ if SPACE_CHARACTERS.include?(data)
+ elsif data == ">"
+ @token_queue << @current_token
+ @state = :data_state
+ elsif data == :EOF
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
+ @current_token[:correct] = false
+ @token_queue << @current_token
+ @state = :data_state
+ else
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
+ @state = :bogus_doctype_state
+ end
+ return true
+ end
+
+ def bogus_doctype_state
+ data = @stream.char
+ @current_token[:correct] = false
+ if data == ">"
+ @token_queue << @current_token
+ @state = :data_state
+ elsif data == :EOF
+ # XXX EMIT
+ @stream.unget(data)
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
+ @current_token[:correct] = false
+ @token_queue << @current_token
+ @state = :data_state
+ end
+ return true
+ end
+
+ end
+
+ end
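
The tokenizer is driven through HTMLTokenizer#each, which keeps calling the current state method and yields ParseError tokens plus whatever each state pushed onto @token_queue. A hedged usage sketch, assuming the vendored lib directory is on the load path and that HTMLInputStream accepts a plain String (as the initialize signature above suggests):

  require 'html5/tokenizer'

  tokenizer = HTML5::HTMLTokenizer.new('<p class="intro">Hello &amp; goodbye</p>')
  tokenizer.each do |token|
    case token[:type]
    when :StartTag, :EmptyTag
      puts "start  #{token[:name]}  attrs=#{token[:data].inspect}"
    when :EndTag
      puts "end    #{token[:name]}"
    when :Characters, :SpaceCharacters
      puts "text   #{token[:data].inspect}"
    when :Comment, :Doctype
      puts "#{token[:type]}  #{token.inspect}"
    when :ParseError
      puts "error  #{token[:data]}"
    end
  end

The token hashes use the keys visible in the states above (:type, :name, :data, plus :correct, :publicId, and :systemId on doctype tokens), and attributes arrive on start tags as an array of [name, value] pairs.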