spk-html5 0.10.1

Files changed (74)
  1. data/History.txt +10 -0
  2. data/Manifest.txt +73 -0
  3. data/README +45 -0
  4. data/Rakefile.rb +33 -0
  5. data/bin/html5 +7 -0
  6. data/lib/html5.rb +13 -0
  7. data/lib/html5/cli.rb +248 -0
  8. data/lib/html5/constants.rb +1061 -0
  9. data/lib/html5/filters/base.rb +10 -0
  10. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  11. data/lib/html5/filters/iso639codes.rb +755 -0
  12. data/lib/html5/filters/optionaltags.rb +198 -0
  13. data/lib/html5/filters/rfc2046.rb +31 -0
  14. data/lib/html5/filters/rfc3987.rb +91 -0
  15. data/lib/html5/filters/sanitizer.rb +15 -0
  16. data/lib/html5/filters/validator.rb +834 -0
  17. data/lib/html5/filters/whitespace.rb +36 -0
  18. data/lib/html5/html5parser.rb +247 -0
  19. data/lib/html5/html5parser/after_after_body_phase.rb +43 -0
  20. data/lib/html5/html5parser/after_after_frameset_phase.rb +32 -0
  21. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  22. data/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  23. data/lib/html5/html5parser/after_head_phase.rb +55 -0
  24. data/lib/html5/html5parser/before_head_phase.rb +44 -0
  25. data/lib/html5/html5parser/before_html_phase.rb +41 -0
  26. data/lib/html5/html5parser/in_body_phase.rb +636 -0
  27. data/lib/html5/html5parser/in_caption_phase.rb +69 -0
  28. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  29. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  30. data/lib/html5/html5parser/in_foreign_content_phase.rb +50 -0
  31. data/lib/html5/html5parser/in_frameset_phase.rb +56 -0
  32. data/lib/html5/html5parser/in_head_phase.rb +143 -0
  33. data/lib/html5/html5parser/in_row_phase.rb +96 -0
  34. data/lib/html5/html5parser/in_select_phase.rb +90 -0
  35. data/lib/html5/html5parser/in_select_table_phase.rb +35 -0
  36. data/lib/html5/html5parser/in_table_body_phase.rb +92 -0
  37. data/lib/html5/html5parser/in_table_phase.rb +177 -0
  38. data/lib/html5/html5parser/initial_phase.rb +133 -0
  39. data/lib/html5/html5parser/phase.rb +171 -0
  40. data/lib/html5/inputstream.rb +735 -0
  41. data/lib/html5/liberalxmlparser.rb +158 -0
  42. data/lib/html5/sanitizer.rb +209 -0
  43. data/lib/html5/serializer.rb +2 -0
  44. data/lib/html5/serializer/htmlserializer.rb +179 -0
  45. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  46. data/lib/html5/sniffer.rb +45 -0
  47. data/lib/html5/tokenizer.rb +1059 -0
  48. data/lib/html5/treebuilders.rb +24 -0
  49. data/lib/html5/treebuilders/base.rb +339 -0
  50. data/lib/html5/treebuilders/hpricot.rb +231 -0
  51. data/lib/html5/treebuilders/rexml.rb +215 -0
  52. data/lib/html5/treebuilders/simpletree.rb +191 -0
  53. data/lib/html5/treewalkers.rb +26 -0
  54. data/lib/html5/treewalkers/base.rb +162 -0
  55. data/lib/html5/treewalkers/hpricot.rb +48 -0
  56. data/lib/html5/treewalkers/rexml.rb +48 -0
  57. data/lib/html5/treewalkers/simpletree.rb +48 -0
  58. data/lib/html5/version.rb +3 -0
  59. data/test/preamble.rb +69 -0
  60. data/test/test_cli.rb +16 -0
  61. data/test/test_encoding.rb +35 -0
  62. data/test/test_input_stream.rb +26 -0
  63. data/test/test_lxp.rb +283 -0
  64. data/test/test_parser.rb +63 -0
  65. data/test/test_sanitizer.rb +173 -0
  66. data/test/test_serializer.rb +67 -0
  67. data/test/test_sniffer.rb +27 -0
  68. data/test/test_stream.rb +71 -0
  69. data/test/test_tokenizer.rb +95 -0
  70. data/test/test_treewalkers.rb +135 -0
  71. data/test/test_validator.rb +31 -0
  72. data/test/tokenizer_test_parser.rb +67 -0
  73. data/test19.rb +38 -0
  74. metadata +198 -0
data/lib/html5/serializer/xhtmlserializer.rb
@@ -0,0 +1,20 @@
+ require 'html5/serializer/htmlserializer'
+
+ module HTML5
+
+   class XHTMLSerializer < HTMLSerializer
+     DEFAULTS = {
+       :quote_attr_values => true,
+       :minimize_boolean_attributes => false,
+       :use_trailing_solidus => true,
+       :escape_lt_in_attrs => true,
+       :omit_optional_tags => false,
+       :escape_rcdata => true
+     }
+
+     def initialize(options={})
+       super(DEFAULTS.clone.update(options))
+     end
+   end
+
+ end
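
The class above only swaps in XHTML-friendly defaults; the one moving part is DEFAULTS.clone.update(options), which starts from those defaults and lets caller-supplied keys win. A small illustration using nothing but plain Hash methods (the variable names are made up for the example):

    # Illustration of the option merging done in XHTMLSerializer#initialize;
    # Hash#update is an alias for Hash#merge!, so caller options override the defaults.
    defaults = { :quote_attr_values => true, :use_trailing_solidus => true }
    options  = { :use_trailing_solidus => false }
    defaults.clone.update(options)
    # => { :quote_attr_values => true, :use_trailing_solidus => false }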
data/lib/html5/sniffer.rb
@@ -0,0 +1,45 @@
+ module HTML5
+   module Sniffer
+     # 4.7.4
+     def html_or_feed str
+       s = str[0, 512] # steps 1, 2
+       pos = 0
+
+       while pos < s.length
+         case s[pos]
+         when ?\t, ?\ , ?\n, ?\r # 0x09, 0x20, 0x0A, 0x0D == tab, space, LF, CR
+           pos += 1
+         when ?< # 0x3C
+           pos += 1
+           if s[pos..pos+2] == "!--" # [0x21, 0x2D, 0x2D]
+             pos += 3
+             until s[pos..pos+2] == "-->" or pos >= s.length
+               pos += 1
+             end
+             pos += 3
+           elsif s[pos] == ?! # 0x21
+             pos += 1
+             until s[pos] == ?> or pos >= s.length # 0x3E
+               pos += 1
+             end
+             pos += 1
+           elsif s[pos] == ?? # 0x3F
+             until s[pos..pos+1] == "?>" or pos >= s.length # [0x3F, 0x3E]
+               pos += 1
+             end
+             pos += 2
+           elsif s[pos..pos+2] == "rss" # [0x72, 0x73, 0x73]
+             return "application/rss+xml"
+           elsif s[pos..pos+3] == "feed" # [0x66, 0x65, 0x65, 0x64]
+             return "application/atom+xml"
+           elsif s[pos..pos+6] == "rdf:RDF" # [0x72, 0x64, 0x66, 0x3A, 0x52, 0x44, 0x46]
+             raise NotImplementedError
+           end
+         else
+           break
+         end
+       end
+       "text/html"
+     end
+   end
+ end
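
Because Sniffer is a plain, stateless mixin it can be exercised on its own. A minimal sketch, assuming the require path follows the file list above (lib/html5/sniffer.rb):

    require 'html5/sniffer'   # path assumed from the file list above

    sniffer = Object.new.extend(HTML5::Sniffer)
    sniffer.html_or_feed('<!-- c --><rss version="2.0">')   # => "application/rss+xml"
    sniffer.html_or_feed('<!DOCTYPE html><html></html>')    # => "text/html"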
data/lib/html5/tokenizer.rb
@@ -0,0 +1,1059 @@
+ require 'html5/constants'
+ require 'html5/inputstream'
+
+ module HTML5
+
+   # This class takes care of tokenizing HTML.
+   #
+   # * @current_token
+   #   Holds the token that is currently being processed.
+   #
+   # * @state
+   #   Holds a reference to the method to be invoked... XXX
+   #
+   # * @states
+   #   Holds a mapping between states and methods that implement the state.
+   #
+   # * @stream
+   #   Points to HTMLInputStream object.
+
+   class HTMLTokenizer
+     attr_accessor :content_model_flag, :current_token
+     attr_reader :stream
+
+     # XXX need to fix documentation
+
+     def initialize(stream, options = {})
+       @stream = HTMLInputStream.new(stream, options)
+
+       # Setup the initial tokenizer state
+       @content_model_flag = :PCDATA
+       @state = :data_state
+       @escapeFlag = false
+       @lastFourChars = []
+
+       # The current token being created
+       @current_token = nil
+
+       # Tokens to be processed.
+       @token_queue = []
+       @lowercase_element_name = options[:lowercase_element_name] != false
+       @lowercase_attr_name = options[:lowercase_attr_name] != false
+     end
+
+     # This is where the magic happens.
+     #
+     # We do our usually processing through the states and when we have a token
+     # to return we yield the token which pauses processing until the next token
+     # is requested.
+     def each
+       @token_queue = []
+       # Start processing. When EOF is reached @state will return false
+       # instead of true and the loop will terminate.
+       while send @state
+         yield :type => :ParseError, :data => @stream.errors.shift until @stream.errors.empty?
+         yield @token_queue.shift until @token_queue.empty?
+       end
+     end
+
+     # Below are various helper functions the tokenizer states use worked out.
+
+     # If the next character is a '>', convert the current_token into
+     # an EmptyTag
+
+     def process_solidus_in_tag
+
+       # We need to consume another character to make sure it's a ">"
+       data = @stream.char
+       rv = false
+       if @current_token[:type] == :StartTag and data == ">"
+         @current_token[:type] = :EmptyTag
+       elsif data == :EOF
+         @token_queue << ({:type => :ParseError, :data => "eof-following-solidus"})
+         @state = :data_state
+         emit_current_token
+         rv = true
+       else
+         @token_queue << {:type => :ParseError, :data => "incorrectly-placed-solidus"}
+       end
+
+       # The character we just consumed need to be put back on the stack so it
+       # doesn't get lost...
+       @stream.unget(data)
+       rv
+     end
+
+     # This function returns either U+FFFD or the character based on the
+     # decimal or hexadecimal representation. It also discards ";" if present.
+     # If not present @token_queue << {:type => :ParseError}" is invoked.
+
+     def consume_number_entity(isHex)
+
+       # XXX More need to be done here. For instance, #13 should prolly be
+       # converted to #10 so we don't get \r (#13 is \r right?) in the DOM and
+       # such. Thoughts on this appreciated.
+       allowed = DIGITS
+       radix = 10
+       if isHex
+         allowed = HEX_DIGITS
+         radix = 16
+       end
+
+       char_stack = []
+
+       # Consume all the characters that are in range while making sure we
+       # don't hit an EOF.
+       c = @stream.char
+       while allowed.include?(c) and c != :EOF
+         char_stack.push(c)
+         c = @stream.char
+       end
+
+       # Convert the set of characters consumed to an int.
+       charAsInt = char_stack.join('').to_i(radix)
+
+       if charAsInt == 13
+         @token_queue << {:type => :ParseError, :data => "incorrect-cr-newline-entity"}
+         charAsInt = 10
+       elsif (128..159).include? charAsInt
+         # If the integer is between 127 and 160 (so 128 and bigger and 159
+         # and smaller) we need to do the "windows trick".
+         @token_queue << {:type => :ParseError, :data => "illegal-windows-1252-entity"}
+
+         charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128]
+       end
+
+       if 0 < charAsInt && charAsInt <= 1114111 && !(55296 <= charAsInt && charAsInt <= 57343) &&
+          ![0x10FFFF].include?(charAsInt) # TODO add more entity replacements here
+         if String.method_defined? :force_encoding
+           char = charAsInt.chr('utf-8')
+         else
+           char = [charAsInt].pack('U')
+         end
+       else
+         char = [0xFFFD].pack('U')
+         @token_queue << {:type => :ParseError, :data => "cant-convert-numeric-entity", :datavars => {"charAsInt" => charAsInt}}
+       end
+
+       # Discard the ; if present. Otherwise, put it back on the queue and
+       # invoke parse_error on parser.
+       if c != ";"
+         @token_queue << {:type => :ParseError, :data => "numeric-entity-without-semicolon"}
+         @stream.unget(c)
+       end
+
+       return char
+     end
+
+     def consume_entity(allowed_char=nil, from_attribute=false)
+       char = nil
+       char_stack = [@stream.char]
+       if SPACE_CHARACTERS.include?(char_stack[0]) or [:EOF, '<', '&'].include?(char_stack[0]) ||
+          (allowed_char && allowed_char == char_stack[0])
+         @stream.unget(char_stack)
+       elsif char_stack[0] == '#'
+         # We might have a number entity here.
+         char_stack += [@stream.char, @stream.char]
+         if char_stack[0 .. 1].include? :EOF
+           # If we reach the end of the file put everything up to :EOF
+           # back in the queue
+           char_stack = char_stack[0...char_stack.index(:EOF)]
+           @stream.unget(char_stack)
+           @token_queue << {:type => :ParseError, :data => "expected-numeric-entity-but-got-eof"}
+         else
+           if char_stack[1].downcase == "x" and HEX_DIGITS.include? char_stack[2]
+             # Hexadecimal entity detected.
+             @stream.unget(char_stack[2])
+             char = consume_number_entity(true)
+           elsif DIGITS.include? char_stack[1]
+             # Decimal entity detected.
+             @stream.unget(char_stack[1..-1])
+             char = consume_number_entity(false)
+           else
+             # No number entity detected.
+             @stream.unget(char_stack)
+             @token_queue << {:type => :ParseError, :data => "expected-numeric-entity"}
+           end
+         end
+       else
+         # At this point in the process might have named entity. Entities
+         # are stored in the global variable "entities".
+         #
+         # Consume characters and compare to these to a substring of the
+         # entity names in the list until the substring no longer matches.
+         filteredEntityList = ENTITIES.keys
+         filteredEntityList.reject! {|e| e[0].chr != char_stack[0]}
+         entityName = nil
+
+         # Try to find the longest entity the string will match to take care
+         # of &noti for instance.
+         while char_stack.last != :EOF
+           name = char_stack.join('')
+           if filteredEntityList.any? {|e| e[0...name.length] == name}
+             filteredEntityList.reject! {|e| e[0...name.length] != name}
+             char_stack.push(@stream.char)
+           else
+             break
+           end
+
+           if ENTITIES.include? name
+             entityName = name
+             break if entityName[-1] == ';'
+           end
+         end
+
+         if entityName != nil
+           char = ENTITIES[entityName]
+
+           # Check whether or not the last character returned can be
+           # discarded or needs to be put back.
+           if entityName[-1] != ?;
+             @token_queue << {:type => :ParseError, :data => "named-entity-without-semicolon"}
+           end
+
+           if entityName[-1] != ";" and from_attribute and
+              (ASCII_LETTERS.include?(char_stack[entityName.length]) or
+               DIGITS.include?(char_stack[entityName.length]))
+             @stream.unget(char_stack)
+             char = '&'
+           else
+             @stream.unget(char_stack[entityName.length..-1])
+           end
+         else
+           @token_queue << {:type => :ParseError, :data => "expected-named-entity"}
+           @stream.unget(char_stack)
+         end
+       end
+       return char
+     end
+
+     # This method replaces the need for "entityInAttributeValueState".
+     def process_entity_in_attribute allowed_char
+       entity = consume_entity(allowed_char, true)
+       if entity
+         @current_token[:data][-1][1] += entity
+       else
+         @current_token[:data][-1][1] += "&"
+       end
+     end
+
+     # This method is a generic handler for emitting the tags. It also sets
+     # the state to "data" because that's what's needed after a token has been
+     # emitted.
+     def emit_current_token
+       # Add token to the queue to be yielded
+       token = @current_token
+       if [:StartTag, :EndTag, :EmptyTag].include?(token[:type])
+         if @lowercase_element_name
+           token[:name] = token[:name].downcase
+         end
+
+         if token[:type] == :EndTag && token[:self_closing]
+           @token_queue << {:type => :ParseError, :data => "self-closing-end-tag"}
+         end
+         @token_queue << token
+         @state = :data_state
+       end
+
+     end
+
+     # Below are the various tokenizer states worked out.
+
+     # XXX AT Perhaps we should have Hixie run some evaluation on billions of
+     # documents to figure out what the order of the various if and elsif
+     # statements should be.
+     def data_state
+       data = @stream.char
+
+       if @content_model_flag == :CDATA or @content_model_flag == :RCDATA
+         @lastFourChars.shift if @lastFourChars.length == 4
+         @lastFourChars << data
+       end
+
+       if data == "&" and [:PCDATA,:RCDATA].include?(@content_model_flag) and !@escapeFlag
+         @state = :entity_data_state
+       elsif data == "-" && [:CDATA, :RCDATA].include?(@content_model_flag) && !@escapeFlag && @lastFourChars.join('') == "<!--"
+         @escapeFlag = true
+         @token_queue << {:type => :Characters, :data => data}
+       elsif data == "<" and !@escapeFlag and
+             [:PCDATA,:CDATA,:RCDATA].include?(@content_model_flag)
+         @state = :tag_open_state
+       elsif data == ">" and @escapeFlag and
+             [:CDATA,:RCDATA].include?(@content_model_flag) and
+             @lastFourChars[1..-1].join('') == "-->"
+         @escapeFlag = false
+         @token_queue << {:type => :Characters, :data => data}
+
+       elsif data == :EOF
+         # Tokenization ends.
+         return false
+
+       elsif SPACE_CHARACTERS.include? data
+         # Directly after emitting a token you switch back to the "data
+         # state". At that point SPACE_CHARACTERS are important so they are
+         # emitted separately.
+         # XXX need to check if we don't need a special "spaces" flag on
+         # characters.
+         @token_queue << {:type => :SpaceCharacters, :data => data + @stream.chars_until(SPACE_CHARACTERS, true)}
+       else
+         chars = @stream.chars_until(["&", "<", ">", "-"])
+         @token_queue << {:type => :Characters, :data => data + chars}
+         @lastFourChars += (chars[chars.length > 4 ? -4 : -chars.length, 4] || '').scan(/./)
+         @lastFourChars = @lastFourChars[(@lastFourChars.length > 4 ? -4 : -@lastFourChars.length), 4] || []
+       end
+       return true
+     end
+
+     def entity_data_state
+       entity = consume_entity
+       if entity
+         @token_queue << {:type => :Characters, :data => entity}
+       else
+         @token_queue << {:type => :Characters, :data => "&"}
+       end
+       @state = :data_state
+       return true
+     end
+
+     def tag_open_state
+       data = @stream.char
+
+       if @content_model_flag == :PCDATA
+         if data == "!"
+           @state = :markup_declaration_open_state
+         elsif data == "/"
+           @state = :close_tag_open_state
+         elsif data != :EOF and ASCII_LETTERS.include? data
+           @current_token = {:type => :StartTag, :name => data, :data => []}
+           @state = :tag_name_state
+         elsif data == ">"
+           # XXX In theory it could be something besides a tag name. But
+           # do we really care?
+           @token_queue << {:type => :ParseError, :data => "expected-tag-name-but-got-right-bracket"}
+           @token_queue << {:type => :Characters, :data => "<>"}
+           @state = :data_state
+         elsif data == "?"
+           # XXX In theory it could be something besides a tag name. But
+           # do we really care?
+           @token_queue.push({:type => :ParseError, :data => "expected-tag-name-but-got-question-mark"})
+           @stream.unget(data)
+           @state = :bogus_comment_state
+         else
+           # XXX
+           @token_queue << {:type => :ParseError, :data => "expected-tag-name"}
+           @token_queue << {:type => :Characters, :data => "<"}
+           @stream.unget(data)
+           @state = :data_state
+         end
+       else
+         # We know the content model flag is set to either RCDATA or CDATA
+         # now because this state can never be entered with the PLAINTEXT
+         # flag.
+         if data == "/"
+           @state = :close_tag_open_state
+         else
+           @token_queue << {:type => :Characters, :data => "<"}
+           @stream.unget(data)
+           @state = :data_state
+         end
+       end
+       return true
+     end
+
+     def close_tag_open_state
+       if (@content_model_flag == :RCDATA or @content_model_flag == :CDATA)
+         if @current_token
+           char_stack = []
+
+           # So far we know that "</" has been consumed. We now need to know
+           # whether the next few characters match the name of last emitted
+           # start tag which also happens to be the current_token. We also need
+           # to have the character directly after the characters that could
+           # match the start tag name.
+           (@current_token[:name].length + 1).times do
+             char_stack.push(@stream.char)
+             # Make sure we don't get hit by :EOF
+             break if char_stack[-1] == :EOF
+           end
+
+           # Since this is just for checking. We put the characters back on
+           # the stack.
+           @stream.unget(char_stack)
+         end
+
+         if @current_token and
+            @current_token[:name].downcase ==
+            char_stack[0...-1].join('').downcase and
+            (SPACE_CHARACTERS + [">", "/", "<", :EOF]).include? char_stack[-1]
+           # Because the characters are correct we can safely switch to
+           # PCDATA mode now. This also means we don't have to do it when
+           # emitting the end tag token.
+           @content_model_flag = :PCDATA
+         else
+           @token_queue << {:type => :Characters, :data => "</"}
+           @state = :data_state
+
+           # Need to return here since we don't want the rest of the
+           # method to be walked through.
+           return true
+         end
+       end
+
+       data = @stream.char
+       if data == :EOF
+         @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-eof"}
+         @token_queue << {:type => :Characters, :data => "</"}
+         @state = :data_state
+       elsif ASCII_LETTERS.include? data
+         @current_token = {:type => :EndTag, :name => data, :data => []}
+         @state = :tag_name_state
+       elsif data == ">"
+         @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-right-bracket"}
+         @state = :data_state
+       else
+         # XXX data can be _'_...
+         @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-char", :datavars => {:data => data}}
+         @stream.unget(data)
+         @state = :bogus_comment_state
+       end
+
+       return true
+     end
+
+     def tag_name_state
+       data = @stream.char
+       if SPACE_CHARACTERS.include? data
+         @state = :before_attribute_name_state
+       elsif data == :EOF
+         @token_queue << {:type => :ParseError, :data => "eof-in-tag-name"}
+         emit_current_token
+       elsif ASCII_LETTERS.include? data
+         @current_token[:name] += data + @stream.chars_until(ASCII_LETTERS, true)
+       elsif data == ">"
+         emit_current_token
+       elsif data == "/"
+         @state = :self_closing_tag_state
+       else
+         @current_token[:name] += data
+       end
+       return true
+     end
+
+     def before_attribute_name_state
+       data = @stream.char
+       if SPACE_CHARACTERS.include? data
+         @stream.chars_until(SPACE_CHARACTERS, true)
+       elsif data == :EOF
+         @token_queue << {:type => :ParseError, :data => "expected-attribute-name-but-got-eof"}
+         emit_current_token
+       elsif ASCII_LETTERS.include? data
+         @current_token[:data].push([data, ""])
+         @state = :attribute_name_state
+       elsif data == ">"
+         emit_current_token
+       elsif data == "/"
+         @state = :self_closing_tag_state
+       elsif data == "'" || data == '"' || data == "="
+         @token_queue.push({:type => :ParseError, :data => "invalid-character-in-attribute-name"})
+         @current_token[:data].push([data, ""])
+         @state = :attribute_name_state
+       else
+         @current_token[:data].push([data, ""])
+         @state = :attribute_name_state
+       end
+       return true
+     end
+
+     def attribute_name_state
+       data = @stream.char
+       leavingThisState = true
+       emitToken = false
+       if data == "="
+         @state = :before_attribute_value_state
+       elsif data == :EOF
+         @token_queue << {:type => :ParseError, :data => "eof-in-attribute-name"}
+         @state = :data_state
+         emitToken = true
+       elsif ASCII_LETTERS.include? data
+         @current_token[:data][-1][0] += data + @stream.chars_until(ASCII_LETTERS, true)
+         leavingThisState = false
+       elsif data == ">"
+         # XXX If we emit here the attributes are converted to a dict
+         # without being checked and when the code below runs we error
+         # because data is a dict not a list
+         emitToken = true
+       elsif SPACE_CHARACTERS.include? data
+         @state = :after_attribute_name_state
+       elsif data == "/"
+         if !process_solidus_in_tag
+           @state = :before_attribute_name_state
+         end
+       elsif data == "'" or data == '"'
+         @token_queue.push({:type => :ParseError, :data => "invalid-character-in-attribute-name"})
+         @current_token[:data][-1][0] += data
+         leavingThisState = false
+       else
+         @current_token[:data][-1][0] += data
+         leavingThisState = false
+       end
+
+       if leavingThisState
+         # Attributes are not dropped at this stage. That happens when the
+         # start tag token is emitted so values can still be safely appended
+         # to attributes, but we do want to report the parse error in time.
+         if @lowercase_attr_name
+           @current_token[:data][-1][0] = @current_token[:data].last.first.downcase
+         end
+         @current_token[:data][0...-1].each {|name,value|
+           if @current_token[:data].last.first == name
+             @token_queue << {:type => :ParseError, :data => "duplicate-attribute"}
+             break # don't report an error more than once
+           end
+         }
+         # XXX Fix for above XXX
+         emit_current_token if emitToken
+       end
+       return true
+     end
+
+     def after_attribute_name_state
+       data = @stream.char
+       if SPACE_CHARACTERS.include? data
+         @stream.chars_until(SPACE_CHARACTERS, true)
+       elsif data == "="
+         @state = :before_attribute_value_state
+       elsif data == ">"
+         emit_current_token
+       elsif data == :EOF
+         @token_queue << {:type => :ParseError, :data => "expected-end-of-tag-but-got-eof"}
+         emit_current_token
+       elsif ASCII_LETTERS.include? data
+         @current_token[:data].push([data, ""])
+         @state = :attribute_name_state
+       elsif data == "/"
+         @state = :self_closing_tag_state
+       else
+         @current_token[:data].push([data, ""])
+         @state = :attribute_name_state
+       end
+       return true
+     end
+
+     def before_attribute_value_state
+       data = @stream.char
+       if SPACE_CHARACTERS.include? data
+         @stream.chars_until(SPACE_CHARACTERS, true)
+       elsif data == "\""
+         @state = :attribute_value_double_quoted_state
+       elsif data == "&"
+         @state = :attribute_value_unquoted_state
+         @stream.unget(data);
+       elsif data == "'"
+         @state = :attribute_value_single_quoted_state
+       elsif data == ">"
+         emit_current_token
+       elsif data == "="
+         @token_queue.push({:type => :ParseError, :data => "equals-in-unquoted-attribute-value"})
+         @current_token[:data][-1][1] += data
+         @state = :attribute_value_unquoted_state
+       elsif data == :EOF
+         @token_queue << {:type => :ParseError, :data => "expected-attribute-value-but-got-eof"}
+         emit_current_token
+       else
+         @current_token[:data][-1][1] += data
+         @state = :attribute_value_unquoted_state
+       end
+       return true
+     end
+
+     def attribute_value_double_quoted_state
+       data = @stream.char
+       if data == "\""
+         @state = :after_attribute_value_state
+       elsif data == "&"
+         process_entity_in_attribute('"')
+       elsif data == :EOF
+         @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-double-quote"}
+         emit_current_token
+       else
+         @current_token[:data][-1][1] += data + @stream.chars_until(["\"", "&"])
+       end
+       return true
+     end
+
+     def attribute_value_single_quoted_state
+       data = @stream.char
+       if data == "'"
+         @state = :after_attribute_value_state
+       elsif data == "&"
+         process_entity_in_attribute("'")
+       elsif data == :EOF
+         @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-single-quote"}
+         emit_current_token
+       else
+         @current_token[:data][-1][1] += data + @stream.chars_until(["'", "&"])
+       end
+       return true
+     end
+
+     def attribute_value_unquoted_state
+       data = @stream.char
+       if SPACE_CHARACTERS.include? data
+         @state = :before_attribute_name_state
+       elsif data == "&"
+         process_entity_in_attribute ''
+       elsif data == ">"
+         emit_current_token
+       elsif data == '"' || data == "'" || data == "="
+         @token_queue.push({:type => :ParseError, :data => "unexpected-character-in-unquoted-attribute-value"})
+         @current_token[:data][-1][1] += data
+       elsif data == :EOF
+         @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-no-quotes"}
+         emit_current_token
+       else
+         @current_token[:data][-1][1] += data + @stream.chars_until(["&", ">","<"] + SPACE_CHARACTERS)
+       end
+       return true
+     end
+
+     def after_attribute_value_state
+       data = self.stream.char()
+       if SPACE_CHARACTERS.include? data
+         @state = :before_attribute_name_state
+       elsif data == ">"
+         emit_current_token
+         @state = :data_state
+       elsif data == "/"
+         @state = :self_closing_tag_state
+       elsif data == :EOF
+         @token_queue << {:type => :ParseError, :data => "unexpected-EOF-after-attribute-value"}
+         emit_current_token
+         @stream.unget(data)
+         @state = :data_state
+       else
+         @token_queue.push({:type => :ParseError, :data => "unexpected-character-after-attribute-value"})
+         @stream.unget(data)
+         @state = :before_attribute_name_state
+       end
+       true
+     end
+
+     def self_closing_tag_state
+       c = @stream.char
+       case c
+       when ">"
+         @current_token[:self_closing] = true
+         emit_current_token
+         @state = :data_state
+       when :EOF
+         @token_queue << {:type => :ParseError, :data => "eof-in-tag-name"}
+         @stream.unget(c)
+         @state = :data_state
+       else
+         @token_queue << {:type => :ParseError, :data => "expected-self-closing-tag"}
+         @stream.unget(c)
+         @state = :before_attribute_name_state
+       end
+       true
+     end
+
+     def bogus_comment_state
+       # Make a new comment token and give it as value all the characters
+       # until the first > or :EOF (chars_until checks for :EOF automatically)
+       # and emit it.
+       @token_queue << {:type => :Comment, :data => @stream.chars_until([">"])}
+
+       # Eat the character directly after the bogus comment which is either a
+       # ">" or an :EOF.
+       @stream.char
+       @state = :data_state
+       return true
+     end
+
+     def markup_declaration_open_state
+       char_stack = [@stream.char, @stream.char]
+       if char_stack == ["-", "-"]
+         @current_token = {:type => :Comment, :data => ""}
+         @state = :comment_start_state
+       else
+         5.times { char_stack.push(@stream.char) }
+         # Put in explicit :EOF check
+         if !char_stack.include?(:EOF) && char_stack.join("").upcase == "DOCTYPE"
+           @current_token = {:type => :Doctype, :name => "", :publicId => nil, :systemId => nil, :correct => true}
+           @state = :doctype_state
+         else
+           @token_queue << {:type => :ParseError, :data => "expected-dashes-or-doctype"}
+           @stream.unget(char_stack)
+           @state = :bogus_comment_state
+         end
+       end
+       return true
+     end
+
+     def comment_start_state
+       data = @stream.char
+       if data == "-"
+         @state = :comment_start_dash_state
+       elsif data == ">"
+         @token_queue << {:type => :ParseError, :data => "incorrect-comment"}
+         @token_queue << @current_token
+         @state = :data_state
+       elsif data == :EOF
+         @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
+         @token_queue << @current_token
+         @state = :data_state
+       else
+         @current_token[:data] += data + @stream.chars_until("-")
+         @state = :comment_state
+       end
+       return true
+     end
+
+     def comment_start_dash_state
+       data = @stream.char
+       if data == "-"
+         @state = :comment_end_state
+       elsif data == ">"
+         @token_queue << {:type => :ParseError, :data => "incorrect-comment"}
+         @token_queue << @current_token
+         @state = :data_state
+       elsif data == :EOF
+         @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
+         @token_queue << @current_token
+         @state = :data_state
+       else
+         @current_token[:data] += '-' + data + @stream.chars_until("-")
+         @state = :comment_state
+       end
+       return true
+     end
+
+     def comment_state
+       data = @stream.char
+       if data == "-"
+         @state = :comment_end_dash_state
+       elsif data == :EOF
+         @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
+         @token_queue << @current_token
+         @state = :data_state
+       else
+         @current_token[:data] += data + @stream.chars_until("-")
+       end
+       return true
+     end
+
+     def comment_end_dash_state
+       data = @stream.char
+       if data == "-"
+         @state = :comment_end_state
+       elsif data == :EOF
+         @token_queue << {:type => :ParseError, :data => "eof-in-comment-end-dash"}
+         @token_queue << @current_token
+         @state = :data_state
+       else
+         @current_token[:data] += "-" + data + @stream.chars_until("-")
+         # Consume the next character which is either a "-" or an :EOF as
+         # well so if there's a "-" directly after the "-" we go nicely to
+         # the "comment end state" without emitting a ParseError there.
+         @stream.char
+       end
+       return true
+     end
+
+     def comment_end_state
+       data = @stream.char
+       if data == ">"
+         @token_queue << @current_token
+         @state = :data_state
+       elsif data == "-"
+         @token_queue << {:type => :ParseError, :data => "unexpected-dash-after-double-dash-in-comment"}
+         @current_token[:data] += data
+       elsif data == :EOF
+         @token_queue << {:type => :ParseError, :data => "eof-in-comment-double-dash"}
+         @token_queue << @current_token
+         @state = :data_state
+       else
+         # XXX
+         @token_queue << {:type => :ParseError, :data => "unexpected-char-in-comment"}
+         @current_token[:data] += "--" + data
+         @state = :comment_state
+       end
+       return true
+     end
+
+     def doctype_state
+       data = @stream.char
+       if SPACE_CHARACTERS.include? data
+         @state = :before_doctype_name_state
+       else
+         @token_queue << {:type => :ParseError, :data => "need-space-after-doctype"}
+         @stream.unget(data)
+         @state = :before_doctype_name_state
+       end
+       return true
+     end
+
+     def before_doctype_name_state
+       data = @stream.char
+       if SPACE_CHARACTERS.include? data
+       elsif data == ">"
+         @token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-right-bracket"}
+         @current_token[:correct] = false
+         @token_queue << @current_token
+         @state = :data_state
+       elsif data == :EOF
+         @token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-eof"}
+         @current_token[:correct] = false
+         @token_queue << @current_token
+         @state = :data_state
+       else
+         @current_token[:name] = data
+         @state = :doctype_name_state
+       end
+       return true
+     end
+
+     def doctype_name_state
+       data = @stream.char
+       if SPACE_CHARACTERS.include? data
+         @state = :after_doctype_name_state
+       elsif data == ">"
+         @token_queue << @current_token
+         @state = :data_state
+       elsif data == :EOF
+         @token_queue << {:type => :ParseError, :data => "eof-in-doctype-name"}
+         @current_token[:correct] = false
+         @token_queue << @current_token
+         @state = :data_state
+       else
+         @current_token[:name] += data
+       end
+
+       return true
+     end
+
+     def after_doctype_name_state
+       data = @stream.char
+       if SPACE_CHARACTERS.include? data
+       elsif data == ">"
+         @token_queue << @current_token
+         @state = :data_state
+       elsif data == :EOF
+         @current_token[:correct] = false
+         @stream.unget(data)
+         @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
+         @token_queue << @current_token
+         @state = :data_state
+       else
+         char_stack = [data]
+         5.times { char_stack << stream.char }
+         token = char_stack.join('').tr(ASCII_UPPERCASE,ASCII_LOWERCASE)
+         if token == "public" and !char_stack.include?(:EOF)
+           @state = :before_doctype_public_identifier_state
+         elsif token == "system" and !char_stack.include?(:EOF)
+           @state = :before_doctype_system_identifier_state
+         else
+           @stream.unget(char_stack)
+           @token_queue << {:type => :ParseError, :data => "expected-space-or-right-bracket-in-doctype", "datavars" => {"data" => data}}
+           @current_token[:correct] = false
+           @state = :bogus_doctype_state
+         end
+       end
+       return true
+     end
+
+     def before_doctype_public_identifier_state
+       data = @stream.char
+
+       if SPACE_CHARACTERS.include?(data)
+       elsif data == "\""
+         @current_token[:publicId] = ""
+         @state = :doctype_public_identifier_double_quoted_state
+       elsif data == "'"
+         @current_token[:publicId] = ""
+         @state = :doctype_public_identifier_single_quoted_state
+       elsif data == ">"
+         @token_queue << {:type => :ParseError, :data => "unexpected-end-of-doctype"}
+         @current_token[:correct] = false
+         @token_queue << @current_token
+         @state = :data_state
+       elsif data == :EOF
+         @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
+         @current_token[:correct] = false
+         @token_queue << @current_token
+         @state = :data_state
+       else
+         @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
+         @current_token[:correct] = false
+         @state = :bogus_doctype_state
+       end
+
+       return true
+     end
+
+     def doctype_public_identifier_double_quoted_state
+       data = @stream.char
+       if data == "\""
+         @state = :after_doctype_public_identifier_state
+       elsif data == ">"
+         @token_queue << {:type => :ParseError, :data => "unexpected-end-of-doctype"}
+         @current_token[:correct] = false
+         @token_queue << @current_token
+         @state = :data_state
+       elsif data == :EOF
+         @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
+         @current_token[:correct] = false
+         @token_queue << @current_token
+         @state = :data_state
+       else
+         @current_token[:publicId] += data
+       end
+       return true
+     end
+
+     def doctype_public_identifier_single_quoted_state
+       data = @stream.char
+       if data == "'"
+         @state = :after_doctype_public_identifier_state
+       elsif data == ">"
+         @token_queue << {:type => :ParseError, :data => "unexpected-end-of-doctype"}
+         @current_token[:correct] = false
+         @token_queue << @current_token
+         @state = :data_state
+       elsif data == :EOF
+         @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
+         @current_token[:correct] = false
+         @token_queue << @current_token
+         @state = :data_state
+       else
+         @current_token[:publicId] += data
+       end
+       return true
+     end
+
+     def after_doctype_public_identifier_state
+       data = @stream.char
+       if SPACE_CHARACTERS.include?(data)
+       elsif data == "\""
+         @current_token[:systemId] = ""
+         @state = :doctype_system_identifier_double_quoted_state
+       elsif data == "'"
+         @current_token[:systemId] = ""
+         @state = :doctype_system_identifier_single_quoted_state
+       elsif data == ">"
+         @token_queue << @current_token
+         @state = :data_state
+       elsif data == :EOF
+         @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
+         @current_token[:correct] = false
+         @token_queue << @current_token
+         @state = :data_state
+       else
+         @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
+         @current_token[:correct] = false
+         @state = :bogus_doctype_state
+       end
+       return true
+     end
+
+     def before_doctype_system_identifier_state
+       data = @stream.char
+       if SPACE_CHARACTERS.include?(data)
+       elsif data == "\""
+         @current_token[:systemId] = ""
+         @state = :doctype_system_identifier_double_quoted_state
+       elsif data == "'"
+         @current_token[:systemId] = ""
+         @state = :doctype_system_identifier_single_quoted_state
+       elsif data == ">"
+         @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
+         @current_token[:correct] = false
+         @token_queue << @current_token
+         @state = :data_state
+       elsif data == :EOF
+         @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
+         @current_token[:correct] = false
+         @token_queue << @current_token
+         @state = :data_state
+       else
+         @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
+         @current_token[:correct] = false
+         @state = :bogus_doctype_state
+       end
+       return true
+     end
+
+     def doctype_system_identifier_double_quoted_state
+       data = @stream.char
+       if data == "\""
+         @state = :after_doctype_system_identifier_state
+       elsif data == ">"
+         @token_queue << {:type => :ParseError, :data => "unexpected-end-of-doctype"}
+         @current_token[:correct] = false
+         @token_queue << @current_token
+         @state = :data_state
+       elsif data == :EOF
+         @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
+         @current_token[:correct] = false
+         @token_queue << @current_token
+         @state = :data_state
+       else
+         @current_token[:systemId] += data
+       end
+       return true
+     end
+
+     def doctype_system_identifier_single_quoted_state
+       data = @stream.char
+       if data == "'"
+         @state = :after_doctype_system_identifier_state
+       elsif data == ">"
+         @token_queue << {:type => :ParseError, :data => "unexpected-end-of-doctype"}
+         @current_token[:correct] = false
+         @token_queue << @current_token
+         @state = :data_state
+       elsif data == :EOF
+         @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
+         @current_token[:correct] = false
+         @token_queue << @current_token
+         @state = :data_state
+       else
+         @current_token[:systemId] += data
+       end
+       return true
+     end
+
+     def after_doctype_system_identifier_state
+       data = @stream.char
+       if SPACE_CHARACTERS.include?(data)
+       elsif data == ">"
+         @token_queue << @current_token
+         @state = :data_state
+       elsif data == :EOF
+         @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
+         @current_token[:correct] = false
+         @token_queue << @current_token
+         @state = :data_state
+       else
+         @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
+         @state = :bogus_doctype_state
+       end
+       return true
+     end
+
+     def bogus_doctype_state
+       data = @stream.char
+       if data == ">"
+         @token_queue << @current_token
+         @state = :data_state
+       elsif data == :EOF
+         @stream.unget(data)
+         @token_queue << @current_token
+         @state = :data_state
+       end
+       return true
+     end
+
+   end
+
+ end
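
The tokenizer is driven through each, as the class comment at the top of the file describes. A minimal sketch, assuming HTMLInputStream (not shown in this hunk) accepts a plain Ruby String and that the require path follows the file list:

    require 'html5/tokenizer'   # path assumed from the file list above

    HTML5::HTMLTokenizer.new("<p class='x'>Hi &amp; bye</p>").each do |token|
      # Tokens are plain Hashes built by the states above, for example:
      #   {:type => :StartTag, :name => "p", :data => [["class", "x"]]}
      #   {:type => :Characters, :data => "Hi "}
      #   {:type => :EndTag, :name => "p", :data => []}
      p token
    end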