spk-html5 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. data/History.txt +10 -0
  2. data/Manifest.txt +73 -0
  3. data/README +45 -0
  4. data/Rakefile.rb +33 -0
  5. data/bin/html5 +7 -0
  6. data/lib/html5.rb +13 -0
  7. data/lib/html5/cli.rb +248 -0
  8. data/lib/html5/constants.rb +1061 -0
  9. data/lib/html5/filters/base.rb +10 -0
  10. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  11. data/lib/html5/filters/iso639codes.rb +755 -0
  12. data/lib/html5/filters/optionaltags.rb +198 -0
  13. data/lib/html5/filters/rfc2046.rb +31 -0
  14. data/lib/html5/filters/rfc3987.rb +91 -0
  15. data/lib/html5/filters/sanitizer.rb +15 -0
  16. data/lib/html5/filters/validator.rb +834 -0
  17. data/lib/html5/filters/whitespace.rb +36 -0
  18. data/lib/html5/html5parser.rb +247 -0
  19. data/lib/html5/html5parser/after_after_body_phase.rb +43 -0
  20. data/lib/html5/html5parser/after_after_frameset_phase.rb +32 -0
  21. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  22. data/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  23. data/lib/html5/html5parser/after_head_phase.rb +55 -0
  24. data/lib/html5/html5parser/before_head_phase.rb +44 -0
  25. data/lib/html5/html5parser/before_html_phase.rb +41 -0
  26. data/lib/html5/html5parser/in_body_phase.rb +636 -0
  27. data/lib/html5/html5parser/in_caption_phase.rb +69 -0
  28. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  29. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  30. data/lib/html5/html5parser/in_foreign_content_phase.rb +50 -0
  31. data/lib/html5/html5parser/in_frameset_phase.rb +56 -0
  32. data/lib/html5/html5parser/in_head_phase.rb +143 -0
  33. data/lib/html5/html5parser/in_row_phase.rb +96 -0
  34. data/lib/html5/html5parser/in_select_phase.rb +90 -0
  35. data/lib/html5/html5parser/in_select_table_phase.rb +35 -0
  36. data/lib/html5/html5parser/in_table_body_phase.rb +92 -0
  37. data/lib/html5/html5parser/in_table_phase.rb +177 -0
  38. data/lib/html5/html5parser/initial_phase.rb +133 -0
  39. data/lib/html5/html5parser/phase.rb +171 -0
  40. data/lib/html5/inputstream.rb +735 -0
  41. data/lib/html5/liberalxmlparser.rb +158 -0
  42. data/lib/html5/sanitizer.rb +209 -0
  43. data/lib/html5/serializer.rb +2 -0
  44. data/lib/html5/serializer/htmlserializer.rb +179 -0
  45. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  46. data/lib/html5/sniffer.rb +45 -0
  47. data/lib/html5/tokenizer.rb +1059 -0
  48. data/lib/html5/treebuilders.rb +24 -0
  49. data/lib/html5/treebuilders/base.rb +339 -0
  50. data/lib/html5/treebuilders/hpricot.rb +231 -0
  51. data/lib/html5/treebuilders/rexml.rb +215 -0
  52. data/lib/html5/treebuilders/simpletree.rb +191 -0
  53. data/lib/html5/treewalkers.rb +26 -0
  54. data/lib/html5/treewalkers/base.rb +162 -0
  55. data/lib/html5/treewalkers/hpricot.rb +48 -0
  56. data/lib/html5/treewalkers/rexml.rb +48 -0
  57. data/lib/html5/treewalkers/simpletree.rb +48 -0
  58. data/lib/html5/version.rb +3 -0
  59. data/test/preamble.rb +69 -0
  60. data/test/test_cli.rb +16 -0
  61. data/test/test_encoding.rb +35 -0
  62. data/test/test_input_stream.rb +26 -0
  63. data/test/test_lxp.rb +283 -0
  64. data/test/test_parser.rb +63 -0
  65. data/test/test_sanitizer.rb +173 -0
  66. data/test/test_serializer.rb +67 -0
  67. data/test/test_sniffer.rb +27 -0
  68. data/test/test_stream.rb +71 -0
  69. data/test/test_tokenizer.rb +95 -0
  70. data/test/test_treewalkers.rb +135 -0
  71. data/test/test_validator.rb +31 -0
  72. data/test/tokenizer_test_parser.rb +67 -0
  73. data/test19.rb +38 -0
  74. metadata +198 -0
@@ -0,0 +1,20 @@
1
+ require 'html5/serializer/htmlserializer'
2
+
3
+ module HTML5
4
+
5
+ class XHTMLSerializer < HTMLSerializer
6
+ DEFAULTS = {
7
+ :quote_attr_values => true,
8
+ :minimize_boolean_attributes => false,
9
+ :use_trailing_solidus => true,
10
+ :escape_lt_in_attrs => true,
11
+ :omit_optional_tags => false,
12
+ :escape_rcdata => true
13
+ }
14
+
15
+ def initialize(options={})
16
+ super(DEFAULTS.clone.update(options))
17
+ end
18
+ end
19
+
20
+ end
@@ -0,0 +1,45 @@
1
+ module HTML5
2
+ module Sniffer
3
+ # 4.7.4
4
+ def html_or_feed str
5
+ s = str[0, 512] # steps 1, 2
6
+ pos = 0
7
+
8
+ while pos < s.length
9
+ case s[pos]
10
+ when ?\t, ?\ , ?\n, ?\r # 0x09, 0x20, 0x0A, 0x0D == tab, space, LF, CR
11
+ pos += 1
12
+ when ?< # 0x3C
13
+ pos += 1
14
+ if s[pos..pos+2] == "!--" # [0x21, 0x2D, 0x2D]
15
+ pos += 3
16
+ until s[pos..pos+2] == "-->" or pos >= s.length
17
+ pos += 1
18
+ end
19
+ pos += 3
20
+ elsif s[pos] == ?! # 0x21
21
+ pos += 1
22
+ until s[pos] == ?> or pos >= s.length # 0x3E
23
+ pos += 1
24
+ end
25
+ pos += 1
26
+ elsif s[pos] == ?? # 0x3F
27
+ until s[pos..pos+1] == "?>" or pos >= s.length # [0x3F, 0x3E]
28
+ pos += 1
29
+ end
30
+ pos += 2
31
+ elsif s[pos..pos+2] == "rss" # [0x72, 0x73, 0x73]
32
+ return "application/rss+xml"
33
+ elsif s[pos..pos+3] == "feed" # [0x66, 0x65, 0x65, 0x64]
34
+ return "application/atom+xml"
35
+ elsif s[pos..pos+6] == "rdf:RDF" # [0x72, 0x64, 0x66, 0x3A, 0x52, 0x44, 0x46]
36
+ raise NotImplementedError
37
+ end
38
+ else
39
+ break
40
+ end
41
+ end
42
+ "text/html"
43
+ end
44
+ end
45
+ end
@@ -0,0 +1,1059 @@
1
+ require 'html5/constants'
2
+ require 'html5/inputstream'
3
+
4
+ module HTML5
5
+
6
+ # This class takes care of tokenizing HTML.
7
+ #
8
+ # * @current_token
9
+ # Holds the token that is currently being processed.
10
+ #
11
+ # * @state
12
+ # Holds a reference to the method to be invoked... XXX
13
+ #
14
+ # * @states
15
+ # Holds a mapping between states and methods that implement the state.
16
+ #
17
+ # * @stream
18
+ # Points to HTMLInputStream object.
19
+
20
+ class HTMLTokenizer
21
+ attr_accessor :content_model_flag, :current_token
22
+ attr_reader :stream
23
+
24
+ # XXX need to fix documentation
25
+
26
+ def initialize(stream, options = {})
27
+ @stream = HTMLInputStream.new(stream, options)
28
+
29
+ # Setup the initial tokenizer state
30
+ @content_model_flag = :PCDATA
31
+ @state = :data_state
32
+ @escapeFlag = false
33
+ @lastFourChars = []
34
+
35
+ # The current token being created
36
+ @current_token = nil
37
+
38
+ # Tokens to be processed.
39
+ @token_queue = []
40
+ @lowercase_element_name = options[:lowercase_element_name] != false
41
+ @lowercase_attr_name = options[:lowercase_attr_name] != false
42
+ end
43
+
44
+ # This is where the magic happens.
45
+ #
46
+ # We do our usually processing through the states and when we have a token
47
+ # to return we yield the token which pauses processing until the next token
48
+ # is requested.
49
+ def each
50
+ @token_queue = []
51
+ # Start processing. When EOF is reached @state will return false
52
+ # instead of true and the loop will terminate.
53
+ while send @state
54
+ yield :type => :ParseError, :data => @stream.errors.shift until @stream.errors.empty?
55
+ yield @token_queue.shift until @token_queue.empty?
56
+ end
57
+ end
58
+
59
+ # Below are various helper functions the tokenizer states use worked out.
60
+
61
+ # If the next character is a '>', convert the current_token into
62
+ # an EmptyTag
63
+
64
+ def process_solidus_in_tag
65
+
66
+ # We need to consume another character to make sure it's a ">"
67
+ data = @stream.char
68
+ rv = false
69
+ if @current_token[:type] == :StartTag and data == ">"
70
+ @current_token[:type] = :EmptyTag
71
+ elsif data == :EOF
72
+ @token_queue << ({:type => :ParseError, :data => "eof-following-solidus"})
73
+ @state = :data_state
74
+ emit_current_token
75
+ rv = true
76
+ else
77
+ @token_queue << {:type => :ParseError, :data => "incorrectly-placed-solidus"}
78
+ end
79
+
80
+ # The character we just consumed need to be put back on the stack so it
81
+ # doesn't get lost...
82
+ @stream.unget(data)
83
+ rv
84
+ end
85
+
86
+ # This function returns either U+FFFD or the character based on the
87
+ # decimal or hexadecimal representation. It also discards ";" if present.
88
+ # If not present @token_queue << {:type => :ParseError}" is invoked.
89
+
90
+ def consume_number_entity(isHex)
91
+
92
+ # XXX More need to be done here. For instance, #13 should prolly be
93
+ # converted to #10 so we don't get \r (#13 is \r right?) in the DOM and
94
+ # such. Thoughts on this appreciated.
95
+ allowed = DIGITS
96
+ radix = 10
97
+ if isHex
98
+ allowed = HEX_DIGITS
99
+ radix = 16
100
+ end
101
+
102
+ char_stack = []
103
+
104
+ # Consume all the characters that are in range while making sure we
105
+ # don't hit an EOF.
106
+ c = @stream.char
107
+ while allowed.include?(c) and c != :EOF
108
+ char_stack.push(c)
109
+ c = @stream.char
110
+ end
111
+
112
+ # Convert the set of characters consumed to an int.
113
+ charAsInt = char_stack.join('').to_i(radix)
114
+
115
+ if charAsInt == 13
116
+ @token_queue << {:type => :ParseError, :data => "incorrect-cr-newline-entity"}
117
+ charAsInt = 10
118
+ elsif (128..159).include? charAsInt
119
+ # If the integer is between 127 and 160 (so 128 and bigger and 159
120
+ # and smaller) we need to do the "windows trick".
121
+ @token_queue << {:type => :ParseError, :data => "illegal-windows-1252-entity"}
122
+
123
+ charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128]
124
+ end
125
+
126
+ if 0 < charAsInt && charAsInt <= 1114111 && !(55296 <= charAsInt && charAsInt <= 57343) &&
127
+ ![0x10FFFF].include?(charAsInt) # TODO add more entity replacements here
128
+ if String.method_defined? :force_encoding
129
+ char = charAsInt.chr('utf-8')
130
+ else
131
+ char = [charAsInt].pack('U')
132
+ end
133
+ else
134
+ char = [0xFFFD].pack('U')
135
+ @token_queue << {:type => :ParseError, :data => "cant-convert-numeric-entity", :datavars => {"charAsInt" => charAsInt}}
136
+ end
137
+
138
+ # Discard the ; if present. Otherwise, put it back on the queue and
139
+ # invoke parse_error on parser.
140
+ if c != ";"
141
+ @token_queue << {:type => :ParseError, :data => "numeric-entity-without-semicolon"}
142
+ @stream.unget(c)
143
+ end
144
+
145
+ return char
146
+ end
147
+
148
+ def consume_entity(allowed_char=nil, from_attribute=false)
149
+ char = nil
150
+ char_stack = [@stream.char]
151
+ if SPACE_CHARACTERS.include?(char_stack[0]) or [:EOF, '<', '&'].include?(char_stack[0]) ||
152
+ (allowed_char && allowed_char == char_stack[0])
153
+ @stream.unget(char_stack)
154
+ elsif char_stack[0] == '#'
155
+ # We might have a number entity here.
156
+ char_stack += [@stream.char, @stream.char]
157
+ if char_stack[0 .. 1].include? :EOF
158
+ # If we reach the end of the file put everything up to :EOF
159
+ # back in the queue
160
+ char_stack = char_stack[0...char_stack.index(:EOF)]
161
+ @stream.unget(char_stack)
162
+ @token_queue << {:type => :ParseError, :data => "expected-numeric-entity-but-got-eof"}
163
+ else
164
+ if char_stack[1].downcase == "x" and HEX_DIGITS.include? char_stack[2]
165
+ # Hexadecimal entity detected.
166
+ @stream.unget(char_stack[2])
167
+ char = consume_number_entity(true)
168
+ elsif DIGITS.include? char_stack[1]
169
+ # Decimal entity detected.
170
+ @stream.unget(char_stack[1..-1])
171
+ char = consume_number_entity(false)
172
+ else
173
+ # No number entity detected.
174
+ @stream.unget(char_stack)
175
+ @token_queue << {:type => :ParseError, :data => "expected-numeric-entity"}
176
+ end
177
+ end
178
+ else
179
+ # At this point in the process might have named entity. Entities
180
+ # are stored in the global variable "entities".
181
+ #
182
+ # Consume characters and compare to these to a substring of the
183
+ # entity names in the list until the substring no longer matches.
184
+ filteredEntityList = ENTITIES.keys
185
+ filteredEntityList.reject! {|e| e[0].chr != char_stack[0]}
186
+ entityName = nil
187
+
188
+ # Try to find the longest entity the string will match to take care
189
+ # of &noti for instance.
190
+ while char_stack.last != :EOF
191
+ name = char_stack.join('')
192
+ if filteredEntityList.any? {|e| e[0...name.length] == name}
193
+ filteredEntityList.reject! {|e| e[0...name.length] != name}
194
+ char_stack.push(@stream.char)
195
+ else
196
+ break
197
+ end
198
+
199
+ if ENTITIES.include? name
200
+ entityName = name
201
+ break if entityName[-1] == ';'
202
+ end
203
+ end
204
+
205
+ if entityName != nil
206
+ char = ENTITIES[entityName]
207
+
208
+ # Check whether or not the last character returned can be
209
+ # discarded or needs to be put back.
210
+ if entityName[-1] != ?;
211
+ @token_queue << {:type => :ParseError, :data => "named-entity-without-semicolon"}
212
+ end
213
+
214
+ if entityName[-1] != ";" and from_attribute and
215
+ (ASCII_LETTERS.include?(char_stack[entityName.length]) or
216
+ DIGITS.include?(char_stack[entityName.length]))
217
+ @stream.unget(char_stack)
218
+ char = '&'
219
+ else
220
+ @stream.unget(char_stack[entityName.length..-1])
221
+ end
222
+ else
223
+ @token_queue << {:type => :ParseError, :data => "expected-named-entity"}
224
+ @stream.unget(char_stack)
225
+ end
226
+ end
227
+ return char
228
+ end
229
+
230
+ # This method replaces the need for "entityInAttributeValueState".
231
+ def process_entity_in_attribute allowed_char
232
+ entity = consume_entity(allowed_char, true)
233
+ if entity
234
+ @current_token[:data][-1][1] += entity
235
+ else
236
+ @current_token[:data][-1][1] += "&"
237
+ end
238
+ end
239
+
240
+ # This method is a generic handler for emitting the tags. It also sets
241
+ # the state to "data" because that's what's needed after a token has been
242
+ # emitted.
243
+ def emit_current_token
244
+ # Add token to the queue to be yielded
245
+ token = @current_token
246
+ if [:StartTag, :EndTag, :EmptyTag].include?(token[:type])
247
+ if @lowercase_element_name
248
+ token[:name] = token[:name].downcase
249
+ end
250
+
251
+ if token[:type] == :EndTag && token[:self_closing]
252
+ @token_queue << {:type => :ParseError, :data => "self-closing-end-tag"}
253
+ end
254
+ @token_queue << token
255
+ @state = :data_state
256
+ end
257
+
258
+ end
259
+
260
+ # Below are the various tokenizer states worked out.
261
+
262
+ # XXX AT Perhaps we should have Hixie run some evaluation on billions of
263
+ # documents to figure out what the order of the various if and elsif
264
+ # statements should be.
265
+ def data_state
266
+ data = @stream.char
267
+
268
+ if @content_model_flag == :CDATA or @content_model_flag == :RCDATA
269
+ @lastFourChars.shift if @lastFourChars.length == 4
270
+ @lastFourChars << data
271
+ end
272
+
273
+ if data == "&" and [:PCDATA,:RCDATA].include?(@content_model_flag) and !@escapeFlag
274
+ @state = :entity_data_state
275
+ elsif data == "-" && [:CDATA, :RCDATA].include?(@content_model_flag) && !@escapeFlag && @lastFourChars.join('') == "<!--"
276
+ @escapeFlag = true
277
+ @token_queue << {:type => :Characters, :data => data}
278
+ elsif data == "<" and !@escapeFlag and
279
+ [:PCDATA,:CDATA,:RCDATA].include?(@content_model_flag)
280
+ @state = :tag_open_state
281
+ elsif data == ">" and @escapeFlag and
282
+ [:CDATA,:RCDATA].include?(@content_model_flag) and
283
+ @lastFourChars[1..-1].join('') == "-->"
284
+ @escapeFlag = false
285
+ @token_queue << {:type => :Characters, :data => data}
286
+
287
+ elsif data == :EOF
288
+ # Tokenization ends.
289
+ return false
290
+
291
+ elsif SPACE_CHARACTERS.include? data
292
+ # Directly after emitting a token you switch back to the "data
293
+ # state". At that point SPACE_CHARACTERS are important so they are
294
+ # emitted separately.
295
+ # XXX need to check if we don't need a special "spaces" flag on
296
+ # characters.
297
+ @token_queue << {:type => :SpaceCharacters, :data => data + @stream.chars_until(SPACE_CHARACTERS, true)}
298
+ else
299
+ chars = @stream.chars_until(["&", "<", ">", "-"])
300
+ @token_queue << {:type => :Characters, :data => data + chars}
301
+ @lastFourChars += (chars[chars.length > 4 ? -4 : -chars.length, 4] || '').scan(/./)
302
+ @lastFourChars = @lastFourChars[(@lastFourChars.length > 4 ? -4 : -@lastFourChars.length), 4] || []
303
+ end
304
+ return true
305
+ end
306
+
307
+ def entity_data_state
308
+ entity = consume_entity
309
+ if entity
310
+ @token_queue << {:type => :Characters, :data => entity}
311
+ else
312
+ @token_queue << {:type => :Characters, :data => "&"}
313
+ end
314
+ @state = :data_state
315
+ return true
316
+ end
317
+
318
+ def tag_open_state
319
+ data = @stream.char
320
+
321
+ if @content_model_flag == :PCDATA
322
+ if data == "!"
323
+ @state = :markup_declaration_open_state
324
+ elsif data == "/"
325
+ @state = :close_tag_open_state
326
+ elsif data != :EOF and ASCII_LETTERS.include? data
327
+ @current_token = {:type => :StartTag, :name => data, :data => []}
328
+ @state = :tag_name_state
329
+ elsif data == ">"
330
+ # XXX In theory it could be something besides a tag name. But
331
+ # do we really care?
332
+ @token_queue << {:type => :ParseError, :data => "expected-tag-name-but-got-right-bracket"}
333
+ @token_queue << {:type => :Characters, :data => "<>"}
334
+ @state = :data_state
335
+ elsif data == "?"
336
+ # XXX In theory it could be something besides a tag name. But
337
+ # do we really care?
338
+ @token_queue.push({:type => :ParseError, :data => "expected-tag-name-but-got-question-mark"})
339
+ @stream.unget(data)
340
+ @state = :bogus_comment_state
341
+ else
342
+ # XXX
343
+ @token_queue << {:type => :ParseError, :data => "expected-tag-name"}
344
+ @token_queue << {:type => :Characters, :data => "<"}
345
+ @stream.unget(data)
346
+ @state = :data_state
347
+ end
348
+ else
349
+ # We know the content model flag is set to either RCDATA or CDATA
350
+ # now because this state can never be entered with the PLAINTEXT
351
+ # flag.
352
+ if data == "/"
353
+ @state = :close_tag_open_state
354
+ else
355
+ @token_queue << {:type => :Characters, :data => "<"}
356
+ @stream.unget(data)
357
+ @state = :data_state
358
+ end
359
+ end
360
+ return true
361
+ end
362
+
363
+ def close_tag_open_state
364
+ if (@content_model_flag == :RCDATA or @content_model_flag == :CDATA)
365
+ if @current_token
366
+ char_stack = []
367
+
368
+ # So far we know that "</" has been consumed. We now need to know
369
+ # whether the next few characters match the name of last emitted
370
+ # start tag which also happens to be the current_token. We also need
371
+ # to have the character directly after the characters that could
372
+ # match the start tag name.
373
+ (@current_token[:name].length + 1).times do
374
+ char_stack.push(@stream.char)
375
+ # Make sure we don't get hit by :EOF
376
+ break if char_stack[-1] == :EOF
377
+ end
378
+
379
+ # Since this is just for checking. We put the characters back on
380
+ # the stack.
381
+ @stream.unget(char_stack)
382
+ end
383
+
384
+ if @current_token and
385
+ @current_token[:name].downcase ==
386
+ char_stack[0...-1].join('').downcase and
387
+ (SPACE_CHARACTERS + [">", "/", "<", :EOF]).include? char_stack[-1]
388
+ # Because the characters are correct we can safely switch to
389
+ # PCDATA mode now. This also means we don't have to do it when
390
+ # emitting the end tag token.
391
+ @content_model_flag = :PCDATA
392
+ else
393
+ @token_queue << {:type => :Characters, :data => "</"}
394
+ @state = :data_state
395
+
396
+ # Need to return here since we don't want the rest of the
397
+ # method to be walked through.
398
+ return true
399
+ end
400
+ end
401
+
402
+ data = @stream.char
403
+ if data == :EOF
404
+ @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-eof"}
405
+ @token_queue << {:type => :Characters, :data => "</"}
406
+ @state = :data_state
407
+ elsif ASCII_LETTERS.include? data
408
+ @current_token = {:type => :EndTag, :name => data, :data => []}
409
+ @state = :tag_name_state
410
+ elsif data == ">"
411
+ @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-right-bracket"}
412
+ @state = :data_state
413
+ else
414
+ # XXX data can be _'_...
415
+ @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-char", :datavars => {:data => data}}
416
+ @stream.unget(data)
417
+ @state = :bogus_comment_state
418
+ end
419
+
420
+ return true
421
+ end
422
+
423
+ def tag_name_state
424
+ data = @stream.char
425
+ if SPACE_CHARACTERS.include? data
426
+ @state = :before_attribute_name_state
427
+ elsif data == :EOF
428
+ @token_queue << {:type => :ParseError, :data => "eof-in-tag-name"}
429
+ emit_current_token
430
+ elsif ASCII_LETTERS.include? data
431
+ @current_token[:name] += data + @stream.chars_until(ASCII_LETTERS, true)
432
+ elsif data == ">"
433
+ emit_current_token
434
+ elsif data == "/"
435
+ @state = :self_closing_tag_state
436
+ else
437
+ @current_token[:name] += data
438
+ end
439
+ return true
440
+ end
441
+
442
+ def before_attribute_name_state
443
+ data = @stream.char
444
+ if SPACE_CHARACTERS.include? data
445
+ @stream.chars_until(SPACE_CHARACTERS, true)
446
+ elsif data == :EOF
447
+ @token_queue << {:type => :ParseError, :data => "expected-attribute-name-but-got-eof"}
448
+ emit_current_token
449
+ elsif ASCII_LETTERS.include? data
450
+ @current_token[:data].push([data, ""])
451
+ @state = :attribute_name_state
452
+ elsif data == ">"
453
+ emit_current_token
454
+ elsif data == "/"
455
+ @state = :self_closing_tag_state
456
+ elsif data == "'" || data == '"' || data == "="
457
+ @token_queue.push({:type => :ParseError, :data => "invalid-character-in-attribute-name"})
458
+ @current_token[:data].push([data, ""])
459
+ @state = :attribute_name_state
460
+ else
461
+ @current_token[:data].push([data, ""])
462
+ @state = :attribute_name_state
463
+ end
464
+ return true
465
+ end
466
+
467
+ def attribute_name_state
468
+ data = @stream.char
469
+ leavingThisState = true
470
+ emitToken = false
471
+ if data == "="
472
+ @state = :before_attribute_value_state
473
+ elsif data == :EOF
474
+ @token_queue << {:type => :ParseError, :data => "eof-in-attribute-name"}
475
+ @state = :data_state
476
+ emitToken = true
477
+ elsif ASCII_LETTERS.include? data
478
+ @current_token[:data][-1][0] += data + @stream.chars_until(ASCII_LETTERS, true)
479
+ leavingThisState = false
480
+ elsif data == ">"
481
+ # XXX If we emit here the attributes are converted to a dict
482
+ # without being checked and when the code below runs we error
483
+ # because data is a dict not a list
484
+ emitToken = true
485
+ elsif SPACE_CHARACTERS.include? data
486
+ @state = :after_attribute_name_state
487
+ elsif data == "/"
488
+ if !process_solidus_in_tag
489
+ @state = :before_attribute_name_state
490
+ end
491
+ elsif data == "'" or data == '"'
492
+ @token_queue.push({:type => :ParseError, :data => "invalid-character-in-attribute-name"})
493
+ @current_token[:data][-1][0] += data
494
+ leavingThisState = false
495
+ else
496
+ @current_token[:data][-1][0] += data
497
+ leavingThisState = false
498
+ end
499
+
500
+ if leavingThisState
501
+ # Attributes are not dropped at this stage. That happens when the
502
+ # start tag token is emitted so values can still be safely appended
503
+ # to attributes, but we do want to report the parse error in time.
504
+ if @lowercase_attr_name
505
+ @current_token[:data][-1][0] = @current_token[:data].last.first.downcase
506
+ end
507
+ @current_token[:data][0...-1].each {|name,value|
508
+ if @current_token[:data].last.first == name
509
+ @token_queue << {:type => :ParseError, :data => "duplicate-attribute"}
510
+ break # don't report an error more than once
511
+ end
512
+ }
513
+ # XXX Fix for above XXX
514
+ emit_current_token if emitToken
515
+ end
516
+ return true
517
+ end
518
+
519
+ def after_attribute_name_state
520
+ data = @stream.char
521
+ if SPACE_CHARACTERS.include? data
522
+ @stream.chars_until(SPACE_CHARACTERS, true)
523
+ elsif data == "="
524
+ @state = :before_attribute_value_state
525
+ elsif data == ">"
526
+ emit_current_token
527
+ elsif data == :EOF
528
+ @token_queue << {:type => :ParseError, :data => "expected-end-of-tag-but-got-eof"}
529
+ emit_current_token
530
+ elsif ASCII_LETTERS.include? data
531
+ @current_token[:data].push([data, ""])
532
+ @state = :attribute_name_state
533
+ elsif data == "/"
534
+ @state = :self_closing_tag_state
535
+ else
536
+ @current_token[:data].push([data, ""])
537
+ @state = :attribute_name_state
538
+ end
539
+ return true
540
+ end
541
+
542
+ def before_attribute_value_state
543
+ data = @stream.char
544
+ if SPACE_CHARACTERS.include? data
545
+ @stream.chars_until(SPACE_CHARACTERS, true)
546
+ elsif data == "\""
547
+ @state = :attribute_value_double_quoted_state
548
+ elsif data == "&"
549
+ @state = :attribute_value_unquoted_state
550
+ @stream.unget(data);
551
+ elsif data == "'"
552
+ @state = :attribute_value_single_quoted_state
553
+ elsif data == ">"
554
+ emit_current_token
555
+ elsif data == "="
556
+ @token_queue.push({:type => :ParseError, :data => "equals-in-unquoted-attribute-value"})
557
+ @current_token[:data][-1][1] += data
558
+ @state = :attribute_value_unquoted_state
559
+ elsif data == :EOF
560
+ @token_queue << {:type => :ParseError, :data => "expected-attribute-value-but-got-eof"}
561
+ emit_current_token
562
+ else
563
+ @current_token[:data][-1][1] += data
564
+ @state = :attribute_value_unquoted_state
565
+ end
566
+ return true
567
+ end
568
+
569
+ def attribute_value_double_quoted_state
570
+ data = @stream.char
571
+ if data == "\""
572
+ @state = :after_attribute_value_state
573
+ elsif data == "&"
574
+ process_entity_in_attribute('"')
575
+ elsif data == :EOF
576
+ @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-double-quote"}
577
+ emit_current_token
578
+ else
579
+ @current_token[:data][-1][1] += data + @stream.chars_until(["\"", "&"])
580
+ end
581
+ return true
582
+ end
583
+
584
+ def attribute_value_single_quoted_state
585
+ data = @stream.char
586
+ if data == "'"
587
+ @state = :after_attribute_value_state
588
+ elsif data == "&"
589
+ process_entity_in_attribute("'")
590
+ elsif data == :EOF
591
+ @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-single-quote"}
592
+ emit_current_token
593
+ else
594
+ @current_token[:data][-1][1] += data + @stream.chars_until(["'", "&"])
595
+ end
596
+ return true
597
+ end
598
+
599
+ def attribute_value_unquoted_state
600
+ data = @stream.char
601
+ if SPACE_CHARACTERS.include? data
602
+ @state = :before_attribute_name_state
603
+ elsif data == "&"
604
+ process_entity_in_attribute ''
605
+ elsif data == ">"
606
+ emit_current_token
607
+ elsif data == '"' || data == "'" || data == "="
608
+ @token_queue.push({:type => :ParseError, :data => "unexpected-character-in-unquoted-attribute-value"})
609
+ @current_token[:data][-1][1] += data
610
+ elsif data == :EOF
611
+ @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-no-quotes"}
612
+ emit_current_token
613
+ else
614
+ @current_token[:data][-1][1] += data + @stream.chars_until(["&", ">","<"] + SPACE_CHARACTERS)
615
+ end
616
+ return true
617
+ end
618
+
619
+ def after_attribute_value_state
620
+ data = self.stream.char()
621
+ if SPACE_CHARACTERS.include? data
622
+ @state = :before_attribute_name_state
623
+ elsif data == ">"
624
+ emit_current_token
625
+ @state = :data_state
626
+ elsif data == "/"
627
+ @state = :self_closing_tag_state
628
+ elsif data == :EOF
629
+ @token_queue << {:type => :ParseError, :data => "unexpected-EOF-after-attribute-value"}
630
+ emit_current_token
631
+ @stream.unget(data)
632
+ @state = :data_state
633
+ else
634
+ @token_queue.push({:type => :ParseError, :data => "unexpected-character-after-attribute-value"})
635
+ @stream.unget(data)
636
+ @state = :before_attribute_name_state
637
+ end
638
+ true
639
+ end
640
+
641
+ def self_closing_tag_state
642
+ c = @stream.char
643
+ case c
644
+ when ">"
645
+ @current_token[:self_closing] = true
646
+ emit_current_token
647
+ @state = :data_state
648
+ when :EOF
649
+ @token_queue << {:type => :ParseError, :data => "eof-in-tag-name"}
650
+ @stream.unget(c)
651
+ @state = :data_state
652
+ else
653
+ @token_queue << {:type => :ParseError, :data => "expected-self-closing-tag"}
654
+ @stream.unget(c)
655
+ @state = :before_attribute_name_state
656
+ end
657
+ true
658
+ end
659
+
660
+ def bogus_comment_state
661
+ # Make a new comment token and give it as value all the characters
662
+ # until the first > or :EOF (chars_until checks for :EOF automatically)
663
+ # and emit it.
664
+ @token_queue << {:type => :Comment, :data => @stream.chars_until([">"])}
665
+
666
+ # Eat the character directly after the bogus comment which is either a
667
+ # ">" or an :EOF.
668
+ @stream.char
669
+ @state = :data_state
670
+ return true
671
+ end
672
+
673
+ def markup_declaration_open_state
674
+ char_stack = [@stream.char, @stream.char]
675
+ if char_stack == ["-", "-"]
676
+ @current_token = {:type => :Comment, :data => ""}
677
+ @state = :comment_start_state
678
+ else
679
+ 5.times { char_stack.push(@stream.char) }
680
+ # Put in explicit :EOF check
681
+ if !char_stack.include?(:EOF) && char_stack.join("").upcase == "DOCTYPE"
682
+ @current_token = {:type => :Doctype, :name => "", :publicId => nil, :systemId => nil, :correct => true}
683
+ @state = :doctype_state
684
+ else
685
+ @token_queue << {:type => :ParseError, :data => "expected-dashes-or-doctype"}
686
+ @stream.unget(char_stack)
687
+ @state = :bogus_comment_state
688
+ end
689
+ end
690
+ return true
691
+ end
692
+
693
+ def comment_start_state
694
+ data = @stream.char
695
+ if data == "-"
696
+ @state = :comment_start_dash_state
697
+ elsif data == ">"
698
+ @token_queue << {:type => :ParseError, :data => "incorrect-comment"}
699
+ @token_queue << @current_token
700
+ @state = :data_state
701
+ elsif data == :EOF
702
+ @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
703
+ @token_queue << @current_token
704
+ @state = :data_state
705
+ else
706
+ @current_token[:data] += data + @stream.chars_until("-")
707
+ @state = :comment_state
708
+ end
709
+ return true
710
+ end
711
+
712
+ def comment_start_dash_state
713
+ data = @stream.char
714
+ if data == "-"
715
+ @state = :comment_end_state
716
+ elsif data == ">"
717
+ @token_queue << {:type => :ParseError, :data => "incorrect-comment"}
718
+ @token_queue << @current_token
719
+ @state = :data_state
720
+ elsif data == :EOF
721
+ @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
722
+ @token_queue << @current_token
723
+ @state = :data_state
724
+ else
725
+ @current_token[:data] += '-' + data + @stream.chars_until("-")
726
+ @state = :comment_state
727
+ end
728
+ return true
729
+ end
730
+
731
+ def comment_state
732
+ data = @stream.char
733
+ if data == "-"
734
+ @state = :comment_end_dash_state
735
+ elsif data == :EOF
736
+ @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
737
+ @token_queue << @current_token
738
+ @state = :data_state
739
+ else
740
+ @current_token[:data] += data + @stream.chars_until("-")
741
+ end
742
+ return true
743
+ end
744
+
745
+ def comment_end_dash_state
746
+ data = @stream.char
747
+ if data == "-"
748
+ @state = :comment_end_state
749
+ elsif data == :EOF
750
+ @token_queue << {:type => :ParseError, :data => "eof-in-comment-end-dash"}
751
+ @token_queue << @current_token
752
+ @state = :data_state
753
+ else
754
+ @current_token[:data] += "-" + data + @stream.chars_until("-")
755
+ # Consume the next character which is either a "-" or an :EOF as
756
+ # well so if there's a "-" directly after the "-" we go nicely to
757
+ # the "comment end state" without emitting a ParseError there.
758
+ @stream.char
759
+ end
760
+ return true
761
+ end
762
+
763
# Tokenizer state: saw "--" inside a comment; a ">" would close it.
def comment_end_state
  c = @stream.char
  case c
  when ">"
    # Proper "-->" terminator: emit the comment token.
    @token_queue << @current_token
    @state = :data_state
  when "-"
    # Extra dash after "--": record it and stay in this state.
    @token_queue << {:type => :ParseError, :data => "unexpected-dash-after-double-dash-in-comment"}
    @current_token[:data] += c
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-comment-double-dash"}
    @token_queue << @current_token
    @state = :data_state
  else
    # "--" was not the end of the comment after all; fold it back in.
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-comment"}
    @current_token[:data] += "--" + c
    @state = :comment_state
  end
  true
end
783
+
784
# Tokenizer state: right after "<!DOCTYPE"; whitespace must follow.
# Either way we continue in before_doctype_name_state; a missing space
# is reported and the offending character is pushed back.
def doctype_state
  c = @stream.char
  unless SPACE_CHARACTERS.include?(c)
    @token_queue << {:type => :ParseError, :data => "need-space-after-doctype"}
    @stream.unget(c)
  end
  @state = :before_doctype_name_state
  true
end
795
+
796
# Tokenizer state: skipping whitespace before the DOCTYPE name.
def before_doctype_name_state
  c = @stream.char
  case c
  when *SPACE_CHARACTERS
    # More leading whitespace: stay in this state.
  when ">"
    @token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-right-bracket"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-eof"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    # First name character seen.
    @current_token[:name] = c
    @state = :doctype_name_state
  end
  true
end
815
+
816
# Tokenizer state: accumulating the DOCTYPE name character by character.
def doctype_name_state
  c = @stream.char
  case c
  when *SPACE_CHARACTERS
    @state = :after_doctype_name_state
  when ">"
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype-name"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:name] += c
  end
  true
end
834
+
835
# Tokenizer state: after the DOCTYPE name; look ahead for the PUBLIC or
# SYSTEM keyword (matched case-insensitively) or the end of the doctype.
def after_doctype_name_state
  data = @stream.char
  if SPACE_CHARACTERS.include? data
    # Skip whitespace between the name and any keyword.
  elsif data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @current_token[:correct] = false
    @stream.unget(data)
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @token_queue << @current_token
    @state = :data_state
  else
    # Collect five more characters so we have a six-char candidate keyword.
    char_stack = [data]
    # BUG FIX: was `stream.char` (undefined local); must read from @stream.
    5.times { char_stack << @stream.char }
    token = char_stack.join('').tr(ASCII_UPPERCASE, ASCII_LOWERCASE)
    if token == "public" and !char_stack.include?(:EOF)
      @state = :before_doctype_public_identifier_state
    elsif token == "system" and !char_stack.include?(:EOF)
      @state = :before_doctype_system_identifier_state
    else
      # Not a recognized keyword: push everything back and go bogus.
      @stream.unget(char_stack)
      @token_queue << {:type => :ParseError, :data => "expected-space-or-right-bracket-in-doctype", "datavars" => {"data" => data}}
      @current_token[:correct] = false
      @state = :bogus_doctype_state
    end
  end
  return true
end
864
+
865
# Tokenizer state: after the PUBLIC keyword; expect a quoted identifier.
def before_doctype_public_identifier_state
  c = @stream.char
  case c
  when *SPACE_CHARACTERS
    # Skip whitespace before the opening quote.
  when "\""
    @current_token[:publicId] = ""
    @state = :doctype_public_identifier_double_quoted_state
  when "'"
    @current_token[:publicId] = ""
    @state = :doctype_public_identifier_single_quoted_state
  when ">"
    @token_queue << {:type => :ParseError, :data => "unexpected-end-of-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @current_token[:correct] = false
    @state = :bogus_doctype_state
  end
  true
end
893
+
894
# Tokenizer state: inside a double-quoted DOCTYPE public identifier.
def doctype_public_identifier_double_quoted_state
  c = @stream.char
  case c
  when "\""
    @state = :after_doctype_public_identifier_state
  when ">"
    @token_queue << {:type => :ParseError, :data => "unexpected-end-of-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:publicId] += c
  end
  true
end
913
+
914
# Tokenizer state: inside a single-quoted DOCTYPE public identifier.
def doctype_public_identifier_single_quoted_state
  c = @stream.char
  case c
  when "'"
    @state = :after_doctype_public_identifier_state
  when ">"
    @token_queue << {:type => :ParseError, :data => "unexpected-end-of-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:publicId] += c
  end
  true
end
933
+
934
# Tokenizer state: after the public identifier; a quoted system
# identifier or the closing ">" may follow.
def after_doctype_public_identifier_state
  data = @stream.char
  if SPACE_CHARACTERS.include?(data)
    # Skip whitespace between the identifiers.
  elsif data == "\""
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_double_quoted_state
  elsif data == "'"
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_single_quoted_state
  elsif data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    # BUG FIX: this branch handles a stray character, not EOF, so it must
    # report "unexpected-char-in-doctype" (was wrongly "eof-in-doctype",
    # a copy-paste from the EOF branch above).
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @current_token[:correct] = false
    @state = :bogus_doctype_state
  end
  return true
end
958
+
959
# Tokenizer state: after the SYSTEM keyword; expect a quoted identifier.
def before_doctype_system_identifier_state
  c = @stream.char
  case c
  when *SPACE_CHARACTERS
    # Skip whitespace before the opening quote.
  when "\""
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_double_quoted_state
  when "'"
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_single_quoted_state
  when ">"
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @current_token[:correct] = false
    @state = :bogus_doctype_state
  end
  true
end
985
+
986
# Tokenizer state: inside a double-quoted DOCTYPE system identifier.
def doctype_system_identifier_double_quoted_state
  c = @stream.char
  case c
  when "\""
    @state = :after_doctype_system_identifier_state
  when ">"
    @token_queue << {:type => :ParseError, :data => "unexpected-end-of-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:systemId] += c
  end
  true
end
1005
+
1006
# Tokenizer state: inside a single-quoted DOCTYPE system identifier.
def doctype_system_identifier_single_quoted_state
  c = @stream.char
  case c
  when "'"
    @state = :after_doctype_system_identifier_state
  when ">"
    @token_queue << {:type => :ParseError, :data => "unexpected-end-of-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:systemId] += c
  end
  true
end
1025
+
1026
# Tokenizer state: after the system identifier; only whitespace and the
# closing ">" are legitimate here.
def after_doctype_system_identifier_state
  data = @stream.char
  if SPACE_CHARACTERS.include?(data)
    # Skip trailing whitespace.
  elsif data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    # BUG FIX: this branch handles a stray character, not EOF, so it must
    # report "unexpected-char-in-doctype" (was wrongly "eof-in-doctype",
    # a copy-paste from the EOF branch above).
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @state = :bogus_doctype_state
  end
  return true
end
1043
+
1044
# Tokenizer state: discard characters until ">" closes the bogus doctype.
def bogus_doctype_state
  c = @stream.char
  case c
  when ">"
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    # Push the EOF back so the data state sees it too, then emit.
    @stream.unget(c)
    @token_queue << @current_token
    @state = :data_state
  end
  true
end
1056
+
1057
+ end
1058
+
1059
+ end