html5 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. data/History.txt +3 -0
  2. data/Manifest.txt +58 -0
  3. data/README +9 -0
  4. data/Rakefile.rb +17 -0
  5. data/lib/html5/constants.rb +818 -0
  6. data/lib/html5/filters/base.rb +10 -0
  7. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  8. data/lib/html5/filters/optionaltags.rb +198 -0
  9. data/lib/html5/filters/sanitizer.rb +15 -0
  10. data/lib/html5/filters/whitespace.rb +36 -0
  11. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  12. data/lib/html5/html5parser/after_frameset_phase.rb +34 -0
  13. data/lib/html5/html5parser/after_head_phase.rb +50 -0
  14. data/lib/html5/html5parser/before_head_phase.rb +41 -0
  15. data/lib/html5/html5parser/in_body_phase.rb +607 -0
  16. data/lib/html5/html5parser/in_caption_phase.rb +68 -0
  17. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  18. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  19. data/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  20. data/lib/html5/html5parser/in_head_phase.rb +138 -0
  21. data/lib/html5/html5parser/in_row_phase.rb +87 -0
  22. data/lib/html5/html5parser/in_select_phase.rb +84 -0
  23. data/lib/html5/html5parser/in_table_body_phase.rb +83 -0
  24. data/lib/html5/html5parser/in_table_phase.rb +110 -0
  25. data/lib/html5/html5parser/initial_phase.rb +134 -0
  26. data/lib/html5/html5parser/phase.rb +158 -0
  27. data/lib/html5/html5parser/root_element_phase.rb +42 -0
  28. data/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  29. data/lib/html5/html5parser.rb +248 -0
  30. data/lib/html5/inputstream.rb +654 -0
  31. data/lib/html5/liberalxmlparser.rb +158 -0
  32. data/lib/html5/sanitizer.rb +188 -0
  33. data/lib/html5/serializer/htmlserializer.rb +180 -0
  34. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  35. data/lib/html5/serializer.rb +2 -0
  36. data/lib/html5/tokenizer.rb +968 -0
  37. data/lib/html5/treebuilders/base.rb +334 -0
  38. data/lib/html5/treebuilders/hpricot.rb +231 -0
  39. data/lib/html5/treebuilders/rexml.rb +208 -0
  40. data/lib/html5/treebuilders/simpletree.rb +185 -0
  41. data/lib/html5/treebuilders.rb +24 -0
  42. data/lib/html5/treewalkers/base.rb +154 -0
  43. data/lib/html5/treewalkers/hpricot.rb +48 -0
  44. data/lib/html5/treewalkers/rexml.rb +48 -0
  45. data/lib/html5/treewalkers/simpletree.rb +48 -0
  46. data/lib/html5/treewalkers.rb +26 -0
  47. data/lib/html5.rb +13 -0
  48. data/parse.rb +217 -0
  49. data/tests/preamble.rb +82 -0
  50. data/tests/test_encoding.rb +35 -0
  51. data/tests/test_lxp.rb +263 -0
  52. data/tests/test_parser.rb +68 -0
  53. data/tests/test_sanitizer.rb +142 -0
  54. data/tests/test_serializer.rb +68 -0
  55. data/tests/test_stream.rb +62 -0
  56. data/tests/test_tokenizer.rb +94 -0
  57. data/tests/test_treewalkers.rb +116 -0
  58. data/tests/tokenizer_test_parser.rb +63 -0
  59. metadata +120 -0
@@ -0,0 +1,968 @@
1
+ require 'html5/constants'
2
+ require 'html5/inputstream'
3
+
4
+ module HTML5
5
+
6
+ # This class takes care of tokenizing HTML.
7
+ #
8
+ # * @current_token
9
+ # Holds the token that is currently being processed.
10
+ #
11
+ # * @state
12
+ # Holds the name (a Symbol) of the state method to be invoked next.
13
+ #
14
+ # * @states
15
+ # Holds a mapping between states and methods that implement the state.
16
+ #
17
+ # * @stream
18
+ # Points to HTMLInputStream object.
19
+
20
+ class HTMLTokenizer
21
+ attr_accessor :content_model_flag, :current_token
22
+ attr_reader :stream
23
+
24
+ # XXX need to fix documentation
25
+
26
# Builds a tokenizer that reads from +stream+.
#
# stream::  handed unchanged, together with +options+, to HTMLInputStream.new.
# options:: Hash. :lowercase_element_name and :lowercase_attr_name are read
#           here; each one counts as enabled unless it is exactly +false+.
def initialize(stream, options = {})
  @stream = HTMLInputStream.new(stream, options)

  # The state machine starts in the data state with PCDATA content.
  @state              = :data_state
  @content_model_flag = :PCDATA
  @escapeFlag         = false
  @lastFourChars      = []

  # Token under construction, and tokens waiting to be yielded by #each.
  @current_token = nil
  @token_queue   = []

  @lowercase_element_name = !(options[:lowercase_element_name] == false)
  @lowercase_attr_name    = !(options[:lowercase_attr_name] == false)
end
43
+
44
# Drives the state machine and yields tokens to the caller's block.
#
# The method named by @state is invoked repeatedly. After every step, any
# errors collected on the stream are yielded as :ParseError tokens, followed
# by whatever the step placed on the token queue. Iteration stops as soon as
# a state method returns a false value (the data state does so on :EOF).
# The token queue is reset at the start of every call.
def each
  @token_queue = []
  while send(@state)
    until @stream.errors.empty?
      yield :type => :ParseError, :data => @stream.errors.shift
    end
    until @token_queue.empty?
      yield @token_queue.shift
    end
  end
end
58
+
59
+ # Below are various helper functions the tokenizer states use worked out.
60
+
61
# Handles a "/" met inside a tag. Peeks at the following character: when it
# is ">" and the current token is a :StartTag, the token becomes an
# :EmptyTag; in every other case a parse error is queued. The peeked
# character is always handed back to the stream afterwards.
def process_solidus_in_tag
  following = @stream.char

  self_closing = @current_token[:type] == :StartTag && following == ">"
  if self_closing
    @current_token[:type] = :EmptyTag
  else
    @token_queue << {:type => :ParseError, :data => _("Solidus (/) incorrectly placed in tag.")}
  end

  # Look-ahead only: the calling state still has to see this character.
  @stream.unget(following)
end
79
+
80
# Consumes the digits of a numeric character reference and returns the
# character they denote, encoded via Array#pack('U').
#
# isHex:: when true the digits are read against HEX_DIGITS with radix 16,
#         otherwise against DIGITS with radix 10.
#
# Adjustments applied to the parsed number, each with a queued :ParseError:
# * 13 (CR) is replaced by 10 (LF);
# * 128..159 is looked up in ENTITIES_WINDOWS1252 (index = number - 128);
# * 0, anything above U+10FFFF and the surrogate range yield U+FFFD.
#
# A terminating ";" is discarded. Any other terminator is pushed back onto
# the stream and reported as a parse error.
def consume_number_entity(isHex)
  allowed, radix = isHex ? [HEX_DIGITS, 16] : [DIGITS, 10]

  # Collect digits. :EOF is tested *before* the membership test so a Symbol
  # is never handed to include? (String#include? rejects non-strings).
  digits = []
  c = @stream.char
  while c != :EOF && allowed.include?(c)
    digits << c
    c = @stream.char
  end

  code = digits.join('').to_i(radix)

  if code == 13
    @token_queue << {:type => :ParseError, :data => _("Incorrect CR newline entity. Replaced with LF.")}
    code = 10
  elsif (128..159).include?(code)
    # The "windows trick": C1 control numbers are remapped through a table.
    @token_queue << {:type => :ParseError, :data => _("Entity used with illegal number (windows-1252 reference).")}
    code = ENTITIES_WINDOWS1252[code - 128]
  end

  if code > 0 && code <= 0x10FFFF && !(0xD800..0xDFFF).include?(code)
    char = [code].pack('U')
  else
    char = [0xFFFD].pack('U')
    @token_queue << {:type => :ParseError, :data => _("Numeric entity represents an illegal codepoint.")}
  end

  # Swallow the ";" if present; otherwise give the terminator back.
  if c != ";"
    @token_queue << {:type => :ParseError, :data => _("Numeric entity didn't end with ';'.")}
    @stream.unget(c)
  end

  char
end
136
+
137
# Tries to consume a character reference; the "&" itself has already been
# read by the caller.
#
# from_attribute:: true when called while tokenizing an attribute value;
#                  enables the legacy rule described below.
#
# Returns the replacement text, or nil when nothing was recognised (all
# consumed characters are then back on the stream and the caller is expected
# to emit a literal "&").
#
# * Whitespace, :EOF, "<" or "&" right after the ampersand: not a reference.
# * "#": numeric reference, delegated to consume_number_entity.
# * Otherwise the longest name present in ENTITIES is matched (so that, for
#   example, "&noti" resolves through a shorter known name).
#
# Legacy attribute rule: inside an attribute, a name matched *without* its
# terminating ";" and directly followed by a letter or digit is not treated
# as a reference; everything is pushed back and "&" is returned.
def consume_entity(from_attribute=false)
  char = nil
  char_stack = [@stream.char]
  first = char_stack[0]

  if SPACE_CHARACTERS.include?(first) || [:EOF, '<', '&'].include?(first)
    @stream.unget(char_stack)
  elsif first == '#'
    # Possibly a numeric reference: look at the next two characters.
    char_stack += [@stream.char, @stream.char]
    if char_stack[0..1].include?(:EOF)
      # Ran out of input: put back everything read before :EOF.
      char_stack = char_stack[0...char_stack.index(:EOF)]
      @stream.unget(char_stack)
      @token_queue << {:type => :ParseError, :data => _("Numeric entity expected. Got end of file instead.")}
    elsif char_stack[1].downcase == "x" && HEX_DIGITS.include?(char_stack[2])
      # Hexadecimal: hand the first digit back and let the helper read it.
      @stream.unget(char_stack[2])
      char = consume_number_entity(true)
    elsif DIGITS.include?(char_stack[1])
      # Decimal: both look-ahead characters go back.
      @stream.unget(char_stack[1..-1])
      char = consume_number_entity(false)
    else
      @stream.unget(char_stack)
      @token_queue << {:type => :ParseError, :data => _("Numeric entity expected but none found.")}
    end
  else
    # Named reference: keep reading while some ENTITIES key still starts
    # with what has been consumed, remembering the longest complete match.
    candidates = ENTITIES.keys.select { |e| e.start_with?(first) }
    entityName = nil

    while char_stack.last != :EOF
      name = char_stack.join('')
      break unless candidates.any? { |e| e.start_with?(name) }

      candidates.reject! { |e| !e.start_with?(name) }
      char_stack.push(@stream.char)

      if ENTITIES.include?(name)
        entityName = name
        break if entityName.end_with?(';')
      end
    end

    if entityName
      char = ENTITIES[entityName]
      terminated = entityName.end_with?(';')

      unless terminated
        @token_queue << {:type => :ParseError, :data => _("Named entity didn't end with ';'.")}
      end

      # Character directly after the matched name. It may be :EOF, which
      # must never reach String#include?.
      following = char_stack[entityName.length]
      alnum_follows = following.is_a?(String) &&
                      (ASCII_LETTERS.include?(following) || DIGITS.include?(following))

      # The legacy rule applies only to names matched without their ";".
      # A properly terminated reference is always decoded.
      if !terminated && from_attribute && alnum_follows
        @stream.unget(char_stack)
        char = '&'
      else
        @stream.unget(char_stack[entityName.length..-1])
      end
    else
      @token_queue << {:type => :ParseError, :data => _("Named entity expected. Got none.")}
      @stream.unget(char_stack)
    end
  end
  char
end
217
+
218
# Resolves a character reference met inside an attribute value and appends
# the result to the value of the attribute currently being built (the last
# [name, value] pair of the current token). When consume_entity recognises
# nothing, a literal "&" is appended instead.
def process_entity_in_attribute
  replacement = consume_entity(true) || "&"
  @current_token[:data][-1][1] += replacement
end
227
+
228
# Queues the current tag token and switches the tokenizer back to the data
# state. Only :StartTag, :EndTag and :EmptyTag tokens are handled; for any
# other token type nothing happens at all. The tag name is downcased first
# when @lowercase_element_name is set.
def emit_current_token
  token = @current_token
  return unless [:StartTag, :EndTag, :EmptyTag].include?(token[:type])

  token[:name] = token[:name].downcase if @lowercase_element_name
  @token_queue << token
  @state = :data_state
end
243
+
244
+ # Below are the various tokenizer states worked out.
245
+
246
+ # XXX AT Perhaps we should have Hixie run some evaluation on billions of
247
+ # documents to figure out what the order of the various if and elsif
248
+ # statements should be.
249
# The data state: reads one character and decides what to do with it.
#
# Returns false on :EOF (which ends tokenization in #each) and true in every
# other case.
#
# While the content model flag is :CDATA or :RCDATA, the characters read
# here are tracked in @lastFourChars (at most four) so that "<!--" and "-->"
# can switch @escapeFlag on and off.
def data_state
  data = @stream.char
  flag = @content_model_flag
  textual = flag == :CDATA || flag == :RCDATA

  if textual
    @lastFourChars << data
    @lastFourChars.shift if @lastFourChars.length > 4
  end

  if data == "&" && [:PCDATA, :RCDATA].include?(flag) && !@escapeFlag
    @state = :entity_data_state
  elsif data == "-" && textual && !@escapeFlag && @lastFourChars.join('') == "<!--"
    @escapeFlag = true
    @token_queue << {:type => :Characters, :data => data}
  elsif data == "<" && !@escapeFlag && [:PCDATA, :CDATA, :RCDATA].include?(flag)
    @state = :tag_open_state
  elsif data == ">" && @escapeFlag && textual && @lastFourChars[1..-1].join('') == "-->"
    @escapeFlag = false
    @token_queue << {:type => :Characters, :data => data}
  elsif data == :EOF
    # Tokenization ends.
    return false
  elsif SPACE_CHARACTERS.include?(data)
    # A run of whitespace is emitted as its own token type.
    run = data + @stream.chars_until(SPACE_CHARACTERS, true)
    @token_queue << {:type => :SpaceCharacters, :data => run}
  else
    text = data + @stream.chars_until(%w[& < > -])
    @token_queue << {:type => :Characters, :data => text}
  end
  true
end
287
+
288
# Entered after "&" in the data state. Queues the text produced by
# consume_entity as a :Characters token, or a literal "&" when no reference
# was recognised, then returns to the data state.
def entity_data_state
  text = consume_entity || "&"
  @token_queue << {:type => :Characters, :data => text}
  @state = :data_state
  true
end
298
+
299
# Entered after "<" in the data state; decides what kind of markup follows.
#
# With the :PCDATA content model: "!" leads to a markup declaration, "/" to
# a closing tag, an ASCII letter starts a new :StartTag token, ">" and any
# other character are parse errors that emit the consumed text as
# characters, and "?" is a parse error handled as a bogus comment.
#
# With any other content model only "</" is significant; everything else
# emits "<" as text and is re-read in the data state.
def tag_open_state
  data = @stream.char

  if @content_model_flag != :PCDATA
    if data == "/"
      @state = :close_tag_open_state
    else
      @token_queue << {:type => :Characters, :data => "<"}
      @stream.unget(data)
      @state = :data_state
    end
    return true
  end

  if data == "!"
    @state = :markup_declaration_open_state
  elsif data == "/"
    @state = :close_tag_open_state
  elsif data != :EOF && ASCII_LETTERS.include?(data)
    @current_token = {:type => :StartTag, :name => data, :data => []}
    @state = :tag_name_state
  elsif data == ">"
    @token_queue << {:type => :ParseError, :data => _("Expected tag name. Got '>' instead.")}
    @token_queue << {:type => :Characters, :data => "<>"}
    @state = :data_state
  elsif data == "?"
    @token_queue.push({:type => :ParseError, :data => _("Expected tag name. Got '?' instead (HTML doesn't " +
      "support processing instructions).")})
    # The "?" becomes part of the bogus comment's text.
    @stream.unget(data)
    @state = :bogus_comment_state
  else
    @token_queue << {:type => :ParseError, :data => _("Expected tag name. Got something else instead")}
    @token_queue << {:type => :Characters, :data => "<"}
    @stream.unget(data)
    @state = :data_state
  end
  true
end
343
+
344
# Entered after "</" has been consumed.
#
# In :RCDATA / :CDATA content the closing tag only counts when it names the
# current token (compared case-insensitively) and is directly followed by
# whitespace, ">", "/", "<" or :EOF. The check is pure look-ahead: every
# character read for it is put back. On a match the content model flag
# returns to :PCDATA; otherwise "</" is emitted as text.
#
# After that (or right away in :PCDATA) one character is read to start an
# :EndTag token or to report the appropriate parse error.
def close_tag_open_state
  if @content_model_flag == :RCDATA || @content_model_flag == :CDATA
    if @current_token
      char_stack = []

      # Read as many characters as the tag name has, plus the one after it.
      (@current_token[:name].length + 1).times do
        char_stack.push(@stream.char)
        break if char_stack[-1] == :EOF
      end

      @stream.unget(char_stack)
    end

    closes_current = @current_token &&
      @current_token[:name].downcase == char_stack[0...-1].join('').downcase &&
      (SPACE_CHARACTERS + [">", "/", "<", :EOF]).include?(char_stack[-1])

    unless closes_current
      @token_queue << {:type => :Characters, :data => "</"}
      @state = :data_state
      return true
    end

    # The name matches, so the end tag can be tokenized as in PCDATA.
    @content_model_flag = :PCDATA
  end

  data = @stream.char
  if data == :EOF
    @token_queue << {:type => :ParseError, :data => _("Expected closing tag. Unexpected end of file.")}
    @token_queue << {:type => :Characters, :data => "</"}
    @state = :data_state
  elsif ASCII_LETTERS.include?(data)
    @current_token = {:type => :EndTag, :name => data, :data => []}
    @state = :tag_name_state
  elsif data == ">"
    @token_queue << {:type => :ParseError, :data => _("Expected closing tag. Got '>' instead. Ignoring '</>'.")}
    @state = :data_state
  else
    @token_queue << {:type => :ParseError, :data => _("Expected closing tag. Unexpected character '#{data}' found.")}
    @stream.unget(data)
    @state = :bogus_comment_state
  end

  true
end
403
+
404
# Accumulates the name of the tag token under construction.
#
# Whitespace ends the name, ">" emits the token, "/" is checked with
# process_solidus_in_tag, :EOF reports an error and emits what there is.
# Every other character is appended to the name; for ASCII letters the whole
# following run of letters is appended in one go.
def tag_name_state
  data = @stream.char
  case
  when SPACE_CHARACTERS.include?(data)
    @state = :before_attribute_name_state
  when data == :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in the tag name.")}
    emit_current_token
  when ASCII_LETTERS.include?(data)
    letters = data + @stream.chars_until(ASCII_LETTERS, true)
    @current_token[:name] += letters
  when data == ">"
    emit_current_token
  when data == "/"
    process_solidus_in_tag
    @state = :before_attribute_name_state
  else
    @current_token[:name] += data
  end
  true
end
423
+
424
# Skips whitespace inside a tag until an attribute name begins.
#
# ">" emits the tag, "/" is checked with process_solidus_in_tag, :EOF
# reports an error and emits the tag. Any other character, letter or not,
# starts a new [name, value] pair on the current token.
def before_attribute_name_state
  data = @stream.char
  begin_attribute = lambda do
    @current_token[:data].push([data, ""])
    @state = :attribute_name_state
  end

  case
  when SPACE_CHARACTERS.include?(data)
    # Swallow the rest of the whitespace run.
    @stream.chars_until(SPACE_CHARACTERS, true)
  when data == :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file. Expected attribute name instead.")}
    emit_current_token
  when ASCII_LETTERS.include?(data)
    begin_attribute.call
  when data == ">"
    emit_current_token
  when data == "/"
    process_solidus_in_tag
  else
    begin_attribute.call
  end
  true
end
444
+
445
# Accumulates the name of the attribute being built (the last [name, value]
# pair on the current token).
#
# "=", whitespace, "/", ">" and :EOF all end the name. When that happens the
# name is downcased if @lowercase_attr_name is set, and a parse error is
# queued once if an earlier attribute on the same tag already carries that
# name (the duplicate itself is not removed here). ">" and :EOF additionally
# emit the tag, after the duplicate check.
def attribute_name_state
  data = @stream.char
  attrs = @current_token[:data]
  name_continues = false
  emit_after = false

  if data == "="
    @state = :before_attribute_value_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in attribute name.")}
    @state = :data_state
    emit_after = true
  elsif ASCII_LETTERS.include?(data)
    attrs[-1][0] += data + @stream.chars_until(ASCII_LETTERS, true)
    name_continues = true
  elsif data == ">"
    emit_after = true
  elsif SPACE_CHARACTERS.include?(data)
    @state = :after_attribute_name_state
  elsif data == "/"
    process_solidus_in_tag
    @state = :before_attribute_name_state
  else
    attrs[-1][0] += data
    name_continues = true
  end
  return true if name_continues

  # The name is complete: normalise it, then look for an earlier duplicate.
  finished = attrs.last
  finished[0] = finished.first.downcase if @lowercase_attr_name
  if attrs[0...-1].any? { |pair| finished.first == pair[0] }
    @token_queue << {:type => :ParseError, :data =>_("Dropped duplicate attribute on tag.")}
  end

  emit_current_token if emit_after
  true
end
491
+
492
# Entered after whitespace has followed an attribute name.
#
# "=" means a value follows, ">" emits the tag, "/" is checked with
# process_solidus_in_tag, :EOF reports an error and emits the tag. Any other
# character starts a new attribute (the previous one keeps its empty value).
def after_attribute_name_state
  data = @stream.char
  begin_attribute = lambda do
    @current_token[:data].push([data, ""])
    @state = :attribute_name_state
  end

  case
  when SPACE_CHARACTERS.include?(data)
    @stream.chars_until(SPACE_CHARACTERS, true)
  when data == "="
    @state = :before_attribute_value_state
  when data == ">"
    emit_current_token
  when data == :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file. Expected = or end of tag.")}
    emit_current_token
  when ASCII_LETTERS.include?(data)
    begin_attribute.call
  when data == "/"
    process_solidus_in_tag
    @state = :before_attribute_name_state
  else
    begin_attribute.call
  end
  true
end
515
+
516
# Entered after "=" in an attribute; works out how the value is quoted.
#
# A double or single quote selects the matching quoted-value state. "&" is
# pushed back so the unquoted-value state can treat it as a character
# reference. ">" emits the tag with the value still empty. Any other
# character becomes the first character of an unquoted value.
def before_attribute_value_state
  data = @stream.char
  case
  when SPACE_CHARACTERS.include?(data)
    @stream.chars_until(SPACE_CHARACTERS, true)
  when data == "\""
    @state = :attribute_value_double_quoted_state
  when data == "&"
    @state = :attribute_value_unquoted_state
    @stream.unget(data)
  when data == "'"
    @state = :attribute_value_single_quoted_state
  when data == ">"
    emit_current_token
  when data == :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file. Expected attribute value.")}
    emit_current_token
  else
    @current_token[:data][-1][1] += data
    @state = :attribute_value_unquoted_state
  end
  true
end
538
+
539
# Reads a double-quoted attribute value. The closing quote returns to the
# before-attribute-name state, "&" is resolved as a character reference,
# :EOF reports an error and emits the tag. Anything else is appended to the
# value together with everything up to the next quote or "&".
def attribute_value_double_quoted_state
  data = @stream.char
  case data
  when "\""
    @state = :before_attribute_name_state
  when "&"
    process_entity_in_attribute
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in attribute value (\").")}
    emit_current_token
  else
    chunk = data + @stream.chars_until(["\"", "&"])
    @current_token[:data][-1][1] += chunk
  end
  true
end
553
+
554
# Reads a single-quoted attribute value. The closing quote returns to the
# before-attribute-name state, "&" is resolved as a character reference,
# :EOF reports an error and emits the tag. Anything else is appended to the
# value together with everything up to the next quote or "&".
def attribute_value_single_quoted_state
  data = @stream.char
  case data
  when "'"
    @state = :before_attribute_name_state
  when "&"
    process_entity_in_attribute
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in attribute value (').")}
    emit_current_token
  else
    chunk = data + @stream.chars_until(["'", "&"])
    @current_token[:data][-1][1] += chunk
  end
  true
end
569
+
570
# Reads an unquoted attribute value. Whitespace ends the value, ">" emits
# the tag, "&" is resolved as a character reference, :EOF reports an error
# and emits the tag. Anything else is appended to the value together with
# everything up to the next "&", ">", "<" or whitespace character.
def attribute_value_unquoted_state
  data = @stream.char
  case
  when SPACE_CHARACTERS.include?(data)
    @state = :before_attribute_name_state
  when data == "&"
    process_entity_in_attribute
  when data == ">"
    emit_current_token
  when data == :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in attribute value.")}
    emit_current_token
  else
    chunk = data + @stream.chars_until(["&", ">","<"] + SPACE_CHARACTERS)
    @current_token[:data][-1][1] += chunk
  end
  true
end
586
+
587
# Turns malformed markup into a comment: everything up to (not including)
# the next ">" is queued as a :Comment token, the character that stopped the
# scan is consumed, and the tokenizer returns to the data state.
def bogus_comment_state
  text = @stream.chars_until((">"))
  @token_queue << {:type => :Comment, :data => text}

  # Drop the terminator left behind by chars_until.
  @stream.char
  @state = :data_state
  true
end
599
+
600
# Entered after "<!". Two dashes start a comment; the seven characters
# "DOCTYPE" (in any letter case) start a doctype token. Anything else is a
# parse error: every character read here is pushed back and handled as a
# bogus comment.
def markup_declaration_open_state
  char_stack = [@stream.char, @stream.char]

  if char_stack == ["-", "-"]
    @current_token = {:type => :Comment, :data => ""}
    @state = :comment_start_state
    return true
  end

  # Not a comment: read five more characters to test for "DOCTYPE".
  5.times { char_stack << @stream.char }
  doctype = !char_stack.include?(:EOF) && char_stack.join("").upcase == "DOCTYPE"

  if doctype
    @current_token = {:type => :Doctype, :name => "", :publicId => nil, :systemId => nil, :correct => true}
    @state = :doctype_state
  else
    @token_queue << {:type => :ParseError, :data => _("Expected '--' or 'DOCTYPE'. Not found.")}
    @stream.unget(char_stack)
    @state = :bogus_comment_state
  end
  true
end
619
+
620
# First character after "<!--". A dash may begin an immediate close; ">"
# and :EOF are parse errors that emit the (empty) comment and return to the
# data state. Anything else starts the comment text, which is read up to the
# next dash.
def comment_start_state
  data = @stream.char
  case data
  when "-"
    @state = :comment_start_dash_state
  when ">"
    @token_queue << {:type => :ParseError, :data => _("Incorrect comment.")}
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in comment.")}
    @token_queue << @current_token
    @state = :data_state
  else
    text = data + @stream.chars_until("-")
    @current_token[:data] += text
    @state = :comment_state
  end
  true
end
638
+
639
# Entered after "<!---". A further dash moves to the comment-end state; ">"
# and :EOF are parse errors that emit the comment and return to the data
# state. Anything else means the dash was ordinary text: it is appended,
# followed by the character read and everything up to the next dash.
def comment_start_dash_state
  data = @stream.char
  case data
  when "-"
    @state = :comment_end_state
  when ">"
    @token_queue << {:type => :ParseError, :data => _("Incorrect comment.")}
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in comment.")}
    @token_queue << @current_token
    @state = :data_state
  else
    text = '-' + data + @stream.chars_until("-")
    @current_token[:data] += text
    @state = :comment_state
  end
  true
end
657
+
658
# Body of a comment. A dash may begin the closing "--"; :EOF is a parse
# error that emits the comment and returns to the data state. Anything else
# is appended to the comment together with everything up to the next dash.
def comment_state
  data = @stream.char
  case data
  when "-"
    @state = :comment_end_dash_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in comment.")}
    @token_queue << @current_token
    @state = :data_state
  else
    text = data + @stream.chars_until("-")
    @current_token[:data] += text
  end
  true
end
671
+
672
# Entered after a single dash inside a comment. A second dash moves to the
# comment-end state; :EOF is a parse error that emits the comment and
# returns to the data state.
#
# Anything else means the dash was ordinary text: it is appended, followed
# by the character read and everything up to the next dash. The character
# that stopped that scan is then consumed as well, and the state does not
# change, so the tokenizer behaves as if it had just seen that dash.
def comment_end_dash_state
  data = @stream.char
  case data
  when "-"
    @state = :comment_end_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in comment (-)")}
    @token_queue << @current_token
    @state = :data_state
  else
    text = "-" + data + @stream.chars_until("-")
    @current_token[:data] += text
    # Eat the dash (or :EOF) that chars_until stopped in front of.
    @stream.char
  end
  true
end
690
+
691
# Entered after "--" inside a comment. ">" emits the comment and returns to
# the data state. An extra dash is a parse error and is appended to the
# comment text. :EOF is a parse error that emits the comment. Any other
# character is a parse error too: "--" plus that character become comment
# text and the tokenizer goes back to the comment state.
def comment_end_state
  data = @stream.char
  case data
  when ">"
    @token_queue << @current_token
    @state = :data_state
  when "-"
    @token_queue << {:type => :ParseError, :data => _("Unexpected '-' after '--' found in comment.")}
    @current_token[:data] += data
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in comment (--).")}
    @token_queue << @current_token
    @state = :data_state
  else
    @token_queue << {:type => :ParseError, :data => _("Unexpected character in comment found.")}
    @current_token[:data] += "--" + data
    @state = :comment_state
  end
  true
end
711
+
712
# Entered right after the literal "DOCTYPE". A whitespace character is
# expected and consumed; anything else is reported as a parse error and
# pushed back. Either way the next state is before_doctype_name_state.
def doctype_state
  data = @stream.char
  unless SPACE_CHARACTERS.include?(data)
    @token_queue << {:type => :ParseError, :data => _("No space after literal string 'DOCTYPE'.")}
    @stream.unget(data)
  end
  @state = :before_doctype_name_state
  true
end
723
+
724
# Skips whitespace in front of the doctype name. ">" and :EOF are parse
# errors: the doctype is marked incorrect, emitted, and the tokenizer
# returns to the data state. Any other character becomes the first
# character of the name.
def before_doctype_name_state
  data = @stream.char
  return true if SPACE_CHARACTERS.include?(data)

  case data
  when ">"
    @token_queue << {:type => :ParseError, :data => _("Unexpected > character. Expected DOCTYPE name.")}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file. Expected DOCTYPE name.")}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:name] = data
    @state = :doctype_name_state
  end
  true
end
743
+
744
# Accumulates the doctype name one character at a time. Whitespace ends the
# name, ">" emits the doctype, and :EOF is a parse error that marks the
# doctype incorrect before emitting it.
def doctype_name_state
  data = @stream.char
  case
  when SPACE_CHARACTERS.include?(data)
    @state = :after_doctype_name_state
  when data == ">"
    @token_queue << @current_token
    @state = :data_state
  when data == :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE name.")}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:name] += data
  end

  true
end
762
+
763
# Entered after whitespace has followed the doctype name.
#
# Further whitespace is skipped and ">" emits the doctype. :EOF is a parse
# error that marks the doctype incorrect and emits it. Otherwise six
# characters are examined: "public" or "system" (compared after mapping
# ASCII upper case to lower case) select the matching identifier state.
# Anything else is a parse error; the characters are pushed back and the
# rest is handled by the bogus doctype state.
def after_doctype_name_state
  data = @stream.char
  return true if SPACE_CHARACTERS.include?(data)

  if data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @current_token[:correct] = false
    @stream.unget(data)
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
    @token_queue << @current_token
    @state = :data_state
  else
    char_stack = [data]
    # Read through @stream directly, as every other state method does,
    # not through the public reader.
    5.times { char_stack << @stream.char }
    token = char_stack.join('').tr(ASCII_UPPERCASE, ASCII_LOWERCASE)
    complete = !char_stack.include?(:EOF)

    if token == "public" && complete
      @state = :before_doctype_public_identifier_state
    elsif token == "system" && complete
      @state = :before_doctype_system_identifier_state
    else
      @stream.unget(char_stack)
      @token_queue << {:type => :ParseError, :data => _("Expected 'public' or 'system'. Got '#{token}'")}
      @state = :bogus_doctype_state
    end
  end
  true
end
791
+
792
# Entered after the PUBLIC keyword of a doctype. Whitespace is skipped; a
# double or single quote opens the public identifier (initialised to an
# empty string). ">" and :EOF are parse errors that mark the doctype
# incorrect and emit it. Any other character is a parse error handled by the
# bogus doctype state.
def before_doctype_public_identifier_state
  data = @stream.char
  return true if SPACE_CHARACTERS.include?(data)

  case data
  when "\""
    @current_token[:publicId] = ""
    @state = :doctype_public_identifier_double_quoted_state
  when "'"
    @current_token[:publicId] = ""
    @state = :doctype_public_identifier_single_quoted_state
  when ">"
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of DOCTYPE.")}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @token_queue << {:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")}
    @state = :bogus_doctype_state
  end

  true
end
819
+
820
# Reads a double-quoted doctype public identifier one character at a time.
# The closing quote ends it; :EOF is a parse error that marks the doctype
# incorrect and emits it.
def doctype_public_identifier_double_quoted_state
  data = @stream.char
  case data
  when "\""
    @state = :after_doctype_public_identifier_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:publicId] += data
  end
  true
end
834
+
835
# Tokenizer state for the body of a single-quoted DOCTYPE public identifier.
# Reads one character from @stream:
#
# * the closing single quote moves to the after-public-identifier state;
# * :EOF queues a ParseError, flags the current token as not correct, queues
#   the token and returns to the data state;
# * any other character is appended to the token's :publicId.
#
# Always returns true (presumably "keep tokenizing" -- confirm with the caller).
def doctype_public_identifier_single_quoted_state
  char = @stream.char

  case char
  when "'" then @state = :after_doctype_public_identifier_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    # Builds a new string instead of mutating the existing identifier in place.
    @current_token[:publicId] = @current_token[:publicId] + char
  end

  true
end
849
+
850
# Tokenizer state that follows a closed DOCTYPE public identifier. Reads one
# character from @stream:
#
# * a character listed in SPACE_CHARACTERS is skipped (state unchanged);
# * a double or single quote starts an empty :systemId on the current token
#   and moves to the matching quoted system-identifier state;
# * ">" queues the current token as it stands and returns to the data state;
# * :EOF queues a ParseError, flags the token as not correct, queues it and
#   returns to the data state;
# * anything else queues a ParseError and enters the bogus DOCTYPE state.
#
# Always returns true (presumably "keep tokenizing" -- confirm with the caller).
def after_doctype_public_identifier_state
  char = @stream.char
  return true if SPACE_CHARACTERS.include?(char)

  case char
  when "\""
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_double_quoted_state
  when "'"
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_single_quoted_state
  when ">"
    # A DOCTYPE with only a public identifier is emitted without an error.
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @token_queue << {:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")}
    @state = :bogus_doctype_state
  end

  true
end
873
+
874
# Tokenizer state selected after the "system" keyword of a DOCTYPE has been
# matched. Reads one character from @stream:
#
# * a character listed in SPACE_CHARACTERS is skipped (state unchanged);
# * a double or single quote starts an empty :systemId on the current token
#   and moves to the matching quoted system-identifier state;
# * ">" or :EOF queue a ParseError, flag the current token as not correct,
#   queue the token and go back to the data state;
# * anything else queues a ParseError and enters the bogus DOCTYPE state.
#
# Always returns true (presumably "keep tokenizing" -- confirm with the caller).
def before_doctype_system_identifier_state
  char = @stream.char
  return true if SPACE_CHARACTERS.include?(char)

  case char
  when "\""
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_double_quoted_state
  when "'"
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_single_quoted_state
  when ">"
    # NOTE(review): this branch reports "Unexpected character" rather than an
    # "end of DOCTYPE" message; wording kept exactly as it was -- confirm
    # whether that is intentional.
    @token_queue << {:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @token_queue << {:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")}
    @state = :bogus_doctype_state
  end

  true
end
899
+
900
# Tokenizer state for the body of a double-quoted DOCTYPE system identifier.
# Reads one character from @stream:
#
# * the closing double quote moves to the after-system-identifier state;
# * :EOF queues a ParseError, flags the current token as not correct, queues
#   the token and returns to the data state;
# * any other character is appended to the token's :systemId.
#
# Always returns true (presumably "keep tokenizing" -- confirm with the caller).
def doctype_system_identifier_double_quoted_state
  char = @stream.char

  case char
  when "\"" then @state = :after_doctype_system_identifier_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    # Builds a new string instead of mutating the existing identifier in place.
    @current_token[:systemId] = @current_token[:systemId] + char
  end

  true
end
914
+
915
# Tokenizer state for the body of a single-quoted DOCTYPE system identifier.
# Reads one character from @stream:
#
# * the closing single quote moves to the after-system-identifier state;
# * :EOF queues a ParseError, flags the current token as not correct, queues
#   the token and returns to the data state;
# * any other character is appended to the token's :systemId.
#
# Always returns true (presumably "keep tokenizing" -- confirm with the caller).
def doctype_system_identifier_single_quoted_state
  char = @stream.char

  case char
  when "'" then @state = :after_doctype_system_identifier_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    # Builds a new string instead of mutating the existing identifier in place.
    @current_token[:systemId] = @current_token[:systemId] + char
  end

  true
end
929
+
930
# Tokenizer state that follows a closed DOCTYPE system identifier. Reads one
# character from @stream:
#
# * a character listed in SPACE_CHARACTERS is skipped (state unchanged);
# * ">" queues the current token as it stands and returns to the data state;
# * :EOF queues a ParseError, flags the token as not correct, queues it and
#   returns to the data state;
# * anything else (quotes included) queues a ParseError and enters the bogus
#   DOCTYPE state.
#
# Always returns true (presumably "keep tokenizing" -- confirm with the caller).
def after_doctype_system_identifier_state
  char = @stream.char
  return true if SPACE_CHARACTERS.include?(char)

  case char
  when ">"
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @token_queue << {:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")}
    @state = :bogus_doctype_state
  end

  true
end
947
+
948
# Tokenizer state used once a DOCTYPE has been found malformed. Reads one
# character from @stream and unconditionally flags the current token as not
# correct, then:
#
# * ">" queues the current token and returns to the data state;
# * :EOF is handed back to the stream via unget, a ParseError is queued,
#   the current token is queued and the data state is restored;
# * every other character is dropped and the state is left unchanged.
#
# Always returns true (presumably "keep tokenizing" -- confirm with the caller).
def bogus_doctype_state
  char = @stream.char
  @current_token[:correct] = false

  case char
  when ">"
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    # XXX EMIT
    @stream.unget(char)
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in bogus doctype.")}
    # The token was already flagged as not correct above.
    @token_queue << @current_token
    @state = :data_state
  end

  true
end
964
+
965
# Pass-through wrapper applied to message strings: returns its argument
# unchanged (looks like a gettext-style translation placeholder -- confirm).
def _(string)
  string
end
966
+ end
967
+
968
+ end