html5 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. data/History.txt +3 -0
  2. data/Manifest.txt +58 -0
  3. data/README +9 -0
  4. data/Rakefile.rb +17 -0
  5. data/lib/html5/constants.rb +818 -0
  6. data/lib/html5/filters/base.rb +10 -0
  7. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  8. data/lib/html5/filters/optionaltags.rb +198 -0
  9. data/lib/html5/filters/sanitizer.rb +15 -0
  10. data/lib/html5/filters/whitespace.rb +36 -0
  11. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  12. data/lib/html5/html5parser/after_frameset_phase.rb +34 -0
  13. data/lib/html5/html5parser/after_head_phase.rb +50 -0
  14. data/lib/html5/html5parser/before_head_phase.rb +41 -0
  15. data/lib/html5/html5parser/in_body_phase.rb +607 -0
  16. data/lib/html5/html5parser/in_caption_phase.rb +68 -0
  17. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  18. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  19. data/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  20. data/lib/html5/html5parser/in_head_phase.rb +138 -0
  21. data/lib/html5/html5parser/in_row_phase.rb +87 -0
  22. data/lib/html5/html5parser/in_select_phase.rb +84 -0
  23. data/lib/html5/html5parser/in_table_body_phase.rb +83 -0
  24. data/lib/html5/html5parser/in_table_phase.rb +110 -0
  25. data/lib/html5/html5parser/initial_phase.rb +134 -0
  26. data/lib/html5/html5parser/phase.rb +158 -0
  27. data/lib/html5/html5parser/root_element_phase.rb +42 -0
  28. data/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  29. data/lib/html5/html5parser.rb +248 -0
  30. data/lib/html5/inputstream.rb +654 -0
  31. data/lib/html5/liberalxmlparser.rb +158 -0
  32. data/lib/html5/sanitizer.rb +188 -0
  33. data/lib/html5/serializer/htmlserializer.rb +180 -0
  34. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  35. data/lib/html5/serializer.rb +2 -0
  36. data/lib/html5/tokenizer.rb +968 -0
  37. data/lib/html5/treebuilders/base.rb +334 -0
  38. data/lib/html5/treebuilders/hpricot.rb +231 -0
  39. data/lib/html5/treebuilders/rexml.rb +208 -0
  40. data/lib/html5/treebuilders/simpletree.rb +185 -0
  41. data/lib/html5/treebuilders.rb +24 -0
  42. data/lib/html5/treewalkers/base.rb +154 -0
  43. data/lib/html5/treewalkers/hpricot.rb +48 -0
  44. data/lib/html5/treewalkers/rexml.rb +48 -0
  45. data/lib/html5/treewalkers/simpletree.rb +48 -0
  46. data/lib/html5/treewalkers.rb +26 -0
  47. data/lib/html5.rb +13 -0
  48. data/parse.rb +217 -0
  49. data/tests/preamble.rb +82 -0
  50. data/tests/test_encoding.rb +35 -0
  51. data/tests/test_lxp.rb +263 -0
  52. data/tests/test_parser.rb +68 -0
  53. data/tests/test_sanitizer.rb +142 -0
  54. data/tests/test_serializer.rb +68 -0
  55. data/tests/test_stream.rb +62 -0
  56. data/tests/test_tokenizer.rb +94 -0
  57. data/tests/test_treewalkers.rb +116 -0
  58. data/tests/tokenizer_test_parser.rb +63 -0
  59. metadata +120 -0
@@ -0,0 +1,968 @@
1
+ require 'html5/constants'
2
+ require 'html5/inputstream'
3
+
4
+ module HTML5
5
+
6
+ # This class takes care of tokenizing HTML.
7
+ #
8
+ # * @current_token
9
+ # Holds the token that is currently being processed.
10
+ #
11
+ # * @state
12
+ # Holds a reference to the method to be invoked... XXX
13
+ #
14
+ # * @states
15
+ # Holds a mapping between states and methods that implement the state.
16
+ #
17
+ # * @stream
18
+ # Points to HTMLInputStream object.
19
+
20
+ class HTMLTokenizer
21
+ attr_accessor :content_model_flag, :current_token
22
+ attr_reader :stream
23
+
24
+ # XXX need to fix documentation
25
+
26
# Creates a tokenizer that reads from +stream+.
#
# +stream+ and +options+ are passed unchanged to HTMLInputStream.new. Two
# options are also read here, :lowercase_element_name and
# :lowercase_attr_name; each stays enabled unless it is explicitly false.
def initialize(stream, options = {})
  @stream = HTMLInputStream.new(stream, options)

  # Name lowercasing is on unless the caller passed an explicit false.
  @lowercase_element_name = !(options[:lowercase_element_name] == false)
  @lowercase_attr_name = !(options[:lowercase_attr_name] == false)

  # Token under construction, and tokens waiting to be yielded by #each.
  @current_token = nil
  @token_queue = []

  # Initial tokenizer state.
  @state = :data_state
  @content_model_flag = :PCDATA
  @escapeFlag = false
  @lastFourChars = []
end
43
+
44
+ # This is where the magic happens.
45
+ #
46
+ # We do our usual processing through the states and when we have a token
47
+ # to return we yield the token which pauses processing until the next token
48
+ # is requested.
49
# Runs the tokenizer, yielding every token as a Hash.
#
# The method named by @state is invoked repeatedly; after each step any
# errors collected on the stream are yielded as :ParseError tokens, followed
# by the tokens the step queued. The loop stops when a state method returns
# a false value (data_state does so at end of file).
#
# Without a block an Enumerator over the tokens is returned instead of
# failing at the first yield.
def each
  return enum_for(:each) unless block_given?

  @token_queue = []
  while send(@state)
    # Stream-level errors are reported before the tokens of the same step.
    yield(:type => :ParseError, :data => @stream.errors.shift) until @stream.errors.empty?
    yield @token_queue.shift until @token_queue.empty?
  end
end
58
+
59
+ # Below are various helper functions that the tokenizer states use.
60
+
61
+ # If the next character is a '>', convert the current_token into
62
+ # an EmptyTag
63
+
64
# Handles a "/" met inside a tag. Peeks at the following character: a start
# tag directly followed by ">" is turned into an :EmptyTag, anything else
# queues a ParseError. The peeked character is always handed back to the
# stream so the calling state can process it.
def process_solidus_in_tag
  lookahead = @stream.char

  closes_start_tag = (@current_token[:type] == :StartTag && lookahead == ">")
  if closes_start_tag
    @current_token[:type] = :EmptyTag
  else
    @token_queue << {:type => :ParseError, :data => _("Solidus (/) incorrectly placed in tag.")}
  end

  @stream.unget(lookahead)
end
79
+
80
+ # This function returns either U+FFFD or the character based on the
81
+ # decimal or hexadecimal representation. It also discards ";" if present.
82
+ # If it is not present, a ParseError token is appended to @token_queue.
83
+
84
# Consumes the digits of a numeric character reference and returns the
# character it denotes as a UTF-8 string.
#
# isHex:: true to read hexadecimal digits (HEX_DIGITS, radix 16), otherwise
#         decimal digits (DIGITS, radix 10).
#
# Code point 13 is replaced by 10, and 128..159 are mapped through
# ENTITIES_WINDOWS1252; both queue a ParseError. Values outside
# 1..0x10FFFF or inside the surrogate range produce U+FFFD plus a
# ParseError. A terminating ";" is swallowed; any other terminator is
# handed back to the stream and reported as a ParseError.
def consume_number_entity(isHex)
  allowed, radix = isHex ? [HEX_DIGITS, 16] : [DIGITS, 10]

  # Collect every character that is a valid digit for the radix.
  digits = []
  c = @stream.char
  while allowed.include?(c) && c != :EOF
    digits << c
    c = @stream.char
  end

  codepoint = digits.join('').to_i(radix)

  if codepoint == 13
    @token_queue << {:type => :ParseError, :data => _("Incorrect CR newline entity. Replaced with LF.")}
    codepoint = 10
  elsif (128..159).include?(codepoint)
    # References into 128..159 are remapped via the windows-1252 table.
    @token_queue << {:type => :ParseError, :data => _("Entity used with illegal number (windows-1252 reference).")}
    codepoint = ENTITIES_WINDOWS1252[codepoint - 128]
  end

  in_range = 0 < codepoint && codepoint <= 0x10FFFF
  if in_range && !(0xD800 <= codepoint && codepoint <= 0xDFFF)
    char = [codepoint].pack('U')
  else
    char = [0xFFFD].pack('U')
    @token_queue << {:type => :ParseError, :data => _("Numeric entity represents an illegal codepoint.")}
  end

  # c now holds the first character after the digits.
  unless c == ";"
    @token_queue << {:type => :ParseError, :data => _("Numeric entity didn't end with ';'.")}
    @stream.unget(c)
  end

  char
end
136
+
137
# Tries to consume a character reference; the leading "&" has already been
# read by the caller.
#
# from_attribute:: true when called while tokenizing an attribute value.
#
# Returns the replacement text of a recognised numeric or named reference,
# "&" when a named reference without a trailing ";" inside an attribute is
# directly followed by a letter or digit (the text is then kept literally),
# or nil when no reference was recognised. Characters that do not belong to
# a consumed reference are handed back to the stream, and problems are
# reported as ParseError tokens on @token_queue.
def consume_entity(from_attribute=false)
  char = nil
  char_stack = [@stream.char]
  first = char_stack[0]

  if SPACE_CHARACTERS.include?(first) or [:EOF, '<', '&'].include?(first)
    # Not a reference at all: put the character back untouched.
    @stream.unget(char_stack)
  elsif first == '#'
    # Possibly a numeric reference; look at the next two characters.
    char_stack += [@stream.char, @stream.char]
    if char_stack[0 .. 1].include? :EOF
      # End of file right after "#": hand back everything before :EOF.
      char_stack = char_stack[0...char_stack.index(:EOF)]
      @stream.unget(char_stack)
      @token_queue << {:type => :ParseError, :data => _("Numeric entity expected. Got end of file instead.")}
    elsif char_stack[1].downcase == "x" and HEX_DIGITS.include? char_stack[2]
      # Hexadecimal reference: the "x" stays consumed, the digit goes back.
      @stream.unget(char_stack[2])
      char = consume_number_entity(true)
    elsif DIGITS.include? char_stack[1]
      # Decimal reference: everything after "#" goes back.
      @stream.unget(char_stack[1..-1])
      char = consume_number_entity(false)
    else
      @stream.unget(char_stack)
      @token_queue << {:type => :ParseError, :data => _("Numeric entity expected but none found.")}
    end
  else
    # Possibly a named reference. Keep reading while the consumed text is
    # still a prefix of some entity name, remembering the longest name
    # matched so far (so that e.g. "&noti" still resolves "not").
    candidates = ENTITIES.keys
    candidates.reject! {|e| e[0, 1] != first}
    entity_name = nil

    while char_stack.last != :EOF
      name = char_stack.join('')
      break unless candidates.any? {|e| e[0...name.length] == name}

      candidates.reject! {|e| e[0...name.length] != name}
      char_stack.push(@stream.char)

      if ENTITIES.include? name
        entity_name = name
        # A name ending in ";" cannot be extended any further.
        break if entity_name[-1, 1] == ';'
      end
    end

    if entity_name.nil?
      @token_queue << {:type => :ParseError, :data => _("Named entity expected. Got none.")}
      @stream.unget(char_stack)
    else
      terminated = (entity_name[-1, 1] == ';')
      unless terminated
        @token_queue << {:type => :ParseError, :data => _("Named entity didn't end with ';'.")}
      end

      # The literal-text rule applies only to references that lack the
      # closing ";". The character to inspect is the one right after the
      # matched name; it may be :EOF, which is neither letter nor digit.
      follower = char_stack[entity_name.length]
      if !terminated and from_attribute and follower.is_a?(String) and
         (ASCII_LETTERS.include?(follower) or DIGITS.include?(follower))
        @stream.unget(char_stack)
        char = '&'
      else
        char = ENTITIES[entity_name]
        @stream.unget(char_stack[entity_name.length..-1])
      end
    end
  end
  return char
end
217
+
218
+ # This method replaces the need for "entityInAttributeValueState".
219
# Appends the result of consuming a character reference to the value of the
# attribute currently being built (the last entry of @current_token[:data]).
# When no reference is recognised a literal "&" is appended instead.
def process_entity_in_attribute
  replacement = consume_entity(true) || "&"
  @current_token[:data][-1][1] += replacement
end
227
+
228
+ # This method is a generic handler for emitting the tags. It also sets
229
+ # the state to "data" because that's what's needed after a token has been
230
+ # emitted.
231
# Queues @current_token for #each and switches back to the data state.
# Only :StartTag, :EndTag and :EmptyTag tokens are handled; for any other
# token type nothing happens. The tag name is lowercased first when
# @lowercase_element_name is set.
def emit_current_token
  token = @current_token
  return unless [:StartTag, :EndTag, :EmptyTag].include?(token[:type])

  token[:name] = token[:name].downcase if @lowercase_element_name
  @token_queue << token
  @state = :data_state
end
243
+
244
+ # Below are the various tokenizer states worked out.
245
+
246
+ # XXX AT Perhaps we should have Hixie run some evaluation on billions of
247
+ # documents to figure out what the order of the various if and elsif
248
+ # statements should be.
249
# Data state: reads one character and either switches state or queues
# character tokens.
#
# Returns false at end of file (which ends the loop in #each), true
# otherwise.
def data_state
  data = @stream.char
  model = @content_model_flag
  escapable = (model == :CDATA || model == :RCDATA)

  # In CDATA/RCDATA keep a window of the last four characters so that the
  # "<!--" and "-->" escape markers can be recognised below.
  if escapable
    @lastFourChars << data
    @lastFourChars.shift if @lastFourChars.length > 4
  end

  if data == "&" && (model == :PCDATA || model == :RCDATA) && !@escapeFlag
    @state = :entity_data_state
  elsif data == "-" && escapable && !@escapeFlag && @lastFourChars.join('') == "<!--"
    # Start of an escaped section.
    @escapeFlag = true
    @token_queue << {:type => :Characters, :data => data}
  elsif data == "<" && !@escapeFlag && [:PCDATA, :CDATA, :RCDATA].include?(model)
    @state = :tag_open_state
  elsif data == ">" && @escapeFlag && escapable && @lastFourChars[1..-1].join('') == "-->"
    # End of an escaped section.
    @escapeFlag = false
    @token_queue << {:type => :Characters, :data => data}
  elsif data == :EOF
    # Tokenization ends.
    return false
  elsif SPACE_CHARACTERS.include?(data)
    # A run of space characters is emitted as its own token type.
    spaces = data + @stream.chars_until(SPACE_CHARACTERS, true)
    @token_queue << {:type => :SpaceCharacters, :data => spaces}
  else
    text = data + @stream.chars_until(%w[& < > -])
    @token_queue << {:type => :Characters, :data => text}
  end
  true
end
287
+
288
# Entity data state: queues the character reference found after "&" as a
# :Characters token, or a literal "&" when consume_entity recognises none,
# then returns to the data state.
def entity_data_state
  text = consume_entity || "&"
  @token_queue << {:type => :Characters, :data => text}
  @state = :data_state
  true
end
298
+
299
# Tag open state: decides what follows a "<". Always returns true.
def tag_open_state
  data = @stream.char

  unless @content_model_flag == :PCDATA
    # Any other content model: only "</" can start markup, everything else
    # makes the "<" plain text.
    if data == "/"
      @state = :close_tag_open_state
    else
      @token_queue << {:type => :Characters, :data => "<"}
      @stream.unget(data)
      @state = :data_state
    end
    return true
  end

  case data
  when "!"
    @state = :markup_declaration_open_state
  when "/"
    @state = :close_tag_open_state
  when ">"
    @token_queue << {:type => :ParseError, :data => _("Expected tag name. Got '>' instead.")}
    @token_queue << {:type => :Characters, :data => "<>"}
    @state = :data_state
  when "?"
    # Processing instructions are treated as bogus comments.
    @token_queue.push({:type => :ParseError, :data => _("Expected tag name. Got '?' instead (HTML doesn't support processing instructions).")})
    @stream.unget(data)
    @state = :bogus_comment_state
  else
    if data != :EOF && ASCII_LETTERS.include?(data)
      # First letter of a start tag name.
      @current_token = {:type => :StartTag, :name => data, :data => []}
      @state = :tag_name_state
    else
      @token_queue << {:type => :ParseError, :data => _("Expected tag name. Got something else instead")}
      @token_queue << {:type => :Characters, :data => "<"}
      @stream.unget(data)
      @state = :data_state
    end
  end
  true
end
343
+
344
# Close tag open state: entered after "</". Always returns true.
#
# In RCDATA/CDATA the end tag only counts when the following characters
# match the name of @current_token (case-insensitively) and are followed by
# a space character, ">", "/", "<" or end of file. If so the content model
# is switched to PCDATA; otherwise "</" is emitted as text.
def close_tag_open_state
  if [:RCDATA, :CDATA].include?(@content_model_flag)
    lookahead = nil
    if @current_token
      # Peek at name-length + 1 characters (stopping early at :EOF) and put
      # them straight back; they are only needed for the comparison.
      lookahead = []
      (@current_token[:name].length + 1).times do
        lookahead << @stream.char
        break if lookahead.last == :EOF
      end
      @stream.unget(lookahead)
    end

    matches = @current_token &&
              @current_token[:name].downcase == lookahead[0...-1].join('').downcase &&
              (SPACE_CHARACTERS + [">", "/", "<", :EOF]).include?(lookahead[-1])

    unless matches
      @token_queue << {:type => :Characters, :data => "</"}
      @state = :data_state
      return true
    end

    # Switching here means it need not be done when the end tag is emitted.
    @content_model_flag = :PCDATA
  end

  data = @stream.char
  case data
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Expected closing tag. Unexpected end of file.")}
    @token_queue << {:type => :Characters, :data => "</"}
    @state = :data_state
  when ">"
    @token_queue << {:type => :ParseError, :data => _("Expected closing tag. Got '>' instead. Ignoring '</>'.")}
    @state = :data_state
  else
    if ASCII_LETTERS.include?(data)
      @current_token = {:type => :EndTag, :name => data, :data => []}
      @state = :tag_name_state
    else
      @token_queue << {:type => :ParseError, :data => _("Expected closing tag. Unexpected character '#{data}' found.")}
      @stream.unget(data)
      @state = :bogus_comment_state
    end
  end

  true
end
403
+
404
# Tag name state: extends the name of @current_token until a space
# character, "/", ">" or end of file is met. Always returns true.
def tag_name_state
  data = @stream.char
  case data
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in the tag name.")}
    emit_current_token
  when ">"
    emit_current_token
  when "/"
    process_solidus_in_tag
    @state = :before_attribute_name_state
  when *SPACE_CHARACTERS
    @state = :before_attribute_name_state
  else
    # A letter pulls in the whole run of letters that follows it; any other
    # character is appended on its own.
    piece = ASCII_LETTERS.include?(data) ? data + @stream.chars_until(ASCII_LETTERS, true) : data
    @current_token[:name] += piece
  end
  true
end
423
+
424
# Before attribute name state: skips space characters and starts a new
# [name, value] pair on the first other character. Always returns true.
def before_attribute_name_state
  data = @stream.char
  case data
  when *SPACE_CHARACTERS
    # Swallow the rest of the whitespace run.
    @stream.chars_until(SPACE_CHARACTERS, true)
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file. Expected attribute name instead.")}
    emit_current_token
  when ">"
    emit_current_token
  when "/"
    process_solidus_in_tag
  else
    # Letters and every remaining character alike begin an attribute name.
    @current_token[:data].push([data, ""])
    @state = :attribute_name_state
  end
  true
end
444
+
445
# Attribute name state: extends the name of the attribute being built (the
# last pair in @current_token[:data]). Always returns true.
#
# When the name is complete it is lowercased if @lowercase_attr_name is
# set, and a ParseError is queued if an earlier attribute of the same token
# has the same name. The duplicate itself is not removed here.
def attribute_name_state
  data = @stream.char
  leaving = true
  emit = false

  case data
  when "="
    @state = :before_attribute_value_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in attribute name.")}
    @state = :data_state
    emit = true
  when ">"
    # Emission is deferred until after the duplicate check below.
    emit = true
  when "/"
    process_solidus_in_tag
    @state = :before_attribute_name_state
  when *SPACE_CHARACTERS
    @state = :after_attribute_name_state
  else
    # A letter pulls in the whole run of letters that follows it; any other
    # character is appended on its own.
    piece = ASCII_LETTERS.include?(data) ? data + @stream.chars_until(ASCII_LETTERS, true) : data
    @current_token[:data][-1][0] += piece
    leaving = false
  end

  return true unless leaving

  attrs = @current_token[:data]
  attrs[-1][0] = attrs.last.first.downcase if @lowercase_attr_name

  # Report a duplicate name once, however many earlier copies exist.
  newest = attrs.last.first
  if attrs[0...-1].any? { |pair| pair.first == newest }
    @token_queue << {:type => :ParseError, :data =>_("Dropped duplicate attribute on tag.")}
  end

  emit_current_token if emit
  true
end
491
+
492
# After attribute name state: space characters have followed an attribute
# name. Waits for "=", the end of the tag, or the start of another
# attribute. Always returns true.
def after_attribute_name_state
  data = @stream.char
  case data
  when *SPACE_CHARACTERS
    # Swallow the rest of the whitespace run.
    @stream.chars_until(SPACE_CHARACTERS, true)
  when "="
    @state = :before_attribute_value_state
  when ">"
    emit_current_token
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file. Expected = or end of tag.")}
    emit_current_token
  when "/"
    process_solidus_in_tag
    @state = :before_attribute_name_state
  else
    # Letters and every remaining character alike begin a new attribute.
    @current_token[:data].push([data, ""])
    @state = :attribute_name_state
  end
  true
end
515
+
516
# Before attribute value state: entered after "=". Picks the quoted or
# unquoted value state based on the first non-space character. Always
# returns true.
def before_attribute_value_state
  data = @stream.char
  case data
  when *SPACE_CHARACTERS
    # Swallow the rest of the whitespace run.
    @stream.chars_until(SPACE_CHARACTERS, true)
  when "\""
    @state = :attribute_value_double_quoted_state
  when "'"
    @state = :attribute_value_single_quoted_state
  when "&"
    # Hand the "&" back so the unquoted state processes the reference.
    @state = :attribute_value_unquoted_state
    @stream.unget(data)
  when ">"
    emit_current_token
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file. Expected attribute value.")}
    emit_current_token
  else
    # First character of an unquoted value.
    @current_token[:data][-1][1] += data
    @state = :attribute_value_unquoted_state
  end
  true
end
538
+
539
# Attribute value (double-quoted) state: collects text up to the closing
# double quote, resolving character references on the way. Always returns
# true.
def attribute_value_double_quoted_state
  data = @stream.char
  case data
  when "\""
    @state = :before_attribute_name_state
  when "&"
    process_entity_in_attribute
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in attribute value (\").")}
    emit_current_token
  else
    # Take everything up to the next quote or "&" in one go.
    @current_token[:data][-1][1] += data + @stream.chars_until(["\"", "&"])
  end
  true
end
553
+
554
# Attribute value (single-quoted) state: collects text up to the closing
# single quote, resolving character references on the way. Always returns
# true.
def attribute_value_single_quoted_state
  data = @stream.char
  case data
  when "'"
    @state = :before_attribute_name_state
  when "&"
    process_entity_in_attribute
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in attribute value (').")}
    emit_current_token
  else
    # Take everything up to the next quote or "&" in one go.
    @current_token[:data][-1][1] += data + @stream.chars_until(["'", "&"])
  end
  true
end
569
+
570
# Attribute value (unquoted) state: collects text until a space character,
# ">" or end of file, resolving character references on the way. Always
# returns true.
def attribute_value_unquoted_state
  data = @stream.char
  case data
  when *SPACE_CHARACTERS
    @state = :before_attribute_name_state
  when "&"
    process_entity_in_attribute
  when ">"
    emit_current_token
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in attribute value.")}
    emit_current_token
  else
    # Take everything up to the next "&", ">", "<" or space character.
    stoppers = ["&", ">","<"] + SPACE_CHARACTERS
    @current_token[:data][-1][1] += data + @stream.chars_until(stoppers)
  end
  true
end
586
+
587
# Bogus comment state: queues everything up to the next ">" as a :Comment
# token, consumes one more character after it, and returns to the data
# state. Always returns true.
def bogus_comment_state
  text = @stream.chars_until((">"))
  @token_queue << {:type => :Comment, :data => text}

  # Drop the character that ended the comment.
  @stream.char
  @state = :data_state
  true
end
599
+
600
# Markup declaration open state: entered after "<!". Recognises "--" (start
# of a comment) and a case-insensitive "DOCTYPE"; anything else is handed
# back to the stream and treated as a bogus comment. Always returns true.
def markup_declaration_open_state
  char_stack = [@stream.char, @stream.char]

  if char_stack == ["-", "-"]
    @current_token = {:type => :Comment, :data => ""}
    @state = :comment_start_state
    return true
  end

  # Read five more characters so the seven together can spell DOCTYPE.
  5.times { char_stack.push(@stream.char) }
  is_doctype = !char_stack.include?(:EOF) && char_stack.join("").upcase == "DOCTYPE"

  if is_doctype
    @current_token = {:type => :Doctype, :name => "", :publicId => nil, :systemId => nil, :correct => true}
    @state = :doctype_state
  else
    @token_queue << {:type => :ParseError, :data => _("Expected '--' or 'DOCTYPE'. Not found.")}
    @stream.unget(char_stack)
    @state = :bogus_comment_state
  end
  true
end
619
+
620
# Comment start state: entered right after "<!--". Always returns true.
def comment_start_state
  data = @stream.char
  case data
  when "-"
    @state = :comment_start_dash_state
  when ">", :EOF
    # Either way the (empty) comment is emitted along with an error.
    message = data == ">" ? _("Incorrect comment.") : _("Unexpected end of file in comment.")
    @token_queue << {:type => :ParseError, :data => message}
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:data] += data + @stream.chars_until("-")
    @state = :comment_state
  end
  true
end
638
+
639
# Comment start dash state: one "-" has been seen right after "<!--".
# Always returns true.
def comment_start_dash_state
  data = @stream.char
  case data
  when "-"
    @state = :comment_end_state
  when ">", :EOF
    # Either way the comment is emitted along with an error.
    message = data == ">" ? _("Incorrect comment.") : _("Unexpected end of file in comment.")
    @token_queue << {:type => :ParseError, :data => message}
    @token_queue << @current_token
    @state = :data_state
  else
    # The dash seen earlier turns out to be comment text.
    @current_token[:data] += '-' + data + @stream.chars_until("-")
    @state = :comment_state
  end
  true
end
657
+
658
# Comment state: accumulates comment text until a "-" or end of file.
# Always returns true.
def comment_state
  data = @stream.char
  case data
  when "-"
    @state = :comment_end_dash_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in comment.")}
    @token_queue << @current_token
    @state = :data_state
  else
    # Take everything up to the next dash in one go.
    @current_token[:data] += data + @stream.chars_until("-")
  end
  true
end
671
+
672
# Comment end dash state: a single "-" has been seen inside a comment.
# Always returns true.
def comment_end_dash_state
  data = @stream.char
  case data
  when "-"
    @state = :comment_end_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in comment (-)")}
    @token_queue << @current_token
    @state = :data_state
  else
    # The dash seen earlier was ordinary text; append it together with the
    # run up to the next dash.
    @current_token[:data] += "-" + data + @stream.chars_until("-")
    # Consume the character that ended the run as well. The state is left
    # unchanged, so that character counts as the single dash already seen.
    @stream.char
  end
  true
end
690
+
691
# Comment end state: "--" has been seen inside a comment. Always returns
# true.
def comment_end_state
  data = @stream.char
  case data
  when ">"
    # Well-formed end of comment.
    @token_queue << @current_token
    @state = :data_state
  when "-"
    # A third dash: keep it as text and stay in this state.
    @token_queue << {:type => :ParseError, :data => _("Unexpected '-' after '--' found in comment.")}
    @current_token[:data] += data
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in comment (--).")}
    @token_queue << @current_token
    @state = :data_state
  else
    # The "--" was not the end after all; put it back into the text.
    @token_queue << {:type => :ParseError, :data => _("Unexpected character in comment found.")}
    @current_token[:data] += "--" + data
    @state = :comment_state
  end
  true
end
711
+
712
# DOCTYPE state: entered right after the literal "DOCTYPE". A space
# character is expected; anything else is reported and handed back to the
# stream. Either way the tokenizer moves on to the before-DOCTYPE-name
# state. Always returns true.
def doctype_state
  data = @stream.char
  unless SPACE_CHARACTERS.include?(data)
    @token_queue << {:type => :ParseError, :data => _("No space after literal string 'DOCTYPE'.")}
    @stream.unget(data)
  end
  @state = :before_doctype_name_state
  true
end
723
+
724
# Before DOCTYPE name state: skips space characters and starts the DOCTYPE
# name with the first other character. ">" or end of file here emits the
# token marked as incorrect. Always returns true.
def before_doctype_name_state
  data = @stream.char
  # Leading space characters are simply skipped.
  return true if SPACE_CHARACTERS.include?(data)

  if data == ">" || data == :EOF
    message = if data == ">"
                _("Unexpected > character. Expected DOCTYPE name.")
              else
                _("Unexpected end of file. Expected DOCTYPE name.")
              end
    @token_queue << {:type => :ParseError, :data => message}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:name] = data
    @state = :doctype_name_state
  end
  true
end
743
+
744
# DOCTYPE name state: appends characters to the DOCTYPE name until a space
# character, ">" or end of file. Always returns true.
def doctype_name_state
  data = @stream.char
  case data
  when *SPACE_CHARACTERS
    @state = :after_doctype_name_state
  when ">"
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE name.")}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:name] += data
  end
  true
end
762
+
763
# After DOCTYPE name state: skips space characters, then expects ">", or
# the keyword "public" or "system" (compared after ASCII-lowercasing).
# Anything else is handed back to the stream and the tokenizer switches to
# the bogus DOCTYPE state. Always returns true.
def after_doctype_name_state
  data = @stream.char
  if SPACE_CHARACTERS.include? data
    # Space characters between the name and what follows are skipped.
  elsif data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @current_token[:correct] = false
    @stream.unget(data)
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
    @token_queue << @current_token
    @state = :data_state
  else
    # Read five more characters so the six together can spell a keyword.
    # The instance variable is used directly, as in every other state,
    # rather than going through the public reader.
    char_stack = [data]
    5.times { char_stack << @stream.char }
    token = char_stack.join('').tr(ASCII_UPPERCASE,ASCII_LOWERCASE)
    if token == "public" and !char_stack.include?(:EOF)
      @state = :before_doctype_public_identifier_state
    elsif token == "system" and !char_stack.include?(:EOF)
      @state = :before_doctype_system_identifier_state
    else
      @stream.unget(char_stack)
      @token_queue << {:type => :ParseError, :data => _("Expected 'public' or 'system'. Got '#{token}'")}
      @state = :bogus_doctype_state
    end
  end
  return true
end
791
+
792
# Before DOCTYPE public identifier state: skips space characters and
# expects a quote opening the public identifier. Always returns true.
def before_doctype_public_identifier_state
  data = @stream.char

  case data
  when *SPACE_CHARACTERS
    # Space characters before the identifier are skipped.
  when "\""
    @current_token[:publicId] = ""
    @state = :doctype_public_identifier_double_quoted_state
  when "'"
    @current_token[:publicId] = ""
    @state = :doctype_public_identifier_single_quoted_state
  when ">", :EOF
    # The DOCTYPE ends before any identifier: emit it marked as incorrect.
    message = data == ">" ? _("Unexpected end of DOCTYPE.") : _("Unexpected end of file in DOCTYPE.")
    @token_queue << {:type => :ParseError, :data => message}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @token_queue << {:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")}
    @state = :bogus_doctype_state
  end

  true
end
819
+
820
# DOCTYPE public identifier (double-quoted) state: appends characters to
# the public identifier until the closing double quote. End of file emits
# the token marked as incorrect. Always returns true.
def doctype_public_identifier_double_quoted_state
  data = @stream.char
  case data
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  when "\""
    @state = :after_doctype_public_identifier_state
  else
    @current_token[:publicId] += data
  end
  true
end
834
+
835
# DOCTYPE public identifier (single-quoted) state: appends characters to
# the public identifier until the closing single quote. End of file emits
# the token marked as incorrect. Always returns true.
def doctype_public_identifier_single_quoted_state
  data = @stream.char
  case data
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  when "'"
    @state = :after_doctype_public_identifier_state
  else
    @current_token[:publicId] += data
  end
  true
end
849
+
850
# Tokenizer state: a DOCTYPE public identifier has just been closed.
#
# Consumes one character from @stream:
# * whitespace    - ignored; the state is left unchanged
# * " or '        - starts an empty :systemId on the current token and
#                   switches to the matching quoted system-identifier state
# * >             - emits the current token as-is and switches to :data_state
# * :EOF          - queues a ParseError, flags the current token as not
#                   correct, emits it and switches to :data_state
# * anything else - queues a ParseError and switches to
#                   :bogus_doctype_state
#
# Always returns true.
def after_doctype_public_identifier_state
  data = @stream.char
  # Whitespace after the identifier carries no meaning; stay in this state.
  return true if SPACE_CHARACTERS.include?(data)

  case data
  when "\""
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_double_quoted_state
  when "'"
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_single_quoted_state
  when ">"
    # Closing here is not an error, so :correct is left untouched.
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @token_queue << {:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")}
    @state = :bogus_doctype_state
  end

  true
end
873
+
874
# Tokenizer state: waiting for the opening quote of a DOCTYPE system
# identifier.
#
# Consumes one character from @stream:
# * whitespace    - ignored; the state is left unchanged
# * " or '        - starts an empty :systemId on the current token and
#                   switches to the matching quoted state
# * > or :EOF     - queues a ParseError, flags the current token as not
#                   correct, emits it and switches to :data_state
# * anything else - queues a ParseError and switches to
#                   :bogus_doctype_state
#
# Always returns true.
def before_doctype_system_identifier_state
  data = @stream.char
  # Whitespace before the identifier carries no meaning; stay in this state.
  return true if SPACE_CHARACTERS.include?(data)

  case data
  when "\""
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_double_quoted_state
  when "'"
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_single_quoted_state
  when ">"
    # NOTE(review): the public-identifier counterpart of this state reports
    # "Unexpected end of DOCTYPE." for ">"; the message here is kept exactly
    # as it was.
    @token_queue << {:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    # The token is not emitted here; the bogus state takes over from this point.
    @token_queue << {:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")}
    @state = :bogus_doctype_state
  end

  true
end
899
+
900
# Tokenizer state: inside a double-quoted DOCTYPE system identifier.
#
# Consumes one character from @stream:
# * "             - ends the identifier; switches to
#                   :after_doctype_system_identifier_state
# * :EOF          - queues a ParseError, flags the current token as not
#                   correct, emits it and switches to :data_state
# * anything else - is appended to the token's :systemId
#
# Always returns true.
def doctype_system_identifier_double_quoted_state
  data = @stream.char

  case data
  when "\""
    @state = :after_doctype_system_identifier_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    # += builds a new string rather than mutating the stored one in place.
    @current_token[:systemId] += data
  end

  true
end
914
+
915
# Tokenizer state: inside a single-quoted DOCTYPE system identifier.
#
# Consumes one character from @stream:
# * '             - ends the identifier; switches to
#                   :after_doctype_system_identifier_state
# * :EOF          - queues a ParseError, flags the current token as not
#                   correct, emits it and switches to :data_state
# * anything else - is appended to the token's :systemId
#
# Always returns true.
def doctype_system_identifier_single_quoted_state
  data = @stream.char

  case data
  when "'"
    @state = :after_doctype_system_identifier_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    # += builds a new string rather than mutating the stored one in place.
    @current_token[:systemId] += data
  end

  true
end
929
+
930
# Tokenizer state: a DOCTYPE system identifier has just been closed.
#
# Consumes one character from @stream:
# * whitespace    - ignored; the state is left unchanged
# * >             - emits the current token as-is and switches to :data_state
# * :EOF          - queues a ParseError, flags the current token as not
#                   correct, emits it and switches to :data_state
# * anything else - queues a ParseError and switches to
#                   :bogus_doctype_state
#
# Always returns true.
def after_doctype_system_identifier_state
  data = @stream.char
  # Whitespace after the identifier carries no meaning; stay in this state.
  return true if SPACE_CHARACTERS.include?(data)

  case data
  when ">"
    # Closing here is not an error, so :correct is left untouched.
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @token_queue << {:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")}
    @state = :bogus_doctype_state
  end

  true
end
947
+
948
# Tokenizer state: skipping the remainder of a malformed DOCTYPE.
#
# Consumes one character from @stream and unconditionally flags the current
# token as not correct, then:
# * >             - emits the current token and switches to :data_state
# * :EOF          - hands the :EOF marker back to the stream, queues a
#                   ParseError, emits the current token and switches to
#                   :data_state
# * anything else - is discarded; the state is left unchanged
#
# Always returns true.
def bogus_doctype_state
  data = @stream.char
  # Cleared on every call, so no branch below needs to clear it again.
  @current_token[:correct] = false

  case data
  when ">"
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    # XXX EMIT (marker carried over from the original code)
    @stream.unget(data)
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in bogus doctype.")}
    @token_queue << @current_token
    @state = :data_state
  end

  true
end
964
+
965
# Pass-through wrapper applied to the parse-error message strings in the
# tokenizer states; returns its argument unchanged.
# NOTE(review): the name suggests a gettext-style translation hook - confirm.
def _(string)
  string
end
966
+ end
967
+
968
+ end