mullet 0.0.0 → 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. data/lib/mullet/container.rb +8 -4
  2. data/lib/mullet/default_model.rb +9 -9
  3. data/lib/mullet/default_nested_model.rb +7 -6
  4. data/lib/mullet/html/attribute_command.rb +8 -4
  5. data/lib/mullet/html/attributes.rb +22 -0
  6. data/lib/mullet/html/command.rb +4 -3
  7. data/lib/mullet/html/command_element_renderer.rb +19 -0
  8. data/lib/mullet/html/element.rb +41 -0
  9. data/lib/mullet/html/element_renderer.rb +261 -0
  10. data/lib/mullet/html/filtered_element_handler.rb +87 -0
  11. data/lib/mullet/html/for_element_renderer.rb +47 -0
  12. data/lib/mullet/html/if_element_renderer.rb +46 -0
  13. data/lib/mullet/html/layout.rb +48 -0
  14. data/lib/mullet/html/message.rb +55 -0
  15. data/lib/mullet/html/message_attribute_command.rb +30 -0
  16. data/lib/mullet/html/model_attribute_command.rb +30 -0
  17. data/lib/mullet/html/page_builder.rb +152 -0
  18. data/lib/mullet/html/parser/attribute.rb +8 -0
  19. data/lib/mullet/html/parser/constants.rb +1061 -0
  20. data/lib/mullet/html/parser/default_handler.rb +27 -0
  21. data/lib/mullet/html/parser/input_stream.rb +711 -0
  22. data/lib/mullet/html/parser/open_element.rb +77 -0
  23. data/lib/mullet/html/parser/simple_parser.rb +128 -0
  24. data/lib/mullet/html/parser/tokenizer.rb +1085 -0
  25. data/lib/mullet/html/remove_mode.rb +30 -0
  26. data/lib/mullet/html/static_text_renderer.rb +20 -0
  27. data/lib/mullet/html/template.rb +44 -0
  28. data/lib/mullet/html/template_builder.rb +208 -63
  29. data/lib/mullet/html/template_loader.rb +77 -39
  30. data/lib/mullet/html/template_parser.rb +48 -0
  31. data/lib/mullet/html/unless_element_renderer.rb +24 -0
  32. data/lib/mullet/model.rb +2 -5
  33. data/lib/mullet/render_context.rb +24 -18
  34. data/lib/mullet/tilt.rb +37 -0
  35. data/lib/mullet/version.rb +2 -1
  36. data/lib/mullet.rb +1 -0
  37. metadata +58 -11
@@ -0,0 +1,1085 @@
1
+ require 'mullet/html/parser/constants'
2
+ require 'mullet/html/parser/input_stream'
3
+
4
+ module Mullet; module HTML; module Parser
5
+
6
+ # This class takes care of tokenizing HTML.
7
+ #
8
+ # * @current_token
9
+ # Holds the token that is currently being processed.
10
+ #
11
+ # * @state
12
+ # Holds a reference to the method to be invoked... XXX
13
+ #
14
+ # * @states
15
+ # Holds a mapping between states and methods that implement the state.
16
+ #
17
+ # * @stream
18
+ # Points to InputStream object.
19
+
20
+ class Tokenizer
21
+ attr_accessor :content_model_flag, :current_token
22
+ attr_reader :stream
23
+
24
+ # XXX need to fix documentation
25
+
26
def initialize(stream, options = {})
  # Wrap the raw input; the same options hash is handed to InputStream.
  @stream = InputStream.new(stream, options)

  # Name lowercasing stays enabled unless the option is explicitly false.
  @lowercase_element_name = options[:lowercase_element_name] != false
  @lowercase_attr_name = options[:lowercase_attr_name] != false

  # Initial tokenizer state.
  @content_model_flag = :PCDATA
  @state = :data_state
  @escapeFlag = false
  @lastFourChars = []

  # The token under construction, and the tokens waiting to be yielded.
  @current_token = nil
  @token_queue = []
end
43
+
44
+ # This is where the magic happens.
45
+ #
46
+ # We do our usual processing through the states, and when we have a token
47
+ # to return we yield the token which pauses processing until the next token
48
+ # is requested.
49
def each
  # Without a block, hand back an Enumerator over the same token sequence
  # instead of failing at the first `yield`.
  return enum_for(:each) unless block_given?

  @token_queue = []
  # Keep invoking the method named by @state. A state method returns false
  # once tokenization is finished (data_state does so on :EOF), which ends
  # the loop.
  while send(@state)
    # Errors collected by the stream go out first, wrapped as ParseError
    # tokens, followed by every token the state method queued.
    yield({:type => :ParseError, :data => @stream.errors.shift}) until @stream.errors.empty?
    yield @token_queue.shift until @token_queue.empty?
  end
end
58
+
59
+ # Below are various helper functions used by the tokenizer states.
60
+
61
+ # If the next character is a '>', convert the current_token into
62
+ # an EmptyTag
63
+
64
def process_solidus_in_tag
  # Read the character that follows the "/"; it is always pushed back onto
  # the stream before this method returns.
  following = @stream.char
  emitted = false

  if @current_token[:type] == :StartTag && following == ">"
    # "/>" on a start tag: retag the token as an EmptyTag.
    @current_token[:type] = :EmptyTag
  elsif following == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-following-solidus"}
    @state = :data_state
    emit_current_token
    emitted = true
  else
    @token_queue << {:type => :ParseError, :data => "incorrectly-placed-solidus"}
  end

  @stream.unget(following)
  # true only when EOF forced the current token to be emitted
  emitted
end
85
+
86
+ # This function returns either U+FFFD or the character based on the
87
+ # decimal or hexadecimal representation. It also discards ";" if present.
88
+ # If the ";" is not present, a ParseError token is pushed onto @token_queue.
89
+
90
def consume_number_entity(isHex)
  # Choose the digit alphabet and the numeric base for this reference.
  allowed = DIGITS
  radix = 10
  allowed, radix = HEX_DIGITS, 16 if isHex

  # Collect digits; the first character that is not a digit (possibly :EOF)
  # is left in `c` for the semicolon check at the bottom.
  digits = []
  c = @stream.char
  while allowed.include?(c) && c != :EOF
    digits.push(c)
    c = @stream.char
  end

  code_point = digits.join('').to_i(radix)

  if code_point == 13
    # A carriage-return reference is reported and replaced with a line feed.
    @token_queue << {:type => :ParseError, :data => "incorrect-cr-newline-entity"}
    code_point = 10
  elsif code_point >= 128 && code_point <= 159
    # 128..159 are remapped through the ENTITIES_WINDOWS1252 table.
    @token_queue << {:type => :ParseError, :data => "illegal-windows-1252-entity"}
    code_point = ENTITIES_WINDOWS1252[code_point - 128]
  end

  in_range = 0 < code_point && code_point <= 1114111
  surrogate = in_range && 55296 <= code_point && code_point <= 57343
  if in_range && !surrogate && code_point != 0x10FFFF # TODO add more entity replacements here
    char = if String.method_defined?(:force_encoding)
             code_point.chr('utf-8')
           else
             [code_point].pack('U')
           end
  else
    # Anything else becomes U+FFFD plus a parse error.
    char = [0xFFFD].pack('U')
    @token_queue << {:type => :ParseError, :data => "cant-convert-numeric-entity", :datavars => {"charAsInt" => code_point}}
  end

  # A terminating ";" is swallowed; any other character is reported and
  # put back on the stream.
  unless c == ";"
    @token_queue << {:type => :ParseError, :data => "numeric-entity-without-semicolon"}
    @stream.unget(c)
  end

  char
end
147
+
148
def consume_entity(allowed_char=nil, from_attribute=false)
  # Tries to read a character reference from the stream; the leading "&"
  # has already been consumed by the caller.
  #
  # allowed_char   - extra character that, when it comes first, means "not a
  #                  reference" (the attribute states pass their quote char).
  # from_attribute - true when called while reading an attribute value.
  #
  # Returns the replacement string, or nil when no reference was consumed
  # (everything read is then pushed back onto the stream).
  char = nil
  char_stack = [@stream.char]
  first = char_stack[0]

  if SPACE_CHARACTERS.include?(first) || [:EOF, '<', '&'].include?(first) ||
     (allowed_char && allowed_char == first)
    @stream.unget(char_stack)
  elsif first == '#'
    # We might have a number entity here.
    char_stack += [@stream.char, @stream.char]
    if char_stack[0..1].include?(:EOF)
      # EOF right after "#": put back everything that precedes it.
      char_stack = char_stack[0...char_stack.index(:EOF)]
      @stream.unget(char_stack)
      @token_queue << {:type => :ParseError, :data => "expected-numeric-entity-but-got-eof"}
    elsif char_stack[1].downcase == "x" && HEX_DIGITS.include?(char_stack[2])
      # Hexadecimal entity detected.
      @stream.unget(char_stack[2])
      char = consume_number_entity(true)
    elsif DIGITS.include?(char_stack[1])
      # Decimal entity detected.
      @stream.unget(char_stack[1..-1])
      char = consume_number_entity(false)
    else
      # No number entity detected.
      @stream.unget(char_stack)
      @token_queue << {:type => :ParseError, :data => "expected-numeric-entity"}
    end
  else
    # Possibly a named entity. Keep reading while the consumed characters
    # are still a prefix of some key in ENTITIES, remembering the longest
    # complete name seen so far (this is what handles "&noti").
    candidates = ENTITIES.keys.select { |e| e[0, 1] == first }
    entity_name = nil

    while char_stack.last != :EOF
      name = char_stack.join('')
      break unless candidates.any? { |e| e[0...name.length] == name }
      candidates.reject! { |e| e[0...name.length] != name }
      char_stack.push(@stream.char)

      if ENTITIES.include?(name)
        entity_name = name
        break if entity_name.end_with?(';')
      end
    end

    if entity_name
      char = ENTITIES[entity_name]
      terminated = entity_name.end_with?(';')

      unless terminated
        @token_queue << {:type => :ParseError, :data => "named-entity-without-semicolon"}
      end

      # The character right after the name may be :EOF (a Symbol). Only a
      # String can be a letter or digit, so check the type before asking
      # ASCII_LETTERS, like every other ASCII_LETTERS test in this class
      # that is preceded by an EOF check.
      following = char_stack[entity_name.length]
      alnum_follows = following.is_a?(String) &&
                      (ASCII_LETTERS.include?(following) || DIGITS.include?(following))

      if !terminated && from_attribute && alnum_follows
        # Unterminated name running into more text inside an attribute:
        # treat the "&" literally and put every character back.
        @stream.unget(char_stack)
        char = '&'
      else
        # Put back only what was read beyond the entity name.
        @stream.unget(char_stack[entity_name.length..-1])
      end
    else
      @token_queue << {:type => :ParseError, :data => "expected-named-entity"}
      @stream.unget(char_stack)
    end
  end

  char
end
229
+
230
+ # This method replaces the need for "entityInAttributeValueState".
231
def process_entity_in_attribute(allowed_char)
  # Append the decoded entity to the value of the most recently added
  # attribute, or a literal "&" when consume_entity found nothing.
  replacement = consume_entity(allowed_char, true) || "&"
  @current_token[:data][-1][1] += replacement
end
239
+
240
+ # This method is a generic handler for emitting the tags. It also sets
241
+ # the state to "data" because that's what's needed after a token has been
242
+ # emitted.
243
def emit_current_token
  token = @current_token
  # Only tag tokens are queued here; for any other type this is a no-op.
  return unless [:StartTag, :EndTag, :EmptyTag].include?(token[:type])

  token[:name] = token[:name].downcase if @lowercase_element_name

  if token[:type] == :EndTag && token[:self_closing]
    @token_queue << {:type => :ParseError, :data => "self-closing-end-tag"}
  end

  # Queue the token to be yielded and go back to the data state.
  @token_queue << token
  @state = :data_state
end
259
+
260
+ # Below are the various tokenizer states worked out.
261
+
262
+ # XXX AT Perhaps we should have Hixie run some evaluation on billions of
263
+ # documents to figure out what the order of the various if and elsif
264
+ # statements should be.
265
+ def data_state
266
+ data = @stream.char
267
+
268
+ if @content_model_flag == :CDATA or @content_model_flag == :RCDATA
269
+ @lastFourChars.shift if @lastFourChars.length == 4
270
+ @lastFourChars << data
271
+ end
272
+
273
+ if data == "&" and [:PCDATA,:RCDATA].include?(@content_model_flag) and !@escapeFlag
274
+ @state = :entity_data_state
275
+ elsif data == "-" && [:CDATA, :RCDATA].include?(@content_model_flag) && !@escapeFlag && @lastFourChars.join('') == "<!--"
276
+ @escapeFlag = true
277
+ @token_queue << {:type => :Characters, :data => data}
278
+ elsif data == "<" and !@escapeFlag and
279
+ [:PCDATA,:CDATA,:RCDATA].include?(@content_model_flag)
280
+ @state = :tag_open_state
281
+ elsif data == ">" and @escapeFlag and
282
+ [:CDATA,:RCDATA].include?(@content_model_flag) and
283
+ @lastFourChars[1..-1].join('') == "-->"
284
+ @escapeFlag = false
285
+ @token_queue << {:type => :Characters, :data => data}
286
+
287
+ elsif data == :EOF
288
+ # Tokenization ends.
289
+ return false
290
+
291
+ elsif SPACE_CHARACTERS.include? data
292
+ # Directly after emitting a token you switch back to the "data
293
+ # state". At that point SPACE_CHARACTERS are important so they are
294
+ # emitted separately.
295
+ # XXX need to check if we don't need a special "spaces" flag on
296
+ # characters.
297
+ @token_queue << {:type => :SpaceCharacters, :data => data + @stream.chars_until(SPACE_CHARACTERS, true)}
298
+ else
299
+ chars = @stream.chars_until(["&", "<", ">", "-"])
300
+ @token_queue << {:type => :Characters, :data => data + chars}
301
+ @lastFourChars += (chars[chars.length > 4 ? -4 : -chars.length, 4] || '').scan(/./)
302
+ @lastFourChars = @lastFourChars[(@lastFourChars.length > 4 ? -4 : -@lastFourChars.length), 4] || []
303
+ end
304
+ return true
305
+ end
306
+
307
def entity_data_state
  # Emit the decoded entity, or a literal "&" when consume_entity found
  # nothing, then go back to the data state.
  text = consume_entity || "&"
  @token_queue << {:type => :Characters, :data => text}
  @state = :data_state
  true
end
317
+
318
def tag_open_state
  data = @stream.char

  if @content_model_flag != :PCDATA
    # We know the content model flag is set to either RCDATA or CDATA
    # now because this state can never be entered with the PLAINTEXT
    # flag. Only "</" is meaningful here; anything else is literal text.
    if data == "/"
      @state = :close_tag_open_state
    else
      @token_queue << {:type => :Characters, :data => "<"}
      @stream.unget(data)
      @state = :data_state
    end
    return true
  end

  if data == "!"
    @state = :markup_declaration_open_state
  elsif data == "/"
    @state = :close_tag_open_state
  elsif data != :EOF && ASCII_LETTERS.include?(data)
    # First letter of a tag name: start a new StartTag token.
    @current_token = {:type => :StartTag, :name => data, :data => []}
    @state = :tag_name_state
  elsif data == ">"
    # XXX In theory it could be something besides a tag name. But
    # do we really care?
    @token_queue << {:type => :ParseError, :data => "expected-tag-name-but-got-right-bracket"}
    @token_queue << {:type => :Characters, :data => "<>"}
    @state = :data_state
  elsif data == "?"
    # XXX In theory it could be something besides a tag name. But
    # do we really care?
    @token_queue << {:type => :ParseError, :data => "expected-tag-name-but-got-question-mark"}
    @stream.unget(data)
    @state = :bogus_comment_state
  else
    # XXX
    @token_queue << {:type => :ParseError, :data => "expected-tag-name"}
    @token_queue << {:type => :Characters, :data => "<"}
    @stream.unget(data)
    @state = :data_state
  end

  true
end
362
+
363
def close_tag_open_state
  if @content_model_flag == :RCDATA || @content_model_flag == :CDATA
    matches_open_tag = false

    if @current_token
      # So far "</" has been consumed. Peek at as many characters as the
      # name of the current token (the last start tag) has, plus the one
      # character directly after them.
      lookahead = []
      (@current_token[:name].length + 1).times do
        lookahead.push(@stream.char)
        # Make sure we don't get hit by :EOF
        break if lookahead[-1] == :EOF
      end

      # This was only a peek, so everything goes back on the stream.
      @stream.unget(lookahead)

      matches_open_tag =
        @current_token[:name].downcase == lookahead[0...-1].join('').downcase &&
        (SPACE_CHARACTERS + [">", "/", "<", :EOF]).include?(lookahead[-1])
    end

    unless matches_open_tag
      # Not the matching end tag: "</" is plain text.
      @token_queue << {:type => :Characters, :data => "</"}
      @state = :data_state
      return true
    end

    # The characters match, so we can safely switch to PCDATA mode now.
    # This also means it need not be done when emitting the end tag token.
    @content_model_flag = :PCDATA
  end

  data = @stream.char
  if data == :EOF
    @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-eof"}
    @token_queue << {:type => :Characters, :data => "</"}
    @state = :data_state
  elsif ASCII_LETTERS.include?(data)
    @current_token = {:type => :EndTag, :name => data, :data => []}
    @state = :tag_name_state
  elsif data == ">"
    @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-right-bracket"}
    @state = :data_state
  else
    # XXX data can be _'_...
    @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-char", :datavars => {:data => data}}
    @stream.unget(data)
    @state = :bogus_comment_state
  end

  true
end
422
+
423
def tag_name_state
  c = @stream.char

  if SPACE_CHARACTERS.include?(c)
    @state = :before_attribute_name_state
  elsif c == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-tag-name"}
    emit_current_token
  elsif ASCII_LETTERS.include?(c)
    # Take the whole run of letters in one go.
    @current_token[:name] += c + @stream.chars_until(ASCII_LETTERS, true)
  elsif c == ">"
    emit_current_token
  elsif c == "/"
    @state = :self_closing_tag_state
  else
    # Any other character simply becomes part of the name.
    @current_token[:name] += c
  end

  true
end
441
+
442
def before_attribute_name_state
  c = @stream.char

  if SPACE_CHARACTERS.include?(c)
    # Skip the rest of the whitespace run.
    @stream.chars_until(SPACE_CHARACTERS, true)
  elsif c == :EOF
    @token_queue << {:type => :ParseError, :data => "expected-attribute-name-but-got-eof"}
    emit_current_token
  elsif ASCII_LETTERS.include?(c)
    # Start a new [name, value] pair.
    @current_token[:data].push([c, ""])
    @state = :attribute_name_state
  elsif c == ">"
    emit_current_token
  elsif c == "/"
    @state = :self_closing_tag_state
  elsif c == "'" || c == '"' || c == "="
    # Reported, but the character still starts an attribute name.
    @token_queue << {:type => :ParseError, :data => "invalid-character-in-attribute-name"}
    @current_token[:data].push([c, ""])
    @state = :attribute_name_state
  else
    @current_token[:data].push([c, ""])
    @state = :attribute_name_state
  end

  true
end
466
+
467
def attribute_name_state
  data = @stream.char
  leaving = true   # false while the attribute name is still being built
  emit = false     # true when the tag should be emitted on the way out

  if data == "="
    @state = :before_attribute_value_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-attribute-name"}
    @state = :data_state
    emit = true
  elsif ASCII_LETTERS.include?(data)
    @current_token[:data][-1][0] += data + @stream.chars_until(ASCII_LETTERS, true)
    leaving = false
  elsif data == ">"
    # Emission is deferred to the bottom so the duplicate check runs first.
    emit = true
  elsif SPACE_CHARACTERS.include?(data)
    @state = :after_attribute_name_state
  elsif data == "/"
    @state = :before_attribute_name_state unless process_solidus_in_tag
  elsif data == "'" || data == '"'
    @token_queue << {:type => :ParseError, :data => "invalid-character-in-attribute-name"}
    @current_token[:data][-1][0] += data
    leaving = false
  else
    @current_token[:data][-1][0] += data
    leaving = false
  end

  if leaving
    # Attributes are not dropped at this stage. That happens when the
    # start tag token is emitted so values can still be safely appended
    # to attributes, but we do want to report the parse error in time.
    attrs = @current_token[:data]
    attrs[-1][0] = attrs.last.first.downcase if @lowercase_attr_name

    # Report a duplicate at most once, however many earlier attributes
    # share the name.
    current_name = attrs.last.first
    if attrs[0...-1].any? { |name, _value| current_name == name }
      @token_queue << {:type => :ParseError, :data => "duplicate-attribute"}
    end

    emit_current_token if emit
  end

  true
end
518
+
519
def after_attribute_name_state
  c = @stream.char

  if SPACE_CHARACTERS.include?(c)
    # Skip the rest of the whitespace run.
    @stream.chars_until(SPACE_CHARACTERS, true)
  elsif c == "="
    @state = :before_attribute_value_state
  elsif c == ">"
    emit_current_token
  elsif c == :EOF
    @token_queue << {:type => :ParseError, :data => "expected-end-of-tag-but-got-eof"}
    emit_current_token
  elsif ASCII_LETTERS.include?(c)
    # A new attribute starts right here.
    @current_token[:data].push([c, ""])
    @state = :attribute_name_state
  elsif c == "/"
    @state = :self_closing_tag_state
  else
    @current_token[:data].push([c, ""])
    @state = :attribute_name_state
  end

  true
end
541
+
542
def before_attribute_value_state
  c = @stream.char

  case c
  when *SPACE_CHARACTERS
    # Skip the rest of the whitespace run.
    @stream.chars_until(SPACE_CHARACTERS, true)
  when "\""
    @state = :attribute_value_double_quoted_state
  when "&"
    # Let the unquoted-value state see (and decode) the "&" itself.
    @state = :attribute_value_unquoted_state
    @stream.unget(c)
  when "'"
    @state = :attribute_value_single_quoted_state
  when ">"
    emit_current_token
  when "="
    @token_queue << {:type => :ParseError, :data => "equals-in-unquoted-attribute-value"}
    @current_token[:data][-1][1] += c
    @state = :attribute_value_unquoted_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "expected-attribute-value-but-got-eof"}
    emit_current_token
  else
    # First character of an unquoted value.
    @current_token[:data][-1][1] += c
    @state = :attribute_value_unquoted_state
  end

  true
end
568
+
569
def attribute_value_double_quoted_state
  c = @stream.char

  case c
  when "\""
    # Closing quote.
    @state = :after_attribute_value_state
  when "&"
    process_entity_in_attribute('"')
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-double-quote"}
    emit_current_token
  else
    # Plain text up to the next quote or "&" goes into the value.
    @current_token[:data][-1][1] += c + @stream.chars_until(["\"", "&"])
  end

  true
end
583
+
584
def attribute_value_single_quoted_state
  c = @stream.char

  case c
  when "'"
    # Closing quote.
    @state = :after_attribute_value_state
  when "&"
    process_entity_in_attribute("'")
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-single-quote"}
    emit_current_token
  else
    # Plain text up to the next quote or "&" goes into the value.
    @current_token[:data][-1][1] += c + @stream.chars_until(["'", "&"])
  end

  true
end
599
+
600
def attribute_value_unquoted_state
  c = @stream.char

  case c
  when *SPACE_CHARACTERS
    # Whitespace ends an unquoted value.
    @state = :before_attribute_name_state
  when "&"
    process_entity_in_attribute ''
  when ">"
    emit_current_token
  when '"', "'", "="
    # Reported, but the character is still kept as part of the value.
    @token_queue << {:type => :ParseError, :data => "unexpected-character-in-unquoted-attribute-value"}
    @current_token[:data][-1][1] += c
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-no-quotes"}
    emit_current_token
  else
    @current_token[:data][-1][1] += c + @stream.chars_until(["&", ">", "<"] + SPACE_CHARACTERS)
  end

  true
end
619
+
620
def after_attribute_value_state
  c = stream.char

  case c
  when *SPACE_CHARACTERS
    @state = :before_attribute_name_state
  when ">"
    emit_current_token
    @state = :data_state
  when "/"
    @state = :self_closing_tag_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "unexpected-EOF-after-attribute-value"}
    emit_current_token
    @stream.unget(c)
    @state = :data_state
  else
    # Anything else is reported, then re-read as the start of the next
    # attribute.
    @token_queue << {:type => :ParseError, :data => "unexpected-character-after-attribute-value"}
    @stream.unget(c)
    @state = :before_attribute_name_state
  end

  true
end
641
+
642
def self_closing_tag_state
  following = @stream.char

  if following == ">"
    # "/>" completes the tag: flag it as self-closing and emit it.
    @current_token[:self_closing] = true
    emit_current_token
    @state = :data_state
  elsif following == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-tag-name"}
    @stream.unget(following)
    @state = :data_state
  else
    # A stray "/": report it and re-read the character as the start of an
    # attribute name.
    @token_queue << {:type => :ParseError, :data => "expected-self-closing-tag"}
    @stream.unget(following)
    @state = :before_attribute_name_state
  end

  true
end
660
+
661
def bogus_comment_state
  # Everything up to (but not including) the next ">" becomes the text of
  # a Comment token, which is queued straight away.
  text = @stream.chars_until([">"])
  @token_queue << {:type => :Comment, :data => text}

  # Swallow the character that ended the run.
  @stream.char
  @state = :data_state
  true
end
673
+
674
def markup_declaration_open_state
  char_stack = [@stream.char, @stream.char]

  if char_stack == ["-", "-"]
    # "<!--" opens a comment.
    @current_token = {:type => :Comment, :data => ""}
    @state = :comment_start_state
    return true
  end

  # Otherwise read five more characters (seven in total) and compare them
  # with the two seven-character keywords.
  5.times { char_stack.push(@stream.char) }
  clean = !char_stack.include?(:EOF)
  text = char_stack.join("")

  if clean && text.upcase == "DOCTYPE"
    @current_token = {:type => :Doctype, :name => "", :publicId => nil, :systemId => nil, :correct => true}
    @state = :doctype_state
  elsif clean && text == '[CDATA['
    @current_token = {:type => :CDATA, :data => ''}
    @state = :cdata_state
  else
    # Neither: report it, push everything back, and treat the rest as a
    # bogus comment.
    @token_queue << {:type => :ParseError, :data => "expected-dashes-or-doctype"}
    @stream.unget(char_stack)
    @state = :bogus_comment_state
  end

  true
end
696
+
697
def comment_start_state
  c = @stream.char

  case c
  when "-"
    @state = :comment_start_dash_state
  when ">"
    # The comment ended right after it opened: report it, emit it anyway.
    @token_queue << {:type => :ParseError, :data => "incorrect-comment"}
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
    @token_queue << @current_token
    @state = :data_state
  else
    # Ordinary comment text, taken up to the next "-".
    @current_token[:data] += c + @stream.chars_until("-")
    @state = :comment_state
  end

  true
end
715
+
716
def comment_start_dash_state
  c = @stream.char

  case c
  when "-"
    @state = :comment_end_state
  when ">"
    @token_queue << {:type => :ParseError, :data => "incorrect-comment"}
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
    @token_queue << @current_token
    @state = :data_state
  else
    # The dash seen earlier was ordinary text after all; keep it and carry
    # on up to the next "-".
    @current_token[:data] += '-' + c + @stream.chars_until("-")
    @state = :comment_state
  end

  true
end
734
+
735
def comment_state
  c = @stream.char

  case c
  when "-"
    @state = :comment_end_dash_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
    @token_queue << @current_token
    @state = :data_state
  else
    # Comment text, taken up to the next "-".
    @current_token[:data] += c + @stream.chars_until("-")
  end

  true
end
748
+
749
def comment_end_dash_state
  c = @stream.char

  case c
  when "-"
    @state = :comment_end_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-comment-end-dash"}
    @token_queue << @current_token
    @state = :data_state
  else
    # The single dash was ordinary text; keep it and read on to the next "-".
    @current_token[:data] += "-" + c + @stream.chars_until("-")
    # Consume the next character which is either a "-" or an :EOF as
    # well so if there's a "-" directly after the "-" we go nicely to
    # the "comment end state" without emitting a ParseError there.
    @stream.char
  end

  true
end
767
+
768
def comment_end_state
  c = @stream.char

  case c
  when ">"
    # "-->" completes the comment.
    @token_queue << @current_token
    @state = :data_state
  when "-"
    # A third (or later) dash: report it and keep it as comment text.
    @token_queue << {:type => :ParseError, :data => "unexpected-dash-after-double-dash-in-comment"}
    @current_token[:data] += c
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-comment-double-dash"}
    @token_queue << @current_token
    @state = :data_state
  else
    # XXX
    # "--" followed by something else: the dashes become comment text.
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-comment"}
    @current_token[:data] += "--" + c
    @state = :comment_state
  end

  true
end
788
+
789
def doctype_state
  c = @stream.char

  # A space character is expected here. Anything else is reported and
  # pushed back so the next state reads it again.
  unless SPACE_CHARACTERS.include?(c)
    @token_queue << {:type => :ParseError, :data => "need-space-after-doctype"}
    @stream.unget(c)
  end

  @state = :before_doctype_name_state
  true
end
800
+
801
def before_doctype_name_state
  c = @stream.char

  case c
  when *SPACE_CHARACTERS
    # Leading whitespace is skipped one character per call.
  when ">"
    @token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-right-bracket"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-eof"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    # First character of the doctype name.
    @current_token[:name] = c
    @state = :doctype_name_state
  end

  true
end
820
+
821
def doctype_name_state
  c = @stream.char

  case c
  when *SPACE_CHARACTERS
    @state = :after_doctype_name_state
  when ">"
    # The doctype is complete with just a name.
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype-name"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:name] += c
  end

  true
end
839
+
840
def after_doctype_name_state
  # Handles what follows the doctype name: whitespace, the closing ">",
  # EOF, or one of the keywords PUBLIC / SYSTEM (matched case-insensitively).
  # Always returns true so the tokenizer loop keeps going.
  data = @stream.char

  if SPACE_CHARACTERS.include?(data)
    # Whitespace is skipped one character per call.
  elsif data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @current_token[:correct] = false
    @stream.unget(data)
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @token_queue << @current_token
    @state = :data_state
  else
    # Read six characters in total and compare them with the two
    # six-letter keywords.
    char_stack = [data]
    5.times { char_stack << @stream.char }
    keyword = char_stack.join('').tr(ASCII_UPPERCASE, ASCII_LOWERCASE)
    hit_eof = char_stack.include?(:EOF)

    if keyword == "public" && !hit_eof
      @state = :before_doctype_public_identifier_state
    elsif keyword == "system" && !hit_eof
      @state = :before_doctype_system_identifier_state
    else
      @stream.unget(char_stack)
      # The extra details go under the Symbol key :datavars, the same key
      # the other ParseError tokens built in this class use; this token
      # previously used the String key "datavars".
      @token_queue << {:type => :ParseError, :data => "expected-space-or-right-bracket-in-doctype", :datavars => {"data" => data}}
      @current_token[:correct] = false
      @state = :bogus_doctype_state
    end
  end

  true
end
869
+
870
# Tokenizer step for :before_doctype_public_identifier_state.
#
# Reads one character from @stream:
# * whitespace -> ignored, state unchanged
# * '"' or "'" -> start an empty :publicId on the current token and enter
#                 the matching double/single quoted identifier state
# * ">"        -> parse error "unexpected-end-of-doctype"; token flagged as
#                 not correct, queued, back to :data_state
# * :EOF       -> parse error "eof-in-doctype"; token flagged as not
#                 correct, queued, back to :data_state
# * otherwise  -> parse error "unexpected-char-in-doctype"; token flagged
#                 as not correct, state becomes :bogus_doctype_state
#
# Always returns true.
def before_doctype_public_identifier_state
  ch = @stream.char
  return true if SPACE_CHARACTERS.include?(ch)

  case ch
  when "\"", "'"
    @current_token[:publicId] = ""
    if ch == "\""
      @state = :doctype_public_identifier_double_quoted_state
    else
      @state = :doctype_public_identifier_single_quoted_state
    end
  when ">", :EOF
    code = ch == :EOF ? "eof-in-doctype" : "unexpected-end-of-doctype"
    @token_queue << {:type => :ParseError, :data => code}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @current_token[:correct] = false
    @state = :bogus_doctype_state
  end

  true
end
898
+
899
# Tokenizer step for :doctype_public_identifier_double_quoted_state.
#
# Reads one character from @stream:
# * '"'       -> switch to :after_doctype_public_identifier_state
# * ">"       -> parse error "unexpected-end-of-doctype"; token flagged as
#                not correct, queued, back to :data_state
# * :EOF      -> parse error "eof-in-doctype"; token flagged as not
#                correct, queued, back to :data_state
# * otherwise -> the character is appended to the token's :publicId
#
# Always returns true.
def doctype_public_identifier_double_quoted_state
  ch = @stream.char

  case ch
  when "\""
    @state = :after_doctype_public_identifier_state
  when ">", :EOF
    code = ch == :EOF ? "eof-in-doctype" : "unexpected-end-of-doctype"
    @token_queue << {:type => :ParseError, :data => code}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:publicId] += ch
  end

  true
end
918
+
919
# Tokenizer step for :doctype_public_identifier_single_quoted_state.
#
# Reads one character from @stream:
# * "'"       -> switch to :after_doctype_public_identifier_state
# * ">"       -> parse error "unexpected-end-of-doctype"; token flagged as
#                not correct, queued, back to :data_state
# * :EOF      -> parse error "eof-in-doctype"; token flagged as not
#                correct, queued, back to :data_state
# * otherwise -> the character is appended to the token's :publicId
#
# Always returns true.
def doctype_public_identifier_single_quoted_state
  ch = @stream.char

  if ch == "'"
    @state = :after_doctype_public_identifier_state
    return true
  end

  unless ch == ">" || ch == :EOF
    @current_token[:publicId] += ch
    return true
  end

  # Both remaining cases end the doctype early; only the error code differs.
  code = ch == ">" ? "unexpected-end-of-doctype" : "eof-in-doctype"
  @token_queue << {:type => :ParseError, :data => code}
  @current_token[:correct] = false
  @token_queue << @current_token
  @state = :data_state
  true
end
938
+
939
# Tokenizer step for :after_doctype_public_identifier_state.
#
# Reads one character from @stream:
# * whitespace -> ignored, state unchanged
# * '"' or "'" -> start an empty :systemId on the current token and enter
#                 the matching double/single quoted system identifier state
# * ">"        -> queue the current token and go back to :data_state
# * :EOF       -> parse error "eof-in-doctype"; token flagged as not
#                 correct, queued, back to :data_state
# * otherwise  -> parse error "unexpected-char-in-doctype"; token flagged
#                 as not correct, state becomes :bogus_doctype_state
#
# Always returns true.
def after_doctype_public_identifier_state
  data = @stream.char

  if SPACE_CHARACTERS.include?(data)
    # Whitespace between the two identifiers is skipped.
  elsif data == "\""
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_double_quoted_state
  elsif data == "'"
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_single_quoted_state
  elsif data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    # A stray character is not an end-of-file: report it with the same code
    # the before_doctype_*_identifier states use for an unexpected character.
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @current_token[:correct] = false
    @state = :bogus_doctype_state
  end

  true
end
963
+
964
# Tokenizer step for :before_doctype_system_identifier_state.
#
# Reads one character from @stream:
# * whitespace -> ignored, state unchanged
# * '"' or "'" -> start an empty :systemId on the current token and enter
#                 the matching double/single quoted identifier state
# * ">"        -> parse error "unexpected-char-in-doctype"; token flagged
#                 as not correct, queued, back to :data_state
# * :EOF       -> parse error "eof-in-doctype"; token flagged as not
#                 correct, queued, back to :data_state
# * otherwise  -> parse error "unexpected-char-in-doctype"; token flagged
#                 as not correct, state becomes :bogus_doctype_state
#
# Always returns true.
def before_doctype_system_identifier_state
  ch = @stream.char
  return true if SPACE_CHARACTERS.include?(ch)

  case ch
  when "\""
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_double_quoted_state
  when "'"
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_single_quoted_state
  else
    # Every remaining input is an error that marks the token as not correct;
    # only ">" and :EOF additionally emit the token and leave the doctype.
    code = ch == :EOF ? "eof-in-doctype" : "unexpected-char-in-doctype"
    @token_queue << {:type => :ParseError, :data => code}
    @current_token[:correct] = false
    if ch == ">" || ch == :EOF
      @token_queue << @current_token
      @state = :data_state
    else
      @state = :bogus_doctype_state
    end
  end

  true
end
990
+
991
# Tokenizer step for :doctype_system_identifier_double_quoted_state.
#
# Reads one character from @stream:
# * '"'       -> switch to :after_doctype_system_identifier_state
# * ">"       -> parse error "unexpected-end-of-doctype"; token flagged as
#                not correct, queued, back to :data_state
# * :EOF      -> parse error "eof-in-doctype"; token flagged as not
#                correct, queued, back to :data_state
# * otherwise -> the character is appended to the token's :systemId
#
# Always returns true.
def doctype_system_identifier_double_quoted_state
  ch = @stream.char

  case ch
  when "\""
    @state = :after_doctype_system_identifier_state
  when ">", :EOF
    code = ch == :EOF ? "eof-in-doctype" : "unexpected-end-of-doctype"
    @token_queue << {:type => :ParseError, :data => code}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:systemId] += ch
  end

  true
end
1010
+
1011
# Tokenizer step for :doctype_system_identifier_single_quoted_state.
#
# Reads one character from @stream:
# * "'"       -> switch to :after_doctype_system_identifier_state
# * ">"       -> parse error "unexpected-end-of-doctype"; token flagged as
#                not correct, queued, back to :data_state
# * :EOF      -> parse error "eof-in-doctype"; token flagged as not
#                correct, queued, back to :data_state
# * otherwise -> the character is appended to the token's :systemId
#
# Always returns true.
def doctype_system_identifier_single_quoted_state
  ch = @stream.char

  if ch == "'"
    @state = :after_doctype_system_identifier_state
    return true
  end

  unless ch == ">" || ch == :EOF
    @current_token[:systemId] += ch
    return true
  end

  # Both remaining cases end the doctype early; only the error code differs.
  code = ch == ">" ? "unexpected-end-of-doctype" : "eof-in-doctype"
  @token_queue << {:type => :ParseError, :data => code}
  @current_token[:correct] = false
  @token_queue << @current_token
  @state = :data_state
  true
end
1030
+
1031
# Tokenizer step for :after_doctype_system_identifier_state.
#
# Reads one character from @stream:
# * whitespace -> ignored, state unchanged
# * ">"        -> queue the current token and go back to :data_state
# * :EOF       -> parse error "eof-in-doctype"; token flagged as not
#                 correct, queued, back to :data_state
# * otherwise  -> parse error "unexpected-char-in-doctype" and the state
#                 becomes :bogus_doctype_state; the token's :correct flag
#                 is left as it is in this branch
#
# Always returns true.
def after_doctype_system_identifier_state
  data = @stream.char

  if SPACE_CHARACTERS.include?(data)
    # Trailing whitespace before ">" is skipped.
  elsif data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    # A stray character is not an end-of-file: report it with the same code
    # the before_doctype_*_identifier states use for an unexpected character.
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @state = :bogus_doctype_state
  end

  true
end
1048
+
1049
# Tokenizer step for :bogus_doctype_state.
#
# Reads one character from @stream and discards it unless it is ">" or
# :EOF. Either of those queues the current token and returns to
# :data_state; :EOF is first pushed back onto the stream.
#
# Always returns true.
def bogus_doctype_state
  ch = @stream.char
  at_eof = ch == :EOF

  if at_eof || ch == ">"
    @stream.unget(ch) if at_eof
    @token_queue << @current_token
    @state = :data_state
  end

  true
end
1061
+
1062
# Tokenizer step for :cdata_state.
#
# Reads one character from @stream:
# * "]"       -> read two more characters; if they are "]" and ">" the
#                current token is queued and the state returns to
#                :data_state, otherwise both are pushed back and the "]"
#                is appended to the token's :data
# * :EOF      -> parse error "eof-in-cdata"; the token is queued and the
#                state returns to :data_state
# * otherwise -> the character plus everything @stream.chars_until(']')
#                returns is appended to the token's :data
#
# Always returns true.
def cdata_state
  ch = @stream.char

  case ch
  when ']'
    lookahead = [@stream.char, @stream.char]
    if lookahead == [']', '>']
      @token_queue << @current_token
      @state = :data_state
    else
      # Not a terminator: restore the look-ahead and keep the bracket.
      @stream.unget(lookahead)
      @current_token[:data] += ch
    end
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-cdata"}
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:data] += ch + @stream.chars_until(']')
  end

  true
end
1082
+
1083
+ end
1084
+
1085
+ end; end; end