html5 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. data/History.txt +3 -0
  2. data/Manifest.txt +58 -0
  3. data/README +9 -0
  4. data/Rakefile.rb +17 -0
  5. data/lib/html5/constants.rb +818 -0
  6. data/lib/html5/filters/base.rb +10 -0
  7. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  8. data/lib/html5/filters/optionaltags.rb +198 -0
  9. data/lib/html5/filters/sanitizer.rb +15 -0
  10. data/lib/html5/filters/whitespace.rb +36 -0
  11. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  12. data/lib/html5/html5parser/after_frameset_phase.rb +34 -0
  13. data/lib/html5/html5parser/after_head_phase.rb +50 -0
  14. data/lib/html5/html5parser/before_head_phase.rb +41 -0
  15. data/lib/html5/html5parser/in_body_phase.rb +607 -0
  16. data/lib/html5/html5parser/in_caption_phase.rb +68 -0
  17. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  18. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  19. data/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  20. data/lib/html5/html5parser/in_head_phase.rb +138 -0
  21. data/lib/html5/html5parser/in_row_phase.rb +87 -0
  22. data/lib/html5/html5parser/in_select_phase.rb +84 -0
  23. data/lib/html5/html5parser/in_table_body_phase.rb +83 -0
  24. data/lib/html5/html5parser/in_table_phase.rb +110 -0
  25. data/lib/html5/html5parser/initial_phase.rb +134 -0
  26. data/lib/html5/html5parser/phase.rb +158 -0
  27. data/lib/html5/html5parser/root_element_phase.rb +42 -0
  28. data/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  29. data/lib/html5/html5parser.rb +248 -0
  30. data/lib/html5/inputstream.rb +654 -0
  31. data/lib/html5/liberalxmlparser.rb +158 -0
  32. data/lib/html5/sanitizer.rb +188 -0
  33. data/lib/html5/serializer/htmlserializer.rb +180 -0
  34. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  35. data/lib/html5/serializer.rb +2 -0
  36. data/lib/html5/tokenizer.rb +968 -0
  37. data/lib/html5/treebuilders/base.rb +334 -0
  38. data/lib/html5/treebuilders/hpricot.rb +231 -0
  39. data/lib/html5/treebuilders/rexml.rb +208 -0
  40. data/lib/html5/treebuilders/simpletree.rb +185 -0
  41. data/lib/html5/treebuilders.rb +24 -0
  42. data/lib/html5/treewalkers/base.rb +154 -0
  43. data/lib/html5/treewalkers/hpricot.rb +48 -0
  44. data/lib/html5/treewalkers/rexml.rb +48 -0
  45. data/lib/html5/treewalkers/simpletree.rb +48 -0
  46. data/lib/html5/treewalkers.rb +26 -0
  47. data/lib/html5.rb +13 -0
  48. data/parse.rb +217 -0
  49. data/tests/preamble.rb +82 -0
  50. data/tests/test_encoding.rb +35 -0
  51. data/tests/test_lxp.rb +263 -0
  52. data/tests/test_parser.rb +68 -0
  53. data/tests/test_sanitizer.rb +142 -0
  54. data/tests/test_serializer.rb +68 -0
  55. data/tests/test_stream.rb +62 -0
  56. data/tests/test_tokenizer.rb +94 -0
  57. data/tests/test_treewalkers.rb +116 -0
  58. data/tests/tokenizer_test_parser.rb +63 -0
  59. metadata +120 -0
@@ -0,0 +1,654 @@
1
+ require 'stringio'
2
+ require 'html5/constants'
3
+
4
+ module HTML5
5
+
6
+ # Provides a unicode stream of characters to the HTMLTokenizer.
7
+
8
+ # This class takes care of character encoding and removing or replacing
9
+ # incorrect byte-sequences and also provides column and line tracking.
10
+
11
+ class HTMLInputStream
12
+
13
+ attr_accessor :queue, :char_encoding, :errors
14
+
15
+ # Initialises the HTMLInputStream.
16
+ #
17
+ # HTMLInputStream(source, [encoding]) -> Normalized stream from source
18
+ # for use by the HTML5Lib.
19
+ #
20
+ # source can be either a file-object, local filename or a string.
21
+ #
22
+ # The optional encoding parameter must be a string that indicates
23
+ # the encoding. If specified, that encoding will be used,
24
+ # regardless of any BOM or later declaration (such as in a meta
25
+ # element)
26
+ #
27
+ # parseMeta - Look for a <meta> element containing encoding information
28
+
29
# Builds the input stream over +source+ and decodes its first block.
#
# source  - an IO-like object (anything responding to +read+) or a String.
# options - Hash whose entries are copied onto instance variables of the
#           same name. The ones read here are :encoding (explicit
#           transport-level encoding), :parse_meta and :chardet.
#
# When the explicit encoding is missing or not recognised by
# HTML5.is_valid_encoding, the encoding is detected from the stream.
def initialize(source, options = {})
  @encoding = nil
  @parse_meta = true
  @chardet = true
  @win1252 = false

  options.each { |name, value| instance_variable_set("@#{name}", value) }

  # Raw Stream
  @raw_stream = open_stream(source)

  # Number of bytes to use when looking for a meta element with
  # encoding information
  @NUM_BYTES_META = 512
  # Number of bytes to use when detecting the encoding using chardet
  @NUM_BYTES_CHARDET = 256
  # Number of bytes to use when reading content
  @NUM_BYTES_BUFFER = 1024

  # Encoding to use if no other information can be found
  @DEFAULT_ENCODING = 'windows-1252'

  # Detect encoding iff no valid explicit "transport level" encoding is supplied
  @char_encoding = HTML5.is_valid_encoding(@encoding) ? @encoding : detect_encoding

  # Read bytes from stream decoding them into Unicode
  @buffer = @raw_stream.read(@NUM_BYTES_BUFFER) || ''

  # Compare case-insensitively: is_valid_encoding accepts "UTF-8" or
  # " utf-8 ", so the branch taken must not depend on spelling.
  normalized = @char_encoding.to_s.downcase.strip
  if normalized == 'windows-1252'
    @win1252 = true
  elsif normalized != 'utf-8'
    begin
      require 'iconv'
      begin
        # Iconv converts in one go, so the whole stream is needed
        @buffer << @raw_stream.read.to_s unless @raw_stream.eof?
        @buffer = Iconv.iconv('utf-8', @char_encoding, @buffer).first
      rescue
        # Conversion failed: fall back to byte-wise windows-1252 decoding
        @win1252 = true
      end
    rescue LoadError
      # iconv not available: same fallback
      @win1252 = true
    end
  end

  @queue = []
  @errors = []

  # Reset position in the list to read from
  @tell = 0
  @line = @col = 0
  @line_lengths = []
end
84
+
85
+ # Produces a file object from source.
86
+ #
87
+ # source can be either a file object, local filename or a string.
88
# Produces an IO-like object from source and remembers it in @stream.
#
# source - an object responding to +read+ (used as is), or anything else,
#          which is converted with +to_s+ and wrapped in a StringIO.
#          Note that a String is always treated as content; no file is
#          opened here.
#
# Returns the stream.
def open_stream(source)
  # to_s makes nil behave as an empty document instead of depending on
  # how StringIO treats non-String arguments.
  @stream = source.respond_to?(:read) ? source : StringIO.new(source.to_s)
end
98
+
99
# Works out the character encoding of @raw_stream.
#
# Sources are tried in order: byte order mark, <meta> declaration (only
# when @parse_meta is set), the optional chardet gem (only when @chardet
# is set), and finally @DEFAULT_ENCODING.
#
# Returns the encoding name, with iso-8859-1 replaced by windows-1252.
def detect_encoding
  # First look for a BOM; this also reads past the BOM if present
  encoding = detect_bom

  # Without a BOM, look for meta elements with encoding information
  encoding = detect_encoding_meta if encoding.nil? && @parse_meta

  # Guess with chardet, if available
  if encoding.nil? && @chardet
    begin
      require 'rubygems'
      require 'UniversalDetector' # gem install chardet
      buffers = []
      detector = UniversalDetector::Detector.instance
      detector.reset
      until @raw_stream.eof?
        buffer = @raw_stream.read(@NUM_BYTES_CHARDET)
        break if !buffer || buffer.empty?
        buffers << buffer
        detector.feed(buffer)
        break if detector.instance_eval { @done }
        # The detector's last-char slot is turned back into a one-character
        # string when it holds an integer. Integer is used because Fixnum no
        # longer exists on current Ruby.
        detector.instance_eval {
          @_mLastChar = @_mLastChar.chr if Integer === @_mLastChar
        }
      end
      detector.close
      encoding = detector.result['encoding']
      # Hand everything consumed by the detector back to the stream
      seek(buffers * '', 0)
    rescue LoadError
      # chardet is optional; without it this step is skipped
    end
  end

  # If all else fails use the default encoding
  encoding = @DEFAULT_ENCODING if encoding.nil?

  # Substitute for equivalent encodings
  encoding_sub = {'iso-8859-1' => 'windows-1252'}
  encoding_sub.fetch(encoding.downcase, encoding)
end
150
+
151
+ # Attempts to detect a BOM at the start of the stream. If
152
+ # an encoding can be determined from the BOM return the name of the
153
+ # encoding otherwise return nil
154
# Looks for a byte order mark in the next four bytes of @raw_stream.
#
# The comparison is done on byte values rather than on String hash keys,
# so the result does not depend on the encoding tags Ruby attaches to
# string literals and to data read from the stream.
#
# Returns the encoding name implied by the BOM, or nil when there is no
# BOM or the stream is empty. When a BOM is found the stream is left
# positioned just after it; otherwise the bytes read are handed back.
def detect_bom
  # UTF-32LE must be tried before UTF-16LE because its mark starts with
  # the UTF-16LE mark.
  signatures = [
    [[0xEF, 0xBB, 0xBF], 'utf-8'],
    [[0xFF, 0xFE, 0x00, 0x00], 'utf-32le'],
    [[0x00, 0x00, 0xFE, 0xFF], 'utf-32be'],
    [[0xFF, 0xFE], 'utf-16le'],
    [[0xFE, 0xFF], 'utf-16be']
  ]

  head = @raw_stream.read(4)
  return nil unless head

  bytes = head.unpack('C*')
  mark, encoding = signatures.find { |sig, _name| bytes[0, sig.length] == sig }

  # Set the read position past the BOM if one was found, otherwise
  # set it to the start of the stream
  seek(head, mark ? mark.length : 0)

  encoding
end
186
+
187
# Repositions @raw_stream so that the next read starts at offset +n+ of
# +buffer+, where +buffer+ is data previously read from the stream.
#
# Three strategies are tried in order:
# 1. the stream already supports +unget+ (it was wrapped by an earlier
#    call): push buffer[n..-1] back;
# 2. the stream supports +seek+: seek to absolute offset n (this relies
#    on +buffer+ having been read from the start of the stream);
# 3. otherwise wrap the stream in a SimpleDelegator that adds a pushback
#    buffer, then push buffer[n..-1] back.
def seek(buffer, n)
  if @raw_stream.respond_to?(:unget)
    @raw_stream.unget(buffer.to_s[n..-1])
    return
  end

  if @raw_stream.respond_to?(:seek)
    begin
      @raw_stream.seek(n)
      return
    rescue Errno::ESPIPE
      # Not seekable (e.g. a pipe): fall through to the pushback wrapper
    end
  end

  require 'delegate'
  @raw_stream = SimpleDelegator.new(@raw_stream)

  class << @raw_stream
    # Reads from the pushback buffer first, then from the wrapped stream.
    def read(chars = -1)
      @data ||= ''
      chars = -1 if chars.nil?
      if chars == -1 || chars > @data.length
        result = @data
        @data = ''
        return result if __getobj__.eof?
        return result + __getobj__.read.to_s if chars == -1
        return result + __getobj__.read(chars - result.length).to_s
      elsif @data.empty?
        return __getobj__.read(chars)
      else
        # Start at index 0: the first pending byte must not be dropped
        result = @data[0...chars]
        @data = @data[chars..-1]
        return result
      end
    end

    # Pushed-back data was read before whatever is still pending, so it
    # goes in front of it.
    def unget(data)
      @data = data.to_s + @data.to_s
    end

    # The wrapper is only exhausted once the pushback buffer is empty too.
    def eof?
      (@data.nil? || @data.empty?) && __getobj__.eof?
    end
  end

  @raw_stream.unget(buffer.to_s[n..-1])
end
232
+
233
+ # Report the encoding declared by the meta element
234
# Report the encoding declared by a meta element.
#
# Reads up to @NUM_BYTES_META bytes, runs EncodingParser over them, and
# hands the bytes back to the stream.
#
# Returns whatever EncodingParser#get_encoding returns (nil when no
# usable declaration was found).
def detect_encoding_meta
  # read returns nil on an exhausted stream; use an empty buffer so the
  # pushback below does not fail on nil.
  buffer = @raw_stream.read(@NUM_BYTES_META) || ''
  parser = EncodingParser.new(buffer)
  seek(buffer, 0)
  parser.get_encoding
end
240
+
241
+ # Returns (line, col) of the current position in the stream.
242
# Returns [line, col] for the current position in the stream, with the
# line number 1-based.
#
# Characters waiting in @queue have been read but handed back, so the
# counters are walked backwards over them, newest first.
#
# Raises RuntimeError if a queued newline is reached while the column is
# not zero.
def position
  line = @line
  col = @col
  @queue.reverse_each do |queued|
    if queued != "\n"
      col -= 1
      next
    end
    line -= 1
    raise RuntimeError.new("col=#{col}") unless col == 0
    # Step back onto the previous line at its recorded length
    col = @line_lengths[line]
  end
  [line + 1, col]
end
255
+
256
+ # Read one character from the stream or queue if available. Return
257
+ # EOF when EOF is reached.
258
# Read one character from the queue if it is non-empty, otherwise from
# the buffered stream.
#
# Returns a one-character String, or the symbol :EOF once the input is
# exhausted. CR and CRLF are returned as "\n"; NUL bytes and invalid
# UTF-8 are returned as U+FFFD; in windows-1252 mode bytes 0x80-0xFF are
# mapped to their Unicode characters.
#
# The buffer is inspected byte by byte (getbyte / byteslice), so this
# needs a Ruby that provides those String methods. The column counter is
# advanced for every non-newline character returned, which keeps it in
# step with the per-character rewind done by #position.
def char
  return @queue.shift unless @queue.empty?

  # Keep four bytes of look-ahead (the longest UTF-8 sequence) so a
  # sequence or a CRLF pair is never split across two blocks.
  if @tell + 4 > @buffer.bytesize && !@raw_stream.eof?
    tail = @buffer.byteslice(@tell..-1) || ''
    chunk = @raw_stream.read(@NUM_BYTES_BUFFER) || ''
    # Join as raw bytes to avoid encoding-compatibility errors
    @buffer = tail.dup.force_encoding('BINARY') + chunk.dup.force_encoding('BINARY')
    @tell = 0
  end

  c = @buffer.getbyte(@tell)
  @tell += 1

  case c
  when 0x01..0x7F
    if c == 0x0D
      # normalize newlines
      @tell += 1 if @buffer.getbyte(@tell) == 0x0A
      c = 0x0A
    end

    # update position in stream
    if c == 0x0A
      @line_lengths << @col
      @line += 1
      @col = 0
    else
      @col += 1
    end

    c.chr

  when 0x80..0xBF
    @col += 1
    if !@win1252
      [0xFFFD].pack('U') # stray continuation byte: invalid utf-8
    elsif c <= 0x9F
      [ENTITIES_WINDOWS1252[c - 0x80]].pack('U')
    else
      [c].pack('U') # same code point in Unicode
    end

  when 0xC0..0xFF
    @col += 1
    if @win1252
      [c].pack('U') # same code point in Unicode
    else
      # Sequence length implied by the lead byte; 0xC0, 0xC1 and
      # 0xF5-0xFF can never start a valid sequence.
      expected = case c
                 when 0xC2..0xDF then 2
                 when 0xE0..0xEF then 3
                 when 0xF0..0xF4 then 4
                 end
      seq = expected && @buffer.byteslice(@tell - 1, expected)
      seq.force_encoding('UTF-8') if seq
      # valid_encoding? rejects overlong forms, surrogates and code
      # points beyond U+10FFFF
      if seq && seq.bytesize == expected && seq.valid_encoding?
        @tell += expected - 1
        seq
      else
        [0xFFFD].pack('U') # invalid utf-8
      end
    end

  when 0x00
    @col += 1
    @errors.push('null character found in input stream, ' +
      'replaced with U+FFFD')
    [0xFFFD].pack('U') # null characters are invalid

  else
    :EOF
  end
end
327
+
328
+ # Returns a string of characters from the stream up to but not
329
+ # including any character in characters or EOF. characters can be
330
+ # any container that supports the in method being called on it.
331
# Collects characters from the stream up to, but not including, the
# first character for which membership in +characters+ differs from
# +opposite+, or up to EOF.
#
# characters - any container responding to include?.
# opposite   - false (default): stop at the first character that IS in
#              +characters+; true: stop at the first one that is NOT.
#
# The character that ended the run is put back at the front of the
# queue (unless it was :EOF). Returns the collected characters joined
# into one String.
def chars_until(characters, opposite=false)
  collected = []
  current = char
  until current == :EOF || characters.include?(current) != opposite
    collected << current
    current = char
  end

  # Hand the terminating character back to where it came from
  @queue.unshift(current) unless current == :EOF
  collected.join('')
end
345
+
346
# Puts characters back at the front of the queue so that they are
# returned again by the next reads.
#
# characters - a single String, an Array of Strings, nil, or :EOF.
#              :EOF and nil leave the queue untouched.
def unget(characters)
  return if characters == :EOF
  # Array() accepts a String, an Array or nil; String#to_a is not
  # available on current Ruby.
  @queue.unshift(*Array(characters))
end
349
+ end
350
+
351
+ # String-like object with an associated position and various extra methods
352
+ # If the position is ever greater than the string length then an exception is raised
353
# A String that carries a cursor (+position+) plus scanning helpers used
# by the encoding pre-parser. Methods that need a byte at the cursor
# raise EOF once the cursor has reached the end of the string.
class EncodingBytes < String

  # Index of the current byte; starts at -1, i.e. before the first byte.
  attr_accessor :position

  def initialize(value)
    super(value)
    @position = -1
  end

  # Advances the cursor one step at a time and yields the element at the
  # cursor. The block may move the cursor itself. Stops at the last
  # element (nothing is yielded past the end) or when EOF is raised
  # inside the block.
  def each
    while @position + 1 < length
      @position += 1
      yield self[@position]
    end
  rescue EOF
    # Running off the end inside the block simply ends the iteration
  end

  # Returns the byte at the cursor as a one-character String.
  # Raises EOF when the cursor is at or beyond the end.
  def current_byte
    raise EOF if @position >= length
    return self[@position].chr
  end

  # Skip past a list of characters
  def skip(chars=SPACE_CHARACTERS)
    @position += 1 while chars.include?(current_byte)
  end

  # Look for a sequence of bytes at the cursor. If the bytes are found
  # return true and advance the position to the byte after the match.
  # Otherwise return false and leave the position alone. With +lower+ the
  # data is lower-cased before comparing.
  def match_bytes(bytes, lower=false)
    # to_s: slicing beyond the end gives nil, which simply does not match
    data = self[position...position + bytes.length].to_s
    data = data.downcase if lower
    matched = (data == bytes)
    @position += bytes.length if matched
    matched
  end

  # Look for the next occurrence of +bytes+ at or after the cursor. If a
  # match is found advance the position to the last byte of the match and
  # return true; otherwise raise EOF.
  def jump_to(bytes)
    found = index(bytes, position)
    raise EOF unless found
    @position = found + bytes.length - 1
    true
  end

  # Move the cursor forward until it points at one of the bytes in
  # +byte_list+. Raises EOF (from current_byte) if none is found.
  def find_next(byte_list)
    @position += 1 until byte_list.include?(current_byte)
  end
end
413
+
414
+ # Mini parser for detecting character encoding from meta elements
415
+ class EncodingParser
416
+
417
+ # string - the data to work on for encoding detection
418
# data - the bytes to scan for an encoding declaration; converted with
#        to_s before being wrapped in an EncodingBytes.
def initialize(data)
  @encoding = nil
  @data = EncodingBytes.new(data.to_s)
end
422
+
423
# Markup prefixes recognised by the pre-parser, paired with the handler
# method for each. Order matters: the first matching prefix wins, so
# longer prefixes come before the shorter ones they begin with.
METHOD_DISPATCH = [
  ['<!--', :handle_comment],
  ['<meta', :handle_meta],
  ['</', :handle_possible_end_tag],
  ['<!', :handle_other],
  ['<?', :handle_other],
  ['<', :handle_possible_start_tag]
].freeze

# Scans @data for an encoding declaration.
#
# At every position the prefixes in METHOD_DISPATCH are tried
# (case-insensitively) and the handler of the first match is invoked.
# A handler returning a false value ends the scan.
#
# Returns the encoding stored in @encoding with surrounding whitespace
# stripped, or nil if none was stored.
def get_encoding
  @data.each do |_byte|
    _prefix, handler = METHOD_DISPATCH.find do |prefix, _|
      @data.match_bytes(prefix, true)
    end
    next unless handler
    break unless send(handler)
  end
  @encoding = @encoding.strip unless @encoding.nil?
  @encoding
end
446
+
447
+ # Skip over comments
448
# Skips a comment by moving the cursor to the end of the next "-->".
# Returns what @data.jump_to returns.
def handle_comment
  @data.jump_to('-->')
end
451
+
452
# Handles a "<meta" match: inspects the element's attributes for an
# encoding declaration.
#
# Returns true to keep scanning (the match was not followed by a space
# character, or no valid encoding was found among the attributes), or
# false once a valid encoding has been stored in @encoding.
def handle_meta
  # "<meta" not followed by a space character: just keep going
  return true unless SPACE_CHARACTERS.include?(@data.current_byte)

  loop do
    pair = get_attribute
    return true if pair.nil?

    candidate =
      case pair[0]
      when 'charset'
        pair[1]
      when 'content'
        # The declaration is embedded in the content attribute value
        ContentAttrParser.new(EncodingBytes.new(pair[1])).parse
      else
        next
      end

    next unless HTML5.is_valid_encoding(candidate)
    @encoding = candidate
    return false
  end
end
479
+
480
# Handles a "<" match by treating it as a possible start tag.
# Returns the result of handle_possible_tag.
def handle_possible_start_tag
  handle_possible_tag(false)
end
483
+
484
# Handles a "</" match: moves the cursor one byte forward, then treats
# what follows as a possible end tag.
# Returns the result of handle_possible_tag.
def handle_possible_end_tag
  @data.position += 1
  handle_possible_tag(true)
end
488
+
489
# Skips over something that may be a tag.
#
# end_tag - true when called for a "</" match.
#
# If the current byte is an ASCII letter, the cursor moves to the next
# space character, "<" or ">". On "<" it steps back one byte so the "<"
# is processed again by the caller's scan; otherwise all attributes are
# consumed. If the current byte is not a letter, an end-tag candidate is
# handed to handle_other (after stepping back one byte) and a start-tag
# candidate is ignored.
#
# Always returns true.
def handle_possible_tag(end_tag)
  if ASCII_LETTERS.include?(@data.current_byte)
    @data.find_next(SPACE_CHARACTERS + ['<', '>'])

    if @data.current_byte == '<'
      # Return to the first step of the overall algorithm, reprocessing
      # the "<" byte
      @data.position -= 1
    else
      # Read (and discard) all attributes
      nil until get_attribute.nil?
    end
  elsif end_tag
    @data.position -= 1
    handle_other
  end
  true
end
513
+
514
# Skips a construct by moving the cursor to the next ">".
# Returns what @data.jump_to returns.
def handle_other
  @data.jump_to('>')
end
517
+
518
+ # Return a name,value pair for the next attribute in the stream,
519
+ # if one is found, or nil
520
# Reads the next attribute at the cursor.
#
# Returns [name, value] (both Strings, ASCII upper-case letters folded
# to lower case; value is '' when the attribute has none), or nil when
# the cursor is at ">" or "<" (for "<" the cursor steps back one byte so
# the "<" gets reprocessed). EOF raised by @data propagates to the
# caller.
def get_attribute
  @data.skip(SPACE_CHARACTERS + ['/'])

  if @data.current_byte == '<'
    @data.position -= 1
    return nil
  elsif @data.current_byte == '>'
    return nil
  end

  attr_name = []
  attr_value = []
  space_found = false

  # Step 5: attribute name
  loop do
    byte = @data.current_byte
    # "=" only ends the name once the name is non-empty; a leading "="
    # is part of the name. (An Array is always truthy in Ruby, so the
    # emptiness test has to be explicit.)
    if byte == '=' && !attr_name.empty?
      break
    elsif SPACE_CHARACTERS.include?(byte)
      space_found = true
      break
    elsif ['/', '<', '>'].include?(byte)
      return [attr_name.join(''), '']
    elsif ASCII_UPPERCASE.include?(byte)
      attr_name.push(byte.downcase)
    else
      attr_name.push(byte)
    end
    # Step 6
    @data.position += 1
  end

  # Step 7
  if space_found
    @data.skip
    # Step 8: a name followed by spaces but no "=" has no value
    unless @data.current_byte == '='
      @data.position -= 1
      return [attr_name.join(''), '']
    end
  end

  # Step 9: move past the "="
  @data.position += 1
  # Step 10
  @data.skip

  # Step 11: first byte of the value
  first = @data.current_byte
  if ["'", '"'].include?(first)
    # 11.1: quoted value, read up to the matching quote
    quote_char = first
    loop do
      @data.position += 1
      byte = @data.current_byte
      if byte == quote_char
        # 11.3: leave the cursor just after the closing quote
        @data.position += 1
        return [attr_name.join(''), attr_value.join('')]
      elsif ASCII_UPPERCASE.include?(byte)
        # 11.4
        attr_value.push(byte.downcase)
      else
        # 11.5
        attr_value.push(byte)
      end
    end
  elsif ['>', '<'].include?(first)
    return [attr_name.join(''), '']
  elsif ASCII_UPPERCASE.include?(first)
    attr_value.push(first.downcase)
  else
    attr_value.push(first)
  end

  # Unquoted value: read up to the next space character, ">" or "<"
  loop do
    @data.position += 1
    byte = @data.current_byte
    if (SPACE_CHARACTERS + ['>', '<']).include?(byte)
      return [attr_name.join(''), attr_value.join('')]
    elsif ASCII_UPPERCASE.include?(byte)
      attr_value.push(byte.downcase)
    else
      attr_value.push(byte)
    end
  end
end
600
+ end
601
+
602
# Extracts the charset parameter from the value of a meta element's
# content attribute (e.g. "text/html; charset=utf-8").
class ContentAttrParser
  # data - the attribute value; must provide the cursor interface used
  #        below (position, jump_to, skip, find_next, current_byte, []).
  def initialize(data)
    @data = data
  end

  # Returns the charset value as found in the data, or nil when the data
  # has no ";", no "charset" after it, no "=" after that, or an
  # unterminated quoted value.
  def parse
    @data.position = 0

    # Move past the first ";" and then past the next "charset", skipping
    # space characters after each
    [';', 'charset'].each do |marker|
      @data.jump_to(marker)
      @data.position += 1
      @data.skip
    end

    # Without an "=" sign there is no value to report
    return nil unless @data.current_byte == '='
    @data.position += 1
    @data.skip

    opener = @data.current_byte
    if opener == '"' || opener == "'"
      # Encoding between matching quote marks
      @data.position += 1
      first = @data.position
      @data.jump_to(opener)
      @data[first...@data.position]
    else
      # Unquoted value: runs to the next space character ...
      first = @data.position
      begin
        @data.find_next(SPACE_CHARACTERS)
        @data[first...@data.position]
      rescue EOF
        # ... or to the end of the data
        @data[first..-1]
      end
    end
  rescue EOF
    nil
  end
end
648
+
649
+ # Determine if a string is a supported encoding
650
# Determine if a string names a supported encoding.
#
# encoding - candidate name; anything that is not a String is rejected.
#
# Returns whether the lower-cased, stripped name is listed in ENCODINGS.
def self.is_valid_encoding(encoding)
  return false unless encoding.kind_of?(String)
  ENCODINGS.include?(encoding.downcase.strip)
end
653
+
654
+ end