spk-html5 0.10.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (74) hide show
  1. data/History.txt +10 -0
  2. data/Manifest.txt +73 -0
  3. data/README +45 -0
  4. data/Rakefile.rb +33 -0
  5. data/bin/html5 +7 -0
  6. data/lib/html5.rb +13 -0
  7. data/lib/html5/cli.rb +248 -0
  8. data/lib/html5/constants.rb +1061 -0
  9. data/lib/html5/filters/base.rb +10 -0
  10. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  11. data/lib/html5/filters/iso639codes.rb +755 -0
  12. data/lib/html5/filters/optionaltags.rb +198 -0
  13. data/lib/html5/filters/rfc2046.rb +31 -0
  14. data/lib/html5/filters/rfc3987.rb +91 -0
  15. data/lib/html5/filters/sanitizer.rb +15 -0
  16. data/lib/html5/filters/validator.rb +834 -0
  17. data/lib/html5/filters/whitespace.rb +36 -0
  18. data/lib/html5/html5parser.rb +247 -0
  19. data/lib/html5/html5parser/after_after_body_phase.rb +43 -0
  20. data/lib/html5/html5parser/after_after_frameset_phase.rb +32 -0
  21. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  22. data/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  23. data/lib/html5/html5parser/after_head_phase.rb +55 -0
  24. data/lib/html5/html5parser/before_head_phase.rb +44 -0
  25. data/lib/html5/html5parser/before_html_phase.rb +41 -0
  26. data/lib/html5/html5parser/in_body_phase.rb +636 -0
  27. data/lib/html5/html5parser/in_caption_phase.rb +69 -0
  28. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  29. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  30. data/lib/html5/html5parser/in_foreign_content_phase.rb +50 -0
  31. data/lib/html5/html5parser/in_frameset_phase.rb +56 -0
  32. data/lib/html5/html5parser/in_head_phase.rb +143 -0
  33. data/lib/html5/html5parser/in_row_phase.rb +96 -0
  34. data/lib/html5/html5parser/in_select_phase.rb +90 -0
  35. data/lib/html5/html5parser/in_select_table_phase.rb +35 -0
  36. data/lib/html5/html5parser/in_table_body_phase.rb +92 -0
  37. data/lib/html5/html5parser/in_table_phase.rb +177 -0
  38. data/lib/html5/html5parser/initial_phase.rb +133 -0
  39. data/lib/html5/html5parser/phase.rb +171 -0
  40. data/lib/html5/inputstream.rb +735 -0
  41. data/lib/html5/liberalxmlparser.rb +158 -0
  42. data/lib/html5/sanitizer.rb +209 -0
  43. data/lib/html5/serializer.rb +2 -0
  44. data/lib/html5/serializer/htmlserializer.rb +179 -0
  45. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  46. data/lib/html5/sniffer.rb +45 -0
  47. data/lib/html5/tokenizer.rb +1059 -0
  48. data/lib/html5/treebuilders.rb +24 -0
  49. data/lib/html5/treebuilders/base.rb +339 -0
  50. data/lib/html5/treebuilders/hpricot.rb +231 -0
  51. data/lib/html5/treebuilders/rexml.rb +215 -0
  52. data/lib/html5/treebuilders/simpletree.rb +191 -0
  53. data/lib/html5/treewalkers.rb +26 -0
  54. data/lib/html5/treewalkers/base.rb +162 -0
  55. data/lib/html5/treewalkers/hpricot.rb +48 -0
  56. data/lib/html5/treewalkers/rexml.rb +48 -0
  57. data/lib/html5/treewalkers/simpletree.rb +48 -0
  58. data/lib/html5/version.rb +3 -0
  59. data/test/preamble.rb +69 -0
  60. data/test/test_cli.rb +16 -0
  61. data/test/test_encoding.rb +35 -0
  62. data/test/test_input_stream.rb +26 -0
  63. data/test/test_lxp.rb +283 -0
  64. data/test/test_parser.rb +63 -0
  65. data/test/test_sanitizer.rb +173 -0
  66. data/test/test_serializer.rb +67 -0
  67. data/test/test_sniffer.rb +27 -0
  68. data/test/test_stream.rb +71 -0
  69. data/test/test_tokenizer.rb +95 -0
  70. data/test/test_treewalkers.rb +135 -0
  71. data/test/test_validator.rb +31 -0
  72. data/test/tokenizer_test_parser.rb +67 -0
  73. data/test19.rb +38 -0
  74. metadata +198 -0
@@ -0,0 +1,133 @@
1
+ require 'html5/html5parser/phase'
2
+
3
module HTML5
  # Tree-construction phase that is active before a DOCTYPE has been seen.
  #
  # This phase deals with error handling as well which is currently not
  # covered in the specification. The error handling is typically known as
  # "quirks mode". It is expected that a future version of HTML5 will define this.
  #
  # Every handler except processComment, processDoctype and
  # processSpaceCharacters reports a parse error, switches the parser to the
  # :beforeHtml phase and re-dispatches the same token to that phase.
  class InitialPhase < Phase

    # Public identifiers that select quirks mode. All entries are lower case;
    # the incoming identifier is lower-cased before it is looked up.
    QUIRKS_PUBLIC_IDS = [
      "+//silmaril//dtd html pro v0r11 19970101//en",
      "-//advasoft ltd//dtd html 3.0 aswedit + extensions//en",
      "-//as//dtd html 3.0 aswedit + extensions//en",
      "-//ietf//dtd html 2.0 level 1//en",
      "-//ietf//dtd html 2.0 level 2//en",
      "-//ietf//dtd html 2.0 strict level 1//en",
      "-//ietf//dtd html 2.0 strict level 2//en",
      "-//ietf//dtd html 2.0 strict//en",
      "-//ietf//dtd html 2.0//en",
      "-//ietf//dtd html 2.1e//en",
      "-//ietf//dtd html 3.0//en",
      "-//ietf//dtd html 3.0//en//",
      "-//ietf//dtd html 3.2 final//en",
      "-//ietf//dtd html 3.2//en",
      "-//ietf//dtd html 3//en",
      "-//ietf//dtd html level 0//en",
      "-//ietf//dtd html level 0//en//2.0",
      "-//ietf//dtd html level 1//en",
      "-//ietf//dtd html level 1//en//2.0",
      "-//ietf//dtd html level 2//en",
      "-//ietf//dtd html level 2//en//2.0",
      "-//ietf//dtd html level 3//en",
      "-//ietf//dtd html level 3//en//3.0",
      "-//ietf//dtd html strict level 0//en",
      "-//ietf//dtd html strict level 0//en//2.0",
      "-//ietf//dtd html strict level 1//en",
      "-//ietf//dtd html strict level 1//en//2.0",
      "-//ietf//dtd html strict level 2//en",
      "-//ietf//dtd html strict level 2//en//2.0",
      "-//ietf//dtd html strict level 3//en",
      "-//ietf//dtd html strict level 3//en//3.0",
      "-//ietf//dtd html strict//en",
      "-//ietf//dtd html strict//en//2.0",
      "-//ietf//dtd html strict//en//3.0",
      "-//ietf//dtd html//en",
      "-//ietf//dtd html//en//2.0",
      "-//ietf//dtd html//en//3.0",
      "-//metrius//dtd metrius presentational//en",
      "-//microsoft//dtd internet explorer 2.0 html strict//en",
      "-//microsoft//dtd internet explorer 2.0 html//en",
      "-//microsoft//dtd internet explorer 2.0 tables//en",
      "-//microsoft//dtd internet explorer 3.0 html strict//en",
      "-//microsoft//dtd internet explorer 3.0 html//en",
      "-//microsoft//dtd internet explorer 3.0 tables//en",
      "-//netscape comm. corp.//dtd html//en",
      "-//netscape comm. corp.//dtd strict html//en",
      "-//o'reilly and associates//dtd html 2.0//en",
      "-//o'reilly and associates//dtd html extended 1.0//en",
      "-//spyglass//dtd html 2.0 extended//en",
      "-//sq//dtd html 2.0 hotmetal + extensions//en",
      "-//sun microsystems corp.//dtd hotjava html//en",
      "-//sun microsystems corp.//dtd hotjava strict html//en",
      "-//w3c//dtd html 3 1995-03-24//en",
      "-//w3c//dtd html 3.2 draft//en",
      "-//w3c//dtd html 3.2 final//en",
      "-//w3c//dtd html 3.2//en",
      "-//w3c//dtd html 3.2s draft//en",
      "-//w3c//dtd html 4.0 frameset//en",
      "-//w3c//dtd html 4.0 transitional//en",
      "-//w3c//dtd html experimental 19960712//en",
      "-//w3c//dtd html experimental 970421//en",
      "-//w3c//dtd w3 html//en",
      "-//w3o//dtd w3 html 3.0//en",
      "-//w3o//dtd w3 html 3.0//en//",
      "-//w3o//dtd w3 html strict 3.0//en//",
      "-//webtechs//dtd mozilla html 2.0//en",
      "-//webtechs//dtd mozilla html//en",
      "-/w3c/dtd html 4.0 transitional/en",
      "html"
    ].freeze

    # Public identifiers that select quirks mode only when the DOCTYPE has no
    # system identifier.
    QUIRKS_PUBLIC_IDS_WITHOUT_SYSTEM_ID = [
      "-//w3c//dtd html 4.01 frameset//en",
      "-//w3c//dtd html 4.01 transitional//en"
    ].freeze

    # System identifier that selects quirks mode on its own.
    QUIRKS_SYSTEM_ID = "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"

    # EOF before any DOCTYPE: report it and let the beforeHtml phase finish up.
    def process_eof
      parse_error("expected-doctype-but-got-eof")
      @parser.phase = @parser.phases[:beforeHtml]
      @parser.phase.process_eof
    end

    # Comments seen in this phase are attached to the document node itself.
    def processComment(data)
      @tree.insert_comment(data, @tree.document)
    end

    # Records the DOCTYPE in the tree and moves on to the beforeHtml phase.
    #
    # A parse error is reported unless the DOCTYPE is exactly "html" with
    # neither a public nor a system identifier. The +correct+ argument is
    # accepted for signature compatibility and is not used here.
    def processDoctype(name, publicId, systemId, correct)
      # to_s guards against a DOCTYPE token that carries no name at all.
      doctype_name = name.to_s.downcase

      if doctype_name != 'html' || publicId || systemId
        parse_error("unknown-doctype")
      end
      # XXX need to update DOCTYPE tokens
      @tree.insertDoctype(name, publicId, systemId)

      if doctype_name != 'html' || quirks_doctype?(publicId, systemId)
        # XXX quirks mode
      end

      @parser.phase = @parser.phases[:beforeHtml]
    end

    # Whitespace before the DOCTYPE is dropped.
    def processSpaceCharacters(data)
    end

    def processCharacters(data)
      parse_error("expected-doctype-but-got-chars")
      @parser.phase = @parser.phases[:beforeHtml]
      @parser.phase.processCharacters(data)
    end

    # NOTE(review): self_closing is accepted but not forwarded to the next
    # phase; this matches the previous behaviour -- confirm whether the
    # beforeHtml phase should receive it.
    def processStartTag(name, attributes, self_closing=false)
      parse_error("expected-doctype-but-got-start-tag", {"name" => name})
      @parser.phase = @parser.phases[:beforeHtml]
      @parser.phase.processStartTag(name, attributes)
    end

    def processEndTag(name)
      parse_error("expected-doctype-but-got-end-tag", {"name" => name})
      @parser.phase = @parser.phases[:beforeHtml]
      @parser.phase.processEndTag(name)
    end

    private

    # True when the DOCTYPE identifiers name one of the known quirks-mode
    # DOCTYPEs. The public identifier is compared case-insensitively.
    def quirks_doctype?(publicId, systemId)
      public_id = publicId.to_s.downcase
      QUIRKS_PUBLIC_IDS.include?(public_id) ||
        (systemId.nil? && QUIRKS_PUBLIC_IDS_WITHOUT_SYSTEM_ID.include?(public_id)) ||
        systemId == QUIRKS_SYSTEM_ID
    end
  end
end
@@ -0,0 +1,171 @@
1
require 'forwardable'

module HTML5
  # Base class for helper objects that implement each phase of processing.
  #
  # Handler methods should be in the following order (they can be omitted):
  #
  #   * EOF
  #   * Comment
  #   * Doctype
  #   * SpaceCharacters
  #   * Characters
  #   * StartTag
  #     - startTag* methods
  #   * EndTag
  #     - endTag* methods
  #
  class Phase
    extend Forwardable
    def_delegators :@parser, :parse_error

    # Builds a tag-name => handler-method-name mapping.
    #
    # The following example call:
    #
    #   tag_handlers('startTag', 'html', %w( base link meta ), %w( li dt dd ) => 'ListItem')
    #
    # ...would return a hash equal to this:
    #
    #   { 'html' => 'startTagHtml',
    #     'base' => 'startTagBaseLinkMeta',
    #     'link' => 'startTagBaseLinkMeta',
    #     'meta' => 'startTagBaseLinkMeta',
    #     'li'   => 'startTagListItem',
    #     'dt'   => 'startTagListItem',
    #     'dd'   => 'startTagListItem' }
    #
    # A trailing Hash argument maps tag name(s) to an explicit method-name
    # suffix; every other argument derives the suffix by capitalizing and
    # joining its tag names.
    def self.tag_handlers(prefix, *tags)
      mapping = {}
      if tags.last.is_a?(Hash)
        tags.pop.each do |names, handler_method_suffix|
          handler_method = prefix + handler_method_suffix
          Array(names).each {|name| mapping[name] = handler_method }
        end
      end
      tags.each do |names|
        names = Array(names)
        handler_method = prefix + names.map {|name| name.capitalize }.join
        names.each {|name| mapping[name] = handler_method }
      end
      mapping
    end

    # Start tag dispatch table of this class. Unregistered tags fall back to
    # 'startTagOther'.
    def self.start_tag_handlers
      @start_tag_handlers ||= Hash.new('startTagOther')
    end

    # Declare what start tags this Phase handles. Can be called more than once.
    #
    # Example usage:
    #
    #   handle_start 'html'
    #   # html start tags will be handled by a method named 'startTagHtml'
    #
    #   handle_start %w( base link meta )
    #   # base, link and meta start tags will be handled by a method named 'startTagBaseLinkMeta'
    #
    #   handle_start %w( li dt dd ) => 'ListItem'
    #   # li, dt, and dd start tags will be handled by a method named 'startTagListItem'
    #
    def self.handle_start(*tags)
      start_tag_handlers.update tag_handlers('startTag', *tags)
    end

    # End tag dispatch table of this class. Unregistered tags fall back to
    # 'endTagOther'.
    def self.end_tag_handlers
      @end_tag_handlers ||= Hash.new('endTagOther')
    end

    # Declare what end tags this Phase handles. Behaves like handle_start.
    #
    def self.handle_end(*tags)
      end_tag_handlers.update tag_handlers('endTag', *tags)
    end

    def initialize(parser, tree)
      @parser, @tree = parser, tree
    end

    # Closes implied end tags and reports a parse error when elements other
    # than the expected root/body pair are still open at end of input.
    def process_eof
      @tree.generateImpliedEndTags

      if @tree.open_elements.length > 2
        parse_error("expected-closing-tag-but-got-eof")
      elsif @tree.open_elements.length == 2 and @tree.open_elements[1].name != 'body'
        # This happens for framesets or something?
        parse_error("expected-closing-tag-but-got-eof")
      elsif @parser.inner_html and @tree.open_elements.length > 1
        # XXX This is not what the specification says. Not sure what to do here.
        parse_error("eof-in-innerhtml")
      end
      # Betting ends.
    end

    def processComment(data)
      # For most phases the following is correct. Where it's not it will be
      # overridden.
      @tree.insert_comment(data, @tree.open_elements.last)
    end

    def processDoctype(name, publicId, systemId, correct)
      parse_error("unexpected-doctype")
    end

    def processSpaceCharacters(data)
      @tree.insertText(data)
    end

    # Dispatches a start tag to the handler registered for +name+.
    # Handlers declared with two parameters do not receive +self_closing+.
    def processStartTag(name, attributes, self_closing=false)
      handler = self.class.start_tag_handlers[name]
      if method(handler).arity == 2
        send handler, name, attributes
      else
        send handler, name, attributes, self_closing
      end
    end

    # Merges the attributes of a repeated <html> start tag into the root
    # element without overwriting attributes that are already set.
    def startTagHtml(name, attributes)
      if @parser.first_start_tag == false and name == 'html'
        parse_error("non-html-root")
      end
      # XXX Need a check here to see if the first start tag token emitted is
      # this token... If it's not, invoke parse_error.
      attributes.each do |attr, value|
        unless @tree.open_elements.first.attributes.has_key?(attr)
          @tree.open_elements.first.attributes[attr] = value
        end
      end
      @parser.first_start_tag = false
    end

    # Dispatches an end tag to the handler registered for +name+.
    def processEndTag(name)
      send self.class.end_tag_handlers[name], name
    end

    # Raises AssertionError unless +value+ is truthy.
    # (raise, not throw: throw is for catch/throw control flow and would
    # surface as an uncaught-throw error instead of the assertion error.)
    def assert(value)
      raise AssertionError.new unless value
    end

    def in_scope?(*args)
      @tree.elementInScope(*args)
    end

    # Pops open elements until one named +name+ has been popped or, when
    # +name+ is nil, until the given block returns true for the popped
    # element. Returns the last element popped (nil if the stack was empty).
    def remove_open_elements_until(name=nil)
      finished = false
      until finished || @tree.open_elements.length == 0
        element = @tree.open_elements.pop
        finished = name.nil? ? yield(element) : element.name == name
      end
      return element
    end

    # Returns the attribute pairs with 'definitionurl' renamed to
    # 'definitionURL'; all other pairs are passed through untouched.
    def adjust_mathml_attributes(attributes)
      attributes.collect do |a|
        if a.first =='definitionurl'
          ['definitionURL', a[1]]
        else
          a
        end
      end
    end

    # Currently a no-op: returns +attributes+ unchanged.
    def adjust_foreign_attributes(attributes)
      attributes
    end
  end
end
@@ -0,0 +1,735 @@
1
+ require 'stringio'
2
+ require 'html5/constants'
3
+
4
+ module HTML5
5
+
6
+ # Provides a unicode stream of characters to the HTMLTokenizer.
7
+
8
+ # This class takes care of character encoding and removing or replacing
9
+ # incorrect byte-sequences and also provides column and line tracking.
10
+
11
class HTMLInputStream

  attr_accessor :queue, :char_encoding, :errors

  # see /usr/lib/ruby/1.9.1/rexml/text.rb
  VALID_CHAR = [
    0x9, 0xA, 0xD,
    (0x20..0xD7FF),
    (0xE000..0xFFFD),
    (0x10000..0x10FFFF)
  ]
  if String.method_defined? :encode
    VALID_XML_CHARS = Regexp.new('^['+
      VALID_CHAR.map { |item|
        case item
        when Integer # Fixnum is gone from current Rubies; Integer matches on every version
          [item].pack('U').force_encoding('utf-8')
        when Range
          [item.first, '-'.ord, item.last].pack('UUU').force_encoding('utf-8')
        end
      }.join +
    ']*$')
  else
    VALID_XML_CHARS = /^(
       | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
       |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
       | [\xE1-\xEC\xEE][\x80-\xBF]{2}      # straight 3-byte
       |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
       |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
       | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
       |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
      )*$/nx;
  end

  # Initialises the HTMLInputStream.
  #
  # HTMLInputStream(source, [encoding]) -> Normalized stream from source
  # for use by the HTML5Lib.
  #
  # source can be either an object that responds to #read or a string.
  #
  # options is a hash whose entries are copied into instance variables of
  # the same name. The ones read by this class are:
  #
  #   :encoding   - if given, that encoding is used regardless of any BOM
  #                 or later declaration (such as in a meta element)
  #   :parse_meta - look for a <meta> element containing encoding
  #                 information (default true)
  #   :chardet    - guess the encoding with the chardet gem when it is
  #                 installed (default true)
  def initialize(source, options = {})
    @encoding = nil
    @parse_meta = true
    @chardet = true
    @win1252 = false

    options.each {|name, value| instance_variable_set("@#{name}", value) }

    # partial Ruby 1.9 support
    if @encoding and source.respond_to? :force_encoding
      source.force_encoding(@encoding) rescue nil
    end

    # Raw Stream
    @raw_stream = open_stream(source)

    # Encoding Information
    #Number of bytes to use when looking for a meta element with
    #encoding information
    @NUM_BYTES_META = 512
    #Number of bytes to use when using detecting encoding using chardet
    @NUM_BYTES_CHARDET = 256
    #Number of bytes to use when reading content
    @NUM_BYTES_BUFFER = 1024

    #Encoding to use if no other information can be found
    @DEFAULT_ENCODING = 'windows-1252'

    #Detect encoding iff no explicit "transport level" encoding is supplied
    if @encoding.nil?
      @char_encoding = detect_encoding
    else
      @char_encoding = @encoding
    end

    # Read bytes from stream decoding them into Unicode
    @buffer = @raw_stream.read(@NUM_BYTES_BUFFER) || String.new
    if @char_encoding == 'windows-1252'
      @win1252 = true
    elsif @char_encoding != 'utf-8'
      begin
        # Any other encoding is converted in one go, so slurp the rest first.
        @buffer << @raw_stream.read unless @raw_stream.eof?
        @buffer = transcode_to_utf8(@buffer, @char_encoding)
      rescue StandardError
        # Conversion failed: treat the bytes as windows-1252 instead.
        @win1252 = true
      end
    end

    @queue = []
    @errors = []

    # Reset position in the list to read from
    @tell = 0
    @line = @col = 0
    @line_lengths = []
  end

  # Produces a readable stream from source.
  #
  # source is returned as is when it responds to #read; anything else is
  # treated as a string and wrapped in a StringIO.
  def open_stream(source)
    if source.respond_to?(:read)
      source
    else
      StringIO.new(source)
    end
  end

  # Works out the character encoding of the raw stream: BOM first, then a
  # <meta> declaration, then chardet, then the default encoding.
  def detect_encoding

    #First look for a BOM
    #This will also read past the BOM if present
    encoding = detect_bom

    #If there is no BOM need to look for meta elements with encoding
    #information
    if encoding.nil? and @parse_meta
      encoding = detect_encoding_meta
    end

    #Guess with chardet, if available
    if encoding.nil? and @chardet
      begin
        require 'rubygems'
        require 'UniversalDetector' # gem install chardet
        buffers = []
        detector = UniversalDetector::Detector.instance
        detector.reset
        until @raw_stream.eof?
          buffer = @raw_stream.read(@NUM_BYTES_CHARDET)
          break if !buffer or buffer.empty?
          buffers << buffer
          detector.feed(buffer)
          break if detector.instance_eval {@done}
          detector.instance_eval {
            @_mLastChar = @_mLastChar.chr if Integer === @_mLastChar
          }
        end
        detector.close
        encoding = detector.result['encoding']
        # Hand everything chardet consumed back to the stream.
        seek(buffers*'', 0)
      rescue LoadError
        # chardet is optional
      end
    end

    # If all else fails use the default encoding
    if encoding.nil?
      encoding = @DEFAULT_ENCODING
    end

    #Substitute for equivalent encoding
    if 'iso-8859-1' == encoding.downcase
      encoding = 'windows-1252'
    end

    encoding
  end

  # Attempts to detect a BOM at the start of the stream. If
  # an encoding can be determined from the BOM return the name of the
  # encoding otherwise return nil.
  #
  # The comparison is done on byte values so that it does not depend on
  # the encoding tag carried by the string the stream returns.
  def detect_bom
    bom_dict = {
      [0xEF, 0xBB, 0xBF]       => 'utf-8',
      [0xFF, 0xFE]             => 'utf-16le',
      [0xFE, 0xFF]             => 'utf-16be',
      [0xFF, 0xFE, 0x00, 0x00] => 'utf-32le',
      [0x00, 0x00, 0xFE, 0xFF] => 'utf-32be'
    }

    # Read the first 4 bytes of the stream
    string = @raw_stream.read(4)
    return nil unless string

    bytes = string.unpack('C*')
    encoding = nil
    bom_length = 0
    # UTF-8 first, then UTF-32 before UTF-16 (the UTF-32LE BOM starts with
    # the UTF-16LE one). A candidate is only tried when enough bytes were
    # read, so bom_length never exceeds the data.
    [3, 4, 2].each do |candidate_length|
      next if bytes.length < candidate_length
      encoding = bom_dict[bytes[0, candidate_length]]
      if encoding
        bom_length = candidate_length
        break
      end
    end

    # Set the read position past the BOM if one was found, otherwise
    # set it to the start of the stream
    seek(string, bom_length)

    return encoding
  end

  # Rewinds the raw stream so that the next read starts at byte +n+ of
  # +buffer+, where +buffer+ is data previously read from the start of the
  # stream.
  #
  # Streams that respond to #unget get the tail of +buffer+ pushed back;
  # seekable streams are repositioned to offset +n+; any other stream is
  # wrapped in a delegator that adds a push-back buffer.
  def seek(buffer, n)
    if @raw_stream.respond_to?(:unget)
      @raw_stream.unget(buffer[n..-1].to_s)
      return
    end

    if @raw_stream.respond_to?(:seek)
      begin
        @raw_stream.seek(n)
        return
      rescue Errno::ESPIPE
        # not actually seekable (e.g. a pipe): fall through to the wrapper
      end
    end

    require 'delegate'
    @raw_stream = SimpleDelegator.new(@raw_stream)

    class << @raw_stream
      # Reads from the push-back buffer first, then from the wrapped stream.
      def read(chars=-1)
        @data ||= String.new
        if chars == -1 or chars > @data.length
          result = @data
          @data = String.new
          return result if __getobj__.eof?
          return result + __getobj__.read.to_s if chars == -1
          return result + __getobj__.read(chars-result.length).to_s
        elsif @data.empty?
          return __getobj__.read(chars)
        else
          # Serve the request from the front of the push-back buffer.
          result = @data[0...chars]
          @data = @data[chars..-1]
          return result
        end
      end

      # Pushes data back so that it is returned by the next read, ahead of
      # anything that is still waiting in the push-back buffer.
      def unget(data)
        if !@data or @data.empty?
          @data = data
        else
          @data = data + @data
        end
      end

      # Not at EOF while pushed-back data is still waiting to be read.
      def eof?
        (!@data or @data.empty?) and __getobj__.eof?
      end
    end

    @raw_stream.unget(buffer[n .. -1].to_s)
  end

  # Report the encoding declared by the meta element
  def detect_encoding_meta
    # read returns nil on an exhausted stream; use an empty buffer then
    buffer = @raw_stream.read(@NUM_BYTES_META) || String.new
    parser = EncodingParser.new(buffer)
    seek(buffer, 0)
    return parser.get_encoding
  end

  # Returns (line, col) of the current position in the stream.
  def position
    line, col = @line, @col
    if @queue and @queue.last != :EOF
      # Characters waiting in the queue were already counted; step back
      # over them.
      @queue.reverse.each do |c|
        if c == "\n"
          line -= 1
          raise RuntimeError.new("col=#{col}") unless col == 0
          col = @line_lengths[line]
        else
          col -= 1
        end
      end
    end
    return [line + 1, col]
  end

  # Read one character from the stream or queue if available. Return
  # EOF when EOF is reached.
  def char
    return @queue.shift unless @queue.empty?

    if @tell + 3 > @buffer.length && !@raw_stream.eof?
      # read next block
      @buffer = @buffer[@tell..-1] + (@raw_stream.read(@NUM_BYTES_BUFFER) || '')
      @tell = 0
    end

    c = @buffer[@tell]
    @tell += 1

    case c

    when String
      # partial Ruby 1.9 support: indexing a String yields a String
      case c
      when "\0"
        @errors.push("null-character")
        c = "\uFFFD" # null characters are invalid
      when "\r"
        # normalize newlines
        @tell += 1 if @buffer[@tell] == "\n"
        c = "\n"
      when "\x80" .. "\x9F"
        c = ENTITIES_WINDOWS1252[c.ord-0x80].chr('utf-8')
      when "\xA0" .. "\xFF"
        if c.encoding == Encoding::ASCII_8BIT
          c = c.encode('utf-8','iso-8859-1')
        end
      end

      # update position in stream
      if c == "\x0a"
        @line_lengths << @col
        @line += 1
        @col = 0
      else
        @col += 1
      end

      c

    # The Integer branches below are reached on Ruby 1.8, where indexing a
    # String yields a byte value.
    when 0x01..0x7F
      if c == 0x0D
        # normalize newlines
        @tell += 1 if @buffer[@tell] == 0x0A
        c = 0x0A
      end

      # update position in stream
      if c == 0x0a
        @line_lengths << @col
        @line += 1
        @col = 0
      else
        @col += 1
      end

      c.chr

    when 0x80..0xBF
      if !@win1252
        [0xFFFD].pack('U') # invalid utf-8
      elsif c <= 0x9f
        [ENTITIES_WINDOWS1252[c-0x80]].pack('U')
      else
        "\xC2" + c.chr # convert to utf-8
      end

    when 0xC0..0xFF
      if instance_variable_defined?("@win1252") && @win1252
        "\xC3" + (c - 64).chr # convert to utf-8
      elsif @buffer[@tell - 1..@tell + 3] =~ VALID_XML_CHARS
        @tell += $1.length - 1
        $1
      else
        [0xFFFD].pack('U') # invalid utf-8
      end

    when 0x00
      @errors.push("null-character")
      [0xFFFD].pack('U') # null characters are invalid

    else
      :EOF
    end
  end

  # Returns a string of characters from the stream up to but not
  # including any character in characters or EOF. characters can be
  # any object that responds to include?. With opposite set to true the
  # test is inverted: reading stops at the first character that is NOT in
  # characters.
  def chars_until(characters, opposite=false)
    char_stack = [char]

    while char_stack.last != :EOF
      break unless (characters.include?(char_stack.last)) == opposite
      char_stack.push(char)
    end

    # Put the character stopped on back to the front of the queue
    # from where it came.
    c = char_stack.pop
    @queue.insert(0, c) unless c == :EOF
    return char_stack.join('')
  end

  # Pushes characters back onto the front of the queue so that they are
  # returned by the next calls to char. :EOF is ignored.
  def unget(characters)
    return if characters == :EOF
    if characters.respond_to? :to_a
      @queue.unshift(*characters.to_a)
    else
      characters.reverse.each_char {|c| @queue.unshift(c)}
    end
  end

  private

  # Converts data from the named encoding to UTF-8. Iconv is used when it
  # can be loaded; otherwise String#encode is used where it exists.
  # Conversion errors propagate to the caller.
  def transcode_to_utf8(data, encoding)
    begin
      require 'iconv'
    rescue LoadError
      raise unless data.respond_to?(:encode)
      return data.dup.force_encoding(encoding).encode('utf-8')
    end
    Iconv.iconv('utf-8', encoding, data).first
  end
end
414
+
415
+ # String-like object with an assosiated position and various extra methods
416
+ # If the position is ever greater than the string length then an exception is raised
417
class EncodingBytes < String

  # Index of the current byte. Starts at -1, i.e. before the first byte.
  attr_accessor :position

  def initialize(value)
    super(value)
    @position = -1
  end

  # Advances the position one element at a time and yields the element at
  # the new position, stopping after the last element. The block may move
  # the position itself. An EOF raised inside the block ends the iteration
  # quietly.
  def each
    while @position + 1 < length
      @position += 1
      yield self[@position]
    end
  rescue EOF
  end

  # Returns the element at the current position as a one-character string.
  # Raises EOF when the position is at or beyond the end.
  def current_byte
    raise EOF if @position >= length
    return self[@position].chr
  end

  # Skip past a list of characters
  def skip(chars=SPACE_CHARACTERS)
    while chars.include?(current_byte)
      @position += 1
    end
  end

  # Look for a sequence of bytes at the current position. If the bytes
  # are found return true and advance the position to the byte after the
  # match. Otherwise return false and leave the position alone.
  # With lower set, the data is lower-cased before it is compared.
  def match_bytes(bytes, lower=false)
    data = self[position ... position+bytes.length]
    return false if data.nil? # position is beyond the end
    data = data.downcase if lower
    rv = (data == bytes)
    @position += bytes.length if rv == true
    return rv
  end

  # Look for the next sequence of bytes matching a given sequence. If
  # a match is found advance the position to the last byte of the match
  # and return true; otherwise raise EOF.
  def jump_to(bytes)
    remainder = self[position .. -1]
    new_position = remainder && remainder.index(bytes)
    raise EOF unless new_position
    @position += (new_position + bytes.length-1)
    return true
  end

  # Move the pointer so it points to the next byte in a set of possible
  # bytes. Raises EOF (via current_byte) when none is found.
  def find_next(byte_list)
    until byte_list.include?(current_byte)
      @position += 1
    end
  end
end
477
+
478
+ # Mini parser for detecting character encoding from meta elements
479
class EncodingParser
  ASCII_PUNCTUATION = %r{[\x09-\x0D\x20-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E]}
  # a (hopefully) temporary hack to deal with the fact that ruby doesn't have a built in encodings
  # library. Maps a normalised name (lower case, punctuation and whitespace
  # removed) to the encoding name that is reported.
  ENCODINGS = ['euc_jp', 'utf-8', "iso8859-2", "iso-8859-1", "utf-16", "UTF-16LE", "UTF-16BE"].inject({}){|m, v| m[v.downcase.gsub(ASCII_PUNCTUATION, '')] = v; m}

  # data - the data to work on for encoding detection
  def initialize(data)
    @data = EncodingBytes.new(data.to_s)
    @encoding = nil
  end

  # Prefix => handler pairs, tried in order at every position; longer
  # prefixes come before the shorter ones they start with.
  @@method_dispatch = [
    ['<!--', :handle_comment],
    ['<meta', :handle_meta],
    ['</', :handle_possible_end_tag],
    ['<!', :handle_other],
    ['<?', :handle_other],
    ['<', :handle_possible_start_tag]
  ]

  # Scans the data for a <meta> element that declares an encoding and
  # returns the encoding name, or nil when none was found. A declared
  # UTF-16 or UTF-32 variant is reported as 'utf-8'.
  def get_encoding
    @data.each do |byte|
      keep_parsing = true
      @@method_dispatch.each do |(key, method)|
        if @data.match_bytes(key, true)
          keep_parsing = send(method)
          break
        end
      end
      break unless keep_parsing
    end

    unless @encoding.nil?
      @encoding = @encoding.strip
      if ["utf16", "utf16be", "utf16le", "utf32", "utf32be", "utf32le"].include?(@encoding.downcase.gsub(ASCII_PUNCTUATION, ''))
        @encoding = 'utf-8'
      end
    end

    return @encoding
  end

  # Skip over comments
  def handle_comment
    return @data.jump_to('-->')
  end

  # Examines the attributes of a <meta> element. Returns false (stop
  # parsing) once a known encoding has been found, true otherwise.
  def handle_meta
    # if we have <meta not followed by a space so just keep going
    return true unless SPACE_CHARACTERS.include?(@data.current_byte)

    #We have a valid meta element we want to search for attributes
    while true
      #Try to find the next attribute after the current position
      attr = get_attribute

      return true if attr.nil?
      if attr[0] == 'charset'
        tentative_encoding = attr[1]
        codec = codec_name(tentative_encoding)
        if codec
          @encoding = codec
          return false
        end
      elsif attr[0] == 'content'
        content_parser = ContentAttrParser.new(EncodingBytes.new(attr[1]))
        tentative_encoding = content_parser.parse
        codec = codec_name(tentative_encoding)
        if codec
          @encoding = codec
          return false
        end
      end
    end
  end

  def handle_possible_start_tag
    return handle_possible_tag(false)
  end

  def handle_possible_end_tag
    @data.position += 1
    return handle_possible_tag(true)
  end

  # Skips over a start or end tag together with its attributes.
  # Always returns true (keep parsing).
  def handle_possible_tag(end_tag)
    unless ASCII_LETTERS.include?(@data.current_byte)
      #If the next byte is not an ascii letter either ignore this
      #fragment (possible start tag case) or treat it according to
      #handle_other
      if end_tag
        @data.position -= 1
        handle_other
      end
      return true
    end

    @data.find_next(SPACE_CHARACTERS + ['<', '>'])

    if @data.current_byte == '<'
      #return to the first step in the overall "two step" algorithm
      #reprocessing the < byte
      @data.position -= 1
    else
      #Read all attributes
      {} until get_attribute.nil?
    end
    return true
  end

  def handle_other
    return @data.jump_to('>')
  end

  # Return a name,value pair for the next attribute in the stream,
  # if one is found, or nil. Names and values are lower-cased (ASCII).
  def get_attribute
    @data.skip(SPACE_CHARACTERS + ['/'])

    if @data.current_byte == '<'
      @data.position -= 1
      return nil
    elsif @data.current_byte == '>'
      return nil
    end

    attr_name = []
    attr_value = []
    space_found = false
    #Step 5 attribute name
    while true
      # "=" only ends the name once the name is non-empty; a leading "="
      # is part of the name. (An Array is always truthy, so emptiness has
      # to be tested explicitly.)
      if @data.current_byte == '=' and !attr_name.empty?
        break
      elsif SPACE_CHARACTERS.include?(@data.current_byte)
        space_found = true
        break
      elsif ['/', '<', '>'].include?(@data.current_byte)
        return [attr_name.join(''), '']
      else
        attr_name.push(lowercased_current_byte)
      end
      #Step 6
      @data.position += 1
    end
    #Step 7
    if space_found
      @data.skip
      #Step 8
      unless @data.current_byte == '='
        @data.position -= 1
        return [attr_name.join(''), '']
      end
    end
    #XXX need to advance position in both spaces and value case
    #Step 9
    @data.position += 1
    #Step 10
    @data.skip
    #Step 11
    if ["'", '"'].include?(@data.current_byte)
      #11.1
      quote_char = @data.current_byte
      while true
        @data.position+=1
        #11.3
        if @data.current_byte == quote_char
          @data.position += 1
          return [attr_name.join(''), attr_value.join('')]
        #11.4, 11.5
        else
          attr_value.push(lowercased_current_byte)
        end
      end
    elsif ['>', '<'].include?(@data.current_byte)
      return [attr_name.join(''), '']
    else
      attr_value.push(lowercased_current_byte)
    end
    # Unquoted value: runs up to the next space, "<" or ">".
    while true
      @data.position += 1
      if (SPACE_CHARACTERS + ['>', '<']).include?(@data.current_byte)
        return [attr_name.join(''), attr_value.join('')]
      else
        attr_value.push(lowercased_current_byte)
      end
    end
  end

  # Looks up the reported name for an encoding label. Returns nil when the
  # label is not a String or is not listed in ENCODINGS.
  def codec_name(encoding)
    if (!encoding.nil? && encoding.kind_of?(String))
      canonical_name = encoding.downcase.gsub(ASCII_PUNCTUATION, '')
      ENCODINGS[canonical_name]
    else
      nil
    end
  end

  private

  # The current byte, lower-cased when it is an ASCII upper-case letter.
  def lowercased_current_byte
    byte = @data.current_byte
    ASCII_UPPERCASE.include?(byte) ? byte.downcase : byte
  end
end
687
+
688
class ContentAttrParser
  # data is the cursor-style byte string holding the value of a content
  # attribute (it must offer position, jump_to, skip, find_next and
  # current_byte).
  def initialize(data)
    @data = data
  end

  # Extracts the charset value that follows the first ";" of the content
  # attribute. Returns the value as a string, or nil when there is no ";",
  # no "charset", no "=" after it, or the data ends too early.
  def parse
    # Start from the beginning and move past the first ";"
    @data.position = 0
    @data.jump_to(';')
    step_over_spaces

    # Move past the word "charset"
    @data.jump_to('charset')
    step_over_spaces

    # Without an "=" sign there is nothing to extract
    return nil unless @data.current_byte == '='
    step_over_spaces

    opening = @data.current_byte
    if opening == '"' || opening == "'"
      # Quoted value: everything up to the matching quote mark
      @data.position += 1
      value_start = @data.position
      @data.jump_to(opening)
      @data[value_start ... @data.position]
    else
      # Unquoted value: everything up to the next space character...
      value_start = @data.position
      begin
        @data.find_next(SPACE_CHARACTERS)
        @data[value_start ... @data.position]
      rescue EOF
        # ...or the whole remaining value when there is none
        @data[value_start .. -1]
      end
    end
  rescue EOF
    nil
  end

  private

  # Moves one byte forward and then past any space characters.
  def step_over_spaces
    @data.position += 1
    @data.skip
  end
end
734
+
735
+ end