spk-html5 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. data/History.txt +10 -0
  2. data/Manifest.txt +73 -0
  3. data/README +45 -0
  4. data/Rakefile.rb +33 -0
  5. data/bin/html5 +7 -0
  6. data/lib/html5.rb +13 -0
  7. data/lib/html5/cli.rb +248 -0
  8. data/lib/html5/constants.rb +1061 -0
  9. data/lib/html5/filters/base.rb +10 -0
  10. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  11. data/lib/html5/filters/iso639codes.rb +755 -0
  12. data/lib/html5/filters/optionaltags.rb +198 -0
  13. data/lib/html5/filters/rfc2046.rb +31 -0
  14. data/lib/html5/filters/rfc3987.rb +91 -0
  15. data/lib/html5/filters/sanitizer.rb +15 -0
  16. data/lib/html5/filters/validator.rb +834 -0
  17. data/lib/html5/filters/whitespace.rb +36 -0
  18. data/lib/html5/html5parser.rb +247 -0
  19. data/lib/html5/html5parser/after_after_body_phase.rb +43 -0
  20. data/lib/html5/html5parser/after_after_frameset_phase.rb +32 -0
  21. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  22. data/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  23. data/lib/html5/html5parser/after_head_phase.rb +55 -0
  24. data/lib/html5/html5parser/before_head_phase.rb +44 -0
  25. data/lib/html5/html5parser/before_html_phase.rb +41 -0
  26. data/lib/html5/html5parser/in_body_phase.rb +636 -0
  27. data/lib/html5/html5parser/in_caption_phase.rb +69 -0
  28. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  29. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  30. data/lib/html5/html5parser/in_foreign_content_phase.rb +50 -0
  31. data/lib/html5/html5parser/in_frameset_phase.rb +56 -0
  32. data/lib/html5/html5parser/in_head_phase.rb +143 -0
  33. data/lib/html5/html5parser/in_row_phase.rb +96 -0
  34. data/lib/html5/html5parser/in_select_phase.rb +90 -0
  35. data/lib/html5/html5parser/in_select_table_phase.rb +35 -0
  36. data/lib/html5/html5parser/in_table_body_phase.rb +92 -0
  37. data/lib/html5/html5parser/in_table_phase.rb +177 -0
  38. data/lib/html5/html5parser/initial_phase.rb +133 -0
  39. data/lib/html5/html5parser/phase.rb +171 -0
  40. data/lib/html5/inputstream.rb +735 -0
  41. data/lib/html5/liberalxmlparser.rb +158 -0
  42. data/lib/html5/sanitizer.rb +209 -0
  43. data/lib/html5/serializer.rb +2 -0
  44. data/lib/html5/serializer/htmlserializer.rb +179 -0
  45. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  46. data/lib/html5/sniffer.rb +45 -0
  47. data/lib/html5/tokenizer.rb +1059 -0
  48. data/lib/html5/treebuilders.rb +24 -0
  49. data/lib/html5/treebuilders/base.rb +339 -0
  50. data/lib/html5/treebuilders/hpricot.rb +231 -0
  51. data/lib/html5/treebuilders/rexml.rb +215 -0
  52. data/lib/html5/treebuilders/simpletree.rb +191 -0
  53. data/lib/html5/treewalkers.rb +26 -0
  54. data/lib/html5/treewalkers/base.rb +162 -0
  55. data/lib/html5/treewalkers/hpricot.rb +48 -0
  56. data/lib/html5/treewalkers/rexml.rb +48 -0
  57. data/lib/html5/treewalkers/simpletree.rb +48 -0
  58. data/lib/html5/version.rb +3 -0
  59. data/test/preamble.rb +69 -0
  60. data/test/test_cli.rb +16 -0
  61. data/test/test_encoding.rb +35 -0
  62. data/test/test_input_stream.rb +26 -0
  63. data/test/test_lxp.rb +283 -0
  64. data/test/test_parser.rb +63 -0
  65. data/test/test_sanitizer.rb +173 -0
  66. data/test/test_serializer.rb +67 -0
  67. data/test/test_sniffer.rb +27 -0
  68. data/test/test_stream.rb +71 -0
  69. data/test/test_tokenizer.rb +95 -0
  70. data/test/test_treewalkers.rb +135 -0
  71. data/test/test_validator.rb +31 -0
  72. data/test/tokenizer_test_parser.rb +67 -0
  73. data/test19.rb +38 -0
  74. metadata +198 -0
@@ -0,0 +1,133 @@
1
+ require 'html5/html5parser/phase'
2
+
3
+ module HTML5
4
class InitialPhase < Phase

  # This phase deals with error handling as well which is currently not
  # covered in the specification. The error handling is typically known as
  # "quirks mode". It is expected that a future version of HTML5 will define this.

  # Lower-cased public identifiers that select quirks mode regardless of the
  # system identifier.
  QUIRKS_PUBLIC_IDS = [
    "+//silmaril//dtd html pro v0r11 19970101//en",
    "-//advasoft ltd//dtd html 3.0 aswedit + extensions//en",
    "-//as//dtd html 3.0 aswedit + extensions//en",
    "-//ietf//dtd html 2.0 level 1//en",
    "-//ietf//dtd html 2.0 level 2//en",
    "-//ietf//dtd html 2.0 strict level 1//en",
    "-//ietf//dtd html 2.0 strict level 2//en",
    "-//ietf//dtd html 2.0 strict//en",
    "-//ietf//dtd html 2.0//en",
    "-//ietf//dtd html 2.1e//en",
    "-//ietf//dtd html 3.0//en",
    "-//ietf//dtd html 3.0//en//",
    "-//ietf//dtd html 3.2 final//en",
    "-//ietf//dtd html 3.2//en",
    "-//ietf//dtd html 3//en",
    "-//ietf//dtd html level 0//en",
    "-//ietf//dtd html level 0//en//2.0",
    "-//ietf//dtd html level 1//en",
    "-//ietf//dtd html level 1//en//2.0",
    "-//ietf//dtd html level 2//en",
    "-//ietf//dtd html level 2//en//2.0",
    "-//ietf//dtd html level 3//en",
    "-//ietf//dtd html level 3//en//3.0",
    "-//ietf//dtd html strict level 0//en",
    "-//ietf//dtd html strict level 0//en//2.0",
    "-//ietf//dtd html strict level 1//en",
    "-//ietf//dtd html strict level 1//en//2.0",
    "-//ietf//dtd html strict level 2//en",
    "-//ietf//dtd html strict level 2//en//2.0",
    "-//ietf//dtd html strict level 3//en",
    "-//ietf//dtd html strict level 3//en//3.0",
    "-//ietf//dtd html strict//en",
    "-//ietf//dtd html strict//en//2.0",
    "-//ietf//dtd html strict//en//3.0",
    "-//ietf//dtd html//en",
    "-//ietf//dtd html//en//2.0",
    "-//ietf//dtd html//en//3.0",
    "-//metrius//dtd metrius presentational//en",
    "-//microsoft//dtd internet explorer 2.0 html strict//en",
    "-//microsoft//dtd internet explorer 2.0 html//en",
    "-//microsoft//dtd internet explorer 2.0 tables//en",
    "-//microsoft//dtd internet explorer 3.0 html strict//en",
    "-//microsoft//dtd internet explorer 3.0 html//en",
    "-//microsoft//dtd internet explorer 3.0 tables//en",
    "-//netscape comm. corp.//dtd html//en",
    "-//netscape comm. corp.//dtd strict html//en",
    "-//o'reilly and associates//dtd html 2.0//en",
    "-//o'reilly and associates//dtd html extended 1.0//en",
    "-//spyglass//dtd html 2.0 extended//en",
    "-//sq//dtd html 2.0 hotmetal + extensions//en",
    "-//sun microsystems corp.//dtd hotjava html//en",
    "-//sun microsystems corp.//dtd hotjava strict html//en",
    "-//w3c//dtd html 3 1995-03-24//en",
    "-//w3c//dtd html 3.2 draft//en",
    "-//w3c//dtd html 3.2 final//en",
    "-//w3c//dtd html 3.2//en",
    "-//w3c//dtd html 3.2s draft//en",
    "-//w3c//dtd html 4.0 frameset//en",
    "-//w3c//dtd html 4.0 transitional//en",
    "-//w3c//dtd html experimental 19960712//en",
    "-//w3c//dtd html experimental 970421//en",
    "-//w3c//dtd w3 html//en",
    "-//w3o//dtd w3 html 3.0//en",
    "-//w3o//dtd w3 html 3.0//en//",
    "-//w3o//dtd w3 html strict 3.0//en//",
    "-//webtechs//dtd mozilla html 2.0//en",
    "-//webtechs//dtd mozilla html//en",
    "-/w3c/dtd html 4.0 transitional/en",
    "html"
  ].freeze

  # Lower-cased public identifiers that select quirks mode only when the
  # DOCTYPE carries no system identifier.
  QUIRKS_PUBLIC_IDS_WITHOUT_SYSTEM_ID = [
    "-//w3c//dtd html 4.01 frameset//en",
    "-//w3c//dtd html 4.01 transitional//en"
  ].freeze

  # System identifier that selects quirks mode on its own.
  QUIRKS_SYSTEM_ID = "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"

  # EOF before any DOCTYPE: report it, then let the beforeHtml phase
  # handle the end of input.
  def process_eof
    parse_error("expected-doctype-but-got-eof")
    @parser.phase = @parser.phases[:beforeHtml]
    @parser.phase.process_eof
  end

  # Comments seen in this phase are attached to the document itself.
  def processComment(data)
    @tree.insert_comment(data, @tree.document)
  end

  # Inserts the DOCTYPE into the tree and switches to the beforeHtml phase.
  #
  # A parse error is reported unless the DOCTYPE is exactly "html" with no
  # public or system identifier. +correct+ is accepted for interface
  # compatibility and is not used here.
  def processDoctype(name, publicId, systemId, correct)
    if name.downcase != 'html' || publicId || systemId
      parse_error("unknown-doctype")
    end
    # XXX need to update DOCTYPE tokens
    @tree.insertDoctype(name, publicId, systemId)

    # The quirks tables are lower case, so the identifier has to be
    # lower-cased (not upper-cased) before it is looked up.
    publicId = publicId.to_s.downcase

    if name.downcase != 'html' || quirks_doctype?(publicId, systemId)
      # XXX quirks mode
    end

    @parser.phase = @parser.phases[:beforeHtml]
  end

  # Whitespace before the DOCTYPE is dropped.
  def processSpaceCharacters(data)
  end

  # Text before any DOCTYPE: report it and reprocess in the beforeHtml phase.
  def processCharacters(data)
    parse_error("expected-doctype-but-got-chars")
    @parser.phase = @parser.phases[:beforeHtml]
    @parser.phase.processCharacters(data)
  end

  # Start tag before any DOCTYPE: report it and reprocess in the beforeHtml phase.
  def processStartTag(name, attributes, self_closing=false)
    parse_error("expected-doctype-but-got-start-tag", {"name" => name})
    @parser.phase = @parser.phases[:beforeHtml]
    @parser.phase.processStartTag(name, attributes)
  end

  # End tag before any DOCTYPE: report it and reprocess in the beforeHtml phase.
  def processEndTag(name)
    parse_error("expected-doctype-but-got-end-tag", {"name" => name})
    @parser.phase = @parser.phases[:beforeHtml]
    @parser.phase.processEndTag(name)
  end

  private

  # True when the (already lower-cased) public identifier or the system
  # identifier appears in one of the quirks tables above.
  def quirks_doctype?(public_id, system_id)
    QUIRKS_PUBLIC_IDS.include?(public_id) ||
      (system_id.nil? && QUIRKS_PUBLIC_IDS_WITHOUT_SYSTEM_ID.include?(public_id)) ||
      system_id == QUIRKS_SYSTEM_ID
  end
end
133
+ end
@@ -0,0 +1,171 @@
1
+ module HTML5
2
# Base class for helper objects that implement each phase of processing.
#
# Handler methods should be in the following order (they can be omitted):
#
# * EOF
# * Comment
# * Doctype
# * SpaceCharacters
# * Characters
# * StartTag
#   - startTag* methods
# * EndTag
#   - endTag* methods
#
class Phase
  extend Forwardable
  def_delegators :@parser, :parse_error

  # Builds a tag-name => handler-method-name mapping.
  #
  # The following example call:
  #
  #   tag_handlers('startTag', 'html', %w( base link meta ), %w( li dt dd ) => 'ListItem')
  #
  # ...would return a hash equal to this:
  #
  #   { 'html' => 'startTagHtml',
  #     'base' => 'startTagBaseLinkMeta',
  #     'link' => 'startTagBaseLinkMeta',
  #     'meta' => 'startTagBaseLinkMeta',
  #     'li'   => 'startTagListItem',
  #     'dt'   => 'startTagListItem',
  #     'dd'   => 'startTagListItem' }
  #
  def self.tag_handlers(prefix, *tags)
    mapping = {}
    # A trailing Hash supplies explicit method-name suffixes.
    if tags.last.is_a?(Hash)
      tags.pop.each do |names, handler_method_suffix|
        handler_method = prefix + handler_method_suffix
        Array(names).each {|name| mapping[name] = handler_method }
      end
    end
    # Every other entry derives the suffix from the capitalized tag names.
    tags.each do |names|
      names = Array(names)
      handler_method = prefix + names.map {|name| name.capitalize }.join
      names.each {|name| mapping[name] = handler_method }
    end
    mapping
  end

  # Per-class table of start tag handlers; unknown tags map to 'startTagOther'.
  def self.start_tag_handlers
    @start_tag_handlers ||= Hash.new('startTagOther')
  end

  # Declare what start tags this Phase handles. Can be called more than once.
  #
  # Example usage:
  #
  #   handle_start 'html'
  #   # html start tags will be handled by a method named 'startTagHtml'
  #
  #   handle_start %w( base link meta )
  #   # base, link and meta start tags will be handled by a method named 'startTagBaseLinkMeta'
  #
  #   handle_start %w( li dt dd ) => 'ListItem'
  #   # li, dt, and dd start tags will be handled by a method named 'startTagListItem'
  #
  def self.handle_start(*tags)
    start_tag_handlers.update tag_handlers('startTag', *tags)
  end

  # Per-class table of end tag handlers; unknown tags map to 'endTagOther'.
  def self.end_tag_handlers
    @end_tag_handlers ||= Hash.new('endTagOther')
  end

  # Declare what end tags this Phase handles. Behaves like handle_start.
  #
  def self.handle_end(*tags)
    end_tag_handlers.update tag_handlers('endTag', *tags)
  end

  # parser receives parse_error calls; tree is the tree builder the
  # handlers operate on.
  def initialize(parser, tree)
    @parser, @tree = parser, tree
  end

  # Default EOF handling: close implied end tags, then report an error if
  # elements other than the expected root/body pair are still open.
  def process_eof
    @tree.generateImpliedEndTags

    if @tree.open_elements.length > 2
      parse_error("expected-closing-tag-but-got-eof")
    elsif @tree.open_elements.length == 2 and @tree.open_elements[1].name != 'body'
      # This happens for framesets or something?
      parse_error("expected-closing-tag-but-got-eof")
    elsif @parser.inner_html and @tree.open_elements.length > 1
      # XXX This is not what the specification says. Not sure what to do here.
      parse_error("eof-in-innerhtml")
    end
    # Betting ends.
  end

  def processComment(data)
    # For most phases the following is correct. Where it's not it will be
    # overridden.
    @tree.insert_comment(data, @tree.open_elements.last)
  end

  # A DOCTYPE anywhere but the initial phase is a parse error.
  def processDoctype(name, publicId, systemId, correct)
    parse_error("unexpected-doctype")
  end

  def processSpaceCharacters(data)
    @tree.insertText(data)
  end

  # Dispatches to the handler registered for +name+. Handlers taking exactly
  # two arguments are called without the self_closing flag.
  def processStartTag(name, attributes, self_closing=false)
    handler = self.class.start_tag_handlers[name]
    if method(handler).arity == 2
      send handler, name, attributes
    else
      send handler, name, attributes, self_closing
    end
  end

  # Merges the attributes of a repeated <html> start tag into the root
  # element, keeping values that are already set.
  def startTagHtml(name, attributes)
    if @parser.first_start_tag == false and name == 'html'
      parse_error("non-html-root")
    end
    # XXX Need a check here to see if the first start tag token emitted is
    # this token... If it's not, invoke parse_error.
    attributes.each do |attr, value|
      unless @tree.open_elements.first.attributes.has_key?(attr)
        @tree.open_elements.first.attributes[attr] = value
      end
    end
    @parser.first_start_tag = false
  end

  # Dispatches to the handler registered for the end tag +name+.
  def processEndTag(name)
    send self.class.end_tag_handlers[name], name
  end

  # Raises AssertionError when +value+ is false or nil. (raise, not throw:
  # throw is for catch/throw control flow and never raises the object it
  # is given.)
  def assert(value)
    raise AssertionError unless value
  end

  def in_scope?(*args)
    @tree.elementInScope(*args)
  end

  # Pops open elements until one named +name+ has been popped or, when no
  # name is given, until the block returns true for the popped element.
  # Returns the last element popped (nil if the stack was already empty).
  def remove_open_elements_until(name=nil)
    finished = false
    until finished || @tree.open_elements.length == 0
      element = @tree.open_elements.pop
      finished = name.nil? ? yield(element) : element.name == name
    end
    return element
  end

  # Returns the attribute pairs as an array with 'definitionurl' renamed to
  # 'definitionURL'.
  def adjust_mathml_attributes(attributes)
    attributes.collect do |a|
      if a.first =='definitionurl'
        ['definitionURL', a[1]]
      else
        a
      end
    end
  end

  # Currently returns the attributes unchanged.
  def adjust_foreign_attributes(attributes)
    attributes
  end
end
171
+ end
@@ -0,0 +1,735 @@
1
+ require 'stringio'
2
+ require 'html5/constants'
3
+
4
+ module HTML5
5
+
6
# Provides a unicode stream of characters to the HTMLTokenizer.
#
# This class takes care of character encoding and removing or replacing
# incorrect byte-sequences and also provides column and line tracking.
class HTMLInputStream

  attr_accessor :queue, :char_encoding, :errors

  # see /usr/lib/ruby/1.9.1/rexml/text.rb
  VALID_CHAR = [
    0x9, 0xA, 0xD,
    (0x20..0xD7FF),
    (0xE000..0xFFFD),
    (0x10000..0x10FFFF)
  ]
  if String.method_defined? :encode
    VALID_XML_CHARS = Regexp.new('^['+
      VALID_CHAR.map { |item|
        case item
        when Integer # Fixnum is a subclass on old Rubies and is gone in new ones
          [item].pack('U').force_encoding('utf-8')
        when Range
          [item.first, '-'.ord, item.last].pack('UUU').force_encoding('utf-8')
        end
      }.join +
      ']*$')
  else
    VALID_XML_CHARS = /^(
      | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
      | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
      | [\xE1-\xEC\xEE][\x80-\xBF]{2} # straight 3-byte
      | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
      | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
      | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
      | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
    )*$/nx;
  end

  # Byte order marks as byte values, paired with the encoding they announce.
  # Order matters: UTF-32 must be tried before UTF-16 because the UTF-32LE
  # mark starts with the UTF-16LE mark.
  BOM_SIGNATURES = [
    [[0xef, 0xbb, 0xbf],       'utf-8'],
    [[0xff, 0xfe, 0x00, 0x00], 'utf-32le'],
    [[0x00, 0x00, 0xfe, 0xff], 'utf-32be'],
    [[0xff, 0xfe],             'utf-16le'],
    [[0xfe, 0xff],             'utf-16be']
  ]

  # Initialises the HTMLInputStream.
  #
  # HTMLInputStream(source, [options]) -> Normalized stream from source
  # for use by the HTML5Lib.
  #
  # source is either an object responding to #read or a string holding the
  # document itself.
  #
  # Every option is stored in the instance variable of the same name. The
  # ones read by this class are:
  #
  # encoding   - If given, that encoding is used regardless of any BOM or
  #              later declaration (such as in a meta element)
  # parse_meta - Look for a <meta> element containing encoding information
  #              (default true)
  # chardet    - Fall back to the chardet gem when it is installed
  #              (default true)
  def initialize(source, options = {})
    @encoding = nil
    @parse_meta = true
    @chardet = true

    options.each {|name, value| instance_variable_set("@#{name}", value) }

    # partial Ruby 1.9 support
    if @encoding and source.respond_to? :force_encoding
      source.force_encoding(@encoding) rescue nil
    end

    # Raw Stream
    @raw_stream = open_stream(source)

    # Encoding Information
    #Number of bytes to use when looking for a meta element with
    #encoding information
    @NUM_BYTES_META = 512
    #Number of bytes to use when using detecting encoding using chardet
    @NUM_BYTES_CHARDET = 256
    #Number of bytes to use when reading content
    @NUM_BYTES_BUFFER = 1024

    #Encoding to use if no other information can be found
    @DEFAULT_ENCODING = 'windows-1252'

    #Detect encoding iff no explicit "transport level" encoding is supplied
    if @encoding.nil?
      @char_encoding = detect_encoding
    else
      @char_encoding = @encoding
    end

    # Read bytes from stream decoding them into Unicode
    @buffer = @raw_stream.read(@NUM_BYTES_BUFFER) || ''
    if @char_encoding == 'windows-1252'
      @win1252 = true
    elsif @char_encoding != 'utf-8'
      require 'iconv'
      begin
        @buffer << @raw_stream.read unless @raw_stream.eof?
        @buffer = Iconv.iconv('utf-8', @char_encoding, @buffer).first
      rescue
        # Conversion failed: fall back to treating the bytes as windows-1252.
        @win1252 = true
      end
    end

    @queue = []
    @errors = []

    # Reset position in the list to read from
    @tell = 0
    @line = @col = 0
    @line_lengths = []
  end

  # Produces a file-like object from source.
  #
  # Objects responding to #read are returned as is; anything else is
  # treated as a string and wrapped in a StringIO.
  def open_stream(source)
    if source.respond_to?(:read)
      source
    else
      StringIO.new(source)
    end
  end

  # Works out the encoding from, in order: a BOM, a <meta> declaration,
  # chardet (when installed), and finally the default encoding.
  def detect_encoding

    #First look for a BOM
    #This will also read past the BOM if present
    encoding = detect_bom

    #If there is no BOM need to look for meta elements with encoding
    #information
    if encoding.nil? && @parse_meta
      encoding = detect_encoding_meta
    end

    #Guess with chardet, if avaliable
    if encoding.nil? && @chardet
      begin
        require 'rubygems'
        require 'UniversalDetector' # gem install chardet
        buffers = []
        detector = UniversalDetector::Detector.instance
        detector.reset
        until @raw_stream.eof?
          buffer = @raw_stream.read(@NUM_BYTES_CHARDET)
          break if !buffer || buffer.empty?
          buffers << buffer
          detector.feed(buffer)
          break if detector.instance_eval {@done}
          detector.instance_eval {
            @_mLastChar = @_mLastChar.chr if Integer === @_mLastChar
          }
        end
        detector.close
        encoding = detector.result['encoding']
        seek(buffers*'', 0)
      rescue LoadError
        # chardet is optional
      end
    end

    # If all else fails use the default encoding
    if encoding.nil?
      encoding = @DEFAULT_ENCODING
    end

    #Substitute for equivalent encoding
    if 'iso-8859-1' == encoding.downcase
      encoding = 'windows-1252'
    end

    encoding
  end

  # Attempts to detect at BOM at the start of the stream. If
  # an encoding can be determined from the BOM return the name of the
  # encoding otherwise return nil
  def detect_bom
    # Read in up to 4 bytes from the current position
    string = @raw_stream.read(4)
    return nil unless string

    # Compare byte values rather than Strings: String equality also depends
    # on the strings' encodings, so a String-keyed lookup can miss a BOM
    # that was read as binary data.
    bytes = string.unpack('C*')
    signature, encoding = BOM_SIGNATURES.find {|sig, _| bytes[0, sig.length] == sig }

    # Set the read position past the BOM if one was found, otherwise
    # set it to the start of the stream
    seek(string, encoding ? signature.length : 0)

    return encoding
  end

  # Repositions the raw stream so that the next read starts at byte n of
  # buffer (data previously read from the start of the stream).
  #
  # Uses the stream's own #unget or #seek when available; otherwise the
  # stream is wrapped in a delegator that serves pushed-back data first.
  def seek(buffer, n)
    if @raw_stream.respond_to?(:unget)
      @raw_stream.unget(buffer[n..-1])
      return
    end

    if @raw_stream.respond_to?(:seek)
      begin
        @raw_stream.seek(n)
        return
      rescue Errno::ESPIPE
        # not seekable (e.g. a pipe): fall through to the push-back wrapper
      end
    end

    #TODO: huh?
    require 'delegate'
    @raw_stream = SimpleDelegator.new(@raw_stream)

    class << @raw_stream
      def read(chars=-1)
        if chars == -1 || chars > @data.length
          # Hand out all pushed-back data, then top up from the real stream.
          result = @data
          @data = ''
          return result if __getobj__.eof?
          return result + __getobj__.read if chars == -1
          return result + __getobj__.read(chars-result.length)
        elsif @data.empty?
          return __getobj__.read(chars)
        else
          # Serve the request from the pushed-back data, starting at its
          # first byte.
          result = @data[0...chars]
          @data = @data[chars..-1]
          return result
        end
      end

      def unget(data)
        if !@data || @data.empty?
          @data = data
        else
          @data += data
        end
      end
    end

    @raw_stream.unget(buffer[n .. -1])
  end

  # Report the encoding declared by the meta element
  def detect_encoding_meta
    buffer = @raw_stream.read(@NUM_BYTES_META)
    parser = EncodingParser.new(buffer)
    seek(buffer, 0)
    return parser.get_encoding
  end

  # Returns (line, col) of the current position in the stream.
  def position
    line, col = @line, @col
    # Characters still in the queue were already counted; step back over them.
    if @queue and @queue.last != :EOF
      @queue.reverse.each do |c|
        if c == "\n"
          line -= 1
          raise RuntimeError.new("col=#{col}") unless col == 0
          col = @line_lengths[line]
        else
          col -= 1
        end
      end
    end
    return [line + 1, col]
  end

  # Read one character from the stream or queue if available. Return
  # EOF when EOF is reached.
  def char
    return @queue.shift unless @queue.empty?

    if @tell + 3 > @buffer.length && !@raw_stream.eof?
      # read next block
      @buffer = @buffer[@tell..-1] + @raw_stream.read(@NUM_BYTES_BUFFER)
      @tell = 0
    end

    c = @buffer[@tell]
    @tell += 1

    case c

    when String
      # partial Ruby 1.9 support: indexing a String yields a String
      case c
      when "\0"
        @errors.push("null-character")
        c = "\uFFFD" # null characters are invalid
      when "\r"
        # normalize newlines
        @tell += 1 if @buffer[@tell] == "\n"
        c = "\n"
      when "\x80" .. "\x9F"
        c = ENTITIES_WINDOWS1252[c.ord-0x80].chr('utf-8')
      when "\xA0" .. "\xFF"
        if c.encoding == Encoding::ASCII_8BIT
          c = c.encode('utf-8','iso-8859-1')
        end
      end

      # update position in stream
      if c == "\x0a"
        @line_lengths << @col
        @line += 1
        @col = 0
      else
        @col += 1
      end

      c

    when 0x01..0x7F
      # Ruby 1.8: indexing a String yields a byte value
      if c == 0x0D
        # normalize newlines
        @tell += 1 if @buffer[@tell] == 0x0A
        c = 0x0A
      end

      # update position in stream
      if c == 0x0a
        @line_lengths << @col
        @line += 1
        @col = 0
      else
        @col += 1
      end

      c.chr

    when 0x80..0xBF
      if !@win1252
        [0xFFFD].pack('U') # invalid utf-8
      elsif c <= 0x9f
        [ENTITIES_WINDOWS1252[c-0x80]].pack('U')
      else
        "\xC2" + c.chr # convert to utf-8
      end

    when 0xC0..0xFF
      if instance_variable_defined?("@win1252") && @win1252
        "\xC3" + (c - 64).chr # convert to utf-8
      elsif @buffer[@tell - 1..@tell + 3] =~ VALID_XML_CHARS
        @tell += $1.length - 1
        $1
      else
        [0xFFFD].pack('U') # invalid utf-8
      end

    when 0x00
      @errors.push("null-character")
      [0xFFFD].pack('U') # null characters are invalid

    else
      :EOF
    end
  end

  # Returns a string of characters from the stream up to but not
  # including any character in characters or EOF. characters can be
  # any container that responds to include?. With opposite set to true the
  # test is inverted: reading continues while the character is included.
  def chars_until(characters, opposite=false)
    char_stack = [char]

    while char_stack.last != :EOF
      break unless (characters.include?(char_stack.last)) == opposite
      char_stack.push(char)
    end

    # Put the character stopped on back to the front of the queue
    # from where it came.
    c = char_stack.pop
    @queue.insert(0, c) unless c == :EOF
    return char_stack.join('')
  end

  # Pushes characters back onto the front of the queue so that they are
  # returned by the next calls to char. :EOF is ignored.
  def unget(characters)
    return if characters == :EOF
    if characters.respond_to? :to_a
      @queue.unshift(*characters.to_a)
    else
      characters.reverse.each_char {|c| @queue.unshift(c)}
    end
  end
end
414
+
415
# A String that also carries a cursor (+position+) and a handful of
# byte-scanning helpers. Reading the current byte once the cursor has
# reached the end of the string raises EOF.
class EncodingBytes < String

  attr_accessor :position

  # The cursor starts just before the first byte.
  def initialize(value)
    super(value)
    @position = -1
  end

  # Advances the cursor one step at a time, yielding the element found at
  # each new position. An EOF raised while iterating ends the walk quietly.
  def each
    until @position >= length
      @position += 1
      yield self[@position]
    end
  rescue EOF
  end

  # The byte under the cursor as a one-character string.
  def current_byte
    raise EOF unless @position < length
    self[@position].chr
  end

  # Moves the cursor forward past any run of the given characters.
  def skip(chars=SPACE_CHARACTERS)
    @position += 1 while chars.include?(current_byte)
  end

  # Tests whether +bytes+ occurs at the cursor (case-insensitively when
  # +lower+ is set, in which case +bytes+ must be lower case). On a match
  # the cursor moves to the byte after it; otherwise it stays put.
  def match_bytes(bytes, lower=false)
    window = self[position ... position+bytes.length]
    window.downcase! if lower
    matched = (window == bytes)
    @position += bytes.length if matched
    matched
  end

  # Finds the next occurrence of +bytes+ at or after the cursor and moves
  # the cursor to the last byte of that occurrence. Raises EOF when there
  # is none.
  def jump_to(bytes)
    offset = self[position .. -1].index(bytes)
    raise EOF unless offset
    @position += offset + bytes.length - 1
    true
  end

  # Moves the cursor forward until it rests on one of the bytes in
  # +byte_list+.
  def find_next(byte_list)
    @position += 1 until byte_list.include?(current_byte)
  end
end
477
+
478
# Mini parser for detecting character encoding from meta elements
class EncodingParser
  ASCII_PUNCTUATION = %r{[\x09-\x0D\x20-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E]}
  # a (hopefully) temporary hack to deal with the fact that ruby doesn't have a built in encodings
  # library. Keys are the lower-cased names with ASCII punctuation removed.
  ENCODINGS = ['euc_jp', 'utf-8', "iso8859-2", "iso-8859-1", "utf-16", "UTF-16LE", "UTF-16BE"].inject({}){|m, v| m[v.downcase.gsub(ASCII_PUNCTUATION, '')] = v; m}

  # data - the data to work on for encoding detection
  def initialize(data)
    @data = EncodingBytes.new(data.to_s)
    @encoding = nil
  end

  # Prefixes tried, in this order, at every position of the data, with the
  # handler to call for the first one that matches.
  @@method_dispatch = [
    ['<!--', :handle_comment],
    ['<meta', :handle_meta],
    ['</', :handle_possible_end_tag],
    ['<!', :handle_other],
    ['<?', :handle_other],
    ['<', :handle_possible_start_tag]
  ]

  # Scans the data for a <meta> element declaring an encoding and returns
  # the encoding name, or nil when none is found. UTF-16/UTF-32
  # declarations are reported as 'utf-8'.
  def get_encoding
    @data.each do |byte|
      keep_parsing = true
      @@method_dispatch.each do |(key, method)|
        if @data.match_bytes(key, true)
          keep_parsing = send(method)
          break
        end
      end
      break unless keep_parsing
    end

    unless @encoding.nil?
      @encoding = @encoding.strip
      if ["utf16", "utf16be", "utf16le", "utf32", "utf32be", "utf32le"].include?(@encoding.downcase.gsub(ASCII_PUNCTUATION, ''))
        @encoding = 'utf-8'
      end
    end

    return @encoding
  end

  # Skip over comments
  def handle_comment
    return @data.jump_to('-->')
  end

  # Looks through the attributes of a <meta> element for an encoding.
  # Returns false (stop parsing) once a known encoding has been found,
  # true otherwise.
  def handle_meta
    # if we have <meta not followed by a space so just keep going
    return true unless SPACE_CHARACTERS.include?(@data.current_byte)

    #We have a valid meta element we want to search for attributes
    while true
      #Try to find the next attribute after the current position
      attr = get_attribute

      return true if attr.nil?
      if attr[0] == 'charset'
        tentative_encoding = attr[1]
        codec = codec_name(tentative_encoding)
        if codec
          @encoding = codec
          return false
        end
      elsif attr[0] == 'content'
        content_parser = ContentAttrParser.new(EncodingBytes.new(attr[1]))
        tentative_encoding = content_parser.parse
        codec = codec_name(tentative_encoding)
        if codec
          @encoding = codec
          return false
        end
      end
    end
  end

  def handle_possible_start_tag
    return handle_possible_tag(false)
  end

  def handle_possible_end_tag
    @data.position += 1
    return handle_possible_tag(true)
  end

  # Skips over a tag that is not a <meta> element, including its
  # attributes. Always returns true (keep parsing).
  def handle_possible_tag(end_tag)
    unless ASCII_LETTERS.include?(@data.current_byte)
      #If the next byte is not an ascii letter either ignore this
      #fragment (possible start tag case) or treat it according to
      #handleOther
      if end_tag
        @data.position -= 1
        handle_other
      end
      return true
    end

    @data.find_next(SPACE_CHARACTERS + ['<', '>'])

    if @data.current_byte == '<'
      #return to the first step in the overall "two step" algorithm
      #reprocessing the < byte
      @data.position -= 1
    else
      #Read all attributes
      nil until get_attribute.nil?
    end
    return true
  end

  # Skip to the end of a markup declaration or processing instruction
  def handle_other
    return @data.jump_to('>')
  end

  # Return a name,value pair for the next attribute in the stream,
  # if one is found, or nil
  def get_attribute
    @data.skip(SPACE_CHARACTERS + ['/'])

    if @data.current_byte == '<'
      @data.position -= 1
      return nil
    elsif @data.current_byte == '>'
      return nil
    end

    attr_name = []
    attr_value = []
    space_found = false
    #Step 5 attribute name
    while true
      # "=" only ends the name once the name is non-empty; a leading "="
      # is part of the name. (An empty Array is truthy in Ruby, so the
      # emptiness has to be tested explicitly.)
      if @data.current_byte == '=' && !attr_name.empty?
        break
      elsif SPACE_CHARACTERS.include?(@data.current_byte)
        space_found = true
        break
      elsif ['/', '<', '>'].include?(@data.current_byte)
        return [attr_name.join(''), '']
      elsif ASCII_UPPERCASE.include?(@data.current_byte)
        attr_name.push(@data.current_byte.downcase)
      else
        attr_name.push(@data.current_byte)
      end
      #Step 6
      @data.position += 1
    end
    #Step 7
    if space_found
      @data.skip
      #Step 8
      unless @data.current_byte == '='
        @data.position -= 1
        return [attr_name.join(''), '']
      end
    end
    #XXX need to advance position in both spaces and value case
    #Step 9
    @data.position += 1
    #Step 10
    @data.skip
    #Step 11
    if ["'", '"'].include?(@data.current_byte)
      #11.1
      quote_char = @data.current_byte
      while true
        @data.position+=1
        #11.3
        if @data.current_byte == quote_char
          @data.position += 1
          return [attr_name.join(''), attr_value.join('')]
        #11.4
        elsif ASCII_UPPERCASE.include?(@data.current_byte)
          attr_value.push(@data.current_byte.downcase)
        #11.5
        else
          attr_value.push(@data.current_byte)
        end
      end
    elsif ['>', '<'].include?(@data.current_byte)
      return [attr_name.join(''), '']
    elsif ASCII_UPPERCASE.include?(@data.current_byte)
      attr_value.push(@data.current_byte.downcase)
    else
      attr_value.push(@data.current_byte)
    end
    # Unquoted value: read up to the next space, "<" or ">"
    while true
      @data.position += 1
      if (SPACE_CHARACTERS + ['>', '<']).include?(@data.current_byte)
        return [attr_name.join(''), attr_value.join('')]
      elsif ASCII_UPPERCASE.include?(@data.current_byte)
        attr_value.push(@data.current_byte.downcase)
      else
        attr_value.push(@data.current_byte)
      end
    end
  end

  # Maps a declared encoding name onto its entry in ENCODINGS, ignoring
  # case and ASCII punctuation. Returns nil for unknown names and for
  # anything that is not a String.
  def codec_name(encoding)
    return nil unless encoding.kind_of?(String)
    ENCODINGS[encoding.downcase.gsub(ASCII_PUNCTUATION, '')]
  end
end
687
+
688
# Pulls the charset parameter out of the value of a meta element's
# content attribute.
class ContentAttrParser
  # data is the attribute value, wrapped in an EncodingBytes.
  def initialize(data)
    @data = data
  end

  # Returns the text after "charset=" found past the first ";" -- quoted
  # or unquoted -- or nil when the value has no such parameter.
  def parse
    # Everything before the first ";" is skipped
    @data.position = 0
    @data.jump_to(';')
    step_over_spaces

    # Find the parameter name; it must be followed by "="
    @data.jump_to('charset')
    step_over_spaces
    return nil if @data.current_byte != '='
    step_over_spaces

    opener = @data.current_byte
    if opener == '"' || opener == "'"
      # Quoted: the value runs up to the matching quote mark
      @data.position += 1
      start = @data.position
      @data.jump_to(opener)
      @data[start ... @data.position]
    else
      # Unquoted: the value runs up to the next space or the end of data
      start = @data.position
      begin
        @data.find_next(SPACE_CHARACTERS)
        @data[start ... @data.position]
      rescue EOF
        @data[start .. -1]
      end
    end
  rescue EOF
    # Ran off the end of the data before a value was found
    nil
  end

  private

  # Moves one byte forward, then past any space characters.
  def step_over_spaces
    @data.position += 1
    @data.skip
  end
end
734
+
735
+ end