feedtools 0.2.26 → 0.2.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166) hide show
  1. data/CHANGELOG +232 -216
  2. data/db/migration.rb +2 -0
  3. data/db/schema.mysql.sql +2 -0
  4. data/db/schema.postgresql.sql +3 -1
  5. data/db/schema.sqlite.sql +3 -1
  6. data/lib/feed_tools.rb +37 -14
  7. data/lib/feed_tools/database_feed_cache.rb +13 -2
  8. data/lib/feed_tools/feed.rb +430 -104
  9. data/lib/feed_tools/feed_item.rb +533 -268
  10. data/lib/feed_tools/helpers/generic_helper.rb +1 -1
  11. data/lib/feed_tools/helpers/html_helper.rb +78 -116
  12. data/lib/feed_tools/helpers/retrieval_helper.rb +33 -3
  13. data/lib/feed_tools/helpers/uri_helper.rb +46 -54
  14. data/lib/feed_tools/monkey_patch.rb +27 -1
  15. data/lib/feed_tools/vendor/html5/History.txt +10 -0
  16. data/lib/feed_tools/vendor/html5/Manifest.txt +117 -0
  17. data/lib/feed_tools/vendor/html5/README +45 -0
  18. data/lib/feed_tools/vendor/html5/Rakefile.rb +33 -0
  19. data/lib/feed_tools/vendor/html5/bin/html5 +217 -0
  20. data/lib/feed_tools/vendor/html5/lib/core_ext/string.rb +17 -0
  21. data/lib/feed_tools/vendor/html5/lib/html5.rb +13 -0
  22. data/lib/feed_tools/vendor/html5/lib/html5/constants.rb +1046 -0
  23. data/lib/feed_tools/vendor/html5/lib/html5/filters/base.rb +10 -0
  24. data/lib/feed_tools/vendor/html5/lib/html5/filters/inject_meta_charset.rb +82 -0
  25. data/lib/feed_tools/vendor/html5/lib/html5/filters/iso639codes.rb +752 -0
  26. data/lib/feed_tools/vendor/html5/lib/html5/filters/optionaltags.rb +198 -0
  27. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc2046.rb +30 -0
  28. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc3987.rb +89 -0
  29. data/lib/feed_tools/vendor/html5/lib/html5/filters/sanitizer.rb +15 -0
  30. data/lib/feed_tools/vendor/html5/lib/html5/filters/validator.rb +830 -0
  31. data/lib/feed_tools/vendor/html5/lib/html5/filters/whitespace.rb +36 -0
  32. data/lib/feed_tools/vendor/html5/lib/html5/html5parser.rb +248 -0
  33. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_body_phase.rb +46 -0
  34. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  35. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_head_phase.rb +50 -0
  36. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/before_head_phase.rb +41 -0
  37. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_body_phase.rb +613 -0
  38. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_caption_phase.rb +69 -0
  39. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_cell_phase.rb +78 -0
  40. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  41. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  42. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_head_phase.rb +138 -0
  43. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_row_phase.rb +89 -0
  44. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_select_phase.rb +85 -0
  45. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_body_phase.rb +86 -0
  46. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_phase.rb +115 -0
  47. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/initial_phase.rb +133 -0
  48. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/phase.rb +154 -0
  49. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/root_element_phase.rb +41 -0
  50. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  51. data/lib/feed_tools/vendor/html5/lib/html5/inputstream.rb +648 -0
  52. data/lib/feed_tools/vendor/html5/lib/html5/liberalxmlparser.rb +158 -0
  53. data/lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb +188 -0
  54. data/lib/feed_tools/vendor/html5/lib/html5/serializer.rb +2 -0
  55. data/lib/feed_tools/vendor/html5/lib/html5/serializer/htmlserializer.rb +179 -0
  56. data/lib/feed_tools/vendor/html5/lib/html5/serializer/xhtmlserializer.rb +20 -0
  57. data/lib/feed_tools/vendor/html5/lib/html5/sniffer.rb +45 -0
  58. data/lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb +966 -0
  59. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders.rb +24 -0
  60. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/base.rb +334 -0
  61. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/hpricot.rb +231 -0
  62. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/rexml.rb +209 -0
  63. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/simpletree.rb +185 -0
  64. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers.rb +26 -0
  65. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/base.rb +162 -0
  66. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/hpricot.rb +48 -0
  67. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/rexml.rb +48 -0
  68. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/simpletree.rb +48 -0
  69. data/lib/feed_tools/vendor/html5/lib/html5/version.rb +3 -0
  70. data/lib/feed_tools/vendor/html5/testdata/encoding/chardet/test_big5.txt +51 -0
  71. data/lib/feed_tools/vendor/html5/testdata/encoding/test-yahoo-jp.dat +10 -0
  72. data/lib/feed_tools/vendor/html5/testdata/encoding/tests1.dat +394 -0
  73. data/lib/feed_tools/vendor/html5/testdata/encoding/tests2.dat +81 -0
  74. data/lib/feed_tools/vendor/html5/testdata/sanitizer/tests1.dat +416 -0
  75. data/lib/feed_tools/vendor/html5/testdata/serializer/core.test +104 -0
  76. data/lib/feed_tools/vendor/html5/testdata/serializer/injectmeta.test +65 -0
  77. data/lib/feed_tools/vendor/html5/testdata/serializer/optionaltags.test +900 -0
  78. data/lib/feed_tools/vendor/html5/testdata/serializer/options.test +60 -0
  79. data/lib/feed_tools/vendor/html5/testdata/serializer/whitespace.test +51 -0
  80. data/lib/feed_tools/vendor/html5/testdata/sites/google-results.htm +1 -0
  81. data/lib/feed_tools/vendor/html5/testdata/sites/python-ref-import.htm +1 -0
  82. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps-old.htm +1 -0
  83. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps.htm +34275 -0
  84. data/lib/feed_tools/vendor/html5/testdata/sniffer/htmlOrFeed.json +43 -0
  85. data/lib/feed_tools/vendor/html5/testdata/tokenizer/contentModelFlags.test +48 -0
  86. data/lib/feed_tools/vendor/html5/testdata/tokenizer/entities.test +2339 -0
  87. data/lib/feed_tools/vendor/html5/testdata/tokenizer/escapeFlag.test +21 -0
  88. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test1.test +172 -0
  89. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test2.test +129 -0
  90. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test3.test +367 -0
  91. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test4.test +198 -0
  92. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests1.dat +1950 -0
  93. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests2.dat +773 -0
  94. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests3.dat +270 -0
  95. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests4.dat +60 -0
  96. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests5.dat +175 -0
  97. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests6.dat +196 -0
  98. data/lib/feed_tools/vendor/html5/testdata/validator/attributes.test +1035 -0
  99. data/lib/feed_tools/vendor/html5/testdata/validator/base-href-attribute.test +787 -0
  100. data/lib/feed_tools/vendor/html5/testdata/validator/base-target-attribute.test +35 -0
  101. data/lib/feed_tools/vendor/html5/testdata/validator/blockquote-cite-attribute.test +7 -0
  102. data/lib/feed_tools/vendor/html5/testdata/validator/classattribute.test +152 -0
  103. data/lib/feed_tools/vendor/html5/testdata/validator/contenteditableattribute.test +59 -0
  104. data/lib/feed_tools/vendor/html5/testdata/validator/contextmenuattribute.test +115 -0
  105. data/lib/feed_tools/vendor/html5/testdata/validator/dirattribute.test +59 -0
  106. data/lib/feed_tools/vendor/html5/testdata/validator/draggableattribute.test +63 -0
  107. data/lib/feed_tools/vendor/html5/testdata/validator/html-xmlns-attribute.test +23 -0
  108. data/lib/feed_tools/vendor/html5/testdata/validator/idattribute.test +115 -0
  109. data/lib/feed_tools/vendor/html5/testdata/validator/inputattributes.test +2795 -0
  110. data/lib/feed_tools/vendor/html5/testdata/validator/irrelevantattribute.test +63 -0
  111. data/lib/feed_tools/vendor/html5/testdata/validator/langattribute.test +5579 -0
  112. data/lib/feed_tools/vendor/html5/testdata/validator/li-value-attribute.test +7 -0
  113. data/lib/feed_tools/vendor/html5/testdata/validator/link-href-attribute.test +7 -0
  114. data/lib/feed_tools/vendor/html5/testdata/validator/link-hreflang-attribute.test +7 -0
  115. data/lib/feed_tools/vendor/html5/testdata/validator/link-rel-attribute.test +271 -0
  116. data/lib/feed_tools/vendor/html5/testdata/validator/ol-start-attribute.test +7 -0
  117. data/lib/feed_tools/vendor/html5/testdata/validator/starttags.test +375 -0
  118. data/lib/feed_tools/vendor/html5/testdata/validator/style-scoped-attribute.test +7 -0
  119. data/lib/feed_tools/vendor/html5/testdata/validator/tabindexattribute.test +79 -0
  120. data/lib/feed_tools/vendor/html5/tests/preamble.rb +72 -0
  121. data/lib/feed_tools/vendor/html5/tests/test_encoding.rb +35 -0
  122. data/lib/feed_tools/vendor/html5/tests/test_lxp.rb +279 -0
  123. data/lib/feed_tools/vendor/html5/tests/test_parser.rb +68 -0
  124. data/lib/feed_tools/vendor/html5/tests/test_sanitizer.rb +142 -0
  125. data/lib/feed_tools/vendor/html5/tests/test_serializer.rb +68 -0
  126. data/lib/feed_tools/vendor/html5/tests/test_sniffer.rb +27 -0
  127. data/lib/feed_tools/vendor/html5/tests/test_stream.rb +62 -0
  128. data/lib/feed_tools/vendor/html5/tests/test_tokenizer.rb +94 -0
  129. data/lib/feed_tools/vendor/html5/tests/test_treewalkers.rb +135 -0
  130. data/lib/feed_tools/vendor/html5/tests/test_validator.rb +31 -0
  131. data/lib/feed_tools/vendor/html5/tests/tokenizer_test_parser.rb +63 -0
  132. data/lib/feed_tools/vendor/uri.rb +781 -0
  133. data/lib/feed_tools/version.rb +1 -1
  134. data/rakefile +27 -6
  135. data/test/unit/atom_test.rb +298 -210
  136. data/test/unit/helper_test.rb +7 -12
  137. data/test/unit/rdf_test.rb +51 -1
  138. data/test/unit/rss_test.rb +13 -3
  139. metadata +239 -116
  140. data/lib/feed_tools/vendor/htree.rb +0 -97
  141. data/lib/feed_tools/vendor/htree/container.rb +0 -10
  142. data/lib/feed_tools/vendor/htree/context.rb +0 -67
  143. data/lib/feed_tools/vendor/htree/display.rb +0 -27
  144. data/lib/feed_tools/vendor/htree/doc.rb +0 -149
  145. data/lib/feed_tools/vendor/htree/elem.rb +0 -262
  146. data/lib/feed_tools/vendor/htree/encoder.rb +0 -163
  147. data/lib/feed_tools/vendor/htree/equality.rb +0 -218
  148. data/lib/feed_tools/vendor/htree/extract_text.rb +0 -37
  149. data/lib/feed_tools/vendor/htree/fstr.rb +0 -33
  150. data/lib/feed_tools/vendor/htree/gencode.rb +0 -97
  151. data/lib/feed_tools/vendor/htree/htmlinfo.rb +0 -672
  152. data/lib/feed_tools/vendor/htree/inspect.rb +0 -108
  153. data/lib/feed_tools/vendor/htree/leaf.rb +0 -94
  154. data/lib/feed_tools/vendor/htree/loc.rb +0 -367
  155. data/lib/feed_tools/vendor/htree/modules.rb +0 -48
  156. data/lib/feed_tools/vendor/htree/name.rb +0 -124
  157. data/lib/feed_tools/vendor/htree/output.rb +0 -207
  158. data/lib/feed_tools/vendor/htree/parse.rb +0 -409
  159. data/lib/feed_tools/vendor/htree/raw_string.rb +0 -124
  160. data/lib/feed_tools/vendor/htree/regexp-util.rb +0 -15
  161. data/lib/feed_tools/vendor/htree/rexml.rb +0 -130
  162. data/lib/feed_tools/vendor/htree/scan.rb +0 -166
  163. data/lib/feed_tools/vendor/htree/tag.rb +0 -111
  164. data/lib/feed_tools/vendor/htree/template.rb +0 -909
  165. data/lib/feed_tools/vendor/htree/text.rb +0 -115
  166. data/lib/feed_tools/vendor/htree/traverse.rb +0 -465
@@ -0,0 +1,41 @@
1
+ require 'html5/html5parser/phase'
2
+
3
+ module HTML5
4
# Parser phase that runs while the document has no root element yet.
#
# Comments are attached to the document node and whitespace is dropped.
# Every other token (and end-of-file) first creates an <html> element via
# insert_html_element, which also switches the parser to the :beforeHead
# phase, and is then replayed in whatever phase the parser is now in.
class RootElementPhase < Phase

  # Comments seen before the root element become children of the document.
  def processComment(data)
    @tree.insert_comment(data, @tree.document)
  end

  # Whitespace before the root element is ignored.
  def processSpaceCharacters(data)
  end

  # End of input: create the root element, then let the new phase finish.
  def process_eof
    insert_html_element
    @parser.phase.process_eof
  end

  # Character data: create the root element, then replay the characters.
  def processCharacters(data)
    insert_html_element
    @parser.phase.processCharacters(data)
  end

  # Start tag: record on the parser that an explicit <html> tag was seen,
  # create the root element, then replay the tag in the new phase.
  def processStartTag(name, attributes)
    if name == 'html'
      @parser.first_start_tag = true
    end
    insert_html_element
    @parser.phase.processStartTag(name, attributes)
  end

  # End tag: create the root element, then replay the tag in the new phase.
  def processEndTag(name)
    insert_html_element
    @parser.phase.processEndTag(name)
  end

  # Creates an attribute-less <html> element, pushes it on the stack of open
  # elements, appends it to the document and moves the parser on to the
  # :beforeHead phase.
  def insert_html_element
    html = @tree.createElement('html', {})
    @tree.open_elements << html
    @tree.document.appendChild(html)
    @parser.phase = @parser.phases[:beforeHead]
  end
end
41
+ end
@@ -0,0 +1,35 @@
1
+ require 'html5/html5parser/phase'
2
+
3
+ module HTML5
4
# Parser phase entered once the document is expected to be finished.
#
# Comments are attached to the document and whitespace is passed to the
# phase that was active before this one. Any other token is a parse error:
# the parser switches back to its last phase and the token is replayed
# there.
class TrailingEndPhase < Phase

  # Nothing left to do at end of input.
  def process_eof
  end

  # Trailing comments become children of the document node.
  def processComment(data)
    @tree.insert_comment(data, @tree.document)
  end

  # Whitespace is handled by the previous phase without leaving this one.
  def processSpaceCharacters(data)
    @parser.last_phase.processSpaceCharacters(data)
  end

  # Unexpected character data after the end of the document.
  def processCharacters(data)
    parse_error("expected-eof-but-got-char")
    resume_last_phase.processCharacters(data)
  end

  # Unexpected start tag after the end of the document.
  def processStartTag(name, attributes)
    parse_error("expected-eof-but-got-start-tag", {"name" => name})
    resume_last_phase.processStartTag(name, attributes)
  end

  # Unexpected end tag after the end of the document.
  def processEndTag(name)
    parse_error("expected-eof-but-got-end-tag", {"name" => name})
    resume_last_phase.processEndTag(name)
  end

  private

  # Makes the parser's last phase current again and returns the phase the
  # parser now reports, so the offending token can be replayed on it.
  def resume_last_phase
    @parser.phase = @parser.last_phase
    @parser.phase
  end
end
35
+ end
@@ -0,0 +1,648 @@
1
+ require 'stringio'
2
+ require 'html5/constants'
3
+
4
+ module HTML5
5
+
6
+ # Provides a unicode stream of characters to the HTMLTokenizer.
7
+
8
+ # This class takes care of character encoding and removing or replacing
9
+ # incorrect byte-sequences and also provides column and line tracking.
10
+
11
class HTMLInputStream

  attr_accessor :queue, :char_encoding, :errors

  # Initialises the HTMLInputStream.
  #
  # HTMLInputStream(source, [options]) -> Normalized stream from source
  # for use by the HTML5Lib.
  #
  # source can be either an IO-like object (anything responding to #read)
  # or a string; strings are wrapped in a StringIO.
  #
  # Every key in options is stored as an instance variable of the same
  # name. The ones that have defaults here are:
  #
  # encoding   - a string naming the encoding. If it is a valid encoding it
  #              is used regardless of any BOM or later declaration (such
  #              as in a meta element).
  # parse_meta - look for a <meta> element containing encoding information
  # chardet    - fall back to the UniversalDetector gem when available
  def initialize(source, options = {})
    @encoding = nil
    @parse_meta = true
    @chardet = true
    # Always defined so that #char can test it directly.
    @win1252 = false

    options.each {|name, value| instance_variable_set("@#{name}", value) }

    # Raw Stream
    @raw_stream = open_stream(source)

    # Encoding Information
    # Number of bytes to use when looking for a meta element with
    # encoding information
    @NUM_BYTES_META = 512
    # Number of bytes fed to chardet at a time when detecting the encoding
    @NUM_BYTES_CHARDET = 256
    # Number of bytes to use when reading content
    @NUM_BYTES_BUFFER = 1024

    # Encoding to use if no other information can be found
    @DEFAULT_ENCODING = 'windows-1252'

    # Detect encoding iff no explicit "transport level" encoding is supplied
    if @encoding.nil? or not HTML5.is_valid_encoding(@encoding)
      @char_encoding = detect_encoding
    else
      @char_encoding = @encoding
    end

    # Read bytes from stream decoding them into Unicode
    @buffer = @raw_stream.read(@NUM_BYTES_BUFFER) || ''
    if @char_encoding == 'windows-1252'
      @win1252 = true
    elsif @char_encoding != 'utf-8'
      require 'iconv'
      begin
        # Iconv needs the whole document, so slurp the rest of the stream.
        @buffer << @raw_stream.read unless @raw_stream.eof?
        @buffer = Iconv.iconv('utf-8', @char_encoding, @buffer).first
      rescue
        # Conversion failed: fall back to treating the bytes as windows-1252.
        @win1252 = true
      end
    end

    @queue = []
    @errors = []

    # Reset position in the list to read from
    @tell = 0
    @line = @col = 0
    @line_lengths = []
  end

  # Produces an IO-like object from source.
  #
  # source is returned unchanged when it responds to #read; anything else
  # is treated as a string and wrapped in a StringIO.
  def open_stream(source)
    # Already an IO like object
    if source.respond_to?(:read)
      source
    else
      # Treat source as a string and wrap in StringIO
      StringIO.new(source)
    end
  end

  # Works out the character encoding of the stream. Tries, in order: a byte
  # order mark, a <meta> declaration (when @parse_meta), the optional
  # chardet gem (when @chardet) and finally @DEFAULT_ENCODING.
  # 'iso-8859-1' is reported as 'windows-1252'.
  def detect_encoding

    # First look for a BOM
    # This will also read past the BOM if present
    encoding = detect_bom

    # If there is no BOM need to look for meta elements with encoding
    # information
    if encoding.nil? and @parse_meta
      encoding = detect_encoding_meta
    end

    # Guess with chardet, if available
    if encoding.nil? and @chardet
      begin
        require 'rubygems'
        require 'UniversalDetector' # gem install chardet
        buffers = []
        detector = UniversalDetector::Detector.instance
        detector.reset
        until @raw_stream.eof?
          buffer = @raw_stream.read(@NUM_BYTES_CHARDET)
          break if !buffer or buffer.empty?
          buffers << buffer
          detector.feed(buffer)
          break if detector.instance_eval {@done}
          # Rewrites the detector's @_mLastChar as a one-character string
          # when it is an integer. Integer (rather than Fixnum) also works
          # on Rubies where Fixnum no longer exists.
          detector.instance_eval {
            @_mLastChar = @_mLastChar.chr if Integer === @_mLastChar
          }
        end
        detector.close
        encoding = detector.result['encoding']
        # Give everything that was consumed back to the stream.
        seek(buffers*'', 0)
      rescue LoadError
        # chardet is optional; carry on without it.
      end
    end

    # If all else fails use the default encoding
    if encoding.nil?
      encoding = @DEFAULT_ENCODING
    end

    # Substitute for equivalent encoding
    if 'iso-8859-1' == encoding.downcase
      encoding = 'windows-1252'
    end

    encoding
  end

  # Attempts to detect a BOM at the start of the stream. If
  # an encoding can be determined from the BOM return the name of the
  # encoding otherwise return nil
  def detect_bom
    bom_dict = {
      "\xef\xbb\xbf" => 'utf-8',
      "\xff\xfe" => 'utf-16le',
      "\xfe\xff" => 'utf-16be',
      "\xff\xfe\x00\x00" => 'utf-32le',
      "\x00\x00\xfe\xff" => 'utf-32be'
    }

    # Read in the first 4 bytes of the stream
    string = @raw_stream.read(4)
    return nil unless string

    # Try detecting the BOM using bytes from the string
    encoding = bom_dict[string[0...3]] # UTF-8
    bom_length = 3
    unless encoding
      # Need to detect UTF-32 before UTF-16
      encoding = bom_dict[string] # UTF-32
      bom_length = 4
      unless encoding
        encoding = bom_dict[string[0...2]] # UTF-16
        bom_length = 2
      end
    end

    # Set the read position past the BOM if one was found, otherwise
    # set it to the start of the stream
    seek(string, encoding ? bom_length : 0)

    return encoding
  end

  # Rewinds the raw stream so that the next read starts at byte n of
  # buffer, where buffer holds the bytes consumed from the stream so far.
  #
  # Three strategies are tried in order:
  # 1. the stream already supports #unget: push back buffer[n..-1];
  # 2. the stream supports #seek: seek to offset n;
  # 3. otherwise wrap the stream in a SimpleDelegator that gains a
  #    push-back buffer (#unget) and a #read that drains it first.
  def seek(buffer, n)
    if @raw_stream.respond_to?(:unget)
      @raw_stream.unget(buffer[n..-1])
      return
    end

    if @raw_stream.respond_to?(:seek)
      begin
        @raw_stream.seek(n)
        return
      rescue Errno::ESPIPE
        # Not actually seekable (e.g. a pipe); fall through to the wrapper.
      end
    end

    require 'delegate'
    @raw_stream = SimpleDelegator.new(@raw_stream)

    class << @raw_stream
      # Reads from the push-back buffer first, then from the wrapped stream.
      def read(chars=-1)
        if chars == -1 or chars > @data.length
          # Request needs more than is buffered: hand out all of it and
          # top up from the underlying stream.
          result = @data
          @data = ''
          return result if __getobj__.eof?
          return result + __getobj__.read if chars == -1
          return result + __getobj__.read(chars-result.length)
        elsif @data.empty?
          return __getobj__.read(chars)
        else
          # Serve the request entirely from the buffer. The slice starts at
          # 0 so that the first pushed-back byte is not lost.
          result = @data[0...chars]
          @data = @data[chars..-1]
          return result
        end
      end

      # Appends data to the push-back buffer.
      def unget(data)
        if !@data or @data.empty?
          @data = data
        else
          @data += data
        end
      end
    end

    @raw_stream.unget(buffer[n .. -1])
  end

  # Report the encoding declared by the meta element
  def detect_encoding_meta
    buffer = @raw_stream.read(@NUM_BYTES_META)
    parser = EncodingParser.new(buffer)
    # Only peeking: put the bytes back for the real parse.
    seek(buffer, 0)
    return parser.get_encoding
  end

  # Returns (line, col) of the current position in the stream.
  def position
    line, col = @line, @col
    # Characters sitting in the queue were already counted by #char, so
    # walk back over them.
    @queue.reverse.each do |c|
      if c == "\n"
        line -= 1
        raise RuntimeError.new("col=#{col}") unless col == 0
        col = @line_lengths[line]
      else
        col -= 1
      end
    end
    return [line + 1, col]
  end

  # Read one character from the stream or queue if available. Return
  # :EOF when EOF is reached.
  #
  # NOTE(review): the case below dispatches on integer ranges, so this
  # appears to expect @buffer[@tell] to be an integer byte value (Ruby 1.8
  # String#[] behaviour) - confirm before running on a newer Ruby.
  def char
    unless @queue.empty?
      return @queue.shift
    else
      if @tell + 3 > @buffer.length && !@raw_stream.eof?
        # read next block
        @buffer = @buffer[@tell..-1] + @raw_stream.read(@NUM_BYTES_BUFFER)
        @tell = 0
      end

      c = @buffer[@tell]
      @tell += 1

      case c
      when 0x01..0x7F
        if c == 0x0D
          # normalize newlines
          @tell += 1 if @buffer[@tell] == 0x0A
          c = 0x0A
        end

        # update position in stream
        if c == 0x0a
          @line_lengths << @col
          @line += 1
          @col = 0
        else
          @col += 1
        end

        c.chr

      when 0x80..0xBF
        if !@win1252
          [0xFFFD].pack('U') # invalid utf-8
        elsif c <= 0x9f
          [ENTITIES_WINDOWS1252[c-0x80]].pack('U')
        else
          "\xC2" + c.chr # convert to utf-8
        end

      when 0xC0..0xFF
        if @win1252
          "\xC3" + (c - 64).chr # convert to utf-8
        # from http://www.w3.org/International/questions/qa-forms-utf-8.en.php
        # The n flag keeps the match byte-oriented regardless of $KCODE or
        # the source encoding.
        elsif @buffer[@tell - 1..@tell + 3] =~ /^
            ( [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
            | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
            | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
            | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
            | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
            | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
            | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
            )/xn
          @tell += $1.length - 1
          $1
        else
          [0xFFFD].pack('U') # invalid utf-8
        end

      when 0x00
        @errors.push("null-character")
        [0xFFFD].pack('U') # null characters are invalid

      else
        :EOF
      end
    end
  end

  # Returns a string of characters from the stream up to but not
  # including any character in characters or EOF. characters can be
  # any container that responds to include?. With opposite set to true
  # the test is inverted: characters are consumed while they ARE in
  # characters.
  def chars_until(characters, opposite=false)
    char_stack = [char]

    while char_stack.last != :EOF
      break unless (characters.include?(char_stack.last)) == opposite
      char_stack.push(char)
    end

    # Put the character stopped on back to the front of the queue
    # from where it came.
    c = char_stack.pop
    @queue.insert(0, c) unless c == :EOF
    return char_stack.join('')
  end

  # Pushes characters back onto the front of the queue so that #char
  # returns them again. :EOF is ignored.
  def unget(characters)
    @queue.unshift(*characters.to_a) unless characters == :EOF
  end
end
344
+
345
+ # String-like object with an associated position and various extra methods
346
+ # If the position is ever greater than the string length then an exception is raised
347
# A String that also carries a cursor (position) into itself, with helpers
# for scanning forward. Reading at or beyond the end of the string through
# current_byte raises EOF.
class EncodingBytes < String

  attr_accessor :position

  # value    - the bytes to scan
  # position starts at -1 so that the first step of #each lands on byte 0.
  def initialize(value)
    super(value)
    @position = -1
  end

  # Advances the cursor one byte at a time, yielding the byte under it,
  # until the last byte has been yielded. The block may move the cursor
  # itself. An EOF raised by the block ends the iteration quietly.
  def each
    # Stop before stepping past the last byte; otherwise the block would
    # be called once more with self[length], which is nil.
    while @position + 1 < length
      @position += 1
      yield self[@position]
    end
  rescue EOF
  end

  # Returns the byte under the cursor as a one-character string.
  # Raises EOF when the cursor is at or past the end of the string.
  def current_byte
    raise EOF if @position >= length
    return self[@position].chr
  end

  # Skip past a list of characters
  def skip(chars=SPACE_CHARACTERS)
    while chars.include?(current_byte)
      @position += 1
    end
  end

  # Look for a sequence of bytes at the current position. If the bytes
  # are found return true and advance the position to the byte after the
  # match. Otherwise return false and leave the position alone.
  # With lower set, the data is downcased before comparing.
  def match_bytes(bytes, lower=false)
    data = self[position ... position+bytes.length]
    # The slice is nil when the cursor is beyond the end: nothing can match.
    return false if data.nil?
    data.downcase! if lower
    rv = (data == bytes)
    @position += bytes.length if rv == true
    return rv
  end

  # Look for the next sequence of bytes matching a given sequence. If
  # a match is found advance the position to the last byte of the match
  # and return true; otherwise raise EOF.
  def jump_to(bytes)
    rest = self[position .. -1]
    # The slice is nil when the cursor is beyond the end of the string.
    raise EOF if rest.nil?
    new_position = rest.index(bytes)
    if new_position
      @position += (new_position + bytes.length-1)
      return true
    else
      raise EOF
    end
  end

  # Move the pointer so it points to the next byte in a set of possible
  # bytes. Raises EOF (via current_byte) when none is found.
  def find_next(byte_list)
    until byte_list.include?(current_byte)
      @position += 1
    end
  end
end
407
+
408
+ # Mini parser for detecting character encoding from meta elements
409
# Mini parser that scans the start of a document for a <meta> element
# declaring the character encoding.
class EncodingParser

  # data - the bytes to work on for encoding detection
  def initialize(data)
    @data = EncodingBytes.new(data.to_s)
    @encoding = nil
  end

  # Prefixes that trigger a handler, tried in this order, so the longer
  # and more specific prefixes must come before the bare '<'.
  @@method_dispatch = [
    ['<!--', :handle_comment],
    ['<meta', :handle_meta],
    ['</', :handle_possible_end_tag],
    ['<!', :handle_other],
    ['<?', :handle_other],
    ['<', :handle_possible_start_tag]
  ]

  # Walks the data byte by byte, dispatching to a handler whenever one of
  # the known prefixes starts at the current position. A handler returns
  # false to stop the scan (an encoding was found). Returns the stripped
  # encoding name, or nil when none was found.
  def get_encoding
    @data.each do |byte|
      keep_parsing = true
      @@method_dispatch.each do |(key, method)|
        if @data.match_bytes(key, true)
          keep_parsing = send(method)
          break
        end
      end
      break unless keep_parsing
    end
    @encoding = @encoding.strip unless @encoding.nil?
    return @encoding
  end

  # Skip over comments
  def handle_comment
    return @data.jump_to('-->')
  end

  # Examines the attributes of a <meta> element. Returns false (stop
  # scanning) once a valid encoding has been stored in @encoding, true
  # otherwise.
  def handle_meta
    # if we have <meta not followed by a space so just keep going
    return true unless SPACE_CHARACTERS.include?(@data.current_byte)

    # We have a valid meta element we want to search for attributes
    while true
      # Try to find the next attribute after the current position
      attr = get_attribute

      return true if attr.nil?

      if attr[0] == 'charset'
        tentative_encoding = attr[1]
        if HTML5.is_valid_encoding(tentative_encoding)
          @encoding = tentative_encoding
          return false
        end
      elsif attr[0] == 'content'
        content_parser = ContentAttrParser.new(EncodingBytes.new(attr[1]))
        tentative_encoding = content_parser.parse
        if HTML5.is_valid_encoding(tentative_encoding)
          @encoding = tentative_encoding
          return false
        end
      end
    end
  end

  def handle_possible_start_tag
    return handle_possible_tag(false)
  end

  def handle_possible_end_tag
    @data.position += 1
    return handle_possible_tag(true)
  end

  # Skips over a tag that is not <meta>, consuming its attributes.
  # end_tag tells whether the tag was introduced by '</'. Always returns
  # true (keep scanning).
  def handle_possible_tag(end_tag)
    unless ASCII_LETTERS.include?(@data.current_byte)
      # If the next byte is not an ascii letter either ignore this
      # fragment (possible start tag case) or treat it according to
      # handle_other
      if end_tag
        @data.position -= 1
        handle_other
      end
      return true
    end

    @data.find_next(SPACE_CHARACTERS + ['<', '>'])

    if @data.current_byte == '<'
      # return to the first step in the overall "two step" algorithm
      # reprocessing the < byte
      @data.position -= 1
    else
      # Read all attributes; their values are not needed here
      until get_attribute.nil?
      end
    end
    return true
  end

  # Skip to the next '>'
  def handle_other
    return @data.jump_to('>')
  end

  # Return a name,value pair for the next attribute in the stream,
  # if one is found, or nil
  def get_attribute
    @data.skip(SPACE_CHARACTERS + ['/'])

    if @data.current_byte == '<'
      @data.position -= 1
      return nil
    elsif @data.current_byte == '>'
      return nil
    end

    attr_name = []
    attr_value = []
    space_found = false
    # Step 5 attribute name
    while true
      # An '=' only ends the name once the name has at least one
      # character. An Array is always truthy in Ruby, so emptiness has to
      # be tested explicitly.
      if @data.current_byte == '=' and !attr_name.empty?
        break
      elsif SPACE_CHARACTERS.include?(@data.current_byte)
        space_found = true
        break
      elsif ['/', '<', '>'].include?(@data.current_byte)
        return [attr_name.join(''), '']
      elsif ASCII_UPPERCASE.include?(@data.current_byte)
        attr_name.push(@data.current_byte.downcase)
      else
        attr_name.push(@data.current_byte)
      end
      # Step 6
      @data.position += 1
    end
    # Step 7
    if space_found
      @data.skip
      # Step 8
      unless @data.current_byte == '='
        @data.position -= 1
        return [attr_name.join(''), '']
      end
    end
    # XXX need to advance position in both spaces and value case
    # Step 9
    @data.position += 1
    # Step 10
    @data.skip
    # Step 11
    if ["'", '"'].include?(@data.current_byte)
      # 11.1
      quote_char = @data.current_byte
      while true
        @data.position+=1
        # 11.3
        if @data.current_byte == quote_char
          @data.position += 1
          return [attr_name.join(''), attr_value.join('')]
        # 11.4
        elsif ASCII_UPPERCASE.include?(@data.current_byte)
          attr_value.push(@data.current_byte.downcase)
        # 11.5
        else
          attr_value.push(@data.current_byte)
        end
      end
    elsif ['>', '<'].include?(@data.current_byte)
      return [attr_name.join(''), '']
    elsif ASCII_UPPERCASE.include?(@data.current_byte)
      attr_value.push(@data.current_byte.downcase)
    else
      attr_value.push(@data.current_byte)
    end
    # Unquoted value: runs until whitespace or a tag delimiter
    while true
      @data.position += 1
      if (SPACE_CHARACTERS + ['>', '<']).include?(@data.current_byte)
        return [attr_name.join(''), attr_value.join('')]
      elsif ASCII_UPPERCASE.include?(@data.current_byte)
        attr_value.push(@data.current_byte.downcase)
      else
        attr_value.push(@data.current_byte)
      end
    end
  end
end
595
+
596
# Extracts the charset parameter from the value of a meta element's
# content attribute, e.g. 'text/html; charset=utf-8'.
class ContentAttrParser
  # data - a cursor-carrying byte string (the visible caller passes an
  #        EncodingBytes) holding the attribute value
  def initialize(data)
    @data = data
  end

  # Returns the charset value found after the first ';', or nil when the
  # value has no 'charset=' parameter or the data ends too early.
  def parse
    # Skip to the first ";"
    @data.position = 0
    @data.jump_to(';')
    @data.position += 1
    @data.skip

    # Find the charset parameter name and step past it
    @data.jump_to('charset')
    @data.position += 1
    @data.skip

    # Without an "=" sign there is no value to report
    return nil unless @data.current_byte == '='

    @data.position += 1
    @data.skip

    if ['"', "'"].include?(@data.current_byte)
      # Quoted value: everything up to the matching quote mark
      closing_quote = @data.current_byte
      @data.position += 1
      value_start = @data.position
      @data.jump_to(closing_quote)
      @data[value_start ... @data.position]
    else
      # Unquoted value: everything up to the next space character
      value_start = @data.position
      begin
        @data.find_next(SPACE_CHARACTERS)
        @data[value_start ... @data.position]
      rescue EOF
        # No trailing space: the value is the whole remainder
        @data[value_start .. -1]
      end
    end
  rescue EOF
    # Ran off the end of the data before finding a complete parameter
    nil
  end
end
642
+
643
+ # Determine if a string is a supported encoding
644
# Returns false for nil and for anything that is not a String; otherwise
# reports whether the downcased, stripped name is listed in ENCODINGS.
def self.is_valid_encoding(encoding)
  return false unless encoding.kind_of?(String)
  ENCODINGS.include?(encoding.downcase.strip)
end
647
+
648
+ end