feedtools 0.2.26 → 0.2.27

Sign up to get free protection for your applications and to get access to all the features.
Files changed (166) hide show
  1. data/CHANGELOG +232 -216
  2. data/db/migration.rb +2 -0
  3. data/db/schema.mysql.sql +2 -0
  4. data/db/schema.postgresql.sql +3 -1
  5. data/db/schema.sqlite.sql +3 -1
  6. data/lib/feed_tools.rb +37 -14
  7. data/lib/feed_tools/database_feed_cache.rb +13 -2
  8. data/lib/feed_tools/feed.rb +430 -104
  9. data/lib/feed_tools/feed_item.rb +533 -268
  10. data/lib/feed_tools/helpers/generic_helper.rb +1 -1
  11. data/lib/feed_tools/helpers/html_helper.rb +78 -116
  12. data/lib/feed_tools/helpers/retrieval_helper.rb +33 -3
  13. data/lib/feed_tools/helpers/uri_helper.rb +46 -54
  14. data/lib/feed_tools/monkey_patch.rb +27 -1
  15. data/lib/feed_tools/vendor/html5/History.txt +10 -0
  16. data/lib/feed_tools/vendor/html5/Manifest.txt +117 -0
  17. data/lib/feed_tools/vendor/html5/README +45 -0
  18. data/lib/feed_tools/vendor/html5/Rakefile.rb +33 -0
  19. data/lib/feed_tools/vendor/html5/bin/html5 +217 -0
  20. data/lib/feed_tools/vendor/html5/lib/core_ext/string.rb +17 -0
  21. data/lib/feed_tools/vendor/html5/lib/html5.rb +13 -0
  22. data/lib/feed_tools/vendor/html5/lib/html5/constants.rb +1046 -0
  23. data/lib/feed_tools/vendor/html5/lib/html5/filters/base.rb +10 -0
  24. data/lib/feed_tools/vendor/html5/lib/html5/filters/inject_meta_charset.rb +82 -0
  25. data/lib/feed_tools/vendor/html5/lib/html5/filters/iso639codes.rb +752 -0
  26. data/lib/feed_tools/vendor/html5/lib/html5/filters/optionaltags.rb +198 -0
  27. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc2046.rb +30 -0
  28. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc3987.rb +89 -0
  29. data/lib/feed_tools/vendor/html5/lib/html5/filters/sanitizer.rb +15 -0
  30. data/lib/feed_tools/vendor/html5/lib/html5/filters/validator.rb +830 -0
  31. data/lib/feed_tools/vendor/html5/lib/html5/filters/whitespace.rb +36 -0
  32. data/lib/feed_tools/vendor/html5/lib/html5/html5parser.rb +248 -0
  33. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_body_phase.rb +46 -0
  34. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  35. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_head_phase.rb +50 -0
  36. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/before_head_phase.rb +41 -0
  37. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_body_phase.rb +613 -0
  38. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_caption_phase.rb +69 -0
  39. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_cell_phase.rb +78 -0
  40. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  41. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  42. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_head_phase.rb +138 -0
  43. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_row_phase.rb +89 -0
  44. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_select_phase.rb +85 -0
  45. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_body_phase.rb +86 -0
  46. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_phase.rb +115 -0
  47. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/initial_phase.rb +133 -0
  48. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/phase.rb +154 -0
  49. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/root_element_phase.rb +41 -0
  50. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  51. data/lib/feed_tools/vendor/html5/lib/html5/inputstream.rb +648 -0
  52. data/lib/feed_tools/vendor/html5/lib/html5/liberalxmlparser.rb +158 -0
  53. data/lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb +188 -0
  54. data/lib/feed_tools/vendor/html5/lib/html5/serializer.rb +2 -0
  55. data/lib/feed_tools/vendor/html5/lib/html5/serializer/htmlserializer.rb +179 -0
  56. data/lib/feed_tools/vendor/html5/lib/html5/serializer/xhtmlserializer.rb +20 -0
  57. data/lib/feed_tools/vendor/html5/lib/html5/sniffer.rb +45 -0
  58. data/lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb +966 -0
  59. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders.rb +24 -0
  60. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/base.rb +334 -0
  61. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/hpricot.rb +231 -0
  62. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/rexml.rb +209 -0
  63. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/simpletree.rb +185 -0
  64. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers.rb +26 -0
  65. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/base.rb +162 -0
  66. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/hpricot.rb +48 -0
  67. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/rexml.rb +48 -0
  68. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/simpletree.rb +48 -0
  69. data/lib/feed_tools/vendor/html5/lib/html5/version.rb +3 -0
  70. data/lib/feed_tools/vendor/html5/testdata/encoding/chardet/test_big5.txt +51 -0
  71. data/lib/feed_tools/vendor/html5/testdata/encoding/test-yahoo-jp.dat +10 -0
  72. data/lib/feed_tools/vendor/html5/testdata/encoding/tests1.dat +394 -0
  73. data/lib/feed_tools/vendor/html5/testdata/encoding/tests2.dat +81 -0
  74. data/lib/feed_tools/vendor/html5/testdata/sanitizer/tests1.dat +416 -0
  75. data/lib/feed_tools/vendor/html5/testdata/serializer/core.test +104 -0
  76. data/lib/feed_tools/vendor/html5/testdata/serializer/injectmeta.test +65 -0
  77. data/lib/feed_tools/vendor/html5/testdata/serializer/optionaltags.test +900 -0
  78. data/lib/feed_tools/vendor/html5/testdata/serializer/options.test +60 -0
  79. data/lib/feed_tools/vendor/html5/testdata/serializer/whitespace.test +51 -0
  80. data/lib/feed_tools/vendor/html5/testdata/sites/google-results.htm +1 -0
  81. data/lib/feed_tools/vendor/html5/testdata/sites/python-ref-import.htm +1 -0
  82. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps-old.htm +1 -0
  83. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps.htm +34275 -0
  84. data/lib/feed_tools/vendor/html5/testdata/sniffer/htmlOrFeed.json +43 -0
  85. data/lib/feed_tools/vendor/html5/testdata/tokenizer/contentModelFlags.test +48 -0
  86. data/lib/feed_tools/vendor/html5/testdata/tokenizer/entities.test +2339 -0
  87. data/lib/feed_tools/vendor/html5/testdata/tokenizer/escapeFlag.test +21 -0
  88. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test1.test +172 -0
  89. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test2.test +129 -0
  90. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test3.test +367 -0
  91. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test4.test +198 -0
  92. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests1.dat +1950 -0
  93. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests2.dat +773 -0
  94. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests3.dat +270 -0
  95. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests4.dat +60 -0
  96. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests5.dat +175 -0
  97. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests6.dat +196 -0
  98. data/lib/feed_tools/vendor/html5/testdata/validator/attributes.test +1035 -0
  99. data/lib/feed_tools/vendor/html5/testdata/validator/base-href-attribute.test +787 -0
  100. data/lib/feed_tools/vendor/html5/testdata/validator/base-target-attribute.test +35 -0
  101. data/lib/feed_tools/vendor/html5/testdata/validator/blockquote-cite-attribute.test +7 -0
  102. data/lib/feed_tools/vendor/html5/testdata/validator/classattribute.test +152 -0
  103. data/lib/feed_tools/vendor/html5/testdata/validator/contenteditableattribute.test +59 -0
  104. data/lib/feed_tools/vendor/html5/testdata/validator/contextmenuattribute.test +115 -0
  105. data/lib/feed_tools/vendor/html5/testdata/validator/dirattribute.test +59 -0
  106. data/lib/feed_tools/vendor/html5/testdata/validator/draggableattribute.test +63 -0
  107. data/lib/feed_tools/vendor/html5/testdata/validator/html-xmlns-attribute.test +23 -0
  108. data/lib/feed_tools/vendor/html5/testdata/validator/idattribute.test +115 -0
  109. data/lib/feed_tools/vendor/html5/testdata/validator/inputattributes.test +2795 -0
  110. data/lib/feed_tools/vendor/html5/testdata/validator/irrelevantattribute.test +63 -0
  111. data/lib/feed_tools/vendor/html5/testdata/validator/langattribute.test +5579 -0
  112. data/lib/feed_tools/vendor/html5/testdata/validator/li-value-attribute.test +7 -0
  113. data/lib/feed_tools/vendor/html5/testdata/validator/link-href-attribute.test +7 -0
  114. data/lib/feed_tools/vendor/html5/testdata/validator/link-hreflang-attribute.test +7 -0
  115. data/lib/feed_tools/vendor/html5/testdata/validator/link-rel-attribute.test +271 -0
  116. data/lib/feed_tools/vendor/html5/testdata/validator/ol-start-attribute.test +7 -0
  117. data/lib/feed_tools/vendor/html5/testdata/validator/starttags.test +375 -0
  118. data/lib/feed_tools/vendor/html5/testdata/validator/style-scoped-attribute.test +7 -0
  119. data/lib/feed_tools/vendor/html5/testdata/validator/tabindexattribute.test +79 -0
  120. data/lib/feed_tools/vendor/html5/tests/preamble.rb +72 -0
  121. data/lib/feed_tools/vendor/html5/tests/test_encoding.rb +35 -0
  122. data/lib/feed_tools/vendor/html5/tests/test_lxp.rb +279 -0
  123. data/lib/feed_tools/vendor/html5/tests/test_parser.rb +68 -0
  124. data/lib/feed_tools/vendor/html5/tests/test_sanitizer.rb +142 -0
  125. data/lib/feed_tools/vendor/html5/tests/test_serializer.rb +68 -0
  126. data/lib/feed_tools/vendor/html5/tests/test_sniffer.rb +27 -0
  127. data/lib/feed_tools/vendor/html5/tests/test_stream.rb +62 -0
  128. data/lib/feed_tools/vendor/html5/tests/test_tokenizer.rb +94 -0
  129. data/lib/feed_tools/vendor/html5/tests/test_treewalkers.rb +135 -0
  130. data/lib/feed_tools/vendor/html5/tests/test_validator.rb +31 -0
  131. data/lib/feed_tools/vendor/html5/tests/tokenizer_test_parser.rb +63 -0
  132. data/lib/feed_tools/vendor/uri.rb +781 -0
  133. data/lib/feed_tools/version.rb +1 -1
  134. data/rakefile +27 -6
  135. data/test/unit/atom_test.rb +298 -210
  136. data/test/unit/helper_test.rb +7 -12
  137. data/test/unit/rdf_test.rb +51 -1
  138. data/test/unit/rss_test.rb +13 -3
  139. metadata +239 -116
  140. data/lib/feed_tools/vendor/htree.rb +0 -97
  141. data/lib/feed_tools/vendor/htree/container.rb +0 -10
  142. data/lib/feed_tools/vendor/htree/context.rb +0 -67
  143. data/lib/feed_tools/vendor/htree/display.rb +0 -27
  144. data/lib/feed_tools/vendor/htree/doc.rb +0 -149
  145. data/lib/feed_tools/vendor/htree/elem.rb +0 -262
  146. data/lib/feed_tools/vendor/htree/encoder.rb +0 -163
  147. data/lib/feed_tools/vendor/htree/equality.rb +0 -218
  148. data/lib/feed_tools/vendor/htree/extract_text.rb +0 -37
  149. data/lib/feed_tools/vendor/htree/fstr.rb +0 -33
  150. data/lib/feed_tools/vendor/htree/gencode.rb +0 -97
  151. data/lib/feed_tools/vendor/htree/htmlinfo.rb +0 -672
  152. data/lib/feed_tools/vendor/htree/inspect.rb +0 -108
  153. data/lib/feed_tools/vendor/htree/leaf.rb +0 -94
  154. data/lib/feed_tools/vendor/htree/loc.rb +0 -367
  155. data/lib/feed_tools/vendor/htree/modules.rb +0 -48
  156. data/lib/feed_tools/vendor/htree/name.rb +0 -124
  157. data/lib/feed_tools/vendor/htree/output.rb +0 -207
  158. data/lib/feed_tools/vendor/htree/parse.rb +0 -409
  159. data/lib/feed_tools/vendor/htree/raw_string.rb +0 -124
  160. data/lib/feed_tools/vendor/htree/regexp-util.rb +0 -15
  161. data/lib/feed_tools/vendor/htree/rexml.rb +0 -130
  162. data/lib/feed_tools/vendor/htree/scan.rb +0 -166
  163. data/lib/feed_tools/vendor/htree/tag.rb +0 -111
  164. data/lib/feed_tools/vendor/htree/template.rb +0 -909
  165. data/lib/feed_tools/vendor/htree/text.rb +0 -115
  166. data/lib/feed_tools/vendor/htree/traverse.rb +0 -465
@@ -0,0 +1,41 @@
1
+ require 'html5/html5parser/phase'
2
+
3
+ module HTML5
4
# Phase in effect while the document has no root element yet.
#
# Comments are attached to the document and whitespace is dropped. Every
# other token first creates an <html> element, switches the parser to the
# :beforeHead phase, and is then handed to that phase.
class RootElementPhase < Phase

  def process_eof
    insert_html_element
    @parser.phase.process_eof
  end

  def processComment(data)
    @tree.insert_comment(data, @tree.document)
  end

  # Whitespace in this phase is ignored.
  def processSpaceCharacters(data)
  end

  def processCharacters(data)
    insert_html_element
    @parser.phase.processCharacters(data)
  end

  def processStartTag(name, attributes)
    # Flag an explicit <html> start tag on the parser before re-dispatching.
    if name == 'html'
      @parser.first_start_tag = true
    end
    insert_html_element
    @parser.phase.processStartTag(name, attributes)
  end

  def processEndTag(name)
    insert_html_element
    @parser.phase.processEndTag(name)
  end

  # Creates the <html> element, pushes it on the open-element stack,
  # appends it to the document and moves the parser to :beforeHead.
  def insert_html_element
    html = @tree.createElement('html', {})
    @tree.open_elements << html
    @tree.document.appendChild(html)
    @parser.phase = @parser.phases[:beforeHead]
  end
end
41
+ end
@@ -0,0 +1,35 @@
1
+ require 'html5/html5parser/phase'
2
+
3
+ module HTML5
4
# Phase in which only end-of-file is expected.
#
# Comments are attached to the document and whitespace is passed to the
# parser's last phase. Characters and tags are reported as parse errors,
# after which the parser goes back to its last phase and the token is
# processed there.
class TrailingEndPhase < Phase

  # Reaching EOF here is the expected outcome; nothing to do.
  def process_eof
  end

  def processComment(data)
    @tree.insert_comment(data, @tree.document)
  end

  # Whitespace does not change the current phase.
  def processSpaceCharacters(data)
    @parser.last_phase.processSpaceCharacters(data)
  end

  def processCharacters(data)
    parse_error("expected-eof-but-got-char")
    @parser.phase = @parser.last_phase
    @parser.phase.processCharacters(data)
  end

  def processStartTag(name, attributes)
    details = {"name" => name}
    parse_error("expected-eof-but-got-start-tag", details)
    @parser.phase = @parser.last_phase
    @parser.phase.processStartTag(name, attributes)
  end

  def processEndTag(name)
    details = {"name" => name}
    parse_error("expected-eof-but-got-end-tag", details)
    @parser.phase = @parser.last_phase
    @parser.phase.processEndTag(name)
  end
end
35
+ end
@@ -0,0 +1,648 @@
1
+ require 'stringio'
2
+ require 'html5/constants'
3
+
4
+ module HTML5
5
+
6
+ # Provides a unicode stream of characters to the HTMLTokenizer.
7
+
8
+ # This class takes care of character encoding and removing or replacing
9
+ # incorrect byte-sequences and also provides column and line tracking.
10
+
11
+ class HTMLInputStream
12
+
13
+ attr_accessor :queue, :char_encoding, :errors
14
+
15
# Sets up a character stream over +source+.
#
# source  - an object responding to #read, or a String (see open_stream).
# options - Hash whose entries are copied into instance variables of the
#           same name. The values assigned before that copy are
#           @encoding (nil), @parse_meta (true) and @chardet (true).
#
# When @encoding is set and HTML5.is_valid_encoding accepts it, it is used
# as the character encoding; otherwise detect_encoding chooses one.
def initialize(source, options = {})
  @encoding = nil
  @parse_meta = true
  @chardet = true

  options.each do |key, setting|
    instance_variable_set("@#{key}", setting)
  end

  # Raw Stream
  @raw_stream = open_stream(source)

  # Byte counts used when looking for a <meta> element, for each chunk
  # fed to chardet, and for each read of content.
  @NUM_BYTES_META = 512
  @NUM_BYTES_CHARDET = 256
  @NUM_BYTES_BUFFER = 1024

  # Encoding to use if no other information can be found
  @DEFAULT_ENCODING = 'windows-1252'

  # Detect the encoding unless a valid explicit one was supplied.
  if !@encoding.nil? and HTML5.is_valid_encoding(@encoding)
    @char_encoding = @encoding
  else
    @char_encoding = detect_encoding
  end

  # Prime the buffer with the first block of the stream.
  @buffer = @raw_stream.read(@NUM_BYTES_BUFFER) || ''
  case @char_encoding
  when 'windows-1252'
    @win1252 = true
  when 'utf-8'
    # no conversion step for utf-8
  else
    require 'iconv'
    begin
      # Slurp the remainder so the whole input is converted in one call.
      @buffer << @raw_stream.read unless @raw_stream.eof?
      @buffer = Iconv.iconv('utf-8', @char_encoding, @buffer).first
    rescue
      # Conversion failed: treat the bytes as windows-1252 instead.
      @win1252 = true
    end
  end

  @queue = []
  @errors = []

  # Read offset into @buffer and line/column bookkeeping
  @tell = 0
  @line = @col = 0
  @line_lengths = []
end
80
+
81
# Returns a readable stream for +source+.
#
# An object that responds to #read is returned unchanged; anything else is
# treated as a string and wrapped in a StringIO.
def open_stream(source)
  return source if source.respond_to?(:read)

  StringIO.new(source)
end
93
+
94
# Works out the character encoding of the raw stream.
#
# Sources are tried in order: a byte-order mark, a <meta> declaration
# (when @parse_meta is set), the chardet gem (when @chardet is set and the
# gem can be loaded), and finally @DEFAULT_ENCODING. An 'iso-8859-1'
# result is replaced by 'windows-1252'.
def detect_encoding

  # First look for a BOM; this also reads past the BOM if present
  encoding = detect_bom

  # Without a BOM, look for meta elements with encoding information
  if encoding.nil? and @parse_meta
    encoding = detect_encoding_meta
  end

  # Guess with chardet, if available
  if encoding.nil? and @chardet
    begin
      require 'rubygems'
      require 'UniversalDetector' # gem install chardet
      chunks = []
      detector = UniversalDetector::Detector.instance
      detector.reset
      until @raw_stream.eof?
        chunk = @raw_stream.read(@NUM_BYTES_CHARDET)
        break if chunk.nil? || chunk.empty?
        chunks << chunk
        detector.feed(chunk)
        break if detector.instance_eval {@done}
        # Patches the detector's private state between feeds.
        # NOTE(review): presumably works around the gem storing a byte
        # value where it later expects a one-character string - confirm.
        detector.instance_eval {
          @_mLastChar = @_mLastChar.chr if Fixnum === @_mLastChar
        }
      end
      detector.close
      encoding = detector.result['encoding']
      # Put back everything that was consumed for detection.
      seek(chunks.join(''), 0)
    rescue LoadError
    end
  end

  # If all else fails use the default encoding
  encoding = @DEFAULT_ENCODING if encoding.nil?

  # Substitute for equivalent encoding
  encoding = 'windows-1252' if encoding.downcase == 'iso-8859-1'

  encoding
end
143
+
144
# Attempts to detect a BOM at the start of the stream. Returns the name of
# the encoding the BOM indicates, or nil when no BOM is recognised (or the
# stream yields no data).
#
# The stream is left positioned just past the BOM when one is found and
# at its start otherwise.
def detect_bom
  bom_dict = {
    "\xef\xbb\xbf" => 'utf-8',
    "\xff\xfe" => 'utf-16le',
    "\xfe\xff" => 'utf-16be',
    "\xff\xfe\x00\x00" => 'utf-32le',
    "\x00\x00\xfe\xff" => 'utf-32be'
  }

  # Read in up to 4 bytes
  head = @raw_stream.read(4)
  return nil unless head

  # Candidate prefixes with the number of bytes to skip on a match.
  # UTF-8 is tried first, and UTF-32 before UTF-16 because the UTF-32LE
  # BOM begins with the UTF-16LE one.
  probes = [
    [head[0...3], 3], # UTF-8
    [head, 4],        # UTF-32
    [head[0...2], 2]  # UTF-16
  ]

  encoding = nil
  bom_length = 0
  probes.each do |prefix, length|
    encoding = bom_dict[prefix]
    bom_length = length
    break if encoding
  end

  # Set the read position past the BOM if one was found, otherwise
  # set it to the start of the stream
  seek(head, encoding ? bom_length : 0)

  return encoding
end
179
+
180
# Repositions the raw stream so that the next read starts at offset +n+ of
# +buffer+, where +buffer+ holds the bytes most recently read from it.
#
# buffer - String of bytes already consumed from the stream.
# n      - offset into +buffer+ at which reading should resume.
#
# Three strategies are tried in order:
# 1. the stream already supports #unget: push back buffer[n..-1];
# 2. the stream supports #seek: call seek(n), falling through to the next
#    strategy when that raises Errno::ESPIPE;
# 3. otherwise wrap the stream in a SimpleDelegator that keeps pushed-back
#    bytes in @data and serves them before reading from the wrapped stream.
def seek(buffer, n)
  if @raw_stream.respond_to?(:unget)
    @raw_stream.unget(buffer[n..-1])
    return
  end

  if @raw_stream.respond_to?(:seek)
    begin
      @raw_stream.seek(n)
      return
    rescue Errno::ESPIPE
      # not seekable after all; use the push-back wrapper below
    end
  end

  require 'delegate'
  @raw_stream = SimpleDelegator.new(@raw_stream)

  class << @raw_stream
    # Reads +chars+ bytes (everything when -1), draining pushed-back data
    # before touching the wrapped stream.
    def read(chars=-1)
      if chars == -1 or chars > @data.length
        result = @data
        @data = ''
        return result if __getobj__.eof?
        return result + __getobj__.read if chars == -1
        return result + __getobj__.read(chars-result.length)
      elsif @data.empty?
        return __getobj__.read(chars)
      else
        # Serve the request from the front of the pushed-back data. The
        # slice has to start at 0: starting at 1 dropped the first byte
        # and returned one byte fewer than requested.
        result = @data[0...chars]
        @data = @data[chars..-1]
        return result
      end
    end

    # Appends +data+ to the bytes waiting to be re-read.
    def unget(data)
      if !@data or @data.empty?
        @data = data
      else
        @data += data
      end
    end

    # The stream is only exhausted once the pushed-back bytes are gone
    # too; without this override callers that test eof? before reading
    # would skip pending data.
    def eof?
      (@data.nil? or @data.empty?) and __getobj__.eof?
    end
  end

  @raw_stream.unget(buffer[n .. -1])
end
226
+
227
# Report the encoding declared by a meta element, or nil if none is found.
#
# Reads up to @NUM_BYTES_META bytes, hands them to EncodingParser and then
# rewinds the stream by the amount read.
def detect_encoding_meta
  # read returns nil on an exhausted stream; substitute an empty string so
  # that seek can still slice the buffer when it has to push bytes back.
  buffer = @raw_stream.read(@NUM_BYTES_META) || ''
  parser = EncodingParser.new(buffer)
  seek(buffer, 0)
  return parser.get_encoding
end
234
+
235
# Returns [line, col] for the current position in the stream.
#
# The counters are first wound back over every character still waiting in
# the unget queue. The returned line is the internal counter plus one.
# Raises RuntimeError when a queued newline is met while the column being
# unwound is not 0.
def position
  line = @line
  col = @col
  @queue.reverse_each do |queued|
    if queued != "\n"
      col -= 1
    else
      line -= 1
      raise RuntimeError.new("col=#{col}") unless col == 0
      col = @line_lengths[line]
    end
  end
  [line + 1, col]
end
249
+
250
# Read one character from the unget queue if it is non-empty, otherwise
# from the stream. Returns :EOF when no byte is available.
#
# ASCII bytes are returned as one-character strings with CR and CRLF
# folded to LF, and update the line/column counters. A NUL byte records
# "null-character" in @errors and yields U+FFFD. Bytes above 0x7F are
# either mapped from windows-1252 to UTF-8 (when @win1252 is set) or
# validated as a UTF-8 sequence, with U+FFFD returned for invalid input.
#
# NOTE(review): the dispatch below compares @buffer[@tell] against Integer
# ranges, so it assumes String#[] with an Integer index returns a byte
# value (Ruby 1.8 semantics) - confirm before running on newer Rubies.
def char
  return @queue.shift unless @queue.empty?

  if @tell + 3 > @buffer.length && !@raw_stream.eof?
    # read next block, keeping the unread tail of the current one
    @buffer = @buffer[@tell..-1] + @raw_stream.read(@NUM_BYTES_BUFFER)
    @tell = 0
  end

  c = @buffer[@tell]
  @tell += 1

  # @win1252 is only assigned on some paths through initialize. defined?
  # tests for it without depending on whether instance_variables returns
  # strings or symbols, and keeps both non-ASCII branches consistent.
  win1252 = defined?(@win1252) && @win1252

  case c
  when 0x01..0x7F
    if c == 0x0D
      # normalize newlines
      @tell += 1 if @buffer[@tell] == 0x0A
      c = 0x0A
    end

    # update position in stream
    if c == 0x0a
      @line_lengths << @col
      @line += 1
      @col = 0
    else
      @col += 1
    end

    c.chr

  when 0x80..0xBF
    if !win1252
      [0xFFFD].pack('U') # invalid utf-8
    elsif c <= 0x9f
      [ENTITIES_WINDOWS1252[c-0x80]].pack('U')
    else
      "\xC2" + c.chr # convert to utf-8
    end

  when 0xC0..0xFF
    if win1252
      "\xC3" + (c - 64).chr # convert to utf-8
    # from http://www.w3.org/International/questions/qa-forms-utf-8.en.php
    elsif @buffer[@tell - 1..@tell + 3] =~ /^
          ( [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
          | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
          | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
          | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
          | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
          | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
          | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
          )/x
      # the lead byte is already consumed; skip the continuation bytes
      @tell += $1.length - 1
      $1
    else
      [0xFFFD].pack('U') # invalid utf-8
    end

  when 0x00
    @errors.push("null-character")
    [0xFFFD].pack('U') # null characters are invalid

  else
    :EOF
  end
end
321
+
322
# Returns the characters read from the stream up to, but not including,
# the first one that is a member of +characters+, or up to EOF.
# +characters+ may be any object that responds to include?.
#
# With +opposite+ set to true the test is inverted: reading continues
# while the characters ARE members of +characters+.
def chars_until(characters, opposite=false)
  collected = [char]

  until collected.last == :EOF
    break if characters.include?(collected.last) != opposite
    collected << char
  end

  # The last item is the character reading stopped on (or :EOF); a real
  # character goes back to the front of the queue it came from.
  stopper = collected.pop
  @queue.unshift(stopper) unless stopper == :EOF
  return collected.join('')
end
339
+
340
# Pushes +characters+ back onto the front of the queue so that char
# returns them again. :EOF is ignored.
#
# characters - a single character String, or a collection of them.
def unget(characters)
  return if characters == :EOF

  # Not every Ruby version defines String#to_a; wrap anything that cannot
  # convert itself so that a plain string is queued as one entry.
  pushed = characters.respond_to?(:to_a) ? characters.to_a : [characters]
  @queue.unshift(*pushed)
end
343
+ end
344
+
345
# String with an associated read position and helper methods for scanning.
# Reading the current byte at or beyond the end of the string raises EOF.
class EncodingBytes < String

  attr_accessor :position

  # The position starts at -1, so the first step of #each lands on index 0.
  def initialize(value)
    super(value)
    @position = -1
  end

  # Advances the position one step at a time, yielding the element at each
  # new position. An EOF raised while iterating ends the loop quietly.
  def each
    until @position >= length
      @position += 1
      yield self[@position]
    end
  rescue EOF
  end

  # Returns the byte at the current position as a one-character string.
  # Raises EOF when the position is at or past the end.
  def current_byte
    raise EOF if @position >= length
    self[@position].chr
  end

  # Skip past a list of characters
  def skip(chars=SPACE_CHARACTERS)
    @position += 1 while chars.include?(current_byte)
  end

  # Tests whether +bytes+ occurs at the current position (comparing
  # against the lower-cased data when +lower+ is true). On a match the
  # position moves to the byte after it and true is returned; otherwise
  # the position is left alone and false is returned.
  def match_bytes(bytes, lower=false)
    window = self[position ... position+bytes.length]
    window.downcase! if lower
    matched = (window == bytes)
    @position += bytes.length if matched
    matched
  end

  # Moves the position to the last byte of the next occurrence of +bytes+
  # and returns true. Raises EOF when there is no further occurrence.
  def jump_to(bytes)
    offset = self[position .. -1].index(bytes)
    raise EOF unless offset
    @position += offset + bytes.length - 1
    true
  end

  # Move the position forward until the current byte is one of +byte_list+
  def find_next(byte_list)
    @position += 1 until byte_list.include?(current_byte)
  end
end
407
+
408
+ # Mini parser for detecting character encoding from meta elements
409
+ class EncodingParser
410
+
411
# data - the bytes to scan for an encoding declaration; converted with
#        to_s and wrapped in an EncodingBytes.
def initialize(data)
  @encoding = nil
  @data = EncodingBytes.new(data.to_s)
end
416
+
417
# Markup prefixes and the handler invoked for each. Order matters: the
# first prefix that matches wins, so longer prefixes come before '<'.
@@method_dispatch = [
  ['<!--',  :handle_comment],
  ['<meta', :handle_meta],
  ['</',    :handle_possible_end_tag],
  ['<!',    :handle_other],
  ['<?',    :handle_other],
  ['<',     :handle_possible_start_tag]
]

# Scans the data and returns the encoding found, stripped of surrounding
# whitespace, or nil when none was found.
#
# At each position the dispatch table is tried against the lower-cased
# data; the handler of the first matching prefix runs, and scanning stops
# as soon as a handler returns a false value.
def get_encoding
  @data.each do |byte|
    matched = @@method_dispatch.detect do |entry|
      @data.match_bytes(entry[0], true)
    end
    next if matched.nil?
    break unless send(matched[1])
  end
  @encoding = @encoding.strip unless @encoding.nil?
  return @encoding
end
440
+
441
# Skips a comment by jumping to the end of the next '-->'. Returns true;
# jump_to raises EOF when the comment is never closed.
def handle_comment
  @data.jump_to('-->')
end
445
+
446
# Inspects the attributes of a <meta element for an encoding.
#
# Returns false once a valid encoding has been stored in @encoding, which
# stops the scan. Returns true to keep scanning when the '<meta' prefix is
# not followed by a space character or the attributes run out.
def handle_meta
  # '<meta' not followed by a space is not a meta element; just keep going
  return true unless SPACE_CHARACTERS.include?(@data.current_byte)

  while true
    # Next attribute after the current position
    pair = get_attribute
    return true if pair.nil?

    attr_name, attr_value = pair
    if attr_name == 'charset'
      candidate = attr_value
    elsif attr_name == 'content'
      candidate = ContentAttrParser.new(EncodingBytes.new(attr_value)).parse
    else
      next
    end

    if HTML5.is_valid_encoding(candidate)
      @encoding = candidate
      return false
    end
  end
end
473
+
474
# Handler for a '<' prefix: treats what follows as a possible start tag.
def handle_possible_start_tag
  handle_possible_tag(false)
end
477
+
478
# Handler for a '</' prefix: advances the position by one, then treats
# what follows as a possible end tag.
def handle_possible_end_tag
  @data.position += 1
  handle_possible_tag(true)
end
482
+
483
# Skips over a tag and its attributes. Always returns true so that the
# scan continues.
#
# end_tag - true when called for a '</' prefix.
def handle_possible_tag(end_tag)
  if !ASCII_LETTERS.include?(@data.current_byte)
    # Not a tag name. A possible start tag is simply ignored; a possible
    # end tag is stepped back one byte and handled by handle_other.
    if end_tag
      @data.position -= 1
      handle_other
    end
    return true
  end

  @data.find_next(SPACE_CHARACTERS + ['<', '>'])

  if @data.current_byte == '<'
    # Step back so the '<' is processed again by the main loop
    @data.position -= 1
  else
    # Read and discard all attributes
    begin
      attribute = get_attribute
    end until attribute.nil?
  end
  true
end
507
+
508
# Skips to the next '>'. Returns true; jump_to raises EOF when there is
# no '>' left.
def handle_other
  @data.jump_to('>')
end
511
+
512
# Returns a [name, value] pair for the next attribute in the data, or nil
# when the tag ends ('>') or a new one begins ('<') first.
#
# Leading space characters and '/' are skipped. ASCII upper-case letters
# in both name and value are lower-cased. An attribute without a value is
# returned with '' as its value. EOF raised by @data propagates to the
# caller when the data runs out mid-attribute.
def get_attribute
  @data.skip(SPACE_CHARACTERS + ['/'])

  if @data.current_byte == '<'
    # leave the '<' to be processed again by the caller
    @data.position -= 1
    return nil
  elsif @data.current_byte == '>'
    return nil
  end

  attr_name = []
  attr_value = []
  space_found = false
  #Step 5 attribute name
  while true
    # '=' only ends the name once the name is non-empty. An Array is
    # always truthy, so emptiness has to be tested explicitly; otherwise a
    # leading '=' would produce an attribute with an empty name.
    if @data.current_byte == '=' and not attr_name.empty?
      break
    elsif SPACE_CHARACTERS.include?(@data.current_byte)
      space_found = true
      break
    elsif ['/', '<', '>'].include?(@data.current_byte)
      return [attr_name.join(''), '']
    elsif ASCII_UPPERCASE.include?(@data.current_byte)
      attr_name.push(@data.current_byte.downcase)
    else
      attr_name.push(@data.current_byte)
    end
    #Step 6
    @data.position += 1
  end
  #Step 7
  if space_found
    @data.skip
    #Step 8
    unless @data.current_byte == '='
      # no value follows; step back so the next call sees this byte
      @data.position -= 1
      return [attr_name.join(''), '']
    end
  end
  #XXX need to advance position in both spaces and value case
  #Step 9: move past the '='
  @data.position += 1
  #Step 10
  @data.skip
  #Step 11
  if ["'", '"'].include?(@data.current_byte)
    #11.1 quoted value: collect up to the matching quote
    quote_char = @data.current_byte
    while true
      @data.position+=1
      #11.3
      if @data.current_byte == quote_char
        @data.position += 1
        return [attr_name.join(''), attr_value.join('')]
      #11.4
      elsif ASCII_UPPERCASE.include?(@data.current_byte)
        attr_value.push(@data.current_byte.downcase)
      #11.5
      else
        attr_value.push(@data.current_byte)
      end
    end
  elsif ['>', '<'].include?(@data.current_byte)
    return [attr_name.join(''), '']
  elsif ASCII_UPPERCASE.include?(@data.current_byte)
    attr_value.push(@data.current_byte.downcase)
  else
    attr_value.push(@data.current_byte)
  end
  # Unquoted value: collect up to the next space character, '>' or '<'
  while true
    @data.position += 1
    if (SPACE_CHARACTERS + ['>', '<']).include?(@data.current_byte)
      return [attr_name.join(''), attr_value.join('')]
    elsif ASCII_UPPERCASE.include?(@data.current_byte)
      attr_value.push(@data.current_byte.downcase)
    else
      attr_value.push(@data.current_byte)
    end
  end
end
594
+ end
595
+
596
# Extracts the charset value from the content attribute of a meta element.
class ContentAttrParser
  # data - the attribute value; must offer the position, jump_to, skip,
  #        find_next and current_byte methods that EncodingBytes provides.
  def initialize(data)
    @data = data
  end

  # Returns the text following 'charset=' that comes after the first ';',
  # or nil when that pattern is not present. A value that starts with a
  # quote mark runs to the matching quote; an unquoted value runs to the
  # next space character or to the end of the data.
  def parse
    #Skip to the first ";"
    @data.position = 0
    @data.jump_to(';')
    @data.position += 1
    @data.skip
    #Find the charset attribute name and step past it
    @data.jump_to('charset')
    @data.position += 1
    @data.skip
    #Without an = sign there is no value to return
    return nil unless @data.current_byte == '='
    @data.position += 1
    @data.skip

    if ['"', "'"].include?(@data.current_byte)
      #Encoding between matching quote marks
      quote_mark = @data.current_byte
      @data.position += 1
      value_start = @data.position
      @data.jump_to(quote_mark)
      return @data[value_start ... @data.position]
    end

    #Unquoted value
    value_start = @data.position
    begin
      @data.find_next(SPACE_CHARACTERS)
      return @data[value_start ... @data.position]
    rescue EOF
      #Return the whole remaining value
      return @data[value_start .. -1]
    end
  rescue EOF
    #The data ran out before a value was found
    return nil
  end
end
642
+
643
# Returns true when +encoding+ is a String whose lower-cased, stripped
# form is listed in ENCODINGS; false for nil, non-strings and unknown
# names.
def self.is_valid_encoding(encoding)
  return false unless encoding.kind_of?(String)

  ENCODINGS.include?(encoding.downcase.strip)
end
647
+
648
+ end