feedtools 0.2.26 → 0.2.27

Sign up to get free protection for your applications and to get access to all the features.
Files changed (166) hide show
  1. data/CHANGELOG +232 -216
  2. data/db/migration.rb +2 -0
  3. data/db/schema.mysql.sql +2 -0
  4. data/db/schema.postgresql.sql +3 -1
  5. data/db/schema.sqlite.sql +3 -1
  6. data/lib/feed_tools.rb +37 -14
  7. data/lib/feed_tools/database_feed_cache.rb +13 -2
  8. data/lib/feed_tools/feed.rb +430 -104
  9. data/lib/feed_tools/feed_item.rb +533 -268
  10. data/lib/feed_tools/helpers/generic_helper.rb +1 -1
  11. data/lib/feed_tools/helpers/html_helper.rb +78 -116
  12. data/lib/feed_tools/helpers/retrieval_helper.rb +33 -3
  13. data/lib/feed_tools/helpers/uri_helper.rb +46 -54
  14. data/lib/feed_tools/monkey_patch.rb +27 -1
  15. data/lib/feed_tools/vendor/html5/History.txt +10 -0
  16. data/lib/feed_tools/vendor/html5/Manifest.txt +117 -0
  17. data/lib/feed_tools/vendor/html5/README +45 -0
  18. data/lib/feed_tools/vendor/html5/Rakefile.rb +33 -0
  19. data/lib/feed_tools/vendor/html5/bin/html5 +217 -0
  20. data/lib/feed_tools/vendor/html5/lib/core_ext/string.rb +17 -0
  21. data/lib/feed_tools/vendor/html5/lib/html5.rb +13 -0
  22. data/lib/feed_tools/vendor/html5/lib/html5/constants.rb +1046 -0
  23. data/lib/feed_tools/vendor/html5/lib/html5/filters/base.rb +10 -0
  24. data/lib/feed_tools/vendor/html5/lib/html5/filters/inject_meta_charset.rb +82 -0
  25. data/lib/feed_tools/vendor/html5/lib/html5/filters/iso639codes.rb +752 -0
  26. data/lib/feed_tools/vendor/html5/lib/html5/filters/optionaltags.rb +198 -0
  27. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc2046.rb +30 -0
  28. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc3987.rb +89 -0
  29. data/lib/feed_tools/vendor/html5/lib/html5/filters/sanitizer.rb +15 -0
  30. data/lib/feed_tools/vendor/html5/lib/html5/filters/validator.rb +830 -0
  31. data/lib/feed_tools/vendor/html5/lib/html5/filters/whitespace.rb +36 -0
  32. data/lib/feed_tools/vendor/html5/lib/html5/html5parser.rb +248 -0
  33. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_body_phase.rb +46 -0
  34. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  35. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_head_phase.rb +50 -0
  36. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/before_head_phase.rb +41 -0
  37. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_body_phase.rb +613 -0
  38. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_caption_phase.rb +69 -0
  39. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_cell_phase.rb +78 -0
  40. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  41. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  42. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_head_phase.rb +138 -0
  43. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_row_phase.rb +89 -0
  44. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_select_phase.rb +85 -0
  45. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_body_phase.rb +86 -0
  46. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_phase.rb +115 -0
  47. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/initial_phase.rb +133 -0
  48. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/phase.rb +154 -0
  49. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/root_element_phase.rb +41 -0
  50. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  51. data/lib/feed_tools/vendor/html5/lib/html5/inputstream.rb +648 -0
  52. data/lib/feed_tools/vendor/html5/lib/html5/liberalxmlparser.rb +158 -0
  53. data/lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb +188 -0
  54. data/lib/feed_tools/vendor/html5/lib/html5/serializer.rb +2 -0
  55. data/lib/feed_tools/vendor/html5/lib/html5/serializer/htmlserializer.rb +179 -0
  56. data/lib/feed_tools/vendor/html5/lib/html5/serializer/xhtmlserializer.rb +20 -0
  57. data/lib/feed_tools/vendor/html5/lib/html5/sniffer.rb +45 -0
  58. data/lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb +966 -0
  59. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders.rb +24 -0
  60. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/base.rb +334 -0
  61. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/hpricot.rb +231 -0
  62. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/rexml.rb +209 -0
  63. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/simpletree.rb +185 -0
  64. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers.rb +26 -0
  65. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/base.rb +162 -0
  66. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/hpricot.rb +48 -0
  67. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/rexml.rb +48 -0
  68. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/simpletree.rb +48 -0
  69. data/lib/feed_tools/vendor/html5/lib/html5/version.rb +3 -0
  70. data/lib/feed_tools/vendor/html5/testdata/encoding/chardet/test_big5.txt +51 -0
  71. data/lib/feed_tools/vendor/html5/testdata/encoding/test-yahoo-jp.dat +10 -0
  72. data/lib/feed_tools/vendor/html5/testdata/encoding/tests1.dat +394 -0
  73. data/lib/feed_tools/vendor/html5/testdata/encoding/tests2.dat +81 -0
  74. data/lib/feed_tools/vendor/html5/testdata/sanitizer/tests1.dat +416 -0
  75. data/lib/feed_tools/vendor/html5/testdata/serializer/core.test +104 -0
  76. data/lib/feed_tools/vendor/html5/testdata/serializer/injectmeta.test +65 -0
  77. data/lib/feed_tools/vendor/html5/testdata/serializer/optionaltags.test +900 -0
  78. data/lib/feed_tools/vendor/html5/testdata/serializer/options.test +60 -0
  79. data/lib/feed_tools/vendor/html5/testdata/serializer/whitespace.test +51 -0
  80. data/lib/feed_tools/vendor/html5/testdata/sites/google-results.htm +1 -0
  81. data/lib/feed_tools/vendor/html5/testdata/sites/python-ref-import.htm +1 -0
  82. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps-old.htm +1 -0
  83. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps.htm +34275 -0
  84. data/lib/feed_tools/vendor/html5/testdata/sniffer/htmlOrFeed.json +43 -0
  85. data/lib/feed_tools/vendor/html5/testdata/tokenizer/contentModelFlags.test +48 -0
  86. data/lib/feed_tools/vendor/html5/testdata/tokenizer/entities.test +2339 -0
  87. data/lib/feed_tools/vendor/html5/testdata/tokenizer/escapeFlag.test +21 -0
  88. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test1.test +172 -0
  89. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test2.test +129 -0
  90. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test3.test +367 -0
  91. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test4.test +198 -0
  92. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests1.dat +1950 -0
  93. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests2.dat +773 -0
  94. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests3.dat +270 -0
  95. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests4.dat +60 -0
  96. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests5.dat +175 -0
  97. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests6.dat +196 -0
  98. data/lib/feed_tools/vendor/html5/testdata/validator/attributes.test +1035 -0
  99. data/lib/feed_tools/vendor/html5/testdata/validator/base-href-attribute.test +787 -0
  100. data/lib/feed_tools/vendor/html5/testdata/validator/base-target-attribute.test +35 -0
  101. data/lib/feed_tools/vendor/html5/testdata/validator/blockquote-cite-attribute.test +7 -0
  102. data/lib/feed_tools/vendor/html5/testdata/validator/classattribute.test +152 -0
  103. data/lib/feed_tools/vendor/html5/testdata/validator/contenteditableattribute.test +59 -0
  104. data/lib/feed_tools/vendor/html5/testdata/validator/contextmenuattribute.test +115 -0
  105. data/lib/feed_tools/vendor/html5/testdata/validator/dirattribute.test +59 -0
  106. data/lib/feed_tools/vendor/html5/testdata/validator/draggableattribute.test +63 -0
  107. data/lib/feed_tools/vendor/html5/testdata/validator/html-xmlns-attribute.test +23 -0
  108. data/lib/feed_tools/vendor/html5/testdata/validator/idattribute.test +115 -0
  109. data/lib/feed_tools/vendor/html5/testdata/validator/inputattributes.test +2795 -0
  110. data/lib/feed_tools/vendor/html5/testdata/validator/irrelevantattribute.test +63 -0
  111. data/lib/feed_tools/vendor/html5/testdata/validator/langattribute.test +5579 -0
  112. data/lib/feed_tools/vendor/html5/testdata/validator/li-value-attribute.test +7 -0
  113. data/lib/feed_tools/vendor/html5/testdata/validator/link-href-attribute.test +7 -0
  114. data/lib/feed_tools/vendor/html5/testdata/validator/link-hreflang-attribute.test +7 -0
  115. data/lib/feed_tools/vendor/html5/testdata/validator/link-rel-attribute.test +271 -0
  116. data/lib/feed_tools/vendor/html5/testdata/validator/ol-start-attribute.test +7 -0
  117. data/lib/feed_tools/vendor/html5/testdata/validator/starttags.test +375 -0
  118. data/lib/feed_tools/vendor/html5/testdata/validator/style-scoped-attribute.test +7 -0
  119. data/lib/feed_tools/vendor/html5/testdata/validator/tabindexattribute.test +79 -0
  120. data/lib/feed_tools/vendor/html5/tests/preamble.rb +72 -0
  121. data/lib/feed_tools/vendor/html5/tests/test_encoding.rb +35 -0
  122. data/lib/feed_tools/vendor/html5/tests/test_lxp.rb +279 -0
  123. data/lib/feed_tools/vendor/html5/tests/test_parser.rb +68 -0
  124. data/lib/feed_tools/vendor/html5/tests/test_sanitizer.rb +142 -0
  125. data/lib/feed_tools/vendor/html5/tests/test_serializer.rb +68 -0
  126. data/lib/feed_tools/vendor/html5/tests/test_sniffer.rb +27 -0
  127. data/lib/feed_tools/vendor/html5/tests/test_stream.rb +62 -0
  128. data/lib/feed_tools/vendor/html5/tests/test_tokenizer.rb +94 -0
  129. data/lib/feed_tools/vendor/html5/tests/test_treewalkers.rb +135 -0
  130. data/lib/feed_tools/vendor/html5/tests/test_validator.rb +31 -0
  131. data/lib/feed_tools/vendor/html5/tests/tokenizer_test_parser.rb +63 -0
  132. data/lib/feed_tools/vendor/uri.rb +781 -0
  133. data/lib/feed_tools/version.rb +1 -1
  134. data/rakefile +27 -6
  135. data/test/unit/atom_test.rb +298 -210
  136. data/test/unit/helper_test.rb +7 -12
  137. data/test/unit/rdf_test.rb +51 -1
  138. data/test/unit/rss_test.rb +13 -3
  139. metadata +239 -116
  140. data/lib/feed_tools/vendor/htree.rb +0 -97
  141. data/lib/feed_tools/vendor/htree/container.rb +0 -10
  142. data/lib/feed_tools/vendor/htree/context.rb +0 -67
  143. data/lib/feed_tools/vendor/htree/display.rb +0 -27
  144. data/lib/feed_tools/vendor/htree/doc.rb +0 -149
  145. data/lib/feed_tools/vendor/htree/elem.rb +0 -262
  146. data/lib/feed_tools/vendor/htree/encoder.rb +0 -163
  147. data/lib/feed_tools/vendor/htree/equality.rb +0 -218
  148. data/lib/feed_tools/vendor/htree/extract_text.rb +0 -37
  149. data/lib/feed_tools/vendor/htree/fstr.rb +0 -33
  150. data/lib/feed_tools/vendor/htree/gencode.rb +0 -97
  151. data/lib/feed_tools/vendor/htree/htmlinfo.rb +0 -672
  152. data/lib/feed_tools/vendor/htree/inspect.rb +0 -108
  153. data/lib/feed_tools/vendor/htree/leaf.rb +0 -94
  154. data/lib/feed_tools/vendor/htree/loc.rb +0 -367
  155. data/lib/feed_tools/vendor/htree/modules.rb +0 -48
  156. data/lib/feed_tools/vendor/htree/name.rb +0 -124
  157. data/lib/feed_tools/vendor/htree/output.rb +0 -207
  158. data/lib/feed_tools/vendor/htree/parse.rb +0 -409
  159. data/lib/feed_tools/vendor/htree/raw_string.rb +0 -124
  160. data/lib/feed_tools/vendor/htree/regexp-util.rb +0 -15
  161. data/lib/feed_tools/vendor/htree/rexml.rb +0 -130
  162. data/lib/feed_tools/vendor/htree/scan.rb +0 -166
  163. data/lib/feed_tools/vendor/htree/tag.rb +0 -111
  164. data/lib/feed_tools/vendor/htree/template.rb +0 -909
  165. data/lib/feed_tools/vendor/htree/text.rb +0 -115
  166. data/lib/feed_tools/vendor/htree/traverse.rb +0 -465
@@ -55,4 +55,4 @@ module FeedTools
55
55
  return result
56
56
  end
57
57
  end
58
- end
58
+ end
@@ -28,6 +28,30 @@ require 'rexml/document'
28
28
  module FeedTools
29
29
  # Methods for pulling remote data
30
30
  module HtmlHelper
31
+
32
+ TIDY_OPTIONS = [
33
+ :add_xml_decl, :add_xml_space, :alt_text, :assume_xml_procins, :bare,
34
+ :clean, :css_prefix, :decorate_inferred_ul, :doctype,
35
+ :drop_empty_paras, :drop_font_tags, :drop_proprietary_attributes,
36
+ :enclose_block_text, :enclose_text, :escape_cdata, :fix_backslash,
37
+ :fix_bad_comments, :fix_uri, :hide_comments, :hide_endtags,
38
+ :indent_cdata, :input_xml, :join_classes, :join_styles,
39
+ :literal_attributes, :logical_emphasis, :lower_literals, :merge_divs,
40
+ :ncr, :new_blocklevel_tags, :new_empty_tags, :new_inline_tags,
41
+ :new_pre_tags, :numeric_entities, :output_html, :output_xhtml,
42
+ :output_xml, :preserve_entities, :quote_ampersand, :quote_marks,
43
+ :quote_nbsp, :repeated_attributes, :replace_color, :show_body_only,
44
+ :uppercase_attributes, :uppercase_tags, :word_2000,
45
+ :accessibility_check, :show_errors, :show_warnings, :break_before_br,
46
+ :indent, :indent_attributes, :indent_spaces, :markup,
47
+ :punctuation_wrap, :split, :tab_size, :vertical_space, :wrap,
48
+ :wrap_asp, :wrap_attributes, :wrap_jste, :wrap_php,
49
+ :wrap_script_literals, :wrap_sections, :ascii_chars, :char_encoding,
50
+ :input_encoding, :language, :newline, :output_bom, :output_encoding,
51
+ :error_file, :force_output, :gnu_emacs, :gnu_emacs_file, :keep_time,
52
+ :output_file, :quiet, :slide_style, :tidy_mark, :write_back
53
+ ]
54
+
31
55
  # Escapes all html entities
32
56
  def self.escape_entities(html)
33
57
  return nil if html.nil?
@@ -43,9 +67,12 @@ module FeedTools
43
67
  unescaped_html = html
44
68
  unescaped_html.gsub!(/&#x26;/, "&amp;")
45
69
  unescaped_html.gsub!(/&#38;/, "&amp;")
46
- unescaped_html = unescaped_html.gsub(/&#x\d+;/) do |hex|
47
- "&#" + hex[3..-2].to_i(16).to_s + ";"
48
- end
70
+ substitute_numerical_entities = Proc.new do |s|
71
+ m = $1
72
+ m = "0#{m}" if m[0] == ?x
73
+ [Integer(m)].pack('U*')
74
+ end
75
+ unescaped_html.gsub!(/&#0*((?:\d+)|(?:x[a-f0-9]+));/, &substitute_numerical_entities)
49
76
  unescaped_html = CGI.unescapeHTML(unescaped_html)
50
77
  unescaped_html.gsub!(/&apos;/, "'")
51
78
  unescaped_html.gsub!(/&quot;/, "\"")
@@ -140,7 +167,7 @@ module FeedTools
140
167
  end
141
168
  for path in libtidy_locations
142
169
  if File.exists? path
143
- if File.ftype(path) == "file"
170
+ if File.ftype(path) == "file" || File.ftype(path) == "link"
144
171
  Tidy.path = path
145
172
  @tidy_enabled = true
146
173
  break
@@ -178,12 +205,18 @@ module FeedTools
178
205
  # Tidys up the html
179
206
  def self.tidy_html(html, options = {})
180
207
  return nil if html.nil?
208
+ FeedTools::GenericHelper.validate_options(TIDY_OPTIONS, options.keys)
181
209
 
182
- FeedTools::GenericHelper.validate_options([ :input_encoding,
183
- :output_encoding ],
184
- options.keys)
185
- options = { :input_encoding => "utf-8",
186
- :output_encoding => "utf-8" }.merge(options)
210
+ options = {
211
+ :add_xml_decl => false,
212
+ :char_encoding => "utf8",
213
+ :doctype => "omit",
214
+ :indent => false,
215
+ :logical_emphasis => true,
216
+ :markup => true,
217
+ :show_warnings => false,
218
+ :wrap => 0
219
+ }.merge(options)
187
220
 
188
221
  if FeedTools::HtmlHelper.tidy_enabled?
189
222
  is_fragment = true
@@ -196,39 +229,26 @@ module FeedTools
196
229
  is_fragment = false
197
230
  end
198
231
 
232
+ options[:show_body_only] = true if is_fragment
233
+
199
234
  # Tidy sucks?
200
235
  # TODO: find the correct set of tidy options to set so
201
236
  # that *ugly* hacks like this aren't necessary.
202
237
  html = html.gsub(/\302\240/, "\240")
203
238
 
204
- tidy_html = Tidy.open(:show_warnings=>false) do |tidy|
205
- tidy.options.output_xml = true
206
- tidy.options.markup = true
207
- tidy.options.indent = true
208
- tidy.options.wrap = 0
209
- tidy.options.logical_emphasis = true
210
- tidy.options.input_encoding = options[:input_encoding]
211
- tidy.options.output_encoding = options[:output_encoding]
212
- tidy.options.doctype = "omit"
239
+ tidy_html = Tidy.open(options) do |tidy|
213
240
  xml = tidy.clean(html)
214
241
  xml
215
242
  end
216
- if is_fragment
217
- # Tidy sticks <html>...<body>[our html]</body>...</html> in.
218
- # We don't want this.
219
- tidy_html.strip!
220
- tidy_html.gsub!(/^<html>(.|\n)*<body>/, "")
221
- tidy_html.gsub!(/<\/body>(.|\n)*<\/html>$/, "")
222
- tidy_html.gsub!("\t", " ")
223
- tidy_html = FeedTools::HtmlHelper.unindent(tidy_html, 4)
224
- tidy_html.strip!
225
- end
243
+ tidy_html.strip!
226
244
  else
227
245
  tidy_html = html
228
246
  end
247
+
229
248
  if tidy_html.blank? && !html.blank?
230
249
  tidy_html = html.strip
231
250
  end
251
+
232
252
  return tidy_html
233
253
  end
234
254
 
@@ -260,77 +280,6 @@ module FeedTools
260
280
  return buffer
261
281
  end
262
282
 
263
- # Removes all dangerous html tags from the html formatted text.
264
- # If mode is set to :escape, dangerous and unknown elements will
265
- # be escaped. If mode is set to :strip, dangerous and unknown
266
- # elements and all children will be removed entirely.
267
- # Dangerous or unknown attributes are always removed.
268
- def self.sanitize_html(html, mode=:strip)
269
- return nil if html.nil?
270
-
271
- # Lists borrowed from Mark Pilgrim's feedparser
272
- acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
273
- 'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
274
- 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl',
275
- 'dt', 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4',
276
- 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend',
277
- 'li', 'map', 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's',
278
- 'samp', 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup',
279
- 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt',
280
- 'u', 'ul', 'var']
281
-
282
- acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
283
- 'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
284
- 'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
285
- 'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
286
- 'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
287
- 'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
288
- 'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
289
- 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
290
- 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
291
- 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
292
- 'type', 'usemap', 'valign', 'value', 'vspace', 'width']
293
-
294
- # Replace with appropriate named entities
295
- html.gsub!(/&#x26;/, "&amp;")
296
- html.gsub!(/&#38;/, "&amp;")
297
- html.gsub!(/&lt;!'/, "&amp;lt;!'")
298
-
299
- # Hackity hack. But it works, and it seems plenty fast enough.
300
- html_doc = HTree.parse_xml("<root>" + html + "</root>").to_rexml
301
-
302
- sanitize_node = lambda do |html_node|
303
- if html_node.respond_to? :children
304
- for child in html_node.children
305
- if child.kind_of? REXML::Element
306
- unless acceptable_elements.include? child.name.downcase
307
- if mode == :strip
308
- html_node.delete_element(child)
309
- else
310
- new_child = REXML::Text.new(CGI.escapeHTML(child.to_s))
311
- html_node.insert_after(child, new_child)
312
- html_node.delete_element(child)
313
- end
314
- end
315
- child.attributes.each_attribute do |attribute|
316
- if !(attribute.value =~ /^xmlns(:.+)?$/)
317
- unless acceptable_attributes.include?(
318
- attribute.value.downcase)
319
- child.delete_attribute(attribute.value)
320
- end
321
- end
322
- end
323
- end
324
- sanitize_node.call(child)
325
- end
326
- end
327
- html_node
328
- end
329
- sanitize_node.call(html_doc.root)
330
- html = html_doc.root.inner_xml
331
- return html
332
- end
333
-
334
283
  # Returns true if the type string provided indicates that something is
335
284
  # xml or xhtml content.
336
285
  def self.xml_type?(type)
@@ -405,20 +354,35 @@ module FeedTools
405
354
  ["q", "cite"],
406
355
  ["script", "src"]
407
356
  ]
408
- html_doc = HTree.parse_xml("<root>" + html + "</root>").to_rexml
409
357
 
358
+ # HACK: Prevent the parser from freaking out if it sees this:
359
+ html.gsub!(/<!'/, "&lt;!'")
360
+
361
+ if FeedTools.configurations[:sanitization_enabled]
362
+ fragments = HTML5::HTMLParser.parse_fragment(
363
+ html, :tokenizer => HTML5::HTMLSanitizer)
364
+ else
365
+ fragments = HTML5::HTMLParser.parse_fragment(html)
366
+ end
410
367
  resolve_node = lambda do |html_node|
411
368
  if html_node.kind_of? REXML::Element
412
- for element_attribute_pair in relative_uri_attributes
413
- if html_node.name.downcase == element_attribute_pair[0]
414
- attribute = html_node.attribute(element_attribute_pair[1])
369
+ for element_name, attribute_name in relative_uri_attributes
370
+ if html_node.name.downcase == element_name
371
+ attribute = html_node.attribute(attribute_name)
415
372
  if attribute != nil
416
373
  href = attribute.value
417
374
  href = FeedTools::UriHelper.resolve_relative_uri(
418
375
  href, [html_node.base_uri] | base_uri_sources)
419
- html_node.attribute(
420
- element_attribute_pair[1]).instance_variable_set(
421
- "@value", href)
376
+ href = FeedTools::UriHelper.normalize_url(href)
377
+ html_node.attribute(attribute_name).instance_variable_set(
378
+ "@value", href)
379
+ html_node.attribute(attribute_name).instance_variable_set(
380
+ "@unnormalized", href)
381
+ html_node.attribute(attribute_name).instance_variable_set(
382
+ "@normalized", href)
383
+ if html_node.attribute(attribute_name).value != href
384
+ warn("Failed to update href to resolved value.")
385
+ end
422
386
  end
423
387
  end
424
388
  end
@@ -430,8 +394,12 @@ module FeedTools
430
394
  end
431
395
  html_node
432
396
  end
433
- resolve_node.call(html_doc.root)
434
- html = html_doc.root.inner_xml
397
+ fragments.each do |fragment|
398
+ resolve_node.call(fragment)
399
+ end
400
+ html = (fragments.map do |stuff|
401
+ stuff.to_s
402
+ end).join("")
435
403
  return html
436
404
  end
437
405
 
@@ -552,22 +520,16 @@ module FeedTools
552
520
  content = FeedTools::HtmlHelper.unescape_entities(
553
521
  content_node.inner_xml.strip)
554
522
  else
555
- content = content_node.inner_xml.strip
556
- repair_entities = true
523
+ content = FeedTools::HtmlHelper.unescape_entities(
524
+ content_node.inner_xml.strip)
557
525
  end
558
526
  if type == "text" || mode == "text" ||
559
527
  type == "text/plain" || mode == "text/plain"
560
528
  content = FeedTools::HtmlHelper.escape_entities(content)
561
529
  end
562
530
  unless content.nil?
563
- if FeedTools.configurations[:sanitization_enabled]
564
- content = FeedTools::HtmlHelper.sanitize_html(content, :strip)
565
- end
566
531
  content = FeedTools::HtmlHelper.resolve_relative_uris(content,
567
532
  [content_node.base_uri] | base_uri_sources)
568
- if repair_entities
569
- content = FeedTools::HtmlHelper.unescape_entities(content)
570
- end
571
533
  content = FeedTools::HtmlHelper.tidy_html(content)
572
534
  end
573
535
  if FeedTools.configurations[:tab_spaces] != nil
@@ -108,21 +108,51 @@ module FeedTools
108
108
  proxy_user = nil
109
109
  proxy_password = nil
110
110
 
111
+ auth_user = nil
112
+ auth_password = nil
113
+ auth_scheme = nil
114
+
111
115
  if options[:feed_object] != nil
112
116
  proxy_address =
113
117
  options[:feed_object].configurations[:proxy_address] || nil
114
118
  proxy_port =
115
119
  options[:feed_object].configurations[:proxy_port].to_i || nil
116
120
  proxy_user =
117
- options[:feed_object].configurations[:proxy_user].to_i || nil
121
+ options[:feed_object].configurations[:proxy_user] || nil
118
122
  proxy_password =
119
- options[:feed_object].configurations[:proxy_password].to_i || nil
123
+ options[:feed_object].configurations[:proxy_password] || nil
124
+
125
+ auth_user =
126
+ options[:feed_object].configurations[:auth_user] || nil
127
+ auth_password =
128
+ options[:feed_object].configurations[:auth_password] || nil
129
+ auth_scheme =
130
+ options[:feed_object].configurations[:auth_scheme] || nil
120
131
  end
121
132
 
133
+ if (auth_user &&
134
+ (auth_scheme == nil || auth_scheme.to_s.to_sym == :basic))
135
+ options[:request_headers]["Authorization"] =
136
+ "Basic " + [
137
+ "#{auth_user}:#{auth_password}"
138
+ ].pack('m').delete("\r\n")
139
+ end
140
+
122
141
  # No need to check for nil
123
142
  http = Net::HTTP::Proxy(
124
143
  proxy_address, proxy_port, proxy_user, proxy_password).new(
125
144
  uri.host, (uri.port or 80))
145
+
146
+ if options[:feed_object] != nil &&
147
+ options[:feed_object].configurations[:http_timeout] != nil
148
+ http.open_timeout =
149
+ options[:feed_object].configurations[:http_timeout].to_f
150
+ elsif FeedTools.configurations[:http_timeout] != nil
151
+ http.open_timeout = FeedTools.configurations[:http_timeout].to_f
152
+ end
153
+ if http.open_timeout != nil && http.open_timeout == 0
154
+ http.open_timeout = nil
155
+ end
126
156
 
127
157
  path = uri.path
128
158
  path += ('?' + uri.query) if uri.query
@@ -238,4 +268,4 @@ module FeedTools
238
268
  :head, url, options, &block)
239
269
  end
240
270
  end
241
- end
271
+ end
@@ -59,13 +59,33 @@ module FeedTools
59
59
  # to be. Also translates from the feed: and rss: pseudo-protocols to the
60
60
  # http: protocol.
61
61
  def self.normalize_url(url)
62
- if url.kind_of?(URI)
62
+ if url.nil?
63
+ return nil
64
+ end
65
+ if !url.kind_of?(String)
63
66
  url = url.to_s
64
67
  end
65
68
  if url.blank?
66
- return nil
69
+ return ""
70
+ end
71
+ normalized_url = url.strip
72
+
73
+ begin
74
+ normalized_url =
75
+ FeedTools::URI.convert_path(normalized_url.strip).normalize.to_s
76
+ rescue Exception
77
+ end
78
+
79
+ begin
80
+ begin
81
+ normalized_url =
82
+ FeedTools::URI.parse(normalized_url.strip).normalize.to_s
83
+ rescue Exception
84
+ normalized_url = CGI.unescape(url.strip)
85
+ end
86
+ rescue Exception
87
+ normalized_url = url.strip
67
88
  end
68
- normalized_url = CGI.unescape(url.strip)
69
89
 
70
90
  # if a url begins with the '/' character, it only makes sense that they
71
91
  # meant to be using a file:// url. Fix it for them.
@@ -90,76 +110,40 @@ module FeedTools
90
110
  # deal with all of the many ugly possibilities involved in the rss:
91
111
  # and feed: pseudo-protocols (incidentally, whose crazy idea was this
92
112
  # mess?)
113
+ normalized_url.gsub!(/^htp:\/*/i, "http://")
93
114
  normalized_url.gsub!(/^http:\/*(feed:\/*)?/i, "http://")
94
115
  normalized_url.gsub!(/^http:\/*(rss:\/*)?/i, "http://")
95
116
  normalized_url.gsub!(/^feed:\/*(http:\/*)?/i, "http://")
96
117
  normalized_url.gsub!(/^rss:\/*(http:\/*)?/i, "http://")
97
118
  normalized_url.gsub!(/^file:\/*/i, "file:///")
98
119
  normalized_url.gsub!(/^https:\/*/i, "https://")
120
+ normalized_url.gsub!(/^mms:\/*/i, "http://")
99
121
  # fix (very) bad urls (usually of the user-entered sort)
100
122
  normalized_url.gsub!(/^http:\/*(http:\/*)*/i, "http://")
123
+ normalized_url.gsub!(/^http:\/*$/i, "")
101
124
 
102
125
  if (normalized_url =~ /^file:/i) == 0
103
126
  # Adjust windows-style urls
104
127
  normalized_url.gsub!(/^file:\/\/\/([a-zA-Z])\|/i, 'file:///\1:')
105
128
  normalized_url.gsub!(/\\/, '/')
106
129
  else
107
- if (normalized_url =~ /^https?:\/\//i) == nil
130
+ if FeedTools::URI.parse(normalized_url).scheme == nil &&
131
+ normalized_url =~ /\./ &&
108
132
  normalized_url = "http://" + normalized_url
109
133
  end
110
134
  if normalized_url == "http://"
111
135
  return nil
112
136
  end
113
- begin
114
- scheme, host_part, path =
115
- normalized_url.scan(/^(https?):\/\/([^\/]+)\/(.*)/i).flatten
116
- if scheme != nil && host_part != nil && path != nil
117
- scheme = scheme.downcase
118
- if FeedTools::UriHelper.idn_enabled?
119
- host_part =
120
- IDN::Idna.toASCII(host_part)
121
- end
122
- new_path = ""
123
- for index in 0...path.size
124
- if path[index] <= 32 || path[index] >= 126
125
- new_path << ("%" + path[index].to_s(16).upcase)
126
- else
127
- new_path << path[index..index]
128
- end
129
- end
130
- path = new_path
131
- normalized_url = scheme + "://" + host_part + "/" + path
132
- end
133
- rescue Object
134
- end
135
- begin
136
- feed_uri = URI.parse(normalized_url)
137
- if feed_uri.scheme == nil
138
- feed_uri.scheme = "http"
139
- end
140
- if feed_uri.path.blank?
141
- feed_uri.path = "/"
142
- end
143
- if (feed_uri.path =~ /^[\/]+/) == 0
144
- feed_uri.path.gsub!(/^[\/]+/, "/")
145
- end
146
- while (feed_uri.path =~ /^\/\.\./)
147
- feed_uri.path.gsub!(/^\/\.\./, "")
148
- end
149
- if feed_uri.path.blank?
150
- feed_uri.path = "/"
151
- end
152
- feed_uri.host.downcase!
153
- normalized_url = feed_uri.to_s
154
- rescue URI::InvalidURIError
155
- end
137
+ end
138
+ if normalized_url =~ /^https?:\/\/#/i
139
+ normalized_url.gsub!(/^https?:\/\/#/i, "#")
140
+ end
141
+ if normalized_url =~ /^https?:\/\/\?/i
142
+ normalized_url.gsub!(/^https?:\/\/\?/i, "?")
156
143
  end
157
144
 
158
- # We can't do a proper set of escaping, so this will
159
- # have to do.
160
- normalized_url.gsub!(/%20/, " ")
161
- normalized_url.gsub!(/ /, "%20")
162
-
145
+ normalized_url =
146
+ FeedTools::URI.parse(normalized_url.strip).normalize.to_s
163
147
  return normalized_url
164
148
  end
165
149
 
@@ -168,7 +152,15 @@ module FeedTools
168
152
  return relative_uri if base_uri_sources.blank?
169
153
  return nil if relative_uri.nil?
170
154
  begin
171
- base_uri = URI.parse(
155
+ # Massive HACK to get around file protocol URIs being used to
156
+ # resolve relative URIs on feeds in the local file system.
157
+ # Better to leave these URIs unresolved and hope some other
158
+ # tool resolves them correctly.
159
+ base_uri_sources.reject! do |base_uri|
160
+ base_uri == nil ||
161
+ FeedTools::URI.parse(base_uri).scheme == "file"
162
+ end
163
+ base_uri = FeedTools::URI.parse(
172
164
  FeedTools::XmlHelper.select_not_blank(base_uri_sources))
173
165
  resolved_uri = base_uri
174
166
  if relative_uri.to_s != ''
@@ -207,7 +199,7 @@ module FeedTools
207
199
  end
208
200
  normalized_url = normalize_url(url)
209
201
  require 'uuidtools'
210
- return UUID.sha1_create(UUID_URL_NAMESPACE, normalized_url).to_uri_string
202
+ return UUID.sha1_create(UUID_URL_NAMESPACE, normalized_url).to_uri.to_s
211
203
  end
212
204
 
213
205
  # Returns true if the parameter appears to be a valid uri