feedtools 0.2.26 → 0.2.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166) hide show
  1. data/CHANGELOG +232 -216
  2. data/db/migration.rb +2 -0
  3. data/db/schema.mysql.sql +2 -0
  4. data/db/schema.postgresql.sql +3 -1
  5. data/db/schema.sqlite.sql +3 -1
  6. data/lib/feed_tools.rb +37 -14
  7. data/lib/feed_tools/database_feed_cache.rb +13 -2
  8. data/lib/feed_tools/feed.rb +430 -104
  9. data/lib/feed_tools/feed_item.rb +533 -268
  10. data/lib/feed_tools/helpers/generic_helper.rb +1 -1
  11. data/lib/feed_tools/helpers/html_helper.rb +78 -116
  12. data/lib/feed_tools/helpers/retrieval_helper.rb +33 -3
  13. data/lib/feed_tools/helpers/uri_helper.rb +46 -54
  14. data/lib/feed_tools/monkey_patch.rb +27 -1
  15. data/lib/feed_tools/vendor/html5/History.txt +10 -0
  16. data/lib/feed_tools/vendor/html5/Manifest.txt +117 -0
  17. data/lib/feed_tools/vendor/html5/README +45 -0
  18. data/lib/feed_tools/vendor/html5/Rakefile.rb +33 -0
  19. data/lib/feed_tools/vendor/html5/bin/html5 +217 -0
  20. data/lib/feed_tools/vendor/html5/lib/core_ext/string.rb +17 -0
  21. data/lib/feed_tools/vendor/html5/lib/html5.rb +13 -0
  22. data/lib/feed_tools/vendor/html5/lib/html5/constants.rb +1046 -0
  23. data/lib/feed_tools/vendor/html5/lib/html5/filters/base.rb +10 -0
  24. data/lib/feed_tools/vendor/html5/lib/html5/filters/inject_meta_charset.rb +82 -0
  25. data/lib/feed_tools/vendor/html5/lib/html5/filters/iso639codes.rb +752 -0
  26. data/lib/feed_tools/vendor/html5/lib/html5/filters/optionaltags.rb +198 -0
  27. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc2046.rb +30 -0
  28. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc3987.rb +89 -0
  29. data/lib/feed_tools/vendor/html5/lib/html5/filters/sanitizer.rb +15 -0
  30. data/lib/feed_tools/vendor/html5/lib/html5/filters/validator.rb +830 -0
  31. data/lib/feed_tools/vendor/html5/lib/html5/filters/whitespace.rb +36 -0
  32. data/lib/feed_tools/vendor/html5/lib/html5/html5parser.rb +248 -0
  33. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_body_phase.rb +46 -0
  34. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  35. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_head_phase.rb +50 -0
  36. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/before_head_phase.rb +41 -0
  37. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_body_phase.rb +613 -0
  38. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_caption_phase.rb +69 -0
  39. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_cell_phase.rb +78 -0
  40. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  41. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  42. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_head_phase.rb +138 -0
  43. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_row_phase.rb +89 -0
  44. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_select_phase.rb +85 -0
  45. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_body_phase.rb +86 -0
  46. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_phase.rb +115 -0
  47. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/initial_phase.rb +133 -0
  48. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/phase.rb +154 -0
  49. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/root_element_phase.rb +41 -0
  50. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  51. data/lib/feed_tools/vendor/html5/lib/html5/inputstream.rb +648 -0
  52. data/lib/feed_tools/vendor/html5/lib/html5/liberalxmlparser.rb +158 -0
  53. data/lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb +188 -0
  54. data/lib/feed_tools/vendor/html5/lib/html5/serializer.rb +2 -0
  55. data/lib/feed_tools/vendor/html5/lib/html5/serializer/htmlserializer.rb +179 -0
  56. data/lib/feed_tools/vendor/html5/lib/html5/serializer/xhtmlserializer.rb +20 -0
  57. data/lib/feed_tools/vendor/html5/lib/html5/sniffer.rb +45 -0
  58. data/lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb +966 -0
  59. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders.rb +24 -0
  60. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/base.rb +334 -0
  61. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/hpricot.rb +231 -0
  62. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/rexml.rb +209 -0
  63. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/simpletree.rb +185 -0
  64. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers.rb +26 -0
  65. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/base.rb +162 -0
  66. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/hpricot.rb +48 -0
  67. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/rexml.rb +48 -0
  68. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/simpletree.rb +48 -0
  69. data/lib/feed_tools/vendor/html5/lib/html5/version.rb +3 -0
  70. data/lib/feed_tools/vendor/html5/testdata/encoding/chardet/test_big5.txt +51 -0
  71. data/lib/feed_tools/vendor/html5/testdata/encoding/test-yahoo-jp.dat +10 -0
  72. data/lib/feed_tools/vendor/html5/testdata/encoding/tests1.dat +394 -0
  73. data/lib/feed_tools/vendor/html5/testdata/encoding/tests2.dat +81 -0
  74. data/lib/feed_tools/vendor/html5/testdata/sanitizer/tests1.dat +416 -0
  75. data/lib/feed_tools/vendor/html5/testdata/serializer/core.test +104 -0
  76. data/lib/feed_tools/vendor/html5/testdata/serializer/injectmeta.test +65 -0
  77. data/lib/feed_tools/vendor/html5/testdata/serializer/optionaltags.test +900 -0
  78. data/lib/feed_tools/vendor/html5/testdata/serializer/options.test +60 -0
  79. data/lib/feed_tools/vendor/html5/testdata/serializer/whitespace.test +51 -0
  80. data/lib/feed_tools/vendor/html5/testdata/sites/google-results.htm +1 -0
  81. data/lib/feed_tools/vendor/html5/testdata/sites/python-ref-import.htm +1 -0
  82. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps-old.htm +1 -0
  83. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps.htm +34275 -0
  84. data/lib/feed_tools/vendor/html5/testdata/sniffer/htmlOrFeed.json +43 -0
  85. data/lib/feed_tools/vendor/html5/testdata/tokenizer/contentModelFlags.test +48 -0
  86. data/lib/feed_tools/vendor/html5/testdata/tokenizer/entities.test +2339 -0
  87. data/lib/feed_tools/vendor/html5/testdata/tokenizer/escapeFlag.test +21 -0
  88. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test1.test +172 -0
  89. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test2.test +129 -0
  90. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test3.test +367 -0
  91. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test4.test +198 -0
  92. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests1.dat +1950 -0
  93. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests2.dat +773 -0
  94. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests3.dat +270 -0
  95. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests4.dat +60 -0
  96. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests5.dat +175 -0
  97. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests6.dat +196 -0
  98. data/lib/feed_tools/vendor/html5/testdata/validator/attributes.test +1035 -0
  99. data/lib/feed_tools/vendor/html5/testdata/validator/base-href-attribute.test +787 -0
  100. data/lib/feed_tools/vendor/html5/testdata/validator/base-target-attribute.test +35 -0
  101. data/lib/feed_tools/vendor/html5/testdata/validator/blockquote-cite-attribute.test +7 -0
  102. data/lib/feed_tools/vendor/html5/testdata/validator/classattribute.test +152 -0
  103. data/lib/feed_tools/vendor/html5/testdata/validator/contenteditableattribute.test +59 -0
  104. data/lib/feed_tools/vendor/html5/testdata/validator/contextmenuattribute.test +115 -0
  105. data/lib/feed_tools/vendor/html5/testdata/validator/dirattribute.test +59 -0
  106. data/lib/feed_tools/vendor/html5/testdata/validator/draggableattribute.test +63 -0
  107. data/lib/feed_tools/vendor/html5/testdata/validator/html-xmlns-attribute.test +23 -0
  108. data/lib/feed_tools/vendor/html5/testdata/validator/idattribute.test +115 -0
  109. data/lib/feed_tools/vendor/html5/testdata/validator/inputattributes.test +2795 -0
  110. data/lib/feed_tools/vendor/html5/testdata/validator/irrelevantattribute.test +63 -0
  111. data/lib/feed_tools/vendor/html5/testdata/validator/langattribute.test +5579 -0
  112. data/lib/feed_tools/vendor/html5/testdata/validator/li-value-attribute.test +7 -0
  113. data/lib/feed_tools/vendor/html5/testdata/validator/link-href-attribute.test +7 -0
  114. data/lib/feed_tools/vendor/html5/testdata/validator/link-hreflang-attribute.test +7 -0
  115. data/lib/feed_tools/vendor/html5/testdata/validator/link-rel-attribute.test +271 -0
  116. data/lib/feed_tools/vendor/html5/testdata/validator/ol-start-attribute.test +7 -0
  117. data/lib/feed_tools/vendor/html5/testdata/validator/starttags.test +375 -0
  118. data/lib/feed_tools/vendor/html5/testdata/validator/style-scoped-attribute.test +7 -0
  119. data/lib/feed_tools/vendor/html5/testdata/validator/tabindexattribute.test +79 -0
  120. data/lib/feed_tools/vendor/html5/tests/preamble.rb +72 -0
  121. data/lib/feed_tools/vendor/html5/tests/test_encoding.rb +35 -0
  122. data/lib/feed_tools/vendor/html5/tests/test_lxp.rb +279 -0
  123. data/lib/feed_tools/vendor/html5/tests/test_parser.rb +68 -0
  124. data/lib/feed_tools/vendor/html5/tests/test_sanitizer.rb +142 -0
  125. data/lib/feed_tools/vendor/html5/tests/test_serializer.rb +68 -0
  126. data/lib/feed_tools/vendor/html5/tests/test_sniffer.rb +27 -0
  127. data/lib/feed_tools/vendor/html5/tests/test_stream.rb +62 -0
  128. data/lib/feed_tools/vendor/html5/tests/test_tokenizer.rb +94 -0
  129. data/lib/feed_tools/vendor/html5/tests/test_treewalkers.rb +135 -0
  130. data/lib/feed_tools/vendor/html5/tests/test_validator.rb +31 -0
  131. data/lib/feed_tools/vendor/html5/tests/tokenizer_test_parser.rb +63 -0
  132. data/lib/feed_tools/vendor/uri.rb +781 -0
  133. data/lib/feed_tools/version.rb +1 -1
  134. data/rakefile +27 -6
  135. data/test/unit/atom_test.rb +298 -210
  136. data/test/unit/helper_test.rb +7 -12
  137. data/test/unit/rdf_test.rb +51 -1
  138. data/test/unit/rss_test.rb +13 -3
  139. metadata +239 -116
  140. data/lib/feed_tools/vendor/htree.rb +0 -97
  141. data/lib/feed_tools/vendor/htree/container.rb +0 -10
  142. data/lib/feed_tools/vendor/htree/context.rb +0 -67
  143. data/lib/feed_tools/vendor/htree/display.rb +0 -27
  144. data/lib/feed_tools/vendor/htree/doc.rb +0 -149
  145. data/lib/feed_tools/vendor/htree/elem.rb +0 -262
  146. data/lib/feed_tools/vendor/htree/encoder.rb +0 -163
  147. data/lib/feed_tools/vendor/htree/equality.rb +0 -218
  148. data/lib/feed_tools/vendor/htree/extract_text.rb +0 -37
  149. data/lib/feed_tools/vendor/htree/fstr.rb +0 -33
  150. data/lib/feed_tools/vendor/htree/gencode.rb +0 -97
  151. data/lib/feed_tools/vendor/htree/htmlinfo.rb +0 -672
  152. data/lib/feed_tools/vendor/htree/inspect.rb +0 -108
  153. data/lib/feed_tools/vendor/htree/leaf.rb +0 -94
  154. data/lib/feed_tools/vendor/htree/loc.rb +0 -367
  155. data/lib/feed_tools/vendor/htree/modules.rb +0 -48
  156. data/lib/feed_tools/vendor/htree/name.rb +0 -124
  157. data/lib/feed_tools/vendor/htree/output.rb +0 -207
  158. data/lib/feed_tools/vendor/htree/parse.rb +0 -409
  159. data/lib/feed_tools/vendor/htree/raw_string.rb +0 -124
  160. data/lib/feed_tools/vendor/htree/regexp-util.rb +0 -15
  161. data/lib/feed_tools/vendor/htree/rexml.rb +0 -130
  162. data/lib/feed_tools/vendor/htree/scan.rb +0 -166
  163. data/lib/feed_tools/vendor/htree/tag.rb +0 -111
  164. data/lib/feed_tools/vendor/htree/template.rb +0 -909
  165. data/lib/feed_tools/vendor/htree/text.rb +0 -115
  166. data/lib/feed_tools/vendor/htree/traverse.rb +0 -465
@@ -55,4 +55,4 @@ module FeedTools
55
55
  return result
56
56
  end
57
57
  end
58
- end
58
+ end
@@ -28,6 +28,30 @@ require 'rexml/document'
28
28
  module FeedTools
29
29
  # Methods for pulling remote data
30
30
  module HtmlHelper
31
+
32
+ TIDY_OPTIONS = [
33
+ :add_xml_decl, :add_xml_space, :alt_text, :assume_xml_procins, :bare,
34
+ :clean, :css_prefix, :decorate_inferred_ul, :doctype,
35
+ :drop_empty_paras, :drop_font_tags, :drop_proprietary_attributes,
36
+ :enclose_block_text, :enclose_text, :escape_cdata, :fix_backslash,
37
+ :fix_bad_comments, :fix_uri, :hide_comments, :hide_endtags,
38
+ :indent_cdata, :input_xml, :join_classes, :join_styles,
39
+ :literal_attributes, :logical_emphasis, :lower_literals, :merge_divs,
40
+ :ncr, :new_blocklevel_tags, :new_empty_tags, :new_inline_tags,
41
+ :new_pre_tags, :numeric_entities, :output_html, :output_xhtml,
42
+ :output_xml, :preserve_entities, :quote_ampersand, :quote_marks,
43
+ :quote_nbsp, :repeated_attributes, :replace_color, :show_body_only,
44
+ :uppercase_attributes, :uppercase_tags, :word_2000,
45
+ :accessibility_check, :show_errors, :show_warnings, :break_before_br,
46
+ :indent, :indent_attributes, :indent_spaces, :markup,
47
+ :punctuation_wrap, :split, :tab_size, :vertical_space, :wrap,
48
+ :wrap_asp, :wrap_attributes, :wrap_jste, :wrap_php,
49
+ :wrap_script_literals, :wrap_sections, :ascii_chars, :char_encoding,
50
+ :input_encoding, :language, :newline, :output_bom, :output_encoding,
51
+ :error_file, :force_output, :gnu_emacs, :gnu_emacs_file, :keep_time,
52
+ :output_file, :quiet, :slide_style, :tidy_mark, :write_back
53
+ ]
54
+
31
55
  # Escapes all html entities
32
56
  def self.escape_entities(html)
33
57
  return nil if html.nil?
@@ -43,9 +67,12 @@ module FeedTools
43
67
  unescaped_html = html
44
68
  unescaped_html.gsub!(/&/, "&")
45
69
  unescaped_html.gsub!(/&/, "&")
46
- unescaped_html = unescaped_html.gsub(/&#x\d+;/) do |hex|
47
- "&#" + hex[3..-2].to_i(16).to_s + ";"
48
- end
70
+ substitute_numerical_entities = Proc.new do |s|
71
+ m = $1
72
+ m = "0#{m}" if m[0] == ?x
73
+ [Integer(m)].pack('U*')
74
+ end
75
+ unescaped_html.gsub!(/&#0*((?:\d+)|(?:x[a-f0-9]+));/, &substitute_numerical_entities)
49
76
  unescaped_html = CGI.unescapeHTML(unescaped_html)
50
77
  unescaped_html.gsub!(/'/, "'")
51
78
  unescaped_html.gsub!(/"/, "\"")
@@ -140,7 +167,7 @@ module FeedTools
140
167
  end
141
168
  for path in libtidy_locations
142
169
  if File.exists? path
143
- if File.ftype(path) == "file"
170
+ if File.ftype(path) == "file" || File.ftype(path) == "link"
144
171
  Tidy.path = path
145
172
  @tidy_enabled = true
146
173
  break
@@ -178,12 +205,18 @@ module FeedTools
178
205
  # Tidys up the html
179
206
  def self.tidy_html(html, options = {})
180
207
  return nil if html.nil?
208
+ FeedTools::GenericHelper.validate_options(TIDY_OPTIONS, options.keys)
181
209
 
182
- FeedTools::GenericHelper.validate_options([ :input_encoding,
183
- :output_encoding ],
184
- options.keys)
185
- options = { :input_encoding => "utf-8",
186
- :output_encoding => "utf-8" }.merge(options)
210
+ options = {
211
+ :add_xml_decl => false,
212
+ :char_encoding => "utf8",
213
+ :doctype => "omit",
214
+ :indent => false,
215
+ :logical_emphasis => true,
216
+ :markup => true,
217
+ :show_warnings => false,
218
+ :wrap => 0
219
+ }.merge(options)
187
220
 
188
221
  if FeedTools::HtmlHelper.tidy_enabled?
189
222
  is_fragment = true
@@ -196,39 +229,26 @@ module FeedTools
196
229
  is_fragment = false
197
230
  end
198
231
 
232
+ options[:show_body_only] = true if is_fragment
233
+
199
234
  # Tidy sucks?
200
235
  # TODO: find the correct set of tidy options to set so
201
236
  # that *ugly* hacks like this aren't necessary.
202
237
  html = html.gsub(/\302\240/, "\240")
203
238
 
204
- tidy_html = Tidy.open(:show_warnings=>false) do |tidy|
205
- tidy.options.output_xml = true
206
- tidy.options.markup = true
207
- tidy.options.indent = true
208
- tidy.options.wrap = 0
209
- tidy.options.logical_emphasis = true
210
- tidy.options.input_encoding = options[:input_encoding]
211
- tidy.options.output_encoding = options[:output_encoding]
212
- tidy.options.doctype = "omit"
239
+ tidy_html = Tidy.open(options) do |tidy|
213
240
  xml = tidy.clean(html)
214
241
  xml
215
242
  end
216
- if is_fragment
217
- # Tidy sticks <html>...<body>[our html]</body>...</html> in.
218
- # We don't want this.
219
- tidy_html.strip!
220
- tidy_html.gsub!(/^<html>(.|\n)*<body>/, "")
221
- tidy_html.gsub!(/<\/body>(.|\n)*<\/html>$/, "")
222
- tidy_html.gsub!("\t", " ")
223
- tidy_html = FeedTools::HtmlHelper.unindent(tidy_html, 4)
224
- tidy_html.strip!
225
- end
243
+ tidy_html.strip!
226
244
  else
227
245
  tidy_html = html
228
246
  end
247
+
229
248
  if tidy_html.blank? && !html.blank?
230
249
  tidy_html = html.strip
231
250
  end
251
+
232
252
  return tidy_html
233
253
  end
234
254
 
@@ -260,77 +280,6 @@ module FeedTools
260
280
  return buffer
261
281
  end
262
282
 
263
- # Removes all dangerous html tags from the html formatted text.
264
- # If mode is set to :escape, dangerous and unknown elements will
265
- # be escaped. If mode is set to :strip, dangerous and unknown
266
- # elements and all children will be removed entirely.
267
- # Dangerous or unknown attributes are always removed.
268
- def self.sanitize_html(html, mode=:strip)
269
- return nil if html.nil?
270
-
271
- # Lists borrowed from Mark Pilgrim's feedparser
272
- acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
273
- 'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
274
- 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl',
275
- 'dt', 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4',
276
- 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend',
277
- 'li', 'map', 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's',
278
- 'samp', 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup',
279
- 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt',
280
- 'u', 'ul', 'var']
281
-
282
- acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
283
- 'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
284
- 'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
285
- 'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
286
- 'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
287
- 'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
288
- 'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
289
- 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
290
- 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
291
- 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
292
- 'type', 'usemap', 'valign', 'value', 'vspace', 'width']
293
-
294
- # Replace with appropriate named entities
295
- html.gsub!(/&#x26;/, "&amp;")
296
- html.gsub!(/&#38;/, "&amp;")
297
- html.gsub!(/&lt;!'/, "&amp;lt;!'")
298
-
299
- # Hackity hack. But it works, and it seems plenty fast enough.
300
- html_doc = HTree.parse_xml("<root>" + html + "</root>").to_rexml
301
-
302
- sanitize_node = lambda do |html_node|
303
- if html_node.respond_to? :children
304
- for child in html_node.children
305
- if child.kind_of? REXML::Element
306
- unless acceptable_elements.include? child.name.downcase
307
- if mode == :strip
308
- html_node.delete_element(child)
309
- else
310
- new_child = REXML::Text.new(CGI.escapeHTML(child.to_s))
311
- html_node.insert_after(child, new_child)
312
- html_node.delete_element(child)
313
- end
314
- end
315
- child.attributes.each_attribute do |attribute|
316
- if !(attribute.value =~ /^xmlns(:.+)?$/)
317
- unless acceptable_attributes.include?(
318
- attribute.value.downcase)
319
- child.delete_attribute(attribute.value)
320
- end
321
- end
322
- end
323
- end
324
- sanitize_node.call(child)
325
- end
326
- end
327
- html_node
328
- end
329
- sanitize_node.call(html_doc.root)
330
- html = html_doc.root.inner_xml
331
- return html
332
- end
333
-
334
283
  # Returns true if the type string provided indicates that something is
335
284
  # xml or xhtml content.
336
285
  def self.xml_type?(type)
@@ -405,20 +354,35 @@ module FeedTools
405
354
  ["q", "cite"],
406
355
  ["script", "src"]
407
356
  ]
408
- html_doc = HTree.parse_xml("<root>" + html + "</root>").to_rexml
409
357
 
358
+ # HACK: Prevent the parser from freaking out if it sees this:
359
+ html.gsub!(/<!'/, "&lt;!'")
360
+
361
+ if FeedTools.configurations[:sanitization_enabled]
362
+ fragments = HTML5::HTMLParser.parse_fragment(
363
+ html, :tokenizer => HTML5::HTMLSanitizer)
364
+ else
365
+ fragments = HTML5::HTMLParser.parse_fragment(html)
366
+ end
410
367
  resolve_node = lambda do |html_node|
411
368
  if html_node.kind_of? REXML::Element
412
- for element_attribute_pair in relative_uri_attributes
413
- if html_node.name.downcase == element_attribute_pair[0]
414
- attribute = html_node.attribute(element_attribute_pair[1])
369
+ for element_name, attribute_name in relative_uri_attributes
370
+ if html_node.name.downcase == element_name
371
+ attribute = html_node.attribute(attribute_name)
415
372
  if attribute != nil
416
373
  href = attribute.value
417
374
  href = FeedTools::UriHelper.resolve_relative_uri(
418
375
  href, [html_node.base_uri] | base_uri_sources)
419
- html_node.attribute(
420
- element_attribute_pair[1]).instance_variable_set(
421
- "@value", href)
376
+ href = FeedTools::UriHelper.normalize_url(href)
377
+ html_node.attribute(attribute_name).instance_variable_set(
378
+ "@value", href)
379
+ html_node.attribute(attribute_name).instance_variable_set(
380
+ "@unnormalized", href)
381
+ html_node.attribute(attribute_name).instance_variable_set(
382
+ "@normalized", href)
383
+ if html_node.attribute(attribute_name).value != href
384
+ warn("Failed to update href to resolved value.")
385
+ end
422
386
  end
423
387
  end
424
388
  end
@@ -430,8 +394,12 @@ module FeedTools
430
394
  end
431
395
  html_node
432
396
  end
433
- resolve_node.call(html_doc.root)
434
- html = html_doc.root.inner_xml
397
+ fragments.each do |fragment|
398
+ resolve_node.call(fragment)
399
+ end
400
+ html = (fragments.map do |stuff|
401
+ stuff.to_s
402
+ end).join("")
435
403
  return html
436
404
  end
437
405
 
@@ -552,22 +520,16 @@ module FeedTools
552
520
  content = FeedTools::HtmlHelper.unescape_entities(
553
521
  content_node.inner_xml.strip)
554
522
  else
555
- content = content_node.inner_xml.strip
556
- repair_entities = true
523
+ content = FeedTools::HtmlHelper.unescape_entities(
524
+ content_node.inner_xml.strip)
557
525
  end
558
526
  if type == "text" || mode == "text" ||
559
527
  type == "text/plain" || mode == "text/plain"
560
528
  content = FeedTools::HtmlHelper.escape_entities(content)
561
529
  end
562
530
  unless content.nil?
563
- if FeedTools.configurations[:sanitization_enabled]
564
- content = FeedTools::HtmlHelper.sanitize_html(content, :strip)
565
- end
566
531
  content = FeedTools::HtmlHelper.resolve_relative_uris(content,
567
532
  [content_node.base_uri] | base_uri_sources)
568
- if repair_entities
569
- content = FeedTools::HtmlHelper.unescape_entities(content)
570
- end
571
533
  content = FeedTools::HtmlHelper.tidy_html(content)
572
534
  end
573
535
  if FeedTools.configurations[:tab_spaces] != nil
@@ -108,21 +108,51 @@ module FeedTools
108
108
  proxy_user = nil
109
109
  proxy_password = nil
110
110
 
111
+ auth_user = nil
112
+ auth_password = nil
113
+ auth_scheme = nil
114
+
111
115
  if options[:feed_object] != nil
112
116
  proxy_address =
113
117
  options[:feed_object].configurations[:proxy_address] || nil
114
118
  proxy_port =
115
119
  options[:feed_object].configurations[:proxy_port].to_i || nil
116
120
  proxy_user =
117
- options[:feed_object].configurations[:proxy_user].to_i || nil
121
+ options[:feed_object].configurations[:proxy_user] || nil
118
122
  proxy_password =
119
- options[:feed_object].configurations[:proxy_password].to_i || nil
123
+ options[:feed_object].configurations[:proxy_password] || nil
124
+
125
+ auth_user =
126
+ options[:feed_object].configurations[:auth_user] || nil
127
+ auth_password =
128
+ options[:feed_object].configurations[:auth_password] || nil
129
+ auth_scheme =
130
+ options[:feed_object].configurations[:auth_scheme] || nil
120
131
  end
121
132
 
133
+ if (auth_user &&
134
+ (auth_scheme == nil || auth_scheme.to_s.to_sym == :basic))
135
+ options[:request_headers]["Authorization"] =
136
+ "Basic " + [
137
+ "#{auth_user}:#{auth_password}"
138
+ ].pack('m').delete("\r\n")
139
+ end
140
+
122
141
  # No need to check for nil
123
142
  http = Net::HTTP::Proxy(
124
143
  proxy_address, proxy_port, proxy_user, proxy_password).new(
125
144
  uri.host, (uri.port or 80))
145
+
146
+ if options[:feed_object] != nil &&
147
+ options[:feed_object].configurations[:http_timeout] != nil
148
+ http.open_timeout =
149
+ options[:feed_object].configurations[:http_timeout].to_f
150
+ elsif FeedTools.configurations[:http_timeout] != nil
151
+ http.open_timeout = FeedTools.configurations[:http_timeout].to_f
152
+ end
153
+ if http.open_timeout != nil && http.open_timeout == 0
154
+ http.open_timeout = nil
155
+ end
126
156
 
127
157
  path = uri.path
128
158
  path += ('?' + uri.query) if uri.query
@@ -238,4 +268,4 @@ module FeedTools
238
268
  :head, url, options, &block)
239
269
  end
240
270
  end
241
- end
271
+ end
@@ -59,13 +59,33 @@ module FeedTools
59
59
  # to be. Also translates from the feed: and rss: pseudo-protocols to the
60
60
  # http: protocol.
61
61
  def self.normalize_url(url)
62
- if url.kind_of?(URI)
62
+ if url.nil?
63
+ return nil
64
+ end
65
+ if !url.kind_of?(String)
63
66
  url = url.to_s
64
67
  end
65
68
  if url.blank?
66
- return nil
69
+ return ""
70
+ end
71
+ normalized_url = url.strip
72
+
73
+ begin
74
+ normalized_url =
75
+ FeedTools::URI.convert_path(normalized_url.strip).normalize.to_s
76
+ rescue Exception
77
+ end
78
+
79
+ begin
80
+ begin
81
+ normalized_url =
82
+ FeedTools::URI.parse(normalized_url.strip).normalize.to_s
83
+ rescue Exception
84
+ normalized_url = CGI.unescape(url.strip)
85
+ end
86
+ rescue Exception
87
+ normalized_url = url.strip
67
88
  end
68
- normalized_url = CGI.unescape(url.strip)
69
89
 
70
90
  # if a url begins with the '/' character, it only makes sense that they
71
91
  # meant to be using a file:// url. Fix it for them.
@@ -90,76 +110,40 @@ module FeedTools
90
110
  # deal with all of the many ugly possibilities involved in the rss:
91
111
  # and feed: pseudo-protocols (incidentally, whose crazy idea was this
92
112
  # mess?)
113
+ normalized_url.gsub!(/^htp:\/*/i, "http://")
93
114
  normalized_url.gsub!(/^http:\/*(feed:\/*)?/i, "http://")
94
115
  normalized_url.gsub!(/^http:\/*(rss:\/*)?/i, "http://")
95
116
  normalized_url.gsub!(/^feed:\/*(http:\/*)?/i, "http://")
96
117
  normalized_url.gsub!(/^rss:\/*(http:\/*)?/i, "http://")
97
118
  normalized_url.gsub!(/^file:\/*/i, "file:///")
98
119
  normalized_url.gsub!(/^https:\/*/i, "https://")
120
+ normalized_url.gsub!(/^mms:\/*/i, "http://")
99
121
  # fix (very) bad urls (usually of the user-entered sort)
100
122
  normalized_url.gsub!(/^http:\/*(http:\/*)*/i, "http://")
123
+ normalized_url.gsub!(/^http:\/*$/i, "")
101
124
 
102
125
  if (normalized_url =~ /^file:/i) == 0
103
126
  # Adjust windows-style urls
104
127
  normalized_url.gsub!(/^file:\/\/\/([a-zA-Z])\|/i, 'file:///\1:')
105
128
  normalized_url.gsub!(/\\/, '/')
106
129
  else
107
- if (normalized_url =~ /^https?:\/\//i) == nil
130
+ if FeedTools::URI.parse(normalized_url).scheme == nil &&
131
+ normalized_url =~ /\./ &&
108
132
  normalized_url = "http://" + normalized_url
109
133
  end
110
134
  if normalized_url == "http://"
111
135
  return nil
112
136
  end
113
- begin
114
- scheme, host_part, path =
115
- normalized_url.scan(/^(https?):\/\/([^\/]+)\/(.*)/i).flatten
116
- if scheme != nil && host_part != nil && path != nil
117
- scheme = scheme.downcase
118
- if FeedTools::UriHelper.idn_enabled?
119
- host_part =
120
- IDN::Idna.toASCII(host_part)
121
- end
122
- new_path = ""
123
- for index in 0...path.size
124
- if path[index] <= 32 || path[index] >= 126
125
- new_path << ("%" + path[index].to_s(16).upcase)
126
- else
127
- new_path << path[index..index]
128
- end
129
- end
130
- path = new_path
131
- normalized_url = scheme + "://" + host_part + "/" + path
132
- end
133
- rescue Object
134
- end
135
- begin
136
- feed_uri = URI.parse(normalized_url)
137
- if feed_uri.scheme == nil
138
- feed_uri.scheme = "http"
139
- end
140
- if feed_uri.path.blank?
141
- feed_uri.path = "/"
142
- end
143
- if (feed_uri.path =~ /^[\/]+/) == 0
144
- feed_uri.path.gsub!(/^[\/]+/, "/")
145
- end
146
- while (feed_uri.path =~ /^\/\.\./)
147
- feed_uri.path.gsub!(/^\/\.\./, "")
148
- end
149
- if feed_uri.path.blank?
150
- feed_uri.path = "/"
151
- end
152
- feed_uri.host.downcase!
153
- normalized_url = feed_uri.to_s
154
- rescue URI::InvalidURIError
155
- end
137
+ end
138
+ if normalized_url =~ /^https?:\/\/#/i
139
+ normalized_url.gsub!(/^https?:\/\/#/i, "#")
140
+ end
141
+ if normalized_url =~ /^https?:\/\/\?/i
142
+ normalized_url.gsub!(/^https?:\/\/\?/i, "?")
156
143
  end
157
144
 
158
- # We can't do a proper set of escaping, so this will
159
- # have to do.
160
- normalized_url.gsub!(/%20/, " ")
161
- normalized_url.gsub!(/ /, "%20")
162
-
145
+ normalized_url =
146
+ FeedTools::URI.parse(normalized_url.strip).normalize.to_s
163
147
  return normalized_url
164
148
  end
165
149
 
@@ -168,7 +152,15 @@ module FeedTools
168
152
  return relative_uri if base_uri_sources.blank?
169
153
  return nil if relative_uri.nil?
170
154
  begin
171
- base_uri = URI.parse(
155
+ # Massive HACK to get around file protocol URIs being used to
156
+ # resolve relative URIs on feeds in the local file system.
157
+ # Better to leave these URIs unresolved and hope some other
158
+ # tool resolves them correctly.
159
+ base_uri_sources.reject! do |base_uri|
160
+ base_uri == nil ||
161
+ FeedTools::URI.parse(base_uri).scheme == "file"
162
+ end
163
+ base_uri = FeedTools::URI.parse(
172
164
  FeedTools::XmlHelper.select_not_blank(base_uri_sources))
173
165
  resolved_uri = base_uri
174
166
  if relative_uri.to_s != ''
@@ -207,7 +199,7 @@ module FeedTools
207
199
  end
208
200
  normalized_url = normalize_url(url)
209
201
  require 'uuidtools'
210
- return UUID.sha1_create(UUID_URL_NAMESPACE, normalized_url).to_uri_string
202
+ return UUID.sha1_create(UUID_URL_NAMESPACE, normalized_url).to_uri.to_s
211
203
  end
212
204
 
213
205
  # Returns true if the parameter appears to be a valid uri