feedtools 0.2.26 → 0.2.27
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +232 -216
- data/db/migration.rb +2 -0
- data/db/schema.mysql.sql +2 -0
- data/db/schema.postgresql.sql +3 -1
- data/db/schema.sqlite.sql +3 -1
- data/lib/feed_tools.rb +37 -14
- data/lib/feed_tools/database_feed_cache.rb +13 -2
- data/lib/feed_tools/feed.rb +430 -104
- data/lib/feed_tools/feed_item.rb +533 -268
- data/lib/feed_tools/helpers/generic_helper.rb +1 -1
- data/lib/feed_tools/helpers/html_helper.rb +78 -116
- data/lib/feed_tools/helpers/retrieval_helper.rb +33 -3
- data/lib/feed_tools/helpers/uri_helper.rb +46 -54
- data/lib/feed_tools/monkey_patch.rb +27 -1
- data/lib/feed_tools/vendor/html5/History.txt +10 -0
- data/lib/feed_tools/vendor/html5/Manifest.txt +117 -0
- data/lib/feed_tools/vendor/html5/README +45 -0
- data/lib/feed_tools/vendor/html5/Rakefile.rb +33 -0
- data/lib/feed_tools/vendor/html5/bin/html5 +217 -0
- data/lib/feed_tools/vendor/html5/lib/core_ext/string.rb +17 -0
- data/lib/feed_tools/vendor/html5/lib/html5.rb +13 -0
- data/lib/feed_tools/vendor/html5/lib/html5/constants.rb +1046 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/base.rb +10 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/inject_meta_charset.rb +82 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/iso639codes.rb +752 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/optionaltags.rb +198 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc2046.rb +30 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc3987.rb +89 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/sanitizer.rb +15 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/validator.rb +830 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/whitespace.rb +36 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser.rb +248 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_body_phase.rb +46 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_frameset_phase.rb +33 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_head_phase.rb +50 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/before_head_phase.rb +41 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_body_phase.rb +613 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_caption_phase.rb +69 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_cell_phase.rb +78 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_column_group_phase.rb +55 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_frameset_phase.rb +57 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_head_phase.rb +138 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_row_phase.rb +89 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_select_phase.rb +85 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_body_phase.rb +86 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_phase.rb +115 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/initial_phase.rb +133 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/phase.rb +154 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/root_element_phase.rb +41 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/trailing_end_phase.rb +35 -0
- data/lib/feed_tools/vendor/html5/lib/html5/inputstream.rb +648 -0
- data/lib/feed_tools/vendor/html5/lib/html5/liberalxmlparser.rb +158 -0
- data/lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb +188 -0
- data/lib/feed_tools/vendor/html5/lib/html5/serializer.rb +2 -0
- data/lib/feed_tools/vendor/html5/lib/html5/serializer/htmlserializer.rb +179 -0
- data/lib/feed_tools/vendor/html5/lib/html5/serializer/xhtmlserializer.rb +20 -0
- data/lib/feed_tools/vendor/html5/lib/html5/sniffer.rb +45 -0
- data/lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb +966 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treebuilders.rb +24 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/base.rb +334 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/hpricot.rb +231 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/rexml.rb +209 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/simpletree.rb +185 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treewalkers.rb +26 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/base.rb +162 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/hpricot.rb +48 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/rexml.rb +48 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/simpletree.rb +48 -0
- data/lib/feed_tools/vendor/html5/lib/html5/version.rb +3 -0
- data/lib/feed_tools/vendor/html5/testdata/encoding/chardet/test_big5.txt +51 -0
- data/lib/feed_tools/vendor/html5/testdata/encoding/test-yahoo-jp.dat +10 -0
- data/lib/feed_tools/vendor/html5/testdata/encoding/tests1.dat +394 -0
- data/lib/feed_tools/vendor/html5/testdata/encoding/tests2.dat +81 -0
- data/lib/feed_tools/vendor/html5/testdata/sanitizer/tests1.dat +416 -0
- data/lib/feed_tools/vendor/html5/testdata/serializer/core.test +104 -0
- data/lib/feed_tools/vendor/html5/testdata/serializer/injectmeta.test +65 -0
- data/lib/feed_tools/vendor/html5/testdata/serializer/optionaltags.test +900 -0
- data/lib/feed_tools/vendor/html5/testdata/serializer/options.test +60 -0
- data/lib/feed_tools/vendor/html5/testdata/serializer/whitespace.test +51 -0
- data/lib/feed_tools/vendor/html5/testdata/sites/google-results.htm +1 -0
- data/lib/feed_tools/vendor/html5/testdata/sites/python-ref-import.htm +1 -0
- data/lib/feed_tools/vendor/html5/testdata/sites/web-apps-old.htm +1 -0
- data/lib/feed_tools/vendor/html5/testdata/sites/web-apps.htm +34275 -0
- data/lib/feed_tools/vendor/html5/testdata/sniffer/htmlOrFeed.json +43 -0
- data/lib/feed_tools/vendor/html5/testdata/tokenizer/contentModelFlags.test +48 -0
- data/lib/feed_tools/vendor/html5/testdata/tokenizer/entities.test +2339 -0
- data/lib/feed_tools/vendor/html5/testdata/tokenizer/escapeFlag.test +21 -0
- data/lib/feed_tools/vendor/html5/testdata/tokenizer/test1.test +172 -0
- data/lib/feed_tools/vendor/html5/testdata/tokenizer/test2.test +129 -0
- data/lib/feed_tools/vendor/html5/testdata/tokenizer/test3.test +367 -0
- data/lib/feed_tools/vendor/html5/testdata/tokenizer/test4.test +198 -0
- data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests1.dat +1950 -0
- data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests2.dat +773 -0
- data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests3.dat +270 -0
- data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests4.dat +60 -0
- data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests5.dat +175 -0
- data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests6.dat +196 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/attributes.test +1035 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/base-href-attribute.test +787 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/base-target-attribute.test +35 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/blockquote-cite-attribute.test +7 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/classattribute.test +152 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/contenteditableattribute.test +59 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/contextmenuattribute.test +115 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/dirattribute.test +59 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/draggableattribute.test +63 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/html-xmlns-attribute.test +23 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/idattribute.test +115 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/inputattributes.test +2795 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/irrelevantattribute.test +63 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/langattribute.test +5579 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/li-value-attribute.test +7 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/link-href-attribute.test +7 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/link-hreflang-attribute.test +7 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/link-rel-attribute.test +271 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/ol-start-attribute.test +7 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/starttags.test +375 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/style-scoped-attribute.test +7 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/tabindexattribute.test +79 -0
- data/lib/feed_tools/vendor/html5/tests/preamble.rb +72 -0
- data/lib/feed_tools/vendor/html5/tests/test_encoding.rb +35 -0
- data/lib/feed_tools/vendor/html5/tests/test_lxp.rb +279 -0
- data/lib/feed_tools/vendor/html5/tests/test_parser.rb +68 -0
- data/lib/feed_tools/vendor/html5/tests/test_sanitizer.rb +142 -0
- data/lib/feed_tools/vendor/html5/tests/test_serializer.rb +68 -0
- data/lib/feed_tools/vendor/html5/tests/test_sniffer.rb +27 -0
- data/lib/feed_tools/vendor/html5/tests/test_stream.rb +62 -0
- data/lib/feed_tools/vendor/html5/tests/test_tokenizer.rb +94 -0
- data/lib/feed_tools/vendor/html5/tests/test_treewalkers.rb +135 -0
- data/lib/feed_tools/vendor/html5/tests/test_validator.rb +31 -0
- data/lib/feed_tools/vendor/html5/tests/tokenizer_test_parser.rb +63 -0
- data/lib/feed_tools/vendor/uri.rb +781 -0
- data/lib/feed_tools/version.rb +1 -1
- data/rakefile +27 -6
- data/test/unit/atom_test.rb +298 -210
- data/test/unit/helper_test.rb +7 -12
- data/test/unit/rdf_test.rb +51 -1
- data/test/unit/rss_test.rb +13 -3
- metadata +239 -116
- data/lib/feed_tools/vendor/htree.rb +0 -97
- data/lib/feed_tools/vendor/htree/container.rb +0 -10
- data/lib/feed_tools/vendor/htree/context.rb +0 -67
- data/lib/feed_tools/vendor/htree/display.rb +0 -27
- data/lib/feed_tools/vendor/htree/doc.rb +0 -149
- data/lib/feed_tools/vendor/htree/elem.rb +0 -262
- data/lib/feed_tools/vendor/htree/encoder.rb +0 -163
- data/lib/feed_tools/vendor/htree/equality.rb +0 -218
- data/lib/feed_tools/vendor/htree/extract_text.rb +0 -37
- data/lib/feed_tools/vendor/htree/fstr.rb +0 -33
- data/lib/feed_tools/vendor/htree/gencode.rb +0 -97
- data/lib/feed_tools/vendor/htree/htmlinfo.rb +0 -672
- data/lib/feed_tools/vendor/htree/inspect.rb +0 -108
- data/lib/feed_tools/vendor/htree/leaf.rb +0 -94
- data/lib/feed_tools/vendor/htree/loc.rb +0 -367
- data/lib/feed_tools/vendor/htree/modules.rb +0 -48
- data/lib/feed_tools/vendor/htree/name.rb +0 -124
- data/lib/feed_tools/vendor/htree/output.rb +0 -207
- data/lib/feed_tools/vendor/htree/parse.rb +0 -409
- data/lib/feed_tools/vendor/htree/raw_string.rb +0 -124
- data/lib/feed_tools/vendor/htree/regexp-util.rb +0 -15
- data/lib/feed_tools/vendor/htree/rexml.rb +0 -130
- data/lib/feed_tools/vendor/htree/scan.rb +0 -166
- data/lib/feed_tools/vendor/htree/tag.rb +0 -111
- data/lib/feed_tools/vendor/htree/template.rb +0 -909
- data/lib/feed_tools/vendor/htree/text.rb +0 -115
- data/lib/feed_tools/vendor/htree/traverse.rb +0 -465
@@ -28,6 +28,30 @@ require 'rexml/document'
|
|
28
28
|
module FeedTools
|
29
29
|
# Methods for pulling remote data
|
30
30
|
module HtmlHelper
|
31
|
+
|
32
|
+
TIDY_OPTIONS = [
|
33
|
+
:add_xml_decl, :add_xml_space, :alt_text, :assume_xml_procins, :bare,
|
34
|
+
:clean, :css_prefix, :decorate_inferred_ul, :doctype,
|
35
|
+
:drop_empty_paras, :drop_font_tags, :drop_proprietary_attributes,
|
36
|
+
:enclose_block_text, :enclose_text, :escape_cdata, :fix_backslash,
|
37
|
+
:fix_bad_comments, :fix_uri, :hide_comments, :hide_endtags,
|
38
|
+
:indent_cdata, :input_xml, :join_classes, :join_styles,
|
39
|
+
:literal_attributes, :logical_emphasis, :lower_literals, :merge_divs,
|
40
|
+
:ncr, :new_blocklevel_tags, :new_empty_tags, :new_inline_tags,
|
41
|
+
:new_pre_tags, :numeric_entities, :output_html, :output_xhtml,
|
42
|
+
:output_xml, :preserve_entities, :quote_ampersand, :quote_marks,
|
43
|
+
:quote_nbsp, :repeated_attributes, :replace_color, :show_body_only,
|
44
|
+
:uppercase_attributes, :uppercase_tags, :word_2000,
|
45
|
+
:accessibility_check, :show_errors, :show_warnings, :break_before_br,
|
46
|
+
:indent, :indent_attributes, :indent_spaces, :markup,
|
47
|
+
:punctuation_wrap, :split, :tab_size, :vertical_space, :wrap,
|
48
|
+
:wrap_asp, :wrap_attributes, :wrap_jste, :wrap_php,
|
49
|
+
:wrap_script_literals, :wrap_sections, :ascii_chars, :char_encoding,
|
50
|
+
:input_encoding, :language, :newline, :output_bom, :output_encoding,
|
51
|
+
:error_file, :force_output, :gnu_emacs, :gnu_emacs_file, :keep_time,
|
52
|
+
:output_file, :quiet, :slide_style, :tidy_mark, :write_back
|
53
|
+
]
|
54
|
+
|
31
55
|
# Escapes all html entities
|
32
56
|
def self.escape_entities(html)
|
33
57
|
return nil if html.nil?
|
@@ -43,9 +67,12 @@ module FeedTools
|
|
43
67
|
unescaped_html = html
|
44
68
|
unescaped_html.gsub!(/&/, "&")
|
45
69
|
unescaped_html.gsub!(/&/, "&")
|
46
|
-
|
47
|
-
|
48
|
-
|
70
|
+
substitute_numerical_entities = Proc.new do |s|
|
71
|
+
m = $1
|
72
|
+
m = "0#{m}" if m[0] == ?x
|
73
|
+
[Integer(m)].pack('U*')
|
74
|
+
end
|
75
|
+
unescaped_html.gsub!(/&#0*((?:\d+)|(?:x[a-f0-9]+));/, &substitute_numerical_entities)
|
49
76
|
unescaped_html = CGI.unescapeHTML(unescaped_html)
|
50
77
|
unescaped_html.gsub!(/'/, "'")
|
51
78
|
unescaped_html.gsub!(/"/, "\"")
|
@@ -140,7 +167,7 @@ module FeedTools
|
|
140
167
|
end
|
141
168
|
for path in libtidy_locations
|
142
169
|
if File.exists? path
|
143
|
-
if File.ftype(path) == "file"
|
170
|
+
if File.ftype(path) == "file" || File.ftype(path) == "link"
|
144
171
|
Tidy.path = path
|
145
172
|
@tidy_enabled = true
|
146
173
|
break
|
@@ -178,12 +205,18 @@ module FeedTools
|
|
178
205
|
# Tidys up the html
|
179
206
|
def self.tidy_html(html, options = {})
|
180
207
|
return nil if html.nil?
|
208
|
+
FeedTools::GenericHelper.validate_options(TIDY_OPTIONS, options.keys)
|
181
209
|
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
210
|
+
options = {
|
211
|
+
:add_xml_decl => false,
|
212
|
+
:char_encoding => "utf8",
|
213
|
+
:doctype => "omit",
|
214
|
+
:indent => false,
|
215
|
+
:logical_emphasis => true,
|
216
|
+
:markup => true,
|
217
|
+
:show_warnings => false,
|
218
|
+
:wrap => 0
|
219
|
+
}.merge(options)
|
187
220
|
|
188
221
|
if FeedTools::HtmlHelper.tidy_enabled?
|
189
222
|
is_fragment = true
|
@@ -196,39 +229,26 @@ module FeedTools
|
|
196
229
|
is_fragment = false
|
197
230
|
end
|
198
231
|
|
232
|
+
options[:show_body_only] = true if is_fragment
|
233
|
+
|
199
234
|
# Tidy sucks?
|
200
235
|
# TODO: find the correct set of tidy options to set so
|
201
236
|
# that *ugly* hacks like this aren't necessary.
|
202
237
|
html = html.gsub(/\302\240/, "\240")
|
203
238
|
|
204
|
-
tidy_html = Tidy.open(
|
205
|
-
tidy.options.output_xml = true
|
206
|
-
tidy.options.markup = true
|
207
|
-
tidy.options.indent = true
|
208
|
-
tidy.options.wrap = 0
|
209
|
-
tidy.options.logical_emphasis = true
|
210
|
-
tidy.options.input_encoding = options[:input_encoding]
|
211
|
-
tidy.options.output_encoding = options[:output_encoding]
|
212
|
-
tidy.options.doctype = "omit"
|
239
|
+
tidy_html = Tidy.open(options) do |tidy|
|
213
240
|
xml = tidy.clean(html)
|
214
241
|
xml
|
215
242
|
end
|
216
|
-
|
217
|
-
# Tidy sticks <html>...<body>[our html]</body>...</html> in.
|
218
|
-
# We don't want this.
|
219
|
-
tidy_html.strip!
|
220
|
-
tidy_html.gsub!(/^<html>(.|\n)*<body>/, "")
|
221
|
-
tidy_html.gsub!(/<\/body>(.|\n)*<\/html>$/, "")
|
222
|
-
tidy_html.gsub!("\t", " ")
|
223
|
-
tidy_html = FeedTools::HtmlHelper.unindent(tidy_html, 4)
|
224
|
-
tidy_html.strip!
|
225
|
-
end
|
243
|
+
tidy_html.strip!
|
226
244
|
else
|
227
245
|
tidy_html = html
|
228
246
|
end
|
247
|
+
|
229
248
|
if tidy_html.blank? && !html.blank?
|
230
249
|
tidy_html = html.strip
|
231
250
|
end
|
251
|
+
|
232
252
|
return tidy_html
|
233
253
|
end
|
234
254
|
|
@@ -260,77 +280,6 @@ module FeedTools
|
|
260
280
|
return buffer
|
261
281
|
end
|
262
282
|
|
263
|
-
# Removes all dangerous html tags from the html formatted text.
|
264
|
-
# If mode is set to :escape, dangerous and unknown elements will
|
265
|
-
# be escaped. If mode is set to :strip, dangerous and unknown
|
266
|
-
# elements and all children will be removed entirely.
|
267
|
-
# Dangerous or unknown attributes are always removed.
|
268
|
-
def self.sanitize_html(html, mode=:strip)
|
269
|
-
return nil if html.nil?
|
270
|
-
|
271
|
-
# Lists borrowed from Mark Pilgrim's feedparser
|
272
|
-
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
|
273
|
-
'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
|
274
|
-
'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl',
|
275
|
-
'dt', 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4',
|
276
|
-
'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend',
|
277
|
-
'li', 'map', 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's',
|
278
|
-
'samp', 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup',
|
279
|
-
'table', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt',
|
280
|
-
'u', 'ul', 'var']
|
281
|
-
|
282
|
-
acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
|
283
|
-
'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
|
284
|
-
'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
|
285
|
-
'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
|
286
|
-
'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
|
287
|
-
'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
|
288
|
-
'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
|
289
|
-
'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
|
290
|
-
'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
|
291
|
-
'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
|
292
|
-
'type', 'usemap', 'valign', 'value', 'vspace', 'width']
|
293
|
-
|
294
|
-
# Replace with appropriate named entities
|
295
|
-
html.gsub!(/&/, "&")
|
296
|
-
html.gsub!(/&/, "&")
|
297
|
-
html.gsub!(/<!'/, "&lt;!'")
|
298
|
-
|
299
|
-
# Hackity hack. But it works, and it seems plenty fast enough.
|
300
|
-
html_doc = HTree.parse_xml("<root>" + html + "</root>").to_rexml
|
301
|
-
|
302
|
-
sanitize_node = lambda do |html_node|
|
303
|
-
if html_node.respond_to? :children
|
304
|
-
for child in html_node.children
|
305
|
-
if child.kind_of? REXML::Element
|
306
|
-
unless acceptable_elements.include? child.name.downcase
|
307
|
-
if mode == :strip
|
308
|
-
html_node.delete_element(child)
|
309
|
-
else
|
310
|
-
new_child = REXML::Text.new(CGI.escapeHTML(child.to_s))
|
311
|
-
html_node.insert_after(child, new_child)
|
312
|
-
html_node.delete_element(child)
|
313
|
-
end
|
314
|
-
end
|
315
|
-
child.attributes.each_attribute do |attribute|
|
316
|
-
if !(attribute.value =~ /^xmlns(:.+)?$/)
|
317
|
-
unless acceptable_attributes.include?(
|
318
|
-
attribute.value.downcase)
|
319
|
-
child.delete_attribute(attribute.value)
|
320
|
-
end
|
321
|
-
end
|
322
|
-
end
|
323
|
-
end
|
324
|
-
sanitize_node.call(child)
|
325
|
-
end
|
326
|
-
end
|
327
|
-
html_node
|
328
|
-
end
|
329
|
-
sanitize_node.call(html_doc.root)
|
330
|
-
html = html_doc.root.inner_xml
|
331
|
-
return html
|
332
|
-
end
|
333
|
-
|
334
283
|
# Returns true if the type string provided indicates that something is
|
335
284
|
# xml or xhtml content.
|
336
285
|
def self.xml_type?(type)
|
@@ -405,20 +354,35 @@ module FeedTools
|
|
405
354
|
["q", "cite"],
|
406
355
|
["script", "src"]
|
407
356
|
]
|
408
|
-
html_doc = HTree.parse_xml("<root>" + html + "</root>").to_rexml
|
409
357
|
|
358
|
+
# HACK: Prevent the parser from freaking out if it sees this:
|
359
|
+
html.gsub!(/<!'/, "<!'")
|
360
|
+
|
361
|
+
if FeedTools.configurations[:sanitization_enabled]
|
362
|
+
fragments = HTML5::HTMLParser.parse_fragment(
|
363
|
+
html, :tokenizer => HTML5::HTMLSanitizer)
|
364
|
+
else
|
365
|
+
fragments = HTML5::HTMLParser.parse_fragment(html)
|
366
|
+
end
|
410
367
|
resolve_node = lambda do |html_node|
|
411
368
|
if html_node.kind_of? REXML::Element
|
412
|
-
for
|
413
|
-
if html_node.name.downcase ==
|
414
|
-
attribute = html_node.attribute(
|
369
|
+
for element_name, attribute_name in relative_uri_attributes
|
370
|
+
if html_node.name.downcase == element_name
|
371
|
+
attribute = html_node.attribute(attribute_name)
|
415
372
|
if attribute != nil
|
416
373
|
href = attribute.value
|
417
374
|
href = FeedTools::UriHelper.resolve_relative_uri(
|
418
375
|
href, [html_node.base_uri] | base_uri_sources)
|
419
|
-
|
420
|
-
|
421
|
-
|
376
|
+
href = FeedTools::UriHelper.normalize_url(href)
|
377
|
+
html_node.attribute(attribute_name).instance_variable_set(
|
378
|
+
"@value", href)
|
379
|
+
html_node.attribute(attribute_name).instance_variable_set(
|
380
|
+
"@unnormalized", href)
|
381
|
+
html_node.attribute(attribute_name).instance_variable_set(
|
382
|
+
"@normalized", href)
|
383
|
+
if html_node.attribute(attribute_name).value != href
|
384
|
+
warn("Failed to update href to resolved value.")
|
385
|
+
end
|
422
386
|
end
|
423
387
|
end
|
424
388
|
end
|
@@ -430,8 +394,12 @@ module FeedTools
|
|
430
394
|
end
|
431
395
|
html_node
|
432
396
|
end
|
433
|
-
|
434
|
-
|
397
|
+
fragments.each do |fragment|
|
398
|
+
resolve_node.call(fragment)
|
399
|
+
end
|
400
|
+
html = (fragments.map do |stuff|
|
401
|
+
stuff.to_s
|
402
|
+
end).join("")
|
435
403
|
return html
|
436
404
|
end
|
437
405
|
|
@@ -552,22 +520,16 @@ module FeedTools
|
|
552
520
|
content = FeedTools::HtmlHelper.unescape_entities(
|
553
521
|
content_node.inner_xml.strip)
|
554
522
|
else
|
555
|
-
content =
|
556
|
-
|
523
|
+
content = FeedTools::HtmlHelper.unescape_entities(
|
524
|
+
content_node.inner_xml.strip)
|
557
525
|
end
|
558
526
|
if type == "text" || mode == "text" ||
|
559
527
|
type == "text/plain" || mode == "text/plain"
|
560
528
|
content = FeedTools::HtmlHelper.escape_entities(content)
|
561
529
|
end
|
562
530
|
unless content.nil?
|
563
|
-
if FeedTools.configurations[:sanitization_enabled]
|
564
|
-
content = FeedTools::HtmlHelper.sanitize_html(content, :strip)
|
565
|
-
end
|
566
531
|
content = FeedTools::HtmlHelper.resolve_relative_uris(content,
|
567
532
|
[content_node.base_uri] | base_uri_sources)
|
568
|
-
if repair_entities
|
569
|
-
content = FeedTools::HtmlHelper.unescape_entities(content)
|
570
|
-
end
|
571
533
|
content = FeedTools::HtmlHelper.tidy_html(content)
|
572
534
|
end
|
573
535
|
if FeedTools.configurations[:tab_spaces] != nil
|
@@ -108,21 +108,51 @@ module FeedTools
|
|
108
108
|
proxy_user = nil
|
109
109
|
proxy_password = nil
|
110
110
|
|
111
|
+
auth_user = nil
|
112
|
+
auth_password = nil
|
113
|
+
auth_scheme = nil
|
114
|
+
|
111
115
|
if options[:feed_object] != nil
|
112
116
|
proxy_address =
|
113
117
|
options[:feed_object].configurations[:proxy_address] || nil
|
114
118
|
proxy_port =
|
115
119
|
options[:feed_object].configurations[:proxy_port].to_i || nil
|
116
120
|
proxy_user =
|
117
|
-
options[:feed_object].configurations[:proxy_user]
|
121
|
+
options[:feed_object].configurations[:proxy_user] || nil
|
118
122
|
proxy_password =
|
119
|
-
options[:feed_object].configurations[:proxy_password]
|
123
|
+
options[:feed_object].configurations[:proxy_password] || nil
|
124
|
+
|
125
|
+
auth_user =
|
126
|
+
options[:feed_object].configurations[:auth_user] || nil
|
127
|
+
auth_password =
|
128
|
+
options[:feed_object].configurations[:auth_password] || nil
|
129
|
+
auth_scheme =
|
130
|
+
options[:feed_object].configurations[:auth_scheme] || nil
|
120
131
|
end
|
121
132
|
|
133
|
+
if (auth_user &&
|
134
|
+
(auth_scheme == nil || auth_scheme.to_s.to_sym == :basic))
|
135
|
+
options[:request_headers]["Authorization"] =
|
136
|
+
"Basic " + [
|
137
|
+
"#{auth_user}:#{auth_password}"
|
138
|
+
].pack('m').delete("\r\n")
|
139
|
+
end
|
140
|
+
|
122
141
|
# No need to check for nil
|
123
142
|
http = Net::HTTP::Proxy(
|
124
143
|
proxy_address, proxy_port, proxy_user, proxy_password).new(
|
125
144
|
uri.host, (uri.port or 80))
|
145
|
+
|
146
|
+
if options[:feed_object] != nil &&
|
147
|
+
options[:feed_object].configurations[:http_timeout] != nil
|
148
|
+
http.open_timeout =
|
149
|
+
options[:feed_object].configurations[:http_timeout].to_f
|
150
|
+
elsif FeedTools.configurations[:http_timeout] != nil
|
151
|
+
http.open_timeout = FeedTools.configurations[:http_timeout].to_f
|
152
|
+
end
|
153
|
+
if http.open_timeout != nil && http.open_timeout == 0
|
154
|
+
http.open_timeout = nil
|
155
|
+
end
|
126
156
|
|
127
157
|
path = uri.path
|
128
158
|
path += ('?' + uri.query) if uri.query
|
@@ -238,4 +268,4 @@ module FeedTools
|
|
238
268
|
:head, url, options, &block)
|
239
269
|
end
|
240
270
|
end
|
241
|
-
end
|
271
|
+
end
|
@@ -59,13 +59,33 @@ module FeedTools
|
|
59
59
|
# to be. Also translates from the feed: and rss: pseudo-protocols to the
|
60
60
|
# http: protocol.
|
61
61
|
def self.normalize_url(url)
|
62
|
-
if url.
|
62
|
+
if url.nil?
|
63
|
+
return nil
|
64
|
+
end
|
65
|
+
if !url.kind_of?(String)
|
63
66
|
url = url.to_s
|
64
67
|
end
|
65
68
|
if url.blank?
|
66
|
-
return
|
69
|
+
return ""
|
70
|
+
end
|
71
|
+
normalized_url = url.strip
|
72
|
+
|
73
|
+
begin
|
74
|
+
normalized_url =
|
75
|
+
FeedTools::URI.convert_path(normalized_url.strip).normalize.to_s
|
76
|
+
rescue Exception
|
77
|
+
end
|
78
|
+
|
79
|
+
begin
|
80
|
+
begin
|
81
|
+
normalized_url =
|
82
|
+
FeedTools::URI.parse(normalized_url.strip).normalize.to_s
|
83
|
+
rescue Exception
|
84
|
+
normalized_url = CGI.unescape(url.strip)
|
85
|
+
end
|
86
|
+
rescue Exception
|
87
|
+
normalized_url = url.strip
|
67
88
|
end
|
68
|
-
normalized_url = CGI.unescape(url.strip)
|
69
89
|
|
70
90
|
# if a url begins with the '/' character, it only makes sense that they
|
71
91
|
# meant to be using a file:// url. Fix it for them.
|
@@ -90,76 +110,40 @@ module FeedTools
|
|
90
110
|
# deal with all of the many ugly possibilities involved in the rss:
|
91
111
|
# and feed: pseudo-protocols (incidentally, whose crazy idea was this
|
92
112
|
# mess?)
|
113
|
+
normalized_url.gsub!(/^htp:\/*/i, "http://")
|
93
114
|
normalized_url.gsub!(/^http:\/*(feed:\/*)?/i, "http://")
|
94
115
|
normalized_url.gsub!(/^http:\/*(rss:\/*)?/i, "http://")
|
95
116
|
normalized_url.gsub!(/^feed:\/*(http:\/*)?/i, "http://")
|
96
117
|
normalized_url.gsub!(/^rss:\/*(http:\/*)?/i, "http://")
|
97
118
|
normalized_url.gsub!(/^file:\/*/i, "file:///")
|
98
119
|
normalized_url.gsub!(/^https:\/*/i, "https://")
|
120
|
+
normalized_url.gsub!(/^mms:\/*/i, "http://")
|
99
121
|
# fix (very) bad urls (usually of the user-entered sort)
|
100
122
|
normalized_url.gsub!(/^http:\/*(http:\/*)*/i, "http://")
|
123
|
+
normalized_url.gsub!(/^http:\/*$/i, "")
|
101
124
|
|
102
125
|
if (normalized_url =~ /^file:/i) == 0
|
103
126
|
# Adjust windows-style urls
|
104
127
|
normalized_url.gsub!(/^file:\/\/\/([a-zA-Z])\|/i, 'file:///\1:')
|
105
128
|
normalized_url.gsub!(/\\/, '/')
|
106
129
|
else
|
107
|
-
if (normalized_url
|
130
|
+
if FeedTools::URI.parse(normalized_url).scheme == nil &&
|
131
|
+
normalized_url =~ /\./ &&
|
108
132
|
normalized_url = "http://" + normalized_url
|
109
133
|
end
|
110
134
|
if normalized_url == "http://"
|
111
135
|
return nil
|
112
136
|
end
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
host_part =
|
120
|
-
IDN::Idna.toASCII(host_part)
|
121
|
-
end
|
122
|
-
new_path = ""
|
123
|
-
for index in 0...path.size
|
124
|
-
if path[index] <= 32 || path[index] >= 126
|
125
|
-
new_path << ("%" + path[index].to_s(16).upcase)
|
126
|
-
else
|
127
|
-
new_path << path[index..index]
|
128
|
-
end
|
129
|
-
end
|
130
|
-
path = new_path
|
131
|
-
normalized_url = scheme + "://" + host_part + "/" + path
|
132
|
-
end
|
133
|
-
rescue Object
|
134
|
-
end
|
135
|
-
begin
|
136
|
-
feed_uri = URI.parse(normalized_url)
|
137
|
-
if feed_uri.scheme == nil
|
138
|
-
feed_uri.scheme = "http"
|
139
|
-
end
|
140
|
-
if feed_uri.path.blank?
|
141
|
-
feed_uri.path = "/"
|
142
|
-
end
|
143
|
-
if (feed_uri.path =~ /^[\/]+/) == 0
|
144
|
-
feed_uri.path.gsub!(/^[\/]+/, "/")
|
145
|
-
end
|
146
|
-
while (feed_uri.path =~ /^\/\.\./)
|
147
|
-
feed_uri.path.gsub!(/^\/\.\./, "")
|
148
|
-
end
|
149
|
-
if feed_uri.path.blank?
|
150
|
-
feed_uri.path = "/"
|
151
|
-
end
|
152
|
-
feed_uri.host.downcase!
|
153
|
-
normalized_url = feed_uri.to_s
|
154
|
-
rescue URI::InvalidURIError
|
155
|
-
end
|
137
|
+
end
|
138
|
+
if normalized_url =~ /^https?:\/\/#/i
|
139
|
+
normalized_url.gsub!(/^https?:\/\/#/i, "#")
|
140
|
+
end
|
141
|
+
if normalized_url =~ /^https?:\/\/\?/i
|
142
|
+
normalized_url.gsub!(/^https?:\/\/\?/i, "?")
|
156
143
|
end
|
157
144
|
|
158
|
-
|
159
|
-
|
160
|
-
normalized_url.gsub!(/%20/, " ")
|
161
|
-
normalized_url.gsub!(/ /, "%20")
|
162
|
-
|
145
|
+
normalized_url =
|
146
|
+
FeedTools::URI.parse(normalized_url.strip).normalize.to_s
|
163
147
|
return normalized_url
|
164
148
|
end
|
165
149
|
|
@@ -168,7 +152,15 @@ module FeedTools
|
|
168
152
|
return relative_uri if base_uri_sources.blank?
|
169
153
|
return nil if relative_uri.nil?
|
170
154
|
begin
|
171
|
-
|
155
|
+
# Massive HACK to get around file protocol URIs being used to
|
156
|
+
# resolve relative URIs on feeds in the local file system.
|
157
|
+
# Better to leave these URIs unresolved and hope some other
|
158
|
+
# tool resolves them correctly.
|
159
|
+
base_uri_sources.reject! do |base_uri|
|
160
|
+
base_uri == nil ||
|
161
|
+
FeedTools::URI.parse(base_uri).scheme == "file"
|
162
|
+
end
|
163
|
+
base_uri = FeedTools::URI.parse(
|
172
164
|
FeedTools::XmlHelper.select_not_blank(base_uri_sources))
|
173
165
|
resolved_uri = base_uri
|
174
166
|
if relative_uri.to_s != ''
|
@@ -207,7 +199,7 @@ module FeedTools
|
|
207
199
|
end
|
208
200
|
normalized_url = normalize_url(url)
|
209
201
|
require 'uuidtools'
|
210
|
-
return UUID.sha1_create(UUID_URL_NAMESPACE, normalized_url).
|
202
|
+
return UUID.sha1_create(UUID_URL_NAMESPACE, normalized_url).to_uri.to_s
|
211
203
|
end
|
212
204
|
|
213
205
|
# Returns true if the parameter appears to be a valid uri
|