feedtools 0.2.26 → 0.2.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +232 -216
- data/db/migration.rb +2 -0
- data/db/schema.mysql.sql +2 -0
- data/db/schema.postgresql.sql +3 -1
- data/db/schema.sqlite.sql +3 -1
- data/lib/feed_tools.rb +37 -14
- data/lib/feed_tools/database_feed_cache.rb +13 -2
- data/lib/feed_tools/feed.rb +430 -104
- data/lib/feed_tools/feed_item.rb +533 -268
- data/lib/feed_tools/helpers/generic_helper.rb +1 -1
- data/lib/feed_tools/helpers/html_helper.rb +78 -116
- data/lib/feed_tools/helpers/retrieval_helper.rb +33 -3
- data/lib/feed_tools/helpers/uri_helper.rb +46 -54
- data/lib/feed_tools/monkey_patch.rb +27 -1
- data/lib/feed_tools/vendor/html5/History.txt +10 -0
- data/lib/feed_tools/vendor/html5/Manifest.txt +117 -0
- data/lib/feed_tools/vendor/html5/README +45 -0
- data/lib/feed_tools/vendor/html5/Rakefile.rb +33 -0
- data/lib/feed_tools/vendor/html5/bin/html5 +217 -0
- data/lib/feed_tools/vendor/html5/lib/core_ext/string.rb +17 -0
- data/lib/feed_tools/vendor/html5/lib/html5.rb +13 -0
- data/lib/feed_tools/vendor/html5/lib/html5/constants.rb +1046 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/base.rb +10 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/inject_meta_charset.rb +82 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/iso639codes.rb +752 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/optionaltags.rb +198 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc2046.rb +30 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc3987.rb +89 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/sanitizer.rb +15 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/validator.rb +830 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/whitespace.rb +36 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser.rb +248 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_body_phase.rb +46 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_frameset_phase.rb +33 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_head_phase.rb +50 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/before_head_phase.rb +41 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_body_phase.rb +613 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_caption_phase.rb +69 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_cell_phase.rb +78 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_column_group_phase.rb +55 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_frameset_phase.rb +57 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_head_phase.rb +138 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_row_phase.rb +89 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_select_phase.rb +85 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_body_phase.rb +86 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_phase.rb +115 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/initial_phase.rb +133 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/phase.rb +154 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/root_element_phase.rb +41 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/trailing_end_phase.rb +35 -0
- data/lib/feed_tools/vendor/html5/lib/html5/inputstream.rb +648 -0
- data/lib/feed_tools/vendor/html5/lib/html5/liberalxmlparser.rb +158 -0
- data/lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb +188 -0
- data/lib/feed_tools/vendor/html5/lib/html5/serializer.rb +2 -0
- data/lib/feed_tools/vendor/html5/lib/html5/serializer/htmlserializer.rb +179 -0
- data/lib/feed_tools/vendor/html5/lib/html5/serializer/xhtmlserializer.rb +20 -0
- data/lib/feed_tools/vendor/html5/lib/html5/sniffer.rb +45 -0
- data/lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb +966 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treebuilders.rb +24 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/base.rb +334 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/hpricot.rb +231 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/rexml.rb +209 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/simpletree.rb +185 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treewalkers.rb +26 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/base.rb +162 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/hpricot.rb +48 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/rexml.rb +48 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/simpletree.rb +48 -0
- data/lib/feed_tools/vendor/html5/lib/html5/version.rb +3 -0
- data/lib/feed_tools/vendor/html5/testdata/encoding/chardet/test_big5.txt +51 -0
- data/lib/feed_tools/vendor/html5/testdata/encoding/test-yahoo-jp.dat +10 -0
- data/lib/feed_tools/vendor/html5/testdata/encoding/tests1.dat +394 -0
- data/lib/feed_tools/vendor/html5/testdata/encoding/tests2.dat +81 -0
- data/lib/feed_tools/vendor/html5/testdata/sanitizer/tests1.dat +416 -0
- data/lib/feed_tools/vendor/html5/testdata/serializer/core.test +104 -0
- data/lib/feed_tools/vendor/html5/testdata/serializer/injectmeta.test +65 -0
- data/lib/feed_tools/vendor/html5/testdata/serializer/optionaltags.test +900 -0
- data/lib/feed_tools/vendor/html5/testdata/serializer/options.test +60 -0
- data/lib/feed_tools/vendor/html5/testdata/serializer/whitespace.test +51 -0
- data/lib/feed_tools/vendor/html5/testdata/sites/google-results.htm +1 -0
- data/lib/feed_tools/vendor/html5/testdata/sites/python-ref-import.htm +1 -0
- data/lib/feed_tools/vendor/html5/testdata/sites/web-apps-old.htm +1 -0
- data/lib/feed_tools/vendor/html5/testdata/sites/web-apps.htm +34275 -0
- data/lib/feed_tools/vendor/html5/testdata/sniffer/htmlOrFeed.json +43 -0
- data/lib/feed_tools/vendor/html5/testdata/tokenizer/contentModelFlags.test +48 -0
- data/lib/feed_tools/vendor/html5/testdata/tokenizer/entities.test +2339 -0
- data/lib/feed_tools/vendor/html5/testdata/tokenizer/escapeFlag.test +21 -0
- data/lib/feed_tools/vendor/html5/testdata/tokenizer/test1.test +172 -0
- data/lib/feed_tools/vendor/html5/testdata/tokenizer/test2.test +129 -0
- data/lib/feed_tools/vendor/html5/testdata/tokenizer/test3.test +367 -0
- data/lib/feed_tools/vendor/html5/testdata/tokenizer/test4.test +198 -0
- data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests1.dat +1950 -0
- data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests2.dat +773 -0
- data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests3.dat +270 -0
- data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests4.dat +60 -0
- data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests5.dat +175 -0
- data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests6.dat +196 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/attributes.test +1035 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/base-href-attribute.test +787 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/base-target-attribute.test +35 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/blockquote-cite-attribute.test +7 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/classattribute.test +152 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/contenteditableattribute.test +59 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/contextmenuattribute.test +115 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/dirattribute.test +59 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/draggableattribute.test +63 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/html-xmlns-attribute.test +23 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/idattribute.test +115 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/inputattributes.test +2795 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/irrelevantattribute.test +63 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/langattribute.test +5579 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/li-value-attribute.test +7 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/link-href-attribute.test +7 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/link-hreflang-attribute.test +7 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/link-rel-attribute.test +271 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/ol-start-attribute.test +7 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/starttags.test +375 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/style-scoped-attribute.test +7 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/tabindexattribute.test +79 -0
- data/lib/feed_tools/vendor/html5/tests/preamble.rb +72 -0
- data/lib/feed_tools/vendor/html5/tests/test_encoding.rb +35 -0
- data/lib/feed_tools/vendor/html5/tests/test_lxp.rb +279 -0
- data/lib/feed_tools/vendor/html5/tests/test_parser.rb +68 -0
- data/lib/feed_tools/vendor/html5/tests/test_sanitizer.rb +142 -0
- data/lib/feed_tools/vendor/html5/tests/test_serializer.rb +68 -0
- data/lib/feed_tools/vendor/html5/tests/test_sniffer.rb +27 -0
- data/lib/feed_tools/vendor/html5/tests/test_stream.rb +62 -0
- data/lib/feed_tools/vendor/html5/tests/test_tokenizer.rb +94 -0
- data/lib/feed_tools/vendor/html5/tests/test_treewalkers.rb +135 -0
- data/lib/feed_tools/vendor/html5/tests/test_validator.rb +31 -0
- data/lib/feed_tools/vendor/html5/tests/tokenizer_test_parser.rb +63 -0
- data/lib/feed_tools/vendor/uri.rb +781 -0
- data/lib/feed_tools/version.rb +1 -1
- data/rakefile +27 -6
- data/test/unit/atom_test.rb +298 -210
- data/test/unit/helper_test.rb +7 -12
- data/test/unit/rdf_test.rb +51 -1
- data/test/unit/rss_test.rb +13 -3
- metadata +239 -116
- data/lib/feed_tools/vendor/htree.rb +0 -97
- data/lib/feed_tools/vendor/htree/container.rb +0 -10
- data/lib/feed_tools/vendor/htree/context.rb +0 -67
- data/lib/feed_tools/vendor/htree/display.rb +0 -27
- data/lib/feed_tools/vendor/htree/doc.rb +0 -149
- data/lib/feed_tools/vendor/htree/elem.rb +0 -262
- data/lib/feed_tools/vendor/htree/encoder.rb +0 -163
- data/lib/feed_tools/vendor/htree/equality.rb +0 -218
- data/lib/feed_tools/vendor/htree/extract_text.rb +0 -37
- data/lib/feed_tools/vendor/htree/fstr.rb +0 -33
- data/lib/feed_tools/vendor/htree/gencode.rb +0 -97
- data/lib/feed_tools/vendor/htree/htmlinfo.rb +0 -672
- data/lib/feed_tools/vendor/htree/inspect.rb +0 -108
- data/lib/feed_tools/vendor/htree/leaf.rb +0 -94
- data/lib/feed_tools/vendor/htree/loc.rb +0 -367
- data/lib/feed_tools/vendor/htree/modules.rb +0 -48
- data/lib/feed_tools/vendor/htree/name.rb +0 -124
- data/lib/feed_tools/vendor/htree/output.rb +0 -207
- data/lib/feed_tools/vendor/htree/parse.rb +0 -409
- data/lib/feed_tools/vendor/htree/raw_string.rb +0 -124
- data/lib/feed_tools/vendor/htree/regexp-util.rb +0 -15
- data/lib/feed_tools/vendor/htree/rexml.rb +0 -130
- data/lib/feed_tools/vendor/htree/scan.rb +0 -166
- data/lib/feed_tools/vendor/htree/tag.rb +0 -111
- data/lib/feed_tools/vendor/htree/template.rb +0 -909
- data/lib/feed_tools/vendor/htree/text.rb +0 -115
- data/lib/feed_tools/vendor/htree/traverse.rb +0 -465
|
@@ -28,6 +28,30 @@ require 'rexml/document'
|
|
|
28
28
|
module FeedTools
|
|
29
29
|
# Methods for pulling remote data
|
|
30
30
|
module HtmlHelper
|
|
31
|
+
|
|
32
|
+
TIDY_OPTIONS = [
|
|
33
|
+
:add_xml_decl, :add_xml_space, :alt_text, :assume_xml_procins, :bare,
|
|
34
|
+
:clean, :css_prefix, :decorate_inferred_ul, :doctype,
|
|
35
|
+
:drop_empty_paras, :drop_font_tags, :drop_proprietary_attributes,
|
|
36
|
+
:enclose_block_text, :enclose_text, :escape_cdata, :fix_backslash,
|
|
37
|
+
:fix_bad_comments, :fix_uri, :hide_comments, :hide_endtags,
|
|
38
|
+
:indent_cdata, :input_xml, :join_classes, :join_styles,
|
|
39
|
+
:literal_attributes, :logical_emphasis, :lower_literals, :merge_divs,
|
|
40
|
+
:ncr, :new_blocklevel_tags, :new_empty_tags, :new_inline_tags,
|
|
41
|
+
:new_pre_tags, :numeric_entities, :output_html, :output_xhtml,
|
|
42
|
+
:output_xml, :preserve_entities, :quote_ampersand, :quote_marks,
|
|
43
|
+
:quote_nbsp, :repeated_attributes, :replace_color, :show_body_only,
|
|
44
|
+
:uppercase_attributes, :uppercase_tags, :word_2000,
|
|
45
|
+
:accessibility_check, :show_errors, :show_warnings, :break_before_br,
|
|
46
|
+
:indent, :indent_attributes, :indent_spaces, :markup,
|
|
47
|
+
:punctuation_wrap, :split, :tab_size, :vertical_space, :wrap,
|
|
48
|
+
:wrap_asp, :wrap_attributes, :wrap_jste, :wrap_php,
|
|
49
|
+
:wrap_script_literals, :wrap_sections, :ascii_chars, :char_encoding,
|
|
50
|
+
:input_encoding, :language, :newline, :output_bom, :output_encoding,
|
|
51
|
+
:error_file, :force_output, :gnu_emacs, :gnu_emacs_file, :keep_time,
|
|
52
|
+
:output_file, :quiet, :slide_style, :tidy_mark, :write_back
|
|
53
|
+
]
|
|
54
|
+
|
|
31
55
|
# Escapes all html entities
|
|
32
56
|
def self.escape_entities(html)
|
|
33
57
|
return nil if html.nil?
|
|
@@ -43,9 +67,12 @@ module FeedTools
|
|
|
43
67
|
unescaped_html = html
|
|
44
68
|
unescaped_html.gsub!(/&/, "&")
|
|
45
69
|
unescaped_html.gsub!(/&/, "&")
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
70
|
+
substitute_numerical_entities = Proc.new do |s|
|
|
71
|
+
m = $1
|
|
72
|
+
m = "0#{m}" if m[0] == ?x
|
|
73
|
+
[Integer(m)].pack('U*')
|
|
74
|
+
end
|
|
75
|
+
unescaped_html.gsub!(/&#0*((?:\d+)|(?:x[a-f0-9]+));/, &substitute_numerical_entities)
|
|
49
76
|
unescaped_html = CGI.unescapeHTML(unescaped_html)
|
|
50
77
|
unescaped_html.gsub!(/'/, "'")
|
|
51
78
|
unescaped_html.gsub!(/"/, "\"")
|
|
@@ -140,7 +167,7 @@ module FeedTools
|
|
|
140
167
|
end
|
|
141
168
|
for path in libtidy_locations
|
|
142
169
|
if File.exists? path
|
|
143
|
-
if File.ftype(path) == "file"
|
|
170
|
+
if File.ftype(path) == "file" || File.ftype(path) == "link"
|
|
144
171
|
Tidy.path = path
|
|
145
172
|
@tidy_enabled = true
|
|
146
173
|
break
|
|
@@ -178,12 +205,18 @@ module FeedTools
|
|
|
178
205
|
# Tidys up the html
|
|
179
206
|
def self.tidy_html(html, options = {})
|
|
180
207
|
return nil if html.nil?
|
|
208
|
+
FeedTools::GenericHelper.validate_options(TIDY_OPTIONS, options.keys)
|
|
181
209
|
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
210
|
+
options = {
|
|
211
|
+
:add_xml_decl => false,
|
|
212
|
+
:char_encoding => "utf8",
|
|
213
|
+
:doctype => "omit",
|
|
214
|
+
:indent => false,
|
|
215
|
+
:logical_emphasis => true,
|
|
216
|
+
:markup => true,
|
|
217
|
+
:show_warnings => false,
|
|
218
|
+
:wrap => 0
|
|
219
|
+
}.merge(options)
|
|
187
220
|
|
|
188
221
|
if FeedTools::HtmlHelper.tidy_enabled?
|
|
189
222
|
is_fragment = true
|
|
@@ -196,39 +229,26 @@ module FeedTools
|
|
|
196
229
|
is_fragment = false
|
|
197
230
|
end
|
|
198
231
|
|
|
232
|
+
options[:show_body_only] = true if is_fragment
|
|
233
|
+
|
|
199
234
|
# Tidy sucks?
|
|
200
235
|
# TODO: find the correct set of tidy options to set so
|
|
201
236
|
# that *ugly* hacks like this aren't necessary.
|
|
202
237
|
html = html.gsub(/\302\240/, "\240")
|
|
203
238
|
|
|
204
|
-
tidy_html = Tidy.open(
|
|
205
|
-
tidy.options.output_xml = true
|
|
206
|
-
tidy.options.markup = true
|
|
207
|
-
tidy.options.indent = true
|
|
208
|
-
tidy.options.wrap = 0
|
|
209
|
-
tidy.options.logical_emphasis = true
|
|
210
|
-
tidy.options.input_encoding = options[:input_encoding]
|
|
211
|
-
tidy.options.output_encoding = options[:output_encoding]
|
|
212
|
-
tidy.options.doctype = "omit"
|
|
239
|
+
tidy_html = Tidy.open(options) do |tidy|
|
|
213
240
|
xml = tidy.clean(html)
|
|
214
241
|
xml
|
|
215
242
|
end
|
|
216
|
-
|
|
217
|
-
# Tidy sticks <html>...<body>[our html]</body>...</html> in.
|
|
218
|
-
# We don't want this.
|
|
219
|
-
tidy_html.strip!
|
|
220
|
-
tidy_html.gsub!(/^<html>(.|\n)*<body>/, "")
|
|
221
|
-
tidy_html.gsub!(/<\/body>(.|\n)*<\/html>$/, "")
|
|
222
|
-
tidy_html.gsub!("\t", " ")
|
|
223
|
-
tidy_html = FeedTools::HtmlHelper.unindent(tidy_html, 4)
|
|
224
|
-
tidy_html.strip!
|
|
225
|
-
end
|
|
243
|
+
tidy_html.strip!
|
|
226
244
|
else
|
|
227
245
|
tidy_html = html
|
|
228
246
|
end
|
|
247
|
+
|
|
229
248
|
if tidy_html.blank? && !html.blank?
|
|
230
249
|
tidy_html = html.strip
|
|
231
250
|
end
|
|
251
|
+
|
|
232
252
|
return tidy_html
|
|
233
253
|
end
|
|
234
254
|
|
|
@@ -260,77 +280,6 @@ module FeedTools
|
|
|
260
280
|
return buffer
|
|
261
281
|
end
|
|
262
282
|
|
|
263
|
-
# Removes all dangerous html tags from the html formatted text.
|
|
264
|
-
# If mode is set to :escape, dangerous and unknown elements will
|
|
265
|
-
# be escaped. If mode is set to :strip, dangerous and unknown
|
|
266
|
-
# elements and all children will be removed entirely.
|
|
267
|
-
# Dangerous or unknown attributes are always removed.
|
|
268
|
-
def self.sanitize_html(html, mode=:strip)
|
|
269
|
-
return nil if html.nil?
|
|
270
|
-
|
|
271
|
-
# Lists borrowed from Mark Pilgrim's feedparser
|
|
272
|
-
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
|
|
273
|
-
'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
|
|
274
|
-
'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl',
|
|
275
|
-
'dt', 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4',
|
|
276
|
-
'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend',
|
|
277
|
-
'li', 'map', 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's',
|
|
278
|
-
'samp', 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup',
|
|
279
|
-
'table', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt',
|
|
280
|
-
'u', 'ul', 'var']
|
|
281
|
-
|
|
282
|
-
acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
|
|
283
|
-
'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
|
|
284
|
-
'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
|
|
285
|
-
'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
|
|
286
|
-
'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
|
|
287
|
-
'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
|
|
288
|
-
'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
|
|
289
|
-
'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
|
|
290
|
-
'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
|
|
291
|
-
'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
|
|
292
|
-
'type', 'usemap', 'valign', 'value', 'vspace', 'width']
|
|
293
|
-
|
|
294
|
-
# Replace with appropriate named entities
|
|
295
|
-
html.gsub!(/&/, "&")
|
|
296
|
-
html.gsub!(/&/, "&")
|
|
297
|
-
html.gsub!(/<!'/, "&lt;!'")
|
|
298
|
-
|
|
299
|
-
# Hackity hack. But it works, and it seems plenty fast enough.
|
|
300
|
-
html_doc = HTree.parse_xml("<root>" + html + "</root>").to_rexml
|
|
301
|
-
|
|
302
|
-
sanitize_node = lambda do |html_node|
|
|
303
|
-
if html_node.respond_to? :children
|
|
304
|
-
for child in html_node.children
|
|
305
|
-
if child.kind_of? REXML::Element
|
|
306
|
-
unless acceptable_elements.include? child.name.downcase
|
|
307
|
-
if mode == :strip
|
|
308
|
-
html_node.delete_element(child)
|
|
309
|
-
else
|
|
310
|
-
new_child = REXML::Text.new(CGI.escapeHTML(child.to_s))
|
|
311
|
-
html_node.insert_after(child, new_child)
|
|
312
|
-
html_node.delete_element(child)
|
|
313
|
-
end
|
|
314
|
-
end
|
|
315
|
-
child.attributes.each_attribute do |attribute|
|
|
316
|
-
if !(attribute.value =~ /^xmlns(:.+)?$/)
|
|
317
|
-
unless acceptable_attributes.include?(
|
|
318
|
-
attribute.value.downcase)
|
|
319
|
-
child.delete_attribute(attribute.value)
|
|
320
|
-
end
|
|
321
|
-
end
|
|
322
|
-
end
|
|
323
|
-
end
|
|
324
|
-
sanitize_node.call(child)
|
|
325
|
-
end
|
|
326
|
-
end
|
|
327
|
-
html_node
|
|
328
|
-
end
|
|
329
|
-
sanitize_node.call(html_doc.root)
|
|
330
|
-
html = html_doc.root.inner_xml
|
|
331
|
-
return html
|
|
332
|
-
end
|
|
333
|
-
|
|
334
283
|
# Returns true if the type string provided indicates that something is
|
|
335
284
|
# xml or xhtml content.
|
|
336
285
|
def self.xml_type?(type)
|
|
@@ -405,20 +354,35 @@ module FeedTools
|
|
|
405
354
|
["q", "cite"],
|
|
406
355
|
["script", "src"]
|
|
407
356
|
]
|
|
408
|
-
html_doc = HTree.parse_xml("<root>" + html + "</root>").to_rexml
|
|
409
357
|
|
|
358
|
+
# HACK: Prevent the parser from freaking out if it sees this:
|
|
359
|
+
html.gsub!(/<!'/, "<!'")
|
|
360
|
+
|
|
361
|
+
if FeedTools.configurations[:sanitization_enabled]
|
|
362
|
+
fragments = HTML5::HTMLParser.parse_fragment(
|
|
363
|
+
html, :tokenizer => HTML5::HTMLSanitizer)
|
|
364
|
+
else
|
|
365
|
+
fragments = HTML5::HTMLParser.parse_fragment(html)
|
|
366
|
+
end
|
|
410
367
|
resolve_node = lambda do |html_node|
|
|
411
368
|
if html_node.kind_of? REXML::Element
|
|
412
|
-
for
|
|
413
|
-
if html_node.name.downcase ==
|
|
414
|
-
attribute = html_node.attribute(
|
|
369
|
+
for element_name, attribute_name in relative_uri_attributes
|
|
370
|
+
if html_node.name.downcase == element_name
|
|
371
|
+
attribute = html_node.attribute(attribute_name)
|
|
415
372
|
if attribute != nil
|
|
416
373
|
href = attribute.value
|
|
417
374
|
href = FeedTools::UriHelper.resolve_relative_uri(
|
|
418
375
|
href, [html_node.base_uri] | base_uri_sources)
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
376
|
+
href = FeedTools::UriHelper.normalize_url(href)
|
|
377
|
+
html_node.attribute(attribute_name).instance_variable_set(
|
|
378
|
+
"@value", href)
|
|
379
|
+
html_node.attribute(attribute_name).instance_variable_set(
|
|
380
|
+
"@unnormalized", href)
|
|
381
|
+
html_node.attribute(attribute_name).instance_variable_set(
|
|
382
|
+
"@normalized", href)
|
|
383
|
+
if html_node.attribute(attribute_name).value != href
|
|
384
|
+
warn("Failed to update href to resolved value.")
|
|
385
|
+
end
|
|
422
386
|
end
|
|
423
387
|
end
|
|
424
388
|
end
|
|
@@ -430,8 +394,12 @@ module FeedTools
|
|
|
430
394
|
end
|
|
431
395
|
html_node
|
|
432
396
|
end
|
|
433
|
-
|
|
434
|
-
|
|
397
|
+
fragments.each do |fragment|
|
|
398
|
+
resolve_node.call(fragment)
|
|
399
|
+
end
|
|
400
|
+
html = (fragments.map do |stuff|
|
|
401
|
+
stuff.to_s
|
|
402
|
+
end).join("")
|
|
435
403
|
return html
|
|
436
404
|
end
|
|
437
405
|
|
|
@@ -552,22 +520,16 @@ module FeedTools
|
|
|
552
520
|
content = FeedTools::HtmlHelper.unescape_entities(
|
|
553
521
|
content_node.inner_xml.strip)
|
|
554
522
|
else
|
|
555
|
-
content =
|
|
556
|
-
|
|
523
|
+
content = FeedTools::HtmlHelper.unescape_entities(
|
|
524
|
+
content_node.inner_xml.strip)
|
|
557
525
|
end
|
|
558
526
|
if type == "text" || mode == "text" ||
|
|
559
527
|
type == "text/plain" || mode == "text/plain"
|
|
560
528
|
content = FeedTools::HtmlHelper.escape_entities(content)
|
|
561
529
|
end
|
|
562
530
|
unless content.nil?
|
|
563
|
-
if FeedTools.configurations[:sanitization_enabled]
|
|
564
|
-
content = FeedTools::HtmlHelper.sanitize_html(content, :strip)
|
|
565
|
-
end
|
|
566
531
|
content = FeedTools::HtmlHelper.resolve_relative_uris(content,
|
|
567
532
|
[content_node.base_uri] | base_uri_sources)
|
|
568
|
-
if repair_entities
|
|
569
|
-
content = FeedTools::HtmlHelper.unescape_entities(content)
|
|
570
|
-
end
|
|
571
533
|
content = FeedTools::HtmlHelper.tidy_html(content)
|
|
572
534
|
end
|
|
573
535
|
if FeedTools.configurations[:tab_spaces] != nil
|
|
@@ -108,21 +108,51 @@ module FeedTools
|
|
|
108
108
|
proxy_user = nil
|
|
109
109
|
proxy_password = nil
|
|
110
110
|
|
|
111
|
+
auth_user = nil
|
|
112
|
+
auth_password = nil
|
|
113
|
+
auth_scheme = nil
|
|
114
|
+
|
|
111
115
|
if options[:feed_object] != nil
|
|
112
116
|
proxy_address =
|
|
113
117
|
options[:feed_object].configurations[:proxy_address] || nil
|
|
114
118
|
proxy_port =
|
|
115
119
|
options[:feed_object].configurations[:proxy_port].to_i || nil
|
|
116
120
|
proxy_user =
|
|
117
|
-
options[:feed_object].configurations[:proxy_user]
|
|
121
|
+
options[:feed_object].configurations[:proxy_user] || nil
|
|
118
122
|
proxy_password =
|
|
119
|
-
options[:feed_object].configurations[:proxy_password]
|
|
123
|
+
options[:feed_object].configurations[:proxy_password] || nil
|
|
124
|
+
|
|
125
|
+
auth_user =
|
|
126
|
+
options[:feed_object].configurations[:auth_user] || nil
|
|
127
|
+
auth_password =
|
|
128
|
+
options[:feed_object].configurations[:auth_password] || nil
|
|
129
|
+
auth_scheme =
|
|
130
|
+
options[:feed_object].configurations[:auth_scheme] || nil
|
|
120
131
|
end
|
|
121
132
|
|
|
133
|
+
if (auth_user &&
|
|
134
|
+
(auth_scheme == nil || auth_scheme.to_s.to_sym == :basic))
|
|
135
|
+
options[:request_headers]["Authorization"] =
|
|
136
|
+
"Basic " + [
|
|
137
|
+
"#{auth_user}:#{auth_password}"
|
|
138
|
+
].pack('m').delete("\r\n")
|
|
139
|
+
end
|
|
140
|
+
|
|
122
141
|
# No need to check for nil
|
|
123
142
|
http = Net::HTTP::Proxy(
|
|
124
143
|
proxy_address, proxy_port, proxy_user, proxy_password).new(
|
|
125
144
|
uri.host, (uri.port or 80))
|
|
145
|
+
|
|
146
|
+
if options[:feed_object] != nil &&
|
|
147
|
+
options[:feed_object].configurations[:http_timeout] != nil
|
|
148
|
+
http.open_timeout =
|
|
149
|
+
options[:feed_object].configurations[:http_timeout].to_f
|
|
150
|
+
elsif FeedTools.configurations[:http_timeout] != nil
|
|
151
|
+
http.open_timeout = FeedTools.configurations[:http_timeout].to_f
|
|
152
|
+
end
|
|
153
|
+
if http.open_timeout != nil && http.open_timeout == 0
|
|
154
|
+
http.open_timeout = nil
|
|
155
|
+
end
|
|
126
156
|
|
|
127
157
|
path = uri.path
|
|
128
158
|
path += ('?' + uri.query) if uri.query
|
|
@@ -238,4 +268,4 @@ module FeedTools
|
|
|
238
268
|
:head, url, options, &block)
|
|
239
269
|
end
|
|
240
270
|
end
|
|
241
|
-
end
|
|
271
|
+
end
|
|
@@ -59,13 +59,33 @@ module FeedTools
|
|
|
59
59
|
# to be. Also translates from the feed: and rss: pseudo-protocols to the
|
|
60
60
|
# http: protocol.
|
|
61
61
|
def self.normalize_url(url)
|
|
62
|
-
if url.
|
|
62
|
+
if url.nil?
|
|
63
|
+
return nil
|
|
64
|
+
end
|
|
65
|
+
if !url.kind_of?(String)
|
|
63
66
|
url = url.to_s
|
|
64
67
|
end
|
|
65
68
|
if url.blank?
|
|
66
|
-
return
|
|
69
|
+
return ""
|
|
70
|
+
end
|
|
71
|
+
normalized_url = url.strip
|
|
72
|
+
|
|
73
|
+
begin
|
|
74
|
+
normalized_url =
|
|
75
|
+
FeedTools::URI.convert_path(normalized_url.strip).normalize.to_s
|
|
76
|
+
rescue Exception
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
begin
|
|
80
|
+
begin
|
|
81
|
+
normalized_url =
|
|
82
|
+
FeedTools::URI.parse(normalized_url.strip).normalize.to_s
|
|
83
|
+
rescue Exception
|
|
84
|
+
normalized_url = CGI.unescape(url.strip)
|
|
85
|
+
end
|
|
86
|
+
rescue Exception
|
|
87
|
+
normalized_url = url.strip
|
|
67
88
|
end
|
|
68
|
-
normalized_url = CGI.unescape(url.strip)
|
|
69
89
|
|
|
70
90
|
# if a url begins with the '/' character, it only makes sense that they
|
|
71
91
|
# meant to be using a file:// url. Fix it for them.
|
|
@@ -90,76 +110,40 @@ module FeedTools
|
|
|
90
110
|
# deal with all of the many ugly possibilities involved in the rss:
|
|
91
111
|
# and feed: pseudo-protocols (incidentally, whose crazy idea was this
|
|
92
112
|
# mess?)
|
|
113
|
+
normalized_url.gsub!(/^htp:\/*/i, "http://")
|
|
93
114
|
normalized_url.gsub!(/^http:\/*(feed:\/*)?/i, "http://")
|
|
94
115
|
normalized_url.gsub!(/^http:\/*(rss:\/*)?/i, "http://")
|
|
95
116
|
normalized_url.gsub!(/^feed:\/*(http:\/*)?/i, "http://")
|
|
96
117
|
normalized_url.gsub!(/^rss:\/*(http:\/*)?/i, "http://")
|
|
97
118
|
normalized_url.gsub!(/^file:\/*/i, "file:///")
|
|
98
119
|
normalized_url.gsub!(/^https:\/*/i, "https://")
|
|
120
|
+
normalized_url.gsub!(/^mms:\/*/i, "http://")
|
|
99
121
|
# fix (very) bad urls (usually of the user-entered sort)
|
|
100
122
|
normalized_url.gsub!(/^http:\/*(http:\/*)*/i, "http://")
|
|
123
|
+
normalized_url.gsub!(/^http:\/*$/i, "")
|
|
101
124
|
|
|
102
125
|
if (normalized_url =~ /^file:/i) == 0
|
|
103
126
|
# Adjust windows-style urls
|
|
104
127
|
normalized_url.gsub!(/^file:\/\/\/([a-zA-Z])\|/i, 'file:///\1:')
|
|
105
128
|
normalized_url.gsub!(/\\/, '/')
|
|
106
129
|
else
|
|
107
|
-
if (normalized_url
|
|
130
|
+
if FeedTools::URI.parse(normalized_url).scheme == nil &&
|
|
131
|
+
normalized_url =~ /\./ &&
|
|
108
132
|
normalized_url = "http://" + normalized_url
|
|
109
133
|
end
|
|
110
134
|
if normalized_url == "http://"
|
|
111
135
|
return nil
|
|
112
136
|
end
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
host_part =
|
|
120
|
-
IDN::Idna.toASCII(host_part)
|
|
121
|
-
end
|
|
122
|
-
new_path = ""
|
|
123
|
-
for index in 0...path.size
|
|
124
|
-
if path[index] <= 32 || path[index] >= 126
|
|
125
|
-
new_path << ("%" + path[index].to_s(16).upcase)
|
|
126
|
-
else
|
|
127
|
-
new_path << path[index..index]
|
|
128
|
-
end
|
|
129
|
-
end
|
|
130
|
-
path = new_path
|
|
131
|
-
normalized_url = scheme + "://" + host_part + "/" + path
|
|
132
|
-
end
|
|
133
|
-
rescue Object
|
|
134
|
-
end
|
|
135
|
-
begin
|
|
136
|
-
feed_uri = URI.parse(normalized_url)
|
|
137
|
-
if feed_uri.scheme == nil
|
|
138
|
-
feed_uri.scheme = "http"
|
|
139
|
-
end
|
|
140
|
-
if feed_uri.path.blank?
|
|
141
|
-
feed_uri.path = "/"
|
|
142
|
-
end
|
|
143
|
-
if (feed_uri.path =~ /^[\/]+/) == 0
|
|
144
|
-
feed_uri.path.gsub!(/^[\/]+/, "/")
|
|
145
|
-
end
|
|
146
|
-
while (feed_uri.path =~ /^\/\.\./)
|
|
147
|
-
feed_uri.path.gsub!(/^\/\.\./, "")
|
|
148
|
-
end
|
|
149
|
-
if feed_uri.path.blank?
|
|
150
|
-
feed_uri.path = "/"
|
|
151
|
-
end
|
|
152
|
-
feed_uri.host.downcase!
|
|
153
|
-
normalized_url = feed_uri.to_s
|
|
154
|
-
rescue URI::InvalidURIError
|
|
155
|
-
end
|
|
137
|
+
end
|
|
138
|
+
if normalized_url =~ /^https?:\/\/#/i
|
|
139
|
+
normalized_url.gsub!(/^https?:\/\/#/i, "#")
|
|
140
|
+
end
|
|
141
|
+
if normalized_url =~ /^https?:\/\/\?/i
|
|
142
|
+
normalized_url.gsub!(/^https?:\/\/\?/i, "?")
|
|
156
143
|
end
|
|
157
144
|
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
normalized_url.gsub!(/%20/, " ")
|
|
161
|
-
normalized_url.gsub!(/ /, "%20")
|
|
162
|
-
|
|
145
|
+
normalized_url =
|
|
146
|
+
FeedTools::URI.parse(normalized_url.strip).normalize.to_s
|
|
163
147
|
return normalized_url
|
|
164
148
|
end
|
|
165
149
|
|
|
@@ -168,7 +152,15 @@ module FeedTools
|
|
|
168
152
|
return relative_uri if base_uri_sources.blank?
|
|
169
153
|
return nil if relative_uri.nil?
|
|
170
154
|
begin
|
|
171
|
-
|
|
155
|
+
# Massive HACK to get around file protocol URIs being used to
|
|
156
|
+
# resolve relative URIs on feeds in the local file system.
|
|
157
|
+
# Better to leave these URIs unresolved and hope some other
|
|
158
|
+
# tool resolves them correctly.
|
|
159
|
+
base_uri_sources.reject! do |base_uri|
|
|
160
|
+
base_uri == nil ||
|
|
161
|
+
FeedTools::URI.parse(base_uri).scheme == "file"
|
|
162
|
+
end
|
|
163
|
+
base_uri = FeedTools::URI.parse(
|
|
172
164
|
FeedTools::XmlHelper.select_not_blank(base_uri_sources))
|
|
173
165
|
resolved_uri = base_uri
|
|
174
166
|
if relative_uri.to_s != ''
|
|
@@ -207,7 +199,7 @@ module FeedTools
|
|
|
207
199
|
end
|
|
208
200
|
normalized_url = normalize_url(url)
|
|
209
201
|
require 'uuidtools'
|
|
210
|
-
return UUID.sha1_create(UUID_URL_NAMESPACE, normalized_url).
|
|
202
|
+
return UUID.sha1_create(UUID_URL_NAMESPACE, normalized_url).to_uri.to_s
|
|
211
203
|
end
|
|
212
204
|
|
|
213
205
|
# Returns true if the parameter appears to be a valid uri
|