feedtools 0.2.22 → 0.2.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +28 -0
- data/README +23 -2
- data/db/migration.rb +19 -0
- data/db/schema.mysql.sql +1 -1
- data/db/schema.postgresql.sql +1 -1
- data/db/schema.sqlite.sql +1 -1
- data/lib/feed_tools.rb +71 -388
- data/lib/feed_tools/database_feed_cache.rb +4 -3
- data/lib/feed_tools/feed.rb +809 -607
- data/lib/feed_tools/feed_item.rb +551 -574
- data/lib/feed_tools/feed_structures.rb +252 -0
- data/lib/feed_tools/helpers/feed_tools_helper.rb +6 -5
- data/lib/feed_tools/helpers/generic_helper.rb +16 -158
- data/lib/feed_tools/helpers/html_helper.rb +629 -0
- data/lib/feed_tools/helpers/retrieval_helper.rb +5 -0
- data/lib/feed_tools/helpers/uri_helper.rb +223 -0
- data/lib/feed_tools/helpers/xml_helper.rb +239 -0
- data/rakefile +10 -237
- data/test/unit/amp_test.rb +102 -94
- data/test/unit/atom_test.rb +239 -6
- data/test/unit/cache_test.rb +1 -1
- data/test/unit/encoding_test.rb +5 -5
- data/test/unit/generation_test.rb +34 -1
- data/test/unit/helper_test.rb +111 -17
- data/test/unit/rss_test.rb +21 -2
- metadata +7 -3
- data/lib/feed_tools/helpers/module_helper.rb +0 -27
--- /dev/null
+++ data/lib/feed_tools/helpers/html_helper.rb
@@ -0,0 +1,629 @@
+#--
+# Copyright (c) 2005 Robert Aman
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#++
+
+require 'feed_tools'
+require 'feed_tools/helpers/xml_helper'
+require 'rexml/document'
+
+module FeedTools
+  # Methods for pulling remote data
+  module HtmlHelper
+    # Escapes all html entities
+    def self.escape_entities(html)
+      return nil if html.nil?
+      escaped_html = CGI.escapeHTML(html)
+      escaped_html.gsub!(/'/, "&apos;")
+      escaped_html.gsub!(/"/, "&quot;")
+      return escaped_html
+    end
+
+    # Unescapes all html entities
+    def self.unescape_entities(html)
+      return nil if html.nil?
+      unescaped_html = html
+      unescaped_html.gsub!(/&/, "&")
+      unescaped_html.gsub!(/&/, "&")
+      unescaped_html = unescaped_html.gsub(/&#x\d+;/) do |hex|
+        "&#" + hex[3..-2].to_i(16).to_s + ";"
+      end
+      unescaped_html = CGI.unescapeHTML(unescaped_html)
+      unescaped_html.gsub!(/&apos;/, "'")
+      unescaped_html.gsub!(/&quot;/, "\"")
+      return unescaped_html
+    end
+
+    # Removes all html tags from the html formatted text, but leaves
+    # escaped entities alone.
+    def self.strip_html_tags(html)
+      return nil if html.nil?
+      stripped_html = html
+      stripped_html.gsub!(/<\/?[^>]+>/, "")
+      return stripped_html
+    end
+
+    # Removes all html tags from the html formatted text and removes
+    # escaped entities.
+    def self.convert_html_to_plain_text(html)
+      return nil if html.nil?
+      stripped_html = html
+      stripped_html = FeedTools::HtmlHelper.strip_html_tags(stripped_html)
+      stripped_html = FeedTools::HtmlHelper.unescape_entities(stripped_html)
+      stripped_html.gsub!(/‘/, "'")
+      stripped_html.gsub!(/’/, "'")
+      stripped_html.gsub!(/“/, "\"")
+      stripped_html.gsub!(/”/, "\"")
+      return stripped_html
+    end
+
+    # Returns true if the html tidy module can be used.
+    #
+    # Obviously, you need the tidy gem installed in order to run with html
+    # tidy features turned on.
+    #
+    # This method does a fairly complicated, and probably unnecessarily
+    # desperate search for the libtidy library. If you want this thing to
+    # execute fast, the best thing to do is to set Tidy.path ahead of time.
+    # If Tidy.path is set, this method doesn't do much. If it's not set,
+    # it will do it's darnedest to find the libtidy library. If you set
+    # the LIBTIDYPATH environment variable to the libtidy library, it should
+    # be able to find it.
+    #
+    # Once the library is located, this method will run much faster.
+    def self.tidy_enabled?
+      # This is an override variable to keep tidy from being used even if it
+      # is available.
+      if FeedTools.configurations[:tidy_enabled] == false
+        return false
+      end
+      if @tidy_enabled.nil? || @tidy_enabled == false
+        @tidy_enabled = false
+        begin
+          require 'tidy'
+          if Tidy.path.nil?
+            # *Shrug*, just brute force it, I guess. There's a lot of places
+            # this thing might be hiding in, depending on platform and general
+            # sanity of the person who installed the thing. Most of these are
+            # probably unlikely, but it's not like checking unlikely locations
+            # hurts. Much. Especially if you actually find it.
+            libtidy_locations = [
+              '/usr/local/lib/libtidy.dylib',
+              '/opt/local/lib/libtidy.dylib',
+              '/usr/lib/libtidy.dylib',
+              '/usr/local/lib/tidylib.dylib',
+              '/opt/local/lib/tidylib.dylib',
+              '/usr/lib/tidylib.dylib',
+              '/usr/local/lib/tidy.dylib',
+              '/opt/local/lib/tidy.dylib',
+              '/usr/lib/tidy.dylib',
+              '/usr/local/lib/libtidy.so',
+              '/opt/local/lib/libtidy.so',
+              '/usr/lib/libtidy.so',
+              '/usr/local/lib/tidylib.so',
+              '/opt/local/lib/tidylib.so',
+              '/usr/lib/tidylib.so',
+              '/usr/local/lib/tidy.so',
+              '/opt/local/lib/tidy.so',
+              '/usr/lib/tidy.so',
+              'C:\Program Files\Tidy\tidy.dll',
+              'C:\Tidy\tidy.dll',
+              'C:\Ruby\bin\tidy.dll',
+              'C:\Ruby\tidy.dll',
+              '/usr/local/lib',
+              '/opt/local/lib',
+              '/usr/lib'
+            ]
+            # We just made this thing up, but if someone sets it, we'll
+            # go ahead and check it
+            unless ENV['LIBTIDYPATH'].nil?
+              libtidy_locations =
+                libtidy_locations.reverse.push(ENV['LIBTIDYPATH'])
+            end
+            for path in libtidy_locations
+              if File.exists? path
+                if File.ftype(path) == "file"
+                  Tidy.path = path
+                  @tidy_enabled = true
+                  break
+                elsif File.ftype(path) == "directory"
+                  # Ok, now perhaps we're getting a bit more desperate
+                  lib_paths =
+                    `find #{path} -name '*tidy*' | grep '\\.\\(so\\|dylib\\)$'`
+                  # If there's more than one, grab the first one and
+                  # hope for the best, and if it doesn't work, then blame the
+                  # user for not specifying more accurately.
+                  tidy_path = lib_paths.split("\n").first
+                  unless tidy_path.nil?
+                    Tidy.path = tidy_path
+                    @tidy_enabled = true
+                    break
+                  end
+                end
+              end
+            end
+            # Still couldn't find it.
+            unless @tidy_enabled
+              @tidy_enabled = false
+            end
+          else
+            @tidy_enabled = true
+          end
+        rescue LoadError
+          # Tidy not installed, disable features that rely on tidy.
+          @tidy_enabled = false
+        end
+      end
+      return @tidy_enabled
+    end
+
+    # Tidys up the html
+    def self.tidy_html(html, options = {})
+      return nil if html.nil?
+
+      FeedTools::GenericHelper.validate_options([ :input_encoding,
+                                                  :output_encoding ],
+                                                options.keys)
+      options = { :input_encoding => "utf-8",
+                  :output_encoding => "utf-8" }.merge(options)
+
+      if FeedTools::HtmlHelper.tidy_enabled?
+        is_fragment = true
+        html.gsub!(/<!'/, "&lt;!'")
+        if (html.strip =~ /<html>(.|\n)*<body>/) != nil ||
+            (html.strip =~ /<\/body>(.|\n)*<\/html>$/) != nil
+          is_fragment = false
+        end
+        if (html.strip =~ /<\?xml(.|\n)*\?>/) != nil
+          is_fragment = false
+        end
+
+        # Tidy sucks?
+        # TODO: find the correct set of tidy options to set so
+        # that *ugly* hacks like this aren't necessary.
+        html = html.gsub(/\302\240/, "\240")
+
+        tidy_html = Tidy.open(:show_warnings=>false) do |tidy|
+          tidy.options.output_xml = true
+          tidy.options.markup = true
+          tidy.options.indent = true
+          tidy.options.wrap = 0
+          tidy.options.logical_emphasis = true
+          tidy.options.input_encoding = options[:input_encoding]
+          tidy.options.output_encoding = options[:output_encoding]
+          tidy.options.doctype = "omit"
+          xml = tidy.clean(html)
+          xml
+        end
+        if is_fragment
+          # Tidy sticks <html>...<body>[our html]</body>...</html> in.
+          # We don't want this.
+          tidy_html.strip!
+          tidy_html.gsub!(/^<html>(.|\n)*<body>/, "")
+          tidy_html.gsub!(/<\/body>(.|\n)*<\/html>$/, "")
+          tidy_html.gsub!("\t", " ")
+          tidy_html = FeedTools::HtmlHelper.unindent(tidy_html, 4)
+          tidy_html.strip!
+        end
+      else
+        tidy_html = html
+      end
+      if tidy_html.blank? && !html.blank?
+        tidy_html = html.strip
+      end
+      return tidy_html
+    end
+
+    # Unindents a text selection by a specified number of spaces.
+    def self.unindent(text, spaces)
+      lines = text.split("\n")
+      buffer = ""
+      for line in lines
+        for index in 0...spaces
+          if line[0...1] == " "
+            line = line[1..-1]
+          else
+            break
+          end
+        end
+        buffer << line << "\n"
+      end
+      return buffer
+    end
+
+    # Removes all dangerous html tags from the html formatted text.
+    # If mode is set to :escape, dangerous and unknown elements will
+    # be escaped. If mode is set to :strip, dangerous and unknown
+    # elements and all children will be removed entirely.
+    # Dangerous or unknown attributes are always removed.
+    def self.sanitize_html(html, mode=:strip)
+      return nil if html.nil?
+
+      # Lists borrowed from Mark Pilgrim's feedparser
+      acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
+        'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
+        'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl',
+        'dt', 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4',
+        'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend',
+        'li', 'map', 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's',
+        'samp', 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup',
+        'table', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt',
+        'u', 'ul', 'var']
+
+      acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
+        'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
+        'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
+        'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
+        'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
+        'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
+        'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
+        'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
+        'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
+        'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
+        'type', 'usemap', 'valign', 'value', 'vspace', 'width']
+
+      # Replace with appropriate named entities
+      html.gsub!(/&/, "&")
+      html.gsub!(/&/, "&")
+      html.gsub!(/<!'/, "&lt;!'")
+
+      # Hackity hack. But it works, and it seems plenty fast enough.
+      html_doc = HTree.parse_xml("<root>" + html + "</root>").to_rexml
+
+      sanitize_node = lambda do |html_node|
+        if html_node.respond_to? :children
+          for child in html_node.children
+            if child.kind_of? REXML::Element
+              unless acceptable_elements.include? child.name.downcase
+                if mode == :strip
+                  html_node.delete_element(child)
+                else
+                  new_child = REXML::Text.new(CGI.escapeHTML(child.to_s))
+                  html_node.insert_after(child, new_child)
+                  html_node.delete_element(child)
+                end
+              end
+              for attribute in child.attributes.keys
+                if !(attribute =~ /^xmlns/)
+                  unless acceptable_attributes.include? attribute.downcase
+                    child.delete_attribute(attribute)
+                  end
+                end
+              end
+            end
+            sanitize_node.call(child)
+          end
+        end
+        html_node
+      end
+      sanitize_node.call(html_doc.root)
+      html = html_doc.root.inner_xml
+      return html
+    end
+
+    # Returns true if the type string provided indicates that something is
+    # xml or xhtml content.
+    def self.xml_type?(type)
+      if [
+        "xml",
+        "xhtml",
+        "application/xhtml+xml"
+      ].include?(type)
+        return true
+      elsif type != nil && type[-3..-1] == "xml"
+        return true
+      else
+        return false
+      end
+    end
+
+    # Returns true if the type string provided indicates that something is
+    # html or xhtml content.
+    def self.text_type?(type)
+      return [
+        "text",
+        "text/plain"
+      ].include?(type)
+    end
+
+    # Returns true if the type string provided indicates that something is
+    # html or xhtml content.
+    def self.html_type?(type)
+      return [
+        "html",
+        "xhtml",
+        "text/html",
+        "application/xhtml+xml"
+      ].include?(type)
+    end
+
+    # Returns true if the type string provided indicates that something is
+    # only html (not xhtml) content.
+    def self.only_html_type?(type)
+      return [
+        "html",
+        "text/html"
+      ].include?(type)
+    end
+
+    # can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
+
+    # Resolves all relative uris in a block of html.
+    def self.resolve_relative_uris(html, base_uri_sources=[])
+      relative_uri_attributes = [
+        ["a", "href"],
+        ["applet", "codebase"],
+        ["area", "href"],
+        ["blockquote", "cite"],
+        ["body", "background"],
+        ["del", "cite"],
+        ["form", "action"],
+        ["frame", "longdesc"],
+        ["frame", "src"],
+        ["iframe", "longdesc"],
+        ["iframe", "src"],
+        ["head", "profile"],
+        ["img", "longdesc"],
+        ["img", "src"],
+        ["img", "usemap"],
+        ["input", "src"],
+        ["input", "usemap"],
+        ["ins", "cite"],
+        ["link", "href"],
+        ["object", "classid"],
+        ["object", "codebase"],
+        ["object", "data"],
+        ["object", "usemap"],
+        ["q", "cite"],
+        ["script", "src"]
+      ]
+      html_doc = HTree.parse_xml("<root>" + html + "</root>").to_rexml
+
+      resolve_node = lambda do |html_node|
+        if html_node.respond_to? :children
+          for child in html_node.children
+            if child.kind_of? REXML::Element
+              for element_attribute_pair in relative_uri_attributes
+                if child.name.downcase == element_attribute_pair[0]
+                  attribute = child.attribute(element_attribute_pair[1])
+                  if attribute != nil
+                    href = attribute.value
+                    href = FeedTools::UriHelper.resolve_relative_uri(
+                      href, [child.base_uri] | base_uri_sources)
+                    child.attribute(
+                      element_attribute_pair[1]).instance_variable_set(
+                      "@value", href)
+                  end
+                end
+              end
+            end
+            resolve_node.call(child)
+          end
+        end
+        html_node
+      end
+      resolve_node.call(html_doc.root)
+      html = html_doc.root.inner_xml
+      return html
+    end
+
+    # Returns a string containing normalized xhtml from within a REXML node.
+    def self.extract_xhtml(rexml_node)
+      rexml_node_dup = rexml_node.deep_clone
+      normalize_namespaced_xhtml = lambda do |node, node_dup|
+        if node.kind_of? REXML::Element
+          node_namespace = node.namespace
+          # Massive hack, relies on REXML not changing
+          for index in 0...node.attributes.values.size
+            attribute = node.attributes.values[index]
+            attribute_dup = node_dup.attributes.values[index]
+            if attribute.namespace == FEED_TOOLS_NAMESPACES['xhtml']
+              attribute_dup.instance_variable_set(
+                "@expanded_name", attribute.name)
+            end
+            if node_namespace == FEED_TOOLS_NAMESPACES['xhtml']
+              if attribute.name == 'xmlns'
+                node_dup.attributes.delete('xmlns')
+              end
+            end
+          end
+          if node_namespace == FEED_TOOLS_NAMESPACES['xhtml']
+            node_dup.instance_variable_set("@expanded_name", node.name)
+          end
+          if !node_namespace.blank? && node.prefix.blank?
+            if node.namespace != FEED_TOOLS_NAMESPACES['xhtml']
+              node_dup.add_namespace(node_namespace)
+            end
+          end
+        end
+        for index in 0...node.children.size
+          child = node.children[index]
+          child_dup = node_dup.children[index]
+          if child.kind_of? REXML::Element
+            normalize_namespaced_xhtml.call(child, child_dup)
+          end
+        end
+      end
+      normalize_namespaced_xhtml.call(rexml_node, rexml_node_dup)
+      buffer = ""
+      rexml_node_dup.each_child do |child|
+        if child.kind_of? REXML::Comment
+          buffer << "<!--" + child.to_s + "-->"
+        else
+          buffer << child.to_s
+        end
+      end
+      return buffer.strip
+    end
+
+    # Given a REXML node, returns its content, normalized as HTML.
+    def self.process_text_construct(content_node, feed_type, feed_version,
+        base_uri_sources=[])
+      if content_node.nil?
+        return nil
+      end
+
+      content = nil
+      root_node_name = nil
+      type = FeedTools::XmlHelper.try_xpaths(content_node, "@type",
+        :select_result_value => true)
+      mode = FeedTools::XmlHelper.try_xpaths(content_node, "@mode",
+        :select_result_value => true)
+      encoding = FeedTools::XmlHelper.try_xpaths(content_node, "@encoding",
+        :select_result_value => true)
+
+      if type.nil?
+        atom_namespaces = [
+          FEED_TOOLS_NAMESPACES['atom10'],
+          FEED_TOOLS_NAMESPACES['atom03']
+        ]
+        if ((atom_namespaces.include?(content_node.namespace) ||
+            atom_namespaces.include?(content_node.root.namespace)) ||
+            feed_type == "atom")
+          type = "text"
+        end
+      end
+
+      # Note that we're checking for misuse of type, mode and encoding here
+      if content_node.cdatas.size > 0
+        content = content_node.cdatas.first.to_s.strip
+      elsif type == "base64" || mode == "base64" ||
+          encoding == "base64"
+        content = Base64.decode64(content_node.inner_xml.strip)
+      elsif type == "xhtml" || mode == "xhtml" ||
+          type == "xml" || mode == "xml" ||
+          type == "application/xhtml+xml" ||
+          content_node.namespace == FEED_TOOLS_NAMESPACES['xhtml']
+        content = FeedTools::HtmlHelper.extract_xhtml(content_node)
+      elsif type == "escaped" || mode == "escaped"
+        content = FeedTools::HtmlHelper.unescape_entities(
+          content_node.inner_xml.strip)
+      elsif type == "text" || mode == "text" ||
+          type == "text/plain" || mode == "text/plain"
+        content = FeedTools::HtmlHelper.unescape_entities(
+          content_node.inner_xml.strip)
+      else
+        content = content_node.inner_xml.strip
+        repair_entities = true
+      end
+      if type == "text" || mode == "text" ||
+          type == "text/plain" || mode == "text/plain"
+        content = FeedTools::HtmlHelper.escape_entities(content)
+      end
+      unless content.nil?
+        if FeedTools.configurations[:sanitization_enabled]
+          content = FeedTools::HtmlHelper.sanitize_html(content, :strip)
+        end
+        content = FeedTools::HtmlHelper.resolve_relative_uris(content,
+          [content_node.base_uri] | base_uri_sources)
+        if repair_entities
+          content = FeedTools::HtmlHelper.unescape_entities(content)
+        end
+        content = FeedTools::HtmlHelper.tidy_html(content)
+      end
+      if FeedTools.configurations[:tab_spaces] != nil
+        spaces = FeedTools.configurations[:tab_spaces].to_i
+        content.gsub!("\t", " " * spaces) unless content.blank?
+      end
+      content.strip unless content.blank?
+      content = nil if content.blank?
+      return content
+    end
+
+    # Strips semantically empty div wrapper elements
+    def self.strip_wrapper_element(xhtml)
+      return nil if xhtml.nil?
+      return xhtml if xhtml.blank?
+      begin
+        doc = REXML::Document.new(xhtml.to_s.strip)
+        if doc.children.size == 1
+          child = doc.children[0]
+          if child.name.downcase == "div"
+            return child.inner_xml.strip
+          end
+        end
+        return xhtml.to_s.strip
+      rescue Exception
+        return xhtml.to_s.strip
+      end
+    end
+
+    # Given a block of html, locates feed links with a given mime type.
+    def self.extract_link_by_mime_type(html, mime_type)
+      require 'feed_tools/vendor/htree'
+      require 'feed_tools/helpers/xml_helper'
+
+      # This is technically very, very wrong. But it saves oodles of
+      # clock cycles, and probably works 99.999% of the time.
+      html_document = HTree.parse_xml(
+        FeedTools::HtmlHelper.tidy_html(html.gsub(/<body>(.|\n)*<\/body>/, ""))).to_rexml
+      html_node = nil
+      head_node = nil
+      link_nodes = []
+      for node in html_document.children
+        next unless node.kind_of?(REXML::Element)
+        if node.name.downcase == "html" &&
+            node.children.size > 0
+          html_node = node
+          break
+        end
+      end
+      return nil if html_node.nil?
+      for node in html_node.children
+        next unless node.kind_of?(REXML::Element)
+        if node.name.downcase == "head"
+          head_node = node
+          break
+        end
+        if node.name.downcase == "link"
+          link_nodes << node
+        end
+      end
+      return nil if html_node.nil? && link_nodes.empty?
+      if !head_node.nil?
+        link_nodes = []
+        for node in head_node.children
+          next unless node.kind_of?(REXML::Element)
+          if node.name.downcase == "link"
+            link_nodes << node
+          end
+        end
+      end
+      find_link_nodes = lambda do |links|
+        for link in links
+          next unless link.kind_of?(REXML::Element)
+          if link.attributes['type'].to_s.strip.downcase ==
+              mime_type.downcase &&
+              link.attributes['rel'].to_s.strip.downcase == "alternate"
+            href = link.attributes['href']
+            return href unless href.blank?
+          end
+        end
+        for link in links
+          next unless link.kind_of?(REXML::Element)
+          find_link_nodes.call(link.children)
+        end
+      end
+      find_link_nodes.call(link_nodes)
+      return nil
+    end
+  end
+end
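
Below is a minimal usage sketch of the FeedTools::HtmlHelper module added in this version. It is not part of the package diff: the method names, signatures, and whitelisting behaviour come from the file above, while the sample input string and the surrounding require lines are illustrative assumptions (the gem targets the Ruby 1.8-era stdlib, and the tidy gem is optional).

    # Illustrative only -- not shipped with the gem. Assumes feedtools 0.2.23
    # and its dependencies are installed.
    require 'rubygems'
    require 'feed_tools'
    require 'feed_tools/helpers/html_helper'

    html = "<p>AT&amp;T says <script>alert('hi')</script> hello</p>"

    # Drop elements and attributes outside the whitelists defined in sanitize_html.
    safe = FeedTools::HtmlHelper.sanitize_html(html, :strip)

    # Strip the remaining tags and unescape entities for a plain-text rendering.
    text = FeedTools::HtmlHelper.convert_html_to_plain_text(safe)

    # Content-type predicates used when processing Atom text constructs.
    FeedTools::HtmlHelper.xml_type?("application/xhtml+xml")   # => true
    FeedTools::HtmlHelper.only_html_type?("text/html")         # => true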