feedtools 0.2.22 → 0.2.23
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +28 -0
- data/README +23 -2
- data/db/migration.rb +19 -0
- data/db/schema.mysql.sql +1 -1
- data/db/schema.postgresql.sql +1 -1
- data/db/schema.sqlite.sql +1 -1
- data/lib/feed_tools.rb +71 -388
- data/lib/feed_tools/database_feed_cache.rb +4 -3
- data/lib/feed_tools/feed.rb +809 -607
- data/lib/feed_tools/feed_item.rb +551 -574
- data/lib/feed_tools/feed_structures.rb +252 -0
- data/lib/feed_tools/helpers/feed_tools_helper.rb +6 -5
- data/lib/feed_tools/helpers/generic_helper.rb +16 -158
- data/lib/feed_tools/helpers/html_helper.rb +629 -0
- data/lib/feed_tools/helpers/retrieval_helper.rb +5 -0
- data/lib/feed_tools/helpers/uri_helper.rb +223 -0
- data/lib/feed_tools/helpers/xml_helper.rb +239 -0
- data/rakefile +10 -237
- data/test/unit/amp_test.rb +102 -94
- data/test/unit/atom_test.rb +239 -6
- data/test/unit/cache_test.rb +1 -1
- data/test/unit/encoding_test.rb +5 -5
- data/test/unit/generation_test.rb +34 -1
- data/test/unit/helper_test.rb +111 -17
- data/test/unit/rss_test.rb +21 -2
- metadata +7 -3
- data/lib/feed_tools/helpers/module_helper.rb +0 -27
@@ -0,0 +1,629 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2005 Robert Aman
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
#++
|
23
|
+
|
24
|
+
require 'feed_tools'
|
25
|
+
require 'feed_tools/helpers/xml_helper'
|
26
|
+
require 'rexml/document'
|
27
|
+
|
28
|
+
module FeedTools
|
29
|
+
# Methods for escaping, stripping, sanitizing and tidying html content
|
30
|
+
module HtmlHelper
|
31
|
+
# Escapes all html entities.
#
# Returns nil when +html+ is nil.  Otherwise returns a new string in which
# the markup-significant characters (&, <, >, " and ') have been replaced
# by entity references.  The string passed in is never modified.
def self.escape_entities(html)
  return nil if html.nil?
  escaped_html = CGI.escapeHTML(html)
  # Safety net for CGI implementations that leave quote characters alone;
  # when CGI.escapeHTML has already escaped them these are no-ops.
  escaped_html = escaped_html.gsub(/'/, "&apos;")
  escaped_html = escaped_html.gsub(/"/, "&quot;")
  return escaped_html
end
|
39
|
+
|
40
|
+
# Unescapes all html entities.
#
# Returns nil when +html+ is nil.  Otherwise returns a new string with
# named and numeric (decimal or hexadecimal) entity references replaced by
# the characters they stand for.  The string passed in is never modified,
# so frozen strings are accepted.
def self.unescape_entities(html)
  return nil if html.nil?
  # Normalise numeric ampersand references to the named entity first.
  # Non-bang gsub builds a copy, leaving the caller's string untouched.
  unescaped_html = html.gsub(/&#x26;/, "&amp;")
  unescaped_html = unescaped_html.gsub(/&#38;/, "&amp;")
  # Rewrite hexadecimal references as decimal ones.  The digits may
  # include a-f, so matching \d alone would skip e.g. "&#x4a;".
  unescaped_html = unescaped_html.gsub(/&#x[0-9a-f]+;/i) do |hex|
    "&#" + hex[3..-2].to_i(16).to_s + ";"
  end
  unescaped_html = CGI.unescapeHTML(unescaped_html)
  # Catch quote entities that CGI.unescapeHTML may have left behind.
  unescaped_html = unescaped_html.gsub(/&apos;/, "'")
  unescaped_html = unescaped_html.gsub(/&quot;/, "\"")
  return unescaped_html
end
|
54
|
+
|
55
|
+
# Removes all html tags from the html formatted text, but leaves
# escaped entities alone.
#
# Returns nil when +html+ is nil, otherwise a new string without the tags.
# The string passed in is never modified, so frozen strings are accepted.
def self.strip_html_tags(html)
  return nil if html.nil?
  # Anything shaped like <...> or </...> is treated as a tag.
  return html.gsub(/<\/?[^>]+>/, "")
end
|
63
|
+
|
64
|
+
# Converts html formatted text to plain text: tags are stripped, escaped
# entities are unescaped, and typographic (curly) quotes are replaced with
# their straight ASCII counterparts.
#
# Returns nil when +html+ is nil.
def self.convert_html_to_plain_text(html)
  return nil if html.nil?
  plain_text = FeedTools::HtmlHelper.unescape_entities(
    FeedTools::HtmlHelper.strip_html_tags(html))
  # Curly quote pattern => straight quote replacement
  [
    [/‘/, "'"],
    [/’/, "'"],
    [/“/, "\""],
    [/”/, "\""]
  ].each do |curly_quote, straight_quote|
    plain_text.gsub!(curly_quote, straight_quote)
  end
  return plain_text
end
|
77
|
+
|
78
|
+
# Returns true if the html tidy module can be used.
#
# Obviously, you need the tidy gem installed in order to run with html
# tidy features turned on.
#
# This method does a fairly complicated, and probably unnecessarily
# desperate search for the libtidy library.  If you want this thing to
# execute fast, the best thing to do is to set Tidy.path ahead of time.
# If Tidy.path is set, this method doesn't do much.  If it's not set,
# it will do its darnedest to find the libtidy library.  If you set
# the LIBTIDYPATH environment variable to the libtidy library, it should
# be able to find it.
#
# Once the library is located, this method will run much faster: a
# positive result is remembered in @tidy_enabled, while a negative result
# is re-checked on every call.
def self.tidy_enabled?
  # This is an override variable to keep tidy from being used even if it
  # is available.
  return false if FeedTools.configurations[:tidy_enabled] == false
  return @tidy_enabled if @tidy_enabled

  @tidy_enabled = false
  begin
    require 'tidy'
    require 'find'
    if Tidy.path.nil?
      # *Shrug*, just brute force it, I guess.  There's a lot of places
      # this thing might be hiding in, depending on platform and general
      # sanity of the person who installed the thing.  Most of these are
      # probably unlikely, but it's not like checking unlikely locations
      # hurts.  Much.  Especially if you actually find it.
      libtidy_locations = [
        '/usr/local/lib/libtidy.dylib',
        '/opt/local/lib/libtidy.dylib',
        '/usr/lib/libtidy.dylib',
        '/usr/local/lib/tidylib.dylib',
        '/opt/local/lib/tidylib.dylib',
        '/usr/lib/tidylib.dylib',
        '/usr/local/lib/tidy.dylib',
        '/opt/local/lib/tidy.dylib',
        '/usr/lib/tidy.dylib',
        '/usr/local/lib/libtidy.so',
        '/opt/local/lib/libtidy.so',
        '/usr/lib/libtidy.so',
        '/usr/local/lib/tidylib.so',
        '/opt/local/lib/tidylib.so',
        '/usr/lib/tidylib.so',
        '/usr/local/lib/tidy.so',
        '/opt/local/lib/tidy.so',
        '/usr/lib/tidy.so',
        'C:\Program Files\Tidy\tidy.dll',
        'C:\Tidy\tidy.dll',
        'C:\Ruby\bin\tidy.dll',
        'C:\Ruby\tidy.dll',
        '/usr/local/lib',
        '/opt/local/lib',
        '/usr/lib'
      ]
      # We just made this thing up, but if someone sets it, we'll
      # go ahead and check it
      # NOTE(review): the list is reversed before LIBTIDYPATH is appended,
      # so the directories are searched first and LIBTIDYPATH last --
      # confirm this ordering is intended.
      unless ENV['LIBTIDYPATH'].nil?
        libtidy_locations =
          libtidy_locations.reverse.push(ENV['LIBTIDYPATH'])
      end
      libtidy_locations.each do |path|
        tidy_path = nil
        # File.file? / File.directory? follow symbolic links, so a
        # symlinked library is accepted directly.
        if File.file?(path)
          tidy_path = path
        elsif File.directory?(path)
          # Ok, now perhaps we're getting a bit more desperate.
          # If there's more than one, grab the first one and
          # hope for the best, and if it doesn't work, then blame the
          # user for not specifying more accurately.
          Find.find(path) do |candidate|
            if File.basename(candidate) =~ /tidy/ &&
                candidate =~ /\.(so|dylib)\z/
              tidy_path = candidate
              break
            end
          end
        end
        next if tidy_path.nil?
        Tidy.path = tidy_path
        @tidy_enabled = true
        break
      end
    else
      @tidy_enabled = true
    end
  rescue LoadError
    # Tidy not installed, disable features that rely on tidy.
    @tidy_enabled = false
  end
  return @tidy_enabled
end
|
177
|
+
|
178
|
+
# Tidys up the html.
#
# When FeedTools::HtmlHelper.tidy_enabled? is true the text is run through
# HTML Tidy and returned as cleaned-up xml; otherwise it is returned as is.
# Returns nil when +html+ is nil.  The string passed in is never modified.
#
# Recognised +options+ (both default to "utf-8"):
# <tt>:input_encoding</tt>:: encoding handed to tidy for the input
# <tt>:output_encoding</tt>:: encoding handed to tidy for the output
#
# The option keys are checked with FeedTools::GenericHelper.validate_options
# (presumably rejecting unknown keys -- confirm against that helper).
def self.tidy_html(html, options = {})
  return nil if html.nil?

  FeedTools::GenericHelper.validate_options([ :input_encoding,
                                              :output_encoding ],
                                            options.keys)
  options = { :input_encoding => "utf-8",
              :output_encoding => "utf-8" }.merge(options)

  if FeedTools::HtmlHelper.tidy_enabled?
    # Escape "<!'" sequences on a copy (non-bang gsub), so the caller's
    # string is left alone.
    html = html.gsub(/<!'/, "&lt;!'")

    # Input that already carries its own <html>/<body> wrapper or an xml
    # declaration is a full document; everything else is a fragment.
    trimmed_html = html.strip
    is_fragment = true
    if (trimmed_html =~ /<html>(.|\n)*<body>/) != nil ||
        (trimmed_html =~ /<\/body>(.|\n)*<\/html>$/) != nil ||
        (trimmed_html =~ /<\?xml(.|\n)*\?>/) != nil
      is_fragment = false
    end

    # Tidy sucks?
    # TODO: find the correct set of tidy options to set so
    # that *ugly* hacks like this aren't necessary.
    html = html.gsub(/\302\240/, "\240")

    tidy_html = Tidy.open(:show_warnings=>false) do |tidy|
      tidy.options.output_xml = true
      tidy.options.markup = true
      tidy.options.indent = true
      tidy.options.wrap = 0
      tidy.options.logical_emphasis = true
      tidy.options.input_encoding = options[:input_encoding]
      tidy.options.output_encoding = options[:output_encoding]
      tidy.options.doctype = "omit"
      tidy.clean(html)
    end
    if is_fragment
      # Tidy sticks <html>...<body>[our html]</body>...</html> in.
      # We don't want this.
      tidy_html.strip!
      tidy_html.gsub!(/^<html>(.|\n)*<body>/, "")
      tidy_html.gsub!(/<\/body>(.|\n)*<\/html>$/, "")
      tidy_html.gsub!("\t", " ")
      tidy_html = FeedTools::HtmlHelper.unindent(tidy_html, 4)
      tidy_html.strip!
    end
  else
    tidy_html = html
  end
  # If tidy produced nothing from non-blank input, fall back to the input.
  if tidy_html.blank? && !html.blank?
    tidy_html = html.strip
  end
  return tidy_html
end
|
234
|
+
|
235
|
+
# Unindents a text selection by a specified number of spaces.
#
# Each line of +text+ loses at most +spaces+ leading space characters;
# lines with less indentation simply lose what they have.  Every line of
# the result is terminated with a newline.
def self.unindent(text, spaces)
  unindented_lines = text.split("\n").map do |line|
    removable = 0
    removable += 1 while removable < spaces && line[removable, 1] == " "
    line[removable..-1] + "\n"
  end
  return unindented_lines.join
end
|
251
|
+
|
252
|
+
# Removes all dangerous html tags from the html formatted text.
# If mode is set to :escape, dangerous and unknown elements will
# be escaped.  If mode is set to :strip, dangerous and unknown
# elements and all children will be removed entirely.
# Dangerous or unknown attributes are always removed.
#
# Returns nil when +html+ is nil, otherwise the sanitized markup as a
# string.  The string passed in is never modified.
def self.sanitize_html(html, mode=:strip)
  return nil if html.nil?

  # Lists borrowed from Mark Pilgrim's feedparser
  acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
    'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
    'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl',
    'dt', 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4',
    'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend',
    'li', 'map', 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's',
    'samp', 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup',
    'table', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt',
    'u', 'ul', 'var']

  acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
    'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
    'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
    'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
    'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
    'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
    'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
    'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
    'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
    'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
    'type', 'usemap', 'valign', 'value', 'vspace', 'width']

  # Replace with appropriate named entities.  Non-bang gsub builds a
  # copy, leaving the caller's string untouched.
  html = html.gsub(/&#x26;/, "&amp;")
  html = html.gsub(/&#38;/, "&amp;")
  html = html.gsub(/<!'/, "&lt;!'")

  # Hackity hack.  But it works, and it seems plenty fast enough.
  html_doc = HTree.parse_xml("<root>" + html + "</root>").to_rexml

  # Walks the tree depth-first, dropping (or escaping) elements that are
  # not whitelisted and deleting attributes that are not whitelisted.
  sanitize_node = lambda do |html_node|
    if html_node.respond_to? :children
      html_node.children.each do |child|
        if child.kind_of? REXML::Element
          unless acceptable_elements.include? child.name.downcase
            if mode != :strip
              # Leave the offending markup behind as escaped text.
              new_child = REXML::Text.new(CGI.escapeHTML(child.to_s))
              html_node.insert_after(child, new_child)
            end
            html_node.delete_element(child)
            # The element is gone from the document, so there is nothing
            # to gain from cleaning its attributes or descendants.
            next
          end
          child.attributes.keys.each do |attribute|
            # Namespace declarations are always kept.
            next if attribute =~ /^xmlns/
            unless acceptable_attributes.include? attribute.downcase
              child.delete_attribute(attribute)
            end
          end
        end
        sanitize_node.call(child)
      end
    end
    html_node
  end
  sanitize_node.call(html_doc.root)
  return html_doc.root.inner_xml
end
|
321
|
+
|
322
|
+
# Returns true if the type string provided indicates that something is
# xml or xhtml content: either one of the well known names ("xml",
# "xhtml", "application/xhtml+xml") or any type whose last three
# characters are "xml".  Returns false for nil.
def self.xml_type?(type)
  return true if %w[xml xhtml application/xhtml+xml].include?(type)
  !type.nil? && type[-3, 3] == "xml"
end
|
337
|
+
|
338
|
+
# Returns true if the type string provided indicates that something is
# plain text content, i.e. it is exactly "text" or "text/plain".
def self.text_type?(type)
  case type
  when "text", "text/plain"
    return true
  else
    return false
  end
end
|
346
|
+
|
347
|
+
# Returns true if the type string provided indicates that something is
# html or xhtml content, i.e. it is exactly one of "html", "xhtml",
# "text/html" or "application/xhtml+xml".
def self.html_type?(type)
  return %w[html xhtml text/html application/xhtml+xml].any? do |known_type|
    known_type == type
  end
end
|
357
|
+
|
358
|
+
# Returns true if the type string provided indicates that something is
# only html (not xhtml) content, i.e. it is exactly "html" or "text/html".
def self.only_html_type?(type)
  case type
  when "html", "text/html"
    return true
  else
    return false
  end
end
|
366
|
+
|
367
|
+
# can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
|
368
|
+
|
369
|
+
# Resolves all relative uris in a block of html.
#
# Every uri-carrying attribute listed below is passed through
# FeedTools::UriHelper.resolve_relative_uri, with the element's own
# base uri tried ahead of the supplied +base_uri_sources+.
#
# Returns nil when +html+ is nil (like the other helpers in this module),
# otherwise the markup with the resolved attribute values.
def self.resolve_relative_uris(html, base_uri_sources=[])
  return nil if html.nil?

  # [element name, attribute name] pairs that may hold a relative uri
  relative_uri_attributes = [
    ["a", "href"],
    ["applet", "codebase"],
    ["area", "href"],
    ["blockquote", "cite"],
    ["body", "background"],
    ["del", "cite"],
    ["form", "action"],
    ["frame", "longdesc"],
    ["frame", "src"],
    ["iframe", "longdesc"],
    ["iframe", "src"],
    ["head", "profile"],
    ["img", "longdesc"],
    ["img", "src"],
    ["img", "usemap"],
    ["input", "src"],
    ["input", "usemap"],
    ["ins", "cite"],
    ["link", "href"],
    ["object", "classid"],
    ["object", "codebase"],
    ["object", "data"],
    ["object", "usemap"],
    ["q", "cite"],
    ["script", "src"]
  ]
  html_doc = HTree.parse_xml("<root>" + html + "</root>").to_rexml

  # Walks the tree depth-first, rewriting matching attributes in place.
  resolve_node = lambda do |html_node|
    if html_node.respond_to? :children
      html_node.children.each do |child|
        if child.kind_of? REXML::Element
          element_name = child.name.downcase
          relative_uri_attributes.each do |name, attribute_name|
            next unless element_name == name
            attribute = child.attribute(attribute_name)
            next if attribute.nil?
            href = FeedTools::UriHelper.resolve_relative_uri(
              attribute.value, [child.base_uri] | base_uri_sources)
            # The value is written straight into the attribute's
            # @value instance variable.
            child.attribute(attribute_name).instance_variable_set(
              "@value", href)
          end
        end
        resolve_node.call(child)
      end
    end
    html_node
  end
  resolve_node.call(html_doc.root)
  return html_doc.root.inner_xml
end
|
427
|
+
|
428
|
+
# Returns a string containing normalized xhtml from within a REXML node.
#
# A deep clone of +rexml_node+ is adjusted and serialized; the node
# passed in is only read.  In the clone, elements and attributes in the
# xhtml namespace have their expanded name reset to the bare name,
# xmlns declarations on xhtml elements are dropped, and unprefixed
# elements in any other namespace get that namespace declared
# explicitly.  The result is the concatenation of the clone's children,
# stripped of surrounding whitespace.
def self.extract_xhtml(rexml_node)
  rexml_node_dup = rexml_node.deep_clone
  xhtml_namespace = FEED_TOOLS_NAMESPACES['xhtml']

  # Walks the original tree and its clone in lockstep.
  normalize_namespaced_xhtml = lambda do |node, node_dup|
    if node.kind_of? REXML::Element
      node_namespace = node.namespace
      node_is_xhtml = (node_namespace == xhtml_namespace)
      # Massive hack, relies on REXML not changing
      node.attributes.values.each_with_index do |attribute, index|
        # The clone's attribute list is read afresh on every pass
        # because the xmlns deletion below can shorten it.
        attribute_dup = node_dup.attributes.values[index]
        if attribute.namespace == xhtml_namespace
          attribute_dup.instance_variable_set(
            "@expanded_name", attribute.name)
        end
        if node_is_xhtml && attribute.name == 'xmlns'
          node_dup.attributes.delete('xmlns')
        end
      end
      if node_is_xhtml
        node_dup.instance_variable_set("@expanded_name", node.name)
      end
      if !node_namespace.blank? && node.prefix.blank? && !node_is_xhtml
        node_dup.add_namespace(node_namespace)
      end
    end
    node.children.each_with_index do |child, index|
      if child.kind_of? REXML::Element
        normalize_namespaced_xhtml.call(child, node_dup.children[index])
      end
    end
  end
  normalize_namespaced_xhtml.call(rexml_node, rexml_node_dup)

  buffer = ""
  rexml_node_dup.each_child do |child|
    markup = child.to_s
    # Comments are wrapped in their delimiters by hand here.
    markup = "<!--" + markup + "-->" if child.kind_of? REXML::Comment
    buffer << markup
  end
  return buffer.strip
end
|
476
|
+
|
477
|
+
# Given a REXML node, returns its content, normalized as HTML.
#
# +content_node+:: the node holding the text construct; nil yields nil
# +feed_type+:: "atom" makes an untyped construct default to type "text"
# +feed_version+:: accepted for interface compatibility; not used here
# +base_uri_sources+:: extra base uris for resolving relative uris,
#                      tried after the node's own base uri
#
# The node's type, mode and encoding attributes decide how the raw
# content is decoded.  The result is then optionally sanitized, has its
# relative uris resolved, is tidied, has tabs expanded when the
# :tab_spaces configuration is set, and is stripped.  Blank results are
# returned as nil.
def self.process_text_construct(content_node, feed_type, feed_version,
    base_uri_sources=[])
  return nil if content_node.nil?

  content = nil
  repair_entities = false
  type = FeedTools::XmlHelper.try_xpaths(content_node, "@type",
    :select_result_value => true)
  mode = FeedTools::XmlHelper.try_xpaths(content_node, "@mode",
    :select_result_value => true)
  encoding = FeedTools::XmlHelper.try_xpaths(content_node, "@encoding",
    :select_result_value => true)

  if type.nil?
    atom_namespaces = [
      FEED_TOOLS_NAMESPACES['atom10'],
      FEED_TOOLS_NAMESPACES['atom03']
    ]
    if atom_namespaces.include?(content_node.namespace) ||
        atom_namespaces.include?(content_node.root.namespace) ||
        feed_type == "atom"
      type = "text"
    end
  end

  plain_text = (type == "text" || mode == "text" ||
    type == "text/plain" || mode == "text/plain")

  # Note that we're checking for misuse of type, mode and encoding here
  if content_node.cdatas.size > 0
    content = content_node.cdatas.first.to_s.strip
  elsif type == "base64" || mode == "base64" ||
      encoding == "base64"
    content = Base64.decode64(content_node.inner_xml.strip)
  elsif type == "xhtml" || mode == "xhtml" ||
      type == "xml" || mode == "xml" ||
      type == "application/xhtml+xml" ||
      content_node.namespace == FEED_TOOLS_NAMESPACES['xhtml']
    content = FeedTools::HtmlHelper.extract_xhtml(content_node)
  elsif type == "escaped" || mode == "escaped" || plain_text
    content = FeedTools::HtmlHelper.unescape_entities(
      content_node.inner_xml.strip)
  else
    content = content_node.inner_xml.strip
    # Untyped content is unescaped later, after sanitizing and uri
    # resolution have run.
    repair_entities = true
  end
  # Plain text is turned into html by escaping it.
  if plain_text
    content = FeedTools::HtmlHelper.escape_entities(content)
  end
  unless content.nil?
    if FeedTools.configurations[:sanitization_enabled]
      content = FeedTools::HtmlHelper.sanitize_html(content, :strip)
    end
    content = FeedTools::HtmlHelper.resolve_relative_uris(content,
      [content_node.base_uri] | base_uri_sources)
    if repair_entities
      content = FeedTools::HtmlHelper.unescape_entities(content)
    end
    content = FeedTools::HtmlHelper.tidy_html(content)
  end
  if FeedTools.configurations[:tab_spaces] != nil
    spaces = FeedTools.configurations[:tab_spaces].to_i
    content = content.gsub("\t", " " * spaces) unless content.blank?
  end
  # String#strip returns a new string, so the result has to be kept.
  content = content.strip unless content.blank?
  content = nil if content.blank?
  return content
end
|
550
|
+
|
551
|
+
# Strips semantically empty div wrapper elements.
#
# When +xhtml+ parses as a document whose only child is a div element,
# the stripped inner xml of that div is returned.  In every other case,
# including markup that cannot be parsed, the input is returned as a
# stripped string.  nil yields nil and blank input is returned untouched.
def self.strip_wrapper_element(xhtml)
  return nil if xhtml.nil?
  return xhtml if xhtml.blank?
  stripped_xhtml = xhtml.to_s.strip
  begin
    doc = REXML::Document.new(stripped_xhtml)
    if doc.children.size == 1
      child = doc.children[0]
      if child.kind_of?(REXML::Element) && child.name.downcase == "div"
        return child.inner_xml.strip
      end
    end
  rescue StandardError
    # Best effort only: unparseable markup is handed back as it came.
    # StandardError (rather than Exception) lets interrupts and exit
    # requests propagate.
  end
  return stripped_xhtml
end
|
568
|
+
|
569
|
+
# Given a block of html, locates feed links with a given mime type.
#
# Looks at the <link> elements in the document's <head> (or, when there
# is no <head>, directly under <html>) and returns the href of the first
# one whose type equals +mime_type+ (case-insensitively) and whose rel is
# "alternate".  Elements nested inside those links are searched as well.
# Returns nil when +html+ is nil or no such link exists.
def self.extract_link_by_mime_type(html, mime_type)
  return nil if html.nil?

  require 'feed_tools/vendor/htree'
  require 'feed_tools/helpers/xml_helper'

  # This is technically very, very wrong.  But it saves oodles of
  # clock cycles, and probably works 99.999% of the time.
  html_document = HTree.parse_xml(
    FeedTools::HtmlHelper.tidy_html(
      html.gsub(/<body>(.|\n)*<\/body>/, ""))).to_rexml

  html_node = html_document.children.find do |node|
    node.kind_of?(REXML::Element) &&
      node.name.downcase == "html" &&
      node.children.size > 0
  end
  return nil if html_node.nil?

  # Collect <link> elements sitting directly under <html> until a <head>
  # shows up; once there is a <head>, only its links count.
  head_node = nil
  link_nodes = []
  html_node.children.each do |node|
    next unless node.kind_of?(REXML::Element)
    if node.name.downcase == "head"
      head_node = node
      break
    end
    link_nodes << node if node.name.downcase == "link"
  end
  unless head_node.nil?
    link_nodes = head_node.children.select do |node|
      node.kind_of?(REXML::Element) && node.name.downcase == "link"
    end
  end

  # Returns the href of the first matching link among +links+, looking at
  # the given level first and then descending into each link's children.
  # The lambda's value is the result, so no `return` is used inside it.
  find_link_href = lambda do |links|
    elements = links.select { |link| link.kind_of?(REXML::Element) }
    matching_link = elements.find do |link|
      link.attributes['type'].to_s.strip.downcase ==
        mime_type.downcase &&
        link.attributes['rel'].to_s.strip.downcase == "alternate" &&
        !link.attributes['href'].blank?
    end
    if matching_link
      matching_link.attributes['href']
    else
      nested_href = nil
      elements.each do |link|
        nested_href = find_link_href.call(link.children)
        break unless nested_href.nil?
      end
      nested_href
    end
  end
  return find_link_href.call(link_nodes)
end
|
628
|
+
end
|
629
|
+
end
|