feedtools 0.2.22 → 0.2.23
- data/CHANGELOG +28 -0
- data/README +23 -2
- data/db/migration.rb +19 -0
- data/db/schema.mysql.sql +1 -1
- data/db/schema.postgresql.sql +1 -1
- data/db/schema.sqlite.sql +1 -1
- data/lib/feed_tools.rb +71 -388
- data/lib/feed_tools/database_feed_cache.rb +4 -3
- data/lib/feed_tools/feed.rb +809 -607
- data/lib/feed_tools/feed_item.rb +551 -574
- data/lib/feed_tools/feed_structures.rb +252 -0
- data/lib/feed_tools/helpers/feed_tools_helper.rb +6 -5
- data/lib/feed_tools/helpers/generic_helper.rb +16 -158
- data/lib/feed_tools/helpers/html_helper.rb +629 -0
- data/lib/feed_tools/helpers/retrieval_helper.rb +5 -0
- data/lib/feed_tools/helpers/uri_helper.rb +223 -0
- data/lib/feed_tools/helpers/xml_helper.rb +239 -0
- data/rakefile +10 -237
- data/test/unit/amp_test.rb +102 -94
- data/test/unit/atom_test.rb +239 -6
- data/test/unit/cache_test.rb +1 -1
- data/test/unit/encoding_test.rb +5 -5
- data/test/unit/generation_test.rb +34 -1
- data/test/unit/helper_test.rb +111 -17
- data/test/unit/rss_test.rb +21 -2
- metadata +7 -3
- data/lib/feed_tools/helpers/module_helper.rb +0 -27
data/CHANGELOG
CHANGED
@@ -1,3 +1,31 @@
+== FeedTools 0.2.23
+* autodiscovery implemented
+* now knows a title from a hole in the ground
+* now resolves relative urls when possible
+* changed default table name to "cached_feeds" to avoid name collisions
+* schema now uses "href" instead of "url"
+* feed cache is set to nil by default now
+* both summary and content elements are generated now
+* now supports proxies
+* now supports internationalized domain names if libidn is installed
+* fixed bug with feed merging referencing a method that was refactored
+* no longer dies if uuidtools gem is missing but the UUID class is defined
+* updated timestamp handling and generation
+* added support for entry sorting on any feed item field
+* added support for disabling entry sorting entirely
+* fixed issue with itunes categories
+* fixed itunes subtitle/summary
+* fixed entry assignment bug
+* fixed issued/published variable name mix-up
+* added support for the payload module
+* added support for xhtml:div elements
+* dc:date now preempts pubDate
+* added better support for the scriptingNews format
+* now correctly strips out wrapper div elements from text constructs
+* fixed issue with some atom links being incorrectly identified as images
+* reorganized some of the helper modules
+* made some portions of url normalization case insensitive
+* fixed issue with filename handling on Windows
 == FeedTools 0.2.22
 * fixed another atom generation error
 == FeedTools 0.2.21
data/README
CHANGED
@@ -1,7 +1,9 @@
-
-caching system.
+FeedTools was designed to be a simple XML feed parser, generator, and
+translator with a built-in caching system.
 
 == Example
+  require 'feed_tools'
+
   slashdot_feed = FeedTools::Feed.open('http://www.slashdot.org/index.rss')
   slashdot_feed.title
   => "Slashdot"
@@ -11,3 +13,22 @@
   => "http://slashdot.org/"
   slashdot_feed.items.first.find_node("slash:hitparade/text()").to_s
   => "43,37,28,23,11,3,1"
+
+== Installation
+You can install FeedTools as a gem:
+  gem install feedtools
+
+Or you can install it from the tarball or zip packages on the download page
+and then extract it to your vendors directory as you would with any other
+Ruby library.
+
+After installation, you will either need to run in non-caching mode or set
+up a caching mechanism. The database feed cache system currently included
+with FeedTools is the most common caching method. To set up the database
+feed cache, you will first need to create the appropriate database schema.
+Schema files for MySQL, PostgreSQL, and SQLite have been included, but the
+preferred method of creating the schema within the Rails environment is with
+a migration file. A migration file has been supplied with FeedTools and can
+be found in the db directory. Run
+<tt>script/generate migration add_feed_tools_tables</tt> and then copy and
+paste the contents of db/migration.rb into your new migration file.
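For readers following the README changes above, a minimal non-caching quick start might look like the sketch below. It is not part of the gem diff: Feed.open, #title, #link and #items appear in the README example, while the each loop and item.title accessor are assumptions shown only for illustration.

  # Sketch based on the README example above (not from the diff itself).
  require 'feed_tools'

  # In 0.2.23 the feed cache defaults to nil, so no database setup is
  # needed just to try the parser.
  feed = FeedTools::Feed.open('http://www.slashdot.org/index.rss')
  puts feed.title   # => "Slashdot"
  puts feed.link    # => "http://slashdot.org/"
  feed.items.each do |item|
    puts item.title # assumed accessor, for illustration only
  end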
data/db/migration.rb
ADDED
@@ -0,0 +1,19 @@
+class AddFeedToolsTables < ActiveRecord::Migration
+  def self.up
+    puts "Adding cached feeds table..."
+    create_table :cached_feeds do |t|
+      t.column :href, :string
+      t.column :title, :string
+      t.column :link, :string
+      t.column :feed_data, :text
+      t.column :feed_data_type, :string
+      t.column :http_headers, :text
+      t.column :last_retrieved, :datetime
+    end
+  end
+
+  def self.down
+    puts "Dropping cached feeds table..."
+    drop_table :cached_feeds
+  end
+end
data/db/schema.mysql.sql
CHANGED
@@ -1,7 +1,7 @@
 -- Example MySQL schema
 CREATE TABLE `feeds` (
   `id` int(10) unsigned NOT NULL auto_increment,
-  `url` varchar(255) default NULL,
+  `href` varchar(255) default NULL,
   `title` varchar(255) default NULL,
   `link` varchar(255) default NULL,
   `feed_data` longtext default NULL,
data/db/schema.postgresql.sql
CHANGED
data/db/schema.sqlite.sql
CHANGED
data/lib/feed_tools.rb
CHANGED
@@ -32,7 +32,7 @@ FEED_TOOLS_ENV = ENV['FEED_TOOLS_ENV'] ||
   ENV['RAILS_ENV'] ||
   'development' # :nodoc:
 
-FEED_TOOLS_VERSION = "0.2.22"
+FEED_TOOLS_VERSION = "0.2.23"
 
 FEED_TOOLS_NAMESPACES = {
   "admin" => "http://webns.net/mvcb/",
@@ -40,8 +40,9 @@ FEED_TOOLS_NAMESPACES = {
   "annotate" => "http://purl.org/rss/1.0/modules/annotate/",
   "atom10" => "http://www.w3.org/2005/Atom",
   "atom03" => "http://purl.org/atom/ns#",
-
+  "atom-blog" => "http://purl.org/atom-blog/ns#",
   "audio" => "http://media.tangent.org/rss/1.0/",
+  "bitTorrent" =>"http://www.reallysimplesyndication.com/bitTorrentRssModule",
   "blogChannel" => "http://backend.userland.com/blogChannelModule",
   "blogger" => "http://www.blogger.com/atom/ns#",
   "cc" => "http://web.resource.org/cc/",
@@ -61,20 +62,24 @@ FEED_TOOLS_NAMESPACES = {
   "itunes" => "http://www.itunes.com/dtds/podcast-1.0.dtd",
   "l" => "http://purl.org/rss/1.0/modules/link/",
   "media" => "http://search.yahoo.com/mrss",
+  "p" => "http://purl.org/net/rss1.1/payload#",
   "pingback" => "http://madskills.com/public/xml/rss/module/pingback/",
   "prism" => "http://prismstandard.org/namespaces/1.2/basic/",
   "rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
   "rdfs" => "http://www.w3.org/2000/01/rdf-schema#",
   "ref" => "http://purl.org/rss/1.0/modules/reference/",
   "reqv" => "http://purl.org/rss/1.0/modules/richequiv/",
+  "rss09" => "http://my.netscape.com/rdf/simple/0.9/",
   "rss10" => "http://purl.org/rss/1.0/",
+  "rss11" => "http://purl.org/net/rss1.1#",
+  "rss20" => "http://backend.userland.com/rss2",
   "search" => "http://purl.org/rss/1.0/modules/search/",
   "slash" => "http://purl.org/rss/1.0/modules/slash/",
   "soap" => "http://schemas.xmlsoap.org/soap/envelope/",
   "ss" => "http://purl.org/rss/1.0/modules/servicestatus/",
   "str" => "http://hacks.benhammersley.com/rss/streaming/",
   "sub" => "http://purl.org/rss/1.0/modules/subscription/",
-  "
+  "syn" => "http://purl.org/rss/1.0/modules/syndication/",
   "taxo" => "http://purl.org/rss/1.0/modules/taxonomy/",
   "thr" => "http://purl.org/rss/1.0/modules/threading/",
   "ti" => "http://purl.org/rss/1.0/modules/textinput/",
@@ -91,7 +96,7 @@ $:.unshift(File.dirname(__FILE__) + "/feed_tools/vendor")
 begin
   begin
     require 'iconv'
-  rescue
+  rescue Object
     warn("The Iconv library does not appear to be installed properly. " +
       "FeedTools cannot function properly without it.")
     raise
@@ -101,11 +106,15 @@ begin
 
   require_gem('builder', '>= 1.2.4')
 
+  # Preload optional libraries.
   begin
     require 'tidy'
-  rescue
-    # Ignore the error for now.
+  rescue Object
   end
+  begin
+    require 'idn'
+  rescue Object
+  end
 
   require 'feed_tools/vendor/htree'
 
@@ -126,11 +135,25 @@ begin
 
   require_gem('activesupport', '>= 1.1.1')
   require_gem('activerecord', '>= 1.11.1')
-  require_gem('uuidtools', '>= 0.1.2')
 
+  begin
+    require_gem('uuidtools', '>= 0.1.2')
+  rescue Gem::LoadError
+    begin
+      require 'uuidtools'
+    rescue Object
+      raise unless defined? UUID
+    end
+  end
+
   require 'feed_tools/feed'
   require 'feed_tools/feed_item'
+  require 'feed_tools/feed_structures'
   require 'feed_tools/database_feed_cache'
+
+  require 'feed_tools/helpers/html_helper'
+  require 'feed_tools/helpers/xml_helper'
+  require 'feed_tools/helpers/uri_helper'
 rescue LoadError
   # ActiveSupport will very likely mess this up. So drop a warn so that the
   # programmer can figure it out if things get wierd and unpredictable.
@@ -159,19 +182,27 @@ module FeedTools
 
   def FeedTools.load_configurations
     if @configurations.blank?
+      # TODO: Load this from a config file.
       config_hash = {}
       @configurations = {
-        :feed_cache =>
+        :feed_cache => nil,
+        :proxy_address => nil,
+        :proxy_port => nil,
         :user_agent => "FeedTools/#{FEED_TOOLS_VERSION} " +
           "+http://www.sporkmonger.com/projects/feedtools/",
         :generator_name => "FeedTools/#{FEED_TOOLS_VERSION}",
         :generator_href => "http://www.sporkmonger.com/projects/feedtools/",
-        :tidy_enabled =>
+        :tidy_enabled => true,
         :tidy_options => {},
+        :idn_enabled => true,
+        :sanitization_enabled => true,
         :sanitize_with_nofollow => true,
+        :always_strip_wrapper_elements => true,
         :timestamp_estimation_enabled => true,
         :url_normalization_enabled => true,
+        :entry_sorting_property => "time",
         :strip_comment_count => false,
+        :tab_spaces => 2,
        :max_ttl => 3.days.to_s,
        :output_encoding => "utf-8"
      }.merge(config_hash)
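The defaults above can be overridden before any feeds are fetched. A hedged sketch follows: the key names come straight from the hunk above, but writing into FeedTools.configurations this way is an assumption, since this diff only shows the hash being read.

  require 'feed_tools'

  # Route retrieval through an HTTP proxy (hypothetical host and port).
  FeedTools.configurations[:proxy_address] = "proxy.example.com"
  FeedTools.configurations[:proxy_port] = 8080

  # Sort entries by title instead of the default "time" property.
  FeedTools.configurations[:entry_sorting_property] = "title"

  # Skip internationalized domain name handling if libidn is unavailable.
  FeedTools.configurations[:idn_enabled] = false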
@@ -236,6 +267,9 @@ module FeedTools
       cache_class = eval(class_name)
       if cache_class.kind_of?(Class)
         @feed_cache = cache_class
+        if @feed_cache.respond_to? :initialize_cache
+          @feed_cache.initialize_cache
+        end
         return cache_class
       else
         return nil
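The new initialize_cache hook above is duck-typed: the resolved cache class is only asked to set itself up if it responds to that method. Below is a stub cache that exercises just this hook; the class name and its in-memory storage are hypothetical, and the rest of the cache contract FeedTools expects (lookup and save methods used elsewhere in the library) is not shown in this hunk.

  # Hypothetical stand-in cache class, illustrating only the hook above.
  class InMemoryFeedCache
    # Called once by FeedTools.feed_cache because the class responds
    # to :initialize_cache.
    def self.initialize_cache
      @entries = {}
    end
  end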
@@ -258,376 +292,7 @@ module FeedTools
     rescue
       return false
     end
-  end
-
-  # Returns true if the html tidy module can be used.
-  #
-  # Obviously, you need the tidy gem installed in order to run with html
-  # tidy features turned on.
-  #
-  # This method does a fairly complicated, and probably unnecessarily
-  # desperate search for the libtidy library. If you want this thing to
-  # execute fast, the best thing to do is to set Tidy.path ahead of time.
-  # If Tidy.path is set, this method doesn't do much. If it's not set,
-  # it will do it's darnedest to find the libtidy library. If you set
-  # the LIBTIDYPATH environment variable to the libtidy library, it should
-  # be able to find it.
-  #
-  # Once the library is located, this method will run much faster.
-  def FeedTools.tidy_enabled?
-    # This is an override variable to keep tidy from being used even if it
-    # is available.
-    if FeedTools.configurations[:tidy_enabled] == false
-      return false
-    end
-    if @tidy_enabled.nil? || @tidy_enabled == false
-      @tidy_enabled = false
-      begin
-        require 'tidy'
-        if Tidy.path.nil?
-          # *Shrug*, just brute force it, I guess. There's a lot of places
-          # this thing might be hiding in, depending on platform and general
-          # sanity of the person who installed the thing. Most of these are
-          # probably unlikely, but it's not like checking unlikely locations
-          # hurts. Much. Especially if you actually find it.
-          libtidy_locations = [
-            '/usr/local/lib/libtidy.dylib',
-            '/opt/local/lib/libtidy.dylib',
-            '/usr/lib/libtidy.dylib',
-            '/usr/local/lib/tidylib.dylib',
-            '/opt/local/lib/tidylib.dylib',
-            '/usr/lib/tidylib.dylib',
-            '/usr/local/lib/tidy.dylib',
-            '/opt/local/lib/tidy.dylib',
-            '/usr/lib/tidy.dylib',
-            '/usr/local/lib/libtidy.so',
-            '/opt/local/lib/libtidy.so',
-            '/usr/lib/libtidy.so',
-            '/usr/local/lib/tidylib.so',
-            '/opt/local/lib/tidylib.so',
-            '/usr/lib/tidylib.so',
-            '/usr/local/lib/tidy.so',
-            '/opt/local/lib/tidy.so',
-            '/usr/lib/tidy.so',
-            'C:\Program Files\Tidy\tidy.dll',
-            'C:\Tidy\tidy.dll',
-            'C:\Ruby\bin\tidy.dll',
-            'C:\Ruby\tidy.dll',
-            '/usr/local/lib',
-            '/opt/local/lib',
-            '/usr/lib'
-          ]
-          # We just made this thing up, but if someone sets it, we'll
-          # go ahead and check it
-          unless ENV['LIBTIDYPATH'].nil?
-            libtidy_locations =
-              libtidy_locations.reverse.push(ENV['LIBTIDYPATH'])
-          end
-          for path in libtidy_locations
-            if File.exists? path
-              if File.ftype(path) == "file"
-                Tidy.path = path
-                @tidy_enabled = true
-                break
-              elsif File.ftype(path) == "directory"
-                # Ok, now perhaps we're getting a bit more desperate
-                lib_paths =
-                  `find #{path} -name '*tidy*' | grep '\\.\\(so\\|dylib\\)$'`
-                # If there's more than one, grab the first one and
-                # hope for the best, and if it doesn't work, then blame the
-                # user for not specifying more accurately.
-                tidy_path = lib_paths.split("\n").first
-                unless tidy_path.nil?
-                  Tidy.path = tidy_path
-                  @tidy_enabled = true
-                  break
-                end
-              end
-            end
-          end
-          # Still couldn't find it.
-          unless @tidy_enabled
-            @tidy_enabled = false
-          end
-        else
-          @tidy_enabled = true
-        end
-      rescue LoadError
-        # Tidy not installed, disable features that rely on tidy.
-        @tidy_enabled = false
-      end
-    end
-    return @tidy_enabled
-  end
-
-  # Attempts to ensures that the passed url is valid and sane. Accepts very, very ugly urls
-  # and makes every effort to figure out what it was supposed to be. Also translates from
-  # the feed: and rss: pseudo-protocols to the http: protocol.
-  def FeedTools.normalize_url(url)
-    if url.nil? || url == ""
-      return nil
-    end
-    normalized_url = url.strip
-
-    # if a url begins with the '/' character, it only makes sense that they
-    # meant to be using a file:// url. Fix it for them.
-    if normalized_url.length > 0 && normalized_url[0..0] == "/"
-      normalized_url = "file://" + normalized_url
-    end
-
-    # if a url begins with a drive letter followed by a colon, we're looking at
-    # a file:// url. Fix it for them.
-    if normalized_url.length > 0 &&
-        normalized_url.scan(/^[a-zA-Z]:[\\\/]/).size > 0
-      normalized_url = "file:///" + normalized_url
-    end
-
-    # if a url begins with javascript:, it's quite possibly an attempt at
-    # doing something malicious. Let's keep that from getting anywhere,
-    # shall we?
-    if (normalized_url.downcase =~ /javascript:/) != nil
-      return "#"
-    end
-
-    # deal with all of the many ugly possibilities involved in the rss:
-    # and feed: pseudo-protocols (incidentally, whose crazy idea was this
-    # mess?)
-    normalized_url.gsub!(/^http:\/*(feed:\/*)?/, "http://")
-    normalized_url.gsub!(/^http:\/*(rss:\/*)?/, "http://")
-    normalized_url.gsub!(/^feed:\/*(http:\/*)?/, "http://")
-    normalized_url.gsub!(/^rss:\/*(http:\/*)?/, "http://")
-    normalized_url.gsub!(/^file:\/*/, "file:///")
-    normalized_url.gsub!(/^https:\/*/, "https://")
-    # fix (very) bad urls (usually of the user-entered sort)
-    normalized_url.gsub!(/^http:\/*(http:\/*)*/, "http://")
-
-    if (normalized_url =~ /^file:/) == 0
-      # Adjust windows-style urls
-      normalized_url.gsub!(/^file:\/\/\/([a-zA-Z])\|/, 'file:///\1:')
-      normalized_url.gsub!(/\\/, '/')
-    else
-      if (normalized_url =~ /https?:\/\//) == nil
-        normalized_url = "http://" + normalized_url
-      end
-      if normalized_url == "http://"
-        return nil
-      end
-      begin
-        feed_uri = URI.parse(normalized_url)
-        if feed_uri.scheme == nil
-          feed_uri.scheme = "http"
-        end
-        if feed_uri.path == nil || feed_uri.path == ""
-          feed_uri.path = "/"
-        end
-        if (feed_uri.path =~ /^[\/]+/) == 0
-          feed_uri.path.gsub!(/^[\/]+/, "/")
-        end
-        feed_uri.host.downcase!
-        normalized_url = feed_uri.to_s
-      rescue URI::InvalidURIError
-      end
-    end
-
-    # We can't do a proper set of escaping, so this will
-    # have to do.
-    normalized_url.gsub!(/%20/, " ")
-    normalized_url.gsub!(/ /, "%20")
-
-    return normalized_url
-  end
-
-  # Converts a url into a tag uri
-  def FeedTools.build_tag_uri(url, date)
-    unless url.kind_of? String
-      raise ArgumentError, "Expected String, got #{url.class.name}"
-    end
-    unless date.kind_of? Time
-      raise ArgumentError, "Expected Time, got #{date.class.name}"
-    end
-    tag_uri = normalize_url(url)
-    unless FeedTools.is_uri?(tag_uri)
-      raise ArgumentError, "Must supply a valid URL."
-    end
-    host = URI.parse(tag_uri).host
-    tag_uri.gsub!(/^(http|ftp|file):\/*/, "")
-    tag_uri.gsub!(/#/, "/")
-    tag_uri = "tag:#{host},#{date.strftime('%Y-%m-%d')}:" +
-      "#{tag_uri[(tag_uri.index(host) + host.size)..-1]}"
-    return tag_uri
-  end
-
-  # Converts a url into a urn:uuid: uri
-  def FeedTools.build_urn_uri(url)
-    unless url.kind_of? String
-      raise ArgumentError, "Expected String, got #{url.class.name}"
-    end
-    normalized_url = normalize_url(url)
-    require 'uuidtools'
-    return UUID.sha1_create(UUID_URL_NAMESPACE, normalized_url).to_uri_string
-  end
-
-  # Returns true if the parameter appears to be a valid uri
-  def FeedTools.is_uri?(url)
-    return false if url.nil?
-    begin
-      uri = URI.parse(url)
-      if uri.scheme.nil? || uri.scheme == ""
-        return false
-      end
-    rescue URI::InvalidURIError
-      return false
-    end
-    return true
-  end
-
-  # Escapes all html entities
-  def FeedTools.escape_entities(html)
-    return nil if html.nil?
-    escaped_html = CGI.escapeHTML(html)
-    escaped_html.gsub!(/'/, "&apos;")
-    escaped_html.gsub!(/"/, "&quot;")
-    return escaped_html
-  end
-
-  # Unescapes all html entities
-  def FeedTools.unescape_entities(html)
-    return nil if html.nil?
-    unescaped_html = html
-    unescaped_html.gsub!(/&/, "&")
-    unescaped_html.gsub!(/&/, "&")
-    unescaped_html = CGI.unescapeHTML(unescaped_html)
-    unescaped_html.gsub!(/&apos;/, "'")
-    unescaped_html.gsub!(/&quot;/, "\"")
-    return unescaped_html
-  end
-
-  # Removes all html tags from the html formatted text.
-  def FeedTools.strip_html(html)
-    return nil if html.nil?
-    # TODO: do this properly
-    # ======================
-    stripped_html = html.gsub(/<\/?[^>]+>/, "")
-    return stripped_html
-  end
-
-  # Tidys up the html
-  def FeedTools.tidy_html(html, options = {})
-    return nil if html.nil?
-    if FeedTools.tidy_enabled?
-      is_fragment = true
-      html.gsub!(/<!'/, "&lt;!'")
-      if (html.strip =~ /<html>(.|\n)*<body>/) != nil ||
-          (html.strip =~ /<\/body>(.|\n)*<\/html>$/) != nil
-        is_fragment = false
-      end
-      if (html.strip =~ /<\?xml(.|\n)*\?>/) != nil
-        is_fragment = false
-      end
-      tidy_html = Tidy.open(:show_warnings=>false) do |tidy|
-        tidy.options.output_xml = true
-        tidy.options.numeric_entities = true
-        tidy.options.markup = true
-        tidy.options.indent = false
-        tidy.options.wrap = 0
-        tidy.options.logical_emphasis = true
-        # TODO: Make this match the actual encoding of the feed
-        # =====================================================
-        tidy.options.input_encoding = "utf8"
-        tidy.options.output_encoding = "ascii"
-        tidy.options.ascii_chars = false
-        tidy.options.doctype = "omit"
-        xml = tidy.clean(html)
-        xml
-      end
-      if is_fragment
-        # Tidy sticks <html>...<body>[our html]</body>...</html> in.
-        # We don't want this.
-        tidy_html.strip!
-        tidy_html.gsub!(/^<html>(.|\n)*<body>/, "")
-        tidy_html.gsub!(/<\/body>(.|\n)*<\/html>$/, "")
-        tidy_html.strip!
-      end
-      tidy_html.gsub!(/&/, "&")
-      tidy_html.gsub!(/&/, "&")
-      tidy_html.gsub!(/\320\262\320\202\342\204\242/, "\342\200\231")
-
-    else
-      tidy_html = html
-    end
-    if tidy_html.blank? && !html.blank?
-      tidy_html = html.strip
-    end
-    return tidy_html
-  end
-
-  # Removes all dangerous html tags from the html formatted text.
-  # If mode is set to :escape, dangerous and unknown elements will
-  # be escaped. If mode is set to :strip, dangerous and unknown
-  # elements and all children will be removed entirely.
-  # Dangerous or unknown attributes are always removed.
-  def FeedTools.sanitize_html(html, mode=:strip)
-    return nil if html.nil?
-
-    # Lists borrowed from Mark Pilgrim's feedparser
-    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
-      'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
-      'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl',
-      'dt', 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4',
-      'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend',
-      'li', 'map', 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's',
-      'samp', 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup',
-      'table', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt',
-      'u', 'ul', 'var']
-
-    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
-      'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
-      'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
-      'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
-      'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
-      'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
-      'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
-      'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
-      'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
-      'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
-      'type', 'usemap', 'valign', 'value', 'vspace', 'width']
-
-    # Replace with appropriate named entities
-    html.gsub!(/&/, "&")
-    html.gsub!(/&/, "&")
-    html.gsub!(/<!'/, "&lt;!'")
-
-    # Hackity hack. But it works, and it seems plenty fast enough.
-    html_doc = HTree.parse_xml("<root>" + html + "</root>").to_rexml
-
-    sanitize_node = lambda do |html_node|
-      if html_node.respond_to? :children
-        for child in html_node.children
-          if child.kind_of? REXML::Element
-            unless acceptable_elements.include? child.name.downcase
-              if mode == :strip
-                html_node.delete_element(child)
-              else
-                new_child = REXML::Text.new(CGI.escapeHTML(child.to_s))
-                html_node.insert_after(child, new_child)
-                html_node.delete_element(child)
-              end
-            end
-            for attribute in child.attributes.keys
-              unless acceptable_attributes.include? attribute.downcase
-                child.delete_attribute(attribute)
-              end
-            end
-          end
-          sanitize_node.call(child)
-        end
-      end
-      html_node
-    end
-    sanitize_node.call(html_doc.root)
-    html = html_doc.root.inner_xml
-    return html
-  end
+  end
 
   # Creates a merged "planet" feed from a set of urls.
   #
@@ -637,7 +302,7 @@ module FeedTools
   # in conjunction with the DatabaseFeedCache as it will
   # open multiple connections to the database.
   def FeedTools.build_merged_feed(url_array, options = {})
-    validate_options([ :multi_threaded ],
+    FeedTools::GenericHelper.validate_options([ :multi_threaded ],
       options.keys)
     options = { :multi_threaded => false }.merge(options)
     return nil if url_array.nil?
@@ -930,19 +595,37 @@ module REXML # :nodoc:
         result << child.to_s
       end
     end
-    return result
+    return result.strip
   end
+  else
+    warn("inner_xml method already exists.")
  end
 
-
-
-
+  def base_uri # :nodoc:
+    begin
+      base_attribute = FeedTools::XmlHelper.try_xpaths(self, [
+        '@xml:base'
+      ])
+      if parent == nil || parent.kind_of?(REXML::Document)
+        return nil if base_attribute == nil
+        return base_attribute.value
+      end
+      if base_attribute != nil && parent == nil
+        return base_attribute.value
+      elsif parent != nil && base_attribute == nil
         return parent.base_uri
-      elsif parent
-
-
-
+      elsif parent != nil && base_attribute != nil
+        parent_base_uri = parent.base_uri
+        if parent_base_uri != nil
+          uri = URI.parse(parent_base_uri)
+          return (uri + base_attribute.value).to_s
+        else
+          return base_attribute.value
+        end
      end
+      return nil
+    rescue
+      return nil
    end
  end
 end