feedtools 0.2.22 → 0.2.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +28 -0
- data/README +23 -2
- data/db/migration.rb +19 -0
- data/db/schema.mysql.sql +1 -1
- data/db/schema.postgresql.sql +1 -1
- data/db/schema.sqlite.sql +1 -1
- data/lib/feed_tools.rb +71 -388
- data/lib/feed_tools/database_feed_cache.rb +4 -3
- data/lib/feed_tools/feed.rb +809 -607
- data/lib/feed_tools/feed_item.rb +551 -574
- data/lib/feed_tools/feed_structures.rb +252 -0
- data/lib/feed_tools/helpers/feed_tools_helper.rb +6 -5
- data/lib/feed_tools/helpers/generic_helper.rb +16 -158
- data/lib/feed_tools/helpers/html_helper.rb +629 -0
- data/lib/feed_tools/helpers/retrieval_helper.rb +5 -0
- data/lib/feed_tools/helpers/uri_helper.rb +223 -0
- data/lib/feed_tools/helpers/xml_helper.rb +239 -0
- data/rakefile +10 -237
- data/test/unit/amp_test.rb +102 -94
- data/test/unit/atom_test.rb +239 -6
- data/test/unit/cache_test.rb +1 -1
- data/test/unit/encoding_test.rb +5 -5
- data/test/unit/generation_test.rb +34 -1
- data/test/unit/helper_test.rb +111 -17
- data/test/unit/rss_test.rb +21 -2
- metadata +7 -3
- data/lib/feed_tools/helpers/module_helper.rb +0 -27
data/CHANGELOG
CHANGED
@@ -1,3 +1,31 @@
+== FeedTools 0.2.23
+* autodiscovery implemented
+* now knows a title from a hole in the ground
+* now resolves relative urls when possible
+* changed default table name to "cached_feeds" to avoid name collisions
+* schema now uses "href" instead of "url"
+* feed cache is set to nil by default now
+* both summary and content elements are generated now
+* now supports proxies
+* now supports internationalized domain names if libidn is installed
+* fixed bug with feed merging referencing a method that was refactored
+* no longer dies if uuidtools gem is missing but the UUID class is defined
+* updated timestamp handling and generation
+* added support for entry sorting on any feed item field
+* added support for disabling entry sorting entirely
+* fixed issue with itunes categories
+* fixed itunes subtitle/summary
+* fixed entry assignment bug
+* fixed issued/published variable name mix-up
+* added support for the payload module
+* added support for xhtml:div elements
+* dc:date now preempts pubDate
+* added better support for the scriptingNews format
+* now correctly strips out wrapper div elements from text constructs
+* fixed issue with some atom links being incorrectly identified as images
+* reorganized some of the helper modules
+* made some portions of url normalization case insensitive
+* fixed issue with filename handling on Windows
 == FeedTools 0.2.22
 * fixed another atom generation error
 == FeedTools 0.2.21
data/README
CHANGED
@@ -1,7 +1,9 @@
-
-caching system.
+FeedTools was designed to be a simple XML feed parser, generator, and
+translator with a built-in caching system.
 
 == Example
+  require 'feed_tools'
+
   slashdot_feed = FeedTools::Feed.open('http://www.slashdot.org/index.rss')
   slashdot_feed.title
   => "Slashdot"
@@ -11,3 +13,22 @@
   => "http://slashdot.org/"
   slashdot_feed.items.first.find_node("slash:hitparade/text()").to_s
   => "43,37,28,23,11,3,1"
+
+== Installation
+You can install FeedTools as a gem:
+  gem install feedtools
+
+Or you can install it from the tarball or zip packages on the download page
+and then extract it to your vendors directory as you would with any other
+Ruby library.
+
+After installation, you will either need to run in non-caching mode or set
+up a caching mechanism. The database feed cache system currently included
+with FeedTools is the most common caching method. To set up the database
+feed cache, you will first need to create the appropriate database schema.
+Schema files for MySQL, PostgreSQL, and SQLite have been included, but the
+preferred method of creating the schema within the Rails environment is with
+a migration file. A migration file has been supplied with FeedTools and can
+be found in the db directory. Run
+<tt>script/generate migration add_feed_tools_tables</tt> and then copy and
+paste the contents of db/migration.rb into your new migration file.
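The installation notes above pair with another change in this release: the feed
cache now defaults to nil, so caching is opt-in. A minimal sketch of opting back
into the bundled database cache, assuming the :feed_cache configuration key shown
in the feed_tools.rb diff below accepts the cache class name:

    require 'feed_tools'

    # Sketch only: enable the bundled database cache once the cached_feeds
    # table from db/migration.rb exists. Passing the class name as a string
    # is an assumption based on the eval(class_name) lookup in feed_tools.rb.
    FeedTools.configurations[:feed_cache] = "FeedTools::DatabaseFeedCache"

    slashdot_feed = FeedTools::Feed.open('http://www.slashdot.org/index.rss')
    slashdot_feed.title   # => "Slashdot"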
data/db/migration.rb
ADDED
@@ -0,0 +1,19 @@
+class AddFeedToolsTables < ActiveRecord::Migration
+  def self.up
+    puts "Adding cached feeds table..."
+    create_table :cached_feeds do |t|
+      t.column :href, :string
+      t.column :title, :string
+      t.column :link, :string
+      t.column :feed_data, :text
+      t.column :feed_data_type, :string
+      t.column :http_headers, :text
+      t.column :last_retrieved, :datetime
+    end
+  end
+
+  def self.down
+    puts "Dropping cached feeds table..."
+    drop_table :cached_feeds
+  end
+end
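Inside Rails the migration above is run through the usual generate/migrate steps
described in the README. Outside Rails, a rough sketch of applying it directly is
below; the require path and the sqlite3 connection settings are assumptions, not
part of the gem:

    require 'rubygems'
    require 'active_record'
    require 'db/migration'   # the file shown above; the path is an assumption

    # Point ActiveRecord at a database, then run the migration class directly.
    ActiveRecord::Base.establish_connection(
      :adapter  => 'sqlite3',
      :database => 'feed_cache.sqlite3'
    )
    AddFeedToolsTables.up    # creates the cached_feeds table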
data/db/schema.mysql.sql
CHANGED
@@ -1,7 +1,7 @@
 -- Example MySQL schema
 CREATE TABLE `feeds` (
   `id` int(10) unsigned NOT NULL auto_increment,
-  `url` varchar(255) default NULL,
+  `href` varchar(255) default NULL,
   `title` varchar(255) default NULL,
   `link` varchar(255) default NULL,
   `feed_data` longtext default NULL,
data/db/schema.postgresql.sql
CHANGED
data/db/schema.sqlite.sql
CHANGED
data/lib/feed_tools.rb
CHANGED
@@ -32,7 +32,7 @@ FEED_TOOLS_ENV = ENV['FEED_TOOLS_ENV'] ||
                  ENV['RAILS_ENV'] ||
                  'development' # :nodoc:
 
-FEED_TOOLS_VERSION = "0.2.22"
+FEED_TOOLS_VERSION = "0.2.23"
 
 FEED_TOOLS_NAMESPACES = {
   "admin" => "http://webns.net/mvcb/",
@@ -40,8 +40,9 @@ FEED_TOOLS_NAMESPACES = {
   "annotate" => "http://purl.org/rss/1.0/modules/annotate/",
   "atom10" => "http://www.w3.org/2005/Atom",
   "atom03" => "http://purl.org/atom/ns#",
-
+  "atom-blog" => "http://purl.org/atom-blog/ns#",
   "audio" => "http://media.tangent.org/rss/1.0/",
+  "bitTorrent" =>"http://www.reallysimplesyndication.com/bitTorrentRssModule",
   "blogChannel" => "http://backend.userland.com/blogChannelModule",
   "blogger" => "http://www.blogger.com/atom/ns#",
   "cc" => "http://web.resource.org/cc/",
@@ -61,20 +62,24 @@ FEED_TOOLS_NAMESPACES = {
   "itunes" => "http://www.itunes.com/dtds/podcast-1.0.dtd",
   "l" => "http://purl.org/rss/1.0/modules/link/",
   "media" => "http://search.yahoo.com/mrss",
+  "p" => "http://purl.org/net/rss1.1/payload#",
   "pingback" => "http://madskills.com/public/xml/rss/module/pingback/",
   "prism" => "http://prismstandard.org/namespaces/1.2/basic/",
   "rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
   "rdfs" => "http://www.w3.org/2000/01/rdf-schema#",
   "ref" => "http://purl.org/rss/1.0/modules/reference/",
   "reqv" => "http://purl.org/rss/1.0/modules/richequiv/",
+  "rss09" => "http://my.netscape.com/rdf/simple/0.9/",
   "rss10" => "http://purl.org/rss/1.0/",
+  "rss11" => "http://purl.org/net/rss1.1#",
+  "rss20" => "http://backend.userland.com/rss2",
   "search" => "http://purl.org/rss/1.0/modules/search/",
   "slash" => "http://purl.org/rss/1.0/modules/slash/",
   "soap" => "http://schemas.xmlsoap.org/soap/envelope/",
   "ss" => "http://purl.org/rss/1.0/modules/servicestatus/",
   "str" => "http://hacks.benhammersley.com/rss/streaming/",
   "sub" => "http://purl.org/rss/1.0/modules/subscription/",
-  "
+  "syn" => "http://purl.org/rss/1.0/modules/syndication/",
   "taxo" => "http://purl.org/rss/1.0/modules/taxonomy/",
   "thr" => "http://purl.org/rss/1.0/modules/threading/",
   "ti" => "http://purl.org/rss/1.0/modules/textinput/",
@@ -91,7 +96,7 @@ $:.unshift(File.dirname(__FILE__) + "/feed_tools/vendor")
 begin
   begin
     require 'iconv'
-  rescue
+  rescue Object
     warn("The Iconv library does not appear to be installed properly. " +
       "FeedTools cannot function properly without it.")
     raise
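The switch from a bare rescue to rescue Object matters here because a bare rescue
clause only catches StandardError, while the LoadError raised by a failed require
does not descend from StandardError. A standalone illustration (not code from the
gem):

    # Only the second clause fires: LoadError is outside StandardError,
    # but rescue Object matches any raised object.
    begin
      require 'library_that_is_not_installed'   # hypothetical library name
    rescue
      puts "never reached for a LoadError"
    rescue Object
      puts "reached: rescue Object also catches LoadError"
    end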
@@ -101,11 +106,15 @@ begin
 
   require_gem('builder', '>= 1.2.4')
 
+  # Preload optional libraries.
   begin
     require 'tidy'
-  rescue
-    # Ignore the error for now.
+  rescue Object
   end
+  begin
+    require 'idn'
+  rescue Object
+  end
 
   require 'feed_tools/vendor/htree'
 
@@ -126,11 +135,25 @@ begin
 
   require_gem('activesupport', '>= 1.1.1')
   require_gem('activerecord', '>= 1.11.1')
-  require_gem('uuidtools', '>= 0.1.2')
 
+  begin
+    require_gem('uuidtools', '>= 0.1.2')
+  rescue Gem::LoadError
+    begin
+      require 'uuidtools'
+    rescue Object
+      raise unless defined? UUID
+    end
+  end
+
   require 'feed_tools/feed'
   require 'feed_tools/feed_item'
+  require 'feed_tools/feed_structures'
   require 'feed_tools/database_feed_cache'
+
+  require 'feed_tools/helpers/html_helper'
+  require 'feed_tools/helpers/xml_helper'
+  require 'feed_tools/helpers/uri_helper'
 rescue LoadError
   # ActiveSupport will very likely mess this up. So drop a warn so that the
   # programmer can figure it out if things get wierd and unpredictable.
@@ -159,19 +182,27 @@ module FeedTools
 
   def FeedTools.load_configurations
     if @configurations.blank?
+      # TODO: Load this from a config file.
       config_hash = {}
      @configurations = {
-        :feed_cache =>
+        :feed_cache => nil,
+        :proxy_address => nil,
+        :proxy_port => nil,
         :user_agent => "FeedTools/#{FEED_TOOLS_VERSION} " +
           "+http://www.sporkmonger.com/projects/feedtools/",
         :generator_name => "FeedTools/#{FEED_TOOLS_VERSION}",
         :generator_href => "http://www.sporkmonger.com/projects/feedtools/",
-        :tidy_enabled =>
+        :tidy_enabled => true,
         :tidy_options => {},
+        :idn_enabled => true,
+        :sanitization_enabled => true,
         :sanitize_with_nofollow => true,
+        :always_strip_wrapper_elements => true,
         :timestamp_estimation_enabled => true,
         :url_normalization_enabled => true,
+        :entry_sorting_property => "time",
         :strip_comment_count => false,
+        :tab_spaces => 2,
         :max_ttl => 3.days.to_s,
         :output_encoding => "utf-8"
       }.merge(config_hash)
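The new keys above are plain entries in the FeedTools.configurations hash, so the
proxy, IDN, and entry-sorting behavior added in 0.2.23 can presumably be adjusted
by mutating that hash before any feeds are opened. A sketch with made-up values
(the key names come from the diff above; the values do not):

    require 'feed_tools'

    # Sketch only: override a few of the new 0.2.23 configuration defaults.
    FeedTools.configurations[:proxy_address] = "proxy.example.com"
    FeedTools.configurations[:proxy_port] = 8080
    FeedTools.configurations[:entry_sorting_property] = "title"
    FeedTools.configurations[:idn_enabled] = false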
@@ -236,6 +267,9 @@ module FeedTools
       cache_class = eval(class_name)
       if cache_class.kind_of?(Class)
         @feed_cache = cache_class
+        if @feed_cache.respond_to? :initialize_cache
+          @feed_cache.initialize_cache
+        end
         return cache_class
       else
         return nil
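The lines added above give cache classes an optional setup hook: when
FeedTools.feed_cache resolves the configured class, it now calls initialize_cache
if the class responds to it. A sketch of a custom cache relying on that hook; the
rest of the cache interface (lookups, saves) is not shown in this diff and is
omitted here:

    # Hypothetical custom cache class, not part of the gem.
    class MyFeedCache
      def self.initialize_cache
        # e.g. create tables or warm an in-memory store
        @store ||= {}
      end
    end

    FeedTools.configurations[:feed_cache] = "MyFeedCache"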
@@ -258,376 +292,7 @@ module FeedTools
     rescue
       return false
     end
-  end
-
-  # Returns true if the html tidy module can be used.
-  #
-  # Obviously, you need the tidy gem installed in order to run with html
-  # tidy features turned on.
-  #
-  # This method does a fairly complicated, and probably unnecessarily
-  # desperate search for the libtidy library. If you want this thing to
-  # execute fast, the best thing to do is to set Tidy.path ahead of time.
-  # If Tidy.path is set, this method doesn't do much. If it's not set,
-  # it will do it's darnedest to find the libtidy library. If you set
-  # the LIBTIDYPATH environment variable to the libtidy library, it should
-  # be able to find it.
-  #
-  # Once the library is located, this method will run much faster.
-  def FeedTools.tidy_enabled?
-    # This is an override variable to keep tidy from being used even if it
-    # is available.
-    if FeedTools.configurations[:tidy_enabled] == false
-      return false
-    end
-    if @tidy_enabled.nil? || @tidy_enabled == false
-      @tidy_enabled = false
-      begin
-        require 'tidy'
-        if Tidy.path.nil?
-          # *Shrug*, just brute force it, I guess. There's a lot of places
-          # this thing might be hiding in, depending on platform and general
-          # sanity of the person who installed the thing. Most of these are
-          # probably unlikely, but it's not like checking unlikely locations
-          # hurts. Much. Especially if you actually find it.
-          libtidy_locations = [
-            '/usr/local/lib/libtidy.dylib',
-            '/opt/local/lib/libtidy.dylib',
-            '/usr/lib/libtidy.dylib',
-            '/usr/local/lib/tidylib.dylib',
-            '/opt/local/lib/tidylib.dylib',
-            '/usr/lib/tidylib.dylib',
-            '/usr/local/lib/tidy.dylib',
-            '/opt/local/lib/tidy.dylib',
-            '/usr/lib/tidy.dylib',
-            '/usr/local/lib/libtidy.so',
-            '/opt/local/lib/libtidy.so',
-            '/usr/lib/libtidy.so',
-            '/usr/local/lib/tidylib.so',
-            '/opt/local/lib/tidylib.so',
-            '/usr/lib/tidylib.so',
-            '/usr/local/lib/tidy.so',
-            '/opt/local/lib/tidy.so',
-            '/usr/lib/tidy.so',
-            'C:\Program Files\Tidy\tidy.dll',
-            'C:\Tidy\tidy.dll',
-            'C:\Ruby\bin\tidy.dll',
-            'C:\Ruby\tidy.dll',
-            '/usr/local/lib',
-            '/opt/local/lib',
-            '/usr/lib'
-          ]
-          # We just made this thing up, but if someone sets it, we'll
-          # go ahead and check it
-          unless ENV['LIBTIDYPATH'].nil?
-            libtidy_locations =
-              libtidy_locations.reverse.push(ENV['LIBTIDYPATH'])
-          end
-          for path in libtidy_locations
-            if File.exists? path
-              if File.ftype(path) == "file"
-                Tidy.path = path
-                @tidy_enabled = true
-                break
-              elsif File.ftype(path) == "directory"
-                # Ok, now perhaps we're getting a bit more desperate
-                lib_paths =
-                  `find #{path} -name '*tidy*' | grep '\\.\\(so\\|dylib\\)$'`
-                # If there's more than one, grab the first one and
-                # hope for the best, and if it doesn't work, then blame the
-                # user for not specifying more accurately.
-                tidy_path = lib_paths.split("\n").first
-                unless tidy_path.nil?
-                  Tidy.path = tidy_path
-                  @tidy_enabled = true
-                  break
-                end
-              end
-            end
-          end
-          # Still couldn't find it.
-          unless @tidy_enabled
-            @tidy_enabled = false
-          end
-        else
-          @tidy_enabled = true
-        end
-      rescue LoadError
-        # Tidy not installed, disable features that rely on tidy.
-        @tidy_enabled = false
-      end
-    end
-    return @tidy_enabled
-  end
-
-  # Attempts to ensures that the passed url is valid and sane. Accepts very, very ugly urls
-  # and makes every effort to figure out what it was supposed to be. Also translates from
-  # the feed: and rss: pseudo-protocols to the http: protocol.
-  def FeedTools.normalize_url(url)
-    if url.nil? || url == ""
-      return nil
-    end
-    normalized_url = url.strip
-
-    # if a url begins with the '/' character, it only makes sense that they
-    # meant to be using a file:// url. Fix it for them.
-    if normalized_url.length > 0 && normalized_url[0..0] == "/"
-      normalized_url = "file://" + normalized_url
-    end
-
-    # if a url begins with a drive letter followed by a colon, we're looking at
-    # a file:// url. Fix it for them.
-    if normalized_url.length > 0 &&
-        normalized_url.scan(/^[a-zA-Z]:[\\\/]/).size > 0
-      normalized_url = "file:///" + normalized_url
-    end
-
-    # if a url begins with javascript:, it's quite possibly an attempt at
-    # doing something malicious. Let's keep that from getting anywhere,
-    # shall we?
-    if (normalized_url.downcase =~ /javascript:/) != nil
-      return "#"
-    end
-
-    # deal with all of the many ugly possibilities involved in the rss:
-    # and feed: pseudo-protocols (incidentally, whose crazy idea was this
-    # mess?)
-    normalized_url.gsub!(/^http:\/*(feed:\/*)?/, "http://")
-    normalized_url.gsub!(/^http:\/*(rss:\/*)?/, "http://")
-    normalized_url.gsub!(/^feed:\/*(http:\/*)?/, "http://")
-    normalized_url.gsub!(/^rss:\/*(http:\/*)?/, "http://")
-    normalized_url.gsub!(/^file:\/*/, "file:///")
-    normalized_url.gsub!(/^https:\/*/, "https://")
-    # fix (very) bad urls (usually of the user-entered sort)
-    normalized_url.gsub!(/^http:\/*(http:\/*)*/, "http://")
-
-    if (normalized_url =~ /^file:/) == 0
-      # Adjust windows-style urls
-      normalized_url.gsub!(/^file:\/\/\/([a-zA-Z])\|/, 'file:///\1:')
-      normalized_url.gsub!(/\\/, '/')
-    else
-      if (normalized_url =~ /https?:\/\//) == nil
-        normalized_url = "http://" + normalized_url
-      end
-      if normalized_url == "http://"
-        return nil
-      end
-      begin
-        feed_uri = URI.parse(normalized_url)
-        if feed_uri.scheme == nil
-          feed_uri.scheme = "http"
-        end
-        if feed_uri.path == nil || feed_uri.path == ""
-          feed_uri.path = "/"
-        end
-        if (feed_uri.path =~ /^[\/]+/) == 0
-          feed_uri.path.gsub!(/^[\/]+/, "/")
-        end
-        feed_uri.host.downcase!
-        normalized_url = feed_uri.to_s
-      rescue URI::InvalidURIError
-      end
-    end
-
-    # We can't do a proper set of escaping, so this will
-    # have to do.
-    normalized_url.gsub!(/%20/, " ")
-    normalized_url.gsub!(/ /, "%20")
-
-    return normalized_url
-  end
-
-  # Converts a url into a tag uri
-  def FeedTools.build_tag_uri(url, date)
-    unless url.kind_of? String
-      raise ArgumentError, "Expected String, got #{url.class.name}"
-    end
-    unless date.kind_of? Time
-      raise ArgumentError, "Expected Time, got #{date.class.name}"
-    end
-    tag_uri = normalize_url(url)
-    unless FeedTools.is_uri?(tag_uri)
-      raise ArgumentError, "Must supply a valid URL."
-    end
-    host = URI.parse(tag_uri).host
-    tag_uri.gsub!(/^(http|ftp|file):\/*/, "")
-    tag_uri.gsub!(/#/, "/")
-    tag_uri = "tag:#{host},#{date.strftime('%Y-%m-%d')}:" +
-      "#{tag_uri[(tag_uri.index(host) + host.size)..-1]}"
-    return tag_uri
-  end
-
-  # Converts a url into a urn:uuid: uri
-  def FeedTools.build_urn_uri(url)
-    unless url.kind_of? String
-      raise ArgumentError, "Expected String, got #{url.class.name}"
-    end
-    normalized_url = normalize_url(url)
-    require 'uuidtools'
-    return UUID.sha1_create(UUID_URL_NAMESPACE, normalized_url).to_uri_string
-  end
-
-  # Returns true if the parameter appears to be a valid uri
-  def FeedTools.is_uri?(url)
-    return false if url.nil?
-    begin
-      uri = URI.parse(url)
-      if uri.scheme.nil? || uri.scheme == ""
-        return false
-      end
-    rescue URI::InvalidURIError
-      return false
-    end
-    return true
-  end
-
-  # Escapes all html entities
-  def FeedTools.escape_entities(html)
-    return nil if html.nil?
-    escaped_html = CGI.escapeHTML(html)
-    escaped_html.gsub!(/'/, "'")
-    escaped_html.gsub!(/"/, """)
-    return escaped_html
-  end
-
-  # Unescapes all html entities
-  def FeedTools.unescape_entities(html)
-    return nil if html.nil?
-    unescaped_html = html
-    unescaped_html.gsub!(/&/, "&")
-    unescaped_html.gsub!(/&/, "&")
-    unescaped_html = CGI.unescapeHTML(unescaped_html)
-    unescaped_html.gsub!(/'/, "'")
-    unescaped_html.gsub!(/"/, "\"")
-    return unescaped_html
-  end
-
-  # Removes all html tags from the html formatted text.
-  def FeedTools.strip_html(html)
-    return nil if html.nil?
-    # TODO: do this properly
-    # ======================
-    stripped_html = html.gsub(/<\/?[^>]+>/, "")
-    return stripped_html
-  end
-
-  # Tidys up the html
-  def FeedTools.tidy_html(html, options = {})
-    return nil if html.nil?
-    if FeedTools.tidy_enabled?
-      is_fragment = true
-      html.gsub!(/<!'/, "&lt;!'")
-      if (html.strip =~ /<html>(.|\n)*<body>/) != nil ||
-          (html.strip =~ /<\/body>(.|\n)*<\/html>$/) != nil
-        is_fragment = false
-      end
-      if (html.strip =~ /<\?xml(.|\n)*\?>/) != nil
-        is_fragment = false
-      end
-      tidy_html = Tidy.open(:show_warnings=>false) do |tidy|
-        tidy.options.output_xml = true
-        tidy.options.numeric_entities = true
-        tidy.options.markup = true
-        tidy.options.indent = false
-        tidy.options.wrap = 0
-        tidy.options.logical_emphasis = true
-        # TODO: Make this match the actual encoding of the feed
-        # =====================================================
-        tidy.options.input_encoding = "utf8"
-        tidy.options.output_encoding = "ascii"
-        tidy.options.ascii_chars = false
-        tidy.options.doctype = "omit"
-        xml = tidy.clean(html)
-        xml
-      end
-      if is_fragment
-        # Tidy sticks <html>...<body>[our html]</body>...</html> in.
-        # We don't want this.
-        tidy_html.strip!
-        tidy_html.gsub!(/^<html>(.|\n)*<body>/, "")
-        tidy_html.gsub!(/<\/body>(.|\n)*<\/html>$/, "")
-        tidy_html.strip!
-      end
-      tidy_html.gsub!(/&/, "&")
-      tidy_html.gsub!(/&/, "&")
-      tidy_html.gsub!(/\320\262\320\202\342\204\242/, "\342\200\231")
-
-    else
-      tidy_html = html
-    end
-    if tidy_html.blank? && !html.blank?
-      tidy_html = html.strip
-    end
-    return tidy_html
-  end
-
-  # Removes all dangerous html tags from the html formatted text.
-  # If mode is set to :escape, dangerous and unknown elements will
-  # be escaped. If mode is set to :strip, dangerous and unknown
-  # elements and all children will be removed entirely.
-  # Dangerous or unknown attributes are always removed.
-  def FeedTools.sanitize_html(html, mode=:strip)
-    return nil if html.nil?
-
-    # Lists borrowed from Mark Pilgrim's feedparser
-    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
-      'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
-      'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl',
-      'dt', 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4',
-      'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend',
-      'li', 'map', 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's',
-      'samp', 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup',
-      'table', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt',
-      'u', 'ul', 'var']
-
-    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
-      'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
-      'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
-      'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
-      'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
-      'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
-      'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
-      'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
-      'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
-      'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
-      'type', 'usemap', 'valign', 'value', 'vspace', 'width']
-
-    # Replace with appropriate named entities
-    html.gsub!(/&/, "&")
-    html.gsub!(/&/, "&")
-    html.gsub!(/<!'/, "&lt;!'")
-
-    # Hackity hack. But it works, and it seems plenty fast enough.
-    html_doc = HTree.parse_xml("<root>" + html + "</root>").to_rexml
-
-    sanitize_node = lambda do |html_node|
-      if html_node.respond_to? :children
-        for child in html_node.children
-          if child.kind_of? REXML::Element
-            unless acceptable_elements.include? child.name.downcase
-              if mode == :strip
-                html_node.delete_element(child)
-              else
-                new_child = REXML::Text.new(CGI.escapeHTML(child.to_s))
-                html_node.insert_after(child, new_child)
-                html_node.delete_element(child)
-              end
-            end
-            for attribute in child.attributes.keys
-              unless acceptable_attributes.include? attribute.downcase
-                child.delete_attribute(attribute)
-              end
-            end
-          end
-          sanitize_node.call(child)
-        end
-      end
-      html_node
-    end
-    sanitize_node.call(html_doc.root)
-    html = html_doc.root.inner_xml
-    return html
-  end
+  end
 
   # Creates a merged "planet" feed from a set of urls.
   #
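The helpers removed above appear to correspond to the new helper files in this
release (html_helper.rb, uri_helper.rb, xml_helper.rb) and to the changelog entry
"reorganized some of the helper modules". The module and method below are
assumptions by analogy with FeedTools::GenericHelper and FeedTools::XmlHelper,
which this diff does reference; the new helper files themselves are not shown
here:

    # Assumed relocation of the old FeedTools.normalize_url behavior.
    FeedTools::UriHelper.normalize_url("feed://example.com/index.rss")
    # expected: "http://example.com/index.rss"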
@@ -637,7 +302,7 @@ module FeedTools
   # in conjunction with the DatabaseFeedCache as it will
   # open multiple connections to the database.
   def FeedTools.build_merged_feed(url_array, options = {})
-    validate_options([ :multi_threaded ],
+    FeedTools::GenericHelper.validate_options([ :multi_threaded ],
      options.keys)
     options = { :multi_threaded => false }.merge(options)
     return nil if url_array.nil?
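The only change in this hunk is that build_merged_feed now calls the relocated
FeedTools::GenericHelper.validate_options; usage of the method itself is
unchanged. A sketch based on the signature shown here:

    # Merged "planet" feed from several urls; :multi_threaded is the only
    # option validated above and it defaults to false.
    planet = FeedTools.build_merged_feed(
      ['http://www.slashdot.org/index.rss', 'http://example.com/feed.xml'],
      :multi_threaded => false
    )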
@@ -930,19 +595,37 @@ module REXML # :nodoc:
         result << child.to_s
       end
     end
-    return result
+    return result.strip
   end
+  else
+    warn("inner_xml method already exists.")
   end
 
-
-
-
+  def base_uri # :nodoc:
+    begin
+      base_attribute = FeedTools::XmlHelper.try_xpaths(self, [
+        '@xml:base'
+      ])
+      if parent == nil || parent.kind_of?(REXML::Document)
+        return nil if base_attribute == nil
+        return base_attribute.value
+      end
+      if base_attribute != nil && parent == nil
+        return base_attribute.value
+      elsif parent != nil && base_attribute == nil
       return parent.base_uri
-      elsif parent
-
-
-
+      elsif parent != nil && base_attribute != nil
+        parent_base_uri = parent.base_uri
+        if parent_base_uri != nil
+          uri = URI.parse(parent_base_uri)
+          return (uri + base_attribute.value).to_s
+        else
+          return base_attribute.value
+        end
       end
+      return nil
+    rescue
+      return nil
     end
   end
 end
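The new base_uri method resolves a node's xml:base against its parent's base
using Ruby's standard URI addition, which is what makes the relative url
resolution mentioned in the changelog possible. A standalone illustration of
that URI arithmetic (stdlib only, not FeedTools code):

    require 'uri'

    parent_base = URI.parse("http://example.com/feeds/")
    (parent_base + "entries/").to_s        # => "http://example.com/feeds/entries/"
    (parent_base + "/absolute/path").to_s  # => "http://example.com/absolute/path"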