feedtools 0.2.22 → 0.2.23
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +28 -0
- data/README +23 -2
- data/db/migration.rb +19 -0
- data/db/schema.mysql.sql +1 -1
- data/db/schema.postgresql.sql +1 -1
- data/db/schema.sqlite.sql +1 -1
- data/lib/feed_tools.rb +71 -388
- data/lib/feed_tools/database_feed_cache.rb +4 -3
- data/lib/feed_tools/feed.rb +809 -607
- data/lib/feed_tools/feed_item.rb +551 -574
- data/lib/feed_tools/feed_structures.rb +252 -0
- data/lib/feed_tools/helpers/feed_tools_helper.rb +6 -5
- data/lib/feed_tools/helpers/generic_helper.rb +16 -158
- data/lib/feed_tools/helpers/html_helper.rb +629 -0
- data/lib/feed_tools/helpers/retrieval_helper.rb +5 -0
- data/lib/feed_tools/helpers/uri_helper.rb +223 -0
- data/lib/feed_tools/helpers/xml_helper.rb +239 -0
- data/rakefile +10 -237
- data/test/unit/amp_test.rb +102 -94
- data/test/unit/atom_test.rb +239 -6
- data/test/unit/cache_test.rb +1 -1
- data/test/unit/encoding_test.rb +5 -5
- data/test/unit/generation_test.rb +34 -1
- data/test/unit/helper_test.rb +111 -17
- data/test/unit/rss_test.rb +21 -2
- metadata +7 -3
- data/lib/feed_tools/helpers/module_helper.rb +0 -27
@@ -28,6 +28,11 @@ require 'net/http'
|
|
28
28
|
# require 'net/https'
|
29
29
|
# require 'net/ftp'
|
30
30
|
|
31
|
+
# Value for the HTTP Accept header; the media-type list was borrowed from
# the Universal Feed Parser. The string is frozen so that this shared
# constant cannot be modified in place by accident.
FEED_TOOLS_ACCEPT_HEADER = (
  "application/atom+xml,application/rdf+xml," +
  "application/rss+xml,application/x-netcdf,application/xml;" +
  "q=0.9,text/xml;q=0.2,*/*;q=0.1"
).freeze
|
35
|
+
|
31
36
|
# TODO: Refactor http_fetch and other methods.
|
32
37
|
module FeedTools
|
33
38
|
# Methods for pulling remote data
|
@@ -0,0 +1,223 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2005 Robert Aman
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
#++
|
23
|
+
|
24
|
+
require 'feed_tools'
|
25
|
+
require 'uri'
|
26
|
+
|
27
|
+
module FeedTools
|
28
|
+
# Generic url processing methods needed in numerous places throughout
|
29
|
+
# FeedTools
|
30
|
+
module UriHelper
|
31
|
+
# Returns true if the idn module can be used.
#
# FeedTools.configurations[:idn_enabled] acts as an override: when it is
# explicitly false, this method reports false even if the idn library is
# available. Otherwise the library is loaded and probed once, and the
# outcome (positive or negative) is memoized in @idn_enabled so that a
# failing require is not repeated on every call.
def self.idn_enabled?
  # This is an override variable to keep idn from being used even if it
  # is available.
  return false if FeedTools.configurations[:idn_enabled] == false

  if @idn_enabled.nil?
    @idn_enabled =
      begin
        require 'idn'
        # Only trust the library if it converts a known internationalized
        # name into the expected punycode form.
        IDN::Idna.toASCII('http://www.詹姆斯.com/') ==
          "http://www.xn--8ws00zhy3a.com/"
      rescue LoadError
        # idn not installed, disable features that rely on idn.
        false
      end
  end
  return @idn_enabled
end
|
55
|
+
|
56
|
+
# Attempts to ensure that the passed url is valid and sane. Accepts very,
# very ugly urls and makes every effort to figure out what it was supposed
# to be. Also translates from the feed: and rss: pseudo-protocols to the
# http: protocol.
#
# url may be a String or a URI object. Returns nil for a blank url or a
# bare "http://", returns "#" for anything containing "javascript:", and
# otherwise returns the normalized url as a String.
def self.normalize_url(url)
  url = url.to_s if url.kind_of?(URI)
  return nil if url.blank?
  normalized_url = CGI.unescape(url.strip)

  # if a url begins with the '/' character, it only makes sense that they
  # meant to be using a file:// url. Fix it for them.
  if normalized_url[0..0] == "/"
    normalized_url = "file://" + normalized_url
  end

  # if a url begins with a drive letter followed by a colon, we're looking
  # at a file:// url. Fix it for them.
  if normalized_url =~ /^[a-zA-Z]:[\\\/]/
    normalized_url = "file:///" + normalized_url
  end

  # if a url contains javascript:, it's quite possibly an attempt at
  # doing something malicious. Let's keep that from getting anywhere,
  # shall we?
  return "#" if normalized_url.downcase =~ /javascript:/

  # deal with all of the many ugly possibilities involved in the rss:
  # and feed: pseudo-protocols
  normalized_url.gsub!(/^http:\/*(feed:\/*)?/i, "http://")
  normalized_url.gsub!(/^http:\/*(rss:\/*)?/i, "http://")
  normalized_url.gsub!(/^feed:\/*(http:\/*)?/i, "http://")
  normalized_url.gsub!(/^rss:\/*(http:\/*)?/i, "http://")
  normalized_url.gsub!(/^file:\/*/i, "file:///")
  normalized_url.gsub!(/^https:\/*/i, "https://")
  # fix (very) bad urls (usually of the user-entered sort)
  normalized_url.gsub!(/^http:\/*(http:\/*)*/i, "http://")

  if (normalized_url =~ /^file:/i) == 0
    # Adjust windows-style urls
    normalized_url.gsub!(/^file:\/\/\/([a-zA-Z])\|/i, 'file:///\1:')
    normalized_url.gsub!(/\\/, '/')
  else
    if (normalized_url =~ /^https?:\/\//i) == nil
      normalized_url = "http://" + normalized_url
    end
    return nil if normalized_url == "http://"

    # Best effort: lower-case the scheme, convert the host with IDN when
    # available, and percent-escape bytes of the path that are not
    # printable ASCII. Any failure leaves the url as it was.
    begin
      scheme, host_part, path =
        normalized_url.scan(/^(https?):\/\/([^\/]+)\/(.*)/i).flatten
      if scheme && host_part && path
        if FeedTools::UriHelper.idn_enabled?
          host_part = IDN::Idna.toASCII(host_part)
        end
        # Work on bytes rather than characters so multi-byte characters
        # are escaped byte by byte, and zero-pad so that e.g. a tab
        # becomes "%09" rather than "%9".
        escaped_path = path.bytes.map do |byte|
          if byte <= 32 || byte >= 126
            format("%%%02X", byte)
          else
            byte.chr
          end
        end.join
        normalized_url =
          scheme.downcase + "://" + host_part + "/" + escaped_path
      end
    rescue StandardError
      # Deliberately ignored; the unescaped url is used instead.
    end

    begin
      feed_uri = URI.parse(normalized_url)
      feed_uri.scheme = "http" if feed_uri.scheme.nil?
      # Collapse repeated leading slashes, then drop leading ".." segments
      # (whole segments only, so a name such as "..foo" is left alone).
      clean_path = feed_uri.path.to_s.sub(/\A\/+/, "/")
      while clean_path =~ /\A\/\.\.(?=\/|\z)/
        clean_path = clean_path.sub(/\A\/\.\.(?=\/|\z)/, "")
      end
      clean_path = "/" if clean_path.empty?
      feed_uri.path = clean_path
      feed_uri.host = feed_uri.host.downcase unless feed_uri.host.nil?
      normalized_url = feed_uri.to_s
    rescue URI::Error
      # Not parseable as a URI; keep what we have so far.
    end
  end

  # We can't do a proper set of escaping, so this will
  # have to do.
  return normalized_url.gsub(" ", "%20")
end
|
164
|
+
|
165
|
+
# Resolves relative_uri against the first non-blank entry selected from
# base_uri_sources and returns the normalized result.
#
# When base_uri_sources is blank the relative uri is handed back as is;
# a nil relative_uri yields nil. If resolution fails for any reason the
# original relative_uri is returned unchanged.
def self.resolve_relative_uri(relative_uri, base_uri_sources=[])
  if base_uri_sources.blank?
    return relative_uri
  elsif relative_uri.nil?
    return nil
  end

  begin
    base_source = FeedTools::XmlHelper.select_not_blank(base_uri_sources)
    absolute_uri = URI.parse(base_source) + relative_uri.to_s
    FeedTools::UriHelper.normalize_url(absolute_uri.to_s)
  rescue
    relative_uri
  end
end
|
178
|
+
|
179
|
+
# Converts a url into a tag uri of the form
# "tag:<host>,<YYYY-MM-DD>:<rest of the url after the host>", with any
# '#' in the url replaced by '/'.
#
# url must be a String and date must be a Time. Raises ArgumentError when
# either argument has the wrong type, when the normalized url is not a
# valid uri, or when it has no host to build the tag authority from.
def self.build_tag_uri(url, date)
  unless url.kind_of? String
    raise ArgumentError, "Expected String, got #{url.class.name}"
  end
  unless date.kind_of? Time
    raise ArgumentError, "Expected Time, got #{date.class.name}"
  end
  tag_uri = normalize_url(url)
  unless FeedTools::UriHelper.is_uri?(tag_uri)
    raise ArgumentError, "Must supply a valid URL."
  end
  host = URI.parse(tag_uri).host
  # A url without a host (e.g. file:///...) cannot produce a meaningful
  # tag authority, so reject it explicitly instead of failing later.
  if host.nil? || host.empty?
    raise ArgumentError, "Must supply a valid URL."
  end
  specific = tag_uri.sub(/^(https?|ftp|file):\/*/, "").gsub(/#/, "/")
  # Keep only what follows the host name.
  specific = specific[(specific.index(host) + host.size)..-1]
  return "tag:#{host},#{date.strftime('%Y-%m-%d')}:#{specific}"
end
|
198
|
+
|
199
|
+
# Converts a url into a urn:uuid: uri by normalizing it and deriving a
# SHA1-based UUID from it within UUID_URL_NAMESPACE.
#
# Raises ArgumentError unless url is a String. The uuidtools library is
# loaded lazily, on first use.
def self.build_urn_uri(url)
  if !url.kind_of?(String)
    raise ArgumentError, "Expected String, got #{url.class.name}"
  end
  normalized_url = normalize_url(url)
  require 'uuidtools'
  uuid = UUID.sha1_create(UUID_URL_NAMESPACE, normalized_url)
  uuid.to_uri_string
end
|
208
|
+
|
209
|
+
# Returns true if the parameter appears to be a valid uri, meaning that
# it parses as a URI and carries a non-blank scheme. nil and unparseable
# values yield false.
def self.is_uri?(url)
  return false if url.nil?
  begin
    !URI.parse(url).scheme.blank?
  rescue URI::InvalidURIError
    false
  end
end
|
222
|
+
end
|
223
|
+
end
|
@@ -0,0 +1,239 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2005 Robert Aman
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
#++
|
23
|
+
|
24
|
+
require 'feed_tools'
|
25
|
+
require 'feed_tools/helpers/generic_helper'
|
26
|
+
require 'rexml/document'
|
27
|
+
|
28
|
+
module FeedTools
|
29
|
+
# Generic xml methods needed in numerous places throughout FeedTools
|
30
|
+
module XmlHelper
|
31
|
+
# Selects the first non-blank result.
#
# results may be a single value or an Array of candidates. A candidate is
# considered blank when the supplied block returns a true value for it or,
# without a block, when its string form is blank. nil candidates are never
# selected. Returns nil if nothing qualifies.
def self.select_not_blank(results, &block)
  candidates = results.kind_of?(Array) ? results : [results]
  candidates.each do |candidate|
    is_blank = block.nil? ? candidate.to_s.blank? : block.call(candidate)
    return candidate unless candidate.nil? || is_blank
  end
  nil
end
|
58
|
+
|
59
|
+
# Runs through a list of XPath queries on an element or document and
# returns the first non-blank result. Subsequent XPath queries will
# not be evaluated.
#
# Each query is tried namespace aware first (using FEED_TOOLS_NAMESPACES)
# and then namespace unaware. With :select_result_value => true the match
# is reduced to its value (or its string form). An optional block decides
# whether a result counts as blank; without one the string form is tested.
# Returns nil when element is nil or nothing matches.
def self.try_xpaths(element, xpath_list, options={}, &block)
  FeedTools::GenericHelper.validate_options([ :select_result_value ],
                                            options.keys)
  options = { :select_result_value => false }.merge(options)

  return nil if element.nil?

  xpath_list.each do |xpath|
    [true, false].each do |namespace_aware|
      match =
        if namespace_aware
          REXML::XPath.liberal_first(element, xpath, FEED_TOOLS_NAMESPACES)
        else
          REXML::XPath.liberal_first(element, xpath)
        end
      if options[:select_result_value] && !match.nil?
        match = match.respond_to?(:value) ? match.value : match.to_s
      end
      is_blank = block_given? ? yield(match) : match.to_s.blank?
      next if is_blank
      match.strip! if match.respond_to?(:strip)
      return match
    end
  end
  nil
end
|
120
|
+
|
121
|
+
# Runs through a list of XPath queries on an element or document and
# returns all results of the first query that yields anything. Subsequent
# XPath queries will not be evaluated.
#
# Each query is tried namespace aware first, then namespace unaware. If no
# query matches, queries that are plain names are compared against the
# element's direct child elements, ignoring case. With
# :select_result_value => true the matches are converted to values.
# Returns an empty Array when element is nil or nothing matches.
def self.try_xpaths_all(element, xpath_list, options={})
  FeedTools::GenericHelper.validate_options([ :select_result_value ],
                                            options.keys)
  options = { :select_result_value => false }.merge(options)

  return [] if element.nil?

  xpath_list.each do |xpath|
    [true, false].each do |namespace_aware|
      matches =
        if namespace_aware
          REXML::XPath.liberal_match(element, xpath, FEED_TOOLS_NAMESPACES)
        else
          REXML::XPath.liberal_match(element, xpath)
        end
      if options[:select_result_value] && !matches.nil? && !matches.empty?
        matches = matches.map do |node|
          node.respond_to?(:value) ? node.value : node.to_s
        end
      end
      return matches unless matches.blank?
    end
  end

  # Fallback: case-insensitive comparison against direct child elements.
  xpath_list.each do |xpath|
    next unless xpath =~ /^\w+$/
    matches = element.children.select do |child|
      child.class == REXML::Element &&
        child.name.downcase == xpath.downcase
    end
    if options[:select_result_value] && !matches.empty?
      matches = matches.map { |node| node.inner_xml }
    end
    return matches unless matches.blank?
  end
  []
end
|
177
|
+
|
178
|
+
# Runs through a list of XPath queries on an element or document and
# returns all non-empty results, appending the results from each query
# onto the end of the results from the previous queries.
#
# Each query is tried namespace aware first and namespace unaware only if
# that produced nothing. Afterwards, queries that are plain names are also
# compared against the element's direct child elements, ignoring case.
# Duplicates are removed from the combined list before it is returned.
# Returns an empty Array when element is nil.
def self.combine_xpaths_all(element, xpath_list, options={})
  FeedTools::GenericHelper.validate_options([ :select_result_value ],
                                            options.keys)
  options = { :select_result_value => false }.merge(options)

  return [] if element.nil?

  combined = []
  xpath_list.each do |xpath|
    [true, false].each do |namespace_aware|
      matches =
        if namespace_aware
          REXML::XPath.liberal_match(element, xpath, FEED_TOOLS_NAMESPACES)
        else
          REXML::XPath.liberal_match(element, xpath)
        end
      if options[:select_result_value] && !matches.nil? && !matches.empty?
        matches = matches.map do |node|
          node.respond_to?(:value) ? node.value : node.to_s
        end
      end
      unless matches.blank?
        combined.concat(matches)
        # The namespace unaware pass is only needed when this one is empty.
        break
      end
    end
  end

  # Case-insensitive comparison against direct child elements.
  xpath_list.each do |xpath|
    next unless xpath =~ /^\w+$/
    matches = element.children.select do |child|
      child.class == REXML::Element &&
        child.name.downcase == xpath.downcase
    end
    if options[:select_result_value] && !matches.empty?
      matches = matches.map { |node| node.inner_xml }
    end
    combined.concat(matches) unless matches.blank?
  end
  combined.uniq
end
|
238
|
+
end
|
239
|
+
end
|