feedtools 0.2.22 → 0.2.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +28 -0
- data/README +23 -2
- data/db/migration.rb +19 -0
- data/db/schema.mysql.sql +1 -1
- data/db/schema.postgresql.sql +1 -1
- data/db/schema.sqlite.sql +1 -1
- data/lib/feed_tools.rb +71 -388
- data/lib/feed_tools/database_feed_cache.rb +4 -3
- data/lib/feed_tools/feed.rb +809 -607
- data/lib/feed_tools/feed_item.rb +551 -574
- data/lib/feed_tools/feed_structures.rb +252 -0
- data/lib/feed_tools/helpers/feed_tools_helper.rb +6 -5
- data/lib/feed_tools/helpers/generic_helper.rb +16 -158
- data/lib/feed_tools/helpers/html_helper.rb +629 -0
- data/lib/feed_tools/helpers/retrieval_helper.rb +5 -0
- data/lib/feed_tools/helpers/uri_helper.rb +223 -0
- data/lib/feed_tools/helpers/xml_helper.rb +239 -0
- data/rakefile +10 -237
- data/test/unit/amp_test.rb +102 -94
- data/test/unit/atom_test.rb +239 -6
- data/test/unit/cache_test.rb +1 -1
- data/test/unit/encoding_test.rb +5 -5
- data/test/unit/generation_test.rb +34 -1
- data/test/unit/helper_test.rb +111 -17
- data/test/unit/rss_test.rb +21 -2
- metadata +7 -3
- data/lib/feed_tools/helpers/module_helper.rb +0 -27
@@ -28,6 +28,11 @@ require 'net/http'
|
|
28
28
|
# require 'net/https'
|
29
29
|
# require 'net/ftp'
|
30
30
|
|
31
|
+
# Accept header value. The list of media types and their q-values was taken
# from the Universal Feed Parser.
FEED_TOOLS_ACCEPT_HEADER = [
  "application/atom+xml,application/rdf+xml,",
  "application/rss+xml,application/x-netcdf,application/xml;",
  "q=0.9,text/xml;q=0.2,*/*;q=0.1"
].join
|
35
|
+
|
31
36
|
# TODO: Refactor http_fetch and other methods.
|
32
37
|
module FeedTools
|
33
38
|
# Methods for pulling remote data
|
@@ -0,0 +1,223 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2005 Robert Aman
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
#++
|
23
|
+
|
24
|
+
require 'feed_tools'
|
25
|
+
require 'uri'
|
26
|
+
|
27
|
+
module FeedTools
|
28
|
+
# Generic url processing methods needed in numerous places throughout
|
29
|
+
# FeedTools
|
30
|
+
module UriHelper
|
31
|
+
# Reports whether the idn library can be loaded and converts a known
# internationalized host name to the expected ASCII form.
#
# Always returns false when FeedTools.configurations[:idn_enabled] is
# explicitly set to false.  A successful probe is remembered in
# @idn_enabled; an unsuccessful one is repeated on the next call.
def self.idn_enabled?
  # Configuration override: keeps idn from being used even if it is
  # available.
  return false if FeedTools.configurations[:idn_enabled] == false

  unless @idn_enabled
    @idn_enabled = false
    begin
      require 'idn'
      converted = IDN::Idna.toASCII('http://www.詹姆斯.com/')
      @idn_enabled = (converted == "http://www.xn--8ws00zhy3a.com/")
    rescue LoadError
      # The idn library is not installed; leave idn support disabled.
      @idn_enabled = false
    end
  end
  @idn_enabled
end
|
55
|
+
|
56
|
+
# Attempts to ensure that the passed url is valid and sane.  Accepts very,
# very ugly urls and makes every effort to figure out what it was supposed
# to be.  Also translates from the feed: and rss: pseudo-protocols to the
# http: protocol.
#
# url:: a String or a URI object (a URI is converted with to_s first).
#
# Returns the normalized url as a String.  Returns nil when the url is
# blank or consists of nothing but an http scheme, and "#" when the url
# contains "javascript:".
def self.normalize_url(url)
  url = url.to_s if url.kind_of?(URI)
  return nil if url.blank?
  normalized_url = CGI.unescape(url.strip)

  # if a url begins with the '/' character, it only makes sense that they
  # meant to be using a file:// url.  Fix it for them.
  if normalized_url[0..0] == "/"
    normalized_url = "file://" + normalized_url
  end

  # if a url begins with a drive letter followed by a colon, we're looking
  # at a file:// url.  Fix it for them.
  if normalized_url =~ /^[a-zA-Z]:[\\\/]/
    normalized_url = "file:///" + normalized_url
  end

  # a url containing javascript: is quite possibly an attempt at doing
  # something malicious.  The check deliberately matches anywhere in the
  # url, not only at the start.
  if (normalized_url.downcase =~ /javascript:/) != nil
    return "#"
  end

  # deal with all of the many ugly possibilities involved in the rss:
  # and feed: pseudo-protocols
  normalized_url.gsub!(/^http:\/*(feed:\/*)?/i, "http://")
  normalized_url.gsub!(/^http:\/*(rss:\/*)?/i, "http://")
  normalized_url.gsub!(/^feed:\/*(http:\/*)?/i, "http://")
  normalized_url.gsub!(/^rss:\/*(http:\/*)?/i, "http://")
  normalized_url.gsub!(/^file:\/*/i, "file:///")
  normalized_url.gsub!(/^https:\/*/i, "https://")
  # fix (very) bad urls (usually of the user-entered sort)
  normalized_url.gsub!(/^http:\/*(http:\/*)*/i, "http://")

  if (normalized_url =~ /^file:/i) == 0
    # Adjust windows-style urls
    normalized_url.gsub!(/^file:\/\/\/([a-zA-Z])\|/i, 'file:///\1:')
    normalized_url.gsub!(/\\/, '/')
  else
    if (normalized_url =~ /^https?:\/\//i) == nil
      normalized_url = "http://" + normalized_url
    end
    return nil if normalized_url == "http://"

    begin
      scheme, host_part, path =
        normalized_url.scan(/^(https?):\/\/([^\/]+)\/(.*)/i).flatten
      if scheme != nil && host_part != nil && path != nil
        scheme = scheme.downcase
        if FeedTools::UriHelper.idn_enabled?
          host_part = IDN::Idna.toASCII(host_part)
        end
        # Percent-escape every byte outside the 33..125 range.  Working on
        # bytes (rather than indexing the string) behaves the same on every
        # Ruby version, and the escape is always two hex digits ("%09",
        # never "%9").
        path = path.unpack("C*").map { |byte|
          (byte <= 32 || byte >= 126) ? format("%%%02X", byte) : byte.chr
        }.join
        normalized_url = scheme + "://" + host_part + "/" + path
      end
    rescue StandardError
      # Best effort only: if host conversion or escaping fails, carry on
      # with the url as it stood before this step.
    end

    begin
      feed_uri = URI.parse(normalized_url)
      feed_uri.scheme = "http" if feed_uri.scheme == nil
      feed_uri.path = "/" if feed_uri.path.blank?
      # Collapse any run of leading slashes into a single one.
      feed_uri.path = feed_uri.path.gsub(/^[\/]+/, "/")
      # Drop leading ".." segments.  The lookahead keeps real segment names
      # that merely start with two dots (e.g. "/..hidden") intact.
      while feed_uri.path =~ /^\/\.\.(?=\/|$)/
        feed_uri.path = feed_uri.path.sub(/^\/\.\./, "")
      end
      feed_uri.path = "/" if feed_uri.path.blank?
      # A url may parse without a host; only lowercase one that exists.
      feed_uri.host = feed_uri.host.downcase unless feed_uri.host.nil?
      normalized_url = feed_uri.to_s
    rescue URI::Error
      # Unparseable urls are returned in their partially normalized form.
    end
  end

  # We can't do a proper set of escaping, so this will
  # have to do.
  normalized_url.gsub!(/%20/, " ")
  normalized_url.gsub!(/ /, "%20")

  return normalized_url
end
|
164
|
+
|
165
|
+
# Resolves relative_uri against the first non-blank entry of
# base_uri_sources and returns the normalized result.
#
# Returns relative_uri untouched when base_uri_sources is blank or when
# resolution raises an error, and nil when relative_uri is nil.
def self.resolve_relative_uri(relative_uri, base_uri_sources=[])
  if base_uri_sources.blank?
    return relative_uri
  elsif relative_uri.nil?
    return nil
  end

  begin
    base = FeedTools::XmlHelper.select_not_blank(base_uri_sources)
    absolute = URI.parse(base) + relative_uri.to_s
    FeedTools::UriHelper.normalize_url(absolute.to_s)
  rescue
    # Fall back to the unresolved value on any error.
    relative_uri
  end
end
|
178
|
+
|
179
|
+
# Converts a url into a tag uri of the form
# "tag:<host>,<YYYY-MM-DD>:<rest of the url after the host>".
#
# url::  a String holding the url to convert.
# date:: a Time; only its year, month and day are used.
#
# Any '#' in the url is replaced with '/'.
#
# Raises ArgumentError when url is not a String, date is not a Time, the
# normalized url is not a valid uri, or the url has no host.
def self.build_tag_uri(url, date)
  unless url.kind_of? String
    raise ArgumentError, "Expected String, got #{url.class.name}"
  end
  unless date.kind_of? Time
    raise ArgumentError, "Expected Time, got #{date.class.name}"
  end
  tag_uri = normalize_url(url)
  unless FeedTools::UriHelper.is_uri?(tag_uri)
    raise ArgumentError, "Must supply a valid URL."
  end
  host = URI.parse(tag_uri).host
  # Urls without a host (file: urls, for instance) cannot be turned into a
  # tag uri; report that as a bad argument rather than failing on nil.
  if host.nil? || host.empty?
    raise ArgumentError, "Must supply a URL with a host."
  end
  # Strip the scheme (https included) so that the host lookup below cannot
  # match characters inside the scheme itself.
  remainder = tag_uri.sub(/^(https?|ftp|file):\/*/, "").gsub(/#/, "/")
  specific = remainder[(remainder.index(host) + host.size)..-1]
  return "tag:#{host},#{date.strftime('%Y-%m-%d')}:#{specific}"
end
|
198
|
+
|
199
|
+
# Converts a url into a urn:uuid: uri.  The url is normalized first and the
# uuid is a SHA1-based one created in UUID_URL_NAMESPACE.
#
# Raises ArgumentError when url is not a String.
def self.build_urn_uri(url)
  raise ArgumentError, "Expected String, got #{url.class.name}" unless url.kind_of?(String)

  canonical_url = normalize_url(url)
  # uuidtools is loaded only when a urn is actually requested.
  require 'uuidtools'
  UUID.sha1_create(UUID_URL_NAMESPACE, canonical_url).to_uri_string
end
|
208
|
+
|
209
|
+
# Returns true if the parameter parses as a uri and has a non-blank scheme.
# Returns false for nil, for a uri without a scheme, and for anything
# URI.parse rejects with URI::InvalidURIError.
def self.is_uri?(url)
  return false if url.nil?

  begin
    parsed = URI.parse(url)
    return !parsed.scheme.blank?
  rescue URI::InvalidURIError
    return false
  end
end
|
222
|
+
end
|
223
|
+
end
|
@@ -0,0 +1,239 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2005 Robert Aman
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
#++
|
23
|
+
|
24
|
+
require 'feed_tools'
|
25
|
+
require 'feed_tools/helpers/generic_helper'
|
26
|
+
require 'rexml/document'
|
27
|
+
|
28
|
+
module FeedTools
|
29
|
+
# Generic xml methods needed in numerous places throughout FeedTools
|
30
|
+
module XmlHelper
|
31
|
+
# Selects the first non-blank result.
#
# results:: an Array of candidates, or a single candidate.
# block::   optional; called with each candidate and expected to return
#           true when that candidate should be treated as blank.  Without
#           a block a candidate is blank when candidate.to_s.blank? is true.
#
# Returns the first candidate that is neither nil nor blank, or nil when
# there is none.
def self.select_not_blank(results, &block)
  # The blank test runs before the nil check, so a supplied block is also
  # called for nil candidates.
  blank_test = lambda do |candidate|
    block.nil? ? candidate.to_s.blank? : block.call(candidate)
  end

  if results.kind_of? Array
    results.each do |candidate|
      is_blank = blank_test.call(candidate)
      return candidate unless candidate.nil? || is_blank
    end
    return nil
  end

  is_blank = blank_test.call(results)
  (results.nil? || is_blank) ? nil : results
end
|
58
|
+
|
59
|
+
# Runs through a list of XPath queries on an element or document and
# returns the first non-blank result.  Subsequent XPath queries will
# not be evaluated.  Each query is tried with FEED_TOOLS_NAMESPACES first
# and then once more without a namespace mapping.
#
# options:: :select_result_value => true replaces a match with its value
#           (or with to_s when it does not respond to value).
# block::   optional; receives each candidate and returns true when the
#           candidate should be treated as blank.  Without a block,
#           candidate.to_s.blank? decides.
#
# A returned result that responds to strip is stripped in place.  Returns
# nil when element is nil or no query produced a non-blank result.
def self.try_xpaths(element, xpath_list, options={}, &block)
  FeedTools::GenericHelper.validate_options([ :select_result_value ],
    options.keys)
  options = { :select_result_value => false }.merge(options)

  return nil if element.nil?

  xpath_list.each do |xpath|
    # Namespace aware lookup first, namespace unaware lookup second.
    lookups = [[element, xpath, FEED_TOOLS_NAMESPACES], [element, xpath]]
    lookups.each do |arguments|
      candidate = REXML::XPath.liberal_first(*arguments)
      if options[:select_result_value] && !candidate.nil?
        candidate =
          candidate.respond_to?(:value) ? candidate.value : candidate.to_s
      end
      rejected = block_given? ? yield(candidate) : candidate.to_s.blank?
      next if rejected
      candidate.strip! if candidate.respond_to? :strip
      return candidate
    end
  end
  nil
end
|
120
|
+
|
121
|
+
# Runs through a list of XPath queries on an element or document and
# returns all matches of the first query that yields a non-blank result.
# Subsequent XPath queries will not be evaluated.
#
# Each query is tried with FEED_TOOLS_NAMESPACES and then without a
# namespace mapping.  If nothing matched at all, queries consisting only of
# word characters are compared case-insensitively against the names of the
# element's direct child elements.
#
# options:: :select_result_value => true maps matches to their value (or
#           to_s); in the child-name fallback, to inner_xml.
#
# Returns an Array, empty when element is nil or nothing matched.
def self.try_xpaths_all(element, xpath_list, options={})
  FeedTools::GenericHelper.validate_options([ :select_result_value ],
    options.keys)
  options = { :select_result_value => false }.merge(options)

  return [] if element.nil?

  # Converts matched nodes to plain values when the caller asked for them.
  extract = lambda do |nodes|
    if options[:select_result_value] && !nodes.nil? && !nodes.empty?
      nodes.map { |x| x.respond_to?(:value) ? x.value : x.to_s }
    else
      nodes
    end
  end

  xpath_list.each do |xpath|
    # Namespace aware
    matches = extract.call(
      REXML::XPath.liberal_match(element, xpath, FEED_TOOLS_NAMESPACES))
    return matches unless matches.blank?

    # Namespace unaware
    matches = extract.call(REXML::XPath.liberal_match(element, xpath))
    return matches unless matches.blank?
  end

  # Fallback: case-insensitive comparison against direct child names.
  xpath_list.each do |xpath|
    next unless xpath =~ /^\w+$/
    matches = element.children.select do |child|
      child.class == REXML::Element && child.name.downcase == xpath.downcase
    end
    if options[:select_result_value] && !matches.empty?
      matches = matches.map { |x| x.inner_xml }
    end
    return matches unless matches.blank?
  end
  []
end
|
177
|
+
|
178
|
+
# Runs through a list of XPath queries on an element or document and
# returns all non-empty results, appending the results from each query
# onto the end of the results from the previous queries.
#
# Each query is tried with FEED_TOOLS_NAMESPACES and, only when that gives
# nothing, without a namespace mapping.  Afterwards every query consisting
# only of word characters is also compared case-insensitively against the
# names of the element's direct child elements.  Duplicates are removed
# from the combined list.
#
# options:: :select_result_value => true maps matches to their value (or
#           to_s); for the child-name pass, to inner_xml.
#
# Returns an Array, empty when element is nil or nothing matched.
def self.combine_xpaths_all(element, xpath_list, options={})
  FeedTools::GenericHelper.validate_options([ :select_result_value ],
    options.keys)
  options = { :select_result_value => false }.merge(options)

  all_results = []
  return [] if element.nil?

  # Converts matched nodes to plain values when the caller asked for them.
  extract = lambda do |nodes|
    if options[:select_result_value] && !nodes.nil? && !nodes.empty?
      nodes.map { |x| x.respond_to?(:value) ? x.value : x.to_s }
    else
      nodes
    end
  end

  xpath_list.each do |xpath|
    # Namespace aware
    found = extract.call(
      REXML::XPath.liberal_match(element, xpath, FEED_TOOLS_NAMESPACES))
    if found.blank?
      # Namespace unaware
      found = extract.call(REXML::XPath.liberal_match(element, xpath))
    end
    all_results.concat(found) unless found.blank?
  end

  # Case-insensitive comparison against direct child names.
  xpath_list.each do |xpath|
    next unless xpath =~ /^\w+$/
    found = element.children.select do |child|
      child.class == REXML::Element && child.name.downcase == xpath.downcase
    end
    if options[:select_result_value] && !found.empty?
      found = found.map { |x| x.inner_xml }
    end
    all_results.concat(found) unless found.blank?
  end
  return all_results.uniq
end
|
238
|
+
end
|
239
|
+
end
|