feedtools 0.2.22 → 0.2.23

Sign up to get free protection for your applications and to get access to all the features.
@@ -28,6 +28,11 @@ require 'net/http'
28
28
  # require 'net/https'
29
29
  # require 'net/ftp'
30
30
 
31
# Accept header sent with feed requests. The list of media types and their
# q-values is borrowed from the Universal Feed Parser.
FEED_TOOLS_ACCEPT_HEADER =
  "application/atom+xml,application/rdf+xml," \
  "application/rss+xml,application/x-netcdf,application/xml;" \
  "q=0.9,text/xml;q=0.2,*/*;q=0.1"
35
+
31
36
  # TODO: Refactor http_fetch and other methods.
32
37
  module FeedTools
33
38
  # Methods for pulling remote data
@@ -0,0 +1,223 @@
1
+ #--
2
+ # Copyright (c) 2005 Robert Aman
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #++
23
+
24
+ require 'feed_tools'
25
+ require 'uri'
26
+
27
+ module FeedTools
28
+ # Generic url processing methods needed in numerous places throughout
29
+ # FeedTools
30
+ module UriHelper
31
+ # Returns true if the idn module can be used.
32
def self.idn_enabled?
  # Reports whether the idn library can be used: it must load and must
  # convert a known internationalized host name to the expected ASCII form.
  #
  # Returns false unconditionally when the :idn_enabled configuration entry
  # is explicitly set to false, even if the library is available.

  # This is an override variable to keep idn from being used even if it
  # is available.
  return false if FeedTools.configurations[:idn_enabled] == false

  # Probe only once. A failed probe is cached as well, so a missing idn
  # library does not cost a failing require on every single call.
  if @idn_enabled.nil?
    begin
      require 'idn'
      # Sanity check: only trust the library if a known conversion works.
      @idn_enabled =
        (IDN::Idna.toASCII('http://www.詹姆斯.com/') ==
          "http://www.xn--8ws00zhy3a.com/")
    rescue LoadError
      # idn not installed, disable features that rely on idn.
      @idn_enabled = false
    end
  end
  return @idn_enabled
end
55
+
56
+ # Attempts to ensure that the passed url is valid and sane. Accepts very,
57
+ # very ugly urls and makes every effort to figure out what it was supposed
58
+ # to be. Also translates from the feed: and rss: pseudo-protocols to the
59
+ # http: protocol.
60
def self.normalize_url(url)
  # Cleans up a (possibly very sloppy) url and returns it as a String.
  #
  # url - a String or URI. The argument itself is never modified.
  #
  # Returns nil for a blank url or when nothing but "http://" is left,
  # "#" when the url contains "javascript:", and otherwise the normalized
  # url. feed: and rss: pseudo-protocols are rewritten to http:, local paths
  # become file:/// urls, the host is lower-cased and unsafe bytes in the
  # path are percent-escaped.
  url = url.to_s if url.kind_of?(URI)
  return nil if url.blank?
  normalized_url = CGI.unescape(url.strip)

  # if a url begins with the '/' character, it only makes sense that they
  # meant to be using a file:// url. Fix it for them.
  if normalized_url[0..0] == "/"
    normalized_url = "file://" + normalized_url
  end

  # if a url begins with a drive letter followed by a colon, we're looking at
  # a file:// url. Fix it for them.
  if normalized_url =~ /^[a-zA-Z]:[\\\/]/
    normalized_url = "file:///" + normalized_url
  end

  # a url containing javascript: is quite possibly an attempt at doing
  # something malicious. Note that the check matches anywhere in the url,
  # not just at the start.
  return "#" if normalized_url.downcase =~ /javascript:/

  # deal with all of the many ugly possibilities involved in the rss:
  # and feed: pseudo-protocols
  normalized_url.gsub!(/^http:\/*(feed:\/*)?/i, "http://")
  normalized_url.gsub!(/^http:\/*(rss:\/*)?/i, "http://")
  normalized_url.gsub!(/^feed:\/*(http:\/*)?/i, "http://")
  normalized_url.gsub!(/^rss:\/*(http:\/*)?/i, "http://")
  normalized_url.gsub!(/^file:\/*/i, "file:///")
  normalized_url.gsub!(/^https:\/*/i, "https://")
  # fix (very) bad urls (usually of the user-entered sort)
  normalized_url.gsub!(/^http:\/*(http:\/*)*/i, "http://")

  if normalized_url =~ /\Afile:/i
    # Adjust windows-style urls
    normalized_url.gsub!(/^file:\/\/\/([a-zA-Z])\|/i, 'file:///\1:')
    normalized_url.gsub!(/\\/, '/')
  else
    unless normalized_url =~ /^https?:\/\//i
      normalized_url = "http://" + normalized_url
    end
    return nil if normalized_url == "http://"

    begin
      scheme, host_part, path =
        normalized_url.scan(/^(https?):\/\/([^\/]+)\/(.*)/i).flatten
      if scheme && host_part && path
        scheme = scheme.downcase
        if FeedTools::UriHelper.idn_enabled?
          host_part = IDN::Idna.toASCII(host_part)
        end
        # Escape byte by byte so this works regardless of how the running
        # Ruby indexes strings, and always emit two hex digits per byte.
        escaped_path = path.unpack("C*").map do |byte|
          if byte <= 32 || byte >= 126
            format("%%%02X", byte)
          else
            byte.chr
          end
        end.join
        normalized_url = scheme + "://" + host_part + "/" + escaped_path
      end
    rescue StandardError
      # Best effort only: if the host conversion or escaping fails, carry on
      # with the url as it was before this step.
    end

    begin
      feed_uri = URI.parse(normalized_url)
      feed_uri.scheme = "http" if feed_uri.scheme.nil?
      feed_uri.path = "/" if feed_uri.path.blank?

      # Collapse duplicate leading slashes, then drop leading ".." segments.
      # Only whole ".." segments are removed, so a name such as "..hidden"
      # is left alone instead of being glued onto the host.
      clean_path = feed_uri.path.gsub(/^[\/]+/, "/")
      while clean_path =~ /^\/\.\.(?=\/|$)/
        clean_path = clean_path.sub(/^\/\.\.(?=\/|$)/, "")
      end
      clean_path = "/" if clean_path.blank?
      feed_uri.path = clean_path if clean_path != feed_uri.path

      # The host may be missing for odd input; only touch it when present.
      host = feed_uri.host
      if !host.nil? && host != host.downcase
        feed_uri.host = host.downcase
      end
      normalized_url = feed_uri.to_s
    rescue URI::Error
      # The url could not be parsed or rebuilt; keep the string form.
    end
  end

  # We can't do a proper set of escaping, so this will
  # have to do: every literal space becomes %20.
  return normalized_url.gsub(" ", "%20")
end
164
+
165
+ # Resolves a relative uri
166
def self.resolve_relative_uri(relative_uri, base_uri_sources=[])
  # Resolves relative_uri against the first non-blank entry of
  # base_uri_sources and returns the normalized absolute url.
  #
  # Returns relative_uri unchanged when no base sources are given or when
  # resolution fails for any reason, and nil when relative_uri is nil (and
  # base sources were given).
  if base_uri_sources.blank?
    return relative_uri
  elsif relative_uri.nil?
    return nil
  end

  begin
    chosen_base = FeedTools::XmlHelper.select_not_blank(base_uri_sources)
    absolute = URI.parse(chosen_base) + relative_uri.to_s
    FeedTools::UriHelper.normalize_url(absolute.to_s)
  rescue
    # Unparseable base, failed join, ...: fall back to the input as given.
    relative_uri
  end
end
178
+
179
+ # Converts a url into a tag uri
180
def self.build_tag_uri(url, date)
  # Builds a tag uri of the form "tag:<host>,<YYYY-MM-DD>:<rest>", where
  # <rest> is whatever follows the host in the normalized url, with every
  # '#' replaced by '/'.
  #
  # url  - String url; it is normalized first.
  # date - Time used for the date part.
  #
  # Raises ArgumentError when url is not a String, date is not a Time, or
  # the normalized url is not a uri with a host.
  unless url.kind_of? String
    raise ArgumentError, "Expected String, got #{url.class.name}"
  end
  unless date.kind_of? Time
    raise ArgumentError, "Expected Time, got #{date.class.name}"
  end
  tag_uri = normalize_url(url)
  unless FeedTools::UriHelper.is_uri?(tag_uri)
    raise ArgumentError, "Must supply a valid URL."
  end
  host = URI.parse(tag_uri).host
  # A tag uri needs an authority; urls without a host (file:///...) cannot
  # be converted, so reject them with the same error as other bad urls.
  if host.nil? || host.empty?
    raise ArgumentError, "Must supply a valid URL."
  end
  # Strip the scheme (https included) before searching for the host, so the
  # host is never accidentally found inside the scheme text itself.
  remainder = tag_uri.gsub(/^(https?|ftp|file):\/*/, "").gsub(/#/, "/")
  remainder = remainder[(remainder.index(host) + host.size)..-1]
  return "tag:#{host},#{date.strftime('%Y-%m-%d')}:#{remainder}"
end
198
+
199
+ # Converts a url into a urn:uuid: uri
200
def self.build_urn_uri(url)
  # Turns a url into the uri string of a SHA1-based UUID computed from
  # UUID_URL_NAMESPACE and the normalized url.
  #
  # Raises ArgumentError when url is not a String.
  raise ArgumentError, "Expected String, got #{url.class.name}" unless url.kind_of?(String)

  canonical_url = normalize_url(url)
  # uuidtools is only loaded once a urn is actually requested.
  require 'uuidtools'
  uuid = UUID.sha1_create(UUID_URL_NAMESPACE, canonical_url)
  uuid.to_uri_string
end
208
+
209
+ # Returns true if the parameter appears to be a valid uri
210
def self.is_uri?(url)
  # True when url parses as a uri and carries a non-blank scheme; false for
  # nil, for input URI.parse rejects as invalid, and for scheme-less input.
  if url.nil?
    false
  else
    begin
      !URI.parse(url).scheme.blank?
    rescue URI::InvalidURIError
      false
    end
  end
end
222
+ end
223
+ end
@@ -0,0 +1,239 @@
1
+ #--
2
+ # Copyright (c) 2005 Robert Aman
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #++
23
+
24
+ require 'feed_tools'
25
+ require 'feed_tools/helpers/generic_helper'
26
+ require 'rexml/document'
27
+
28
+ module FeedTools
29
+ # Generic xml methods needed in numerous places throughout FeedTools
30
+ module XmlHelper
31
+ # Selects the first non-blank result.
32
def self.select_not_blank(results, &block)
  # Returns the first entry of results that is neither nil nor blank, or nil
  # when there is none. A non-Array argument is treated as a single entry.
  #
  # When a block is given it decides blankness (truthy means blank);
  # otherwise an entry is blank when its to_s is blank.
  candidates = results.kind_of?(Array) ? results : [results]
  candidates.each do |candidate|
    is_blank =
      if block.nil?
        candidate.to_s.blank?
      else
        block.call(candidate)
      end
    return candidate unless candidate.nil? || is_blank
  end
  nil
end
58
+
59
+ # Runs through a list of XPath queries on an element or document and
60
+ # returns the first non-blank result. Subsequent XPath queries will
61
+ # not be evaluated.
62
def self.try_xpaths(element, xpath_list,
    options={}, &block)
  # Tries each xpath in order, first with FEED_TOOLS_NAMESPACES and then
  # without namespaces, and returns the first result that is not blank.
  # Returns nil when element is nil or nothing matched.
  #
  # options[:select_result_value] - when true, a non-nil match is replaced
  #   by its #value (if it has one) or its to_s before the blank test.
  # block - optional blankness test; a truthy return value means "blank".
  #   Without a block a result is blank when its to_s is blank.
  #
  # A result that responds to strip is stripped in place before returning.
  FeedTools::GenericHelper.validate_options([ :select_result_value ],
    options.keys)
  options = { :select_result_value => false }.merge(options)
  return nil if element.nil?

  xpath_list.each do |xpath|
    # Pass one is namespace aware, pass two is namespace unaware.
    [[FEED_TOOLS_NAMESPACES], []].each do |namespace_args|
      found = REXML::XPath.liberal_first(element, xpath, *namespace_args)
      if options[:select_result_value] && !found.nil?
        found = found.respond_to?(:value) ? found.value : found.to_s
      end
      is_blank = block_given? ? yield(found) : found.to_s.blank?
      next if is_blank
      found.strip! if found.respond_to?(:strip)
      return found
    end
  end
  nil
end
120
+
121
+ # Runs through a list of XPath queries on an element or document and
122
+ # returns all non-empty results. Subsequent XPath queries will
123
+ # not be evaluated.
124
def self.try_xpaths_all(element, xpath_list, options={})
  # Returns the matches of the first query that yields a non-blank result
  # list, or [] when element is nil or nothing matched.
  #
  # Each xpath is tried with FEED_TOOLS_NAMESPACES first and without
  # namespaces second. If no xpath matched at all, every xpath that is a
  # plain word is compared case-insensitively against the names of the
  # element's direct REXML::Element children.
  #
  # options[:select_result_value] - when true, xpath matches are mapped to
  #   their #value (or to_s) and child-name matches to their inner_xml.
  FeedTools::GenericHelper.validate_options([ :select_result_value ],
    options.keys)
  options = { :select_result_value => false }.merge(options)
  return [] if element.nil?

  want_values = options[:select_result_value]

  xpath_list.each do |xpath|
    # Pass one is namespace aware, pass two is namespace unaware.
    [[FEED_TOOLS_NAMESPACES], []].each do |namespace_args|
      matches = REXML::XPath.liberal_match(element, xpath, *namespace_args)
      if want_values && !matches.nil? && !matches.empty?
        matches = matches.map { |m| m.respond_to?(:value) ? m.value : m.to_s }
      end
      return matches unless matches.blank?
    end
  end

  # Last resort: match simple names against direct child elements.
  xpath_list.each do |xpath|
    next unless xpath =~ /^\w+$/
    matches = element.children.select do |child|
      child.class == REXML::Element &&
        child.name.downcase == xpath.downcase
    end
    if want_values && !matches.empty?
      matches = matches.map { |m| m.inner_xml }
    end
    return matches unless matches.blank?
  end
  []
end
177
+
178
+ # Runs through a list of XPath queries on an element or document and
179
+ # returns all non-empty results, appending the results from each query
180
+ # onto the end of the results from the previous queries.
181
def self.combine_xpaths_all(element, xpath_list, options={})
  # Collects the matches of every query in xpath_list, in order, and returns
  # them with duplicates removed. Returns [] when element is nil.
  #
  # For each xpath the namespace aware query (FEED_TOOLS_NAMESPACES) is used
  # if it finds anything, otherwise the namespace unaware one. Afterwards
  # every xpath that is a plain word is also compared case-insensitively
  # against the names of the element's direct REXML::Element children.
  #
  # options[:select_result_value] - when true, xpath matches are mapped to
  #   their #value (or to_s) and child-name matches to their inner_xml.
  FeedTools::GenericHelper.validate_options([ :select_result_value ],
    options.keys)
  options = { :select_result_value => false }.merge(options)
  return [] if element.nil?

  want_values = options[:select_result_value]
  collected = []

  xpath_list.each do |xpath|
    # Pass one is namespace aware, pass two is namespace unaware; the second
    # pass only runs when the first one came back blank.
    [[FEED_TOOLS_NAMESPACES], []].each do |namespace_args|
      matches = REXML::XPath.liberal_match(element, xpath, *namespace_args)
      if want_values && !matches.nil? && !matches.empty?
        matches = matches.map { |m| m.respond_to?(:value) ? m.value : m.to_s }
      end
      unless matches.blank?
        collected.concat(matches)
        break
      end
    end
  end

  # Additionally match simple names against direct child elements.
  xpath_list.each do |xpath|
    next unless xpath =~ /^\w+$/
    matches = element.children.select do |child|
      child.class == REXML::Element &&
        child.name.downcase == xpath.downcase
    end
    if want_values && !matches.empty?
      matches = matches.map { |m| m.inner_xml }
    end
    collected.concat(matches) unless matches.blank?
  end
  collected.uniq
end
238
+ end
239
+ end