feedtools 0.2.22 → 0.2.23

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -28,6 +28,11 @@ require 'net/http'
  # require 'net/https'
  # require 'net/ftp'

+ # Stolen from the Universal Feed Parser
+ FEED_TOOLS_ACCEPT_HEADER = "application/atom+xml,application/rdf+xml," +
+ "application/rss+xml,application/x-netcdf,application/xml;" +
+ "q=0.9,text/xml;q=0.2,*/*;q=0.1"
+
  # TODO: Refactor http_fetch and other methods.
  module FeedTools
  # Methods for pulling remote data
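
Note (not part of the diff): the new constant is presumably used as the Accept header when fetching feeds; the call site inside http_fetch is not shown in this hunk. As a hedged sketch only, with a hypothetical feed URL and a locally defined copy of the header string, an Accept header of this shape is typically attached to a Net::HTTP request like so:

    # Sketch only -- not code from the feedtools package.
    require 'net/http'
    require 'uri'

    accept_header = "application/atom+xml,application/rdf+xml," +
      "application/rss+xml,application/x-netcdf,application/xml;" +
      "q=0.9,text/xml;q=0.2,*/*;q=0.1"

    feed_uri = URI.parse("http://example.com/feed.xml")  # hypothetical URL
    response = Net::HTTP.start(feed_uri.host, feed_uri.port) do |http|
      request = Net::HTTP::Get.new(feed_uri.request_uri,
        { "Accept" => accept_header })
      http.request(request)
    end
    puts response["Content-Type"]
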
@@ -0,0 +1,223 @@
+ #--
+ # Copyright (c) 2005 Robert Aman
+ #
+ # Permission is hereby granted, free of charge, to any person obtaining
+ # a copy of this software and associated documentation files (the
+ # "Software"), to deal in the Software without restriction, including
+ # without limitation the rights to use, copy, modify, merge, publish,
+ # distribute, sublicense, and/or sell copies of the Software, and to
+ # permit persons to whom the Software is furnished to do so, subject to
+ # the following conditions:
+ #
+ # The above copyright notice and this permission notice shall be
+ # included in all copies or substantial portions of the Software.
+ #
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #++
+
+ require 'feed_tools'
+ require 'uri'
+
+ module FeedTools
+ # Generic url processing methods needed in numerous places throughout
+ # FeedTools
+ module UriHelper
+ # Returns true if the idn module can be used.
+ def self.idn_enabled?
+ # This is an override variable to keep idn from being used even if it
+ # is available.
+ if FeedTools.configurations[:idn_enabled] == false
+ return false
+ end
+ if @idn_enabled.nil? || @idn_enabled == false
+ @idn_enabled = false
+ begin
+ require 'idn'
+ if IDN::Idna.toASCII('http://www.詹姆斯.com/') ==
+ "http://www.xn--8ws00zhy3a.com/"
+ @idn_enabled = true
+ else
+ @idn_enabled = false
+ end
+ rescue LoadError
+ # Tidy not installed, disable features that rely on tidy.
+ @idn_enabled = false
+ end
+ end
+ return @idn_enabled
+ end
+
+ # Attempts to ensures that the passed url is valid and sane. Accepts very,
+ # very ugly urls and makes every effort to figure out what it was supposed
+ # to be. Also translates from the feed: and rss: pseudo-protocols to the
+ # http: protocol.
+ def self.normalize_url(url)
+ if url.kind_of?(URI)
+ url = url.to_s
+ end
+ if url.blank?
+ return nil
+ end
+ normalized_url = CGI.unescape(url.strip)
+
+ # if a url begins with the '/' character, it only makes sense that they
+ # meant to be using a file:// url. Fix it for them.
+ if normalized_url.length > 0 && normalized_url[0..0] == "/"
+ normalized_url = "file://" + normalized_url
+ end
+
+ # if a url begins with a drive letter followed by a colon, we're looking at
+ # a file:// url. Fix it for them.
+ if normalized_url.length > 0 &&
+ normalized_url.scan(/^[a-zA-Z]:[\\\/]/).size > 0
+ normalized_url = "file:///" + normalized_url
+ end
+
+ # if a url begins with javascript:, it's quite possibly an attempt at
+ # doing something malicious. Let's keep that from getting anywhere,
+ # shall we?
+ if (normalized_url.downcase =~ /javascript:/) != nil
+ return "#"
+ end
+
+ # deal with all of the many ugly possibilities involved in the rss:
+ # and feed: pseudo-protocols (incidentally, whose crazy idea was this
+ # mess?)
+ normalized_url.gsub!(/^http:\/*(feed:\/*)?/i, "http://")
+ normalized_url.gsub!(/^http:\/*(rss:\/*)?/i, "http://")
+ normalized_url.gsub!(/^feed:\/*(http:\/*)?/i, "http://")
+ normalized_url.gsub!(/^rss:\/*(http:\/*)?/i, "http://")
+ normalized_url.gsub!(/^file:\/*/i, "file:///")
+ normalized_url.gsub!(/^https:\/*/i, "https://")
+ # fix (very) bad urls (usually of the user-entered sort)
+ normalized_url.gsub!(/^http:\/*(http:\/*)*/i, "http://")
+
+ if (normalized_url =~ /^file:/i) == 0
+ # Adjust windows-style urls
+ normalized_url.gsub!(/^file:\/\/\/([a-zA-Z])\|/i, 'file:///\1:')
+ normalized_url.gsub!(/\\/, '/')
+ else
+ if (normalized_url =~ /^https?:\/\//i) == nil
+ normalized_url = "http://" + normalized_url
+ end
+ if normalized_url == "http://"
+ return nil
+ end
+ begin
+ scheme, host_part, path =
+ normalized_url.scan(/^(https?):\/\/([^\/]+)\/(.*)/i).flatten
+ if scheme != nil && host_part != nil && path != nil
+ scheme = scheme.downcase
+ if FeedTools::UriHelper.idn_enabled?
+ host_part =
+ IDN::Idna.toASCII(host_part)
+ end
+ new_path = ""
+ for index in 0...path.size
+ if path[index] <= 32 || path[index] >= 126
+ new_path << ("%" + path[index].to_s(16).upcase)
+ else
+ new_path << path[index..index]
+ end
+ end
+ path = new_path
+ normalized_url = scheme + "://" + host_part + "/" + path
+ end
+ rescue Object
+ end
+ begin
+ feed_uri = URI.parse(normalized_url)
+ if feed_uri.scheme == nil
+ feed_uri.scheme = "http"
+ end
+ if feed_uri.path.blank?
+ feed_uri.path = "/"
+ end
+ if (feed_uri.path =~ /^[\/]+/) == 0
+ feed_uri.path.gsub!(/^[\/]+/, "/")
+ end
+ while (feed_uri.path =~ /^\/\.\./)
+ feed_uri.path.gsub!(/^\/\.\./, "")
+ end
+ if feed_uri.path.blank?
+ feed_uri.path = "/"
+ end
+ feed_uri.host.downcase!
+ normalized_url = feed_uri.to_s
+ rescue URI::InvalidURIError
+ end
+ end
+
+ # We can't do a proper set of escaping, so this will
+ # have to do.
+ normalized_url.gsub!(/%20/, " ")
+ normalized_url.gsub!(/ /, "%20")
+
+ return normalized_url
+ end
+
+ # Resolves a relative uri
+ def self.resolve_relative_uri(relative_uri, base_uri_sources=[])
+ return relative_uri if base_uri_sources.blank?
+ return nil if relative_uri.nil?
+ begin
+ base_uri = URI.parse(
+ FeedTools::XmlHelper.select_not_blank(base_uri_sources))
+ resolved_uri = base_uri + relative_uri.to_s
+ return FeedTools::UriHelper.normalize_url(resolved_uri.to_s)
+ rescue
+ return relative_uri
+ end
+ end
+
+ # Converts a url into a tag uri
+ def self.build_tag_uri(url, date)
+ unless url.kind_of? String
+ raise ArgumentError, "Expected String, got #{url.class.name}"
+ end
+ unless date.kind_of? Time
+ raise ArgumentError, "Expected Time, got #{date.class.name}"
+ end
+ tag_uri = normalize_url(url)
+ unless FeedTools::UriHelper.is_uri?(tag_uri)
+ raise ArgumentError, "Must supply a valid URL."
+ end
+ host = URI.parse(tag_uri).host
+ tag_uri.gsub!(/^(http|ftp|file):\/*/, "")
+ tag_uri.gsub!(/#/, "/")
+ tag_uri = "tag:#{host},#{date.strftime('%Y-%m-%d')}:" +
+ "#{tag_uri[(tag_uri.index(host) + host.size)..-1]}"
+ return tag_uri
+ end
+
+ # Converts a url into a urn:uuid: uri
+ def self.build_urn_uri(url)
+ unless url.kind_of? String
+ raise ArgumentError, "Expected String, got #{url.class.name}"
+ end
+ normalized_url = normalize_url(url)
+ require 'uuidtools'
+ return UUID.sha1_create(UUID_URL_NAMESPACE, normalized_url).to_uri_string
+ end
+
+ # Returns true if the parameter appears to be a valid uri
+ def self.is_uri?(url)
+ return false if url.nil?
+ begin
+ uri = URI.parse(url)
+ if uri.scheme.blank?
+ return false
+ end
+ rescue URI::InvalidURIError
+ return false
+ end
+ return true
+ end
+ end
+ end
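
Note (not part of the diff): a short usage sketch of the new FeedTools::UriHelper module. The inputs are hypothetical, and the exact return values depend on the normalization rules shown above.

    # Illustration only, assuming this version of feed_tools is installed.
    require 'feed_tools'

    FeedTools::UriHelper.is_uri?("http://example.com/feed.xml")
    # => true (parses cleanly and has a scheme)

    FeedTools::UriHelper.normalize_url("feed://example.com/feed")
    # the feed: pseudo-protocol is rewritten, yielding "http://example.com/feed"

    FeedTools::UriHelper.build_tag_uri("http://example.com/feed.xml", Time.now)
    # => a tag: URI of the form "tag:example.com,YYYY-MM-DD:/feed.xml"
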
@@ -0,0 +1,239 @@
+ #--
+ # Copyright (c) 2005 Robert Aman
+ #
+ # Permission is hereby granted, free of charge, to any person obtaining
+ # a copy of this software and associated documentation files (the
+ # "Software"), to deal in the Software without restriction, including
+ # without limitation the rights to use, copy, modify, merge, publish,
+ # distribute, sublicense, and/or sell copies of the Software, and to
+ # permit persons to whom the Software is furnished to do so, subject to
+ # the following conditions:
+ #
+ # The above copyright notice and this permission notice shall be
+ # included in all copies or substantial portions of the Software.
+ #
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #++
+
+ require 'feed_tools'
+ require 'feed_tools/helpers/generic_helper'
+ require 'rexml/document'
+
+ module FeedTools
+ # Generic xml methods needed in numerous places throughout FeedTools
+ module XmlHelper
+ # Selects the first non-blank result.
+ def self.select_not_blank(results, &block)
+ if results.kind_of? Array
+ for result in results
+ blank_result = false
+ if !block.nil?
+ blank_result = block.call(result)
+ else
+ blank_result = result.to_s.blank?
+ end
+ unless result.nil? || blank_result
+ return result
+ end
+ end
+ else
+ blank_result = false
+ if !block.nil?
+ blank_result = block.call(results)
+ else
+ blank_result = results.to_s.blank?
+ end
+ unless results.nil? || blank_result
+ return results
+ end
+ end
+ return nil
+ end
+
+ # Runs through a list of XPath queries on an element or document and
+ # returns the first non-blank result. Subsequent XPath queries will
+ # not be evaluated.
+ def self.try_xpaths(element, xpath_list,
+ options={}, &block)
+ FeedTools::GenericHelper.validate_options([ :select_result_value ],
+ options.keys)
+ options = { :select_result_value => false }.merge(options)
+
+ result = nil
+ if element.nil?
+ return nil
+ end
+ for xpath in xpath_list
+ # Namespace aware
+ result = REXML::XPath.liberal_first(element, xpath,
+ FEED_TOOLS_NAMESPACES)
+ if options[:select_result_value] && !result.nil?
+ if result.respond_to?(:value)
+ result = result.value
+ else
+ result = result.to_s
+ end
+ end
+ blank_result = false
+ if block_given?
+ blank_result = yield(result)
+ else
+ blank_result = result.to_s.blank?
+ end
+ if !blank_result
+ if result.respond_to? :strip
+ result.strip!
+ end
+ return result
+ end
+
+ # Namespace unaware
+ result = REXML::XPath.liberal_first(element, xpath)
+ if options[:select_result_value] && !result.nil?
+ if result.respond_to?(:value)
+ result = result.value
+ else
+ result = result.to_s
+ end
+ end
+ blank_result = false
+ if block_given?
+ blank_result = yield(result)
+ else
+ blank_result = result.to_s.blank?
+ end
+ if !blank_result
+ if result.respond_to? :strip
+ result.strip!
+ end
+ return result
+ end
+ end
+ return nil
+ end
+
+ # Runs through a list of XPath queries on an element or document and
+ # returns all non-empty results. Subsequent XPath queries will
+ # not be evaluated.
+ def self.try_xpaths_all(element, xpath_list, options={})
+ FeedTools::GenericHelper.validate_options([ :select_result_value ],
+ options.keys)
+ options = { :select_result_value => false }.merge(options)
+
+ results = []
+ if element.nil?
+ return []
+ end
+ for xpath in xpath_list
+ # Namespace aware
+ results = REXML::XPath.liberal_match(element, xpath,
+ FEED_TOOLS_NAMESPACES)
+ if options[:select_result_value] && !results.nil? && !results.empty?
+ results =
+ results.map { |x| x.respond_to?(:value) ? x.value : x.to_s }
+ end
+ if results.blank?
+ results = REXML::XPath.liberal_match(element, xpath)
+ else
+ return results
+ end
+
+ # Namespace unaware
+ if options[:select_result_value] && !results.nil? && !results.empty?
+ results =
+ results.map { |x| x.respond_to?(:value) ? x.value : x.to_s }
+ end
+ if !results.blank?
+ return results
+ end
+ end
+ for xpath in xpath_list
+ if xpath =~ /^\w+$/
+ results = []
+ for child in element.children
+ if child.class == REXML::Element
+ if child.name.downcase == xpath.downcase
+ results << child
+ end
+ end
+ end
+ if options[:select_result_value] && !results.nil? && !results.empty?
+ results =
+ results.map { |x| x.inner_xml }
+ end
+ if !results.blank?
+ return results
+ end
+ end
+ end
+ return []
+ end
+
+ # Runs through a list of XPath queries on an element or document and
+ # returns all non-empty results, appending the results from each query
+ # onto the end of the results from the previous queries.
+ def self.combine_xpaths_all(element, xpath_list, options={})
+ FeedTools::GenericHelper.validate_options([ :select_result_value ],
+ options.keys)
+ options = { :select_result_value => false }.merge(options)
+
+ all_results = []
+ result = []
+ if element.nil?
+ return []
+ end
+ for xpath in xpath_list
+ # Namespace aware
+ results = REXML::XPath.liberal_match(element, xpath,
+ FEED_TOOLS_NAMESPACES)
+ if options[:select_result_value] && !results.nil? && !results.empty?
+ results =
+ results.map { |x| x.respond_to?(:value) ? x.value : x.to_s }
+ end
+ if results.blank?
+ results = REXML::XPath.liberal_match(element, xpath)
+ else
+ all_results.concat(results)
+ next
+ end
+
+ # Namespace unaware
+ if options[:select_result_value] && !results.nil? && !results.empty?
+ results =
+ results.map { |x| x.respond_to?(:value) ? x.value : x.to_s }
+ end
+ if !results.blank?
+ all_results.concat(results)
+ next
+ end
+ end
+ for xpath in xpath_list
+ if xpath =~ /^\w+$/
+ results = []
+ for child in element.children
+ if child.class == REXML::Element
+ if child.name.downcase == xpath.downcase
+ results << child
+ end
+ end
+ end
+ if options[:select_result_value] && !results.nil? && !results.empty?
+ results =
+ results.map { |x| x.inner_xml }
+ end
+ if !results.blank?
+ all_results.concat(results)
+ next
+ end
+ end
+ end
+ return all_results.uniq
+ end
+ end
+ end
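
Note (not part of the diff): a rough sketch of calling the new FeedTools::XmlHelper on a parsed REXML document. The sample XML and XPath list are hypothetical; REXML::XPath.liberal_first and FEED_TOOLS_NAMESPACES are supplied elsewhere by feed_tools, so the library must be loaded first.

    # Illustration only, assuming this version of feed_tools is installed.
    require 'feed_tools'
    require 'rexml/document'

    document = REXML::Document.new("<feed><title>Example Feed</title></feed>")

    title = FeedTools::XmlHelper.try_xpaths(document.root, ["title/text()"],
      :select_result_value => true)
    puts title  # expected to print "Example Feed" when a query matches
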