feedtools 0.2.23 → 0.2.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -69,6 +69,24 @@ module FeedTools
69
69
  return parent_feed
70
70
  end
71
71
 
72
+ # Returns the load options for this feed.
73
def configurations
  # Lazily resolved: once a non-blank set of options is stored it is
  # handed back untouched on every later call.
  return @configurations unless @configurations.blank?

  # Inherit from the owning feed when there is one, otherwise from the
  # library-wide defaults.  Either way we keep our own copy (dup).
  owner = self.feed
  inherited = owner != nil ? owner.configurations : FeedTools.configurations
  @configurations = inherited.dup
end
84
+
85
+ # Sets the load options for this feed.
86
def configurations=(new_configurations)
  # Stored as given (no copy is made); the reader returns this object
  # for as long as it is not blank.
  @configurations = new_configurations
end
89
+
72
90
  # Returns the feed item's encoding.
73
91
  def encoding
74
92
  if @encoding.nil?
@@ -202,10 +220,10 @@ module FeedTools
202
220
  @title = FeedTools::HtmlHelper.process_text_construct(title_node,
203
221
  self.feed_type, self.feed_version)
204
222
  if self.feed_type == "atom" ||
205
- FeedTools.configurations[:always_strip_wrapper_elements]
223
+ self.configurations[:always_strip_wrapper_elements]
206
224
  @title = FeedTools::HtmlHelper.strip_wrapper_element(@title)
207
225
  end
208
- if !@title.blank? && FeedTools.configurations[:strip_comment_count]
226
+ if !@title.blank? && self.configurations[:strip_comment_count]
209
227
  # Some blogging tools include the number of comments in a post
210
228
  # in the title... this is supremely ugly, and breaks any
211
229
  # applications which expect the title to be static, so we're
@@ -257,7 +275,7 @@ module FeedTools
257
275
  @content = FeedTools::HtmlHelper.process_text_construct(content_node,
258
276
  self.feed_type, self.feed_version)
259
277
  if self.feed_type == "atom" ||
260
- FeedTools.configurations[:always_strip_wrapper_elements]
278
+ self.configurations[:always_strip_wrapper_elements]
261
279
  @content = FeedTools::HtmlHelper.strip_wrapper_element(@content)
262
280
  end
263
281
  if @content.blank?
@@ -310,7 +328,7 @@ module FeedTools
310
328
  @summary = FeedTools::HtmlHelper.process_text_construct(summary_node,
311
329
  self.feed_type, self.feed_version)
312
330
  if self.feed_type == "atom" ||
313
- FeedTools.configurations[:always_strip_wrapper_elements]
331
+ self.configurations[:always_strip_wrapper_elements]
314
332
  @summary = FeedTools::HtmlHelper.strip_wrapper_element(@summary)
315
333
  end
316
334
  if @summary.blank?
@@ -456,7 +474,7 @@ module FeedTools
456
474
  end
457
475
  rescue
458
476
  end
459
- if FeedTools.configurations[:url_normalization_enabled]
477
+ if self.configurations[:url_normalization_enabled]
460
478
  @link = FeedTools::UriHelper.normalize_url(@link)
461
479
  end
462
480
  end
@@ -507,7 +525,7 @@ module FeedTools
507
525
  end
508
526
  rescue
509
527
  end
510
- if FeedTools.configurations[:url_normalization_enabled]
528
+ if self.configurations[:url_normalization_enabled]
511
529
  link_object.href =
512
530
  FeedTools::UriHelper.normalize_url(link_object.href)
513
531
  end
@@ -640,7 +658,7 @@ module FeedTools
640
658
  end
641
659
  rescue
642
660
  end
643
- if FeedTools.configurations[:url_normalization_enabled]
661
+ if self.configurations[:url_normalization_enabled]
644
662
  image.href = FeedTools::UriHelper.normalize_url(image.href)
645
663
  end
646
664
  image.href.strip! unless image.href.nil?
@@ -688,7 +706,7 @@ module FeedTools
688
706
  "itunes:image/@href",
689
707
  "itunes:link[@rel='image']/@href"
690
708
  ], :select_result_value => true)
691
- if FeedTools.configurations[:url_normalization_enabled]
709
+ if self.configurations[:url_normalization_enabled]
692
710
  @itunes_image_link = FeedTools::UriHelper.normalize_url(@itunes_image_link)
693
711
  end
694
712
  end
@@ -706,7 +724,7 @@ module FeedTools
706
724
  @media_thumbnail_link = FeedTools::XmlHelper.try_xpaths(self.root_node, [
707
725
  "media:thumbnail/@url"
708
726
  ], :select_result_value => true)
709
- if FeedTools.configurations[:url_normalization_enabled]
727
+ if self.configurations[:url_normalization_enabled]
710
728
  @media_thumbnail_link = FeedTools::UriHelper.normalize_url(@media_thumbnail_link)
711
729
  end
712
730
  end
@@ -734,7 +752,7 @@ module FeedTools
734
752
  @rights = FeedTools::HtmlHelper.process_text_construct(rights_node,
735
753
  self.feed_type, self.feed_version)
736
754
  if self.feed_type == "atom" ||
737
- FeedTools.configurations[:always_strip_wrapper_elements]
755
+ self.configurations[:always_strip_wrapper_elements]
738
756
  @rights = FeedTools::HtmlHelper.strip_wrapper_element(@rights)
739
757
  end
740
758
  end
@@ -1456,14 +1474,14 @@ module FeedTools
1456
1474
  begin
1457
1475
  if !time_string.blank?
1458
1476
  @time = Time.parse(time_string).gmtime
1459
- elsif FeedTools.configurations[:timestamp_estimation_enabled] &&
1477
+ elsif self.configurations[:timestamp_estimation_enabled] &&
1460
1478
  !self.title.nil? &&
1461
1479
  (Time.parse(self.title) - Time.now).abs > 100
1462
1480
  @time = Time.parse(self.title).gmtime
1463
1481
  end
1464
1482
  rescue
1465
1483
  end
1466
- if FeedTools.configurations[:timestamp_estimation_enabled]
1484
+ if self.configurations[:timestamp_estimation_enabled]
1467
1485
  if options[:estimate_timestamp]
1468
1486
  if @time.nil?
1469
1487
  begin
@@ -1615,7 +1633,7 @@ module FeedTools
1615
1633
  end
1616
1634
  rescue
1617
1635
  end
1618
- if FeedTools.configurations[:url_normalization_enabled]
1636
+ if self.configurations[:url_normalization_enabled]
1619
1637
  @comments = FeedTools::UriHelper.normalize_url(@comments)
1620
1638
  end
1621
1639
  end
@@ -170,6 +170,10 @@ module FeedTools
170
170
  alias_method :url=, :href=
171
171
  alias_method :link, :href
172
172
  alias_method :link=, :href=
173
+
174
def initialize
  # Start every new instance with the 'full' expression, one of the
  # values accepted by expression= ('sample', 'full', 'nonstop').
  @expression = 'full'
end
173
177
 
174
178
  # Returns true if this is the default enclosure
175
179
  def is_default?
@@ -202,8 +206,7 @@ module FeedTools
202
206
  # Allowed values are 'sample', 'full', 'nonstop'.
203
207
def expression=(new_expression)
  # Comparison and storage are both done on the lowercased value.
  candidate = new_expression.downcase
  if ['sample', 'full', 'nonstop'].include?(candidate)
    @expression = candidate
  else
    # Unrecognised values are ignored: the current expression is kept
    # and returned instead of raising.
    @expression
  end
end
@@ -24,8 +24,7 @@
24
24
  module FeedTools
25
25
  module DebugHelper
26
26
  # Forces a stack_trace without interfering with the program
27
- def stack_trace
28
-
27
+ def self.stack_trace
29
28
  fork do
30
29
  ObjectSpace.each_object(Thread) do |th|
31
30
  th.raise Exception, "Stack Dump" unless Thread.current == th
@@ -232,6 +232,17 @@ module FeedTools
232
232
  return tidy_html
233
233
  end
234
234
 
235
+ # Indents a text selection by a specified number of spaces.
236
def self.indent(text, spaces)
  # Every line of +text+ (split on "\n") is prefixed with +spaces+ space
  # characters and terminated with "\n"; the pieces are accumulated into
  # one string, which is the return value.
  text.split("\n").each_with_object("") do |row, indented|
    indented << (" " * spaces + row) << "\n"
  end
end
245
+
235
246
  # Unindents a text selection by a specified number of spaces.
236
247
  def self.unindent(text, spaces)
237
248
  lines = text.split("\n")
@@ -301,10 +312,11 @@ module FeedTools
301
312
  html_node.delete_element(child)
302
313
  end
303
314
  end
304
- for attribute in child.attributes.keys
305
- if !(attribute =~ /^xmlns/)
306
- unless acceptable_attributes.include? attribute.downcase
307
- child.delete_attribute(attribute)
315
+ child.attributes.each_attribute do |attribute|
316
+ if !(attribute.value =~ /^xmlns(:.+)?$/)
317
+ unless acceptable_attributes.include?(
318
+ attribute.value.downcase)
319
+ child.delete_attribute(attribute.value)
308
320
  end
309
321
  end
310
322
  end
@@ -364,8 +376,6 @@ module FeedTools
364
376
  ].include?(type)
365
377
  end
366
378
 
367
- # can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
368
-
369
379
  # Resolves all relative uris in a block of html.
370
380
  def self.resolve_relative_uris(html, base_uri_sources=[])
371
381
  relative_uri_attributes = [
@@ -398,23 +408,23 @@ module FeedTools
398
408
  html_doc = HTree.parse_xml("<root>" + html + "</root>").to_rexml
399
409
 
400
410
  resolve_node = lambda do |html_node|
401
- if html_node.respond_to? :children
402
- for child in html_node.children
403
- if child.kind_of? REXML::Element
404
- for element_attribute_pair in relative_uri_attributes
405
- if child.name.downcase == element_attribute_pair[0]
406
- attribute = child.attribute(element_attribute_pair[1])
407
- if attribute != nil
408
- href = attribute.value
409
- href = FeedTools::UriHelper.resolve_relative_uri(
410
- href, [child.base_uri] | base_uri_sources)
411
- child.attribute(
412
- element_attribute_pair[1]).instance_variable_set(
413
- "@value", href)
414
- end
415
- end
411
+ if html_node.kind_of? REXML::Element
412
+ for element_attribute_pair in relative_uri_attributes
413
+ if html_node.name.downcase == element_attribute_pair[0]
414
+ attribute = html_node.attribute(element_attribute_pair[1])
415
+ if attribute != nil
416
+ href = attribute.value
417
+ href = FeedTools::UriHelper.resolve_relative_uri(
418
+ href, [html_node.base_uri] | base_uri_sources)
419
+ html_node.attribute(
420
+ element_attribute_pair[1]).instance_variable_set(
421
+ "@value", href)
416
422
  end
417
423
  end
424
+ end
425
+ end
426
+ if html_node.respond_to? :children
427
+ for child in html_node.children
418
428
  resolve_node.call(child)
419
429
  end
420
430
  end
@@ -428,36 +438,55 @@ module FeedTools
428
438
  # Returns a string containing normalized xhtml from within a REXML node.
429
439
  def self.extract_xhtml(rexml_node)
430
440
  rexml_node_dup = rexml_node.deep_clone
441
+ namespace_hash = FEED_TOOLS_NAMESPACES.dup
431
442
  normalize_namespaced_xhtml = lambda do |node, node_dup|
432
443
  if node.kind_of? REXML::Element
433
444
  node_namespace = node.namespace
434
- # Massive hack, relies on REXML not changing
435
- for index in 0...node.attributes.values.size
436
- attribute = node.attributes.values[index]
437
- attribute_dup = node_dup.attributes.values[index]
438
- if attribute.namespace == FEED_TOOLS_NAMESPACES['xhtml']
439
- attribute_dup.instance_variable_set(
440
- "@expanded_name", attribute.name)
441
- end
442
- if node_namespace == FEED_TOOLS_NAMESPACES['xhtml']
443
- if attribute.name == 'xmlns'
444
- node_dup.attributes.delete('xmlns')
445
+ if node_namespace != namespace_hash['atom10'] &&
446
+ node_namespace != namespace_hash['atom03']
447
+ # Massive hack, relies on REXML not changing
448
+ for index in 0...node.attributes.values.size
449
+ attribute = node.attributes.values[index]
450
+ attribute_dup = node_dup.attributes.values[index]
451
+ if attribute.namespace == namespace_hash['xhtml']
452
+ attribute_dup.instance_variable_set(
453
+ "@expanded_name", attribute.name)
454
+ end
455
+ if node_namespace == namespace_hash['xhtml']
456
+ if attribute.name == 'xmlns'
457
+ node_dup.attributes.delete('xmlns')
458
+ end
445
459
  end
446
460
  end
447
- end
448
- if node_namespace == FEED_TOOLS_NAMESPACES['xhtml']
449
- node_dup.instance_variable_set("@expanded_name", node.name)
450
- end
451
- if !node_namespace.blank? && node.prefix.blank?
452
- if node.namespace != FEED_TOOLS_NAMESPACES['xhtml']
453
- node_dup.add_namespace(node_namespace)
461
+ if node_namespace == namespace_hash['xhtml']
462
+ node_dup.instance_variable_set("@expanded_name", node.name)
463
+ end
464
+ if !node_namespace.blank? && node.prefix.blank?
465
+ if node_namespace != namespace_hash['xhtml']
466
+ prefix = nil
467
+ for known_prefix in namespace_hash.keys
468
+ if namespace_hash[known_prefix] == node_namespace
469
+ prefix = known_prefix
470
+ end
471
+ end
472
+ if prefix.nil?
473
+ prefix = "unknown" +
474
+ Digest::SHA1.new(node_namespace).to_s[0..4]
475
+ namespace_hash[prefix] = node_namespace
476
+ end
477
+ node_dup.instance_variable_set("@expanded_name",
478
+ "#{prefix}:#{node.name}")
479
+ node_dup.instance_variable_set("@prefix",
480
+ prefix)
481
+ node_dup.add_namespace(prefix, node_namespace)
482
+ end
454
483
  end
455
484
  end
456
485
  end
457
486
  for index in 0...node.children.size
458
487
  child = node.children[index]
459
- child_dup = node_dup.children[index]
460
488
  if child.kind_of? REXML::Element
489
+ child_dup = node_dup.children[index]
461
490
  normalize_namespaced_xhtml.call(child, child_dup)
462
491
  end
463
492
  end
@@ -513,7 +542,9 @@ module FeedTools
513
542
  type == "application/xhtml+xml" ||
514
543
  content_node.namespace == FEED_TOOLS_NAMESPACES['xhtml']
515
544
  content = FeedTools::HtmlHelper.extract_xhtml(content_node)
516
- elsif type == "escaped" || mode == "escaped"
545
+ elsif type == "escaped" || mode == "escaped" ||
546
+ type == "html" || mode == "html" ||
547
+ type == "text/html" || mode == "text/html"
517
548
  content = FeedTools::HtmlHelper.unescape_entities(
518
549
  content_node.inner_xml.strip)
519
550
  elsif type == "text" || mode == "text" ||
@@ -556,7 +587,7 @@ module FeedTools
556
587
  doc = REXML::Document.new(xhtml.to_s.strip)
557
588
  if doc.children.size == 1
558
589
  child = doc.children[0]
559
- if child.name.downcase == "div"
590
+ if child.kind_of?(REXML::Element) && child.name.downcase == "div"
560
591
  return child.inner_xml.strip
561
592
  end
562
593
  end
@@ -574,7 +605,8 @@ module FeedTools
574
605
  # This is technically very, very wrong. But it saves oodles of
575
606
  # clock cycles, and probably works 99.999% of the time.
576
607
  html_document = HTree.parse_xml(
577
- FeedTools::HtmlHelper.tidy_html(html.gsub(/<body>(.|\n)*<\/body>/, ""))).to_rexml
608
+ FeedTools::HtmlHelper.tidy_html(
609
+ html.gsub(/<body.*?>(.|\n)*<\/body>/, "<body>-</body>"))).to_rexml
578
610
  html_node = nil
579
611
  head_node = nil
580
612
  link_nodes = []
@@ -22,20 +22,218 @@
22
22
  #++
23
23
 
24
24
  require 'feed_tools'
25
+ require 'feed_tools/helpers/uri_helper'
25
26
  require 'net/http'
26
27
 
27
28
  # TODO: Not used yet, don't load since it'll only be a performance hit
28
29
  # require 'net/https'
29
30
  # require 'net/ftp'
30
31
 
31
- # Stolen from the Universal Feed Parser
32
- FEED_TOOLS_ACCEPT_HEADER = "application/atom+xml,application/rdf+xml," +
33
- "application/rss+xml,application/x-netcdf,application/xml;" +
34
- "q=0.9,text/xml;q=0.2,*/*;q=0.1"
35
-
36
- # TODO: Refactor http_fetch and other methods.
37
32
  module FeedTools
38
33
  # Methods for pulling remote data
39
34
  module RetrievalHelper
35
+ # Stolen from the Universal Feed Parser
36
+ ACCEPT_HEADER = "application/atom+xml,application/rdf+xml," +
37
+ "application/rss+xml,application/x-netcdf,application/xml;" +
38
+ "q=0.9,text/xml;q=0.2,*/*;q=0.1"
39
+
40
+ # Makes an HTTP request and returns the HTTP response. Optionally
41
+ # takes a block that determines whether or not to follow a redirect.
42
+ # The block will be passed the redirect location and the HTTP redirect response as arguments.
43
def self.http_request(http_operation, url, options={}, &block)
  # Performs one HTTP request (http_operation is the Net::HTTP method
  # name, e.g. :get, :post, :head) and returns the response object.
  #
  # Recognised options:
  #   :feed_object      - optional feed; supplies conditional-GET headers,
  #                       user agent and proxy settings, and has its href
  #                       updated after permanent (301) redirects.
  #   :form_data        - Hash sent as the body for :post requests.
  #   :request_headers  - Hash of request headers.
  #   :redirect_limit   - remaining redirects allowed (default 10).
  #   :response_chain   - Array of [url, response] pairs collected so far.
  #   :follow_redirects - NOTE(review): accepted and defaulted to true, but
  #                       not consulted here; only the block decides.
  #
  # When a block is given it is called with (redirected_location,
  # redirect_response); a false/nil result stops redirect following and
  # the redirect response itself is returned.
  #
  # The returned response gains a #response_chain reader.
  #
  # Raises FeedAccessError when the redirect limit is exhausted, a
  # redirect has no Location, a redirect loop is detected, or on socket,
  # timeout, unreachable-network and connection-reset errors.
  response = nil

  options = {
    :feed_object => nil,
    :form_data => nil,
    :request_headers => {},
    :follow_redirects => true,
    :redirect_limit => 10,
    :response_chain => []
  }.merge(options)

  if options[:redirect_limit] == 0
    raise FeedAccessError, 'Redirect too deep'
  end

  # Normalise option types so the rest of the method can rely on them.
  if options[:response_chain].blank? ||
      !options[:response_chain].kind_of?(Array)
    options[:response_chain] = []
  end

  if !options[:request_headers].kind_of?(Hash)
    options[:request_headers] = {}
  end
  if !options[:form_data].kind_of?(Hash)
    options[:form_data] = nil
  end

  # With no explicit headers, derive conditional-request headers and the
  # user agent from the feed object.
  if options[:request_headers].blank? && options[:feed_object] != nil
    options[:request_headers] = {}
    unless options[:feed_object].http_headers.nil?
      unless options[:feed_object].http_headers['etag'].nil?
        options[:request_headers]["If-None-Match"] =
          options[:feed_object].http_headers['etag']
      end
      unless options[:feed_object].http_headers['last-modified'].nil?
        options[:request_headers]["If-Modified-Since"] =
          options[:feed_object].http_headers['last-modified']
      end
    end
    unless options[:feed_object].configurations[:user_agent].nil?
      options[:request_headers]["User-Agent"] =
        options[:feed_object].configurations[:user_agent]
    end
  end
  if options[:request_headers]["Accept"].nil?
    options[:request_headers]["Accept"] =
      FeedTools::RetrievalHelper::ACCEPT_HEADER
  end
  if options[:request_headers]["User-Agent"].nil?
    options[:request_headers]["User-Agent"] =
      FeedTools.configurations[:user_agent]
  end

  uri = nil
  begin
    uri = URI.parse(url)
  rescue URI::InvalidURIError
    # Uh, maybe try to fix it?
    uri = URI.parse(FeedTools::UriHelper.normalize_url(url))
  end

  begin
    proxy_address = nil
    proxy_port = nil
    proxy_user = nil
    proxy_password = nil

    if options[:feed_object] != nil
      feed_configurations = options[:feed_object].configurations
      proxy_address = feed_configurations[:proxy_address]

      # Only the port is numeric.  An unset port must stay nil (nil.to_i
      # would silently turn it into port 0).
      configured_port = feed_configurations[:proxy_port]
      unless configured_port.nil? || configured_port.to_s.strip.empty?
        proxy_port = configured_port.to_i
      end

      # Credentials are strings; converting them with to_i would send
      # "0" as the user name and password.
      proxy_user = feed_configurations[:proxy_user]
      proxy_password = feed_configurations[:proxy_password]
    end

    # No need to check for nil
    http = Net::HTTP::Proxy(
      proxy_address, proxy_port, proxy_user, proxy_password).new(
        uri.host, (uri.port or 80))

    path = uri.path
    # A URL such as "http://example.com" has an empty path; request the
    # root resource rather than sending an empty request path.
    path = '/' if path.nil? || path.empty?
    path += ('?' + uri.query) if uri.query

    request_params = [path, options[:request_headers]]
    if http_operation == :post
      options[:form_data] = {} if options[:form_data].blank?
      request_params << options[:form_data]
    end
    response = http.send(http_operation, *request_params)

    case response
    when Net::HTTPSuccess
      if options[:feed_object] != nil
        # We've reached the final destination, process all previous
        # redirections, and see if we need to update the url.
        for redirected_response in options[:response_chain]
          if redirected_response.last.code.to_i == 301
            # Reset the cache object or we may get duplicate entries

            # TODO: verify this line is necessary!
            options[:feed_object].cache_object = nil

            options[:feed_object].href =
              redirected_response.last['location']
          else
            # Jump out as soon as we hit anything that isn't a
            # permanently moved redirection.
            break
          end
        end
      end
    when Net::HTTPNotModified
      # Do nothing, we just don't want it processed as a redirection
    when Net::HTTPRedirection
      if response['location'].nil?
        raise FeedAccessError,
          "No location to redirect to supplied for " + response.code
      end
      options[:response_chain] << [url, response]

      redirected_location = response['location']
      redirected_location = FeedTools::UriHelper.resolve_relative_uri(
        redirected_location, [uri.host])

      # The chain is keyed by requested url, so a location we have
      # already requested means we are going in circles.
      if options[:response_chain].assoc(redirected_location) != nil
        raise FeedAccessError,
          "Redirection loop detected: #{redirected_location}"
      end

      # Let the block handle redirects
      follow_redirect = true
      if block != nil
        follow_redirect = block.call(redirected_location, response)
      end

      if follow_redirect
        response = FeedTools::RetrievalHelper.http_request(
          http_operation,
          redirected_location,
          options.merge(
            {:redirect_limit => (options[:redirect_limit] - 1)}),
          &block)
      end
    end
  rescue SocketError
    raise FeedAccessError, 'Socket error prevented feed retrieval'
  rescue Timeout::Error
    raise FeedAccessError, 'Timeout while attempting to retrieve feed'
  rescue Errno::ENETUNREACH
    raise FeedAccessError, 'Network was unreachable'
  rescue Errno::ECONNRESET
    raise FeedAccessError, 'Connection was reset by peer'
  end

  if response != nil
    # Expose the redirect history on the response object itself.
    class << response
      def response_chain
        return @response_chain
      end
    end
    response.instance_variable_set("@response_chain",
      options[:response_chain])
  end

  return response
end
214
+
215
+ # Makes an HTTP GET request and returns the HTTP response. Optionally
216
+ # takes a block that determines whether or not to follow a redirect.
217
+ # The block will be passed the redirect location and the HTTP redirect response as arguments.
218
def self.http_get(url, options={}, &block)
  # Thin wrapper: http_request with the operation pinned to :get; the
  # url, options and redirect block are forwarded unchanged.
  FeedTools::RetrievalHelper.http_request(:get, url, options, &block)
end
222
+
223
+ # Makes an HTTP POST request and returns the HTTP response. Optionally
224
+ # takes a block that determines whether or not to follow a redirect.
225
+ # The block will be passed the redirect location and the HTTP redirect response as arguments.
226
def self.http_post(url, options={}, &block)
  # Thin wrapper: http_request with the operation pinned to :post; the
  # url, options and redirect block are forwarded unchanged.
  FeedTools::RetrievalHelper.http_request(:post, url, options, &block)
end
230
+
231
+ # Makes an HTTP HEAD request and returns the HTTP response. Optionally
232
+ # takes a block that determines whether or not to follow a redirect.
233
+ # The block will be passed the redirect location and the HTTP redirect response as arguments.
234
def self.http_head(url, options={}, &block)
  # Module-level entry point, matching http_get and http_post:
  # http_request with the operation pinned to :head.  The url, options
  # and redirect block are forwarded unchanged.
  FeedTools::RetrievalHelper.http_request(:head, url, options, &block)
end

def http_head(url, options={}, &block)
  # Instance-level form kept for backward compatibility with code that
  # mixes this module in; it simply delegates to the module-level method.
  FeedTools::RetrievalHelper.http_head(url, options, &block)
end
40
238
  end
41
239
  end