feedtools 0.2.23 → 0.2.24

Sign up to get free protection for your applications and to get access to all the features.
@@ -69,6 +69,24 @@ module FeedTools
69
69
  return parent_feed
70
70
  end
71
71
 
72
# Returns the load options in effect for this object.
#
# The options are resolved lazily: while nothing has been stored (or the
# stored value is blank) a copy is taken from the parent feed when
# self.feed returns one, and from the global FeedTools.configurations
# otherwise.  The copy is kept, so later changes to the parent or global
# options do not show up here.
def configurations
  return @configurations unless @configurations.blank?

  owner = self.feed
  inherited = (owner != nil) ? owner.configurations : FeedTools.configurations
  # dup so that edits made through this accessor stay local
  @configurations = inherited.dup
  return @configurations
end
84
+
85
# Sets the load options for this feed.
#
# The given object is stored as-is (no copy is made), replacing whatever
# the reader would otherwise have inherited.
def configurations=(new_configurations)
  @configurations = new_configurations
end
89
+
72
90
  # Returns the feed item's encoding.
73
91
  def encoding
74
92
  if @encoding.nil?
@@ -202,10 +220,10 @@ module FeedTools
202
220
  @title = FeedTools::HtmlHelper.process_text_construct(title_node,
203
221
  self.feed_type, self.feed_version)
204
222
  if self.feed_type == "atom" ||
205
- FeedTools.configurations[:always_strip_wrapper_elements]
223
+ self.configurations[:always_strip_wrapper_elements]
206
224
  @title = FeedTools::HtmlHelper.strip_wrapper_element(@title)
207
225
  end
208
- if !@title.blank? && FeedTools.configurations[:strip_comment_count]
226
+ if !@title.blank? && self.configurations[:strip_comment_count]
209
227
  # Some blogging tools include the number of comments in a post
210
228
  # in the title... this is supremely ugly, and breaks any
211
229
  # applications which expect the title to be static, so we're
@@ -257,7 +275,7 @@ module FeedTools
257
275
  @content = FeedTools::HtmlHelper.process_text_construct(content_node,
258
276
  self.feed_type, self.feed_version)
259
277
  if self.feed_type == "atom" ||
260
- FeedTools.configurations[:always_strip_wrapper_elements]
278
+ self.configurations[:always_strip_wrapper_elements]
261
279
  @content = FeedTools::HtmlHelper.strip_wrapper_element(@content)
262
280
  end
263
281
  if @content.blank?
@@ -310,7 +328,7 @@ module FeedTools
310
328
  @summary = FeedTools::HtmlHelper.process_text_construct(summary_node,
311
329
  self.feed_type, self.feed_version)
312
330
  if self.feed_type == "atom" ||
313
- FeedTools.configurations[:always_strip_wrapper_elements]
331
+ self.configurations[:always_strip_wrapper_elements]
314
332
  @summary = FeedTools::HtmlHelper.strip_wrapper_element(@summary)
315
333
  end
316
334
  if @summary.blank?
@@ -456,7 +474,7 @@ module FeedTools
456
474
  end
457
475
  rescue
458
476
  end
459
- if FeedTools.configurations[:url_normalization_enabled]
477
+ if self.configurations[:url_normalization_enabled]
460
478
  @link = FeedTools::UriHelper.normalize_url(@link)
461
479
  end
462
480
  end
@@ -507,7 +525,7 @@ module FeedTools
507
525
  end
508
526
  rescue
509
527
  end
510
- if FeedTools.configurations[:url_normalization_enabled]
528
+ if self.configurations[:url_normalization_enabled]
511
529
  link_object.href =
512
530
  FeedTools::UriHelper.normalize_url(link_object.href)
513
531
  end
@@ -640,7 +658,7 @@ module FeedTools
640
658
  end
641
659
  rescue
642
660
  end
643
- if FeedTools.configurations[:url_normalization_enabled]
661
+ if self.configurations[:url_normalization_enabled]
644
662
  image.href = FeedTools::UriHelper.normalize_url(image.href)
645
663
  end
646
664
  image.href.strip! unless image.href.nil?
@@ -688,7 +706,7 @@ module FeedTools
688
706
  "itunes:image/@href",
689
707
  "itunes:link[@rel='image']/@href"
690
708
  ], :select_result_value => true)
691
- if FeedTools.configurations[:url_normalization_enabled]
709
+ if self.configurations[:url_normalization_enabled]
692
710
  @itunes_image_link = FeedTools::UriHelper.normalize_url(@itunes_image_link)
693
711
  end
694
712
  end
@@ -706,7 +724,7 @@ module FeedTools
706
724
  @media_thumbnail_link = FeedTools::XmlHelper.try_xpaths(self.root_node, [
707
725
  "media:thumbnail/@url"
708
726
  ], :select_result_value => true)
709
- if FeedTools.configurations[:url_normalization_enabled]
727
+ if self.configurations[:url_normalization_enabled]
710
728
  @media_thumbnail_link = FeedTools::UriHelper.normalize_url(@media_thumbnail_link)
711
729
  end
712
730
  end
@@ -734,7 +752,7 @@ module FeedTools
734
752
  @rights = FeedTools::HtmlHelper.process_text_construct(rights_node,
735
753
  self.feed_type, self.feed_version)
736
754
  if self.feed_type == "atom" ||
737
- FeedTools.configurations[:always_strip_wrapper_elements]
755
+ self.configurations[:always_strip_wrapper_elements]
738
756
  @rights = FeedTools::HtmlHelper.strip_wrapper_element(@rights)
739
757
  end
740
758
  end
@@ -1456,14 +1474,14 @@ module FeedTools
1456
1474
  begin
1457
1475
  if !time_string.blank?
1458
1476
  @time = Time.parse(time_string).gmtime
1459
- elsif FeedTools.configurations[:timestamp_estimation_enabled] &&
1477
+ elsif self.configurations[:timestamp_estimation_enabled] &&
1460
1478
  !self.title.nil? &&
1461
1479
  (Time.parse(self.title) - Time.now).abs > 100
1462
1480
  @time = Time.parse(self.title).gmtime
1463
1481
  end
1464
1482
  rescue
1465
1483
  end
1466
- if FeedTools.configurations[:timestamp_estimation_enabled]
1484
+ if self.configurations[:timestamp_estimation_enabled]
1467
1485
  if options[:estimate_timestamp]
1468
1486
  if @time.nil?
1469
1487
  begin
@@ -1615,7 +1633,7 @@ module FeedTools
1615
1633
  end
1616
1634
  rescue
1617
1635
  end
1618
- if FeedTools.configurations[:url_normalization_enabled]
1636
+ if self.configurations[:url_normalization_enabled]
1619
1637
  @comments = FeedTools::UriHelper.normalize_url(@comments)
1620
1638
  end
1621
1639
  end
@@ -170,6 +170,10 @@ module FeedTools
170
170
  alias_method :url=, :href=
171
171
  alias_method :link, :href
172
172
  alias_method :link=, :href=
173
+
174
# Creates the object with its expression defaulted to 'full', one of the
# values accepted by expression=.
def initialize
  @expression = 'full'
end
173
177
 
174
178
  # Returns true if this is the default enclosure
175
179
  def is_default?
@@ -202,8 +206,7 @@ module FeedTools
202
206
  # Allowed values are 'sample', 'full', 'nonstop'.
203
207
  def expression=(new_expression)
204
208
  unless ['sample', 'full', 'nonstop'].include? new_expression.downcase
205
- raise ArgumentError,
206
- "Permitted values are 'sample', 'full', 'nonstop'."
209
+ return @expression
207
210
  end
208
211
  @expression = new_expression.downcase
209
212
  end
@@ -24,8 +24,7 @@
24
24
  module FeedTools
25
25
  module DebugHelper
26
26
  # Forces a stack_trace without interfering with the program
27
- def stack_trace
28
-
27
+ def self.stack_trace
29
28
  fork do
30
29
  ObjectSpace.each_object(Thread) do |th|
31
30
  th.raise Exception, "Stack Dump" unless Thread.current == th
@@ -232,6 +232,17 @@ module FeedTools
232
232
  return tidy_html
233
233
  end
234
234
 
235
# Indents a text selection by a specified number of spaces.
#
# The text is split on "\n"; every resulting line (blank ones included)
# is prefixed with +spaces+ space characters and terminated with a
# newline.  Trailing empty lines are dropped by the split, so an empty
# string yields an empty string.
def self.indent(text, spaces)
  buffer = ""
  text.split("\n").each do |line|
    buffer << (" " * spaces) << line << "\n"
  end
  return buffer
end
245
+
235
246
  # Unindents a text selection by a specified number of spaces.
236
247
  def self.unindent(text, spaces)
237
248
  lines = text.split("\n")
@@ -301,10 +312,11 @@ module FeedTools
301
312
  html_node.delete_element(child)
302
313
  end
303
314
  end
304
- for attribute in child.attributes.keys
305
- if !(attribute =~ /^xmlns/)
306
- unless acceptable_attributes.include? attribute.downcase
307
- child.delete_attribute(attribute)
315
+ child.attributes.each_attribute do |attribute|
316
+ if !(attribute.value =~ /^xmlns(:.+)?$/)
317
+ unless acceptable_attributes.include?(
318
+ attribute.value.downcase)
319
+ child.delete_attribute(attribute.value)
308
320
  end
309
321
  end
310
322
  end
@@ -364,8 +376,6 @@ module FeedTools
364
376
  ].include?(type)
365
377
  end
366
378
 
367
- # can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
368
-
369
379
  # Resolves all relative uris in a block of html.
370
380
  def self.resolve_relative_uris(html, base_uri_sources=[])
371
381
  relative_uri_attributes = [
@@ -398,23 +408,23 @@ module FeedTools
398
408
  html_doc = HTree.parse_xml("<root>" + html + "</root>").to_rexml
399
409
 
400
410
  resolve_node = lambda do |html_node|
401
- if html_node.respond_to? :children
402
- for child in html_node.children
403
- if child.kind_of? REXML::Element
404
- for element_attribute_pair in relative_uri_attributes
405
- if child.name.downcase == element_attribute_pair[0]
406
- attribute = child.attribute(element_attribute_pair[1])
407
- if attribute != nil
408
- href = attribute.value
409
- href = FeedTools::UriHelper.resolve_relative_uri(
410
- href, [child.base_uri] | base_uri_sources)
411
- child.attribute(
412
- element_attribute_pair[1]).instance_variable_set(
413
- "@value", href)
414
- end
415
- end
411
+ if html_node.kind_of? REXML::Element
412
+ for element_attribute_pair in relative_uri_attributes
413
+ if html_node.name.downcase == element_attribute_pair[0]
414
+ attribute = html_node.attribute(element_attribute_pair[1])
415
+ if attribute != nil
416
+ href = attribute.value
417
+ href = FeedTools::UriHelper.resolve_relative_uri(
418
+ href, [html_node.base_uri] | base_uri_sources)
419
+ html_node.attribute(
420
+ element_attribute_pair[1]).instance_variable_set(
421
+ "@value", href)
416
422
  end
417
423
  end
424
+ end
425
+ end
426
+ if html_node.respond_to? :children
427
+ for child in html_node.children
418
428
  resolve_node.call(child)
419
429
  end
420
430
  end
@@ -428,36 +438,55 @@ module FeedTools
428
438
  # Returns a string containing normalized xhtml from within a REXML node.
429
439
  def self.extract_xhtml(rexml_node)
430
440
  rexml_node_dup = rexml_node.deep_clone
441
+ namespace_hash = FEED_TOOLS_NAMESPACES.dup
431
442
  normalize_namespaced_xhtml = lambda do |node, node_dup|
432
443
  if node.kind_of? REXML::Element
433
444
  node_namespace = node.namespace
434
- # Massive hack, relies on REXML not changing
435
- for index in 0...node.attributes.values.size
436
- attribute = node.attributes.values[index]
437
- attribute_dup = node_dup.attributes.values[index]
438
- if attribute.namespace == FEED_TOOLS_NAMESPACES['xhtml']
439
- attribute_dup.instance_variable_set(
440
- "@expanded_name", attribute.name)
441
- end
442
- if node_namespace == FEED_TOOLS_NAMESPACES['xhtml']
443
- if attribute.name == 'xmlns'
444
- node_dup.attributes.delete('xmlns')
445
+ if node_namespace != namespace_hash['atom10'] &&
446
+ node_namespace != namespace_hash['atom03']
447
+ # Massive hack, relies on REXML not changing
448
+ for index in 0...node.attributes.values.size
449
+ attribute = node.attributes.values[index]
450
+ attribute_dup = node_dup.attributes.values[index]
451
+ if attribute.namespace == namespace_hash['xhtml']
452
+ attribute_dup.instance_variable_set(
453
+ "@expanded_name", attribute.name)
454
+ end
455
+ if node_namespace == namespace_hash['xhtml']
456
+ if attribute.name == 'xmlns'
457
+ node_dup.attributes.delete('xmlns')
458
+ end
445
459
  end
446
460
  end
447
- end
448
- if node_namespace == FEED_TOOLS_NAMESPACES['xhtml']
449
- node_dup.instance_variable_set("@expanded_name", node.name)
450
- end
451
- if !node_namespace.blank? && node.prefix.blank?
452
- if node.namespace != FEED_TOOLS_NAMESPACES['xhtml']
453
- node_dup.add_namespace(node_namespace)
461
+ if node_namespace == namespace_hash['xhtml']
462
+ node_dup.instance_variable_set("@expanded_name", node.name)
463
+ end
464
+ if !node_namespace.blank? && node.prefix.blank?
465
+ if node_namespace != namespace_hash['xhtml']
466
+ prefix = nil
467
+ for known_prefix in namespace_hash.keys
468
+ if namespace_hash[known_prefix] == node_namespace
469
+ prefix = known_prefix
470
+ end
471
+ end
472
+ if prefix.nil?
473
+ prefix = "unknown" +
474
+ Digest::SHA1.new(node_namespace).to_s[0..4]
475
+ namespace_hash[prefix] = node_namespace
476
+ end
477
+ node_dup.instance_variable_set("@expanded_name",
478
+ "#{prefix}:#{node.name}")
479
+ node_dup.instance_variable_set("@prefix",
480
+ prefix)
481
+ node_dup.add_namespace(prefix, node_namespace)
482
+ end
454
483
  end
455
484
  end
456
485
  end
457
486
  for index in 0...node.children.size
458
487
  child = node.children[index]
459
- child_dup = node_dup.children[index]
460
488
  if child.kind_of? REXML::Element
489
+ child_dup = node_dup.children[index]
461
490
  normalize_namespaced_xhtml.call(child, child_dup)
462
491
  end
463
492
  end
@@ -513,7 +542,9 @@ module FeedTools
513
542
  type == "application/xhtml+xml" ||
514
543
  content_node.namespace == FEED_TOOLS_NAMESPACES['xhtml']
515
544
  content = FeedTools::HtmlHelper.extract_xhtml(content_node)
516
- elsif type == "escaped" || mode == "escaped"
545
+ elsif type == "escaped" || mode == "escaped" ||
546
+ type == "html" || mode == "html" ||
547
+ type == "text/html" || mode == "text/html"
517
548
  content = FeedTools::HtmlHelper.unescape_entities(
518
549
  content_node.inner_xml.strip)
519
550
  elsif type == "text" || mode == "text" ||
@@ -556,7 +587,7 @@ module FeedTools
556
587
  doc = REXML::Document.new(xhtml.to_s.strip)
557
588
  if doc.children.size == 1
558
589
  child = doc.children[0]
559
- if child.name.downcase == "div"
590
+ if child.kind_of?(REXML::Element) && child.name.downcase == "div"
560
591
  return child.inner_xml.strip
561
592
  end
562
593
  end
@@ -574,7 +605,8 @@ module FeedTools
574
605
  # This is technically very, very wrong. But it saves oodles of
575
606
  # clock cycles, and probably works 99.999% of the time.
576
607
  html_document = HTree.parse_xml(
577
- FeedTools::HtmlHelper.tidy_html(html.gsub(/<body>(.|\n)*<\/body>/, ""))).to_rexml
608
+ FeedTools::HtmlHelper.tidy_html(
609
+ html.gsub(/<body.*?>(.|\n)*<\/body>/, "<body>-</body>"))).to_rexml
578
610
  html_node = nil
579
611
  head_node = nil
580
612
  link_nodes = []
@@ -22,20 +22,218 @@
22
22
  #++
23
23
 
24
24
  require 'feed_tools'
25
+ require 'feed_tools/helpers/uri_helper'
25
26
  require 'net/http'
26
27
 
27
28
  # TODO: Not used yet, don't load since it'll only be a performance hit
28
29
  # require 'net/https'
29
30
  # require 'net/ftp'
30
31
 
31
- # Stolen from the Universal Feed Parser
32
- FEED_TOOLS_ACCEPT_HEADER = "application/atom+xml,application/rdf+xml," +
33
- "application/rss+xml,application/x-netcdf,application/xml;" +
34
- "q=0.9,text/xml;q=0.2,*/*;q=0.1"
35
-
36
- # TODO: Refactor http_fetch and other methods.
37
32
  module FeedTools
38
33
  # Methods for pulling remote data
39
34
  module RetrievalHelper
35
+ # Stolen from the Universal Feed Parser
36
+ ACCEPT_HEADER = "application/atom+xml,application/rdf+xml," +
37
+ "application/rss+xml,application/x-netcdf,application/xml;" +
38
+ "q=0.9,text/xml;q=0.2,*/*;q=0.1"
39
+
40
+ # Makes an HTTP request and returns the HTTP response. Optionally
41
+ # takes a block that determines whether or not to follow a redirect.
42
+ # The block will be passed the HTTP redirect response as an argument.
43
+ def self.http_request(http_operation, url, options={}, &block)
44
+ response = nil
45
+
46
+ options = {
47
+ :feed_object => nil,
48
+ :form_data => nil,
49
+ :request_headers => {},
50
+ :follow_redirects => true,
51
+ :redirect_limit => 10,
52
+ :response_chain => []
53
+ }.merge(options)
54
+
55
+ if options[:redirect_limit] == 0
56
+ raise FeedAccessError, 'Redirect too deep'
57
+ end
58
+
59
+ if options[:response_chain].blank? ||
60
+ !options[:response_chain].kind_of?(Array)
61
+ options[:response_chain] = []
62
+ end
63
+
64
+ if !options[:request_headers].kind_of?(Hash)
65
+ options[:request_headers] = {}
66
+ end
67
+ if !options[:form_data].kind_of?(Hash)
68
+ options[:form_data] = nil
69
+ end
70
+
71
+ if options[:request_headers].blank? && options[:feed_object] != nil
72
+ options[:request_headers] = {}
73
+ unless options[:feed_object].http_headers.nil?
74
+ unless options[:feed_object].http_headers['etag'].nil?
75
+ options[:request_headers]["If-None-Match"] =
76
+ options[:feed_object].http_headers['etag']
77
+ end
78
+ unless options[:feed_object].http_headers['last-modified'].nil?
79
+ options[:request_headers]["If-Modified-Since"] =
80
+ options[:feed_object].http_headers['last-modified']
81
+ end
82
+ end
83
+ unless options[:feed_object].configurations[:user_agent].nil?
84
+ options[:request_headers]["User-Agent"] =
85
+ options[:feed_object].configurations[:user_agent]
86
+ end
87
+ end
88
+ if options[:request_headers]["Accept"].nil?
89
+ options[:request_headers]["Accept"] =
90
+ FeedTools::RetrievalHelper::ACCEPT_HEADER
91
+ end
92
+ if options[:request_headers]["User-Agent"].nil?
93
+ options[:request_headers]["User-Agent"] =
94
+ FeedTools.configurations[:user_agent]
95
+ end
96
+
97
+ uri = nil
98
+ begin
99
+ uri = URI.parse(url)
100
+ rescue URI::InvalidURIError
101
+ # Uh, maybe try to fix it?
102
+ uri = URI.parse(FeedTools::UriHelper.normalize_url(url))
103
+ end
104
+
105
+ begin
106
+ proxy_address = nil
107
+ proxy_port = nil
108
+ proxy_user = nil
109
+ proxy_password = nil
110
+
111
+ if options[:feed_object] != nil
112
+ proxy_address =
113
+ options[:feed_object].configurations[:proxy_address] || nil
114
+ proxy_port =
115
+ options[:feed_object].configurations[:proxy_port].to_i || nil
116
+ proxy_user =
117
+ options[:feed_object].configurations[:proxy_user].to_i || nil
118
+ proxy_password =
119
+ options[:feed_object].configurations[:proxy_password].to_i || nil
120
+ end
121
+
122
+ # No need to check for nil
123
+ http = Net::HTTP::Proxy(
124
+ proxy_address, proxy_port, proxy_user, proxy_password).new(
125
+ uri.host, (uri.port or 80))
126
+
127
+ path = uri.path
128
+ path += ('?' + uri.query) if uri.query
129
+
130
+ request_params = [path, options[:request_headers]]
131
+ if http_operation == :post
132
+ options[:form_data] = {} if options[:form_data].blank?
133
+ request_params << options[:form_data]
134
+ end
135
+ response = http.send(http_operation, *request_params)
136
+
137
+ case response
138
+ when Net::HTTPSuccess
139
+ if options[:feed_object] != nil
140
+ # We've reached the final destination, process all previous
141
+ # redirections, and see if we need to update the url.
142
+ for redirected_response in options[:response_chain]
143
+ if redirected_response.last.code.to_i == 301
144
+ # Reset the cache object or we may get duplicate entries
145
+
146
+ # TODO: verify this line is necessary!
147
+ #=============================================================================
148
+ options[:feed_object].cache_object = nil
149
+
150
+ options[:feed_object].href =
151
+ redirected_response.last['location']
152
+ else
153
+ # Jump out as soon as we hit anything that isn't a
154
+ # permanently moved redirection.
155
+ break
156
+ end
157
+ end
158
+ end
159
+ when Net::HTTPNotModified
160
+ # Do nothing, we just don't want it processed as a redirection
161
+ when Net::HTTPRedirection
162
+ if response['location'].nil?
163
+ raise FeedAccessError,
164
+ "No location to redirect to supplied for " + response.code
165
+ end
166
+ options[:response_chain] << [url, response]
167
+
168
+ redirected_location = response['location']
169
+ redirected_location = FeedTools::UriHelper.resolve_relative_uri(
170
+ redirected_location, [uri.host])
171
+
172
+ if options[:response_chain].assoc(redirected_location) != nil
173
+ raise FeedAccessError,
174
+ "Redirection loop detected: #{redirected_location}"
175
+ end
176
+
177
+ # Let the block handle redirects
178
+ follow_redirect = true
179
+ if block != nil
180
+ follow_redirect = block.call(redirected_location, response)
181
+ end
182
+
183
+ if follow_redirect
184
+ response = FeedTools::RetrievalHelper.http_request(
185
+ http_operation,
186
+ redirected_location,
187
+ options.merge(
188
+ {:redirect_limit => (options[:redirect_limit] - 1)}),
189
+ &block)
190
+ end
191
+ end
192
+ rescue SocketError
193
+ raise FeedAccessError, 'Socket error prevented feed retrieval'
194
+ rescue Timeout::Error
195
+ raise FeedAccessError, 'Timeout while attempting to retrieve feed'
196
+ rescue Errno::ENETUNREACH
197
+ raise FeedAccessError, 'Network was unreachable'
198
+ rescue Errno::ECONNRESET
199
+ raise FeedAccessError, 'Connection was reset by peer'
200
+ end
201
+
202
+ if response != nil
203
+ class << response
204
+ def response_chain
205
+ return @response_chain
206
+ end
207
+ end
208
+ response.instance_variable_set("@response_chain",
209
+ options[:response_chain])
210
+ end
211
+
212
+ return response
213
+ end
214
+
215
+ # Makes an HTTP GET request and returns the HTTP response. Optionally
216
+ # takes a block that determines whether or not to follow a redirect.
217
+ # The block will be passed the HTTP redirect response as an argument.
218
+ def self.http_get(url, options={}, &block)
219
+ return FeedTools::RetrievalHelper.http_request(
220
+ :get, url, options, &block)
221
+ end
222
+
223
+ # Makes an HTTP POST request and returns the HTTP response. Optionally
224
+ # takes a block that determines whether or not to follow a redirect.
225
+ # The block will be passed the HTTP redirect response as an argument.
226
+ def self.http_post(url, options={}, &block)
227
+ return FeedTools::RetrievalHelper.http_request(
228
+ :post, url, options, &block)
229
+ end
230
+
231
+ # Makes an HTTP HEAD request and returns the HTTP response. Optionally
232
+ # takes a block that determines whether or not to follow a redirect.
233
+ # The block will be passed the HTTP redirect response as an argument.
234
+ def http_head(url, options={}, &block)
235
+ return FeedTools::RetrievalHelper.http_request(
236
+ :head, url, options, &block)
237
+ end
40
238
  end
41
239
  end