feedtools 0.2.18 → 0.2.19

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,3 +1,29 @@
1
+ == FeedTools 0.2.19
2
+ * lousy encoding support (as opposed to none at all)
3
+ * xml processing instruction now correctly prefixes generated feeds
4
+ * attributes are escaped properly when generating feeds
5
+ * uppercase html is no longer sanitized for not being in the whitelist
6
+ * added alias method for assigning to entries
7
+ * changed the xpath querying to be much, much more DRY
8
+ * find_node and find_all_nodes are actually useful now
9
+ * full case-insensitivity implemented for the xpath helper methods
10
+ * fixed bug in tests where some assertion failures could affect other tests
11
+ * fixed bug where the feed item author would sometimes be parsed incorrectly
12
+ * fixed bug where the convertLineBreaks element would break feed entries
13
+ * default (i.e. preferred) methods will be Atom-style instead of RSS-style
14
+ * default feed output format changed to Atom 1.0
15
+ * itunes namespace corrected
16
+ * fixed images property when dealing with atom
17
+ * fixed atom link property
18
+ * improved timestamp handling
19
+ * whitespace nodes now ignored by REXML
20
+ * added option to disable timestamp estimation
21
+ * added option to limit time-to-live to some upper maximum
22
+ * enclosures included in feed generation
23
+ * no longer uses the cache at all for file:/// urls
24
+ * changed itunes:keywords to use commas
25
+ * testing now excludes cache testing by default, use "test_all" to include it
26
+ * more tests
1
27
  == FeedTools 0.2.18
2
28
  * no longer ever polls more often than once every 30 minutes
3
29
  * fixed overlooked improperly refactored enclosure code
@@ -6,6 +32,8 @@
6
32
  * test cases now implemented using helpers
7
33
  * fixed issue with timeouts
8
34
  * fixed stack overflow while estimating timestamps
35
+ * fixed some namespace issue with atom
36
+ * added base64 decoding support
9
37
  == FeedTools 0.2.17
10
38
  * more fixes for timestamping of feed items
11
39
  * fixed nil bug in root_node, feed_type, feed_version, build_xml
data/lib/feed_tools.rb CHANGED
@@ -30,9 +30,9 @@ end
30
30
 
31
31
  FEED_TOOLS_ENV = ENV['FEED_TOOLS_ENV'] ||
32
32
  ENV['RAILS_ENV'] ||
33
- 'production' # :nodoc:
33
+ 'development' # :nodoc:
34
34
 
35
- FEED_TOOLS_VERSION = "0.2.18"
35
+ FEED_TOOLS_VERSION = "0.2.19"
36
36
 
37
37
  FEED_TOOLS_NAMESPACES = {
38
38
  "admin" => "http://webns.net/mvcb/",
@@ -40,8 +40,10 @@ FEED_TOOLS_NAMESPACES = {
40
40
  "annotate" => "http://purl.org/rss/1.0/modules/annotate/",
41
41
  "atom10" => "http://www.w3.org/2005/Atom",
42
42
  "atom03" => "http://purl.org/atom/ns#",
43
+ # "atom-blog" => "http://purl.org/atom-blog/ns#",
43
44
  "audio" => "http://media.tangent.org/rss/1.0/",
44
45
  "blogChannel" => "http://backend.userland.com/blogChannelModule",
46
+ "blogger" => "http://www.blogger.com/atom/ns#",
45
47
  "cc" => "http://web.resource.org/cc/",
46
48
  "creativeCommons" => "http://backend.userland.com/creativeCommonsRssModule",
47
49
  "co" => "http://purl.org/rss/1.0/modules/company",
@@ -56,7 +58,7 @@ FEED_TOOLS_NAMESPACES = {
56
58
  "feedburner" => "http://rssnamespace.org/feedburner/ext/1.0",
57
59
  "foaf" => "http://xmlns.com/foaf/0.1/",
58
60
  "fm" => "http://freshmeat.net/rss/fm/",
59
- "itunes" => "http://www.itunes.com/DTDs/Podcast-1.0.dtd",
61
+ "itunes" => "http://www.itunes.com/dtds/podcast-1.0.dtd",
60
62
  "l" => "http://purl.org/rss/1.0/modules/link/",
61
63
  "media" => "http://search.yahoo.com/mrss",
62
64
  "pingback" => "http://madskills.com/public/xml/rss/module/pingback/",
@@ -97,12 +99,7 @@ begin
97
99
 
98
100
  require 'rubygems'
99
101
 
100
- begin
101
- require 'builder'
102
- rescue LoadError
103
- # RubyGems version is not available, use included Builder
104
- require 'feed_tools/vendor/builder'
105
- end
102
+ require_gem('builder', '>= 1.2.4')
106
103
 
107
104
  begin
108
105
  require 'tidy'
@@ -113,8 +110,10 @@ begin
113
110
  require 'feed_tools/vendor/htree'
114
111
 
115
112
  require 'net/http'
116
- require 'net/https'
117
- require 'net/ftp'
113
+
114
+ # TODO: Not used yet, don't load since it'll only be a performance hit
115
+ # require 'net/https'
116
+ # require 'net/ftp'
118
117
 
119
118
  require 'rexml/document'
120
119
 
@@ -125,7 +124,8 @@ begin
125
124
  require 'yaml'
126
125
  require 'base64'
127
126
 
128
- require_gem('activerecord', '>= 1.10.1')
127
+ require_gem('activesupport', '>= 1.1.1')
128
+ require_gem('activerecord', '>= 1.11.1')
129
129
  require_gem('uuidtools', '>= 0.1.2')
130
130
 
131
131
  require 'feed_tools/feed'
@@ -155,25 +155,55 @@ end
155
155
  # slashdot_feed.items.first.find_node("slash:hitparade/text()").value
156
156
  # => "43,37,28,23,11,3,1"
157
157
  module FeedTools
158
+ @configurations = {}
159
+
160
+ def FeedTools.load_configurations
161
+ if @configurations.blank?
162
+ config_hash = {}
163
+ @configurations = {
164
+ :feed_cache => "FeedTools::DatabaseFeedCache",
165
+ :user_agent => "FeedTools/#{FEED_TOOLS_VERSION} " +
166
+ "+http://www.sporkmonger.com/projects/feedtools/",
167
+ :generator_name => "FeedTools/#{FEED_TOOLS_VERSION}",
168
+ :generator_href => "http://www.sporkmonger.com/projects/feedtools/",
169
+ :tidy_enabled => false,
170
+ :tidy_options => {},
171
+ :sanitize_with_nofollow => true,
172
+ :timestamp_estimation_enabled => true,
173
+ :url_normalization_enabled => true,
174
+ :strip_comment_count => false,
175
+ :max_ttl => 3.days.to_s,
176
+ :output_encoding => "utf-8",
177
+ :no_content_value => "[no description]"
178
+ }.merge(config_hash)
179
+ end
180
+ return @configurations
181
+ end
158
182
 
159
- @force_tidy_enabled = true
160
- @tidy_enabled = false
161
- @feed_cache = DatabaseFeedCache
162
- @user_agent = "FeedTools/#{FEED_TOOLS_VERSION} " +
163
- "+http://www.sporkmonger.com/projects/feedtools/"
164
- @no_content_string = "[no description]"
183
+ # Resets configuration to a clean load
184
+ def FeedTools.reset_configurations
185
+ @configurations = nil
186
+ FeedTools.load_configurations
187
+ end
188
+
189
+ # Returns the configuration hash for FeedTools
190
+ def FeedTools.configurations
191
+ if @configurations.blank?
192
+ FeedTools.load_configurations()
193
+ end
194
+ return @configurations
195
+ end
196
+
197
+ # Sets the configuration hash for FeedTools
198
+ def FeedTools.configurations=(new_configurations)
199
+ @configurations = new_configurations
200
+ end
165
201
 
166
202
  # Error raised when a feed cannot be retrieved
167
203
  class FeedAccessError < StandardError
168
204
  end
169
205
 
170
206
  # Returns the current caching mechanism.
171
- def FeedTools.feed_cache
172
- return @feed_cache
173
- end
174
-
175
- # Sets the current caching mechanism. If set to nil, disables caching.
176
- # Default is the DatabaseFeedCache class.
177
207
  #
178
208
  # Objects of this class must accept the following messages:
179
209
  # id
@@ -199,12 +229,26 @@ module FeedTools
199
229
  # find_by_url
200
230
  # initialize_cache
201
231
  # connected?
202
- def FeedTools.feed_cache=(new_feed_cache)
203
- # TODO: ensure that the feed cache class actually does those things.
204
- # ==================================================================
205
- @feed_cache = new_feed_cache
232
+ def FeedTools.feed_cache
233
+ return nil if FeedTools.configurations[:feed_cache].blank?
234
+ class_name = FeedTools.configurations[:feed_cache].to_s
235
+ if @feed_cache.nil? || @feed_cache.to_s != class_name
236
+ begin
237
+ cache_class = eval(class_name)
238
+ if cache_class.kind_of?(Class)
239
+ @feed_cache = cache_class
240
+ return cache_class
241
+ else
242
+ return nil
243
+ end
244
+ rescue
245
+ return nil
246
+ end
247
+ else
248
+ return @feed_cache
249
+ end
206
250
  end
207
-
251
+
208
252
  # Returns true if FeedTools.feed_cache is not nil and a connection with
209
253
  # the cache has been successfully established. Also returns false if an
210
254
  # error is raised while trying to determine the status of the cache.
@@ -216,28 +260,7 @@ module FeedTools
216
260
  return false
217
261
  end
218
262
  end
219
-
220
- # Returns the currently used user agent string.
221
- def FeedTools.user_agent
222
- return @user_agent
223
- end
224
-
225
- # Sets the user agent string to send in the http headers.
226
- def FeedTools.user_agent=(new_user_agent)
227
- @user_agent = new_user_agent
228
- end
229
-
230
- # Returns the currently used no content string.
231
- def FeedTools.no_content_string
232
- return @no_content_string
233
- end
234
-
235
- # Sets the no content string to use when a feed is missing a content element.
236
- # Used only for xml output.
237
- def FeedTools.no_content_string=(new_no_content_string)
238
- @no_content_string = new_no_content_string
239
- end
240
-
263
+
241
264
  # Returns true if the html tidy module can be used.
242
265
  #
243
266
  # Obviously, you need the tidy gem installed in order to run with html
@@ -255,7 +278,7 @@ module FeedTools
255
278
  def FeedTools.tidy_enabled?
256
279
  # This is an override variable to keep tidy from being used even if it
257
280
  # is available.
258
- if @force_tidy_enabled == false
281
+ if FeedTools.configurations[:tidy_enabled] == false
259
282
  return false
260
283
  end
261
284
  if @tidy_enabled.nil? || @tidy_enabled == false
@@ -337,13 +360,6 @@ module FeedTools
337
360
  end
338
361
  return @tidy_enabled
339
362
  end
340
-
341
- # Turns html tidy support on or off. Be aware, that setting this to true
342
- # does not mean tidy will be enabled. It simply means that tidy will be
343
- # enabled if it is available to be enabled.
344
- def FeedTools.tidy_enabled=(new_tidy_enabled)
345
- @force_tidy_enabled = new_tidy_enabled
346
- end
347
363
 
348
364
  # Attempts to ensure that the passed url is valid and sane. Accepts very, very ugly urls
349
365
  # and makes every effort to figure out what it was supposed to be. Also translates from
@@ -470,8 +486,8 @@ module FeedTools
470
486
  def FeedTools.escape_entities(html)
471
487
  return nil if html.nil?
472
488
  escaped_html = CGI.escapeHTML(html)
473
- unescaped_html.gsub!(/'/, "&apos;")
474
- unescaped_html.gsub!(/"/, "&quot;")
489
+ escaped_html.gsub!(/'/, "&apos;")
490
+ escaped_html.gsub!(/"/, "&quot;")
475
491
  return escaped_html
476
492
  end
477
493
 
@@ -540,6 +556,9 @@ module FeedTools
540
556
  else
541
557
  tidy_html = html
542
558
  end
559
+ if tidy_html.blank? && !html.blank?
560
+ tidy_html = html.strip
561
+ end
543
562
  return tidy_html
544
563
  end
545
564
 
@@ -586,7 +605,7 @@ module FeedTools
586
605
  if html_node.respond_to? :children
587
606
  for child in html_node.children
588
607
  if child.kind_of? REXML::Element
589
- unless acceptable_elements.include? child.name
608
+ unless acceptable_elements.include? child.name.downcase
590
609
  if mode == :strip
591
610
  html_node.delete_element(child)
592
611
  else
@@ -596,7 +615,7 @@ module FeedTools
596
615
  end
597
616
  end
598
617
  for attribute in child.attributes.keys
599
- unless acceptable_attributes.include? attribute
618
+ unless acceptable_attributes.include? attribute.downcase
600
619
  child.delete_attribute(attribute)
601
620
  end
602
621
  end
@@ -655,6 +674,252 @@ module FeedTools
655
674
  end
656
675
 
657
676
  module REXML # :nodoc:
677
+ class LiberalXPathParser < XPathParser # :nodoc:
678
+ private
679
+ def internal_parse(path_stack, nodeset) # :nodoc:
680
+ return nodeset if nodeset.size == 0 or path_stack.size == 0
681
+ case path_stack.shift
682
+ when :document
683
+ return [ nodeset[0].root.parent ]
684
+
685
+ when :qname
686
+ prefix = path_stack.shift.downcase
687
+ name = path_stack.shift.downcase
688
+ n = nodeset.clone
689
+ ns = @namespaces[prefix]
690
+ ns = ns ? ns : ''
691
+ n.delete_if do |node|
692
+ if node.node_type == :element and ns == ''
693
+ ns = node.namespace( prefix )
694
+ end
695
+ !(node.node_type == :element and
696
+ node.name.downcase == name and node.namespace == ns )
697
+ end
698
+ return n
699
+
700
+ when :any
701
+ n = nodeset.clone
702
+ n.delete_if { |node| node.node_type != :element }
703
+ return n
704
+
705
+ when :self
706
+ # THIS SPACE LEFT INTENTIONALLY BLANK
707
+
708
+ when :processing_instruction
709
+ target = path_stack.shift
710
+ n = nodeset.clone
711
+ n.delete_if do |node|
712
+ (node.node_type != :processing_instruction) or
713
+ ( !target.nil? and ( node.target != target ) )
714
+ end
715
+ return n
716
+
717
+ when :text
718
+ n = nodeset.clone
719
+ n.delete_if do |node|
720
+ node.node_type != :text
721
+ end
722
+ return n
723
+
724
+ when :comment
725
+ n = nodeset.clone
726
+ n.delete_if do |node|
727
+ node.node_type != :comment
728
+ end
729
+ return n
730
+
731
+ when :node
732
+ return nodeset
733
+
734
+ when :child
735
+ new_nodeset = []
736
+ nt = nil
737
+ for node in nodeset
738
+ nt = node.node_type
739
+ new_nodeset += node.children if nt == :element or nt == :document
740
+ end
741
+ return new_nodeset
742
+
743
+ when :literal
744
+ literal = path_stack.shift
745
+ if literal =~ /^\d+(\.\d+)?$/
746
+ return ($1 ? literal.to_f : literal.to_i)
747
+ end
748
+ return literal
749
+
750
+ when :attribute
751
+ new_nodeset = []
752
+ case path_stack.shift
753
+ when :qname
754
+ prefix = path_stack.shift
755
+ name = path_stack.shift.downcase
756
+ for element in nodeset
757
+ if element.node_type == :element
758
+ for attribute_name in element.attributes.keys
759
+ if attribute_name.downcase == name
760
+ attrib = element.attribute( attribute_name,
761
+ @namespaces[prefix] )
762
+ new_nodeset << attrib if attrib
763
+ end
764
+ end
765
+ end
766
+ end
767
+ when :any
768
+ for element in nodeset
769
+ if element.node_type == :element
770
+ new_nodeset += element.attributes.to_a
771
+ end
772
+ end
773
+ end
774
+ return new_nodeset
775
+
776
+ when :parent
777
+ return internal_parse( path_stack, nodeset.collect{|n| n.parent}.compact )
778
+
779
+ when :ancestor
780
+ new_nodeset = []
781
+ for node in nodeset
782
+ while node.parent
783
+ node = node.parent
784
+ new_nodeset << node unless new_nodeset.include? node
785
+ end
786
+ end
787
+ return new_nodeset
788
+
789
+ when :ancestor_or_self
790
+ new_nodeset = []
791
+ for node in nodeset
792
+ if node.node_type == :element
793
+ new_nodeset << node
794
+ while ( node.parent )
795
+ node = node.parent
796
+ new_nodeset << node unless new_nodeset.include? node
797
+ end
798
+ end
799
+ end
800
+ return new_nodeset
801
+
802
+ when :predicate
803
+ predicate = path_stack.shift
804
+ new_nodeset = []
805
+ Functions::size = nodeset.size
806
+ nodeset.size.times do |index|
807
+ node = nodeset[index]
808
+ Functions::node = node
809
+ Functions::index = index+1
810
+ result = Predicate( predicate, node )
811
+ if result.kind_of? Numeric
812
+ new_nodeset << node if result == (index+1)
813
+ elsif result.instance_of? Array
814
+ new_nodeset << node if result.size > 0
815
+ else
816
+ new_nodeset << node if result
817
+ end
818
+ end
819
+ return new_nodeset
820
+
821
+ when :descendant_or_self
822
+ rv = descendant_or_self( path_stack, nodeset )
823
+ path_stack.clear
824
+ return rv
825
+
826
+ when :descendant
827
+ results = []
828
+ nt = nil
829
+ for node in nodeset
830
+ nt = node.node_type
831
+ if nt == :element or nt == :document
832
+ results += internal_parse(
833
+ path_stack.clone.unshift( :descendant_or_self ),
834
+ node.children )
835
+ end
836
+ end
837
+ return results
838
+
839
+ when :following_sibling
840
+ results = []
841
+ for node in nodeset
842
+ all_siblings = node.parent.children
843
+ current_index = all_siblings.index( node )
844
+ following_siblings = all_siblings[ current_index+1 .. -1 ]
845
+ results += internal_parse( path_stack.clone, following_siblings )
846
+ end
847
+ return results
848
+
849
+ when :preceding_sibling
850
+ results = []
851
+ for node in nodeset
852
+ all_siblings = node.parent.children
853
+ current_index = all_siblings.index( node )
854
+ preceding_siblings = all_siblings[ 0 .. current_index-1 ]
855
+ results += internal_parse( path_stack.clone, preceding_siblings )
856
+ end
857
+ return results
858
+
859
+ when :preceding
860
+ new_nodeset = []
861
+ for node in nodeset
862
+ new_nodeset += preceding( node )
863
+ end
864
+ return new_nodeset
865
+
866
+ when :following
867
+ new_nodeset = []
868
+ for node in nodeset
869
+ new_nodeset += following( node )
870
+ end
871
+ return new_nodeset
872
+
873
+ when :namespace
874
+ new_set = []
875
+ for node in nodeset
876
+ if node.node_type == :element or node.node_type == :attribute
877
+ new_nodeset << node.namespace
878
+ end
879
+ end
880
+ return new_nodeset
881
+
882
+ when :variable
883
+ var_name = path_stack.shift
884
+ return @variables[ var_name ]
885
+
886
+ end
887
+ nodeset
888
+ end
889
+ end
890
+
891
+ class XPath # :nodoc:
892
+ def self.liberal_match(element, path=nil, namespaces={},
893
+ variables={}) # :nodoc:
894
+ parser = LiberalXPathParser.new
895
+ parser.namespaces = namespaces
896
+ parser.variables = variables
897
+ path = "*" unless path
898
+ element = [element] unless element.kind_of? Array
899
+ parser.parse(path, element)
900
+ end
901
+
902
+ def self.liberal_first(element, path=nil, namespaces={},
903
+ variables={}) # :nodoc:
904
+ parser = LiberalXPathParser.new
905
+ parser.namespaces = namespaces
906
+ parser.variables = variables
907
+ path = "*" unless path
908
+ element = [element] unless element.kind_of? Array
909
+ parser.parse(path, element)[0]
910
+ end
911
+
912
+ def self.liberal_each(element, path=nil, namespaces={},
913
+ variables={}, &block) # :nodoc:
914
+ parser = LiberalXPathParser.new
915
+ parser.namespaces = namespaces
916
+ parser.variables = variables
917
+ path = "*" unless path
918
+ element = [element] unless element.kind_of? Array
919
+ parser.parse(path, element).each( &block )
920
+ end
921
+ end
922
+
658
923
  class Element # :nodoc:
659
924
  unless REXML::Element.public_instance_methods.include? :inner_xml
660
925
  def inner_xml # :nodoc: