feed-normalizer 1.5.1 → 1.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +48 -48
 - data/License.txt +27 -27
 - data/Manifest.txt +18 -19
 - data/README.txt +63 -63
 - data/Rakefile +29 -25
 - data/lib/feed-normalizer.rb +149 -149
 - data/lib/html-cleaner.rb +181 -190
 - data/lib/parsers/rss.rb +110 -95
 - data/lib/parsers/simple-rss.rb +138 -137
 - data/lib/structures.rb +245 -244
 - data/test/data/atom03.xml +128 -127
 - data/test/data/atom10.xml +114 -112
 - data/test/data/rdf10.xml +1498 -1498
 - data/test/data/rss20.xml +64 -63
 - data/test/data/rss20diff.xml +59 -59
 - data/test/data/rss20diff_short.xml +51 -51
 - data/test/test_feednormalizer.rb +265 -267
 - data/test/test_htmlcleaner.rb +156 -155
 - metadata +99 -63
 - data/test/test_all.rb +0 -6
 
    
        data/lib/parsers/simple-rss.rb
    CHANGED
    
    | 
         @@ -1,137 +1,138 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
            require 'simple-rss'
         
     | 
| 
       2 
     | 
    
         
            -
             
     | 
| 
       3 
     | 
    
         
            -
            # Monkey patches for outstanding issues logged in the simple-rss project.
         
     | 
| 
       4 
     | 
    
         
            -
            #   * Add support for issued time field:
         
     | 
| 
       5 
     | 
    
         
            -
            #     http://rubyforge.org/tracker/index.php?func=detail&aid=13980&group_id=893&atid=3517
         
     | 
| 
       6 
     | 
    
         
            -
            #   * The '+' symbol is lost when escaping fields.
         
     | 
| 
       7 
     | 
    
         
            -
            #     http://rubyforge.org/tracker/index.php?func=detail&aid=10852&group_id=893&atid=3517
         
     | 
| 
       8 
     | 
    
         
            -
            #
         
     | 
| 
       9 
     | 
    
         
            -
            class SimpleRSS
         
     | 
| 
       10 
     | 
    
         
            -
              @@item_tags << :issued
         
     | 
| 
       11 
     | 
    
         
            -
             
     | 
| 
       12 
     | 
    
         
            -
              undef clean_content
         
     | 
| 
       13 
     | 
    
         
            -
              def clean_content(tag, attrs, content)
         
     | 
| 
       14 
     | 
    
         
            -
                content = content.to_s
         
     | 
| 
       15 
     | 
    
         
            -
                case tag
         
     | 
| 
       16 
     | 
    
         
            -
                  when :pubDate, :lastBuildDate, :published, :updated, :expirationDate, :modified, :'dc:date', :issued
         
     | 
| 
       17 
     | 
    
         
            -
                    Time.parse(content) rescue unescape(content)
         
     | 
| 
       18 
     | 
    
         
            -
                  when :author, :contributor, :skipHours, :skipDays
         
     | 
| 
       19 
     | 
    
         
            -
                    unescape(content.gsub(/<.*?>/,''))
         
     | 
| 
       20 
     | 
    
         
            -
                  else
         
     | 
| 
       21 
     | 
    
         
            -
                    content.empty? && "#{attrs} " =~ /href=['"]?([^'"]*)['" ]/mi ? $1.strip : unescape(content)
         
     | 
| 
       22 
     | 
    
         
            -
                end
         
     | 
| 
       23 
     | 
    
         
            -
              end
         
     | 
| 
       24 
     | 
    
         
            -
             
     | 
| 
       25 
     | 
    
         
            -
              undef unescape
         
     | 
| 
       26 
     | 
    
         
            -
              def unescape(s)
         
     | 
| 
       27 
     | 
    
         
            -
               if s =~  
     | 
| 
       28 
     | 
    
         
            -
                 # Raw HTML is inside the CDATA, so just remove the CDATA wrapper.
         
     | 
| 
       29 
     | 
    
         
            -
                 s.gsub(/(<!\[CDATA\[|\]\]>)/,'') 
     | 
| 
       30 
     | 
    
         
            -
               elsif s =~ /[<>]/
         
     | 
| 
       31 
     | 
    
         
            -
                 # Already looks like HTML.
         
     | 
| 
       32 
     | 
    
         
            -
                 s
         
     | 
| 
       33 
     | 
    
         
            -
               else
         
     | 
| 
       34 
     | 
    
         
            -
                 # Make it HTML.
         
     | 
| 
       35 
     | 
    
         
            -
                 FeedNormalizer::HtmlCleaner.unescapeHTML(s)
         
     | 
| 
       36 
     | 
    
         
            -
               end
         
     | 
| 
       37 
     | 
    
         
            -
             end
         
     | 
| 
       38 
     | 
    
         
            -
            end
         
     | 
| 
       39 
     | 
    
         
            -
             
     | 
| 
       40 
     | 
    
         
            -
            module FeedNormalizer
         
     | 
| 
       41 
     | 
    
         
            -
             
     | 
| 
       42 
     | 
    
         
            -
              # The SimpleRSS parser can handle both RSS and Atom feeds.
         
     | 
| 
       43 
     | 
    
         
            -
              class SimpleRssParser < Parser
         
     | 
| 
       44 
     | 
    
         
            -
             
     | 
| 
       45 
     | 
    
         
            -
                def self.parser
         
     | 
| 
       46 
     | 
    
         
            -
                  SimpleRSS
         
     | 
| 
       47 
     | 
    
         
            -
                end
         
     | 
| 
       48 
     | 
    
         
            -
             
     | 
| 
       49 
     | 
    
         
            -
                def self.parse(xml, loose)
         
     | 
| 
       50 
     | 
    
         
            -
                  begin
         
     | 
| 
       51 
     | 
    
         
            -
                    atomrss = parser.parse(xml)
         
     | 
| 
       52 
     | 
    
         
            -
                  rescue Exception => e
         
     | 
| 
       53 
     | 
    
         
            -
                    #puts "Parser #{parser} failed because #{e.message.gsub("\n",', ')}"
         
     | 
| 
       54 
     | 
    
         
            -
                    return nil
         
     | 
| 
       55 
     | 
    
         
            -
                  end
         
     | 
| 
       56 
     | 
    
         
            -
             
     | 
| 
       57 
     | 
    
         
            -
                  package(atomrss)
         
     | 
| 
       58 
     | 
    
         
            -
                end
         
     | 
| 
       59 
     | 
    
         
            -
             
     | 
| 
       60 
     | 
    
         
            -
                # Fairly low priority; a slower, liberal parser.
         
     | 
| 
       61 
     | 
    
         
            -
                def self.priority
         
     | 
| 
       62 
     | 
    
         
            -
                  900
         
     | 
| 
       63 
     | 
    
         
            -
                end
         
     | 
| 
       64 
     | 
    
         
            -
             
     | 
| 
       65 
     | 
    
         
            -
                protected
         
     | 
| 
       66 
     | 
    
         
            -
             
     | 
| 
       67 
     | 
    
         
            -
                def self.package(atomrss)
         
     | 
| 
       68 
     | 
    
         
            -
                  feed = Feed.new(self)
         
     | 
| 
       69 
     | 
    
         
            -
             
     | 
| 
       70 
     | 
    
         
            -
                  # root elements
         
     | 
| 
       71 
     | 
    
         
            -
                  feed_mapping = {
         
     | 
| 
       72 
     | 
    
         
            -
                    :generator => :generator,
         
     | 
| 
       73 
     | 
    
         
            -
                    :title => :title,
         
     | 
| 
       74 
     | 
    
         
            -
                    :last_updated => [:updated, :lastBuildDate, :pubDate, :dc_date],
         
     | 
| 
       75 
     | 
    
         
            -
                    :copyright => [:copyright, :rights],
         
     | 
| 
       76 
     | 
    
         
            -
                    :authors => [:author, :webMaster, :managingEditor, :contributor],
         
     | 
| 
       77 
     | 
    
         
            -
                    :urls => :link,
         
     | 
| 
       78 
     | 
    
         
            -
                    :description => [:description, :subtitle],
         
     | 
| 
       79 
     | 
    
         
            -
                    :ttl => :ttl
         
     | 
| 
       80 
     | 
    
         
            -
                  }
         
     | 
| 
       81 
     | 
    
         
            -
             
     | 
| 
       82 
     | 
    
         
            -
                  map_functions!(feed_mapping, atomrss, feed)
         
     | 
| 
       83 
     | 
    
         
            -
             
     | 
| 
       84 
     | 
    
         
            -
                  # custom channel elements
         
     | 
| 
       85 
     | 
    
         
            -
                  feed.id = feed_id(atomrss)
         
     | 
| 
       86 
     | 
    
         
            -
                  feed.image = image(atomrss)
         
     | 
| 
       87 
     | 
    
         
            -
             
     | 
| 
       88 
     | 
    
         
            -
             
     | 
| 
       89 
     | 
    
         
            -
                  # entry elements
         
     | 
| 
       90 
     | 
    
         
            -
                  entry_mapping = {
         
     | 
| 
       91 
     | 
    
         
            -
                    :date_published => [:pubDate, :published, :dc_date, :issued],
         
     | 
| 
       92 
     | 
    
         
            -
                    :urls => :link,
         
     | 
| 
       93 
     | 
    
         
            -
                    : 
     | 
| 
       94 
     | 
    
         
            -
                    : 
     | 
| 
       95 
     | 
    
         
            -
                    : 
     | 
| 
       96 
     | 
    
         
            -
                    : 
     | 
| 
       97 
     | 
    
         
            -
                    : 
     | 
| 
       98 
     | 
    
         
            -
                    : 
     | 
| 
       99 
     | 
    
         
            -
             
     | 
| 
       100 
     | 
    
         
            -
             
     | 
| 
       101 
     | 
    
         
            -
             
     | 
| 
       102 
     | 
    
         
            -
             
     | 
| 
       103 
     | 
    
         
            -
                     
     | 
| 
       104 
     | 
    
         
            -
             
     | 
| 
       105 
     | 
    
         
            -
             
     | 
| 
       106 
     | 
    
         
            -
                     
     | 
| 
       107 
     | 
    
         
            -
                    feed_entry. 
     | 
| 
       108 
     | 
    
         
            -
             
     | 
| 
       109 
     | 
    
         
            -
             
     | 
| 
       110 
     | 
    
         
            -
             
     | 
| 
       111 
     | 
    
         
            -
             
     | 
| 
       112 
     | 
    
         
            -
             
     | 
| 
       113 
     | 
    
         
            -
             
     | 
| 
       114 
     | 
    
         
            -
             
     | 
| 
       115 
     | 
    
         
            -
             
     | 
| 
       116 
     | 
    
         
            -
             
     | 
| 
       117 
     | 
    
         
            -
             
     | 
| 
       118 
     | 
    
         
            -
             
     | 
| 
       119 
     | 
    
         
            -
             
     | 
| 
       120 
     | 
    
         
            -
             
     | 
| 
       121 
     | 
    
         
            -
             
     | 
| 
       122 
     | 
    
         
            -
             
     | 
| 
       123 
     | 
    
         
            -
             
     | 
| 
       124 
     | 
    
         
            -
             
     | 
| 
       125 
     | 
    
         
            -
             
     | 
| 
       126 
     | 
    
         
            -
             
     | 
| 
       127 
     | 
    
         
            -
             
     | 
| 
       128 
     | 
    
         
            -
             
     | 
| 
       129 
     | 
    
         
            -
             
     | 
| 
       130 
     | 
    
         
            -
             
     | 
| 
       131 
     | 
    
         
            -
             
     | 
| 
       132 
     | 
    
         
            -
                 
     | 
| 
       133 
     | 
    
         
            -
             
     | 
| 
       134 
     | 
    
         
            -
             
     | 
| 
       135 
     | 
    
         
            -
             
     | 
| 
       136 
     | 
    
         
            -
             
     | 
| 
       137 
     | 
    
         
            -
            end
         
     | 
| 
      
 1 
     | 
    
         
            +
            require 'simple-rss'
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            # Monkey patches for outstanding issues logged in the simple-rss project.
         
     | 
| 
      
 4 
     | 
    
         
            +
            #   * Add support for issued time field:
         
     | 
| 
      
 5 
     | 
    
         
            +
            #     http://rubyforge.org/tracker/index.php?func=detail&aid=13980&group_id=893&atid=3517
         
     | 
| 
      
 6 
     | 
    
         
            +
            #   * The '+' symbol is lost when escaping fields.
         
     | 
| 
      
 7 
     | 
    
         
            +
            #     http://rubyforge.org/tracker/index.php?func=detail&aid=10852&group_id=893&atid=3517
         
     | 
| 
      
 8 
     | 
    
         
            +
            #
         
     | 
| 
      
 9 
     | 
    
         
            +
            class SimpleRSS
         
     | 
| 
      
 10 
     | 
    
         
            +
              @@item_tags << :issued
         
     | 
| 
      
 11 
     | 
    
         
            +
             
     | 
| 
      
 12 
     | 
    
         
            +
              undef clean_content
         
     | 
| 
      
 13 
     | 
    
         
            +
              def clean_content(tag, attrs, content)
         
     | 
| 
      
 14 
     | 
    
         
            +
                content = content.to_s
         
     | 
| 
      
 15 
     | 
    
         
            +
                case tag
         
     | 
| 
      
 16 
     | 
    
         
            +
                  when :pubDate, :lastBuildDate, :published, :updated, :expirationDate, :modified, :'dc:date', :issued
         
     | 
| 
      
 17 
     | 
    
         
            +
                    Time.parse(content) rescue unescape(content)
         
     | 
| 
      
 18 
     | 
    
         
            +
                  when :author, :contributor, :skipHours, :skipDays
         
     | 
| 
      
 19 
     | 
    
         
            +
                    unescape(content.gsub(/<.*?>/,''))
         
     | 
| 
      
 20 
     | 
    
         
            +
                  else
         
     | 
| 
      
 21 
     | 
    
         
            +
                    content.empty? && "#{attrs} " =~ /href=['"]?([^'"]*)['" ]/mi ? $1.strip : unescape(content)
         
     | 
| 
      
 22 
     | 
    
         
            +
                end
         
     | 
| 
      
 23 
     | 
    
         
            +
              end
         
     | 
| 
      
 24 
     | 
    
         
            +
             
     | 
| 
      
 25 
     | 
    
         
            +
              undef unescape
         
     | 
| 
      
 26 
     | 
    
         
            +
              def unescape(s)
         
     | 
| 
      
 27 
     | 
    
         
            +
               if s =~ /^\s*(<!\[CDATA\[|\]\]>)/
         
     | 
| 
      
 28 
     | 
    
         
            +
                 # Raw HTML is inside the CDATA, so just remove the CDATA wrapper.
         
     | 
| 
      
 29 
     | 
    
         
            +
                 s.gsub(/(<!\[CDATA\[|\]\]>)/,'')
         
     | 
| 
      
 30 
     | 
    
         
            +
               elsif s =~ /[<>]/
         
     | 
| 
      
 31 
     | 
    
         
            +
                 # Already looks like HTML.
         
     | 
| 
      
 32 
     | 
    
         
            +
                 s
         
     | 
| 
      
 33 
     | 
    
         
            +
               else
         
     | 
| 
      
 34 
     | 
    
         
            +
                 # Make it HTML.
         
     | 
| 
      
 35 
     | 
    
         
            +
                 FeedNormalizer::HtmlCleaner.unescapeHTML(s)
         
     | 
| 
      
 36 
     | 
    
         
            +
               end
         
     | 
| 
      
 37 
     | 
    
         
            +
             end
         
     | 
| 
      
 38 
     | 
    
         
            +
            end
         
     | 
| 
      
 39 
     | 
    
         
            +
             
     | 
| 
      
 40 
     | 
    
         
            +
            module FeedNormalizer
         
     | 
| 
      
 41 
     | 
    
         
            +
             
     | 
| 
      
 42 
     | 
    
         
            +
              # The SimpleRSS parser can handle both RSS and Atom feeds.
         
     | 
| 
      
 43 
     | 
    
         
            +
              class SimpleRssParser < Parser
         
     | 
| 
      
 44 
     | 
    
         
            +
             
     | 
| 
      
 45 
     | 
    
         
            +
                def self.parser
         
     | 
| 
      
 46 
     | 
    
         
            +
                  SimpleRSS
         
     | 
| 
      
 47 
     | 
    
         
            +
                end
         
     | 
| 
      
 48 
     | 
    
         
            +
             
     | 
| 
      
 49 
     | 
    
         
            +
                def self.parse(xml, loose)
         
     | 
| 
      
 50 
     | 
    
         
            +
                  begin
         
     | 
| 
      
 51 
     | 
    
         
            +
                    atomrss = parser.parse(xml)
         
     | 
| 
      
 52 
     | 
    
         
            +
                  rescue Exception => e
         
     | 
| 
      
 53 
     | 
    
         
            +
                    #puts "Parser #{parser} failed because #{e.message.gsub("\n",', ')}"
         
     | 
| 
      
 54 
     | 
    
         
            +
                    return nil
         
     | 
| 
      
 55 
     | 
    
         
            +
                  end
         
     | 
| 
      
 56 
     | 
    
         
            +
             
     | 
| 
      
 57 
     | 
    
         
            +
                  package(atomrss)
         
     | 
| 
      
 58 
     | 
    
         
            +
                end
         
     | 
| 
      
 59 
     | 
    
         
            +
             
     | 
| 
      
 60 
     | 
    
         
            +
                # Fairly low priority; a slower, liberal parser.
         
     | 
| 
      
 61 
     | 
    
         
            +
                def self.priority
         
     | 
| 
      
 62 
     | 
    
         
            +
                  900
         
     | 
| 
      
 63 
     | 
    
         
            +
                end
         
     | 
| 
      
 64 
     | 
    
         
            +
             
     | 
| 
      
 65 
     | 
    
         
            +
                protected
         
     | 
| 
      
 66 
     | 
    
         
            +
             
     | 
| 
      
 67 
     | 
    
         
            +
                def self.package(atomrss)
         
     | 
| 
      
 68 
     | 
    
         
            +
                  feed = Feed.new(self)
         
     | 
| 
      
 69 
     | 
    
         
            +
             
     | 
| 
      
 70 
     | 
    
         
            +
                  # root elements
         
     | 
| 
      
 71 
     | 
    
         
            +
                  feed_mapping = {
         
     | 
| 
      
 72 
     | 
    
         
            +
                    :generator => :generator,
         
     | 
| 
      
 73 
     | 
    
         
            +
                    :title => :title,
         
     | 
| 
      
 74 
     | 
    
         
            +
                    :last_updated => [:updated, :lastBuildDate, :pubDate, :dc_date],
         
     | 
| 
      
 75 
     | 
    
         
            +
                    :copyright => [:copyright, :rights],
         
     | 
| 
      
 76 
     | 
    
         
            +
                    :authors => [:author, :webMaster, :managingEditor, :contributor],
         
     | 
| 
      
 77 
     | 
    
         
            +
                    :urls => :link,
         
     | 
| 
      
 78 
     | 
    
         
            +
                    :description => [:description, :subtitle],
         
     | 
| 
      
 79 
     | 
    
         
            +
                    :ttl => :ttl
         
     | 
| 
      
 80 
     | 
    
         
            +
                  }
         
     | 
| 
      
 81 
     | 
    
         
            +
             
     | 
| 
      
 82 
     | 
    
         
            +
                  map_functions!(feed_mapping, atomrss, feed)
         
     | 
| 
      
 83 
     | 
    
         
            +
             
     | 
| 
      
 84 
     | 
    
         
            +
                  # custom channel elements
         
     | 
| 
      
 85 
     | 
    
         
            +
                  feed.id = feed_id(atomrss)
         
     | 
| 
      
 86 
     | 
    
         
            +
                  feed.image = image(atomrss)
         
     | 
| 
      
 87 
     | 
    
         
            +
             
     | 
| 
      
 88 
     | 
    
         
            +
             
     | 
| 
      
 89 
     | 
    
         
            +
                  # entry elements
         
     | 
| 
      
 90 
     | 
    
         
            +
                  entry_mapping = {
         
     | 
| 
      
 91 
     | 
    
         
            +
                    :date_published => [:pubDate, :published, :dc_date, :issued],
         
     | 
| 
      
 92 
     | 
    
         
            +
                    :urls => :link,
         
     | 
| 
      
 93 
     | 
    
         
            +
                    :enclosures => :enclosure,
         
     | 
| 
      
 94 
     | 
    
         
            +
                    :description => [:description, :summary],
         
     | 
| 
      
 95 
     | 
    
         
            +
                    :content => [:content, :content_encoded, :description],
         
     | 
| 
      
 96 
     | 
    
         
            +
                    :title => :title,
         
     | 
| 
      
 97 
     | 
    
         
            +
                    :authors => [:author, :contributor, :dc_creator],
         
     | 
| 
      
 98 
     | 
    
         
            +
                    :categories => :category,
         
     | 
| 
      
 99 
     | 
    
         
            +
                    :last_updated => [:updated, :dc_date, :pubDate]
         
     | 
| 
      
 100 
     | 
    
         
            +
                  }
         
     | 
| 
      
 101 
     | 
    
         
            +
             
     | 
| 
      
 102 
     | 
    
         
            +
                  atomrss.entries.each do |atomrss_entry|
         
     | 
| 
      
 103 
     | 
    
         
            +
                    feed_entry = Entry.new
         
     | 
| 
      
 104 
     | 
    
         
            +
                    map_functions!(entry_mapping, atomrss_entry, feed_entry)
         
     | 
| 
      
 105 
     | 
    
         
            +
             
     | 
| 
      
 106 
     | 
    
         
            +
                    # custom entry elements
         
     | 
| 
      
 107 
     | 
    
         
            +
                    feed_entry.id = atomrss_entry.guid || atomrss_entry[:id] # entries are a Hash..
         
     | 
| 
      
 108 
     | 
    
         
            +
                    feed_entry.copyright = atomrss_entry.copyright || (atomrss.respond_to?(:copyright) ? atomrss.copyright : nil)
         
     | 
| 
      
 109 
     | 
    
         
            +
             
     | 
| 
      
 110 
     | 
    
         
            +
                    feed.entries << feed_entry
         
     | 
| 
      
 111 
     | 
    
         
            +
                  end
         
     | 
| 
      
 112 
     | 
    
         
            +
             
     | 
| 
      
 113 
     | 
    
         
            +
                  feed
         
     | 
| 
      
 114 
     | 
    
         
            +
                end
         
     | 
| 
      
 115 
     | 
    
         
            +
             
     | 
| 
      
 116 
     | 
    
         
            +
                def self.image(parser)
         
     | 
| 
      
 117 
     | 
    
         
            +
                  if parser.respond_to?(:image) && parser.image
         
     | 
| 
      
 118 
     | 
    
         
            +
                    if parser.image =~ /<url>/ # RSS image contains an <url> spec
         
     | 
| 
      
 119 
     | 
    
         
            +
                      parser.image.scan(/<url>(.*?)<\/url>/).to_s
         
     | 
| 
      
 120 
     | 
    
         
            +
                    else
         
     | 
| 
      
 121 
     | 
    
         
            +
                      parser.image # Atom contains just the url
         
     | 
| 
      
 122 
     | 
    
         
            +
                    end
         
     | 
| 
      
 123 
     | 
    
         
            +
                  elsif parser.respond_to?(:logo) && parser.logo
         
     | 
| 
      
 124 
     | 
    
         
            +
                    parser.logo
         
     | 
| 
      
 125 
     | 
    
         
            +
                  end
         
     | 
| 
      
 126 
     | 
    
         
            +
                end
         
     | 
| 
      
 127 
     | 
    
         
            +
             
     | 
| 
      
 128 
     | 
    
         
            +
                def self.feed_id(parser)
         
     | 
| 
      
 129 
     | 
    
         
            +
                  overridden_value(parser, :id) || ("#{parser.link}" if parser.respond_to?(:link))
         
     | 
| 
      
 130 
     | 
    
         
            +
                end
         
     | 
| 
      
 131 
     | 
    
         
            +
             
     | 
| 
      
 132 
     | 
    
         
            +
                # gets the value returned from the method if it overriden, otherwise nil.
         
     | 
| 
      
 133 
     | 
    
         
            +
                def self.overridden_value(object, method)
         
     | 
| 
      
 134 
     | 
    
         
            +
                  object.class.public_instance_methods(false).include? method
         
     | 
| 
      
 135 
     | 
    
         
            +
                end
         
     | 
| 
      
 136 
     | 
    
         
            +
             
     | 
| 
      
 137 
     | 
    
         
            +
              end
         
     | 
| 
      
 138 
     | 
    
         
            +
            end
         
     | 
    
        data/lib/structures.rb
    CHANGED
    
    | 
         @@ -1,244 +1,245 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
             
     | 
| 
       2 
     | 
    
         
            -
            module FeedNormalizer
         
     | 
| 
       3 
     | 
    
         
            -
             
     | 
| 
       4 
     | 
    
         
            -
              module Singular
         
     | 
| 
       5 
     | 
    
         
            -
             
     | 
| 
       6 
     | 
    
         
            -
                # If the method being called is a singular (in this simple case, does not
         
     | 
| 
       7 
     | 
    
         
            -
                # end with an 's'), then it calls the plural method, and calls the first
         
     | 
| 
       8 
     | 
    
         
            -
                # element. We're assuming that plural methods provide an array.
         
     | 
| 
       9 
     | 
    
         
            -
                #
         
     | 
| 
       10 
     | 
    
         
            -
                # Example:
         
     | 
| 
       11 
     | 
    
         
            -
                # Object contains an array called 'alphas', which looks like [:a, :b, :c].
         
     | 
| 
       12 
     | 
    
         
            -
                # Call object.alpha and :a is returned.
         
     | 
| 
       13 
     | 
    
         
            -
                def method_missing(name, *args)
         
     | 
| 
       14 
     | 
    
         
            -
                  return self.send(:"#{name}s").first rescue super(name, *args)
         
     | 
| 
       15 
     | 
    
         
            -
                end
         
     | 
| 
       16 
     | 
    
         
            -
             
     | 
| 
       17 
     | 
    
         
            -
                def respond_to?(x, y=false)
         
     | 
| 
       18 
     | 
    
         
            -
                  self.class::ELEMENTS.include?(x) || self.class::ELEMENTS.include?(:"#{x}s") || super(x, y)
         
     | 
| 
       19 
     | 
    
         
            -
                end
         
     | 
| 
       20 
     | 
    
         
            -
             
     | 
| 
       21 
     | 
    
         
            -
              end
         
     | 
| 
       22 
     | 
    
         
            -
             
     | 
| 
       23 
     | 
    
         
            -
              module ElementEquality

                # Hash-style equality; delegates to #== so the two always agree.
                def eql?(other)
                  self == (other)
                end

                # Two objects are equal when they are the same object, or when +other+
                # is an instance of exactly the same class and every reader listed in
                # that class's ELEMENTS returns an equal value on both.
                def ==(other)
                  other.equal?(self) ||
                    (other.instance_of?(self.class) &&
                      self.class::ELEMENTS.all?{ |el| self.send(el) == other.send(el)} )
                end

                # Hash code built from the class and the ELEMENTS values, so that
                # objects which are eql? also share a hash code (required for them to
                # behave consistently as Hash keys or in Array#uniq). The value changes
                # if an element is modified afterwards.
                def hash
                  ([self.class] + self.class::ELEMENTS.map { |el| self.send(el) }).hash
                end

                # Returns the differences between this object and +other+ as a hash
                # keyed by element name. Elements that +other+ does not respond to, and
                # elements whose values are equal, are left out.
                #
                # Plain values are reported as a pair, with the other object's value
                # first:
                #
                #  { :title => [other_content, content] }
                #
                # For list elements whose members respond to +diff+ (such as items), an
                # array of hashes shows the diffs on a per-entry basis. Only entries
                # that differ contribute a hash:
                #
                #  { :items => [
                #     {:title => ["A new article title", "An article title"]},
                #     {:title => ["a different title", "one title"]} ]}
                #
                # If the two lists differ in length, the count of each is provided
                # instead, with this object's count first:
                #
                #  { :items => [4,5] }
                #
                # This method can also be useful for human-readable feed comparison if
                # its output is dumped to YAML.
                def diff(other, elements = self.class::ELEMENTS)
                  diffs = {}

                  elements.each do |element|
                    next unless other.respond_to?(element)

                    self_value = self.send(element)
                    other_value = other.send(element)
                    next if self_value == other_value

                    diffs[element] = diff_values(self_value, other_value)
                  end

                  diffs
                end

                private

                  # Describes how two unequal element values differ (see #diff).
                  def diff_values(self_value, other_value)
                    # Both sides must be diffable; a nil or plain value on this side
                    # falls through to the simple pair instead of raising.
                    if self_value.respond_to?(:diff) && other_value.respond_to?(:diff)
                      return self_value.diff(other_value)
                    end

                    if self_value.is_a?(Enumerable) && diffable_list?(other_value)
                      mine = self_value.to_a
                      theirs = other_value.to_a
                      return [mine.size, theirs.size] if mine.size != theirs.size

                      if diffable_list?(mine)
                        per_entry = []
                        mine.each_with_index do |val, index|
                          per_entry << val.diff(theirs[index], val.class::ELEMENTS)
                        end
                        return per_entry.reject { |h| h.empty? }
                      end
                    end

                    [other_value, self_value]
                  end

                  # True when +value+ is an Enumerable whose members all respond to diff.
                  def diffable_list?(value)
                    value.is_a?(Enumerable) && value.all? { |v| v.respond_to?(:diff) }
                  end

              end
         
     | 
| 
       89 
     | 
    
         
            -
             
     | 
| 
       90 
     | 
    
         
            -
              module ElementCleaner
                # Cleans every element of the receiver in place, recursing into
                # nested elements.
                #
                # Per the original notes: only tags on the whitelist are allowed; the
                # HTML is always parsed with a parser and all tags that aren't on the
                # list are deleted.
                #
                # Feed elements that can contain HTML:
                # - feed.(title|description)
                # - feed.entries[n].(title|description|content)
                #
                # The including class supplies the SIMPLE_ELEMENTS, HTML_ELEMENTS and
                # BLENDED_ELEMENTS lists; the cleaning itself is done by HtmlCleaner.
                def clean!
                  # Simple elements: the value, or each member of an Array value, is
                  # converted to a string and passed through HtmlCleaner.flatten.
                  self.class::SIMPLE_ELEMENTS.each do |element|
                    current = self.send(element)

                    flattened =
                      if current.is_a?(Array)
                        current.collect { |member| HtmlCleaner.flatten(member.to_s) }
                      else
                        HtmlCleaner.flatten(current.to_s)
                      end

                    send("#{element}=", flattened)
                  end

                  # HTML elements: the stringified value goes through HtmlCleaner.clean.
                  self.class::HTML_ELEMENTS.each do |element|
                    markup = self.send(element).to_s
                    send("#{element}=", HtmlCleaner.clean(markup))
                  end

                  # Blended elements hold objects that are cleaned via their own clean!.
                  self.class::BLENDED_ELEMENTS.each do |element|
                    self.send(element).each { |child| child.clean! }
                  end
                end
              end
         
     | 
| 
       117 
     | 
    
         
            -
             
     | 
| 
       118 
     | 
    
         
            -
              module TimeFix
                # Rebuilds a Time instance via Time.at. The original note explains why:
                # RSS::Parser redefines certain aspects of the Time class, which causes
                # unexpected behaviour when the Time class is extended, as some common
                # third party libraries do.
                # See http://code.google.com/p/feed-normalizer/issues/detail?id=13.
                #
                # Once a Time has been reparsed on this object, every later call
                # returns its argument untouched.
                #
                # NOTE(review): before that point a non-Time argument yields nil rather
                # than the argument itself -- confirm callers rely on this.
                def reparse(obj)
                  @parsed ||= false
                  return obj if @parsed
                  return nil unless obj.is_a?(Time)

                  @parsed = true
                  begin
                    Time.at(obj)
                  rescue StandardError
                    # Fall back to the value we were given if it cannot be rebuilt.
                    obj
                  end
                end
              end
         
     | 
| 
       134 
     | 
    
         
            -
             
     | 
| 
       135 
     | 
    
         
            -
              module RewriteRelativeLinks
                # Turns root-relative href/src attribute values in +text+ (those that
                # start with a single '/') into absolute ones, using the scheme, host
                # and any non-default port taken from +url+.
                #
                # Protocol-relative values ("//host/...") are left alone. When no host
                # can be extracted from +url+, +text+ is returned unchanged; otherwise
                # the result is always a String (nil text becomes "").
                def rewrite_relative_links(text, url)
                  base = url_base(url)
                  return text unless base

                  # (?!\/) keeps "//cdn.example.com/x" from being treated as a path.
                  text.to_s.gsub(/(href|src)=('|")\/(?!\/)/) do
                    "#{Regexp.last_match(1)}=#{Regexp.last_match(2)}#{base}/"
                  end
                end

                private
                  # Host part of +url+, or nil when it cannot be parsed.
                  def url_host(url)
                    URI.parse(url).host
                  rescue StandardError
                    nil
                  end

                  # "scheme://host[:port]" for +url+, or nil when it has no host or
                  # cannot be parsed. The scheme falls back to http when +url+ has
                  # none; the port is included only when it is not the scheme's default.
                  def url_base(url)
                    uri = URI.parse(url)
                    return nil unless uri.host

                    base = "#{uri.scheme || 'http'}://#{uri.host}"
                    base += ":#{uri.port}" if uri.port && uri.port != uri.default_port
                    base
                  rescue StandardError
                    nil
                  end
              end
         
     | 
| 
       149 
     | 
    
         
            -
             
     | 
| 
       150 
     | 
    
         
            -
             
     | 
| 
       151 
     | 
    
         
            -
              # Represents a feed item entry.
         
     | 
| 
       152 
     | 
    
         
            -
              # Available fields are:
         
     | 
| 
       153 
     | 
    
         
            -
              #  * content
         
     | 
| 
       154 
     | 
    
         
            -
              #  * description
         
     | 
| 
       155 
     | 
    
         
            -
              #  * title
         
     | 
| 
       156 
     | 
    
         
            -
              #  * date_published
         
     | 
| 
       157 
     | 
    
         
            -
              #  * urls / url
         
     | 
| 
       158 
     | 
    
         
            -
              #  * id
         
     | 
| 
       159 
     | 
    
         
            -
              #  * authors / author
         
     | 
| 
       160 
     | 
    
         
            -
              #  * copyright
         
     | 
| 
       161 
     | 
    
         
            -
              #  * categories
         
     | 
| 
       162 
     | 
    
         
            -
              class Entry
         
     | 
| 
       163 
     | 
    
         
            -
                include Singular, ElementEquality, ElementCleaner, TimeFix, RewriteRelativeLinks
         
     | 
| 
       164 
     | 
    
         
            -
             
     | 
| 
       165 
     | 
    
         
            -
                HTML_ELEMENTS = [:content, :description, :title]
         
     | 
| 
       166 
     | 
    
         
            -
                SIMPLE_ELEMENTS = [:date_published, :urls, :id, :authors, :copyright, :categories, :last_updated]
         
     | 
| 
       167 
     | 
    
         
            -
                BLENDED_ELEMENTS = []
         
     | 
| 
       168 
     | 
    
         
            -
             
     | 
| 
       169 
     | 
    
         
            -
                ELEMENTS = HTML_ELEMENTS + SIMPLE_ELEMENTS + BLENDED_ELEMENTS
         
     | 
| 
       170 
     | 
    
         
            -
             
     | 
| 
       171 
     | 
    
         
            -
                attr_accessor(*ELEMENTS)
         
     | 
| 
       172 
     | 
    
         
            -
             
     | 
| 
       173 
     | 
    
         
            -
                def initialize
         
     | 
| 
       174 
     | 
    
         
            -
                  @urls = []
         
     | 
| 
       175 
     | 
    
         
            -
                  @authors = []
         
     | 
| 
       176 
     | 
    
         
            -
                  @categories = []
         
     | 
| 
       177 
     | 
    
         
            -
                  @ 
     | 
| 
       178 
     | 
    
         
            -
             
     | 
| 
       179 
     | 
    
         
            -
             
     | 
| 
       180 
     | 
    
         
            -
             
     | 
| 
       181 
     | 
    
         
            -
                 
     | 
| 
       182 
     | 
    
         
            -
             
     | 
| 
       183 
     | 
    
         
            -
             
     | 
| 
       184 
     | 
    
         
            -
             
     | 
| 
       185 
     | 
    
         
            -
             
     | 
| 
       186 
     | 
    
         
            -
                 
     | 
| 
       187 
     | 
    
         
            -
             
     | 
| 
       188 
     | 
    
         
            -
             
     | 
| 
       189 
     | 
    
         
            -
             
     | 
| 
       190 
     | 
    
         
            -
             
     | 
| 
       191 
     | 
    
         
            -
             
     | 
| 
       192 
     | 
    
         
            -
             
     | 
| 
       193 
     | 
    
         
            -
              #  
     | 
| 
       194 
     | 
    
         
            -
              # 
     | 
| 
       195 
     | 
    
         
            -
              #  *  
     | 
| 
       196 
     | 
    
         
            -
              #  *  
     | 
| 
       197 
     | 
    
         
            -
              #  *  
     | 
| 
       198 
     | 
    
         
            -
              #  *  
     | 
| 
       199 
     | 
    
         
            -
              #  *  
     | 
| 
       200 
     | 
    
         
            -
              #  *  
     | 
| 
       201 
     | 
    
         
            -
              #  *  
     | 
| 
       202 
     | 
    
         
            -
              #  *  
     | 
| 
       203 
     | 
    
         
            -
              #  *  
     | 
| 
       204 
     | 
    
         
            -
               
     | 
| 
       205 
     | 
    
         
            -
             
     | 
| 
       206 
     | 
    
         
            -
             
     | 
| 
       207 
     | 
    
         
            -
             
     | 
| 
       208 
     | 
    
         
            -
                 
     | 
| 
       209 
     | 
    
         
            -
             
     | 
| 
       210 
     | 
    
         
            -
             
     | 
| 
       211 
     | 
    
         
            -
                 
     | 
| 
       212 
     | 
    
         
            -
             
     | 
| 
       213 
     | 
    
         
            -
             
     | 
| 
       214 
     | 
    
         
            -
                 
     | 
| 
       215 
     | 
    
         
            -
             
     | 
| 
       216 
     | 
    
         
            -
             
     | 
| 
       217 
     | 
    
         
            -
             
     | 
| 
       218 
     | 
    
         
            -
             
     | 
| 
       219 
     | 
    
         
            -
                attr_accessor( 
     | 
| 
       220 
     | 
    
         
            -
             
     | 
| 
       221 
     | 
    
         
            -
             
     | 
| 
       222 
     | 
    
         
            -
             
     | 
| 
       223 
     | 
    
         
            -
             
     | 
| 
       224 
     | 
    
         
            -
             
     | 
| 
       225 
     | 
    
         
            -
                   
     | 
| 
       226 
     | 
    
         
            -
                  @ 
     | 
| 
       227 
     | 
    
         
            -
                  @ 
     | 
| 
       228 
     | 
    
         
            -
                  @ 
     | 
| 
       229 
     | 
    
         
            -
                  @ 
     | 
| 
       230 
     | 
    
         
            -
                  @ 
     | 
| 
       231 
     | 
    
         
            -
                  @ 
     | 
| 
       232 
     | 
    
         
            -
             
     | 
| 
       233 
     | 
    
         
            -
             
     | 
| 
       234 
     | 
    
         
            -
             
     | 
| 
       235 
     | 
    
         
            -
                 
     | 
| 
       236 
     | 
    
         
            -
             
     | 
| 
       237 
     | 
    
         
            -
             
     | 
| 
       238 
     | 
    
         
            -
             
     | 
| 
       239 
     | 
    
         
            -
             
     | 
| 
       240 
     | 
    
         
            -
             
     | 
| 
       241 
     | 
    
         
            -
             
     | 
| 
       242 
     | 
    
         
            -
             
     | 
| 
       243 
     | 
    
         
            -
             
     | 
| 
       244 
     | 
    
         
            -
             
     | 
| 
      
 1 
     | 
    
         
            +
             
     | 
| 
      
 2 
     | 
    
         
            +
            module FeedNormalizer
         
     | 
| 
      
 3 
     | 
    
         
            +
             
     | 
| 
      
 4 
     | 
    
         
            +
              module Singular

                # If the method being called is a singular (in this simple case, does not
                # end with an 's'), then it calls the plural method and returns its first
                # element. We're assuming that plural methods provide an array.
                #
                # Example:
                # Object contains an array called 'alphas', which looks like [:a, :b, :c].
                # Call object.alpha and :a is returned.
                #
                # Names with no plural counterpart raise NoMethodError as usual.
                def method_missing(name, *args)
                  plural = :"#{name}s"

                  # Only delegate when a plural reader is really defined. The check goes
                  # through Kernel#respond_to? directly because the respond_to? below
                  # answers from ELEMENTS rather than from the methods that actually
                  # exist. Without this guard an unknown name would recurse
                  # (foo -> foos -> fooss ...) until the stack overflows.
                  has_plural = Kernel.instance_method(:respond_to?).bind(self).call(plural, true)
                  return super unless has_plural

                  # Errors raised by the plural reader itself are allowed to propagate
                  # instead of being disguised as a NoMethodError for the singular name.
                  collection = self.send(plural)
                  collection.respond_to?(:first) ? collection.first : super
                end

                # True when +x+ is listed in the including class's ELEMENTS, when its
                # pluralised form (+x+ followed by 's') is listed there, or when the
                # inherited respond_to? says so.
                def respond_to?(x, y=false)
                  self.class::ELEMENTS.include?(x) || self.class::ELEMENTS.include?(:"#{x}s") || super(x, y)
                end

              end
         
     | 
| 
      
 22 
     | 
    
         
            +
             
     | 
| 
      
 23 
     | 
    
         
            +
              module ElementEquality

                # Hash-style equality; delegates to #== so the two always agree.
                def eql?(other)
                  self == (other)
                end

                # Two objects are equal when they are the same object, or when +other+
                # is an instance of exactly the same class and every reader listed in
                # that class's ELEMENTS returns an equal value on both.
                def ==(other)
                  other.equal?(self) ||
                    (other.instance_of?(self.class) &&
                      self.class::ELEMENTS.all?{ |el| self.send(el) == other.send(el)} )
                end

                # Hash code built from the class and the ELEMENTS values, so that
                # objects which are eql? also share a hash code (required for them to
                # behave consistently as Hash keys or in Array#uniq). The value changes
                # if an element is modified afterwards.
                def hash
                  ([self.class] + self.class::ELEMENTS.map { |el| self.send(el) }).hash
                end

                # Returns the differences between this object and +other+ as a hash
                # keyed by element name. Elements that +other+ does not respond to, and
                # elements whose values are equal, are left out.
                #
                # Plain values are reported as a pair, with the other object's value
                # first:
                #
                #  { :title => [other_content, content] }
                #
                # For list elements whose members respond to +diff+ (such as items), an
                # array of hashes shows the diffs on a per-entry basis. Only entries
                # that differ contribute a hash:
                #
                #  { :items => [
                #     {:title => ["A new article title", "An article title"]},
                #     {:title => ["a different title", "one title"]} ]}
                #
                # If the two lists differ in length, the count of each is provided
                # instead, with this object's count first:
                #
                #  { :items => [4,5] }
                #
                # This method can also be useful for human-readable feed comparison if
                # its output is dumped to YAML.
                def diff(other, elements = self.class::ELEMENTS)
                  diffs = {}

                  elements.each do |element|
                    next unless other.respond_to?(element)

                    self_value = self.send(element)
                    other_value = other.send(element)
                    next if self_value == other_value

                    diffs[element] = diff_values(self_value, other_value)
                  end

                  diffs
                end

                private

                  # Describes how two unequal element values differ (see #diff).
                  def diff_values(self_value, other_value)
                    # Both sides must be diffable; a nil or plain value on this side
                    # falls through to the simple pair instead of raising.
                    if self_value.respond_to?(:diff) && other_value.respond_to?(:diff)
                      return self_value.diff(other_value)
                    end

                    if self_value.is_a?(Enumerable) && diffable_list?(other_value)
                      mine = self_value.to_a
                      theirs = other_value.to_a
                      return [mine.size, theirs.size] if mine.size != theirs.size

                      if diffable_list?(mine)
                        per_entry = []
                        mine.each_with_index do |val, index|
                          per_entry << val.diff(theirs[index], val.class::ELEMENTS)
                        end
                        return per_entry.reject { |h| h.empty? }
                      end
                    end

                    [other_value, self_value]
                  end

                  # True when +value+ is an Enumerable whose members all respond to diff.
                  def diffable_list?(value)
                    value.is_a?(Enumerable) && value.all? { |v| v.respond_to?(:diff) }
                  end

              end
         
     | 
| 
      
 89 
     | 
    
         
            +
             
     | 
| 
      
 90 
     | 
    
         
            +
              module ElementCleaner
                # Recursively cleans all elements of the including object in place.
                #
                # The including class must define the constants SIMPLE_ELEMENTS,
                # HTML_ELEMENTS and BLENDED_ELEMENTS (arrays of attribute names), with a
                # reader and writer for every listed name.
                #
                # - SIMPLE_ELEMENTS: each value is converted with to_s and passed through
                #   HtmlCleaner.flatten; Array values are flattened element by element.
                #   Time values are left untouched so that date fields keep their type.
                # - HTML_ELEMENTS: each value is converted with to_s and passed through
                #   HtmlCleaner.clean (expected to strip tags that are not whitelisted;
                #   see HtmlCleaner for the exact rules).
                # - BLENDED_ELEMENTS: each value is a collection whose members are
                #   cleaned recursively via their own clean! method.
                #
                # For feed elements that can contain HTML:
                # - feed.(title|description)
                # - feed.entries[n].(title|description|content)
                #
                def clean!
                  self.class::SIMPLE_ELEMENTS.each do |element|
                    val = send(element)

                    # A Time carries no markup; stringifying it would silently turn
                    # date fields into Strings for every caller after clean!.
                    next if val.is_a?(Time)

                    cleaned = if val.is_a?(Array)
                      val.map { |v| HtmlCleaner.flatten(v.to_s) }
                    else
                      HtmlCleaner.flatten(val.to_s)
                    end

                    send("#{element}=", cleaned)
                  end

                  self.class::HTML_ELEMENTS.each do |element|
                    send("#{element}=", HtmlCleaner.clean(send(element).to_s))
                  end

                  self.class::BLENDED_ELEMENTS.each do |element|
                    # Only the side effect matters here, so iterate with each.
                    send(element).each { |v| v.clean! }
                  end
                end
              end
         
     | 
| 
      
 117 
     | 
    
         
            +
             
     | 
| 
      
 118 
     | 
    
         
            +
              module TimeFix
                # Rebuilds a Time instance through Time.at, working around
                # RSS::Parser's redefinition of parts of the Time class, which causes
                # unexpected behaviour when other libraries extend Time.
                # See http://code.google.com/p/feed-normalizer/issues/detail?id=13.
                #
                # The rebuild happens at most once per including object; this is
                # tracked in the @parsed instance variable:
                # - once @parsed is set, obj is returned unchanged, whatever it is;
                # - before that, a Time is rebuilt (falling back to obj itself if
                #   Time.at raises) and @parsed is set;
                # - before that, anything that is not a Time yields nil.
                def reparse(obj)
                  @parsed ||= false
                  return obj if @parsed
                  return nil unless obj.is_a?(Time)

                  @parsed = true
                  begin
                    Time.at(obj)
                  rescue StandardError
                    obj
                  end
                end
              end
         
     | 
| 
      
 134 
     | 
    
         
            +
             
     | 
| 
      
 135 
     | 
    
         
            +
              module RewriteRelativeLinks
                # Turns root-relative href/src attribute values in text into absolute
                # ones, based on the scheme, host and port of url.
                #
                # text - the markup to rewrite; converted with to_s when a base can be
                #        derived from url.
                # url  - the URL the links are relative to.
                #
                # Returns the rewritten String, or text unchanged (possibly nil) when
                # no host can be extracted from url.
                #
                # Only values starting with a single '/' are rewritten. Protocol-relative
                # values ('//host/path') are already host-qualified and are left alone.
                def rewrite_relative_links(text, url)
                  if base = url_base(url)
                    text.to_s.gsub(/(href|src)=('|")\/(?!\/)/, '\1=\2' + base + '/')
                  else
                    text
                  end
                end

                private

                # Returns the host part of url, or nil if it cannot be parsed.
                def url_host(url)
                  URI.parse(url).host rescue nil
                end

                # Returns "scheme://host[:port]" for url, or nil when url has no host
                # or cannot be parsed. https is preserved and a non-default port is
                # kept; any scheme other than http/https falls back to plain
                # "http://host", matching the previous behaviour.
                def url_base(url)
                  uri = URI.parse(url)
                  host = uri.host
                  return nil unless host

                  scheme = uri.scheme.to_s.downcase
                  return "http://#{host}" unless %w[http https].include?(scheme)

                  base = "#{scheme}://#{host}"
                  port = uri.port
                  base += ":#{port}" if port && port != uri.default_port
                  base
                rescue StandardError
                  nil
                end
              end
         
     | 
| 
      
 149 
     | 
    
         
            +
             
     | 
| 
      
 150 
     | 
    
         
            +
             
     | 
| 
      
 151 
     | 
    
         
            +
              # Represents a single entry (item) of a feed.
              # Available fields are:
              #  * content
              #  * description
              #  * title
              #  * date_published
              #  * urls / url
              #  * id
              #  * authors / author
              #  * copyright
              #  * categories
              #  * last_updated
              #  * enclosures
              class Entry
                include Singular, ElementEquality, ElementCleaner, TimeFix, RewriteRelativeLinks

                # Fields cleaned with HtmlCleaner.clean by ElementCleaner#clean!.
                HTML_ELEMENTS = [:content, :description, :title]

                # Fields flattened with HtmlCleaner.flatten by ElementCleaner#clean!.
                SIMPLE_ELEMENTS = [:date_published, :urls, :id, :authors, :copyright, :categories, :last_updated, :enclosures]

                # Fields holding nested objects that are cleaned recursively (none here).
                BLENDED_ELEMENTS = []

                ELEMENTS = HTML_ELEMENTS + SIMPLE_ELEMENTS + BLENDED_ELEMENTS

                attr_accessor(*ELEMENTS)

                # Starts every collection field as an empty array and the two fields
                # with custom readers as nil.
                def initialize
                  @urls = []
                  @authors = []
                  @categories = []
                  @enclosures = []
                  @date_published = nil
                  @content = nil
                end

                # Replace the generated reader: the stored value is run through
                # TimeFix#reparse and the result is written back before being returned.
                undef date_published
                def date_published
                  @date_published = reparse(@date_published)
                end

                # Replace the generated reader: root-relative links in the content are
                # rewritten against this entry's url, and the result is stored back.
                undef content
                def content
                  @content = rewrite_relative_links(@content, url)
                end

              end
         
     | 
| 
      
 192 
     | 
    
         
            +
             
     | 
| 
      
 193 
     | 
    
         
            +
              # Represents the root element of a feed.
              # Available fields are:
              #  * title
              #  * description
              #  * id
              #  * last_updated
              #  * copyright
              #  * authors / author
              #  * urls / url
              #  * image
              #  * generator
              #  * ttl
              #  * skip_hours
              #  * skip_days
              #  * items / entries
              #  * parser
              # The channel method returns the feed itself.
              class Feed
                include Singular, ElementEquality, ElementCleaner, TimeFix

                # Elements that can contain HTML fragments.
                HTML_ELEMENTS = [:title, :description]

                # Elements that contain 'plain' Strings, with HTML escaped.
                SIMPLE_ELEMENTS = [:id, :last_updated, :copyright, :authors, :urls, :image, :generator, :ttl, :skip_hours, :skip_days]

                # Elements that contain both HTML and escaped HTML.
                BLENDED_ELEMENTS = [:items]

                ELEMENTS = HTML_ELEMENTS + SIMPLE_ELEMENTS + BLENDED_ELEMENTS

                attr_accessor(*ELEMENTS)
                attr_accessor :parser

                # entries is another name for the items reader.
                alias_method :entries, :items

                # wrapper - an object responding to parser; its to_s form is stored
                #           in @parser.
                def initialize(wrapper)
                  # Collection fields start out as empty arrays.
                  @urls = []
                  @authors = []
                  @skip_hours = []
                  @skip_days = []
                  @items = []
                  @parser = wrapper.parser.to_s
                  @last_updated = nil
                end

                # Replace the generated reader: the stored value is run through
                # TimeFix#reparse and the result is written back before being returned.
                undef last_updated
                def last_updated
                  @last_updated = reparse(@last_updated)
                end

                # Returns the feed itself.
                def channel
                  self
                end

              end
         
     | 
| 
      
 243 
     | 
    
         
            +
             
     | 
| 
      
 244 
     | 
    
         
            +
            end
         
     | 
| 
      
 245 
     | 
    
         
            +
             
     |