fblee-feedzirra 0.0.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,81 @@
1
module Feedzirra
  # Helpers mixed into parsed feed entries: lenient date parsing, the
  # +published+/+updated+ writers, an +id+ fallback and in-place sanitizing.
  #
  # The including class is expected to provide +title+, +author+, +summary+
  # and +content+ readers (used by #sanitize!).
  module FeedEntryUtilities
    # Fallback pattern for dates the standard parser rejects, for example
    # "<garbage>, 1 <garbage> 2009 14:26 -0400". Captures, in order: day of
    # month, year, hours, minutes, optional seconds, and the timezone (a
    # numeric offset or a three-letter name). The month is not captured
    # because it is usually the unreadable part.
    FALLBACK_DATE_MATCHER = /.+?\s+(\d?\d)\s+.+?(\d\d\d\d)\s+(\d?\d):(\d\d):?(\d?\d)?\s+([-+]\d\d\d\d|\w\w\w)/

    ##
    # Returns the publication time, or the update time when no publication
    # time has been recorded.
    def published
      @published || @updated
    end

    ##
    # Parses +string+ into a GMT time (via +feed_utils_to_gm_time+, which is
    # defined outside this file).
    #
    # When the regular parse fails, the string is matched against
    # FALLBACK_DATE_MATCHER and a date is reconstructed from the readable
    # pieces. Returns nil when nothing usable can be extracted.
    def parse_datetime(string)
      DateTime.parse(string).feed_utils_to_gm_time
    rescue StandardError
      # The date was not in an expected format. Publishers often use bizarre
      # character encodings and/or foreign languages in day or month names,
      # so ignore the bogus parts and try to extract what is parseable.
      salvage_datetime(string)
    end

    ##
    # Returns the id of the entry, or its url if no id is present, as some
    # formats don't support it.
    def id
      @id || @url
    end

    ##
    # Writer for published. By default, we keep the "oldest" publish time
    # found. Values that cannot be parsed are ignored instead of raising.
    def published=(val)
      parsed = parse_datetime(val)
      @published = parsed if parsed && (!@published || parsed < @published)
    end

    ##
    # Writer for updated. By default, we keep the most recent update time
    # found. Values that cannot be parsed are ignored instead of raising.
    def updated=(val)
      parsed = parse_datetime(val)
      @updated = parsed if parsed && (!@updated || parsed > @updated)
    end

    ##
    # Sanitizes title, author, summary and content in place, skipping any
    # that are nil.
    def sanitize!
      self.title.sanitize! if self.title
      self.author.sanitize! if self.author
      self.summary.sanitize! if self.summary
      self.content.sanitize! if self.content
    end

    alias_method :last_modified, :published

    private

    # Rebuilds a date from the readable parts of an otherwise unparseable
    # string. Returns the GMT time, or nil when the string does not match
    # the fallback pattern or the rebuilt date is still invalid.
    def salvage_datetime(string)
      processed_date = FALLBACK_DATE_MATCHER.match(string)
      if processed_date.nil?
        puts "DATE CAN'T BE PARSED: #{string}"
        return nil
      end

      day_number = processed_date[1].to_i
      today = DateTime.now

      # Guess the month: a day number later than today's cannot belong to the
      # current month yet, so assume the previous month; otherwise assume the
      # current month.
      reference = day_number > today.day ? (today << 1) : today
      month = reference.month

      # Sanitize future years. Clamping against the guessed month's year
      # (rather than today's) keeps a January guess of "previous month" from
      # producing a December date that is still in the future.
      year = processed_date[2].to_i
      year = reference.year if year > reference.year

      hours = processed_date[3].to_i
      minutes = processed_date[4].to_i
      seconds = processed_date[5] || "00"
      timezone_indicator = processed_date[6]

      parsed_datetime_string = "#{year}-#{month}-#{day_number} #{hours}:#{minutes}:#{seconds} #{timezone_indicator}"
      begin
        correct_date = DateTime.parse(parsed_datetime_string)
        gmt_date = correct_date.feed_utils_to_gm_time
      rescue StandardError
        # e.g. day 31 combined with a guessed 30-day month
        puts "DATE CAN'T BE PARSED: #{string}"
        return nil
      end

      puts "Correctly sanitized a date string after initial parse failed. Went from: [#{string}] to [#{gmt_date.to_s}] via [#{correct_date.to_s}]"
      gmt_date
    end
  end
end
@@ -0,0 +1,71 @@
1
module Feedzirra
  # Feed-level helpers mixed into the parser classes: tracking of new
  # entries, merging a freshly fetched feed into an existing one, and
  # bulk sanitizing.
  #
  # The including class must provide +entries+ plus readers and writers for
  # the attributes listed in UPDATABLE_ATTRIBUTES.
  module FeedUtilities
    # Attributes copied from the fetched feed by #update_from_feed.
    UPDATABLE_ATTRIBUTES = %w(title feed_url url last_modified)

    attr_writer :new_entries, :updated, :last_modified
    attr_accessor :etag

    # Returns the explicitly assigned value if any, otherwise the most recent
    # +published+ value among the entries (nil when no entry has one). The
    # result is memoized.
    def last_modified
      @last_modified ||= entries.map { |entry| entry.published }.compact.max
    end

    # Truthy once #update_from_feed has changed at least one attribute.
    def updated?
      @updated
    end

    # Entries added by #update_from_feed; an empty array by default.
    def new_entries
      @new_entries ||= []
    end

    def has_new_entries?
      new_entries.size > 0
    end

    # Merges +feed+ (a freshly parsed copy of this feed) into the receiver.
    # Entries not seen before are appended to #new_entries and prepended to
    # #entries, then every updatable attribute that differs is copied over.
    def update_from_feed(feed)
      fresh_entries = find_new_entries_for(feed)
      self.new_entries += fresh_entries
      # Prepend only what this call found, so repeated updates do not
      # re-insert entries collected by earlier calls.
      self.entries.unshift(*fresh_entries)

      # Use map rather than any? with the side-effecting block: any? stops at
      # the first change and would leave the remaining attributes stale.
      changes = UPDATABLE_ATTRIBUTES.map { |name| update_attribute(feed, name) }
      updated! if changes.any?
    end

    # Copies attribute +name+ from +feed+ when it differs from the receiver's
    # value. Returns true when a value was copied (including a change to
    # nil), false otherwise.
    def update_attribute(feed, name)
      old_value, new_value = send(name), feed.send(name)
      return false if old_value == new_value

      send("#{name}=", new_value)
      true
    end

    # Calls +sanitize!+ on every entry.
    def sanitize_entries!
      entries.each { |entry| entry.sanitize! }
    end

    private

    def updated!
      @updated = true
    end

    def find_new_entries_for(feed)
      # This implementation is a hack, which is why it's so ugly.
      # It's to get around the fact that not all feeds have a published date.
      # However, they're always ordered with the newest one first.
      # So we go through the entries just parsed and collect each one as a new
      # entry until we get to one that has the same url as the newest entry
      # already known for this feed.
      latest_entry = self.entries.first
      # Nothing known yet: every parsed entry is new.
      return feed.entries.dup if latest_entry.nil?

      feed.entries.take_while { |entry| entry.url != latest_entry.url }
    end

    def existing_entry?(test_entry)
      entries.any? { |entry| entry.url == test_entry.url }
    end
  end
end
@@ -0,0 +1,36 @@
1
module Feedzirra

  module Parser
    # == Summary
    # SAXMachine-based parser for Atom feeds.
    #
    # == Attributes
    # * title
    # * feed_url
    # * url
    # * description (read from the +subtitle+ element)
    # * links (href of every +link+ element)
    # * entries (AtomEntry objects)
    class Atom
      include SAXMachine
      include FeedUtilities

      element  :title
      element  :link, :as => :url, :value => :href, :with => {:type => "text/html"}
      element  :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
      elements :link, :as => :links, :value => :href
      element  :subtitle, :as => :description
      elements :entry, :as => :entries, :class => AtomEntry

      # True-ish when +xml+ mentions "Atom" or one of the known Atom
      # namespace URIs.
      def self.able_to_parse?(xml) #:nodoc:
        xml =~ /(Atom)|(#{Regexp.escape("http://purl.org/atom")})|(#{Regexp.escape("http://www.w3.org/2005/Atom")})/
      end

      # The explicitly matched url, or else the last collected link.
      def url
        @url || links.last
      end

      # The explicitly matched feed url, or else the first collected link.
      def feed_url
        @feed_url || links.first
      end
    end
  end

end
@@ -0,0 +1,41 @@
1
module Feedzirra

  module Parser
    # == Summary
    # SAXMachine-based parser for the entries of an Atom feed.
    #
    # == Attributes
    # * title
    # * url
    # * author
    # * content
    # * summary
    # * published
    # * categories
    class AtomEntry
      include SAXMachine
      include FeedEntryUtilities

      element  :title
      element  :link, :as => :url, :value => :href, :with => {:type => "text/html", :rel => "alternate"}
      element  :name, :as => :author
      element  :guid
      element  :content
      element  :summary
      element  :published
      element  :id
      element  :created, :as => :published
      element  :"media:content", :as => :image, :value => :url
      element  :issued, :as => :published
      element  :updated
      element  :modified, :as => :updated
      elements :category, :as => :categories, :value => :term
      elements :link, :as => :links, :value => :href

      # The explicitly matched url, or else the first collected link.
      def url
        @url || links.first
      end
    end

  end

end
@@ -0,0 +1,28 @@
1
module Feedzirra

  module Parser
    # == Summary
    # SAXMachine-based parser for Feedburner Atom feeds.
    #
    # == Attributes
    # * title
    # * feed_url
    # * url
    # * description (read from the +subtitle+ element)
    # * entries (AtomFeedBurnerEntry objects)
    class AtomFeedBurner
      include SAXMachine
      include FeedUtilities

      element  :title
      element  :link, :as => :url, :value => :href, :with => {:type => "text/html"}
      element  :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
      element  :subtitle, :as => :description
      elements :entry, :as => :entries, :class => AtomFeedBurnerEntry

      # Truthy only when +xml+ mentions both "Atom" and "feedburner";
      # otherwise false.
      def self.able_to_parse?(xml) #:nodoc:
        (xml =~ /Atom/ && xml =~ /feedburner/) || false
      end
    end

  end

end
@@ -0,0 +1,37 @@
1
module Feedzirra

  module Parser
    # == Summary
    # SAXMachine-based parser for the entries of a Feedburner Atom feed.
    #
    # == Attributes
    # * title
    # * url
    # * author
    # * content
    # * summary
    # * published
    # * categories
    class AtomFeedBurnerEntry
      include SAXMachine
      include FeedEntryUtilities

      element  :title
      element  :name, :as => :author
      # Both declarations below feed +url+.
      element  :link, :as => :url, :value => :href, :with => {:type => "text/html", :rel => "alternate"}
      element  :"feedburner:origLink", :as => :url
      element  :summary
      element  :guid
      element  :content
      element  :published
      element  :id
      element  :issued, :as => :published
      element  :created, :as => :published
      element  :"media:content", :as => :image, :value => :url
      element  :updated
      element  :modified, :as => :updated
      elements :category, :as => :categories, :value => :term
    end

  end

end
@@ -0,0 +1,50 @@
1
module Feedzirra

  module Parser
    # Parser for iTunes podcast feeds: RSS 2.0 plus some Apple extensions.
    # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
    class ITunesRSS
      include SAXMachine
      include FeedUtilities

      attr_accessor :feed_url

      # Plain RSS 2.0 channel elements.
      element :copyright
      element :description
      element :language
      element :managingEditor
      element :title
      element :link, :as => :url

      # iTunes channel extensions.
      # If author is not present use managingEditor on the channel.
      element :"itunes:author", :as => :itunes_author
      element :"itunes:block", :as => :itunes_block
      element :"itunes:image", :value => :href, :as => :itunes_image
      element :"itunes:explicit", :as => :itunes_explicit
      element :"itunes:keywords", :as => :itunes_keywords
      # New URL for the podcast feed.
      element :"itunes:new-feed-url", :as => :itunes_new_feed_url
      element :"itunes:subtitle", :as => :itunes_subtitle
      # If summary is not present, use the description tag.
      element :"itunes:summary", :as => :itunes_summary

      # iTunes RSS feeds can have multiple main categories, each with
      # multiple sub-categories.
      # TODO: subcategories are not supported correctly - they end up at the
      # same level as the main categories.
      elements :"itunes:category", :as => :itunes_categories, :value => :text

      elements :"itunes:owner", :as => :itunes_owners, :class => ITunesRSSOwner

      elements :item, :as => :entries, :class => ITunesRSSItem

      # True-ish when +xml+ declares the iTunes podcast namespace
      # (double-quoted attribute form only).
      def self.able_to_parse?(xml)
        xml =~ /xmlns:itunes=\"http:\/\/www.itunes.com\/dtds\/podcast-1.0.dtd\"/
      end

    end

  end

end
@@ -0,0 +1,31 @@
1
module Feedzirra

  module Parser
    # iTunes extensions to the standard RSS 2.0 item.
    # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
    #
    # NOTE(review): this item class mixes in FeedUtilities (the feed-level
    # helpers), whereas AtomEntry and RSSEntry mix in FeedEntryUtilities.
    # Confirm whether that is intended before relying on +sanitize!+ or on
    # parsed +published+ values here.
    class ITunesRSSItem
      include SAXMachine
      include FeedUtilities

      # Plain RSS 2.0 item elements.
      element :author
      element :guid
      element :title
      element :link, :as => :url
      element :description, :as => :summary
      element :pubDate, :as => :published

      # iTunes item extensions.
      # If author is not present use author tag on the item.
      element :"itunes:author", :as => :itunes_author
      element :"itunes:block", :as => :itunes_block
      element :"itunes:duration", :as => :itunes_duration
      element :"itunes:explicit", :as => :itunes_explicit
      element :"itunes:keywords", :as => :itunes_keywords
      element :"itunes:subtitle", :as => :itunes_subtitle
      # If summary is not present, use the description tag.
      element :"itunes:summary", :as => :itunes_summary

      # Attributes of the +enclosure+ element, one accessor each.
      element :enclosure, :value => :length, :as => :enclosure_length
      element :enclosure, :value => :type, :as => :enclosure_type
      element :enclosure, :value => :url, :as => :enclosure_url
    end
  end

end
@@ -0,0 +1,12 @@
1
module Feedzirra

  module Parser
    # Owner of an iTunes podcast feed (the +itunes:owner+ element), exposing
    # +name+ and +email+.
    #
    # NOTE(review): mixes in the feed-level FeedUtilities helpers; confirm
    # whether an owner record actually needs them.
    class ITunesRSSOwner
      include SAXMachine
      include FeedUtilities

      element :"itunes:name", :as => :name
      element :"itunes:email", :as => :email
    end
  end

end
@@ -0,0 +1,36 @@
1
module Feedzirra

  module Parser
    # == Summary
    # SAXMachine-based parser for RSS feeds.
    #
    # == Attributes
    # * title
    # * feed_url
    # * url
    # * description
    # * entries (RSSEntry objects)
    class RSS
      include SAXMachine
      include FeedUtilities

      attr_accessor :feed_url

      element  :title
      element  :link, :as => :url
      elements :item, :as => :entries, :class => RSSEntry

      # Parse both the subtitle and the description, so we can use whichever
      # one the feed provides.
      element  :subtitle
      element  :description, :as => :feed_description

      # The feed's description element, or else its subtitle.
      def description
        self.feed_description || self.subtitle
      end

      # True-ish when +xml+ contains an opening +rss+ or +rdf+ tag.
      def self.able_to_parse?(xml) #:nodoc:
        xml =~ /\<rss|\<rdf/
      end
    end

  end

end
@@ -0,0 +1,45 @@
1
module Feedzirra

  module Parser
    # == Summary
    # SAXMachine-based parser for RSS/RDF feed entries.
    #
    # == Attributes
    # * title
    # * url
    # * author
    # * content
    # * summary
    # * published
    # * categories
    class RSSEntry
      include SAXMachine
      include FeedEntryUtilities

      element  :title
      element  :link, :as => :url

      element  :"dc:creator", :as => :author
      element  :author, :as => :author
      element  :"content:encoded", :as => :content
      element  :description, :as => :summary

      # Several element spellings all feed +published+.
      element  :pubDate, :as => :published
      element  :pubdate, :as => :published
      element  :"dc:date", :as => :published
      element  :"dc:Date", :as => :published
      element  :"dcterms:created", :as => :published

      element  :"dcterms:modified", :as => :updated
      element  :issued, :as => :published
      elements :category, :as => :categories

      element  :guid

      # TODO: sometimes type="image/jpeg", sometimes medium="image" - decide
      # how to tell image media apart.
      element  :"media:content", :as => :image, :value => :url
    end

  end

end
@@ -0,0 +1,8 @@
1
module Feedzirra
  # Minimal SAXMachine document that reads a page's title and the href of a
  # +link+ element declared with rel="alternate" and
  # type="application/rss+xml" (presumably for feed auto-discovery - confirm
  # against callers).
  class WebPage
    include SAXMachine
    include FeedUtilities

    # Not essential; helpful for debugging.
    element :title
    element :link, :as => :feed_url, :value => :href, :with => {:rel => 'alternate', :type => "application/rss+xml"}
  end
end
data/lib/feedzirra.rb ADDED
@@ -0,0 +1,35 @@
1
+ $LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))
2
+
3
+ gem 'activesupport'
4
+
5
+ require 'zlib'
6
+ require 'curb_core'
7
+ require 'sax-machine'
8
+ require 'dryopteris'
9
+ require 'uri'
10
+ require 'active_support/basic_object'
11
+ require 'active_support/core_ext/object'
12
+ require 'active_support/core_ext/time'
13
+
14
+ require 'core_ext/date'
15
+ require 'core_ext/string'
16
+
17
+ require 'feedzirra/feed_utilities'
18
+ require 'feedzirra/feed_entry_utilities'
19
+ require 'feedzirra/web_page'
20
+ require 'feedzirra/feed'
21
+
22
+ require 'feedzirra/parser/rss_entry'
23
+ require 'feedzirra/parser/itunes_rss_owner'
24
+ require 'feedzirra/parser/itunes_rss_item'
25
+ require 'feedzirra/parser/atom_entry'
26
+ require 'feedzirra/parser/atom_feed_burner_entry'
27
+
28
+ require 'feedzirra/parser/rss'
29
+ require 'feedzirra/parser/itunes_rss'
30
+ require 'feedzirra/parser/atom'
31
+ require 'feedzirra/parser/atom_feed_burner'
32
+
33
# Top-level namespace of the gem.
module Feedzirra
  # Released version of the gem.
  VERSION = "0.0.17"
end
@@ -0,0 +1,62 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
# Specs for the entry-level helpers. The examples mix FeedEntryUtilities into
# an anonymous class, so that is the module described here.
describe Feedzirra::FeedEntryUtilities do
  before(:each) do
    @klass = Class.new do
      include Feedzirra::FeedEntryUtilities
    end
  end

  describe "handling dates" do
    it "should parse an ISO 8601 formatted datetime into Time" do
      time = @klass.new.parse_datetime("2008-02-20T8:05:00-010:00")
      time.class.should == Time
      time.to_s.should == "Wed Feb 20 18:05:00 UTC 2008"
    end

    # NOTE(review): the fallback parser guesses the month (and clamps the
    # year) from the current date, so the expectations below appear to hold
    # only when the suite runs in January 2010 - confirm, and consider
    # pinning the clock.
    it "should parse a bunch of strangely encoded stuff into Time" do
      time = @klass.new.parse_datetime("Mon, 4 Jan 7010 13:51:39 EST")
      time.class.should == Time
      time.to_s.should == "Mon Jan 04 18:51:39 UTC 2010"

      time = @klass.new.parse_datetime("���, 5 1��� 2010 10:37 -0500")
      time.class.should == Time
      time.to_s.should == "Tue Jan 05 15:37:00 UTC 2010"
    end
  end

  describe "sanitizing" do
    before(:each) do
      @feed = Feedzirra::Feed.parse(sample_atom_feed)
      @entry = @feed.entries.first
    end

    it "should provide a sanitized title" do
      new_title = "<script>" + @entry.title
      @entry.title = new_title
      @entry.title.sanitize.should == Dryopteris.sanitize(new_title)
    end

    it "should sanitize content in place" do
      new_content = "<script>" + @entry.content
      @entry.content = new_content.dup
      @entry.content.sanitize!.should == Dryopteris.sanitize(new_content)
      @entry.content.should == Dryopteris.sanitize(new_content)
    end

    it "should sanitize things in place" do
      @entry.title += "<script>"
      @entry.author += "<script>"
      @entry.content += "<script>"

      cleaned_title = Dryopteris.sanitize(@entry.title)
      cleaned_author = Dryopteris.sanitize(@entry.author)
      cleaned_content = Dryopteris.sanitize(@entry.content)

      @entry.sanitize!
      @entry.title.should == cleaned_title
      @entry.author.should == cleaned_author
      @entry.content.should == cleaned_content
    end
  end
end