fblee-feedzirra 0.0.17

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,81 @@
1
module Feedzirra
  # Helpers mixed into feed entry classes: tolerant date parsing used by the
  # published/updated writers, an id fallback, and in-place sanitizing of the
  # entry's text fields.
  module FeedEntryUtilities
    # Fallback pattern for date strings that DateTime.parse rejects, such as
    #   ???, 1 ??? 2009 14:26 -0400
    # Captures: 1 = day of month, 2 = year, 3 = hours, 4 = minutes,
    # 5 = optional seconds, 6 = numeric offset or three-letter zone name.
    # Whatever sits where the day name and month name would be is skipped.
    LENIENT_DATE_PATTERN = /.+?\s+(\d?\d)\s+.+?(\d\d\d\d)\s+(\d?\d):(\d\d):?(\d?\d)?\s+([-+]\d\d\d\d|\w\w\w)/

    # Returns the published time, falling back to the updated time.
    def published
      @published || @updated
    end

    # Parses +string+ and converts it with +feed_utils_to_gm_time+.
    #
    # When the regular parse raises, a lenient second attempt extracts the
    # day, year, time and zone with LENIENT_DATE_PATTERN and guesses the month.
    # Returns nil when neither attempt yields a usable date.
    def parse_datetime(string)
      DateTime.parse(string).feed_utils_to_gm_time
    rescue
      # This means the date was not in an expected format. Often publishers use
      # bizarre character encodings and/or foreign languages in day or month
      # names, so we ignore the bogus parts and use what is parseable.
      parse_datetime_leniently(string)
    end

    ##
    # Returns the id of the entry or its url if no id is present, as some formats don't support it
    def id
      @id || @url
    end

    ##
    # Writer for published. By default, we keep the "oldest" publish time found.
    # A value that cannot be parsed is ignored instead of being compared.
    def published=(val)
      parsed = parse_datetime(val)
      return if parsed.nil?
      @published = parsed if !@published || parsed < @published
    end

    ##
    # Writer for updated. By default, we keep the most recent update time found.
    # A value that cannot be parsed is ignored instead of being compared.
    def updated=(val)
      parsed = parse_datetime(val)
      return if parsed.nil?
      @updated = parsed if !@updated || parsed > @updated
    end

    # Sanitizes title, author, summary and content in place, skipping any that
    # are nil.
    def sanitize!
      self.title.sanitize! if self.title
      self.author.sanitize! if self.author
      self.summary.sanitize! if self.summary
      self.content.sanitize! if self.content
    end

    alias_method :last_modified, :published

    private

    # Second-chance parser: rebuilds a date string from the numeric parts of
    # +string+ plus a guessed month. Returns nil (after logging) when the
    # pattern does not match or the rebuilt date is not a valid date.
    def parse_datetime_leniently(string)
      processed_date = LENIENT_DATE_PATTERN.match(string)
      if processed_date.nil?
        puts "DATE CAN'T BE PARSED: #{string}"
        return nil
      end

      day_number = processed_date[1].to_i
      year, month = guess_year_and_month(day_number, processed_date[2].to_i, DateTime.now)

      hours = processed_date[3].to_i
      minutes = processed_date[4].to_i
      seconds = processed_date[5] || "00"
      timezone_indicator = processed_date[6]

      parsed_datetime_string = "#{year}-#{month}-#{day_number} #{hours}:#{minutes}:#{seconds} #{timezone_indicator}"
      correct_date = DateTime.parse(parsed_datetime_string)
      gmt_date = correct_date.feed_utils_to_gm_time
      puts "Correctly sanitized a date string after initial parse failed. Went from: [#{string}] to [#{gmt_date.to_s}] via [#{correct_date.to_s}]"
      gmt_date
    rescue ArgumentError
      # The guessed month may not contain the given day (e.g. day 31 in a
      # 30-day month); treat that like any other unparseable date.
      puts "DATE CAN'T BE PARSED: #{string}"
      nil
    end

    # Picks the year and month for a date whose month name was unreadable.
    #
    # A year in the future is clamped to the current year. The month is the
    # current month unless the day number is later than today's day, in which
    # case the previous month is used. If that step crosses into the previous
    # year, the year is lowered as well so the result is not in the future.
    def guess_year_and_month(day_number, year, today)
      year = today.year if year > today.year
      return [year, today.month] unless day_number > today.day

      previous = today << 1
      [[year, previous.year].min, previous.month]
    end
  end
end
@@ -0,0 +1,71 @@
1
module Feedzirra
  # Helpers mixed into feed classes: tracking of new entries, last-modified
  # calculation, and updating a feed from a freshly parsed copy.
  #
  # The including class is expected to provide +entries+ plus readers and
  # writers for the names in UPDATABLE_ATTRIBUTES.
  module FeedUtilities
    # Attributes copied from the fresh feed in update_from_feed.
    UPDATABLE_ATTRIBUTES = %w(title feed_url url last_modified).freeze

    attr_writer :new_entries, :updated, :last_modified
    attr_accessor :etag

    # Returns the explicitly set last-modified value, or else the latest
    # +published+ value among the entries (nil when no entry has one).
    def last_modified
      @last_modified ||= entries.map { |entry| entry.published }.compact.max
    end

    # Truthy once update_from_feed has changed at least one attribute.
    def updated?
      @updated
    end

    # Entries collected by update_from_feed; an empty array by default.
    def new_entries
      @new_entries ||= []
    end

    def has_new_entries?
      !new_entries.empty?
    end

    # Merges a freshly parsed +feed+ into this one.
    #
    # Entries not seen before are appended to new_entries and put at the front
    # of entries. Only the entries found by this call are added to entries, so
    # calling this repeatedly does not insert earlier new entries again.
    # Every updatable attribute that differs is copied, and the feed is
    # flagged as updated when at least one of them changed.
    def update_from_feed(feed)
      fresh_entries = find_new_entries_for(feed)
      self.new_entries += fresh_entries
      entries.unshift(*fresh_entries)

      # select (not any?) so that every attribute is visited even after the
      # first change has been detected.
      changed = UPDATABLE_ATTRIBUTES.select { |name| update_attribute(feed, name) }
      updated! unless changed.empty?
    end

    # Copies attribute +name+ from +feed+ when it differs from our value.
    # Returns true when a change was made (even a change to nil), else false.
    def update_attribute(feed, name)
      old_value, new_value = send(name), feed.send(name)
      return false if old_value == new_value

      send("#{name}=", new_value)
      true
    end

    def sanitize_entries!
      entries.each { |entry| entry.sanitize! }
    end

    private

    def updated!
      @updated = true
    end

    # Not all feeds have a published date, but the code assumes they list the
    # newest entry first. So every entry of +feed+ counts as new until we reach
    # the one whose url equals the url of our current newest entry.
    # When we have no entries yet, all of the feed's entries are new.
    def find_new_entries_for(feed)
      latest_entry = entries.first
      return feed.entries.dup if latest_entry.nil?

      feed.entries.take_while { |entry| entry.url != latest_entry.url }
    end

    def existing_entry?(test_entry)
      entries.any? { |entry| entry.url == test_entry.url }
    end
  end
end
@@ -0,0 +1,36 @@
1
module Feedzirra

  module Parser
    # == Summary
    # Parser for Atom feeds, declared with the SAXMachine DSL.
    #
    # == Attributes
    # * title
    # * feed_url
    # * url
    # * entries
    class Atom
      include SAXMachine
      include FeedUtilities

      element :title
      element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
      element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
      elements :link, :as => :links, :value => :href
      element :subtitle, :as => :description
      elements :entry, :as => :entries, :class => AtomEntry

      class << self
        # Truthy when +xml+ mentions "Atom" or one of the two Atom namespace URIs.
        def able_to_parse?(xml) #:nodoc:
          xml =~ /(Atom)|(#{Regexp.escape("http://purl.org/atom")})|(#{Regexp.escape("http://www.w3.org/2005/Atom")})/
        end
      end

      # The url captured from the text/html link, otherwise the last collected link.
      def url
        return @url if @url
        links.last
      end

      # The url captured from the application/atom+xml link, otherwise the
      # first collected link.
      def feed_url
        return @feed_url if @feed_url
        links.first
      end
    end
  end

end
@@ -0,0 +1,41 @@
1
module Feedzirra

  module Parser
    # == Summary
    # Parser for the entries of an Atom feed, declared with the SAXMachine DSL.
    #
    # == Attributes
    # * title
    # * url
    # * author
    # * content
    # * summary
    # * published
    # * categories
    class AtomEntry
      include SAXMachine
      include FeedEntryUtilities

      element :title
      element :link, :as => :url, :value => :href, :with => {:type => "text/html", :rel => "alternate"}
      element :name, :as => :author
      element :guid
      element :content
      element :summary

      # published, created and issued all feed the published writer.
      element :published
      element :id
      element :created, :as => :published
      element :"media:content", :as => :image, :value => :url
      element :issued, :as => :published

      # updated and modified both feed the updated writer.
      element :updated
      element :modified, :as => :updated

      elements :category, :as => :categories, :value => :term
      elements :link, :as => :links, :value => :href

      # The url captured from the alternate text/html link, otherwise the
      # first collected link.
      def url
        return @url if @url
        links.first
      end
    end

  end

end
@@ -0,0 +1,28 @@
1
module Feedzirra

  module Parser
    # == Summary
    # Parser for Feedburner-flavoured Atom feeds, declared with the SAXMachine DSL.
    #
    # == Attributes
    # * title
    # * feed_url
    # * url
    # * entries
    class AtomFeedBurner
      include SAXMachine
      include FeedUtilities

      element :title
      element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
      element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
      element :subtitle, :as => :description
      elements :entry, :as => :entries, :class => AtomFeedBurnerEntry

      class << self
        # Truthy only when +xml+ mentions both "Atom" and "feedburner";
        # false otherwise.
        def able_to_parse?(xml) #:nodoc:
          (xml =~ /Atom/ && xml =~ /feedburner/) || false
        end
      end
    end

  end

end
@@ -0,0 +1,37 @@
1
module Feedzirra

  module Parser
    # == Summary
    # Parser for the entries of a Feedburner Atom feed, declared with the
    # SAXMachine DSL.
    #
    # == Attributes
    # * title
    # * url
    # * author
    # * content
    # * summary
    # * published
    # * categories
    class AtomFeedBurnerEntry
      include SAXMachine
      include FeedEntryUtilities

      element :title
      element :name, :as => :author

      # Both the alternate text/html link and feedburner:origLink map to url.
      element :link, :as => :url, :value => :href, :with => {:type => "text/html", :rel => "alternate"}
      element :"feedburner:origLink", :as => :url

      element :summary
      element :guid
      element :content

      # published, issued and created all feed the published writer.
      element :published
      element :id
      element :issued, :as => :published
      element :created, :as => :published
      element :"media:content", :as => :image, :value => :url

      # updated and modified both feed the updated writer.
      element :updated
      element :modified, :as => :updated

      elements :category, :as => :categories, :value => :term
    end

  end

end
@@ -0,0 +1,50 @@
1
module Feedzirra

  module Parser
    # iTunes is RSS 2.0 + some apple extensions
    # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
    class ITunesRSS
      include SAXMachine
      include FeedUtilities

      attr_accessor :feed_url

      # RSS 2.0 elements that need including
      element :copyright
      element :description
      element :language
      element :managingEditor
      element :title
      element :link, :as => :url

      # If author is not present use managingEditor on the channel
      element :"itunes:author", :as => :itunes_author
      element :"itunes:block", :as => :itunes_block
      element :"itunes:image", :value => :href, :as => :itunes_image
      element :"itunes:explicit", :as => :itunes_explicit
      element :"itunes:keywords", :as => :itunes_keywords
      # New URL for the podcast feed
      element :"itunes:new-feed-url", :as => :itunes_new_feed_url
      element :"itunes:subtitle", :as => :itunes_subtitle
      # If summary is not present, use the description tag
      element :"itunes:summary", :as => :itunes_summary

      # iTunes RSS feeds can have multiple main categories...
      # ...and multiple sub-categories per category
      # TODO subcategories not supported correctly - they are at the same level
      # as the main categories
      elements :"itunes:category", :as => :itunes_categories, :value => :text

      elements :"itunes:owner", :as => :itunes_owners, :class => ITunesRSSOwner

      elements :item, :as => :entries, :class => ITunesRSSItem

      # Truthy when +xml+ declares the iTunes podcast namespace.
      #
      # The dots in the URI are escaped so they only match literal dots. The
      # declaration may use single or double quotes, may have whitespace
      # around "=", and is matched case-insensitively so differently
      # capitalised forms of the DTD path are accepted too.
      def self.able_to_parse?(xml)
        xml =~ /xmlns:itunes\s*=\s*["']http:\/\/www\.itunes\.com\/dtds\/podcast-1\.0\.dtd["']/i
      end

    end

  end

end
@@ -0,0 +1,31 @@
1
module Feedzirra

  module Parser
    # iTunes extensions to the standard RSS2.0 item
    # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
    class ITunesRSSItem
      include SAXMachine
      # An item is a feed *entry*, so it uses the entry helpers like RSSEntry
      # and AtomEntry do. FeedEntryUtilities supplies the published writer,
      # which runs pubDate through parse_datetime, and the sanitize! method
      # that FeedUtilities#sanitize_entries! calls on every entry.
      include FeedEntryUtilities
      element :author
      element :guid
      element :title
      element :link, :as => :url
      element :description, :as => :summary
      element :pubDate, :as => :published

      # If author is not present use author tag on the item
      element :"itunes:author", :as => :itunes_author
      element :"itunes:block", :as => :itunes_block
      element :"itunes:duration", :as => :itunes_duration
      element :"itunes:explicit", :as => :itunes_explicit
      element :"itunes:keywords", :as => :itunes_keywords
      element :"itunes:subtitle", :as => :itunes_subtitle
      # If summary is not present, use the description tag
      element :"itunes:summary", :as => :itunes_summary
      element :enclosure, :value => :length, :as => :enclosure_length
      element :enclosure, :value => :type, :as => :enclosure_type
      element :enclosure, :value => :url, :as => :enclosure_url
    end
  end

end
@@ -0,0 +1,12 @@
1
module Feedzirra

  module Parser
    # Owner details nested inside an itunes:owner element: the itunes:name
    # and itunes:email children, exposed as name and email.
    class ITunesRSSOwner
      include SAXMachine
      include FeedUtilities

      element :"itunes:name", :as => :name
      element :"itunes:email", :as => :email
    end
  end

end
@@ -0,0 +1,36 @@
1
module Feedzirra

  module Parser
    # == Summary
    # Parser for RSS feeds, declared with the SAXMachine DSL.
    #
    # == Attributes
    # * title
    # * feed_url
    # * url
    # * entries
    class RSS
      include SAXMachine
      include FeedUtilities

      attr_accessor :feed_url

      element :title
      element :link, :as => :url
      elements :item, :as => :entries, :class => RSSEntry

      # Both subtitle and description are captured so that #description can
      # use whichever one the feed provides.
      element :subtitle
      element :description, :as => :feed_description

      class << self
        # Truthy when +xml+ contains an opening "<rss" or "<rdf" tag.
        def able_to_parse?(xml) #:nodoc:
          xml =~ /\<rss|\<rdf/
        end
      end

      # The feed's description, or its subtitle when no description was found.
      def description
        feed_description || subtitle
      end
    end

  end

end
@@ -0,0 +1,45 @@
1
module Feedzirra

  module Parser
    # == Summary
    # Parser for the entries (items) of an RSS feed, declared with the
    # SAXMachine DSL.
    #
    # == Attributes
    # * title
    # * url
    # * author
    # * content
    # * summary
    # * published
    # * categories
    class RSSEntry
      include SAXMachine
      include FeedEntryUtilities

      element :title
      element :link, :as => :url

      # Either dc:creator or author supplies the author.
      element :"dc:creator", :as => :author
      element :author, :as => :author
      element :"content:encoded", :as => :content
      element :description, :as => :summary

      # Several spellings of the publication date all feed the published writer.
      element :pubDate, :as => :published
      element :pubdate, :as => :published
      element :"dc:date", :as => :published
      element :"dc:Date", :as => :published
      element :"dcterms:created", :as => :published

      element :"dcterms:modified", :as => :updated
      element :issued, :as => :published
      elements :category, :as => :categories

      element :guid

      # TODO: wtf... sometimes type="image/jpeg", sometimes medium="image", what are we to do?
      element :"media:content", :as => :image, :value => :url
    end

  end

end
@@ -0,0 +1,8 @@
1
module Feedzirra
  # Minimal SAXMachine parser for an HTML page: it captures the href of a
  # <link rel="alternate" type="application/rss+xml"> element as feed_url.
  class WebPage
    include SAXMachine
    include FeedUtilities

    # not essential; helpful for debugging
    element :title
    element :link, :as => :feed_url, :value => :href, :with => {:rel => 'alternate', :type => "application/rss+xml"}
  end
end
data/lib/feedzirra.rb ADDED
@@ -0,0 +1,35 @@
1
+ $LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))
2
+
3
+ gem 'activesupport'
4
+
5
+ require 'zlib'
6
+ require 'curb_core'
7
+ require 'sax-machine'
8
+ require 'dryopteris'
9
+ require 'uri'
10
+ require 'active_support/basic_object'
11
+ require 'active_support/core_ext/object'
12
+ require 'active_support/core_ext/time'
13
+
14
+ require 'core_ext/date'
15
+ require 'core_ext/string'
16
+
17
+ require 'feedzirra/feed_utilities'
18
+ require 'feedzirra/feed_entry_utilities'
19
+ require 'feedzirra/web_page'
20
+ require 'feedzirra/feed'
21
+
22
+ require 'feedzirra/parser/rss_entry'
23
+ require 'feedzirra/parser/itunes_rss_owner'
24
+ require 'feedzirra/parser/itunes_rss_item'
25
+ require 'feedzirra/parser/atom_entry'
26
+ require 'feedzirra/parser/atom_feed_burner_entry'
27
+
28
+ require 'feedzirra/parser/rss'
29
+ require 'feedzirra/parser/itunes_rss'
30
+ require 'feedzirra/parser/atom'
31
+ require 'feedzirra/parser/atom_feed_burner'
32
+
33
module Feedzirra
  # Gem version string. Frozen so the shared constant cannot be mutated in place.
  VERSION = "0.0.17".freeze
end
@@ -0,0 +1,62 @@
1
require File.dirname(__FILE__) + '/../spec_helper'

# These examples exercise FeedEntryUtilities (parse_datetime and sanitize!),
# so that is the module named in the describe block.
describe Feedzirra::FeedEntryUtilities do
  before(:each) do
    @klass = Class.new do
      include Feedzirra::FeedEntryUtilities
    end
  end

  describe "handling dates" do
    it "should parse an ISO 8601 formatted datetime into Time" do
      time = @klass.new.parse_datetime("2008-02-20T8:05:00-010:00")
      time.class.should == Time
      time.to_s.should == "Wed Feb 20 18:05:00 UTC 2008"
    end

    # NOTE(review): parse_datetime's fallback path reads DateTime.now, so the
    # expected values below appear to hold only for the date these examples
    # were written around (early January 2010) — confirm before relying on them.
    it "should parse a bunch of strangely encoded stuff into Time" do
      time = @klass.new.parse_datetime("Mon, 4 Jan 7010 13:51:39 EST")
      time.class.should == Time
      time.to_s.should == "Mon Jan 04 18:51:39 UTC 2010"

      time = @klass.new.parse_datetime("���, 5 1��� 2010 10:37 -0500")
      time.class.should == Time
      time.to_s.should == "Tue Jan 05 15:37:00 UTC 2010"
    end
  end

  describe "sanitizing" do
    before(:each) do
      @feed = Feedzirra::Feed.parse(sample_atom_feed)
      @entry = @feed.entries.first
    end

    it "should provide a sanitized title" do
      new_title = "<script>" + @entry.title
      @entry.title = new_title
      @entry.title.sanitize.should == Dryopteris.sanitize(new_title)
    end

    it "should sanitize content in place" do
      new_content = "<script>" + @entry.content
      @entry.content = new_content.dup
      @entry.content.sanitize!.should == Dryopteris.sanitize(new_content)
      @entry.content.should == Dryopteris.sanitize(new_content)
    end

    it "should sanitize things in place" do
      @entry.title += "<script>"
      @entry.author += "<script>"
      @entry.content += "<script>"

      cleaned_title = Dryopteris.sanitize(@entry.title)
      cleaned_author = Dryopteris.sanitize(@entry.author)
      cleaned_content = Dryopteris.sanitize(@entry.content)

      @entry.sanitize!
      @entry.title.should == cleaned_title
      @entry.author.should == cleaned_author
      @entry.content.should == cleaned_content
    end
  end
end