feed-abstract 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ pkg/*
2
+ *.gem
3
+ .bundle
4
+ Gemfile.lock
5
+ *.swp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over HTTPS so downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in feed-abstract.gemspec
gemspec
data/README.rdoc ADDED
@@ -0,0 +1,23 @@
1
+ = Feed::Abstract
2
+
3
+ Feed::Abstract creates a common object graph for RSS, Atom, and RDF feeds using the classes returned by RSS::Parser
4
+
5
+ == Installation
6
+
7
+ gem install feed-abstract
8
+
9
+ == Usage
10
+
11
+ See Feed::Abstract::Feed for basic examples. Also see the Feed::Abstract::Channel and Feed::Abstract::Item namespaces.
12
+
13
+ == Author
14
+
15
+ Dan Collis Puro, Berkman Center for Internet & Society
16
+ djcp@cyber.law.harvard.edu
17
+
18
+ == License & Copyright
19
+
20
+ GPLv3
21
+
22
+ 2011 President and Fellows of Harvard College
23
+
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require 'bundler'
2
+ Bundler::GemHelper.install_tasks
@@ -0,0 +1,23 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "feed-abstract/version"
4
+
5
# Packaging metadata for the feed-abstract gem.
Gem::Specification.new do |spec|
  # -- Identity ---------------------------------------------------------
  spec.name     = "feed-abstract"
  spec.version  = Feed::Abstract::VERSION
  spec.platform = Gem::Platform::RUBY

  # -- Ownership --------------------------------------------------------
  spec.authors  = ["Daniel Collis-Puro"]
  spec.email    = ["djcp@cyber.law.harvard.edu"]
  spec.homepage = "https://github.com/berkmancenter/feed-abstract"

  # -- Descriptions -----------------------------------------------------
  spec.summary     = %q{Abstracts RSS/Atom/RDF parsing features from the ruby standard lib into a common object graph.}
  spec.description = %q{This library creates a common object graph for the RSS/Atom/RDF parsing classes in the ruby standard library. This allows you parse different feed formats and get back the same (or at least a very similar) set of results - item authors are accessible under an "author(s)" attribute, categories/tags/subjects are accessible under "category(ies)" attributes, etc. We do our best to make sure the data makes sense, too - RSS items lack an "updated" attribute, so we use "pubDate" to populate it. }

  # -- Contents ---------------------------------------------------------
  # The file lists are whatever git currently tracks, one path per line.
  spec.files        = `git ls-files`.split("\n")
  spec.rdoc_options = ["--charset=UTF-8"]
  spec.test_files   = `git ls-files -- {test,spec,features}/*`.split("\n")

  # Executables are registered by bare file name, not by path.
  spec.executables = `git ls-files -- bin/*`.split("\n").map do |path|
    File.basename(path)
  end
  spec.require_paths = ["lib"]

  # -- Dependencies and documentation -----------------------------------
  spec.add_development_dependency "rspec", "2.6"
  spec.extra_rdoc_files = ["README.rdoc"]
end
@@ -0,0 +1,33 @@
1
+ # encoding: UTF-8
2
+
3
+ $LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))
4
+
5
+ require 'rss'
6
+
7
+ require "feed-abstract/version"
8
+ require 'feed-abstract/mixins'
9
+
10
+ require 'feed-abstract/channel/atom'
11
+ require 'feed-abstract/channel/rss'
12
+ require 'feed-abstract/channel/rdf'
13
+
14
+ require 'feed-abstract/items/rss'
15
+ require 'feed-abstract/items/atom'
16
+ require 'feed-abstract/items/rdf'
17
+
18
+ require 'feed-abstract/item/rss'
19
+ require 'feed-abstract/item/atom'
20
+ require 'feed-abstract/item/rdf'
21
+
22
+ require 'feed-abstract/feed'
23
+
24
+
25
+ #RSS::Rss::Channel::Item.class_eval{
26
+ # def foobar
27
+ # 'asdfasdf'
28
+ # end
29
+ #}
30
+
31
+ #RSS::Rss::Channel::Item.instance_eval{
32
+ # install_get_attribute('category', '', false )
33
+ #}
@@ -0,0 +1,47 @@
1
+ # encoding: UTF-8
2
+
3
module Feed
  class Abstract

    # You don't want this class. You want Feed::Abstract::Channel::Atom, Feed::Abstract::Channel::RSS or Feed::Abstract::Channel::RDF.
    class Channel

      # Channel-level accessors for a parsed Atom feed. Every accessor here
      # returns '' instead of raising when the underlying element is absent.
      #
      # See Feed::AbstractMixins::Atom for more instance methods.
      class Atom
        include Feed::AbstractMixins::Atom

        # Both readers expose the same wrapped feed object.
        attr_reader :feed, :source

        # feed - the parsed feed object to wrap (Feed::Abstract::Feed passes
        #        the RSS::Atom::Feed returned by RSS::Parser).
        def initialize(feed)
          @feed = @source = feed
        end

        # The feed subtitle, or '' when the feed has no subtitle element.
        # Guarded the same way as #generator, #icon and #logo so a missing
        # subtitle no longer raises NoMethodError.
        def description
          return '' if @feed.subtitle.nil?
          @feed.subtitle.content
        end
        alias :subtitle :description

        # A string representing the application that created this feed, or ''
        # when the feed does not declare one.
        def generator
          return '' if @feed.generator.nil?
          @feed.generator.content
        end

        # A URL (perhaps with domain, depending on input) representing an icon
        # for the feed, or '' when absent.
        def icon
          return '' if @feed.icon.nil?
          @feed.icon.content
        end

        # A URL (perhaps with domain, depending on input) representing a logo
        # for the feed, or '' when absent.
        def logo
          return '' if @feed.logo.nil?
          @feed.logo.content
        end

      end
    end


  end
end
@@ -0,0 +1,63 @@
1
+ # encoding: UTF-8
2
+
3
module Feed
  class Abstract
    class Channel

      # Channel-level accessors for an RDF (RSS 1.0) feed. Inherits from
      # Channel::RSS and overrides the accessors that are read from Dublin
      # Core (dc_*) or RDF-specific attributes instead.
      class RDF < RSS

        # The authors list as an array.
        #
        # NOTE(review): unlike #categories, the dc_publishers entries are
        # returned as-is rather than mapped to their content - confirm that
        # this is intended.
        def authors
          return [] if @feed.channel.dc_publishers.empty?
          @feed.channel.dc_publishers
        end

        # The authors list as a string, joined with a comma.
        def author
          return '' if self.authors.empty?
          self.authors.join(', ')
        end

        # The generator of this feed. Only Connotea is recognised, by looking
        # at the channel's "about" value; anything else yields ''.
        def generator
          return '' unless @feed.channel.respond_to?(:about)
          # to_s guards against a channel whose about value is nil, which
          # previously raised NoMethodError on the match call.
          return 'Connotea' if @feed.channel.about.to_s =~ /connotea/i
          ''
        end

        # The category list as an array.
        def categories
          return [] if @feed.channel.dc_subjects.empty?
          @feed.channel.dc_subjects.collect{|c| c.content}
        end

        # The category list as a string, joined with a comma.
        def category
          return '' if self.categories.empty?
          self.categories.join(', ')
        end

        # A URL (with or without domain depending on input) to an icon
        # representing this feed, or '' when the channel has no image.
        def icon
          return '' if @feed.channel.image.nil?
          @feed.channel.image.resource
        end
        alias :logo :icon

        # Copyright info, or '' when absent.
        def rights
          return '' if @feed.channel.dc_rights.nil?
          @feed.channel.dc_rights
        end

        # A Time object representing when this feed was updated, or at least
        # "dated" according to the RDF spec. Returns '' when the channel
        # carries no dc_date.
        def updated
          return '' if @feed.channel.dc_date.nil?
          @feed.channel.dc_date
        end

      end
    end
  end
end
@@ -0,0 +1,91 @@
1
+ # encoding: UTF-8
2
+
3
module Feed
  class Abstract
    class Channel

      # Channel-level accessors for an RSS feed. Optional elements that are
      # absent come back as '' (or [] for list accessors) rather than nil.
      class RSS
        include Feed::AbstractMixins::RSS

        # Both readers expose the same wrapped feed object.
        attr_reader :feed, :source

        # feed - the parsed feed object to wrap; it must respond to #channel.
        def initialize(feed)
          @feed = @source = feed
        end

        # The channel title, exactly as held by the wrapped feed.
        def title
          @feed.channel.title
        end

        # The channel description, exactly as held by the wrapped feed.
        def description
          @feed.channel.description
        end
        alias :subtitle :description

        # The generator of this feed as a string. Sometimes a URL, sometimes a
        # string (e.g. the application name). WordPress and Delicious feeds
        # are normalised to a fixed name; '' is returned when nothing is known.
        def generator
          declared = @feed.channel.generator
          return 'WordPress' if !declared.nil? && declared.match(/wordpress\.org/i)

          # The link may be missing (see #link), so guard before matching; the
          # unguarded call used to raise NoMethodError for link-less channels.
          channel_link = @feed.channel.link
          return 'Delicious' if !channel_link.nil? && channel_link.match(/www\.delicious\.com/i)

          declared.nil? ? '' : declared
        end

        # The channel link, or '' when absent.
        def link
          return '' if @feed.channel.link.nil?
          @feed.channel.link
        end

        # Copyright info: the copyright and dc:rights values joined with a
        # space, or '' when neither is present.
        def rights
          [@feed.channel.copyright, @feed.channel.dc_rights].compact.join(' ')
        end

        # The channel's lastBuildDate (documented by the original author as a
        # Time object), or '' when absent.
        def updated
          return '' if @feed.channel.lastBuildDate.nil?
          @feed.channel.lastBuildDate
        end

        # A globally unique ID for this feed. A URL in this case (the channel
        # link), or '' when the channel has no link.
        def guid
          return '' if @feed.channel.link.nil?
          @feed.channel.link
        end

        # The authors (a merge of the RSS managingEditor and dc:publisher
        # elements) as an array. A missing managingEditor is dropped instead
        # of surfacing as a nil entry.
        def authors
          [@feed.channel.managingEditor, @feed.channel.dc_publishers].flatten.compact.uniq
        end

        # The author list joined with a comma.
        def author
          return '' if self.authors.empty?
          self.authors.join(', ')
        end

        # The category list (a merge of the RSS category and dc:subject
        # elements) as an array of strings. Duplicates are removed after the
        # content has been extracted, so equal names collapse into one entry.
        def categories
          [@feed.channel.categories, @feed.channel.dc_subjects].flatten.collect{|c| c.content}.uniq
        end

        # The RSS category elements as a string, joined with a comma.
        #
        # NOTE(review): this reads only the RSS category elements, not the
        # merged list that #categories returns - confirm that is intended.
        def category
          return '' if @feed.channel.categories.empty?
          @feed.channel.categories.collect{|c| c.content}.join(', ')
        end

        # A URL to an icon representing this feed, or '' when the channel has
        # no image.
        def icon
          return '' if @feed.channel.image.nil?
          @feed.channel.image.url
        end
        alias :logo :icon

      end
    end
  end
end
@@ -0,0 +1,74 @@
1
+ # encoding: UTF-8
2
+
3
module Feed
  # Raised by Feed::Abstract::Feed when RSS::Parser yields no feed object for
  # the supplied input. Derives from StandardError so that an ordinary
  # +rescue+ clause handles it.
  class ParserError < StandardError
  end

  class Abstract

    # Feed::Abstract::Feed is the main class. It invokes RSS::Parser and negotiates which of the Feed::Abstract::Channel and Feed::Abstract::Item classes get dispatched to normalize the object graph of the feed you're parsing.
    class Feed
      # Parser settings used for any option the caller does not supply.
      DEFAULT_OPTIONS = {:do_validate => false, :ignore_unknown_element => true}.freeze

      attr_reader :channel, :raw_feed, :items

      # === Parameters
      # * xml - a string or object instance that responds to <b>read</b>
      # * :do_validate - whether or not the feed should be validated. Passed through to RSS::Parser
      # * :ignore_unknown_element - passed through to RSS::Parser
      #
      # Options that are left out fall back to DEFAULT_OPTIONS, so passing only
      # one of them keeps the default for the other.
      #
      # === Returns
      # An object with three attributes:
      # * channel - an instance of Feed::Abstract::Channel matching the type of feed we recognized
      # * items - an array of items matching the type of feed we recognized.
      # * raw_feed - the raw feed object returned by RSS::Parser, which might include RSS::Atom::Feed, RSS::RDF, or RSS::Rss
      # You will most likely be using the <b>channel</b> and <b>items</b> attributes.
      #
      # === Notes
      # If RSS::Parser returns no feed for the input, we raise a
      # Feed::ParserError. Exceptions raised by RSS::Parser itself are not
      # rescued here and propagate unchanged.
      #
      # == Examples
      #
      #   f = Feed::Abstract::Feed.new(File.open('/home/foo/xml/feed.rss2'))
      #   puts f.channel.title
      #   puts f.channel.description
      #
      #   f.items.each do|item|
      #     puts item.title
      #     puts item.link
      #   end
      #
      #   f = Feed::Abstract::Feed.new(File.open('/home/foo/xml/feed.atom'))
      #   puts f.channel.generator
      #
      #   puts "All tags / categories / subjects in this feed: " + f.items.collect{|i| i.categories}.flatten.uniq.sort.join(', ')
      #
      #   f = Feed::Abstract::Feed.new(Net::HTTP.get(URI.parse('http://rss.slashdot.org/Slashdot/slashdot')))
      #   puts f.items.collect{|i| i.link}
      #
      def initialize(xml = nil, options = DEFAULT_OPTIONS)
        settings = DEFAULT_OPTIONS.merge(options || {})
        input = xml.respond_to?(:read) ? xml.read : xml
        @raw_feed = RSS::Parser.parse(input, settings[:do_validate], settings[:ignore_unknown_element])

        # The leading :: matters: inside this class a bare "Feed" resolves to
        # Feed::Abstract::Feed, which has no ParserError constant.
        raise ::Feed::ParserError if @raw_feed.nil?

        negotiate_channel_class
      end

      private

      # Here's an easy extension point for custom parsers. Picks the channel
      # and items wrappers from the exact class of the parsed feed; anything
      # that is neither Atom nor RDF is handled as RSS.
      def negotiate_channel_class
        if @raw_feed.class == RSS::Atom::Feed
          @channel = Channel::Atom.new(@raw_feed)
          @items = Items::Atom.new(@raw_feed)
        elsif @raw_feed.class == RSS::RDF
          @channel = Channel::RDF.new(@raw_feed)
          @items = Items::RDF.new(@raw_feed)
        else
          @channel = Channel::RSS.new(@raw_feed)
          @items = Items::RSS.new(@raw_feed)
        end
      end

    end
  end
end
@@ -0,0 +1,50 @@
1
+ # encoding: UTF-8
2
+
3
module Feed
  class Abstract

    # You don't want this class. You want Feed::Abstract::Item::Atom, Feed::Abstract::Item::RSS or Feed::Abstract::Item::RDF.
    class Item

      # Accessors for a single Atom entry. Missing optional elements come
      # back as '' (or [] for list accessors).
      #
      # See Feed::AbstractMixins::Atom for more instance methods.
      class Atom
        include Feed::AbstractMixins::Atom
        attr_reader :item, :source

        # item - the entry object to wrap; exposed through both #item and
        #        #source.
        def initialize(item)
          @source = item
          @item = item
        end

        # The full content of the item, most likely html.
        def content
          body = @item.content
          body.nil? ? '' : body.content
        end

        # The contributor list as an array.
        def contributors
          people = @item.contributors
          people.empty? ? [] : people.collect { |person| person.name.content }
        end

        # The contributor list as a string joined with a comma.
        def contributor
          people = @item.contributors
          if people.empty?
            ''
          else
            people.collect { |person| person.name.content }.join(', ')
          end
        end

        # The entry summary, or '' when there is none.
        def summary
          abstract = @item.summary
          abstract.nil? ? '' : abstract.content
        end

        # A Time object, or '' when the entry has no published element.
        def published
          stamp = @item.published
          stamp.nil? ? '' : stamp.content
        end

      end
    end
  end
end
@@ -0,0 +1,48 @@
1
+ # encoding: UTF-8
2
+
3
module Feed
  class Abstract
    class Item

      # Accessors for a single RDF item. Inherits from Item::RSS and
      # overrides the accessors that are read from Dublin Core (dc_*) or
      # RDF-specific attributes.
      class RDF < RSS

        # The author list (from the dc:creator element) as an array.
        def authors
          creators = @item.dc_creators
          return [] if creators.empty?
          creators.collect { |creator| creator.content }
        end

        # The author list as a string, joined with a comma.
        def author
          names = authors
          names.empty? ? '' : names.join(', ')
        end

        # The category list (parsed from the dc:subject element) as an array.
        def categories
          subjects = @item.dc_subjects
          subjects.empty? ? [] : subjects.collect { |subject| subject.content }
        end

        # The category list as a string, joined with a comma.
        def category
          names = categories
          names.empty? ? '' : names.join(', ')
        end

        # A Time object, or '' when the item has no dc_date. Also available
        # as #published.
        def updated
          stamp = @item.dc_date
          stamp.nil? ? '' : stamp
        end
        alias_method :published, :updated

        # A globally unique id, in this case probably a URL. Returns '' when
        # the item has no "about" value.
        def guid
          identifier = @item.about
          identifier.nil? ? '' : identifier
        end

      end
    end
  end
end