feed-abstract 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +5 -0
- data/Gemfile +4 -0
- data/README.rdoc +23 -0
- data/Rakefile +2 -0
- data/feed-abstract.gemspec +23 -0
- data/lib/feed-abstract.rb +33 -0
- data/lib/feed-abstract/channel/atom.rb +47 -0
- data/lib/feed-abstract/channel/rdf.rb +63 -0
- data/lib/feed-abstract/channel/rss.rb +91 -0
- data/lib/feed-abstract/feed.rb +74 -0
- data/lib/feed-abstract/item/atom.rb +50 -0
- data/lib/feed-abstract/item/rdf.rb +48 -0
- data/lib/feed-abstract/item/rss.rb +86 -0
- data/lib/feed-abstract/items/atom.rb +22 -0
- data/lib/feed-abstract/items/rdf.rb +20 -0
- data/lib/feed-abstract/items/rss.rb +20 -0
- data/lib/feed-abstract/mixins.rb +66 -0
- data/lib/feed-abstract/version.rb +7 -0
- data/spec/feed_abstract_channel_spec.rb +132 -0
- data/spec/feed_abstract_item_spec.rb +160 -0
- data/spec/feed_abstract_spec.rb +30 -0
- data/spec/spec_helper.rb +2 -0
- data/spec/test_data/djcp.rss +229 -0
- data/spec/test_data/djcp_code.rss +320 -0
- data/spec/test_data/djcp_delicious.rss +1122 -0
- data/spec/test_data/doc.atom +321 -0
- data/spec/test_data/katanapg.atom +137 -0
- data/spec/test_data/oa.africa.rss +330 -0
- metadata +105 -0
data/Gemfile
ADDED
data/README.rdoc
ADDED
@@ -0,0 +1,23 @@
= Feed::Abstract

Feed::Abstract creates a common object graph for RSS, Atom, and RDF feeds using the classes returned by RSS::Parser.

== Installation

gem install feed-abstract

== Usage

See Feed::Abstract::Feed for basic examples. Also see the Feed::Abstract::Channel and Feed::Abstract::Item namespaces.

== Author

Dan Collis Puro, Berkman Center for Internet & Society
djcp@cyber.law.harvard.edu

== License & Copyright

GPLv3

2011 President and Fellows of Harvard College

data/Rakefile
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "feed-abstract/version"
|
4
|
+
|
5
|
+
# Gem specification for feed-abstract. The version comes from
# Feed::Abstract::VERSION; the packaged files, test files and executables are
# taken from `git ls-files`, so building the gem requires a git checkout.
Gem::Specification.new do |s|
  s.name        = "feed-abstract"
  s.version     = Feed::Abstract::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Daniel Collis-Puro"]
  s.email       = ["djcp@cyber.law.harvard.edu"]
  s.homepage    = "https://github.com/berkmancenter/feed-abstract"
  s.summary     = %q{Abstracts RSS/Atom/RDF parsing features from the ruby standard lib into a common object graph.}
  s.description = %q{This library creates a common object graph for the RSS/Atom/RDF parsing classes in the ruby standard library. This allows you parse different feed formats and get back the same (or at least a very similar) set of results - item authors are accessible under an "author(s)" attribute, categories/tags/subjects are accessible under "category(ies)" attributes, etc. We do our best to make sure the data makes sense, too - RSS items lack an "updated" attribute, so we use "pubDate" to populate it. }

  s.files         = `git ls-files`.split("\n")
  s.rdoc_options  = ["--charset=UTF-8"]
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  # Executables are registered by basename only.
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_development_dependency "rspec", "2.6"

  s.extra_rdoc_files = [
    "README.rdoc"
  ]
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))
|
4
|
+
|
5
|
+
require 'rss'
|
6
|
+
|
7
|
+
require "feed-abstract/version"
|
8
|
+
require 'feed-abstract/mixins'
|
9
|
+
|
10
|
+
require 'feed-abstract/channel/atom'
|
11
|
+
require 'feed-abstract/channel/rss'
|
12
|
+
require 'feed-abstract/channel/rdf'
|
13
|
+
|
14
|
+
require 'feed-abstract/items/rss'
|
15
|
+
require 'feed-abstract/items/atom'
|
16
|
+
require 'feed-abstract/items/rdf'
|
17
|
+
|
18
|
+
require 'feed-abstract/item/rss'
|
19
|
+
require 'feed-abstract/item/atom'
|
20
|
+
require 'feed-abstract/item/rdf'
|
21
|
+
|
22
|
+
require 'feed-abstract/feed'
|
23
|
+
|
24
|
+
|
25
|
+
#RSS::Rss::Channel::Item.class_eval{
|
26
|
+
# def foobar
|
27
|
+
# 'asdfasdf'
|
28
|
+
# end
|
29
|
+
#}
|
30
|
+
|
31
|
+
#RSS::Rss::Channel::Item.instance_eval{
|
32
|
+
# install_get_attribute('category', '', false )
|
33
|
+
#}
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Feed
|
4
|
+
class Abstract
|
5
|
+
|
6
|
+
# You don't want this class. You want Feed::Abstract::Channel::Atom, Feed::Abstract::Channel::RSS or Feed::Abstract::Channel::RDF.
|
7
|
+
class Channel
|
8
|
+
|
9
|
+
# See Feed::AbstractMixins::Atom for more instance methods.
|
10
|
+
class Atom
|
11
|
+
include Feed::AbstractMixins::Atom
|
12
|
+
|
13
|
+
attr_reader :feed, :source
|
14
|
+
|
15
|
+
# Wraps a parsed feed object.
#
# feed - the parsed feed; exposed through both the +feed+ and +source+ readers.
#        (Presumably an RSS::Atom::Feed -- confirm against Feed::Abstract::Feed.)
def initialize(feed)
  @source = feed
  @feed = feed
end
|
18
|
+
|
19
|
+
# The text of the feed's subtitle element, or an empty string when the feed
# has no subtitle. Also available as +subtitle+.
def description
  subtitle_element = @feed.subtitle
  # Guard against a missing subtitle, as generator/icon/logo do for their
  # elements, instead of raising NoMethodError on nil.
  return '' if subtitle_element.nil?
  subtitle_element.content
end
alias :subtitle :description
|
23
|
+
|
24
|
+
# A string representing the application that created this feed, or an empty
# string when the feed has no generator element.
def generator
  element = @feed.generator
  element.nil? ? '' : element.content
end
|
29
|
+
|
30
|
+
# A URL (perhaps with domain, depending on input) representing an icon for
# the feed, or an empty string when the feed has no icon element.
def icon
  element = @feed.icon
  element.nil? ? '' : element.content
end
|
35
|
+
|
36
|
+
# A URL (perhaps with domain, depending on input) representing a logo for
# the feed, or an empty string when the feed has no logo element.
def logo
  element = @feed.logo
  element.nil? ? '' : element.content
end
|
41
|
+
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
|
46
|
+
end
|
47
|
+
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Feed
|
4
|
+
class Abstract
|
5
|
+
class Channel
|
6
|
+
class RDF < RSS
|
7
|
+
|
8
|
+
# The authors list as an array, taken from the channel's dc:publisher entries.
# Returns a new empty array when there are none.
#
# NOTE(review): the entries are returned as-is, whereas #categories maps each
# dc:subject through +content+ -- confirm whether publishers should be mapped
# the same way.
def authors
  publishers = @feed.channel.dc_publishers
  publishers.empty? ? [] : publishers
end
|
13
|
+
|
14
|
+
# The authors list as a string, joined with a comma. Returns an empty string
# when there are no authors (joining an empty list already yields '').
def author
  authors.join(', ')
end
|
19
|
+
|
20
|
+
# The generator of this feed. Returns 'Connotea' when the channel's +about+
# value mentions connotea (case-insensitive); otherwise an empty string,
# including when the channel has no +about+ accessor.
def generator
  channel = @feed.channel
  return '' unless channel.respond_to?(:about)
  # to_s keeps a nil +about+ from raising NoMethodError.
  channel.about.to_s =~ /connotea/i ? 'Connotea' : ''
end
|
28
|
+
|
29
|
+
# The category list as an array: the +content+ of each dc:subject entry on
# the channel. Empty when the channel has none.
def categories
  @feed.channel.dc_subjects.map { |subject| subject.content }
end
|
34
|
+
|
35
|
+
# The category list as a string, joined with a comma. Returns an empty string
# when there are no categories.
def category
  categories.join(', ')
end
|
40
|
+
|
41
|
+
# A URL (with or without domain depending on input) to an icon representing
# this feed: the +resource+ of the channel's image, or an empty string when
# the channel has no image. Also available as +logo+.
def icon
  image = @feed.channel.image
  image.nil? ? '' : image.resource
end
alias :logo :icon
|
47
|
+
|
48
|
+
# Copyright info: the channel's dc:rights value, or an empty string when it
# is not set.
def rights
  declared = @feed.channel.dc_rights
  declared.nil? ? '' : declared
end
|
53
|
+
|
54
|
+
# When this feed was updated, or at least "dated" according to the RDF spec:
# the channel's dc:date value (presumably a Time -- confirm against the
# parser). Returns an empty string, not a Time, when no date is set.
def updated
  stamp = @feed.channel.dc_date
  stamp.nil? ? '' : stamp
end
|
59
|
+
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Feed
|
4
|
+
class Abstract
|
5
|
+
class Channel
|
6
|
+
class RSS
|
7
|
+
include Feed::AbstractMixins::RSS
|
8
|
+
attr_reader :feed, :source
|
9
|
+
|
10
|
+
# Wraps a parsed feed object.
#
# feed - the parsed feed; exposed through both the +feed+ and +source+ readers.
#        The other methods of this class read it through +channel+.
def initialize(feed)
  @source = feed
  @feed = feed
end
|
13
|
+
|
14
|
+
# The channel's title, exactly as the parsed feed reports it (no nil guard is
# applied here).
def title
  @feed.channel.title
end
|
17
|
+
|
18
|
+
# The channel's description, exactly as the parsed feed reports it. Also
# available as +subtitle+.
def description
  @feed.channel.description
end
alias :subtitle :description
|
22
|
+
|
23
|
+
# The generator of this feed as a string. Sometimes a URL, sometimes a string
# (e.g. the application name).
#
# Two sources are normalized to a fixed name:
# * a generator value mentioning wordpress.org yields 'WordPress'
# * a channel link mentioning www.delicious.com yields 'Delicious'
# Otherwise the raw generator value is returned, or an empty string when the
# channel has none.
def generator
  channel = @feed.channel
  declared = channel.generator
  return 'WordPress' if !declared.nil? && declared =~ /wordpress\.org/i
  # to_s keeps a channel without a link from raising NoMethodError.
  return 'Delicious' if channel.link.to_s =~ /www\.delicious\.com/i
  declared.nil? ? '' : declared
end
|
33
|
+
|
34
|
+
# The channel's link, or an empty string when the channel has none.
def link
  url = @feed.channel.link
  url.nil? ? '' : url
end
|
38
|
+
|
39
|
+
# Copyright info: the channel's copyright and dc:rights values, whichever are
# present, joined with a single space. Empty string when neither is set.
def rights
  channel = @feed.channel
  [channel.copyright, channel.dc_rights].compact.join(' ')
end
|
44
|
+
|
45
|
+
# The channel's lastBuildDate (presumably a Time -- confirm against the
# parser). Returns an empty string, not a Time, when it is not set.
def updated
  stamp = @feed.channel.lastBuildDate
  stamp.nil? ? '' : stamp
end
|
50
|
+
|
51
|
+
# A globally unique ID for this feed. The channel link doubles as the ID, so
# this is a URL; an empty string is returned when the channel has no link.
def guid
  url = @feed.channel.link
  url.nil? ? '' : url
end
|
56
|
+
|
57
|
+
# The authors (a merge of the RSS managingEditor and dc:publisher elements)
# as an array with duplicates removed. Empty when neither is present.
#
# NOTE(review): dc:publisher entries are merged as-is, whereas #categories
# maps entries through +content+ -- confirm the element type.
def authors
  channel = @feed.channel
  # compact drops a missing managingEditor so nil never appears in the list
  # when only dc:publisher entries exist.
  [channel.managingEditor, channel.dc_publishers].flatten.compact.uniq
end
|
62
|
+
|
63
|
+
# The author list joined with a comma. Returns an empty string when there are
# no authors.
def author
  authors.join(', ')
end
|
68
|
+
|
69
|
+
# The category list (a merge of the RSS category and dc:subject elements) as
# an array of names, with duplicate names removed. Empty when the channel has
# neither.
def categories
  channel = @feed.channel
  # De-duplicate after extracting +content+: distinct element objects can
  # carry the same name.
  [channel.categories, channel.dc_subjects].flatten.map { |entry| entry.content }.uniq
end
|
74
|
+
|
75
|
+
# The category list as a string, joined with a comma. Built from #categories
# so the string and array forms always describe the same list. Returns an
# empty string when there are no categories.
def category
  categories.join(', ')
end
|
80
|
+
|
81
|
+
# A URL to an icon representing this feed: the +url+ of the channel's image,
# or an empty string when the channel has no image. Also available as +logo+.
def icon
  image = @feed.channel.image
  image.nil? ? '' : image.url
end
alias :logo :icon
|
87
|
+
|
88
|
+
end
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Feed
|
4
|
+
# Raised when a feed cannot be parsed. Inherits from StandardError so that a
# plain +rescue+ (which does not catch bare Exception subclasses) handles it.
class ParserError < StandardError
end
|
6
|
+
|
7
|
+
class Abstract
|
8
|
+
|
9
|
+
# Feed::Abstract::Feed is the main class. It invokes RSS::Parser and negotiates which of the Feed::Abstract::Channel and Feed::Abstract::Item classes get dispatched to normalize the object graph of the feed you're parsing.
class Feed
  # Parser settings used for any option the caller does not supply.
  DEFAULT_OPTIONS = {:do_validate => false, :ignore_unknown_element => true}.freeze

  attr_reader :channel, :raw_feed, :items

  # === Parameters
  # * xml - a string or object instance that responds to <b>read</b>
  # * :do_validate - whether or not the feed should be validated. Passed through to RSS::Parser
  # * :ignore_unknown_element - passed through to RSS::Parser
  #
  # Options left out of the hash fall back to DEFAULT_OPTIONS.
  #
  # === Returns
  # An object with three attributes:
  # * channel - an instance of Feed::Abstract::Channel matching the type of feed we recognized
  # * items - an array of items matching the type of feed we recognized.
  # * raw_feed - the raw feed object returned by RSS::Parser, which might include RSS::Atom::Feed, RSS::RDF, or RSS::Rss
  # You will most likely be using the <b>channel</b> and <b>items</b> attributes.
  #
  # === Notes
  # If RSS::Parser returns nil for the input, we'll raise a Feed::ParserError.
  # Exceptions raised by RSS::Parser itself are not rescued here.
  #
  # == Examples
  #
  #  f = Feed::Abstract::Feed.new(File.open('/home/foo/xml/feed.rss2'))
  #  puts f.channel.title
  #  puts f.channel.description
  #
  #  f.items.each do|item|
  #    puts item.title
  #    puts item.link
  #  end
  #
  #  f = Feed::Abstract::Feed.new(File.open('/home/foo/xml/feed.atom'))
  #  puts f.channel.generator
  #
  #  puts "All tags / categories / subjects in this feed: " + f.items.collect{|i| i.categories}.flatten.uniq.sort.join(', ')
  #
  #  f = Feed::Abstract::Feed.new(Net::HTTP.get(URI.parse('http://rss.slashdot.org/Slashdot/slashdot')))
  #  puts f.items.collect{|i| i.link}
  #
  def initialize(xml = nil, options = DEFAULT_OPTIONS)
    # Merge so that passing only one option does not silently drop the other
    # default.
    settings = DEFAULT_OPTIONS.merge(options || {})
    input = xml.respond_to?(:read) ? xml.read : xml
    @raw_feed = RSS::Parser.parse(input, settings[:do_validate], settings[:ignore_unknown_element])
    # The leading :: is required: inside this class the bare name Feed refers
    # to this class itself, not to the top-level Feed module that defines
    # ParserError.
    raise ::Feed::ParserError if @raw_feed.nil?
    negotiate_channel_class
  end

  private

  # Here's an easy extension point for custom parsers.
  #
  # Picks the Channel/Items wrapper pair from the type of the parsed feed:
  # Atom feeds and RDF documents get their own wrappers; anything else is
  # treated as RSS.
  def negotiate_channel_class
    case @raw_feed
    when RSS::Atom::Feed
      @channel = Channel::Atom.new(@raw_feed)
      @items = Items::Atom.new(@raw_feed)
    when RSS::RDF
      @channel = Channel::RDF.new(@raw_feed)
      @items = Items::RDF.new(@raw_feed)
    else
      @channel = Channel::RSS.new(@raw_feed)
      @items = Items::RSS.new(@raw_feed)
    end
  end
end
|
73
|
+
end
|
74
|
+
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Feed
|
4
|
+
class Abstract
|
5
|
+
|
6
|
+
# You don't want this class. You want Feed::Abstract::Item::Atom, Feed::Abstract::Item::RSS or Feed::Abstract::Item::RDF.
|
7
|
+
class Item
|
8
|
+
|
9
|
+
# See Feed::AbstractMixins::Atom for more instance methods.
|
10
|
+
class Atom
|
11
|
+
include Feed::AbstractMixins::Atom
|
12
|
+
attr_reader :item, :source
|
13
|
+
|
14
|
+
# Wraps a parsed feed entry.
#
# item - the parsed entry; exposed through both the +item+ and +source+ readers.
def initialize(item)
  @source = item
  @item = item
end
|
17
|
+
|
18
|
+
# The full content of the item, most likely html. Returns an empty string
# when the entry has no content element.
def content
  element = @item.content
  element.nil? ? '' : element.content
end
|
23
|
+
|
24
|
+
# The contributor list as an array of names (the +content+ of each
# contributor's name element). Empty when the entry has no contributors.
def contributors
  @item.contributors.map { |person| person.name.content }
end
|
29
|
+
|
30
|
+
# The contributor list as a string, joined with a comma. Returns an empty
# string when the entry has no contributors.
def contributor
  @item.contributors.map { |person| person.name.content }.join(', ')
end
|
35
|
+
|
36
|
+
# The text of the entry's summary element, or an empty string when the entry
# has no summary.
def summary
  element = @item.summary
  element.nil? ? '' : element.content
end
|
40
|
+
|
41
|
+
# The +content+ of the entry's published element (presumably a Time --
# confirm against the parser). Returns an empty string, not a Time, when the
# entry has no published element.
def published
  element = @item.published
  element.nil? ? '' : element.content
end
|
46
|
+
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Feed
|
4
|
+
class Abstract
|
5
|
+
class Item
|
6
|
+
class RDF < RSS
|
7
|
+
|
8
|
+
# The author list (from the dc:creator element) as an array: the +content+ of
# each entry. Empty when the item has no dc:creator entries.
def authors
  @item.dc_creators.map { |creator| creator.content }
end
|
12
|
+
|
13
|
+
# The author list as a string, joined with a comma. Returns an empty string
# when there are no authors.
def author
  authors.join(', ')
end
|
18
|
+
|
19
|
+
# The category list (parsed from the dc:subject element) as an array: the
# +content+ of each entry. Empty when the item has no dc:subject entries.
def categories
  @item.dc_subjects.map { |subject| subject.content }
end
|
24
|
+
|
25
|
+
# The category list as a string, joined with a comma. Returns an empty string
# when there are no categories.
def category
  categories.join(', ')
end
|
30
|
+
|
31
|
+
# The item's dc:date value (presumably a Time -- confirm against the parser).
# Returns an empty string, not a Time, when no date is set. Also available as
# +published+.
def updated
  stamp = @item.dc_date
  stamp.nil? ? '' : stamp
end
alias :published :updated
|
37
|
+
|
38
|
+
# A globally unique id, in this case probably a URL: the item's +about+
# value, or an empty string when it is not set.
def guid
  identifier = @item.about
  identifier.nil? ? '' : identifier
end
|
43
|
+
|
44
|
+
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|