feed-abstract 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +5 -0
- data/Gemfile +4 -0
- data/README.rdoc +23 -0
- data/Rakefile +2 -0
- data/feed-abstract.gemspec +23 -0
- data/lib/feed-abstract.rb +33 -0
- data/lib/feed-abstract/channel/atom.rb +47 -0
- data/lib/feed-abstract/channel/rdf.rb +63 -0
- data/lib/feed-abstract/channel/rss.rb +91 -0
- data/lib/feed-abstract/feed.rb +74 -0
- data/lib/feed-abstract/item/atom.rb +50 -0
- data/lib/feed-abstract/item/rdf.rb +48 -0
- data/lib/feed-abstract/item/rss.rb +86 -0
- data/lib/feed-abstract/items/atom.rb +22 -0
- data/lib/feed-abstract/items/rdf.rb +20 -0
- data/lib/feed-abstract/items/rss.rb +20 -0
- data/lib/feed-abstract/mixins.rb +66 -0
- data/lib/feed-abstract/version.rb +7 -0
- data/spec/feed_abstract_channel_spec.rb +132 -0
- data/spec/feed_abstract_item_spec.rb +160 -0
- data/spec/feed_abstract_spec.rb +30 -0
- data/spec/spec_helper.rb +2 -0
- data/spec/test_data/djcp.rss +229 -0
- data/spec/test_data/djcp_code.rss +320 -0
- data/spec/test_data/djcp_delicious.rss +1122 -0
- data/spec/test_data/doc.atom +321 -0
- data/spec/test_data/katanapg.atom +137 -0
- data/spec/test_data/oa.africa.rss +330 -0
- metadata +105 -0
data/Gemfile
ADDED
data/README.rdoc
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
= Feed::Abstract
|
2
|
+
|
3
|
+
Feed::Abstract creates a common object graph for RSS, Atom, and RDF feeds using the classes returned by RSS::Parser.
|
4
|
+
|
5
|
+
== Installation
|
6
|
+
|
7
|
+
gem install feed-abstract
|
8
|
+
|
9
|
+
== Usage
|
10
|
+
|
11
|
+
See Feed::Abstract::Feed for basic examples. Also see the Feed::Abstract::Channel and Feed::Abstract::Item namespaces.
|
12
|
+
|
13
|
+
== Author
|
14
|
+
|
15
|
+
Dan Collis Puro, Berkman Center for Internet & Society
|
16
|
+
djcp@cyber.law.harvard.edu
|
17
|
+
|
18
|
+
== License & Copyright
|
19
|
+
|
20
|
+
GPLv3
|
21
|
+
|
22
|
+
2011 President and Fellows of Harvard College
|
23
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "feed-abstract/version"
|
4
|
+
|
5
|
+
# Gem specification for feed-abstract. The file lists are taken from what
# git currently tracks, so this has to be evaluated inside a git checkout.
Gem::Specification.new do |spec|
  # Identity
  spec.name     = "feed-abstract"
  spec.version  = Feed::Abstract::VERSION
  spec.platform = Gem::Platform::RUBY

  # Ownership
  spec.authors  = ["Daniel Collis-Puro"]
  spec.email    = ["djcp@cyber.law.harvard.edu"]
  spec.homepage = "https://github.com/berkmancenter/feed-abstract"

  # Descriptive text
  spec.summary     = %q{Abstracts RSS/Atom/RDF parsing features from the ruby standard lib into a common object graph.}
  spec.description = %q{This library creates a common object graph for the RSS/Atom/RDF parsing classes in the ruby standard library. This allows you parse different feed formats and get back the same (or at least a very similar) set of results - item authors are accessible under an "author(s)" attribute, categories/tags/subjects are accessible under "category(ies)" attributes, etc. We do our best to make sure the data makes sense, too - RSS items lack an "updated" attribute, so we use "pubDate" to populate it. }

  # Packaged files, one path per line of git output
  tracked_files = `git ls-files`
  spec.files = tracked_files.split("\n")
  spec.rdoc_options = ["--charset=UTF-8"]
  tracked_tests = `git ls-files -- {test,spec,features}/*`
  spec.test_files = tracked_tests.split("\n")
  tracked_binaries = `git ls-files -- bin/*`
  spec.executables = tracked_binaries.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Dependencies and documentation extras
  spec.add_development_dependency "rspec", "2.6"
  spec.extra_rdoc_files = ["README.rdoc"]
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))
|
4
|
+
|
5
|
+
require 'rss'
|
6
|
+
|
7
|
+
require "feed-abstract/version"
|
8
|
+
require 'feed-abstract/mixins'
|
9
|
+
|
10
|
+
require 'feed-abstract/channel/atom'
|
11
|
+
require 'feed-abstract/channel/rss'
|
12
|
+
require 'feed-abstract/channel/rdf'
|
13
|
+
|
14
|
+
require 'feed-abstract/items/rss'
|
15
|
+
require 'feed-abstract/items/atom'
|
16
|
+
require 'feed-abstract/items/rdf'
|
17
|
+
|
18
|
+
require 'feed-abstract/item/rss'
|
19
|
+
require 'feed-abstract/item/atom'
|
20
|
+
require 'feed-abstract/item/rdf'
|
21
|
+
|
22
|
+
require 'feed-abstract/feed'
|
23
|
+
|
24
|
+
|
25
|
+
#RSS::Rss::Channel::Item.class_eval{
|
26
|
+
# def foobar
|
27
|
+
# 'asdfasdf'
|
28
|
+
# end
|
29
|
+
#}
|
30
|
+
|
31
|
+
#RSS::Rss::Channel::Item.instance_eval{
|
32
|
+
# install_get_attribute('category', '', false )
|
33
|
+
#}
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Feed
  class Abstract

    # You don't want this class. You want Feed::Abstract::Channel::Atom, Feed::Abstract::Channel::RSS or Feed::Abstract::Channel::RDF.
    class Channel

      # Channel-level accessors for a parsed Atom feed. Every reader defined
      # here returns the text content of the matching feed element, or an
      # empty string when the feed does not carry that element.
      #
      # See Feed::AbstractMixins::Atom for more instance methods.
      class Atom
        # Anchored at the top level: inside this namespace the bare name
        # "Feed" can also resolve to Feed::Abstract::Feed once that class has
        # been loaded, which would make the mixin lookup fail.
        include ::Feed::AbstractMixins::Atom

        attr_reader :feed, :source

        # === Parameters
        # * feed - the parsed Atom feed object. It is exposed unchanged through both the <b>feed</b> and <b>source</b> readers.
        def initialize(feed)
          @feed = @source = feed
        end

        # The feed's subtitle as a string. Returns an empty string when the
        # feed has no subtitle element, matching the other readers in this
        # class instead of failing on nil.
        def description
          return '' if @feed.subtitle.nil?
          @feed.subtitle.content
        end
        alias :subtitle :description

        # A string representing the application that created this feed.
        def generator
          return '' if @feed.generator.nil?
          @feed.generator.content
        end

        # A URL (perhaps with domain, depending on input) representing an icon for the feed.
        def icon
          return '' if @feed.icon.nil?
          @feed.icon.content
        end

        # A URL (perhaps with domain, depending on input) representing a logo for the feed.
        def logo
          return '' if @feed.logo.nil?
          @feed.logo.content
        end

      end
    end


  end
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Feed
  class Abstract
    class Channel

      # Channel-level accessors for a parsed RDF (RSS 1.0) feed. Inherits the
      # constructor and the remaining readers from Feed::Abstract::Channel::RSS
      # and overrides the ones that are read from Dublin Core (dc_*) accessors
      # or from the channel's "about" / image resource attributes.
      class RDF < RSS

        # The authors list as an array of strings, taken from the content of
        # each dc:publisher element. Mapping to content keeps this consistent
        # with the category readers and lets #author join plain strings
        # rather than element objects.
        def authors
          return [] if @feed.channel.dc_publishers.empty?
          @feed.channel.dc_publishers.collect{|publisher| publisher.content}
        end

        # The authors list as a string, joined with a comma.
        def author
          return '' if self.authors.empty?
          self.authors.join(', ')
        end

        # The generator of this feed. Only Connotea is recognized, by looking
        # for its name in the channel's "about" value; anything else (including
        # a channel without a usable "about" value) yields an empty string.
        def generator
          return '' unless @feed.channel.respond_to?(:about)
          about = @feed.channel.about
          # Guard against a missing "about" value before matching on it.
          return 'Connotea' if !about.nil? && about.match(/connotea/i)
          ''
        end

        # The category list as an array, taken from the dc:subject elements.
        def categories
          return [] if @feed.channel.dc_subjects.empty?
          @feed.channel.dc_subjects.collect{|c| c.content}
        end

        # The category list as a string, joined with a comma.
        def category
          return '' if self.categories.empty?
          self.categories.join(', ')
        end

        # A URL (with or without domain depending on input) to an icon representing this feed.
        # Returns an empty string when the channel has no image.
        def icon
          return '' if @feed.channel.image.nil?
          @feed.channel.image.resource
        end
        alias :logo :icon

        # Copyright info, or an empty string when dc:rights is absent.
        def rights
          return '' if @feed.channel.dc_rights.nil?
          @feed.channel.dc_rights
        end

        # A Time object representing when this feed was updated, or at least "dated" according to the RDF spec.
        # Returns an empty string (not a Time) when dc:date is absent.
        def updated
          return '' if @feed.channel.dc_date.nil?
          @feed.channel.dc_date
        end

      end
    end
  end
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Feed
  class Abstract
    class Channel

      # Channel-level accessors for a parsed RSS feed. Readers that can be
      # missing from a feed return an empty string (or an empty array for the
      # list readers) instead of nil.
      #
      # See Feed::AbstractMixins::RSS for more instance methods.
      class RSS
        # Anchored at the top level: inside this namespace the bare name
        # "Feed" can also resolve to Feed::Abstract::Feed once that class has
        # been loaded, which would make the mixin lookup fail.
        include ::Feed::AbstractMixins::RSS
        attr_reader :feed, :source

        # === Parameters
        # * feed - the parsed feed object. It is exposed unchanged through both the <b>feed</b> and <b>source</b> readers.
        def initialize(feed)
          @feed = @source = feed
        end

        # The channel title, exactly as the underlying feed reports it.
        def title
          @feed.channel.title
        end

        # The channel description, exactly as the underlying feed reports it.
        def description
          @feed.channel.description
        end
        alias :subtitle :description

        # The generator of this feed as a string. Sometimes a URL, sometimes a string (e.g. the application name).
        #
        # Two generators are normalized to a friendly name: 'WordPress' when
        # the generator element mentions wordpress.org, and 'Delicious' when
        # the channel link points at www.delicious.com. Otherwise the raw
        # generator value is returned, or an empty string when there is none.
        def generator
          declared = @feed.channel.generator
          channel_link = @feed.channel.link
          return 'WordPress' if !declared.nil? && declared.match(/wordpress\.org/i)
          # The link may be absent (see #link), so guard before matching on it.
          return 'Delicious' if !channel_link.nil? && channel_link.match(/www\.delicious\.com/i)
          return '' if declared.nil?
          declared
        end

        # The channel link, or an empty string when the channel has none.
        def link
          return '' if @feed.channel.link.nil?
          @feed.channel.link
        end

        # Copyright info: the RSS copyright and dc:rights values, whichever
        # are present, joined with a space. Empty string when neither is set.
        def rights
          return '' if @feed.channel.copyright.nil? && @feed.channel.dc_rights.nil?
          [@feed.channel.copyright,@feed.channel.dc_rights].compact.join(' ')
        end

        # A Time object.
        # Returns an empty string (not a Time) when lastBuildDate is absent.
        def updated
          return '' if @feed.channel.lastBuildDate.nil?
          @feed.channel.lastBuildDate
        end

        # A globally unique ID for this feed. A URL in this case.
        def guid
          return '' if @feed.channel.link.nil?
          @feed.channel.link
        end

        # The authors (a merge of the RSS managingEditor and dc:publisher elements) as an array.
        #
        # dc:publisher elements are reduced to their text content and a
        # missing managingEditor is dropped, so the result never contains nil
        # or element objects and can be joined safely by #author.
        def authors
          publishers = @feed.channel.dc_publishers.collect{|publisher| publisher.content}
          [@feed.channel.managingEditor, publishers].flatten.compact.uniq
        end

        # The author list joined with a comma.
        def author
          return '' if self.authors.empty?
          self.authors.join(', ')
        end

        # The category list (a merge of the RSS category and dc:subject elements) as an array.
        #
        # Duplicates are removed after the elements have been reduced to
        # their text content, so a term that appears both as a category and
        # as a dc:subject is listed once.
        def categories
          return [] if @feed.channel.categories.empty? && @feed.channel.dc_subjects.empty?
          [@feed.channel.categories, @feed.channel.dc_subjects].flatten.collect{|c| c.content}.uniq
        end

        # The category list as a string, joined with a comma.
        # Built from #categories so the string and array forms always agree.
        def category
          return '' if self.categories.empty?
          self.categories.join(', ')
        end

        # A URL to an icon representing this feed.
        # Returns an empty string when the channel has no image.
        def icon
          return '' if @feed.channel.image.nil?
          @feed.channel.image.url
        end
        alias :logo :icon

      end
    end
  end
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Feed
  # Raised by Feed::Abstract::Feed.new when RSS::Parser returns nil for the
  # supplied input. Derives from StandardError so that a plain <b>rescue</b>
  # catches it; rescuing Feed::ParserError explicitly keeps working.
  class ParserError < StandardError
  end

  class Abstract

    # Feed::Abstract::Feed is the main class. It invokes RSS::Parser and negotiates which of the Feed::Abstract::Channel and Feed::Abstract::Item classes get dispatched to normalize the object graph of the feed you're parsing.
    class Feed
      attr_reader :channel, :raw_feed, :items

      # Parser settings used for every key the caller does not supply.
      DEFAULT_PARSER_OPTIONS = {:do_validate => false, :ignore_unknown_element => true}.freeze

      # === Parameters
      # * xml - a string or object instance that responds to <b>read</b>
      # * :do_validate - whether or not the feed should be validated. Passed through to RSS::Parser. Defaults to false.
      # * :ignore_unknown_element - passed through to RSS::Parser. Defaults to true.
      #
      # Options may be given partially; any key that is left out falls back
      # to the default listed above.
      #
      # === Returns
      # An object with three attributes:
      # * channel - an instance of Feed::Abstract::Channel matching the type of feed we recognized
      # * items - an array of items matching the type of feed we recognized.
      # * raw_feed - the raw feed object returned by RSS::Parser, which might include RSS::Atom::Feed, RSS::RDF, or RSS::Rss
      # You will most likely be using the <b>channel</b> and <b>items</b> attributes.
      #
      # === Notes
      # If RSS::Parser returns nothing for the input, we'll raise a Feed::ParserError.
      # Exceptions raised by RSS::Parser itself are not rescued here and reach the caller unchanged.
      #
      # == Examples
      #
      #  f = Feed::Abstract::Feed.new(File.open('/home/foo/xml/feed.rss2'))
      #  puts f.channel.title
      #  puts f.channel.description
      #
      #  f.items.each do|item|
      #    puts item.title
      #    puts item.link
      #  end
      #
      #  f = Feed::Abstract::Feed.new(File.open('/home/foo/xml/feed.atom'))
      #  puts f.channel.generator
      #
      #  puts "All tags / categories / subjects in this feed: " + f.items.collect{|i| i.categories}.flatten.uniq.sort.join(', ')
      #
      #  f = Feed::Abstract::Feed.new(Net::HTTP.get(URI.parse('http://rss.slashdot.org/Slashdot/slashdot')))
      #  puts f.items.collect{|i| i.link}
      #
      def initialize(xml = nil, options = {:do_validate => false, :ignore_unknown_element => true})
        # Merge so that a caller passing only one key keeps the other default.
        settings = DEFAULT_PARSER_OPTIONS.merge(options || {})
        input = xml.respond_to?(:read) ? xml.read : xml
        @raw_feed = ::RSS::Parser.parse(input, settings[:do_validate], settings[:ignore_unknown_element])
        # The leading :: matters: inside this class the bare name "Feed"
        # resolves to Feed::Abstract::Feed itself, which has no ParserError.
        raise ::Feed::ParserError if @raw_feed.nil?
        negotiate_channel_class
      end

      private

      # Picks the Channel / Items wrappers matching the exact class of the
      # parsed feed: Atom and RDF get their own wrappers, everything else is
      # treated as RSS.
      #
      # Here's an easy extension point for custom parsers.
      def negotiate_channel_class
        if @raw_feed.instance_of?(::RSS::Atom::Feed)
          @channel = Channel::Atom.new(@raw_feed)
          @items = Items::Atom.new(@raw_feed)
        elsif @raw_feed.instance_of?(::RSS::RDF)
          @channel = Channel::RDF.new(@raw_feed)
          @items = Items::RDF.new(@raw_feed)
        else
          @channel = Channel::RSS.new(@raw_feed)
          @items = Items::RSS.new(@raw_feed)
        end
      end

    end
  end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Feed
  class Abstract

    # You don't want this class. You want Feed::Abstract::Item::Atom, Feed::Abstract::Item::RSS or Feed::Abstract::Item::RDF.
    class Item

      # Accessors for a single entry of a parsed Atom feed.
      #
      # See Feed::AbstractMixins::Atom for more instance methods.
      class Atom
        include Feed::AbstractMixins::Atom
        attr_reader :item, :source

        # Wraps +item+, which is exposed through both the <b>item</b> and
        # <b>source</b> readers.
        def initialize(item)
          @source = item
          @item = item
        end

        # The full content of the item, most likely html. Empty string when
        # the entry has no content element.
        def content
          if @item.content.nil?
            ''
          else
            @item.content.content
          end
        end

        # The contributor list as an array of contributor names.
        def contributors
          people = @item.contributors
          return [] if people.empty?
          people.collect { |person| person.name.content }
        end

        # The contributor list as a string joined with a comma.
        def contributor
          contributors.join(', ')
        end

        # The entry summary text, or an empty string when there is none.
        def summary
          if @item.summary.nil?
            ''
          else
            @item.summary.content
          end
        end

        # A Time object. An empty string is returned instead when the entry
        # has no published element.
        def published
          if @item.published.nil?
            ''
          else
            @item.published.content
          end
        end

      end
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Feed
  class Abstract
    class Item

      # Accessors for a single item of a parsed RDF feed. Builds on
      # Feed::Abstract::Item::RSS and overrides the readers that are backed
      # by Dublin Core (dc_*) accessors or by the item's "about" value.
      class RDF < RSS

        # The author list (from the dc:creator element) as an array.
        def authors
          creators = @item.dc_creators
          return [] if creators.empty?
          creators.collect { |creator| creator.content }
        end

        # The author list as a string, joined with a comma.
        def author
          names = self.authors
          names.empty? ? '' : names.join(', ')
        end

        # The category list (parsed from the dc:subject element) as an array.
        def categories
          subjects = @item.dc_subjects
          return [] if subjects.empty?
          subjects.collect { |subject| subject.content }
        end

        # The category list as a string, joined with a comma.
        def category
          terms = self.categories
          terms.empty? ? '' : terms.join(', ')
        end

        # A Time object. An empty string is returned instead when the item
        # carries no dc:date.
        def updated
          if @item.dc_date.nil?
            ''
          else
            @item.dc_date
          end
        end
        alias :published :updated

        # A globally unique id, in this case probably a URL. Empty string
        # when the item has no "about" value.
        def guid
          if @item.about.nil?
            ''
          else
            @item.about
          end
        end


      end
    end
  end
end
|