jsl-feedzirra 0.0.12.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +194 -0
- data/Rakefile +56 -0
- data/lib/core_ext/array.rb +8 -0
- data/lib/core_ext/date.rb +21 -0
- data/lib/core_ext/string.rb +9 -0
- data/lib/feedzirra/backend/filesystem.rb +32 -0
- data/lib/feedzirra/backend/memcache.rb +37 -0
- data/lib/feedzirra/backend/memory.rb +22 -0
- data/lib/feedzirra/feed.rb +68 -0
- data/lib/feedzirra/feed_parser.rb +64 -0
- data/lib/feedzirra/http_multi.rb +185 -0
- data/lib/feedzirra/parser/atom.rb +26 -0
- data/lib/feedzirra/parser/atom_entry.rb +34 -0
- data/lib/feedzirra/parser/atom_feed_burner.rb +27 -0
- data/lib/feedzirra/parser/atom_feed_burner_entry.rb +35 -0
- data/lib/feedzirra/parser/feed_entry_utilities.rb +45 -0
- data/lib/feedzirra/parser/feed_utilities.rb +71 -0
- data/lib/feedzirra/parser/itunes_rss.rb +50 -0
- data/lib/feedzirra/parser/itunes_rss_item.rb +31 -0
- data/lib/feedzirra/parser/itunes_rss_owner.rb +12 -0
- data/lib/feedzirra/parser/rss.rb +28 -0
- data/lib/feedzirra/parser/rss_entry.rb +40 -0
- data/lib/feedzirra/reader.rb +28 -0
- data/lib/feedzirra.rb +44 -0
- data/spec/feedzirra/feed_entry_utilities_spec.rb +52 -0
- data/spec/feedzirra/feed_spec.rb +5 -0
- data/spec/feedzirra/feed_utilities_spec.rb +149 -0
- data/spec/feedzirra/parser/atom_entry_spec.rb +45 -0
- data/spec/feedzirra/parser/atom_feed_burner_entry_spec.rb +42 -0
- data/spec/feedzirra/parser/atom_feed_burner_spec.rb +39 -0
- data/spec/feedzirra/parser/atom_spec.rb +35 -0
- data/spec/feedzirra/parser/itunes_rss_item_spec.rb +48 -0
- data/spec/feedzirra/parser/itunes_rss_owner_spec.rb +18 -0
- data/spec/feedzirra/parser/itunes_rss_spec.rb +50 -0
- data/spec/feedzirra/parser/rss_entry_spec.rb +41 -0
- data/spec/feedzirra/parser/rss_spec.rb +41 -0
- data/spec/spec.opts +2 -0
- data/spec/spec_helper.rb +67 -0
- metadata +159 -0
@@ -0,0 +1,185 @@
|
|
1
|
+
module Feedzirra

  # Handles HTTP requests for Feedzirra, including registration of on success and on failure
  # callbacks. Requests run through Curl::Multi so many feeds can be fetched
  # concurrently; results are collected in #responses, keyed by URL.
  class HttpMulti

    attr_reader :options, :retrievables, :multi, :responses

    # Default options; the backend stores/retrieves previously fetched feeds.
    DEFAULTS = {
      :backend => {
        :class => Feedzirra::Backend::Memory
      }
    }

    # Accepts a list of retrievables (Feed objects or URL strings), optionally
    # followed by an options hash (:backend, :on_success, :on_failure,
    # :user_agent, :if_modified_since, :compress, :http_authentication).
    def initialize(*args)
      @options = DEFAULTS.merge(args.extract_options!)
      @retrievables = args.flatten
      @multi = Curl::Multi.new
      @responses = { }
      @backend = @options[:backend][:class].new
    end

    # Prepares the curl object and calls #perform
    def run
      prepare
      @multi.perform
    end

    # Breaks the urls into chunks of 30 because of weird errors encountered on
    # entering more items. As one finishes it pops another off the queue.
    def prepare
      retrievable_queue = @retrievables.dup

      retrievable_queue.slice!(0, 30).each do |retrievable|
        add_to_multi(retrievable, retrievable_queue)
      end
    end

    # Generic method for building Curl::Multi object. Retrievable may be a Feed or a
    # String URL. For a bare URL, the last known feed (if any) is pulled from
    # the backend so conditional-GET headers can be sent.
    def add_to_multi(retrievable, retrievable_queue)
      if retrievable.respond_to?(:feed_url)
        url = retrievable.feed_url
      else
        url = retrievable
        retrievable = @backend.get(url) # Try to fetch the last retrieval from backend
      end

      easy = build_curl_easy(url, retrievable, retrievable_queue)
      @multi.add(easy)
    end

    # builds a Curl::Easy object that can be added to Curl::Multi.
    # curb reports a 304 (Not Modified) through on_failure, so it is routed
    # back to the success handler where it means "no new entries".
    def build_curl_easy(url, retrievable, retrievable_queue)

      easy = Curl::Easy.new(url) do |curl|
        curl = set_curl_configuration(curl, retrievable)

        curl.on_success do |c|
          on_success_handler(c, url, retrievable, retrievable_queue)
        end

        curl.on_failure do |c|
          if c.response_code == 304
            on_success_handler(c, url, retrievable, retrievable_queue)
          else
            on_failure_handler(c, url, retrievable, retrievable_queue)
          end
        end
      end

      easy
    end

    # Merges freshly parsed entries into the previously known feed, or marks
    # every entry as new when there is no prior Feed object to compare against.
    def set_updated_feed_entries!(retrievable, updated_feed)
      if retrievable.respond_to?(:update_from_feed)
        retrievable.update_from_feed(updated_feed)
      else
        # all elements are "new", since we weren't dealing with a Feed element.
        updated_feed.new_entries = updated_feed.entries
      end
    end

    # Handles successful Curl responses: parses the body, refreshes the caching
    # headers (etag / last-modified), stores the result in the backend and
    # fires the :on_success callback.
    def on_success_handler(curl, url, retrievable, retrievable_queue)
      add_to_multi(retrievable_queue.shift, retrievable_queue) unless retrievable_queue.empty?

      begin
        if curl.response_code == 304
          # Not modified: keep the cached feed, just flag that nothing is new.
          updated_feed = retrievable
          updated_feed.new_entries = [ ]
        else
          updated_feed = parser_for_xml(curl.body_str).run
          updated_feed.feed_url = curl.last_effective_url
          updated_feed.etag = etag_from_header(curl.header_str)
          updated_feed.last_modified = last_modified_from_header(curl.header_str)
          set_updated_feed_entries!(retrievable, updated_feed)
        end

        @backend.set(url, updated_feed)

        responses[url] = updated_feed
        @options[:on_success].call(retrievable) if @options.has_key?(:on_success)

      rescue StandardError => e
        # FIX: was `rescue Exception`, which also swallowed SignalException and
        # SystemExit. Parse/update failures are reported via :on_failure.
        puts "Caught exception, but we're throwing it away: #{e}"
        @options[:on_failure].call(retrievable, curl.response_code, curl.header_str, curl.body_str) if @options.has_key?(:on_failure)
      end
    end

    # Handles failed Curl responses: records the response code and fires the
    # :on_failure callback.
    def on_failure_handler(curl, url, retrievable, retrievable_queue)
      # FIX: previously called add_to_multi with five arguments
      # (multi, queue.shift, queue, responses, options), which raised
      # ArgumentError and stalled the remaining queue on any failed fetch.
      add_to_multi(retrievable_queue.shift, retrievable_queue) unless retrievable_queue.empty?
      responses[url] = curl.response_code
      @options[:on_failure].call(retrievable, curl.response_code, curl.header_str, curl.body_str) if @options.has_key?(:on_failure)
    end

    # Determines the etag from the request headers.
    #
    # === Parameters
    # [header<String>] Raw request header returned from the request
    # === Returns
    # A string of the etag or nil if it cannot be found in the headers.
    def etag_from_header(header)
      header =~ /.*ETag:\s(.*)\r/
      $1
    end

    # Determines the last modified date from the request headers.
    #
    # === Parameters
    # [header<String>] Raw request header returned from the request
    # === Returns
    # A Time object of the last modified date or nil if it cannot be found in the headers.
    def last_modified_from_header(header)
      header =~ /.*Last-Modified:\s(.*)\r/
      Time.parse($1) if $1
    end

    # Wraps the XML in a FeedParser, which picks the concrete parser class.
    def parser_for_xml(xml)
      Feedzirra::FeedParser.new(xml)
    end


    # Accepts a Curl::Easy object with an optional set of options and returns
    # a Curl::Easy object with options merged into the defaults.
    def set_curl_configuration(curl, retrievable = nil)
      curl.headers["User-Agent"] = @options[:user_agent] || Feedzirra::USER_AGENT
      curl.headers["If-Modified-Since"] = @options[:if_modified_since].httpdate if @options.has_key?(:if_modified_since)
      curl.headers["If-None-Match"] = retrievable.etag if (retrievable.respond_to?(:etag) && retrievable.etag)
      curl.headers["Accept-Encoding"] = 'gzip, deflate' if @options.has_key?(:compress)

      curl.follow_location = true
      curl.userpwd = @options[:http_authentication].join(':') if @options.has_key?(:http_authentication)

      curl
    end

    # Decodes the XML document if it was compressed.
    #
    # NOTE(review): nothing in this class calls decode_content yet, even though
    # set_curl_configuration can request gzip/deflate — confirm the wiring.
    #
    # === Parameters
    # [curl<Curl::Easy>] The Curl::Easy response object from the request.
    # === Returns
    # A decoded string of XML.
    def decode_content(curl)
      if curl.header_str.match(/Content-Encoding: gzip/)
        begin
          gz = Zlib::GzipReader.new(StringIO.new(curl.body_str))
          xml = gz.read
          gz.close
        rescue Zlib::GzipFile::Error
          # Maybe this is not gzipped?
          xml = curl.body_str # FIX: was `c.body_str`; `c` is undefined here (NameError)
        end
      elsif curl.header_str.match(/Content-Encoding: deflate/)
        xml = Zlib::Inflate.inflate(curl.body_str)
      else
        xml = curl.body_str
      end

      xml
    end

  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module Feedzirra

  module Parser
    # == Summary
    # Parser for dealing with Atom feeds.
    #
    # == Attributes
    # * title
    # * feed_url
    # * url
    # * entries
    class Atom
      include SAXMachine
      include FeedUtilities
      element :title
      # The alternate (text/html) link is exposed as the feed's site URL.
      element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
      # The link carrying the Atom content type is exposed as the feed URL.
      element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
      elements :entry, :as => :entries, :class => AtomEntry

      # Heuristic sniff: matches the literal "Atom" or the pre-1.0 purl.org
      # Atom namespace anywhere in the raw XML string.
      def self.able_to_parse?(xml) #:nodoc:
        xml =~ /(Atom)|(#{Regexp.escape("http://purl.org/atom")})/
      end
    end
  end

end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module Feedzirra

  module Parser
    # == Summary
    # Parser for dealing with Atom feed entries.
    #
    # == Attributes
    # * title
    # * url
    # * author
    # * content
    # * summary
    # * published
    # * categories
    class AtomEntry
      include SAXMachine
      include FeedEntryUtilities
      element :title
      # Only the alternate text/html link is treated as the entry URL.
      element :link, :as => :url, :value => :href, :with => {:type => "text/html", :rel => "alternate"}
      element :name, :as => :author
      element :content
      element :summary
      element :published
      element :id
      # Older Atom drafts used created/issued; FeedEntryUtilities#published=
      # keeps the oldest value seen among all published mappings.
      element :created, :as => :published
      element :issued, :as => :published
      element :updated
      element :modified, :as => :updated
      elements :category, :as => :categories, :value => :term
    end

  end

end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Feedzirra

  module Parser
    # == Summary
    # Parser for dealing with Feedburner Atom feeds.
    #
    # == Attributes
    # * title
    # * feed_url
    # * url
    # * entries
    class AtomFeedBurner
      include SAXMachine
      include FeedUtilities
      element :title
      # The alternate (text/html) link is exposed as the feed's site URL.
      element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
      # The link carrying the Atom content type is exposed as the feed URL.
      element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
      elements :entry, :as => :entries, :class => AtomFeedBurnerEntry

      # Heuristic sniff: Atom feeds served through FeedBurner mention both
      # "Atom" and "feedburner" in the raw XML.
      def self.able_to_parse?(xml) #:nodoc:
        (xml =~ /Atom/ && xml =~ /feedburner/) || false
      end
    end
  end

end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module Feedzirra

  module Parser
    # == Summary
    # Parser for dealing with Feedburner Atom feed entries.
    #
    # == Attributes
    # * title
    # * url
    # * author
    # * content
    # * summary
    # * published
    # * categories
    class AtomFeedBurnerEntry
      include SAXMachine
      include FeedEntryUtilities
      element :title
      element :name, :as => :author
      element :link, :as => :url, :value => :href, :with => {:type => "text/html", :rel => "alternate"}
      # feedburner:origLink presumably carries the original (non-proxied)
      # article URL; when present it also writes to url — confirm which value
      # is expected to win for double-tagged entries.
      element :"feedburner:origLink", :as => :url
      element :summary
      element :content
      element :published
      element :id
      # Older Atom drafts used issued/created; FeedEntryUtilities#published=
      # keeps the oldest value seen among all published mappings.
      element :issued, :as => :published
      element :created, :as => :published
      element :updated
      element :modified, :as => :updated
      elements :category, :as => :categories, :value => :term
    end

  end

end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module Feedzirra
  # Mixin shared by the entry parser classes (AtomEntry, RSSEntry, ...).
  # Normalizes date handling and provides text sanitization helpers.
  module FeedEntryUtilities
    # Published time of the entry, falling back to the updated time when the
    # feed carried no explicit publish date.
    def published
      @published || @updated
    end

    # Parses +string+ into a GMT time via DateTime#feed_utils_to_gm_time
    # (core extension from core_ext/date.rb). Returns nil — and logs to
    # stdout — when the value cannot be parsed.
    def parse_datetime(string)
      begin
        DateTime.parse(string).feed_utils_to_gm_time
      rescue
        puts "DATE CAN'T BE PARSED: #{string}"
        nil
      end
    end

    ##
    # Returns the id of the entry or its url if no id is present, as some formats don't support it
    def id
      @id || @url
    end

    ##
    # Writer for published. By default, we keep the "oldest" publish time found.
    # NOTE(review): if parse_datetime returns nil while @published is already
    # set, the `<` comparison raises — confirm callers only pass parsable dates.
    def published=(val)
      parsed = parse_datetime(val)
      @published = parsed if !@published || parsed < @published
    end

    ##
    # Writer for updated. By default, we keep the most recent update time found.
    def updated=(val)
      parsed = parse_datetime(val)
      @updated = parsed if !@updated || parsed > @updated
    end

    # Sanitizes (in place) each text field that is present.
    def sanitize!
      self.title.sanitize! if self.title
      self.author.sanitize! if self.author
      self.summary.sanitize! if self.summary
      self.content.sanitize! if self.content
    end

    # Entries expose their publish time under the same name feeds use.
    alias_method :last_modified, :published
  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
module Feedzirra
  # Mixin that adds update-tracking behaviour to the feed parser classes:
  # merging a freshly fetched feed into a previously known one and recording
  # which entries and attributes changed.
  module FeedUtilities
    # Feed-level attributes that update_from_feed copies over when changed.
    UPDATABLE_ATTRIBUTES = %w(title feed_url url last_modified)

    attr_writer :new_entries, :updated, :last_modified
    attr_accessor :etag

    # Last-modified time of the feed; when not set from the HTTP headers it is
    # derived lazily from the newest published entry (nil if none have dates).
    def last_modified
      @last_modified ||= begin
        entry = entries.reject {|e| e.published.nil? }.sort_by { |entry| entry.published if entry.published }.last
        entry ? entry.published : nil
      end
    end

    # True once update_from_feed detected a changed feed-level attribute.
    def updated?
      @updated
    end

    # Entries discovered by update_from_feed since this object was built.
    def new_entries
      @new_entries ||= []
    end

    def has_new_entries?
      new_entries.size > 0
    end

    # Merges +feed+ (a freshly parsed feed) into self: prepends its new
    # entries and copies over any changed UPDATABLE_ATTRIBUTES.
    def update_from_feed(feed)
      self.new_entries += find_new_entries_for(feed)
      self.entries.unshift(*self.new_entries)

      updated! if UPDATABLE_ATTRIBUTES.any? { |name| update_attribute(feed, name) }
    end

    # Copies feed.name onto self when it differs; returns a truthy value only
    # when a change was made (used by update_from_feed to set the dirty flag).
    def update_attribute(feed, name)
      old_value, new_value = send(name), feed.send(name)

      if old_value != new_value
        send("#{name}=", new_value)
      end
    end

    def sanitize_entries!
      entries.each {|entry| entry.sanitize!}
    end

    private

    def updated!
      @updated = true
    end

    def find_new_entries_for(feed)
      # this implementation is a hack, which is why it's so ugly.
      # it's to get around the fact that not all feeds have a published date.
      # however, they're always ordered with the newest one first.
      # So we go through the entries just parsed and insert each one as a new entry
      # until we get to one that has the same url as the the newest for the feed
      latest_entry = self.entries.first

      # FIX: previously dereferenced latest_entry.url unconditionally, raising
      # NoMethodError when this feed had no entries yet. With no known entries
      # every fetched entry is new.
      return feed.entries.dup unless latest_entry

      found_new_entries = []
      feed.entries.each do |entry|
        break if entry.url == latest_entry.url
        found_new_entries << entry
      end
      found_new_entries
    end

    def existing_entry?(test_entry)
      entries.any? { |entry| entry.url == test_entry.url }
    end
  end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
module Feedzirra

  module Parser
    # iTunes is RSS 2.0 + some apple extensions
    # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
    class ITunesRSS
      include SAXMachine
      include FeedUtilities

      # feed_url is not present in the XML; it is assigned by the fetcher
      # (see HttpMulti#on_success_handler).
      attr_accessor :feed_url

      # RSS 2.0 elements that need including
      element :copyright
      element :description
      element :language
      element :managingEditor
      element :title
      element :link, :as => :url

      # If author is not present use managingEditor on the channel
      element :"itunes:author", :as => :itunes_author
      element :"itunes:block", :as => :itunes_block
      element :"itunes:image", :value => :href, :as => :itunes_image
      element :"itunes:explicit", :as => :itunes_explicit
      element :"itunes:keywords", :as => :itunes_keywords
      # New URL for the podcast feed
      element :"itunes:new-feed-url", :as => :itunes_new_feed_url
      element :"itunes:subtitle", :as => :itunes_subtitle
      # If summary is not present, use the description tag
      element :"itunes:summary", :as => :itunes_summary

      # iTunes RSS feeds can have multiple main categories...
      # ...and multiple sub-categories per category
      # TODO subcategories not supported correctly - they are at the same level
      # as the main categories
      elements :"itunes:category", :as => :itunes_categories, :value => :text

      elements :"itunes:owner", :as => :itunes_owners, :class => ITunesRSSOwner

      elements :item, :as => :entries, :class => ITunesRSSItem

      # Detects iTunes podcast feeds by the itunes namespace declaration in
      # the raw XML string.
      def self.able_to_parse?(xml)
        xml =~ /xmlns:itunes=\"http:\/\/www.itunes.com\/dtds\/podcast-1.0.dtd\"/
      end

    end

  end

end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Feedzirra

  module Parser
    # iTunes extensions to the standard RSS2.0 item
    # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
    class ITunesRSSItem
      include SAXMachine
      # NOTE(review): this item class mixes in FeedUtilities while the other
      # entry classes (AtomEntry, RSSEntry, ...) use FeedEntryUtilities —
      # confirm this is intentional.
      include FeedUtilities
      element :author
      element :guid
      element :title
      element :link, :as => :url
      element :description, :as => :summary
      element :pubDate, :as => :published

      # If author is not present use author tag on the item
      element :"itunes:author", :as => :itunes_author
      element :"itunes:block", :as => :itunes_block
      element :"itunes:duration", :as => :itunes_duration
      element :"itunes:explicit", :as => :itunes_explicit
      element :"itunes:keywords", :as => :itunes_keywords
      element :"itunes:subtitle", :as => :itunes_subtitle
      # If summary is not present, use the description tag
      element :"itunes:summary", :as => :itunes_summary
      # The enclosure element is read three times, once per attribute.
      element :enclosure, :value => :length, :as => :enclosure_length
      element :enclosure, :value => :type, :as => :enclosure_type
      element :enclosure, :value => :url, :as => :enclosure_url
    end
  end

end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Feedzirra

  module Parser
    # == Summary
    # Parser for dealing with RSS feeds.
    #
    # == Attributes
    # * title
    # * feed_url
    # * url
    # * entries
    class RSS
      include SAXMachine
      include FeedUtilities
      element :title
      element :link, :as => :url
      elements :item, :as => :entries, :class => RSSEntry

      # feed_url is not present in the XML; it is assigned by the fetcher
      # (see HttpMulti#on_success_handler).
      attr_accessor :feed_url

      # Heuristic sniff: matches "<rss" or the literal "rdf" anywhere in the
      # raw XML (the alternation separates the two whole alternatives, it does
      # not bind to the escaped "<").
      def self.able_to_parse?(xml) #:nodoc:
        xml =~ /\<rss|rdf/
      end
    end

  end

end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module Feedzirra

  module Parser
    # == Summary
    # Parser for dealing with RDF feed entries.
    #
    # == Attributes
    # * title
    # * url
    # * author
    # * content
    # * summary
    # * published
    # * categories
    class RSSEntry
      include SAXMachine
      include FeedEntryUtilities
      element :title
      element :link, :as => :url

      element :"dc:creator", :as => :author
      element :"content:encoded", :as => :content
      element :description, :as => :summary

      # Several date vocabularies all map to published;
      # FeedEntryUtilities#published= keeps the oldest value seen.
      element :pubDate, :as => :published
      element :"dc:date", :as => :published
      # NOTE(review): capitalized "dc:Date" variant — presumably to tolerate
      # non-conforming feeds seen in the wild; confirm it is still needed.
      element :"dc:Date", :as => :published
      element :"dcterms:created", :as => :published


      element :"dcterms:modified", :as => :updated
      element :issued, :as => :published
      elements :category, :as => :categories

      element :guid, :as => :id
    end

  end

end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Feedzirra

  # Class +Reader+ is an experimental interface for fetching and parsing +Feed+ objects,
  # using pluggable and (optionally) persistent backends to maintain state for fetched feeds.
  class Reader

    attr_reader :urls, :options

    # Accepts any number of URL strings (or arrays of them), optionally
    # followed by an options hash which is passed through to HttpMulti.
    def initialize(*args)
      @options = args.extract_options!
      @urls = args.flatten
    end

    # Map all urls to Feed objects, pulling existing feeds from Backend where available,
    # then slices and fetches feeds in chunks of 30.
    # Returns a single feed when there is exactly one response, otherwise an
    # array of feeds.
    def fetch
      multi = multi_from(@urls, @options)
      multi.run
      multi.responses.size == 1 ? multi.responses.values.first : multi.responses.values
    end

    private

    # Builds the HttpMulti driver for a fetch.
    # FIX: previously ignored its parameters and read @urls/@options directly,
    # which would silently break any caller passing different arguments.
    def multi_from(urls, options)
      Feedzirra::HttpMulti.new(urls, options)
    end
  end
end
|
data/lib/feedzirra.rb
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
# Make this lib directory requireable so the bare `core_ext/...` and
# `feedzirra/...` requires below resolve.
$LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))

gem 'activesupport'

# Third-party dependencies: curb (libcurl bindings), sax-machine (SAX parsing
# DSL), dryopteris (sanitization).
require 'zlib'
require 'curb'
require 'sax-machine'
require 'dryopteris'
require 'uri'
require 'active_support/basic_object'
require 'active_support/core_ext/object'
require 'active_support/core_ext/time'

# Core class extensions (e.g. the date helper used by FeedEntryUtilities).
require 'core_ext/date'
require 'core_ext/string'
require 'core_ext/array'

# Pluggable storage backends for previously fetched feeds.
require 'feedzirra/backend/filesystem'
require 'feedzirra/backend/memcache'
require 'feedzirra/backend/memory'

require 'feedzirra/http_multi'

# Shared mixins must load before the parser classes that include them.
require 'feedzirra/parser/feed_utilities'
require 'feedzirra/parser/feed_entry_utilities'
require 'feedzirra/feed'
require 'feedzirra/reader'
require 'feedzirra/feed_parser'

# Entry classes load before the feed classes that reference them as :class.
require 'feedzirra/parser/rss_entry'
require 'feedzirra/parser/itunes_rss_owner'
require 'feedzirra/parser/itunes_rss_item'
require 'feedzirra/parser/atom_entry'
require 'feedzirra/parser/atom_feed_burner_entry'

require 'feedzirra/parser/rss'
require 'feedzirra/parser/itunes_rss'
require 'feedzirra/parser/atom'
require 'feedzirra/parser/atom_feed_burner'

module Feedzirra
  # User-Agent header sent with every request (see HttpMulti#set_curl_configuration).
  USER_AGENT = "feedzirra http://github.com/pauldix/feedzirra/tree/master"
  VERSION = "0.0.12"
end
|