somezack-feedzirra 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +10 -6
- data/lib/feedzirra/atom_entry.rb +1 -0
- data/lib/feedzirra/atom_feed_burner_entry.rb +2 -0
- data/lib/feedzirra/feed.rb +22 -6
- data/lib/feedzirra/feed_entry_utilities.rb +23 -14
- data/lib/feedzirra/rss_entry.rb +1 -0
- data/lib/feedzirra.rb +2 -1
- data/spec/feedzirra/atom_entry_spec.rb +4 -0
- data/spec/feedzirra/atom_feed_burner_entry_spec.rb +9 -0
- data/spec/feedzirra/feed_entry_utilities_spec.rb +8 -1
- data/spec/feedzirra/feed_spec.rb +11 -1
- data/spec/feedzirra/rss_entry_spec.rb +4 -0
- data/spec/feedzirra/rss_spec.rb +5 -4
- metadata +13 -3
data/README.textile
CHANGED
@@ -42,6 +42,11 @@ NoMethodError: undefined method `on_success' for #<Curl::Easy:0x1182724>
|
|
42
42
|
</pre>
|
43
43
|
This means that you are requiring curl-multi or the Ruby Forge version of Curb somewhere. You can't use those and need to get the taf2 version up and running.
|
44
44
|
|
45
|
+
If you're on Debian or Ubuntu and getting errors while trying to install the taf2-curb gem, it could be because you don't have the latest version of libcurl installed. Do this to fix:
|
46
|
+
<pre>
|
47
|
+
sudo apt-get install libcurl4-gnutls-dev
|
48
|
+
</pre>
|
49
|
+
|
45
50
|
Another problem could be if you are running MacPorts and you have libcurl installed through there. You need to uninstall it for curb to work! The version in MacPorts is old and doesn't play nice with curb. If you're running Leopard, you can just uninstall and you should be golden. If you're on an older version of OS X, you'll then need to "download curl":http://curl.haxx.se/download.html and build from source. Then you'll have to install the taf2-curb gem again. You might have to perform the step above.
|
46
51
|
|
47
52
|
If you're still having issues, please let me know on the mailing list. Also, "Todd Fisher (taf2)":http://github.com/taf2 is working on fixing the gem install. Please send him a full error report.
|
@@ -69,11 +74,13 @@ entry.author # => "Paul Dix"
|
|
69
74
|
entry.summary # => "..."
|
70
75
|
entry.content # => "..."
|
71
76
|
entry.published # => Thu Jan 29 17:00:19 UTC 2009 # it's a Time object
|
77
|
+
entry.categories # => ["...", "..."]
|
72
78
|
|
73
79
|
# sanitizing an entry's content
|
74
|
-
entry.
|
75
|
-
entry.
|
76
|
-
entry.
|
80
|
+
entry.title.sanitize # => returns the title with harmful stuff escaped
|
81
|
+
entry.author.sanitize # => returns the author with harmful stuff escaped
|
82
|
+
entry.content.sanitize # => returns the content with harmful stuff escaped
|
83
|
+
entry.content.sanitize! # => returns content with harmful stuff escaped and replaces original (also exists for author and title)
|
77
84
|
entry.sanitize! # => sanitizes the entry's title, author, and content in place (as in, it changes the value to clean versions)
|
78
85
|
feed.sanitize_entries! # => sanitizes all entries in place
|
79
86
|
|
@@ -133,13 +140,10 @@ This thing needs to hammer on many different feeds in the wild. I'm sure there w
|
|
133
140
|
|
134
141
|
Here are some more specific TODOs.
|
135
142
|
* Make a feedzirra-rails gem to integrate feedzirra seamlessly with Rails and ActiveRecord.
|
136
|
-
* Add function to sanitize content.
|
137
|
-
* Add support to automatically handle gzip and deflate encoding.
|
138
143
|
* Add support for authenticated feeds.
|
139
144
|
* Create a super sweet DSL for defining new parsers.
|
140
145
|
* Test against Ruby 1.9.1 and fix any bugs.
|
141
146
|
* I'm not keeping track of modified on entries. Should I add this?
|
142
|
-
* Should I be parsing stuff like tags or categories for entries?
|
143
147
|
* Clean up the fetching code inside feed.rb so it doesn't suck so hard.
|
144
148
|
* Make the feed_spec actually mock stuff out so it doesn't hit the net.
|
145
149
|
* Readdress how feeds determine if they can parse a document. Maybe I should use namespaces instead?
|
data/lib/feedzirra/atom_entry.rb
CHANGED
@@ -4,9 +4,11 @@ module Feedzirra
|
|
4
4
|
include FeedEntryUtilities
|
5
5
|
element :title
|
6
6
|
element :name, :as => :author
|
7
|
+
element :link, :as => :url, :value => :href, :with => {:type => "text/html", :rel => "alternate"}
|
7
8
|
element :"feedburner:origLink", :as => :url
|
8
9
|
element :summary
|
9
10
|
element :content
|
10
11
|
element :published
|
12
|
+
elements :category, :as => :categories, :value => :term
|
11
13
|
end
|
12
14
|
end
|
data/lib/feedzirra/feed.rb
CHANGED
@@ -29,17 +29,18 @@ module Feedzirra
|
|
29
29
|
# when passed a single url it returns the body of the response
|
30
30
|
# when passed an array of urls it returns a hash with the urls as keys and body of responses as values
|
31
31
|
def self.fetch_raw(urls, options = {})
|
32
|
-
|
32
|
+
url_queue = [*urls]
|
33
33
|
multi = Curl::Multi.new
|
34
34
|
responses = {}
|
35
|
-
|
35
|
+
url_queue.each do |url|
|
36
36
|
easy = Curl::Easy.new(url) do |curl|
|
37
37
|
curl.headers["User-Agent"] = (options[:user_agent] || USER_AGENT)
|
38
38
|
curl.headers["If-Modified-Since"] = options[:if_modified_since].httpdate if options.has_key?(:if_modified_since)
|
39
39
|
curl.headers["If-None-Match"] = options[:if_none_match] if options.has_key?(:if_none_match)
|
40
|
+
curl.headers["Accept-encoding"] = 'gzip, deflate'
|
40
41
|
curl.follow_location = true
|
41
42
|
curl.on_success do |c|
|
42
|
-
responses[url] = c
|
43
|
+
responses[url] = decode_content(c)
|
43
44
|
end
|
44
45
|
curl.on_failure do |c|
|
45
46
|
responses[url] = c.response_code
|
@@ -49,7 +50,7 @@ module Feedzirra
|
|
49
50
|
end
|
50
51
|
|
51
52
|
multi.perform
|
52
|
-
return
|
53
|
+
return urls.is_a?(String) ? responses.values.first : responses
|
53
54
|
end
|
54
55
|
|
55
56
|
def self.fetch_and_parse(urls, options = {})
|
@@ -64,7 +65,21 @@ module Feedzirra
|
|
64
65
|
end
|
65
66
|
|
66
67
|
multi.perform
|
67
|
-
return
|
68
|
+
return urls.is_a?(String) ? responses.values.first : responses
|
69
|
+
end
|
70
|
+
|
71
|
+
def self.decode_content(c)
|
72
|
+
if c.header_str.match(/Content-Encoding: gzip/)
|
73
|
+
gz = Zlib::GzipReader.new(StringIO.new(c.body_str))
|
74
|
+
xml = gz.read
|
75
|
+
gz.close
|
76
|
+
elsif c.header_str.match(/Content-Encoding: deflate/)
|
77
|
+
xml = Zlib::Deflate.inflate(c.body_str)
|
78
|
+
else
|
79
|
+
xml = c.body_str
|
80
|
+
end
|
81
|
+
|
82
|
+
xml
|
68
83
|
end
|
69
84
|
|
70
85
|
def self.update(feeds, options = {})
|
@@ -84,10 +99,11 @@ module Feedzirra
|
|
84
99
|
curl.headers["User-Agent"] = (options[:user_agent] || USER_AGENT)
|
85
100
|
curl.headers["If-Modified-Since"] = options[:if_modified_since].httpdate if options.has_key?(:if_modified_since)
|
86
101
|
curl.headers["If-None-Match"] = options[:if_none_match] if options.has_key?(:if_none_match)
|
102
|
+
curl.headers["Accept-encoding"] = 'gzip, deflate'
|
87
103
|
curl.follow_location = true
|
88
104
|
curl.on_success do |c|
|
89
105
|
add_url_to_multi(multi, url_queue.shift, url_queue, responses, options) unless url_queue.empty?
|
90
|
-
xml = c
|
106
|
+
xml = decode_content(c)
|
91
107
|
klass = determine_feed_parser_for_xml(xml)
|
92
108
|
if klass
|
93
109
|
feed = klass.parse(xml)
|
@@ -1,5 +1,15 @@
|
|
1
1
|
module Feedzirra
|
2
2
|
module FeedEntryUtilities
|
3
|
+
module Sanitize
|
4
|
+
def sanitize!
|
5
|
+
self.replace(sanitize)
|
6
|
+
end
|
7
|
+
|
8
|
+
def sanitize
|
9
|
+
Dryopteris.sanitize(self)
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
3
13
|
attr_reader :published
|
4
14
|
|
5
15
|
def parse_datetime(string)
|
@@ -10,23 +20,22 @@ module Feedzirra
|
|
10
20
|
@published = parse_datetime(val)
|
11
21
|
end
|
12
22
|
|
13
|
-
def
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
dispatcher.new(self)
|
23
|
+
def content
|
24
|
+
@content.extend(Sanitize)
|
25
|
+
end
|
26
|
+
|
27
|
+
def title
|
28
|
+
@title.extend(Sanitize)
|
29
|
+
end
|
30
|
+
|
31
|
+
def author
|
32
|
+
@author.extend(Sanitize)
|
24
33
|
end
|
25
34
|
|
26
35
|
def sanitize!
|
27
|
-
self.title
|
28
|
-
self.author
|
29
|
-
self.content
|
36
|
+
self.title.sanitize!
|
37
|
+
self.author.sanitize!
|
38
|
+
self.content.sanitize!
|
30
39
|
end
|
31
40
|
|
32
41
|
alias_method :last_modified, :published
|
data/lib/feedzirra/rss_entry.rb
CHANGED
data/lib/feedzirra.rb
CHANGED
@@ -2,6 +2,7 @@ $LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirna
|
|
2
2
|
|
3
3
|
gem 'activesupport'
|
4
4
|
|
5
|
+
require 'zlib'
|
5
6
|
require 'curb'
|
6
7
|
require 'sax-machine'
|
7
8
|
require 'dryopteris'
|
@@ -26,5 +27,5 @@ require 'feedzirra/atom'
|
|
26
27
|
require 'feedzirra/atom_feed_burner'
|
27
28
|
|
28
29
|
module Feedzirra
|
29
|
-
VERSION = "0.0.
|
30
|
+
VERSION = "0.0.3"
|
30
31
|
end
|
@@ -11,6 +11,11 @@ describe Feedzirra::AtomFeedBurnerEntry do
|
|
11
11
|
@entry.title.should == "Making a Ruby C library even faster"
|
12
12
|
end
|
13
13
|
|
14
|
+
it "should be able to fetch a url via the 'alternate' rel if no origLink exists" do
|
15
|
+
entry = Feedzirra::AtomFeedBurner.parse(File.read("#{File.dirname(__FILE__)}/../sample_feeds/PaulDixExplainsNothingAlternate.xml")).entries.first
|
16
|
+
entry.url.should == 'http://feeds.feedburner.com/~r/PaulDixExplainsNothing/~3/519925023/making-a-ruby-c-library-even-faster.html'
|
17
|
+
end
|
18
|
+
|
14
19
|
it "should parse the url" do
|
15
20
|
@entry.url.should == "http://www.pauldix.net/2009/01/making-a-ruby-c-library-even-faster.html"
|
16
21
|
end
|
@@ -30,4 +35,8 @@ describe Feedzirra::AtomFeedBurnerEntry do
|
|
30
35
|
it "should parse the published date" do
|
31
36
|
@entry.published.to_s.should == "Thu Jan 22 15:50:22 UTC 2009"
|
32
37
|
end
|
38
|
+
|
39
|
+
it "should parse the categories" do
|
40
|
+
@entry.categories.should == ['Ruby', 'Another Category']
|
41
|
+
end
|
33
42
|
end
|
@@ -24,7 +24,14 @@ describe Feedzirra::FeedUtilities do
|
|
24
24
|
it "should provide a sanitized title" do
|
25
25
|
new_title = "<script>" + @entry.title
|
26
26
|
@entry.title = new_title
|
27
|
-
@entry.
|
27
|
+
@entry.title.sanitize.should == Dryopteris.sanitize(new_title)
|
28
|
+
end
|
29
|
+
|
30
|
+
it "should sanitize content in place" do
|
31
|
+
new_content = "<script>" + @entry.content
|
32
|
+
@entry.content = new_content.dup
|
33
|
+
@entry.content.sanitize!.should == Dryopteris.sanitize(new_content)
|
34
|
+
@entry.content.should == Dryopteris.sanitize(new_content)
|
28
35
|
end
|
29
36
|
|
30
37
|
it "should sanitize things in place" do
|
data/spec/feedzirra/feed_spec.rb
CHANGED
@@ -113,7 +113,7 @@ describe Feedzirra::Feed do
|
|
113
113
|
describe "fetching feeds" do
|
114
114
|
before(:each) do
|
115
115
|
@paul_feed_url = "http://feeds.feedburner.com/PaulDixExplainsNothing"
|
116
|
-
@trotter_feed_url = "http://
|
116
|
+
@trotter_feed_url = "http://feeds2.feedburner.com/trottercashion"
|
117
117
|
end
|
118
118
|
|
119
119
|
describe "handling many feeds" do
|
@@ -139,6 +139,11 @@ describe Feedzirra::Feed do
|
|
139
139
|
results[@paul_feed_url].should =~ /Paul Dix/
|
140
140
|
results[@trotter_feed_url].should =~ /Trotter Cashion/
|
141
141
|
end
|
142
|
+
|
143
|
+
it "should always return a hash when passed an array" do
|
144
|
+
results = Feedzirra::Feed.fetch_raw([@paul_feed_url])
|
145
|
+
results.class.should == Hash
|
146
|
+
end
|
142
147
|
end
|
143
148
|
|
144
149
|
describe "#fetch_and_parse" do
|
@@ -169,6 +174,11 @@ describe Feedzirra::Feed do
|
|
169
174
|
feeds[@trotter_feed_url].feed_url.should == @trotter_feed_url
|
170
175
|
end
|
171
176
|
|
177
|
+
it "should always return a hash when passed an array" do
|
178
|
+
feeds = Feedzirra::Feed.fetch_and_parse([@paul_feed_url])
|
179
|
+
feeds.class.should == Hash
|
180
|
+
end
|
181
|
+
|
172
182
|
it "should yeild the url and feed object to a :on_success lambda" do
|
173
183
|
successful_call_mock = mock("successful_call_mock")
|
174
184
|
successful_call_mock.should_receive(:call)
|
@@ -30,4 +30,8 @@ describe Feedzirra::RSSEntry do
|
|
30
30
|
it "should parse the published date" do
|
31
31
|
@entry.published.to_s.should == "Thu Dec 04 17:17:49 UTC 2008"
|
32
32
|
end
|
33
|
+
|
34
|
+
it "should parse the categories" do
|
35
|
+
@entry.categories.should == ['computadora', 'nokogiri', 'rails']
|
36
|
+
end
|
33
37
|
end
|
data/spec/feedzirra/rss_spec.rb
CHANGED
@@ -5,10 +5,11 @@ describe Feedzirra::RSS do
|
|
5
5
|
it "should return true for an RSS feed" do
|
6
6
|
Feedzirra::RSS.should be_able_to_parse(sample_rss_feed)
|
7
7
|
end
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
8
|
+
|
9
|
+
# this is no longer true. combined rdf and rss into one
|
10
|
+
# it "should return false for an rdf feed" do
|
11
|
+
# Feedzirra::RSS.should_not be_able_to_parse(sample_rdf_feed)
|
12
|
+
# end
|
12
13
|
|
13
14
|
it "should return fase for an atom feed" do
|
14
15
|
Feedzirra::RSS.should_not be_able_to_parse(sample_atom_feed)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: somezack-feedzirra
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Paul Dix
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-02-19 00:00:00 -08:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -30,7 +30,7 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 0.0.
|
33
|
+
version: 0.0.9
|
34
34
|
version:
|
35
35
|
- !ruby/object:Gem::Dependency
|
36
36
|
name: taf2-curb
|
@@ -42,6 +42,16 @@ dependencies:
|
|
42
42
|
- !ruby/object:Gem::Version
|
43
43
|
version: 0.2.3
|
44
44
|
version:
|
45
|
+
- !ruby/object:Gem::Dependency
|
46
|
+
name: builder
|
47
|
+
type: :runtime
|
48
|
+
version_requirement:
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 2.1.2
|
54
|
+
version:
|
45
55
|
- !ruby/object:Gem::Dependency
|
46
56
|
name: activesupport
|
47
57
|
type: :runtime
|