jsl-feedtosis 0.0.3 → 0.0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +14 -15
- data/feedtosis.gemspec +2 -2
- data/lib/extensions/core/hash.rb +1 -11
- data/lib/feedtosis.rb +2 -2
- data/lib/feedtosis/client.rb +49 -18
- data/spec/feedtosis/client_spec.rb +39 -6
- data/spec/spec_helper.rb +3 -0
- metadata +2 -2
data/README.rdoc
CHANGED
@@ -9,17 +9,16 @@ well as by pointing out which entries are new in any given feed.
|
|
9
9
|
Feedtosis is designed to help you with book-keeping about feed fetching
|
10
10
|
details. This is usually something that is mundane and not fundamentally related
|
11
11
|
to the business logic of applications that deal with the consumption of
|
12
|
-
syndicated content on the web. Feedtosis keeps track of these
|
13
|
-
|
14
|
-
|
15
|
-
figure out which feed entries are new.
|
12
|
+
syndicated content on the web. Feedtosis keeps track of these details so you can
|
13
|
+
just keep grabbing new content without wasting bandwidth in making unnecessary requests
|
14
|
+
and programmer time in implementing algorithms to figure out which feed entries are new.
|
16
15
|
|
17
16
|
Feedtosis fits into other frameworks to do the heavy lifting, including the
|
18
17
|
Curb library which does HTTP requests through curl, and FeedNormalizer which
|
19
18
|
abstracts the differences between syndication formats. In the sense that it fits
|
20
19
|
into these existing, robust programs, Feedtosis is a modular middleware
|
21
20
|
piece that efficiently glues together disparate parts to create a helpful feed
|
22
|
-
reader with a minimal
|
21
|
+
reader with a minimal, spec-covered codebase.
|
23
22
|
|
24
23
|
== Installation
|
25
24
|
|
@@ -73,13 +72,11 @@ from memory. To configure a different backend, pass an options hash to the
|
|
73
72
|
Feedtosis client initialization:
|
74
73
|
|
75
74
|
url = "http://newsrss.bbc.co.uk/rss/newsonline_world_edition/south_asia/rss.xml"
|
76
|
-
|
77
|
-
res =
|
75
|
+
f = Feedtosis::Client.new(url, :backend => Moneta::Memcache.new(:server => 'localhost:1978'))
|
76
|
+
res = f.fetch
|
78
77
|
|
79
78
|
This example sets up a Memcache backend, which in this case points to Tokyo
|
80
|
-
Tyrant on port 1978.
|
81
|
-
which case you don't have to manually require Moneta::Memcache before
|
82
|
-
initializing the client.
|
79
|
+
Tyrant on port 1978.
|
83
80
|
|
84
81
|
Generally, Feedtosis supports all systems supported by Moneta, and any one
|
85
82
|
of the supported systems can be given as the <tt>:backend</tt> option. Other
|
@@ -109,13 +106,19 @@ computation and storage) which feed entries should be presented to the user as
|
|
109
106
|
{
|
110
107
|
:etag => "4c8f-46ac09fbbe940",
|
111
108
|
:last_modified => "Mon, 25 May 2009 18:17:33 GMT",
|
112
|
-
:digests => ["f2993783ded928637ce5f2dc2d837f10", "da64efa6dd9ce34e5699b9efe73a37a7"]
|
109
|
+
:digests => [["f2993783ded928637ce5f2dc2d837f10", "da64efa6dd9ce34e5699b9efe73a37a7"]]
|
113
110
|
}
|
114
111
|
|
115
112
|
The data stored by Feedtosis in the summary structure allows it to be
|
116
113
|
helpful to the user without storing lots of data that are unnecessary for
|
117
114
|
efficient functioning.
|
118
115
|
|
116
|
+
The summary structure keeps an Array of Arrays containing digests of feed entries. The reason
|
117
|
+
for this is that some feeds, such as the Google blog search feeds, contain slightly different
|
118
|
+
but often-recurring results in the result set. Feedtosis keeps complete sets of entry digests
|
119
|
+
for previous feed retrievals. The number of digest sets that will be kept is configurable by
|
120
|
+
setting the option :retained_digest_size on Feedtosis client initialization.
|
121
|
+
|
119
122
|
== HTML cleaning/sanitizing
|
120
123
|
|
121
124
|
Feedtosis doesn't do anything about feed sanitizing, as other libraries have
|
@@ -132,10 +135,6 @@ development of Feedtosis.
|
|
132
135
|
Please let me know if you have any problems with or questions about
|
133
136
|
Feedtosis.
|
134
137
|
|
135
|
-
= References
|
136
|
-
|
137
|
-
(1) http://en.wikipedia.org/wiki/List_of_vores
|
138
|
-
|
139
138
|
= Author
|
140
139
|
|
141
140
|
Justin S. Leitgeb, mailto:justin@phq.org
|
data/feedtosis.gemspec
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = %q{feedtosis}
|
3
|
-
s.version = "0.0.3"
|
3
|
+
s.version = "0.0.3.1"
|
4
4
|
|
5
5
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
6
6
|
s.authors = ["Justin Leitgeb"]
|
@@ -38,7 +38,7 @@ Gem::Specification.new do |s|
|
|
38
38
|
'--inline-source'
|
39
39
|
]
|
40
40
|
|
41
|
-
%w[ taf2-curb jsl-moneta jsl-http_headers
|
41
|
+
%w[ taf2-curb jsl-moneta jsl-http_headers feed-normalizer ].each do |dep|
|
42
42
|
s.add_dependency(dep)
|
43
43
|
end
|
44
44
|
|
data/lib/extensions/core/hash.rb
CHANGED
@@ -1,17 +1,7 @@
|
|
1
1
|
class Hash
|
2
|
-
# Returns a Hash containing only input keys.
|
3
|
-
# Method from merb-core.
|
4
|
-
def except(*rejected)
|
5
|
-
reject { |k,v| rejected.include?(k) }
|
6
|
-
end
|
7
2
|
|
8
3
|
def reverse_merge(other_hash)
|
9
4
|
other_hash.merge(self)
|
10
5
|
end
|
11
|
-
|
12
|
-
# Returns a new hash containing only the input keys.
|
13
|
-
# Method from merb-core.
|
14
|
-
def only(*allowed)
|
15
|
-
reject { |k,v| !allowed.include?(k) }
|
16
|
-
end
|
6
|
+
|
17
7
|
end
|
data/lib/feedtosis.rb
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
require 'rubygems'
|
2
|
+
|
2
3
|
require 'curb'
|
3
|
-
require 'moneta'
|
4
|
-
require 'moneta/memory'
|
5
4
|
require 'http_headers'
|
6
5
|
require 'feed-normalizer'
|
7
6
|
require 'md5'
|
7
|
+
require 'uri'
|
8
8
|
|
9
9
|
lib_dirs = [ 'extensions', 'feedtosis' ].map do |d|
|
10
10
|
File.join(File.dirname(__FILE__), d)
|
data/lib/feedtosis/client.rb
CHANGED
@@ -7,14 +7,37 @@ module Feedtosis
|
|
7
7
|
# either new or not new. Entries retrieved are normalized using the
|
8
8
|
# feed-normalizer gem.
|
9
9
|
class Client
|
10
|
-
attr_reader :options, :
|
10
|
+
attr_reader :url, :options, :backend
|
11
|
+
|
12
|
+
DEFAULTS = {
|
13
|
+
:backend => Hash.new,
|
14
|
+
|
15
|
+
# The namespace will be prefixed to the key used for storage of the summary value. Based on your
|
16
|
+
# application needs, it may be useful to provide a custom prefix with initialization options.
|
17
|
+
:namespace => 'feedtosis',
|
18
|
+
|
19
|
+
# Some feed aggregators that we may be pulling from have entries that are present in one fetch and
|
20
|
+
# then disappear (Google blog search does this). For these cases, we can't rely on only the digests of
|
21
|
+
# the last fetch to guarantee "newness" of an entry that we may have previously consumed. We keep a
|
22
|
+
# number of previous sets of digests in order to make sure that we mark the correct entries as "new".
|
23
|
+
:retained_digest_size => 10
|
24
|
+
} unless defined?(DEFAULTS)
|
11
25
|
|
12
|
-
# Initializes a new feedtosis library.
|
13
|
-
#
|
14
|
-
# backend
|
15
|
-
|
16
|
-
|
17
|
-
@
|
26
|
+
# Initializes a new feedtosis library. It must be initialized with a valid URL as the first argument.
|
27
|
+
# A following Hash, if given, may have the following keys:
|
28
|
+
# * backend: a key-value store to be used for summary structures of feeds fetched. Moneta backends work well, but any object acting like a Hash is valid.
|
29
|
+
# * retained_digest_size: an Integer specifying the number of previous sets of entry MD5 digests to keep, used for new entry detection
|
30
|
+
def initialize(*args)
|
31
|
+
@url = args.first
|
32
|
+
|
33
|
+
@options = args.extract_options!
|
34
|
+
@options = @options.reverse_merge(DEFAULTS)
|
35
|
+
|
36
|
+
@backend = @options[:backend]
|
37
|
+
|
38
|
+
unless @url.match(URI.regexp('http'))
|
39
|
+
raise ArgumentError, "Url #{@url} is not valid!"
|
40
|
+
end
|
18
41
|
|
19
42
|
unless @backend.respond_to?(:[]) && @backend.respond_to?(:[]=)
|
20
43
|
raise ArgumentError, "Backend needs to be a key-value store"
|
@@ -37,12 +60,8 @@ module Feedtosis
|
|
37
60
|
# Marks entries as either seen or not seen based on the unique signature of
|
38
61
|
# the entry, which is calculated by taking the MD5 of common attributes.
|
39
62
|
def mark_new_entries(response)
|
40
|
-
digests =
|
41
|
-
|
42
|
-
else
|
43
|
-
summary_for_feed[:digests]
|
44
|
-
end
|
45
|
-
|
63
|
+
digests = summary_digests
|
64
|
+
|
46
65
|
# For each entry in the responses object, mark @_seen as false if the
|
47
66
|
# digest of this entry doesn't exist in the cached object.
|
48
67
|
response.entries.each do |e|
|
@@ -53,6 +72,14 @@ module Feedtosis
|
|
53
72
|
response
|
54
73
|
end
|
55
74
|
|
75
|
+
# Returns a flat Array of the unique entry digests recorded for this feed.
#
# The summary keeps several sets of digests (one Array per previous
# retrieval), so the sets are flattened into a single list and de-duplicated,
# keeping the order in which each digest first appears.
#
# A summary whose :digests value is missing or nil yields an empty Array
# instead of raising, and a :digests value holding bare digest Strings
# (rather than Arrays of them) is tolerated as well.
def summary_digests
  Array(summary_for_feed[:digests]).flatten.uniq
end
|
82
|
+
|
56
83
|
# Processes the results by identifying which entries are new if the response
|
57
84
|
# is a 200. Otherwise, returns the Curl::Easy object for the user to inspect.
|
58
85
|
def process_curl_response(curl)
|
@@ -82,7 +109,7 @@ module Feedtosis
|
|
82
109
|
|
83
110
|
# Returns the summary hash for this feed from the backend store.
|
84
111
|
def summary_for_feed
|
85
|
-
@backend[key_for_cached]
|
112
|
+
@backend[key_for_cached] || { :digests => [ ] }
|
86
113
|
end
|
87
114
|
|
88
115
|
# Sets the headers from the backend, if available
|
@@ -97,8 +124,9 @@ module Feedtosis
|
|
97
124
|
curl
|
98
125
|
end
|
99
126
|
|
127
|
+
# Returns the key for the storage of the summary structure in the key-value system.
|
100
128
|
def key_for_cached
|
101
|
-
MD5.hexdigest(@url)
|
129
|
+
[ @options[:namespace], MD5.hexdigest(@url) ].join('_')
|
102
130
|
end
|
103
131
|
|
104
132
|
# Stores information about the retrieval, including ETag, Last-Modified,
|
@@ -116,11 +144,14 @@ module Feedtosis
|
|
116
144
|
|
117
145
|
# Store digest for each feed entry so we can detect new entries on the next
|
118
146
|
# retrieval
|
119
|
-
|
147
|
+
new_digest_set = feed.entries.map do |e|
|
120
148
|
digest_for(e)
|
121
149
|
end
|
122
150
|
|
123
|
-
|
151
|
+
new_digest_set = summary_for_feed[:digests].unshift(new_digest_set)
|
152
|
+
new_digest_set = new_digest_set[0..@options[:retained_digest_size]]
|
153
|
+
|
154
|
+
summary.merge!( :digests => new_digest_set )
|
124
155
|
set_summary(summary)
|
125
156
|
end
|
126
157
|
|
@@ -132,7 +163,7 @@ module Feedtosis
|
|
132
163
|
# This signature will be the MD5 of enough fields to have a reasonable
|
133
164
|
# probability of determining if the entry is unique or not.
|
134
165
|
def digest_for(entry)
|
135
|
-
MD5.hexdigest( [ entry.title, entry.content ].join )
|
166
|
+
MD5.hexdigest( [ entry.title, entry.content, entry.date_published ].join )
|
136
167
|
end
|
137
168
|
|
138
169
|
def parser_for_xml(xml)
|
@@ -3,8 +3,8 @@ require File.join(File.dirname(__FILE__), %w[.. spec_helper])
|
|
3
3
|
describe Feedtosis::Client do
|
4
4
|
before do
|
5
5
|
@url = "http://www.example.com/feed.rss"
|
6
|
-
@backend =
|
7
|
-
@fr = Feedtosis::Client.new(@url, @backend)
|
6
|
+
@backend = Hash.new
|
7
|
+
@fr = Feedtosis::Client.new(@url, :backend => @backend)
|
8
8
|
end
|
9
9
|
|
10
10
|
describe "initialization" do
|
@@ -12,12 +12,26 @@ describe Feedtosis::Client do
|
|
12
12
|
@fr.url.should == @url
|
13
13
|
end
|
14
14
|
|
15
|
+
describe "validation of url in first argument" do
|
16
|
+
it "should not raise an error on initialization with a valid HTTP url" do
|
17
|
+
lambda {
|
18
|
+
Feedtosis::Client.new('http://www.example.com')
|
19
|
+
}.should_not raise_error
|
20
|
+
end
|
21
|
+
|
22
|
+
it "should raise an error on initialization with an invalid url" do
|
23
|
+
lambda {
|
24
|
+
Feedtosis::Client.new('ftp://www.example.com')
|
25
|
+
}.should raise_error(ArgumentError)
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
15
29
|
it "should set the If-None-Match and If-Modified-Since headers to the value of the summary hash" do
|
16
30
|
curl_headers = mock('headers')
|
17
31
|
curl_headers.expects(:[]=).with('If-None-Match', '42ab')
|
18
32
|
curl_headers.expects(:[]=).with('If-Modified-Since', 'Mon, 25 May 2009 16:38:49 GMT')
|
19
33
|
|
20
|
-
summary = { :etag => '42ab', :last_modified => 'Mon, 25 May 2009 16:38:49 GMT' }
|
34
|
+
summary = { :etag => '42ab', :last_modified => 'Mon, 25 May 2009 16:38:49 GMT', :digests => [ ] }
|
21
35
|
|
22
36
|
@fr.__send__(:set_summary, summary)
|
23
37
|
|
@@ -32,10 +46,17 @@ describe Feedtosis::Client do
|
|
32
46
|
@fr.fetch
|
33
47
|
end
|
34
48
|
|
49
|
+
describe "#summary_for_feed" do
|
50
|
+
it "should return a hash with :digests set to an empty Array when summary is nil" do
|
51
|
+
@fr.__send__(:set_summary, nil)
|
52
|
+
@fr.__send__(:summary_for_feed).should == {:digests => [ ]}
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
35
56
|
describe "when given a pre-initialized backend" do
|
36
57
|
it "should set the @backend to the pre-initialized structure" do
|
37
58
|
h = Moneta::Memory.new
|
38
|
-
fc = Feedtosis::Client.new(@url, h)
|
59
|
+
fc = Feedtosis::Client.new(@url, :backend => h)
|
39
60
|
fc.__send__(:instance_variable_get, :@backend).should == h
|
40
61
|
end
|
41
62
|
|
@@ -43,12 +64,24 @@ describe Feedtosis::Client do
|
|
43
64
|
o = Object.new
|
44
65
|
|
45
66
|
lambda {
|
46
|
-
Feedtosis::Client.new(@url, o)
|
67
|
+
Feedtosis::Client.new(@url, :backend => o)
|
47
68
|
}.should raise_error(ArgumentError)
|
48
69
|
end
|
49
70
|
end
|
50
71
|
end
|
51
72
|
|
73
|
+
describe "#key_for_cached" do
|
74
|
+
it "should default to the MD5 of the url after the namespace" do
|
75
|
+
c = Feedtosis::Client.new(@url)
|
76
|
+
c.__send__(:key_for_cached).should == [ 'feedtosis', MD5.hexdigest(@url) ].join('_')
|
77
|
+
end
|
78
|
+
|
79
|
+
it "should respect a custom namespace if given" do
|
80
|
+
c = Feedtosis::Client.new(@url, :namespace => 'justin')
|
81
|
+
c.__send__(:key_for_cached).should == [ 'justin', MD5.hexdigest(@url) ].join('_')
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
52
85
|
describe "#fetch" do
|
53
86
|
it "should call Curl::Easy.perform with the url, and #process_curl_response" do
|
54
87
|
curl_easy = mock('curl', :perform => true)
|
@@ -86,7 +119,7 @@ describe Feedtosis::Client do
|
|
86
119
|
curl = mock('curl', :perform => true, :response_code => 200,
|
87
120
|
:body_str => xml_fixture('wooster'), :header_str => http_header('wooster'))
|
88
121
|
@fr.expects(:build_curl_easy).returns(curl)
|
89
|
-
@fr.fetch.new_entries.should == []
|
122
|
+
@fr.fetch.new_entries.should == []
|
90
123
|
end
|
91
124
|
end
|
92
125
|
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jsl-feedtosis
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Justin Leitgeb
|
@@ -43,7 +43,7 @@ dependencies:
|
|
43
43
|
version: "0"
|
44
44
|
version:
|
45
45
|
- !ruby/object:Gem::Dependency
|
46
|
-
name:
|
46
|
+
name: feed-normalizer
|
47
47
|
type: :runtime
|
48
48
|
version_requirement:
|
49
49
|
version_requirements: !ruby/object:Gem::Requirement
|