siuying-fullfeed 0.4.6 → 0.4.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +22 -0
- data/Rakefile +48 -24
- data/examples/applenews_hk.rb +22 -0
- data/examples/extractors/apple_news_extractor.rb +14 -0
- data/examples/extractors/yahoo_news_hong_kong_extractor.rb +14 -0
- data/examples/server/controller/feed_controller.rb +41 -0
- data/examples/server/server.rb +28 -0
- data/examples/server/views/index.erb +19 -0
- data/examples/sinatra/web.rb +29 -0
- data/examples/ynews_hk.rb +12 -0
- data/examples/ynews_hk_db.rb +32 -0
- data/fullfeed.gemspec +58 -0
- data/lib/fullfeed/agent/agent_factory.rb +53 -0
- data/lib/fullfeed/agent/appengine_agent.rb +21 -0
- data/lib/fullfeed/agent/base.rb +14 -0
- data/lib/fullfeed/agent/mechanize_agent.rb +21 -0
- data/lib/fullfeed/agent/open_uri_agent.rb +15 -0
- data/lib/fullfeed/extractor/base_extractor.rb +37 -0
- data/lib/fullfeed/extractor/extractor_factory.rb +32 -0
- data/lib/fullfeed/extractor/text_extractor.rb +18 -0
- data/lib/fullfeed/extractor/xpath_extractor.rb +23 -0
- data/lib/fullfeed/feed.rb +105 -0
- data/lib/fullfeed/filters/base_filter.rb +61 -0
- data/lib/fullfeed/filters/convert_encoding_filter.rb +23 -0
- data/lib/fullfeed/filters/excess_space_filter.rb +16 -0
- data/lib/fullfeed/filters/uppercase_filter.rb +21 -0
- data/lib/fullfeed/store/base.rb +21 -0
- data/lib/fullfeed/store/db_store.rb +51 -0
- data/lib/fullfeed/store/memory_store.rb +26 -0
- data/lib/fullfeed/store/store_factory.rb +57 -0
- data/lib/fullfeed.rb +50 -0
- data/test/load_files.rb +7 -0
- data/test/test_agent.rb +25 -0
- data/test/test_store.rb +30 -0
- metadata +39 -25
data/LICENSE
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
Copyright (c) 2009 siu [dot] ying [at] gmail [dot] com
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person
|
|
4
|
+
obtaining a copy of this software and associated documentation
|
|
5
|
+
files (the "Software"), to deal in the Software without
|
|
6
|
+
restriction, including without limitation the rights to use,
|
|
7
|
+
copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
8
|
+
copies of the Software, and to permit persons to whom the
|
|
9
|
+
Software is furnished to do so, subject to the following
|
|
10
|
+
conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be
|
|
13
|
+
included in all copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
16
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
17
|
+
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
18
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
19
|
+
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
20
|
+
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
21
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
22
|
+
OTHER DEALINGS IN THE SOFTWARE.
|
data/Rakefile
CHANGED
|
@@ -1,35 +1,59 @@
|
|
|
1
|
-
require
|
|
2
|
-
require
|
|
3
|
-
require
|
|
4
|
-
require "rake/testtask"
|
|
1
|
+
require 'rake/clean'
|
|
2
|
+
require 'rake/testtask'
|
|
3
|
+
require 'fileutils'
|
|
5
4
|
|
|
6
|
-
# Gem
|
|
7
5
|
require "rake/gempackagetask"
|
|
8
6
|
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
# s.executables = ["rackup"]
|
|
24
|
-
s.add_dependency('ruby-cache', '>= 0.3.0')
|
|
25
|
-
s.add_dependency('hpricot', '>= 0.6.1')
|
|
7
|
+
task :default => :package
|
|
8
|
+
|
|
9
|
+
# PACKAGING ============================================================
|
|
10
|
+
|
|
11
|
+
# Load the gemspec using the same limitations as github
|
|
12
|
+
def spec
|
|
13
|
+
@spec ||=
|
|
14
|
+
begin
|
|
15
|
+
require 'rubygems/specification'
|
|
16
|
+
data = File.read('fullfeed.gemspec')
|
|
17
|
+
spec = nil
|
|
18
|
+
Thread.new { spec = eval("$SAFE = 3\n#{data}") }.join
|
|
19
|
+
spec
|
|
20
|
+
end
|
|
26
21
|
end
|
|
27
22
|
|
|
28
23
|
Rake::GemPackageTask.new(spec) do |pkg|
|
|
29
24
|
pkg.gem_spec = spec
|
|
30
25
|
end
|
|
31
26
|
|
|
32
|
-
desc "Install the
|
|
27
|
+
desc "Install the Fullfeed as a gem"
|
|
33
28
|
task :install => [:repackage] do
|
|
34
29
|
sh %{gem install pkg/#{spec.name}-#{spec.version}}
|
|
35
|
-
end
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
# Gemspec Helpers ====================================================
|
|
33
|
+
def source_version
|
|
34
|
+
line = File.read('lib/fullfeed.rb')[/^\s*VERSION = .*/]
|
|
35
|
+
line.match(/.*VERSION = '(.*)'/)[1]
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
task 'fullfeed.gemspec' => FileList['lib/**','bin/**','examples/**','Rakefile','LICENSE','README'] do |f|
|
|
39
|
+
# read spec file and split out manifest section
|
|
40
|
+
spec = File.read(f.name)
|
|
41
|
+
head, manifest, tail = spec.split(" # = MANIFEST =\n")
|
|
42
|
+
# replace version and date
|
|
43
|
+
head.sub!(/\.version = '.*'/, ".version = '#{source_version}'")
|
|
44
|
+
head.sub!(/\.date = '.*'/, ".date = '#{Date.today.to_s}'")
|
|
45
|
+
# determine file list from git ls-files
|
|
46
|
+
files = `git ls-files`.
|
|
47
|
+
split("\n").
|
|
48
|
+
sort.
|
|
49
|
+
reject{ |file| file =~ /^\./ }.
|
|
50
|
+
reject{ |file| file =~ /^.+\/\./ }.
|
|
51
|
+
reject { |file| file =~ /^doc/ }.
|
|
52
|
+
map{ |file| " #{file}" }.
|
|
53
|
+
join("\n")
|
|
54
|
+
# piece file back together and write...
|
|
55
|
+
manifest = " s.files = %w[\n#{files}\n ]\n"
|
|
56
|
+
spec = [head,manifest,tail].join(" # = MANIFEST =\n")
|
|
57
|
+
File.open(f.name, 'w') { |io| io.write(spec) }
|
|
58
|
+
puts "updated #{f.name}"
|
|
59
|
+
end
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# Extractor Example
|
|
2
|
+
#
|
|
3
|
+
# create full text RSS feed from Apple Action News HK
|
|
4
|
+
|
|
5
|
+
require "rubygems"
|
|
6
|
+
require "fullfeed"
|
|
7
|
+
require "#{File.dirname(__FILE__)}/extractors/apple_news_extractor"
|
|
8
|
+
|
|
9
|
+
# convert encoding filter, converts the feed and/or fulltext item to UTF-8
|
|
10
|
+
# first parameter specify the source RSS Feed encoding
|
|
11
|
+
# second parameter specify the fulltext item HTML page encoding
|
|
12
|
+
filter = Fullfeed::Filters::ConvertEncodingFilter.new("UTF-8", "Big5")
|
|
13
|
+
|
|
14
|
+
# create full text RSS feed from Apple Action News HK
|
|
15
|
+
# Fetch at most 5 pages, waiting 1 second before each request
|
|
16
|
+
feed = Fullfeed::Feed.new("http://rss.appleactionews.com/rss.xml",
|
|
17
|
+
:limit => 5,
|
|
18
|
+
:agent => :open_uri,
|
|
19
|
+
:filters => filter,
|
|
20
|
+
:wait => 1)
|
|
21
|
+
result = feed.fetch
|
|
22
|
+
puts result
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
require "fullfeed"
|
|
2
|
+
|
|
3
|
+
module Fullfeed
|
|
4
|
+
module Extractor
|
|
5
|
+
class YahooNewsHongKongExtractor < XpathExtractor
|
|
6
|
+
# register this extractor to the system
|
|
7
|
+
register
|
|
8
|
+
|
|
9
|
+
def initialize
|
|
10
|
+
super(%r{http://hk\.rd\.yahoo.com/news/rss/\*http://.+\.html}, ".livewords")
|
|
11
|
+
end
|
|
12
|
+
end
|
|
13
|
+
end
|
|
14
|
+
end
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
module Fullfeedr
|
|
2
|
+
class FeedController
|
|
3
|
+
CONF = [
|
|
4
|
+
{
|
|
5
|
+
'name' => "appleactionnews",
|
|
6
|
+
'url' => "http://rss.appleactionews.com/rss.xml",
|
|
7
|
+
'filters' => [Fullfeed::Filters::ConvertEncodingFilter.new("UTF-8", "Big5"), Fullfeed::Filters::ExcessSpaceFilter.new]},
|
|
8
|
+
{
|
|
9
|
+
'name' => "ynews-hk",
|
|
10
|
+
'url' => "http://hk.news.yahoo.com/rss/hongkong/rss.xml"}
|
|
11
|
+
].freeze
|
|
12
|
+
|
|
13
|
+
def initialize
|
|
14
|
+
@feeds = {}
|
|
15
|
+
CONF.each do |conf|
|
|
16
|
+
name = conf['name']
|
|
17
|
+
url = conf['url']
|
|
18
|
+
filters = conf['filters'] || []
|
|
19
|
+
|
|
20
|
+
@feeds[name] = Fullfeed::Feed.new(url,
|
|
21
|
+
:limit => 20,
|
|
22
|
+
:wait => 1,
|
|
23
|
+
:filters => filters,
|
|
24
|
+
:agent => :open_uri,
|
|
25
|
+
:store => :db)
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
def fetch(name)
|
|
30
|
+
if @feeds[name]
|
|
31
|
+
@feeds[name].fetch.to_s
|
|
32
|
+
else
|
|
33
|
+
raise ArgumentError, "not a registered name!"
|
|
34
|
+
end
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
def list
|
|
38
|
+
CONF
|
|
39
|
+
end
|
|
40
|
+
end
|
|
41
|
+
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Web Example
|
|
2
|
+
#
|
|
3
|
+
# Start a web server that hosts the Yahoo! Hong Kong news with full text feed
|
|
4
|
+
# Require sinatra gem
|
|
5
|
+
|
|
6
|
+
require "rubygems"
|
|
7
|
+
gem('fullfeed', '>= 0.4.3')
|
|
8
|
+
|
|
9
|
+
require 'fullfeed'
|
|
10
|
+
require 'sinatra'
|
|
11
|
+
require 'erb'
|
|
12
|
+
|
|
13
|
+
require "#{File.dirname(__FILE__)}/controller/feed_controller"
|
|
14
|
+
|
|
15
|
+
# setup
|
|
16
|
+
Fullfeed::Store::DbStore.setup("sqlite3:ynews.sqlite3", true)
|
|
17
|
+
server = Fullfeedr::FeedController.new
|
|
18
|
+
|
|
19
|
+
# list feeds
|
|
20
|
+
get '/' do
|
|
21
|
+
@list = server.list
|
|
22
|
+
erb :index
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
# fetch pages
|
|
26
|
+
get '/:name' do
|
|
27
|
+
server.fetch(params[:name])
|
|
28
|
+
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
<html>
|
|
2
|
+
<head>
|
|
3
|
+
<title>Fullfeedr!</title>
|
|
4
|
+
<link rel="stylesheet" href="http://www.w3.org/StyleSheets/Core/Modernist" type="text/css"/>
|
|
5
|
+
</head>
|
|
6
|
+
<body>
|
|
7
|
+
<h1>Welcome to Fullfeedr!</h1>
|
|
8
|
+
<p>We host following feed here: </p>
|
|
9
|
+
<ol>
|
|
10
|
+
<% for item in @list %>
|
|
11
|
+
<li>
|
|
12
|
+
<a href="/<%= item['name'] %>">/<%= item['name'] %></a>
|
|
13
|
+
(source: <a href="<%= item['url'] %>"><%= item['url'] %></a>)
|
|
14
|
+
</li>
|
|
15
|
+
<% end %>
|
|
16
|
+
</ol>
|
|
17
|
+
<p>(Please wait while the fulltext feed is being downloaded)</p>
|
|
18
|
+
</body>
|
|
19
|
+
</html>
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# Web Example
|
|
2
|
+
#
|
|
3
|
+
# Start a web server that hosts the Yahoo! Hong Kong news with full text feed
|
|
4
|
+
# Require sinatra gem
|
|
5
|
+
|
|
6
|
+
require 'rubygems'
|
|
7
|
+
require 'sinatra'
|
|
8
|
+
require "#{File.dirname(__FILE__)}/../../lib/fullfeed"
|
|
9
|
+
require "#{File.dirname(__FILE__)}/../extractors/yahoo_news_hong_kong_extractor"
|
|
10
|
+
|
|
11
|
+
Fullfeed::Store::DbStore.setup("sqlite3:ynews.sqlite3")
|
|
12
|
+
|
|
13
|
+
#DataMapper.auto_migrate!
|
|
14
|
+
|
|
15
|
+
# Create Yahoo! News HK full-text feed
|
|
16
|
+
feed = Fullfeed::Feed.new("http://hk.news.yahoo.com/rss/hongkong/rss.xml",
|
|
17
|
+
:limit => 20,
|
|
18
|
+
:wait => 1,
|
|
19
|
+
:agent => :open_uri,
|
|
20
|
+
:store => :db)
|
|
21
|
+
|
|
22
|
+
# pre fetch the request
|
|
23
|
+
feed.logger.info "Pre-Fetching RSS, could take some time ..."
|
|
24
|
+
feed.fetch
|
|
25
|
+
|
|
26
|
+
get '/' do
|
|
27
|
+
# fetch updated item
|
|
28
|
+
feed.fetch.to_s
|
|
29
|
+
end
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# Extractor Example
|
|
2
|
+
#
|
|
3
|
+
# create full text RSS feed from Yahoo! News HK
|
|
4
|
+
|
|
5
|
+
require "rubygems"
|
|
6
|
+
require "fullfeed"
|
|
7
|
+
require "#{File.dirname(__FILE__)}/extractors/yahoo_news_hong_kong_extractor"
|
|
8
|
+
|
|
9
|
+
feed = Fullfeed::Feed.new("http://hk.news.yahoo.com/rss/hongkong/rss.xml")
|
|
10
|
+
puts feed.fetch
|
|
11
|
+
|
|
12
|
+
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# Extractor Example, using Datamapper
|
|
2
|
+
#
|
|
3
|
+
# Create full text RSS feed from Yahoo! News HK, store RSS in datamapper.
|
|
4
|
+
# Later invocations will not download older items again.
|
|
5
|
+
# Use sqlite3 as backend, use proper adapter for your needs!
|
|
6
|
+
#
|
|
7
|
+
# Uncomment the line under "Migrate database" the first time you run this app (it sets up the database)
|
|
8
|
+
#
|
|
9
|
+
|
|
10
|
+
require "rubygems"
|
|
11
|
+
require "fullfeed"
|
|
12
|
+
|
|
13
|
+
require "#{File.dirname(__FILE__)}/extractors/yahoo_news_hong_kong_extractor"
|
|
14
|
+
|
|
15
|
+
# set up DataMapper
|
|
16
|
+
Fullfeed::Store::DbStore.setup("sqlite3:ynews.sqlite3")
|
|
17
|
+
|
|
18
|
+
# Migrate database, use only once
|
|
19
|
+
#DataMapper.auto_migrate!
|
|
20
|
+
|
|
21
|
+
# create full text RSS feed from Yahoo! News HK
|
|
22
|
+
# Fetch at most 20 pages, waiting 1 second before each request
|
|
23
|
+
feed = Fullfeed::Feed.new("http://hk.news.yahoo.com/rss/hongkong/rss.xml",
|
|
24
|
+
:limit => 20,
|
|
25
|
+
:store => :db,
|
|
26
|
+
:wait => 1)
|
|
27
|
+
result = feed.fetch
|
|
28
|
+
puts result
|
|
29
|
+
|
|
30
|
+
File.open("yahoo.rss", "w") do |file|
|
|
31
|
+
file.write(result)
|
|
32
|
+
end
|
data/fullfeed.gemspec
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
Gem::Specification.new do |s|
|
|
2
|
+
s.specification_version = 2 if s.respond_to? :specification_version=
|
|
3
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
|
4
|
+
|
|
5
|
+
s.name = "fullfeed"
|
|
6
|
+
s.summary = s.description = "Create full text RSS feed from RSS"
|
|
7
|
+
|
|
8
|
+
s.author = "siuying"
|
|
9
|
+
s.email = "siu.ying@gmail.com"
|
|
10
|
+
|
|
11
|
+
s.version = '0.4.8'
|
|
12
|
+
s.date = '2009-06-19'
|
|
13
|
+
s.platform = Gem::Platform::RUBY
|
|
14
|
+
s.require_path = 'lib'
|
|
15
|
+
|
|
16
|
+
# = MANIFEST =
|
|
17
|
+
s.files = %w[
|
|
18
|
+
LICENSE
|
|
19
|
+
README
|
|
20
|
+
Rakefile
|
|
21
|
+
examples/applenews_hk.rb
|
|
22
|
+
examples/extractors/apple_news_extractor.rb
|
|
23
|
+
examples/extractors/yahoo_news_hong_kong_extractor.rb
|
|
24
|
+
examples/server/controller/feed_controller.rb
|
|
25
|
+
examples/server/server.rb
|
|
26
|
+
examples/server/views/index.erb
|
|
27
|
+
examples/sinatra/web.rb
|
|
28
|
+
examples/ynews_hk.rb
|
|
29
|
+
examples/ynews_hk_db.rb
|
|
30
|
+
fullfeed.gemspec
|
|
31
|
+
lib/fullfeed.rb
|
|
32
|
+
lib/fullfeed/agent/agent_factory.rb
|
|
33
|
+
lib/fullfeed/agent/appengine_agent.rb
|
|
34
|
+
lib/fullfeed/agent/base.rb
|
|
35
|
+
lib/fullfeed/agent/mechanize_agent.rb
|
|
36
|
+
lib/fullfeed/agent/open_uri_agent.rb
|
|
37
|
+
lib/fullfeed/extractor/base_extractor.rb
|
|
38
|
+
lib/fullfeed/extractor/extractor_factory.rb
|
|
39
|
+
lib/fullfeed/extractor/text_extractor.rb
|
|
40
|
+
lib/fullfeed/extractor/xpath_extractor.rb
|
|
41
|
+
lib/fullfeed/feed.rb
|
|
42
|
+
lib/fullfeed/filters/base_filter.rb
|
|
43
|
+
lib/fullfeed/filters/convert_encoding_filter.rb
|
|
44
|
+
lib/fullfeed/filters/excess_space_filter.rb
|
|
45
|
+
lib/fullfeed/filters/uppercase_filter.rb
|
|
46
|
+
lib/fullfeed/store/base.rb
|
|
47
|
+
lib/fullfeed/store/db_store.rb
|
|
48
|
+
lib/fullfeed/store/memory_store.rb
|
|
49
|
+
lib/fullfeed/store/store_factory.rb
|
|
50
|
+
test/load_files.rb
|
|
51
|
+
test/test_agent.rb
|
|
52
|
+
test/test_store.rb
|
|
53
|
+
]
|
|
54
|
+
# = MANIFEST =
|
|
55
|
+
|
|
56
|
+
s.rubygems_version = '1.1.1'
|
|
57
|
+
|
|
58
|
+
end
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
require 'singleton'
|
|
2
|
+
|
|
3
|
+
module Fullfeed
|
|
4
|
+
module Agent
|
|
5
|
+
class AgentFactory
|
|
6
|
+
include Singleton
|
|
7
|
+
|
|
8
|
+
def initialize
|
|
9
|
+
@agents = {}
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
def register(clazz)
|
|
13
|
+
name = to_symbol_name(clazz.name)
|
|
14
|
+
@agents[name.to_sym] = clazz
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
# get HTTP agent by symbol
|
|
18
|
+
# Acceptable agents:
|
|
19
|
+
# # :open_uri - simplistic HTTP client
|
|
20
|
+
# # :mechanize - full-featured HTTP client with cookie support
|
|
21
|
+
def agent(name = :open_uri)
|
|
22
|
+
agent_class = @agents[name]
|
|
23
|
+
if agent_class
|
|
24
|
+
agent = agent_class.new
|
|
25
|
+
|
|
26
|
+
if agent.is_a?(BaseAgent)
|
|
27
|
+
return agent
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
raise ArgumentError, "unknown agent name :#{name}, accepatable: #{@agents.keys.inspect}"
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
def self.agent(name = :open_uri)
|
|
35
|
+
instance.agent(name)
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
private
|
|
39
|
+
#input: a full class name
|
|
40
|
+
#output: the class name lowercased, underscore separated,
|
|
41
|
+
#and removed "_agent" at last part
|
|
42
|
+
#e.g. "Fullfeed::Agent::MechanizeAgent" => "mechanize"
|
|
43
|
+
def to_symbol_name(class_name)
|
|
44
|
+
class_name.
|
|
45
|
+
split("::").
|
|
46
|
+
last.
|
|
47
|
+
gsub(/(.)([A-Z])/, '\1_\2').
|
|
48
|
+
downcase.
|
|
49
|
+
gsub(/_agent$/, '')
|
|
50
|
+
end
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
gem('appengine-apis')
|
|
2
|
+
require 'appengine-apis/urlfetch'
|
|
3
|
+
|
|
4
|
+
module Fullfeed
|
|
5
|
+
module Agent
|
|
6
|
+
class AppengineAgent < BaseAgent
|
|
7
|
+
include AppEngine::URLFetch
|
|
8
|
+
register
|
|
9
|
+
|
|
10
|
+
def initialize
|
|
11
|
+
@logger = AppEngine::Logger.new
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
def get(url)
|
|
15
|
+
@logger.info "download link: #{url}"
|
|
16
|
+
result = fetch(url)
|
|
17
|
+
result.urlfetch_body rescue result.body
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
module Fullfeed
|
|
2
|
+
module Agent
|
|
3
|
+
# All agents should implement one method: get
|
|
4
|
+
class BaseAgent
|
|
5
|
+
def self.register
|
|
6
|
+
Fullfeed::Agent::AgentFactory.instance.register(self)
|
|
7
|
+
end
|
|
8
|
+
|
|
9
|
+
def get(url)
|
|
10
|
+
raise "Must override get(url)"
|
|
11
|
+
end
|
|
12
|
+
end
|
|
13
|
+
end
|
|
14
|
+
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
require 'mechanize'
|
|
2
|
+
|
|
3
|
+
module Fullfeed
|
|
4
|
+
module Agent
|
|
5
|
+
#Use Mechanize as the agent
|
|
6
|
+
#Support cookies ... etc
|
|
7
|
+
class MechanizeAgent < BaseAgent
|
|
8
|
+
register
|
|
9
|
+
|
|
10
|
+
def initialize
|
|
11
|
+
@agent = WWW::Mechanize.new
|
|
12
|
+
@agent.user_agent_alias = "Mac FireFox"
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
def get(url)
|
|
16
|
+
page = @agent.get(url)
|
|
17
|
+
page.content
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
require 'open-uri'
|
|
2
|
+
|
|
3
|
+
module Fullfeed
|
|
4
|
+
module Agent
|
|
5
|
+
#Use open-uri as the agent
|
|
6
|
+
#Simplistic, but it works
|
|
7
|
+
class OpenUriAgent < BaseAgent
|
|
8
|
+
register
|
|
9
|
+
|
|
10
|
+
def get(url)
|
|
11
|
+
open(url, "User-Agent" => "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; zh-TW; rv:1.9.0.10) Gecko/2009042315 Firefox/3.0.10").read
|
|
12
|
+
end
|
|
13
|
+
end
|
|
14
|
+
end
|
|
15
|
+
end
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
require 'singleton'
|
|
2
|
+
|
|
3
|
+
module Fullfeed
|
|
4
|
+
module Extractor
|
|
5
|
+
module ExtractorHelper
|
|
6
|
+
#If the url matched the RegExp(s).
|
|
7
|
+
# # url - a string
|
|
8
|
+
# # regexps - an Array of RegExp, or a RegExp
|
|
9
|
+
# Tell whether +url+ matches at least one of the given patterns.
#
# url     - String to test.
# regexps - a single Regexp, or an Array of Regexp.
#
# Returns true as soon as one pattern matches, false when none does
# (including when the Array is empty).
def is_matched?(url, regexps)
  # Normalise a lone pattern to a one-element list so both call styles
  # share the same code path.
  candidates = regexps.is_a?(Array) ? regexps : [regexps]
  candidates.any? { |regexp| url =~ regexp }
end
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
class BaseExtractor
|
|
19
|
+
include Singleton
|
|
20
|
+
include ExtractorHelper
|
|
21
|
+
|
|
22
|
+
#Whether this extractor accepts the url; if true, it is used to parse the page
|
|
23
|
+
def accept(url)
|
|
24
|
+
false
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
# extract text from html document, return the content
|
|
28
|
+
def extract(doc)
|
|
29
|
+
nil
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
def self.register
|
|
33
|
+
ExtractorFactory.instance.register(self)
|
|
34
|
+
end
|
|
35
|
+
end
|
|
36
|
+
end
|
|
37
|
+
end
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
require 'singleton'
|
|
2
|
+
|
|
3
|
+
module Fullfeed
|
|
4
|
+
module Extractor
|
|
5
|
+
class ExtractorFactory
|
|
6
|
+
include Singleton
|
|
7
|
+
|
|
8
|
+
def initialize
|
|
9
|
+
@extractors = []
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
def register(extractor_class)
|
|
13
|
+
@extractors << extractor_class
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
def unregister(extractor)
|
|
17
|
+
@extractors.delete(extractor)
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
# Pick the extractor instance responsible for +url+.
#
# Registered extractor classes are consulted in registration order and the
# first one whose singleton instance accepts the url wins. When none accepts
# it, the default TextExtractor is tried as a fallback.
#
# url - String link of the page to extract.
#
# Returns the extractor instance, or nil when not even TextExtractor
# accepts the url.
def extractor(url)
  # find stops at the first accepting extractor instead of asking every
  # registered class before taking the first hit.
  matching_class = @extractors.find { |klass| klass.instance.accept(url) }
  return matching_class.instance if matching_class

  # if no extractors accept the above URL, use default TextExtractor
  fallback = TextExtractor.instance
  fallback.accept(url) ? fallback : nil
end
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
end
|
|
32
|
+
end
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
module Fullfeed
|
|
2
|
+
module Extractor
|
|
3
|
+
# extract all text from html. this is used when no other extractor is suitable
|
|
4
|
+
class TextExtractor < BaseExtractor
|
|
5
|
+
PATTERN = [/^http\:.+$/, /^https\:.+$/]
|
|
6
|
+
|
|
7
|
+
def accept(url)
|
|
8
|
+
is_matched?(url, PATTERN)
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
# extract a html document, return the content text
|
|
12
|
+
def extract(doc)
|
|
13
|
+
hdoc = Hpricot(doc)
|
|
14
|
+
text = (hdoc/"//body").inner_text rescue nil
|
|
15
|
+
end
|
|
16
|
+
end
|
|
17
|
+
end
|
|
18
|
+
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
module Fullfeed
|
|
2
|
+
module Extractor
|
|
3
|
+
class XpathExtractor < BaseExtractor
|
|
4
|
+
attr_reader :xpath, :pattern
|
|
5
|
+
|
|
6
|
+
def initialize(pattern = nil, xpath = nil)
|
|
7
|
+
@pattern = pattern
|
|
8
|
+
@xpath = xpath
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def accept(url)
|
|
12
|
+
is_matched?(url, @pattern)
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
# return the inner HTML of the nodes matching the configured xpath, or nil on error
|
|
16
|
+
def extract(doc)
|
|
17
|
+
hdoc = Hpricot(doc)
|
|
18
|
+
text = (hdoc.search(@xpath)).inner_html rescue nil
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
require 'rubygems'
|
|
2
|
+
gem('hpricot', '>= 0.6.1')
|
|
3
|
+
require 'hpricot'
|
|
4
|
+
|
|
5
|
+
module Fullfeed
|
|
6
|
+
class Feed
|
|
7
|
+
attr_reader :url, :encoding, :xml, :item_limit, :store
|
|
8
|
+
attr_accessor :logger
|
|
9
|
+
|
|
10
|
+
def initialize(url, options = {})
|
|
11
|
+
@url = url
|
|
12
|
+
@wait = options[:wait] || 1
|
|
13
|
+
@item_limit = options[:limit] || 50
|
|
14
|
+
@agent_name = options[:agent] || :open_uri
|
|
15
|
+
@store_name = options[:store] || :memory
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
validate_params
|
|
19
|
+
|
|
20
|
+
@filters = Fullfeed::Filters::FilterChain.new(options[:filters] || [])
|
|
21
|
+
@logger = Logger.new(STDOUT)
|
|
22
|
+
@agent = Fullfeed::Agent::AgentFactory.agent(@agent_name)
|
|
23
|
+
@store = Fullfeed::Store::StoreFactory.store(@url, @item_limit, @store_name)
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
#Fetch the RSS feed.
|
|
28
|
+
#
|
|
29
|
+
#For each item in the feed, extract the content of the link and replace the description with it.
|
|
30
|
+
#Extraction is based on registered Extractor, check the extractor classes for more information.
|
|
31
|
+
def fetch
|
|
32
|
+
@logger.info "Fetch RSS URL: #{@url}"
|
|
33
|
+
doc = @agent.get(@url).to_s
|
|
34
|
+
doc = @filters.before_doc(doc)
|
|
35
|
+
@xml = Hpricot.XML(doc)
|
|
36
|
+
items = (@xml/"//item")
|
|
37
|
+
|
|
38
|
+
@logger.info "Process elements of RSS (count=#{items.size}, limit=#{@item_limit})"
|
|
39
|
+
items.to_a.first(@item_limit).each do |item|
|
|
40
|
+
process_item(item)
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
@filters.after_doc(@xml)
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
private
|
|
47
|
+
def validate_params
|
|
48
|
+
if @wait <= 0
|
|
49
|
+
raise ArgumentError, "invalid wait `#{@wait}'"
|
|
50
|
+
end
|
|
51
|
+
if @item_limit <= 0
|
|
52
|
+
raise ArgumentError, "invalid limit `#{@item_limit}'"
|
|
53
|
+
end
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
def process_item(item)
|
|
57
|
+
link = (item/"link").first.inner_text rescue nil
|
|
58
|
+
desc = (item/"description").first rescue nil
|
|
59
|
+
guid = (item/"guid").first.inner_text rescue link
|
|
60
|
+
|
|
61
|
+
if link && desc
|
|
62
|
+
begin
|
|
63
|
+
@logger.debug " Extract item (#{guid}) link: #{link}"
|
|
64
|
+
desc.swap("<description>#{Hpricot::Tag::CData.new(extract_cached(guid, link)).to_html}</description>")
|
|
65
|
+
rescue StandardError => e
|
|
66
|
+
@logger.error "Error fetching/replacing content: #{e.inspect}"
|
|
67
|
+
|
|
68
|
+
end
|
|
69
|
+
else
|
|
70
|
+
@logger.warn "No link or desc node found in item: #{item}"
|
|
71
|
+
|
|
72
|
+
end
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
# read cache or fetch result
|
|
76
|
+
def extract_cached(guid, link)
|
|
77
|
+
@store[guid] ||= extract(link)
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
#Use ExtractorFactory to find a suitable Extractor; if one is found, download the link and extract its content.
|
|
82
|
+
#If not found, use TextExtractor, which extracts all text from the page.
|
|
83
|
+
def extract(link)
|
|
84
|
+
extractor = Extractor::ExtractorFactory.instance.extractor(link)
|
|
85
|
+
|
|
86
|
+
begin
|
|
87
|
+
unless extractor.nil?
|
|
88
|
+
@logger.debug " Download link: #{link}"
|
|
89
|
+
doc = @agent.get(link).to_s
|
|
90
|
+
doc = @filters.before_item(doc)
|
|
91
|
+
doc = extractor.extract(doc).strip
|
|
92
|
+
doc = @filters.after_item(doc)
|
|
93
|
+
return doc
|
|
94
|
+
else
|
|
95
|
+
return nil
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
ensure
|
|
99
|
+
@logger.debug " Wait #{@wait} seconds before next URL"
|
|
100
|
+
sleep(@wait) if @wait > 0
|
|
101
|
+
|
|
102
|
+
end
|
|
103
|
+
end
|
|
104
|
+
end
|
|
105
|
+
end
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
module Fullfeed
|
|
2
|
+
module Filters
|
|
3
|
+
class FilterChain
|
|
4
|
+
def initialize(filters)
|
|
5
|
+
filters = [filters] unless filters.is_a? Array
|
|
6
|
+
@filters = filters
|
|
7
|
+
end
|
|
8
|
+
|
|
9
|
+
def before_doc(doc)
|
|
10
|
+
run_filters(@filters, :before_doc, doc)
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
def after_doc(doc)
|
|
14
|
+
run_filters(@filters, :after_doc, doc)
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
def before_item(item)
|
|
18
|
+
run_filters(@filters, :before_item, item)
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
def after_item(item)
|
|
22
|
+
run_filters(@filters, :after_item, item)
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
private
|
|
26
|
+
def run_filters(filters, method, target)
|
|
27
|
+
filters.each do |f|
|
|
28
|
+
target = f.send(method.to_sym, target)
|
|
29
|
+
end
|
|
30
|
+
target
|
|
31
|
+
end
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class BaseFilter
|
|
36
|
+
# run before rss is processed
|
|
37
|
+
# doc is html text, should also return html text
|
|
38
|
+
def before_doc(doc)
|
|
39
|
+
doc
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
# run after rss is processed
|
|
43
|
+
# doc is a Hpricot document, should also return a document
|
|
44
|
+
def after_doc(doc)
|
|
45
|
+
doc
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
# run before the item is processed
|
|
49
|
+
# item is HTML text, should also return html text
|
|
50
|
+
def before_item(item)
|
|
51
|
+
item
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
# run after the item is processed
|
|
55
|
+
# item is HTML text, should also return html text
|
|
56
|
+
def after_item(item)
|
|
57
|
+
item
|
|
58
|
+
end
|
|
59
|
+
end
|
|
60
|
+
end
|
|
61
|
+
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
require 'iconv'
|
|
2
|
+
|
|
3
|
+
module Fullfeed
|
|
4
|
+
module Filters
|
|
5
|
+
# convert feed to UTF-8 encoding
|
|
6
|
+
class ConvertEncodingFilter < BaseFilter
|
|
7
|
+
def initialize(feed_encoding, item_encoding)
|
|
8
|
+
@feed_encoding = feed_encoding
|
|
9
|
+
@item_encoding = item_encoding
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
# run before rss is processed
|
|
13
|
+
def before_doc(feed)
|
|
14
|
+
Iconv.conv("UTF-8//IGNORE", @feed_encoding, feed)
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
# run after process the item node
|
|
18
|
+
def after_item(item)
|
|
19
|
+
Iconv.conv("UTF-8//IGNORE", @item_encoding, item)
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
require 'iconv'
|
|
2
|
+
|
|
3
|
+
module Fullfeed
|
|
4
|
+
module Filters
|
|
5
|
+
# remove the spaces between Chinese characters, as found on Apple Daily action news pages
|
|
6
|
+
class ExcessSpaceFilter < BaseFilter
|
|
7
|
+
def initialize
|
|
8
|
+
end
|
|
9
|
+
|
|
10
|
+
# run after process the item node
|
|
11
|
+
# Remove the single space that follows any character other than an ASCII
# letter or digit (for example the padding inserted between CJK
# characters), leaving spaces after ASCII letters and digits untouched.
#
# item - String of extracted item text.
#
# Returns a new String; +item+ itself is not modified.
def after_item(item)
  # The digit part of the class must be the range 0-9. The earlier "0+9"
  # listed only the characters "0", "+" and "9", so a space following the
  # digits 1-8 was stripped as well ("5 apples" became "5apples").
  item.gsub(/([^a-zA-Z0-9]) /, '\1')
end
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
require 'iconv'
|
|
2
|
+
|
|
3
|
+
module Fullfeed
|
|
4
|
+
module Filters
|
|
5
|
+
# convert feed and item text to upper case
|
|
6
|
+
class UppercaseFilter < BaseFilter
|
|
7
|
+
def initialize
|
|
8
|
+
end
|
|
9
|
+
|
|
10
|
+
# run before rss is processed
|
|
11
|
+
def before_doc(feed)
|
|
12
|
+
feed.upcase
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
# run after process the item node
|
|
16
|
+
def after_item(item)
|
|
17
|
+
item.upcase
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
module Fullfeed
|
|
2
|
+
module Store
|
|
3
|
+
class BaseStore
|
|
4
|
+
def initialize(url, cache_size)
|
|
5
|
+
end
|
|
6
|
+
|
|
7
|
+
def self.register
|
|
8
|
+
StoreFactory.instance.register(self)
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
# Store +value+ under +key+. Abstract: concrete stores must override this.
#
# key   - identifier of the entry.
# value - content to store.
#
# Raises RuntimeError always, because the base class stores nothing.
def []=(key, value)
  # Index assignment (store[key] = value) always passes two arguments, so
  # the method has to accept both for this message to be reachable; with a
  # single parameter callers got an ArgumentError instead.
  raise "Must override []="
end
|
|
14
|
+
|
|
15
|
+
def [](args)
|
|
16
|
+
raise "Must override []"
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
require 'rubygems'
|
|
2
|
+
gem('datamapper', '>=0.9.7')
|
|
3
|
+
|
|
4
|
+
require 'dm-core'
|
|
5
|
+
require 'dm-timestamps'
|
|
6
|
+
|
|
7
|
+
module Fullfeed
|
|
8
|
+
module Store
|
|
9
|
+
class DbStore < BaseStore
|
|
10
|
+
register
|
|
11
|
+
|
|
12
|
+
def self.setup(url, automigrate = false)
|
|
13
|
+
DataMapper.setup(:default, url)
|
|
14
|
+
begin
|
|
15
|
+
Item.first
|
|
16
|
+
rescue
|
|
17
|
+
DataMapper.auto_migrate!
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
#Initialize a datamapper store
|
|
22
|
+
def initialize(url, cache_size)
|
|
23
|
+
@url = url
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
#Save or update existing item by key
|
|
27
|
+
def []=(key, value)
|
|
28
|
+
item = Item.first_or_create(:feed_url => @url, :guid => key)
|
|
29
|
+
item.content = value
|
|
30
|
+
item.save
|
|
31
|
+
value
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
#Retrieve an item by key
|
|
35
|
+
def [](key)
|
|
36
|
+
item = Item.first(:feed_url => @url, :guid => key)
|
|
37
|
+
item.content rescue nil
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
class Item
|
|
43
|
+
include DataMapper::Resource
|
|
44
|
+
property :id, Serial
|
|
45
|
+
property :feed_url, String
|
|
46
|
+
property :guid, String
|
|
47
|
+
property :content, String
|
|
48
|
+
end
|
|
49
|
+
end
|
|
50
|
+
end
|
|
51
|
+
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
require 'rubygems'
|
|
2
|
+
gem('ruby-cache', '>= 0.3.0')
|
|
3
|
+
|
|
4
|
+
require 'cache'
|
|
5
|
+
|
|
6
|
+
module Fullfeed
|
|
7
|
+
module Store
|
|
8
|
+
#Cache in memory, based on Ruby Cache gem
|
|
9
|
+
class MemoryStore < BaseStore
|
|
10
|
+
register
|
|
11
|
+
|
|
12
|
+
def initialize(url, cache_size)
|
|
13
|
+
@cache = Cache.new({:max_num => cache_size})
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
def []=(key, value)
|
|
17
|
+
@cache[key] = value
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
def [](key)
|
|
21
|
+
@cache[key]
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
end
|
|
25
|
+
end
|
|
26
|
+
end
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
module Fullfeed
|
|
2
|
+
module Store
|
|
3
|
+
class StoreFactory
|
|
4
|
+
include Singleton
|
|
5
|
+
|
|
6
|
+
def initialize
|
|
7
|
+
@stores = {}
|
|
8
|
+
end
|
|
9
|
+
|
|
10
|
+
#register a Store to the StoreFactory
|
|
11
|
+
def register(clazz)
|
|
12
|
+
name = to_symbol_name(clazz.name)
|
|
13
|
+
@stores[name.to_sym] = clazz
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
# get a store (an instance of a BaseStore subclass) by symbol
|
|
17
|
+
# Acceptable names:
|
|
18
|
+
# # :memory - store result in memory
|
|
19
|
+
# # :db - store result in database (require DataMapper)
|
|
20
|
+
def store(url, cache_size, name = :memory)
|
|
21
|
+
store_class = @stores[name]
|
|
22
|
+
|
|
23
|
+
if store_class
|
|
24
|
+
if !cache_size || cache_size <= 0
|
|
25
|
+
raise ArgumentError, "invalid store size: #{cache_size}"
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
store = store_class.new(url, cache_size)
|
|
29
|
+
if store.is_a?(BaseStore)
|
|
30
|
+
return store
|
|
31
|
+
end
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
raise ArgumentError, "unknown store name :#{name}, accepatable: #{@stores.keys.inspect}"
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
#see instance method store
|
|
38
|
+
def self.store(url, cache_size, name = :memory)
|
|
39
|
+
instance.store(url, cache_size, name)
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
private
|
|
43
|
+
#input: a full class name
|
|
44
|
+
#output: the class name lowercased, underscore separated,
|
|
45
|
+
#and removed "_store" at last part
|
|
46
|
+
#e.g. "Fullfeed::Store::MemoryStore" => "memory"
|
|
47
|
+
def to_symbol_name(class_name)
|
|
48
|
+
class_name.
|
|
49
|
+
split("::").
|
|
50
|
+
last.
|
|
51
|
+
gsub(/(.)([A-Z])/, '\1_\2').
|
|
52
|
+
downcase.
|
|
53
|
+
gsub(/_store$/, '')
|
|
54
|
+
end
|
|
55
|
+
end
|
|
56
|
+
end
|
|
57
|
+
end
|
data/lib/fullfeed.rb
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
path = File.expand_path(File.dirname(__FILE__))
|
|
2
|
+
$:.unshift(path) unless $:.include?(path)
|
|
3
|
+
|
|
4
|
+
# Library version, kept under the legacy FullFeed spelling for existing
# callers.
#
# NOTE(review): the packaged gem metadata reports version 0.4.8 while this
# constant says 0.4.7 — confirm whether it should be bumped.
module FullFeed
  VERSION = '0.4.7'
end

# The rest of the library lives under Fullfeed (lowercase second "f"), so
# expose the same version string there as well.
module Fullfeed
  VERSION = FullFeed::VERSION
end
|
|
7
|
+
|
|
8
|
+
require 'logger'
|
|
9
|
+
|
|
10
|
+
require "fullfeed/agent/base"
|
|
11
|
+
require "fullfeed/agent/agent_factory"
|
|
12
|
+
require "fullfeed/agent/open_uri_agent"
|
|
13
|
+
|
|
14
|
+
begin
|
|
15
|
+
# optionally require mechanize
|
|
16
|
+
gem('mechanize')
|
|
17
|
+
require "fullfeed/agent/mechanize_agent"
|
|
18
|
+
rescue Gem::LoadError
|
|
19
|
+
end
|
|
20
|
+
begin
|
|
21
|
+
# optionally require appengine-api
|
|
22
|
+
gem('appengine-apis')
|
|
23
|
+
require "fullfeed/agent/appengine_agent"
|
|
24
|
+
rescue Gem::LoadError
|
|
25
|
+
rescue NameError
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
require "fullfeed/extractor/extractor_factory"
|
|
29
|
+
require "fullfeed/extractor/base_extractor"
|
|
30
|
+
require "fullfeed/extractor/text_extractor"
|
|
31
|
+
require "fullfeed/extractor/xpath_extractor"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
require "fullfeed/filters/base_filter"
|
|
35
|
+
require "fullfeed/filters/convert_encoding_filter"
|
|
36
|
+
require "fullfeed/filters/uppercase_filter"
|
|
37
|
+
require "fullfeed/filters/excess_space_filter"
|
|
38
|
+
|
|
39
|
+
require "fullfeed/store/base"
|
|
40
|
+
require "fullfeed/store/store_factory"
|
|
41
|
+
require "fullfeed/store/memory_store"
|
|
42
|
+
|
|
43
|
+
# only load DbStore if datamapper is installed
|
|
44
|
+
begin
|
|
45
|
+
gem('datamapper', '>= 0.9.7')
|
|
46
|
+
require "fullfeed/store/db_store"
|
|
47
|
+
rescue Gem::LoadError
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
require "fullfeed/feed"
|
data/test/load_files.rb
ADDED
data/test/test_agent.rb
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
require 'test/unit'
|
|
4
|
+
require "#{File.dirname(__FILE__)}/../lib/fullfeed"
|
|
5
|
+
require 'load_files'
|
|
6
|
+
|
|
7
|
+
# Exercises the agents obtainable from Fullfeed::Agent::AgentFactory.
# The fetch tests issue a real HTTP request to http://www.google.com/.
class TestAgent < Test::Unit::TestCase
  # The factory hands back an agent for :open_uri.
  def test_agent_factory
    assert_not_nil agent_for(:open_uri)
  end

  # The :open_uri agent can fetch a page.
  def test_openuri_agent
    assert_fetches(:open_uri)
  end

  # The :mechanize agent can fetch a page.
  def test_mechanize_agent
    assert_fetches(:mechanize)
  end

  private

  # Look up the agent registered under +kind+.
  def agent_for(kind)
    Fullfeed::Agent::AgentFactory.instance.agent(kind)
  end

  # Fetch the test page with the +kind+ agent and check its string form.
  def assert_fetches(kind)
    page = agent_for(kind).get('http://www.google.com/')
    assert_not_nil(page.to_s)
  end
end
|
data/test/test_store.rb
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
require 'test/unit'
|
|
4
|
+
require "#{File.dirname(__FILE__)}/../lib/fullfeed"
|
|
5
|
+
require 'load_files'
|
|
6
|
+
|
|
7
|
+
# Round-trip tests for the stores created by Fullfeed::Store::StoreFactory.
# Despite the class name, both the :db and the :memory store are covered.
class TestDbStore < Test::Unit::TestCase
  # Runs once when the class body is evaluated; points DbStore at a SQLite
  # database file in the current directory.
  Fullfeed::Store::DbStore.setup("sqlite3:ynews.sqlite3")

  def test_db_store
    store = Fullfeed::Store::StoreFactory.store('http://test', 100, :db)
    base_test_store(store)
  end

  def test_memory_store
    store = Fullfeed::Store::StoreFactory.store('http://test', 100, :memory)
    base_test_store(store)
  end

  private

  # Write two values (one ASCII, one multi-byte) and read them back.
  # assert_equal takes (expected, actual); keeping that order makes failure
  # messages report the right side as "expected".
  def base_test_store(store)
    value = rand().to_s
    store['/100'] = value
    assert_equal value, store['/100']

    value = "中文測試"
    store['/200'] = value
    assert_equal value, store['/200']
  end
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: siuying-fullfeed
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.4.
|
|
4
|
+
version: 0.4.8
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- siuying
|
|
@@ -9,30 +9,11 @@ autorequire:
|
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
11
|
|
|
12
|
-
date: 2009-
|
|
12
|
+
date: 2009-06-19 00:00:00 -07:00
|
|
13
13
|
default_executable:
|
|
14
|
-
dependencies:
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
type: :runtime
|
|
18
|
-
version_requirement:
|
|
19
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
20
|
-
requirements:
|
|
21
|
-
- - ">="
|
|
22
|
-
- !ruby/object:Gem::Version
|
|
23
|
-
version: 0.3.0
|
|
24
|
-
version:
|
|
25
|
-
- !ruby/object:Gem::Dependency
|
|
26
|
-
name: hpricot
|
|
27
|
-
type: :runtime
|
|
28
|
-
version_requirement:
|
|
29
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
30
|
-
requirements:
|
|
31
|
-
- - ">="
|
|
32
|
-
- !ruby/object:Gem::Version
|
|
33
|
-
version: 0.6.1
|
|
34
|
-
version:
|
|
35
|
-
description: Fullfeed RSS creator
|
|
14
|
+
dependencies: []
|
|
15
|
+
|
|
16
|
+
description: Create full text RSS feed from RSS
|
|
36
17
|
email: siu.ying@gmail.com
|
|
37
18
|
executables: []
|
|
38
19
|
|
|
@@ -41,8 +22,41 @@ extensions: []
|
|
|
41
22
|
extra_rdoc_files: []
|
|
42
23
|
|
|
43
24
|
files:
|
|
25
|
+
- LICENSE
|
|
44
26
|
- README
|
|
45
27
|
- Rakefile
|
|
28
|
+
- examples/applenews_hk.rb
|
|
29
|
+
- examples/extractors/apple_news_extractor.rb
|
|
30
|
+
- examples/extractors/yahoo_news_hong_kong_extractor.rb
|
|
31
|
+
- examples/server/controller/feed_controller.rb
|
|
32
|
+
- examples/server/server.rb
|
|
33
|
+
- examples/server/views/index.erb
|
|
34
|
+
- examples/sinatra/web.rb
|
|
35
|
+
- examples/ynews_hk.rb
|
|
36
|
+
- examples/ynews_hk_db.rb
|
|
37
|
+
- fullfeed.gemspec
|
|
38
|
+
- lib/fullfeed.rb
|
|
39
|
+
- lib/fullfeed/agent/agent_factory.rb
|
|
40
|
+
- lib/fullfeed/agent/appengine_agent.rb
|
|
41
|
+
- lib/fullfeed/agent/base.rb
|
|
42
|
+
- lib/fullfeed/agent/mechanize_agent.rb
|
|
43
|
+
- lib/fullfeed/agent/open_uri_agent.rb
|
|
44
|
+
- lib/fullfeed/extractor/base_extractor.rb
|
|
45
|
+
- lib/fullfeed/extractor/extractor_factory.rb
|
|
46
|
+
- lib/fullfeed/extractor/text_extractor.rb
|
|
47
|
+
- lib/fullfeed/extractor/xpath_extractor.rb
|
|
48
|
+
- lib/fullfeed/feed.rb
|
|
49
|
+
- lib/fullfeed/filters/base_filter.rb
|
|
50
|
+
- lib/fullfeed/filters/convert_encoding_filter.rb
|
|
51
|
+
- lib/fullfeed/filters/excess_space_filter.rb
|
|
52
|
+
- lib/fullfeed/filters/uppercase_filter.rb
|
|
53
|
+
- lib/fullfeed/store/base.rb
|
|
54
|
+
- lib/fullfeed/store/db_store.rb
|
|
55
|
+
- lib/fullfeed/store/memory_store.rb
|
|
56
|
+
- lib/fullfeed/store/store_factory.rb
|
|
57
|
+
- test/load_files.rb
|
|
58
|
+
- test/test_agent.rb
|
|
59
|
+
- test/test_store.rb
|
|
46
60
|
has_rdoc: false
|
|
47
61
|
homepage:
|
|
48
62
|
post_install_message:
|
|
@@ -68,6 +82,6 @@ rubyforge_project:
|
|
|
68
82
|
rubygems_version: 1.2.0
|
|
69
83
|
signing_key:
|
|
70
84
|
specification_version: 2
|
|
71
|
-
summary:
|
|
85
|
+
summary: Create full text RSS feed from RSS
|
|
72
86
|
test_files: []
|
|
73
87
|
|