siuying-fullfeed 0.4.6 → 0.4.8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (35) hide show
  1. data/LICENSE +22 -0
  2. data/Rakefile +48 -24
  3. data/examples/applenews_hk.rb +22 -0
  4. data/examples/extractors/apple_news_extractor.rb +14 -0
  5. data/examples/extractors/yahoo_news_hong_kong_extractor.rb +14 -0
  6. data/examples/server/controller/feed_controller.rb +41 -0
  7. data/examples/server/server.rb +28 -0
  8. data/examples/server/views/index.erb +19 -0
  9. data/examples/sinatra/web.rb +29 -0
  10. data/examples/ynews_hk.rb +12 -0
  11. data/examples/ynews_hk_db.rb +32 -0
  12. data/fullfeed.gemspec +58 -0
  13. data/lib/fullfeed/agent/agent_factory.rb +53 -0
  14. data/lib/fullfeed/agent/appengine_agent.rb +21 -0
  15. data/lib/fullfeed/agent/base.rb +14 -0
  16. data/lib/fullfeed/agent/mechanize_agent.rb +21 -0
  17. data/lib/fullfeed/agent/open_uri_agent.rb +15 -0
  18. data/lib/fullfeed/extractor/base_extractor.rb +37 -0
  19. data/lib/fullfeed/extractor/extractor_factory.rb +32 -0
  20. data/lib/fullfeed/extractor/text_extractor.rb +18 -0
  21. data/lib/fullfeed/extractor/xpath_extractor.rb +23 -0
  22. data/lib/fullfeed/feed.rb +105 -0
  23. data/lib/fullfeed/filters/base_filter.rb +61 -0
  24. data/lib/fullfeed/filters/convert_encoding_filter.rb +23 -0
  25. data/lib/fullfeed/filters/excess_space_filter.rb +16 -0
  26. data/lib/fullfeed/filters/uppercase_filter.rb +21 -0
  27. data/lib/fullfeed/store/base.rb +21 -0
  28. data/lib/fullfeed/store/db_store.rb +51 -0
  29. data/lib/fullfeed/store/memory_store.rb +26 -0
  30. data/lib/fullfeed/store/store_factory.rb +57 -0
  31. data/lib/fullfeed.rb +50 -0
  32. data/test/load_files.rb +7 -0
  33. data/test/test_agent.rb +25 -0
  34. data/test/test_store.rb +30 -0
  35. metadata +39 -25
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2009 siu [dot] ying [at] gmail [dot] com
2
+
3
+ Permission is hereby granted, free of charge, to any person
4
+ obtaining a copy of this software and associated documentation
5
+ files (the "Software"), to deal in the Software without
6
+ restriction, including without limitation the rights to use,
7
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ copies of the Software, and to permit persons to whom the
9
+ Software is furnished to do so, subject to the following
10
+ conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
19
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
20
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22
+ OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile CHANGED
@@ -1,35 +1,59 @@
1
- require "rubygems"
2
- require "pathname"
3
- require "rake"
4
- require "rake/testtask"
1
+ require 'rake/clean'
2
+ require 'rake/testtask'
3
+ require 'fileutils'
5
4
 
6
- # Gem
7
5
  require "rake/gempackagetask"
8
6
 
9
- NAME = "fullfeed"
10
- SUMMARY = "Fullfeed RSS creator"
11
- GEM_VERSION = "0.4.6"
12
-
13
- spec = Gem::Specification.new do |s|
14
- s.name = NAME
15
- s.summary = s.description = SUMMARY
16
- s.author = "siuying"
17
- s.email = "siu.ying@gmail.com"
18
- s.version = GEM_VERSION
19
- s.platform = Gem::Platform::RUBY
20
- s.require_path = 'lib'
21
- s.files = %w(README Rakefile) + Dir.glob("{examples,lib,test}/**/*")
22
-
23
- # s.executables = ["rackup"]
24
- s.add_dependency('ruby-cache', '>= 0.3.0')
25
- s.add_dependency('hpricot', '>= 0.6.1')
7
+ task :default => :package
8
+
9
+ # PACKAGING ============================================================
10
+
11
+ # Load the gemspec using the same limitations as github
12
+ def spec
13
+ @spec ||=
14
+ begin
15
+ require 'rubygems/specification'
16
+ data = File.read('fullfeed.gemspec')
17
+ spec = nil
18
+ Thread.new { spec = eval("$SAFE = 3\n#{data}") }.join
19
+ spec
20
+ end
26
21
  end
27
22
 
28
23
  Rake::GemPackageTask.new(spec) do |pkg|
29
24
  pkg.gem_spec = spec
30
25
  end
31
26
 
32
- desc "Install the FullFeed as a gem"
27
+ desc "Install the Fullfeed as a gem"
33
28
  task :install => [:repackage] do
34
29
  sh %{gem install pkg/#{spec.name}-#{spec.version}}
35
- end
30
+ end
31
+
32
+ # Gemspec Helpers ====================================================
33
+ def source_version
34
+ line = File.read('lib/fullfeed.rb')[/^\s*VERSION = .*/]
35
+ line.match(/.*VERSION = '(.*)'/)[1]
36
+ end
37
+
38
+ task 'fullfeed.gemspec' => FileList['lib/**','bin/**','examples/**','Rakefile','LICENSE','README'] do |f|
39
+ # read spec file and split out manifest section
40
+ spec = File.read(f.name)
41
+ head, manifest, tail = spec.split(" # = MANIFEST =\n")
42
+ # replace version and date
43
+ head.sub!(/\.version = '.*'/, ".version = '#{source_version}'")
44
+ head.sub!(/\.date = '.*'/, ".date = '#{Date.today.to_s}'")
45
+ # determine file list from git ls-files
46
+ files = `git ls-files`.
47
+ split("\n").
48
+ sort.
49
+ reject{ |file| file =~ /^\./ }.
50
+ reject{ |file| file =~ /^.+\/\./ }.
51
+ reject { |file| file =~ /^doc/ }.
52
+ map{ |file| " #{file}" }.
53
+ join("\n")
54
+ # piece file back together and write...
55
+ manifest = " s.files = %w[\n#{files}\n ]\n"
56
+ spec = [head,manifest,tail].join(" # = MANIFEST =\n")
57
+ File.open(f.name, 'w') { |io| io.write(spec) }
58
+ puts "updated #{f.name}"
59
+ end
@@ -0,0 +1,22 @@
1
+ # Extractor Example
2
+ #
3
+ # create full text RSS feed from Apple Action News (Hong Kong)
4
+
5
+ require "rubygems"
6
+ require "fullfeed"
7
+ require "#{File.dirname(__FILE__)}/extractors/apple_news_extractor"
8
+
9
+ # convert encoding filter: converts the feed and/or the full text items to UTF-8
10
+ # first parameter specify the source RSS Feed encoding
11
+ # second parameter specify the fulltext item HTML page encoding
12
+ filter = Fullfeed::Filters::ConvertEncodingFilter.new("UTF-8", "Big5")
13
+
14
+ # create full text RSS feed from Apple Action News (Hong Kong)
15
+ # At most fetch 5 pages, wait 1 seconds before each try
16
+ feed = Fullfeed::Feed.new("http://rss.appleactionews.com/rss.xml",
17
+ :limit => 5,
18
+ :agent => :open_uri,
19
+ :filters => filter,
20
+ :wait => 1)
21
+ result = feed.fetch
22
+ puts result
@@ -0,0 +1,14 @@
1
+ require "fullfeed"
2
+
3
+ module Fullfeed
4
+ module Extractor
5
+ class AppleNewsExtractor < XpathExtractor
6
+ # register this extractor to the system
7
+ register
8
+
9
+ def initialize
10
+ super(%r{www.appleactionews.com}, ".article")
11
+ end
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,14 @@
1
+ require "fullfeed"
2
+
3
+ module Fullfeed
4
+ module Extractor
5
+ class YahooNewsHongKongExtractor < XpathExtractor
6
+ # register this extractor to the system
7
+ register
8
+
9
+ def initialize
10
+ super(%r{http://hk\.rd\.yahoo.com/news/rss/\*http://.+\.html}, ".livewords")
11
+ end
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,41 @@
1
+ module Fullfeedr
2
+ class FeedController
3
+ CONF = [
4
+ {
5
+ 'name' => "appleactionnews",
6
+ 'url' => "http://rss.appleactionews.com/rss.xml",
7
+ 'filters' => [Fullfeed::Filters::ConvertEncodingFilter.new("UTF-8", "Big5"), Fullfeed::Filters::ExcessSpaceFilter.new]},
8
+ {
9
+ 'name' => "ynews-hk",
10
+ 'url' => "http://hk.news.yahoo.com/rss/hongkong/rss.xml"}
11
+ ].freeze
12
+
13
+ def initialize
14
+ @feeds = {}
15
+ CONF.each do |conf|
16
+ name = conf['name']
17
+ url = conf['url']
18
+ filters = conf['filters'] || []
19
+
20
+ @feeds[name] = Fullfeed::Feed.new(url,
21
+ :limit => 20,
22
+ :wait => 1,
23
+ :filters => filters,
24
+ :agent => :open_uri,
25
+ :store => :db)
26
+ end
27
+ end
28
+
29
+ def fetch(name)
30
+ if @feeds[name]
31
+ @feeds[name].fetch.to_s
32
+ else
33
+ raise ArgumentError, "not a registered name!"
34
+ end
35
+ end
36
+
37
+ def list
38
+ CONF
39
+ end
40
+ end
41
+ end
@@ -0,0 +1,28 @@
1
+ # Web Example
2
+ #
3
+ # Start a web server that hosts the full text feeds configured in FeedController
4
+ # Require sinatra gem
5
+
6
+ require "rubygems"
7
+ gem('fullfeed', '>= 0.4.3')
8
+
9
+ require 'fullfeed'
10
+ require 'sinatra'
11
+ require 'erb'
12
+
13
+ require "#{File.dirname(__FILE__)}/controller/feed_controller"
14
+
15
+ # setup
16
+ Fullfeed::Store::DbStore.setup("sqlite3:ynews.sqlite3", true)
17
+ server = Fullfeedr::FeedController.new
18
+
19
+ # list feeds
20
+ get '/' do
21
+ @list = server.list
22
+ erb :index
23
+ end
24
+
25
+ # fetch pages
26
+ get '/:name' do
27
+ server.fetch(params[:name])
28
+ end
@@ -0,0 +1,19 @@
1
+ <html>
2
+ <head>
3
+ <title>Fullfeedr!</title>
4
+ <link rel="stylesheet" href="http://www.w3.org/StyleSheets/Core/Modernist" type="text/css"/>
5
+ </head>
6
+ <body>
7
+ <h1>Welcome to Fullfeedr!</h1>
8
+ <p>We host following feed here: </p>
9
+ <ol>
10
+ <% for item in @list %>
11
+ <li>
12
+ <a href="/<%= item['name'] %>">/<%= item['name'] %></a>
13
+ (source: <a href="<%= item['url'] %>"><%= item['url'] %></a>)
14
+ </li>
15
+ <% end %>
16
+ </ol>
17
+ <p>(Please wait while the fulltext feed is being downloaded)</p>
18
+ </body>
19
+ </html>
@@ -0,0 +1,29 @@
1
+ # Web Example
2
+ #
3
+ # Start a web server that hosts the Yahoo! Hong Kong news full text feed
4
+ # Require sinatra gem
5
+
6
+ require 'rubygems'
7
+ require 'sinatra'
8
+ require "#{File.dirname(__FILE__)}/../../lib/fullfeed"
9
+ require "#{File.dirname(__FILE__)}/../extractors/yahoo_news_hong_kong_extractor"
10
+
11
+ Fullfeed::Store::DbStore.setup("sqlite3:ynews.sqlite3")
12
+
13
+ #DataMapper.auto_migrate!
14
+
15
+ # Create Yahoo! News HK full-text feed
16
+ feed = Fullfeed::Feed.new("http://hk.news.yahoo.com/rss/hongkong/rss.xml",
17
+ :limit => 20,
18
+ :wait => 1,
19
+ :agent => :open_uri,
20
+ :store => :db)
21
+
22
+ # pre fetch the request
23
+ feed.logger.info "Pre-Fetching RSS, could take some time ..."
24
+ feed.fetch
25
+
26
+ get '/' do
27
+ # fetch updated item
28
+ feed.fetch.to_s
29
+ end
@@ -0,0 +1,12 @@
1
+ # Extractor Example
2
+ #
3
+ # create full text RSS feed from Yahoo! News HK
4
+
5
+ require "rubygems"
6
+ require "fullfeed"
7
+ require "#{File.dirname(__FILE__)}/extractors/yahoo_news_hong_kong_extractor"
8
+
9
+ feed = Fullfeed::Feed.new("http://hk.news.yahoo.com/rss/hongkong/rss.xml")
10
+ puts feed.fetch
11
+
12
+
@@ -0,0 +1,32 @@
1
+ # Extractor Example, using Datamapper
2
+ #
3
+ # Create full text RSS feed from Yahoo! News HK, store RSS in datamapper.
4
+ # Later invocations will not download older items again.
5
+ # Use sqlite3 as backend, use proper adapter for your needs!
6
+ #
7
+ # Uncomment the line under "Migrate database" the first time you run this app (it sets up the database)
8
+ #
9
+
10
+ require "rubygems"
11
+ require "fullfeed"
12
+
13
+ require "#{File.dirname(__FILE__)}/extractors/yahoo_news_hong_kong_extractor"
14
+
15
+ # set up DataMapper
16
+ Fullfeed::Store::DbStore.setup("sqlite3:ynews.sqlite3")
17
+
18
+ # Migrate database, use only once
19
+ #DataMapper.auto_migrate!
20
+
21
+ # create full text RSS feed from Yahoo! News HK
22
+ # At most fetch 20 pages, wait 1 seconds before each try
23
+ feed = Fullfeed::Feed.new("http://hk.news.yahoo.com/rss/hongkong/rss.xml",
24
+ :limit => 20,
25
+ :store => :db,
26
+ :wait => 1)
27
+ result = feed.fetch
28
+ puts result
29
+
30
+ File.open("yahoo.rss", "w") do |file|
31
+ file.write(result)
32
+ end
data/fullfeed.gemspec ADDED
@@ -0,0 +1,58 @@
1
+ Gem::Specification.new do |s|
2
+ s.specification_version = 2 if s.respond_to? :specification_version=
3
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
4
+
5
+ s.name = "fullfeed"
6
+ s.summary = s.description = "Create full text RSS feed from RSS"
7
+
8
+ s.author = "siuying"
9
+ s.email = "siu.ying@gmail.com"
10
+
11
+ s.version = '0.4.8'
12
+ s.date = '2009-06-19'
13
+ s.platform = Gem::Platform::RUBY
14
+ s.require_path = 'lib'
15
+
16
+ # = MANIFEST =
17
+ s.files = %w[
18
+ LICENSE
19
+ README
20
+ Rakefile
21
+ examples/applenews_hk.rb
22
+ examples/extractors/apple_news_extractor.rb
23
+ examples/extractors/yahoo_news_hong_kong_extractor.rb
24
+ examples/server/controller/feed_controller.rb
25
+ examples/server/server.rb
26
+ examples/server/views/index.erb
27
+ examples/sinatra/web.rb
28
+ examples/ynews_hk.rb
29
+ examples/ynews_hk_db.rb
30
+ fullfeed.gemspec
31
+ lib/fullfeed.rb
32
+ lib/fullfeed/agent/agent_factory.rb
33
+ lib/fullfeed/agent/appengine_agent.rb
34
+ lib/fullfeed/agent/base.rb
35
+ lib/fullfeed/agent/mechanize_agent.rb
36
+ lib/fullfeed/agent/open_uri_agent.rb
37
+ lib/fullfeed/extractor/base_extractor.rb
38
+ lib/fullfeed/extractor/extractor_factory.rb
39
+ lib/fullfeed/extractor/text_extractor.rb
40
+ lib/fullfeed/extractor/xpath_extractor.rb
41
+ lib/fullfeed/feed.rb
42
+ lib/fullfeed/filters/base_filter.rb
43
+ lib/fullfeed/filters/convert_encoding_filter.rb
44
+ lib/fullfeed/filters/excess_space_filter.rb
45
+ lib/fullfeed/filters/uppercase_filter.rb
46
+ lib/fullfeed/store/base.rb
47
+ lib/fullfeed/store/db_store.rb
48
+ lib/fullfeed/store/memory_store.rb
49
+ lib/fullfeed/store/store_factory.rb
50
+ test/load_files.rb
51
+ test/test_agent.rb
52
+ test/test_store.rb
53
+ ]
54
+ # = MANIFEST =
55
+
56
+ s.rubygems_version = '1.1.1'
57
+
58
+ end
@@ -0,0 +1,53 @@
1
+ require 'singleton'
2
+
3
+ module Fullfeed
4
+ module Agent
5
# Singleton registry of HTTP agent classes, keyed by a short symbolic name
# derived from the class name (e.g. OpenUriAgent is registered as :open_uri).
class AgentFactory
  include Singleton

  def initialize
    @agents = {}
  end

  # Register an agent class under the name derived from its class name.
  # Registering a second class with the same derived name replaces the first.
  def register(clazz)
    name = to_symbol_name(clazz.name)
    @agents[name.to_sym] = clazz
  end

  # get HTTP agent by name
  # Acceptable agents (when the corresponding class has been registered):
  #  # :open_uri - simplistic HTTP client
  #  # :mechanize - full feature HTTP client with cookies support
  # The name may be given as a Symbol or as a String.
  # Returns a new instance of the registered agent class.
  # Raises ArgumentError when no agent is registered under that name, or
  # when the registered class does not produce a BaseAgent.
  def agent(name = :open_uri)
    # registry keys are symbols, so normalise "open_uri" to :open_uri
    key = name.respond_to?(:to_sym) ? name.to_sym : name
    agent_class = @agents[key]
    if agent_class
      agent = agent_class.new
      return agent if agent.is_a?(BaseAgent)
    end

    raise ArgumentError, "unknown agent name :#{name}, acceptable: #{@agents.keys.inspect}"
  end

  # see instance method agent
  def self.agent(name = :open_uri)
    instance.agent(name)
  end

  private
  #input: a full class name
  #output: the class name lowercased, underscore separated,
  #and removed "_agent" at last part
  #e.g. "Fullfeed::Agent::MechanizeAgent" => "mechanize"
  def to_symbol_name(class_name)
    class_name.
      split("::").
      last.
      gsub(/(.)([A-Z])/, '\1_\2').
      downcase.
      gsub(/_agent$/, '')
  end
end
52
+ end
53
+ end
@@ -0,0 +1,21 @@
1
+ gem('appengine-apis')
2
+ require 'appengine-apis/urlfetch'
3
+
4
+ module Fullfeed
5
+ module Agent
6
+ class AppengineAgent < BaseAgent
7
+ include AppEngine::URLFetch
8
+ register
9
+
10
+ def initialize
11
+ @logger = AppEngine::Logger.new
12
+ end
13
+
14
+ def get(url)
15
+ @logger.info "download link: #{url}"
16
+ result = fetch(url)
17
+ result.urlfetch_body rescue result.body
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,14 @@
1
+ module Fullfeed
2
+ module Agent
3
+ # All agent should implement one method: get
4
+ class BaseAgent
5
+ def self.register
6
+ Fullfeed::Agent::AgentFactory.instance.register(self)
7
+ end
8
+
9
+ def get(url)
10
+ raise "Must override get(url)"
11
+ end
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,21 @@
1
+ require 'mechanize'
2
+
3
+ module Fullfeed
4
+ module Agent
5
+ #Use Mechanize as the agent
6
+ #Support cookies ... etc
7
+ class MechanizeAgent < BaseAgent
8
+ register
9
+
10
+ def initialize
11
+ @agent = WWW::Mechanize.new
12
+ @agent.user_agent_alias = "Mac FireFox"
13
+ end
14
+
15
+ def get(url)
16
+ page = @agent.get(url)
17
+ page.content
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,15 @@
1
+ require 'open-uri'
2
+
3
+ module Fullfeed
4
+ module Agent
5
+ #Use open-uri as the agent
6
+ #Simplistic, but it works
7
+ class OpenUriAgent < BaseAgent
8
+ register
9
+
10
+ def get(url)
11
+ open(url, "User-Agent" => "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; zh-TW; rv:1.9.0.10) Gecko/2009042315 Firefox/3.0.10").read
12
+ end
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,37 @@
1
+ require 'singleton'
2
+
3
+ module Fullfeed
4
+ module Extractor
5
# URL matching helper mixed into the extractor classes.
module ExtractorHelper
  # Tell whether the url matches at least one of the given pattern(s).
  #  # url - a string
  #  # regexps - a single RegExp, or an Array of RegExp
  # Returns true if any pattern matches, false otherwise.
  def is_matched?(url, regexps)
    patterns = regexps.is_a?(Array) ? regexps : [regexps]
    patterns.any? { |pattern| url =~ pattern }
  end
end
17
+
18
+ class BaseExtractor
19
+ include Singleton
20
+ include ExtractorHelper
21
+
22
+ #If this extractor accept this url, if true, use it to parse the page
23
+ def accept(url)
24
+ false
25
+ end
26
+
27
+ # extract text from html document, return the content
28
+ def extract(doc)
29
+ nil
30
+ end
31
+
32
+ def self.register
33
+ ExtractorFactory.instance.register(self)
34
+ end
35
+ end
36
+ end
37
+ end
@@ -0,0 +1,32 @@
1
+ require 'singleton'
2
+
3
+ module Fullfeed
4
+ module Extractor
5
# Singleton registry of extractor classes; picks the extractor to use for a URL.
class ExtractorFactory
  include Singleton

  def initialize
    @extractors = []
  end

  # Add an extractor class to the registry (registration order is kept).
  def register(extractor_class)
    @extractors << extractor_class
  end

  # Remove a previously registered extractor class.
  def unregister(extractor)
    @extractors.delete(extractor)
  end

  # Return the instance of the first registered extractor that accepts the url.
  # When none does, fall back to the TextExtractor instance if it accepts
  # the url; otherwise return nil.
  def extractor(url)
    matching = @extractors.find { |candidate| candidate.instance.accept(url) }
    return matching.instance if matching

    fallback = TextExtractor.instance
    fallback.accept(url) ? fallback : nil
  end
end
30
+
31
+ end
32
+ end
@@ -0,0 +1,18 @@
1
+ module Fullfeed
2
+ module Extractor
3
+ # extract all text from html. this is being use if no other extractor is suitable
4
+ class TextExtractor < BaseExtractor
5
+ PATTERN = [/^http\:.+$/, /^https\:.+$/]
6
+
7
+ def accept(url)
8
+ is_matched?(url, PATTERN)
9
+ end
10
+
11
+ # extract a html document, return the content text
12
+ def extract(doc)
13
+ hdoc = Hpricot(doc)
14
+ text = (hdoc/"//body").inner_text rescue nil
15
+ end
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,23 @@
1
+ module Fullfeed
2
+ module Extractor
3
+ class XpathExtractor < BaseExtractor
4
+ attr_reader :xpath, :pattern
5
+
6
+ def initialize(pattern = nil, xpath = nil)
7
+ @pattern = pattern
8
+ @xpath = xpath
9
+ end
10
+
11
+ def accept(url)
12
+ is_matched?(url, @pattern)
13
+ end
14
+
15
+ # return the inner HTML of the elements matching the configured xpath (nil on error)
16
+ def extract(doc)
17
+ hdoc = Hpricot(doc)
18
+ text = (hdoc.search(@xpath)).inner_html rescue nil
19
+ end
20
+
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,105 @@
1
+ require 'rubygems'
2
+ gem('hpricot', '>= 0.6.1')
3
+ require 'hpricot'
4
+
5
+ module Fullfeed
6
+ class Feed
7
+ attr_reader :url, :encoding, :xml, :item_limit, :store
8
+ attr_accessor :logger
9
+
10
+ def initialize(url, options = {})
11
+ @url = url
12
+ @wait = options[:wait] || 1
13
+ @item_limit = options[:limit] || 50
14
+ @agent_name = options[:agent] || :open_uri
15
+ @store_name = options[:store] || :memory
16
+
17
+
18
+ validate_params
19
+
20
+ @filters = Fullfeed::Filters::FilterChain.new(options[:filters] || [])
21
+ @logger = Logger.new(STDOUT)
22
+ @agent = Fullfeed::Agent::AgentFactory.agent(@agent_name)
23
+ @store = Fullfeed::Store::StoreFactory.store(@url, @item_limit, @store_name)
24
+ end
25
+
26
+
27
+ #Fetch the RSS feed.
28
+ #
29
+ #For each item in the feed, extract the content of the link and replace the description with it.
30
+ #Extraction is based on registered Extractor, check the extractor classes for more information.
31
+ def fetch
32
+ @logger.info "Fetch RSS URL: #{@url}"
33
+ doc = @agent.get(@url).to_s
34
+ doc = @filters.before_doc(doc)
35
+ @xml = Hpricot.XML(doc)
36
+ items = (@xml/"//item")
37
+
38
+ @logger.info "Process elements of RSS (count=#{items.size}, limit=#{@item_limit})"
39
+ items.to_a.first(@item_limit).each do |item|
40
+ process_item(item)
41
+ end
42
+
43
+ @filters.after_doc(@xml)
44
+ end
45
+
46
+ private
47
+ def validate_params
48
+ if @wait <= 0
49
+ raise ArgumentError, "invalid wait `#{@wait}'"
50
+ end
51
+ if @item_limit <= 0
52
+ raise ArgumentError, "invalid limit `#{@item_limit}'"
53
+ end
54
+ end
55
+
56
+ def process_item(item)
57
+ link = (item/"link").first.inner_text rescue nil
58
+ desc = (item/"description").first rescue nil
59
+ guid = (item/"guid").first.inner_text rescue link
60
+
61
+ if link && desc
62
+ begin
63
+ @logger.debug " Extract item (#{guid}) link: #{link}"
64
+ desc.swap("<description>#{Hpricot::Tag::CData.new(extract_cached(guid, link)).to_html}</description>")
65
+ rescue StandardError => e
66
+ @logger.error "Error fetching/replacing content: #{e.inspect}"
67
+
68
+ end
69
+ else
70
+ @logger.warn "No link or desc node found in item: #{item}"
71
+
72
+ end
73
+ end
74
+
75
+ # read cache or fetch result
76
+ def extract_cached(guid, link)
77
+ @store[guid] ||= extract(link)
78
+ end
79
+
80
+
81
+ #Use ExtractorFactory to find a suitable Extractor; if found, download the link and extract its content.
82
+ #If not found, use TextExtractor which extract all text from the page.
83
+ def extract(link)
84
+ extractor = Extractor::ExtractorFactory.instance.extractor(link)
85
+
86
+ begin
87
+ unless extractor.nil?
88
+ @logger.debug " Download link: #{link}"
89
+ doc = @agent.get(link).to_s
90
+ doc = @filters.before_item(doc)
91
+ doc = extractor.extract(doc).strip
92
+ doc = @filters.after_item(doc)
93
+ return doc
94
+ else
95
+ return nil
96
+ end
97
+
98
+ ensure
99
+ @logger.debug " Wait #{@wait} seconds before next URL"
100
+ sleep(@wait) if @wait > 0
101
+
102
+ end
103
+ end
104
+ end
105
+ end
@@ -0,0 +1,61 @@
1
+ module Fullfeed
2
+ module Filters
3
# Runs a list of filters, in order, over the feed document and its items.
# Each filter receives the value returned by the previous one.
class FilterChain
  # filters - a single filter, or an Array of filters
  def initialize(filters)
    @filters = filters.is_a?(Array) ? filters : [filters]
  end

  # run every filter's before_doc hook over the raw feed text
  def before_doc(doc)
    apply(:before_doc, doc)
  end

  # run every filter's after_doc hook over the processed feed document
  def after_doc(doc)
    apply(:after_doc, doc)
  end

  # run every filter's before_item hook over the downloaded item page
  def before_item(item)
    apply(:before_item, item)
  end

  # run every filter's after_item hook over the extracted item content
  def after_item(item)
    apply(:after_item, item)
  end

  private
  # Thread value through the named hook of each filter and return the
  # final result (value itself when there are no filters).
  def apply(hook, value)
    @filters.inject(value) do |current, filter|
      filter.send(hook.to_sym, current)
    end
  end
end
33
+
34
+
35
+ class BaseFilter
36
+ # run before rss is processed
37
+ # doc is html text, should also return html text
38
+ def before_doc(doc)
39
+ doc
40
+ end
41
+
42
+ # run after rss is processed
43
+ # doc is a Hpricot document, should also return a document
44
+ def after_doc(doc)
45
+ doc
46
+ end
47
+
48
+ # run before the item is processed
49
+ # item is HTML text, should also return html text
50
+ def before_item(item)
51
+ item
52
+ end
53
+
54
+ # run after the item is processed
55
+ # item is HTML text, should also return html text
56
+ def after_item(item)
57
+ item
58
+ end
59
+ end
60
+ end
61
+ end
@@ -0,0 +1,23 @@
1
+ require 'iconv'
2
+
3
+ module Fullfeed
4
+ module Filters
5
+ # convert feed to UTF-8 encoding
6
+ class ConvertEncodingFilter < BaseFilter
7
+ def initialize(feed_encoding, item_encoding)
8
+ @feed_encoding = feed_encoding
9
+ @item_encoding = item_encoding
10
+ end
11
+
12
+ # run before rss is processed
13
+ def before_doc(feed)
14
+ Iconv.conv("UTF-8//IGNORE", @feed_encoding, feed)
15
+ end
16
+
17
+ # run after process the item node
18
+ def after_item(item)
19
+ Iconv.conv("UTF-8//IGNORE", @item_encoding, item)
20
+ end
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,16 @@
1
+ require 'iconv'
2
+
3
+ module Fullfeed
4
+ module Filters
5
+ # remove excess spaces between Chinese characters, such as those found on Apple Daily action news pages
6
+ class ExcessSpaceFilter < BaseFilter
7
+ def initialize
8
+ end
9
+
10
# Run after the item has been processed.
# Drops the single space that directly follows any character other than an
# ASCII letter or digit (for example the spaces inserted between Chinese
# characters); spaces after ASCII letters and digits are kept.
# item is text; the filtered text is returned.
def after_item(item)
  # The class must contain the digit range 0-9. The previous "0+9" only
  # listed the literal characters "0", "+" and "9", so spaces after the
  # digits 1-8 were stripped and spaces after "+" were kept.
  item.gsub(/([^a-zA-Z0-9]) /, '\1')
end
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,21 @@
1
+ require 'iconv'
2
+
3
+ module Fullfeed
4
+ module Filters
5
+ # convert feed and item text to upper case
6
+ class UppercaseFilter < BaseFilter
7
+ def initialize
8
+ end
9
+
10
+ # run before rss is processed
11
+ def before_doc(feed)
12
+ feed.upcase
13
+ end
14
+
15
+ # run after process the item node
16
+ def after_item(item)
17
+ item.upcase
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,21 @@
1
+ module Fullfeed
2
+ module Store
3
# Abstract base class for item stores.
# Subclasses call register to add themselves to the StoreFactory and must
# override []= and [].
class BaseStore
  # url - the feed url the store belongs to
  # cache_size - the requested number of items to keep
  # The base implementation ignores both arguments.
  def initialize(url, cache_size)
  end

  # Register the calling class with the StoreFactory.
  def self.register
    StoreFactory.instance.register(self)
  end

  # Save value under key. Must be overridden by subclasses.
  # Takes (key, value) like every assignment through store[key] = value,
  # so that the "Must override" error is raised rather than an arity error.
  def []=(key, value)
    raise "Must override []="
  end

  # Retrieve the value saved under key. Must be overridden by subclasses.
  def [](key)
    raise "Must override []"
  end

end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,51 @@
1
+ require 'rubygems'
2
+ gem('datamapper', '>=0.9.7')
3
+
4
+ require 'dm-core'
5
+ require 'dm-timestamps'
6
+
7
+ module Fullfeed
8
+ module Store
9
+ class DbStore < BaseStore
10
+ register
11
+
12
+ def self.setup(url, automigrate = false)
13
+ DataMapper.setup(:default, url)
14
+ begin
15
+ Item.first
16
+ rescue
17
+ DataMapper.auto_migrate!
18
+ end
19
+ end
20
+
21
+ #Initialize a datamapper store
22
+ def initialize(url, cache_size)
23
+ @url = url
24
+ end
25
+
26
+ #Save or update existing item by key
27
+ def []=(key, value)
28
+ item = Item.first_or_create(:feed_url => @url, :guid => key)
29
+ item.content = value
30
+ item.save
31
+ value
32
+ end
33
+
34
+ #Retrieve an item by key
35
+ def [](key)
36
+ item = Item.first(:feed_url => @url, :guid => key)
37
+ item.content rescue nil
38
+ end
39
+
40
+ end
41
+
42
+ class Item
43
+ include DataMapper::Resource
44
+ property :id, Serial
45
+ property :feed_url, String
46
+ property :guid, String
47
+ property :content, String
48
+ end
49
+ end
50
+ end
51
+
@@ -0,0 +1,26 @@
1
+ require 'rubygems'
2
+ gem('ruby-cache', '>= 0.3.0')
3
+
4
+ require 'cache'
5
+
6
+ module Fullfeed
7
+ module Store
8
+ #Cache in memory, based on Ruby Cache gem
9
+ class MemoryStore < BaseStore
10
+ register
11
+
12
+ def initialize(url, cache_size)
13
+ @cache = Cache.new({:max_num => cache_size})
14
+ end
15
+
16
+ def []=(key, value)
17
+ @cache[key] = value
18
+ end
19
+
20
+ def [](key)
21
+ @cache[key]
22
+ end
23
+
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,57 @@
1
+ module Fullfeed
2
+ module Store
3
+ class StoreFactory
4
+ include Singleton
5
+
6
+ def initialize
7
+ @stores = {}
8
+ end
9
+
10
+ #register a Store to the StoreFactory
11
+ def register(clazz)
12
+ name = to_symbol_name(clazz.name)
13
+ @stores[name.to_sym] = clazz
14
+ end
15
+
16
+ # get class extends BaseCache from a symbol
17
+ # Accetable name:
18
+ # # :memory - store result in memory
19
+ # # :db - store result in database (require DataMapper)
20
+ def store(url, cache_size, name = :memory)
21
+ store_class = @stores[name]
22
+
23
+ if store_class
24
+ if !cache_size || cache_size <= 0
25
+ raise ArgumentError, "invalid store size: #{cache_size}"
26
+ end
27
+
28
+ store = store_class.new(url, cache_size)
29
+ if store.is_a?(BaseStore)
30
+ return store
31
+ end
32
+ end
33
+
34
+ raise ArgumentError, "unknown store name :#{name}, accepatable: #{@stores.keys.inspect}"
35
+ end
36
+
37
+ #see instance method store
38
+ def self.store(url, cache_size, name = :memory)
39
+ instance.store(url, cache_size, name)
40
+ end
41
+
42
+ private
43
+ #input: a full class name
44
+ #output: the class name lowercased, underscore separated,
45
+ #and removed "_store" at last part
46
+ #e.g. "Fullfeed::Store::MemoryStore" => "memory"
47
+ def to_symbol_name(class_name)
48
+ class_name.
49
+ split("::").
50
+ last.
51
+ gsub(/(.)([A-Z])/, '\1_\2').
52
+ downcase.
53
+ gsub(/_store$/, '')
54
+ end
55
+ end
56
+ end
57
+ end
data/lib/fullfeed.rb ADDED
@@ -0,0 +1,50 @@
1
+ path = File.expand_path(File.dirname(__FILE__))
2
+ $:.unshift(path) unless $:.include?(path)
3
+
4
+ module FullFeed
5
+ VERSION = '0.4.7'
6
+ end
7
+
8
+ require 'logger'
9
+
10
+ require "fullfeed/agent/base"
11
+ require "fullfeed/agent/agent_factory"
12
+ require "fullfeed/agent/open_uri_agent"
13
+
14
+ begin
15
+ # optionally require mechanize
16
+ gem('mechanize')
17
+ require "fullfeed/agent/mechanize_agent"
18
+ rescue Gem::LoadError
19
+ end
20
+ begin
21
+ # optionally require appengine-api
22
+ gem('appengine-apis')
23
+ require "fullfeed/agent/appengine_agent"
24
+ rescue Gem::LoadError
25
+ rescue NameError
26
+ end
27
+
28
+ require "fullfeed/extractor/extractor_factory"
29
+ require "fullfeed/extractor/base_extractor"
30
+ require "fullfeed/extractor/text_extractor"
31
+ require "fullfeed/extractor/xpath_extractor"
32
+
33
+
34
+ require "fullfeed/filters/base_filter"
35
+ require "fullfeed/filters/convert_encoding_filter"
36
+ require "fullfeed/filters/uppercase_filter"
37
+ require "fullfeed/filters/excess_space_filter"
38
+
39
+ require "fullfeed/store/base"
40
+ require "fullfeed/store/store_factory"
41
+ require "fullfeed/store/memory_store"
42
+
43
+ # only load DbStore if datamapper is installed
44
+ begin
45
+ gem('datamapper', '>= 0.9.7')
46
+ require "fullfeed/store/db_store"
47
+ rescue Gem::LoadError
48
+ end
49
+
50
+ require "fullfeed/feed"
@@ -0,0 +1,7 @@
1
+ module TestFiles
2
+ Dir.chdir(File.dirname(__FILE__)) do
3
+ Dir['files/*.{html,xhtml,xml}'].each do |fname|
4
+ const_set fname[%r!/(\w+)\.\w+$!, 1].upcase, IO.read(fname)
5
+ end
6
+ end
7
+ end
@@ -0,0 +1,25 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'test/unit'
4
+ require "#{File.dirname(__FILE__)}/../lib/fullfeed"
5
+ require 'load_files'
6
+
7
+ class TestAgent < Test::Unit::TestCase
8
+ def test_agent_factory
9
+ open_uri_agent = Fullfeed::Agent::AgentFactory.instance.agent(:open_uri)
10
+ assert_not_nil open_uri_agent
11
+ end
12
+
13
+ def test_openuri_agent
14
+ agent = Fullfeed::Agent::AgentFactory.instance.agent(:open_uri)
15
+ doc = agent.get('http://www.google.com/')
16
+ assert_not_nil(doc.to_s)
17
+ end
18
+
19
+ def test_mechanize_agent
20
+ agent = Fullfeed::Agent::AgentFactory.instance.agent(:mechanize)
21
+ doc = agent.get('http://www.google.com/')
22
+ assert_not_nil(doc.to_s)
23
+ end
24
+
25
+ end
@@ -0,0 +1,30 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'test/unit'
4
+ require "#{File.dirname(__FILE__)}/../lib/fullfeed"
5
+ require 'load_files'
6
+
7
+ class TestDbStore < Test::Unit::TestCase
8
+ Fullfeed::Store::DbStore.setup("sqlite3:ynews.sqlite3")
9
+
10
+ def test_db_store
11
+ store = Fullfeed::Store::StoreFactory.store('http://test', 100, :db)
12
+ base_test_store(store)
13
+ end
14
+
15
+ def test_memory_store
16
+ store = Fullfeed::Store::StoreFactory.store('http://test', 100, :memory)
17
+ base_test_store(store)
18
+ end
19
+ private
20
+ def base_test_store(store)
21
+
22
+ value = rand().to_s
23
+ store['/100'] = value
24
+ assert_equal store['/100'], value
25
+
26
+ value = "中文測試"
27
+ store['/200'] = value
28
+ assert_equal store['/200'], value
29
+ end
30
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: siuying-fullfeed
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.6
4
+ version: 0.4.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - siuying
@@ -9,30 +9,11 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-05-16 00:00:00 -07:00
12
+ date: 2009-06-19 00:00:00 -07:00
13
13
  default_executable:
14
- dependencies:
15
- - !ruby/object:Gem::Dependency
16
- name: ruby-cache
17
- type: :runtime
18
- version_requirement:
19
- version_requirements: !ruby/object:Gem::Requirement
20
- requirements:
21
- - - ">="
22
- - !ruby/object:Gem::Version
23
- version: 0.3.0
24
- version:
25
- - !ruby/object:Gem::Dependency
26
- name: hpricot
27
- type: :runtime
28
- version_requirement:
29
- version_requirements: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: 0.6.1
34
- version:
35
- description: Fullfeed RSS creator
14
+ dependencies: []
15
+
16
+ description: Create full text RSS feed from RSS
36
17
  email: siu.ying@gmail.com
37
18
  executables: []
38
19
 
@@ -41,8 +22,41 @@ extensions: []
41
22
  extra_rdoc_files: []
42
23
 
43
24
  files:
25
+ - LICENSE
44
26
  - README
45
27
  - Rakefile
28
+ - examples/applenews_hk.rb
29
+ - examples/extractors/apple_news_extractor.rb
30
+ - examples/extractors/yahoo_news_hong_kong_extractor.rb
31
+ - examples/server/controller/feed_controller.rb
32
+ - examples/server/server.rb
33
+ - examples/server/views/index.erb
34
+ - examples/sinatra/web.rb
35
+ - examples/ynews_hk.rb
36
+ - examples/ynews_hk_db.rb
37
+ - fullfeed.gemspec
38
+ - lib/fullfeed.rb
39
+ - lib/fullfeed/agent/agent_factory.rb
40
+ - lib/fullfeed/agent/appengine_agent.rb
41
+ - lib/fullfeed/agent/base.rb
42
+ - lib/fullfeed/agent/mechanize_agent.rb
43
+ - lib/fullfeed/agent/open_uri_agent.rb
44
+ - lib/fullfeed/extractor/base_extractor.rb
45
+ - lib/fullfeed/extractor/extractor_factory.rb
46
+ - lib/fullfeed/extractor/text_extractor.rb
47
+ - lib/fullfeed/extractor/xpath_extractor.rb
48
+ - lib/fullfeed/feed.rb
49
+ - lib/fullfeed/filters/base_filter.rb
50
+ - lib/fullfeed/filters/convert_encoding_filter.rb
51
+ - lib/fullfeed/filters/excess_space_filter.rb
52
+ - lib/fullfeed/filters/uppercase_filter.rb
53
+ - lib/fullfeed/store/base.rb
54
+ - lib/fullfeed/store/db_store.rb
55
+ - lib/fullfeed/store/memory_store.rb
56
+ - lib/fullfeed/store/store_factory.rb
57
+ - test/load_files.rb
58
+ - test/test_agent.rb
59
+ - test/test_store.rb
46
60
  has_rdoc: false
47
61
  homepage:
48
62
  post_install_message:
@@ -68,6 +82,6 @@ rubyforge_project:
68
82
  rubygems_version: 1.2.0
69
83
  signing_key:
70
84
  specification_version: 2
71
- summary: Fullfeed RSS creator
85
+ summary: Create full text RSS feed from RSS
72
86
  test_files: []
73
87