siuying-fullfeed 0.4.6 → 0.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. data/LICENSE +22 -0
  2. data/Rakefile +48 -24
  3. data/examples/applenews_hk.rb +22 -0
  4. data/examples/extractors/apple_news_extractor.rb +14 -0
  5. data/examples/extractors/yahoo_news_hong_kong_extractor.rb +14 -0
  6. data/examples/server/controller/feed_controller.rb +41 -0
  7. data/examples/server/server.rb +28 -0
  8. data/examples/server/views/index.erb +19 -0
  9. data/examples/sinatra/web.rb +29 -0
  10. data/examples/ynews_hk.rb +12 -0
  11. data/examples/ynews_hk_db.rb +32 -0
  12. data/fullfeed.gemspec +58 -0
  13. data/lib/fullfeed/agent/agent_factory.rb +53 -0
  14. data/lib/fullfeed/agent/appengine_agent.rb +21 -0
  15. data/lib/fullfeed/agent/base.rb +14 -0
  16. data/lib/fullfeed/agent/mechanize_agent.rb +21 -0
  17. data/lib/fullfeed/agent/open_uri_agent.rb +15 -0
  18. data/lib/fullfeed/extractor/base_extractor.rb +37 -0
  19. data/lib/fullfeed/extractor/extractor_factory.rb +32 -0
  20. data/lib/fullfeed/extractor/text_extractor.rb +18 -0
  21. data/lib/fullfeed/extractor/xpath_extractor.rb +23 -0
  22. data/lib/fullfeed/feed.rb +105 -0
  23. data/lib/fullfeed/filters/base_filter.rb +61 -0
  24. data/lib/fullfeed/filters/convert_encoding_filter.rb +23 -0
  25. data/lib/fullfeed/filters/excess_space_filter.rb +16 -0
  26. data/lib/fullfeed/filters/uppercase_filter.rb +21 -0
  27. data/lib/fullfeed/store/base.rb +21 -0
  28. data/lib/fullfeed/store/db_store.rb +51 -0
  29. data/lib/fullfeed/store/memory_store.rb +26 -0
  30. data/lib/fullfeed/store/store_factory.rb +57 -0
  31. data/lib/fullfeed.rb +50 -0
  32. data/test/load_files.rb +7 -0
  33. data/test/test_agent.rb +25 -0
  34. data/test/test_store.rb +30 -0
  35. metadata +39 -25
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2009 siu [dot] ying [at] gmail [dot] com
2
+
3
+ Permission is hereby granted, free of charge, to any person
4
+ obtaining a copy of this software and associated documentation
5
+ files (the "Software"), to deal in the Software without
6
+ restriction, including without limitation the rights to use,
7
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ copies of the Software, and to permit persons to whom the
9
+ Software is furnished to do so, subject to the following
10
+ conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
19
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
20
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22
+ OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile CHANGED
@@ -1,35 +1,59 @@
1
- require "rubygems"
2
- require "pathname"
3
- require "rake"
4
- require "rake/testtask"
1
+ require 'rake/clean'
2
+ require 'rake/testtask'
3
+ require 'fileutils'
5
4
 
6
- # Gem
7
5
  require "rake/gempackagetask"
8
6
 
9
- NAME = "fullfeed"
10
- SUMMARY = "Fullfeed RSS creator"
11
- GEM_VERSION = "0.4.6"
12
-
13
- spec = Gem::Specification.new do |s|
14
- s.name = NAME
15
- s.summary = s.description = SUMMARY
16
- s.author = "siuying"
17
- s.email = "siu.ying@gmail.com"
18
- s.version = GEM_VERSION
19
- s.platform = Gem::Platform::RUBY
20
- s.require_path = 'lib'
21
- s.files = %w(README Rakefile) + Dir.glob("{examples,lib,test}/**/*")
22
-
23
- # s.executables = ["rackup"]
24
- s.add_dependency('ruby-cache', '>= 0.3.0')
25
- s.add_dependency('hpricot', '>= 0.6.1')
7
+ task :default => :package
8
+
9
+ # PACKAGING ============================================================
10
+
11
+ # Load the gemspec using the same limitations as github
12
+ def spec
13
+ @spec ||=
14
+ begin
15
+ require 'rubygems/specification'
16
+ data = File.read('fullfeed.gemspec')
17
+ spec = nil
18
+ Thread.new { spec = eval("$SAFE = 3\n#{data}") }.join
19
+ spec
20
+ end
26
21
  end
27
22
 
28
23
  Rake::GemPackageTask.new(spec) do |pkg|
29
24
  pkg.gem_spec = spec
30
25
  end
31
26
 
32
- desc "Install the FullFeed as a gem"
27
+ desc "Install the Fullfeed as a gem"
33
28
  task :install => [:repackage] do
34
29
  sh %{gem install pkg/#{spec.name}-#{spec.version}}
35
- end
30
+ end
31
+
32
+ # Gemspec Helpers ====================================================
33
+ def source_version
34
+ line = File.read('lib/fullfeed.rb')[/^\s*VERSION = .*/]
35
+ line.match(/.*VERSION = '(.*)'/)[1]
36
+ end
37
+
38
+ task 'fullfeed.gemspec' => FileList['lib/**','bin/**','examples/**','Rakefile','LICENSE','README'] do |f|
39
+ # read spec file and split out manifest section
40
+ spec = File.read(f.name)
41
+ head, manifest, tail = spec.split(" # = MANIFEST =\n")
42
+ # replace version and date
43
+ head.sub!(/\.version = '.*'/, ".version = '#{source_version}'")
44
+ head.sub!(/\.date = '.*'/, ".date = '#{Date.today.to_s}'")
45
+ # determine file list from git ls-files
46
+ files = `git ls-files`.
47
+ split("\n").
48
+ sort.
49
+ reject{ |file| file =~ /^\./ }.
50
+ reject{ |file| file =~ /^.+\/\./ }.
51
+ reject { |file| file =~ /^doc/ }.
52
+ map{ |file| " #{file}" }.
53
+ join("\n")
54
+ # piece file back together and write...
55
+ manifest = " s.files = %w[\n#{files}\n ]\n"
56
+ spec = [head,manifest,tail].join(" # = MANIFEST =\n")
57
+ File.open(f.name, 'w') { |io| io.write(spec) }
58
+ puts "updated #{f.name}"
59
+ end
@@ -0,0 +1,22 @@
1
+ # Extractor Example
2
+ #
3
+ # create a full text RSS feed from Apple Action News (HK)
4
+
5
+ require "rubygems"
6
+ require "fullfeed"
7
+ require "#{File.dirname(__FILE__)}/extractors/apple_news_extractor"
8
+
9
+ # convert encoding filter: converts the feed and/or full text items to UTF-8
10
+ # the first parameter specifies the source RSS feed encoding
11
+ # the second parameter specifies the encoding of the full text item HTML pages
12
+ filter = Fullfeed::Filters::ConvertEncodingFilter.new("UTF-8", "Big5")
13
+
14
+ # create a full text RSS feed from Apple Action News (HK)
15
+ # fetch at most 5 pages, waiting 1 second between page downloads
16
+ feed = Fullfeed::Feed.new("http://rss.appleactionews.com/rss.xml",
17
+ :limit => 5,
18
+ :agent => :open_uri,
19
+ :filters => filter,
20
+ :wait => 1)
21
+ result = feed.fetch
22
+ puts result
@@ -0,0 +1,14 @@
1
+ require "fullfeed"
2
+
3
+ module Fullfeed
4
+ module Extractor
5
+ class AppleNewsExtractor < XpathExtractor
6
+ # register this extractor to the system
7
+ register
8
+
9
+ def initialize
10
+ super(%r{www.appleactionews.com}, ".article")
11
+ end
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,14 @@
1
+ require "fullfeed"
2
+
3
+ module Fullfeed
4
+ module Extractor
5
+ class YahooNewsHongKongExtractor < XpathExtractor
6
+ # register this extractor to the system
7
+ register
8
+
9
+ def initialize
10
+ super(%r{http://hk\.rd\.yahoo.com/news/rss/\*http://.+\.html}, ".livewords")
11
+ end
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,41 @@
1
+ module Fullfeedr
2
+ class FeedController
3
+ CONF = [
4
+ {
5
+ 'name' => "appleactionnews",
6
+ 'url' => "http://rss.appleactionews.com/rss.xml",
7
+ 'filters' => [Fullfeed::Filters::ConvertEncodingFilter.new("UTF-8", "Big5"), Fullfeed::Filters::ExcessSpaceFilter.new]},
8
+ {
9
+ 'name' => "ynews-hk",
10
+ 'url' => "http://hk.news.yahoo.com/rss/hongkong/rss.xml"}
11
+ ].freeze
12
+
13
+ def initialize
14
+ @feeds = {}
15
+ CONF.each do |conf|
16
+ name = conf['name']
17
+ url = conf['url']
18
+ filters = conf['filters'] || []
19
+
20
+ @feeds[name] = Fullfeed::Feed.new(url,
21
+ :limit => 20,
22
+ :wait => 1,
23
+ :filters => filters,
24
+ :agent => :open_uri,
25
+ :store => :db)
26
+ end
27
+ end
28
+
29
+ def fetch(name)
30
+ if @feeds[name]
31
+ @feeds[name].fetch.to_s
32
+ else
33
+ raise ArgumentError, "not a registered name!"
34
+ end
35
+ end
36
+
37
+ def list
38
+ CONF
39
+ end
40
+ end
41
+ end
@@ -0,0 +1,28 @@
1
+ # Web Example
2
+ #
3
+ # Start a web server that hosts the full text feeds configured in controller/feed_controller.rb
4
+ # Requires the sinatra gem
5
+
6
+ require "rubygems"
7
+ gem('fullfeed', '>= 0.4.3')
8
+
9
+ require 'fullfeed'
10
+ require 'sinatra'
11
+ require 'erb'
12
+
13
+ require "#{File.dirname(__FILE__)}/controller/feed_controller"
14
+
15
+ # setup
16
+ Fullfeed::Store::DbStore.setup("sqlite3:ynews.sqlite3", true)
17
+ server = Fullfeedr::FeedController.new
18
+
19
+ # list feeds
20
+ get '/' do
21
+ @list = server.list
22
+ erb :index
23
+ end
24
+
25
+ # fetch pages
26
+ get '/:name' do
27
+ server.fetch(params[:name])
28
+ end
@@ -0,0 +1,19 @@
1
+ <html>
2
+ <head>
3
+ <title>Fullfeedr!</title>
4
+ <link rel="stylesheet" href="http://www.w3.org/StyleSheets/Core/Modernist" type="text/css"/>
5
+ </head>
6
+ <body>
7
+ <h1>Welcome to Fullfeedr!</h1>
8
+ <p>We host following feed here: </p>
9
+ <ol>
10
+ <% for item in @list %>
11
+ <li>
12
+ <a href="/<%= item['name'] %>">/<%= item['name'] %></a>
13
+ (source: <a href="<%= item['url'] %>"><%= item['url'] %></a>)
14
+ </li>
15
+ <% end %>
16
+ </ol>
17
+ <p>(Please wait while the fulltext feed is being downloaded)</p>
18
+ </body>
19
+ </html>
@@ -0,0 +1,29 @@
1
+ # Web Example
2
+ #
3
+ # Start a web server that hosts the Yahoo! Hong Kong news as a full text feed
4
+ # Requires the sinatra gem
5
+
6
+ require 'rubygems'
7
+ require 'sinatra'
8
+ require "#{File.dirname(__FILE__)}/../../lib/fullfeed"
9
+ require "#{File.dirname(__FILE__)}/../extractors/yahoo_news_hong_kong_extractor"
10
+
11
+ Fullfeed::Store::DbStore.setup("sqlite3:ynews.sqlite3")
12
+
13
+ #DataMapper.auto_migrate!
14
+
15
+ # Create Yahoo! News HK full-text feed
16
+ feed = Fullfeed::Feed.new("http://hk.news.yahoo.com/rss/hongkong/rss.xml",
17
+ :limit => 20,
18
+ :wait => 1,
19
+ :agent => :open_uri,
20
+ :store => :db)
21
+
22
+ # pre fetch the request
23
+ feed.logger.info "Pre-Fetching RSS, could take some time ..."
24
+ feed.fetch
25
+
26
+ get '/' do
27
+ # fetch updated item
28
+ feed.fetch.to_s
29
+ end
@@ -0,0 +1,12 @@
1
+ # Extractor Example
2
+ #
3
+ # create full text RSS feed from Yahoo! News HK
4
+
5
+ require "rubygems"
6
+ require "fullfeed"
7
+ require "#{File.dirname(__FILE__)}/extractors/yahoo_news_hong_kong_extractor"
8
+
9
+ feed = Fullfeed::Feed.new("http://hk.news.yahoo.com/rss/hongkong/rss.xml")
10
+ puts feed.fetch
11
+
12
+
@@ -0,0 +1,32 @@
1
+ # Extractor Example, using Datamapper
2
+ #
3
+ # Create a full text RSS feed from Yahoo! News HK and store the items via DataMapper.
4
+ # Later invocations will not download older items again.
5
+ # Uses sqlite3 as the backend; use the proper adapter for your needs!
6
+ #
7
+ # Uncomment the line under "Migrate database" when running this app for the first time (it sets up the database)
8
+ #
9
+
10
+ require "rubygems"
11
+ require "fullfeed"
12
+
13
+ require "#{File.dirname(__FILE__)}/extractors/yahoo_news_hong_kong_extractor"
14
+
15
+ # set up DataMapper
16
+ Fullfeed::Store::DbStore.setup("sqlite3:ynews.sqlite3")
17
+
18
+ # Migrate database, use only once
19
+ #DataMapper.auto_migrate!
20
+
21
+ # create full text RSS feed from Yahoo! News HK
22
+ # fetch at most 20 pages, waiting 1 second between page downloads
23
+ feed = Fullfeed::Feed.new("http://hk.news.yahoo.com/rss/hongkong/rss.xml",
24
+ :limit => 20,
25
+ :store => :db,
26
+ :wait => 1)
27
+ result = feed.fetch
28
+ puts result
29
+
30
+ File.open("yahoo.rss", "w") do |file|
31
+ file.write(result)
32
+ end
data/fullfeed.gemspec ADDED
@@ -0,0 +1,58 @@
1
+ Gem::Specification.new do |s|
2
+ s.specification_version = 2 if s.respond_to? :specification_version=
3
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
4
+
5
+ s.name = "fullfeed"
6
+ s.summary = s.description = "Create full text RSS feed from RSS"
7
+
8
+ s.author = "siuying"
9
+ s.email = "siu.ying@gmail.com"
10
+
11
+ s.version = '0.4.8'
12
+ s.date = '2009-06-19'
13
+ s.platform = Gem::Platform::RUBY
14
+ s.require_path = 'lib'
15
+
16
+ # = MANIFEST =
17
+ s.files = %w[
18
+ LICENSE
19
+ README
20
+ Rakefile
21
+ examples/applenews_hk.rb
22
+ examples/extractors/apple_news_extractor.rb
23
+ examples/extractors/yahoo_news_hong_kong_extractor.rb
24
+ examples/server/controller/feed_controller.rb
25
+ examples/server/server.rb
26
+ examples/server/views/index.erb
27
+ examples/sinatra/web.rb
28
+ examples/ynews_hk.rb
29
+ examples/ynews_hk_db.rb
30
+ fullfeed.gemspec
31
+ lib/fullfeed.rb
32
+ lib/fullfeed/agent/agent_factory.rb
33
+ lib/fullfeed/agent/appengine_agent.rb
34
+ lib/fullfeed/agent/base.rb
35
+ lib/fullfeed/agent/mechanize_agent.rb
36
+ lib/fullfeed/agent/open_uri_agent.rb
37
+ lib/fullfeed/extractor/base_extractor.rb
38
+ lib/fullfeed/extractor/extractor_factory.rb
39
+ lib/fullfeed/extractor/text_extractor.rb
40
+ lib/fullfeed/extractor/xpath_extractor.rb
41
+ lib/fullfeed/feed.rb
42
+ lib/fullfeed/filters/base_filter.rb
43
+ lib/fullfeed/filters/convert_encoding_filter.rb
44
+ lib/fullfeed/filters/excess_space_filter.rb
45
+ lib/fullfeed/filters/uppercase_filter.rb
46
+ lib/fullfeed/store/base.rb
47
+ lib/fullfeed/store/db_store.rb
48
+ lib/fullfeed/store/memory_store.rb
49
+ lib/fullfeed/store/store_factory.rb
50
+ test/load_files.rb
51
+ test/test_agent.rb
52
+ test/test_store.rb
53
+ ]
54
+ # = MANIFEST =
55
+
56
+ s.rubygems_version = '1.1.1'
57
+
58
+ end
@@ -0,0 +1,53 @@
1
+ require 'singleton'
2
+
3
+ module Fullfeed
4
+ module Agent
5
+ class AgentFactory
6
+ include Singleton
7
+
8
+ def initialize
9
+ @agents = {}
10
+ end
11
+
12
+ def register(clazz)
13
+ name = to_symbol_name(clazz.name)
14
+ @agents[name.to_sym] = clazz
15
+ end
16
+
17
+ # get an HTTP agent by symbol
18
+ # Acceptable agents:
19
+ # # :open_uri - simplistic HTTP client
20
+ # # :mechanize - full-featured HTTP client with cookie support
21
+ def agent(name = :open_uri)
22
+ agent_class = @agents[name]
23
+ if agent_class
24
+ agent = agent_class.new
25
+
26
+ if agent.is_a?(BaseAgent)
27
+ return agent
28
+ end
29
+ end
30
+
31
+ raise ArgumentError, "unknown agent name :#{name}, accepatable: #{@agents.keys.inspect}"
32
+ end
33
+
34
+ def self.agent(name = :open_uri)
35
+ instance.agent(name)
36
+ end
37
+
38
+ private
39
+ #input: a full class name
40
+ #output: the class name lowercased, underscore separated,
41
+ #and removed "_agent" at last part
42
+ #e.g. "Fullfeed::Agent::MechanizeAgent" => "mechanize"
43
+ def to_symbol_name(class_name)
44
+ class_name.
45
+ split("::").
46
+ last.
47
+ gsub(/(.)([A-Z])/, '\1_\2').
48
+ downcase.
49
+ gsub(/_agent$/, '')
50
+ end
51
+ end
52
+ end
53
+ end
@@ -0,0 +1,21 @@
1
+ gem('appengine-apis')
2
+ require 'appengine-apis/urlfetch'
3
+
4
+ module Fullfeed
5
+ module Agent
6
+ class AppengineAgent < BaseAgent
7
+ include AppEngine::URLFetch
8
+ register
9
+
10
+ def initialize
11
+ @logger = AppEngine::Logger.new
12
+ end
13
+
14
+ def get(url)
15
+ @logger.info "download link: #{url}"
16
+ result = fetch(url)
17
+ result.urlfetch_body rescue result.body
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,14 @@
1
+ module Fullfeed
2
+ module Agent
3
+ # All agents should implement one method: get
4
+ class BaseAgent
5
+ def self.register
6
+ Fullfeed::Agent::AgentFactory.instance.register(self)
7
+ end
8
+
9
+ def get(url)
10
+ raise "Must override get(url)"
11
+ end
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,21 @@
1
+ require 'mechanize'
2
+
3
+ module Fullfeed
4
+ module Agent
5
+ #Use Mechanize as the agent
6
+ #Support cookies ... etc
7
+ class MechanizeAgent < BaseAgent
8
+ register
9
+
10
+ def initialize
11
+ @agent = WWW::Mechanize.new
12
+ @agent.user_agent_alias = "Mac FireFox"
13
+ end
14
+
15
+ def get(url)
16
+ page = @agent.get(url)
17
+ page.content
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,15 @@
1
+ require 'open-uri'
2
+
3
+ module Fullfeed
4
+ module Agent
5
+ #Use open-uri as the agent
6
+ #Simplistic, but it works
7
+ class OpenUriAgent < BaseAgent
8
+ register
9
+
10
+ def get(url)
11
+ open(url, "User-Agent" => "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; zh-TW; rv:1.9.0.10) Gecko/2009042315 Firefox/3.0.10").read
12
+ end
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,37 @@
1
+ require 'singleton'
2
+
3
+ module Fullfeed
4
+ module Extractor
5
+ module ExtractorHelper
6
+ #Returns true if the url matches any of the given RegExp(s).
7
+ # # url - a string
8
+ # # regexps - an Array of RegExp, or a RegExp
9
+ def is_matched?(url, regexps)
10
+ regexps = [regexps] unless regexps.is_a? Array
11
+ regexps.each do |rexexp|
12
+ return true if url =~ rexexp
13
+ end
14
+ return false
15
+ end
16
+ end
17
+
18
+ class BaseExtractor
19
+ include Singleton
20
+ include ExtractorHelper
21
+
22
+ #Whether this extractor accepts the url; if true, it is used to parse the page
23
+ def accept(url)
24
+ false
25
+ end
26
+
27
+ # extract text from html document, return the content
28
+ def extract(doc)
29
+ nil
30
+ end
31
+
32
+ def self.register
33
+ ExtractorFactory.instance.register(self)
34
+ end
35
+ end
36
+ end
37
+ end
@@ -0,0 +1,32 @@
1
+ require 'singleton'
2
+
3
+ module Fullfeed
4
+ module Extractor
5
+ class ExtractorFactory
6
+ include Singleton
7
+
8
+ def initialize
9
+ @extractors = []
10
+ end
11
+
12
+ def register(extractor_class)
13
+ @extractors << extractor_class
14
+ end
15
+
16
+ def unregister(extractor)
17
+ @extractors.delete(extractor)
18
+ end
19
+
20
+ def extractor(url)
21
+ extractors = @extractors.select() {|e| e.instance.accept(url) }
22
+ return extractors.first.instance if extractors.size > 0
23
+
24
+ # if no extractors accept the above URL, use default TextExtractor
25
+ default = TextExtractor.instance
26
+ return default if default.accept(url)
27
+ return nil
28
+ end
29
+ end
30
+
31
+ end
32
+ end
@@ -0,0 +1,18 @@
1
+ module Fullfeed
2
+ module Extractor
3
+ # extract all text from html. this is used if no other extractor is suitable
4
+ class TextExtractor < BaseExtractor
5
+ PATTERN = [/^http\:.+$/, /^https\:.+$/]
6
+
7
+ def accept(url)
8
+ is_matched?(url, PATTERN)
9
+ end
10
+
11
+ # extract a html document, return the content text
12
+ def extract(doc)
13
+ hdoc = Hpricot(doc)
14
+ text = (hdoc/"//body").inner_text rescue nil
15
+ end
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,23 @@
1
+ module Fullfeed
2
+ module Extractor
3
+ class XpathExtractor < BaseExtractor
4
+ attr_reader :xpath, :pattern
5
+
6
+ def initialize(pattern = nil, xpath = nil)
7
+ @pattern = pattern
8
+ @xpath = xpath
9
+ end
10
+
11
+ def accept(url)
12
+ is_matched?(url, @pattern)
13
+ end
14
+
15
+ # return the inner HTML of the elements matching the configured xpath, or nil on error
16
+ def extract(doc)
17
+ hdoc = Hpricot(doc)
18
+ text = (hdoc.search(@xpath)).inner_html rescue nil
19
+ end
20
+
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,105 @@
1
+ require 'rubygems'
2
+ gem('hpricot', '>= 0.6.1')
3
+ require 'hpricot'
4
+
5
+ module Fullfeed
6
+ class Feed
7
+ attr_reader :url, :encoding, :xml, :item_limit, :store
8
+ attr_accessor :logger
9
+
10
+ def initialize(url, options = {})
11
+ @url = url
12
+ @wait = options[:wait] || 1
13
+ @item_limit = options[:limit] || 50
14
+ @agent_name = options[:agent] || :open_uri
15
+ @store_name = options[:store] || :memory
16
+
17
+
18
+ validate_params
19
+
20
+ @filters = Fullfeed::Filters::FilterChain.new(options[:filters] || [])
21
+ @logger = Logger.new(STDOUT)
22
+ @agent = Fullfeed::Agent::AgentFactory.agent(@agent_name)
23
+ @store = Fullfeed::Store::StoreFactory.store(@url, @item_limit, @store_name)
24
+ end
25
+
26
+
27
+ #Fetch the RSS feed.
28
+ #
29
+ #For each item in the feed, extract the content of the link and replace the description with it.
30
+ #Extraction is based on registered Extractor, check the extractor classes for more information.
31
+ def fetch
32
+ @logger.info "Fetch RSS URL: #{@url}"
33
+ doc = @agent.get(@url).to_s
34
+ doc = @filters.before_doc(doc)
35
+ @xml = Hpricot.XML(doc)
36
+ items = (@xml/"//item")
37
+
38
+ @logger.info "Process elements of RSS (count=#{items.size}, limit=#{@item_limit})"
39
+ items.to_a.first(@item_limit).each do |item|
40
+ process_item(item)
41
+ end
42
+
43
+ @filters.after_doc(@xml)
44
+ end
45
+
46
+ private
47
+ def validate_params
48
+ if @wait <= 0
49
+ raise ArgumentError, "invalid wait `#{@wait}'"
50
+ end
51
+ if @item_limit <= 0
52
+ raise ArgumentError, "invalid limit `#{@item_limit}'"
53
+ end
54
+ end
55
+
56
+ def process_item(item)
57
+ link = (item/"link").first.inner_text rescue nil
58
+ desc = (item/"description").first rescue nil
59
+ guid = (item/"guid").first.inner_text rescue link
60
+
61
+ if link && desc
62
+ begin
63
+ @logger.debug " Extract item (#{guid}) link: #{link}"
64
+ desc.swap("<description>#{Hpricot::Tag::CData.new(extract_cached(guid, link)).to_html}</description>")
65
+ rescue StandardError => e
66
+ @logger.error "Error fetching/replacing content: #{e.inspect}"
67
+
68
+ end
69
+ else
70
+ @logger.warn "No link or desc node found in item: #{item}"
71
+
72
+ end
73
+ end
74
+
75
+ # read cache or fetch result
76
+ def extract_cached(guid, link)
77
+ @store[guid] ||= extract(link)
78
+ end
79
+
80
+
81
+ #Use ExtractorFactory to find a suitable Extractor; if found, download the link and extract its content.
82
+ #If no registered extractor accepts the link, TextExtractor (which extracts all text from the page) is used.
83
+ def extract(link)
84
+ extractor = Extractor::ExtractorFactory.instance.extractor(link)
85
+
86
+ begin
87
+ unless extractor.nil?
88
+ @logger.debug " Download link: #{link}"
89
+ doc = @agent.get(link).to_s
90
+ doc = @filters.before_item(doc)
91
+ doc = extractor.extract(doc).strip
92
+ doc = @filters.after_item(doc)
93
+ return doc
94
+ else
95
+ return nil
96
+ end
97
+
98
+ ensure
99
+ @logger.debug " Wait #{@wait} seconds before next URL"
100
+ sleep(@wait) if @wait > 0
101
+
102
+ end
103
+ end
104
+ end
105
+ end
@@ -0,0 +1,61 @@
1
+ module Fullfeed
2
+ module Filters
3
+ class FilterChain
4
+ def initialize(filters)
5
+ filters = [filters] unless filters.is_a? Array
6
+ @filters = filters
7
+ end
8
+
9
+ def before_doc(doc)
10
+ run_filters(@filters, :before_doc, doc)
11
+ end
12
+
13
+ def after_doc(doc)
14
+ run_filters(@filters, :after_doc, doc)
15
+ end
16
+
17
+ def before_item(item)
18
+ run_filters(@filters, :before_item, item)
19
+ end
20
+
21
+ def after_item(item)
22
+ run_filters(@filters, :after_item, item)
23
+ end
24
+
25
+ private
26
+ def run_filters(filters, method, target)
27
+ filters.each do |f|
28
+ target = f.send(method.to_sym, target)
29
+ end
30
+ target
31
+ end
32
+ end
33
+
34
+
35
+ class BaseFilter
36
+ # run before rss is processed
37
+ # doc is html text, should also return html text
38
+ def before_doc(doc)
39
+ doc
40
+ end
41
+
42
+ # run after rss is processed
43
+ # doc is a Hpricot document, should also return a document
44
+ def after_doc(doc)
45
+ doc
46
+ end
47
+
48
+ # run before the item is processed
49
+ # item is HTML text, should also return html text
50
+ def before_item(item)
51
+ item
52
+ end
53
+
54
+ # run after the item is processed
55
+ # item is HTML text, should also return html text
56
+ def after_item(item)
57
+ item
58
+ end
59
+ end
60
+ end
61
+ end
@@ -0,0 +1,23 @@
1
+ require 'iconv'
2
+
3
+ module Fullfeed
4
+ module Filters
5
+ # convert feed to UTF-8 encoding
6
+ class ConvertEncodingFilter < BaseFilter
7
+ def initialize(feed_encoding, item_encoding)
8
+ @feed_encoding = feed_encoding
9
+ @item_encoding = item_encoding
10
+ end
11
+
12
+ # run before rss is processed
13
+ def before_doc(feed)
14
+ Iconv.conv("UTF-8//IGNORE", @feed_encoding, feed)
15
+ end
16
+
17
+ # run after the item node is processed
18
+ def after_item(item)
19
+ Iconv.conv("UTF-8//IGNORE", @item_encoding, item)
20
+ end
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,16 @@
1
+ require 'iconv'
2
+
3
+ module Fullfeed
4
+ module Filters
5
+ # remove excess spaces between Chinese characters, e.g. on Apple Daily action news pages
6
+ class ExcessSpaceFilter < BaseFilter
7
+ def initialize
8
+ end
9
+
10
+ # run after process the item node
11
+ def after_item(item)
12
+ item.gsub(/([^a-zA-Z0+9]) /, '\1')
13
+ end
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,21 @@
1
+ require 'iconv'
2
+
3
+ module Fullfeed
4
+ module Filters
5
+ # convert the feed and item text to upper case
6
+ class UppercaseFilter < BaseFilter
7
+ def initialize
8
+ end
9
+
10
+ # run before rss is processed
11
+ def before_doc(feed)
12
+ feed.upcase
13
+ end
14
+
15
+ # run after the item node is processed
16
+ def after_item(item)
17
+ item.upcase
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,21 @@
1
+ module Fullfeed
2
+ module Store
3
+ class BaseStore
4
+ def initialize(url, cache_size)
5
+ end
6
+
7
+ def self.register
8
+ StoreFactory.instance.register(self)
9
+ end
10
+
11
+ def []=(args)
12
+ raise "Must override []="
13
+ end
14
+
15
+ def [](args)
16
+ raise "Must override []"
17
+ end
18
+
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,51 @@
1
+ require 'rubygems'
2
+ gem('datamapper', '>=0.9.7')
3
+
4
+ require 'dm-core'
5
+ require 'dm-timestamps'
6
+
7
+ module Fullfeed
8
+ module Store
9
+ class DbStore < BaseStore
10
+ register
11
+
12
+ def self.setup(url, automigrate = false)
13
+ DataMapper.setup(:default, url)
14
+ begin
15
+ Item.first
16
+ rescue
17
+ DataMapper.auto_migrate!
18
+ end
19
+ end
20
+
21
+ #Initialize a datamapper store
22
+ def initialize(url, cache_size)
23
+ @url = url
24
+ end
25
+
26
+ #Save or update existing item by key
27
+ def []=(key, value)
28
+ item = Item.first_or_create(:feed_url => @url, :guid => key)
29
+ item.content = value
30
+ item.save
31
+ value
32
+ end
33
+
34
+ #Retrieve an item by key
35
+ def [](key)
36
+ item = Item.first(:feed_url => @url, :guid => key)
37
+ item.content rescue nil
38
+ end
39
+
40
+ end
41
+
42
+ class Item
43
+ include DataMapper::Resource
44
+ property :id, Serial
45
+ property :feed_url, String
46
+ property :guid, String
47
+ property :content, String
48
+ end
49
+ end
50
+ end
51
+
@@ -0,0 +1,26 @@
1
+ require 'rubygems'
2
+ gem('ruby-cache', '>= 0.3.0')
3
+
4
+ require 'cache'
5
+
6
+ module Fullfeed
7
+ module Store
8
+ #Cache in memory, based on Ruby Cache gem
9
+ class MemoryStore < BaseStore
10
+ register
11
+
12
+ def initialize(url, cache_size)
13
+ @cache = Cache.new({:max_num => cache_size})
14
+ end
15
+
16
+ def []=(key, value)
17
+ @cache[key] = value
18
+ end
19
+
20
+ def [](key)
21
+ @cache[key]
22
+ end
23
+
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,57 @@
1
+ module Fullfeed
2
+ module Store
3
+ class StoreFactory
4
+ include Singleton
5
+
6
+ def initialize
7
+ @stores = {}
8
+ end
9
+
10
+ #register a Store to the StoreFactory
11
+ def register(clazz)
12
+ name = to_symbol_name(clazz.name)
13
+ @stores[name.to_sym] = clazz
14
+ end
15
+
16
+ # get a store instance (a BaseStore subclass) by symbol
17
+ # Acceptable names:
18
+ # # :memory - store result in memory
19
+ # # :db - store result in database (require DataMapper)
20
+ def store(url, cache_size, name = :memory)
21
+ store_class = @stores[name]
22
+
23
+ if store_class
24
+ if !cache_size || cache_size <= 0
25
+ raise ArgumentError, "invalid store size: #{cache_size}"
26
+ end
27
+
28
+ store = store_class.new(url, cache_size)
29
+ if store.is_a?(BaseStore)
30
+ return store
31
+ end
32
+ end
33
+
34
+ raise ArgumentError, "unknown store name :#{name}, accepatable: #{@stores.keys.inspect}"
35
+ end
36
+
37
+ #see instance method store
38
+ def self.store(url, cache_size, name = :memory)
39
+ instance.store(url, cache_size, name)
40
+ end
41
+
42
+ private
43
+ #input: a full class name
44
+ #output: the class name lowercased, underscore separated,
45
+ #and removed "_store" at last part
46
+ #e.g. "Fullfeed::Store::MemoryStore" => "memory"
47
+ def to_symbol_name(class_name)
48
+ class_name.
49
+ split("::").
50
+ last.
51
+ gsub(/(.)([A-Z])/, '\1_\2').
52
+ downcase.
53
+ gsub(/_store$/, '')
54
+ end
55
+ end
56
+ end
57
+ end
data/lib/fullfeed.rb ADDED
@@ -0,0 +1,50 @@
1
+ path = File.expand_path(File.dirname(__FILE__))
2
+ $:.unshift(path) unless $:.include?(path)
3
+
4
+ module FullFeed
5
+ VERSION = '0.4.7'
6
+ end
7
+
8
+ require 'logger'
9
+
10
+ require "fullfeed/agent/base"
11
+ require "fullfeed/agent/agent_factory"
12
+ require "fullfeed/agent/open_uri_agent"
13
+
14
+ begin
15
+ # optionally require mechanize
16
+ gem('mechanize')
17
+ require "fullfeed/agent/mechanize_agent"
18
+ rescue Gem::LoadError
19
+ end
20
+ begin
21
+ # optionally require appengine-api
22
+ gem('appengine-apis')
23
+ require "fullfeed/agent/appengine_agent"
24
+ rescue Gem::LoadError
25
+ rescue NameError
26
+ end
27
+
28
+ require "fullfeed/extractor/extractor_factory"
29
+ require "fullfeed/extractor/base_extractor"
30
+ require "fullfeed/extractor/text_extractor"
31
+ require "fullfeed/extractor/xpath_extractor"
32
+
33
+
34
+ require "fullfeed/filters/base_filter"
35
+ require "fullfeed/filters/convert_encoding_filter"
36
+ require "fullfeed/filters/uppercase_filter"
37
+ require "fullfeed/filters/excess_space_filter"
38
+
39
+ require "fullfeed/store/base"
40
+ require "fullfeed/store/store_factory"
41
+ require "fullfeed/store/memory_store"
42
+
43
+ # only load DbStore if datamapper is installed
44
+ begin
45
+ gem('datamapper', '>= 0.9.7')
46
+ require "fullfeed/store/db_store"
47
+ rescue Gem::LoadError
48
+ end
49
+
50
+ require "fullfeed/feed"
@@ -0,0 +1,7 @@
1
+ module TestFiles
2
+ Dir.chdir(File.dirname(__FILE__)) do
3
+ Dir['files/*.{html,xhtml,xml}'].each do |fname|
4
+ const_set fname[%r!/(\w+)\.\w+$!, 1].upcase, IO.read(fname)
5
+ end
6
+ end
7
+ end
@@ -0,0 +1,25 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'test/unit'
4
+ require "#{File.dirname(__FILE__)}/../lib/fullfeed"
5
+ require 'load_files'
6
+
7
+ class TestAgent < Test::Unit::TestCase
8
+ def test_agent_factory
9
+ open_uri_agent = Fullfeed::Agent::AgentFactory.instance.agent(:open_uri)
10
+ assert_not_nil open_uri_agent
11
+ end
12
+
13
+ def test_openuri_agent
14
+ agent = Fullfeed::Agent::AgentFactory.instance.agent(:open_uri)
15
+ doc = agent.get('http://www.google.com/')
16
+ assert_not_nil(doc.to_s)
17
+ end
18
+
19
+ def test_mechanize_agent
20
+ agent = Fullfeed::Agent::AgentFactory.instance.agent(:mechanize)
21
+ doc = agent.get('http://www.google.com/')
22
+ assert_not_nil(doc.to_s)
23
+ end
24
+
25
+ end
@@ -0,0 +1,30 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'test/unit'
4
+ require "#{File.dirname(__FILE__)}/../lib/fullfeed"
5
+ require 'load_files'
6
+
7
+ class TestDbStore < Test::Unit::TestCase
8
+ Fullfeed::Store::DbStore.setup("sqlite3:ynews.sqlite3")
9
+
10
+ def test_db_store
11
+ store = Fullfeed::Store::StoreFactory.store('http://test', 100, :db)
12
+ base_test_store(store)
13
+ end
14
+
15
+ def test_memory_store
16
+ store = Fullfeed::Store::StoreFactory.store('http://test', 100, :memory)
17
+ base_test_store(store)
18
+ end
19
+ private
20
+ def base_test_store(store)
21
+
22
+ value = rand().to_s
23
+ store['/100'] = value
24
+ assert_equal store['/100'], value
25
+
26
+ value = "中文測試"
27
+ store['/200'] = value
28
+ assert_equal store['/200'], value
29
+ end
30
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: siuying-fullfeed
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.6
4
+ version: 0.4.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - siuying
@@ -9,30 +9,11 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-05-16 00:00:00 -07:00
12
+ date: 2009-06-19 00:00:00 -07:00
13
13
  default_executable:
14
- dependencies:
15
- - !ruby/object:Gem::Dependency
16
- name: ruby-cache
17
- type: :runtime
18
- version_requirement:
19
- version_requirements: !ruby/object:Gem::Requirement
20
- requirements:
21
- - - ">="
22
- - !ruby/object:Gem::Version
23
- version: 0.3.0
24
- version:
25
- - !ruby/object:Gem::Dependency
26
- name: hpricot
27
- type: :runtime
28
- version_requirement:
29
- version_requirements: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: 0.6.1
34
- version:
35
- description: Fullfeed RSS creator
14
+ dependencies: []
15
+
16
+ description: Create full text RSS feed from RSS
36
17
  email: siu.ying@gmail.com
37
18
  executables: []
38
19
 
@@ -41,8 +22,41 @@ extensions: []
41
22
  extra_rdoc_files: []
42
23
 
43
24
  files:
25
+ - LICENSE
44
26
  - README
45
27
  - Rakefile
28
+ - examples/applenews_hk.rb
29
+ - examples/extractors/apple_news_extractor.rb
30
+ - examples/extractors/yahoo_news_hong_kong_extractor.rb
31
+ - examples/server/controller/feed_controller.rb
32
+ - examples/server/server.rb
33
+ - examples/server/views/index.erb
34
+ - examples/sinatra/web.rb
35
+ - examples/ynews_hk.rb
36
+ - examples/ynews_hk_db.rb
37
+ - fullfeed.gemspec
38
+ - lib/fullfeed.rb
39
+ - lib/fullfeed/agent/agent_factory.rb
40
+ - lib/fullfeed/agent/appengine_agent.rb
41
+ - lib/fullfeed/agent/base.rb
42
+ - lib/fullfeed/agent/mechanize_agent.rb
43
+ - lib/fullfeed/agent/open_uri_agent.rb
44
+ - lib/fullfeed/extractor/base_extractor.rb
45
+ - lib/fullfeed/extractor/extractor_factory.rb
46
+ - lib/fullfeed/extractor/text_extractor.rb
47
+ - lib/fullfeed/extractor/xpath_extractor.rb
48
+ - lib/fullfeed/feed.rb
49
+ - lib/fullfeed/filters/base_filter.rb
50
+ - lib/fullfeed/filters/convert_encoding_filter.rb
51
+ - lib/fullfeed/filters/excess_space_filter.rb
52
+ - lib/fullfeed/filters/uppercase_filter.rb
53
+ - lib/fullfeed/store/base.rb
54
+ - lib/fullfeed/store/db_store.rb
55
+ - lib/fullfeed/store/memory_store.rb
56
+ - lib/fullfeed/store/store_factory.rb
57
+ - test/load_files.rb
58
+ - test/test_agent.rb
59
+ - test/test_store.rb
46
60
  has_rdoc: false
47
61
  homepage:
48
62
  post_install_message:
@@ -68,6 +82,6 @@ rubyforge_project:
68
82
  rubygems_version: 1.2.0
69
83
  signing_key:
70
84
  specification_version: 2
71
- summary: Fullfeed RSS creator
85
+ summary: Create full text RSS feed from RSS
72
86
  test_files: []
73
87