feed-processor 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1 @@
1
+ urls.txt
data/README.textile ADDED
@@ -0,0 +1,43 @@
1
+ Feed Processor is a multi-stage feed processor built with JRuby, MRI, beanstalk and MongoDB.
2
+
3
+ There are two steps to the feed processing:
4
+
5
+ # Step 1: Download feed content using non-blocking IO and insert the raw data into MongoDB. A message is then sent via Beanstalk to notify the parser stage that the raw data for that feed is ready.
6
+ # Step 2: A multi-processor feed parser pulls the raw data from MongoDB, parses it and inserts the resulting parsed record into MongoDB.
7
+
8
+ h2. Dependencies
9
+
10
+ * MongoDB
11
+ * beanstalkd
12
+ * JRuby
13
+ * MRI
14
+
15
+ Gems (for JRuby):
16
+
17
+ * jruby-http-reactor
18
+ * threadify
19
+ * beanstalk-client
20
+ * mongo_mapper
21
+
22
+ Gems (for MRI):
23
+
24
+ * beanstalk-client
25
+ * mongo_mapper
26
+ * feedzirra
27
+
28
+ h2. Executing
29
+
30
+ Each of the following commands should be run in a separate console or as a background process.
31
+
32
+ Start MongoDB and Beanstalk:
33
+
34
+ mongod
35
+ beanstalkd
36
+
37
+ Run the fetch processor:
38
+
39
+ jruby -rubygems -Ilib bin/fetch urls.txt
40
+
41
+ Run the parse processor:
42
+
43
+ ruby -rubygems -Ilib bin/parse
data/Rakefile ADDED
@@ -0,0 +1,15 @@
1
# Gem packaging tasks provided by Jeweler.
#
# Jeweler is optional: when it is not installed we only print an install hint.
# Every reference to the Jeweler constant therefore has to live inside this
# begin/rescue block; a reference after it would raise NameError as soon as
# the Rakefile is loaded without the gem.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |gemspec|
    gemspec.name = "feed-processor"
    gemspec.summary = "A multi-stage feed processor."
    gemspec.description = "Feed Processor is a multi-stage feed processor built with JRuby, MRI, beanstalk and MongoDB."
    gemspec.email = "anthonyeden@gmail.com"
    gemspec.homepage = "http://github.com/aeden/feed-processor"
    gemspec.authors = ["Anthony Eden"]
  end

  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler not available. Install it with: gem install jeweler"
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.1
data/bin/fetch ADDED
@@ -0,0 +1,20 @@
1
#!/usr/bin/env jruby
#
# Description: Fetches the content from the list of URLs provided
# on the command line and stores that content and its response code
# in a MongoDB database.
#
# Usage: jruby -rubygems -Ilib bin/fetch urls.txt
#
# Dependencies:
#
# * jruby-http-reactor
# * beanstalk-client
# * threadify
# * mongo_mapper

# Emit progress lines immediately instead of buffering them.
$stdout.sync = true

# The URL file is a required argument; fail early with a usage message
# rather than with an opaque error once the fetcher tries to open it.
abort "Usage: jruby -rubygems -Ilib bin/fetch urls.txt" if ARGV.empty?

require 'feed_processor'
require 'feed_processor/fetcher'

fetcher = FeedProcessor::Fetcher.new(:threads => 16)
fetcher.execute
data/bin/parse ADDED
@@ -0,0 +1,11 @@
1
#!/usr/bin/env ruby

# Description: Listens for events on a queue and parses the
# HTTP response body stored in MongoDB for the given URL.
#
# Usage: ruby -rubygems -Ilib bin/parse

# Emit progress lines immediately (same as bin/fetch) so output is not held
# in a buffer when stdout is redirected to a file or pipe.
$stdout.sync = true

require 'feed_processor'
require 'feed_processor/parser'

parser = FeedProcessor::Parser.new
parser.execute
@@ -0,0 +1,52 @@
1
# Generated by jeweler
# DO NOT EDIT THIS FILE DIRECTLY
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
# -*- encoding: utf-8 -*-
#
# NOTE(review): hand-trimmed below. The generated file ended with empty
# if/else branches that referenced Gem::RubyGemsVersion; they declared
# nothing, and that constant is not defined by current RubyGems releases,
# so merely loading the gemspec could raise NameError.

Gem::Specification.new do |s|
  s.name = %q{feed-processor}
  s.version = "0.0.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Anthony Eden"]
  s.date = %q{2010-02-22}
  s.description = %q{Feed Processor is a multi-stage feed processor built with JRuby, MRI, beanstalk and MongoDB.}
  s.email = %q{anthonyeden@gmail.com}
  s.executables = ["fetch", "parse"]
  s.extra_rdoc_files = [
    "README.textile"
  ]
  s.files = [
    ".gitignore",
    "README.textile",
    "Rakefile",
    "VERSION",
    "bin/fetch",
    "bin/parse",
    "feed-processor.gemspec",
    "lib/feed_processor.rb",
    "lib/feed_processor/content.rb",
    "lib/feed_processor/feed.rb",
    "lib/feed_processor/fetcher.rb",
    "lib/feed_processor/file_based_request_generator.rb",
    "lib/feed_processor/parser.rb",
    "lib/feed_processor/response.rb",
    "lib/feed_processor/util.rb"
  ]
  s.homepage = %q{http://github.com/aeden/feed-processor}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.5}
  s.summary = %q{A multi-stage feed processor.}

  # No runtime or development dependencies are declared, so the only thing
  # left of the version-specific section is the specification version.
  s.specification_version = 3 if s.respond_to? :specification_version
end
52
+
@@ -0,0 +1,19 @@
1
+ require 'mongo_mapper'
2
+
3
# A single parsed feed entry, stored as a MongoMapper document and owned by
# a Feed.
class Content
  include MongoMapper::Document

  # Plain string attributes of the entry.
  [:title, :url, :author, :summary, :content].each do |attribute|
    key attribute, String
  end

  key :published, Date
  key :categories, Array

  # Key backing the belongs_to association below.
  key :feed_id, String

  belongs_to :feed

  # An entry URL may be stored only once.
  validates_uniqueness_of :url
end
@@ -0,0 +1,13 @@
1
+ require 'mongo_mapper'
2
+
3
# A feed known to the processor, stored as a MongoMapper document; its
# parsed entries are the associated Content documents.
class Feed
  include MongoMapper::Document

  # All feed attributes are plain strings.
  [:title, :url, :status].each do |attribute|
    key attribute, String
  end

  many :contents

  # A feed URL may be stored only once.
  validates_uniqueness_of :url
end
@@ -0,0 +1,66 @@
1
+ require 'mongo_mapper'
2
+ require 'uri'
3
+ require 'http_reactor'
4
+ require 'threadify'
5
+ require 'beanstalk-client'
6
+
7
module FeedProcessor
  # Fetch stage: downloads every request produced by the request generator,
  # stores each raw response as a Response document in MongoDB and, for
  # responses with status 200, puts the URL on the beanstalk queue.
  #
  # Options:
  #   :threads           - number of slices the request list is split into
  #                        (default 16)
  #   :request_generator - object responding to #requests (default: a
  #                        FeedProcessor::FileBasedRequestGenerator)
  #   :mongo             - hash; :database names the MongoDB database
  #                        (default 'feed_processor')
  class Fetcher
    attr_reader :options

    def initialize(options={})
      @options = options
      @request_generator = options[:request_generator] || FeedProcessor::FileBasedRequestGenerator.new
      setup_mongo(options[:mongo])
    end

    # The requests to fetch, as supplied by the request generator.
    def requests
      @request_generator.requests
    end

    # Fetches all requests. "START <n>" and "END <n>" messages bracket the
    # run on the queue, where <n> is the total number of requests.
    #
    # Raises ArgumentError when :threads is not a positive number.
    def execute
      number_of_threads = (options[:threads] || 16).to_i
      raise ArgumentError, ":threads must be at least 1" if number_of_threads < 1

      main_queue = Beanstalk::Pool.new(['localhost:11300'])

      number_of_requests = requests.length
      main_queue.put("START #{number_of_requests}")
      slice_size = slice_size_for(number_of_requests, number_of_threads)
      puts "Each slice has #{slice_size} urls (#{number_of_requests} requests / #{number_of_threads} threads)"

      requests.threadify(:each_slice, slice_size) do |slice|
        # Each slice gets its own queue connection rather than sharing
        # main_queue across the threadify blocks.
        queue = Beanstalk::Pool.new(['localhost:11300'])
        HttpReactor::Client.new(slice) do |response, context|
          handle_response(queue, response, context)
        end
      end

      main_queue.put("END #{number_of_requests}")
      puts "Fetched #{number_of_requests} feeds"
    end

    protected

    # Number of requests per slice. Rounds up so the request list is never
    # cut into more than number_of_threads slices (rounding down turned
    # e.g. 31 requests / 16 threads into 31 single-request slices).
    def slice_size_for(number_of_requests, number_of_threads)
      slice_size = (number_of_requests + number_of_threads - 1) / number_of_threads
      slice_size < 1 ? 1 : slice_size
    end

    # Stores one HTTP response and announces it on the queue when the
    # request succeeded. Errors are reported and swallowed so that a single
    # bad response cannot abort the remaining requests of the slice.
    def handle_response(queue, response, context)
      request = context.get_attribute('http_target_request')
      url = request.uri.to_s
      puts "#{response.code}:#{url}"

      Response.create({
        :url => url,
        :data => FeedProcessor::Util.decode_content(response),
        :status => response.code
      })

      queue.put(url) if response.code == 200
    # NOTE(review): deliberately left as Exception. This callback runs on
    # JRuby, where errors raised by Java code may not descend from
    # StandardError - confirm before narrowing.
    rescue Exception => e
      puts "Exception in handler: #{e.message}"
    end

    # Points MongoMapper at the configured database. The caller's hash is
    # only read, never modified.
    def setup_mongo(options={})
      database = (options || {})[:database] || 'feed_processor'
      MongoMapper.connection = Mongo::Connection.new(nil, nil)
      MongoMapper.database = database
    end
  end
end
@@ -0,0 +1,17 @@
1
require 'uri'

module FeedProcessor
  # Builds the list of HTTP requests from a text file containing one URL per
  # line. The file name is taken from the end of ARGV.
  class FileBasedRequestGenerator
    # Returns an array of HttpReactor::Request objects, one per line that
    # starts with "http:" once surrounding whitespace is removed. The file is
    # read only on the first call; later calls return the same array.
    #
    # Lines that cannot be parsed as a URI are reported and skipped rather
    # than aborting the whole load.
    #
    # Raises ArgumentError when no file name is left in ARGV.
    #
    # NOTE(review): "https:" lines are ignored, as before - confirm whether
    # that is intentional.
    def requests
      @requests ||= begin
        puts "Generating requests"
        path = ARGV.pop
        raise ArgumentError, "no URL file given on the command line" if path.nil?

        loaded = []
        # File.open rather than Kernel#open: a command-line argument must
        # never be interpreted as a "|command" pipe.
        File.open(path) do |file|
          file.each_line do |line|
            # Drop the trailing newline (and stray whitespace); URI.parse
            # rejects it.
            url = line.strip
            next unless url =~ /^http:/

            begin
              loaded << HttpReactor::Request.new(URI.parse(url))
            rescue URI::InvalidURIError => e
              puts "Skipping invalid URL #{url}: #{e.message}"
            end
          end
        end
        puts "Generated #{loaded.length} requests"
        loaded
      end
    end
  end
end
@@ -0,0 +1,66 @@
1
+ require 'mongo_mapper'
2
+ require 'beanstalk-client'
3
+ require 'feedzirra'
4
+
5
module FeedProcessor
  # Parse stage: reserves jobs from the beanstalk queue, loads the stored
  # Response documents for the URL named in the job body, parses them with
  # Feedzirra and stores the resulting entries as Content documents.
  #
  # Options:
  #   :mongo - hash; :database names the MongoDB database
  #            (default 'feed_processor')
  class Parser
    def initialize(options={})
      @options = options
      setup_mongo(options[:mongo])
    end

    # Processes jobs until the process is interrupted. Each job is deleted
    # from the queue once it has been handled.
    def execute
      queue = Beanstalk::Pool.new(['localhost:11300'])
      puts "Now accepting feeds to parse..."
      begin
        loop do
          job = queue.reserve
          handle_job(job.body)
          job.delete
        end
      rescue Interrupt
        puts "Exiting parser"
      end
    end

    protected

    # Dispatches on the job body: "START <n>" and "END <n>" are only
    # reported; any other body is treated as the URL of a fetched feed.
    def handle_job(body)
      if body =~ /^START (.+)$/
        puts "starting #{$1} feeds"
      elsif body =~ /^END (.+)$/
        puts "finished #{$1} feeds"
      else
        parse_url(body)
      end
    end

    # Parses every stored response for +url+. A failure while parsing one
    # response is reported and does not affect the others.
    def parse_url(url)
      puts "parsing #{url}"
      responses = Response.all(:conditions => {'url' => url})
      responses.each do |response|
        begin
          feed = Feedzirra::Feed.parse(response.data)
          record = find_or_create_feed(url)
          entries = feed.entries
          puts "found #{entries.length} entries in #{url}"
          store_entries(record, entries)
        rescue => e
          puts "error parsing feed #{url}: #{e.message}"
        end
      end
    end

    # Returns the Feed document for +url+, creating it on first sight.
    # Feed validates the uniqueness of its url, so creating a second
    # document for an already known URL would fail that validation and the
    # entries would be attached to a feed that was never saved.
    def find_or_create_feed(url)
      Feed.first(:conditions => {'url' => url}) ||
        Feed.create({:url => url, :status => 'to-process'})
    end

    # Stores each parsed entry as a Content document of +record+. A failure
    # on one entry is reported and the remaining entries are still stored.
    def store_entries(record, entries)
      entries.each do |entry|
        begin
          record.contents.create({
            :title => entry.title,
            :url => entry.url,
            :author => entry.author,
            :summary => entry.summary,
            :content => entry.content,
            :published => entry.published,
            :categories => entry.categories
          })
        rescue => e
          puts "error creating entry #{entry.url}: #{e.message}"
        end
      end
    end

    # Points MongoMapper at the configured database. The caller's hash is
    # only read, never modified.
    def setup_mongo(options={})
      database = (options || {})[:database] || 'feed_processor'
      MongoMapper.connection = Mongo::Connection.new(nil, nil)
      MongoMapper.database = database
    end
  end
end
@@ -0,0 +1,11 @@
1
+ require 'mongo_mapper'
2
+
3
# Raw result of fetching one URL, stored as a MongoMapper document: the URL,
# the response body and the status.
class Response
  include MongoMapper::Document

  # All response attributes are plain strings.
  [:url, :data, :status].each do |attribute|
    key attribute, String
  end

  # Only one response may be stored per URL.
  validates_uniqueness_of :url
end
@@ -0,0 +1,24 @@
1
require 'stringio'
require 'zlib'

module FeedProcessor
  # Stateless helpers for the fetch stage.
  class Util
    # shamelessly stolen from feedzirra
    #
    # Returns the body of +res+ with its Content-Encoding removed.
    #
    # +res+ must respond to #body and to #[] with a header name.
    #
    # * "gzip" / "x-gzip": the body is gunzipped; if it turns out not to be
    #   gzip data, it is returned unchanged.
    # * "deflate": the body is inflated, accepting both zlib-wrapped and raw
    #   deflate streams.
    # * anything else: the body is returned unchanged.
    #
    # The encoding name is matched case-insensitively. A nil or empty body is
    # returned as is, because there is nothing to decompress.
    def self.decode_content(res)
      body = res.body
      return body if body.nil? || body.empty?

      case res['content-encoding'].to_s.downcase
      when 'gzip', 'x-gzip'
        gunzip(body)
      when 'deflate'
        inflate(body)
      else
        body
      end
    end

    # Gunzips +body+, falling back to the untouched body when it is not
    # valid gzip data. The reader is always closed.
    def self.gunzip(body)
      gz = Zlib::GzipReader.new(StringIO.new(body))
      begin
        gz.read
      ensure
        gz.close
      end
    rescue Zlib::GzipFile::Error
      # Maybe this is not gzipped?
      body
    end
    private_class_method :gunzip

    # Inflates +body+. Some servers send a raw deflate stream without the
    # zlib header; that case is retried with a negative window size, which
    # tells zlib not to expect a header.
    def self.inflate(body)
      Zlib::Inflate.inflate(body)
    rescue Zlib::DataError
      zstream = Zlib::Inflate.new(-Zlib::MAX_WBITS)
      begin
        zstream.inflate(body)
      ensure
        zstream.close
      end
    end
    private_class_method :inflate
  end
end
@@ -0,0 +1,5 @@
1
+ require 'feed_processor/util'
2
+ require 'feed_processor/file_based_request_generator'
3
+ require 'feed_processor/response'
4
+ require 'feed_processor/feed'
5
+ require 'feed_processor/content'
metadata ADDED
@@ -0,0 +1,70 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: feed-processor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Anthony Eden
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-02-22 00:00:00 -10:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: Feed Processor is a multi-stage feed processor built with JRuby, MRI, beanstalk and MongoDB.
17
+ email: anthonyeden@gmail.com
18
+ executables:
19
+ - fetch
20
+ - parse
21
+ extensions: []
22
+
23
+ extra_rdoc_files:
24
+ - README.textile
25
+ files:
26
+ - .gitignore
27
+ - README.textile
28
+ - Rakefile
29
+ - VERSION
30
+ - bin/fetch
31
+ - bin/parse
32
+ - feed-processor.gemspec
33
+ - lib/feed_processor.rb
34
+ - lib/feed_processor/content.rb
35
+ - lib/feed_processor/feed.rb
36
+ - lib/feed_processor/fetcher.rb
37
+ - lib/feed_processor/file_based_request_generator.rb
38
+ - lib/feed_processor/parser.rb
39
+ - lib/feed_processor/response.rb
40
+ - lib/feed_processor/util.rb
41
+ has_rdoc: true
42
+ homepage: http://github.com/aeden/feed-processor
43
+ licenses: []
44
+
45
+ post_install_message:
46
+ rdoc_options:
47
+ - --charset=UTF-8
48
+ require_paths:
49
+ - lib
50
+ required_ruby_version: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: "0"
55
+ version:
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: "0"
61
+ version:
62
+ requirements: []
63
+
64
+ rubyforge_project:
65
+ rubygems_version: 1.3.5
66
+ signing_key:
67
+ specification_version: 3
68
+ summary: A multi-stage feed processor.
69
+ test_files: []
70
+