feed-processor 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1 @@
1
+ urls.txt
data/README.textile ADDED
@@ -0,0 +1,43 @@
1
+ Feed Processor is a multi-stage feed processor built with JRuby, MRI, beanstalk and MongoDB.
2
+
3
+ There are two steps to the feed processing:
4
+
5
+ # Step 1: Download feed content using non-blocking IO and insert the raw data into MongoDB. For each feed, a message is then sent via Beanstalk to notify the parser stage that the raw data for that feed is ready.
6
+ # Step 2: A multi-processor feed parser pulls the raw data from MongoDB, parses it and inserts the resulting parsed record into MongoDB.
7
+
8
+ h2. Dependencies
9
+
10
+ * MongoDB
11
+ * beanstalkd
12
+ * JRuby
13
+ * MRI
14
+
15
+ Gems (for JRuby):
16
+
17
+ * jruby-http-reactor
18
+ * threadify
19
+ * beanstalk-client
20
+ * mongo_mapper
21
+
22
+ Gems (for MRI):
23
+
24
+ * beanstalk-client
25
+ * mongo_mapper
26
+ * feedzirra
27
+
28
+ h2. Executing
29
+
30
+ Run each of the following commands in its own console, or start it as a background process.
31
+
32
+ Start MongoDB and Beanstalk:
33
+
34
+ mongod
35
+ beanstalkd
36
+
37
+ Run the fetch processor:
38
+
39
+ jruby -rubygems -Ilib bin/fetch urls.txt
40
+
41
+ Run the parse processor:
42
+
43
+ ruby -rubygems -Ilib bin/parse
data/Rakefile ADDED
@@ -0,0 +1,15 @@
# Rake tasks for packaging and releasing the feed-processor gem via Jeweler.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |gemspec|
    gemspec.name = "feed-processor"
    gemspec.summary = "A multi-stage feed processor."
    gemspec.description = "Feed Processor is a multi-stage feed processor built with JRuby, MRI, beanstalk and MongoDB."
    gemspec.email = "anthonyeden@gmail.com"
    gemspec.homepage = "http://github.com/aeden/feed-processor"
    gemspec.authors = ["Anthony Eden"]
  end

  # Kept inside the guarded section: when the require above fails, the
  # Jeweler constant is undefined, and referencing it after the rescue would
  # raise NameError right after printing the install hint below.
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler not available. Install it with: gem install jeweler"
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.1
data/bin/fetch ADDED
@@ -0,0 +1,20 @@
#!/usr/bin/env jruby
#
# Description: Downloads every URL listed in the file named on the
# command line and records each response body together with its
# response code in a MongoDB database.
#
# Usage: jruby -rubygems -Ilib bin/fetch urls.txt
#
# Dependencies:
#
# * jruby-http-reactor
# * beanstalk-client
# * threadify
# * mongo_mapper

# Write output immediately instead of buffering it.
$stdout.sync = true

require 'feed_processor'
require 'feed_processor/fetcher'

FeedProcessor::Fetcher.new(:threads => 16).execute
data/bin/parse ADDED
@@ -0,0 +1,11 @@
#!/usr/bin/env ruby

# Description: Waits for messages on a queue and, for each URL received,
# parses the HTTP response body that was stored in MongoDB for it.
#
# Usage: ruby -rubygems -Ilib bin/parse

require 'feed_processor'
require 'feed_processor/parser'

FeedProcessor::Parser.new.execute
@@ -0,0 +1,52 @@
# Generated by jeweler
# DO NOT EDIT THIS FILE DIRECTLY
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
# -*- encoding: utf-8 -*-

Gem::Specification.new do |s|
  s.name = %q{feed-processor}
  s.version = "0.0.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Anthony Eden"]
  s.date = %q{2010-02-22}
  s.description = %q{Feed Processor is a multi-stage feed processor built with JRuby, MRI, beanstalk and MongoDB.}
  s.email = %q{anthonyeden@gmail.com}
  s.executables = ["fetch", "parse"]
  s.extra_rdoc_files = [
    "README.textile"
  ]
  s.files = [
    ".gitignore",
    "README.textile",
    "Rakefile",
    "VERSION",
    "bin/fetch",
    "bin/parse",
    "feed-processor.gemspec",
    "lib/feed_processor.rb",
    "lib/feed_processor/content.rb",
    "lib/feed_processor/feed.rb",
    "lib/feed_processor/fetcher.rb",
    "lib/feed_processor/file_based_request_generator.rb",
    "lib/feed_processor/parser.rb",
    "lib/feed_processor/response.rb",
    "lib/feed_processor/util.rb"
  ]
  s.homepage = %q{http://github.com/aeden/feed-processor}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.5}
  s.summary = %q{A multi-stage feed processor.}

  # No dependencies are declared, so the RubyGems-version branches that used
  # to follow were empty. They are dropped because their condition read
  # Gem::RubyGemsVersion, a constant later RubyGems releases no longer
  # define, which made loading this file raise NameError.
  # NOTE(review): this file is generated; regenerate it with a current
  # jeweler if the Rakefile changes.
  s.specification_version = 3 if s.respond_to? :specification_version
end
@@ -0,0 +1,19 @@
1
+ require 'mongo_mapper'
2
+
# One entry of a feed, persisted as a MongoMapper document.
class Content
  include MongoMapper::Document

  # Textual attributes of the entry.
  [:title, :url, :author, :summary, :content].each { |name| key name, String }
  key :published, Date
  key :categories, Array
  # Identifier of the Feed this entry belongs to.
  key :feed_id, String

  belongs_to :feed

  # An entry URL may be stored only once.
  validates_uniqueness_of :url
end
@@ -0,0 +1,13 @@
1
+ require 'mongo_mapper'
2
+
# A feed, persisted as a MongoMapper document that owns many Content entries.
class Feed
  include MongoMapper::Document

  [:title, :url, :status].each { |name| key name, String }

  many :contents

  # A feed URL may be stored only once.
  validates_uniqueness_of :url
end
@@ -0,0 +1,66 @@
1
+ require 'mongo_mapper'
2
+ require 'uri'
3
+ require 'http_reactor'
4
+ require 'threadify'
5
+ require 'beanstalk-client'
6
+
module FeedProcessor
  # Fetch stage of the pipeline: downloads every request supplied by the
  # request generator, stores each response as a Response document and puts
  # the URL of every response with code 200 on a beanstalk queue.
  class Fetcher
    # Thread count assumed when sizing slices if options[:threads] is absent.
    DEFAULT_THREADS = 16
    # beanstalkd addresses used when options[:beanstalk] is absent.
    DEFAULT_BEANSTALK = ['localhost:11300'].freeze
    # MongoDB database used when options[:mongo][:database] is absent.
    DEFAULT_DATABASE = 'feed_processor'

    attr_reader :options

    # options - Hash of settings:
    #   :threads           - Integer used to size the request slices.
    #   :request_generator - object responding to #requests; defaults to a
    #                        FileBasedRequestGenerator.
    #   :mongo             - Hash whose :database entry names the database.
    #   :beanstalk         - Array of beanstalkd addresses to connect to.
    def initialize(options={})
      @options = options
      @request_generator = options[:request_generator] || FeedProcessor::FileBasedRequestGenerator.new
      setup_mongo(options[:mongo])
    end

    # Returns the requests produced by the configured request generator.
    def requests
      @request_generator.requests
    end

    # Fetches all requests in slices. A "START <count>" message is queued
    # before the first slice and an "END <count>" message after the last.
    def execute
      number_of_threads = options[:threads] || DEFAULT_THREADS
      main_queue = connect_queue

      number_of_requests = requests.length
      main_queue.put("START #{number_of_requests}")
      # Never let the slice size drop to zero when there are fewer requests
      # than threads.
      slice_size = [number_of_requests / number_of_threads, 1].max
      puts "Each slice has #{slice_size} urls (#{number_of_requests} requests / #{number_of_threads} threads)"

      requests.threadify(:each_slice, slice_size) do |slice|
        # Every slice talks to beanstalk through a connection of its own.
        queue = connect_queue
        HttpReactor::Client.new(slice) do |response, context|
          handle_response(queue, response, context)
        end
      end

      main_queue.put("END #{number_of_requests}")
      puts "Fetched #{number_of_requests} feeds"
    end

    protected

    # Opens a beanstalk connection pool for the configured addresses.
    def connect_queue
      Beanstalk::Pool.new(options[:beanstalk] || DEFAULT_BEANSTALK.dup)
    end

    # Stores one response and, when its code is 200, queues its URL.
    def handle_response(queue, response, context)
      request = context.get_attribute('http_target_request')
      puts "#{response.code}:#{request.uri}"

      Response.create({
        :url => request.uri,
        :data => FeedProcessor::Util.decode_content(response),
        :status => response.code
      })

      queue.put(request.uri.to_s) if response.code == 200
    rescue Exception => e
      # NOTE(review): Exception is broader than usual; kept so a failure for
      # one response is only reported and never escapes the callback.
      # Confirm whether StandardError would be enough under JRuby.
      puts "Exception in handler: #{e.message}"
    end

    # Points MongoMapper at the configured database. The options hash is
    # only read, so a caller's hash is never modified.
    def setup_mongo(options={})
      database = (options || {})[:database] || DEFAULT_DATABASE
      MongoMapper.connection = Mongo::Connection.new(nil, nil)
      MongoMapper.database = database
    end
  end
end
@@ -0,0 +1,17 @@
module FeedProcessor
  # Builds the list of HTTP requests from a text file holding one URL per
  # line. The file name is taken from the end of ARGV.
  class FileBasedRequestGenerator
    # Returns an Array of HttpReactor::Request objects. The file is read on
    # the first call only; later calls return the same Array.
    #
    # Lines that do not start with "http:" (after surrounding whitespace is
    # removed) are ignored. Lines that cannot be parsed as a URI are
    # reported and skipped, so one bad line does not abort the whole run.
    #
    # Raises ArgumentError when ARGV holds no file name.
    def requests
      @requests ||= begin
        puts "Generating requests"
        path = ARGV.pop
        raise ArgumentError, "no URL file given on the command line" if path.nil?

        requests = []
        # File.open instead of Kernel#open: the latter can start a
        # subprocess when its argument begins with "|".
        File.open(path) do |f|
          f.each do |line|
            # Drop the line terminator, which is not part of the URL.
            url = line.strip
            next unless url =~ /^http:/
            begin
              requests << HttpReactor::Request.new(URI.parse(url))
            rescue URI::InvalidURIError => e
              puts "Skipping invalid URL #{url.inspect}: #{e.message}"
            end
          end
        end
        puts "Generated #{requests.length} requests"
        requests
      end
    end
  end
end
@@ -0,0 +1,66 @@
1
+ require 'mongo_mapper'
2
+ require 'beanstalk-client'
3
+ require 'feedzirra'
4
+
module FeedProcessor
  # Parse stage of the pipeline: reserves jobs from a beanstalk queue,
  # parses the stored response data for each URL received and stores the
  # resulting entries as Content documents attached to a Feed.
  class Parser
    # beanstalkd addresses used when options[:beanstalk] is absent.
    DEFAULT_BEANSTALK = ['localhost:11300'].freeze
    # MongoDB database used when options[:mongo][:database] is absent.
    DEFAULT_DATABASE = 'feed_processor'

    # options - Hash of settings:
    #   :mongo     - Hash whose :database entry names the database.
    #   :beanstalk - Array of beanstalkd addresses to connect to.
    def initialize(options={})
      @options = options
      setup_mongo(options[:mongo])
    end

    # Processes queue jobs until the process is interrupted. Every job is
    # deleted once it has been handled.
    def execute
      queue = Beanstalk::Pool.new(@options[:beanstalk] || DEFAULT_BEANSTALK.dup)
      puts "Now accepting feeds to parse..."
      begin
        loop do
          job = queue.reserve
          handle_message(job.body)
          job.delete
        end
      rescue Interrupt
        puts "Exiting parser"
      end
    end

    protected

    # Dispatches one job body: START/END markers are only reported, any
    # other body is treated as the URL of a feed to parse.
    def handle_message(body)
      case body
      when /^START (.+)$/ then puts "starting #{$1} feeds"
      when /^END (.+)$/ then puts "finished #{$1} feeds"
      else parse_feed(body)
      end
    end

    # Parses every stored response for +url+ and stores its entries. A
    # failure is reported and does not stop the remaining responses.
    def parse_feed(url)
      puts "parsing #{url}"
      Response.all(:conditions => {'url' => url}).each do |response|
        begin
          parsed = Feedzirra::Feed.parse(response.data)
          feed = find_or_create_feed(url)
          entries = parsed.entries
          puts "found #{entries.length} entries in #{url}"
          entries.each { |entry| store_entry(feed, entry) }
        rescue => e
          puts "error parsing feed #{url}: #{e.message}"
        end
      end
    end

    # Returns the Feed stored for +url+, creating it on first sight. Feed
    # validates the uniqueness of its url, so creating a second document for
    # a URL that is parsed again would be rejected.
    def find_or_create_feed(url)
      Feed.all(:conditions => {'url' => url}).first ||
        Feed.create({:url => url, :status => 'to-process'})
    end

    # Stores one parsed entry under +feed+; a failure is reported and does
    # not stop the remaining entries.
    def store_entry(feed, entry)
      feed.contents.create({
        :title => entry.title,
        :url => entry.url,
        :author => entry.author,
        :summary => entry.summary,
        :content => entry.content,
        :published => entry.published,
        :categories => entry.categories
      })
    rescue => e
      puts "error creating entry #{entry.url}: #{e.message}"
    end

    # Points MongoMapper at the configured database. The options hash is
    # only read, so a caller's hash is never modified.
    def setup_mongo(options={})
      database = (options || {})[:database] || DEFAULT_DATABASE
      MongoMapper.connection = Mongo::Connection.new(nil, nil)
      MongoMapper.database = database
    end
  end
end
@@ -0,0 +1,11 @@
1
+ require 'mongo_mapper'
2
+
# The raw result of fetching one URL, persisted as a MongoMapper document.
class Response
  include MongoMapper::Document

  [:url, :data, :status].each { |name| key name, String }

  # A URL may be stored only once.
  validates_uniqueness_of :url
end
@@ -0,0 +1,24 @@
# Loaded here because this file uses Zlib and StringIO directly.
require 'zlib'
require 'stringio'

module FeedProcessor
  # Helper methods for working with fetched responses.
  class Util
    # shamelessly stolen from feedzirra
    #
    # Returns the body of +res+ decoded according to its content-encoding
    # header. +res+ must respond to #[] (header lookup) and #body.
    #
    # The header value is compared case-insensitively. "gzip" and "deflate"
    # bodies are decompressed; any other value, including a missing header,
    # returns the body unchanged.
    #
    # Raises Zlib::DataError when a "deflate" body cannot be inflated.
    def self.decode_content(res)
      case res['content-encoding'].to_s.downcase
      when 'gzip' then gunzip(res.body)
      when 'deflate' then inflate(res.body)
      else res.body
      end
    end

    # Gunzips +body+, returning it unchanged when it is not valid gzip data.
    def self.gunzip(body)
      gz = Zlib::GzipReader.new(StringIO.new(body))
      begin
        gz.read
      ensure
        # Close the reader even when reading fails part-way.
        gz.close
      end
    rescue Zlib::GzipFile::Error
      # Maybe this is not gzipped?
      body
    end

    # Inflates +body+, accepting both zlib-wrapped and raw deflate streams.
    def self.inflate(body)
      Zlib::Inflate.inflate(body)
    rescue Zlib::DataError
      # Some servers send a raw deflate stream without the zlib header; a
      # negative window size tells zlib not to expect one.
      raw = Zlib::Inflate.new(-Zlib::MAX_WBITS)
      begin
        raw.inflate(body)
      ensure
        raw.close
      end
    end

    private_class_method :gunzip, :inflate
  end
end
@@ -0,0 +1,5 @@
1
+ require 'feed_processor/util'
2
+ require 'feed_processor/file_based_request_generator'
3
+ require 'feed_processor/response'
4
+ require 'feed_processor/feed'
5
+ require 'feed_processor/content'
metadata ADDED
@@ -0,0 +1,70 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: feed-processor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Anthony Eden
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-02-22 00:00:00 -10:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: Feed Processor is a multi-stage feed processor built with JRuby, MRI, beanstalk and MongoDB.
17
+ email: anthonyeden@gmail.com
18
+ executables:
19
+ - fetch
20
+ - parse
21
+ extensions: []
22
+
23
+ extra_rdoc_files:
24
+ - README.textile
25
+ files:
26
+ - .gitignore
27
+ - README.textile
28
+ - Rakefile
29
+ - VERSION
30
+ - bin/fetch
31
+ - bin/parse
32
+ - feed-processor.gemspec
33
+ - lib/feed_processor.rb
34
+ - lib/feed_processor/content.rb
35
+ - lib/feed_processor/feed.rb
36
+ - lib/feed_processor/fetcher.rb
37
+ - lib/feed_processor/file_based_request_generator.rb
38
+ - lib/feed_processor/parser.rb
39
+ - lib/feed_processor/response.rb
40
+ - lib/feed_processor/util.rb
41
+ has_rdoc: true
42
+ homepage: http://github.com/aeden/feed-processor
43
+ licenses: []
44
+
45
+ post_install_message:
46
+ rdoc_options:
47
+ - --charset=UTF-8
48
+ require_paths:
49
+ - lib
50
+ required_ruby_version: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: "0"
55
+ version:
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: "0"
61
+ version:
62
+ requirements: []
63
+
64
+ rubyforge_project:
65
+ rubygems_version: 1.3.5
66
+ signing_key:
67
+ specification_version: 3
68
+ summary: A multi-stage feed processor.
69
+ test_files: []
70
+