feed-processor 0.0.1
- data/.gitignore +1 -0
- data/README.textile +43 -0
- data/Rakefile +15 -0
- data/VERSION +1 -0
- data/bin/fetch +20 -0
- data/bin/parse +11 -0
- data/feed-processor.gemspec +52 -0
- data/lib/feed_processor/content.rb +19 -0
- data/lib/feed_processor/feed.rb +13 -0
- data/lib/feed_processor/fetcher.rb +66 -0
- data/lib/feed_processor/file_based_request_generator.rb +17 -0
- data/lib/feed_processor/parser.rb +66 -0
- data/lib/feed_processor/response.rb +11 -0
- data/lib/feed_processor/util.rb +24 -0
- data/lib/feed_processor.rb +5 -0
- metadata +70 -0
data/.gitignore
ADDED
@@ -0,0 +1 @@
urls.txt
data/README.textile
ADDED
@@ -0,0 +1,43 @@
Feed Processor is a multi-stage feed processor built with JRuby, MRI, beanstalk and MongoDB.

There are two steps to the feed processing:

# Step 1: Download feed content using non-blocking IO and insert the raw data into MongoDB. A message is sent via Beanstalk notifying the parser stage that the feed data is ready for a specific feed.
# Step 2: A multi-processor feed parser pulls the raw data from MongoDB, parses it and inserts the resulting parsed records into MongoDB.

h2. Dependencies

* MongoDB
* beanstalkd
* JRuby
* MRI

Gems (for JRuby):

* jruby-http-reactor
* threadify
* beanstalk-client
* mongo_mapper

Gems (for MRI):

* beanstalk-client
* mongo_mapper
* feedzirra

h2. Executing

Each of the following commands should be run in a separate console or as a background process.

Start MongoDB and Beanstalk:

mongod
beanstalkd

Run the fetch processor:

jruby -rubygems -Ilib bin/fetch urls.txt

Run the parse processor:

ruby -rubygems -Ilib bin/parse
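The handoff between the two stages is a plain Beanstalk message whose body is a feed URL, bracketed by START/END control messages carrying the batch size. A minimal sketch of that flow, using the same beanstalk-client calls the fetcher and parser use (the queue address matches the gem's hard-coded localhost:11300; the feed URL is a placeholder):

require 'beanstalk-client'

# Producer side (fetch stage): announce the batch, then enqueue each fetched URL.
queue = Beanstalk::Pool.new(['localhost:11300'])
queue.put("START 1")
queue.put("http://example.com/feed.xml")  # placeholder URL
queue.put("END 1")

# Consumer side (parse stage): block on reserve, act on the body, then delete the job.
loop do
  job = queue.reserve
  puts "got #{job.body}"   # either a control message or a feed URL to parse
  job.delete
  break if job.body =~ /^END /
end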
data/Rakefile
ADDED
@@ -0,0 +1,15 @@
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gemspec|
    gemspec.name = "feed-processor"
    gemspec.summary = "A multi-stage feed processor."
    gemspec.description = "Feed Processor is a multi-stage feed processor built with JRuby, MRI, beanstalk and MongoDB."
    gemspec.email = "anthonyeden@gmail.com"
    gemspec.homepage = "http://github.com/aeden/feed-processor"
    gemspec.authors = ["Anthony Eden"]
  end

  # Kept inside the begin/rescue so a missing jeweler gem does not raise NameError.
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler not available. Install it with: gem install jeweler"
end
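With these tasks loaded, jeweler regenerates the checked-in feed-processor.gemspec shown below; assuming a standard jeweler setup, that is:

rake gemspec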
data/VERSION
ADDED
@@ -0,0 +1 @@
0.0.1
data/bin/fetch
ADDED
@@ -0,0 +1,20 @@
#!/usr/bin/env jruby
#
# Description: Fetches the content from the list of URLs provided
# on the command line and stores that content and its response code
# in a MongoDB database.
#
# Usage: jruby -rubygems -Ilib bin/fetch urls.txt
#
# Dependencies:
#
# * jruby-http-reactor
# * beanstalk-client
# * threadify
# * mongo_mapper

$stdout.sync = true
require 'feed_processor'
require 'feed_processor/fetcher'
fetcher = FeedProcessor::Fetcher.new(:threads => 16)
fetcher.execute
data/bin/parse
ADDED
@@ -0,0 +1,11 @@
#!/usr/bin/env ruby

# Description: Listens for events on a queue and parses the
# HTTP response body stored in MongoDB for the given URL.
#
# Usage: ruby -rubygems -Ilib bin/parse

require 'feed_processor'
require 'feed_processor/parser'
parser = FeedProcessor::Parser.new
parser.execute
data/feed-processor.gemspec
ADDED
@@ -0,0 +1,52 @@
# Generated by jeweler
# DO NOT EDIT THIS FILE DIRECTLY
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
# -*- encoding: utf-8 -*-

Gem::Specification.new do |s|
  s.name = %q{feed-processor}
  s.version = "0.0.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Anthony Eden"]
  s.date = %q{2010-02-22}
  s.description = %q{Feed Processor is a multi-stage feed processor built with JRuby, MRI, beanstalk and MongoDB.}
  s.email = %q{anthonyeden@gmail.com}
  s.executables = ["fetch", "parse"]
  s.extra_rdoc_files = [
    "README.textile"
  ]
  s.files = [
    ".gitignore",
    "README.textile",
    "Rakefile",
    "VERSION",
    "bin/fetch",
    "bin/parse",
    "feed-processor.gemspec",
    "lib/feed_processor.rb",
    "lib/feed_processor/content.rb",
    "lib/feed_processor/feed.rb",
    "lib/feed_processor/fetcher.rb",
    "lib/feed_processor/file_based_request_generator.rb",
    "lib/feed_processor/parser.rb",
    "lib/feed_processor/response.rb",
    "lib/feed_processor/util.rb"
  ]
  s.homepage = %q{http://github.com/aeden/feed-processor}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.5}
  s.summary = %q{A multi-stage feed processor.}

  if s.respond_to? :specification_version then
    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
    s.specification_version = 3

    if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
    else
    end
  else
  end
end
data/lib/feed_processor/content.rb
ADDED
@@ -0,0 +1,19 @@
require 'mongo_mapper'

class Content
  include MongoMapper::Document

  key :title, String
  key :url, String
  key :author, String
  key :summary, String
  key :content, String
  key :published, Date
  key :categories, Array
  key :feed_id, String

  belongs_to :feed

  validates_uniqueness_of :url
end
data/lib/feed_processor/fetcher.rb
ADDED
@@ -0,0 +1,66 @@
require 'mongo_mapper'
require 'uri'
require 'http_reactor'
require 'threadify'
require 'beanstalk-client'

module FeedProcessor
  class Fetcher
    attr_reader :options

    def initialize(options={})
      @options = options
      @request_generator = options[:request_generator] || FeedProcessor::FileBasedRequestGenerator.new
      setup_mongo(options[:mongo])
    end

    def requests
      @request_generator.requests
    end

    def execute
      number_of_threads = options[:threads] || 16
      main_queue = Beanstalk::Pool.new(['localhost:11300'])

      # Announce the batch size to the parser stage before fetching starts.
      number_of_requests = requests.length
      main_queue.put("START #{number_of_requests}")
      slice_size = number_of_requests / number_of_threads
      slice_size = 1 if slice_size < 1
      puts "Each slice has #{slice_size} urls (#{number_of_requests} requests / #{number_of_threads} threads)"

      # Split the requests across threads; each thread runs its own non-blocking
      # HTTP client and its own Beanstalk connection.
      requests.threadify(:each_slice, slice_size) do |slice|
        queue = Beanstalk::Pool.new(['localhost:11300'])
        HttpReactor::Client.new(slice) do |response, context|
          begin
            request = context.get_attribute('http_target_request')
            puts "#{response.code}:#{request.uri}"

            # Store the raw response body and status in MongoDB.
            Response.create({
              :url => request.uri,
              :data => FeedProcessor::Util.decode_content(response),
              :status => response.code
            })

            # Only successful fetches are handed off to the parser stage.
            if response.code == 200
              queue.put(request.uri.to_s)
            end
          rescue Exception => e
            puts "Exception in handler: #{e.message}"
          end
        end
      end

      main_queue.put("END #{number_of_requests}")
      puts "Fetched #{number_of_requests} feeds"
    end

    protected

    def setup_mongo(options={})
      options ||= {}
      options[:database] ||= 'feed_processor'
      MongoMapper.connection = Mongo::Connection.new(nil, nil)
      MongoMapper.database = options[:database]
    end
  end
end
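The :request_generator option makes the URL source pluggable: anything that responds to #requests and returns HttpReactor::Request objects will do, and :threads controls how many slices run in parallel. A minimal sketch of a custom generator (the class name and URL are made up for illustration):

require 'uri'
require 'feed_processor'
require 'feed_processor/fetcher'

# Hypothetical generator that serves a fixed, in-memory list of feed URLs.
class StaticRequestGenerator
  def initialize(urls)
    @urls = urls
  end

  def requests
    @requests ||= @urls.map { |url| HttpReactor::Request.new(URI.parse(url)) }
  end
end

generator = StaticRequestGenerator.new(['http://example.com/feed.xml'])
fetcher = FeedProcessor::Fetcher.new(:request_generator => generator, :threads => 4)
fetcher.execute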
data/lib/feed_processor/file_based_request_generator.rb
ADDED
@@ -0,0 +1,17 @@
module FeedProcessor
  class FileBasedRequestGenerator
    def requests
      @requests ||= begin
        puts "Generating requests"
        requests = []
        open(ARGV.pop) do |f|
          f.each do |line|
            # Strip the trailing newline before parsing the URL.
            requests << HttpReactor::Request.new(URI.parse(line.strip)) if line =~ /^http:/
          end
        end
        puts "Generated #{requests.length} requests"
        requests
      end
    end
  end
end
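For reference, the urls.txt file passed to bin/fetch is just one feed URL per line; only lines starting with http: are turned into requests, so blank lines and anything else are skipped. The URLs below are placeholders:

http://example.com/atom.xml
http://example.org/feed.rss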
data/lib/feed_processor/parser.rb
ADDED
@@ -0,0 +1,66 @@
require 'mongo_mapper'
require 'beanstalk-client'
require 'feedzirra'

module FeedProcessor
  class Parser
    def initialize(options={})
      @options = options
      setup_mongo(options[:mongo])
    end

    def execute
      queue = Beanstalk::Pool.new(['localhost:11300'])
      puts "Now accepting feeds to parse..."
      begin
        loop do
          job = queue.reserve
          if job.body =~ /^START (.+)$/
            puts "starting #{$1} feeds"
          elsif job.body =~ /^END (.+)$/
            puts "finished #{$1} feeds"
          else
            # Any other message body is a feed URL whose raw response is already in MongoDB.
            url = job.body
            puts "parsing #{url}"
            responses = Response.all(:conditions => {'url' => url})
            responses.each do |response|
              begin
                feed = Feedzirra::Feed.parse(response.data)
                f = Feed.create({:url => url, :status => 'to-process'})
                entries = feed.entries
                puts "found #{entries.length} entries in #{url}"
                entries.each do |entry|
                  begin
                    f.contents.create({
                      :title => entry.title,
                      :url => entry.url,
                      :author => entry.author,
                      :summary => entry.summary,
                      :content => entry.content,
                      :published => entry.published,
                      :categories => entry.categories
                    })
                  rescue => e
                    puts "error creating entry #{entry.url}: #{e.message}"
                  end
                end
              rescue => e
                puts "error parsing feed #{url}: #{e.message}"
              end
            end
          end
          job.delete
        end
      rescue Interrupt
        puts "Exiting parser"
      end
    end

    protected

    def setup_mongo(options={})
      options ||= {}
      options[:database] ||= 'feed_processor'
      MongoMapper.connection = Mongo::Connection.new(nil, nil)
      MongoMapper.database = options[:database]
    end
  end
end
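Both Fetcher and Parser take a :mongo option hash whose :database key overrides the default 'feed_processor' database. A short sketch of wiring the parser to another database (the database name is arbitrary, and the commented Content query reuses the same MongoMapper :conditions style the parser itself uses for Response):

require 'feed_processor'
require 'feed_processor/parser'

# Point the parser at a different MongoDB database.
parser = FeedProcessor::Parser.new(:mongo => {:database => 'feed_processor_test'})
parser.execute

# Elsewhere, parsed entries can be read back as Content documents, e.g.:
# Content.all(:conditions => {'url' => 'http://example.com/entry'})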
data/lib/feed_processor/util.rb
ADDED
@@ -0,0 +1,24 @@
require 'zlib'
require 'stringio'

module FeedProcessor
  class Util
    # shamelessly stolen from feedzirra
    def self.decode_content(res)
      case res['content-encoding']
      when 'gzip'
        begin
          gz = Zlib::GzipReader.new(StringIO.new(res.body))
          xml = gz.read
          gz.close
        rescue Zlib::GzipFile::Error
          # Maybe this is not gzipped?
          xml = res.body
        end
      when 'deflate'
        xml = Zlib::Inflate.inflate(res.body)
      else
        xml = res.body
      end

      xml
    end
  end
end
metadata
ADDED
@@ -0,0 +1,70 @@
--- !ruby/object:Gem::Specification
name: feed-processor
version: !ruby/object:Gem::Version
  version: 0.0.1
platform: ruby
authors:
- Anthony Eden
autorequire:
bindir: bin
cert_chain: []

date: 2010-02-22 00:00:00 -10:00
default_executable:
dependencies: []

description: Feed Processor is a multi-stage feed processor built with JRuby, MRI, beanstalk and MongoDB.
email: anthonyeden@gmail.com
executables:
- fetch
- parse
extensions: []

extra_rdoc_files:
- README.textile
files:
- .gitignore
- README.textile
- Rakefile
- VERSION
- bin/fetch
- bin/parse
- feed-processor.gemspec
- lib/feed_processor.rb
- lib/feed_processor/content.rb
- lib/feed_processor/feed.rb
- lib/feed_processor/fetcher.rb
- lib/feed_processor/file_based_request_generator.rb
- lib/feed_processor/parser.rb
- lib/feed_processor/response.rb
- lib/feed_processor/util.rb
has_rdoc: true
homepage: http://github.com/aeden/feed-processor
licenses: []

post_install_message:
rdoc_options:
- --charset=UTF-8
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: "0"
  version:
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: "0"
  version:
requirements: []

rubyforge_project:
rubygems_version: 1.3.5
signing_key:
specification_version: 3
summary: A multi-stage feed processor.
test_files: []