feed-processor 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/README.textile +43 -0
- data/Rakefile +15 -0
- data/VERSION +1 -0
- data/bin/fetch +20 -0
- data/bin/parse +11 -0
- data/feed-processor.gemspec +52 -0
- data/lib/feed_processor/content.rb +19 -0
- data/lib/feed_processor/feed.rb +13 -0
- data/lib/feed_processor/fetcher.rb +66 -0
- data/lib/feed_processor/file_based_request_generator.rb +17 -0
- data/lib/feed_processor/parser.rb +66 -0
- data/lib/feed_processor/response.rb +11 -0
- data/lib/feed_processor/util.rb +24 -0
- data/lib/feed_processor.rb +5 -0
- metadata +70 -0
data/.gitignore
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
urls.txt
|
data/README.textile
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
Feed Processor is a multi-stage feed processor built with JRuby, MRI, beanstalk and MongoDB.
|
2
|
+
|
3
|
+
There are two steps to the feed processing:
|
4
|
+
|
5
|
+
# Step 1: Download feed content using non-blocking IO and insert the raw data into MongoDB. A message is sent via Beanstalk notifying the parser stage that the feed data is ready for a specific feed.
|
6
|
+
# Step 2: A multi-processor feed parser pulls the raw data from MongoDB, parses it and inserts the resulting parsed record into MongoDB.
|
7
|
+
|
8
|
+
h2. Dependencies
|
9
|
+
|
10
|
+
* MongoDB
|
11
|
+
* beanstalkd
|
12
|
+
* JRuby
|
13
|
+
* MRI
|
14
|
+
|
15
|
+
Gems (for JRuby):
|
16
|
+
|
17
|
+
* jruby-http-reactor
|
18
|
+
* threadify
|
19
|
+
* beanstalk-client
|
20
|
+
* mongo_mapper
|
21
|
+
|
22
|
+
Gems (for MRI):
|
23
|
+
|
24
|
+
* beanstalk-client
|
25
|
+
* mongo_mapper
|
26
|
+
* feedzirra
|
27
|
+
|
28
|
+
h2. Executing
|
29
|
+
|
30
|
+
Each of the following commands should be executed in a separate console or executed to run as a background process.
|
31
|
+
|
32
|
+
Start MongoDB and Beanstalk:
|
33
|
+
|
34
|
+
mongod
|
35
|
+
beanstalkd
|
36
|
+
|
37
|
+
Run the fetch processor:
|
38
|
+
|
39
|
+
jruby -rubygems -Ilib bin/fetch urls.txt
|
40
|
+
|
41
|
+
Run the parse processor:
|
42
|
+
|
43
|
+
ruby -rubygems -Ilib bin/parse
|
data/Rakefile
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# Rake tasks for building and releasing the feed-processor gem.
#
# Jeweler generates the gemspec and provides the gem-management tasks;
# it is an optional development dependency, so a missing install is
# reported rather than treated as fatal.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gemspec|
    gemspec.name = "feed-processor"
    gemspec.summary = "A multi-stage feed processor."
    gemspec.description = "Feed Processor is a multi-stage feed processor built with JRuby, MRI, beanstalk and MongoDB."
    gemspec.email = "anthonyeden@gmail.com"
    gemspec.homepage = "http://github.com/aeden/feed-processor"
    gemspec.authors = ["Anthony Eden"]
  end
  # Must stay inside the begin block: when jeweler fails to load, this
  # constant is undefined and, placed after the rescue, it raised
  # NameError instead of printing the friendly install hint.
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler not available. Install it with: gem install jeweler"
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.1
|
data/bin/fetch
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
#!/usr/bin/env jruby
#
# Fetches the content from the list of URLs provided on the command
# line and stores that content and its response code in a MongoDB
# database.
#
# Usage: jruby -rubygems -Ilib bin/fetch urls.txt
#
# Dependencies:
#
# * jruby-http-reactor
# * beanstalk-client
# * threadify
# * mongo_mapper

# Unbuffered stdout so per-URL progress is visible while fetching.
$stdout.sync = true

require 'feed_processor'
require 'feed_processor/fetcher'

FeedProcessor::Fetcher.new(:threads => 16).execute
|
data/bin/parse
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
#!/usr/bin/env ruby
#
# Listens for events on a queue and parses the HTTP response body
# stored in MongoDB for the given URL.
#
# Usage: ruby -rubygems -Ilib bin/parse

require 'feed_processor'
require 'feed_processor/parser'

FeedProcessor::Parser.new.execute
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# Generated by jeweler
# DO NOT EDIT THIS FILE DIRECTLY
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
# -*- encoding: utf-8 -*-

Gem::Specification.new do |s|
  s.name = "feed-processor"
  s.version = "0.0.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Anthony Eden"]
  s.date = "2010-02-22"
  s.description = "Feed Processor is a multi-stage feed processor built with JRuby, MRI, beanstalk and MongoDB."
  s.email = "anthonyeden@gmail.com"
  s.executables = ["fetch", "parse"]
  s.extra_rdoc_files = [
    "README.textile"
  ]
  s.files = [
    ".gitignore",
    "README.textile",
    "Rakefile",
    "VERSION",
    "bin/fetch",
    "bin/parse",
    "feed-processor.gemspec",
    "lib/feed_processor.rb",
    "lib/feed_processor/content.rb",
    "lib/feed_processor/feed.rb",
    "lib/feed_processor/fetcher.rb",
    "lib/feed_processor/file_based_request_generator.rb",
    "lib/feed_processor/parser.rb",
    "lib/feed_processor/response.rb",
    "lib/feed_processor/util.rb"
  ]
  s.homepage = "http://github.com/aeden/feed-processor"
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = "1.3.5"
  s.summary = "A multi-stage feed processor."

  if s.respond_to? :specification_version then
    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
    s.specification_version = 3

    # No version-specific runtime dependencies are declared, so both
    # branches are intentionally empty (jeweler emits this skeleton).
    if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
    else
    end
  else
  end
end
|
52
|
+
|
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'mongo_mapper'

# A single entry (article/item) parsed out of a feed, persisted in
# MongoDB via MongoMapper by the parse stage.
class Content
  include MongoMapper::Document

  key :title,      String
  key :url,        String
  key :author,     String
  key :summary,    String
  key :content,    String
  key :published,  Date
  key :categories, Array
  key :feed_id,    String

  # Each content item belongs to the feed it was parsed out of.
  belongs_to :feed

  # An entry's URL identifies it; reject duplicate inserts.
  validates_uniqueness_of :url

end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require 'mongo_mapper'
require 'uri'
require 'http_reactor'
require 'threadify'
require 'beanstalk-client'

module FeedProcessor
  # Downloads feed content with non-blocking IO, stores each raw HTTP
  # response in MongoDB, and notifies the parser stage via Beanstalk
  # for every successfully (HTTP 200) fetched feed.
  class Fetcher
    attr_reader :options

    # options:
    #   :threads           - number of worker threads (default 16)
    #   :request_generator - object responding to #requests; defaults to
    #                        FileBasedRequestGenerator (URLs read from a
    #                        file named on the command line)
    #   :mongo             - MongoDB options hash (see #setup_mongo)
    def initialize(options={})
      @options = options
      @request_generator = options[:request_generator] || FeedProcessor::FileBasedRequestGenerator.new
      setup_mongo(options[:mongo])
    end

    # The list of HTTP requests to execute, delegated to the generator.
    def requests
      @request_generator.requests
    end

    # Fetches all requests, sliced evenly across the worker threads.
    # Brackets the batch with "START n" / "END n" control messages on
    # the queue, and enqueues the URL of each 200 response so the
    # parser stage can pick it up.
    def execute
      number_of_threads = options[:threads] || 16
      main_queue = Beanstalk::Pool.new(['localhost:11300'])

      number_of_requests = requests.length
      main_queue.put("START #{number_of_requests}")
      slice_size = number_of_requests / number_of_threads
      slice_size = 1 if slice_size < 1
      puts "Each slice has #{slice_size} urls (#{number_of_requests} requests / #{number_of_threads} threads)"

      requests.threadify(:each_slice, slice_size) do |slice|
        # Each worker thread gets its own Beanstalk connection rather
        # than sharing main_queue across threads.
        queue = Beanstalk::Pool.new(['localhost:11300'])
        HttpReactor::Client.new(slice) do |response, context|
          begin
            request = context.get_attribute('http_target_request')
            puts "#{response.code}:#{request.uri}"

            Response.create({
              :url => request.uri,
              :data => FeedProcessor::Util.decode_content(response),
              :status => response.code
            })

            if response.code == 200
              queue.put(request.uri.to_s)
            end
          rescue StandardError => e
            # Was `rescue Exception`, which also swallowed Interrupt,
            # SystemExit and other non-recoverable errors; StandardError
            # covers everything a handler should survive.
            puts "Exception in handler: #{e.message}"
          end
        end
      end

      main_queue.put("END #{number_of_requests}")
      puts "Fetched #{number_of_requests} feeds"

    end

    protected

    # Connects MongoMapper to MongoDB. options[:database] selects the
    # database (default 'feed_processor').
    # NOTE(review): host/port from options are never used; the connection
    # is always Mongo::Connection.new(nil, nil) — confirm this is intended.
    def setup_mongo(options={})
      options ||= {}
      options[:database] ||= 'feed_processor'
      MongoMapper.connection = Mongo::Connection.new(nil, nil)
      MongoMapper.database = options[:database]
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module FeedProcessor
  # Builds the list of HTTP requests from a file of URLs named as the
  # last command-line argument (one URL per line; only http: lines are
  # used).
  class FileBasedRequestGenerator
    # Lazily reads the URL file and memoizes the resulting request list.
    # Note: consumes the last element of ARGV on first invocation.
    def requests
      @requests ||= begin
        puts "Generating requests"
        requests = []
        # File.foreach instead of Kernel#open: open("|cmd") spawns a
        # subprocess, so a hostile filename could execute commands.
        File.foreach(ARGV.pop) do |line|
          # Strip the trailing newline/whitespace before parsing:
          # URI.parse raises InvalidURIError on embedded whitespace.
          url = line.strip
          requests << HttpReactor::Request.new(URI.parse(url)) if url =~ /\Ahttp:/
        end
        puts "Generated #{requests.length} requests"
        requests
      end
    end
  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require 'mongo_mapper'
require 'beanstalk-client'
require 'feedzirra'

module FeedProcessor
  # Consumes fetched-feed notifications from Beanstalk, parses the raw
  # response bodies stored in MongoDB with Feedzirra, and persists the
  # resulting Feed and Content records.
  class Parser
    # options[:mongo] configures the MongoDB connection (see #setup_mongo).
    def initialize(options={})
      @options = options
      setup_mongo(options[:mongo])
    end
    # Blocks forever reserving jobs from the local Beanstalk queue.
    # "START n" / "END n" control messages are only logged; any other
    # job body is treated as the URL of a fetched feed to parse.
    # Interrupt (Ctrl-C) exits the loop cleanly.
    def execute
      pool = Beanstalk::Pool.new(['localhost:11300'])
      puts "Now accepting feeds to parse..."
      begin
        loop do
          job = pool.reserve
          case job.body
          when /^START (.+)$/
            puts "starting #{$1} feeds"
          when /^END (.+)$/
            puts "finished #{$1} feeds"
          else
            url = job.body
            puts "parsing #{url}"
            Response.all(:conditions => {'url' => url}).each do |response|
              begin
                parsed = Feedzirra::Feed.parse(response.data)
                feed_record = Feed.create({:url => url, :status => 'to-process'})
                entries = parsed.entries
                puts "found #{entries.length} entries in #{url}"
                entries.each do |entry|
                  begin
                    feed_record.contents.create({
                      :title => entry.title,
                      :url => entry.url,
                      :author => entry.author,
                      :summary => entry.summary,
                      :content => entry.content,
                      :published => entry.published,
                      :categories => entry.categories,
                    })
                  rescue => e
                    puts "error creating entry #{entry.url}: #{e.message}"
                  end
                end
              rescue => e
                puts "error parsing feed #{url}: #{e.message}"
              end
            end
          end
          # Every job is removed from the queue, control message or not.
          job.delete
        end
      rescue Interrupt
        puts "Exiting parser"
      end
    end

    protected

    # Connects MongoMapper to MongoDB; database defaults to
    # 'feed_processor'.
    def setup_mongo(options={})
      options ||= {}
      options[:database] ||= 'feed_processor'
      MongoMapper.connection = Mongo::Connection.new(nil, nil)
      MongoMapper.database = options[:database]
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# Explicitly require the stdlib the decoder depends on: the original
# relied on zlib/stringio having been loaded elsewhere in the process.
require 'zlib'
require 'stringio'

module FeedProcessor
  class Util
    # shamelessly stolen from feedzirra
    #
    # Decompresses an HTTP response body according to its
    # Content-Encoding header. res must respond to #[] (header lookup)
    # and #body.
    #
    # Returns the decoded body string. A body that claims gzip but
    # fails to inflate is returned as-is.
    def self.decode_content(res)
      case res['content-encoding']
      when 'gzip'
        begin
          gz = Zlib::GzipReader.new(StringIO.new(res.body))
          xml = gz.read
          gz.close
        rescue Zlib::GzipFile::Error
          # Maybe this is not gzipped?
          xml = res.body
        end
      when 'deflate'
        xml = Zlib::Inflate.inflate(res.body)
      else
        xml = res.body
      end

      xml
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: feed-processor
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Anthony Eden
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-02-22 00:00:00 -10:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: Feed Processor is a multi-stage feed processor built with JRuby, MRI, beanstalk and MongoDB.
|
17
|
+
email: anthonyeden@gmail.com
|
18
|
+
executables:
|
19
|
+
- fetch
|
20
|
+
- parse
|
21
|
+
extensions: []
|
22
|
+
|
23
|
+
extra_rdoc_files:
|
24
|
+
- README.textile
|
25
|
+
files:
|
26
|
+
- .gitignore
|
27
|
+
- README.textile
|
28
|
+
- Rakefile
|
29
|
+
- VERSION
|
30
|
+
- bin/fetch
|
31
|
+
- bin/parse
|
32
|
+
- feed-processor.gemspec
|
33
|
+
- lib/feed_processor.rb
|
34
|
+
- lib/feed_processor/content.rb
|
35
|
+
- lib/feed_processor/feed.rb
|
36
|
+
- lib/feed_processor/fetcher.rb
|
37
|
+
- lib/feed_processor/file_based_request_generator.rb
|
38
|
+
- lib/feed_processor/parser.rb
|
39
|
+
- lib/feed_processor/response.rb
|
40
|
+
- lib/feed_processor/util.rb
|
41
|
+
has_rdoc: true
|
42
|
+
homepage: http://github.com/aeden/feed-processor
|
43
|
+
licenses: []
|
44
|
+
|
45
|
+
post_install_message:
|
46
|
+
rdoc_options:
|
47
|
+
- --charset=UTF-8
|
48
|
+
require_paths:
|
49
|
+
- lib
|
50
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: "0"
|
55
|
+
version:
|
56
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: "0"
|
61
|
+
version:
|
62
|
+
requirements: []
|
63
|
+
|
64
|
+
rubyforge_project:
|
65
|
+
rubygems_version: 1.3.5
|
66
|
+
signing_key:
|
67
|
+
specification_version: 3
|
68
|
+
summary: A multi-stage feed processor.
|
69
|
+
test_files: []
|
70
|
+
|