RubyGems - feed-processor - Versions diffs - 0.0.1 - Mend

feed-processor 0.0.1

Files changed (16) hide show

data/.gitignore +1 -0
data/README.textile +43 -0
data/Rakefile +15 -0
data/VERSION +1 -0
data/bin/fetch +20 -0
data/bin/parse +11 -0
data/feed-processor.gemspec +52 -0
data/lib/feed_processor/content.rb +19 -0
data/lib/feed_processor/feed.rb +13 -0
data/lib/feed_processor/fetcher.rb +66 -0
data/lib/feed_processor/file_based_request_generator.rb +17 -0
data/lib/feed_processor/parser.rb +66 -0
data/lib/feed_processor/response.rb +11 -0
data/lib/feed_processor/util.rb +24 -0
data/lib/feed_processor.rb +5 -0
metadata +70 -0

data/.gitignore ADDED Viewed

	@@ -0,0 +1 @@
1	+ urls.txt

data/README.textile ADDED Viewed

@@ -0,0 +1,43 @@
+Feed Processor is a multi-stage feed processor built with JRuby, MRI, beanstalk and MongoDB.
+There are two steps to the feed processing:
+# Step 1: Download feed content using non-blocking IO and insert the raw data into MongoDB. A message is sent via Beanstalk notifying the parser stage that the feed data is ready for a specific feed.
+# Step 2: A multi-processor feed parser pulls the raw data from MongoDB, parses it and inserts the resulting parsed record into MongoDB.
+h2. Dependencies
+* MongoDB
+* beanstalkd
+* JRuby
+* MRI
+Gems (for JRuby):
+* jruby-http-reactor
+* threadify
+* beanstalk-client
+* mongo_mapper
+Gems (for MRI):
+* beanstalk-client
+* mongo_mapper
+* feedzirra
+h2. Executing
+Run each of the following commands in its own console, or start it as a background process.
+Start MongoDB and Beanstalk:
+  mongod
+  beanstalkd
+Run the fetch processor:
+  jruby -rubygems -Ilib bin/fetch urls.txt
+Run the parse processor:
+  ruby -rubygems -Ilib bin/parse

data/Rakefile ADDED Viewed

@@ -0,0 +1,15 @@
+begin
+  require 'jeweler'
+  Jeweler::Tasks.new do |gemspec|
+    gemspec.name = "feed-processor"
+    gemspec.summary = "A multi-stage feed processor."
+    gemspec.description = "Feed Processor is a multi-stage feed processor built with JRuby, MRI, beanstalk and MongoDB."
+    gemspec.email = "anthonyeden@gmail.com"
+    gemspec.homepage = "http://github.com/aeden/feed-processor"
+    gemspec.authors = ["Anthony Eden"]
+  end
+rescue LoadError
+  puts "Jeweler not available. Install it with: gem install jeweler"
+end
+Jeweler::GemcutterTasks.new

data/VERSION ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0.0.1

data/bin/fetch ADDED Viewed

@@ -0,0 +1,20 @@
+#!/usr/bin/env jruby
+#
+# Description: Fetches the content from the list of URLs provided
+# on the command line and stores that content and its response code
+# in a MongoDB database.
+#
+# Usage: jruby -rubygems -Ilib bin/fetch urls.txt
+#
+# Dependencies:
+#
+# * jruby-http-reactor
+# * beanstalk-client
+# * threadify
+# * mongo_mapper
+$stdout.sync = true
+require 'feed_processor'
+require 'feed_processor/fetcher'
+fetcher = FeedProcessor::Fetcher.new(:threads => 16)
+fetcher.execute

data/bin/parse ADDED Viewed

@@ -0,0 +1,11 @@
+#!/usr/bin/env ruby
+# Description: Listens for events on a queue and parses the
+# HTTP response body stored in MongoDB for the given URL.
+#
+# Usage: ruby -rubygems -Ilib bin/parse
+require 'feed_processor'
+require 'feed_processor/parser'
+parser = FeedProcessor::Parser.new
+parser.execute

data/feed-processor.gemspec ADDED Viewed

@@ -0,0 +1,52 @@
+# Generated by jeweler
+# DO NOT EDIT THIS FILE DIRECTLY
+# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
+# -*- encoding: utf-8 -*-
+Gem::Specification.new do |s|
+  s.name = %q{feed-processor}
+  s.version = "0.0.1"
+  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+  s.authors = ["Anthony Eden"]
+  s.date = %q{2010-02-22}
+  s.description = %q{Feed Processor is a multi-stage feed processor built with JRuby, MRI, beanstalk and MongoDB.}
+  s.email = %q{anthonyeden@gmail.com}
+  s.executables = ["fetch", "parse"]
+  s.extra_rdoc_files = [
+    "README.textile"
+  ]
+  s.files = [
+    ".gitignore",
+     "README.textile",
+     "Rakefile",
+     "VERSION",
+     "bin/fetch",
+     "bin/parse",
+     "feed-processor.gemspec",
+     "lib/feed_processor.rb",
+     "lib/feed_processor/content.rb",
+     "lib/feed_processor/feed.rb",
+     "lib/feed_processor/fetcher.rb",
+     "lib/feed_processor/file_based_request_generator.rb",
+     "lib/feed_processor/parser.rb",
+     "lib/feed_processor/response.rb",
+     "lib/feed_processor/util.rb"
+  ]
+  s.homepage = %q{http://github.com/aeden/feed-processor}
+  s.rdoc_options = ["--charset=UTF-8"]
+  s.require_paths = ["lib"]
+  s.rubygems_version = %q{1.3.5}
+  s.summary = %q{A multi-stage feed processor.}
+  if s.respond_to? :specification_version then
+    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+    s.specification_version = 3
+    if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
+    else
+    end
+  else
+  end
+end

data/lib/feed_processor/content.rb ADDED Viewed

@@ -0,0 +1,19 @@
+require 'mongo_mapper'
+class Content
+  include MongoMapper::Document
+  key :title, String
+  key :url, String
+  key :author, String
+  key :summary, String
+  key :content, String
+  key :published, Date
+  key :categories, Array
+  key :feed_id, String
+  belongs_to :feed
+  validates_uniqueness_of :url
+end

data/lib/feed_processor/feed.rb ADDED Viewed

@@ -0,0 +1,13 @@
+require 'mongo_mapper'
+class Feed
+  include MongoMapper::Document
+  key :title, String
+  key :url, String
+  key :status, String
+  many :contents
+  validates_uniqueness_of :url
+end

data/lib/feed_processor/fetcher.rb ADDED Viewed

@@ -0,0 +1,66 @@
+require 'mongo_mapper'
+require 'uri'
+require 'http_reactor'
+require 'threadify'
+require 'beanstalk-client'
+module FeedProcessor
+  class Fetcher
+    attr_reader :options
+    def initialize(options={})
+      @options = options
+      @request_generator = options[:request_generator] || FeedProcessor::FileBasedRequestGenerator.new
+      setup_mongo(options[:mongo])
+    end
+    def requests
+      @request_generator.requests
+    end
+    def execute
+      number_of_threads = options[:threads] || 16
+      main_queue = Beanstalk::Pool.new(['localhost:11300'])
+      number_of_requests = requests.length
+      main_queue.put("START #{number_of_requests}")
+      slice_size = number_of_requests / number_of_threads
+      slice_size = 1 if slice_size < 1
+      puts "Each slice has #{slice_size} urls (#{number_of_requests} requests / #{number_of_threads} threads)"
+      requests.threadify(:each_slice, slice_size) do |slice|
+        queue = Beanstalk::Pool.new(['localhost:11300'])
+        HttpReactor::Client.new(slice) do |response, context|
+          begin
+            request = context.get_attribute('http_target_request')
+            puts "#{response.code}:#{request.uri}"
+            Response.create({
+              :url => request.uri,
+              :data => FeedProcessor::Util.decode_content(response),
+              :status => response.code
+            })
+            if response.code == 200
+              queue.put(request.uri.to_s)
+            end
+          rescue Exception => e
+            puts "Exception in handler: #{e.message}"
+          end
+        end
+      end
+      main_queue.put("END #{number_of_requests}")
+      puts "Fetched #{number_of_requests} feeds"
+    end
+    protected
+    def setup_mongo(options={})
+      options ||= {}
+      options[:database] ||= 'feed_processor'
+      MongoMapper.connection = Mongo::Connection.new(nil, nil)
+      MongoMapper.database = options[:database]
+    end
+  end
+end

data/lib/feed_processor/file_based_request_generator.rb ADDED Viewed

@@ -0,0 +1,17 @@
+module FeedProcessor
+  class FileBasedRequestGenerator
+    def requests
+      @requests ||= begin
+        puts "Generating requests"
+        requests = []
+        open(ARGV.pop) do |f|
+          f.each do |line|
+            requests << HttpReactor::Request.new(URI.parse(line)) if line =~ /^http:/
+          end
+        end
+        puts "Generated #{requests.length} requests"
+        requests
+      end
+    end
+  end
+end

data/lib/feed_processor/parser.rb ADDED Viewed

@@ -0,0 +1,66 @@
+require 'mongo_mapper'
+require 'beanstalk-client'
+require 'feedzirra'
+module FeedProcessor
+  class Parser
+    def initialize(options={})
+      @options = options
+      setup_mongo(options[:mongo])
+    end
+    def execute
+      queue = Beanstalk::Pool.new(['localhost:11300'])
+      puts "Now accepting feeds to parse..."
+      begin
+        loop do
+          job = queue.reserve
+          if job.body =~ /^START (.+)$/
+            puts "starting #{$1} feeds"
+          elsif job.body =~ /^END (.+)$/
+            puts "finished #{$1} feeds"
+          else
+            url = job.body
+            puts "parsing #{url}"
+            responses = Response.all(:conditions => {'url' => url})
+            responses.each do |response|
+              begin
+                feed = Feedzirra::Feed.parse(response.data)
+                f = Feed.create({:url => url, :status => 'to-process'})
+                entries = feed.entries
+                puts "found #{entries.length} entries in #{url}"
+                entries.each do |entry|
+                  begin
+                    f.contents.create({
+                      :title => entry.title,
+                      :url => entry.url,
+                      :author => entry.author,
+                      :summary => entry.summary,
+                      :content => entry.content,
+                      :published => entry.published,
+                      :categories => entry.categories,
+                    })
+                  rescue => e
+                    puts "error creating entry #{entry.url}: #{e.message}"
+                  end
+                end
+              rescue => e
+                puts "error parsing feed #{url}: #{e.message}"
+              end
+            end
+          end
+          job.delete
+        end
+      rescue Interrupt
+        puts "Exiting parser"
+      end
+    end
+    protected
+    def setup_mongo(options={})
+      options ||= {}
+      options[:database] ||= 'feed_processor'
+      MongoMapper.connection = Mongo::Connection.new(nil, nil)
+      MongoMapper.database = options[:database]
+    end
+  end
+end

data/lib/feed_processor/response.rb ADDED Viewed

@@ -0,0 +1,11 @@
+require 'mongo_mapper'
+class Response
+  include MongoMapper::Document
+  key :url, String
+  key :data, String
+  key :status, String
+  validates_uniqueness_of :url
+end

data/lib/feed_processor/util.rb ADDED Viewed

@@ -0,0 +1,24 @@
+require 'zlib'
+require 'stringio'
+module FeedProcessor
+  # Helpers shared by the feed processing stages.
+  class Util
+    # shamelessly stolen from feedzirra
+    #
+    # Returns the body of an HTTP response, decompressed according to its
+    # content-encoding header.
+    #
+    # res - any object answering res['content-encoding'] and res.body.
+    #
+    # 'gzip' bodies are gunzipped, falling back to the body as-is when it is
+    # not valid gzip data. 'deflate' bodies are inflated. Any other encoding
+    # returns the body untouched.
+    def self.decode_content(res)
+      case res['content-encoding']
+      when 'gzip'
+        gunzip(res.body)
+      when 'deflate'
+        inflate(res.body)
+      else
+        res.body
+      end
+    end
+    # Gunzips data, returning data itself when it is not gzip formatted.
+    def self.gunzip(data)
+      gz = Zlib::GzipReader.new(StringIO.new(data))
+      begin
+        gz.read
+      ensure
+        # Release the reader even when reading fails part-way.
+        gz.close
+      end
+    rescue Zlib::GzipFile::Error
+      # Maybe this is not gzipped?
+      data
+    end
+    # Inflates data. A body rejected as a zlib stream is retried as a raw
+    # DEFLATE stream (no zlib header), a form some servers send under the
+    # 'deflate' label.
+    def self.inflate(data)
+      Zlib::Inflate.inflate(data)
+    rescue Zlib::DataError
+      raw = Zlib::Inflate.new(-Zlib::MAX_WBITS)
+      begin
+        xml = raw.inflate(data)
+        # finish raises on a truncated stream, like Zlib::Inflate.inflate.
+        xml << raw.finish
+        xml
+      ensure
+        raw.close
+      end
+    end
+    private_class_method :gunzip, :inflate
+  end
+end

data/lib/feed_processor.rb ADDED Viewed

@@ -0,0 +1,5 @@
+require 'feed_processor/util'
+require 'feed_processor/file_based_request_generator'
+require 'feed_processor/response'
+require 'feed_processor/feed'
+require 'feed_processor/content'

metadata ADDED Viewed

@@ -0,0 +1,70 @@
+--- !ruby/object:Gem::Specification
+name: feed-processor
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- Anthony Eden
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2010-02-22 00:00:00 -10:00
+default_executable:
+dependencies: []
+description: Feed Processor is a multi-stage feed processor built with JRuby, MRI, beanstalk and MongoDB.
+email: anthonyeden@gmail.com
+executables:
+- fetch
+- parse
+extensions: []
+extra_rdoc_files:
+- README.textile
+files:
+- .gitignore
+- README.textile
+- Rakefile
+- VERSION
+- bin/fetch
+- bin/parse
+- feed-processor.gemspec
+- lib/feed_processor.rb
+- lib/feed_processor/content.rb
+- lib/feed_processor/feed.rb
+- lib/feed_processor/fetcher.rb
+- lib/feed_processor/file_based_request_generator.rb
+- lib/feed_processor/parser.rb
+- lib/feed_processor/response.rb
+- lib/feed_processor/util.rb
+has_rdoc: true
+homepage: http://github.com/aeden/feed-processor
+licenses: []
+post_install_message:
+rdoc_options:
+- --charset=UTF-8
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+rubyforge_project:
+rubygems_version: 1.3.5
+signing_key:
+specification_version: 3
+summary: A multi-stage feed processor.
+test_files: []