feed-processor 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/README.textile +43 -0
- data/Rakefile +15 -0
- data/VERSION +1 -0
- data/bin/fetch +20 -0
- data/bin/parse +11 -0
- data/feed-processor.gemspec +52 -0
- data/lib/feed_processor/content.rb +19 -0
- data/lib/feed_processor/feed.rb +13 -0
- data/lib/feed_processor/fetcher.rb +66 -0
- data/lib/feed_processor/file_based_request_generator.rb +17 -0
- data/lib/feed_processor/parser.rb +66 -0
- data/lib/feed_processor/response.rb +11 -0
- data/lib/feed_processor/util.rb +24 -0
- data/lib/feed_processor.rb +5 -0
- metadata +70 -0
data/.gitignore
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
urls.txt
|
data/README.textile
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
Feed Processor is a multi-stage feed processor built with JRuby, MRI, beanstalk and MongoDB.
|
2
|
+
|
3
|
+
There are two steps to the feed processing:
|
4
|
+
|
5
|
+
# Step 1: Download feed content using non-blocking IO and insert the raw data into MongoDB. A message is sent via Beanstalk notifying the parser stage that the feed data is ready for a specific feed.
|
6
|
+
# Step 2: A multi-processor feed parser pulls the raw data from MongoDB, parses it and inserts the resulting parsed record into MongoDB.
|
7
|
+
|
8
|
+
h2. Dependencies
|
9
|
+
|
10
|
+
* MongoDB
|
11
|
+
* beanstalkd
|
12
|
+
* JRuby
|
13
|
+
* MRI
|
14
|
+
|
15
|
+
Gems (for JRuby):
|
16
|
+
|
17
|
+
* jruby-http-reactor
|
18
|
+
* threadify
|
19
|
+
* beanstalk-client
|
20
|
+
* mongo_mapper
|
21
|
+
|
22
|
+
Gems (for MRI):
|
23
|
+
|
24
|
+
* beanstalk-client
|
25
|
+
* mongo_mapper
|
26
|
+
* feedzirra
|
27
|
+
|
28
|
+
h2. Executing
|
29
|
+
|
30
|
+
Each of the following commands should be executed in a separate console or executed to run as a background process.
|
31
|
+
|
32
|
+
Start MongoDB and Beanstalk:
|
33
|
+
|
34
|
+
mongod
|
35
|
+
beanstalkd
|
36
|
+
|
37
|
+
Run the fetch processor:
|
38
|
+
|
39
|
+
jruby -rubygems -Ilib bin/fetch urls.txt
|
40
|
+
|
41
|
+
Run the parse processor:
|
42
|
+
|
43
|
+
ruby -rubygems -Ilib bin/parse
|
data/Rakefile
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# Rake tasks for building and releasing the feed-processor gem.
#
# Jeweler generates the gemspec and provides the gem-management tasks;
# it is an optional development dependency, so a missing install is
# reported rather than treated as fatal.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gemspec|
    gemspec.name = "feed-processor"
    gemspec.summary = "A multi-stage feed processor."
    gemspec.description = "Feed Processor is a multi-stage feed processor built with JRuby, MRI, beanstalk and MongoDB."
    gemspec.email = "anthonyeden@gmail.com"
    gemspec.homepage = "http://github.com/aeden/feed-processor"
    gemspec.authors = ["Anthony Eden"]
  end
  # Must stay inside the begin block: when jeweler fails to load, this
  # constant is undefined and, placed after the rescue, it raised
  # NameError instead of printing the friendly install hint.
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler not available. Install it with: gem install jeweler"
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.1
|
data/bin/fetch
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
#!/usr/bin/env jruby
#
# Fetches the content from the list of URLs provided on the command
# line and stores that content and its response code in a MongoDB
# database.
#
# Usage: jruby -rubygems -Ilib bin/fetch urls.txt
#
# Dependencies:
#
# * jruby-http-reactor
# * beanstalk-client
# * threadify
# * mongo_mapper

# Unbuffered stdout so per-URL progress is visible while fetching.
$stdout.sync = true

require 'feed_processor'
require 'feed_processor/fetcher'

FeedProcessor::Fetcher.new(:threads => 16).execute
|
data/bin/parse
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
#!/usr/bin/env ruby
#
# Listens for events on a queue and parses the HTTP response body
# stored in MongoDB for the given URL.
#
# Usage: ruby -rubygems -Ilib bin/parse

require 'feed_processor'
require 'feed_processor/parser'

FeedProcessor::Parser.new.execute
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# Generated by jeweler
# DO NOT EDIT THIS FILE DIRECTLY
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
# -*- encoding: utf-8 -*-

Gem::Specification.new do |s|
  s.name = "feed-processor"
  s.version = "0.0.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Anthony Eden"]
  s.date = "2010-02-22"
  s.description = "Feed Processor is a multi-stage feed processor built with JRuby, MRI, beanstalk and MongoDB."
  s.email = "anthonyeden@gmail.com"
  s.executables = ["fetch", "parse"]
  s.extra_rdoc_files = [
    "README.textile"
  ]
  s.files = [
    ".gitignore",
    "README.textile",
    "Rakefile",
    "VERSION",
    "bin/fetch",
    "bin/parse",
    "feed-processor.gemspec",
    "lib/feed_processor.rb",
    "lib/feed_processor/content.rb",
    "lib/feed_processor/feed.rb",
    "lib/feed_processor/fetcher.rb",
    "lib/feed_processor/file_based_request_generator.rb",
    "lib/feed_processor/parser.rb",
    "lib/feed_processor/response.rb",
    "lib/feed_processor/util.rb"
  ]
  s.homepage = "http://github.com/aeden/feed-processor"
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = "1.3.5"
  s.summary = "A multi-stage feed processor."

  if s.respond_to? :specification_version then
    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
    s.specification_version = 3

    # No version-specific runtime dependencies are declared, so both
    # branches are intentionally empty (jeweler emits this skeleton).
    if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
    else
    end
  else
  end
end
|
52
|
+
|
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'mongo_mapper'

# A single entry (article/item) parsed out of a feed, persisted in
# MongoDB via MongoMapper by the parse stage.
class Content
  include MongoMapper::Document

  key :title,      String
  key :url,        String
  key :author,     String
  key :summary,    String
  key :content,    String
  key :published,  Date
  key :categories, Array
  key :feed_id,    String

  # Each content item belongs to the feed it was parsed out of.
  belongs_to :feed

  # An entry's URL identifies it; reject duplicate inserts.
  validates_uniqueness_of :url

end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require 'mongo_mapper'
require 'uri'
require 'http_reactor'
require 'threadify'
require 'beanstalk-client'

module FeedProcessor
  # Downloads feed content with non-blocking IO, stores each raw HTTP
  # response in MongoDB, and notifies the parser stage via Beanstalk
  # for every successfully (HTTP 200) fetched feed.
  class Fetcher
    attr_reader :options

    # options:
    #   :threads           - number of worker threads (default 16)
    #   :request_generator - object responding to #requests; defaults to
    #                        FileBasedRequestGenerator (URLs read from a
    #                        file named on the command line)
    #   :mongo             - MongoDB options hash (see #setup_mongo)
    def initialize(options={})
      @options = options
      @request_generator = options[:request_generator] || FeedProcessor::FileBasedRequestGenerator.new
      setup_mongo(options[:mongo])
    end

    # The list of HTTP requests to execute, delegated to the generator.
    def requests
      @request_generator.requests
    end

    # Fetches all requests, sliced evenly across the worker threads.
    # Brackets the batch with "START n" / "END n" control messages on
    # the queue, and enqueues the URL of each 200 response so the
    # parser stage can pick it up.
    def execute
      number_of_threads = options[:threads] || 16
      main_queue = Beanstalk::Pool.new(['localhost:11300'])

      number_of_requests = requests.length
      main_queue.put("START #{number_of_requests}")
      slice_size = number_of_requests / number_of_threads
      slice_size = 1 if slice_size < 1
      puts "Each slice has #{slice_size} urls (#{number_of_requests} requests / #{number_of_threads} threads)"

      requests.threadify(:each_slice, slice_size) do |slice|
        # Each worker thread gets its own Beanstalk connection rather
        # than sharing main_queue across threads.
        queue = Beanstalk::Pool.new(['localhost:11300'])
        HttpReactor::Client.new(slice) do |response, context|
          begin
            request = context.get_attribute('http_target_request')
            puts "#{response.code}:#{request.uri}"

            Response.create({
              :url => request.uri,
              :data => FeedProcessor::Util.decode_content(response),
              :status => response.code
            })

            if response.code == 200
              queue.put(request.uri.to_s)
            end
          rescue StandardError => e
            # Was `rescue Exception`, which also swallowed Interrupt,
            # SystemExit and other non-recoverable errors; StandardError
            # covers everything a handler should survive.
            puts "Exception in handler: #{e.message}"
          end
        end
      end

      main_queue.put("END #{number_of_requests}")
      puts "Fetched #{number_of_requests} feeds"

    end

    protected

    # Connects MongoMapper to MongoDB. options[:database] selects the
    # database (default 'feed_processor').
    # NOTE(review): host/port from options are never used; the connection
    # is always Mongo::Connection.new(nil, nil) — confirm this is intended.
    def setup_mongo(options={})
      options ||= {}
      options[:database] ||= 'feed_processor'
      MongoMapper.connection = Mongo::Connection.new(nil, nil)
      MongoMapper.database = options[:database]
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module FeedProcessor
  # Builds the list of HTTP requests from a file of URLs named as the
  # last command-line argument (one URL per line; only http: lines are
  # used).
  class FileBasedRequestGenerator
    # Lazily reads the URL file and memoizes the resulting request list.
    # Note: consumes the last element of ARGV on first invocation.
    def requests
      @requests ||= begin
        puts "Generating requests"
        requests = []
        # File.foreach instead of Kernel#open: open("|cmd") spawns a
        # subprocess, so a hostile filename could execute commands.
        File.foreach(ARGV.pop) do |line|
          # Strip the trailing newline/whitespace before parsing:
          # URI.parse raises InvalidURIError on embedded whitespace.
          url = line.strip
          requests << HttpReactor::Request.new(URI.parse(url)) if url =~ /\Ahttp:/
        end
        puts "Generated #{requests.length} requests"
        requests
      end
    end
  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require 'mongo_mapper'
require 'beanstalk-client'
require 'feedzirra'

module FeedProcessor
  # Consumes fetched-feed notifications from Beanstalk, parses the raw
  # response bodies stored in MongoDB with Feedzirra, and persists the
  # resulting Feed and Content records.
  class Parser
    # options[:mongo] configures the MongoDB connection (see #setup_mongo).
    def initialize(options={})
      @options = options
      setup_mongo(options[:mongo])
    end
    # Blocks forever reserving jobs from the local Beanstalk queue.
    # "START n" / "END n" control messages are only logged; any other
    # job body is treated as the URL of a fetched feed to parse.
    # Interrupt (Ctrl-C) exits the loop cleanly.
    def execute
      pool = Beanstalk::Pool.new(['localhost:11300'])
      puts "Now accepting feeds to parse..."
      begin
        loop do
          job = pool.reserve
          case job.body
          when /^START (.+)$/
            puts "starting #{$1} feeds"
          when /^END (.+)$/
            puts "finished #{$1} feeds"
          else
            url = job.body
            puts "parsing #{url}"
            Response.all(:conditions => {'url' => url}).each do |response|
              begin
                parsed = Feedzirra::Feed.parse(response.data)
                feed_record = Feed.create({:url => url, :status => 'to-process'})
                entries = parsed.entries
                puts "found #{entries.length} entries in #{url}"
                entries.each do |entry|
                  begin
                    feed_record.contents.create({
                      :title => entry.title,
                      :url => entry.url,
                      :author => entry.author,
                      :summary => entry.summary,
                      :content => entry.content,
                      :published => entry.published,
                      :categories => entry.categories,
                    })
                  rescue => e
                    puts "error creating entry #{entry.url}: #{e.message}"
                  end
                end
              rescue => e
                puts "error parsing feed #{url}: #{e.message}"
              end
            end
          end
          # Every job is removed from the queue, control message or not.
          job.delete
        end
      rescue Interrupt
        puts "Exiting parser"
      end
    end

    protected

    # Connects MongoMapper to MongoDB; database defaults to
    # 'feed_processor'.
    def setup_mongo(options={})
      options ||= {}
      options[:database] ||= 'feed_processor'
      MongoMapper.connection = Mongo::Connection.new(nil, nil)
      MongoMapper.database = options[:database]
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# Explicitly require the stdlib the decoder depends on: the original
# relied on zlib/stringio having been loaded elsewhere in the process.
require 'zlib'
require 'stringio'

module FeedProcessor
  class Util
    # shamelessly stolen from feedzirra
    #
    # Decompresses an HTTP response body according to its
    # Content-Encoding header. res must respond to #[] (header lookup)
    # and #body.
    #
    # Returns the decoded body string. A body that claims gzip but
    # fails to inflate is returned as-is.
    def self.decode_content(res)
      case res['content-encoding']
      when 'gzip'
        begin
          gz = Zlib::GzipReader.new(StringIO.new(res.body))
          xml = gz.read
          gz.close
        rescue Zlib::GzipFile::Error
          # Maybe this is not gzipped?
          xml = res.body
        end
      when 'deflate'
        xml = Zlib::Inflate.inflate(res.body)
      else
        xml = res.body
      end

      xml
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: feed-processor
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Anthony Eden
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-02-22 00:00:00 -10:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: Feed Processor is a multi-stage feed processor built with JRuby, MRI, beanstalk and MongoDB.
|
17
|
+
email: anthonyeden@gmail.com
|
18
|
+
executables:
|
19
|
+
- fetch
|
20
|
+
- parse
|
21
|
+
extensions: []
|
22
|
+
|
23
|
+
extra_rdoc_files:
|
24
|
+
- README.textile
|
25
|
+
files:
|
26
|
+
- .gitignore
|
27
|
+
- README.textile
|
28
|
+
- Rakefile
|
29
|
+
- VERSION
|
30
|
+
- bin/fetch
|
31
|
+
- bin/parse
|
32
|
+
- feed-processor.gemspec
|
33
|
+
- lib/feed_processor.rb
|
34
|
+
- lib/feed_processor/content.rb
|
35
|
+
- lib/feed_processor/feed.rb
|
36
|
+
- lib/feed_processor/fetcher.rb
|
37
|
+
- lib/feed_processor/file_based_request_generator.rb
|
38
|
+
- lib/feed_processor/parser.rb
|
39
|
+
- lib/feed_processor/response.rb
|
40
|
+
- lib/feed_processor/util.rb
|
41
|
+
has_rdoc: true
|
42
|
+
homepage: http://github.com/aeden/feed-processor
|
43
|
+
licenses: []
|
44
|
+
|
45
|
+
post_install_message:
|
46
|
+
rdoc_options:
|
47
|
+
- --charset=UTF-8
|
48
|
+
require_paths:
|
49
|
+
- lib
|
50
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: "0"
|
55
|
+
version:
|
56
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: "0"
|
61
|
+
version:
|
62
|
+
requirements: []
|
63
|
+
|
64
|
+
rubyforge_project:
|
65
|
+
rubygems_version: 1.3.5
|
66
|
+
signing_key:
|
67
|
+
specification_version: 3
|
68
|
+
summary: A multi-stage feed processor.
|
69
|
+
test_files: []
|
70
|
+
|