spider 0.4.1 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGES +5 -0
- data/README +28 -2
- data/lib/spider.rb +1 -1
- data/lib/spider/included_in_memcached.rb +2 -1
- data/lib/spider/next_urls_in_sqs.rb +66 -0
- data/lib/spider/spider_instance.rb +38 -7
- data/lib/test.rb +27 -0
- data/spider.gemspec +4 -3
- metadata +61 -52
data/CHANGES
CHANGED
data/README
CHANGED
@@ -75,6 +75,31 @@ scraping, collecting, and looping so that you can just handle the data.
     s.check_already_seen_with ExpireLinks.new
   end
 
+=== Store nodes to visit with Amazon SQS
+
+  require 'spider'
+  require 'spider/next_urls_in_sqs'
+  Spider.start_at('http://mike-burns.com') do |s|
+    s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY)
+  end
+
+==== Store nodes to visit with a custom object
+
+  require 'spider'
+  class MyArray < Array
+    def pop
+      super
+    end
+
+    def push(a_msg)
+      super(a_msg)
+    end
+  end
+
+  Spider.start_at('http://mike-burns.com') do |s|
+    s.store_next_urls_with MyArray.new
+  end
+
 === Create a URL graph
 
   require 'spider'
@@ -106,9 +131,10 @@ scraping, collecting, and looping so that you can just handle the data.
 
 == Author
 
-
+John Nagro john.nagro@gmail.com
+Mike Burns http://mike-burns.com mike@mike-burns.com (original author)
 
-Help from Matt Horan,
+Help from Matt Horan, and Henri Cook.
 
 With `robot_rules' from James Edward Gray II via
 http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589
data/lib/spider.rb
CHANGED
data/lib/spider/included_in_memcached.rb
CHANGED
@@ -1,5 +1,6 @@
 # Use memcached to track cycles.
-
+#
+# Copyright 2007 Mike Burns
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
 #     * Redistributions of source code must retain the above copyright
data/lib/spider/next_urls_in_sqs.rb
ADDED
@@ -0,0 +1,66 @@
+# Use AmazonSQS to track nodes to visit.
+#
+# Copyright 2008 John Nagro
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name Mike Burns nor the
+#       names of his contributors may be used to endorse or promote products
+#       derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+require 'rubygems'
+require 'right_aws'
+require 'yaml'
+
+# A specialized class using AmazonSQS to track nodes to walk. It supports
+# two operations: push and pop. Together these can be used to
+# add items to the queue, then pull items off the queue.
+#
+# This is useful if you want multiple Spider processes crawling the same
+# data set.
+#
+# To use it with Spider use the store_next_urls_with method:
+#
+#  Spider.start_at('http://example.com/') do |s|
+#    s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY, queue_name)
+#  end
+class NextUrlsInSQS
+  # Construct a new NextUrlsInSQS instance. All arguments here are
+  # passed to RightAWS::SqsGen2 (part of the right_aws gem) or used
+  # to set the AmazonSQS queue name (optional).
+  def initialize(aws_access_key, aws_secret_access_key, queue_name = 'ruby-spider')
+    @sqs = RightAws::SqsGen2.new(aws_access_key, aws_secret_access_key)
+    @queue = @sqs.queue(queue_name)
+  end
+
+  # Pull an item off the queue, loop until data is found. Data is
+  # encoded with YAML.
+  def pop
+    while true
+      message = @queue.pop
+      return YAML::load(message.to_s) unless message.nil?
+      sleep 5
+    end
+  end
+
+  # Put data on the queue. Data is encoded with YAML.
+  def push(a_msg)
+    encoded_message = YAML::dump(a_msg)
+    @queue.push(a_msg)
+  end
+end
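Two details of NextUrlsInSQS above matter when writing a compatible store: push computes encoded_message but then enqueues the raw a_msg, and the reworked start! loop further down also calls empty? on the store, not only push and pop. As an illustration of that contract, here is a minimal in-memory stand-in (NextUrlsInMemory is a hypothetical name, not part of the gem) that keeps the same interface and the same YAML round-trip:

  require 'yaml'

  # Hypothetical stand-in for NextUrlsInSQS: same push/pop/empty?
  # surface, same YAML round-trip, no SQS involved.
  class NextUrlsInMemory
    def initialize
      @messages = []
    end

    # Encode on the way in, mirroring NextUrlsInSQS#push.
    def push(a_msg)
      @messages.push(YAML::dump(a_msg))
    end

    # Decode on the way out, mirroring NextUrlsInSQS#pop (without the polling loop).
    def pop
      message = @messages.pop
      YAML::load(message.to_s) unless message.nil?
    end

    # The 0.4.2 crawl loop ends with `end while !@next_urls.empty?`,
    # so a custom store should answer empty? as well.
    def empty?
      @messages.empty?
    end
  end

  # Usage, analogous to the SQS example in the README above:
  #   Spider.start_at('http://example.com/') do |s|
  #     s.store_next_urls_with NextUrlsInMemory.new
  #   end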
data/lib/spider/spider_instance.rb
CHANGED
@@ -1,6 +1,6 @@
 # Specialized spidering rules.
 
-# Copyright 2007 Mike Burns
+# Copyright 2007-2008 Mike Burns & John Nagro
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
 #     * Redistributions of source code must retain the above copyright
@@ -51,7 +51,7 @@ class SpiderInstance
     @url_checks = []
     @cache = :memory
     @callbacks = {}
-    @next_urls = next_urls
+    @next_urls = [next_urls]
     @seen = seen
     @rules = rules || RobotRules.new('Ruby Spider 1.0')
     @robots_seen = robots_seen
@@ -96,6 +96,30 @@ class SpiderInstance
     end
   end
 
+  # The Web is a really, really, really big graph; as such, this list
+  # of nodes to visit grows really, really, really big.
+  #
+  # Change the object used to store nodes we have yet to walk. The default
+  # object is an instance of Array. Available with Spider is a wrapper of
+  # AmazonSQS.
+  #
+  # You can implement a custom class for this; any object passed to
+  # check_already_seen_with must understand just push and pop.
+  #
+  #  # default
+  #  store_next_urls_with Array.new
+  #
+  #  # AmazonSQS
+  #  require 'spider/next_urls_in_sqs'
+  #  store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY, queue_name)
+  def store_next_urls_with(a_store)
+    tmp_next_urls = @next_urls
+    @next_urls = a_store
+    tmp_next_urls.each do |a_url_hash|
+      @next_urls.push a_url_hash
+    end
+  end
+
   # Add a response handler. A response handler's trigger can be :every,
   # :success, :failure, or any HTTP status code. The handler itself can be
   # either a Proc or a block.
@@ -159,9 +183,11 @@ class SpiderInstance
     @headers = {}
   end
 
-  def start! #:nodoc:
-
+  def start! #:nodoc:
+    interrupted = false
+    trap("SIGINT") { interrupted = true }
     begin
+      next_urls = @next_urls.pop
       tmp_n_u = {}
      next_urls.each do |prior_url, urls|
        urls.map do |a_url|
@@ -172,13 +198,18 @@ class SpiderInstance
          @setup.call(a_url) unless @setup.nil?
          get_page(parsed_url) do |response|
            do_callbacks(a_url, response, prior_url)
-           tmp_n_u[a_url] = generate_next_urls(a_url, response)
+           #tmp_n_u[a_url] = generate_next_urls(a_url, response)
+           #@next_urls.push tmp_n_u
+           generate_next_urls(a_url, response).each do |a_next_url|
+             @next_urls.push a_url => a_next_url
+           end
+           #exit if interrupted
          end
          @teardown.call(a_url) unless @teardown.nil?
+         exit if interrupted
        end
      end
-
-    end while !next_urls.empty?
+    end while !@next_urls.empty?
   end
 
   def success_or_failure(code) #:nodoc:
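The interrupt handling added to start! above is a stock Ruby pattern: trap("SIGINT") only flips a flag, and the crawl loop checks it at a safe point between URLs rather than dying mid-request. A standalone sketch of the same idea (illustrative only, not code from the gem):

  interrupted = false
  trap("SIGINT") { interrupted = true }   # Ctrl-C just sets the flag

  queue = %w[one two three four five]
  begin
    item = queue.pop
    puts "working on #{item}"
    sleep 1                  # stand-in for fetching and parsing a page
    exit if interrupted      # bail out between items, as start! does per URL
  end while !queue.empty?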
data/lib/test.rb
ADDED
@@ -0,0 +1,27 @@
+require 'spider.rb'
+require 'spider/next_urls_in_sqs.rb'
+
+class MyArray < Array
+  def pop
+    a_msg = super
+    puts "pop: #{a_msg.inspect}"
+    return a_msg
+  end
+
+  def push(a_msg)
+    puts "push: #{a_msg.inspect}"
+    super(a_msg)
+  end
+end
+
+AWS_ACCESS_KEY = '0YA99M8Y09J2D4FEC602'
+AWS_SECRET_ACCESS_KEY = 'Sc9R9uiwbFYz7XhQqkPvSK3Bbq4tPYPVMWyDlF+a'
+
+#Spider.start_at("http://docs.huihoo.com/ruby/ruby-man-1.4/function.html") do |s|
+Spider.start_at("http://www.google.com") do |s|
+  #s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY)
+  s.store_next_urls_with MyArray.new
+  s.on(:every) do |a_url, resp, prior_url|
+    puts a_url
+  end
+end
data/spider.gemspec
CHANGED
@@ -1,11 +1,12 @@
 require 'rubygems'
 
 spec = Gem::Specification.new do |s|
-  s.author = '
-  s.email = '
+  s.author = 'John Nagro'
+  s.email = 'john.nagro@gmail.com'
   s.has_rdoc = true
   s.homepage = 'http://spider.rubyforge.org/'
   s.name = 'spider'
+  s.rubyforge_project = 'spider'
   s.summary = 'A Web spidering library'
   s.files = Dir['**/*'].delete_if { |f| f =~ /(cvs|gem|svn)$/i }
   s.require_path = 'lib'
@@ -13,5 +14,5 @@ spec = Gem::Specification.new do |s|
 A Web spidering library: handles robots.txt, scraping, finding more
 links, and doing it all over again.
 EOF
-  s.version = '0.4.
+  s.version = '0.4.2'
 end
metadata
CHANGED
@@ -1,78 +1,87 @@
 --- !ruby/object:Gem::Specification
-rubygems_version: 0.9.4
-specification_version: 1
 name: spider
 version: !ruby/object:Gem::Version
-  version: 0.4.
-date: 2007-11-10 00:00:00 -05:00
-summary: A Web spidering library
-require_paths:
-- lib
-email: mike@mike-burns.com
-homepage: http://spider.rubyforge.org/
-rubyforge_project:
-description: "A Web spidering library: handles robots.txt, scraping, finding more links, and doing it all over again."
-autorequire:
-default_executable:
-bindir: bin
-has_rdoc: true
-required_ruby_version: !ruby/object:Gem::Version::Requirement
-  requirements:
-  - - ">"
-    - !ruby/object:Gem::Version
-      version: 0.0.0
-  version:
+  version: 0.4.2
 platform: ruby
-signing_key:
-cert_chain:
-post_install_message:
 authors:
--
+- John Nagro
+autorequire:
+bindir: bin
+cert_chain: []
+
+date: 2008-07-06 00:00:00 -04:00
+default_executable:
+dependencies: []
+
+description: "A Web spidering library: handles robots.txt, scraping, finding more links, and doing it all over again."
+email: john.nagro@gmail.com
+executables: []
+
+extensions: []
+
+extra_rdoc_files: []
+
 files:
+- CHANGES
 - doc
-- doc/
+- doc/classes
+- doc/classes/IncludedInMemcached.html
+- doc/classes/Spider.html
+- doc/classes/SpiderInstance.html
+- doc/created.rid
 - doc/files
 - doc/files/lib
-- doc/files/lib/spider_rb.html
 - doc/files/lib/spider
-- doc/files/lib/spider/spider_instance_rb.html
 - doc/files/lib/spider/included_in_memcached_rb.html
+- doc/files/lib/spider/spider_instance_rb.html
+- doc/files/lib/spider_rb.html
 - doc/files/README.html
-- doc/classes
-- doc/classes/IncludedInMemcached.html
-- doc/classes/SpiderInstance.html
-- doc/classes/Spider.html
-- doc/fr_file_index.html
 - doc/fr_class_index.html
+- doc/fr_file_index.html
 - doc/fr_method_index.html
 - doc/index.html
-- doc/
+- doc/rdoc-style.css
+- lib
+- lib/spider
+- lib/spider/included_in_memcached.rb
+- lib/spider/next_urls_in_sqs.rb
+- lib/spider/robot_rules.rb
+- lib/spider/spider_instance.rb
+- lib/spider.rb
+- lib/test.rb
+- README
 - spec
+- spec/spec_helper.rb
 - spec/spider
 - spec/spider/included_in_memcached_spec.rb
 - spec/spider/spider_instance_spec.rb
 - spec/spider_spec.rb
-- spec/spec_helper.rb
-- README
 - spider.gemspec
-
-
-
-- lib/spider
-- lib/spider/included_in_memcached.rb
-- lib/spider/robot_rules.rb
-- lib/spider/spider_instance.rb
-test_files: []
-
+has_rdoc: true
+homepage: http://spider.rubyforge.org/
+post_install_message:
 rdoc_options: []
 
-
-
-
-
-
-
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
 requirements: []
 
-
+rubyforge_project: spider
+rubygems_version: 1.0.1
+signing_key:
+specification_version: 2
+summary: A Web spidering library
+test_files: []
 