RubyGems - spiderz - Versions diffs - 0.1.0 - Mend

spiderz 0.1.0

Files changed (8) hide show

data/History.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ === 0.1.0 / 2008-11-30
2	+

data/Manifest.txt ADDED Viewed

@@ -0,0 +1,7 @@
+History.txt
+Manifest.txt
+README.txt
+Rakefile
+bin/spiderz
+lib/spiderz.rb
+test/test_spiderz.rb

data/README.txt ADDED Viewed

@@ -0,0 +1,60 @@
+= spiderz
+http://parkerfox.co.uk/labs/spiderz
+== DESCRIPTION:
+Scarily easy spidering
+== FEATURES/PROBLEMS:
+* Very simple spidering using Hpricot
+== SYNOPSIS:
+Create a site map.
+# create a spider for the site (the root URL must include http://)
+spider = Spiderz.new "http://mysite.com"
+# set up a custom success handler; it is called with the url and the parsed document of every page that is read
+spider.success do |url, doc|
+  title = (doc / "title").text.strip
+  puts "<a href='#{url}' >#{title}</a>"
+end
+# set it going, starting from the site's root page
+spider.crawl "/"
+== REQUIREMENTS:
+* Hpricot
+== INSTALL:
+sudo gem install spiderz
+== LICENSE:
+(The MIT License)
+Copyright (c) 2008 Jonah Fox
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+'Software'), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/Rakefile ADDED Viewed

@@ -0,0 +1,14 @@
+# -*- ruby -*-
+require 'rubygems'
+require 'hoe'
+require './lib/spiderz.rb'
+Hoe.new('spiderz', Spiderz::VERSION) do |p|
+  # p.rubyforge_name = 'spiderzx' # if different than lowercase project name
+  p.developer('Jonah Fox', 'jonah@parkefox.co.uk')
+  p.remote_rdoc_dir = '' # Release to root
+end
+# vim: syntax=Ruby

data/bin/spiderz ADDED Viewed

File without changes

data/lib/spiderz.rb ADDED Viewed

@@ -0,0 +1,98 @@
+require 'rubygems'
+require 'hpricot'
+require 'open-uri'
+class Spiderz
+  VERSION = "0.1.0"
+  #root should be like http://www.google.com (i.e. with http://)
+  def initialize(root)
+    @crawled = {}
+    @root = root
+    @success = Proc.new { |url, doc| puts "Successfully read url: #{url}" }
+    @failure = Proc.new { |url| puts "failure to read/parse url: #{url}" }
+    @started = Proc.new { |url| puts "Started crawling from url: #{url}" }
+    @completed = Proc.new { |url| puts "Crawling complete" }
+    @skip = Proc.new do |href|
+      !href || (external?(href) || mail?(href) || bookmark?(href))
+    end
+  end
+  def crawl(url)
+    @started.call(url)
+    @to_crawl = [url]
+    while(@to_crawl.length > 0)
+      @to_crawl += page_links(@to_crawl.shift)
+    end
+    @completed.call(url)
+  end
+  def external? href
+    href.match("[a-z]+://") && !href.match(@root)
+  end
+  def bookmark? href
+    href.match(/^#/)
+  end
+  def mail? href
+    href.match("mailto")
+  end
+  def started &action
+    @started = action
+  end
+  def completed &action
+    @completed = action
+  end
+  def failure &action
+    @failure = action
+  end
+  def success &action
+    @success = action
+  end
+  def skip &action
+    @skip = action
+  end
+  def page_links url
+    #puts url
+    return [] if @crawled[url]
+    @crawled[url] = true
+    begin
+      doc = Hpricot(open(@root+url))
+    rescue
+      @failure.call(url)
+      return []
+    end
+    @success.call(url, doc)
+    links = doc/"a" #find links
+    urls = links.map do |a|
+      a.attributes["href"]
+    end
+    urls.delete_if do |url|
+      @crawled[url] || @skip.call(url)
+    end
+    urls
+  end
+end

data/test/test_spiderz.rb ADDED Viewed

File without changes

metadata ADDED Viewed

@@ -0,0 +1,72 @@
+--- !ruby/object:Gem::Specification
+name: spiderz
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Jonah Fox
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2008-11-30 00:00:00 +00:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: hoe
+  type: :development
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.8.2
+    version:
+description: Scarily easy spidering
+email:
+- jonah@parkefox.co.uk
+executables:
+- spiderz
+extensions: []
+extra_rdoc_files:
+- History.txt
+- Manifest.txt
+- README.txt
+files:
+- History.txt
+- Manifest.txt
+- README.txt
+- Rakefile
+- bin/spiderz
+- lib/spiderz.rb
+- test/test_spiderz.rb
+has_rdoc: true
+homepage: http://parkerfox.co.uk/labs/spiderz
+post_install_message:
+rdoc_options:
+- --main
+- README.txt
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+rubyforge_project: spiderz
+rubygems_version: 1.3.1
+signing_key:
+specification_version: 2
+summary: Scarily easy spidering
+test_files:
+- test/test_spiderz.rb