fizx-rwget 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
+ README.rdoc
+ lib/**/*.rb
+ bin/*
+ features/**/*.feature
+ LICENSE
@@ -0,0 +1,5 @@
+ *.sw?
+ .DS_Store
+ coverage
+ rdoc
+ pkg
@@ -0,0 +1,57 @@
+ # RWGet
+
+ RWGet is a web crawler that emulates a subset of the GNU Wget interface, but with more flexibility for my needs.
+
+ ## Features
+
+ 1. Regular expression accept/reject lists
+ 2. Pluggable interfaces for robots-txt, url-fetcher, url-queue, url-dupe-detector, and page-storage. The defaults store locally and fetch using libcurl, but you could easily swap in database storage, a distributed queue, etc.
+
+ ## Help page
+
+     Usage: /usr/bin/rwget [options] SEED_URL [SEED_URL2 ...]
+         -w, --wait=SECONDS               wait SECONDS between retrievals.
+         -P, --directory-prefix=PREFIX    save files to PREFIX/...
+         -U, --user-agent=AGENT           identify as AGENT instead of RWget/VERSION.
+         -A, --accept-pattern=RUBY_REGEX  URLs must match RUBY_REGEX to be saved to the queue.
+             --time-limit=AMOUNT          Crawler will stop after this AMOUNT of time has passed.
+         -R, --reject-pattern=RUBY_REGEX  URLs must NOT match RUBY_REGEX to be saved to the queue.
+             --require=RUBY_SCRIPT        Will execute 'require RUBY_SCRIPT'
+             --limit-rate=RATE            limit download rate to RATE.
+             --http-proxy=URL             Proxies via URL
+             --proxy-user=USER            Sets proxy user to USER
+             --proxy-password=PASSWORD    Sets proxy password to PASSWORD
+             --fetch-class=RUBY_CLASS     Must implement fetch(uri, user_agent_string) #=> [final_redirected_url, file_object]
+             --store-class=RUBY_CLASS     Must implement put(key_string, temp_file)
+             --dupes-class=RUBY_CLASS     Must implement dupe?(uri)
+             --queue-class=RUBY_CLASS     Must implement put(key_string, depth_int) and get() #=> [key_string, depth_int]
+             --links-class=RUBY_CLASS     Must implement urls(base_uri, temp_file) #=> [uri, ...]
+         -S, --sitemap=URL                URL of a sitemap to crawl (will ignore inter-page links)
+
+         -Q, --quota=NUMBER               set retrieval quota to NUMBER.
+             --max-redirect=NUM           maximum redirections allowed per page.
+         -H, --span-hosts                 go to foreign hosts when recursive
+             --connect-timeout=SECS       set the connect timeout to SECS.
+         -T, --timeout=SECS               set all timeout values to SECS.
+         -l, --level=NUMBER               maximum recursion depth (inf or 0 for infinite).
+             --[no-]timestampize          Prepend the timestamp of when the crawl started to the directory structure.
+             --incremental-from=PREVIOUS  Build upon the indexing already saved in PREVIOUS.
+             --protocol-directories       use protocol name in directories.
+             --no-host-directories        don't create host directories.
+         -v, --[no-]verbose               Run verbosely
+         -h, --help                       Show this message
+
+ ## Ruby API
+
+     require "rubygems"
+     require "rwget"
+
+     # options is the same as the command-line long options, but converted into
+     # idiomatic ruby. See the RDoc for details.
+     # e.g.
+     # sh$ rwget -T 5 -A ".*foo.*" http://google.com
+     # becomes:
+     # irb$ RWGet::Controller.new({:seeds => ["http://google.com"],
+     #                             :timeout => 5, :accept_patterns => [/.*foo.*/]}).start
+
+     RWGet::Controller.new(options).start
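The `--store-class` hook above takes any class that implements `put(key_string, temp_file)`; like the other pluggable pieces, it is constructed with the full options hash. A minimal sketch of a replacement store (the name `FlatFileStore` and its layout are hypothetical, not part of the gem):

    require "fileutils"

    # Hypothetical page store: copies each fetched Tempfile into a
    # directory tree rooted at options[:prefix] (default "crawl").
    class FlatFileStore
      def initialize(options = {})
        @root = options[:prefix] || "crawl"
      end

      # key_string is the path built by RWGet::Controller#key_for;
      # temp_file holds the downloaded body.
      def put(key_string, temp_file)
        path = File.join(@root, key_string)
        FileUtils.mkdir_p(File.dirname(path))
        FileUtils.cp(temp_file.path, path)
      end
    end

Loaded with `--require` and selected with `--store-class=FlatFileStore`, this replaces the default local store; defining a `close` method is optional, since the controller only calls it when present.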
@@ -0,0 +1,59 @@
+ require 'rubygems'
+ require 'rake'
+
+ begin
+   require 'jeweler'
+   Jeweler::Tasks.new do |gem|
+     gem.name = "rwget"
+     gem.summary = %Q{Ruby port of wget, emphasis on recursive/crawler}
+     gem.email = "kyle@kylemaxwell.com"
+     gem.homepage = "http://github.com/fizx/rwget"
+     gem.authors = ["Kyle Maxwell"]
+     gem.add_dependency("curb", ["> 0.0.0"])
+     gem.add_dependency("hpricot", ["> 0.0.0", "< 0.7"])
+     gem.add_dependency("fizx-robots", [">= 0.3.1"])
+     gem.add_dependency("bloomfilter", ["> 0.0.0"])
+     gem.add_dependency("libxml-ruby", ["> 0.9"])
+   end
+ rescue LoadError
+   puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
+ end
+
+ require 'rake/testtask'
+ Rake::TestTask.new(:test) do |test|
+   test.libs << 'lib' << 'test'
+   test.pattern = 'test/**/*_test.rb'
+   test.verbose = true
+ end
+
+ begin
+   require 'rcov/rcovtask'
+   Rcov::RcovTask.new do |test|
+     test.libs << 'test'
+     test.pattern = 'test/**/*_test.rb'
+     test.verbose = true
+   end
+ rescue LoadError
+   task :rcov do
+     abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
+   end
+ end
+
+
+ task :default => :test
+
+ require 'rake/rdoctask'
+ Rake::RDocTask.new do |rdoc|
+   if File.exist?('VERSION.yml')
+     config = YAML.load(File.read('VERSION.yml'))
+     version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
+   else
+     version = ""
+   end
+
+   rdoc.rdoc_dir = 'rdoc'
+   rdoc.title = "rwget #{version}"
+   rdoc.rdoc_files.include('README*')
+   rdoc.rdoc_files.include('lib/**/*.rb')
+ end
+
data/VERSION ADDED
@@ -0,0 +1 @@
+ 0.5.0
@@ -0,0 +1,15 @@
+ #!/usr/bin/env ruby
+ require File.dirname(__FILE__) + "/../lib/rwget"
+
+ parser = RWGetOptionParser.new
+ parser.parse!
+
+ if parser.options[:seeds].empty?
+   puts parser.usage
+   puts " -h for options listing"
+   exit(1)
+ end
+
+ controller = RWGet::Controller.new(parser.options)
+ controller.start
+ controller.close
@@ -0,0 +1,5 @@
+ module RWGet
+ end
+ Dir[File.dirname(__FILE__) + "/rwget/*.rb"].each do |f|
+   require f.gsub(/\.rb$/, '')
+ end
@@ -0,0 +1,119 @@
+ require "set"
+ class RWGet::Controller
+   attr_reader :options
+
+   def self.resolve_class(string)
+     string.split("::").inject(Kernel) do |const, string|
+       const.const_get(string)
+     end
+   end
+
+   def initialize(options)
+     @options = options
+     @options[:user_agent] ||= "Ruby/Wget"
+
+     @options[:accept_patterns] ||= []
+     @options[:reject_patterns] ||= []
+
+     %w[quota depth wait limit_rate time_limit].each do |key|
+       key = key.to_sym
+       @options[key] = @options[key].to_i
+     end
+
+     @queue = (options[:queue_class] ? self.class.resolve_class(options[:queue_class]) : RWGet::Queue).new(options)
+     @fetch = (options[:fetch_class] ? self.class.resolve_class(options[:fetch_class]) : RWGet::Fetch).new(options)
+     @store = (options[:store_class] ? self.class.resolve_class(options[:store_class]) : RWGet::Store).new(options)
+     @links = (options[:links_class] ? self.class.resolve_class(options[:links_class]) : RWGet::Links).new(options)
+     @dupes = (options[:dupes_class] ? self.class.resolve_class(options[:dupes_class]) : RWGet::Dupes).new(options)
+   end
+
+   def start
+     @start_time = Time.now.to_i.to_s
+     @start = Time.now
+     @original_hosts = Set.new
+     options[:seeds].each do |seed|
+       @queue.put(seed, 0)
+       @original_hosts << URI.parse(seed).host
+     end
+
+     downloaded = 0
+     while (options[:quota] == 0 || downloaded < options[:quota]) &&
+           (options[:time_limit] == 0 || Time.now - @start < options[:time_limit])
+
+       url, depth = @queue.get
+
+       unless url
+         puts "no more urls"
+         return
+       end
+
+       if options[:depth] > 0 && depth > options[:depth]
+         next
+       end
+
+       uri = URI.parse(url)
+
+       while options[:limit_rate] > 0 && downloaded / (Time.now - @start) > options[:limit_rate]
+         puts "sleeping until under rate limit"
+         sleep 1
+       end
+       puts "download rate: #{downloaded / (Time.now - @start)}bps"
+
+       puts "downloading #{uri}"
+       effective_url, tmpfile = @fetch.fetch(uri, options[:user_agent])
+
+       if tmpfile
+         downloaded += File.size(tmpfile.path)
+         puts "parsing links"
+         @links.urls(effective_url, tmpfile).each do |link|
+           legal = legal?(link)
+           dupe = @dupes.dupe?(link)
+           puts "dupe: #{link}" if dupe
+           if legal && !dupe
+             puts "adding link: #{link}"
+             @queue.put(link, depth + 1)
+           end
+         end
+         key = key_for(uri)
+         puts "storing at #{key}"
+         @store.put(key, tmpfile)
+         sleep options[:wait]
+       else
+         puts "unable to download"
+       end
+     end
+     puts "hit time/quota"
+   end
+
+   def legal?(link)
+     unless options[:span_hosts] || @original_hosts.include?(link.host)
+       puts "can't span hosts: #{link}"
+       return false
+     end
+     link = link.to_s
+     legal = options[:accept_patterns].empty?
+     puts "accepted by default: #{link}" if legal
+     legal ||= options[:accept_patterns].any?{|p| link =~ p}
+     puts "not in accept patterns: #{link}" if !legal
+     rejected = options[:reject_patterns].any?{|p| link =~ p}
+     puts "in reject patterns: #{link}" if rejected
+     legal && !rejected
+   end
+
+   def key_for(uri)
+     arr = []
+     arr << options[:prefix] if options[:prefix]
+     arr << @start_time if options[:timestampize]
+     arr << uri.scheme if options[:protocol_directories]
+     arr << uri.host unless options[:no_host_directories]
+     paths = uri.path.split("/")
+     paths.shift if paths.first.to_s.empty?
+     File.join(arr + paths)
+   end
+
+   def close
+     [@queue, @fetch, @store, @links, @dupes].each do |obj|
+       obj.close if obj.respond_to?(:close)
+     end
+   end
+ end
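`resolve_class` walks a `::`-separated constant name one segment at a time, which is how the `--*-class` flags can name namespaced classes; each resolved class is then instantiated with the full options hash. For example, assuming the gem is loaded:

    RWGet::Controller.resolve_class("RWGet::Dupes")  #=> RWGet::Dupes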
@@ -0,0 +1,19 @@
+ require "rubygems"
+ require "tempfile"
+ require "bloomfilter"
+
+ class RWGet::Dupes
+   SIZE = 1_000_000
+
+   def initialize(options = {})
+     @tmp = Tempfile.new("bloom")
+     @bloom = ExternalBloomFilter.create(@tmp.path, SIZE)
+   end
+
+   def dupe?(uri)
+     key = uri.to_s
+     return true if @bloom.include?(key)
+     @bloom.add(key)
+     return false
+   end
+ end
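`dupe?` both tests and records: the first call for a URL returns false and adds it to the filter, and every later call returns true. Because the backing store is a Bloom filter, answers can be false positives, so a small fraction of genuinely new URLs may be reported as dupes (and skipped) as the filter fills, though it never produces false negatives. A round trip, assuming the gem is loaded:

    dupes = RWGet::Dupes.new
    dupes.dupe?("http://example.com/")  #=> false (now recorded)
    dupes.dupe?("http://example.com/")  #=> true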
@@ -0,0 +1,44 @@
+ require "open-uri"
+ require "tempfile"
+ require "rubygems"
+ require "robots"
+ require "curl"
+ class RWGet::Fetch
+   DEFAULT_TIMEOUT = 30
+   DEFAULT_REDIRECTS = 30
+
+   def initialize(options = {})
+     @robots = {}
+     @curl = Curl::Easy.new
+     @curl.connect_timeout = options[:connect_timeout] || DEFAULT_TIMEOUT
+     @curl.timeout = options[:timeout] || DEFAULT_TIMEOUT
+     @curl.max_redirects = options[:max_redirect] || DEFAULT_REDIRECTS
+     @curl.follow_location = true
+     if options[:http_proxy]
+       @curl.proxy_url = options[:http_proxy]
+       if options[:proxy_user]
+         @curl.proxypwd = "#{options[:proxy_user]}:#{options[:proxy_password]}"
+       end
+     end
+     puts "timeout: #{@curl.timeout}"
+   end
+
+   def fetch(uri, user_agent)
+     @robots[user_agent] ||= Robots.new(user_agent)
+     unless @robots[user_agent].allowed?(uri)
+       puts "disallowed by robots.txt"
+       return nil
+     end
+
+     @curl.headers["User-Agent"] = user_agent
+     @curl.url = uri.to_s
+     @curl.perform
+     tmp = nil
+     Tempfile.open("curl") {|file| file.print(@curl.body_str); tmp = file }
+     tmp.open
+     [@curl.last_effective_url, tmp]
+   rescue Exception => e
+     STDERR.puts "#{uri} not retrieved: #{e.message}"
+     nil
+   end
+ end
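For comparison, here is a minimal sketch of an alternative fetcher honoring the documented `fetch(uri, user_agent_string) #=> [final_redirected_url, file_object]` contract, built on open-uri instead of libcurl. The class name `OpenURIFetch` is hypothetical, it assumes a Ruby where open-uri provides `URI#open` and `base_uri`, and unlike the default it does not consult robots.txt:

    require "open-uri"
    require "tempfile"

    class OpenURIFetch
      def initialize(options = {})
      end

      def fetch(uri, user_agent)
        io = URI.parse(uri.to_s).open("User-Agent" => user_agent)
        tmp = Tempfile.new("openuri")
        tmp.write(io.read)
        tmp.rewind
        # base_uri reflects any redirects open-uri followed
        [io.base_uri.to_s, tmp]
      rescue StandardError => e
        warn "#{uri} not retrieved: #{e.message}"
        nil
      end
    end

Selected via `--fetch-class=OpenURIFetch` after loading it with `--require`.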
@@ -0,0 +1,33 @@
+ require "rubygems"
+ require "hpricot"
+ class RWGet::Links
+   def initialize(options = {})
+   end
+
+   def urls(base, tmpfile)
+     @urls = []
+     base = base.to_s
+     string = File.read(tmpfile.path)
+     xml = string =~ /<\?xml/
+     doc = xml ? Hpricot.XML(string) : Hpricot(string)
+
+     (doc / "//item/link").each do |l|
+       add base, l.inner_text
+     end
+     (doc / "a").each do |a|
+       add base, a.attributes["href"]
+     end
+     @urls
+   rescue Exception => e
+     STDERR.puts "Couldn't parse #{base} for links: #{e.message}"
+     []
+   end
+
+   def add(base, href)
+     begin
+       @urls << URI.join(base, href.strip) if href
+     rescue Exception => e
+       STDERR.puts "url error parsing URI.join(#{base.inspect}, #{href.inspect}): #{e.message}"
+     end
+   end
+ end
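`urls` extracts both RSS `<item><link>` elements and HTML anchors, resolving each href against the page's effective URL with `URI.join`, so relative links come back absolute. For example, assuming the gem is loaded:

    require "tempfile"

    tmp = Tempfile.new("page")
    tmp.write('<a href="/about">About</a> <a href="news.html">News</a>')
    tmp.flush

    RWGet::Links.new.urls("http://example.com/blog/", tmp).map { |u| u.to_s }
    #=> ["http://example.com/about", "http://example.com/blog/news.html"]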
@@ -0,0 +1,32 @@
+ require "tempfile"
+ class RWGet::Queue
+   def initialize(options = {})
+     @writer = Tempfile.new("rwget-queue")
+     @reader = File.open(@writer.path, "r")
+     @dirty = false
+   end
+
+   def put(key, depth)
+     @writer.puts "#{key}\t#{depth}"
+     @dirty = true
+   end
+
+   def get(retrying = false)
+     sleep 0.1 if retrying
+     if @dirty
+       @writer.flush
+       @dirty = false
+     end
+     line = @reader.gets
+     unless line
+       return retrying ? nil : get(:retry)
+     end
+     key, depth = line.split("\t")
+     return [key, depth.to_i]
+   end
+
+   def close
+     @writer.close
+     @reader.close
+   end
+ end
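The default queue is a Tempfile used as an append-only log: `put` writes tab-separated key/depth lines through one handle, and `get` tails the same file through a second read handle, flushing pending writes and retrying once after 0.1 seconds before reporting the queue empty. A round trip, assuming the gem is loaded:

    q = RWGet::Queue.new
    q.put("http://example.com/", 0)
    q.get   #=> ["http://example.com/", 0]
    q.get   #=> nil (after one 0.1s retry with nothing new written)
    q.close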
@@ -0,0 +1,154 @@
+ require 'optparse'
+
+ class RWGetOptionParser < OptionParser
+   attr_accessor :options
+
+   def usage
+     "Usage: #{$0} [options] SEED_URL [SEED_URL2 ...]"
+   end
+
+   def parse!
+     super
+     (options[:seeds] ||= []).concat(ARGV)
+   end
+
+   def initialize
+     self.options = {}
+     super do |opts|
+
+       yield opts if block_given?
+
+       opts.banner = usage
+
+       opts.on("-w", "--wait=SECONDS", "wait SECONDS between retrievals.") do |w|
+         options[:wait] = w.to_i
+       end
+
+       opts.on("-P", "--directory-prefix=PREFIX", "save files to PREFIX/...") do |p|
+         options[:prefix] = p
+       end
+
+       opts.on("-U", "--user-agent=AGENT", "identify as AGENT instead of RWget/VERSION.") do |u|
+         options[:user_agent] = u
+       end
+
+       opts.on("-Ap", "--accept-pattern=RUBY_REGEX", "URLs must match RUBY_REGEX to be saved to the queue.") do |r|
+         options[:accept_patterns] ||= []
+         options[:accept_patterns] << Regexp.new(r)
+       end
+
+       opts.on("--time-limit=AMOUNT", "Crawler will stop after this AMOUNT of time has passed.") do |t|
+         options[:time_limit] = t.to_i
+         options[:time_limit] *= 60 if t =~ /m/i
+         options[:time_limit] *= 60 * 60 if t =~ /h/i
+         options[:time_limit] *= 60 * 60 * 24 if t =~ /d/i
+         options[:time_limit] *= 60 * 60 * 24 * 7 if t =~ /w/i
+       end
+
+       opts.on("-Rp", "--reject-pattern=RUBY_REGEX", "URLs must NOT match RUBY_REGEX to be saved to the queue.") do |r|
+         options[:reject_patterns] ||= []
+         options[:reject_patterns] << Regexp.new(r)
+       end
+
+       opts.on("--require=RUBY_SCRIPT", "Will execute 'require RUBY_SCRIPT'") do |s|
+         require s
+       end
+
+       opts.on("--limit-rate=RATE", "limit download rate to RATE.") do |r|
+         rate = r.to_i
+         rate *= 1000 if r =~ /k/i
+         rate *= 1000000 if r =~ /m/i
+         options[:limit_rate] = rate
+         puts "rate is #{rate}"
+       end
+
+       opts.on("--http-proxy=URL", "Proxies via URL") do |u|
+         options[:http_proxy] = u
+       end
+
+       opts.on("--proxy-user=USER", "Sets proxy user to USER") do |u|
+         options[:proxy_user] = u
+       end
+
+       opts.on("--proxy-password=PASSWORD", "Sets proxy password to PASSWORD") do |p|
+         options[:proxy_password] = p
+       end
+
+       opts.on("--fetch-class=RUBY_CLASS", "Must implement fetch(uri, user_agent_string) #=> [final_redirected_url, file_object]") do |c|
+         options[:fetch_class] = c
+       end
+
+       opts.on("--store-class=RUBY_CLASS", "Must implement put(key_string, temp_file)") do |c|
+         options[:store_class] = c
+       end
+
+       opts.on("--dupes-class=RUBY_CLASS", "Must implement dupe?(uri)") do |c|
+         options[:dupes_class] = c
+       end
+
+       opts.on("--queue-class=RUBY_CLASS", "Must implement put(key_string, depth_int) and get() #=> [key_string, depth_int]") do |c|
+         options[:queue_class] = c
+       end
+
+       opts.on("--links-class=RUBY_CLASS", "Must implement urls(base_uri, temp_file) #=> [uri, ...]") do |c|
+         options[:links_class] = c
+       end
+
+       opts.on("-S", "--sitemap=URL", "URL of a sitemap to crawl (will ignore inter-page links)") do |url|
+         (options[:seeds] ||= []) << url
+         options[:links_class] = "RWGet::SitemapLinks"
+       end
+
+       opts.on("-Q", "--quota=NUMBER", "set retrieval quota to NUMBER.") do |q|
+         options[:quota] = q.to_i
+         options[:quota] *= 1000 if q =~ /k/i
+         options[:quota] *= 1000000 if q =~ /m/i
+       end
+
+       opts.on("--max-redirect=NUM", "maximum redirections allowed per page.") do |m|
+         options[:max_redirect] = m.to_i
+       end
+
+       opts.on("-H", "--span-hosts", "go to foreign hosts when recursive") do |s|
+         options[:span_hosts] = s
+       end
+
+       opts.on("--connect-timeout=SECS", "set the connect timeout to SECS.") do |t|
+         options[:connect_timeout] = t.to_i
+       end
+
+       opts.on("-T", "--timeout=SECS", "set all timeout values to SECS.") do |t|
+         options[:timeout] = t.to_i
+       end
+
+       opts.on("-l", "--level=NUMBER", "maximum recursion depth (inf or 0 for infinite).") do |l|
+         options[:depth] = l.to_i
+       end
+
+       opts.on("--[no-]timestampize", "Prepend the timestamp of when the crawl started to the directory structure.") do |t|
+         options[:timestampize] = t
+       end
+
+       opts.on("--incremental-from=PREVIOUS", "Build upon the indexing already saved in PREVIOUS.") do |r|
+         options[:incremental_from] = r
+       end
+
+       opts.on("--protocol-directories", "use protocol name in directories.") do |p|
+         options[:protocol_directories] = p
+       end
+
+       opts.on("--no-host-directories", "don't create host directories.") do |h|
+         options[:no_host_directories] = h
+       end
+
+       opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
+         options[:verbose] = v
+       end
+
+       opts.on_tail("-h", "--help", "Show this message") do
+         puts opts
+         exit
+       end
+     end
+   end
+ end
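`--time-limit`, `--limit-rate`, and `--quota` accept unit suffixes by regex-matching the raw argument and multiplying the leading integer, so the arithmetic works out as below (a sketch using the parser directly; note that `parse!` consumes recognized flags from ARGV and treats the remainder as seeds):

    parser = RWGetOptionParser.new
    ARGV.replace(%w[--time-limit=2h --limit-rate=500k http://example.com/])
    parser.parse!
    parser.options[:time_limit]  #=> 7200 (2 * 60 * 60)
    parser.options[:limit_rate]  #=> 500000 (500 * 1000)
    parser.options[:seeds]       #=> ["http://example.com/"]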