krawler 0.1.2 → 1.0.0
- data/README.md +32 -9
- data/bin/krawl +24 -9
- data/lib/krawler/version.rb +1 -1
- data/lib/krawler.rb +4 -5
- metadata +3 -3
data/README.md
CHANGED
@@ -1,24 +1,47 @@
 # Krawler
 
-
+Simple little command-line web crawler. Use it to find 404s or 500s on your site.
+I use it to warm caches. Multi-threading enabled for faster crawling.
 
 ## Installation
 
-
+Install:
 
-gem
+    gem install krawler
 
-
+## Usage
 
-
+From the command line:
 
-
+    $ krawl http://localhost:3000/
 
-
+Options:
 
-
+    -e, --exclude regex       Exclude matching paths
+    -s, --sub-restrict        Restrict to sub paths of base url
+    -c, --concurrent count    Crawl with count number of concurrent connections
+    -r, --randomize           Randomize crawl path
+
+Examples:
+
+Restrict crawling to sub-paths of /public
+
+    $ krawl http://localhost:3000/public -s
+
+Restrict crawling to paths that do not match `/^\/api\//`
+
+    $ krawl http://localhost:3000/ -e "^\/api\/"
+
+Crawl with 4 concurrent threaded crawlers. Make sure your server is capable of handling
+concurrent requests.
+
+    $ krawl http://production.server -c 4
+
+Randomize the crawl path. Helpful when you have a lot of links and get bored watching
+the same crawl path over and over.
+
+    $ krawl http://localhost:3000/ -r
 
-TODO: Write usage instructions here
 
 ## Contributing
 
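The `-e` flag takes a bare pattern string rather than a Ruby regexp literal. A minimal sketch of the matching this implies, assuming the crawler compiles the flag's value with `Regexp.new` and skips any path it matches (the `paths` data below is hypothetical, not from the gem):

    # Hypothetical data; only the Regexp.new matching is the point here.
    exclude = Regexp.new("^\\/api\\/")   # what -e "^\/api\/" compiles to

    paths = ["/api/users", "/public/about", "/"]
    p paths.reject { |path| exclude =~ path }
    # => ["/public/about", "/"]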
data/bin/krawl
CHANGED
@@ -4,18 +4,31 @@ require 'optparse'
 
 options = {}
 optparse = OptionParser.new do |opts|
-  opts.banner =
+  opts.banner = 'Usage: krawl [base url] [options]'
 
-  opts.separator
-  opts.separator
+  opts.separator ''
+  opts.separator 'Specific options:'
 
-  opts.on(
-    options[:
+  opts.on('-e', '--exclude regex', 'Exclude matching paths') do |e|
+    options[:e] = e
   end
 
-  opts.on(
-    options[:
+  opts.on('-s', '--sub-restrict', 'Restrict to sub paths of base url', 'Default: false') do |s|
+    options[:s] = true
   end
+
+  opts.on('-c', '--concurrent count', 'Crawl with count number of concurrent connections', 'Default: 4') do |c|
+    options[:c] = c.to_i
+  end
+
+  opts.on('-r', '--randomize', 'Randomize crawl path', 'Default: true') do |r|
+    options[:r] = r
+  end
+
+  opts.separator ''
+
+  opts.on('-h', '--help', 'Show this help message.') { puts opts; exit }
+
 end
 optparse.parse!
 
@@ -25,6 +38,8 @@ if ARGV.empty? || !(ARGV[0] =~ /^http/)
 end
 
 Krawler::Base.new(ARGV[0] || 'http://localhost:3000/', {
-  :exclude
-  :restrict
+  :exclude   => options[:e],
+  :restrict  => options[:s],
+  :threads   => options[:c],
+  :randomize => options[:r]
 }).base
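A note on the `opts.on` calls above: OptionParser treats every extra string argument (such as `'Default: 4'`) as an additional line of help text, which is how the defaults end up in `--help` output. A self-contained sketch of that pattern:

    require 'optparse'

    options = {}
    parser = OptionParser.new do |opts|
      opts.banner = 'Usage: krawl [base url] [options]'
      # Each trailing string becomes one more description line in the help.
      opts.on('-c', '--concurrent count',
              'Crawl with count number of concurrent connections',
              'Default: 4') do |c|
        options[:c] = c.to_i  # option values arrive as strings; coerce explicitly
      end
    end

    parser.parse!(['-c', '8'])
    p options   # => {:c=>8}
    puts parser # prints the banner plus both description lines for -c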
data/lib/krawler/version.rb
CHANGED
data/lib/krawler.rb
CHANGED
@@ -3,7 +3,6 @@ require 'mechanize'
 require 'timeout'
 require 'uri'
 require 'thread'
-require 'pry'
 
 module Krawler
 
@@ -19,8 +18,8 @@ module Krawler
       @suspect_links = []
       @exclude = options[:exclude]
       @restrict = options[:restrict]
-      @randomize =
-      @
+      @randomize = options[:randomize]
+      @threads = options[:threads] || 1
       @mutex = Mutex.new
       @agent = Mechanize.new
     end
 
@@ -40,9 +39,9 @@ module Krawler
       @suspect_links.each { |link| puts link }
     end
 
-    def initialize_threads
+    def initialize_threads(agent)
       threads = []
-      @
+      @threads.times do |i|
         threads << Thread.new(i) do
 
           agent = @agent.dup
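The hunk above only shows the shape of the worker loop: `@threads` threads, each duplicating the shared Mechanize agent, with `@mutex` guarding shared state. A runnable sketch of that pattern under assumed names (the link queue and its handling are hypothetical, not from the diff):

    require 'thread'

    links   = ['/a', '/b', '/c', '/d']  # hypothetical stand-in for the crawl queue
    mutex   = Mutex.new
    workers = 4                         # plays the role of @threads

    threads = workers.times.map do |i|
      Thread.new(i) do |id|
        loop do
          link = mutex.synchronize { links.shift }  # only one thread pops at a time
          break if link.nil?
          puts "worker #{id} fetching #{link}"      # stand-in for agent.get(link)
        end
      end
    end

    threads.each(&:join)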
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: krawler
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 1.0.0
 prerelease:
 platform: ruby
 authors:
@@ -13,7 +13,7 @@ date: 2012-05-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
-  requirement: &
+  requirement: &70309315236200 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -21,7 +21,7 @@ dependencies:
       version: 2.5.0
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70309315236200
 description: Simple little website crawler.
 email:
 - mike@urlgonomics.com