krawler 0.1.2 → 1.0.0

data/README.md CHANGED
@@ -1,24 +1,47 @@
  # Krawler
 
- TODO: Write a gem description
+ Simple little command-line web crawler. Use it to find 404s or 500s on your site.
+ I use it to warm caches. Multi-threading enabled for faster crawling.
 
  ## Installation
 
- Add this line to your application's Gemfile:
+ Install:
 
- gem 'krawler'
+ gem install krawler
 
- And then execute:
+ ## Usage
 
- $ bundle
+ From the command line:
 
- Or install it yourself as:
+ $ krawl http://localhost:3000/
 
- $ gem install krawler
+ Options:
 
- ## Usage
+ -e, --exclude regex Exclude matching paths
+ -s, --sub-restrict Restrict to sub paths of base url
+ -c, --concurrent count Crawl with count number of concurrent connections
+ -r, --randomize Randomize crawl path
+
+ Examples:
+
+ Restrict crawling to sub-paths of /public
+
+ $ krawl http://localhost:3000/public -s
+
+ Restrict crawling to paths that do not match `/^\/api\//`
+
+ $ krawl http://localhost:3000/ -e "^\/api\/"
+
+ Crawl with 4 concurrent threaded crawlers. Make sure your server is capable of handling
+ concurrent requests.
+
+ $ krawl http://production.server -c 4
+
+ Randomize the crawl path. Helpful when you have a lot of links and get bored watching
+ the same crawl path over and over.
+
+ $ krawl http://localhost:3000/ -r
 
- TODO: Write usage instructions here
 
  ## Contributing
 
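The `-e`/`--exclude` option documented above takes a regular expression as a string. As a hypothetical illustration (not code from the gem), this is how such a pattern would match or miss a path in Ruby:

```ruby
# Hypothetical: the --exclude pattern compiled to a Regexp and tested against paths.
exclude = Regexp.new("^\\/api\\/")

exclude =~ '/api/users'   # => 0   (match, so this path would be excluded)
exclude =~ '/public/faq'  # => nil (no match, so this path would be crawled)
```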
data/bin/krawl CHANGED
@@ -4,18 +4,31 @@ require 'optparse'
 
  options = {}
  optparse = OptionParser.new do |opts|
- opts.banner = "Usage: krawl [url] [options]"
+ opts.banner = 'Usage: krawl [base url] [options]'
 
- opts.separator ""
- opts.separator "Specific options:"
+ opts.separator ''
+ opts.separator 'Specific options:'
 
- opts.on("-ex [regex]", "Exclude matching paths") do |ex|
- options[:ex] = ex
+ opts.on('-e', '--exclude regex', 'Exclude matching paths') do |e|
+ options[:e] = e
  end
 
- opts.on("-r", "Restrict to sub paths") do |r|
- options[:r] = true
+ opts.on('-s', '--sub-restrict', 'Restrict to sub paths of base url', 'Default: false') do |s|
+ options[:s] = true
  end
+
+ opts.on('-c', '--concurrent count', 'Crawl with count number of concurrent connections', 'Default: 4') do |c|
+ options[:c] = c.to_i
+ end
+
+ opts.on('-r', '--randomize', 'Randomize crawl path', 'Default: true') do |r|
+ options[:r] = r
+ end
+
+ opts.separator ''
+
+ opts.on('-h', '--help', 'Show this help message.') { puts opts; exit }
+
  end
  optparse.parse!
 
@@ -25,6 +38,8 @@ if ARGV.empty? || !(ARGV[0] =~ /^http/)
  end
 
  Krawler::Base.new(ARGV[0] || 'http://localhost:3000/', {
- :exclude => options[:ex],
- :restrict => options[:r]
+ :exclude => options[:e],
+ :restrict => options[:s],
+ :threads => options[:c],
+ :randomize => options[:r]
  }).base
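The option hash assembled above is handed straight to `Krawler::Base`. A minimal sketch of driving the crawler from Ruby instead of the `krawl` binary, using only the keys and calls visible in this diff (the `:exclude` value is the same string the `-e` flag would provide; how the library interprets it is not shown here):

```ruby
require 'krawler'

# Sketch only: mirrors the Krawler::Base call in bin/krawl above.
Krawler::Base.new('http://localhost:3000/', {
  :exclude   => '^\/api\/',  # same string the -e flag would pass through
  :restrict  => true,        # stay under the base url, like -s
  :threads   => 4,           # number of concurrent crawlers, like -c 4
  :randomize => true         # shuffle the crawl order, like -r
}).base
```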
data/lib/krawler/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Krawler
- VERSION = "0.1.2"
+ VERSION = "1.0.0"
  end
data/lib/krawler.rb CHANGED
@@ -3,7 +3,6 @@ require 'mechanize'
  require 'timeout'
  require 'uri'
  require 'thread'
- require 'pry'
 
  module Krawler
 
@@ -19,8 +18,8 @@ module Krawler
  @suspect_links = []
  @exclude = options[:exclude]
  @restrict = options[:restrict]
- @randomize = true
- @max_threads = 4
+ @randomize = options[:randomize]
+ @threads = options[:threads] || 1
  @mutex = Mutex.new
  @agent = Mechanize.new
  end
@@ -40,9 +39,9 @@ module Krawler
  @suspect_links.each { |link| puts link }
  end
 
- def initialize_threads
+ def initialize_threads(agent)
  threads = []
- @max_threads.times do |i|
+ @threads.times do |i|
  threads << Thread.new(i) do
 
  agent = @agent.dup
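The changes above replace the hard-coded `@max_threads = 4` with a `:threads` option (defaulting to 1) and take `@randomize` from the options; each worker still dups the shared Mechanize agent. The crawl loop itself is outside this diff, so the following is only a rough sketch, under those assumptions, of the worker model these lines suggest (all names are stand-ins, not the gem's internals):

```ruby
require 'thread'

links        = ['/a', '/b', '/c', '/d']  # stand-in for the crawl frontier
mutex        = Mutex.new
thread_count = 4                          # what :threads controls in the gem

workers = Array.new(thread_count) do
  Thread.new do
    # In the gem, each worker starts from `agent = @agent.dup` so Mechanize
    # state is not shared between threads.
    loop do
      link = mutex.synchronize { links.shift }  # take one link under the lock
      break if link.nil?
      puts "crawling #{link}"  # a real worker would fetch the page here
    end
  end
end

workers.each(&:join)
```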
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: krawler
  version: !ruby/object:Gem::Version
- version: 0.1.2
+ version: 1.0.0
  prerelease:
  platform: ruby
  authors:
@@ -13,7 +13,7 @@ date: 2012-05-20 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: mechanize
- requirement: &70168780830200 !ruby/object:Gem::Requirement
+ requirement: &70309315236200 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ~>
@@ -21,7 +21,7 @@ dependencies:
  version: 2.5.0
  type: :runtime
  prerelease: false
- version_requirements: *70168780830200
+ version_requirements: *70309315236200
  description: Simple little website crawler.
  email:
  - mike@urlgonomics.com