krawler 0.1.2 → 1.0.0
- data/README.md +32 -9
- data/bin/krawl +24 -9
- data/lib/krawler/version.rb +1 -1
- data/lib/krawler.rb +4 -5
- metadata +3 -3
data/README.md
CHANGED
@@ -1,24 +1,47 @@
 # Krawler
 
-
+Simple little command-line web crawler. Use it to find 404s or 500s on your site.
+I use it to warm caches. Multi-threading enabled for faster crawling.
 
 ## Installation
 
-
+Install:
 
-gem
+    gem install krawler
 
-
+## Usage
 
-
+From the command line:
 
-
+    $ krawl http://localhost:3000/
 
-
+Options:
 
-
+    -e, --exclude regex       Exclude matching paths
+    -s, --sub-restrict        Restrict to sub paths of base url
+    -c, --concurrent count    Crawl with count number of concurrent connections
+    -r, --randomize           Randomize crawl path
+
+Examples:
+
+Restrict crawling to sub-paths of /public
+
+    $ krawl http://localhost:3000/public -s
+
+Restrict crawling to paths that do not match `/^\/api\//`
+
+    $ krawl http://localhost:3000/ -e "^\/api\/"
+
+Crawl with 4 concurrent threaded crawlers. Make sure your server is capable of handling
+concurrent requests.
+
+    $ krawl http://production.server -c 4
+
+Randomize the crawl path. Helpful when you have a lot of links and get bored watching
+the same crawl path over and over.
+
+    $ krawl http://localhost:3000/ -r
 
-TODO: Write usage instructions here
 
 ## Contributing
 
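The `-e` flag takes a bare pattern string rather than a Ruby regexp literal. A minimal sketch of the matching this implies, assuming the crawler compiles the flag's value with `Regexp.new` and skips any path it matches (the `paths` data below is hypothetical, not from the gem):

    # Hypothetical data; only the Regexp.new matching is the point here.
    exclude = Regexp.new("^\\/api\\/")   # what -e "^\/api\/" compiles to

    paths = ["/api/users", "/public/about", "/"]
    p paths.reject { |path| exclude =~ path }
    # => ["/public/about", "/"]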
data/bin/krawl
CHANGED
@@ -4,18 +4,31 @@ require 'optparse'
 
 options = {}
 optparse = OptionParser.new do |opts|
-  opts.banner =
+  opts.banner = 'Usage: krawl [base url] [options]'
 
-  opts.separator
-  opts.separator
+  opts.separator ''
+  opts.separator 'Specific options:'
 
-  opts.on(
-    options[:
+  opts.on('-e', '--exclude regex', 'Exclude matching paths') do |e|
+    options[:e] = e
   end
 
-  opts.on(
-    options[:
+  opts.on('-s', '--sub-restrict', 'Restrict to sub paths of base url', 'Default: false') do |s|
+    options[:s] = true
   end
+
+  opts.on('-c', '--concurrent count', 'Crawl with count number of concurrent connections', 'Default: 4') do |c|
+    options[:c] = c.to_i
+  end
+
+  opts.on('-r', '--randomize', 'Randomize crawl path', 'Default: true') do |r|
+    options[:r] = r
+  end
+
+  opts.separator ''
+
+  opts.on('-h', '--help', 'Show this help message.') { puts opts; exit }
+
 end
 optparse.parse!
 
@@ -25,6 +38,8 @@ if ARGV.empty? || !(ARGV[0] =~ /^http/)
 end
 
 Krawler::Base.new(ARGV[0] || 'http://localhost:3000/', {
-  :exclude
-  :restrict
+  :exclude   => options[:e],
+  :restrict  => options[:s],
+  :threads   => options[:c],
+  :randomize => options[:r]
 }).base
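A note on the `opts.on` calls above: OptionParser treats every extra string argument (such as `'Default: 4'`) as an additional line of help text, which is how the defaults end up in `--help` output. A self-contained sketch of that pattern:

    require 'optparse'

    options = {}
    parser = OptionParser.new do |opts|
      opts.banner = 'Usage: krawl [base url] [options]'
      # Each trailing string becomes one more description line in the help.
      opts.on('-c', '--concurrent count',
              'Crawl with count number of concurrent connections',
              'Default: 4') do |c|
        options[:c] = c.to_i  # option values arrive as strings; coerce explicitly
      end
    end

    parser.parse!(['-c', '8'])
    p options   # => {:c=>8}
    puts parser # prints the banner plus both description lines for -c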
data/lib/krawler/version.rb
CHANGED
data/lib/krawler.rb
CHANGED
@@ -3,7 +3,6 @@ require 'mechanize'
 require 'timeout'
 require 'uri'
 require 'thread'
-require 'pry'
 
 module Krawler
 
@@ -19,8 +18,8 @@ module Krawler
       @suspect_links = []
       @exclude = options[:exclude]
       @restrict = options[:restrict]
-      @randomize =
-      @
+      @randomize = options[:randomize]
+      @threads = options[:threads] || 1
       @mutex = Mutex.new
       @agent = Mechanize.new
     end
 
@@ -40,9 +39,9 @@ module Krawler
       @suspect_links.each { |link| puts link }
     end
 
-    def initialize_threads
+    def initialize_threads(agent)
       threads = []
-      @
+      @threads.times do |i|
         threads << Thread.new(i) do
 
           agent = @agent.dup
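The hunk above only shows the shape of the worker loop: `@threads` threads, each duplicating the shared Mechanize agent, with `@mutex` guarding shared state. A runnable sketch of that pattern under assumed names (the link queue and its handling are hypothetical, not from the diff):

    require 'thread'

    links   = ['/a', '/b', '/c', '/d']  # hypothetical stand-in for the crawl queue
    mutex   = Mutex.new
    workers = 4                         # plays the role of @threads

    threads = workers.times.map do |i|
      Thread.new(i) do |id|
        loop do
          link = mutex.synchronize { links.shift }  # only one thread pops at a time
          break if link.nil?
          puts "worker #{id} fetching #{link}"      # stand-in for agent.get(link)
        end
      end
    end

    threads.each(&:join)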
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: krawler
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 1.0.0
 prerelease:
 platform: ruby
 authors:
@@ -13,7 +13,7 @@ date: 2012-05-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
-  requirement: &
+  requirement: &70309315236200 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -21,7 +21,7 @@ dependencies:
       version: 2.5.0
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70309315236200
 description: Simple little website crawler.
 email:
 - mike@urlgonomics.com