krawler 0.1.2 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +32 -9
- data/bin/krawl +24 -9
- data/lib/krawler/version.rb +1 -1
- data/lib/krawler.rb +4 -5
- metadata +3 -3
data/README.md
CHANGED
@@ -1,24 +1,47 @@
 # Krawler

-
+Simple little command-line web crawler. Use it to find 404's or 500's on your site.
+I use it to warm caches. Multi-threading enabled for faster crawling.

 ## Installation

-
+Install:

-    gem
+    gem install krawler

-
+## Usage

-
+From the command line:

-
+    $ krawl http://localhost:3000/

-
+Options:

-
+    -e, --exclude regex     Exclude matching paths
+    -s, --sub-restrict      Restrict to sub paths of base url
+    -c, --concurrent count  Crawl with count number of concurrent connections
+    -r, --randomize         Randomize crawl path
+
+Examples:
+
+Restrict crawling to sub-paths of /public
+
+    $ krawl http://localhost:3000/public -s
+
+Restrict crawling to paths that do not match `/^\/api\//`
+
+    $ krawl http://localhost:3000/ -e "^\/api\/"
+
+Crawl with 4 concurrent threaded crawlers. Make sure your server is capable of handling
+concurrent requests.
+
+    $ krawl http://production.server -c 4
+
+Randomize the crawl path. Helpful when you have a lot of links and get bored watching
+the same crawl path over and over.
+
+    $ krawl http://localhost:3000/ -r

-TODO: Write usage instructions here

 ## Contributing

data/bin/krawl
CHANGED
@@ -4,18 +4,31 @@ require 'optparse'

 options = {}
 optparse = OptionParser.new do |opts|
-  opts.banner =
+  opts.banner = 'Usage: krawl [base url] [options]'

-  opts.separator
-  opts.separator
+  opts.separator ''
+  opts.separator 'Specific options:'

-  opts.on(
-  options[:
+  opts.on('-e', '--exclude regex', 'Exclude matching paths') do |e|
+    options[:e] = e
   end

-  opts.on(
-  options[:
+  opts.on('-s', '--sub-restrict', 'Restrict to sub paths of base url', 'Default: false') do |s|
+    options[:s] = true
   end
+
+  opts.on('-c', '--concurrent count', 'Crawl with count number of concurrent connections', 'Default: 4') do |c|
+    options[:c] = c.to_i
+  end
+
+  opts.on('-r', '--randomize', 'Randomize crawl path', 'Default: true') do |r|
+    options[:r] = r
+  end
+
+  opts.separator ''
+
+  opts.on('-h', '--help', 'Show this help message.') { puts opts; exit }
+
 end
 optparse.parse!

@@ -25,6 +38,8 @@ if ARGV.empty? || !(ARGV[0] =~ /^http/)
 end

 Krawler::Base.new(ARGV[0] || 'http://localhost:3000/', {
-  :exclude
-  :restrict
+  :exclude => options[:e],
+  :restrict => options[:s],
+  :threads => options[:c],
+  :randomize => options[:r]
 }).base
data/lib/krawler/version.rb
CHANGED
data/lib/krawler.rb
CHANGED
@@ -3,7 +3,6 @@ require 'mechanize'
 require 'timeout'
 require 'uri'
 require 'thread'
-require 'pry'

 module Krawler

@@ -19,8 +18,8 @@ module Krawler
       @suspect_links = []
       @exclude = options[:exclude]
       @restrict = options[:restrict]
-      @randomize =
-      @
+      @randomize = options[:randomize]
+      @threads = options[:threads] || 1
       @mutex = Mutex.new
       @agent = Mechanize.new
     end
@@ -40,9 +39,9 @@ module Krawler
       @suspect_links.each { |link| puts link }
     end

-    def initialize_threads
+    def initialize_threads(agent)
       threads = []
-      @
+      @threads.times do |i|
         threads << Thread.new(i) do

           agent = @agent.dup
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: krawler
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 1.0.0
 prerelease:
 platform: ruby
 authors:
@@ -13,7 +13,7 @@ date: 2012-05-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
-  requirement: &
+  requirement: &70309315236200 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -21,7 +21,7 @@ dependencies:
         version: 2.5.0
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70309315236200
 description: Simple little website crawler.
 email:
 - mike@urlgonomics.com