krawler 1.0.8 → 1.0.9
- data/bin/krawl +5 -0
- data/lib/krawler/version.rb +1 -1
- data/lib/krawler.rb +15 -4
- metadata +4 -5
data/bin/krawl
CHANGED
@@ -13,6 +13,10 @@ optparse = OptionParser.new do |opts|
     options[:e] = e
   end
 
+  opts.on('-i', '--include regex', 'Include matching paths regardless of sub path restriction') do |i|
+    options[:i] = i
+  end
+
   opts.on('-s', '--sub-restrict', 'Restrict to sub paths of base url', 'Default: false') do |s|
     options[:s] = true
   end
@@ -55,6 +59,7 @@ end
 
 Krawler::Base.new(ARGV[0] || 'http://localhost:3000/', {
   :exclude => options[:e],
+  :include => options[:i],
   :restrict => options[:s],
   :threads => options[:c],
   :randomize => options[:r],
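Taken together, the two hunks above wire the new -i/--include flag through to the Krawler::Base constructor. As a rough sketch, the equivalent programmatic call might look like this (the URL and regex are illustrative assumptions, not values from this diff, and the call that actually starts the crawl is omitted because it does not appear here):

require 'krawler'

# Restrict crawling to paths under /blog, but still follow any link
# whose path matches the :include regex (the option added in 1.0.9).
Krawler::Base.new('http://localhost:3000/blog/', {
  :exclude  => nil,
  :include  => '^/sitemap',  # hypothetical pattern; exempts matching paths from :restrict
  :restrict => true,
  :threads  => 1
})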
data/lib/krawler/version.rb
CHANGED
-  VERSION = '1.0.8'
+  VERSION = '1.0.9'
data/lib/krawler.rb
CHANGED
@@ -3,6 +3,7 @@ require 'mechanize'
 require 'timeout'
 require 'uri'
 require 'thread'
+
 module Krawler
 
   class Base
@@ -16,12 +17,13 @@ module Krawler
       @bad_links = []
       @suspect_links = []
       @exclude = options[:exclude]
+      @include = options[:include]
       @restrict = options[:restrict]
       @randomize = options[:randomize]
       @threads = options[:threads] || 1
       @username = options[:username]
       @password = options[:password]
-      @login_url
+      @login_url = options[:login_url]
       @mutex = Mutex.new
       @agent = Mechanize.new
       @agent.user_agent = 'Krawler'
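One real fix hides in this hunk: in 1.0.8 the removed line was a bare @login_url expression, which merely evaluates the (nil) instance variable and discards the result, so the :login_url option was never stored. A minimal illustration of the difference (the class names are made up for this example):

# 1.0.8 behavior: a bare instance-variable reference is a no-op, so the option is lost.
class Before
  def initialize(options)
    @login_url
  end
end

# 1.0.9 behavior: the option is assigned and available to the crawler.
class After
  def initialize(options)
    @login_url = options[:login_url]
  end
end

Before.new(:login_url => '/login').instance_variable_get(:@login_url) #=> nil
After.new(:login_url => '/login').instance_variable_get(:@login_url)  #=> "/login"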
@@ -149,10 +151,19 @@ module Krawler
 
       if (new_link =~ /^#{Regexp.escape(@host)}/) || (new_link =~ /^\//) # don't crawl external domains
 
-        next if @crawled_links.include?(new_link) || @links_to_crawl.include?(new_link)
+        next if @crawled_links.include?(new_link) || @links_to_crawl.include?(new_link) # don't crawl what we've already crawled
         next if @exclude && new_link =~ /#{@exclude}/ # don't crawl excluded matched paths
-
-
+
+        if @restrict # don't crawl outside of our restricted base path
+          if @include && new_url.path =~ /#{@include}/ # unless we match our inclusion
+            # ignore
+          else
+            if new_url.path !~ /^#{Regexp.escape(@base_path)}/
+              next
+            end
+          end
+        end
+
         @links_to_crawl << new_link
       end
     end
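The nested conditional added here reads: when :restrict is set, skip links whose path falls outside @base_path, unless the path matches the :include regex. A standalone sketch of that decision, with hypothetical method and argument names (not part of the gem):

# Should a candidate path be skipped under sub-path restriction?
# Mirrors the restrict/include logic in the hunk above.
def skip_under_restriction?(path, base_path, include_pattern)
  return false if include_pattern && path =~ /#{include_pattern}/ # inclusion wins
  path !~ /^#{Regexp.escape(base_path)}/                          # skip anything outside the base path
end

skip_under_restriction?('/admin/users', '/blog', nil)     #=> true  (outside /blog)
skip_under_restriction?('/blog/post-1', '/blog', nil)     #=> false (inside /blog)
skip_under_restriction?('/admin/users', '/blog', 'admin') #=> false (matched the include pattern)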
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: krawler
 version: !ruby/object:Gem::Version
-  version: 1.0.8
+  version: 1.0.9
 prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-
+date: 2012-11-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
-  requirement: &
+  requirement: &70292261892960 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -21,7 +21,7 @@ dependencies:
         version: 2.5.1
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70292261892960
 description: Simple little website crawler.
 email:
 - mike@urlgonomics.com
@@ -65,4 +65,3 @@ signing_key:
 specification_version: 3
 summary: ''
 test_files: []
-has_rdoc: