krawler 1.0.8 → 1.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/krawl +5 -0
- data/lib/krawler/version.rb +1 -1
- data/lib/krawler.rb +15 -4
- metadata +4 -5
data/bin/krawl
CHANGED
@@ -13,6 +13,10 @@ optparse = OptionParser.new do |opts|
     options[:e] = e
   end
 
+  opts.on('-i', '--include regex', 'Include matching paths regardless of sub path restriction') do |i|
+    options[:i] = i
+  end
+
   opts.on('-s', '--sub-restrict', 'Restrict to sub paths of base url', 'Default: false') do |s|
     options[:s] = true
   end
@@ -55,6 +59,7 @@ end
 
 Krawler::Base.new(ARGV[0] || 'http://localhost:3000/', {
   :exclude => options[:e],
+  :include => options[:i],
   :restrict => options[:s],
   :threads => options[:c],
   :randomize => options[:r],
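The new -i/--include option feeds an :include pattern into Krawler::Base alongside the existing -s sub-path restriction. A hypothetical invocation, assuming a /docs sub-path and a '^/blog' include pattern chosen purely for illustration:

    krawl http://localhost:3000/docs/ -s -i '^/blog'

Here -s restricts the crawl to paths under /docs, while the -i regex lets matching /blog paths through regardless of that restriction.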
data/lib/krawler/version.rb
CHANGED
data/lib/krawler.rb
CHANGED
@@ -3,6 +3,7 @@ require 'mechanize'
 require 'timeout'
 require 'uri'
 require 'thread'
+
 module Krawler
 
   class Base
@@ -16,12 +17,13 @@ module Krawler
       @bad_links = []
       @suspect_links = []
       @exclude = options[:exclude]
+      @include = options[:include]
       @restrict = options[:restrict]
       @randomize = options[:randomize]
       @threads = options[:threads] || 1
       @username = options[:username]
      @password = options[:password]
-      @login_url
+      @login_url = options[:login_url]
       @mutex = Mutex.new
       @agent = Mechanize.new
       @agent.user_agent = 'Krawler'
@@ -149,10 +151,19 @@ module Krawler
 
       if (new_link =~ /^#{Regexp.escape(@host)}/) || (new_link =~ /^\//) # don't crawl external domains
 
-        next if @crawled_links.include?(new_link) || @links_to_crawl.include?(new_link)
+        next if @crawled_links.include?(new_link) || @links_to_crawl.include?(new_link) # don't crawl what we've alread crawled
         next if @exclude && new_link =~ /#{@exclude}/ # don't crawl excluded matched paths
-
-
+
+        if @restrict # don't crawl outside of our restricted base path
+          if @include && new_url.path =~ /#{@include}/ # unless we match our inclusion
+            # ignore
+          else
+            if new_url.path !~ /^#{Regexp.escape(@base_path)}/
+              next
+            end
+          end
+        end
+
         @links_to_crawl << new_link
       end
     end
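The effect of the new branch is that :include carves out an exception to :restrict: a link whose path falls outside the restricted base path is still queued when it matches the include regex. A minimal standalone sketch of that decision (not the gem's own code), using hypothetical values for the base path, include pattern, and candidate link:

require 'uri'

# Hypothetical values, purely for illustration.
base_path = '/docs'       # path the crawl is restricted to (:restrict)
include_r = '^/blog'      # :include regex passed via -i
restrict  = true

new_url = URI.parse('http://localhost:3000/blog/post-1')

skip = if restrict
         if include_r && new_url.path =~ /#{include_r}/
           false  # matches the inclusion, so it is queued despite being outside /docs
         else
           new_url.path !~ /^#{Regexp.escape(base_path)}/  # outside the base path -> skip
         end
       else
         false
       end

puts skip ? 'skipped' : 'queued'   # => queued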
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: krawler
 version: !ruby/object:Gem::Version
-  version: 1.0.8
+  version: 1.0.9
 prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-
+date: 2012-11-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
-  requirement: &
+  requirement: &70292261892960 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -21,7 +21,7 @@ dependencies:
         version: 2.5.1
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70292261892960
 description: Simple little website crawler.
 email:
 - mike@urlgonomics.com
@@ -65,4 +65,3 @@ signing_key:
 specification_version: 3
 summary: ''
 test_files: []
-has_rdoc: