krawler 1.0.8 → 1.0.9

data/bin/krawl CHANGED
@@ -13,6 +13,10 @@ optparse = OptionParser.new do |opts|
     options[:e] = e
   end
 
+  opts.on('-i', '--include regex', 'Include matching paths regardless of sub path restriction') do |i|
+    options[:i] = i
+  end
+
   opts.on('-s', '--sub-restrict', 'Restrict to sub paths of base url', 'Default: false') do |s|
     options[:s] = true
   end
@@ -55,6 +59,7 @@ end
 
 Krawler::Base.new(ARGV[0] || 'http://localhost:3000/', {
   :exclude => options[:e],
+  :include => options[:i],
   :restrict => options[:s],
   :threads => options[:c],
   :randomize => options[:r],
data/lib/krawler/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Krawler
-  VERSION = '1.0.8'
+  VERSION = '1.0.9'
 end
data/lib/krawler.rb CHANGED
@@ -3,6 +3,7 @@ require 'mechanize'
 require 'timeout'
 require 'uri'
 require 'thread'
+
 module Krawler
 
   class Base
@@ -16,12 +17,13 @@ module Krawler
       @bad_links = []
       @suspect_links = []
       @exclude = options[:exclude]
+      @include = options[:include]
       @restrict = options[:restrict]
       @randomize = options[:randomize]
       @threads = options[:threads] || 1
       @username = options[:username]
       @password = options[:password]
-      @login_url = options[:login_url]
+      @login_url = options[:login_url]
       @mutex = Mutex.new
       @agent = Mechanize.new
       @agent.user_agent = 'Krawler'
@@ -149,10 +151,19 @@ module Krawler
 
       if (new_link =~ /^#{Regexp.escape(@host)}/) || (new_link =~ /^\//) # don't crawl external domains
 
-        next if @crawled_links.include?(new_link) || @links_to_crawl.include?(new_link) # don't crawl what we've alread crawled
+        next if @crawled_links.include?(new_link) || @links_to_crawl.include?(new_link) # don't crawl what we've alread crawled
         next if @exclude && new_link =~ /#{@exclude}/ # don't crawl excluded matched paths
-        next if @restrict && (new_url.path !~ /^#{Regexp.escape(@base_path)}/) # don't crawl outside of our restricted base path
-
+
+        if @restrict # don't crawl outside of our restricted base path
+          if @include && new_url.path =~ /#{@include}/ # unless we match our inclusion
+            # ignore
+          else
+            if new_url.path !~ /^#{Regexp.escape(@base_path)}/
+              next
+            end
+          end
+        end
+
         @links_to_crawl << new_link
       end
     end
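
This hunk replaces the single-line restrict guard with a nested conditional: under sub-path restriction, a link whose path matches the include regex is crawled even when it falls outside the base path. A minimal sketch of the equivalent decision as a standalone predicate; `skip_link?` is a hypothetical helper for illustration, not part of the gem:

    # Skip a link only when restriction is on, the include regex (if any)
    # does not match, and the path falls outside the restricted base path.
    def skip_link?(path, base_path, restrict, include_regex)
      return false unless restrict                                  # no restriction: crawl everything
      return false if include_regex && path =~ /#{include_regex}/   # include wins over restrict
      path !~ /^#{Regexp.escape(base_path)}/                        # outside base path: skip
    end

    skip_link?('/blog/post', '/docs/', true, '^/blog')  # => false (included despite restriction)
    skip_link?('/about',     '/docs/', true, '^/blog')  # => true  (outside /docs/, not included)
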
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: krawler
 version: !ruby/object:Gem::Version
-  version: 1.0.8
+  version: 1.0.9
 prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-10-31 00:00:00.000000000 Z
+date: 2012-11-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
-  requirement: &70166215374240 !ruby/object:Gem::Requirement
+  requirement: &70292261892960 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -21,7 +21,7 @@ dependencies:
         version: 2.5.1
   type: :runtime
   prerelease: false
-  version_requirements: *70166215374240
+  version_requirements: *70292261892960
 description: Simple little website crawler.
 email:
 - mike@urlgonomics.com
@@ -65,4 +65,3 @@ signing_key:
 specification_version: 3
 summary: ''
 test_files: []
-has_rdoc: