krawler 1.0.8 → 1.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/krawl CHANGED
@@ -13,6 +13,10 @@ optparse = OptionParser.new do |opts|
13
13
  options[:e] = e
14
14
  end
15
15
 
16
+ opts.on('-i', '--include regex', 'Include matching paths regardless of sub path restriction') do |i|
17
+ options[:i] = i
18
+ end
19
+
16
20
  opts.on('-s', '--sub-restrict', 'Restrict to sub paths of base url', 'Default: false') do |s|
17
21
  options[:s] = true
18
22
  end
@@ -55,6 +59,7 @@ end
55
59
 
56
60
  Krawler::Base.new(ARGV[0] || 'http://localhost:3000/', {
57
61
  :exclude => options[:e],
62
+ :include => options[:i],
58
63
  :restrict => options[:s],
59
64
  :threads => options[:c],
60
65
  :randomize => options[:r],
@@ -1,3 +1,3 @@
1
1
  module Krawler
2
- VERSION = '1.0.8'
2
+ VERSION = '1.0.9'
3
3
  end
data/lib/krawler.rb CHANGED
@@ -3,6 +3,7 @@ require 'mechanize'
3
3
  require 'timeout'
4
4
  require 'uri'
5
5
  require 'thread'
6
+
6
7
  module Krawler
7
8
 
8
9
  class Base
@@ -16,12 +17,13 @@ module Krawler
16
17
  @bad_links = []
17
18
  @suspect_links = []
18
19
  @exclude = options[:exclude]
20
+ @include = options[:include]
19
21
  @restrict = options[:restrict]
20
22
  @randomize = options[:randomize]
21
23
  @threads = options[:threads] || 1
22
24
  @username = options[:username]
23
25
  @password = options[:password]
24
- @login_url = options[:login_url]
26
+ @login_url = options[:login_url]
25
27
  @mutex = Mutex.new
26
28
  @agent = Mechanize.new
27
29
  @agent.user_agent = 'Krawler'
@@ -149,10 +151,19 @@ module Krawler
149
151
 
150
152
  if (new_link =~ /^#{Regexp.escape(@host)}/) || (new_link =~ /^\//) # don't crawl external domains
151
153
 
152
- next if @crawled_links.include?(new_link) || @links_to_crawl.include?(new_link) # don't crawl what we've already crawled
154
+ next if @crawled_links.include?(new_link) || @links_to_crawl.include?(new_link) # don't crawl what we've already crawled
153
155
  next if @exclude && new_link =~ /#{@exclude}/ # don't crawl excluded matched paths
154
- next if @restrict && (new_url.path !~ /^#{Regexp.escape(@base_path)}/) # don't crawl outside of our restricted base path
155
-
156
+
157
+ if @restrict # don't crawl outside of our restricted base path
158
+ if @include && new_url.path =~ /#{@include}/ # unless we match our inclusion
159
+ # ignore
160
+ else
161
+ if new_url.path !~ /^#{Regexp.escape(@base_path)}/
162
+ next
163
+ end
164
+ end
165
+ end
166
+
156
167
  @links_to_crawl << new_link
157
168
  end
158
169
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: krawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.8
4
+ version: 1.0.9
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-10-31 00:00:00.000000000 Z
12
+ date: 2012-11-03 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
16
- requirement: &70166215374240 !ruby/object:Gem::Requirement
16
+ requirement: &70292261892960 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,7 +21,7 @@ dependencies:
21
21
  version: 2.5.1
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70166215374240
24
+ version_requirements: *70292261892960
25
25
  description: Simple little website crawler.
26
26
  email:
27
27
  - mike@urlgonomics.com
@@ -65,4 +65,3 @@ signing_key:
65
65
  specification_version: 3
66
66
  summary: ''
67
67
  test_files: []
68
- has_rdoc: