krawler 0.1.0 → 0.1.1

Files changed (4)
  1. data/bin/krawl +6 -1
  2. data/lib/krawler/version.rb +1 -1
  3. data/lib/krawler.rb +23 -10
  4. metadata +4 -4
data/bin/krawl CHANGED
@@ -12,6 +12,10 @@ optparse = OptionParser.new do |opts|
   opts.on("-ex [regex]", "Exclude matching paths") do |ex|
     options[:ex] = ex
   end
+
+  opts.on("-r", "Restrict to sub paths") do |r|
+    options[:r] = true
+  end
 end
 optparse.parse!
 
@@ -21,5 +25,6 @@ if ARGV.empty? || !(ARGV[0] =~ /^http/)
 end
 
 Krawler::Base.new(ARGV[0] || 'http://localhost:3000/', {
-  :exclude => options[:ex]
+  :exclude => options[:ex],
+  :restrict => options[:r]
 }).base
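
For context, the new -r flag simply threads through to Krawler::Base as the :restrict option. A minimal sketch of the equivalent direct invocation in 0.1.1 follows; the URL and exclude pattern are hypothetical, roughly what "krawl http://example.com/blog -ex logout -r" would set up:

require 'krawler'

# Crawl starting at /blog, skipping links matching /logout/ and, with
# :restrict, skipping any link whose path leaves /blog.
Krawler::Base.new('http://example.com/blog', {
  :exclude  => 'logout',
  :restrict => true
}).base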
data/lib/krawler/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Krawler
-  VERSION = "0.1.0"
+  VERSION = "0.1.1"
 end
data/lib/krawler.rb CHANGED
@@ -1,24 +1,29 @@
 require 'krawler/version'
 require 'mechanize'
 require 'timeout'
+require 'uri'
+require 'pry'
 
 module Krawler
 
   class Base
 
     def initialize(url, options)
-      @base = url
-      @agent = Mechanize.new
-      @links_to_crawl = [@base]
+      url = URI(url)
+      @host = "#{url.scheme}://#{url.host}"
+      @base_path = url.path
+      @agent = Mechanize.new
+      @links_to_crawl = [url]
       @crawled_links = []
       @bad_links = []
       @suspect_links = []
       @exclude = options[:exclude]
+      @restrict = options[:restrict]
     end
 
     def base
-      puts "Crawling #{@base}"
-
+      puts "Crawling..."
+
       while !@links_to_crawl.empty? do
         crawl_page(@links_to_crawl.pop)
       end
@@ -51,11 +56,19 @@ module Krawler
 
       return if !page.respond_to?(:links)
       page.links.each do |new_link|
-        new_link = new_link.href
-        if (new_link =~ /^#{Regexp.escape(@base)}/) || (new_link =~ /^\//)
-          next if @crawled_links.include?(new_link)
-          next if @exclude && new_link =~ /#{@exclude}/
-
+        begin
+          new_url = URI(new_link.href)
+          new_link = new_url.to_s
+        rescue ArgumentError # junk link
+          next
+        end
+
+        if (new_link =~ /^#{Regexp.escape(@host)}/) || (new_link =~ /^\//) # don't crawl external domains
+
+          next if @crawled_links.include?(new_link) # don't crawl what we've already crawled
+          next if @exclude && new_link =~ /#{@exclude}/ # don't crawl excluded matched paths
+          next if @restrict && (new_url.path !~ /^#{Regexp.escape(@base_path)}/) # don't crawl outside of our restricted base path
+
           @links_to_crawl << new_link
         end
       end
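
The :restrict guard above anchors each link's path against the path of the starting URL. A minimal standalone sketch of that check, using a hypothetical base URL and sample paths:

require 'uri'

# A link survives the :restrict check only when its path begins with the
# base path of the URL the crawl started from.
base_path = URI('http://example.com/blog').path  # => "/blog"

['/blog/post-1', '/blog', '/about'].each do |path|
  skipped = path !~ /^#{Regexp.escape(base_path)}/
  puts "#{path} => #{skipped ? 'skip' : 'crawl'}"
end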
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: krawler
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-05-14 00:00:00.000000000 Z
+date: 2012-05-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
-  requirement: &70120618617640 !ruby/object:Gem::Requirement
+  requirement: &70155565557800 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -21,7 +21,7 @@ dependencies:
         version: 2.5.0
   type: :runtime
   prerelease: false
-  version_requirements: *70120618617640
+  version_requirements: *70155565557800
 description: Simple little website crawler.
 email:
 - mike@urlgonomics.com