krawler 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4)
  1. data/bin/krawl +6 -1
  2. data/lib/krawler/version.rb +1 -1
  3. data/lib/krawler.rb +23 -10
  4. metadata +4 -4
data/bin/krawl CHANGED
@@ -12,6 +12,10 @@ optparse = OptionParser.new do |opts|
12
12
  opts.on("-ex [regex]", "Exclude matching paths") do |ex|
13
13
  options[:ex] = ex
14
14
  end
15
+
16
+ opts.on("-r", "Restrict to sub paths") do |r|
17
+ options[:r] = true
18
+ end
15
19
  end
16
20
  optparse.parse!
17
21
 
@@ -21,5 +25,6 @@ if ARGV.empty? || !(ARGV[0] =~ /^http/)
21
25
  end
22
26
 
23
27
  Krawler::Base.new(ARGV[0] || 'http://localhost:3000/', {
24
- :exclude => options[:ex]
28
+ :exclude => options[:ex],
29
+ :restrict => options[:r]
25
30
  }).base
@@ -1,3 +1,3 @@
1
1
  module Krawler
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.1"
3
3
  end
data/lib/krawler.rb CHANGED
@@ -1,24 +1,29 @@
1
1
  require 'krawler/version'
2
2
  require 'mechanize'
3
3
  require 'timeout'
4
+ require 'uri'
5
+ require 'pry'
4
6
 
5
7
  module Krawler
6
8
 
7
9
  class Base
8
10
 
9
11
  def initialize(url, options)
10
- @base = url
11
- @agent = Mechanize.new
12
- @links_to_crawl = [@base]
12
+ url = URI(url)
13
+ @host = "#{url.scheme}://#{url.host}"
14
+ @base_path = url.path
15
+ @agent = Mechanize.new
16
+ @links_to_crawl = [url]
13
17
  @crawled_links = []
14
18
  @bad_links = []
15
19
  @suspect_links = []
16
20
  @exclude = options[:exclude]
21
+ @restrict = options[:restrict]
17
22
  end
18
23
 
19
24
  def base
20
- puts "Crawling #{@base}"
21
-
25
+ puts "Crawling..."
26
+
22
27
  while !@links_to_crawl.empty? do
23
28
  crawl_page(@links_to_crawl.pop)
24
29
  end
@@ -51,11 +56,19 @@ module Krawler
51
56
 
52
57
  return if !page.respond_to?(:links)
53
58
  page.links.each do |new_link|
54
- new_link = new_link.href
55
- if (new_link =~ /^#{Regexp.escape(@base)}/) || (new_link =~ /^\//)
56
- next if @crawled_links.include?(new_link)
57
- next if @exclude && new_link =~ /#{@exclude}/
58
-
59
+ begin
60
+ new_url = URI(new_link.href)
61
+ new_link = new_url.to_s
62
+ rescue ArgumentError # junk link
63
+ next
64
+ end
65
+
66
+ if (new_link =~ /^#{Regexp.escape(@host)}/) || (new_link =~ /^\//) # don't crawl external domains
67
+
68
+ next if @crawled_links.include?(new_link) # don't crawl what we've alread crawled
69
+ next if @exclude && new_link =~ /#{@exclude}/ # don't crawl excluded matched paths
70
+ next if @restrict && (new_url.path !~ /^#{Regexp.escape(@base_path)}/) # don't crawl outside of our restricted base path
71
+
59
72
  @links_to_crawl << new_link
60
73
  end
61
74
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: krawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-05-14 00:00:00.000000000 Z
12
+ date: 2012-05-15 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
16
- requirement: &70120618617640 !ruby/object:Gem::Requirement
16
+ requirement: &70155565557800 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,7 +21,7 @@ dependencies:
21
21
  version: 2.5.0
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70120618617640
24
+ version_requirements: *70155565557800
25
25
  description: Simple little website crawler.
26
26
  email:
27
27
  - mike@urlgonomics.com