krawler 0.0.2 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/bin/krawl CHANGED
@@ -1,4 +1,25 @@
1
1
  #!/usr/bin/env ruby
2
2
  require 'krawler'
3
+ require 'optparse'
3
4
 
4
- Krawler::Base.new(ARGV[0] || 'http://localhost:3000/').base
5
+ options = {}
6
+ optparse = OptionParser.new do |opts|
7
+ opts.banner = "Usage: krawl [url] [options]"
8
+
9
+ opts.separator ""
10
+ opts.separator "Specific options:"
11
+
12
+ opts.on("-ex [regex]", "Exclude matching paths") do |ex|
13
+ options[:ex] = ex
14
+ end
15
+ end
16
+ optparse.parse!
17
+
18
+ if ARGV.empty? || !(ARGV[0] =~ /^http/)
19
+ puts optparse
20
+ exit(-1)
21
+ end
22
+
23
+ Krawler::Base.new(ARGV[0] || 'http://localhost:3000/', {
24
+ :exclude => options[:ex]
25
+ }).base
data/krawler.gemspec CHANGED
@@ -4,9 +4,9 @@ require File.expand_path('../lib/krawler/version', __FILE__)
4
4
  Gem::Specification.new do |gem|
5
5
  gem.authors = ["Mike Evans"]
6
6
  gem.email = ["mike@urlgonomics.com"]
7
- gem.description = %q{Simple little rake task to crawl a site.}
7
+ gem.description = %q{Simple little website crawler.}
8
8
  gem.summary = %q{}
9
- gem.homepage = ""
9
+ gem.homepage = 'https://github.com/mje113/krawl'
10
10
 
11
11
  gem.add_dependency 'mechanize', '~> 2.5.0'
12
12
  gem.rubyforge_project = 'krawler'
data/lib/krawler/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Krawler
2
- VERSION = "0.0.2"
2
+ VERSION = "0.1.0"
3
3
  end
data/lib/krawler.rb CHANGED
@@ -1,17 +1,19 @@
1
1
  require 'krawler/version'
2
2
  require 'mechanize'
3
+ require 'timeout'
3
4
 
4
5
  module Krawler
5
6
 
6
7
  class Base
7
8
 
8
- def initialize(url)
9
+ def initialize(url, options)
9
10
  @base = url
10
11
  @agent = Mechanize.new
11
12
  @links_to_crawl = [@base]
12
13
  @crawled_links = []
13
14
  @bad_links = []
14
15
  @suspect_links = []
16
+ @exclude = options[:exclude]
15
17
  end
16
18
 
17
19
  def base
@@ -41,14 +43,10 @@ module Krawler
41
43
  @bad_links << link
42
44
  return
43
45
  rescue Timeout::Error => e
44
- puts "SLOW PAGE, timeout at #{Time.now - start} seconds"
45
46
  @suspect_links << link
46
47
  return
47
- end
48
-
49
- elapsed = Time.now - start
50
- if elapsed > 7.0
51
- puts "SLOW PAGE, #{Time.now - start} seconds"
48
+ ensure
49
+ puts " [#{Time.now - start}s] #{@links_to_crawl.size} links..."
52
50
  end
53
51
 
54
52
  return if !page.respond_to?(:links)
@@ -56,6 +54,7 @@ module Krawler
56
54
  new_link = new_link.href
57
55
  if (new_link =~ /^#{Regexp.escape(@base)}/) || (new_link =~ /^\//)
58
56
  next if @crawled_links.include?(new_link)
57
+ next if @exclude && new_link =~ /#{@exclude}/
59
58
 
60
59
  @links_to_crawl << new_link
61
60
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: krawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-05-10 00:00:00.000000000 Z
12
+ date: 2012-05-14 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
16
- requirement: &70335080854540 !ruby/object:Gem::Requirement
16
+ requirement: &70120618617640 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,8 +21,8 @@ dependencies:
21
21
  version: 2.5.0
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70335080854540
25
- description: Simple little rake task to crawl a site.
24
+ version_requirements: *70120618617640
25
+ description: Simple little website crawler.
26
26
  email:
27
27
  - mike@urlgonomics.com
28
28
  executables:
@@ -40,7 +40,7 @@ files:
40
40
  - lib/krawler.rb
41
41
  - lib/krawler/version.rb
42
42
  - tasks/krawler.rake
43
- homepage: ''
43
+ homepage: https://github.com/mje113/krawl
44
44
  licenses: []
45
45
  post_install_message:
46
46
  rdoc_options: []