krawler 0.0.2 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/krawl CHANGED
@@ -1,4 +1,25 @@
1
1
  #!/usr/bin/env ruby
2
2
  require 'krawler'
3
+ require 'optparse'
3
4
 
4
- Krawler::Base.new(ARGV[0] || 'http://localhost:3000/').base
5
+ options = {}
6
+ optparse = OptionParser.new do |opts|
7
+ opts.banner = "Usage: krawl [url] [options]"
8
+
9
+ opts.separator ""
10
+ opts.separator "Specific options:"
11
+
12
+ opts.on("-ex [regex]", "Exclude matching paths") do |ex|
13
+ options[:ex] = ex
14
+ end
15
+ end
16
+ optparse.parse!
17
+
18
+ if ARGV.empty? || !(ARGV[0] =~ /^http/)
19
+ puts optparse
20
+ exit(-1)
21
+ end
22
+
23
+ Krawler::Base.new(ARGV[0] || 'http://localhost:3000/', {
24
+ :exclude => options[:ex]
25
+ }).base
data/krawler.gemspec CHANGED
@@ -4,9 +4,9 @@ require File.expand_path('../lib/krawler/version', __FILE__)
4
4
  Gem::Specification.new do |gem|
5
5
  gem.authors = ["Mike Evans"]
6
6
  gem.email = ["mike@urlgonomics.com"]
7
- gem.description = %q{Simple little rake task to crawl a site.}
7
+ gem.description = %q{Simple little website crawler.}
8
8
  gem.summary = %q{}
9
- gem.homepage = ""
9
+ gem.homepage = 'https://github.com/mje113/krawl'
10
10
 
11
11
  gem.add_dependency 'mechanize', '~> 2.5.0'
12
12
  gem.rubyforge_project = 'krawler'
data/lib/krawler/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Krawler
2
- VERSION = "0.0.2"
2
+ VERSION = "0.1.0"
3
3
  end
data/lib/krawler.rb CHANGED
@@ -1,17 +1,19 @@
1
1
  require 'krawler/version'
2
2
  require 'mechanize'
3
+ require 'timeout'
3
4
 
4
5
  module Krawler
5
6
 
6
7
  class Base
7
8
 
8
- def initialize(url)
9
+ def initialize(url, options)
9
10
  @base = url
10
11
  @agent = Mechanize.new
11
12
  @links_to_crawl = [@base]
12
13
  @crawled_links = []
13
14
  @bad_links = []
14
15
  @suspect_links = []
16
+ @exclude = options[:exclude]
15
17
  end
16
18
 
17
19
  def base
@@ -41,14 +43,10 @@ module Krawler
41
43
  @bad_links << link
42
44
  return
43
45
  rescue Timeout::Error => e
44
- puts "SLOW PAGE, timeout at #{Time.now - start} seconds"
45
46
  @suspect_links << link
46
47
  return
47
- end
48
-
49
- elapsed = Time.now - start
50
- if elapsed > 7.0
51
- puts "SLOW PAGE, #{Time.now - start} seconds"
48
+ ensure
49
+ puts " [#{Time.now - start}s] #{@links_to_crawl.size} links..."
52
50
  end
53
51
 
54
52
  return if !page.respond_to?(:links)
@@ -56,6 +54,7 @@ module Krawler
56
54
  new_link = new_link.href
57
55
  if (new_link =~ /^#{Regexp.escape(@base)}/) || (new_link =~ /^\//)
58
56
  next if @crawled_links.include?(new_link)
57
+ next if @exclude && new_link =~ /#{@exclude}/
59
58
 
60
59
  @links_to_crawl << new_link
61
60
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: krawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-05-10 00:00:00.000000000 Z
12
+ date: 2012-05-14 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
16
- requirement: &70335080854540 !ruby/object:Gem::Requirement
16
+ requirement: &70120618617640 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,8 +21,8 @@ dependencies:
21
21
  version: 2.5.0
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70335080854540
25
- description: Simple little rake task to crawl a site.
24
+ version_requirements: *70120618617640
25
+ description: Simple little website crawler.
26
26
  email:
27
27
  - mike@urlgonomics.com
28
28
  executables:
@@ -40,7 +40,7 @@ files:
40
40
  - lib/krawler.rb
41
41
  - lib/krawler/version.rb
42
42
  - tasks/krawler.rake
43
- homepage: ''
43
+ homepage: https://github.com/mje113/krawl
44
44
  licenses: []
45
45
  post_install_message:
46
46
  rdoc_options: []