krawler 0.0.2 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/krawl +22 -1
- data/krawler.gemspec +2 -2
- data/lib/krawler/version.rb +1 -1
- data/lib/krawler.rb +6 -7
- metadata +6 -6
data/bin/krawl
CHANGED
@@ -1,4 +1,25 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
require 'krawler'
|
3
|
+
require 'optparse'
|
3
4
|
|
4
|
-
|
5
|
+
options = {}
|
6
|
+
optparse = OptionParser.new do |opts|
|
7
|
+
opts.banner = "Usage: krawl [url] [options]"
|
8
|
+
|
9
|
+
opts.separator ""
|
10
|
+
opts.separator "Specific options:"
|
11
|
+
|
12
|
+
opts.on("-ex [regex]", "Exclude matching paths") do |ex|
|
13
|
+
options[:ex] = ex
|
14
|
+
end
|
15
|
+
end
|
16
|
+
optparse.parse!
|
17
|
+
|
18
|
+
if ARGV.empty? || !(ARGV[0] =~ /^http/)
|
19
|
+
puts optparse
|
20
|
+
exit(-1)
|
21
|
+
end
|
22
|
+
|
23
|
+
Krawler::Base.new(ARGV[0] || 'http://localhost:3000/', {
|
24
|
+
:exclude => options[:ex]
|
25
|
+
}).base
|
data/krawler.gemspec
CHANGED
@@ -4,9 +4,9 @@ require File.expand_path('../lib/krawler/version', __FILE__)
|
|
4
4
|
Gem::Specification.new do |gem|
|
5
5
|
gem.authors = ["Mike Evans"]
|
6
6
|
gem.email = ["mike@urlgonomics.com"]
|
7
|
-
gem.description = %q{Simple little
|
7
|
+
gem.description = %q{Simple little website crawler.}
|
8
8
|
gem.summary = %q{}
|
9
|
-
gem.homepage =
|
9
|
+
gem.homepage = 'https://github.com/mje113/krawl'
|
10
10
|
|
11
11
|
gem.add_dependency 'mechanize', '~> 2.5.0'
|
12
12
|
gem.rubyforge_project = 'krawler'
|
data/lib/krawler/version.rb
CHANGED
data/lib/krawler.rb
CHANGED
@@ -1,17 +1,19 @@
|
|
1
1
|
require 'krawler/version'
|
2
2
|
require 'mechanize'
|
3
|
+
require 'timeout'
|
3
4
|
|
4
5
|
module Krawler
|
5
6
|
|
6
7
|
class Base
|
7
8
|
|
8
|
-
def initialize(url)
|
9
|
+
def initialize(url, options)
|
9
10
|
@base = url
|
10
11
|
@agent = Mechanize.new
|
11
12
|
@links_to_crawl = [@base]
|
12
13
|
@crawled_links = []
|
13
14
|
@bad_links = []
|
14
15
|
@suspect_links = []
|
16
|
+
@exclude = options[:exclude]
|
15
17
|
end
|
16
18
|
|
17
19
|
def base
|
@@ -41,14 +43,10 @@ module Krawler
|
|
41
43
|
@bad_links << link
|
42
44
|
return
|
43
45
|
rescue Timeout::Error => e
|
44
|
-
puts "SLOW PAGE, timeout at #{Time.now - start} seconds"
|
45
46
|
@suspect_links << link
|
46
47
|
return
|
47
|
-
|
48
|
-
|
49
|
-
elapsed = Time.now - start
|
50
|
-
if elapsed > 7.0
|
51
|
-
puts "SLOW PAGE, #{Time.now - start} seconds"
|
48
|
+
ensure
|
49
|
+
puts " [#{Time.now - start}s] #{@links_to_crawl.size} links..."
|
52
50
|
end
|
53
51
|
|
54
52
|
return if !page.respond_to?(:links)
|
@@ -56,6 +54,7 @@ module Krawler
|
|
56
54
|
new_link = new_link.href
|
57
55
|
if (new_link =~ /^#{Regexp.escape(@base)}/) || (new_link =~ /^\//)
|
58
56
|
next if @crawled_links.include?(new_link)
|
57
|
+
next if @exclude && new_link =~ /#{@exclude}/
|
59
58
|
|
60
59
|
@links_to_crawl << new_link
|
61
60
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: krawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.1.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-05-
|
12
|
+
date: 2012-05-14 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|
16
|
-
requirement: &
|
16
|
+
requirement: &70120618617640 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,8 +21,8 @@ dependencies:
|
|
21
21
|
version: 2.5.0
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
25
|
-
description: Simple little
|
24
|
+
version_requirements: *70120618617640
|
25
|
+
description: Simple little website crawler.
|
26
26
|
email:
|
27
27
|
- mike@urlgonomics.com
|
28
28
|
executables:
|
@@ -40,7 +40,7 @@ files:
|
|
40
40
|
- lib/krawler.rb
|
41
41
|
- lib/krawler/version.rb
|
42
42
|
- tasks/krawler.rake
|
43
|
-
homepage:
|
43
|
+
homepage: https://github.com/mje113/krawl
|
44
44
|
licenses: []
|
45
45
|
post_install_message:
|
46
46
|
rdoc_options: []
|