krawler 0.1.1 → 0.1.2
- data/lib/krawler/version.rb +1 -1
- data/lib/krawler.rb +68 -31
- metadata +4 -4
data/lib/krawler/version.rb
CHANGED
data/lib/krawler.rb
CHANGED
@@ -2,6 +2,7 @@ require 'krawler/version'
 require 'mechanize'
 require 'timeout'
 require 'uri'
+require 'thread'
 require 'pry'

 module Krawler
@@ -9,67 +10,103 @@ module Krawler
   class Base

     def initialize(url, options)
-      url
-      @host = "#{url.scheme}://#{url.host}"
-      @base_path = url.path
-      @
-      @links_to_crawl = [url]
+      @url = URI(url)
+      @host = "#{@url.scheme}://#{@url.host}"
+      @base_path = @url.path
+      @links_to_crawl = [@url.to_s]
       @crawled_links = []
       @bad_links = []
       @suspect_links = []
       @exclude = options[:exclude]
       @restrict = options[:restrict]
+      @randomize = true
+      @max_threads = 4
+      @mutex = Mutex.new
+      @agent = Mechanize.new
     end

     def base
       puts "Crawling..."

-
-
-      end
+      crawl_page(@url, @agent)
+      initialize_threads(@agent)

       puts "#{@crawled_links.size} total Good Links"

       puts "Bad Links:"
-      @bad_links.each {|link| puts link }
+      @bad_links.each { |link| puts link }

       puts "Suspect Links:"
-      @suspect_links.each {|link| puts link}
+      @suspect_links.each { |link| puts link }
+    end
+
+    def initialize_threads
+      threads = []
+      @max_threads.times do |i|
+        threads << Thread.new(i) do
+
+          agent = @agent.dup
+
+          while !@links_to_crawl.empty? do
+            link = @mutex.synchronize {
+              if @randomize
+                @links_to_crawl.slice!(rand(@links_to_crawl.size))
+              else
+                @links_to_crawl.pop
+              end
+            }
+
+            crawl_page(link, agent)
+          end
+        end
+      end
+
+      threads.each { |t| t.join }
     end

-    def crawl_page(link)
+    def crawl_page(link, agent)
       @crawled_links << link
-
+
       begin
         start = Time.now
-        page =
+        page = agent.get(link)
       rescue Mechanize::ResponseCodeError => e
-        puts e
+        @mutex.synchronize { puts e }
         @bad_links << link
         return
       rescue Timeout::Error => e
         @suspect_links << link
         return
       ensure
-
+        @mutex.synchronize {
+          puts link
+          puts " [#{Time.now - start}s] #{@links_to_crawl.size} links..."
+        }
       end

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-      @
+      @mutex.synchronize do
+        return if !page.respond_to?(:links)
+        page.links.each do |new_link|
+          next if new_link.href.nil?
+
+          # quick scrub known issues
+          new_link = new_link.href.gsub(/ /, '%20')
+
+          begin
+            new_url = URI(new_link)
+            new_link = new_url.to_s
+          rescue ArgumentError # junk link
+            next
+          end
+
+          if (new_link =~ /^#{Regexp.escape(@host)}/) || (new_link =~ /^\//) # don't crawl external domains
+
+            next if @crawled_links.include?(new_link) || @links_to_crawl.include?(new_link) # don't crawl what we've alread crawled
+            next if @exclude && new_link =~ /#{@exclude}/ # don't crawl excluded matched paths
+            next if @restrict && (new_url.path !~ /^#{Regexp.escape(@base_path)}/) # don't crawl outside of our restricted base path
+
+            @links_to_crawl << new_link
+          end
         end
       end
     end
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: krawler
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.2
 prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-05-
+date: 2012-05-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
-  requirement: &
+  requirement: &70168780830200 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -21,7 +21,7 @@ dependencies:
         version: 2.5.0
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70168780830200
 description: Simple little website crawler.
 email:
 - mike@urlgonomics.com