krawler 0.1.1 → 0.1.2

Files changed (3)
  1. data/lib/krawler/version.rb +1 -1
  2. data/lib/krawler.rb +68 -31
  3. metadata +4 -4
data/lib/krawler/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Krawler
-  VERSION = "0.1.1"
+  VERSION = "0.1.2"
 end
data/lib/krawler.rb CHANGED
@@ -2,6 +2,7 @@ require 'krawler/version'
 require 'mechanize'
 require 'timeout'
 require 'uri'
+require 'thread'
 require 'pry'
 
 module Krawler
@@ -9,67 +10,103 @@ module Krawler
   class Base
 
     def initialize(url, options)
-      url = URI(url)
-      @host = "#{url.scheme}://#{url.host}"
-      @base_path = url.path
-      @agent = Mechanize.new
-      @links_to_crawl = [url]
+      @url = URI(url)
+      @host = "#{@url.scheme}://#{@url.host}"
+      @base_path = @url.path
+      @links_to_crawl = [@url.to_s]
       @crawled_links = []
       @bad_links = []
       @suspect_links = []
       @exclude = options[:exclude]
       @restrict = options[:restrict]
+      @randomize = true
+      @max_threads = 4
+      @mutex = Mutex.new
+      @agent = Mechanize.new
     end
 
     def base
       puts "Crawling..."
 
-      while !@links_to_crawl.empty? do
-        crawl_page(@links_to_crawl.pop)
-      end
+      crawl_page(@url, @agent)
+      initialize_threads(@agent)
 
       puts "#{@crawled_links.size} total Good Links"
 
       puts "Bad Links:"
-      @bad_links.each {|link| puts link }
+      @bad_links.each { |link| puts link }
 
       puts "Suspect Links:"
-      @suspect_links.each {|link| puts link}
+      @suspect_links.each { |link| puts link }
+    end
+
+    def initialize_threads
+      threads = []
+      @max_threads.times do |i|
+        threads << Thread.new(i) do
+
+          agent = @agent.dup
+
+          while !@links_to_crawl.empty? do
+            link = @mutex.synchronize {
+              if @randomize
+                @links_to_crawl.slice!(rand(@links_to_crawl.size))
+              else
+                @links_to_crawl.pop
+              end
+            }
+
+            crawl_page(link, agent)
+          end
+        end
+      end
+
+      threads.each { |t| t.join }
     end
 
-    def crawl_page(link)
+    def crawl_page(link, agent)
       @crawled_links << link
-      puts link
+
       begin
         start = Time.now
-        page = @agent.get(link)
+        page = agent.get(link)
       rescue Mechanize::ResponseCodeError => e
-        puts e
+        @mutex.synchronize { puts e }
         @bad_links << link
         return
       rescue Timeout::Error => e
         @suspect_links << link
         return
       ensure
-        puts " [#{Time.now - start}s] #{@links_to_crawl.size} links..."
+        @mutex.synchronize {
+          puts link
+          puts " [#{Time.now - start}s] #{@links_to_crawl.size} links..."
+        }
       end
 
-      return if !page.respond_to?(:links)
-      page.links.each do |new_link|
-        begin
-          new_url = URI(new_link.href)
-          new_link = new_url.to_s
-        rescue ArgumentError # junk link
-          next
-        end
-
-        if (new_link =~ /^#{Regexp.escape(@host)}/) || (new_link =~ /^\//) # don't crawl external domains
-
-          next if @crawled_links.include?(new_link) # don't crawl what we've already crawled
-          next if @exclude && new_link =~ /#{@exclude}/ # don't crawl excluded matched paths
-          next if @restrict && (new_url.path !~ /^#{Regexp.escape(@base_path)}/) # don't crawl outside of our restricted base path
-
-          @links_to_crawl << new_link
+      @mutex.synchronize do
+        return if !page.respond_to?(:links)
+        page.links.each do |new_link|
+          next if new_link.href.nil?
+
+          # quick scrub known issues
+          new_link = new_link.href.gsub(/ /, '%20')
+
+          begin
+            new_url = URI(new_link)
+            new_link = new_url.to_s
+          rescue ArgumentError # junk link
+            next
+          end
+
+          if (new_link =~ /^#{Regexp.escape(@host)}/) || (new_link =~ /^\//) # don't crawl external domains
+
+            next if @crawled_links.include?(new_link) || @links_to_crawl.include?(new_link) # don't crawl what we've already crawled
+            next if @exclude && new_link =~ /#{@exclude}/ # don't crawl excluded matched paths
+            next if @restrict && (new_url.path !~ /^#{Regexp.escape(@base_path)}/) # don't crawl outside of our restricted base path
+
+            @links_to_crawl << new_link
+          end
         end
       end
     end
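
For reference, a minimal usage sketch of the class changed above, based only on the interface visible in this diff: the constructor takes a URL string plus an options hash read for :exclude and :restrict, and #base starts the crawl. The example URL and option values are illustrative assumptions, not taken from the gem's documentation.

require 'krawler'

# Hypothetical site and options: :exclude is interpolated into a regex and
# matched against each discovered link; :restrict keeps the crawl under the
# start URL's base path.
crawler = Krawler::Base.new('http://example.com/blog', { exclude: 'logout', restrict: true })

# Crawls the start URL first, then fans out over the @max_threads worker
# threads added in this release, which pull links from the shared queue
# under @mutex.
crawler.base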
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: krawler
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.2
   prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-05-15 00:00:00.000000000 Z
+date: 2012-05-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
-  requirement: &70155565557800 !ruby/object:Gem::Requirement
+  requirement: &70168780830200 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -21,7 +21,7 @@ dependencies:
         version: 2.5.0
   type: :runtime
   prerelease: false
-  version_requirements: *70155565557800
+  version_requirements: *70168780830200
 description: Simple little website crawler.
 email:
 - mike@urlgonomics.com