krawler 0.1.1 → 0.1.2

This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.
Files changed (3)
  1. data/lib/krawler/version.rb +1 -1
  2. data/lib/krawler.rb +68 -31
  3. metadata +4 -4
data/lib/krawler/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Krawler
-  VERSION = "0.1.1"
+  VERSION = "0.1.2"
 end
data/lib/krawler.rb CHANGED
@@ -2,6 +2,7 @@ require 'krawler/version'
 require 'mechanize'
 require 'timeout'
 require 'uri'
+require 'thread'
 require 'pry'
 
 module Krawler
@@ -9,67 +10,103 @@ module Krawler
   class Base
 
     def initialize(url, options)
-      url = URI(url)
-      @host = "#{url.scheme}://#{url.host}"
-      @base_path = url.path
-      @agent = Mechanize.new
-      @links_to_crawl = [url]
+      @url = URI(url)
+      @host = "#{@url.scheme}://#{@url.host}"
+      @base_path = @url.path
+      @links_to_crawl = [@url.to_s]
       @crawled_links = []
       @bad_links = []
       @suspect_links = []
       @exclude = options[:exclude]
       @restrict = options[:restrict]
+      @randomize = true
+      @max_threads = 4
+      @mutex = Mutex.new
+      @agent = Mechanize.new
     end
 
     def base
       puts "Crawling..."
 
-      while !@links_to_crawl.empty? do
-        crawl_page(@links_to_crawl.pop)
-      end
+      crawl_page(@url, @agent)
+      initialize_threads(@agent)
 
       puts "#{@crawled_links.size} total Good Links"
 
       puts "Bad Links:"
-      @bad_links.each {|link| puts link }
+      @bad_links.each { |link| puts link }
 
       puts "Suspect Links:"
-      @suspect_links.each {|link| puts link}
+      @suspect_links.each { |link| puts link }
+    end
+
+    def initialize_threads
+      threads = []
+      @max_threads.times do |i|
+        threads << Thread.new(i) do
+
+          agent = @agent.dup
+
+          while !@links_to_crawl.empty? do
+            link = @mutex.synchronize {
+              if @randomize
+                @links_to_crawl.slice!(rand(@links_to_crawl.size))
+              else
+                @links_to_crawl.pop
+              end
+            }
+
+            crawl_page(link, agent)
+          end
+        end
+      end
+
+      threads.each { |t| t.join }
     end
 
-    def crawl_page(link)
+    def crawl_page(link, agent)
       @crawled_links << link
-      puts link
+
       begin
         start = Time.now
-        page = @agent.get(link)
+        page = agent.get(link)
       rescue Mechanize::ResponseCodeError => e
-        puts e
+        @mutex.synchronize { puts e }
        @bad_links << link
        return
      rescue Timeout::Error => e
        @suspect_links << link
        return
      ensure
-        puts " [#{Time.now - start}s] #{@links_to_crawl.size} links..."
+        @mutex.synchronize {
+          puts link
+          puts " [#{Time.now - start}s] #{@links_to_crawl.size} links..."
+        }
      end
 
-      return if !page.respond_to?(:links)
-      page.links.each do |new_link|
-        begin
-          new_url = URI(new_link.href)
-          new_link = new_url.to_s
-        rescue ArgumentError # junk link
-          next
-        end
-
-        if (new_link =~ /^#{Regexp.escape(@host)}/) || (new_link =~ /^\//) # don't crawl external domains
-
-          next if @crawled_links.include?(new_link) # don't crawl what we've alread crawled
-          next if @exclude && new_link =~ /#{@exclude}/ # don't crawl excluded matched paths
-          next if @restrict && (new_url.path !~ /^#{Regexp.escape(@base_path)}/) # don't crawl outside of our restricted base path
-
-          @links_to_crawl << new_link
+      @mutex.synchronize do
+        return if !page.respond_to?(:links)
+        page.links.each do |new_link|
+          next if new_link.href.nil?
+
+          # quick scrub known issues
+          new_link = new_link.href.gsub(/ /, '%20')
+
+          begin
+            new_url = URI(new_link)
+            new_link = new_url.to_s
+          rescue ArgumentError # junk link
+            next
+          end
+
+          if (new_link =~ /^#{Regexp.escape(@host)}/) || (new_link =~ /^\//) # don't crawl external domains
+
+            next if @crawled_links.include?(new_link) || @links_to_crawl.include?(new_link) # don't crawl what we've alread crawled
+            next if @exclude && new_link =~ /#{@exclude}/ # don't crawl excluded matched paths
+            next if @restrict && (new_url.path !~ /^#{Regexp.escape(@base_path)}/) # don't crawl outside of our restricted base path
+
+            @links_to_crawl << new_link
+          end
         end
       end
    end
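
The main behavioral change in 0.1.2 is the move from a single-threaded loop to a pool of worker threads that drain the shared @links_to_crawl array under a Mutex, each thread using its own dup of the Mechanize agent and optionally taking a random entry instead of the last one. Below is a minimal standalone sketch of that pattern, not the gem's API: the queue, done, randomize, and max_threads names are illustrative, and the page fetch is simulated with a sleep.

    # Sketch of the worker-pool pattern introduced in 0.1.2: a fixed number
    # of threads pull work from a shared Array guarded by a Mutex, taking a
    # random element when randomize is set, otherwise the last one.
    require 'thread'

    queue       = (1..20).map { |i| "/page-#{i}" }  # hypothetical pending links
    done        = []
    mutex       = Mutex.new
    randomize   = true
    max_threads = 4

    threads = Array.new(max_threads) do
      Thread.new do
        loop do
          link = mutex.synchronize do
            if queue.empty?
              nil                                   # nothing left for this worker
            elsif randomize
              queue.slice!(rand(queue.size))        # random pick, as with @randomize
            else
              queue.pop
            end
          end
          break if link.nil?

          sleep(rand / 100.0)                       # stand-in for agent.get(link)
          mutex.synchronize { done << link }        # record results under the lock
        end
      end
    end

    threads.each(&:join)
    puts "#{done.size} links crawled"

In this sketch the emptiness check and the pop happen inside the same synchronize block, so two workers cannot take the same link; the gem checks emptiness outside the lock, which is why taking a random slice (or pop) under the mutex and tolerating a nil result matters.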
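The other substantive change is how discovered links are queued: nil hrefs are skipped, spaces are percent-encoded before URI parsing, and a link is kept only if it is same-host or relative, not already crawled or queued, not excluded, and (when restricted) under the base path. The following standalone predicate mirrors those rules; keep_link? and its keyword arguments are hypothetical names, not part of the gem, and start_with? stands in for the gem's anchored regex checks.

    # Hypothetical helper mirroring the link-filtering rules added to
    # crawl_page in 0.1.2. Not part of the gem's API.
    require 'uri'

    def keep_link?(href, host:, base_path:, crawled:, queued:, exclude: nil, restrict: false)
      return false if href.nil?

      href = href.gsub(' ', '%20')                  # "quick scrub known issues"
      begin
        url = URI(href)
      rescue ArgumentError                          # junk link
        return false
      end
      link = url.to_s

      return false unless link.start_with?(host, '/')                  # external domain
      return false if crawled.include?(link) || queued.include?(link)  # already seen or queued
      return false if exclude && link =~ /#{exclude}/                  # excluded path
      return false if restrict && url.path !~ /^#{Regexp.escape(base_path)}/

      true
    end

    # Example:
    puts keep_link?('/about us', host: 'http://example.com', base_path: '/',
                    crawled: [], queued: [])              # => true
    puts keep_link?('http://other.example/x', host: 'http://example.com', base_path: '/',
                    crawled: [], queued: [])              # => false
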
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: krawler
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.2
   prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-05-15 00:00:00.000000000 Z
+date: 2012-05-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
-  requirement: &70155565557800 !ruby/object:Gem::Requirement
+  requirement: &70168780830200 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -21,7 +21,7 @@ dependencies:
         version: 2.5.0
   type: :runtime
   prerelease: false
-  version_requirements: *70155565557800
+  version_requirements: *70168780830200
 description: Simple little website crawler.
 email:
 - mike@urlgonomics.com