krawler 1.0.9 → 1.0.10

Sign up to get free protection for your applications and to get access to all the features.
data/lib/krawler.rb CHANGED
@@ -1,13 +1,17 @@
1
1
  require 'krawler/version'
2
+ require 'krawler/authentication'
2
3
  require 'mechanize'
3
4
  require 'timeout'
4
5
  require 'uri'
5
6
  require 'thread'
7
+ require 'pry'
6
8
 
7
9
  module Krawler
8
10
 
9
11
  class Base
10
12
 
13
+ include Authentication
14
+
11
15
  def initialize(url, options)
12
16
  @url = URI(url)
13
17
  @host = "#{@url.scheme}://#{@url.host}"
@@ -28,7 +32,7 @@ module Krawler
28
32
  @agent = Mechanize.new
29
33
  @agent.user_agent = 'Krawler'
30
34
  @agent.ssl_version = 'SSLv3'
31
- @headers = { 'Accept-Encoding' => 'gzip, deflate' }
35
+ @headers = { 'Accept-Encoding' => 'gzip, deflate' }
32
36
  @headers['Cache-Control'] = 'no-cache' if options[:no_cache]
33
37
  end
34
38
 
@@ -53,33 +57,6 @@ module Krawler
53
57
  @suspect_links.each { |link| puts link }
54
58
  end
55
59
 
56
- def authenticate(agent, user, password, login_url)
57
- agent.get(login_url) do |page|
58
- login_form = page.form
59
-
60
- login_form['user[email]'] = user
61
- login_form['user[password]'] = password
62
-
63
- agent.submit(login_form, login_form.buttons.first)
64
- end
65
- end
66
-
67
- def use_authentication?
68
- !@username.nil? || !@password.nil? || !@login_url.nil?
69
- end
70
-
71
- def validate_authentication_options
72
- any_nil = [@login_url, @username, @password].any? {|v| v.nil?}
73
- all_nil = [@login_url, @username, @password].all? {|v| v.nil?}
74
- if (any_nil && !all_nil)
75
- puts "You must either provide all authentication options" +
76
- " (username, password, and loginurl) or provide none."
77
- return false
78
- else
79
- return true
80
- end
81
- end
82
-
83
60
  def initialize_threads(agent)
84
61
  threads = []
85
62
  @threads.times do |i|
@@ -95,7 +72,6 @@ module Krawler
95
72
  @links_to_crawl.pop
96
73
  end
97
74
  }
98
-
99
75
  crawl_page(link, agent)
100
76
  end
101
77
  end
@@ -135,6 +111,9 @@ module Krawler
135
111
 
136
112
  @mutex.synchronize do
137
113
  return if !page.respond_to?(:links)
114
+
115
+ recache_invalid_results(page)
116
+
138
117
  page.links.each do |new_link|
139
118
  next if new_link.href.nil?
140
119
  next if new_link.rel.include? 'nofollow'
@@ -169,5 +148,28 @@ module Krawler
169
148
  end
170
149
  end
171
150
  end
151
+
152
+ protected
153
+
154
# Parses a URL query string such as "a=1&b=2" into a Hash mapping each
# decoded parameter name to its decoded value.
#
# Each pair is split on the first "=" only and decoded *after* splitting, so
# percent-encoded "&" or "=" characters inside a value survive intact. A
# parameter without a value ("flag" or "flag=") maps to an empty string, and
# empty segments ("a=1&&b=2") are ignored.
#
# @param params [String, nil] raw query string, without the leading "?"
# @return [Hash{String => String}] decoded names mapped to decoded values;
#   when a name repeats, the last occurrence wins
def params_to_hash(params)
  params.to_s.split('&').each_with_object({}) do |pair, parsed|
    next if pair.empty?

    name, value = pair.split('=', 2).map { |part| decode_query_component(part) }
    parsed[name] = value || ''
  end
end

# Decodes one form-encoded query component, returning the text unchanged when
# it contains a malformed percent escape (crawled links are not always valid).
def decode_query_component(text)
  URI.decode_www_form_component(text)
rescue ArgumentError
  text
end
158
+
159
# Serializes a Hash into a URL query string ("a=1&b=2").
#
# Names and values are form-encoded so that spaces, "&", "=" and other
# reserved characters cannot break the resulting query. The rendered
# "name=value" pairs are sorted so the output is the same regardless of the
# hash's insertion order.
#
# @param hash [Hash] parameter names mapped to values; both are converted
#   with #to_s before encoding
# @return [String] the encoded query string, "" for an empty hash
def hash_to_params(hash)
  pairs = hash.map do |name, value|
    "#{URI.encode_www_form_component(name)}=#{URI.encode_www_form_component(value)}"
  end
  pairs.sort.join('&')
end
162
+
163
# Re-queues links that the page marks as invalid, forcing a cache bypass.
#
# For every element matching "tr td i.icon-remove", the first <a> found under
# the element's grandparent is taken, "cache=false" is set in its query
# string, and the resulting URL is appended to @links_to_crawl.
#
# Markers without a link, links without an href, and hrefs that cannot be
# parsed as a URI are skipped, so one bad link does not abort the crawl.
#
# @param page [#search] parsed page; NOTE(review): presumably a
#   Mechanize::Page - confirm against the caller
def recache_invalid_results(page)
  page.search('tr td i.icon-remove').each do |invalid|
    anchor = invalid.parent.parent.css('a').first
    href = anchor && anchor['href']
    next if href.nil?

    begin
      uri = URI(href)
      query = params_to_hash(uri.query || '')
      query['cache'] = 'false'
      uri.query = hash_to_params(query)
    rescue URI::Error
      # The href (or its rebuilt query) is not a valid URI; nothing to re-queue.
      next
    end

    @links_to_crawl << uri.to_s
  end
end
172
174
  end
173
175
  end
@@ -0,0 +1,33 @@
1
module Krawler

  # Form-login helpers for the crawler. The including object is expected to
  # hold its credentials in @username, @password and @login_url.
  module Authentication

    # Fetches +login_url+ with +agent+, fills the credentials into the form
    # returned by +page.form+, and submits it with the form's first button.
    #
    # @param agent [#get, #submit] HTTP agent; NOTE(review): presumably a
    #   Mechanize instance - confirm against the caller
    # @param user [String] value for the "user[email]" field
    # @param password [String] value for the "user[password]" field
    # @param login_url [String] address of the page holding the login form
    # @return whatever +agent.get+ returns
    # @raise [ArgumentError] when the fetched page has no form
    def authenticate(agent, user, password, login_url)
      agent.get(login_url) do |page|
        login_form = page.form
        raise ArgumentError, "No login form found at #{login_url}" if login_form.nil?

        login_form['user[email]']    = user
        login_form['user[password]'] = password

        agent.submit(login_form, login_form.buttons.first)
      end
    end

    # @return [Boolean] true when at least one authentication option is set
    def use_authentication?
      !(@username.nil? && @password.nil? && @login_url.nil?)
    end

    # Checks that the authentication options were given all together or not
    # at all. Prints an explanation to standard output when only some are set.
    #
    # @return [Boolean] true when all or none of the options are set
    def validate_authentication_options
      missing = [@login_url, @username, @password].count(&:nil?)
      return true if missing.zero? || missing == 3

      puts 'You must either provide all authentication options' \
           ' (username, password, and loginurl) or provide none.'
      false
    end
  end

end
@@ -1,3 +1,3 @@
1
1
  module Krawler
2
- VERSION = '1.0.9'
2
+ VERSION = '1.0.10'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: krawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.9
4
+ version: 1.0.10
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-11-03 00:00:00.000000000 Z
12
+ date: 2012-11-28 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
16
- requirement: &70292261892960 !ruby/object:Gem::Requirement
16
+ requirement: !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,7 +21,12 @@ dependencies:
21
21
  version: 2.5.1
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70292261892960
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: 2.5.1
25
30
  description: Simple little website crawler.
26
31
  email:
27
32
  - mike@urlgonomics.com
@@ -38,6 +43,7 @@ files:
38
43
  - bin/krawl
39
44
  - krawler.gemspec
40
45
  - lib/krawler.rb
46
+ - lib/krawler/authentication.rb
41
47
  - lib/krawler/version.rb
42
48
  - tasks/krawler.rake
43
49
  homepage: https://github.com/mje113/krawl
@@ -60,7 +66,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
60
66
  version: '0'
61
67
  requirements: []
62
68
  rubyforge_project: krawler
63
- rubygems_version: 1.8.17
69
+ rubygems_version: 1.8.24
64
70
  signing_key:
65
71
  specification_version: 3
66
72
  summary: ''