krawler 1.0.9 → 1.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/krawler.rb +31 -29
- data/lib/krawler/authentication.rb +33 -0
- data/lib/krawler/version.rb +1 -1
- metadata +11 -5
data/lib/krawler.rb
CHANGED
@@ -1,13 +1,17 @@
|
|
1
1
|
require 'krawler/version'
|
2
|
+
require 'krawler/authentication'
|
2
3
|
require 'mechanize'
|
3
4
|
require 'timeout'
|
4
5
|
require 'uri'
|
5
6
|
require 'thread'
|
7
|
+
require 'pry'
|
6
8
|
|
7
9
|
module Krawler
|
8
10
|
|
9
11
|
class Base
|
10
12
|
|
13
|
+
include Authentication
|
14
|
+
|
11
15
|
def initialize(url, options)
|
12
16
|
@url = URI(url)
|
13
17
|
@host = "#{@url.scheme}://#{@url.host}"
|
@@ -28,7 +32,7 @@ module Krawler
|
|
28
32
|
@agent = Mechanize.new
|
29
33
|
@agent.user_agent = 'Krawler'
|
30
34
|
@agent.ssl_version = 'SSLv3'
|
31
|
-
@headers
|
35
|
+
@headers = { 'Accept-Encoding' => 'gzip, deflate' }
|
32
36
|
@headers['Cache-Control'] = 'no-cache' if options[:no_cache]
|
33
37
|
end
|
34
38
|
|
@@ -53,33 +57,6 @@ module Krawler
|
|
53
57
|
@suspect_links.each { |link| puts link }
|
54
58
|
end
|
55
59
|
|
56
|
-
def authenticate(agent, user, password, login_url)
|
57
|
-
agent.get(login_url) do |page|
|
58
|
-
login_form = page.form
|
59
|
-
|
60
|
-
login_form['user[email]'] = user
|
61
|
-
login_form['user[password]'] = password
|
62
|
-
|
63
|
-
agent.submit(login_form, login_form.buttons.first)
|
64
|
-
end
|
65
|
-
end
|
66
|
-
|
67
|
-
def use_authentication?
|
68
|
-
!@username.nil? || !@password.nil? || !@login_url.nil?
|
69
|
-
end
|
70
|
-
|
71
|
-
def validate_authentication_options
|
72
|
-
any_nil = [@login_url, @username, @password].any? {|v| v.nil?}
|
73
|
-
all_nil = [@login_url, @username, @password].all? {|v| v.nil?}
|
74
|
-
if (any_nil && !all_nil)
|
75
|
-
puts "You must either provide all authentication options" +
|
76
|
-
" (username, password, and loginurl) or provide none."
|
77
|
-
return false
|
78
|
-
else
|
79
|
-
return true
|
80
|
-
end
|
81
|
-
end
|
82
|
-
|
83
60
|
def initialize_threads(agent)
|
84
61
|
threads = []
|
85
62
|
@threads.times do |i|
|
@@ -95,7 +72,6 @@ module Krawler
|
|
95
72
|
@links_to_crawl.pop
|
96
73
|
end
|
97
74
|
}
|
98
|
-
|
99
75
|
crawl_page(link, agent)
|
100
76
|
end
|
101
77
|
end
|
@@ -135,6 +111,9 @@ module Krawler
|
|
135
111
|
|
136
112
|
@mutex.synchronize do
|
137
113
|
return if !page.respond_to?(:links)
|
114
|
+
|
115
|
+
recache_invalid_results(page)
|
116
|
+
|
138
117
|
page.links.each do |new_link|
|
139
118
|
next if new_link.href.nil?
|
140
119
|
next if new_link.rel.include? 'nofollow'
|
@@ -169,5 +148,28 @@ module Krawler
|
|
169
148
|
end
|
170
149
|
end
|
171
150
|
end
|
151
|
+
|
152
|
+
protected
|
153
|
+
|
154
|
+
def params_to_hash(params)
|
155
|
+
params = CGI.unescape(params)
|
156
|
+
Hash[ params.split('&').map { |p| p.split('=') } ]
|
157
|
+
end
|
158
|
+
|
159
|
+
def hash_to_params(hash)
|
160
|
+
hash.map { |k, v| "#{k}=#{v}" }.sort * '&'
|
161
|
+
end
|
162
|
+
|
163
|
+
def recache_invalid_results(page)
|
164
|
+
page.search('tr td i.icon-remove').each do |invalid|
|
165
|
+
a = invalid.parent.parent.css('a').first
|
166
|
+
next if a.nil?
|
167
|
+
uri = URI(a['href'])
|
168
|
+
query = params_to_hash(uri.query || '')
|
169
|
+
query['cache'] = 'false'
|
170
|
+
uri.query = hash_to_params(query)
|
171
|
+
@links_to_crawl << uri.to_s
|
172
|
+
end
|
173
|
+
end
|
172
174
|
end
|
173
175
|
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module Krawler
|
2
|
+
|
3
|
+
module Authentication
|
4
|
+
|
5
|
+
def authenticate(agent, user, password, login_url)
|
6
|
+
agent.get(login_url) do |page|
|
7
|
+
login_form = page.form
|
8
|
+
|
9
|
+
login_form['user[email]'] = user
|
10
|
+
login_form['user[password]'] = password
|
11
|
+
|
12
|
+
agent.submit(login_form, login_form.buttons.first)
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
def use_authentication?
|
17
|
+
!@username.nil? || !@password.nil? || !@login_url.nil?
|
18
|
+
end
|
19
|
+
|
20
|
+
def validate_authentication_options
|
21
|
+
any_nil = [@login_url, @username, @password].any? {|v| v.nil?}
|
22
|
+
all_nil = [@login_url, @username, @password].all? {|v| v.nil?}
|
23
|
+
if (any_nil && !all_nil)
|
24
|
+
puts "You must either provide all authentication options" +
|
25
|
+
" (username, password, and loginurl) or provide none."
|
26
|
+
return false
|
27
|
+
else
|
28
|
+
return true
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
end
|
data/lib/krawler/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: krawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.9
|
4
|
+
version: 1.0.10
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-11-
|
12
|
+
date: 2012-11-28 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|
16
|
-
requirement:
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,7 +21,12 @@ dependencies:
|
|
21
21
|
version: 2.5.1
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements:
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 2.5.1
|
25
30
|
description: Simple little website crawler.
|
26
31
|
email:
|
27
32
|
- mike@urlgonomics.com
|
@@ -38,6 +43,7 @@ files:
|
|
38
43
|
- bin/krawl
|
39
44
|
- krawler.gemspec
|
40
45
|
- lib/krawler.rb
|
46
|
+
- lib/krawler/authentication.rb
|
41
47
|
- lib/krawler/version.rb
|
42
48
|
- tasks/krawler.rake
|
43
49
|
homepage: https://github.com/mje113/krawl
|
@@ -60,7 +66,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
60
66
|
version: '0'
|
61
67
|
requirements: []
|
62
68
|
rubyforge_project: krawler
|
63
|
-
rubygems_version: 1.8.
|
69
|
+
rubygems_version: 1.8.24
|
64
70
|
signing_key:
|
65
71
|
specification_version: 3
|
66
72
|
summary: ''
|