krawler 1.0.9 → 1.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/krawler.rb CHANGED
@@ -1,13 +1,17 @@
1
1
  # Standard library
  require 'cgi'
  require 'thread'
  require 'timeout'
  require 'uri'

  # Third-party gems
  require 'mechanize'
  # NOTE(review): 'pry' is a debugging aid and does not appear among the gem's
  # declared dependencies — confirm it is meant to ship with the library.
  require 'pry'

  # Project files
  require 'krawler/version'
  require 'krawler/authentication'
6
8
 
7
9
  module Krawler
8
10
 
9
11
  class Base
10
12
 
13
+ include Authentication
14
+
11
15
  def initialize(url, options)
12
16
  @url = URI(url)
13
17
  @host = "#{@url.scheme}://#{@url.host}"
@@ -28,7 +32,7 @@ module Krawler
28
32
  @agent = Mechanize.new
29
33
  @agent.user_agent = 'Krawler'
30
34
  @agent.ssl_version = 'SSLv3'
31
- @headers = { 'Accept-Encoding' => 'gzip, deflate' }
35
+ @headers = { 'Accept-Encoding' => 'gzip, deflate' }
32
36
  @headers['Cache-Control'] = 'no-cache' if options[:no_cache]
33
37
  end
34
38
 
@@ -53,33 +57,6 @@ module Krawler
53
57
  @suspect_links.each { |link| puts link }
54
58
  end
55
59
 
56
- def authenticate(agent, user, password, login_url)
57
- agent.get(login_url) do |page|
58
- login_form = page.form
59
-
60
- login_form['user[email]'] = user
61
- login_form['user[password]'] = password
62
-
63
- agent.submit(login_form, login_form.buttons.first)
64
- end
65
- end
66
-
67
- def use_authentication?
68
- !@username.nil? || !@password.nil? || !@login_url.nil?
69
- end
70
-
71
- def validate_authentication_options
72
- any_nil = [@login_url, @username, @password].any? {|v| v.nil?}
73
- all_nil = [@login_url, @username, @password].all? {|v| v.nil?}
74
- if (any_nil && !all_nil)
75
- puts "You must either provide all authentication options" +
76
- " (username, password, and loginurl) or provide none."
77
- return false
78
- else
79
- return true
80
- end
81
- end
82
-
83
60
  def initialize_threads(agent)
84
61
  threads = []
85
62
  @threads.times do |i|
@@ -95,7 +72,6 @@ module Krawler
95
72
  @links_to_crawl.pop
96
73
  end
97
74
  }
98
-
99
75
  crawl_page(link, agent)
100
76
  end
101
77
  end
@@ -135,6 +111,9 @@ module Krawler
135
111
 
136
112
  @mutex.synchronize do
137
113
  return if !page.respond_to?(:links)
114
+
115
+ recache_invalid_results(page)
116
+
138
117
  page.links.each do |new_link|
139
118
  next if new_link.href.nil?
140
119
  next if new_link.rel.include? 'nofollow'
@@ -169,5 +148,28 @@ module Krawler
169
148
  end
170
149
  end
171
150
  end
151
+
152
+ protected
153
+
154
# Parses a URL query string into a Hash of decoded keys and values.
#
# The string is split into pairs *before* decoding, so an encoded `&` or `=`
# inside a value is never mistaken for a separator. Only the first `=` of a
# pair separates key from value, and a key without `=` maps to an empty
# string instead of raising.
#
# @param params [String, nil] raw query string, e.g. "a=1&q=two+words"
# @return [Hash{String => String}] decoded parameters; when a key repeats,
#   the last occurrence wins
def params_to_hash(params)
  params.to_s.split('&').each_with_object({}) do |pair, parsed|
    next if pair.empty? # tolerate "a=1&&b=2" and a trailing "&"

    key, value = pair.split('=', 2)
    parsed[CGI.unescape(key)] = CGI.unescape(value.to_s)
  end
end
158
+
159
# Serializes a Hash into a URL query string.
#
# Keys and values are form-encoded so that characters such as `&`, `=` and
# spaces inside them cannot break the resulting query. Pairs are sorted so
# that equal hashes always serialize to the same string.
#
# @param hash [Hash] parameters; keys and values are converted with `to_s`
# @return [String] "k1=v1&k2=v2" style query string ("" for an empty hash)
def hash_to_params(hash)
  encoded_pairs = hash.map do |key, value|
    "#{CGI.escape(key.to_s)}=#{CGI.escape(value.to_s)}"
  end
  encoded_pairs.sort.join('&')
end
162
+
163
# Re-queues the link of every result row that the page marks as invalid,
# forcing `cache=false` into the link's query string.
#
# A row is treated as invalid when it contains an `i.icon-remove` element
# matched by the `tr td i.icon-remove` selector; the link re-queued is the
# first `a` found under that icon's grandparent node. Rows without a usable
# link are skipped so one bad row cannot abort the remaining ones.
#
# @param page [#search] parsed page responding to `search(css_selector)`
#   (presumably a Mechanize page — confirm against the caller)
def recache_invalid_results(page)
  page.search('tr td i.icon-remove').each do |invalid|
    anchor = invalid.parent.parent.css('a').first
    href = anchor && anchor['href']
    next if href.nil? # no anchor in the row, or an anchor without href

    begin
      uri = URI(href)
      query = params_to_hash(uri.query || '')
      query['cache'] = 'false'
      uri.query = hash_to_params(query)
    rescue URI::Error
      # Malformed href or query component: skip this row, keep processing.
      next
    end

    @links_to_crawl << uri.to_s
  end
end
172
174
  end
173
175
  end
@@ -0,0 +1,33 @@
1
module Krawler

  # Form-based login helpers.
  #
  # The option checks read the `@username`, `@password` and `@login_url`
  # instance variables of the object this module is mixed into.
  module Authentication

    # Loads the login page, fills in the credentials and submits the form.
    #
    # The form used is whatever `page.form` returns for the fetched page; the
    # credentials go into its `user[email]` and `user[password]` fields and
    # the form is submitted through its first button.
    #
    # @param agent [#get, #submit] HTTP agent (presumably a Mechanize
    #   instance — confirm against the caller)
    # @param user [String] value for the `user[email]` field
    # @param password [String] value for the `user[password]` field
    # @param login_url [String] address of the login page
    # @raise [ArgumentError] when the login page exposes no form
    def authenticate(agent, user, password, login_url)
      agent.get(login_url) do |page|
        login_form = page.form
        # Fail with a clear message rather than a NoMethodError on nil below.
        raise ArgumentError, "No login form found at #{login_url}" if login_form.nil?

        login_form['user[email]'] = user
        login_form['user[password]'] = password

        agent.submit(login_form, login_form.buttons.first)
      end
    end

    # @return [Boolean] true when at least one authentication option is set
    def use_authentication?
      [@username, @password, @login_url].any? { |option| !option.nil? }
    end

    # Checks that the authentication options are either all set or all unset.
    # Prints an explanatory message to standard output when only some are set.
    #
    # @return [Boolean] true when the options are consistent, false otherwise
    def validate_authentication_options
      options = [@login_url, @username, @password]
      return true if options.all?(&:nil?) || options.none?(&:nil?)

      puts 'You must either provide all authentication options' \
           ' (username, password, and loginurl) or provide none.'
      false
    end
  end

end
@@ -1,3 +1,3 @@
1
1
  module Krawler
2
- VERSION = '1.0.9'
2
+ VERSION = '1.0.10'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: krawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.9
4
+ version: 1.0.10
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-11-03 00:00:00.000000000 Z
12
+ date: 2012-11-28 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
16
- requirement: &70292261892960 !ruby/object:Gem::Requirement
16
+ requirement: !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,7 +21,12 @@ dependencies:
21
21
  version: 2.5.1
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70292261892960
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: 2.5.1
25
30
  description: Simple little website crawler.
26
31
  email:
27
32
  - mike@urlgonomics.com
@@ -38,6 +43,7 @@ files:
38
43
  - bin/krawl
39
44
  - krawler.gemspec
40
45
  - lib/krawler.rb
46
+ - lib/krawler/authentication.rb
41
47
  - lib/krawler/version.rb
42
48
  - tasks/krawler.rake
43
49
  homepage: https://github.com/mje113/krawl
@@ -60,7 +66,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
60
66
  version: '0'
61
67
  requirements: []
62
68
  rubyforge_project: krawler
63
- rubygems_version: 1.8.17
69
+ rubygems_version: 1.8.24
64
70
  signing_key:
65
71
  specification_version: 3
66
72
  summary: ''