krawler 1.0.7 → 1.0.8
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/krawl +16 -1
- data/krawler.gemspec +1 -1
- data/lib/krawler/version.rb +1 -1
- data/lib/krawler.rb +36 -1
- metadata +6 -5
data/bin/krawl
CHANGED
@@ -29,6 +29,18 @@ optparse = OptionParser.new do |opts|
|
|
29
29
|
options[:nc] = true
|
30
30
|
end
|
31
31
|
|
32
|
+
opts.on('-l[login_url]', '--login_url[=login_url]', 'Login URL') do |login_url|
|
33
|
+
options[:l] = login_url
|
34
|
+
end
|
35
|
+
|
36
|
+
opts.on('-u[username]', '--username[=username]', 'Username') do |username|
|
37
|
+
options[:u] = username
|
38
|
+
end
|
39
|
+
|
40
|
+
opts.on('-p[password]', '--password[=password]', 'Password') do |password|
|
41
|
+
options[:p] = password
|
42
|
+
end
|
43
|
+
|
32
44
|
opts.separator ''
|
33
45
|
|
34
46
|
opts.on('-h', '--help', 'Show this help message.') { puts opts; exit }
|
@@ -46,5 +58,8 @@ Krawler::Base.new(ARGV[0] || 'http://localhost:3000/', {
|
|
46
58
|
:restrict => options[:s],
|
47
59
|
:threads => options[:c],
|
48
60
|
:randomize => options[:r],
|
49
|
-
:no_cache => options[:nc]
|
61
|
+
:no_cache => options[:nc],
|
62
|
+
:username => options[:u],
|
63
|
+
:password => options[:p],
|
64
|
+
:login_url => options[:l]
|
50
65
|
}).base
|
data/krawler.gemspec
CHANGED
@@ -8,7 +8,7 @@ Gem::Specification.new do |gem|
|
|
8
8
|
gem.summary = %q{}
|
9
9
|
gem.homepage = 'https://github.com/mje113/krawl'
|
10
10
|
|
11
|
-
gem.add_dependency 'mechanize', '~> 2.5.
|
11
|
+
gem.add_dependency 'mechanize', '~> 2.5.1'
|
12
12
|
gem.rubyforge_project = 'krawler'
|
13
13
|
|
14
14
|
gem.files = `git ls-files`.split($\)
|
data/lib/krawler/version.rb
CHANGED
data/lib/krawler.rb
CHANGED
@@ -3,7 +3,6 @@ require 'mechanize'
|
|
3
3
|
require 'timeout'
|
4
4
|
require 'uri'
|
5
5
|
require 'thread'
|
6
|
-
require 'pry'
|
7
6
|
module Krawler
|
8
7
|
|
9
8
|
class Base
|
@@ -20,6 +19,9 @@ module Krawler
|
|
20
19
|
@restrict = options[:restrict]
|
21
20
|
@randomize = options[:randomize]
|
22
21
|
@threads = options[:threads] || 1
|
22
|
+
@username = options[:username]
|
23
|
+
@password = options[:password]
|
24
|
+
@login_url = options[:login_url]
|
23
25
|
@mutex = Mutex.new
|
24
26
|
@agent = Mechanize.new
|
25
27
|
@agent.user_agent = 'Krawler'
|
@@ -29,8 +31,14 @@ module Krawler
|
|
29
31
|
end
|
30
32
|
|
31
33
|
def base
|
34
|
+
return -1 unless validate_authentication_options
|
35
|
+
|
32
36
|
puts "Krawling..."
|
33
37
|
|
38
|
+
if use_authentication?
|
39
|
+
authenticate(@agent, @username, @password, @login_url)
|
40
|
+
end
|
41
|
+
|
34
42
|
crawl_page(@url, @agent)
|
35
43
|
initialize_threads(@agent)
|
36
44
|
|
@@ -43,6 +51,33 @@ module Krawler
|
|
43
51
|
@suspect_links.each { |link| puts link }
|
44
52
|
end
|
45
53
|
|
54
|
+
def authenticate(agent, user, password, login_url)
|
55
|
+
agent.get(login_url) do |page|
|
56
|
+
login_form = page.form
|
57
|
+
|
58
|
+
login_form['user[email]'] = user
|
59
|
+
login_form['user[password]'] = password
|
60
|
+
|
61
|
+
agent.submit(login_form, login_form.buttons.first)
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
def use_authentication?
|
66
|
+
!@username.nil? || !@password.nil? || !@login_url.nil?
|
67
|
+
end
|
68
|
+
|
69
|
+
def validate_authentication_options
|
70
|
+
any_nil = [@login_url, @username, @password].any? {|v| v.nil?}
|
71
|
+
all_nil = [@login_url, @username, @password].all? {|v| v.nil?}
|
72
|
+
if (any_nil && !all_nil)
|
73
|
+
puts "You must either provide all authentication options" +
|
74
|
+
" (username, password, and loginurl) or provide none."
|
75
|
+
return false
|
76
|
+
else
|
77
|
+
return true
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
46
81
|
def initialize_threads(agent)
|
47
82
|
threads = []
|
48
83
|
@threads.times do |i|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: krawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.7
|
4
|
+
version: 1.0.8
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,19 +9,19 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-10-31 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|
16
|
-
requirement: &
|
16
|
+
requirement: &70166215374240 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
20
20
|
- !ruby/object:Gem::Version
|
21
|
-
version: 2.5.
|
21
|
+
version: 2.5.1
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70166215374240
|
25
25
|
description: Simple little website crawler.
|
26
26
|
email:
|
27
27
|
- mike@urlgonomics.com
|
@@ -65,3 +65,4 @@ signing_key:
|
|
65
65
|
specification_version: 3
|
66
66
|
summary: ''
|
67
67
|
test_files: []
|
68
|
+
has_rdoc:
|