krawler 1.0.7 → 1.0.8
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- data/bin/krawl +16 -1
- data/krawler.gemspec +1 -1
- data/lib/krawler/version.rb +1 -1
- data/lib/krawler.rb +36 -1
- metadata +6 -5
data/bin/krawl
CHANGED
@@ -29,6 +29,18 @@ optparse = OptionParser.new do |opts|
|
|
29
29
|
options[:nc] = true
|
30
30
|
end
|
31
31
|
|
32
|
+
opts.on('-l[login_url]', '--login_url[=login_url]', 'Login URL') do |login_url|
|
33
|
+
options[:l] = login_url
|
34
|
+
end
|
35
|
+
|
36
|
+
opts.on('-u[username]', '--username[=username]', 'Username') do |username|
|
37
|
+
options[:u] = username
|
38
|
+
end
|
39
|
+
|
40
|
+
opts.on('-p[password]', '--password[=password]', 'Password') do |password|
|
41
|
+
options[:p] = password
|
42
|
+
end
|
43
|
+
|
32
44
|
opts.separator ''
|
33
45
|
|
34
46
|
opts.on('-h', '--help', 'Show this help message.') { puts opts; exit }
|
@@ -46,5 +58,8 @@ Krawler::Base.new(ARGV[0] || 'http://localhost:3000/', {
|
|
46
58
|
:restrict => options[:s],
|
47
59
|
:threads => options[:c],
|
48
60
|
:randomize => options[:r],
|
49
|
-
:no_cache => options[:nc]
|
61
|
+
:no_cache => options[:nc],
|
62
|
+
:username => options[:u],
|
63
|
+
:password => options[:p],
|
64
|
+
:login_url => options[:l]
|
50
65
|
}).base
|
data/krawler.gemspec
CHANGED
@@ -8,7 +8,7 @@ Gem::Specification.new do |gem|
|
|
8
8
|
gem.summary = %q{}
|
9
9
|
gem.homepage = 'https://github.com/mje113/krawl'
|
10
10
|
|
11
|
-
gem.add_dependency 'mechanize', '~> 2.5.
|
11
|
+
gem.add_dependency 'mechanize', '~> 2.5.1'
|
12
12
|
gem.rubyforge_project = 'krawler'
|
13
13
|
|
14
14
|
gem.files = `git ls-files`.split($\)
|
data/lib/krawler/version.rb
CHANGED
data/lib/krawler.rb
CHANGED
@@ -3,7 +3,6 @@ require 'mechanize'
|
|
3
3
|
require 'timeout'
|
4
4
|
require 'uri'
|
5
5
|
require 'thread'
|
6
|
-
require 'pry'
|
7
6
|
module Krawler
|
8
7
|
|
9
8
|
class Base
|
@@ -20,6 +19,9 @@ module Krawler
|
|
20
19
|
@restrict = options[:restrict]
|
21
20
|
@randomize = options[:randomize]
|
22
21
|
@threads = options[:threads] || 1
|
22
|
+
@username = options[:username]
|
23
|
+
@password = options[:password]
|
24
|
+
@login_url = options[:login_url]
|
23
25
|
@mutex = Mutex.new
|
24
26
|
@agent = Mechanize.new
|
25
27
|
@agent.user_agent = 'Krawler'
|
@@ -29,8 +31,14 @@ module Krawler
|
|
29
31
|
end
|
30
32
|
|
31
33
|
def base
|
34
|
+
return -1 unless validate_authentication_options
|
35
|
+
|
32
36
|
puts "Krawling..."
|
33
37
|
|
38
|
+
if use_authentication?
|
39
|
+
authenticate(@agent, @username, @password, @login_url)
|
40
|
+
end
|
41
|
+
|
34
42
|
crawl_page(@url, @agent)
|
35
43
|
initialize_threads(@agent)
|
36
44
|
|
@@ -43,6 +51,33 @@ module Krawler
|
|
43
51
|
@suspect_links.each { |link| puts link }
|
44
52
|
end
|
45
53
|
|
54
|
+
def authenticate(agent, user, password, login_url)
|
55
|
+
agent.get(login_url) do |page|
|
56
|
+
login_form = page.form
|
57
|
+
|
58
|
+
login_form['user[email]'] = user
|
59
|
+
login_form['user[password]'] = password
|
60
|
+
|
61
|
+
agent.submit(login_form, login_form.buttons.first)
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
def use_authentication?
|
66
|
+
!@username.nil? || !@password.nil? || !@login_url.nil?
|
67
|
+
end
|
68
|
+
|
69
|
+
def validate_authentication_options
|
70
|
+
any_nil = [@login_url, @username, @password].any? {|v| v.nil?}
|
71
|
+
all_nil = [@login_url, @username, @password].all? {|v| v.nil?}
|
72
|
+
if (any_nil && !all_nil)
|
73
|
+
puts "You must either provide all authentication options" +
|
74
|
+
" (username, password, and loginurl) or provide none."
|
75
|
+
return false
|
76
|
+
else
|
77
|
+
return true
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
46
81
|
def initialize_threads(agent)
|
47
82
|
threads = []
|
48
83
|
@threads.times do |i|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: krawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.8
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,19 +9,19 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-10-31 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|
16
|
-
requirement: &
|
16
|
+
requirement: &70166215374240 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
20
20
|
- !ruby/object:Gem::Version
|
21
|
-
version: 2.5.
|
21
|
+
version: 2.5.1
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70166215374240
|
25
25
|
description: Simple little website crawler.
|
26
26
|
email:
|
27
27
|
- mike@urlgonomics.com
|
@@ -65,3 +65,4 @@ signing_key:
|
|
65
65
|
specification_version: 3
|
66
66
|
summary: ''
|
67
67
|
test_files: []
|
68
|
+
has_rdoc:
|