krawler 1.0.7 → 1.0.8

Sign up to get free protection for your applications and to get access to all the features.
data/bin/krawl CHANGED
@@ -29,6 +29,18 @@ optparse = OptionParser.new do |opts|
29
29
  options[:nc] = true
30
30
  end
31
31
 
32
+ opts.on('-l[login_url]', '--login_url[=login_url]', 'Login URL') do |login_url|
33
+ options[:l] = login_url
34
+ end
35
+
36
+ opts.on('-u[username]', '--username[=username]', 'Username') do |username|
37
+ options[:u] = username
38
+ end
39
+
40
+ opts.on('-p[password]', '--password[=password]', 'Password') do |password|
41
+ options[:p] = password
42
+ end
43
+
32
44
  opts.separator ''
33
45
 
34
46
  opts.on('-h', '--help', 'Show this help message.') { puts opts; exit }
@@ -46,5 +58,8 @@ Krawler::Base.new(ARGV[0] || 'http://localhost:3000/', {
46
58
  :restrict => options[:s],
47
59
  :threads => options[:c],
48
60
  :randomize => options[:r],
49
- :no_cache => options[:nc]
61
+ :no_cache => options[:nc],
62
+ :username => options[:u],
63
+ :password => options[:p],
64
+ :login_url => options[:l]
50
65
  }).base
data/krawler.gemspec CHANGED
@@ -8,7 +8,7 @@ Gem::Specification.new do |gem|
8
8
  gem.summary = %q{}
9
9
  gem.homepage = 'https://github.com/mje113/krawl'
10
10
 
11
- gem.add_dependency 'mechanize', '~> 2.5.0'
11
+ gem.add_dependency 'mechanize', '~> 2.5.1'
12
12
  gem.rubyforge_project = 'krawler'
13
13
 
14
14
  gem.files = `git ls-files`.split($\)
@@ -1,3 +1,3 @@
1
1
  module Krawler
2
- VERSION = '1.0.7'
2
+ VERSION = '1.0.8'
3
3
  end
data/lib/krawler.rb CHANGED
@@ -3,7 +3,6 @@ require 'mechanize'
3
3
  require 'timeout'
4
4
  require 'uri'
5
5
  require 'thread'
6
- require 'pry'
7
6
  module Krawler
8
7
 
9
8
  class Base
@@ -20,6 +19,9 @@ module Krawler
20
19
  @restrict = options[:restrict]
21
20
  @randomize = options[:randomize]
22
21
  @threads = options[:threads] || 1
22
+ @username = options[:username]
23
+ @password = options[:password]
24
+ @login_url = options[:login_url]
23
25
  @mutex = Mutex.new
24
26
  @agent = Mechanize.new
25
27
  @agent.user_agent = 'Krawler'
@@ -29,8 +31,14 @@ module Krawler
29
31
  end
30
32
 
31
33
  def base
34
+ return -1 unless validate_authentication_options
35
+
32
36
  puts "Krawling..."
33
37
 
38
+ if use_authentication?
39
+ authenticate(@agent, @username, @password, @login_url)
40
+ end
41
+
34
42
  crawl_page(@url, @agent)
35
43
  initialize_threads(@agent)
36
44
 
@@ -43,6 +51,33 @@ module Krawler
43
51
  @suspect_links.each { |link| puts link }
44
52
  end
45
53
 
54
# Logs in by filling out and submitting the first form on the login page.
#
# Fetches +login_url+ with +agent+, writes the credentials into the form
# returned by +page.form+, and submits it using the form's first button.
# Presumably the agent keeps the resulting session for later requests
# (that is how Mechanize behaves) -- confirm against the agent in use.
#
# @param agent an object responding to +get(url) { |page| }+ and
#   +submit(form, button)+ (a Mechanize agent in this project)
# @param user value written to the +user_field+ form field
# @param password value written to the +password_field+ form field
# @param login_url URL of the page that contains the login form
# @param user_field [String] name of the form field receiving +user+
# @param password_field [String] name of the form field receiving +password+
# @return whatever +agent.get+ returns
# @raise [ArgumentError] if the fetched page exposes no form
def authenticate(agent, user, password, login_url,
                 user_field = 'user[email]', password_field = 'user[password]')
  agent.get(login_url) do |page|
    login_form = page.form

    # Without this guard a page with no form fails below with an opaque
    # NoMethodError on nil; name the offending URL instead.
    raise ArgumentError, "No login form found at #{login_url}" if login_form.nil?

    login_form[user_field] = user
    login_form[password_field] = password

    agent.submit(login_form, login_form.buttons.first)
  end
end
64
+
65
# Tells whether any authentication setting was supplied.
#
# @return [Boolean] true when at least one of @username, @password or
#   @login_url is non-nil; false when all three are nil
def use_authentication?
  settings = [@username, @password, @login_url]
  !settings.all?(&:nil?)
end
68
+
69
# Checks that the authentication settings are either all present or all
# absent.
#
# Reads @login_url, @username and @password. When only some of them are
# nil, prints an explanatory message to standard output.
#
# @return [Boolean] true when all three are nil or none is nil,
#   false when they are only partially supplied
def validate_authentication_options
  credentials = [@login_url, @username, @password]
  return true if credentials.all?(&:nil?) || credentials.none?(&:nil?)

  # The option is named login_url (flag --login_url); the message uses the
  # same spelling so it matches what the user has to type.
  puts "You must either provide all authentication options" \
       " (username, password, and login_url) or provide none."
  false
end
80
+
46
81
  def initialize_threads(agent)
47
82
  threads = []
48
83
  @threads.times do |i|
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: krawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.7
4
+ version: 1.0.8
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,19 +9,19 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-09-04 00:00:00.000000000 Z
12
+ date: 2012-10-31 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
16
- requirement: &70222535947940 !ruby/object:Gem::Requirement
16
+ requirement: &70166215374240 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
20
20
  - !ruby/object:Gem::Version
21
- version: 2.5.0
21
+ version: 2.5.1
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70222535947940
24
+ version_requirements: *70166215374240
25
25
  description: Simple little website crawler.
26
26
  email:
27
27
  - mike@urlgonomics.com
@@ -65,3 +65,4 @@ signing_key:
65
65
  specification_version: 3
66
66
  summary: ''
67
67
  test_files: []
68
+ has_rdoc: