krawler 1.0.7 → 1.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/krawl CHANGED
@@ -29,6 +29,18 @@ optparse = OptionParser.new do |opts|
29
29
  options[:nc] = true
30
30
  end
31
31
 
32
+ opts.on('-l[login_url]', '--login_url[=login_url]', 'Login URL') do |login_url|
33
+ options[:l] = login_url
34
+ end
35
+
36
+ opts.on('-u[username]', '--username[=username]', 'Username') do |username|
37
+ options[:u] = username
38
+ end
39
+
40
+ opts.on('-p[password]', '--password[=password]', 'Password') do |password|
41
+ options[:p] = password
42
+ end
43
+
32
44
  opts.separator ''
33
45
 
34
46
  opts.on('-h', '--help', 'Show this help message.') { puts opts; exit }
@@ -46,5 +58,8 @@ Krawler::Base.new(ARGV[0] || 'http://localhost:3000/', {
46
58
  :restrict => options[:s],
47
59
  :threads => options[:c],
48
60
  :randomize => options[:r],
49
- :no_cache => options[:nc]
61
+ :no_cache => options[:nc],
62
+ :username => options[:u],
63
+ :password => options[:p],
64
+ :login_url => options[:l]
50
65
  }).base
data/krawler.gemspec CHANGED
@@ -8,7 +8,7 @@ Gem::Specification.new do |gem|
8
8
  gem.summary = %q{}
9
9
  gem.homepage = 'https://github.com/mje113/krawl'
10
10
 
11
- gem.add_dependency 'mechanize', '~> 2.5.0'
11
+ gem.add_dependency 'mechanize', '~> 2.5.1'
12
12
  gem.rubyforge_project = 'krawler'
13
13
 
14
14
  gem.files = `git ls-files`.split($\)
@@ -1,3 +1,3 @@
1
1
  module Krawler
2
- VERSION = '1.0.7'
2
+ VERSION = '1.0.8'
3
3
  end
data/lib/krawler.rb CHANGED
@@ -3,7 +3,6 @@ require 'mechanize'
3
3
  require 'timeout'
4
4
  require 'uri'
5
5
  require 'thread'
6
- require 'pry'
7
6
  module Krawler
8
7
 
9
8
  class Base
@@ -20,6 +19,9 @@ module Krawler
20
19
  @restrict = options[:restrict]
21
20
  @randomize = options[:randomize]
22
21
  @threads = options[:threads] || 1
22
+ @username = options[:username]
23
+ @password = options[:password]
24
+ @login_url = options[:login_url]
23
25
  @mutex = Mutex.new
24
26
  @agent = Mechanize.new
25
27
  @agent.user_agent = 'Krawler'
@@ -29,8 +31,14 @@ module Krawler
29
31
  end
30
32
 
31
33
  def base
34
+ return -1 unless validate_authentication_options
35
+
32
36
  puts "Krawling..."
33
37
 
38
+ if use_authentication?
39
+ authenticate(@agent, @username, @password, @login_url)
40
+ end
41
+
34
42
  crawl_page(@url, @agent)
35
43
  initialize_threads(@agent)
36
44
 
@@ -43,6 +51,33 @@ module Krawler
43
51
  @suspect_links.each { |link| puts link }
44
52
  end
45
53
 
54
# Logs in by filling and submitting the first form on the login page.
#
# Fetches +login_url+ through +agent+, writes the credentials into the
# page's first form and submits that form with its first button.
#
# @param agent [#get, #submit] agent used to fetch the page and submit the form
# @param user [String] value written to the user field
# @param password [String] value written to the password field
# @param login_url [String] URL of the page that contains the login form
# @param user_field [String] name of the form field that receives +user+
# @param password_field [String] name of the form field that receives +password+
# @return [Object] whatever +agent.get+ returns
# @raise [ArgumentError] if the fetched page exposes no form to fill in
def authenticate(agent, user, password, login_url,
                 user_field = 'user[email]', password_field = 'user[password]')
  agent.get(login_url) do |page|
    # A page object without a #form method (presumably a non-HTML response)
    # is treated the same as a page with no form at all.
    login_form = page.respond_to?(:form) ? page.form : nil
    raise ArgumentError, "No login form found at #{login_url}" if login_form.nil?

    login_form[user_field] = user
    login_form[password_field] = password

    agent.submit(login_form, login_form.buttons.first)
  end
end
64
+
65
# Reports whether any authentication option was supplied.
#
# @return [Boolean] true when at least one of @username, @password or
#   @login_url is not nil
def use_authentication?
  [@username, @password, @login_url].any? { |credential| !credential.nil? }
end
68
+
69
# Checks that the authentication options were given all together or not at all.
#
# Reads @login_url, @username and @password; an option counts as supplied
# when it is not nil. When only some of them are supplied, an explanatory
# message is printed to standard output.
#
# @return [Boolean] true if all three options or none of them are supplied,
#   false otherwise
def validate_authentication_options
  supplied = [@login_url, @username, @password].compact.size
  return true if [0, 3].include?(supplied)

  # The message names the options as they are spelled on the command line.
  puts 'You must either provide all authentication options' \
       ' (username, password, and login_url) or provide none.'
  false
end
80
+
46
81
  def initialize_threads(agent)
47
82
  threads = []
48
83
  @threads.times do |i|
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: krawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.7
4
+ version: 1.0.8
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,19 +9,19 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-09-04 00:00:00.000000000 Z
12
+ date: 2012-10-31 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
16
- requirement: &70222535947940 !ruby/object:Gem::Requirement
16
+ requirement: &70166215374240 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
20
20
  - !ruby/object:Gem::Version
21
- version: 2.5.0
21
+ version: 2.5.1
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70222535947940
24
+ version_requirements: *70166215374240
25
25
  description: Simple little website crawler.
26
26
  email:
27
27
  - mike@urlgonomics.com
@@ -65,3 +65,4 @@ signing_key:
65
65
  specification_version: 3
66
66
  summary: ''
67
67
  test_files: []
68
+ has_rdoc: