rawler 0.1.1 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -8,12 +8,13 @@ Rawler will only parse pages with content type 'text/html', but it will check fo
8
8
 
9
9
  rawler http://example.com [options]
10
10
 
11
- where [options] are:
12
- --username, -u <s>: HTTP Basic Username
13
- --password, -p <s>: HTTP Basic Password
14
- --wait: Number of seconds to wait betweet requests (default 3)
15
- --version, -v: Print version and exit
16
- --help, -h: Show this message
11
+ where [options] are:
12
+ --username, -u <s>: HTTP Basic Username
13
+ --password, -p <s>: HTTP Basic Password
14
+ --wait, -w <f>: Seconds to wait between requests, may be fractional e.g. '1.5' (default: 3.0)
15
+ --log, -l: Log results to file rawler_log.txt
16
+ --version, -v: Print version and exit
17
+ --help, -h: Show this message
17
18
 
18
19
  ### INSTALL:
19
20
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.1
1
+ 0.1.3
data/bin/rawler CHANGED
@@ -16,7 +16,8 @@ EOS
16
16
 
17
17
  opt :username, "HTT Basic Username", :type => :string
18
18
  opt :password, "HTT Basic Password", :type => :string
19
- opt :wait, "Number of seconds to wait betweet requests", :default => 3
19
+ opt :wait, "Seconds to wait between requests, may be fractional e.g. '1.5'", :type => :float, :default => 3.0
20
+ opt :log, "Log results to file rawler_log.txt", :type => :boolean, :default => false
20
21
  end
21
22
 
22
23
  domain = ARGV.shift
data/lib/rawler.rb CHANGED
@@ -35,7 +35,7 @@ require 'rawler/core_extensions'
35
35
  # The Rawler module itself is very simple: it is only used for storing configuration data, such as the URL that we want to fetch and the HTTP basic auth username and password.
36
36
 
37
37
  module Rawler
38
- VERSION = '0.1.1'
38
+ VERSION = "#{File.read(File.expand_path(File.dirname(__FILE__)) + '/../VERSION')}"
39
39
 
40
40
  # `output` is where we want to direct output. It's set to `$stdout` by default.
41
41
 
@@ -52,7 +52,11 @@ module Rawler
52
52
  # Username and Password for basic auth, if needed.
53
53
 
54
54
  mattr_accessor :username, :password
55
-
55
+
56
+ # Log switch
57
+
58
+ mattr_accessor :log
59
+
56
60
  # Here we autoload the specific namespaces when they are needed.
57
61
 
58
62
  # [Rawler::Base](rawler/base.html) is responsible for validating all the pages in a domain. It's where all the magic happens.
data/lib/rawler/base.rb CHANGED
@@ -22,13 +22,16 @@ module Rawler
22
22
  Rawler.output = Logger.new(output)
23
23
  Rawler.username = options[:username]
24
24
  Rawler.password = options[:password]
25
- Rawler.wait = options[:wait].to_i
25
+ Rawler.wait = options[:wait]
26
+ Rawler.log = options[:log]
27
+ @logfile = File.new("rawler_log.txt", "w") if Rawler.log
26
28
  end
27
29
 
28
30
  # The method used to start the real validation process
29
31
 
30
32
  def validate
31
33
  validate_links_in_page(Rawler.url)
34
+ @logfile.close if Rawler.log
32
35
  end
33
36
 
34
37
  private
@@ -113,9 +116,7 @@ module Rawler
113
116
 
114
117
  code = code.to_i
115
118
  case code / 100
116
- when 1
117
- Rawler.output.info(message)
118
- when 2 then
119
+ when 1,2
119
120
  Rawler.output.info(message)
120
121
  when 3 then
121
122
  Rawler.output.warn(message)
@@ -124,6 +125,7 @@ module Rawler
124
125
  else
125
126
  Rawler.output.error("Unknown code #{message}")
126
127
  end
128
+ @logfile.puts(message) if Rawler.log
127
129
  end
128
130
 
129
131
  end
@@ -16,11 +16,17 @@ module Rawler
16
16
 
17
17
  private
18
18
 
19
- def perform_request(method, url)
19
+ def perform_request(method, url)
20
20
  uri = URI.parse(url)
21
- http = Net::HTTP.new(uri.host, uri.port)
22
- http.use_ssl = (uri.scheme == 'https')
23
21
 
22
+ # Use http_proxy if set
23
+ proxy = URI.parse(ENV['http_proxy']) if ENV['http_proxy'] rescue nil
24
+ if proxy
25
+ http = Net::HTTP::Proxy(proxy.host, proxy.port).new(uri.host, uri.port)
26
+ else
27
+ http = Net::HTTP.new(uri.host, uri.port)
28
+ end
29
+ http.use_ssl = (uri.scheme == 'https')
24
30
  http.verify_mode = OpenSSL::SSL::VERIFY_NONE
25
31
 
26
32
  path = (uri.path.size == 0) ? "/" : uri.path
data/rawler.gemspec ADDED
@@ -0,0 +1,75 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = "rawler"
8
+ s.version = "0.1.3"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Oscar Del Ben"]
12
+ s.date = "2011-11-11"
13
+ s.description = "Rawler is a tool that crawls the links of your website"
14
+ s.email = "info@oscardelben.com"
15
+ s.executables = ["rawler"]
16
+ s.extra_rdoc_files = [
17
+ "LICENSE.txt",
18
+ "README.md"
19
+ ]
20
+ s.files = [
21
+ "Gemfile",
22
+ "Gemfile.lock",
23
+ "LICENSE.txt",
24
+ "README.md",
25
+ "Rakefile",
26
+ "VERSION",
27
+ "bin/rawler",
28
+ "lib/rawler.rb",
29
+ "lib/rawler/base.rb",
30
+ "lib/rawler/core_extensions.rb",
31
+ "lib/rawler/core_extensions/module.rb",
32
+ "lib/rawler/crawler.rb",
33
+ "lib/rawler/request.rb",
34
+ "rawler.gemspec",
35
+ "spec/lib/base_spec.rb",
36
+ "spec/lib/rawler/base_spec.rb",
37
+ "spec/lib/rawler/crawler_spec.rb",
38
+ "spec/lib/rawler_spec.rb",
39
+ "spec/spec.opts",
40
+ "spec/spec_helper.rb",
41
+ "test/helper.rb",
42
+ "test/test_rawler.rb",
43
+ "vendor/lib-trollop.rb"
44
+ ]
45
+ s.homepage = "http://github.com/oscardelben/rawler"
46
+ s.licenses = ["MIT"]
47
+ s.require_paths = ["lib"]
48
+ s.rubygems_version = "1.8.11"
49
+ s.summary = "Rawler is a tool that crawls the links of your website"
50
+
51
+ if s.respond_to? :specification_version then
52
+ s.specification_version = 3
53
+
54
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
55
+ s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
56
+ s.add_development_dependency(%q<shoulda>, [">= 0"])
57
+ s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
58
+ s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
59
+ s.add_development_dependency(%q<rcov>, [">= 0"])
60
+ else
61
+ s.add_dependency(%q<nokogiri>, [">= 0"])
62
+ s.add_dependency(%q<shoulda>, [">= 0"])
63
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
64
+ s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
65
+ s.add_dependency(%q<rcov>, [">= 0"])
66
+ end
67
+ else
68
+ s.add_dependency(%q<nokogiri>, [">= 0"])
69
+ s.add_dependency(%q<shoulda>, [">= 0"])
70
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
71
+ s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
72
+ s.add_dependency(%q<rcov>, [">= 0"])
73
+ end
74
+ end
75
+
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rawler
3
3
  version: !ruby/object:Gem::Version
4
- hash: 25
4
+ hash: 29
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 1
9
- - 1
10
- version: 0.1.1
9
+ - 3
10
+ version: 0.1.3
11
11
  platform: ruby
12
12
  authors:
13
13
  - Oscar Del Ben
@@ -15,12 +15,9 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-09-16 00:00:00 +02:00
19
- default_executable: rawler
18
+ date: 2011-11-11 00:00:00 Z
20
19
  dependencies:
21
20
  - !ruby/object:Gem::Dependency
22
- prerelease: false
23
- type: :runtime
24
21
  requirement: &id001 !ruby/object:Gem::Requirement
25
22
  none: false
26
23
  requirements:
@@ -30,11 +27,11 @@ dependencies:
30
27
  segments:
31
28
  - 0
32
29
  version: "0"
33
- name: nokogiri
34
30
  version_requirements: *id001
35
- - !ruby/object:Gem::Dependency
31
+ name: nokogiri
36
32
  prerelease: false
37
- type: :development
33
+ type: :runtime
34
+ - !ruby/object:Gem::Dependency
38
35
  requirement: &id002 !ruby/object:Gem::Requirement
39
36
  none: false
40
37
  requirements:
@@ -44,11 +41,11 @@ dependencies:
44
41
  segments:
45
42
  - 0
46
43
  version: "0"
47
- name: shoulda
48
44
  version_requirements: *id002
49
- - !ruby/object:Gem::Dependency
45
+ name: shoulda
50
46
  prerelease: false
51
47
  type: :development
48
+ - !ruby/object:Gem::Dependency
52
49
  requirement: &id003 !ruby/object:Gem::Requirement
53
50
  none: false
54
51
  requirements:
@@ -60,11 +57,11 @@ dependencies:
60
57
  - 0
61
58
  - 0
62
59
  version: 1.0.0
63
- name: bundler
64
60
  version_requirements: *id003
65
- - !ruby/object:Gem::Dependency
61
+ name: bundler
66
62
  prerelease: false
67
63
  type: :development
64
+ - !ruby/object:Gem::Dependency
68
65
  requirement: &id004 !ruby/object:Gem::Requirement
69
66
  none: false
70
67
  requirements:
@@ -76,11 +73,11 @@ dependencies:
76
73
  - 6
77
74
  - 4
78
75
  version: 1.6.4
79
- name: jeweler
80
76
  version_requirements: *id004
81
- - !ruby/object:Gem::Dependency
77
+ name: jeweler
82
78
  prerelease: false
83
79
  type: :development
80
+ - !ruby/object:Gem::Dependency
84
81
  requirement: &id005 !ruby/object:Gem::Requirement
85
82
  none: false
86
83
  requirements:
@@ -90,8 +87,10 @@ dependencies:
90
87
  segments:
91
88
  - 0
92
89
  version: "0"
93
- name: rcov
94
90
  version_requirements: *id005
91
+ name: rcov
92
+ prerelease: false
93
+ type: :development
95
94
  description: Rawler is a tool that crawls the links of your website
96
95
  email: info@oscardelben.com
97
96
  executables:
@@ -100,12 +99,12 @@ extensions: []
100
99
 
101
100
  extra_rdoc_files:
102
101
  - LICENSE.txt
103
- - README.rdoc
102
+ - README.md
104
103
  files:
105
104
  - Gemfile
106
105
  - Gemfile.lock
107
106
  - LICENSE.txt
108
- - README.rdoc
107
+ - README.md
109
108
  - Rakefile
110
109
  - VERSION
111
110
  - bin/rawler
@@ -115,6 +114,7 @@ files:
115
114
  - lib/rawler/core_extensions/module.rb
116
115
  - lib/rawler/crawler.rb
117
116
  - lib/rawler/request.rb
117
+ - rawler.gemspec
118
118
  - spec/lib/base_spec.rb
119
119
  - spec/lib/rawler/base_spec.rb
120
120
  - spec/lib/rawler/crawler_spec.rb
@@ -124,7 +124,6 @@ files:
124
124
  - test/helper.rb
125
125
  - test/test_rawler.rb
126
126
  - vendor/lib-trollop.rb
127
- has_rdoc: true
128
127
  homepage: http://github.com/oscardelben/rawler
129
128
  licenses:
130
129
  - MIT
@@ -154,7 +153,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
154
153
  requirements: []
155
154
 
156
155
  rubyforge_project:
157
- rubygems_version: 1.6.2
156
+ rubygems_version: 1.8.11
158
157
  signing_key:
159
158
  specification_version: 3
160
159
  summary: Rawler is a tool that crawls the links of your website