rawler 0.1.1 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- data/{README.rdoc → README.md} +7 -6
- data/VERSION +1 -1
- data/bin/rawler +2 -1
- data/lib/rawler.rb +6 -2
- data/lib/rawler/base.rb +6 -4
- data/lib/rawler/request.rb +9 -3
- data/rawler.gemspec +75 -0
- metadata +20 -21
data/{README.rdoc → README.md}
RENAMED
@@ -8,12 +8,13 @@ Rawler will only parse pages with content type 'text/html', but it will check fo
|
|
8
8
|
|
9
9
|
rawler http://example.com [options]
|
10
10
|
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
11
|
+
where [options] are:
|
12
|
+
--username, -u <s>: HTTP Basic Username
|
13
|
+
--password, -p <s>: HTTP Basic Password
|
14
|
+
--wait, -w <f>: Seconds to wait between requests, may be fractional e.g. '1.5' (default: 3.0)
|
15
|
+
--log, -l: Log results to file rawler_log.txt
|
16
|
+
--version, -v: Print version and exit
|
17
|
+
--help, -h: Show this message
|
17
18
|
|
18
19
|
### INSTALL:
|
19
20
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.1
|
1
|
+
0.1.3
|
data/bin/rawler
CHANGED
@@ -16,7 +16,8 @@ EOS
|
|
16
16
|
|
17
17
|
opt :username, "HTT Basic Username", :type => :string
|
18
18
|
opt :password, "HTT Basic Password", :type => :string
|
19
|
-
opt :wait, "
|
19
|
+
opt :wait, "Seconds to wait between requests, may be fractional e.g. '1.5'", :type => :float, :default => 3.0
|
20
|
+
opt :log, "Log results to file rawler_log.txt", :type => :boolean, :default => false
|
20
21
|
end
|
21
22
|
|
22
23
|
domain = ARGV.shift
|
data/lib/rawler.rb
CHANGED
@@ -35,7 +35,7 @@ require 'rawler/core_extensions'
|
|
35
35
|
# The Rawler module itself is very simple, and it's only used for storing configuration data like the url that we want to fetch, basic username and password.
|
36
36
|
|
37
37
|
module Rawler
|
38
|
-
VERSION =
|
38
|
+
VERSION = "#{File.read(File.expand_path(File.dirname(__FILE__)) + '/../VERSION')}"
|
39
39
|
|
40
40
|
# `output` is where we want to direct output. It's set to `$stdout` by default.
|
41
41
|
|
@@ -52,7 +52,11 @@ module Rawler
|
|
52
52
|
# Username and Password for basic auth, if needed.
|
53
53
|
|
54
54
|
mattr_accessor :username, :password
|
55
|
-
|
55
|
+
|
56
|
+
# Log switch
|
57
|
+
|
58
|
+
mattr_accessor :log
|
59
|
+
|
56
60
|
# Here we autoload when needed the specific namespaces.
|
57
61
|
|
58
62
|
# [Rawler::Base](rawler/base.html) is responsible for validating all the pages in a domain. It's where all the magic happens.
|
data/lib/rawler/base.rb
CHANGED
@@ -22,13 +22,16 @@ module Rawler
|
|
22
22
|
Rawler.output = Logger.new(output)
|
23
23
|
Rawler.username = options[:username]
|
24
24
|
Rawler.password = options[:password]
|
25
|
-
Rawler.wait = options[:wait]
|
25
|
+
Rawler.wait = options[:wait]
|
26
|
+
Rawler.log = options[:log]
|
27
|
+
@logfile = File.new("rawler_log.txt", "w") if Rawler.log
|
26
28
|
end
|
27
29
|
|
28
30
|
# The method used to start the real validation process
|
29
31
|
|
30
32
|
def validate
|
31
33
|
validate_links_in_page(Rawler.url)
|
34
|
+
@logfile.close if Rawler.log
|
32
35
|
end
|
33
36
|
|
34
37
|
private
|
@@ -113,9 +116,7 @@ module Rawler
|
|
113
116
|
|
114
117
|
code = code.to_i
|
115
118
|
case code / 100
|
116
|
-
when 1
|
117
|
-
Rawler.output.info(message)
|
118
|
-
when 2 then
|
119
|
+
when 1,2
|
119
120
|
Rawler.output.info(message)
|
120
121
|
when 3 then
|
121
122
|
Rawler.output.warn(message)
|
@@ -124,6 +125,7 @@ module Rawler
|
|
124
125
|
else
|
125
126
|
Rawler.output.error("Unknown code #{message}")
|
126
127
|
end
|
128
|
+
@logfile.puts(message) if Rawler.log
|
127
129
|
end
|
128
130
|
|
129
131
|
end
|
data/lib/rawler/request.rb
CHANGED
@@ -16,11 +16,17 @@ module Rawler
|
|
16
16
|
|
17
17
|
private
|
18
18
|
|
19
|
-
def perform_request(method, url)
|
19
|
+
def perform_request(method, url)
|
20
20
|
uri = URI.parse(url)
|
21
|
-
http = Net::HTTP.new(uri.host, uri.port)
|
22
|
-
http.use_ssl = (uri.scheme == 'https')
|
23
21
|
|
22
|
+
# Use http_proxy if set
|
23
|
+
proxy = URI.parse(ENV['http_proxy']) if ENV['http_proxy'] rescue nil
|
24
|
+
if proxy
|
25
|
+
http = Net::HTTP::Proxy(proxy.host, proxy.port).new(uri.host, uri.port)
|
26
|
+
else
|
27
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
28
|
+
end
|
29
|
+
http.use_ssl = (uri.scheme == 'https')
|
24
30
|
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
25
31
|
|
26
32
|
path = (uri.path.size == 0) ? "/" : uri.path
|
data/rawler.gemspec
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = "rawler"
|
8
|
+
s.version = "0.1.3"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Oscar Del Ben"]
|
12
|
+
s.date = "2011-11-11"
|
13
|
+
s.description = "Rawler is a tool that crawls the links of your website"
|
14
|
+
s.email = "info@oscardelben.com"
|
15
|
+
s.executables = ["rawler"]
|
16
|
+
s.extra_rdoc_files = [
|
17
|
+
"LICENSE.txt",
|
18
|
+
"README.md"
|
19
|
+
]
|
20
|
+
s.files = [
|
21
|
+
"Gemfile",
|
22
|
+
"Gemfile.lock",
|
23
|
+
"LICENSE.txt",
|
24
|
+
"README.md",
|
25
|
+
"Rakefile",
|
26
|
+
"VERSION",
|
27
|
+
"bin/rawler",
|
28
|
+
"lib/rawler.rb",
|
29
|
+
"lib/rawler/base.rb",
|
30
|
+
"lib/rawler/core_extensions.rb",
|
31
|
+
"lib/rawler/core_extensions/module.rb",
|
32
|
+
"lib/rawler/crawler.rb",
|
33
|
+
"lib/rawler/request.rb",
|
34
|
+
"rawler.gemspec",
|
35
|
+
"spec/lib/base_spec.rb",
|
36
|
+
"spec/lib/rawler/base_spec.rb",
|
37
|
+
"spec/lib/rawler/crawler_spec.rb",
|
38
|
+
"spec/lib/rawler_spec.rb",
|
39
|
+
"spec/spec.opts",
|
40
|
+
"spec/spec_helper.rb",
|
41
|
+
"test/helper.rb",
|
42
|
+
"test/test_rawler.rb",
|
43
|
+
"vendor/lib-trollop.rb"
|
44
|
+
]
|
45
|
+
s.homepage = "http://github.com/oscardelben/rawler"
|
46
|
+
s.licenses = ["MIT"]
|
47
|
+
s.require_paths = ["lib"]
|
48
|
+
s.rubygems_version = "1.8.11"
|
49
|
+
s.summary = "Rawler is a tool that crawls the links of your website"
|
50
|
+
|
51
|
+
if s.respond_to? :specification_version then
|
52
|
+
s.specification_version = 3
|
53
|
+
|
54
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
55
|
+
s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
|
56
|
+
s.add_development_dependency(%q<shoulda>, [">= 0"])
|
57
|
+
s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
|
58
|
+
s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
|
59
|
+
s.add_development_dependency(%q<rcov>, [">= 0"])
|
60
|
+
else
|
61
|
+
s.add_dependency(%q<nokogiri>, [">= 0"])
|
62
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
63
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
64
|
+
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
65
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
66
|
+
end
|
67
|
+
else
|
68
|
+
s.add_dependency(%q<nokogiri>, [">= 0"])
|
69
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
70
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
71
|
+
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
72
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 29
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
- 1
|
10
|
-
version: 0.1.1
|
9
|
+
- 3
|
10
|
+
version: 0.1.3
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Oscar Del Ben
|
@@ -15,12 +15,9 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-
|
19
|
-
default_executable: rawler
|
18
|
+
date: 2011-11-11 00:00:00 Z
|
20
19
|
dependencies:
|
21
20
|
- !ruby/object:Gem::Dependency
|
22
|
-
prerelease: false
|
23
|
-
type: :runtime
|
24
21
|
requirement: &id001 !ruby/object:Gem::Requirement
|
25
22
|
none: false
|
26
23
|
requirements:
|
@@ -30,11 +27,11 @@ dependencies:
|
|
30
27
|
segments:
|
31
28
|
- 0
|
32
29
|
version: "0"
|
33
|
-
name: nokogiri
|
34
30
|
version_requirements: *id001
|
35
|
-
|
31
|
+
name: nokogiri
|
36
32
|
prerelease: false
|
37
|
-
type: :development
|
33
|
+
type: :runtime
|
34
|
+
- !ruby/object:Gem::Dependency
|
38
35
|
requirement: &id002 !ruby/object:Gem::Requirement
|
39
36
|
none: false
|
40
37
|
requirements:
|
@@ -44,11 +41,11 @@ dependencies:
|
|
44
41
|
segments:
|
45
42
|
- 0
|
46
43
|
version: "0"
|
47
|
-
name: shoulda
|
48
44
|
version_requirements: *id002
|
49
|
-
|
45
|
+
name: shoulda
|
50
46
|
prerelease: false
|
51
47
|
type: :development
|
48
|
+
- !ruby/object:Gem::Dependency
|
52
49
|
requirement: &id003 !ruby/object:Gem::Requirement
|
53
50
|
none: false
|
54
51
|
requirements:
|
@@ -60,11 +57,11 @@ dependencies:
|
|
60
57
|
- 0
|
61
58
|
- 0
|
62
59
|
version: 1.0.0
|
63
|
-
name: bundler
|
64
60
|
version_requirements: *id003
|
65
|
-
|
61
|
+
name: bundler
|
66
62
|
prerelease: false
|
67
63
|
type: :development
|
64
|
+
- !ruby/object:Gem::Dependency
|
68
65
|
requirement: &id004 !ruby/object:Gem::Requirement
|
69
66
|
none: false
|
70
67
|
requirements:
|
@@ -76,11 +73,11 @@ dependencies:
|
|
76
73
|
- 6
|
77
74
|
- 4
|
78
75
|
version: 1.6.4
|
79
|
-
name: jeweler
|
80
76
|
version_requirements: *id004
|
81
|
-
|
77
|
+
name: jeweler
|
82
78
|
prerelease: false
|
83
79
|
type: :development
|
80
|
+
- !ruby/object:Gem::Dependency
|
84
81
|
requirement: &id005 !ruby/object:Gem::Requirement
|
85
82
|
none: false
|
86
83
|
requirements:
|
@@ -90,8 +87,10 @@ dependencies:
|
|
90
87
|
segments:
|
91
88
|
- 0
|
92
89
|
version: "0"
|
93
|
-
name: rcov
|
94
90
|
version_requirements: *id005
|
91
|
+
name: rcov
|
92
|
+
prerelease: false
|
93
|
+
type: :development
|
95
94
|
description: Rawler is a tool that crawls the links of your website
|
96
95
|
email: info@oscardelben.com
|
97
96
|
executables:
|
@@ -100,12 +99,12 @@ extensions: []
|
|
100
99
|
|
101
100
|
extra_rdoc_files:
|
102
101
|
- LICENSE.txt
|
103
|
-
- README.rdoc
|
102
|
+
- README.md
|
104
103
|
files:
|
105
104
|
- Gemfile
|
106
105
|
- Gemfile.lock
|
107
106
|
- LICENSE.txt
|
108
|
-
- README.rdoc
|
107
|
+
- README.md
|
109
108
|
- Rakefile
|
110
109
|
- VERSION
|
111
110
|
- bin/rawler
|
@@ -115,6 +114,7 @@ files:
|
|
115
114
|
- lib/rawler/core_extensions/module.rb
|
116
115
|
- lib/rawler/crawler.rb
|
117
116
|
- lib/rawler/request.rb
|
117
|
+
- rawler.gemspec
|
118
118
|
- spec/lib/base_spec.rb
|
119
119
|
- spec/lib/rawler/base_spec.rb
|
120
120
|
- spec/lib/rawler/crawler_spec.rb
|
@@ -124,7 +124,6 @@ files:
|
|
124
124
|
- test/helper.rb
|
125
125
|
- test/test_rawler.rb
|
126
126
|
- vendor/lib-trollop.rb
|
127
|
-
has_rdoc: true
|
128
127
|
homepage: http://github.com/oscardelben/rawler
|
129
128
|
licenses:
|
130
129
|
- MIT
|
@@ -154,7 +153,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
154
153
|
requirements: []
|
155
154
|
|
156
155
|
rubyforge_project:
|
157
|
-
rubygems_version: 1.
|
156
|
+
rubygems_version: 1.8.11
|
158
157
|
signing_key:
|
159
158
|
specification_version: 3
|
160
159
|
summary: Rawler is a tool that crawls the links of your website
|