rawler 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/{README.rdoc → README.md} +7 -6
- data/VERSION +1 -1
- data/bin/rawler +2 -1
- data/lib/rawler.rb +6 -2
- data/lib/rawler/base.rb +6 -4
- data/lib/rawler/request.rb +9 -3
- data/rawler.gemspec +75 -0
- metadata +20 -21
data/{README.rdoc → README.md}
RENAMED
@@ -8,12 +8,13 @@ Rawler will only parse pages with content type 'text/html', but it will check fo
|
|
8
8
|
|
9
9
|
rawler http://example.com [options]
|
10
10
|
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
11
|
+
where [options] are:
|
12
|
+
--username, -u <s>: HTT Basic Username
|
13
|
+
--password, -p <s>: HTT Basic Password
|
14
|
+
--wait, -w <f>: Seconds to wait between requests, may be fractional e.g. '1.5' (default: 3.0)
|
15
|
+
--log, -l: Log results to file rawler_log.txt
|
16
|
+
--version, -v: Print version and exit
|
17
|
+
--help, -h: Show this message
|
17
18
|
|
18
19
|
### INSTALL:
|
19
20
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.1
|
1
|
+
0.1.3
|
data/bin/rawler
CHANGED
@@ -16,7 +16,8 @@ EOS
|
|
16
16
|
|
17
17
|
opt :username, "HTT Basic Username", :type => :string
|
18
18
|
opt :password, "HTT Basic Password", :type => :string
|
19
|
-
opt :wait, "
|
19
|
+
opt :wait, "Seconds to wait between requests, may be fractional e.g. '1.5'", :type => :float, :default => 3.0
|
20
|
+
opt :log, "Log results to file rawler_log.txt", :type => :boolean, :default => false
|
20
21
|
end
|
21
22
|
|
22
23
|
domain = ARGV.shift
|
data/lib/rawler.rb
CHANGED
@@ -35,7 +35,7 @@ require 'rawler/core_extensions'
|
|
35
35
|
# The Rawler module itself is very simple, and it's only used for storing configuration data like the url that we want to fetch, basic username and password.
|
36
36
|
|
37
37
|
module Rawler
|
38
|
-
VERSION =
|
38
|
+
VERSION = "#{File.read(File.expand_path(File.dirname(__FILE__)) + '/../VERSION')}"
|
39
39
|
|
40
40
|
# `output` is where we want to direct output. It's set to `$stdout` by default.
|
41
41
|
|
@@ -52,7 +52,11 @@ module Rawler
|
|
52
52
|
# Username and Password for basic auth, if needed.
|
53
53
|
|
54
54
|
mattr_accessor :username, :password
|
55
|
-
|
55
|
+
|
56
|
+
# Log switch
|
57
|
+
|
58
|
+
mattr_accessor :log
|
59
|
+
|
56
60
|
# Here we autoload when needed the specific namespaces.
|
57
61
|
|
58
62
|
# [Rawler::Base](rawler/base.html) is responsible for validating all the pages in a domain. It's where all the magic happens.
|
data/lib/rawler/base.rb
CHANGED
@@ -22,13 +22,16 @@ module Rawler
|
|
22
22
|
Rawler.output = Logger.new(output)
|
23
23
|
Rawler.username = options[:username]
|
24
24
|
Rawler.password = options[:password]
|
25
|
-
Rawler.wait = options[:wait]
|
25
|
+
Rawler.wait = options[:wait]
|
26
|
+
Rawler.log = options[:log]
|
27
|
+
@logfile = File.new("rawler_log.txt", "w") if Rawler.log
|
26
28
|
end
|
27
29
|
|
28
30
|
# The method used to start the real validation process
|
29
31
|
|
30
32
|
def validate
|
31
33
|
validate_links_in_page(Rawler.url)
|
34
|
+
@logfile.close if Rawler.log
|
32
35
|
end
|
33
36
|
|
34
37
|
private
|
@@ -113,9 +116,7 @@ module Rawler
|
|
113
116
|
|
114
117
|
code = code.to_i
|
115
118
|
case code / 100
|
116
|
-
when 1
|
117
|
-
Rawler.output.info(message)
|
118
|
-
when 2 then
|
119
|
+
when 1,2
|
119
120
|
Rawler.output.info(message)
|
120
121
|
when 3 then
|
121
122
|
Rawler.output.warn(message)
|
@@ -124,6 +125,7 @@ module Rawler
|
|
124
125
|
else
|
125
126
|
Rawler.output.error("Unknown code #{message}")
|
126
127
|
end
|
128
|
+
@logfile.puts(message) if Rawler.log
|
127
129
|
end
|
128
130
|
|
129
131
|
end
|
data/lib/rawler/request.rb
CHANGED
@@ -16,11 +16,17 @@ module Rawler
|
|
16
16
|
|
17
17
|
private
|
18
18
|
|
19
|
-
def perform_request(method, url)
|
19
|
+
def perform_request(method, url)
|
20
20
|
uri = URI.parse(url)
|
21
|
-
http = Net::HTTP.new(uri.host, uri.port)
|
22
|
-
http.use_ssl = (uri.scheme == 'https')
|
23
21
|
|
22
|
+
# Use http_proxy if set
|
23
|
+
proxy = URI.parse(ENV['http_proxy']) if ENV['http_proxy'] rescue nil
|
24
|
+
if proxy
|
25
|
+
http = Net::HTTP::Proxy(proxy.host, proxy.port).new(uri.host, uri.port)
|
26
|
+
else
|
27
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
28
|
+
end
|
29
|
+
http.use_ssl = (uri.scheme == 'https')
|
24
30
|
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
25
31
|
|
26
32
|
path = (uri.path.size == 0) ? "/" : uri.path
|
data/rawler.gemspec
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = "rawler"
|
8
|
+
s.version = "0.1.3"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Oscar Del Ben"]
|
12
|
+
s.date = "2011-11-11"
|
13
|
+
s.description = "Rawler is a tool that crawls the links of your website"
|
14
|
+
s.email = "info@oscardelben.com"
|
15
|
+
s.executables = ["rawler"]
|
16
|
+
s.extra_rdoc_files = [
|
17
|
+
"LICENSE.txt",
|
18
|
+
"README.md"
|
19
|
+
]
|
20
|
+
s.files = [
|
21
|
+
"Gemfile",
|
22
|
+
"Gemfile.lock",
|
23
|
+
"LICENSE.txt",
|
24
|
+
"README.md",
|
25
|
+
"Rakefile",
|
26
|
+
"VERSION",
|
27
|
+
"bin/rawler",
|
28
|
+
"lib/rawler.rb",
|
29
|
+
"lib/rawler/base.rb",
|
30
|
+
"lib/rawler/core_extensions.rb",
|
31
|
+
"lib/rawler/core_extensions/module.rb",
|
32
|
+
"lib/rawler/crawler.rb",
|
33
|
+
"lib/rawler/request.rb",
|
34
|
+
"rawler.gemspec",
|
35
|
+
"spec/lib/base_spec.rb",
|
36
|
+
"spec/lib/rawler/base_spec.rb",
|
37
|
+
"spec/lib/rawler/crawler_spec.rb",
|
38
|
+
"spec/lib/rawler_spec.rb",
|
39
|
+
"spec/spec.opts",
|
40
|
+
"spec/spec_helper.rb",
|
41
|
+
"test/helper.rb",
|
42
|
+
"test/test_rawler.rb",
|
43
|
+
"vendor/lib-trollop.rb"
|
44
|
+
]
|
45
|
+
s.homepage = "http://github.com/oscardelben/rawler"
|
46
|
+
s.licenses = ["MIT"]
|
47
|
+
s.require_paths = ["lib"]
|
48
|
+
s.rubygems_version = "1.8.11"
|
49
|
+
s.summary = "Rawler is a tool that crawls the links of your website"
|
50
|
+
|
51
|
+
if s.respond_to? :specification_version then
|
52
|
+
s.specification_version = 3
|
53
|
+
|
54
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
55
|
+
s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
|
56
|
+
s.add_development_dependency(%q<shoulda>, [">= 0"])
|
57
|
+
s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
|
58
|
+
s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
|
59
|
+
s.add_development_dependency(%q<rcov>, [">= 0"])
|
60
|
+
else
|
61
|
+
s.add_dependency(%q<nokogiri>, [">= 0"])
|
62
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
63
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
64
|
+
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
65
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
66
|
+
end
|
67
|
+
else
|
68
|
+
s.add_dependency(%q<nokogiri>, [">= 0"])
|
69
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
70
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
71
|
+
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
72
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 29
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
- 1
|
10
|
-
version: 0.1.1
|
9
|
+
- 3
|
10
|
+
version: 0.1.3
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Oscar Del Ben
|
@@ -15,12 +15,9 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-
|
19
|
-
default_executable: rawler
|
18
|
+
date: 2011-11-11 00:00:00 Z
|
20
19
|
dependencies:
|
21
20
|
- !ruby/object:Gem::Dependency
|
22
|
-
prerelease: false
|
23
|
-
type: :runtime
|
24
21
|
requirement: &id001 !ruby/object:Gem::Requirement
|
25
22
|
none: false
|
26
23
|
requirements:
|
@@ -30,11 +27,11 @@ dependencies:
|
|
30
27
|
segments:
|
31
28
|
- 0
|
32
29
|
version: "0"
|
33
|
-
name: nokogiri
|
34
30
|
version_requirements: *id001
|
35
|
-
|
31
|
+
name: nokogiri
|
36
32
|
prerelease: false
|
37
|
-
type: :development
|
33
|
+
type: :runtime
|
34
|
+
- !ruby/object:Gem::Dependency
|
38
35
|
requirement: &id002 !ruby/object:Gem::Requirement
|
39
36
|
none: false
|
40
37
|
requirements:
|
@@ -44,11 +41,11 @@ dependencies:
|
|
44
41
|
segments:
|
45
42
|
- 0
|
46
43
|
version: "0"
|
47
|
-
name: shoulda
|
48
44
|
version_requirements: *id002
|
49
|
-
|
45
|
+
name: shoulda
|
50
46
|
prerelease: false
|
51
47
|
type: :development
|
48
|
+
- !ruby/object:Gem::Dependency
|
52
49
|
requirement: &id003 !ruby/object:Gem::Requirement
|
53
50
|
none: false
|
54
51
|
requirements:
|
@@ -60,11 +57,11 @@ dependencies:
|
|
60
57
|
- 0
|
61
58
|
- 0
|
62
59
|
version: 1.0.0
|
63
|
-
name: bundler
|
64
60
|
version_requirements: *id003
|
65
|
-
|
61
|
+
name: bundler
|
66
62
|
prerelease: false
|
67
63
|
type: :development
|
64
|
+
- !ruby/object:Gem::Dependency
|
68
65
|
requirement: &id004 !ruby/object:Gem::Requirement
|
69
66
|
none: false
|
70
67
|
requirements:
|
@@ -76,11 +73,11 @@ dependencies:
|
|
76
73
|
- 6
|
77
74
|
- 4
|
78
75
|
version: 1.6.4
|
79
|
-
name: jeweler
|
80
76
|
version_requirements: *id004
|
81
|
-
|
77
|
+
name: jeweler
|
82
78
|
prerelease: false
|
83
79
|
type: :development
|
80
|
+
- !ruby/object:Gem::Dependency
|
84
81
|
requirement: &id005 !ruby/object:Gem::Requirement
|
85
82
|
none: false
|
86
83
|
requirements:
|
@@ -90,8 +87,10 @@ dependencies:
|
|
90
87
|
segments:
|
91
88
|
- 0
|
92
89
|
version: "0"
|
93
|
-
name: rcov
|
94
90
|
version_requirements: *id005
|
91
|
+
name: rcov
|
92
|
+
prerelease: false
|
93
|
+
type: :development
|
95
94
|
description: Rawler is a tool that crawls the links of your website
|
96
95
|
email: info@oscardelben.com
|
97
96
|
executables:
|
@@ -100,12 +99,12 @@ extensions: []
|
|
100
99
|
|
101
100
|
extra_rdoc_files:
|
102
101
|
- LICENSE.txt
|
103
|
-
- README.rdoc
|
102
|
+
- README.md
|
104
103
|
files:
|
105
104
|
- Gemfile
|
106
105
|
- Gemfile.lock
|
107
106
|
- LICENSE.txt
|
108
|
-
- README.rdoc
|
107
|
+
- README.md
|
109
108
|
- Rakefile
|
110
109
|
- VERSION
|
111
110
|
- bin/rawler
|
@@ -115,6 +114,7 @@ files:
|
|
115
114
|
- lib/rawler/core_extensions/module.rb
|
116
115
|
- lib/rawler/crawler.rb
|
117
116
|
- lib/rawler/request.rb
|
117
|
+
- rawler.gemspec
|
118
118
|
- spec/lib/base_spec.rb
|
119
119
|
- spec/lib/rawler/base_spec.rb
|
120
120
|
- spec/lib/rawler/crawler_spec.rb
|
@@ -124,7 +124,6 @@ files:
|
|
124
124
|
- test/helper.rb
|
125
125
|
- test/test_rawler.rb
|
126
126
|
- vendor/lib-trollop.rb
|
127
|
-
has_rdoc: true
|
128
127
|
homepage: http://github.com/oscardelben/rawler
|
129
128
|
licenses:
|
130
129
|
- MIT
|
@@ -154,7 +153,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
154
153
|
requirements: []
|
155
154
|
|
156
155
|
rubyforge_project:
|
157
|
-
rubygems_version: 1.
|
156
|
+
rubygems_version: 1.8.11
|
158
157
|
signing_key:
|
159
158
|
specification_version: 3
|
160
159
|
summary: Rawler is a tool that crawls the links of your website
|