rawler 0.1.7 → 0.1.8
Sign up to get free protection for your applications and to get access to all the features.
- data/.rspec +1 -0
- data/.travis.yml +5 -0
- data/Gemfile +2 -3
- data/Gemfile.lock +35 -17
- data/README.md +7 -4
- data/Rakefile +1 -31
- data/VERSION +1 -1
- data/bin/rawler +7 -4
- data/lib/rawler.rb +16 -2
- data/lib/rawler/base.rb +5 -0
- data/lib/rawler/crawler.rb +2 -0
- data/rawler.gemspec +10 -13
- data/spec/lib/rawler/crawler_spec.rb +157 -76
- data/spec/lib/rawler_spec.rb +2 -1
- data/spec/spec_helper.rb +7 -2
- metadata +11 -27
- data/test/helper.rb +0 -18
- data/test/test_rawler.rb +0 -7
data/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--order rand
|
data/.travis.yml
ADDED
data/Gemfile
CHANGED
@@ -5,13 +5,12 @@ source "http://rubygems.org"
|
|
5
5
|
|
6
6
|
# Add dependencies to develop your gem here.
|
7
7
|
# Include everything needed to run rake, tests, features, etc.
|
8
|
-
gem 'nokogiri'
|
8
|
+
gem 'nokogiri', '~> 1.5'
|
9
9
|
|
10
10
|
group :development, :test do
|
11
11
|
gem 'fakeweb'
|
12
12
|
gem "rspec"
|
13
13
|
gem "shoulda", ">= 0"
|
14
|
-
gem "bundler", "~> 1.0
|
14
|
+
gem "bundler", "~> 1.0"
|
15
15
|
gem "jeweler", "~> 1.6.4"
|
16
|
-
gem "rcov", ">= 0"
|
17
16
|
end
|
data/Gemfile.lock
CHANGED
@@ -1,34 +1,52 @@
|
|
1
1
|
GEM
|
2
2
|
remote: http://rubygems.org/
|
3
3
|
specs:
|
4
|
-
|
4
|
+
activesupport (4.0.0)
|
5
|
+
i18n (~> 0.6, >= 0.6.4)
|
6
|
+
minitest (~> 4.2)
|
7
|
+
multi_json (~> 1.3)
|
8
|
+
thread_safe (~> 0.1)
|
9
|
+
tzinfo (~> 0.3.37)
|
10
|
+
atomic (1.1.13)
|
11
|
+
diff-lcs (1.2.4)
|
5
12
|
fakeweb (1.3.0)
|
6
|
-
git (1.2.
|
13
|
+
git (1.2.6)
|
14
|
+
i18n (0.6.5)
|
7
15
|
jeweler (1.6.4)
|
8
16
|
bundler (~> 1.0)
|
9
17
|
git (>= 1.2.5)
|
10
18
|
rake
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
rspec-
|
22
|
-
|
19
|
+
mini_portile (0.5.1)
|
20
|
+
minitest (4.7.5)
|
21
|
+
multi_json (1.7.9)
|
22
|
+
nokogiri (1.6.0)
|
23
|
+
mini_portile (~> 0.5.0)
|
24
|
+
rake (10.1.0)
|
25
|
+
rspec (2.14.1)
|
26
|
+
rspec-core (~> 2.14.0)
|
27
|
+
rspec-expectations (~> 2.14.0)
|
28
|
+
rspec-mocks (~> 2.14.0)
|
29
|
+
rspec-core (2.14.5)
|
30
|
+
rspec-expectations (2.14.2)
|
31
|
+
diff-lcs (>= 1.1.3, < 2.0)
|
32
|
+
rspec-mocks (2.14.3)
|
33
|
+
shoulda (3.5.0)
|
34
|
+
shoulda-context (~> 1.0, >= 1.0.1)
|
35
|
+
shoulda-matchers (>= 1.4.1, < 3.0)
|
36
|
+
shoulda-context (1.1.5)
|
37
|
+
shoulda-matchers (2.3.0)
|
38
|
+
activesupport (>= 3.0.0)
|
39
|
+
thread_safe (0.1.2)
|
40
|
+
atomic
|
41
|
+
tzinfo (0.3.37)
|
23
42
|
|
24
43
|
PLATFORMS
|
25
44
|
ruby
|
26
45
|
|
27
46
|
DEPENDENCIES
|
28
|
-
bundler (~> 1.0
|
47
|
+
bundler (~> 1.0)
|
29
48
|
fakeweb
|
30
49
|
jeweler (~> 1.6.4)
|
31
|
-
nokogiri
|
32
|
-
rcov
|
50
|
+
nokogiri (~> 1.5)
|
33
51
|
rspec
|
34
52
|
shoulda
|
data/README.md
CHANGED
@@ -9,14 +9,17 @@ Rawler will only parse pages with content type 'text/html', but it will check fo
|
|
9
9
|
rawler http://example.com [options]
|
10
10
|
|
11
11
|
where [options] are:
|
12
|
-
--username, -u <s>:
|
13
|
-
--password, -p <s>:
|
12
|
+
--username, -u <s>: HTTP Basic Username
|
13
|
+
--password, -p <s>: HTTP Basic Password
|
14
14
|
--wait, -w <f>: Seconds to wait between requests, may be fractional e.g. '1.5' (default: 3.0)
|
15
15
|
--log, -l: Log results to file rawler_log.txt
|
16
16
|
--logfile, -o <s>: Specify logfile, implies --log (default: rawler_log.txt)
|
17
17
|
--css, -c: Check CSS links
|
18
|
-
--skip, -s <s>: Skip
|
19
|
-
--iskip, -i <s>: Skip
|
18
|
+
--skip, -s <s>: Skip URLs that match a regexp
|
19
|
+
--iskip, -i <s>: Skip URLs that match a case insensitive regexp
|
20
|
+
--include <s>: Only include URLs that match a regexp
|
21
|
+
--iinclude <s>: Only include URLs that match a case insensitive regexp
|
22
|
+
--local <s>: Restrict to the given URL and below. Equivalent to '--include ^http://mysite.com/*'.
|
20
23
|
--version, -v: Print version and exit
|
21
24
|
--help, -h: Show this message
|
22
25
|
|
data/Rakefile
CHANGED
@@ -28,24 +28,7 @@ Jeweler::Tasks.new do |gem|
|
|
28
28
|
end
|
29
29
|
Jeweler::RubygemsDotOrgTasks.new
|
30
30
|
|
31
|
-
require '
|
32
|
-
Rake::TestTask.new(:test) do |test|
|
33
|
-
test.libs << 'lib' << 'test'
|
34
|
-
test.pattern = 'test/**/test_*.rb'
|
35
|
-
test.verbose = true
|
36
|
-
end
|
37
|
-
|
38
|
-
require 'rcov/rcovtask'
|
39
|
-
Rcov::RcovTask.new do |test|
|
40
|
-
test.libs << 'test'
|
41
|
-
test.pattern = 'test/**/test_*.rb'
|
42
|
-
test.verbose = true
|
43
|
-
test.rcov_opts << '--exclude "gems/*"'
|
44
|
-
end
|
45
|
-
|
46
|
-
task :default => :test
|
47
|
-
|
48
|
-
require 'rake/rdoctask'
|
31
|
+
require 'rdoc/task'
|
49
32
|
Rake::RDocTask.new do |rdoc|
|
50
33
|
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
51
34
|
|
@@ -54,16 +37,3 @@ Rake::RDocTask.new do |rdoc|
|
|
54
37
|
rdoc.rdoc_files.include('README*')
|
55
38
|
rdoc.rdoc_files.include('lib/**/*.rb')
|
56
39
|
end
|
57
|
-
|
58
|
-
desc 'generate docs'
|
59
|
-
task :rocco do
|
60
|
-
#%x!rm -r html/*!
|
61
|
-
|
62
|
-
Dir.chdir "lib"
|
63
|
-
|
64
|
-
files = Dir['**/*.*']
|
65
|
-
|
66
|
-
files.each do |file|
|
67
|
-
%x!rocco #{file} -o ../html!
|
68
|
-
end
|
69
|
-
end
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.7
|
1
|
+
0.1.8
|
data/bin/rawler
CHANGED
@@ -14,14 +14,17 @@ Usage:
|
|
14
14
|
where [options] are:
|
15
15
|
EOS
|
16
16
|
|
17
|
-
opt :username, "
|
18
|
-
opt :password, "
|
17
|
+
opt :username, "HTTP Basic Username", :type => :string
|
18
|
+
opt :password, "HTTP Basic Password", :type => :string
|
19
19
|
opt :wait, "Seconds to wait between requests, may be fractional e.g. '1.5'", :type => :float, :default => 3.0
|
20
20
|
opt :log, "Log results to file #{Rawler::Base::DEFAULT_LOGFILE}", :type => :boolean, :default => false
|
21
21
|
opt :logfile, "Specify logfile, implies --log", :type => :string, :default => Rawler::Base::DEFAULT_LOGFILE
|
22
22
|
opt :css, "Check CSS links", :type => :boolean, :default => false
|
23
|
-
opt :skip, "Skip
|
24
|
-
opt :iskip, "Skip
|
23
|
+
opt :skip, "Skip URLs that match a pattern", :type => :string
|
24
|
+
opt :iskip, "Skip URLs that match a case insensitive pattern", :type => :string
|
25
|
+
opt :include, "Only include URLS that match a pattern", :type => :string
|
26
|
+
opt :iinclude, "Only include URLS that match a case insensitive pattern. Equivalent to '--include ^http://mysite.com/*'.", :type => :string
|
27
|
+
opt :local, "Restrict to the given URL and below", :type => :boolean, :default => false
|
25
28
|
end
|
26
29
|
|
27
30
|
|
data/lib/rawler.rb
CHANGED
@@ -12,6 +12,7 @@ module Rawler
|
|
12
12
|
mattr_accessor :username, :password
|
13
13
|
mattr_accessor :log, :logfile
|
14
14
|
mattr_accessor :css
|
15
|
+
mattr_accessor :include_url_pattern
|
15
16
|
mattr_accessor :skip_url_pattern
|
16
17
|
|
17
18
|
autoload :Base, "rawler/base"
|
@@ -28,7 +29,20 @@ module Rawler
|
|
28
29
|
@@url = url
|
29
30
|
end
|
30
31
|
|
31
|
-
def self.
|
32
|
-
|
32
|
+
def self.create_regex(pattern, icase=false)
|
33
|
+
pattern.nil? ? nil : Regexp.new(pattern, icase ? Regexp::IGNORECASE : nil )
|
34
|
+
end
|
35
|
+
|
36
|
+
def self.set_include_pattern(pattern, icase=false)
|
37
|
+
self.include_url_pattern = self.create_regex(pattern, icase)
|
38
|
+
end
|
39
|
+
|
40
|
+
def self.set_skip_pattern(pattern, icase=false)
|
41
|
+
self.skip_url_pattern = self.create_regex(pattern, icase)
|
42
|
+
end
|
43
|
+
|
44
|
+
def self.local=(is_local)
|
45
|
+
pattern = is_local ? "^#{self.url}" : nil
|
46
|
+
self.set_include_pattern(pattern)
|
33
47
|
end
|
34
48
|
end
|
data/lib/rawler/base.rb
CHANGED
@@ -15,6 +15,11 @@ module Rawler
|
|
15
15
|
Rawler.password = options[:password]
|
16
16
|
Rawler.wait = options[:wait]
|
17
17
|
Rawler.css = options[:css]
|
18
|
+
|
19
|
+
Rawler.local = options[:local]
|
20
|
+
|
21
|
+
Rawler.set_include_pattern(options[:include], false) unless options[:include].nil?
|
22
|
+
Rawler.set_include_pattern(options[:iinclude], true) unless options[:iinclude].nil?
|
18
23
|
|
19
24
|
Rawler.set_skip_pattern(options[:skip], false) unless options[:skip].nil?
|
20
25
|
Rawler.set_skip_pattern(options[:iskip], true) unless options[:iskip].nil?
|
data/lib/rawler/crawler.rb
CHANGED
data/rawler.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "rawler"
|
8
|
-
s.version = "0.1.7"
|
8
|
+
s.version = "0.1.8"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Oscar Del Ben"]
|
12
|
-
s.date = "
|
12
|
+
s.date = "2013-09-07"
|
13
13
|
s.description = "Rawler is a tool that crawls the links of your website"
|
14
14
|
s.email = "info@oscardelben.com"
|
15
15
|
s.executables = ["rawler"]
|
@@ -18,6 +18,8 @@ Gem::Specification.new do |s|
|
|
18
18
|
"README.md"
|
19
19
|
]
|
20
20
|
s.files = [
|
21
|
+
".rspec",
|
22
|
+
".travis.yml",
|
21
23
|
"Gemfile",
|
22
24
|
"Gemfile.lock",
|
23
25
|
"LICENSE.txt",
|
@@ -39,8 +41,6 @@ Gem::Specification.new do |s|
|
|
39
41
|
"spec/lib/rawler_spec.rb",
|
40
42
|
"spec/spec.opts",
|
41
43
|
"spec/spec_helper.rb",
|
42
|
-
"test/helper.rb",
|
43
|
-
"test/test_rawler.rb",
|
44
44
|
"vendor/lib-trollop.rb"
|
45
45
|
]
|
46
46
|
s.homepage = "http://github.com/oscardelben/rawler"
|
@@ -53,30 +53,27 @@ Gem::Specification.new do |s|
|
|
53
53
|
s.specification_version = 3
|
54
54
|
|
55
55
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
56
|
-
s.add_runtime_dependency(%q<nokogiri>, ["
|
56
|
+
s.add_runtime_dependency(%q<nokogiri>, ["~> 1.5"])
|
57
57
|
s.add_development_dependency(%q<fakeweb>, [">= 0"])
|
58
58
|
s.add_development_dependency(%q<rspec>, [">= 0"])
|
59
59
|
s.add_development_dependency(%q<shoulda>, [">= 0"])
|
60
|
-
s.add_development_dependency(%q<bundler>, ["~> 1.0
|
60
|
+
s.add_development_dependency(%q<bundler>, ["~> 1.0"])
|
61
61
|
s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
|
62
|
-
s.add_development_dependency(%q<rcov>, [">= 0"])
|
63
62
|
else
|
64
|
-
s.add_dependency(%q<nokogiri>, ["
|
63
|
+
s.add_dependency(%q<nokogiri>, ["~> 1.5"])
|
65
64
|
s.add_dependency(%q<fakeweb>, [">= 0"])
|
66
65
|
s.add_dependency(%q<rspec>, [">= 0"])
|
67
66
|
s.add_dependency(%q<shoulda>, [">= 0"])
|
68
|
-
s.add_dependency(%q<bundler>, ["~> 1.0
|
67
|
+
s.add_dependency(%q<bundler>, ["~> 1.0"])
|
69
68
|
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
70
|
-
s.add_dependency(%q<rcov>, [">= 0"])
|
71
69
|
end
|
72
70
|
else
|
73
|
-
s.add_dependency(%q<nokogiri>, ["
|
71
|
+
s.add_dependency(%q<nokogiri>, ["~> 1.5"])
|
74
72
|
s.add_dependency(%q<fakeweb>, [">= 0"])
|
75
73
|
s.add_dependency(%q<rspec>, [">= 0"])
|
76
74
|
s.add_dependency(%q<shoulda>, [">= 0"])
|
77
|
-
s.add_dependency(%q<bundler>, ["~> 1.0
|
75
|
+
s.add_dependency(%q<bundler>, ["~> 1.0"])
|
78
76
|
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
79
|
-
s.add_dependency(%q<rcov>, [">= 0"])
|
80
77
|
end
|
81
78
|
end
|
82
79
|
|
@@ -8,12 +8,12 @@ describe Rawler::Crawler do
|
|
8
8
|
let(:output) { double('output', :error => nil) }
|
9
9
|
|
10
10
|
before(:each) do
|
11
|
-
Rawler.stub
|
12
|
-
Rawler.stub
|
11
|
+
Rawler.stub(:url).and_return(url)
|
12
|
+
Rawler.stub(:output).and_return(output)
|
13
13
|
end
|
14
14
|
|
15
15
|
context "basic functionality" do
|
16
|
-
|
16
|
+
|
17
17
|
let(:url) { 'http://example.com' }
|
18
18
|
let(:crawler) { Rawler::Crawler.new(url) }
|
19
19
|
let(:content) {
|
@@ -32,87 +32,87 @@ describe Rawler::Crawler do
|
|
32
32
|
it "should parse all links" do
|
33
33
|
crawler.links.should == ['http://example.com/foo', 'http://external.com/bar']
|
34
34
|
end
|
35
|
-
|
35
|
+
|
36
36
|
it "should parse css links" do
|
37
37
|
crawler.css_links.should == ['http://example.com/css/styles.css']
|
38
|
-
end
|
38
|
+
end
|
39
39
|
end
|
40
|
-
|
40
|
+
|
41
41
|
context "relative paths" do
|
42
|
-
|
42
|
+
|
43
43
|
context "base URL ends with a slash" do
|
44
|
-
|
44
|
+
|
45
45
|
let(:url) { 'http://example.com/dir1/dir2/' }
|
46
46
|
let(:crawler) { Rawler::Crawler.new(url) }
|
47
47
|
let(:content) { '<a href="/foo">foo</a> <a href="bar">bar</a> <a href="../baz">baz</a>' }
|
48
|
-
|
48
|
+
|
49
49
|
before(:each) do
|
50
50
|
register(url, content)
|
51
51
|
end
|
52
|
-
|
52
|
+
|
53
53
|
it "should parse relative links" do
|
54
54
|
crawler.links.should == ['http://example.com/foo', 'http://example.com/dir1/dir2/bar', 'http://example.com/dir1/baz']
|
55
55
|
end
|
56
|
-
|
56
|
+
|
57
57
|
end
|
58
|
-
|
58
|
+
|
59
59
|
context "base URL doesn't end with a slash" do
|
60
|
-
|
60
|
+
|
61
61
|
let(:url) { 'http://example.com/dir1/dir2' }
|
62
62
|
let(:crawler) { Rawler::Crawler.new(url) }
|
63
63
|
let(:content) { '<a href="/foo">foo</a> <a href="bar">bar</a> <a href="../baz">baz</a>' }
|
64
|
-
|
64
|
+
|
65
65
|
before(:each) do
|
66
66
|
register(url, content)
|
67
67
|
end
|
68
|
-
|
68
|
+
|
69
69
|
it "should parse relative links" do
|
70
70
|
crawler.links.should == ['http://example.com/foo', 'http://example.com/dir1/bar', 'http://example.com/baz']
|
71
71
|
end
|
72
|
-
|
72
|
+
|
73
73
|
end
|
74
|
-
|
74
|
+
|
75
75
|
end
|
76
|
-
|
76
|
+
|
77
77
|
context "different domains" do
|
78
|
-
|
78
|
+
|
79
79
|
let(:url) { 'http://external.com/path' }
|
80
80
|
let(:crawler) { Rawler::Crawler.new(url) }
|
81
81
|
let(:content) { '<a href="/foo">foo</a>' }
|
82
|
-
|
82
|
+
|
83
83
|
before(:each) do
|
84
|
-
Rawler.stub
|
84
|
+
Rawler.stub(:url).and_return('http://example.com')
|
85
85
|
register(url, content)
|
86
86
|
end
|
87
|
-
|
87
|
+
|
88
88
|
it "should parse relative links" do
|
89
89
|
crawler.links.should == []
|
90
90
|
end
|
91
|
-
|
91
|
+
|
92
92
|
end
|
93
|
-
|
93
|
+
|
94
94
|
context "urls with hash tags" do
|
95
|
-
|
95
|
+
|
96
96
|
let(:url) { 'http://example.com/path' }
|
97
97
|
let(:crawler) { Rawler::Crawler.new(url) }
|
98
98
|
let(:content) { '<a href="/foo#bar">foo</a>' }
|
99
|
-
|
99
|
+
|
100
100
|
before(:each) do
|
101
101
|
register(url, content)
|
102
102
|
end
|
103
|
-
|
103
|
+
|
104
104
|
it "should not encode hashtags" do
|
105
105
|
crawler.links.should == ['http://example.com/foo#bar']
|
106
106
|
end
|
107
|
-
|
107
|
+
|
108
108
|
end
|
109
|
-
|
109
|
+
|
110
110
|
context "urls with unicode characters" do
|
111
|
-
|
111
|
+
|
112
112
|
let(:url) { 'http://example.com' }
|
113
113
|
let(:crawler) { Rawler::Crawler.new(url) }
|
114
114
|
let(:content) { '<a href="http://example.com/写程序容易出现的几个不好的地方">foo</a>' }
|
115
|
-
|
115
|
+
|
116
116
|
before(:each) do
|
117
117
|
register(url, content)
|
118
118
|
end
|
@@ -120,9 +120,9 @@ describe Rawler::Crawler do
|
|
120
120
|
it "should parse unicode links" do
|
121
121
|
crawler.links.should == ['http://example.com/%E5%86%99%E7%A8%8B%E5%BA%8F%E5%AE%B9%E6%98%93%E5%87%BA%E7%8E%B0%E7%9A%84%E5%87%A0%E4%B8%AA%E4%B8%8D%E5%A5%BD%E7%9A%84%E5%9C%B0%E6%96%B9']
|
122
122
|
end
|
123
|
-
|
123
|
+
|
124
124
|
end
|
125
|
-
|
125
|
+
|
126
126
|
context "invalid urls" do
|
127
127
|
|
128
128
|
context "javascript" do
|
@@ -130,11 +130,11 @@ describe Rawler::Crawler do
|
|
130
130
|
let(:crawler) { Rawler::Crawler.new(url) }
|
131
131
|
let(:js_url) { "javascript:fn('nbjmup;jhfs.esf{fio/dpn');" }
|
132
132
|
let(:content) { "<a href=\"#{js_url}\">foo</a><a name=\"foo\">" }
|
133
|
-
|
133
|
+
|
134
134
|
before(:each) do
|
135
135
|
register(url, content)
|
136
136
|
end
|
137
|
-
|
137
|
+
|
138
138
|
it "should return empty links" do
|
139
139
|
crawler.links.should == []
|
140
140
|
end
|
@@ -149,11 +149,11 @@ describe Rawler::Crawler do
|
|
149
149
|
let(:url) { 'http://example.com/path' }
|
150
150
|
let(:crawler) { Rawler::Crawler.new(url) }
|
151
151
|
let(:content) { "<a href=\"mailto:example@example.com\">foo</a><a name=\"foo\">" }
|
152
|
-
|
152
|
+
|
153
153
|
before(:each) do
|
154
154
|
register(url, content)
|
155
155
|
end
|
156
|
-
|
156
|
+
|
157
157
|
it "should return empty links" do
|
158
158
|
crawler.links.should == []
|
159
159
|
end
|
@@ -163,16 +163,16 @@ describe Rawler::Crawler do
|
|
163
163
|
crawler.links
|
164
164
|
end
|
165
165
|
end
|
166
|
-
|
166
|
+
|
167
167
|
context "callto" do
|
168
168
|
let(:url) { 'http://example.com/path' }
|
169
169
|
let(:crawler) { Rawler::Crawler.new(url) }
|
170
170
|
let(:content) { "<a href=\"callto:home22\">foo</a><a name=\"foo\">" }
|
171
|
-
|
171
|
+
|
172
172
|
before(:each) do
|
173
173
|
register(url, content)
|
174
174
|
end
|
175
|
-
|
175
|
+
|
176
176
|
it "should return empty links" do
|
177
177
|
crawler.links.should == []
|
178
178
|
end
|
@@ -187,12 +187,12 @@ describe Rawler::Crawler do
|
|
187
187
|
let(:url) { 'http://example.com/path' }
|
188
188
|
let(:crawler) { Rawler::Crawler.new(url) }
|
189
189
|
let(:content) { "<a href=\"http://example.com/search/page:1/\">foo</a><a href=\"http://example.com/search/page:2/\">foo</a>" }
|
190
|
-
|
190
|
+
|
191
191
|
before(:each) do
|
192
192
|
Rawler.set_skip_pattern('\/search\/(.*\/)?page:[2-9]', false)
|
193
193
|
register(url, content)
|
194
194
|
end
|
195
|
-
|
195
|
+
|
196
196
|
it "should return one links" do
|
197
197
|
crawler.links.length.should eql(1)
|
198
198
|
end
|
@@ -201,63 +201,144 @@ describe Rawler::Crawler do
|
|
201
201
|
crawler.should_not_receive(:write)
|
202
202
|
crawler.links
|
203
203
|
end
|
204
|
+
|
205
|
+
after(:each) do
|
206
|
+
Rawler.set_skip_pattern(nil)
|
207
|
+
end
|
204
208
|
end
|
205
209
|
|
206
210
|
context "case-insensitive skip matches" do
|
207
211
|
let(:url) { 'http://example.com/path' }
|
208
212
|
let(:crawler) { Rawler::Crawler.new(url) }
|
209
213
|
let(:content) { "<a href=\"http://example.com/search/page:1/\">foo</a><a href=\"http://example.com/search/page:2/\">foo</a>" }
|
210
|
-
|
214
|
+
|
211
215
|
before(:each) do
|
212
216
|
Rawler.set_skip_pattern('\/seArcH\/(.*\/)?PAGE:[2-9]', true)
|
213
217
|
register(url, content)
|
214
218
|
end
|
215
|
-
|
219
|
+
|
220
|
+
it "should return one links" do
|
221
|
+
crawler.links.length.should eql(1)
|
222
|
+
end
|
223
|
+
|
224
|
+
it "should not report that it's skipping" do
|
225
|
+
crawler.should_not_receive(:write)
|
226
|
+
crawler.links
|
227
|
+
end
|
228
|
+
|
229
|
+
after(:each) do
|
230
|
+
Rawler.set_skip_pattern(nil)
|
231
|
+
end
|
232
|
+
end
|
233
|
+
|
234
|
+
context "include matches" do
|
235
|
+
let(:url) { 'http://example.com/path' }
|
236
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
237
|
+
let(:content) { "<a href=\"http://example.com/search/page:1/\">foo</a><a href=\"http://example.com/search/page:2/\">foo</a>" }
|
238
|
+
|
239
|
+
before(:each) do
|
240
|
+
Rawler.set_include_pattern('\/search\/(.*\/)?page:[2-9]', false)
|
241
|
+
register(url, content)
|
242
|
+
end
|
243
|
+
|
244
|
+
it "should return one links" do
|
245
|
+
crawler.links.length.should eql(1)
|
246
|
+
crawler.links.should eq(['http://example.com/search/page:2/'])
|
247
|
+
end
|
248
|
+
|
249
|
+
it "should not report that it's including" do
|
250
|
+
crawler.should_not_receive(:write)
|
251
|
+
crawler.links
|
252
|
+
end
|
253
|
+
|
254
|
+
after(:each) do
|
255
|
+
Rawler.set_include_pattern(nil)
|
256
|
+
end
|
257
|
+
end
|
258
|
+
|
259
|
+
context "case-insensitive include matches" do
|
260
|
+
let(:url) { 'http://example.com/path' }
|
261
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
262
|
+
let(:content) { "<a href=\"http://example.com/search/page:1/\">foo</a><a href=\"http://example.com/search/page:2/\">foo</a>" }
|
263
|
+
|
264
|
+
before(:each) do
|
265
|
+
Rawler.set_include_pattern('\/seArcH\/(.*\/)?PAGE:[2-9]', true)
|
266
|
+
register(url, content)
|
267
|
+
end
|
268
|
+
|
216
269
|
it "should return one links" do
|
217
270
|
crawler.links.length.should eql(1)
|
218
271
|
end
|
219
272
|
|
273
|
+
it "should not report that it's including" do
|
274
|
+
crawler.should_not_receive(:write)
|
275
|
+
crawler.links
|
276
|
+
end
|
277
|
+
|
278
|
+
after(:each) do
|
279
|
+
Rawler.set_include_pattern(nil)
|
280
|
+
end
|
281
|
+
end
|
282
|
+
|
283
|
+
context "non-local site should be omitted when local flag is used" do
|
284
|
+
let(:url) { 'http://example.com/' }
|
285
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
286
|
+
let(:content) { "<a href=\"http://example.com/page1/\">foo</a><a href=\"http://example.org/page2\">foo</a>" }
|
287
|
+
|
288
|
+
before(:each) do
|
289
|
+
Rawler.local = true
|
290
|
+
register(url, content)
|
291
|
+
end
|
292
|
+
|
293
|
+
it "should return one link" do
|
294
|
+
crawler.links.length.should eql(1)
|
295
|
+
end
|
296
|
+
|
220
297
|
it "should not report that it's skipping" do
|
221
298
|
crawler.should_not_receive(:write)
|
222
299
|
crawler.links
|
223
300
|
end
|
301
|
+
|
302
|
+
after(:each) do
|
303
|
+
Rawler.local = false
|
304
|
+
end
|
224
305
|
end
|
225
306
|
|
226
307
|
end
|
227
308
|
|
228
309
|
context "content type" do
|
229
|
-
|
310
|
+
|
230
311
|
['text/plain', 'text/css', 'image/jpeg'].each do |content_type|
|
231
|
-
|
312
|
+
|
232
313
|
let(:url) { 'http://example.com' }
|
233
314
|
let(:crawler) { Rawler::Crawler.new(url) }
|
234
|
-
|
315
|
+
|
235
316
|
before(:each) do
|
236
317
|
register(url, '', 200, :content_type => content_type)
|
237
318
|
end
|
238
|
-
|
319
|
+
|
239
320
|
it "should ignore '#{content_type}'" do
|
240
321
|
crawler.links.should == []
|
241
322
|
end
|
242
|
-
|
323
|
+
|
243
324
|
end
|
244
325
|
end
|
245
|
-
|
326
|
+
|
246
327
|
context "Exceptions" do
|
247
|
-
|
328
|
+
|
248
329
|
let(:url) { 'http://example.com' }
|
249
330
|
let(:crawler) { Rawler::Crawler.new(url) }
|
250
|
-
|
331
|
+
|
251
332
|
before(:each) do
|
252
333
|
register(url, '')
|
253
334
|
end
|
254
|
-
|
335
|
+
|
255
336
|
context "Errno::ECONNREFUSED" do
|
256
|
-
|
337
|
+
|
257
338
|
before(:each) do
|
258
|
-
Rawler::Request.stub
|
339
|
+
Rawler::Request.stub(:get).and_raise Errno::ECONNREFUSED
|
259
340
|
end
|
260
|
-
|
341
|
+
|
261
342
|
it "should return an empty array" do
|
262
343
|
crawler.links.should == []
|
263
344
|
end
|
@@ -266,14 +347,14 @@ describe Rawler::Crawler do
|
|
266
347
|
output.should_receive(:error).with("Couldn't connect to #{url}")
|
267
348
|
|
268
349
|
crawler.links
|
269
|
-
end
|
270
|
-
|
350
|
+
end
|
351
|
+
|
271
352
|
end
|
272
|
-
|
353
|
+
|
273
354
|
context "Errno::ETIMEDOUT" do
|
274
|
-
|
355
|
+
|
275
356
|
before(:each) do
|
276
|
-
Rawler::Request.stub
|
357
|
+
Rawler::Request.stub(:get).and_raise Errno::ETIMEDOUT
|
277
358
|
end
|
278
359
|
|
279
360
|
it "should return an empty array when raising Errno::ETIMEDOUT" do
|
@@ -285,33 +366,33 @@ describe Rawler::Crawler do
|
|
285
366
|
|
286
367
|
crawler.links
|
287
368
|
end
|
288
|
-
|
369
|
+
|
289
370
|
end
|
290
|
-
|
371
|
+
|
291
372
|
end
|
292
|
-
|
373
|
+
|
293
374
|
context "http basic" do
|
294
|
-
|
375
|
+
|
295
376
|
let(:url) { 'http://example.com' }
|
296
377
|
let(:content) { '<a href="http://example.com/secret-path">foo</a>' }
|
297
378
|
let(:crawler) { Rawler::Crawler.new('http://example.com/secret') }
|
298
|
-
|
379
|
+
|
299
380
|
before(:each) do
|
300
381
|
register('http://example.com/secret', '', :status => ["401", "Unauthorized"])
|
301
382
|
register('http://foo:bar@example.com/secret', content)
|
302
383
|
|
303
|
-
Rawler.stub
|
304
|
-
Rawler.stub
|
384
|
+
Rawler.stub(:username).and_return('foo')
|
385
|
+
Rawler.stub(:password).and_return('bar')
|
305
386
|
end
|
306
|
-
|
387
|
+
|
307
388
|
it "should crawl http basic pages" do
|
308
389
|
crawler.links.should == ['http://example.com/secret-path']
|
309
390
|
end
|
310
|
-
|
391
|
+
|
311
392
|
end
|
312
|
-
|
393
|
+
|
313
394
|
context "url domain" do
|
314
|
-
|
395
|
+
|
315
396
|
let(:content) {
|
316
397
|
content = <<-content
|
317
398
|
<a href="http://example.com/valid">foo</a>
|
@@ -322,11 +403,11 @@ describe Rawler::Crawler do
|
|
322
403
|
}
|
323
404
|
let(:url) { 'http://example.com' }
|
324
405
|
let(:crawler) { Rawler::Crawler.new(url) }
|
325
|
-
|
406
|
+
|
326
407
|
before(:each) do
|
327
408
|
register(url, content)
|
328
409
|
end
|
329
|
-
|
410
|
+
|
330
411
|
it "should ignore links other than http or https" do
|
331
412
|
crawler.links.should == ['http://example.com/valid', 'https://foo.com', 'http://fooo.com']
|
332
413
|
end
|
@@ -336,11 +417,11 @@ describe Rawler::Crawler do
|
|
336
417
|
let(:content) { '<a href="http://foo;bar">foo</a>' }
|
337
418
|
let(:url) { 'http://example.com' }
|
338
419
|
let(:crawler) { Rawler::Crawler.new(url) }
|
339
|
-
|
420
|
+
|
340
421
|
before(:each) do
|
341
422
|
register(url, content)
|
342
423
|
end
|
343
|
-
|
424
|
+
|
344
425
|
it "should notify about the invalid url" do
|
345
426
|
output.should_receive(:error).with('Invalid url: http://foo;bar - Called from: http://example.com')
|
346
427
|
crawler.links.should == []
|
data/spec/lib/rawler_spec.rb
CHANGED
@@ -8,7 +8,7 @@ describe Rawler::Base do
|
|
8
8
|
let(:rawler) { Rawler::Base.new('http://example.com', output) }
|
9
9
|
|
10
10
|
before(:each) do
|
11
|
-
Rawler.stub
|
11
|
+
Rawler.stub(:output).and_return(output)
|
12
12
|
register('http://example.com', site)
|
13
13
|
end
|
14
14
|
|
@@ -94,6 +94,7 @@ describe Rawler::Base do
|
|
94
94
|
end
|
95
95
|
|
96
96
|
it "should not validate links on external pages" do
|
97
|
+
register('http://example.com/', '<a href="http://external.com/foo">x</a>')
|
97
98
|
register('http://example.com/foo', '<a href="http://external.com/foo">x</a>')
|
98
99
|
register('http://external.com/foo', '<a href="http://external.com/bar">x</a>')
|
99
100
|
register('http://external.com/bar', '')
|
data/spec/spec_helper.rb
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
module Kernel
|
2
|
-
|
2
|
+
|
3
3
|
def sleep(duration)
|
4
4
|
nil
|
5
5
|
end
|
6
|
-
|
6
|
+
|
7
7
|
end
|
8
8
|
|
9
9
|
|
@@ -16,3 +16,8 @@ FakeWeb.allow_net_connect = false
|
|
16
16
|
def register(uri, content, status=200, options={})
|
17
17
|
FakeWeb.register_uri(:any, uri, { :body => content, :status => status, :content_type => 'text/html' }.merge(options))
|
18
18
|
end
|
19
|
+
|
20
|
+
if ENV['COVERAGE']
|
21
|
+
require 'simplecov'
|
22
|
+
SimpleCov.start
|
23
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.7
|
4
|
+
version: 0.1.8
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,24 +9,24 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2013-09-07 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
16
16
|
requirement: !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
|
-
- -
|
19
|
+
- - ~>
|
20
20
|
- !ruby/object:Gem::Version
|
21
|
-
version: '
|
21
|
+
version: '1.5'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
24
|
version_requirements: !ruby/object:Gem::Requirement
|
25
25
|
none: false
|
26
26
|
requirements:
|
27
|
-
- -
|
27
|
+
- - ~>
|
28
28
|
- !ruby/object:Gem::Version
|
29
|
-
version: '
|
29
|
+
version: '1.5'
|
30
30
|
- !ruby/object:Gem::Dependency
|
31
31
|
name: fakeweb
|
32
32
|
requirement: !ruby/object:Gem::Requirement
|
@@ -82,7 +82,7 @@ dependencies:
|
|
82
82
|
requirements:
|
83
83
|
- - ~>
|
84
84
|
- !ruby/object:Gem::Version
|
85
|
-
version: 1.0
|
85
|
+
version: '1.0'
|
86
86
|
type: :development
|
87
87
|
prerelease: false
|
88
88
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -90,7 +90,7 @@ dependencies:
|
|
90
90
|
requirements:
|
91
91
|
- - ~>
|
92
92
|
- !ruby/object:Gem::Version
|
93
|
-
version: 1.0
|
93
|
+
version: '1.0'
|
94
94
|
- !ruby/object:Gem::Dependency
|
95
95
|
name: jeweler
|
96
96
|
requirement: !ruby/object:Gem::Requirement
|
@@ -107,22 +107,6 @@ dependencies:
|
|
107
107
|
- - ~>
|
108
108
|
- !ruby/object:Gem::Version
|
109
109
|
version: 1.6.4
|
110
|
-
- !ruby/object:Gem::Dependency
|
111
|
-
name: rcov
|
112
|
-
requirement: !ruby/object:Gem::Requirement
|
113
|
-
none: false
|
114
|
-
requirements:
|
115
|
-
- - ! '>='
|
116
|
-
- !ruby/object:Gem::Version
|
117
|
-
version: '0'
|
118
|
-
type: :development
|
119
|
-
prerelease: false
|
120
|
-
version_requirements: !ruby/object:Gem::Requirement
|
121
|
-
none: false
|
122
|
-
requirements:
|
123
|
-
- - ! '>='
|
124
|
-
- !ruby/object:Gem::Version
|
125
|
-
version: '0'
|
126
110
|
description: Rawler is a tool that crawls the links of your website
|
127
111
|
email: info@oscardelben.com
|
128
112
|
executables:
|
@@ -132,6 +116,8 @@ extra_rdoc_files:
|
|
132
116
|
- LICENSE.txt
|
133
117
|
- README.md
|
134
118
|
files:
|
119
|
+
- .rspec
|
120
|
+
- .travis.yml
|
135
121
|
- Gemfile
|
136
122
|
- Gemfile.lock
|
137
123
|
- LICENSE.txt
|
@@ -153,8 +139,6 @@ files:
|
|
153
139
|
- spec/lib/rawler_spec.rb
|
154
140
|
- spec/spec.opts
|
155
141
|
- spec/spec_helper.rb
|
156
|
-
- test/helper.rb
|
157
|
-
- test/test_rawler.rb
|
158
142
|
- vendor/lib-trollop.rb
|
159
143
|
homepage: http://github.com/oscardelben/rawler
|
160
144
|
licenses:
|
@@ -171,7 +155,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
171
155
|
version: '0'
|
172
156
|
segments:
|
173
157
|
- 0
|
174
|
-
hash:
|
158
|
+
hash: 3460091796500092184
|
175
159
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
176
160
|
none: false
|
177
161
|
requirements:
|
data/test/helper.rb
DELETED
@@ -1,18 +0,0 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require 'bundler'
|
3
|
-
begin
|
4
|
-
Bundler.setup(:default, :development)
|
5
|
-
rescue Bundler::BundlerError => e
|
6
|
-
$stderr.puts e.message
|
7
|
-
$stderr.puts "Run `bundle install` to install missing gems"
|
8
|
-
exit e.status_code
|
9
|
-
end
|
10
|
-
require 'test/unit'
|
11
|
-
require 'shoulda'
|
12
|
-
|
13
|
-
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
14
|
-
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
15
|
-
require 'rawler'
|
16
|
-
|
17
|
-
class Test::Unit::TestCase
|
18
|
-
end
|