rawler 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.rspec +1 -0
- data/.travis.yml +5 -0
- data/Gemfile +2 -3
- data/Gemfile.lock +35 -17
- data/README.md +7 -4
- data/Rakefile +1 -31
- data/VERSION +1 -1
- data/bin/rawler +7 -4
- data/lib/rawler.rb +16 -2
- data/lib/rawler/base.rb +5 -0
- data/lib/rawler/crawler.rb +2 -0
- data/rawler.gemspec +10 -13
- data/spec/lib/rawler/crawler_spec.rb +157 -76
- data/spec/lib/rawler_spec.rb +2 -1
- data/spec/spec_helper.rb +7 -2
- metadata +11 -27
- data/test/helper.rb +0 -18
- data/test/test_rawler.rb +0 -7
data/.rspec
ADDED
@@ -0,0 +1 @@
+--order rand
data/.travis.yml
ADDED
data/Gemfile
CHANGED
@@ -5,13 +5,12 @@ source "http://rubygems.org"
 
 # Add dependencies to develop your gem here.
 # Include everything needed to run rake, tests, features, etc.
-gem 'nokogiri'
+gem 'nokogiri', '~> 1.5'
 
 group :development, :test do
 gem 'fakeweb'
 gem "rspec"
 gem "shoulda", ">= 0"
-gem "bundler", "~> 1.0
+gem "bundler", "~> 1.0"
 gem "jeweler", "~> 1.6.4"
-gem "rcov", ">= 0"
 end
data/Gemfile.lock
CHANGED
@@ -1,34 +1,52 @@
 GEM
   remote: http://rubygems.org/
   specs:
-
+    activesupport (4.0.0)
+      i18n (~> 0.6, >= 0.6.4)
+      minitest (~> 4.2)
+      multi_json (~> 1.3)
+      thread_safe (~> 0.1)
+      tzinfo (~> 0.3.37)
+    atomic (1.1.13)
+    diff-lcs (1.2.4)
     fakeweb (1.3.0)
-    git (1.2.
+    git (1.2.6)
+    i18n (0.6.5)
     jeweler (1.6.4)
       bundler (~> 1.0)
       git (>= 1.2.5)
       rake
-
-
-
-
-
-
-
-
-
-
-    rspec-
-
+    mini_portile (0.5.1)
+    minitest (4.7.5)
+    multi_json (1.7.9)
+    nokogiri (1.6.0)
+      mini_portile (~> 0.5.0)
+    rake (10.1.0)
+    rspec (2.14.1)
+      rspec-core (~> 2.14.0)
+      rspec-expectations (~> 2.14.0)
+      rspec-mocks (~> 2.14.0)
+    rspec-core (2.14.5)
+    rspec-expectations (2.14.2)
+      diff-lcs (>= 1.1.3, < 2.0)
+    rspec-mocks (2.14.3)
+    shoulda (3.5.0)
+      shoulda-context (~> 1.0, >= 1.0.1)
+      shoulda-matchers (>= 1.4.1, < 3.0)
+    shoulda-context (1.1.5)
+    shoulda-matchers (2.3.0)
+      activesupport (>= 3.0.0)
+    thread_safe (0.1.2)
+      atomic
+    tzinfo (0.3.37)
 
 PLATFORMS
   ruby
 
 DEPENDENCIES
-  bundler (~> 1.0
+  bundler (~> 1.0)
   fakeweb
   jeweler (~> 1.6.4)
-  nokogiri
-  rcov
+  nokogiri (~> 1.5)
   rspec
   shoulda
data/README.md
CHANGED
@@ -9,14 +9,17 @@ Rawler will only parse pages with content type 'text/html', but it will check fo
 rawler http://example.com [options]
 
 where [options] are:
---username, -u <s>:
---password, -p <s>:
+--username, -u <s>: HTTP Basic Username
+--password, -p <s>: HTTP Basic Password
 --wait, -w <f>: Seconds to wait between requests, may be fractional e.g. '1.5' (default: 3.0)
 --log, -l: Log results to file rawler_log.txt
 --logfile, -o <s>: Specify logfile, implies --log (default: rawler_log.txt)
 --css, -c: Check CSS links
---skip, -s <s>: Skip
---iskip, -i <s>: Skip
+--skip, -s <s>: Skip URLs that match a regexp
+--iskip, -i <s>: Skip URLs that match a case insensitive regexp
+--include <s>: Only include URLs that match a regexp
+--iinclude <s>: Only include URLs that match a case insensitive regexp
+--local <s>: Restrict to the given URL and below. Equivalent to '--include ^http://mysite.com/*'.
 --version, -v: Print version and exit
 --help, -h: Show this message
 
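Note: the expanded option list above is the user-facing side of the new filtering support in 0.1.8. A hypothetical invocation combining the new flags (the site URL and pattern are illustrative only) could look like:

    rawler http://mysite.com --local --iskip logout --log

Here --local restricts the crawl to the start URL and below, --iskip drops any URL matching the case insensitive pattern, and --log writes results to rawler_log.txt.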
data/Rakefile
CHANGED
@@ -28,24 +28,7 @@ Jeweler::Tasks.new do |gem|
 end
 Jeweler::RubygemsDotOrgTasks.new
 
-require '
-Rake::TestTask.new(:test) do |test|
-test.libs << 'lib' << 'test'
-test.pattern = 'test/**/test_*.rb'
-test.verbose = true
-end
-
-require 'rcov/rcovtask'
-Rcov::RcovTask.new do |test|
-test.libs << 'test'
-test.pattern = 'test/**/test_*.rb'
-test.verbose = true
-test.rcov_opts << '--exclude "gems/*"'
-end
-
-task :default => :test
-
-require 'rake/rdoctask'
+require 'rdoc/task'
 Rake::RDocTask.new do |rdoc|
 version = File.exist?('VERSION') ? File.read('VERSION') : ""
 
@@ -54,16 +37,3 @@ Rake::RDocTask.new do |rdoc|
 rdoc.rdoc_files.include('README*')
 rdoc.rdoc_files.include('lib/**/*.rb')
 end
-
-desc 'generate docs'
-task :rocco do
-#%x!rm -r html/*!
-
-Dir.chdir "lib"
-
-files = Dir['**/*.*']
-
-files.each do |file|
-%x!rocco #{file} -o ../html!
-end
-end
data/VERSION
CHANGED
@@ -1 +1 @@
-0.1.7
+0.1.8
data/bin/rawler
CHANGED
@@ -14,14 +14,17 @@ Usage:
 where [options] are:
 EOS
 
-opt :username, "
-opt :password, "
+opt :username, "HTTP Basic Username", :type => :string
+opt :password, "HTTP Basic Password", :type => :string
 opt :wait, "Seconds to wait between requests, may be fractional e.g. '1.5'", :type => :float, :default => 3.0
 opt :log, "Log results to file #{Rawler::Base::DEFAULT_LOGFILE}", :type => :boolean, :default => false
 opt :logfile, "Specify logfile, implies --log", :type => :string, :default => Rawler::Base::DEFAULT_LOGFILE
 opt :css, "Check CSS links", :type => :boolean, :default => false
-opt :skip, "Skip
-opt :iskip, "Skip
+opt :skip, "Skip URLs that match a pattern", :type => :string
+opt :iskip, "Skip URLs that match a case insensitive pattern", :type => :string
+opt :include, "Only include URLS that match a pattern", :type => :string
+opt :iinclude, "Only include URLS that match a case insensitive pattern. Equivalent to '--include ^http://mysite.com/*'.", :type => :string
+opt :local, "Restrict to the given URL and below", :type => :boolean, :default => false
 end
 
 
data/lib/rawler.rb
CHANGED
@@ -12,6 +12,7 @@ module Rawler
 mattr_accessor :username, :password
 mattr_accessor :log, :logfile
 mattr_accessor :css
+mattr_accessor :include_url_pattern
 mattr_accessor :skip_url_pattern
 
 autoload :Base, "rawler/base"
@@ -28,7 +29,20 @@ module Rawler
 @@url = url
 end
 
-def self.
-
+def self.create_regex(pattern, icase=false)
+pattern.nil? ? nil : Regexp.new(pattern, icase ? Regexp::IGNORECASE : nil )
+end
+
+def self.set_include_pattern(pattern, icase=false)
+self.include_url_pattern = self.create_regex(pattern, icase)
+end
+
+def self.set_skip_pattern(pattern, icase=false)
+self.skip_url_pattern = self.create_regex(pattern, icase)
+end
+
+def self.local=(is_local)
+pattern = is_local ? "^#{self.url}" : nil
+self.set_include_pattern(pattern)
 end
 end
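Note: the helpers added above are thin wrappers around Regexp.new, and they are what the new --include/--iinclude/--local options feed into (see data/lib/rawler/base.rb below). A minimal sketch of how they behave, assuming the gem is installed, that the class-level url= setter shown earlier in this file is available, and using a purely illustrative start URL:

    require 'rawler'

    Rawler.url = 'http://example.com'

    # --include / --iinclude are wired to set_include_pattern
    Rawler.set_include_pattern('\/search\/', false)
    Rawler.include_url_pattern   # => /\/search\//

    # --iskip builds a case insensitive skip regexp
    Rawler.set_skip_pattern('LOGOUT', true)
    Rawler.skip_url_pattern      # => /LOGOUT/i

    # --local is shorthand for an include pattern anchored at the start URL
    Rawler.local = true
    Rawler.include_url_pattern   # => /^http:\/\/example.com/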
data/lib/rawler/base.rb
CHANGED
@@ -15,6 +15,11 @@ module Rawler
 Rawler.password = options[:password]
 Rawler.wait = options[:wait]
 Rawler.css = options[:css]
+
+Rawler.local = options[:local]
+
+Rawler.set_include_pattern(options[:include], false) unless options[:include].nil?
+Rawler.set_include_pattern(options[:iinclude], true) unless options[:iinclude].nil?
 
 Rawler.set_skip_pattern(options[:skip], false) unless options[:skip].nil?
 Rawler.set_skip_pattern(options[:iskip], true) unless options[:iskip].nil?
data/lib/rawler/crawler.rb
CHANGED
data/rawler.gemspec
CHANGED
@@ -5,11 +5,11 @@
 
 Gem::Specification.new do |s|
 s.name = "rawler"
-s.version = "0.1.7"
+s.version = "0.1.8"
 
 s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
 s.authors = ["Oscar Del Ben"]
-s.date = "
+s.date = "2013-09-07"
 s.description = "Rawler is a tool that crawls the links of your website"
 s.email = "info@oscardelben.com"
 s.executables = ["rawler"]
@@ -18,6 +18,8 @@ Gem::Specification.new do |s|
 "README.md"
 ]
 s.files = [
+".rspec",
+".travis.yml",
 "Gemfile",
 "Gemfile.lock",
 "LICENSE.txt",
@@ -39,8 +41,6 @@ Gem::Specification.new do |s|
 "spec/lib/rawler_spec.rb",
 "spec/spec.opts",
 "spec/spec_helper.rb",
-"test/helper.rb",
-"test/test_rawler.rb",
 "vendor/lib-trollop.rb"
 ]
 s.homepage = "http://github.com/oscardelben/rawler"
@@ -53,30 +53,27 @@ Gem::Specification.new do |s|
 s.specification_version = 3
 
 if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
-s.add_runtime_dependency(%q<nokogiri>, ["
+s.add_runtime_dependency(%q<nokogiri>, ["~> 1.5"])
 s.add_development_dependency(%q<fakeweb>, [">= 0"])
 s.add_development_dependency(%q<rspec>, [">= 0"])
 s.add_development_dependency(%q<shoulda>, [">= 0"])
-s.add_development_dependency(%q<bundler>, ["~> 1.0
+s.add_development_dependency(%q<bundler>, ["~> 1.0"])
 s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
-s.add_development_dependency(%q<rcov>, [">= 0"])
 else
-s.add_dependency(%q<nokogiri>, ["
+s.add_dependency(%q<nokogiri>, ["~> 1.5"])
 s.add_dependency(%q<fakeweb>, [">= 0"])
 s.add_dependency(%q<rspec>, [">= 0"])
 s.add_dependency(%q<shoulda>, [">= 0"])
-s.add_dependency(%q<bundler>, ["~> 1.0
+s.add_dependency(%q<bundler>, ["~> 1.0"])
 s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
-s.add_dependency(%q<rcov>, [">= 0"])
 end
 else
-s.add_dependency(%q<nokogiri>, ["
+s.add_dependency(%q<nokogiri>, ["~> 1.5"])
 s.add_dependency(%q<fakeweb>, [">= 0"])
 s.add_dependency(%q<rspec>, [">= 0"])
 s.add_dependency(%q<shoulda>, [">= 0"])
-s.add_dependency(%q<bundler>, ["~> 1.0
+s.add_dependency(%q<bundler>, ["~> 1.0"])
 s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
-s.add_dependency(%q<rcov>, [">= 0"])
 end
 end
 
data/spec/lib/rawler/crawler_spec.rb
CHANGED
@@ -8,12 +8,12 @@ describe Rawler::Crawler do
 let(:output) { double('output', :error => nil) }
 
 before(:each) do
-Rawler.stub
-Rawler.stub
+Rawler.stub(:url).and_return(url)
+Rawler.stub(:output).and_return(output)
 end
 
 context "basic functionality" do
-
+
 let(:url) { 'http://example.com' }
 let(:crawler) { Rawler::Crawler.new(url) }
 let(:content) {
@@ -32,87 +32,87 @@ describe Rawler::Crawler do
 it "should parse all links" do
 crawler.links.should == ['http://example.com/foo', 'http://external.com/bar']
 end
-
+
 it "should parse css links" do
 crawler.css_links.should == ['http://example.com/css/styles.css']
-end
+end
 end
-
+
 context "relative paths" do
-
+
 context "base URL ends with a slash" do
-
+
 let(:url) { 'http://example.com/dir1/dir2/' }
 let(:crawler) { Rawler::Crawler.new(url) }
 let(:content) { '<a href="/foo">foo</a> <a href="bar">bar</a> <a href="../baz">baz</a>' }
-
+
 before(:each) do
 register(url, content)
 end
-
+
 it "should parse relative links" do
 crawler.links.should == ['http://example.com/foo', 'http://example.com/dir1/dir2/bar', 'http://example.com/dir1/baz']
 end
-
+
 end
-
+
 context "base URL doesn't end with a slash" do
-
+
 let(:url) { 'http://example.com/dir1/dir2' }
 let(:crawler) { Rawler::Crawler.new(url) }
 let(:content) { '<a href="/foo">foo</a> <a href="bar">bar</a> <a href="../baz">baz</a>' }
-
+
 before(:each) do
 register(url, content)
 end
-
+
 it "should parse relative links" do
 crawler.links.should == ['http://example.com/foo', 'http://example.com/dir1/bar', 'http://example.com/baz']
 end
-
+
 end
-
+
 end
-
+
 context "different domains" do
-
+
 let(:url) { 'http://external.com/path' }
 let(:crawler) { Rawler::Crawler.new(url) }
 let(:content) { '<a href="/foo">foo</a>' }
-
+
 before(:each) do
-Rawler.stub
+Rawler.stub(:url).and_return('http://example.com')
 register(url, content)
 end
-
+
 it "should parse relative links" do
 crawler.links.should == []
 end
-
+
 end
-
+
 context "urls with hash tags" do
-
+
 let(:url) { 'http://example.com/path' }
 let(:crawler) { Rawler::Crawler.new(url) }
 let(:content) { '<a href="/foo#bar">foo</a>' }
-
+
 before(:each) do
 register(url, content)
 end
-
+
 it "should not encode hashtags" do
 crawler.links.should == ['http://example.com/foo#bar']
 end
-
+
 end
-
+
 context "urls with unicode characters" do
-
+
 let(:url) { 'http://example.com' }
 let(:crawler) { Rawler::Crawler.new(url) }
 let(:content) { '<a href="http://example.com/写程序容易出现的几个不好的地方">foo</a>' }
-
+
 before(:each) do
 register(url, content)
 end
@@ -120,9 +120,9 @@ describe Rawler::Crawler do
 it "should parse unicode links" do
 crawler.links.should == ['http://example.com/%E5%86%99%E7%A8%8B%E5%BA%8F%E5%AE%B9%E6%98%93%E5%87%BA%E7%8E%B0%E7%9A%84%E5%87%A0%E4%B8%AA%E4%B8%8D%E5%A5%BD%E7%9A%84%E5%9C%B0%E6%96%B9']
 end
-
+
 end
-
+
 context "invalid urls" do
 
 context "javascript" do
@@ -130,11 +130,11 @@ describe Rawler::Crawler do
 let(:crawler) { Rawler::Crawler.new(url) }
 let(:js_url) { "javascript:fn('nbjmup;jhfs.esf{fio/dpn');" }
 let(:content) { "<a href=\"#{js_url}\">foo</a><a name=\"foo\">" }
-
+
 before(:each) do
 register(url, content)
 end
-
+
 it "should return empty links" do
 crawler.links.should == []
 end
@@ -149,11 +149,11 @@ describe Rawler::Crawler do
 let(:url) { 'http://example.com/path' }
 let(:crawler) { Rawler::Crawler.new(url) }
 let(:content) { "<a href=\"mailto:example@example.com\">foo</a><a name=\"foo\">" }
-
+
 before(:each) do
 register(url, content)
 end
-
+
 it "should return empty links" do
 crawler.links.should == []
 end
@@ -163,16 +163,16 @@ describe Rawler::Crawler do
 crawler.links
 end
 end
-
+
 context "callto" do
 let(:url) { 'http://example.com/path' }
 let(:crawler) { Rawler::Crawler.new(url) }
 let(:content) { "<a href=\"callto:home22\">foo</a><a name=\"foo\">" }
-
+
 before(:each) do
 register(url, content)
 end
-
+
 it "should return empty links" do
 crawler.links.should == []
 end
@@ -187,12 +187,12 @@ describe Rawler::Crawler do
 let(:url) { 'http://example.com/path' }
 let(:crawler) { Rawler::Crawler.new(url) }
 let(:content) { "<a href=\"http://example.com/search/page:1/\">foo</a><a href=\"http://example.com/search/page:2/\">foo</a>" }
-
+
 before(:each) do
 Rawler.set_skip_pattern('\/search\/(.*\/)?page:[2-9]', false)
 register(url, content)
 end
-
+
 it "should return one links" do
 crawler.links.length.should eql(1)
 end
@@ -201,63 +201,144 @@ describe Rawler::Crawler do
 crawler.should_not_receive(:write)
 crawler.links
 end
+
+after(:each) do
+Rawler.set_skip_pattern(nil)
+end
 end
 
 context "case-insensitive skip matches" do
 let(:url) { 'http://example.com/path' }
 let(:crawler) { Rawler::Crawler.new(url) }
 let(:content) { "<a href=\"http://example.com/search/page:1/\">foo</a><a href=\"http://example.com/search/page:2/\">foo</a>" }
-
+
 before(:each) do
 Rawler.set_skip_pattern('\/seArcH\/(.*\/)?PAGE:[2-9]', true)
 register(url, content)
 end
-
+
+it "should return one links" do
+crawler.links.length.should eql(1)
+end
+
+it "should not report that it's skipping" do
+crawler.should_not_receive(:write)
+crawler.links
+end
+
+after(:each) do
+Rawler.set_skip_pattern(nil)
+end
+end
+
+context "include matches" do
+let(:url) { 'http://example.com/path' }
+let(:crawler) { Rawler::Crawler.new(url) }
+let(:content) { "<a href=\"http://example.com/search/page:1/\">foo</a><a href=\"http://example.com/search/page:2/\">foo</a>" }
+
+before(:each) do
+Rawler.set_include_pattern('\/search\/(.*\/)?page:[2-9]', false)
+register(url, content)
+end
+
+it "should return one links" do
+crawler.links.length.should eql(1)
+crawler.links.should eq(['http://example.com/search/page:2/'])
+end
+
+it "should not report that it's including" do
+crawler.should_not_receive(:write)
+crawler.links
+end
+
+after(:each) do
+Rawler.set_include_pattern(nil)
+end
+end
+
+context "case-insensitive include matches" do
+let(:url) { 'http://example.com/path' }
+let(:crawler) { Rawler::Crawler.new(url) }
+let(:content) { "<a href=\"http://example.com/search/page:1/\">foo</a><a href=\"http://example.com/search/page:2/\">foo</a>" }
+
+before(:each) do
+Rawler.set_include_pattern('\/seArcH\/(.*\/)?PAGE:[2-9]', true)
+register(url, content)
+end
+
 it "should return one links" do
 crawler.links.length.should eql(1)
 end
 
+it "should not report that it's including" do
+crawler.should_not_receive(:write)
+crawler.links
+end
+
+after(:each) do
+Rawler.set_include_pattern(nil)
+end
+end
+
+context "non-local site should be omitted when local flag is used" do
+let(:url) { 'http://example.com/' }
+let(:crawler) { Rawler::Crawler.new(url) }
+let(:content) { "<a href=\"http://example.com/page1/\">foo</a><a href=\"http://example.org/page2\">foo</a>" }
+
+before(:each) do
+Rawler.local = true
+register(url, content)
+end
+
+it "should return one link" do
+crawler.links.length.should eql(1)
+end
+
 it "should not report that it's skipping" do
 crawler.should_not_receive(:write)
 crawler.links
 end
+
+after(:each) do
+Rawler.local = false
+end
 end
 
 end
 
 context "content type" do
-
+
 ['text/plain', 'text/css', 'image/jpeg'].each do |content_type|
-
+
 let(:url) { 'http://example.com' }
 let(:crawler) { Rawler::Crawler.new(url) }
-
+
 before(:each) do
 register(url, '', 200, :content_type => content_type)
 end
-
+
 it "should ignore '#{content_type}'" do
 crawler.links.should == []
 end
-
+
 end
 end
-
+
 context "Exceptions" do
-
+
 let(:url) { 'http://example.com' }
 let(:crawler) { Rawler::Crawler.new(url) }
-
+
 before(:each) do
 register(url, '')
 end
-
+
 context "Errno::ECONNREFUSED" do
-
+
 before(:each) do
-Rawler::Request.stub
+Rawler::Request.stub(:get).and_raise Errno::ECONNREFUSED
 end
-
+
 it "should return an empty array" do
 crawler.links.should == []
 end
@@ -266,14 +347,14 @@ describe Rawler::Crawler do
 output.should_receive(:error).with("Couldn't connect to #{url}")
 
 crawler.links
-end
-
+end
+
 end
-
+
 context "Errno::ETIMEDOUT" do
-
+
 before(:each) do
-Rawler::Request.stub
+Rawler::Request.stub(:get).and_raise Errno::ETIMEDOUT
 end
 
 it "should return an empty array when raising Errno::ETIMEDOUT" do
@@ -285,33 +366,33 @@ describe Rawler::Crawler do
 
 crawler.links
 end
-
+
 end
-
+
 end
-
+
 context "http basic" do
-
+
 let(:url) { 'http://example.com' }
 let(:content) { '<a href="http://example.com/secret-path">foo</a>' }
 let(:crawler) { Rawler::Crawler.new('http://example.com/secret') }
-
+
 before(:each) do
 register('http://example.com/secret', '', :status => ["401", "Unauthorized"])
 register('http://foo:bar@example.com/secret', content)
 
-Rawler.stub
-Rawler.stub
+Rawler.stub(:username).and_return('foo')
+Rawler.stub(:password).and_return('bar')
 end
-
+
 it "should crawl http basic pages" do
 crawler.links.should == ['http://example.com/secret-path']
 end
-
+
 end
-
+
 context "url domain" do
-
+
 let(:content) {
 content = <<-content
 <a href="http://example.com/valid">foo</a>
@@ -322,11 +403,11 @@ describe Rawler::Crawler do
 }
 let(:url) { 'http://example.com' }
 let(:crawler) { Rawler::Crawler.new(url) }
-
+
 before(:each) do
 register(url, content)
 end
-
+
 it "should ignore links other than http or https" do
 crawler.links.should == ['http://example.com/valid', 'https://foo.com', 'http://fooo.com']
 end
@@ -336,11 +417,11 @@ describe Rawler::Crawler do
 let(:content) { '<a href="http://foo;bar">foo</a>' }
 let(:url) { 'http://example.com' }
 let(:crawler) { Rawler::Crawler.new(url) }
-
+
 before(:each) do
 register(url, content)
 end
-
+
 it "should notify about the invalid url" do
 output.should_receive(:error).with('Invalid url: http://foo;bar - Called from: http://example.com')
 crawler.links.should == []
data/spec/lib/rawler_spec.rb
CHANGED
@@ -8,7 +8,7 @@ describe Rawler::Base do
 let(:rawler) { Rawler::Base.new('http://example.com', output) }
 
 before(:each) do
-Rawler.stub
+Rawler.stub(:output).and_return(output)
 register('http://example.com', site)
 end
 
@@ -94,6 +94,7 @@ describe Rawler::Base do
 end
 
 it "should not validate links on external pages" do
+register('http://example.com/', '<a href="http://external.com/foo">x</a>')
 register('http://example.com/foo', '<a href="http://external.com/foo">x</a>')
 register('http://external.com/foo', '<a href="http://external.com/bar">x</a>')
 register('http://external.com/bar', '')
data/spec/spec_helper.rb
CHANGED
@@ -1,9 +1,9 @@
 module Kernel
-
+
 def sleep(duration)
 nil
 end
-
+
 end
 
 
@@ -16,3 +16,8 @@ FakeWeb.allow_net_connect = false
 def register(uri, content, status=200, options={})
 FakeWeb.register_uri(:any, uri, { :body => content, :status => status, :content_type => 'text/html' }.merge(options))
 end
+
+if ENV['COVERAGE']
+require 'simplecov'
+SimpleCov.start
+end
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: rawler
 version: !ruby/object:Gem::Version
-  version: 0.1.7
+  version: 0.1.8
 prerelease:
 platform: ruby
 authors:
@@ -9,24 +9,24 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2013-09-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
   requirement: !ruby/object:Gem::Requirement
     none: false
     requirements:
-    - -
+    - - ~>
       - !ruby/object:Gem::Version
-        version: '
+        version: '1.5'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     none: false
     requirements:
-    - -
+    - - ~>
       - !ruby/object:Gem::Version
-        version: '
+        version: '1.5'
 - !ruby/object:Gem::Dependency
   name: fakeweb
   requirement: !ruby/object:Gem::Requirement
@@ -82,7 +82,7 @@ dependencies:
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
-        version: 1.0
+        version: '1.0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
@@ -90,7 +90,7 @@ dependencies:
     requirements:
     - - ~>
      - !ruby/object:Gem::Version
-        version: 1.0
+        version: '1.0'
 - !ruby/object:Gem::Dependency
   name: jeweler
   requirement: !ruby/object:Gem::Requirement
@@ -107,22 +107,6 @@ dependencies:
     - - ~>
       - !ruby/object:Gem::Version
         version: 1.6.4
-- !ruby/object:Gem::Dependency
-  name: rcov
-  requirement: !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ! '>='
-      - !ruby/object:Gem::Version
-        version: '0'
-  type: :development
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ! '>='
-      - !ruby/object:Gem::Version
-        version: '0'
 description: Rawler is a tool that crawls the links of your website
 email: info@oscardelben.com
 executables:
@@ -132,6 +116,8 @@ extra_rdoc_files:
 - LICENSE.txt
 - README.md
 files:
+- .rspec
+- .travis.yml
 - Gemfile
 - Gemfile.lock
 - LICENSE.txt
@@ -153,8 +139,6 @@ files:
 - spec/lib/rawler_spec.rb
 - spec/spec.opts
 - spec/spec_helper.rb
-- test/helper.rb
-- test/test_rawler.rb
 - vendor/lib-trollop.rb
 homepage: http://github.com/oscardelben/rawler
 licenses:
@@ -171,7 +155,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '0'
       segments:
       - 0
-      hash:
+      hash: 3460091796500092184
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements:
data/test/helper.rb
DELETED
@@ -1,18 +0,0 @@
-require 'rubygems'
-require 'bundler'
-begin
-Bundler.setup(:default, :development)
-rescue Bundler::BundlerError => e
-$stderr.puts e.message
-$stderr.puts "Run `bundle install` to install missing gems"
-exit e.status_code
-end
-require 'test/unit'
-require 'shoulda'
-
-$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
-$LOAD_PATH.unshift(File.dirname(__FILE__))
-require 'rawler'
-
-class Test::Unit::TestCase
-end