rawler 0.1.7 → 0.1.8

data/.rspec ADDED
@@ -0,0 +1 @@
+ --order rand
data/.travis.yml ADDED
@@ -0,0 +1,5 @@
+ language: ruby
+ rvm:
+ - 1.9.3
+ - 2.0.0
+ script: bundle exec rspec spec
data/Gemfile CHANGED
@@ -5,13 +5,12 @@ source "http://rubygems.org"
 
  # Add dependencies to develop your gem here.
  # Include everything needed to run rake, tests, features, etc.
- gem 'nokogiri'
+ gem 'nokogiri', '~> 1.5'
 
  group :development, :test do
  gem 'fakeweb'
  gem "rspec"
  gem "shoulda", ">= 0"
- gem "bundler", "~> 1.0.0"
+ gem "bundler", "~> 1.0"
  gem "jeweler", "~> 1.6.4"
- gem "rcov", ">= 0"
  end
data/Gemfile.lock CHANGED
@@ -1,34 +1,52 @@
  GEM
    remote: http://rubygems.org/
    specs:
-     diff-lcs (1.1.3)
+     activesupport (4.0.0)
+       i18n (~> 0.6, >= 0.6.4)
+       minitest (~> 4.2)
+       multi_json (~> 1.3)
+       thread_safe (~> 0.1)
+       tzinfo (~> 0.3.37)
+     atomic (1.1.13)
+     diff-lcs (1.2.4)
      fakeweb (1.3.0)
-     git (1.2.5)
+     git (1.2.6)
+     i18n (0.6.5)
      jeweler (1.6.4)
        bundler (~> 1.0)
        git (>= 1.2.5)
        rake
-     nokogiri (1.5.0)
-     rake (0.9.2.2)
-     rcov (0.9.11)
-     rspec (2.7.0)
-       rspec-core (~> 2.7.0)
-       rspec-expectations (~> 2.7.0)
-       rspec-mocks (~> 2.7.0)
-     rspec-core (2.7.1)
-     rspec-expectations (2.7.0)
-       diff-lcs (~> 1.1.2)
-     rspec-mocks (2.7.0)
-     shoulda (2.11.3)
+     mini_portile (0.5.1)
+     minitest (4.7.5)
+     multi_json (1.7.9)
+     nokogiri (1.6.0)
+       mini_portile (~> 0.5.0)
+     rake (10.1.0)
+     rspec (2.14.1)
+       rspec-core (~> 2.14.0)
+       rspec-expectations (~> 2.14.0)
+       rspec-mocks (~> 2.14.0)
+     rspec-core (2.14.5)
+     rspec-expectations (2.14.2)
+       diff-lcs (>= 1.1.3, < 2.0)
+     rspec-mocks (2.14.3)
+     shoulda (3.5.0)
+       shoulda-context (~> 1.0, >= 1.0.1)
+       shoulda-matchers (>= 1.4.1, < 3.0)
+     shoulda-context (1.1.5)
+     shoulda-matchers (2.3.0)
+       activesupport (>= 3.0.0)
+     thread_safe (0.1.2)
+       atomic
+     tzinfo (0.3.37)
 
  PLATFORMS
    ruby
 
  DEPENDENCIES
-   bundler (~> 1.0.0)
+   bundler (~> 1.0)
    fakeweb
    jeweler (~> 1.6.4)
-   nokogiri
-   rcov
+   nokogiri (~> 1.5)
    rspec
    shoulda
data/README.md CHANGED
@@ -9,14 +9,17 @@ Rawler will only parse pages with content type 'text/html', but it will check fo
  rawler http://example.com [options]
 
  where [options] are:
- --username, -u <s>: HTT Basic Username
- --password, -p <s>: HTT Basic Password
+ --username, -u <s>: HTTP Basic Username
+ --password, -p <s>: HTTP Basic Password
  --wait, -w <f>: Seconds to wait between requests, may be fractional e.g. '1.5' (default: 3.0)
  --log, -l: Log results to file rawler_log.txt
  --logfile, -o <s>: Specify logfile, implies --log (default: rawler_log.txt)
  --css, -c: Check CSS links
- --skip, -s <s>: Skip URLS that match a regexp
- --iskip, -i <s>: Skip URLS that match a case insensitive regexp
+ --skip, -s <s>: Skip URLs that match a regexp
+ --iskip, -i <s>: Skip URLs that match a case insensitive regexp
+ --include <s>: Only include URLs that match a regexp
+ --iinclude <s>: Only include URLs that match a case insensitive regexp
+ --local <s>: Restrict to the given URL and below. Equivalent to '--include ^http://mysite.com/*'.
  --version, -v: Print version and exit
  --help, -h: Show this message
 
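(Illustrative note, not part of the diff: putting the new filtering options together, a crawl restricted to the target site that also skips a case-insensitive pattern might be invoked roughly as below; the domain and regexp are placeholders.)

  rawler http://mysite.com --local --iskip 'logout|\.pdf$'
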
data/Rakefile CHANGED
@@ -28,24 +28,7 @@ Jeweler::Tasks.new do |gem|
  end
  Jeweler::RubygemsDotOrgTasks.new
 
- require 'rake/testtask'
- Rake::TestTask.new(:test) do |test|
- test.libs << 'lib' << 'test'
- test.pattern = 'test/**/test_*.rb'
- test.verbose = true
- end
-
- require 'rcov/rcovtask'
- Rcov::RcovTask.new do |test|
- test.libs << 'test'
- test.pattern = 'test/**/test_*.rb'
- test.verbose = true
- test.rcov_opts << '--exclude "gems/*"'
- end
-
- task :default => :test
-
- require 'rake/rdoctask'
+ require 'rdoc/task'
  Rake::RDocTask.new do |rdoc|
  version = File.exist?('VERSION') ? File.read('VERSION') : ""
 
@@ -54,16 +37,3 @@ Rake::RDocTask.new do |rdoc|
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
  end
-
- desc 'generate docs'
- task :rocco do
- #%x!rm -r html/*!
-
- Dir.chdir "lib"
-
- files = Dir['**/*.*']
-
- files.each do |file|
- %x!rocco #{file} -o ../html!
- end
- end
data/VERSION CHANGED
@@ -1 +1 @@
- 0.1.7
+ 0.1.8
data/bin/rawler CHANGED
@@ -14,14 +14,17 @@ Usage:
  where [options] are:
  EOS
 
- opt :username, "HTT Basic Username", :type => :string
- opt :password, "HTT Basic Password", :type => :string
+ opt :username, "HTTP Basic Username", :type => :string
+ opt :password, "HTTP Basic Password", :type => :string
  opt :wait, "Seconds to wait between requests, may be fractional e.g. '1.5'", :type => :float, :default => 3.0
  opt :log, "Log results to file #{Rawler::Base::DEFAULT_LOGFILE}", :type => :boolean, :default => false
  opt :logfile, "Specify logfile, implies --log", :type => :string, :default => Rawler::Base::DEFAULT_LOGFILE
  opt :css, "Check CSS links", :type => :boolean, :default => false
- opt :skip, "Skip URLS that match a pattern", :type => :string
- opt :iskip, "Skip URLS that match a case insensitive pattern", :type => :string
+ opt :skip, "Skip URLs that match a pattern", :type => :string
+ opt :iskip, "Skip URLs that match a case insensitive pattern", :type => :string
+ opt :include, "Only include URLS that match a pattern", :type => :string
+ opt :iinclude, "Only include URLS that match a case insensitive pattern. Equivalent to '--include ^http://mysite.com/*'.", :type => :string
+ opt :local, "Restrict to the given URL and below", :type => :boolean, :default => false
  end
 
 
data/lib/rawler.rb CHANGED
@@ -12,6 +12,7 @@ module Rawler
  mattr_accessor :username, :password
  mattr_accessor :log, :logfile
  mattr_accessor :css
+ mattr_accessor :include_url_pattern
  mattr_accessor :skip_url_pattern
 
  autoload :Base, "rawler/base"
@@ -28,7 +29,20 @@ module Rawler
  @@url = url
  end
 
- def self.set_skip_pattern(pattern, icase)
- self.skip_url_pattern = pattern.nil? ? nil : Regexp.new(pattern, icase ? Regexp::IGNORECASE : nil )
+ def self.create_regex(pattern, icase=false)
+ pattern.nil? ? nil : Regexp.new(pattern, icase ? Regexp::IGNORECASE : nil )
+ end
+
+ def self.set_include_pattern(pattern, icase=false)
+ self.include_url_pattern = self.create_regex(pattern, icase)
+ end
+
+ def self.set_skip_pattern(pattern, icase=false)
+ self.skip_url_pattern = self.create_regex(pattern, icase)
+ end
+
+ def self.local=(is_local)
+ pattern = is_local ? "^#{self.url}" : nil
+ self.set_include_pattern(pattern)
  end
  end
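(Illustrative note, not part of the diff: a minimal Ruby sketch of how the helpers added above combine, assuming Rawler.url= has been set first as elsewhere in this module; the site URL and patterns are made up.)

  require 'rawler'

  Rawler.url = 'http://mysite.com/'   # placeholder site
  Rawler.set_skip_pattern('\.pdf$')   # compiled case-sensitively by create_regex
  Rawler.local = true                 # same effect as set_include_pattern("^http://mysite.com/")

  Rawler.skip_url_pattern             # => /\.pdf$/
  Rawler.include_url_pattern          # => /^http:\/\/mysite.com\//
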
data/lib/rawler/base.rb CHANGED
@@ -15,6 +15,11 @@ module Rawler
  Rawler.password = options[:password]
  Rawler.wait = options[:wait]
  Rawler.css = options[:css]
+
+ Rawler.local = options[:local]
+
+ Rawler.set_include_pattern(options[:include], false) unless options[:include].nil?
+ Rawler.set_include_pattern(options[:iinclude], true) unless options[:iinclude].nil?
 
  Rawler.set_skip_pattern(options[:skip], false) unless options[:skip].nil?
  Rawler.set_skip_pattern(options[:iskip], true) unless options[:iskip].nil?
data/lib/rawler/crawler.rb CHANGED
@@ -91,6 +91,8 @@ module Rawler
  scheme = URI.parse(url).scheme
  if url =~ Rawler.skip_url_pattern
  false
+ elsif Rawler.include_url_pattern && url !~ Rawler.include_url_pattern
+ false
  elsif ['http', 'https'].include?(scheme)
  true
  else
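(Illustrative note, not part of the gem: a self-contained Ruby sketch of the filtering order this hunk establishes: an explicit skip pattern wins, then the optional include pattern must match, and only http/https schemes pass; the method name and URLs are invented.)

  require 'uri'

  # Skip wins first, then the include pattern (if any) must match,
  # and finally only http/https links are considered crawlable.
  def follow_link?(url, skip_pattern: nil, include_pattern: nil)
    return false if skip_pattern && url =~ skip_pattern
    return false if include_pattern && url !~ include_pattern
    %w[http https].include?(URI.parse(url).scheme)
  end

  follow_link?('http://mysite.com/docs/a', include_pattern: /^http:\/\/mysite\.com\//)  # => true
  follow_link?('http://other.com/x', include_pattern: /^http:\/\/mysite\.com\//)        # => false
  follow_link?('mailto:someone@mysite.com')                                             # => false
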
data/rawler.gemspec CHANGED
@@ -5,11 +5,11 @@
 
  Gem::Specification.new do |s|
  s.name = "rawler"
- s.version = "0.1.7"
+ s.version = "0.1.8"
 
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Oscar Del Ben"]
- s.date = "2012-12-02"
+ s.date = "2013-09-07"
  s.description = "Rawler is a tool that crawls the links of your website"
  s.email = "info@oscardelben.com"
  s.executables = ["rawler"]
@@ -18,6 +18,8 @@ Gem::Specification.new do |s|
  "README.md"
  ]
  s.files = [
+ ".rspec",
+ ".travis.yml",
  "Gemfile",
  "Gemfile.lock",
  "LICENSE.txt",
@@ -39,8 +41,6 @@ Gem::Specification.new do |s|
  "spec/lib/rawler_spec.rb",
  "spec/spec.opts",
  "spec/spec_helper.rb",
- "test/helper.rb",
- "test/test_rawler.rb",
  "vendor/lib-trollop.rb"
  ]
  s.homepage = "http://github.com/oscardelben/rawler"
@@ -53,30 +53,27 @@ Gem::Specification.new do |s|
  s.specification_version = 3
 
  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
- s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
+ s.add_runtime_dependency(%q<nokogiri>, ["~> 1.5"])
  s.add_development_dependency(%q<fakeweb>, [">= 0"])
  s.add_development_dependency(%q<rspec>, [">= 0"])
  s.add_development_dependency(%q<shoulda>, [">= 0"])
- s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
+ s.add_development_dependency(%q<bundler>, ["~> 1.0"])
  s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
- s.add_development_dependency(%q<rcov>, [">= 0"])
  else
- s.add_dependency(%q<nokogiri>, [">= 0"])
+ s.add_dependency(%q<nokogiri>, ["~> 1.5"])
  s.add_dependency(%q<fakeweb>, [">= 0"])
  s.add_dependency(%q<rspec>, [">= 0"])
  s.add_dependency(%q<shoulda>, [">= 0"])
- s.add_dependency(%q<bundler>, ["~> 1.0.0"])
+ s.add_dependency(%q<bundler>, ["~> 1.0"])
  s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
- s.add_dependency(%q<rcov>, [">= 0"])
  end
  else
- s.add_dependency(%q<nokogiri>, [">= 0"])
+ s.add_dependency(%q<nokogiri>, ["~> 1.5"])
  s.add_dependency(%q<fakeweb>, [">= 0"])
  s.add_dependency(%q<rspec>, [">= 0"])
  s.add_dependency(%q<shoulda>, [">= 0"])
- s.add_dependency(%q<bundler>, ["~> 1.0.0"])
+ s.add_dependency(%q<bundler>, ["~> 1.0"])
  s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
- s.add_dependency(%q<rcov>, [">= 0"])
  end
  end
 
@@ -8,12 +8,12 @@ describe Rawler::Crawler do
  let(:output) { double('output', :error => nil) }
 
  before(:each) do
- Rawler.stub!(:url).and_return(url)
- Rawler.stub!(:output).and_return(output)
+ Rawler.stub(:url).and_return(url)
+ Rawler.stub(:output).and_return(output)
  end
 
  context "basic functionality" do
-
+
  let(:url) { 'http://example.com' }
  let(:crawler) { Rawler::Crawler.new(url) }
  let(:content) {
@@ -32,87 +32,87 @@ describe Rawler::Crawler do
  it "should parse all links" do
  crawler.links.should == ['http://example.com/foo', 'http://external.com/bar']
  end
-
+
  it "should parse css links" do
  crawler.css_links.should == ['http://example.com/css/styles.css']
- end
+ end
  end
-
+
  context "relative paths" do
-
+
  context "base URL ends with a slash" do
-
+
  let(:url) { 'http://example.com/dir1/dir2/' }
  let(:crawler) { Rawler::Crawler.new(url) }
  let(:content) { '<a href="/foo">foo</a> <a href="bar">bar</a> <a href="../baz">baz</a>' }
-
+
  before(:each) do
  register(url, content)
  end
-
+
  it "should parse relative links" do
  crawler.links.should == ['http://example.com/foo', 'http://example.com/dir1/dir2/bar', 'http://example.com/dir1/baz']
  end
-
+
  end
-
+
  context "base URL doesn't end with a slash" do
-
+
  let(:url) { 'http://example.com/dir1/dir2' }
  let(:crawler) { Rawler::Crawler.new(url) }
  let(:content) { '<a href="/foo">foo</a> <a href="bar">bar</a> <a href="../baz">baz</a>' }
-
+
  before(:each) do
  register(url, content)
  end
-
+
  it "should parse relative links" do
  crawler.links.should == ['http://example.com/foo', 'http://example.com/dir1/bar', 'http://example.com/baz']
  end
-
+
  end
-
+
  end
-
+
  context "different domains" do
-
+
  let(:url) { 'http://external.com/path' }
  let(:crawler) { Rawler::Crawler.new(url) }
  let(:content) { '<a href="/foo">foo</a>' }
-
+
  before(:each) do
- Rawler.stub!(:url).and_return('http://example.com')
+ Rawler.stub(:url).and_return('http://example.com')
  register(url, content)
  end
-
+
  it "should parse relative links" do
  crawler.links.should == []
  end
-
+
  end
-
+
  context "urls with hash tags" do
-
+
  let(:url) { 'http://example.com/path' }
  let(:crawler) { Rawler::Crawler.new(url) }
  let(:content) { '<a href="/foo#bar">foo</a>' }
-
+
  before(:each) do
  register(url, content)
  end
-
+
  it "should not encode hashtags" do
  crawler.links.should == ['http://example.com/foo#bar']
  end
-
+
  end
-
+
  context "urls with unicode characters" do
-
+
  let(:url) { 'http://example.com' }
  let(:crawler) { Rawler::Crawler.new(url) }
  let(:content) { '<a href="http://example.com/写程序容易出现的几个不好的地方">foo</a>' }
-
+
  before(:each) do
  register(url, content)
  end
@@ -120,9 +120,9 @@ describe Rawler::Crawler do
  it "should parse unicode links" do
  crawler.links.should == ['http://example.com/%E5%86%99%E7%A8%8B%E5%BA%8F%E5%AE%B9%E6%98%93%E5%87%BA%E7%8E%B0%E7%9A%84%E5%87%A0%E4%B8%AA%E4%B8%8D%E5%A5%BD%E7%9A%84%E5%9C%B0%E6%96%B9']
  end
-
+
  end
-
+
  context "invalid urls" do
 
  context "javascript" do
@@ -130,11 +130,11 @@ describe Rawler::Crawler do
  let(:crawler) { Rawler::Crawler.new(url) }
  let(:js_url) { "javascript:fn('nbjmup;jhfs.esf{fio/dpn');" }
  let(:content) { "<a href=\"#{js_url}\">foo</a><a name=\"foo\">" }
-
+
  before(:each) do
  register(url, content)
  end
-
+
  it "should return empty links" do
  crawler.links.should == []
  end
@@ -149,11 +149,11 @@ describe Rawler::Crawler do
  let(:url) { 'http://example.com/path' }
  let(:crawler) { Rawler::Crawler.new(url) }
  let(:content) { "<a href=\"mailto:example@example.com\">foo</a><a name=\"foo\">" }
-
+
  before(:each) do
  register(url, content)
  end
-
+
  it "should return empty links" do
  crawler.links.should == []
  end
@@ -163,16 +163,16 @@ describe Rawler::Crawler do
  crawler.links
  end
  end
-
+
  context "callto" do
  let(:url) { 'http://example.com/path' }
  let(:crawler) { Rawler::Crawler.new(url) }
  let(:content) { "<a href=\"callto:home22\">foo</a><a name=\"foo\">" }
-
+
  before(:each) do
  register(url, content)
  end
-
+
  it "should return empty links" do
  crawler.links.should == []
  end
@@ -187,12 +187,12 @@ describe Rawler::Crawler do
  let(:url) { 'http://example.com/path' }
  let(:crawler) { Rawler::Crawler.new(url) }
  let(:content) { "<a href=\"http://example.com/search/page:1/\">foo</a><a href=\"http://example.com/search/page:2/\">foo</a>" }
-
+
  before(:each) do
  Rawler.set_skip_pattern('\/search\/(.*\/)?page:[2-9]', false)
  register(url, content)
  end
-
+
  it "should return one links" do
  crawler.links.length.should eql(1)
  end
@@ -201,63 +201,144 @@ describe Rawler::Crawler do
  crawler.should_not_receive(:write)
  crawler.links
  end
+
+ after(:each) do
+ Rawler.set_skip_pattern(nil)
+ end
  end
 
  context "case-insensitive skip matches" do
  let(:url) { 'http://example.com/path' }
  let(:crawler) { Rawler::Crawler.new(url) }
  let(:content) { "<a href=\"http://example.com/search/page:1/\">foo</a><a href=\"http://example.com/search/page:2/\">foo</a>" }
-
+
  before(:each) do
  Rawler.set_skip_pattern('\/seArcH\/(.*\/)?PAGE:[2-9]', true)
  register(url, content)
  end
-
+
+ it "should return one links" do
+ crawler.links.length.should eql(1)
+ end
+
+ it "should not report that it's skipping" do
+ crawler.should_not_receive(:write)
+ crawler.links
+ end
+
+ after(:each) do
+ Rawler.set_skip_pattern(nil)
+ end
+ end
+
+ context "include matches" do
+ let(:url) { 'http://example.com/path' }
+ let(:crawler) { Rawler::Crawler.new(url) }
+ let(:content) { "<a href=\"http://example.com/search/page:1/\">foo</a><a href=\"http://example.com/search/page:2/\">foo</a>" }
+
+ before(:each) do
+ Rawler.set_include_pattern('\/search\/(.*\/)?page:[2-9]', false)
+ register(url, content)
+ end
+
+ it "should return one links" do
+ crawler.links.length.should eql(1)
+ crawler.links.should eq(['http://example.com/search/page:2/'])
+ end
+
+ it "should not report that it's including" do
+ crawler.should_not_receive(:write)
+ crawler.links
+ end
+
+ after(:each) do
+ Rawler.set_include_pattern(nil)
+ end
+ end
+
+ context "case-insensitive include matches" do
+ let(:url) { 'http://example.com/path' }
+ let(:crawler) { Rawler::Crawler.new(url) }
+ let(:content) { "<a href=\"http://example.com/search/page:1/\">foo</a><a href=\"http://example.com/search/page:2/\">foo</a>" }
+
+ before(:each) do
+ Rawler.set_include_pattern('\/seArcH\/(.*\/)?PAGE:[2-9]', true)
+ register(url, content)
+ end
+
  it "should return one links" do
  crawler.links.length.should eql(1)
  end
 
+ it "should not report that it's including" do
+ crawler.should_not_receive(:write)
+ crawler.links
+ end
+
+ after(:each) do
+ Rawler.set_include_pattern(nil)
+ end
+ end
+
+ context "non-local site should be omitted when local flag is used" do
+ let(:url) { 'http://example.com/' }
+ let(:crawler) { Rawler::Crawler.new(url) }
+ let(:content) { "<a href=\"http://example.com/page1/\">foo</a><a href=\"http://example.org/page2\">foo</a>" }
+
+ before(:each) do
+ Rawler.local = true
+ register(url, content)
+ end
+
+ it "should return one link" do
+ crawler.links.length.should eql(1)
+ end
+
  it "should not report that it's skipping" do
  crawler.should_not_receive(:write)
  crawler.links
  end
+
+ after(:each) do
+ Rawler.local = false
+ end
  end
 
  end
 
  context "content type" do
-
+
  ['text/plain', 'text/css', 'image/jpeg'].each do |content_type|
-
+
  let(:url) { 'http://example.com' }
  let(:crawler) { Rawler::Crawler.new(url) }
-
+
  before(:each) do
  register(url, '', 200, :content_type => content_type)
  end
-
+
  it "should ignore '#{content_type}'" do
  crawler.links.should == []
  end
-
+
  end
  end
-
+
  context "Exceptions" do
-
+
  let(:url) { 'http://example.com' }
  let(:crawler) { Rawler::Crawler.new(url) }
-
+
  before(:each) do
  register(url, '')
  end
-
+
  context "Errno::ECONNREFUSED" do
-
+
  before(:each) do
- Rawler::Request.stub!(:get).and_raise Errno::ECONNREFUSED
+ Rawler::Request.stub(:get).and_raise Errno::ECONNREFUSED
  end
-
+
  it "should return an empty array" do
  crawler.links.should == []
  end
@@ -266,14 +347,14 @@ describe Rawler::Crawler do
  output.should_receive(:error).with("Couldn't connect to #{url}")
 
  crawler.links
- end
-
+ end
+
  end
-
+
  context "Errno::ETIMEDOUT" do
-
+
  before(:each) do
- Rawler::Request.stub!(:get).and_raise Errno::ETIMEDOUT
+ Rawler::Request.stub(:get).and_raise Errno::ETIMEDOUT
  end
 
  it "should return an empty array when raising Errno::ETIMEDOUT" do
@@ -285,33 +366,33 @@ describe Rawler::Crawler do
 
  crawler.links
  end
-
+
  end
-
+
  end
-
+
  context "http basic" do
-
+
  let(:url) { 'http://example.com' }
  let(:content) { '<a href="http://example.com/secret-path">foo</a>' }
  let(:crawler) { Rawler::Crawler.new('http://example.com/secret') }
-
+
  before(:each) do
  register('http://example.com/secret', '', :status => ["401", "Unauthorized"])
  register('http://foo:bar@example.com/secret', content)
 
- Rawler.stub!(:username).and_return('foo')
- Rawler.stub!(:password).and_return('bar')
+ Rawler.stub(:username).and_return('foo')
+ Rawler.stub(:password).and_return('bar')
  end
-
+
  it "should crawl http basic pages" do
  crawler.links.should == ['http://example.com/secret-path']
  end
-
+
  end
-
+
  context "url domain" do
-
+
  let(:content) {
  content = <<-content
  <a href="http://example.com/valid">foo</a>
@@ -322,11 +403,11 @@ describe Rawler::Crawler do
  }
  let(:url) { 'http://example.com' }
  let(:crawler) { Rawler::Crawler.new(url) }
-
+
  before(:each) do
  register(url, content)
  end
-
+
  it "should ignore links other than http or https" do
  crawler.links.should == ['http://example.com/valid', 'https://foo.com', 'http://fooo.com']
  end
@@ -336,11 +417,11 @@ describe Rawler::Crawler do
  let(:content) { '<a href="http://foo;bar">foo</a>' }
  let(:url) { 'http://example.com' }
  let(:crawler) { Rawler::Crawler.new(url) }
-
+
  before(:each) do
  register(url, content)
  end
-
+
  it "should notify about the invalid url" do
  output.should_receive(:error).with('Invalid url: http://foo;bar - Called from: http://example.com')
  crawler.links.should == []
data/spec/lib/rawler_spec.rb CHANGED
@@ -8,7 +8,7 @@ describe Rawler::Base do
  let(:rawler) { Rawler::Base.new('http://example.com', output) }
 
  before(:each) do
- Rawler.stub!(:output).and_return(output)
+ Rawler.stub(:output).and_return(output)
  register('http://example.com', site)
  end
 
@@ -94,6 +94,7 @@ describe Rawler::Base do
  end
 
  it "should not validate links on external pages" do
+ register('http://example.com/', '<a href="http://external.com/foo">x</a>')
  register('http://example.com/foo', '<a href="http://external.com/foo">x</a>')
  register('http://external.com/foo', '<a href="http://external.com/bar">x</a>')
  register('http://external.com/bar', '')
data/spec/spec_helper.rb CHANGED
@@ -1,9 +1,9 @@
  module Kernel
-
+
  def sleep(duration)
  nil
  end
-
+
  end
 
 
@@ -16,3 +16,8 @@ FakeWeb.allow_net_connect = false
  def register(uri, content, status=200, options={})
  FakeWeb.register_uri(:any, uri, { :body => content, :status => status, :content_type => 'text/html' }.merge(options))
  end
+
+ if ENV['COVERAGE']
+ require 'simplecov'
+ SimpleCov.start
+ end
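(Side note, not part of the diff: with this guard, coverage would presumably only be collected when the variable is set, e.g. by running something like 'COVERAGE=1 bundle exec rspec'; any value that leaves ENV['COVERAGE'] non-nil enables it.)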
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: rawler
  version: !ruby/object:Gem::Version
- version: 0.1.7
+ version: 0.1.8
  prerelease:
  platform: ruby
  authors:
@@ -9,24 +9,24 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2012-12-02 00:00:00.000000000 Z
+ date: 2013-09-07 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: nokogiri
  requirement: !ruby/object:Gem::Requirement
  none: false
  requirements:
- - - ! '>='
+ - - ~>
  - !ruby/object:Gem::Version
- version: '0'
+ version: '1.5'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  none: false
  requirements:
- - - ! '>='
+ - - ~>
  - !ruby/object:Gem::Version
- version: '0'
+ version: '1.5'
  - !ruby/object:Gem::Dependency
  name: fakeweb
  requirement: !ruby/object:Gem::Requirement
@@ -82,7 +82,7 @@ dependencies:
  requirements:
  - - ~>
  - !ruby/object:Gem::Version
- version: 1.0.0
+ version: '1.0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
@@ -90,7 +90,7 @@ dependencies:
  requirements:
  - - ~>
  - !ruby/object:Gem::Version
- version: 1.0.0
+ version: '1.0'
  - !ruby/object:Gem::Dependency
  name: jeweler
  requirement: !ruby/object:Gem::Requirement
@@ -107,22 +107,6 @@ dependencies:
  - - ~>
  - !ruby/object:Gem::Version
  version: 1.6.4
- - !ruby/object:Gem::Dependency
- name: rcov
- requirement: !ruby/object:Gem::Requirement
- none: false
- requirements:
- - - ! '>='
- - !ruby/object:Gem::Version
- version: '0'
- type: :development
- prerelease: false
- version_requirements: !ruby/object:Gem::Requirement
- none: false
- requirements:
- - - ! '>='
- - !ruby/object:Gem::Version
- version: '0'
  description: Rawler is a tool that crawls the links of your website
  email: info@oscardelben.com
  executables:
@@ -132,6 +116,8 @@ extra_rdoc_files:
  - LICENSE.txt
  - README.md
  files:
+ - .rspec
+ - .travis.yml
  - Gemfile
  - Gemfile.lock
  - LICENSE.txt
@@ -153,8 +139,6 @@ files:
  - spec/lib/rawler_spec.rb
  - spec/spec.opts
  - spec/spec_helper.rb
- - test/helper.rb
- - test/test_rawler.rb
  - vendor/lib-trollop.rb
  homepage: http://github.com/oscardelben/rawler
  licenses:
@@ -171,7 +155,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
  version: '0'
  segments:
  - 0
- hash: -477710177479430630
+ hash: 3460091796500092184
  required_rubygems_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
data/test/helper.rb DELETED
@@ -1,18 +0,0 @@
- require 'rubygems'
- require 'bundler'
- begin
- Bundler.setup(:default, :development)
- rescue Bundler::BundlerError => e
- $stderr.puts e.message
- $stderr.puts "Run `bundle install` to install missing gems"
- exit e.status_code
- end
- require 'test/unit'
- require 'shoulda'
-
- $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
- $LOAD_PATH.unshift(File.dirname(__FILE__))
- require 'rawler'
-
- class Test::Unit::TestCase
- end
data/test/test_rawler.rb DELETED
@@ -1,7 +0,0 @@
- require 'helper'
-
- class TestRawler < Test::Unit::TestCase
- should "probably rename this file and start testing for real" do
- flunk "hey buddy, you should probably rename this file and start testing for real"
- end
- end