rawler 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --order rand
data/.travis.yml ADDED
@@ -0,0 +1,5 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.9.3
4
+ - 2.0.0
5
+ script: bundle exec rspec spec
data/Gemfile CHANGED
@@ -5,13 +5,12 @@ source "http://rubygems.org"
5
5
 
6
6
  # Add dependencies to develop your gem here.
7
7
  # Include everything needed to run rake, tests, features, etc.
8
- gem 'nokogiri'
8
+ gem 'nokogiri', '~> 1.5'
9
9
 
10
10
  group :development, :test do
11
11
  gem 'fakeweb'
12
12
  gem "rspec"
13
13
  gem "shoulda", ">= 0"
14
- gem "bundler", "~> 1.0.0"
14
+ gem "bundler", "~> 1.0"
15
15
  gem "jeweler", "~> 1.6.4"
16
- gem "rcov", ">= 0"
17
16
  end
data/Gemfile.lock CHANGED
@@ -1,34 +1,52 @@
1
1
  GEM
2
2
  remote: http://rubygems.org/
3
3
  specs:
4
- diff-lcs (1.1.3)
4
+ activesupport (4.0.0)
5
+ i18n (~> 0.6, >= 0.6.4)
6
+ minitest (~> 4.2)
7
+ multi_json (~> 1.3)
8
+ thread_safe (~> 0.1)
9
+ tzinfo (~> 0.3.37)
10
+ atomic (1.1.13)
11
+ diff-lcs (1.2.4)
5
12
  fakeweb (1.3.0)
6
- git (1.2.5)
13
+ git (1.2.6)
14
+ i18n (0.6.5)
7
15
  jeweler (1.6.4)
8
16
  bundler (~> 1.0)
9
17
  git (>= 1.2.5)
10
18
  rake
11
- nokogiri (1.5.0)
12
- rake (0.9.2.2)
13
- rcov (0.9.11)
14
- rspec (2.7.0)
15
- rspec-core (~> 2.7.0)
16
- rspec-expectations (~> 2.7.0)
17
- rspec-mocks (~> 2.7.0)
18
- rspec-core (2.7.1)
19
- rspec-expectations (2.7.0)
20
- diff-lcs (~> 1.1.2)
21
- rspec-mocks (2.7.0)
22
- shoulda (2.11.3)
19
+ mini_portile (0.5.1)
20
+ minitest (4.7.5)
21
+ multi_json (1.7.9)
22
+ nokogiri (1.6.0)
23
+ mini_portile (~> 0.5.0)
24
+ rake (10.1.0)
25
+ rspec (2.14.1)
26
+ rspec-core (~> 2.14.0)
27
+ rspec-expectations (~> 2.14.0)
28
+ rspec-mocks (~> 2.14.0)
29
+ rspec-core (2.14.5)
30
+ rspec-expectations (2.14.2)
31
+ diff-lcs (>= 1.1.3, < 2.0)
32
+ rspec-mocks (2.14.3)
33
+ shoulda (3.5.0)
34
+ shoulda-context (~> 1.0, >= 1.0.1)
35
+ shoulda-matchers (>= 1.4.1, < 3.0)
36
+ shoulda-context (1.1.5)
37
+ shoulda-matchers (2.3.0)
38
+ activesupport (>= 3.0.0)
39
+ thread_safe (0.1.2)
40
+ atomic
41
+ tzinfo (0.3.37)
23
42
 
24
43
  PLATFORMS
25
44
  ruby
26
45
 
27
46
  DEPENDENCIES
28
- bundler (~> 1.0.0)
47
+ bundler (~> 1.0)
29
48
  fakeweb
30
49
  jeweler (~> 1.6.4)
31
- nokogiri
32
- rcov
50
+ nokogiri (~> 1.5)
33
51
  rspec
34
52
  shoulda
data/README.md CHANGED
@@ -9,14 +9,17 @@ Rawler will only parse pages with content type 'text/html', but it will check fo
9
9
  rawler http://example.com [options]
10
10
 
11
11
  where [options] are:
12
- --username, -u <s>: HTT Basic Username
13
- --password, -p <s>: HTT Basic Password
12
+ --username, -u <s>: HTTP Basic Username
13
+ --password, -p <s>: HTTP Basic Password
14
14
  --wait, -w <f>: Seconds to wait between requests, may be fractional e.g. '1.5' (default: 3.0)
15
15
  --log, -l: Log results to file rawler_log.txt
16
16
  --logfile, -o <s>: Specify logfile, implies --log (default: rawler_log.txt)
17
17
  --css, -c: Check CSS links
18
- --skip, -s <s>: Skip URLS that match a regexp
19
- --iskip, -i <s>: Skip URLS that match a case insensitive regexp
18
+ --skip, -s <s>: Skip URLs that match a regexp
19
+ --iskip, -i <s>: Skip URLs that match a case insensitive regexp
20
+ --include <s>: Only include URLs that match a regexp
21
+ --iinclude <s>: Only include URLs that match a case insensitive regexp
22
+ --local <s>: Restrict to the given URL and below. Equivalent to '--include ^http://mysite.com/*'.
20
23
  --version, -v: Print version and exit
21
24
  --help, -h: Show this message
22
25
 
data/Rakefile CHANGED
@@ -28,24 +28,7 @@ Jeweler::Tasks.new do |gem|
28
28
  end
29
29
  Jeweler::RubygemsDotOrgTasks.new
30
30
 
31
- require 'rake/testtask'
32
- Rake::TestTask.new(:test) do |test|
33
- test.libs << 'lib' << 'test'
34
- test.pattern = 'test/**/test_*.rb'
35
- test.verbose = true
36
- end
37
-
38
- require 'rcov/rcovtask'
39
- Rcov::RcovTask.new do |test|
40
- test.libs << 'test'
41
- test.pattern = 'test/**/test_*.rb'
42
- test.verbose = true
43
- test.rcov_opts << '--exclude "gems/*"'
44
- end
45
-
46
- task :default => :test
47
-
48
- require 'rake/rdoctask'
31
+ require 'rdoc/task'
49
32
  Rake::RDocTask.new do |rdoc|
50
33
  version = File.exist?('VERSION') ? File.read('VERSION') : ""
51
34
 
@@ -54,16 +37,3 @@ Rake::RDocTask.new do |rdoc|
54
37
  rdoc.rdoc_files.include('README*')
55
38
  rdoc.rdoc_files.include('lib/**/*.rb')
56
39
  end
57
-
58
- desc 'generate docs'
59
- task :rocco do
60
- #%x!rm -r html/*!
61
-
62
- Dir.chdir "lib"
63
-
64
- files = Dir['**/*.*']
65
-
66
- files.each do |file|
67
- %x!rocco #{file} -o ../html!
68
- end
69
- end
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.7
1
+ 0.1.8
data/bin/rawler CHANGED
@@ -14,14 +14,17 @@ Usage:
14
14
  where [options] are:
15
15
  EOS
16
16
 
17
- opt :username, "HTT Basic Username", :type => :string
18
- opt :password, "HTT Basic Password", :type => :string
17
+ opt :username, "HTTP Basic Username", :type => :string
18
+ opt :password, "HTTP Basic Password", :type => :string
19
19
  opt :wait, "Seconds to wait between requests, may be fractional e.g. '1.5'", :type => :float, :default => 3.0
20
20
  opt :log, "Log results to file #{Rawler::Base::DEFAULT_LOGFILE}", :type => :boolean, :default => false
21
21
  opt :logfile, "Specify logfile, implies --log", :type => :string, :default => Rawler::Base::DEFAULT_LOGFILE
22
22
  opt :css, "Check CSS links", :type => :boolean, :default => false
23
- opt :skip, "Skip URLS that match a pattern", :type => :string
24
- opt :iskip, "Skip URLS that match a case insensitive pattern", :type => :string
23
+ opt :skip, "Skip URLs that match a pattern", :type => :string
24
+ opt :iskip, "Skip URLs that match a case insensitive pattern", :type => :string
25
+ opt :include, "Only include URLS that match a pattern", :type => :string
26
+ opt :iinclude, "Only include URLS that match a case insensitive pattern. Equivalent to '--include ^http://mysite.com/*'.", :type => :string
27
+ opt :local, "Restrict to the given URL and below", :type => :boolean, :default => false
25
28
  end
26
29
 
27
30
 
data/lib/rawler.rb CHANGED
@@ -12,6 +12,7 @@ module Rawler
12
12
  mattr_accessor :username, :password
13
13
  mattr_accessor :log, :logfile
14
14
  mattr_accessor :css
15
+ mattr_accessor :include_url_pattern
15
16
  mattr_accessor :skip_url_pattern
16
17
 
17
18
  autoload :Base, "rawler/base"
@@ -28,7 +29,20 @@ module Rawler
28
29
  @@url = url
29
30
  end
30
31
 
31
- def self.set_skip_pattern(pattern, icase)
32
- self.skip_url_pattern = pattern.nil? ? nil : Regexp.new(pattern, icase ? Regexp::IGNORECASE : nil )
32
+ def self.create_regex(pattern, icase=false)
33
+ pattern.nil? ? nil : Regexp.new(pattern, icase ? Regexp::IGNORECASE : nil )
34
+ end
35
+
36
+ def self.set_include_pattern(pattern, icase=false)
37
+ self.include_url_pattern = self.create_regex(pattern, icase)
38
+ end
39
+
40
+ def self.set_skip_pattern(pattern, icase=false)
41
+ self.skip_url_pattern = self.create_regex(pattern, icase)
42
+ end
43
+
44
+ def self.local=(is_local)
45
+ pattern = is_local ? "^#{self.url}" : nil
46
+ self.set_include_pattern(pattern)
33
47
  end
34
48
  end
data/lib/rawler/base.rb CHANGED
@@ -15,6 +15,11 @@ module Rawler
15
15
  Rawler.password = options[:password]
16
16
  Rawler.wait = options[:wait]
17
17
  Rawler.css = options[:css]
18
+
19
+ Rawler.local = options[:local]
20
+
21
+ Rawler.set_include_pattern(options[:include], false) unless options[:include].nil?
22
+ Rawler.set_include_pattern(options[:iinclude], true) unless options[:iinclude].nil?
18
23
 
19
24
  Rawler.set_skip_pattern(options[:skip], false) unless options[:skip].nil?
20
25
  Rawler.set_skip_pattern(options[:iskip], true) unless options[:iskip].nil?
data/lib/rawler/crawler.rb CHANGED
@@ -91,6 +91,8 @@ module Rawler
91
91
  scheme = URI.parse(url).scheme
92
92
  if url =~ Rawler.skip_url_pattern
93
93
  false
94
+ elsif Rawler.include_url_pattern && url !~ Rawler.include_url_pattern
95
+ false
94
96
  elsif ['http', 'https'].include?(scheme)
95
97
  true
96
98
  else
data/rawler.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "rawler"
8
- s.version = "0.1.7"
8
+ s.version = "0.1.8"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Oscar Del Ben"]
12
- s.date = "2012-12-02"
12
+ s.date = "2013-09-07"
13
13
  s.description = "Rawler is a tool that crawls the links of your website"
14
14
  s.email = "info@oscardelben.com"
15
15
  s.executables = ["rawler"]
@@ -18,6 +18,8 @@ Gem::Specification.new do |s|
18
18
  "README.md"
19
19
  ]
20
20
  s.files = [
21
+ ".rspec",
22
+ ".travis.yml",
21
23
  "Gemfile",
22
24
  "Gemfile.lock",
23
25
  "LICENSE.txt",
@@ -39,8 +41,6 @@ Gem::Specification.new do |s|
39
41
  "spec/lib/rawler_spec.rb",
40
42
  "spec/spec.opts",
41
43
  "spec/spec_helper.rb",
42
- "test/helper.rb",
43
- "test/test_rawler.rb",
44
44
  "vendor/lib-trollop.rb"
45
45
  ]
46
46
  s.homepage = "http://github.com/oscardelben/rawler"
@@ -53,30 +53,27 @@ Gem::Specification.new do |s|
53
53
  s.specification_version = 3
54
54
 
55
55
  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
56
- s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
56
+ s.add_runtime_dependency(%q<nokogiri>, ["~> 1.5"])
57
57
  s.add_development_dependency(%q<fakeweb>, [">= 0"])
58
58
  s.add_development_dependency(%q<rspec>, [">= 0"])
59
59
  s.add_development_dependency(%q<shoulda>, [">= 0"])
60
- s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
60
+ s.add_development_dependency(%q<bundler>, ["~> 1.0"])
61
61
  s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
62
- s.add_development_dependency(%q<rcov>, [">= 0"])
63
62
  else
64
- s.add_dependency(%q<nokogiri>, [">= 0"])
63
+ s.add_dependency(%q<nokogiri>, ["~> 1.5"])
65
64
  s.add_dependency(%q<fakeweb>, [">= 0"])
66
65
  s.add_dependency(%q<rspec>, [">= 0"])
67
66
  s.add_dependency(%q<shoulda>, [">= 0"])
68
- s.add_dependency(%q<bundler>, ["~> 1.0.0"])
67
+ s.add_dependency(%q<bundler>, ["~> 1.0"])
69
68
  s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
70
- s.add_dependency(%q<rcov>, [">= 0"])
71
69
  end
72
70
  else
73
- s.add_dependency(%q<nokogiri>, [">= 0"])
71
+ s.add_dependency(%q<nokogiri>, ["~> 1.5"])
74
72
  s.add_dependency(%q<fakeweb>, [">= 0"])
75
73
  s.add_dependency(%q<rspec>, [">= 0"])
76
74
  s.add_dependency(%q<shoulda>, [">= 0"])
77
- s.add_dependency(%q<bundler>, ["~> 1.0.0"])
75
+ s.add_dependency(%q<bundler>, ["~> 1.0"])
78
76
  s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
79
- s.add_dependency(%q<rcov>, [">= 0"])
80
77
  end
81
78
  end
82
79
 
data/spec/lib/rawler/crawler_spec.rb CHANGED
@@ -8,12 +8,12 @@ describe Rawler::Crawler do
8
8
  let(:output) { double('output', :error => nil) }
9
9
 
10
10
  before(:each) do
11
- Rawler.stub!(:url).and_return(url)
12
- Rawler.stub!(:output).and_return(output)
11
+ Rawler.stub(:url).and_return(url)
12
+ Rawler.stub(:output).and_return(output)
13
13
  end
14
14
 
15
15
  context "basic functionality" do
16
-
16
+
17
17
  let(:url) { 'http://example.com' }
18
18
  let(:crawler) { Rawler::Crawler.new(url) }
19
19
  let(:content) {
@@ -32,87 +32,87 @@ describe Rawler::Crawler do
32
32
  it "should parse all links" do
33
33
  crawler.links.should == ['http://example.com/foo', 'http://external.com/bar']
34
34
  end
35
-
35
+
36
36
  it "should parse css links" do
37
37
  crawler.css_links.should == ['http://example.com/css/styles.css']
38
- end
38
+ end
39
39
  end
40
-
40
+
41
41
  context "relative paths" do
42
-
42
+
43
43
  context "base URL ends with a slash" do
44
-
44
+
45
45
  let(:url) { 'http://example.com/dir1/dir2/' }
46
46
  let(:crawler) { Rawler::Crawler.new(url) }
47
47
  let(:content) { '<a href="/foo">foo</a> <a href="bar">bar</a> <a href="../baz">baz</a>' }
48
-
48
+
49
49
  before(:each) do
50
50
  register(url, content)
51
51
  end
52
-
52
+
53
53
  it "should parse relative links" do
54
54
  crawler.links.should == ['http://example.com/foo', 'http://example.com/dir1/dir2/bar', 'http://example.com/dir1/baz']
55
55
  end
56
-
56
+
57
57
  end
58
-
58
+
59
59
  context "base URL doesn't end with a slash" do
60
-
60
+
61
61
  let(:url) { 'http://example.com/dir1/dir2' }
62
62
  let(:crawler) { Rawler::Crawler.new(url) }
63
63
  let(:content) { '<a href="/foo">foo</a> <a href="bar">bar</a> <a href="../baz">baz</a>' }
64
-
64
+
65
65
  before(:each) do
66
66
  register(url, content)
67
67
  end
68
-
68
+
69
69
  it "should parse relative links" do
70
70
  crawler.links.should == ['http://example.com/foo', 'http://example.com/dir1/bar', 'http://example.com/baz']
71
71
  end
72
-
72
+
73
73
  end
74
-
74
+
75
75
  end
76
-
76
+
77
77
  context "different domains" do
78
-
78
+
79
79
  let(:url) { 'http://external.com/path' }
80
80
  let(:crawler) { Rawler::Crawler.new(url) }
81
81
  let(:content) { '<a href="/foo">foo</a>' }
82
-
82
+
83
83
  before(:each) do
84
- Rawler.stub!(:url).and_return('http://example.com')
84
+ Rawler.stub(:url).and_return('http://example.com')
85
85
  register(url, content)
86
86
  end
87
-
87
+
88
88
  it "should parse relative links" do
89
89
  crawler.links.should == []
90
90
  end
91
-
91
+
92
92
  end
93
-
93
+
94
94
  context "urls with hash tags" do
95
-
95
+
96
96
  let(:url) { 'http://example.com/path' }
97
97
  let(:crawler) { Rawler::Crawler.new(url) }
98
98
  let(:content) { '<a href="/foo#bar">foo</a>' }
99
-
99
+
100
100
  before(:each) do
101
101
  register(url, content)
102
102
  end
103
-
103
+
104
104
  it "should not encode hashtags" do
105
105
  crawler.links.should == ['http://example.com/foo#bar']
106
106
  end
107
-
107
+
108
108
  end
109
-
109
+
110
110
  context "urls with unicode characters" do
111
-
111
+
112
112
  let(:url) { 'http://example.com' }
113
113
  let(:crawler) { Rawler::Crawler.new(url) }
114
114
  let(:content) { '<a href="http://example.com/写程序容易出现的几个不好的地方">foo</a>' }
115
-
115
+
116
116
  before(:each) do
117
117
  register(url, content)
118
118
  end
@@ -120,9 +120,9 @@ describe Rawler::Crawler do
120
120
  it "should parse unicode links" do
121
121
  crawler.links.should == ['http://example.com/%E5%86%99%E7%A8%8B%E5%BA%8F%E5%AE%B9%E6%98%93%E5%87%BA%E7%8E%B0%E7%9A%84%E5%87%A0%E4%B8%AA%E4%B8%8D%E5%A5%BD%E7%9A%84%E5%9C%B0%E6%96%B9']
122
122
  end
123
-
123
+
124
124
  end
125
-
125
+
126
126
  context "invalid urls" do
127
127
 
128
128
  context "javascript" do
@@ -130,11 +130,11 @@ describe Rawler::Crawler do
130
130
  let(:crawler) { Rawler::Crawler.new(url) }
131
131
  let(:js_url) { "javascript:fn('nbjmup;jhfs.esf{fio/dpn');" }
132
132
  let(:content) { "<a href=\"#{js_url}\">foo</a><a name=\"foo\">" }
133
-
133
+
134
134
  before(:each) do
135
135
  register(url, content)
136
136
  end
137
-
137
+
138
138
  it "should return empty links" do
139
139
  crawler.links.should == []
140
140
  end
@@ -149,11 +149,11 @@ describe Rawler::Crawler do
149
149
  let(:url) { 'http://example.com/path' }
150
150
  let(:crawler) { Rawler::Crawler.new(url) }
151
151
  let(:content) { "<a href=\"mailto:example@example.com\">foo</a><a name=\"foo\">" }
152
-
152
+
153
153
  before(:each) do
154
154
  register(url, content)
155
155
  end
156
-
156
+
157
157
  it "should return empty links" do
158
158
  crawler.links.should == []
159
159
  end
@@ -163,16 +163,16 @@ describe Rawler::Crawler do
163
163
  crawler.links
164
164
  end
165
165
  end
166
-
166
+
167
167
  context "callto" do
168
168
  let(:url) { 'http://example.com/path' }
169
169
  let(:crawler) { Rawler::Crawler.new(url) }
170
170
  let(:content) { "<a href=\"callto:home22\">foo</a><a name=\"foo\">" }
171
-
171
+
172
172
  before(:each) do
173
173
  register(url, content)
174
174
  end
175
-
175
+
176
176
  it "should return empty links" do
177
177
  crawler.links.should == []
178
178
  end
@@ -187,12 +187,12 @@ describe Rawler::Crawler do
187
187
  let(:url) { 'http://example.com/path' }
188
188
  let(:crawler) { Rawler::Crawler.new(url) }
189
189
  let(:content) { "<a href=\"http://example.com/search/page:1/\">foo</a><a href=\"http://example.com/search/page:2/\">foo</a>" }
190
-
190
+
191
191
  before(:each) do
192
192
  Rawler.set_skip_pattern('\/search\/(.*\/)?page:[2-9]', false)
193
193
  register(url, content)
194
194
  end
195
-
195
+
196
196
  it "should return one links" do
197
197
  crawler.links.length.should eql(1)
198
198
  end
@@ -201,63 +201,144 @@ describe Rawler::Crawler do
201
201
  crawler.should_not_receive(:write)
202
202
  crawler.links
203
203
  end
204
+
205
+ after(:each) do
206
+ Rawler.set_skip_pattern(nil)
207
+ end
204
208
  end
205
209
 
206
210
  context "case-insensitive skip matches" do
207
211
  let(:url) { 'http://example.com/path' }
208
212
  let(:crawler) { Rawler::Crawler.new(url) }
209
213
  let(:content) { "<a href=\"http://example.com/search/page:1/\">foo</a><a href=\"http://example.com/search/page:2/\">foo</a>" }
210
-
214
+
211
215
  before(:each) do
212
216
  Rawler.set_skip_pattern('\/seArcH\/(.*\/)?PAGE:[2-9]', true)
213
217
  register(url, content)
214
218
  end
215
-
219
+
220
+ it "should return one links" do
221
+ crawler.links.length.should eql(1)
222
+ end
223
+
224
+ it "should not report that it's skipping" do
225
+ crawler.should_not_receive(:write)
226
+ crawler.links
227
+ end
228
+
229
+ after(:each) do
230
+ Rawler.set_skip_pattern(nil)
231
+ end
232
+ end
233
+
234
+ context "include matches" do
235
+ let(:url) { 'http://example.com/path' }
236
+ let(:crawler) { Rawler::Crawler.new(url) }
237
+ let(:content) { "<a href=\"http://example.com/search/page:1/\">foo</a><a href=\"http://example.com/search/page:2/\">foo</a>" }
238
+
239
+ before(:each) do
240
+ Rawler.set_include_pattern('\/search\/(.*\/)?page:[2-9]', false)
241
+ register(url, content)
242
+ end
243
+
244
+ it "should return one links" do
245
+ crawler.links.length.should eql(1)
246
+ crawler.links.should eq(['http://example.com/search/page:2/'])
247
+ end
248
+
249
+ it "should not report that it's including" do
250
+ crawler.should_not_receive(:write)
251
+ crawler.links
252
+ end
253
+
254
+ after(:each) do
255
+ Rawler.set_include_pattern(nil)
256
+ end
257
+ end
258
+
259
+ context "case-insensitive include matches" do
260
+ let(:url) { 'http://example.com/path' }
261
+ let(:crawler) { Rawler::Crawler.new(url) }
262
+ let(:content) { "<a href=\"http://example.com/search/page:1/\">foo</a><a href=\"http://example.com/search/page:2/\">foo</a>" }
263
+
264
+ before(:each) do
265
+ Rawler.set_include_pattern('\/seArcH\/(.*\/)?PAGE:[2-9]', true)
266
+ register(url, content)
267
+ end
268
+
216
269
  it "should return one links" do
217
270
  crawler.links.length.should eql(1)
218
271
  end
219
272
 
273
+ it "should not report that it's including" do
274
+ crawler.should_not_receive(:write)
275
+ crawler.links
276
+ end
277
+
278
+ after(:each) do
279
+ Rawler.set_include_pattern(nil)
280
+ end
281
+ end
282
+
283
+ context "non-local site should be omitted when local flag is used" do
284
+ let(:url) { 'http://example.com/' }
285
+ let(:crawler) { Rawler::Crawler.new(url) }
286
+ let(:content) { "<a href=\"http://example.com/page1/\">foo</a><a href=\"http://example.org/page2\">foo</a>" }
287
+
288
+ before(:each) do
289
+ Rawler.local = true
290
+ register(url, content)
291
+ end
292
+
293
+ it "should return one link" do
294
+ crawler.links.length.should eql(1)
295
+ end
296
+
220
297
  it "should not report that it's skipping" do
221
298
  crawler.should_not_receive(:write)
222
299
  crawler.links
223
300
  end
301
+
302
+ after(:each) do
303
+ Rawler.local = false
304
+ end
224
305
  end
225
306
 
226
307
  end
227
308
 
228
309
  context "content type" do
229
-
310
+
230
311
  ['text/plain', 'text/css', 'image/jpeg'].each do |content_type|
231
-
312
+
232
313
  let(:url) { 'http://example.com' }
233
314
  let(:crawler) { Rawler::Crawler.new(url) }
234
-
315
+
235
316
  before(:each) do
236
317
  register(url, '', 200, :content_type => content_type)
237
318
  end
238
-
319
+
239
320
  it "should ignore '#{content_type}'" do
240
321
  crawler.links.should == []
241
322
  end
242
-
323
+
243
324
  end
244
325
  end
245
-
326
+
246
327
  context "Exceptions" do
247
-
328
+
248
329
  let(:url) { 'http://example.com' }
249
330
  let(:crawler) { Rawler::Crawler.new(url) }
250
-
331
+
251
332
  before(:each) do
252
333
  register(url, '')
253
334
  end
254
-
335
+
255
336
  context "Errno::ECONNREFUSED" do
256
-
337
+
257
338
  before(:each) do
258
- Rawler::Request.stub!(:get).and_raise Errno::ECONNREFUSED
339
+ Rawler::Request.stub(:get).and_raise Errno::ECONNREFUSED
259
340
  end
260
-
341
+
261
342
  it "should return an empty array" do
262
343
  crawler.links.should == []
263
344
  end
@@ -266,14 +347,14 @@ describe Rawler::Crawler do
266
347
  output.should_receive(:error).with("Couldn't connect to #{url}")
267
348
 
268
349
  crawler.links
269
- end
270
-
350
+ end
351
+
271
352
  end
272
-
353
+
273
354
  context "Errno::ETIMEDOUT" do
274
-
355
+
275
356
  before(:each) do
276
- Rawler::Request.stub!(:get).and_raise Errno::ETIMEDOUT
357
+ Rawler::Request.stub(:get).and_raise Errno::ETIMEDOUT
277
358
  end
278
359
 
279
360
  it "should return an empty array when raising Errno::ETIMEDOUT" do
@@ -285,33 +366,33 @@ describe Rawler::Crawler do
285
366
 
286
367
  crawler.links
287
368
  end
288
-
369
+
289
370
  end
290
-
371
+
291
372
  end
292
-
373
+
293
374
  context "http basic" do
294
-
375
+
295
376
  let(:url) { 'http://example.com' }
296
377
  let(:content) { '<a href="http://example.com/secret-path">foo</a>' }
297
378
  let(:crawler) { Rawler::Crawler.new('http://example.com/secret') }
298
-
379
+
299
380
  before(:each) do
300
381
  register('http://example.com/secret', '', :status => ["401", "Unauthorized"])
301
382
  register('http://foo:bar@example.com/secret', content)
302
383
 
303
- Rawler.stub!(:username).and_return('foo')
304
- Rawler.stub!(:password).and_return('bar')
384
+ Rawler.stub(:username).and_return('foo')
385
+ Rawler.stub(:password).and_return('bar')
305
386
  end
306
-
387
+
307
388
  it "should crawl http basic pages" do
308
389
  crawler.links.should == ['http://example.com/secret-path']
309
390
  end
310
-
391
+
311
392
  end
312
-
393
+
313
394
  context "url domain" do
314
-
395
+
315
396
  let(:content) {
316
397
  content = <<-content
317
398
  <a href="http://example.com/valid">foo</a>
@@ -322,11 +403,11 @@ describe Rawler::Crawler do
322
403
  }
323
404
  let(:url) { 'http://example.com' }
324
405
  let(:crawler) { Rawler::Crawler.new(url) }
325
-
406
+
326
407
  before(:each) do
327
408
  register(url, content)
328
409
  end
329
-
410
+
330
411
  it "should ignore links other than http or https" do
331
412
  crawler.links.should == ['http://example.com/valid', 'https://foo.com', 'http://fooo.com']
332
413
  end
@@ -336,11 +417,11 @@ describe Rawler::Crawler do
336
417
  let(:content) { '<a href="http://foo;bar">foo</a>' }
337
418
  let(:url) { 'http://example.com' }
338
419
  let(:crawler) { Rawler::Crawler.new(url) }
339
-
420
+
340
421
  before(:each) do
341
422
  register(url, content)
342
423
  end
343
-
424
+
344
425
  it "should notify about the invalid url" do
345
426
  output.should_receive(:error).with('Invalid url: http://foo;bar - Called from: http://example.com')
346
427
  crawler.links.should == []
data/spec/lib/rawler_spec.rb CHANGED
@@ -8,7 +8,7 @@ describe Rawler::Base do
8
8
  let(:rawler) { Rawler::Base.new('http://example.com', output) }
9
9
 
10
10
  before(:each) do
11
- Rawler.stub!(:output).and_return(output)
11
+ Rawler.stub(:output).and_return(output)
12
12
  register('http://example.com', site)
13
13
  end
14
14
 
@@ -94,6 +94,7 @@ describe Rawler::Base do
94
94
  end
95
95
 
96
96
  it "should not validate links on external pages" do
97
+ register('http://example.com/', '<a href="http://external.com/foo">x</a>')
97
98
  register('http://example.com/foo', '<a href="http://external.com/foo">x</a>')
98
99
  register('http://external.com/foo', '<a href="http://external.com/bar">x</a>')
99
100
  register('http://external.com/bar', '')
data/spec/spec_helper.rb CHANGED
@@ -1,9 +1,9 @@
1
1
  module Kernel
2
-
2
+
3
3
  def sleep(duration)
4
4
  nil
5
5
  end
6
-
6
+
7
7
  end
8
8
 
9
9
 
@@ -16,3 +16,8 @@ FakeWeb.allow_net_connect = false
16
16
  def register(uri, content, status=200, options={})
17
17
  FakeWeb.register_uri(:any, uri, { :body => content, :status => status, :content_type => 'text/html' }.merge(options))
18
18
  end
19
+
20
+ if ENV['COVERAGE']
21
+ require 'simplecov'
22
+ SimpleCov.start
23
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.7
4
+ version: 0.1.8
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,24 +9,24 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-12-02 00:00:00.000000000 Z
12
+ date: 2013-09-07 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
16
16
  requirement: !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
- - - ! '>='
19
+ - - ~>
20
20
  - !ruby/object:Gem::Version
21
- version: '0'
21
+ version: '1.5'
22
22
  type: :runtime
23
23
  prerelease: false
24
24
  version_requirements: !ruby/object:Gem::Requirement
25
25
  none: false
26
26
  requirements:
27
- - - ! '>='
27
+ - - ~>
28
28
  - !ruby/object:Gem::Version
29
- version: '0'
29
+ version: '1.5'
30
30
  - !ruby/object:Gem::Dependency
31
31
  name: fakeweb
32
32
  requirement: !ruby/object:Gem::Requirement
@@ -82,7 +82,7 @@ dependencies:
82
82
  requirements:
83
83
  - - ~>
84
84
  - !ruby/object:Gem::Version
85
- version: 1.0.0
85
+ version: '1.0'
86
86
  type: :development
87
87
  prerelease: false
88
88
  version_requirements: !ruby/object:Gem::Requirement
@@ -90,7 +90,7 @@ dependencies:
90
90
  requirements:
91
91
  - - ~>
92
92
  - !ruby/object:Gem::Version
93
- version: 1.0.0
93
+ version: '1.0'
94
94
  - !ruby/object:Gem::Dependency
95
95
  name: jeweler
96
96
  requirement: !ruby/object:Gem::Requirement
@@ -107,22 +107,6 @@ dependencies:
107
107
  - - ~>
108
108
  - !ruby/object:Gem::Version
109
109
  version: 1.6.4
110
- - !ruby/object:Gem::Dependency
111
- name: rcov
112
- requirement: !ruby/object:Gem::Requirement
113
- none: false
114
- requirements:
115
- - - ! '>='
116
- - !ruby/object:Gem::Version
117
- version: '0'
118
- type: :development
119
- prerelease: false
120
- version_requirements: !ruby/object:Gem::Requirement
121
- none: false
122
- requirements:
123
- - - ! '>='
124
- - !ruby/object:Gem::Version
125
- version: '0'
126
110
  description: Rawler is a tool that crawls the links of your website
127
111
  email: info@oscardelben.com
128
112
  executables:
@@ -132,6 +116,8 @@ extra_rdoc_files:
132
116
  - LICENSE.txt
133
117
  - README.md
134
118
  files:
119
+ - .rspec
120
+ - .travis.yml
135
121
  - Gemfile
136
122
  - Gemfile.lock
137
123
  - LICENSE.txt
@@ -153,8 +139,6 @@ files:
153
139
  - spec/lib/rawler_spec.rb
154
140
  - spec/spec.opts
155
141
  - spec/spec_helper.rb
156
- - test/helper.rb
157
- - test/test_rawler.rb
158
142
  - vendor/lib-trollop.rb
159
143
  homepage: http://github.com/oscardelben/rawler
160
144
  licenses:
@@ -171,7 +155,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
171
155
  version: '0'
172
156
  segments:
173
157
  - 0
174
- hash: -477710177479430630
158
+ hash: 3460091796500092184
175
159
  required_rubygems_version: !ruby/object:Gem::Requirement
176
160
  none: false
177
161
  requirements:
data/test/helper.rb DELETED
@@ -1,18 +0,0 @@
1
- require 'rubygems'
2
- require 'bundler'
3
- begin
4
- Bundler.setup(:default, :development)
5
- rescue Bundler::BundlerError => e
6
- $stderr.puts e.message
7
- $stderr.puts "Run `bundle install` to install missing gems"
8
- exit e.status_code
9
- end
10
- require 'test/unit'
11
- require 'shoulda'
12
-
13
- $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
14
- $LOAD_PATH.unshift(File.dirname(__FILE__))
15
- require 'rawler'
16
-
17
- class Test::Unit::TestCase
18
- end
data/test/test_rawler.rb DELETED
@@ -1,7 +0,0 @@
1
- require 'helper'
2
-
3
- class TestRawler < Test::Unit::TestCase
4
- should "probably rename this file and start testing for real" do
5
- flunk "hey buddy, you should probably rename this file and start testing for real"
6
- end
7
- end