rawler 0.1.6 → 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -15,6 +15,8 @@ Rawler will only parse pages with content type 'text/html', but it will check fo
15
15
  --log, -l: Log results to file rawler_log.txt
16
16
  --logfile, -o <s>: Specify logfile, implies --log (default: rawler_log.txt)
17
17
  --css, -c: Check CSS links
18
+ --skip, -s <s>: Skip URLs that match a regexp
19
+ --iskip, -i <s>: Skip URLs that match a case-insensitive regexp
18
20
  --version, -v: Print version and exit
19
21
  --help, -h: Show this message
20
22
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.6
1
+ 0.1.7
data/bin/rawler CHANGED
@@ -20,6 +20,8 @@ EOS
20
20
  opt :log, "Log results to file #{Rawler::Base::DEFAULT_LOGFILE}", :type => :boolean, :default => false
21
21
  opt :logfile, "Specify logfile, implies --log", :type => :string, :default => Rawler::Base::DEFAULT_LOGFILE
22
22
  opt :css, "Check CSS links", :type => :boolean, :default => false
23
+ opt :skip, "Skip URLS that match a pattern", :type => :string
24
+ opt :iskip, "Skip URLS that match a case insensitive pattern", :type => :string
23
25
  end
24
26
 
25
27
 
File without changes
@@ -12,6 +12,7 @@ module Rawler
12
12
  mattr_accessor :username, :password
13
13
  mattr_accessor :log, :logfile
14
14
  mattr_accessor :css
15
+ mattr_accessor :skip_url_pattern
15
16
 
16
17
  autoload :Base, "rawler/base"
17
18
  autoload :Crawler, "rawler/crawler"
@@ -26,4 +27,8 @@ module Rawler
26
27
 
27
28
  @@url = url
28
29
  end
30
+
31
+ def self.set_skip_pattern(pattern, icase)
32
+ self.skip_url_pattern = pattern.nil? ? nil : Regexp.new(pattern, icase ? Regexp::IGNORECASE : nil )
33
+ end
29
34
  end
@@ -16,6 +16,9 @@ module Rawler
16
16
  Rawler.wait = options[:wait]
17
17
  Rawler.css = options[:css]
18
18
 
19
+ Rawler.set_skip_pattern(options[:skip], false) unless options[:skip].nil?
20
+ Rawler.set_skip_pattern(options[:iskip], true) unless options[:iskip].nil?
21
+
19
22
  # Using a custom logfile implies logging.
20
23
  Rawler.logfile = options[:logfile] || DEFAULT_LOGFILE
21
24
  Rawler.log = options[:log] || Rawler.logfile != DEFAULT_LOGFILE
@@ -64,6 +64,10 @@ module Rawler
64
64
  Rawler.output.error(message)
65
65
  end
66
66
 
67
+ def info(message)
68
+ Rawler.output.info(message)
69
+ end
70
+
67
71
  def different_domain?(url_1, url_2)
68
72
  URI.parse(url_1).host != URI.parse(url_2).host
69
73
  end
@@ -85,7 +89,9 @@ module Rawler
85
89
  url.strip!
86
90
 
87
91
  scheme = URI.parse(url).scheme
88
- if ['http', 'https'].include?(scheme)
92
+ if url =~ Rawler.skip_url_pattern
93
+ false
94
+ elsif ['http', 'https'].include?(scheme)
89
95
  true
90
96
  else
91
97
  write("Invalid url - #{url}") unless url =~ SKIP_FORMATS
@@ -93,8 +99,8 @@ module Rawler
93
99
  end
94
100
 
95
101
  rescue URI::InvalidURIError
102
+ write("Invalid url - #{url}")
96
103
  false
97
- write("Invalid url - #{url}")
98
104
  end
99
105
  end
100
106
  end
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "rawler"
8
- s.version = "0.1.6"
8
+ s.version = "0.1.7"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Oscar Del Ben"]
12
- s.date = "2012-07-17"
12
+ s.date = "2012-12-02"
13
13
  s.description = "Rawler is a tool that crawls the links of your website"
14
14
  s.email = "info@oscardelben.com"
15
15
  s.executables = ["rawler"]
@@ -25,6 +25,7 @@ Gem::Specification.new do |s|
25
25
  "Rakefile",
26
26
  "VERSION",
27
27
  "bin/rawler",
28
+ "custom_logfile",
28
29
  "lib/rawler.rb",
29
30
  "lib/rawler/base.rb",
30
31
  "lib/rawler/core_extensions.rb",
@@ -183,6 +183,46 @@ describe Rawler::Crawler do
183
183
  end
184
184
  end
185
185
 
186
+ context "skip matches" do
187
+ let(:url) { 'http://example.com/path' }
188
+ let(:crawler) { Rawler::Crawler.new(url) }
189
+ let(:content) { "<a href=\"http://example.com/search/page:1/\">foo</a><a href=\"http://example.com/search/page:2/\">foo</a>" }
190
+
191
+ before(:each) do
192
+ Rawler.set_skip_pattern('\/search\/(.*\/)?page:[2-9]', false)
193
+ register(url, content)
194
+ end
195
+
196
+ it "should return one links" do
197
+ crawler.links.length.should eql(1)
198
+ end
199
+
200
+ it "should not report that it's skipping" do
201
+ crawler.should_not_receive(:write)
202
+ crawler.links
203
+ end
204
+ end
205
+
206
+ context "case-insensitive skip matches" do
207
+ let(:url) { 'http://example.com/path' }
208
+ let(:crawler) { Rawler::Crawler.new(url) }
209
+ let(:content) { "<a href=\"http://example.com/search/page:1/\">foo</a><a href=\"http://example.com/search/page:2/\">foo</a>" }
210
+
211
+ before(:each) do
212
+ Rawler.set_skip_pattern('\/seArcH\/(.*\/)?PAGE:[2-9]', true)
213
+ register(url, content)
214
+ end
215
+
216
+ it "should return one links" do
217
+ crawler.links.length.should eql(1)
218
+ end
219
+
220
+ it "should not report that it's skipping" do
221
+ crawler.should_not_receive(:write)
222
+ crawler.links
223
+ end
224
+ end
225
+
186
226
  end
187
227
 
188
228
  context "content type" do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.7
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-17 00:00:00.000000000 Z
12
+ date: 2012-12-02 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
@@ -139,6 +139,7 @@ files:
139
139
  - Rakefile
140
140
  - VERSION
141
141
  - bin/rawler
142
+ - custom_logfile
142
143
  - lib/rawler.rb
143
144
  - lib/rawler/base.rb
144
145
  - lib/rawler/core_extensions.rb
@@ -170,7 +171,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
170
171
  version: '0'
171
172
  segments:
172
173
  - 0
173
- hash: -2845352824216855200
174
+ hash: -477710177479430630
174
175
  required_rubygems_version: !ruby/object:Gem::Requirement
175
176
  none: false
176
177
  requirements: