rawler 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -15,6 +15,8 @@ Rawler will only parse pages with content type 'text/html', but it will check fo
15
15
  --log, -l: Log results to file rawler_log.txt
16
16
  --logfile, -o <s>: Specify logfile, implies --log (default: rawler_log.txt)
17
17
  --css, -c: Check CSS links
18
+ --skip, -s <s>: Skip URLs that match a regexp
19
+ --iskip, -i <s>: Skip URLs that match a case-insensitive regexp
18
20
  --version, -v: Print version and exit
19
21
  --help, -h: Show this message
20
22
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.6
1
+ 0.1.7
data/bin/rawler CHANGED
@@ -20,6 +20,8 @@ EOS
20
20
  opt :log, "Log results to file #{Rawler::Base::DEFAULT_LOGFILE}", :type => :boolean, :default => false
21
21
  opt :logfile, "Specify logfile, implies --log", :type => :string, :default => Rawler::Base::DEFAULT_LOGFILE
22
22
  opt :css, "Check CSS links", :type => :boolean, :default => false
23
+ opt :skip, "Skip URLS that match a pattern", :type => :string
24
+ opt :iskip, "Skip URLS that match a case insensitive pattern", :type => :string
23
25
  end
24
26
 
25
27
 
File without changes
@@ -12,6 +12,7 @@ module Rawler
12
12
  mattr_accessor :username, :password
13
13
  mattr_accessor :log, :logfile
14
14
  mattr_accessor :css
15
+ mattr_accessor :skip_url_pattern
15
16
 
16
17
  autoload :Base, "rawler/base"
17
18
  autoload :Crawler, "rawler/crawler"
@@ -26,4 +27,8 @@ module Rawler
26
27
 
27
28
  @@url = url
28
29
  end
30
+
31
+ def self.set_skip_pattern(pattern, icase)
32
+ self.skip_url_pattern = pattern.nil? ? nil : Regexp.new(pattern, icase ? Regexp::IGNORECASE : nil )
33
+ end
29
34
  end
@@ -16,6 +16,9 @@ module Rawler
16
16
  Rawler.wait = options[:wait]
17
17
  Rawler.css = options[:css]
18
18
 
19
+ Rawler.set_skip_pattern(options[:skip], false) unless options[:skip].nil?
20
+ Rawler.set_skip_pattern(options[:iskip], true) unless options[:iskip].nil?
21
+
19
22
  # Using a custom logfile implies logging.
20
23
  Rawler.logfile = options[:logfile] || DEFAULT_LOGFILE
21
24
  Rawler.log = options[:log] || Rawler.logfile != DEFAULT_LOGFILE
@@ -64,6 +64,10 @@ module Rawler
64
64
  Rawler.output.error(message)
65
65
  end
66
66
 
67
+ def info(message)
68
+ Rawler.output.info(message)
69
+ end
70
+
67
71
  def different_domain?(url_1, url_2)
68
72
  URI.parse(url_1).host != URI.parse(url_2).host
69
73
  end
@@ -85,7 +89,9 @@ module Rawler
85
89
  url.strip!
86
90
 
87
91
  scheme = URI.parse(url).scheme
88
- if ['http', 'https'].include?(scheme)
92
+ if url =~ Rawler.skip_url_pattern
93
+ false
94
+ elsif ['http', 'https'].include?(scheme)
89
95
  true
90
96
  else
91
97
  write("Invalid url - #{url}") unless url =~ SKIP_FORMATS
@@ -93,8 +99,8 @@ module Rawler
93
99
  end
94
100
 
95
101
  rescue URI::InvalidURIError
102
+ write("Invalid url - #{url}")
96
103
  false
97
- write("Invalid url - #{url}")
98
104
  end
99
105
  end
100
106
  end
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "rawler"
8
- s.version = "0.1.6"
8
+ s.version = "0.1.7"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Oscar Del Ben"]
12
- s.date = "2012-07-17"
12
+ s.date = "2012-12-02"
13
13
  s.description = "Rawler is a tool that crawls the links of your website"
14
14
  s.email = "info@oscardelben.com"
15
15
  s.executables = ["rawler"]
@@ -25,6 +25,7 @@ Gem::Specification.new do |s|
25
25
  "Rakefile",
26
26
  "VERSION",
27
27
  "bin/rawler",
28
+ "custom_logfile",
28
29
  "lib/rawler.rb",
29
30
  "lib/rawler/base.rb",
30
31
  "lib/rawler/core_extensions.rb",
@@ -183,6 +183,46 @@ describe Rawler::Crawler do
183
183
  end
184
184
  end
185
185
 
186
+ context "skip matches" do
187
+ let(:url) { 'http://example.com/path' }
188
+ let(:crawler) { Rawler::Crawler.new(url) }
189
+ let(:content) { "<a href=\"http://example.com/search/page:1/\">foo</a><a href=\"http://example.com/search/page:2/\">foo</a>" }
190
+
191
+ before(:each) do
192
+ Rawler.set_skip_pattern('\/search\/(.*\/)?page:[2-9]', false)
193
+ register(url, content)
194
+ end
195
+
196
+ it "should return one links" do
197
+ crawler.links.length.should eql(1)
198
+ end
199
+
200
+ it "should not report that it's skipping" do
201
+ crawler.should_not_receive(:write)
202
+ crawler.links
203
+ end
204
+ end
205
+
206
+ context "case-insensitive skip matches" do
207
+ let(:url) { 'http://example.com/path' }
208
+ let(:crawler) { Rawler::Crawler.new(url) }
209
+ let(:content) { "<a href=\"http://example.com/search/page:1/\">foo</a><a href=\"http://example.com/search/page:2/\">foo</a>" }
210
+
211
+ before(:each) do
212
+ Rawler.set_skip_pattern('\/seArcH\/(.*\/)?PAGE:[2-9]', true)
213
+ register(url, content)
214
+ end
215
+
216
+ it "should return one links" do
217
+ crawler.links.length.should eql(1)
218
+ end
219
+
220
+ it "should not report that it's skipping" do
221
+ crawler.should_not_receive(:write)
222
+ crawler.links
223
+ end
224
+ end
225
+
186
226
  end
187
227
 
188
228
  context "content type" do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.7
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-17 00:00:00.000000000 Z
12
+ date: 2012-12-02 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
@@ -139,6 +139,7 @@ files:
139
139
  - Rakefile
140
140
  - VERSION
141
141
  - bin/rawler
142
+ - custom_logfile
142
143
  - lib/rawler.rb
143
144
  - lib/rawler/base.rb
144
145
  - lib/rawler/core_extensions.rb
@@ -170,7 +171,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
170
171
  version: '0'
171
172
  segments:
172
173
  - 0
173
- hash: -2845352824216855200
174
+ hash: -477710177479430630
174
175
  required_rubygems_version: !ruby/object:Gem::Requirement
175
176
  none: false
176
177
  requirements: