rawler 0.1.6 → 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +2 -0
- data/VERSION +1 -1
- data/bin/rawler +2 -0
- data/custom_logfile +0 -0
- data/lib/rawler.rb +5 -0
- data/lib/rawler/base.rb +3 -0
- data/lib/rawler/crawler.rb +8 -2
- data/rawler.gemspec +3 -2
- data/spec/lib/rawler/crawler_spec.rb +40 -0
- metadata +4 -3
data/README.md
CHANGED
@@ -15,6 +15,8 @@ Rawler will only parse pages with content type 'text/html', but it will check fo
|
|
15
15
|
--log, -l: Log results to file rawler_log.txt
|
16
16
|
--logfile, -o <s>: Specify logfile, implies --log (default: rawler_log.txt)
|
17
17
|
--css, -c: Check CSS links
|
18
|
+
--skip, -s <s>: Skip URLS that match a regexp
|
19
|
+
--iskip, -i <s>: Skip URLS that match a case insensitive regexp
|
18
20
|
--version, -v: Print version and exit
|
19
21
|
--help, -h: Show this message
|
20
22
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.6
|
1
|
+
0.1.7
|
data/bin/rawler
CHANGED
@@ -20,6 +20,8 @@ EOS
|
|
20
20
|
opt :log, "Log results to file #{Rawler::Base::DEFAULT_LOGFILE}", :type => :boolean, :default => false
|
21
21
|
opt :logfile, "Specify logfile, implies --log", :type => :string, :default => Rawler::Base::DEFAULT_LOGFILE
|
22
22
|
opt :css, "Check CSS links", :type => :boolean, :default => false
|
23
|
+
opt :skip, "Skip URLS that match a pattern", :type => :string
|
24
|
+
opt :iskip, "Skip URLS that match a case insensitive pattern", :type => :string
|
23
25
|
end
|
24
26
|
|
25
27
|
|
data/custom_logfile
ADDED
File without changes
|
data/lib/rawler.rb
CHANGED
@@ -12,6 +12,7 @@ module Rawler
|
|
12
12
|
mattr_accessor :username, :password
|
13
13
|
mattr_accessor :log, :logfile
|
14
14
|
mattr_accessor :css
|
15
|
+
mattr_accessor :skip_url_pattern
|
15
16
|
|
16
17
|
autoload :Base, "rawler/base"
|
17
18
|
autoload :Crawler, "rawler/crawler"
|
@@ -26,4 +27,8 @@ module Rawler
|
|
26
27
|
|
27
28
|
@@url = url
|
28
29
|
end
|
30
|
+
|
31
|
+
def self.set_skip_pattern(pattern, icase)
|
32
|
+
self.skip_url_pattern = pattern.nil? ? nil : Regexp.new(pattern, icase ? Regexp::IGNORECASE : nil )
|
33
|
+
end
|
29
34
|
end
|
data/lib/rawler/base.rb
CHANGED
@@ -16,6 +16,9 @@ module Rawler
|
|
16
16
|
Rawler.wait = options[:wait]
|
17
17
|
Rawler.css = options[:css]
|
18
18
|
|
19
|
+
Rawler.set_skip_pattern(options[:skip], false) unless options[:skip].nil?
|
20
|
+
Rawler.set_skip_pattern(options[:iskip], true) unless options[:iskip].nil?
|
21
|
+
|
19
22
|
# Using a custom logfile implies logging.
|
20
23
|
Rawler.logfile = options[:logfile] || DEFAULT_LOGFILE
|
21
24
|
Rawler.log = options[:log] || Rawler.logfile != DEFAULT_LOGFILE
|
data/lib/rawler/crawler.rb
CHANGED
@@ -64,6 +64,10 @@ module Rawler
|
|
64
64
|
Rawler.output.error(message)
|
65
65
|
end
|
66
66
|
|
67
|
+
def info(message)
|
68
|
+
Rawler.output.info(message)
|
69
|
+
end
|
70
|
+
|
67
71
|
def different_domain?(url_1, url_2)
|
68
72
|
URI.parse(url_1).host != URI.parse(url_2).host
|
69
73
|
end
|
@@ -85,7 +89,9 @@ module Rawler
|
|
85
89
|
url.strip!
|
86
90
|
|
87
91
|
scheme = URI.parse(url).scheme
|
88
|
-
if ['http', 'https'].include?(scheme)
|
92
|
+
if url =~ Rawler.skip_url_pattern
|
93
|
+
false
|
94
|
+
elsif ['http', 'https'].include?(scheme)
|
89
95
|
true
|
90
96
|
else
|
91
97
|
write("Invalid url - #{url}") unless url =~ SKIP_FORMATS
|
@@ -93,8 +99,8 @@ module Rawler
|
|
93
99
|
end
|
94
100
|
|
95
101
|
rescue URI::InvalidURIError
|
102
|
+
write("Invalid url - #{url}")
|
96
103
|
false
|
97
|
-
write("Invalid url - #{url}")
|
98
104
|
end
|
99
105
|
end
|
100
106
|
end
|
data/rawler.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "rawler"
|
8
|
-
s.version = "0.1.6"
|
8
|
+
s.version = "0.1.7"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Oscar Del Ben"]
|
12
|
-
s.date = "2012-
|
12
|
+
s.date = "2012-12-02"
|
13
13
|
s.description = "Rawler is a tool that crawls the links of your website"
|
14
14
|
s.email = "info@oscardelben.com"
|
15
15
|
s.executables = ["rawler"]
|
@@ -25,6 +25,7 @@ Gem::Specification.new do |s|
|
|
25
25
|
"Rakefile",
|
26
26
|
"VERSION",
|
27
27
|
"bin/rawler",
|
28
|
+
"custom_logfile",
|
28
29
|
"lib/rawler.rb",
|
29
30
|
"lib/rawler/base.rb",
|
30
31
|
"lib/rawler/core_extensions.rb",
|
@@ -183,6 +183,46 @@ describe Rawler::Crawler do
|
|
183
183
|
end
|
184
184
|
end
|
185
185
|
|
186
|
+
context "skip matches" do
|
187
|
+
let(:url) { 'http://example.com/path' }
|
188
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
189
|
+
let(:content) { "<a href=\"http://example.com/search/page:1/\">foo</a><a href=\"http://example.com/search/page:2/\">foo</a>" }
|
190
|
+
|
191
|
+
before(:each) do
|
192
|
+
Rawler.set_skip_pattern('\/search\/(.*\/)?page:[2-9]', false)
|
193
|
+
register(url, content)
|
194
|
+
end
|
195
|
+
|
196
|
+
it "should return one links" do
|
197
|
+
crawler.links.length.should eql(1)
|
198
|
+
end
|
199
|
+
|
200
|
+
it "should not report that it's skipping" do
|
201
|
+
crawler.should_not_receive(:write)
|
202
|
+
crawler.links
|
203
|
+
end
|
204
|
+
end
|
205
|
+
|
206
|
+
context "case-insensitive skip matches" do
|
207
|
+
let(:url) { 'http://example.com/path' }
|
208
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
209
|
+
let(:content) { "<a href=\"http://example.com/search/page:1/\">foo</a><a href=\"http://example.com/search/page:2/\">foo</a>" }
|
210
|
+
|
211
|
+
before(:each) do
|
212
|
+
Rawler.set_skip_pattern('\/seArcH\/(.*\/)?PAGE:[2-9]', true)
|
213
|
+
register(url, content)
|
214
|
+
end
|
215
|
+
|
216
|
+
it "should return one links" do
|
217
|
+
crawler.links.length.should eql(1)
|
218
|
+
end
|
219
|
+
|
220
|
+
it "should not report that it's skipping" do
|
221
|
+
crawler.should_not_receive(:write)
|
222
|
+
crawler.links
|
223
|
+
end
|
224
|
+
end
|
225
|
+
|
186
226
|
end
|
187
227
|
|
188
228
|
context "content type" do
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.6
|
4
|
+
version: 0.1.7
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-12-02 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -139,6 +139,7 @@ files:
|
|
139
139
|
- Rakefile
|
140
140
|
- VERSION
|
141
141
|
- bin/rawler
|
142
|
+
- custom_logfile
|
142
143
|
- lib/rawler.rb
|
143
144
|
- lib/rawler/base.rb
|
144
145
|
- lib/rawler/core_extensions.rb
|
@@ -170,7 +171,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
170
171
|
version: '0'
|
171
172
|
segments:
|
172
173
|
- 0
|
173
|
-
hash: -
|
174
|
+
hash: -477710177479430630
|
174
175
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
175
176
|
none: false
|
176
177
|
requirements:
|