rawler 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +2 -0
- data/VERSION +1 -1
- data/bin/rawler +2 -0
- data/custom_logfile +0 -0
- data/lib/rawler.rb +5 -0
- data/lib/rawler/base.rb +3 -0
- data/lib/rawler/crawler.rb +8 -2
- data/rawler.gemspec +3 -2
- data/spec/lib/rawler/crawler_spec.rb +40 -0
- metadata +4 -3
data/README.md
CHANGED
|
@@ -15,6 +15,8 @@ Rawler will only parse pages with content type 'text/html', but it will check fo
|
|
|
15
15
|
--log, -l: Log results to file rawler_log.txt
|
|
16
16
|
--logfile, -o <s>: Specify logfile, implies --log (default: rawler_log.txt)
|
|
17
17
|
--css, -c: Check CSS links
|
|
18
|
+
--skip, -s <s>: Skip URLS that match a regexp
|
|
19
|
+
--iskip, -i <s>: Skip URLS that match a case insensitive regexp
|
|
18
20
|
--version, -v: Print version and exit
|
|
19
21
|
--help, -h: Show this message
|
|
20
22
|
|
data/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
0.1.6
|
|
1
|
+
0.1.7
|
data/bin/rawler
CHANGED
|
@@ -20,6 +20,8 @@ EOS
|
|
|
20
20
|
opt :log, "Log results to file #{Rawler::Base::DEFAULT_LOGFILE}", :type => :boolean, :default => false
|
|
21
21
|
opt :logfile, "Specify logfile, implies --log", :type => :string, :default => Rawler::Base::DEFAULT_LOGFILE
|
|
22
22
|
opt :css, "Check CSS links", :type => :boolean, :default => false
|
|
23
|
+
opt :skip, "Skip URLS that match a pattern", :type => :string
|
|
24
|
+
opt :iskip, "Skip URLS that match a case insensitive pattern", :type => :string
|
|
23
25
|
end
|
|
24
26
|
|
|
25
27
|
|
data/custom_logfile
ADDED
|
File without changes
|
data/lib/rawler.rb
CHANGED
|
@@ -12,6 +12,7 @@ module Rawler
|
|
|
12
12
|
mattr_accessor :username, :password
|
|
13
13
|
mattr_accessor :log, :logfile
|
|
14
14
|
mattr_accessor :css
|
|
15
|
+
mattr_accessor :skip_url_pattern
|
|
15
16
|
|
|
16
17
|
autoload :Base, "rawler/base"
|
|
17
18
|
autoload :Crawler, "rawler/crawler"
|
|
@@ -26,4 +27,8 @@ module Rawler
|
|
|
26
27
|
|
|
27
28
|
@@url = url
|
|
28
29
|
end
|
|
30
|
+
|
|
31
|
+
def self.set_skip_pattern(pattern, icase)
|
|
32
|
+
self.skip_url_pattern = pattern.nil? ? nil : Regexp.new(pattern, icase ? Regexp::IGNORECASE : nil )
|
|
33
|
+
end
|
|
29
34
|
end
|
data/lib/rawler/base.rb
CHANGED
|
@@ -16,6 +16,9 @@ module Rawler
|
|
|
16
16
|
Rawler.wait = options[:wait]
|
|
17
17
|
Rawler.css = options[:css]
|
|
18
18
|
|
|
19
|
+
Rawler.set_skip_pattern(options[:skip], false) unless options[:skip].nil?
|
|
20
|
+
Rawler.set_skip_pattern(options[:iskip], true) unless options[:iskip].nil?
|
|
21
|
+
|
|
19
22
|
# Using a custom logfile implies logging.
|
|
20
23
|
Rawler.logfile = options[:logfile] || DEFAULT_LOGFILE
|
|
21
24
|
Rawler.log = options[:log] || Rawler.logfile != DEFAULT_LOGFILE
|
data/lib/rawler/crawler.rb
CHANGED
|
@@ -64,6 +64,10 @@ module Rawler
|
|
|
64
64
|
Rawler.output.error(message)
|
|
65
65
|
end
|
|
66
66
|
|
|
67
|
+
def info(message)
|
|
68
|
+
Rawler.output.info(message)
|
|
69
|
+
end
|
|
70
|
+
|
|
67
71
|
def different_domain?(url_1, url_2)
|
|
68
72
|
URI.parse(url_1).host != URI.parse(url_2).host
|
|
69
73
|
end
|
|
@@ -85,7 +89,9 @@ module Rawler
|
|
|
85
89
|
url.strip!
|
|
86
90
|
|
|
87
91
|
scheme = URI.parse(url).scheme
|
|
88
|
-
if ['http', 'https'].include?(scheme)
|
|
92
|
+
if url =~ Rawler.skip_url_pattern
|
|
93
|
+
false
|
|
94
|
+
elsif ['http', 'https'].include?(scheme)
|
|
89
95
|
true
|
|
90
96
|
else
|
|
91
97
|
write("Invalid url - #{url}") unless url =~ SKIP_FORMATS
|
|
@@ -93,8 +99,8 @@ module Rawler
|
|
|
93
99
|
end
|
|
94
100
|
|
|
95
101
|
rescue URI::InvalidURIError
|
|
102
|
+
write("Invalid url - #{url}")
|
|
96
103
|
false
|
|
97
|
-
write("Invalid url - #{url}")
|
|
98
104
|
end
|
|
99
105
|
end
|
|
100
106
|
end
|
data/rawler.gemspec
CHANGED
|
@@ -5,11 +5,11 @@
|
|
|
5
5
|
|
|
6
6
|
Gem::Specification.new do |s|
|
|
7
7
|
s.name = "rawler"
|
|
8
|
-
s.version = "0.1.6"
|
|
8
|
+
s.version = "0.1.7"
|
|
9
9
|
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
|
11
11
|
s.authors = ["Oscar Del Ben"]
|
|
12
|
-
s.date = "2012-
|
|
12
|
+
s.date = "2012-12-02"
|
|
13
13
|
s.description = "Rawler is a tool that crawls the links of your website"
|
|
14
14
|
s.email = "info@oscardelben.com"
|
|
15
15
|
s.executables = ["rawler"]
|
|
@@ -25,6 +25,7 @@ Gem::Specification.new do |s|
|
|
|
25
25
|
"Rakefile",
|
|
26
26
|
"VERSION",
|
|
27
27
|
"bin/rawler",
|
|
28
|
+
"custom_logfile",
|
|
28
29
|
"lib/rawler.rb",
|
|
29
30
|
"lib/rawler/base.rb",
|
|
30
31
|
"lib/rawler/core_extensions.rb",
|
|
data/spec/lib/rawler/crawler_spec.rb
CHANGED
|
@@ -183,6 +183,46 @@ describe Rawler::Crawler do
|
|
|
183
183
|
end
|
|
184
184
|
end
|
|
185
185
|
|
|
186
|
+
context "skip matches" do
|
|
187
|
+
let(:url) { 'http://example.com/path' }
|
|
188
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
|
189
|
+
let(:content) { "<a href=\"http://example.com/search/page:1/\">foo</a><a href=\"http://example.com/search/page:2/\">foo</a>" }
|
|
190
|
+
|
|
191
|
+
before(:each) do
|
|
192
|
+
Rawler.set_skip_pattern('\/search\/(.*\/)?page:[2-9]', false)
|
|
193
|
+
register(url, content)
|
|
194
|
+
end
|
|
195
|
+
|
|
196
|
+
it "should return one links" do
|
|
197
|
+
crawler.links.length.should eql(1)
|
|
198
|
+
end
|
|
199
|
+
|
|
200
|
+
it "should not report that it's skipping" do
|
|
201
|
+
crawler.should_not_receive(:write)
|
|
202
|
+
crawler.links
|
|
203
|
+
end
|
|
204
|
+
end
|
|
205
|
+
|
|
206
|
+
context "case-insensitive skip matches" do
|
|
207
|
+
let(:url) { 'http://example.com/path' }
|
|
208
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
|
209
|
+
let(:content) { "<a href=\"http://example.com/search/page:1/\">foo</a><a href=\"http://example.com/search/page:2/\">foo</a>" }
|
|
210
|
+
|
|
211
|
+
before(:each) do
|
|
212
|
+
Rawler.set_skip_pattern('\/seArcH\/(.*\/)?PAGE:[2-9]', true)
|
|
213
|
+
register(url, content)
|
|
214
|
+
end
|
|
215
|
+
|
|
216
|
+
it "should return one links" do
|
|
217
|
+
crawler.links.length.should eql(1)
|
|
218
|
+
end
|
|
219
|
+
|
|
220
|
+
it "should not report that it's skipping" do
|
|
221
|
+
crawler.should_not_receive(:write)
|
|
222
|
+
crawler.links
|
|
223
|
+
end
|
|
224
|
+
end
|
|
225
|
+
|
|
186
226
|
end
|
|
187
227
|
|
|
188
228
|
context "content type" do
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: rawler
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.6
|
|
4
|
+
version: 0.1.7
|
|
5
5
|
prerelease:
|
|
6
6
|
platform: ruby
|
|
7
7
|
authors:
|
|
@@ -9,7 +9,7 @@ authors:
|
|
|
9
9
|
autorequire:
|
|
10
10
|
bindir: bin
|
|
11
11
|
cert_chain: []
|
|
12
|
-
date: 2012-
|
|
12
|
+
date: 2012-12-02 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
|
14
14
|
- !ruby/object:Gem::Dependency
|
|
15
15
|
name: nokogiri
|
|
@@ -139,6 +139,7 @@ files:
|
|
|
139
139
|
- Rakefile
|
|
140
140
|
- VERSION
|
|
141
141
|
- bin/rawler
|
|
142
|
+
- custom_logfile
|
|
142
143
|
- lib/rawler.rb
|
|
143
144
|
- lib/rawler/base.rb
|
|
144
145
|
- lib/rawler/core_extensions.rb
|
|
@@ -170,7 +171,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
170
171
|
version: '0'
|
|
171
172
|
segments:
|
|
172
173
|
- 0
|
|
173
|
-
hash: -
|
|
174
|
+
hash: -477710177479430630
|
|
174
175
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
175
176
|
none: false
|
|
176
177
|
requirements:
|