is_crawler 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +3 -1
- data/lib/config/crawlers.yml +34 -0
- data/lib/crawler.rb +5 -19
- data/lib/is_crawler/version.rb +1 -1
- data/spec/is_crawler_spec.rb +3 -3
- data/spec/lib/crawler_spec.rb +2 -1
- metadata +4 -3
data/README.md
CHANGED
@@ -32,7 +32,9 @@ You can use the `is_crawler?` method with just a user agent string to determine
|
|
32
32
|
end
|
33
33
|
end
|
34
34
|
|
35
|
-
...or provide one or more crawlers to find out if the string matches *specific* crawlers:
|
35
|
+
...or provide one or more crawlers to find out if the string matches *specific* crawlers:
|
36
|
+
|
37
|
+
is_crawler?("Some User Agent/1.0", :facebook, :google)
|
36
38
|
|
37
39
|
You can also define custom crawlers like this:
|
38
40
|
|
@@ -0,0 +1,34 @@
|
|
1
|
+
addthis:
|
2
|
+
ua_string: AddThis.com
|
3
|
+
alexa:
|
4
|
+
ua_string: ia_archiver
|
5
|
+
archive_org:
|
6
|
+
ua_string: archive.org_bot
|
7
|
+
bing:
|
8
|
+
ua_string: bingbot
|
9
|
+
bitly:
|
10
|
+
ua_string: bitlybot
|
11
|
+
exabot:
|
12
|
+
ua_string: Exabot
|
13
|
+
facebook:
|
14
|
+
ua_string: facebookexternalhit
|
15
|
+
flipboard:
|
16
|
+
ua_string: FlipboardProxy
|
17
|
+
google:
|
18
|
+
ua_string: Googlebot
|
19
|
+
google_web_preview:
|
20
|
+
ua_string: Google Web Preview
|
21
|
+
msn:
|
22
|
+
ua_string: MSNBot
|
23
|
+
openwebspider:
|
24
|
+
ua_string: OpenWebSpider
|
25
|
+
technorati:
|
26
|
+
ua_string: Technoratibot
|
27
|
+
twitter:
|
28
|
+
ua_string: Twitterbot
|
29
|
+
yahoo:
|
30
|
+
ua_string: Yahoo! Slurp
|
31
|
+
yahoo_jp:
|
32
|
+
ua_string: Y!J
|
33
|
+
yandex:
|
34
|
+
ua_string: Yandex
|
data/lib/crawler.rb
CHANGED
@@ -1,23 +1,9 @@
|
|
1
|
+
require 'yaml'
|
2
|
+
|
1
3
|
class Crawler < Struct.new(:name, :ua_string)
|
2
|
-
DEFAULT = [
|
3
|
-
Crawler.new(:addthis, "AddThis.com"),
|
4
|
-
Crawler.new(:alexa, "ia_archiver"),
|
5
|
-
Crawler.new(:archive_org, "archive.org_bot"),
|
6
|
-
Crawler.new(:bing, "bingbot"),
|
7
|
-
Crawler.new(:bitly, "bitlybot"),
|
8
|
-
Crawler.new(:exabot, "Exabot"),
|
9
|
-
Crawler.new(:facebook, "facebookexternalhit"),
|
10
|
-
Crawler.new(:flipboard, "FlipboardProxy"),
|
11
|
-
Crawler.new(:google, "Googlebot"),
|
12
|
-
Crawler.new(:google_web_preview, "Google Web Preview"),
|
13
|
-
Crawler.new(:msn, "MSNBot"),
|
14
|
-
Crawler.new(:openwebspider, "OpenWebSpider"),
|
15
|
-
Crawler.new(:technorati, "Technoratibot"),
|
16
|
-
Crawler.new(:twitter, "Twitterbot"),
|
17
|
-
Crawler.new(:yahoo, "Yahoo! Slurp"),
|
18
|
-
Crawler.new(:yahoo_jp, "Y!J"),
|
19
|
-
Crawler.new(:yandex, "Yandex")
|
20
|
-
].freeze
|
4
|
+
DEFAULT = YAML.load(File.read(File.expand_path('../config/crawlers.yml', __FILE__))).collect do |k,v|
|
5
|
+
Crawler.new(k.to_sym, v["ua_string"])
|
6
|
+
end
|
21
7
|
|
22
8
|
CUSTOM = []
|
23
9
|
|
data/lib/is_crawler/version.rb
CHANGED
data/spec/is_crawler_spec.rb
CHANGED
@@ -4,7 +4,7 @@ describe IsCrawler do
|
|
4
4
|
describe '#is_crawler?' do
|
5
5
|
context 'When specific crawlers are provided' do
|
6
6
|
subject { Test.new.is_crawler?(user_agent, :facebook, :google) }
|
7
|
-
context '
|
7
|
+
context 'and the provided string matches a crawler' do
|
8
8
|
context 'and it is in the specified list' do
|
9
9
|
context 'as the first element' do
|
10
10
|
let(:user_agent) { "facebookexternalhit/1.1" }
|
@@ -17,13 +17,13 @@ describe IsCrawler do
|
|
17
17
|
end
|
18
18
|
end
|
19
19
|
|
20
|
-
context '
|
20
|
+
context 'but it is not in the specified list' do
|
21
21
|
let(:user_agent) { "Twitterbot/1.1" }
|
22
22
|
it { should be_false }
|
23
23
|
end
|
24
24
|
end
|
25
25
|
|
26
|
-
context '
|
26
|
+
context 'but the provided string matches no crawlers' do
|
27
27
|
it { should be_false }
|
28
28
|
end
|
29
29
|
end
|
data/spec/lib/crawler_spec.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
describe Crawler do
|
2
2
|
let(:user_agent) { "Commodo Vestibulum/1.0" }
|
3
|
+
|
3
4
|
describe '.matches_any?' do
|
4
5
|
subject { Crawler.matches_any?(user_agent) }
|
5
6
|
|
@@ -37,7 +38,7 @@ describe Crawler do
|
|
37
38
|
subject { Crawler.which_crawler(user_agent) }
|
38
39
|
context 'When the provided string matches a crawler' do
|
39
40
|
let(:user_agent) { "facebookexternalhit/1.1" }
|
40
|
-
it { should
|
41
|
+
it { should be :facebook }
|
41
42
|
end
|
42
43
|
|
43
44
|
context 'When the provided string matches no crawlers' do
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: is_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -57,6 +57,7 @@ files:
|
|
57
57
|
- README.md
|
58
58
|
- Rakefile
|
59
59
|
- is_crawler.gemspec
|
60
|
+
- lib/config/crawlers.yml
|
60
61
|
- lib/crawler.rb
|
61
62
|
- lib/is_crawler.rb
|
62
63
|
- lib/is_crawler/version.rb
|
@@ -76,7 +77,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
76
77
|
version: '0'
|
77
78
|
segments:
|
78
79
|
- 0
|
79
|
-
hash:
|
80
|
+
hash: -3983243957140806942
|
80
81
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
81
82
|
none: false
|
82
83
|
requirements:
|
@@ -85,7 +86,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
85
86
|
version: '0'
|
86
87
|
segments:
|
87
88
|
- 0
|
88
|
-
hash:
|
89
|
+
hash: -3983243957140806942
|
89
90
|
requirements: []
|
90
91
|
rubyforge_project:
|
91
92
|
rubygems_version: 1.8.24
|