is_crawler 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +4 -5
- data/lib/config/crawlers.yml +22 -34
- data/lib/crawler.rb +3 -3
- data/lib/is_crawler/version.rb +1 -1
- data/lib/is_crawler.rb +1 -1
- data/spec/lib/crawler_spec.rb +28 -21
- metadata +4 -4
data/README.md
CHANGED
@@ -1,7 +1,6 @@
|
|
1
|
-
#
|
2
|
-
[![Gem Version](https://badge.fury.io/rb/is_crawler.png)](http://badge.fury.io/rb/is_crawler) [![Code Climate](https://codeclimate.com/github/ccashwell/is_crawler.png)](https://codeclimate.com/github/ccashwell/is_crawler) [![Build Status](https://travis-ci.org/ccashwell/is_crawler.png?branch=master)](https://travis-ci.org/ccashwell/is_crawler)
|
1
|
+
# is_crawler [![Gem Version](https://badge.fury.io/rb/is_crawler.png)](http://badge.fury.io/rb/is_crawler) [![Build Status](https://travis-ci.org/ccashwell/is_crawler.png?branch=master)](https://travis-ci.org/ccashwell/is_crawler) [![Code Climate](https://codeclimate.com/github/ccashwell/is_crawler.png)](https://codeclimate.com/github/ccashwell/is_crawler)
|
3
2
|
|
4
|
-
|
3
|
+
This gem does one thing: determine if the supplied string matches a known crawler or bot. It matches against a very short list of strings found in the user agents that represent over 95% of crawler traffic. IMO, if it ain't detected, it ain't important.
|
5
4
|
|
6
5
|
## Installation
|
7
6
|
|
@@ -24,7 +23,7 @@ You can use the `is_crawler?` method with just a user agent string to determine
|
|
24
23
|
class MyController < ActionController::Base
|
25
24
|
include IsCrawler
|
26
25
|
def index
|
27
|
-
if
|
26
|
+
if is_crawler? request.env["HTTP_USER_AGENT"]
|
28
27
|
render 'special_crawler_index'
|
29
28
|
else
|
30
29
|
render 'normal_boring_index'
|
@@ -38,7 +37,7 @@ You can use the `is_crawler?` method with just a user agent string to determine
|
|
38
37
|
|
39
38
|
You can also define custom crawlers like this:
|
40
39
|
|
41
|
-
Crawler::CUSTOM << Crawler.new(:custom_crawler_name, "string that is always present in crawler
|
40
|
+
Crawler::CUSTOM << Crawler.new(:custom_crawler_name, "string that is always present in the crawler's user agent")
|
42
41
|
|
43
42
|
That's it!
|
44
43
|
|
data/lib/config/crawlers.yml
CHANGED
@@ -1,34 +1,22 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
alexa:
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
bing:
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
openwebspider:
|
24
|
-
ua_string: OpenWebSpider
|
25
|
-
technorati:
|
26
|
-
ua_string: Technoratibot
|
27
|
-
twitter:
|
28
|
-
ua_string: Twitterbot
|
29
|
-
yahoo:
|
30
|
-
ua_string: Yahoo! Slurp
|
31
|
-
yahoo_jp:
|
32
|
-
ua_string: Y!J
|
33
|
-
yandex:
|
34
|
-
ua_string: Yandex
|
1
|
+
crawlers:
|
2
|
+
addthis: AddThis.com
|
3
|
+
alexa: ia_archiver
|
4
|
+
archive_org: archive.org_bot
|
5
|
+
ask: Ask Jeeves
|
6
|
+
baidu: baidu
|
7
|
+
bing: bingbot
|
8
|
+
bitly: bitlybot
|
9
|
+
blekko: Blekkobot
|
10
|
+
exabot: Exabot
|
11
|
+
facebook: facebookexternalhit
|
12
|
+
flipboard: FlipboardProxy
|
13
|
+
google: Googlebot
|
14
|
+
google_web_preview: Google Web Preview
|
15
|
+
msn: MSNBot
|
16
|
+
mywebsearch: MyWebSearch
|
17
|
+
openwebspider: OpenWebSpider
|
18
|
+
technorati: Technoratibot
|
19
|
+
twitter: Twitterbot
|
20
|
+
yahoo: Yahoo! Slurp
|
21
|
+
yahoo_jp: Y!J
|
22
|
+
yandex: Yandex
|
data/lib/crawler.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
require 'yaml'
|
2
2
|
|
3
3
|
class Crawler < Struct.new(:name, :ua_string)
|
4
|
-
DEFAULT = YAML.load(File.read(File.expand_path('../config/crawlers.yml', __FILE__))).collect do |k,v|
|
5
|
-
Crawler.new(k.to_sym, v
|
4
|
+
DEFAULT = YAML.load(File.read(File.expand_path('../config/crawlers.yml', __FILE__)))["crawlers"].collect do |k,v|
|
5
|
+
Crawler.new(k.to_sym, v)
|
6
6
|
end
|
7
7
|
|
8
8
|
CUSTOM = []
|
@@ -17,7 +17,7 @@ class Crawler < Struct.new(:name, :ua_string)
|
|
17
17
|
end
|
18
18
|
|
19
19
|
def which_crawler user_agent
|
20
|
-
all.detect {|crawler| crawler.matches? user_agent }.name rescue nil
|
20
|
+
all.detect { |crawler| crawler.matches? user_agent }.name rescue nil
|
21
21
|
end
|
22
22
|
end
|
23
23
|
|
data/lib/is_crawler/version.rb
CHANGED
data/lib/is_crawler.rb
CHANGED
@@ -6,7 +6,7 @@ module IsCrawler
|
|
6
6
|
if specific_crawlers && specific_crawlers.size > 0
|
7
7
|
specific_crawlers.include?(Crawler.which_crawler(requesting_user_agent))
|
8
8
|
else
|
9
|
-
Crawler.matches_any?(requesting_user_agent)
|
9
|
+
Crawler.matches_any?(requesting_user_agent)
|
10
10
|
end
|
11
11
|
end
|
12
12
|
end
|
data/spec/lib/crawler_spec.rb
CHANGED
@@ -1,62 +1,69 @@
|
|
1
1
|
describe Crawler do
|
2
|
-
let(:
|
2
|
+
let(:chrome_user_agent) { 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17' }
|
3
|
+
let(:google_user_agent) { 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)' }
|
4
|
+
let(:facebook_user_agent) { 'facebookexternalhit/1.0 (http://www.facebook.com/externalhit_uatext.php)' }
|
3
5
|
|
4
6
|
describe '.matches_any?' do
|
5
7
|
subject { Crawler.matches_any?(user_agent) }
|
6
8
|
|
7
9
|
context 'When an unknown user agent is encountered' do
|
10
|
+
let(:user_agent) { chrome_user_agent }
|
8
11
|
it { should be_false }
|
9
12
|
end
|
10
13
|
|
11
14
|
context 'When a known user agent is encountered' do
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
15
|
+
context 'such as the facebook crawler' do
|
16
|
+
let(:user_agent) { facebook_user_agent }
|
17
|
+
it { should be_true }
|
18
|
+
end
|
19
|
+
|
20
|
+
context 'such as the Googlebot' do
|
21
|
+
let(:user_agent) { google_user_agent }
|
22
|
+
it { should be_true }
|
17
23
|
end
|
18
24
|
end
|
19
25
|
end
|
20
26
|
|
21
27
|
describe '#matches?' do
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
end
|
28
|
+
describe 'Comparing Googlebot\'s known UA string' do
|
29
|
+
subject { Crawler.new(:google, 'Googlebot').matches?(user_agent) }
|
30
|
+
context "with a matching string" do
|
31
|
+
let(:user_agent) { google_user_agent }
|
32
|
+
it { should be_true }
|
33
|
+
end
|
29
34
|
|
30
|
-
|
31
|
-
|
32
|
-
|
35
|
+
context 'with a non-matching string' do
|
36
|
+
let(:user_agent) { chrome_user_agent }
|
37
|
+
it { should be_false }
|
33
38
|
end
|
34
39
|
end
|
35
40
|
end
|
36
41
|
|
37
|
-
describe '
|
42
|
+
describe '.which_crawler' do
|
38
43
|
subject { Crawler.which_crawler(user_agent) }
|
39
44
|
context 'When the provided string matches a crawler' do
|
40
|
-
let(:user_agent) {
|
45
|
+
let(:user_agent) { facebook_user_agent }
|
41
46
|
it { should be :facebook }
|
42
47
|
end
|
43
48
|
|
44
49
|
context 'When the provided string matches no crawlers' do
|
50
|
+
let(:user_agent) { chrome_user_agent }
|
45
51
|
it { should be_nil }
|
46
52
|
end
|
47
53
|
end
|
48
54
|
|
49
55
|
describe 'Custom Crawler' do
|
50
56
|
let(:custom_crawler) { Crawler.new(:custom, "Custom/1.0") }
|
57
|
+
let(:user_agent) { custom_crawler.ua_string }
|
51
58
|
before { Crawler::CUSTOM << custom_crawler }
|
52
59
|
context '.matches_any' do
|
53
60
|
subject { Crawler.matches_any?(user_agent) }
|
54
61
|
context 'When the provided string matches the custom crawler' do
|
55
|
-
let(:user_agent) { "Custom/1.0" }
|
56
62
|
it { should be_true }
|
57
63
|
end
|
58
64
|
|
59
65
|
context 'When the provided string does not match the custom crawler' do
|
66
|
+
let(:user_agent) { chrome_user_agent }
|
60
67
|
it { should be_false }
|
61
68
|
end
|
62
69
|
end
|
@@ -64,11 +71,11 @@ describe Crawler do
|
|
64
71
|
context '.which_crawler' do
|
65
72
|
subject { Crawler.which_crawler(user_agent) }
|
66
73
|
context 'When the provided string matches the custom crawler' do
|
67
|
-
let(:user_agent) { "Custom/1.0" }
|
68
74
|
it { should be custom_crawler.name }
|
69
75
|
end
|
70
76
|
|
71
77
|
context 'When the provided string does not match the custom crawler' do
|
78
|
+
let(:user_agent) { chrome_user_agent }
|
72
79
|
it { should_not be custom_crawler.name }
|
73
80
|
end
|
74
81
|
end
|
@@ -76,11 +83,11 @@ describe Crawler do
|
|
76
83
|
context '#matches?' do
|
77
84
|
subject { custom_crawler.matches?(user_agent) }
|
78
85
|
context 'When the provided string matches the custom crawler' do
|
79
|
-
let(:user_agent) { "Custom/1.0" }
|
80
86
|
it { should be_true }
|
81
87
|
end
|
82
88
|
|
83
89
|
context 'When the provided string does not match the custom crawler' do
|
90
|
+
let(:user_agent) { chrome_user_agent }
|
84
91
|
it { should be_false }
|
85
92
|
end
|
86
93
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: is_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.1.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-03-
|
12
|
+
date: 2013-03-04 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
@@ -77,7 +77,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
77
77
|
version: '0'
|
78
78
|
segments:
|
79
79
|
- 0
|
80
|
-
hash: -
|
80
|
+
hash: -1826155163722186205
|
81
81
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
82
82
|
none: false
|
83
83
|
requirements:
|
@@ -86,7 +86,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
86
86
|
version: '0'
|
87
87
|
segments:
|
88
88
|
- 0
|
89
|
-
hash: -
|
89
|
+
hash: -1826155163722186205
|
90
90
|
requirements: []
|
91
91
|
rubyforge_project:
|
92
92
|
rubygems_version: 1.8.24
|