is_crawler 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +4 -5
- data/lib/config/crawlers.yml +22 -34
- data/lib/crawler.rb +3 -3
- data/lib/is_crawler/version.rb +1 -1
- data/lib/is_crawler.rb +1 -1
- data/spec/lib/crawler_spec.rb +28 -21
- metadata +4 -4
data/README.md
CHANGED
@@ -1,7 +1,6 @@
|
|
1
|
-
#
|
2
|
-
[](http://badge.fury.io/rb/is_crawler) [](https://codeclimate.com/github/ccashwell/is_crawler) [](https://travis-ci.org/ccashwell/is_crawler)
|
1
|
+
# is_crawler [](http://badge.fury.io/rb/is_crawler) [](https://travis-ci.org/ccashwell/is_crawler) [](https://codeclimate.com/github/ccashwell/is_crawler)
|
3
2
|
|
4
|
-
|
3
|
+
This gem does one thing: determine if the supplied string matches a known crawler or bot. It matches against a very short list of strings found in the user agents that represent over 95% of crawler traffic. IMO, if it ain't detected, it ain't important.
|
5
4
|
|
6
5
|
## Installation
|
7
6
|
|
@@ -24,7 +23,7 @@ You can use the `is_crawler?` method with just a user agent string to determine
|
|
24
23
|
class MyController < ActionController::Base
|
25
24
|
include IsCrawler
|
26
25
|
def index
|
27
|
-
if
|
26
|
+
if is_crawler? request.env["HTTP_USER_AGENT"]
|
28
27
|
render 'special_crawler_index'
|
29
28
|
else
|
30
29
|
render 'normal_boring_index'
|
@@ -38,7 +37,7 @@ You can use the `is_crawler?` method with just a user agent string to determine
|
|
38
37
|
|
39
38
|
You can also define custom crawlers like this:
|
40
39
|
|
41
|
-
Crawler::CUSTOM << Crawler.new(:custom_crawler_name, "string that is always present in crawler
|
40
|
+
Crawler::CUSTOM << Crawler.new(:custom_crawler_name, "string that is always present in the crawler's user agent")
|
42
41
|
|
43
42
|
That's it!
|
44
43
|
|
data/lib/config/crawlers.yml
CHANGED
@@ -1,34 +1,22 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
alexa:
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
bing:
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
openwebspider:
|
24
|
-
ua_string: OpenWebSpider
|
25
|
-
technorati:
|
26
|
-
ua_string: Technoratibot
|
27
|
-
twitter:
|
28
|
-
ua_string: Twitterbot
|
29
|
-
yahoo:
|
30
|
-
ua_string: Yahoo! Slurp
|
31
|
-
yahoo_jp:
|
32
|
-
ua_string: Y!J
|
33
|
-
yandex:
|
34
|
-
ua_string: Yandex
|
1
|
+
crawlers:
|
2
|
+
addthis: AddThis.com
|
3
|
+
alexa: ia_archiver
|
4
|
+
archive_org: archive.org_bot
|
5
|
+
ask: Ask Jeeves
|
6
|
+
baidu: baidu
|
7
|
+
bing: bingbot
|
8
|
+
bitly: bitlybot
|
9
|
+
blekko: Blekkobot
|
10
|
+
exabot: Exabot
|
11
|
+
facebook: facebookexternalhit
|
12
|
+
flipboard: FlipboardProxy
|
13
|
+
google: Googlebot
|
14
|
+
google_web_preview: Google Web Preview
|
15
|
+
msn: MSNBot
|
16
|
+
mywebsearch: MyWebSearch
|
17
|
+
openwebspider: OpenWebSpider
|
18
|
+
technorati: Technoratibot
|
19
|
+
twitter: Twitterbot
|
20
|
+
yahoo: Yahoo! Slurp
|
21
|
+
yahoo_jp: Y!J
|
22
|
+
yandex: Yandex
|
data/lib/crawler.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
require 'yaml'
|
2
2
|
|
3
3
|
class Crawler < Struct.new(:name, :ua_string)
|
4
|
-
DEFAULT = YAML.load(File.read(File.expand_path('../config/crawlers.yml', __FILE__))).collect do |k,v|
|
5
|
-
Crawler.new(k.to_sym, v
|
4
|
+
DEFAULT = YAML.load(File.read(File.expand_path('../config/crawlers.yml', __FILE__)))["crawlers"].collect do |k,v|
|
5
|
+
Crawler.new(k.to_sym, v)
|
6
6
|
end
|
7
7
|
|
8
8
|
CUSTOM = []
|
@@ -17,7 +17,7 @@ class Crawler < Struct.new(:name, :ua_string)
|
|
17
17
|
end
|
18
18
|
|
19
19
|
def which_crawler user_agent
|
20
|
-
all.detect {|crawler| crawler.matches? user_agent }.name rescue nil
|
20
|
+
all.detect { |crawler| crawler.matches? user_agent }.name rescue nil
|
21
21
|
end
|
22
22
|
end
|
23
23
|
|
data/lib/is_crawler/version.rb
CHANGED
data/lib/is_crawler.rb
CHANGED
@@ -6,7 +6,7 @@ module IsCrawler
|
|
6
6
|
if specific_crawlers && specific_crawlers.size > 0
|
7
7
|
specific_crawlers.include?(Crawler.which_crawler(requesting_user_agent))
|
8
8
|
else
|
9
|
-
Crawler.matches_any?(requesting_user_agent)
|
9
|
+
Crawler.matches_any?(requesting_user_agent)
|
10
10
|
end
|
11
11
|
end
|
12
12
|
end
|
data/spec/lib/crawler_spec.rb
CHANGED
@@ -1,62 +1,69 @@
|
|
1
1
|
describe Crawler do
|
2
|
-
let(:
|
2
|
+
let(:chrome_user_agent) { 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17' }
|
3
|
+
let(:google_user_agent) { 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)' }
|
4
|
+
let(:facebook_user_agent) { 'facebookexternalhit/1.0 (http://www.facebook.com/externalhit_uatext.php)' }
|
3
5
|
|
4
6
|
describe '.matches_any?' do
|
5
7
|
subject { Crawler.matches_any?(user_agent) }
|
6
8
|
|
7
9
|
context 'When an unknown user agent is encountered' do
|
10
|
+
let(:user_agent) { chrome_user_agent }
|
8
11
|
it { should be_false }
|
9
12
|
end
|
10
13
|
|
11
14
|
context 'When a known user agent is encountered' do
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
15
|
+
context 'such as the facebook crawler' do
|
16
|
+
let(:user_agent) { facebook_user_agent }
|
17
|
+
it { should be_true }
|
18
|
+
end
|
19
|
+
|
20
|
+
context 'such as the Googlebot' do
|
21
|
+
let(:user_agent) { google_user_agent }
|
22
|
+
it { should be_true }
|
17
23
|
end
|
18
24
|
end
|
19
25
|
end
|
20
26
|
|
21
27
|
describe '#matches?' do
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
end
|
28
|
+
describe 'Comparing Googlebot\'s known UA string' do
|
29
|
+
subject { Crawler.new(:google, 'Googlebot').matches?(user_agent) }
|
30
|
+
context "with a matching string" do
|
31
|
+
let(:user_agent) { google_user_agent }
|
32
|
+
it { should be_true }
|
33
|
+
end
|
29
34
|
|
30
|
-
|
31
|
-
|
32
|
-
|
35
|
+
context 'with a non-matching string' do
|
36
|
+
let(:user_agent) { chrome_user_agent }
|
37
|
+
it { should be_false }
|
33
38
|
end
|
34
39
|
end
|
35
40
|
end
|
36
41
|
|
37
|
-
describe '
|
42
|
+
describe '.which_crawler' do
|
38
43
|
subject { Crawler.which_crawler(user_agent) }
|
39
44
|
context 'When the provided string matches a crawler' do
|
40
|
-
let(:user_agent) {
|
45
|
+
let(:user_agent) { facebook_user_agent }
|
41
46
|
it { should be :facebook }
|
42
47
|
end
|
43
48
|
|
44
49
|
context 'When the provided string matches no crawlers' do
|
50
|
+
let(:user_agent) { chrome_user_agent }
|
45
51
|
it { should be_nil }
|
46
52
|
end
|
47
53
|
end
|
48
54
|
|
49
55
|
describe 'Custom Crawler' do
|
50
56
|
let(:custom_crawler) { Crawler.new(:custom, "Custom/1.0") }
|
57
|
+
let(:user_agent) { custom_crawler.ua_string }
|
51
58
|
before { Crawler::CUSTOM << custom_crawler }
|
52
59
|
context '.matches_any' do
|
53
60
|
subject { Crawler.matches_any?(user_agent) }
|
54
61
|
context 'When the provided string matches the custom crawler' do
|
55
|
-
let(:user_agent) { "Custom/1.0" }
|
56
62
|
it { should be_true }
|
57
63
|
end
|
58
64
|
|
59
65
|
context 'When the provided string does not match the custom crawler' do
|
66
|
+
let(:user_agent) { chrome_user_agent }
|
60
67
|
it { should be_false }
|
61
68
|
end
|
62
69
|
end
|
@@ -64,11 +71,11 @@ describe Crawler do
|
|
64
71
|
context '.which_crawler' do
|
65
72
|
subject { Crawler.which_crawler(user_agent) }
|
66
73
|
context 'When the provided string matches the custom crawler' do
|
67
|
-
let(:user_agent) { "Custom/1.0" }
|
68
74
|
it { should be custom_crawler.name }
|
69
75
|
end
|
70
76
|
|
71
77
|
context 'When the provided string does not match the custom crawler' do
|
78
|
+
let(:user_agent) { chrome_user_agent }
|
72
79
|
it { should_not be custom_crawler.name }
|
73
80
|
end
|
74
81
|
end
|
@@ -76,11 +83,11 @@ describe Crawler do
|
|
76
83
|
context '#matches?' do
|
77
84
|
subject { custom_crawler.matches?(user_agent) }
|
78
85
|
context 'When the provided string matches the custom crawler' do
|
79
|
-
let(:user_agent) { "Custom/1.0" }
|
80
86
|
it { should be_true }
|
81
87
|
end
|
82
88
|
|
83
89
|
context 'When the provided string does not match the custom crawler' do
|
90
|
+
let(:user_agent) { chrome_user_agent }
|
84
91
|
it { should be_false }
|
85
92
|
end
|
86
93
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: is_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.1.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-03-
|
12
|
+
date: 2013-03-04 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
@@ -77,7 +77,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
77
77
|
version: '0'
|
78
78
|
segments:
|
79
79
|
- 0
|
80
|
-
hash: -
|
80
|
+
hash: -1826155163722186205
|
81
81
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
82
82
|
none: false
|
83
83
|
requirements:
|
@@ -86,7 +86,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
86
86
|
version: '0'
|
87
87
|
segments:
|
88
88
|
- 0
|
89
|
-
hash: -
|
89
|
+
hash: -1826155163722186205
|
90
90
|
requirements: []
|
91
91
|
rubyforge_project:
|
92
92
|
rubygems_version: 1.8.24
|