human_power 0.0.6 → 0.1.0
- checksums.yaml +4 -4
- data/README.md +25 -0
- data/lib/human_power.rb +13 -0
- data/lib/human_power/version.rb +1 -1
- data/test/bot_detection_test.rb +43 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e0c1c83f98908134117b2b4aab263d7624fbdd8c
+  data.tar.gz: 622632e2dd563dacc9c1da16c45bc7c39063f7d2
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 17b7c4f6212ec8cd7c7b163cc4852490ae979a1c9c85f0186bf4fedcfba7b8846dfc442245c1bf89568101638a7e93c0d4d9921dc02b3544a25684842bfabec4
+  data.tar.gz: f0cec68d61d6e3fb520816e3add83874c6a61ed1c289ef3fefe342a60597b1e2d5c4437b7ac51aaf750330c2ef6ea6442f91c6c574804ca7e5fb6765c9d33b3c
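The digests above cover the two archives packed inside the released gem. As a minimal sketch of how to check them (the local file names are assumptions; `metadata.gz` and `data.tar.gz` first have to be extracted from the downloaded `human_power-0.1.0.gem`, which is a plain tar archive), Ruby's standard Digest library can recompute both:

```ruby
require "digest"

# Assumed local copies of the two archives extracted from human_power-0.1.0.gem.
%w[metadata.gz data.tar.gz].each do |name|
  bytes = File.binread(name)
  puts "#{name} SHA1:   #{Digest::SHA1.hexdigest(bytes)}"
  puts "#{name} SHA512: #{Digest::SHA512.hexdigest(bytes)}"
end
```

The printed values should equal the `+` lines in the diff above.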
data/README.md
CHANGED
@@ -77,6 +77,31 @@ Then visit `/robots.txt` in your browser.
 Please see [user_agents.yml](https://github.com/lassebunk/human_power/blob/master/user_agents.yml) for a list of 170+ built-in user agents/crawlers you can use like shown above.
 The list is from [UserAgentString.com](http://www.useragentstring.com/pages/Crawlerlist/).
 
+### Bot detection
+
+You can use the `HumanPower.is_bot?` method to check if a user agent is a known bot / crawler:
+
+```ruby
+# Googlebot
+ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
+HumanPower.is_bot?(ua) # => true
+
+# Chrome
+ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36"
+HumanPower.is_bot?(ua) # => false
+
+# in Rails
+HumanPower.is_bot?(request.user_agent) # => performs check on current user agent
+```
+
+### Regular expression
+
+If you need to get a regular expression for bot detection, you can use:
+
+```ruby
+HumanPower.bot_regex # => regular expression that matches all known bots / crawlers
+```
+
 ## Caveats
 
 Human Power is great for adding rules to your robots.txt.
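The new README section shows the detection calls in isolation. A common way to put them to work is filtering crawler traffic in a Rails app; the sketch below is an illustration under assumptions (the `VisitsController` name and the tracking scenario are invented, and only `HumanPower.is_bot?` comes from the gem):

```ruby
# Hypothetical controller that records page visits for analytics.
class VisitsController < ApplicationController
  # Answer crawler requests early so bots do not pollute visit statistics.
  before_action :skip_crawlers

  private

  def skip_crawlers
    head :ok if HumanPower.is_bot?(request.user_agent)
  end
end
```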
data/lib/human_power.rb
CHANGED
@@ -24,6 +24,19 @@ module HumanPower
     @user_agents ||= load_user_agents
   end
 
+  # Regular expression to match bot user agents.
+  def bot_regex
+    @bot_regex ||= begin
+      escaped_values = user_agents.values.map { |ua| Regexp.escape(ua) }
+      /#{escaped_values.join("|")}/i
+    end
+  end
+
+  # Returns +true+ if a given user agent is a bot.
+  def is_bot?(user_agent)
+    !!(user_agent =~ bot_regex)
+  end
+
   private
 
   # Loads the built-in user agents from crawlers.yml.
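The `bot_regex` method builds a single alternation over every user agent string from user_agents.yml, escaping each value so characters like `.`, `+`, and `(` match literally, and memoizes the compiled pattern. A standalone sketch of the same escape-and-join technique, using a made-up two-entry hash in place of the gem's 170+ entries:

```ruby
# Two invented entries standing in for the gem's user_agents.yml data.
user_agents = { googlebot: "Googlebot", bingbot: "bingbot" }

# Escape each value and join with "|" into one case-insensitive pattern,
# mirroring the bot_regex method above.
bot_regex = /#{user_agents.values.map { |ua| Regexp.escape(ua) }.join("|")}/i

"Mozilla/5.0 (compatible; Googlebot/2.1)" =~ bot_regex  # => 25 (match position)
"Mozilla/5.0 (Windows NT 5.1) Firefox/31.0" =~ bot_regex # => nil
```

Ruby's built-in `Regexp.union` performs the same escape-and-join in one call; spelling it out as the gem does keeps the single `/i` flag on the combined pattern explicit.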
data/lib/human_power/version.rb
CHANGED
@@ -1,3 +1,3 @@
 module HumanPower
-  VERSION = "0.0.6"
+  VERSION = "0.1.0"
 end
data/test/bot_detection_test.rb
ADDED
@@ -0,0 +1,43 @@
+require 'test_helper'
+
+class GeneratorTest < ActionView::TestCase
+  test "bot regex matches" do
+    bot_user_agents.each do |ua|
+      assert_match HumanPower.bot_regex, ua
+    end
+  end
+
+  test "bot regex browser matches" do
+    browser_user_agents.each do |ua|
+      assert_no_match HumanPower.bot_regex, ua
+    end
+  end
+
+  test "bot detection" do
+    bot_user_agents.each do |ua|
+      assert HumanPower.is_bot?(ua)
+    end
+  end
+
+  test "browser detection" do
+    browser_user_agents.each do |ua|
+      assert !HumanPower.is_bot?(ua)
+    end
+  end
+
+  private
+
+  def bot_user_agents
+    ["Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
+     "Googlebot/2.1 (+http://www.googlebot.com/bot.html)",
+     "Mozilla/4.0 (compatible; B-l-i-t-z-B-O-T)",
+     "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"]
+  end
+
+  def browser_user_agents
+    ["Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0",
+     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
+     "Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25",
+     "Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0"]
+  end
+end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: human_power
 version: !ruby/object:Gem::Version
-  version: 0.0.6
+  version: 0.1.0
 platform: ruby
 authors:
 - Lasse Bunk
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-
+date: 2014-05-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -91,6 +91,7 @@ files:
 - lib/human_power/rails/engine.rb
 - lib/human_power/rule.rb
 - lib/human_power/version.rb
+- test/bot_detection_test.rb
 - test/dummy/README.rdoc
 - test/dummy/Rakefile
 - test/dummy/app/assets/images/.keep
@@ -159,6 +160,7 @@ signing_key:
 specification_version: 4
 summary: Easy generation of robots.txt. Force the robots into submission!
 test_files:
+- test/bot_detection_test.rb
 - test/dummy/README.rdoc
 - test/dummy/Rakefile
 - test/dummy/app/assets/images/.keep