human_power 0.0.6 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +25 -0
- data/lib/human_power.rb +13 -0
- data/lib/human_power/version.rb +1 -1
- data/test/bot_detection_test.rb +43 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e0c1c83f98908134117b2b4aab263d7624fbdd8c
|
4
|
+
data.tar.gz: 622632e2dd563dacc9c1da16c45bc7c39063f7d2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 17b7c4f6212ec8cd7c7b163cc4852490ae979a1c9c85f0186bf4fedcfba7b8846dfc442245c1bf89568101638a7e93c0d4d9921dc02b3544a25684842bfabec4
|
7
|
+
data.tar.gz: f0cec68d61d6e3fb520816e3add83874c6a61ed1c289ef3fefe342a60597b1e2d5c4437b7ac51aaf750330c2ef6ea6442f91c6c574804ca7e5fb6765c9d33b3c
|
data/README.md
CHANGED
@@ -77,6 +77,31 @@ Then visit `/robots.txt` in your browser.
|
|
77
77
|
Please see [user_agents.yml](https://github.com/lassebunk/human_power/blob/master/user_agents.yml) for a list of 170+ built-in user agents/crawlers you can use like shown above.
|
78
78
|
The list is from [UserAgentString.com](http://www.useragentstring.com/pages/Crawlerlist/).
|
79
79
|
|
80
|
+
### Bot detection
|
81
|
+
|
82
|
+
You can use the `HumanPower.is_bot?` method to check if a user agent is a known bot / crawler:
|
83
|
+
|
84
|
+
```ruby
|
85
|
+
# Googlebot
|
86
|
+
ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
|
87
|
+
HumanPower.is_bot?(ua) # => true
|
88
|
+
|
89
|
+
# Chrome
|
90
|
+
ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36"
|
91
|
+
HumanPower.is_bot?(ua) # => false
|
92
|
+
|
93
|
+
# in Rails
|
94
|
+
HumanPower.is_bot?(request.user_agent) # => performs check on current user agent
|
95
|
+
```
|
96
|
+
|
97
|
+
### Regular expression
|
98
|
+
|
99
|
+
If you need to get a regular expression for bot detection, you can use:
|
100
|
+
|
101
|
+
```ruby
|
102
|
+
HumanPower.bot_regex # => regular expression that matches all known bots / crawlers
|
103
|
+
```
|
104
|
+
|
80
105
|
## Caveats
|
81
106
|
|
82
107
|
Human Power is great for adding rules to your robots.txt.
|
data/lib/human_power.rb
CHANGED
@@ -24,6 +24,19 @@ module HumanPower
|
|
24
24
|
@user_agents ||= load_user_agents
|
25
25
|
end
|
26
26
|
|
27
|
+
# Regular expression to match bot user agents.
|
28
|
+
def bot_regex
|
29
|
+
@bot_regex ||= begin
|
30
|
+
escaped_values = user_agents.values.map { |ua| Regexp.escape(ua) }
|
31
|
+
/#{escaped_values.join("|")}/i
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
# Returns +true+ if a given user agent is a bot.
|
36
|
+
def is_bot?(user_agent)
|
37
|
+
!!(user_agent =~ bot_regex)
|
38
|
+
end
|
39
|
+
|
27
40
|
private
|
28
41
|
|
29
42
|
# Loads the built-in user agents from crawlers.yml.
|
data/lib/human_power/version.rb
CHANGED
data/test/bot_detection_test.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
class GeneratorTest < ActionView::TestCase
|
4
|
+
test "bot regex matches" do
|
5
|
+
bot_user_agents.each do |ua|
|
6
|
+
assert_match HumanPower.bot_regex, ua
|
7
|
+
end
|
8
|
+
end
|
9
|
+
|
10
|
+
test "bot regex browser matches" do
|
11
|
+
browser_user_agents.each do |ua|
|
12
|
+
assert_no_match HumanPower.bot_regex, ua
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
test "bot detection" do
|
17
|
+
bot_user_agents.each do |ua|
|
18
|
+
assert HumanPower.is_bot?(ua)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
test "browser detection" do
|
23
|
+
browser_user_agents.each do |ua|
|
24
|
+
assert !HumanPower.is_bot?(ua)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
private
|
29
|
+
|
30
|
+
def bot_user_agents
|
31
|
+
["Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
|
32
|
+
"Googlebot/2.1 (+http://www.googlebot.com/bot.html)",
|
33
|
+
"Mozilla/4.0 (compatible; B-l-i-t-z-B-O-T)",
|
34
|
+
"Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"]
|
35
|
+
end
|
36
|
+
|
37
|
+
def browser_user_agents
|
38
|
+
["Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0",
|
39
|
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
|
40
|
+
"Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25",
|
41
|
+
"Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0"]
|
42
|
+
end
|
43
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: human_power
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.6
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Lasse Bunk
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-05-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -91,6 +91,7 @@ files:
|
|
91
91
|
- lib/human_power/rails/engine.rb
|
92
92
|
- lib/human_power/rule.rb
|
93
93
|
- lib/human_power/version.rb
|
94
|
+
- test/bot_detection_test.rb
|
94
95
|
- test/dummy/README.rdoc
|
95
96
|
- test/dummy/Rakefile
|
96
97
|
- test/dummy/app/assets/images/.keep
|
@@ -159,6 +160,7 @@ signing_key:
|
|
159
160
|
specification_version: 4
|
160
161
|
summary: Easy generation of robots.txt. Force the robots into submission!
|
161
162
|
test_files:
|
163
|
+
- test/bot_detection_test.rb
|
162
164
|
- test/dummy/README.rdoc
|
163
165
|
- test/dummy/Rakefile
|
164
166
|
- test/dummy/app/assets/images/.keep
|