legitbot 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +4 -0
- data/.travis.yml +5 -0
- data/Gemfile +2 -0
- data/LICENSE.txt +13 -0
- data/README.md +50 -0
- data/Rakefile +15 -0
- data/legitbot.gemspec +24 -0
- data/lib/legitbot.rb +9 -0
- data/lib/legitbot/baidu.rb +16 -0
- data/lib/legitbot/bing.rb +16 -0
- data/lib/legitbot/botmatch.rb +58 -0
- data/lib/legitbot/duckduckgo.rb +16 -0
- data/lib/legitbot/google.rb +20 -0
- data/lib/legitbot/legitbot.rb +31 -0
- data/lib/legitbot/version.rb +3 -0
- data/lib/legitbot/yandex.rb +20 -0
- data/test/botmatch_test.rb +24 -0
- data/test/google_test.rb +37 -0
- data/test/legitbot_test.rb +12 -0
- metadata +95 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 394b73c86f4a4aefe5cd96a6483ec5f000aaed04
|
4
|
+
data.tar.gz: 7b0fbc498167e635487b4e659dcd88aa95a7360a
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 82e4a3b94efee99fee6a0cacea416c9596e9d383def2ea5191211ce087f3cf46b5240a87c43d7859a90560eca04c07c3eedebe9b4efb389d9697e28f364f749c
|
7
|
+
data.tar.gz: bd7ff484dc01003c95b6dbd399f4c795480a7d1a194d6c09e98b4d03bc51b26790f6c4b6190ae06c89e0afcc71c50d293b5fb9a8a3a5f078c4d7d87bb70bcc76
|
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
Copyright (C) 2015 Alexander Azarov <self@alaz.me>
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
you may not use this file except in compliance with the License.
|
5
|
+
You may obtain a copy of the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
See the License for the specific language governing permissions and
|
13
|
+
limitations under the License.
|
data/README.md
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
# Legitbot
|
2
|
+
|
3
|
+
Ruby gem to check if an IP really belongs to some bot, typically a search
|
4
|
+
engine. This can be of much help if one wants to protect his/her web site from
|
5
|
+
malicious scanners who pretend to be e.g. a Googlebot.
|
6
|
+
|
7
|
+
## Usage
|
8
|
+
|
9
|
+
Suppose you have a Web request and you'd like to make sure it's not from a fake
|
10
|
+
search engine:
|
11
|
+
|
12
|
+
```ruby
|
13
|
+
bot = Legitbot.bot(userAgent, ip)
|
14
|
+
```
|
15
|
+
|
16
|
+
`bot` will be `nil` if no bot signature was found in the `User-Agent`. Otherwise,
|
17
|
+
it will be an instance with methods
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
bot.detected_as # => "Google"
|
21
|
+
bot.valid? # => true
|
22
|
+
bot.fake? # => false
|
23
|
+
```
|
24
|
+
|
25
|
+
Sometimes you already know what search engine to expect. For example, you may
|
26
|
+
be using [rack-attack](https://github.com/kickstarter/rack-attack):
|
27
|
+
|
28
|
+
```ruby
|
29
|
+
Rack::Attack.blocklist("fake Googlebot") do |req|
|
30
|
+
req.user_agent =~ %r(Googlebot) && Legitbot::Google.fake?(req.ip)
|
31
|
+
end
|
32
|
+
```
|
33
|
+
|
34
|
+
## Issues, problems, plans
|
35
|
+
|
36
|
+
* Rails middleware
|
37
|
+
* Facebook: https://developers.facebook.com/docs/sharing/webmasters/crawler
|
38
|
+
|
39
|
+
## License
|
40
|
+
|
41
|
+
Apache 2.0
|
42
|
+
|
43
|
+
## References
|
44
|
+
|
45
|
+
* I initially created a Play Framework version in Scala: [play-legitbot](https://github.com/osinka/play-legitbot)
|
46
|
+
* Article [When (Fake) Googlebots Attack Your Rails App](http://jessewolgamott.com/blog/2015/11/17/when-fake-googlebots-attack-your-rails-app/)
|
47
|
+
* [Voight-Kampff](https://github.com/biola/Voight-Kampff) is a Ruby gem which
|
48
|
+
detects bots by `User-Agent`
|
49
|
+
* [browser](https://github.com/fnando/browser) is a Ruby gem which may tell
|
50
|
+
you if the request comes from a search engine.
|
data/Rakefile
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
|
4
|
+
Bundler::GemHelper.install_tasks
|
5
|
+
|
6
|
+
require "rake/testtask"
|
7
|
+
|
8
|
+
Rake::TestTask.new do |t|
|
9
|
+
t.libs << "test"
|
10
|
+
t.test_files = FileList['test/*_test.rb']
|
11
|
+
t.warning = true
|
12
|
+
t.verbose = true
|
13
|
+
end
|
14
|
+
|
15
|
+
task default: %w[test]
|
data/legitbot.gemspec
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
$LOAD_PATH.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "legitbot/version"
|
4
|
+
|
5
|
+
Gem::Specification.new do |spec|
|
6
|
+
spec.name = 'legitbot'
|
7
|
+
spec.version = Legitbot::VERSION
|
8
|
+
spec.license = 'Apache-2.0'
|
9
|
+
|
10
|
+
spec.author = "Alexander Azarov"
|
11
|
+
spec.email = "self@alaz.me"
|
12
|
+
spec.homepage = "https://github.com/alaz/legitbot"
|
13
|
+
spec.summary = %q{Validate Web request was made by legitimate search engine}
|
14
|
+
spec.description = "A library to make sure a Web request has been "\
|
15
|
+
"made by a real search engine, not a fake"
|
16
|
+
|
17
|
+
spec.required_ruby_version = '>= 2.0.0'
|
18
|
+
spec.add_development_dependency "rake"
|
19
|
+
spec.add_development_dependency "minitest"
|
20
|
+
|
21
|
+
spec.files = `git ls-files`.split($/)
|
22
|
+
spec.rdoc_options = ["--charset=UTF-8"]
|
23
|
+
spec.test_files = Dir.glob("test/**/*")
|
24
|
+
end
|
data/lib/legitbot.rb
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
require 'resolv'
|
2
|
+
|
3
|
+
require_relative 'legitbot/legitbot'
|
4
|
+
require_relative 'legitbot/botmatch'
|
5
|
+
require_relative 'legitbot/google'
|
6
|
+
require_relative 'legitbot/yandex'
|
7
|
+
require_relative 'legitbot/bing'
|
8
|
+
require_relative 'legitbot/baidu'
|
9
|
+
require_relative 'legitbot/duckduckgo'
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module Legitbot
  # Matcher for the Baidu spider.
  # http://help.baidu.com/question?prod_en=master&class=498&id=1000973
  #
  # The constructor is inherited from BotMatch: new(ip, resolver_config = nil).
  class Baidu < BotMatch
    # Absolute (dot-terminated) parent domains accepted for Baidu crawler hosts.
    ValidDomains = ["baidu.com.", "baidu.jp."].freeze

    # True when the reverse DNS name of the IP lies under one of ValidDomains.
    # Only the reverse (PTR) name is checked; +reverse_resolves?+ is not
    # consulted for this engine.
    def valid?
      subdomain_of?(*Baidu::ValidDomains)
    end
  end

  # Register the User-Agent fragment that selects this matcher.
  rule Legitbot::Baidu, %w(Baiduspider)
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module Legitbot
  # Matcher for Bingbot.
  # https://blogs.bing.com/webmaster/2012/08/31/how-to-verify-that-bingbot-is-bingbot/
  #
  # The constructor is inherited from BotMatch: new(ip, resolver_config = nil).
  class Bing < BotMatch
    # Absolute (dot-terminated) parent domain accepted for Bing crawler hosts.
    ValidDomains = ["search.msn.com."].freeze

    # True when the reverse DNS name of the IP lies under ValidDomains and
    # that name resolves forward to the same IP.
    def valid?
      subdomain_of?(*Bing::ValidDomains) && reverse_resolves?
    end
  end

  # Register the User-Agent fragments that select this matcher.
  rule Legitbot::Bing, %w(Bingbot bingbot)
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
module Legitbot
  ##
  # Represents a bot instance match. Typical methods are
  # +valid?+, +fake?+ and +detected_as+
  #
  # This base class supplies the DNS helpers; +valid?+ itself is expected to
  # be defined by each subclass (BotMatch does not define it).
  class BotMatch
    ##
    # +ip+ is the client address as a String; +resolver_config+ is handed
    # unchanged to Resolv::DNS.new (+nil+ selects the resolver defaults).
    def initialize(ip, resolver_config = nil)
      @dns = Resolv::DNS.new(resolver_config)
      @ip = ip
    end

    ##
    # Returns a Resolv::DNS::Name instance with the reverse name, or +nil+
    # when the reverse lookup fails (no PTR record, or the address cannot be
    # interpreted). The outcome, including a failure, is remembered.
    def reverse_domain
      # defined? rather than ||= so that a nil (failed) lookup is cached too.
      return @reverse_domain if defined?(@reverse_domain)

      @reverse_domain =
        begin
          @dns.getname(@ip)
        rescue Resolv::ResolvError
          nil
        end
    end

    ##
    # Returns a String with the reverse name, or +nil+ when there is none.
    def reverse_name
      name = reverse_domain
      name && name.to_s
    end

    ##
    # Returns a String with the first IP the reverse name resolves to,
    # or +nil+ when there is no reverse name or it has no address.
    def reversed_ip
      first = forward_addresses.first
      first && first.to_s
    end

    ##
    # True when any address of the reverse name equals the original IP
    # (forward-confirmed reverse DNS). A host name may carry several
    # address records, so all of them are compared, not just the first.
    def reverse_resolves?
      forward_addresses.any? { |address| address.to_s == @ip }
    end

    ##
    # True when the reverse name is a subdomain of any of +domains+
    # (absolute, dot-terminated names such as "googlebot.com.").
    # Returns false when the IP has no reverse name at all.
    def subdomain_of?(*domains)
      name = reverse_domain
      return false if name.nil?

      domains.any? { |domain| name.subdomain_of?(Resolv::DNS::Name.create(domain)) }
    end

    ##
    # Short engine name derived from the class name, e.g. "Google".
    def detected_as
      self.class.name.split('::').last
    end

    ##
    # Negation of +valid?+.
    def fake?
      !valid?
    end

    ##
    # Class-level shortcut: builds a matcher for +ip+ and asks +valid?+.
    def self.valid?(ip, resolver_config = nil)
      new(ip, resolver_config).valid?
    end

    ##
    # Class-level shortcut: builds a matcher for +ip+ and asks +fake?+.
    def self.fake?(ip, resolver_config = nil)
      new(ip, resolver_config).fake?
    end

    private

    # Addresses the reverse name resolves to (memoized). An empty Array
    # when there is no reverse name or the forward lookup fails.
    def forward_addresses
      @forward_addresses ||=
        begin
          name = reverse_name
          name.nil? ? [] : @dns.getaddresses(name)
        rescue Resolv::ResolvError
          []
        end
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module Legitbot
  # Matcher for DuckDuckBot.
  # https://duckduckgo.com/duckduckbot
  #
  # The constructor is inherited from BotMatch: new(ip, resolver_config = nil).
  class DuckDuckGo < BotMatch
    # Fixed list of addresses accepted as DuckDuckBot.
    ValidIPs = %w(72.94.249.34 72.94.249.35 72.94.249.36 72.94.249.37 72.94.249.38).freeze

    # True when the IP is one of ValidIPs; no DNS lookup is involved.
    def valid?
      DuckDuckGo::ValidIPs.include?(@ip)
    end
  end

  # Register the User-Agent fragment that selects this matcher.
  rule Legitbot::DuckDuckGo, %w(DuckDuckGo)
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'resolv'
|
2
|
+
|
3
|
+
module Legitbot
  # Matcher for Google crawlers.
  # https://support.google.com/webmasters/answer/1061943
  # https://support.google.com/webmasters/answer/80553
  #
  # The constructor is inherited from BotMatch: new(ip, resolver_config = nil).
  class Google < BotMatch
    # Absolute (dot-terminated) parent domains accepted for Google crawler hosts.
    ValidDomains = ["google.com.", "googlebot.com."].freeze

    # True when the reverse DNS name of the IP lies under one of ValidDomains
    # and that name resolves forward to the same IP.
    def valid?
      subdomain_of?(*Google::ValidDomains) && reverse_resolves?
    end
  end

  # Register the User-Agent fragments that select this matcher.
  rule Legitbot::Google, %w(Googlebot Mediapartners-Google AdsBot-Google)
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Legitbot
  # Registered rules in registration order. Each entry is a Hash with
  # :class (the matcher class) and :fragments (User-Agent substrings).
  @rules = []

  ##
  # Lookup a bot based on its signature from +User-Agent+ header.
  #
  # The first registered rule with a fragment found in +userAgent+ wins, and
  # only that rule's class is instantiated, with +ip+ and +resolver_config+.
  #
  # If a block given, passes the found bot to the block and returns the
  # block's result.
  #
  # Returns +nil+ if no bot found (including when +userAgent+ is +nil+)
  # and a bot match instance otherwise.
  # :yields: a found bot
  #
  def self.bot(userAgent, ip, resolver_config = nil)
    # A request without a User-Agent header cannot carry a bot signature.
    return nil if userAgent.nil?

    matched = @rules.find do |rule|
      rule[:fragments].any? { |fragment| userAgent.index(fragment) }
    end
    bot = matched && matched[:class].new(ip, resolver_config)

    if bot && block_given?
      yield bot
    else
      bot
    end
  end

  ##
  # Registers matcher class +clazz+ for User-Agent strings that contain
  # any of +fragments+.
  def self.rule(clazz, fragments)
    @rules << { :class => clazz, :fragments => fragments }
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Legitbot
  # Matcher for Yandex robots.
  # https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.xml
  #
  # The constructor is inherited from BotMatch: new(ip, resolver_config = nil).
  class Yandex < BotMatch
    # Absolute (dot-terminated) parent domains accepted for Yandex crawler hosts.
    ValidDomains = ["yandex.ru.", "yandex.net.", "yandex.com."].freeze

    # True when the reverse DNS name of the IP lies under one of ValidDomains
    # and that name resolves forward to the same IP.
    def valid?
      subdomain_of?(*Yandex::ValidDomains) && reverse_resolves?
    end
  end

  # Register the User-Agent fragments that select this matcher.
  rule Legitbot::Yandex, %w(YandexBot YandexMobileBot YandexImages YandexVideo
    YandexMedia YandexBlogs YandexFavicons YandexWebmaster YandexPagechecker
    YandexImageResizer YandexSitelinks YandexMetrika YandexDirectDyn YandexRCA
    YaDirectFetcher YandexAntivirus YandexVertis YandexCalendar)
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'minitest/autorun'
|
2
|
+
require 'legitbot'
|
3
|
+
|
4
|
+
# Exercises the DNS helpers of Legitbot::BotMatch and the class-level
# valid?/fake? shortcuts. The IP literals below go through the default
# resolver, so the results depend on the DNS records served at run time.
class BotMatchTest < Minitest::Test
  def test_reverse_name
    match = Legitbot::BotMatch.new "66.249.78.6"
    assert_equal "crawl-66-249-78-6.googlebot.com", match.reverse_name
  end

  def test_reverse_ip
    match = Legitbot::BotMatch.new "66.249.78.6"
    assert_equal "66.249.78.6", match.reversed_ip
  end

  def test_reverse_resolves
    match = Legitbot::BotMatch.new "66.249.78.6"
    assert_equal true, match.reverse_resolves?
  end

  def test_valid_class_syntax
    # The failure message is the second positional argument of assert;
    # a `msg:` keyword would be received as a Hash instead of the String.
    assert Legitbot::Google.valid?("66.249.78.6"), "Valid Googlebot"
    assert Legitbot::Google.fake?("149.210.164.47"), "Fake Googlebot"
  end
end
|
data/test/google_test.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'minitest/autorun'
|
2
|
+
require 'legitbot'
|
3
|
+
|
4
|
+
# Exercises Legitbot::Google directly and through Legitbot.bot. The IP
# literals below go through the default resolver, so the results depend on
# the DNS records served at run time.
#
# Failure messages are passed as the second positional argument of
# assert/refute; a `msg:` keyword would be received as a Hash instead.
class GoogleTest < Minitest::Test
  GOOGLEBOT_UA = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"

  def test_malicious_ip
    ip = "149.210.164.47"
    match = Legitbot::Google.new ip
    reverse_name = match.reverse_name
    refute match.subdomain_of?("googlebot.com."), "#{reverse_name} is not a subdomain of googlebot.com"
    refute match.valid?, "#{ip} is not a real Googlebot IP"
  end

  def test_valid_ip
    ip = "66.249.78.6"
    match = Legitbot::Google.new ip
    reverse_name = match.reverse_name
    assert match.subdomain_of?("googlebot.com."), "#{reverse_name} is a subdomain of googlebot.com"
    assert match.valid?, "#{ip} is a valid Googlebot IP"
  end

  def test_malicious_ua
    bot = Legitbot.bot(GOOGLEBOT_UA, "149.210.164.47")
    assert bot, "Googlebot detected from User-Agent"
    refute bot.valid?, "Not a valid Googlebot"
  end

  def test_valid_ua
    bot = Legitbot.bot(GOOGLEBOT_UA, "66.249.78.6")
    assert bot, "Googlebot detected from User-Agent"
    assert bot.valid?, "Valid Googlebot"
  end

  def test_engine_name
    bot = Legitbot.bot(GOOGLEBOT_UA, "66.249.78.6")
    assert_equal "Google", bot.detected_as
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'minitest/autorun'
|
2
|
+
require 'legitbot'
|
3
|
+
|
4
|
+
# Checks that a User-Agent without any registered bot fragment yields no
# match, both for the return value and for the block form of Legitbot.bot.
class LegitbotTest < Minitest::Test
  def test_rules
    # The failure message is the second positional argument; a `msg:`
    # keyword would be received as a Hash instead of the String.
    refute Legitbot.bot("Firefox", "127.0.0.1"), "Not a bot"

    # The block must never run when no rule matches.
    Legitbot.bot("Firefox", "127.0.0.1") do |_bot|
      flunk "No bot Firefox is possible"
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,95 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: legitbot
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Alexander Azarov
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-12-20 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rake
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: minitest
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
description: A library to make sure a Web request has been made by a real search engine,
|
42
|
+
not a fake
|
43
|
+
email: self@alaz.me
|
44
|
+
executables: []
|
45
|
+
extensions: []
|
46
|
+
extra_rdoc_files: []
|
47
|
+
files:
|
48
|
+
- ".gitignore"
|
49
|
+
- ".travis.yml"
|
50
|
+
- Gemfile
|
51
|
+
- LICENSE.txt
|
52
|
+
- README.md
|
53
|
+
- Rakefile
|
54
|
+
- legitbot.gemspec
|
55
|
+
- lib/legitbot.rb
|
56
|
+
- lib/legitbot/baidu.rb
|
57
|
+
- lib/legitbot/bing.rb
|
58
|
+
- lib/legitbot/botmatch.rb
|
59
|
+
- lib/legitbot/duckduckgo.rb
|
60
|
+
- lib/legitbot/google.rb
|
61
|
+
- lib/legitbot/legitbot.rb
|
62
|
+
- lib/legitbot/version.rb
|
63
|
+
- lib/legitbot/yandex.rb
|
64
|
+
- test/botmatch_test.rb
|
65
|
+
- test/google_test.rb
|
66
|
+
- test/legitbot_test.rb
|
67
|
+
homepage: https://github.com/alaz/legitbot
|
68
|
+
licenses:
|
69
|
+
- Apache-2.0
|
70
|
+
metadata: {}
|
71
|
+
post_install_message:
|
72
|
+
rdoc_options:
|
73
|
+
- "--charset=UTF-8"
|
74
|
+
require_paths:
|
75
|
+
- lib
|
76
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
77
|
+
requirements:
|
78
|
+
- - ">="
|
79
|
+
- !ruby/object:Gem::Version
|
80
|
+
version: 2.0.0
|
81
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
82
|
+
requirements:
|
83
|
+
- - ">="
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '0'
|
86
|
+
requirements: []
|
87
|
+
rubyforge_project:
|
88
|
+
rubygems_version: 2.5.2
|
89
|
+
signing_key:
|
90
|
+
specification_version: 4
|
91
|
+
summary: Validate Web request was made by legitimate search engine
|
92
|
+
test_files:
|
93
|
+
- test/botmatch_test.rb
|
94
|
+
- test/google_test.rb
|
95
|
+
- test/legitbot_test.rb
|