legitbot 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 394b73c86f4a4aefe5cd96a6483ec5f000aaed04
4
+ data.tar.gz: 7b0fbc498167e635487b4e659dcd88aa95a7360a
5
+ SHA512:
6
+ metadata.gz: 82e4a3b94efee99fee6a0cacea416c9596e9d383def2ea5191211ce087f3cf46b5240a87c43d7859a90560eca04c07c3eedebe9b4efb389d9697e28f364f749c
7
+ data.tar.gz: bd7ff484dc01003c95b6dbd399f4c795480a7d1a194d6c09e98b4d03bc51b26790f6c4b6190ae06c89e0afcc71c50d293b5fb9a8a3a5f078c4d7d87bb70bcc76
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ Gemfile.lock
2
+ .bundle
3
+ *.gem
4
+ *.gemfile.lock
data/.travis.yml ADDED
@@ -0,0 +1,5 @@
1
+ sudo: false
2
+ language: ruby
3
+ cache: bundler
4
+ rvm:
5
+ - 2.2
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org'
2
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,13 @@
1
+ Copyright (C) 2015 Alexander Azarov <self@alaz.me>
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
data/README.md ADDED
@@ -0,0 +1,50 @@
1
+ # Legitbot
2
+
3
+ Ruby gem to check if an IP really belongs to some bot, typically a search
4
+ engine. This can be of much help if one wants to protect his/her web site from
5
+ malicious scanners who pretend to be e.g. a Googlebot.
6
+
7
+ ## Usage
8
+
9
+ Suppose you have a Web request and you'd like to make sure it's not from a fake
10
+ search engine:
11
+
12
+ ```ruby
13
+ bot = Legitbot.bot(userAgent, ip)
14
+ ```
15
+
16
+ `bot` will be `nil` if no bot signature was found in the `User-Agent`. Otherwise,
17
+ it will be an instance with the following methods:
18
+
19
+ ```ruby
20
+ bot.detected_as # => "Google"
21
+ bot.valid? # => true
22
+ bot.fake? # => false
23
+ ```
24
+
25
+ Sometimes you already know what search engine to expect. For example, you may
26
+ be using [rack-attack](https://github.com/kickstarter/rack-attack):
27
+
28
+ ```ruby
29
+ Rack::Attack.blocklist("fake Googlebot") do |req|
30
+ req.user_agent =~ %r(Googlebot) && Legitbot::Google.fake?(req.ip)
31
+ end
32
+ ```
33
+
34
+ ## Issues, problems, plans
35
+
36
+ * Rails middleware
37
+ * Facebook: https://developers.facebook.com/docs/sharing/webmasters/crawler
38
+
39
+ ## License
40
+
41
+ Apache 2.0
42
+
43
+ ## References
44
+
45
+ * I initially created a Play Framework version in Scala: [play-legitbot](https://github.com/osinka/play-legitbot)
46
+ * Article [When (Fake) Googlebots Attack Your Rails App](http://jessewolgamott.com/blog/2015/11/17/when-fake-googlebots-attack-your-rails-app/)
47
+ * [Voight-Kampff](https://github.com/biola/Voight-Kampff) is a Ruby gem which
48
+ detects bots by `User-Agent`
49
+ * [browser](https://github.com/fnando/browser) is a Ruby gem which may tell
50
+ you if the request comes from a search engine.
data/Rakefile ADDED
@@ -0,0 +1,15 @@
require 'rubygems'
require 'bundler'

# Register the gem tasks provided by Bundler::GemHelper.
Bundler::GemHelper.install_tasks

require "rake/testtask"

# The test task picks up every test/*_test.rb file and runs it with
# warnings and verbose output switched on.
Rake::TestTask.new do |test_task|
  test_task.libs.push("test")
  test_task.test_files = FileList['test/*_test.rb']
  test_task.warning = true
  test_task.verbose = true
end

# Running plain `rake` runs the tests.
task :default => %w[test]
data/legitbot.gemspec ADDED
@@ -0,0 +1,24 @@
# encoding: utf-8
# Make lib/ loadable so the version constant can be read below.
lib_dir = File.expand_path("../lib", __FILE__)
$LOAD_PATH.push(lib_dir)
require "legitbot/version"

Gem::Specification.new do |gem|
  # Identity
  gem.name = 'legitbot'
  gem.version = Legitbot::VERSION
  gem.license = 'Apache-2.0'

  # Authorship and description
  gem.author = "Alexander Azarov"
  gem.email = "self@alaz.me"
  gem.homepage = "https://github.com/alaz/legitbot"
  gem.summary = %q{Validate Web request was made by legitimate search engine}
  gem.description = "A library to make sure a Web request has been "\
    "made by a real search engine, not a fake"

  # Runtime requirement and development-only dependencies
  gem.required_ruby_version = '>= 2.0.0'
  gem.add_development_dependency "rake"
  gem.add_development_dependency "minitest"

  # Packaged files are whatever git tracks; tests are everything under test/.
  gem.files = `git ls-files`.split($/)
  gem.rdoc_options = ["--charset=UTF-8"]
  gem.test_files = Dir.glob("test/**/*")
end
data/lib/legitbot.rb ADDED
@@ -0,0 +1,9 @@
1
+ require 'resolv'
2
+
3
+ require_relative 'legitbot/legitbot'
4
+ require_relative 'legitbot/botmatch'
5
+ require_relative 'legitbot/google'
6
+ require_relative 'legitbot/yandex'
7
+ require_relative 'legitbot/bing'
8
+ require_relative 'legitbot/baidu'
9
+ require_relative 'legitbot/duckduckgo'
@@ -0,0 +1,16 @@
module Legitbot
  ##
  # Validator for requests claiming to come from Baidu's crawler.
  #
  # Reference: http://help.baidu.com/question?prod_en=master&class=498&id=1000973
  #
  # The constructor is inherited from BotMatch (+ip+, optional
  # +resolver_config+).
  class Baidu < BotMatch
    # Domains the reverse DNS name of a genuine crawler must belong to.
    ValidDomains = ["baidu.com.", "baidu.jp."].freeze

    ##
    # Returns true when the reverse DNS name of the IP is a subdomain of
    # one of +ValidDomains+.
    #
    # NOTE(review): unlike Bing, Google and Yandex this check does not also
    # require +reverse_resolves?+. Presumably that is deliberate for Baidu
    # host names -- confirm before tightening it.
    def valid?
      subdomain_of?(*ValidDomains)
    end
  end

  # Requests whose User-Agent contains this fragment are checked as Baidu.
  rule Legitbot::Baidu, %w(Baiduspider)
end
@@ -0,0 +1,16 @@
module Legitbot
  ##
  # Validator for requests claiming to come from Bing's crawler.
  #
  # Reference: https://blogs.bing.com/webmaster/2012/08/31/how-to-verify-that-bingbot-is-bingbot/
  #
  # The constructor is inherited from BotMatch (+ip+, optional
  # +resolver_config+).
  class Bing < BotMatch
    # Domains the reverse DNS name of a genuine crawler must belong to.
    ValidDomains = ["search.msn.com."].freeze

    ##
    # Returns true when the reverse DNS name of the IP is a subdomain of
    # one of +ValidDomains+ and that name resolves back to the same IP.
    def valid?
      subdomain_of?(*ValidDomains) && reverse_resolves?
    end
  end

  # Requests whose User-Agent contains one of these fragments are checked as Bing.
  rule Legitbot::Bing, %w(Bingbot bingbot)
end
@@ -0,0 +1,58 @@
module Legitbot
  ##
  # Represents a bot instance match. Typical methods are
  # +valid?+, +fake?+ and +detected_as+.
  #
  # This class does not define +valid?+ itself; +fake?+ and the class-level
  # helpers call it, so subclasses are expected to provide it.
  #
  class BotMatch
    ##
    # +ip+ is the address to verify (compared as a String against the
    # forward lookup result). +resolver_config+ is handed to
    # Resolv::DNS.new unchanged.
    def initialize(ip, resolver_config = nil)
      @dns = Resolv::DNS.new(resolver_config)
      @ip = ip
    end

    ##
    # Returns a Resolv::DNS::Name instance with the reverse name, or +nil+
    # when the reverse lookup fails (e.g. the IP has no PTR record).
    # The result, including a failed lookup, is memoized.
    def reverse_domain
      return @reverse_domain if defined?(@reverse_domain)

      @reverse_domain = lookup { @dns.getname(@ip) }
    end

    ##
    # Returns a String with the reverse name; empty when there is none.
    def reverse_name
      reverse_domain.to_s
    end

    ##
    # Returns a String with the IP obtained by resolving the reverse name,
    # or +nil+ when either the reverse or the forward lookup fails.
    def reversed_ip
      return @reversed_ip if defined?(@reversed_ip)

      name = reverse_domain
      address = name && lookup { @dns.getaddress(name.to_s) }
      @reversed_ip = address && address.to_s
    end

    ##
    # True when the reverse name resolves back to the original IP.
    # A failed lookup counts as "does not resolve".
    def reverse_resolves?
      reversed_ip == @ip
    end

    ##
    # True when the reverse name is a subdomain of any of +domains+
    # (absolute names such as "googlebot.com."). False when the IP has
    # no reverse name.
    def subdomain_of?(*domains)
      name = reverse_domain
      return false if name.nil?

      domains.any? { |d| name.subdomain_of?(Resolv::DNS::Name.create(d)) }
    end

    ##
    # Returns the unqualified class name, e.g. "Google" for Legitbot::Google.
    def detected_as
      self.class.name.split('::').last
    end

    ##
    # The opposite of +valid?+.
    def fake?
      !valid?
    end

    ##
    # Shortcut for +new(ip, resolver_config).valid?+
    def self.valid?(ip, resolver_config = nil)
      new(ip, resolver_config).valid?
    end

    ##
    # Shortcut for +new(ip, resolver_config).fake?+
    def self.fake?(ip, resolver_config = nil)
      new(ip, resolver_config).fake?
    end

    private

    # Runs a DNS query block and maps Resolv::ResolvError to +nil+, so that
    # an unresolvable address is reported as a fake instead of raising.
    def lookup
      yield
    rescue Resolv::ResolvError
      nil
    end
  end
end
@@ -0,0 +1,16 @@
module Legitbot
  ##
  # Validator for requests claiming to come from DuckDuckGo's crawler.
  #
  # Reference: https://duckduckgo.com/duckduckbot
  #
  # The constructor is inherited from BotMatch (+ip+, optional
  # +resolver_config+).
  class DuckDuckGo < BotMatch
    # The fixed list of addresses accepted as the crawler.
    ValidIPs = %w(72.94.249.34 72.94.249.35 72.94.249.36 72.94.249.37 72.94.249.38).freeze

    ##
    # Returns true when the IP is one of +ValidIPs+. No DNS lookup is made.
    def valid?
      ValidIPs.include?(@ip)
    end
  end

  # Requests whose User-Agent contains this fragment are checked as DuckDuckGo.
  rule Legitbot::DuckDuckGo, %w(DuckDuckGo)
end
@@ -0,0 +1,20 @@
require 'resolv'

module Legitbot
  ##
  # Validator for requests claiming to come from Google's crawlers.
  #
  # References:
  # https://support.google.com/webmasters/answer/1061943
  # https://support.google.com/webmasters/answer/80553
  #
  # The constructor is inherited from BotMatch (+ip+, optional
  # +resolver_config+).
  class Google < BotMatch
    # Domains the reverse DNS name of a genuine crawler must belong to.
    ValidDomains = ["google.com.", "googlebot.com."].freeze

    ##
    # Returns true when the reverse DNS name of the IP is a subdomain of
    # one of +ValidDomains+ and that name resolves back to the same IP.
    def valid?
      subdomain_of?(*ValidDomains) && reverse_resolves?
    end
  end

  # Requests whose User-Agent contains one of these fragments are checked as Google.
  rule Legitbot::Google, %w(Googlebot Mediapartners-Google AdsBot-Google)
end
@@ -0,0 +1,31 @@
module Legitbot
  # Registered rules, in registration order. Each entry is a Hash with
  # :class (the bot match class) and :fragments (User-Agent substrings).
  @rules = []

  ##
  # Lookup a bot based on its signature from +User-Agent+ header.
  #
  # The first registered rule with a fragment found in +userAgent+ wins,
  # and only that rule's class is instantiated, with +ip+ and
  # +resolver_config+. A +nil+ +userAgent+ (header absent) matches no rule.
  #
  # If a block given, passes the found bot to the block and returns the
  # block's result.
  #
  # Returns +nil+ if no bot found and a bot match instance
  # otherwise.
  # :yields: a found bot
  #
  def self.bot(userAgent, ip, resolver_config = nil)
    agent = userAgent.to_s
    matched = @rules.find { |rule| rule[:fragments].any? { |f| agent.index(f) } }
    return nil unless matched

    bot = matched[:class].new(ip, resolver_config)
    block_given? ? yield(bot) : bot
  end

  ##
  # Registers +clazz+ as the bot match class for any User-Agent containing
  # one of +fragments+. Returns the list of rules.
  def self.rule(clazz, fragments)
    @rules << {:class => clazz, :fragments => fragments}
  end
end
@@ -0,0 +1,3 @@
module Legitbot
  # Gem version; legitbot.gemspec reads it for +spec.version+.
  VERSION = '0.0.1'.freeze
end
@@ -0,0 +1,20 @@
module Legitbot
  ##
  # Validator for requests claiming to come from Yandex's crawlers.
  #
  # Reference: https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.xml
  #
  # The constructor is inherited from BotMatch (+ip+, optional
  # +resolver_config+).
  class Yandex < BotMatch
    # Domains the reverse DNS name of a genuine crawler must belong to.
    ValidDomains = ["yandex.ru.", "yandex.net.", "yandex.com."].freeze

    ##
    # Returns true when the reverse DNS name of the IP is a subdomain of
    # one of +ValidDomains+ and that name resolves back to the same IP.
    def valid?
      subdomain_of?(*ValidDomains) && reverse_resolves?
    end
  end

  # Requests whose User-Agent contains one of these fragments are checked as Yandex.
  rule Legitbot::Yandex, %w(YandexBot YandexMobileBot YandexImages YandexVideo
    YandexMedia YandexBlogs YandexFavicons YandexWebmaster YandexPagechecker
    YandexImageResizer YandexSitelinks YandexMetrika YandexDirectDyn YandexRCA
    YaDirectFetcher YandexAntivirus YandexVertis YandexCalendar)
end
@@ -0,0 +1,24 @@
require 'minitest/autorun'
require 'legitbot'

# Exercises the DNS helpers of Legitbot::BotMatch and the class-level
# +valid?+ / +fake?+ shortcuts. No resolver configuration is passed, so
# the lookups go through the default Resolv::DNS setup.
class BotMatchTest < Minitest::Test
  def test_reverse_name
    match = Legitbot::BotMatch.new "66.249.78.6"
    assert_equal "crawl-66-249-78-6.googlebot.com", match.reverse_name
  end

  def test_reverse_ip
    match = Legitbot::BotMatch.new "66.249.78.6"
    assert_equal "66.249.78.6", match.reversed_ip
  end

  def test_reverse_resolves
    match = Legitbot::BotMatch.new "66.249.78.6"
    assert_equal true, match.reverse_resolves?
  end

  def test_valid_class_syntax
    # Minitest takes the failure message as a positional argument.
    assert Legitbot::Google.valid?("66.249.78.6"), "Valid Googlebot"
    assert Legitbot::Google.fake?("149.210.164.47"), "Fake Googlebot"
  end
end
@@ -0,0 +1,37 @@
require 'minitest/autorun'
require 'legitbot'

# Checks Legitbot::Google against one address expected to be a real
# Googlebot and one expected to be an impostor, both directly and via
# User-Agent detection. No resolver configuration is passed, so the
# lookups go through the default Resolv::DNS setup.
class GoogleTest < Minitest::Test
  GOOGLEBOT_UA = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"

  def test_malicious_ip
    ip = "149.210.164.47"
    match = Legitbot::Google.new ip
    reverse_name = match.reverse_name
    # Failure messages are positional arguments in Minitest.
    refute match.subdomain_of?("googlebot.com."), "#{reverse_name} is not a subdomain of googlebot.com"
    refute match.valid?, "#{ip} is not a real Googlebot IP"
  end

  def test_valid_ip
    ip = "66.249.78.6"
    match = Legitbot::Google.new ip
    reverse_name = match.reverse_name
    assert match.subdomain_of?("googlebot.com."), "#{reverse_name} is a subdomain of googlebot.com"
    assert match.valid?, "#{ip} is a valid Googlebot IP"
  end

  def test_malicious_ua
    bot = Legitbot.bot(GOOGLEBOT_UA, "149.210.164.47")
    assert bot, "Googlebot detected from User-Agent"
    refute bot.valid?, "Not a valid Googlebot"
  end

  def test_valid_ua
    bot = Legitbot.bot(GOOGLEBOT_UA, "66.249.78.6")
    assert bot, "Googlebot detected from User-Agent"
    assert bot.valid?, "Valid Googlebot"
  end

  def test_engine_name
    bot = Legitbot.bot(GOOGLEBOT_UA, "66.249.78.6")
    assert_equal "Google", bot.detected_as
  end
end
@@ -0,0 +1,12 @@
require 'minitest/autorun'
require 'legitbot'

# Checks that a User-Agent without any registered bot fragment yields no
# bot and never invokes the block.
class LegitbotTest < Minitest::Test
  def test_rules
    # Minitest takes the failure message as a positional argument.
    refute Legitbot.bot("Firefox", "127.0.0.1"), "Not a bot"

    Legitbot.bot("Firefox", "127.0.0.1") do |_bot|
      flunk "No bot Firefox is possible"
    end
  end
end
metadata ADDED
@@ -0,0 +1,95 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: legitbot
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Alexander Azarov
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-12-20 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rake
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: minitest
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: A library to make sure a Web request has been made by a real search engine,
42
+ not a fake
43
+ email: self@alaz.me
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - ".gitignore"
49
+ - ".travis.yml"
50
+ - Gemfile
51
+ - LICENSE.txt
52
+ - README.md
53
+ - Rakefile
54
+ - legitbot.gemspec
55
+ - lib/legitbot.rb
56
+ - lib/legitbot/baidu.rb
57
+ - lib/legitbot/bing.rb
58
+ - lib/legitbot/botmatch.rb
59
+ - lib/legitbot/duckduckgo.rb
60
+ - lib/legitbot/google.rb
61
+ - lib/legitbot/legitbot.rb
62
+ - lib/legitbot/version.rb
63
+ - lib/legitbot/yandex.rb
64
+ - test/botmatch_test.rb
65
+ - test/google_test.rb
66
+ - test/legitbot_test.rb
67
+ homepage: https://github.com/alaz/legitbot
68
+ licenses:
69
+ - Apache-2.0
70
+ metadata: {}
71
+ post_install_message:
72
+ rdoc_options:
73
+ - "--charset=UTF-8"
74
+ require_paths:
75
+ - lib
76
+ required_ruby_version: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - ">="
79
+ - !ruby/object:Gem::Version
80
+ version: 2.0.0
81
+ required_rubygems_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ requirements: []
87
+ rubyforge_project:
88
+ rubygems_version: 2.5.2
89
+ signing_key:
90
+ specification_version: 4
91
+ summary: Validate Web request was made by legitimate search engine
92
+ test_files:
93
+ - test/botmatch_test.rb
94
+ - test/google_test.rb
95
+ - test/legitbot_test.rb