legitbot 0.3.2 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/build.yml +60 -0
- data/.gitignore +1 -0
- data/.rubocop.yml +2 -0
- data/.ruby-version +1 -0
- data/Gemfile +2 -0
- data/README.md +3 -1
- data/Rakefile +5 -3
- data/legitbot.gemspec +19 -18
- data/lib/legitbot.rb +4 -0
- data/lib/legitbot/ahrefs.rb +13 -8
- data/lib/legitbot/apple.rb +11 -11
- data/lib/legitbot/baidu.rb +5 -7
- data/lib/legitbot/bing.rb +5 -7
- data/lib/legitbot/botmatch.rb +17 -44
- data/lib/legitbot/config/resolver.rb +18 -0
- data/lib/legitbot/duckduckgo.rb +18 -7
- data/lib/legitbot/facebook.rb +8 -34
- data/lib/legitbot/google.rb +5 -8
- data/lib/legitbot/legitbot.rb +14 -9
- data/lib/legitbot/oracle.rb +10 -0
- data/lib/legitbot/pinterest.rb +5 -8
- data/lib/legitbot/twitter.rb +14 -0
- data/lib/legitbot/validators/domains.rb +71 -0
- data/lib/legitbot/validators/ip_ranges.rb +81 -0
- data/lib/legitbot/version.rb +3 -1
- data/lib/legitbot/yandex.rb +41 -12
- data/test/ahrefs_test.rb +16 -8
- data/test/apple_as_google_test.rb +9 -4
- data/test/apple_test.rb +11 -4
- data/test/botmatch_test.rb +4 -22
- data/test/facebook_test.rb +25 -10
- data/test/google_test.rb +24 -14
- data/test/legitbot/validators/domains_test.rb +58 -0
- data/test/legitbot/validators/ip_ranges_test.rb +113 -0
- data/test/legitbot_test.rb +8 -4
- data/test/oracle_test.rb +36 -0
- data/test/pinterest_test.rb +26 -14
- data/test/twitter_test.rb +36 -0
- metadata +87 -23
- data/.travis.yml +0 -12
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 34d1432c7c405d783b22a46851db88ccdea9f303defeccdd1cf98604bbb6ce09
|
4
|
+
data.tar.gz: a66b586f4b2dca67fb875ea37add6e7d89a7ce5d0705c3d1898d96ecf091036e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cad1db2571e939020f74e871365c4748dd78ff18eb8ad9f005ea5bf5b0707835e296afa2601fc6309c994f69b7903d21da788fc219f5d712ee75e1ae9885fb7b
|
7
|
+
data.tar.gz: c72af598d60c55aff35a1b5e244dcde160a67589c588d68b9482d5e5c5f0590441c92505ec94beb462ff70a730ec9aabe80877a2d9db2f72c566c3a9c0b19059
|
@@ -0,0 +1,60 @@
|
|
1
|
+
name: build
|
2
|
+
|
3
|
+
on: [push]
|
4
|
+
|
5
|
+
jobs:
|
6
|
+
test:
|
7
|
+
runs-on: ubuntu-latest
|
8
|
+
|
9
|
+
strategy:
|
10
|
+
fail-fast: false
|
11
|
+
matrix:
|
12
|
+
ruby: [ jruby, 2.6 ]
|
13
|
+
|
14
|
+
steps:
|
15
|
+
- uses: actions/checkout@v2
|
16
|
+
- name: Set up Ruby
|
17
|
+
uses: ruby/setup-ruby@v1
|
18
|
+
with:
|
19
|
+
ruby-version: ${{ matrix.ruby }}
|
20
|
+
- name: Cache dependencies
|
21
|
+
uses: actions/cache@v1
|
22
|
+
with:
|
23
|
+
path: vendor/bundle
|
24
|
+
key: ${{ runner.os }}-${{ matrix.ruby }}-gems-${{ hashFiles('**/Gemfile.lock') }}
|
25
|
+
restore-keys: |
|
26
|
+
${{ runner.os }}-${{ matrix.ruby }}-gems-
|
27
|
+
- name: Install dependencies
|
28
|
+
run: |
|
29
|
+
bundle config path vendor/bundle
|
30
|
+
bundle install --jobs 4 --retry 3
|
31
|
+
- name: Run tests
|
32
|
+
run: bundle exec rake test
|
33
|
+
|
34
|
+
lint:
|
35
|
+
needs: test
|
36
|
+
runs-on: ubuntu-latest
|
37
|
+
|
38
|
+
strategy:
|
39
|
+
matrix:
|
40
|
+
ruby: [ 2.6 ]
|
41
|
+
|
42
|
+
steps:
|
43
|
+
- uses: actions/checkout@v2
|
44
|
+
- name: Set up Ruby
|
45
|
+
uses: ruby/setup-ruby@v1
|
46
|
+
with:
|
47
|
+
ruby-version: ${{ matrix.ruby }}
|
48
|
+
- name: Cache dependencies
|
49
|
+
uses: actions/cache@v1
|
50
|
+
with:
|
51
|
+
path: vendor/bundle
|
52
|
+
key: ${{ runner.os }}-${{ matrix.ruby }}-gems-${{ hashFiles('**/Gemfile.lock') }}
|
53
|
+
restore-keys: |
|
54
|
+
${{ runner.os }}-${{ matrix.ruby }}-gems-
|
55
|
+
- name: Install dependencies
|
56
|
+
run: |
|
57
|
+
bundle config path vendor/bundle
|
58
|
+
bundle install --jobs 4 --retry 3
|
59
|
+
- name: Run linter
|
60
|
+
run: bundle exec rubocop
|
data/.gitignore
CHANGED
data/.rubocop.yml
ADDED
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
2.4
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Legitbot
|
1
|
+
# Legitbot ![](https://github.com/alaz/legitbot/workflows/build/badge.svg) [![Gem Version](https://badge.fury.io/rb/legitbot.svg)](https://badge.fury.io/rb/legitbot)
|
2
2
|
|
3
3
|
Ruby gem to check that an IP belongs to a bot, typically a search
|
4
4
|
engine. This can be of help in protecting a web site from fake search
|
@@ -50,7 +50,9 @@ end
|
|
50
50
|
* [DuckDuckGo bot](https://duckduckgo.com/duckduckbot)
|
51
51
|
* [Facebook crawler](https://developers.facebook.com/docs/sharing/webmasters/crawler)
|
52
52
|
* [Google crawlers](https://support.google.com/webmasters/answer/1061943)
|
53
|
+
* [Oracle Data Cloud Crawler](https://www.oracle.com/corporate/acquisitions/grapeshot/crawler.html)
|
53
54
|
* [Pinterest](https://help.pinterest.com/en/articles/about-pinterest-crawler-0)
|
55
|
+
* [Twitterbot](https://developer.twitter.com/en/docs/tweets/optimize-with-cards/guides/getting-started), the list of IPs is in the [Troubleshooting page](https://developer.twitter.com/en/docs/tweets/optimize-with-cards/guides/troubleshooting-cards)
|
54
56
|
* [Yandex robots](https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.xml)
|
55
57
|
|
56
58
|
## License
|
data/Rakefile
CHANGED
@@ -1,14 +1,16 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'rubygems'
|
2
4
|
require 'bundler'
|
3
5
|
require 'bump/tasks'
|
4
|
-
require
|
6
|
+
require 'rake/testtask'
|
5
7
|
Bundler::GemHelper.install_tasks
|
6
8
|
|
7
9
|
Bump.tag_by_default = true
|
8
10
|
|
9
11
|
Rake::TestTask.new do |t|
|
10
|
-
t.libs <<
|
11
|
-
t.test_files = FileList['test
|
12
|
+
t.libs << 'test'
|
13
|
+
t.test_files = FileList['test/**/*_test.rb']
|
12
14
|
t.warning = true
|
13
15
|
t.verbose = true
|
14
16
|
end
|
data/legitbot.gemspec
CHANGED
@@ -1,27 +1,28 @@
|
|
1
|
-
#
|
2
|
-
|
3
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
$LOAD_PATH.push File.expand_path('lib', __dir__)
|
4
|
+
require 'legitbot/version'
|
4
5
|
|
5
6
|
Gem::Specification.new do |spec|
|
6
7
|
spec.name = 'legitbot'
|
7
8
|
spec.version = Legitbot::VERSION
|
8
9
|
spec.license = 'Apache-2.0'
|
9
10
|
|
10
|
-
spec.author =
|
11
|
-
spec.email =
|
12
|
-
spec.homepage =
|
13
|
-
spec.summary =
|
14
|
-
spec.description =
|
15
|
-
"made by a real search engine, not a malicious agent"
|
11
|
+
spec.author = 'Alexander Azarov'
|
12
|
+
spec.email = 'self@alaz.me'
|
13
|
+
spec.homepage = 'https://github.com/alaz/legitbot'
|
14
|
+
spec.summary = 'Validate requests from Web crawlers: impersonating or not?'
|
15
|
+
spec.description = 'Does Web request come from a real search engine or from an impersonating agent?'
|
16
16
|
|
17
|
-
spec.required_ruby_version = '>= 2.
|
18
|
-
spec.add_dependency
|
19
|
-
spec.add_dependency
|
20
|
-
spec.add_development_dependency
|
21
|
-
spec.add_development_dependency
|
22
|
-
spec.add_development_dependency
|
17
|
+
spec.required_ruby_version = '>= 2.4.0'
|
18
|
+
spec.add_dependency 'augmented_interval_tree', '~> 0.1', '>= 0.1.1'
|
19
|
+
spec.add_dependency 'irrc', '~> 0.2', '>= 0.2.1'
|
20
|
+
spec.add_development_dependency 'bump', '~> 0.8', '>= 0.8.0'
|
21
|
+
spec.add_development_dependency 'minitest', '~> 5.1', '>= 5.1.0'
|
22
|
+
spec.add_development_dependency 'rake', '~> 12.3', '>= 12.3.0'
|
23
|
+
spec.add_development_dependency 'rubocop', '~> 0.74', '>= 0.74.0'
|
23
24
|
|
24
|
-
spec.files = `git ls-files`.split(
|
25
|
-
spec.rdoc_options = [
|
26
|
-
spec.test_files = Dir.glob(
|
25
|
+
spec.files = `git ls-files`.split($INPUT_RECORD_SEPARATOR)
|
26
|
+
spec.rdoc_options = ['--charset=UTF-8']
|
27
|
+
spec.test_files = Dir.glob('test/**/*')
|
27
28
|
end
|
data/lib/legitbot.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require_relative 'legitbot/legitbot'
|
2
4
|
require_relative 'legitbot/botmatch'
|
3
5
|
|
@@ -8,5 +10,7 @@ require_relative 'legitbot/bing'
|
|
8
10
|
require_relative 'legitbot/duckduckgo'
|
9
11
|
require_relative 'legitbot/facebook'
|
10
12
|
require_relative 'legitbot/google'
|
13
|
+
require_relative 'legitbot/oracle'
|
11
14
|
require_relative 'legitbot/pinterest'
|
15
|
+
require_relative 'legitbot/twitter'
|
12
16
|
require_relative 'legitbot/yandex'
|
data/lib/legitbot/ahrefs.rb
CHANGED
@@ -1,13 +1,18 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Legitbot # :nodoc:
|
2
4
|
# https://ahrefs.com/robot
|
3
5
|
class Ahrefs < BotMatch
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
6
|
+
ip_ranges %w[
|
7
|
+
54.36.148.0/24
|
8
|
+
54.36.149.0/24
|
9
|
+
54.36.150.0/24
|
10
|
+
195.154.122.0/24
|
11
|
+
195.154.123.0/24
|
12
|
+
195.154.126.0/24
|
13
|
+
195.154.127.0/24
|
14
|
+
]
|
10
15
|
end
|
11
16
|
|
12
|
-
rule Legitbot::Ahrefs, %w
|
17
|
+
rule Legitbot::Ahrefs, %w[AhrefsBot]
|
13
18
|
end
|
data/lib/legitbot/apple.rb
CHANGED
@@ -1,20 +1,20 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'ipaddr'
|
2
4
|
|
3
|
-
module Legitbot
|
5
|
+
module Legitbot # :nodoc:
|
4
6
|
# https://support.apple.com/en-us/HT204683
|
5
|
-
|
6
7
|
class Apple < BotMatch
|
7
|
-
|
8
|
-
|
9
|
-
def valid?
|
10
|
-
ip = IPAddr.new @ip
|
11
|
-
Range.include? ip
|
12
|
-
end
|
8
|
+
ip_ranges '17.0.0.0/8'
|
13
9
|
end
|
14
10
|
|
15
|
-
|
11
|
+
# https://support.apple.com/en-us/HT204683
|
12
|
+
# rubocop:disable Naming/ClassAndModuleCamelCase
|
13
|
+
class Apple_as_Google < BotMatch
|
14
|
+
ip_ranges '17.0.0.0/8'
|
16
15
|
end
|
16
|
+
# rubocop:enable Naming/ClassAndModuleCamelCase
|
17
17
|
|
18
|
-
rule Legitbot::Apple, %w
|
19
|
-
rule Legitbot::Apple_as_Google, %w
|
18
|
+
rule Legitbot::Apple, %w[Applebot]
|
19
|
+
rule Legitbot::Apple_as_Google, %w[Googlebot]
|
20
20
|
end
|
data/lib/legitbot/baidu.rb
CHANGED
@@ -1,12 +1,10 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Legitbot # :nodoc:
|
2
4
|
# http://help.baidu.com/question?prod_en=master&class=498&id=1000973
|
3
5
|
class Baidu < BotMatch
|
4
|
-
|
5
|
-
|
6
|
-
def valid?
|
7
|
-
subdomain_of?(*Baidu::ValidDomains)
|
8
|
-
end
|
6
|
+
domains 'baidu.com.', 'baidu.jp.', reverse: false
|
9
7
|
end
|
10
8
|
|
11
|
-
rule Legitbot::Baidu, %w
|
9
|
+
rule Legitbot::Baidu, %w[Baiduspider]
|
12
10
|
end
|
data/lib/legitbot/bing.rb
CHANGED
@@ -1,12 +1,10 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Legitbot # :nodoc:
|
2
4
|
# https://blogs.bing.com/webmaster/2012/08/31/how-to-verify-that-bingbot-is-bingbot/
|
3
5
|
class Bing < BotMatch
|
4
|
-
|
5
|
-
|
6
|
-
def valid?
|
7
|
-
subdomain_of?(*Bing::ValidDomains) && reverse_resolves?
|
8
|
-
end
|
6
|
+
domains 'search.msn.com.'
|
9
7
|
end
|
10
8
|
|
11
|
-
rule Legitbot::Bing, %w
|
9
|
+
rule Legitbot::Bing, %w[Bingbot bingbot]
|
12
10
|
end
|
data/lib/legitbot/botmatch.rb
CHANGED
@@ -1,5 +1,8 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'config/resolver'
|
4
|
+
require_relative 'validators/domains'
|
5
|
+
require_relative 'validators/ip_ranges'
|
3
6
|
|
4
7
|
module Legitbot
|
5
8
|
##
|
@@ -7,61 +10,31 @@ module Legitbot
|
|
7
10
|
# +valid?+, +fake?+ and +detected_as+
|
8
11
|
#
|
9
12
|
class BotMatch
|
10
|
-
|
11
|
-
|
12
|
-
@ip = ip
|
13
|
-
end
|
14
|
-
|
15
|
-
##
|
16
|
-
# Returns a Resolv::DNS::Name instance with
|
17
|
-
# the reverse name
|
18
|
-
def reverse_domain
|
19
|
-
@reverse_domain ||= @dns.getname(@ip)
|
20
|
-
rescue Resolv::ResolvError
|
21
|
-
@reverse_domain ||= nil
|
22
|
-
end
|
23
|
-
|
24
|
-
##
|
25
|
-
# Returns a String with the reverse name
|
26
|
-
def reverse_name
|
27
|
-
reverse_domain&.to_s
|
28
|
-
end
|
29
|
-
|
30
|
-
##
|
31
|
-
# Returns a String with IP created from the reverse name
|
32
|
-
def reversed_ip
|
33
|
-
return nil if reverse_name.nil?
|
13
|
+
include Legitbot::Validators::IpRanges
|
14
|
+
include Legitbot::Validators::Domains
|
34
15
|
|
35
|
-
|
36
|
-
@
|
37
|
-
end
|
38
|
-
|
39
|
-
def reverse_resolves?
|
40
|
-
@ip == reversed_ip
|
41
|
-
end
|
42
|
-
|
43
|
-
def subdomain_of?(*domains)
|
44
|
-
return false if reverse_name.nil?
|
45
|
-
|
46
|
-
domains.any? { |d|
|
47
|
-
reverse_domain.subdomain_of? Resolv::DNS::Name.create(d)
|
48
|
-
}
|
16
|
+
def initialize(ip)
|
17
|
+
@ip = ip
|
49
18
|
end
|
50
19
|
|
51
20
|
def detected_as
|
52
21
|
self.class.name.split('::').last.downcase.to_sym
|
53
22
|
end
|
54
23
|
|
24
|
+
def valid?
|
25
|
+
valid_ip? && valid_domain?
|
26
|
+
end
|
27
|
+
|
55
28
|
def fake?
|
56
29
|
!valid?
|
57
30
|
end
|
58
31
|
|
59
|
-
def self.valid?(ip
|
60
|
-
|
32
|
+
def self.valid?(ip)
|
33
|
+
new(ip).valid?
|
61
34
|
end
|
62
35
|
|
63
|
-
def self.fake?(ip
|
64
|
-
|
36
|
+
def self.fake?(ip)
|
37
|
+
new(ip).fake?
|
65
38
|
end
|
66
39
|
end
|
67
40
|
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'resolv'
|
4
|
+
|
5
|
+
module Legitbot
|
6
|
+
module Config
|
7
|
+
module Resolver # :nodoc:
|
8
|
+
def resolver_config(options = nil)
|
9
|
+
@resolver_config = options
|
10
|
+
end
|
11
|
+
|
12
|
+
def resolver
|
13
|
+
@resolver_config ||= Legitbot.resolver_config
|
14
|
+
@resolver ||= Resolv::DNS.new @resolver_config
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
data/lib/legitbot/duckduckgo.rb
CHANGED
@@ -1,12 +1,23 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Legitbot # :nodoc:
|
2
4
|
# https://duckduckgo.com/duckduckbot
|
3
5
|
class DuckDuckGo < BotMatch
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
6
|
+
ip_ranges %w[
|
7
|
+
23.21.227.69
|
8
|
+
40.88.21.235
|
9
|
+
50.16.241.113
|
10
|
+
50.16.241.114
|
11
|
+
50.16.241.117
|
12
|
+
50.16.247.234
|
13
|
+
52.204.97.54
|
14
|
+
52.5.190.19
|
15
|
+
54.197.234.188
|
16
|
+
54.208.100.253
|
17
|
+
54.208.102.37
|
18
|
+
107.21.1.8
|
19
|
+
]
|
9
20
|
end
|
10
21
|
|
11
|
-
rule Legitbot::DuckDuckGo, %w
|
22
|
+
rule Legitbot::DuckDuckGo, %w[DuckDuckBot]
|
12
23
|
end
|
data/lib/legitbot/facebook.rb
CHANGED
@@ -1,48 +1,22 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
2
3
|
require 'irrc'
|
3
|
-
require 'interval_tree'
|
4
4
|
|
5
|
-
module Legitbot
|
5
|
+
module Legitbot # :nodoc:
|
6
6
|
# https://developers.facebook.com/docs/sharing/webmasters/crawler
|
7
|
-
|
8
7
|
class Facebook < BotMatch
|
9
8
|
AS = 'AS32934'
|
10
9
|
|
11
|
-
|
12
|
-
ip = IPAddr.new(@ip)
|
13
|
-
Facebook.valid_ips[ip.ipv4? ? :ipv4 : :ipv6].search(ip.to_i).size > 0
|
14
|
-
end
|
15
|
-
|
16
|
-
@mutex = Mutex.new
|
17
|
-
|
18
|
-
def self.valid_ips
|
19
|
-
@mutex.synchronize { @ips ||= load_ips }
|
20
|
-
end
|
21
|
-
|
22
|
-
def self.reload!
|
23
|
-
@mutex.synchronize { @ips = load_ips }
|
24
|
-
end
|
25
|
-
|
26
|
-
def self.load_ips
|
27
|
-
whois.map do |(family, records)|
|
28
|
-
ranges = records.map do |cidr|
|
29
|
-
range = IPAddr.new(cidr).to_range
|
30
|
-
(range.begin.to_i..range.end.to_i)
|
31
|
-
end
|
32
|
-
[family, IntervalTree::Tree.new(ranges)]
|
33
|
-
end.to_h
|
34
|
-
end
|
35
|
-
|
36
|
-
def self.whois
|
10
|
+
ip_ranges do
|
37
11
|
client = Irrc::Client.new
|
38
12
|
client.query :radb, AS
|
39
13
|
results = client.perform
|
40
14
|
|
41
|
-
%i
|
42
|
-
|
43
|
-
end.
|
15
|
+
%i[ipv4 ipv6].map do |family|
|
16
|
+
results[AS][family][AS]
|
17
|
+
end.flatten
|
44
18
|
end
|
45
19
|
end
|
46
20
|
|
47
|
-
rule Legitbot::Facebook, %w
|
21
|
+
rule Legitbot::Facebook, %w[Facebot facebookexternalhit/1.1]
|
48
22
|
end
|