legitbot 0.3.2 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/build.yml +60 -0
- data/.gitignore +1 -0
- data/.rubocop.yml +2 -0
- data/.ruby-version +1 -0
- data/Gemfile +2 -0
- data/README.md +3 -1
- data/Rakefile +5 -3
- data/legitbot.gemspec +19 -18
- data/lib/legitbot.rb +4 -0
- data/lib/legitbot/ahrefs.rb +13 -8
- data/lib/legitbot/apple.rb +11 -11
- data/lib/legitbot/baidu.rb +5 -7
- data/lib/legitbot/bing.rb +5 -7
- data/lib/legitbot/botmatch.rb +17 -44
- data/lib/legitbot/config/resolver.rb +18 -0
- data/lib/legitbot/duckduckgo.rb +18 -7
- data/lib/legitbot/facebook.rb +8 -34
- data/lib/legitbot/google.rb +5 -8
- data/lib/legitbot/legitbot.rb +14 -9
- data/lib/legitbot/oracle.rb +10 -0
- data/lib/legitbot/pinterest.rb +5 -8
- data/lib/legitbot/twitter.rb +14 -0
- data/lib/legitbot/validators/domains.rb +71 -0
- data/lib/legitbot/validators/ip_ranges.rb +81 -0
- data/lib/legitbot/version.rb +3 -1
- data/lib/legitbot/yandex.rb +41 -12
- data/test/ahrefs_test.rb +16 -8
- data/test/apple_as_google_test.rb +9 -4
- data/test/apple_test.rb +11 -4
- data/test/botmatch_test.rb +4 -22
- data/test/facebook_test.rb +25 -10
- data/test/google_test.rb +24 -14
- data/test/legitbot/validators/domains_test.rb +58 -0
- data/test/legitbot/validators/ip_ranges_test.rb +113 -0
- data/test/legitbot_test.rb +8 -4
- data/test/oracle_test.rb +36 -0
- data/test/pinterest_test.rb +26 -14
- data/test/twitter_test.rb +36 -0
- metadata +87 -23
- data/.travis.yml +0 -12
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 34d1432c7c405d783b22a46851db88ccdea9f303defeccdd1cf98604bbb6ce09
|
4
|
+
data.tar.gz: a66b586f4b2dca67fb875ea37add6e7d89a7ce5d0705c3d1898d96ecf091036e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cad1db2571e939020f74e871365c4748dd78ff18eb8ad9f005ea5bf5b0707835e296afa2601fc6309c994f69b7903d21da788fc219f5d712ee75e1ae9885fb7b
|
7
|
+
data.tar.gz: c72af598d60c55aff35a1b5e244dcde160a67589c588d68b9482d5e5c5f0590441c92505ec94beb462ff70a730ec9aabe80877a2d9db2f72c566c3a9c0b19059
|
data/.github/workflows/build.yml
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
name: build
|
2
|
+
|
3
|
+
on: [push]
|
4
|
+
|
5
|
+
jobs:
|
6
|
+
test:
|
7
|
+
runs-on: ubuntu-latest
|
8
|
+
|
9
|
+
strategy:
|
10
|
+
fail-fast: false
|
11
|
+
matrix:
|
12
|
+
ruby: [ jruby, 2.6 ]
|
13
|
+
|
14
|
+
steps:
|
15
|
+
- uses: actions/checkout@v2
|
16
|
+
- name: Set up Ruby
|
17
|
+
uses: ruby/setup-ruby@v1
|
18
|
+
with:
|
19
|
+
ruby-version: ${{ matrix.ruby }}
|
20
|
+
- name: Cache dependencies
|
21
|
+
uses: actions/cache@v1
|
22
|
+
with:
|
23
|
+
path: vendor/bundle
|
24
|
+
key: ${{ runner.os }}-${{ matrix.ruby }}-gems-${{ hashFiles('**/Gemfile.lock') }}
|
25
|
+
restore-keys: |
|
26
|
+
${{ runner.os }}-${{ matrix.ruby }}-gems-
|
27
|
+
- name: Install dependencies
|
28
|
+
run: |
|
29
|
+
bundle config path vendor/bundle
|
30
|
+
bundle install --jobs 4 --retry 3
|
31
|
+
- name: Run tests
|
32
|
+
run: bundle exec rake test
|
33
|
+
|
34
|
+
lint:
|
35
|
+
needs: test
|
36
|
+
runs-on: ubuntu-latest
|
37
|
+
|
38
|
+
strategy:
|
39
|
+
matrix:
|
40
|
+
ruby: [ 2.6 ]
|
41
|
+
|
42
|
+
steps:
|
43
|
+
- uses: actions/checkout@v2
|
44
|
+
- name: Set up Ruby
|
45
|
+
uses: ruby/setup-ruby@v1
|
46
|
+
with:
|
47
|
+
ruby-version: ${{ matrix.ruby }}
|
48
|
+
- name: Cache dependencies
|
49
|
+
uses: actions/cache@v1
|
50
|
+
with:
|
51
|
+
path: vendor/bundle
|
52
|
+
key: ${{ runner.os }}-${{ matrix.ruby }}-gems-${{ hashFiles('**/Gemfile.lock') }}
|
53
|
+
restore-keys: |
|
54
|
+
${{ runner.os }}-${{ matrix.ruby }}-gems-
|
55
|
+
- name: Install dependencies
|
56
|
+
run: |
|
57
|
+
bundle config path vendor/bundle
|
58
|
+
bundle install --jobs 4 --retry 3
|
59
|
+
- name: Run linter
|
60
|
+
run: bundle exec rubocop
|
data/.gitignore
CHANGED
data/.rubocop.yml
ADDED
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
2.4
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Legitbot
|
1
|
+
# Legitbot [![Gem Version](https://badge.fury.io/rb/legitbot.svg)](https://badge.fury.io/rb/legitbot)
|
2
2
|
|
3
3
|
Ruby gem to check that an IP belongs to a bot, typically a search
|
4
4
|
engine. This can be of help in protecting a web site from fake search
|
@@ -50,7 +50,9 @@ end
|
|
50
50
|
* [DuckDuckGo bot](https://duckduckgo.com/duckduckbot)
|
51
51
|
* [Facebook crawler](https://developers.facebook.com/docs/sharing/webmasters/crawler)
|
52
52
|
* [Google crawlers](https://support.google.com/webmasters/answer/1061943)
|
53
|
+
* [Oracle Data Cloud Crawler](https://www.oracle.com/corporate/acquisitions/grapeshot/crawler.html)
|
53
54
|
* [Pinterest](https://help.pinterest.com/en/articles/about-pinterest-crawler-0)
|
55
|
+
* [Twitterbot](https://developer.twitter.com/en/docs/tweets/optimize-with-cards/guides/getting-started), the list of IPs is in the [Troubleshooting page](https://developer.twitter.com/en/docs/tweets/optimize-with-cards/guides/troubleshooting-cards)
|
54
56
|
* [Yandex robots](https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.xml)
|
55
57
|
|
56
58
|
## License
|
data/Rakefile
CHANGED
@@ -1,14 +1,16 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'rubygems'
|
2
4
|
require 'bundler'
|
3
5
|
require 'bump/tasks'
|
4
|
-
require
|
6
|
+
require 'rake/testtask'
|
5
7
|
Bundler::GemHelper.install_tasks
|
6
8
|
|
7
9
|
Bump.tag_by_default = true
|
8
10
|
|
9
11
|
Rake::TestTask.new do |t|
|
10
|
-
t.libs <<
|
11
|
-
t.test_files = FileList['test
|
12
|
+
t.libs << 'test'
|
13
|
+
t.test_files = FileList['test/**/*_test.rb']
|
12
14
|
t.warning = true
|
13
15
|
t.verbose = true
|
14
16
|
end
|
data/legitbot.gemspec
CHANGED
@@ -1,27 +1,28 @@
|
|
1
|
-
#
|
2
|
-
|
3
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
$LOAD_PATH.push File.expand_path('lib', __dir__)
|
4
|
+
require 'legitbot/version'
|
4
5
|
|
5
6
|
Gem::Specification.new do |spec|
|
6
7
|
spec.name = 'legitbot'
|
7
8
|
spec.version = Legitbot::VERSION
|
8
9
|
spec.license = 'Apache-2.0'
|
9
10
|
|
10
|
-
spec.author =
|
11
|
-
spec.email =
|
12
|
-
spec.homepage =
|
13
|
-
spec.summary =
|
14
|
-
spec.description =
|
15
|
-
"made by a real search engine, not a malicious agent"
|
11
|
+
spec.author = 'Alexander Azarov'
|
12
|
+
spec.email = 'self@alaz.me'
|
13
|
+
spec.homepage = 'https://github.com/alaz/legitbot'
|
14
|
+
spec.summary = 'Validate requests from Web crawlers: impersonating or not?'
|
15
|
+
spec.description = 'Does Web request come from a real search engine or from an impersonating agent?'
|
16
16
|
|
17
|
-
spec.required_ruby_version = '>= 2.
|
18
|
-
spec.add_dependency
|
19
|
-
spec.add_dependency
|
20
|
-
spec.add_development_dependency
|
21
|
-
spec.add_development_dependency
|
22
|
-
spec.add_development_dependency
|
17
|
+
spec.required_ruby_version = '>= 2.4.0'
|
18
|
+
spec.add_dependency 'augmented_interval_tree', '~> 0.1', '>= 0.1.1'
|
19
|
+
spec.add_dependency 'irrc', '~> 0.2', '>= 0.2.1'
|
20
|
+
spec.add_development_dependency 'bump', '~> 0.8', '>= 0.8.0'
|
21
|
+
spec.add_development_dependency 'minitest', '~> 5.1', '>= 5.1.0'
|
22
|
+
spec.add_development_dependency 'rake', '~> 12.3', '>= 12.3.0'
|
23
|
+
spec.add_development_dependency 'rubocop', '~> 0.74', '>= 0.74.0'
|
23
24
|
|
24
|
-
spec.files = `git ls-files`.split(
|
25
|
-
spec.rdoc_options = [
|
26
|
-
spec.test_files = Dir.glob(
|
25
|
+
spec.files = `git ls-files`.split($INPUT_RECORD_SEPARATOR)
|
26
|
+
spec.rdoc_options = ['--charset=UTF-8']
|
27
|
+
spec.test_files = Dir.glob('test/**/*')
|
27
28
|
end
|
data/lib/legitbot.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require_relative 'legitbot/legitbot'
|
2
4
|
require_relative 'legitbot/botmatch'
|
3
5
|
|
@@ -8,5 +10,7 @@ require_relative 'legitbot/bing'
|
|
8
10
|
require_relative 'legitbot/duckduckgo'
|
9
11
|
require_relative 'legitbot/facebook'
|
10
12
|
require_relative 'legitbot/google'
|
13
|
+
require_relative 'legitbot/oracle'
|
11
14
|
require_relative 'legitbot/pinterest'
|
15
|
+
require_relative 'legitbot/twitter'
|
12
16
|
require_relative 'legitbot/yandex'
|
data/lib/legitbot/ahrefs.rb
CHANGED
@@ -1,13 +1,18 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Legitbot # :nodoc:
|
2
4
|
# https://ahrefs.com/robot
|
3
5
|
class Ahrefs < BotMatch
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
6
|
+
ip_ranges %w[
|
7
|
+
54.36.148.0/24
|
8
|
+
54.36.149.0/24
|
9
|
+
54.36.150.0/24
|
10
|
+
195.154.122.0/24
|
11
|
+
195.154.123.0/24
|
12
|
+
195.154.126.0/24
|
13
|
+
195.154.127.0/24
|
14
|
+
]
|
10
15
|
end
|
11
16
|
|
12
|
-
rule Legitbot::Ahrefs, %w
|
17
|
+
rule Legitbot::Ahrefs, %w[AhrefsBot]
|
13
18
|
end
|
data/lib/legitbot/apple.rb
CHANGED
@@ -1,20 +1,20 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'ipaddr'
|
2
4
|
|
3
|
-
module Legitbot
|
5
|
+
module Legitbot # :nodoc:
|
4
6
|
# https://support.apple.com/en-us/HT204683
|
5
|
-
|
6
7
|
class Apple < BotMatch
|
7
|
-
|
8
|
-
|
9
|
-
def valid?
|
10
|
-
ip = IPAddr.new @ip
|
11
|
-
Range.include? ip
|
12
|
-
end
|
8
|
+
ip_ranges '17.0.0.0/8'
|
13
9
|
end
|
14
10
|
|
15
|
-
|
11
|
+
# https://support.apple.com/en-us/HT204683
|
12
|
+
# rubocop:disable Naming/ClassAndModuleCamelCase
|
13
|
+
class Apple_as_Google < BotMatch
|
14
|
+
ip_ranges '17.0.0.0/8'
|
16
15
|
end
|
16
|
+
# rubocop:enable Naming/ClassAndModuleCamelCase
|
17
17
|
|
18
|
-
rule Legitbot::Apple, %w
|
19
|
-
rule Legitbot::Apple_as_Google, %w
|
18
|
+
rule Legitbot::Apple, %w[Applebot]
|
19
|
+
rule Legitbot::Apple_as_Google, %w[Googlebot]
|
20
20
|
end
|
data/lib/legitbot/baidu.rb
CHANGED
@@ -1,12 +1,10 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Legitbot # :nodoc:
|
2
4
|
# http://help.baidu.com/question?prod_en=master&class=498&id=1000973
|
3
5
|
class Baidu < BotMatch
|
4
|
-
|
5
|
-
|
6
|
-
def valid?
|
7
|
-
subdomain_of?(*Baidu::ValidDomains)
|
8
|
-
end
|
6
|
+
domains 'baidu.com.', 'baidu.jp.', reverse: false
|
9
7
|
end
|
10
8
|
|
11
|
-
rule Legitbot::Baidu, %w
|
9
|
+
rule Legitbot::Baidu, %w[Baiduspider]
|
12
10
|
end
|
data/lib/legitbot/bing.rb
CHANGED
@@ -1,12 +1,10 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Legitbot # :nodoc:
|
2
4
|
# https://blogs.bing.com/webmaster/2012/08/31/how-to-verify-that-bingbot-is-bingbot/
|
3
5
|
class Bing < BotMatch
|
4
|
-
|
5
|
-
|
6
|
-
def valid?
|
7
|
-
subdomain_of?(*Bing::ValidDomains) && reverse_resolves?
|
8
|
-
end
|
6
|
+
domains 'search.msn.com.'
|
9
7
|
end
|
10
8
|
|
11
|
-
rule Legitbot::Bing, %w
|
9
|
+
rule Legitbot::Bing, %w[Bingbot bingbot]
|
12
10
|
end
|
data/lib/legitbot/botmatch.rb
CHANGED
@@ -1,5 +1,8 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'config/resolver'
|
4
|
+
require_relative 'validators/domains'
|
5
|
+
require_relative 'validators/ip_ranges'
|
3
6
|
|
4
7
|
module Legitbot
|
5
8
|
##
|
@@ -7,61 +10,31 @@ module Legitbot
|
|
7
10
|
# +valid?+, +fake?+ and +detected_as+
|
8
11
|
#
|
9
12
|
class BotMatch
|
10
|
-
|
11
|
-
|
12
|
-
@ip = ip
|
13
|
-
end
|
14
|
-
|
15
|
-
##
|
16
|
-
# Returns a Resolv::DNS::Name instance with
|
17
|
-
# the reverse name
|
18
|
-
def reverse_domain
|
19
|
-
@reverse_domain ||= @dns.getname(@ip)
|
20
|
-
rescue Resolv::ResolvError
|
21
|
-
@reverse_domain ||= nil
|
22
|
-
end
|
23
|
-
|
24
|
-
##
|
25
|
-
# Returns a String with the reverse name
|
26
|
-
def reverse_name
|
27
|
-
reverse_domain&.to_s
|
28
|
-
end
|
29
|
-
|
30
|
-
##
|
31
|
-
# Returns a String with IP created from the reverse name
|
32
|
-
def reversed_ip
|
33
|
-
return nil if reverse_name.nil?
|
13
|
+
include Legitbot::Validators::IpRanges
|
14
|
+
include Legitbot::Validators::Domains
|
34
15
|
|
35
|
-
|
36
|
-
@
|
37
|
-
end
|
38
|
-
|
39
|
-
def reverse_resolves?
|
40
|
-
@ip == reversed_ip
|
41
|
-
end
|
42
|
-
|
43
|
-
def subdomain_of?(*domains)
|
44
|
-
return false if reverse_name.nil?
|
45
|
-
|
46
|
-
domains.any? { |d|
|
47
|
-
reverse_domain.subdomain_of? Resolv::DNS::Name.create(d)
|
48
|
-
}
|
16
|
+
def initialize(ip)
|
17
|
+
@ip = ip
|
49
18
|
end
|
50
19
|
|
51
20
|
def detected_as
|
52
21
|
self.class.name.split('::').last.downcase.to_sym
|
53
22
|
end
|
54
23
|
|
24
|
+
def valid?
|
25
|
+
valid_ip? && valid_domain?
|
26
|
+
end
|
27
|
+
|
55
28
|
def fake?
|
56
29
|
!valid?
|
57
30
|
end
|
58
31
|
|
59
|
-
def self.valid?(ip
|
60
|
-
|
32
|
+
def self.valid?(ip)
|
33
|
+
new(ip).valid?
|
61
34
|
end
|
62
35
|
|
63
|
-
def self.fake?(ip
|
64
|
-
|
36
|
+
def self.fake?(ip)
|
37
|
+
new(ip).fake?
|
65
38
|
end
|
66
39
|
end
|
67
40
|
end
|
data/lib/legitbot/config/resolver.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'resolv'
|
4
|
+
|
5
|
+
module Legitbot
|
6
|
+
module Config
|
7
|
+
module Resolver # :nodoc:
|
8
|
+
def resolver_config(options = nil)
|
9
|
+
@resolver_config = options
|
10
|
+
end
|
11
|
+
|
12
|
+
def resolver
|
13
|
+
@resolver_config ||= Legitbot.resolver_config
|
14
|
+
@resolver ||= Resolv::DNS.new @resolver_config
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
data/lib/legitbot/duckduckgo.rb
CHANGED
@@ -1,12 +1,23 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Legitbot # :nodoc:
|
2
4
|
# https://duckduckgo.com/duckduckbot
|
3
5
|
class DuckDuckGo < BotMatch
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
6
|
+
ip_ranges %w[
|
7
|
+
23.21.227.69
|
8
|
+
40.88.21.235
|
9
|
+
50.16.241.113
|
10
|
+
50.16.241.114
|
11
|
+
50.16.241.117
|
12
|
+
50.16.247.234
|
13
|
+
52.204.97.54
|
14
|
+
52.5.190.19
|
15
|
+
54.197.234.188
|
16
|
+
54.208.100.253
|
17
|
+
54.208.102.37
|
18
|
+
107.21.1.8
|
19
|
+
]
|
9
20
|
end
|
10
21
|
|
11
|
-
rule Legitbot::DuckDuckGo, %w
|
22
|
+
rule Legitbot::DuckDuckGo, %w[DuckDuckBot]
|
12
23
|
end
|
data/lib/legitbot/facebook.rb
CHANGED
@@ -1,48 +1,22 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
2
3
|
require 'irrc'
|
3
|
-
require 'interval_tree'
|
4
4
|
|
5
|
-
module Legitbot
|
5
|
+
module Legitbot # :nodoc:
|
6
6
|
# https://developers.facebook.com/docs/sharing/webmasters/crawler
|
7
|
-
|
8
7
|
class Facebook < BotMatch
|
9
8
|
AS = 'AS32934'
|
10
9
|
|
11
|
-
|
12
|
-
ip = IPAddr.new(@ip)
|
13
|
-
Facebook.valid_ips[ip.ipv4? ? :ipv4 : :ipv6].search(ip.to_i).size > 0
|
14
|
-
end
|
15
|
-
|
16
|
-
@mutex = Mutex.new
|
17
|
-
|
18
|
-
def self.valid_ips
|
19
|
-
@mutex.synchronize { @ips ||= load_ips }
|
20
|
-
end
|
21
|
-
|
22
|
-
def self.reload!
|
23
|
-
@mutex.synchronize { @ips = load_ips }
|
24
|
-
end
|
25
|
-
|
26
|
-
def self.load_ips
|
27
|
-
whois.map do |(family, records)|
|
28
|
-
ranges = records.map do |cidr|
|
29
|
-
range = IPAddr.new(cidr).to_range
|
30
|
-
(range.begin.to_i..range.end.to_i)
|
31
|
-
end
|
32
|
-
[family, IntervalTree::Tree.new(ranges)]
|
33
|
-
end.to_h
|
34
|
-
end
|
35
|
-
|
36
|
-
def self.whois
|
10
|
+
ip_ranges do
|
37
11
|
client = Irrc::Client.new
|
38
12
|
client.query :radb, AS
|
39
13
|
results = client.perform
|
40
14
|
|
41
|
-
%i
|
42
|
-
|
43
|
-
end.
|
15
|
+
%i[ipv4 ipv6].map do |family|
|
16
|
+
results[AS][family][AS]
|
17
|
+
end.flatten
|
44
18
|
end
|
45
19
|
end
|
46
20
|
|
47
|
-
rule Legitbot::Facebook, %w
|
21
|
+
rule Legitbot::Facebook, %w[Facebot facebookexternalhit/1.1]
|
48
22
|
end
|