legitbot 0.3.2 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/build.yml +60 -0
- data/.gitignore +1 -0
- data/.rubocop.yml +2 -0
- data/.ruby-version +1 -0
- data/Gemfile +2 -0
- data/README.md +3 -1
- data/Rakefile +5 -3
- data/legitbot.gemspec +19 -18
- data/lib/legitbot.rb +4 -0
- data/lib/legitbot/ahrefs.rb +13 -8
- data/lib/legitbot/apple.rb +11 -11
- data/lib/legitbot/baidu.rb +5 -7
- data/lib/legitbot/bing.rb +5 -7
- data/lib/legitbot/botmatch.rb +17 -44
- data/lib/legitbot/config/resolver.rb +18 -0
- data/lib/legitbot/duckduckgo.rb +18 -7
- data/lib/legitbot/facebook.rb +8 -34
- data/lib/legitbot/google.rb +5 -8
- data/lib/legitbot/legitbot.rb +14 -9
- data/lib/legitbot/oracle.rb +10 -0
- data/lib/legitbot/pinterest.rb +5 -8
- data/lib/legitbot/twitter.rb +14 -0
- data/lib/legitbot/validators/domains.rb +71 -0
- data/lib/legitbot/validators/ip_ranges.rb +81 -0
- data/lib/legitbot/version.rb +3 -1
- data/lib/legitbot/yandex.rb +41 -12
- data/test/ahrefs_test.rb +16 -8
- data/test/apple_as_google_test.rb +9 -4
- data/test/apple_test.rb +11 -4
- data/test/botmatch_test.rb +4 -22
- data/test/facebook_test.rb +25 -10
- data/test/google_test.rb +24 -14
- data/test/legitbot/validators/domains_test.rb +58 -0
- data/test/legitbot/validators/ip_ranges_test.rb +113 -0
- data/test/legitbot_test.rb +8 -4
- data/test/oracle_test.rb +36 -0
- data/test/pinterest_test.rb +26 -14
- data/test/twitter_test.rb +36 -0
- metadata +87 -23
- data/.travis.yml +0 -12
data/lib/legitbot/google.rb
CHANGED
@@ -1,14 +1,11 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Legitbot # :nodoc:
|
2
4
|
# https://support.google.com/webmasters/answer/1061943
|
3
5
|
# https://support.google.com/webmasters/answer/80553
|
4
|
-
|
5
6
|
class Google < BotMatch
|
6
|
-
|
7
|
-
|
8
|
-
def valid?
|
9
|
-
subdomain_of?(*Google::ValidDomains) && reverse_resolves?
|
10
|
-
end
|
7
|
+
domains 'google.com.', 'googlebot.com.'
|
11
8
|
end
|
12
9
|
|
13
|
-
rule Legitbot::Google, %w
|
10
|
+
rule Legitbot::Google, %w[Googlebot Mediapartners-Google AdsBot-Google]
|
14
11
|
end
|
data/lib/legitbot/legitbot.rb
CHANGED
@@ -1,6 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
##
|
4
|
+
# Bot lookup based on user agent
|
1
5
|
module Legitbot
|
2
6
|
@rules = []
|
3
7
|
|
8
|
+
class << self
|
9
|
+
attr_accessor :resolver_config
|
10
|
+
end
|
11
|
+
|
4
12
|
##
|
5
13
|
# Lookup a bot based on its signature from +User-Agent+ header.
|
6
14
|
#
|
@@ -10,15 +18,12 @@ module Legitbot
|
|
10
18
|
# otherwise.
|
11
19
|
# :yields: a found bot
|
12
20
|
#
|
13
|
-
def self.bot(
|
14
|
-
bots =
|
15
|
-
|
16
|
-
|
17
|
-
}.map { |rule|
|
18
|
-
rule[:class].new(ip, resolver_config)
|
19
|
-
}
|
21
|
+
def self.bot(user_agent, ip)
|
22
|
+
bots = @rules
|
23
|
+
.select { |rule| rule[:fragments].any? { |f| user_agent.index f } }
|
24
|
+
.map { |rule| rule[:class].new(ip) }
|
20
25
|
|
21
|
-
selected = bots.select
|
26
|
+
selected = bots.select(&:valid?).first if bots.size > 1
|
22
27
|
selected = bots.last if selected.nil?
|
23
28
|
|
24
29
|
if selected && block_given?
|
@@ -29,6 +34,6 @@ module Legitbot
|
|
29
34
|
end
|
30
35
|
|
31
36
|
def self.rule(clazz, fragments)
|
32
|
-
@rules << {:
|
37
|
+
@rules << { class: clazz, fragments: fragments }
|
33
38
|
end
|
34
39
|
end
|
data/lib/legitbot/pinterest.rb
CHANGED
@@ -1,13 +1,10 @@
|
|
1
|
-
|
2
|
-
# https://help.pinterest.com/en/articles/about-pinterest-crawler-0
|
1
|
+
# frozen_string_literal: true
|
3
2
|
|
3
|
+
module Legitbot # :nodoc:
|
4
|
+
# https://help.pinterest.com/en/articles/about-pinterest-crawler-0
|
4
5
|
class Pinterest < BotMatch
|
5
|
-
|
6
|
-
|
7
|
-
def valid?
|
8
|
-
subdomain_of?(*Pinterest::ValidDomains) && reverse_resolves?
|
9
|
-
end
|
6
|
+
domains 'pinterest.com.'
|
10
7
|
end
|
11
8
|
|
12
|
-
rule Legitbot::Pinterest, %w
|
9
|
+
rule Legitbot::Pinterest, %w[Pinterestbot Pinterest/0.2]
|
13
10
|
end
|
data/lib/legitbot/twitter.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Legitbot # :nodoc:
|
4
|
+
# https://developer.twitter.com/en/docs/tweets/optimize-with-cards/guides/getting-started
|
5
|
+
# https://developer.twitter.com/en/docs/tweets/optimize-with-cards/guides/troubleshooting-cards
|
6
|
+
class Twitter < BotMatch
|
7
|
+
ip_ranges %w[
|
8
|
+
199.16.156.0/22
|
9
|
+
199.59.148.0/22
|
10
|
+
]
|
11
|
+
end
|
12
|
+
|
13
|
+
rule Legitbot::Twitter, %w[Twitterbot]
|
14
|
+
end
|
data/lib/legitbot/validators/domains.rb
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'resolv'
|
4
|
+
require 'ipaddr'
|
5
|
+
|
6
|
+
module Legitbot
|
7
|
+
module Validators
|
8
|
+
#
|
9
|
+
# In a bot matcher:
|
10
|
+
# `domains 'search.msn.com', ...`
|
11
|
+
# `domains 'googlebot.com', reverse: false`
|
12
|
+
#
|
13
|
+
# `reverse` is true by default.
|
14
|
+
module Domains
|
15
|
+
class << self
|
16
|
+
def included(base)
|
17
|
+
base.extend ClassMethods
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def valid_domain?
|
22
|
+
self.class.valid_domain?(@ip)
|
23
|
+
end
|
24
|
+
|
25
|
+
module ClassMethods # :nodoc:
|
26
|
+
include Legitbot::Config::Resolver
|
27
|
+
|
28
|
+
def domains(*list, reverse: true)
|
29
|
+
@valid_domains = list.flatten.map { |d| Resolv::DNS::Name.create(d) }
|
30
|
+
@validate_reverse_record = reverse
|
31
|
+
end
|
32
|
+
|
33
|
+
def check_domains?
|
34
|
+
instance_variable_defined?(:@valid_domains)
|
35
|
+
end
|
36
|
+
|
37
|
+
def valid_domain?(ip)
|
38
|
+
return true unless check_domains?
|
39
|
+
return true if @valid_domains.empty?
|
40
|
+
|
41
|
+
domains = reverse_domains(ip)
|
42
|
+
return false if domains.empty?
|
43
|
+
|
44
|
+
record = find_subdomain_record(domains)
|
45
|
+
return false unless record
|
46
|
+
return true unless @validate_reverse_record
|
47
|
+
|
48
|
+
ip == reverse_ip(record)
|
49
|
+
end
|
50
|
+
|
51
|
+
def reverse_domains(ip)
|
52
|
+
resolver.getnames(ip)
|
53
|
+
rescue Resolv::ResolvError
|
54
|
+
nil
|
55
|
+
end
|
56
|
+
|
57
|
+
def find_subdomain_record(domains)
|
58
|
+
domains.find do |d|
|
59
|
+
@valid_domains.any? { |vd| d.subdomain_of?(vd) }
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
def reverse_ip(record)
|
64
|
+
return nil if record.nil?
|
65
|
+
|
66
|
+
resolver.getaddress(record.to_s).to_s
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
data/lib/legitbot/validators/ip_ranges.rb
ADDED
@@ -0,0 +1,81 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'ipaddr'
|
4
|
+
require 'interval_tree'
|
5
|
+
|
6
|
+
module Legitbot
|
7
|
+
module Validators
|
8
|
+
#
|
9
|
+
# In a bot matcher:
|
10
|
+
# `ip_ranges ip, range, ip, ...`
|
11
|
+
# `ip_ranges do [ip, range, ...]; end`
|
12
|
+
module IpRanges
|
13
|
+
class << self
|
14
|
+
def included(base)
|
15
|
+
base.extend ClassMethods
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def valid_ip?
|
20
|
+
self.class.valid_ip?(@ip)
|
21
|
+
end
|
22
|
+
|
23
|
+
module ClassMethods # :nodoc:
|
24
|
+
FAMILIES = %i[ipv4 ipv6].freeze
|
25
|
+
EMPTY_GENERATOR = proc { [] }
|
26
|
+
|
27
|
+
def ip_ranges(*ips, &block)
|
28
|
+
@ip_ranges = partition_ips(ips.flatten) unless ips.empty?
|
29
|
+
@ip_ranges_loader = block_given? ? block : EMPTY_GENERATOR
|
30
|
+
@ip_loader_mutex = Mutex.new
|
31
|
+
end
|
32
|
+
|
33
|
+
def check_ranges?
|
34
|
+
instance_variable_defined?(:@ip_ranges_loader)
|
35
|
+
end
|
36
|
+
|
37
|
+
def valid_ip?(ip)
|
38
|
+
return true unless check_ranges?
|
39
|
+
return true if valid_ips.empty?
|
40
|
+
|
41
|
+
obj = IPAddr.new(ip)
|
42
|
+
ranges = valid_ips[obj.ipv4? ? :ipv4 : :ipv6].search(obj.to_i)
|
43
|
+
!ranges.empty?
|
44
|
+
end
|
45
|
+
|
46
|
+
def valid_ips
|
47
|
+
@ip_loader_mutex.synchronize do
|
48
|
+
@ip_ranges ||= load_ips
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
def reload_ips
|
53
|
+
@ip_loader_mutex.synchronize do
|
54
|
+
@ip_ranges = load_ips
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
def load_ips
|
59
|
+
partition_ips(@ip_ranges_loader.call)
|
60
|
+
end
|
61
|
+
|
62
|
+
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
63
|
+
def partition_ips(ips)
|
64
|
+
return [] if ips.empty?
|
65
|
+
|
66
|
+
ips
|
67
|
+
.map { |cidr| IPAddr.new(cidr) }
|
68
|
+
.partition(&:ipv4?)
|
69
|
+
.each_with_index
|
70
|
+
.map do |list, index|
|
71
|
+
ranges = list.map(&:to_range).map do |r|
|
72
|
+
(r.begin.to_i..r.end.to_i)
|
73
|
+
end
|
74
|
+
[FAMILIES[index], IntervalTree::Tree.new(ranges)]
|
75
|
+
end.to_h
|
76
|
+
end
|
77
|
+
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
data/lib/legitbot/version.rb
CHANGED
data/lib/legitbot/yandex.rb
CHANGED
@@ -1,17 +1,46 @@
|
|
1
|
-
|
2
|
-
# https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
|
1
|
+
# frozen_string_literal: true
|
3
2
|
|
3
|
+
module Legitbot # :nodoc:
|
4
|
+
# https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
|
4
5
|
class Yandex < BotMatch
|
5
|
-
|
6
|
-
|
7
|
-
def valid?
|
8
|
-
subdomain_of?(*Yandex::ValidDomains) && reverse_resolves?
|
9
|
-
end
|
6
|
+
domains 'yandex.ru.', 'yandex.net.', 'yandex.com.'
|
10
7
|
end
|
11
8
|
|
12
|
-
rule Legitbot::Yandex, %w
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
9
|
+
rule Legitbot::Yandex, %w[
|
10
|
+
YandexAccessibilityBot
|
11
|
+
YandexAdNet
|
12
|
+
YandexBlogs
|
13
|
+
YandexBot/
|
14
|
+
YandexCalendar
|
15
|
+
YandexDirect/
|
16
|
+
YandexDirectDyn
|
17
|
+
YandexFavicons
|
18
|
+
YaDirectFetcher
|
19
|
+
YandexForDomain
|
20
|
+
YandexImages
|
21
|
+
YandexImageResizer
|
22
|
+
YandexMobileBot
|
23
|
+
YandexMarket
|
24
|
+
YandexMedia
|
25
|
+
YandexMetrika
|
26
|
+
YandexMobileScreenShotBot
|
27
|
+
YandexNews
|
28
|
+
YandexOntoDB
|
29
|
+
YandexOntoDBAPI
|
30
|
+
YandexPagechecker
|
31
|
+
YandexPartner
|
32
|
+
YandexRCA
|
33
|
+
YandexSearchShop
|
34
|
+
YandexSitelinks
|
35
|
+
YandexSpravBot
|
36
|
+
YandexTracker
|
37
|
+
YandexTurbo
|
38
|
+
YandexVertis
|
39
|
+
YandexVerticals
|
40
|
+
YandexVideo
|
41
|
+
YandexVideoParser
|
42
|
+
YandexWebmaster
|
43
|
+
YandexScreenshotBot
|
44
|
+
YandexMedianaBot
|
45
|
+
]
|
17
46
|
end
|
data/test/ahrefs_test.rb
CHANGED
@@ -1,28 +1,36 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'minitest/autorun'
|
2
4
|
require 'legitbot'
|
3
5
|
|
4
6
|
class AhrefsTest < Minitest::Test
|
5
7
|
def test_malicious_ip
|
6
|
-
ip =
|
8
|
+
ip = '149.210.164.47'
|
7
9
|
match = Legitbot::Ahrefs.new ip
|
8
10
|
assert !match.valid?, msg: "#{ip} is not a real Ahrefs IP"
|
9
11
|
end
|
10
12
|
|
11
13
|
def test_valid_ip
|
12
|
-
ip =
|
14
|
+
ip = '54.36.148.0'
|
13
15
|
match = Legitbot::Ahrefs.new ip
|
14
16
|
assert match.valid?, msg: "#{ip} is a valid Ahrefs IP"
|
15
17
|
end
|
16
18
|
|
17
19
|
def test_malicious_ua
|
18
|
-
bot = Legitbot.bot(
|
19
|
-
|
20
|
-
|
20
|
+
bot = Legitbot.bot(
|
21
|
+
'Mozilla/5.0 (compatible; AhrefsBot/6.1; +http://ahrefs.com/robot/)',
|
22
|
+
'149.210.164.47'
|
23
|
+
)
|
24
|
+
assert bot, msg: 'Ahrefs detected from User-Agent'
|
25
|
+
assert !bot.valid?, msg: 'Not a valid Ahrefs'
|
21
26
|
end
|
22
27
|
|
23
28
|
def test_valid_ua
|
24
|
-
bot = Legitbot.bot(
|
25
|
-
|
26
|
-
|
29
|
+
bot = Legitbot.bot(
|
30
|
+
'Mozilla/5.0 (compatible; AhrefsBot/6.1; +http://ahrefs.com/robot/)',
|
31
|
+
'54.36.148.0'
|
32
|
+
)
|
33
|
+
assert bot, msg: 'Ahrefs detected from User-Agent'
|
34
|
+
assert bot.valid?, msg: 'Valid Ahrefs'
|
27
35
|
end
|
28
36
|
end
|
data/test/apple_as_google_test.rb
CHANGED
@@ -1,22 +1,27 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'minitest/autorun'
|
2
4
|
require 'legitbot'
|
3
5
|
|
4
6
|
class AppleAsGoogleTest < Minitest::Test
|
5
7
|
def test_valid_ip
|
6
|
-
ip =
|
8
|
+
ip = '17.58.98.60'
|
7
9
|
match = Legitbot::Apple_as_Google.new(ip)
|
8
10
|
assert match.valid?, msg: "#{ip} is a valid Applebot IP"
|
9
11
|
end
|
10
12
|
|
11
13
|
def test_invalid_ip
|
12
|
-
ip =
|
14
|
+
ip = '127.0.0.1'
|
13
15
|
match = Legitbot::Apple_as_Google.new(ip)
|
14
16
|
assert match.fake?, msg: "#{ip} is a fake Applebot IP"
|
15
17
|
end
|
16
18
|
|
17
19
|
def test_user_agent
|
18
|
-
bot = Legitbot.bot(
|
20
|
+
bot = Legitbot.bot(
|
21
|
+
'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
|
22
|
+
'17.58.98.60'
|
23
|
+
)
|
19
24
|
assert_equal :apple_as_google, bot.detected_as
|
20
|
-
assert bot.valid?, msg:
|
25
|
+
assert bot.valid?, msg: 'A valid Applebot User-agent and IP'
|
21
26
|
end
|
22
27
|
end
|
data/test/apple_test.rb
CHANGED
@@ -1,22 +1,29 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'minitest/autorun'
|
2
4
|
require 'legitbot'
|
3
5
|
|
4
6
|
class AppleTest < Minitest::Test
|
5
7
|
def test_valid_ip
|
6
|
-
ip =
|
8
|
+
ip = '17.58.98.60'
|
7
9
|
match = Legitbot::Apple.new(ip)
|
8
10
|
assert match.valid?, msg: "#{ip} is a valid Applebot IP"
|
9
11
|
end
|
10
12
|
|
11
13
|
def test_invalid_ip
|
12
|
-
ip =
|
14
|
+
ip = '127.0.0.1'
|
13
15
|
match = Legitbot::Apple.new(ip)
|
14
16
|
assert match.fake?, msg: "#{ip} is a fake Applebot IP"
|
15
17
|
end
|
16
18
|
|
19
|
+
# rubocop:disable Layout/LineLength
|
17
20
|
def test_user_agent
|
18
|
-
bot = Legitbot.bot(
|
21
|
+
bot = Legitbot.bot(
|
22
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/600.2.5 (KHTML, like Gecko) Version/8.0.2 Safari/600.2.5 (Applebot/0.1; +http://www.apple.com/go/applebot)',
|
23
|
+
'17.58.98.60'
|
24
|
+
)
|
19
25
|
assert_equal :apple, bot.detected_as
|
20
|
-
assert bot.valid?, msg:
|
26
|
+
assert bot.valid?, msg: 'A valid Applebot User-agent and IP'
|
21
27
|
end
|
28
|
+
# rubocop:enable Layout/LineLength
|
22
29
|
end
|
data/test/botmatch_test.rb
CHANGED
@@ -1,29 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'minitest/autorun'
|
2
4
|
require 'legitbot'
|
3
5
|
|
4
6
|
class BotMatchTest < Minitest::Test
|
5
|
-
def test_reverse_name
|
6
|
-
match = Legitbot::BotMatch.new "66.249.64.141"
|
7
|
-
assert_equal "crawl-66-249-64-141.googlebot.com", match.reverse_name
|
8
|
-
end
|
9
|
-
|
10
|
-
def test_reverse_ip
|
11
|
-
match = Legitbot::BotMatch.new "66.249.64.141"
|
12
|
-
assert_equal "66.249.64.141", match.reversed_ip
|
13
|
-
end
|
14
|
-
|
15
|
-
def test_reverse_resolves
|
16
|
-
match = Legitbot::BotMatch.new "66.249.64.141"
|
17
|
-
assert_equal true, match.reverse_resolves?
|
18
|
-
end
|
19
|
-
|
20
|
-
def test_reverse_doesnt_resolve
|
21
|
-
match = Legitbot::BotMatch.new "5.140.70.64"
|
22
|
-
assert !match.reverse_resolves?
|
23
|
-
end
|
24
|
-
|
25
7
|
def test_valid_class_syntax
|
26
|
-
assert Legitbot::Google.valid?(
|
27
|
-
assert Legitbot::Google.fake?(
|
8
|
+
assert Legitbot::Google.valid?('66.249.64.141'), msg: 'Valid Googlebot'
|
9
|
+
assert Legitbot::Google.fake?('149.210.164.47'), msg: 'Fake Googlebot'
|
28
10
|
end
|
29
11
|
end
|