legitbot 1.0.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.editorconfig +17 -0
- data/.github/workflows/build.yml +1 -1
- data/.rubocop.yml +1 -0
- data/README.md +6 -0
- data/legitbot.gemspec +2 -2
- data/lib/legitbot.rb +1 -0
- data/lib/legitbot/alexa.rb +17 -0
- data/lib/legitbot/apple.rb +0 -8
- data/lib/legitbot/legitbot.rb +2 -0
- data/lib/legitbot/validators/ip_ranges.rb +12 -8
- data/lib/legitbot/version.rb +1 -1
- data/test/alexa_test.rb +36 -0
- data/test/facebook_test.rb +2 -2
- data/test/legitbot/validators/ip_ranges_test.rb +18 -0
- metadata +14 -12
- data/test/apple_as_google_test.rb +0 -27
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6cc8cb39a7c005fd9c8e97d095f482c98db48ebbb570408feb81782e03ce4e88
|
4
|
+
data.tar.gz: 200b76238f1cbd2239233bc82347e5d0f4c00b3856754a87beea7ecc3e80e54e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7f3fe97a9e42feaf5233ae5733428fa7d87f1f5e545860240a1990dcef30a5785adeb349858817dc39a494978a910686778b3bb601f6928890ea1078d488b194
|
7
|
+
data.tar.gz: d4af0fced1c8fc0d629b49bab5da65ecdb945905ce2e2b76f0c78a3ed2aa7c71aa978d048bc11b591bb3f73c4a7981f23f1b52b5a98531e626ec611f6d436aaa
|
data/.editorconfig
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
root = true
|
2
|
+
|
3
|
+
[*]
|
4
|
+
end_of_line = lf
|
5
|
+
insert_final_newline = true
|
6
|
+
trim_trailing_whitespace = true
|
7
|
+
charset = utf-8
|
8
|
+
|
9
|
+
indent_style = space
|
10
|
+
indent_size = 2
|
11
|
+
|
12
|
+
[*.md]
|
13
|
+
trim_trailing_whitespace = false
|
14
|
+
|
15
|
+
[*.yml]
|
16
|
+
indent_style = space
|
17
|
+
indent_size = 2
|
data/.github/workflows/build.yml
CHANGED
data/.rubocop.yml
CHANGED
data/README.md
CHANGED
@@ -44,6 +44,7 @@ end
|
|
44
44
|
## Supported
|
45
45
|
|
46
46
|
* [Ahrefs](https://ahrefs.com/robot)
|
47
|
+
* [Alexa](https://support.alexa.com/hc/en-us/articles/360046707834-What-are-the-IP-addresses-for-Alexa-s-Certify-and-Site-Audit-crawlers-)
|
47
48
|
* [Applebot](https://support.apple.com/en-us/HT204683)
|
48
49
|
* [Baidu spider](http://help.baidu.com/question?prod_en=master&class=498&id=1000973)
|
49
50
|
* [Bingbot](https://blogs.bing.com/webmaster/2012/08/31/how-to-verify-that-bingbot-is-bingbot/)
|
@@ -67,3 +68,8 @@ Apache 2.0
|
|
67
68
|
detects bots by `User-Agent`
|
68
69
|
* [crawler_detect](https://github.com/loadkpi/crawler_detect) is a Ruby gem and Rack
|
69
70
|
middleware to detect crawlers by few different request headers, including `User-Agent`
|
71
|
+
* Project Honeypot's
|
72
|
+
[http:BL](https://www.projecthoneypot.org/httpbl_api.php) can not only
|
73
|
+
classify IP as a search engine, but also label them as suspicious and
|
74
|
+
reports the number of days since the last activity. My implementation of
|
75
|
+
the protocol in Scala is [here](https://github.com/osinka/httpbl).
|
data/legitbot.gemspec
CHANGED
@@ -19,8 +19,8 @@ Gem::Specification.new do |spec|
|
|
19
19
|
spec.add_dependency 'irrc', '~> 0.2', '>= 0.2.1'
|
20
20
|
spec.add_development_dependency 'bump', '~> 0.8', '>= 0.8.0'
|
21
21
|
spec.add_development_dependency 'minitest', '~> 5.1', '>= 5.1.0'
|
22
|
-
spec.add_development_dependency 'rake', '~>
|
23
|
-
spec.add_development_dependency 'rubocop', '~> 0.
|
22
|
+
spec.add_development_dependency 'rake', '~> 13.0', '>= 13.0.0'
|
23
|
+
spec.add_development_dependency 'rubocop', '~> 0.92.0', '>= 0.92.0'
|
24
24
|
|
25
25
|
spec.files = `git ls-files`.split($INPUT_RECORD_SEPARATOR)
|
26
26
|
spec.rdoc_options = ['--charset=UTF-8']
|
data/lib/legitbot.rb
CHANGED
data/lib/legitbot/alexa.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Legitbot # :nodoc:
|
4
|
+
# https://support.alexa.com/hc/en-us/articles/360046707834-What-are-the-IP-addresses-for-Alexa-s-Certify-and-Site-Audit-crawlers-
|
5
|
+
# https://support.alexa.com/hc/en-us/articles/200462340
|
6
|
+
# https://support.alexa.com/hc/en-us/articles/200450194
|
7
|
+
class Alexa < BotMatch
|
8
|
+
ip_ranges %w[
|
9
|
+
52.86.176.3
|
10
|
+
52.4.48.181
|
11
|
+
52.2.182.169
|
12
|
+
52.86.185.29
|
13
|
+
]
|
14
|
+
end
|
15
|
+
|
16
|
+
rule Legitbot::Alexa, %w[Alexabot ia_archiver]
|
17
|
+
end
|
data/lib/legitbot/apple.rb
CHANGED
@@ -8,13 +8,5 @@ module Legitbot # :nodoc:
|
|
8
8
|
ip_ranges '17.0.0.0/8'
|
9
9
|
end
|
10
10
|
|
11
|
-
# https://support.apple.com/en-us/HT204683
|
12
|
-
# rubocop:disable Naming/ClassAndModuleCamelCase
|
13
|
-
class Apple_as_Google < BotMatch
|
14
|
-
ip_ranges '17.0.0.0/8'
|
15
|
-
end
|
16
|
-
# rubocop:enable Naming/ClassAndModuleCamelCase
|
17
|
-
|
18
11
|
rule Legitbot::Apple, %w[Applebot]
|
19
|
-
rule Legitbot::Apple_as_Google, %w[Googlebot]
|
20
12
|
end
|
data/lib/legitbot/legitbot.rb
CHANGED
@@ -18,6 +18,7 @@ module Legitbot
|
|
18
18
|
# otherwise.
|
19
19
|
# :yields: a found bot
|
20
20
|
#
|
21
|
+
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
21
22
|
def self.bot(user_agent, ip)
|
22
23
|
bots = @rules
|
23
24
|
.select { |rule| rule[:fragments].any? { |f| user_agent.index f } }
|
@@ -32,6 +33,7 @@ module Legitbot
|
|
32
33
|
selected
|
33
34
|
end
|
34
35
|
end
|
36
|
+
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
35
37
|
|
36
38
|
def self.rule(clazz, fragments)
|
37
39
|
@rules << { class: clazz, fragments: fragments }
|
data/lib/legitbot/validators/ip_ranges.rb
CHANGED
@@ -40,7 +40,7 @@ module Legitbot
|
|
40
40
|
|
41
41
|
obj = IPAddr.new(ip)
|
42
42
|
ranges = valid_ips[obj.ipv4? ? :ipv4 : :ipv6].search(obj.to_i)
|
43
|
-
!ranges.empty?
|
43
|
+
!ranges.nil? && !ranges.empty?
|
44
44
|
end
|
45
45
|
|
46
46
|
def valid_ips
|
@@ -59,22 +59,26 @@ module Legitbot
|
|
59
59
|
partition_ips(@ip_ranges_loader.call)
|
60
60
|
end
|
61
61
|
|
62
|
-
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
63
62
|
def partition_ips(ips)
|
64
|
-
return []
|
63
|
+
return [] unless ips&.any?
|
65
64
|
|
66
65
|
ips
|
67
66
|
.map { |cidr| IPAddr.new(cidr) }
|
68
67
|
.partition(&:ipv4?)
|
69
68
|
.each_with_index
|
70
69
|
.map do |list, index|
|
71
|
-
|
72
|
-
(r.begin.to_i..r.end.to_i)
|
73
|
-
end
|
74
|
-
[FAMILIES[index], IntervalTree::Tree.new(ranges)]
|
70
|
+
[FAMILIES[index], build_interval_tree(list)]
|
75
71
|
end.to_h
|
76
72
|
end
|
77
|
-
|
73
|
+
|
74
|
+
private
|
75
|
+
|
76
|
+
def build_interval_tree(list)
|
77
|
+
ranges = list.map(&:to_range).map do |r|
|
78
|
+
(r.begin.to_i..r.end.to_i)
|
79
|
+
end
|
80
|
+
IntervalTree::Tree.new(ranges)
|
81
|
+
end
|
78
82
|
end
|
79
83
|
end
|
80
84
|
end
|
data/lib/legitbot/version.rb
CHANGED
data/test/alexa_test.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'minitest/autorun'
|
4
|
+
require 'legitbot'
|
5
|
+
|
6
|
+
class AlexaTest < Minitest::Test
|
7
|
+
def test_malicious_ip
|
8
|
+
ip = '149.210.164.47'
|
9
|
+
match = Legitbot::Alexa.new ip
|
10
|
+
assert !match.valid?, msg: "#{ip} is not a real Alexa IP"
|
11
|
+
end
|
12
|
+
|
13
|
+
def test_valid_ip
|
14
|
+
ip = '52.86.176.3'
|
15
|
+
match = Legitbot::Alexa.new ip
|
16
|
+
assert match.valid?, msg: "#{ip} is a valid Alexa IP"
|
17
|
+
end
|
18
|
+
|
19
|
+
def test_malicious_ua
|
20
|
+
bot = Legitbot.bot(
|
21
|
+
'Mozilla/5.0 (compatible; Alexabot/1.0; +http://www.alexa.com/help/certifyscan; certifyscan@alexa.com)',
|
22
|
+
'149.210.164.47'
|
23
|
+
)
|
24
|
+
assert bot, msg: 'Alexa detected from User-Agent'
|
25
|
+
assert !bot.valid?, msg: 'Not a valid Alexa'
|
26
|
+
end
|
27
|
+
|
28
|
+
def test_valid_ua
|
29
|
+
bot = Legitbot.bot(
|
30
|
+
'Mozilla/5.0 (compatible; Alexabot/1.0; +http://www.alexa.com/help/certifyscan; certifyscan@alexa.com)',
|
31
|
+
'52.86.176.3'
|
32
|
+
)
|
33
|
+
assert bot, msg: 'Alexa detected from User-Agent'
|
34
|
+
assert bot.valid?, msg: 'Valid Alexa'
|
35
|
+
end
|
36
|
+
end
|
data/test/facebook_test.rb
CHANGED
@@ -33,7 +33,7 @@ class FacebookTest < Minitest::Test
|
|
33
33
|
assert match.fake?, msg: "#{ip} is a fake Facebook IP"
|
34
34
|
end
|
35
35
|
|
36
|
-
# rubocop:disable
|
36
|
+
# rubocop:disable Layout/LineLength, Metrics/MethodLength
|
37
37
|
def test_user_agent
|
38
38
|
Legitbot.bot(
|
39
39
|
'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)',
|
@@ -59,5 +59,5 @@ class FacebookTest < Minitest::Test
|
|
59
59
|
assert bot.fake?, msg: 'fake Facebook'
|
60
60
|
end
|
61
61
|
end
|
62
|
-
# rubocop:enable
|
62
|
+
# rubocop:enable Layout/LineLength, Metrics/MethodLength
|
63
63
|
end
|
data/test/legitbot/validators/ip_ranges_test.rb
CHANGED
@@ -46,6 +46,16 @@ module Legitbot
|
|
46
46
|
end
|
47
47
|
end
|
48
48
|
|
49
|
+
class NilRanges
|
50
|
+
include IpRanges
|
51
|
+
ip_ranges { nil }
|
52
|
+
end
|
53
|
+
|
54
|
+
class Ipv4Ranges
|
55
|
+
include IpRanges
|
56
|
+
ip_ranges { ['66.220.144.0/21'] }
|
57
|
+
end
|
58
|
+
|
49
59
|
class IpRangesTest < Minitest::Test
|
50
60
|
def test_partition_method
|
51
61
|
empty = NoRanges.partition_ips([])
|
@@ -108,6 +118,14 @@ module Legitbot
|
|
108
118
|
assert_equal 2, LoadRanges.counter
|
109
119
|
end
|
110
120
|
# rubocop:enable Metrics/AbcSize
|
121
|
+
|
122
|
+
def test_nil_ranges
|
123
|
+
assert NilRanges.valid_ip?('127.0.0.1')
|
124
|
+
end
|
125
|
+
|
126
|
+
def test_ipv4_only_ranges
|
127
|
+
refute Ipv4Ranges.valid_ip?('2a03:2880:f234:0:0:0:0:1')
|
128
|
+
end
|
111
129
|
end
|
112
130
|
end
|
113
131
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: legitbot
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Alexander Azarov
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-09-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: augmented_interval_tree
|
@@ -96,40 +96,40 @@ dependencies:
|
|
96
96
|
requirements:
|
97
97
|
- - "~>"
|
98
98
|
- !ruby/object:Gem::Version
|
99
|
-
version: '
|
99
|
+
version: '13.0'
|
100
100
|
- - ">="
|
101
101
|
- !ruby/object:Gem::Version
|
102
|
-
version:
|
102
|
+
version: 13.0.0
|
103
103
|
type: :development
|
104
104
|
prerelease: false
|
105
105
|
version_requirements: !ruby/object:Gem::Requirement
|
106
106
|
requirements:
|
107
107
|
- - "~>"
|
108
108
|
- !ruby/object:Gem::Version
|
109
|
-
version: '
|
109
|
+
version: '13.0'
|
110
110
|
- - ">="
|
111
111
|
- !ruby/object:Gem::Version
|
112
|
-
version:
|
112
|
+
version: 13.0.0
|
113
113
|
- !ruby/object:Gem::Dependency
|
114
114
|
name: rubocop
|
115
115
|
requirement: !ruby/object:Gem::Requirement
|
116
116
|
requirements:
|
117
117
|
- - "~>"
|
118
118
|
- !ruby/object:Gem::Version
|
119
|
-
version:
|
119
|
+
version: 0.92.0
|
120
120
|
- - ">="
|
121
121
|
- !ruby/object:Gem::Version
|
122
|
-
version: 0.
|
122
|
+
version: 0.92.0
|
123
123
|
type: :development
|
124
124
|
prerelease: false
|
125
125
|
version_requirements: !ruby/object:Gem::Requirement
|
126
126
|
requirements:
|
127
127
|
- - "~>"
|
128
128
|
- !ruby/object:Gem::Version
|
129
|
-
version:
|
129
|
+
version: 0.92.0
|
130
130
|
- - ">="
|
131
131
|
- !ruby/object:Gem::Version
|
132
|
-
version: 0.
|
132
|
+
version: 0.92.0
|
133
133
|
description: Does Web request come from a real search engine or from an impersonating
|
134
134
|
agent?
|
135
135
|
email: self@alaz.me
|
@@ -137,6 +137,7 @@ executables: []
|
|
137
137
|
extensions: []
|
138
138
|
extra_rdoc_files: []
|
139
139
|
files:
|
140
|
+
- ".editorconfig"
|
140
141
|
- ".github/workflows/build.yml"
|
141
142
|
- ".gitignore"
|
142
143
|
- ".rubocop.yml"
|
@@ -148,6 +149,7 @@ files:
|
|
148
149
|
- legitbot.gemspec
|
149
150
|
- lib/legitbot.rb
|
150
151
|
- lib/legitbot/ahrefs.rb
|
152
|
+
- lib/legitbot/alexa.rb
|
151
153
|
- lib/legitbot/apple.rb
|
152
154
|
- lib/legitbot/baidu.rb
|
153
155
|
- lib/legitbot/bing.rb
|
@@ -165,7 +167,7 @@ files:
|
|
165
167
|
- lib/legitbot/version.rb
|
166
168
|
- lib/legitbot/yandex.rb
|
167
169
|
- test/ahrefs_test.rb
|
168
|
-
- test/apple_as_google_test.rb
|
170
|
+
- test/alexa_test.rb
|
169
171
|
- test/apple_test.rb
|
170
172
|
- test/botmatch_test.rb
|
171
173
|
- test/facebook_test.rb
|
@@ -205,9 +207,9 @@ test_files:
|
|
205
207
|
- test/legitbot/validators/domains_test.rb
|
206
208
|
- test/legitbot/validators/ip_ranges_test.rb
|
207
209
|
- test/pinterest_test.rb
|
210
|
+
- test/alexa_test.rb
|
208
211
|
- test/ahrefs_test.rb
|
209
212
|
- test/apple_test.rb
|
210
|
-
- test/apple_as_google_test.rb
|
211
213
|
- test/oracle_test.rb
|
212
214
|
- test/google_test.rb
|
213
215
|
- test/botmatch_test.rb
|
data/test/apple_as_google_test.rb
DELETED
@@ -1,27 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'minitest/autorun'
|
4
|
-
require 'legitbot'
|
5
|
-
|
6
|
-
class AppleAsGoogleTest < Minitest::Test
|
7
|
-
def test_valid_ip
|
8
|
-
ip = '17.58.98.60'
|
9
|
-
match = Legitbot::Apple_as_Google.new(ip)
|
10
|
-
assert match.valid?, msg: "#{ip} is a valid Applebot IP"
|
11
|
-
end
|
12
|
-
|
13
|
-
def test_invalid_ip
|
14
|
-
ip = '127.0.0.1'
|
15
|
-
match = Legitbot::Apple_as_Google.new(ip)
|
16
|
-
assert match.fake?, msg: "#{ip} is a fake Applebot IP"
|
17
|
-
end
|
18
|
-
|
19
|
-
def test_user_agent
|
20
|
-
bot = Legitbot.bot(
|
21
|
-
'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
|
22
|
-
'17.58.98.60'
|
23
|
-
)
|
24
|
-
assert_equal :apple_as_google, bot.detected_as
|
25
|
-
assert bot.valid?, msg: 'A valid Applebot User-agent and IP'
|
26
|
-
end
|
27
|
-
end
|