legitbot 1.0.0 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 34d1432c7c405d783b22a46851db88ccdea9f303defeccdd1cf98604bbb6ce09
4
- data.tar.gz: a66b586f4b2dca67fb875ea37add6e7d89a7ce5d0705c3d1898d96ecf091036e
3
+ metadata.gz: 6cc8cb39a7c005fd9c8e97d095f482c98db48ebbb570408feb81782e03ce4e88
4
+ data.tar.gz: 200b76238f1cbd2239233bc82347e5d0f4c00b3856754a87beea7ecc3e80e54e
5
5
  SHA512:
6
- metadata.gz: cad1db2571e939020f74e871365c4748dd78ff18eb8ad9f005ea5bf5b0707835e296afa2601fc6309c994f69b7903d21da788fc219f5d712ee75e1ae9885fb7b
7
- data.tar.gz: c72af598d60c55aff35a1b5e244dcde160a67589c588d68b9482d5e5c5f0590441c92505ec94beb462ff70a730ec9aabe80877a2d9db2f72c566c3a9c0b19059
6
+ metadata.gz: 7f3fe97a9e42feaf5233ae5733428fa7d87f1f5e545860240a1990dcef30a5785adeb349858817dc39a494978a910686778b3bb601f6928890ea1078d488b194
7
+ data.tar.gz: d4af0fced1c8fc0d629b49bab5da65ecdb945905ce2e2b76f0c78a3ed2aa7c71aa978d048bc11b591bb3f73c4a7981f23f1b52b5a98531e626ec611f6d436aaa
@@ -0,0 +1,17 @@
1
+ root = true
2
+
3
+ [*]
4
+ end_of_line = lf
5
+ insert_final_newline = true
6
+ trim_trailing_whitespace = true
7
+ charset = utf-8
8
+
9
+ indent_style = space
10
+ indent_size = 2
11
+
12
+ [*.md]
13
+ trim_trailing_whitespace = false
14
+
15
+ [*.yml]
16
+ indent_style = space
17
+ indent_size = 2
@@ -1,6 +1,6 @@
1
1
  name: build
2
2
 
3
- on: [push]
3
+ on: [pull_request, push]
4
4
 
5
5
  jobs:
6
6
  test:
@@ -1,2 +1,3 @@
1
1
  AllCops:
2
2
  CacheRootDirectory: 'vendor'
3
+ NewCops: enable
data/README.md CHANGED
@@ -44,6 +44,7 @@ end
44
44
  ## Supported
45
45
 
46
46
  * [Ahrefs](https://ahrefs.com/robot)
47
+ * [Alexa](https://support.alexa.com/hc/en-us/articles/360046707834-What-are-the-IP-addresses-for-Alexa-s-Certify-and-Site-Audit-crawlers-)
47
48
  * [Applebot](https://support.apple.com/en-us/HT204683)
48
49
  * [Baidu spider](http://help.baidu.com/question?prod_en=master&class=498&id=1000973)
49
50
  * [Bingbot](https://blogs.bing.com/webmaster/2012/08/31/how-to-verify-that-bingbot-is-bingbot/)
@@ -67,3 +68,8 @@ Apache 2.0
67
68
  detects bots by `User-Agent`
68
69
  * [crawler_detect](https://github.com/loadkpi/crawler_detect) is a Ruby gem and Rack
69
70
  middleware to detect crawlers by few different request headers, including `User-Agent`
71
+ * Project Honeypot's
72
+ [http:BL](https://www.projecthoneypot.org/httpbl_api.php) can not only
73
+ classify an IP as a search engine, but also label it as suspicious and
74
+ report the number of days since the last activity. My implementation of
75
+ the protocol in Scala is [here](https://github.com/osinka/httpbl).
@@ -19,8 +19,8 @@ Gem::Specification.new do |spec|
19
19
  spec.add_dependency 'irrc', '~> 0.2', '>= 0.2.1'
20
20
  spec.add_development_dependency 'bump', '~> 0.8', '>= 0.8.0'
21
21
  spec.add_development_dependency 'minitest', '~> 5.1', '>= 5.1.0'
22
- spec.add_development_dependency 'rake', '~> 12.3', '>= 12.3.0'
23
- spec.add_development_dependency 'rubocop', '~> 0.74', '>= 0.74.0'
22
+ spec.add_development_dependency 'rake', '~> 13.0', '>= 13.0.0'
23
+ spec.add_development_dependency 'rubocop', '~> 0.92.0', '>= 0.92.0'
24
24
 
25
25
  spec.files = `git ls-files`.split($INPUT_RECORD_SEPARATOR)
26
26
  spec.rdoc_options = ['--charset=UTF-8']
@@ -4,6 +4,7 @@ require_relative 'legitbot/legitbot'
4
4
  require_relative 'legitbot/botmatch'
5
5
 
6
6
  require_relative 'legitbot/ahrefs'
7
+ require_relative 'legitbot/alexa'
7
8
  require_relative 'legitbot/apple'
8
9
  require_relative 'legitbot/baidu'
9
10
  require_relative 'legitbot/bing'
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Legitbot # :nodoc:
4
+ # https://support.alexa.com/hc/en-us/articles/360046707834-What-are-the-IP-addresses-for-Alexa-s-Certify-and-Site-Audit-crawlers-
5
+ # https://support.alexa.com/hc/en-us/articles/200462340
6
+ # https://support.alexa.com/hc/en-us/articles/200450194
7
+ class Alexa < BotMatch
8
+ ip_ranges %w[
9
+ 52.86.176.3
10
+ 52.4.48.181
11
+ 52.2.182.169
12
+ 52.86.185.29
13
+ ]
14
+ end
15
+
16
+ rule Legitbot::Alexa, %w[Alexabot ia_archiver]
17
+ end
@@ -8,13 +8,5 @@ module Legitbot # :nodoc:
8
8
  ip_ranges '17.0.0.0/8'
9
9
  end
10
10
 
11
- # https://support.apple.com/en-us/HT204683
12
- # rubocop:disable Naming/ClassAndModuleCamelCase
13
- class Apple_as_Google < BotMatch
14
- ip_ranges '17.0.0.0/8'
15
- end
16
- # rubocop:enable Naming/ClassAndModuleCamelCase
17
-
18
11
  rule Legitbot::Apple, %w[Applebot]
19
- rule Legitbot::Apple_as_Google, %w[Googlebot]
20
12
  end
@@ -18,6 +18,7 @@ module Legitbot
18
18
  # otherwise.
19
19
  # :yields: a found bot
20
20
  #
21
+ # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
21
22
  def self.bot(user_agent, ip)
22
23
  bots = @rules
23
24
  .select { |rule| rule[:fragments].any? { |f| user_agent.index f } }
@@ -32,6 +33,7 @@ module Legitbot
32
33
  selected
33
34
  end
34
35
  end
36
+ # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
35
37
 
36
38
  def self.rule(clazz, fragments)
37
39
  @rules << { class: clazz, fragments: fragments }
@@ -40,7 +40,7 @@ module Legitbot
40
40
 
41
41
  obj = IPAddr.new(ip)
42
42
  ranges = valid_ips[obj.ipv4? ? :ipv4 : :ipv6].search(obj.to_i)
43
- !ranges.empty?
43
+ !ranges.nil? && !ranges.empty?
44
44
  end
45
45
 
46
46
  def valid_ips
@@ -59,22 +59,26 @@ module Legitbot
59
59
  partition_ips(@ip_ranges_loader.call)
60
60
  end
61
61
 
62
- # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
63
62
  def partition_ips(ips)
64
- return [] if ips.empty?
63
+ return [] unless ips&.any?
65
64
 
66
65
  ips
67
66
  .map { |cidr| IPAddr.new(cidr) }
68
67
  .partition(&:ipv4?)
69
68
  .each_with_index
70
69
  .map do |list, index|
71
- ranges = list.map(&:to_range).map do |r|
72
- (r.begin.to_i..r.end.to_i)
73
- end
74
- [FAMILIES[index], IntervalTree::Tree.new(ranges)]
70
+ [FAMILIES[index], build_interval_tree(list)]
75
71
  end.to_h
76
72
  end
77
- # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
73
+
74
+ private
75
+
76
+ def build_interval_tree(list)
77
+ ranges = list.map(&:to_range).map do |r|
78
+ (r.begin.to_i..r.end.to_i)
79
+ end
80
+ IntervalTree::Tree.new(ranges)
81
+ end
78
82
  end
79
83
  end
80
84
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Legitbot
4
- VERSION = '1.0.0'
4
+ VERSION = '1.2.0'
5
5
  end
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'minitest/autorun'
4
+ require 'legitbot'
5
+
6
+ class AlexaTest < Minitest::Test
7
+ def test_malicious_ip
8
+ ip = '149.210.164.47'
9
+ match = Legitbot::Alexa.new ip
10
+ assert !match.valid?, msg: "#{ip} is not a real Alexa IP"
11
+ end
12
+
13
+ def test_valid_ip
14
+ ip = '52.86.176.3'
15
+ match = Legitbot::Alexa.new ip
16
+ assert match.valid?, msg: "#{ip} is a valid Alexa IP"
17
+ end
18
+
19
+ def test_malicious_ua
20
+ bot = Legitbot.bot(
21
+ 'Mozilla/5.0 (compatible; Alexabot/1.0; +http://www.alexa.com/help/certifyscan; certifyscan@alexa.com)',
22
+ '149.210.164.47'
23
+ )
24
+ assert bot, msg: 'Alexa detected from User-Agent'
25
+ assert !bot.valid?, msg: 'Not a valid Alexa'
26
+ end
27
+
28
+ def test_valid_ua
29
+ bot = Legitbot.bot(
30
+ 'Mozilla/5.0 (compatible; Alexabot/1.0; +http://www.alexa.com/help/certifyscan; certifyscan@alexa.com)',
31
+ '52.86.176.3'
32
+ )
33
+ assert bot, msg: 'Alexa detected from User-Agent'
34
+ assert bot.valid?, msg: 'Valid Alexa'
35
+ end
36
+ end
@@ -33,7 +33,7 @@ class FacebookTest < Minitest::Test
33
33
  assert match.fake?, msg: "#{ip} is a fake Facebook IP"
34
34
  end
35
35
 
36
- # rubocop:disable Metrics/AbcSize, Layout/LineLength, Metrics/MethodLength
36
+ # rubocop:disable Layout/LineLength, Metrics/MethodLength
37
37
  def test_user_agent
38
38
  Legitbot.bot(
39
39
  'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)',
@@ -59,5 +59,5 @@ class FacebookTest < Minitest::Test
59
59
  assert bot.fake?, msg: 'fake Facebook'
60
60
  end
61
61
  end
62
- # rubocop:enable Metrics/AbcSize, Layout/LineLength, Metrics/MethodLength
62
+ # rubocop:enable Layout/LineLength, Metrics/MethodLength
63
63
  end
@@ -46,6 +46,16 @@ module Legitbot
46
46
  end
47
47
  end
48
48
 
49
+ class NilRanges
50
+ include IpRanges
51
+ ip_ranges { nil }
52
+ end
53
+
54
+ class Ipv4Ranges
55
+ include IpRanges
56
+ ip_ranges { ['66.220.144.0/21'] }
57
+ end
58
+
49
59
  class IpRangesTest < Minitest::Test
50
60
  def test_partition_method
51
61
  empty = NoRanges.partition_ips([])
@@ -108,6 +118,14 @@ module Legitbot
108
118
  assert_equal 2, LoadRanges.counter
109
119
  end
110
120
  # rubocop:enable Metrics/AbcSize
121
+
122
+ def test_nil_ranges
123
+ assert NilRanges.valid_ip?('127.0.0.1')
124
+ end
125
+
126
+ def test_ipv4_only_ranges
127
+ refute Ipv4Ranges.valid_ip?('2a03:2880:f234:0:0:0:0:1')
128
+ end
111
129
  end
112
130
  end
113
131
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: legitbot
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Alexander Azarov
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-06-18 00:00:00.000000000 Z
11
+ date: 2020-09-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: augmented_interval_tree
@@ -96,40 +96,40 @@ dependencies:
96
96
  requirements:
97
97
  - - "~>"
98
98
  - !ruby/object:Gem::Version
99
- version: '12.3'
99
+ version: '13.0'
100
100
  - - ">="
101
101
  - !ruby/object:Gem::Version
102
- version: 12.3.0
102
+ version: 13.0.0
103
103
  type: :development
104
104
  prerelease: false
105
105
  version_requirements: !ruby/object:Gem::Requirement
106
106
  requirements:
107
107
  - - "~>"
108
108
  - !ruby/object:Gem::Version
109
- version: '12.3'
109
+ version: '13.0'
110
110
  - - ">="
111
111
  - !ruby/object:Gem::Version
112
- version: 12.3.0
112
+ version: 13.0.0
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: rubocop
115
115
  requirement: !ruby/object:Gem::Requirement
116
116
  requirements:
117
117
  - - "~>"
118
118
  - !ruby/object:Gem::Version
119
- version: '0.74'
119
+ version: 0.92.0
120
120
  - - ">="
121
121
  - !ruby/object:Gem::Version
122
- version: 0.74.0
122
+ version: 0.92.0
123
123
  type: :development
124
124
  prerelease: false
125
125
  version_requirements: !ruby/object:Gem::Requirement
126
126
  requirements:
127
127
  - - "~>"
128
128
  - !ruby/object:Gem::Version
129
- version: '0.74'
129
+ version: 0.92.0
130
130
  - - ">="
131
131
  - !ruby/object:Gem::Version
132
- version: 0.74.0
132
+ version: 0.92.0
133
133
  description: Does Web request come from a real search engine or from an impersonating
134
134
  agent?
135
135
  email: self@alaz.me
@@ -137,6 +137,7 @@ executables: []
137
137
  extensions: []
138
138
  extra_rdoc_files: []
139
139
  files:
140
+ - ".editorconfig"
140
141
  - ".github/workflows/build.yml"
141
142
  - ".gitignore"
142
143
  - ".rubocop.yml"
@@ -148,6 +149,7 @@ files:
148
149
  - legitbot.gemspec
149
150
  - lib/legitbot.rb
150
151
  - lib/legitbot/ahrefs.rb
152
+ - lib/legitbot/alexa.rb
151
153
  - lib/legitbot/apple.rb
152
154
  - lib/legitbot/baidu.rb
153
155
  - lib/legitbot/bing.rb
@@ -165,7 +167,7 @@ files:
165
167
  - lib/legitbot/version.rb
166
168
  - lib/legitbot/yandex.rb
167
169
  - test/ahrefs_test.rb
168
- - test/apple_as_google_test.rb
170
+ - test/alexa_test.rb
169
171
  - test/apple_test.rb
170
172
  - test/botmatch_test.rb
171
173
  - test/facebook_test.rb
@@ -205,9 +207,9 @@ test_files:
205
207
  - test/legitbot/validators/domains_test.rb
206
208
  - test/legitbot/validators/ip_ranges_test.rb
207
209
  - test/pinterest_test.rb
210
+ - test/alexa_test.rb
208
211
  - test/ahrefs_test.rb
209
212
  - test/apple_test.rb
210
- - test/apple_as_google_test.rb
211
213
  - test/oracle_test.rb
212
214
  - test/google_test.rb
213
215
  - test/botmatch_test.rb
@@ -1,27 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'minitest/autorun'
4
- require 'legitbot'
5
-
6
- class AppleAsGoogleTest < Minitest::Test
7
- def test_valid_ip
8
- ip = '17.58.98.60'
9
- match = Legitbot::Apple_as_Google.new(ip)
10
- assert match.valid?, msg: "#{ip} is a valid Applebot IP"
11
- end
12
-
13
- def test_invalid_ip
14
- ip = '127.0.0.1'
15
- match = Legitbot::Apple_as_Google.new(ip)
16
- assert match.fake?, msg: "#{ip} is a fake Applebot IP"
17
- end
18
-
19
- def test_user_agent
20
- bot = Legitbot.bot(
21
- 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
22
- '17.58.98.60'
23
- )
24
- assert_equal :apple_as_google, bot.detected_as
25
- assert bot.valid?, msg: 'A valid Applebot User-agent and IP'
26
- end
27
- end