legitbot 1.0.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 34d1432c7c405d783b22a46851db88ccdea9f303defeccdd1cf98604bbb6ce09
4
- data.tar.gz: a66b586f4b2dca67fb875ea37add6e7d89a7ce5d0705c3d1898d96ecf091036e
3
+ metadata.gz: 6cc8cb39a7c005fd9c8e97d095f482c98db48ebbb570408feb81782e03ce4e88
4
+ data.tar.gz: 200b76238f1cbd2239233bc82347e5d0f4c00b3856754a87beea7ecc3e80e54e
5
5
  SHA512:
6
- metadata.gz: cad1db2571e939020f74e871365c4748dd78ff18eb8ad9f005ea5bf5b0707835e296afa2601fc6309c994f69b7903d21da788fc219f5d712ee75e1ae9885fb7b
7
- data.tar.gz: c72af598d60c55aff35a1b5e244dcde160a67589c588d68b9482d5e5c5f0590441c92505ec94beb462ff70a730ec9aabe80877a2d9db2f72c566c3a9c0b19059
6
+ metadata.gz: 7f3fe97a9e42feaf5233ae5733428fa7d87f1f5e545860240a1990dcef30a5785adeb349858817dc39a494978a910686778b3bb601f6928890ea1078d488b194
7
+ data.tar.gz: d4af0fced1c8fc0d629b49bab5da65ecdb945905ce2e2b76f0c78a3ed2aa7c71aa978d048bc11b591bb3f73c4a7981f23f1b52b5a98531e626ec611f6d436aaa
@@ -0,0 +1,17 @@
1
+ root = true
2
+
3
+ [*]
4
+ end_of_line = lf
5
+ insert_final_newline = true
6
+ trim_trailing_whitespace = true
7
+ charset = utf-8
8
+
9
+ indent_style = space
10
+ indent_size = 2
11
+
12
+ [*.md]
13
+ trim_trailing_whitespace = false
14
+
15
+ [*.yml]
16
+ indent_style = space
17
+ indent_size = 2
@@ -1,6 +1,6 @@
1
1
  name: build
2
2
 
3
- on: [push]
3
+ on: [pull_request, push]
4
4
 
5
5
  jobs:
6
6
  test:
@@ -1,2 +1,3 @@
1
1
  AllCops:
2
2
  CacheRootDirectory: 'vendor'
3
+ NewCops: enable
data/README.md CHANGED
@@ -44,6 +44,7 @@ end
44
44
  ## Supported
45
45
 
46
46
  * [Ahrefs](https://ahrefs.com/robot)
47
+ * [Alexa](https://support.alexa.com/hc/en-us/articles/360046707834-What-are-the-IP-addresses-for-Alexa-s-Certify-and-Site-Audit-crawlers-)
47
48
  * [Applebot](https://support.apple.com/en-us/HT204683)
48
49
  * [Baidu spider](http://help.baidu.com/question?prod_en=master&class=498&id=1000973)
49
50
  * [Bingbot](https://blogs.bing.com/webmaster/2012/08/31/how-to-verify-that-bingbot-is-bingbot/)
@@ -67,3 +68,8 @@ Apache 2.0
67
68
  detects bots by `User-Agent`
68
69
  * [crawler_detect](https://github.com/loadkpi/crawler_detect) is a Ruby gem and Rack
69
70
  middleware to detect crawlers by few different request headers, including `User-Agent`
71
+ * Project Honeypot's
72
+ [http:BL](https://www.projecthoneypot.org/httpbl_api.php) can not only
73
+ classify IPs as search engines, but also label them as suspicious and
74
+ report the number of days since the last activity. My implementation of
75
+ the protocol in Scala is [here](https://github.com/osinka/httpbl).
@@ -19,8 +19,8 @@ Gem::Specification.new do |spec|
19
19
  spec.add_dependency 'irrc', '~> 0.2', '>= 0.2.1'
20
20
  spec.add_development_dependency 'bump', '~> 0.8', '>= 0.8.0'
21
21
  spec.add_development_dependency 'minitest', '~> 5.1', '>= 5.1.0'
22
- spec.add_development_dependency 'rake', '~> 12.3', '>= 12.3.0'
23
- spec.add_development_dependency 'rubocop', '~> 0.74', '>= 0.74.0'
22
+ spec.add_development_dependency 'rake', '~> 13.0', '>= 13.0.0'
23
+ spec.add_development_dependency 'rubocop', '~> 0.92.0', '>= 0.92.0'
24
24
 
25
25
  spec.files = `git ls-files`.split($INPUT_RECORD_SEPARATOR)
26
26
  spec.rdoc_options = ['--charset=UTF-8']
@@ -4,6 +4,7 @@ require_relative 'legitbot/legitbot'
4
4
  require_relative 'legitbot/botmatch'
5
5
 
6
6
  require_relative 'legitbot/ahrefs'
7
+ require_relative 'legitbot/alexa'
7
8
  require_relative 'legitbot/apple'
8
9
  require_relative 'legitbot/baidu'
9
10
  require_relative 'legitbot/bing'
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Legitbot # :nodoc:
4
+ # https://support.alexa.com/hc/en-us/articles/360046707834-What-are-the-IP-addresses-for-Alexa-s-Certify-and-Site-Audit-crawlers-
5
+ # https://support.alexa.com/hc/en-us/articles/200462340
6
+ # https://support.alexa.com/hc/en-us/articles/200450194
7
+ class Alexa < BotMatch
8
+ ip_ranges %w[
9
+ 52.86.176.3
10
+ 52.4.48.181
11
+ 52.2.182.169
12
+ 52.86.185.29
13
+ ]
14
+ end
15
+
16
+ rule Legitbot::Alexa, %w[Alexabot ia_archiver]
17
+ end
@@ -8,13 +8,5 @@ module Legitbot # :nodoc:
8
8
  ip_ranges '17.0.0.0/8'
9
9
  end
10
10
 
11
- # https://support.apple.com/en-us/HT204683
12
- # rubocop:disable Naming/ClassAndModuleCamelCase
13
- class Apple_as_Google < BotMatch
14
- ip_ranges '17.0.0.0/8'
15
- end
16
- # rubocop:enable Naming/ClassAndModuleCamelCase
17
-
18
11
  rule Legitbot::Apple, %w[Applebot]
19
- rule Legitbot::Apple_as_Google, %w[Googlebot]
20
12
  end
@@ -18,6 +18,7 @@ module Legitbot
18
18
  # otherwise.
19
19
  # :yields: a found bot
20
20
  #
21
+ # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
21
22
  def self.bot(user_agent, ip)
22
23
  bots = @rules
23
24
  .select { |rule| rule[:fragments].any? { |f| user_agent.index f } }
@@ -32,6 +33,7 @@ module Legitbot
32
33
  selected
33
34
  end
34
35
  end
36
+ # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
35
37
 
36
38
  def self.rule(clazz, fragments)
37
39
  @rules << { class: clazz, fragments: fragments }
@@ -40,7 +40,7 @@ module Legitbot
40
40
 
41
41
  obj = IPAddr.new(ip)
42
42
  ranges = valid_ips[obj.ipv4? ? :ipv4 : :ipv6].search(obj.to_i)
43
- !ranges.empty?
43
+ !ranges.nil? && !ranges.empty?
44
44
  end
45
45
 
46
46
  def valid_ips
@@ -59,22 +59,26 @@ module Legitbot
59
59
  partition_ips(@ip_ranges_loader.call)
60
60
  end
61
61
 
62
- # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
63
62
  def partition_ips(ips)
64
- return [] if ips.empty?
63
+ return [] unless ips&.any?
65
64
 
66
65
  ips
67
66
  .map { |cidr| IPAddr.new(cidr) }
68
67
  .partition(&:ipv4?)
69
68
  .each_with_index
70
69
  .map do |list, index|
71
- ranges = list.map(&:to_range).map do |r|
72
- (r.begin.to_i..r.end.to_i)
73
- end
74
- [FAMILIES[index], IntervalTree::Tree.new(ranges)]
70
+ [FAMILIES[index], build_interval_tree(list)]
75
71
  end.to_h
76
72
  end
77
- # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
73
+
74
+ private
75
+
76
+ def build_interval_tree(list)
77
+ ranges = list.map(&:to_range).map do |r|
78
+ (r.begin.to_i..r.end.to_i)
79
+ end
80
+ IntervalTree::Tree.new(ranges)
81
+ end
78
82
  end
79
83
  end
80
84
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Legitbot
4
- VERSION = '1.0.0'
4
+ VERSION = '1.2.0'
5
5
  end
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'minitest/autorun'
4
+ require 'legitbot'
5
+
6
+ class AlexaTest < Minitest::Test
7
+ def test_malicious_ip
8
+ ip = '149.210.164.47'
9
+ match = Legitbot::Alexa.new ip
10
+ assert !match.valid?, msg: "#{ip} is not a real Alexa IP"
11
+ end
12
+
13
+ def test_valid_ip
14
+ ip = '52.86.176.3'
15
+ match = Legitbot::Alexa.new ip
16
+ assert match.valid?, msg: "#{ip} is a valid Alexa IP"
17
+ end
18
+
19
+ def test_malicious_ua
20
+ bot = Legitbot.bot(
21
+ 'Mozilla/5.0 (compatible; Alexabot/1.0; +http://www.alexa.com/help/certifyscan; certifyscan@alexa.com)',
22
+ '149.210.164.47'
23
+ )
24
+ assert bot, msg: 'Alexa detected from User-Agent'
25
+ assert !bot.valid?, msg: 'Not a valid Alexa'
26
+ end
27
+
28
+ def test_valid_ua
29
+ bot = Legitbot.bot(
30
+ 'Mozilla/5.0 (compatible; Alexabot/1.0; +http://www.alexa.com/help/certifyscan; certifyscan@alexa.com)',
31
+ '52.86.176.3'
32
+ )
33
+ assert bot, msg: 'Alexa detected from User-Agent'
34
+ assert bot.valid?, msg: 'Valid Alexa'
35
+ end
36
+ end
@@ -33,7 +33,7 @@ class FacebookTest < Minitest::Test
33
33
  assert match.fake?, msg: "#{ip} is a fake Facebook IP"
34
34
  end
35
35
 
36
- # rubocop:disable Metrics/AbcSize, Layout/LineLength, Metrics/MethodLength
36
+ # rubocop:disable Layout/LineLength, Metrics/MethodLength
37
37
  def test_user_agent
38
38
  Legitbot.bot(
39
39
  'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)',
@@ -59,5 +59,5 @@ class FacebookTest < Minitest::Test
59
59
  assert bot.fake?, msg: 'fake Facebook'
60
60
  end
61
61
  end
62
- # rubocop:enable Metrics/AbcSize, Layout/LineLength, Metrics/MethodLength
62
+ # rubocop:enable Layout/LineLength, Metrics/MethodLength
63
63
  end
@@ -46,6 +46,16 @@ module Legitbot
46
46
  end
47
47
  end
48
48
 
49
+ class NilRanges
50
+ include IpRanges
51
+ ip_ranges { nil }
52
+ end
53
+
54
+ class Ipv4Ranges
55
+ include IpRanges
56
+ ip_ranges { ['66.220.144.0/21'] }
57
+ end
58
+
49
59
  class IpRangesTest < Minitest::Test
50
60
  def test_partition_method
51
61
  empty = NoRanges.partition_ips([])
@@ -108,6 +118,14 @@ module Legitbot
108
118
  assert_equal 2, LoadRanges.counter
109
119
  end
110
120
  # rubocop:enable Metrics/AbcSize
121
+
122
+ def test_nil_ranges
123
+ assert NilRanges.valid_ip?('127.0.0.1')
124
+ end
125
+
126
+ def test_ipv4_only_ranges
127
+ refute Ipv4Ranges.valid_ip?('2a03:2880:f234:0:0:0:0:1')
128
+ end
111
129
  end
112
130
  end
113
131
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: legitbot
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Alexander Azarov
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-06-18 00:00:00.000000000 Z
11
+ date: 2020-09-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: augmented_interval_tree
@@ -96,40 +96,40 @@ dependencies:
96
96
  requirements:
97
97
  - - "~>"
98
98
  - !ruby/object:Gem::Version
99
- version: '12.3'
99
+ version: '13.0'
100
100
  - - ">="
101
101
  - !ruby/object:Gem::Version
102
- version: 12.3.0
102
+ version: 13.0.0
103
103
  type: :development
104
104
  prerelease: false
105
105
  version_requirements: !ruby/object:Gem::Requirement
106
106
  requirements:
107
107
  - - "~>"
108
108
  - !ruby/object:Gem::Version
109
- version: '12.3'
109
+ version: '13.0'
110
110
  - - ">="
111
111
  - !ruby/object:Gem::Version
112
- version: 12.3.0
112
+ version: 13.0.0
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: rubocop
115
115
  requirement: !ruby/object:Gem::Requirement
116
116
  requirements:
117
117
  - - "~>"
118
118
  - !ruby/object:Gem::Version
119
- version: '0.74'
119
+ version: 0.92.0
120
120
  - - ">="
121
121
  - !ruby/object:Gem::Version
122
- version: 0.74.0
122
+ version: 0.92.0
123
123
  type: :development
124
124
  prerelease: false
125
125
  version_requirements: !ruby/object:Gem::Requirement
126
126
  requirements:
127
127
  - - "~>"
128
128
  - !ruby/object:Gem::Version
129
- version: '0.74'
129
+ version: 0.92.0
130
130
  - - ">="
131
131
  - !ruby/object:Gem::Version
132
- version: 0.74.0
132
+ version: 0.92.0
133
133
  description: Does Web request come from a real search engine or from an impersonating
134
134
  agent?
135
135
  email: self@alaz.me
@@ -137,6 +137,7 @@ executables: []
137
137
  extensions: []
138
138
  extra_rdoc_files: []
139
139
  files:
140
+ - ".editorconfig"
140
141
  - ".github/workflows/build.yml"
141
142
  - ".gitignore"
142
143
  - ".rubocop.yml"
@@ -148,6 +149,7 @@ files:
148
149
  - legitbot.gemspec
149
150
  - lib/legitbot.rb
150
151
  - lib/legitbot/ahrefs.rb
152
+ - lib/legitbot/alexa.rb
151
153
  - lib/legitbot/apple.rb
152
154
  - lib/legitbot/baidu.rb
153
155
  - lib/legitbot/bing.rb
@@ -165,7 +167,7 @@ files:
165
167
  - lib/legitbot/version.rb
166
168
  - lib/legitbot/yandex.rb
167
169
  - test/ahrefs_test.rb
168
- - test/apple_as_google_test.rb
170
+ - test/alexa_test.rb
169
171
  - test/apple_test.rb
170
172
  - test/botmatch_test.rb
171
173
  - test/facebook_test.rb
@@ -205,9 +207,9 @@ test_files:
205
207
  - test/legitbot/validators/domains_test.rb
206
208
  - test/legitbot/validators/ip_ranges_test.rb
207
209
  - test/pinterest_test.rb
210
+ - test/alexa_test.rb
208
211
  - test/ahrefs_test.rb
209
212
  - test/apple_test.rb
210
- - test/apple_as_google_test.rb
211
213
  - test/oracle_test.rb
212
214
  - test/google_test.rb
213
215
  - test/botmatch_test.rb
@@ -1,27 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'minitest/autorun'
4
- require 'legitbot'
5
-
6
- class AppleAsGoogleTest < Minitest::Test
7
- def test_valid_ip
8
- ip = '17.58.98.60'
9
- match = Legitbot::Apple_as_Google.new(ip)
10
- assert match.valid?, msg: "#{ip} is a valid Applebot IP"
11
- end
12
-
13
- def test_invalid_ip
14
- ip = '127.0.0.1'
15
- match = Legitbot::Apple_as_Google.new(ip)
16
- assert match.fake?, msg: "#{ip} is a fake Applebot IP"
17
- end
18
-
19
- def test_user_agent
20
- bot = Legitbot.bot(
21
- 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
22
- '17.58.98.60'
23
- )
24
- assert_equal :apple_as_google, bot.detected_as
25
- assert bot.valid?, msg: 'A valid Applebot User-agent and IP'
26
- end
27
- end