legitbot 0.3.2 → 1.0.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d775b4e4e434615989b8ef35aca9ac6498b06742ca6e8534f9bb4d6f3d5c9b04
4
- data.tar.gz: 239069018d1b043ec99bce8496b3760fc189611f6aa83545835d5f71ce7f94c5
3
+ metadata.gz: 34d1432c7c405d783b22a46851db88ccdea9f303defeccdd1cf98604bbb6ce09
4
+ data.tar.gz: a66b586f4b2dca67fb875ea37add6e7d89a7ce5d0705c3d1898d96ecf091036e
5
5
  SHA512:
6
- metadata.gz: '08e08f94cbb4979566982d91a525f7d03618c9c10e85613bcebfd51f8f8f8b0dab669ba9cd1b67c7a6d6a36c23d23ad88b64996bf252bd884082045ae590d38c'
7
- data.tar.gz: 97c4dd80bf62205ea4a03a0bc7570d57520611b4015f3bf9f66b450d78878a3fde1c9097e0ac79691e40d5f29e9ba676aaa3af70f71650adb61c7f4a0ee1ddf0
6
+ metadata.gz: cad1db2571e939020f74e871365c4748dd78ff18eb8ad9f005ea5bf5b0707835e296afa2601fc6309c994f69b7903d21da788fc219f5d712ee75e1ae9885fb7b
7
+ data.tar.gz: c72af598d60c55aff35a1b5e244dcde160a67589c588d68b9482d5e5c5f0590441c92505ec94beb462ff70a730ec9aabe80877a2d9db2f72c566c3a9c0b19059
@@ -0,0 +1,60 @@
1
+ name: build
2
+
3
+ on: [push]
4
+
5
+ jobs:
6
+ test:
7
+ runs-on: ubuntu-latest
8
+
9
+ strategy:
10
+ fail-fast: false
11
+ matrix:
12
+ ruby: [ jruby, 2.6 ]
13
+
14
+ steps:
15
+ - uses: actions/checkout@v2
16
+ - name: Set up Ruby
17
+ uses: ruby/setup-ruby@v1
18
+ with:
19
+ ruby-version: ${{ matrix.ruby }}
20
+ - name: Cache dependencies
21
+ uses: actions/cache@v1
22
+ with:
23
+ path: vendor/bundle
24
+ key: ${{ runner.os }}-${{ matrix.ruby }}-gems-${{ hashFiles('**/Gemfile.lock') }}
25
+ restore-keys: |
26
+ ${{ runner.os }}-${{ matrix.ruby }}-gems-
27
+ - name: Install dependencies
28
+ run: |
29
+ bundle config path vendor/bundle
30
+ bundle install --jobs 4 --retry 3
31
+ - name: Run tests
32
+ run: bundle exec rake test
33
+
34
+ lint:
35
+ needs: test
36
+ runs-on: ubuntu-latest
37
+
38
+ strategy:
39
+ matrix:
40
+ ruby: [ 2.6 ]
41
+
42
+ steps:
43
+ - uses: actions/checkout@v2
44
+ - name: Set up Ruby
45
+ uses: ruby/setup-ruby@v1
46
+ with:
47
+ ruby-version: ${{ matrix.ruby }}
48
+ - name: Cache dependencies
49
+ uses: actions/cache@v1
50
+ with:
51
+ path: vendor/bundle
52
+ key: ${{ runner.os }}-${{ matrix.ruby }}-gems-${{ hashFiles('**/Gemfile.lock') }}
53
+ restore-keys: |
54
+ ${{ runner.os }}-${{ matrix.ruby }}-gems-
55
+ - name: Install dependencies
56
+ run: |
57
+ bundle config path vendor/bundle
58
+ bundle install --jobs 4 --retry 3
59
+ - name: Run linter
60
+ run: bundle exec rubocop
data/.gitignore CHANGED
@@ -4,3 +4,4 @@ Gemfile.lock
4
4
  *.gemfile.lock
5
5
  /pkg
6
6
  /tags
7
+ /vendor
@@ -0,0 +1,2 @@
1
+ AllCops:
2
+ CacheRootDirectory: 'vendor'
@@ -0,0 +1 @@
1
+ 2.4
data/Gemfile CHANGED
@@ -1,2 +1,4 @@
1
+ # frozen_string_literal: true
2
+
1
3
  source 'https://rubygems.org'
2
4
  gemspec
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # Legitbot [![Build Status](https://secure.travis-ci.org/alaz/legitbot.png?branch=master)](http://travis-ci.org/alaz/legitbot) [![Gem Version](https://badge.fury.io/rb/legitbot.svg)](https://badge.fury.io/rb/legitbot)
1
+ # Legitbot ![](https://github.com/alaz/legitbot/workflows/build/badge.svg) [![Gem Version](https://badge.fury.io/rb/legitbot.svg)](https://badge.fury.io/rb/legitbot)
2
2
 
3
3
  Ruby gem to check that an IP belongs to a bot, typically a search
4
4
  engine. This can be of help in protecting a web site from fake search
@@ -50,7 +50,9 @@ end
50
50
  * [DuckDuckGo bot](https://duckduckgo.com/duckduckbot)
51
51
  * [Facebook crawler](https://developers.facebook.com/docs/sharing/webmasters/crawler)
52
52
  * [Google crawlers](https://support.google.com/webmasters/answer/1061943)
53
+ * [Oracle Data Cloud Crawler](https://www.oracle.com/corporate/acquisitions/grapeshot/crawler.html)
53
54
  * [Pinterest](https://help.pinterest.com/en/articles/about-pinterest-crawler-0)
55
+ * [Twitterbot](https://developer.twitter.com/en/docs/tweets/optimize-with-cards/guides/getting-started), the list of IPs is in the [Troubleshooting page](https://developer.twitter.com/en/docs/tweets/optimize-with-cards/guides/troubleshooting-cards)
54
56
  * [Yandex robots](https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.xml)
55
57
 
56
58
  ## License
data/Rakefile CHANGED
@@ -1,14 +1,16 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'rubygems'
2
4
  require 'bundler'
3
5
  require 'bump/tasks'
4
- require "rake/testtask"
6
+ require 'rake/testtask'
5
7
  Bundler::GemHelper.install_tasks
6
8
 
7
9
  Bump.tag_by_default = true
8
10
 
9
11
  Rake::TestTask.new do |t|
10
- t.libs << "test"
11
- t.test_files = FileList['test/*_test.rb']
12
+ t.libs << 'test'
13
+ t.test_files = FileList['test/**/*_test.rb']
12
14
  t.warning = true
13
15
  t.verbose = true
14
16
  end
@@ -1,27 +1,28 @@
1
- # encoding: utf-8
2
- $LOAD_PATH.push File.expand_path("../lib", __FILE__)
3
- require "legitbot/version"
1
+ # frozen_string_literal: true
2
+
3
+ $LOAD_PATH.push File.expand_path('lib', __dir__)
4
+ require 'legitbot/version'
4
5
 
5
6
  Gem::Specification.new do |spec|
6
7
  spec.name = 'legitbot'
7
8
  spec.version = Legitbot::VERSION
8
9
  spec.license = 'Apache-2.0'
9
10
 
10
- spec.author = "Alexander Azarov"
11
- spec.email = "self@alaz.me"
12
- spec.homepage = "https://github.com/alaz/legitbot"
13
- spec.summary = %q{Validate requests from Web crawlers: impersonating or not?}
14
- spec.description = "A library to make sure a Web request has been "\
15
- "made by a real search engine, not a malicious agent"
11
+ spec.author = 'Alexander Azarov'
12
+ spec.email = 'self@alaz.me'
13
+ spec.homepage = 'https://github.com/alaz/legitbot'
14
+ spec.summary = 'Validate requests from Web crawlers: impersonating or not?'
15
+ spec.description = 'Does Web request come from a real search engine or from an impersonating agent?'
16
16
 
17
- spec.required_ruby_version = '>= 2.3.0'
18
- spec.add_dependency "irrc", ">= 0.2.1"
19
- spec.add_dependency "augmented_interval_tree", ">= 0.1.1"
20
- spec.add_development_dependency "bump"
21
- spec.add_development_dependency "rake"
22
- spec.add_development_dependency "minitest"
17
+ spec.required_ruby_version = '>= 2.4.0'
18
+ spec.add_dependency 'augmented_interval_tree', '~> 0.1', '>= 0.1.1'
19
+ spec.add_dependency 'irrc', '~> 0.2', '>= 0.2.1'
20
+ spec.add_development_dependency 'bump', '~> 0.8', '>= 0.8.0'
21
+ spec.add_development_dependency 'minitest', '~> 5.1', '>= 5.1.0'
22
+ spec.add_development_dependency 'rake', '~> 12.3', '>= 12.3.0'
23
+ spec.add_development_dependency 'rubocop', '~> 0.74', '>= 0.74.0'
23
24
 
24
- spec.files = `git ls-files`.split($/)
25
- spec.rdoc_options = ["--charset=UTF-8"]
26
- spec.test_files = Dir.glob("test/**/*")
25
+ spec.files = `git ls-files`.split($INPUT_RECORD_SEPARATOR)
26
+ spec.rdoc_options = ['--charset=UTF-8']
27
+ spec.test_files = Dir.glob('test/**/*')
27
28
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require_relative 'legitbot/legitbot'
2
4
  require_relative 'legitbot/botmatch'
3
5
 
@@ -8,5 +10,7 @@ require_relative 'legitbot/bing'
8
10
  require_relative 'legitbot/duckduckgo'
9
11
  require_relative 'legitbot/facebook'
10
12
  require_relative 'legitbot/google'
13
+ require_relative 'legitbot/oracle'
11
14
  require_relative 'legitbot/pinterest'
15
+ require_relative 'legitbot/twitter'
12
16
  require_relative 'legitbot/yandex'
@@ -1,13 +1,18 @@
1
- module Legitbot
1
+ # frozen_string_literal: true
2
+
3
+ module Legitbot # :nodoc:
2
4
  # https://ahrefs.com/robot
3
5
  class Ahrefs < BotMatch
4
- Ranges = %w(54.36.148.0/24 54.36.149.0/24 54.36.150.0/24 195.154.122.0/24 195.154.123.0/24 195.154.126.0/24 195.154.127.0/24)
5
-
6
- def valid?
7
- ip = IPAddr.new @ip
8
- Ranges.any? { |range| IPAddr.new(range).include? ip }
9
- end
6
+ ip_ranges %w[
7
+ 54.36.148.0/24
8
+ 54.36.149.0/24
9
+ 54.36.150.0/24
10
+ 195.154.122.0/24
11
+ 195.154.123.0/24
12
+ 195.154.126.0/24
13
+ 195.154.127.0/24
14
+ ]
10
15
  end
11
16
 
12
- rule Legitbot::Ahrefs, %w(AhrefsBot)
17
+ rule Legitbot::Ahrefs, %w[AhrefsBot]
13
18
  end
@@ -1,20 +1,20 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'ipaddr'
2
4
 
3
- module Legitbot
5
+ module Legitbot # :nodoc:
4
6
  # https://support.apple.com/en-us/HT204683
5
-
6
7
  class Apple < BotMatch
7
- Range = IPAddr.new('17.0.0.0/8')
8
-
9
- def valid?
10
- ip = IPAddr.new @ip
11
- Range.include? ip
12
- end
8
+ ip_ranges '17.0.0.0/8'
13
9
  end
14
10
 
15
- class Apple_as_Google < Apple
11
+ # https://support.apple.com/en-us/HT204683
12
+ # rubocop:disable Naming/ClassAndModuleCamelCase
13
+ class Apple_as_Google < BotMatch
14
+ ip_ranges '17.0.0.0/8'
16
15
  end
16
+ # rubocop:enable Naming/ClassAndModuleCamelCase
17
17
 
18
- rule Legitbot::Apple, %w(Applebot)
19
- rule Legitbot::Apple_as_Google, %w(Googlebot)
18
+ rule Legitbot::Apple, %w[Applebot]
19
+ rule Legitbot::Apple_as_Google, %w[Googlebot]
20
20
  end
@@ -1,12 +1,10 @@
1
- module Legitbot
1
+ # frozen_string_literal: true
2
+
3
+ module Legitbot # :nodoc:
2
4
  # http://help.baidu.com/question?prod_en=master&class=498&id=1000973
3
5
  class Baidu < BotMatch
4
- ValidDomains = ["baidu.com.", "baidu.jp."]
5
-
6
- def valid?
7
- subdomain_of?(*Baidu::ValidDomains)
8
- end
6
+ domains 'baidu.com.', 'baidu.jp.', reverse: false
9
7
  end
10
8
 
11
- rule Legitbot::Baidu, %w(Baiduspider)
9
+ rule Legitbot::Baidu, %w[Baiduspider]
12
10
  end
@@ -1,12 +1,10 @@
1
- module Legitbot
1
+ # frozen_string_literal: true
2
+
3
+ module Legitbot # :nodoc:
2
4
  # https://blogs.bing.com/webmaster/2012/08/31/how-to-verify-that-bingbot-is-bingbot/
3
5
  class Bing < BotMatch
4
- ValidDomains = ["search.msn.com."]
5
-
6
- def valid?
7
- subdomain_of?(*Bing::ValidDomains) && reverse_resolves?
8
- end
6
+ domains 'search.msn.com.'
9
7
  end
10
8
 
11
- rule Legitbot::Bing, %w(Bingbot bingbot)
9
+ rule Legitbot::Bing, %w[Bingbot bingbot]
12
10
  end
@@ -1,5 +1,8 @@
1
- require 'resolv'
2
- require 'ipaddr'
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'config/resolver'
4
+ require_relative 'validators/domains'
5
+ require_relative 'validators/ip_ranges'
3
6
 
4
7
  module Legitbot
5
8
  ##
@@ -7,61 +10,31 @@ module Legitbot
7
10
  # +valid?+, +fake?+ and +detected_as+
8
11
  #
9
12
  class BotMatch
10
- def initialize(ip, resolver_config = nil)
11
- @dns = Resolv::DNS.new(resolver_config)
12
- @ip = ip
13
- end
14
-
15
- ##
16
- # Returns a Resolv::DNS::Name instance with
17
- # the reverse name
18
- def reverse_domain
19
- @reverse_domain ||= @dns.getname(@ip)
20
- rescue Resolv::ResolvError
21
- @reverse_domain ||= nil
22
- end
23
-
24
- ##
25
- # Returns a String with the reverse name
26
- def reverse_name
27
- reverse_domain&.to_s
28
- end
29
-
30
- ##
31
- # Returns a String with IP created from the reverse name
32
- def reversed_ip
33
- return nil if reverse_name.nil?
13
+ include Legitbot::Validators::IpRanges
14
+ include Legitbot::Validators::Domains
34
15
 
35
- @reverse_ip ||= @dns.getaddress(reverse_name)
36
- @reverse_ip.to_s
37
- end
38
-
39
- def reverse_resolves?
40
- @ip == reversed_ip
41
- end
42
-
43
- def subdomain_of?(*domains)
44
- return false if reverse_name.nil?
45
-
46
- domains.any? { |d|
47
- reverse_domain.subdomain_of? Resolv::DNS::Name.create(d)
48
- }
16
+ def initialize(ip)
17
+ @ip = ip
49
18
  end
50
19
 
51
20
  def detected_as
52
21
  self.class.name.split('::').last.downcase.to_sym
53
22
  end
54
23
 
24
+ def valid?
25
+ valid_ip? && valid_domain?
26
+ end
27
+
55
28
  def fake?
56
29
  !valid?
57
30
  end
58
31
 
59
- def self.valid?(ip, resolver_config = nil)
60
- self.new(ip, resolver_config).valid?
32
+ def self.valid?(ip)
33
+ new(ip).valid?
61
34
  end
62
35
 
63
- def self.fake?(ip, resolver_config = nil)
64
- self.new(ip, resolver_config).fake?
36
+ def self.fake?(ip)
37
+ new(ip).fake?
65
38
  end
66
39
  end
67
40
  end
@@ -0,0 +1,18 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'resolv'
4
+
5
+ module Legitbot
6
+ module Config
7
+ module Resolver # :nodoc:
8
+ def resolver_config(options = nil)
9
+ @resolver_config = options
10
+ end
11
+
12
+ def resolver
13
+ @resolver_config ||= Legitbot.resolver_config
14
+ @resolver ||= Resolv::DNS.new @resolver_config
15
+ end
16
+ end
17
+ end
18
+ end
@@ -1,12 +1,23 @@
1
- module Legitbot
1
+ # frozen_string_literal: true
2
+
3
+ module Legitbot # :nodoc:
2
4
  # https://duckduckgo.com/duckduckbot
3
5
  class DuckDuckGo < BotMatch
4
- ValidIPs = %w(50.16.241.113 50.16.241.114 50.16.241.117 50.16.247.234 52.204.97.54 52.5.190.19 54.197.234.188 54.208.100.253 23.21.227.69)
5
-
6
- def valid?
7
- DuckDuckGo::ValidIPs.include? @ip
8
- end
6
+ ip_ranges %w[
7
+ 23.21.227.69
8
+ 40.88.21.235
9
+ 50.16.241.113
10
+ 50.16.241.114
11
+ 50.16.241.117
12
+ 50.16.247.234
13
+ 52.204.97.54
14
+ 52.5.190.19
15
+ 54.197.234.188
16
+ 54.208.100.253
17
+ 54.208.102.37
18
+ 107.21.1.8
19
+ ]
9
20
  end
10
21
 
11
- rule Legitbot::DuckDuckGo, %w(DuckDuckBot)
22
+ rule Legitbot::DuckDuckGo, %w[DuckDuckBot]
12
23
  end
@@ -1,48 +1,22 @@
1
- require 'ipaddr'
1
+ # frozen_string_literal: true
2
+
2
3
  require 'irrc'
3
- require 'interval_tree'
4
4
 
5
- module Legitbot
5
+ module Legitbot # :nodoc:
6
6
  # https://developers.facebook.com/docs/sharing/webmasters/crawler
7
-
8
7
  class Facebook < BotMatch
9
8
  AS = 'AS32934'
10
9
 
11
- def valid?
12
- ip = IPAddr.new(@ip)
13
- Facebook.valid_ips[ip.ipv4? ? :ipv4 : :ipv6].search(ip.to_i).size > 0
14
- end
15
-
16
- @mutex = Mutex.new
17
-
18
- def self.valid_ips
19
- @mutex.synchronize { @ips ||= load_ips }
20
- end
21
-
22
- def self.reload!
23
- @mutex.synchronize { @ips = load_ips }
24
- end
25
-
26
- def self.load_ips
27
- whois.map do |(family, records)|
28
- ranges = records.map do |cidr|
29
- range = IPAddr.new(cidr).to_range
30
- (range.begin.to_i..range.end.to_i)
31
- end
32
- [family, IntervalTree::Tree.new(ranges)]
33
- end.to_h
34
- end
35
-
36
- def self.whois
10
+ ip_ranges do
37
11
  client = Irrc::Client.new
38
12
  client.query :radb, AS
39
13
  results = client.perform
40
14
 
41
- %i(ipv4 ipv6).map do |family|
42
- [family, results[AS][family][AS]]
43
- end.to_h
15
+ %i[ipv4 ipv6].map do |family|
16
+ results[AS][family][AS]
17
+ end.flatten
44
18
  end
45
19
  end
46
20
 
47
- rule Legitbot::Facebook, %w(Facebot facebookexternalhit/1.1)
21
+ rule Legitbot::Facebook, %w[Facebot facebookexternalhit/1.1]
48
22
  end