legitbot 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d775b4e4e434615989b8ef35aca9ac6498b06742ca6e8534f9bb4d6f3d5c9b04
4
- data.tar.gz: 239069018d1b043ec99bce8496b3760fc189611f6aa83545835d5f71ce7f94c5
3
+ metadata.gz: dfc1b3322ff4f85957dabf6790d535f27f99feed47bdb4ff1bba65f6242d31a2
4
+ data.tar.gz: 32e842cc3d297b3afda0ef9b265121a11c363857a726819f886b441e6c53a53c
5
5
  SHA512:
6
- metadata.gz: '08e08f94cbb4979566982d91a525f7d03618c9c10e85613bcebfd51f8f8f8b0dab669ba9cd1b67c7a6d6a36c23d23ad88b64996bf252bd884082045ae590d38c'
7
- data.tar.gz: 97c4dd80bf62205ea4a03a0bc7570d57520611b4015f3bf9f66b450d78878a3fde1c9097e0ac79691e40d5f29e9ba676aaa3af70f71650adb61c7f4a0ee1ddf0
6
+ metadata.gz: 3654c256da13b37045425457a96ac9a8b41c5ae5c0cce49b7898170e2d23a66a5cb7e612503dc1d88dc2e1240dcb07c9ccc5d5aa8439f280144abe969dc0ae7b
7
+ data.tar.gz: 554e120d1001a71f455aedcd4d30397b22130279a6f99f8e022d40ade50427590dd52e982820399852ec52fbb2c96b7ea1461f82d8e88cb89acc053097136b32
data/.rubocop.yml ADDED
@@ -0,0 +1,8 @@
1
+ AllCops:
2
+ Include:
3
+ - '**/Gemfile'
4
+ - '**/Rakefile'
5
+ - 'lib/**/*.rb'
6
+ - 'test/**/*.rb'
7
+ Exclude:
8
+ - 'pkg/**'
data/Gemfile CHANGED
@@ -1,2 +1,4 @@
1
+ # frozen_string_literal: true
2
+
1
3
  source 'https://rubygems.org'
2
4
  gemspec
data/Rakefile CHANGED
@@ -1,14 +1,16 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'rubygems'
2
4
  require 'bundler'
3
5
  require 'bump/tasks'
4
- require "rake/testtask"
6
+ require 'rake/testtask'
5
7
  Bundler::GemHelper.install_tasks
6
8
 
7
9
  Bump.tag_by_default = true
8
10
 
9
11
  Rake::TestTask.new do |t|
10
- t.libs << "test"
11
- t.test_files = FileList['test/*_test.rb']
12
+ t.libs << 'test'
13
+ t.test_files = FileList['test/**/*_test.rb']
12
14
  t.warning = true
13
15
  t.verbose = true
14
16
  end
data/legitbot.gemspec CHANGED
@@ -17,9 +17,10 @@ Gem::Specification.new do |spec|
17
17
  spec.required_ruby_version = '>= 2.3.0'
18
18
  spec.add_dependency "irrc", ">= 0.2.1"
19
19
  spec.add_dependency "augmented_interval_tree", ">= 0.1.1"
20
- spec.add_development_dependency "bump"
21
- spec.add_development_dependency "rake"
22
- spec.add_development_dependency "minitest"
20
+ spec.add_development_dependency "bump", '>= 0.8.0'
21
+ spec.add_development_dependency "rake", '>= 12.3.0'
22
+ spec.add_development_dependency "rubocop", '>= 0.74.0'
23
+ spec.add_development_dependency "minitest", '>= 5.1.0'
23
24
 
24
25
  spec.files = `git ls-files`.split($/)
25
26
  spec.rdoc_options = ["--charset=UTF-8"]
@@ -1,13 +1,18 @@
1
- module Legitbot
1
+ # frozen_string_literal: true
2
+
3
+ module Legitbot # :nodoc:
2
4
  # https://ahrefs.com/robot
3
5
  class Ahrefs < BotMatch
4
- Ranges = %w(54.36.148.0/24 54.36.149.0/24 54.36.150.0/24 195.154.122.0/24 195.154.123.0/24 195.154.126.0/24 195.154.127.0/24)
5
-
6
- def valid?
7
- ip = IPAddr.new @ip
8
- Ranges.any? { |range| IPAddr.new(range).include? ip }
9
- end
6
+ ip_ranges %w[
7
+ 54.36.148.0/24
8
+ 54.36.149.0/24
9
+ 54.36.150.0/24
10
+ 195.154.122.0/24
11
+ 195.154.123.0/24
12
+ 195.154.126.0/24
13
+ 195.154.127.0/24
14
+ ]
10
15
  end
11
16
 
12
- rule Legitbot::Ahrefs, %w(AhrefsBot)
17
+ rule Legitbot::Ahrefs, %w[AhrefsBot]
13
18
  end
@@ -1,20 +1,20 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'ipaddr'
2
4
 
3
- module Legitbot
5
+ module Legitbot # :nodoc:
4
6
  # https://support.apple.com/en-us/HT204683
5
-
6
7
  class Apple < BotMatch
7
- Range = IPAddr.new('17.0.0.0/8')
8
-
9
- def valid?
10
- ip = IPAddr.new @ip
11
- Range.include? ip
12
- end
8
+ ip_ranges '17.0.0.0/8'
13
9
  end
14
10
 
15
- class Apple_as_Google < Apple
11
+ # https://support.apple.com/en-us/HT204683
12
+ # rubocop:disable Naming/ClassAndModuleCamelCase
13
+ class Apple_as_Google < BotMatch
14
+ ip_ranges '17.0.0.0/8'
16
15
  end
16
+ # rubocop:enable Naming/ClassAndModuleCamelCase
17
17
 
18
- rule Legitbot::Apple, %w(Applebot)
19
- rule Legitbot::Apple_as_Google, %w(Googlebot)
18
+ rule Legitbot::Apple, %w[Applebot]
19
+ rule Legitbot::Apple_as_Google, %w[Googlebot]
20
20
  end
@@ -1,12 +1,10 @@
1
- module Legitbot
1
+ # frozen_string_literal: true
2
+
3
+ module Legitbot # :nodoc:
2
4
  # http://help.baidu.com/question?prod_en=master&class=498&id=1000973
3
5
  class Baidu < BotMatch
4
- ValidDomains = ["baidu.com.", "baidu.jp."]
5
-
6
- def valid?
7
- subdomain_of?(*Baidu::ValidDomains)
8
- end
6
+ domains 'baidu.com.', 'baidu.jp.', reverse: false
9
7
  end
10
8
 
11
- rule Legitbot::Baidu, %w(Baiduspider)
9
+ rule Legitbot::Baidu, %w[Baiduspider]
12
10
  end
data/lib/legitbot/bing.rb CHANGED
@@ -1,12 +1,10 @@
1
- module Legitbot
1
+ # frozen_string_literal: true
2
+
3
+ module Legitbot # :nodoc:
2
4
  # https://blogs.bing.com/webmaster/2012/08/31/how-to-verify-that-bingbot-is-bingbot/
3
5
  class Bing < BotMatch
4
- ValidDomains = ["search.msn.com."]
5
-
6
- def valid?
7
- subdomain_of?(*Bing::ValidDomains) && reverse_resolves?
8
- end
6
+ domains 'search.msn.com.'
9
7
  end
10
8
 
11
- rule Legitbot::Bing, %w(Bingbot bingbot)
9
+ rule Legitbot::Bing, %w[Bingbot bingbot]
12
10
  end
@@ -1,5 +1,8 @@
1
- require 'resolv'
2
- require 'ipaddr'
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'config/resolver'
4
+ require_relative 'validators/domains'
5
+ require_relative 'validators/ip_ranges'
3
6
 
4
7
  module Legitbot
5
8
  ##
@@ -7,61 +10,31 @@ module Legitbot
7
10
  # +valid?+, +fake?+ and +detected_as+
8
11
  #
9
12
  class BotMatch
10
- def initialize(ip, resolver_config = nil)
11
- @dns = Resolv::DNS.new(resolver_config)
12
- @ip = ip
13
- end
14
-
15
- ##
16
- # Returns a Resolv::DNS::Name instance with
17
- # the reverse name
18
- def reverse_domain
19
- @reverse_domain ||= @dns.getname(@ip)
20
- rescue Resolv::ResolvError
21
- @reverse_domain ||= nil
22
- end
23
-
24
- ##
25
- # Returns a String with the reverse name
26
- def reverse_name
27
- reverse_domain&.to_s
28
- end
29
-
30
- ##
31
- # Returns a String with IP created from the reverse name
32
- def reversed_ip
33
- return nil if reverse_name.nil?
13
+ include Legitbot::Validators::IpRanges
14
+ include Legitbot::Validators::Domains
34
15
 
35
- @reverse_ip ||= @dns.getaddress(reverse_name)
36
- @reverse_ip.to_s
37
- end
38
-
39
- def reverse_resolves?
40
- @ip == reversed_ip
41
- end
42
-
43
- def subdomain_of?(*domains)
44
- return false if reverse_name.nil?
45
-
46
- domains.any? { |d|
47
- reverse_domain.subdomain_of? Resolv::DNS::Name.create(d)
48
- }
16
+ def initialize(ip)
17
+ @ip = ip
49
18
  end
50
19
 
51
20
  def detected_as
52
21
  self.class.name.split('::').last.downcase.to_sym
53
22
  end
54
23
 
24
+ def valid?
25
+ valid_ip? && valid_domain?
26
+ end
27
+
55
28
  def fake?
56
29
  !valid?
57
30
  end
58
31
 
59
- def self.valid?(ip, resolver_config = nil)
60
- self.new(ip, resolver_config).valid?
32
+ def self.valid?(ip)
33
+ new(ip).valid?
61
34
  end
62
35
 
63
- def self.fake?(ip, resolver_config = nil)
64
- self.new(ip, resolver_config).fake?
36
+ def self.fake?(ip)
37
+ new(ip).fake?
65
38
  end
66
39
  end
67
40
  end
@@ -0,0 +1,18 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'resolv'
4
+
5
+ module Legitbot
6
+ module Config
7
+ module Resolver # :nodoc:
8
+ def resolver_config(options = nil)
9
+ @resolver_config = options
10
+ end
11
+
12
+ def resolver
13
+ @resolver_config ||= Legitbot.resolver_config
14
+ @resolver ||= Resolv::DNS.new @resolver_config
15
+ end
16
+ end
17
+ end
18
+ end
@@ -1,12 +1,20 @@
1
- module Legitbot
1
+ # frozen_string_literal: true
2
+
3
+ module Legitbot # :nodoc:
2
4
  # https://duckduckgo.com/duckduckbot
3
5
  class DuckDuckGo < BotMatch
4
- ValidIPs = %w(50.16.241.113 50.16.241.114 50.16.241.117 50.16.247.234 52.204.97.54 52.5.190.19 54.197.234.188 54.208.100.253 23.21.227.69)
5
-
6
- def valid?
7
- DuckDuckGo::ValidIPs.include? @ip
8
- end
6
+ ip_ranges %w[
7
+ 50.16.241.113
8
+ 50.16.241.114
9
+ 50.16.241.117
10
+ 50.16.247.234
11
+ 52.204.97.54
12
+ 52.5.190.19
13
+ 54.197.234.188
14
+ 54.208.100.253
15
+ 23.21.227.69
16
+ ]
9
17
  end
10
18
 
11
- rule Legitbot::DuckDuckGo, %w(DuckDuckBot)
19
+ rule Legitbot::DuckDuckGo, %w[DuckDuckBot]
12
20
  end
@@ -1,48 +1,22 @@
1
- require 'ipaddr'
1
+ # frozen_string_literal: true
2
+
2
3
  require 'irrc'
3
- require 'interval_tree'
4
4
 
5
- module Legitbot
5
+ module Legitbot # :nodoc:
6
6
  # https://developers.facebook.com/docs/sharing/webmasters/crawler
7
-
8
7
  class Facebook < BotMatch
9
8
  AS = 'AS32934'
10
9
 
11
- def valid?
12
- ip = IPAddr.new(@ip)
13
- Facebook.valid_ips[ip.ipv4? ? :ipv4 : :ipv6].search(ip.to_i).size > 0
14
- end
15
-
16
- @mutex = Mutex.new
17
-
18
- def self.valid_ips
19
- @mutex.synchronize { @ips ||= load_ips }
20
- end
21
-
22
- def self.reload!
23
- @mutex.synchronize { @ips = load_ips }
24
- end
25
-
26
- def self.load_ips
27
- whois.map do |(family, records)|
28
- ranges = records.map do |cidr|
29
- range = IPAddr.new(cidr).to_range
30
- (range.begin.to_i..range.end.to_i)
31
- end
32
- [family, IntervalTree::Tree.new(ranges)]
33
- end.to_h
34
- end
35
-
36
- def self.whois
10
+ ip_ranges do
37
11
  client = Irrc::Client.new
38
12
  client.query :radb, AS
39
13
  results = client.perform
40
14
 
41
- %i(ipv4 ipv6).map do |family|
42
- [family, results[AS][family][AS]]
43
- end.to_h
15
+ %i[ipv4 ipv6].map do |family|
16
+ results[AS][family][AS]
17
+ end.flatten
44
18
  end
45
19
  end
46
20
 
47
- rule Legitbot::Facebook, %w(Facebot facebookexternalhit/1.1)
21
+ rule Legitbot::Facebook, %w[Facebot facebookexternalhit/1.1]
48
22
  end
@@ -1,14 +1,11 @@
1
- module Legitbot
1
+ # frozen_string_literal: true
2
+
3
+ module Legitbot # :nodoc:
2
4
  # https://support.google.com/webmasters/answer/1061943
3
5
  # https://support.google.com/webmasters/answer/80553
4
-
5
6
  class Google < BotMatch
6
- ValidDomains = ["google.com.", "googlebot.com."]
7
-
8
- def valid?
9
- subdomain_of?(*Google::ValidDomains) && reverse_resolves?
10
- end
7
+ domains 'google.com.', 'googlebot.com.'
11
8
  end
12
9
 
13
- rule Legitbot::Google, %w(Googlebot Mediapartners-Google AdsBot-Google)
10
+ rule Legitbot::Google, %w[Googlebot Mediapartners-Google AdsBot-Google]
14
11
  end
@@ -1,6 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ ##
4
+ # Bot lookup based on user agent
1
5
  module Legitbot
2
6
  @rules = []
3
7
 
8
+ class << self
9
+ attr_accessor :resolver_config
10
+ end
11
+
4
12
  ##
5
13
  # Lookup a bot based on its signature from +User-Agent+ header.
6
14
  #
@@ -10,15 +18,12 @@ module Legitbot
10
18
  # otherwise.
11
19
  # :yields: a found bot
12
20
  #
13
- def self.bot(userAgent, ip, resolver_config = nil)
14
- bots =
15
- @rules.select { |rule|
16
- rule[:fragments].any? {|f| userAgent.index f}
17
- }.map { |rule|
18
- rule[:class].new(ip, resolver_config)
19
- }
21
+ def self.bot(user_agent, ip)
22
+ bots = @rules
23
+ .select { |rule| rule[:fragments].any? { |f| user_agent.index f } }
24
+ .map { |rule| rule[:class].new(ip) }
20
25
 
21
- selected = bots.select { |b| b.valid? }.first if bots.size > 1
26
+ selected = bots.select(&:valid?).first if bots.size > 1
22
27
  selected = bots.last if selected.nil?
23
28
 
24
29
  if selected && block_given?
@@ -29,6 +34,6 @@ module Legitbot
29
34
  end
30
35
 
31
36
  def self.rule(clazz, fragments)
32
- @rules << {:class => clazz, :fragments => fragments}
37
+ @rules << { class: clazz, fragments: fragments }
33
38
  end
34
39
  end
@@ -1,13 +1,10 @@
1
- module Legitbot
2
- # https://help.pinterest.com/en/articles/about-pinterest-crawler-0
1
+ # frozen_string_literal: true
3
2
 
3
+ module Legitbot # :nodoc:
4
+ # https://help.pinterest.com/en/articles/about-pinterest-crawler-0
4
5
  class Pinterest < BotMatch
5
- ValidDomains = ["pinterest.com."]
6
-
7
- def valid?
8
- subdomain_of?(*Pinterest::ValidDomains) && reverse_resolves?
9
- end
6
+ domains 'pinterest.com.'
10
7
  end
11
8
 
12
- rule Legitbot::Pinterest, %w(Pinterestbot Pinterest/0.2)
9
+ rule Legitbot::Pinterest, %w[Pinterestbot Pinterest/0.2]
13
10
  end
@@ -0,0 +1,71 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'resolv'
4
+ require 'ipaddr'
5
+
6
+ module Legitbot
7
+ module Validators
8
+ #
9
+ # In a bot matcher:
10
+ # `domains 'search.msn.com', ...`
11
+ # `domains 'googlebot.com', reverse: false`
12
+ #
13
+ # `reverse` is true by default.
14
+ module Domains
15
+ class << self
16
+ def included(base)
17
+ base.extend ClassMethods
18
+ end
19
+ end
20
+
21
+ def valid_domain?
22
+ self.class.valid_domain?(@ip)
23
+ end
24
+
25
+ module ClassMethods # :nodoc:
26
+ include Legitbot::Config::Resolver
27
+
28
+ def domains(*list, reverse: true)
29
+ @valid_domains = list.flatten.map { |d| Resolv::DNS::Name.create(d) }
30
+ @validate_reverse_record = reverse
31
+ end
32
+
33
+ def check_domains?
34
+ instance_variable_defined?(:@valid_domains)
35
+ end
36
+
37
+ def valid_domain?(ip)
38
+ return true unless check_domains?
39
+ return true if @valid_domains.empty?
40
+
41
+ domains = reverse_domains(ip)
42
+ return false if domains.empty?
43
+
44
+ record = find_subdomain_record(domains)
45
+ return false unless record
46
+ return true unless @validate_reverse_record
47
+
48
+ ip == reverse_ip(record)
49
+ end
50
+
51
+ def reverse_domains(ip)
52
+ resolver.getnames(ip)
53
+ rescue Resolv::ResolvError
54
+ nil
55
+ end
56
+
57
+ def find_subdomain_record(domains)
58
+ domains.find do |d|
59
+ @valid_domains.any? { |vd| d.subdomain_of?(vd) }
60
+ end
61
+ end
62
+
63
+ def reverse_ip(record)
64
+ return nil if record.nil?
65
+
66
+ resolver.getaddress(record.to_s).to_s
67
+ end
68
+ end
69
+ end
70
+ end
71
+ end
@@ -0,0 +1,81 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'ipaddr'
4
+ require 'interval_tree'
5
+
6
+ module Legitbot
7
+ module Validators
8
+ #
9
+ # In a bot matcher:
10
+ # `ip_ranges ip, range, ip, ...`
11
+ # `ip_ranges do [ip, range, ...]; end`
12
+ module IpRanges
13
+ class << self
14
+ def included(base)
15
+ base.extend ClassMethods
16
+ end
17
+ end
18
+
19
+ def valid_ip?
20
+ self.class.valid_ip?(@ip)
21
+ end
22
+
23
+ module ClassMethods # :nodoc:
24
+ FAMILIES = %i[ipv4 ipv6].freeze
25
+ EMPTY_GENERATOR = proc { [] }
26
+
27
+ def ip_ranges(*ips, &block)
28
+ @ip_ranges = partition_ips(ips.flatten) unless ips.empty?
29
+ @ip_ranges_loader = block_given? ? block : EMPTY_GENERATOR
30
+ @ip_loader_mutex = Mutex.new
31
+ end
32
+
33
+ def check_ranges?
34
+ instance_variable_defined?(:@ip_ranges_loader)
35
+ end
36
+
37
+ def valid_ip?(ip)
38
+ return true unless check_ranges?
39
+ return true if valid_ips.empty?
40
+
41
+ obj = IPAddr.new(ip)
42
+ ranges = valid_ips[obj.ipv4? ? :ipv4 : :ipv6].search(obj.to_i)
43
+ !ranges.empty?
44
+ end
45
+
46
+ def valid_ips
47
+ @ip_loader_mutex.synchronize do
48
+ @ip_ranges ||= load_ips
49
+ end
50
+ end
51
+
52
+ def reload_ips
53
+ @ip_loader_mutex.synchronize do
54
+ @ip_ranges = load_ips
55
+ end
56
+ end
57
+
58
+ def load_ips
59
+ partition_ips(@ip_ranges_loader.call)
60
+ end
61
+
62
+ # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
63
+ def partition_ips(ips)
64
+ return [] if ips.empty?
65
+
66
+ ips
67
+ .map { |cidr| IPAddr.new(cidr) }
68
+ .partition(&:ipv4?)
69
+ .each_with_index
70
+ .map do |list, index|
71
+ ranges = list.map(&:to_range).map do |r|
72
+ (r.begin.to_i..r.end.to_i)
73
+ end
74
+ [FAMILIES[index], IntervalTree::Tree.new(ranges)]
75
+ end.to_h
76
+ end
77
+ # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
78
+ end
79
+ end
80
+ end
81
+ end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Legitbot
2
- VERSION = '0.3.2'
4
+ VERSION = '0.4.0'
3
5
  end
@@ -1,17 +1,33 @@
1
- module Legitbot
2
- # https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
1
+ # frozen_string_literal: true
3
2
 
3
+ module Legitbot # :nodoc:
4
+ # https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
4
5
  class Yandex < BotMatch
5
- ValidDomains = ["yandex.ru.", "yandex.net.", "yandex.com."]
6
-
7
- def valid?
8
- subdomain_of?(*Yandex::ValidDomains) && reverse_resolves?
9
- end
6
+ domains 'yandex.ru.', 'yandex.net.', 'yandex.com.'
10
7
  end
11
8
 
12
- rule Legitbot::Yandex, %w(YandexBot YandexAccessibilityBot YandexMobileBot
13
- YandexDirectDyn YandexScreenshotBot YandexImages YandexVideo YandexVideoParser
14
- YandexMedia YandexBlogs YandexFavicons YandexWebmaster YandexPagechecker
15
- YandexImageResizer YaDirectFetcher YandexCalendar YandexSitelinks YandexMetrika
16
- YandexNews YandexVertis YandexSearchShop YandexVerticals)
9
+ rule Legitbot::Yandex, %w[
10
+ YandexBot
11
+ YandexAccessibilityBot
12
+ YandexMobileBot
13
+ YandexDirectDyn
14
+ YandexScreenshotBot
15
+ YandexImages
16
+ YandexVideo
17
+ YandexVideoParser
18
+ YandexMedia
19
+ YandexBlogs
20
+ YandexFavicons
21
+ YandexWebmaster
22
+ YandexPagechecker
23
+ YandexImageResizer
24
+ YaDirectFetcher
25
+ YandexCalendar
26
+ YandexSitelinks
27
+ YandexMetrika
28
+ YandexNews
29
+ YandexVertis
30
+ YandexSearchShop
31
+ YandexVerticals
32
+ ]
17
33
  end
data/lib/legitbot.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require_relative 'legitbot/legitbot'
2
4
  require_relative 'legitbot/botmatch'
3
5