bot_verification 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +20 -0
- data/Gemfile +7 -0
- data/Gemfile.lock +256 -0
- data/LICENSE +21 -0
- data/README.md +355 -0
- data/Rakefile +8 -0
- data/bot_verification.gemspec +37 -0
- data/lib/bot_verification/bot_patterns.rb +121 -0
- data/lib/bot_verification/configuration.rb +139 -0
- data/lib/bot_verification/controller_concern.rb +150 -0
- data/lib/bot_verification/ip_range_fetcher.rb +155 -0
- data/lib/bot_verification/ip_range_model.rb +132 -0
- data/lib/bot_verification/railtie.rb +22 -0
- data/lib/bot_verification/refresh_job.rb +36 -0
- data/lib/bot_verification/service.rb +232 -0
- data/lib/bot_verification/version.rb +5 -0
- data/lib/bot_verification.rb +74 -0
- data/lib/generators/bot_verification/install_generator.rb +92 -0
- data/lib/generators/bot_verification/templates/initializer.rb.erb +58 -0
- data/lib/generators/bot_verification/templates/migration.rb.erb +18 -0
- data/lib/generators/bot_verification/templates/model.rb.erb +13 -0
- data/lib/generators/bot_verification/templates/refresh_job.rb.erb +21 -0
- data/lib/tasks/bot_verification.rake +95 -0
- metadata +127 -0
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module BotVerification
|
|
4
|
+
# Module to be included in your ApplicationRecord model
|
|
5
|
+
# Provides IP range storage and lookup functionality
|
|
6
|
+
#
|
|
7
|
+
# @example Create your own model
|
|
8
|
+
# class BotIpRange < ApplicationRecord
|
|
9
|
+
# include BotVerification::IpRangeModel
|
|
10
|
+
# end
|
|
11
|
+
#
|
|
12
|
+
module IpRangeModel
|
|
13
|
+
extend ActiveSupport::Concern
|
|
14
|
+
|
|
15
|
+
included do
  # A row must name a bot type from BotPatterns::VALID_BOT_TYPES (looked up
  # lazily through the lambda), carry a CIDR string, and declare IP family 4 or 6.
  validates :bot_type,
            presence: true,
            inclusion: { in: ->(_) { BotPatterns::VALID_BOT_TYPES } }
  validates :cidr,
            presence: true
  validates :ip_version,
            presence: true,
            inclusion: { in: [ 4, 6 ] }

  # Rows for a single bot type.
  scope(:for_bot, ->(bot_type) { where(bot_type: bot_type) })
  # Rows by IP family.
  scope(:ipv4, -> { where(ip_version: 4) })
  scope(:ipv6, -> { where(ip_version: 6) })
end
|
|
24
|
+
|
|
25
|
+
class_methods do
|
|
26
|
+
# Check if an IP address belongs to a known bot's IP range.
#
# The address is parsed before the ranges are loaded, so input that cannot
# be parsed never triggers a cache or database lookup.
#
# @param ip [String] IP address to check
# @param bot_type [String, Symbol] Bot type (google, bing, etc.)
# @return [Boolean] true only when the address parses and lies inside one of
#   the ranges returned by cached_ranges_for; false for blank input, bot types
#   outside BotPatterns::VALID_BOT_TYPES, and unparseable addresses
def ip_belongs_to_bot?(ip, bot_type)
  return false if ip.blank? || bot_type.blank?

  bot_type = bot_type.to_s
  return false unless BotPatterns::VALID_BOT_TYPES.include?(bot_type)

  ip_addr =
    begin
      IPAddr.new(ip)
    rescue IPAddr::Error
      # IPAddr::Error is the parent of InvalidAddressError and of
      # AddressFamilyError (raised for non-String input), so both mean "no match".
      nil
    end
  return false unless ip_addr

  cached_ranges_for(bot_type).any? { |range| range.include?(ip_addr) }
end
|
|
47
|
+
|
|
48
|
+
# Return the stored CIDR ranges for a bot type as IPAddr objects.
#
# The parsed list is kept in the configured cache for one hour under a
# per-bot key. Stored CIDR strings that IPAddr rejects are left out.
def cached_ranges_for(bot_type)
  key = "#{BotVerification.configuration.cache_key_prefix}:ranges:#{bot_type}:v1"

  BotVerification.configuration.cache.fetch(key, expires_in: 1.hour) do
    for_bot(bot_type).pluck(:cidr).each_with_object([]) do |cidr, parsed|
      parsed << IPAddr.new(cidr)
    rescue IPAddr::InvalidAddressError
      # skip this entry; the accumulator is returned unchanged
      next
    end
  end
end
|
|
60
|
+
|
|
61
|
+
# Delete the cached range list for one bot type.
# The key is built the same way as in cached_ranges_for.
def clear_cache_for(bot_type)
  BotVerification.configuration.cache.delete(
    "#{BotVerification.configuration.cache_key_prefix}:ranges:#{bot_type}:v1"
  )
end
|
|
66
|
+
|
|
67
|
+
# Delete the cached range list of every type in BotPatterns::VALID_BOT_TYPES.
def clear_all_caches
  BotPatterns::VALID_BOT_TYPES.each do |known_type|
    clear_cache_for(known_type)
  end
end
|
|
71
|
+
|
|
72
|
+
# Bulk import ranges for a bot type (replaces existing).
#
# Existing rows for the bot type are deleted and the new rows inserted inside
# one transaction. The cached range list is invalidated only after that
# transaction block has finished, so a concurrent reader cannot re-cache the
# old rows while the replacement is still uncommitted.
#
# @param bot_type [String, Symbol] Bot type
# @param ranges [Array<Hash>] Array of { cidr:, ip_version:, source_url: };
#   ip_version is inferred from the CIDR (":" means 6, otherwise 4) when absent,
#   and a per-range source_url wins over the source_url keyword
# @param source_url [String, nil] Fallback source URL for ranges without one
# @return [Integer] Number of records built from +ranges+
def import_ranges(bot_type, ranges, source_url: nil)
  bot_type = bot_type.to_s
  now = Time.current

  # Build the rows up front so the transaction only wraps the two writes.
  records = ranges.map do |range|
    {
      bot_type: bot_type,
      cidr: range[:cidr],
      ip_version: range[:ip_version] || (range[:cidr].include?(":") ? 6 : 4),
      source_url: range[:source_url] || source_url,
      last_verified_at: now,
      created_at: now,
      updated_at: now
    }
  end

  transaction do
    for_bot(bot_type).delete_all
    insert_all(records) if records.any?
  end

  # NOTE(review): if the caller wraps this in an outer transaction the data is
  # still uncommitted at this point - confirm no caller does that.
  clear_cache_for(bot_type)
  records.size
end
|
|
99
|
+
|
|
100
|
+
# Tell whether at least one range row is stored for the given bot type.
#
# @param bot_type [String, Symbol] Bot type
# @return [Boolean]
def has_ranges_for?(bot_type)
  rows_for_type = for_bot(bot_type)
  rows_for_type.exists?
end
|
|
104
|
+
|
|
105
|
+
# Get stats about stored ranges.
#
# Uses two grouped queries in total (row count and newest last_verified_at
# per bot_type) instead of two queries per bot type.
#
# @return [Hash] one entry per BotPatterns::VALID_BOT_TYPES element, each
#   { count: Integer, last_updated: Time or nil }; types without rows get
#   count 0 and last_updated nil
def stats
  counts = group(:bot_type).count
  latest = group(:bot_type).maximum(:last_verified_at)

  BotPatterns::VALID_BOT_TYPES.to_h do |bot_type|
    stored_as = bot_type.to_s # grouped keys come back as the stored column value
    [bot_type, { count: counts.fetch(stored_as, 0), last_updated: latest[stored_as] }]
  end
end
|
|
114
|
+
end
|
|
115
|
+
end
|
|
116
|
+
|
|
117
|
+
# Default model class that uses the configurable table name.
# This is used if you don't create your own model.
#
# The table name is resolved on every call from
# BotVerification.configuration.table_name unless a name was assigned
# explicitly with `IpRange.table_name = "..."`.
class IpRange < ActiveRecord::Base
  include IpRangeModel

  # A lambda must not be handed to `self.table_name=`: ActiveRecord's setter
  # stores `value.to_s`, which would turn the lambda into a "#<Proc:...>"
  # string and make it the table name. Overriding the reader is enough.
  #
  # @return [String] explicitly assigned table name, or the configured one
  def self.table_name
    @table_name || BotVerification.configuration.table_name
  end
end
|
|
132
|
+
end
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module BotVerification
|
|
4
|
+
# Rails integration: registers the gem's rake tasks, generator and a logger default.
class Railtie < Rails::Railtie
  railtie_name :bot_verification

  # Load rake tasks
  rake_tasks { load File.expand_path("../tasks/bot_verification.rake", __dir__) }

  # Use the Rails logger unless a logger is already assigned on the configuration
  initializer("bot_verification.configure_logger") do
    settings = BotVerification.configuration
    settings.logger = Rails.logger unless settings.logger
  end

  # Add generators path
  generators { require_relative "../generators/bot_verification/install_generator" }
end
|
|
22
|
+
end
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module BotVerification
|
|
4
|
+
# Background job for refreshing bot IP ranges
|
|
5
|
+
#
|
|
6
|
+
# Can be used directly or subclassed for customization:
|
|
7
|
+
#
|
|
8
|
+
# @example Usage with Sidekiq as the ActiveJob queue adapter (enqueue with perform_later; ActiveJob does not define perform_async)
# BotVerification::RefreshJob.perform_later
# BotVerification::RefreshJob.perform_later("google")
|
|
11
|
+
#
|
|
12
|
+
# @example Direct usage with ActiveJob
|
|
13
|
+
# BotVerification::RefreshJob.perform_later
|
|
14
|
+
# BotVerification::RefreshJob.perform_later("openai_gptbot")
|
|
15
|
+
#
|
|
16
|
+
# @example Subclass for custom queue or error handling
|
|
17
|
+
# class RefreshBotIpRangesJob < BotVerification::RefreshJob
|
|
18
|
+
# queue_as :low
|
|
19
|
+
#
|
|
20
|
+
# def perform(bot_type = nil)
|
|
21
|
+
# super
|
|
22
|
+
# rescue => e
|
|
23
|
+
# Airbrake.notify(e)
|
|
24
|
+
# raise
|
|
25
|
+
# end
|
|
26
|
+
# end
|
|
27
|
+
#
|
|
28
|
+
class RefreshJob < ActiveJob::Base
  queue_as(:default)

  # Hands the argument straight to BotVerification.refresh_ip_ranges!.
  #
  # @param bot_type [String, Symbol, nil] Specific bot type to refresh, or nil for all
  def perform(bot_type = nil)
    ::BotVerification.refresh_ip_ranges!(bot_type)
  end
end
|
|
36
|
+
end
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module BotVerification
|
|
4
|
+
# Core verification service
|
|
5
|
+
#
|
|
6
|
+
# Performance characteristics:
|
|
7
|
+
# 1. Rails cache lookup (~1ms) - checks if IP was recently verified
|
|
8
|
+
# 2. IP range check (~5-10ms) - database lookup with in-memory caching
|
|
9
|
+
# 3. Reverse DNS (~100-2000ms) - only if IP range check fails, with strict timeout
|
|
10
|
+
#
|
|
11
|
+
class Service
|
|
12
|
+
include BotPatterns
|
|
13
|
+
|
|
14
|
+
MODES = %i[search_engines search_and_ai all_known].freeze
|
|
15
|
+
|
|
16
|
+
class << self
|
|
17
|
+
# Main entry point - decides whether a request comes from a verified good bot
# and reports every decision through report_verification_result.
#
# @param ip [String] The IP address to verify
# @param user_agent [String] The user agent string
# @param mode [Symbol] Verification mode (one of MODES):
#   - :search_engines (default) - Only IP-verified search engine bots
#   - :search_and_ai - Search engines + IP-verified AI bots
#   - :all_known - All known bot patterns (trusts user agent, least secure)
# @return [Boolean] true if verified good bot; false for blank ip/user agent
# @raise [ArgumentError] when ip and user agent are present but mode is not in MODES
def verified_good_bot?(ip, user_agent, mode: :search_engines)
  return false if ip.blank? || user_agent.blank?
  raise ArgumentError, "Invalid mode: #{mode}" unless MODES.include?(mode)

  detected = detect_bot_type(user_agent)
  # Unrecognised user agents are reported as unverified with a nil bot type.
  return report_verification_result(nil, false, mode, ip, user_agent) unless detected

  verified =
    if mode == :all_known
      true
    elsif mode == :search_and_ai
      verify_search_engine_bot(ip, detected) || verify_ai_bot_internal(ip, detected)
    else
      # only :search_engines remains after the MODES check above
      verify_search_engine_bot(ip, detected)
    end

  report_verification_result(detected, verified, mode, ip, user_agent)
end
|
|
44
|
+
|
|
45
|
+
# Check specifically if request is from a verified AI bot.
#
# @param ip [String] The IP address to verify
# @param user_agent [String] The user agent string
# @param strict [Boolean] Only consulted for AI bot types that appear in neither
#   AI_BOTS_WITHOUT_VERIFICATION nor AI_BOTS_WITH_IP_RANGES: truthy rejects them,
#   falsy accepts them on user agent alone.
# @return [Boolean] true if verified AI bot
def verified_ai_bot?(ip, user_agent, strict: true)
  return false if ip.blank? || user_agent.blank?

  ai_type = detect_ai_bot_type(user_agent)
  return false unless ai_type

  # Types listed as "without verification" are accepted on user agent alone.
  return true if BotPatterns::AI_BOTS_WITHOUT_VERIFICATION.include?(ai_type)

  # Types with published ranges must pass the IP check.
  return verified_bot_ip?(ip, ai_type) if BotPatterns::AI_BOTS_WITH_IP_RANGES.include?(ai_type)

  !strict
end
|
|
70
|
+
|
|
71
|
+
# Find the first entry of BotPatterns::ALL_BOT_PATTERNS whose pattern matches
# the user agent.
#
# @param user_agent [String, nil]
# @return the matching bot type key, or nil when blank or nothing matches
def detect_bot_type(user_agent)
  return nil if user_agent.blank?

  matched = BotPatterns::ALL_BOT_PATTERNS.find { |_type, pattern| user_agent.match?(pattern) }
  matched&.first
end
|
|
80
|
+
|
|
81
|
+
# Detect AI bot type specifically.
#
# Blank user agents return nil, matching detect_bot_type, instead of raising
# NoMethodError on nil.
#
# @param user_agent [String, nil]
# @return the first key of BotPatterns::AI_BOT_PATTERNS whose pattern matches, or nil
def detect_ai_bot_type(user_agent)
  return nil if user_agent.blank?

  BotPatterns::AI_BOT_PATTERNS.each do |bot_type, pattern|
    return bot_type if user_agent.match?(pattern)
  end
  nil
end
|
|
88
|
+
|
|
89
|
+
# Detect search engine bot type specifically.
#
# Blank user agents return nil, matching detect_bot_type, instead of raising
# NoMethodError on nil.
#
# @param user_agent [String, nil]
# @return the first key of BotPatterns::SEARCH_BOT_PATTERNS whose pattern matches, or nil
def detect_search_bot_type(user_agent)
  return nil if user_agent.blank?

  BotPatterns::SEARCH_BOT_PATTERNS.each do |bot_type, pattern|
    return bot_type if user_agent.match?(pattern)
  end
  nil
end
|
|
96
|
+
|
|
97
|
+
# Check if IP belongs to a known bot via IP ranges or DNS.
#
# A previously cached verdict for the (bot type, ip) pair is returned as-is.
# Otherwise the range check runs first; reverse DNS is tried only when that
# check returned false, DNS verification is not skipped in the configuration,
# and the bot type is listed in BotPatterns::SEARCH_BOTS_WITH_DNS. The verdict
# is then cached for config.cache_ttl.
#
# NOTE(review): a false verdict produced by a DNS timeout is cached for the
# full TTL just like a genuine negative - confirm this is intended.
def verified_bot_ip?(ip, bot_type)
  key = "#{config.cache_key_prefix}:#{bot_type}:#{ip}"

  hit = cache.read(key)
  return hit unless hit.nil?

  verdict = ip_in_known_ranges?(ip, bot_type)

  try_dns = verdict == false &&
            !config.skip_dns_verification &&
            BotPatterns::SEARCH_BOTS_WITH_DNS.include?(bot_type)
  verdict = verify_by_reverse_dns_with_timeout(ip, bot_type) if try_dns

  cache.write(key, verdict, expires_in: config.cache_ttl)
  verdict
end
|
|
114
|
+
|
|
115
|
+
# Ask the configured IP range model whether the IP lies in a stored range.
# Bot types missing from BotPatterns::ALL_BOTS_WITH_IP_RANGES are answered
# with false without consulting the model.
def ip_in_known_ranges?(ip, bot_type)
  if BotPatterns::ALL_BOTS_WITH_IP_RANGES.include?(bot_type)
    BotVerification.ip_range_model.ip_belongs_to_bot?(ip, bot_type)
  else
    false
  end
end
|
|
121
|
+
|
|
122
|
+
# Verify bot by reverse DNS lookup with strict timeout.
#
# Runs verify_by_reverse_dns under config.dns_total_timeout seconds.
#
# @param ip [String]
# @param bot_type [String] key used by verify_by_reverse_dns
# @return [Boolean] result of verify_by_reverse_dns, or false when the
#   overall deadline expires
def verify_by_reverse_dns_with_timeout(ip, bot_type)
  Timeout.timeout(config.dns_total_timeout) do
    verify_by_reverse_dns(ip, bot_type)
  end
rescue Timeout::Error
  # The logger may be unset (the Railtie only assigns it with ||=), so a
  # missing logger must not turn a timeout into a NoMethodError.
  config.logger&.warn("BotVerification: DNS verification timed out for #{ip} (#{bot_type})")
  false
end
|
|
131
|
+
|
|
132
|
+
# Verify bot by forward-confirmed reverse DNS.
#
# Steps: look up the PTR name for the IP, require it to end with one of the
# suffixes registered for the bot type in BotPatterns::SEARCH_BOT_DNS_SUFFIXES,
# then resolve that name forward and require the original IP among the results.
#
# Resolv::DNS returns Resolv::DNS::Name and Resolv::IPv4/IPv6 objects rather
# than Strings, so the hostname is converted with to_s and the addresses are
# compared as IPAddr values (which also ignores IPv6 spelling differences).
#
# NOTE(review): whether each suffix starts with "." is decided in BotPatterns,
# which is not visible here - without the dot "evilgooglebot.com" would match
# "googlebot.com"; confirm.
#
# @param ip [String]
# @param bot_type [String] key into BotPatterns::SEARCH_BOT_DNS_SUFFIXES
# @return [Boolean] false on any lookup failure or error
def verify_by_reverse_dns(ip, bot_type)
  suffixes = BotPatterns::SEARCH_BOT_DNS_SUFFIXES[bot_type]
  return false unless suffixes

  hostname = reverse_dns_lookup(ip)&.to_s&.downcase&.chomp(".")
  return false if hostname.nil? || hostname.empty?

  valid_suffix = suffixes.any? { |suffix| hostname.end_with?(suffix.downcase) }
  return false unless valid_suffix

  requested = IPAddr.new(ip.to_s)
  forward_dns_lookup(hostname).any? { |address| IPAddr.new(address.to_s) == requested }
rescue StandardError => e
  config.logger&.warn("BotVerification: DNS verification failed for #{ip}: #{e.message}")
  false
end
|
|
149
|
+
|
|
150
|
+
# Tell whether the user agent matches any pattern in BotPatterns::AI_BOT_PATTERNS.
#
# @param user_agent [String, nil]
# @return [Boolean] false for blank input
def ai_bot_user_agent?(user_agent)
  return false if user_agent.blank?

  known_patterns = BotPatterns::AI_BOT_PATTERNS.values
  known_patterns.any? do |pattern|
    user_agent.match?(pattern)
  end
end
|
|
156
|
+
|
|
157
|
+
# Tell whether the user agent matches any pattern in BotPatterns::SEARCH_BOT_PATTERNS.
#
# @param user_agent [String, nil]
# @return [Boolean] false for blank input
def search_bot_user_agent?(user_agent)
  return false if user_agent.blank?

  known_patterns = BotPatterns::SEARCH_BOT_PATTERNS.values
  known_patterns.any? do |pattern|
    user_agent.match?(pattern)
  end
end
|
|
163
|
+
|
|
164
|
+
# Build the key used for session-based caching: the literal "bot_verify:"
# followed by the first 16 hex characters of SHA-256("<ip>:<user_agent>").
#
# @param ip [String]
# @param user_agent [String]
# @return [String]
def session_cache_key(ip, user_agent)
  material = format("%s:%s", ip, user_agent)
  short_digest = Digest::SHA256.hexdigest(material).slice(0, 16)
  "bot_verify:" + short_digest
end
|
|
169
|
+
|
|
170
|
+
private
|
|
171
|
+
|
|
172
|
+
# IP-verify a search engine bot. Only bot types that are both a key of
# SEARCH_BOT_PATTERNS and listed in SEARCH_BOTS_WITH_DNS reach the IP check;
# every other type is answered with false.
def verify_search_engine_bot(ip, bot_type)
  if BotPatterns::SEARCH_BOT_PATTERNS.key?(bot_type) &&
     BotPatterns::SEARCH_BOTS_WITH_DNS.include?(bot_type)
    verified_bot_ip?(ip, bot_type)
  else
    false
  end
end
|
|
178
|
+
|
|
179
|
+
# Verify an AI bot for the :search_and_ai mode.
#
# Types listed in AI_BOTS_WITHOUT_VERIFICATION are accepted on user agent
# alone, types in AI_BOTS_WITH_IP_RANGES must pass verified_bot_ip?, and any
# other type (including non-AI types) is rejected.
def verify_ai_bot_internal(ip, bot_type)
  return false unless BotPatterns::AI_BOT_PATTERNS.key?(bot_type)
  return true if BotPatterns::AI_BOTS_WITHOUT_VERIFICATION.include?(bot_type)
  return verified_bot_ip?(ip, bot_type) if BotPatterns::AI_BOTS_WITH_IP_RANGES.include?(bot_type)

  false
end
|
|
191
|
+
|
|
192
|
+
# Resolve the PTR name for an IP within config.dns_timeout seconds.
#
# Resolv::DNS#getname returns a Resolv::DNS::Name, which has no String methods
# such as end_with?; the name is converted with to_s so callers can do suffix
# checks on it.
#
# @param ip [String]
# @return [String, nil] hostname, or nil when the lookup fails or times out
def reverse_dns_lookup(ip)
  name = Timeout.timeout(config.dns_timeout) { resolver.getname(ip) }
  name.to_s
rescue Resolv::ResolvError, Timeout::Error
  nil
end
|
|
199
|
+
|
|
200
|
+
# Resolve a hostname to its addresses within config.dns_timeout seconds.
#
# Resolv::DNS#getaddresses returns Resolv::IPv4 / Resolv::IPv6 objects; they
# are converted with to_s so the caller can compare them with the request IP
# string.
#
# @param hostname [String]
# @return [Array<String>] textual addresses, or [] when the lookup fails or times out
def forward_dns_lookup(hostname)
  addresses = Timeout.timeout(config.dns_timeout) { resolver.getaddresses(hostname) }
  addresses.map(&:to_s)
rescue Resolv::ResolvError, Timeout::Error
  []
end
|
|
207
|
+
|
|
208
|
+
# Resolv::DNS instance, created on first use and then reused.
def resolver
  return @resolver if @resolver

  @resolver = Resolv::DNS.new
end
|
|
211
|
+
|
|
212
|
+
# Shorthand for the gem-wide configuration object.
def config
  ::BotVerification.configuration
end
|
|
215
|
+
|
|
216
|
+
# Shorthand for the cache store held by the configuration.
def cache
  settings = config
  settings.cache
end
|
|
219
|
+
|
|
220
|
+
# Pass the verification outcome to config.report_verification and hand the
# verdict back unchanged so callers can return it directly.
def report_verification_result(bot_type, verified, mode, ip, user_agent)
  details = { bot_type: bot_type, verified: verified, mode: mode, ip: ip, user_agent: user_agent }
  config.report_verification(**details)
  verified
end
|
|
230
|
+
end
|
|
231
|
+
end
|
|
232
|
+
end
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "rails"
require "active_record"
require "active_support"

require "digest"
require "ipaddr"
require "json"
require "net/http"
require "openssl"
require "resolv"
require "timeout"
|
|
12
|
+
|
|
13
|
+
require_relative "bot_verification/version"
|
|
14
|
+
require_relative "bot_verification/configuration"
|
|
15
|
+
require_relative "bot_verification/bot_patterns"
|
|
16
|
+
require_relative "bot_verification/service"
|
|
17
|
+
require_relative "bot_verification/ip_range_model"
|
|
18
|
+
require_relative "bot_verification/ip_range_fetcher"
|
|
19
|
+
require_relative "bot_verification/controller_concern"
|
|
20
|
+
require_relative "bot_verification/refresh_job" if defined?(ActiveJob::Base)
|
|
21
|
+
require_relative "bot_verification/railtie" if defined?(Rails::Railtie)
|
|
22
|
+
|
|
23
|
+
module BotVerification
  # Base class for errors raised by this gem.
  class Error < StandardError; end
  # Error subclass for configuration problems.
  class ConfigurationError < Error; end

  # Replace the gem-wide configuration object.
  def self.configuration=(settings)
    @configuration = settings
  end

  # Gem-wide configuration, created on first access.
  def self.configuration
    @configuration ||= Configuration.new
  end

  # Yield the configuration object to the given block.
  def self.configure
    yield(configuration)
  end

  # Discard the current configuration and start from a fresh Configuration.
  def self.reset_configuration!
    @configuration = Configuration.new
  end

  # Convenience wrapper around Service.verified_good_bot?
  def self.verify(ip, user_agent, mode: :search_engines)
    Service.verified_good_bot?(ip, user_agent, mode: mode)
  end

  # Convenience wrapper around Service.verified_bot_ip?
  def self.verify_ip(ip, bot_type)
    Service.verified_bot_ip?(ip, bot_type)
  end

  # Convenience wrapper around Service.detect_bot_type
  def self.detect_bot(user_agent)
    Service.detect_bot_type(user_agent)
  end

  # Refresh IP ranges through IpRangeFetcher.refresh!
  def self.refresh_ip_ranges!(bot_type = nil)
    IpRangeFetcher.refresh!(bot_type)
  end

  # The IP range model class named by the configuration.
  def self.ip_range_model
    configuration.ip_range_model_class
  end

  # Tell whether the configured IP ranges table exists.
  # Answers false without touching the database when ActiveRecord reports
  # that it is not connected.
  def self.table_exists?
    if ActiveRecord::Base.connected?
      ActiveRecord::Base.connection.table_exists?(configuration.table_name)
    else
      false
    end
  end
end
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "rails/generators"
|
|
4
|
+
require "rails/generators/active_record"
|
|
5
|
+
|
|
6
|
+
module BotVerification
|
|
7
|
+
module Generators
|
|
8
|
+
# Install generator: writes the migration, initializer, model and refresh job
# into the host application. Each step can be switched off with its --skip-* option.
class InstallGenerator < Rails::Generators::Base
  include ActiveRecord::Generators::Migration

  source_root File.expand_path("templates", __dir__)

  class_option :table_name, type: :string, default: "bot_ip_ranges",
                            desc: "Name of the database table for storing bot IP ranges"

  class_option :skip_migration, type: :boolean, default: false,
                                desc: "Skip creating the migration (if table already exists)"

  class_option :skip_initializer, type: :boolean, default: false,
                                  desc: "Skip creating the initializer"

  class_option :skip_model, type: :boolean, default: false,
                            desc: "Skip creating the model file"

  class_option :skip_job, type: :boolean, default: false,
                          desc: "Skip creating the refresh job file"

  desc "Installs BotVerification gem: creates migration, initializer, model, and refresh job"

  # Deliberately NOT named `create_migration`: Rails::Generators::Migration
  # defines `create_migration(destination, data, config = {}, &block)` and
  # `migration_template` calls it with three arguments. A zero-argument task
  # of the same name would shadow it and raise ArgumentError.
  def create_migration_file
    return if options[:skip_migration]

    if table_exists?
      say "Table '#{table_name}' already exists. Skipping migration.", :yellow
      return
    end

    migration_template "migration.rb.erb", "db/migrate/create_#{table_name}.rb"
  end

  def create_initializer
    return if options[:skip_initializer]

    template "initializer.rb.erb", "config/initializers/bot_verification.rb"
  end

  def create_model
    return if options[:skip_model]

    template "model.rb.erb", "app/models/bot_ip_range.rb"
  end

  def create_job
    return if options[:skip_job]

    template "refresh_job.rb.erb", "app/jobs/refresh_bot_ip_ranges_job.rb"
  end

  def show_post_install_message
    say ""
    say "BotVerification installed successfully!", :green
    say ""
    say "Next steps:", :yellow
    say " 1. Run migrations: rails db:migrate"
    say " 2. Fetch initial IP ranges: rails bot_verification:refresh"
    say " 3. Schedule daily refresh (add to Sidekiq-Cron or cron):"
    say " RefreshBotIpRangesJob.perform_later"
    say ""
    say "Usage in controllers:"
    say " include BotVerification::ControllerConcern"
    say " "
    say " if verified_good_bot?"
    say " # Request is from verified bot"
    say " end"
    say ""
  end

  private

  # Table name chosen with --table-name (default "bot_ip_ranges").
  def table_name
    options[:table_name]
  end

  # Whether the target table already exists; any error counts as "no".
  # NOTE(review): ActiveRecord::Base.connected? is presumably false until a
  # connection has been checked out, so in a fresh generator process this may
  # always answer false - confirm.
  def table_exists?
    return false unless ActiveRecord::Base.connected?

    ActiveRecord::Base.connection.table_exists?(table_name)
  rescue StandardError
    false
  end

  # Version suffix for the generated migration superclass, e.g. "[7.1]".
  def migration_version
    "[#{ActiveRecord::VERSION::MAJOR}.#{ActiveRecord::VERSION::MINOR}]"
  end
end
|
|
91
|
+
end
|
|
92
|
+
end
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# BotVerification configuration
|
|
4
|
+
# See: https://github.com/mattkuklinski/bot_verification
|
|
5
|
+
|
|
6
|
+
BotVerification.configure do |config|
|
|
7
|
+
# Table name for storing bot IP ranges
|
|
8
|
+
# Default: "bot_ip_ranges"
|
|
9
|
+
config.table_name = "<%= table_name %>"
|
|
10
|
+
|
|
11
|
+
# Skip DNS verification entirely (only use IP range matching)
|
|
12
|
+
# Set to true if DNS lookups add unacceptable latency for your project.
|
|
13
|
+
# When enabled, bots like Apple/Yandex/Baidu that don't publish IP ranges
|
|
14
|
+
# won't be verifiable - only Google/Bing/OpenAI/etc with published ranges.
|
|
15
|
+
# Default: false
|
|
16
|
+
# config.skip_dns_verification = false
|
|
17
|
+
|
|
18
|
+
# Timeout for each DNS lookup (reverse and forward) in seconds
|
|
19
|
+
# Only applies when skip_dns_verification is false
|
|
20
|
+
# Default: 1.0
|
|
21
|
+
# config.dns_timeout = 1.0
|
|
22
|
+
|
|
23
|
+
# Total timeout for all DNS operations in seconds
|
|
24
|
+
# Only applies when skip_dns_verification is false
|
|
25
|
+
# Default: 2.0
|
|
26
|
+
# config.dns_total_timeout = 2.0
|
|
27
|
+
|
|
28
|
+
# How long to cache verification results in Rails cache
|
|
29
|
+
# Default: 24.hours
|
|
30
|
+
# config.cache_ttl = 24.hours
|
|
31
|
+
|
|
32
|
+
# How long to cache results in session
|
|
33
|
+
# Default: 1.hour
|
|
34
|
+
# config.session_cache_ttl = 1.hour
|
|
35
|
+
|
|
36
|
+
# Cache key prefix for Rails cache
|
|
37
|
+
# Default: "bot_verification"
|
|
38
|
+
# config.cache_key_prefix = "bot_verification"
|
|
39
|
+
|
|
40
|
+
# Custom IP range model class name (optional)
|
|
41
|
+
# If you want to use your own model instead of the generated one
|
|
42
|
+
# config.ip_range_model_name = "BotIpRange"
|
|
43
|
+
|
|
44
|
+
# Error callback - integrate with your error tracking service
|
|
45
|
+
# Called when errors occur during IP range refresh
|
|
46
|
+
# config.on_error = ->(error, context) {
|
|
47
|
+
# Airbrake.notify(error, context)
|
|
48
|
+
# # or: Sentry.capture_exception(error, extra: context)
|
|
49
|
+
# # or: Rails.logger.error("BotVerification error: #{error.message}")
|
|
50
|
+
# }
|
|
51
|
+
|
|
52
|
+
# Refresh complete callback - for notifications or monitoring
|
|
53
|
+
# Called after IP range refresh completes (success or partial failure)
|
|
54
|
+
# config.on_refresh_complete = ->(results) {
|
|
55
|
+
# failures = results.select { |_, r| !r[:success] }
|
|
56
|
+
# SlackNotifier.notify("Bot IPs refreshed. Failures: #{failures.keys}") if failures.any?
|
|
57
|
+
# }
|
|
58
|
+
end
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
class Create<%= table_name.camelize %> < ActiveRecord::Migration<%= migration_version %>
|
|
4
|
+
def change
|
|
5
|
+
create_table :<%= table_name %> do |t|
|
|
6
|
+
t.string :bot_type, null: false
|
|
7
|
+
t.string :cidr, null: false
|
|
8
|
+
t.integer :ip_version, null: false, default: 4
|
|
9
|
+
t.string :source_url
|
|
10
|
+
t.datetime :last_verified_at
|
|
11
|
+
|
|
12
|
+
t.timestamps
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
add_index :<%= table_name %>, :bot_type
|
|
16
|
+
add_index :<%= table_name %>, [:bot_type, :cidr], unique: true
|
|
17
|
+
end
|
|
18
|
+
end
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Stores known IP ranges for legitimate search engine and AI bots
|
|
4
|
+
# These ranges are refreshed daily by RefreshBotIpRangesJob
|
|
5
|
+
#
|
|
6
|
+
# @example Check if an IP belongs to a known bot
|
|
7
|
+
# BotIpRange.ip_belongs_to_bot?("66.249.66.1", :google) # => true
|
|
8
|
+
#
|
|
9
|
+
class BotIpRange < ApplicationRecord
|
|
10
|
+
include BotVerification::IpRangeModel
|
|
11
|
+
|
|
12
|
+
self.table_name = "<%= table_name %>"
|
|
13
|
+
end
|