bot_verification 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +20 -0
- data/Gemfile +7 -0
- data/Gemfile.lock +256 -0
- data/LICENSE +21 -0
- data/README.md +355 -0
- data/Rakefile +8 -0
- data/bot_verification.gemspec +37 -0
- data/lib/bot_verification/bot_patterns.rb +121 -0
- data/lib/bot_verification/configuration.rb +139 -0
- data/lib/bot_verification/controller_concern.rb +150 -0
- data/lib/bot_verification/ip_range_fetcher.rb +155 -0
- data/lib/bot_verification/ip_range_model.rb +132 -0
- data/lib/bot_verification/railtie.rb +22 -0
- data/lib/bot_verification/refresh_job.rb +36 -0
- data/lib/bot_verification/service.rb +232 -0
- data/lib/bot_verification/version.rb +5 -0
- data/lib/bot_verification.rb +74 -0
- data/lib/generators/bot_verification/install_generator.rb +92 -0
- data/lib/generators/bot_verification/templates/initializer.rb.erb +58 -0
- data/lib/generators/bot_verification/templates/migration.rb.erb +18 -0
- data/lib/generators/bot_verification/templates/model.rb.erb +13 -0
- data/lib/generators/bot_verification/templates/refresh_job.rb.erb +21 -0
- data/lib/tasks/bot_verification.rake +95 -0
- metadata +127 -0
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "lib/bot_verification/version"
|
|
4
|
+
|
|
5
|
+
# Gem specification for bot_verification.
#
# Declares the gem's identity and metadata, selects the files to package from
# the git index, and lists runtime and development dependencies.
Gem::Specification.new do |spec|
  spec.name = "bot_verification"
  spec.version = BotVerification::VERSION
  spec.authors = ["Web Ventures Ltd"]
  spec.email = ["gems@dev.webven.nz"]

  spec.summary = "Verify legitimate search engine and AI bots by IP"
  spec.description = "A Rails engine for verifying that requests claiming to be from " \
                     "search engine bots (Google, Bing, etc.) and AI bots (GPTBot, PerplexityBot) " \
                     "are actually from those services, using IP range matching and reverse DNS verification."
  spec.homepage = "https://github.com/webventures/bot_verification"
  spec.license = "MIT"
  spec.required_ruby_version = ">= 3.2.0"

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = spec.homepage
  spec.metadata["changelog_uri"] = "#{spec.homepage}/blob/main/CHANGELOG.md"

  # Resolve this file's absolute path BEFORE changing directory. `__FILE__` may
  # be relative or absolute depending on how the gemspec is loaded, while
  # `git ls-files` always yields repo-relative paths, so a plain string
  # comparison between the two does not reliably exclude the gemspec itself.
  gemspec_path = File.expand_path(__FILE__)

  # Paths that never belong in the packaged gem (executables for development,
  # test suites, VCS and CI configuration).
  excluded_paths = %r{\A(?:(?:bin|test|spec|features)/|\.(?:git|github|travis|circleci)|appveyor)}

  spec.files = Dir.chdir(__dir__) do
    `git ls-files -z`.split("\x0").reject do |f|
      File.expand_path(f) == gemspec_path || f.match?(excluded_paths)
    end
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_dependency "rails", ">= 7.0"
  spec.add_dependency "resolv", ">= 0.2"

  spec.add_development_dependency "rspec", "~> 3.12"
  spec.add_development_dependency "sqlite3", ">= 2.1"
end
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module BotVerification
  # Static catalogue of the bots this gem recognises: user-agent patterns,
  # reverse-DNS suffixes, and the URLs that publish each bot's IP ranges.
  #
  # Every constant is frozen, including the arrays and hashes nested inside
  # the lookup tables, so the catalogue cannot be mutated at runtime.
  module BotPatterns
    # =============================================================================
    # SEARCH ENGINE BOTS
    # =============================================================================

    # User agent patterns for search engine bots
    SEARCH_BOT_PATTERNS = {
      google: /Googlebot|Google-Extended|Mediapartners-Google|AdsBot-Google|APIs-Google/i,
      bing: /Bingbot|msnbot|BingPreview/i,
      apple: /Applebot/i,
      yandex: /YandexBot/i,
      baidu: /Baiduspider/i
    }.freeze

    # Valid reverse DNS suffixes for search engine bots
    #
    # NOTE(review): ".googleusercontent.com" may also cover hostnames assigned
    # to Google Cloud customer machines, which would let a rented VM pass
    # reverse-DNS verification as Google - confirm against Google's crawler
    # verification documentation and narrow the suffix if so.
    SEARCH_BOT_DNS_SUFFIXES = {
      google: %w[.googlebot.com .google.com .googleusercontent.com],
      bing: %w[.search.msn.com],
      apple: %w[.applebot.apple.com],
      yandex: %w[.yandex.ru .yandex.net .yandex.com],
      baidu: %w[.crawl.baidu.com .crawl.baidu.jp]
    }.each_value { |suffixes| suffixes.each(&:freeze).freeze }.freeze

    # Search engine bots with IP ranges in database
    SEARCH_BOTS_WITH_IP_RANGES = %i[google bing].freeze

    # Search engine bots that support reverse DNS verification
    SEARCH_BOTS_WITH_DNS = %i[google bing apple yandex baidu].freeze

    # =============================================================================
    # AI BOTS
    # =============================================================================

    # User agent patterns for AI bots
    AI_BOT_PATTERNS = {
      openai_gptbot: /GPTBot/i,
      openai_chatgpt: /ChatGPT-User/i,
      openai_searchbot: /OAI-SearchBot/i,
      anthropic: /ClaudeBot|Claude-Web|anthropic-ai/i,
      perplexity: /PerplexityBot|Perplexity-User/i,
      amazon: /Amazonbot/i,
      cohere: /cohere-ai/i,
      meta: /meta-externalagent/i,
      bytedance: /Bytespider/i
    }.freeze

    # AI bots with officially published IP ranges (can be verified)
    AI_BOTS_WITH_IP_RANGES = %i[openai_gptbot openai_chatgpt openai_searchbot perplexity amazon].freeze

    # AI bots without official IP ranges (cannot be reliably verified)
    AI_BOTS_WITHOUT_VERIFICATION = %i[anthropic cohere meta bytedance].freeze

    # =============================================================================
    # SOCIAL/OTHER BOTS (no verification available)
    # =============================================================================

    SOCIAL_BOT_PATTERNS = {
      duckduckgo: /DuckDuckBot/i,
      facebook: /facebookexternalhit|Facebot/i,
      twitter: /Twitterbot/i,
      linkedin: /LinkedInBot/i,
      slack: /Slackbot/i,
      discord: /Discordbot/i,
      telegram: /TelegramBot/i
    }.freeze

    # =============================================================================
    # COMBINED
    # =============================================================================

    ALL_BOT_PATTERNS = SEARCH_BOT_PATTERNS.merge(AI_BOT_PATTERNS).merge(SOCIAL_BOT_PATTERNS).freeze
    ALL_BOTS_WITH_IP_RANGES = (SEARCH_BOTS_WITH_IP_RANGES + AI_BOTS_WITH_IP_RANGES).freeze

    # Search engine bot types (as strings for database).
    # Derived from the symbol list so the two can never drift apart.
    SEARCH_BOT_TYPES = SEARCH_BOTS_WITH_IP_RANGES.map(&:name).freeze

    # AI bot types with IP ranges (as strings for database).
    # Derived from the symbol list so the two can never drift apart.
    AI_BOT_TYPES = AI_BOTS_WITH_IP_RANGES.map(&:name).freeze

    # All bot types that have IP ranges
    VALID_BOT_TYPES = (SEARCH_BOT_TYPES + AI_BOT_TYPES).freeze

    # =============================================================================
    # IP RANGE SOURCES
    # =============================================================================

    # Each entry maps a bot type to the list of documents publishing its ranges;
    # every source is a hash with a :url to fetch and a human-readable :name.
    SEARCH_ENGINE_SOURCES = {
      google: [
        { url: "https://developers.google.com/static/search/apis/ipranges/googlebot.json", name: "Googlebot" },
        { url: "https://developers.google.com/static/search/apis/ipranges/special-crawlers.json", name: "Google Special" }
      ],
      bing: [
        { url: "https://www.bing.com/toolbox/bingbot.json", name: "Bingbot" }
      ]
    }.each_value { |sources| sources.each(&:freeze).freeze }.freeze

    AI_BOT_SOURCES = {
      openai_gptbot: [
        { url: "https://openai.com/gptbot.json", name: "GPTBot" }
      ],
      openai_chatgpt: [
        { url: "https://openai.com/chatgpt-user.json", name: "ChatGPT-User" }
      ],
      openai_searchbot: [
        { url: "https://openai.com/searchbot.json", name: "OAI-SearchBot" }
      ],
      perplexity: [
        { url: "https://www.perplexity.ai/perplexitybot.json", name: "PerplexityBot" },
        { url: "https://www.perplexity.ai/perplexity-user.json", name: "Perplexity-User" }
      ],
      amazon: [
        { url: "https://developer.amazon.com/amazonbot/ip-addresses/", name: "Amazonbot" }
      ]
    }.each_value { |sources| sources.each(&:freeze).freeze }.freeze

    IP_RANGE_SOURCES = SEARCH_ENGINE_SOURCES.merge(AI_BOT_SOURCES).freeze
  end
end
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module BotVerification
  # Holds every user-tunable setting of the gem together with the helpers
  # that dispatch the optional callbacks (`on_error`, `on_refresh_complete`,
  # `on_verification`).
  class Configuration
    # Table name for storing bot IP ranges
    # Default: "bot_ip_ranges"
    attr_accessor :table_name

    # Skip DNS verification entirely (only use IP range matching)
    # Set to true if DNS lookups are unacceptable for your project
    # Default: false
    attr_accessor :skip_dns_verification

    # Verify SSL certificates when fetching IP ranges
    # Set to false in development if you have certificate issues
    # Default: true (always verify in production!)
    attr_accessor :verify_ssl

    # Timeout for each DNS lookup (reverse and forward) in seconds
    # Default: 1.0
    attr_accessor :dns_timeout

    # Total timeout for all DNS operations in seconds
    # Default: 2.0
    attr_accessor :dns_total_timeout

    # How long to cache verification results in Rails cache
    # Default: 24 hours
    attr_accessor :cache_ttl

    # How long to cache results in session
    # Default: 1 hour
    attr_accessor :session_cache_ttl

    # Cache key prefix for Rails cache
    # Default: "bot_verification"
    attr_accessor :cache_key_prefix

    # Logger instance (defaults to Rails.logger).
    # Only the writer is generated; the reader is defined below so it can
    # supply a fallback.
    attr_writer :logger

    # Custom IP range model class name (optional)
    # If set, uses your own model instead of the built-in one
    # The model must include BotVerification::IpRangeModel
    attr_accessor :ip_range_model_name

    # Error callback - called when errors occur during IP range refresh
    # Useful for integrating with error tracking services (Airbrake, Sentry, etc.)
    # @example
    #   config.on_error = ->(error, context) { Airbrake.notify(error, context) }
    attr_accessor :on_error

    # Refresh complete callback - called after IP range refresh completes
    # Useful for notifications or monitoring
    # @example
    #   config.on_refresh_complete = ->(results) { SlackNotifier.notify("Bot IPs refreshed: #{results}") }
    attr_accessor :on_refresh_complete

    # Verification callback - called after each bot verification attempt
    # Useful for collecting statistics about bot traffic
    # @example
    #   config.on_verification = ->(result) {
    #     # result = { bot_type: :google, verified: true, mode: :search_engines, ip: "...", user_agent: "..." }
    #     BotRequestStats.increment(result[:bot_type], result[:verified])
    #   }
    attr_accessor :on_verification

    def initialize
      @table_name = "bot_ip_ranges"
      @skip_dns_verification = false
      @verify_ssl = true
      @dns_timeout = 1.0
      @dns_total_timeout = 2.0
      @cache_ttl = 24.hours
      @session_cache_ttl = 1.hour
      @cache_key_prefix = "bot_verification"
      @logger = nil
      @ip_range_model_name = nil
      @on_error = nil
      @on_refresh_complete = nil
      @on_verification = nil
    end

    # Report an error through the configured callback.
    # A callback that itself raises is logged, never propagated.
    #
    # @param error [Exception] the error to report
    # @param context [Hash] extra details passed to the callback unchanged
    def report_error(error, context = {})
      run_callback(:on_error, error, context)
    end

    # Report refresh completion through the configured callback.
    # A callback that itself raises is logged, never propagated.
    #
    # @param results [Hash] per-bot-type refresh results
    def report_refresh_complete(results)
      run_callback(:on_refresh_complete, results)
    end

    # Report verification attempt through the configured callback.
    # A callback that itself raises is logged, never propagated.
    #
    # @param result [Hash] details of the verification attempt
    def report_verification(result)
      run_callback(:on_verification, result)
    end

    # The logger to write to: the explicitly configured one, else
    # Rails.logger when it is set, else a single memoized stdout logger.
    # Never returns nil, so the rescue paths above can always log.
    #
    # @return [Logger]
    def logger
      @logger || rails_logger || (@fallback_logger ||= Logger.new($stdout))
    end

    # The cache store for verification results: Rails.cache when available,
    # otherwise an in-process memory store. Memoized on first use.
    def cache
      @cache ||= if defined?(Rails) && Rails.respond_to?(:cache) && Rails.cache
                   Rails.cache
                 else
                   ActiveSupport::Cache::MemoryStore.new
                 end
    end

    # Get the IP range model class
    #
    # @return [Class] the class named by `ip_range_model_name`, or the
    #   built-in BotVerification::IpRange when no name is configured
    def ip_range_model_class
      if @ip_range_model_name
        @ip_range_model_name.constantize
      else
        BotVerification::IpRange
      end
    end

    # Validate configuration
    #
    # @raise [ConfigurationError] when table_name is blank or a DNS timeout
    #   is missing, non-numeric, zero or negative
    def validate!
      raise ConfigurationError, "table_name cannot be blank" if table_name.blank?
      raise ConfigurationError, "dns_timeout must be positive" unless positive_number?(dns_timeout)
      raise ConfigurationError, "dns_total_timeout must be positive" unless positive_number?(dns_total_timeout)
    end

    private

    # Invokes the callback stored under +name+ with +args+ if it is callable;
    # any StandardError it raises is logged instead of reaching the caller.
    def run_callback(name, *args)
      callback = public_send(name)
      return unless callback.respond_to?(:call)

      callback.call(*args)
    rescue StandardError => e
      logger.error("[BotVerification] Error in #{name} callback: #{e.message}")
    end

    # Rails.logger when Rails is loaded and exposes one (may still be nil).
    def rails_logger
      Rails.logger if defined?(Rails) && Rails.respond_to?(:logger)
    end

    # True only for values that answer `positive?` with true; nil, strings
    # and other non-numeric values are rejected instead of raising.
    def positive_number?(value)
      value.respond_to?(:positive?) && value.positive?
    end
  end
end
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module BotVerification
  # Controller concern for bot verification with session-based caching
  #
  # Session entries are written with string keys only. Serializers that
  # round-trip the session through JSON turn symbol keys into strings, so a
  # symbol-keyed entry written in one request cannot be read back by symbol
  # in the next; string keys read back identically either way.
  #
  # Each (IP, user agent) pair owns one session entry shaped like:
  #   { "at" => <epoch seconds>, "bot_type" => "google",
  #     "verified:search_engines" => true, "ai_verified:strict" => true }
  # Results are stored per mode / strictness so one kind of check never
  # answers for another.
  #
  # @example Usage in a controller
  #   class MyController < ApplicationController
  #     include BotVerification::ControllerConcern
  #
  #     def show
  #       if verified_good_bot?
  #         # Serve full content to verified bots
  #       else
  #         # Rate limit or require authentication
  #       end
  #     end
  #   end
  #
  module ControllerConcern
    extend ActiveSupport::Concern

    # Maximum number of (IP, user agent) entries kept in one session.
    BOT_SESSION_CACHE_LIMIT = 10

    # Check if current request is from a verified good bot
    # Uses session caching to avoid repeated verification calls
    #
    # @param mode [Symbol] Verification mode (see Service)
    # @return [Boolean]
    def verified_good_bot?(mode: :search_engines)
      return @_verified_good_bot if defined?(@_verified_good_bot) && @_verification_mode == mode

      @_verification_mode = mode
      @_verified_good_bot = check_bot_with_session_cache(mode: mode)
    end

    # Check specifically if request is from a verified AI bot.
    # Only positive results are cached in the session.
    #
    # @param strict [Boolean] See Service#verified_ai_bot?
    # @return [Boolean]
    def verified_ai_bot?(strict: true)
      cache_key = bot_session_cache_key
      return false unless cache_key

      # Strict and non-strict answers are cached separately.
      field = "ai_verified:#{strict ? 'strict' : 'lenient'}"
      cached = read_bot_session_cache(cache_key)
      return true if cached && cached[field] == true

      result = Service.verified_ai_bot?(
        request.remote_ip,
        request.user_agent,
        strict: strict
      )

      write_bot_session_cache(cache_key, field => true, "bot_type" => detected_bot_type&.to_s) if result

      result
    end

    # Get the detected bot type for the current request (if any)
    #
    # @return [Symbol, nil]
    def detected_bot_type
      # `defined?` memoization so a nil result is not recomputed on every call.
      return @_detected_bot_type if defined?(@_detected_bot_type)

      @_detected_bot_type = Service.detect_bot_type(request.user_agent)
    end

    # Check if user agent looks like a bot (regardless of verification)
    #
    # @return [Boolean]
    def bot_user_agent?
      detected_bot_type.present?
    end

    # Check if user agent looks like an AI bot
    #
    # @return [Boolean]
    def ai_bot_user_agent?
      Service.ai_bot_user_agent?(request.user_agent)
    end

    # Check if user agent looks like a search engine bot
    #
    # @return [Boolean]
    def search_bot_user_agent?
      Service.search_bot_user_agent?(request.user_agent)
    end

    private

    # Answers from the session entry when it already holds a result for this
    # mode; otherwise asks the Service and records the answer.
    def check_bot_with_session_cache(mode:)
      cache_key = bot_session_cache_key
      return false unless cache_key

      field = "verified:#{mode}"
      cached = read_bot_session_cache(cache_key)
      # An entry created by another check (different mode, or the AI check)
      # has no value under this field and must not be read as "unverified".
      return cached[field] == true if cached&.key?(field)

      result = Service.verified_good_bot?(
        request.remote_ip,
        request.user_agent,
        mode: mode
      )

      write_bot_session_cache(cache_key, field => (result ? true : false), "bot_type" => detected_bot_type&.to_s)
      result
    end

    # Session cache key for this request, or nil when IP or user agent is blank.
    def bot_session_cache_key
      return nil if request.remote_ip.blank? || request.user_agent.blank?

      Service.session_cache_key(request.remote_ip, request.user_agent)
    end

    # Returns the string-keyed entry stored under +key+, or nil when the
    # session is unavailable, nothing is stored, or the entry has expired.
    # Expired entries, and entries without a readable "at" timestamp (for
    # example legacy symbol-keyed ones), are deleted.
    def read_bot_session_cache(key)
      return nil unless bot_session_available?

      cache_store = session[:bot_verification]
      return nil unless cache_store.is_a?(Hash)

      entry = cache_store[key]
      return nil unless entry.is_a?(Hash)

      cached_at = entry["at"]
      ttl = BotVerification.configuration.session_cache_ttl.to_i
      if cached_at.nil? || (Time.current.to_i - cached_at.to_i) > ttl
        cache_store.delete(key)
        return nil
      end

      entry
    end

    # Merges +data+ into the entry stored under +key+ (keys are stringified),
    # creating it when missing or expired. An existing entry keeps its
    # original "at" timestamp, so merging never extends its lifetime.
    def write_bot_session_cache(key, data)
      return unless bot_session_available?

      cache_store = session[:bot_verification]
      cache_store = session[:bot_verification] = {} unless cache_store.is_a?(Hash)

      existing = read_bot_session_cache(key)

      # Limit cache size to prevent session bloat; evict only when a new
      # entry is about to be added.
      if existing.nil? && cache_store.size >= BOT_SESSION_CACHE_LIMIT
        oldest_key = cache_store.min_by { |_, v| v.is_a?(Hash) ? v["at"].to_i : 0 }&.first
        cache_store.delete(oldest_key) if oldest_key
      end

      entry = existing || { "at" => Time.current.to_i }
      cache_store[key] = entry.merge(data.transform_keys(&:to_s))
    end

    # Whether a session object that supports `[]` is reachable here.
    def bot_session_available?
      defined?(session) && session.respond_to?(:[])
    rescue ActionController::InvalidAuthenticityToken
      false
    end
  end
end
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module BotVerification
  # Fetches IP ranges from official sources
  # Can be called manually or scheduled via cron/Sidekiq
  #
  # @example Run manually
  #   BotVerification::IpRangeFetcher.refresh!
  #
  # @example Refresh specific bot type
  #   BotVerification::IpRangeFetcher.refresh!(:google)
  #
  class IpRangeFetcher
    # The HTML entities decoded before looking for JSON embedded in a page,
    # keyed by entity name (the text between "&" and ";").
    HTML_ENTITIES = { "quot" => '"', "amp" => "&", "lt" => "<", "gt" => ">" }.freeze
    private_constant :HTML_ENTITIES

    class << self
      # Refresh IP ranges from official sources
      #
      # @param bot_type [String, Symbol, nil] Specific bot type, or nil for all
      # @return [Hash] Results for each bot type
      def refresh!(bot_type = nil)
        if bot_type
          { bot_type.to_sym => refresh_bot_type(bot_type.to_s) }
        else
          refresh_all
        end
      end

      private

      # Refreshes every bot type listed in BotPatterns::IP_RANGE_SOURCES and
      # passes the combined results to the on_refresh_complete callback.
      def refresh_all
        log_info("Starting refresh of all bot IP ranges")

        results = {}
        BotPatterns::IP_RANGE_SOURCES.each_key do |type|
          results[type] = refresh_bot_type(type.to_s)
        end

        log_info("Completed refresh: #{results.inspect}")
        config.report_refresh_complete(results)
        results
      end

      # Fetches all sources for one bot type and imports whatever was fetched.
      # A failing source is logged and reported but does not stop the others.
      #
      # @param bot_type [String] bot type name
      # @return [Hash] `{ success: true, count: }` or `{ success: false, error: }`
      def refresh_bot_type(bot_type)
        sources = BotPatterns::IP_RANGE_SOURCES[bot_type.to_sym]
        return { success: false, error: "Unknown bot type: #{bot_type}" } unless sources

        ranges = []

        sources.each do |source|
          log_info("Fetching #{source[:name]} from #{source[:url]}")

          begin
            fetched = fetch_ranges_from_url(source[:url])
            ranges.concat(fetched.map { |r| r.merge(source_url: source[:url]) })
            log_info("Fetched #{fetched.size} ranges from #{source[:name]}")
          rescue StandardError => e
            log_error("Failed to fetch #{source[:name]}: #{e.message}")
            config.report_error(e, bot_type: bot_type, source: source[:name], url: source[:url])
          end
        end

        # NOTE(review): when only some sources failed, the partial list is still
        # imported. If import_ranges replaces all stored ranges for the bot type,
        # ranges from the failed source would be dropped - confirm against the model.
        if ranges.any?
          count = BotVerification.ip_range_model.import_ranges(bot_type, ranges)
          log_info("Imported #{count} ranges for #{bot_type}")
          { success: true, count: count }
        else
          log_warn("No ranges fetched for #{bot_type}")
          { success: false, error: "No ranges fetched" }
        end
      end

      # Performs a GET on +url+ and parses the response body into ranges.
      #
      # @param url [String] document publishing the IP ranges
      # @return [Array<Hash>] `{ cidr:, ip_version: }` hashes
      # @raise [RuntimeError] when the response is not a 2xx success
      def fetch_ranges_from_url(url)
        uri = URI.parse(url)
        http = Net::HTTP.new(uri.host, uri.port)
        http.use_ssl = uri.scheme == "https"
        http.verify_mode = config.verify_ssl ? OpenSSL::SSL::VERIFY_PEER : OpenSSL::SSL::VERIFY_NONE
        http.open_timeout = 10
        http.read_timeout = 30

        request = Net::HTTP::Get.new(uri.request_uri)
        request["User-Agent"] = "BotVerification Gem/#{VERSION}"

        response = http.request(request)

        unless response.is_a?(Net::HTTPSuccess)
          raise "HTTP #{response.code}: #{response.message}"
        end

        parse_ip_ranges(response.body)
      end

      # Parses a response body that is either a JSON document or an HTML page
      # with the JSON embedded in it.
      #
      # @param body [String]
      # @return [Array<Hash>] `{ cidr:, ip_version: }` hashes (possibly empty)
      def parse_ip_ranges(body)
        # Try direct JSON parse first
        data = JSON.parse(body)
        extract_prefixes(data)
      rescue JSON::ParserError
        # If direct parse fails, try to extract JSON from HTML (Amazon's format)
        extract_json_from_html(body)
      end

      # Converts a parsed ranges document into `{ cidr:, ip_version: }` hashes.
      # Reads the "prefixes" list and, from each entry, "ipv4Prefix" or
      # "ipv6Prefix". Anything not shaped that way (non-Hash document,
      # non-Hash entry, missing or non-String prefix) is skipped, not raised on.
      #
      # @param data [Object] result of JSON.parse
      # @return [Array<Hash>]
      def extract_prefixes(data)
        return [] unless data.is_a?(Hash)

        Array(data["prefixes"]).filter_map do |prefix|
          next unless prefix.is_a?(Hash)

          ipv4 = prefix["ipv4Prefix"]
          cidr = ipv4 || prefix["ipv6Prefix"]
          next unless cidr.is_a?(String) && !cidr.empty?

          # Ensure CIDR notation (Amazon uses bare IPs without /32)
          cidr = "#{cidr}/#{ipv4 ? 32 : 128}" unless cidr.include?("/")

          { cidr: cidr, ip_version: ipv4 ? 4 : 6 }
        end
      end

      # Extracts the ranges document from an HTML page that embeds it as
      # entity-escaped JSON. Returns [] (and logs) when nothing parseable is found.
      #
      # @param html [String]
      # @return [Array<Hash>]
      def extract_json_from_html(html)
        # Amazon embeds JSON in HTML with escaped quotes
        # Look for JSON-like structure with prefixes
        return [] unless html.include?("prefixes")

        # Unescape HTML entities in a single pass, so text produced by
        # decoding one entity is never decoded a second time.
        unescaped = html.gsub(/&(quot|amp|lt|gt);/) { HTML_ENTITIES.fetch(Regexp.last_match(1)) }

        # Try to find and parse JSON object containing prefixes
        match = unescaped.match(/\{\s*"creationTime"[^{]*"prefixes"\s*:\s*\[.*?\]\s*\}/m)
        return [] unless match

        data = JSON.parse(match[0])
        extract_prefixes(data)
      rescue JSON::ParserError => e
        log_error("Failed to parse embedded JSON: #{e.message}")
        []
      rescue StandardError => e
        log_error("Failed to extract JSON from HTML: #{e.message}")
        []
      end

      def config
        BotVerification.configuration
      end

      def log_info(message)
        config.logger.info("[BotVerification] #{message}")
      end

      def log_warn(message)
        config.logger.warn("[BotVerification] #{message}")
      end

      def log_error(message)
        config.logger.error("[BotVerification] #{message}")
      end
    end
  end
end
|