bot_verification 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/bot_verification/version"
4
+
5
# Gem specification for bot_verification: a Rails engine that checks whether a
# request claiming to be a search-engine or AI bot really originates from that
# service (IP range matching plus reverse DNS verification).
Gem::Specification.new do |spec|
  spec.name = "bot_verification"
  spec.version = BotVerification::VERSION
  spec.authors = ["Web Ventures Ltd"]
  spec.email = ["gems@dev.webven.nz"]

  spec.summary = "Verify legitimate search engine and AI bots by IP"
  spec.description = "A Rails engine for verifying that requests claiming to be from " \
                     "search engine bots (Google, Bing, etc.) and AI bots (GPTBot, PerplexityBot) " \
                     "are actually from those services, using IP range matching and reverse DNS verification."
  spec.homepage = "https://github.com/webventures/bot_verification"
  spec.license = "MIT"
  spec.required_ruby_version = ">= 3.2.0"

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = spec.homepage
  spec.metadata["changelog_uri"] = "#{spec.homepage}/blob/main/CHANGELOG.md"

  # Package every git-tracked file except the gemspec itself and
  # development-only paths (bin/, test/, spec/, features/, dot-git/CI files,
  # appveyor).
  #
  # `git ls-files` prints paths relative to the repository root, whereas
  # __FILE__ may be an absolute path depending on how the gemspec is loaded,
  # so the gemspec is matched by its basename rather than by `f == __FILE__`.
  gemspec = File.basename(__FILE__)
  excluded_paths = %r{\A(?:(?:bin|test|spec|features)/|\.(?:git|github|travis|circleci)|appveyor)}

  # IO.popen with chdir: avoids changing the process-wide working directory;
  # stderr is discarded so building outside a git checkout stays quiet.
  spec.files = IO.popen(%w[git ls-files -z], chdir: __dir__, err: IO::NULL) do |ls|
    ls.readlines("\x0", chomp: true).reject do |f|
      f == gemspec || f.match?(excluded_paths)
    end
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # Runtime dependencies
  spec.add_dependency "rails", ">= 7.0"
  spec.add_dependency "resolv", ">= 0.2"

  # Development-only dependencies
  spec.add_development_dependency "rspec", "~> 3.12"
  spec.add_development_dependency "sqlite3", ">= 2.1"
end
@@ -0,0 +1,121 @@
1
+ # frozen_string_literal: true
2
+
3
module BotVerification
  # Static lookup tables used to recognise bots and to locate their published
  # IP ranges: user-agent patterns, accepted reverse-DNS suffixes, the bot
  # types that have IP range data, and the URLs those ranges are fetched from.
  #
  # Every table is frozen recursively (not just the outer container) so the
  # allow-lists cannot be altered at runtime, accidentally or otherwise.
  module BotPatterns
    # Recursively freezes a Hash/Array structure and returns it.
    # Kept as a local (not a constant or method) so nothing extra is exposed.
    deep_freeze = lambda do |value|
      case value
      when Hash then value.each_value(&deep_freeze)
      when Array then value.each(&deep_freeze)
      end
      value.freeze
    end

    # =============================================================================
    # SEARCH ENGINE BOTS
    # =============================================================================

    # User agent patterns for search engine bots (case-insensitive)
    SEARCH_BOT_PATTERNS = deep_freeze.call(
      google: /Googlebot|Google-Extended|Mediapartners-Google|AdsBot-Google|APIs-Google/i,
      bing: /Bingbot|msnbot|BingPreview/i,
      apple: /Applebot/i,
      yandex: /YandexBot/i,
      baidu: /Baiduspider/i
    )

    # Valid reverse DNS suffixes for search engine bots
    SEARCH_BOT_DNS_SUFFIXES = deep_freeze.call(
      google: %w[.googlebot.com .google.com .googleusercontent.com],
      bing: %w[.search.msn.com],
      apple: %w[.applebot.apple.com],
      yandex: %w[.yandex.ru .yandex.net .yandex.com],
      baidu: %w[.crawl.baidu.com .crawl.baidu.jp]
    )

    # Search engine bots with IP ranges in database
    SEARCH_BOTS_WITH_IP_RANGES = %i[google bing].freeze

    # Search engine bots that support reverse DNS verification
    SEARCH_BOTS_WITH_DNS = %i[google bing apple yandex baidu].freeze

    # =============================================================================
    # AI BOTS
    # =============================================================================

    # User agent patterns for AI bots (case-insensitive)
    AI_BOT_PATTERNS = deep_freeze.call(
      openai_gptbot: /GPTBot/i,
      openai_chatgpt: /ChatGPT-User/i,
      openai_searchbot: /OAI-SearchBot/i,
      anthropic: /ClaudeBot|Claude-Web|anthropic-ai/i,
      perplexity: /PerplexityBot|Perplexity-User/i,
      amazon: /Amazonbot/i,
      cohere: /cohere-ai/i,
      meta: /meta-externalagent/i,
      bytedance: /Bytespider/i
    )

    # AI bots with officially published IP ranges (can be verified)
    AI_BOTS_WITH_IP_RANGES = %i[openai_gptbot openai_chatgpt openai_searchbot perplexity amazon].freeze

    # AI bots without official IP ranges (cannot be reliably verified)
    AI_BOTS_WITHOUT_VERIFICATION = %i[anthropic cohere meta bytedance].freeze

    # =============================================================================
    # SOCIAL/OTHER BOTS (no verification available)
    # =============================================================================

    # User agent patterns for social/preview bots (case-insensitive)
    SOCIAL_BOT_PATTERNS = deep_freeze.call(
      duckduckgo: /DuckDuckBot/i,
      facebook: /facebookexternalhit|Facebot/i,
      twitter: /Twitterbot/i,
      linkedin: /LinkedInBot/i,
      slack: /Slackbot/i,
      discord: /Discordbot/i,
      telegram: /TelegramBot/i
    )

    # =============================================================================
    # COMBINED
    # =============================================================================

    # Every known user-agent pattern, keyed by bot type
    ALL_BOT_PATTERNS = SEARCH_BOT_PATTERNS.merge(AI_BOT_PATTERNS).merge(SOCIAL_BOT_PATTERNS).freeze
    ALL_BOTS_WITH_IP_RANGES = (SEARCH_BOTS_WITH_IP_RANGES + AI_BOTS_WITH_IP_RANGES).freeze

    # Search engine bot types (as strings for database).
    # Derived from the symbol list so the two can never drift apart;
    # Symbol#name yields frozen strings.
    SEARCH_BOT_TYPES = SEARCH_BOTS_WITH_IP_RANGES.map(&:name).freeze

    # AI bot types with IP ranges (as strings for database), derived likewise
    AI_BOT_TYPES = AI_BOTS_WITH_IP_RANGES.map(&:name).freeze

    # All bot types that have IP ranges
    VALID_BOT_TYPES = (SEARCH_BOT_TYPES + AI_BOT_TYPES).freeze

    # =============================================================================
    # IP RANGE SOURCES
    # =============================================================================

    # Each entry lists the URLs (with a display name) that IP ranges are
    # fetched from for that bot type.
    SEARCH_ENGINE_SOURCES = deep_freeze.call(
      google: [
        { url: "https://developers.google.com/static/search/apis/ipranges/googlebot.json", name: "Googlebot" },
        { url: "https://developers.google.com/static/search/apis/ipranges/special-crawlers.json", name: "Google Special" }
      ],
      bing: [
        { url: "https://www.bing.com/toolbox/bingbot.json", name: "Bingbot" }
      ]
    )

    AI_BOT_SOURCES = deep_freeze.call(
      openai_gptbot: [
        { url: "https://openai.com/gptbot.json", name: "GPTBot" }
      ],
      openai_chatgpt: [
        { url: "https://openai.com/chatgpt-user.json", name: "ChatGPT-User" }
      ],
      openai_searchbot: [
        { url: "https://openai.com/searchbot.json", name: "OAI-SearchBot" }
      ],
      perplexity: [
        { url: "https://www.perplexity.ai/perplexitybot.json", name: "PerplexityBot" },
        { url: "https://www.perplexity.ai/perplexity-user.json", name: "Perplexity-User" }
      ],
      amazon: [
        { url: "https://developer.amazon.com/amazonbot/ip-addresses/", name: "Amazonbot" }
      ]
    )

    # All IP range sources, keyed by bot type
    IP_RANGE_SOURCES = SEARCH_ENGINE_SOURCES.merge(AI_BOT_SOURCES).freeze
  end
end
@@ -0,0 +1,139 @@
1
+ # frozen_string_literal: true
2
+
3
module BotVerification
  # Holds every user-tunable setting of the gem (table name, DNS/SSL options,
  # cache lifetimes, logger, custom model, callbacks) together with helpers
  # that resolve the effective logger, cache store and IP range model, and
  # that invoke the optional callbacks without letting them raise.
  class Configuration
    # Table name for storing bot IP ranges
    # Default: "bot_ip_ranges"
    attr_accessor :table_name

    # Skip DNS verification entirely (only use IP range matching)
    # Set to true if DNS lookups are unacceptable for your project
    # Default: false
    attr_accessor :skip_dns_verification

    # Verify SSL certificates when fetching IP ranges
    # Set to false in development if you have certificate issues
    # Default: true (always verify in production!)
    attr_accessor :verify_ssl

    # Timeout for each DNS lookup (reverse and forward) in seconds
    # Default: 1.0
    attr_accessor :dns_timeout

    # Total timeout for all DNS operations in seconds
    # Default: 2.0
    attr_accessor :dns_total_timeout

    # How long to cache verification results in Rails cache
    # Default: 24 hours
    attr_accessor :cache_ttl

    # How long to cache results in session
    # Default: 1 hour
    attr_accessor :session_cache_ttl

    # Cache key prefix for Rails cache
    # Default: "bot_verification"
    attr_accessor :cache_key_prefix

    # Logger instance (defaults to Rails.logger).
    # Only the writer is generated; the reader below adds the fallbacks.
    attr_writer :logger

    # Custom IP range model class name (optional)
    # If set, uses your own model instead of the built-in one
    # The model must include BotVerification::IpRangeModel
    attr_accessor :ip_range_model_name

    # Error callback - called when errors occur during IP range refresh
    # Useful for integrating with error tracking services (Airbrake, Sentry, etc.)
    # @example
    #   config.on_error = ->(error, context) { Airbrake.notify(error, context) }
    attr_accessor :on_error

    # Refresh complete callback - called after IP range refresh completes
    # Useful for notifications or monitoring
    # @example
    #   config.on_refresh_complete = ->(results) { SlackNotifier.notify("Bot IPs refreshed: #{results}") }
    attr_accessor :on_refresh_complete

    # Verification callback - called after each bot verification attempt
    # Useful for collecting statistics about bot traffic
    # @example
    #   config.on_verification = ->(result) {
    #     # result = { bot_type: :google, verified: true, mode: :search_engines, ip: "...", user_agent: "..." }
    #     BotRequestStats.increment(result[:bot_type], result[:verified])
    #   }
    attr_accessor :on_verification

    # Populates every setting with its documented default.
    def initialize
      @table_name = "bot_ip_ranges"
      @skip_dns_verification = false
      @verify_ssl = true
      @dns_timeout = 1.0
      @dns_total_timeout = 2.0
      @cache_ttl = 24.hours
      @session_cache_ttl = 1.hour
      @cache_key_prefix = "bot_verification"
      @logger = nil
      @ip_range_model_name = nil
      @on_error = nil
      @on_refresh_complete = nil
      @on_verification = nil
    end

    # Report an error through the configured callback.
    # Does nothing when no callable is configured; an exception raised by the
    # callback itself is logged instead of propagating.
    #
    # @param error [Exception] the error being reported
    # @param context [Hash] extra details forwarded to the callback
    def report_error(error, context = {})
      run_callback(:on_error, on_error, error, context)
    end

    # Report refresh completion through the configured callback.
    # Same no-op / exception-swallowing rules as #report_error.
    #
    # @param results [Hash] per-bot-type refresh results
    def report_refresh_complete(results)
      run_callback(:on_refresh_complete, on_refresh_complete, results)
    end

    # Report verification attempt through the configured callback.
    # Same no-op / exception-swallowing rules as #report_error.
    #
    # @param result [Hash] details of the verification attempt
    def report_verification(result)
      run_callback(:on_verification, on_verification, result)
    end

    # Effective logger: the explicitly assigned one, else Rails.logger when it
    # is available and set, else a single memoized STDOUT logger.
    # Never returns nil, so the rescue paths above can always log.
    #
    # @return [Logger]
    def logger
      @logger || rails_logger || (@fallback_logger ||= Logger.new($stdout))
    end

    # Cache store: Rails.cache when available, otherwise an in-memory store.
    # The choice is memoized on first use.
    def cache
      @cache ||= if defined?(Rails) && Rails.respond_to?(:cache) && Rails.cache
                   Rails.cache
                 else
                   ActiveSupport::Cache::MemoryStore.new
                 end
    end

    # Get the IP range model class.
    # Falls back to the built-in model when no custom name is configured
    # (nil or blank); accepts either a class name or the class itself.
    #
    # @return [Class]
    def ip_range_model_class
      custom = @ip_range_model_name
      return BotVerification::IpRange if custom.blank?

      custom.is_a?(Module) ? custom : custom.to_s.constantize
    end

    # Validate configuration
    #
    # @raise [ConfigurationError] when table_name is blank or a DNS timeout is
    #   missing, non-numeric, zero or negative
    def validate!
      raise ConfigurationError, "table_name cannot be blank" if table_name.blank?
      raise ConfigurationError, "dns_timeout must be positive" unless positive_number?(dns_timeout)
      raise ConfigurationError, "dns_total_timeout must be positive" unless positive_number?(dns_total_timeout)
    end

    private

    # Invokes +callback+ with +args+ if it is callable; logs (rather than
    # raises) any StandardError the callback throws.
    def run_callback(name, callback, *args)
      return unless callback.respond_to?(:call)

      callback.call(*args)
    rescue StandardError => e
      logger.error("[BotVerification] Error in #{name} callback: #{e.message}")
    end

    # Rails.logger when Rails is loaded and exposes one; nil otherwise.
    def rails_logger
      Rails.logger if defined?(Rails) && Rails.respond_to?(:logger)
    end

    # True only for values that can answer #positive? and do so with true;
    # nil and other non-numeric values are treated as invalid, not as errors.
    def positive_number?(value)
      value.respond_to?(:positive?) && value.positive?
    end
  end
end
@@ -0,0 +1,150 @@
1
+ # frozen_string_literal: true
2
+
3
module BotVerification
  # Controller concern for bot verification with session-based caching
  #
  # Verification results are memoized per request and cached in
  # session[:bot_verification]. Session entries are stored under a key that
  # combines the client (IP + user agent) with the kind of check performed
  # (good-bot mode, or AI strictness), so a result obtained for one check is
  # never reused for a different one.
  #
  # @example Usage in a controller
  #   class MyController < ApplicationController
  #     include BotVerification::ControllerConcern
  #
  #     def show
  #       if verified_good_bot?
  #         # Serve full content to verified bots
  #       else
  #         # Rate limit or require authentication
  #       end
  #     end
  #   end
  #
  module ControllerConcern
    extend ActiveSupport::Concern

    # Maximum number of entries kept in session[:bot_verification]
    BOT_SESSION_CACHE_LIMIT = 10

    # Check if current request is from a verified good bot
    # Uses session caching to avoid repeated verification calls
    #
    # @param mode [Symbol] Verification mode (see Service)
    # @return [Boolean]
    def verified_good_bot?(mode: :search_engines)
      return @_verified_good_bot if defined?(@_verified_good_bot) && @_verification_mode == mode

      @_verification_mode = mode
      @_verified_good_bot = check_bot_with_session_cache(mode: mode)
    end

    # Check specifically if request is from a verified AI bot
    # Only positive results are cached in the session.
    #
    # @param strict [Boolean] See Service#verified_ai_bot?
    # @return [Boolean]
    def verified_ai_bot?(strict: true)
      # Scope the entry by strictness so a lenient result can never satisfy
      # a strict check (and vice versa).
      cache_key = bot_session_cache_key(scope: "ai:#{strict ? 'strict' : 'lenient'}")
      return false unless cache_key

      cached = read_bot_session_cache(cache_key)
      return true if cached && cached[:ai_verified] == true

      result = Service.verified_ai_bot?(
        request.remote_ip,
        request.user_agent,
        strict: strict
      )

      write_bot_session_cache(cache_key, ai_verified: true, bot_type: detected_bot_type) if result

      result
    end

    # Get the detected bot type for the current request (if any)
    # Memoized per request, including a nil ("not a known bot") result.
    #
    # @return [Symbol, nil]
    def detected_bot_type
      return @_detected_bot_type if defined?(@_detected_bot_type)

      @_detected_bot_type = Service.detect_bot_type(request.user_agent)
    end

    # Check if user agent looks like a bot (regardless of verification)
    #
    # @return [Boolean]
    def bot_user_agent?
      detected_bot_type.present?
    end

    # Check if user agent looks like an AI bot
    #
    # @return [Boolean]
    def ai_bot_user_agent?
      Service.ai_bot_user_agent?(request.user_agent)
    end

    # Check if user agent looks like a search engine bot
    #
    # @return [Boolean]
    def search_bot_user_agent?
      Service.search_bot_user_agent?(request.user_agent)
    end

    private

    # Returns the cached good-bot result for +mode+ if the session has a
    # fresh one; otherwise asks the Service and caches the answer
    # (positive or negative).
    def check_bot_with_session_cache(mode:)
      # The mode is part of the key: a result verified under one mode must
      # not be served for another.
      cache_key = bot_session_cache_key(scope: "good:#{mode}")
      return false unless cache_key

      cached = read_bot_session_cache(cache_key)
      return cached[:verified] == true if cached

      result = Service.verified_good_bot?(
        request.remote_ip,
        request.user_agent,
        mode: mode
      )

      write_bot_session_cache(cache_key, verified: result, bot_type: detected_bot_type)
      result
    end

    # Session cache key for the current client, optionally suffixed with a
    # scope describing the kind of check. Returns nil when the request has no
    # IP or user agent to key on.
    def bot_session_cache_key(scope: nil)
      return nil if request.remote_ip.blank? || request.user_agent.blank?

      base = Service.session_cache_key(request.remote_ip, request.user_agent)
      return base if scope.nil? || base.blank?

      "#{base}:#{scope}"
    end

    # Reads one cache entry from the session.
    #
    # @return [Hash, nil] the entry with symbol keys, or nil when the session
    #   is unavailable, the entry is missing/malformed, or it has expired
    #   (expired entries are removed)
    def read_bot_session_cache(key)
      return nil unless bot_session_available?

      cache_store = session[:bot_verification]
      return nil unless cache_store.is_a?(Hash)

      raw_entry = cache_store[key]
      return nil unless raw_entry.is_a?(Hash)

      # Session stores that serialize as JSON (e.g. the cookie store) return
      # nested hashes with string keys on later requests; normalize so that
      # :verified / :ai_verified / :at are readable in both cases.
      entry = raw_entry.transform_keys { |k| k.respond_to?(:to_sym) ? k.to_sym : k }

      cached_at = entry[:at]
      ttl = BotVerification.configuration.session_cache_ttl.to_i
      if cached_at && (Time.current.to_i - cached_at.to_i) > ttl
        cache_store.delete(key)
        return nil
      end

      entry
    end

    # Stores +data+ (stamped with the current time under :at) in the session,
    # evicting the oldest entry first when the cache is full.
    def write_bot_session_cache(key, data)
      return unless bot_session_available?

      cache_store = session[:bot_verification]
      cache_store = {} unless cache_store.is_a?(Hash)

      # Limit cache size to prevent session bloat (overwriting an existing
      # key does not grow the cache, so nothing is evicted in that case)
      if !cache_store.key?(key) && cache_store.size >= BOT_SESSION_CACHE_LIMIT
        oldest_key = cache_store.min_by { |_, entry| bot_session_entry_timestamp(entry) }&.first
        cache_store.delete(oldest_key) if oldest_key
      end

      cache_store[key] = data.merge(at: Time.current.to_i)
      # Assign back explicitly so the session holds the updated hash even if
      # it previously contained nothing (or something that was not a Hash).
      session[:bot_verification] = cache_store
    end

    # Timestamp of a stored entry, tolerating string keys (after session
    # deserialization) and malformed entries, which sort as oldest.
    def bot_session_entry_timestamp(entry)
      return 0 unless entry.is_a?(Hash)

      (entry[:at] || entry["at"]).to_i
    end

    # True when a session object usable as a key/value store is reachable.
    def bot_session_available?
      defined?(session) && session.respond_to?(:[])
    rescue ActionController::InvalidAuthenticityToken
      false
    end
  end
end
@@ -0,0 +1,155 @@
1
+ # frozen_string_literal: true
2
+
3
module BotVerification
  # Fetches IP ranges from official sources
  # Can be called manually or scheduled via cron/Sidekiq
  #
  # Sources are listed in BotPatterns::IP_RANGE_SOURCES. Each one is
  # downloaded over HTTP(S), parsed (plain JSON, or JSON embedded in an HTML
  # page) and the collected ranges are handed to the IP range model's
  # import_ranges.
  #
  # @example Run manually
  #   BotVerification::IpRangeFetcher.refresh!
  #
  # @example Refresh specific bot type
  #   BotVerification::IpRangeFetcher.refresh!(:google)
  #
  class IpRangeFetcher
    class << self
      # Refresh IP ranges from official sources
      #
      # Note: only the refresh-all path (no argument) invokes the
      # on_refresh_complete callback.
      #
      # @param bot_type [String, Symbol, nil] Specific bot type, or nil for all
      # @return [Hash] Results for each bot type; each value is
      #   { success: true, count: Integer } or { success: false, error: String }
      def refresh!(bot_type = nil)
        return refresh_all unless bot_type

        { bot_type.to_sym => refresh_bot_type(bot_type.to_s) }
      end

      private

      # Refreshes every bot type that has sources, then reports the combined
      # results through the configuration's refresh-complete callback.
      def refresh_all
        log_info("Starting refresh of all bot IP ranges")

        results = BotPatterns::IP_RANGE_SOURCES.each_key.to_h do |type|
          [type, refresh_bot_type(type.to_s)]
        end

        log_info("Completed refresh: #{results.inspect}")
        config.report_refresh_complete(results)
        results
      end

      # Fetches all sources of one bot type and imports whatever was
      # retrieved. A failing source is logged/reported and skipped; the
      # import still runs with the ranges from the remaining sources.
      # NOTE(review): if import_ranges replaces existing rows, a partial fetch
      # could drop the failed source's ranges — confirm against the model.
      def refresh_bot_type(bot_type)
        sources = BotPatterns::IP_RANGE_SOURCES[bot_type.to_sym]
        # Same shape as the other failure result so callers can always
        # branch on :success.
        return { success: false, error: "Unknown bot type: #{bot_type}" } unless sources

        ranges = sources.flat_map { |source| fetch_source_ranges(bot_type, source) }

        if ranges.any?
          count = BotVerification.ip_range_model.import_ranges(bot_type, ranges)
          log_info("Imported #{count} ranges for #{bot_type}")
          { success: true, count: count }
        else
          log_warn("No ranges fetched for #{bot_type}")
          { success: false, error: "No ranges fetched" }
        end
      end

      # Downloads one source and tags each range with the URL it came from.
      # Any StandardError is logged, passed to the error callback and turned
      # into an empty result so the other sources are still processed.
      def fetch_source_ranges(bot_type, source)
        log_info("Fetching #{source[:name]} from #{source[:url]}")

        fetched = fetch_ranges_from_url(source[:url])
        tagged = fetched.map { |range| range.merge(source_url: source[:url]) }
        log_info("Fetched #{fetched.size} ranges from #{source[:name]}")
        tagged
      rescue StandardError => e
        log_error("Failed to fetch #{source[:name]}: #{e.message}")
        config.report_error(e, bot_type: bot_type, source: source[:name], url: source[:url])
        []
      end

      # Performs a GET on +url+ and parses the body into range hashes.
      # Redirects are not followed.
      #
      # @raise [RuntimeError] when the response is not a 2xx success
      def fetch_ranges_from_url(url)
        uri = URI.parse(url)
        http = Net::HTTP.new(uri.host, uri.port)
        http.use_ssl = uri.scheme == "https"
        http.verify_mode = config.verify_ssl ? OpenSSL::SSL::VERIFY_PEER : OpenSSL::SSL::VERIFY_NONE
        http.open_timeout = 10
        http.read_timeout = 30

        request = Net::HTTP::Get.new(uri.request_uri)
        request["User-Agent"] = "BotVerification Gem/#{VERSION}"

        response = http.request(request)
        raise "HTTP #{response.code}: #{response.message}" unless response.is_a?(Net::HTTPSuccess)

        parse_ip_ranges(response.body)
      end

      # Parses a response body: plain JSON first, falling back to JSON
      # embedded in HTML (Amazon's format) when the body is not valid JSON.
      #
      # @return [Array<Hash>] hashes of { cidr:, ip_version: }
      def parse_ip_ranges(body)
        text = body.to_s
        extract_prefixes(JSON.parse(text))
      rescue JSON::ParserError
        extract_json_from_html(text)
      end

      # Converts a parsed payload of the form
      # { "prefixes" => [{ "ipv4Prefix" => ... } | { "ipv6Prefix" => ... }] }
      # into range hashes. Anything that does not match that shape (payload
      # not a Hash, "prefixes" not an Array, entries that are not Hashes or
      # carry no usable prefix string) is skipped rather than raising.
      def extract_prefixes(data)
        return [] unless data.is_a?(Hash)

        prefixes = data["prefixes"]
        return [] unless prefixes.is_a?(Array)

        prefixes.filter_map do |prefix|
          next unless prefix.is_a?(Hash)

          # An IPv4 prefix takes precedence when both keys are present
          ip_version = prefix["ipv4Prefix"] ? 4 : 6
          cidr = prefix["ipv4Prefix"] || prefix["ipv6Prefix"]
          next unless cidr.is_a?(String) && !cidr.empty?

          # Ensure CIDR notation (Amazon uses bare IPs without /32)
          cidr = "#{cidr}/#{ip_version == 4 ? 32 : 128}" unless cidr.include?("/")

          { cidr: cidr, ip_version: ip_version }
        end
      end

      # Pulls the prefixes JSON object out of an HTML page in which the JSON
      # is embedded with HTML-escaped quotes. Returns [] (after logging) when
      # nothing parseable is found.
      def extract_json_from_html(html)
        # Amazon embeds JSON in HTML with escaped quotes
        # Look for JSON-like structure with prefixes
        return [] unless html.include?("prefixes")

        # Unescape HTML entities (&amp; last, so text is unescaped only once)
        unescaped = html.gsub("&quot;", '"').gsub("&amp;", "&").gsub("&lt;", "<").gsub("&gt;", ">")

        # Try to find and parse JSON object containing prefixes
        match = unescaped.match(/\{\s*"creationTime"[^{]*"prefixes"\s*:\s*\[.*?\]\s*\}/m)
        return [] unless match

        extract_prefixes(JSON.parse(match[0]))
      rescue JSON::ParserError => e
        log_error("Failed to parse embedded JSON: #{e.message}")
        []
      rescue StandardError => e
        log_error("Failed to extract JSON from HTML: #{e.message}")
        []
      end

      # Shortcut to the gem-wide configuration object
      def config
        BotVerification.configuration
      end

      def log_info(message)
        config.logger.info("[BotVerification] #{message}")
      end

      def log_warn(message)
        config.logger.warn("[BotVerification] #{message}")
      end

      def log_error(message)
        config.logger.error("[BotVerification] #{message}")
      end
    end
  end
end