spyglasses 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,491 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'net/http'
4
+ require 'uri'
5
+ require 'json'
6
+ require 'thread'
7
+
8
+ module Spyglasses
9
+ class Client
10
+ include Spyglasses::Types
11
+
12
+ attr_reader :configuration, :patterns, :ai_referrers, :pattern_version, :last_pattern_sync
13
+
14
# Builds a client seeded with the bundled default patterns and, when
# auto-sync is enabled and an API key is configured, starts a background
# thread that refreshes them from the API.
#
# @param config [Configuration, nil] settings to use; a new Configuration
#   is created when nil is given
def initialize(config = nil)
  @configuration = config || Configuration.new
  @mutex = Mutex.new
  @pattern_regex_cache = {}

  @patterns = []
  @ai_referrers = []
  @pattern_version = '1.0.0'
  @last_pattern_sync = 0

  # Property settings loaded from API
  @custom_allows = []
  @custom_blocks = []
  @block_ai_model_trainers = false

  load_default_patterns

  # Nothing more to do unless a background sync is both wanted and possible.
  return unless @configuration.auto_sync? && @configuration.api_key_present?

  Thread.new do
    begin
      sync_patterns
    rescue => e
      # Failures are only reported through the debug log.
      log_debug("Error syncing patterns: #{e.message}")
    end
  end
end
41
+
42
# Fetches patterns, AI referrers and property settings from the configured
# patterns endpoint and installs them while holding the mutex.
#
# @return [ApiPatternResponse, String] the parsed response on success; a
#   message string when no API key is set, when the server replies with a
#   non-success status, or when an error is raised during the sync
def sync_patterns
  unless @configuration.api_key_present?
    missing_key = 'No API key set for pattern sync'
    log_debug(missing_key)
    return missing_key
  end

  begin
    endpoint = URI(@configuration.patterns_endpoint)

    connection = Net::HTTP.new(endpoint.host, endpoint.port)
    connection.use_ssl = endpoint.scheme == 'https'
    connection.read_timeout = 30
    connection.open_timeout = 10

    get = Net::HTTP::Get.new(endpoint)
    get['Content-Type'] = 'application/json'
    get['x-api-key'] = @configuration.api_key

    reply = connection.request(get)

    if reply.is_a?(Net::HTTPSuccess)
      synced = ApiPatternResponse.new(JSON.parse(reply.body))

      # Swap everything in one critical section.
      @mutex.synchronize do
        @patterns = synced.patterns
        @ai_referrers = synced.ai_referrers
        @pattern_version = synced.version
        @last_pattern_sync = Time.now.to_i

        @block_ai_model_trainers = synced.property_settings.block_ai_model_trainers
        @custom_blocks = synced.property_settings.custom_blocks
        @custom_allows = synced.property_settings.custom_allows

        # Compiled regexes belong to the previous pattern set.
        @pattern_regex_cache.clear
      end

      log_debug("Synced #{@patterns.length} patterns and #{@ai_referrers.length} AI referrers")
      log_debug("Property settings: block_ai_model_trainers=#{@block_ai_model_trainers}, custom_blocks=#{@custom_blocks.length}, custom_allows=#{@custom_allows.length}")

      synced
    else
      http_failure = "Pattern sync HTTP error #{reply.code}: #{reply.message}"
      log_debug(http_failure)
      http_failure
    end
  rescue => e
    failure = "Error syncing patterns: #{e.message}"
    log_debug(failure)
    failure
  end
end
98
+
99
# Checks a user agent string against the loaded bot patterns, in order, and
# reports the first pattern that matches.
#
# @param user_agent [String, nil] the request's User-Agent header
# @return [DetectionResult] a bot result for the first matching pattern, or
#   an empty result when the user agent is nil/empty or nothing matches
def detect_bot(user_agent)
  return DetectionResult.new if !user_agent || user_agent.empty?

  shown = user_agent[0..149]
  suffix = user_agent.length > 150 ? '...' : ''
  log_debug("Checking user agent: \"#{shown}#{suffix}\"")
  log_debug("Testing against #{@patterns.length} bot patterns")

  @patterns.each do |candidate|
    begin
      matcher = get_regex_for_pattern(candidate.pattern)
      log_debug("Testing pattern: \"#{candidate.pattern}\" (#{candidate.type || 'unknown'} - #{candidate.company || 'unknown company'})")

      next unless matcher.match?(user_agent)

      blocked = should_block_pattern?(candidate)

      log_debug("✅ BOT DETECTED! Pattern matched: \"#{candidate.pattern}\"")
      log_debug("Bot details: type=#{candidate.type}, category=#{candidate.category}, subcategory=#{candidate.subcategory}, company=#{candidate.company}, is_ai_model_trainer=#{candidate.is_ai_model_trainer}, should_block=#{blocked}")

      details = BotInfo.new(
        pattern: candidate.pattern,
        type: candidate.type || 'unknown',
        category: candidate.category || 'Unknown',
        subcategory: candidate.subcategory || 'Unclassified',
        company: candidate.company,
        is_compliant: candidate.is_compliant || false,
        is_ai_model_trainer: candidate.is_ai_model_trainer || false,
        intent: candidate.intent || 'unknown',
        url: candidate.url
      )

      return DetectionResult.new(
        is_bot: true,
        should_block: blocked,
        source_type: 'bot',
        matched_pattern: candidate.pattern,
        info: details
      )
    rescue => e
      # An error on one pattern is logged and the scan moves on to the next.
      log_debug("Error with pattern #{candidate.pattern}: #{e.message}")
    end
  end

  log_debug('No bot patterns matched user agent')
  DetectionResult.new
end
145
+
146
# Detect if a referrer is from an AI platform.
#
# Matching rules, applied to each pattern of each known AI referrer:
# * the lower-cased pattern is looked up as a substring of the value
#   returned by extract_hostname;
# * additionally, a pattern that contains a "/" (for example
#   "bing.com/chat") is split at the first "/" into a host part and a path
#   part, and matches when the hostname contains the host part and the
#   referrer's path starts with "/" followed by the path part. Without this
#   a host+path pattern could never match a parsed URL, because a hostname
#   never contains "/".
#
# @param referrer [String, nil] the request's Referer header
# @return [DetectionResult] an 'ai_referrer' result carrying the matching
#   referrer entry and the pattern as configured, or an empty result when
#   the referrer is nil/empty or nothing matches
def detect_ai_referrer(referrer)
  return DetectionResult.new unless referrer && !referrer.empty?

  log_debug("Checking referrer: \"#{referrer}\"")

  # Extract hostname from referrer
  hostname = extract_hostname(referrer)
  log_debug("Extracted hostname: \"#{hostname}\"")

  # Path component, used only by patterns that include a "/".
  path = begin
    URI.parse(referrer).path.to_s.downcase
  rescue URI::InvalidURIError
    ''
  end

  @ai_referrers.each do |ai_referrer|
    log_debug("Testing AI referrer: \"#{ai_referrer.name}\" (#{ai_referrer.company}) with patterns: #{ai_referrer.patterns.join(', ')}")

    ai_referrer.patterns.each do |pattern|
      log_debug("Testing AI referrer pattern: \"#{pattern}\" against hostname: \"#{hostname}\"")

      # The hostname is already lower-cased, so compare case-insensitively.
      needle = pattern.downcase
      host_part, path_part = needle.split('/', 2)

      path_aware_match = !path_part.nil? &&
                         !host_part.nil? && !host_part.empty? &&
                         hostname.include?(host_part) &&
                         path.start_with?("/#{path_part}")

      next unless hostname.include?(needle) || path_aware_match

      log_debug("✅ AI REFERRER DETECTED! Pattern matched: \"#{pattern}\"")
      log_debug("AI referrer details: name=#{ai_referrer.name}, company=#{ai_referrer.company}, id=#{ai_referrer.id}")

      return DetectionResult.new(
        is_bot: false,
        should_block: false,
        source_type: 'ai_referrer',
        matched_pattern: pattern,
        info: ai_referrer
      )
    end
  end

  DetectionResult.new
end
179
+
180
# Runs bot detection first and, only when no bot is found and a referrer
# was supplied, AI referrer detection.
#
# @param user_agent [String, nil] the request's User-Agent header
# @param referrer [String, nil] the request's Referer header
# @return [DetectionResult] the bot result, the AI referrer result, or an
#   empty result when neither check matched
def detect(user_agent, referrer = nil)
  ua_summary =
    if user_agent
      "\"#{user_agent[0..99]}#{user_agent.length > 100 ? '...' : ''}\""
    else
      'nil'
    end
  log_debug("detect() called with user_agent: #{ua_summary}, referrer: #{referrer || 'nil'}")

  # Check for bot first
  from_bot = detect_bot(user_agent)
  if from_bot.is_bot
    log_debug('🤖 Final result: BOT detected, returning bot result')
    return from_bot
  end

  unless referrer
    log_debug('No referrer provided, skipping AI referrer detection')
    return DetectionResult.new
  end

  log_debug('No bot detected, starting AI referrer detection...')
  from_referrer = detect_ai_referrer(referrer)
  return DetectionResult.new unless from_referrer.source_type == 'ai_referrer'

  log_debug('🧠 Final result: AI REFERRER detected, returning referrer result')
  from_referrer
end
205
+
206
# Reports a detected bot or AI-referrer visit to the collector endpoint.
# The HTTP call itself happens on a new thread so the caller is not blocked.
#
# @param detection_result [DetectionResult] outcome of a detect call
# @param request_info [Hash] request details read by symbol key (:url,
#   :user_agent, :ip_address, :request_method, :request_path,
#   :request_query, :referrer, :response_status, :response_time_ms,
#   :headers)
# @return [Thread, nil] the sending thread, or nil when no API key is set
#   or the result's source type is 'none'
def log_request(detection_result, request_info)
  log_debug("log_request() called for source_type: #{detection_result.source_type}")

  reportable = @configuration.api_key_present? && detection_result.source_type != 'none'
  return unless reportable

  log_debug("Preparing to log #{detection_result.source_type} event to collector")

  # Source-specific metadata, appended after the common was_blocked flag.
  details = {}
  if detection_result.source_type == 'bot' && detection_result.info
    bot = detection_result.info
    details = {
      agent_type: bot.type,
      agent_category: bot.category,
      agent_subcategory: bot.subcategory,
      company: bot.company,
      is_compliant: bot.is_compliant,
      intent: bot.intent,
      confidence: 0.9,
      detection_method: 'pattern_match'
    }
  elsif detection_result.source_type == 'ai_referrer' && detection_result.info
    source = detection_result.info
    details = {
      source_type: 'ai_referrer',
      referrer_id: source.id,
      referrer_name: source.name,
      company: source.company
    }
  end
  metadata = { was_blocked: detection_result.should_block }.merge(details)

  # Without an explicit status, derive one from the blocking decision.
  fallback_status = detection_result.should_block ? 403 : 200

  payload = CollectorPayload.new(
    url: request_info[:url],
    user_agent: request_info[:user_agent],
    ip_address: request_info[:ip_address],
    request_method: request_info[:request_method],
    request_path: request_info[:request_path],
    request_query: request_info[:request_query],
    referrer: request_info[:referrer],
    response_status: request_info[:response_status] || fallback_status,
    response_time_ms: request_info[:response_time_ms] || 0,
    headers: request_info[:headers] || {},
    platform_type: @configuration.platform_type,
    metadata: metadata
  )

  # Send request in background thread to avoid blocking
  Thread.new { send_collector_request(payload, detection_result.source_type) }
end
259
+
260
+ private
261
+
262
# Installs the built-in bot patterns and AI referrers used until (or
# instead of) a successful API sync.
#
# @return [Array<AiReferrerInfo>] the default AI referrer list
def load_default_patterns
  # Default patterns similar to the TypeScript SDK.
  # Each row is [pattern, url, type, company].
  assistant_rows = [
    ['ChatGPT-User\/[0-9]', 'https://platform.openai.com/docs/bots', 'chatgpt-user', 'OpenAI'],
    ['Perplexity-User\/[0-9]', 'https://docs.perplexity.ai/guides/bots', 'perplexity-user', 'Perplexity AI'],
    ['Gemini-User\/[0-9]', 'https://ai.google.dev/gemini-api/docs/bots', 'gemini-user', 'Google'],
    ['Claude-User\/[0-9]', 'https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler', 'claude-user', 'Anthropic']
  ]
  trainer_rows = [
    ['CCBot\/[0-9]', 'https://commoncrawl.org/ccbot', 'ccbot', 'Common Crawl'],
    ['ClaudeBot\/[0-9]', 'https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler', 'claude-bot', 'Anthropic'],
    ['GPTBot\/[0-9]', 'https://platform.openai.com/docs/gptbot', 'gptbot', 'OpenAI'],
    ['meta-externalagent\/[0-9]', 'https://developers.facebook.com/docs/sharing/webmasters/crawler', 'meta-externalagent', 'Meta'],
    ['Applebot-Extended\/[0-9]', 'https://support.apple.com/en-us/119829', 'applebot-extended', 'Apple']
  ]

  # AI Assistants
  assistants = assistant_rows.map do |pattern, url, type, company|
    BotPattern.new(
      pattern: pattern,
      url: url,
      type: type,
      category: 'AI Agent',
      subcategory: 'AI Assistants',
      company: company,
      is_compliant: true,
      is_ai_model_trainer: false,
      intent: 'UserQuery'
    )
  end

  # AI Model Training Crawlers
  trainers = trainer_rows.map do |pattern, url, type, company|
    BotPattern.new(
      pattern: pattern,
      url: url,
      type: type,
      category: 'AI Crawler',
      subcategory: 'Model Training Crawlers',
      company: company,
      is_compliant: true,
      is_ai_model_trainer: true,
      intent: 'DataCollection'
    )
  end

  @patterns = assistants + trainers

  # Default AI referrers.
  # Each row is [id, name, company, url, patterns, description].
  referrer_rows = [
    ['chatgpt', 'ChatGPT', 'OpenAI', 'https://chat.openai.com',
     ['chat.openai.com', 'chatgpt.com'],
     'Traffic from ChatGPT users clicking on links'],
    ['claude', 'Claude', 'Anthropic', 'https://claude.ai',
     ['claude.ai'],
     'Traffic from Claude users clicking on links'],
    ['perplexity', 'Perplexity', 'Perplexity AI', 'https://perplexity.ai',
     ['perplexity.ai'],
     'Traffic from Perplexity users clicking on links'],
    ['gemini', 'Gemini', 'Google', 'https://gemini.google.com',
     ['gemini.google.com', 'bard.google.com'],
     'Traffic from Gemini users clicking on links'],
    ['copilot', 'Microsoft Copilot', 'Microsoft', 'https://copilot.microsoft.com/',
     ['copilot.microsoft.com', 'bing.com/chat'],
     'Traffic from Microsoft Copilot users clicking on links']
  ]

  @ai_referrers = referrer_rows.map do |id, name, company, url, patterns, description|
    AiReferrerInfo.new(
      id: id,
      name: name,
      company: company,
      url: url,
      patterns: patterns,
      description: description
    )
  end
end
413
+
414
# Returns the case-insensitive Regexp for a pattern source string,
# compiling it on first use and reusing the cached object afterwards.
#
# @param pattern [String] regular expression source
# @return [Regexp]
# @raise [RegexpError] when the source does not compile (nothing is cached)
def get_regex_for_pattern(pattern)
  @pattern_regex_cache.fetch(pattern) do
    @pattern_regex_cache[pattern] = Regexp.new(pattern, Regexp::IGNORECASE)
  end
end
419
+
420
# Decides whether a matched pattern should be blocked under the current
# property settings. Precedence: any custom allow rule wins, then any
# custom block rule, then the global "block AI model trainers" switch.
#
# Rules are looked up under four keys, from most to least specific origin:
# "pattern:<pattern>", "category:<category>",
# "subcategory:<category>:<subcategory>" and
# "type:<category>:<subcategory>:<type>".
#
# @param pattern_data [#pattern, #category, #subcategory, #type,
#   #is_ai_model_trainer] the matched pattern
# @return [Boolean]
def should_block_pattern?(pattern_data)
  # Missing classification falls back to the same placeholders every time.
  category = pattern_data.category || 'Unknown'
  subcategory = pattern_data.subcategory || 'Unclassified'
  type = pattern_data.type || 'unknown'

  rule_keys = [
    "pattern:#{pattern_data.pattern}",
    "category:#{category}",
    "subcategory:#{category}:#{subcategory}",
    "type:#{category}:#{subcategory}:#{type}"
  ]

  # Explicit allows override everything else.
  return false if rule_keys.any? { |key| @custom_allows.include?(key) }

  # Explicit blocks come next.
  return true if rule_keys.any? { |key| @custom_blocks.include?(key) }

  # Finally the global AI model trainer setting; otherwise do not block.
  if @block_ai_model_trainers && pattern_data.is_ai_model_trainer
    true
  else
    false
  end
end
447
+
448
# Returns the lower-cased hostname of a referrer URL. When the referrer
# cannot be parsed as a URI, or parses without a hostname (for example a
# value with no scheme), the whole referrer is returned lower-cased instead.
#
# @param referrer [String]
# @return [String]
def extract_hostname(referrer)
  host = URI.parse(referrer).hostname
  return referrer.downcase if host.nil?

  host.downcase
rescue URI::InvalidURIError
  referrer.downcase
end
454
+
455
# POSTs a payload as JSON to the configured collector endpoint and writes
# the outcome to the debug log. Any StandardError raised along the way is
# logged rather than propagated.
#
# @param payload [#to_json] the event to send
# @param source_type [String] used only in log messages
# @return [nil]
def send_collector_request(payload, source_type)
  endpoint = URI(@configuration.collect_endpoint)

  connection = Net::HTTP.new(endpoint.host, endpoint.port)
  connection.use_ssl = endpoint.scheme == 'https'
  connection.read_timeout = 10
  connection.open_timeout = 5

  post = Net::HTTP::Post.new(endpoint)
  post['Content-Type'] = 'application/json'
  post['x-api-key'] = @configuration.api_key
  post.body = payload.to_json

  log_debug("Making POST request to #{@configuration.collect_endpoint}")
  log_debug("Payload size: #{post.body.bytesize} bytes")

  reply = connection.request(post)

  log_debug("Collector response status: #{reply.code} #{reply.message}")

  outcome =
    if reply.is_a?(Net::HTTPSuccess)
      "✅ Successfully logged #{source_type} event"
    else
      "❌ Failed to log #{source_type} event"
    end
  log_debug(outcome)
rescue => e
  log_debug("❌ Exception during collector request for #{source_type}: #{e.message}")
end
484
+
485
# Prints a message to standard output, prefixed with "[Spyglasses] ", but
# only when the configuration has debugging switched on.
#
# @param message [#to_s]
# @return [nil]
def log_debug(message)
  puts "[Spyglasses] #{message}" if @configuration.debug?
end
490
+ end
491
+ end
@@ -0,0 +1,79 @@
1
+ # frozen_string_literal: true
2
+
3
require 'uri' # valid_url? parses endpoints with URI; do not rely on load order

module Spyglasses
  # Settings for the Spyglasses client. Values are read from SPYGLASSES_*
  # environment variables when the object is created and can be changed
  # afterwards through the accessors.
  class Configuration
    DEFAULT_COLLECT_ENDPOINT = 'https://www.spyglasses.io/api/collect'
    DEFAULT_PATTERNS_ENDPOINT = 'https://www.spyglasses.io/api/patterns'
    DEFAULT_CACHE_TTL = 24 * 60 * 60 # 24 hours in seconds
    DEFAULT_PLATFORM_TYPE = 'ruby'

    # Number of leading API key characters that to_h is allowed to reveal.
    API_KEY_PREVIEW_LENGTH = 8

    attr_accessor :api_key, :debug, :collect_endpoint, :patterns_endpoint,
                  :auto_sync, :platform_type, :cache_ttl, :exclude_paths

    # Loads every setting from the environment, falling back to the
    # DEFAULT_* constants. Debugging is on only for the exact value "true";
    # auto-sync is on unless the value is exactly "false".
    def initialize
      @api_key = ENV['SPYGLASSES_API_KEY']
      @debug = ENV['SPYGLASSES_DEBUG'] == 'true'
      @collect_endpoint = ENV['SPYGLASSES_COLLECT_ENDPOINT'] || DEFAULT_COLLECT_ENDPOINT
      @patterns_endpoint = ENV['SPYGLASSES_PATTERNS_ENDPOINT'] || DEFAULT_PATTERNS_ENDPOINT
      @auto_sync = ENV['SPYGLASSES_AUTO_SYNC'] != 'false' # Default to true
      @platform_type = ENV['SPYGLASSES_PLATFORM_TYPE'] || DEFAULT_PLATFORM_TYPE
      @cache_ttl = (ENV['SPYGLASSES_CACHE_TTL'] || DEFAULT_CACHE_TTL).to_i
      @exclude_paths = []
    end

    # @return [Boolean] true when an API key is set and not empty
    def api_key_present?
      !@api_key.nil? && !@api_key.empty?
    end

    # @return [Object] the current debug setting
    def debug?
      @debug
    end

    # @return [Object] the current auto-sync setting
    def auto_sync?
      @auto_sync
    end

    # Checks that the configuration is usable.
    #
    # @return [nil]
    # @raise [ConfigurationError] when the API key is missing, an endpoint
    #   is not an http(s) URL with a host, or the cache TTL is not a
    #   non-negative number
    def validate!
      unless api_key_present?
        raise ConfigurationError, 'API key is required. Set SPYGLASSES_API_KEY environment variable or configure via Spyglasses.configure'
      end

      unless valid_url?(@collect_endpoint)
        raise ConfigurationError, "Invalid collect endpoint: #{@collect_endpoint}"
      end

      unless valid_url?(@patterns_endpoint)
        raise ConfigurationError, "Invalid patterns endpoint: #{@patterns_endpoint}"
      end

      # A nil or non-numeric TTL (possible through the accessor) is reported
      # as a configuration problem instead of surfacing as NoMethodError.
      unless @cache_ttl.is_a?(Numeric) && @cache_ttl >= 0
        raise ConfigurationError, "Cache TTL must be non-negative: #{@cache_ttl.inspect}"
      end

      nil
    end

    # @return [Hash] the settings keyed by symbol, with the API key masked
    def to_h
      {
        api_key: masked_api_key,
        debug: @debug,
        collect_endpoint: @collect_endpoint,
        patterns_endpoint: @patterns_endpoint,
        auto_sync: @auto_sync,
        platform_type: @platform_type,
        cache_ttl: @cache_ttl,
        exclude_paths: @exclude_paths
      }
    end

    private

    # Returns a display-safe form of the API key: the first
    # API_KEY_PREVIEW_LENGTH characters followed by "..." for longer keys,
    # just "..." for keys short enough that a preview would reveal them in
    # full, and nil when no key is set.
    def masked_api_key
      return nil unless @api_key

      key = @api_key.to_s
      return '...' if key.length <= API_KEY_PREVIEW_LENGTH

      "#{key[0, API_KEY_PREVIEW_LENGTH]}..."
    end

    # True when url parses as an http or https URI that names a host.
    # URI::HTTPS is a subclass of URI::HTTP, so one class check covers both;
    # the host check rejects strings such as "http:" that parse but cannot
    # be connected to.
    def valid_url?(url)
      return false if url.nil? || url.empty?

      uri = URI.parse(url)
      uri.is_a?(URI::HTTP) && !uri.host.nil? && !uri.host.empty?
    rescue URI::InvalidURIError
      false
    end
  end
end