status_mcp 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1253 @@
1
+ # frozen_string_literal: true
2
+
3
require "fast_mcp"
require "json"
require "net/http"
require "time"
require "uri"
require "openssl"
require "nokogiri"
require_relative "../status_mcp"
10
+
11
+ module StatusMcp
12
+ class Server
13
# Builds the FastMcp server named "status_mcp", registers the four tool
# classes on it and starts it.
#
# @return the value returned by FastMcp::Server#start
def self.start
  mcp_server = FastMcp::Server.new(name: "status_mcp", version: StatusMcp::VERSION)

  [
    SearchServicesTool,
    GetServiceDetailsTool,
    ListServicesTool,
    FetchStatusTool
  ].each { |tool_class| mcp_server.register_tool(tool_class) }

  mcp_server.start
end
23
+
24
+ class BaseTool < FastMcp::Tool
25
+ protected
26
+
27
# Loads the service catalogue from StatusMcp::DATA_PATH.
#
# @return [Array] the parsed JSON array; an empty array when the file does
#   not exist, does not contain valid JSON, or its top-level JSON value is
#   not an array
def load_data
  parsed = JSON.parse(File.read(StatusMcp::DATA_PATH))

  # Callers iterate the result as a list of service hashes, so any other
  # top-level JSON value (object, string, number) is treated as "no data".
  parsed.is_a?(Array) ? parsed : []
rescue Errno::ENOENT, JSON::ParserError
  # Rescuing ENOENT replaces a File.exist? pre-check, which could still race
  # with the file disappearing between the check and the read.
  []
end
36
+
37
# Renders one service record as a Markdown snippet: a "## name" heading
# followed by one bullet per link that is present on the record.
#
# @param service [Hash] service record with string keys ("name",
#   "status_url", "website_url", "security_url", "support_url", "aux_urls")
# @return [String] the Markdown text, each line terminated by "\n"
def format_service(service)
  status_link = service["status_url"]
  website_link = service["website_url"]
  security_link = service["security_url"]
  support_link = service["support_url"]
  extra_links = service["aux_urls"]

  parts = ["## #{service["name"]}\n"]
  parts << "- **Official Status**: #{status_link}\n" if status_link
  parts << "- **Website**: #{website_link}\n" if website_link
  parts << "- **Security**: #{security_link}\n" if security_link
  parts << "- **Support**: #{support_link}\n" if support_link
  # The auxiliary links share a single comma-separated bullet.
  parts << "- **Other Links**: #{extra_links.join(", ")}\n" if extra_links&.any?

  parts.join
end
51
+
52
# Calculates the Levenshtein (edit) distance between two strings.
#
# Only the previous and the current row of the dynamic-programming table are
# kept, so memory use grows with the length of str2 instead of with the
# product of both lengths. The result is the same as with the full table.
#
# @param str1 [String]
# @param str2 [String]
# @return [Integer] minimum number of single-character insertions, deletions
#   and substitutions needed to turn str1 into str2
def levenshtein_distance(str1, str2)
  m = str1.length
  n = str2.length
  return n if m.zero?
  return m if n.zero?

  # Row 0: turning the empty prefix of str1 into each prefix of str2.
  previous_row = (0..n).to_a

  (1..m).each do |i|
    # Column 0: turning the first i characters of str1 into "".
    current_row = [i]

    (1..n).each do |j|
      cost = (str1[i - 1] == str2[j - 1]) ? 0 : 1
      current_row << [
        previous_row[j] + 1,       # deletion
        current_row[j - 1] + 1,    # insertion
        previous_row[j - 1] + cost # substitution
      ].min
    end

    previous_row = current_row
  end

  previous_row[n]
end
77
+
78
# Similarity of two strings on a 0.0..1.0 scale, derived from their
# Levenshtein distance relative to the longer string.
#
# @param str1 [String]
# @param str2 [String]
# @return [Float] 1.0 for equal strings, 0.0 when exactly one of them is
#   empty, otherwise 1 - distance / longer length
def similarity_ratio(str1, str2)
  if str1 == str2
    1.0
  elsif str1.empty? || str2.empty?
    0.0
  else
    longest = [str1.length, str2.length].max
    edits = levenshtein_distance(str1, str2)
    1.0 - (edits.to_f / longest)
  end
end
87
+
88
# Finds services whose name matches the query exactly, as a substring, or
# approximately (Levenshtein-based similarity).
#
# Records that are not hashes, or whose "name" is missing, not a string or
# blank, are skipped: they cannot be compared or sorted, and an empty name
# would otherwise count as a substring of every query.
#
# @param services [Array<Hash>] service records with a "name" key
# @param query [String] user-supplied search text
# @param threshold [Float] minimum similarity for a fuzzy match to be kept
# @return [Array<Hash>] hashes with :service, :score (Float) and :match_type
#   (:exact, :substring or :fuzzy), sorted by descending score, then by name
def find_services_fuzzy(services, query, threshold: 0.6)
  query_lower = query.downcase.strip
  return [] if query_lower.empty?

  results = services.each_with_object([]) do |service, found|
    next unless service.is_a?(Hash)

    name = service["name"]
    next unless name.is_a?(String) && !name.strip.empty?

    name_lower = name.downcase

    if name_lower == query_lower
      # Exact match (case-insensitive)
      found << {service: service, score: 1.0, match_type: :exact}
    elsif name_lower.include?(query_lower) || query_lower.include?(name_lower)
      # Substring match: the closer the two lengths, the higher the score.
      shorter, longer = [name_lower.length, query_lower.length].minmax
      found << {service: service, score: shorter.to_f / longer, match_type: :substring}
    else
      # Fuzzy match using Levenshtein distance
      similarity = similarity_ratio(name_lower, query_lower)
      found << {service: service, score: similarity, match_type: :fuzzy} if similarity >= threshold
    end
  end

  # Highest score first; ties are ordered by name.
  results.sort_by { |result| [-result[:score], result[:service]["name"]] }
end
124
+ end
125
+
126
# MCP tool "search_services": looks services up by name, tolerating typos.
class SearchServicesTool < BaseTool
  tool_name "search_services"
  description "Search for services by name (supports fuzzy matching)"

  arguments do
    required(:query).filled(:string).description("Search query")
  end

  # @param query [String] search text
  # @return [String] Markdown for up to 20 matching services separated by
  #   blank lines, or a "No services found" message
  def call(query:)
    matches = find_services_fuzzy(load_data, query, threshold: 0.5)
    return "No services found matching '#{query}'" if matches.empty?

    # Keep only the first result for each case-insensitive service name.
    distinct = matches.uniq { |match| match[:service]["name"]&.downcase }

    # Limit to top 20 results
    distinct.take(20).map { |match| format_service(match[:service]) }.join("\n\n")
  end
end
158
+
159
# MCP tool "get_service_details": returns the links of the best-matching
# service and, when the match is ambiguous, names up to two alternatives.
class GetServiceDetailsTool < BaseTool
  tool_name "get_service_details"
  description "Get detailed status links for a specific service (supports fuzzy matching)"

  arguments do
    required(:name).filled(:string).description("Service name (exact or fuzzy match)")
  end

  # @param name [String] service name (exact or approximate)
  # @return [String] Markdown for the best match, optionally followed by a
  #   "Did you mean" note, or a "not found" message
  def call(name:)
    candidates = find_services_fuzzy(load_data, name, threshold: 0.6)
    return "Service '#{name}' not found" if candidates.empty?

    best = candidates.first
    details = format_service(best[:service])

    # An exact or very high confidence match, or a sole candidate, needs no
    # list of alternatives.
    confident = best[:match_type] == :exact || best[:score] >= 0.9
    return details if confident || candidates.length == 1

    # Several plausible matches: show the best one and mention the next two.
    runner_ups = candidates[1..2].map { |candidate| candidate[:service]["name"] }.compact
    return details unless runner_ups.any?

    details + "\n\n**Note**: Did you mean one of these? #{runner_ups.join(", ")}"
  end
end
192
+
193
# MCP tool "list_services": lists the names of the known services.
class ListServicesTool < BaseTool
  tool_name "list_services"
  description "List all available services (limited to first 50 if too many)"

  arguments do
    optional(:limit).filled(:integer).description("Limit number of results (default 50)")
  end

  # @param limit [Integer, nil] maximum number of names to list; nil falls
  #   back to 50 and negative values are treated as 0
  # @return [String] a header with shown/total counts, the comma-separated
  #   names, and a trailing "... and N more." line when names were left out
  def call(limit: 50)
    services = load_data
    # Array#take raises ArgumentError for a negative size, so clamp it.
    limit = [limit || 50, 0].max

    names = services.take(limit).map { |service| service["name"] }
    remaining = services.size - names.size

    response = "Available services (#{names.size}/#{services.size}):\n"
    response += names.join(", ")
    response += "\n... and #{remaining} more." if remaining.positive?

    response
  end
end
217
+
218
+ class FetchStatusTool < BaseTool
219
+ # Maximum response size (1MB) to protect against zip bombs and crawler protection pages
220
+ MAX_RESPONSE_SIZE = 1 * 1024 * 1024 # 1MB
221
+
222
+ tool_name "fetch_status"
223
+ description "Fetch status from a status_url with HTML purification. Extracts latest status, history, and messages from status pages."
224
+
225
+ arguments do
226
+ required(:status_url).filled(:string).description("Status page URL to fetch")
227
+ optional(:max_length).filled(:integer).description("Maximum length of extracted text in characters (default: 10000)")
228
+ end
229
+
230
# Collects status information for a status page from up to four sources and
# merges them into one result hash:
#
# 1. the incident.io JSON API (only when might_be_incident_io? says so),
# 2. RSS/Atom feeds (only when the API produced no status or history),
# 3. the status page HTML itself,
# 4. the page's "/history" sub-page.
#
# Failures of individual sources are swallowed on purpose so that the other
# sources can still contribute; any other error is turned into an error hash.
#
# NOTE(review): Time#iso8601 is provided by the stdlib "time" extension on
# Ruby versions before 3.4 - confirm it is loaded wherever this tool runs.
#
# @param status_url [String] URL of the status page
# @param max_length [Integer] maximum length of extracted text, passed on to
#   the individual extractors
# @return [Hash] with keys :status_url, :api_url, :feed_url, :history_url,
#   :latest_status, :history (at most 20 items), :messages, :extracted_at,
#   :error and :http_status_code; on an unexpected error only :status_url,
#   :error, :latest_status, :history and :messages
def call(status_url:, max_length: 10000)
  # A source "has data" when it produced a latest status or any history.
  has_data = ->(info) { info && (info[:history]&.any? || info[:latest_status]) }

  # Try incident.io API first (only for pages that look like incident.io)
  api_info = nil
  api_url = nil
  if might_be_incident_io?(status_url)
    begin
      api_url = build_incident_io_api_url(status_url)
      api_info = fetch_and_parse_incident_io_api(api_url, max_length) if api_url
      # A 404 from the API means it is not available; ignore the result.
      api_info = nil if api_info&.dig(:error)&.include?("404")
    rescue StandardError
      # Not an incident.io page or API failed, continue with other methods
    end
  end

  # Try RSS/Atom feeds (they're more reliable for JS-rendered pages)
  feed_urls = build_feed_urls(status_url)
  feed_info = nil
  successful_feed_url = nil

  unless has_data.call(api_info)
    feed_urls.each do |feed_url|
      begin
        feed_info = fetch_and_parse_feed(feed_url, max_length)
      rescue StandardError
        # Try next feed URL (feed_info keeps its previous value)
        next
      end

      next unless has_data.call(feed_info)

      successful_feed_url = feed_url
      break
    end
  end

  # Fetch main status page (as fallback or supplement)
  main_info = nil
  begin
    main_info = fetch_and_extract(status_url, max_length)
  rescue StandardError
    # If we have feed info, that's okay
    main_info = {latest_status: nil, history: [], messages: [], error: nil} unless feed_info
  end

  # Try to fetch history page if it exists
  history_url = build_history_url(status_url)
  history_info = nil
  if history_url && history_url != status_url
    begin
      history_info = fetch_and_extract(history_url, max_length, history_only: true)
    rescue StandardError
      # Silently fail if history page doesn't exist or has errors;
      # this is expected for many status pages
    end
  end

  # Merge history (API first, then feed, then HTML, then history page) and
  # drop entries whose first 101 characters are identical.
  combined_history = [api_info, feed_info, main_info, history_info]
    .flat_map { |info| (info && info[:history]) || [] }
    .uniq { |item| item[0..100] }

  # Latest status: API, then HTML page, then feed (feed statuses are often
  # fallbacks, so the HTML page is preferred over them).
  latest_status = api_info&.dig(:latest_status) ||
    main_info&.dig(:latest_status) ||
    feed_info&.dig(:latest_status)

  # HTTP status code of the main page
  http_status_code = main_info&.dig(:http_status_code)

  # Only include errors if they're meaningful
  final_error = nil
  if main_info && main_info[:error] && !main_info[:error].empty?
    # Don't show JS-rendered page errors if we got data from feeds/API
    got_data_elsewhere = has_data.call(feed_info) || has_data.call(api_info)
    final_error = main_info[:error] unless main_info[:error].include?("JavaScript-rendered") && got_data_elsewhere
  elsif !main_info || !main_info[:latest_status]
    # Only show feed/API errors if we didn't get any useful data from HTML
    if feed_info && feed_info[:error] && !feed_info[:error].empty? && !has_data.call(feed_info)
      # "Not a valid RSS or Atom feed" just means no feed is available
      final_error = feed_info[:error] unless feed_info[:error].include?("Not a valid RSS or Atom feed")
    elsif api_info && api_info[:error] && !api_info[:error].empty? && !has_data.call(api_info)
      final_error = api_info[:error]
    end
  end

  # An empty array is truthy in Ruby, so pick the first source that actually
  # has messages instead of the first source that merely exists.
  messages = [api_info, feed_info, main_info]
    .map { |info| info && info[:messages] }
    .find { |list| list && !list.empty? } || []

  {
    status_url: status_url,
    api_url: api_url,
    feed_url: successful_feed_url,
    history_url: history_url,
    latest_status: latest_status,
    history: combined_history.first(20), # Limit to the first 20 entries
    messages: messages,
    extracted_at: Time.now.iso8601,
    error: final_error,
    http_status_code: http_status_code
  }
rescue => e
  error_message = if e.is_a?(StatusMcp::ResponseSizeExceededError)
    "Response size limit exceeded: #{e.message}"
  else
    "Error fetching status: #{e.message}"
  end

  {
    status_url: status_url,
    error: error_message,
    latest_status: nil,
    history: [],
    messages: []
  }
end
363
+
364
# Performs an HTTP(S) GET and follows redirects.
#
# Successful (2xx) response bodies are read in chunks and the download is
# aborted as soon as MAX_RESPONSE_SIZE is exceeded, so an oversized body is
# never held in memory in full. The collected body is stored back on the
# response, so callers keep using `response.body`.
#
# @param url [String] absolute http or https URL
# @param max_redirects [Integer] maximum number of redirects to follow
# @param accept [String] value of the Accept request header
# @return [Net::HTTPResponse] the first non-redirect response (it may be an
#   error response; callers check for Net::HTTPSuccess)
# @raise [ArgumentError] if a URL in the chain is not http(s) or has no host
# @raise [StatusMcp::ResponseSizeExceededError] if a successful response
#   declares or delivers more than MAX_RESPONSE_SIZE bytes
# @raise [RuntimeError] if more than max_redirects redirects are returned
def fetch_with_redirects(url, max_redirects: 5, accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
  current_url = url
  redirects_followed = 0

  loop do
    uri = URI(current_url)
    # Checked for every hop: a redirect must not lead to another scheme.
    unless uri.is_a?(URI::HTTP) && uri.host
      raise ArgumentError, "Unsupported URL (only http and https are allowed): #{current_url}"
    end

    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = (uri.scheme == "https")
    if http.use_ssl?
      http.verify_mode = OpenSSL::SSL::VERIFY_PEER
      http.ca_file = OpenSSL::X509::DEFAULT_CERT_FILE if File.exist?(OpenSSL::X509::DEFAULT_CERT_FILE)
    end
    http.read_timeout = 10
    http.open_timeout = 10

    request = Net::HTTP::Get.new(uri)
    request["User-Agent"] = "Mozilla/5.0 (compatible; StatusMcp/1.0)"
    request["Accept"] = accept

    # Block form: the body has not been read yet when the block runs.
    response = http.request(request) do |incoming|
      # Redirect and error bodies are left to Net::HTTP; only successful
      # responses are size-limited (protects against zip bombs and large
      # crawler protection pages).
      next unless incoming.is_a?(Net::HTTPSuccess)

      # Cheap early exit when the server declares an oversized body.
      declared_size = incoming["Content-Length"]&.to_i
      if declared_size && declared_size > MAX_RESPONSE_SIZE
        raise StatusMcp::ResponseSizeExceededError.new(declared_size, MAX_RESPONSE_SIZE, uri: uri.to_s)
      end

      # Content-Length might be missing or incorrect, so count what arrives.
      received = String.new
      incoming.read_body do |chunk|
        received << chunk
        if received.bytesize > MAX_RESPONSE_SIZE
          raise StatusMcp::ResponseSizeExceededError.new(received.bytesize, MAX_RESPONSE_SIZE, uri: uri.to_s)
        end
      end
      incoming.body = received
    end

    # Anything that is not a redirect with a Location header is final.
    return response unless response.is_a?(Net::HTTPRedirection) && response["location"]

    raise "Too many redirects (max: #{max_redirects})" if redirects_followed >= max_redirects

    redirects_followed += 1
    # URI.join also resolves relative Location values.
    current_url = URI.join(current_url, response["location"]).to_s
  end
end
419
+
420
# Downloads a status (or history) page and extracts status information from
# its HTML.
#
# @param url [String] page URL
# @param max_length [Integer] passed to extract_status_info
# @param history_only [Boolean] when true only the history is extracted
# @return [Hash] :latest_status, :history, :messages and :http_status_code;
#   for a non-2xx response additionally :error with empty results
# @raise [StatusMcp::ResponseSizeExceededError] if the body is larger than
#   MAX_RESPONSE_SIZE
def fetch_and_extract(url, max_length, history_only: false)
  response = fetch_with_redirects(url)
  status_code = response.code.to_i

  unless response.is_a?(Net::HTTPSuccess)
    return {
      error: "Failed to fetch: #{response.code} #{response.message}",
      http_status_code: status_code,
      latest_status: nil,
      history: [],
      messages: []
    }
  end

  page = response.body || ""
  # Second line of defence; fetch_with_redirects checks the size as well.
  too_large = page.bytesize > MAX_RESPONSE_SIZE
  raise StatusMcp::ResponseSizeExceededError.new(page.bytesize, MAX_RESPONSE_SIZE, uri: url) if too_large

  # Validate and parse HTML
  doc = validate_and_parse_html(page, URI(url))

  unless history_only
    return extract_status_info(doc, max_length).merge(http_status_code: status_code)
  end

  {
    latest_status: nil,
    history: extract_history(doc),
    messages: [],
    http_status_code: status_code
  }
end
457
+
458
# Derives the URL of the "/history" sub-page of a status page.
#
# @param status_url [String] status page URL
# @return [String, nil] the URL with "/history" appended to its path, or nil
#   when the URL already points at a history page (with or without a
#   trailing slash)
def build_history_url(status_url)
  uri = URI(status_url)
  base_path = uri.path.chomp("/")

  # Checked after trimming the trailing slash so that ".../history/" is
  # recognised too instead of becoming ".../history/history".
  return nil if base_path.end_with?("/history")

  history_uri = uri.dup
  history_uri.path = "#{base_path}/history"
  history_uri.to_s
end
474
+
475
# Tells whether a status page is one of the hard-coded hosts listed here as
# incident.io pages, for which the JSON API is worth trying.
#
# @param status_url [String] status page URL
# @return [Boolean] true when the URL's host is in the list (compared
#   case-insensitively), false otherwise or when the URL has no host
def might_be_incident_io?(status_url)
  # Known incident.io domains
  incident_io_domains = %w[
    status.openai.com
    status.notion.so
    status.zapier.com
    status.buffer.com
  ]

  host = URI(status_url).host
  return false if host.nil?

  # Host names are case-insensitive, so compare in lower case.
  incident_io_domains.include?(host.downcase)
end
490
+
491
# Builds the incident.io API URL for a status page by replacing the path
# with "/proxy/<host>", e.g.
# https://status.example.com/proxy/status.example.com
#
# @param status_url [String] status page URL
# @return [String, nil] the API URL, or nil when the URL has no host (the
#   caller skips the API in that case)
def build_incident_io_api_url(status_url)
  uri = URI(status_url)
  host = uri.host
  return nil if host.nil? || host.empty?

  api_uri = uri.dup
  api_uri.path = "/proxy/#{host}"
  api_uri.to_s
end
505
+
506
# Fetches the incident.io JSON API and converts it into a status hash.
#
# @param api_url [String] API URL
# @param max_length [Integer] passed to parse_incident_io_api
# @return [Hash] the result of parse_incident_io_api, or a hash with :error
#   and empty results when the request or the parsing fails
# @raise [StatusMcp::ResponseSizeExceededError] size errors are the only
#   errors that are not converted into an error hash
def fetch_and_parse_incident_io_api(api_url, max_length)
  failure = ->(message) { {error: message, latest_status: nil, history: [], messages: []} }

  response = fetch_with_redirects(api_url, accept: "application/json")
  unless response.is_a?(Net::HTTPSuccess)
    return failure.call("Failed to fetch API: #{response.code} #{response.message}")
  end

  payload = response.body || ""
  # Second line of defence; fetch_with_redirects checks the size as well.
  if payload.bytesize > MAX_RESPONSE_SIZE
    raise StatusMcp::ResponseSizeExceededError.new(payload.bytesize, MAX_RESPONSE_SIZE, uri: api_url)
  end

  parse_incident_io_api(payload, max_length)
rescue StatusMcp::ResponseSizeExceededError
  # Must come before the generic rescue so that it is passed on unchanged.
  raise
rescue JSON::ParserError => e
  failure.call("Error parsing API JSON: #{e.message}")
rescue => e
  failure.call("Error fetching API: #{e.message}")
end
544
+
545
# Turns an incident.io API JSON document into a status hash.
#
# Ongoing incidents and scheduled maintenances (read from the "summary"
# object) become history lines. The latest status is the status of the
# first ongoing incident; without incidents it is derived from the
# component statuses, or set to "See incidents" when only maintenances are
# listed.
#
# @param json_body [String] raw JSON text
# @param max_length [Integer] history is passed through truncate_array when
#   its newline-joined length exceeds this
# @return [Hash] :latest_status, :history and :messages (always empty)
# @raise [JSON::ParserError] if json_body is not valid JSON
def parse_incident_io_api(json_body, max_length)
  summary = JSON.parse(json_body)["summary"] || {}

  ongoing_incidents = summary["ongoing_incidents"] || []
  scheduled_maintenances = summary["scheduled_maintenances"] || []
  components = summary["components"] || []

  # Descriptions that contain markup are reduced to their text content.
  plain_text = lambda do |raw|
    raw.include?("<") ? Nokogiri::HTML(raw).text.strip : raw
  end
  status_of = ->(component) { component["status"] || component["operational_status"] }

  history_items = []
  latest_status = nil

  # Extract from ongoing incidents
  ongoing_incidents.each do |incident|
    title = incident["name"] || "Ongoing Incident"
    status = incident["status"] || "Investigating"
    description = plain_text.call(incident["description"] || "")

    line = "#{title} - Status: #{status}"
    line += " - #{description[0..300]}" unless description.empty?
    history_items << purify_text(line)

    # Use first incident's status as latest status
    latest_status ||= status
  end

  # Extract from scheduled maintenances
  scheduled_maintenances.each do |maintenance|
    title = maintenance["name"] || "Scheduled Maintenance"
    status = maintenance["status"] || "Scheduled"
    description = plain_text.call(maintenance["description"] || "")
    scheduled_for = maintenance["scheduled_for"] || ""
    scheduled_until = maintenance["scheduled_until"] || ""

    line = "#{title} - Status: #{status}"
    line += " - Scheduled: #{scheduled_for}" unless scheduled_for.empty?
    line += " until #{scheduled_until}" unless scheduled_until.empty?
    line += " - #{description[0..300]}" unless description.empty?
    history_items << purify_text(line)
  end

  # Determine overall status if no incident supplied one
  if latest_status.nil?
    nothing_listed = ongoing_incidents.empty? && scheduled_maintenances.empty?

    latest_status =
      if !nothing_listed
        "See incidents"
      elsif components.all? { |component|
        current = status_of.call(component)
        current&.downcase&.include?("operational") || current.nil?
      }
        "Operational"
      else
        # Find non-operational components
        impaired = components.select do |component|
          current = status_of.call(component)
          current && !current.downcase.include?("operational")
        end
        impaired.any? ? "Degraded Performance" : "Operational"
      end
  end

  # Truncate if needed
  if history_items.join("\n").length > max_length
    history_items = truncate_array(history_items, max_length)
  end

  {
    latest_status: latest_status,
    history: history_items,
    messages: []
  }
end
642
+
643
# Lists the candidate RSS/Atom feed URLs for a status page by appending the
# common feed paths to the page's path (trailing slash removed).
#
# @param status_url [String] status page URL
# @return [Array<String>] seven candidate URLs, in the order they are tried
def build_feed_urls(status_url)
  page_uri = URI(status_url)
  prefix = page_uri.path.chomp("/")

  # Common RSS/Atom feed patterns
  suffixes = %w[/feed.rss /feed.atom /rss /atom /feed /status.rss /status.atom]

  suffixes.map do |suffix|
    candidate = page_uri.dup
    candidate.path = "#{prefix}#{suffix}"
    candidate.to_s
  end
end
668
+
669
# Fetches an RSS/Atom feed and converts it into a status hash.
#
# @param feed_url [String] feed URL
# @param max_length [Integer] passed to parse_feed
# @return [Hash] the result of parse_feed, or a hash with :error and empty
#   results when the request or the parsing fails
# @raise [StatusMcp::ResponseSizeExceededError] size errors are the only
#   errors that are not converted into an error hash
def fetch_and_parse_feed(feed_url, max_length)
  failure = ->(message) { {error: message, latest_status: nil, history: [], messages: []} }

  response = fetch_with_redirects(feed_url, accept: "application/rss+xml,application/atom+xml,application/xml,text/xml,*/*;q=0.9")
  unless response.is_a?(Net::HTTPSuccess)
    return failure.call("Failed to fetch feed: #{response.code} #{response.message}")
  end

  xml = response.body || ""
  # Second line of defence; fetch_with_redirects checks the size as well.
  if xml.bytesize > MAX_RESPONSE_SIZE
    raise StatusMcp::ResponseSizeExceededError.new(xml.bytesize, MAX_RESPONSE_SIZE, uri: feed_url)
  end

  # Parse RSS/Atom feed
  parse_feed(xml, max_length)
rescue StatusMcp::ResponseSizeExceededError
  # Must come before the generic rescue so that it is passed on unchanged.
  raise
rescue => e
  failure.call("Error parsing feed: #{e.message}")
end
700
+
701
# Converts an RSS or Atom document into a status hash.
#
# Every <item> (RSS) or <entry> (Atom) becomes one history line made of the
# title, a cleaned-up excerpt of the body and the date. The latest status is
# the first recognised "Status: <word>" found in an entry; failing that it
# is taken from the feed title or guessed from the history lines.
#
# @param feed_body [String] raw feed XML
# @param max_length [Integer] history is passed through truncate_array when
#   its newline-joined length exceeds this
# @return [Hash] :latest_status, :history and :messages (always empty); for
#   documents that are neither RSS nor Atom additionally :error
def parse_feed(feed_body, max_length)
  doc = Nokogiri::XML(feed_body)

  # Determine feed type (RSS or Atom)
  root_name = doc.root&.name
  is_atom = root_name == "feed" || doc.at("feed")
  is_rss = root_name == "rss" || doc.at("rss")

  unless is_rss || is_atom
    return {
      error: "Not a valid RSS or Atom feed",
      latest_status: nil,
      history: [],
      messages: []
    }
  end

  # RSS and Atom entries are processed identically; only the element names
  # differ. RSS wins when a document looks like both.
  entry_selector, body_selectors, date_selectors =
    if is_rss
      ["item", ["description"], ["pubDate"]]
    else
      ["entry", ["content", "summary"], ["updated", "published"]]
    end

  # Stripped text of the first listed element that exists, or "".
  first_text = lambda do |node, selectors|
    selectors.map { |selector| node.css(selector).first&.text&.strip }.compact.first || ""
  end

  history_items = []
  latest_status = nil

  doc.css(entry_selector).each do |entry|
    title = first_text.call(entry, ["title"])
    body = first_text.call(entry, body_selectors)
    timestamp = first_text.call(entry, date_selectors)

    # Clean HTML from the body
    if body.include?("<")
      fragment = Nokogiri::HTML(body)
      # Replace list and line-break elements by newlines, then keep the text
      fragment.css("ul, ol, li, br").each { |el| el.replace("\n") }
      # Normalize whitespace
      body = fragment.text.strip.gsub(/\n{2,}/, "\n").gsub(/[ \t]{2,}/, " ").strip
    end

    # Extract just the status word from a "Status: ..." pattern
    status_match = body.match(/Status:\s*([A-Za-z]+)/i) || title.match(/Status:\s*([A-Za-z]+)/i)
    if status_match && !latest_status
      status_word = status_match[1].strip
      # Only use if it's a known status word
      if status_word.match?(/^(Resolved|Operational|Degraded|Down|Investigating|Monitoring|Identified|Partial|Major|Minor)$/i)
        latest_status = status_word
      end
    end

    # Remove status line and component lists from the body for cleaner output
    excerpt = body.gsub(/Status:\s*[^\n]+/i, "").strip
    excerpt = excerpt.gsub(/Affected components[^\n]*/i, "").strip
    excerpt = excerpt.gsub(/\(Operational\)/i, "").strip
    excerpt = excerpt.gsub(/\n{2,}/, "\n").strip

    item_text = title.to_s
    item_text += " - #{excerpt[0..500]}" if excerpt.length > 10 # Limit excerpt length
    item_text += " (#{timestamp})" unless timestamp.empty?

    # Very short lines are dropped.
    history_items << purify_text(item_text) if item_text.length >= 20
  end

  # Determine overall status from feed title or the history lines
  unless latest_status
    feed_title = doc.css("channel > title, feed > title").first&.text&.strip

    # Only use feed title if it's a short status word, not a page title
    if feed_title && feed_title.length < 50 && feed_title.match?(/^(operational|degraded|down|outage|incident|maintenance|all systems operational)$/i)
      latest_status = feed_title
    elsif history_items.any?
      total = history_items.length

      # Lines about scheduled maintenance (not actual incidents)
      scheduled_count = history_items.count do |item|
        item.match?(/scheduled|maintenance/i) && !item.match?(/incident|outage|degraded|down|investigating/i)
      end

      # Lines about resolved incidents
      resolved_count = history_items.count do |item|
        item.match?(/resolved|operational/i) && !item.match?(/investigating|monitoring|identified/i)
      end

      latest_status =
        if scheduled_count == total || (resolved_count == total && total > 0)
          # All scheduled maintenance or all resolved: likely operational
          "Operational"
        elsif history_items.any? { |item| item.match?(/investigating|monitoring|identified|degraded|down|outage/i) && !item.match?(/resolved|operational/i) }
          # There are active incidents (investigating, monitoring, identified)
          "See recent incidents"
        else
          # History exists but is inconclusive: default to operational
          "Operational"
        end
    end
  end

  # Truncate if needed
  if history_items.join("\n").length > max_length
    history_items = truncate_array(history_items, max_length)
  end

  {
    latest_status: latest_status,
    history: history_items,
    messages: []
  }
end
850
+
851
+ private
852
+
853
# Rejects responses that are not usable status pages and parses the rest.
#
# @param body [String] response body
# @param uri [URI] URL the body came from (not used by the checks)
# @return [Nokogiri::HTML::Document] the parsed document
# @raise [RuntimeError] when the body looks like a crawler protection page,
#   is not HTML, has too little text, looks like an error page, or cannot be
#   parsed
def validate_and_parse_html(body, uri)
  # Check for common crawler protection patterns (more specific)
  protection_page = [
    /checking your browser.*?before accessing/i,
    /ddos protection.*?checking/i,
    /access denied.*?cloudflare/i,
    /please wait.*?cloudflare/i,
    /captcha.*?verification/i,
    /rate limit.*?exceeded/i,
    /blocked.*?security/i
  ].any? { |pattern| body.match?(pattern) }
  raise "Response appears to be a crawler protection page" if protection_page

  # Check if response is actually HTML
  looks_like_html = body.strip.start_with?("<!DOCTYPE", "<html", "<HTML") || body.include?("<html")
  raise "Response does not appear to be HTML" unless looks_like_html

  doc = Nokogiri::HTML(body)

  # Check if this is a JavaScript-rendered page (React, Vue, Angular, etc.)
  js_rendered = [
    /<div[^>]*id=["']root["'][^>]*>\s*<\/div>/i,
    /<div[^>]*id=["']app["'][^>]*>\s*<\/div>/i,
    /You need to enable JavaScript/i,
    /<noscript>.*?enable.*?javascript/i,
    /react.*?root|vue.*?app|angular.*?app/i
  ].any? { |pattern| body.match?(pattern) }

  # Pages with hardly any text are rejected; JS-rendered pages are allowed
  # to be shorter before that happens.
  text_length = doc.text.strip.length
  if js_rendered
    raise "HTML response appears to be a JavaScript-rendered page with no server-side content" if text_length < 20
  elsif text_length < 50
    raise "HTML response appears to be empty or too short"
  end

  # Check for common error page indicators (but be more specific)
  error_patterns = [
    /^error 404$/i,
    /^page not found$/i,
    /^access denied$/i,
    /^forbidden$/i,
    /^internal server error$/i,
    /404.*?not found/i,
    /error.*?404/i
  ]
  signals_error = ->(text) { text && error_patterns.any? { |pattern| text.match?(pattern) } }

  # Only flag as error if it's clearly an error page (title or main heading)
  page_title = doc.css("title").first&.text&.strip
  first_heading = doc.css("h1").first&.text&.strip
  if signals_error.call(page_title) || signals_error.call(first_heading)
    raise "HTML response appears to be an error page"
  end

  doc
rescue Nokogiri::XML::SyntaxError => e
  raise "Failed to parse HTML: #{e.message}"
end
922
+
923
# Builds the status summary for a parsed status page.
#
# Page chrome (navigation, header/footer, scripts, styles, banners, ads) is
# removed from +doc+ in place, then the status, history and message
# extractors run on what is left. When the combined text is longer than
# +max_length+, each non-empty section is capped at a fixed share of the
# budget: 30% status, 50% history, 20% messages.
#
# @param doc [Nokogiri document] parsed page; modified by this method
# @param max_length [Integer] character budget for the combined output
# @return [Hash] with keys :latest_status, :history and :messages
def extract_status_info(doc, max_length)
  # Remove unwanted elements
  doc.css("nav, header, footer, .navigation, .sidebar, script, style, .cookie-banner, .privacy-banner, .advertisement, .ads").remove

  latest_status = extract_latest_status(doc) # 1. current status indicator
  history = extract_history(doc)             # 2. recent incidents/updates
  messages = extract_messages(doc)           # 3. messages/announcements

  # Section texts in the order: status, history, messages
  sections = [latest_status || "", history.join("\n"), messages.join("\n")]
  combined_length = sections.reject(&:empty?).join("\n").length

  if combined_length > max_length
    # A cap is only computed for sections that actually have content;
    # empty sections get nil and are left untouched below.
    status_cap, history_cap, messages_cap = sections.zip([0.3, 0.5, 0.2]).map do |section, share|
      [(max_length * share).to_i, section.length].min unless section.empty?
    end

    latest_status = truncate_text(latest_status, status_cap) if latest_status && status_cap
    history = truncate_array(history, history_cap) if history_cap
    messages = truncate_array(messages, messages_cap) if messages_cap
  end

  {
    latest_status: latest_status,
    history: history,
    messages: messages
  }
end
967
+
968
# Derives a short "current status" string from a parsed status page.
#
# Strategies are tried in order until one yields a candidate:
#   1. "Component <bullet> State" pairs found in the page text,
#   2. well-known status selectors,
#   3. headings, then short paragraphs, of the main content area,
#   4. the page title.
# A candidate that looks like navigation/UI text, has more than 20 words, or
# reads like a page-structure title is discarded.
#
# Candidate status elements are inspected on a copy, so probing them no
# longer strips links and buttons out of the shared document that later
# extraction steps read. The main-content fallback still removes page chrome
# from +doc+ in place.
#
# @param doc [Nokogiri document] parsed page
# @return [String, nil] the status text, or nil when nothing usable is found
def extract_latest_status(doc)
  # First, check for component status lists (common pattern: "Component Name ? Operational")
  component_status_pattern = /([A-Za-z0-9\s-]+)\s+[?•]\s+(Operational|Degraded|Down|Outage|Maintenance)/i
  all_text = doc.text

  # Each scan result is [component_name, state]
  components = all_text.scan(component_status_pattern)
  if components.any?
    troubled = components.reject { |(_name, state)| state.match?(/operational/i) }

    # If all components are operational, return "Operational"
    return "Operational" if troubled.empty?

    # An active incident (Investigating, Monitoring, Identified with no
    # Resolved/Completed marker) is reported as degraded; otherwise a
    # component that is down or in outage makes it a partial outage.
    active_incident = all_text.match?(/Investigating|Monitoring|Identified/i) && !all_text.match?(/Resolved|Completed/i)
    outage = troubled.any? { |(_name, state)| state.match?(/down|outage/i) }
    return (!active_incident && outage) ? "Partial Outage" : "Degraded Performance"
  end

  status_keywords = /operational|degraded|down|outage|incident|maintenance|all systems|resolved|investigating|partial|major|minor/i
  ui_prefix = /^(Support|Log in|Sign up|Subscribe|Email|Get|Visit|Click|Home|About)/i

  # Try common status page selectors
  status_selectors = [
    ".status-indicator",
    ".status",
    "[class*='status-indicator']",
    "[data-status]",
    ".component-status",
    ".operational-status",
    ".current-status",
    "div[class*='indicator'][class*='status']",
    ".page-status",
    "main .status:first-of-type",
    ".status-page-status",
    "[data-component-status]",
    ".unresolved-incident",
    ".resolved-incident",
    "h1[class*='status']",
    "h2[class*='status']"
  ]

  status_text = nil
  status_selectors.each do |selector|
    element = doc.css(selector).first
    next unless element

    # Work on a copy: removing nested navigation and UI elements from the
    # live node would also delete them for every later reader of the document.
    probe = element.dup
    probe.css("nav, .navigation, .menu, a, button, .button, .link").remove

    # Get text and clean it
    text = probe.text.strip
    next if text.length < 3
    next if text.length > 300 # Too long, probably not a status indicator

    # Check for status keywords
    if text.match?(status_keywords)
      status_text = text
      break
    end
  end

  # If no specific status found, try to find main content area
  unless status_text
    main_content = doc.css("main, .content, .status-content, .page-content, article, [role='main']").first
    if main_content
      # Remove UI elements from main content (in place)
      main_content.css("nav, header, footer, .navigation, .sidebar, .menu, form, input, button, .header, .footer").remove

      # Look for status-like text in first few headings:
      # short, contains a status keyword, and is not a page-structure title
      main_content.css("h1, h2, h3").first(5).each do |heading|
        text = heading.text.strip
        next unless text.length > 3 && text.length <= 200
        next unless text.match?(status_keywords)
        next if text.match?(ui_prefix)
        next if text.match?(/Status.*Incident History|Incident History.*Status|.*Status.*-.*Incident/i)

        status_text = text
        break
      end

      # If headings didn't work, try first short paragraph
      unless status_text
        paragraph = main_content.css("p").first(5).find do |para|
          text = para.text.strip
          text.length < 200 && text.length > 10 && text.match?(status_keywords)
        end
        status_text = paragraph.text.strip if paragraph
      end
    end
  end

  # Fallback: extract from title (but filter out common page titles)
  unless status_text
    title = doc.css("title").first&.text&.strip
    if title && !title.match?(/^(Status|System Status|Service Status|.*Status)$/i)
      # Only use title if it's short and looks like a status
      if title.length < 100 && title.match?(/operational|degraded|down|outage|incident|maintenance/i)
        status_text = title
      end
    end
  end

  text = purify_text(status_text)
  # Filter out if it looks like navigation or UI
  return nil if text && (text.match?(ui_prefix) || text.split(/\s+/).length > 20)
  # Filter out page structure patterns like "Service Status - Incident History"
  return nil if text&.match?(/Status.*Incident History|Incident History.*Status/i)

  text
end
1091
+
1092
# Collects incident/update entries from a parsed status page.
#
# Known history selectors are tried in order; the first selector that yields
# at least one usable entry wins (up to 15 elements are read for it). If no
# selector produces anything, list items and sections of the main content
# area that mention a date, time or status keyword are used instead.
# Entries shorter than 20 characters are ignored. Nested UI elements are
# removed from the visited nodes in place.
#
# @param doc [Nokogiri document] parsed page
# @return [Array<String>] at most 20 cleaned entries, in document order
def extract_history(doc)
  # Common history/incident selectors
  history_selectors = [
    ".incident",
    ".incident-list",
    ".history",
    ".timeline",
    ".status-update",
    ".update",
    "[class*='incident']",
    "[class*='history']",
    "[class*='timeline']",
    "[class*='update']",
    ".recent-incidents",
    ".past-incidents",
    "[data-incident]",
    ".incident-item",
    ".history-item",
    "article[class*='incident']",
    "article[class*='update']"
  ]

  history_items = []

  history_selectors.each do |selector|
    doc.css(selector).first(15).each do |node|
      # Drop nested navigation and UI elements before reading the text
      node.css("nav, .navigation, .menu, script, style, .close, .dismiss, button, .button").remove

      raw = node.text.strip
      next if raw.length < 20

      cleaned = purify_text(raw)
      history_items << cleaned unless cleaned.length < 20
    end

    # Stop at the first selector that produced something
    break unless history_items.empty?
  end

  return history_items.first(20) unless history_items.empty?

  # Nothing matched the dedicated selectors: fall back to the main content
  main_content = doc.css("main, .content, article, [role='main']").first
  return [] unless main_content

  # Remove UI elements
  main_content.css("nav, header, footer, .navigation, .sidebar, .menu, form, input, button, .header, .footer").remove

  # List items, sections, or divs that might be history
  candidates = main_content.css("li, .item, .entry, section, article, div[class*='incident'], div[class*='update'], div[class*='history']").first(20)
  candidates.each do |item|
    # Skip anything that wraps navigation
    next if item.css("nav, .navigation, .menu").any?

    raw = item.text.strip
    next if raw.length < 20
    next if raw.length > 2000 # Too long, probably not a single history item

    # Keep only text that looks like a status update (date, time, or status keywords)
    next unless raw.match?(/\d{4}|\d{1,2}\/\d{1,2}|\d{1,2}-\d{1,2}|\d{1,2}:\d{2}|UTC|EST|PST|operational|degraded|resolved|investigating|incident|outage|maintenance|update|resolved/i)

    cleaned = purify_text(raw)
    history_items << cleaned if cleaned.length >= 20
  end

  history_items.first(20) # Limit to 20 entries
end
1163
+
1164
# Collects announcement/alert texts from a parsed status page.
#
# Message selectors are tried in order and the first one that yields any
# text wins; up to five elements are read for it. Scripts, styles and
# close/dismiss controls are removed from the visited nodes in place, and
# texts shorter than 10 characters are skipped.
#
# @param doc [Nokogiri document] parsed page
# @return [Array<String>] at most 5 cleaned messages
def extract_messages(doc)
  # Common message/announcement selectors
  message_selectors = [
    ".message",
    ".announcement",
    ".alert",
    ".notification",
    "[class*='message']",
    "[class*='announcement']",
    "[class*='alert']",
    ".banner-message",
    ".status-message"
  ]

  collected = []

  message_selectors.each do |selector|
    doc.css(selector).first(5).each do |node|
      node.css("script, style, .close, .dismiss").remove

      body = node.text.strip
      collected << purify_text(body) unless body.length < 10
    end

    # Stop at the first selector that produced something
    break unless collected.empty?
  end

  collected.first(5) # Limit to 5 messages
end
1197
+
1198
# Strips UI boilerplate from extracted page text and normalises whitespace.
#
# Boilerplate phrases are matched case-insensitively. Single UI words
# ("Loading", and for texts under 50 characters "Cookie(s)", "Privacy",
# "Accept", "Decline") are only removed as whole words, so ordinary words
# such as "Downloading" or "Declined" are left intact. Runs of spaces/tabs
# collapse to one space and runs of blank lines, including lines holding
# only spaces or tabs, collapse to a single blank line.
#
# @param text [String, nil] raw text
# @return [String, nil] cleaned text, or nil when +text+ is nil
def purify_text(text)
  return nil unless text

  # Remove common UI text patterns
  cleaned = text.gsub(/Notifications.*?signed in.*?reload/im, "")
  cleaned = cleaned.gsub(/You must be signed in.*?reload/im, "")
  cleaned = cleaned.gsub(/There was an error.*?reload/im, "")
  cleaned = cleaned.gsub(/Please reload this page\.?/i, "")
  # Whole word only, plus any trailing dots of a "Loading..." placeholder
  cleaned = cleaned.gsub(/\bLoading\b\.*/i, "")
  cleaned = cleaned.gsub(/\b(?:Cookies?|Privacy|Accept|Decline)\b/i, "") if cleaned.length < 50

  # Clean up whitespace
  cleaned = cleaned.gsub(/[ \t]{2,}/, " ")
  # Blank lines in extracted HTML text usually still contain indentation,
  # so whitespace-only lines count as blank here.
  cleaned = cleaned.gsub(/\n(?:[ \t]*\n){2,}/, "\n\n")
  cleaned.strip
end
1217
+
1218
# Shortens +text+ so that the result, including the trailing "...", is never
# longer than +max_length+ characters.
#
# The cut is made at the last sentence end, paragraph break or line break
# inside the allowed window when one exists, otherwise at the window edge.
# Text that already fits (or nil) is returned unchanged. When +max_length+
# is too small to hold the ellipsis, a plain prefix of +max_length+
# characters is returned.
#
# @param text [String, nil] text to shorten
# @param max_length [Integer] maximum length of the returned string
# @return [String, nil]
def truncate_text(text, max_length)
  return text unless text && text.length > max_length

  ellipsis = "..."
  budget = max_length - ellipsis.length # room left for real content
  return text[0, [max_length, 0].max] if budget <= 0

  window = text[0, budget]

  # Try to cut at a reasonable point (sentence or paragraph boundary)
  cut_point = window.rindex(/[.!?]\s+/) || window.rindex(/\n\n/) || window.rindex(/\n/)
  head = cut_point ? window[0..cut_point].strip : ""
  # A boundary at the very start would leave nothing; use the full window then
  head = window.strip if head.empty?

  head + ellipsis
end
1226
+
1227
# Keeps leading items while their summed lengths stay within
# +max_total_length+.
#
# The first item that does not fit ends the scan: it is appended in
# shortened form (via truncate_text) only when more than 100 characters of
# budget remain, otherwise it is dropped. Nothing after it is considered.
#
# @param items [Array<String>] entries in priority order
# @param max_total_length [Integer] budget for the summed item lengths
# @return [Array<String>] new array with the retained entries
def truncate_array(items, max_total_length)
  kept = []
  used = 0

  items.each do |entry|
    if used + entry.length <= max_total_length
      kept << entry
      used += entry.length
      next
    end

    # Only add a partial item if there is meaningful space left
    room = max_total_length - used
    kept << truncate_text(entry, room) if room > 100
    break
  end

  kept
end
1251
+ end
1252
+ end
1253
+ end