RubyGems - aircana - Versions diffs - 1.1.0.rc1 → 1.3.0 - Mend

aircana 1.1.0.rc1 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

checksums.yaml +4 -4
data/.rspec_status +188 -160
data/CHANGELOG.md +7 -0
data/CLAUDE.md +18 -6
data/README.md +47 -6
data/lib/aircana/cli/app.rb +5 -2
data/lib/aircana/cli/commands/agents.rb +118 -6
data/lib/aircana/cli/commands/doctor.rb +11 -5
data/lib/aircana/cli/commands/doctor_checks.rb +48 -0
data/lib/aircana/cli/commands/install.rb +4 -5
data/lib/aircana/contexts/confluence.rb +0 -1
data/lib/aircana/contexts/manifest.rb +20 -0
data/lib/aircana/contexts/web.rb +341 -0
data/lib/aircana/generators/agents_generator.rb +1 -1
data/lib/aircana/generators/helpers.rb +1 -2
data/lib/aircana/generators/hooks_generator.rb +2 -0
data/lib/aircana/generators/relevant_files_command_generator.rb +1 -1
data/lib/aircana/system_checker.rb +11 -0
data/lib/aircana/templates/agents/base_agent.erb +4 -3
data/lib/aircana/templates/hooks/notification_sqs.erb +75 -0
data/lib/aircana/templates/hooks/pre_tool_use.erb +32 -7
data/lib/aircana/version.rb +1 -1
data/spec_output_1758908468_248/commands/air-add-relevant-files.md +1 -0
data/spec_output_1758908479_36/commands/air-add-relevant-files.md +1 -0
data/spec_output_1758908547_132/commands/air-add-relevant-files.md +1 -0
data/spec_output_1758908553_721/commands/air-add-relevant-files.md +1 -0
data/spec_output_1758917010_960/commands/air-add-relevant-files.md +1 -0
data/spec_output_1758917064_555/commands/air-add-relevant-files.md +1 -0
metadata +11 -7
data/lib/aircana/cli/commands/plan.rb +0 -69
data/lib/aircana/cli/commands/work.rb +0 -69
data/lib/aircana/templates/agents/defaults/planner.erb +0 -126
data/lib/aircana/templates/agents/defaults/worker.erb +0 -185

data/lib/aircana/cli/commands/agents.rb CHANGED Viewed

@@ -4,6 +4,7 @@ require "json"
 require "tty-prompt"
 require_relative "../../generators/agents_generator"
 require_relative "../../contexts/manifest"
+require_relative "../../contexts/web"
 module Aircana
   module CLI
@@ -43,6 +44,9 @@ module Aircana
           # Prompt for knowledge fetching
           prompt_for_knowledge_fetch(prompt, normalized_agent_name)
+          # Prompt for web URL fetching
+          prompt_for_url_fetch(prompt, normalized_agent_name)
           # Prompt for agent file review
           prompt_for_agent_review(prompt, file)
@@ -59,6 +63,44 @@ module Aircana
           print_agents_list(agent_folders)
         end
+        # Fetches a single web URL into an existing agent's knowledge base and
+        # records it under the "web" source of the agent's knowledge manifest.
+        #
+        # Logs an error and exits with status 1 when the agent's knowledge
+        # directory does not exist, when the fetch returns nothing, or when an
+        # Aircana::Error is raised along the way.
+        #
+        # @param agent [String] agent name as given by the user (normalized here)
+        # @param url [String] the URL to fetch
+        def add_url(agent, url) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
+          normalized_agent = normalize_string(agent)
+          unless agent_exists?(normalized_agent)
+            Aircana.human_logger.error "Agent '#{agent}' not found. Use 'aircana agents list' to see available agents."
+            exit 1
+          end
+          url_entry = Aircana::Contexts::Web.new.fetch_url_for(agent: normalized_agent, url: url)
+          unless url_entry
+            Aircana.human_logger.error "Failed to fetch URL: #{url}"
+            exit 1
+          end
+          existing_sources = Aircana::Contexts::Manifest.sources_from_manifest(normalized_agent)
+          web_sources, other_sources = existing_sources.partition { |source| source["type"] == "web" }
+          # Create the web source on first use; otherwise extend the first existing one.
+          web_sources << { "type" => "web", "urls" => [] } if web_sources.empty?
+          target = web_sources.first
+          # Array() tolerates a web source without a "urls" list, and dropping any
+          # earlier entry for the same URL makes a re-add refresh it instead of
+          # leaving a duplicate in the manifest.
+          kept_entries = Array(target["urls"]).reject { |entry| entry["url"] == url_entry["url"] }
+          target["urls"] = kept_entries << url_entry
+          Aircana::Contexts::Manifest.update_manifest(normalized_agent, other_sources + web_sources)
+          Aircana.human_logger.success "Successfully added URL to agent '#{agent}'"
+        rescue Aircana::Error => e
+          Aircana.human_logger.error "Failed to add URL: #{e.message}"
+          exit 1
+        end
         private
         def perform_refresh(normalized_agent)
@@ -77,20 +119,38 @@ module Aircana
           end
         end
-        def perform_manifest_aware_refresh(normalized_agent)
-          confluence = Aircana::Contexts::Confluence.new
+        def perform_manifest_aware_refresh(normalized_agent) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
+          total_pages = 0
+          all_sources = []
           # Try manifest-based refresh first
           if Aircana::Contexts::Manifest.manifest_exists?(normalized_agent)
             Aircana.human_logger.info "Refreshing from knowledge manifest..."
-            result = confluence.refresh_from_manifest(agent: normalized_agent)
+            # Refresh Confluence sources
+            confluence = Aircana::Contexts::Confluence.new
+            confluence_result = confluence.refresh_from_manifest(agent: normalized_agent)
+            total_pages += confluence_result[:pages_count]
+            all_sources.concat(confluence_result[:sources])
+            # Refresh web sources
+            web = Aircana::Contexts::Web.new
+            web_result = web.refresh_web_sources(agent: normalized_agent)
+            total_pages += web_result[:pages_count]
+            all_sources.concat(web_result[:sources])
           else
             Aircana.human_logger.info "No manifest found, falling back to label-based search..."
-            result = confluence.fetch_pages_for(agent: normalized_agent)
+            confluence = Aircana::Contexts::Confluence.new
+            confluence_result = confluence.fetch_pages_for(agent: normalized_agent)
+            total_pages += confluence_result[:pages_count]
+            all_sources.concat(confluence_result[:sources])
           end
-          log_refresh_result(normalized_agent, result[:pages_count])
-          result
+          # Update manifest with all sources combined
+          Aircana::Contexts::Manifest.update_manifest(normalized_agent, all_sources) if all_sources.any?
+          log_refresh_result(normalized_agent, total_pages)
+          { pages_count: total_pages, sources: all_sources }
         end
         def show_gitignore_recommendation
@@ -133,6 +193,9 @@ module Aircana
             within its domain.
             Print the output to STDOUT only, without any additional commentary.
+            The description should be 2-3 sentences. Most of the agent's context comes from
+            its knowledge base
           PROMPT
         end
@@ -153,6 +216,43 @@ module Aircana
           Aircana.human_logger.info "You can try again later with 'aircana agents refresh #{normalized_agent_name}'"
         end
+        # Asks whether web URLs should be added to the agent's knowledge base,
+        # collects them one per prompt until an empty answer, and fetches the
+        # valid ones. Fetch failures raised as Aircana::Error are only logged.
+        #
+        # @param prompt [#yes?, #ask] interactive prompt object
+        # @param normalized_agent_name [String] agent the URLs are fetched for
+        def prompt_for_url_fetch(prompt, normalized_agent_name) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
+          return unless prompt.yes?("Would you like to add web URLs for this agent's knowledge base?")
+          collected = []
+          loop do
+            answer = prompt.ask("Enter URL (or press Enter to finish):")
+            # A nil or blank answer ends the collection loop.
+            break if answer.nil? || answer.strip.empty?
+            candidate = answer.strip
+            unless valid_url?(candidate)
+              Aircana.human_logger.warn "Invalid URL format: #{candidate}. Please enter a valid HTTP or HTTPS URL."
+              next
+            end
+            collected << candidate
+          end
+          return if collected.empty?
+          begin
+            Aircana.human_logger.info "Fetching #{collected.size} URL(s)..."
+            outcome = Aircana::Contexts::Web.new.fetch_urls_for(agent: normalized_agent_name, urls: collected)
+            fetched_count = outcome[:pages_count]
+            if fetched_count.positive?
+              Aircana.human_logger.success "Successfully fetched #{fetched_count} URL(s)"
+              show_gitignore_recommendation
+            else
+              Aircana.human_logger.warn "No URLs were successfully fetched"
+            end
+          rescue Aircana::Error => e
+            Aircana.human_logger.warn "Failed to fetch URLs: #{e.message}"
+            Aircana.human_logger.info(
+              "You can add URLs later with 'aircana agents add-url #{normalized_agent_name} <URL>'"
+            )
+          end
+        end
         def prompt_for_agent_review(prompt, file_path)
           Aircana.human_logger.info "Agent file created at: #{file_path}"
@@ -214,6 +314,18 @@ module Aircana
           config["description"] || "No description available"
         end
+        def agent_exists?(agent_name)
+          agent_dir = File.join(Aircana.configuration.agent_knowledge_dir, agent_name)
+          Dir.exist?(agent_dir)
+        end
+        def valid_url?(url)
+          uri = URI.parse(url)
+          %w[http https].include?(uri.scheme) && !uri.host.nil?
+        rescue URI::InvalidURIError
+          false
+        end
         def find_available_editor
           %w[code subl atom nano vim vi].find { |cmd| system("which #{cmd} > /dev/null 2>&1") }
         end

data/lib/aircana/cli/commands/doctor.rb CHANGED Viewed

@@ -15,6 +15,7 @@ module Aircana
         include DoctorChecks::ClaudeIntegration
         include DoctorChecks::AircanaConfiguration
         include DoctorChecks::OptionalIntegrations
+        include DoctorChecks::SQSIntegration
         def run(verbose: false)
           @verbose = verbose
@@ -22,11 +23,7 @@ module Aircana
           Aircana.human_logger.info "🔍 Checking Aircana system health...\n"
-          check_required_dependencies
-          check_claude_integration
-          check_optional_dependencies
-          check_aircana_configuration
-          check_optional_integrations
+          run_all_checks
           display_summary
           @issues_found ? 1 : 0
@@ -34,6 +31,15 @@ module Aircana
         private
+        def run_all_checks
+          check_required_dependencies
+          check_claude_integration
+          check_optional_dependencies
+          check_aircana_configuration
+          check_optional_integrations
+          check_sqs_integration
+        end
         def check_required_dependencies
           Aircana.human_logger.info "Required Dependencies:"

data/lib/aircana/cli/commands/doctor_checks.rb CHANGED Viewed

@@ -126,6 +126,54 @@ module Aircana
           end
         end
       end
+      module SQSIntegration
+        def check_sqs_integration
+          Aircana.human_logger.info "\nSQS Integration:"
+          check_sqs_dependencies
+          check_sqs_configuration
+        end
+        def check_sqs_dependencies
+          check_command("aws", "SQS operations", required: false)
+          check_command("jq", "JSON processing for notifications", required: false)
+        end
+        def check_sqs_configuration
+          sqs_queue_url = ENV.fetch("AIRCANA_SQS_QUEUE_URL", nil)
+          sqs_message_template = ENV.fetch("AIRCANA_SQS_MESSAGE_TEMPLATE", nil)
+          aws_region = ENV.fetch("AWS_REGION", "us-east-1")
+          if sqs_configured?(sqs_queue_url, sqs_message_template)
+            log_success("SQS Config", "Environment variables configured")
+            log_sqs_config_details(sqs_queue_url, sqs_message_template, aws_region) if @verbose
+          else
+            log_info("SQS Config", "Not configured")
+            log_sqs_configuration_remedy
+          end
+        end
+        def sqs_configured?(queue_url, message_template)
+          !queue_url.nil? && !queue_url.empty? &&
+            !message_template.nil? && !message_template.empty?
+        end
+        def log_sqs_config_details(queue_url, message_template, aws_region)
+          log_info("  AIRCANA_SQS_QUEUE_URL", queue_url.length > 50 ? "#{queue_url[0..47]}..." : queue_url)
+          log_info("  AIRCANA_SQS_MESSAGE_TEMPLATE",
+                   message_template.length > 40 ? "#{message_template[0..37]}..." : message_template)
+          log_info("  AWS_REGION", aws_region)
+        end
+        def log_sqs_configuration_remedy
+          log_remedy("Set AIRCANA_SQS_QUEUE_URL and AIRCANA_SQS_MESSAGE_TEMPLATE for SQS notifications")
+          log_remedy("Example:")
+          log_remedy('  export AIRCANA_SQS_QUEUE_URL="https://sqs.us-east-1.amazonaws.com/account/queue"')
+          log_remedy('  export AIRCANA_SQS_MESSAGE_TEMPLATE=\'{"text":"{{message}}}\'')
+          log_remedy('  export AWS_REGION="us-east-1"  # Optional, defaults to us-east-1')
+        end
+      end
     end
   end
 end

data/lib/aircana/cli/commands/install.rb CHANGED Viewed

@@ -9,7 +9,7 @@ module Aircana
     module Install
       class << self
         def run
-          ensure_output_exists
+          generate_files
           ensure_project_config_exists
           install_commands_to_claude
           install_hooks_to_claude
@@ -17,10 +17,8 @@ module Aircana
         private
-        def ensure_output_exists
-          return if Dir.exist?(Aircana.configuration.output_dir)
-          Aircana.human_logger.warn("No generated output files-auto generating now...")
+        def generate_files
+          Aircana.human_logger.info("Generating files before installation...")
           Generate.run
         end
@@ -118,6 +116,7 @@ module Aircana
             "post_tool_use" => { event: "PostToolUse", matcher: nil },
             "user_prompt_submit" => { event: "UserPromptSubmit", matcher: nil },
             "session_start" => { event: "SessionStart", matcher: nil },
+            "notification_sqs" => { event: "Notification", matcher: nil },
             "rubocop_pre_commit" => { event: "PreToolUse", matcher: "Bash" },
             "rspec_test" => { event: "PostToolUse", matcher: "Bash" },
             "bundle_install" => { event: "PostToolUse", matcher: "Bash" }

data/lib/aircana/contexts/confluence.rb CHANGED Viewed

@@ -56,7 +56,6 @@ module Aircana
         return { pages_count: 0, sources: [] } if all_pages.empty?
         updated_sources = process_pages_with_manifest(all_pages, agent)
-        Manifest.update_manifest(agent, updated_sources)
         { pages_count: all_pages.size, sources: updated_sources }
       end

data/lib/aircana/contexts/manifest.rb CHANGED Viewed

@@ -128,6 +128,8 @@ module Aircana
           case source["type"]
           when "confluence"
             validate_confluence_source(source)
+          when "web"
+            validate_web_source(source)
           else
             raise ManifestError, "Unknown source type: #{source["type"]}"
           end
@@ -140,6 +142,24 @@ module Aircana
           raise ManifestError, "Confluence pages must be an array"
         end
+        def validate_web_source(source)
+          raise ManifestError, "Web source missing required field: urls" unless source.key?("urls")
+          raise ManifestError, "Web urls must be an array" unless source["urls"].is_a?(Array)
+          source["urls"].each do |url_entry|
+            validate_web_url_entry(url_entry)
+          end
+        end
+        def validate_web_url_entry(url_entry)
+          raise ManifestError, "Each URL entry must be a hash" unless url_entry.is_a?(Hash)
+          raise ManifestError, "URL entry missing required field: url" unless url_entry.key?("url")
+          raise ManifestError, "URL entry missing required field: title" unless url_entry.key?("title")
+        end
       end
     end

data/lib/aircana/contexts/web.rb ADDED Viewed

@@ -0,0 +1,341 @@
+# frozen_string_literal: true
+require "httparty"
+require "reverse_markdown"
+require "uri"
+require_relative "local"
+require_relative "manifest"
+require_relative "../progress_tracker"
+require_relative "../version"
+require_relative "../llm/claude_client"
+module Aircana
+  module Contexts
+    class Web # rubocop:disable Metrics/ClassLength
+      include HTTParty
+      headers "User-Agent" => "Aircana/#{Aircana::VERSION} (+https://github.com/westonkd/aircana)"
+      default_timeout 30
+      follow_redirects true
+      def initialize
+        @local_storage = Local.new
+      end
+      def fetch_url_for(agent:, url:)
+        validate_url!(url)
+        page_data = fetch_and_process_url(url)
+        store_page_as_markdown(page_data, agent)
+        build_url_metadata(page_data)
+      rescue StandardError => e
+        handle_fetch_error(url, e)
+        nil
+      end
+      def fetch_urls_for(agent:, urls:) # rubocop:disable Metrics/MethodLength
+        return { pages_count: 0, sources: [] } if urls.empty?
+        pages_metadata = []
+        successful_urls = []
+        ProgressTracker.with_batch_progress(urls, "Fetching URLs") do |url, _index|
+          metadata = fetch_url_for(agent: agent, url: url)
+          if metadata
+            pages_metadata << metadata
+            successful_urls << url
+          end
+        end
+        if successful_urls.any?
+          sources = build_sources_metadata(successful_urls, pages_metadata)
+          update_or_create_manifest(agent, sources)
+          { pages_count: successful_urls.size, sources: sources }
+        else
+          { pages_count: 0, sources: [] }
+        end
+      end
+      def refresh_web_sources(agent:) # rubocop:disable Metrics/CyclomaticComplexity
+        sources = Manifest.sources_from_manifest(agent)
+        web_sources = sources.select { |s| s["type"] == "web" }
+        return { pages_count: 0, sources: [] } if web_sources.empty?
+        all_urls = web_sources.flat_map { |source| source["urls"]&.map { |u| u["url"] } || [] }
+        return { pages_count: 0, sources: [] } if all_urls.empty?
+        fetch_urls_for(agent: agent, urls: all_urls)
+      end
+      private
+      def validate_url!(url)
+        uri = URI.parse(url)
+        raise Error, "URL must use HTTP or HTTPS protocol" unless %w[http https].include?(uri.scheme)
+        raise Error, "Invalid URL format" unless uri.host
+      rescue URI::InvalidURIError
+        raise Error, "Invalid URL format"
+      end
+      def fetch_and_process_url(url) # rubocop:disable Metrics/MethodLength
+        Aircana.human_logger.info("Fetching #{url}")
+        response = self.class.get(url)
+        raise Error, "Failed to fetch URL (#{response.code})" unless response.success?
+        html_title = extract_title(response.body)
+        content = convert_to_markdown(response.body)
+        title = generate_meaningful_title(html_title, content, url)
+        {
+          url: url,
+          title: title,
+          content: content,
+          last_fetched: Time.now.utc.strftime("%Y-%m-%dT%H:%M:%SZ")
+        }
+      end
+      def extract_title(html) # rubocop:disable Metrics/MethodLength
+        title_match = html.match(%r{<title[^>]*>(.*?)</title>}im)
+        return nil unless title_match
+        title = title_match[1].strip
+        # Decode HTML entities
+        title.gsub(/&([a-zA-Z]+|#\d+);/) do |match|
+          case match
+          when "&amp;" then "&"
+          when "&lt;" then "<"
+          when "&gt;" then ">"
+          when "&quot;" then '"'
+          when "&#39;", "&apos;" then "'"
+          else match
+          end
+        end
+      end
+      def extract_title_from_url(url)
+        uri = URI.parse(url)
+        # Use the last path segment or host as fallback title
+        path_segments = uri.path.split("/").reject(&:empty?)
+        if path_segments.any?
+          path_segments.last.gsub(/[-_]/, " ").split.map(&:capitalize).join(" ")
+        else
+          uri.host
+        end
+      end
+      def generate_meaningful_title(html_title, content, url) # rubocop:disable Metrics/CyclomaticComplexity
+        # If we have a good HTML title that's descriptive, use it
+        return html_title if html_title && html_title.length > 10 && !generic_title?(html_title)
+        # If content is too short, use fallback
+        return html_title || extract_title_from_url(url) if content.length < 50
+        # Use Claude to generate a meaningful title based on content
+        begin
+          generate_title_with_claude(content, url)
+        rescue StandardError => e
+          Aircana.human_logger.warn("Failed to generate title with Claude: #{e.message}")
+          html_title || extract_title_from_url(url)
+        end
+      end
+      def generic_title?(title)
+        generic_patterns = [
+          /^(home|index|welcome|untitled|document)$/i,
+          /^(page|default)$/i,
+          /^\s*$/,
+          # Truncated titles (contain ellipsis)
+          /\.\.\./,
+          # Titles with excessive metadata (site names, IDs, etc.)
+          / - .+ - \d+$/,
+          # Question titles that are truncated
+          /^how do i .+\.\.\./i,
+          /^what is .+\.\.\./i
+        ]
+        generic_patterns.any? { |pattern| title.match?(pattern) }
+      end
+      def generate_title_with_claude(content, url)
+        prompt = build_title_generation_prompt(content, url)
+        claude_client = LLM::ClaudeClient.new
+        claude_client.prompt(prompt).strip
+      end
+      def build_title_generation_prompt(content, url) # rubocop:disable Metrics/MethodLength
+        # Truncate content to avoid overly long prompts
+        truncated_content = content.length > 1000 ? "#{content[0..1000]}..." : content
+        <<~PROMPT
+          Based on the following web page content from #{url}, generate a concise, descriptive title
+          that would help an AI agent understand what this document contains and when it would be useful.
+          The title should be:
+          - 3-8 words long
+          - Focused on the main topic or purpose
+          - Helpful for knowledge retrieval
+          - Professional and clear
+          Content:
+          #{truncated_content}
+          Respond with only the title, no additional text or explanation.
+        PROMPT
+      end
+      def convert_to_markdown(html)
+        return "" if html.nil? || html.empty?
+        # Extract meaningful content by removing unwanted elements
+        cleaned_html = extract_main_content(html)
+        ReverseMarkdown.convert(cleaned_html, github_flavored: true)
+      rescue StandardError => e
+        Aircana.human_logger.warn "Failed to convert HTML to markdown: #{e.message}"
+        # Fallback to plain text extraction
+        extract_text_content(html)
+      end
+      def store_page_as_markdown(page_data, agent)
+        @local_storage.store_content(
+          title: page_data[:title],
+          content: page_data[:content],
+          agent: agent
+        )
+      end
+      def build_url_metadata(page_data)
+        {
+          "url" => page_data[:url],
+          "title" => page_data[:title],
+          "last_fetched" => page_data[:last_fetched]
+        }
+      end
+      def build_sources_metadata(_urls, pages_metadata)
+        [
+          {
+            "type" => "web",
+            "urls" => pages_metadata
+          }
+        ]
+      end
+      def update_or_create_manifest(agent, new_sources)
+        existing_sources = Manifest.sources_from_manifest(agent)
+        # Remove existing web sources and add new ones
+        other_sources = existing_sources.reject { |s| s["type"] == "web" }
+        all_sources = other_sources + new_sources
+        if Manifest.manifest_exists?(agent)
+          Manifest.update_manifest(agent, all_sources)
+        else
+          Manifest.create_manifest(agent, all_sources)
+        end
+      end
+      def handle_fetch_error(url, error)
+        case error
+        when URI::InvalidURIError
+          Aircana.human_logger.error "Invalid URL format: #{url}"
+        when HTTParty::Error
+          Aircana.human_logger.error "HTTP error fetching #{url}: #{error.message}"
+        when Error
+          Aircana.human_logger.error "Error fetching #{url}: #{error.message}"
+        else
+          Aircana.human_logger.error "Unexpected error fetching #{url}: #{error.message}"
+        end
+      end
+      def extract_main_content(html) # rubocop:disable Metrics/MethodLength
+        # Try to find the main content area using common selectors
+        content_patterns = [
+          # Common main content selectors
+          %r{<main[^>]*>(.*?)</main>}mi,
+          %r{<article[^>]*>(.*?)</article>}mi,
+          %r{<div[^>]*class="[^"]*content[^"]*"[^>]*>(.*?)</div>}mi,
+          %r{<div[^>]*id="content"[^>]*>(.*?)</div>}mi,
+          %r{<div[^>]*class="[^"]*post[^"]*"[^>]*>(.*?)</div>}mi,
+          # Documentation specific
+          %r{<div[^>]*class="[^"]*docs[^"]*"[^>]*>(.*?)</div>}mi,
+          %r{<div[^>]*class="[^"]*documentation[^"]*"[^>]*>(.*?)</div>}mi,
+          # Body content as fallback
+          %r{<body[^>]*>(.*?)</body>}mi
+        ]
+        extracted_content = nil
+        content_patterns.each do |pattern|
+          match = html.match(pattern)
+          if match && match[1].strip.length > 100 # Ensure meaningful content
+            extracted_content = match[1]
+            break
+          end
+        end
+        # If no pattern matched or content is too short, use the full HTML
+        content_to_clean = extracted_content || html
+        # Remove unwanted elements
+        clean_html_content(content_to_clean)
+      end
+      def clean_html_content(html) # rubocop:disable Metrics/MethodLength
+        cleaned = html.dup
+        # Remove script and style tags completely
+        cleaned = cleaned.gsub(%r{<script[^>]*>.*?</script>}mi, "")
+        cleaned = cleaned.gsub(%r{<style[^>]*>.*?</style>}mi, "")
+        # Remove navigation, header, footer, sidebar elements
+        navigation_selectors = %w[nav header footer aside sidebar menu breadcrumb]
+        navigation_selectors.each do |selector|
+          # Remove by tag name
+          cleaned = cleaned.gsub(%r{<#{selector}[^>]*>.*?</#{selector}>}mi, "")
+          # Remove by class name (common patterns)
+          cleaned = cleaned.gsub(%r{<[^>]+class="[^"]*#{selector}[^"]*"[^>]*>.*?</[^>]+>}mi, "")
+          cleaned = cleaned.gsub(%r{<[^>]+id="#{selector}"[^>]*>.*?</[^>]+>}mi, "")
+        end
+        # Remove common non-content elements
+        unwanted_patterns = [
+          %r{<div[^>]*class="[^"]*comment[^"]*"[^>]*>.*?</div>}mi,
+          %r{<div[^>]*class="[^"]*social[^"]*"[^>]*>.*?</div>}mi,
+          %r{<div[^>]*class="[^"]*share[^"]*"[^>]*>.*?</div>}mi,
+          %r{<div[^>]*class="[^"]*ad[^"]*"[^>]*>.*?</div>}mi,
+          %r{<div[^>]*class="[^"]*advertisement[^"]*"[^>]*>.*?</div>}mi,
+          %r{<div[^>]*class="[^"]*popup[^"]*"[^>]*>.*?</div>}mi,
+          %r{<div[^>]*class="[^"]*modal[^"]*"[^>]*>.*?</div>}mi
+        ]
+        unwanted_patterns.each do |pattern|
+          cleaned = cleaned.gsub(pattern, "")
+        end
+        # Clean up whitespace
+        cleaned.gsub(/\n\s*\n\s*\n+/, "\n\n").strip
+      end
+      def extract_text_content(html) # rubocop:disable Metrics/MethodLength
+        # Fallback method for plain text extraction
+        text = html.gsub(%r{<script[^>]*>.*?</script>}mi, "")
+                   .gsub(%r{<style[^>]*>.*?</style>}mi, "")
+                   .gsub(/<[^>]+>/, "")
+                   .gsub("&nbsp;", " ")
+                   .gsub("&amp;", "&")
+                   .gsub("&lt;", "<")
+                   .gsub("&gt;", ">")
+                   .gsub("&quot;", '"')
+                   .gsub(/\s+/, " ")
+                   .strip
+        # If the extracted text is very short, it might not be useful
+        text.length < 20 ? "Content could not be extracted from this page." : text
+      end
+    end
+  end
+end