aircana 1.1.0.rc1 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. checksums.yaml +4 -4
  2. data/.rspec_status +188 -160
  3. data/CHANGELOG.md +7 -0
  4. data/CLAUDE.md +18 -6
  5. data/README.md +47 -6
  6. data/lib/aircana/cli/app.rb +5 -2
  7. data/lib/aircana/cli/commands/agents.rb +118 -6
  8. data/lib/aircana/cli/commands/doctor.rb +11 -5
  9. data/lib/aircana/cli/commands/doctor_checks.rb +48 -0
  10. data/lib/aircana/cli/commands/install.rb +4 -5
  11. data/lib/aircana/contexts/confluence.rb +0 -1
  12. data/lib/aircana/contexts/manifest.rb +20 -0
  13. data/lib/aircana/contexts/web.rb +341 -0
  14. data/lib/aircana/generators/agents_generator.rb +1 -1
  15. data/lib/aircana/generators/helpers.rb +1 -2
  16. data/lib/aircana/generators/hooks_generator.rb +2 -0
  17. data/lib/aircana/generators/relevant_files_command_generator.rb +1 -1
  18. data/lib/aircana/system_checker.rb +11 -0
  19. data/lib/aircana/templates/agents/base_agent.erb +4 -3
  20. data/lib/aircana/templates/hooks/notification_sqs.erb +75 -0
  21. data/lib/aircana/templates/hooks/pre_tool_use.erb +32 -7
  22. data/lib/aircana/version.rb +1 -1
  23. data/spec_output_1758908468_248/commands/air-add-relevant-files.md +1 -0
  24. data/spec_output_1758908479_36/commands/air-add-relevant-files.md +1 -0
  25. data/spec_output_1758908547_132/commands/air-add-relevant-files.md +1 -0
  26. data/spec_output_1758908553_721/commands/air-add-relevant-files.md +1 -0
  27. data/spec_output_1758917010_960/commands/air-add-relevant-files.md +1 -0
  28. data/spec_output_1758917064_555/commands/air-add-relevant-files.md +1 -0
  29. metadata +11 -7
  30. data/lib/aircana/cli/commands/plan.rb +0 -69
  31. data/lib/aircana/cli/commands/work.rb +0 -69
  32. data/lib/aircana/templates/agents/defaults/planner.erb +0 -126
  33. data/lib/aircana/templates/agents/defaults/worker.erb +0 -185
@@ -4,6 +4,7 @@ require "json"
4
4
  require "tty-prompt"
5
5
  require_relative "../../generators/agents_generator"
6
6
  require_relative "../../contexts/manifest"
7
+ require_relative "../../contexts/web"
7
8
 
8
9
  module Aircana
9
10
  module CLI
@@ -43,6 +44,9 @@ module Aircana
43
44
  # Prompt for knowledge fetching
44
45
  prompt_for_knowledge_fetch(prompt, normalized_agent_name)
45
46
 
47
+ # Prompt for web URL fetching
48
+ prompt_for_url_fetch(prompt, normalized_agent_name)
49
+
46
50
  # Prompt for agent file review
47
51
  prompt_for_agent_review(prompt, file)
48
52
 
@@ -59,6 +63,44 @@ module Aircana
59
63
  print_agents_list(agent_folders)
60
64
  end
61
65
 
66
# Fetches +url+ into the agent's knowledge base and records it in the
# agent's manifest under the single "web" source.
#
# Logs an error and exits with status 1 when the agent's knowledge
# directory does not exist, when the fetch returns nothing, or when an
# Aircana::Error is raised along the way.
#
# @param agent [String] agent name as typed by the user
# @param url [String] URL to fetch and register
def add_url(agent, url) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
  normalized_agent = normalize_string(agent)

  unless agent_exists?(normalized_agent)
    Aircana.human_logger.error "Agent '#{agent}' not found. Use 'aircana agents list' to see available agents."
    exit 1
  end

  result = Aircana::Contexts::Web.new.fetch_url_for(agent: normalized_agent, url: url)

  unless result
    Aircana.human_logger.error "Failed to fetch URL: #{url}"
    exit 1
  end

  existing_sources = Aircana::Contexts::Manifest.sources_from_manifest(normalized_agent)
  web_sources, other_sources = existing_sources.partition { |s| s["type"] == "web" }
  web_sources = [{ "type" => "web", "urls" => [] }] if web_sources.empty?

  # A web source may lack its "urls" list; create it instead of calling << on nil.
  urls = (web_sources.first["urls"] ||= [])
  # Re-adding a known URL refreshes its entry rather than recording it twice.
  urls.reject! { |entry| entry["url"] == result["url"] }
  urls << result

  Aircana::Contexts::Manifest.update_manifest(normalized_agent, other_sources + web_sources)
  Aircana.human_logger.success "Successfully added URL to agent '#{agent}'"
rescue Aircana::Error => e
  Aircana.human_logger.error "Failed to add URL: #{e.message}"
  exit 1
end
103
+
62
104
  private
63
105
 
64
106
  def perform_refresh(normalized_agent)
@@ -77,20 +119,38 @@ module Aircana
77
119
  end
78
120
  end
79
121
 
80
- def perform_manifest_aware_refresh(normalized_agent)
81
- confluence = Aircana::Contexts::Confluence.new
122
# Refreshes an agent's knowledge and rewrites its manifest.
#
# When a manifest exists, Confluence and web sources are refreshed from it;
# otherwise Confluence is queried by label. The manifest is only rewritten
# when the refresh produced at least one source. Sources that were recorded
# before the refresh but are missing from its results (for example web URLs
# whose fetch failed this time) are kept in the manifest instead of being
# dropped.
#
# @param normalized_agent [String] normalized agent name
# @return [Hash] +:pages_count+ (total refreshed pages) and +:sources+
#   (the sources produced by this refresh)
def perform_manifest_aware_refresh(normalized_agent) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
  previous_sources = []

  results =
    if Aircana::Contexts::Manifest.manifest_exists?(normalized_agent)
      Aircana.human_logger.info "Refreshing from knowledge manifest..."
      # Snapshot before refreshing: the web refresh rewrites the manifest itself.
      previous_sources = Aircana::Contexts::Manifest.sources_from_manifest(normalized_agent)
      [
        Aircana::Contexts::Confluence.new.refresh_from_manifest(agent: normalized_agent),
        Aircana::Contexts::Web.new.refresh_web_sources(agent: normalized_agent)
      ]
    else
      Aircana.human_logger.info "No manifest found, falling back to label-based search..."
      [Aircana::Contexts::Confluence.new.fetch_pages_for(agent: normalized_agent)]
    end

  total_pages = results.sum { |result| result[:pages_count] }
  all_sources = results.flat_map { |result| result[:sources] }

  if all_sources.any?
    Aircana::Contexts::Manifest.update_manifest(
      normalized_agent,
      merge_refreshed_sources(previous_sources, all_sources)
    )
  end

  log_refresh_result(normalized_agent, total_pages)
  { pages_count: total_pages, sources: all_sources }
end

# Combines freshly refreshed sources with what the manifest held before:
# source types absent from the refresh are carried over unchanged, and
# previously recorded web URLs that were not re-fetched are appended to the
# first refreshed web source.
def merge_refreshed_sources(previous_sources, refreshed_sources) # rubocop:disable Metrics/AbcSize
  refreshed_types = refreshed_sources.map { |source| source["type"] }
  carried_over = previous_sources.reject { |source| refreshed_types.include?(source["type"]) }

  refreshed_urls = refreshed_sources.select { |source| source["type"] == "web" }
                                    .flat_map { |source| source["urls"] || [] }
                                    .map { |entry| entry["url"] }
  unfetched = previous_sources.select { |source| source["type"] == "web" }
                              .flat_map { |source| source["urls"] || [] }
                              .reject { |entry| refreshed_urls.include?(entry["url"]) }

  appended = false
  merged = refreshed_sources.map do |source|
    next source if appended || source["type"] != "web"

    appended = true
    source.merge("urls" => (source["urls"] || []) + unfetched)
  end

  carried_over + merged
end
95
155
 
96
156
  def show_gitignore_recommendation
@@ -133,6 +193,9 @@ module Aircana
133
193
  within its domain.
134
194
 
135
195
  Print the output to STDOUT only, without any additional commentary.
196
+
197
+ The description should be 2-3 sentences. Most of the agent's context comes from
198
+ its knowledge base
136
199
  PROMPT
137
200
  end
138
201
 
@@ -153,6 +216,43 @@ module Aircana
153
216
  Aircana.human_logger.info "You can try again later with 'aircana agents refresh #{normalized_agent_name}'"
154
217
  end
155
218
 
219
# Asks whether web URLs should be added for the agent, collects URLs until
# a blank answer is given, and fetches the valid ones into the agent's
# knowledge base. Invalid entries are reported and skipped. An
# Aircana::Error raised while fetching is logged as a warning together with
# a hint about the add-url command.
#
# @param prompt [#yes?, #ask] interactive prompt object
# @param normalized_agent_name [String] normalized agent name
def prompt_for_url_fetch(prompt, normalized_agent_name) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
  wants_urls = prompt.yes?("Would you like to add web URLs for this agent's knowledge base?")
  return unless wants_urls

  collected = []
  loop do
    candidate = prompt.ask("Enter URL (or press Enter to finish):").to_s.strip
    break if candidate.empty?

    unless valid_url?(candidate)
      Aircana.human_logger.warn "Invalid URL format: #{candidate}. Please enter a valid HTTP or HTTPS URL."
      next
    end

    collected << candidate
  end

  return if collected.empty?

  begin
    Aircana.human_logger.info "Fetching #{collected.size} URL(s)..."
    outcome = Aircana::Contexts::Web.new.fetch_urls_for(agent: normalized_agent_name, urls: collected)
    fetched_count = outcome[:pages_count]

    if fetched_count.positive?
      Aircana.human_logger.success "Successfully fetched #{fetched_count} URL(s)"
      show_gitignore_recommendation
    else
      Aircana.human_logger.warn "No URLs were successfully fetched"
    end
  rescue Aircana::Error => e
    Aircana.human_logger.warn "Failed to fetch URLs: #{e.message}"
    Aircana.human_logger.info(
      "You can add URLs later with 'aircana agents add-url #{normalized_agent_name} <URL>'"
    )
  end
end
255
+
156
256
  def prompt_for_agent_review(prompt, file_path)
157
257
  Aircana.human_logger.info "Agent file created at: #{file_path}"
158
258
 
@@ -214,6 +314,18 @@ module Aircana
214
314
  config["description"] || "No description available"
215
315
  end
216
316
 
317
# Reports whether a knowledge directory for +agent_name+ exists under the
# configured agent knowledge directory.
#
# @param agent_name [String] normalized agent name
# @return [Boolean]
def agent_exists?(agent_name)
  knowledge_root = Aircana.configuration.agent_knowledge_dir
  File.directory?(File.join(knowledge_root, agent_name))
end
321
+
322
# Checks that +url+ parses as an HTTP or HTTPS URL with a non-empty host.
#
# @param url [String] candidate URL
# @return [Boolean] false for unparsable input, other schemes, or a
#   missing/empty host
def valid_url?(url)
  uri = URI.parse(url)
  # The host may parse as an empty string, so a nil check alone is not enough.
  %w[http https].include?(uri.scheme) && !uri.host.to_s.empty?
rescue URI::InvalidURIError
  false
end
328
+
217
329
  def find_available_editor
218
330
  %w[code subl atom nano vim vi].find { |cmd| system("which #{cmd} > /dev/null 2>&1") }
219
331
  end
@@ -15,6 +15,7 @@ module Aircana
15
15
  include DoctorChecks::ClaudeIntegration
16
16
  include DoctorChecks::AircanaConfiguration
17
17
  include DoctorChecks::OptionalIntegrations
18
+ include DoctorChecks::SQSIntegration
18
19
 
19
20
  def run(verbose: false)
20
21
  @verbose = verbose
@@ -22,11 +23,7 @@ module Aircana
22
23
 
23
24
  Aircana.human_logger.info "🔍 Checking Aircana system health...\n"
24
25
 
25
- check_required_dependencies
26
- check_claude_integration
27
- check_optional_dependencies
28
- check_aircana_configuration
29
- check_optional_integrations
26
+ run_all_checks
30
27
 
31
28
  display_summary
32
29
  @issues_found ? 1 : 0
@@ -34,6 +31,15 @@ module Aircana
34
31
 
35
32
  private
36
33
 
34
# Runs every doctor check in a fixed order and returns the result of the
# last one (the SQS integration check).
def run_all_checks
  ordered_checks = %i[
    check_required_dependencies
    check_claude_integration
    check_optional_dependencies
    check_aircana_configuration
    check_optional_integrations
    check_sqs_integration
  ]
  ordered_checks.map { |check| send(check) }.last
end
42
+
37
43
  def check_required_dependencies
38
44
  Aircana.human_logger.info "Required Dependencies:"
39
45
 
@@ -126,6 +126,54 @@ module Aircana
126
126
  end
127
127
  end
128
128
  end
129
+
130
# Doctor checks for the optional SQS notification integration: the command
# line tools it needs and the environment variables that configure it.
# Relies on check_command, log_success, log_info and log_remedy being
# provided by the including object.
module SQSIntegration
  # Prints the section heading and runs the dependency and configuration checks.
  def check_sqs_integration
    Aircana.human_logger.info "\nSQS Integration:"

    check_sqs_dependencies
    check_sqs_configuration
  end

  # Checks for the optional command line tools used by SQS notifications.
  def check_sqs_dependencies
    check_command("aws", "SQS operations", required: false)
    check_command("jq", "JSON processing for notifications", required: false)
  end

  # Reports whether the SQS environment variables are set; in verbose mode
  # the (shortened) values are listed as well.
  def check_sqs_configuration
    sqs_queue_url = ENV.fetch("AIRCANA_SQS_QUEUE_URL", nil)
    sqs_message_template = ENV.fetch("AIRCANA_SQS_MESSAGE_TEMPLATE", nil)
    aws_region = ENV.fetch("AWS_REGION", "us-east-1")

    if sqs_configured?(sqs_queue_url, sqs_message_template)
      log_success("SQS Config", "Environment variables configured")
      log_sqs_config_details(sqs_queue_url, sqs_message_template, aws_region) if @verbose
    else
      log_info("SQS Config", "Not configured")
      log_sqs_configuration_remedy
    end
  end

  # @return [Boolean] true when both values are present and non-empty
  def sqs_configured?(queue_url, message_template)
    [queue_url, message_template].none? { |value| value.nil? || value.empty? }
  end

  # Logs the configured values, shortened to at most 50 (queue URL) and
  # 40 (message template) characters including the trailing ellipsis.
  def log_sqs_config_details(queue_url, message_template, aws_region)
    log_info(" AIRCANA_SQS_QUEUE_URL", sqs_display_value(queue_url, 50))
    log_info(" AIRCANA_SQS_MESSAGE_TEMPLATE", sqs_display_value(message_template, 40))
    log_info(" AWS_REGION", aws_region)
  end

  # Logs how to configure the integration, with a copy-pasteable example.
  def log_sqs_configuration_remedy
    log_remedy("Set AIRCANA_SQS_QUEUE_URL and AIRCANA_SQS_MESSAGE_TEMPLATE for SQS notifications")
    log_remedy("Example:")
    log_remedy(' export AIRCANA_SQS_QUEUE_URL="https://sqs.us-east-1.amazonaws.com/account/queue"')
    # The example template must itself be valid JSON: the closing quote of
    # the "text" value comes before the final brace.
    log_remedy(' export AIRCANA_SQS_MESSAGE_TEMPLATE=\'{"text":"{{message}}"}\'')
    log_remedy(' export AWS_REGION="us-east-1" # Optional, defaults to us-east-1')
  end

  # Shortens +value+ so the result, ellipsis included, never exceeds +limit+.
  def sqs_display_value(value, limit)
    value.length > limit ? "#{value[0, limit - 3]}..." : value
  end
end
129
177
  end
130
178
  end
131
179
  end
@@ -9,7 +9,7 @@ module Aircana
9
9
  module Install
10
10
  class << self
11
11
  def run
12
- ensure_output_exists
12
+ generate_files
13
13
  ensure_project_config_exists
14
14
  install_commands_to_claude
15
15
  install_hooks_to_claude
@@ -17,10 +17,8 @@ module Aircana
17
17
 
18
18
  private
19
19
 
20
- def ensure_output_exists
21
- return if Dir.exist?(Aircana.configuration.output_dir)
22
-
23
- Aircana.human_logger.warn("No generated output files-auto generating now...")
20
# Announces the generation step and regenerates the output files via
# Generate.run, returning whatever Generate.run returns.
def generate_files
  logger = Aircana.human_logger
  logger.info("Generating files before installation...")
  Generate.run
end
26
24
 
@@ -118,6 +116,7 @@ module Aircana
118
116
  "post_tool_use" => { event: "PostToolUse", matcher: nil },
119
117
  "user_prompt_submit" => { event: "UserPromptSubmit", matcher: nil },
120
118
  "session_start" => { event: "SessionStart", matcher: nil },
119
+ "notification_sqs" => { event: "Notification", matcher: nil },
121
120
  "rubocop_pre_commit" => { event: "PreToolUse", matcher: "Bash" },
122
121
  "rspec_test" => { event: "PostToolUse", matcher: "Bash" },
123
122
  "bundle_install" => { event: "PostToolUse", matcher: "Bash" }
@@ -56,7 +56,6 @@ module Aircana
56
56
  return { pages_count: 0, sources: [] } if all_pages.empty?
57
57
 
58
58
  updated_sources = process_pages_with_manifest(all_pages, agent)
59
- Manifest.update_manifest(agent, updated_sources)
60
59
 
61
60
  { pages_count: all_pages.size, sources: updated_sources }
62
61
  end
@@ -128,6 +128,8 @@ module Aircana
128
128
  case source["type"]
129
129
  when "confluence"
130
130
  validate_confluence_source(source)
131
+ when "web"
132
+ validate_web_source(source)
131
133
  else
132
134
  raise ManifestError, "Unknown source type: #{source["type"]}"
133
135
  end
@@ -140,6 +142,24 @@ module Aircana
140
142
 
141
143
  raise ManifestError, "Confluence pages must be an array"
142
144
  end
145
+
146
# Validates a manifest source of type "web": it must carry a "urls" array,
# and every element of that array is checked individually.
#
# @param source [Hash] manifest source entry
# @raise [ManifestError] when "urls" is missing or not an array, or when
#   an entry fails validation
def validate_web_source(source)
  url_entries = source.fetch("urls") do
    raise ManifestError, "Web source missing required field: urls"
  end
  raise ManifestError, "Web urls must be an array" unless url_entries.is_a?(Array)

  url_entries.each { |entry| validate_web_url_entry(entry) }
end
155
+
156
# Validates one element of a web source's "urls" array: it must be a hash
# that has both a "url" and a "title" key.
#
# @param url_entry [Object] candidate entry
# @raise [ManifestError] when the entry is not a hash or a key is missing
def validate_web_url_entry(url_entry)
  raise ManifestError, "Each URL entry must be a hash" unless url_entry.is_a?(Hash)

  %w[url title].each do |field|
    next if url_entry.key?(field)

    raise ManifestError, "URL entry missing required field: #{field}"
  end

  nil
end
143
163
  end
144
164
  end
145
165
 
@@ -0,0 +1,341 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "httparty"
4
+ require "reverse_markdown"
5
+ require "uri"
6
+ require_relative "local"
7
+ require_relative "manifest"
8
+ require_relative "../progress_tracker"
9
+ require_relative "../version"
10
+ require_relative "../llm/claude_client"
11
+
12
+ module Aircana
13
+ module Contexts
14
+ class Web # rubocop:disable Metrics/ClassLength
15
+ include HTTParty
16
+
17
+ headers "User-Agent" => "Aircana/#{Aircana::VERSION} (+https://github.com/westonkd/aircana)"
18
+ default_timeout 30
19
+ follow_redirects true
20
+
21
# Creates the Local storage instance that fetched pages are stored through.
def initialize
  @local_storage = Local.new
end
24
+
25
# Validates, downloads and stores a single URL for +agent+.
#
# Any StandardError raised along the way is passed to handle_fetch_error
# and turned into a nil result, so callers can treat nil as "this URL
# failed".
#
# @param agent [String] agent whose knowledge base receives the page
# @param url [String] URL to fetch
# @return [Hash, nil] manifest metadata for the page, or nil on failure
def fetch_url_for(agent:, url:)
  validate_url!(url)

  fetch_and_process_url(url).then do |page|
    store_page_as_markdown(page, agent)
    build_url_metadata(page)
  end
rescue StandardError => e
  handle_fetch_error(url, e)
  nil
end
36
+
37
# Fetches several URLs for +agent+ and records the successful ones in the
# agent's manifest.
#
# Repeated URLs in +urls+ are fetched and recorded only once. The manifest
# is left untouched when no URL could be fetched.
#
# @param agent [String] agent whose knowledge base receives the pages
# @param urls [Array<String>] URLs to fetch
# @return [Hash] +:pages_count+ (number of URLs fetched) and +:sources+
#   (the web source entry built from them, or an empty array)
def fetch_urls_for(agent:, urls:) # rubocop:disable Metrics/MethodLength
  # Deduplicate so a repeated URL is not downloaded and listed twice.
  unique_urls = urls.uniq
  return { pages_count: 0, sources: [] } if unique_urls.empty?

  pages_metadata = []
  successful_urls = []

  ProgressTracker.with_batch_progress(unique_urls, "Fetching URLs") do |url, _index|
    metadata = fetch_url_for(agent: agent, url: url)
    next unless metadata

    pages_metadata << metadata
    successful_urls << url
  end

  return { pages_count: 0, sources: [] } if successful_urls.empty?

  sources = build_sources_metadata(successful_urls, pages_metadata)
  update_or_create_manifest(agent, sources)
  { pages_count: successful_urls.size, sources: sources }
end
59
+
60
# Re-fetches every URL listed under the agent's "web" manifest sources.
#
# @param agent [String] agent whose manifest is read
# @return [Hash] the result of fetch_urls_for, or an empty result when the
#   manifest lists no web URLs
def refresh_web_sources(agent:)
  recorded_urls = Manifest.sources_from_manifest(agent)
                          .select { |source| source["type"] == "web" }
                          .flat_map { |source| Array(source["urls"]).map { |entry| entry["url"] } }

  return { pages_count: 0, sources: [] } if recorded_urls.empty?

  fetch_urls_for(agent: agent, urls: recorded_urls)
end
71
+
72
+ private
73
+
74
# Ensures +url+ is an HTTP or HTTPS URL with a non-empty host.
#
# @param url [String] URL to validate
# @raise [Error] when the URL cannot be parsed, uses another scheme, or
#   has no host
def validate_url!(url)
  uri = URI.parse(url)
  raise Error, "URL must use HTTP or HTTPS protocol" unless %w[http https].include?(uri.scheme)
  # An empty-string host is truthy, so test for emptiness rather than nil.
  raise Error, "Invalid URL format" if uri.host.to_s.empty?
rescue URI::InvalidURIError
  raise Error, "Invalid URL format"
end
81
+
82
# Downloads +url+ and turns the response into page data: the URL, a title,
# the body converted to markdown, and a UTC fetch timestamp.
#
# @param url [String] URL to download
# @return [Hash] with +:url+, +:title+, +:content+ and +:last_fetched+
# @raise [Error] when the response is not successful
def fetch_and_process_url(url) # rubocop:disable Metrics/MethodLength
  Aircana.human_logger.info("Fetching #{url}")

  http_response = self.class.get(url)
  raise Error, "Failed to fetch URL (#{http_response.code})" unless http_response.success?

  raw_title = extract_title(http_response.body)
  markdown = convert_to_markdown(http_response.body)
  page_title = generate_meaningful_title(raw_title, markdown, url)
  fetched_at = Time.now.utc.strftime("%Y-%m-%dT%H:%M:%SZ")

  { url: url, title: page_title, content: markdown, last_fetched: fetched_at }
end
100
+
101
# Extracts the text of the first <title> element from +html+.
#
# Common named entities (and &#39;) are decoded in a single pass, runs of
# whitespace (including line breaks) are collapsed to one space, and an
# empty title is reported as nil so callers fall back to another title
# source instead of using an empty string.
#
# @param html [String] raw HTML document
# @return [String, nil] cleaned title, or nil when absent or blank
def extract_title(html) # rubocop:disable Metrics/MethodLength
  title_match = html.match(%r{<title[^>]*>(.*?)</title>}im)
  return nil unless title_match

  known_entities = {
    "&amp;" => "&",
    "&lt;" => "<",
    "&gt;" => ">",
    "&quot;" => '"',
    "&#39;" => "'",
    "&apos;" => "'",
    "&nbsp;" => " "
  }

  # Unknown entities are left exactly as written.
  decoded = title_match[1].gsub(/&(?:[a-zA-Z]+|#\d+);/) { |entity| known_entities.fetch(entity, entity) }
  title = decoded.gsub(/\s+/, " ").strip
  title.empty? ? nil : title
end
118
+
119
# Derives a fallback title from +url+: the last non-empty path segment
# with dashes and underscores turned into spaces and each word
# capitalized, or the host when the path has no segments.
#
# @param url [String] page URL
# @return [String, nil] derived title
def extract_title_from_url(url)
  parsed = URI.parse(url)
  slug = parsed.path.split("/").reject(&:empty?).last
  return parsed.host unless slug

  slug.tr("-_", " ").split.map(&:capitalize).join(" ")
end
129
+
130
# Chooses the title to store for a fetched page.
#
# A descriptive HTML title (longer than 10 characters and not generic) is
# used as is. Otherwise, for content of at least 50 characters, Claude is
# asked for a title. The fallback, used for short content, a failed Claude
# call, or a blank Claude reply, is the HTML title if present, else a
# title derived from the URL. A blank HTML title counts as absent.
#
# @param html_title [String, nil] title taken from the page's <title>
# @param content [String] page content as markdown
# @param url [String] page URL
# @return [String] chosen title
def generate_meaningful_title(html_title, content, url) # rubocop:disable Metrics/CyclomaticComplexity
  # An empty string is truthy and would otherwise win over the URL fallback.
  html_title = nil if html_title.to_s.strip.empty?
  return html_title if html_title && html_title.length > 10 && !generic_title?(html_title)

  # Lazy so the URL is only parsed when the fallback is actually needed.
  fallback = -> { html_title || extract_title_from_url(url) }
  return fallback.call if content.length < 50

  begin
    generated = generate_title_with_claude(content, url)
    generated.to_s.strip.empty? ? fallback.call : generated
  rescue StandardError => e
    Aircana.human_logger.warn("Failed to generate title with Claude: #{e.message}")
    fallback.call
  end
end
145
+
146
# Tells whether +title+ looks too generic or truncated to be useful:
# placeholder words, blank text, anything containing "...", or a
# " - site - 123" style suffix.
#
# @param title [String] candidate title
# @return [Boolean]
def generic_title?(title)
  uninformative = Regexp.union(
    /^(home|index|welcome|untitled|document)$/i,
    /^(page|default)$/i,
    /^\s*$/,
    # Truncated titles (contain ellipsis)
    /\.\.\./,
    # Titles with excessive metadata (site names, IDs, etc.)
    / - .+ - \d+$/,
    # Question titles that are truncated
    /^how do i .+\.\.\./i,
    /^what is .+\.\.\./i
  )

  uninformative.match?(title)
end
162
+
163
# Builds the title prompt for +content+ and +url+, sends it through a new
# LLM::ClaudeClient, and returns the reply with surrounding whitespace
# removed.
def generate_title_with_claude(content, url)
  request = build_title_generation_prompt(content, url)
  LLM::ClaudeClient.new.prompt(request).strip
end
168
+
169
# Builds the prompt asking for a short, descriptive title for a page.
#
# Content longer than 1000 characters is cut to exactly its first 1000
# characters followed by "..." so the prompt stays short.
#
# @param content [String] page content as markdown
# @param url [String] page URL, quoted in the prompt
# @return [String] prompt text
def build_title_generation_prompt(content, url) # rubocop:disable Metrics/MethodLength
  max_chars = 1000
  # Start/length slicing keeps exactly max_chars characters (0..1000 kept 1001).
  truncated_content = content.length > max_chars ? "#{content[0, max_chars]}..." : content

  <<~PROMPT
    Based on the following web page content from #{url}, generate a concise, descriptive title
    that would help an AI agent understand what this document contains and when it would be useful.

    The title should be:
    - 3-8 words long
    - Focused on the main topic or purpose
    - Helpful for knowledge retrieval
    - Professional and clear

    Content:
    #{truncated_content}

    Respond with only the title, no additional text or explanation.
  PROMPT
end
189
+
190
# Converts an HTML document to GitHub-flavored markdown after reducing it
# to its main content. nil or empty input yields an empty string. If the
# conversion raises, a warning is logged and plain text extracted from the
# original HTML is returned instead.
#
# @param html [String, nil] raw HTML
# @return [String] markdown, or plain text on conversion failure
def convert_to_markdown(html)
  return "" unless html && !html.empty?

  # Strip page chrome first so only the main content is converted.
  ReverseMarkdown.convert(extract_main_content(html), github_flavored: true)
rescue StandardError => e
  Aircana.human_logger.warn "Failed to convert HTML to markdown: #{e.message}"
  # Fallback to plain text extraction
  extract_text_content(html)
end
202
+
203
# Hands the page's title and content to the local storage for +agent+ and
# returns whatever store_content returns.
#
# @param page_data [Hash] page data with +:title+ and +:content+
# @param agent [String] agent whose knowledge base receives the page
def store_page_as_markdown(page_data, agent)
  page_title, page_content = page_data.values_at(:title, :content)
  @local_storage.store_content(title: page_title, content: page_content, agent: agent)
end
210
+
211
# Builds the manifest entry for a fetched page: the url, title and
# last_fetched values of +page_data+ under string keys.
#
# @param page_data [Hash] page data keyed by symbols
# @return [Hash{String => Object}]
def build_url_metadata(page_data)
  %i[url title last_fetched].each_with_object({}) do |field, entry|
    entry[field.to_s] = page_data[field]
  end
end
218
+
219
# Wraps the fetched page entries in a single manifest source of type
# "web". The first argument is accepted but not used.
#
# @param _urls [Array<String>] unused
# @param pages_metadata [Array<Hash>] per-URL manifest entries
# @return [Array<Hash>] one-element list holding the web source
def build_sources_metadata(_urls, pages_metadata)
  web_source = { "type" => "web", "urls" => pages_metadata }
  [web_source]
end
227
+
228
# Writes +new_sources+ into the agent's manifest, creating the manifest
# when it does not exist yet.
#
# Non-web sources already in the manifest are kept. Web URL entries are
# merged by URL: an entry from +new_sources+ replaces the recorded entry
# for the same URL, and recorded URLs that are not part of +new_sources+
# stay in the manifest instead of being discarded.
#
# @param agent [String] agent whose manifest is written
# @param new_sources [Array<Hash>] sources produced by the current fetch
def update_or_create_manifest(agent, new_sources) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
  existing_sources = Manifest.sources_from_manifest(agent)
  existing_web, other_sources = existing_sources.partition { |s| s["type"] == "web" }
  new_web, new_other = new_sources.partition { |s| s["type"] == "web" }

  # Hash insertion order keeps recorded URLs in place; new entries overwrite by URL.
  entries_by_url = {}
  (existing_web + new_web).each do |source|
    Array(source["urls"]).each { |entry| entries_by_url[entry["url"]] = entry }
  end

  all_sources = other_sources + new_other
  all_sources << { "type" => "web", "urls" => entries_by_url.values } unless entries_by_url.empty?

  if Manifest.manifest_exists?(agent)
    Manifest.update_manifest(agent, all_sources)
  else
    Manifest.create_manifest(agent, all_sources)
  end
end
241
+
242
# Logs a fetch failure for +url+ at error level, wording the message by
# error class: invalid URI, HTTParty error, Aircana error, or anything
# else ("Unexpected error").
#
# @param url [String] URL whose fetch failed
# @param error [StandardError] the failure
def handle_fetch_error(url, error) # rubocop:disable Metrics/MethodLength
  message =
    if error.is_a?(URI::InvalidURIError)
      "Invalid URL format: #{url}"
    elsif error.is_a?(HTTParty::Error)
      "HTTP error fetching #{url}: #{error.message}"
    elsif error.is_a?(Error)
      "Error fetching #{url}: #{error.message}"
    else
      "Unexpected error fetching #{url}: #{error.message}"
    end

  Aircana.human_logger.error message
end
254
+
255
# Picks the part of +html+ most likely to hold the real content and cleans
# it. Patterns are tried in order (main, article, content/post/docs divs,
# then body); the first capture longer than 100 characters after
# stripping wins. When nothing qualifies, the full HTML is cleaned.
#
# @param html [String] raw HTML document
# @return [String] cleaned HTML fragment
def extract_main_content(html) # rubocop:disable Metrics/MethodLength
  candidate_patterns = [
    # Common main content selectors
    %r{<main[^>]*>(.*?)</main>}mi,
    %r{<article[^>]*>(.*?)</article>}mi,
    %r{<div[^>]*class="[^"]*content[^"]*"[^>]*>(.*?)</div>}mi,
    %r{<div[^>]*id="content"[^>]*>(.*?)</div>}mi,
    %r{<div[^>]*class="[^"]*post[^"]*"[^>]*>(.*?)</div>}mi,
    # Documentation specific
    %r{<div[^>]*class="[^"]*docs[^"]*"[^>]*>(.*?)</div>}mi,
    %r{<div[^>]*class="[^"]*documentation[^"]*"[^>]*>(.*?)</div>}mi,
    # Body content as fallback
    %r{<body[^>]*>(.*?)</body>}mi
  ]

  candidate_patterns.each do |pattern|
    inner = html[pattern, 1]
    # Short captures are treated as not meaningful and skipped.
    return clean_html_content(inner) if inner && inner.strip.length > 100
  end

  clean_html_content(html)
end
286
+
287
# Removes markup that is not page content: script and style elements,
# navigation-like elements (matched by tag name, class or id), and divs
# whose class marks them as comments, social/share widgets, ads, popups
# or modals. Finally collapses runs of blank lines and strips the result.
#
# The matching is regex based and non-greedy, so removal stops at the
# first closing tag after the opening one.
#
# @param html [String] HTML fragment
# @return [String] cleaned fragment
def clean_html_content(html) # rubocop:disable Metrics/MethodLength
  # Remove script and style tags completely
  cleaned = html.gsub(%r{<script[^>]*>.*?</script>}mi, "")
  cleaned = cleaned.gsub(%r{<style[^>]*>.*?</style>}mi, "")

  # Remove navigation, header, footer, sidebar elements
  navigation_selectors = %w[nav header footer aside sidebar menu breadcrumb]
  navigation_selectors.each do |selector|
    # Remove by tag name
    cleaned = cleaned.gsub(%r{<#{selector}[^>]*>.*?</#{selector}>}mi, "")
    # Remove by class name (common patterns)
    cleaned = cleaned.gsub(%r{<[^>]+class="[^"]*#{selector}[^"]*"[^>]*>.*?</[^>]+>}mi, "")
    cleaned = cleaned.gsub(%r{<[^>]+id="#{selector}"[^>]*>.*?</[^>]+>}mi, "")
  end

  # Remove common non-content elements
  unwanted_patterns = [
    %r{<div[^>]*class="[^"]*comment[^"]*"[^>]*>.*?</div>}mi,
    %r{<div[^>]*class="[^"]*social[^"]*"[^>]*>.*?</div>}mi,
    %r{<div[^>]*class="[^"]*share[^"]*"[^>]*>.*?</div>}mi,
    # "ad" must stand alone as a word part (ad, ads, ad-banner, top-ad,
    # adsense, advert...); a bare substring match would also delete
    # classes such as "readme", "loading" or "heading".
    %r{<div[^>]*class="[^"]*(?<![a-z])ad(?:s|sense|vert[a-z]*)?(?![a-z])[^"]*"[^>]*>.*?</div>}mi,
    %r{<div[^>]*class="[^"]*advertisement[^"]*"[^>]*>.*?</div>}mi,
    %r{<div[^>]*class="[^"]*popup[^"]*"[^>]*>.*?</div>}mi,
    %r{<div[^>]*class="[^"]*modal[^"]*"[^>]*>.*?</div>}mi
  ]

  unwanted_patterns.each do |pattern|
    cleaned = cleaned.gsub(pattern, "")
  end

  # Clean up whitespace
  cleaned.gsub(/\n\s*\n\s*\n+/, "\n\n").strip
end
322
+
323
# Plain-text fallback used when markdown conversion fails: drops script
# and style elements, strips all remaining tags, decodes a few common
# entities and collapses whitespace.
#
# Entities are decoded in a single pass so that already-escaped text such
# as "&amp;lt;" becomes "&lt;" rather than being decoded twice into "<".
#
# @param html [String] raw HTML
# @return [String] extracted text, or a fixed placeholder sentence when
#   fewer than 20 characters remain
def extract_text_content(html) # rubocop:disable Metrics/MethodLength
  entity_map = {
    "&nbsp;" => " ",
    "&amp;" => "&",
    "&lt;" => "<",
    "&gt;" => ">",
    "&quot;" => '"'
  }

  text = html.gsub(%r{<script[^>]*>.*?</script>}mi, "")
             .gsub(%r{<style[^>]*>.*?</style>}mi, "")
             .gsub(/<[^>]+>/, "")
             .gsub(/&(?:nbsp|amp|lt|gt|quot);/, entity_map)
             .gsub(/\s+/, " ")
             .strip

  # If the extracted text is very short, it might not be useful
  text.length < 20 ? "Content could not be extracted from this page." : text
end
339
+ end
340
+ end
341
+ end