aircana 1.1.0.rc1 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rspec_status +188 -160
- data/CHANGELOG.md +7 -0
- data/CLAUDE.md +18 -6
- data/README.md +47 -6
- data/lib/aircana/cli/app.rb +5 -2
- data/lib/aircana/cli/commands/agents.rb +118 -6
- data/lib/aircana/cli/commands/doctor.rb +11 -5
- data/lib/aircana/cli/commands/doctor_checks.rb +48 -0
- data/lib/aircana/cli/commands/install.rb +4 -5
- data/lib/aircana/contexts/confluence.rb +0 -1
- data/lib/aircana/contexts/manifest.rb +20 -0
- data/lib/aircana/contexts/web.rb +341 -0
- data/lib/aircana/generators/agents_generator.rb +1 -1
- data/lib/aircana/generators/helpers.rb +1 -2
- data/lib/aircana/generators/hooks_generator.rb +2 -0
- data/lib/aircana/generators/relevant_files_command_generator.rb +1 -1
- data/lib/aircana/system_checker.rb +11 -0
- data/lib/aircana/templates/agents/base_agent.erb +4 -3
- data/lib/aircana/templates/hooks/notification_sqs.erb +75 -0
- data/lib/aircana/templates/hooks/pre_tool_use.erb +32 -7
- data/lib/aircana/version.rb +1 -1
- data/spec_output_1758908468_248/commands/air-add-relevant-files.md +1 -0
- data/spec_output_1758908479_36/commands/air-add-relevant-files.md +1 -0
- data/spec_output_1758908547_132/commands/air-add-relevant-files.md +1 -0
- data/spec_output_1758908553_721/commands/air-add-relevant-files.md +1 -0
- data/spec_output_1758917010_960/commands/air-add-relevant-files.md +1 -0
- data/spec_output_1758917064_555/commands/air-add-relevant-files.md +1 -0
- metadata +11 -7
- data/lib/aircana/cli/commands/plan.rb +0 -69
- data/lib/aircana/cli/commands/work.rb +0 -69
- data/lib/aircana/templates/agents/defaults/planner.erb +0 -126
- data/lib/aircana/templates/agents/defaults/worker.erb +0 -185
@@ -4,6 +4,7 @@ require "json"
|
|
4
4
|
require "tty-prompt"
|
5
5
|
require_relative "../../generators/agents_generator"
|
6
6
|
require_relative "../../contexts/manifest"
|
7
|
+
require_relative "../../contexts/web"
|
7
8
|
|
8
9
|
module Aircana
|
9
10
|
module CLI
|
@@ -43,6 +44,9 @@ module Aircana
|
|
43
44
|
# Prompt for knowledge fetching
|
44
45
|
prompt_for_knowledge_fetch(prompt, normalized_agent_name)
|
45
46
|
|
47
|
+
# Prompt for web URL fetching
|
48
|
+
prompt_for_url_fetch(prompt, normalized_agent_name)
|
49
|
+
|
46
50
|
# Prompt for agent file review
|
47
51
|
prompt_for_agent_review(prompt, file)
|
48
52
|
|
@@ -59,6 +63,44 @@ module Aircana
|
|
59
63
|
print_agents_list(agent_folders)
|
60
64
|
end
|
61
65
|
|
66
|
+
# Fetches a single web page for an existing agent and records it in the agent's manifest.
#
# agent - agent name as typed by the user (normalized before use)
# url   - page to fetch
#
# Exits the process with status 1 when the agent does not exist, when the fetch
# yields nothing, or when an Aircana::Error is raised.
def add_url(agent, url) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
  normalized_agent = normalize_string(agent)

  unless agent_exists?(normalized_agent)
    Aircana.human_logger.error "Agent '#{agent}' not found. Use 'aircana agents list' to see available agents."
    exit 1
  end

  url_metadata = Aircana::Contexts::Web.new.fetch_url_for(agent: normalized_agent, url: url)

  unless url_metadata
    Aircana.human_logger.error "Failed to fetch URL: #{url}"
    exit 1
  end

  existing_sources = Aircana::Contexts::Manifest.sources_from_manifest(normalized_agent)
  web_sources, other_sources = existing_sources.partition { |source| source["type"] == "web" }

  if web_sources.any?
    # Tolerate a web source without a "urls" list, and replace any earlier entry for
    # the same URL so re-adding a page refreshes it instead of duplicating it
    urls = (web_sources.first["urls"] ||= [])
    urls.reject! { |entry| entry.is_a?(Hash) && entry["url"] == url_metadata["url"] }
    urls << url_metadata
  else
    web_sources = [{ "type" => "web", "urls" => [url_metadata] }]
  end

  Aircana::Contexts::Manifest.update_manifest(normalized_agent, other_sources + web_sources)

  Aircana.human_logger.success "Successfully added URL to agent '#{agent}'"
rescue Aircana::Error => e
  Aircana.human_logger.error "Failed to add URL: #{e.message}"
  exit 1
end
|
103
|
+
|
62
104
|
private
|
63
105
|
|
64
106
|
def perform_refresh(normalized_agent)
|
@@ -77,20 +119,38 @@ module Aircana
|
|
77
119
|
end
|
78
120
|
end
|
79
121
|
|
80
|
-
# Refreshes an agent's knowledge and rewrites its manifest with the combined sources.
#
# With a manifest: refreshes Confluence sources and web sources.
# Without one: falls back to the label-based Confluence search.
#
# Returns { pages_count: Integer, sources: Array }.
def perform_manifest_aware_refresh(normalized_agent) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
  total_pages = 0
  all_sources = []

  # Try manifest-based refresh first
  if Aircana::Contexts::Manifest.manifest_exists?(normalized_agent)
    Aircana.human_logger.info "Refreshing from knowledge manifest..."

    # Snapshot taken before refreshing: when a refresher yields no sources this run
    # (e.g. every fetch failed) its previous entries are carried over, otherwise the
    # manifest rewrite below would silently drop that source type.
    previous_sources = Aircana::Contexts::Manifest.sources_from_manifest(normalized_agent)
    keep_or_refresh = lambda do |refreshed, type|
      refreshed.empty? ? previous_sources.select { |source| source["type"] == type } : refreshed
    end

    # Refresh Confluence sources
    confluence_result = Aircana::Contexts::Confluence.new.refresh_from_manifest(agent: normalized_agent)
    total_pages += confluence_result[:pages_count]
    all_sources.concat(keep_or_refresh.call(confluence_result[:sources], "confluence"))

    # Refresh web sources
    web_result = Aircana::Contexts::Web.new.refresh_web_sources(agent: normalized_agent)
    total_pages += web_result[:pages_count]
    all_sources.concat(keep_or_refresh.call(web_result[:sources], "web"))
  else
    Aircana.human_logger.info "No manifest found, falling back to label-based search..."
    confluence_result = Aircana::Contexts::Confluence.new.fetch_pages_for(agent: normalized_agent)
    total_pages += confluence_result[:pages_count]
    all_sources.concat(confluence_result[:sources])
  end

  # Update manifest with all sources combined
  Aircana::Contexts::Manifest.update_manifest(normalized_agent, all_sources) if all_sources.any?

  log_refresh_result(normalized_agent, total_pages)
  { pages_count: total_pages, sources: all_sources }
end
|
95
155
|
|
96
156
|
def show_gitignore_recommendation
|
@@ -133,6 +193,9 @@ module Aircana
|
|
133
193
|
within its domain.
|
134
194
|
|
135
195
|
Print the output to STDOUT only, without any additional commentary.
|
196
|
+
|
197
|
+
The description should be 2-3 sentences. Most of the agent's context comes from
|
198
|
+
its knowledge base
|
136
199
|
PROMPT
|
137
200
|
end
|
138
201
|
|
@@ -153,6 +216,43 @@ module Aircana
|
|
153
216
|
Aircana.human_logger.info "You can try again later with 'aircana agents refresh #{normalized_agent_name}'"
|
154
217
|
end
|
155
218
|
|
219
|
+
# Interactively offers to add web URLs to a freshly created agent's knowledge base.
#
# prompt                - object responding to yes?(question) and ask(question)
# normalized_agent_name - agent the fetched pages are stored under
#
# Fetch failures raised as Aircana::Error are reported as warnings, not propagated.
def prompt_for_url_fetch(prompt, normalized_agent_name) # rubocop:disable Metrics/MethodLength
  return unless prompt.yes?("Would you like to add web URLs for this agent's knowledge base?")

  urls = collect_urls_from_prompt(prompt)
  return if urls.empty?

  begin
    Aircana.human_logger.info "Fetching #{urls.size} URL(s)..."
    result = Aircana::Contexts::Web.new.fetch_urls_for(agent: normalized_agent_name, urls: urls)

    if result[:pages_count].positive?
      Aircana.human_logger.success "Successfully fetched #{result[:pages_count]} URL(s)"
      show_gitignore_recommendation
    else
      Aircana.human_logger.warn "No URLs were successfully fetched"
    end
  rescue Aircana::Error => e
    Aircana.human_logger.warn "Failed to fetch URLs: #{e.message}"
    Aircana.human_logger.info(
      "You can add URLs later with 'aircana agents add-url #{normalized_agent_name} <URL>'"
    )
  end
end

# Asks for URLs until the user submits an empty answer; returns the valid, distinct
# URLs in the order they were entered.
def collect_urls_from_prompt(prompt)
  urls = []
  loop do
    url = prompt.ask("Enter URL (or press Enter to finish):")
    break if url.nil? || url.strip.empty?

    url = url.strip
    if valid_url?(url)
      # Entering the same URL twice would otherwise fetch and record it twice
      urls << url unless urls.include?(url)
    else
      Aircana.human_logger.warn "Invalid URL format: #{url}. Please enter a valid HTTP or HTTPS URL."
    end
  end
  urls
end
|
255
|
+
|
156
256
|
def prompt_for_agent_review(prompt, file_path)
|
157
257
|
Aircana.human_logger.info "Agent file created at: #{file_path}"
|
158
258
|
|
@@ -214,6 +314,18 @@ module Aircana
|
|
214
314
|
config["description"] || "No description available"
|
215
315
|
end
|
216
316
|
|
317
|
+
# True when a knowledge directory exists for the given agent name.
#
# A nil or blank name is never an existing agent: joined onto the knowledge root it
# would resolve to the root itself (which exists), and nil would raise in File.join.
def agent_exists?(agent_name)
  return false if agent_name.nil? || agent_name.to_s.strip.empty?

  Dir.exist?(File.join(Aircana.configuration.agent_knowledge_dir, agent_name.to_s))
end
|
321
|
+
|
322
|
+
# True when url parses as an http(s) URI with a non-empty host.
# Unparseable input yields false rather than raising.
def valid_url?(url)
  uri = URI.parse(url)
  # host can be nil or an empty string depending on the input; reject both
  %w[http https].include?(uri.scheme) && !uri.host.to_s.empty?
rescue URI::InvalidURIError
  false
end
|
328
|
+
|
217
329
|
# Returns the first editor command from the candidate list that `which` can locate,
# or nil when none is found.
def find_available_editor
  candidates = %w[code subl atom nano vim vi]
  candidates.find do |editor|
    system("which #{editor} > /dev/null 2>&1")
  end
end
|
@@ -15,6 +15,7 @@ module Aircana
|
|
15
15
|
include DoctorChecks::ClaudeIntegration
|
16
16
|
include DoctorChecks::AircanaConfiguration
|
17
17
|
include DoctorChecks::OptionalIntegrations
|
18
|
+
include DoctorChecks::SQSIntegration
|
18
19
|
|
19
20
|
def run(verbose: false)
|
20
21
|
@verbose = verbose
|
@@ -22,11 +23,7 @@ module Aircana
|
|
22
23
|
|
23
24
|
Aircana.human_logger.info "🔍 Checking Aircana system health...\n"
|
24
25
|
|
25
|
-
|
26
|
-
check_claude_integration
|
27
|
-
check_optional_dependencies
|
28
|
-
check_aircana_configuration
|
29
|
-
check_optional_integrations
|
26
|
+
run_all_checks
|
30
27
|
|
31
28
|
display_summary
|
32
29
|
@issues_found ? 1 : 0
|
@@ -34,6 +31,15 @@ module Aircana
|
|
34
31
|
|
35
32
|
private
|
36
33
|
|
34
|
+
# Runs every doctor check in display order; returns the result of the last one.
def run_all_checks
  %i[
    check_required_dependencies
    check_claude_integration
    check_optional_dependencies
    check_aircana_configuration
    check_optional_integrations
    check_sqs_integration
  ].map { |check| send(check) }.last
end
|
42
|
+
|
37
43
|
def check_required_dependencies
|
38
44
|
Aircana.human_logger.info "Required Dependencies:"
|
39
45
|
|
@@ -126,6 +126,54 @@ module Aircana
|
|
126
126
|
end
|
127
127
|
end
|
128
128
|
end
|
129
|
+
|
130
|
+
# Doctor checks for SQS notifications: the command-line tools involved and the
# environment variables that configure the queue.
#
# Relies on the including class for check_command, log_success, log_info,
# log_remedy and the @verbose flag.
module SQSIntegration
  def check_sqs_integration
    Aircana.human_logger.info "\nSQS Integration:"

    check_sqs_dependencies
    check_sqs_configuration
  end

  def check_sqs_dependencies
    check_command("aws", "SQS operations", required: false)
    check_command("jq", "JSON processing for notifications", required: false)
  end

  # Reports whether the SQS environment variables are set; in verbose mode also
  # prints (truncated) values.
  def check_sqs_configuration
    sqs_queue_url = ENV.fetch("AIRCANA_SQS_QUEUE_URL", nil)
    sqs_message_template = ENV.fetch("AIRCANA_SQS_MESSAGE_TEMPLATE", nil)
    aws_region = ENV.fetch("AWS_REGION", "us-east-1")

    if sqs_configured?(sqs_queue_url, sqs_message_template)
      log_success("SQS Config", "Environment variables configured")
      log_sqs_config_details(sqs_queue_url, sqs_message_template, aws_region) if @verbose
    else
      log_info("SQS Config", "Not configured")
      log_sqs_configuration_remedy
    end
  end

  # Both values must be present and contain more than whitespace.
  def sqs_configured?(queue_url, message_template)
    [queue_url, message_template].none? { |value| value.nil? || value.strip.empty? }
  end

  def log_sqs_config_details(queue_url, message_template, aws_region)
    log_info(" AIRCANA_SQS_QUEUE_URL", truncate_sqs_value(queue_url, 50))
    log_info(" AIRCANA_SQS_MESSAGE_TEMPLATE", truncate_sqs_value(message_template, 40))
    log_info(" AWS_REGION", aws_region)
  end

  def log_sqs_configuration_remedy
    log_remedy("Set AIRCANA_SQS_QUEUE_URL and AIRCANA_SQS_MESSAGE_TEMPLATE for SQS notifications")
    log_remedy("Example:")
    log_remedy(' export AIRCANA_SQS_QUEUE_URL="https://sqs.us-east-1.amazonaws.com/account/queue"')
    # The example template must itself be valid JSON (closing quote before the brace)
    log_remedy(' export AIRCANA_SQS_MESSAGE_TEMPLATE=\'{"text":"{{message}}"}\'')
    log_remedy(' export AWS_REGION="us-east-1" # Optional, defaults to us-east-1')
  end

  # Values longer than limit are cut to (limit - 2) characters followed by "...".
  def truncate_sqs_value(value, limit)
    value.length > limit ? "#{value[0, limit - 2]}..." : value
  end
end
|
129
177
|
end
|
130
178
|
end
|
131
179
|
end
|
@@ -9,7 +9,7 @@ module Aircana
|
|
9
9
|
module Install
|
10
10
|
class << self
|
11
11
|
def run
|
12
|
-
|
12
|
+
generate_files
|
13
13
|
ensure_project_config_exists
|
14
14
|
install_commands_to_claude
|
15
15
|
install_hooks_to_claude
|
@@ -17,10 +17,8 @@ module Aircana
|
|
17
17
|
|
18
18
|
private
|
19
19
|
|
20
|
-
# Announces and runs file generation; invoked as the first step of the install run.
def generate_files
  Aircana.human_logger.info "Generating files before installation..."
  Generate.run
end
|
26
24
|
|
@@ -118,6 +116,7 @@ module Aircana
|
|
118
116
|
"post_tool_use" => { event: "PostToolUse", matcher: nil },
|
119
117
|
"user_prompt_submit" => { event: "UserPromptSubmit", matcher: nil },
|
120
118
|
"session_start" => { event: "SessionStart", matcher: nil },
|
119
|
+
"notification_sqs" => { event: "Notification", matcher: nil },
|
121
120
|
"rubocop_pre_commit" => { event: "PreToolUse", matcher: "Bash" },
|
122
121
|
"rspec_test" => { event: "PostToolUse", matcher: "Bash" },
|
123
122
|
"bundle_install" => { event: "PostToolUse", matcher: "Bash" }
|
@@ -56,7 +56,6 @@ module Aircana
|
|
56
56
|
return { pages_count: 0, sources: [] } if all_pages.empty?
|
57
57
|
|
58
58
|
updated_sources = process_pages_with_manifest(all_pages, agent)
|
59
|
-
Manifest.update_manifest(agent, updated_sources)
|
60
59
|
|
61
60
|
{ pages_count: all_pages.size, sources: updated_sources }
|
62
61
|
end
|
@@ -128,6 +128,8 @@ module Aircana
|
|
128
128
|
case source["type"]
|
129
129
|
when "confluence"
|
130
130
|
validate_confluence_source(source)
|
131
|
+
when "web"
|
132
|
+
validate_web_source(source)
|
131
133
|
else
|
132
134
|
raise ManifestError, "Unknown source type: #{source["type"]}"
|
133
135
|
end
|
@@ -140,6 +142,24 @@ module Aircana
|
|
140
142
|
|
141
143
|
raise ManifestError, "Confluence pages must be an array"
|
142
144
|
end
|
145
|
+
|
146
|
+
# Validates a manifest source of type "web": it must carry a "urls" array, and
# every element is checked by validate_web_url_entry.
# Raises ManifestError on the first violation.
def validate_web_source(source)
  raise ManifestError, "Web source missing required field: urls" unless source.key?("urls")

  url_entries = source["urls"]
  raise ManifestError, "Web urls must be an array" unless url_entries.is_a?(Array)

  url_entries.each { |entry| validate_web_url_entry(entry) }
end
|
155
|
+
|
156
|
+
# Validates one element of a web source's "urls" list: a Hash with "url" and "title" keys.
# Raises ManifestError naming the first missing field; returns nil otherwise.
def validate_web_url_entry(url_entry)
  raise ManifestError, "Each URL entry must be a hash" unless url_entry.is_a?(Hash)

  %w[url title].each do |field|
    raise ManifestError, "URL entry missing required field: #{field}" unless url_entry.key?(field)
  end

  nil
end
|
143
163
|
end
|
144
164
|
end
|
145
165
|
|
@@ -0,0 +1,341 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "httparty"
|
4
|
+
require "reverse_markdown"
|
5
|
+
require "uri"
|
6
|
+
require_relative "local"
|
7
|
+
require_relative "manifest"
|
8
|
+
require_relative "../progress_tracker"
|
9
|
+
require_relative "../version"
|
10
|
+
require_relative "../llm/claude_client"
|
11
|
+
|
12
|
+
module Aircana
|
13
|
+
module Contexts
|
14
|
+
class Web # rubocop:disable Metrics/ClassLength
|
15
|
+
include HTTParty
|
16
|
+
|
17
|
+
headers "User-Agent" => "Aircana/#{Aircana::VERSION} (+https://github.com/westonkd/aircana)"
|
18
|
+
default_timeout 30
|
19
|
+
follow_redirects true
|
20
|
+
|
21
|
+
# Sets up the Local store that fetched pages are written through.
def initialize
  @local_storage = Local.new
end
|
24
|
+
|
25
|
+
# Fetches one URL, stores it as markdown for the agent and returns its manifest
# metadata hash. Any StandardError is reported through handle_fetch_error and
# turned into a nil return.
def fetch_url_for(agent:, url:)
  validate_url!(url)

  fetch_and_process_url(url).then do |page|
    store_page_as_markdown(page, agent)
    build_url_metadata(page)
  end
rescue StandardError => error
  handle_fetch_error(url, error)
  nil
end
|
36
|
+
|
37
|
+
# Fetches several URLs for an agent and records the successful ones in the manifest.
#
# Returns { pages_count: Integer, sources: Array }; both are empty/zero when nothing
# was fetched, in which case the manifest is left untouched.
def fetch_urls_for(agent:, urls:) # rubocop:disable Metrics/MethodLength
  # nil entries and repeats would only produce failed or duplicate fetches
  unique_urls = Array(urls).compact.uniq
  return { pages_count: 0, sources: [] } if unique_urls.empty?

  # url => metadata, in fetch order; only successful fetches are recorded
  fetched = {}
  ProgressTracker.with_batch_progress(unique_urls, "Fetching URLs") do |url, _index|
    metadata = fetch_url_for(agent: agent, url: url)
    fetched[url] = metadata if metadata
  end

  return { pages_count: 0, sources: [] } if fetched.empty?

  sources = build_sources_metadata(fetched.keys, fetched.values)
  update_or_create_manifest(agent, sources)
  { pages_count: fetched.size, sources: sources }
end
|
59
|
+
|
60
|
+
# Re-fetches every URL listed under the agent's "web" manifest sources.
#
# Returns the fetch_urls_for result, or { pages_count: 0, sources: [] } when the
# manifest lists no usable web URLs.
def refresh_web_sources(agent:)
  web_sources = Manifest.sources_from_manifest(agent).select { |source| source["type"] == "web" }

  # Entries without a URL and URLs repeated across sources are dropped before fetching
  urls = web_sources.flat_map { |source| Array(source["urls"]).map { |entry| entry["url"] } }
                    .compact
                    .uniq
  return { pages_count: 0, sources: [] } if urls.empty?

  fetch_urls_for(agent: agent, urls: urls)
end
|
71
|
+
|
72
|
+
private
|
73
|
+
|
74
|
+
# Raises Error unless url is an http(s) URI with a non-empty host.
# Parse failures are reported as the same Error type so callers handle one class.
def validate_url!(url)
  uri = URI.parse(url)
  raise Error, "URL must use HTTP or HTTPS protocol" unless %w[http https].include?(uri.scheme)
  # host may be nil or "" (e.g. "http:///path"); neither is fetchable
  raise Error, "Invalid URL format" if uri.host.to_s.empty?
rescue URI::InvalidURIError
  raise Error, "Invalid URL format"
end
|
81
|
+
|
82
|
+
# Downloads url and turns the response into page data.
#
# Returns a Hash with :url, :title, :content (markdown) and :last_fetched
# (UTC timestamp string). Raises Error when the response is not successful.
def fetch_and_process_url(url) # rubocop:disable Metrics/MethodLength
  Aircana.human_logger.info("Fetching #{url}")

  response = self.class.get(url)

  raise Error, "Failed to fetch URL (#{response.code})" unless response.success?

  # A successful response may carry no body; treat it as an empty document
  body = response.body.to_s
  html_title = extract_title(body)
  content = convert_to_markdown(body)

  {
    url: url,
    title: generate_meaningful_title(html_title, content, url),
    content: content,
    last_fetched: Time.now.utc.strftime("%Y-%m-%dT%H:%M:%SZ")
  }
end
|
100
|
+
|
101
|
+
# Returns the stripped text of the first <title> element with the common HTML
# entities decoded, or nil when the document has no <title>.
# Entities outside the table below are left as written.
def extract_title(html)
  title_match = html.to_s.match(%r{<title[^>]*>(.*?)</title>}im)
  return nil unless title_match

  # Decode HTML entities
  entities = {
    "&amp;" => "&", "&lt;" => "<", "&gt;" => ">",
    "&quot;" => '"', "&#39;" => "'", "&apos;" => "'"
  }
  title_match[1].strip.gsub(/&([a-zA-Z]+|#\d+);/) { |entity| entities.fetch(entity, entity) }
end
|
118
|
+
|
119
|
+
# Derives a fallback title from the URL: the last path segment with dashes and
# underscores turned into spaces and each word capitalized, or the host when the
# path has no segments.
def extract_title_from_url(url)
  parsed = URI.parse(url)
  slug = parsed.path.split("/").reject(&:empty?).last
  return parsed.host unless slug

  slug.tr("-_", " ").split.map(&:capitalize).join(" ")
end
|
129
|
+
|
130
|
+
# Chooses the title stored for a fetched page.
#
# Order of preference: a descriptive HTML title (longer than 10 characters and not
# generic); for content under 50 characters the HTML title or a URL-derived one;
# otherwise a Claude-generated title, falling back to the HTML/URL title when
# generation fails or returns nothing.
def generate_meaningful_title(html_title, content, url) # rubocop:disable Metrics/CyclomaticComplexity
  # An empty <title> is truthy in Ruby; treat it as absent so the fallbacks apply
  html_title = nil if html_title.to_s.strip.empty?

  # If we have a good HTML title that's descriptive, use it
  return html_title if html_title && html_title.length > 10 && !generic_title?(html_title)

  fallback = -> { html_title || extract_title_from_url(url) }

  # If content is too short, use fallback
  return fallback.call if content.to_s.length < 50

  # Use Claude to generate a meaningful title based on content
  begin
    generated = generate_title_with_claude(content, url).to_s.strip
    generated.empty? ? fallback.call : generated
  rescue StandardError => e
    Aircana.human_logger.warn("Failed to generate title with Claude: #{e.message}")
    fallback.call
  end
end
|
145
|
+
|
146
|
+
# True when title is too generic to describe a page: a bare placeholder word,
# blank, truncated with an ellipsis, or ending in " - <site> - <digits>".
#
# Patterns are anchored with \A/\z (whole string) rather than ^/$ (any line), so a
# multi-line title is judged as a whole instead of line by line.
def generic_title?(title) # rubocop:disable Metrics/MethodLength
  generic_patterns = [
    /\A(home|index|welcome|untitled|document)\z/i,
    /\A(page|default)\z/i,
    /\A\s*\z/,
    # Truncated titles (contain ellipsis)
    /\.\.\./,
    # Titles with excessive metadata (site names, IDs, etc.)
    / - .+ - \d+\z/,
    # Question titles that are truncated
    /\Ahow do i .+\.\.\./i,
    /\Awhat is .+\.\.\./i
  ]

  generic_patterns.any? { |pattern| title.match?(pattern) }
end
|
162
|
+
|
163
|
+
# Sends the title-generation prompt to a new Claude client and returns the
# stripped response.
def generate_title_with_claude(content, url)
  LLM::ClaudeClient.new
                   .prompt(build_title_generation_prompt(content, url))
                   .strip
end
|
168
|
+
|
169
|
+
# Builds the prompt asking for a short descriptive title for a fetched page.
# Content beyond 1000 characters is cut off and marked with "...".
def build_title_generation_prompt(content, url) # rubocop:disable Metrics/MethodLength
  # Truncate content to avoid overly long prompts (exactly 1000 characters are kept)
  truncated_content = content.length > 1000 ? "#{content[0, 1000]}..." : content

  <<~PROMPT
    Based on the following web page content from #{url}, generate a concise, descriptive title
    that would help an AI agent understand what this document contains and when it would be useful.

    The title should be:
    - 3-8 words long
    - Focused on the main topic or purpose
    - Helpful for knowledge retrieval
    - Professional and clear

    Content:
    #{truncated_content}

    Respond with only the title, no additional text or explanation.
  PROMPT
end
|
189
|
+
|
190
|
+
# Converts an HTML document to GitHub-flavoured markdown after reducing it to its
# main content. nil or empty input gives ""; if conversion raises, a warning is
# logged and the plain-text extraction of the original HTML is returned instead.
def convert_to_markdown(html)
  return "" unless html && !html.empty?

  ReverseMarkdown.convert(extract_main_content(html), github_flavored: true)
rescue StandardError => error
  Aircana.human_logger.warn "Failed to convert HTML to markdown: #{error.message}"
  extract_text_content(html)
end
|
202
|
+
|
203
|
+
# Hands the page's title and content to the local store under the given agent.
def store_page_as_markdown(page_data, agent)
  title, content = page_data.values_at(:title, :content)
  @local_storage.store_content(title: title, content: content, agent: agent)
end
|
210
|
+
|
211
|
+
# Builds the string-keyed manifest entry ("url", "title", "last_fetched") for a page.
def build_url_metadata(page_data)
  %i[url title last_fetched].each_with_object({}) do |field, metadata|
    metadata[field.to_s] = page_data[field]
  end
end
|
218
|
+
|
219
|
+
# Wraps the per-page metadata in a single "web" manifest source.
# The first argument is accepted but not used.
def build_sources_metadata(_urls, pages_metadata)
  web_source = { "type" => "web", "urls" => pages_metadata }
  [web_source]
end
|
227
|
+
|
228
|
+
# Writes the agent's manifest with its existing non-web sources followed by
# new_sources; every previous "web" source is replaced. Updates the manifest when
# one exists, creates it otherwise.
def update_or_create_manifest(agent, new_sources)
  non_web_sources = Manifest.sources_from_manifest(agent).reject { |source| source["type"] == "web" }
  merged_sources = non_web_sources + new_sources

  writer = Manifest.manifest_exists?(agent) ? :update_manifest : :create_manifest
  Manifest.public_send(writer, agent, merged_sources)
end
|
241
|
+
|
242
|
+
# Logs a fetch failure with a message chosen by the error's class: invalid URI,
# HTTParty error, Aircana Error, or anything else.
def handle_fetch_error(url, error) # rubocop:disable Metrics/MethodLength
  detail =
    if error.is_a?(URI::InvalidURIError)
      "Invalid URL format: #{url}"
    elsif error.is_a?(HTTParty::Error)
      "HTTP error fetching #{url}: #{error.message}"
    elsif error.is_a?(Error)
      "Error fetching #{url}: #{error.message}"
    else
      "Unexpected error fetching #{url}: #{error.message}"
    end

  Aircana.human_logger.error detail
end
|
254
|
+
|
255
|
+
# Picks the most likely main-content region of an HTML document and returns it
# cleaned of non-content elements.
#
# Patterns are tried in order and the first capture with more than 100 characters
# (after stripping) wins; when none qualifies the whole document is cleaned.
# Each pattern is non-greedy, so a capture ends at the first matching closing tag.
def extract_main_content(html) # rubocop:disable Metrics/MethodLength
  content_patterns = [
    # Common main content selectors
    %r{<main[^>]*>(.*?)</main>}mi,
    %r{<article[^>]*>(.*?)</article>}mi,
    %r{<div[^>]*class="[^"]*content[^"]*"[^>]*>(.*?)</div>}mi,
    %r{<div[^>]*id="content"[^>]*>(.*?)</div>}mi,
    %r{<div[^>]*class="[^"]*post[^"]*"[^>]*>(.*?)</div>}mi,
    # Documentation specific
    %r{<div[^>]*class="[^"]*docs[^"]*"[^>]*>(.*?)</div>}mi,
    %r{<div[^>]*class="[^"]*documentation[^"]*"[^>]*>(.*?)</div>}mi,
    # Body content as fallback
    %r{<body[^>]*>(.*?)</body>}mi
  ]

  main_region = content_patterns.lazy
                                .map { |pattern| html[pattern, 1] }
                                .find { |inner| inner && inner.strip.length > 100 }

  clean_html_content(main_region || html)
end
|
286
|
+
|
287
|
+
# Strips non-content markup from an HTML fragment: scripts, styles, navigation-like
# elements (by tag name, class or id) and common comment/social/ad/popup containers,
# then collapses runs of blank lines. Returns the cleaned string.
def clean_html_content(html) # rubocop:disable Metrics/MethodLength
  # Remove script and style tags completely
  cleaned = html.gsub(%r{<script[^>]*>.*?</script>}mi, "")
                .gsub(%r{<style[^>]*>.*?</style>}mi, "")

  # Remove navigation, header, footer, sidebar elements
  %w[nav header footer aside sidebar menu breadcrumb].each do |selector|
    # Remove by tag name
    cleaned = cleaned.gsub(%r{<#{selector}[^>]*>.*?</#{selector}>}mi, "")
    # Remove by class name (common patterns)
    cleaned = cleaned.gsub(%r{<[^>]+class="[^"]*#{selector}[^"]*"[^>]*>.*?</[^>]+>}mi, "")
    cleaned = cleaned.gsub(%r{<[^>]+id="#{selector}"[^>]*>.*?</[^>]+>}mi, "")
  end

  # Remove common non-content elements
  unwanted_patterns = [
    %r{<div[^>]*class="[^"]*comment[^"]*"[^>]*>.*?</div>}mi,
    %r{<div[^>]*class="[^"]*social[^"]*"[^>]*>.*?</div>}mi,
    %r{<div[^>]*class="[^"]*share[^"]*"[^>]*>.*?</div>}mi,
    # "ad" must stand alone (ad, ads, advert, ad-banner, ad_unit); a bare substring
    # match would also delete divs whose class merely contains it (shadow, loading)
    %r{<div[^>]*class="[^"]*(?<![a-z])ad(?:s|vert)?(?![a-z])[^"]*"[^>]*>.*?</div>}mi,
    %r{<div[^>]*class="[^"]*advertisement[^"]*"[^>]*>.*?</div>}mi,
    %r{<div[^>]*class="[^"]*popup[^"]*"[^>]*>.*?</div>}mi,
    %r{<div[^>]*class="[^"]*modal[^"]*"[^>]*>.*?</div>}mi
  ]
  unwanted_patterns.each { |pattern| cleaned = cleaned.gsub(pattern, "") }

  # Clean up whitespace
  cleaned.gsub(/\n\s*\n\s*\n+/, "\n\n").strip
end
|
322
|
+
|
323
|
+
# Plain-text fallback used when markdown conversion fails: drops scripts, styles
# and all tags, decodes a handful of common entities and collapses whitespace.
# Returns a fixed notice when fewer than 20 characters remain.
def extract_text_content(html) # rubocop:disable Metrics/MethodLength
  entities = { "&nbsp;" => " ", "&amp;" => "&", "&lt;" => "<", "&gt;" => ">", "&quot;" => '"' }

  text = html.to_s
             .gsub(%r{<script[^>]*>.*?</script>}mi, "")
             .gsub(%r{<style[^>]*>.*?</style>}mi, "")
             .gsub(/<[^>]+>/, "")
             # Decoded in one pass so "&amp;lt;" becomes "&lt;", not "<"
             .gsub(/&(?:nbsp|amp|lt|gt|quot);/, entities)
             .gsub(/\s+/, " ")
             .strip

  # If the extracted text is very short, it might not be useful
  text.length < 20 ? "Content could not be extracted from this page." : text
end
|
339
|
+
end
|
340
|
+
end
|
341
|
+
end
|