aircana 1.0.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rspec_status +195 -142
- data/.rubocop.yml +5 -0
- data/CLAUDE.md +18 -4
- data/lib/aircana/cli/app.rb +5 -0
- data/lib/aircana/cli/commands/agents.rb +141 -4
- data/lib/aircana/contexts/confluence.rb +75 -3
- data/lib/aircana/contexts/manifest.rb +168 -0
- data/lib/aircana/contexts/web.rb +341 -0
- data/lib/aircana/symlink_manager.rb +20 -0
- data/lib/aircana/version.rb +1 -1
- metadata +5 -3
data/lib/aircana/contexts/web.rb
ADDED
@@ -0,0 +1,341 @@
+# frozen_string_literal: true
+
+require "httparty"
+require "reverse_markdown"
+require "uri"
+require_relative "local"
+require_relative "manifest"
+require_relative "../progress_tracker"
+require_relative "../version"
+require_relative "../llm/claude_client"
+
+module Aircana
+  module Contexts
+    class Web # rubocop:disable Metrics/ClassLength
+      include HTTParty
+
+      headers "User-Agent" => "Aircana/#{Aircana::VERSION} (+https://github.com/westonkd/aircana)"
+      default_timeout 30
+      follow_redirects true
+
+      def initialize
+        @local_storage = Local.new
+      end
+
+      def fetch_url_for(agent:, url:)
+        validate_url!(url)
+
+        page_data = fetch_and_process_url(url)
+        store_page_as_markdown(page_data, agent)
+
+        build_url_metadata(page_data)
+      rescue StandardError => e
+        handle_fetch_error(url, e)
+        nil
+      end
+
+      def fetch_urls_for(agent:, urls:) # rubocop:disable Metrics/MethodLength
+        return { pages_count: 0, sources: [] } if urls.empty?
+
+        pages_metadata = []
+        successful_urls = []
+
+        ProgressTracker.with_batch_progress(urls, "Fetching URLs") do |url, _index|
+          metadata = fetch_url_for(agent: agent, url: url)
+          if metadata
+            pages_metadata << metadata
+            successful_urls << url
+          end
+        end
+
+        if successful_urls.any?
+          sources = build_sources_metadata(successful_urls, pages_metadata)
+          update_or_create_manifest(agent, sources)
+          { pages_count: successful_urls.size, sources: sources }
+        else
+          { pages_count: 0, sources: [] }
+        end
+      end
+
+      def refresh_web_sources(agent:) # rubocop:disable Metrics/CyclomaticComplexity
+        sources = Manifest.sources_from_manifest(agent)
+        web_sources = sources.select { |s| s["type"] == "web" }
+
+        return { pages_count: 0, sources: [] } if web_sources.empty?
+
+        all_urls = web_sources.flat_map { |source| source["urls"]&.map { |u| u["url"] } || [] }
+        return { pages_count: 0, sources: [] } if all_urls.empty?
+
+        fetch_urls_for(agent: agent, urls: all_urls)
+      end
+
+      private
+
+      def validate_url!(url)
+        uri = URI.parse(url)
+        raise Error, "URL must use HTTP or HTTPS protocol" unless %w[http https].include?(uri.scheme)
+        raise Error, "Invalid URL format" unless uri.host
+      rescue URI::InvalidURIError
+        raise Error, "Invalid URL format"
+      end
+
+      def fetch_and_process_url(url) # rubocop:disable Metrics/MethodLength
+        Aircana.human_logger.info("Fetching #{url}")
+
+        response = self.class.get(url)
+
+        raise Error, "Failed to fetch URL (#{response.code})" unless response.success?
+
+        html_title = extract_title(response.body)
+        content = convert_to_markdown(response.body)
+        title = generate_meaningful_title(html_title, content, url)
+
+        {
+          url: url,
+          title: title,
+          content: content,
+          last_fetched: Time.now.utc.strftime("%Y-%m-%dT%H:%M:%SZ")
+        }
+      end
+
+      def extract_title(html) # rubocop:disable Metrics/MethodLength
+        title_match = html.match(%r{<title[^>]*>(.*?)</title>}im)
+        return nil unless title_match
+
+        title = title_match[1].strip
+        # Decode HTML entities
+        title.gsub(/&([a-zA-Z]+|#\d+);/) do |match|
+          case match
+          when "&amp;" then "&"
+          when "&lt;" then "<"
+          when "&gt;" then ">"
+          when "&quot;" then '"'
+          when "&#39;", "&apos;" then "'"
+          else match
+          end
+        end
+      end
+
+      def extract_title_from_url(url)
+        uri = URI.parse(url)
+        # Use the last path segment or host as fallback title
+        path_segments = uri.path.split("/").reject(&:empty?)
+        if path_segments.any?
+          path_segments.last.gsub(/[-_]/, " ").split.map(&:capitalize).join(" ")
+        else
+          uri.host
+        end
+      end
+
+      def generate_meaningful_title(html_title, content, url) # rubocop:disable Metrics/CyclomaticComplexity
+        # If we have a good HTML title that's descriptive, use it
+        return html_title if html_title && html_title.length > 10 && !generic_title?(html_title)
+
+        # If content is too short, use fallback
+        return html_title || extract_title_from_url(url) if content.length < 50
+
+        # Use Claude to generate a meaningful title based on content
+        begin
+          generate_title_with_claude(content, url)
+        rescue StandardError => e
+          Aircana.human_logger.warn("Failed to generate title with Claude: #{e.message}")
+          html_title || extract_title_from_url(url)
+        end
+      end
+
+      def generic_title?(title)
+        generic_patterns = [
+          /^(home|index|welcome|untitled|document)$/i,
+          /^(page|default)$/i,
+          /^\s*$/,
+          # Truncated titles (contain ellipsis)
+          /\.\.\./,
+          # Titles with excessive metadata (site names, IDs, etc.)
+          / - .+ - \d+$/,
+          # Question titles that are truncated
+          /^how do i .+\.\.\./i,
+          /^what is .+\.\.\./i
+        ]
+
+        generic_patterns.any? { |pattern| title.match?(pattern) }
+      end
+
+      def generate_title_with_claude(content, url)
+        prompt = build_title_generation_prompt(content, url)
+        claude_client = LLM::ClaudeClient.new
+        claude_client.prompt(prompt).strip
+      end
+
+      def build_title_generation_prompt(content, url) # rubocop:disable Metrics/MethodLength
+        # Truncate content to avoid overly long prompts
+        truncated_content = content.length > 1000 ? "#{content[0..1000]}..." : content
+
+        <<~PROMPT
+          Based on the following web page content from #{url}, generate a concise, descriptive title
+          that would help an AI agent understand what this document contains and when it would be useful.
+
+          The title should be:
+          - 3-8 words long
+          - Focused on the main topic or purpose
+          - Helpful for knowledge retrieval
+          - Professional and clear
+
+          Content:
+          #{truncated_content}
+
+          Respond with only the title, no additional text or explanation.
+        PROMPT
+      end
+
+      def convert_to_markdown(html)
+        return "" if html.nil? || html.empty?
+
+        # Extract meaningful content by removing unwanted elements
+        cleaned_html = extract_main_content(html)
+
+        ReverseMarkdown.convert(cleaned_html, github_flavored: true)
+      rescue StandardError => e
+        Aircana.human_logger.warn "Failed to convert HTML to markdown: #{e.message}"
+        # Fallback to plain text extraction
+        extract_text_content(html)
+      end
+
+      def store_page_as_markdown(page_data, agent)
+        @local_storage.store_content(
+          title: page_data[:title],
+          content: page_data[:content],
+          agent: agent
+        )
+      end
+
+      def build_url_metadata(page_data)
+        {
+          "url" => page_data[:url],
+          "title" => page_data[:title],
+          "last_fetched" => page_data[:last_fetched]
+        }
+      end
+
+      def build_sources_metadata(_urls, pages_metadata)
+        [
+          {
+            "type" => "web",
+            "urls" => pages_metadata
+          }
+        ]
+      end
+
+      def update_or_create_manifest(agent, new_sources)
+        existing_sources = Manifest.sources_from_manifest(agent)
+
+        # Remove existing web sources and add new ones
+        other_sources = existing_sources.reject { |s| s["type"] == "web" }
+        all_sources = other_sources + new_sources
+
+        if Manifest.manifest_exists?(agent)
+          Manifest.update_manifest(agent, all_sources)
+        else
+          Manifest.create_manifest(agent, all_sources)
+        end
+      end
+
+      def handle_fetch_error(url, error)
+        case error
+        when URI::InvalidURIError
+          Aircana.human_logger.error "Invalid URL format: #{url}"
+        when HTTParty::Error
+          Aircana.human_logger.error "HTTP error fetching #{url}: #{error.message}"
+        when Error
+          Aircana.human_logger.error "Error fetching #{url}: #{error.message}"
+        else
+          Aircana.human_logger.error "Unexpected error fetching #{url}: #{error.message}"
+        end
+      end
+
+      def extract_main_content(html) # rubocop:disable Metrics/MethodLength
+        # Try to find the main content area using common selectors
+        content_patterns = [
+          # Common main content selectors
+          %r{<main[^>]*>(.*?)</main>}mi,
+          %r{<article[^>]*>(.*?)</article>}mi,
+          %r{<div[^>]*class="[^"]*content[^"]*"[^>]*>(.*?)</div>}mi,
+          %r{<div[^>]*id="content"[^>]*>(.*?)</div>}mi,
+          %r{<div[^>]*class="[^"]*post[^"]*"[^>]*>(.*?)</div>}mi,
+          # Documentation specific
+          %r{<div[^>]*class="[^"]*docs[^"]*"[^>]*>(.*?)</div>}mi,
+          %r{<div[^>]*class="[^"]*documentation[^"]*"[^>]*>(.*?)</div>}mi,
+          # Body content as fallback
+          %r{<body[^>]*>(.*?)</body>}mi
+        ]
+
+        extracted_content = nil
+        content_patterns.each do |pattern|
+          match = html.match(pattern)
+          if match && match[1].strip.length > 100 # Ensure meaningful content
+            extracted_content = match[1]
+            break
+          end
+        end
+
+        # If no pattern matched or content is too short, use the full HTML
+        content_to_clean = extracted_content || html
+
+        # Remove unwanted elements
+        clean_html_content(content_to_clean)
+      end
+
+      def clean_html_content(html) # rubocop:disable Metrics/MethodLength
+        cleaned = html.dup
+
+        # Remove script and style tags completely
+        cleaned = cleaned.gsub(%r{<script[^>]*>.*?</script>}mi, "")
+        cleaned = cleaned.gsub(%r{<style[^>]*>.*?</style>}mi, "")
+
+        # Remove navigation, header, footer, sidebar elements
+        navigation_selectors = %w[nav header footer aside sidebar menu breadcrumb]
+        navigation_selectors.each do |selector|
+          # Remove by tag name
+          cleaned = cleaned.gsub(%r{<#{selector}[^>]*>.*?</#{selector}>}mi, "")
+          # Remove by class name (common patterns)
+          cleaned = cleaned.gsub(%r{<[^>]+class="[^"]*#{selector}[^"]*"[^>]*>.*?</[^>]+>}mi, "")
+          cleaned = cleaned.gsub(%r{<[^>]+id="#{selector}"[^>]*>.*?</[^>]+>}mi, "")
+        end
+
+        # Remove common non-content elements
+        unwanted_patterns = [
+          %r{<div[^>]*class="[^"]*comment[^"]*"[^>]*>.*?</div>}mi,
+          %r{<div[^>]*class="[^"]*social[^"]*"[^>]*>.*?</div>}mi,
+          %r{<div[^>]*class="[^"]*share[^"]*"[^>]*>.*?</div>}mi,
+          %r{<div[^>]*class="[^"]*ad[^"]*"[^>]*>.*?</div>}mi,
+          %r{<div[^>]*class="[^"]*advertisement[^"]*"[^>]*>.*?</div>}mi,
+          %r{<div[^>]*class="[^"]*popup[^"]*"[^>]*>.*?</div>}mi,
+          %r{<div[^>]*class="[^"]*modal[^"]*"[^>]*>.*?</div>}mi
+        ]
+
+        unwanted_patterns.each do |pattern|
+          cleaned = cleaned.gsub(pattern, "")
+        end
+
+        # Clean up whitespace
+        cleaned.gsub(/\n\s*\n\s*\n+/, "\n\n").strip
+      end
+
+      def extract_text_content(html) # rubocop:disable Metrics/MethodLength
+        # Fallback method for plain text extraction
+        text = html.gsub(%r{<script[^>]*>.*?</script>}mi, "")
+                   .gsub(%r{<style[^>]*>.*?</style>}mi, "")
+                   .gsub(/<[^>]+>/, "")
+                   .gsub("&nbsp;", " ")
+                   .gsub("&amp;", "&")
+                   .gsub("&lt;", "<")
+                   .gsub("&gt;", ">")
+                   .gsub("&quot;", '"')
+                   .gsub(/\s+/, " ")
+                   .strip
+
+        # If the extracted text is very short, it might not be useful
+        text.length < 20 ? "Content could not be extracted from this page." : text
+      end
+    end
+  end
+end
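For orientation, here is a minimal usage sketch of the new Web context as its public API appears above. The agent name and URLs are hypothetical placeholders, and the real call sites live in the CLI commands (data/lib/aircana/cli/commands/agents.rb), which this diff does not expand.

    # Hypothetical sketch, not part of the gem diff: fetch two pages into an
    # agent's knowledge base, then later re-fetch whatever the manifest records.
    require "aircana"

    web = Aircana::Contexts::Web.new

    result = web.fetch_urls_for(
      agent: "backend-expert",                       # hypothetical agent name
      urls: [
        "https://example.com/docs/getting-started",  # hypothetical URLs
        "https://example.com/docs/configuration"
      ]
    )
    puts "Stored #{result[:pages_count]} page(s)"

    # Re-fetch every URL recorded under a "web" source in the agent's manifest.
    web.refresh_web_sources(agent: "backend-expert")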
data/lib/aircana/symlink_manager.rb
CHANGED
@@ -133,6 +133,26 @@ module Aircana
         to = Pathname.new(File.expand_path(to_path))
         to.relative_path_from(from).to_s
       end
+
+      # Helper methods for resolving symlinked agent paths
+      def resolve_agent_path(agent_name)
+        agent_path = File.join(Aircana.configuration.agent_knowledge_dir, agent_name)
+
+        if File.symlink?(agent_path)
+          File.readlink(agent_path)
+        else
+          agent_path
+        end
+      end
+
+      def agent_is_symlinked?(agent_name)
+        agent_path = File.join(Aircana.configuration.agent_knowledge_dir, agent_name)
+        File.symlink?(agent_path)
+      end
+
+      def resolve_symlinked_path(path)
+        File.symlink?(path) ? File.readlink(path) : path
+      end
     end
   end
 end
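The new helpers follow at most one level of indirection with File.symlink? and File.readlink. Below is a self-contained sketch of that resolution pattern, using a throwaway temp directory rather than Aircana's configured agent_knowledge_dir; the paths and names are made up for the example.

    # Standalone illustration of the resolution pattern added above.
    require "fileutils"
    require "tmpdir"

    Dir.mktmpdir do |dir|
      real = File.join(dir, "real-agent")
      link = File.join(dir, "linked-agent")
      FileUtils.mkdir_p(real)
      File.symlink(real, link)

      # Same logic as resolve_symlinked_path: follow the link once, else pass through.
      resolve = ->(path) { File.symlink?(path) ? File.readlink(path) : path }

      puts resolve.call(link)  # prints the real-agent path
      puts resolve.call(real)  # prints the same path unchanged
    end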
data/lib/aircana/version.rb
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: aircana
 version: !ruby/object:Gem::Version
-  version: 1.0.0
+  version: 1.2.0
 platform: ruby
 authors:
 - Weston Dransfield
 bindir: exe
 cert_chain: []
-date:
+date: 2025-09-26 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: httparty
@@ -143,7 +143,9 @@ files:
 - lib/aircana/contexts/confluence_logging.rb
 - lib/aircana/contexts/confluence_setup.rb
 - lib/aircana/contexts/local.rb
+- lib/aircana/contexts/manifest.rb
 - lib/aircana/contexts/relevant_files.rb
+- lib/aircana/contexts/web.rb
 - lib/aircana/fzf_helper.rb
 - lib/aircana/generators.rb
 - lib/aircana/generators/agents_generator.rb
@@ -196,7 +198,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.6.
+rubygems_version: 3.6.2
 specification_version: 4
 summary: Humble workflow and context utilities for engineering with agents
 test_files: []