aircana 1.0.0 → 1.2.0

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
@@ -0,0 +1,341 @@
+ # frozen_string_literal: true
+
+ require "httparty"
+ require "reverse_markdown"
+ require "uri"
+ require_relative "local"
+ require_relative "manifest"
+ require_relative "../progress_tracker"
+ require_relative "../version"
+ require_relative "../llm/claude_client"
+
+ module Aircana
+   module Contexts
+     class Web # rubocop:disable Metrics/ClassLength
+       include HTTParty
+
+       headers "User-Agent" => "Aircana/#{Aircana::VERSION} (+https://github.com/westonkd/aircana)"
+       default_timeout 30
+       follow_redirects true
+
+       def initialize
+         @local_storage = Local.new
+       end
+
+       def fetch_url_for(agent:, url:)
+         validate_url!(url)
+
+         page_data = fetch_and_process_url(url)
+         store_page_as_markdown(page_data, agent)
+
+         build_url_metadata(page_data)
+       rescue StandardError => e
+         handle_fetch_error(url, e)
+         nil
+       end
+
+       def fetch_urls_for(agent:, urls:) # rubocop:disable Metrics/MethodLength
+         return { pages_count: 0, sources: [] } if urls.empty?
+
+         pages_metadata = []
+         successful_urls = []
+
+         ProgressTracker.with_batch_progress(urls, "Fetching URLs") do |url, _index|
+           metadata = fetch_url_for(agent: agent, url: url)
+           if metadata
+             pages_metadata << metadata
+             successful_urls << url
+           end
+         end
+
+         if successful_urls.any?
+           sources = build_sources_metadata(successful_urls, pages_metadata)
+           update_or_create_manifest(agent, sources)
+           { pages_count: successful_urls.size, sources: sources }
+         else
+           { pages_count: 0, sources: [] }
+         end
+       end
+
+       def refresh_web_sources(agent:) # rubocop:disable Metrics/CyclomaticComplexity
+         sources = Manifest.sources_from_manifest(agent)
+         web_sources = sources.select { |s| s["type"] == "web" }
+
+         return { pages_count: 0, sources: [] } if web_sources.empty?
+
+         all_urls = web_sources.flat_map { |source| source["urls"]&.map { |u| u["url"] } || [] }
+         return { pages_count: 0, sources: [] } if all_urls.empty?
+
+         fetch_urls_for(agent: agent, urls: all_urls)
+       end
+
+       private
+
+       def validate_url!(url)
+         uri = URI.parse(url)
+         raise Error, "URL must use HTTP or HTTPS protocol" unless %w[http https].include?(uri.scheme)
+         raise Error, "Invalid URL format" unless uri.host
+       rescue URI::InvalidURIError
+         raise Error, "Invalid URL format"
+       end
+
+       def fetch_and_process_url(url) # rubocop:disable Metrics/MethodLength
+         Aircana.human_logger.info("Fetching #{url}")
+
+         response = self.class.get(url)
+
+         raise Error, "Failed to fetch URL (#{response.code})" unless response.success?
+
+         html_title = extract_title(response.body)
+         content = convert_to_markdown(response.body)
+         title = generate_meaningful_title(html_title, content, url)
+
+         {
+           url: url,
+           title: title,
+           content: content,
+           last_fetched: Time.now.utc.strftime("%Y-%m-%dT%H:%M:%SZ")
+         }
+       end
+
+       def extract_title(html) # rubocop:disable Metrics/MethodLength
+         title_match = html.match(%r{<title[^>]*>(.*?)</title>}im)
+         return nil unless title_match
+
+         title = title_match[1].strip
+         # Decode HTML entities
+         title.gsub(/&([a-zA-Z]+|#\d+);/) do |match|
+           case match
+           when "&amp;" then "&"
+           when "&lt;" then "<"
+           when "&gt;" then ">"
+           when "&quot;" then '"'
+           when "&#39;", "&apos;" then "'"
+           else match
+           end
+         end
+       end
+
+       def extract_title_from_url(url)
+         uri = URI.parse(url)
+         # Use the last path segment or host as fallback title
+         path_segments = uri.path.split("/").reject(&:empty?)
+         if path_segments.any?
+           path_segments.last.gsub(/[-_]/, " ").split.map(&:capitalize).join(" ")
+         else
+           uri.host
+         end
+       end
+
+       def generate_meaningful_title(html_title, content, url) # rubocop:disable Metrics/CyclomaticComplexity
+         # If we have a good HTML title that's descriptive, use it
+         return html_title if html_title && html_title.length > 10 && !generic_title?(html_title)
+
+         # If content is too short, use fallback
+         return html_title || extract_title_from_url(url) if content.length < 50
+
+         # Use Claude to generate a meaningful title based on content
+         begin
+           generate_title_with_claude(content, url)
+         rescue StandardError => e
+           Aircana.human_logger.warn("Failed to generate title with Claude: #{e.message}")
+           html_title || extract_title_from_url(url)
+         end
+       end
+
+       def generic_title?(title)
+         generic_patterns = [
+           /^(home|index|welcome|untitled|document)$/i,
+           /^(page|default)$/i,
+           /^\s*$/,
+           # Truncated titles (contain ellipsis)
+           /\.\.\./,
+           # Titles with excessive metadata (site names, IDs, etc.)
+           / - .+ - \d+$/,
+           # Question titles that are truncated
+           /^how do i .+\.\.\./i,
+           /^what is .+\.\.\./i
+         ]
+
+         generic_patterns.any? { |pattern| title.match?(pattern) }
+       end
+
+       def generate_title_with_claude(content, url)
+         prompt = build_title_generation_prompt(content, url)
+         claude_client = LLM::ClaudeClient.new
+         claude_client.prompt(prompt).strip
+       end
+
+       def build_title_generation_prompt(content, url) # rubocop:disable Metrics/MethodLength
+         # Truncate content to avoid overly long prompts
+         truncated_content = content.length > 1000 ? "#{content[0..1000]}..." : content
+
+         <<~PROMPT
+           Based on the following web page content from #{url}, generate a concise, descriptive title
+           that would help an AI agent understand what this document contains and when it would be useful.
+
+           The title should be:
+           - 3-8 words long
+           - Focused on the main topic or purpose
+           - Helpful for knowledge retrieval
+           - Professional and clear
+
+           Content:
+           #{truncated_content}
+
+           Respond with only the title, no additional text or explanation.
+         PROMPT
+       end
+
+       def convert_to_markdown(html)
+         return "" if html.nil? || html.empty?
+
+         # Extract meaningful content by removing unwanted elements
+         cleaned_html = extract_main_content(html)
+
+         ReverseMarkdown.convert(cleaned_html, github_flavored: true)
+       rescue StandardError => e
+         Aircana.human_logger.warn "Failed to convert HTML to markdown: #{e.message}"
+         # Fallback to plain text extraction
+         extract_text_content(html)
+       end
+
+       def store_page_as_markdown(page_data, agent)
+         @local_storage.store_content(
+           title: page_data[:title],
+           content: page_data[:content],
+           agent: agent
+         )
+       end
+
+       def build_url_metadata(page_data)
+         {
+           "url" => page_data[:url],
+           "title" => page_data[:title],
+           "last_fetched" => page_data[:last_fetched]
+         }
+       end
+
+       def build_sources_metadata(_urls, pages_metadata)
+         [
+           {
+             "type" => "web",
+             "urls" => pages_metadata
+           }
+         ]
+       end
+
+       def update_or_create_manifest(agent, new_sources)
+         existing_sources = Manifest.sources_from_manifest(agent)
+
+         # Remove existing web sources and add new ones
+         other_sources = existing_sources.reject { |s| s["type"] == "web" }
+         all_sources = other_sources + new_sources
+
+         if Manifest.manifest_exists?(agent)
+           Manifest.update_manifest(agent, all_sources)
+         else
+           Manifest.create_manifest(agent, all_sources)
+         end
+       end
+
+       def handle_fetch_error(url, error)
+         case error
+         when URI::InvalidURIError
+           Aircana.human_logger.error "Invalid URL format: #{url}"
+         when HTTParty::Error
+           Aircana.human_logger.error "HTTP error fetching #{url}: #{error.message}"
+         when Error
+           Aircana.human_logger.error "Error fetching #{url}: #{error.message}"
+         else
+           Aircana.human_logger.error "Unexpected error fetching #{url}: #{error.message}"
+         end
+       end
+
+       def extract_main_content(html) # rubocop:disable Metrics/MethodLength
+         # Try to find the main content area using common selectors
+         content_patterns = [
+           # Common main content selectors
+           %r{<main[^>]*>(.*?)</main>}mi,
+           %r{<article[^>]*>(.*?)</article>}mi,
+           %r{<div[^>]*class="[^"]*content[^"]*"[^>]*>(.*?)</div>}mi,
+           %r{<div[^>]*id="content"[^>]*>(.*?)</div>}mi,
+           %r{<div[^>]*class="[^"]*post[^"]*"[^>]*>(.*?)</div>}mi,
+           # Documentation specific
+           %r{<div[^>]*class="[^"]*docs[^"]*"[^>]*>(.*?)</div>}mi,
+           %r{<div[^>]*class="[^"]*documentation[^"]*"[^>]*>(.*?)</div>}mi,
+           # Body content as fallback
+           %r{<body[^>]*>(.*?)</body>}mi
+         ]
+
+         extracted_content = nil
+         content_patterns.each do |pattern|
+           match = html.match(pattern)
+           if match && match[1].strip.length > 100 # Ensure meaningful content
+             extracted_content = match[1]
+             break
+           end
+         end
+
+         # If no pattern matched or content is too short, use the full HTML
+         content_to_clean = extracted_content || html
+
+         # Remove unwanted elements
+         clean_html_content(content_to_clean)
+       end
+
+       def clean_html_content(html) # rubocop:disable Metrics/MethodLength
+         cleaned = html.dup
+
+         # Remove script and style tags completely
+         cleaned = cleaned.gsub(%r{<script[^>]*>.*?</script>}mi, "")
+         cleaned = cleaned.gsub(%r{<style[^>]*>.*?</style>}mi, "")
+
+         # Remove navigation, header, footer, sidebar elements
+         navigation_selectors = %w[nav header footer aside sidebar menu breadcrumb]
+         navigation_selectors.each do |selector|
+           # Remove by tag name
+           cleaned = cleaned.gsub(%r{<#{selector}[^>]*>.*?</#{selector}>}mi, "")
+           # Remove by class name (common patterns)
+           cleaned = cleaned.gsub(%r{<[^>]+class="[^"]*#{selector}[^"]*"[^>]*>.*?</[^>]+>}mi, "")
+           cleaned = cleaned.gsub(%r{<[^>]+id="#{selector}"[^>]*>.*?</[^>]+>}mi, "")
+         end
+
+         # Remove common non-content elements
+         unwanted_patterns = [
+           %r{<div[^>]*class="[^"]*comment[^"]*"[^>]*>.*?</div>}mi,
+           %r{<div[^>]*class="[^"]*social[^"]*"[^>]*>.*?</div>}mi,
+           %r{<div[^>]*class="[^"]*share[^"]*"[^>]*>.*?</div>}mi,
+           %r{<div[^>]*class="[^"]*ad[^"]*"[^>]*>.*?</div>}mi,
+           %r{<div[^>]*class="[^"]*advertisement[^"]*"[^>]*>.*?</div>}mi,
+           %r{<div[^>]*class="[^"]*popup[^"]*"[^>]*>.*?</div>}mi,
+           %r{<div[^>]*class="[^"]*modal[^"]*"[^>]*>.*?</div>}mi
+         ]
+
+         unwanted_patterns.each do |pattern|
+           cleaned = cleaned.gsub(pattern, "")
+         end
+
+         # Clean up whitespace
+         cleaned.gsub(/\n\s*\n\s*\n+/, "\n\n").strip
+       end
+
+       def extract_text_content(html) # rubocop:disable Metrics/MethodLength
+         # Fallback method for plain text extraction
+         text = html.gsub(%r{<script[^>]*>.*?</script>}mi, "")
+                    .gsub(%r{<style[^>]*>.*?</style>}mi, "")
+                    .gsub(/<[^>]+>/, "")
+                    .gsub("&nbsp;", " ")
+                    .gsub("&amp;", "&")
+                    .gsub("&lt;", "<")
+                    .gsub("&gt;", ">")
+                    .gsub("&quot;", '"')
+                    .gsub(/\s+/, " ")
+                    .strip
+
+         # If the extracted text is very short, it might not be useful
+         text.length < 20 ? "Content could not be extracted from this page." : text
+       end
+     end
+   end
+ end
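
The new `Aircana::Contexts::Web` class added above exposes three public entry points: `fetch_url_for`, `fetch_urls_for`, and `refresh_web_sources`. A minimal usage sketch based only on those signatures follows; the require path, agent name, and URL are illustrative assumptions, not taken from the gem's documentation.

# Hypothetical example; assumes the top-level require loads the contexts.
require "aircana"

web = Aircana::Contexts::Web.new

# Fetch pages into an agent's knowledge base; returns { pages_count:, sources: }.
result = web.fetch_urls_for(
  agent: "backend-expert",                              # assumed agent name
  urls: ["https://example.com/docs/getting-started"]    # assumed URL
)
puts "Stored #{result[:pages_count]} page(s)"

# Re-fetch every web source already recorded in the agent's manifest.
web.refresh_web_sources(agent: "backend-expert")

Failed fetches are logged and skipped rather than raised, so the returned counts reflect only the pages that were actually stored.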
@@ -133,6 +133,26 @@ module Aircana
  to = Pathname.new(File.expand_path(to_path))
  to.relative_path_from(from).to_s
  end
+
+ # Helper methods for resolving symlinked agent paths
+ def resolve_agent_path(agent_name)
+   agent_path = File.join(Aircana.configuration.agent_knowledge_dir, agent_name)
+
+   if File.symlink?(agent_path)
+     File.readlink(agent_path)
+   else
+     agent_path
+   end
+ end
+
+ def agent_is_symlinked?(agent_name)
+   agent_path = File.join(Aircana.configuration.agent_knowledge_dir, agent_name)
+   File.symlink?(agent_path)
+ end
+
+ def resolve_symlinked_path(path)
+   File.symlink?(path) ? File.readlink(path) : path
+ end
  end
  end
  end
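
The helpers above dereference a symlinked agent directory one level with `File.readlink`, falling back to the original path otherwise. A standalone sketch of that pattern, with hypothetical paths:

# Illustrative only: the same File.symlink?/File.readlink pattern used by resolve_symlinked_path.
require "fileutils"

FileUtils.mkdir_p("/tmp/aircana_demo/real_agent")
FileUtils.ln_s("/tmp/aircana_demo/real_agent", "/tmp/aircana_demo/linked_agent", force: true)

path = "/tmp/aircana_demo/linked_agent"
resolved = File.symlink?(path) ? File.readlink(path) : path
puts resolved # => "/tmp/aircana_demo/real_agent"

Note that `File.readlink` returns the link target exactly as stored, so a relative symlink yields a relative path rather than an absolute one.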
@@ -1,5 +1,5 @@
  # frozen_string_literal: true

  module Aircana
-   VERSION = "1.0.0"
+   VERSION = "1.2.0"
  end
metadata CHANGED
@@ -1,13 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: aircana
  version: !ruby/object:Gem::Version
-   version: 1.0.0
+   version: 1.2.0
  platform: ruby
  authors:
  - Weston Dransfield
  bindir: exe
  cert_chain: []
- date: 1980-01-02 00:00:00.000000000 Z
+ date: 2025-09-26 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: httparty
@@ -143,7 +143,9 @@ files:
  - lib/aircana/contexts/confluence_logging.rb
  - lib/aircana/contexts/confluence_setup.rb
  - lib/aircana/contexts/local.rb
+ - lib/aircana/contexts/manifest.rb
  - lib/aircana/contexts/relevant_files.rb
+ - lib/aircana/contexts/web.rb
  - lib/aircana/fzf_helper.rb
  - lib/aircana/generators.rb
  - lib/aircana/generators/agents_generator.rb
@@ -196,7 +198,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
    version: '0'
  requirements: []
- rubygems_version: 3.6.9
+ rubygems_version: 3.6.2
  specification_version: 4
  summary: Humble workflow and context utilities for engineering with agents
  test_files: []