hokipoki 0.1.0 → 0.1.2

This diff represents the changes between publicly available versions of this package, as they appear in the supported public registries. It is provided for informational purposes only.
@@ -1,401 +0,0 @@
- # frozen_string_literal: true
-
- module Hokipoki
-   module Intelligence
-     # Smart Retrieval Engine with Real-time User Feedback
-     # Enhanced version of the original with comprehensive operation visibility
-     class SmartRetrievalEngine
-       include Singleton
-
-       def initialize
-         @logger = Rails.logger
-         @feedback = Feedback::DisplayManager.instance
-       end
-
-       # PATTERN-003: Intent-based analysis with surgical precision + real vector similarity
-       def retrieve_targeted_facts(query, token_budget: 1500, intent: 'auto')
-         start_time = Time.current
-
-         @feedback.performing_vector_retrieval(query, token_budget) do
-           @logger.info "🎯 SMART RETRIEVAL: Processing query with vector similarity"
-         end
-
-         begin
-           # Step 1: Analyze intent if auto
-           if intent == 'auto'
-             @feedback.debug_info('intent_analysis', 'detecting query intent')
-             intent = analyze_query_intent(query)
-             @feedback.debug_info('intent_detected', intent.to_s)
-           end
-
-           # Step 2: Multi-stage retrieval with vector similarity
-           @feedback.pulling_from_hive_mind('vectors')
-           facts = perform_multi_stage_retrieval(query, token_budget, intent)
-
-           # Step 3: Calculate processing metrics
-           processing_time = ((Time.current - start_time) * 1000).round(2)
-
-           if facts.present?
-             fact_count = facts.is_a?(Array) ? facts.length : 1
-             total_tokens = estimate_total_tokens(facts)
-             tokens_saved = calculate_tokens_saved(total_tokens, query)
-
-             @feedback.vector_results_found(fact_count, processing_time)
-             @feedback.vector_context_built(fact_count, total_tokens)
-             @feedback.total_tokens_saved(tokens_saved, get_session_total_saved(tokens_saved))
-             @feedback.token_usage(total_tokens, token_budget)
-
-             return facts
-           else
-             @feedback.vector_results_found(0, processing_time)
-             @feedback.operation_warning('vector_retrieval', 'falling back to pattern guidance')
-           end
-
-         rescue => e
-           @feedback.operation_error('vector_retrieval', e.message)
-           @logger.warn "Vector retrieval failed: #{e.message}, falling back to patterns"
-         end
-
-         # Fallback to pattern-based retrieval
-         @feedback.debug_info('fallback', 'using pattern-based retrieval')
-         fallback_result = get_pattern_fallback(query, intent)
-
-         if fallback_result
-           @feedback.vector_results_found(1, ((Time.current - start_time) * 1000).round(2))
-           return fallback_result
-         else
-           @feedback.vector_results_found(0, ((Time.current - start_time) * 1000).round(2))
-           return "Context: No specific guidance available for query: #{query}"
-         end
-       end
-
-       # Multi-stage retrieval with comprehensive feedback
-       def perform_multi_stage_retrieval(query, token_budget, intent)
-         @feedback.debug_info('retrieval_stages', 'starting multi-stage retrieval')
-
-         # Stage 1: Vector similarity search
-         @feedback.debug_info('stage_1', 'vector similarity search')
-         vector_candidates = perform_vector_similarity_search(query, intent)
-
-         # Stage 2: Rank by relevance
-         @feedback.debug_info('stage_2', 'ranking by relevance')
-         ranked_candidates = rank_by_relevance(vector_candidates, query, intent)
-
-         # Stage 3: Filter by quality
-         @feedback.debug_info('stage_3', 'filtering by quality')
-         filtered_candidates = filter_by_quality(ranked_candidates)
-
-         # Stage 4: Build context within token budget
-         @feedback.debug_info('stage_4', 'building final context')
-         final_context = build_contextual_response(filtered_candidates, query, token_budget)
-
-         final_context
-       end
-
-       # Vector similarity search with detailed feedback
-       def perform_vector_similarity_search(query, intent)
-         @feedback.debug_info('vector_search', "searching for intent: #{intent}")
-
-         candidates = []
-
-         # Search 1: Exact semantic match
-         @feedback.debug_info('search_exact', 'performing exact semantic match')
-         exact_matches = search_documents_by_content(query, similarity_threshold: 0.9, limit: 10)
-         candidates.concat(tag_results(exact_matches, :exact_match))
-         @feedback.debug_info('exact_results', "found #{exact_matches.length} exact matches")
-
-         # Search 2: Conceptual similarity
-         @feedback.debug_info('search_conceptual', 'performing conceptual similarity search')
-         conceptual_matches = search_documents_by_content(query, similarity_threshold: 0.7, limit: 15)
-         candidates.concat(tag_results(conceptual_matches, :conceptual_match))
-         @feedback.debug_info('conceptual_results', "found #{conceptual_matches.length} conceptual matches")
-
-         # Search 3: Intent-based search
-         @feedback.debug_info('search_intent', "performing intent-based search for #{intent}")
-         intent_matches = search_by_intent(query, intent)
-         candidates.concat(tag_results(intent_matches, :intent_match))
-         @feedback.debug_info('intent_results', "found #{intent_matches.length} intent matches")
-
-         # Remove duplicates
-         unique_candidates = deduplicate_candidates(candidates)
-         @feedback.debug_info('deduplication', "#{candidates.length} → #{unique_candidates.length} after deduplication")
-
-         unique_candidates
-       end
-
-       # Search documents with feedback
-       def search_documents_by_content(query, similarity_threshold: 0.7, limit: 10)
-         @feedback.debug_info('db_query', "searching documents (threshold: #{similarity_threshold})")
-
-         # Check if Document model is available
-         unless defined?(Document)
-           @feedback.operation_warning('db_search', 'Document model not available')
-           return []
-         end
-
-         begin
-           # Search documents with content matching
-           results = Document.where("content ILIKE ?", "%#{sanitize_query(query)}%")
-                             .limit(limit * 2) # Get extra for filtering
-                             .includes(:keywords)
-
-           @feedback.debug_info('db_results', "found #{results.count} database matches")
-
-           # Calculate similarity scores
-           scored_results = results.map do |doc|
-             similarity_score = calculate_content_similarity(query, doc.content)
-
-             {
-               document: doc,
-               content: doc.content,
-               similarity: similarity_score,
-               metadata: doc.metadata || {},
-               keywords: extract_keywords(doc),
-               relevance: assess_content_relevance(doc, query)
-             }
-           end
-
-           # Filter by threshold and sort
-           filtered_results = scored_results.select { |r| r[:similarity] >= similarity_threshold }
-           final_results = filtered_results.sort_by { |r| -(r[:similarity] * 0.7 + r[:relevance] * 0.3) }
-                                           .first(limit)
-
-           @feedback.debug_info('filtering', "#{scored_results.length} → #{final_results.length} after filtering")
-
-           final_results
-
-         rescue => e
-           @feedback.operation_error('db_search', e.message)
-           []
-         end
-       end
-
-       # Intent-based search with feedback
-       def search_by_intent(query, intent)
-         @feedback.debug_info('intent_search', "searching for #{intent} intent")
-
-         # Map intent to search strategies
-         search_strategy = get_search_strategy_for_intent(intent)
-         @feedback.debug_info('search_strategy', "using #{search_strategy[:fact_types].join(', ')} fact types")
-
-         return [] unless defined?(Document)
-
-         begin
-           intent_docs = Document.where("metadata->>'fact_type' IN (?)", search_strategy[:fact_types])
-                                 .where("content ILIKE ?", "%#{sanitize_query(query)}%")
-                                 .limit(10)
-
-           @feedback.debug_info('intent_results', "found #{intent_docs.count} intent-specific documents")
-
-           intent_docs.map do |doc|
-             base_relevance = calculate_content_relevance(doc, query)
-             boosted_relevance = base_relevance * search_strategy[:boost]
-
-             {
-               document: doc,
-               content: doc.content,
-               similarity: boosted_relevance,
-               metadata: doc.metadata || {},
-               keywords: extract_keywords(doc),
-               intent: intent,
-               fact_type: doc.metadata&.dig('fact_type')
-             }
-           end
-
-         rescue => e
-           @feedback.operation_error('intent_search', e.message)
-           []
-         end
-       end
-
-       # Build final context with token tracking
-       def build_contextual_response(candidates, query, token_budget)
-         return "" if candidates.empty?
-
-         @feedback.debug_info('context_building', "processing #{candidates.length} candidates")
-
-         # Sort by overall quality score
-         sorted_candidates = candidates.sort_by do |candidate|
-           -(candidate[:similarity] * 0.6 + candidate[:relevance] * 0.4)
-         end
-
-         # Build context within token budget
-         context_parts = []
-         used_tokens = 0
-
-         sorted_candidates.each_with_index do |candidate, index|
-           content = candidate[:content]
-           content_tokens = estimate_tokens(content)
-
-           if used_tokens + content_tokens <= token_budget
-             context_parts << content
-             used_tokens += content_tokens
-             @feedback.debug_info('context_add', "added part #{index + 1} (#{content_tokens} tokens)")
-           else
-             @feedback.debug_info('context_skip', "skipped part #{index + 1} (would exceed budget)")
-             break
-           end
-         end
-
-         if context_parts.any?
-           final_context = context_parts.join(" | ")
-           @feedback.vector_context_built(context_parts.length, used_tokens)
-           final_context
-         else
-           @feedback.operation_warning('context_building', 'no content fit within token budget')
-           ""
-         end
-       end
-
-       private
-
-       def analyze_query_intent(query)
-         return :implementation if query.match?(/how to|implement|create|build|fix|debug|error/i)
-         return :definition if query.match?(/what is|define|explain|meaning|concept/i)
-         return :frontend if query.match?(/css|style|html|frontend|design/i)
-         return :commands if query.match?(/command|run|execute|bash|terminal/i)
-         return :reference if query.match?(/example|show me|sample|demo/i)
-         :general
-       end
-
-       def get_search_strategy_for_intent(intent)
-         strategies = {
-           implementation: { fact_types: ['code_example', 'tutorial', 'implementation'], boost: 1.2 },
-           debugging: { fact_types: ['error_solution', 'debugging', 'troubleshooting'], boost: 1.1 },
-           definition: { fact_types: ['explanation', 'tutorial', 'definition'], boost: 1.0 },
-           reference: { fact_types: ['api_reference', 'documentation', 'specification'], boost: 1.3 },
-           frontend: { fact_types: ['css_example', 'html_example', 'frontend'], boost: 1.1 },
-           commands: { fact_types: ['command', 'terminal', 'script'], boost: 1.2 }
-         }
-
-         strategies[intent] || { fact_types: ['general'], boost: 1.0 }
-       end
-
-       def rank_by_relevance(candidates, query, intent)
-         candidates.map do |candidate|
-           # Calculate combined relevance score
-           similarity_score = candidate[:similarity] || 0.5
-           relevance_score = candidate[:relevance] || 0.5
-           intent_bonus = candidate[:intent] == intent ? 0.1 : 0.0
-
-           candidate[:combined_score] = similarity_score * 0.5 + relevance_score * 0.4 + intent_bonus
-           candidate
-         end.sort_by { |c| -c[:combined_score] }
-       end
-
-       def filter_by_quality(candidates)
-         quality_threshold = 0.3
-         candidates.select { |candidate| (candidate[:combined_score] || 0) >= quality_threshold }
-       end
-
-       def tag_results(results, source_type)
-         results.map do |result|
-           result[:source_type] = source_type
-           result
-         end
-       end
-
-       def deduplicate_candidates(candidates)
-         seen_docs = Set.new
-         candidates.reject do |candidate|
-           doc_id = candidate.dig(:document, :id) || candidate[:content]&.hash
-           if seen_docs.include?(doc_id)
-             true
-           else
-             seen_docs.add(doc_id)
-             false
-           end
-         end
-       end
-
-       def calculate_content_similarity(query, content)
-         # Simple word overlap similarity
-         query_words = query.downcase.split(/\W+/).reject(&:blank?)
-         content_words = content.downcase.split(/\W+/).reject(&:blank?)
-
-         return 0.0 if query_words.empty? || content_words.empty?
-
-         intersection = (query_words & content_words).size
-         union = (query_words | content_words).size
-
-         intersection.to_f / union
-       end
-
-       def calculate_content_relevance(document, query)
-         # Enhanced relevance calculation
-         query_terms = query.downcase.split(/\W+/).reject(&:blank?)
-         content_lower = document.content.downcase
-
-         # Count matches and assess density
-         matches = query_terms.count { |term| content_lower.include?(term) }
-         match_density = matches.to_f / [query_terms.length, 1].max
-
-         # Boost for document metadata
-         metadata_boost = document.metadata.present? ? 0.1 : 0.0
-
-         [match_density + metadata_boost, 1.0].min
-       end
-
-       def assess_content_relevance(document, query)
-         calculate_content_relevance(document, query)
-       end
-
-       def extract_keywords(document)
-         if document.respond_to?(:keywords) && document.keywords.respond_to?(:pluck)
-           document.keywords.pluck(:name)
-         else
-           []
-         end
-       end
-
-       def sanitize_query(query)
-         query.to_s.gsub(/['";\\]/, '').strip
-       end
-
-       def estimate_tokens(text)
-         (text.to_s.length * 0.25).ceil
-       end
-
-       def estimate_total_tokens(facts)
-         case facts
-         when Array
-           facts.sum { |fact| estimate_tokens(fact) }
-         when String
-           estimate_tokens(facts)
-         else
-           estimate_tokens(facts.to_s)
-         end
-       end
-
-       def calculate_tokens_saved(context_tokens, query)
-         # Estimate tokens saved by providing specific context vs generic response
-         baseline_tokens = query.length * 0.8 # Rough estimate of generic response
-         savings = [baseline_tokens - context_tokens, 0].max.round
-         savings
-       end
-
-       def get_session_total_saved(current_saved)
-         # Track session total (would be persistent in real implementation)
-         @session_tokens_saved ||= 0
-         @session_tokens_saved += current_saved
-         @session_tokens_saved
-       end
-
-       def get_pattern_fallback(query, intent)
-         # Basic pattern-based fallback
-         case intent
-         when :implementation
-           "Context: Implementation guidance for #{query} | Use best practices and error handling"
-         when :debugging
-           "Context: Debug #{query} | Check logs, verify configuration, test incrementally"
-         when :definition
-           "Context: #{query} definition | Core concepts and usage patterns"
-         when :reference
-           "Context: #{query} reference | API documentation and examples"
-         else
-           "Context: General guidance for #{query}"
-         end
-       end
-     end
-   end
- end
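
For orientation only: the removed class is a Singleton whose public entry point is retrieve_targeted_facts(query, token_budget:, intent:). The sketch below is a hypothetical caller, not code shipped in either version; it assumes a Rails application in which the 0.1.0 constants (Hokipoki::Intelligence::SmartRetrievalEngine, Feedback::DisplayManager, and a Document model) are loaded.

  # Hypothetical usage sketch of the SmartRetrievalEngine removed in 0.1.2.
  engine = Hokipoki::Intelligence::SmartRetrievalEngine.instance

  # Intent is auto-detected; the assembled context is capped at roughly 1500 estimated tokens.
  context = engine.retrieve_targeted_facts(
    "how to implement token budgeting",
    token_budget: 1500,
    intent: 'auto'
  )

  # Returns either matched document content joined with " | " or a pattern-based fallback string.
  puts context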