hokipoki 0.3.4 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,525 @@
1
+ # frozen_string_literal: true
2
+
3
require 'digest'
require 'fileutils'
require 'json'
require 'singleton'
require 'time'

require 'sqlite3'
6
+
7
+ module Hokipoki
8
+ # Core Vector Engine - The brain behind parasitic intelligence
9
+ # Uses template-based compression for 75% storage reduction
10
+ class VectorEngine
11
+ include Singleton
12
+
13
# Sets up in-memory state, then prepares on-disk storage.
#
# Counters start at zero and caches start empty. After the state is in
# place the database directory is ensured, the database is opened (and its
# tables created), and the startup banner is printed -- in that order.
def initialize
  @stats = {
    total_vectors: 0,
    successful_retrievals: 0,
    failed_retrievals: 0,
    compression_ratio: 0.0
  }
  @learning_patterns = {}
  @vector_cache = {}
  @keyword_index = nil
  @template_store = nil
  @db_path = File.expand_path('~/.hokipoki/vectors.db')

  # Order matters: the directory must exist before the database is opened.
  ensure_db_directory
  initialize_database
  display_startup_message
end
30
+
31
+ # Main API - Retrieve intelligent facts with template generation
32
# Produces context for +query+ and reports progress on $stdout.
#
# Pipeline: detect intent -> extract keywords -> look up stored vectors ->
# generate content -> trim to the token budget. When no vectors match, a
# canned fallback is returned and the attempt is counted as a failed
# retrieval. Any StandardError raised along the way is reported and
# answered with the emergency fallback.
#
# @param query [String] the user's question
# @param token_budget [Integer] upper bound handed to apply_token_budget
# @return [Array] a single-element array holding the produced content
def retrieve_facts(query, token_budget: 1500)
  $stdout.puts "🧠 VECTOR ENGINE: Analyzing query intent..."

  begin
    intent = analyze_query_intent(query)
    $stdout.puts " šŸŽÆ Intent detected: #{intent}"

    keywords = extract_technical_keywords(query)
    $stdout.puts " šŸ“ Keywords: #{keywords.join(', ')}"

    hits = find_matching_vectors(keywords, intent)
    $stdout.puts " šŸ” Found #{hits.length} matching vectors"

    # Nothing stored for these keywords: answer from the static templates.
    unless hits.any?
      $stdout.puts " āš ļø No vectors found, using template fallback"
      fallback = generate_fallback_content(query, intent, keywords)
      @stats[:failed_retrievals] += 1
      return [fallback]
    end

    draft = generate_content_from_vectors(hits, intent, keywords)
    trimmed = apply_token_budget(draft, token_budget)

    @stats[:successful_retrievals] += 1
    $stdout.puts " āœ… Content generated (#{estimate_tokens(trimmed)} tokens)"

    # Record the successful pattern for later use.
    learn_from_success(query, keywords, intent, trimmed)

    [trimmed]
  rescue => e
    $stdout.puts " āŒ Vector engine error: #{e.message}"
    @stats[:failed_retrievals] += 1
    # intent is nil here if the failure happened before it was assigned.
    generate_emergency_fallback(query, intent)
  end
end
77
+
78
+ # Store content as template-based vector (75% compression)
79
# Stores +content+ as a compact record (template type + keywords +
# generation parameters) and updates the running compression statistics.
#
# @param content [String] raw text to vectorize
# @param source_file [String, nil] originating path; only used for logging
#   and stored alongside the record
# @param metadata [Hash] extra data passed through to the stored record
# @return [Object] whatever store_vector_record returns as the record id
def store_template_vector(content, source_file, metadata = {})
  $stdout.puts "🦠 STORING VECTOR: #{File.basename(source_file || 'unknown')}"

  # 1. Detect content template type
  template_type = detect_template_type(content)
  $stdout.puts " šŸ“‹ Template type: #{template_type}"

  # 2. Extract atomic keywords
  keywords = extract_atomic_keywords(content)
  $stdout.puts " šŸ”‘ Extracted #{keywords.length} keywords"

  # 3. Calculate keyword weights (TF-IDF style)
  keyword_weights = calculate_keyword_weights(keywords, content)

  # 4. Determine generation parameters
  generation_params = {
    complexity: assess_content_complexity(content),
    style: detect_content_style(content),
    domain: detect_content_domain(content),
    original_length: content.length
  }

  # 5. Store compressed representation
  vector_id = store_vector_record(
    template_type: template_type,
    keywords: keywords,
    keyword_weights: keyword_weights,
    generation_params: generation_params,
    source_file: source_file,
    metadata: metadata
  )

  # 6. Update statistics
  original_size = content.bytesize
  compressed_size = calculate_compressed_size(keywords, template_type, generation_params)
  # Zero-byte content would divide by zero and yield -Infinity, which would
  # then stick in the running average forever; report 0% instead.
  compression_ratio =
    if original_size.zero?
      0.0
    else
      ((original_size - compressed_size).to_f / original_size * 100).round(2)
    end

  @stats[:total_vectors] += 1
  stored = @stats[:total_vectors]
  # Incremental mean over all vectors stored by this process.
  @stats[:compression_ratio] = ((@stats[:compression_ratio] * (stored - 1)) + compression_ratio) / stored

  $stdout.puts " āœ… Stored with #{compression_ratio}% compression"

  vector_id
end
123
+
124
+ # Get engine statistics
125
# Snapshot of the engine's counters.
#
# success_rate is the percentage of successful retrievals over all
# recorded retrievals (one decimal), or 0 when none were recorded.
# database_size is the byte size of the database file, or 0 if it does not
# exist.
#
# @return [Hash] with keys :total_vectors, :success_rate,
#   :average_compression, :cache_size, :learning_patterns, :database_size
def statistics
  hits = @stats[:successful_retrievals]
  attempts = hits + @stats[:failed_retrievals]

  success_rate =
    if attempts > 0
      (hits.to_f / attempts * 100).round(1)
    else
      0
    end

  db_bytes = File.exist?(@db_path) ? File.size(@db_path) : 0

  {
    total_vectors: @stats[:total_vectors],
    success_rate: success_rate,
    average_compression: @stats[:compression_ratio].round(2),
    cache_size: @vector_cache.size,
    learning_patterns: @learning_patterns.size,
    database_size: db_bytes
  }
end
138
+
139
+ # Display status for Claude visibility
140
# Prints a human-readable status report built from #statistics and returns
# the same text.
#
# The report is written to $stdout exactly once; previously it was emitted
# by both `$stdout.puts` and `puts`, so every report appeared twice.
#
# @return [String] the report, ending in a newline
def display_status
  stats = statistics
  rule = '=' * 40

  lines = [
    "\n🧠 VECTOR ENGINE STATUS",
    rule,
    "šŸ“Š Total Vectors: #{stats[:total_vectors]}",
    "āœ… Success Rate: #{stats[:success_rate]}%",
    "šŸ—œļø Avg Compression: #{stats[:average_compression]}%",
    "šŸ’¾ Cache Size: #{stats[:cache_size]} entries",
    "🧠 Learning Patterns: #{stats[:learning_patterns]}",
    # database_size is in bytes; shown in KB with two decimals.
    "šŸ’æ Database Size: #{(stats[:database_size] / 1024.0).round(2)} KB",
    rule
  ]
  message = "#{lines.join("\n")}\n"

  $stdout.puts message

  message
end
158
+
159
+ # Scan and vectorize project files
160
# Walks +project_path+ and stores a vector for every non-empty source file.
#
# Files are matched by extension (.rb, .js, .erb, .yml, .md). A file is
# skipped when one of its parent directories *inside the project* is named
# node_modules, vendor, log, tmp or .git. Matching whole directory names
# (rather than substrings of the absolute path) keeps files such as
# blog.rb or catalog.rb, and keeps projects that merely live under a
# directory like /tmp.
#
# Errors for individual files are reported and do not stop the scan.
#
# @param project_path [String] root directory to scan
# @return [Integer] number of files successfully stored
def scan_project(project_path = Dir.pwd)
  $stdout.puts "šŸ” SCANNING PROJECT: #{File.basename(project_path)}"

  file_patterns = %w[**/*.rb **/*.js **/*.erb **/*.yml **/*.md]
  excluded_dirs = %w[node_modules vendor log tmp .git]

  # Glob relative to the project root so only in-project directory names
  # are considered for exclusion.
  relative_paths = file_patterns.flat_map { |pattern| Dir.glob(pattern, base: project_path) }
  files = relative_paths
          .reject { |relative| (relative.split('/')[0...-1] & excluded_dirs).any? }
          .map { |relative| File.join(project_path, relative) }

  $stdout.puts "šŸ“ Found #{files.length} files to process"

  processed = 0
  files.each_with_index do |file, index|
    begin
      $stdout.puts " [#{index + 1}/#{files.length}] Processing: #{File.basename(file)}"

      content = File.read(file)
      next if content.strip.empty?

      # Time.current only exists when ActiveSupport is loaded; fall back to
      # the core clock so the scan also works in plain Ruby.
      now = Time.respond_to?(:current) ? Time.current : Time.now

      store_template_vector(content, file, {
        file_type: File.extname(file),
        processed_at: now.iso8601
      })

      processed += 1
    rescue => e
      $stdout.puts " āŒ Error processing #{file}: #{e.message}"
    end
  end

  $stdout.puts "āœ… PROJECT SCAN COMPLETE: #{processed}/#{files.length} files processed"
  processed
end
196
+
197
+ private
198
+
199
# Creates the directory that will hold the database file, including any
# missing parents.
#
# FileUtils.mkdir_p is a no-op for an existing directory, so no separate
# existence check is needed (and none can race with another process).
# Relies on `require 'fileutils'` at the top of the file.
def ensure_db_directory
  FileUtils.mkdir_p(File.dirname(@db_path))
end
203
+
204
# Opens the SQLite database at @db_path, configures it to return rows as
# hashes, and makes sure the schema exists.
def initialize_database
  connection = SQLite3::Database.new(@db_path)
  # Rows are read elsewhere by column name (e.g. vector['template_type']).
  connection.results_as_hash = true
  @db = connection

  create_tables_if_needed
end
210
+
211
# Creates the vectors table and its two indexes if they are missing.
#
# All statements use IF NOT EXISTS, so running this against an existing
# database changes nothing.
def create_tables_if_needed
  schema_statements = [
    <<~SQL,
      CREATE TABLE IF NOT EXISTS vectors (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      template_type TEXT NOT NULL,
      keywords TEXT NOT NULL,
      keyword_weights TEXT NOT NULL,
      generation_params TEXT NOT NULL,
      source_file TEXT,
      metadata TEXT,
      created_at TEXT DEFAULT CURRENT_TIMESTAMP,
      updated_at TEXT DEFAULT CURRENT_TIMESTAMP
      )
    SQL
    <<~SQL,
      CREATE INDEX IF NOT EXISTS idx_template_type ON vectors(template_type)
    SQL
    <<~SQL
      CREATE INDEX IF NOT EXISTS idx_keywords ON vectors(keywords)
    SQL
  ]

  # The table must exist before the indexes that reference it.
  schema_statements.each { |statement| @db.execute(statement) }
end
234
+
235
# Classifies +query+ into a single intent symbol.
#
# Primary (task) intents are checked first, in the order listed; the first
# match wins. Technology intents are consulted only when no primary intent
# matched, again in order. :general is returned when nothing matches.
#
# @param query [String]
# @return [Symbol]
def analyze_query_intent(query)
  primary_rules = [
    [/\b(implement|create|build|make|add|generate|write)\b/, :implementation],
    [/\b(error|bug|fix|debug|troubleshoot|issue|problem)\b/, :debugging],
    [/\b(how|what|why|when|where|explain|understand|learn)\b/, :learning],
    [/\b(optimize|improve|enhance|better|faster|performance)\b/, :optimization],
    [/\b(example|show|demo|sample|tutorial|guide)\b/, :reference],
    [/\b(test|testing|spec|rspec|jest|unit|integration)\b/, :testing]
  ]
  secondary_rules = [
    [/\b(css|style|stylesheet|tailwind|bootstrap)\b/i, :css],
    [/\b(js|javascript|node|npm|react|vue)\b/i, :javascript],
    [/\b(ruby|rails|gem|bundler|rake)\b/i, :ruby],
    [/\b(database|sql|migration|model|table)\b/i, :database]
  ]

  # Primary patterns are lower-case only, so match against a lowered copy.
  lowered = query.downcase
  primary = primary_rules.find { |pattern, _intent| pattern.match?(lowered) }
  return primary.last if primary

  secondary = secondary_rules.find { |pattern, _intent| pattern.match?(query) }
  secondary ? secondary.last : :general
end
266
+
267
# Reduces +query+ to at most ten distinct, lower-cased keywords.
#
# Punctuation is replaced by spaces, stop words are dropped, and words of
# two characters are kept only when technical_term? recognises them
# (shorter words are always dropped).
#
# @param query [String]
# @return [Array<String>] keywords in order of first appearance
def extract_technical_keywords(query)
  stop_words = %w[
    the a an and or but in on at to for of with by from
    how do i can you please help me show get make create
    what is are was were will would should could might
    this that these those here there where when why
  ]

  tokens = query.downcase.gsub(/[^\w\s]/, ' ').split(/\s+/)

  meaningful = tokens.select do |token|
    next false if stop_words.include?(token) || token.length < 2

    token.length > 2 || technical_term?(token)
  end

  meaningful.uniq.first(10)
end
286
+
287
# Whether +word+ is one of the short technical abbreviations that keyword
# extraction keeps despite their length. The comparison ignores case.
#
# @param word [String]
# @return [Boolean]
def technical_term?(word)
  candidate = word.downcase
  %w[css js sql api url ui ux db id].any? { |term| term == candidate }
end
292
+
293
# Looks up stored vectors whose keyword column contains any of +keywords+.
#
# Keywords are passed to SQLite as bound parameters rather than being
# interpolated into the SQL text, and LIKE wildcards inside a keyword
# (%, _ and the escape character itself) are escaped so they match
# literally. With no keywords, the five most recent vectors are returned.
#
# @param keywords [Array<String>] search terms
# @param intent [Symbol] accepted for interface compatibility; not used by
#   the query
# @return [Array] rows as returned by @db.execute, newest first
#   (at most 10 with keywords, at most 5 without)
def find_matching_vectors(keywords, intent)
  terms = Array(keywords).map(&:to_s).reject(&:empty?)

  if terms.empty?
    return @db.execute('SELECT * FROM vectors ORDER BY created_at DESC LIMIT 5')
  end

  conditions = Array.new(terms.length, "keywords LIKE ? ESCAPE '\\'").join(' OR ')
  patterns = terms.map do |term|
    escaped = term.gsub(/[\\%_]/) { |wildcard| "\\#{wildcard}" }
    "%#{escaped}%"
  end

  @db.execute(
    "SELECT * FROM vectors WHERE (#{conditions}) ORDER BY created_at DESC LIMIT 10",
    patterns
  )
end
305
+
306
# Renders each stored vector through the template store and joins the
# results with ' | '.
#
# nil results are dropped and identical results are emitted once. If
# nothing (or only whitespace) is produced, the static fallback text is
# returned instead. The blank check uses plain Ruby rather than
# ActiveSupport's String#present?, so it works without Rails loaded.
#
# @param vectors [Array<Hash>] rows with 'template_type', 'keywords' (JSON
#   array) and 'generation_params' (JSON object)
# @param intent [Symbol]
# @param keywords [Array<String>] query keywords; prepended to each
#   vector's stored keywords
# @return [String]
def generate_content_from_vectors(vectors, intent, keywords)
  template_store = get_template_store

  generated_parts = vectors.map do |vector|
    template_store.generate_content(
      vector['template_type'],
      keywords: keywords + JSON.parse(vector['keywords']),
      intent: intent,
      params: JSON.parse(vector['generation_params'])
    )
  end

  combined_content = generated_parts.compact.uniq.join(' | ')
  return combined_content unless combined_content.match?(/\A[[:space:]]*\z/)

  generate_fallback_content(keywords.join(' '), intent, keywords)
end
326
+
327
# Returns the template store, creating it on first use and reusing the
# same instance afterwards.
def get_template_store
  return @template_store if @template_store

  @template_store = TemplateStore.new
end
330
+
331
# Returns +content+ unchanged when its estimated token count fits
# +budget+; otherwise returns a truncated copy followed by "...".
#
# The cut keeps exactly 90% of the characters the budget allows (the
# remaining 10% is a safety margin). The slice is length-based, so it never
# takes one character too many, and a zero or negative budget yields just
# the ellipsis instead of an end-relative slice of the text.
#
# @param content [String, nil]
# @param budget [Integer] maximum number of estimated tokens
# @return [String, nil]
def apply_token_budget(content, budget)
  estimated_tokens = estimate_tokens(content)
  return content if estimated_tokens <= budget

  chars_per_token = content.length.to_f / estimated_tokens
  max_chars = [(budget * chars_per_token * 0.9).to_i, 0].max

  "#{content[0, max_chars].strip}..."
end
344
+
345
# Rough token count for +text+: one token per four characters, rounded up.
# nil and empty text count as zero tokens.
#
# @param text [String, nil]
# @return [Integer]
def estimate_tokens(text)
  if text.nil? || text.empty?
    0
  else
    whole, leftover = text.length.divmod(4)
    leftover.zero? ? whole : whole + 1
  end
end
350
+
351
# Canned guidance used when no stored vectors are available.
#
# The text is selected by +intent+ (:implementation, :debugging, :learning
# or :optimization; anything else gets the generic wording) and embeds the
# comma-separated +keywords+. +query+ is accepted but not used.
#
# @param query [String]
# @param intent [Symbol]
# @param keywords [Array<String>]
# @return [String]
def generate_fallback_content(query, intent, keywords)
  guidance = {
    implementation: [
      'Implementation context',
      'Consider Rails conventions, security best practices, and maintainable code patterns.'
    ],
    debugging: [
      'Debugging context',
      'Check logs, verify configurations, test incrementally. Common issues: environment variables, dependencies, permissions.'
    ],
    learning: [
      'Learning context',
      'Focus on fundamentals, official documentation, and proven patterns. Practice with simple examples first.'
    ],
    optimization: [
      'Optimization context',
      'Profile before optimizing, focus on bottlenecks, consider caching, database queries, and algorithmic improvements.'
    ]
  }
  generic = ['Context', 'Use best practices, follow conventions, prioritize readability and maintainability.']

  label, advice = guidance.fetch(intent, generic)
  "#{label}: #{keywords.join(', ')}. #{advice}"
end
365
+
366
# Last-resort answer used when retrieval raised an error.
#
# Both arguments are accepted but ignored; the reply is always the same
# single-element array.
#
# @return [Array<String>]
def generate_emergency_fallback(query, intent)
  notice = "Emergency context: Basic guidance available. Check documentation and verify system configuration."
  [notice]
end
369
+
370
# Guesses a template type for +content+ from textual markers.
#
# Rules are tried top to bottom and the first rule with any matching
# pattern wins, so order is significant. 'general_code' is returned when no
# rule matches.
#
# NOTE(review): because /def\s+\w+/ is listed under 'rails_controller',
# any text containing a method definition is classified there before the
# 'ruby_class', 'test_spec' or 'database_migration' rules are reached --
# confirm this precedence is intended.
#
# @param content [String]
# @return [String] template type name
def detect_template_type(content)
  rules = [
    ['active_record_model', [/class\s+\w+.*< ApplicationRecord/, /belongs_to/, /has_many/, /validates/]],
    ['rails_controller', [/class\s+\w+.*< ApplicationController/, /def\s+\w+/, /render/, /redirect_to/]],
    ['ruby_class', [/def\s+\w+/, /class\s+\w+/, /module\s+\w+/]],
    ['javascript_code', [/function\s+\w+/, /const\s+\w+/, /=>\s*/, /async\s+function/]],
    ['css_styles', [/\.css/, /background:/, /color:/, /font-size:/, /margin:/, /padding:/]],
    ['erb_template', [/<%=/, /<% /, /<%# /, /<html/, /<div/, /<span/]],
    ['test_spec', [/describe/, /it\s+["']/, /expect/, /test/, /spec/]],
    ['documentation', [/#\s+[A-Z]/, /##\s+/, /###\s+/, /\*\s+/, /-\s+/]],
    ['database_migration', [/migration/, /create_table/, /add_column/, /drop_table/]],
    ['configuration', [/config/, /settings/, /environment/, /secrets/]]
  ]

  rules.each do |type, patterns|
    # `===` mirrors the matching semantics of a case/when on the content.
    return type if patterns.any? { |pattern| pattern === content }
  end

  'general_code'
end
397
+
398
# Collects up to twenty distinct, category-prefixed keywords from
# +content+.
#
# Categories, in output order: method_ (names after `def`), class_ (names
# after `class`), constant_ (upper-case names followed by `=`), css_
# (individual names inside class="..." attributes), tech_ (known
# technology words, lower-cased) and filetype_ (a word after a dot at the
# end of a line).
#
# @param content [String]
# @return [Array<String>]
def extract_atomic_keywords(content)
  tech_pattern = /\b(rails|react|vue|angular|bootstrap|tailwind|postgres|mysql|redis|docker|kubernetes|aws|api|rest|graphql|jwt|oauth|json|xml|html|css|javascript|ruby|python|sql)\b/i

  css_names = content.scan(/class=["\']([^"\']+)["\']/).flat_map do |(attribute)|
    attribute.split(/\s+/)
  end

  categories = [
    ['method_', content.scan(/def\s+(\w+)/).flatten],
    ['class_', content.scan(/class\s+(\w+)/).flatten],
    ['constant_', content.scan(/([A-Z][A-Z_]+)\s*=/).flatten],
    ['css_', css_names],
    ['tech_', content.scan(tech_pattern).flatten.map(&:downcase)],
    ['filetype_', content.scan(/\.(\w+)$/).flatten]
  ]

  prefixed = categories.flat_map do |prefix, names|
    names.map { |name| "#{prefix}#{name}" }
  end

  # Earlier categories take priority when the twenty-keyword cap applies.
  prefixed.uniq.first(20)
end
424
+
425
# Assigns each keyword a TF-IDF-style weight relative to +content+.
#
# Keywords produced by extract_atomic_keywords carry a category prefix
# ("method_", "class_", "constant_", "css_", "tech_", "filetype_") that
# does not occur in the source text. The prefix is stripped before
# counting occurrences; counting the prefixed form made every weight
# (almost always) zero. Content without any words yields 0.0 weights
# rather than NaN, which JSON.generate would refuse to serialise.
#
# The IDF term is an approximation that assumes a corpus of 1000 documents
# and uses the number of vectors stored so far by this process.
#
# @param keywords [Array<String>]
# @param content [String]
# @return [Hash{String => Float}] weight per (still prefixed) keyword
def calculate_keyword_weights(keywords, content)
  total_words = content.split.length
  # Loop-invariant, so computed once.
  inverse_doc_freq = Math.log(1000.0 / [@stats[:total_vectors] + 1, 1].max)

  keywords.each_with_object({}) do |keyword, weights|
    term = keyword.sub(/\A(?:method|class|constant|css|tech|filetype)_/, '')
    term = keyword if term.empty?

    term_frequency = content.scan(/#{Regexp.escape(term)}/i).length
    weights[keyword] =
      if total_words.zero?
        0.0
      else
        (term_frequency.to_f / total_words) * inverse_doc_freq
      end
  end
end
440
+
441
# Rates +content+ as 'simple', 'moderate' or 'complex'.
#
# One point each for: more than 1000 characters, more than five method
# definitions, more than one class definition, any occurrence of the text
# "module", and any async-related word (async/await/promise/callback).
# 0-1 points is simple, 2-3 moderate, 4-5 complex.
#
# @param content [String]
# @return [String]
def assess_content_complexity(content)
  signals = [
    content.length > 1000,
    content.scan(/def\s+\w+/).length > 5,
    content.scan(/class\s+\w+/).length > 1,
    content.include?('module'),
    content.match?(/\b(async|await|promise|callback)\b/i)
  ]
  score = signals.count(true)

  if score <= 1
    'simple'
  elsif score <= 3
    'moderate'
  else
    'complex'
  end
end
456
+
457
# Labels the writing style of +content+.
#
# Styles are checked in the order tutorial, reference, example, guide; the
# first whose pattern occurs anywhere in the text (case-insensitively)
# wins. 'standard' is returned when none match.
#
# @param content [String]
# @return [String]
def detect_content_style(content)
  style_patterns = [
    ['tutorial', /step\s+\d+|first|then|next|finally/i],
    ['reference', /api|documentation|docs|spec/i],
    ['example', /example|demo|sample|usage/i],
    ['guide', /guide|how\s+to|walkthrough/i]
  ]

  matched = style_patterns.find { |_style, pattern| content.match?(pattern) }
  matched ? matched.first : 'standard'
end
464
+
465
# Labels the technical domain of +content+.
#
# Domains are checked in the order frontend, backend, devops, testing; the
# first whose pattern occurs anywhere in the text (case-insensitively)
# wins. 'general' is returned when none match.
#
# The regexps are passed to match? in parentheses on purpose: written as
# `content.match?/(...)/i`, Ruby parses the slashes as division and calls
# match? with no arguments, which raises ArgumentError on every call.
#
# @param content [String]
# @return [String]
def detect_content_domain(content)
  return 'frontend' if content.match?(/(css|html|javascript|react|vue|angular)/i)
  return 'backend' if content.match?(/(rails|ruby|controller|model|database)/i)
  return 'devops' if content.match?(/(docker|kubernetes|aws|deployment|server)/i)
  return 'testing' if content.match?(/(test|spec|rspec|jest|unit|integration)/i)

  'general'
end
472
+
473
# Inserts one row into the vectors table and returns its id.
#
# The id column is declared INTEGER PRIMARY KEY AUTOINCREMENT, so SQLite
# assigns it. Supplying a SHA256 hex string for that column is rejected by
# SQLite as a datatype mismatch, and building that digest also depended on
# ActiveSupport's Time.current. The generated row id is returned instead.
#
# keywords, keyword_weights, generation_params and metadata are stored as
# JSON text.
#
# @return [Integer] row id reported by the database connection
def store_vector_record(template_type:, keywords:, keyword_weights:, generation_params:, source_file:, metadata:)
  @db.execute(
    'INSERT INTO vectors (template_type, keywords, keyword_weights, generation_params, source_file, metadata) ' \
    'VALUES (?, ?, ?, ?, ?, ?)',
    [
      template_type,
      JSON.generate(keywords),
      JSON.generate(keyword_weights),
      JSON.generate(generation_params),
      source_file,
      JSON.generate(metadata)
    ]
  )

  @db.last_insert_row_id
end
491
+
492
# Estimates the stored size, in bytes, of a vector record: the bytes of
# all keywords, the template type name and the JSON-encoded generation
# parameters, plus a fixed 100-byte overhead.
#
# @param keywords [Array<String>]
# @param template_type [String]
# @param generation_params [Hash]
# @return [Integer]
def calculate_compressed_size(keywords, template_type, generation_params)
  base_overhead = 100
  payloads = [keywords.join(''), template_type, JSON.generate(generation_params)]

  payloads.map(&:bytesize).sum + base_overhead
end
500
+
501
# Records a successful retrieval under a key built from the intent and the
# first three keywords.
#
# When more than 1000 patterns are held, the 200 with the oldest (or
# missing) last_success are discarded. +query+ and +content+ are accepted
# but not used.
#
# Time.current only exists when ActiveSupport is loaded, so the core clock
# is used as a fallback; without it this method raised NoMethodError in
# plain Ruby.
#
# @param query [String]
# @param keywords [Array<String>]
# @param intent [Symbol]
# @param content [String]
def learn_from_success(query, keywords, intent, content)
  pattern_key = "#{intent}_#{keywords.first(3).join('_')}"

  entry = (@learning_patterns[pattern_key] ||= { successes: 0, failures: 0, last_success: nil })
  entry[:successes] += 1
  entry[:last_success] = Time.respond_to?(:current) ? Time.current : Time.now

  # Keep only recent patterns to avoid memory bloat.
  return unless @learning_patterns.size > 1000

  @learning_patterns
    .min_by(200) { |_key, stats| stats[:last_success] || Time.at(0) }
    .each { |key, _stats| @learning_patterns.delete(key) }
end
515
+
516
# Prints the three-line startup banner (preceded by a blank line) to
# $stdout, including the database path in use.
def display_startup_message
  banner = [
    "\n🧠 VECTOR ENGINE: Initialized successfully",
    "šŸ’¾ Database: #{@db_path}",
    "šŸ“Š Ready for parasitic intelligence operations"
  ]

  # puts writes each array element on its own line.
  $stdout.puts(banner)
end
521
+ end
522
+ end
523
+
524
+ # Load template store dependency
525
+ require_relative 'template_store'
@@ -1,3 +1,3 @@
1
1
  module Hokipoki
2
- VERSION = "0.3.4"
2
+ VERSION = "0.5.1"
3
3
  end