htm 0.0.10 → 0.0.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. checksums.yaml +4 -4
  2. data/.dictate.toml +46 -0
  3. data/.envrc +2 -0
  4. data/CHANGELOG.md +86 -3
  5. data/README.md +86 -7
  6. data/Rakefile +14 -2
  7. data/bin/htm_mcp.rb +621 -0
  8. data/config/database.yml +20 -13
  9. data/db/migrate/00010_add_soft_delete_to_associations.rb +29 -0
  10. data/db/migrate/00011_add_performance_indexes.rb +21 -0
  11. data/db/migrate/00012_add_tags_trigram_index.rb +18 -0
  12. data/db/migrate/00013_enable_lz4_compression.rb +43 -0
  13. data/db/schema.sql +49 -92
  14. data/docs/api/index.md +1 -1
  15. data/docs/api/yard/HTM.md +2 -4
  16. data/docs/architecture/index.md +1 -1
  17. data/docs/development/index.md +1 -1
  18. data/docs/getting-started/index.md +1 -1
  19. data/docs/guides/index.md +1 -1
  20. data/docs/images/telemetry-architecture.svg +153 -0
  21. data/docs/telemetry.md +391 -0
  22. data/examples/README.md +171 -1
  23. data/examples/cli_app/README.md +1 -1
  24. data/examples/cli_app/htm_cli.rb +1 -1
  25. data/examples/mcp_client.rb +529 -0
  26. data/examples/sinatra_app/app.rb +1 -1
  27. data/examples/telemetry/README.md +147 -0
  28. data/examples/telemetry/SETUP_README.md +169 -0
  29. data/examples/telemetry/demo.rb +498 -0
  30. data/examples/telemetry/grafana/dashboards/htm-metrics.json +457 -0
  31. data/lib/htm/configuration.rb +261 -70
  32. data/lib/htm/database.rb +46 -22
  33. data/lib/htm/embedding_service.rb +24 -14
  34. data/lib/htm/errors.rb +15 -1
  35. data/lib/htm/jobs/generate_embedding_job.rb +19 -0
  36. data/lib/htm/jobs/generate_propositions_job.rb +103 -0
  37. data/lib/htm/jobs/generate_tags_job.rb +24 -0
  38. data/lib/htm/loaders/markdown_chunker.rb +79 -0
  39. data/lib/htm/loaders/markdown_loader.rb +41 -15
  40. data/lib/htm/long_term_memory/fulltext_search.rb +138 -0
  41. data/lib/htm/long_term_memory/hybrid_search.rb +324 -0
  42. data/lib/htm/long_term_memory/node_operations.rb +209 -0
  43. data/lib/htm/long_term_memory/relevance_scorer.rb +355 -0
  44. data/lib/htm/long_term_memory/robot_operations.rb +34 -0
  45. data/lib/htm/long_term_memory/tag_operations.rb +428 -0
  46. data/lib/htm/long_term_memory/vector_search.rb +109 -0
  47. data/lib/htm/long_term_memory.rb +51 -1153
  48. data/lib/htm/models/node.rb +35 -2
  49. data/lib/htm/models/node_tag.rb +31 -0
  50. data/lib/htm/models/robot_node.rb +31 -0
  51. data/lib/htm/models/tag.rb +44 -0
  52. data/lib/htm/proposition_service.rb +169 -0
  53. data/lib/htm/query_cache.rb +214 -0
  54. data/lib/htm/sql_builder.rb +178 -0
  55. data/lib/htm/tag_service.rb +16 -6
  56. data/lib/htm/tasks.rb +8 -2
  57. data/lib/htm/telemetry.rb +224 -0
  58. data/lib/htm/version.rb +1 -1
  59. data/lib/htm.rb +64 -3
  60. data/lib/tasks/doc.rake +1 -1
  61. data/lib/tasks/htm.rake +259 -13
  62. data/mkdocs.yml +96 -96
  63. metadata +75 -18
  64. data/.aigcm_msg +0 -1
  65. data/.claude/settings.local.json +0 -92
  66. data/CLAUDE.md +0 -603
  67. data/examples/cli_app/temp.log +0 -93
  68. data/lib/htm/loaders/paragraph_chunker.rb +0 -112
  69. data/notes/ARCHITECTURE_REVIEW.md +0 -1167
  70. data/notes/IMPLEMENTATION_SUMMARY.md +0 -606
  71. data/notes/MULTI_FRAMEWORK_IMPLEMENTATION.md +0 -451
  72. data/notes/next_steps.md +0 -100
  73. data/notes/plan.md +0 -627
  74. data/notes/tag_ontology_enhancement_ideas.md +0 -222
  75. data/notes/timescaledb_removal_summary.md +0 -200
@@ -2,8 +2,19 @@
2
2
 
3
3
  require 'pg'
4
4
  require 'json'
5
- require 'lru_redux'
6
- require 'digest'
5
+
6
+ # Load standalone utility classes
7
+ require_relative 'sql_builder'
8
+ require_relative 'query_cache'
9
+
10
+ # Load modules
11
+ require_relative 'long_term_memory/relevance_scorer'
12
+ require_relative 'long_term_memory/node_operations'
13
+ require_relative 'long_term_memory/robot_operations'
14
+ require_relative 'long_term_memory/tag_operations'
15
+ require_relative 'long_term_memory/vector_search'
16
+ require_relative 'long_term_memory/fulltext_search'
17
+ require_relative 'long_term_memory/hybrid_search'
7
18
 
8
19
  class HTM
9
20
  # Long-term Memory - PostgreSQL/TimescaleDB-backed permanent storage
@@ -17,7 +28,40 @@ class HTM
17
28
  # - ActiveRecord ORM for data access
18
29
  # - Query result caching for efficiency
19
30
  #
31
+ # This class uses standalone utility classes and modules:
32
+ #
33
+ # Standalone classes (used via class methods or instances):
34
+ # - HTM::SqlBuilder: SQL condition building helpers (class methods)
35
+ # - HTM::QueryCache: Query result caching (instantiated as @cache)
36
+ #
37
+ # Included modules:
38
+ # - RelevanceScorer: Dynamic relevance scoring
39
+ # - NodeOperations: Node CRUD operations
40
+ # - RobotOperations: Robot registration and activity
41
+ # - TagOperations: Tag management
42
+ # - VectorSearch: Vector similarity search
43
+ # - FulltextSearch: Full-text search
44
+ # - HybridSearch: Combined search strategies
45
+ #
20
46
  class LongTermMemory
47
+ # Include modules (order matters - dependencies first)
48
+ #
49
+ # Dependency graph:
50
+ # TagOperations, RobotOperations (no deps)
51
+ # NodeOperations → @cache (QueryCache instance)
52
+ # VectorSearch → HTM::SqlBuilder, @cache, NodeOperations
53
+ # FulltextSearch → HTM::SqlBuilder, @cache, NodeOperations
54
+ # HybridSearch → HTM::SqlBuilder, @cache, TagOperations, NodeOperations
55
+ # RelevanceScorer → HTM::SqlBuilder, TagOperations, VectorSearch, FulltextSearch
56
+ #
57
+ include TagOperations
58
+ include RobotOperations
59
+ include NodeOperations
60
+ include VectorSearch
61
+ include FulltextSearch
62
+ include HybridSearch
63
+ include RelevanceScorer
64
+
21
65
  DEFAULT_QUERY_TIMEOUT = 30_000 # milliseconds (30 seconds)
22
66
  MAX_VECTOR_DIMENSION = 2000 # Maximum supported dimension with HNSW index (pgvector limitation)
23
67
  DEFAULT_CACHE_SIZE = 1000 # Number of queries to cache
@@ -50,283 +94,7 @@ class HTM
50
94
  ActiveRecord::Base.connection.execute("SET statement_timeout = #{@query_timeout}")
51
95
 
52
96
  # Initialize query result cache (disable with cache_size: 0)
53
- if cache_size > 0
54
- @query_cache = LruRedux::TTL::ThreadSafeCache.new(cache_size, cache_ttl)
55
- @cache_stats = { hits: 0, misses: 0 }
56
- @cache_stats_mutex = Mutex.new # Thread-safety for cache statistics
57
- end
58
- end
59
-
60
- # Add a node to long-term memory (with deduplication)
61
- #
62
- # If content already exists (by content_hash), links the robot to the existing
63
- # node and updates timestamps. Otherwise creates a new node.
64
- #
65
- # @param content [String] Conversation message/utterance
66
- # @param token_count [Integer] Token count
67
- # @param robot_id [Integer] Robot identifier
68
- # @param embedding [Array<Float>, nil] Pre-generated embedding vector
69
- # @param metadata [Hash] Flexible metadata for the node (default: {})
70
- # @return [Hash] { node_id:, is_new:, robot_node: }
71
- #
72
- def add(content:, token_count: 0, robot_id:, embedding: nil, metadata: {})
73
- content_hash = HTM::Models::Node.generate_content_hash(content)
74
-
75
- # Wrap in transaction to ensure data consistency
76
- ActiveRecord::Base.transaction do
77
- # Check for existing node with same content (including soft-deleted)
78
- # This avoids unique constraint violations on content_hash
79
- existing_node = HTM::Models::Node.with_deleted.find_by(content_hash: content_hash)
80
-
81
- # If found but soft-deleted, restore it
82
- if existing_node&.deleted?
83
- existing_node.restore!
84
- HTM.logger.info "Restored soft-deleted node #{existing_node.id} for content match"
85
- end
86
-
87
- if existing_node
88
- # Link robot to existing node (or update if already linked)
89
- robot_node = link_robot_to_node(robot_id: robot_id, node: existing_node)
90
-
91
- # Update the node's updated_at timestamp
92
- existing_node.touch
93
-
94
- {
95
- node_id: existing_node.id,
96
- is_new: false,
97
- robot_node: robot_node
98
- }
99
- else
100
- # Prepare embedding if provided
101
- embedding_str = nil
102
- if embedding
103
- # Pad embedding to 2000 dimensions if needed
104
- actual_dimension = embedding.length
105
- padded_embedding = if actual_dimension < 2000
106
- embedding + Array.new(2000 - actual_dimension, 0.0)
107
- else
108
- embedding
109
- end
110
- embedding_str = "[#{padded_embedding.join(',')}]"
111
- end
112
-
113
- # Create new node
114
- node = HTM::Models::Node.create!(
115
- content: content,
116
- content_hash: content_hash,
117
- token_count: token_count,
118
- embedding: embedding_str,
119
- metadata: metadata
120
- )
121
-
122
- # Link robot to new node
123
- robot_node = link_robot_to_node(robot_id: robot_id, node: node)
124
-
125
- # Invalidate cache since database content changed
126
- invalidate_cache!
127
-
128
- {
129
- node_id: node.id,
130
- is_new: true,
131
- robot_node: robot_node
132
- }
133
- end
134
- end
135
- end
136
-
137
- # Link a robot to a node (create or update robot_node record)
138
- #
139
- # @param robot_id [Integer] Robot ID
140
- # @param node [HTM::Models::Node] Node to link
141
- # @param working_memory [Boolean] Whether node is in working memory (default: false)
142
- # @return [HTM::Models::RobotNode] The robot_node link record
143
- #
144
- def link_robot_to_node(robot_id:, node:, working_memory: false)
145
- robot_node = HTM::Models::RobotNode.find_by(robot_id: robot_id, node_id: node.id)
146
-
147
- if robot_node
148
- # Existing link - record that robot remembered this again
149
- robot_node.record_remember!
150
- robot_node.update!(working_memory: working_memory) if working_memory
151
- else
152
- # New link
153
- robot_node = HTM::Models::RobotNode.create!(
154
- robot_id: robot_id,
155
- node_id: node.id,
156
- first_remembered_at: Time.current,
157
- last_remembered_at: Time.current,
158
- remember_count: 1,
159
- working_memory: working_memory
160
- )
161
- end
162
-
163
- robot_node
164
- end
165
-
166
- # Retrieve a node by ID
167
- #
168
- # Automatically tracks access by incrementing access_count and updating last_accessed
169
- #
170
- # @param node_id [Integer] Node database ID
171
- # @return [Hash, nil] Node data or nil
172
- #
173
- def retrieve(node_id)
174
- node = HTM::Models::Node.find_by(id: node_id)
175
- return nil unless node
176
-
177
- # Track access (atomic increment)
178
- node.increment!(:access_count)
179
- node.touch(:last_accessed)
180
-
181
- node.attributes
182
- end
183
-
184
- # Update last_accessed timestamp
185
- #
186
- # @param node_id [Integer] Node database ID
187
- # @return [void]
188
- #
189
- def update_last_accessed(node_id)
190
- node = HTM::Models::Node.find_by(id: node_id)
191
- node&.update(last_accessed: Time.current)
192
- end
193
-
194
- # Delete a node
195
- #
196
- # @param node_id [Integer] Node database ID
197
- # @return [void]
198
- #
199
- def delete(node_id)
200
- node = HTM::Models::Node.find_by(id: node_id)
201
- node&.destroy
202
-
203
- # Invalidate cache since database content changed
204
- invalidate_cache!
205
- end
206
-
207
- # Check if a node exists
208
- #
209
- # @param node_id [Integer] Node database ID
210
- # @return [Boolean] True if node exists
211
- #
212
- def exists?(node_id)
213
- HTM::Models::Node.exists?(node_id)
214
- end
215
-
216
- # Vector similarity search
217
- #
218
- # @param timeframe [nil, Range, Array<Range>] Time range(s) to search (nil = no filter)
219
- # @param query [String] Search query
220
- # @param limit [Integer] Maximum results
221
- # @param embedding_service [Object] Service to generate embeddings
222
- # @param metadata [Hash] Filter by metadata fields (default: {})
223
- # @return [Array<Hash>] Matching nodes
224
- #
225
- def search(timeframe:, query:, limit:, embedding_service:, metadata: {})
226
- cached_query(:search, timeframe, query, limit, metadata) do
227
- search_uncached(timeframe: timeframe, query: query, limit: limit, embedding_service: embedding_service, metadata: metadata)
228
- end
229
- end
230
-
231
- # Full-text search
232
- #
233
- # @param timeframe [Range] Time range to search
234
- # @param query [String] Search query
235
- # @param limit [Integer] Maximum results
236
- # @param metadata [Hash] Filter by metadata fields (default: {})
237
- # @return [Array<Hash>] Matching nodes
238
- #
239
- def search_fulltext(timeframe:, query:, limit:, metadata: {})
240
- cached_query(:fulltext, timeframe, query, limit, metadata) do
241
- search_fulltext_uncached(timeframe: timeframe, query: query, limit: limit, metadata: metadata)
242
- end
243
- end
244
-
245
- # Hybrid search (full-text + vector)
246
- #
247
- # @param timeframe [Range] Time range to search
248
- # @param query [String] Search query
249
- # @param limit [Integer] Maximum results
250
- # @param embedding_service [Object] Service to generate embeddings
251
- # @param prefilter_limit [Integer] Candidates to consider (default: 100)
252
- # @param metadata [Hash] Filter by metadata fields (default: {})
253
- # @return [Array<Hash>] Matching nodes
254
- #
255
- def search_hybrid(timeframe:, query:, limit:, embedding_service:, prefilter_limit: 100, metadata: {})
256
- cached_query(:hybrid, timeframe, query, limit, prefilter_limit, metadata) do
257
- search_hybrid_uncached(timeframe: timeframe, query: query, limit: limit, embedding_service: embedding_service, prefilter_limit: prefilter_limit, metadata: metadata)
258
- end
259
- end
260
-
261
- # Add a tag to a node
262
- #
263
- # @param node_id [Integer] Node database ID
264
- # @param tag [String] Tag name
265
- # @return [void]
266
- #
267
- def add_tag(node_id:, tag:)
268
- tag_record = HTM::Models::Tag.find_or_create_by(name: tag)
269
- HTM::Models::NodeTag.create(
270
- node_id: node_id,
271
- tag_id: tag_record.id
272
- )
273
- rescue ActiveRecord::RecordNotUnique
274
- # Tag association already exists, ignore
275
- end
276
-
277
- # Mark nodes as evicted from working memory
278
- #
279
- # Sets working_memory = false on the robot_nodes join table for the specified
280
- # robot and node IDs.
281
- #
282
- # @param robot_id [Integer] Robot ID whose working memory is being evicted
283
- # @param node_ids [Array<Integer>] Node IDs to mark as evicted
284
- # @return [void]
285
- #
286
- def mark_evicted(robot_id:, node_ids:)
287
- return if node_ids.empty?
288
-
289
- HTM::Models::RobotNode
290
- .where(robot_id: robot_id, node_id: node_ids)
291
- .update_all(working_memory: false)
292
- end
293
-
294
- # Track access for multiple nodes (bulk operation)
295
- #
296
- # Updates access_count and last_accessed for all nodes in the array
297
- #
298
- # @param node_ids [Array<Integer>] Node IDs that were accessed
299
- # @return [void]
300
- #
301
- def track_access(node_ids)
302
- return if node_ids.empty?
303
-
304
- # Atomic batch update
305
- HTM::Models::Node.where(id: node_ids).update_all(
306
- "access_count = access_count + 1, last_accessed = NOW()"
307
- )
308
- end
309
-
310
- # Register a robot
311
- #
312
- # @param robot_id [String] Robot identifier
313
- # @param robot_name [String] Robot name
314
- # @return [void]
315
- #
316
- def register_robot(robot_name)
317
- robot = HTM::Models::Robot.find_or_create_by(name: robot_name)
318
- robot.update(last_active: Time.current)
319
- robot.id
320
- end
321
-
322
- # Update robot activity timestamp
323
- #
324
- # @param robot_id [String] Robot identifier
325
- # @return [void]
326
- #
327
- def update_robot_activity(robot_id)
328
- robot = HTM::Models::Robot.find_by(id: robot_id)
329
- robot&.update(last_active: Time.current)
97
+ @cache = HTM::QueryCache.new(size: cache_size, ttl: cache_ttl)
330
98
  end
331
99
 
332
100
  # Get memory statistics
@@ -346,8 +114,8 @@ class HTM
346
114
  }
347
115
 
348
116
  # Include cache statistics if cache is enabled
349
- if @query_cache
350
- base_stats[:cache] = cache_stats
117
+ if @cache&.enabled?
118
+ base_stats[:cache] = @cache.stats
351
119
  end
352
120
 
353
121
  base_stats
@@ -359,887 +127,17 @@ class HTM
359
127
  # This method kept for API compatibility
360
128
  end
361
129
 
362
- # Clear the query cache
363
- #
364
- # Call this after any operation that modifies data (soft delete, restore, etc.)
365
- # to ensure subsequent queries see fresh results.
130
+ # Clear the query result cache
366
131
  #
367
132
  # @return [void]
368
133
  #
369
134
  def clear_cache!
370
- invalidate_cache!
135
+ @cache&.clear!
371
136
  end
372
137
 
373
138
  # For backwards compatibility with tests/code that expect pool_size
374
139
  def pool_size
375
140
  ActiveRecord::Base.connection_pool.size
376
141
  end
377
-
378
- # Retrieve nodes by ontological topic
379
- #
380
- # @param topic_path [String] Topic hierarchy path
381
- # @param exact [Boolean] Exact match or prefix match
382
- # @param limit [Integer] Maximum results
383
- # @return [Array<Hash>] Matching nodes
384
- #
385
- def nodes_by_topic(topic_path, exact: false, limit: 50)
386
- if exact
387
- nodes = HTM::Models::Node
388
- .joins(:tags)
389
- .where(tags: { name: topic_path })
390
- .distinct
391
- .order(created_at: :desc)
392
- .limit(limit)
393
- else
394
- nodes = HTM::Models::Node
395
- .joins(:tags)
396
- .where("tags.name LIKE ?", "#{topic_path}%")
397
- .distinct
398
- .order(created_at: :desc)
399
- .limit(limit)
400
- end
401
-
402
- nodes.map(&:attributes)
403
- end
404
-
405
- # Get ontology structure view
406
- #
407
- # @return [Array<Hash>] Ontology structure
408
- #
409
- def ontology_structure
410
- result = ActiveRecord::Base.connection.select_all(
411
- "SELECT * FROM ontology_structure WHERE root_topic IS NOT NULL ORDER BY root_topic, level1_topic, level2_topic"
412
- )
413
- result.to_a
414
- end
415
-
416
- # Get topic relationships (co-occurrence)
417
- #
418
- # @param min_shared_nodes [Integer] Minimum shared nodes
419
- # @param limit [Integer] Maximum relationships
420
- # @return [Array<Hash>] Topic relationships
421
- #
422
- def topic_relationships(min_shared_nodes: 2, limit: 50)
423
- # Use parameterized query to prevent SQL injection
424
- sql = <<~SQL
425
- SELECT t1.name AS topic1, t2.name AS topic2, COUNT(DISTINCT nt1.node_id) AS shared_nodes
426
- FROM tags t1
427
- JOIN node_tags nt1 ON t1.id = nt1.tag_id
428
- JOIN node_tags nt2 ON nt1.node_id = nt2.node_id
429
- JOIN tags t2 ON nt2.tag_id = t2.id
430
- WHERE t1.name < t2.name
431
- GROUP BY t1.name, t2.name
432
- HAVING COUNT(DISTINCT nt1.node_id) >= $1
433
- ORDER BY shared_nodes DESC
434
- LIMIT $2
435
- SQL
436
-
437
- result = ActiveRecord::Base.connection.exec_query(
438
- sql,
439
- 'topic_relationships',
440
- [[nil, min_shared_nodes.to_i], [nil, limit.to_i]]
441
- )
442
- result.to_a
443
- end
444
-
445
- # Get topics for a specific node
446
- #
447
- # @param node_id [Integer] Node database ID
448
- # @return [Array<String>] Topic paths
449
- #
450
- def node_topics(node_id)
451
- HTM::Models::Tag
452
- .joins(:node_tags)
453
- .where(node_tags: { node_id: node_id })
454
- .order(:name)
455
- .pluck(:name)
456
- end
457
-
458
- # Calculate dynamic relevance score for a node given query context
459
- #
460
- # Combines multiple signals:
461
- # - Vector similarity (semantic match)
462
- # - Tag overlap (categorical match)
463
- # - Recency (freshness)
464
- # - Access frequency (popularity/utility)
465
- #
466
- # @param node [Hash] Node data with similarity, tags, created_at, access_count
467
- # @param query_tags [Array<String>] Tags associated with the query
468
- # @param vector_similarity [Float, nil] Pre-computed vector similarity (0-1)
469
- # @param node_tags [Array<String>, nil] Pre-loaded tags for this node (avoids N+1 query)
470
- # @return [Float] Composite relevance score (0-10)
471
- #
472
- def calculate_relevance(node:, query_tags: [], vector_similarity: nil, node_tags: nil)
473
- # 1. Vector similarity (semantic match) - weight: 0.5
474
- semantic_score = if vector_similarity
475
- vector_similarity
476
- elsif node['similarity']
477
- node['similarity'].to_f
478
- else
479
- 0.5 # Neutral if no embedding
480
- end
481
-
482
- # 2. Tag overlap (categorical relevance) - weight: 0.3
483
- # Use pre-loaded tags if provided, otherwise fetch (for backward compatibility)
484
- node_tags ||= get_node_tags(node['id'])
485
- tag_score = if query_tags.any? && node_tags.any?
486
- weighted_hierarchical_jaccard(query_tags, node_tags)
487
- else
488
- 0.5 # Neutral if no tags
489
- end
490
-
491
- # 3. Recency (temporal relevance) - weight: 0.1
492
- age_hours = (Time.now - Time.parse(node['created_at'].to_s)) / 3600.0
493
- recency_score = Math.exp(-age_hours / 168.0) # 1-week half-life
494
-
495
- # 4. Access frequency (behavioral signal) - weight: 0.1
496
- access_count = node['access_count'] || 0
497
- access_score = Math.log(1 + access_count) / 10.0 # Normalize to 0-1
498
-
499
- # Weighted composite (scale to 0-10)
500
- relevance = (
501
- (semantic_score * 0.5) +
502
- (tag_score * 0.3) +
503
- (recency_score * 0.1) +
504
- (access_score * 0.1)
505
- ) * 10.0
506
-
507
- relevance.clamp(0.0, 10.0)
508
- end
509
-
510
- # Search with dynamic relevance scoring
511
- #
512
- # Returns nodes with calculated relevance scores based on query context
513
- #
514
- # @param timeframe [nil, Range, Array<Range>] Time range(s) to search (nil = no filter)
515
- # @param query [String, nil] Search query
516
- # @param query_tags [Array<String>] Tags to match
517
- # @param limit [Integer] Maximum results
518
- # @param embedding_service [Object, nil] Service to generate embeddings
519
- # @param metadata [Hash] Filter by metadata fields (default: {})
520
- # @return [Array<Hash>] Nodes with relevance scores
521
- #
522
- def search_with_relevance(timeframe:, query: nil, query_tags: [], limit: 20, embedding_service: nil, metadata: {})
523
- # Get candidates from appropriate search method
524
- candidates = if query && embedding_service
525
- # Vector search
526
- search_uncached(timeframe: timeframe, query: query, limit: limit * 2, embedding_service: embedding_service, metadata: metadata)
527
- elsif query
528
- # Full-text search
529
- search_fulltext_uncached(timeframe: timeframe, query: query, limit: limit * 2, metadata: metadata)
530
- else
531
- # Time-range only (or no filter if timeframe is nil)
532
- scope = HTM::Models::Node.where(deleted_at: nil)
533
- scope = apply_timeframe_scope(scope, timeframe)
534
- scope = apply_metadata_scope(scope, metadata)
535
- scope.order(created_at: :desc).limit(limit * 2).map(&:attributes)
536
- end
537
-
538
- # Batch load all tags for candidates (fixes N+1 query)
539
- node_ids = candidates.map { |n| n['id'] }
540
- tags_by_node = batch_load_node_tags(node_ids)
541
-
542
- # Calculate relevance for each candidate
543
- scored_nodes = candidates.map do |node|
544
- node_tags = tags_by_node[node['id']] || []
545
-
546
- relevance = calculate_relevance(
547
- node: node,
548
- query_tags: query_tags,
549
- vector_similarity: node['similarity']&.to_f,
550
- node_tags: node_tags
551
- )
552
-
553
- node.merge({
554
- 'relevance' => relevance,
555
- 'tags' => node_tags
556
- })
557
- end
558
-
559
- # Sort by relevance and return top K
560
- scored_nodes
561
- .sort_by { |n| -n['relevance'] }
562
- .take(limit)
563
- end
564
-
565
- # Get tags for a specific node
566
- #
567
- # @param node_id [Integer] Node database ID
568
- # @return [Array<String>] Tag names
569
- #
570
- def get_node_tags(node_id)
571
- HTM::Models::Tag
572
- .joins(:node_tags)
573
- .where(node_tags: { node_id: node_id })
574
- .pluck(:name)
575
- rescue ActiveRecord::ActiveRecordError => e
576
- HTM.logger.error("Failed to retrieve tags for node #{node_id}: #{e.message}")
577
- []
578
- end
579
-
580
- # Batch load tags for multiple nodes (avoids N+1 queries)
581
- #
582
- # @param node_ids [Array<Integer>] Node database IDs
583
- # @return [Hash<Integer, Array<String>>] Map of node_id to array of tag names
584
- #
585
- def batch_load_node_tags(node_ids)
586
- return {} if node_ids.empty?
587
-
588
- # Single query to get all tags for all nodes
589
- results = HTM::Models::NodeTag
590
- .joins(:tag)
591
- .where(node_id: node_ids)
592
- .pluck(:node_id, 'tags.name')
593
-
594
- # Group by node_id
595
- results.group_by(&:first).transform_values { |pairs| pairs.map(&:last) }
596
- rescue ActiveRecord::ActiveRecordError => e
597
- HTM.logger.error("Failed to batch load tags: #{e.message}")
598
- {}
599
- end
600
-
601
- # Search nodes by tags
602
- #
603
- # @param tags [Array<String>] Tags to search for
604
- # @param match_all [Boolean] If true, match ALL tags; if false, match ANY tag
605
- # @param timeframe [Range, nil] Optional time range filter
606
- # @param limit [Integer] Maximum results
607
- # @return [Array<Hash>] Matching nodes with relevance scores
608
- #
609
- def search_by_tags(tags:, match_all: false, timeframe: nil, limit: 20)
610
- return [] if tags.empty?
611
-
612
- # Build base query
613
- query = HTM::Models::Node
614
- .joins(:tags)
615
- .where(tags: { name: tags })
616
- .distinct
617
-
618
- # Apply timeframe filter if provided
619
- query = query.where(created_at: timeframe) if timeframe
620
-
621
- if match_all
622
- # Match ALL tags (intersection)
623
- query = query
624
- .group('nodes.id')
625
- .having('COUNT(DISTINCT tags.name) = ?', tags.size)
626
- end
627
-
628
- # Get results
629
- nodes = query.limit(limit).map(&:attributes)
630
-
631
- # Batch load all tags for nodes (fixes N+1 query)
632
- node_ids = nodes.map { |n| n['id'] }
633
- tags_by_node = batch_load_node_tags(node_ids)
634
-
635
- # Calculate relevance and enrich with tags
636
- nodes.map do |node|
637
- node_tags = tags_by_node[node['id']] || []
638
- relevance = calculate_relevance(
639
- node: node,
640
- query_tags: tags,
641
- node_tags: node_tags
642
- )
643
-
644
- node.merge({
645
- 'relevance' => relevance,
646
- 'tags' => node_tags
647
- })
648
- end.sort_by { |n| -n['relevance'] }
649
- end
650
-
651
- # Get most popular tags
652
- #
653
- # @param limit [Integer] Number of tags to return
654
- # @param timeframe [Range, nil] Optional time range filter
655
- # @return [Array<Hash>] Tags with usage counts
656
- #
657
- def popular_tags(limit: 20, timeframe: nil)
658
- query = HTM::Models::Tag
659
- .joins(:node_tags)
660
- .joins('INNER JOIN nodes ON nodes.id = node_tags.node_id')
661
- .group('tags.id', 'tags.name')
662
- .select('tags.name, COUNT(node_tags.id) as usage_count')
663
-
664
- query = query.where('nodes.created_at >= ? AND nodes.created_at <= ?', timeframe.begin, timeframe.end) if timeframe
665
-
666
- query
667
- .order('usage_count DESC')
668
- .limit(limit)
669
- .map { |tag| { name: tag.name, usage_count: tag.usage_count } }
670
- end
671
-
672
- # Find tags that match terms in the query
673
- #
674
- # Searches the tags table for tags where any hierarchy level matches
675
- # query words. For example, query "PostgreSQL database" would match
676
- # tags like "database:postgresql", "database:sql", etc.
677
- # Find tags matching a query using semantic extraction
678
- #
679
- # @param query [String] Search query
680
- # @param include_extracted [Boolean] If true, returns hash with :extracted and :matched keys
681
- # @return [Array<String>] Matching tag names (default)
682
- # @return [Hash] If include_extracted: { extracted: [...], matched: [...] }
683
- #
684
- def find_query_matching_tags(query, include_extracted: false)
685
- empty_result = include_extracted ? { extracted: [], matched: [] } : []
686
- return empty_result if query.nil? || query.strip.empty?
687
-
688
- # Use the tag extractor to generate semantic tags from the query
689
- # This uses the same LLM process as when storing nodes
690
- existing_tags = HTM::Models::Tag.pluck(:name).sample(50)
691
- extracted_tags = HTM::TagService.extract(query, existing_ontology: existing_tags)
692
-
693
- if extracted_tags.empty?
694
- return include_extracted ? { extracted: [], matched: [] } : []
695
- end
696
-
697
- # Step 1: Try exact matches
698
- exact_matches = HTM::Models::Tag.where(name: extracted_tags).pluck(:name)
699
-
700
- if exact_matches.any?
701
- return include_extracted ? { extracted: extracted_tags, matched: exact_matches } : exact_matches
702
- end
703
-
704
- # Step 2: Try matching on parent/prefix levels
705
- # For "person:human:character:popeye", try "person:human:character", "person:human", "person"
706
- prefix_candidates = extracted_tags.flat_map do |tag|
707
- levels = tag.split(':')
708
- (1...levels.size).map { |i| levels[0, i].join(':') }
709
- end.uniq
710
-
711
- if prefix_candidates.any?
712
- prefix_matches = HTM::Models::Tag.where(name: prefix_candidates).pluck(:name)
713
- if prefix_matches.any?
714
- return include_extracted ? { extracted: extracted_tags, matched: prefix_matches } : prefix_matches
715
- end
716
- end
717
-
718
- # Step 3: Try matching individual components, starting from rightmost (most specific)
719
- # For "person:human:character:popeye", try "popeye", then "character", then "human", then "person"
720
- # Search for tags that contain this component at any level
721
- all_components = extracted_tags.flat_map { |tag| tag.split(':') }.uniq
722
-
723
- # Order by specificity: components that appear at deeper levels first
724
- component_depths = Hash.new(0)
725
- extracted_tags.each do |tag|
726
- levels = tag.split(':')
727
- levels.each_with_index { |comp, idx| component_depths[comp] = [component_depths[comp], idx].max }
728
- end
729
- ordered_components = all_components.sort_by { |c| -component_depths[c] }
730
-
731
- # Try each component, starting with most specific (rightmost)
732
- ordered_components.each do |component|
733
- # Find tags where this component appears at any level
734
- component_matches = HTM::Models::Tag
735
- .where("name = ? OR name LIKE ? OR name LIKE ? OR name LIKE ?",
736
- component, # exact match (single-level tag)
737
- "#{component}:%", # starts with component
738
- "%:#{component}", # ends with component
739
- "%:#{component}:%") # component in middle
740
- .pluck(:name)
741
-
742
- if component_matches.any?
743
- return include_extracted ? { extracted: extracted_tags, matched: component_matches } : component_matches
744
- end
745
- end
746
-
747
- # No matches found at any level
748
- include_extracted ? { extracted: extracted_tags, matched: [] } : []
749
- end
750
-
751
- private
752
-
753
- # Sanitize embedding for SQL use
754
- #
755
- # Validates that all values are numeric and converts to safe PostgreSQL vector format.
756
- # This prevents SQL injection by ensuring only valid numeric values are included.
757
- #
758
- # @param embedding [Array<Numeric>] Embedding vector
759
- # @return [String] Sanitized vector string for PostgreSQL (e.g., "[0.1,0.2,0.3]")
760
- # @raise [ArgumentError] If embedding contains non-numeric values
761
- #
762
- def sanitize_embedding_for_sql(embedding)
763
- unless embedding.is_a?(Array) && embedding.all? { |v| v.is_a?(Numeric) && v.finite? }
764
- raise ArgumentError, "Embedding must be an array of finite numeric values"
765
- end
766
-
767
- "[#{embedding.map { |v| v.to_f }.join(',')}]"
768
- end
769
-
770
- # Build SQL condition for timeframe filtering
771
- #
772
- # @param timeframe [nil, Range, Array<Range>] Time range(s)
773
- # @param table_alias [String] Table alias (default: none)
774
- # @return [String, nil] SQL condition or nil for no filter
775
- #
776
- def build_timeframe_condition(timeframe, table_alias: nil)
777
- return nil if timeframe.nil?
778
-
779
- prefix = table_alias ? "#{table_alias}." : ""
780
- column = "#{prefix}created_at"
781
- conn = ActiveRecord::Base.connection
782
-
783
- case timeframe
784
- when Range
785
- # Use quote to safely escape timestamp values
786
- begin_quoted = conn.quote(timeframe.begin.iso8601)
787
- end_quoted = conn.quote(timeframe.end.iso8601)
788
- "(#{column} BETWEEN #{begin_quoted} AND #{end_quoted})"
789
- when Array
790
- conditions = timeframe.map do |range|
791
- begin_quoted = conn.quote(range.begin.iso8601)
792
- end_quoted = conn.quote(range.end.iso8601)
793
- "(#{column} BETWEEN #{begin_quoted} AND #{end_quoted})"
794
- end
795
- "(#{conditions.join(' OR ')})"
796
- else
797
- nil
798
- end
799
- end
800
-
801
- # Build ActiveRecord where clause for timeframe
802
- #
803
- # @param scope [ActiveRecord::Relation] Base scope
804
- # @param timeframe [nil, Range, Array<Range>] Time range(s)
805
- # @return [ActiveRecord::Relation] Scoped query
806
- #
807
- def apply_timeframe_scope(scope, timeframe)
808
- return scope if timeframe.nil?
809
-
810
- case timeframe
811
- when Range
812
- scope.where(created_at: timeframe)
813
- when Array
814
- # Build OR conditions for multiple ranges
815
- conditions = timeframe.map { |range| scope.where(created_at: range) }
816
- conditions.reduce { |result, condition| result.or(condition) }
817
- else
818
- scope
819
- end
820
- end
821
-
822
- # Build SQL condition for metadata filtering (JSONB containment)
823
- #
824
- # @param metadata [Hash] Metadata to filter by
825
- # @param table_alias [String] Table alias (default: none)
826
- # @return [String, nil] SQL condition or nil for no filter
827
- #
828
- def build_metadata_condition(metadata, table_alias: nil)
829
- return nil if metadata.nil? || metadata.empty?
830
-
831
- prefix = table_alias ? "#{table_alias}." : ""
832
- column = "#{prefix}metadata"
833
- conn = ActiveRecord::Base.connection
834
-
835
- # Use JSONB containment operator @>
836
- # This matches if the metadata column contains all key-value pairs in the filter
837
- quoted_metadata = conn.quote(metadata.to_json)
838
- "(#{column} @> #{quoted_metadata}::jsonb)"
839
- end
840
-
841
- # Build ActiveRecord where clause for metadata
842
- #
843
- # @param scope [ActiveRecord::Relation] Base scope
844
- # @param metadata [Hash] Metadata to filter by
845
- # @return [ActiveRecord::Relation] Scoped query
846
- #
847
- def apply_metadata_scope(scope, metadata)
848
- return scope if metadata.nil? || metadata.empty?
849
-
850
- # Use JSONB containment operator
851
- scope.where("metadata @> ?::jsonb", metadata.to_json)
852
- end
853
-
854
- # Generate cache key for query
855
- #
856
- # @param method [Symbol] Search method name
857
- # @param timeframe [nil, Range, Array<Range>] Time range(s)
858
- # @param query [String] Search query
859
- # @param limit [Integer] Result limit
860
- # @param args [Array] Additional arguments
861
- # @return [String] Cache key
862
- #
863
- def cache_key_for(method, timeframe, query, limit, *args)
864
- timeframe_key = case timeframe
865
- when nil
866
- "nil"
867
- when Range
868
- "#{timeframe.begin.to_i}-#{timeframe.end.to_i}"
869
- when Array
870
- timeframe.map { |r| "#{r.begin.to_i}-#{r.end.to_i}" }.join(',')
871
- else
872
- timeframe.to_s
873
- end
874
-
875
- key_parts = [
876
- method,
877
- timeframe_key,
878
- query,
879
- limit,
880
- *args
881
- ]
882
- Digest::SHA256.hexdigest(key_parts.join('|'))
883
- end
884
-
885
- # Get cache statistics
886
- #
887
- # @return [Hash, nil] Cache stats or nil if cache disabled
888
- #
889
- def cache_stats
890
- return nil unless @query_cache
891
-
892
- total = @cache_stats[:hits] + @cache_stats[:misses]
893
- hit_rate = total > 0 ? (@cache_stats[:hits].to_f / total * 100).round(2) : 0.0
894
-
895
- {
896
- hits: @cache_stats[:hits],
897
- misses: @cache_stats[:misses],
898
- hit_rate: hit_rate,
899
- size: @query_cache.count
900
- }
901
- end
902
-
903
- # Calculate Jaccard similarity between two sets
904
- #
905
- # @param set_a [Array] First set
906
- # @param set_b [Array] Second set
907
- # @return [Float] Jaccard similarity (0.0-1.0)
908
- #
909
- def jaccard_similarity(set_a, set_b)
910
- return 0.0 if set_a.empty? && set_b.empty?
911
- return 0.0 if set_a.empty? || set_b.empty?
912
-
913
- intersection = (set_a & set_b).size
914
- union = (set_a | set_b).size
915
-
916
- intersection.to_f / union
917
- end
918
-
919
- def weighted_hierarchical_jaccard(set_a, set_b)
920
- return 0.0 if set_a.empty? || set_b.empty?
921
-
922
- total_weighted_similarity = 0.0
923
- total_weights = 0.0
924
-
925
- set_a.each do |tag_a|
926
- set_b.each do |tag_b|
927
- similarity, weight = calculate_hierarchical_similarity(tag_a, tag_b)
928
- total_weighted_similarity += similarity * weight
929
- total_weights += weight
930
- end
931
- end
932
-
933
- total_weights > 0 ? total_weighted_similarity / total_weights : 0.0
934
- end
935
-
936
-
937
-
938
- # Invalidate (clear) the query cache
939
- #
940
- # @return [void]
941
- #
942
- def invalidate_cache!
943
- @query_cache.clear if @query_cache
944
- end
945
-
946
- # Execute a query with caching
947
- #
948
- # @param method [Symbol] Search method name for cache key
949
- # @param args [Array] Arguments for cache key (timeframe, query, limit, etc.)
950
- # @yield Block that executes the actual query
951
- # @return [Array<Hash>] Query results (from cache or freshly executed)
952
- #
953
- def cached_query(method, *args, &block)
954
- return yield unless @query_cache
955
-
956
- cache_key = cache_key_for(method, *args)
957
-
958
- if (cached = @query_cache[cache_key])
959
- @cache_stats_mutex.synchronize { @cache_stats[:hits] += 1 }
960
- return cached
961
- end
962
-
963
- @cache_stats_mutex.synchronize { @cache_stats[:misses] += 1 }
964
- result = yield
965
- @query_cache[cache_key] = result
966
- result
967
- end
968
-
969
- # Uncached vector similarity search
970
- #
971
- # Generates query embedding client-side and performs vector search in database.
972
- #
973
- # @param timeframe [nil, Range, Array<Range>] Time range(s) to search (nil = no filter)
974
- # @param query [String] Search query
975
- # @param limit [Integer] Maximum results
976
- # @param embedding_service [Object] Service to generate query embedding
977
- # @param metadata [Hash] Filter by metadata fields (default: {})
978
- # @return [Array<Hash>] Matching nodes
979
- #
980
- def search_uncached(timeframe:, query:, limit:, embedding_service:, metadata: {})
981
- # Generate query embedding client-side
982
- query_embedding = embedding_service.embed(query)
983
-
984
- # Pad embedding to 2000 dimensions if needed (to match nodes.embedding vector(2000))
985
- if query_embedding.length < 2000
986
- query_embedding = query_embedding + Array.new(2000 - query_embedding.length, 0.0)
987
- end
988
-
989
- # Sanitize embedding for safe SQL use (validates all values are numeric)
990
- embedding_str = sanitize_embedding_for_sql(query_embedding)
991
-
992
- # Build filter conditions
993
- timeframe_condition = build_timeframe_condition(timeframe)
994
- metadata_condition = build_metadata_condition(metadata)
995
-
996
- conditions = ["embedding IS NOT NULL", "deleted_at IS NULL"]
997
- conditions << timeframe_condition if timeframe_condition
998
- conditions << metadata_condition if metadata_condition
999
-
1000
- where_clause = "WHERE #{conditions.join(' AND ')}"
1001
-
1002
- # Use quote to safely escape the embedding string in the query
1003
- quoted_embedding = ActiveRecord::Base.connection.quote(embedding_str)
1004
-
1005
- result = ActiveRecord::Base.connection.select_all(
1006
- <<~SQL,
1007
- SELECT id, content, access_count, created_at, token_count,
1008
- 1 - (embedding <=> #{quoted_embedding}::vector) as similarity
1009
- FROM nodes
1010
- #{where_clause}
1011
- ORDER BY embedding <=> #{quoted_embedding}::vector
1012
- LIMIT #{limit.to_i}
1013
- SQL
1014
- )
1015
-
1016
- # Track access for retrieved nodes
1017
- node_ids = result.map { |r| r['id'] }
1018
- track_access(node_ids)
1019
-
1020
- result.to_a
1021
- end
1022
-
1023
- # Uncached full-text search
1024
- #
1025
- # @param timeframe [nil, Range, Array<Range>] Time range(s) to search (nil = no filter)
1026
- # @param query [String] Search query
1027
- # @param limit [Integer] Maximum results
1028
- # @param metadata [Hash] Filter by metadata fields (default: {})
1029
- # @return [Array<Hash>] Matching nodes
1030
- #
1031
- def search_fulltext_uncached(timeframe:, query:, limit:, metadata: {})
1032
- # Build filter conditions
1033
- timeframe_condition = build_timeframe_condition(timeframe)
1034
- metadata_condition = build_metadata_condition(metadata)
1035
-
1036
- additional_conditions = []
1037
- additional_conditions << timeframe_condition if timeframe_condition
1038
- additional_conditions << metadata_condition if metadata_condition
1039
- additional_sql = additional_conditions.any? ? "AND #{additional_conditions.join(' AND ')}" : ""
1040
-
1041
- result = ActiveRecord::Base.connection.select_all(
1042
- ActiveRecord::Base.sanitize_sql_array([
1043
- <<~SQL,
1044
- SELECT id, content, access_count, created_at, token_count,
1045
- ts_rank(to_tsvector('english', content), plainto_tsquery('english', ?)) as rank
1046
- FROM nodes
1047
- WHERE deleted_at IS NULL
1048
- AND to_tsvector('english', content) @@ plainto_tsquery('english', ?)
1049
- #{additional_sql}
1050
- ORDER BY rank DESC
1051
- LIMIT ?
1052
- SQL
1053
- query, query, limit
1054
- ])
1055
- )
1056
-
1057
- # Track access for retrieved nodes
1058
- node_ids = result.map { |r| r['id'] }
1059
- track_access(node_ids)
1060
-
1061
- result.to_a
1062
- end
1063
-
1064
- # Uncached hybrid search
1065
- #
1066
- # Generates query embedding client-side, then combines:
1067
- # 1. Full-text search for content matching
1068
- # 2. Tag matching for categorical relevance
1069
- # 3. Vector similarity for semantic ranking
1070
- #
1071
- # @param timeframe [nil, Range, Array<Range>] Time range(s) to search (nil = no filter)
1072
- # @param query [String] Search query
1073
- # @param limit [Integer] Maximum results
1074
- # @param embedding_service [Object] Service to generate query embedding
1075
- # @param prefilter_limit [Integer] Candidates to consider
1076
- # @param metadata [Hash] Filter by metadata fields (default: {})
1077
- # @return [Array<Hash>] Matching nodes with similarity and tag_boost scores
1078
- #
1079
- def search_hybrid_uncached(timeframe:, query:, limit:, embedding_service:, prefilter_limit:, metadata: {})
1080
- # Generate query embedding client-side
1081
- query_embedding = embedding_service.embed(query)
1082
-
1083
- # Pad embedding to 2000 dimensions if needed
1084
- if query_embedding.length < 2000
1085
- query_embedding = query_embedding + Array.new(2000 - query_embedding.length, 0.0)
1086
- end
1087
-
1088
- # Sanitize embedding for safe SQL use (validates all values are numeric)
1089
- embedding_str = sanitize_embedding_for_sql(query_embedding)
1090
- quoted_embedding = ActiveRecord::Base.connection.quote(embedding_str)
1091
-
1092
- # Build filter conditions (with table alias for CTEs)
1093
- timeframe_condition = build_timeframe_condition(timeframe, table_alias: 'n')
1094
- metadata_condition = build_metadata_condition(metadata, table_alias: 'n')
1095
-
1096
- additional_conditions = []
1097
- additional_conditions << timeframe_condition if timeframe_condition
1098
- additional_conditions << metadata_condition if metadata_condition
1099
- additional_sql = additional_conditions.any? ? "AND #{additional_conditions.join(' AND ')}" : ""
1100
-
1101
- # Same for non-aliased queries
1102
- timeframe_condition_bare = build_timeframe_condition(timeframe)
1103
- metadata_condition_bare = build_metadata_condition(metadata)
1104
-
1105
- additional_conditions_bare = []
1106
- additional_conditions_bare << timeframe_condition_bare if timeframe_condition_bare
1107
- additional_conditions_bare << metadata_condition_bare if metadata_condition_bare
1108
- additional_sql_bare = additional_conditions_bare.any? ? "AND #{additional_conditions_bare.join(' AND ')}" : ""
1109
-
1110
- # Find tags that match query terms
1111
- matching_tags = find_query_matching_tags(query)
1112
-
1113
- # Build the hybrid query
1114
- # If we have matching tags, include nodes with those tags in the candidate pool
1115
- # NOTE: Hybrid search includes nodes without embeddings using a default
1116
- # similarity score of 0.5. This allows newly created nodes to appear in
1117
- # search results immediately (via fulltext matching) before their embeddings
1118
- # are generated by background jobs.
1119
-
1120
- if matching_tags.any?
1121
- # Escape tag names for SQL
1122
- tag_list = matching_tags.map { |t| ActiveRecord::Base.connection.quote(t) }.join(', ')
1123
- result = ActiveRecord::Base.connection.select_all(
1124
- ActiveRecord::Base.sanitize_sql_array([
1125
- <<~SQL,
1126
- WITH fulltext_candidates AS (
1127
- -- Nodes matching full-text search (with or without embeddings)
1128
- SELECT DISTINCT n.id, n.content, n.access_count, n.created_at, n.token_count, n.embedding
1129
- FROM nodes n
1130
- WHERE n.deleted_at IS NULL
1131
- AND to_tsvector('english', n.content) @@ plainto_tsquery('english', ?)
1132
- #{additional_sql}
1133
- LIMIT ?
1134
- ),
1135
- tag_candidates AS (
1136
- -- Nodes matching relevant tags (with or without embeddings)
1137
- SELECT DISTINCT n.id, n.content, n.access_count, n.created_at, n.token_count, n.embedding
1138
- FROM nodes n
1139
- JOIN node_tags nt ON nt.node_id = n.id
1140
- JOIN tags t ON t.id = nt.tag_id
1141
- WHERE n.deleted_at IS NULL
1142
- AND t.name IN (#{tag_list})
1143
- #{additional_sql}
1144
- LIMIT ?
1145
- ),
1146
- all_candidates AS (
1147
- SELECT * FROM fulltext_candidates
1148
- UNION
1149
- SELECT * FROM tag_candidates
1150
- ),
1151
- scored AS (
1152
- SELECT
1153
- ac.id, ac.content, ac.access_count, ac.created_at, ac.token_count,
1154
- CASE
1155
- WHEN ac.embedding IS NOT NULL THEN 1 - (ac.embedding <=> #{quoted_embedding}::vector)
1156
- ELSE 0.5 -- Default similarity for nodes without embeddings
1157
- END as similarity,
1158
- COALESCE((
1159
- SELECT COUNT(DISTINCT t.name)::float / ?
1160
- FROM node_tags nt
1161
- JOIN tags t ON t.id = nt.tag_id
1162
- WHERE nt.node_id = ac.id AND t.name IN (#{tag_list})
1163
- ), 0) as tag_boost
1164
- FROM all_candidates ac
1165
- )
1166
- SELECT id, content, access_count, created_at, token_count,
1167
- similarity, tag_boost,
1168
- (similarity * 0.7 + tag_boost * 0.3) as combined_score
1169
- FROM scored
1170
- ORDER BY combined_score DESC
1171
- LIMIT ?
1172
- SQL
1173
- query, prefilter_limit,
1174
- prefilter_limit,
1175
- matching_tags.length.to_f,
1176
- limit
1177
- ])
1178
- )
1179
- else
1180
- # No matching tags, fall back to standard hybrid (fulltext + vector)
1181
- # Include nodes without embeddings with a default similarity score
1182
- result = ActiveRecord::Base.connection.select_all(
1183
- ActiveRecord::Base.sanitize_sql_array([
1184
- <<~SQL,
1185
- WITH candidates AS (
1186
- SELECT id, content, access_count, created_at, token_count, embedding
1187
- FROM nodes
1188
- WHERE deleted_at IS NULL
1189
- AND to_tsvector('english', content) @@ plainto_tsquery('english', ?)
1190
- #{additional_sql_bare}
1191
- LIMIT ?
1192
- )
1193
- SELECT id, content, access_count, created_at, token_count,
1194
- CASE
1195
- WHEN embedding IS NOT NULL THEN 1 - (embedding <=> #{quoted_embedding}::vector)
1196
- ELSE 0.5 -- Default similarity for nodes without embeddings
1197
- END as similarity,
1198
- 0.0 as tag_boost,
1199
- CASE
1200
- WHEN embedding IS NOT NULL THEN 1 - (embedding <=> #{quoted_embedding}::vector)
1201
- ELSE 0.5 -- Default score for nodes without embeddings (fulltext matched)
1202
- END as combined_score
1203
- FROM candidates
1204
- ORDER BY combined_score DESC
1205
- LIMIT ?
1206
- SQL
1207
- query, prefilter_limit, limit
1208
- ])
1209
- )
1210
- end
1211
-
1212
- # Track access for retrieved nodes
1213
- node_ids = result.map { |r| r['id'] }
1214
- track_access(node_ids)
1215
-
1216
- result.to_a
1217
- end
1218
-
1219
-
1220
- def calculate_hierarchical_similarity(tag_a, tag_b)
1221
- parts_a = tag_a.split(':')
1222
- parts_b = tag_b.split(':')
1223
-
1224
- # Calculate overlap at each level
1225
- common_levels = 0
1226
- max_depth = [parts_a.length, parts_b.length].max
1227
-
1228
- (0...max_depth).each do |i|
1229
- if i < parts_a.length && i < parts_b.length && parts_a[i] == parts_b[i]
1230
- common_levels += 1
1231
- else
1232
- break
1233
- end
1234
- end
1235
-
1236
- # Calculate weight based on hierarchy depth (higher levels = more weight)
1237
- depth_weight = 1.0 / max_depth
1238
-
1239
- # Calculate normalized similarity (0-1)
1240
- similarity = max_depth > 0 ? (common_levels.to_f / max_depth) : 0.0
1241
-
1242
- [similarity, depth_weight]
1243
- end
1244
142
  end
1245
143
  end