htm 0.0.2 → 0.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.aigcm_msg +1 -0
- data/.architecture/reviews/comprehensive-codebase-review.md +577 -0
- data/.claude/settings.local.json +92 -0
- data/.irbrc +283 -80
- data/.tbls.yml +2 -1
- data/CHANGELOG.md +294 -26
- data/CLAUDE.md +603 -0
- data/README.md +76 -5
- data/Rakefile +5 -0
- data/db/migrate/{20250101000001_enable_extensions.rb → 00001_enable_extensions.rb} +0 -1
- data/db/migrate/00002_create_robots.rb +11 -0
- data/db/migrate/00003_create_file_sources.rb +20 -0
- data/db/migrate/00004_create_nodes.rb +65 -0
- data/db/migrate/00005_create_tags.rb +13 -0
- data/db/migrate/00006_create_node_tags.rb +18 -0
- data/db/migrate/00007_create_robot_nodes.rb +26 -0
- data/db/migrate/00009_add_working_memory_to_robot_nodes.rb +12 -0
- data/db/schema.sql +172 -1
- data/docs/api/database.md +1 -2
- data/docs/api/htm.md +197 -2
- data/docs/api/yard/HTM/ActiveRecordConfig.md +23 -0
- data/docs/api/yard/HTM/AuthorizationError.md +11 -0
- data/docs/api/yard/HTM/CircuitBreaker.md +92 -0
- data/docs/api/yard/HTM/CircuitBreakerOpenError.md +34 -0
- data/docs/api/yard/HTM/Configuration.md +175 -0
- data/docs/api/yard/HTM/Database.md +99 -0
- data/docs/api/yard/HTM/DatabaseError.md +14 -0
- data/docs/api/yard/HTM/EmbeddingError.md +18 -0
- data/docs/api/yard/HTM/EmbeddingService.md +58 -0
- data/docs/api/yard/HTM/Error.md +11 -0
- data/docs/api/yard/HTM/JobAdapter.md +39 -0
- data/docs/api/yard/HTM/LongTermMemory.md +342 -0
- data/docs/api/yard/HTM/NotFoundError.md +17 -0
- data/docs/api/yard/HTM/Observability.md +107 -0
- data/docs/api/yard/HTM/QueryTimeoutError.md +19 -0
- data/docs/api/yard/HTM/Railtie.md +27 -0
- data/docs/api/yard/HTM/ResourceExhaustedError.md +13 -0
- data/docs/api/yard/HTM/TagError.md +18 -0
- data/docs/api/yard/HTM/TagService.md +67 -0
- data/docs/api/yard/HTM/Timeframe/Result.md +24 -0
- data/docs/api/yard/HTM/Timeframe.md +40 -0
- data/docs/api/yard/HTM/TimeframeExtractor/Result.md +24 -0
- data/docs/api/yard/HTM/TimeframeExtractor.md +45 -0
- data/docs/api/yard/HTM/ValidationError.md +20 -0
- data/docs/api/yard/HTM/WorkingMemory.md +131 -0
- data/docs/api/yard/HTM.md +80 -0
- data/docs/api/yard/index.csv +179 -0
- data/docs/api/yard-reference.md +51 -0
- data/docs/database/README.md +128 -128
- data/docs/database/public.file_sources.md +42 -0
- data/docs/database/public.file_sources.svg +211 -0
- data/docs/database/public.node_tags.md +4 -4
- data/docs/database/public.node_tags.svg +212 -79
- data/docs/database/public.nodes.md +22 -12
- data/docs/database/public.nodes.svg +246 -127
- data/docs/database/public.robot_nodes.md +11 -9
- data/docs/database/public.robot_nodes.svg +220 -98
- data/docs/database/public.robots.md +2 -2
- data/docs/database/public.robots.svg +136 -81
- data/docs/database/public.tags.md +3 -3
- data/docs/database/public.tags.svg +118 -39
- data/docs/database/schema.json +850 -771
- data/docs/database/schema.svg +256 -197
- data/docs/development/schema.md +67 -2
- data/docs/guides/adding-memories.md +93 -7
- data/docs/guides/recalling-memories.md +36 -1
- data/examples/README.md +280 -0
- data/examples/cli_app/htm_cli.rb +65 -5
- data/examples/cli_app/temp.log +93 -0
- data/examples/file_loader_usage.rb +177 -0
- data/examples/robot_groups/lib/robot_group.rb +419 -0
- data/examples/robot_groups/lib/working_memory_channel.rb +140 -0
- data/examples/robot_groups/multi_process.rb +286 -0
- data/examples/robot_groups/robot_worker.rb +136 -0
- data/examples/robot_groups/same_process.rb +229 -0
- data/examples/timeframe_demo.rb +276 -0
- data/lib/htm/active_record_config.rb +1 -1
- data/lib/htm/circuit_breaker.rb +202 -0
- data/lib/htm/configuration.rb +59 -13
- data/lib/htm/database.rb +67 -36
- data/lib/htm/embedding_service.rb +39 -2
- data/lib/htm/errors.rb +131 -11
- data/lib/htm/jobs/generate_embedding_job.rb +5 -4
- data/lib/htm/jobs/generate_tags_job.rb +4 -0
- data/lib/htm/loaders/markdown_loader.rb +263 -0
- data/lib/htm/loaders/paragraph_chunker.rb +112 -0
- data/lib/htm/long_term_memory.rb +460 -343
- data/lib/htm/models/file_source.rb +99 -0
- data/lib/htm/models/node.rb +80 -5
- data/lib/htm/models/robot.rb +24 -1
- data/lib/htm/models/robot_node.rb +1 -0
- data/lib/htm/models/tag.rb +254 -4
- data/lib/htm/observability.rb +395 -0
- data/lib/htm/tag_service.rb +60 -3
- data/lib/htm/tasks.rb +26 -1
- data/lib/htm/timeframe.rb +194 -0
- data/lib/htm/timeframe_extractor.rb +307 -0
- data/lib/htm/version.rb +1 -1
- data/lib/htm/working_memory.rb +165 -70
- data/lib/htm.rb +328 -130
- data/lib/tasks/doc.rake +300 -0
- data/lib/tasks/files.rake +299 -0
- data/lib/tasks/htm.rake +158 -3
- data/lib/tasks/jobs.rake +3 -9
- data/lib/tasks/tags.rake +166 -6
- data/mkdocs.yml +36 -1
- data/notes/ARCHITECTURE_REVIEW.md +1167 -0
- data/notes/IMPLEMENTATION_SUMMARY.md +606 -0
- data/notes/MULTI_FRAMEWORK_IMPLEMENTATION.md +451 -0
- data/notes/next_steps.md +100 -0
- data/notes/plan.md +627 -0
- data/notes/tag_ontology_enhancement_ideas.md +222 -0
- data/notes/timescaledb_removal_summary.md +200 -0
- metadata +125 -15
- data/db/migrate/20250101000002_create_robots.rb +0 -14
- data/db/migrate/20250101000003_create_nodes.rb +0 -42
- data/db/migrate/20250101000005_create_tags.rb +0 -38
- data/db/migrate/20250101000007_add_node_vector_indexes.rb +0 -30
- data/db/migrate/20250125000001_add_content_hash_to_nodes.rb +0 -14
- data/db/migrate/20250125000002_create_robot_nodes.rb +0 -35
- data/db/migrate/20250125000003_remove_source_and_robot_id_from_nodes.rb +0 -28
- data/db/migrate/20250126000001_create_working_memories.rb +0 -19
- data/db/migrate/20250126000002_remove_unused_columns.rb +0 -12
- data/docs/database/public.working_memories.md +0 -40
- data/docs/database/public.working_memories.svg +0 -112
- data/lib/htm/models/working_memory_entry.rb +0 -88
data/lib/htm/long_term_memory.rb
CHANGED
|
@@ -25,6 +25,23 @@ class HTM
|
|
|
25
25
|
|
|
26
26
|
attr_reader :query_timeout
|
|
27
27
|
|
|
28
|
+
# Initialize long-term memory storage
|
|
29
|
+
#
|
|
30
|
+
# @param config [Hash] Database configuration (host, port, dbname, user, password)
|
|
31
|
+
# @param pool_size [Integer, nil] Connection pool size (uses ActiveRecord default if nil)
|
|
32
|
+
# @param query_timeout [Integer] Query timeout in milliseconds (default: 30000)
|
|
33
|
+
# @param cache_size [Integer] Number of query results to cache (default: 1000, use 0 to disable)
|
|
34
|
+
# @param cache_ttl [Integer] Cache time-to-live in seconds (default: 300)
|
|
35
|
+
#
|
|
36
|
+
# @example Initialize with defaults
|
|
37
|
+
# ltm = LongTermMemory.new(HTM::Database.default_config)
|
|
38
|
+
#
|
|
39
|
+
# @example Initialize with custom cache settings
|
|
40
|
+
# ltm = LongTermMemory.new(config, cache_size: 500, cache_ttl: 600)
|
|
41
|
+
#
|
|
42
|
+
# @example Disable caching
|
|
43
|
+
# ltm = LongTermMemory.new(config, cache_size: 0)
|
|
44
|
+
#
|
|
28
45
|
def initialize(config, pool_size: nil, query_timeout: DEFAULT_QUERY_TIMEOUT, cache_size: DEFAULT_CACHE_SIZE, cache_ttl: DEFAULT_CACHE_TTL)
|
|
29
46
|
@config = config
|
|
30
47
|
@query_timeout = query_timeout # in milliseconds
|
|
@@ -36,6 +53,7 @@ class HTM
|
|
|
36
53
|
if cache_size > 0
|
|
37
54
|
@query_cache = LruRedux::TTL::ThreadSafeCache.new(cache_size, cache_ttl)
|
|
38
55
|
@cache_stats = { hits: 0, misses: 0 }
|
|
56
|
+
@cache_stats_mutex = Mutex.new # Thread-safety for cache statistics
|
|
39
57
|
end
|
|
40
58
|
end
|
|
41
59
|
|
|
@@ -48,60 +66,71 @@ class HTM
|
|
|
48
66
|
# @param token_count [Integer] Token count
|
|
49
67
|
# @param robot_id [Integer] Robot identifier
|
|
50
68
|
# @param embedding [Array<Float>, nil] Pre-generated embedding vector
|
|
69
|
+
# @param metadata [Hash] Flexible metadata for the node (default: {})
|
|
51
70
|
# @return [Hash] { node_id:, is_new:, robot_node: }
|
|
52
71
|
#
|
|
53
|
-
def add(content:, token_count: 0, robot_id:, embedding: nil)
|
|
72
|
+
def add(content:, token_count: 0, robot_id:, embedding: nil, metadata: {})
|
|
54
73
|
content_hash = HTM::Models::Node.generate_content_hash(content)
|
|
55
74
|
|
|
56
|
-
#
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
robot_node = link_robot_to_node(robot_id: robot_id, node: existing_node)
|
|
75
|
+
# Wrap in transaction to ensure data consistency
|
|
76
|
+
ActiveRecord::Base.transaction do
|
|
77
|
+
# Check for existing node with same content (including soft-deleted)
|
|
78
|
+
# This avoids unique constraint violations on content_hash
|
|
79
|
+
existing_node = HTM::Models::Node.with_deleted.find_by(content_hash: content_hash)
|
|
62
80
|
|
|
63
|
-
#
|
|
64
|
-
existing_node
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
node_id: existing_node.id,
|
|
68
|
-
is_new: false,
|
|
69
|
-
robot_node: robot_node
|
|
70
|
-
}
|
|
71
|
-
else
|
|
72
|
-
# Prepare embedding if provided
|
|
73
|
-
embedding_str = nil
|
|
74
|
-
if embedding
|
|
75
|
-
# Pad embedding to 2000 dimensions if needed
|
|
76
|
-
actual_dimension = embedding.length
|
|
77
|
-
padded_embedding = if actual_dimension < 2000
|
|
78
|
-
embedding + Array.new(2000 - actual_dimension, 0.0)
|
|
79
|
-
else
|
|
80
|
-
embedding
|
|
81
|
-
end
|
|
82
|
-
embedding_str = "[#{padded_embedding.join(',')}]"
|
|
81
|
+
# If found but soft-deleted, restore it
|
|
82
|
+
if existing_node&.deleted?
|
|
83
|
+
existing_node.restore!
|
|
84
|
+
HTM.logger.info "Restored soft-deleted node #{existing_node.id} for content match"
|
|
83
85
|
end
|
|
84
86
|
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
content_hash: content_hash,
|
|
89
|
-
token_count: token_count,
|
|
90
|
-
embedding: embedding_str,
|
|
91
|
-
embedding_dimension: embedding&.length
|
|
92
|
-
)
|
|
87
|
+
if existing_node
|
|
88
|
+
# Link robot to existing node (or update if already linked)
|
|
89
|
+
robot_node = link_robot_to_node(robot_id: robot_id, node: existing_node)
|
|
93
90
|
|
|
94
|
-
|
|
95
|
-
|
|
91
|
+
# Update the node's updated_at timestamp
|
|
92
|
+
existing_node.touch
|
|
96
93
|
|
|
97
|
-
|
|
98
|
-
|
|
94
|
+
{
|
|
95
|
+
node_id: existing_node.id,
|
|
96
|
+
is_new: false,
|
|
97
|
+
robot_node: robot_node
|
|
98
|
+
}
|
|
99
|
+
else
|
|
100
|
+
# Prepare embedding if provided
|
|
101
|
+
embedding_str = nil
|
|
102
|
+
if embedding
|
|
103
|
+
# Pad embedding to 2000 dimensions if needed
|
|
104
|
+
actual_dimension = embedding.length
|
|
105
|
+
padded_embedding = if actual_dimension < 2000
|
|
106
|
+
embedding + Array.new(2000 - actual_dimension, 0.0)
|
|
107
|
+
else
|
|
108
|
+
embedding
|
|
109
|
+
end
|
|
110
|
+
embedding_str = "[#{padded_embedding.join(',')}]"
|
|
111
|
+
end
|
|
99
112
|
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
113
|
+
# Create new node
|
|
114
|
+
node = HTM::Models::Node.create!(
|
|
115
|
+
content: content,
|
|
116
|
+
content_hash: content_hash,
|
|
117
|
+
token_count: token_count,
|
|
118
|
+
embedding: embedding_str,
|
|
119
|
+
metadata: metadata
|
|
120
|
+
)
|
|
121
|
+
|
|
122
|
+
# Link robot to new node
|
|
123
|
+
robot_node = link_robot_to_node(robot_id: robot_id, node: node)
|
|
124
|
+
|
|
125
|
+
# Invalidate cache since database content changed
|
|
126
|
+
invalidate_cache!
|
|
127
|
+
|
|
128
|
+
{
|
|
129
|
+
node_id: node.id,
|
|
130
|
+
is_new: true,
|
|
131
|
+
robot_node: robot_node
|
|
132
|
+
}
|
|
133
|
+
end
|
|
105
134
|
end
|
|
106
135
|
end
|
|
107
136
|
|
|
@@ -109,14 +138,16 @@ class HTM
|
|
|
109
138
|
#
|
|
110
139
|
# @param robot_id [Integer] Robot ID
|
|
111
140
|
# @param node [HTM::Models::Node] Node to link
|
|
141
|
+
# @param working_memory [Boolean] Whether node is in working memory (default: false)
|
|
112
142
|
# @return [HTM::Models::RobotNode] The robot_node link record
|
|
113
143
|
#
|
|
114
|
-
def link_robot_to_node(robot_id:, node:)
|
|
144
|
+
def link_robot_to_node(robot_id:, node:, working_memory: false)
|
|
115
145
|
robot_node = HTM::Models::RobotNode.find_by(robot_id: robot_id, node_id: node.id)
|
|
116
146
|
|
|
117
147
|
if robot_node
|
|
118
148
|
# Existing link - record that robot remembered this again
|
|
119
149
|
robot_node.record_remember!
|
|
150
|
+
robot_node.update!(working_memory: working_memory) if working_memory
|
|
120
151
|
else
|
|
121
152
|
# New link
|
|
122
153
|
robot_node = HTM::Models::RobotNode.create!(
|
|
@@ -124,7 +155,8 @@ class HTM
|
|
|
124
155
|
node_id: node.id,
|
|
125
156
|
first_remembered_at: Time.current,
|
|
126
157
|
last_remembered_at: Time.current,
|
|
127
|
-
remember_count: 1
|
|
158
|
+
remember_count: 1,
|
|
159
|
+
working_memory: working_memory
|
|
128
160
|
)
|
|
129
161
|
end
|
|
130
162
|
|
|
@@ -183,33 +215,17 @@ class HTM
|
|
|
183
215
|
|
|
184
216
|
# Vector similarity search
|
|
185
217
|
#
|
|
186
|
-
# @param timeframe [Range] Time range to search
|
|
218
|
+
# @param timeframe [nil, Range, Array<Range>] Time range(s) to search (nil = no filter)
|
|
187
219
|
# @param query [String] Search query
|
|
188
220
|
# @param limit [Integer] Maximum results
|
|
189
221
|
# @param embedding_service [Object] Service to generate embeddings
|
|
222
|
+
# @param metadata [Hash] Filter by metadata fields (default: {})
|
|
190
223
|
# @return [Array<Hash>] Matching nodes
|
|
191
224
|
#
|
|
192
|
-
def search(timeframe:, query:, limit:, embedding_service:)
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
# Generate cache key
|
|
197
|
-
cache_key = cache_key_for(:search, timeframe, query, limit)
|
|
198
|
-
|
|
199
|
-
# Try to get from cache
|
|
200
|
-
cached = @query_cache[cache_key]
|
|
201
|
-
if cached
|
|
202
|
-
@cache_stats[:hits] += 1
|
|
203
|
-
return cached
|
|
225
|
+
def search(timeframe:, query:, limit:, embedding_service:, metadata: {})
|
|
226
|
+
cached_query(:search, timeframe, query, limit, metadata) do
|
|
227
|
+
search_uncached(timeframe: timeframe, query: query, limit: limit, embedding_service: embedding_service, metadata: metadata)
|
|
204
228
|
end
|
|
205
|
-
|
|
206
|
-
# Cache miss - execute query
|
|
207
|
-
@cache_stats[:misses] += 1
|
|
208
|
-
result = search_uncached(timeframe: timeframe, query: query, limit: limit, embedding_service: embedding_service)
|
|
209
|
-
|
|
210
|
-
# Store in cache
|
|
211
|
-
@query_cache[cache_key] = result
|
|
212
|
-
result
|
|
213
229
|
end
|
|
214
230
|
|
|
215
231
|
# Full-text search
|
|
@@ -217,29 +233,13 @@ class HTM
|
|
|
217
233
|
# @param timeframe [Range] Time range to search
|
|
218
234
|
# @param query [String] Search query
|
|
219
235
|
# @param limit [Integer] Maximum results
|
|
236
|
+
# @param metadata [Hash] Filter by metadata fields (default: {})
|
|
220
237
|
# @return [Array<Hash>] Matching nodes
|
|
221
238
|
#
|
|
222
|
-
def search_fulltext(timeframe:, query:, limit:)
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
# Generate cache key
|
|
227
|
-
cache_key = cache_key_for(:fulltext, timeframe, query, limit)
|
|
228
|
-
|
|
229
|
-
# Try to get from cache
|
|
230
|
-
cached = @query_cache[cache_key]
|
|
231
|
-
if cached
|
|
232
|
-
@cache_stats[:hits] += 1
|
|
233
|
-
return cached
|
|
239
|
+
def search_fulltext(timeframe:, query:, limit:, metadata: {})
|
|
240
|
+
cached_query(:fulltext, timeframe, query, limit, metadata) do
|
|
241
|
+
search_fulltext_uncached(timeframe: timeframe, query: query, limit: limit, metadata: metadata)
|
|
234
242
|
end
|
|
235
|
-
|
|
236
|
-
# Cache miss - execute query
|
|
237
|
-
@cache_stats[:misses] += 1
|
|
238
|
-
result = search_fulltext_uncached(timeframe: timeframe, query: query, limit: limit)
|
|
239
|
-
|
|
240
|
-
# Store in cache
|
|
241
|
-
@query_cache[cache_key] = result
|
|
242
|
-
result
|
|
243
243
|
end
|
|
244
244
|
|
|
245
245
|
# Hybrid search (full-text + vector)
|
|
@@ -249,29 +249,13 @@ class HTM
|
|
|
249
249
|
# @param limit [Integer] Maximum results
|
|
250
250
|
# @param embedding_service [Object] Service to generate embeddings
|
|
251
251
|
# @param prefilter_limit [Integer] Candidates to consider (default: 100)
|
|
252
|
+
# @param metadata [Hash] Filter by metadata fields (default: {})
|
|
252
253
|
# @return [Array<Hash>] Matching nodes
|
|
253
254
|
#
|
|
254
|
-
def search_hybrid(timeframe:, query:, limit:, embedding_service:, prefilter_limit: 100)
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
# Generate cache key
|
|
259
|
-
cache_key = cache_key_for(:hybrid, timeframe, query, limit, prefilter_limit)
|
|
260
|
-
|
|
261
|
-
# Try to get from cache
|
|
262
|
-
cached = @query_cache[cache_key]
|
|
263
|
-
if cached
|
|
264
|
-
@cache_stats[:hits] += 1
|
|
265
|
-
return cached
|
|
255
|
+
def search_hybrid(timeframe:, query:, limit:, embedding_service:, prefilter_limit: 100, metadata: {})
|
|
256
|
+
cached_query(:hybrid, timeframe, query, limit, prefilter_limit, metadata) do
|
|
257
|
+
search_hybrid_uncached(timeframe: timeframe, query: query, limit: limit, embedding_service: embedding_service, prefilter_limit: prefilter_limit, metadata: metadata)
|
|
266
258
|
end
|
|
267
|
-
|
|
268
|
-
# Cache miss - execute query
|
|
269
|
-
@cache_stats[:misses] += 1
|
|
270
|
-
result = search_hybrid_uncached(timeframe: timeframe, query: query, limit: limit, embedding_service: embedding_service, prefilter_limit: prefilter_limit)
|
|
271
|
-
|
|
272
|
-
# Store in cache
|
|
273
|
-
@query_cache[cache_key] = result
|
|
274
|
-
result
|
|
275
259
|
end
|
|
276
260
|
|
|
277
261
|
# Add a tag to a node
|
|
@@ -292,15 +276,19 @@ class HTM
|
|
|
292
276
|
|
|
293
277
|
# Mark nodes as evicted from working memory
|
|
294
278
|
#
|
|
295
|
-
#
|
|
296
|
-
#
|
|
297
|
-
# tracking. This method is retained for API compatibility but is a no-op.
|
|
279
|
+
# Sets working_memory = false on the robot_nodes join table for the specified
|
|
280
|
+
# robot and node IDs.
|
|
298
281
|
#
|
|
299
|
-
# @param
|
|
282
|
+
# @param robot_id [Integer] Robot ID whose working memory is being evicted
|
|
283
|
+
# @param node_ids [Array<Integer>] Node IDs to mark as evicted
|
|
300
284
|
# @return [void]
|
|
301
285
|
#
|
|
302
|
-
def mark_evicted(node_ids)
|
|
303
|
-
|
|
286
|
+
def mark_evicted(robot_id:, node_ids:)
|
|
287
|
+
return if node_ids.empty?
|
|
288
|
+
|
|
289
|
+
HTM::Models::RobotNode
|
|
290
|
+
.where(robot_id: robot_id, node_id: node_ids)
|
|
291
|
+
.update_all(working_memory: false)
|
|
304
292
|
end
|
|
305
293
|
|
|
306
294
|
# Track access for multiple nodes (bulk operation)
|
|
@@ -371,6 +359,17 @@ class HTM
|
|
|
371
359
|
# This method kept for API compatibility
|
|
372
360
|
end
|
|
373
361
|
|
|
362
|
+
# Clear the query cache
|
|
363
|
+
#
|
|
364
|
+
# Call this after any operation that modifies data (soft delete, restore, etc.)
|
|
365
|
+
# to ensure subsequent queries see fresh results.
|
|
366
|
+
#
|
|
367
|
+
# @return [void]
|
|
368
|
+
#
|
|
369
|
+
def clear_cache!
|
|
370
|
+
invalidate_cache!
|
|
371
|
+
end
|
|
372
|
+
|
|
374
373
|
# For backwards compatibility with tests/code that expect pool_size
|
|
375
374
|
def pool_size
|
|
376
375
|
ActiveRecord::Base.connection_pool.size
|
|
@@ -421,19 +420,24 @@ class HTM
|
|
|
421
420
|
# @return [Array<Hash>] Topic relationships
|
|
422
421
|
#
|
|
423
422
|
def topic_relationships(min_shared_nodes: 2, limit: 50)
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
423
|
+
# Use parameterized query to prevent SQL injection
|
|
424
|
+
sql = <<~SQL
|
|
425
|
+
SELECT t1.name AS topic1, t2.name AS topic2, COUNT(DISTINCT nt1.node_id) AS shared_nodes
|
|
426
|
+
FROM tags t1
|
|
427
|
+
JOIN node_tags nt1 ON t1.id = nt1.tag_id
|
|
428
|
+
JOIN node_tags nt2 ON nt1.node_id = nt2.node_id
|
|
429
|
+
JOIN tags t2 ON nt2.tag_id = t2.id
|
|
430
|
+
WHERE t1.name < t2.name
|
|
431
|
+
GROUP BY t1.name, t2.name
|
|
432
|
+
HAVING COUNT(DISTINCT nt1.node_id) >= $1
|
|
433
|
+
ORDER BY shared_nodes DESC
|
|
434
|
+
LIMIT $2
|
|
435
|
+
SQL
|
|
436
|
+
|
|
437
|
+
result = ActiveRecord::Base.connection.exec_query(
|
|
438
|
+
sql,
|
|
439
|
+
'topic_relationships',
|
|
440
|
+
[[nil, min_shared_nodes.to_i], [nil, limit.to_i]]
|
|
437
441
|
)
|
|
438
442
|
result.to_a
|
|
439
443
|
end
|
|
@@ -462,9 +466,10 @@ class HTM
|
|
|
462
466
|
# @param node [Hash] Node data with similarity, tags, created_at, access_count
|
|
463
467
|
# @param query_tags [Array<String>] Tags associated with the query
|
|
464
468
|
# @param vector_similarity [Float, nil] Pre-computed vector similarity (0-1)
|
|
469
|
+
# @param node_tags [Array<String>, nil] Pre-loaded tags for this node (avoids N+1 query)
|
|
465
470
|
# @return [Float] Composite relevance score (0-10)
|
|
466
471
|
#
|
|
467
|
-
def calculate_relevance(node:, query_tags: [], vector_similarity: nil)
|
|
472
|
+
def calculate_relevance(node:, query_tags: [], vector_similarity: nil, node_tags: nil)
|
|
468
473
|
# 1. Vector similarity (semantic match) - weight: 0.5
|
|
469
474
|
semantic_score = if vector_similarity
|
|
470
475
|
vector_similarity
|
|
@@ -475,7 +480,8 @@ class HTM
|
|
|
475
480
|
end
|
|
476
481
|
|
|
477
482
|
# 2. Tag overlap (categorical relevance) - weight: 0.3
|
|
478
|
-
|
|
483
|
+
# Use pre-loaded tags if provided, otherwise fetch (for backward compatibility)
|
|
484
|
+
node_tags ||= get_node_tags(node['id'])
|
|
479
485
|
tag_score = if query_tags.any? && node_tags.any?
|
|
480
486
|
weighted_hierarchical_jaccard(query_tags, node_tags)
|
|
481
487
|
else
|
|
@@ -505,41 +511,48 @@ class HTM
|
|
|
505
511
|
#
|
|
506
512
|
# Returns nodes with calculated relevance scores based on query context
|
|
507
513
|
#
|
|
508
|
-
# @param timeframe [Range] Time range to search
|
|
514
|
+
# @param timeframe [nil, Range, Array<Range>] Time range(s) to search (nil = no filter)
|
|
509
515
|
# @param query [String, nil] Search query
|
|
510
516
|
# @param query_tags [Array<String>] Tags to match
|
|
511
517
|
# @param limit [Integer] Maximum results
|
|
512
518
|
# @param embedding_service [Object, nil] Service to generate embeddings
|
|
519
|
+
# @param metadata [Hash] Filter by metadata fields (default: {})
|
|
513
520
|
# @return [Array<Hash>] Nodes with relevance scores
|
|
514
521
|
#
|
|
515
|
-
def search_with_relevance(timeframe:, query: nil, query_tags: [], limit: 20, embedding_service: nil)
|
|
522
|
+
def search_with_relevance(timeframe:, query: nil, query_tags: [], limit: 20, embedding_service: nil, metadata: {})
|
|
516
523
|
# Get candidates from appropriate search method
|
|
517
524
|
candidates = if query && embedding_service
|
|
518
525
|
# Vector search
|
|
519
|
-
search_uncached(timeframe: timeframe, query: query, limit: limit * 2, embedding_service: embedding_service)
|
|
526
|
+
search_uncached(timeframe: timeframe, query: query, limit: limit * 2, embedding_service: embedding_service, metadata: metadata)
|
|
520
527
|
elsif query
|
|
521
528
|
# Full-text search
|
|
522
|
-
search_fulltext_uncached(timeframe: timeframe, query: query, limit: limit * 2)
|
|
529
|
+
search_fulltext_uncached(timeframe: timeframe, query: query, limit: limit * 2, metadata: metadata)
|
|
523
530
|
else
|
|
524
|
-
# Time-range only
|
|
525
|
-
HTM::Models::Node
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
.map(&:attributes)
|
|
531
|
+
# Time-range only (or no filter if timeframe is nil)
|
|
532
|
+
scope = HTM::Models::Node.where(deleted_at: nil)
|
|
533
|
+
scope = apply_timeframe_scope(scope, timeframe)
|
|
534
|
+
scope = apply_metadata_scope(scope, metadata)
|
|
535
|
+
scope.order(created_at: :desc).limit(limit * 2).map(&:attributes)
|
|
530
536
|
end
|
|
531
537
|
|
|
538
|
+
# Batch load all tags for candidates (fixes N+1 query)
|
|
539
|
+
node_ids = candidates.map { |n| n['id'] }
|
|
540
|
+
tags_by_node = batch_load_node_tags(node_ids)
|
|
541
|
+
|
|
532
542
|
# Calculate relevance for each candidate
|
|
533
543
|
scored_nodes = candidates.map do |node|
|
|
544
|
+
node_tags = tags_by_node[node['id']] || []
|
|
545
|
+
|
|
534
546
|
relevance = calculate_relevance(
|
|
535
547
|
node: node,
|
|
536
548
|
query_tags: query_tags,
|
|
537
|
-
vector_similarity: node['similarity']&.to_f
|
|
549
|
+
vector_similarity: node['similarity']&.to_f,
|
|
550
|
+
node_tags: node_tags
|
|
538
551
|
)
|
|
539
552
|
|
|
540
553
|
node.merge({
|
|
541
554
|
'relevance' => relevance,
|
|
542
|
-
'tags' =>
|
|
555
|
+
'tags' => node_tags
|
|
543
556
|
})
|
|
544
557
|
end
|
|
545
558
|
|
|
@@ -559,10 +572,32 @@ class HTM
|
|
|
559
572
|
.joins(:node_tags)
|
|
560
573
|
.where(node_tags: { node_id: node_id })
|
|
561
574
|
.pluck(:name)
|
|
562
|
-
rescue
|
|
575
|
+
rescue ActiveRecord::ActiveRecordError => e
|
|
576
|
+
HTM.logger.error("Failed to retrieve tags for node #{node_id}: #{e.message}")
|
|
563
577
|
[]
|
|
564
578
|
end
|
|
565
579
|
|
|
580
|
+
# Batch load tags for multiple nodes (avoids N+1 queries)
|
|
581
|
+
#
|
|
582
|
+
# @param node_ids [Array<Integer>] Node database IDs
|
|
583
|
+
# @return [Hash<Integer, Array<String>>] Map of node_id to array of tag names
|
|
584
|
+
#
|
|
585
|
+
def batch_load_node_tags(node_ids)
|
|
586
|
+
return {} if node_ids.empty?
|
|
587
|
+
|
|
588
|
+
# Single query to get all tags for all nodes
|
|
589
|
+
results = HTM::Models::NodeTag
|
|
590
|
+
.joins(:tag)
|
|
591
|
+
.where(node_id: node_ids)
|
|
592
|
+
.pluck(:node_id, 'tags.name')
|
|
593
|
+
|
|
594
|
+
# Group by node_id
|
|
595
|
+
results.group_by(&:first).transform_values { |pairs| pairs.map(&:last) }
|
|
596
|
+
rescue ActiveRecord::ActiveRecordError => e
|
|
597
|
+
HTM.logger.error("Failed to batch load tags: #{e.message}")
|
|
598
|
+
{}
|
|
599
|
+
end
|
|
600
|
+
|
|
566
601
|
# Search nodes by tags
|
|
567
602
|
#
|
|
568
603
|
# @param tags [Array<String>] Tags to search for
|
|
@@ -593,16 +628,22 @@ class HTM
|
|
|
593
628
|
# Get results
|
|
594
629
|
nodes = query.limit(limit).map(&:attributes)
|
|
595
630
|
|
|
631
|
+
# Batch load all tags for nodes (fixes N+1 query)
|
|
632
|
+
node_ids = nodes.map { |n| n['id'] }
|
|
633
|
+
tags_by_node = batch_load_node_tags(node_ids)
|
|
634
|
+
|
|
596
635
|
# Calculate relevance and enrich with tags
|
|
597
636
|
nodes.map do |node|
|
|
637
|
+
node_tags = tags_by_node[node['id']] || []
|
|
598
638
|
relevance = calculate_relevance(
|
|
599
639
|
node: node,
|
|
600
|
-
query_tags: tags
|
|
640
|
+
query_tags: tags,
|
|
641
|
+
node_tags: node_tags
|
|
601
642
|
)
|
|
602
643
|
|
|
603
644
|
node.merge({
|
|
604
645
|
'relevance' => relevance,
|
|
605
|
-
'tags' =>
|
|
646
|
+
'tags' => node_tags
|
|
606
647
|
})
|
|
607
648
|
end.sort_by { |n| -n['relevance'] }
|
|
608
649
|
end
|
|
@@ -633,43 +674,207 @@ class HTM
|
|
|
633
674
|
# Searches the tags table for tags where any hierarchy level matches
|
|
634
675
|
# query words. For example, query "PostgreSQL database" would match
|
|
635
676
|
# tags like "database:postgresql", "database:sql", etc.
|
|
677
|
+
# Find tags matching a query using semantic extraction
|
|
636
678
|
#
|
|
637
679
|
# @param query [String] Search query
|
|
638
|
-
# @
|
|
680
|
+
# @param include_extracted [Boolean] If true, returns hash with :extracted and :matched keys
|
|
681
|
+
# @return [Array<String>] Matching tag names (default)
|
|
682
|
+
# @return [Hash] If include_extracted: { extracted: [...], matched: [...] }
|
|
639
683
|
#
|
|
640
|
-
def find_query_matching_tags(query)
|
|
641
|
-
|
|
684
|
+
def find_query_matching_tags(query, include_extracted: false)
|
|
685
|
+
empty_result = include_extracted ? { extracted: [], matched: [] } : []
|
|
686
|
+
return empty_result if query.nil? || query.strip.empty?
|
|
642
687
|
|
|
643
|
-
#
|
|
644
|
-
|
|
645
|
-
|
|
688
|
+
# Use the tag extractor to generate semantic tags from the query
|
|
689
|
+
# This uses the same LLM process as when storing nodes
|
|
690
|
+
existing_tags = HTM::Models::Tag.pluck(:name).sample(50)
|
|
691
|
+
extracted_tags = HTM::TagService.extract(query, existing_ontology: existing_tags)
|
|
646
692
|
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
values = words.map { |w| "%#{w}%" }
|
|
693
|
+
if extracted_tags.empty?
|
|
694
|
+
return include_extracted ? { extracted: [], matched: [] } : []
|
|
695
|
+
end
|
|
651
696
|
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
697
|
+
# Step 1: Try exact matches
|
|
698
|
+
exact_matches = HTM::Models::Tag.where(name: extracted_tags).pluck(:name)
|
|
699
|
+
|
|
700
|
+
if exact_matches.any?
|
|
701
|
+
return include_extracted ? { extracted: extracted_tags, matched: exact_matches } : exact_matches
|
|
702
|
+
end
|
|
703
|
+
|
|
704
|
+
# Step 2: Try matching on parent/prefix levels
|
|
705
|
+
# For "person:human:character:popeye", try "person:human:character", "person:human", "person"
|
|
706
|
+
prefix_candidates = extracted_tags.flat_map do |tag|
|
|
707
|
+
levels = tag.split(':')
|
|
708
|
+
(1...levels.size).map { |i| levels[0, i].join(':') }
|
|
709
|
+
end.uniq
|
|
710
|
+
|
|
711
|
+
if prefix_candidates.any?
|
|
712
|
+
prefix_matches = HTM::Models::Tag.where(name: prefix_candidates).pluck(:name)
|
|
713
|
+
if prefix_matches.any?
|
|
714
|
+
return include_extracted ? { extracted: extracted_tags, matched: prefix_matches } : prefix_matches
|
|
715
|
+
end
|
|
716
|
+
end
|
|
717
|
+
|
|
718
|
+
# Step 3: Try matching individual components, starting from rightmost (most specific)
|
|
719
|
+
# For "person:human:character:popeye", try "popeye", then "character", then "human", then "person"
|
|
720
|
+
# Search for tags that contain this component at any level
|
|
721
|
+
all_components = extracted_tags.flat_map { |tag| tag.split(':') }.uniq
|
|
722
|
+
|
|
723
|
+
# Order by specificity: components that appear at deeper levels first
|
|
724
|
+
component_depths = Hash.new(0)
|
|
725
|
+
extracted_tags.each do |tag|
|
|
726
|
+
levels = tag.split(':')
|
|
727
|
+
levels.each_with_index { |comp, idx| component_depths[comp] = [component_depths[comp], idx].max }
|
|
728
|
+
end
|
|
729
|
+
ordered_components = all_components.sort_by { |c| -component_depths[c] }
|
|
730
|
+
|
|
731
|
+
# Try each component, starting with most specific (rightmost)
|
|
732
|
+
ordered_components.each do |component|
|
|
733
|
+
# Find tags where this component appears at any level
|
|
734
|
+
component_matches = HTM::Models::Tag
|
|
735
|
+
.where("name = ? OR name LIKE ? OR name LIKE ? OR name LIKE ?",
|
|
736
|
+
component, # exact match (single-level tag)
|
|
737
|
+
"#{component}:%", # starts with component
|
|
738
|
+
"%:#{component}", # ends with component
|
|
739
|
+
"%:#{component}:%") # component in middle
|
|
740
|
+
.pluck(:name)
|
|
741
|
+
|
|
742
|
+
if component_matches.any?
|
|
743
|
+
return include_extracted ? { extracted: extracted_tags, matched: component_matches } : component_matches
|
|
744
|
+
end
|
|
745
|
+
end
|
|
746
|
+
|
|
747
|
+
# No matches found at any level
|
|
748
|
+
include_extracted ? { extracted: extracted_tags, matched: [] } : []
|
|
655
749
|
end
|
|
656
750
|
|
|
657
751
|
private
|
|
658
752
|
|
|
753
|
+
# Serialize an embedding as a PostgreSQL vector literal.
#
# Every element must be a real number whose Float conversion is finite, so
# only float-formatted text is ever emitted. The finiteness check is made on
# the converted value: an Integer too large for Float (e.g. 10**400) answers
# true to #finite? yet converts to Infinity.
#
# @param embedding [Array<Numeric>] Embedding vector
# @return [String] Vector string for PostgreSQL (e.g., "[0.1,0.2,0.3]")
# @raise [ArgumentError] If embedding is not an Array of finite real numbers
#
def sanitize_embedding_for_sql(embedding)
  raise ArgumentError, "Embedding must be an array of finite numeric values" unless embedding.is_a?(Array)

  floats = embedding.map do |value|
    # Complex is Numeric, but Complex#to_f raises RangeError unless the
    # imaginary part is exactly zero, so only real numbers are converted.
    number = value.is_a?(Numeric) && value.real? ? value.to_f : nil
    raise ArgumentError, "Embedding must be an array of finite numeric values" unless number&.finite?

    number
  end

  "[#{floats.join(',')}]"
end
|
|
769
|
+
|
|
770
|
+
# Build a raw SQL condition restricting created_at to one or more time ranges.
#
# A Range yields a single BETWEEN clause; an Array of Ranges yields the
# BETWEEN clauses OR-ed together. Range endpoints are rendered with #iso8601
# and escaped through the connection's #quote.
#
# @param timeframe [nil, Range, Array<Range>] Time range(s)
# @param table_alias [String, nil] Table alias used to qualify the column (default: none)
# @return [String, nil] SQL condition, or nil when no filter applies
#   (nil, an empty Array, or an unsupported type)
#
def build_timeframe_condition(timeframe, table_alias: nil)
  return nil if timeframe.nil?

  column = table_alias ? "#{table_alias}.created_at" : "created_at"
  conn = ActiveRecord::Base.connection
  between = lambda do |range|
    "(#{column} BETWEEN #{conn.quote(range.begin.iso8601)} AND #{conn.quote(range.end.iso8601)})"
  end

  case timeframe
  when Range
    between.call(timeframe)
  when Array
    # An empty list would render as "()", which is a SQL syntax error once it
    # is spliced into a WHERE clause; treat it as "no filter" instead.
    return nil if timeframe.empty?

    "(#{timeframe.map(&between).join(' OR ')})"
  end
end
|
|
800
|
+
|
|
801
|
+
# Restrict a scope to rows whose created_at falls within the timeframe.
#
# A Range becomes a single where(created_at: range); an Array of Ranges
# becomes one such clause per range, combined with #or.
#
# @param scope [ActiveRecord::Relation] Base scope
# @param timeframe [nil, Range, Array<Range>] Time range(s)
# @return [ActiveRecord::Relation] Scoped query; the scope itself when the
#   timeframe is nil, an empty Array, or an unsupported type
#
def apply_timeframe_scope(scope, timeframe)
  case timeframe
  when Range
    scope.where(created_at: timeframe)
  when Array
    clauses = timeframe.map { |range| scope.where(created_at: range) }
    # reduce on an empty list returns nil; hand back the unfiltered scope so
    # the caller never receives nil.
    clauses.reduce { |combined, clause| combined.or(clause) } || scope
  else
    scope
  end
end
|
|
821
|
+
|
|
822
|
+
# Build a raw SQL condition that filters on the metadata column.
#
# The filter is serialized with #to_json, escaped through the connection's
# #quote, cast to jsonb and compared with the containment operator (@>).
#
# @param metadata [Hash] Metadata to filter by
# @param table_alias [String] Table alias (default: none)
# @return [String, nil] SQL condition or nil for no filter
#
def build_metadata_condition(metadata, table_alias: nil)
  return nil if metadata.nil? || metadata.empty?

  qualified_column = table_alias ? "#{table_alias}.metadata" : "metadata"
  json_literal = ActiveRecord::Base.connection.quote(metadata.to_json)

  "(#{qualified_column} @> #{json_literal}::jsonb)"
end
|
|
840
|
+
|
|
841
|
+
# Narrow a scope to rows whose metadata contains the given key/value pairs.
#
# The filter is passed as a bind parameter (JSON text cast to jsonb) to a
# where clause using the containment operator (@>).
#
# @param scope [ActiveRecord::Relation] Base scope
# @param metadata [Hash] Metadata to filter by
# @return [ActiveRecord::Relation] Scoped query (the scope itself when
#   metadata is nil or empty)
#
def apply_metadata_scope(scope, metadata)
  if metadata.nil? || metadata.empty?
    scope
  else
    scope.where("metadata @> ?::jsonb", metadata.to_json)
  end
end
|
|
853
|
+
|
|
659
854
|
# Generate cache key for query
|
|
660
855
|
#
|
|
661
856
|
# @param method [Symbol] Search method name
|
|
662
|
-
# @param timeframe [Range] Time range
|
|
857
|
+
# @param timeframe [nil, Range, Array<Range>] Time range(s)
|
|
663
858
|
# @param query [String] Search query
|
|
664
859
|
# @param limit [Integer] Result limit
|
|
665
860
|
# @param args [Array] Additional arguments
|
|
666
861
|
# @return [String] Cache key
|
|
667
862
|
#
|
|
668
863
|
def cache_key_for(method, timeframe, query, limit, *args)
|
|
864
|
+
timeframe_key = case timeframe
|
|
865
|
+
when nil
|
|
866
|
+
"nil"
|
|
867
|
+
when Range
|
|
868
|
+
"#{timeframe.begin.to_i}-#{timeframe.end.to_i}"
|
|
869
|
+
when Array
|
|
870
|
+
timeframe.map { |r| "#{r.begin.to_i}-#{r.end.to_i}" }.join(',')
|
|
871
|
+
else
|
|
872
|
+
timeframe.to_s
|
|
873
|
+
end
|
|
874
|
+
|
|
669
875
|
key_parts = [
|
|
670
876
|
method,
|
|
671
|
-
|
|
672
|
-
timeframe.end.to_i,
|
|
877
|
+
timeframe_key,
|
|
673
878
|
query,
|
|
674
879
|
limit,
|
|
675
880
|
*args
|
|
@@ -738,17 +943,41 @@ class HTM
|
|
|
738
943
|
@query_cache.clear if @query_cache
|
|
739
944
|
end
|
|
740
945
|
|
|
946
|
+
# Run a query through the query cache.
#
# When no cache is configured the block is simply executed. Otherwise the
# cache key is derived via cache_key_for; a truthy cached entry is returned
# and counted as a hit, while anything else is counted as a miss, computed by
# the block, stored under the key and returned.
#
# @param method [Symbol] Search method name for cache key
# @param args [Array] Arguments for cache key (timeframe, query, limit, etc.)
# @yield Block that executes the actual query
# @return [Array<Hash>] Query results (from cache or freshly executed)
#
def cached_query(method, *args, &block)
  return yield unless @query_cache

  key = cache_key_for(method, *args)
  hit = @query_cache[key]

  if hit
    @cache_stats_mutex.synchronize { @cache_stats[:hits] += 1 }
    hit
  else
    @cache_stats_mutex.synchronize { @cache_stats[:misses] += 1 }
    yield.tap { |fresh| @query_cache[key] = fresh }
  end
end
|
|
968
|
+
|
|
741
969
|
# Uncached vector similarity search
|
|
742
970
|
#
|
|
743
971
|
# Generates query embedding client-side and performs vector search in database.
|
|
744
972
|
#
|
|
745
|
-
# @param timeframe [Range] Time range to search
|
|
973
|
+
# @param timeframe [nil, Range, Array<Range>] Time range(s) to search (nil = no filter)
|
|
746
974
|
# @param query [String] Search query
|
|
747
975
|
# @param limit [Integer] Maximum results
|
|
748
976
|
# @param embedding_service [Object] Service to generate query embedding
|
|
977
|
+
# @param metadata [Hash] Filter by metadata fields (default: {})
|
|
749
978
|
# @return [Array<Hash>] Matching nodes
|
|
750
979
|
#
|
|
751
|
-
def search_uncached(timeframe:, query:, limit:, embedding_service:)
|
|
980
|
+
def search_uncached(timeframe:, query:, limit:, embedding_service:, metadata: {})
|
|
752
981
|
# Generate query embedding client-side
|
|
753
982
|
query_embedding = embedding_service.embed(query)
|
|
754
983
|
|
|
@@ -757,17 +986,29 @@ class HTM
|
|
|
757
986
|
query_embedding = query_embedding + Array.new(2000 - query_embedding.length, 0.0)
|
|
758
987
|
end
|
|
759
988
|
|
|
760
|
-
#
|
|
761
|
-
embedding_str =
|
|
989
|
+
# Sanitize embedding for safe SQL use (validates all values are numeric)
|
|
990
|
+
embedding_str = sanitize_embedding_for_sql(query_embedding)
|
|
991
|
+
|
|
992
|
+
# Build filter conditions
|
|
993
|
+
timeframe_condition = build_timeframe_condition(timeframe)
|
|
994
|
+
metadata_condition = build_metadata_condition(metadata)
|
|
995
|
+
|
|
996
|
+
conditions = ["embedding IS NOT NULL", "deleted_at IS NULL"]
|
|
997
|
+
conditions << timeframe_condition if timeframe_condition
|
|
998
|
+
conditions << metadata_condition if metadata_condition
|
|
999
|
+
|
|
1000
|
+
where_clause = "WHERE #{conditions.join(' AND ')}"
|
|
1001
|
+
|
|
1002
|
+
# Use quote to safely escape the embedding string in the query
|
|
1003
|
+
quoted_embedding = ActiveRecord::Base.connection.quote(embedding_str)
|
|
762
1004
|
|
|
763
1005
|
result = ActiveRecord::Base.connection.select_all(
|
|
764
1006
|
<<~SQL,
|
|
765
1007
|
SELECT id, content, access_count, created_at, token_count,
|
|
766
|
-
1 - (embedding <=>
|
|
1008
|
+
1 - (embedding <=> #{quoted_embedding}::vector) as similarity
|
|
767
1009
|
FROM nodes
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
ORDER BY embedding <=> '#{embedding_str}'::vector
|
|
1010
|
+
#{where_clause}
|
|
1011
|
+
ORDER BY embedding <=> #{quoted_embedding}::vector
|
|
771
1012
|
LIMIT #{limit.to_i}
|
|
772
1013
|
SQL
|
|
773
1014
|
)
|
|
@@ -781,24 +1022,35 @@ class HTM
|
|
|
781
1022
|
|
|
782
1023
|
# Uncached full-text search
|
|
783
1024
|
#
|
|
784
|
-
# @param timeframe [Range] Time range to search
|
|
1025
|
+
# @param timeframe [nil, Range, Array<Range>] Time range(s) to search (nil = no filter)
|
|
785
1026
|
# @param query [String] Search query
|
|
786
1027
|
# @param limit [Integer] Maximum results
|
|
1028
|
+
# @param metadata [Hash] Filter by metadata fields (default: {})
|
|
787
1029
|
# @return [Array<Hash>] Matching nodes
|
|
788
1030
|
#
|
|
789
|
-
def search_fulltext_uncached(timeframe:, query:, limit:)
|
|
1031
|
+
def search_fulltext_uncached(timeframe:, query:, limit:, metadata: {})
|
|
1032
|
+
# Build filter conditions
|
|
1033
|
+
timeframe_condition = build_timeframe_condition(timeframe)
|
|
1034
|
+
metadata_condition = build_metadata_condition(metadata)
|
|
1035
|
+
|
|
1036
|
+
additional_conditions = []
|
|
1037
|
+
additional_conditions << timeframe_condition if timeframe_condition
|
|
1038
|
+
additional_conditions << metadata_condition if metadata_condition
|
|
1039
|
+
additional_sql = additional_conditions.any? ? "AND #{additional_conditions.join(' AND ')}" : ""
|
|
1040
|
+
|
|
790
1041
|
result = ActiveRecord::Base.connection.select_all(
|
|
791
1042
|
ActiveRecord::Base.sanitize_sql_array([
|
|
792
1043
|
<<~SQL,
|
|
793
1044
|
SELECT id, content, access_count, created_at, token_count,
|
|
794
1045
|
ts_rank(to_tsvector('english', content), plainto_tsquery('english', ?)) as rank
|
|
795
1046
|
FROM nodes
|
|
796
|
-
WHERE
|
|
1047
|
+
WHERE deleted_at IS NULL
|
|
797
1048
|
AND to_tsvector('english', content) @@ plainto_tsquery('english', ?)
|
|
1049
|
+
#{additional_sql}
|
|
798
1050
|
ORDER BY rank DESC
|
|
799
1051
|
LIMIT ?
|
|
800
1052
|
SQL
|
|
801
|
-
query,
|
|
1053
|
+
query, query, limit
|
|
802
1054
|
])
|
|
803
1055
|
)
|
|
804
1056
|
|
|
@@ -816,14 +1068,15 @@ class HTM
|
|
|
816
1068
|
# 2. Tag matching for categorical relevance
|
|
817
1069
|
# 3. Vector similarity for semantic ranking
|
|
818
1070
|
#
|
|
819
|
-
# @param timeframe [Range] Time range to search
|
|
1071
|
+
# @param timeframe [nil, Range, Array<Range>] Time range(s) to search (nil = no filter)
|
|
820
1072
|
# @param query [String] Search query
|
|
821
1073
|
# @param limit [Integer] Maximum results
|
|
822
1074
|
# @param embedding_service [Object] Service to generate query embedding
|
|
823
1075
|
# @param prefilter_limit [Integer] Candidates to consider
|
|
1076
|
+
# @param metadata [Hash] Filter by metadata fields (default: {})
|
|
824
1077
|
# @return [Array<Hash>] Matching nodes with similarity and tag_boost scores
|
|
825
1078
|
#
|
|
826
|
-
def search_hybrid_uncached(timeframe:, query:, limit:, embedding_service:, prefilter_limit:)
|
|
1079
|
+
def search_hybrid_uncached(timeframe:, query:, limit:, embedding_service:, prefilter_limit:, metadata: {})
|
|
827
1080
|
# Generate query embedding client-side
|
|
828
1081
|
query_embedding = embedding_service.embed(query)
|
|
829
1082
|
|
|
@@ -832,8 +1085,27 @@ class HTM
|
|
|
832
1085
|
query_embedding = query_embedding + Array.new(2000 - query_embedding.length, 0.0)
|
|
833
1086
|
end
|
|
834
1087
|
|
|
835
|
-
#
|
|
836
|
-
embedding_str =
|
|
1088
|
+
# Sanitize embedding for safe SQL use (validates all values are numeric)
|
|
1089
|
+
embedding_str = sanitize_embedding_for_sql(query_embedding)
|
|
1090
|
+
quoted_embedding = ActiveRecord::Base.connection.quote(embedding_str)
|
|
1091
|
+
|
|
1092
|
+
# Build filter conditions (with table alias for CTEs)
|
|
1093
|
+
timeframe_condition = build_timeframe_condition(timeframe, table_alias: 'n')
|
|
1094
|
+
metadata_condition = build_metadata_condition(metadata, table_alias: 'n')
|
|
1095
|
+
|
|
1096
|
+
additional_conditions = []
|
|
1097
|
+
additional_conditions << timeframe_condition if timeframe_condition
|
|
1098
|
+
additional_conditions << metadata_condition if metadata_condition
|
|
1099
|
+
additional_sql = additional_conditions.any? ? "AND #{additional_conditions.join(' AND ')}" : ""
|
|
1100
|
+
|
|
1101
|
+
# Same for non-aliased queries
|
|
1102
|
+
timeframe_condition_bare = build_timeframe_condition(timeframe)
|
|
1103
|
+
metadata_condition_bare = build_metadata_condition(metadata)
|
|
1104
|
+
|
|
1105
|
+
additional_conditions_bare = []
|
|
1106
|
+
additional_conditions_bare << timeframe_condition_bare if timeframe_condition_bare
|
|
1107
|
+
additional_conditions_bare << metadata_condition_bare if metadata_condition_bare
|
|
1108
|
+
additional_sql_bare = additional_conditions_bare.any? ? "AND #{additional_conditions_bare.join(' AND ')}" : ""
|
|
837
1109
|
|
|
838
1110
|
# Find tags that match query terms
|
|
839
1111
|
matching_tags = find_query_matching_tags(query)
|
|
@@ -843,10 +1115,7 @@ class HTM
|
|
|
843
1115
|
# NOTE: Hybrid search includes nodes without embeddings using a default
|
|
844
1116
|
# similarity score of 0.5. This allows newly created nodes to appear in
|
|
845
1117
|
# search results immediately (via fulltext matching) before their embeddings
|
|
846
|
-
# are generated by background jobs.
|
|
847
|
-
# (seconds) where async embedding generation hasn't completed yet.
|
|
848
|
-
# In production with longer timeframes, embeddings are typically ready
|
|
849
|
-
# within 1-5 seconds, so this fallback is rarely used.
|
|
1118
|
+
# are generated by background jobs.
|
|
850
1119
|
|
|
851
1120
|
if matching_tags.any?
|
|
852
1121
|
# Escape tag names for SQL
|
|
@@ -858,8 +1127,9 @@ class HTM
|
|
|
858
1127
|
-- Nodes matching full-text search (with or without embeddings)
|
|
859
1128
|
SELECT DISTINCT n.id, n.content, n.access_count, n.created_at, n.token_count, n.embedding
|
|
860
1129
|
FROM nodes n
|
|
861
|
-
WHERE n.
|
|
1130
|
+
WHERE n.deleted_at IS NULL
|
|
862
1131
|
AND to_tsvector('english', n.content) @@ plainto_tsquery('english', ?)
|
|
1132
|
+
#{additional_sql}
|
|
863
1133
|
LIMIT ?
|
|
864
1134
|
),
|
|
865
1135
|
tag_candidates AS (
|
|
@@ -868,8 +1138,9 @@ class HTM
|
|
|
868
1138
|
FROM nodes n
|
|
869
1139
|
JOIN node_tags nt ON nt.node_id = n.id
|
|
870
1140
|
JOIN tags t ON t.id = nt.tag_id
|
|
871
|
-
WHERE n.
|
|
1141
|
+
WHERE n.deleted_at IS NULL
|
|
872
1142
|
AND t.name IN (#{tag_list})
|
|
1143
|
+
#{additional_sql}
|
|
873
1144
|
LIMIT ?
|
|
874
1145
|
),
|
|
875
1146
|
all_candidates AS (
|
|
@@ -881,7 +1152,7 @@ class HTM
|
|
|
881
1152
|
SELECT
|
|
882
1153
|
ac.id, ac.content, ac.access_count, ac.created_at, ac.token_count,
|
|
883
1154
|
CASE
|
|
884
|
-
WHEN ac.embedding IS NOT NULL THEN 1 - (ac.embedding <=>
|
|
1155
|
+
WHEN ac.embedding IS NOT NULL THEN 1 - (ac.embedding <=> #{quoted_embedding}::vector)
|
|
885
1156
|
ELSE 0.5 -- Default similarity for nodes without embeddings
|
|
886
1157
|
END as similarity,
|
|
887
1158
|
COALESCE((
|
|
@@ -899,8 +1170,8 @@ class HTM
|
|
|
899
1170
|
ORDER BY combined_score DESC
|
|
900
1171
|
LIMIT ?
|
|
901
1172
|
SQL
|
|
902
|
-
|
|
903
|
-
|
|
1173
|
+
query, prefilter_limit,
|
|
1174
|
+
prefilter_limit,
|
|
904
1175
|
matching_tags.length.to_f,
|
|
905
1176
|
limit
|
|
906
1177
|
])
|
|
@@ -914,25 +1185,26 @@ class HTM
|
|
|
914
1185
|
WITH candidates AS (
|
|
915
1186
|
SELECT id, content, access_count, created_at, token_count, embedding
|
|
916
1187
|
FROM nodes
|
|
917
|
-
WHERE
|
|
1188
|
+
WHERE deleted_at IS NULL
|
|
918
1189
|
AND to_tsvector('english', content) @@ plainto_tsquery('english', ?)
|
|
1190
|
+
#{additional_sql_bare}
|
|
919
1191
|
LIMIT ?
|
|
920
1192
|
)
|
|
921
1193
|
SELECT id, content, access_count, created_at, token_count,
|
|
922
1194
|
CASE
|
|
923
|
-
WHEN embedding IS NOT NULL THEN 1 - (embedding <=>
|
|
1195
|
+
WHEN embedding IS NOT NULL THEN 1 - (embedding <=> #{quoted_embedding}::vector)
|
|
924
1196
|
ELSE 0.5 -- Default similarity for nodes without embeddings
|
|
925
1197
|
END as similarity,
|
|
926
1198
|
0.0 as tag_boost,
|
|
927
1199
|
CASE
|
|
928
|
-
WHEN embedding IS NOT NULL THEN 1 - (embedding <=>
|
|
1200
|
+
WHEN embedding IS NOT NULL THEN 1 - (embedding <=> #{quoted_embedding}::vector)
|
|
929
1201
|
ELSE 0.5 -- Default score for nodes without embeddings (fulltext matched)
|
|
930
1202
|
END as combined_score
|
|
931
1203
|
FROM candidates
|
|
932
1204
|
ORDER BY combined_score DESC
|
|
933
1205
|
LIMIT ?
|
|
934
1206
|
SQL
|
|
935
|
-
|
|
1207
|
+
query, prefilter_limit, limit
|
|
936
1208
|
])
|
|
937
1209
|
)
|
|
938
1210
|
end
|
|
@@ -969,160 +1241,5 @@ class HTM
|
|
|
969
1241
|
|
|
970
1242
|
[similarity, depth_weight]
|
|
971
1243
|
end
|
|
972
|
-
|
|
973
|
-
#######################################
|
|
974
|
-
=begin
|
|
975
|
-
|
|
976
|
-
# Enhanced hierarchical similarity (with term_bonus for deep term matches like "country-music")
|
|
977
|
-
# Replaces your private calculate_hierarchical_similarity
|
|
978
|
-
def calculate_hierarchical_similarity(tag_a, tag_b, max_depth: 5)
|
|
979
|
-
return [0.0, 1.0] if tag_a.empty? || tag_b.empty? # [similarity, weight]
|
|
980
|
-
|
|
981
|
-
parts_a = tag_a.split(':').reject(&:empty?)
|
|
982
|
-
parts_b = tag_b.split(':').reject(&:empty?)
|
|
983
|
-
return [0.0, 1.0] if parts_a.empty? || parts_b.empty?
|
|
984
|
-
|
|
985
|
-
# Prefix similarity
|
|
986
|
-
local_max = [parts_a.length, parts_b.length].max
|
|
987
|
-
common_levels = 0
|
|
988
|
-
(0...local_max).each do |i|
|
|
989
|
-
if i < parts_a.length && i < parts_b.length && parts_a[i] == parts_b[i]
|
|
990
|
-
common_levels += 1
|
|
991
|
-
else
|
|
992
|
-
break
|
|
993
|
-
end
|
|
994
|
-
end
|
|
995
|
-
prefix_sim = local_max > 0 ? common_levels.to_f / local_max : 0.0
|
|
996
|
-
|
|
997
|
-
# Term bonus: Shared terms weighted by avg depth
|
|
998
|
-
common_terms = parts_a.to_set & parts_b.to_set
|
|
999
|
-
term_bonus = 0.0
|
|
1000
|
-
common_terms.each do |term|
|
|
1001
|
-
depth_a = parts_a.index(term) + 1
|
|
1002
|
-
depth_b = parts_b.index(term) + 1
|
|
1003
|
-
avg_depth = (depth_a + depth_b) / 2.0
|
|
1004
|
-
depth_weight = avg_depth / max_depth.to_f
|
|
1005
|
-
term_bonus += depth_weight * 0.8 # Increased from 0.5 for more aggression
|
|
1006
|
-
end
|
|
1007
|
-
term_bonus = [1.0, term_bonus].min
|
|
1008
|
-
|
|
1009
|
-
# Combined similarity (your weight now favors deeper via local_max)
|
|
1010
|
-
sim = (prefix_sim + term_bonus) / 2.0
|
|
1011
|
-
weight = local_max.to_f / max_depth # Deeper = higher weight (flipped from your 1/max)
|
|
1012
|
-
|
|
1013
|
-
[sim, weight]
|
|
1014
|
-
end
|
|
1015
|
-
|
|
1016
|
-
# Enhanced weighted_hierarchical_jaccard (uses new similarity; adds max_pairs fallback)
|
|
1017
|
-
# Replaces your private weighted_hierarchical_jaccard
|
|
1018
|
-
def weighted_hierarchical_jaccard(set_a, set_b, max_depth: 5, max_pairs: 1000)
|
|
1019
|
-
return 0.0 if set_a.empty? || set_b.empty?
|
|
1020
|
-
|
|
1021
|
-
# Fallback to flat Jaccard for large sets (your jaccard_similarity)
|
|
1022
|
-
if set_a.size * set_b.size > max_pairs
|
|
1023
|
-
terms_a = set_a.flat_map { |tag| tag.split(':').reject(&:empty?) }.to_set
|
|
1024
|
-
terms_b = set_b.flat_map { |tag| tag.split(':').reject(&:empty?) }.to_set
|
|
1025
|
-
return jaccard_similarity(terms_a.to_a, terms_b.to_a)
|
|
1026
|
-
end
|
|
1027
|
-
|
|
1028
|
-
total_weighted_similarity = 0.0
|
|
1029
|
-
total_weights = 0.0
|
|
1030
|
-
set_a.each do |tag_a|
|
|
1031
|
-
set_b.each do |tag_b|
|
|
1032
|
-
similarity, weight = calculate_hierarchical_similarity(tag_a, tag_b, max_depth: max_depth)
|
|
1033
|
-
total_weighted_similarity += similarity * weight
|
|
1034
|
-
total_weights += weight
|
|
1035
|
-
end
|
|
1036
|
-
end
|
|
1037
|
-
total_weights > 0 ? total_weighted_similarity / total_weights : 0.0
|
|
1038
|
-
end
|
|
1039
|
-
|
|
1040
|
-
# Updated calculate_relevance (adds ont_weight param; scales to 0-100 option)
|
|
1041
|
-
# Enhances your existing method
|
|
1042
|
-
def calculate_relevance(node:, query_tags: [], vector_similarity: nil, ont_weight: 1.0, scale_to_100: false)
|
|
1043
|
-
# 1. Vector similarity (semantic) - weight: 0.5
|
|
1044
|
-
semantic_score = if vector_similarity
|
|
1045
|
-
vector_similarity
|
|
1046
|
-
elsif node['similarity']
|
|
1047
|
-
node['similarity'].to_f
|
|
1048
|
-
else
|
|
1049
|
-
0.5
|
|
1050
|
-
end
|
|
1051
|
-
|
|
1052
|
-
# 2. Tag overlap (ontology) - weight: 0.3, boosted by ont_weight
|
|
1053
|
-
node_tags = get_node_tags(node['id'])
|
|
1054
|
-
tag_score = if query_tags.any? && node_tags.any?
|
|
1055
|
-
weighted_hierarchical_jaccard(query_tags, node_tags) * ont_weight
|
|
1056
|
-
else
|
|
1057
|
-
0.5
|
|
1058
|
-
end
|
|
1059
|
-
tag_score = [tag_score, 1.0].min # Cap boosted score
|
|
1060
|
-
|
|
1061
|
-
# 3. Recency - weight: 0.1
|
|
1062
|
-
age_hours = (Time.current - Time.parse(node['created_at'].to_s)) / 3600.0
|
|
1063
|
-
recency_score = Math.exp(-age_hours / 168.0)
|
|
1064
|
-
|
|
1065
|
-
# 4. Access frequency - weight: 0.1
|
|
1066
|
-
access_count = node['access_count'] || 0
|
|
1067
|
-
access_score = Math.log(1 + access_count) / 10.0
|
|
1068
|
-
|
|
1069
|
-
# Weighted composite (0-10 base)
|
|
1070
|
-
relevance_0_10 = (
|
|
1071
|
-
(semantic_score * 0.5) +
|
|
1072
|
-
(tag_score * 0.3) +
|
|
1073
|
-
(recency_score * 0.1) +
|
|
1074
|
-
(access_score * 0.1)
|
|
1075
|
-
).clamp(0.0, 10.0)
|
|
1076
|
-
|
|
1077
|
-
# Scale to 0-100 if requested
|
|
1078
|
-
final_relevance = scale_to_100 ? (relevance_0_10 * 10.0).round(2) : relevance_0_10
|
|
1079
|
-
|
|
1080
|
-
final_relevance
|
|
1081
|
-
end
|
|
1082
|
-
|
|
1083
|
-
# Updated search_with_relevance (adds threshold: for 0-100 filtering; ont_weight)
|
|
1084
|
-
# Enhances your existing method
|
|
1085
|
-
def search_with_relevance(timeframe:, query: nil, query_tags: [], limit: 20, embedding_service: nil, threshold: nil, ont_weight: 1.0, scale_to_100: true)
|
|
1086
|
-
# Get candidates (your logic)
|
|
1087
|
-
candidates = if query && embedding_service
|
|
1088
|
-
search_uncached(timeframe: timeframe, query: query, limit: limit * 3, embedding_service: embedding_service) # Oversample more for thresholds
|
|
1089
|
-
elsif query
|
|
1090
|
-
search_fulltext_uncached(timeframe: timeframe, query: query, limit: limit * 3)
|
|
1091
|
-
else
|
|
1092
|
-
HTM::Models::Node
|
|
1093
|
-
.where(created_at: timeframe)
|
|
1094
|
-
.order(created_at: :desc)
|
|
1095
|
-
.limit(limit * 3)
|
|
1096
|
-
.map(&:attributes)
|
|
1097
|
-
end
|
|
1098
|
-
|
|
1099
|
-
# Score and enrich
|
|
1100
|
-
scored_nodes = candidates.map do |node|
|
|
1101
|
-
relevance = calculate_relevance(
|
|
1102
|
-
node: node,
|
|
1103
|
-
query_tags: query_tags,
|
|
1104
|
-
vector_similarity: node['similarity']&.to_f,
|
|
1105
|
-
ont_weight: ont_weight,
|
|
1106
|
-
scale_to_100: scale_to_100
|
|
1107
|
-
)
|
|
1108
|
-
node.merge({
|
|
1109
|
-
'relevance' => relevance,
|
|
1110
|
-
'tags' => get_node_tags(node['id'])
|
|
1111
|
-
})
|
|
1112
|
-
end
|
|
1113
|
-
|
|
1114
|
-
# Filter by threshold if provided (e.g., >=80 for 0-100 scale)
|
|
1115
|
-
scored_nodes = scored_nodes.select { |n| threshold.nil? || n['relevance'] >= threshold }
|
|
1116
|
-
|
|
1117
|
-
# Sort by relevance DESC, take limit (or all if threshold used)
|
|
1118
|
-
scored_nodes
|
|
1119
|
-
.sort_by { |n| -n['relevance'] }
|
|
1120
|
-
.take(limit)
|
|
1121
|
-
end
|
|
1122
|
-
|
|
1123
|
-
=end
|
|
1124
|
-
|
|
1125
|
-
|
|
1126
|
-
|
|
1127
1244
|
end
|
|
1128
1245
|
end
|