htm 0.0.1 → 0.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.aigcm_msg +1 -0
- data/.architecture/reviews/comprehensive-codebase-review.md +577 -0
- data/.claude/settings.local.json +92 -0
- data/.envrc +1 -0
- data/.irbrc +283 -80
- data/.tbls.yml +31 -0
- data/CHANGELOG.md +314 -16
- data/CLAUDE.md +603 -0
- data/README.md +76 -5
- data/Rakefile +5 -0
- data/SETUP.md +132 -101
- data/db/migrate/{20250101000001_enable_extensions.rb → 00001_enable_extensions.rb} +0 -1
- data/db/migrate/00002_create_robots.rb +11 -0
- data/db/migrate/00003_create_file_sources.rb +20 -0
- data/db/migrate/00004_create_nodes.rb +65 -0
- data/db/migrate/00005_create_tags.rb +13 -0
- data/db/migrate/00006_create_node_tags.rb +18 -0
- data/db/migrate/00007_create_robot_nodes.rb +26 -0
- data/db/migrate/00009_add_working_memory_to_robot_nodes.rb +12 -0
- data/db/schema.sql +390 -36
- data/docs/api/database.md +19 -232
- data/docs/api/embedding-service.md +1 -7
- data/docs/api/htm.md +305 -364
- data/docs/api/index.md +1 -7
- data/docs/api/long-term-memory.md +342 -590
- data/docs/api/yard/HTM/ActiveRecordConfig.md +23 -0
- data/docs/api/yard/HTM/AuthorizationError.md +11 -0
- data/docs/api/yard/HTM/CircuitBreaker.md +92 -0
- data/docs/api/yard/HTM/CircuitBreakerOpenError.md +34 -0
- data/docs/api/yard/HTM/Configuration.md +175 -0
- data/docs/api/yard/HTM/Database.md +99 -0
- data/docs/api/yard/HTM/DatabaseError.md +14 -0
- data/docs/api/yard/HTM/EmbeddingError.md +18 -0
- data/docs/api/yard/HTM/EmbeddingService.md +58 -0
- data/docs/api/yard/HTM/Error.md +11 -0
- data/docs/api/yard/HTM/JobAdapter.md +39 -0
- data/docs/api/yard/HTM/LongTermMemory.md +342 -0
- data/docs/api/yard/HTM/NotFoundError.md +17 -0
- data/docs/api/yard/HTM/Observability.md +107 -0
- data/docs/api/yard/HTM/QueryTimeoutError.md +19 -0
- data/docs/api/yard/HTM/Railtie.md +27 -0
- data/docs/api/yard/HTM/ResourceExhaustedError.md +13 -0
- data/docs/api/yard/HTM/TagError.md +18 -0
- data/docs/api/yard/HTM/TagService.md +67 -0
- data/docs/api/yard/HTM/Timeframe/Result.md +24 -0
- data/docs/api/yard/HTM/Timeframe.md +40 -0
- data/docs/api/yard/HTM/TimeframeExtractor/Result.md +24 -0
- data/docs/api/yard/HTM/TimeframeExtractor.md +45 -0
- data/docs/api/yard/HTM/ValidationError.md +20 -0
- data/docs/api/yard/HTM/WorkingMemory.md +131 -0
- data/docs/api/yard/HTM.md +80 -0
- data/docs/api/yard/index.csv +179 -0
- data/docs/api/yard-reference.md +51 -0
- data/docs/architecture/adrs/001-postgresql-timescaledb.md +1 -1
- data/docs/architecture/adrs/003-ollama-embeddings.md +1 -1
- data/docs/architecture/adrs/010-redis-working-memory-rejected.md +2 -27
- data/docs/architecture/adrs/index.md +2 -13
- data/docs/architecture/hive-mind.md +165 -166
- data/docs/architecture/index.md +2 -2
- data/docs/architecture/overview.md +5 -171
- data/docs/architecture/two-tier-memory.md +1 -35
- data/docs/assets/images/adr-010-current-architecture.svg +37 -0
- data/docs/assets/images/adr-010-proposed-architecture.svg +48 -0
- data/docs/assets/images/adr-dependency-tree.svg +93 -0
- data/docs/assets/images/class-hierarchy.svg +55 -0
- data/docs/assets/images/exception-hierarchy.svg +45 -0
- data/docs/assets/images/htm-architecture-overview.svg +83 -0
- data/docs/assets/images/htm-complete-memory-flow.svg +160 -0
- data/docs/assets/images/htm-context-assembly-flow.svg +148 -0
- data/docs/assets/images/htm-eviction-process.svg +141 -0
- data/docs/assets/images/htm-memory-addition-flow.svg +138 -0
- data/docs/assets/images/htm-memory-recall-flow.svg +152 -0
- data/docs/assets/images/htm-node-states.svg +123 -0
- data/docs/assets/images/project-structure.svg +78 -0
- data/docs/assets/images/test-directory-structure.svg +38 -0
- data/{dbdoc → docs/database}/README.md +127 -125
- data/docs/database/public.file_sources.md +42 -0
- data/docs/database/public.file_sources.svg +211 -0
- data/{dbdoc → docs/database}/public.node_tags.md +7 -8
- data/docs/database/public.node_tags.svg +239 -0
- data/{dbdoc → docs/database}/public.nodes.md +22 -17
- data/docs/database/public.nodes.svg +271 -0
- data/docs/database/public.robot_nodes.md +46 -0
- data/docs/database/public.robot_nodes.svg +243 -0
- data/{dbdoc → docs/database}/public.robots.md +2 -3
- data/docs/database/public.robots.svg +161 -0
- data/docs/database/public.tags.svg +139 -0
- data/{dbdoc → docs/database}/schema.json +941 -630
- data/docs/database/schema.svg +282 -0
- data/docs/development/index.md +1 -29
- data/docs/development/schema.md +134 -309
- data/docs/development/testing.md +1 -9
- data/docs/getting-started/index.md +47 -0
- data/docs/{installation.md → getting-started/installation.md} +2 -2
- data/docs/{quick-start.md → getting-started/quick-start.md} +5 -5
- data/docs/guides/adding-memories.md +295 -643
- data/docs/guides/recalling-memories.md +36 -1
- data/docs/guides/search-strategies.md +85 -51
- data/docs/images/htm-er-diagram.svg +156 -0
- data/docs/index.md +16 -31
- data/docs/multi_framework_support.md +4 -4
- data/examples/README.md +280 -0
- data/examples/basic_usage.rb +18 -16
- data/examples/cli_app/htm_cli.rb +146 -8
- data/examples/cli_app/temp.log +93 -0
- data/examples/custom_llm_configuration.rb +1 -2
- data/examples/example_app/app.rb +11 -14
- data/examples/file_loader_usage.rb +177 -0
- data/examples/robot_groups/lib/robot_group.rb +419 -0
- data/examples/robot_groups/lib/working_memory_channel.rb +140 -0
- data/examples/robot_groups/multi_process.rb +286 -0
- data/examples/robot_groups/robot_worker.rb +136 -0
- data/examples/robot_groups/same_process.rb +229 -0
- data/examples/sinatra_app/Gemfile +1 -0
- data/examples/sinatra_app/Gemfile.lock +166 -0
- data/examples/sinatra_app/app.rb +219 -24
- data/examples/timeframe_demo.rb +276 -0
- data/lib/htm/active_record_config.rb +10 -3
- data/lib/htm/circuit_breaker.rb +202 -0
- data/lib/htm/configuration.rb +313 -80
- data/lib/htm/database.rb +67 -36
- data/lib/htm/embedding_service.rb +39 -2
- data/lib/htm/errors.rb +131 -11
- data/lib/htm/{sinatra.rb → integrations/sinatra.rb} +87 -12
- data/lib/htm/job_adapter.rb +10 -3
- data/lib/htm/jobs/generate_embedding_job.rb +5 -4
- data/lib/htm/jobs/generate_tags_job.rb +4 -0
- data/lib/htm/loaders/markdown_loader.rb +263 -0
- data/lib/htm/loaders/paragraph_chunker.rb +112 -0
- data/lib/htm/long_term_memory.rb +601 -321
- data/lib/htm/models/file_source.rb +99 -0
- data/lib/htm/models/node.rb +116 -12
- data/lib/htm/models/robot.rb +53 -4
- data/lib/htm/models/robot_node.rb +51 -0
- data/lib/htm/models/tag.rb +302 -0
- data/lib/htm/observability.rb +395 -0
- data/lib/htm/tag_service.rb +60 -3
- data/lib/htm/tasks.rb +29 -0
- data/lib/htm/timeframe.rb +194 -0
- data/lib/htm/timeframe_extractor.rb +307 -0
- data/lib/htm/version.rb +1 -1
- data/lib/htm/working_memory.rb +165 -70
- data/lib/htm.rb +352 -133
- data/lib/tasks/doc.rake +300 -0
- data/lib/tasks/files.rake +299 -0
- data/lib/tasks/htm.rake +188 -2
- data/lib/tasks/jobs.rake +10 -12
- data/lib/tasks/tags.rake +194 -0
- data/mkdocs.yml +91 -9
- data/notes/ARCHITECTURE_REVIEW.md +1167 -0
- data/notes/IMPLEMENTATION_SUMMARY.md +606 -0
- data/notes/MULTI_FRAMEWORK_IMPLEMENTATION.md +451 -0
- data/notes/next_steps.md +100 -0
- data/notes/plan.md +627 -0
- data/notes/tag_ontology_enhancement_ideas.md +222 -0
- data/notes/timescaledb_removal_summary.md +200 -0
- metadata +177 -37
- data/db/migrate/20250101000002_create_robots.rb +0 -14
- data/db/migrate/20250101000003_create_nodes.rb +0 -42
- data/db/migrate/20250101000005_create_tags.rb +0 -38
- data/db/migrate/20250101000007_add_node_vector_indexes.rb +0 -30
- data/dbdoc/public.node_tags.svg +0 -112
- data/dbdoc/public.nodes.svg +0 -118
- data/dbdoc/public.robots.svg +0 -90
- data/dbdoc/public.tags.svg +0 -60
- data/dbdoc/schema.svg +0 -154
- data/{dbdoc → docs/database}/public.node_stats.md +0 -0
- data/{dbdoc → docs/database}/public.node_stats.svg +0 -0
- data/{dbdoc → docs/database}/public.nodes_tags.md +0 -0
- data/{dbdoc → docs/database}/public.nodes_tags.svg +0 -0
- data/{dbdoc → docs/database}/public.ontology_structure.md +0 -0
- data/{dbdoc → docs/database}/public.ontology_structure.svg +0 -0
- data/{dbdoc → docs/database}/public.operations_log.md +0 -0
- data/{dbdoc → docs/database}/public.operations_log.svg +0 -0
- data/{dbdoc → docs/database}/public.relationships.md +0 -0
- data/{dbdoc → docs/database}/public.relationships.svg +0 -0
- data/{dbdoc → docs/database}/public.robot_activity.md +0 -0
- data/{dbdoc → docs/database}/public.robot_activity.svg +0 -0
- data/{dbdoc → docs/database}/public.schema_migrations.md +0 -0
- data/{dbdoc → docs/database}/public.schema_migrations.svg +0 -0
- data/{dbdoc → docs/database}/public.tags.md +3 -3
- data/{dbdoc → docs/database}/public.topic_relationships.md +0 -0
- data/{dbdoc → docs/database}/public.topic_relationships.svg +0 -0
data/lib/htm/long_term_memory.rb
CHANGED
@@ -25,6 +25,23 @@ class HTM
 
     attr_reader :query_timeout
 
+    # Initialize long-term memory storage
+    #
+    # @param config [Hash] Database configuration (host, port, dbname, user, password)
+    # @param pool_size [Integer, nil] Connection pool size (uses ActiveRecord default if nil)
+    # @param query_timeout [Integer] Query timeout in milliseconds (default: 30000)
+    # @param cache_size [Integer] Number of query results to cache (default: 1000, use 0 to disable)
+    # @param cache_ttl [Integer] Cache time-to-live in seconds (default: 300)
+    #
+    # @example Initialize with defaults
+    #   ltm = LongTermMemory.new(HTM::Database.default_config)
+    #
+    # @example Initialize with custom cache settings
+    #   ltm = LongTermMemory.new(config, cache_size: 500, cache_ttl: 600)
+    #
+    # @example Disable caching
+    #   ltm = LongTermMemory.new(config, cache_size: 0)
+    #
     def initialize(config, pool_size: nil, query_timeout: DEFAULT_QUERY_TIMEOUT, cache_size: DEFAULT_CACHE_SIZE, cache_ttl: DEFAULT_CACHE_TTL)
       @config = config
       @query_timeout = query_timeout # in milliseconds
@@ -36,47 +53,114 @@ class HTM
       if cache_size > 0
         @query_cache = LruRedux::TTL::ThreadSafeCache.new(cache_size, cache_ttl)
         @cache_stats = { hits: 0, misses: 0 }
+        @cache_stats_mutex = Mutex.new # Thread-safety for cache statistics
       end
     end
 
-    # Add a node to long-term memory
+    # Add a node to long-term memory (with deduplication)
     #
-    #
+    # If content already exists (by content_hash), links the robot to the existing
+    # node and updates timestamps. Otherwise creates a new node.
     #
     # @param content [String] Conversation message/utterance
-    # @param speaker [String] Who said it: 'user' or robot name
     # @param token_count [Integer] Token count
-    # @param robot_id [
+    # @param robot_id [Integer] Robot identifier
     # @param embedding [Array<Float>, nil] Pre-generated embedding vector
-    # @
-    #
-
-
-
-
-
-
-
+    # @param metadata [Hash] Flexible metadata for the node (default: {})
+    # @return [Hash] { node_id:, is_new:, robot_node: }
+    #
+    def add(content:, token_count: 0, robot_id:, embedding: nil, metadata: {})
+      content_hash = HTM::Models::Node.generate_content_hash(content)
+
+      # Wrap in transaction to ensure data consistency
+      ActiveRecord::Base.transaction do
+        # Check for existing node with same content (including soft-deleted)
+        # This avoids unique constraint violations on content_hash
+        existing_node = HTM::Models::Node.with_deleted.find_by(content_hash: content_hash)
+
+        # If found but soft-deleted, restore it
+        if existing_node&.deleted?
+          existing_node.restore!
+          HTM.logger.info "Restored soft-deleted node #{existing_node.id} for content match"
+        end
+
+        if existing_node
+          # Link robot to existing node (or update if already linked)
+          robot_node = link_robot_to_node(robot_id: robot_id, node: existing_node)
+
+          # Update the node's updated_at timestamp
+          existing_node.touch
+
+          {
+            node_id: existing_node.id,
+            is_new: false,
+            robot_node: robot_node
+          }
         else
-
+          # Prepare embedding if provided
+          embedding_str = nil
+          if embedding
+            # Pad embedding to 2000 dimensions if needed
+            actual_dimension = embedding.length
+            padded_embedding = if actual_dimension < 2000
+              embedding + Array.new(2000 - actual_dimension, 0.0)
+            else
+              embedding
+            end
+            embedding_str = "[#{padded_embedding.join(',')}]"
+          end
+
+          # Create new node
+          node = HTM::Models::Node.create!(
+            content: content,
+            content_hash: content_hash,
+            token_count: token_count,
+            embedding: embedding_str,
+            metadata: metadata
+          )
+
+          # Link robot to new node
+          robot_node = link_robot_to_node(robot_id: robot_id, node: node)
+
+          # Invalidate cache since database content changed
+          invalidate_cache!
+
+          {
+            node_id: node.id,
+            is_new: true,
+            robot_node: robot_node
+          }
         end
-      embedding_str = "[#{padded_embedding.join(',')}]"
       end
+    end
 
-
-
-
-
-
-
-
-
-      )
+    # Link a robot to a node (create or update robot_node record)
+    #
+    # @param robot_id [Integer] Robot ID
+    # @param node [HTM::Models::Node] Node to link
+    # @param working_memory [Boolean] Whether node is in working memory (default: false)
+    # @return [HTM::Models::RobotNode] The robot_node link record
+    #
+    def link_robot_to_node(robot_id:, node:, working_memory: false)
+      robot_node = HTM::Models::RobotNode.find_by(robot_id: robot_id, node_id: node.id)
 
-
-
+      if robot_node
+        # Existing link - record that robot remembered this again
+        robot_node.record_remember!
+        robot_node.update!(working_memory: working_memory) if working_memory
+      else
+        # New link
+        robot_node = HTM::Models::RobotNode.create!(
+          robot_id: robot_id,
+          node_id: node.id,
+          first_remembered_at: Time.current,
+          last_remembered_at: Time.current,
+          remember_count: 1,
+          working_memory: working_memory
+        )
+      end
 
-
+      robot_node
     end
 
     # Retrieve a node by ID
@@ -131,33 +215,17 @@ class HTM
 
     # Vector similarity search
     #
-    # @param timeframe [Range] Time range to search
+    # @param timeframe [nil, Range, Array<Range>] Time range(s) to search (nil = no filter)
     # @param query [String] Search query
     # @param limit [Integer] Maximum results
     # @param embedding_service [Object] Service to generate embeddings
+    # @param metadata [Hash] Filter by metadata fields (default: {})
     # @return [Array<Hash>] Matching nodes
     #
-    def search(timeframe:, query:, limit:, embedding_service:)
-
-
-
-      # Generate cache key
-      cache_key = cache_key_for(:search, timeframe, query, limit)
-
-      # Try to get from cache
-      cached = @query_cache[cache_key]
-      if cached
-        @cache_stats[:hits] += 1
-        return cached
+    def search(timeframe:, query:, limit:, embedding_service:, metadata: {})
+      cached_query(:search, timeframe, query, limit, metadata) do
+        search_uncached(timeframe: timeframe, query: query, limit: limit, embedding_service: embedding_service, metadata: metadata)
       end
-
-      # Cache miss - execute query
-      @cache_stats[:misses] += 1
-      result = search_uncached(timeframe: timeframe, query: query, limit: limit, embedding_service: embedding_service)
-
-      # Store in cache
-      @query_cache[cache_key] = result
-      result
     end
 
     # Full-text search
@@ -165,29 +233,13 @@ class HTM
     # @param timeframe [Range] Time range to search
     # @param query [String] Search query
     # @param limit [Integer] Maximum results
+    # @param metadata [Hash] Filter by metadata fields (default: {})
     # @return [Array<Hash>] Matching nodes
     #
-    def search_fulltext(timeframe:, query:, limit:)
-
-
-
-      # Generate cache key
-      cache_key = cache_key_for(:fulltext, timeframe, query, limit)
-
-      # Try to get from cache
-      cached = @query_cache[cache_key]
-      if cached
-        @cache_stats[:hits] += 1
-        return cached
+    def search_fulltext(timeframe:, query:, limit:, metadata: {})
+      cached_query(:fulltext, timeframe, query, limit, metadata) do
+        search_fulltext_uncached(timeframe: timeframe, query: query, limit: limit, metadata: metadata)
       end
-
-      # Cache miss - execute query
-      @cache_stats[:misses] += 1
-      result = search_fulltext_uncached(timeframe: timeframe, query: query, limit: limit)
-
-      # Store in cache
-      @query_cache[cache_key] = result
-      result
     end
 
     # Hybrid search (full-text + vector)
@@ -197,29 +249,13 @@ class HTM
     # @param limit [Integer] Maximum results
     # @param embedding_service [Object] Service to generate embeddings
     # @param prefilter_limit [Integer] Candidates to consider (default: 100)
+    # @param metadata [Hash] Filter by metadata fields (default: {})
     # @return [Array<Hash>] Matching nodes
     #
-    def search_hybrid(timeframe:, query:, limit:, embedding_service:, prefilter_limit: 100)
-
-
-
-      # Generate cache key
-      cache_key = cache_key_for(:hybrid, timeframe, query, limit, prefilter_limit)
-
-      # Try to get from cache
-      cached = @query_cache[cache_key]
-      if cached
-        @cache_stats[:hits] += 1
-        return cached
+    def search_hybrid(timeframe:, query:, limit:, embedding_service:, prefilter_limit: 100, metadata: {})
+      cached_query(:hybrid, timeframe, query, limit, prefilter_limit, metadata) do
+        search_hybrid_uncached(timeframe: timeframe, query: query, limit: limit, embedding_service: embedding_service, prefilter_limit: prefilter_limit, metadata: metadata)
      end
-
-      # Cache miss - execute query
-      @cache_stats[:misses] += 1
-      result = search_hybrid_uncached(timeframe: timeframe, query: query, limit: limit, embedding_service: embedding_service, prefilter_limit: prefilter_limit)
-
-      # Store in cache
-      @query_cache[cache_key] = result
-      result
     end
 
     # Add a tag to a node
@@ -240,13 +276,19 @@ class HTM
 
     # Mark nodes as evicted from working memory
     #
-    #
+    # Sets working_memory = false on the robot_nodes join table for the specified
+    # robot and node IDs.
+    #
+    # @param robot_id [Integer] Robot ID whose working memory is being evicted
+    # @param node_ids [Array<Integer>] Node IDs to mark as evicted
     # @return [void]
     #
-    def mark_evicted(node_ids)
+    def mark_evicted(robot_id:, node_ids:)
       return if node_ids.empty?
 
-      HTM::Models::
+      HTM::Models::RobotNode
+        .where(robot_id: robot_id, node_id: node_ids)
+        .update_all(working_memory: false)
     end
 
     # Track access for multiple nodes (bulk operation)
@@ -294,7 +336,7 @@ class HTM
     def stats
       base_stats = {
         total_nodes: HTM::Models::Node.count,
-        nodes_by_robot: HTM::Models::
+        nodes_by_robot: HTM::Models::RobotNode.group(:robot_id).count,
         total_tags: HTM::Models::Tag.count,
         oldest_memory: HTM::Models::Node.minimum(:created_at),
         newest_memory: HTM::Models::Node.maximum(:created_at),
@@ -317,6 +359,17 @@ class HTM
       # This method kept for API compatibility
     end
 
+    # Clear the query cache
+    #
+    # Call this after any operation that modifies data (soft delete, restore, etc.)
+    # to ensure subsequent queries see fresh results.
+    #
+    # @return [void]
+    #
+    def clear_cache!
+      invalidate_cache!
+    end
+
     # For backwards compatibility with tests/code that expect pool_size
     def pool_size
       ActiveRecord::Base.connection_pool.size
@@ -367,19 +420,24 @@ class HTM
     # @return [Array<Hash>] Topic relationships
     #
    def topic_relationships(min_shared_nodes: 2, limit: 50)
-
-
-
-
-
-
-
-
-
-
-
-
-
+      # Use parameterized query to prevent SQL injection
+      sql = <<~SQL
+        SELECT t1.name AS topic1, t2.name AS topic2, COUNT(DISTINCT nt1.node_id) AS shared_nodes
+        FROM tags t1
+        JOIN node_tags nt1 ON t1.id = nt1.tag_id
+        JOIN node_tags nt2 ON nt1.node_id = nt2.node_id
+        JOIN tags t2 ON nt2.tag_id = t2.id
+        WHERE t1.name < t2.name
+        GROUP BY t1.name, t2.name
+        HAVING COUNT(DISTINCT nt1.node_id) >= $1
+        ORDER BY shared_nodes DESC
+        LIMIT $2
+      SQL
+
+      result = ActiveRecord::Base.connection.exec_query(
+        sql,
+        'topic_relationships',
+        [[nil, min_shared_nodes.to_i], [nil, limit.to_i]]
      )
      result.to_a
    end
@@ -408,9 +466,10 @@ class HTM
     # @param node [Hash] Node data with similarity, tags, created_at, access_count
     # @param query_tags [Array<String>] Tags associated with the query
     # @param vector_similarity [Float, nil] Pre-computed vector similarity (0-1)
+    # @param node_tags [Array<String>, nil] Pre-loaded tags for this node (avoids N+1 query)
     # @return [Float] Composite relevance score (0-10)
     #
-    def calculate_relevance(node:, query_tags: [], vector_similarity: nil)
+    def calculate_relevance(node:, query_tags: [], vector_similarity: nil, node_tags: nil)
       # 1. Vector similarity (semantic match) - weight: 0.5
       semantic_score = if vector_similarity
         vector_similarity
@@ -421,7 +480,8 @@ class HTM
       end
 
       # 2. Tag overlap (categorical relevance) - weight: 0.3
-
+      # Use pre-loaded tags if provided, otherwise fetch (for backward compatibility)
+      node_tags ||= get_node_tags(node['id'])
       tag_score = if query_tags.any? && node_tags.any?
         weighted_hierarchical_jaccard(query_tags, node_tags)
       else
@@ -451,41 +511,48 @@ class HTM
     #
     # Returns nodes with calculated relevance scores based on query context
     #
-    # @param timeframe [Range] Time range to search
+    # @param timeframe [nil, Range, Array<Range>] Time range(s) to search (nil = no filter)
     # @param query [String, nil] Search query
     # @param query_tags [Array<String>] Tags to match
     # @param limit [Integer] Maximum results
     # @param embedding_service [Object, nil] Service to generate embeddings
+    # @param metadata [Hash] Filter by metadata fields (default: {})
     # @return [Array<Hash>] Nodes with relevance scores
     #
-    def search_with_relevance(timeframe:, query: nil, query_tags: [], limit: 20, embedding_service: nil)
+    def search_with_relevance(timeframe:, query: nil, query_tags: [], limit: 20, embedding_service: nil, metadata: {})
       # Get candidates from appropriate search method
       candidates = if query && embedding_service
         # Vector search
-        search_uncached(timeframe: timeframe, query: query, limit: limit * 2, embedding_service: embedding_service)
+        search_uncached(timeframe: timeframe, query: query, limit: limit * 2, embedding_service: embedding_service, metadata: metadata)
       elsif query
         # Full-text search
-        search_fulltext_uncached(timeframe: timeframe, query: query, limit: limit * 2)
+        search_fulltext_uncached(timeframe: timeframe, query: query, limit: limit * 2, metadata: metadata)
       else
-        # Time-range only
-        HTM::Models::Node
-
-
-
-          .map(&:attributes)
+        # Time-range only (or no filter if timeframe is nil)
+        scope = HTM::Models::Node.where(deleted_at: nil)
+        scope = apply_timeframe_scope(scope, timeframe)
+        scope = apply_metadata_scope(scope, metadata)
+        scope.order(created_at: :desc).limit(limit * 2).map(&:attributes)
       end
 
+      # Batch load all tags for candidates (fixes N+1 query)
+      node_ids = candidates.map { |n| n['id'] }
+      tags_by_node = batch_load_node_tags(node_ids)
+
       # Calculate relevance for each candidate
       scored_nodes = candidates.map do |node|
+        node_tags = tags_by_node[node['id']] || []
+
         relevance = calculate_relevance(
           node: node,
           query_tags: query_tags,
-          vector_similarity: node['similarity']&.to_f
+          vector_similarity: node['similarity']&.to_f,
+          node_tags: node_tags
         )
 
         node.merge({
           'relevance' => relevance,
-          'tags' =>
+          'tags' => node_tags
         })
       end
 
@@ -505,10 +572,32 @@ class HTM
         .joins(:node_tags)
         .where(node_tags: { node_id: node_id })
         .pluck(:name)
-    rescue
+    rescue ActiveRecord::ActiveRecordError => e
+      HTM.logger.error("Failed to retrieve tags for node #{node_id}: #{e.message}")
       []
     end
 
+    # Batch load tags for multiple nodes (avoids N+1 queries)
+    #
+    # @param node_ids [Array<Integer>] Node database IDs
+    # @return [Hash<Integer, Array<String>>] Map of node_id to array of tag names
+    #
+    def batch_load_node_tags(node_ids)
+      return {} if node_ids.empty?
+
+      # Single query to get all tags for all nodes
+      results = HTM::Models::NodeTag
+        .joins(:tag)
+        .where(node_id: node_ids)
+        .pluck(:node_id, 'tags.name')
+
+      # Group by node_id
+      results.group_by(&:first).transform_values { |pairs| pairs.map(&:last) }
+    rescue ActiveRecord::ActiveRecordError => e
+      HTM.logger.error("Failed to batch load tags: #{e.message}")
+      {}
+    end
+
     # Search nodes by tags
     #
     # @param tags [Array<String>] Tags to search for
@@ -539,16 +628,22 @@ class HTM
       # Get results
       nodes = query.limit(limit).map(&:attributes)
 
+      # Batch load all tags for nodes (fixes N+1 query)
+      node_ids = nodes.map { |n| n['id'] }
+      tags_by_node = batch_load_node_tags(node_ids)
+
      # Calculate relevance and enrich with tags
      nodes.map do |node|
+        node_tags = tags_by_node[node['id']] || []
        relevance = calculate_relevance(
          node: node,
-          query_tags: tags
+          query_tags: tags,
+          node_tags: node_tags
        )
 
        node.merge({
          'relevance' => relevance,
-          'tags' =>
+          'tags' => node_tags
        })
      end.sort_by { |n| -n['relevance'] }
    end
@@ -574,22 +669,212 @@ class HTM
        .map { |tag| { name: tag.name, usage_count: tag.usage_count } }
    end
 
+    # Find tags that match terms in the query
+    #
+    # Searches the tags table for tags where any hierarchy level matches
+    # query words. For example, query "PostgreSQL database" would match
+    # tags like "database:postgresql", "database:sql", etc.
+    # Find tags matching a query using semantic extraction
+    #
+    # @param query [String] Search query
+    # @param include_extracted [Boolean] If true, returns hash with :extracted and :matched keys
+    # @return [Array<String>] Matching tag names (default)
+    # @return [Hash] If include_extracted: { extracted: [...], matched: [...] }
+    #
+    def find_query_matching_tags(query, include_extracted: false)
+      empty_result = include_extracted ? { extracted: [], matched: [] } : []
+      return empty_result if query.nil? || query.strip.empty?
+
+      # Use the tag extractor to generate semantic tags from the query
+      # This uses the same LLM process as when storing nodes
+      existing_tags = HTM::Models::Tag.pluck(:name).sample(50)
+      extracted_tags = HTM::TagService.extract(query, existing_ontology: existing_tags)
+
+      if extracted_tags.empty?
+        return include_extracted ? { extracted: [], matched: [] } : []
+      end
+
+      # Step 1: Try exact matches
+      exact_matches = HTM::Models::Tag.where(name: extracted_tags).pluck(:name)
+
+      if exact_matches.any?
+        return include_extracted ? { extracted: extracted_tags, matched: exact_matches } : exact_matches
+      end
+
+      # Step 2: Try matching on parent/prefix levels
+      # For "person:human:character:popeye", try "person:human:character", "person:human", "person"
+      prefix_candidates = extracted_tags.flat_map do |tag|
+        levels = tag.split(':')
+        (1...levels.size).map { |i| levels[0, i].join(':') }
+      end.uniq
+
+      if prefix_candidates.any?
+        prefix_matches = HTM::Models::Tag.where(name: prefix_candidates).pluck(:name)
+        if prefix_matches.any?
+          return include_extracted ? { extracted: extracted_tags, matched: prefix_matches } : prefix_matches
+        end
+      end
+
+      # Step 3: Try matching individual components, starting from rightmost (most specific)
+      # For "person:human:character:popeye", try "popeye", then "character", then "human", then "person"
+      # Search for tags that contain this component at any level
+      all_components = extracted_tags.flat_map { |tag| tag.split(':') }.uniq
+
+      # Order by specificity: components that appear at deeper levels first
+      component_depths = Hash.new(0)
+      extracted_tags.each do |tag|
+        levels = tag.split(':')
+        levels.each_with_index { |comp, idx| component_depths[comp] = [component_depths[comp], idx].max }
+      end
+      ordered_components = all_components.sort_by { |c| -component_depths[c] }
+
+      # Try each component, starting with most specific (rightmost)
+      ordered_components.each do |component|
+        # Find tags where this component appears at any level
+        component_matches = HTM::Models::Tag
+          .where("name = ? OR name LIKE ? OR name LIKE ? OR name LIKE ?",
+                 component,           # exact match (single-level tag)
+                 "#{component}:%",    # starts with component
+                 "%:#{component}",    # ends with component
+                 "%:#{component}:%")  # component in middle
+          .pluck(:name)
+
+        if component_matches.any?
+          return include_extracted ? { extracted: extracted_tags, matched: component_matches } : component_matches
+        end
+      end
+
+      # No matches found at any level
+      include_extracted ? { extracted: extracted_tags, matched: [] } : []
+    end
+
    private
 
+    # Sanitize embedding for SQL use
+    #
+    # Validates that all values are numeric and converts to safe PostgreSQL vector format.
+    # This prevents SQL injection by ensuring only valid numeric values are included.
+    #
+    # @param embedding [Array<Numeric>] Embedding vector
+    # @return [String] Sanitized vector string for PostgreSQL (e.g., "[0.1,0.2,0.3]")
+    # @raise [ArgumentError] If embedding contains non-numeric values
+    #
+    def sanitize_embedding_for_sql(embedding)
+      unless embedding.is_a?(Array) && embedding.all? { |v| v.is_a?(Numeric) && v.finite? }
+        raise ArgumentError, "Embedding must be an array of finite numeric values"
+      end
+
+      "[#{embedding.map { |v| v.to_f }.join(',')}]"
+    end
+
+    # Build SQL condition for timeframe filtering
+    #
+    # @param timeframe [nil, Range, Array<Range>] Time range(s)
+    # @param table_alias [String] Table alias (default: none)
+    # @return [String, nil] SQL condition or nil for no filter
+    #
+    def build_timeframe_condition(timeframe, table_alias: nil)
+      return nil if timeframe.nil?
+
+      prefix = table_alias ? "#{table_alias}." : ""
+      column = "#{prefix}created_at"
+      conn = ActiveRecord::Base.connection
+
+      case timeframe
+      when Range
+        # Use quote to safely escape timestamp values
+        begin_quoted = conn.quote(timeframe.begin.iso8601)
+        end_quoted = conn.quote(timeframe.end.iso8601)
+        "(#{column} BETWEEN #{begin_quoted} AND #{end_quoted})"
+      when Array
+        conditions = timeframe.map do |range|
+          begin_quoted = conn.quote(range.begin.iso8601)
+          end_quoted = conn.quote(range.end.iso8601)
+          "(#{column} BETWEEN #{begin_quoted} AND #{end_quoted})"
+        end
+        "(#{conditions.join(' OR ')})"
+      else
+        nil
+      end
+    end
+
+    # Build ActiveRecord where clause for timeframe
+    #
+    # @param scope [ActiveRecord::Relation] Base scope
+    # @param timeframe [nil, Range, Array<Range>] Time range(s)
+    # @return [ActiveRecord::Relation] Scoped query
+    #
+    def apply_timeframe_scope(scope, timeframe)
+      return scope if timeframe.nil?
+
+      case timeframe
+      when Range
+        scope.where(created_at: timeframe)
+      when Array
+        # Build OR conditions for multiple ranges
+        conditions = timeframe.map { |range| scope.where(created_at: range) }
+        conditions.reduce { |result, condition| result.or(condition) }
+      else
+        scope
+      end
+    end
+
+    # Build SQL condition for metadata filtering (JSONB containment)
+    #
+    # @param metadata [Hash] Metadata to filter by
+    # @param table_alias [String] Table alias (default: none)
+    # @return [String, nil] SQL condition or nil for no filter
+    #
+    def build_metadata_condition(metadata, table_alias: nil)
+      return nil if metadata.nil? || metadata.empty?
+
+      prefix = table_alias ? "#{table_alias}." : ""
+      column = "#{prefix}metadata"
+      conn = ActiveRecord::Base.connection
+
+      # Use JSONB containment operator @>
+      # This matches if the metadata column contains all key-value pairs in the filter
+      quoted_metadata = conn.quote(metadata.to_json)
+      "(#{column} @> #{quoted_metadata}::jsonb)"
+    end
+
+    # Build ActiveRecord where clause for metadata
+    #
+    # @param scope [ActiveRecord::Relation] Base scope
+    # @param metadata [Hash] Metadata to filter by
+    # @return [ActiveRecord::Relation] Scoped query
+    #
+    def apply_metadata_scope(scope, metadata)
+      return scope if metadata.nil? || metadata.empty?
+
+      # Use JSONB containment operator
+      scope.where("metadata @> ?::jsonb", metadata.to_json)
+    end
+
    # Generate cache key for query
    #
    # @param method [Symbol] Search method name
-    # @param timeframe [Range] Time range
+    # @param timeframe [nil, Range, Array<Range>] Time range(s)
    # @param query [String] Search query
    # @param limit [Integer] Result limit
    # @param args [Array] Additional arguments
    # @return [String] Cache key
    #
    def cache_key_for(method, timeframe, query, limit, *args)
+      timeframe_key = case timeframe
+                      when nil
+                        "nil"
+                      when Range
+                        "#{timeframe.begin.to_i}-#{timeframe.end.to_i}"
+                      when Array
+                        timeframe.map { |r| "#{r.begin.to_i}-#{r.end.to_i}" }.join(',')
+                      else
+                        timeframe.to_s
+                      end
+
      key_parts = [
        method,
-
-        timeframe.end.to_i,
+        timeframe_key,
        query,
        limit,
        *args
@@ -658,17 +943,41 @@ class HTM
      @query_cache.clear if @query_cache
    end
 
+    # Execute a query with caching
+    #
+    # @param method [Symbol] Search method name for cache key
+    # @param args [Array] Arguments for cache key (timeframe, query, limit, etc.)
+    # @yield Block that executes the actual query
+    # @return [Array<Hash>] Query results (from cache or freshly executed)
+    #
+    def cached_query(method, *args, &block)
+      return yield unless @query_cache
+
+      cache_key = cache_key_for(method, *args)
+
+      if (cached = @query_cache[cache_key])
+        @cache_stats_mutex.synchronize { @cache_stats[:hits] += 1 }
+        return cached
+      end
+
+      @cache_stats_mutex.synchronize { @cache_stats[:misses] += 1 }
+      result = yield
+      @query_cache[cache_key] = result
+      result
+    end
+
    # Uncached vector similarity search
    #
    # Generates query embedding client-side and performs vector search in database.
    #
-    # @param timeframe [Range] Time range to search
+    # @param timeframe [nil, Range, Array<Range>] Time range(s) to search (nil = no filter)
    # @param query [String] Search query
    # @param limit [Integer] Maximum results
    # @param embedding_service [Object] Service to generate query embedding
+    # @param metadata [Hash] Filter by metadata fields (default: {})
    # @return [Array<Hash>] Matching nodes
    #
-    def search_uncached(timeframe:, query:, limit:, embedding_service:)
+    def search_uncached(timeframe:, query:, limit:, embedding_service:, metadata: {})
      # Generate query embedding client-side
      query_embedding = embedding_service.embed(query)
 
@@ -677,17 +986,29 @@ class HTM
        query_embedding = query_embedding + Array.new(2000 - query_embedding.length, 0.0)
      end
 
-      #
-      embedding_str =
+      # Sanitize embedding for safe SQL use (validates all values are numeric)
+      embedding_str = sanitize_embedding_for_sql(query_embedding)
+
+      # Build filter conditions
+      timeframe_condition = build_timeframe_condition(timeframe)
+      metadata_condition = build_metadata_condition(metadata)
+
+      conditions = ["embedding IS NOT NULL", "deleted_at IS NULL"]
+      conditions << timeframe_condition if timeframe_condition
+      conditions << metadata_condition if metadata_condition
+
+      where_clause = "WHERE #{conditions.join(' AND ')}"
+
+      # Use quote to safely escape the embedding string in the query
+      quoted_embedding = ActiveRecord::Base.connection.quote(embedding_str)
 
      result = ActiveRecord::Base.connection.select_all(
        <<~SQL,
-          SELECT id, content,
-                 1 - (embedding <=>
+          SELECT id, content, access_count, created_at, token_count,
+                 1 - (embedding <=> #{quoted_embedding}::vector) as similarity
          FROM nodes
-
-
-          ORDER BY embedding <=> '#{embedding_str}'::vector
+          #{where_clause}
+          ORDER BY embedding <=> #{quoted_embedding}::vector
          LIMIT #{limit.to_i}
        SQL
      )
@@ -701,24 +1022,35 @@ class HTM
 
    # Uncached full-text search
    #
-    # @param timeframe [Range] Time range to search
+    # @param timeframe [nil, Range, Array<Range>] Time range(s) to search (nil = no filter)
    # @param query [String] Search query
    # @param limit [Integer] Maximum results
+    # @param metadata [Hash] Filter by metadata fields (default: {})
    # @return [Array<Hash>] Matching nodes
    #
-    def search_fulltext_uncached(timeframe:, query:, limit:)
+    def search_fulltext_uncached(timeframe:, query:, limit:, metadata: {})
+      # Build filter conditions
+      timeframe_condition = build_timeframe_condition(timeframe)
+      metadata_condition = build_metadata_condition(metadata)
+
+      additional_conditions = []
+      additional_conditions << timeframe_condition if timeframe_condition
+      additional_conditions << metadata_condition if metadata_condition
+      additional_sql = additional_conditions.any? ? "AND #{additional_conditions.join(' AND ')}" : ""
+
      result = ActiveRecord::Base.connection.select_all(
        ActiveRecord::Base.sanitize_sql_array([
          <<~SQL,
-            SELECT id, content,
+            SELECT id, content, access_count, created_at, token_count,
                   ts_rank(to_tsvector('english', content), plainto_tsquery('english', ?)) as rank
            FROM nodes
-            WHERE
+            WHERE deleted_at IS NULL
              AND to_tsvector('english', content) @@ plainto_tsquery('english', ?)
+              #{additional_sql}
            ORDER BY rank DESC
            LIMIT ?
          SQL
-          query,
+          query, query, limit
        ])
      )
 
@@ -731,17 +1063,20 @@ class HTM
 
    # Uncached hybrid search
    #
-    # Generates query embedding client-side, then combines
-    #
+    # Generates query embedding client-side, then combines:
+    # 1. Full-text search for content matching
+    # 2. Tag matching for categorical relevance
+    # 3. Vector similarity for semantic ranking
    #
-    # @param timeframe [Range] Time range to search
+    # @param timeframe [nil, Range, Array<Range>] Time range(s) to search (nil = no filter)
    # @param query [String] Search query
    # @param limit [Integer] Maximum results
    # @param embedding_service [Object] Service to generate query embedding
    # @param prefilter_limit [Integer] Candidates to consider
-    # @
+    # @param metadata [Hash] Filter by metadata fields (default: {})
+    # @return [Array<Hash>] Matching nodes with similarity and tag_boost scores
    #
-    def search_hybrid_uncached(timeframe:, query:, limit:, embedding_service:, prefilter_limit:)
+    def search_hybrid_uncached(timeframe:, query:, limit:, embedding_service:, prefilter_limit:, metadata: {})
      # Generate query embedding client-side
      query_embedding = embedding_service.embed(query)
 
@@ -750,29 +1085,129 @@ class HTM
        query_embedding = query_embedding + Array.new(2000 - query_embedding.length, 0.0)
      end
 
-      #
-      embedding_str =
-
-
-
-
-
-
-
-
-
+      # Sanitize embedding for safe SQL use (validates all values are numeric)
+      embedding_str = sanitize_embedding_for_sql(query_embedding)
+      quoted_embedding = ActiveRecord::Base.connection.quote(embedding_str)
+
+      # Build filter conditions (with table alias for CTEs)
+      timeframe_condition = build_timeframe_condition(timeframe, table_alias: 'n')
+      metadata_condition = build_metadata_condition(metadata, table_alias: 'n')
+
+      additional_conditions = []
+      additional_conditions << timeframe_condition if timeframe_condition
+      additional_conditions << metadata_condition if metadata_condition
+      additional_sql = additional_conditions.any? ? "AND #{additional_conditions.join(' AND ')}" : ""
+
+      # Same for non-aliased queries
+      timeframe_condition_bare = build_timeframe_condition(timeframe)
+      metadata_condition_bare = build_metadata_condition(metadata)
+
+      additional_conditions_bare = []
+      additional_conditions_bare << timeframe_condition_bare if timeframe_condition_bare
+      additional_conditions_bare << metadata_condition_bare if metadata_condition_bare
+      additional_sql_bare = additional_conditions_bare.any? ? "AND #{additional_conditions_bare.join(' AND ')}" : ""
+
+      # Find tags that match query terms
+      matching_tags = find_query_matching_tags(query)
+
+      # Build the hybrid query
+      # If we have matching tags, include nodes with those tags in the candidate pool
+      # NOTE: Hybrid search includes nodes without embeddings using a default
+      # similarity score of 0.5. This allows newly created nodes to appear in
+      # search results immediately (via fulltext matching) before their embeddings
+      # are generated by background jobs.
+
+      if matching_tags.any?
+        # Escape tag names for SQL
+        tag_list = matching_tags.map { |t| ActiveRecord::Base.connection.quote(t) }.join(', ')
+        result = ActiveRecord::Base.connection.select_all(
+          ActiveRecord::Base.sanitize_sql_array([
+            <<~SQL,
+              WITH fulltext_candidates AS (
+                -- Nodes matching full-text search (with or without embeddings)
+                SELECT DISTINCT n.id, n.content, n.access_count, n.created_at, n.token_count, n.embedding
+                FROM nodes n
+                WHERE n.deleted_at IS NULL
+                  AND to_tsvector('english', n.content) @@ plainto_tsquery('english', ?)
+                  #{additional_sql}
+                LIMIT ?
+              ),
+              tag_candidates AS (
+                -- Nodes matching relevant tags (with or without embeddings)
+                SELECT DISTINCT n.id, n.content, n.access_count, n.created_at, n.token_count, n.embedding
+                FROM nodes n
+                JOIN node_tags nt ON nt.node_id = n.id
+                JOIN tags t ON t.id = nt.tag_id
+                WHERE n.deleted_at IS NULL
+                  AND t.name IN (#{tag_list})
+                  #{additional_sql}
+                LIMIT ?
+              ),
+              all_candidates AS (
+                SELECT * FROM fulltext_candidates
+                UNION
+                SELECT * FROM tag_candidates
+              ),
+              scored AS (
+                SELECT
+                  ac.id, ac.content, ac.access_count, ac.created_at, ac.token_count,
+                  CASE
+                    WHEN ac.embedding IS NOT NULL THEN 1 - (ac.embedding <=> #{quoted_embedding}::vector)
+                    ELSE 0.5 -- Default similarity for nodes without embeddings
+                  END as similarity,
+                  COALESCE((
+                    SELECT COUNT(DISTINCT t.name)::float / ?
+                    FROM node_tags nt
+                    JOIN tags t ON t.id = nt.tag_id
+                    WHERE nt.node_id = ac.id AND t.name IN (#{tag_list})
+                  ), 0) as tag_boost
+                FROM all_candidates ac
+              )
+              SELECT id, content, access_count, created_at, token_count,
+                     similarity, tag_boost,
+                     (similarity * 0.7 + tag_boost * 0.3) as combined_score
+              FROM scored
+              ORDER BY combined_score DESC
              LIMIT ?
-
-
-
-
-
-
-
-
-
-
+            SQL
+            query, prefilter_limit,
+            prefilter_limit,
+            matching_tags.length.to_f,
+            limit
+          ])
+        )
+      else
+        # No matching tags, fall back to standard hybrid (fulltext + vector)
+        # Include nodes without embeddings with a default similarity score
+        result = ActiveRecord::Base.connection.select_all(
+          ActiveRecord::Base.sanitize_sql_array([
+            <<~SQL,
+              WITH candidates AS (
+                SELECT id, content, access_count, created_at, token_count, embedding
+                FROM nodes
+                WHERE deleted_at IS NULL
+                  AND to_tsvector('english', content) @@ plainto_tsquery('english', ?)
+                  #{additional_sql_bare}
+                LIMIT ?
+              )
+              SELECT id, content, access_count, created_at, token_count,
+                     CASE
+                       WHEN embedding IS NOT NULL THEN 1 - (embedding <=> #{quoted_embedding}::vector)
+                       ELSE 0.5 -- Default similarity for nodes without embeddings
+                     END as similarity,
+                     0.0 as tag_boost,
+                     CASE
+                       WHEN embedding IS NOT NULL THEN 1 - (embedding <=> #{quoted_embedding}::vector)
+                       ELSE 0.5 -- Default score for nodes without embeddings (fulltext matched)
+                     END as combined_score
+              FROM candidates
+              ORDER BY combined_score DESC
+              LIMIT ?
+            SQL
+            query, prefilter_limit, limit
+          ])
+        )
+      end
 
      # Track access for retrieved nodes
      node_ids = result.map { |r| r['id'] }
@@ -806,160 +1241,5 @@ class HTM
 
      [similarity, depth_weight]
    end
-
-    #######################################
-=begin
-
-    # Enhanced hierarchical similarity (with term_bonus for deep term matches like "country-music")
-    # Replaces your private calculate_hierarchical_similarity
-    def calculate_hierarchical_similarity(tag_a, tag_b, max_depth: 5)
-      return [0.0, 1.0] if tag_a.empty? || tag_b.empty? # [similarity, weight]
-
-      parts_a = tag_a.split(':').reject(&:empty?)
-      parts_b = tag_b.split(':').reject(&:empty?)
-      return [0.0, 1.0] if parts_a.empty? || parts_b.empty?
-
-      # Prefix similarity
-      local_max = [parts_a.length, parts_b.length].max
-      common_levels = 0
-      (0...local_max).each do |i|
-        if i < parts_a.length && i < parts_b.length && parts_a[i] == parts_b[i]
-          common_levels += 1
-        else
-          break
-        end
-      end
-      prefix_sim = local_max > 0 ? common_levels.to_f / local_max : 0.0
-
-      # Term bonus: Shared terms weighted by avg depth
-      common_terms = parts_a.to_set & parts_b.to_set
-      term_bonus = 0.0
-      common_terms.each do |term|
-        depth_a = parts_a.index(term) + 1
-        depth_b = parts_b.index(term) + 1
-        avg_depth = (depth_a + depth_b) / 2.0
-        depth_weight = avg_depth / max_depth.to_f
-        term_bonus += depth_weight * 0.8 # Increased from 0.5 for more aggression
-      end
-      term_bonus = [1.0, term_bonus].min
-
-      # Combined similarity (your weight now favors deeper via local_max)
-      sim = (prefix_sim + term_bonus) / 2.0
-      weight = local_max.to_f / max_depth # Deeper = higher weight (flipped from your 1/max)
-
-      [sim, weight]
-    end
-
-    # Enhanced weighted_hierarchical_jaccard (uses new similarity; adds max_pairs fallback)
-    # Replaces your private weighted_hierarchical_jaccard
-    def weighted_hierarchical_jaccard(set_a, set_b, max_depth: 5, max_pairs: 1000)
-      return 0.0 if set_a.empty? || set_b.empty?
-
-      # Fallback to flat Jaccard for large sets (your jaccard_similarity)
-      if set_a.size * set_b.size > max_pairs
-        terms_a = set_a.flat_map { |tag| tag.split(':').reject(&:empty?) }.to_set
-        terms_b = set_b.flat_map { |tag| tag.split(':').reject(&:empty?) }.to_set
-        return jaccard_similarity(terms_a.to_a, terms_b.to_a)
-      end
-
-      total_weighted_similarity = 0.0
-      total_weights = 0.0
-      set_a.each do |tag_a|
-        set_b.each do |tag_b|
-          similarity, weight = calculate_hierarchical_similarity(tag_a, tag_b, max_depth: max_depth)
-          total_weighted_similarity += similarity * weight
-          total_weights += weight
-        end
-      end
-      total_weights > 0 ? total_weighted_similarity / total_weights : 0.0
-    end
-
-    # Updated calculate_relevance (adds ont_weight param; scales to 0-100 option)
-    # Enhances your existing method
-    def calculate_relevance(node:, query_tags: [], vector_similarity: nil, ont_weight: 1.0, scale_to_100: false)
-      # 1. Vector similarity (semantic) - weight: 0.5
-      semantic_score = if vector_similarity
-        vector_similarity
-      elsif node['similarity']
-        node['similarity'].to_f
-      else
-        0.5
-      end
-
-      # 2. Tag overlap (ontology) - weight: 0.3, boosted by ont_weight
-      node_tags = get_node_tags(node['id'])
-      tag_score = if query_tags.any? && node_tags.any?
-        weighted_hierarchical_jaccard(query_tags, node_tags) * ont_weight
-      else
-        0.5
-      end
-      tag_score = [tag_score, 1.0].min # Cap boosted score
-
-      # 3. Recency - weight: 0.1
-      age_hours = (Time.current - Time.parse(node['created_at'].to_s)) / 3600.0
-      recency_score = Math.exp(-age_hours / 168.0)
-
-      # 4. Access frequency - weight: 0.1
-      access_count = node['access_count'] || 0
-      access_score = Math.log(1 + access_count) / 10.0
-
-      # Weighted composite (0-10 base)
-      relevance_0_10 = (
-        (semantic_score * 0.5) +
-        (tag_score * 0.3) +
-        (recency_score * 0.1) +
-        (access_score * 0.1)
-      ).clamp(0.0, 10.0)
-
-      # Scale to 0-100 if requested
-      final_relevance = scale_to_100 ? (relevance_0_10 * 10.0).round(2) : relevance_0_10
-
-      final_relevance
-    end
-
-    # Updated search_with_relevance (adds threshold: for 0-100 filtering; ont_weight)
-    # Enhances your existing method
-    def search_with_relevance(timeframe:, query: nil, query_tags: [], limit: 20, embedding_service: nil, threshold: nil, ont_weight: 1.0, scale_to_100: true)
-      # Get candidates (your logic)
-      candidates = if query && embedding_service
-        search_uncached(timeframe: timeframe, query: query, limit: limit * 3, embedding_service: embedding_service) # Oversample more for thresholds
-      elsif query
-        search_fulltext_uncached(timeframe: timeframe, query: query, limit: limit * 3)
-      else
-        HTM::Models::Node
-          .where(created_at: timeframe)
-          .order(created_at: :desc)
-          .limit(limit * 3)
-          .map(&:attributes)
-      end
-
-      # Score and enrich
-      scored_nodes = candidates.map do |node|
-        relevance = calculate_relevance(
-          node: node,
-          query_tags: query_tags,
-          vector_similarity: node['similarity']&.to_f,
-          ont_weight: ont_weight,
-          scale_to_100: scale_to_100
-        )
-        node.merge({
-          'relevance' => relevance,
-          'tags' => get_node_tags(node['id'])
-        })
-      end
-
-      # Filter by threshold if provided (e.g., >=80 for 0-100 scale)
-      scored_nodes = scored_nodes.select { |n| threshold.nil? || n['relevance'] >= threshold }
-
-      # Sort by relevance DESC, take limit (or all if threshold used)
-      scored_nodes
-        .sort_by { |n| -n['relevance'] }
-        .take(limit)
-    end
-
-=end
-
-
-
  end
end
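
For orientation, here is a short usage sketch assembled from the signatures visible in the diff above: `LongTermMemory#add` with content-hash deduplication, the metadata-aware `search_hybrid`, the per-robot `mark_evicted`, and `clear_cache!`. The database configuration and the embedding-service wiring are assumptions for illustration; the diff only requires that `embedding_service` respond to `embed`.

```ruby
require "htm"

# Assumed setup: a configured PostgreSQL database. HTM::Database.default_config
# appears in the diff's @example for LongTermMemory#initialize.
ltm = HTM::LongTermMemory.new(HTM::Database.default_config, cache_size: 500, cache_ttl: 600)

# Adding the same content twice links the second robot to the existing node
# instead of violating the content_hash unique constraint.
first  = ltm.add(content: "PostgreSQL powers long-term memory", token_count: 6, robot_id: 1)
second = ltm.add(content: "PostgreSQL powers long-term memory", token_count: 6, robot_id: 2)
second[:is_new]                       # => false
second[:node_id] == first[:node_id]   # => true

# Hybrid search now accepts a nil timeframe and a JSONB metadata filter.
# The embedding service shown here is an assumption; any object with #embed works.
embedding_service = HTM::EmbeddingService.new  # assumed constructor
results = ltm.search_hybrid(
  timeframe: nil,
  query: "postgresql",
  limit: 10,
  embedding_service: embedding_service,
  metadata: { "source" => "cli" }
)

# Eviction is now scoped per robot, and the query cache can be cleared
# explicitly after data-modifying operations.
ltm.mark_evicted(robot_id: 1, node_ids: results.map { |r| r["id"] })
ltm.clear_cache!
```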
|