htm 0.0.31 → 0.0.32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157)
  1. checksums.yaml +4 -4
  2. data/.irbrc +2 -3
  3. data/.rubocop.yml +184 -0
  4. data/CHANGELOG.md +46 -0
  5. data/README.md +2 -0
  6. data/Rakefile +93 -12
  7. data/db/migrate/00008_create_node_relationships.rb +54 -0
  8. data/db/migrate/00009_fix_node_relationships_column_types.rb +17 -0
  9. data/db/schema.sql +124 -1
  10. data/docs/api/database.md +35 -57
  11. data/docs/api/embedding-service.md +1 -1
  12. data/docs/api/index.md +26 -15
  13. data/docs/api/working-memory.md +8 -8
  14. data/docs/architecture/index.md +5 -7
  15. data/docs/architecture/overview.md +5 -8
  16. data/docs/assets/images/htm-architecture-overview.svg +1 -1
  17. data/docs/assets/images/htm-context-assembly-flow.svg +2 -2
  18. data/docs/assets/images/htm-layered-architecture.svg +3 -3
  19. data/docs/assets/images/two-tier-memory-architecture.svg +1 -1
  20. data/docs/database/README.md +1 -0
  21. data/docs/database_rake_tasks.md +20 -28
  22. data/docs/development/contributing.md +5 -5
  23. data/docs/development/index.md +4 -7
  24. data/docs/development/schema.md +71 -1
  25. data/docs/development/setup.md +40 -82
  26. data/docs/development/testing.md +1 -1
  27. data/docs/examples/file-loading.md +4 -4
  28. data/docs/examples/mcp-client.md +1 -1
  29. data/docs/getting-started/quick-start.md +4 -4
  30. data/docs/guides/adding-memories.md +14 -1
  31. data/docs/guides/configuration.md +5 -5
  32. data/docs/guides/context-assembly.md +4 -4
  33. data/docs/guides/file-loading.md +12 -12
  34. data/docs/guides/getting-started.md +2 -2
  35. data/docs/guides/long-term-memory.md +7 -27
  36. data/docs/guides/propositions.md +20 -19
  37. data/docs/guides/recalling-memories.md +5 -5
  38. data/docs/guides/tags.md +18 -13
  39. data/docs/multi_framework_support.md +1 -1
  40. data/docs/robots/hive-mind.md +1 -1
  41. data/docs/robots/multi-robot.md +2 -2
  42. data/docs/robots/robot-groups.md +1 -1
  43. data/docs/robots/two-tier-memory.md +72 -94
  44. data/docs/setup_local_database.md +8 -54
  45. data/docs/using_rake_tasks_in_your_app.md +6 -6
  46. data/examples/01_basic_usage.rb +1 -0
  47. data/examples/03_custom_llm_configuration.rb +1 -0
  48. data/examples/04_file_loader_usage.rb +1 -0
  49. data/examples/05_timeframe_demo.rb +1 -0
  50. data/examples/06_example_app/app.rb +1 -0
  51. data/examples/07_cli_app/htm_cli.rb +1 -0
  52. data/examples/09_mcp_client.rb +1 -0
  53. data/examples/10_telemetry/demo.rb +1 -0
  54. data/examples/11_robot_groups/multi_process.rb +1 -0
  55. data/examples/11_robot_groups/same_process.rb +1 -0
  56. data/examples/12_rails_app/.envrc +12 -0
  57. data/examples/12_rails_app/Gemfile +8 -3
  58. data/examples/12_rails_app/Gemfile.lock +94 -89
  59. data/examples/12_rails_app/README.md +70 -19
  60. data/examples/12_rails_app/app/controllers/application_controller.rb +6 -0
  61. data/examples/12_rails_app/app/controllers/chats_controller.rb +305 -0
  62. data/examples/12_rails_app/app/controllers/dashboard_controller.rb +3 -0
  63. data/examples/12_rails_app/app/controllers/files_controller.rb +17 -2
  64. data/examples/12_rails_app/app/controllers/home_controller.rb +8 -0
  65. data/examples/12_rails_app/app/controllers/memories_controller.rb +9 -4
  66. data/examples/12_rails_app/app/controllers/messages_controller.rb +214 -0
  67. data/examples/12_rails_app/app/controllers/robots_controller.rb +11 -1
  68. data/examples/12_rails_app/app/controllers/tags_controller.rb +14 -1
  69. data/examples/12_rails_app/app/javascript/application.js +1 -1
  70. data/examples/12_rails_app/app/models/application_record.rb +5 -0
  71. data/examples/12_rails_app/app/models/chat.rb +36 -0
  72. data/examples/12_rails_app/app/models/message.rb +5 -0
  73. data/examples/12_rails_app/app/models/model.rb +5 -0
  74. data/examples/12_rails_app/app/models/tool_call.rb +5 -0
  75. data/examples/12_rails_app/app/views/chats/index.html.erb +61 -0
  76. data/examples/12_rails_app/app/views/chats/show.html.erb +213 -0
  77. data/examples/12_rails_app/app/views/dashboard/index.html.erb +3 -0
  78. data/examples/12_rails_app/app/views/files/index.html.erb +10 -5
  79. data/examples/12_rails_app/app/views/files/new.html.erb +4 -2
  80. data/examples/12_rails_app/app/views/files/show.html.erb +19 -3
  81. data/examples/12_rails_app/app/views/home/index.html.erb +45 -0
  82. data/examples/12_rails_app/app/views/layouts/application.html.erb +20 -18
  83. data/examples/12_rails_app/app/views/memories/_memory_card.html.erb +1 -1
  84. data/examples/12_rails_app/app/views/memories/deleted.html.erb +3 -1
  85. data/examples/12_rails_app/app/views/memories/edit.html.erb +2 -0
  86. data/examples/12_rails_app/app/views/memories/index.html.erb +2 -0
  87. data/examples/12_rails_app/app/views/memories/new.html.erb +2 -0
  88. data/examples/12_rails_app/app/views/memories/show.html.erb +4 -2
  89. data/examples/12_rails_app/app/views/messages/_message.html.erb +20 -0
  90. data/examples/12_rails_app/app/views/robots/index.html.erb +2 -0
  91. data/examples/12_rails_app/app/views/robots/new.html.erb +2 -0
  92. data/examples/12_rails_app/app/views/robots/show.html.erb +2 -0
  93. data/examples/12_rails_app/app/views/search/index.html.erb +59 -8
  94. data/examples/12_rails_app/app/views/shared/_navbar.html.erb +75 -29
  95. data/examples/12_rails_app/app/views/tags/index.html.erb +2 -0
  96. data/examples/12_rails_app/app/views/tags/show.html.erb +3 -1
  97. data/examples/12_rails_app/config/application.rb +1 -1
  98. data/examples/12_rails_app/config/database.yml +9 -5
  99. data/examples/12_rails_app/config/importmap.rb +1 -1
  100. data/examples/12_rails_app/config/initializers/htm.rb +9 -2
  101. data/examples/12_rails_app/config/initializers/ruby_llm.rb +33 -0
  102. data/examples/12_rails_app/config/routes.rb +39 -23
  103. data/examples/12_rails_app/db/migrate/20250124000001_create_ruby_llm_tables.rb +34 -0
  104. data/examples/12_rails_app/db/migrate/20250124000002_create_models_table.rb +28 -0
  105. data/examples/12_rails_app/db/schema.rb +67 -0
  106. data/examples/examples_helper.rb +25 -0
  107. data/lib/htm/circuit_breaker.rb +5 -6
  108. data/lib/htm/config/builder.rb +12 -12
  109. data/lib/htm/config/database.rb +21 -27
  110. data/lib/htm/config/validator.rb +12 -18
  111. data/lib/htm/config.rb +76 -65
  112. data/lib/htm/database.rb +193 -199
  113. data/lib/htm/embedding_service.rb +4 -9
  114. data/lib/htm/integrations/sinatra.rb +7 -7
  115. data/lib/htm/job_adapter.rb +14 -21
  116. data/lib/htm/jobs/generate_embedding_job.rb +28 -44
  117. data/lib/htm/jobs/generate_propositions_job.rb +29 -55
  118. data/lib/htm/jobs/generate_relationships_job.rb +137 -0
  119. data/lib/htm/jobs/generate_tags_job.rb +45 -67
  120. data/lib/htm/loaders/markdown_loader.rb +65 -112
  121. data/lib/htm/long_term_memory/fulltext_search.rb +1 -1
  122. data/lib/htm/long_term_memory/hybrid_search.rb +300 -128
  123. data/lib/htm/long_term_memory/node_operations.rb +2 -2
  124. data/lib/htm/long_term_memory/relevance_scorer.rb +100 -68
  125. data/lib/htm/long_term_memory/tag_operations.rb +87 -120
  126. data/lib/htm/long_term_memory/vector_search.rb +1 -1
  127. data/lib/htm/long_term_memory.rb +2 -1
  128. data/lib/htm/mcp/cli.rb +59 -58
  129. data/lib/htm/mcp/server.rb +5 -6
  130. data/lib/htm/mcp/tools.rb +30 -36
  131. data/lib/htm/migration.rb +10 -10
  132. data/lib/htm/models/node.rb +2 -3
  133. data/lib/htm/models/node_relationship.rb +72 -0
  134. data/lib/htm/models/node_tag.rb +2 -2
  135. data/lib/htm/models/robot_node.rb +2 -2
  136. data/lib/htm/models/tag.rb +41 -28
  137. data/lib/htm/observability.rb +45 -51
  138. data/lib/htm/proposition_service.rb +3 -7
  139. data/lib/htm/query_cache.rb +13 -15
  140. data/lib/htm/railtie.rb +1 -2
  141. data/lib/htm/robot_group.rb +9 -9
  142. data/lib/htm/sequel_config.rb +1 -0
  143. data/lib/htm/sql_builder.rb +1 -1
  144. data/lib/htm/tag_service.rb +2 -6
  145. data/lib/htm/timeframe.rb +4 -5
  146. data/lib/htm/timeframe_extractor.rb +42 -83
  147. data/lib/htm/version.rb +1 -1
  148. data/lib/htm/workflows/remember_workflow.rb +112 -115
  149. data/lib/htm/working_memory.rb +21 -26
  150. data/lib/htm.rb +103 -116
  151. data/lib/tasks/db.rake +0 -2
  152. data/lib/tasks/doc.rake +14 -13
  153. data/lib/tasks/files.rake +5 -12
  154. data/lib/tasks/htm.rake +70 -71
  155. data/lib/tasks/jobs.rake +41 -47
  156. data/lib/tasks/tags.rake +3 -8
  157. metadata +25 -100
@@ -58,21 +58,21 @@ class HTM
58
58
  def calculate_relevance(node:, query_tags: [], vector_similarity: nil, node_tags: nil)
59
59
  # 1. Vector similarity (semantic match)
60
60
  semantic_score = if vector_similarity
61
- vector_similarity
62
- elsif node['similarity']
63
- node['similarity'].to_f
64
- else
65
- DEFAULT_NEUTRAL_SCORE # Neutral if no embedding
66
- end
61
+ vector_similarity
62
+ elsif node['similarity']
63
+ node['similarity'].to_f
64
+ else
65
+ DEFAULT_NEUTRAL_SCORE # Neutral if no embedding
66
+ end
67
67
 
68
68
  # 2. Tag overlap (categorical relevance)
69
69
  # Use pre-loaded tags if provided, otherwise fetch (for backward compatibility)
70
70
  node_tags ||= get_node_tags(node['id'])
71
71
  tag_score = if query_tags.any? && node_tags.any?
72
- weighted_hierarchical_jaccard(query_tags, node_tags)
73
- else
74
- DEFAULT_NEUTRAL_SCORE # Neutral if no tags
75
- end
72
+ weighted_hierarchical_jaccard(query_tags, node_tags)
73
+ else
74
+ DEFAULT_NEUTRAL_SCORE # Neutral if no tags
75
+ end
76
76
 
77
77
  # 3. Recency (temporal relevance) - exponential decay with half-life
78
78
  age_hours = (Time.now - Time.parse(node['created_at'].to_s)) / 3600.0
@@ -108,16 +108,22 @@ class HTM
108
108
  def search_with_relevance(timeframe:, query: nil, query_tags: [], limit: 20, embedding_service: nil, metadata: {})
109
109
  # Get candidates from appropriate search method
110
110
  candidates = if query && embedding_service
111
- # Vector search (returns hashes directly)
112
- search_uncached(timeframe: timeframe, query: query, limit: limit * 2, embedding_service: embedding_service, metadata: metadata)
113
- elsif query
114
- # Full-text search (returns hashes directly)
115
- search_fulltext_uncached(timeframe: timeframe, query: query, limit: limit * 2, metadata: metadata)
116
- else
117
- # Time-range only - use raw SQL to avoid ORM object instantiation
118
- # This is more efficient than .map(&:attributes) which creates intermediate objects
119
- fetch_candidates_by_timeframe(timeframe: timeframe, metadata: metadata, limit: limit * 2)
120
- end
111
+ # Vector search (returns hashes directly)
112
+ search_uncached(timeframe: timeframe, query: query, limit: limit * 2, embedding_service: embedding_service,
113
+ metadata: metadata)
114
+ elsif query
115
+ # Full-text search (returns hashes directly)
116
+ search_fulltext_uncached(timeframe: timeframe, query: query, limit: limit * 2, metadata: metadata)
117
+ else
118
+ # Time-range only - use raw SQL to avoid ORM object instantiation
119
+ # This is more efficient than .map(&:attributes) which creates intermediate objects
120
+ fetch_candidates_by_timeframe(timeframe: timeframe, metadata: metadata, limit: limit * 2)
121
+ end
122
+
123
+ # Normalize similarity and text_rank to [0,1] across all candidates
124
+ # before scoring so weighted sum is unbiased (ts_rank is unbounded,
125
+ # similarity is already [0,1] but may be narrow)
126
+ normalize_scores_batch(candidates)
121
127
 
122
128
  # Batch load all tags for candidates (fixes N+1 query)
123
129
  node_ids = candidates.map { |n| n['id'] }
@@ -183,62 +189,49 @@ class HTM
183
189
  def search_by_tags(tags:, match_all: false, timeframe: nil, limit: 20)
184
190
  return [] if tags.empty?
185
191
 
186
- # Build base query with specific columns to avoid loading unnecessary data
187
- query = HTM::Models::Node
188
- .select(
189
- Sequel[:nodes][:id],
190
- Sequel[:nodes][:content],
191
- Sequel[:nodes][:access_count],
192
- Sequel[:nodes][:created_at],
193
- Sequel[:nodes][:token_count]
194
- )
195
- .join(:node_tags, node_id: :id)
196
- .join(:tags, id: Sequel[:node_tags][:tag_id])
197
- .where(Sequel[:tags][:name] => tags)
198
- .distinct
199
-
200
- # Apply timeframe filter if provided
201
- query = query.where(Sequel[:nodes][:created_at] => timeframe) if timeframe
202
-
203
- if match_all
204
- # Match ALL tags (intersection)
205
- query = query
206
- .group(Sequel[:nodes][:id])
207
- .having { Sequel.function(:count, Sequel[:tags][:name].distinct) =~ tags.size }
208
- end
192
+ nodes = fetch_nodes_by_tags(tags, match_all: match_all, timeframe: timeframe, limit: limit)
193
+ enrich_nodes_with_relevance(nodes, query_tags: tags)
194
+ end
209
195
 
210
- # Fetch and convert to hashes with string keys
211
- nodes = query.limit(limit).all.map do |row|
212
- {
213
- 'id' => row[:id],
214
- 'content' => row[:content],
215
- 'access_count' => row[:access_count],
216
- 'created_at' => row[:created_at],
217
- 'token_count' => row[:token_count]
218
- }
196
+ private
197
+
198
+ def fetch_nodes_by_tags(tags, match_all:, timeframe:, limit:)
199
+ query = build_tag_base_query(tags, timeframe)
200
+ query = apply_match_all_constraint(query, tags) if match_all
201
+ query.limit(limit).all.map do |row|
202
+ { 'id' => row[:id], 'content' => row[:content],
203
+ 'access_count' => row[:access_count], 'created_at' => row[:created_at], 'token_count' => row[:token_count] }
219
204
  end
205
+ end
220
206
 
221
- # Batch load all tags for nodes (fixes N+1 query)
222
- node_ids = nodes.map { |n| n['id'] }
223
- tags_by_node = batch_load_node_tags(node_ids)
207
+ def build_tag_base_query(tags, timeframe)
208
+ cols = [Sequel[:nodes][:id], Sequel[:nodes][:content], Sequel[:nodes][:access_count],
209
+ Sequel[:nodes][:created_at], Sequel[:nodes][:token_count]]
210
+ query = HTM::Models::Node
211
+ .select(*cols)
212
+ .join(:node_tags, node_id: :id)
213
+ .join(:tags, id: Sequel[:node_tags][:tag_id])
214
+ .where(Sequel[:tags][:name] => tags)
215
+ .distinct
216
+ timeframe ? query.where(Sequel[:nodes][:created_at] => timeframe) : query
217
+ end
224
218
 
225
- # Calculate relevance and enrich with tags (modify in-place)
226
- nodes.map do |node|
227
- node_tags = tags_by_node[node['id']] || []
228
- relevance = calculate_relevance(
229
- node: node,
230
- query_tags: tags,
231
- node_tags: node_tags
232
- )
219
+ def apply_match_all_constraint(query, tags)
220
+ query.group(Sequel[:nodes][:id])
221
+ .having { Sequel.function(:count, Sequel[:tags][:name].distinct) =~ tags.size }
222
+ end
233
223
 
234
- node['relevance'] = relevance
224
+ def enrich_nodes_with_relevance(nodes, query_tags:)
225
+ tags_by_node = batch_load_node_tags(nodes.map { |n| n['id'] })
226
+ enriched = nodes.map do |node|
227
+ node_tags = tags_by_node[node['id']] || []
228
+ node['relevance'] = calculate_relevance(node: node, query_tags: query_tags, node_tags: node_tags)
235
229
  node['tags'] = node_tags
236
230
  node
237
- end.sort_by { |n| -n['relevance'] }
231
+ end
232
+ enriched.sort_by { |n| -n['relevance'] }
238
233
  end
239
234
 
240
- private
241
-
242
235
  # Calculate Jaccard similarity between two sets
243
236
  #
244
237
  # @param set_a [Array] First set
@@ -309,7 +302,7 @@ class HTM
309
302
  end
310
303
  end
311
304
 
312
- total_weights > 0 ? total_weighted_similarity / total_weights : 0.0
305
+ total_weights.positive? ? total_weighted_similarity / total_weights : 0.0
313
306
  end
314
307
 
315
308
  # Calculate similarity between two pre-split hierarchical tags
@@ -341,6 +334,45 @@ class HTM
341
334
  [similarity, depth_weight]
342
335
  end
343
336
 
337
+ # Min-max normalize signal columns across all candidates to [0, 1]
338
+ #
339
+ # Normalizes 'similarity' and 'text_rank' in-place so the weighted
340
+ # composite in calculate_relevance is not biased by different scales
341
+ # (ts_rank is unbounded, similarity is [0,1]).
342
+ #
343
+ # Handles edge cases:
344
+ # - Single element: no-op (already effectively normalized)
345
+ # - All-same values: maps to 1.0 (avoids division by zero)
346
+ # - Missing keys: skips normalization for that signal
347
+ #
348
+ # @param candidates [Array<Hash>] Candidate nodes (modified in-place)
349
+ # @return [Array<Hash>] Same array, normalized
350
+ #
351
+ def normalize_scores_batch(candidates)
352
+ return candidates if candidates.size <= 1
353
+
354
+ %w[similarity text_rank].each do |key|
355
+ values = candidates.filter_map { |c| c[key]&.to_f }
356
+ next if values.empty?
357
+
358
+ min_val = values.min
359
+ max_val = values.max
360
+ range = max_val - min_val
361
+
362
+ candidates.each do |c|
363
+ next unless c.key?(key) && c[key]
364
+
365
+ c[key] = if range.zero?
366
+ 1.0
367
+ else
368
+ (c[key].to_f - min_val) / range
369
+ end
370
+ end
371
+ end
372
+
373
+ candidates
374
+ end
375
+
344
376
  # Calculate similarity between two hierarchical tags (string version)
345
377
  #
346
378
  # Compares tags level by level, returning both similarity and a weight
@@ -78,45 +78,10 @@ class HTM
78
78
  # - default - LIKE prefix match (e.g., "database" matches "database:postgresql")
79
79
  #
80
80
  def nodes_by_topic(topic_path, exact: false, fuzzy: false, min_similarity: DEFAULT_TAG_SIMILARITY_THRESHOLD, limit: 50)
81
- # Enforce limit to prevent DoS
82
- safe_limit = [[limit.to_i, 1].max, MAX_TAG_QUERY_LIMIT].min
83
-
84
- # Build base query with joins
85
- # Use subquery with DISTINCT ON to get unique nodes by id
86
- if exact
87
- node_ids = HTM::Models::Node
88
- .select(Sequel[:nodes][:id])
89
- .join(:node_tags, node_id: :id)
90
- .join(:tags, id: Sequel[:node_tags][:tag_id])
91
- .where(Sequel[:tags][:name] => topic_path)
92
- .distinct
93
- .select_map(Sequel[:nodes][:id])
94
- elsif fuzzy
95
- # Trigram similarity search - tolerates typos and partial matches
96
- safe_similarity = [[min_similarity.to_f, 0.0].max, 1.0].min
97
- node_ids = HTM::Models::Node
98
- .select(Sequel[:nodes][:id])
99
- .join(:node_tags, node_id: :id)
100
- .join(:tags, id: Sequel[:node_tags][:tag_id])
101
- .where(Sequel.lit("similarity(tags.name, ?) >= ?", topic_path, safe_similarity))
102
- .distinct
103
- .select_map(Sequel[:nodes][:id])
104
- else
105
- # Sanitize LIKE pattern to prevent wildcard injection
106
- safe_pattern = HTM::SqlBuilder.sanitize_like_pattern(topic_path)
107
- node_ids = HTM::Models::Node
108
- .select(Sequel[:nodes][:id])
109
- .join(:node_tags, node_id: :id)
110
- .join(:tags, id: Sequel[:node_tags][:tag_id])
111
- .where(Sequel.like(Sequel[:tags][:name], "#{safe_pattern}%"))
112
- .distinct
113
- .select_map(Sequel[:nodes][:id])
114
- end
115
-
116
- # Return empty array if no node_ids found
81
+ safe_limit = limit.to_i.clamp(1, MAX_TAG_QUERY_LIMIT)
82
+ node_ids = node_ids_for_topic(topic_path, exact: exact, fuzzy: fuzzy, min_similarity: min_similarity)
117
83
  return [] if node_ids.empty?
118
84
 
119
- # Fetch full node records for the matching ids
120
85
  HTM::Models::Node
121
86
  .where(id: node_ids)
122
87
  .order(Sequel.desc(:created_at))
@@ -143,7 +108,7 @@ class HTM
143
108
  #
144
109
  def topic_relationships(min_shared_nodes: 2, limit: 50)
145
110
  # Enforce limit to prevent DoS
146
- safe_limit = [[limit.to_i, 1].max, MAX_TAG_QUERY_LIMIT].min
111
+ safe_limit = limit.to_i.clamp(1, MAX_TAG_QUERY_LIMIT)
147
112
  safe_min = [min_shared_nodes.to_i, 1].max
148
113
 
149
114
  sql = <<~SQL
@@ -200,9 +165,9 @@ class HTM
200
165
 
201
166
  # Single query to get all tags for all nodes
202
167
  results = HTM::Models::NodeTag
203
- .join(:tags, id: :tag_id)
204
- .where(node_id: node_ids)
205
- .select_map([:node_id, Sequel[:tags][:name]])
168
+ .join(:tags, id: :tag_id)
169
+ .where(node_id: node_ids)
170
+ .select_map([:node_id, Sequel[:tags][:name]])
206
171
 
207
172
  # Group by node_id
208
173
  results.group_by(&:first).transform_values { |pairs| pairs.map(&:last) }
@@ -218,25 +183,11 @@ class HTM
218
183
  # @return [Array<Hash>] Tags with usage counts
219
184
  #
220
185
  def popular_tags(limit: 20, timeframe: nil)
221
- # Enforce limit to prevent DoS
222
- safe_limit = [[limit.to_i, 1].max, MAX_TAG_QUERY_LIMIT].min
223
-
224
- query = HTM::Models::Tag
225
- .join(:node_tags, tag_id: :id)
226
- .join(:nodes, id: Sequel[:node_tags][:node_id])
227
- .group(Sequel[:tags][:id], Sequel[:tags][:name])
228
- .select(Sequel[:tags][:name], Sequel.function(:count, Sequel[:node_tags][:id]).as(:usage_count))
229
-
230
- if timeframe
231
- query = query.where(Sequel[:nodes][:created_at] >= timeframe.begin)
232
- .where(Sequel[:nodes][:created_at] <= timeframe.end)
233
- end
234
-
235
- query
236
- .order(Sequel.desc(:usage_count))
237
- .limit(safe_limit)
238
- .all
239
- .map { |tag| { name: tag[:name], usage_count: tag[:usage_count].to_i } }
186
+ safe_limit = limit.to_i.clamp(1, MAX_TAG_QUERY_LIMIT)
187
+ query = base_popular_tags_query
188
+ query = filter_by_timeframe(query, timeframe) if timeframe
189
+ query.order(Sequel.desc(:usage_count)).limit(safe_limit).all
190
+ .map { |tag| { name: tag[:name], usage_count: tag[:usage_count].to_i } }
240
191
  end
241
192
 
242
193
  # Fuzzy search for tags using trigram similarity
@@ -254,8 +205,8 @@ class HTM
254
205
  return [] if query.nil? || query.strip.empty?
255
206
 
256
207
  # Enforce limits
257
- safe_limit = [[limit.to_i, 1].max, MAX_TAG_QUERY_LIMIT].min
258
- safe_similarity = [[min_similarity.to_f, 0.0].max, 1.0].min
208
+ safe_limit = limit.to_i.clamp(1, MAX_TAG_QUERY_LIMIT)
209
+ safe_similarity = min_similarity.to_f.clamp(0.0, 1.0)
259
210
 
260
211
  sql = <<~SQL
261
212
  SELECT name, similarity(name, ?) as similarity
@@ -266,8 +217,8 @@ class HTM
266
217
  SQL
267
218
 
268
219
  HTM.db.fetch(sql, query, query, safe_similarity, safe_limit)
269
- .all
270
- .map { |r| { name: r[:name], similarity: r[:similarity].to_f } }
220
+ .all
221
+ .map { |r| { name: r[:name], similarity: r[:similarity].to_f } }
271
222
  rescue Sequel::Error => e
272
223
  HTM.logger.error("Failed to search tags: #{e.message}")
273
224
  []
@@ -366,76 +317,92 @@ class HTM
366
317
  # @param min_similarity [Float] Minimum similarity for trigram matching
367
318
  # @return [Array<String>] Matched tag names
368
319
  #
369
- def find_matching_tags_unified(exact_candidates:, prefix_candidates:, component_candidates:, fuzzy_fallback: true, min_similarity: DEFAULT_TAG_SIMILARITY_THRESHOLD)
320
+ def find_matching_tags_unified(exact_candidates:, prefix_candidates:, component_candidates:, fuzzy_fallback: true,
321
+ min_similarity: DEFAULT_TAG_SIMILARITY_THRESHOLD)
370
322
  return [] if exact_candidates.empty? && prefix_candidates.empty? && component_candidates.empty?
371
323
 
372
324
  conditions = []
373
325
  params = []
326
+ append_exact_conditions(conditions, params, exact_candidates)
327
+ append_prefix_conditions(conditions, params, prefix_candidates)
328
+ append_component_conditions(conditions, params, component_candidates)
329
+ append_trigram_conditions(conditions, params, component_candidates, min_similarity) if fuzzy_fallback && component_candidates.any?
330
+ return [] if conditions.empty?
374
331
 
375
- # Exact matches (highest priority)
376
- # Use Sequel.lit with ? placeholders for proper parameter binding
377
- if exact_candidates.any?
378
- placeholders = exact_candidates.map { '?' }.join(', ')
379
- conditions << "(SELECT name, 1 as priority FROM tags WHERE name IN (#{placeholders}))"
380
- params.concat(exact_candidates)
381
- end
382
-
383
- # Prefix matches
384
- if prefix_candidates.any?
385
- placeholders = prefix_candidates.map { '?' }.join(', ')
386
- conditions << "(SELECT name, 2 as priority FROM tags WHERE name IN (#{placeholders}))"
387
- params.concat(prefix_candidates)
388
- end
389
-
390
- # Component matches
391
- if component_candidates.any?
392
- component_conditions = component_candidates.map do |_|
393
- "(name = ? OR name LIKE ? OR name LIKE ? OR name LIKE ?)"
394
- end
332
+ params << MAX_TAG_QUERY_LIMIT
333
+ sql = "SELECT DISTINCT name FROM (#{conditions.join(' UNION ')}) AS matches ORDER BY name LIMIT ?"
334
+ HTM.db.fetch(sql, *params).all.map { |r| r[:name] }
335
+ rescue Sequel::Error => e
336
+ HTM.logger.error("Failed to find matching tags: #{e.message}")
337
+ []
338
+ end
395
339
 
396
- component_params = component_candidates.flat_map do |component|
397
- safe_component = HTM::SqlBuilder.sanitize_like_pattern(component)
398
- [
399
- component, # exact match
400
- "#{safe_component}:%", # starts with
401
- "%:#{safe_component}", # ends with
402
- "%:#{safe_component}:%" # in middle
403
- ]
404
- end
340
+ def base_popular_tags_query
341
+ HTM::Models::Tag
342
+ .join(:node_tags, tag_id: :id)
343
+ .join(:nodes, id: Sequel[:node_tags][:node_id])
344
+ .group(Sequel[:tags][:id], Sequel[:tags][:name])
345
+ .select(Sequel[:tags][:name], Sequel.function(:count, Sequel[:node_tags][:id]).as(:usage_count))
346
+ end
405
347
 
406
- conditions << "(SELECT name, 3 as priority FROM tags WHERE #{component_conditions.join(' OR ')})"
407
- params.concat(component_params)
408
- end
348
+ def filter_by_timeframe(query, timeframe)
349
+ query
350
+ .where(Sequel[:nodes][:created_at] >= timeframe.begin)
351
+ .where(Sequel[:nodes][:created_at] <= timeframe.end)
352
+ end
409
353
 
410
- # Trigram fuzzy matches (lowest priority - fallback for typos)
411
- if fuzzy_fallback && component_candidates.any?
412
- safe_similarity = [[min_similarity.to_f, 0.0].max, 1.0].min
413
- trigram_conditions = component_candidates.map do |_|
414
- "similarity(name, ?) >= ?"
354
+ def node_ids_for_topic(topic_path, exact:, fuzzy:, min_similarity:)
355
+ base = HTM::Models::Node
356
+ .select(Sequel[:nodes][:id])
357
+ .join(:node_tags, node_id: :id)
358
+ .join(:tags, id: Sequel[:node_tags][:tag_id])
359
+ .distinct
360
+
361
+ node_ids_dataset =
362
+ if exact
363
+ base.where(Sequel[:tags][:name] => topic_path)
364
+ elsif fuzzy
365
+ safe_sim = min_similarity.to_f.clamp(0.0, 1.0)
366
+ base.where(Sequel.lit("similarity(tags.name, ?) >= ?", topic_path, safe_sim))
367
+ else
368
+ safe_pattern = HTM::SqlBuilder.sanitize_like_pattern(topic_path)
369
+ base.where(Sequel.like(Sequel[:tags][:name], "#{safe_pattern}%"))
415
370
  end
416
- trigram_params = component_candidates.flat_map { |c| [c, safe_similarity] }
417
371
 
418
- conditions << "(SELECT name, 4 as priority FROM tags WHERE #{trigram_conditions.join(' OR ')})"
419
- params.concat(trigram_params)
420
- end
372
+ node_ids_dataset.select_map(Sequel[:nodes][:id])
373
+ end
421
374
 
422
- return [] if conditions.empty?
375
+ def append_exact_conditions(conditions, params, exact_candidates)
376
+ return unless exact_candidates.any?
377
+ placeholders = exact_candidates.map { '?' }.join(', ')
378
+ conditions << "(SELECT name, 1 as priority FROM tags WHERE name IN (#{placeholders}))"
379
+ params.concat(exact_candidates)
380
+ end
423
381
 
424
- # Combine with UNION and order by priority
425
- params << MAX_TAG_QUERY_LIMIT
382
+ def append_prefix_conditions(conditions, params, prefix_candidates)
383
+ return unless prefix_candidates.any?
384
+ placeholders = prefix_candidates.map { '?' }.join(', ')
385
+ conditions << "(SELECT name, 2 as priority FROM tags WHERE name IN (#{placeholders}))"
386
+ params.concat(prefix_candidates)
387
+ end
426
388
 
427
- sql = <<~SQL
428
- SELECT DISTINCT name FROM (
429
- #{conditions.join(' UNION ')}
430
- ) AS matches
431
- ORDER BY name
432
- LIMIT ?
433
- SQL
389
+ def append_component_conditions(conditions, params, component_candidates)
390
+ return unless component_candidates.any?
391
+ component_conditions = component_candidates.map { "(name = ? OR name LIKE ? OR name LIKE ? OR name LIKE ?)" }
392
+ component_params = component_candidates.flat_map do |component|
393
+ safe = HTM::SqlBuilder.sanitize_like_pattern(component)
394
+ [component, "#{safe}:%", "%:#{safe}", "%:#{safe}:%"]
395
+ end
396
+ conditions << "(SELECT name, 3 as priority FROM tags WHERE #{component_conditions.join(' OR ')})"
397
+ params.concat(component_params)
398
+ end
434
399
 
435
- HTM.db.fetch(sql, *params).all.map { |r| r[:name] }
436
- rescue Sequel::Error => e
437
- HTM.logger.error("Failed to find matching tags: #{e.message}")
438
- []
400
+ def append_trigram_conditions(conditions, params, component_candidates, min_similarity)
401
+ safe_similarity = min_similarity.to_f.clamp(0.0, 1.0)
402
+ trigram_conditions = component_candidates.map { "similarity(name, ?) >= ?" }
403
+ trigram_params = component_candidates.flat_map { |c| [c, safe_similarity] }
404
+ conditions << "(SELECT name, 4 as priority FROM tags WHERE #{trigram_conditions.join(' OR ')})"
405
+ params.concat(trigram_params)
439
406
  end
440
407
  end
441
408
  end
@@ -28,7 +28,7 @@ class HTM
28
28
  #
29
29
  def search(timeframe:, query:, limit:, embedding_service:, metadata: {})
30
30
  # Enforce limit to prevent DoS
31
- safe_limit = [[limit.to_i, 1].max, MAX_VECTOR_LIMIT].min
31
+ safe_limit = limit.to_i.clamp(1, MAX_VECTOR_LIMIT)
32
32
 
33
33
  start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
34
34
  result = @cache.fetch(:search, timeframe, query, safe_limit, metadata) do
@@ -86,7 +86,8 @@ class HTM
86
86
  # @example Disable caching
87
87
  # ltm = LongTermMemory.new(config, cache_size: 0)
88
88
  #
89
- def initialize(config, pool_size: nil, query_timeout: DEFAULT_QUERY_TIMEOUT, cache_size: DEFAULT_CACHE_SIZE, cache_ttl: DEFAULT_CACHE_TTL)
89
+ def initialize(config, pool_size: nil, query_timeout: DEFAULT_QUERY_TIMEOUT, cache_size: DEFAULT_CACHE_SIZE,
90
+ cache_ttl: DEFAULT_CACHE_TTL)
90
91
  @config = config
91
92
  @query_timeout = query_timeout # in milliseconds
92
93