htm 0.0.20 → 0.0.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +60 -0
  3. data/Rakefile +104 -18
  4. data/db/migrate/00001_enable_extensions.rb +9 -5
  5. data/db/migrate/00002_create_robots.rb +18 -6
  6. data/db/migrate/00003_create_file_sources.rb +30 -17
  7. data/db/migrate/00004_create_nodes.rb +60 -48
  8. data/db/migrate/00005_create_tags.rb +24 -12
  9. data/db/migrate/00006_create_node_tags.rb +28 -13
  10. data/db/migrate/00007_create_robot_nodes.rb +40 -26
  11. data/db/schema.sql +17 -1
  12. data/db/seeds.rb +33 -33
  13. data/docs/database/naming-convention.md +244 -0
  14. data/docs/database_rake_tasks.md +31 -0
  15. data/docs/development/rake-tasks.md +80 -35
  16. data/docs/guides/mcp-server.md +70 -1
  17. data/examples/.envrc +6 -0
  18. data/examples/.gitignore +2 -0
  19. data/examples/00_create_examples_db.rb +94 -0
  20. data/examples/{basic_usage.rb → 01_basic_usage.rb} +12 -16
  21. data/examples/{custom_llm_configuration.rb → 03_custom_llm_configuration.rb} +13 -3
  22. data/examples/{file_loader_usage.rb → 04_file_loader_usage.rb} +11 -14
  23. data/examples/{timeframe_demo.rb → 05_timeframe_demo.rb} +10 -3
  24. data/examples/{example_app → 06_example_app}/app.rb +15 -15
  25. data/examples/{cli_app → 07_cli_app}/htm_cli.rb +15 -22
  26. data/examples/08_sinatra_app/Gemfile.lock +241 -0
  27. data/examples/{sinatra_app → 08_sinatra_app}/app.rb +19 -18
  28. data/examples/{mcp_client.rb → 09_mcp_client.rb} +5 -8
  29. data/examples/{telemetry → 10_telemetry}/SETUP_README.md +1 -1
  30. data/examples/{telemetry → 10_telemetry}/demo.rb +14 -10
  31. data/examples/11_robot_groups/README.md +335 -0
  32. data/examples/{robot_groups → 11_robot_groups/lib}/robot_worker.rb +17 -3
  33. data/examples/{robot_groups → 11_robot_groups}/multi_process.rb +9 -9
  34. data/examples/{robot_groups → 11_robot_groups}/same_process.rb +9 -12
  35. data/examples/{rails_app → 12_rails_app}/Gemfile +3 -0
  36. data/examples/{rails_app → 12_rails_app}/Gemfile.lock +87 -58
  37. data/examples/{rails_app → 12_rails_app}/app/controllers/dashboard_controller.rb +10 -6
  38. data/examples/{rails_app → 12_rails_app}/app/controllers/files_controller.rb +5 -5
  39. data/examples/{rails_app → 12_rails_app}/app/controllers/memories_controller.rb +11 -7
  40. data/examples/{rails_app → 12_rails_app}/app/controllers/robots_controller.rb +8 -8
  41. data/examples/12_rails_app/app/controllers/tags_controller.rb +36 -0
  42. data/examples/{rails_app → 12_rails_app}/app/views/dashboard/index.html.erb +2 -2
  43. data/examples/{rails_app → 12_rails_app}/app/views/files/new.html.erb +5 -2
  44. data/examples/{rails_app → 12_rails_app}/app/views/memories/_memory_card.html.erb +3 -3
  45. data/examples/{rails_app → 12_rails_app}/app/views/memories/deleted.html.erb +3 -3
  46. data/examples/{rails_app → 12_rails_app}/app/views/memories/edit.html.erb +3 -3
  47. data/examples/{rails_app → 12_rails_app}/app/views/memories/show.html.erb +4 -4
  48. data/examples/{rails_app → 12_rails_app}/app/views/robots/index.html.erb +2 -2
  49. data/examples/{rails_app → 12_rails_app}/app/views/robots/show.html.erb +4 -4
  50. data/examples/{rails_app → 12_rails_app}/app/views/search/index.html.erb +1 -1
  51. data/examples/{rails_app → 12_rails_app}/app/views/tags/index.html.erb +2 -2
  52. data/examples/{rails_app → 12_rails_app}/app/views/tags/show.html.erb +1 -1
  53. data/examples/12_rails_app/config/initializers/htm.rb +7 -0
  54. data/examples/12_rails_app/config/initializers/rack.rb +5 -0
  55. data/examples/README.md +230 -211
  56. data/examples/examples_helper.rb +138 -0
  57. data/lib/htm/config/builder.rb +167 -0
  58. data/lib/htm/config/database.rb +317 -0
  59. data/lib/htm/config/defaults.yml +37 -9
  60. data/lib/htm/config/section.rb +74 -0
  61. data/lib/htm/config/validator.rb +83 -0
  62. data/lib/htm/config.rb +64 -360
  63. data/lib/htm/database.rb +85 -127
  64. data/lib/htm/errors.rb +14 -0
  65. data/lib/htm/integrations/sinatra.rb +13 -44
  66. data/lib/htm/jobs/generate_embedding_job.rb +3 -4
  67. data/lib/htm/jobs/generate_propositions_job.rb +4 -5
  68. data/lib/htm/jobs/generate_tags_job.rb +16 -15
  69. data/lib/htm/loaders/defaults_loader.rb +23 -0
  70. data/lib/htm/loaders/markdown_loader.rb +17 -15
  71. data/lib/htm/loaders/xdg_config_loader.rb +9 -9
  72. data/lib/htm/long_term_memory/fulltext_search.rb +14 -14
  73. data/lib/htm/long_term_memory/hybrid_search.rb +396 -229
  74. data/lib/htm/long_term_memory/node_operations.rb +24 -23
  75. data/lib/htm/long_term_memory/relevance_scorer.rb +23 -20
  76. data/lib/htm/long_term_memory/robot_operations.rb +4 -4
  77. data/lib/htm/long_term_memory/tag_operations.rb +91 -77
  78. data/lib/htm/long_term_memory/vector_search.rb +4 -5
  79. data/lib/htm/long_term_memory.rb +13 -13
  80. data/lib/htm/mcp/cli.rb +115 -8
  81. data/lib/htm/mcp/resources.rb +4 -3
  82. data/lib/htm/mcp/server.rb +5 -4
  83. data/lib/htm/mcp/tools.rb +37 -28
  84. data/lib/htm/migration.rb +72 -0
  85. data/lib/htm/models/file_source.rb +52 -31
  86. data/lib/htm/models/node.rb +224 -108
  87. data/lib/htm/models/node_tag.rb +49 -28
  88. data/lib/htm/models/robot.rb +38 -27
  89. data/lib/htm/models/robot_node.rb +63 -35
  90. data/lib/htm/models/tag.rb +126 -123
  91. data/lib/htm/observability.rb +45 -41
  92. data/lib/htm/proposition_service.rb +76 -7
  93. data/lib/htm/railtie.rb +2 -2
  94. data/lib/htm/robot_group.rb +30 -18
  95. data/lib/htm/sequel_config.rb +215 -0
  96. data/lib/htm/sql_builder.rb +14 -16
  97. data/lib/htm/tag_service.rb +78 -0
  98. data/lib/htm/tasks.rb +3 -0
  99. data/lib/htm/version.rb +1 -1
  100. data/lib/htm/workflows/remember_workflow.rb +6 -5
  101. data/lib/htm.rb +26 -22
  102. data/lib/tasks/db.rake +0 -2
  103. data/lib/tasks/doc.rake +2 -2
  104. data/lib/tasks/files.rake +11 -18
  105. data/lib/tasks/htm.rake +190 -62
  106. data/lib/tasks/jobs.rake +179 -54
  107. data/lib/tasks/tags.rake +8 -13
  108. data/scripts/backfill_parent_tags.rb +376 -0
  109. data/scripts/normalize_plural_tags.rb +335 -0
  110. metadata +109 -80
  111. data/examples/rails_app/app/controllers/tags_controller.rb +0 -30
  112. data/examples/sinatra_app/Gemfile.lock +0 -166
  113. data/lib/htm/active_record_config.rb +0 -104
  114. /data/examples/{config_file_example → 02_config_file_example}/README.md +0 -0
  115. /data/examples/{config_file_example → 02_config_file_example}/config/htm.local.yml +0 -0
  116. /data/examples/{config_file_example → 02_config_file_example}/custom_config.yml +0 -0
  117. /data/examples/{config_file_example → 02_config_file_example}/show_config.rb +0 -0
  118. /data/examples/{example_app → 06_example_app}/Rakefile +0 -0
  119. /data/examples/{cli_app → 07_cli_app}/README.md +0 -0
  120. /data/examples/{sinatra_app → 08_sinatra_app}/Gemfile +0 -0
  121. /data/examples/{telemetry → 10_telemetry}/README.md +0 -0
  122. /data/examples/{telemetry → 10_telemetry}/grafana/dashboards/htm-metrics.json +0 -0
  123. /data/examples/{rails_app → 12_rails_app}/.gitignore +0 -0
  124. /data/examples/{rails_app → 12_rails_app}/Procfile.dev +0 -0
  125. /data/examples/{rails_app → 12_rails_app}/README.md +0 -0
  126. /data/examples/{rails_app → 12_rails_app}/Rakefile +0 -0
  127. /data/examples/{rails_app → 12_rails_app}/app/assets/stylesheets/application.css +0 -0
  128. /data/examples/{rails_app → 12_rails_app}/app/assets/stylesheets/inter-font.css +0 -0
  129. /data/examples/{rails_app → 12_rails_app}/app/controllers/application_controller.rb +0 -0
  130. /data/examples/{rails_app → 12_rails_app}/app/controllers/search_controller.rb +0 -0
  131. /data/examples/{rails_app → 12_rails_app}/app/javascript/application.js +0 -0
  132. /data/examples/{rails_app → 12_rails_app}/app/javascript/controllers/application.js +0 -0
  133. /data/examples/{rails_app → 12_rails_app}/app/javascript/controllers/index.js +0 -0
  134. /data/examples/{rails_app → 12_rails_app}/app/views/files/index.html.erb +0 -0
  135. /data/examples/{rails_app → 12_rails_app}/app/views/files/show.html.erb +0 -0
  136. /data/examples/{rails_app → 12_rails_app}/app/views/layouts/application.html.erb +0 -0
  137. /data/examples/{rails_app → 12_rails_app}/app/views/memories/index.html.erb +0 -0
  138. /data/examples/{rails_app → 12_rails_app}/app/views/memories/new.html.erb +0 -0
  139. /data/examples/{rails_app → 12_rails_app}/app/views/robots/new.html.erb +0 -0
  140. /data/examples/{rails_app → 12_rails_app}/app/views/shared/_navbar.html.erb +0 -0
  141. /data/examples/{rails_app → 12_rails_app}/app/views/shared/_stat_card.html.erb +0 -0
  142. /data/examples/{rails_app → 12_rails_app}/bin/dev +0 -0
  143. /data/examples/{rails_app → 12_rails_app}/bin/rails +0 -0
  144. /data/examples/{rails_app → 12_rails_app}/bin/rake +0 -0
  145. /data/examples/{rails_app → 12_rails_app}/config/application.rb +0 -0
  146. /data/examples/{rails_app → 12_rails_app}/config/boot.rb +0 -0
  147. /data/examples/{rails_app → 12_rails_app}/config/database.yml +0 -0
  148. /data/examples/{rails_app → 12_rails_app}/config/environment.rb +0 -0
  149. /data/examples/{rails_app → 12_rails_app}/config/importmap.rb +0 -0
  150. /data/examples/{rails_app → 12_rails_app}/config/routes.rb +0 -0
  151. /data/examples/{rails_app → 12_rails_app}/config/tailwind.config.js +0 -0
  152. /data/examples/{rails_app → 12_rails_app}/config.ru +0 -0
  153. /data/examples/{rails_app → 12_rails_app}/log/.keep +0 -0
  154. /data/examples/{rails_app → 12_rails_app}/tmp/local_secret.txt +0 -0
@@ -2,16 +2,20 @@
2
2
 
3
3
  class HTM
4
4
  class LongTermMemory
5
- # Hybrid search combining full-text and vector similarity
5
+ # Hybrid search using Reciprocal Rank Fusion (RRF)
6
6
  #
7
- # Performs combined search using:
8
- # 1. Full-text search for content matching
9
- # 2. Tag matching for categorical relevance
10
- # 3. Vector similarity for semantic ranking
7
+ # Performs three independent searches and merges results:
8
+ # 1. Vector similarity search for semantic matching
9
+ # 2. Full-text search for keyword matching
10
+ # 3. Tag-based search for hierarchical category matching
11
11
  #
12
- # Nodes without embeddings are included with a default similarity score,
13
- # allowing newly created nodes to appear immediately before background
14
- # jobs complete their embedding generation.
12
+ # Results are merged using RRF scoring. Nodes appearing in multiple
13
+ # searches receive boosted scores, making them rank higher.
14
+ #
15
+ # Tag scoring uses hierarchical depth matching - the more levels of a
16
+ # tag hierarchy that match, the higher the score contribution.
17
+ #
18
+ # RRF Formula: score = Σ 1/(k + rank) for each search where node appears
15
19
  #
16
20
  # Results are cached for performance.
17
21
  #
@@ -20,31 +24,38 @@ class HTM
20
24
  module HybridSearch
21
25
  # Maximum results to prevent DoS via unbounded queries
22
26
  MAX_HYBRID_LIMIT = 1000
23
- MAX_PREFILTER_LIMIT = 5000
24
27
 
25
- # Hybrid search (full-text + vector)
28
+ # RRF constant - higher values reduce the impact of rank differences
29
+ # 60 is the standard value from the original RRF paper
30
+ RRF_K = 60
31
+
32
+ # Multiplier for candidates from each search
33
+ # We fetch more candidates than requested to ensure good fusion
34
+ CANDIDATE_MULTIPLIER = 3
35
+
36
+ # Hybrid search using Reciprocal Rank Fusion
26
37
  #
27
38
  # @param timeframe [Range] Time range to search
28
39
  # @param query [String] Search query
29
40
  # @param limit [Integer] Maximum results (capped at MAX_HYBRID_LIMIT)
30
41
  # @param embedding_service [Object] Service to generate embeddings
31
- # @param prefilter_limit [Integer] Candidates to consider (default: 100, capped at MAX_PREFILTER_LIMIT)
42
+ # @param prefilter_limit [Integer] Candidates per search (default: 100)
32
43
  # @param metadata [Hash] Filter by metadata fields (default: {})
33
44
  # @return [Array<Hash>] Matching nodes
34
45
  #
35
46
  def search_hybrid(timeframe:, query:, limit:, embedding_service:, prefilter_limit: 100, metadata: {})
36
47
  # Enforce limits to prevent DoS
37
48
  safe_limit = [[limit.to_i, 1].max, MAX_HYBRID_LIMIT].min
38
- safe_prefilter = [[prefilter_limit.to_i, 1].max, MAX_PREFILTER_LIMIT].min
49
+ safe_prefilter = [prefilter_limit.to_i, 1].max
39
50
 
40
51
  start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
41
52
  result = @cache.fetch(:hybrid, timeframe, query, safe_limit, safe_prefilter, metadata) do
42
- search_hybrid_uncached(
53
+ search_hybrid_rrf(
43
54
  timeframe: timeframe,
44
55
  query: query,
45
56
  limit: safe_limit,
46
57
  embedding_service: embedding_service,
47
- prefilter_limit: safe_prefilter,
58
+ candidate_limit: safe_prefilter * CANDIDATE_MULTIPLIER,
48
59
  metadata: metadata
49
60
  )
50
61
  end
@@ -55,269 +66,425 @@ class HTM
55
66
 
56
67
  private
57
68
 
58
- # Threshold for skipping tag extraction (as ratio of limit)
59
- # If fulltext returns >= this ratio of requested results, skip expensive tag extraction
60
- TAG_EXTRACTION_THRESHOLD = 0.5
61
-
62
- # Uncached hybrid search
69
+ # Hybrid search using Reciprocal Rank Fusion
63
70
  #
64
- # Generates query embedding client-side, then combines:
65
- # 1. Full-text search for content matching
66
- # 2. Tag matching for categorical relevance (lazy - skipped if fulltext sufficient)
67
- # 3. Vector similarity for semantic ranking
71
+ # Runs vector, fulltext, and tag searches independently, then merges
72
+ # results using RRF scoring. Nodes appearing in multiple searches
73
+ # get contributions from each, naturally boosting them.
68
74
  #
69
- # @param timeframe [nil, Range, Array<Range>] Time range(s) to search (nil = no filter)
75
+ # @param timeframe [nil, Range, Array<Range>] Time range(s) to search
70
76
  # @param query [String] Search query
71
77
  # @param limit [Integer] Maximum results
72
78
  # @param embedding_service [Object] Service to generate query embedding
73
- # @param prefilter_limit [Integer] Candidates to consider
74
- # @param metadata [Hash] Filter by metadata fields (default: {})
75
- # @return [Array<Hash>] Matching nodes with similarity and tag_boost scores
79
+ # @param candidate_limit [Integer] Candidates to fetch from each search
80
+ # @param metadata [Hash] Filter by metadata fields
81
+ # @return [Array<Hash>] Merged results with RRF scores
76
82
  #
77
- def search_hybrid_uncached(timeframe:, query:, limit:, embedding_service:, prefilter_limit:, metadata: {})
78
- # Generate query embedding client-side
79
- query_embedding = embedding_service.embed(query)
80
-
81
- # Validate embedding before use
82
- unless query_embedding.is_a?(Array) && query_embedding.any?
83
- HTM.logger.error("Invalid embedding returned from embedding service")
84
- return []
85
- end
86
-
87
- # Pad embedding to 2000 dimensions if needed
88
- padded_embedding = HTM::SqlBuilder.pad_embedding(query_embedding)
89
-
90
- # Sanitize embedding for safe SQL use (validates all values are numeric)
91
- embedding_str = HTM::SqlBuilder.sanitize_embedding(padded_embedding)
92
-
93
- # Build filter conditions (with table alias for CTEs)
94
- timeframe_condition = HTM::SqlBuilder.timeframe_condition(timeframe, table_alias: 'n')
95
- metadata_condition = HTM::SqlBuilder.metadata_condition(metadata, table_alias: 'n')
96
-
97
- additional_conditions = []
98
- additional_conditions << timeframe_condition if timeframe_condition
99
- additional_conditions << metadata_condition if metadata_condition
100
- additional_sql = additional_conditions.any? ? "AND #{additional_conditions.join(' AND ')}" : ""
101
-
102
- # Same for non-aliased queries
103
- timeframe_condition_bare = HTM::SqlBuilder.timeframe_condition(timeframe)
104
- metadata_condition_bare = HTM::SqlBuilder.metadata_condition(metadata)
83
+ def search_hybrid_rrf(timeframe:, query:, limit:, embedding_service:, candidate_limit:, metadata: {})
84
+ # Run all three searches independently
85
+ vector_results = fetch_vector_candidates(
86
+ query: query,
87
+ embedding_service: embedding_service,
88
+ timeframe: timeframe,
89
+ metadata: metadata,
90
+ limit: candidate_limit
91
+ )
105
92
 
106
- additional_conditions_bare = []
107
- additional_conditions_bare << timeframe_condition_bare if timeframe_condition_bare
108
- additional_conditions_bare << metadata_condition_bare if metadata_condition_bare
109
- additional_sql_bare = additional_conditions_bare.any? ? "AND #{additional_conditions_bare.join(' AND ')}" : ""
93
+ fulltext_results = fetch_fulltext_candidates(
94
+ query: query,
95
+ timeframe: timeframe,
96
+ metadata: metadata,
97
+ limit: candidate_limit
98
+ )
110
99
 
111
- # OPTIMIZATION: Lazy tag extraction
112
- # Only extract tags if fulltext results are insufficient.
113
- # This skips the expensive LLM call (~500-3000ms) when fulltext alone
114
- # provides enough results.
115
- fulltext_count = count_fulltext_matches(
100
+ # Extract tags from query and find matching nodes
101
+ tag_results = fetch_tag_candidates(
116
102
  query: query,
117
- additional_sql_bare: additional_sql_bare,
118
- limit: prefilter_limit
103
+ timeframe: timeframe,
104
+ metadata: metadata,
105
+ limit: candidate_limit
119
106
  )
120
107
 
121
- # Only call expensive tag extraction if fulltext results are below threshold
122
- matching_tags = if fulltext_count < (limit * TAG_EXTRACTION_THRESHOLD)
123
- find_query_matching_tags(query)
124
- else
125
- []
126
- end
108
+ # Merge using RRF
109
+ merged = merge_with_rrf(vector_results, fulltext_results, tag_results)
127
110
 
128
- # Build the hybrid query
129
- # NOTE: Hybrid search includes nodes without embeddings using a default
130
- # similarity score of 0.5. This allows newly created nodes to appear in
131
- # search results immediately (via fulltext matching) before their embeddings
132
- # are generated by background jobs.
133
-
134
- result = if matching_tags.any?
135
- search_hybrid_with_tags(
136
- query: query,
137
- embedding_str: embedding_str,
138
- matching_tags: matching_tags,
139
- additional_sql: additional_sql,
140
- prefilter_limit: prefilter_limit,
141
- limit: limit
142
- )
143
- else
144
- search_hybrid_without_tags(
145
- query: query,
146
- embedding_str: embedding_str,
147
- additional_sql_bare: additional_sql_bare,
148
- prefilter_limit: prefilter_limit,
149
- limit: limit
150
- )
151
- end
111
+ # Take top results
112
+ top_results = merged.first(limit)
152
113
 
153
114
  # Track access for retrieved nodes
154
- node_ids = result.map { |r| r['id'] }
115
+ node_ids = top_results.map { |r| r['id'] }
155
116
  track_access(node_ids)
156
117
 
157
- result.to_a
118
+ top_results
158
119
  end
159
120
 
160
- # Count fulltext matches quickly (for lazy tag extraction decision)
121
+ # Fetch candidates using vector similarity search
161
122
  #
162
123
  # @param query [String] Search query
163
- # @param additional_sql_bare [String] Additional SQL conditions
164
- # @param limit [Integer] Maximum to count up to
165
- # @return [Integer] Number of fulltext matches (capped at limit)
124
+ # @param embedding_service [Object] Service to generate embeddings
125
+ # @param timeframe [nil, Range, Array<Range>] Time filter
126
+ # @param metadata [Hash] Metadata filter
127
+ # @param limit [Integer] Maximum candidates
128
+ # @return [Array<Hash>] Results with similarity scores
166
129
  #
167
- def count_fulltext_matches(query:, additional_sql_bare:, limit:)
130
+ def fetch_vector_candidates(query:, embedding_service:, timeframe:, metadata:, limit:)
131
+ # Generate query embedding
132
+ query_embedding = embedding_service.embed(query)
133
+
134
+ unless query_embedding.is_a?(Array) && query_embedding.any?
135
+ HTM.logger.error("Invalid embedding returned from embedding service")
136
+ return []
137
+ end
138
+
139
+ padded_embedding = HTM::SqlBuilder.pad_embedding(query_embedding)
140
+ embedding_str = HTM::SqlBuilder.sanitize_embedding(padded_embedding)
141
+
142
+ # Build filter conditions
143
+ timeframe_condition = HTM::SqlBuilder.timeframe_condition(timeframe)
144
+ metadata_condition = HTM::SqlBuilder.metadata_condition(metadata)
145
+
146
+ conditions = ["embedding IS NOT NULL", "deleted_at IS NULL"]
147
+ conditions << timeframe_condition if timeframe_condition
148
+ conditions << metadata_condition if metadata_condition
149
+
150
+ where_clause = "WHERE #{conditions.join(' AND ')}"
151
+
152
+ # Note: Using Sequel.lit for the vector comparison since it needs special handling
153
+ embedding_literal = HTM.db.literal(embedding_str)
168
154
  sql = <<~SQL
169
- SELECT COUNT(*) FROM (
170
- SELECT 1 FROM nodes
171
- WHERE deleted_at IS NULL
172
- AND to_tsvector('english', content) @@ plainto_tsquery('english', ?)
173
- #{additional_sql_bare}
174
- LIMIT ?
175
- ) AS limited_count
155
+ SELECT id, content, access_count, created_at, token_count,
156
+ 1 - (embedding <=> #{embedding_literal}::vector) as similarity
157
+ FROM nodes
158
+ #{where_clause}
159
+ ORDER BY embedding <=> #{embedding_literal}::vector
160
+ LIMIT ?
176
161
  SQL
177
162
 
178
- result = ActiveRecord::Base.connection.select_value(
179
- ActiveRecord::Base.sanitize_sql_array([sql, query, limit])
180
- )
181
- result.to_i
163
+ HTM.db.fetch(sql, limit).all.map { |r| r.transform_keys(&:to_s) }
182
164
  end
183
165
 
184
- # Hybrid search with tag matching
185
- #
186
- # Uses parameterized queries and LEFT JOIN for efficient tag boosting.
166
+ # Fetch candidates using full-text search
187
167
  #
188
168
  # @param query [String] Search query
189
- # @param embedding_str [String] Sanitized embedding string
190
- # @param matching_tags [Array<String>] Tags matching the query
191
- # @param additional_sql [String] Additional SQL conditions
192
- # @param prefilter_limit [Integer] Candidates to consider
193
- # @param limit [Integer] Maximum results
194
- # @return [ActiveRecord::Result] Query results
169
+ # @param timeframe [nil, Range, Array<Range>] Time filter
170
+ # @param metadata [Hash] Metadata filter
171
+ # @param limit [Integer] Maximum candidates
172
+ # @return [Array<Hash>] Results with text rank scores
195
173
  #
196
- def search_hybrid_with_tags(query:, embedding_str:, matching_tags:, additional_sql:, prefilter_limit:, limit:)
197
- # Build tag placeholders for parameterized query
198
- tag_placeholders = matching_tags.map { '?' }.join(', ')
199
- tag_count = matching_tags.length.to_f
174
+ def fetch_fulltext_candidates(query:, timeframe:, metadata:, limit:)
175
+ timeframe_condition = HTM::SqlBuilder.timeframe_condition(timeframe)
176
+ metadata_condition = HTM::SqlBuilder.metadata_condition(metadata)
177
+
178
+ additional_conditions = []
179
+ additional_conditions << timeframe_condition if timeframe_condition
180
+ additional_conditions << metadata_condition if metadata_condition
181
+ additional_sql = additional_conditions.any? ? "AND #{additional_conditions.join(' AND ')}" : ""
200
182
 
201
- # Use parameterized query with proper placeholder binding
202
- # LEFT JOIN replaces correlated subquery for O(n) instead of O(n²)
183
+ # Combined tsvector + trigram search (same as fulltext_search.rb)
184
+ # Escape the query for safe interpolation in trigram comparisons
185
+ query_literal = HTM.db.literal(query)
203
186
  sql = <<~SQL
204
- WITH fulltext_candidates AS (
205
- -- Nodes matching full-text search (with or without embeddings)
206
- SELECT n.id, n.content, n.access_count, n.created_at, n.token_count, n.embedding
207
- FROM nodes n
208
- WHERE n.deleted_at IS NULL
209
- AND to_tsvector('english', n.content) @@ plainto_tsquery('english', ?)
187
+ WITH tsvector_matches AS (
188
+ SELECT id, content, access_count, created_at, token_count,
189
+ (1.0 + ts_rank(to_tsvector('english', content), plainto_tsquery('english', #{query_literal}))) as text_rank
190
+ FROM nodes
191
+ WHERE deleted_at IS NULL
192
+ AND to_tsvector('english', content) @@ plainto_tsquery('english', #{query_literal})
210
193
  #{additional_sql}
211
- LIMIT ?
212
194
  ),
213
- tag_candidates AS (
214
- -- Nodes matching relevant tags (with or without embeddings)
215
- SELECT n.id, n.content, n.access_count, n.created_at, n.token_count, n.embedding
216
- FROM nodes n
217
- JOIN node_tags nt ON nt.node_id = n.id
218
- JOIN tags t ON t.id = nt.tag_id
219
- WHERE n.deleted_at IS NULL
220
- AND t.name IN (#{tag_placeholders})
195
+ trigram_matches AS (
196
+ SELECT id, content, access_count, created_at, token_count,
197
+ similarity(content, #{query_literal}) as text_rank
198
+ FROM nodes
199
+ WHERE deleted_at IS NULL
200
+ AND similarity(content, #{query_literal}) >= 0.1
201
+ AND id NOT IN (SELECT id FROM tsvector_matches)
221
202
  #{additional_sql}
222
- LIMIT ?
223
- ),
224
- all_candidates AS (
225
- SELECT * FROM fulltext_candidates
226
- UNION
227
- SELECT * FROM tag_candidates
228
- ),
229
- tag_counts AS (
230
- -- Pre-compute tag counts using JOIN instead of correlated subquery
231
- SELECT nt.node_id, COUNT(DISTINCT t.name)::float AS matched_tags
232
- FROM node_tags nt
233
- JOIN tags t ON t.id = nt.tag_id
234
- WHERE t.name IN (#{tag_placeholders})
235
- GROUP BY nt.node_id
236
203
  ),
237
- scored AS (
238
- SELECT
239
- ac.id, ac.content, ac.access_count, ac.created_at, ac.token_count,
240
- CASE
241
- WHEN ac.embedding IS NOT NULL THEN 1 - (ac.embedding <=> ?::vector)
242
- ELSE 0.5
243
- END as similarity,
244
- COALESCE(tc.matched_tags / ?, 0) as tag_boost
245
- FROM all_candidates ac
246
- LEFT JOIN tag_counts tc ON tc.node_id = ac.id
204
+ combined AS (
205
+ SELECT * FROM tsvector_matches
206
+ UNION ALL
207
+ SELECT * FROM trigram_matches
247
208
  )
248
- SELECT id, content, access_count, created_at, token_count,
249
- similarity, tag_boost,
250
- (similarity * 0.7 + tag_boost * 0.3) as combined_score
251
- FROM scored
252
- ORDER BY combined_score DESC
209
+ SELECT id, content, access_count, created_at, token_count, text_rank
210
+ FROM combined
211
+ ORDER BY text_rank DESC
253
212
  LIMIT ?
254
213
  SQL
255
214
 
256
- # Build parameter array: query, prefilter, tags (first IN), prefilter, tags (second IN), embedding, tag_count, limit
257
- params = [
258
- query,
259
- prefilter_limit,
260
- *matching_tags,
261
- prefilter_limit,
262
- *matching_tags,
263
- embedding_str,
264
- tag_count,
265
- limit
266
- ]
267
-
268
- ActiveRecord::Base.connection.select_all(
269
- ActiveRecord::Base.sanitize_sql_array([sql, *params])
270
- )
215
+ HTM.db.fetch(sql, limit).all.map { |r| r.transform_keys(&:to_s) }
271
216
  end
272
217
 
273
- # Hybrid search without tag matching (fallback)
218
+ # Fetch candidates using tag-based search with hierarchical scoring
219
+ #
220
+ # Extracts tags from the query, finds nodes with matching tags,
221
+ # and scores based on hierarchical depth match.
222
+ #
223
+ # Scoring: For a query tag "database:postgresql:extensions" (3 levels):
224
+ # - Node with "database:postgresql:extensions" = 3/3 = 1.0
225
+ # - Node with "database:postgresql" = 2/3 = 0.67
226
+ # - Node with "database" = 1/3 = 0.33
274
227
  #
275
228
  # @param query [String] Search query
276
- # @param embedding_str [String] Sanitized embedding string
277
- # @param additional_sql_bare [String] Additional SQL conditions (no alias)
278
- # @param prefilter_limit [Integer] Candidates to consider
279
- # @param limit [Integer] Maximum results
280
- # @return [ActiveRecord::Result] Query results
229
+ # @param timeframe [nil, Range, Array<Range>] Time filter
230
+ # @param metadata [Hash] Metadata filter
231
+ # @param limit [Integer] Maximum candidates
232
+ # @return [Array<Hash>] Results with tag_depth_score
281
233
  #
282
- def search_hybrid_without_tags(query:, embedding_str:, additional_sql_bare:, prefilter_limit:, limit:)
283
- # No matching tags, fall back to standard hybrid (fulltext + vector)
284
- # Include nodes without embeddings with a default similarity score
285
- # Optimized: compute similarity once in CTE, reuse for combined_score
234
+ def fetch_tag_candidates(query:, timeframe:, metadata:, limit:)
235
+ # Extract tags from query using the existing tag extraction infrastructure
236
+ tag_extraction = find_query_matching_tags(query, include_extracted: true)
237
+ extracted_tags = tag_extraction[:extracted] || []
238
+ matched_db_tags = tag_extraction[:matched] || []
239
+
240
+ return [] if extracted_tags.empty? && matched_db_tags.empty?
241
+
242
+ # Build a map of tag prefixes to their max depth
243
+ # This allows us to score partial matches
244
+ tag_depth_map = build_tag_depth_map(extracted_tags)
245
+
246
+ # Use matched_db_tags if available, otherwise use extracted_tags
247
+ search_tags = matched_db_tags.any? ? matched_db_tags : extracted_tags
248
+
249
+ return [] if search_tags.empty?
250
+
251
+ # Build filter conditions
252
+ timeframe_condition = HTM::SqlBuilder.timeframe_condition(timeframe, table_alias: 'n')
253
+ metadata_condition = HTM::SqlBuilder.metadata_condition(metadata, table_alias: 'n')
254
+
255
+ additional_conditions = []
256
+ additional_conditions << timeframe_condition if timeframe_condition
257
+ additional_conditions << metadata_condition if metadata_condition
258
+ additional_sql = additional_conditions.any? ? "AND #{additional_conditions.join(' AND ')}" : ""
259
+
260
+ # Find nodes with matching tags
261
+ # Use Sequel's literal to safely quote tag names
262
+ tag_literals = search_tags.map { |tag| HTM.db.literal(tag) }.join(', ')
263
+
286
264
  sql = <<~SQL
287
- WITH candidates AS (
288
- SELECT id, content, access_count, created_at, token_count, embedding
289
- FROM nodes
290
- WHERE deleted_at IS NULL
291
- AND to_tsvector('english', content) @@ plainto_tsquery('english', ?)
292
- #{additional_sql_bare}
293
- LIMIT ?
294
- ),
295
- scored AS (
296
- SELECT id, content, access_count, created_at, token_count,
297
- CASE
298
- WHEN embedding IS NOT NULL THEN 1 - (embedding <=> ?::vector)
299
- ELSE 0.5
300
- END as similarity
301
- FROM candidates
302
- )
303
- SELECT id, content, access_count, created_at, token_count,
304
- similarity,
305
- 0.0 as tag_boost,
306
- similarity as combined_score
307
- FROM scored
308
- ORDER BY combined_score DESC
265
+ SELECT DISTINCT n.id, n.content, n.access_count, n.created_at, n.token_count,
266
+ array_agg(t.name) as matched_tags
267
+ FROM nodes n
268
+ JOIN node_tags nt ON nt.node_id = n.id
269
+ JOIN tags t ON t.id = nt.tag_id
270
+ WHERE n.deleted_at IS NULL
271
+ AND t.name IN (#{tag_literals})
272
+ #{additional_sql}
273
+ GROUP BY n.id, n.content, n.access_count, n.created_at, n.token_count
309
274
  LIMIT ?
310
275
  SQL
311
276
 
312
- ActiveRecord::Base.connection.select_all(
313
- ActiveRecord::Base.sanitize_sql_array([
314
- sql,
315
- query,
316
- prefilter_limit,
317
- embedding_str,
318
- limit
319
- ])
320
- )
277
+ results = HTM.db.fetch(sql, limit).all
278
+
279
+ # Calculate depth scores for each result
280
+ results.map do |result|
281
+ matched_tags = parse_pg_array(result[:matched_tags])
282
+ depth_score = calculate_tag_depth_score(matched_tags, tag_depth_map)
283
+
284
+ result.transform_keys(&:to_s).merge('tag_depth_score' => depth_score, 'matched_tags' => matched_tags)
285
+ end.sort_by { |r| -r['tag_depth_score'] }
286
+ end
287
+
288
# Index every hierarchy prefix of the extracted tags by its depth.
#
# A tag is a colon-separated path. Each leading sub-path becomes a key whose
# value records how many levels the sub-path has (:depth) and how many levels
# the full tag it came from has (:max_depth). For "database:postgresql:extensions":
#   "database"                       => { depth: 1, max_depth: 3 }
#   "database:postgresql"            => { depth: 2, max_depth: 3 }
#   "database:postgresql:extensions" => { depth: 3, max_depth: 3 }
#
# When the same prefix is produced by several tags, the entry coming from the
# deepest tag (largest :max_depth) is the one that is kept.
#
# @param extracted_tags [Array<String>] Tags extracted from query
# @return [Hash] Map of tag/prefix to depth info
#
def build_tag_depth_map(extracted_tags)
  extracted_tags.each_with_object({}) do |tag, depth_map|
    segments = tag.split(':')
    total_levels = segments.size

    segments.each_index do |last|
      prefix = segments[0..last].join(':')
      known = depth_map[prefix]
      # An entry from an equally deep or deeper tag is already recorded.
      next if known && known[:max_depth] >= total_levels

      depth_map[prefix] = { depth: last + 1, max_depth: total_levels }
    end
  end
end
317
+
318
# Calculate depth score for a node's matched tags
#
# Each matched tag is scored by how far down the query's tag hierarchy it
# reaches; the best single score is kept and a small bonus is added when the
# node matched more than one tag.
#
# - A tag that is a key of tag_depth_map scores depth / max_depth,
#   e.g. "database:postgresql" against query "database:postgresql:extensions"
#   gives 2/3 = 0.67.
# - A tag that is not a key but is a strict parent of some key
#   (key starts with "<tag>:") scores (levels in tag) / (that key's max_depth).
#
# @param matched_tags [Array<String>] Tags the node has that matched
# @param tag_depth_map [Hash] Map of tag/prefix to { depth:, max_depth: }
# @return [Float] Normalized score (0.0 to 1.0)
#
def calculate_tag_depth_score(matched_tags, tag_depth_map)
  return 0.0 if matched_tags.empty? || tag_depth_map.empty?

  best_score = 0.0

  matched_tags.each do |tag|
    if tag_depth_map.key?(tag)
      info = tag_depth_map[tag]
      score = info[:depth].to_f / info[:max_depth].to_f
      best_score = [best_score, score].max
      next
    end

    # The tag is not itself a key, so an exact-equality test against the keys
    # can never succeed here; only strict descendants need to be checked.
    # Both values are invariant across the scan, so compute them once.
    tag_levels = tag.split(':').size.to_f
    child_prefix = "#{tag}:"

    tag_depth_map.each do |prefix, info|
      next unless prefix.start_with?(child_prefix)

      score = tag_levels / info[:max_depth].to_f
      best_score = [best_score, score].max
    end
  end

  # Bonus for multiple tag matches (capped at 0.2 extra)
  multi_match_bonus = [(matched_tags.size - 1) * 0.05, 0.2].min

  [best_score + multi_match_bonus, 1.0].min
end
357
+
358
# Parse PostgreSQL array string to Ruby array
#
# Accepts either an already-materialised collection or the raw one-dimensional
# text form "{val1,val2,val3}". In the text form, elements may be wrapped in
# double quotes; inside quotes a comma is part of the element and a backslash
# escapes the next character (so \" and \\ are decoded). Splitting on every
# comma would break such elements apart, so the literal is scanned
# character by character instead.
#
# @param pg_array [String, Array, Sequel::Postgres::PGArray, nil] PostgreSQL array or Ruby array
# @return [Array<String>] Parsed array; [] for nil or empty input; a string
#   that is not wrapped in braces is returned as a single-element array
#
def parse_pg_array(pg_array)
  return [] if pg_array.nil?
  # Anything array-like (Array, or a wrapper exposing #to_a) is used as-is.
  return pg_array.to_a if !pg_array.is_a?(String) && pg_array.respond_to?(:to_a)
  return [] if pg_array.respond_to?(:empty?) && pg_array.empty?

  pg_str = pg_array.to_s
  return [pg_str] unless pg_str.start_with?('{') && pg_str.end_with?('}')

  body = pg_str[1..-2]
  return [] if body.nil? || body.empty?

  elements = []
  current = String.new
  in_quotes = false
  escaped = false

  body.each_char do |char|
    if escaped
      # Character following a backslash is taken literally.
      current << char
      escaped = false
    elsif char == '\\'
      escaped = true
    elsif char == '"'
      # Quotes delimit the element; they are not part of its value.
      in_quotes = !in_quotes
    elsif char == ',' && !in_quotes
      elements << current
      current = String.new
    else
      current << char
    end
  end

  elements << current
end
377
+
378
# Merge three result sets using Reciprocal Rank Fusion
#
# RRF score = Σ 1/(k + rank) for each list where the item appears, with
# k = RRF_K (a constant referenced here and defined outside this method)
# and rank being the 1-based position of the item in that list.
#
# Items appearing in multiple lists naturally get higher scores
# because they receive contributions from multiple ranks.
#
# Every merged record has the same keys: the node columns ('id', 'content',
# 'access_count', 'created_at', 'token_count') taken from the first list the
# node was seen in, the per-source scores ('similarity', 'text_rank',
# 'tag_depth_score', 'matched_tags'; 0.0 / [] when the source did not return
# the node), the per-source ranks ('vector_rank', 'fulltext_rank', 'tag_rank';
# nil when absent), 'rrf_score', and 'sources' (labels of contributing lists).
#
# A node contributes at most once per list: if an id is repeated inside one
# list, only its first (best-ranked) occurrence is counted.
#
# @param vector_results [Array<Hash>] Vector search results (ordered by similarity)
# @param fulltext_results [Array<Hash>] Fulltext search results (ordered by text_rank)
# @param tag_results [Array<Hash>] Tag search results (ordered by tag_depth_score)
# @return [Array<Hash>] Merged results sorted by RRF score
#
def merge_with_rrf(vector_results, fulltext_results, tag_results = [])
  # One row per source: label stored in 'sources', key receiving the rank,
  # the ranked list itself, and the score fields copied from its rows.
  ranked_sources = [
    ['vector',   'vector_rank',   vector_results,   %w[similarity]],
    ['fulltext', 'fulltext_rank', fulltext_results, %w[text_rank]],
    ['tags',     'tag_rank',      tag_results,      %w[tag_depth_score matched_tags]]
  ]

  # Key: node id, Value: merged record
  merged = {}

  ranked_sources.each do |label, rank_key, results, score_fields|
    results.each_with_index do |result, index|
      id = result['id']
      rank = index + 1 # 1-based rank

      entry = merged[id] ||= {
        'id' => id,
        'content' => result['content'],
        'access_count' => result['access_count'],
        'created_at' => result['created_at'],
        'token_count' => result['token_count'],
        'similarity' => 0.0,
        'text_rank' => 0.0,
        'tag_depth_score' => 0.0,
        'matched_tags' => [],
        'rrf_score' => 0.0,
        'vector_rank' => nil,
        'fulltext_rank' => nil,
        'tag_rank' => nil,
        'sources' => []
      }

      # Repeated id within the same list: the earlier, better rank already counted.
      next if entry['sources'].include?(label)

      entry['rrf_score'] += 1.0 / (RRF_K + rank)
      entry[rank_key] = rank
      entry['sources'] << label
      score_fields.each { |field| entry[field] = result[field] }
    end
  end

  # Sort by RRF score descending
  merged.values.sort_by { |entry| -entry['rrf_score'] }
end
322
489
  end
323
490
  end