htm 0.0.18 → 0.0.30
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +119 -1
- data/README.md +12 -0
- data/Rakefile +104 -18
- data/db/migrate/00001_enable_extensions.rb +9 -5
- data/db/migrate/00002_create_robots.rb +18 -6
- data/db/migrate/00003_create_file_sources.rb +30 -17
- data/db/migrate/00004_create_nodes.rb +60 -48
- data/db/migrate/00005_create_tags.rb +24 -12
- data/db/migrate/00006_create_node_tags.rb +28 -13
- data/db/migrate/00007_create_robot_nodes.rb +40 -26
- data/db/schema.sql +17 -1
- data/db/seeds.rb +34 -34
- data/docs/api/embedding-service.md +140 -110
- data/docs/api/yard/HTM/ActiveRecordConfig.md +6 -0
- data/docs/api/yard/HTM/Config.md +173 -0
- data/docs/api/yard/HTM/ConfigSection.md +28 -0
- data/docs/api/yard/HTM/Database.md +1 -1
- data/docs/api/yard/HTM/Railtie.md +2 -2
- data/docs/api/yard/HTM.md +0 -57
- data/docs/api/yard/index.csv +76 -61
- data/docs/api/yard-reference.md +2 -1
- data/docs/architecture/adrs/003-ollama-embeddings.md +45 -36
- data/docs/architecture/adrs/004-hive-mind.md +1 -1
- data/docs/architecture/adrs/008-robot-identification.md +1 -1
- data/docs/architecture/index.md +11 -9
- data/docs/architecture/overview.md +11 -7
- data/docs/assets/images/balanced-strategy-decay.svg +41 -0
- data/docs/assets/images/class-hierarchy.svg +1 -1
- data/docs/assets/images/eviction-priority.svg +43 -0
- data/docs/assets/images/exception-hierarchy.svg +2 -2
- data/docs/assets/images/hive-mind-shared-memory.svg +52 -0
- data/docs/assets/images/htm-architecture-overview.svg +3 -3
- data/docs/assets/images/htm-core-components.svg +4 -4
- data/docs/assets/images/htm-layered-architecture.svg +1 -1
- data/docs/assets/images/htm-memory-addition-flow.svg +2 -2
- data/docs/assets/images/htm-memory-recall-flow.svg +2 -2
- data/docs/assets/images/memory-topology.svg +53 -0
- data/docs/assets/images/two-tier-memory-architecture.svg +55 -0
- data/docs/database/naming-convention.md +244 -0
- data/docs/database_rake_tasks.md +31 -0
- data/docs/development/rake-tasks.md +80 -35
- data/docs/development/setup.md +76 -44
- data/docs/examples/basic-usage.md +133 -0
- data/docs/examples/config-files.md +170 -0
- data/docs/examples/file-loading.md +208 -0
- data/docs/examples/index.md +116 -0
- data/docs/examples/llm-configuration.md +168 -0
- data/docs/examples/mcp-client.md +172 -0
- data/docs/examples/rails-integration.md +173 -0
- data/docs/examples/robot-groups.md +210 -0
- data/docs/examples/sinatra-integration.md +218 -0
- data/docs/examples/standalone-app.md +216 -0
- data/docs/examples/telemetry.md +224 -0
- data/docs/examples/timeframes.md +143 -0
- data/docs/getting-started/installation.md +97 -40
- data/docs/getting-started/quick-start.md +28 -11
- data/docs/guides/configuration.md +515 -0
- data/docs/guides/file-loading.md +322 -0
- data/docs/guides/getting-started.md +40 -9
- data/docs/guides/index.md +3 -3
- data/docs/guides/mcp-server.md +100 -13
- data/docs/guides/propositions.md +264 -0
- data/docs/guides/recalling-memories.md +4 -4
- data/docs/guides/search-strategies.md +3 -3
- data/docs/guides/tags.md +318 -0
- data/docs/guides/telemetry.md +229 -0
- data/docs/index.md +8 -16
- data/docs/{architecture → robots}/hive-mind.md +8 -111
- data/docs/robots/index.md +73 -0
- data/docs/{guides → robots}/multi-robot.md +3 -3
- data/docs/{guides → robots}/robot-groups.md +8 -7
- data/docs/{architecture → robots}/two-tier-memory.md +13 -149
- data/docs/robots/why-robots.md +85 -0
- data/examples/.envrc +6 -0
- data/examples/.gitignore +2 -0
- data/examples/00_create_examples_db.rb +94 -0
- data/examples/{basic_usage.rb → 01_basic_usage.rb} +12 -16
- data/examples/{custom_llm_configuration.rb → 03_custom_llm_configuration.rb} +13 -3
- data/examples/{file_loader_usage.rb → 04_file_loader_usage.rb} +11 -14
- data/examples/{timeframe_demo.rb → 05_timeframe_demo.rb} +10 -3
- data/examples/{example_app → 06_example_app}/app.rb +15 -15
- data/examples/{cli_app → 07_cli_app}/htm_cli.rb +15 -22
- data/examples/08_sinatra_app/Gemfile.lock +241 -0
- data/examples/{sinatra_app → 08_sinatra_app}/app.rb +19 -18
- data/examples/{mcp_client.rb → 09_mcp_client.rb} +5 -8
- data/examples/{telemetry → 10_telemetry}/SETUP_README.md +1 -1
- data/examples/{telemetry → 10_telemetry}/demo.rb +14 -10
- data/examples/11_robot_groups/README.md +335 -0
- data/examples/{robot_groups → 11_robot_groups/lib}/robot_worker.rb +17 -3
- data/examples/{robot_groups → 11_robot_groups}/multi_process.rb +9 -9
- data/examples/{robot_groups → 11_robot_groups}/same_process.rb +9 -12
- data/examples/{rails_app → 12_rails_app}/Gemfile +3 -0
- data/examples/{rails_app → 12_rails_app}/Gemfile.lock +87 -58
- data/examples/{rails_app → 12_rails_app}/app/controllers/dashboard_controller.rb +10 -6
- data/examples/{rails_app → 12_rails_app}/app/controllers/files_controller.rb +5 -5
- data/examples/{rails_app → 12_rails_app}/app/controllers/memories_controller.rb +11 -7
- data/examples/{rails_app → 12_rails_app}/app/controllers/robots_controller.rb +8 -8
- data/examples/12_rails_app/app/controllers/tags_controller.rb +36 -0
- data/examples/{rails_app → 12_rails_app}/app/views/dashboard/index.html.erb +2 -2
- data/examples/{rails_app → 12_rails_app}/app/views/files/new.html.erb +5 -2
- data/examples/{rails_app → 12_rails_app}/app/views/memories/_memory_card.html.erb +3 -3
- data/examples/{rails_app → 12_rails_app}/app/views/memories/deleted.html.erb +3 -3
- data/examples/{rails_app → 12_rails_app}/app/views/memories/edit.html.erb +3 -3
- data/examples/{rails_app → 12_rails_app}/app/views/memories/show.html.erb +4 -4
- data/examples/{rails_app → 12_rails_app}/app/views/robots/index.html.erb +2 -2
- data/examples/{rails_app → 12_rails_app}/app/views/robots/show.html.erb +4 -4
- data/examples/{rails_app → 12_rails_app}/app/views/search/index.html.erb +1 -1
- data/examples/{rails_app → 12_rails_app}/app/views/tags/index.html.erb +2 -2
- data/examples/{rails_app → 12_rails_app}/app/views/tags/show.html.erb +1 -1
- data/examples/12_rails_app/config/initializers/htm.rb +7 -0
- data/examples/12_rails_app/config/initializers/rack.rb +5 -0
- data/examples/README.md +230 -211
- data/examples/examples_helper.rb +138 -0
- data/lib/htm/config/builder.rb +167 -0
- data/lib/htm/config/database.rb +317 -0
- data/lib/htm/config/defaults.yml +41 -13
- data/lib/htm/config/section.rb +74 -0
- data/lib/htm/config/validator.rb +83 -0
- data/lib/htm/config.rb +65 -361
- data/lib/htm/database.rb +85 -127
- data/lib/htm/errors.rb +14 -0
- data/lib/htm/integrations/sinatra.rb +13 -44
- data/lib/htm/job_adapter.rb +75 -1
- data/lib/htm/jobs/generate_embedding_job.rb +3 -4
- data/lib/htm/jobs/generate_propositions_job.rb +4 -5
- data/lib/htm/jobs/generate_tags_job.rb +16 -15
- data/lib/htm/loaders/defaults_loader.rb +23 -0
- data/lib/htm/loaders/markdown_loader.rb +17 -15
- data/lib/htm/loaders/xdg_config_loader.rb +9 -9
- data/lib/htm/long_term_memory/fulltext_search.rb +14 -14
- data/lib/htm/long_term_memory/hybrid_search.rb +396 -229
- data/lib/htm/long_term_memory/node_operations.rb +24 -23
- data/lib/htm/long_term_memory/relevance_scorer.rb +23 -20
- data/lib/htm/long_term_memory/robot_operations.rb +4 -4
- data/lib/htm/long_term_memory/tag_operations.rb +91 -77
- data/lib/htm/long_term_memory/vector_search.rb +4 -5
- data/lib/htm/long_term_memory.rb +13 -13
- data/lib/htm/mcp/cli.rb +115 -8
- data/lib/htm/mcp/resources.rb +4 -3
- data/lib/htm/mcp/server.rb +5 -4
- data/lib/htm/mcp/tools.rb +37 -28
- data/lib/htm/migration.rb +72 -0
- data/lib/htm/models/file_source.rb +52 -31
- data/lib/htm/models/node.rb +224 -108
- data/lib/htm/models/node_tag.rb +49 -28
- data/lib/htm/models/robot.rb +38 -27
- data/lib/htm/models/robot_node.rb +63 -35
- data/lib/htm/models/tag.rb +126 -123
- data/lib/htm/observability.rb +45 -41
- data/lib/htm/proposition_service.rb +76 -7
- data/lib/htm/railtie.rb +2 -2
- data/lib/htm/robot_group.rb +30 -18
- data/lib/htm/sequel_config.rb +215 -0
- data/lib/htm/sql_builder.rb +14 -16
- data/lib/htm/tag_service.rb +78 -0
- data/lib/htm/tasks.rb +3 -0
- data/lib/htm/version.rb +1 -1
- data/lib/htm/workflows/remember_workflow.rb +213 -0
- data/lib/htm.rb +27 -22
- data/lib/tasks/db.rake +0 -2
- data/lib/tasks/doc.rake +2 -2
- data/lib/tasks/files.rake +11 -18
- data/lib/tasks/htm.rake +190 -62
- data/lib/tasks/jobs.rake +179 -54
- data/lib/tasks/tags.rake +8 -13
- data/mkdocs.yml +33 -8
- data/scripts/backfill_parent_tags.rb +376 -0
- data/scripts/normalize_plural_tags.rb +335 -0
- metadata +168 -86
- data/docs/api/yard/HTM/Configuration.md +0 -240
- data/docs/telemetry.md +0 -391
- data/examples/rails_app/app/controllers/tags_controller.rb +0 -30
- data/examples/sinatra_app/Gemfile.lock +0 -166
- data/lib/htm/active_record_config.rb +0 -104
- /data/examples/{config_file_example → 02_config_file_example}/README.md +0 -0
- /data/examples/{config_file_example → 02_config_file_example}/config/htm.local.yml +0 -0
- /data/examples/{config_file_example → 02_config_file_example}/custom_config.yml +0 -0
- /data/examples/{config_file_example → 02_config_file_example}/show_config.rb +0 -0
- /data/examples/{example_app → 06_example_app}/Rakefile +0 -0
- /data/examples/{cli_app → 07_cli_app}/README.md +0 -0
- /data/examples/{sinatra_app → 08_sinatra_app}/Gemfile +0 -0
- /data/examples/{telemetry → 10_telemetry}/README.md +0 -0
- /data/examples/{telemetry → 10_telemetry}/grafana/dashboards/htm-metrics.json +0 -0
- /data/examples/{rails_app → 12_rails_app}/.gitignore +0 -0
- /data/examples/{rails_app → 12_rails_app}/Procfile.dev +0 -0
- /data/examples/{rails_app → 12_rails_app}/README.md +0 -0
- /data/examples/{rails_app → 12_rails_app}/Rakefile +0 -0
- /data/examples/{rails_app → 12_rails_app}/app/assets/stylesheets/application.css +0 -0
- /data/examples/{rails_app → 12_rails_app}/app/assets/stylesheets/inter-font.css +0 -0
- /data/examples/{rails_app → 12_rails_app}/app/controllers/application_controller.rb +0 -0
- /data/examples/{rails_app → 12_rails_app}/app/controllers/search_controller.rb +0 -0
- /data/examples/{rails_app → 12_rails_app}/app/javascript/application.js +0 -0
- /data/examples/{rails_app → 12_rails_app}/app/javascript/controllers/application.js +0 -0
- /data/examples/{rails_app → 12_rails_app}/app/javascript/controllers/index.js +0 -0
- /data/examples/{rails_app → 12_rails_app}/app/views/files/index.html.erb +0 -0
- /data/examples/{rails_app → 12_rails_app}/app/views/files/show.html.erb +0 -0
- /data/examples/{rails_app → 12_rails_app}/app/views/layouts/application.html.erb +0 -0
- /data/examples/{rails_app → 12_rails_app}/app/views/memories/index.html.erb +0 -0
- /data/examples/{rails_app → 12_rails_app}/app/views/memories/new.html.erb +0 -0
- /data/examples/{rails_app → 12_rails_app}/app/views/robots/new.html.erb +0 -0
- /data/examples/{rails_app → 12_rails_app}/app/views/shared/_navbar.html.erb +0 -0
- /data/examples/{rails_app → 12_rails_app}/app/views/shared/_stat_card.html.erb +0 -0
- /data/examples/{rails_app → 12_rails_app}/bin/dev +0 -0
- /data/examples/{rails_app → 12_rails_app}/bin/rails +0 -0
- /data/examples/{rails_app → 12_rails_app}/bin/rake +0 -0
- /data/examples/{rails_app → 12_rails_app}/config/application.rb +0 -0
- /data/examples/{rails_app → 12_rails_app}/config/boot.rb +0 -0
- /data/examples/{rails_app → 12_rails_app}/config/database.yml +0 -0
- /data/examples/{rails_app → 12_rails_app}/config/environment.rb +0 -0
- /data/examples/{rails_app → 12_rails_app}/config/importmap.rb +0 -0
- /data/examples/{rails_app → 12_rails_app}/config/routes.rb +0 -0
- /data/examples/{rails_app → 12_rails_app}/config/tailwind.config.js +0 -0
- /data/examples/{rails_app → 12_rails_app}/config.ru +0 -0
- /data/examples/{rails_app → 12_rails_app}/log/.keep +0 -0
- /data/examples/{rails_app → 12_rails_app}/tmp/local_secret.txt +0 -0
|
@@ -2,16 +2,20 @@
|
|
|
2
2
|
|
|
3
3
|
class HTM
|
|
4
4
|
class LongTermMemory
|
|
5
|
-
# Hybrid search
|
|
5
|
+
# Hybrid search using Reciprocal Rank Fusion (RRF)
|
|
6
6
|
#
|
|
7
|
-
# Performs
|
|
8
|
-
# 1.
|
|
9
|
-
# 2.
|
|
10
|
-
# 3.
|
|
7
|
+
# Performs three independent searches and merges results:
|
|
8
|
+
# 1. Vector similarity search for semantic matching
|
|
9
|
+
# 2. Full-text search for keyword matching
|
|
10
|
+
# 3. Tag-based search for hierarchical category matching
|
|
11
11
|
#
|
|
12
|
-
#
|
|
13
|
-
#
|
|
14
|
-
#
|
|
12
|
+
# Results are merged using RRF scoring. Nodes appearing in multiple
|
|
13
|
+
# searches receive boosted scores, making them rank higher.
|
|
14
|
+
#
|
|
15
|
+
# Tag scoring uses hierarchical depth matching - the more levels of a
|
|
16
|
+
# tag hierarchy that match, the higher the score contribution.
|
|
17
|
+
#
|
|
18
|
+
# RRF Formula: score = Σ 1/(k + rank) for each search where node appears
|
|
15
19
|
#
|
|
16
20
|
# Results are cached for performance.
|
|
17
21
|
#
|
|
@@ -20,31 +24,38 @@ class HTM
|
|
|
20
24
|
module HybridSearch
|
|
21
25
|
# Maximum results to prevent DoS via unbounded queries
|
|
22
26
|
MAX_HYBRID_LIMIT = 1000
|
|
23
|
-
MAX_PREFILTER_LIMIT = 5000
|
|
24
27
|
|
|
25
|
-
#
|
|
28
|
+
# RRF constant - higher values reduce the impact of rank differences
|
|
29
|
+
# 60 is the standard value from the original RRF paper
|
|
30
|
+
RRF_K = 60
|
|
31
|
+
|
|
32
|
+
# Multiplier for candidates from each search
|
|
33
|
+
# We fetch more candidates than requested to ensure good fusion
|
|
34
|
+
CANDIDATE_MULTIPLIER = 3
|
|
35
|
+
|
|
36
|
+
# Hybrid search using Reciprocal Rank Fusion
|
|
26
37
|
#
|
|
27
38
|
# @param timeframe [Range] Time range to search
|
|
28
39
|
# @param query [String] Search query
|
|
29
40
|
# @param limit [Integer] Maximum results (capped at MAX_HYBRID_LIMIT)
|
|
30
41
|
# @param embedding_service [Object] Service to generate embeddings
|
|
31
|
-
# @param prefilter_limit [Integer] Candidates
|
|
42
|
+
# @param prefilter_limit [Integer] Candidates per search (default: 100)
|
|
32
43
|
# @param metadata [Hash] Filter by metadata fields (default: {})
|
|
33
44
|
# @return [Array<Hash>] Matching nodes
|
|
34
45
|
#
|
|
35
46
|
def search_hybrid(timeframe:, query:, limit:, embedding_service:, prefilter_limit: 100, metadata: {})
|
|
36
47
|
# Enforce limits to prevent DoS
|
|
37
48
|
safe_limit = [[limit.to_i, 1].max, MAX_HYBRID_LIMIT].min
|
|
38
|
-
safe_prefilter = [
|
|
49
|
+
safe_prefilter = [prefilter_limit.to_i, 1].max
|
|
39
50
|
|
|
40
51
|
start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
41
52
|
result = @cache.fetch(:hybrid, timeframe, query, safe_limit, safe_prefilter, metadata) do
|
|
42
|
-
|
|
53
|
+
search_hybrid_rrf(
|
|
43
54
|
timeframe: timeframe,
|
|
44
55
|
query: query,
|
|
45
56
|
limit: safe_limit,
|
|
46
57
|
embedding_service: embedding_service,
|
|
47
|
-
|
|
58
|
+
candidate_limit: safe_prefilter * CANDIDATE_MULTIPLIER,
|
|
48
59
|
metadata: metadata
|
|
49
60
|
)
|
|
50
61
|
end
|
|
@@ -55,269 +66,425 @@ class HTM
|
|
|
55
66
|
|
|
56
67
|
private
|
|
57
68
|
|
|
58
|
-
#
|
|
59
|
-
# If fulltext returns >= this ratio of requested results, skip expensive tag extraction
|
|
60
|
-
TAG_EXTRACTION_THRESHOLD = 0.5
|
|
61
|
-
|
|
62
|
-
# Uncached hybrid search
|
|
69
|
+
# Hybrid search using Reciprocal Rank Fusion
|
|
63
70
|
#
|
|
64
|
-
#
|
|
65
|
-
#
|
|
66
|
-
#
|
|
67
|
-
# 3. Vector similarity for semantic ranking
|
|
71
|
+
# Runs vector, fulltext, and tag searches independently, then merges
|
|
72
|
+
# results using RRF scoring. Nodes appearing in multiple searches
|
|
73
|
+
# get contributions from each, naturally boosting them.
|
|
68
74
|
#
|
|
69
|
-
# @param timeframe [nil, Range, Array<Range>] Time range(s) to search
|
|
75
|
+
# @param timeframe [nil, Range, Array<Range>] Time range(s) to search
|
|
70
76
|
# @param query [String] Search query
|
|
71
77
|
# @param limit [Integer] Maximum results
|
|
72
78
|
# @param embedding_service [Object] Service to generate query embedding
|
|
73
|
-
# @param
|
|
74
|
-
# @param metadata [Hash] Filter by metadata fields
|
|
75
|
-
# @return [Array<Hash>]
|
|
79
|
+
# @param candidate_limit [Integer] Candidates to fetch from each search
|
|
80
|
+
# @param metadata [Hash] Filter by metadata fields
|
|
81
|
+
# @return [Array<Hash>] Merged results with RRF scores
|
|
76
82
|
#
|
|
77
|
-
def
|
|
78
|
-
#
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
# Pad embedding to 2000 dimensions if needed
|
|
88
|
-
padded_embedding = HTM::SqlBuilder.pad_embedding(query_embedding)
|
|
89
|
-
|
|
90
|
-
# Sanitize embedding for safe SQL use (validates all values are numeric)
|
|
91
|
-
embedding_str = HTM::SqlBuilder.sanitize_embedding(padded_embedding)
|
|
92
|
-
|
|
93
|
-
# Build filter conditions (with table alias for CTEs)
|
|
94
|
-
timeframe_condition = HTM::SqlBuilder.timeframe_condition(timeframe, table_alias: 'n')
|
|
95
|
-
metadata_condition = HTM::SqlBuilder.metadata_condition(metadata, table_alias: 'n')
|
|
96
|
-
|
|
97
|
-
additional_conditions = []
|
|
98
|
-
additional_conditions << timeframe_condition if timeframe_condition
|
|
99
|
-
additional_conditions << metadata_condition if metadata_condition
|
|
100
|
-
additional_sql = additional_conditions.any? ? "AND #{additional_conditions.join(' AND ')}" : ""
|
|
101
|
-
|
|
102
|
-
# Same for non-aliased queries
|
|
103
|
-
timeframe_condition_bare = HTM::SqlBuilder.timeframe_condition(timeframe)
|
|
104
|
-
metadata_condition_bare = HTM::SqlBuilder.metadata_condition(metadata)
|
|
83
|
+
def search_hybrid_rrf(timeframe:, query:, limit:, embedding_service:, candidate_limit:, metadata: {})
|
|
84
|
+
# Run all three searches independently
|
|
85
|
+
vector_results = fetch_vector_candidates(
|
|
86
|
+
query: query,
|
|
87
|
+
embedding_service: embedding_service,
|
|
88
|
+
timeframe: timeframe,
|
|
89
|
+
metadata: metadata,
|
|
90
|
+
limit: candidate_limit
|
|
91
|
+
)
|
|
105
92
|
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
93
|
+
fulltext_results = fetch_fulltext_candidates(
|
|
94
|
+
query: query,
|
|
95
|
+
timeframe: timeframe,
|
|
96
|
+
metadata: metadata,
|
|
97
|
+
limit: candidate_limit
|
|
98
|
+
)
|
|
110
99
|
|
|
111
|
-
#
|
|
112
|
-
|
|
113
|
-
# This skips the expensive LLM call (~500-3000ms) when fulltext alone
|
|
114
|
-
# provides enough results.
|
|
115
|
-
fulltext_count = count_fulltext_matches(
|
|
100
|
+
# Extract tags from query and find matching nodes
|
|
101
|
+
tag_results = fetch_tag_candidates(
|
|
116
102
|
query: query,
|
|
117
|
-
|
|
118
|
-
|
|
103
|
+
timeframe: timeframe,
|
|
104
|
+
metadata: metadata,
|
|
105
|
+
limit: candidate_limit
|
|
119
106
|
)
|
|
120
107
|
|
|
121
|
-
#
|
|
122
|
-
|
|
123
|
-
find_query_matching_tags(query)
|
|
124
|
-
else
|
|
125
|
-
[]
|
|
126
|
-
end
|
|
108
|
+
# Merge using RRF
|
|
109
|
+
merged = merge_with_rrf(vector_results, fulltext_results, tag_results)
|
|
127
110
|
|
|
128
|
-
#
|
|
129
|
-
|
|
130
|
-
# similarity score of 0.5. This allows newly created nodes to appear in
|
|
131
|
-
# search results immediately (via fulltext matching) before their embeddings
|
|
132
|
-
# are generated by background jobs.
|
|
133
|
-
|
|
134
|
-
result = if matching_tags.any?
|
|
135
|
-
search_hybrid_with_tags(
|
|
136
|
-
query: query,
|
|
137
|
-
embedding_str: embedding_str,
|
|
138
|
-
matching_tags: matching_tags,
|
|
139
|
-
additional_sql: additional_sql,
|
|
140
|
-
prefilter_limit: prefilter_limit,
|
|
141
|
-
limit: limit
|
|
142
|
-
)
|
|
143
|
-
else
|
|
144
|
-
search_hybrid_without_tags(
|
|
145
|
-
query: query,
|
|
146
|
-
embedding_str: embedding_str,
|
|
147
|
-
additional_sql_bare: additional_sql_bare,
|
|
148
|
-
prefilter_limit: prefilter_limit,
|
|
149
|
-
limit: limit
|
|
150
|
-
)
|
|
151
|
-
end
|
|
111
|
+
# Take top results
|
|
112
|
+
top_results = merged.first(limit)
|
|
152
113
|
|
|
153
114
|
# Track access for retrieved nodes
|
|
154
|
-
node_ids =
|
|
115
|
+
node_ids = top_results.map { |r| r['id'] }
|
|
155
116
|
track_access(node_ids)
|
|
156
117
|
|
|
157
|
-
|
|
118
|
+
top_results
|
|
158
119
|
end
|
|
159
120
|
|
|
160
|
-
#
|
|
121
|
+
# Fetch candidates using vector similarity search
|
|
161
122
|
#
|
|
162
123
|
# @param query [String] Search query
|
|
163
|
-
# @param
|
|
164
|
-
# @param
|
|
165
|
-
# @
|
|
124
|
+
# @param embedding_service [Object] Service to generate embeddings
|
|
125
|
+
# @param timeframe [nil, Range, Array<Range>] Time filter
|
|
126
|
+
# @param metadata [Hash] Metadata filter
|
|
127
|
+
# @param limit [Integer] Maximum candidates
|
|
128
|
+
# @return [Array<Hash>] Results with similarity scores
|
|
166
129
|
#
|
|
167
|
-
def
|
|
130
|
+
def fetch_vector_candidates(query:, embedding_service:, timeframe:, metadata:, limit:)
|
|
131
|
+
# Generate query embedding
|
|
132
|
+
query_embedding = embedding_service.embed(query)
|
|
133
|
+
|
|
134
|
+
unless query_embedding.is_a?(Array) && query_embedding.any?
|
|
135
|
+
HTM.logger.error("Invalid embedding returned from embedding service")
|
|
136
|
+
return []
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
padded_embedding = HTM::SqlBuilder.pad_embedding(query_embedding)
|
|
140
|
+
embedding_str = HTM::SqlBuilder.sanitize_embedding(padded_embedding)
|
|
141
|
+
|
|
142
|
+
# Build filter conditions
|
|
143
|
+
timeframe_condition = HTM::SqlBuilder.timeframe_condition(timeframe)
|
|
144
|
+
metadata_condition = HTM::SqlBuilder.metadata_condition(metadata)
|
|
145
|
+
|
|
146
|
+
conditions = ["embedding IS NOT NULL", "deleted_at IS NULL"]
|
|
147
|
+
conditions << timeframe_condition if timeframe_condition
|
|
148
|
+
conditions << metadata_condition if metadata_condition
|
|
149
|
+
|
|
150
|
+
where_clause = "WHERE #{conditions.join(' AND ')}"
|
|
151
|
+
|
|
152
|
+
# Note: Using Sequel.lit for the vector comparison since it needs special handling
|
|
153
|
+
embedding_literal = HTM.db.literal(embedding_str)
|
|
168
154
|
sql = <<~SQL
|
|
169
|
-
SELECT
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
) AS limited_count
|
|
155
|
+
SELECT id, content, access_count, created_at, token_count,
|
|
156
|
+
1 - (embedding <=> #{embedding_literal}::vector) as similarity
|
|
157
|
+
FROM nodes
|
|
158
|
+
#{where_clause}
|
|
159
|
+
ORDER BY embedding <=> #{embedding_literal}::vector
|
|
160
|
+
LIMIT ?
|
|
176
161
|
SQL
|
|
177
162
|
|
|
178
|
-
|
|
179
|
-
ActiveRecord::Base.sanitize_sql_array([sql, query, limit])
|
|
180
|
-
)
|
|
181
|
-
result.to_i
|
|
163
|
+
HTM.db.fetch(sql, limit).all.map { |r| r.transform_keys(&:to_s) }
|
|
182
164
|
end
|
|
183
165
|
|
|
184
|
-
#
|
|
185
|
-
#
|
|
186
|
-
# Uses parameterized queries and LEFT JOIN for efficient tag boosting.
|
|
166
|
+
# Fetch candidates using full-text search
|
|
187
167
|
#
|
|
188
168
|
# @param query [String] Search query
|
|
189
|
-
# @param
|
|
190
|
-
# @param
|
|
191
|
-
# @param
|
|
192
|
-
# @
|
|
193
|
-
# @param limit [Integer] Maximum results
|
|
194
|
-
# @return [ActiveRecord::Result] Query results
|
|
169
|
+
# @param timeframe [nil, Range, Array<Range>] Time filter
|
|
170
|
+
# @param metadata [Hash] Metadata filter
|
|
171
|
+
# @param limit [Integer] Maximum candidates
|
|
172
|
+
# @return [Array<Hash>] Results with text rank scores
|
|
195
173
|
#
|
|
196
|
-
def
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
174
|
+
def fetch_fulltext_candidates(query:, timeframe:, metadata:, limit:)
|
|
175
|
+
timeframe_condition = HTM::SqlBuilder.timeframe_condition(timeframe)
|
|
176
|
+
metadata_condition = HTM::SqlBuilder.metadata_condition(metadata)
|
|
177
|
+
|
|
178
|
+
additional_conditions = []
|
|
179
|
+
additional_conditions << timeframe_condition if timeframe_condition
|
|
180
|
+
additional_conditions << metadata_condition if metadata_condition
|
|
181
|
+
additional_sql = additional_conditions.any? ? "AND #{additional_conditions.join(' AND ')}" : ""
|
|
200
182
|
|
|
201
|
-
#
|
|
202
|
-
#
|
|
183
|
+
# Combined tsvector + trigram search (same as fulltext_search.rb)
|
|
184
|
+
# Escape the query for safe interpolation in trigram comparisons
|
|
185
|
+
query_literal = HTM.db.literal(query)
|
|
203
186
|
sql = <<~SQL
|
|
204
|
-
WITH
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
FROM nodes
|
|
208
|
-
WHERE
|
|
209
|
-
AND to_tsvector('english',
|
|
187
|
+
WITH tsvector_matches AS (
|
|
188
|
+
SELECT id, content, access_count, created_at, token_count,
|
|
189
|
+
(1.0 + ts_rank(to_tsvector('english', content), plainto_tsquery('english', #{query_literal}))) as text_rank
|
|
190
|
+
FROM nodes
|
|
191
|
+
WHERE deleted_at IS NULL
|
|
192
|
+
AND to_tsvector('english', content) @@ plainto_tsquery('english', #{query_literal})
|
|
210
193
|
#{additional_sql}
|
|
211
|
-
LIMIT ?
|
|
212
194
|
),
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
FROM nodes
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
AND t.name IN (#{tag_placeholders})
|
|
195
|
+
trigram_matches AS (
|
|
196
|
+
SELECT id, content, access_count, created_at, token_count,
|
|
197
|
+
similarity(content, #{query_literal}) as text_rank
|
|
198
|
+
FROM nodes
|
|
199
|
+
WHERE deleted_at IS NULL
|
|
200
|
+
AND similarity(content, #{query_literal}) >= 0.1
|
|
201
|
+
AND id NOT IN (SELECT id FROM tsvector_matches)
|
|
221
202
|
#{additional_sql}
|
|
222
|
-
LIMIT ?
|
|
223
|
-
),
|
|
224
|
-
all_candidates AS (
|
|
225
|
-
SELECT * FROM fulltext_candidates
|
|
226
|
-
UNION
|
|
227
|
-
SELECT * FROM tag_candidates
|
|
228
|
-
),
|
|
229
|
-
tag_counts AS (
|
|
230
|
-
-- Pre-compute tag counts using JOIN instead of correlated subquery
|
|
231
|
-
SELECT nt.node_id, COUNT(DISTINCT t.name)::float AS matched_tags
|
|
232
|
-
FROM node_tags nt
|
|
233
|
-
JOIN tags t ON t.id = nt.tag_id
|
|
234
|
-
WHERE t.name IN (#{tag_placeholders})
|
|
235
|
-
GROUP BY nt.node_id
|
|
236
203
|
),
|
|
237
|
-
|
|
238
|
-
SELECT
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
WHEN ac.embedding IS NOT NULL THEN 1 - (ac.embedding <=> ?::vector)
|
|
242
|
-
ELSE 0.5
|
|
243
|
-
END as similarity,
|
|
244
|
-
COALESCE(tc.matched_tags / ?, 0) as tag_boost
|
|
245
|
-
FROM all_candidates ac
|
|
246
|
-
LEFT JOIN tag_counts tc ON tc.node_id = ac.id
|
|
204
|
+
combined AS (
|
|
205
|
+
SELECT * FROM tsvector_matches
|
|
206
|
+
UNION ALL
|
|
207
|
+
SELECT * FROM trigram_matches
|
|
247
208
|
)
|
|
248
|
-
SELECT id, content, access_count, created_at, token_count,
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
FROM scored
|
|
252
|
-
ORDER BY combined_score DESC
|
|
209
|
+
SELECT id, content, access_count, created_at, token_count, text_rank
|
|
210
|
+
FROM combined
|
|
211
|
+
ORDER BY text_rank DESC
|
|
253
212
|
LIMIT ?
|
|
254
213
|
SQL
|
|
255
214
|
|
|
256
|
-
|
|
257
|
-
params = [
|
|
258
|
-
query,
|
|
259
|
-
prefilter_limit,
|
|
260
|
-
*matching_tags,
|
|
261
|
-
prefilter_limit,
|
|
262
|
-
*matching_tags,
|
|
263
|
-
embedding_str,
|
|
264
|
-
tag_count,
|
|
265
|
-
limit
|
|
266
|
-
]
|
|
267
|
-
|
|
268
|
-
ActiveRecord::Base.connection.select_all(
|
|
269
|
-
ActiveRecord::Base.sanitize_sql_array([sql, *params])
|
|
270
|
-
)
|
|
215
|
+
HTM.db.fetch(sql, limit).all.map { |r| r.transform_keys(&:to_s) }
|
|
271
216
|
end
|
|
272
217
|
|
|
273
|
-
#
|
|
218
|
+
# Fetch candidates using tag-based search with hierarchical scoring
|
|
219
|
+
#
|
|
220
|
+
# Extracts tags from the query, finds nodes with matching tags,
|
|
221
|
+
# and scores based on hierarchical depth match.
|
|
222
|
+
#
|
|
223
|
+
# Scoring: For a query tag "database:postgresql:extensions" (3 levels):
|
|
224
|
+
# - Node with "database:postgresql:extensions" = 3/3 = 1.0
|
|
225
|
+
# - Node with "database:postgresql" = 2/3 = 0.67
|
|
226
|
+
# - Node with "database" = 1/3 = 0.33
|
|
274
227
|
#
|
|
275
228
|
# @param query [String] Search query
|
|
276
|
-
# @param
|
|
277
|
-
# @param
|
|
278
|
-
# @param
|
|
279
|
-
# @
|
|
280
|
-
# @return [ActiveRecord::Result] Query results
|
|
229
|
+
# @param timeframe [nil, Range, Array<Range>] Time filter
|
|
230
|
+
# @param metadata [Hash] Metadata filter
|
|
231
|
+
# @param limit [Integer] Maximum candidates
|
|
232
|
+
# @return [Array<Hash>] Results with tag_depth_score
|
|
281
233
|
#
|
|
282
|
-
def
|
|
283
|
-
#
|
|
284
|
-
|
|
285
|
-
|
|
234
|
+
def fetch_tag_candidates(query:, timeframe:, metadata:, limit:)
|
|
235
|
+
# Extract tags from query using the existing tag extraction infrastructure
|
|
236
|
+
tag_extraction = find_query_matching_tags(query, include_extracted: true)
|
|
237
|
+
extracted_tags = tag_extraction[:extracted] || []
|
|
238
|
+
matched_db_tags = tag_extraction[:matched] || []
|
|
239
|
+
|
|
240
|
+
return [] if extracted_tags.empty? && matched_db_tags.empty?
|
|
241
|
+
|
|
242
|
+
# Build a map of tag prefixes to their max depth
|
|
243
|
+
# This allows us to score partial matches
|
|
244
|
+
tag_depth_map = build_tag_depth_map(extracted_tags)
|
|
245
|
+
|
|
246
|
+
# Use matched_db_tags if available, otherwise use extracted_tags
|
|
247
|
+
search_tags = matched_db_tags.any? ? matched_db_tags : extracted_tags
|
|
248
|
+
|
|
249
|
+
return [] if search_tags.empty?
|
|
250
|
+
|
|
251
|
+
# Build filter conditions
|
|
252
|
+
timeframe_condition = HTM::SqlBuilder.timeframe_condition(timeframe, table_alias: 'n')
|
|
253
|
+
metadata_condition = HTM::SqlBuilder.metadata_condition(metadata, table_alias: 'n')
|
|
254
|
+
|
|
255
|
+
additional_conditions = []
|
|
256
|
+
additional_conditions << timeframe_condition if timeframe_condition
|
|
257
|
+
additional_conditions << metadata_condition if metadata_condition
|
|
258
|
+
additional_sql = additional_conditions.any? ? "AND #{additional_conditions.join(' AND ')}" : ""
|
|
259
|
+
|
|
260
|
+
# Find nodes with matching tags
|
|
261
|
+
# Use Sequel's literal to safely quote tag names
|
|
262
|
+
tag_literals = search_tags.map { |tag| HTM.db.literal(tag) }.join(', ')
|
|
263
|
+
|
|
286
264
|
sql = <<~SQL
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
SELECT id, content, access_count, created_at, token_count,
|
|
297
|
-
CASE
|
|
298
|
-
WHEN embedding IS NOT NULL THEN 1 - (embedding <=> ?::vector)
|
|
299
|
-
ELSE 0.5
|
|
300
|
-
END as similarity
|
|
301
|
-
FROM candidates
|
|
302
|
-
)
|
|
303
|
-
SELECT id, content, access_count, created_at, token_count,
|
|
304
|
-
similarity,
|
|
305
|
-
0.0 as tag_boost,
|
|
306
|
-
similarity as combined_score
|
|
307
|
-
FROM scored
|
|
308
|
-
ORDER BY combined_score DESC
|
|
265
|
+
SELECT DISTINCT n.id, n.content, n.access_count, n.created_at, n.token_count,
|
|
266
|
+
array_agg(t.name) as matched_tags
|
|
267
|
+
FROM nodes n
|
|
268
|
+
JOIN node_tags nt ON nt.node_id = n.id
|
|
269
|
+
JOIN tags t ON t.id = nt.tag_id
|
|
270
|
+
WHERE n.deleted_at IS NULL
|
|
271
|
+
AND t.name IN (#{tag_literals})
|
|
272
|
+
#{additional_sql}
|
|
273
|
+
GROUP BY n.id, n.content, n.access_count, n.created_at, n.token_count
|
|
309
274
|
LIMIT ?
|
|
310
275
|
SQL
|
|
311
276
|
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
277
|
+
results = HTM.db.fetch(sql, limit).all
|
|
278
|
+
|
|
279
|
+
# Calculate depth scores for each result
|
|
280
|
+
results.map do |result|
|
|
281
|
+
matched_tags = parse_pg_array(result[:matched_tags])
|
|
282
|
+
depth_score = calculate_tag_depth_score(matched_tags, tag_depth_map)
|
|
283
|
+
|
|
284
|
+
result.transform_keys(&:to_s).merge('tag_depth_score' => depth_score, 'matched_tags' => matched_tags)
|
|
285
|
+
end.sort_by { |r| -r['tag_depth_score'] }
|
|
286
|
+
end
|
|
287
|
+
|
|
288
|
+
# Build a map of tag prefixes to their depth information
|
|
289
|
+
#
|
|
290
|
+
# For tag "database:postgresql:extensions":
|
|
291
|
+
# - "database" => { depth: 1, max_depth: 3 }
|
|
292
|
+
# - "database:postgresql" => { depth: 2, max_depth: 3 }
|
|
293
|
+
# - "database:postgresql:extensions" => { depth: 3, max_depth: 3 }
|
|
294
|
+
#
|
|
295
|
+
# @param extracted_tags [Array<String>] Tags extracted from query
|
|
296
|
+
# @return [Hash] Map of tag/prefix to depth info
|
|
297
|
+
#
|
|
298
|
+
def build_tag_depth_map(extracted_tags)
  # Fold every query tag into one hash keyed by each of its colon-delimited
  # prefixes ("a", "a:b", "a:b:c", ...).
  extracted_tags.each_with_object({}) do |tag, prefix_map|
    segments = tag.split(':')
    total_levels = segments.size

    segments.each_index do |idx|
      level = idx + 1
      key = segments.first(level).join(':')
      existing = prefix_map[key]

      # A prefix shared by several query tags keeps the entry coming from
      # the deepest tag; equal or shallower tags never overwrite it.
      next if existing && existing[:max_depth] >= total_levels

      prefix_map[key] = { depth: level, max_depth: total_levels }
    end
  end
end
|
|
317
|
+
|
|
318
|
+
# Calculate depth score for a node's matched tags
|
|
319
|
+
#
|
|
320
|
+
# The score is based on how deeply the matched tags align with
|
|
321
|
+
# the extracted query tags. More levels matched = higher score.
|
|
322
|
+
#
|
|
323
|
+
# @param matched_tags [Array<String>] Tags the node has that matched
|
|
324
|
+
# @param tag_depth_map [Hash] Map of tag/prefix to depth info
|
|
325
|
+
# @return [Float] Normalized score (0.0 to 1.0)
|
|
326
|
+
#
|
|
327
|
+
def calculate_tag_depth_score(matched_tags, tag_depth_map)
  # Purpose: score how deeply a node's matched tags align with the tag
  # hierarchy extracted from the query.
  #
  # matched_tags  - Array<String> of tags the node has that matched.
  # tag_depth_map - Hash of prefix => { depth:, max_depth: }.
  # Returns a Float in 0.0..1.0: the best depth ratio found across the
  # matched tags plus a small bonus for multiple matches.
  return 0.0 if matched_tags.empty? || tag_depth_map.empty?

  best_score = 0.0

  matched_tags.each do |tag|
    if tag_depth_map.key?(tag)
      # Direct hit: depth / max_depth, e.g. "database:postgresql" against
      # the query tag "database:postgresql:extensions" gives 2/3 = 0.67.
      info = tag_depth_map[tag]
      best_score = [best_score, info[:depth].to_f / info[:max_depth].to_f].max
    else
      # Fallback: the tag is not a key, so it can only relate to the map as
      # a strict ancestor of one of its prefixes. (An equality test here
      # could never succeed, since this branch runs only when the key is
      # absent.) Both values below are invariant across map entries.
      child_marker = tag + ':'
      tag_levels = tag.split(':').size.to_f

      tag_depth_map.each do |prefix, info|
        next unless prefix.start_with?(child_marker)

        best_score = [best_score, tag_levels / info[:max_depth].to_f].max
      end
    end
  end

  # Bonus for multiple tag matches: 0.05 per extra tag, capped at 0.2.
  multi_match_bonus = [(matched_tags.size - 1) * 0.05, 0.2].min

  [best_score + multi_match_bonus, 1.0].min
end
|
|
357
|
+
|
|
358
|
+
# Parse PostgreSQL array string to Ruby array
|
|
359
|
+
#
|
|
360
|
+
# @param pg_array [String, Array, Sequel::Postgres::PGArray] PostgreSQL array or Ruby array
|
|
361
|
+
# @return [Array<String>] Parsed array
|
|
362
|
+
#
|
|
363
|
+
def parse_pg_array(pg_array)
  # Purpose: normalise a one-dimensional PostgreSQL array value into a Ruby
  # Array of Strings.
  #
  # pg_array - an Array-like object (anything non-String responding to
  #            #to_a, which also covers nil), or a String in PostgreSQL
  #            array text format such as {val1,"val 2",val3}.
  # Returns Array<String>. A string not wrapped in braces is returned as a
  # single-element array; empty input yields [].

  # Array-likes (including nil, whose #to_a is []) need no parsing.
  return pg_array.to_a if pg_array.respond_to?(:to_a) && !pg_array.is_a?(String)
  return [] if pg_array.respond_to?(:empty?) && pg_array.empty?

  pg_str = pg_array.to_s
  return [pg_str] unless pg_str.start_with?('{') && pg_str.end_with?('}')

  body = pg_str[1..-2]
  return [] if body.empty?

  elements = []
  buffer = ''.dup
  in_quotes = false   # currently between double quotes
  saw_quotes = false  # current element was (at least partly) quoted
  escaped = false     # previous character was a backslash

  finish_element = lambda do
    # An unquoted NULL is SQL NULL, not the string "NULL"; it is dropped so
    # the result stays an Array<String>.
    elements << buffer unless !saw_quotes && buffer == 'NULL'
    buffer = ''.dup
    saw_quotes = false
  end

  # Scan character by character so commas and escaped quotes inside quoted
  # elements stay part of the element instead of splitting it.
  body.each_char do |char|
    if escaped
      buffer << char
      escaped = false
    elsif char == '\\'
      escaped = true
    elsif char == '"'
      in_quotes = !in_quotes
      saw_quotes = true
    elsif char == ',' && !in_quotes
      finish_element.call
    else
      buffer << char
    end
  end
  finish_element.call

  elements
end
|
|
377
|
+
|
|
378
|
+
# Merge three result sets using Reciprocal Rank Fusion
|
|
379
|
+
#
|
|
380
|
+
# RRF score = Σ 1/(k + rank) for each list where the item appears
|
|
381
|
+
#
|
|
382
|
+
# Items appearing in multiple lists naturally get higher scores
|
|
383
|
+
# because they receive contributions from multiple ranks.
|
|
384
|
+
#
|
|
385
|
+
# @param vector_results [Array<Hash>] Vector search results (ordered by similarity)
|
|
386
|
+
# @param fulltext_results [Array<Hash>] Fulltext search results (ordered by text_rank)
|
|
387
|
+
# @param tag_results [Array<Hash>] Tag search results (ordered by tag_depth_score)
|
|
388
|
+
# @return [Array<Hash>] Merged results sorted by RRF score
|
|
389
|
+
#
|
|
390
|
+
def merge_with_rrf(vector_results, fulltext_results, tag_results = [])
  # Purpose: fuse up to three ranked result lists into one list using
  # Reciprocal Rank Fusion. Each list that contains a node contributes
  # 1 / (RRF_K + rank) to that node's 'rrf_score', so nodes found by
  # several searches rise to the top.
  #
  # Each argument is an Array<Hash> with string keys, already ordered
  # best-first. Returns an Array<Hash> sorted by 'rrf_score' descending;
  # every hash carries the node fields, the per-source scores and ranks,
  # and the list of 'sources' that found it.
  merged = {} # node id => merged entry

  accumulate_rrf_source(merged, vector_results, 'vector', 'vector_rank', ['similarity'])
  accumulate_rrf_source(merged, fulltext_results, 'fulltext', 'fulltext_rank', ['text_rank'])
  accumulate_rrf_source(merged, tag_results, 'tags', 'tag_rank', %w[tag_depth_score matched_tags])

  merged.values.sort_by { |entry| -entry['rrf_score'] }
end

# Fold one ranked result list into the merged RRF table.
#
# @param merged [Hash] node id => merged entry (mutated in place)
# @param results [Array<Hash>] one search's results, best first
# @param source [String] label recorded in the entry's 'sources'
# @param rank_key [String] entry key that receives the 1-based rank
# @param score_keys [Array<String>] keys copied from the result to the entry
# @return [void]
#
def accumulate_rrf_source(merged, results, source, rank_key, score_keys)
  results.each_with_index do |result, index|
    entry = (merged[result['id']] ||= blank_rrf_entry(result))

    # One contribution per list: if an id repeats inside the same list,
    # only its first (best) rank counts.
    next if entry['sources'].include?(source)

    rank = index + 1 # 1-based rank
    entry['rrf_score'] += 1.0 / (RRF_K + rank)
    score_keys.each { |key| entry[key] = result[key] }
    entry[rank_key] = rank
    entry['sources'] << source
  end
end

# Build a merged entry holding the node fields of +result+ with every
# per-source score zeroed and every rank unset.
#
# @param result [Hash] a search result with string keys
# @return [Hash] fresh merged entry
#
def blank_rrf_entry(result)
  {
    'id' => result['id'],
    'content' => result['content'],
    'access_count' => result['access_count'],
    'created_at' => result['created_at'],
    'token_count' => result['token_count'],
    'similarity' => 0.0,
    'text_rank' => 0.0,
    'tag_depth_score' => 0.0,
    'matched_tags' => [],
    'rrf_score' => 0.0,
    'vector_rank' => nil,
    'fulltext_rank' => nil,
    'tag_rank' => nil,
    'sources' => []
  }
end
|
|
322
489
|
end
|
|
323
490
|
end
|