swarm_sdk 2.7.13 → 3.0.0.alpha1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/swarm_sdk/ruby_llm_patches/chat_callbacks_patch.rb +43 -22
- data/lib/swarm_sdk/ruby_llm_patches/init.rb +6 -0
- data/lib/swarm_sdk/ruby_llm_patches/mcp_ssl_patch.rb +144 -0
- data/lib/swarm_sdk/ruby_llm_patches/tool_concurrency_patch.rb +3 -4
- data/lib/swarm_sdk/v3/agent.rb +1165 -0
- data/lib/swarm_sdk/v3/agent_builder.rb +533 -0
- data/lib/swarm_sdk/v3/agent_definition.rb +330 -0
- data/lib/swarm_sdk/v3/configuration.rb +490 -0
- data/lib/swarm_sdk/v3/debug_log.rb +86 -0
- data/lib/swarm_sdk/v3/event_stream.rb +130 -0
- data/lib/swarm_sdk/v3/hooks/context.rb +112 -0
- data/lib/swarm_sdk/v3/hooks/result.rb +115 -0
- data/lib/swarm_sdk/v3/hooks/runner.rb +128 -0
- data/lib/swarm_sdk/v3/mcp/connector.rb +183 -0
- data/lib/swarm_sdk/v3/mcp/mcp_error.rb +15 -0
- data/lib/swarm_sdk/v3/mcp/server_definition.rb +125 -0
- data/lib/swarm_sdk/v3/mcp/ssl_http_transport.rb +103 -0
- data/lib/swarm_sdk/v3/mcp/stdio_transport.rb +135 -0
- data/lib/swarm_sdk/v3/mcp/tool_proxy.rb +53 -0
- data/lib/swarm_sdk/v3/memory/adapters/base.rb +297 -0
- data/lib/swarm_sdk/v3/memory/adapters/faiss_support.rb +194 -0
- data/lib/swarm_sdk/v3/memory/adapters/filesystem_adapter.rb +212 -0
- data/lib/swarm_sdk/v3/memory/adapters/sqlite_adapter.rb +507 -0
- data/lib/swarm_sdk/v3/memory/adapters/vector_utils.rb +88 -0
- data/lib/swarm_sdk/v3/memory/card.rb +206 -0
- data/lib/swarm_sdk/v3/memory/cluster.rb +146 -0
- data/lib/swarm_sdk/v3/memory/compressor.rb +496 -0
- data/lib/swarm_sdk/v3/memory/consolidator.rb +427 -0
- data/lib/swarm_sdk/v3/memory/context_builder.rb +339 -0
- data/lib/swarm_sdk/v3/memory/edge.rb +105 -0
- data/lib/swarm_sdk/v3/memory/embedder.rb +185 -0
- data/lib/swarm_sdk/v3/memory/exposure_tracker.rb +104 -0
- data/lib/swarm_sdk/v3/memory/ingestion_pipeline.rb +394 -0
- data/lib/swarm_sdk/v3/memory/retriever.rb +289 -0
- data/lib/swarm_sdk/v3/memory/store.rb +489 -0
- data/lib/swarm_sdk/v3/skills/loader.rb +147 -0
- data/lib/swarm_sdk/v3/skills/manifest.rb +45 -0
- data/lib/swarm_sdk/v3/sub_task_agent.rb +248 -0
- data/lib/swarm_sdk/v3/tools/base.rb +80 -0
- data/lib/swarm_sdk/v3/tools/bash.rb +174 -0
- data/lib/swarm_sdk/v3/tools/clock.rb +32 -0
- data/lib/swarm_sdk/v3/tools/edit.rb +111 -0
- data/lib/swarm_sdk/v3/tools/glob.rb +96 -0
- data/lib/swarm_sdk/v3/tools/grep.rb +200 -0
- data/lib/swarm_sdk/v3/tools/message_teammate.rb +15 -0
- data/lib/swarm_sdk/v3/tools/message_user.rb +15 -0
- data/lib/swarm_sdk/v3/tools/read.rb +181 -0
- data/lib/swarm_sdk/v3/tools/read_tracker.rb +40 -0
- data/lib/swarm_sdk/v3/tools/registry.rb +208 -0
- data/lib/swarm_sdk/v3/tools/sub_task.rb +183 -0
- data/lib/swarm_sdk/v3/tools/think.rb +88 -0
- data/lib/swarm_sdk/v3/tools/write.rb +87 -0
- data/lib/swarm_sdk/v3.rb +145 -0
- metadata +84 -148
- data/lib/swarm_sdk/agent/RETRY_LOGIC.md +0 -175
- data/lib/swarm_sdk/agent/builder.rb +0 -680
- data/lib/swarm_sdk/agent/chat.rb +0 -1432
- data/lib/swarm_sdk/agent/chat_helpers/context_tracker.rb +0 -375
- data/lib/swarm_sdk/agent/chat_helpers/event_emitter.rb +0 -204
- data/lib/swarm_sdk/agent/chat_helpers/hook_integration.rb +0 -480
- data/lib/swarm_sdk/agent/chat_helpers/instrumentation.rb +0 -85
- data/lib/swarm_sdk/agent/chat_helpers/llm_configuration.rb +0 -290
- data/lib/swarm_sdk/agent/chat_helpers/logging_helpers.rb +0 -116
- data/lib/swarm_sdk/agent/chat_helpers/serialization.rb +0 -83
- data/lib/swarm_sdk/agent/chat_helpers/system_reminder_injector.rb +0 -134
- data/lib/swarm_sdk/agent/chat_helpers/system_reminders.rb +0 -79
- data/lib/swarm_sdk/agent/chat_helpers/token_tracking.rb +0 -146
- data/lib/swarm_sdk/agent/context.rb +0 -115
- data/lib/swarm_sdk/agent/context_manager.rb +0 -315
- data/lib/swarm_sdk/agent/definition.rb +0 -581
- data/lib/swarm_sdk/agent/llm_instrumentation_middleware.rb +0 -226
- data/lib/swarm_sdk/agent/system_prompt_builder.rb +0 -161
- data/lib/swarm_sdk/agent/tool_registry.rb +0 -189
- data/lib/swarm_sdk/agent_registry.rb +0 -146
- data/lib/swarm_sdk/builders/base_builder.rb +0 -553
- data/lib/swarm_sdk/claude_code_agent_adapter.rb +0 -205
- data/lib/swarm_sdk/concerns/cleanupable.rb +0 -39
- data/lib/swarm_sdk/concerns/snapshotable.rb +0 -67
- data/lib/swarm_sdk/concerns/validatable.rb +0 -55
- data/lib/swarm_sdk/config.rb +0 -367
- data/lib/swarm_sdk/configuration/parser.rb +0 -397
- data/lib/swarm_sdk/configuration/translator.rb +0 -283
- data/lib/swarm_sdk/configuration.rb +0 -165
- data/lib/swarm_sdk/context_compactor/metrics.rb +0 -147
- data/lib/swarm_sdk/context_compactor/token_counter.rb +0 -102
- data/lib/swarm_sdk/context_compactor.rb +0 -335
- data/lib/swarm_sdk/context_management/builder.rb +0 -128
- data/lib/swarm_sdk/context_management/context.rb +0 -328
- data/lib/swarm_sdk/custom_tool_registry.rb +0 -226
- data/lib/swarm_sdk/defaults.rb +0 -251
- data/lib/swarm_sdk/events_to_messages.rb +0 -199
- data/lib/swarm_sdk/hooks/adapter.rb +0 -359
- data/lib/swarm_sdk/hooks/context.rb +0 -197
- data/lib/swarm_sdk/hooks/definition.rb +0 -80
- data/lib/swarm_sdk/hooks/error.rb +0 -29
- data/lib/swarm_sdk/hooks/executor.rb +0 -146
- data/lib/swarm_sdk/hooks/registry.rb +0 -147
- data/lib/swarm_sdk/hooks/result.rb +0 -150
- data/lib/swarm_sdk/hooks/shell_executor.rb +0 -256
- data/lib/swarm_sdk/hooks/tool_call.rb +0 -35
- data/lib/swarm_sdk/hooks/tool_result.rb +0 -62
- data/lib/swarm_sdk/log_collector.rb +0 -227
- data/lib/swarm_sdk/log_stream.rb +0 -127
- data/lib/swarm_sdk/markdown_parser.rb +0 -75
- data/lib/swarm_sdk/model_aliases.json +0 -8
- data/lib/swarm_sdk/models.json +0 -44002
- data/lib/swarm_sdk/models.rb +0 -161
- data/lib/swarm_sdk/node_context.rb +0 -245
- data/lib/swarm_sdk/observer/builder.rb +0 -81
- data/lib/swarm_sdk/observer/config.rb +0 -45
- data/lib/swarm_sdk/observer/manager.rb +0 -236
- data/lib/swarm_sdk/patterns/agent_observer.rb +0 -160
- data/lib/swarm_sdk/permissions/config.rb +0 -239
- data/lib/swarm_sdk/permissions/error_formatter.rb +0 -121
- data/lib/swarm_sdk/permissions/path_matcher.rb +0 -35
- data/lib/swarm_sdk/permissions/validator.rb +0 -173
- data/lib/swarm_sdk/permissions_builder.rb +0 -122
- data/lib/swarm_sdk/plugin.rb +0 -309
- data/lib/swarm_sdk/plugin_registry.rb +0 -101
- data/lib/swarm_sdk/proc_helpers.rb +0 -53
- data/lib/swarm_sdk/prompts/base_system_prompt.md.erb +0 -117
- data/lib/swarm_sdk/restore_result.rb +0 -65
- data/lib/swarm_sdk/result.rb +0 -212
- data/lib/swarm_sdk/snapshot.rb +0 -156
- data/lib/swarm_sdk/snapshot_from_events.rb +0 -397
- data/lib/swarm_sdk/state_restorer.rb +0 -476
- data/lib/swarm_sdk/state_snapshot.rb +0 -334
- data/lib/swarm_sdk/swarm/agent_initializer.rb +0 -648
- data/lib/swarm_sdk/swarm/all_agents_builder.rb +0 -195
- data/lib/swarm_sdk/swarm/builder.rb +0 -256
- data/lib/swarm_sdk/swarm/executor.rb +0 -290
- data/lib/swarm_sdk/swarm/hook_triggers.rb +0 -151
- data/lib/swarm_sdk/swarm/lazy_delegate_chat.rb +0 -372
- data/lib/swarm_sdk/swarm/logging_callbacks.rb +0 -360
- data/lib/swarm_sdk/swarm/mcp_configurator.rb +0 -270
- data/lib/swarm_sdk/swarm/swarm_registry_builder.rb +0 -67
- data/lib/swarm_sdk/swarm/tool_configurator.rb +0 -392
- data/lib/swarm_sdk/swarm.rb +0 -843
- data/lib/swarm_sdk/swarm_loader.rb +0 -145
- data/lib/swarm_sdk/swarm_registry.rb +0 -136
- data/lib/swarm_sdk/tools/base.rb +0 -63
- data/lib/swarm_sdk/tools/bash.rb +0 -280
- data/lib/swarm_sdk/tools/clock.rb +0 -46
- data/lib/swarm_sdk/tools/delegate.rb +0 -389
- data/lib/swarm_sdk/tools/document_converters/base_converter.rb +0 -83
- data/lib/swarm_sdk/tools/document_converters/docx_converter.rb +0 -99
- data/lib/swarm_sdk/tools/document_converters/html_converter.rb +0 -101
- data/lib/swarm_sdk/tools/document_converters/pdf_converter.rb +0 -78
- data/lib/swarm_sdk/tools/document_converters/xlsx_converter.rb +0 -194
- data/lib/swarm_sdk/tools/edit.rb +0 -145
- data/lib/swarm_sdk/tools/glob.rb +0 -166
- data/lib/swarm_sdk/tools/grep.rb +0 -235
- data/lib/swarm_sdk/tools/image_extractors/docx_image_extractor.rb +0 -43
- data/lib/swarm_sdk/tools/image_extractors/pdf_image_extractor.rb +0 -167
- data/lib/swarm_sdk/tools/image_formats/tiff_builder.rb +0 -65
- data/lib/swarm_sdk/tools/mcp_tool_stub.rb +0 -198
- data/lib/swarm_sdk/tools/multi_edit.rb +0 -236
- data/lib/swarm_sdk/tools/path_resolver.rb +0 -92
- data/lib/swarm_sdk/tools/read.rb +0 -261
- data/lib/swarm_sdk/tools/registry.rb +0 -205
- data/lib/swarm_sdk/tools/scratchpad/scratchpad_list.rb +0 -117
- data/lib/swarm_sdk/tools/scratchpad/scratchpad_read.rb +0 -97
- data/lib/swarm_sdk/tools/scratchpad/scratchpad_write.rb +0 -108
- data/lib/swarm_sdk/tools/stores/read_tracker.rb +0 -96
- data/lib/swarm_sdk/tools/stores/scratchpad_storage.rb +0 -273
- data/lib/swarm_sdk/tools/stores/storage.rb +0 -142
- data/lib/swarm_sdk/tools/stores/todo_manager.rb +0 -65
- data/lib/swarm_sdk/tools/think.rb +0 -100
- data/lib/swarm_sdk/tools/todo_write.rb +0 -237
- data/lib/swarm_sdk/tools/web_fetch.rb +0 -264
- data/lib/swarm_sdk/tools/write.rb +0 -112
- data/lib/swarm_sdk/transcript_builder.rb +0 -278
- data/lib/swarm_sdk/utils.rb +0 -68
- data/lib/swarm_sdk/validation_result.rb +0 -33
- data/lib/swarm_sdk/version.rb +0 -5
- data/lib/swarm_sdk/workflow/agent_config.rb +0 -95
- data/lib/swarm_sdk/workflow/builder.rb +0 -227
- data/lib/swarm_sdk/workflow/executor.rb +0 -497
- data/lib/swarm_sdk/workflow/node_builder.rb +0 -593
- data/lib/swarm_sdk/workflow/transformer_executor.rb +0 -250
- data/lib/swarm_sdk/workflow.rb +0 -589
- data/lib/swarm_sdk.rb +0 -718
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SwarmSDK
  module V3
    module Memory
      # Hybrid search retriever combining semantic and keyword search.
      #
      # A query runs through four stages (see {#search}):
      #   1. Semantic search via the adapter's vector search
      #   2. BM25 keyword search over every card returned by the adapter
      #   3. Reciprocal rank fusion (RRF) of the two rankings
      #   4. Graph expansion via 1-hop edge traversal
      #
      # Tuning values (BM25 k1/b, entity boost, RRF k, neighbors per seed)
      # are read from +Configuration.instance+.
      #
      # @example
      #   retriever = Retriever.new(adapter: adapter, embedder: embedder)
      #   results = retriever.search("JWT authentication", top_k: 10)
      class Retriever
        # Common English stopwords to filter from BM25 scoring
        STOPWORDS = Set.new(%w[
          the a an is are was were be been being
          have has had do does did will would shall should
          may might can could of in to for on with
          at by from as into through about between after before
          above below up down out off over under again
          this that these those it its he she they them
          and but or nor not no so if then than too very
        ]).freeze

        # @param adapter [Adapters::Base] Storage adapter
        # @param embedder [Embedder] Text embedder
        # @param semantic_weight [Float] Weight for semantic results (0.0-1.0)
        # @param keyword_weight [Float] Weight for keyword results (0.0-1.0)
        def initialize(adapter:, embedder:, semantic_weight: 0.5, keyword_weight: 0.5)
          @adapter = adapter
          @embedder = embedder
          @semantic_weight = semantic_weight
          @keyword_weight = keyword_weight
          @config = Configuration.instance
        end

        # Search for relevant memory cards
        #
        # Each underlying search is asked for +top_k * 2+ candidates so that
        # fusion has more than +top_k+ items to rank. A nil, empty, or
        # whitespace-only query returns an empty array without calling the
        # embedder.
        #
        # @param query [String, nil] Search query
        # @param top_k [Integer] Number of results to return
        # @return [Array<Card>] Ranked cards (at most +top_k+)
        #
        # @example
        #   cards = retriever.search("How does auth work?", top_k: 10)
        def search(query, top_k: 15)
          query = query.to_s
          DebugLog.log("retriever", "search: query=#{query[0..60].inspect}, top_k=#{top_k}")
          return [] if query.strip.empty?

          candidate_k = top_k * 2

          semantic_results = DebugLog.time("retriever", "semantic_search(#{candidate_k})") do
            semantic_search(query, top_k: candidate_k)
          end

          keyword_results = DebugLog.time("retriever", "keyword_search(#{candidate_k})") do
            keyword_search(query, top_k: candidate_k)
          end

          DebugLog.log("retriever", "semantic=#{semantic_results.size} keyword=#{keyword_results.size}")

          # Reciprocal rank fusion
          fused = DebugLog.time("retriever", "rrf") do
            reciprocal_rank_fusion(semantic_results, keyword_results)
          end

          # Graph expansion: add 1-hop neighbors of top results
          expanded = DebugLog.time("retriever", "graph_expand") do
            graph_expand(fused.take(top_k), top_k: top_k)
          end

          DebugLog.log("retriever", "fused=#{fused.size} expanded=#{expanded.size} returning=#{[expanded.size, top_k].min}")

          # Return unique cards, limited to top_k
          expanded.take(top_k)
        end

        private

        # Semantic search via vector similarity
        #
        # Embeds the query, asks the adapter for matches with a similarity
        # threshold of 0.1, and orders them by descending similarity.
        #
        # @param query [String] Search query
        # @param top_k [Integer] Number of results
        # @return [Array<String>] Ranked card IDs
        def semantic_search(query, top_k:)
          embedding = @embedder.embed(query)
          results = @adapter.vector_search(embedding, top_k: top_k, threshold: 0.1)
          results.sort_by { |r| -r[:similarity] }.map { |r| r[:id] }
        end

        # BM25 keyword search
        #
        # Implements Okapi BM25 scoring for keyword-based retrieval.
        # Catches exact names, IDs, error messages, and other literal matches
        # that semantic search might miss.
        #
        # Every card is tokenized exactly once per call; the tokens are reused
        # for the average document length, the document frequencies, and the
        # per-card score. A term repeated in the query contributes to the
        # score once per repetition, but document frequency is counted per
        # distinct term.
        #
        # @param query [String] Search query
        # @param top_k [Integer] Number of results
        # @return [Array<String>] Ranked card IDs
        def keyword_search(query, top_k:)
          terms = tokenize(query)
          return [] if terms.empty?

          cards = @adapter.list_cards
          return [] if cards.empty?

          # Precompute corpus statistics for BM25
          tokenized = cards.map { |card| tokenize(card.text) }
          n = cards.size
          avg_dl = tokenized.sum(&:size).to_f / n
          doc_freq = compute_document_frequencies(terms, cards, tokenized: tokenized)

          scored = cards.zip(tokenized).filter_map do |card, card_terms|
            score = bm25_score(terms, card_terms, avg_dl: avg_dl, doc_freq: doc_freq, n: n)

            # Boost per entity that contains any query term as a substring
            # (case-insensitive) - this is not an exact-match test.
            score += entity_matches(card, terms) * @config.bm25_entity_boost

            { id: card.id, score: score } if score > 0.0
          end

          scored.sort_by { |s| -s[:score] }.take(top_k).map { |s| s[:id] }
        end

        # BM25 score of one tokenized card against the query terms
        #
        # @param terms [Array<String>] Query terms (duplicates allowed)
        # @param card_terms [Array<String>] Tokens of the card text
        # @param avg_dl [Float] Average token count per card in the corpus
        # @param doc_freq [Hash<String, Integer>] Term → document count
        # @param n [Integer] Number of cards in the corpus
        # @return [Float] Sum of the per-term BM25 contributions
        def bm25_score(terms, card_terms, avg_dl:, doc_freq:, n:)
          counts = card_terms.tally
          dl = card_terms.size
          k1 = @config.bm25_k1
          b = @config.bm25_b

          terms.sum(0.0) do |term|
            tf = counts.fetch(term, 0)
            # tf > 0 implies dl > 0, hence avg_dl > 0 below.
            next 0.0 if tf == 0

            df = doc_freq[term] || 0
            idf = Math.log((n - df + 0.5) / (df + 0.5) + 1.0)
            numerator = tf * (k1 + 1)
            denominator = tf + k1 * (1 - b + b * dl / avg_dl)
            idf * numerator / denominator
          end
        end

        # Count the card's entities that contain any query term
        #
        # @param card [Card] Card whose +entities+ are inspected (nil is
        #   treated as no entities)
        # @param terms [Array<String>] Lowercase query terms
        # @return [Integer] Number of matching entities
        def entity_matches(card, terms)
          Array(card.entities).count do |entity|
            lowered = entity.to_s.downcase
            terms.any? { |t| lowered.include?(t) }
          end
        end

        # Combine ranked lists using reciprocal rank fusion
        #
        # Each ID scores +weight / (rrf_k + rank + 1)+ per list (rank is
        # zero-based); scores from both lists are summed. IDs whose card can
        # no longer be read from the adapter are dropped.
        #
        # @param semantic_ids [Array<String>] Semantic search results
        # @param keyword_ids [Array<String>] Keyword search results
        # @return [Array<Card>] Fused and ranked cards
        def reciprocal_rank_fusion(semantic_ids, keyword_ids)
          scores = Hash.new(0.0)

          [[semantic_ids, @semantic_weight], [keyword_ids, @keyword_weight]].each do |ids, weight|
            ids.each_with_index do |id, rank|
              scores[id] += weight * (1.0 / (@config.rrf_k + rank + 1))
            end
          end

          ranked_ids = scores.sort_by { |_, score| -score }.map(&:first)
          ranked_ids.filter_map { |id| @adapter.read_card(id) }
        end

        # Expand results with 1-hop graph neighbors
        #
        # Limits to `config.max_neighbors_per_seed` neighbors per seed card
        # to prevent any single highly-connected card from dominating
        # the expanded results.
        #
        # Neighbors are appended after the seeds, and expansion stops as soon
        # as +top_k+ cards are collected - so when +cards+ already holds
        # +top_k+ entries, nothing is added.
        #
        # @param cards [Array<Card>] Initial result cards
        # @param top_k [Integer] Maximum total results
        # @return [Array<Card>] Expanded results
        def graph_expand(cards, top_k:)
          seen_ids = Set.new(cards.map(&:id))
          expanded = cards.dup

          cards.each do |card|
            break if expanded.size >= top_k

            neighbors_added = 0

            # Sort edges by weight descending to prefer strongest connections
            @adapter.edges_for(card.id).sort_by { |e| -e.weight }.each do |edge|
              break if expanded.size >= top_k
              break if neighbors_added >= @config.max_neighbors_per_seed

              # Edges are treated as undirected: take whichever end is not the seed.
              neighbor_id = edge.from_id == card.id ? edge.to_id : edge.from_id
              next if seen_ids.include?(neighbor_id)

              neighbor = @adapter.read_card(neighbor_id)
              next unless neighbor

              seen_ids.add(neighbor_id)
              expanded << neighbor
              neighbors_added += 1
            end
          end

          expanded
        end

        # Tokenize text into lowercase terms, filtering stopwords and short tokens
        #
        # Only ASCII letters, digits, "_" and "-" survive; every other
        # character acts as a separator. Tokens shorter than two characters
        # and members of STOPWORDS are dropped. nil is treated as "".
        #
        # @param text [String, nil] Text to tokenize
        # @return [Array<String>] Lowercase tokens
        def tokenize(text)
          text.to_s
            .downcase
            .gsub(/[^a-z0-9\s_-]/, " ")
            .split(/\s+/)
            .reject { |t| t.length < 2 || STOPWORDS.include?(t) }
        end

        # Compute document frequency for each query term
        #
        # Each distinct term is counted at most once per card, so repeating a
        # term in the query cannot inflate its frequency beyond the corpus
        # size (which would make the BM25 IDF negative).
        #
        # @param terms [Array<String>] Query terms
        # @param cards [Array<Card>] All cards in the corpus
        # @param tokenized [Array<Array<String>>, nil] Optional pre-tokenized
        #   card texts, in the same order as +cards+, to avoid re-tokenizing
        # @return [Hash<String, Integer>] Term → document count
        def compute_document_frequencies(terms, cards, tokenized: nil)
          tokenized ||= cards.map { |card| tokenize(card.text) }
          unique_terms = terms.uniq

          tokenized.each_with_object(Hash.new(0)) do |card_terms, freq|
            present = card_terms.to_set
            unique_terms.each { |term| freq[term] += 1 if present.include?(term) }
          end
        end
      end
    end
  end
end
|