fact_db 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.envrc +2 -0
- data/.yardopts +5 -0
- data/CHANGELOG.md +64 -0
- data/README.md +107 -6
- data/Rakefile +243 -10
- data/db/migrate/001_enable_extensions.rb +1 -0
- data/db/migrate/002_create_sources.rb +49 -0
- data/db/migrate/003_create_entities.rb +27 -15
- data/db/migrate/004_create_entity_aliases.rb +20 -7
- data/db/migrate/005_create_facts.rb +37 -21
- data/db/migrate/006_create_entity_mentions.rb +14 -6
- data/db/migrate/007_create_fact_sources.rb +16 -8
- data/docs/api/extractors/index.md +5 -5
- data/docs/api/extractors/llm.md +17 -17
- data/docs/api/extractors/rule-based.md +14 -14
- data/docs/api/facts.md +20 -20
- data/docs/api/index.md +4 -4
- data/docs/api/models/entity.md +21 -21
- data/docs/api/models/fact.md +15 -15
- data/docs/api/models/index.md +7 -7
- data/docs/api/models/{content.md → source.md} +29 -29
- data/docs/api/pipeline/extraction.md +25 -25
- data/docs/api/pipeline/index.md +1 -1
- data/docs/api/pipeline/resolution.md +4 -4
- data/docs/api/services/entity-service.md +20 -20
- data/docs/api/services/fact-service.md +12 -12
- data/docs/api/services/index.md +5 -5
- data/docs/api/services/{content-service.md → source-service.md} +27 -27
- data/docs/architecture/database-schema.md +46 -46
- data/docs/architecture/entity-resolution.md +6 -6
- data/docs/architecture/index.md +10 -10
- data/docs/architecture/temporal-facts.md +5 -5
- data/docs/architecture/three-layer-model.md +17 -17
- data/docs/concepts.md +6 -6
- data/docs/examples/basic-usage.md +20 -20
- data/docs/examples/hr-onboarding.md +17 -17
- data/docs/examples/index.md +4 -4
- data/docs/examples/news-analysis.md +23 -23
- data/docs/getting-started/database-setup.md +28 -20
- data/docs/getting-started/index.md +3 -3
- data/docs/getting-started/quick-start.md +33 -30
- data/docs/guides/batch-processing.md +26 -26
- data/docs/guides/configuration.md +158 -77
- data/docs/guides/entity-management.md +14 -14
- data/docs/guides/extracting-facts.md +28 -28
- data/docs/guides/ingesting-content.md +14 -14
- data/docs/guides/llm-integration.md +40 -32
- data/docs/guides/temporal-queries.md +11 -11
- data/docs/index.md +6 -2
- data/examples/.envrc +4 -0
- data/examples/.gitignore +1 -0
- data/examples/001_configuration.rb +312 -0
- data/examples/{basic_usage.rb → 010_basic_usage.rb} +47 -56
- data/examples/{entity_management.rb → 020_entity_management.rb} +57 -72
- data/examples/{temporal_queries.rb → 030_temporal_queries.rb} +39 -59
- data/examples/040_output_formats.rb +177 -0
- data/examples/{rule_based_extraction.rb → 050_rule_based_extraction.rb} +39 -45
- data/examples/060_fluent_temporal_api.rb +217 -0
- data/examples/070_introspection.rb +252 -0
- data/examples/{hr_system.rb → 080_hr_system.rb} +56 -75
- data/examples/090_ingest_demo.rb +515 -0
- data/examples/100_query_context.rb +668 -0
- data/examples/110_prove_it.rb +204 -0
- data/examples/120_dump_database.rb +358 -0
- data/examples/130_rag_feedback_loop.rb +858 -0
- data/examples/README.md +229 -15
- data/examples/data/lincoln_associates.md +201 -0
- data/examples/data/lincoln_biography.md +66 -0
- data/examples/data/lincoln_cabinet.md +243 -0
- data/examples/data/lincoln_family.md +163 -0
- data/examples/data/lincoln_military.md +241 -0
- data/examples/data/lincoln_todd_family.md +136 -0
- data/examples/ingest_reporter.rb +335 -0
- data/examples/utilities.rb +182 -0
- data/lib/fact_db/config/defaults.yml +254 -0
- data/lib/fact_db/config.rb +94 -35
- data/lib/fact_db/database.rb +98 -8
- data/lib/fact_db/extractors/base.rb +106 -21
- data/lib/fact_db/extractors/llm_extractor.rb +35 -63
- data/lib/fact_db/extractors/manual_extractor.rb +46 -6
- data/lib/fact_db/extractors/rule_based_extractor.rb +136 -25
- data/lib/fact_db/llm/adapter.rb +3 -3
- data/lib/fact_db/models/entity.rb +94 -22
- data/lib/fact_db/models/entity_alias.rb +41 -7
- data/lib/fact_db/models/entity_mention.rb +34 -1
- data/lib/fact_db/models/fact.rb +259 -28
- data/lib/fact_db/models/fact_source.rb +43 -9
- data/lib/fact_db/models/source.rb +113 -0
- data/lib/fact_db/pipeline/extraction_pipeline.rb +35 -35
- data/lib/fact_db/pipeline/resolution_pipeline.rb +5 -5
- data/lib/fact_db/query_result.rb +202 -0
- data/lib/fact_db/resolution/entity_resolver.rb +139 -39
- data/lib/fact_db/resolution/fact_resolver.rb +86 -14
- data/lib/fact_db/services/entity_service.rb +246 -37
- data/lib/fact_db/services/fact_service.rb +254 -17
- data/lib/fact_db/services/source_service.rb +164 -0
- data/lib/fact_db/temporal/query.rb +71 -7
- data/lib/fact_db/temporal/query_builder.rb +69 -0
- data/lib/fact_db/temporal/timeline.rb +102 -11
- data/lib/fact_db/transformers/base.rb +77 -0
- data/lib/fact_db/transformers/cypher_transformer.rb +185 -0
- data/lib/fact_db/transformers/json_transformer.rb +17 -0
- data/lib/fact_db/transformers/raw_transformer.rb +35 -0
- data/lib/fact_db/transformers/text_transformer.rb +114 -0
- data/lib/fact_db/transformers/triple_transformer.rb +138 -0
- data/lib/fact_db/validation/alias_filter.rb +185 -0
- data/lib/fact_db/version.rb +1 -1
- data/lib/fact_db.rb +281 -30
- data/mkdocs.yml +2 -2
- metadata +60 -16
- data/db/migrate/002_create_contents.rb +0 -44
- data/lib/fact_db/models/content.rb +0 -62
- data/lib/fact_db/services/content_service.rb +0 -93
|
@@ -0,0 +1,668 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
# Query Context Generator for FactDb
|
|
5
|
+
#
|
|
6
|
+
# Takes a natural language query and generates context from the facts database
|
|
7
|
+
# suitable for LLM consumption.
|
|
8
|
+
#
|
|
9
|
+
# Usage:
|
|
10
|
+
# ruby query_context.rb "Who is Sapphira's husband?"
|
|
11
|
+
# ruby query_context.rb "What happened to Ananias?"
|
|
12
|
+
# ruby query_context.rb --format triples "Tell me about Peter"
|
|
13
|
+
# ruby query_context.rb --format json "Who are the apostles?"
|
|
14
|
+
# ruby query_context.rb --verbose "Where was Stephen martyred?"
|
|
15
|
+
#
|
|
16
|
+
# Options:
|
|
17
|
+
# --format FORMAT Output format: text (default), json, triples, cypher, raw
|
|
18
|
+
# --verbose Show detailed processing steps
|
|
19
|
+
# --limit N Maximum number of facts to return (default: 20)
|
|
20
|
+
|
|
21
|
+
require_relative "utilities"
|
|
22
|
+
require "optparse"
|
|
23
|
+
|
|
24
|
+
# Note: CLI tool - uses cli_setup! which does NOT reset database
|
|
25
|
+
|
|
26
|
+
class QueryContextGenerator
|
|
27
|
+
FORMATS = %i[text json triples cypher raw].freeze
|
|
28
|
+
|
|
29
|
+
# Builds a generator from an options hash.
#
# Recognised keys: :format (defaults to :text), :verbose (defaults to false),
# :limit (defaults to 20) and :rank (ranking stays enabled unless the value
# is exactly false). Connects to FactDb and loads the ranking weights.
def initialize(options = {})
  @query_embedding = nil # Cache for query embedding

  @format = options[:format] || :text
  @limit = options[:limit] || 20
  @verbose = options[:verbose] || false
  @rank = !(options[:rank] == false) # Default to true

  setup_factdb
  load_ranking_weights
end
|
|
38
|
+
|
|
39
|
+
# Populates @weights with one weight per ranking signal.
#
# Each weight is read from FactDb.config.ranking; when the ranking section is
# nil or a particular weight is nil/false, the built-in default is used.
def load_ranking_weights
  settings = FactDb.config.ranking

  configured = {
    ts_rank: settings&.ts_rank_weight,
    vector_similarity: settings&.vector_similarity_weight,
    entity_mentions: settings&.entity_mention_weight,
    direct_answer: settings&.direct_answer_weight,
    term_overlap: settings&.term_overlap_weight,
    relationship_match: settings&.relationship_match_weight,
    confidence: settings&.confidence_weight
  }

  fallbacks = {
    ts_rank: 0.25,
    vector_similarity: 0.25,
    entity_mentions: 0.15,
    direct_answer: 0.15,
    term_overlap: 0.10,
    relationship_match: 0.05,
    confidence: 0.05
  }

  # Iterating the fallback table keeps the signal order stable.
  @weights = fallbacks.each_with_object({}) do |(signal, fallback), acc|
    acc[signal] = configured[signal] || fallback
  end

  log_step("Ranking weights loaded", @weights.map { |k, v| "#{k}: #{v}" }) if @verbose
end
|
|
55
|
+
|
|
56
|
+
# Runs the whole pipeline for one query: candidate extraction, entity
# resolution, fact gathering, optional ranking, and output.
#
# When ranking is enabled the scored results are kept in @ranked_results;
# otherwise @ranked_results is nil.
def run(query)
  log_header(query)

  # Step 1: pull potential entity names out of the query text
  names = extract_entity_candidates(query)
  log_step("Entity candidates", names)

  # Step 2: turn the candidate names into entities
  entities = resolve_entities(names)
  log_step("Resolved entities", entities.map { |e| "#{e.name} (#{e.kind})" })

  # Step 3: collect facts using several strategies
  facts = gather_facts(query, entities)
  log_step("Facts gathered", "#{facts.size} facts")

  # Step 4: order the facts by relevance to the query
  @ranked_results = nil
  if @rank
    @ranked_results = rank_facts(query, facts, entities)

    preview = @ranked_results.first(5).map do |entry|
      "#{entry[:score].round(2)}: #{entry[:fact].text[0..60]}..."
    end
    log_step("Top ranked facts", preview)

    facts = @ranked_results.map { |entry| entry[:fact] }

    # Signal breakdown is only shown in verbose mode
    show_signal_breakdown(@ranked_results.first(3)) if @verbose && @ranked_results.any?
  end

  # Step 5: build and print the context
  output_context(query, entities, facts)
end
|
|
87
|
+
|
|
88
|
+
# Prints a per-signal score breakdown for the given ranked results.
#
# Each signal is drawn as a 10-character bar showing how much of its
# configured weight it reached. The bar is clamped to the 0..10 range so a
# signal that exceeds its weight (or is negative) can no longer produce a
# negative repeat count, which raised ArgumentError.
#
# @param ranked_facts [Array<Hash>] entries shaped like { fact:, score:, signals: }
# @return [nil]
def show_signal_breakdown(ranked_facts)
  bar_width = 10

  puts "\n--- Signal Breakdown (Top #{ranked_facts.size}) ---"
  puts " Configured weights: #{@weights.map { |k, v| "#{k}=#{v}" }.join(', ')}"

  ranked_facts.each_with_index do |result, idx|
    fact = result[:fact]
    signals = result[:signals]

    puts "\n#{idx + 1}. \"#{fact.text[0..70]}...\""
    puts " Total Score: #{result[:score].round(3)}"
    puts " Signals:"

    signals.each do |name, value|
      # Signals without a configured weight are drawn against 0.25
      max_weight = @weights[name] || 0.25
      # to_f avoids integer division when both value and weight are Integers
      fill_ratio = max_weight > 0 ? (value.to_f / max_weight) : 0
      bar_length = (fill_ratio * bar_width).round.clamp(0, bar_width)
      bar = "#" * bar_length + "." * (bar_width - bar_length)
      puts " #{name.to_s.ljust(18)} #{value.round(3).to_s.ljust(6)} / #{max_weight.to_s.ljust(4)} |#{bar}|"
    end
  end
  puts
end
|
|
110
|
+
|
|
111
|
+
private
|
|
112
|
+
|
|
113
|
+
# Prepares the demo environment, loads FactDb, points its logger at the null
# device, opens the database connection, and keeps references to the entity
# and fact services used by the rest of this class.
def setup_factdb
  DemoUtilities.ensure_demo_environment!
  DemoUtilities.require_fact_db!

  FactDb.configure do |config|
    # File::NULL is the platform's null device, so this also works where
    # "/dev/null" does not exist.
    config.logger = Logger.new(File::NULL)
  end

  FactDb::Database.establish_connection!

  @facts = FactDb.new
  @entity_service = @facts.entity_service
  @fact_service = @facts.fact_service
end
|
|
127
|
+
|
|
128
|
+
# Collects possible entity names from a query string.
#
# Sources, in order: runs of capitalized words (minus stop words), the owner
# in possessive forms (e.g. "Sapphira's" -> "Sapphira"), double-quoted
# strings, and the phrases returned by extract_key_nouns.
#
# @param query [String]
# @return [Array<String>] de-duplicated candidates in first-seen order
def extract_entity_candidates(query)
  capitalized = query
                .scan(/\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b/)
                .reject { |phrase| stop_words.include?(phrase.downcase) }

  possessive_owners = query.scan(/\b([A-Z][a-z]+)'s\b/).flatten
  quoted_phrases = query.scan(/"([^"]+)"/).flatten

  # Key nouns are added too because lowercase entities might exist
  (capitalized + possessive_owners + quoted_phrases + extract_key_nouns(query)).uniq
end
|
|
154
|
+
|
|
155
|
+
# Applies a fixed list of question patterns to the query and returns the
# text captured by each pattern that matches.
#
# Captures are stripped and lose one trailing "?", "." or "!"; empty
# captures are skipped.
#
# @param query [String]
# @return [Array<String>] unique captures in pattern order
def extract_key_nouns(query)
  question_shapes = [
    /who (?:is|was|were) (.+?)(?:\?|$)/i,
    /what (?:is|was|were|happened to) (.+?)(?:\?|$)/i,
    /where (?:is|was|did) (.+?)(?:\?|$)/i,
    /tell me about (.+?)(?:\?|$)/i,
    /(?:husband|wife|spouse) of (.+?)(?:\?|$)/i,
    /(.+?)'s (?:husband|wife|father|mother|son|daughter)/i
  ]

  captured = question_shapes.each_with_object([]) do |shape, acc|
    hit = query.match(shape)
    next unless hit

    phrase = hit[1].strip.gsub(/[?.!]$/, "")
    acc << phrase unless phrase.empty?
  end

  captured.uniq
end
|
|
178
|
+
|
|
179
|
+
# Question and filler words that are never treated as entity names.
#
# @return [Array<String>] a new array of lowercase words on every call
def stop_words
  "who what where when why how is was were are the a an and or but".split(" ")
end
|
|
182
|
+
|
|
183
|
+
# Maps candidate names to entities.
#
# Each name is first passed to the entity service's resolve; when that
# returns nothing, up to three search results for the name are used instead.
#
# @param candidates [Array<String>]
# @return [Array] entities, unique by id, in first-seen order
def resolve_entities(candidates)
  found = candidates.each_with_object([]) do |candidate, acc|
    direct_hit = @entity_service.resolve(candidate)

    if direct_hit
      acc << direct_hit.entity
    else
      # Fall back to search when resolution finds nothing
      @entity_service.search(candidate, limit: 3).each { |entity| acc << entity }
    end
  end

  found.uniq(&:id)
end
|
|
203
|
+
|
|
204
|
+
# Collects candidate facts for a query using four strategies and returns at
# most @limit of them, unique by id, in the order they were found.
#
# A failing semantic search is treated as "not available" and does not abort
# the run; the error is reported through log_step in verbose mode, as the
# other scoring helpers in this class do, instead of being dropped silently.
#
# @param query [String]
# @param entities [Array] resolved entities (each responds to #id)
# @return [Array] facts
def gather_facts(query, entities)
  facts = []

  # Strategy 1: current facts mentioning the resolved entities
  entities.each do |entity|
    facts.concat(@fact_service.current_facts(entity: entity.id, limit: @limit).to_a)
  end

  # Strategy 2: full-text search on the whole query
  facts.concat(@fact_service.search(query, limit: @limit).to_a)

  # Strategy 3: search for individual key terms from the query
  extract_search_terms(query).each do |term|
    facts.concat(@fact_service.search(term, limit: 5).to_a)
  end

  # Strategy 4: semantic search, if available
  begin
    facts.concat(@fact_service.semantic_search(query, limit: @limit).to_a)
  rescue StandardError => e
    # Semantic search not available (e.g. no embeddings); keep going
    log_step("semantic search error", e.message) if @verbose
  end

  # Deduplicate and limit
  facts.uniq(&:id).first(@limit)
end
|
|
234
|
+
|
|
235
|
+
# Rank facts by relevance to the query.
#
# Seven signals are computed per fact, each scaled by its entry in @weights,
# and summed into a total score.
#
# @param query [String]
# @param facts [Array] facts responding to #id, #text and #confidence
# @param resolved_entities [Array] entities responding to #name and #all_aliases
# @return [Array<Hash>] { fact:, score:, signals: } sorted by score descending
def rank_facts(query, facts, resolved_entities)
  return [] if facts.empty?

  query_terms = extract_query_terms(query)
  # Depends only on the query, so compute it once instead of once per fact
  relationship_terms = extract_relationship_terms(query)
  # uniq: a name that equals one of its aliases must not count as two mentions
  entity_names = resolved_entities
                 .flat_map { |e| [e.name.downcase] + e.all_aliases.map(&:downcase) }
                 .uniq

  # Pre-compute expensive scores for all facts at once
  fact_ids = facts.map(&:id)
  ts_rank_scores = compute_ts_rank_scores(query, fact_ids)
  vector_scores = compute_vector_similarity_scores(query, fact_ids)

  # Two distinct entity mentions are enough to reach the full weight
  mention_increment = @weights[:entity_mentions] / 2.0

  scored_facts = facts.map do |fact|
    signals = {}
    text_lower = fact.text.downcase

    # Signal 1: PostgreSQL full-text relevance, capped at its weight
    ts_score = ts_rank_scores[fact.id] || 0.0
    signals[:ts_rank] = [ts_score * @weights[:ts_rank], @weights[:ts_rank]].min

    # Signal 2: vector similarity from the stored embeddings
    vec_score = vector_scores[fact.id] || 0.0
    signals[:vector_similarity] = vec_score * @weights[:vector_similarity]

    # Signal 3: facts mentioning query entities rank higher
    mention_count = entity_names.count { |name| text_lower.include?(name) }
    signals[:entity_mentions] = [mention_count * mention_increment, @weights[:entity_mentions]].min

    # Signal 4: share of query terms that appear in the fact
    term_matches = query_terms.count { |term| text_lower.include?(term.downcase) }
    signals[:term_overlap] =
      query_terms.empty? ? 0 : (term_matches.to_f / query_terms.size) * @weights[:term_overlap]

    # Signal 5: flat bonus when the fact contains a relationship word from the query
    has_relationship = relationship_terms.any? { |term| text_lower.include?(term.downcase) }
    signals[:relationship_match] = has_relationship ? @weights[:relationship_match] : 0.0

    # Signal 6: direct-answer bonus; score_direct_answer returns at most 0.25,
    # so dividing by 0.25 rescales it to the configured weight
    direct_score = score_direct_answer(query, fact)
    signals[:direct_answer] = direct_score * @weights[:direct_answer] / 0.25

    # Signal 7: the fact's stored confidence (0.5 when missing)
    signals[:confidence] = (fact.confidence || 0.5) * @weights[:confidence]

    { fact: fact, score: signals.values.sum, signals: signals }
  end

  scored_facts.sort_by { |f| -f[:score] }
end
|
|
301
|
+
|
|
302
|
+
# Compute full-text relevance scores for the given facts with PostgreSQL's
# ts_rank_cd (cover density) function.
#
# Scores are divided by the highest score in the batch when that score is
# positive. Any error is reported in verbose mode and yields an empty hash.
#
# @param query [String]
# @param fact_ids [Array] ids of the facts to score
# @return [Hash] { fact_id => score }; empty for blank queries or no ids
def compute_ts_rank_scores(query, fact_ids)
  return {} if fact_ids.empty? || query.strip.empty?

  rank_sql = <<~SQL
    SELECT id,
    ts_rank_cd(to_tsvector('english', text),
    plainto_tsquery('english', ?),
    32) as rank
    FROM fact_db_facts
    WHERE id IN (?)
  SQL

  statement = ActiveRecord::Base.sanitize_sql([rank_sql, query, fact_ids])
  rows = ActiveRecord::Base.connection.execute(statement)

  raw_scores = {}
  rows.each { |row| raw_scores[row["id"]] = row["rank"].to_f }

  # Highest score seen, never below 0.0
  peak = raw_scores.each_value.reduce(0.0) { |best, score| score > best ? score : best }

  # Normalize relative to the best match; leave all-zero batches untouched
  peak > 0 ? raw_scores.transform_values { |score| score / peak } : raw_scores
rescue StandardError => e
  log_step("ts_rank error", e.message) if @verbose
  {}
end
|
|
340
|
+
|
|
341
|
+
# Compute similarity between the query embedding and each fact's stored
# embedding using the `<=>` distance operator in SQL.
#
# Similarity is `1 - distance`, clamped to the 0..1 range. Facts without an
# embedding are not returned. Any error is reported in verbose mode and
# yields an empty hash.
#
# @param query [String]
# @param fact_ids [Array] ids of the facts to score
# @return [Hash] { fact_id => similarity }; empty when there are no ids or
#   no query embedding is available
def compute_vector_similarity_scores(query, fact_ids)
  return {} if fact_ids.empty?

  # Query embedding comes from get_query_embedding (cached there)
  query_vector = get_query_embedding(query)
  return {} unless query_vector

  similarity_sql = <<~SQL
    SELECT id,
    1 - (embedding <=> ?) as similarity
    FROM fact_db_facts
    WHERE id IN (?)
    AND embedding IS NOT NULL
  SQL

  # Vector literal in the "[v1,v2,...]" text form
  vector_literal = "[#{query_vector.join(',')}]"

  statement = ActiveRecord::Base.sanitize_sql([similarity_sql, vector_literal, fact_ids])
  rows = ActiveRecord::Base.connection.execute(statement)

  rows.each_with_object({}) do |row, acc|
    # Clamp to 0-1 (the raw value can be negative or exceed 1)
    acc[row["id"]] = [[row["similarity"].to_f, 0.0].max, 1.0].min
  end
rescue StandardError => e
  log_step("vector similarity error", e.message) if @verbose
  {}
end
|
|
380
|
+
|
|
381
|
+
# Get or generate the embedding for a query.
#
# The embedding is cached together with the query text it was generated for,
# so calling this again with a different query regenerates it instead of
# returning the previous query's vector.
#
# @param query [String]
# @return [Array, nil] the embedding, or nil when no embedding generator is
#   configured or generation fails (failures are reported in verbose mode)
def get_query_embedding(query)
  return @query_embedding if @query_embedding && @query_embedding_source == query

  embedding_generator = FactDb.config.embedding_generator
  return nil unless embedding_generator

  @query_embedding = embedding_generator.call(query)
  @query_embedding_source = query
  log_step("Query embedding", "Generated #{@query_embedding&.size || 0} dimensions") if @verbose
  @query_embedding
rescue StandardError => e
  log_step("embedding error", e.message) if @verbose
  nil
end
|
|
395
|
+
|
|
396
|
+
# Splits a query into lowercase terms for overlap scoring.
#
# Characters other than a-z, whitespace and apostrophes are replaced with
# spaces; one-letter tokens and the listed filler words are dropped.
#
# @param query [String]
# @return [Array<String>] unique terms in order of first appearance
def extract_query_terms(query)
  ignored = %w[who what where when why how is was were are the a an and or but
               to of in for on with at by from as tell me about]

  tokens = query.downcase.gsub(/[^a-z\s']/, " ").split
  meaningful = tokens.select { |token| token.length >= 2 && !ignored.include?(token) }
  meaningful.uniq
end
|
|
406
|
+
|
|
407
|
+
# Returns the relationship-indicating words that occur in the query.
#
# A word counts only when it appears at the start of a word in the query, so
# plurals and inflections still match ("sons", "brothers") while unrelated
# words that merely contain it do not ("person" no longer yields "son").
#
# @param query [String]
# @return [Array<String>] matching words, in the order of the built-in list
def extract_relationship_terms(query)
  relationship_words = %w[husband wife spouse married father mother son daughter
                          brother sister parent child born died killed
                          works worked employed job role position title
                          lives lived location city country
                          member belongs part joined left]

  query_lower = query.downcase
  relationship_words.select { |word| query_lower.match?(/\b#{Regexp.escape(word)}/) }
end
|
|
418
|
+
|
|
419
|
+
# Scores how directly a fact appears to answer the query, using keyword
# patterns for four question shapes.
#
# Returns 0.25 or 0.2 for spouse questions, 0.15 for "what happened" and
# "where" questions, 0.1 for "who are" questions, and 0.0 otherwise.
#
# @param query [String]
# @param fact [#text]
# @return [Float]
def score_direct_answer(query, fact)
  question = query.downcase
  statement = fact.text.downcase

  # "Who is X's husband/wife?" — either spouse word in the query qualifies
  if question.include?("husband") || question.include?("wife")
    # Phrases that define the relationship score highest
    spouse_definitions = [
      /('s|his|her) (husband|wife)\b/,
      /\b(husband|wife) (of|named|was|is)\b/,
      /\bhad a (husband|wife)\b/,
      /\bmarried to\b/
    ]
    return 0.25 if spouse_definitions.any? { |shape| statement.match?(shape) }

    # A plain mention of a spouse term still scores well
    return 0.2 if %w[husband wife married].any? { |word| statement.include?(word) }
  end

  # "What happened to X?" — look for action verbs
  if question.match?(/what happened/) && statement.match?(/died|killed|fell|buried|arrested|healed|spoke/)
    return 0.15
  end

  # "Where was X?" — look for location indicators
  if question.match?(/where (was|is|did)/) && statement.match?(/in |at |to |from |temple|jerusalem|prison|house/)
    return 0.15
  end

  # "Who are the X?" — look for group membership
  return 0.1 if question.match?(/who are/) && statement.match?(/apostle|disciple|believer|member/)

  0.0
end
|
|
454
|
+
|
|
455
|
+
# Builds the list of individual terms to search for.
#
# Relationship and role words are included when they appear at the start of
# a word in the query (so "person" no longer yields "son"), followed by each
# capitalized word from the query that is not a stop word.
#
# @param query [String]
# @return [Array<String>] unique search terms
def extract_search_terms(query)
  relationship_words = %w[husband wife spouse father mother son daughter
                          brother sister married killed died born
                          apostle disciple prophet leader]

  # Downcase once instead of once per candidate word
  query_lower = query.downcase
  matched_words = relationship_words.select do |word|
    query_lower.match?(/\b#{Regexp.escape(word)}/)
  end

  # Capitalized words are treated as entity names
  name_terms = query.scan(/\b[A-Z][a-z]+\b/).reject { |word| stop_words.include?(word.downcase) }

  (matched_words + name_terms).uniq
end
|
|
474
|
+
|
|
475
|
+
# Prints the gathered context in the configured output format.
#
# When there are neither facts nor entities, prints troubleshooting hints
# instead. The hints name the numbered example scripts (090_ingest_demo.rb,
# 070_introspection.rb).
#
# @param query [String]
# @param entities [Array] resolved entities (only checked for emptiness here)
# @param facts [Array] facts to include in the output
def output_context(query, entities, facts)
  if facts.empty? && entities.empty?
    puts "No relevant context found for: #{query}"
    puts "\nTry:"
    puts " - Check if data has been ingested (run 090_ingest_demo.rb first)"
    puts " - Use different search terms"
    puts " - Check available entities with: ruby 070_introspection.rb"
    return
  end

  # Build the QueryResult that the transformers consume
  result = FactDb::QueryResult.new(query: query)
  result.add_facts(facts)
  result.resolve_entities(@entity_service)

  case @format
  when :text
    output_text_context(query, result)
  when :json
    output_json_context(result)
  when :triples
    output_triples_context(result)
  when :cypher
    output_cypher_context(result)
  when :raw
    output_raw_context(result)
  end
end
|
|
503
|
+
|
|
504
|
+
# Prints the result as plain text: a banner with the query, the
# TextTransformer output, and a footer with fact and entity counts.
#
# @param query [String]
# @param result [FactDb::QueryResult]
def output_text_context(query, result)
  heavy_rule = "================================================================================"
  light_rule = "--------------------------------------------------------------------------------"

  # Banner followed by one blank line
  puts "#{heavy_rule}\nCONTEXT FOR QUERY: #{query}\n#{heavy_rule}\n\n"

  puts FactDb::Transformers::TextTransformer.new.transform(result)

  # Footer preceded by one blank line
  summary = "Retrieved #{result.fact_count} facts about #{result.entity_count} entities"
  puts "\n#{light_rule}\n#{summary}\n#{heavy_rule}\n"
end
|
|
522
|
+
|
|
523
|
+
# Prints the result's hash form as pretty-printed JSON.
#
# @param result [#to_h]
def output_json_context(result)
  require "json"

  payload = result.to_h
  puts JSON.pretty_generate(payload)
end
|
|
527
|
+
|
|
528
|
+
# Prints the result as triples, one inspected triple per line, between a
# comment header and a total count.
#
# @param result [FactDb::QueryResult]
def output_triples_context(result)
  triples = FactDb::Transformers::TripleTransformer.new.transform(result)

  puts "# Triples for query: #{result.query}"
  puts "# Format: [subject, predicate, object]"
  puts

  triples.each { |triple| puts triple.inspect }

  puts
  puts "# Total: #{triples.size} triples"
end
|
|
543
|
+
|
|
544
|
+
# Prints the CypherTransformer output for the result, preceded by a comment
# line naming the query.
#
# @param result [FactDb::QueryResult]
def output_cypher_context(result)
  statements = FactDb::Transformers::CypherTransformer.new.transform(result)

  puts "// Cypher statements for query: #{result.query}"
  puts statements
end
|
|
551
|
+
|
|
552
|
+
# Prints the RawTransformer output for the result using amazing_print,
# which is loaded only when this format is requested.
#
# @param result [FactDb::QueryResult]
def output_raw_context(result)
  raw_payload = FactDb::Transformers::RawTransformer.new.transform(result)

  require "amazing_print"
  ap(raw_payload)
end
|
|
559
|
+
|
|
560
|
+
# Prints the verbose-mode banner with the query, output format and limit.
# Does nothing unless verbose mode is on.
#
# @param query [String]
def log_header(query)
  return unless @verbose

  divider = "=" * 70
  puts divider, "Query Context Generator", divider
  puts "Query: #{query}", "Format: #{@format}", "Limit: #{@limit}"
  puts
end
|
|
571
|
+
|
|
572
|
+
# Prints one labelled processing step in verbose mode.
#
# Arrays are printed one element per line (or "(none)" when empty); any
# other value is printed on a single line. A blank line follows.
#
# @param label [String]
# @param value [Array, Object]
def log_step(label, value)
  return unless @verbose

  puts "--- #{label} ---"

  detail_lines =
    if !value.is_a?(Array)
      [" #{value}"]
    elsif value.empty?
      [" (none)"]
    else
      value.map { |item| " - #{item}" }
    end
  detail_lines.each { |line| puts line }

  puts
end
|
|
587
|
+
end
|
|
588
|
+
|
|
589
|
+
# Main execution
|
|
590
|
+
# Command-line entry point: parses options, requires a query argument, and
# runs the generator. Invalid options now print the parser's error and the
# usage text and exit with status 1 instead of ending in a backtrace.
if __FILE__ == $PROGRAM_NAME
  options = { format: :text, verbose: false, limit: 20, rank: true }

  parser = OptionParser.new do |opts|
    opts.banner = "Usage: #{$PROGRAM_NAME} [options] \"query\""

    opts.on("-f", "--format FORMAT", QueryContextGenerator::FORMATS,
            "Output format (#{QueryContextGenerator::FORMATS.join(', ')})") do |f|
      options[:format] = f
    end

    opts.on("-v", "--verbose", "Show detailed processing steps") do
      options[:verbose] = true
    end

    opts.on("-l", "--limit N", Integer, "Maximum facts to return (default: 20)") do |n|
      options[:limit] = n
    end

    opts.on("--no-rank", "Disable relevance ranking") do
      options[:rank] = false
    end

    opts.on("-h", "--help", "Show this help message") do
      puts opts
      puts <<~EXAMPLES

        Examples:
        #{$PROGRAM_NAME} "Who is Sapphira's husband?"
        #{$PROGRAM_NAME} "What happened to Ananias?"
        #{$PROGRAM_NAME} --format triples "Tell me about Peter"
        #{$PROGRAM_NAME} --format json "Who are the apostles?"
        #{$PROGRAM_NAME} --verbose "Where was Stephen martyred?"
        #{$PROGRAM_NAME} --no-rank "Tell me about the apostles"

        Relevance Ranking:
        Facts are ranked using configurable signal weights (defaults shown):
        - ts_rank_weight: 0.25 PostgreSQL full-text search relevance
        - vector_similarity_weight: 0.25 Semantic similarity via pgvector
        - entity_mention_weight: 0.15 Facts mentioning query entities
        - direct_answer_weight: 0.15 Pattern match for query intent
        - term_overlap_weight: 0.10 Query word matches
        - relationship_match_weight: 0.05 Relationship words (husband, etc.)
        - confidence_weight: 0.05 Fact's stored confidence score

        Configure weights in FactDb:
        FactDb.configure do |config|
        config.ranking.ts_rank_weight = 0.30
        config.ranking.vector_similarity_weight = 0.20
        # ... etc
        end

        Or via environment variables:
        FDB_RANKING__TS_RANK_WEIGHT=0.30

        Note: vector_similarity requires embedding_generator to be configured.

        Prerequisites:
        Run 090_ingest_demo.rb to populate the database with facts first:
        ruby 090_ingest_demo.rb acts_esv/

        Environment:
        DATABASE_URL # PostgreSQL connection (default: postgres://$USER@localhost/fact_db_demo)
      EXAMPLES
      exit 0
    end
  end

  begin
    parser.parse!
  rescue OptionParser::ParseError => e
    # Unknown flag, missing argument, or a value outside FORMATS / not an Integer
    puts "Error: #{e.message}"
    puts parser
    exit 1
  end

  if ARGV.empty?
    puts "Error: Please provide a query"
    puts parser
    exit 1
  end

  # Remaining arguments are joined so an unquoted query still works
  query = ARGV.join(" ")
  QueryContextGenerator.new(options).run(query)
end
|