fact_db 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. checksums.yaml +4 -4
  2. data/.envrc +2 -0
  3. data/.yardopts +5 -0
  4. data/CHANGELOG.md +64 -0
  5. data/README.md +107 -6
  6. data/Rakefile +243 -10
  7. data/db/migrate/001_enable_extensions.rb +1 -0
  8. data/db/migrate/002_create_sources.rb +49 -0
  9. data/db/migrate/003_create_entities.rb +27 -15
  10. data/db/migrate/004_create_entity_aliases.rb +20 -7
  11. data/db/migrate/005_create_facts.rb +37 -21
  12. data/db/migrate/006_create_entity_mentions.rb +14 -6
  13. data/db/migrate/007_create_fact_sources.rb +16 -8
  14. data/docs/api/extractors/index.md +5 -5
  15. data/docs/api/extractors/llm.md +17 -17
  16. data/docs/api/extractors/rule-based.md +14 -14
  17. data/docs/api/facts.md +20 -20
  18. data/docs/api/index.md +4 -4
  19. data/docs/api/models/entity.md +21 -21
  20. data/docs/api/models/fact.md +15 -15
  21. data/docs/api/models/index.md +7 -7
  22. data/docs/api/models/{content.md → source.md} +29 -29
  23. data/docs/api/pipeline/extraction.md +25 -25
  24. data/docs/api/pipeline/index.md +1 -1
  25. data/docs/api/pipeline/resolution.md +4 -4
  26. data/docs/api/services/entity-service.md +20 -20
  27. data/docs/api/services/fact-service.md +12 -12
  28. data/docs/api/services/index.md +5 -5
  29. data/docs/api/services/{content-service.md → source-service.md} +27 -27
  30. data/docs/architecture/database-schema.md +46 -46
  31. data/docs/architecture/entity-resolution.md +6 -6
  32. data/docs/architecture/index.md +10 -10
  33. data/docs/architecture/temporal-facts.md +5 -5
  34. data/docs/architecture/three-layer-model.md +17 -17
  35. data/docs/concepts.md +6 -6
  36. data/docs/examples/basic-usage.md +20 -20
  37. data/docs/examples/hr-onboarding.md +17 -17
  38. data/docs/examples/index.md +4 -4
  39. data/docs/examples/news-analysis.md +23 -23
  40. data/docs/getting-started/database-setup.md +28 -20
  41. data/docs/getting-started/index.md +3 -3
  42. data/docs/getting-started/quick-start.md +33 -30
  43. data/docs/guides/batch-processing.md +26 -26
  44. data/docs/guides/configuration.md +158 -77
  45. data/docs/guides/entity-management.md +14 -14
  46. data/docs/guides/extracting-facts.md +28 -28
  47. data/docs/guides/ingesting-content.md +14 -14
  48. data/docs/guides/llm-integration.md +40 -32
  49. data/docs/guides/temporal-queries.md +11 -11
  50. data/docs/index.md +6 -2
  51. data/examples/.envrc +4 -0
  52. data/examples/.gitignore +1 -0
  53. data/examples/001_configuration.rb +312 -0
  54. data/examples/{basic_usage.rb → 010_basic_usage.rb} +47 -56
  55. data/examples/{entity_management.rb → 020_entity_management.rb} +57 -72
  56. data/examples/{temporal_queries.rb → 030_temporal_queries.rb} +39 -59
  57. data/examples/040_output_formats.rb +177 -0
  58. data/examples/{rule_based_extraction.rb → 050_rule_based_extraction.rb} +39 -45
  59. data/examples/060_fluent_temporal_api.rb +217 -0
  60. data/examples/070_introspection.rb +252 -0
  61. data/examples/{hr_system.rb → 080_hr_system.rb} +56 -75
  62. data/examples/090_ingest_demo.rb +515 -0
  63. data/examples/100_query_context.rb +668 -0
  64. data/examples/110_prove_it.rb +204 -0
  65. data/examples/120_dump_database.rb +358 -0
  66. data/examples/130_rag_feedback_loop.rb +858 -0
  67. data/examples/README.md +229 -15
  68. data/examples/data/lincoln_associates.md +201 -0
  69. data/examples/data/lincoln_biography.md +66 -0
  70. data/examples/data/lincoln_cabinet.md +243 -0
  71. data/examples/data/lincoln_family.md +163 -0
  72. data/examples/data/lincoln_military.md +241 -0
  73. data/examples/data/lincoln_todd_family.md +136 -0
  74. data/examples/ingest_reporter.rb +335 -0
  75. data/examples/utilities.rb +182 -0
  76. data/lib/fact_db/config/defaults.yml +254 -0
  77. data/lib/fact_db/config.rb +94 -35
  78. data/lib/fact_db/database.rb +98 -8
  79. data/lib/fact_db/extractors/base.rb +106 -21
  80. data/lib/fact_db/extractors/llm_extractor.rb +35 -63
  81. data/lib/fact_db/extractors/manual_extractor.rb +46 -6
  82. data/lib/fact_db/extractors/rule_based_extractor.rb +136 -25
  83. data/lib/fact_db/llm/adapter.rb +3 -3
  84. data/lib/fact_db/models/entity.rb +94 -22
  85. data/lib/fact_db/models/entity_alias.rb +41 -7
  86. data/lib/fact_db/models/entity_mention.rb +34 -1
  87. data/lib/fact_db/models/fact.rb +259 -28
  88. data/lib/fact_db/models/fact_source.rb +43 -9
  89. data/lib/fact_db/models/source.rb +113 -0
  90. data/lib/fact_db/pipeline/extraction_pipeline.rb +35 -35
  91. data/lib/fact_db/pipeline/resolution_pipeline.rb +5 -5
  92. data/lib/fact_db/query_result.rb +202 -0
  93. data/lib/fact_db/resolution/entity_resolver.rb +139 -39
  94. data/lib/fact_db/resolution/fact_resolver.rb +86 -14
  95. data/lib/fact_db/services/entity_service.rb +246 -37
  96. data/lib/fact_db/services/fact_service.rb +254 -17
  97. data/lib/fact_db/services/source_service.rb +164 -0
  98. data/lib/fact_db/temporal/query.rb +71 -7
  99. data/lib/fact_db/temporal/query_builder.rb +69 -0
  100. data/lib/fact_db/temporal/timeline.rb +102 -11
  101. data/lib/fact_db/transformers/base.rb +77 -0
  102. data/lib/fact_db/transformers/cypher_transformer.rb +185 -0
  103. data/lib/fact_db/transformers/json_transformer.rb +17 -0
  104. data/lib/fact_db/transformers/raw_transformer.rb +35 -0
  105. data/lib/fact_db/transformers/text_transformer.rb +114 -0
  106. data/lib/fact_db/transformers/triple_transformer.rb +138 -0
  107. data/lib/fact_db/validation/alias_filter.rb +185 -0
  108. data/lib/fact_db/version.rb +1 -1
  109. data/lib/fact_db.rb +281 -30
  110. data/mkdocs.yml +2 -2
  111. metadata +60 -16
  112. data/db/migrate/002_create_contents.rb +0 -44
  113. data/lib/fact_db/models/content.rb +0 -62
  114. data/lib/fact_db/services/content_service.rb +0 -93
@@ -0,0 +1,668 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ # Query Context Generator for FactDb
5
+ #
6
+ # Takes a natural language query and generates context from the facts database
7
+ # suitable for LLM consumption.
8
+ #
9
+ # Usage:
10
+ # ruby 100_query_context.rb "Who is Sapphira's husband?"
11
+ # ruby 100_query_context.rb "What happened to Ananias?"
12
+ # ruby 100_query_context.rb --format triples "Tell me about Peter"
13
+ # ruby 100_query_context.rb --format json "Who are the apostles?"
14
+ # ruby 100_query_context.rb --verbose "Where was Stephen martyred?"
15
+ #
16
+ # Options:
17
+ # --format FORMAT Output format: text (default), json, triples, cypher, raw
18
+ # --verbose Show detailed processing steps
19
+ # --limit N Maximum number of facts to return (default: 20)
20
+
21
+ require_relative "utilities"
22
+ require "optparse"
23
+
24
+ # Note: CLI tool - uses cli_setup! which does NOT reset database
25
+
26
+ class QueryContextGenerator
27
+ FORMATS = %i[text json triples cypher raw].freeze
28
+
29
# Builds a generator from an options hash.
#
# Recognised keys are :format (defaults to :text), :verbose (defaults to
# false), :limit (defaults to 20) and :rank (ranking stays enabled unless the
# value is exactly false). Construction also runs setup_factdb and
# load_ranking_weights.
def initialize(options = {})
  requested_format, verbose_flag, max_facts = options.values_at(:format, :verbose, :limit)

  @format = requested_format || :text
  @verbose = verbose_flag || false
  @limit = max_facts || 20
  @rank = !(options[:rank] == false) # anything but an explicit false keeps ranking on
  @query_embedding = nil # filled lazily the first time an embedding is needed

  setup_factdb
  load_ranking_weights
end
38
+
39
# Populates @weights with one weight per ranking signal.
#
# Each weight is read from FactDb.config.ranking; a missing ranking section or
# a nil/false individual setting falls back to the built-in default.
def load_ranking_weights
  ranking = FactDb.config.ranking

  fallbacks = {
    ts_rank: 0.25,
    vector_similarity: 0.25,
    entity_mentions: 0.15,
    direct_answer: 0.15,
    term_overlap: 0.10,
    relationship_match: 0.05,
    confidence: 0.05
  }

  configured = {
    ts_rank: ranking&.ts_rank_weight,
    vector_similarity: ranking&.vector_similarity_weight,
    entity_mentions: ranking&.entity_mention_weight,
    direct_answer: ranking&.direct_answer_weight,
    term_overlap: ranking&.term_overlap_weight,
    relationship_match: ranking&.relationship_match_weight,
    confidence: ranking&.confidence_weight
  }

  # A configured value wins only when it is truthy.
  @weights = fallbacks.merge(configured) { |_signal, fallback, chosen| chosen || fallback }

  log_step("Ranking weights loaded", @weights.map { |k, v| "#{k}: #{v}" }) if @verbose
end
55
+
56
# Runs the whole pipeline for one query and prints the resulting context.
#
# Steps: find entity name candidates, resolve them, gather facts, optionally
# rank them (stored in @ranked_results), then emit the context in the
# configured format. Intermediate results are reported through log_step.
def run(query)
  log_header(query)

  # 1. Candidate entity names found in the query text
  name_candidates = extract_entity_candidates(query)
  log_step("Entity candidates", name_candidates)

  # 2. Candidates resolved to known entities
  entities = resolve_entities(name_candidates)
  log_step("Resolved entities", entities.map { |e| "#{e.name} (#{e.kind})" })

  # 3. Facts collected by every retrieval strategy
  facts = gather_facts(query, entities)
  log_step("Facts gathered", "#{facts.size} facts")

  # 4. Optional relevance ranking
  @ranked_results = nil
  if @rank
    @ranked_results = rank_facts(query, facts, entities)
    preview = @ranked_results.first(5).map { |f| "#{f[:score].round(2)}: #{f[:fact].text[0..60]}..." }
    log_step("Top ranked facts", preview)
    facts = @ranked_results.map { |f| f[:fact] }

    # The per-signal breakdown is only shown in verbose mode
    show_signal_breakdown(@ranked_results.first(3)) if @verbose && @ranked_results.any?
  end

  # 5. Render
  output_context(query, entities, facts)
end
87
+
88
# Prints a per-signal score breakdown for the given ranked results.
#
# Each signal is drawn as a ten-character bar showing how much of its
# configured weight it used. The bar length is clamped to 0..10 so that a
# signal exceeding its weight (or a negative one) cannot produce a negative
# String repeat count, which would raise ArgumentError.
#
# @param ranked_facts [Array<Hash>] entries with :fact, :score and :signals keys
# @return [nil]
def show_signal_breakdown(ranked_facts)
  puts "\n--- Signal Breakdown (Top #{ranked_facts.size}) ---"
  puts " Configured weights: #{@weights.map { |k, v| "#{k}=#{v}" }.join(', ')}"

  ranked_facts.each_with_index do |result, idx|
    fact = result[:fact]
    signals = result[:signals]

    puts "\n#{idx + 1}. \"#{fact.text[0..70]}...\""
    puts " Total Score: #{result[:score].round(3)}"
    puts " Signals:"

    signals.each do |name, value|
      # Signals without a configured weight are drawn against 0.25
      max_weight = @weights[name] || 0.25
      fill_ratio = max_weight > 0 ? (value / max_weight) : 0
      bar_length = (fill_ratio * 10).round.clamp(0, 10)
      bar = "#" * bar_length + "." * (10 - bar_length)
      puts " #{name.to_s.ljust(18)} #{value.round(3).to_s.ljust(6)} / #{max_weight.to_s.ljust(4)} |#{bar}|"
    end
  end
  puts
end
110
+
111
+ private
112
+
113
# Prepares the FactDb environment and caches the service objects used later.
#
# Calls the DemoUtilities bootstrap helpers, silences FactDb logging, opens
# the database connection and stores the facade plus its entity and fact
# services in instance variables.
def setup_factdb
  DemoUtilities.ensure_demo_environment!
  DemoUtilities.require_fact_db!

  FactDb.configure do |config|
    # File::NULL is the platform's null device, so this also works off Unix
    config.logger = Logger.new(File::NULL)
  end

  FactDb::Database.establish_connection!

  @facts = FactDb.new
  @entity_service = @facts.entity_service
  @fact_service = @facts.fact_service
end
127
+
128
# Collects strings from the query that might name an entity.
#
# Sources, in order: runs of capitalised words (minus stop words), the owner
# in a possessive ("Sapphira's" -> "Sapphira"), double-quoted phrases, and the
# question objects found by extract_key_nouns. Duplicates are removed while
# keeping first-seen order.
def extract_entity_candidates(query)
  ignored = stop_words

  proper_nouns = query.scan(/\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b/)
                      .reject { |phrase| ignored.include?(phrase.downcase) }
  possessive_owners = query.scan(/\b([A-Z][a-z]+)'s\b/).flatten
  quoted_phrases = query.scan(/"([^"]+)"/).flatten
  question_objects = extract_key_nouns(query) # may surface lowercase entities

  (proper_nouns + possessive_owners + quoted_phrases + question_objects).uniq
end
154
+
155
# Pulls the object of common question shapes out of the query.
#
# Every pattern is tried; the first capture group of each match is stripped,
# loses one trailing "?", "." or "!", and is kept when non-empty.
# Returns the unique captures in pattern order.
def extract_key_nouns(query)
  question_shapes = [
    /who (?:is|was|were) (.+?)(?:\?|$)/i,
    /what (?:is|was|were|happened to) (.+?)(?:\?|$)/i,
    /where (?:is|was|did) (.+?)(?:\?|$)/i,
    /tell me about (.+?)(?:\?|$)/i,
    /(?:husband|wife|spouse) of (.+?)(?:\?|$)/i,
    /(.+?)'s (?:husband|wife|father|mother|son|daughter)/i
  ]

  question_shapes
    .map { |shape| query[shape, 1] } # nil when the shape does not match
    .compact
    .map { |captured| captured.strip.gsub(/[?.!]$/, "") }
    .reject(&:empty?)
    .uniq
end
178
+
179
# Lowercase words that are never treated as entity names or search terms.
def stop_words
  question_words = %w[who what where when why how]
  linking_verbs = %w[is was were are]
  articles_and_conjunctions = %w[the a an and or but]

  question_words + linking_verbs + articles_and_conjunctions
end
182
+
183
# Turns candidate names into entities.
#
# An exact resolution through the entity service is preferred; when it
# returns nothing, up to three search hits are used instead. The result is
# de-duplicated by entity id.
def resolve_entities(candidates)
  found = candidates.each_with_object([]) do |candidate, collected|
    exact = @entity_service.resolve(candidate)
    hits = exact ? [exact.entity] : @entity_service.search(candidate, limit: 3)
    hits.each { |entity| collected << entity }
  end

  found.uniq(&:id)
end
203
+
204
# Collects candidate facts for the query using several retrieval strategies.
#
# Strategies, in order: current facts for each resolved entity, full-text
# search on the whole query, full-text search on each key term, and semantic
# search. Semantic search is best-effort: a failure is reported through
# log_step in verbose mode and otherwise ignored.
#
# @param query [String] the user's question
# @param entities [Array<#id>] resolved entities
# @return [Array] unique facts (by id), at most @limit of them
def gather_facts(query, entities)
  facts = []

  # Strategy 1: facts attached to the resolved entities
  entities.each do |entity|
    facts.concat(@fact_service.current_facts(entity: entity.id, limit: @limit).to_a)
  end

  # Strategy 2: full-text search on the whole query
  facts.concat(@fact_service.search(query, limit: @limit).to_a)

  # Strategy 3: full-text search on individual key terms
  extract_search_terms(query).each do |term|
    facts.concat(@fact_service.search(term, limit: 5).to_a)
  end

  # Strategy 4: semantic search, which may be unavailable
  begin
    semantic_facts = @fact_service.semantic_search(query, limit: @limit)
    facts.concat(semantic_facts.to_a) if semantic_facts.any?
  rescue StandardError => e
    log_step("semantic search unavailable", e.message) if @verbose
  end

  facts.uniq(&:id).first(@limit)
end
234
+
235
# Ranks facts by relevance to the query.
#
# Seven weighted signals are summed per fact: PostgreSQL ts_rank, vector
# similarity, entity mentions, query-term overlap, relationship-word match,
# direct-answer pattern match and the fact's stored confidence. Everything
# that depends only on the query is computed once, before the per-fact loop.
#
# @param query [String] the user's question
# @param facts [Array<#id, #text, #confidence>] facts to score
# @param resolved_entities [Array<#name, #all_aliases>] entities found in the query
# @return [Array<Hash>] { fact:, score:, signals: } sorted by score descending
def rank_facts(query, facts, resolved_entities)
  return [] if facts.empty?

  query_terms = extract_query_terms(query)
  relationship_terms = extract_relationship_terms(query)
  entity_names = resolved_entities.flat_map { |e| [e.name.downcase] + e.all_aliases.map(&:downcase) }

  # Database-backed scores are fetched for all facts in one call each
  fact_ids = facts.map(&:id)
  ts_rank_scores = compute_ts_rank_scores(query, fact_ids)
  vector_scores = compute_vector_similarity_scores(query, fact_ids)

  scored_facts = facts.map do |fact|
    signals = {}
    text_lower = fact.text.downcase

    # Signal 1: full-text relevance, capped at its weight
    ts_score = ts_rank_scores[fact.id] || 0.0
    signals[:ts_rank] = [ts_score * @weights[:ts_rank], @weights[:ts_rank]].min

    # Signal 2: embedding similarity
    vec_score = vector_scores[fact.id] || 0.0
    signals[:vector_similarity] = vec_score * @weights[:vector_similarity]

    # Signal 3: entity mentions; two mentions already reach the full weight
    entity_mention_score = 0.0
    mention_increment = @weights[:entity_mentions] / 2.0
    entity_names.each do |name|
      entity_mention_score += mention_increment if text_lower.include?(name)
    end
    signals[:entity_mentions] = [entity_mention_score, @weights[:entity_mentions]].min

    # Signal 4: fraction of query terms present in the fact
    term_matches = query_terms.count { |term| text_lower.include?(term.downcase) }
    term_score = query_terms.empty? ? 0 : (term_matches.to_f / query_terms.size) * @weights[:term_overlap]
    signals[:term_overlap] = term_score

    # Signal 5: flat bonus when any relationship word from the query appears
    rel_matches = relationship_terms.count { |term| text_lower.include?(term.downcase) }
    signals[:relationship_match] = rel_matches > 0 ? @weights[:relationship_match] : 0.0

    # Signal 6: direct-answer heuristic, rescaled from its 0.25 maximum
    direct_score = score_direct_answer(query, fact)
    signals[:direct_answer] = direct_score * @weights[:direct_answer] / 0.25

    # Signal 7: stored confidence, 0.5 when the fact has none
    signals[:confidence] = (fact.confidence || 0.5) * @weights[:confidence]

    { fact: fact, score: signals.values.sum, signals: signals }
  end

  scored_facts.sort_by { |f| -f[:score] }
end
301
+
302
# Scores the given facts with PostgreSQL full-text ranking (ts_rank_cd).
#
# Returns { fact_id => score } with scores divided by the largest one, so the
# best match is 1.0. An empty id list or blank query yields {}. Any error is
# logged in verbose mode and also yields {}.
def compute_ts_rank_scores(query, fact_ids)
  return {} if fact_ids.empty? || query.strip.empty?

  # ts_rank_cd is the cover-density variant, which favours phrase matches
  rank_sql = <<~SQL
    SELECT id,
    ts_rank_cd(to_tsvector('english', text),
    plainto_tsquery('english', ?),
    32) as rank
    FROM fact_db_facts
    WHERE id IN (?)
  SQL

  connection = ActiveRecord::Base.connection
  rows = connection.execute(ActiveRecord::Base.sanitize_sql([rank_sql, query, fact_ids]))

  raw_scores = {}
  rows.each { |row| raw_scores[row["id"]] = row["rank"].to_f }

  # Without a positive maximum there is nothing to normalise against
  peak = raw_scores.values.max
  return raw_scores unless peak && peak > 0

  raw_scores.transform_values { |score| score / peak }
rescue StandardError => e
  log_step("ts_rank error", e.message) if @verbose
  {}
end
340
+
341
# Scores the given facts by embedding similarity to the query via pgvector.
#
# Returns { fact_id => similarity } with each value limited to 0.0..1.0.
# Yields {} when there are no ids, when no query embedding is available, or
# when an error occurs (logged in verbose mode).
def compute_vector_similarity_scores(query, fact_ids)
  return {} if fact_ids.empty?

  embedding = get_query_embedding(query) # cached by that method
  return {} unless embedding

  # <=> is pgvector's cosine distance (0 identical .. 2 opposite);
  # similarity is taken as 1 - distance.
  similarity_sql = <<~SQL
    SELECT id,
    1 - (embedding <=> ?) as similarity
    FROM fact_db_facts
    WHERE id IN (?)
    AND embedding IS NOT NULL
  SQL

  # pgvector accepts a vector written as "[v1,v2,...]"
  vector_literal = "[#{embedding.join(',')}]"

  connection = ActiveRecord::Base.connection
  rows = connection.execute(ActiveRecord::Base.sanitize_sql([similarity_sql, vector_literal, fact_ids]))

  rows.each_with_object({}) do |row, similarities|
    # Opposite vectors give a negative similarity; keep the value in 0..1
    similarities[row["id"]] = row["similarity"].to_f.clamp(0.0, 1.0)
  end
rescue StandardError => e
  log_step("vector similarity error", e.message) if @verbose
  {}
end
380
+
381
# Returns the embedding for the query, generating it on first use.
#
# The cache is tied to the query text it was generated for, so running the
# same generator with a different query produces a fresh embedding instead of
# reusing a stale one.
#
# @param query [String] text to embed
# @return [Array, nil] the embedding, or nil when no embedding generator is
#   configured or generation fails (the failure is logged in verbose mode)
def get_query_embedding(query)
  return @query_embedding if @query_embedding && @query_embedding_text == query

  embedding_generator = FactDb.config.embedding_generator
  return nil unless embedding_generator

  fresh_embedding = embedding_generator.call(query)
  # Record the text only after a successful call so a failure is retried
  @query_embedding_text = query
  @query_embedding = fresh_embedding
  log_step("Query embedding", "Generated #{@query_embedding&.size || 0} dimensions") if @verbose
  @query_embedding
rescue StandardError => e
  log_step("embedding error", e.message) if @verbose
  nil
end
395
+
396
# Breaks the query into unique lowercase terms worth matching against facts.
#
# Everything except letters, whitespace and apostrophes is replaced by a
# space; stop words and single-character words are dropped.
def extract_query_terms(query)
  ignored = %w[who what where when why how is was were are the a an and or but
               to of in for on with at by from as tell me about]

  words = query.downcase.gsub(/[^a-z\s']/, " ").split
  words.select { |word| word.length >= 2 && !ignored.include?(word) }.uniq
end
406
+
407
# Returns the relationship-indicating words that occur in the query.
#
# Matching is a case-insensitive substring test, so a word is also reported
# when it appears inside a longer word. Results follow vocabulary order.
def extract_relationship_terms(query)
  family = %w[husband wife spouse married father mother son daughter
              brother sister parent child]
  life_events = %w[born died killed]
  employment = %w[works worked employed job role position title]
  places = %w[lives lived location city country]
  membership = %w[member belongs part joined left]

  haystack = query.downcase
  (family + life_events + employment + places + membership).select { |term| haystack.include?(term) }
end
418
+
419
# Scores how directly a fact appears to answer the kind of question asked.
#
# Hand-written heuristics keyed on the query's wording; the first rule that
# fires wins. Location prepositions are matched as whole words, so "that " or
# "into " no longer count as "at " / "to ".
#
# @param query [String] the user's question
# @param fact [#text] fact whose text is inspected
# @return [Float] 0.25, 0.2, 0.15, 0.1 or 0.0
def score_direct_answer(query, fact)
  query_lower = query.downcase
  fact_lower = fact.text.downcase

  # "Who is X's husband/wife?" -> spouse relationships; either term counts,
  # because the fact may be phrased from the other spouse's side.
  if query_lower.include?("husband") || query_lower.include?("wife")
    defines_spouse = [
      /('s|his|her) (husband|wife)\b/,
      /\b(husband|wife) (of|named|was|is)\b/,
      /\bhad a (husband|wife)\b/,
      /\bmarried to\b/
    ].any? { |pattern| fact_lower.match?(pattern) }
    return 0.25 if defines_spouse

    # A looser mention of a spouse term still scores well
    return 0.2 if %w[husband wife married].any? { |word| fact_lower.include?(word) }
  end

  # "What happened to X?" -> event verbs
  if query_lower.match?(/what happened/)
    return 0.15 if fact_lower.match?(/died|killed|fell|buried|arrested|healed|spoke/)
  end

  # "Where was X?" -> location prepositions (whole words) or place nouns
  if query_lower.match?(/where (was|is|did)/)
    return 0.15 if fact_lower.match?(/\b(?:in|at|to|from)\b|temple|jerusalem|prison|house/)
  end

  # "Who are the X?" -> group membership
  if query_lower.match?(/who are/)
    return 0.1 if fact_lower.match?(/apostle|disciple|believer|member/)
  end

  0.0
end
454
+
455
# Builds the list of individual terms to run through full-text search.
#
# Relationship/role keywords found in the query (case-insensitive substring
# match) come first, followed by capitalised words that are not stop words.
# Duplicates are removed.
def extract_search_terms(query)
  keywords = %w[husband wife spouse father mother son daughter
                brother sister married killed died born
                apostle disciple prophet leader]

  lowered_query = query.downcase
  keyword_hits = keywords.select { |keyword| lowered_query.include?(keyword) }

  ignored = stop_words
  name_hits = query.scan(/\b[A-Z][a-z]+\b/).reject { |word| ignored.include?(word.downcase) }

  (keyword_hits + name_hits).uniq
end
474
+
475
# Prints the gathered context in the configured output format.
#
# When neither facts nor entities were found, a short troubleshooting hint is
# printed instead. Otherwise the facts are wrapped in a FactDb::QueryResult
# and handed to the writer for @format; an unrecognised format prints nothing.
def output_context(query, entities, facts)
  if facts.empty? && entities.empty?
    puts [
      "No relevant context found for: #{query}",
      "\nTry:",
      " - Check if data has been ingested (run ingest_demo.rb first)",
      " - Use different search terms",
      " - Check available entities with: ruby introspection.rb"
    ]
    return
  end

  # The transformers consume a QueryResult rather than bare facts
  query_result = FactDb::QueryResult.new(query: query)
  query_result.add_facts(facts)
  query_result.resolve_entities(@entity_service)

  case @format
  when :text    then output_text_context(query, query_result)
  when :json    then output_json_context(query_result)
  when :triples then output_triples_context(query_result)
  when :cypher  then output_cypher_context(query_result)
  when :raw     then output_raw_context(query_result)
  end
end
503
+
504
# Prints the result as plain text: a banner naming the query, the
# TextTransformer rendering, and a footer with fact and entity counts.
def output_text_context(query, result)
  banner = <<~HEADER
    ================================================================================
    CONTEXT FOR QUERY: #{query}
    ================================================================================

  HEADER
  puts banner

  rendered = FactDb::Transformers::TextTransformer.new.transform(result)
  puts rendered

  summary = <<~FOOTER

    --------------------------------------------------------------------------------
    Retrieved #{result.fact_count} facts about #{result.entity_count} entities
    ================================================================================
  FOOTER
  puts summary
end
522
+
523
# Prints the result's hash form as pretty-printed JSON.
def output_json_context(result)
  require "json" # loaded lazily; only this output format needs it

  payload = result.to_h
  puts JSON.pretty_generate(payload)
end
527
+
528
# Prints the result as triples, one inspected triple per line, framed by
# comment lines naming the query and the total count.
def output_triples_context(result)
  triples = FactDb::Transformers::TripleTransformer.new.transform(result)

  puts "# Triples for query: #{result.query}", "# Format: [subject, predicate, object]", ""

  triples.each { |triple| puts triple.inspect }

  puts "", "# Total: #{triples.size} triples"
end
543
+
544
# Prints the CypherTransformer rendering of the result, preceded by a
# comment line naming the query.
def output_cypher_context(result)
  statements = FactDb::Transformers::CypherTransformer.new.transform(result)

  puts "// Cypher statements for query: #{result.query}"
  puts statements
end
551
+
552
# Pretty-prints the RawTransformer output with amazing_print.
def output_raw_context(result)
  raw_structure = FactDb::Transformers::RawTransformer.new.transform(result)

  require "amazing_print" # loaded lazily; only this output format needs it
  ap raw_structure
end
559
+
560
# In verbose mode, prints a banner with the query, output format and limit.
def log_header(query)
  return unless @verbose

  rule = "=" * 70
  puts rule,
       "Query Context Generator",
       rule,
       "Query: #{query}",
       "Format: #{@format}",
       "Limit: #{@limit}",
       ""
end
571
+
572
# In verbose mode, prints a labelled processing step.
#
# Array values are listed one item per line ("(none)" when empty); any other
# value is printed on a single line. A blank line follows every step.
def log_step(label, value)
  return unless @verbose

  puts "--- #{label} ---"
  case value
  when Array
    puts " (none)" if value.empty?
    value.each { |item| puts " - #{item}" }
  else
    puts " #{value}"
  end
  puts
end
587
+ end
588
+
589
# Command-line entry point; runs only when this file is executed directly.
#
# Parses the options, reports malformed options (unknown flag, unsupported
# format, non-integer or non-positive limit) as a one-line error plus usage on
# stderr with exit status 1, then joins the remaining arguments into the query
# and runs the generator.
if __FILE__ == $PROGRAM_NAME
  options = { format: :text, verbose: false, limit: 20, rank: true }

  parser = OptionParser.new do |opts|
    opts.banner = "Usage: #{$PROGRAM_NAME} [options] \"query\""

    opts.on("-f", "--format FORMAT", QueryContextGenerator::FORMATS,
            "Output format (#{QueryContextGenerator::FORMATS.join(', ')})") do |f|
      options[:format] = f
    end

    opts.on("-v", "--verbose", "Show detailed processing steps") do
      options[:verbose] = true
    end

    opts.on("-l", "--limit N", Integer, "Maximum facts to return (default: 20)") do |n|
      # A zero limit returns nothing and a negative one makes Array#first raise
      raise OptionParser::InvalidArgument, "must be a positive integer" unless n.positive?

      options[:limit] = n
    end

    opts.on("--no-rank", "Disable relevance ranking") do
      options[:rank] = false
    end

    opts.on("-h", "--help", "Show this help message") do
      puts opts
      puts <<~EXAMPLES

        Examples:
        #{$PROGRAM_NAME} "Who is Sapphira's husband?"
        #{$PROGRAM_NAME} "What happened to Ananias?"
        #{$PROGRAM_NAME} --format triples "Tell me about Peter"
        #{$PROGRAM_NAME} --format json "Who are the apostles?"
        #{$PROGRAM_NAME} --verbose "Where was Stephen martyred?"
        #{$PROGRAM_NAME} --no-rank "Tell me about the apostles"

        Relevance Ranking:
        Facts are ranked using configurable signal weights (defaults shown):
        - ts_rank_weight: 0.25 PostgreSQL full-text search relevance
        - vector_similarity_weight: 0.25 Semantic similarity via pgvector
        - entity_mention_weight: 0.15 Facts mentioning query entities
        - direct_answer_weight: 0.15 Pattern match for query intent
        - term_overlap_weight: 0.10 Query word matches
        - relationship_match_weight: 0.05 Relationship words (husband, etc.)
        - confidence_weight: 0.05 Fact's stored confidence score

        Configure weights in FactDb:
        FactDb.configure do |config|
        config.ranking.ts_rank_weight = 0.30
        config.ranking.vector_similarity_weight = 0.20
        # ... etc
        end

        Or via environment variables:
        FDB_RANKING__TS_RANK_WEIGHT=0.30

        Note: vector_similarity requires embedding_generator to be configured.

        Prerequisites:
        Run ingest_demo.rb to populate the database with facts first:
        ruby ingest_demo.rb acts_esv/

        Environment:
        DATABASE_URL # PostgreSQL connection (default: postgres://$USER@localhost/fact_db_demo)
      EXAMPLES
      exit 0
    end
  end

  begin
    parser.parse!
  rescue OptionParser::ParseError => e
    # Show a readable message instead of a stack trace for bad options
    warn "Error: #{e.message}"
    warn parser.to_s
    exit 1
  end

  if ARGV.empty?
    puts "Error: Please provide a query"
    puts parser
    exit 1
  end

  query = ARGV.join(" ")
  QueryContextGenerator.new(options).run(query)
end