fact_db 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. checksums.yaml +4 -4
  2. data/.envrc +2 -0
  3. data/.yardopts +5 -0
  4. data/CHANGELOG.md +64 -0
  5. data/README.md +107 -6
  6. data/Rakefile +243 -10
  7. data/db/migrate/001_enable_extensions.rb +1 -0
  8. data/db/migrate/002_create_sources.rb +49 -0
  9. data/db/migrate/003_create_entities.rb +27 -15
  10. data/db/migrate/004_create_entity_aliases.rb +20 -7
  11. data/db/migrate/005_create_facts.rb +37 -21
  12. data/db/migrate/006_create_entity_mentions.rb +14 -6
  13. data/db/migrate/007_create_fact_sources.rb +16 -8
  14. data/docs/api/extractors/index.md +5 -5
  15. data/docs/api/extractors/llm.md +17 -17
  16. data/docs/api/extractors/rule-based.md +14 -14
  17. data/docs/api/facts.md +20 -20
  18. data/docs/api/index.md +4 -4
  19. data/docs/api/models/entity.md +21 -21
  20. data/docs/api/models/fact.md +15 -15
  21. data/docs/api/models/index.md +7 -7
  22. data/docs/api/models/{content.md → source.md} +29 -29
  23. data/docs/api/pipeline/extraction.md +25 -25
  24. data/docs/api/pipeline/index.md +1 -1
  25. data/docs/api/pipeline/resolution.md +4 -4
  26. data/docs/api/services/entity-service.md +20 -20
  27. data/docs/api/services/fact-service.md +12 -12
  28. data/docs/api/services/index.md +5 -5
  29. data/docs/api/services/{content-service.md → source-service.md} +27 -27
  30. data/docs/architecture/database-schema.md +46 -46
  31. data/docs/architecture/entity-resolution.md +6 -6
  32. data/docs/architecture/index.md +10 -10
  33. data/docs/architecture/temporal-facts.md +5 -5
  34. data/docs/architecture/three-layer-model.md +17 -17
  35. data/docs/concepts.md +6 -6
  36. data/docs/examples/basic-usage.md +20 -20
  37. data/docs/examples/hr-onboarding.md +17 -17
  38. data/docs/examples/index.md +4 -4
  39. data/docs/examples/news-analysis.md +23 -23
  40. data/docs/getting-started/database-setup.md +28 -20
  41. data/docs/getting-started/index.md +3 -3
  42. data/docs/getting-started/quick-start.md +33 -30
  43. data/docs/guides/batch-processing.md +26 -26
  44. data/docs/guides/configuration.md +158 -77
  45. data/docs/guides/entity-management.md +14 -14
  46. data/docs/guides/extracting-facts.md +28 -28
  47. data/docs/guides/ingesting-content.md +14 -14
  48. data/docs/guides/llm-integration.md +40 -32
  49. data/docs/guides/temporal-queries.md +11 -11
  50. data/docs/index.md +6 -2
  51. data/examples/.envrc +4 -0
  52. data/examples/.gitignore +1 -0
  53. data/examples/001_configuration.rb +312 -0
  54. data/examples/{basic_usage.rb → 010_basic_usage.rb} +47 -56
  55. data/examples/{entity_management.rb → 020_entity_management.rb} +57 -72
  56. data/examples/{temporal_queries.rb → 030_temporal_queries.rb} +39 -59
  57. data/examples/040_output_formats.rb +177 -0
  58. data/examples/{rule_based_extraction.rb → 050_rule_based_extraction.rb} +39 -45
  59. data/examples/060_fluent_temporal_api.rb +217 -0
  60. data/examples/070_introspection.rb +252 -0
  61. data/examples/{hr_system.rb → 080_hr_system.rb} +56 -75
  62. data/examples/090_ingest_demo.rb +515 -0
  63. data/examples/100_query_context.rb +668 -0
  64. data/examples/110_prove_it.rb +204 -0
  65. data/examples/120_dump_database.rb +358 -0
  66. data/examples/130_rag_feedback_loop.rb +858 -0
  67. data/examples/README.md +229 -15
  68. data/examples/data/lincoln_associates.md +201 -0
  69. data/examples/data/lincoln_biography.md +66 -0
  70. data/examples/data/lincoln_cabinet.md +243 -0
  71. data/examples/data/lincoln_family.md +163 -0
  72. data/examples/data/lincoln_military.md +241 -0
  73. data/examples/data/lincoln_todd_family.md +136 -0
  74. data/examples/ingest_reporter.rb +335 -0
  75. data/examples/utilities.rb +182 -0
  76. data/lib/fact_db/config/defaults.yml +254 -0
  77. data/lib/fact_db/config.rb +94 -35
  78. data/lib/fact_db/database.rb +98 -8
  79. data/lib/fact_db/extractors/base.rb +106 -21
  80. data/lib/fact_db/extractors/llm_extractor.rb +35 -63
  81. data/lib/fact_db/extractors/manual_extractor.rb +46 -6
  82. data/lib/fact_db/extractors/rule_based_extractor.rb +136 -25
  83. data/lib/fact_db/llm/adapter.rb +3 -3
  84. data/lib/fact_db/models/entity.rb +94 -22
  85. data/lib/fact_db/models/entity_alias.rb +41 -7
  86. data/lib/fact_db/models/entity_mention.rb +34 -1
  87. data/lib/fact_db/models/fact.rb +259 -28
  88. data/lib/fact_db/models/fact_source.rb +43 -9
  89. data/lib/fact_db/models/source.rb +113 -0
  90. data/lib/fact_db/pipeline/extraction_pipeline.rb +35 -35
  91. data/lib/fact_db/pipeline/resolution_pipeline.rb +5 -5
  92. data/lib/fact_db/query_result.rb +202 -0
  93. data/lib/fact_db/resolution/entity_resolver.rb +139 -39
  94. data/lib/fact_db/resolution/fact_resolver.rb +86 -14
  95. data/lib/fact_db/services/entity_service.rb +246 -37
  96. data/lib/fact_db/services/fact_service.rb +254 -17
  97. data/lib/fact_db/services/source_service.rb +164 -0
  98. data/lib/fact_db/temporal/query.rb +71 -7
  99. data/lib/fact_db/temporal/query_builder.rb +69 -0
  100. data/lib/fact_db/temporal/timeline.rb +102 -11
  101. data/lib/fact_db/transformers/base.rb +77 -0
  102. data/lib/fact_db/transformers/cypher_transformer.rb +185 -0
  103. data/lib/fact_db/transformers/json_transformer.rb +17 -0
  104. data/lib/fact_db/transformers/raw_transformer.rb +35 -0
  105. data/lib/fact_db/transformers/text_transformer.rb +114 -0
  106. data/lib/fact_db/transformers/triple_transformer.rb +138 -0
  107. data/lib/fact_db/validation/alias_filter.rb +185 -0
  108. data/lib/fact_db/version.rb +1 -1
  109. data/lib/fact_db.rb +281 -30
  110. data/mkdocs.yml +2 -2
  111. metadata +60 -16
  112. data/db/migrate/002_create_contents.rb +0 -44
  113. data/lib/fact_db/models/content.rb +0 -62
  114. data/lib/fact_db/services/content_service.rb +0 -93
@@ -0,0 +1,858 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ # RAG Feedback Loop Demo for FactDb
5
+ #
6
+ # This example demonstrates a Retrieval-Augmented Generation (RAG) workflow
7
+ # that feeds LLM-generated knowledge back into the fact database:
8
+ #
9
+ # 1. Takes a user prompt from the command line
10
+ # 2. Retrieves relevant facts from the database as context
11
+ # 3. Enhances the prompt with retrieved context
12
+ # 4. Sends the enhanced prompt to an LLM for processing
13
+ # 5. Ingests the LLM response back into the fact database
14
+ # 6. Reports on new facts discovered from the LLM output
15
+ #
16
+ # This creates a "knowledge compounding" effect where each query can
17
+ # expand the database with synthesized knowledge.
18
+ #
19
+ # Usage:
20
+ # ruby 130_rag_feedback_loop.rb "What are the key events in Acts chapter 5?"
21
+ # ruby 130_rag_feedback_loop.rb --verbose "Tell me about Ananias and Sapphira"
22
+ # ruby 130_rag_feedback_loop.rb --dry-run "Summarize Peter's role in Acts"
23
+ # ruby 130_rag_feedback_loop.rb --context-only "Who was Stephen?"
24
+ #
25
+ # Options:
26
+ # --verbose Show detailed processing steps
27
+ # --dry-run Show enhanced prompt but don't call LLM or ingest
28
+ # --context-only Show retrieved context without LLM processing
29
+ # --limit N Maximum facts to retrieve for context (default: 15)
30
+ # --format FORMAT Context format: text, json (default: text)
+ # --min-relevance PCT Minimum relevance score 0-100 to include a fact (default: 0)
31
+
32
+ require_relative "utilities"
33
+ require "optparse"
34
+ require "set"
35
+ require "debug_me"
36
+ include DebugMe
37
+
38
+ class RagFeedbackLoop
39
+ CONTEXT_FORMATS = %i[text json].freeze
40
+
41
# Builds a demo runner from an options hash and prepares the FactDb services.
#
# @param options [Hash] recognised keys are :verbose, :dry_run, :context_only,
#   :limit, :min_relevance and :format; a missing, nil or false value falls
#   back to the default listed below
def initialize(options = {})
  defaults = {
    verbose: false, dry_run: false, context_only: false,
    limit: 15, min_relevance: 0.0, format: :text
  }
  defaults.each do |key, fallback|
    instance_variable_set(:"@#{key}", options[key] || fallback)
  end

  # Per-run bookkeeping that later steps fill in.
  @stats_before = {}
  @stats_after = {}
  @filtered_count = 0
  @scored_context = []
  @duplicate_facts = []

  setup_factdb
end
55
+
56
# Runs the whole feedback loop for one prompt: retrieve context, build the
# enhanced prompt, call the LLM, ingest the answer, extract facts and report.
#
# In dry-run or context-only mode the method stops after printing the enhanced
# prompt; it also stops early when the LLM returns nothing.
#
# @param prompt [String] the user's question
# @return [Object, nil] the result of demo_footer, or nil on an early exit
def run(prompt)
  log_header(prompt)
  capture_stats_before

  # Step 1: gather whatever the database already knows about the prompt.
  log_step("Step 1", "Retrieving relevant facts from database...")
  context_facts = retrieve_context(prompt)

  if context_facts.empty?
    puts "\nNo existing context found in database for this prompt.",
         "The LLM will respond without fact database context.",
         ""
  else
    log_step("Retrieved Facts", "#{context_facts.size} facts found")
    display_context(context_facts) if @verbose
  end

  # Step 2: wrap the question in the retrieved context.
  log_step("Step 2", "Building enhanced prompt with context...")
  enhanced_prompt = build_enhanced_prompt(prompt, context_facts)

  # Preview modes never reach the LLM or write to the database.
  if @dry_run || @context_only
    display_enhanced_prompt(enhanced_prompt)
    puts "\n[DRY RUN] Skipping LLM call and ingestion" unless @context_only
    return
  end

  # Step 3: ask the LLM.
  log_step("Step 3", "Sending enhanced prompt to LLM...")
  llm_response = call_llm(enhanced_prompt)
  response_missing = llm_response.nil? || llm_response.strip.empty?
  if response_missing
    puts "\nError: LLM returned empty response"
    return
  end

  display_llm_response(llm_response)

  # Step 4: store the answer as a new source record.
  log_step("Step 4", "Ingesting LLM response into fact database...")
  content = ingest_response(prompt, llm_response)

  # Step 5: break the answer into atomic facts.
  log_step("Step 5", "Extracting facts from LLM response...")
  extracted_facts = extract_facts(content, llm_response)

  # Step 6: show what changed in the database.
  capture_stats_after
  report_new_entries(extracted_facts)

  demo_footer("RAG Feedback Loop Complete!")
end
110
+
111
+ private
112
+
113
# Prepares the demo environment, configures FactDb for LLM extraction, opens
# the database connection and caches the service objects used by later steps.
#
# FactDb logging goes to a "<script name>.log" file next to this script; the
# file is opened in write mode, so each run starts with an empty log.
def setup_factdb
  DemoUtilities.ensure_demo_environment!
  DemoUtilities.require_fact_db!

  script_name = File.basename(__FILE__, '.rb')
  log_path = File.join(__dir__, "#{script_name}.log")

  FactDb.configure do |config|
    config.default_extractor = :llm
    config.logger = Logger.new(File.open(log_path, 'w'))

    # The provider comes from the environment and defaults to Anthropic.
    provider_name = ENV.fetch("FACT_DB_LLM_PROVIDER", "anthropic")
    config.llm_client = FactDb::LLM::Adapter.new(provider: provider_name.to_sym)
  end

  FactDb::Database.establish_connection!

  @facts = FactDb.new
  @entity_service = @facts.entity_service
  @fact_service = @facts.fact_service
  @source_service = @facts.source_service
  @extractor = FactDb::Extractors::Base.for(:llm)
  @llm_client = FactDb.config.llm_client
end
137
+
138
# Records the fact, entity and source row counts before the run changes anything.
#
# @return [Hash{Symbol => Integer}] counts keyed by :facts, :entities, :sources
def capture_stats_before
  counted_models = {
    facts: FactDb::Models::Fact,
    entities: FactDb::Models::Entity,
    sources: FactDb::Models::Source
  }
  @stats_before = counted_models.transform_values(&:count)
end
145
+
146
# Records the fact, entity and source row counts after ingestion and extraction.
#
# @return [Hash{Symbol => Integer}] counts keyed by :facts, :entities, :sources
def capture_stats_after
  counted_models = {
    facts: FactDb::Models::Fact,
    entities: FactDb::Models::Entity,
    sources: FactDb::Models::Source
  }
  @stats_after = counted_models.transform_values(&:count)
end
153
+
154
# Collects candidate facts for a prompt from several retrieval strategies,
# removes duplicates and returns the highest-ranked ones.
#
# Side effects: sets @resolved_entities (entities resolved from the prompt)
# and @scored_context (the ranked entries including scores and signals).
#
# @param prompt [String] the user's question
# @return [Array] the ranked facts, at most @limit of them
def retrieve_context(prompt)
  gather_limit = @limit * 3 # over-fetch so the ranking step has candidates to choose from
  candidates = []

  # Strategy 1: facts that mention an entity resolved from the prompt.
  # resolve_entities_enhanced already logs the resolved names in verbose mode,
  # so they are not logged a second time here.
  @resolved_entities = resolve_entities_enhanced(prompt)
  @resolved_entities.each do |entity|
    # Query the model directly so synthesized and corroborated facts are
    # included alongside canonical ones.
    entity_facts = FactDb::Models::Fact
                   .joins(:entity_mentions)
                   .where(entity_mentions: { entity_id: entity.id })
                   .where(status: %w[canonical synthesized corroborated])
                   .currently_valid
                   .limit(gather_limit)
    candidates.concat(entity_facts.to_a)
  end

  # Strategy 2: full-text search on the whole prompt (any status).
  candidates.concat(@fact_service.search(prompt, status: nil, limit: gather_limit).to_a)

  # Strategy 3: full-text search on each key term (any status).
  extract_key_terms(prompt).each do |term|
    candidates.concat(@fact_service.search(term, status: nil, limit: 10).to_a)
  end

  # Strategy 4: semantic search is best-effort; a failure must not abort
  # retrieval, but it is reported in verbose mode instead of being swallowed.
  begin
    semantic_facts = @fact_service.semantic_search(prompt, limit: gather_limit)
    candidates.concat(semantic_facts.to_a) if semantic_facts.any?
  rescue StandardError => e
    log_step("Semantic search unavailable", e.message) if @verbose
  end

  # Rank the de-duplicated candidates; each entry is { fact:, score:, signals: }.
  @scored_context = rank_facts_by_relevance(prompt, candidates.uniq(&:id))
  @scored_context.map { |entry| entry[:fact] }
end
200
+
201
+ # Rank facts by relevance to the query using multiple signals
202
+ # Returns array of hashes with :fact, :score, and :signals keys
203
# Scores each fact against the query and returns the best entries.
#
# The score is the sum of five weighted signals: normalized ts_rank (0.30),
# entity-name hits (capped at 0.25), query-term overlap (0.20), fact
# confidence (0.15, treating nil as 0.5) and a status bonus (up to 0.10).
# When @min_relevance is positive, lower-scoring entries are dropped and
# their number is stored in @filtered_count.
#
# @param query [String] the user's question
# @param facts [Array] candidate facts responding to id, text, confidence, status
# @return [Array<Hash>] at most @limit hashes with :fact, :score and :signals,
#   ordered by descending score
def rank_facts_by_relevance(query, facts)
  return [] if facts.empty?

  query_terms = extract_key_terms(query)
  entity_names = @resolved_entities.flat_map do |entity|
    [entity.name, *entity.all_aliases].map(&:downcase)
  end
  ts_scores = compute_ts_rank_scores(query, facts.map(&:id))
  status_bonus = { "canonical" => 0.10, "corroborated" => 0.08, "synthesized" => 0.05 }

  ranked = facts.map do |fact|
    haystack = fact.text.downcase
    entity_hits = entity_names.count { |name| haystack.include?(name) }
    term_hits = query_terms.count { |term| haystack.include?(term.downcase) }

    signals = {
      ts_rank: (ts_scores[fact.id] || 0.0) * 0.30,
      entity: [entity_hits * 0.125, 0.25].min,
      terms: query_terms.empty? ? 0 : (term_hits.to_f / query_terms.size) * 0.20,
      confidence: (fact.confidence || 0.5) * 0.15,
      status: status_bonus.fetch(fact.status, 0.03)
    }

    { fact: fact, score: signals.values.sum, signals: signals }
  end
  ranked = ranked.sort_by { |entry| -entry[:score] }

  # Optional cut-off; remember how many entries it removed for the display step.
  if @min_relevance > 0
    kept = ranked.select { |entry| entry[:score] >= @min_relevance }
    @filtered_count = ranked.size - kept.size
    ranked = kept
  end

  ranked.first(@limit)
end
258
+
259
+ # Compute PostgreSQL ts_rank scores for full-text search relevance
260
# Asks PostgreSQL for a ts_rank_cd score for each given fact and scales the
# scores so that the best fact gets 1.0.
#
# Any error (it is logged in verbose mode) yields an empty hash, so ranking
# simply proceeds without this signal.
#
# @param query [String] the user's question
# @param fact_ids [Array] ids of the candidate facts
# @return [Hash] fact id => score in 0..1; empty when there is nothing to
#   score or the query failed
def compute_ts_rank_scores(query, fact_ids)
  return {} if fact_ids.empty? || query.strip.empty?

  rank_sql = <<~SQL
    SELECT id,
    ts_rank_cd(to_tsvector('english', text),
    plainto_tsquery('english', ?),
    32) as rank
    FROM fact_db_facts
    WHERE id IN (?)
  SQL

  rows = ActiveRecord::Base.connection.execute(
    ActiveRecord::Base.sanitize_sql([rank_sql, query, fact_ids])
  )

  raw_scores = rows.each_with_object({}) do |row, collected|
    collected[row["id"]] = row["rank"].to_f
  end

  # Scale relative to the best score; leave the values alone if none is positive.
  top_score = [0.0, *raw_scores.values].max
  top_score > 0 ? raw_scores.transform_values { |score| score / top_score } : raw_scores
rescue StandardError => e
  log_step("ts_rank error", e.message) if @verbose
  {}
end
291
+
292
+ def extract_entity_candidates(query)
293
+ candidates = []
294
+
295
+ # Capitalized words/phrases
296
+ query.scan(/\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b/).each do |match|
297
+ candidates << match unless stop_words.include?(match.downcase)
298
+ end
299
+
300
+ # Possessive forms
301
+ query.scan(/\b([A-Z][a-z]+)'s\b/).flatten.each do |match|
302
+ candidates << match
303
+ end
304
+
305
+ # Quoted strings
306
+ query.scan(/"([^"]+)"/).flatten.each do |match|
307
+ candidates << match
308
+ end
309
+
310
+ candidates.uniq
311
+ end
312
+
313
+ # Simplified entity resolution - lets the database do the heavy lifting
314
# Resolves entities for a query by combining three lookups: exact resolution
# of capitalized candidates, a plain entity search on the whole query, and a
# fuzzy search on the whole query. Each step is logged in verbose mode.
#
# @param query [String] the user's question
# @return [Array] matched entities, unique by id
def resolve_entities_enhanced(query)
  # Lookup 1: resolve each capitalized candidate individually.
  candidates = extract_entity_candidates(query)
  log_step("Capitalized candidates", candidates) if @verbose
  resolutions = candidates.map { |name| @entity_service.resolve(name) }
  matches = resolutions.select { |resolution| resolution }.map(&:entity)

  # Lookup 2: entity search on the full query text.
  log_step("Full query search", query) if @verbose
  matches.concat(@entity_service.search(query, limit: 5).to_a)

  # Lookup 3: fuzzy entity search on the full query text.
  log_step("Fuzzy search", query) if @verbose
  matches.concat(@entity_service.fuzzy_search(query, threshold: 0.3, limit: 5))

  unique_matches = matches.uniq(&:id)
  log_step("Resolved entities", unique_matches.map(&:name)) if @verbose
  unique_matches
end
340
+
341
# Reduces a query to its distinctive lowercase words: everything except
# letters, whitespace and apostrophes is blanked out, then words shorter than
# three characters and stop words are dropped.
#
# @param query [String] the user's question
# @return [Array<String>] unique terms in order of first appearance
def extract_key_terms(query)
  words = query.downcase.gsub(/[^a-z\s']/, " ").split
  words.select { |word| word.length >= 3 && !stop_words.include?(word) }.uniq
end
348
+
349
# Common question and function words ignored when looking for entities and
# key terms.
#
# The list is built once and frozen: callers invoke this method once per
# candidate word, so rebuilding the array on every call was wasted work.
#
# @return [Array<String>] frozen list of lowercase stop words
def stop_words
  @stop_words ||= %w[who what where when why how is was were are the a an and or but
                     to of in for on with at by from as tell me about can could would
                     should will shall may might must].freeze
end
354
+
355
# Assembles the text sent to the LLM: the configured RAG system prompt, the
# context section, the user's question and a closing instruction.
#
# @param user_prompt [String] the user's question
# @param context_facts [Array] facts retrieved for the question
# @return [String] the complete prompt, ending with a newline
def build_enhanced_prompt(user_prompt, context_facts)
  context_text = format_context(context_facts)
  system_prompt = FactDb.config.prompts.rag_system

  sections = [
    system_prompt,
    context_section(context_text),
    "",
    "USER QUESTION: #{user_prompt}",
    "",
    "Please provide a comprehensive response with specific details and facts:"
  ]
  "#{sections.join("\n")}\n"
end
368
+
369
# Wraps formatted context in a labelled section for the prompt, or returns a
# fixed notice when there is no context.
#
# @param context_text [String] formatted facts (may be blank)
# @return [String] the context section
def context_section(context_text)
  return "CONTEXT: No existing facts found in the database for this topic." if context_text.strip.empty?

  "CONTEXT FROM FACT DATABASE:\n#{context_text}\n"
end
379
+
380
# Renders facts in the configured context format; anything other than :json
# uses the plain-text renderer.
#
# @param facts [Array] facts to render
# @return [String] formatted context, or "" when there are no facts
def format_context(facts)
  return "" if facts.empty?

  @format == :json ? format_context_json(facts) : format_context_text(facts)
end
390
+
391
# Renders facts as a numbered plain-text list. Each fact gets its text, then
# an optional line naming the mentioned entities with their kind, then an
# optional line with the start of its validity.
#
# @param facts [Array] facts responding to text, entity_mentions and valid_at
# @return [String] the lines joined with newlines
def format_context_text(facts)
  rendered = facts.each_with_index.flat_map do |fact, position|
    entity_labels = fact.entity_mentions
                        .map { |mention| "#{mention.entity.name} (#{mention.entity.kind})" }
                        .uniq

    entry = ["#{position + 1}. #{fact.text}"]
    entry << " Entities: #{entity_labels.join(', ')}" if entity_labels.any?
    entry << " Valid from: #{fact.valid_at}" if fact.valid_at
    entry
  end
  rendered.join("\n")
end
405
+
406
# Renders facts as a pretty-printed JSON array of objects with the keys
# text, entities, valid_at and confidence.
#
# @param facts [Array] facts responding to text, entity_mentions, valid_at, confidence
# @return [String] the JSON document
def format_context_json(facts)
  require "json"

  payload = facts.map do |fact|
    mentioned_names = fact.entity_mentions.map { |mention| mention.entity.name }
    {
      text: fact.text,
      entities: mentioned_names,
      valid_at: fact.valid_at&.to_s,
      confidence: fact.confidence
    }
  end
  JSON.pretty_generate(payload)
end
418
+
419
# Sends a prompt to the configured LLM client while a spinner is shown.
#
# @param prompt [String] the enhanced prompt
# @return [Object, nil] whatever the client's chat call returns; nil when no
#   client is configured or the call raised (an error message is printed)
def call_llm(prompt)
  if @llm_client
    with_spinner("Waiting for LLM response...") { @llm_client.chat(prompt) }
  else
    puts "Error: LLM client not configured"
    puts "Set ANTHROPIC_API_KEY, OPENAI_API_KEY, or configure via FactDb.configure"
    nil
  end
rescue StandardError => e
  debug_me { [:e] }
  puts "Error calling LLM: #{e.message}"
  nil
end
434
+
435
# Stores the LLM response as a new :document source record, tagged as RAG
# synthesis together with the prompt and LLM details.
#
# The title carries at most the first 51 characters of the prompt, and an
# ellipsis is appended only when the prompt was actually cut short.
#
# @param original_prompt [String] the user's question
# @param llm_response [String] the text returned by the LLM
# @return [Object] whatever @source_service.create returns
def ingest_response(original_prompt, llm_response)
  generated_at = Time.now.strftime("%Y%m%d_%H%M%S")
  excerpt = original_prompt[0..50]
  excerpt = "#{excerpt}..." if original_prompt.length > excerpt.length

  @source_service.create(
    llm_response,
    kind: :document,
    title: "RAG Response: #{excerpt}",
    metadata: {
      source_type: "rag_synthesis",
      original_prompt: original_prompt,
      generated_at: generated_at,
      llm_provider: @llm_client&.provider&.to_s,
      llm_model: @llm_client&.model
    }
  )
end
452
+
453
# Extracts atomic facts from the LLM response and stores each new one as a
# :synthesized fact linked to the source record of the response.
#
# Facts that match an existing fact are not stored; they are collected in
# @duplicate_facts instead. A failure on one fact is reported and skipped so
# the remaining facts are still processed; a failure of the extraction call
# itself is reported and ends the method with whatever was stored so far.
#
# @param content [Object] the source record returned by ingest_response
# @param response_text [String] the LLM response text
# @return [Array] the facts that were created
def extract_facts(content, response_text)
  extracted_facts = []
  @duplicate_facts = []
  response_lines = response_text.lines

  begin
    raw_facts = with_spinner("Extracting atomic facts...") do
      @extractor.extract(response_text)
    end

    log_step("Raw extracted facts", "#{raw_facts.size} facts identified") if @verbose

    raw_facts.each do |fact_data|
      # Skip anything the database already knows.
      existing_fact = find_duplicate_fact(fact_data[:text])
      if existing_fact
        @duplicate_facts << { extracted_text: fact_data[:text], existing_fact: existing_fact }
        next
      end

      # Resolve (or create) an entity for every mention in the fact.
      mentions = (fact_data[:mentions] || []).map do |mention_data|
        entity = @entity_service.resolve_or_create(
          mention_data[:name],
          kind: mention_data[:kind] || :concept,
          aliases: mention_data[:aliases] || [],
          description: "Extracted from RAG synthesis"
        )

        {
          entity_id: entity.id,
          role: mention_data[:role] || determine_role(mention_data[:type]),
          text: mention_data[:name]
        }
      end

      # Locate the response lines the fact most likely came from.
      line_info = find_source_lines(fact_data[:text], response_lines, mentions)

      fact = @fact_service.create(
        fact_data[:text],
        valid_at: fact_data[:valid_at] || Date.today,
        invalid_at: fact_data[:invalid_at],
        extraction_method: :llm,
        status: :synthesized,
        confidence: (fact_data[:confidence] || 0.7) * 0.9, # slight discount for synthesized facts
        mentions: mentions.uniq { |m| m[:entity_id] },
        metadata: {
          source_type: "rag_synthesis",
          extraction_timestamp: Time.now.iso8601,
          line_start: line_info[:line_start],
          line_end: line_info[:line_end]
        }
      )

      # Link the fact to the source record passed in as `content`.
      fact.add_source(source: content, kind: :primary, confidence: 0.9)
      extracted_facts << fact
    rescue StandardError => e
      # Best effort: report the failure and continue with the other facts.
      debug_me { [:fact_data, :e] } if @verbose
      log_step("Skipped fact", "#{e.class}: #{e.message}")
    end
  rescue StandardError => e
    debug_me { [:e] }
    puts "Error during fact extraction: #{e.message}"
  end

  extracted_facts
end
525
+
526
+ # Find an existing fact that matches the given text (duplicate detection)
527
# Looks for an already stored fact that says essentially the same thing.
#
# Texts whose normalized form is shorter than 10 characters are never treated
# as duplicates. Otherwise up to 10 search hits are compared and the first one
# with a similarity above 0.85 is returned.
#
# @param text [String] the newly extracted fact text
# @return [Object, nil] the matching fact, or nil when none is similar enough
def find_duplicate_fact(text)
  wanted = normalize_text(text)
  return nil if wanted.length < 10

  @fact_service.search(text, status: nil, limit: 10).find do |candidate|
    text_similarity(wanted, normalize_text(candidate.text)) > 0.85
  end
end
540
+
541
# Canonicalizes text for comparison: lowercase, only letters, digits and
# whitespace kept, and every run of whitespace collapsed to a single space
# with none at either end.
#
# @param text [String] text to normalize
# @return [String] the normalized text
def normalize_text(text)
  alphanumeric = text.downcase.gsub(/[^a-z0-9\s]/, "")
  alphanumeric.split.join(" ")
end
547
+
548
# Word-overlap similarity of two texts: the number of distinct words they
# share divided by the number of distinct words in either.
#
# @param text1 [String] first text
# @param text2 [String] second text
# @return [Float] 1.0 for identical strings, 0.0 when either is empty or no
#   words are present, otherwise the overlap ratio
def text_similarity(text1, text2)
  return 1.0 if text1 == text2
  return 0.0 if text1.empty? || text2.empty?

  left_words = text1.split.uniq
  right_words = text2.split.uniq
  all_words = left_words | right_words
  return 0.0 if all_words.empty?

  (left_words & right_words).size.to_f / all_words.size
end
560
+
561
+ # Find the line numbers in the source text where a fact most likely originated
562
# Estimates which lines of the source text a fact came from.
#
# A line matches when it contains at least one key term of the fact
# (case-insensitive). The result spans from the first to the last matching
# line, widened by one line on each side and clamped to the document. When
# there are no lines, no key terms or no matching lines, the range
# 1..source_lines.length is returned.
#
# @param text [String] the fact text
# @param source_lines [Array<String>] lines of the source text
# @param mentions [Array<Hash>] mention hashes whose :text is used as a key term
# @return [Hash] { line_start:, line_end: } as 1-based line numbers
def find_source_lines(text, source_lines, mentions)
  whole_document = { line_start: 1, line_end: source_lines.length }
  return whole_document if source_lines.empty?

  key_terms = extract_fact_key_terms(text, mentions)
  return whole_document if key_terms.empty?

  needles = key_terms.map(&:downcase)
  hit_indexes = source_lines.each_index.select do |index|
    haystack = source_lines[index].downcase
    needles.any? { |needle| haystack.include?(needle) }
  end
  return whole_document if hit_indexes.empty?

  # Indexes are 0-based: first line number - 1 == first index,
  # last line number + 1 == last index + 2.
  {
    line_start: [hit_indexes.first, 1].max,
    line_end: [hit_indexes.last + 2, source_lines.length].min
  }
end
594
+
595
# Builds the search terms used to locate a fact in its source text: the
# mention texts first, followed by the significant words of the fact itself
# (lowercase, at least four letters, not a common function word).
#
# @param text [String] the fact text
# @param mentions [Array<Hash>] mention hashes; their :text values are used
# @return [Array<String>] unique, non-empty terms
def extract_fact_key_terms(text, mentions)
  ignored_words = %w[a an the is was were are been being have has had do does did
                     will would could should may might must shall can to of in for
                     on with at by from as into through during before after]

  mention_terms = mentions.map { |mention| mention[:text] }.select { |term| term }

  significant_words = text.downcase
                          .gsub(/[^a-z\s]/, " ")
                          .split
                          .select { |word| word.length >= 4 && !ignored_words.include?(word) }
                          .uniq

  (mention_terms + significant_words).compact.uniq.reject(&:empty?)
end
617
+
618
# Picks a default mention role for an entity type when the extractor did not
# supply one. Unknown types (and "person") map to :subject.
#
# @param entity_type [#to_s] the entity type reported by the extractor
# @return [Symbol] :subject, :location, :object or :temporal
def determine_role(entity_type)
  roles_by_type = {
    "person" => :subject,
    "place" => :location,
    "organization" => :object,
    "event" => :temporal
  }
  roles_by_type.fetch(entity_type.to_s, :subject)
end
627
+
628
# Prints the closing report: row-count deltas, the facts created in this run,
# the duplicates that were skipped and the newest entities.
#
# Deltas come from @stats_before and @stats_after; duplicates from
# @duplicate_facts. New entities are read back from the database as the most
# recently created rows, one per added entity.
#
# @param extracted_facts [Array] facts created during this run
def report_new_entries(extracted_facts)
  rule = "=" * 60
  puts "\n#{rule}", "NEW DATABASE ENTRIES", rule

  facts_added, entities_added, sources_added = %i[facts entities sources].map do |key|
    @stats_after[key] - @stats_before[key]
  end
  duplicates = @duplicate_facts || []

  puts "\nSummary:"
  puts " Source records added: #{sources_added}"
  puts " Facts extracted: #{facts_added}"
  puts " Duplicates skipped: #{duplicates.size}" if duplicates.size > 0
  puts " Entities discovered: #{entities_added}"

  if extracted_facts.any?
    puts "\nExtracted Facts:"
    extracted_facts.each do |fact|
      ellipsis = fact.text.length > 90 ? "..." : ""
      puts "\n [ID: #{fact.id}] #{fact.text[0..90]}#{ellipsis}"

      mentioned_names = fact.entity_mentions.map { |mention| mention.entity.name }
      puts " Entities: #{mentioned_names.join(', ')}" if mentioned_names.any?
      puts " Confidence: #{(fact.confidence * 100).round(1)}%"
      puts " Status: #{fact.status}"
    end
  else
    puts "\n No new facts were extracted from the LLM response."
  end

  if duplicates.any?
    puts "\nDuplicate Facts (skipped):"
    duplicates.each do |duplicate|
      stored = duplicate[:existing_fact]
      puts "\n [DUP of ID: #{stored.id}] #{duplicate[:extracted_text][0..80]}..."
      puts " Existing: #{stored.text[0..80]}..."
    end
  end

  if entities_added > 0
    puts "\nNew Entities Discovered:"
    FactDb::Models::Entity.order(created_at: :desc).limit(entities_added).each do |entity|
      puts " - #{entity.name} (#{entity.kind})"
    end
  end

  puts
end
683
+
684
# Prints the ranked context held in @scored_context, one line per fact with
# its score as a percentage, followed by a note when facts were filtered out.
#
# @param facts [Array] accepted for the caller's convenience; the listing is
#   driven by @scored_context
def display_context(facts)
  puts "\n--- Retrieved Context ---"
  @scored_context.each.with_index(1) do |entry, position|
    percent = (entry[:score] * 100).round(1)
    puts " #{position}. [#{percent}%] #{entry[:fact].text[0..70]}..."
  end
  puts " (#{@filtered_count} facts filtered out due to low relevance)" if @filtered_count > 0
  puts
end
696
+
697
# Prints the enhanced prompt between banner lines.
#
# @param prompt [String] the enhanced prompt
def display_enhanced_prompt(prompt)
  rule = "=" * 60
  puts "\n#{rule}", "ENHANCED PROMPT", rule, prompt, rule
end
704
+
705
# Prints the LLM response between banner lines.
#
# @param response [String] the text returned by the LLM
def display_llm_response(response)
  rule = "=" * 60
  puts "\n#{rule}", "LLM RESPONSE", rule, response, rule
end
712
+
713
# Runs the given block while a background thread animates a status line on
# stdout; the line is cleared when the block finishes, whether it returned
# or raised.
#
# @param message [String] text shown next to the animation
# @yield the work to wait for
# @return [Object] the block's return value
def with_spinner(message)
  frames = %w[... .. . .. ...]
  active = true

  animator = Thread.new do
    tick = 0
    while active
      print "\r #{frames[tick % frames.length]} #{message} "
      $stdout.flush
      sleep 0.3
      tick += 1
    end
  end

  begin
    yield
  ensure
    # Stop the animation, wait for its current frame to end, then blank the line.
    active = false
    animator.join
    print "\r#{' ' * (message.length + 15)}\r"
    $stdout.flush
  end
end
739
+
740
# Prints the run banner: the prompt, the active mode, the context limit and,
# when a threshold is set, the minimum relevance as a percentage.
#
# @param prompt [String] the user's question
def log_header(prompt)
  rule = "=" * 60
  puts rule, "RAG Feedback Loop Demo", rule
  puts "Prompt: #{prompt}", "Mode: #{mode_description}", "Context limit: #{@limit} facts"
  puts "Min relevance: #{(@min_relevance * 100).round(1)}%" if @min_relevance > 0
  puts
end
750
+
751
# Human-readable name of the active mode; context-only takes precedence over
# dry-run.
#
# @return [String] "Context Only", "Dry Run" or "Full Pipeline"
def mode_description
  if @context_only
    "Context Only"
  elsif @dry_run
    "Dry Run"
  else
    "Full Pipeline"
  end
end
757
+
758
# Prints a progress message.
#
# A non-array value is always printed on one line as "[label] value". An
# array is printed only in verbose mode, as a headed bullet list ("(none)"
# when it is empty).
#
# @param label [String] short name of the step or list
# @param value [Object, Array] the message, or the items to list
def log_step(label, value)
  unless value.is_a?(Array)
    puts " [#{label}] #{value}"
    return
  end
  return unless @verbose

  puts "\n--- #{label} ---"
  if value.empty?
    puts " (none)"
  else
    value.each { |item| puts " - #{item}" }
  end
end
772
+ end
773
+
774
+ # Main execution
775
# Command-line entry point: parse the options, require a prompt, run the loop.
if __FILE__ == $PROGRAM_NAME
  options = { verbose: false, dry_run: false, context_only: false, limit: 15, min_relevance: 0.0, format: :text }

  parser = OptionParser.new do |opts|
    opts.banner = "Usage: #{$PROGRAM_NAME} [options] \"prompt\""

    opts.on("-v", "--verbose", "Show detailed processing steps") do
      options[:verbose] = true
    end

    opts.on("-d", "--dry-run", "Show enhanced prompt without calling LLM") do
      options[:dry_run] = true
    end

    opts.on("-c", "--context-only", "Show retrieved context without LLM processing") do
      options[:context_only] = true
    end

    opts.on("-l", "--limit N", Integer, "Maximum facts for context (default: 15)") do |n|
      options[:limit] = n
    end

    # Given as a percentage on the command line, stored as a 0-1 fraction.
    opts.on("-m", "--min-relevance PERCENT", Float,
            "Minimum relevance score 0-100 to include fact (default: 0)") do |p|
      options[:min_relevance] = p / 100.0
    end

    opts.on("-f", "--format FORMAT", RagFeedbackLoop::CONTEXT_FORMATS,
            "Context format: text, json (default: text)") do |f|
      options[:format] = f
    end

    opts.on("-h", "--help", "Show this help message") do
      puts opts
      puts <<~EXAMPLES

        Examples:
        #{$PROGRAM_NAME} "What are the key events in Acts chapter 5?"
        #{$PROGRAM_NAME} --verbose "Tell me about Ananias and Sapphira"
        #{$PROGRAM_NAME} --dry-run "Summarize Peter's role in Acts"
        #{$PROGRAM_NAME} --context-only "Who was Stephen?"
        #{$PROGRAM_NAME} --min-relevance 15 "Tell me about the Comanche"
        #{$PROGRAM_NAME} --format json "What happened to the apostles?"

        Workflow:
        1. Retrieves relevant facts from the database based on your prompt
        2. Builds an enhanced prompt with the retrieved context
        3. Sends the enhanced prompt to the configured LLM
        4. Ingests the LLM response as new content in the database
        5. Extracts atomic facts from the response using LLM extraction
        6. Reports on all new facts and entities added to the database

        Knowledge Compounding:
        Each query potentially adds new synthesized knowledge to the database.
        Subsequent queries can then leverage this expanded knowledge base,
        creating a compounding effect where the system becomes more knowledgeable
        over time.

        Prerequisites:
        - Run 090_ingest_demo.rb first to populate the database with initial facts
        - Configure LLM provider (ANTHROPIC_API_KEY, OPENAI_API_KEY, etc.)

        Environment:
        FACT_DB_LLM_PROVIDER # LLM provider (anthropic, openai, gemini, ollama)
        ANTHROPIC_API_KEY # API key for Anthropic (default provider)
        OPENAI_API_KEY # API key for OpenAI
        DATABASE_URL # PostgreSQL connection URL
      EXAMPLES
      exit 0
    end
  end

  # An unknown flag or a malformed value is a usage error, not a crash:
  # report it briefly and exit non-zero instead of printing a stack trace.
  begin
    parser.parse!
  rescue OptionParser::ParseError => e
    warn "Error: #{e.message}"
    warn "Run '#{$PROGRAM_NAME} --help' for more information"
    exit 1
  end

  if ARGV.empty?
    puts "Error: Please provide a prompt"
    puts "Usage: #{$PROGRAM_NAME} [options] \"your prompt here\""
    puts "Run '#{$PROGRAM_NAME} --help' for more information"
    exit 1
  end

  # Everything left after option parsing is the prompt.
  prompt = ARGV.join(" ")
  RagFeedbackLoop.new(**options).run(prompt)
end