fact_db 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. checksums.yaml +7 -0
  2. data/.envrc +1 -0
  3. data/CHANGELOG.md +48 -0
  4. data/COMMITS.md +196 -0
  5. data/README.md +102 -0
  6. data/Rakefile +41 -0
  7. data/db/migrate/001_enable_extensions.rb +7 -0
  8. data/db/migrate/002_create_contents.rb +44 -0
  9. data/db/migrate/003_create_entities.rb +36 -0
  10. data/db/migrate/004_create_entity_aliases.rb +18 -0
  11. data/db/migrate/005_create_facts.rb +65 -0
  12. data/db/migrate/006_create_entity_mentions.rb +18 -0
  13. data/db/migrate/007_create_fact_sources.rb +18 -0
  14. data/docs/api/extractors/index.md +71 -0
  15. data/docs/api/extractors/llm.md +162 -0
  16. data/docs/api/extractors/manual.md +92 -0
  17. data/docs/api/extractors/rule-based.md +165 -0
  18. data/docs/api/facts.md +300 -0
  19. data/docs/api/index.md +66 -0
  20. data/docs/api/models/content.md +165 -0
  21. data/docs/api/models/entity.md +202 -0
  22. data/docs/api/models/fact.md +270 -0
  23. data/docs/api/models/index.md +77 -0
  24. data/docs/api/pipeline/extraction.md +175 -0
  25. data/docs/api/pipeline/index.md +72 -0
  26. data/docs/api/pipeline/resolution.md +209 -0
  27. data/docs/api/services/content-service.md +166 -0
  28. data/docs/api/services/entity-service.md +202 -0
  29. data/docs/api/services/fact-service.md +223 -0
  30. data/docs/api/services/index.md +55 -0
  31. data/docs/architecture/database-schema.md +293 -0
  32. data/docs/architecture/entity-resolution.md +293 -0
  33. data/docs/architecture/index.md +149 -0
  34. data/docs/architecture/temporal-facts.md +268 -0
  35. data/docs/architecture/three-layer-model.md +242 -0
  36. data/docs/assets/css/custom.css +137 -0
  37. data/docs/assets/fact_db.jpg +0 -0
  38. data/docs/assets/images/fact_db.jpg +0 -0
  39. data/docs/concepts.md +183 -0
  40. data/docs/examples/basic-usage.md +235 -0
  41. data/docs/examples/hr-onboarding.md +312 -0
  42. data/docs/examples/index.md +64 -0
  43. data/docs/examples/news-analysis.md +288 -0
  44. data/docs/getting-started/database-setup.md +170 -0
  45. data/docs/getting-started/index.md +71 -0
  46. data/docs/getting-started/installation.md +98 -0
  47. data/docs/getting-started/quick-start.md +191 -0
  48. data/docs/guides/batch-processing.md +325 -0
  49. data/docs/guides/configuration.md +243 -0
  50. data/docs/guides/entity-management.md +364 -0
  51. data/docs/guides/extracting-facts.md +299 -0
  52. data/docs/guides/index.md +22 -0
  53. data/docs/guides/ingesting-content.md +252 -0
  54. data/docs/guides/llm-integration.md +299 -0
  55. data/docs/guides/temporal-queries.md +315 -0
  56. data/docs/index.md +121 -0
  57. data/examples/README.md +130 -0
  58. data/examples/basic_usage.rb +164 -0
  59. data/examples/entity_management.rb +216 -0
  60. data/examples/hr_system.rb +428 -0
  61. data/examples/rule_based_extraction.rb +258 -0
  62. data/examples/temporal_queries.rb +245 -0
  63. data/lib/fact_db/config.rb +71 -0
  64. data/lib/fact_db/database.rb +45 -0
  65. data/lib/fact_db/errors.rb +10 -0
  66. data/lib/fact_db/extractors/base.rb +117 -0
  67. data/lib/fact_db/extractors/llm_extractor.rb +179 -0
  68. data/lib/fact_db/extractors/manual_extractor.rb +53 -0
  69. data/lib/fact_db/extractors/rule_based_extractor.rb +228 -0
  70. data/lib/fact_db/llm/adapter.rb +109 -0
  71. data/lib/fact_db/models/content.rb +62 -0
  72. data/lib/fact_db/models/entity.rb +84 -0
  73. data/lib/fact_db/models/entity_alias.rb +26 -0
  74. data/lib/fact_db/models/entity_mention.rb +33 -0
  75. data/lib/fact_db/models/fact.rb +192 -0
  76. data/lib/fact_db/models/fact_source.rb +35 -0
  77. data/lib/fact_db/pipeline/extraction_pipeline.rb +146 -0
  78. data/lib/fact_db/pipeline/resolution_pipeline.rb +129 -0
  79. data/lib/fact_db/resolution/entity_resolver.rb +261 -0
  80. data/lib/fact_db/resolution/fact_resolver.rb +259 -0
  81. data/lib/fact_db/services/content_service.rb +93 -0
  82. data/lib/fact_db/services/entity_service.rb +150 -0
  83. data/lib/fact_db/services/fact_service.rb +193 -0
  84. data/lib/fact_db/temporal/query.rb +125 -0
  85. data/lib/fact_db/temporal/timeline.rb +134 -0
  86. data/lib/fact_db/version.rb +5 -0
  87. data/lib/fact_db.rb +141 -0
  88. data/mkdocs.yml +198 -0
  89. metadata +288 -0
@@ -0,0 +1,245 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ # Temporal Queries Example for FactDb
5
+ #
6
+ # This example demonstrates:
7
+ # - Creating facts with temporal bounds (valid_at / invalid_at)
8
+ # - Querying facts at specific points in time
9
+ # - Superseding facts (replacing old with new)
10
+ # - Detecting fact changes over time
11
+ # - Building temporal diffs (facts added / removed / unchanged between two dates)
12
+
13
+ require "bundler/setup"
14
+ require "fact_db"
15
+
16
+ FactDb.configure do |config|
17
+ config.database_url = ENV.fetch("DATABASE_URL", "postgres://#{ENV['USER']}@localhost/fact_db_demo")
18
+ end
19
+
20
+ # Ensure database tables exist (runs the gem's migrations, connecting first if needed)
21
+ FactDb::Database.migrate!
22
+
23
+ clock = FactDb.new # object exposing the entity and fact services used below
24
+ entity_service = clock.entity_service
25
+ fact_service = clock.fact_service
26
+
27
+ puts "=" * 60
28
+ puts "FactDb Temporal Queries Demo"
29
+ puts "=" * 60
30
+
31
+ # Setup: Create entities for our scenario (one organization, three people)
32
+ puts "\n--- Setup: Creating Entities ---\n"
33
+
34
+ company = entity_service.create(
35
+ "TechCorp Ltd",
36
+ type: :organization,
37
+ description: "Technology company"
38
+ )
39
+
40
+ ceo = entity_service.create(
41
+ "Alice Chen",
42
+ type: :person,
43
+ description: "Executive"
44
+ )
45
+
46
+ new_ceo = entity_service.create(
47
+ "David Park",
48
+ type: :person,
49
+ description: "Executive"
50
+ )
51
+
52
+ cfo = entity_service.create(
53
+ "Sarah Miller",
54
+ type: :person,
55
+ description: "Finance executive"
56
+ )
57
+
58
+ puts "Created entities: #{company.canonical_name}, #{ceo.canonical_name}, #{new_ceo.canonical_name}, #{cfo.canonical_name}"
59
+
60
+ # Section 1: Creating Temporal Facts
61
+ puts "\n--- Section 1: Creating Temporal Facts ---\n"
62
+
63
+ # Fact with open-ended validity (still true): only valid_at is given
64
+ fact1 = fact_service.create(
65
+ "TechCorp Ltd is headquartered in Austin, Texas",
66
+ valid_at: Date.new(2015, 1, 1),
67
+ mentions: [{ entity_id: company.id, role: :subject, text: "TechCorp Ltd" }]
68
+ )
69
+ puts "Created: #{fact1.fact_text}"
70
+ puts " Valid: #{fact1.valid_at} - present"
71
+
72
+ # Fact with closed validity (historical): both valid_at and invalid_at are given
73
+ fact2 = fact_service.create(
74
+ "Alice Chen is CEO of TechCorp Ltd",
75
+ valid_at: Date.new(2018, 3, 1),
76
+ invalid_at: Date.new(2024, 12, 31),
77
+ mentions: [
78
+ { entity_id: ceo.id, role: :subject, text: "Alice Chen" },
79
+ { entity_id: company.id, role: :object, text: "TechCorp Ltd" }
80
+ ]
81
+ )
82
+ puts "\nCreated: #{fact2.fact_text}"
83
+ puts " Valid: #{fact2.valid_at} - #{fact2.invalid_at}"
84
+
85
+ # Current CEO (starts the day after Alice Chen's tenure ends; no invalid_at, so open-ended)
86
+ fact3 = fact_service.create(
87
+ "David Park is CEO of TechCorp Ltd",
88
+ valid_at: Date.new(2025, 1, 1),
89
+ mentions: [
90
+ { entity_id: new_ceo.id, role: :subject, text: "David Park" },
91
+ { entity_id: company.id, role: :object, text: "TechCorp Ltd" }
92
+ ]
93
+ )
94
+ puts "\nCreated: #{fact3.fact_text}"
95
+ puts " Valid: #{fact3.valid_at} - present"
96
+
97
+ # Another current fact (open-ended; its start date falls inside Alice Chen's tenure)
98
+ fact4 = fact_service.create(
99
+ "Sarah Miller is CFO of TechCorp Ltd",
100
+ valid_at: Date.new(2020, 6, 15),
101
+ mentions: [
102
+ { entity_id: cfo.id, role: :subject, text: "Sarah Miller" },
103
+ { entity_id: company.id, role: :object, text: "TechCorp Ltd" }
104
+ ]
105
+ )
106
+ puts "\nCreated: #{fact4.fact_text}"
107
+ puts " Valid: #{fact4.valid_at} - present"
108
+
109
+ # Section 2: Point-in-Time Queries
110
+ puts "\n--- Section 2: Point-in-Time Queries ---\n"
111
+
112
+ # Query facts valid at different dates (three fixed dates plus today)
113
+ dates_to_query = [
114
+ Date.new(2019, 6, 1), # Alice was CEO
115
+ Date.new(2024, 6, 1), # Alice still CEO
116
+ Date.new(2025, 6, 1), # David is CEO
117
+ Date.today
118
+ ]
119
+
120
+ dates_to_query.each do |date|
121
+ puts "\nFacts about TechCorp on #{date}:"
122
+ facts = fact_service.facts_at(date, entity: company.id)
123
+ facts.each do |fact|
124
+ puts " - #{fact.fact_text}"
125
+ end
126
+ end
127
+
128
+ # Section 3: Current vs Historical Facts
129
+ puts "\n--- Section 3: Current vs Historical Facts ---\n"
130
+
131
+ puts "Currently valid facts about TechCorp:"
132
+ current = fact_service.current_facts(entity: company.id)
133
+ current.each { |f| puts " - #{f.fact_text}" }
134
+
135
+ puts "\nAll historical facts:"
136
+ FactDb::Models::Fact.historical.each do |fact| # NOTE(review): unlike current_facts above, no entity filter is passed here
137
+ puts " - #{fact.fact_text} (ended: #{fact.invalid_at})"
138
+ end
139
+
140
+ # Section 4: Superseding Facts
141
+ puts "\n--- Section 4: Superseding Facts ---\n"
142
+
143
+ # Company valuation that changes over time
144
+ valuation_2020 = fact_service.create(
145
+ "TechCorp Ltd has a market valuation of $500 million",
146
+ valid_at: Date.new(2020, 1, 1),
147
+ mentions: [{ entity_id: company.id, role: :subject, text: "TechCorp Ltd" }]
148
+ )
149
+ puts "Created valuation fact: #{valuation_2020.fact_text}"
150
+
151
+ # Supersede with new valuation (pass the old fact's id plus the replacement text and its start date)
152
+ valuation_2023 = fact_service.supersede(
153
+ valuation_2020.id,
154
+ "TechCorp Ltd has a market valuation of $1.2 billion",
155
+ valid_at: Date.new(2023, 1, 1),
156
+ mentions: [{ entity_id: company.id, role: :subject, text: "TechCorp Ltd" }]
157
+ )
158
+ puts "\nSuperseded with: #{valuation_2023.fact_text}"
159
+
160
+ # Check the old fact status (reload first so the values printed below come from the stored record)
161
+ valuation_2020.reload
162
+ puts "\nOriginal fact status: #{valuation_2020.status}"
163
+ puts "Original fact now invalid at: #{valuation_2020.invalid_at}"
164
+
165
+ # Section 5: Temporal Timeline
166
+ puts "\n--- Section 5: Temporal Timeline for Company ---\n"
167
+
168
+ timeline = fact_service.timeline(
169
+ entity_id: company.id,
170
+ from: Date.new(2015, 1, 1),
171
+ to: Date.today
172
+ )
173
+
174
+ puts "Complete timeline for #{company.canonical_name}:"
175
+ timeline.each do |entry|
176
+ end_date = entry[:invalid_at] || "present" # entries without an end date are printed as open-ended
177
+ status_indicator = entry[:status] == "canonical" ? "" : " [#{entry[:status]}]" # only non-canonical statuses are shown
178
+ puts " #{entry[:valid_at]} - #{end_date}: #{entry[:fact_text]}#{status_indicator}"
179
+ end
180
+
181
+ # Section 6: Temporal Diff
182
+ puts "\n--- Section 6: Temporal Diff ---\n"
183
+
184
+ temporal_query = FactDb::Temporal::Query.new
185
+
186
+ # Compare company facts between two dates; the result is read via its :added, :removed and :unchanged keys
187
+ puts "Changes to TechCorp facts between 2020-01-01 and 2025-06-01:"
188
+ diff = temporal_query.diff(
189
+ entity_id: company.id,
190
+ from_date: Date.new(2020, 1, 1),
191
+ to_date: Date.new(2025, 6, 1)
192
+ )
193
+
194
+ if diff[:added].any?
195
+ puts "\n Added:"
196
+ diff[:added].each { |f| puts " + #{f.fact_text}" }
197
+ end
198
+
199
+ if diff[:removed].any?
200
+ puts "\n Removed:"
201
+ diff[:removed].each { |f| puts " - #{f.fact_text}" }
202
+ end
203
+
204
+ if diff[:unchanged].any?
205
+ puts "\n Unchanged:"
206
+ diff[:unchanged].each { |f| puts " = #{f.fact_text}" }
207
+ end
208
+
209
+ # Section 7: Facts Created/Invalidated in Date Range (queries here are not restricted to one entity)
210
+ puts "\n--- Section 7: Facts by Creation/Invalidation Period ---\n"
211
+
212
+ puts "Facts that became valid in 2025:"
213
+ new_facts = temporal_query.facts_created_between(
214
+ from: Date.new(2025, 1, 1),
215
+ to: Date.new(2025, 12, 31)
216
+ )
217
+ new_facts.each { |f| puts " - #{f.fact_text} (valid from #{f.valid_at})" }
218
+
219
+ puts "\nFacts that ended in 2024:"
220
+ ended_facts = temporal_query.facts_invalidated_between(
221
+ from: Date.new(2024, 1, 1),
222
+ to: Date.new(2024, 12, 31)
223
+ )
224
+ ended_facts.each { |f| puts " - #{f.fact_text} (ended #{f.invalid_at})" }
225
+
226
+ # Section 8: Entity Role Queries (same entity, filtered by the role it was mentioned with)
227
+ puts "\n--- Section 8: Query by Entity Role ---\n"
228
+
229
+ puts "Facts where TechCorp is the subject:"
230
+ subject_facts = temporal_query.facts_with_entity_role(
231
+ entity_id: company.id,
232
+ role: :subject
233
+ )
234
+ subject_facts.each { |f| puts " - #{f.fact_text}" }
235
+
236
+ puts "\nFacts where TechCorp is the object:"
237
+ object_facts = temporal_query.facts_with_entity_role(
238
+ entity_id: company.id,
239
+ role: :object
240
+ )
241
+ object_facts.each { |f| puts " - #{f.fact_text}" }
242
+
243
+ puts "\n" + "=" * 60
244
+ puts "Temporal Queries Demo Complete!"
245
+ puts "=" * 60
@@ -0,0 +1,71 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "anyway_config"
4
+ require "logger"
5
+
6
module FactDb
  # Configuration object for FactDb, declared with Anyway::Config under the
  # +fact_db+ config name.
  #
  # Besides the plain attributes, two readers are derived:
  # * #llm_client builds an LLM::Adapter from the provider settings when no
  #   client object was supplied explicitly.
  # * #logger falls back to a stdout logger when none was supplied.
  class Config < Anyway::Config
    config_name :fact_db

    # Database configuration
    attr_config :database_url
    attr_config database_pool_size: 5,
                database_timeout: 30_000

    # Embedding configuration
    attr_config :embedding_generator
    attr_config embedding_dimensions: 1536

    # LLM configuration
    attr_config :llm_client, :llm_provider, :llm_model, :llm_api_key

    # Extraction configuration
    attr_config default_extractor: :manual

    # Entity resolution thresholds
    attr_config fuzzy_match_threshold: 0.85,
                auto_merge_threshold: 0.95

    # Logging
    attr_config :logger
    attr_config log_level: :info

    # The LLM client to use for extraction.
    #
    # An explicitly configured client always wins. Otherwise an LLM::Adapter
    # is built (once, then memoized) from llm_provider / llm_model /
    # llm_api_key.
    #
    # @return [Object, nil] the client, or nil when neither a client nor a
    #   provider is configured
    def llm_client
      # Evaluate the configured value once instead of calling super twice.
      explicit_client = super
      return explicit_client if explicit_client
      return nil unless llm_provider

      @llm_client ||= LLM::Adapter.new(
        provider: llm_provider.to_sym,
        model: llm_model,
        api_key: llm_api_key
      )
    end

    # The configured logger, or a stdout logger at the configured log_level.
    #
    # The fallback logger is created once and reused, so repeated calls do not
    # allocate a new Logger each time; its level is refreshed on every call so
    # later changes to log_level still take effect.
    #
    # @return [Logger]
    def logger
      configured_logger = super
      return configured_logger if configured_logger

      @default_logger ||= Logger.new($stdout)
      @default_logger.level = log_level
      @default_logger
    end

    # Check that the minimum required settings are present.
    #
    # @return [Config] self, to allow chaining
    # @raise [ConfigurationError] when database_url is missing or blank
    def validate!
      raise ConfigurationError, "Database URL required" if database_url.to_s.strip.empty?

      self
    end
  end

  class << self
    # The process-wide configuration, created lazily on first access.
    #
    # @return [Config]
    def config
      @config ||= Config.new
    end

    # Yield the configuration for modification.
    #
    # @yieldparam config [Config] the shared configuration object
    # @return [Config] the shared configuration object
    def configure
      yield(config) if block_given?
      config
    end

    # Discard the shared configuration; the next call to .config builds a
    # fresh one.
    def reset_configuration!
      @config = nil
    end
  end
end
@@ -0,0 +1,45 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_record"
4
+ require "neighbor"
5
+
6
module FactDb
  # Connection and schema-management helpers built on ActiveRecord.
  module Database
    class << self
      # Open the ActiveRecord connection described by +config+ and hand the
      # configured logger (if any) to ActiveRecord.
      #
      # @param config [#database_url, #logger] defaults to FactDb.config
      def establish_connection!(config = FactDb.config)
        ActiveRecord::Base.establish_connection(config.database_url)
        logger = config.logger
        ActiveRecord::Base.logger = logger if logger
      end

      # @return [Boolean] whether ActiveRecord currently reports a connection
      def connected?
        ActiveRecord::Base.connected?
      end

      # Run all pending migrations shipped with the gem, connecting first if
      # necessary.
      def migrate!
        establish_connection! unless connected?
        migration_context.migrate
      end

      # Roll back the most recent migrations.
      #
      # @param steps [Integer] number of migrations to revert
      def rollback!(steps = 1)
        establish_connection! unless connected?
        migration_context.rollback(steps)
      end

      # Drop every table and rebuild the schema from the migrations.
      #
      # The migration bookkeeping table is dropped as well: if it were kept,
      # every migration would still be recorded as applied and the following
      # migrate! would recreate nothing.
      def reset!
        establish_connection! unless connected?
        connection = ActiveRecord::Base.connection
        connection.tables.each do |table|
          connection.drop_table(table, if_exists: true, force: :cascade)
        end
        migrate!
      end

      # Highest applied migration version.
      #
      # Versions are compared numerically (stored versions are strings, and a
      # string comparison would rank "9" above "10").
      #
      # @return [Integer] the latest version, or 0 when none is recorded
      def schema_version
        establish_connection! unless connected?
        ActiveRecord::SchemaMigration.all.map { |migration| migration.version.to_i }.max || 0
      end

      private

      # Migration context pointing at the db/migrate directory located two
      # levels above this file's directory.
      def migration_context
        ActiveRecord::MigrationContext.new(File.expand_path("../../db/migrate", __dir__))
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
module FactDb
  # Root of the FactDb exception hierarchy; rescue this to catch any error
  # raised by the library.
  Error = Class.new(StandardError)

  # Specific failure categories, all direct children of FactDb::Error.
  ValidationError = Class.new(Error)
  NotFoundError = Class.new(Error)
  ResolutionError = Class.new(Error)
  ExtractionError = Class.new(Error)
  ConfigurationError = Class.new(Error)
end
@@ -0,0 +1,117 @@
1
+ # frozen_string_literal: true
2
+
3
require "date"
require "time"

module FactDb
  module Extractors
    # Abstract superclass for fact extractors.
    #
    # Subclasses implement #extract and #extract_entities; this class supplies
    # the factory (.for), lenient date/timestamp parsing and builders for the
    # standardized fact / entity / mention hashes.
    class Base
      attr_reader :config

      # @param config [Object] configuration object, defaults to FactDb.config
      def initialize(config = FactDb.config)
        @config = config
      end

      # Extract facts from text
      # @param text [String] Raw text to extract from
      # @param context [Hash] Additional context (captured_at, source_uri, etc.)
      # @return [Array<Hash>] Array of fact data hashes
      # @raise [NotImplementedError] always; subclasses must override
      def extract(text, context = {})
        raise NotImplementedError, "#{self.class} must implement #extract"
      end

      # Extract entities from text
      # @param text [String] Raw text to extract from
      # @return [Array<Hash>] Array of { name:, type:, aliases: }
      # @raise [NotImplementedError] always; subclasses must override
      def extract_entities(text)
        raise NotImplementedError, "#{self.class} must implement #extract_entities"
      end

      # Get the extraction method name: the class's own name without its
      # namespace and "Extractor" suffix, in snake_case
      # (RuleBasedExtractor -> "rule_based", LLMExtractor -> "llm").
      #
      # The conversion is done locally so it does not depend on ActiveSupport
      # being loaded, and an anonymous class yields "" instead of raising.
      #
      # @return [String]
      def extraction_method
        self.class.name.to_s.split("::").last.to_s
            .sub("Extractor", "")
            .gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2')
            .gsub(/([a-z\d])([A-Z])/, '\1_\2')
            .downcase
      end

      class << self
        # Build the extractor registered for +type+.
        #
        # @param type [Symbol, String] one of .available_types
        # @param config [Object] configuration passed to the extractor
        # @return [Base] a ManualExtractor, LLMExtractor or RuleBasedExtractor
        # @raise [ArgumentError] for any other (including nil) type
        def for(type, config = FactDb.config)
          case type.to_s.to_sym
          when :manual
            ManualExtractor.new(config)
          when :llm
            LLMExtractor.new(config)
          when :rule_based
            RuleBasedExtractor.new(config)
          else
            raise ArgumentError, "Unknown extractor type: #{type}"
          end
        end

        # @return [Array<Symbol>] the types accepted by .for
        def available_types
          %i[manual llm rule_based]
        end
      end

      protected

      # Parse a date string, returning nil if invalid
      #
      # Date and Time values are converted directly. Strings go through
      # Chronic first when that gem is loaded, then through Date.parse.
      #
      # @param date_str [String, Date, Time, nil]
      # @return [Date, nil]
      def parse_date(date_str)
        return date_str.to_date if date_str.is_a?(Date) || date_str.is_a?(Time)

        raw = date_str.to_s
        return nil if raw.strip.empty?

        # Try chronic for natural language dates
        if defined?(Chronic)
          chronic_result = Chronic.parse(raw)
          return chronic_result.to_date if chronic_result
        end

        Date.parse(raw)
      rescue ArgumentError # Date::Error is a subclass of ArgumentError
        nil
      end

      # Parse a timestamp string, returning nil if invalid
      #
      # Time values are returned unchanged and Date values are converted with
      # #to_time. Strings go through Chronic first when that gem is loaded,
      # then through Time.parse.
      #
      # @param timestamp_str [String, Time, Date, nil]
      # @return [Time, nil]
      def parse_timestamp(timestamp_str)
        return timestamp_str if timestamp_str.is_a?(Time)
        return timestamp_str.to_time if timestamp_str.is_a?(Date)

        raw = timestamp_str.to_s
        return nil if raw.strip.empty?

        # Try chronic for natural language dates
        if defined?(Chronic)
          chronic_result = Chronic.parse(raw)
          return chronic_result if chronic_result
        end

        Time.parse(raw)
      rescue ArgumentError
        nil
      end

      # Build a standardized fact hash
      #
      # @param text [String, nil] fact text; stripped, nil becomes ""
      # @return [Hash] with :text, :valid_at, :invalid_at, :mentions,
      #   :confidence, :metadata and :extraction_method
      def build_fact(text:, valid_at:, invalid_at: nil, mentions: [], confidence: 1.0, metadata: {})
        {
          text: text.to_s.strip,
          valid_at: valid_at,
          invalid_at: invalid_at,
          mentions: mentions,
          confidence: confidence,
          metadata: metadata,
          extraction_method: extraction_method
        }
      end

      # Build a standardized entity hash
      #
      # @param aliases [Array<String>, nil] nil is treated as no aliases
      # @return [Hash] with :name, :type, :aliases and :attributes
      def build_entity(name:, type:, aliases: [], attributes: {})
        {
          name: name.to_s.strip,
          type: type.to_s,
          aliases: Array(aliases).map { |entry| entry.to_s.strip },
          attributes: attributes
        }
      end

      # Build a standardized mention hash
      #
      # @return [Hash] with :name, :type, :role (String or nil) and :confidence
      def build_mention(name:, type:, role: nil, confidence: 1.0)
        {
          name: name.to_s.strip,
          type: type.to_s,
          role: role&.to_s,
          confidence: confidence
        }
      end
    end
  end
end
@@ -0,0 +1,179 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
module FactDb
  module Extractors
    # Extractor that asks a configured LLM client for facts or entities and
    # converts its JSON reply into the standardized hashes built by Base.
    #
    # The client is taken from config.llm_client and may respond to :chat,
    # :complete or :call. Replies that are not valid JSON are logged and
    # yield an empty result; individual records that are malformed (not a
    # JSON object, or missing their text / name) are skipped instead of
    # aborting the whole extraction.
    class LLMExtractor < Base
      FACT_EXTRACTION_PROMPT = <<~PROMPT
        Extract factual assertions from the following text. For each fact:
        1. State the assertion clearly and concisely
        2. Identify when it became true (valid_at) if mentioned
        3. Identify when it stopped being true (invalid_at) if mentioned
        4. Identify entities mentioned (people, organizations, places, products)
        5. Assign a confidence score (0.0 to 1.0) based on how explicitly stated the fact is

        Text:
        %<text>s

        Return as a JSON array with this structure:
        [
        {
        "text": "Paula works at Microsoft as Principal Engineer",
        "valid_at": "2024-01-10",
        "invalid_at": null,
        "confidence": 0.95,
        "mentions": [
        {"name": "Paula", "type": "person", "role": "subject"},
        {"name": "Microsoft", "type": "organization", "role": "object"}
        ]
        }
        ]

        Rules:
        - Extract only factual assertions, not opinions or speculation
        - Use ISO 8601 date format (YYYY-MM-DD) when possible
        - Set invalid_at to null if the fact is still true or unknown
        - Set valid_at to null if the timing is not mentioned
        - Entity types: person, organization, place, product, event, concept
        - Roles: subject, object, location, temporal, instrument, beneficiary

        Return only valid JSON, no additional text.
      PROMPT

      ENTITY_EXTRACTION_PROMPT = <<~PROMPT
        Extract all named entities from the following text.
        For each entity:
        1. Identify the canonical name
        2. Classify the type (person, organization, place, product, event, concept)
        3. List any aliases or alternative names mentioned

        Text:
        %<text>s

        Return as a JSON array:
        [
        {
        "name": "Paula Chen",
        "type": "person",
        "aliases": ["Paula", "P. Chen"]
        }
        ]

        Return only valid JSON, no additional text.
      PROMPT

      # Extract facts from text via the LLM.
      #
      # @param text [String, nil] raw text; nil or blank yields []
      # @param context [Hash] :captured_at is used as valid_at when the LLM
      #   gives none
      # @return [Array<Hash>] fact hashes as produced by Base#build_fact
      # @raise [ConfigurationError] when no usable LLM client is configured
      def extract(text, context = {})
        return [] if text.nil? || text.strip.empty?

        response = call_llm(configured_client, format(FACT_EXTRACTION_PROMPT, text: text))
        parse_fact_response(response, context)
      end

      # Extract named entities from text via the LLM.
      #
      # @param text [String, nil] raw text; nil or blank yields []
      # @return [Array<Hash>] entity hashes as produced by Base#build_entity
      # @raise [ConfigurationError] when no usable LLM client is configured
      def extract_entities(text)
        return [] if text.nil? || text.strip.empty?

        response = call_llm(configured_client, format(ENTITY_EXTRACTION_PROMPT, text: text))
        parse_entity_response(response)
      end

      private

      # The configured LLM client, or a ConfigurationError when there is none.
      def configured_client
        config.llm_client || raise(ConfigurationError, "LLM client not configured")
      end

      # Send the prompt through whichever interface the client offers.
      def call_llm(client, prompt)
        # Support multiple LLM client interfaces
        if client.respond_to?(:chat)
          # Standard chat interface (most LLM gems)
          client.chat(prompt)
        elsif client.respond_to?(:complete)
          # Completion interface
          client.complete(prompt)
        elsif client.respond_to?(:call)
          # Callable/lambda interface
          client.call(prompt)
        else
          raise ConfigurationError, "LLM client must respond to :chat, :complete, or :call"
        end
      end

      # Turn the LLM's fact reply into fact hashes, skipping records that
      # carry no text.
      def parse_fact_response(response, context)
        usable = parse_json_records(response, "fact").reject { |fact_data| blank_value?(fact_data["text"]) }

        usable.map do |fact_data|
          valid_at = parse_timestamp(fact_data["valid_at"]) ||
                     context[:captured_at] ||
                     Time.current

          build_fact(
            text: fact_data["text"].to_s,
            valid_at: valid_at,
            invalid_at: parse_timestamp(fact_data["invalid_at"]),
            mentions: parse_mentions(fact_data["mentions"]),
            confidence: fact_data["confidence"]&.to_f || 0.8,
            metadata: { llm_response: fact_data }
          )
        end
      end

      # Turn the LLM's entity reply into entity hashes, skipping records that
      # carry no name.
      def parse_entity_response(response)
        usable = parse_json_records(response, "entity").reject { |entity_data| blank_value?(entity_data["name"]) }

        usable.map do |entity_data|
          build_entity(
            name: entity_data["name"].to_s,
            type: entity_data["type"] || "concept",
            aliases: Array(entity_data["aliases"]).map(&:to_s),
            attributes: entity_data["attributes"] || {}
          )
        end
      end

      # Convert the "mentions" value of a fact record into mention hashes.
      # Anything that is not an array yields []; entries that are not JSON
      # objects or have no name are skipped.
      def parse_mentions(mentions_data)
        return [] unless mentions_data.is_a?(Array)

        named = mentions_data.select { |mention| mention.is_a?(Hash) && !blank_value?(mention["name"]) }

        named.map do |mention|
          build_mention(
            name: mention["name"].to_s,
            type: mention["type"] || "concept",
            role: mention["role"],
            confidence: mention["confidence"]&.to_f || 1.0
          )
        end
      end

      # Parse the reply into an array of JSON objects.
      #
      # A single top-level object is wrapped in an array and non-object
      # elements are dropped, so callers can always index records by string
      # key. Invalid JSON is logged as a warning and yields [].
      #
      # @param kind [String] "fact" or "entity", used in the log message
      # @return [Array<Hash>]
      def parse_json_records(response, kind)
        parsed = JSON.parse(extract_json(response))
        records = parsed.is_a?(Array) ? parsed : [parsed]
        records.grep(Hash)
      rescue JSON::ParserError => e
        config.logger&.warn("Failed to parse LLM #{kind} response: #{e.message}")
        []
      end

      # Isolate the JSON payload in a reply that may be wrapped in a markdown
      # code fence or surrounded by extra text.
      def extract_json(response)
        # Handle responses that may have markdown code blocks
        text = response.to_s.strip

        # Remove markdown code blocks if present
        text = text.sub(/\A```(?:json)?\n?/, "").sub(/\n?```\z/, "") if text.start_with?("```")

        # Find JSON array in response (first "[" through last "]")
        array_match = text.match(/\[[\s\S]*\]/)
        array_match ? array_match[0] : text
      end

      # True for nil and for values whose string form is empty or whitespace.
      def blank_value?(value)
        value.to_s.strip.empty?
      end
    end
  end
end