fact_db 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.envrc +1 -0
- data/CHANGELOG.md +48 -0
- data/COMMITS.md +196 -0
- data/README.md +102 -0
- data/Rakefile +41 -0
- data/db/migrate/001_enable_extensions.rb +7 -0
- data/db/migrate/002_create_contents.rb +44 -0
- data/db/migrate/003_create_entities.rb +36 -0
- data/db/migrate/004_create_entity_aliases.rb +18 -0
- data/db/migrate/005_create_facts.rb +65 -0
- data/db/migrate/006_create_entity_mentions.rb +18 -0
- data/db/migrate/007_create_fact_sources.rb +18 -0
- data/docs/api/extractors/index.md +71 -0
- data/docs/api/extractors/llm.md +162 -0
- data/docs/api/extractors/manual.md +92 -0
- data/docs/api/extractors/rule-based.md +165 -0
- data/docs/api/facts.md +300 -0
- data/docs/api/index.md +66 -0
- data/docs/api/models/content.md +165 -0
- data/docs/api/models/entity.md +202 -0
- data/docs/api/models/fact.md +270 -0
- data/docs/api/models/index.md +77 -0
- data/docs/api/pipeline/extraction.md +175 -0
- data/docs/api/pipeline/index.md +72 -0
- data/docs/api/pipeline/resolution.md +209 -0
- data/docs/api/services/content-service.md +166 -0
- data/docs/api/services/entity-service.md +202 -0
- data/docs/api/services/fact-service.md +223 -0
- data/docs/api/services/index.md +55 -0
- data/docs/architecture/database-schema.md +293 -0
- data/docs/architecture/entity-resolution.md +293 -0
- data/docs/architecture/index.md +149 -0
- data/docs/architecture/temporal-facts.md +268 -0
- data/docs/architecture/three-layer-model.md +242 -0
- data/docs/assets/css/custom.css +137 -0
- data/docs/assets/fact_db.jpg +0 -0
- data/docs/assets/images/fact_db.jpg +0 -0
- data/docs/concepts.md +183 -0
- data/docs/examples/basic-usage.md +235 -0
- data/docs/examples/hr-onboarding.md +312 -0
- data/docs/examples/index.md +64 -0
- data/docs/examples/news-analysis.md +288 -0
- data/docs/getting-started/database-setup.md +170 -0
- data/docs/getting-started/index.md +71 -0
- data/docs/getting-started/installation.md +98 -0
- data/docs/getting-started/quick-start.md +191 -0
- data/docs/guides/batch-processing.md +325 -0
- data/docs/guides/configuration.md +243 -0
- data/docs/guides/entity-management.md +364 -0
- data/docs/guides/extracting-facts.md +299 -0
- data/docs/guides/index.md +22 -0
- data/docs/guides/ingesting-content.md +252 -0
- data/docs/guides/llm-integration.md +299 -0
- data/docs/guides/temporal-queries.md +315 -0
- data/docs/index.md +121 -0
- data/examples/README.md +130 -0
- data/examples/basic_usage.rb +164 -0
- data/examples/entity_management.rb +216 -0
- data/examples/hr_system.rb +428 -0
- data/examples/rule_based_extraction.rb +258 -0
- data/examples/temporal_queries.rb +245 -0
- data/lib/fact_db/config.rb +71 -0
- data/lib/fact_db/database.rb +45 -0
- data/lib/fact_db/errors.rb +10 -0
- data/lib/fact_db/extractors/base.rb +117 -0
- data/lib/fact_db/extractors/llm_extractor.rb +179 -0
- data/lib/fact_db/extractors/manual_extractor.rb +53 -0
- data/lib/fact_db/extractors/rule_based_extractor.rb +228 -0
- data/lib/fact_db/llm/adapter.rb +109 -0
- data/lib/fact_db/models/content.rb +62 -0
- data/lib/fact_db/models/entity.rb +84 -0
- data/lib/fact_db/models/entity_alias.rb +26 -0
- data/lib/fact_db/models/entity_mention.rb +33 -0
- data/lib/fact_db/models/fact.rb +192 -0
- data/lib/fact_db/models/fact_source.rb +35 -0
- data/lib/fact_db/pipeline/extraction_pipeline.rb +146 -0
- data/lib/fact_db/pipeline/resolution_pipeline.rb +129 -0
- data/lib/fact_db/resolution/entity_resolver.rb +261 -0
- data/lib/fact_db/resolution/fact_resolver.rb +259 -0
- data/lib/fact_db/services/content_service.rb +93 -0
- data/lib/fact_db/services/entity_service.rb +150 -0
- data/lib/fact_db/services/fact_service.rb +193 -0
- data/lib/fact_db/temporal/query.rb +125 -0
- data/lib/fact_db/temporal/timeline.rb +134 -0
- data/lib/fact_db/version.rb +5 -0
- data/lib/fact_db.rb +141 -0
- data/mkdocs.yml +198 -0
- metadata +288 -0
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
# frozen_string_literal: true

# Temporal Queries Example for FactDb
#
# Walks through the temporal features exposed by the services used below:
# - Creating facts with temporal bounds
# - Querying facts at specific points in time
# - Superseding facts (replacing old with new)
# - Detecting fact changes over time
# - Building temporal diffs

require "bundler/setup"
require "fact_db"

FactDb.configure do |config|
  fallback_url = "postgres://#{ENV['USER']}@localhost/fact_db_demo"
  config.database_url = ENV.fetch("DATABASE_URL", fallback_url)
end

# Ensure database tables exist
FactDb::Database.migrate!

fact_db = FactDb.new
entity_service = fact_db.entity_service
fact_service = fact_db.fact_service

banner = "=" * 60

puts banner
puts "FactDb Temporal Queries Demo"
puts banner

# Setup: Create entities for our scenario
puts "\n--- Setup: Creating Entities ---\n"

company = entity_service.create("TechCorp Ltd", type: :organization, description: "Technology company")
ceo = entity_service.create("Alice Chen", type: :person, description: "Executive")
new_ceo = entity_service.create("David Park", type: :person, description: "Executive")
cfo = entity_service.create("Sarah Miller", type: :person, description: "Finance executive")

entity_names = [company, ceo, new_ceo, cfo].map(&:canonical_name).join(", ")
puts "Created entities: #{entity_names}"

# Mention builders: each call returns a fresh hash so no mention hash is
# shared between two facts.
company_mention = ->(role) { { entity_id: company.id, role: role, text: "TechCorp Ltd" } }
subject_mention = ->(person, name) { { entity_id: person.id, role: :subject, text: name } }

# Section 1: Creating Temporal Facts
puts "\n--- Section 1: Creating Temporal Facts ---\n"

# Fact with open-ended validity (still true)
fact1 = fact_service.create(
  "TechCorp Ltd is headquartered in Austin, Texas",
  valid_at: Date.new(2015, 1, 1),
  mentions: [company_mention.call(:subject)]
)
puts "Created: #{fact1.fact_text}"
puts " Valid: #{fact1.valid_at} - present"

# Fact with closed validity (historical)
fact2 = fact_service.create(
  "Alice Chen is CEO of TechCorp Ltd",
  valid_at: Date.new(2018, 3, 1),
  invalid_at: Date.new(2024, 12, 31),
  mentions: [subject_mention.call(ceo, "Alice Chen"), company_mention.call(:object)]
)
puts "\nCreated: #{fact2.fact_text}"
puts " Valid: #{fact2.valid_at} - #{fact2.invalid_at}"

# Current CEO
fact3 = fact_service.create(
  "David Park is CEO of TechCorp Ltd",
  valid_at: Date.new(2025, 1, 1),
  mentions: [subject_mention.call(new_ceo, "David Park"), company_mention.call(:object)]
)
puts "\nCreated: #{fact3.fact_text}"
puts " Valid: #{fact3.valid_at} - present"

# Another current fact
fact4 = fact_service.create(
  "Sarah Miller is CFO of TechCorp Ltd",
  valid_at: Date.new(2020, 6, 15),
  mentions: [subject_mention.call(cfo, "Sarah Miller"), company_mention.call(:object)]
)
puts "\nCreated: #{fact4.fact_text}"
puts " Valid: #{fact4.valid_at} - present"

# Section 2: Point-in-Time Queries
puts "\n--- Section 2: Point-in-Time Queries ---\n"

# Query facts valid at different dates
dates_to_query = [
  Date.new(2019, 6, 1), # Alice was CEO
  Date.new(2024, 6, 1), # Alice still CEO
  Date.new(2025, 6, 1), # David is CEO
  Date.today
]

dates_to_query.each do |date|
  puts "\nFacts about TechCorp on #{date}:"
  fact_service.facts_at(date, entity: company.id).each do |fact|
    puts " - #{fact.fact_text}"
  end
end

# Section 3: Current vs Historical Facts
puts "\n--- Section 3: Current vs Historical Facts ---\n"

puts "Currently valid facts about TechCorp:"
fact_service.current_facts(entity: company.id).each do |fact|
  puts " - #{fact.fact_text}"
end

puts "\nAll historical facts:"
FactDb::Models::Fact.historical.each { |fact| puts " - #{fact.fact_text} (ended: #{fact.invalid_at})" }

# Section 4: Superseding Facts
puts "\n--- Section 4: Superseding Facts ---\n"

# Company valuation that changes over time
valuation_2020 = fact_service.create(
  "TechCorp Ltd has a market valuation of $500 million",
  valid_at: Date.new(2020, 1, 1),
  mentions: [company_mention.call(:subject)]
)
puts "Created valuation fact: #{valuation_2020.fact_text}"

# Supersede with new valuation
valuation_2023 = fact_service.supersede(
  valuation_2020.id,
  "TechCorp Ltd has a market valuation of $1.2 billion",
  valid_at: Date.new(2023, 1, 1),
  mentions: [company_mention.call(:subject)]
)
puts "\nSuperseded with: #{valuation_2023.fact_text}"

# Check the old fact status (reload so the record reflects what supersede stored)
valuation_2020.reload
puts "\nOriginal fact status: #{valuation_2020.status}"
puts "Original fact now invalid at: #{valuation_2020.invalid_at}"

# Section 5: Temporal Timeline
puts "\n--- Section 5: Temporal Timeline for Company ---\n"

timeline = fact_service.timeline(
  entity_id: company.id,
  from: Date.new(2015, 1, 1),
  to: Date.today
)

puts "Complete timeline for #{company.canonical_name}:"
timeline.each do |entry|
  end_date = entry[:invalid_at] || "present"
  # Only non-canonical entries get a status tag appended.
  status_indicator =
    if entry[:status] == "canonical"
      ""
    else
      " [#{entry[:status]}]"
    end
  puts " #{entry[:valid_at]} - #{end_date}: #{entry[:fact_text]}#{status_indicator}"
end

# Section 6: Temporal Diff
puts "\n--- Section 6: Temporal Diff ---\n"

temporal_query = FactDb::Temporal::Query.new

# Compare company facts between two dates
puts "Changes to TechCorp facts between 2020-01-01 and 2025-06-01:"
diff = temporal_query.diff(
  entity_id: company.id,
  from_date: Date.new(2020, 1, 1),
  to_date: Date.new(2025, 6, 1)
)

# One pass per diff bucket; empty buckets print nothing at all.
diff_sections = {
  added: ["\n Added:", "+"],
  removed: ["\n Removed:", "-"],
  unchanged: ["\n Unchanged:", "="]
}

diff_sections.each do |bucket, (heading, marker)|
  bucket_facts = diff[bucket]
  next unless bucket_facts.any?

  puts heading
  bucket_facts.each { |f| puts " #{marker} #{f.fact_text}" }
end

# Section 7: Facts Created/Invalidated in Date Range
puts "\n--- Section 7: Facts by Creation/Invalidation Period ---\n"

puts "Facts that became valid in 2025:"
temporal_query.facts_created_between(
  from: Date.new(2025, 1, 1),
  to: Date.new(2025, 12, 31)
).each do |f|
  puts " - #{f.fact_text} (valid from #{f.valid_at})"
end

puts "\nFacts that ended in 2024:"
temporal_query.facts_invalidated_between(
  from: Date.new(2024, 1, 1),
  to: Date.new(2024, 12, 31)
).each do |f|
  puts " - #{f.fact_text} (ended #{f.invalid_at})"
end

# Section 8: Entity Role Queries
puts "\n--- Section 8: Query by Entity Role ---\n"

role_sections = [
  [:subject, "Facts where TechCorp is the subject:"],
  [:object, "\nFacts where TechCorp is the object:"]
]

role_sections.each do |role, heading|
  puts heading
  temporal_query.facts_with_entity_role(entity_id: company.id, role: role).each do |f|
    puts " - #{f.fact_text}"
  end
end

puts "\n#{banner}"
puts "Temporal Queries Demo Complete!"
puts banner
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "anyway_config"
|
|
4
|
+
require "logger"
|
|
5
|
+
|
|
6
|
+
module FactDb
  # Settings object for FactDb, declared with Anyway::Config under the
  # +fact_db+ config name. Attributes without a default are nil until set.
  class Config < Anyway::Config
    config_name :fact_db

    # Database configuration
    attr_config :database_url
    attr_config database_pool_size: 5,
                database_timeout: 30_000

    # Embedding configuration
    attr_config :embedding_generator
    attr_config embedding_dimensions: 1536

    # LLM configuration
    attr_config :llm_client, :llm_provider, :llm_model, :llm_api_key

    # Extraction configuration
    attr_config default_extractor: :manual

    # Entity resolution thresholds
    attr_config fuzzy_match_threshold: 0.85,
                auto_merge_threshold: 0.95

    # Logging
    attr_config :logger
    attr_config log_level: :info

    # Returns the LLM client to use.
    #
    # An explicitly assigned client always wins. Otherwise an LLM::Adapter is
    # built once from llm_provider / llm_model / llm_api_key and memoized.
    #
    # @return [Object, nil] the client, or nil when neither a client nor a
    #   provider has been configured
    def llm_client
      # Read the configured value once instead of evaluating `super` twice.
      explicit_client = super
      return explicit_client if explicit_client
      return nil unless llm_provider

      @llm_client ||= LLM::Adapter.new(
        provider: llm_provider.to_sym,
        model: llm_model,
        api_key: llm_api_key
      )
    end

    # Returns the configured logger, or a default logger writing to $stdout.
    #
    # The default logger is created once and reused, so every caller sees the
    # same instance; its level is re-synced to log_level on each call so a
    # later change to log_level still takes effect.
    #
    # @return [Logger]
    def logger
      configured_logger = super
      return configured_logger if configured_logger

      @default_logger ||= Logger.new($stdout)
      @default_logger.level = log_level
      @default_logger
    end

    # Checks that the configuration is usable.
    #
    # @return [self]
    # @raise [ConfigurationError] when database_url is nil or blank
    def validate!
      # A blank URL is as unusable as a missing one; reject both up front.
      raise ConfigurationError, "Database URL required" if database_url.to_s.strip.empty?

      self
    end
  end

  class << self
    # Process-wide configuration, created lazily on first access.
    #
    # @return [FactDb::Config]
    def config
      @config ||= Config.new
    end

    # Yields the shared configuration for mutation when a block is given.
    #
    # @yieldparam config [FactDb::Config]
    # @return [FactDb::Config] the shared configuration
    def configure
      yield(config) if block_given?
      config
    end

    # Discards the shared configuration; the next call to .config builds a
    # fresh one.
    def reset_configuration!
      @config = nil
    end
  end
end
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "active_record"
|
|
4
|
+
require "neighbor"
|
|
5
|
+
|
|
6
|
+
module FactDb
  # Connection and schema management helpers built on ActiveRecord.
  module Database
    class << self
      # Connects ActiveRecord to config.database_url and installs the
      # configured logger as ActiveRecord's logger.
      #
      # @param config [FactDb::Config] configuration to read from
      def establish_connection!(config = FactDb.config)
        ActiveRecord::Base.establish_connection(config.database_url)

        # Read the logger once so the instance that is checked is the
        # instance that gets assigned.
        logger = config.logger
        ActiveRecord::Base.logger = logger if logger
      end

      # @return [Boolean] whatever ActiveRecord::Base.connected? reports
      def connected?
        ActiveRecord::Base.connected?
      end

      # Runs all pending migrations from the gem's db/migrate directory,
      # connecting first if necessary.
      def migrate!
        establish_connection! unless connected?
        migration_context.migrate
      end

      # Rolls back the most recent migrations.
      #
      # @param steps [Integer] number of migrations to revert
      def rollback!(steps = 1)
        establish_connection! unless connected?
        migration_context.rollback(steps)
      end

      # Drops every table in the connected database and re-runs all
      # migrations.
      #
      # The schema_migrations table is dropped along with the rest: keeping
      # its rows would leave every version marked as applied, so the
      # following migrate! would recreate nothing and the database would stay
      # empty. The migrator recreates its bookkeeping tables when it runs.
      def reset!
        establish_connection! unless connected?

        connection = ActiveRecord::Base.connection
        connection.tables.each do |table|
          connection.drop_table(table, if_exists: true, force: :cascade)
        end

        migrate!
      end

      # Highest applied migration version, or 0 when none are recorded.
      #
      # Versions are compared numerically (they are stored as strings, and a
      # plain string max would rank "9" above "10"); the value itself is
      # returned unchanged.
      #
      # NOTE(review): relies on ActiveRecord::SchemaMigration responding to
      # .all - confirm against the ActiveRecord versions this gem supports.
      def schema_version
        establish_connection! unless connected?
        ActiveRecord::SchemaMigration.all.map(&:version).max_by(&:to_i) || 0
      end

      private

      # Migration context pointed at the gem's bundled db/migrate directory.
      def migration_context
        migrations_path = File.expand_path("../../db/migrate", __dir__)
        ActiveRecord::MigrationContext.new(migrations_path)
      end
    end
  end
end
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FactDb
  # Root of the FactDb exception hierarchy; rescuing this class catches every
  # error class declared below.
  class Error < StandardError
  end

  # Error class named for validation failures.
  class ValidationError < Error
  end

  # Error class named for missing records.
  class NotFoundError < Error
  end

  # Error class named for entity/fact resolution failures.
  class ResolutionError < Error
  end

  # Error class named for extraction failures.
  class ExtractionError < Error
  end

  # Error class named for invalid or incomplete configuration.
  class ConfigurationError < Error
  end
end
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FactDb
  module Extractors
    # Abstract superclass for extractors. Subclasses implement #extract and
    # #extract_entities; the protected helpers below produce the hash shapes
    # they return.
    class Base
      attr_reader :config

      # @param config [Object] configuration object; defaults to FactDb.config
      def initialize(config = FactDb.config)
        @config = config
      end

      # Extract facts from text
      # @param text [String] Raw text to extract from
      # @param context [Hash] Additional context (captured_at, source_uri, etc.)
      # @return [Array<Hash>] Array of fact data hashes
      # @raise [NotImplementedError] always, unless overridden by a subclass
      def extract(text, context = {})
        raise NotImplementedError, "#{self.class} must implement #extract"
      end

      # Extract entities from text
      # @param text [String] Raw text to extract from
      # @return [Array<Hash>] Array of { name:, type:, aliases: }
      # @raise [NotImplementedError] always, unless overridden by a subclass
      def extract_entities(text)
        raise NotImplementedError, "#{self.class} must implement #extract_entities"
      end

      # Extraction method name derived from the class name: the last
      # namespace segment with "Extractor" removed, then underscored.
      # NOTE(review): String#underscore is not core Ruby; presumably provided
      # by ActiveSupport at runtime - confirm it is loaded before this runs.
      def extraction_method
        self.class.name.split("::").last.sub("Extractor", "").underscore
      end

      class << self
        # Builds the extractor registered for +type+.
        #
        # @param type [Symbol, String] one of .available_types
        # @param config [Object] configuration handed to the extractor
        # @return [Base] a new extractor instance
        # @raise [ArgumentError] for an unknown type, including nil
        def for(type, config = FactDb.config)
          # Values without #to_sym (nil, numbers) fall through to the
          # ArgumentError below instead of raising NoMethodError here.
          key = type.respond_to?(:to_sym) ? type.to_sym : type

          case key
          when :manual
            ManualExtractor.new(config)
          when :llm
            LLMExtractor.new(config)
          when :rule_based
            RuleBasedExtractor.new(config)
          else
            raise ArgumentError, "Unknown extractor type: #{type}"
          end
        end

        # @return [Array<Symbol>] the types accepted by .for
        def available_types
          %i[manual llm rule_based]
        end
      end

      protected

      # Parse a date string, returning nil if blank or invalid.
      # Chronic is tried first when that library is loaded.
      #
      # @param date_str [#to_s, nil]
      # @return [Date, nil]
      def parse_date(date_str)
        raw = date_str.to_s
        return nil if raw.empty?

        # Try chronic for natural language dates. It receives the same string
        # form that Date.parse gets below.
        if defined?(Chronic)
          chronic_result = Chronic.parse(raw)
          return chronic_result.to_date if chronic_result
        end

        Date.parse(raw)
      rescue Date::Error, ArgumentError
        nil
      end

      # Parse a timestamp string, returning nil if blank or invalid.
      # Chronic is tried first when that library is loaded.
      #
      # @param timestamp_str [#to_s, nil]
      # @return [Time, nil]
      def parse_timestamp(timestamp_str)
        raw = timestamp_str.to_s
        return nil if raw.empty?

        # Try chronic for natural language dates
        if defined?(Chronic)
          chronic_result = Chronic.parse(raw)
          return chronic_result if chronic_result
        end

        Time.parse(raw)
      rescue ArgumentError
        nil
      end

      # Build a standardized fact hash. The text is stripped and the
      # extractor's #extraction_method is recorded alongside the given values.
      #
      # @return [Hash]
      def build_fact(text:, valid_at:, invalid_at: nil, mentions: [], confidence: 1.0, metadata: {})
        {
          text: text.strip,
          valid_at: valid_at,
          invalid_at: invalid_at,
          mentions: mentions,
          confidence: confidence,
          metadata: metadata,
          extraction_method: extraction_method
        }
      end

      # Build a standardized entity hash.
      #
      # +aliases+ may be nil, and nil entries are dropped, so alias lists
      # decoded from external data (e.g. JSON containing null) do not raise.
      # Remaining entries are converted with #to_s and stripped.
      #
      # @return [Hash]
      def build_entity(name:, type:, aliases: [], attributes: {})
        {
          name: name.strip,
          type: type.to_s,
          aliases: Array(aliases).compact.map { |entry| entry.to_s.strip },
          attributes: attributes
        }
      end

      # Build a standardized mention hash. +role+ stays nil when not given.
      #
      # @return [Hash]
      def build_mention(name:, type:, role: nil, confidence: 1.0)
        {
          name: name.strip,
          type: type.to_s,
          role: role&.to_s,
          confidence: confidence
        }
      end
    end
  end
end
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
|
|
5
|
+
module FactDb
  module Extractors
    # Extractor that sends a prompt to the configured LLM client and turns
    # the JSON reply into fact / entity hashes.
    #
    # Replies are handled on a best-effort basis: unparseable JSON, a reply
    # that is not a JSON array, and individual malformed entries are logged
    # as warnings and skipped instead of raising.
    class LLMExtractor < Base
      FACT_EXTRACTION_PROMPT = <<~PROMPT
        Extract factual assertions from the following text. For each fact:
        1. State the assertion clearly and concisely
        2. Identify when it became true (valid_at) if mentioned
        3. Identify when it stopped being true (invalid_at) if mentioned
        4. Identify entities mentioned (people, organizations, places, products)
        5. Assign a confidence score (0.0 to 1.0) based on how explicitly stated the fact is

        Text:
        %<text>s

        Return as a JSON array with this structure:
        [
        {
        "text": "Paula works at Microsoft as Principal Engineer",
        "valid_at": "2024-01-10",
        "invalid_at": null,
        "confidence": 0.95,
        "mentions": [
        {"name": "Paula", "type": "person", "role": "subject"},
        {"name": "Microsoft", "type": "organization", "role": "object"}
        ]
        }
        ]

        Rules:
        - Extract only factual assertions, not opinions or speculation
        - Use ISO 8601 date format (YYYY-MM-DD) when possible
        - Set invalid_at to null if the fact is still true or unknown
        - Set valid_at to null if the timing is not mentioned
        - Entity types: person, organization, place, product, event, concept
        - Roles: subject, object, location, temporal, instrument, beneficiary

        Return only valid JSON, no additional text.
      PROMPT

      ENTITY_EXTRACTION_PROMPT = <<~PROMPT
        Extract all named entities from the following text.
        For each entity:
        1. Identify the canonical name
        2. Classify the type (person, organization, place, product, event, concept)
        3. List any aliases or alternative names mentioned

        Text:
        %<text>s

        Return as a JSON array:
        [
        {
        "name": "Paula Chen",
        "type": "person",
        "aliases": ["Paula", "P. Chen"]
        }
        ]

        Return only valid JSON, no additional text.
      PROMPT

      # Extracts facts from +text+ via the LLM.
      #
      # @param text [String, nil] raw text; nil or blank yields []
      # @param context [Hash] :captured_at is used as valid_at when the LLM
      #   supplies none
      # @return [Array<Hash>] fact hashes built with #build_fact
      # @raise [ConfigurationError] when no usable LLM client is configured
      def extract(text, context = {})
        return [] if text.nil? || text.strip.empty?

        parse_fact_response(prompt_llm(FACT_EXTRACTION_PROMPT, text), context)
      end

      # Extracts named entities from +text+ via the LLM.
      #
      # @param text [String, nil] raw text; nil or blank yields []
      # @return [Array<Hash>] entity hashes built with #build_entity
      # @raise [ConfigurationError] when no usable LLM client is configured
      def extract_entities(text)
        return [] if text.nil? || text.strip.empty?

        parse_entity_response(prompt_llm(ENTITY_EXTRACTION_PROMPT, text))
      end

      private

      # Fills +template+ with +text+ and sends it to the configured client.
      def prompt_llm(template, text)
        client = config.llm_client
        raise ConfigurationError, "LLM client not configured" unless client

        call_llm(client, format(template, text: text))
      end

      # Dispatches to whichever interface the client offers, checked in the
      # order chat, complete, call.
      def call_llm(client, prompt)
        if client.respond_to?(:chat)
          # Standard chat interface (most LLM gems)
          client.chat(prompt)
        elsif client.respond_to?(:complete)
          # Completion interface
          client.complete(prompt)
        elsif client.respond_to?(:call)
          # Callable/lambda interface
          client.call(prompt)
        else
          raise ConfigurationError, "LLM client must respond to :chat, :complete, or :call"
        end
      end

      # Converts the LLM reply into fact hashes. Entries that are not JSON
      # objects with a string "text" are skipped.
      def parse_fact_response(response, context)
        entries = well_formed_entries(parse_json_array(response, "fact"), "text", "fact")

        entries.map do |fact_data|
          # Fall back to the capture time, then to now, when the LLM gave no
          # usable start date.
          valid_at = parse_timestamp(fact_data["valid_at"]) ||
                     context[:captured_at] ||
                     Time.current

          build_fact(
            text: fact_data["text"],
            valid_at: valid_at,
            invalid_at: parse_timestamp(fact_data["invalid_at"]),
            mentions: parse_mentions(fact_data["mentions"]),
            confidence: fact_data["confidence"]&.to_f || 0.8,
            metadata: { llm_response: fact_data }
          )
        end
      end

      # Converts the LLM reply into entity hashes. Entries that are not JSON
      # objects with a string "name" are skipped; non-string aliases are
      # dropped.
      def parse_entity_response(response)
        entries = well_formed_entries(parse_json_array(response, "entity"), "name", "entity")

        entries.map do |entity_data|
          build_entity(
            name: entity_data["name"],
            type: entity_data["type"] || "concept",
            aliases: Array(entity_data["aliases"]).grep(String),
            attributes: entity_data["attributes"] || {}
          )
        end
      end

      # Converts the "mentions" value of one fact into mention hashes.
      # Anything other than an array yields [].
      def parse_mentions(mentions_data)
        return [] unless mentions_data.is_a?(Array)

        well_formed_entries(mentions_data, "name", "mention").map do |mention|
          build_mention(
            name: mention["name"],
            type: mention["type"] || "concept",
            role: mention["role"],
            confidence: mention["confidence"]&.to_f || 1.0
          )
        end
      end

      # Parses the reply as JSON and returns the decoded array, or [] (with a
      # logged warning) when the reply is not valid JSON or not an array.
      def parse_json_array(response, label)
        parsed = JSON.parse(extract_json(response))
        return parsed if parsed.is_a?(Array)

        config.logger&.warn("LLM #{label} response was not a JSON array")
        []
      rescue JSON::ParserError => e
        config.logger&.warn("Failed to parse LLM #{label} response: #{e.message}")
        []
      end

      # Keeps only the entries that are hashes carrying a String under
      # +required_key+, logging each entry that is dropped.
      def well_formed_entries(entries, required_key, label)
        entries.select do |entry|
          usable = entry.is_a?(Hash) && entry[required_key].is_a?(String)
          config.logger&.warn("Skipping malformed LLM #{label} entry: #{entry.inspect}") unless usable
          usable
        end
      end

      # Returns the portion of the reply that should be JSON: markdown code
      # fences are removed, then the span from the first "[" to the last "]"
      # is taken when present.
      def extract_json(response)
        # Handle responses that may have markdown code blocks
        text = response.to_s.strip

        # Remove markdown code blocks if present
        if text.start_with?("```")
          text = text.sub(/\A```(?:json)?\n?/, "").sub(/\n?```\z/, "")
        end

        # Find JSON array in response
        array_match = text.match(/\[[\s\S]*\]/)
        array_match ? array_match[0] : text
      end
    end
  end
end
|