fact_db 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.envrc +1 -0
- data/CHANGELOG.md +48 -0
- data/COMMITS.md +196 -0
- data/README.md +102 -0
- data/Rakefile +41 -0
- data/db/migrate/001_enable_extensions.rb +7 -0
- data/db/migrate/002_create_contents.rb +44 -0
- data/db/migrate/003_create_entities.rb +36 -0
- data/db/migrate/004_create_entity_aliases.rb +18 -0
- data/db/migrate/005_create_facts.rb +65 -0
- data/db/migrate/006_create_entity_mentions.rb +18 -0
- data/db/migrate/007_create_fact_sources.rb +18 -0
- data/docs/api/extractors/index.md +71 -0
- data/docs/api/extractors/llm.md +162 -0
- data/docs/api/extractors/manual.md +92 -0
- data/docs/api/extractors/rule-based.md +165 -0
- data/docs/api/facts.md +300 -0
- data/docs/api/index.md +66 -0
- data/docs/api/models/content.md +165 -0
- data/docs/api/models/entity.md +202 -0
- data/docs/api/models/fact.md +270 -0
- data/docs/api/models/index.md +77 -0
- data/docs/api/pipeline/extraction.md +175 -0
- data/docs/api/pipeline/index.md +72 -0
- data/docs/api/pipeline/resolution.md +209 -0
- data/docs/api/services/content-service.md +166 -0
- data/docs/api/services/entity-service.md +202 -0
- data/docs/api/services/fact-service.md +223 -0
- data/docs/api/services/index.md +55 -0
- data/docs/architecture/database-schema.md +293 -0
- data/docs/architecture/entity-resolution.md +293 -0
- data/docs/architecture/index.md +149 -0
- data/docs/architecture/temporal-facts.md +268 -0
- data/docs/architecture/three-layer-model.md +242 -0
- data/docs/assets/css/custom.css +137 -0
- data/docs/assets/fact_db.jpg +0 -0
- data/docs/assets/images/fact_db.jpg +0 -0
- data/docs/concepts.md +183 -0
- data/docs/examples/basic-usage.md +235 -0
- data/docs/examples/hr-onboarding.md +312 -0
- data/docs/examples/index.md +64 -0
- data/docs/examples/news-analysis.md +288 -0
- data/docs/getting-started/database-setup.md +170 -0
- data/docs/getting-started/index.md +71 -0
- data/docs/getting-started/installation.md +98 -0
- data/docs/getting-started/quick-start.md +191 -0
- data/docs/guides/batch-processing.md +325 -0
- data/docs/guides/configuration.md +243 -0
- data/docs/guides/entity-management.md +364 -0
- data/docs/guides/extracting-facts.md +299 -0
- data/docs/guides/index.md +22 -0
- data/docs/guides/ingesting-content.md +252 -0
- data/docs/guides/llm-integration.md +299 -0
- data/docs/guides/temporal-queries.md +315 -0
- data/docs/index.md +121 -0
- data/examples/README.md +130 -0
- data/examples/basic_usage.rb +164 -0
- data/examples/entity_management.rb +216 -0
- data/examples/hr_system.rb +428 -0
- data/examples/rule_based_extraction.rb +258 -0
- data/examples/temporal_queries.rb +245 -0
- data/lib/fact_db/config.rb +71 -0
- data/lib/fact_db/database.rb +45 -0
- data/lib/fact_db/errors.rb +10 -0
- data/lib/fact_db/extractors/base.rb +117 -0
- data/lib/fact_db/extractors/llm_extractor.rb +179 -0
- data/lib/fact_db/extractors/manual_extractor.rb +53 -0
- data/lib/fact_db/extractors/rule_based_extractor.rb +228 -0
- data/lib/fact_db/llm/adapter.rb +109 -0
- data/lib/fact_db/models/content.rb +62 -0
- data/lib/fact_db/models/entity.rb +84 -0
- data/lib/fact_db/models/entity_alias.rb +26 -0
- data/lib/fact_db/models/entity_mention.rb +33 -0
- data/lib/fact_db/models/fact.rb +192 -0
- data/lib/fact_db/models/fact_source.rb +35 -0
- data/lib/fact_db/pipeline/extraction_pipeline.rb +146 -0
- data/lib/fact_db/pipeline/resolution_pipeline.rb +129 -0
- data/lib/fact_db/resolution/entity_resolver.rb +261 -0
- data/lib/fact_db/resolution/fact_resolver.rb +259 -0
- data/lib/fact_db/services/content_service.rb +93 -0
- data/lib/fact_db/services/entity_service.rb +150 -0
- data/lib/fact_db/services/fact_service.rb +193 -0
- data/lib/fact_db/temporal/query.rb +125 -0
- data/lib/fact_db/temporal/timeline.rb +134 -0
- data/lib/fact_db/version.rb +5 -0
- data/lib/fact_db.rb +141 -0
- data/mkdocs.yml +198 -0
- metadata +288 -0
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FactDb
|
|
4
|
+
module Resolution
|
|
5
|
+
# Resolves free-text names to Models::Entity records and maintains the
# entity set (merge, split, duplicate detection).
#
# Resolution is attempted in order: exact alias match, exact canonical
# name match, then fuzzy match (normalized Levenshtein similarity) gated
# by the configured fuzzy-match threshold.
class EntityResolver
  attr_reader :config

  # @param config [#fuzzy_match_threshold, #auto_merge_threshold]
  #   source of the two similarity thresholds used by this resolver
  def initialize(config = FactDb.config)
    @config = config
    @threshold = config.fuzzy_match_threshold
    @auto_merge_threshold = config.auto_merge_threshold
  end

  # Resolve a name to an entity.
  #
  # @param name [String, nil] the name to look up
  # @param type [Object, nil] when given, restricts matches to this entity_type
  # @return [ResolvedEntity, nil] the match, or nil when name is nil/empty
  #   or nothing matches at or above the fuzzy threshold
  def resolve(name, type: nil)
    return nil if name.nil? || name.empty?

    # 1. Exact alias match
    exact = find_by_exact_alias(name, type: type)
    return ResolvedEntity.new(exact, confidence: 1.0, match_type: :exact_alias) if exact

    # 2. Canonical name match
    canonical = find_by_canonical_name(name, type: type)
    return ResolvedEntity.new(canonical, confidence: 1.0, match_type: :canonical_name) if canonical

    # 3. Fuzzy matching
    fuzzy = find_by_fuzzy_match(name, type: type)
    return fuzzy if fuzzy && fuzzy.confidence >= @threshold

    # 4. No match found
    nil
  end

  # Resolve a name, creating a new entity when nothing matches.
  #
  # @param name [String]
  # @param type [Object] entity_type used for lookup and creation
  # @param aliases [Array] alias texts added to a newly created entity
  # @param attributes [Hash] attributes stored on a newly created entity
  # @return [Object] the matched or newly created entity record
  def resolve_or_create(name, type:, aliases: [], attributes: {})
    resolved = resolve(name, type: type)
    return resolved.entity if resolved

    create_entity(name, type: type, aliases: aliases, attributes: attributes)
  end

  # Merge one entity into another, keeping the first as canonical.
  #
  # Aliases and the canonical name of the merged entity become aliases of
  # the kept entity, entity mentions are re-pointed, and the merged entity
  # is marked with resolution_status "merged".
  #
  # @param keep_id [Object] id of the entity to keep
  # @param merge_id [Object] id of the entity to merge away
  # @return [Object] the reloaded kept entity
  # @raise [ResolutionError] when both ids refer to the same entity, or
  #   either entity is already merged
  def merge(keep_id, merge_id)
    keep = Models::Entity.find(keep_id)
    merge_entity = Models::Entity.find(merge_id)

    # Compare the loaded records' ids rather than the raw arguments so that
    # 1 and "1" are recognised as the same entity.
    raise ResolutionError, "Cannot merge entity into itself" if keep.id == merge_entity.id
    raise ResolutionError, "Cannot merge already merged entity" if merge_entity.merged?
    # Merging into a merged entity would attach data to a record that the
    # lookups in this class exclude.
    raise ResolutionError, "Cannot merge into an already merged entity" if keep.merged?

    Models::Entity.transaction do
      # Move all aliases to kept entity
      merge_entity.aliases.each do |alias_record|
        keep.aliases.find_or_create_by!(alias_text: alias_record.alias_text) do |new_alias|
          new_alias.alias_type = alias_record.alias_type
          new_alias.confidence = alias_record.confidence
        end
      end

      # Add the merged entity's canonical name as an alias
      keep.aliases.find_or_create_by!(alias_text: merge_entity.canonical_name) do |new_alias|
        new_alias.alias_type = "name"
        new_alias.confidence = 1.0
      end

      # Update all entity mentions to point to kept entity
      Models::EntityMention.where(entity_id: merge_entity.id).update_all(entity_id: keep.id)

      # Mark merged entity
      merge_entity.update!(
        resolution_status: "merged",
        merged_into_id: keep.id
      )
    end

    keep.reload
  end

  # Split an entity into multiple new entities and mark the original "split".
  #
  # @param entity_id [Object] id of the entity to split
  # @param split_configs [Array<Hash>] one hash per new entity with keys
  #   :name, and optionally :type (defaults to the original's entity_type),
  #   :aliases and :attributes
  # @return [Array] the newly created entities
  def split(entity_id, split_configs)
    original = Models::Entity.find(entity_id)

    Models::Entity.transaction do
      # Block parameter deliberately not named `config` to avoid shadowing
      # the attr_reader of the same name.
      new_entities = split_configs.map do |split_config|
        create_entity(
          split_config[:name],
          type: split_config[:type] || original.entity_type,
          aliases: split_config[:aliases] || [],
          attributes: split_config[:attributes] || {}
        )
      end

      original.update!(resolution_status: "split")

      new_entities
    end
  end

  # Find potential duplicate entities by pairwise canonical-name similarity.
  #
  # Every pair of resolved entities is compared, so the work grows
  # quadratically with the number of resolved entities.
  #
  # @param threshold [Numeric, nil] minimum similarity; defaults to the
  #   configured fuzzy-match threshold
  # @return [Array<Hash>] hashes with :entity1, :entity2 and :similarity,
  #   sorted by descending similarity
  def find_duplicates(threshold: nil)
    threshold ||= @threshold
    entities = Models::Entity.resolved.to_a

    duplicates = []
    entities.combination(2) do |entity, other|
      similarity = calculate_similarity(entity.canonical_name, other.canonical_name)
      next unless similarity >= threshold

      duplicates << { entity1: entity, entity2: other, similarity: similarity }
    end

    duplicates.sort_by { |duplicate| -duplicate[:similarity] }
  end

  # Auto-merge duplicates at or above the auto-merge threshold.
  #
  # For each pair the entity with more mentions is kept (ties keep entity1).
  #
  # @return [Array<Hash>] the duplicate pairs that were considered
  def auto_merge_duplicates!
    duplicates = find_duplicates(threshold: @auto_merge_threshold)

    # The records in `duplicates` were loaded before any merge happened, so
    # their merged? state goes stale as soon as the first merge runs. Track
    # what this run has merged away and skip later pairs that involve it.
    merged_ids = {}

    duplicates.each do |pair|
      first = pair[:entity1]
      second = pair[:entity2]

      next if merged_ids[first.id] || merged_ids[second.id]
      next if first.merged? || second.merged?

      # Keep the entity with more mentions
      keep, absorbed = if first.entity_mentions.count >= second.entity_mentions.count
                         [first, second]
                       else
                         [second, first]
                       end

      merge(keep.id, absorbed.id)
      merged_ids[absorbed.id] = true
    end
  end

  private

  # Case-insensitive exact match on alias text, excluding aliases whose
  # entity is marked "merged".
  def find_by_exact_alias(name, type:)
    scope = Models::EntityAlias
            .joins(:entity)
            .where(["LOWER(alias_text) = ?", name.downcase])
            .where.not(fact_db_entities: { resolution_status: "merged" })
    scope = scope.where(fact_db_entities: { entity_type: type }) if type
    scope.first&.entity
  end

  # Case-insensitive exact match on canonical_name among non-merged entities.
  def find_by_canonical_name(name, type:)
    scope = Models::Entity.where(["LOWER(canonical_name) = ?", name.downcase])
    scope = scope.where(entity_type: type) if type
    scope.not_merged.first
  end

  # Scan every non-merged candidate and return the best fuzzy match across
  # canonical names and aliases, or nil when the best score is below the
  # fuzzy threshold. The first entity to reach the best score wins ties.
  def find_by_fuzzy_match(name, type:)
    candidates = Models::Entity.not_merged
    candidates = candidates.where(entity_type: type) if type

    best_match = nil
    best_similarity = 0.0

    # Preload aliases per batch instead of issuing one query per entity.
    candidates.includes(:aliases).find_each do |entity|
      candidate_names = [entity.canonical_name, *entity.aliases.map(&:alias_text)]
      similarity = candidate_names.map { |candidate| calculate_similarity(name, candidate) }.max
      next unless similarity > best_similarity

      best_similarity = similarity
      best_match = entity
    end

    return nil if best_match.nil? || best_similarity < @threshold

    ResolvedEntity.new(best_match, confidence: best_similarity, match_type: :fuzzy)
  end

  # Create a "resolved" entity and attach the given aliases to it.
  def create_entity(name, type:, aliases: [], attributes: {})
    entity = Models::Entity.create!(
      canonical_name: name,
      entity_type: type,
      attributes: attributes,
      resolution_status: "resolved"
    )

    aliases.each { |alias_text| entity.add_alias(alias_text) }

    entity
  end

  # Case-insensitive similarity: 1.0 for equal strings, otherwise
  # 1 - (edit distance / length of the longer string).
  #
  # Lengths are taken from the downcased strings, the same strings the
  # distance is computed on, so the result stays within 0.0..1.0 even when
  # downcasing changes a string's length.
  def calculate_similarity(a, b)
    left = a.downcase
    right = b.downcase
    return 1.0 if left == right

    # The strings differ, so at least one is non-empty and max_len > 0.
    max_len = [left.length, right.length].max
    1.0 - (levenshtein_distance(left, right).to_f / max_len)
  end

  # Levenshtein edit distance between two strings using a single rolling
  # row. `row[i]` holds the distance between the first i characters of `a`
  # and the prefix of `b` processed so far.
  def levenshtein_distance(a, b)
    source = a.chars
    target = b.chars
    row = (0..source.length).to_a

    target.each_with_index do |target_char, j|
      diagonal = row[0]
      row[0] = j + 1

      source.each_with_index do |source_char, i|
        above = row[i + 1]
        row[i + 1] = if source_char == target_char
                       diagonal
                     else
                       # substitution, insertion, deletion
                       [diagonal, above, row[i]].min + 1
                     end
        diagonal = above
      end
    end

    row.last
  end
end
|
|
230
|
+
|
|
231
|
+
# Result of an entity resolution: the matched entity together with the
# confidence of the match and how the match was found.
class ResolvedEntity
  attr_reader :entity, :confidence, :match_type

  # @param entity [Object] the matched entity record
  # @param confidence [Numeric] match confidence
  # @param match_type [Symbol] how the match was found (EntityResolver
  #   passes :exact_alias, :canonical_name or :fuzzy)
  def initialize(entity, confidence:, match_type:)
    @entity = entity
    @confidence = confidence
    @match_type = match_type
  end

  # @return [Boolean] true when the confidence equals 1.0
  def exact_match?
    confidence == 1.0
  end

  # @return [Boolean] true when the match was found by fuzzy matching
  def fuzzy_match?
    match_type == :fuzzy
  end

  # @return [Object] id of the wrapped entity
  def id
    entity.id
  end

  # @return [Object] canonical_name of the wrapped entity
  def canonical_name
    entity.canonical_name
  end

  # @return [Object] entity_type of the wrapped entity
  def entity_type
    entity.entity_type
  end
end
|
|
260
|
+
end
|
|
261
|
+
end
|
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FactDb
|
|
4
|
+
module Resolution
|
|
5
|
+
# Maintains relationships between facts: superseding, synthesizing,
# corroborating, invalidating and detecting potential conflicts.
class FactResolver
  attr_reader :config

  # @param config [Object] configuration object, stored for readers
  def initialize(config = FactDb.config)
    @config = config
  end

  # Supersede an existing fact with a new one.
  #
  # The new fact inherits extraction_method, confidence and sources from
  # the old fact. The old fact is marked "superseded", linked to the new
  # fact, and its invalid_at is set to the new fact's valid_at.
  #
  # @param old_fact_id [Object] id of the fact being replaced
  # @param new_fact_text [String] text of the replacement fact
  # @param valid_at [Object] when the replacement becomes valid
  # @param mentions [Array<Hash>] optional mentions (:entity or :entity_id,
  #   :text, :role, :confidence); when empty the old fact's mentions are copied
  # @return [Object] the newly created fact
  # @raise [ResolutionError] when the old fact is already superseded
  def supersede(old_fact_id, new_fact_text, valid_at:, mentions: [])
    old_fact = Models::Fact.find(old_fact_id)

    raise ResolutionError, "Cannot supersede already superseded fact" if old_fact.superseded?

    Models::Fact.transaction do
      new_fact = Models::Fact.create!(
        fact_text: new_fact_text,
        valid_at: valid_at,
        status: "canonical",
        extraction_method: old_fact.extraction_method,
        confidence: old_fact.confidence
      )

      # Copy mentions from old fact if not provided
      if mentions.empty?
        old_fact.entity_mentions.each do |mention|
          new_fact.add_mention(
            entity: mention.entity,
            text: mention.mention_text,
            role: mention.mention_role,
            confidence: mention.confidence
          )
        end
      else
        attach_mentions(new_fact, mentions)
      end

      # Copy sources from old fact
      old_fact.fact_sources.each do |source|
        new_fact.add_source(
          content: source.content,
          type: source.source_type,
          excerpt: source.excerpt,
          confidence: source.confidence
        )
      end

      # Mark old fact as superseded
      old_fact.update!(
        status: "superseded",
        superseded_by_id: new_fact.id,
        invalid_at: valid_at
      )

      new_fact
    end
  end

  # Synthesize a new fact from multiple source facts.
  #
  # The new fact has status "synthesized", records the given source ids in
  # derived_from_ids, takes the mean confidence of the found source facts,
  # and links every source of every source fact as "supporting".
  #
  # @param source_fact_ids [Array] ids of the facts to derive from
  # @param synthesized_text [String] text of the synthesized fact
  # @param valid_at [Object]
  # @param invalid_at [Object, nil]
  # @param mentions [Array<Hash>] optional mentions; when empty, mentions
  #   are aggregated from the source facts
  # @return [Object] the synthesized fact
  # @raise [ResolutionError] when none of the ids match a fact
  def synthesize(source_fact_ids, synthesized_text, valid_at:, invalid_at: nil, mentions: [])
    # Load once; the records are iterated several times below.
    source_facts = Models::Fact.where(id: source_fact_ids).to_a

    raise ResolutionError, "No source facts found" if source_facts.empty?

    Models::Fact.transaction do
      synthesized = Models::Fact.create!(
        fact_text: synthesized_text,
        valid_at: valid_at,
        invalid_at: invalid_at,
        status: "synthesized",
        derived_from_ids: source_fact_ids,
        extraction_method: "synthesized",
        confidence: calculate_synthesized_confidence(source_facts)
      )

      # Aggregate entity mentions from source facts if not provided
      if mentions.empty?
        aggregate_mentions(source_facts).each do |mention|
          synthesized.add_mention(**mention)
        end
      else
        attach_mentions(synthesized, mentions)
      end

      # Link all source content
      source_facts.each do |source_fact|
        source_fact.fact_sources.each do |source|
          synthesized.add_source(
            content: source.content,
            type: "supporting",
            excerpt: source.excerpt,
            confidence: source.confidence
          )
        end
      end

      synthesized
    end
  end

  # Mark a fact as corroborated by another fact.
  #
  # The corroborating fact's id is added to corroborated_by_ids (without
  # duplicates). A "canonical" fact with at least two corroborating facts
  # is promoted to status "corroborated". Both changes are written in a
  # single update.
  #
  # @param fact_id [Object] id of the fact being corroborated
  # @param corroborating_fact_id [Object] id of the supporting fact
  # @return [Object] the updated fact
  # @raise [ResolutionError] when both ids refer to the same fact
  def corroborate(fact_id, corroborating_fact_id)
    fact = Models::Fact.find(fact_id)
    corroborating = Models::Fact.find(corroborating_fact_id)

    # Compare and store the loaded records' ids so that 1 and "1" are
    # treated as the same fact and de-duplicated correctly.
    raise ResolutionError, "Cannot corroborate with same fact" if fact.id == corroborating.id

    corroborated_ids = (Array(fact.corroborated_by_ids) + [corroborating.id]).uniq
    changes = { corroborated_by_ids: corroborated_ids }
    changes[:status] = "corroborated" if fact.status == "canonical" && corroborated_ids.size >= 2

    fact.update!(changes)
    fact
  end

  # Invalidate a fact without replacement.
  #
  # @param fact_id [Object]
  # @param at [Object] the invalidation time (defaults to the current time)
  # @return [Object] the updated fact
  def invalidate(fact_id, at: Time.current)
    fact = Models::Fact.find(fact_id)
    fact.update!(invalid_at: at)
    fact
  end

  # Find potentially conflicting facts.
  #
  # Compares every pair of canonical, currently valid facts (optionally
  # narrowed by entity and/or topic) and reports pairs whose word-overlap
  # similarity lies strictly between the two bounds.
  #
  # @param entity_id [Object, nil] restrict to facts mentioning this entity
  # @param topic [String, nil] restrict to facts matching this text search
  # @param min_similarity [Numeric] exclusive lower bound
  # @param max_similarity [Numeric] exclusive upper bound
  # @return [Array<Hash>] hashes with :fact1, :fact2 and :similarity,
  #   sorted by descending similarity
  def find_conflicts(entity_id: nil, topic: nil, min_similarity: 0.5, max_similarity: 0.95)
    scope = Models::Fact.canonical.currently_valid
    scope = scope.mentioning_entity(entity_id) if entity_id
    scope = scope.search_text(topic) if topic

    conflicts = []
    scope.to_a.combination(2) do |fact, other|
      similarity = text_similarity(fact.fact_text, other.fact_text)
      next unless similarity > min_similarity && similarity < max_similarity

      conflicts << { fact1: fact, fact2: other, similarity: similarity }
    end

    conflicts.sort_by { |conflict| -conflict[:similarity] }
  end

  # Resolve conflicts by keeping one fact and superseding others.
  #
  # @param keep_fact_id [Object] id of the fact to keep
  # @param supersede_fact_ids [Array] ids of the facts to supersede
  # @param reason [Object, nil] stored under :supersede_reason in each
  #   superseded fact's metadata
  # @return [Object] the kept fact
  # @raise [ResolutionError] when the kept fact is also listed for
  #   superseding (all changes are rolled back)
  def resolve_conflict(keep_fact_id, supersede_fact_ids, reason: nil)
    # Look the kept fact up first so a bad id fails before anything is
    # modified.
    keep = Models::Fact.find(keep_fact_id)
    # One timestamp for the whole batch.
    superseded_at = Time.current

    Models::Fact.transaction do
      supersede_fact_ids.each do |fact_id|
        fact = Models::Fact.find(fact_id)
        raise ResolutionError, "Cannot supersede a fact with itself" if fact.id == keep.id

        fact.update!(
          status: "superseded",
          superseded_by_id: keep.id,
          invalid_at: superseded_at,
          metadata: (fact.metadata || {}).merge(supersede_reason: reason)
        )
      end
    end

    keep
  end

  # Build a timeline fact from point-in-time facts.
  #
  # Synthesizes one fact spanning from the earliest valid_at to the latest
  # invalid_at among the facts mentioning the entity (optionally filtered
  # by topic).
  #
  # @param entity_id [Object]
  # @param topic [String, nil]
  # @return [Object, nil] the synthesized fact, or nil when no facts match
  def build_timeline_fact(entity_id:, topic: nil)
    facts = Models::Fact.mentioning_entity(entity_id)
    facts = facts.search_text(topic) if topic
    facts = facts.order(valid_at: :asc).to_a

    return nil if facts.empty?

    # Find start and end dates
    start_date = facts.first.valid_at
    # NOTE(review): the end date is the latest invalid_at even when some
    # facts are still open-ended (invalid_at nil) - confirm this is intended.
    end_date = facts.map(&:invalid_at).compact.max

    entity = Models::Entity.find(entity_id)
    synthesized_text = "#{entity.canonical_name}: #{topic || 'timeline'} from #{start_date.to_date}"
    synthesized_text += " to #{end_date.to_date}" if end_date

    synthesize(
      facts.map(&:id),
      synthesized_text,
      valid_at: start_date,
      invalid_at: end_date
    )
  end

  private

  # Attach caller-supplied mention hashes to a fact. Each hash provides
  # :entity or :entity_id, plus :text, :role and optionally :confidence
  # (defaults to 1.0).
  def attach_mentions(fact, mentions)
    mentions.each do |mention|
      entity = mention[:entity] || Models::Entity.find(mention[:entity_id])
      fact.add_mention(
        entity: entity,
        text: mention[:text],
        role: mention[:role],
        confidence: mention[:confidence] || 1.0
      )
    end
  end

  # Arithmetic mean of the source facts' confidences. The sum is converted
  # to a Float first so integer confidences are not truncated by integer
  # division.
  def calculate_synthesized_confidence(source_facts)
    confidences = source_facts.map(&:confidence)
    confidences.sum.to_f / confidences.size
  end

  # Collect one mention per (entity_id, mention_role) pair across the
  # source facts, keeping the mention with the highest confidence.
  def aggregate_mentions(source_facts)
    best_by_key = {}

    source_facts.each do |fact|
      fact.entity_mentions.each do |mention|
        key = [mention.entity_id, mention.mention_role]
        current = best_by_key[key]
        next unless current.nil? || mention.confidence > current[:confidence]

        best_by_key[key] = {
          entity: mention.entity,
          text: mention.mention_text,
          role: mention.mention_role,
          confidence: mention.confidence
        }
      end
    end

    best_by_key.values
  end

  # Jaccard similarity over the distinct lower-cased, whitespace-separated
  # words of the two texts; 0.0 when either text has no words.
  def text_similarity(text1, text2)
    words1 = text1.downcase.split
    words2 = text2.downcase.split

    return 0.0 if words1.empty? || words2.empty?

    shared = words1 & words2
    combined = words1 | words2

    shared.size.to_f / combined.size
  end
end
|
|
258
|
+
end
|
|
259
|
+
end
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FactDb
|
|
4
|
+
module Services
|
|
5
|
+
# Service for storing and querying Models::Content records.
class ContentService
  attr_reader :config

  # @param config [#embedding_generator, #logger] configuration; the
  #   embedding generator (when set) is called with the text to embed
  def initialize(config = FactDb.config)
    @config = config
  end

  # Store a piece of content, de-duplicated by the SHA-256 of its text.
  #
  # @param raw_text [String] the content text
  # @param type [#to_s] stored as content_type
  # @param captured_at [Object] capture time (defaults to the current time)
  # @param metadata [Hash] stored as source_metadata
  # @param title [String, nil]
  # @param source_uri [String, nil]
  # @return [Object] the existing record with the same hash, or the newly
  #   created record
  def create(raw_text, type:, captured_at: Time.current, metadata: {}, title: nil, source_uri: nil)
    content_hash = Digest::SHA256.hexdigest(raw_text)

    # Check for duplicate content
    existing = Models::Content.find_by(content_hash: content_hash)
    return existing if existing

    Models::Content.create!(
      raw_text: raw_text,
      content_hash: content_hash,
      content_type: type.to_s,
      title: title,
      source_uri: source_uri,
      source_metadata: metadata,
      captured_at: captured_at,
      embedding: generate_embedding(raw_text)
    )
  end

  # @return [Object] the content record with the given id
  def find(id)
    Models::Content.find(id)
  end

  # @return [Object, nil] the content record with the given content hash
  def find_by_hash(hash)
    Models::Content.find_by(content_hash: hash)
  end

  # Text search with optional type and capture-time filters, newest first.
  #
  # @param query [String]
  # @param type [Object, nil]
  # @param from [Object, nil] only content captured after this time
  # @param to [Object, nil] only content captured before this time
  # @param limit [Integer]
  def search(query, type: nil, from: nil, to: nil, limit: 20)
    scope = Models::Content.search_text(query)
    scope = scope.by_type(type) if type
    scope = scope.captured_after(from) if from
    scope = scope.captured_before(to) if to
    scope.order(captured_at: :desc).limit(limit)
  end

  # Nearest-neighbour search on the query's embedding. Returns an empty
  # relation when no embedding could be generated.
  def semantic_search(query, limit: 20)
    embedding = generate_embedding(query)
    return Models::Content.none unless embedding

    Models::Content.nearest_neighbors(embedding, limit: limit)
  end

  # Content of the given type, newest first, optionally limited.
  def by_type(type, limit: nil)
    scope = Models::Content.by_type(type).order(captured_at: :desc)
    scope = scope.limit(limit) if limit
    scope
  end

  # Content captured between the two times, oldest first.
  def between(from, to)
    Models::Content.captured_between(from, to).order(captured_at: :asc)
  end

  # The most recently captured content.
  def recent(limit: 10)
    Models::Content.order(captured_at: :desc).limit(limit)
  end

  # Aggregate statistics over all content.
  #
  # @return [Hash] :total and :total_count (the same value), :by_type
  #   counts, :earliest / :latest captured_at, and :total_words
  def stats
    # Counted once and reported under both keys, so callers of either key
    # keep working without a second COUNT query.
    total = Models::Content.count

    {
      total: total,
      total_count: total,
      by_type: Models::Content.group(:content_type).count,
      earliest: Models::Content.minimum(:captured_at),
      latest: Models::Content.maximum(:captured_at),
      total_words: Models::Content.sum("array_length(regexp_split_to_array(raw_text, '\\s+'), 1)")
    }
  end

  private

  # Generate an embedding via the configured generator. Returns nil when
  # no generator is configured, or when the generator raises (the failure
  # is logged as a warning and search/storage proceed without an embedding).
  def generate_embedding(text)
    return nil unless config.embedding_generator

    config.embedding_generator.call(text)
  rescue StandardError => e
    config.logger&.warn("Failed to generate embedding: #{e.message}")
    nil
  end
end
|
|
92
|
+
end
|
|
93
|
+
end
|