fact_db 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.envrc +2 -0
- data/.yardopts +5 -0
- data/CHANGELOG.md +64 -0
- data/README.md +107 -6
- data/Rakefile +243 -10
- data/db/migrate/001_enable_extensions.rb +1 -0
- data/db/migrate/002_create_sources.rb +49 -0
- data/db/migrate/003_create_entities.rb +27 -15
- data/db/migrate/004_create_entity_aliases.rb +20 -7
- data/db/migrate/005_create_facts.rb +37 -21
- data/db/migrate/006_create_entity_mentions.rb +14 -6
- data/db/migrate/007_create_fact_sources.rb +16 -8
- data/docs/api/extractors/index.md +5 -5
- data/docs/api/extractors/llm.md +17 -17
- data/docs/api/extractors/rule-based.md +14 -14
- data/docs/api/facts.md +20 -20
- data/docs/api/index.md +4 -4
- data/docs/api/models/entity.md +21 -21
- data/docs/api/models/fact.md +15 -15
- data/docs/api/models/index.md +7 -7
- data/docs/api/models/{content.md → source.md} +29 -29
- data/docs/api/pipeline/extraction.md +25 -25
- data/docs/api/pipeline/index.md +1 -1
- data/docs/api/pipeline/resolution.md +4 -4
- data/docs/api/services/entity-service.md +20 -20
- data/docs/api/services/fact-service.md +12 -12
- data/docs/api/services/index.md +5 -5
- data/docs/api/services/{content-service.md → source-service.md} +27 -27
- data/docs/architecture/database-schema.md +46 -46
- data/docs/architecture/entity-resolution.md +6 -6
- data/docs/architecture/index.md +10 -10
- data/docs/architecture/temporal-facts.md +5 -5
- data/docs/architecture/three-layer-model.md +17 -17
- data/docs/concepts.md +6 -6
- data/docs/examples/basic-usage.md +20 -20
- data/docs/examples/hr-onboarding.md +17 -17
- data/docs/examples/index.md +4 -4
- data/docs/examples/news-analysis.md +23 -23
- data/docs/getting-started/database-setup.md +28 -20
- data/docs/getting-started/index.md +3 -3
- data/docs/getting-started/quick-start.md +33 -30
- data/docs/guides/batch-processing.md +26 -26
- data/docs/guides/configuration.md +158 -77
- data/docs/guides/entity-management.md +14 -14
- data/docs/guides/extracting-facts.md +28 -28
- data/docs/guides/ingesting-content.md +14 -14
- data/docs/guides/llm-integration.md +40 -32
- data/docs/guides/temporal-queries.md +11 -11
- data/docs/index.md +6 -2
- data/examples/.envrc +4 -0
- data/examples/.gitignore +1 -0
- data/examples/001_configuration.rb +312 -0
- data/examples/{basic_usage.rb → 010_basic_usage.rb} +47 -56
- data/examples/{entity_management.rb → 020_entity_management.rb} +57 -72
- data/examples/{temporal_queries.rb → 030_temporal_queries.rb} +39 -59
- data/examples/040_output_formats.rb +177 -0
- data/examples/{rule_based_extraction.rb → 050_rule_based_extraction.rb} +39 -45
- data/examples/060_fluent_temporal_api.rb +217 -0
- data/examples/070_introspection.rb +252 -0
- data/examples/{hr_system.rb → 080_hr_system.rb} +56 -75
- data/examples/090_ingest_demo.rb +515 -0
- data/examples/100_query_context.rb +668 -0
- data/examples/110_prove_it.rb +204 -0
- data/examples/120_dump_database.rb +358 -0
- data/examples/130_rag_feedback_loop.rb +858 -0
- data/examples/README.md +229 -15
- data/examples/data/lincoln_associates.md +201 -0
- data/examples/data/lincoln_biography.md +66 -0
- data/examples/data/lincoln_cabinet.md +243 -0
- data/examples/data/lincoln_family.md +163 -0
- data/examples/data/lincoln_military.md +241 -0
- data/examples/data/lincoln_todd_family.md +136 -0
- data/examples/ingest_reporter.rb +335 -0
- data/examples/utilities.rb +182 -0
- data/lib/fact_db/config/defaults.yml +254 -0
- data/lib/fact_db/config.rb +94 -35
- data/lib/fact_db/database.rb +98 -8
- data/lib/fact_db/extractors/base.rb +106 -21
- data/lib/fact_db/extractors/llm_extractor.rb +35 -63
- data/lib/fact_db/extractors/manual_extractor.rb +46 -6
- data/lib/fact_db/extractors/rule_based_extractor.rb +136 -25
- data/lib/fact_db/llm/adapter.rb +3 -3
- data/lib/fact_db/models/entity.rb +94 -22
- data/lib/fact_db/models/entity_alias.rb +41 -7
- data/lib/fact_db/models/entity_mention.rb +34 -1
- data/lib/fact_db/models/fact.rb +259 -28
- data/lib/fact_db/models/fact_source.rb +43 -9
- data/lib/fact_db/models/source.rb +113 -0
- data/lib/fact_db/pipeline/extraction_pipeline.rb +35 -35
- data/lib/fact_db/pipeline/resolution_pipeline.rb +5 -5
- data/lib/fact_db/query_result.rb +202 -0
- data/lib/fact_db/resolution/entity_resolver.rb +139 -39
- data/lib/fact_db/resolution/fact_resolver.rb +86 -14
- data/lib/fact_db/services/entity_service.rb +246 -37
- data/lib/fact_db/services/fact_service.rb +254 -17
- data/lib/fact_db/services/source_service.rb +164 -0
- data/lib/fact_db/temporal/query.rb +71 -7
- data/lib/fact_db/temporal/query_builder.rb +69 -0
- data/lib/fact_db/temporal/timeline.rb +102 -11
- data/lib/fact_db/transformers/base.rb +77 -0
- data/lib/fact_db/transformers/cypher_transformer.rb +185 -0
- data/lib/fact_db/transformers/json_transformer.rb +17 -0
- data/lib/fact_db/transformers/raw_transformer.rb +35 -0
- data/lib/fact_db/transformers/text_transformer.rb +114 -0
- data/lib/fact_db/transformers/triple_transformer.rb +138 -0
- data/lib/fact_db/validation/alias_filter.rb +185 -0
- data/lib/fact_db/version.rb +1 -1
- data/lib/fact_db.rb +281 -30
- data/mkdocs.yml +2 -2
- metadata +60 -16
- data/db/migrate/002_create_contents.rb +0 -44
- data/lib/fact_db/models/content.rb +0 -62
- data/lib/fact_db/services/content_service.rb +0 -93
|
@@ -2,20 +2,61 @@
|
|
|
2
2
|
|
|
3
3
|
module FactDb
|
|
4
4
|
module Services
|
|
5
|
+
# Service class for managing facts in the database
|
|
6
|
+
#
|
|
7
|
+
# Provides methods for creating, querying, and manipulating facts
|
|
8
|
+
# including temporal queries, semantic search, and conflict resolution.
|
|
9
|
+
#
|
|
10
|
+
# @example Basic usage
|
|
11
|
+
# service = FactService.new
|
|
12
|
+
# fact = service.create("John works at Acme", valid_at: Date.today)
|
|
13
|
+
#
|
|
5
14
|
class FactService
|
|
6
|
-
|
|
15
|
+
# @return [FactDb::Config] the configuration object
|
|
16
|
+
attr_reader :config
|
|
7
17
|
|
|
18
|
+
# @return [FactDb::Resolution::FactResolver] the fact resolver instance
|
|
19
|
+
attr_reader :resolver
|
|
20
|
+
|
|
21
|
+
# @return [FactDb::Services::EntityService] the entity service instance
|
|
22
|
+
attr_reader :entity_service
|
|
23
|
+
|
|
24
|
+
# Initializes a new FactService instance
|
|
25
|
+
#
|
|
26
|
+
# @param config [FactDb::Config] configuration object (defaults to FactDb.config)
|
|
8
27
|
def initialize(config = FactDb.config)
  # The resolver and the entity service are both built from the same
  # configuration object that this service keeps.
  @config = config
  @resolver = Resolution::FactResolver.new(@config)
  @entity_service = EntityService.new(@config)
end
|
|
13
32
|
|
|
14
|
-
|
|
33
|
+
# Creates a new fact in the database
|
|
34
|
+
#
|
|
35
|
+
# @param text [String] the fact text content
|
|
36
|
+
# @param valid_at [Date, Time] when the fact became valid
|
|
37
|
+
# @param invalid_at [Date, Time, nil] when the fact became invalid (nil if still valid)
|
|
38
|
+
# @param status [Symbol] fact status (:canonical, :superseded, :synthesized)
|
|
39
|
+
# @param source_id [Integer, nil] ID of the source document
|
|
40
|
+
# @param mentions [Array<Hash>] entity mentions with :name, :kind, :role, :confidence keys
|
|
41
|
+
# @param extraction_method [Symbol] how the fact was extracted (:manual, :llm, :rule_based)
|
|
42
|
+
# @param confidence [Float] confidence score from 0.0 to 1.0
|
|
43
|
+
# @param metadata [Hash] additional metadata for the fact
|
|
44
|
+
# @return [FactDb::Models::Fact] the created fact
|
|
45
|
+
#
|
|
46
|
+
# @example Create a fact with mentions
|
|
47
|
+
# service.create(
|
|
48
|
+
# "John works at Acme Corp",
|
|
49
|
+
# valid_at: Date.parse("2024-01-15"),
|
|
50
|
+
# mentions: [
|
|
51
|
+
# { name: "John", kind: :person, role: :subject },
|
|
52
|
+
# { name: "Acme Corp", kind: :organization, role: :object }
|
|
53
|
+
# ]
|
|
54
|
+
# )
|
|
55
|
+
def create(text, valid_at:, invalid_at: nil, status: :canonical, source_id: nil, mentions: [], extraction_method: :manual, confidence: 1.0, metadata: {})
|
|
15
56
|
embedding = generate_embedding(text)
|
|
16
57
|
|
|
17
58
|
fact = Models::Fact.create!(
|
|
18
|
-
|
|
59
|
+
text: text,
|
|
19
60
|
valid_at: valid_at,
|
|
20
61
|
invalid_at: invalid_at,
|
|
21
62
|
status: status.to_s,
|
|
@@ -25,10 +66,10 @@ module FactDb
|
|
|
25
66
|
embedding: embedding
|
|
26
67
|
)
|
|
27
68
|
|
|
28
|
-
# Link to source
|
|
29
|
-
if
|
|
30
|
-
|
|
31
|
-
fact.add_source(
|
|
69
|
+
# Link to source
|
|
70
|
+
if source_id
|
|
71
|
+
source = Models::Source.find(source_id)
|
|
72
|
+
fact.add_source(source: source, kind: "primary")
|
|
32
73
|
end
|
|
33
74
|
|
|
34
75
|
# Add entity mentions
|
|
@@ -45,17 +86,65 @@ module FactDb
|
|
|
45
86
|
fact
|
|
46
87
|
end
|
|
47
88
|
|
|
89
|
+
# Finds an existing fact or creates a new one
|
|
90
|
+
#
|
|
91
|
+
# Uses a SHA256 digest of the text and valid_at date to find duplicates.
|
|
92
|
+
#
|
|
93
|
+
# @param text [String] the fact text content
|
|
94
|
+
# @param valid_at [Date, Time] when the fact became valid
|
|
95
|
+
# @param invalid_at [Date, Time, nil] when the fact became invalid
|
|
96
|
+
# @param status [Symbol] fact status
|
|
97
|
+
# @param source_id [Integer, nil] ID of the source document
|
|
98
|
+
# @param mentions [Array<Hash>] entity mentions
|
|
99
|
+
# @param extraction_method [Symbol] extraction method used
|
|
100
|
+
# @param confidence [Float] confidence score
|
|
101
|
+
# @param metadata [Hash] additional metadata
|
|
102
|
+
# @return [FactDb::Models::Fact] the found or created fact
|
|
103
|
+
def find_or_create(text, valid_at:, invalid_at: nil, status: :canonical, source_id: nil, mentions: [], extraction_method: :manual, confidence: 1.0, metadata: {})
  # A stored fact counts as a duplicate when both the SHA256 digest of its
  # text and its valid_at match the arguments.
  text_digest = Digest::SHA256.hexdigest(text)
  duplicate = Models::Fact.find_by(digest: text_digest, valid_at: valid_at)

  # Nothing is created (and no other argument is applied) when a duplicate exists.
  duplicate || create(
    text,
    valid_at: valid_at,
    invalid_at: invalid_at,
    status: status,
    source_id: source_id,
    mentions: mentions,
    extraction_method: extraction_method,
    confidence: confidence,
    metadata: metadata
  )
end
|
|
121
|
+
|
|
122
|
+
# Finds a fact by ID
|
|
123
|
+
#
|
|
124
|
+
# @param id [Integer] the fact ID
|
|
125
|
+
# @return [FactDb::Models::Fact] the found fact
|
|
126
|
+
# @raise [ActiveRecord::RecordNotFound] if fact not found
|
|
48
127
|
def find(id)
  # Plain primary-key lookup on the Fact model.
  fact_model = Models::Fact
  fact_model.find(id)
end
|
|
51
130
|
|
|
52
|
-
|
|
53
|
-
|
|
131
|
+
# Extracts facts from a source document
|
|
132
|
+
#
|
|
133
|
+
# Uses the configured extractor to parse the source content and create facts.
|
|
134
|
+
#
|
|
135
|
+
# @param source_id [Integer] ID of the source to extract from
|
|
136
|
+
# @param extractor [Symbol] extractor type (:manual, :llm, :rule_based)
|
|
137
|
+
# @return [Array<FactDb::Models::Fact>] array of created facts
|
|
138
|
+
#
|
|
139
|
+
# @example Extract facts using LLM
|
|
140
|
+
# facts = service.extract_from_source(source.id, extractor: :llm)
|
|
141
|
+
def extract_from_source(source_id, extractor: config.default_extractor)
|
|
142
|
+
source = Models::Source.find(source_id)
|
|
54
143
|
extractor_instance = Extractors::Base.for(extractor, config)
|
|
55
144
|
|
|
56
145
|
extracted = extractor_instance.extract(
|
|
57
|
-
content
|
|
58
|
-
{ captured_at:
|
|
146
|
+
source.content,
|
|
147
|
+
{ captured_at: source.captured_at }
|
|
59
148
|
)
|
|
60
149
|
|
|
61
150
|
extracted.map do |fact_data|
|
|
@@ -63,7 +152,7 @@ module FactDb
|
|
|
63
152
|
fact_data[:text],
|
|
64
153
|
valid_at: fact_data[:valid_at],
|
|
65
154
|
invalid_at: fact_data[:invalid_at],
|
|
66
|
-
|
|
155
|
+
source_id: source_id,
|
|
67
156
|
mentions: fact_data[:mentions],
|
|
68
157
|
extraction_method: fact_data[:extraction_method] || extractor,
|
|
69
158
|
confidence: fact_data[:confidence] || 1.0,
|
|
@@ -72,6 +161,20 @@ module FactDb
|
|
|
72
161
|
end
|
|
73
162
|
end
|
|
74
163
|
|
|
164
|
+
# Alias for backward compatibility
|
|
165
|
+
alias extract_from_content extract_from_source
|
|
166
|
+
|
|
167
|
+
# Queries facts with filtering options
|
|
168
|
+
#
|
|
169
|
+
# @param topic [String, nil] topic to search for in fact text
|
|
170
|
+
# @param at [Date, Time, nil] point in time for temporal query
|
|
171
|
+
# @param entity [Integer, nil] entity ID to filter by
|
|
172
|
+
# @param status [Symbol] fact status filter (:canonical, :superseded, :all)
|
|
173
|
+
# @param limit [Integer, nil] maximum number of results
|
|
174
|
+
# @return [ActiveRecord::Relation] matching facts
|
|
175
|
+
#
|
|
176
|
+
# @example Query facts about a topic at a specific date
|
|
177
|
+
# service.query(topic: "employment", at: Date.parse("2024-01-15"))
|
|
75
178
|
def query(topic: nil, at: nil, entity: nil, status: :canonical, limit: nil)
|
|
76
179
|
Temporal::Query.new.execute(
|
|
77
180
|
topic: topic,
|
|
@@ -82,40 +185,107 @@ module FactDb
|
|
|
82
185
|
)
|
|
83
186
|
end
|
|
84
187
|
|
|
188
|
+
# Returns currently valid facts
|
|
189
|
+
#
|
|
190
|
+
# @param entity [Integer, nil] entity ID to filter by
|
|
191
|
+
# @param topic [String, nil] topic to search for
|
|
192
|
+
# @param limit [Integer, nil] maximum number of results
|
|
193
|
+
# @return [ActiveRecord::Relation] currently valid canonical facts
|
|
85
194
|
def current_facts(entity: nil, topic: nil, limit: nil)
  # No point in time is supplied (at: nil) and only canonical facts are requested.
  filters = { topic: topic, entity: entity, at: nil, status: :canonical, limit: limit }
  query(**filters)
end
|
|
88
197
|
|
|
198
|
+
# Returns facts valid at a specific date
|
|
199
|
+
#
|
|
200
|
+
# @param date [Date, Time] the point in time
|
|
201
|
+
# @param entity [Integer, nil] entity ID to filter by
|
|
202
|
+
# @param topic [String, nil] topic to search for
|
|
203
|
+
# @return [ActiveRecord::Relation] facts valid at the given date
|
|
89
204
|
def facts_at(date, entity: nil, topic: nil)
  # Same lookup as #query, pinned to the given date and to canonical facts.
  filters = { topic: topic, entity: entity, at: date, status: :canonical }
  query(**filters)
end
|
|
92
207
|
|
|
208
|
+
# Builds a timeline of facts for an entity
|
|
209
|
+
#
|
|
210
|
+
# @param entity_id [Integer] the entity ID
|
|
211
|
+
# @param from [Date, Time, nil] start of timeline range
|
|
212
|
+
# @param to [Date, Time, nil] end of timeline range
|
|
213
|
+
# @return [FactDb::Temporal::Timeline] timeline of facts
|
|
214
|
+
#
|
|
215
|
+
# @example Get timeline for past year
|
|
216
|
+
# service.timeline(entity_id: 1, from: 1.year.ago, to: Date.today)
|
|
93
217
|
def timeline(entity_id:, from: nil, to: nil)
  # A fresh Timeline object is used for every call.
  builder = Temporal::Timeline.new
  builder.build(entity_id: entity_id, from: from, to: to)
end
|
|
96
220
|
|
|
97
|
-
|
|
98
|
-
|
|
221
|
+
# Supersedes an old fact with new information
|
|
222
|
+
#
|
|
223
|
+
# Marks the old fact as superseded and creates a new canonical fact.
|
|
224
|
+
#
|
|
225
|
+
# @param old_fact_id [Integer] ID of the fact to supersede
|
|
226
|
+
# @param new_text [String] the updated fact text
|
|
227
|
+
# @param valid_at [Date, Time] when the new fact became valid
|
|
228
|
+
# @param mentions [Array<Hash>] entity mentions for the new fact
|
|
229
|
+
# @return [FactDb::Models::Fact] the new fact
|
|
230
|
+
def supersede(old_fact_id, new_text, valid_at:, mentions: [])
  # The fact resolver does the work; arguments are forwarded unchanged.
  options = { valid_at: valid_at, mentions: mentions }
  @resolver.supersede(old_fact_id, new_text, **options)
end
|
|
100
233
|
|
|
234
|
+
# Synthesizes multiple facts into a single summary fact
|
|
235
|
+
#
|
|
236
|
+
# @param source_fact_ids [Array<Integer>] IDs of facts to synthesize
|
|
237
|
+
# @param synthesized_text [String] the synthesized summary text
|
|
238
|
+
# @param valid_at [Date, Time] when the synthesis is valid from
|
|
239
|
+
# @param invalid_at [Date, Time, nil] when the synthesis becomes invalid
|
|
240
|
+
# @param mentions [Array<Hash>] entity mentions for the synthesized fact
|
|
241
|
+
# @return [FactDb::Models::Fact] the synthesized fact
|
|
101
242
|
def synthesize(source_fact_ids, synthesized_text, valid_at:, invalid_at: nil, mentions: [])
  # The fact resolver does the work; arguments are forwarded unchanged.
  options = { valid_at: valid_at, invalid_at: invalid_at, mentions: mentions }
  @resolver.synthesize(source_fact_ids, synthesized_text, **options)
end
|
|
104
245
|
|
|
246
|
+
# Invalidates a fact at a specific time
|
|
247
|
+
#
|
|
248
|
+
# @param fact_id [Integer] ID of the fact to invalidate
|
|
249
|
+
# @param at [Time] when the fact became invalid (defaults to now)
|
|
250
|
+
# @return [FactDb::Models::Fact] the invalidated fact
|
|
105
251
|
def invalidate(fact_id, at: Time.current)
  # The fact resolver does the work; `at` is forwarded unchanged.
  options = { at: at }
  @resolver.invalidate(fact_id, **options)
end
|
|
108
254
|
|
|
255
|
+
# Links a corroborating fact to support another fact
|
|
256
|
+
#
|
|
257
|
+
# @param fact_id [Integer] ID of the fact being corroborated
|
|
258
|
+
# @param corroborating_fact_id [Integer] ID of the supporting fact
|
|
259
|
+
# @return [FactDb::Models::Fact] the updated fact
|
|
109
260
|
def corroborate(fact_id, corroborating_fact_id)
  # Both IDs go to the fact resolver in the same order they were given.
  fact_pair = [fact_id, corroborating_fact_id]
  @resolver.corroborate(*fact_pair)
end
|
|
112
263
|
|
|
264
|
+
# Searches facts using full-text search
|
|
265
|
+
#
|
|
266
|
+
# @param query [String] the search query
|
|
267
|
+
# @param entity [Integer, nil] entity ID to filter by
|
|
268
|
+
# @param status [Symbol] fact status filter
|
|
269
|
+
# @param limit [Integer] maximum number of results
|
|
270
|
+
# @return [ActiveRecord::Relation] matching facts
|
|
113
271
|
def search(query, entity: nil, status: :canonical, limit: 20)
  # Text match first, then the shared entity/status filters.
  matches = Models::Fact.search_text(query)
  filtered = apply_filters(matches, entity: entity, status: status)

  # Ordered by valid_at descending, then capped at `limit`.
  filtered.order(valid_at: :desc).limit(limit)
end
|
|
118
276
|
|
|
277
|
+
# Searches facts using semantic similarity (vector search)
|
|
278
|
+
#
|
|
279
|
+
# Requires an embedding generator to be configured.
|
|
280
|
+
#
|
|
281
|
+
# @param query [String] the search query
|
|
282
|
+
# @param entity [Integer, nil] entity ID to filter by
|
|
283
|
+
# @param at [Date, Time, nil] point in time for temporal filtering
|
|
284
|
+
# @param limit [Integer] maximum number of results
|
|
285
|
+
# @return [ActiveRecord::Relation] semantically similar facts
|
|
286
|
+
#
|
|
287
|
+
# @example Find semantically similar facts
|
|
288
|
+
# service.semantic_search("Who manages the sales team?", limit: 5)
|
|
119
289
|
def semantic_search(query, entity: nil, at: nil, limit: 20)
|
|
120
290
|
embedding = generate_embedding(query)
|
|
121
291
|
return Models::Fact.none unless embedding
|
|
@@ -127,29 +297,58 @@ module FactDb
|
|
|
127
297
|
scope.limit(limit)
|
|
128
298
|
end
|
|
129
299
|
|
|
300
|
+
# Finds conflicting facts for an entity or topic
|
|
301
|
+
#
|
|
302
|
+
# @param entity_id [Integer, nil] entity ID to check
|
|
303
|
+
# @param topic [String, nil] topic to check
|
|
304
|
+
# @return [Array<Hash>] array of conflict descriptions
|
|
130
305
|
def find_conflicts(entity_id: nil, topic: nil)
  # The fact resolver does the work; both filters are forwarded unchanged.
  criteria = { entity_id: entity_id, topic: topic }
  @resolver.find_conflicts(**criteria)
end
|
|
133
308
|
|
|
309
|
+
# Resolves a conflict by keeping one fact and superseding others
|
|
310
|
+
#
|
|
311
|
+
# @param keep_fact_id [Integer] ID of the fact to keep
|
|
312
|
+
# @param supersede_fact_ids [Array<Integer>] IDs of facts to supersede
|
|
313
|
+
# @param reason [String, nil] reason for the resolution
|
|
314
|
+
# @return [FactDb::Models::Fact] the kept fact
|
|
134
315
|
def resolve_conflict(keep_fact_id, supersede_fact_ids, reason: nil)
  # The fact resolver does the work; arguments are forwarded unchanged.
  options = { reason: reason }
  @resolver.resolve_conflict(keep_fact_id, supersede_fact_ids, **options)
end
|
|
137
318
|
|
|
319
|
+
# Builds a timeline fact summarizing an entity's history
|
|
320
|
+
#
|
|
321
|
+
# @param entity_id [Integer] the entity ID
|
|
322
|
+
# @param topic [String, nil] optional topic filter
|
|
323
|
+
# @return [Hash] timeline summary data
|
|
138
324
|
def build_timeline_fact(entity_id:, topic: nil)
  # The fact resolver does the work; arguments are forwarded unchanged.
  options = { entity_id: entity_id, topic: topic }
  @resolver.build_timeline_fact(**options)
end
|
|
141
327
|
|
|
328
|
+
# Returns recently created facts
|
|
329
|
+
#
|
|
330
|
+
# @param limit [Integer] maximum number of results
|
|
331
|
+
# @param status [Symbol] fact status filter
|
|
332
|
+
# @return [ActiveRecord::Relation] recent facts ordered by creation date
|
|
142
333
|
def recent(limit: 10, status: :canonical)
  # Status is compared as a string; newest records (by created_at) come first.
  Models::Fact
    .where(status: status.to_s)
    .order(created_at: :desc)
    .limit(limit)
end
|
|
146
337
|
|
|
338
|
+
# Returns facts by extraction method
|
|
339
|
+
#
|
|
340
|
+
# @param method [Symbol, String] extraction method (:manual, :llm, :rule_based)
|
|
341
|
+
# @param limit [Integer, nil] maximum number of results
|
|
342
|
+
# @return [ActiveRecord::Relation] facts extracted by the given method
|
|
147
343
|
def by_extraction_method(method, limit: nil)
  # The method name is compared as a string; newest records come first.
  newest_first = Models::Fact.extracted_by(method.to_s).order(created_at: :desc)

  # Without a limit the whole relation is returned.
  limit ? newest_first.limit(limit) : newest_first
end
|
|
152
348
|
|
|
349
|
+
# Returns aggregate statistics about all facts
|
|
350
|
+
#
|
|
351
|
+
# @return [Hash] statistics including counts by status and extraction method
|
|
153
352
|
def stats
|
|
154
353
|
{
|
|
155
354
|
total: Models::Fact.count,
|
|
@@ -162,16 +361,54 @@ module FactDb
|
|
|
162
361
|
}
|
|
163
362
|
end
|
|
164
363
|
|
|
364
|
+
# Returns fact statistics for an entity (or all facts)
|
|
365
|
+
#
|
|
366
|
+
# @param entity_id [Integer, nil] Entity ID (nil for all facts)
|
|
367
|
+
# @return [Hash] Statistics by fact status
|
|
368
|
+
def fact_stats(entity_id = nil)
  # Restrict to facts mentioning the entity when an ID is given, else use all facts.
  facts = if entity_id
            Models::Fact.mentioning_entity(entity_id)
          else
            Models::Fact.all
          end

  count_with_status = ->(status_name) { facts.where(status: status_name).count }

  # "Corroborated" means corroborated_by_ids is neither NULL nor an empty array;
  # it is counted independently of status.
  corroborated = facts.where.not(corroborated_by_ids: nil).where.not(corroborated_by_ids: [])

  {
    canonical: count_with_status.call("canonical"),
    superseded: count_with_status.call("superseded"),
    corroborated: corroborated.count,
    synthesized: count_with_status.call("synthesized")
  }
end
|
|
378
|
+
|
|
165
379
|
private
|
|
166
380
|
|
|
167
381
|
def resolve_or_create_entity(mention)
  # A mention that already carries an entity_id bypasses name resolution.
  known_id = mention[:entity_id]
  if known_id
    found = Models::Entity.find(known_id)
    # Aliases supplied with the mention are still passed on for this entity.
    add_aliases_to_entity(found, mention[:aliases])
    return found
  end

  alias_list = mention[:aliases] || []

  # The name falls back to :text; the kind falls back to :concept.
  resolved = @entity_service.resolve_or_create(
    mention[:name] || mention[:text],
    kind: mention[:kind]&.to_sym || :concept,
    aliases: alias_list
  )

  # The entity may have been found rather than created, so the aliases are
  # offered again here.
  add_aliases_to_entity(resolved, alias_list) unless alias_list.none?

  resolved
end
|
|
401
|
+
|
|
402
|
+
def add_aliases_to_entity(entity, aliases)
  return unless aliases&.any?

  # Lower-cased names the entity already answers to (its own name plus its
  # aliases). Built lazily so nothing is read from the entity when every
  # candidate turns out to be blank.
  known_names = nil

  aliases.each do |alias_text|
    # Normalise once: nil becomes "", surrounding whitespace is dropped.
    candidate = alias_text.to_s.strip
    next if candidate.empty?

    known_names ||= [entity.name, *entity.all_aliases].map(&:downcase)
    folded = candidate.downcase
    next if known_names.include?(folded)

    entity.add_alias(candidate)
    # Remember the addition locally so a later candidate differing only in
    # case or whitespace is skipped, without relying on entity.all_aliases
    # reflecting aliases added during this call.
    known_names << folded
  end
end
|
|
176
413
|
|
|
177
414
|
def apply_filters(scope, entity: nil, status: nil)
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FactDb
|
|
4
|
+
module Services
|
|
5
|
+
# Service class for managing source documents in the database
|
|
6
|
+
#
|
|
7
|
+
# Provides methods for creating, searching, and retrieving source documents
|
|
8
|
+
# which are the original content from which facts are extracted.
|
|
9
|
+
#
|
|
10
|
+
# @example Basic usage
|
|
11
|
+
# service = SourceService.new
|
|
12
|
+
# source = service.create("Meeting notes...", kind: :document)
|
|
13
|
+
#
|
|
14
|
+
class SourceService
  # @return [FactDb::Config] the configuration this service was built with
  attr_reader :config

  # Builds a service bound to a configuration.
  #
  # @param config [FactDb::Config] configuration object (defaults to FactDb.config)
  def initialize(config = FactDb.config)
    @config = config
  end

  # Stores a source document, or returns the already-stored source whose
  # SHA256 content hash matches.
  #
  # When a match exists it is returned as-is: the kind, title, metadata and
  # other arguments of this call are not applied to it.
  #
  # @param content [String] the source content text
  # @param kind [Symbol, String] source kind (:document, :email, :transcript, etc.); stored as a string
  # @param captured_at [Time] when the source was captured (defaults to now)
  # @param metadata [Hash] additional metadata
  # @param title [String, nil] optional title
  # @param source_uri [String, nil] optional URI of the original source
  # @return [FactDb::Models::Source] the created or existing source
  #
  # @example Create a source with metadata
  #   service.create("Email content...",
  #     kind: :email,
  #     captured_at: Time.parse("2024-01-15"),
  #     metadata: { from: "john@example.com" })
  def create(content, kind:, captured_at: Time.current, metadata: {}, title: nil, source_uri: nil)
    content_hash = Digest::SHA256.hexdigest(content)

    # De-duplicate before doing any embedding work.
    duplicate = Models::Source.find_by(content_hash: content_hash)
    return duplicate if duplicate

    Models::Source.create!(
      content: content,
      content_hash: content_hash,
      kind: kind.to_s,
      title: title,
      source_uri: source_uri,
      metadata: metadata,
      captured_at: captured_at,
      # nil when no generator is configured or the generator raised.
      embedding: generate_embedding(content)
    )
  end

  # Looks a source up by primary key.
  #
  # @param id [Integer] the source ID
  # @return [FactDb::Models::Source] the found source
  # @raise [ActiveRecord::RecordNotFound] if source not found
  def find(id)
    Models::Source.find(id)
  end

  # Looks a source up by its content hash.
  #
  # @param hash [String] the SHA256 content hash
  # @return [FactDb::Models::Source, nil] the found source or nil
  def find_by_hash(hash)
    Models::Source.find_by(content_hash: hash)
  end

  # Full-text search over sources with optional filters.
  #
  # Each filter is applied only when its argument is truthy. Results are
  # ordered by captured_at, newest first.
  #
  # @param query [String] the search query
  # @param kind [Symbol, String, nil] optional kind filter
  # @param from [Date, Time, nil] captured after this date
  # @param to [Date, Time, nil] captured before this date
  # @param limit [Integer] maximum number of results
  # @return [ActiveRecord::Relation] matching sources
  def search(query, kind: nil, from: nil, to: nil, limit: 20)
    scope = Models::Source.search_text(query)
    scope = scope.by_kind(kind) if kind
    scope = scope.captured_after(from) if from
    scope = scope.captured_before(to) if to
    scope.order(captured_at: :desc).limit(limit)
  end

  # Vector-similarity search over sources.
  #
  # Returns an empty relation when no embedding could be produced for the
  # query (no generator configured, or the generator raised).
  #
  # @param query [String] the search query
  # @param limit [Integer] maximum number of results
  # @return [ActiveRecord::Relation] semantically similar sources
  def semantic_search(query, limit: 20)
    embedding = generate_embedding(query)
    return Models::Source.none unless embedding

    Models::Source.nearest_neighbors(embedding, limit: limit)
  end

  # Sources of one kind, newest capture first.
  #
  # @param kind [Symbol, String] the source kind
  # @param limit [Integer, nil] maximum number of results (nil for no cap)
  # @return [ActiveRecord::Relation] sources of that kind
  def by_kind(kind, limit: nil)
    scope = Models::Source.by_kind(kind).order(captured_at: :desc)
    limit ? scope.limit(limit) : scope
  end

  # Sources captured between two dates, oldest capture first.
  #
  # @param from [Date, Time] start of range
  # @param to [Date, Time] end of range
  # @return [ActiveRecord::Relation] sources in the date range
  def between(from, to)
    Models::Source.captured_between(from, to).order(captured_at: :asc)
  end

  # Most recently captured sources.
  #
  # @param limit [Integer] maximum number of results
  # @return [ActiveRecord::Relation] recent sources ordered by capture date
  def recent(limit: 10)
    Models::Source.order(captured_at: :desc).limit(limit)
  end

  # Aggregate statistics about stored sources.
  #
  # @return [Hash] :total and :total_count (the same row count), :by_kind,
  #   :earliest / :latest captured_at, and :total_words
  def stats
    # Both :total and :total_count report the row count; query it once.
    source_count = Models::Source.count

    {
      total: source_count,
      total_count: source_count,
      by_kind: Models::Source.group(:kind).count,
      earliest: Models::Source.minimum(:captured_at),
      latest: Models::Source.maximum(:captured_at),
      # Word count is computed in SQL by splitting content on whitespace runs.
      total_words: Models::Source.sum("array_length(regexp_split_to_array(content, '\\s+'), 1)")
    }
  end

  private

  # Runs the configured embedding generator on +text+.
  #
  # Embedding is best-effort: a missing generator or any StandardError from
  # it yields nil, and the failure is logged as a warning when a logger is
  # configured.
  #
  # @param text [String] text to embed
  # @return [Object, nil] whatever the generator returns, or nil
  def generate_embedding(text)
    generator = config.embedding_generator
    return nil unless generator

    generator.call(text)
  rescue StandardError => e
    config.logger&.warn("Failed to generate embedding: #{e.message}")
    nil
  end
end
|
|
163
|
+
end
|
|
164
|
+
end
|