fact_db 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. checksums.yaml +4 -4
  2. data/.envrc +2 -0
  3. data/.yardopts +5 -0
  4. data/CHANGELOG.md +64 -0
  5. data/README.md +107 -6
  6. data/Rakefile +243 -10
  7. data/db/migrate/001_enable_extensions.rb +1 -0
  8. data/db/migrate/002_create_sources.rb +49 -0
  9. data/db/migrate/003_create_entities.rb +27 -15
  10. data/db/migrate/004_create_entity_aliases.rb +20 -7
  11. data/db/migrate/005_create_facts.rb +37 -21
  12. data/db/migrate/006_create_entity_mentions.rb +14 -6
  13. data/db/migrate/007_create_fact_sources.rb +16 -8
  14. data/docs/api/extractors/index.md +5 -5
  15. data/docs/api/extractors/llm.md +17 -17
  16. data/docs/api/extractors/rule-based.md +14 -14
  17. data/docs/api/facts.md +20 -20
  18. data/docs/api/index.md +4 -4
  19. data/docs/api/models/entity.md +21 -21
  20. data/docs/api/models/fact.md +15 -15
  21. data/docs/api/models/index.md +7 -7
  22. data/docs/api/models/{content.md → source.md} +29 -29
  23. data/docs/api/pipeline/extraction.md +25 -25
  24. data/docs/api/pipeline/index.md +1 -1
  25. data/docs/api/pipeline/resolution.md +4 -4
  26. data/docs/api/services/entity-service.md +20 -20
  27. data/docs/api/services/fact-service.md +12 -12
  28. data/docs/api/services/index.md +5 -5
  29. data/docs/api/services/{content-service.md → source-service.md} +27 -27
  30. data/docs/architecture/database-schema.md +46 -46
  31. data/docs/architecture/entity-resolution.md +6 -6
  32. data/docs/architecture/index.md +10 -10
  33. data/docs/architecture/temporal-facts.md +5 -5
  34. data/docs/architecture/three-layer-model.md +17 -17
  35. data/docs/concepts.md +6 -6
  36. data/docs/examples/basic-usage.md +20 -20
  37. data/docs/examples/hr-onboarding.md +17 -17
  38. data/docs/examples/index.md +4 -4
  39. data/docs/examples/news-analysis.md +23 -23
  40. data/docs/getting-started/database-setup.md +28 -20
  41. data/docs/getting-started/index.md +3 -3
  42. data/docs/getting-started/quick-start.md +33 -30
  43. data/docs/guides/batch-processing.md +26 -26
  44. data/docs/guides/configuration.md +158 -77
  45. data/docs/guides/entity-management.md +14 -14
  46. data/docs/guides/extracting-facts.md +28 -28
  47. data/docs/guides/ingesting-content.md +14 -14
  48. data/docs/guides/llm-integration.md +40 -32
  49. data/docs/guides/temporal-queries.md +11 -11
  50. data/docs/index.md +6 -2
  51. data/examples/.envrc +4 -0
  52. data/examples/.gitignore +1 -0
  53. data/examples/001_configuration.rb +312 -0
  54. data/examples/{basic_usage.rb → 010_basic_usage.rb} +47 -56
  55. data/examples/{entity_management.rb → 020_entity_management.rb} +57 -72
  56. data/examples/{temporal_queries.rb → 030_temporal_queries.rb} +39 -59
  57. data/examples/040_output_formats.rb +177 -0
  58. data/examples/{rule_based_extraction.rb → 050_rule_based_extraction.rb} +39 -45
  59. data/examples/060_fluent_temporal_api.rb +217 -0
  60. data/examples/070_introspection.rb +252 -0
  61. data/examples/{hr_system.rb → 080_hr_system.rb} +56 -75
  62. data/examples/090_ingest_demo.rb +515 -0
  63. data/examples/100_query_context.rb +668 -0
  64. data/examples/110_prove_it.rb +204 -0
  65. data/examples/120_dump_database.rb +358 -0
  66. data/examples/130_rag_feedback_loop.rb +858 -0
  67. data/examples/README.md +229 -15
  68. data/examples/data/lincoln_associates.md +201 -0
  69. data/examples/data/lincoln_biography.md +66 -0
  70. data/examples/data/lincoln_cabinet.md +243 -0
  71. data/examples/data/lincoln_family.md +163 -0
  72. data/examples/data/lincoln_military.md +241 -0
  73. data/examples/data/lincoln_todd_family.md +136 -0
  74. data/examples/ingest_reporter.rb +335 -0
  75. data/examples/utilities.rb +182 -0
  76. data/lib/fact_db/config/defaults.yml +254 -0
  77. data/lib/fact_db/config.rb +94 -35
  78. data/lib/fact_db/database.rb +98 -8
  79. data/lib/fact_db/extractors/base.rb +106 -21
  80. data/lib/fact_db/extractors/llm_extractor.rb +35 -63
  81. data/lib/fact_db/extractors/manual_extractor.rb +46 -6
  82. data/lib/fact_db/extractors/rule_based_extractor.rb +136 -25
  83. data/lib/fact_db/llm/adapter.rb +3 -3
  84. data/lib/fact_db/models/entity.rb +94 -22
  85. data/lib/fact_db/models/entity_alias.rb +41 -7
  86. data/lib/fact_db/models/entity_mention.rb +34 -1
  87. data/lib/fact_db/models/fact.rb +259 -28
  88. data/lib/fact_db/models/fact_source.rb +43 -9
  89. data/lib/fact_db/models/source.rb +113 -0
  90. data/lib/fact_db/pipeline/extraction_pipeline.rb +35 -35
  91. data/lib/fact_db/pipeline/resolution_pipeline.rb +5 -5
  92. data/lib/fact_db/query_result.rb +202 -0
  93. data/lib/fact_db/resolution/entity_resolver.rb +139 -39
  94. data/lib/fact_db/resolution/fact_resolver.rb +86 -14
  95. data/lib/fact_db/services/entity_service.rb +246 -37
  96. data/lib/fact_db/services/fact_service.rb +254 -17
  97. data/lib/fact_db/services/source_service.rb +164 -0
  98. data/lib/fact_db/temporal/query.rb +71 -7
  99. data/lib/fact_db/temporal/query_builder.rb +69 -0
  100. data/lib/fact_db/temporal/timeline.rb +102 -11
  101. data/lib/fact_db/transformers/base.rb +77 -0
  102. data/lib/fact_db/transformers/cypher_transformer.rb +185 -0
  103. data/lib/fact_db/transformers/json_transformer.rb +17 -0
  104. data/lib/fact_db/transformers/raw_transformer.rb +35 -0
  105. data/lib/fact_db/transformers/text_transformer.rb +114 -0
  106. data/lib/fact_db/transformers/triple_transformer.rb +138 -0
  107. data/lib/fact_db/validation/alias_filter.rb +185 -0
  108. data/lib/fact_db/version.rb +1 -1
  109. data/lib/fact_db.rb +281 -30
  110. data/mkdocs.yml +2 -2
  111. metadata +60 -16
  112. data/db/migrate/002_create_contents.rb +0 -44
  113. data/lib/fact_db/models/content.rb +0 -62
  114. data/lib/fact_db/services/content_service.rb +0 -93
@@ -2,20 +2,61 @@
2
2
 
3
3
  module FactDb
4
4
  module Services
5
+ # Service class for managing facts in the database
6
+ #
7
+ # Provides methods for creating, querying, and manipulating facts
8
+ # including temporal queries, semantic search, and conflict resolution.
9
+ #
10
+ # @example Basic usage
11
+ # service = FactService.new
12
+ # fact = service.create("John works at Acme", valid_at: Date.today)
13
+ #
5
14
  class FactService
6
- attr_reader :config, :resolver, :entity_service
15
+ # @return [FactDb::Config] the configuration object
16
+ attr_reader :config
7
17
 
18
+ # @return [FactDb::Resolution::FactResolver] the fact resolver instance
19
+ attr_reader :resolver
20
+
21
+ # @return [FactDb::Services::EntityService] the entity service instance
22
+ attr_reader :entity_service
23
+
24
+ # Initializes a new FactService instance
25
+ #
26
+ # @param config [FactDb::Config] configuration object (defaults to FactDb.config)
8
27
  def initialize(config = FactDb.config)
9
28
  @config = config
10
29
  @resolver = Resolution::FactResolver.new(config)
11
30
  @entity_service = EntityService.new(config)
12
31
  end
13
32
 
14
- def create(text, valid_at:, invalid_at: nil, status: :canonical, source_content_id: nil, mentions: [], extraction_method: :manual, confidence: 1.0, metadata: {})
33
+ # Creates a new fact in the database
34
+ #
35
+ # @param text [String] the fact text content
36
+ # @param valid_at [Date, Time] when the fact became valid
37
+ # @param invalid_at [Date, Time, nil] when the fact became invalid (nil if still valid)
38
+ # @param status [Symbol] fact status (:canonical, :superseded, :synthesized)
39
+ # @param source_id [Integer, nil] ID of the source document
40
+ # @param mentions [Array<Hash>] entity mentions with :name, :kind, :role, :confidence keys
41
+ # @param extraction_method [Symbol] how the fact was extracted (:manual, :llm, :rule_based)
42
+ # @param confidence [Float] confidence score from 0.0 to 1.0
43
+ # @param metadata [Hash] additional metadata for the fact
44
+ # @return [FactDb::Models::Fact] the created fact
45
+ #
46
+ # @example Create a fact with mentions
47
+ # service.create(
48
+ # "John works at Acme Corp",
49
+ # valid_at: Date.parse("2024-01-15"),
50
+ # mentions: [
51
+ # { name: "John", kind: :person, role: :subject },
52
+ # { name: "Acme Corp", kind: :organization, role: :object }
53
+ # ]
54
+ # )
55
+ def create(text, valid_at:, invalid_at: nil, status: :canonical, source_id: nil, mentions: [], extraction_method: :manual, confidence: 1.0, metadata: {})
15
56
  embedding = generate_embedding(text)
16
57
 
17
58
  fact = Models::Fact.create!(
18
- fact_text: text,
59
+ text: text,
19
60
  valid_at: valid_at,
20
61
  invalid_at: invalid_at,
21
62
  status: status.to_s,
@@ -25,10 +66,10 @@ module FactDb
25
66
  embedding: embedding
26
67
  )
27
68
 
28
- # Link to source content
29
- if source_content_id
30
- content = Models::Content.find(source_content_id)
31
- fact.add_source(content: content, type: "primary")
69
+ # Link to source
70
+ if source_id
71
+ source = Models::Source.find(source_id)
72
+ fact.add_source(source: source, kind: "primary")
32
73
  end
33
74
 
34
75
  # Add entity mentions
@@ -45,17 +86,65 @@ module FactDb
45
86
  fact
46
87
  end
47
88
 
89
+ # Finds an existing fact or creates a new one
90
+ #
91
+ # Looks up an existing fact by the SHA256 digest of the text together with a matching valid_at; creates a new fact only when no match is found.
92
+ #
93
+ # @param text [String] the fact text content
94
+ # @param valid_at [Date, Time] when the fact became valid
95
+ # @param invalid_at [Date, Time, nil] when the fact became invalid
96
+ # @param status [Symbol] fact status
97
+ # @param source_id [Integer, nil] ID of the source document
98
+ # @param mentions [Array<Hash>] entity mentions
99
+ # @param extraction_method [Symbol] extraction method used
100
+ # @param confidence [Float] confidence score
101
+ # @param metadata [Hash] additional metadata
102
+ # @return [FactDb::Models::Fact] the found or created fact
103
+ def find_or_create(text, valid_at:, invalid_at: nil, status: :canonical, source_id: nil, mentions: [], extraction_method: :manual, confidence: 1.0, metadata: {})
104
+ digest = Digest::SHA256.hexdigest(text)
105
+ existing = Models::Fact.find_by(digest: digest, valid_at: valid_at)
106
+
107
+ return existing if existing
108
+
109
+ create(
110
+ text,
111
+ valid_at: valid_at,
112
+ invalid_at: invalid_at,
113
+ status: status,
114
+ source_id: source_id,
115
+ mentions: mentions,
116
+ extraction_method: extraction_method,
117
+ confidence: confidence,
118
+ metadata: metadata
119
+ )
120
+ end
121
+
122
+ # Finds a fact by ID
123
+ #
124
+ # @param id [Integer] the fact ID
125
+ # @return [FactDb::Models::Fact] the found fact
126
+ # @raise [ActiveRecord::RecordNotFound] if fact not found
48
127
  def find(id)
49
128
  Models::Fact.find(id)
50
129
  end
51
130
 
52
- def extract_from_content(content_id, extractor: config.default_extractor)
53
- content = Models::Content.find(content_id)
131
+ # Extracts facts from a source document
132
+ #
133
+ # Uses the configured extractor to parse the source content and create facts.
134
+ #
135
+ # @param source_id [Integer] ID of the source to extract from
136
+ # @param extractor [Symbol] extractor type (:manual, :llm, :rule_based)
137
+ # @return [Array<FactDb::Models::Fact>] array of created facts
138
+ #
139
+ # @example Extract facts using LLM
140
+ # facts = service.extract_from_source(source.id, extractor: :llm)
141
+ def extract_from_source(source_id, extractor: config.default_extractor)
142
+ source = Models::Source.find(source_id)
54
143
  extractor_instance = Extractors::Base.for(extractor, config)
55
144
 
56
145
  extracted = extractor_instance.extract(
57
- content.raw_text,
58
- { captured_at: content.captured_at }
146
+ source.content,
147
+ { captured_at: source.captured_at }
59
148
  )
60
149
 
61
150
  extracted.map do |fact_data|
@@ -63,7 +152,7 @@ module FactDb
63
152
  fact_data[:text],
64
153
  valid_at: fact_data[:valid_at],
65
154
  invalid_at: fact_data[:invalid_at],
66
- source_content_id: content_id,
155
+ source_id: source_id,
67
156
  mentions: fact_data[:mentions],
68
157
  extraction_method: fact_data[:extraction_method] || extractor,
69
158
  confidence: fact_data[:confidence] || 1.0,
@@ -72,6 +161,20 @@ module FactDb
72
161
  end
73
162
  end
74
163
 
164
+ # Alias for backward compatibility
165
+ alias extract_from_content extract_from_source
166
+
167
+ # Queries facts with filtering options
168
+ #
169
+ # @param topic [String, nil] topic to search for in fact text
170
+ # @param at [Date, Time, nil] point in time for temporal query
171
+ # @param entity [Integer, nil] entity ID to filter by
172
+ # @param status [Symbol] fact status filter (:canonical, :superseded, :all)
173
+ # @param limit [Integer, nil] maximum number of results
174
+ # @return [ActiveRecord::Relation] matching facts
175
+ #
176
+ # @example Query facts about a topic at a specific date
177
+ # service.query(topic: "employment", at: Date.parse("2024-01-15"))
75
178
  def query(topic: nil, at: nil, entity: nil, status: :canonical, limit: nil)
76
179
  Temporal::Query.new.execute(
77
180
  topic: topic,
@@ -82,40 +185,107 @@ module FactDb
82
185
  )
83
186
  end
84
187
 
188
+ # Returns currently valid facts
189
+ #
190
+ # @param entity [Integer, nil] entity ID to filter by
191
+ # @param topic [String, nil] topic to search for
192
+ # @param limit [Integer, nil] maximum number of results
193
+ # @return [ActiveRecord::Relation] currently valid canonical facts
85
194
  def current_facts(entity: nil, topic: nil, limit: nil)
86
195
  query(topic: topic, entity: entity, at: nil, status: :canonical, limit: limit)
87
196
  end
88
197
 
198
+ # Returns facts valid at a specific date
199
+ #
200
+ # @param date [Date, Time] the point in time
201
+ # @param entity [Integer, nil] entity ID to filter by
202
+ # @param topic [String, nil] topic to search for
203
+ # @return [ActiveRecord::Relation] facts valid at the given date
89
204
  def facts_at(date, entity: nil, topic: nil)
90
205
  query(topic: topic, entity: entity, at: date, status: :canonical)
91
206
  end
92
207
 
208
+ # Builds a timeline of facts for an entity
209
+ #
210
+ # @param entity_id [Integer] the entity ID
211
+ # @param from [Date, Time, nil] start of timeline range
212
+ # @param to [Date, Time, nil] end of timeline range
213
+ # @return [FactDb::Temporal::Timeline] timeline of facts
214
+ #
215
+ # @example Get timeline for past year
216
+ # service.timeline(entity_id: 1, from: 1.year.ago, to: Date.today)
93
217
  def timeline(entity_id:, from: nil, to: nil)
94
218
  Temporal::Timeline.new.build(entity_id: entity_id, from: from, to: to)
95
219
  end
96
220
 
97
- def supersede(old_fact_id, new_fact_text, valid_at:, mentions: [])
98
- @resolver.supersede(old_fact_id, new_fact_text, valid_at: valid_at, mentions: mentions)
221
+ # Supersedes an old fact with new information
222
+ #
223
+ # Marks the old fact as superseded and creates a new canonical fact.
224
+ #
225
+ # @param old_fact_id [Integer] ID of the fact to supersede
226
+ # @param new_text [String] the updated fact text
227
+ # @param valid_at [Date, Time] when the new fact became valid
228
+ # @param mentions [Array<Hash>] entity mentions for the new fact
229
+ # @return [FactDb::Models::Fact] the new fact
230
+ def supersede(old_fact_id, new_text, valid_at:, mentions: [])
231
+ @resolver.supersede(old_fact_id, new_text, valid_at: valid_at, mentions: mentions)
99
232
  end
100
233
 
234
+ # Synthesizes multiple facts into a single summary fact
235
+ #
236
+ # @param source_fact_ids [Array<Integer>] IDs of facts to synthesize
237
+ # @param synthesized_text [String] the synthesized summary text
238
+ # @param valid_at [Date, Time] when the synthesis is valid from
239
+ # @param invalid_at [Date, Time, nil] when the synthesis becomes invalid
240
+ # @param mentions [Array<Hash>] entity mentions for the synthesized fact
241
+ # @return [FactDb::Models::Fact] the synthesized fact
101
242
  def synthesize(source_fact_ids, synthesized_text, valid_at:, invalid_at: nil, mentions: [])
102
243
  @resolver.synthesize(source_fact_ids, synthesized_text, valid_at: valid_at, invalid_at: invalid_at, mentions: mentions)
103
244
  end
104
245
 
246
+ # Invalidates a fact at a specific time
247
+ #
248
+ # @param fact_id [Integer] ID of the fact to invalidate
249
+ # @param at [Time] when the fact became invalid (defaults to now)
250
+ # @return [FactDb::Models::Fact] the invalidated fact
105
251
  def invalidate(fact_id, at: Time.current)
106
252
  @resolver.invalidate(fact_id, at: at)
107
253
  end
108
254
 
255
+ # Links a corroborating fact to support another fact
256
+ #
257
+ # @param fact_id [Integer] ID of the fact being corroborated
258
+ # @param corroborating_fact_id [Integer] ID of the supporting fact
259
+ # @return [FactDb::Models::Fact] the updated fact
109
260
  def corroborate(fact_id, corroborating_fact_id)
110
261
  @resolver.corroborate(fact_id, corroborating_fact_id)
111
262
  end
112
263
 
264
+ # Searches facts using full-text search
265
+ #
266
+ # @param query [String] the search query
267
+ # @param entity [Integer, nil] entity ID to filter by
268
+ # @param status [Symbol] fact status filter
269
+ # @param limit [Integer] maximum number of results
270
+ # @return [ActiveRecord::Relation] matching facts
113
271
  def search(query, entity: nil, status: :canonical, limit: 20)
114
272
  scope = Models::Fact.search_text(query)
115
273
  scope = apply_filters(scope, entity: entity, status: status)
116
274
  scope.order(valid_at: :desc).limit(limit)
117
275
  end
118
276
 
277
+ # Searches facts using semantic similarity (vector search)
278
+ #
279
+ # Requires an embedding generator to be configured.
280
+ #
281
+ # @param query [String] the search query
282
+ # @param entity [Integer, nil] entity ID to filter by
283
+ # @param at [Date, Time, nil] point in time for temporal filtering
284
+ # @param limit [Integer] maximum number of results
285
+ # @return [ActiveRecord::Relation] semantically similar facts
286
+ #
287
+ # @example Find semantically similar facts
288
+ # service.semantic_search("Who manages the sales team?", limit: 5)
119
289
  def semantic_search(query, entity: nil, at: nil, limit: 20)
120
290
  embedding = generate_embedding(query)
121
291
  return Models::Fact.none unless embedding
@@ -127,29 +297,58 @@ module FactDb
127
297
  scope.limit(limit)
128
298
  end
129
299
 
300
+ # Finds conflicting facts for an entity or topic
301
+ #
302
+ # @param entity_id [Integer, nil] entity ID to check
303
+ # @param topic [String, nil] topic to check
304
+ # @return [Array<Hash>] array of conflict descriptions
130
305
  def find_conflicts(entity_id: nil, topic: nil)
131
306
  @resolver.find_conflicts(entity_id: entity_id, topic: topic)
132
307
  end
133
308
 
309
+ # Resolves a conflict by keeping one fact and superseding others
310
+ #
311
+ # @param keep_fact_id [Integer] ID of the fact to keep
312
+ # @param supersede_fact_ids [Array<Integer>] IDs of facts to supersede
313
+ # @param reason [String, nil] reason for the resolution
314
+ # @return [FactDb::Models::Fact] the kept fact
134
315
  def resolve_conflict(keep_fact_id, supersede_fact_ids, reason: nil)
135
316
  @resolver.resolve_conflict(keep_fact_id, supersede_fact_ids, reason: reason)
136
317
  end
137
318
 
319
+ # Builds a timeline fact summarizing an entity's history
320
+ #
321
+ # @param entity_id [Integer] the entity ID
322
+ # @param topic [String, nil] optional topic filter
323
+ # @return [Hash] timeline summary data
138
324
  def build_timeline_fact(entity_id:, topic: nil)
139
325
  @resolver.build_timeline_fact(entity_id: entity_id, topic: topic)
140
326
  end
141
327
 
328
+ # Returns recently created facts
329
+ #
330
+ # @param limit [Integer] maximum number of results
331
+ # @param status [Symbol] fact status filter
332
+ # @return [ActiveRecord::Relation] recent facts ordered by creation date
142
333
  def recent(limit: 10, status: :canonical)
143
334
  scope = Models::Fact.where(status: status.to_s).order(created_at: :desc)
144
335
  scope.limit(limit)
145
336
  end
146
337
 
338
+ # Returns facts by extraction method
339
+ #
340
+ # @param method [Symbol, String] extraction method (:manual, :llm, :rule_based)
341
+ # @param limit [Integer, nil] maximum number of results
342
+ # @return [ActiveRecord::Relation] facts extracted by the given method
147
343
  def by_extraction_method(method, limit: nil)
148
344
  scope = Models::Fact.extracted_by(method.to_s).order(created_at: :desc)
149
345
  scope = scope.limit(limit) if limit
150
346
  scope
151
347
  end
152
348
 
349
+ # Returns aggregate statistics about all facts
350
+ #
351
+ # @return [Hash] statistics including counts by status and extraction method
153
352
  def stats
154
353
  {
155
354
  total: Models::Fact.count,
@@ -162,16 +361,54 @@ module FactDb
162
361
  }
163
362
  end
164
363
 
364
+ # Returns fact statistics for an entity (or all facts)
365
+ #
366
+ # @param entity_id [Integer, nil] Entity ID (nil for all facts)
367
+ # @return [Hash] fact counts keyed by :canonical, :superseded, :corroborated, and :synthesized
368
+ def fact_stats(entity_id = nil)
369
+ scope = entity_id ? Models::Fact.mentioning_entity(entity_id) : Models::Fact.all
370
+
371
+ {
372
+ canonical: scope.where(status: "canonical").count,
373
+ superseded: scope.where(status: "superseded").count,
374
+ corroborated: scope.where.not(corroborated_by_ids: nil).where.not(corroborated_by_ids: []).count,
375
+ synthesized: scope.where(status: "synthesized").count
376
+ }
377
+ end
378
+
165
379
  private
166
380
 
167
381
  def resolve_or_create_entity(mention)
168
382
  # If entity_id is already provided, use that entity directly
169
- return Models::Entity.find(mention[:entity_id]) if mention[:entity_id]
383
+ if mention[:entity_id]
384
+ entity = Models::Entity.find(mention[:entity_id])
385
+ # Still add any new aliases even for existing entities
386
+ add_aliases_to_entity(entity, mention[:aliases])
387
+ return entity
388
+ end
170
389
 
171
390
  name = mention[:name] || mention[:text]
172
- type = mention[:type]&.to_sym || :concept
391
+ kind = mention[:kind]&.to_sym || :concept
392
+ aliases = mention[:aliases] || []
393
+
394
+ entity = @entity_service.resolve_or_create(name, kind: kind, aliases: aliases)
395
+
396
+ # Add any supplied aliases the entity does not already have (this matters when the entity was resolved rather than newly created)
397
+ add_aliases_to_entity(entity, aliases) if aliases.any?
173
398
 
174
- @entity_service.resolve_or_create(name, type: type)
399
+ entity
400
+ end
401
+
402
+ def add_aliases_to_entity(entity, aliases)
403
+ return unless aliases&.any?
404
+
405
+ aliases.each do |alias_text|
406
+ next if alias_text.to_s.strip.empty?
407
+ next if entity.name.downcase == alias_text.to_s.strip.downcase
408
+ next if entity.all_aliases.map(&:downcase).include?(alias_text.to_s.strip.downcase)
409
+
410
+ entity.add_alias(alias_text.to_s.strip)
411
+ end
175
412
  end
176
413
 
177
414
  def apply_filters(scope, entity: nil, status: nil)
@@ -0,0 +1,164 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FactDb
4
+ module Services
5
+ # Service class for managing source documents in the database
6
+ #
7
+ # Provides methods for creating, searching, and retrieving source documents
8
+ # which are the original content from which facts are extracted.
9
+ #
10
+ # @example Basic usage
11
+ # service = SourceService.new
12
+ # source = service.create("Meeting notes...", kind: :document)
13
+ #
14
+ class SourceService
15
+ # @return [FactDb::Config] the configuration object
16
+ attr_reader :config
17
+
18
+ # Initializes a new SourceService instance
19
+ #
20
+ # @param config [FactDb::Config] configuration object (defaults to FactDb.config)
21
+ def initialize(config = FactDb.config)
22
+ @config = config
23
+ end
24
+
25
+ # Creates a new source document in the database
26
+ #
27
+ # Automatically deduplicates by content hash - returns existing source if content matches.
28
+ #
29
+ # @param content [String] the source content text
30
+ # @param kind [Symbol, String] source kind (:document, :email, :transcript, etc.)
31
+ # @param captured_at [Time] when the source was captured (defaults to now)
32
+ # @param metadata [Hash] additional metadata
33
+ # @param title [String, nil] optional title
34
+ # @param source_uri [String, nil] optional URI of the original source
35
+ # @return [FactDb::Models::Source] the created or existing source
36
+ #
37
+ # @example Create a source with metadata
38
+ # service.create("Email content...",
39
+ # kind: :email,
40
+ # captured_at: Time.parse("2024-01-15"),
41
+ # metadata: { from: "john@example.com" })
42
+ def create(content, kind:, captured_at: Time.current, metadata: {}, title: nil, source_uri: nil)
43
+ content_hash = Digest::SHA256.hexdigest(content)
44
+
45
+ # Check for duplicate content
46
+ existing = Models::Source.find_by(content_hash: content_hash)
47
+ return existing if existing
48
+
49
+ embedding = generate_embedding(content)
50
+
51
+ Models::Source.create!(
52
+ content: content,
53
+ content_hash: content_hash,
54
+ kind: kind.to_s,
55
+ title: title,
56
+ source_uri: source_uri,
57
+ metadata: metadata,
58
+ captured_at: captured_at,
59
+ embedding: embedding
60
+ )
61
+ end
62
+
63
+ # Finds a source by ID
64
+ #
65
+ # @param id [Integer] the source ID
66
+ # @return [FactDb::Models::Source] the found source
67
+ # @raise [ActiveRecord::RecordNotFound] if source not found
68
+ def find(id)
69
+ Models::Source.find(id)
70
+ end
71
+
72
+ # Finds a source by content hash
73
+ #
74
+ # @param hash [String] the SHA256 content hash
75
+ # @return [FactDb::Models::Source, nil] the found source or nil
76
+ def find_by_hash(hash)
77
+ Models::Source.find_by(content_hash: hash)
78
+ end
79
+
80
+ # Searches sources using full-text search with optional filters
81
+ #
82
+ # @param query [String] the search query
83
+ # @param kind [Symbol, String, nil] optional kind filter
84
+ # @param from [Date, Time, nil] captured after this date
85
+ # @param to [Date, Time, nil] captured before this date
86
+ # @param limit [Integer] maximum number of results
87
+ # @return [ActiveRecord::Relation] matching sources
88
+ def search(query, kind: nil, from: nil, to: nil, limit: 20)
89
+ scope = Models::Source.search_text(query)
90
+ scope = scope.by_kind(kind) if kind
91
+ scope = scope.captured_after(from) if from
92
+ scope = scope.captured_before(to) if to
93
+ scope.order(captured_at: :desc).limit(limit)
94
+ end
95
+
96
+ # Searches sources using semantic similarity (vector search)
97
+ #
98
+ # Requires an embedding generator to be configured.
99
+ #
100
+ # @param query [String] the search query
101
+ # @param limit [Integer] maximum number of results
102
+ # @return [ActiveRecord::Relation] semantically similar sources
103
+ def semantic_search(query, limit: 20)
104
+ embedding = generate_embedding(query)
105
+ return Models::Source.none unless embedding
106
+
107
+ Models::Source.nearest_neighbors(embedding, limit: limit)
108
+ end
109
+
110
+ # Returns sources of a specific kind
111
+ #
112
+ # @param kind [Symbol, String] the source kind
113
+ # @param limit [Integer, nil] maximum number of results
114
+ # @return [ActiveRecord::Relation] sources of that kind
115
+ def by_kind(kind, limit: nil)
116
+ scope = Models::Source.by_kind(kind).order(captured_at: :desc)
117
+ scope = scope.limit(limit) if limit
118
+ scope
119
+ end
120
+
121
+ # Returns sources captured between two dates
122
+ #
123
+ # @param from [Date, Time] start of range
124
+ # @param to [Date, Time] end of range
125
+ # @return [ActiveRecord::Relation] sources in the date range
126
+ def between(from, to)
127
+ Models::Source.captured_between(from, to).order(captured_at: :asc)
128
+ end
129
+
130
+ # Returns recently captured sources
131
+ #
132
+ # @param limit [Integer] maximum number of results
133
+ # @return [ActiveRecord::Relation] recent sources ordered by capture date
134
+ def recent(limit: 10)
135
+ Models::Source.order(captured_at: :desc).limit(limit)
136
+ end
137
+
138
+ # Returns aggregate statistics about sources
139
+ #
140
+ # @return [Hash] total source count (:total and :total_count), counts by kind, earliest/latest captured_at, and total word count
141
+ def stats
142
+ {
143
+ total: Models::Source.count,
144
+ total_count: Models::Source.count,
145
+ by_kind: Models::Source.group(:kind).count,
146
+ earliest: Models::Source.minimum(:captured_at),
147
+ latest: Models::Source.maximum(:captured_at),
148
+ total_words: Models::Source.sum("array_length(regexp_split_to_array(content, '\\s+'), 1)")
149
+ }
150
+ end
151
+
152
+ private
153
+
154
+ def generate_embedding(text)
155
+ return nil unless config.embedding_generator
156
+
157
+ config.embedding_generator.call(text)
158
+ rescue StandardError => e
159
+ config.logger&.warn("Failed to generate embedding: #{e.message}")
160
+ nil
161
+ end
162
+ end
163
+ end
164
+ end