fact_db 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. checksums.yaml +4 -4
  2. data/.envrc +2 -0
  3. data/.yardopts +5 -0
  4. data/CHANGELOG.md +64 -0
  5. data/README.md +107 -6
  6. data/Rakefile +243 -10
  7. data/db/migrate/001_enable_extensions.rb +1 -0
  8. data/db/migrate/002_create_sources.rb +49 -0
  9. data/db/migrate/003_create_entities.rb +27 -15
  10. data/db/migrate/004_create_entity_aliases.rb +20 -7
  11. data/db/migrate/005_create_facts.rb +37 -21
  12. data/db/migrate/006_create_entity_mentions.rb +14 -6
  13. data/db/migrate/007_create_fact_sources.rb +16 -8
  14. data/docs/api/extractors/index.md +5 -5
  15. data/docs/api/extractors/llm.md +17 -17
  16. data/docs/api/extractors/rule-based.md +14 -14
  17. data/docs/api/facts.md +20 -20
  18. data/docs/api/index.md +4 -4
  19. data/docs/api/models/entity.md +21 -21
  20. data/docs/api/models/fact.md +15 -15
  21. data/docs/api/models/index.md +7 -7
  22. data/docs/api/models/{content.md → source.md} +29 -29
  23. data/docs/api/pipeline/extraction.md +25 -25
  24. data/docs/api/pipeline/index.md +1 -1
  25. data/docs/api/pipeline/resolution.md +4 -4
  26. data/docs/api/services/entity-service.md +20 -20
  27. data/docs/api/services/fact-service.md +12 -12
  28. data/docs/api/services/index.md +5 -5
  29. data/docs/api/services/{content-service.md → source-service.md} +27 -27
  30. data/docs/architecture/database-schema.md +46 -46
  31. data/docs/architecture/entity-resolution.md +6 -6
  32. data/docs/architecture/index.md +10 -10
  33. data/docs/architecture/temporal-facts.md +5 -5
  34. data/docs/architecture/three-layer-model.md +17 -17
  35. data/docs/concepts.md +6 -6
  36. data/docs/examples/basic-usage.md +20 -20
  37. data/docs/examples/hr-onboarding.md +17 -17
  38. data/docs/examples/index.md +4 -4
  39. data/docs/examples/news-analysis.md +23 -23
  40. data/docs/getting-started/database-setup.md +28 -20
  41. data/docs/getting-started/index.md +3 -3
  42. data/docs/getting-started/quick-start.md +33 -30
  43. data/docs/guides/batch-processing.md +26 -26
  44. data/docs/guides/configuration.md +158 -77
  45. data/docs/guides/entity-management.md +14 -14
  46. data/docs/guides/extracting-facts.md +28 -28
  47. data/docs/guides/ingesting-content.md +14 -14
  48. data/docs/guides/llm-integration.md +40 -32
  49. data/docs/guides/temporal-queries.md +11 -11
  50. data/docs/index.md +6 -2
  51. data/examples/.envrc +4 -0
  52. data/examples/.gitignore +1 -0
  53. data/examples/001_configuration.rb +312 -0
  54. data/examples/{basic_usage.rb → 010_basic_usage.rb} +47 -56
  55. data/examples/{entity_management.rb → 020_entity_management.rb} +57 -72
  56. data/examples/{temporal_queries.rb → 030_temporal_queries.rb} +39 -59
  57. data/examples/040_output_formats.rb +177 -0
  58. data/examples/{rule_based_extraction.rb → 050_rule_based_extraction.rb} +39 -45
  59. data/examples/060_fluent_temporal_api.rb +217 -0
  60. data/examples/070_introspection.rb +252 -0
  61. data/examples/{hr_system.rb → 080_hr_system.rb} +56 -75
  62. data/examples/090_ingest_demo.rb +515 -0
  63. data/examples/100_query_context.rb +668 -0
  64. data/examples/110_prove_it.rb +204 -0
  65. data/examples/120_dump_database.rb +358 -0
  66. data/examples/130_rag_feedback_loop.rb +858 -0
  67. data/examples/README.md +229 -15
  68. data/examples/data/lincoln_associates.md +201 -0
  69. data/examples/data/lincoln_biography.md +66 -0
  70. data/examples/data/lincoln_cabinet.md +243 -0
  71. data/examples/data/lincoln_family.md +163 -0
  72. data/examples/data/lincoln_military.md +241 -0
  73. data/examples/data/lincoln_todd_family.md +136 -0
  74. data/examples/ingest_reporter.rb +335 -0
  75. data/examples/utilities.rb +182 -0
  76. data/lib/fact_db/config/defaults.yml +254 -0
  77. data/lib/fact_db/config.rb +94 -35
  78. data/lib/fact_db/database.rb +98 -8
  79. data/lib/fact_db/extractors/base.rb +106 -21
  80. data/lib/fact_db/extractors/llm_extractor.rb +35 -63
  81. data/lib/fact_db/extractors/manual_extractor.rb +46 -6
  82. data/lib/fact_db/extractors/rule_based_extractor.rb +136 -25
  83. data/lib/fact_db/llm/adapter.rb +3 -3
  84. data/lib/fact_db/models/entity.rb +94 -22
  85. data/lib/fact_db/models/entity_alias.rb +41 -7
  86. data/lib/fact_db/models/entity_mention.rb +34 -1
  87. data/lib/fact_db/models/fact.rb +259 -28
  88. data/lib/fact_db/models/fact_source.rb +43 -9
  89. data/lib/fact_db/models/source.rb +113 -0
  90. data/lib/fact_db/pipeline/extraction_pipeline.rb +35 -35
  91. data/lib/fact_db/pipeline/resolution_pipeline.rb +5 -5
  92. data/lib/fact_db/query_result.rb +202 -0
  93. data/lib/fact_db/resolution/entity_resolver.rb +139 -39
  94. data/lib/fact_db/resolution/fact_resolver.rb +86 -14
  95. data/lib/fact_db/services/entity_service.rb +246 -37
  96. data/lib/fact_db/services/fact_service.rb +254 -17
  97. data/lib/fact_db/services/source_service.rb +164 -0
  98. data/lib/fact_db/temporal/query.rb +71 -7
  99. data/lib/fact_db/temporal/query_builder.rb +69 -0
  100. data/lib/fact_db/temporal/timeline.rb +102 -11
  101. data/lib/fact_db/transformers/base.rb +77 -0
  102. data/lib/fact_db/transformers/cypher_transformer.rb +185 -0
  103. data/lib/fact_db/transformers/json_transformer.rb +17 -0
  104. data/lib/fact_db/transformers/raw_transformer.rb +35 -0
  105. data/lib/fact_db/transformers/text_transformer.rb +114 -0
  106. data/lib/fact_db/transformers/triple_transformer.rb +138 -0
  107. data/lib/fact_db/validation/alias_filter.rb +185 -0
  108. data/lib/fact_db/version.rb +1 -1
  109. data/lib/fact_db.rb +281 -30
  110. data/mkdocs.yml +2 -2
  111. metadata +60 -16
  112. data/db/migrate/002_create_contents.rb +0 -44
  113. data/lib/fact_db/models/content.rb +0 -62
  114. data/lib/fact_db/services/content_service.rb +0 -93
@@ -2,6 +2,22 @@
2
2
 
3
3
  module FactDb
4
4
  module Models
5
+ # Represents a temporal fact in the database
6
+ #
7
+ # Facts are the core data structure in FactDb, representing statements with
8
+ # temporal validity (valid_at/invalid_at), entity mentions, and source provenance.
9
+ # Facts can be canonical, superseded, or synthesized from other facts.
10
+ #
11
+ # @example Create a fact
12
+ # fact = Fact.create!(
13
+ # text: "John works at Acme Corp",
14
+ # valid_at: Date.parse("2024-01-15"),
15
+ # status: "canonical"
16
+ # )
17
+ #
18
+ # @example Query currently valid facts
19
+ # Fact.canonical.currently_valid
20
+ #
5
21
  class Fact < ActiveRecord::Base
6
22
  self.table_name = "fact_db_facts"
7
23
 
@@ -11,113 +27,209 @@ module FactDb
11
27
 
12
28
  has_many :fact_sources, class_name: "FactDb::Models::FactSource",
13
29
  foreign_key: :fact_id, dependent: :destroy
14
- has_many :source_contents, through: :fact_sources, source: :content
30
+ has_many :sources, through: :fact_sources, source: :source
15
31
 
16
32
  belongs_to :superseded_by, class_name: "FactDb::Models::Fact",
17
33
  foreign_key: :superseded_by_id, optional: true
18
34
  has_many :supersedes, class_name: "FactDb::Models::Fact",
19
35
  foreign_key: :superseded_by_id
20
36
 
21
- validates :fact_text, presence: true
22
- validates :fact_hash, presence: true
37
+ validates :text, presence: true
38
+ validates :digest, presence: true, uniqueness: { scope: :valid_at }
23
39
  validates :valid_at, presence: true
24
40
  validates :status, presence: true
25
41
 
26
- before_validation :generate_fact_hash, on: :create
42
+ before_validation :generate_digest, on: :create
27
43
 
28
- # Fact statuses
44
+ # @return [Array<String>] valid fact statuses
29
45
  STATUSES = %w[canonical superseded corroborated synthesized].freeze
46
+
47
+ # @return [Array<String>] valid extraction methods
30
48
  EXTRACTION_METHODS = %w[manual llm rule_based].freeze
31
49
 
32
50
  validates :status, inclusion: { in: STATUSES }
33
51
  validates :extraction_method, inclusion: { in: EXTRACTION_METHODS }, allow_nil: true
34
52
 
35
- # Core scopes
53
+ # @!group Scopes
54
+
55
+ # @!method canonical
56
+ # Returns facts with canonical status
57
+ # @return [ActiveRecord::Relation]
36
58
  scope :canonical, -> { where(status: "canonical") }
59
+
60
+ # @!method superseded
61
+ # Returns facts that have been superseded
62
+ # @return [ActiveRecord::Relation]
37
63
  scope :superseded, -> { where(status: "superseded") }
64
+
65
+ # @!method synthesized
66
+ # Returns facts that were synthesized from other facts
67
+ # @return [ActiveRecord::Relation]
38
68
  scope :synthesized, -> { where(status: "synthesized") }
39
69
 
40
- # Temporal scopes - the heart of the Event Clock
70
+ # @!method currently_valid
71
+ # Returns facts that are currently valid (no invalid_at date)
72
+ # @return [ActiveRecord::Relation]
41
73
  scope :currently_valid, -> { where(invalid_at: nil) }
74
+
75
+ # @!method historical
76
+ # Returns facts that have been invalidated
77
+ # @return [ActiveRecord::Relation]
42
78
  scope :historical, -> { where.not(invalid_at: nil) }
43
79
 
80
+ # @!method valid_at(date)
81
+ # Returns facts valid at a specific point in time
82
+ # @param date [Date, Time] the point in time
83
+ # @return [ActiveRecord::Relation]
44
84
  scope :valid_at, lambda { |date|
45
85
  where("valid_at <= ?", date)
46
86
  .where("invalid_at > ? OR invalid_at IS NULL", date)
47
87
  }
48
88
 
89
+ # @!method valid_between(from, to)
90
+ # Returns facts valid during a date range
91
+ # @param from [Date, Time] start of range
92
+ # @param to [Date, Time] end of range
93
+ # @return [ActiveRecord::Relation]
49
94
  scope :valid_between, lambda { |from, to|
50
95
  where("valid_at <= ? AND (invalid_at > ? OR invalid_at IS NULL)", to, from)
51
96
  }
52
97
 
98
+ # @!method became_valid_between(from, to)
99
+ # Returns facts that became valid within a date range
100
+ # @param from [Date, Time] start of range
101
+ # @param to [Date, Time] end of range
102
+ # @return [ActiveRecord::Relation]
53
103
  scope :became_valid_between, lambda { |from, to|
54
104
  where(valid_at: from..to)
55
105
  }
56
106
 
107
+ # @!method became_invalid_between(from, to)
108
+ # Returns facts that became invalid within a date range
109
+ # @param from [Date, Time] start of range
110
+ # @param to [Date, Time] end of range
111
+ # @return [ActiveRecord::Relation]
57
112
  scope :became_invalid_between, lambda { |from, to|
58
113
  where(invalid_at: from..to)
59
114
  }
60
115
 
61
- # Entity filtering
116
+ # @!method mentioning_entity(entity_id)
117
+ # Returns facts that mention a specific entity
118
+ # @param entity_id [Integer] the entity ID
119
+ # @return [ActiveRecord::Relation]
62
120
  scope :mentioning_entity, lambda { |entity_id|
63
- joins(:entity_mentions).where(fact_db_entity_mentions: { entity_id: entity_id })
121
+ joins(:entity_mentions).where(fact_db_entity_mentions: { entity_id: entity_id }).distinct
64
122
  }
65
123
 
124
+ # @!method with_role(entity_id, role)
125
+ # Returns facts where an entity has a specific role
126
+ # @param entity_id [Integer] the entity ID
127
+ # @param role [String, Symbol] the mention role (subject, object, etc.)
128
+ # @return [ActiveRecord::Relation]
66
129
  scope :with_role, lambda { |entity_id, role|
67
130
  joins(:entity_mentions).where(
68
131
  fact_db_entity_mentions: { entity_id: entity_id, mention_role: role }
69
- )
132
+ ).distinct
70
133
  }
71
134
 
72
- # Full-text search
135
+ # @!method search_text(query)
136
+ # Full-text search on fact text using PostgreSQL tsvector
137
+ # @param query [String] the search query
138
+ # @return [ActiveRecord::Relation]
73
139
  scope :search_text, lambda { |query|
74
- where("to_tsvector('english', fact_text) @@ plainto_tsquery('english', ?)", query)
140
+ where("to_tsvector('english', text) @@ plainto_tsquery('english', ?)", query)
75
141
  }
76
142
 
77
- # Extraction method
143
+ # @!method extracted_by(method)
144
+ # Returns facts extracted by a specific method
145
+ # @param method [String, Symbol] extraction method (manual, llm, rule_based)
146
+ # @return [ActiveRecord::Relation]
78
147
  scope :extracted_by, ->(method) { where(extraction_method: method) }
148
+
149
+ # @!method by_extraction_method(method)
150
+ # Alias for extracted_by
151
+ # @param method [String, Symbol] extraction method
152
+ # @return [ActiveRecord::Relation]
79
153
  scope :by_extraction_method, ->(method) { where(extraction_method: method) }
80
154
 
81
- # Confidence filtering
155
+ # @!method high_confidence
156
+ # Returns facts with confidence >= 0.9
157
+ # @return [ActiveRecord::Relation]
82
158
  scope :high_confidence, -> { where("confidence >= ?", 0.9) }
159
+
160
+ # @!method low_confidence
161
+ # Returns facts with confidence < 0.5
162
+ # @return [ActiveRecord::Relation]
83
163
  scope :low_confidence, -> { where("confidence < ?", 0.5) }
84
164
 
165
+ # @!endgroup
166
+
167
+ # Checks if the fact is currently valid
168
+ #
169
+ # @return [Boolean] true if the fact has no invalid_at date
85
170
  def currently_valid?
86
171
  invalid_at.nil?
87
172
  end
88
173
 
174
+ # Checks if the fact was valid at a specific date
175
+ #
176
+ # @param date [Date, Time] the point in time to check
177
+ # @return [Boolean] true if the fact was valid at the given date
89
178
  def valid_at?(date)
90
179
  valid_at <= date && (invalid_at.nil? || invalid_at > date)
91
180
  end
92
181
 
182
+ # Returns the duration the fact was valid
183
+ #
184
+ # @return [Numeric, nil] invalid_at minus valid_at as a plain number (seconds for Time values, days for Date values), or nil if still valid
93
185
  def duration
94
186
  return nil if invalid_at.nil?
95
187
 
96
188
  invalid_at - valid_at
97
189
  end
98
190
 
191
+ # Returns the duration in days the fact was valid
192
+ #
193
+ # @return [Integer, nil] number of days or nil if still valid
99
194
  def duration_days
100
195
  return nil if invalid_at.nil?
101
196
 
102
197
  (invalid_at.to_date - valid_at.to_date).to_i
103
198
  end
104
199
 
200
+ # Checks if this fact has been superseded
201
+ #
202
+ # @return [Boolean] true if status is "superseded"
105
203
  def superseded?
106
204
  status == "superseded"
107
205
  end
108
206
 
207
+ # Checks if this fact was synthesized from other facts
208
+ #
209
+ # @return [Boolean] true if status is "synthesized"
109
210
  def synthesized?
110
211
  status == "synthesized"
111
212
  end
112
213
 
214
+ # Invalidates this fact at a specific time
215
+ #
216
+ # @param at [Time] when the fact became invalid (defaults to now)
217
+ # @return [Boolean] true if update succeeded
113
218
  def invalidate!(at: Time.current)
114
219
  update!(invalid_at: at)
115
220
  end
116
221
 
117
- def supersede_with!(new_fact_text, valid_at:)
222
+ # Supersedes this fact with new information
223
+ #
224
+ # Creates a new canonical fact and marks this one as superseded.
225
+ #
226
+ # @param new_text [String] the updated fact text
227
+ # @param valid_at [Date, Time] when the new fact became valid
228
+ # @return [FactDb::Models::Fact] the new fact
229
+ def supersede_with!(new_text, valid_at:)
118
230
  transaction do
119
231
  new_fact = self.class.create!(
120
- fact_text: new_fact_text,
232
+ text: new_text,
121
233
  valid_at: valid_at,
122
234
  status: "canonical",
123
235
  extraction_method: extraction_method
@@ -133,6 +245,13 @@ module FactDb
133
245
  end
134
246
  end
135
247
 
248
+ # Adds an entity mention to this fact
249
+ #
250
+ # @param entity [FactDb::Models::Entity] the entity being mentioned
251
+ # @param text [String] the mention text as it appears in the fact
252
+ # @param role [String, Symbol, nil] the role (subject, object, etc.)
253
+ # @param confidence [Float] confidence score (0.0 to 1.0)
254
+ # @return [FactDb::Models::EntityMention] the created or found mention
136
255
  def add_mention(entity:, text:, role: nil, confidence: 1.0)
137
256
  entity_mentions.find_or_create_by!(entity: entity, mention_text: text) do |m|
138
257
  m.mention_role = role
@@ -140,42 +259,154 @@ module FactDb
140
259
  end
141
260
  end
142
261
 
143
- def add_source(content:, type: "primary", excerpt: nil, confidence: 1.0)
144
- fact_sources.find_or_create_by!(content: content) do |s|
145
- s.source_type = type
262
+ # Adds a source document to this fact
263
+ #
264
+ # @param source [FactDb::Models::Source] the source document
265
+ # @param kind [String] source kind (primary, corroborating, etc.)
266
+ # @param excerpt [String, nil] relevant excerpt from the source
267
+ # @param confidence [Float] confidence score (0.0 to 1.0)
268
+ # @return [FactDb::Models::FactSource] the created or found fact-source link
269
+ def add_source(source:, kind: "primary", excerpt: nil, confidence: 1.0)
270
+ fact_sources.find_or_create_by!(source: source) do |s|
271
+ s.kind = kind
146
272
  s.excerpt = excerpt
147
273
  s.confidence = confidence
148
274
  end
149
275
  end
150
276
 
151
- # Get source facts for synthesized facts
277
+ # Returns the source facts for synthesized facts
278
+ #
279
+ # @return [ActiveRecord::Relation] facts this one was derived from
152
280
  def source_facts
153
281
  return Fact.none unless derived_from_ids.any?
154
282
 
155
283
  Fact.where(id: derived_from_ids)
156
284
  end
157
285
 
158
- # Get facts that corroborate this one
286
+ # Returns facts that corroborate this one
287
+ #
288
+ # @return [ActiveRecord::Relation] corroborating facts
159
289
  def corroborating_facts
160
290
  return Fact.none unless corroborated_by_ids.any?
161
291
 
162
292
  Fact.where(id: corroborated_by_ids)
163
293
  end
164
294
 
165
- # Evidence chain - trace back to original content
295
+ # Returns the complete evidence chain back to original sources
296
+ #
297
+ # Recursively traces through synthesized facts to find all original sources.
298
+ #
299
+ # @return [Array<FactDb::Models::Source>] unique source documents
166
300
  def evidence_chain
167
- sources = source_contents.to_a
301
+ evidence = sources.to_a
168
302
 
169
303
  if synthesized? && derived_from_ids.any?
170
304
  source_facts.each do |source_fact|
171
- sources.concat(source_fact.evidence_chain)
305
+ evidence.concat(source_fact.evidence_chain)
172
306
  end
173
307
  end
174
308
 
175
- sources.uniq
309
+ evidence.uniq
310
+ end
311
+
312
+ # Returns the original source lines from which this fact was derived
313
+ #
314
+ # Uses line metadata to extract the relevant section from the source document
315
+ # and highlights lines containing key terms from the fact.
316
+ #
317
+ # @return [Hash, nil] hash with :full_section, :focused_lines, :focused_line_numbers, :key_terms
318
+ # or nil if source/line metadata unavailable
319
+ #
320
+ # @example
321
+ # fact.prove_it
322
+ # # => {
323
+ # # full_section: "...",
324
+ # # focused_lines: "John joined Acme Corp...",
325
+ # # focused_line_numbers: [15, 16],
326
+ # # key_terms: ["John", "Acme Corp"]
327
+ # # }
328
+ def prove_it
329
+ source = fact_sources.first&.source
330
+ return nil unless source&.content
331
+
332
+ line_start = metadata&.dig("line_start")
333
+ line_end = metadata&.dig("line_end")
334
+ return nil unless line_start && line_end
335
+
336
+ lines = source.content.lines
337
+ start_idx = line_start.to_i - 1
338
+ end_idx = line_end.to_i - 1
339
+
340
+ return nil if start_idx < 0 || end_idx >= lines.length
341
+
342
+ section_lines = lines[start_idx..end_idx]
343
+ full_section = section_lines.join
344
+
345
+ # Find focused lines by matching key terms from fact
346
+ key_terms = extract_key_terms
347
+ scored_lines = score_lines_by_relevance(section_lines, key_terms, start_idx)
348
+
349
+ # Return lines that have at least one match, sorted by line number
350
+ relevant = scored_lines.select { |l| l[:score] > 0 }
351
+ .sort_by { |l| l[:line_number] }
352
+
353
+ {
354
+ full_section: full_section,
355
+ focused_lines: relevant.map { |l| l[:text] }.join,
356
+ focused_line_numbers: relevant.map { |l| l[:line_number] },
357
+ key_terms: key_terms
358
+ }
359
+ end
360
+
361
+ private
362
+
363
+ def extract_key_terms
364
+ terms = []
365
+
366
+ # Get entity names from mentions
367
+ entity_mentions.includes(:entity).each do |mention|
368
+ terms << mention.entity&.name if mention.entity&.name
369
+ terms << mention.mention_text if mention.mention_text
370
+ end
371
+
372
+ # Extract significant words from fact text (exclude common words)
373
+ stop_words = %w[a an the is was were are been being have has had do does did
374
+ will would could should may might must shall can to of in for
375
+ on with at by from as into through during before after above
376
+ below between under again further then once here there when
377
+ where why how all each few more most other some such no nor
378
+ not only own same so than too very just and but or if]
379
+
380
+ fact_words = text.downcase
381
+ .gsub(/[^a-z\s]/, " ")
382
+ .split
383
+ .reject { |w| w.length < 3 || stop_words.include?(w) }
384
+ .uniq
385
+
386
+ terms.concat(fact_words)
387
+ terms.compact.uniq.reject(&:empty?)
176
388
  end
177
389
 
178
- # Vector similarity search
390
+ def score_lines_by_relevance(lines, key_terms, start_idx)
391
+ lines.each_with_index.map do |line, idx|
392
+ line_lower = line.downcase
393
+ score = key_terms.count { |term| line_lower.include?(term.downcase) }
394
+
395
+ {
396
+ line_number: start_idx + idx + 1,
397
+ text: line,
398
+ score: score
399
+ }
400
+ end
401
+ end
402
+
403
+ public
404
+
405
+ # Finds facts by vector similarity using pgvector
406
+ #
407
+ # @param embedding [Array<Float>] the embedding vector to search with
408
+ # @param limit [Integer] maximum number of results
409
+ # @return [ActiveRecord::Relation] facts ordered by similarity
179
410
  def self.nearest_neighbors(embedding, limit: 10)
180
411
  return none unless embedding
181
412
 
@@ -184,8 +415,8 @@ module FactDb
184
415
 
185
416
  private
186
417
 
187
- def generate_fact_hash
188
- self.fact_hash = Digest::SHA256.hexdigest(fact_text) if fact_text.present?
418
+ def generate_digest
419
+ self.digest = Digest::SHA256.hexdigest(text) if text.present?
189
420
  end
190
421
  end
191
422
  end
@@ -2,28 +2,62 @@
2
2
 
3
3
  module FactDb
4
4
  module Models
5
+ # Join model linking facts to source documents
6
+ #
7
+ # Represents the provenance relationship between a fact and the source
8
+ # document(s) it was extracted from, including the relationship type
9
+ # and an optional excerpt.
10
+ #
11
+ # @example Link a fact to a source
12
+ # fact_source = FactSource.create!(
13
+ # fact: fact, source: document,
14
+ # kind: "primary", excerpt: "relevant quote..."
15
+ # )
16
+ #
5
17
  class FactSource < ActiveRecord::Base
6
18
  self.table_name = "fact_db_fact_sources"
7
19
 
8
20
  belongs_to :fact, class_name: "FactDb::Models::Fact"
9
- belongs_to :content, class_name: "FactDb::Models::Content"
21
+ belongs_to :source, class_name: "FactDb::Models::Source"
10
22
 
11
- validates :fact_id, uniqueness: { scope: :content_id }
23
+ validates :fact_id, uniqueness: { scope: :source_id }
12
24
 
13
- # Source types
14
- TYPES = %w[primary supporting corroborating].freeze
25
+ # @return [Array<String>] valid source relationship kinds
26
+ KINDS = %w[primary supporting corroborating].freeze
15
27
 
16
- validates :source_type, inclusion: { in: TYPES }
28
+ validates :kind, inclusion: { in: KINDS }
17
29
 
18
- scope :primary, -> { where(source_type: "primary") }
19
- scope :supporting, -> { where(source_type: "supporting") }
20
- scope :corroborating, -> { where(source_type: "corroborating") }
30
+ # @!method primary
31
+ # Returns primary source links
32
+ # @return [ActiveRecord::Relation]
33
+ scope :primary, -> { where(kind: "primary") }
34
+
35
+ # @!method supporting
36
+ # Returns supporting source links
37
+ # @return [ActiveRecord::Relation]
38
+ scope :supporting, -> { where(kind: "supporting") }
39
+
40
+ # @!method corroborating
41
+ # Returns corroborating source links
42
+ # @return [ActiveRecord::Relation]
43
+ scope :corroborating, -> { where(kind: "corroborating") }
44
+
45
+ # @!method high_confidence
46
+ # Returns source links with confidence >= 0.9
47
+ # @return [ActiveRecord::Relation]
21
48
  scope :high_confidence, -> { where("confidence >= ?", 0.9) }
22
49
 
50
+ # Checks if this is the primary source for the fact
51
+ #
52
+ # @return [Boolean] true if kind is "primary"
23
53
  def primary?
24
- source_type == "primary"
54
+ kind == "primary"
25
55
  end
26
56
 
57
+ # Returns a preview of the excerpt, truncated if needed
58
+ #
59
+ # @param length [Integer] maximum length (default: 100)
60
+ # @return [String, nil] excerpt preview with "..." if truncated, or nil if no excerpt
27
61
  def excerpt_preview(length: 100)
28
62
  return nil if excerpt.nil?
29
63
  return excerpt if excerpt.length <= length
@@ -0,0 +1,113 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FactDb
4
+ module Models
5
+ # Represents a source document from which facts are extracted
6
+ #
7
+ # Sources are immutable content documents (emails, transcripts, documents, etc.)
8
+ # that serve as the provenance for extracted facts. Content is deduplicated
9
+ # by SHA256 hash.
10
+ #
11
+ # @example Create a source
12
+ # source = Source.create!(content: "Meeting notes...", kind: "meeting_notes", captured_at: Time.now)
13
+ #
14
+ # @example Search sources
15
+ # Source.search_text("quarterly report").by_kind("document")
16
+ #
17
+ class Source < ActiveRecord::Base
18
+ self.table_name = "fact_db_sources"
19
+
20
+ has_many :fact_sources, class_name: "FactDb::Models::FactSource",
21
+ foreign_key: :source_id, dependent: :destroy
22
+ has_many :facts, through: :fact_sources
23
+
24
+ validates :content_hash, presence: true, uniqueness: true
25
+ validates :kind, presence: true
26
+ validates :content, presence: true
27
+ validates :captured_at, presence: true
28
+
29
+ before_validation :generate_content_hash, on: :create
30
+
31
+ # @return [Array<String>] valid source content kinds
32
+ KINDS = %w[email transcript document slack meeting_notes contract report].freeze
33
+
34
+ validates :kind, inclusion: { in: KINDS }, allow_nil: false
35
+
36
+ # @!method by_kind(k)
37
+ # Returns sources of a specific kind
38
+ # @param k [String] the source kind
39
+ # @return [ActiveRecord::Relation]
40
+ scope :by_kind, ->(k) { where(kind: k) }
41
+
42
+ # @!method captured_between(from, to)
43
+ # Returns sources captured within a date range
44
+ # @param from [Date, Time] start of range
45
+ # @param to [Date, Time] end of range
46
+ # @return [ActiveRecord::Relation]
47
+ scope :captured_between, ->(from, to) { where(captured_at: from..to) }
48
+
49
+ # @!method captured_after(date)
50
+ # Returns sources captured on or after a date (inclusive)
51
+ # @param date [Date, Time] the cutoff date
52
+ # @return [ActiveRecord::Relation]
53
+ scope :captured_after, ->(date) { where("captured_at >= ?", date) }
54
+
55
+ # @!method captured_before(date)
56
+ # Returns sources captured on or before a date (inclusive)
57
+ # @param date [Date, Time] the cutoff date
58
+ # @return [ActiveRecord::Relation]
59
+ scope :captured_before, ->(date) { where("captured_at <= ?", date) }
60
+
61
+ # @!method search_text(query)
62
+ # Full-text search on source content using PostgreSQL tsvector
63
+ # @param query [String] the search query
64
+ # @return [ActiveRecord::Relation]
65
+ scope :search_text, lambda { |query|
66
+ where("to_tsvector('english', content) @@ plainto_tsquery('english', ?)", query)
67
+ }
68
+
69
+ # Finds sources by vector similarity using pgvector
70
+ #
71
+ # @param embedding [Array<Float>] the embedding vector to search with
72
+ # @param limit [Integer] maximum number of results
73
+ # @return [ActiveRecord::Relation] sources ordered by similarity
74
+ def self.nearest_neighbors(embedding, limit: 10)
75
+ return none unless embedding
76
+
77
+ order(Arel.sql("embedding <=> '#{embedding}'")).limit(limit)
78
+ end
79
+
80
+ # Returns whether the source content can be modified
81
+ #
82
+ # Sources are always immutable to preserve provenance integrity.
83
+ #
84
+ # @return [Boolean] always returns true
85
+ def immutable?
86
+ true
87
+ end
88
+
89
+ # Returns the word count of the content
90
+ #
91
+ # @return [Integer] number of words in content
92
+ def word_count
93
+ content.split.size
94
+ end
95
+
96
+ # Returns a preview of the content, truncated if needed
97
+ #
98
+ # @param length [Integer] maximum length (default: 200)
99
+ # @return [String] content preview with "..." if truncated
100
+ def preview(length: 200)
101
+ return content if content.length <= length
102
+
103
+ "#{content[0, length]}..."
104
+ end
105
+
106
+ private
107
+
108
+ def generate_content_hash
109
+ self.content_hash = Digest::SHA256.hexdigest(content) if content.present?
110
+ end
111
+ end
112
+ end
113
+ end