fact_db 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. checksums.yaml +7 -0
  2. data/.envrc +1 -0
  3. data/CHANGELOG.md +48 -0
  4. data/COMMITS.md +196 -0
  5. data/README.md +102 -0
  6. data/Rakefile +41 -0
  7. data/db/migrate/001_enable_extensions.rb +7 -0
  8. data/db/migrate/002_create_contents.rb +44 -0
  9. data/db/migrate/003_create_entities.rb +36 -0
  10. data/db/migrate/004_create_entity_aliases.rb +18 -0
  11. data/db/migrate/005_create_facts.rb +65 -0
  12. data/db/migrate/006_create_entity_mentions.rb +18 -0
  13. data/db/migrate/007_create_fact_sources.rb +18 -0
  14. data/docs/api/extractors/index.md +71 -0
  15. data/docs/api/extractors/llm.md +162 -0
  16. data/docs/api/extractors/manual.md +92 -0
  17. data/docs/api/extractors/rule-based.md +165 -0
  18. data/docs/api/facts.md +300 -0
  19. data/docs/api/index.md +66 -0
  20. data/docs/api/models/content.md +165 -0
  21. data/docs/api/models/entity.md +202 -0
  22. data/docs/api/models/fact.md +270 -0
  23. data/docs/api/models/index.md +77 -0
  24. data/docs/api/pipeline/extraction.md +175 -0
  25. data/docs/api/pipeline/index.md +72 -0
  26. data/docs/api/pipeline/resolution.md +209 -0
  27. data/docs/api/services/content-service.md +166 -0
  28. data/docs/api/services/entity-service.md +202 -0
  29. data/docs/api/services/fact-service.md +223 -0
  30. data/docs/api/services/index.md +55 -0
  31. data/docs/architecture/database-schema.md +293 -0
  32. data/docs/architecture/entity-resolution.md +293 -0
  33. data/docs/architecture/index.md +149 -0
  34. data/docs/architecture/temporal-facts.md +268 -0
  35. data/docs/architecture/three-layer-model.md +242 -0
  36. data/docs/assets/css/custom.css +137 -0
  37. data/docs/assets/fact_db.jpg +0 -0
  38. data/docs/assets/images/fact_db.jpg +0 -0
  39. data/docs/concepts.md +183 -0
  40. data/docs/examples/basic-usage.md +235 -0
  41. data/docs/examples/hr-onboarding.md +312 -0
  42. data/docs/examples/index.md +64 -0
  43. data/docs/examples/news-analysis.md +288 -0
  44. data/docs/getting-started/database-setup.md +170 -0
  45. data/docs/getting-started/index.md +71 -0
  46. data/docs/getting-started/installation.md +98 -0
  47. data/docs/getting-started/quick-start.md +191 -0
  48. data/docs/guides/batch-processing.md +325 -0
  49. data/docs/guides/configuration.md +243 -0
  50. data/docs/guides/entity-management.md +364 -0
  51. data/docs/guides/extracting-facts.md +299 -0
  52. data/docs/guides/index.md +22 -0
  53. data/docs/guides/ingesting-content.md +252 -0
  54. data/docs/guides/llm-integration.md +299 -0
  55. data/docs/guides/temporal-queries.md +315 -0
  56. data/docs/index.md +121 -0
  57. data/examples/README.md +130 -0
  58. data/examples/basic_usage.rb +164 -0
  59. data/examples/entity_management.rb +216 -0
  60. data/examples/hr_system.rb +428 -0
  61. data/examples/rule_based_extraction.rb +258 -0
  62. data/examples/temporal_queries.rb +245 -0
  63. data/lib/fact_db/config.rb +71 -0
  64. data/lib/fact_db/database.rb +45 -0
  65. data/lib/fact_db/errors.rb +10 -0
  66. data/lib/fact_db/extractors/base.rb +117 -0
  67. data/lib/fact_db/extractors/llm_extractor.rb +179 -0
  68. data/lib/fact_db/extractors/manual_extractor.rb +53 -0
  69. data/lib/fact_db/extractors/rule_based_extractor.rb +228 -0
  70. data/lib/fact_db/llm/adapter.rb +109 -0
  71. data/lib/fact_db/models/content.rb +62 -0
  72. data/lib/fact_db/models/entity.rb +84 -0
  73. data/lib/fact_db/models/entity_alias.rb +26 -0
  74. data/lib/fact_db/models/entity_mention.rb +33 -0
  75. data/lib/fact_db/models/fact.rb +192 -0
  76. data/lib/fact_db/models/fact_source.rb +35 -0
  77. data/lib/fact_db/pipeline/extraction_pipeline.rb +146 -0
  78. data/lib/fact_db/pipeline/resolution_pipeline.rb +129 -0
  79. data/lib/fact_db/resolution/entity_resolver.rb +261 -0
  80. data/lib/fact_db/resolution/fact_resolver.rb +259 -0
  81. data/lib/fact_db/services/content_service.rb +93 -0
  82. data/lib/fact_db/services/entity_service.rb +150 -0
  83. data/lib/fact_db/services/fact_service.rb +193 -0
  84. data/lib/fact_db/temporal/query.rb +125 -0
  85. data/lib/fact_db/temporal/timeline.rb +134 -0
  86. data/lib/fact_db/version.rb +5 -0
  87. data/lib/fact_db.rb +141 -0
  88. data/mkdocs.yml +198 -0
  89. metadata +288 -0
@@ -0,0 +1,261 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FactDb
4
+ module Resolution
5
# Resolves free-text names to entity records and maintains the entity set:
# merging, splitting and duplicate detection.
#
# Name resolution tries, in order: a case-insensitive exact alias match, a
# case-insensitive canonical-name match, then fuzzy matching based on a
# length-normalized Levenshtein distance.
class EntityResolver
  attr_reader :config

  # @param config [Object] configuration object; must respond to
  #   +fuzzy_match_threshold+ and +auto_merge_threshold+ (defaults to
  #   +FactDb.config+)
  def initialize(config = FactDb.config)
    @config = config
    @threshold = config.fuzzy_match_threshold
    @auto_merge_threshold = config.auto_merge_threshold
  end

  # Resolve a name to an entity.
  #
  # @param name [String, nil] name to look up
  # @param type [Object, nil] when given, only entities of this type match
  # @return [ResolvedEntity, nil] nil when +name+ is nil/empty or nothing
  #   matches at or above the fuzzy-match threshold
  def resolve(name, type: nil)
    return nil if name.nil? || name.empty?

    # 1. Exact alias match
    exact = find_by_exact_alias(name, type: type)
    return ResolvedEntity.new(exact, confidence: 1.0, match_type: :exact_alias) if exact

    # 2. Canonical name match
    canonical = find_by_canonical_name(name, type: type)
    return ResolvedEntity.new(canonical, confidence: 1.0, match_type: :canonical_name) if canonical

    # 3. Fuzzy matching (already returns nil below the threshold)
    find_by_fuzzy_match(name, type: type)
  end

  # Resolve a name to an existing entity, creating one when nothing matches.
  #
  # @param name [String] name to resolve
  # @param type [Object] entity type used for lookup and creation
  # @param aliases [Array] aliases added to a newly created entity
  # @param attributes [Hash] passed through to entity creation
  # @return [Object] the matched or newly created entity record
  def resolve_or_create(name, type:, aliases: [], attributes: {})
    resolved = resolve(name, type: type)
    return resolved.entity if resolved

    create_entity(name, type: type, aliases: aliases, attributes: attributes)
  end

  # Merge two entities, keeping one as canonical.
  #
  # The merged entity's aliases and canonical name become aliases of the kept
  # entity, its mentions are re-pointed, and it is marked as merged.
  #
  # @param keep_id [Object] id of the entity that survives
  # @param merge_id [Object] id of the entity that is folded into it
  # @return [Object] the reloaded kept entity
  # @raise [ResolutionError] when both ids refer to the same record, or when
  #   either entity is already merged
  def merge(keep_id, merge_id)
    keep = Models::Entity.find(keep_id)
    merge_entity = Models::Entity.find(merge_id)

    # Compare the loaded records' ids rather than the raw arguments so that
    # "1" and 1 are recognised as the same entity.
    raise ResolutionError, "Cannot merge entity into itself" if keep.id == merge_entity.id
    raise ResolutionError, "Cannot merge already merged entity" if merge_entity.merged?
    # Merging into a merged record would attach mentions to an entity that
    # the lookup methods below deliberately exclude.
    raise ResolutionError, "Cannot merge into already merged entity" if keep.merged?

    Models::Entity.transaction do
      # Move all aliases to kept entity
      merge_entity.aliases.each do |alias_record|
        keep.aliases.find_or_create_by!(alias_text: alias_record.alias_text) do |a|
          a.alias_type = alias_record.alias_type
          a.confidence = alias_record.confidence
        end
      end

      # Add the merged entity's canonical name as an alias
      keep.aliases.find_or_create_by!(alias_text: merge_entity.canonical_name) do |a|
        a.alias_type = "name"
        a.confidence = 1.0
      end

      # Update all entity mentions to point to kept entity
      Models::EntityMention.where(entity_id: merge_entity.id).update_all(entity_id: keep.id)

      # Mark merged entity
      merge_entity.update!(
        resolution_status: "merged",
        merged_into_id: keep.id
      )
    end

    keep.reload
  end

  # Split an entity into multiple entities.
  #
  # Creates one new entity per entry of +split_configs+ and marks the
  # original as "split". Nothing else on the original is changed here.
  #
  # @param entity_id [Object] id of the entity to split
  # @param split_configs [Array<Hash>] hashes with +:name+ and optional
  #   +:type+ (defaults to the original's type), +:aliases+, +:attributes+
  # @return [Array] the newly created entities
  def split(entity_id, split_configs)
    original = Models::Entity.find(entity_id)

    Models::Entity.transaction do
      # Block variable is not called `config` so it does not shadow the reader.
      new_entities = split_configs.map do |split_config|
        create_entity(
          split_config[:name],
          type: split_config[:type] || original.entity_type,
          aliases: split_config[:aliases] || [],
          attributes: split_config[:attributes] || {}
        )
      end

      original.update!(resolution_status: "split")

      new_entities
    end
  end

  # Find potential duplicate entities by comparing canonical names pairwise.
  #
  # Loads every entity in the +resolved+ scope and compares each pair, so the
  # work grows quadratically with the number of entities.
  #
  # @param threshold [Numeric, nil] minimum similarity; defaults to the
  #   configured fuzzy-match threshold
  # @return [Array<Hash>] hashes with +:entity1+, +:entity2+, +:similarity+,
  #   ordered from most to least similar
  def find_duplicates(threshold: nil)
    threshold ||= @threshold
    duplicates = []

    entities = Models::Entity.resolved.to_a

    entities.each_with_index do |entity, index|
      entities.drop(index + 1).each do |other|
        similarity = calculate_similarity(entity.canonical_name, other.canonical_name)
        next if similarity < threshold

        duplicates << { entity1: entity, entity2: other, similarity: similarity }
      end
    end

    duplicates.sort_by { |d| -d[:similarity] }
  end

  # Auto-merge high-confidence duplicates.
  #
  # For each candidate pair the entity with more mentions is kept (ties keep
  # +entity1+).
  #
  # @return [Array<Hash>] the candidate pairs that were considered
  def auto_merge_duplicates!
    duplicates = find_duplicates(threshold: @auto_merge_threshold)

    # `merge` loads its own copies of the records, so the objects held in
    # `duplicates` never observe the "merged" status change. Track the ids
    # merged during this run so later pairs involving them are skipped.
    merged_ids = {}

    duplicates.each do |dup|
      first = dup[:entity1]
      second = dup[:entity2]

      next if first.merged? || second.merged?
      next if merged_ids.key?(first.id) || merged_ids.key?(second.id)

      # Keep the entity with more mentions
      keep, merge_entity =
        if first.entity_mentions.count >= second.entity_mentions.count
          [first, second]
        else
          [second, first]
        end

      merge(keep.id, merge_entity.id)
      merged_ids[merge_entity.id] = true
    end
  end

  private

  # Case-insensitive exact lookup through the alias table, ignoring aliases
  # that belong to merged entities.
  #
  # @return [Object, nil] the owning entity of the first matching alias
  def find_by_exact_alias(name, type:)
    scope = Models::EntityAlias
            .joins(:entity)
            .where(["LOWER(alias_text) = ?", name.downcase])
            .where.not(fact_db_entities: { resolution_status: "merged" })
    scope = scope.where(fact_db_entities: { entity_type: type }) if type
    scope.first&.entity
  end

  # Case-insensitive exact lookup on the canonical name among entities in the
  # +not_merged+ scope.
  #
  # @return [Object, nil] the first matching entity
  def find_by_canonical_name(name, type:)
    scope = Models::Entity.where(["LOWER(canonical_name) = ?", name.downcase])
    scope = scope.where(entity_type: type) if type
    scope.not_merged.first
  end

  # Scan every non-merged entity (optionally of one type) and pick the one
  # whose canonical name or any alias is most similar to +name+.
  #
  # NOTE(review): aliases are read per entity inside the loop; if `aliases`
  # is a plain association, eager loading it would avoid one query per
  # entity — confirm against the Entity model before changing.
  #
  # @return [ResolvedEntity, nil] nil when the best similarity is below the
  #   configured threshold
  def find_by_fuzzy_match(name, type:)
    candidates = Models::Entity.not_merged
    candidates = candidates.where(entity_type: type) if type

    best_match = nil
    best_similarity = 0

    candidates.find_each do |entity|
      # Best score for this entity across its canonical name and aliases.
      # Only a strictly better score replaces the current best, so the first
      # entity encountered wins ties.
      names = [entity.canonical_name] + entity.aliases.map(&:alias_text)
      names.each do |candidate_name|
        similarity = calculate_similarity(name, candidate_name)
        next unless similarity > best_similarity

        best_similarity = similarity
        best_match = entity
      end
    end

    return nil if best_match.nil? || best_similarity < @threshold

    ResolvedEntity.new(best_match, confidence: best_similarity, match_type: :fuzzy)
  end

  # Create a "resolved" entity together with its aliases.
  #
  # Runs in a transaction so that a failure while adding an alias does not
  # leave a half-created entity behind.
  #
  # NOTE(review): in ActiveRecord `attributes=` is an alias of
  # `assign_attributes`, so the `attributes:` key below is probably treated
  # as mass assignment rather than written to a column — confirm against the
  # Entity model/schema.
  #
  # @return [Object] the created entity
  def create_entity(name, type:, aliases: [], attributes: {})
    Models::Entity.transaction do
      entity = Models::Entity.create!(
        canonical_name: name,
        entity_type: type,
        attributes: attributes,
        resolution_status: "resolved"
      )

      aliases.each { |alias_text| entity.add_alias(alias_text) }

      entity
    end
  end

  # Case-insensitive similarity between two strings.
  #
  # @return [Float] 1.0 for equal strings (ignoring case), otherwise
  #   1 - (edit distance / longer length); 0.0 when either value is nil
  def calculate_similarity(a, b)
    # A missing name cannot meaningfully match anything.
    return 0.0 if a.nil? || b.nil?

    left = a.downcase
    right = b.downcase
    return 1.0 if left == right

    # Measure lengths on the same (downcased) strings the distance is
    # computed on; case mapping can change the length of some strings.
    # max_len cannot be zero here: two empty strings are equal and were
    # handled above.
    max_len = [left.length, right.length].max

    1.0 - (levenshtein_distance(left, right).to_f / max_len)
  end

  # Levenshtein edit distance using a single rolling row.
  #
  # @return [Integer] number of single-character insertions, deletions or
  #   substitutions needed to turn +a+ into +b+
  def levenshtein_distance(a, b)
    m = a.length
    n = b.length
    # row[i] holds the distance between a[0, i] and the prefix of b
    # processed so far.
    row = (0..m).to_a

    (1..n).each do |j|
      diagonal = row[0] # value for (i - 1, j - 1)
      row[0] = j
      (1..m).each do |i|
        above = row[i] # value for (i, j - 1), about to be overwritten
        row[i] = if a[i - 1] == b[j - 1]
                   diagonal
                 else
                   [diagonal, above, row[i - 1]].min + 1
                 end
        diagonal = above
      end
    end

    row[m]
  end
end
230
+
231
# Result of a name resolution: the matched entity together with how
# confidently, and by which strategy, it was matched.
class ResolvedEntity
  attr_reader :entity, :confidence, :match_type

  # @param entity [Object] the matched entity record
  # @param confidence [Numeric] match confidence
  # @param match_type [Symbol] strategy that produced the match
  def initialize(entity, confidence:, match_type:)
    @match_type = match_type
    @confidence = confidence
    @entity = entity
  end

  # @return [Boolean] true when the confidence is exactly 1.0
  def exact_match?
    confidence == 1.0
  end

  # @return [Boolean] true when the match came from fuzzy matching
  def fuzzy_match?
    match_type == :fuzzy
  end

  # Readers forwarded to the wrapped entity: #id, #canonical_name and
  # #entity_type.
  %i[id canonical_name entity_type].each do |reader|
    define_method(reader) { entity.public_send(reader) }
  end
end
260
+ end
261
+ end
@@ -0,0 +1,259 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FactDb
4
+ module Resolution
5
# Maintains relationships between facts: superseding, synthesizing,
# corroborating, invalidating and conflict handling.
class FactResolver
  attr_reader :config

  # @param config [Object] configuration object (defaults to +FactDb.config+)
  def initialize(config = FactDb.config)
    @config = config
  end

  # Supersede an existing fact with a new one.
  #
  # The new fact inherits the old fact's extraction method, confidence and
  # sources. The old fact is marked "superseded" and becomes invalid at
  # +valid_at+.
  #
  # @param old_fact_id [Object] id of the fact being replaced
  # @param new_fact_text [String] text of the replacement fact
  # @param valid_at [Object] start of validity of the new fact
  # @param mentions [Array<Hash>] explicit mentions (+:entity+ or
  #   +:entity_id+, +:text+, +:role+, optional +:confidence+); when empty the
  #   old fact's mentions are copied
  # @return [Object] the newly created fact
  # @raise [ResolutionError] when the old fact is already superseded
  def supersede(old_fact_id, new_fact_text, valid_at:, mentions: [])
    old_fact = Models::Fact.find(old_fact_id)

    raise ResolutionError, "Cannot supersede already superseded fact" if old_fact.superseded?

    Models::Fact.transaction do
      new_fact = Models::Fact.create!(
        fact_text: new_fact_text,
        valid_at: valid_at,
        status: "canonical",
        extraction_method: old_fact.extraction_method,
        confidence: old_fact.confidence
      )

      # Copy mentions from old fact if not provided
      if mentions.empty?
        old_fact.entity_mentions.each do |mention|
          new_fact.add_mention(
            entity: mention.entity,
            text: mention.mention_text,
            role: mention.mention_role,
            confidence: mention.confidence
          )
        end
      else
        add_explicit_mentions(new_fact, mentions)
      end

      # Copy sources from old fact
      old_fact.fact_sources.each do |source|
        new_fact.add_source(
          content: source.content,
          type: source.source_type,
          excerpt: source.excerpt,
          confidence: source.confidence
        )
      end

      # Mark old fact as superseded
      old_fact.update!(
        status: "superseded",
        superseded_by_id: new_fact.id,
        invalid_at: valid_at
      )

      new_fact
    end
  end

  # Synthesize a new fact from multiple source facts.
  #
  # @param source_fact_ids [Array] ids of the facts to derive from
  # @param synthesized_text [String] text of the synthesized fact
  # @param valid_at [Object] start of validity
  # @param invalid_at [Object, nil] end of validity
  # @param mentions [Array<Hash>] explicit mentions; when empty, mentions are
  #   aggregated from the source facts
  # @return [Object] the synthesized fact
  # @raise [ResolutionError] when none of the ids match an existing fact
  def synthesize(source_fact_ids, synthesized_text, valid_at:, invalid_at: nil, mentions: [])
    # Load once; the records are reused for confidence, mentions and sources.
    source_facts = Models::Fact.where(id: source_fact_ids).to_a

    raise ResolutionError, "No source facts found" if source_facts.empty?

    Models::Fact.transaction do
      synthesized = Models::Fact.create!(
        fact_text: synthesized_text,
        valid_at: valid_at,
        invalid_at: invalid_at,
        status: "synthesized",
        # Record only ids that actually resolved to a fact so the derived
        # fact never references a record that does not exist.
        derived_from_ids: source_facts.map(&:id),
        extraction_method: "synthesized",
        confidence: calculate_synthesized_confidence(source_facts)
      )

      # Aggregate entity mentions from source facts if not provided
      if mentions.empty?
        aggregate_mentions(source_facts).each do |mention|
          synthesized.add_mention(**mention)
        end
      else
        add_explicit_mentions(synthesized, mentions)
      end

      # Link all source content
      source_facts.each do |source_fact|
        source_fact.fact_sources.each do |source|
          synthesized.add_source(
            content: source.content,
            type: "supporting",
            excerpt: source.excerpt,
            confidence: source.confidence
          )
        end
      end

      synthesized
    end
  end

  # Mark a fact as corroborated by another fact.
  #
  # The corroborating id is added to the fact's list (without duplicates). A
  # "canonical" fact is promoted to "corroborated" once the list holds at
  # least two ids. Both changes are written in a single update.
  #
  # @param fact_id [Object] id of the fact being corroborated
  # @param corroborating_fact_id [Object] id of the supporting fact
  # @return [Object] the updated fact
  # @raise [ResolutionError] when both ids refer to the same fact
  def corroborate(fact_id, corroborating_fact_id)
    fact = Models::Fact.find(fact_id)
    corroborating = Models::Fact.find(corroborating_fact_id)

    # Compare loaded ids so "1" and 1 are recognised as the same fact.
    raise ResolutionError, "Cannot corroborate with same fact" if fact.id == corroborating.id

    corroborated_ids = (Array(fact.corroborated_by_ids) + [corroborating.id]).uniq
    changes = { corroborated_by_ids: corroborated_ids }
    changes[:status] = "corroborated" if fact.status == "canonical" && corroborated_ids.size >= 2

    fact.update!(changes)
    fact
  end

  # Invalidate a fact without replacement.
  #
  # @param fact_id [Object] id of the fact
  # @param at [Object] moment the fact stops being valid
  # @return [Object] the updated fact
  def invalidate(fact_id, at: Time.current)
    fact = Models::Fact.find(fact_id)
    fact.update!(invalid_at: at)
    fact
  end

  # Find potentially conflicting facts.
  #
  # Compares every pair of canonical, currently valid facts (optionally
  # narrowed by entity and/or topic) and reports pairs whose word overlap is
  # above 0.5 but below 0.95.
  #
  # @param entity_id [Object, nil] restrict to facts mentioning this entity
  # @param topic [String, nil] restrict to facts matching this text search
  # @return [Array<Hash>] hashes with +:fact1+, +:fact2+, +:similarity+,
  #   ordered from most to least similar
  def find_conflicts(entity_id: nil, topic: nil)
    scope = Models::Fact.canonical.currently_valid
    scope = scope.mentioning_entity(entity_id) if entity_id
    scope = scope.search_text(topic) if topic

    # Group facts that might be about the same thing
    facts = scope.to_a
    conflicts = []

    facts.each_with_index do |fact, index|
      facts.drop(index + 1).each do |other|
        similarity = text_similarity(fact.fact_text, other.fact_text)
        next unless similarity > 0.5 && similarity < 0.95

        conflicts << { fact1: fact, fact2: other, similarity: similarity }
      end
    end

    conflicts.sort_by { |c| -c[:similarity] }
  end

  # Resolve conflicts by keeping one fact and superseding others.
  #
  # @param keep_fact_id [Object] id of the fact that remains authoritative
  # @param supersede_fact_ids [Array] ids of the facts to mark superseded
  # @param reason [Object, nil] stored under +:supersede_reason+ in each
  #   superseded fact's metadata
  # @return [Object] the kept fact
  # @raise [ResolutionError] when the kept fact is also listed for
  #   supersession
  def resolve_conflict(keep_fact_id, supersede_fact_ids, reason: nil)
    # Load the surviving fact first so that an unknown id fails before any
    # other fact has been pointed at it.
    keep = Models::Fact.find(keep_fact_id)
    # One timestamp for the whole operation.
    superseded_at = Time.current

    Models::Fact.transaction do
      supersede_fact_ids.each do |fact_id|
        fact = Models::Fact.find(fact_id)
        raise ResolutionError, "Cannot supersede the fact being kept" if fact.id == keep.id

        fact.update!(
          status: "superseded",
          superseded_by_id: keep.id,
          invalid_at: superseded_at,
          metadata: (fact.metadata || {}).merge(supersede_reason: reason)
        )
      end
    end

    keep
  end

  # Build a timeline fact from point-in-time facts.
  #
  # Synthesizes one fact spanning from the earliest +valid_at+ to the latest
  # +invalid_at+ (if any) among the facts mentioning the entity.
  #
  # @param entity_id [Object] entity whose facts are summarized
  # @param topic [String, nil] optional text filter, also used in the label
  # @return [Object, nil] the synthesized fact, or nil when no facts match
  def build_timeline_fact(entity_id:, topic: nil)
    facts = Models::Fact.mentioning_entity(entity_id)
    facts = facts.search_text(topic) if topic
    facts = facts.order(valid_at: :asc).to_a

    return nil if facts.empty?

    # Find start and end dates
    start_date = facts.first.valid_at
    end_date = facts.map(&:invalid_at).compact.max

    entity = Models::Entity.find(entity_id)
    synthesized_text = "#{entity.canonical_name}: #{topic || 'timeline'} from #{start_date.to_date}"
    synthesized_text = "#{synthesized_text} to #{end_date.to_date}" if end_date

    synthesize(
      facts.map(&:id),
      synthesized_text,
      valid_at: start_date,
      invalid_at: end_date
    )
  end

  private

  # Attach caller-supplied mentions to +fact+. Each mention hash carries
  # either +:entity+ or +:entity_id+; confidence defaults to 1.0.
  def add_explicit_mentions(fact, mentions)
    mentions.each do |mention|
      entity = mention[:entity] || Models::Entity.find(mention[:entity_id])
      fact.add_mention(
        entity: entity,
        text: mention[:text],
        role: mention[:role],
        confidence: mention[:confidence] || 1.0
      )
    end
  end

  # Arithmetic mean of the source facts' confidences.
  #
  # Facts without a confidence are ignored, and the division is done in
  # floating point so integer confidences are not truncated.
  #
  # @return [Float] the mean, or 0.0 when no fact carries a confidence
  def calculate_synthesized_confidence(source_facts)
    confidences = source_facts.map(&:confidence).compact
    return 0.0 if confidences.empty?

    confidences.sum.to_f / confidences.size
  end

  # Collapse the source facts' mentions to one per (entity, role) pair,
  # keeping the highest-confidence mention; the first seen wins ties.
  #
  # @return [Array<Hash>] hashes with +:entity+, +:text+, +:role+,
  #   +:confidence+, in order of first appearance
  def aggregate_mentions(source_facts)
    best = {}

    source_facts.each do |fact|
      fact.entity_mentions.each do |mention|
        key = [mention.entity_id, mention.mention_role]
        current = best[key]
        next if current && mention.confidence <= current[:confidence]

        best[key] = {
          entity: mention.entity,
          text: mention.mention_text,
          role: mention.mention_role,
          confidence: mention.confidence
        }
      end
    end

    best.values
  end

  # Jaccard similarity over the sets of lowercase, whitespace-separated
  # words of the two texts.
  #
  # @return [Float] shared words / distinct words; 0.0 when either text has
  #   no words
  def text_similarity(text1, text2)
    words1 = text1.downcase.split
    words2 = text2.downcase.split

    return 0.0 if words1.empty? || words2.empty?

    shared = words1 & words2
    combined = words1 | words2

    shared.size.to_f / combined.size
  end
end
+ end
259
+ end
@@ -0,0 +1,93 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FactDb
4
+ module Services
5
# Service for storing, finding and searching content records.
class ContentService
  attr_reader :config

  # @param config [Object] configuration object; +embedding_generator+ and
  #   +logger+ are read from it (defaults to +FactDb.config+)
  def initialize(config = FactDb.config)
    @config = config
  end

  # Store a piece of content, de-duplicated by the SHA-256 of its text.
  #
  # @param raw_text [String] the content text
  # @param type [#to_s] content type, stored as a string
  # @param captured_at [Object] capture time
  # @param metadata [Hash] stored as +source_metadata+
  # @param title [String, nil]
  # @param source_uri [String, nil]
  # @return [Object] the existing record when the same text was stored
  #   before, otherwise the newly created record
  def create(raw_text, type:, captured_at: Time.current, metadata: {}, title: nil, source_uri: nil)
    content_hash = Digest::SHA256.hexdigest(raw_text)

    # Check for duplicate content
    existing = Models::Content.find_by(content_hash: content_hash)
    return existing if existing

    Models::Content.create!(
      raw_text: raw_text,
      content_hash: content_hash,
      content_type: type.to_s,
      title: title,
      source_uri: source_uri,
      source_metadata: metadata,
      captured_at: captured_at,
      embedding: generate_embedding(raw_text)
    )
  end

  # @param id [Object] record id
  # @return [Object] the content record
  def find(id)
    Models::Content.find(id)
  end

  # @param hash [String] content hash as stored in +content_hash+
  # @return [Object, nil] the matching record, if any
  def find_by_hash(hash)
    Models::Content.find_by(content_hash: hash)
  end

  # Text search with optional filters, newest capture first.
  #
  # @param query [String] search text
  # @param type [Object, nil] content type filter
  # @param from [Object, nil] only content captured after this time
  # @param to [Object, nil] only content captured before this time
  # @param limit [Integer] maximum number of records
  # @return [Object] the resulting query scope
  def search(query, type: nil, from: nil, to: nil, limit: 20)
    scope = Models::Content.search_text(query)
    scope = scope.by_type(type) if type
    scope = scope.captured_after(from) if from
    scope = scope.captured_before(to) if to
    scope.order(captured_at: :desc).limit(limit)
  end

  # Nearest-neighbour search using an embedding of +query+.
  #
  # @param query [String] search text
  # @param limit [Integer] maximum number of records
  # @return [Object] matching records; an empty scope when no embedding can
  #   be generated
  def semantic_search(query, limit: 20)
    embedding = generate_embedding(query)
    return Models::Content.none unless embedding

    Models::Content.nearest_neighbors(embedding, limit: limit)
  end

  # Content of one type, newest capture first.
  #
  # @param type [Object] content type
  # @param limit [Integer, nil] optional maximum number of records
  # @return [Object] the resulting query scope
  def by_type(type, limit: nil)
    scope = Models::Content.by_type(type).order(captured_at: :desc)
    scope = scope.limit(limit) if limit
    scope
  end

  # Content captured between two times, oldest first.
  #
  # @return [Object] the resulting query scope
  def between(from, to)
    Models::Content.captured_between(from, to).order(captured_at: :asc)
  end

  # The most recently captured content.
  #
  # @param limit [Integer] maximum number of records
  # @return [Object] the resulting query scope
  def recent(limit: 10)
    Models::Content.order(captured_at: :desc).limit(limit)
  end

  # Aggregate statistics over all content.
  #
  # @return [Hash] +:total+ and +:total_count+ (the same record count),
  #   +:by_type+ (count per content type), +:earliest+ / +:latest+ capture
  #   times, and +:total_words+ (sum of whitespace-separated token counts
  #   computed in SQL)
  def stats
    # Both keys report the same number; count once instead of querying twice.
    total = Models::Content.count

    {
      total: total,
      total_count: total,
      by_type: Models::Content.group(:content_type).count,
      earliest: Models::Content.minimum(:captured_at),
      latest: Models::Content.maximum(:captured_at),
      total_words: Models::Content.sum("array_length(regexp_split_to_array(raw_text, '\\s+'), 1)")
    }
  end

  private

  # Best-effort embedding generation: a failing generator is logged as a
  # warning and treated as "no embedding" rather than aborting the caller.
  #
  # @return [Object, nil] the generator's result, or nil when no generator
  #   is configured or it raised
  def generate_embedding(text)
    return nil unless config.embedding_generator

    config.embedding_generator.call(text)
  rescue StandardError => e
    config.logger&.warn("Failed to generate embedding: #{e.message}")
    nil
  end
end
92
+ end
93
+ end