fact_db 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.envrc +2 -0
- data/.yardopts +5 -0
- data/CHANGELOG.md +64 -0
- data/README.md +107 -6
- data/Rakefile +243 -10
- data/db/migrate/001_enable_extensions.rb +1 -0
- data/db/migrate/002_create_sources.rb +49 -0
- data/db/migrate/003_create_entities.rb +27 -15
- data/db/migrate/004_create_entity_aliases.rb +20 -7
- data/db/migrate/005_create_facts.rb +37 -21
- data/db/migrate/006_create_entity_mentions.rb +14 -6
- data/db/migrate/007_create_fact_sources.rb +16 -8
- data/docs/api/extractors/index.md +5 -5
- data/docs/api/extractors/llm.md +17 -17
- data/docs/api/extractors/rule-based.md +14 -14
- data/docs/api/facts.md +20 -20
- data/docs/api/index.md +4 -4
- data/docs/api/models/entity.md +21 -21
- data/docs/api/models/fact.md +15 -15
- data/docs/api/models/index.md +7 -7
- data/docs/api/models/{content.md → source.md} +29 -29
- data/docs/api/pipeline/extraction.md +25 -25
- data/docs/api/pipeline/index.md +1 -1
- data/docs/api/pipeline/resolution.md +4 -4
- data/docs/api/services/entity-service.md +20 -20
- data/docs/api/services/fact-service.md +12 -12
- data/docs/api/services/index.md +5 -5
- data/docs/api/services/{content-service.md → source-service.md} +27 -27
- data/docs/architecture/database-schema.md +46 -46
- data/docs/architecture/entity-resolution.md +6 -6
- data/docs/architecture/index.md +10 -10
- data/docs/architecture/temporal-facts.md +5 -5
- data/docs/architecture/three-layer-model.md +17 -17
- data/docs/concepts.md +6 -6
- data/docs/examples/basic-usage.md +20 -20
- data/docs/examples/hr-onboarding.md +17 -17
- data/docs/examples/index.md +4 -4
- data/docs/examples/news-analysis.md +23 -23
- data/docs/getting-started/database-setup.md +28 -20
- data/docs/getting-started/index.md +3 -3
- data/docs/getting-started/quick-start.md +33 -30
- data/docs/guides/batch-processing.md +26 -26
- data/docs/guides/configuration.md +158 -77
- data/docs/guides/entity-management.md +14 -14
- data/docs/guides/extracting-facts.md +28 -28
- data/docs/guides/ingesting-content.md +14 -14
- data/docs/guides/llm-integration.md +40 -32
- data/docs/guides/temporal-queries.md +11 -11
- data/docs/index.md +6 -2
- data/examples/.envrc +4 -0
- data/examples/.gitignore +1 -0
- data/examples/001_configuration.rb +312 -0
- data/examples/{basic_usage.rb → 010_basic_usage.rb} +47 -56
- data/examples/{entity_management.rb → 020_entity_management.rb} +57 -72
- data/examples/{temporal_queries.rb → 030_temporal_queries.rb} +39 -59
- data/examples/040_output_formats.rb +177 -0
- data/examples/{rule_based_extraction.rb → 050_rule_based_extraction.rb} +39 -45
- data/examples/060_fluent_temporal_api.rb +217 -0
- data/examples/070_introspection.rb +252 -0
- data/examples/{hr_system.rb → 080_hr_system.rb} +56 -75
- data/examples/090_ingest_demo.rb +515 -0
- data/examples/100_query_context.rb +668 -0
- data/examples/110_prove_it.rb +204 -0
- data/examples/120_dump_database.rb +358 -0
- data/examples/130_rag_feedback_loop.rb +858 -0
- data/examples/README.md +229 -15
- data/examples/data/lincoln_associates.md +201 -0
- data/examples/data/lincoln_biography.md +66 -0
- data/examples/data/lincoln_cabinet.md +243 -0
- data/examples/data/lincoln_family.md +163 -0
- data/examples/data/lincoln_military.md +241 -0
- data/examples/data/lincoln_todd_family.md +136 -0
- data/examples/ingest_reporter.rb +335 -0
- data/examples/utilities.rb +182 -0
- data/lib/fact_db/config/defaults.yml +254 -0
- data/lib/fact_db/config.rb +94 -35
- data/lib/fact_db/database.rb +98 -8
- data/lib/fact_db/extractors/base.rb +106 -21
- data/lib/fact_db/extractors/llm_extractor.rb +35 -63
- data/lib/fact_db/extractors/manual_extractor.rb +46 -6
- data/lib/fact_db/extractors/rule_based_extractor.rb +136 -25
- data/lib/fact_db/llm/adapter.rb +3 -3
- data/lib/fact_db/models/entity.rb +94 -22
- data/lib/fact_db/models/entity_alias.rb +41 -7
- data/lib/fact_db/models/entity_mention.rb +34 -1
- data/lib/fact_db/models/fact.rb +259 -28
- data/lib/fact_db/models/fact_source.rb +43 -9
- data/lib/fact_db/models/source.rb +113 -0
- data/lib/fact_db/pipeline/extraction_pipeline.rb +35 -35
- data/lib/fact_db/pipeline/resolution_pipeline.rb +5 -5
- data/lib/fact_db/query_result.rb +202 -0
- data/lib/fact_db/resolution/entity_resolver.rb +139 -39
- data/lib/fact_db/resolution/fact_resolver.rb +86 -14
- data/lib/fact_db/services/entity_service.rb +246 -37
- data/lib/fact_db/services/fact_service.rb +254 -17
- data/lib/fact_db/services/source_service.rb +164 -0
- data/lib/fact_db/temporal/query.rb +71 -7
- data/lib/fact_db/temporal/query_builder.rb +69 -0
- data/lib/fact_db/temporal/timeline.rb +102 -11
- data/lib/fact_db/transformers/base.rb +77 -0
- data/lib/fact_db/transformers/cypher_transformer.rb +185 -0
- data/lib/fact_db/transformers/json_transformer.rb +17 -0
- data/lib/fact_db/transformers/raw_transformer.rb +35 -0
- data/lib/fact_db/transformers/text_transformer.rb +114 -0
- data/lib/fact_db/transformers/triple_transformer.rb +138 -0
- data/lib/fact_db/validation/alias_filter.rb +185 -0
- data/lib/fact_db/version.rb +1 -1
- data/lib/fact_db.rb +281 -30
- data/mkdocs.yml +2 -2
- metadata +60 -16
- data/db/migrate/002_create_contents.rb +0 -44
- data/lib/fact_db/models/content.rb +0 -62
- data/lib/fact_db/services/content_service.rb +0 -93
data/lib/fact_db/models/fact.rb
CHANGED
|
@@ -2,6 +2,22 @@
|
|
|
2
2
|
|
|
3
3
|
module FactDb
|
|
4
4
|
module Models
|
|
5
|
+
# Represents a temporal fact in the database
|
|
6
|
+
#
|
|
7
|
+
# Facts are the core data structure in FactDb, representing statements with
|
|
8
|
+
# temporal validity (valid_at/invalid_at), entity mentions, and source provenance.
|
|
9
|
+
# Facts can be canonical, superseded, or synthesized from other facts.
|
|
10
|
+
#
|
|
11
|
+
# @example Create a fact
|
|
12
|
+
# fact = Fact.create!(
|
|
13
|
+
# text: "John works at Acme Corp",
|
|
14
|
+
# valid_at: Date.parse("2024-01-15"),
|
|
15
|
+
# status: "canonical"
|
|
16
|
+
# )
|
|
17
|
+
#
|
|
18
|
+
# @example Query currently valid facts
|
|
19
|
+
# Fact.canonical.currently_valid
|
|
20
|
+
#
|
|
5
21
|
class Fact < ActiveRecord::Base
|
|
6
22
|
self.table_name = "fact_db_facts"
|
|
7
23
|
|
|
@@ -11,113 +27,209 @@ module FactDb
|
|
|
11
27
|
|
|
12
28
|
has_many :fact_sources, class_name: "FactDb::Models::FactSource",
|
|
13
29
|
foreign_key: :fact_id, dependent: :destroy
|
|
14
|
-
has_many :
|
|
30
|
+
has_many :sources, through: :fact_sources, source: :source
|
|
15
31
|
|
|
16
32
|
belongs_to :superseded_by, class_name: "FactDb::Models::Fact",
|
|
17
33
|
foreign_key: :superseded_by_id, optional: true
|
|
18
34
|
has_many :supersedes, class_name: "FactDb::Models::Fact",
|
|
19
35
|
foreign_key: :superseded_by_id
|
|
20
36
|
|
|
21
|
-
validates :
|
|
22
|
-
validates :
|
|
37
|
+
validates :text, presence: true
|
|
38
|
+
validates :digest, presence: true, uniqueness: { scope: :valid_at }
|
|
23
39
|
validates :valid_at, presence: true
|
|
24
40
|
validates :status, presence: true
|
|
25
41
|
|
|
26
|
-
before_validation :
|
|
42
|
+
before_validation :generate_digest, on: :create
|
|
27
43
|
|
|
28
|
-
#
|
|
44
|
+
# @return [Array<String>] valid fact statuses
|
|
29
45
|
STATUSES = %w[canonical superseded corroborated synthesized].freeze
|
|
46
|
+
|
|
47
|
+
# @return [Array<String>] valid extraction methods
|
|
30
48
|
EXTRACTION_METHODS = %w[manual llm rule_based].freeze
|
|
31
49
|
|
|
32
50
|
validates :status, inclusion: { in: STATUSES }
|
|
33
51
|
validates :extraction_method, inclusion: { in: EXTRACTION_METHODS }, allow_nil: true
|
|
34
52
|
|
|
35
|
-
#
|
|
53
|
+
# @!group Scopes
|
|
54
|
+
|
|
55
|
+
# @!method canonical
|
|
56
|
+
# Returns facts with canonical status
|
|
57
|
+
# @return [ActiveRecord::Relation]
|
|
36
58
|
scope :canonical, -> { where(status: "canonical") }
|
|
59
|
+
|
|
60
|
+
# @!method superseded
|
|
61
|
+
# Returns facts that have been superseded
|
|
62
|
+
# @return [ActiveRecord::Relation]
|
|
37
63
|
scope :superseded, -> { where(status: "superseded") }
|
|
64
|
+
|
|
65
|
+
# @!method synthesized
|
|
66
|
+
# Returns facts that were synthesized from other facts
|
|
67
|
+
# @return [ActiveRecord::Relation]
|
|
38
68
|
scope :synthesized, -> { where(status: "synthesized") }
|
|
39
69
|
|
|
40
|
-
#
|
|
70
|
+
# @!method currently_valid
|
|
71
|
+
# Returns facts that are currently valid (no invalid_at date)
|
|
72
|
+
# @return [ActiveRecord::Relation]
|
|
41
73
|
scope :currently_valid, -> { where(invalid_at: nil) }
|
|
74
|
+
|
|
75
|
+
# @!method historical
|
|
76
|
+
# Returns facts that have been invalidated
|
|
77
|
+
# @return [ActiveRecord::Relation]
|
|
42
78
|
scope :historical, -> { where.not(invalid_at: nil) }
|
|
43
79
|
|
|
80
|
+
# @!method valid_at(date)
|
|
81
|
+
# Returns facts valid at a specific point in time
|
|
82
|
+
# @param date [Date, Time] the point in time
|
|
83
|
+
# @return [ActiveRecord::Relation]
|
|
44
84
|
scope :valid_at, lambda { |date|
|
|
45
85
|
where("valid_at <= ?", date)
|
|
46
86
|
.where("invalid_at > ? OR invalid_at IS NULL", date)
|
|
47
87
|
}
|
|
48
88
|
|
|
89
|
+
# @!method valid_between(from, to)
|
|
90
|
+
# Returns facts valid during a date range
|
|
91
|
+
# @param from [Date, Time] start of range
|
|
92
|
+
# @param to [Date, Time] end of range
|
|
93
|
+
# @return [ActiveRecord::Relation]
|
|
49
94
|
scope :valid_between, lambda { |from, to|
|
|
50
95
|
where("valid_at <= ? AND (invalid_at > ? OR invalid_at IS NULL)", to, from)
|
|
51
96
|
}
|
|
52
97
|
|
|
98
|
+
# @!method became_valid_between(from, to)
|
|
99
|
+
# Returns facts that became valid within a date range
|
|
100
|
+
# @param from [Date, Time] start of range
|
|
101
|
+
# @param to [Date, Time] end of range
|
|
102
|
+
# @return [ActiveRecord::Relation]
|
|
53
103
|
scope :became_valid_between, lambda { |from, to|
|
|
54
104
|
where(valid_at: from..to)
|
|
55
105
|
}
|
|
56
106
|
|
|
107
|
+
# @!method became_invalid_between(from, to)
|
|
108
|
+
# Returns facts that became invalid within a date range
|
|
109
|
+
# @param from [Date, Time] start of range
|
|
110
|
+
# @param to [Date, Time] end of range
|
|
111
|
+
# @return [ActiveRecord::Relation]
|
|
57
112
|
scope :became_invalid_between, lambda { |from, to|
|
|
58
113
|
where(invalid_at: from..to)
|
|
59
114
|
}
|
|
60
115
|
|
|
61
|
-
#
|
|
116
|
+
# @!method mentioning_entity(entity_id)
|
|
117
|
+
# Returns facts that mention a specific entity
|
|
118
|
+
# @param entity_id [Integer] the entity ID
|
|
119
|
+
# @return [ActiveRecord::Relation]
|
|
62
120
|
scope :mentioning_entity, lambda { |entity_id|
|
|
63
|
-
joins(:entity_mentions).where(fact_db_entity_mentions: { entity_id: entity_id })
|
|
121
|
+
joins(:entity_mentions).where(fact_db_entity_mentions: { entity_id: entity_id }).distinct
|
|
64
122
|
}
|
|
65
123
|
|
|
124
|
+
# @!method with_role(entity_id, role)
|
|
125
|
+
# Returns facts where an entity has a specific role
|
|
126
|
+
# @param entity_id [Integer] the entity ID
|
|
127
|
+
# @param role [String, Symbol] the mention role (subject, object, etc.)
|
|
128
|
+
# @return [ActiveRecord::Relation]
|
|
66
129
|
scope :with_role, lambda { |entity_id, role|
|
|
67
130
|
joins(:entity_mentions).where(
|
|
68
131
|
fact_db_entity_mentions: { entity_id: entity_id, mention_role: role }
|
|
69
|
-
)
|
|
132
|
+
).distinct
|
|
70
133
|
}
|
|
71
134
|
|
|
72
|
-
#
|
|
135
|
+
# @!method search_text(query)
|
|
136
|
+
# Full-text search on fact text using PostgreSQL tsvector
|
|
137
|
+
# @param query [String] the search query
|
|
138
|
+
# @return [ActiveRecord::Relation]
|
|
73
139
|
scope :search_text, lambda { |query|
|
|
74
|
-
where("to_tsvector('english',
|
|
140
|
+
where("to_tsvector('english', text) @@ plainto_tsquery('english', ?)", query)
|
|
75
141
|
}
|
|
76
142
|
|
|
77
|
-
#
|
|
143
|
+
# @!method extracted_by(method)
|
|
144
|
+
# Returns facts extracted by a specific method
|
|
145
|
+
# @param method [String, Symbol] extraction method (manual, llm, rule_based)
|
|
146
|
+
# @return [ActiveRecord::Relation]
|
|
78
147
|
scope :extracted_by, ->(method) { where(extraction_method: method) }
|
|
148
|
+
|
|
149
|
+
# @!method by_extraction_method(method)
|
|
150
|
+
# Alias for extracted_by
|
|
151
|
+
# @param method [String, Symbol] extraction method
|
|
152
|
+
# @return [ActiveRecord::Relation]
|
|
79
153
|
scope :by_extraction_method, ->(method) { where(extraction_method: method) }
|
|
80
154
|
|
|
81
|
-
#
|
|
155
|
+
# @!method high_confidence
|
|
156
|
+
# Returns facts with confidence >= 0.9
|
|
157
|
+
# @return [ActiveRecord::Relation]
|
|
82
158
|
scope :high_confidence, -> { where("confidence >= ?", 0.9) }
|
|
159
|
+
|
|
160
|
+
# @!method low_confidence
|
|
161
|
+
# Returns facts with confidence < 0.5
|
|
162
|
+
# @return [ActiveRecord::Relation]
|
|
83
163
|
scope :low_confidence, -> { where("confidence < ?", 0.5) }
|
|
84
164
|
|
|
165
|
+
# @!endgroup
|
|
166
|
+
|
|
167
|
+
# Checks if the fact is currently valid
|
|
168
|
+
#
|
|
169
|
+
# @return [Boolean] true if the fact has no invalid_at date
|
|
85
170
|
def currently_valid?
|
|
86
171
|
invalid_at.nil?
|
|
87
172
|
end
|
|
88
173
|
|
|
174
|
+
# Checks if the fact was valid at a specific date
|
|
175
|
+
#
|
|
176
|
+
# @param date [Date, Time] the point in time to check
|
|
177
|
+
# @return [Boolean] true if the fact was valid at the given date
|
|
89
178
|
def valid_at?(date)
|
|
90
179
|
valid_at <= date && (invalid_at.nil? || invalid_at > date)
|
|
91
180
|
end
|
|
92
181
|
|
|
182
|
+
# Returns the duration the fact was valid
|
|
183
|
+
#
|
|
184
|
+
# @return [Numeric, nil] invalid_at minus valid_at (seconds when both are Time values), or nil if still valid
|
|
93
185
|
def duration
|
|
94
186
|
return nil if invalid_at.nil?
|
|
95
187
|
|
|
96
188
|
invalid_at - valid_at
|
|
97
189
|
end
|
|
98
190
|
|
|
191
|
+
# Returns the duration in days the fact was valid
|
|
192
|
+
#
|
|
193
|
+
# @return [Integer, nil] number of days or nil if still valid
|
|
99
194
|
def duration_days
|
|
100
195
|
return nil if invalid_at.nil?
|
|
101
196
|
|
|
102
197
|
(invalid_at.to_date - valid_at.to_date).to_i
|
|
103
198
|
end
|
|
104
199
|
|
|
200
|
+
# Checks if this fact has been superseded
|
|
201
|
+
#
|
|
202
|
+
# @return [Boolean] true if status is "superseded"
|
|
105
203
|
def superseded?
|
|
106
204
|
status == "superseded"
|
|
107
205
|
end
|
|
108
206
|
|
|
207
|
+
# Checks if this fact was synthesized from other facts
|
|
208
|
+
#
|
|
209
|
+
# @return [Boolean] true if status is "synthesized"
|
|
109
210
|
def synthesized?
|
|
110
211
|
status == "synthesized"
|
|
111
212
|
end
|
|
112
213
|
|
|
214
|
+
# Invalidates this fact at a specific time
|
|
215
|
+
#
|
|
216
|
+
# @param at [Time] when the fact became invalid (defaults to now)
|
|
217
|
+
# @return [Boolean] true on success; update! raises instead of returning false on failure
|
|
113
218
|
def invalidate!(at: Time.current)
|
|
114
219
|
update!(invalid_at: at)
|
|
115
220
|
end
|
|
116
221
|
|
|
117
|
-
|
|
222
|
+
# Supersedes this fact with new information
|
|
223
|
+
#
|
|
224
|
+
# Creates a new canonical fact and marks this one as superseded.
|
|
225
|
+
#
|
|
226
|
+
# @param new_text [String] the updated fact text
|
|
227
|
+
# @param valid_at [Date, Time] when the new fact became valid
|
|
228
|
+
# @return [FactDb::Models::Fact] the new fact
|
|
229
|
+
def supersede_with!(new_text, valid_at:)
|
|
118
230
|
transaction do
|
|
119
231
|
new_fact = self.class.create!(
|
|
120
|
-
|
|
232
|
+
text: new_text,
|
|
121
233
|
valid_at: valid_at,
|
|
122
234
|
status: "canonical",
|
|
123
235
|
extraction_method: extraction_method
|
|
@@ -133,6 +245,13 @@ module FactDb
|
|
|
133
245
|
end
|
|
134
246
|
end
|
|
135
247
|
|
|
248
|
+
# Adds an entity mention to this fact
|
|
249
|
+
#
|
|
250
|
+
# @param entity [FactDb::Models::Entity] the entity being mentioned
|
|
251
|
+
# @param text [String] the mention text as it appears in the fact
|
|
252
|
+
# @param role [String, Symbol, nil] the role (subject, object, etc.)
|
|
253
|
+
# @param confidence [Float] confidence score (0.0 to 1.0)
|
|
254
|
+
# @return [FactDb::Models::EntityMention] the created or found mention
|
|
136
255
|
def add_mention(entity:, text:, role: nil, confidence: 1.0)
|
|
137
256
|
entity_mentions.find_or_create_by!(entity: entity, mention_text: text) do |m|
|
|
138
257
|
m.mention_role = role
|
|
@@ -140,42 +259,154 @@ module FactDb
|
|
|
140
259
|
end
|
|
141
260
|
end
|
|
142
261
|
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
262
|
+
# Adds a source document to this fact
|
|
263
|
+
#
|
|
264
|
+
# @param source [FactDb::Models::Source] the source document
|
|
265
|
+
# @param kind [String] source kind (primary, corroborating, etc.)
|
|
266
|
+
# @param excerpt [String, nil] relevant excerpt from the source
|
|
267
|
+
# @param confidence [Float] confidence score (0.0 to 1.0)
|
|
268
|
+
# @return [FactDb::Models::FactSource] the created or found fact-source link
|
|
269
|
+
def add_source(source:, kind: "primary", excerpt: nil, confidence: 1.0)
|
|
270
|
+
fact_sources.find_or_create_by!(source: source) do |s|
|
|
271
|
+
s.kind = kind
|
|
146
272
|
s.excerpt = excerpt
|
|
147
273
|
s.confidence = confidence
|
|
148
274
|
end
|
|
149
275
|
end
|
|
150
276
|
|
|
151
|
-
#
|
|
277
|
+
# Returns the source facts for synthesized facts
|
|
278
|
+
#
|
|
279
|
+
# @return [ActiveRecord::Relation] facts this one was derived from
|
|
152
280
|
def source_facts
|
|
153
281
|
return Fact.none unless derived_from_ids.any?
|
|
154
282
|
|
|
155
283
|
Fact.where(id: derived_from_ids)
|
|
156
284
|
end
|
|
157
285
|
|
|
158
|
-
#
|
|
286
|
+
# Returns facts that corroborate this one
|
|
287
|
+
#
|
|
288
|
+
# @return [ActiveRecord::Relation] corroborating facts
|
|
159
289
|
def corroborating_facts
|
|
160
290
|
return Fact.none unless corroborated_by_ids.any?
|
|
161
291
|
|
|
162
292
|
Fact.where(id: corroborated_by_ids)
|
|
163
293
|
end
|
|
164
294
|
|
|
165
|
-
#
|
|
295
|
+
# Returns the complete evidence chain back to original sources
|
|
296
|
+
#
|
|
297
|
+
# Recursively traces through synthesized facts to find all original sources.
|
|
298
|
+
#
|
|
299
|
+
# @return [Array<FactDb::Models::Source>] unique source documents
|
|
166
300
|
def evidence_chain
|
|
167
|
-
|
|
301
|
+
evidence = sources.to_a
|
|
168
302
|
|
|
169
303
|
if synthesized? && derived_from_ids.any?
|
|
170
304
|
source_facts.each do |source_fact|
|
|
171
|
-
|
|
305
|
+
evidence.concat(source_fact.evidence_chain)
|
|
172
306
|
end
|
|
173
307
|
end
|
|
174
308
|
|
|
175
|
-
|
|
309
|
+
evidence.uniq
|
|
310
|
+
end
|
|
311
|
+
|
|
312
|
+
# Returns the original source lines from which this fact was derived
|
|
313
|
+
#
|
|
314
|
+
# Uses line metadata to extract the relevant section from the source document
|
|
315
|
+
# and highlights lines containing key terms from the fact.
|
|
316
|
+
#
|
|
317
|
+
# @return [Hash, nil] hash with :full_section, :focused_lines, :focused_line_numbers, :key_terms
|
|
318
|
+
# or nil if source content or line metadata is unavailable, or the line range falls outside the source
|
|
319
|
+
#
|
|
320
|
+
# @example
|
|
321
|
+
# fact.prove_it
|
|
322
|
+
# # => {
|
|
323
|
+
# # full_section: "...",
|
|
324
|
+
# # focused_lines: "John joined Acme Corp...",
|
|
325
|
+
# # focused_line_numbers: [15, 16],
|
|
326
|
+
# # key_terms: ["John", "Acme Corp"]
|
|
327
|
+
# # }
|
|
328
|
+
def prove_it
|
|
329
|
+
source = fact_sources.first&.source
|
|
330
|
+
return nil unless source&.content
|
|
331
|
+
|
|
332
|
+
line_start = metadata&.dig("line_start")
|
|
333
|
+
line_end = metadata&.dig("line_end")
|
|
334
|
+
return nil unless line_start && line_end
|
|
335
|
+
|
|
336
|
+
lines = source.content.lines
|
|
337
|
+
start_idx = line_start.to_i - 1
|
|
338
|
+
end_idx = line_end.to_i - 1
|
|
339
|
+
|
|
340
|
+
return nil if start_idx < 0 || end_idx >= lines.length
|
|
341
|
+
|
|
342
|
+
section_lines = lines[start_idx..end_idx]
|
|
343
|
+
full_section = section_lines.join
|
|
344
|
+
|
|
345
|
+
# Find focused lines by matching key terms from fact
|
|
346
|
+
key_terms = extract_key_terms
|
|
347
|
+
scored_lines = score_lines_by_relevance(section_lines, key_terms, start_idx)
|
|
348
|
+
|
|
349
|
+
# Return lines that have at least one match, sorted by line number
|
|
350
|
+
relevant = scored_lines.select { |l| l[:score] > 0 }
|
|
351
|
+
.sort_by { |l| l[:line_number] }
|
|
352
|
+
|
|
353
|
+
{
|
|
354
|
+
full_section: full_section,
|
|
355
|
+
focused_lines: relevant.map { |l| l[:text] }.join,
|
|
356
|
+
focused_line_numbers: relevant.map { |l| l[:line_number] },
|
|
357
|
+
key_terms: key_terms
|
|
358
|
+
}
|
|
359
|
+
end
|
|
360
|
+
|
|
361
|
+
private
|
|
362
|
+
|
|
363
|
+
def extract_key_terms
|
|
364
|
+
terms = []
|
|
365
|
+
|
|
366
|
+
# Get entity names from mentions
|
|
367
|
+
entity_mentions.includes(:entity).each do |mention|
|
|
368
|
+
terms << mention.entity&.name if mention.entity&.name
|
|
369
|
+
terms << mention.mention_text if mention.mention_text
|
|
370
|
+
end
|
|
371
|
+
|
|
372
|
+
# Extract significant words from fact text (exclude common words)
|
|
373
|
+
stop_words = %w[a an the is was were are been being have has had do does did
|
|
374
|
+
will would could should may might must shall can to of in for
|
|
375
|
+
on with at by from as into through during before after above
|
|
376
|
+
below between under again further then once here there when
|
|
377
|
+
where why how all each few more most other some such no nor
|
|
378
|
+
not only own same so than too very just and but or if]
|
|
379
|
+
|
|
380
|
+
fact_words = text.downcase
|
|
381
|
+
.gsub(/[^a-z\s]/, " ")
|
|
382
|
+
.split
|
|
383
|
+
.reject { |w| w.length < 3 || stop_words.include?(w) }
|
|
384
|
+
.uniq
|
|
385
|
+
|
|
386
|
+
terms.concat(fact_words)
|
|
387
|
+
terms.compact.uniq.reject(&:empty?)
|
|
176
388
|
end
|
|
177
389
|
|
|
178
|
-
|
|
390
|
+
def score_lines_by_relevance(lines, key_terms, start_idx)
|
|
391
|
+
lines.each_with_index.map do |line, idx|
|
|
392
|
+
line_lower = line.downcase
|
|
393
|
+
score = key_terms.count { |term| line_lower.include?(term.downcase) }
|
|
394
|
+
|
|
395
|
+
{
|
|
396
|
+
line_number: start_idx + idx + 1,
|
|
397
|
+
text: line,
|
|
398
|
+
score: score
|
|
399
|
+
}
|
|
400
|
+
end
|
|
401
|
+
end
|
|
402
|
+
|
|
403
|
+
public
|
|
404
|
+
|
|
405
|
+
# Finds facts by vector similarity using pgvector
|
|
406
|
+
#
|
|
407
|
+
# @param embedding [Array<Float>] the embedding vector to search with
|
|
408
|
+
# @param limit [Integer] maximum number of results
|
|
409
|
+
# @return [ActiveRecord::Relation] facts ordered by similarity
|
|
179
410
|
def self.nearest_neighbors(embedding, limit: 10)
|
|
180
411
|
return none unless embedding
|
|
181
412
|
|
|
@@ -184,8 +415,8 @@ module FactDb
|
|
|
184
415
|
|
|
185
416
|
private
|
|
186
417
|
|
|
187
|
-
def
|
|
188
|
-
self.
|
|
418
|
+
def generate_digest
|
|
419
|
+
self.digest = Digest::SHA256.hexdigest(text) if text.present?
|
|
189
420
|
end
|
|
190
421
|
end
|
|
191
422
|
end
|
|
@@ -2,28 +2,62 @@
|
|
|
2
2
|
|
|
3
3
|
module FactDb
|
|
4
4
|
module Models
|
|
5
|
+
# Join model linking facts to source documents
|
|
6
|
+
#
|
|
7
|
+
# Represents the provenance relationship between a fact and the source
|
|
8
|
+
# document(s) it was extracted from, including the relationship type
|
|
9
|
+
# and an optional excerpt.
|
|
10
|
+
#
|
|
11
|
+
# @example Link a fact to a source
|
|
12
|
+
# fact_source = FactSource.create!(
|
|
13
|
+
# fact: fact, source: document,
|
|
14
|
+
# kind: "primary", excerpt: "relevant quote..."
|
|
15
|
+
# )
|
|
16
|
+
#
|
|
5
17
|
class FactSource < ActiveRecord::Base
|
|
6
18
|
self.table_name = "fact_db_fact_sources"
|
|
7
19
|
|
|
8
20
|
belongs_to :fact, class_name: "FactDb::Models::Fact"
|
|
9
|
-
belongs_to :
|
|
21
|
+
belongs_to :source, class_name: "FactDb::Models::Source"
|
|
10
22
|
|
|
11
|
-
validates :fact_id, uniqueness: { scope: :
|
|
23
|
+
validates :fact_id, uniqueness: { scope: :source_id }
|
|
12
24
|
|
|
13
|
-
#
|
|
14
|
-
|
|
25
|
+
# @return [Array<String>] valid source relationship kinds
|
|
26
|
+
KINDS = %w[primary supporting corroborating].freeze
|
|
15
27
|
|
|
16
|
-
validates :
|
|
28
|
+
validates :kind, inclusion: { in: KINDS }
|
|
17
29
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
30
|
+
# @!method primary
|
|
31
|
+
# Returns primary source links
|
|
32
|
+
# @return [ActiveRecord::Relation]
|
|
33
|
+
scope :primary, -> { where(kind: "primary") }
|
|
34
|
+
|
|
35
|
+
# @!method supporting
|
|
36
|
+
# Returns supporting source links
|
|
37
|
+
# @return [ActiveRecord::Relation]
|
|
38
|
+
scope :supporting, -> { where(kind: "supporting") }
|
|
39
|
+
|
|
40
|
+
# @!method corroborating
|
|
41
|
+
# Returns corroborating source links
|
|
42
|
+
# @return [ActiveRecord::Relation]
|
|
43
|
+
scope :corroborating, -> { where(kind: "corroborating") }
|
|
44
|
+
|
|
45
|
+
# @!method high_confidence
|
|
46
|
+
# Returns source links with confidence >= 0.9
|
|
47
|
+
# @return [ActiveRecord::Relation]
|
|
21
48
|
scope :high_confidence, -> { where("confidence >= ?", 0.9) }
|
|
22
49
|
|
|
50
|
+
# Checks if this is the primary source for the fact
|
|
51
|
+
#
|
|
52
|
+
# @return [Boolean] true if kind is "primary"
|
|
23
53
|
def primary?
|
|
24
|
-
|
|
54
|
+
kind == "primary"
|
|
25
55
|
end
|
|
26
56
|
|
|
57
|
+
# Returns a preview of the excerpt, truncated if needed
|
|
58
|
+
#
|
|
59
|
+
# @param length [Integer] maximum length (default: 100)
|
|
60
|
+
# @return [String, nil] excerpt preview with "..." if truncated, or nil if no excerpt
|
|
27
61
|
def excerpt_preview(length: 100)
|
|
28
62
|
return nil if excerpt.nil?
|
|
29
63
|
return excerpt if excerpt.length <= length
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FactDb
|
|
4
|
+
module Models
|
|
5
|
+
# Represents a source document from which facts are extracted
|
|
6
|
+
#
|
|
7
|
+
# Sources are immutable content documents (emails, transcripts, documents, etc.)
|
|
8
|
+
# that serve as the provenance for extracted facts. Content is deduplicated
|
|
9
|
+
# by SHA256 hash.
|
|
10
|
+
#
|
|
11
|
+
# @example Create a source
|
|
12
|
+
# source = Source.create!(content: "Meeting notes...", kind: "meeting_notes", captured_at: Time.now)
|
|
13
|
+
#
|
|
14
|
+
# @example Search sources
|
|
15
|
+
# Source.search_text("quarterly report").by_kind("document")
|
|
16
|
+
#
|
|
17
|
+
class Source < ActiveRecord::Base
|
|
18
|
+
self.table_name = "fact_db_sources"
|
|
19
|
+
|
|
20
|
+
has_many :fact_sources, class_name: "FactDb::Models::FactSource",
|
|
21
|
+
foreign_key: :source_id, dependent: :destroy
|
|
22
|
+
has_many :facts, through: :fact_sources
|
|
23
|
+
|
|
24
|
+
validates :content_hash, presence: true, uniqueness: true
|
|
25
|
+
validates :kind, presence: true
|
|
26
|
+
validates :content, presence: true
|
|
27
|
+
validates :captured_at, presence: true
|
|
28
|
+
|
|
29
|
+
before_validation :generate_content_hash, on: :create
|
|
30
|
+
|
|
31
|
+
# @return [Array<String>] valid source content kinds
|
|
32
|
+
KINDS = %w[email transcript document slack meeting_notes contract report].freeze
|
|
33
|
+
|
|
34
|
+
validates :kind, inclusion: { in: KINDS }, allow_nil: false
|
|
35
|
+
|
|
36
|
+
# @!method by_kind(k)
|
|
37
|
+
# Returns sources of a specific kind
|
|
38
|
+
# @param k [String] the source kind
|
|
39
|
+
# @return [ActiveRecord::Relation]
|
|
40
|
+
scope :by_kind, ->(k) { where(kind: k) }
|
|
41
|
+
|
|
42
|
+
# @!method captured_between(from, to)
|
|
43
|
+
# Returns sources captured within a date range
|
|
44
|
+
# @param from [Date, Time] start of range
|
|
45
|
+
# @param to [Date, Time] end of range
|
|
46
|
+
# @return [ActiveRecord::Relation]
|
|
47
|
+
scope :captured_between, ->(from, to) { where(captured_at: from..to) }
|
|
48
|
+
|
|
49
|
+
# @!method captured_after(date)
|
|
50
|
+
# Returns sources captured on or after a date
|
|
51
|
+
# @param date [Date, Time] the cutoff date
|
|
52
|
+
# @return [ActiveRecord::Relation]
|
|
53
|
+
scope :captured_after, ->(date) { where("captured_at >= ?", date) }
|
|
54
|
+
|
|
55
|
+
# @!method captured_before(date)
|
|
56
|
+
# Returns sources captured on or before a date
|
|
57
|
+
# @param date [Date, Time] the cutoff date
|
|
58
|
+
# @return [ActiveRecord::Relation]
|
|
59
|
+
scope :captured_before, ->(date) { where("captured_at <= ?", date) }
|
|
60
|
+
|
|
61
|
+
# @!method search_text(query)
|
|
62
|
+
# Full-text search on source content using PostgreSQL tsvector
|
|
63
|
+
# @param query [String] the search query
|
|
64
|
+
# @return [ActiveRecord::Relation]
|
|
65
|
+
scope :search_text, lambda { |query|
|
|
66
|
+
where("to_tsvector('english', content) @@ plainto_tsquery('english', ?)", query)
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
# Finds sources by vector similarity using pgvector
|
|
70
|
+
#
|
|
71
|
+
# @param embedding [Array<Float>] the embedding vector to search with
|
|
72
|
+
# @param limit [Integer] maximum number of results
|
|
73
|
+
# @return [ActiveRecord::Relation] sources ordered by similarity
|
|
74
|
+
def self.nearest_neighbors(embedding, limit: 10)
|
|
75
|
+
return none unless embedding
|
|
76
|
+
|
|
77
|
+
order(Arel.sql("embedding <=> '#{embedding}'")).limit(limit)
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
# Returns whether the source content is immutable (i.e. cannot be modified)
|
|
81
|
+
#
|
|
82
|
+
# Sources are always immutable to preserve provenance integrity.
|
|
83
|
+
#
|
|
84
|
+
# @return [Boolean] always returns true
|
|
85
|
+
def immutable?
|
|
86
|
+
true
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
# Returns the word count of the content
|
|
90
|
+
#
|
|
91
|
+
# @return [Integer] number of words in content
|
|
92
|
+
def word_count
|
|
93
|
+
content.split.size
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
# Returns a preview of the content, truncated if needed
|
|
97
|
+
#
|
|
98
|
+
# @param length [Integer] maximum length (default: 200)
|
|
99
|
+
# @return [String] content preview with "..." if truncated
|
|
100
|
+
def preview(length: 200)
|
|
101
|
+
return content if content.length <= length
|
|
102
|
+
|
|
103
|
+
"#{content[0, length]}..."
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
private
|
|
107
|
+
|
|
108
|
+
def generate_content_hash
|
|
109
|
+
self.content_hash = Digest::SHA256.hexdigest(content) if content.present?
|
|
110
|
+
end
|
|
111
|
+
end
|
|
112
|
+
end
|
|
113
|
+
end
|