fact_db 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.envrc +2 -0
- data/.yardopts +5 -0
- data/CHANGELOG.md +64 -0
- data/README.md +107 -6
- data/Rakefile +243 -10
- data/db/migrate/001_enable_extensions.rb +1 -0
- data/db/migrate/002_create_sources.rb +49 -0
- data/db/migrate/003_create_entities.rb +27 -15
- data/db/migrate/004_create_entity_aliases.rb +20 -7
- data/db/migrate/005_create_facts.rb +37 -21
- data/db/migrate/006_create_entity_mentions.rb +14 -6
- data/db/migrate/007_create_fact_sources.rb +16 -8
- data/docs/api/extractors/index.md +5 -5
- data/docs/api/extractors/llm.md +17 -17
- data/docs/api/extractors/rule-based.md +14 -14
- data/docs/api/facts.md +20 -20
- data/docs/api/index.md +4 -4
- data/docs/api/models/entity.md +21 -21
- data/docs/api/models/fact.md +15 -15
- data/docs/api/models/index.md +7 -7
- data/docs/api/models/{content.md → source.md} +29 -29
- data/docs/api/pipeline/extraction.md +25 -25
- data/docs/api/pipeline/index.md +1 -1
- data/docs/api/pipeline/resolution.md +4 -4
- data/docs/api/services/entity-service.md +20 -20
- data/docs/api/services/fact-service.md +12 -12
- data/docs/api/services/index.md +5 -5
- data/docs/api/services/{content-service.md → source-service.md} +27 -27
- data/docs/architecture/database-schema.md +46 -46
- data/docs/architecture/entity-resolution.md +6 -6
- data/docs/architecture/index.md +10 -10
- data/docs/architecture/temporal-facts.md +5 -5
- data/docs/architecture/three-layer-model.md +17 -17
- data/docs/concepts.md +6 -6
- data/docs/examples/basic-usage.md +20 -20
- data/docs/examples/hr-onboarding.md +17 -17
- data/docs/examples/index.md +4 -4
- data/docs/examples/news-analysis.md +23 -23
- data/docs/getting-started/database-setup.md +28 -20
- data/docs/getting-started/index.md +3 -3
- data/docs/getting-started/quick-start.md +33 -30
- data/docs/guides/batch-processing.md +26 -26
- data/docs/guides/configuration.md +158 -77
- data/docs/guides/entity-management.md +14 -14
- data/docs/guides/extracting-facts.md +28 -28
- data/docs/guides/ingesting-content.md +14 -14
- data/docs/guides/llm-integration.md +40 -32
- data/docs/guides/temporal-queries.md +11 -11
- data/docs/index.md +6 -2
- data/examples/.envrc +4 -0
- data/examples/.gitignore +1 -0
- data/examples/001_configuration.rb +312 -0
- data/examples/{basic_usage.rb → 010_basic_usage.rb} +47 -56
- data/examples/{entity_management.rb → 020_entity_management.rb} +57 -72
- data/examples/{temporal_queries.rb → 030_temporal_queries.rb} +39 -59
- data/examples/040_output_formats.rb +177 -0
- data/examples/{rule_based_extraction.rb → 050_rule_based_extraction.rb} +39 -45
- data/examples/060_fluent_temporal_api.rb +217 -0
- data/examples/070_introspection.rb +252 -0
- data/examples/{hr_system.rb → 080_hr_system.rb} +56 -75
- data/examples/090_ingest_demo.rb +515 -0
- data/examples/100_query_context.rb +668 -0
- data/examples/110_prove_it.rb +204 -0
- data/examples/120_dump_database.rb +358 -0
- data/examples/130_rag_feedback_loop.rb +858 -0
- data/examples/README.md +229 -15
- data/examples/data/lincoln_associates.md +201 -0
- data/examples/data/lincoln_biography.md +66 -0
- data/examples/data/lincoln_cabinet.md +243 -0
- data/examples/data/lincoln_family.md +163 -0
- data/examples/data/lincoln_military.md +241 -0
- data/examples/data/lincoln_todd_family.md +136 -0
- data/examples/ingest_reporter.rb +335 -0
- data/examples/utilities.rb +182 -0
- data/lib/fact_db/config/defaults.yml +254 -0
- data/lib/fact_db/config.rb +94 -35
- data/lib/fact_db/database.rb +98 -8
- data/lib/fact_db/extractors/base.rb +106 -21
- data/lib/fact_db/extractors/llm_extractor.rb +35 -63
- data/lib/fact_db/extractors/manual_extractor.rb +46 -6
- data/lib/fact_db/extractors/rule_based_extractor.rb +136 -25
- data/lib/fact_db/llm/adapter.rb +3 -3
- data/lib/fact_db/models/entity.rb +94 -22
- data/lib/fact_db/models/entity_alias.rb +41 -7
- data/lib/fact_db/models/entity_mention.rb +34 -1
- data/lib/fact_db/models/fact.rb +259 -28
- data/lib/fact_db/models/fact_source.rb +43 -9
- data/lib/fact_db/models/source.rb +113 -0
- data/lib/fact_db/pipeline/extraction_pipeline.rb +35 -35
- data/lib/fact_db/pipeline/resolution_pipeline.rb +5 -5
- data/lib/fact_db/query_result.rb +202 -0
- data/lib/fact_db/resolution/entity_resolver.rb +139 -39
- data/lib/fact_db/resolution/fact_resolver.rb +86 -14
- data/lib/fact_db/services/entity_service.rb +246 -37
- data/lib/fact_db/services/fact_service.rb +254 -17
- data/lib/fact_db/services/source_service.rb +164 -0
- data/lib/fact_db/temporal/query.rb +71 -7
- data/lib/fact_db/temporal/query_builder.rb +69 -0
- data/lib/fact_db/temporal/timeline.rb +102 -11
- data/lib/fact_db/transformers/base.rb +77 -0
- data/lib/fact_db/transformers/cypher_transformer.rb +185 -0
- data/lib/fact_db/transformers/json_transformer.rb +17 -0
- data/lib/fact_db/transformers/raw_transformer.rb +35 -0
- data/lib/fact_db/transformers/text_transformer.rb +114 -0
- data/lib/fact_db/transformers/triple_transformer.rb +138 -0
- data/lib/fact_db/validation/alias_filter.rb +185 -0
- data/lib/fact_db/version.rb +1 -1
- data/lib/fact_db.rb +281 -30
- data/mkdocs.yml +2 -2
- metadata +60 -16
- data/db/migrate/002_create_contents.rb +0 -44
- data/lib/fact_db/models/content.rb +0 -62
- data/lib/fact_db/services/content_service.rb +0 -93
data/lib/fact_db.rb
CHANGED
|
@@ -9,7 +9,7 @@ require_relative "fact_db/config"
|
|
|
9
9
|
require_relative "fact_db/database"
|
|
10
10
|
|
|
11
11
|
# Models
|
|
12
|
-
require_relative "fact_db/models/
|
|
12
|
+
require_relative "fact_db/models/source"
|
|
13
13
|
require_relative "fact_db/models/entity"
|
|
14
14
|
require_relative "fact_db/models/entity_alias"
|
|
15
15
|
require_relative "fact_db/models/fact"
|
|
@@ -19,11 +19,15 @@ require_relative "fact_db/models/fact_source"
|
|
|
19
19
|
# Temporal queries
|
|
20
20
|
require_relative "fact_db/temporal/query"
|
|
21
21
|
require_relative "fact_db/temporal/timeline"
|
|
22
|
+
require_relative "fact_db/temporal/query_builder"
|
|
22
23
|
|
|
23
24
|
# Resolution
|
|
24
25
|
require_relative "fact_db/resolution/entity_resolver"
|
|
25
26
|
require_relative "fact_db/resolution/fact_resolver"
|
|
26
27
|
|
|
28
|
+
# Validation
|
|
29
|
+
require_relative "fact_db/validation/alias_filter"
|
|
30
|
+
|
|
27
31
|
# Extractors
|
|
28
32
|
require_relative "fact_db/extractors/base"
|
|
29
33
|
require_relative "fact_db/extractors/manual_extractor"
|
|
@@ -38,31 +42,49 @@ require_relative "fact_db/pipeline/extraction_pipeline"
|
|
|
38
42
|
require_relative "fact_db/pipeline/resolution_pipeline"
|
|
39
43
|
|
|
40
44
|
# Services
|
|
41
|
-
require_relative "fact_db/services/
|
|
45
|
+
require_relative "fact_db/services/source_service"
|
|
42
46
|
require_relative "fact_db/services/entity_service"
|
|
43
47
|
require_relative "fact_db/services/fact_service"
|
|
44
48
|
|
|
49
|
+
# Transformers (output formatting)
|
|
50
|
+
require_relative "fact_db/transformers/base"
|
|
51
|
+
require_relative "fact_db/transformers/raw_transformer"
|
|
52
|
+
require_relative "fact_db/transformers/json_transformer"
|
|
53
|
+
require_relative "fact_db/transformers/triple_transformer"
|
|
54
|
+
require_relative "fact_db/transformers/cypher_transformer"
|
|
55
|
+
require_relative "fact_db/transformers/text_transformer"
|
|
56
|
+
|
|
57
|
+
# Query Result
|
|
58
|
+
require_relative "fact_db/query_result"
|
|
59
|
+
|
|
45
60
|
module FactDb
|
|
46
61
|
class Facts
|
|
47
|
-
|
|
62
|
+
# Available output formats for LLM consumption
|
|
63
|
+
FORMATS = %i[raw json triples cypher text].freeze
|
|
64
|
+
|
|
65
|
+
# Available retrieval strategies
|
|
66
|
+
STRATEGIES = %i[auto semantic fulltext graph temporal hybrid].freeze
|
|
67
|
+
|
|
68
|
+
attr_reader :config, :source_service, :entity_service, :fact_service,
|
|
48
69
|
:extraction_pipeline, :resolution_pipeline
|
|
49
70
|
|
|
50
71
|
def initialize(config: nil)
|
|
51
72
|
@config = config || FactDb.config
|
|
52
73
|
Database.establish_connection!(@config)
|
|
53
74
|
|
|
54
|
-
@
|
|
75
|
+
@source_service = Services::SourceService.new(@config)
|
|
55
76
|
@entity_service = Services::EntityService.new(@config)
|
|
56
77
|
@fact_service = Services::FactService.new(@config)
|
|
57
78
|
@extraction_pipeline = Pipeline::ExtractionPipeline.new(@config)
|
|
58
79
|
@resolution_pipeline = Pipeline::ResolutionPipeline.new(@config)
|
|
80
|
+
@transformers = build_transformers
|
|
59
81
|
end
|
|
60
82
|
|
|
61
|
-
# Ingest raw content
|
|
62
|
-
def ingest(
|
|
63
|
-
@
|
|
64
|
-
|
|
65
|
-
|
|
83
|
+
# Ingest raw content
|
|
84
|
+
def ingest(content, kind:, captured_at: Time.current, metadata: {}, title: nil, source_uri: nil)
|
|
85
|
+
@source_service.create(
|
|
86
|
+
content,
|
|
87
|
+
kind: kind,
|
|
66
88
|
captured_at: captured_at,
|
|
67
89
|
metadata: metadata,
|
|
68
90
|
title: title,
|
|
@@ -70,19 +92,27 @@ module FactDb
|
|
|
70
92
|
)
|
|
71
93
|
end
|
|
72
94
|
|
|
73
|
-
# Extract facts from
|
|
74
|
-
def extract_facts(
|
|
75
|
-
@fact_service.
|
|
95
|
+
# Extract facts from source
|
|
96
|
+
def extract_facts(source_id, extractor: @config.default_extractor)
|
|
97
|
+
@fact_service.extract_from_source(source_id, extractor: extractor)
|
|
76
98
|
end
|
|
77
99
|
|
|
78
100
|
# Query facts with temporal and entity filtering
|
|
79
|
-
|
|
80
|
-
|
|
101
|
+
#
|
|
102
|
+
# @param topic [String, nil] Topic to search for
|
|
103
|
+
# @param at [Date, Time, String, nil] Point in time for temporal query
|
|
104
|
+
# @param entity [Integer, nil] Entity ID to filter by
|
|
105
|
+
# @param status [Symbol] Fact status (:canonical, :superseded, :synthesized, :all)
|
|
106
|
+
# @param format [Symbol] Output format (:raw, :json, :triples, :cypher, :text)
|
|
107
|
+
# @return [QueryResult, Array, String, Hash] Results in requested format (a QueryResult for :json)
|
|
108
|
+
def query_facts(topic: nil, at: nil, entity: nil, status: :canonical, format: :json)
|
|
109
|
+
results = @fact_service.query(topic: topic, at: at, entity: entity, status: status)
|
|
110
|
+
transform_results(results, topic: topic, format: format)
|
|
81
111
|
end
|
|
82
112
|
|
|
83
113
|
# Resolve a name to an entity
|
|
84
|
-
def resolve_entity(name,
|
|
85
|
-
@entity_service.resolve(name,
|
|
114
|
+
def resolve_entity(name, kind: nil)
|
|
115
|
+
@entity_service.resolve(name, kind: kind)
|
|
86
116
|
end
|
|
87
117
|
|
|
88
118
|
# Build a timeline for an entity
|
|
@@ -91,37 +121,152 @@ module FactDb
|
|
|
91
121
|
end
|
|
92
122
|
|
|
93
123
|
# Get currently valid facts about an entity
|
|
94
|
-
|
|
95
|
-
|
|
124
|
+
#
|
|
125
|
+
# @param entity_id [Integer] Entity ID
|
|
126
|
+
# @param format [Symbol] Output format
|
|
127
|
+
# @return [QueryResult, Array, String, Hash] Results in requested format (a QueryResult for :json)
|
|
128
|
+
def current_facts_for(entity_id, format: :json)
|
|
129
|
+
results = @fact_service.current_facts(entity: entity_id)
|
|
130
|
+
transform_results(results, topic: "entity_#{entity_id}", format: format)
|
|
96
131
|
end
|
|
97
132
|
|
|
98
133
|
# Get facts valid at a specific point in time
|
|
99
|
-
|
|
100
|
-
|
|
134
|
+
#
|
|
135
|
+
# @param at [Date, Time, String] Point in time
|
|
136
|
+
# @param entity [Integer, nil] Entity ID to filter by
|
|
137
|
+
# @param topic [String, nil] Topic to search for
|
|
138
|
+
# @param format [Symbol] Output format
|
|
139
|
+
# @return [QueryResult, Array, String, Hash] Results in requested format (a QueryResult for :json)
|
|
140
|
+
def facts_at(at, entity: nil, topic: nil, format: :json)
|
|
141
|
+
results = @fact_service.facts_at(at, entity: entity, topic: topic)
|
|
142
|
+
transform_results(results, topic: topic || "facts_at_#{at}", format: format)
|
|
143
|
+
end
|
|
144
|
+
|
|
145
|
+
# Temporal query builder - query at a specific point in time
|
|
146
|
+
#
|
|
147
|
+
# @param date [Date, Time, String] Point in time
|
|
148
|
+
# @return [Temporal::QueryBuilder] Scoped query builder
|
|
149
|
+
#
|
|
150
|
+
# @example
|
|
151
|
+
# facts.at("2024-01-15").query("Paula's role", format: :cypher)
|
|
152
|
+
# facts.at("2024-01-15").facts_for(entity_id)
|
|
153
|
+
# facts.at("2024-01-15").compare_to("2024-06-15")
|
|
154
|
+
def at(date)
|
|
155
|
+
Temporal::QueryBuilder.new(self, parse_date(date))
|
|
156
|
+
end
|
|
157
|
+
|
|
158
|
+
# Compare what changed between two dates
|
|
159
|
+
#
|
|
160
|
+
# @param topic [String, nil] Topic to compare (nil for all facts)
|
|
161
|
+
# @param from [Date, Time, String] Start date
|
|
162
|
+
# @param to [Date, Time, String] End date
|
|
163
|
+
# @return [Hash] Comparison with :topic, :from, :to, :added, :removed, :unchanged keys
|
|
164
|
+
def diff(topic = nil, from:, to:)
|
|
165
|
+
from_date = parse_date(from)
|
|
166
|
+
to_date = parse_date(to)
|
|
167
|
+
|
|
168
|
+
from_results = @fact_service.query(topic: topic, at: from_date, status: :canonical)
|
|
169
|
+
to_results = @fact_service.query(topic: topic, at: to_date, status: :canonical)
|
|
170
|
+
|
|
171
|
+
from_set = facts_to_comparable(from_results)
|
|
172
|
+
to_set = facts_to_comparable(to_results)
|
|
173
|
+
|
|
174
|
+
{
|
|
175
|
+
topic: topic,
|
|
176
|
+
from: from_date,
|
|
177
|
+
to: to_date,
|
|
178
|
+
added: to_results.select { |f| !from_set.include?(comparable_key(f)) },
|
|
179
|
+
removed: from_results.select { |f| !to_set.include?(comparable_key(f)) },
|
|
180
|
+
unchanged: from_results.select { |f| to_set.include?(comparable_key(f)) }
|
|
181
|
+
}
|
|
182
|
+
end
|
|
183
|
+
|
|
184
|
+
# Introspect the schema - what does the layer know about?
|
|
185
|
+
#
|
|
186
|
+
# @param topic [String, nil] Optional topic to introspect specifically
|
|
187
|
+
# @return [Hash, nil] Schema information or topic-specific coverage (nil when the topic does not resolve to an entity)
|
|
188
|
+
def introspect(topic = nil)
|
|
189
|
+
topic ? introspect_topic(topic) : introspect_schema
|
|
190
|
+
end
|
|
191
|
+
|
|
192
|
+
# Suggest queries based on what's stored for a topic
|
|
193
|
+
#
|
|
194
|
+
# @param topic [String] Topic to get suggestions for
|
|
195
|
+
# @return [Array<String>] Suggested queries
|
|
196
|
+
def suggest_queries(topic)
|
|
197
|
+
resolved = resolve_entity(topic)
|
|
198
|
+
return [] unless resolved
|
|
199
|
+
|
|
200
|
+
entity = resolved.respond_to?(:entity) ? resolved.entity : resolved
|
|
201
|
+
suggestions = []
|
|
202
|
+
|
|
203
|
+
entity_kind = entity.respond_to?(:kind) ? entity.kind : nil
|
|
204
|
+
suggestions << "current status" if entity_kind == "person"
|
|
205
|
+
|
|
206
|
+
# Check relationships
|
|
207
|
+
relationships = @entity_service.relationship_types_for(entity.id)
|
|
208
|
+
suggestions << "employment history" if relationships.include?(:works_at) || relationships.include?(:object)
|
|
209
|
+
suggestions << "team members" if relationships.include?(:works_with)
|
|
210
|
+
suggestions << "reporting chain" if relationships.include?(:reports_to)
|
|
211
|
+
|
|
212
|
+
# Check fact coverage
|
|
213
|
+
fact_stats = @fact_service.fact_stats(entity.id)
|
|
214
|
+
suggestions << "timeline" if fact_stats[:canonical]&.positive?
|
|
215
|
+
suggestions << "historical changes" if fact_stats[:superseded]&.positive?
|
|
216
|
+
|
|
217
|
+
suggestions
|
|
218
|
+
end
|
|
219
|
+
|
|
220
|
+
# Suggest retrieval strategies for a query
|
|
221
|
+
#
|
|
222
|
+
# @param query_text [String] The query
|
|
223
|
+
# @return [Array<Hash>] Strategy options with descriptions
|
|
224
|
+
def suggest_strategies(query_text)
|
|
225
|
+
strategies = []
|
|
226
|
+
|
|
227
|
+
# Check for temporal keywords
|
|
228
|
+
if query_text.match?(/\b(yesterday|last\s+week|last\s+month|ago|since|before|after|between)\b/i)
|
|
229
|
+
strategies << { strategy: :temporal, description: "Filter by date range" }
|
|
230
|
+
end
|
|
231
|
+
|
|
232
|
+
# Check for semantic intent
|
|
233
|
+
if query_text.match?(/\b(about|related|similar|like)\b/i)
|
|
234
|
+
strategies << { strategy: :semantic, description: "Search by semantic similarity" }
|
|
235
|
+
end
|
|
236
|
+
|
|
237
|
+
# Check for entity focus
|
|
238
|
+
if query_text.match?(/\b(who|what|where)\b/i)
|
|
239
|
+
strategies << { strategy: :graph, description: "Traverse from entity node" }
|
|
240
|
+
end
|
|
241
|
+
|
|
242
|
+
# Default: hybrid
|
|
243
|
+
strategies << { strategy: :hybrid, description: "Combine multiple strategies" }
|
|
244
|
+
|
|
245
|
+
strategies
|
|
101
246
|
end
|
|
102
247
|
|
|
103
|
-
# Batch extract facts from multiple
|
|
248
|
+
# Batch extract facts from multiple sources
|
|
104
249
|
#
|
|
105
|
-
# @param
|
|
250
|
+
# @param source_ids [Array<Integer>] Source IDs to process
|
|
106
251
|
# @param extractor [Symbol] Extractor type (:manual, :llm, :rule_based)
|
|
107
252
|
# @param parallel [Boolean] Whether to use parallel processing
|
|
108
|
-
# @return [Array<Hash>] Results with extracted facts per
|
|
109
|
-
def batch_extract(
|
|
110
|
-
|
|
253
|
+
# @return [Array<Hash>] Results with extracted facts per source
|
|
254
|
+
def batch_extract(source_ids, extractor: @config.default_extractor, parallel: true)
|
|
255
|
+
sources = Models::Source.where(id: source_ids).to_a
|
|
111
256
|
if parallel
|
|
112
|
-
@extraction_pipeline.process_parallel(
|
|
257
|
+
@extraction_pipeline.process_parallel(sources, extractor: extractor)
|
|
113
258
|
else
|
|
114
|
-
@extraction_pipeline.process(
|
|
259
|
+
@extraction_pipeline.process(sources, extractor: extractor)
|
|
115
260
|
end
|
|
116
261
|
end
|
|
117
262
|
|
|
118
263
|
# Batch resolve entity names
|
|
119
264
|
#
|
|
120
265
|
# @param names [Array<String>] Entity names to resolve
|
|
121
|
-
# @param
|
|
266
|
+
# @param kind [Symbol, nil] Entity kind filter
|
|
122
267
|
# @return [Array<Hash>] Resolution results
|
|
123
|
-
def batch_resolve_entities(names,
|
|
124
|
-
@resolution_pipeline.resolve_entities(names,
|
|
268
|
+
def batch_resolve_entities(names, kind: nil)
|
|
269
|
+
@resolution_pipeline.resolve_entities(names, kind: kind)
|
|
125
270
|
end
|
|
126
271
|
|
|
127
272
|
# Detect fact conflicts for multiple entities
|
|
@@ -131,6 +276,112 @@ module FactDb
|
|
|
131
276
|
def detect_fact_conflicts(entity_ids)
|
|
132
277
|
@resolution_pipeline.detect_conflicts(entity_ids)
|
|
133
278
|
end
|
|
279
|
+
|
|
280
|
+
private
|
|
281
|
+
|
|
282
|
+
def build_transformers
|
|
283
|
+
{
|
|
284
|
+
raw: Transformers::RawTransformer.new,
|
|
285
|
+
json: Transformers::JsonTransformer.new,
|
|
286
|
+
triples: Transformers::TripleTransformer.new,
|
|
287
|
+
cypher: Transformers::CypherTransformer.new,
|
|
288
|
+
text: Transformers::TextTransformer.new
|
|
289
|
+
}
|
|
290
|
+
end
|
|
291
|
+
|
|
292
|
+
def transform_results(results, topic:, format:)
|
|
293
|
+
validate_format!(format)
|
|
294
|
+
|
|
295
|
+
query_result = QueryResult.new(query: topic || "query")
|
|
296
|
+
query_result.add_facts(results)
|
|
297
|
+
query_result.resolve_entities(@entity_service)
|
|
298
|
+
|
|
299
|
+
# Return QueryResult directly for :json format to support fluent API methods
|
|
300
|
+
# like each_fact, fact_count, etc. Use query_result.to_h for Hash output.
|
|
301
|
+
return query_result if format == :json
|
|
302
|
+
|
|
303
|
+
@transformers[format].transform(query_result)
|
|
304
|
+
end
|
|
305
|
+
|
|
306
|
+
def validate_format!(format)
|
|
307
|
+
return if FORMATS.include?(format)
|
|
308
|
+
|
|
309
|
+
raise ArgumentError, "Unknown format: #{format}. Available: #{FORMATS.join(', ')}"
|
|
310
|
+
end
|
|
311
|
+
|
|
312
|
+
def parse_date(date)
|
|
313
|
+
return nil if date.nil?
|
|
314
|
+
return date if date.is_a?(Date) || date.is_a?(Time)
|
|
315
|
+
|
|
316
|
+
Date.parse(date.to_s)
|
|
317
|
+
rescue ArgumentError
|
|
318
|
+
nil
|
|
319
|
+
end
|
|
320
|
+
|
|
321
|
+
def introspect_schema
|
|
322
|
+
{
|
|
323
|
+
capabilities: collect_capabilities,
|
|
324
|
+
entity_kinds: Models::Entity.distinct.pluck(:kind).compact,
|
|
325
|
+
fact_statuses: %w[canonical superseded corroborated synthesized],
|
|
326
|
+
extraction_methods: %w[manual llm rule_based],
|
|
327
|
+
output_formats: FORMATS,
|
|
328
|
+
retrieval_strategies: STRATEGIES,
|
|
329
|
+
statistics: collect_statistics
|
|
330
|
+
}
|
|
331
|
+
end
|
|
332
|
+
|
|
333
|
+
def introspect_topic(topic)
|
|
334
|
+
resolved = resolve_entity(topic)
|
|
335
|
+
return nil unless resolved
|
|
336
|
+
|
|
337
|
+
entity = resolved.respond_to?(:entity) ? resolved.entity : resolved
|
|
338
|
+
|
|
339
|
+
{
|
|
340
|
+
entity: entity_info(entity),
|
|
341
|
+
coverage: {
|
|
342
|
+
facts: @fact_service.fact_stats(entity.id),
|
|
343
|
+
timespan: @entity_service.timespan_for(entity.id)
|
|
344
|
+
},
|
|
345
|
+
relationships: @entity_service.relationship_types_for(entity.id),
|
|
346
|
+
suggested_queries: suggest_queries(topic)
|
|
347
|
+
}
|
|
348
|
+
end
|
|
349
|
+
|
|
350
|
+
def collect_capabilities
|
|
351
|
+
capabilities = [:temporal_query, :entity_resolution, :introspection]
|
|
352
|
+
|
|
353
|
+
capabilities << :semantic_search if @config.embedding_generator
|
|
354
|
+
capabilities << :llm_extraction if @config.llm_client || @config.llm&.provider
|
|
355
|
+
|
|
356
|
+
capabilities
|
|
357
|
+
end
|
|
358
|
+
|
|
359
|
+
def collect_statistics
|
|
360
|
+
{
|
|
361
|
+
facts: @fact_service.stats,
|
|
362
|
+
entities: @entity_service.stats,
|
|
363
|
+
sources: @source_service.stats
|
|
364
|
+
}
|
|
365
|
+
end
|
|
366
|
+
|
|
367
|
+
def entity_info(entity)
|
|
368
|
+
{
|
|
369
|
+
id: entity.id,
|
|
370
|
+
name: entity.name,
|
|
371
|
+
kind: entity.kind,
|
|
372
|
+
resolution_status: entity.resolution_status,
|
|
373
|
+
aliases: entity.aliases.map { |a| { name: a.name, kind: a.kind } }
|
|
374
|
+
}
|
|
375
|
+
end
|
|
376
|
+
|
|
377
|
+
def facts_to_comparable(facts)
|
|
378
|
+
facts.map { |f| comparable_key(f) }.to_set
|
|
379
|
+
end
|
|
380
|
+
|
|
381
|
+
def comparable_key(fact)
|
|
382
|
+
text = fact.respond_to?(:text) ? fact.text : fact[:text]
|
|
383
|
+
"#{text}".downcase.strip
|
|
384
|
+
end
|
|
134
385
|
end
|
|
135
386
|
|
|
136
387
|
class << self
|
data/mkdocs.yml
CHANGED
|
@@ -174,12 +174,12 @@ nav:
|
|
|
174
174
|
- Facts: api/facts.md
|
|
175
175
|
- Models:
|
|
176
176
|
- api/models/index.md
|
|
177
|
-
-
|
|
177
|
+
- Source: api/models/source.md
|
|
178
178
|
- Entity: api/models/entity.md
|
|
179
179
|
- Fact: api/models/fact.md
|
|
180
180
|
- Services:
|
|
181
181
|
- api/services/index.md
|
|
182
|
-
-
|
|
182
|
+
- SourceService: api/services/source-service.md
|
|
183
183
|
- EntityService: api/services/entity-service.md
|
|
184
184
|
- FactService: api/services/fact-service.md
|
|
185
185
|
- Extractors:
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: fact_db
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.
|
|
4
|
+
version: 0.0.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Dewayne VanHoozer
|
|
@@ -52,19 +52,19 @@ dependencies:
|
|
|
52
52
|
- !ruby/object:Gem::Version
|
|
53
53
|
version: '0.3'
|
|
54
54
|
- !ruby/object:Gem::Dependency
|
|
55
|
-
name:
|
|
55
|
+
name: myway_config
|
|
56
56
|
requirement: !ruby/object:Gem::Requirement
|
|
57
57
|
requirements:
|
|
58
|
-
- - "
|
|
58
|
+
- - "~>"
|
|
59
59
|
- !ruby/object:Gem::Version
|
|
60
|
-
version:
|
|
60
|
+
version: 0.1.1
|
|
61
61
|
type: :runtime
|
|
62
62
|
prerelease: false
|
|
63
63
|
version_requirements: !ruby/object:Gem::Requirement
|
|
64
64
|
requirements:
|
|
65
|
-
- - "
|
|
65
|
+
- - "~>"
|
|
66
66
|
- !ruby/object:Gem::Version
|
|
67
|
-
version:
|
|
67
|
+
version: 0.1.1
|
|
68
68
|
- !ruby/object:Gem::Dependency
|
|
69
69
|
name: chronic
|
|
70
70
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -149,6 +149,20 @@ dependencies:
|
|
|
149
149
|
- - ">="
|
|
150
150
|
- !ruby/object:Gem::Version
|
|
151
151
|
version: '0'
|
|
152
|
+
- !ruby/object:Gem::Dependency
|
|
153
|
+
name: timecop
|
|
154
|
+
requirement: !ruby/object:Gem::Requirement
|
|
155
|
+
requirements:
|
|
156
|
+
- - ">="
|
|
157
|
+
- !ruby/object:Gem::Version
|
|
158
|
+
version: '0'
|
|
159
|
+
type: :development
|
|
160
|
+
prerelease: false
|
|
161
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
162
|
+
requirements:
|
|
163
|
+
- - ">="
|
|
164
|
+
- !ruby/object:Gem::Version
|
|
165
|
+
version: '0'
|
|
152
166
|
- !ruby/object:Gem::Dependency
|
|
153
167
|
name: yard
|
|
154
168
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -175,12 +189,13 @@ extensions: []
|
|
|
175
189
|
extra_rdoc_files: []
|
|
176
190
|
files:
|
|
177
191
|
- ".envrc"
|
|
192
|
+
- ".yardopts"
|
|
178
193
|
- CHANGELOG.md
|
|
179
194
|
- COMMITS.md
|
|
180
195
|
- README.md
|
|
181
196
|
- Rakefile
|
|
182
197
|
- db/migrate/001_enable_extensions.rb
|
|
183
|
-
- db/migrate/
|
|
198
|
+
- db/migrate/002_create_sources.rb
|
|
184
199
|
- db/migrate/003_create_entities.rb
|
|
185
200
|
- db/migrate/004_create_entity_aliases.rb
|
|
186
201
|
- db/migrate/005_create_facts.rb
|
|
@@ -192,17 +207,17 @@ files:
|
|
|
192
207
|
- docs/api/extractors/rule-based.md
|
|
193
208
|
- docs/api/facts.md
|
|
194
209
|
- docs/api/index.md
|
|
195
|
-
- docs/api/models/content.md
|
|
196
210
|
- docs/api/models/entity.md
|
|
197
211
|
- docs/api/models/fact.md
|
|
198
212
|
- docs/api/models/index.md
|
|
213
|
+
- docs/api/models/source.md
|
|
199
214
|
- docs/api/pipeline/extraction.md
|
|
200
215
|
- docs/api/pipeline/index.md
|
|
201
216
|
- docs/api/pipeline/resolution.md
|
|
202
|
-
- docs/api/services/content-service.md
|
|
203
217
|
- docs/api/services/entity-service.md
|
|
204
218
|
- docs/api/services/fact-service.md
|
|
205
219
|
- docs/api/services/index.md
|
|
220
|
+
- docs/api/services/source-service.md
|
|
206
221
|
- docs/architecture/database-schema.md
|
|
207
222
|
- docs/architecture/entity-resolution.md
|
|
208
223
|
- docs/architecture/index.md
|
|
@@ -229,14 +244,34 @@ files:
|
|
|
229
244
|
- docs/guides/llm-integration.md
|
|
230
245
|
- docs/guides/temporal-queries.md
|
|
231
246
|
- docs/index.md
|
|
247
|
+
- examples/.envrc
|
|
248
|
+
- examples/.gitignore
|
|
249
|
+
- examples/001_configuration.rb
|
|
250
|
+
- examples/010_basic_usage.rb
|
|
251
|
+
- examples/020_entity_management.rb
|
|
252
|
+
- examples/030_temporal_queries.rb
|
|
253
|
+
- examples/040_output_formats.rb
|
|
254
|
+
- examples/050_rule_based_extraction.rb
|
|
255
|
+
- examples/060_fluent_temporal_api.rb
|
|
256
|
+
- examples/070_introspection.rb
|
|
257
|
+
- examples/080_hr_system.rb
|
|
258
|
+
- examples/090_ingest_demo.rb
|
|
259
|
+
- examples/100_query_context.rb
|
|
260
|
+
- examples/110_prove_it.rb
|
|
261
|
+
- examples/120_dump_database.rb
|
|
262
|
+
- examples/130_rag_feedback_loop.rb
|
|
232
263
|
- examples/README.md
|
|
233
|
-
- examples/
|
|
234
|
-
- examples/
|
|
235
|
-
- examples/
|
|
236
|
-
- examples/
|
|
237
|
-
- examples/
|
|
264
|
+
- examples/data/lincoln_associates.md
|
|
265
|
+
- examples/data/lincoln_biography.md
|
|
266
|
+
- examples/data/lincoln_cabinet.md
|
|
267
|
+
- examples/data/lincoln_family.md
|
|
268
|
+
- examples/data/lincoln_military.md
|
|
269
|
+
- examples/data/lincoln_todd_family.md
|
|
270
|
+
- examples/ingest_reporter.rb
|
|
271
|
+
- examples/utilities.rb
|
|
238
272
|
- lib/fact_db.rb
|
|
239
273
|
- lib/fact_db/config.rb
|
|
274
|
+
- lib/fact_db/config/defaults.yml
|
|
240
275
|
- lib/fact_db/database.rb
|
|
241
276
|
- lib/fact_db/errors.rb
|
|
242
277
|
- lib/fact_db/extractors/base.rb
|
|
@@ -244,21 +279,30 @@ files:
|
|
|
244
279
|
- lib/fact_db/extractors/manual_extractor.rb
|
|
245
280
|
- lib/fact_db/extractors/rule_based_extractor.rb
|
|
246
281
|
- lib/fact_db/llm/adapter.rb
|
|
247
|
-
- lib/fact_db/models/content.rb
|
|
248
282
|
- lib/fact_db/models/entity.rb
|
|
249
283
|
- lib/fact_db/models/entity_alias.rb
|
|
250
284
|
- lib/fact_db/models/entity_mention.rb
|
|
251
285
|
- lib/fact_db/models/fact.rb
|
|
252
286
|
- lib/fact_db/models/fact_source.rb
|
|
287
|
+
- lib/fact_db/models/source.rb
|
|
253
288
|
- lib/fact_db/pipeline/extraction_pipeline.rb
|
|
254
289
|
- lib/fact_db/pipeline/resolution_pipeline.rb
|
|
290
|
+
- lib/fact_db/query_result.rb
|
|
255
291
|
- lib/fact_db/resolution/entity_resolver.rb
|
|
256
292
|
- lib/fact_db/resolution/fact_resolver.rb
|
|
257
|
-
- lib/fact_db/services/content_service.rb
|
|
258
293
|
- lib/fact_db/services/entity_service.rb
|
|
259
294
|
- lib/fact_db/services/fact_service.rb
|
|
295
|
+
- lib/fact_db/services/source_service.rb
|
|
260
296
|
- lib/fact_db/temporal/query.rb
|
|
297
|
+
- lib/fact_db/temporal/query_builder.rb
|
|
261
298
|
- lib/fact_db/temporal/timeline.rb
|
|
299
|
+
- lib/fact_db/transformers/base.rb
|
|
300
|
+
- lib/fact_db/transformers/cypher_transformer.rb
|
|
301
|
+
- lib/fact_db/transformers/json_transformer.rb
|
|
302
|
+
- lib/fact_db/transformers/raw_transformer.rb
|
|
303
|
+
- lib/fact_db/transformers/text_transformer.rb
|
|
304
|
+
- lib/fact_db/transformers/triple_transformer.rb
|
|
305
|
+
- lib/fact_db/validation/alias_filter.rb
|
|
262
306
|
- lib/fact_db/version.rb
|
|
263
307
|
- mkdocs.yml
|
|
264
308
|
homepage: https://github.com/MadBomber/fact_db
|
|
@@ -1,44 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
class CreateContents < ActiveRecord::Migration[7.0]
|
|
4
|
-
def change
|
|
5
|
-
create_table :fact_db_contents do |t|
|
|
6
|
-
# Content identification
|
|
7
|
-
t.string :content_hash, null: false, limit: 64
|
|
8
|
-
t.string :content_type, null: false, limit: 50
|
|
9
|
-
|
|
10
|
-
# The raw content (immutable)
|
|
11
|
-
t.text :raw_text, null: false
|
|
12
|
-
t.string :title, limit: 500
|
|
13
|
-
|
|
14
|
-
# Source metadata
|
|
15
|
-
t.text :source_uri
|
|
16
|
-
t.jsonb :source_metadata, null: false, default: {}
|
|
17
|
-
|
|
18
|
-
# Vector embedding for semantic search
|
|
19
|
-
t.vector :embedding, limit: 1536
|
|
20
|
-
|
|
21
|
-
# Timestamps
|
|
22
|
-
t.timestamptz :captured_at, null: false
|
|
23
|
-
t.timestamps
|
|
24
|
-
end
|
|
25
|
-
|
|
26
|
-
add_index :fact_db_contents, :content_hash, unique: true
|
|
27
|
-
add_index :fact_db_contents, :captured_at
|
|
28
|
-
add_index :fact_db_contents, :content_type
|
|
29
|
-
add_index :fact_db_contents, :source_metadata, using: :gin
|
|
30
|
-
|
|
31
|
-
# Full-text search index
|
|
32
|
-
execute <<-SQL
|
|
33
|
-
CREATE INDEX idx_contents_fulltext ON fact_db_contents
|
|
34
|
-
USING gin(to_tsvector('english', raw_text));
|
|
35
|
-
SQL
|
|
36
|
-
|
|
37
|
-
# HNSW index for vector similarity search (if pgvector supports it)
|
|
38
|
-
# This creates a cosine similarity index for fast nearest neighbor queries
|
|
39
|
-
execute <<-SQL
|
|
40
|
-
CREATE INDEX idx_contents_embedding ON fact_db_contents
|
|
41
|
-
USING hnsw (embedding vector_cosine_ops);
|
|
42
|
-
SQL
|
|
43
|
-
end
|
|
44
|
-
end
|
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module FactDb
|
|
4
|
-
module Models
|
|
5
|
-
class Content < ActiveRecord::Base
|
|
6
|
-
self.table_name = "fact_db_contents"
|
|
7
|
-
|
|
8
|
-
has_many :fact_sources, class_name: "FactDb::Models::FactSource",
|
|
9
|
-
foreign_key: :content_id, dependent: :destroy
|
|
10
|
-
has_many :facts, through: :fact_sources
|
|
11
|
-
|
|
12
|
-
validates :content_hash, presence: true, uniqueness: true
|
|
13
|
-
validates :content_type, presence: true
|
|
14
|
-
validates :raw_text, presence: true
|
|
15
|
-
validates :captured_at, presence: true
|
|
16
|
-
|
|
17
|
-
before_validation :generate_content_hash, on: :create
|
|
18
|
-
|
|
19
|
-
# Content types
|
|
20
|
-
TYPES = %w[email transcript document slack meeting_notes contract report].freeze
|
|
21
|
-
|
|
22
|
-
validates :content_type, inclusion: { in: TYPES }, allow_nil: false
|
|
23
|
-
|
|
24
|
-
scope :by_type, ->(type) { where(content_type: type) }
|
|
25
|
-
scope :captured_between, ->(from, to) { where(captured_at: from..to) }
|
|
26
|
-
scope :captured_after, ->(date) { where("captured_at >= ?", date) }
|
|
27
|
-
scope :captured_before, ->(date) { where("captured_at <= ?", date) }
|
|
28
|
-
|
|
29
|
-
# Full-text search
|
|
30
|
-
scope :search_text, lambda { |query|
|
|
31
|
-
where("to_tsvector('english', raw_text) @@ plainto_tsquery('english', ?)", query)
|
|
32
|
-
}
|
|
33
|
-
|
|
34
|
-
# Vector similarity search (requires neighbor gem configured)
|
|
35
|
-
def self.nearest_neighbors(embedding, limit: 10)
|
|
36
|
-
return none unless embedding
|
|
37
|
-
|
|
38
|
-
order(Arel.sql("embedding <=> '#{embedding}'")).limit(limit)
|
|
39
|
-
end
|
|
40
|
-
|
|
41
|
-
def immutable?
|
|
42
|
-
true
|
|
43
|
-
end
|
|
44
|
-
|
|
45
|
-
def word_count
|
|
46
|
-
raw_text.split.size
|
|
47
|
-
end
|
|
48
|
-
|
|
49
|
-
def preview(length: 200)
|
|
50
|
-
return raw_text if raw_text.length <= length
|
|
51
|
-
|
|
52
|
-
"#{raw_text[0, length]}..."
|
|
53
|
-
end
|
|
54
|
-
|
|
55
|
-
private
|
|
56
|
-
|
|
57
|
-
def generate_content_hash
|
|
58
|
-
self.content_hash = Digest::SHA256.hexdigest(raw_text) if raw_text.present?
|
|
59
|
-
end
|
|
60
|
-
end
|
|
61
|
-
end
|
|
62
|
-
end
|