fact_db 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. checksums.yaml +7 -0
  2. data/.envrc +1 -0
  3. data/CHANGELOG.md +48 -0
  4. data/COMMITS.md +196 -0
  5. data/README.md +102 -0
  6. data/Rakefile +41 -0
  7. data/db/migrate/001_enable_extensions.rb +7 -0
  8. data/db/migrate/002_create_contents.rb +44 -0
  9. data/db/migrate/003_create_entities.rb +36 -0
  10. data/db/migrate/004_create_entity_aliases.rb +18 -0
  11. data/db/migrate/005_create_facts.rb +65 -0
  12. data/db/migrate/006_create_entity_mentions.rb +18 -0
  13. data/db/migrate/007_create_fact_sources.rb +18 -0
  14. data/docs/api/extractors/index.md +71 -0
  15. data/docs/api/extractors/llm.md +162 -0
  16. data/docs/api/extractors/manual.md +92 -0
  17. data/docs/api/extractors/rule-based.md +165 -0
  18. data/docs/api/facts.md +300 -0
  19. data/docs/api/index.md +66 -0
  20. data/docs/api/models/content.md +165 -0
  21. data/docs/api/models/entity.md +202 -0
  22. data/docs/api/models/fact.md +270 -0
  23. data/docs/api/models/index.md +77 -0
  24. data/docs/api/pipeline/extraction.md +175 -0
  25. data/docs/api/pipeline/index.md +72 -0
  26. data/docs/api/pipeline/resolution.md +209 -0
  27. data/docs/api/services/content-service.md +166 -0
  28. data/docs/api/services/entity-service.md +202 -0
  29. data/docs/api/services/fact-service.md +223 -0
  30. data/docs/api/services/index.md +55 -0
  31. data/docs/architecture/database-schema.md +293 -0
  32. data/docs/architecture/entity-resolution.md +293 -0
  33. data/docs/architecture/index.md +149 -0
  34. data/docs/architecture/temporal-facts.md +268 -0
  35. data/docs/architecture/three-layer-model.md +242 -0
  36. data/docs/assets/css/custom.css +137 -0
  37. data/docs/assets/fact_db.jpg +0 -0
  38. data/docs/assets/images/fact_db.jpg +0 -0
  39. data/docs/concepts.md +183 -0
  40. data/docs/examples/basic-usage.md +235 -0
  41. data/docs/examples/hr-onboarding.md +312 -0
  42. data/docs/examples/index.md +64 -0
  43. data/docs/examples/news-analysis.md +288 -0
  44. data/docs/getting-started/database-setup.md +170 -0
  45. data/docs/getting-started/index.md +71 -0
  46. data/docs/getting-started/installation.md +98 -0
  47. data/docs/getting-started/quick-start.md +191 -0
  48. data/docs/guides/batch-processing.md +325 -0
  49. data/docs/guides/configuration.md +243 -0
  50. data/docs/guides/entity-management.md +364 -0
  51. data/docs/guides/extracting-facts.md +299 -0
  52. data/docs/guides/index.md +22 -0
  53. data/docs/guides/ingesting-content.md +252 -0
  54. data/docs/guides/llm-integration.md +299 -0
  55. data/docs/guides/temporal-queries.md +315 -0
  56. data/docs/index.md +121 -0
  57. data/examples/README.md +130 -0
  58. data/examples/basic_usage.rb +164 -0
  59. data/examples/entity_management.rb +216 -0
  60. data/examples/hr_system.rb +428 -0
  61. data/examples/rule_based_extraction.rb +258 -0
  62. data/examples/temporal_queries.rb +245 -0
  63. data/lib/fact_db/config.rb +71 -0
  64. data/lib/fact_db/database.rb +45 -0
  65. data/lib/fact_db/errors.rb +10 -0
  66. data/lib/fact_db/extractors/base.rb +117 -0
  67. data/lib/fact_db/extractors/llm_extractor.rb +179 -0
  68. data/lib/fact_db/extractors/manual_extractor.rb +53 -0
  69. data/lib/fact_db/extractors/rule_based_extractor.rb +228 -0
  70. data/lib/fact_db/llm/adapter.rb +109 -0
  71. data/lib/fact_db/models/content.rb +62 -0
  72. data/lib/fact_db/models/entity.rb +84 -0
  73. data/lib/fact_db/models/entity_alias.rb +26 -0
  74. data/lib/fact_db/models/entity_mention.rb +33 -0
  75. data/lib/fact_db/models/fact.rb +192 -0
  76. data/lib/fact_db/models/fact_source.rb +35 -0
  77. data/lib/fact_db/pipeline/extraction_pipeline.rb +146 -0
  78. data/lib/fact_db/pipeline/resolution_pipeline.rb +129 -0
  79. data/lib/fact_db/resolution/entity_resolver.rb +261 -0
  80. data/lib/fact_db/resolution/fact_resolver.rb +259 -0
  81. data/lib/fact_db/services/content_service.rb +93 -0
  82. data/lib/fact_db/services/entity_service.rb +150 -0
  83. data/lib/fact_db/services/fact_service.rb +193 -0
  84. data/lib/fact_db/temporal/query.rb +125 -0
  85. data/lib/fact_db/temporal/timeline.rb +134 -0
  86. data/lib/fact_db/version.rb +5 -0
  87. data/lib/fact_db.rb +141 -0
  88. data/mkdocs.yml +198 -0
  89. metadata +288 -0
@@ -0,0 +1,192 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FactDb
4
+ module Models
5
# A single fact stored in the +fact_db_facts+ table.
#
# Each record carries the fact text, a SHA-256 digest of that text
# (+fact_hash+), a validity window (+valid_at+ .. +invalid_at+, where a nil
# +invalid_at+ means "still valid"), a status drawn from STATUSES and,
# optionally, the extraction method that produced it.
class Fact < ActiveRecord::Base
  self.table_name = "fact_db_facts"

  # Allowed values for +status+.
  STATUSES = %w[canonical superseded corroborated synthesized].freeze
  # Allowed values for +extraction_method+ (nil is also accepted).
  EXTRACTION_METHODS = %w[manual llm rule_based].freeze

  has_many :entity_mentions, class_name: "FactDb::Models::EntityMention",
                             foreign_key: :fact_id, dependent: :destroy
  has_many :entities, through: :entity_mentions

  has_many :fact_sources, class_name: "FactDb::Models::FactSource",
                          foreign_key: :fact_id, dependent: :destroy
  has_many :source_contents, through: :fact_sources, source: :content

  # Self-referential supersession link: +superseded_by+ points at the newer
  # fact, +supersedes+ lists the older facts that point at this one.
  belongs_to :superseded_by, class_name: "FactDb::Models::Fact",
                             foreign_key: :superseded_by_id, optional: true
  has_many :supersedes, class_name: "FactDb::Models::Fact",
                        foreign_key: :superseded_by_id

  validates :fact_text, presence: true
  validates :fact_hash, presence: true
  validates :valid_at, presence: true
  validates :status, presence: true

  # The digest is computed only when the record is first created.
  before_validation :generate_fact_hash, on: :create

  validates :status, inclusion: { in: STATUSES }
  validates :extraction_method, inclusion: { in: EXTRACTION_METHODS }, allow_nil: true

  # Core scopes
  scope :canonical, -> { where(status: "canonical") }
  scope :superseded, -> { where(status: "superseded") }
  scope :synthesized, -> { where(status: "synthesized") }

  # Temporal scopes - the heart of the Event Clock
  scope :currently_valid, -> { where(invalid_at: nil) }
  scope :historical, -> { where.not(invalid_at: nil) }

  # Facts whose validity window contains +date+ (start inclusive, end exclusive).
  scope :valid_at, lambda { |date|
    where("valid_at <= ?", date)
      .where("invalid_at > ? OR invalid_at IS NULL", date)
  }

  # Facts whose validity window overlaps the span +from+ .. +to+.
  scope :valid_between, lambda { |from, to|
    where("valid_at <= ? AND (invalid_at > ? OR invalid_at IS NULL)", to, from)
  }

  scope :became_valid_between, lambda { |from, to|
    where(valid_at: from..to)
  }

  scope :became_invalid_between, lambda { |from, to|
    where(invalid_at: from..to)
  }

  # Entity filtering
  scope :mentioning_entity, lambda { |entity_id|
    joins(:entity_mentions).where(fact_db_entity_mentions: { entity_id: entity_id })
  }

  scope :with_role, lambda { |entity_id, role|
    joins(:entity_mentions).where(
      fact_db_entity_mentions: { entity_id: entity_id, mention_role: role }
    )
  }

  # Full-text search over +fact_text+ using to_tsvector / plainto_tsquery
  # with the 'english' configuration.
  scope :search_text, lambda { |query|
    where("to_tsvector('english', fact_text) @@ plainto_tsquery('english', ?)", query)
  }

  # Extraction method (both scopes build the same query).
  scope :extracted_by, ->(method) { where(extraction_method: method) }
  scope :by_extraction_method, ->(method) { where(extraction_method: method) }

  # Confidence filtering
  scope :high_confidence, -> { where("confidence >= ?", 0.9) }
  scope :low_confidence, -> { where("confidence < ?", 0.5) }

  # @return [Boolean] true when no end of validity has been recorded
  def currently_valid?
    invalid_at.nil?
  end

  # @param date [Object] a value comparable with +valid_at+ / +invalid_at+
  # @return [Boolean] true when +date+ falls inside the validity window
  def valid_at?(date)
    valid_at <= date && (invalid_at.nil? || invalid_at > date)
  end

  # @return [Object, nil] +invalid_at - valid_at+, or nil while still valid
  def duration
    return nil if invalid_at.nil?

    invalid_at - valid_at
  end

  # @return [Integer, nil] whole calendar days between the two dates, or nil
  #   while still valid
  def duration_days
    return nil if invalid_at.nil?

    (invalid_at.to_date - valid_at.to_date).to_i
  end

  def superseded?
    status == "superseded"
  end

  def synthesized?
    status == "synthesized"
  end

  # Close the validity window.
  #
  # @param at [Time] end of validity (defaults to now)
  # @raise [ActiveRecord::RecordInvalid] if the update fails validation
  def invalidate!(at: Time.current)
    update!(invalid_at: at)
  end

  # Create a replacement fact and mark this one as superseded by it, inside
  # a single transaction. The replacement inherits this fact's
  # +extraction_method+; this fact's +invalid_at+ is set to the
  # replacement's +valid_at+.
  #
  # @param new_fact_text [String] text of the replacement fact
  # @param valid_at [Object] start of validity for the replacement
  # @return [Fact] the newly created fact
  def supersede_with!(new_fact_text, valid_at:)
    transaction do
      new_fact = self.class.create!(
        fact_text: new_fact_text,
        valid_at: valid_at,
        status: "canonical",
        extraction_method: extraction_method
      )

      update!(
        status: "superseded",
        superseded_by_id: new_fact.id,
        invalid_at: valid_at
      )

      new_fact
    end
  end

  # Find or create the mention of +entity+ with the given text. Role and
  # confidence are applied only when a new mention is created.
  def add_mention(entity:, text:, role: nil, confidence: 1.0)
    entity_mentions.find_or_create_by!(entity: entity, mention_text: text) do |m|
      m.mention_role = role
      m.confidence = confidence
    end
  end

  # Find or create the source link to +content+. Type, excerpt and
  # confidence are applied only when a new link is created.
  def add_source(content:, type: "primary", excerpt: nil, confidence: 1.0)
    fact_sources.find_or_create_by!(content: content) do |s|
      s.source_type = type
      s.excerpt = excerpt
      s.confidence = confidence
    end
  end

  # Get source facts for synthesized facts.
  #
  # @return [ActiveRecord::Relation] facts listed in +derived_from_ids+
  def source_facts
    # Array() tolerates a nil attribute (presumably an array column that may
    # be unset) instead of raising NoMethodError.
    ids = Array(derived_from_ids)
    return Fact.none if ids.empty?

    Fact.where(id: ids)
  end

  # Get facts that corroborate this one.
  #
  # @return [ActiveRecord::Relation] facts listed in +corroborated_by_ids+
  def corroborating_facts
    ids = Array(corroborated_by_ids)
    return Fact.none if ids.empty?

    Fact.where(id: ids)
  end

  # Evidence chain - trace back to original content. For synthesized facts
  # the chain also includes the evidence of every fact it was derived from.
  #
  # @return [Array] distinct content records, own sources first
  def evidence_chain
    collect_evidence({}).uniq
  end

  # Vector similarity search: orders by the +<=>+ operator applied to the
  # +embedding+ column (presumably pgvector's distance operator - confirm
  # against the schema).
  #
  # @param embedding [#to_s, nil] query vector; nil yields an empty relation
  # @param limit [Integer] maximum number of rows
  # @return [ActiveRecord::Relation]
  def self.nearest_neighbors(embedding, limit: 10)
    return none unless embedding

    # Quote the value rather than interpolating it, so a crafted string
    # cannot break out of the SQL literal.
    distance = sanitize_sql_array(["embedding <=> ?", embedding.to_s])
    order(Arel.sql(distance)).limit(limit)
  end

  protected

  # Recursive worker for #evidence_chain. +visited+ records fact ids already
  # expanded so that cyclic or diamond-shaped derivations terminate.
  def collect_evidence(visited)
    return [] if visited.key?(id)

    visited[id] = true
    sources = source_contents.to_a

    if synthesized?
      source_facts.each do |source_fact|
        sources.concat(source_fact.collect_evidence(visited))
      end
    end

    sources
  end

  private

  def generate_fact_hash
    self.fact_hash = Digest::SHA256.hexdigest(fact_text) if fact_text.present?
  end
end
191
+ end
192
+ end
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FactDb
4
+ module Models
5
# Link between a fact and a content record that backs it, stored in the
# +fact_db_fact_sources+ table. A given fact/content pair may appear once.
class FactSource < ActiveRecord::Base
  self.table_name = "fact_db_fact_sources"

  # Permitted values for +source_type+.
  TYPES = %w[primary supporting corroborating].freeze

  belongs_to :fact, class_name: "FactDb::Models::Fact"
  belongs_to :content, class_name: "FactDb::Models::Content"

  validates :fact_id, uniqueness: { scope: :content_id }
  validates :source_type, inclusion: { in: TYPES }

  # One scope per source type.
  scope :primary, -> { where(source_type: "primary") }
  scope :supporting, -> { where(source_type: "supporting") }
  scope :corroborating, -> { where(source_type: "corroborating") }

  scope :high_confidence, -> { where("confidence >= ?", 0.9) }

  # @return [Boolean] whether this link is of the "primary" type
  def primary?
    source_type == "primary"
  end

  # Shortened form of the excerpt for display.
  #
  # @param length [Integer] maximum number of characters kept before "..."
  # @return [String, nil] nil without an excerpt; the excerpt itself when it
  #   already fits; otherwise its first +length+ characters followed by "..."
  def excerpt_preview(length: 100)
    if excerpt.nil?
      nil
    elsif excerpt.length <= length
      excerpt
    else
      "#{excerpt[0, length]}..."
    end
  end
end
34
+ end
35
+ end
@@ -0,0 +1,146 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "simple_flow"
4
+
5
+ module FactDb
6
+ module Pipeline
7
# Pipeline for extracting facts from content using SimpleFlow.
# Offers a sequential entry point (#process) and one that declares every
# content item as its own dependency-free SimpleFlow step (#process_parallel).
#
# @example Sequential extraction
#   pipeline = ExtractionPipeline.new(config)
#   results = pipeline.process([content1, content2], extractor: :llm)
#
# @example Parallel extraction
#   pipeline = ExtractionPipeline.new(config)
#   results = pipeline.process_parallel([content1, content2, content3], extractor: :llm)
#
class ExtractionPipeline
  attr_reader :config

  # @param config [Object] configuration; this class reads
  #   +default_extractor+ from it and hands it to the extractor constructors
  def initialize(config = FactDb.config)
    @config = config
  end

  # Process multiple content items sequentially
  #
  # @param contents [Array<Models::Content>] Content records to process
  # @param extractor [Symbol] Extractor type (:manual, :llm, :rule_based)
  # @return [Array<Hash>] Results with extracted facts per content
  # @raise [ConfigurationError] if the extractor type is unknown
  def process(contents, extractor: config.default_extractor)
    pipeline = build_extraction_pipeline(extractor)

    contents.map do |content|
      result = pipeline.call(SimpleFlow::Result.new(content))
      {
        # The validation step tolerates a nil entry, so this must too.
        content_id: content&.id,
        facts: result.success? ? result.value : [],
        error: result.halted? ? result.error : nil
      }
    end
  end

  # Process multiple content items in parallel
  # Uses SimpleFlow's parallel execution capabilities
  #
  # Items sharing an id are extracted once; every input position still gets
  # its own entry in the returned array.
  #
  # @param contents [Array<Models::Content>] Content records to process
  # @param extractor [Symbol] Extractor type (:manual, :llm, :rule_based)
  # @return [Array<Hash>] Results with extracted facts per content
  # @raise [ConfigurationError] if the extractor type is unknown
  def process_parallel(contents, extractor: config.default_extractor)
    pipeline = build_parallel_pipeline(contents, extractor)
    # Explicit braces: a braceless hash would be parsed as keyword arguments
    # under Ruby 3 rather than as the positional result value.
    initial_result = SimpleFlow::Result.new({ contents: contents, results: {} })

    outcomes = pipeline.call(initial_result).value[:results]

    contents.map do |content|
      outcome = outcomes[content.id]
      {
        content_id: content.id,
        facts: outcome&.dig(:facts) || [],
        error: outcome&.dig(:error)
      }
    end
  end

  private

  # Three-step pipeline for a single content item: reject empty content,
  # run the extractor, keep only facts that answer true to +valid?+.
  def build_extraction_pipeline(extractor)
    extractor_instance = get_extractor(extractor)

    # `step` is called without a receiver, so the block is expected to run
    # in the pipeline's context; the extractor is reached via the closure.
    SimpleFlow::Pipeline.new do
      # Step 1: Validate content
      step ->(result) {
        content = result.value
        if content.nil? || content.raw_text.blank?
          result.halt("Content is empty or missing")
        else
          result.continue(content)
        end
      }

      # Step 2: Extract facts
      step ->(result) {
        begin
          result.continue(extractor_instance.extract(result.value))
        rescue StandardError => e
          result.halt("Extraction failed: #{e.message}")
        end
      }

      # Step 3: Validate extracted facts
      step ->(result) { result.continue(result.value.select(&:valid?)) }
    end
  end

  # One dependency-free step per distinct content id, followed by an
  # "aggregate" step that depends on all of them.
  #
  # NOTE(review): every step merges its outcome into the hash carried in
  # +result.value+. This assumes SimpleFlow hands each step a result that
  # already contains the other steps' merges - confirm for concurrent runs.
  def build_parallel_pipeline(contents, extractor)
    extractor_instance = get_extractor(extractor)
    # Outcomes are keyed by id, so duplicates would only yield clashing step
    # names and repeated extractor calls.
    unique_contents = contents.uniq(&:id)
    step_names = unique_contents.map { |content| "extract_#{content.id}" }

    SimpleFlow::Pipeline.new do
      # Create a step for each content item
      unique_contents.zip(step_names).each do |content, step_name|
        step step_name, depends_on: [] do |result|
          outcome =
            begin
              { facts: extractor_instance.extract(content).select(&:valid?), error: nil }
            rescue StandardError => e
              { facts: [], error: e.message }
            end

          merged = result.value[:results].merge(content.id => outcome)
          result.continue(result.value.merge(results: merged))
        end
      end

      # Aggregate results
      step "aggregate", depends_on: step_names do |result|
        result.continue(result.value)
      end
    end
  end

  # Instantiate the extractor for the given type.
  #
  # @param extractor [Symbol, String, nil] extractor type
  # @raise [ConfigurationError] for nil or any unrecognised type
  def get_extractor(extractor)
    # nil (and anything else without to_sym) falls through to the else branch
    # instead of raising NoMethodError.
    key = extractor.respond_to?(:to_sym) ? extractor.to_sym : extractor

    case key
    when :manual
      Extractors::ManualExtractor.new(config)
    when :llm
      Extractors::LLMExtractor.new(config)
    when :rule_based
      Extractors::RuleBasedExtractor.new(config)
    else
      raise ConfigurationError, "Unknown extractor: #{extractor}"
    end
  end
end
145
+ end
146
+ end
@@ -0,0 +1,129 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "simple_flow"
4
+
5
+ module FactDb
6
+ module Pipeline
7
# Pipeline for resolving entities and facts using SimpleFlow.
# Each name / entity id becomes its own dependency-free SimpleFlow step.
#
# @example Resolve entities in parallel
#   pipeline = ResolutionPipeline.new(config)
#   results = pipeline.resolve_entities(["John Smith", "Jane Doe", "Acme Corp"])
#
class ResolutionPipeline
  attr_reader :config, :entity_resolver, :fact_resolver

  # @param config [Object] configuration handed to both resolvers
  def initialize(config = FactDb.config)
    @config = config
    @entity_resolver = Resolution::EntityResolver.new(config)
    @fact_resolver = Resolution::FactResolver.new(config)
  end

  # Resolve multiple entity names in parallel
  #
  # @param names [Array<String>] Entity names to resolve
  # @param type [Symbol, nil] Entity type filter
  # @return [Array<Hash>] one hash per input name with :name, :entity,
  #   :status (:resolved, :not_found, :error, or :failed when no outcome
  #   was recorded) and :error
  def resolve_entities(names, type: nil)
    pipeline = build_entity_resolution_pipeline(names, type)
    # Explicit braces: a braceless hash would be parsed as keyword arguments
    # under Ruby 3 rather than as the positional result value.
    initial_result = SimpleFlow::Result.new({ names: names, resolved: {} })

    resolved = pipeline.call(initial_result).value[:resolved]

    names.map do |name|
      resolution = resolved[name]
      {
        name: name,
        entity: resolution&.dig(:entity),
        status: resolution&.dig(:status) || :failed,
        error: resolution&.dig(:error)
      }
    end
  end

  # Find and resolve conflicts for multiple entities in parallel
  #
  # @param entity_ids [Array<Integer>] Entity IDs to check for conflicts
  # @return [Array<Hash>] one hash per input id with :entity_id, :conflicts,
  #   :conflict_count and :error (message of a failed lookup, otherwise nil)
  def detect_conflicts(entity_ids)
    pipeline = build_conflict_detection_pipeline(entity_ids)
    initial_result = SimpleFlow::Result.new(
      { entity_ids: entity_ids, conflicts: {}, errors: {} }
    )

    final_value = pipeline.call(initial_result).value
    failures = final_value[:errors] || {}

    entity_ids.map do |entity_id|
      conflicts = final_value[:conflicts][entity_id]
      {
        entity_id: entity_id,
        conflicts: conflicts || [],
        conflict_count: conflicts&.size || 0,
        error: failures[entity_id]
      }
    end
  end

  private

  # One dependency-free step per distinct name plus a final "aggregate" step.
  #
  # NOTE(review): every step merges its outcome into the hash carried in
  # +result.value+. This assumes SimpleFlow hands each step a result that
  # already contains the other steps' merges - confirm for concurrent runs.
  def build_entity_resolution_pipeline(names, type)
    resolver = @entity_resolver
    # Outcomes are keyed by name, so repeated names need a single step.
    # Index-based step names cannot collide, unlike names built from String#hash.
    unique_names = names.uniq
    step_names = unique_names.each_index.map { |index| "resolve_#{index}" }

    SimpleFlow::Pipeline.new do
      # Create parallel resolution steps
      unique_names.zip(step_names).each do |name, step_name|
        step step_name, depends_on: [] do |result|
          outcome =
            begin
              entity = resolver.resolve(name, type: type)
              { entity: entity, status: entity ? :resolved : :not_found, error: nil }
            rescue StandardError => e
              { entity: nil, status: :error, error: e.message }
            end

          merged = result.value[:resolved].merge(name => outcome)
          result.continue(result.value.merge(resolved: merged))
        end
      end

      # Aggregate
      step "aggregate", depends_on: step_names do |result|
        result.continue(result.value)
      end
    end
  end

  # One dependency-free step per distinct entity id plus a final "aggregate"
  # step. A failing lookup yields an empty conflict list and records the
  # exception message under +:errors+ instead of discarding it.
  def build_conflict_detection_pipeline(entity_ids)
    resolver = @fact_resolver
    unique_ids = entity_ids.uniq
    step_names = unique_ids.map { |entity_id| "conflicts_#{entity_id}" }

    SimpleFlow::Pipeline.new do
      # Create parallel conflict detection steps
      unique_ids.zip(step_names).each do |entity_id, step_name|
        step step_name, depends_on: [] do |result|
          found, failure =
            begin
              [resolver.find_conflicts(entity_id: entity_id), nil]
            rescue StandardError => e
              [[], e.message]
            end

          value = result.value
          updated = value.merge(conflicts: value[:conflicts].merge(entity_id => found))
          if failure
            updated = updated.merge(errors: (value[:errors] || {}).merge(entity_id => failure))
          end
          result.continue(updated)
        end
      end

      # Aggregate
      step "aggregate", depends_on: step_names do |result|
        result.continue(result.value)
      end
    end
  end
end
128
+ end
129
+ end