fact_db 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. checksums.yaml +4 -4
  2. data/.envrc +2 -0
  3. data/.yardopts +5 -0
  4. data/CHANGELOG.md +64 -0
  5. data/README.md +107 -6
  6. data/Rakefile +243 -10
  7. data/db/migrate/001_enable_extensions.rb +1 -0
  8. data/db/migrate/002_create_sources.rb +49 -0
  9. data/db/migrate/003_create_entities.rb +27 -15
  10. data/db/migrate/004_create_entity_aliases.rb +20 -7
  11. data/db/migrate/005_create_facts.rb +37 -21
  12. data/db/migrate/006_create_entity_mentions.rb +14 -6
  13. data/db/migrate/007_create_fact_sources.rb +16 -8
  14. data/docs/api/extractors/index.md +5 -5
  15. data/docs/api/extractors/llm.md +17 -17
  16. data/docs/api/extractors/rule-based.md +14 -14
  17. data/docs/api/facts.md +20 -20
  18. data/docs/api/index.md +4 -4
  19. data/docs/api/models/entity.md +21 -21
  20. data/docs/api/models/fact.md +15 -15
  21. data/docs/api/models/index.md +7 -7
  22. data/docs/api/models/{content.md → source.md} +29 -29
  23. data/docs/api/pipeline/extraction.md +25 -25
  24. data/docs/api/pipeline/index.md +1 -1
  25. data/docs/api/pipeline/resolution.md +4 -4
  26. data/docs/api/services/entity-service.md +20 -20
  27. data/docs/api/services/fact-service.md +12 -12
  28. data/docs/api/services/index.md +5 -5
  29. data/docs/api/services/{content-service.md → source-service.md} +27 -27
  30. data/docs/architecture/database-schema.md +46 -46
  31. data/docs/architecture/entity-resolution.md +6 -6
  32. data/docs/architecture/index.md +10 -10
  33. data/docs/architecture/temporal-facts.md +5 -5
  34. data/docs/architecture/three-layer-model.md +17 -17
  35. data/docs/concepts.md +6 -6
  36. data/docs/examples/basic-usage.md +20 -20
  37. data/docs/examples/hr-onboarding.md +17 -17
  38. data/docs/examples/index.md +4 -4
  39. data/docs/examples/news-analysis.md +23 -23
  40. data/docs/getting-started/database-setup.md +28 -20
  41. data/docs/getting-started/index.md +3 -3
  42. data/docs/getting-started/quick-start.md +33 -30
  43. data/docs/guides/batch-processing.md +26 -26
  44. data/docs/guides/configuration.md +158 -77
  45. data/docs/guides/entity-management.md +14 -14
  46. data/docs/guides/extracting-facts.md +28 -28
  47. data/docs/guides/ingesting-content.md +14 -14
  48. data/docs/guides/llm-integration.md +40 -32
  49. data/docs/guides/temporal-queries.md +11 -11
  50. data/docs/index.md +6 -2
  51. data/examples/.envrc +4 -0
  52. data/examples/.gitignore +1 -0
  53. data/examples/001_configuration.rb +312 -0
  54. data/examples/{basic_usage.rb → 010_basic_usage.rb} +47 -56
  55. data/examples/{entity_management.rb → 020_entity_management.rb} +57 -72
  56. data/examples/{temporal_queries.rb → 030_temporal_queries.rb} +39 -59
  57. data/examples/040_output_formats.rb +177 -0
  58. data/examples/{rule_based_extraction.rb → 050_rule_based_extraction.rb} +39 -45
  59. data/examples/060_fluent_temporal_api.rb +217 -0
  60. data/examples/070_introspection.rb +252 -0
  61. data/examples/{hr_system.rb → 080_hr_system.rb} +56 -75
  62. data/examples/090_ingest_demo.rb +515 -0
  63. data/examples/100_query_context.rb +668 -0
  64. data/examples/110_prove_it.rb +204 -0
  65. data/examples/120_dump_database.rb +358 -0
  66. data/examples/130_rag_feedback_loop.rb +858 -0
  67. data/examples/README.md +229 -15
  68. data/examples/data/lincoln_associates.md +201 -0
  69. data/examples/data/lincoln_biography.md +66 -0
  70. data/examples/data/lincoln_cabinet.md +243 -0
  71. data/examples/data/lincoln_family.md +163 -0
  72. data/examples/data/lincoln_military.md +241 -0
  73. data/examples/data/lincoln_todd_family.md +136 -0
  74. data/examples/ingest_reporter.rb +335 -0
  75. data/examples/utilities.rb +182 -0
  76. data/lib/fact_db/config/defaults.yml +254 -0
  77. data/lib/fact_db/config.rb +94 -35
  78. data/lib/fact_db/database.rb +98 -8
  79. data/lib/fact_db/extractors/base.rb +106 -21
  80. data/lib/fact_db/extractors/llm_extractor.rb +35 -63
  81. data/lib/fact_db/extractors/manual_extractor.rb +46 -6
  82. data/lib/fact_db/extractors/rule_based_extractor.rb +136 -25
  83. data/lib/fact_db/llm/adapter.rb +3 -3
  84. data/lib/fact_db/models/entity.rb +94 -22
  85. data/lib/fact_db/models/entity_alias.rb +41 -7
  86. data/lib/fact_db/models/entity_mention.rb +34 -1
  87. data/lib/fact_db/models/fact.rb +259 -28
  88. data/lib/fact_db/models/fact_source.rb +43 -9
  89. data/lib/fact_db/models/source.rb +113 -0
  90. data/lib/fact_db/pipeline/extraction_pipeline.rb +35 -35
  91. data/lib/fact_db/pipeline/resolution_pipeline.rb +5 -5
  92. data/lib/fact_db/query_result.rb +202 -0
  93. data/lib/fact_db/resolution/entity_resolver.rb +139 -39
  94. data/lib/fact_db/resolution/fact_resolver.rb +86 -14
  95. data/lib/fact_db/services/entity_service.rb +246 -37
  96. data/lib/fact_db/services/fact_service.rb +254 -17
  97. data/lib/fact_db/services/source_service.rb +164 -0
  98. data/lib/fact_db/temporal/query.rb +71 -7
  99. data/lib/fact_db/temporal/query_builder.rb +69 -0
  100. data/lib/fact_db/temporal/timeline.rb +102 -11
  101. data/lib/fact_db/transformers/base.rb +77 -0
  102. data/lib/fact_db/transformers/cypher_transformer.rb +185 -0
  103. data/lib/fact_db/transformers/json_transformer.rb +17 -0
  104. data/lib/fact_db/transformers/raw_transformer.rb +35 -0
  105. data/lib/fact_db/transformers/text_transformer.rb +114 -0
  106. data/lib/fact_db/transformers/triple_transformer.rb +138 -0
  107. data/lib/fact_db/validation/alias_filter.rb +185 -0
  108. data/lib/fact_db/version.rb +1 -1
  109. data/lib/fact_db.rb +281 -30
  110. data/mkdocs.yml +2 -2
  111. metadata +60 -16
  112. data/db/migrate/002_create_contents.rb +0 -44
  113. data/lib/fact_db/models/content.rb +0 -62
  114. data/lib/fact_db/services/content_service.rb +0 -93
@@ -4,16 +4,16 @@ require "simple_flow"
4
4
 
5
5
  module FactDb
6
6
  module Pipeline
7
- # Pipeline for extracting facts from content using SimpleFlow
8
- # Supports parallel processing of multiple content items
7
+ # Pipeline for extracting facts from sources using SimpleFlow
8
+ # Supports parallel processing of multiple source items
9
9
  #
10
10
  # @example Sequential extraction
11
11
  # pipeline = ExtractionPipeline.new(config)
12
- # results = pipeline.process([content1, content2], extractor: :llm)
12
+ # results = pipeline.process([source1, source2], extractor: :llm)
13
13
  #
14
14
  # @example Parallel extraction
15
15
  # pipeline = ExtractionPipeline.new(config)
16
- # results = pipeline.process_parallel([content1, content2, content3], extractor: :llm)
16
+ # results = pipeline.process_parallel([source1, source2, source3], extractor: :llm)
17
17
  #
18
18
  class ExtractionPipeline
19
19
  attr_reader :config
@@ -22,40 +22,40 @@ module FactDb
22
22
  @config = config
23
23
  end
24
24
 
25
- # Process multiple content items sequentially
25
+ # Process multiple source items sequentially
26
26
  #
27
- # @param contents [Array<Models::Content>] Content records to process
27
+ # @param sources [Array<Models::Source>] Source records to process
28
28
  # @param extractor [Symbol] Extractor type (:manual, :llm, :rule_based)
29
- # @return [Array<Hash>] Results with extracted facts per content
30
- def process(contents, extractor: config.default_extractor)
29
+ # @return [Array<Hash>] Results with extracted facts per source
30
+ def process(sources, extractor: config.default_extractor)
31
31
  pipeline = build_extraction_pipeline(extractor)
32
32
 
33
- contents.map do |content|
34
- result = pipeline.call(SimpleFlow::Result.new(content))
33
+ sources.map do |source|
34
+ result = pipeline.call(SimpleFlow::Result.new(source))
35
35
  {
36
- content_id: content.id,
36
+ source_id: source.id,
37
37
  facts: result.success? ? result.value : [],
38
38
  error: result.halted? ? result.error : nil
39
39
  }
40
40
  end
41
41
  end
42
42
 
43
- # Process multiple content items in parallel
43
+ # Process multiple source items in parallel
44
44
  # Uses SimpleFlow's parallel execution capabilities
45
45
  #
46
- # @param contents [Array<Models::Content>] Content records to process
46
+ # @param sources [Array<Models::Source>] Source records to process
47
47
  # @param extractor [Symbol] Extractor type (:manual, :llm, :rule_based)
48
- # @return [Array<Hash>] Results with extracted facts per content
49
- def process_parallel(contents, extractor: config.default_extractor)
50
- pipeline = build_parallel_pipeline(contents, extractor)
51
- initial_result = SimpleFlow::Result.new(contents: contents, results: {})
48
+ # @return [Array<Hash>] Results with extracted facts per source
49
+ def process_parallel(sources, extractor: config.default_extractor)
50
+ pipeline = build_parallel_pipeline(sources, extractor)
51
+ initial_result = SimpleFlow::Result.new(sources: sources, results: {})
52
52
 
53
53
  final_result = pipeline.call(initial_result)
54
54
 
55
- contents.map do |content|
56
- result = final_result.value[:results][content.id]
55
+ sources.map do |source|
56
+ result = final_result.value[:results][source.id]
57
57
  {
58
- content_id: content.id,
58
+ source_id: source.id,
59
59
  facts: result&.dig(:facts) || [],
60
60
  error: result&.dig(:error)
61
61
  }
@@ -68,21 +68,21 @@ module FactDb
68
68
  extractor_instance = get_extractor(extractor)
69
69
 
70
70
  SimpleFlow::Pipeline.new do
71
- # Step 1: Validate content
71
+ # Step 1: Validate source
72
72
  step ->(result) {
73
- content = result.value
74
- if content.nil? || content.raw_text.blank?
75
- result.halt("Content is empty or missing")
73
+ source = result.value
74
+ if source.nil? || source.content.blank?
75
+ result.halt("Source content is empty or missing")
76
76
  else
77
- result.continue(content)
77
+ result.continue(source)
78
78
  end
79
79
  }
80
80
 
81
81
  # Step 2: Extract facts
82
82
  step ->(result) {
83
- content = result.value
83
+ source = result.value
84
84
  begin
85
- facts = extractor_instance.extract(content)
85
+ facts = extractor_instance.extract(source)
86
86
  result.continue(facts)
87
87
  rescue StandardError => e
88
88
  result.halt("Extraction failed: #{e.message}")
@@ -98,24 +98,24 @@ module FactDb
98
98
  end
99
99
  end
100
100
 
101
- def build_parallel_pipeline(contents, extractor)
101
+ def build_parallel_pipeline(sources, extractor)
102
102
  extractor_instance = get_extractor(extractor)
103
103
 
104
104
  SimpleFlow::Pipeline.new do
105
- # Create a step for each content item
106
- contents.each do |content|
107
- step "extract_#{content.id}", depends_on: [] do |result|
105
+ # Create a step for each source item
106
+ sources.each do |source|
107
+ step "extract_#{source.id}", depends_on: [] do |result|
108
108
  begin
109
- facts = extractor_instance.extract(content)
109
+ facts = extractor_instance.extract(source)
110
110
  valid_facts = facts.select { |f| f.valid? }
111
111
 
112
112
  new_results = result.value[:results].merge(
113
- content.id => { facts: valid_facts, error: nil }
113
+ source.id => { facts: valid_facts, error: nil }
114
114
  )
115
115
  result.continue(result.value.merge(results: new_results))
116
116
  rescue StandardError => e
117
117
  new_results = result.value[:results].merge(
118
- content.id => { facts: [], error: e.message }
118
+ source.id => { facts: [], error: e.message }
119
119
  )
120
120
  result.continue(result.value.merge(results: new_results))
121
121
  end
@@ -123,7 +123,7 @@ module FactDb
123
123
  end
124
124
 
125
125
  # Aggregate results
126
- step "aggregate", depends_on: contents.map { |c| "extract_#{c.id}" } do |result|
126
+ step "aggregate", depends_on: sources.map { |s| "extract_#{s.id}" } do |result|
127
127
  result.continue(result.value)
128
128
  end
129
129
  end
@@ -23,10 +23,10 @@ module FactDb
23
23
  # Resolve multiple entity names in parallel
24
24
  #
25
25
  # @param names [Array<String>] Entity names to resolve
26
- # @param type [Symbol, nil] Entity type filter
26
+ # @param kind [Symbol, nil] Entity kind filter
27
27
  # @return [Array<Hash>] Resolution results
28
- def resolve_entities(names, type: nil)
29
- pipeline = build_entity_resolution_pipeline(names, type)
28
+ def resolve_entities(names, kind: nil)
29
+ pipeline = build_entity_resolution_pipeline(names, kind)
30
30
  initial_result = SimpleFlow::Result.new(names: names, resolved: {})
31
31
 
32
32
  final_result = pipeline.call(initial_result)
@@ -64,7 +64,7 @@ module FactDb
64
64
 
65
65
  private
66
66
 
67
- def build_entity_resolution_pipeline(names, type)
67
+ def build_entity_resolution_pipeline(names, kind)
68
68
  resolver = @entity_resolver
69
69
 
70
70
  SimpleFlow::Pipeline.new do
@@ -72,7 +72,7 @@ module FactDb
72
72
  names.each do |name|
73
73
  step "resolve_#{name.hash.abs}", depends_on: [] do |result|
74
74
  begin
75
- entity = resolver.resolve(name, type: type)
75
+ entity = resolver.resolve(name, kind: kind)
76
76
  status = entity ? :resolved : :not_found
77
77
 
78
78
  new_resolved = result.value[:resolved].merge(
@@ -0,0 +1,202 @@
1
+ # frozen_string_literal: true
2
+
3
module FactDb
  # Holds the results of a query and provides a unified interface
  # for transformers to work with.
  #
  # Facts and entities are stored as plain hashes with symbol keys at the
  # top level, regardless of whether they were supplied as hashes, as
  # objects responding to +as_json+, or as plain objects with reader methods.
  #
  # @example Basic usage
  #   result = QueryResult.new(query: "Paula Chen")
  #   result.add_facts(facts)
  #   result.resolve_entities(entity_service)
  #   triples = TripleTransformer.new.transform(result)
  #
  class QueryResult
    attr_reader :query, :facts, :entities, :metadata, :raw_facts

    # @param query [Object] The query these results belong to (stored as given)
    def initialize(query:)
      @query = query
      @facts = []
      @raw_facts = []
      @entities = {}
      @metadata = {
        retrieved_at: Time.now,
        stores_queried: [:fact_db]
      }
    end

    # Add facts to the result set.
    #
    # The untouched inputs are appended to {#raw_facts}; their normalized
    # hash form is appended to {#facts}. +nil+ and empty collections are
    # ignored. A single Hash is treated as one fact.
    #
    # @param facts [Array<Fact>, Array<Hash>, Hash, nil] Facts to add
    # @return [void]
    def add_facts(facts)
      return if facts.nil?
      return if facts.respond_to?(:empty?) && facts.empty?

      # Kernel#Array would explode a lone Hash into [key, value] pairs,
      # so wrap it explicitly; everything else is coerced once and reused
      # for both the raw and the normalized list.
      batch = facts.is_a?(Hash) ? [facts] : Array(facts)

      @raw_facts += batch
      @facts += normalize_facts(batch)
    end

    # Resolve and cache entities mentioned in facts.
    #
    # Ids already present in {#entities} are not looked up again. Lookups
    # that return nil (or fail) leave no entry behind.
    #
    # @param entity_service [EntityService, nil] Service used to resolve
    #   entities; anything responding to +find(id)+ works
    # @return [void]
    def resolve_entities(entity_service = nil)
      entity_ids = collect_entity_ids
      return if entity_ids.empty?

      entity_ids.each do |id|
        next if @entities[id]

        entity = resolve_entity(entity_service, id)
        @entities[id] = normalize_entity(entity) if entity
      end
    end

    # Check if results are empty.
    #
    # @return [Boolean]
    def empty?
      @facts.empty?
    end

    # Get all items for comparison operations.
    #
    # @return [Array<Hash>] One +{ type:, text:, valid_at: }+ hash per fact
    def items
      @facts.map { |fact| normalize_for_comparison(fact) }
    end

    # Convert to hash for JSON serialization.
    #
    # @return [Hash]
    def to_h
      {
        query: @query,
        facts: @facts,
        entities: @entities,
        metadata: @metadata
      }
    end

    # Hash-like access for backward compatibility.
    #
    # @param key [Symbol, String] Key to access
    # @return [Object, nil] Value for the key, nil for unknown keys
    def [](key)
      # Behave like Hash#[] for keys that cannot be symbolized (e.g. nil).
      return nil unless key.respond_to?(:to_sym)

      to_h[key.to_sym]
    end

    # Iterate over all facts.
    #
    # @yield [Hash] Each normalized fact
    # @return [void]
    def each_fact(&block)
      @facts.each(&block)
    end

    # Iterate over all entities.
    #
    # @yield [Hash] Each normalized entity
    # @return [void]
    def each_entity(&block)
      @entities.values.each(&block)
    end

    # Get count of facts.
    #
    # @return [Integer]
    def fact_count
      @facts.size
    end

    # Get count of entities.
    #
    # @return [Integer]
    def entity_count
      @entities.size
    end

    private

    # Turn each fact into a hash with symbol keys at the top level.
    #
    # NOTE(review): when ActiveSupport is loaded every object responds to
    # +as_json+, so model facts take the second branch; whether the result
    # then contains :entity_mentions depends on the model's +as_json+ --
    # confirm against the Fact model.
    def normalize_facts(facts)
      facts.map do |fact|
        if fact.is_a?(Hash)
          symbolize_keys(fact)
        elsif fact.respond_to?(:as_json)
          symbolize_keys(fact.as_json)
        else
          {
            id: fact.id,
            text: fact.text,
            valid_at: fact.valid_at,
            invalid_at: fact.invalid_at,
            status: fact.status,
            confidence: fact.respond_to?(:confidence) ? fact.confidence : nil,
            entity_mentions: extract_mentions(fact)
          }
        end
      end
    end

    # Build mention hashes for a fact object; hashes pass through untouched.
    def extract_mentions(fact)
      return [] unless fact.respond_to?(:entity_mentions)

      fact.entity_mentions.map do |mention|
        if mention.is_a?(Hash)
          mention
        else
          {
            entity_id: mention.entity_id,
            mention_role: mention.mention_role,
            mention_text: mention.respond_to?(:mention_text) ? mention.mention_text : nil,
            confidence: mention.respond_to?(:confidence) ? mention.confidence : nil
          }
        end
      end
    end

    # Turn an entity into a hash with symbol keys at the top level.
    def normalize_entity(entity)
      if entity.is_a?(Hash)
        symbolize_keys(entity)
      elsif entity.respond_to?(:as_json)
        symbolize_keys(entity.as_json)
      else
        {
          id: entity.id,
          name: entity.name,
          kind: entity.kind,
          aliases: entity.respond_to?(:aliases) ? entity.aliases.map { |a| { name: a.name, kind: a.kind } } : [],
          resolution_status: entity.respond_to?(:resolution_status) ? entity.resolution_status : nil
        }
      end
    end

    # Unique, non-nil entity ids referenced by the stored facts, in first-seen
    # order. Uses Array#uniq rather than Set so no extra require is needed.
    def collect_entity_ids
      @facts
        .flat_map { |fact| fact[:entity_mentions] || [] }
        .map { |mention| mention_entity_id(mention) }
        .compact
        .uniq
    end

    # Read the entity id from a mention given as a Hash (symbol or string
    # key, e.g. nested +as_json+ / parsed JSON output) or as an object.
    def mention_entity_id(mention)
      case mention
      when Hash
        mention[:entity_id] || mention["entity_id"]
      else
        mention.entity_id if mention.respond_to?(:entity_id)
      end
    end

    # Best-effort lookup: a missing service, a service without +find+, or a
    # lookup that raises all yield nil so one bad id cannot abort resolution.
    def resolve_entity(entity_service, id)
      return nil unless entity_service

      if entity_service.respond_to?(:find)
        entity_service.find(id)
      end
    rescue StandardError
      nil
    end

    def normalize_for_comparison(item)
      {
        type: :fact,
        text: item[:text],
        valid_at: item[:valid_at]
      }
    end

    # Copy of +hash+ whose top-level keys are symbols; keys that cannot be
    # symbolized (e.g. integers) are kept as they are.
    def symbolize_keys(hash)
      hash.transform_keys { |key| key.respond_to?(:to_sym) ? key.to_sym : key }
    end
  end
end