fact_db 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. checksums.yaml +7 -0
  2. data/.envrc +1 -0
  3. data/CHANGELOG.md +48 -0
  4. data/COMMITS.md +196 -0
  5. data/README.md +102 -0
  6. data/Rakefile +41 -0
  7. data/db/migrate/001_enable_extensions.rb +7 -0
  8. data/db/migrate/002_create_contents.rb +44 -0
  9. data/db/migrate/003_create_entities.rb +36 -0
  10. data/db/migrate/004_create_entity_aliases.rb +18 -0
  11. data/db/migrate/005_create_facts.rb +65 -0
  12. data/db/migrate/006_create_entity_mentions.rb +18 -0
  13. data/db/migrate/007_create_fact_sources.rb +18 -0
  14. data/docs/api/extractors/index.md +71 -0
  15. data/docs/api/extractors/llm.md +162 -0
  16. data/docs/api/extractors/manual.md +92 -0
  17. data/docs/api/extractors/rule-based.md +165 -0
  18. data/docs/api/facts.md +300 -0
  19. data/docs/api/index.md +66 -0
  20. data/docs/api/models/content.md +165 -0
  21. data/docs/api/models/entity.md +202 -0
  22. data/docs/api/models/fact.md +270 -0
  23. data/docs/api/models/index.md +77 -0
  24. data/docs/api/pipeline/extraction.md +175 -0
  25. data/docs/api/pipeline/index.md +72 -0
  26. data/docs/api/pipeline/resolution.md +209 -0
  27. data/docs/api/services/content-service.md +166 -0
  28. data/docs/api/services/entity-service.md +202 -0
  29. data/docs/api/services/fact-service.md +223 -0
  30. data/docs/api/services/index.md +55 -0
  31. data/docs/architecture/database-schema.md +293 -0
  32. data/docs/architecture/entity-resolution.md +293 -0
  33. data/docs/architecture/index.md +149 -0
  34. data/docs/architecture/temporal-facts.md +268 -0
  35. data/docs/architecture/three-layer-model.md +242 -0
  36. data/docs/assets/css/custom.css +137 -0
  37. data/docs/assets/fact_db.jpg +0 -0
  38. data/docs/assets/images/fact_db.jpg +0 -0
  39. data/docs/concepts.md +183 -0
  40. data/docs/examples/basic-usage.md +235 -0
  41. data/docs/examples/hr-onboarding.md +312 -0
  42. data/docs/examples/index.md +64 -0
  43. data/docs/examples/news-analysis.md +288 -0
  44. data/docs/getting-started/database-setup.md +170 -0
  45. data/docs/getting-started/index.md +71 -0
  46. data/docs/getting-started/installation.md +98 -0
  47. data/docs/getting-started/quick-start.md +191 -0
  48. data/docs/guides/batch-processing.md +325 -0
  49. data/docs/guides/configuration.md +243 -0
  50. data/docs/guides/entity-management.md +364 -0
  51. data/docs/guides/extracting-facts.md +299 -0
  52. data/docs/guides/index.md +22 -0
  53. data/docs/guides/ingesting-content.md +252 -0
  54. data/docs/guides/llm-integration.md +299 -0
  55. data/docs/guides/temporal-queries.md +315 -0
  56. data/docs/index.md +121 -0
  57. data/examples/README.md +130 -0
  58. data/examples/basic_usage.rb +164 -0
  59. data/examples/entity_management.rb +216 -0
  60. data/examples/hr_system.rb +428 -0
  61. data/examples/rule_based_extraction.rb +258 -0
  62. data/examples/temporal_queries.rb +245 -0
  63. data/lib/fact_db/config.rb +71 -0
  64. data/lib/fact_db/database.rb +45 -0
  65. data/lib/fact_db/errors.rb +10 -0
  66. data/lib/fact_db/extractors/base.rb +117 -0
  67. data/lib/fact_db/extractors/llm_extractor.rb +179 -0
  68. data/lib/fact_db/extractors/manual_extractor.rb +53 -0
  69. data/lib/fact_db/extractors/rule_based_extractor.rb +228 -0
  70. data/lib/fact_db/llm/adapter.rb +109 -0
  71. data/lib/fact_db/models/content.rb +62 -0
  72. data/lib/fact_db/models/entity.rb +84 -0
  73. data/lib/fact_db/models/entity_alias.rb +26 -0
  74. data/lib/fact_db/models/entity_mention.rb +33 -0
  75. data/lib/fact_db/models/fact.rb +192 -0
  76. data/lib/fact_db/models/fact_source.rb +35 -0
  77. data/lib/fact_db/pipeline/extraction_pipeline.rb +146 -0
  78. data/lib/fact_db/pipeline/resolution_pipeline.rb +129 -0
  79. data/lib/fact_db/resolution/entity_resolver.rb +261 -0
  80. data/lib/fact_db/resolution/fact_resolver.rb +259 -0
  81. data/lib/fact_db/services/content_service.rb +93 -0
  82. data/lib/fact_db/services/entity_service.rb +150 -0
  83. data/lib/fact_db/services/fact_service.rb +193 -0
  84. data/lib/fact_db/temporal/query.rb +125 -0
  85. data/lib/fact_db/temporal/timeline.rb +134 -0
  86. data/lib/fact_db/version.rb +5 -0
  87. data/lib/fact_db.rb +141 -0
  88. data/mkdocs.yml +198 -0
  89. metadata +288 -0
@@ -0,0 +1,325 @@
1
+ # Batch Processing
2
+
3
+ FactDb uses the `simple_flow` gem to provide concurrent pipeline processing for efficient batch operations.
4
+
5
+ ## Overview
6
+
7
+ Batch processing is useful for:
8
+
9
+ - Processing multiple documents at once
10
+ - Resolving many entity names
11
+ - Detecting conflicts across entities
12
+ - Bulk fact extraction
13
+
14
+ ## Batch Extraction
15
+
16
+ ### Sequential Processing
17
+
18
+ Process content one at a time:
19
+
20
+ ```ruby
21
+ facts = FactDb.new
22
+
23
+ content_ids = [content1.id, content2.id, content3.id]
24
+
25
+ results = facts.batch_extract(
26
+ content_ids,
27
+ extractor: :llm,
28
+ parallel: false
29
+ )
30
+ ```
31
+
32
+ ### Parallel Processing
33
+
34
+ Process content concurrently (default):
35
+
36
+ ```ruby
37
+ results = facts.batch_extract(
38
+ content_ids,
39
+ extractor: :llm,
40
+ parallel: true # default
41
+ )
42
+
43
+ results.each do |result|
44
+ puts "Content #{result[:content_id]}:"
45
+ puts " Facts extracted: #{result[:facts].count}"
46
+ puts " Error: #{result[:error]}" if result[:error]
47
+ end
48
+ ```
49
+
50
+ ### Result Structure
51
+
52
+ ```ruby
53
+ result = {
54
+ content_id: 123,
55
+ facts: [<Fact>, <Fact>, ...], # Extracted facts
56
+ error: nil # Error message if failed
57
+ }
58
+ ```
59
+
60
+ ## Batch Entity Resolution
61
+
62
+ Resolve multiple names at once:
63
+
64
+ ```ruby
65
+ names = [
66
+ "Paula Chen",
67
+ "John Smith",
68
+ "Microsoft",
69
+ "Acme Corporation",
70
+ "Seattle"
71
+ ]
72
+
73
+ results = facts.batch_resolve_entities(names, type: nil)
74
+
75
+ results.each do |result|
76
+ case result[:status]
77
+ when :resolved
78
+ puts "#{result[:name]} -> #{result[:entity].canonical_name}"
79
+ when :not_found
80
+ puts "#{result[:name]} -> Not found"
81
+ when :error
82
+ puts "#{result[:name]} -> Error: #{result[:error]}"
83
+ end
84
+ end
85
+ ```
86
+
87
+ ### With Type Filtering
88
+
89
+ ```ruby
90
+ # Only resolve as person entities
91
+ results = facts.batch_resolve_entities(names, type: :person)
92
+ ```
93
+
94
+ ## Conflict Detection
95
+
96
+ Check multiple entities for conflicting facts:
97
+
98
+ ```ruby
99
+ entity_ids = [paula.id, john.id, microsoft.id]
100
+
101
+ results = facts.detect_fact_conflicts(entity_ids)
102
+
103
+ results.each do |result|
104
+ if result[:conflict_count] > 0
105
+ puts "Entity #{result[:entity_id]} has #{result[:conflict_count]} conflicts:"
106
+ result[:conflicts].each do |conflict|
107
+ puts " #{conflict[:fact1].fact_text}"
108
+ puts " vs"
109
+ puts " #{conflict[:fact2].fact_text}"
110
+ puts " Similarity: #{conflict[:similarity]}"
111
+ end
112
+ end
113
+ end
114
+ ```
115
+
116
+ ## Using Pipelines Directly
117
+
118
+ For more control, use the pipeline classes directly:
119
+
120
+ ### Extraction Pipeline
121
+
122
+ ```ruby
123
+ pipeline = FactDb::Pipeline::ExtractionPipeline.new(FactDb.config)
124
+
125
+ # Sequential
126
+ results = pipeline.process(contents, extractor: :llm)
127
+
128
+ # Parallel
129
+ results = pipeline.process_parallel(contents, extractor: :llm)
130
+ ```
131
+
132
+ ### Resolution Pipeline
133
+
134
+ ```ruby
135
+ pipeline = FactDb::Pipeline::ResolutionPipeline.new(FactDb.config)
136
+
137
+ # Resolve entities
138
+ results = pipeline.resolve_entities(names, type: :person)
139
+
140
+ # Detect conflicts
141
+ results = pipeline.detect_conflicts(entity_ids)
142
+ ```
143
+
144
+ ## SimpleFlow Integration
145
+
146
+ FactDb's pipelines are built on SimpleFlow:
147
+
148
+ ```ruby
149
+ require 'simple_flow'
150
+
151
+ # Create custom pipeline
152
+ pipeline = SimpleFlow::Pipeline.new do
153
+ # Step 1: Validate
154
+ step ->(result) {
155
+ content = result.value
156
+ if content.raw_text.blank?
157
+ result.halt("Empty content")
158
+ else
159
+ result.continue(content)
160
+ end
161
+ }
162
+
163
+ # Step 2: Extract
164
+ step ->(result) {
165
+ facts = extractor.extract(result.value)
166
+ result.continue(facts)
167
+ }
168
+
169
+ # Step 3: Validate facts
170
+ step ->(result) {
171
+ valid_facts = result.value.select(&:valid?)
172
+ result.continue(valid_facts)
173
+ }
174
+ end
175
+
176
+ # Execute
177
+ result = pipeline.call(SimpleFlow::Result.new(content))
178
+ ```
179
+
180
+ ## Error Handling
181
+
182
+ ### Graceful Degradation
183
+
184
+ ```ruby
185
+ results = facts.batch_extract(content_ids, extractor: :llm)
186
+
187
+ successful = results.select { |r| r[:error].nil? }
188
+ failed = results.reject { |r| r[:error].nil? }
189
+
190
+ puts "Successful: #{successful.count}"
191
+ puts "Failed: #{failed.count}"
192
+
193
+ # Retry failed items with different extractor
194
+ if failed.any?
195
+ retry_ids = failed.map { |r| r[:content_id] }
196
+ retry_results = facts.batch_extract(retry_ids, extractor: :rule_based)
197
+ end
198
+ ```
199
+
200
+ ### Logging Errors
201
+
202
+ ```ruby
203
+ results.each do |result|
204
+ if result[:error]
205
+ logger.error(
206
+ "Extraction failed " \
207
+ "(content_id=#{result[:content_id]}): " \
208
+ "#{result[:error]}"
209
+ )
210
+ end
211
+ end
212
+ ```
213
+
214
+ ## Performance Considerations
215
+
216
+ ### Optimal Batch Size
217
+
218
+ ```ruby
219
+ # Process in batches of 10-50 for optimal performance
220
+ content_ids.each_slice(25) do |batch|
221
+ results = facts.batch_extract(batch, parallel: true)
222
+ process_results(results)
223
+ end
224
+ ```
225
+
226
+ ### Rate Limiting
227
+
228
+ For LLM extraction, add delays between batches:
229
+
230
+ ```ruby
231
+ content_ids.each_slice(10) do |batch|
232
+ results = facts.batch_extract(batch, extractor: :llm)
233
+ process_results(results)
234
+ sleep(2) # Rate limit
235
+ end
236
+ ```
237
+
238
+ ### Memory Management
239
+
240
+ ```ruby
241
+ # Process results immediately to avoid memory buildup
242
+ content_ids.each_slice(50).with_index(1) do |batch, batch_count|
243
+ results = facts.batch_extract(batch)
244
+
245
+ results.each do |result|
246
+ # Process and discard
247
+ save_facts(result[:facts])
248
+ end
249
+
250
+ # Force garbage collection if needed
251
+ GC.start if batch_count % 10 == 0
252
+ end
253
+ ```
254
+
255
+ ## Monitoring
256
+
257
+ Track batch processing metrics:
258
+
259
+ ```ruby
260
+ start_time = Time.now
261
+
262
+ results = facts.batch_extract(content_ids, parallel: true)
263
+
264
+ duration = Time.now - start_time
265
+ success_rate = results.count { |r| r[:error].nil? }.to_f / results.count
266
+
267
+ puts "Processed #{results.count} items in #{duration}s"
268
+ puts "Success rate: #{(success_rate * 100).round(1)}%"
269
+ puts "Items/second: #{(results.count / duration).round(2)}"
270
+ ```
271
+
272
+ ## Best Practices
273
+
274
+ ### 1. Use Parallel for Large Batches
275
+
276
+ ```ruby
277
+ # Sequential for small batches (< 5 items)
278
+ if content_ids.count < 5
279
+ results = facts.batch_extract(content_ids, parallel: false)
280
+ else
281
+ results = facts.batch_extract(content_ids, parallel: true)
282
+ end
283
+ ```
284
+
285
+ ### 2. Handle Partial Failures
286
+
287
+ ```ruby
288
+ def process_batch(content_ids)
289
+ results = facts.batch_extract(content_ids)
290
+
291
+ {
292
+ successful: results.select { |r| r[:error].nil? },
293
+ failed: results.reject { |r| r[:error].nil? }
294
+ }
295
+ end
296
+
297
+ batch_result = process_batch(content_ids)
298
+ retry_failed(batch_result[:failed]) if batch_result[:failed].any?
299
+ ```
300
+
301
+ ### 3. Log Progress
302
+
303
+ ```ruby
304
+ total = content_ids.count
305
+ processed = 0
306
+
307
+ content_ids.each_slice(25) do |batch|
308
+ results = facts.batch_extract(batch)
309
+ processed += batch.count
310
+
311
+ logger.info "Progress: #{processed}/#{total} (#{(processed.to_f/total*100).round(1)}%)"
312
+ end
313
+ ```
314
+
315
+ ### 4. Use Appropriate Extractors
316
+
317
+ ```ruby
318
+ # LLM for complex documents
319
+ complex_docs = contents.select { |c| c.raw_text.length > 1000 }
320
+ facts.batch_extract(complex_docs.map(&:id), extractor: :llm)
321
+
322
+ # Rule-based for simple, structured content
323
+ simple_docs = contents.select { |c| c.raw_text.length <= 1000 }
324
+ facts.batch_extract(simple_docs.map(&:id), extractor: :rule_based)
325
+ ```
@@ -0,0 +1,243 @@
1
+ # Configuration
2
+
3
+ FactDb uses the `anyway_config` gem for flexible configuration via environment variables, YAML files, or Ruby code.
4
+
5
+ ## Configuration Methods
6
+
7
+ ### Environment Variables
8
+
9
+ All settings can be configured via environment variables with the `EVENT_CLOCK_` prefix:
10
+
11
+ ```bash
12
+ export EVENT_CLOCK_DATABASE_URL="postgresql://localhost/fact_db"
13
+ export EVENT_CLOCK_DATABASE_POOL_SIZE=10
14
+ export EVENT_CLOCK_LLM_PROVIDER="openai"
15
+ export EVENT_CLOCK_LLM_MODEL="gpt-4o-mini"
16
+ export EVENT_CLOCK_LLM_API_KEY="sk-..."
17
+ export EVENT_CLOCK_FUZZY_MATCH_THRESHOLD=0.85
18
+ ```
19
+
20
+ ### YAML Configuration
21
+
22
+ Create `config/fact_db.yml`:
23
+
24
+ ```yaml
25
+ # Database
26
+ database_url: postgresql://localhost/fact_db
27
+ database_pool_size: 10
28
+ database_timeout: 30000
29
+
30
+ # Embeddings
31
+ embedding_dimensions: 1536
32
+
33
+ # LLM
34
+ llm_provider: openai
35
+ llm_model: gpt-4o-mini
36
+ llm_api_key: <%= ENV['OPENAI_API_KEY'] %>
37
+
38
+ # Extraction
39
+ default_extractor: manual
40
+
41
+ # Entity Resolution
42
+ fuzzy_match_threshold: 0.85
43
+ auto_merge_threshold: 0.95
44
+
45
+ # Logging
46
+ log_level: info
47
+ ```
48
+
49
+ ### Ruby Block
50
+
51
+ ```ruby
52
+ FactDb.configure do |config|
53
+ # Database
54
+ config.database_url = "postgresql://localhost/fact_db"
55
+ config.database_pool_size = 10
56
+ config.database_timeout = 30_000
57
+
58
+ # Embeddings
59
+ config.embedding_dimensions = 1536
60
+ config.embedding_generator = ->(text) {
61
+ # Your embedding generation logic
62
+ OpenAI::Client.new.embeddings(input: text)
63
+ }
64
+
65
+ # LLM
66
+ config.llm_provider = :openai
67
+ config.llm_model = "gpt-4o-mini"
68
+ config.llm_api_key = ENV['OPENAI_API_KEY']
69
+
70
+ # Or provide a pre-configured client
71
+ config.llm_client = FactDb::LLM::Adapter.new(
72
+ provider: :anthropic,
73
+ model: "claude-sonnet-4-20250514"
74
+ )
75
+
76
+ # Extraction
77
+ config.default_extractor = :llm
78
+
79
+ # Entity Resolution
80
+ config.fuzzy_match_threshold = 0.85
81
+ config.auto_merge_threshold = 0.95
82
+
83
+ # Logging
84
+ config.logger = Rails.logger
85
+ config.log_level = :debug
86
+ end
87
+ ```
88
+
89
+ ## Configuration Options
90
+
91
+ ### Database Settings
92
+
93
+ | Option | Type | Default | Description |
94
+ |--------|------|---------|-------------|
95
+ | `database_url` | String | nil | PostgreSQL connection URL (required) |
96
+ | `database_pool_size` | Integer | 5 | Connection pool size |
97
+ | `database_timeout` | Integer | 30000 | Query timeout in milliseconds |
98
+
99
+ ### Embedding Settings
100
+
101
+ | Option | Type | Default | Description |
102
+ |--------|------|---------|-------------|
103
+ | `embedding_dimensions` | Integer | 1536 | Vector dimensions (match your model) |
104
+ | `embedding_generator` | Proc | nil | Custom embedding generation function |
105
+
106
+ ### LLM Settings
107
+
108
+ | Option | Type | Default | Description |
109
+ |--------|------|---------|-------------|
110
+ | `llm_client` | Object | nil | Pre-configured LLM client |
111
+ | `llm_provider` | Symbol | nil | Provider name (:openai, :anthropic, etc.) |
112
+ | `llm_model` | String | varies | Model name |
113
+ | `llm_api_key` | String | nil | API key |
114
+
115
+ ### Extraction Settings
116
+
117
+ | Option | Type | Default | Description |
118
+ |--------|------|---------|-------------|
119
+ | `default_extractor` | Symbol | :manual | Default extraction method |
120
+
121
+ ### Resolution Settings
122
+
123
+ | Option | Type | Default | Description |
124
+ |--------|------|---------|-------------|
125
+ | `fuzzy_match_threshold` | Float | 0.85 | Minimum similarity for fuzzy matching |
126
+ | `auto_merge_threshold` | Float | 0.95 | Similarity threshold for auto-merge |
127
+
128
+ ### Logging Settings
129
+
130
+ | Option | Type | Default | Description |
131
+ |--------|------|---------|-------------|
132
+ | `logger` | Logger | STDOUT | Logger instance |
133
+ | `log_level` | Symbol | :info | Log level |
134
+
135
+ ## LLM Provider Configuration
136
+
137
+ ### OpenAI
138
+
139
+ ```ruby
140
+ FactDb.configure do |config|
141
+ config.llm_provider = :openai
142
+ config.llm_model = "gpt-4o-mini" # or "gpt-4o", "gpt-4-turbo"
143
+ config.llm_api_key = ENV['OPENAI_API_KEY']
144
+ end
145
+ ```
146
+
147
+ ### Anthropic
148
+
149
+ ```ruby
150
+ FactDb.configure do |config|
151
+ config.llm_provider = :anthropic
152
+ config.llm_model = "claude-sonnet-4-20250514"
153
+ config.llm_api_key = ENV['ANTHROPIC_API_KEY']
154
+ end
155
+ ```
156
+
157
+ ### Google Gemini
158
+
159
+ ```ruby
160
+ FactDb.configure do |config|
161
+ config.llm_provider = :gemini
162
+ config.llm_model = "gemini-2.0-flash"
163
+ config.llm_api_key = ENV['GEMINI_API_KEY']
164
+ end
165
+ ```
166
+
167
+ ### Ollama (Local)
168
+
169
+ ```ruby
170
+ FactDb.configure do |config|
171
+ config.llm_provider = :ollama
172
+ config.llm_model = "llama3.2"
173
+ # No API key needed for local Ollama
174
+ end
175
+ ```
176
+
177
+ ### AWS Bedrock
178
+
179
+ ```ruby
180
+ FactDb.configure do |config|
181
+ config.llm_provider = :bedrock
182
+ config.llm_model = "claude-sonnet-4"
183
+ # Uses AWS credentials from environment
184
+ end
185
+ ```
186
+
187
+ ### OpenRouter
188
+
189
+ ```ruby
190
+ FactDb.configure do |config|
191
+ config.llm_provider = :openrouter
192
+ config.llm_model = "anthropic/claude-sonnet-4"
193
+ config.llm_api_key = ENV['OPENROUTER_API_KEY']
194
+ end
195
+ ```
196
+
197
+ ## Environment-Specific Configuration
198
+
199
+ Use YAML anchors for shared settings:
200
+
201
+ ```yaml
202
+ # config/fact_db.yml
203
+ defaults: &defaults
204
+ embedding_dimensions: 1536
205
+ fuzzy_match_threshold: 0.85
206
+
207
+ development:
208
+ <<: *defaults
209
+ database_url: postgresql://localhost/fact_db_dev
210
+ log_level: debug
211
+
212
+ test:
213
+ <<: *defaults
214
+ database_url: postgresql://localhost/fact_db_test
215
+ log_level: warn
216
+
217
+ production:
218
+ <<: *defaults
219
+ database_url: <%= ENV['DATABASE_URL'] %>
220
+ log_level: info
221
+ ```
222
+
223
+ ## Validation
224
+
225
+ Validate configuration at startup:
226
+
227
+ ```ruby
228
+ FactDb.configure do |config|
229
+ config.database_url = ENV['DATABASE_URL']
230
+ end
231
+
232
+ # Raises ConfigurationError if invalid
233
+ FactDb.config.validate!
234
+ ```
235
+
236
+ ## Reset Configuration
237
+
238
+ For testing, reset configuration between tests:
239
+
240
+ ```ruby
241
+ # In test setup
242
+ FactDb.reset_configuration!
243
+ ```