fact_db 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.envrc +1 -0
- data/CHANGELOG.md +48 -0
- data/COMMITS.md +196 -0
- data/README.md +102 -0
- data/Rakefile +41 -0
- data/db/migrate/001_enable_extensions.rb +7 -0
- data/db/migrate/002_create_contents.rb +44 -0
- data/db/migrate/003_create_entities.rb +36 -0
- data/db/migrate/004_create_entity_aliases.rb +18 -0
- data/db/migrate/005_create_facts.rb +65 -0
- data/db/migrate/006_create_entity_mentions.rb +18 -0
- data/db/migrate/007_create_fact_sources.rb +18 -0
- data/docs/api/extractors/index.md +71 -0
- data/docs/api/extractors/llm.md +162 -0
- data/docs/api/extractors/manual.md +92 -0
- data/docs/api/extractors/rule-based.md +165 -0
- data/docs/api/facts.md +300 -0
- data/docs/api/index.md +66 -0
- data/docs/api/models/content.md +165 -0
- data/docs/api/models/entity.md +202 -0
- data/docs/api/models/fact.md +270 -0
- data/docs/api/models/index.md +77 -0
- data/docs/api/pipeline/extraction.md +175 -0
- data/docs/api/pipeline/index.md +72 -0
- data/docs/api/pipeline/resolution.md +209 -0
- data/docs/api/services/content-service.md +166 -0
- data/docs/api/services/entity-service.md +202 -0
- data/docs/api/services/fact-service.md +223 -0
- data/docs/api/services/index.md +55 -0
- data/docs/architecture/database-schema.md +293 -0
- data/docs/architecture/entity-resolution.md +293 -0
- data/docs/architecture/index.md +149 -0
- data/docs/architecture/temporal-facts.md +268 -0
- data/docs/architecture/three-layer-model.md +242 -0
- data/docs/assets/css/custom.css +137 -0
- data/docs/assets/fact_db.jpg +0 -0
- data/docs/assets/images/fact_db.jpg +0 -0
- data/docs/concepts.md +183 -0
- data/docs/examples/basic-usage.md +235 -0
- data/docs/examples/hr-onboarding.md +312 -0
- data/docs/examples/index.md +64 -0
- data/docs/examples/news-analysis.md +288 -0
- data/docs/getting-started/database-setup.md +170 -0
- data/docs/getting-started/index.md +71 -0
- data/docs/getting-started/installation.md +98 -0
- data/docs/getting-started/quick-start.md +191 -0
- data/docs/guides/batch-processing.md +325 -0
- data/docs/guides/configuration.md +243 -0
- data/docs/guides/entity-management.md +364 -0
- data/docs/guides/extracting-facts.md +299 -0
- data/docs/guides/index.md +22 -0
- data/docs/guides/ingesting-content.md +252 -0
- data/docs/guides/llm-integration.md +299 -0
- data/docs/guides/temporal-queries.md +315 -0
- data/docs/index.md +121 -0
- data/examples/README.md +130 -0
- data/examples/basic_usage.rb +164 -0
- data/examples/entity_management.rb +216 -0
- data/examples/hr_system.rb +428 -0
- data/examples/rule_based_extraction.rb +258 -0
- data/examples/temporal_queries.rb +245 -0
- data/lib/fact_db/config.rb +71 -0
- data/lib/fact_db/database.rb +45 -0
- data/lib/fact_db/errors.rb +10 -0
- data/lib/fact_db/extractors/base.rb +117 -0
- data/lib/fact_db/extractors/llm_extractor.rb +179 -0
- data/lib/fact_db/extractors/manual_extractor.rb +53 -0
- data/lib/fact_db/extractors/rule_based_extractor.rb +228 -0
- data/lib/fact_db/llm/adapter.rb +109 -0
- data/lib/fact_db/models/content.rb +62 -0
- data/lib/fact_db/models/entity.rb +84 -0
- data/lib/fact_db/models/entity_alias.rb +26 -0
- data/lib/fact_db/models/entity_mention.rb +33 -0
- data/lib/fact_db/models/fact.rb +192 -0
- data/lib/fact_db/models/fact_source.rb +35 -0
- data/lib/fact_db/pipeline/extraction_pipeline.rb +146 -0
- data/lib/fact_db/pipeline/resolution_pipeline.rb +129 -0
- data/lib/fact_db/resolution/entity_resolver.rb +261 -0
- data/lib/fact_db/resolution/fact_resolver.rb +259 -0
- data/lib/fact_db/services/content_service.rb +93 -0
- data/lib/fact_db/services/entity_service.rb +150 -0
- data/lib/fact_db/services/fact_service.rb +193 -0
- data/lib/fact_db/temporal/query.rb +125 -0
- data/lib/fact_db/temporal/timeline.rb +134 -0
- data/lib/fact_db/version.rb +5 -0
- data/lib/fact_db.rb +141 -0
- data/mkdocs.yml +198 -0
- metadata +288 -0
data/docs/guides/batch-processing.md
@@ -0,0 +1,325 @@

# Batch Processing

FactDb uses the `simple_flow` gem to provide concurrent pipeline processing for efficient batch operations.

## Overview

Batch processing is useful for:

- Processing multiple documents at once
- Resolving many entity names
- Detecting conflicts across entities
- Bulk fact extraction

## Batch Extraction

### Sequential Processing

Process content one at a time:

```ruby
facts = FactDb.new

content_ids = [content1.id, content2.id, content3.id]

results = facts.batch_extract(
  content_ids,
  extractor: :llm,
  parallel: false
)
```

### Parallel Processing

Process content concurrently (default):

```ruby
results = facts.batch_extract(
  content_ids,
  extractor: :llm,
  parallel: true # default
)

results.each do |result|
  puts "Content #{result[:content_id]}:"
  puts "  Facts extracted: #{result[:facts].count}"
  puts "  Error: #{result[:error]}" if result[:error]
end
```

### Result Structure

```ruby
result = {
  content_id: 123,
  facts: [<Fact>, <Fact>, ...], # Extracted facts
  error: nil                    # Error message if failed
}
```
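
Since every result exposes the same three keys, summarising a run stays simple. A minimal sketch building only on the keys above (the `all_facts` and `failed_ids` names are illustrative, not part of the API):

```ruby
results = facts.batch_extract(content_ids, extractor: :llm)

# All facts from results that completed without an error
all_facts = results.reject { |r| r[:error] }.flat_map { |r| r[:facts] }

# Content ids that will need another pass
failed_ids = results.select { |r| r[:error] }.map { |r| r[:content_id] }

puts "Extracted #{all_facts.count} facts, #{failed_ids.count} items failed"
```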

## Batch Entity Resolution

Resolve multiple names at once:

```ruby
names = [
  "Paula Chen",
  "John Smith",
  "Microsoft",
  "Acme Corporation",
  "Seattle"
]

results = facts.batch_resolve_entities(names, type: nil)

results.each do |result|
  case result[:status]
  when :resolved
    puts "#{result[:name]} -> #{result[:entity].canonical_name}"
  when :not_found
    puts "#{result[:name]} -> Not found"
  when :error
    puts "#{result[:name]} -> Error: #{result[:error]}"
  end
end
```

### With Type Filtering

```ruby
# Only resolve as person entities
results = facts.batch_resolve_entities(names, type: :person)
```
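
Resolution results can also be grouped by the `:status` key documented above, which keeps follow-up work (such as queueing unknown names for manual entity creation) short. A sketch reusing `names` and `results` from the previous examples:

```ruby
by_status = results.group_by { |r| r[:status] }

resolved   = by_status.fetch(:resolved, [])
unresolved = by_status.fetch(:not_found, []).map { |r| r[:name] }

puts "Resolved #{resolved.count} of #{names.count} names"
puts "Needs review: #{unresolved.join(', ')}" if unresolved.any?
```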

## Conflict Detection

Check multiple entities for conflicting facts:

```ruby
entity_ids = [paula.id, john.id, microsoft.id]

results = facts.detect_fact_conflicts(entity_ids)

results.each do |result|
  if result[:conflict_count] > 0
    puts "Entity #{result[:entity_id]} has #{result[:conflict_count]} conflicts:"
    result[:conflicts].each do |conflict|
      puts "  #{conflict[:fact1].fact_text}"
      puts "  vs"
      puts "  #{conflict[:fact2].fact_text}"
      puts "  Similarity: #{conflict[:similarity]}"
    end
  end
end
```

## Using Pipelines Directly

For more control, use the pipeline classes directly:

### Extraction Pipeline

```ruby
pipeline = FactDb::Pipeline::ExtractionPipeline.new(FactDb.config)

# Sequential
results = pipeline.process(contents, extractor: :llm)

# Parallel
results = pipeline.process_parallel(contents, extractor: :llm)
```

### Resolution Pipeline

```ruby
pipeline = FactDb::Pipeline::ResolutionPipeline.new(FactDb.config)

# Resolve entities
results = pipeline.resolve_entities(names, type: :person)

# Detect conflicts
results = pipeline.detect_conflicts(entity_ids)
```

## SimpleFlow Integration

FactDb's pipelines are built on SimpleFlow:

```ruby
require 'simple_flow'

# Create custom pipeline
pipeline = SimpleFlow::Pipeline.new do
  # Step 1: Validate
  step ->(result) {
    content = result.value
    if content.raw_text.blank?
      result.halt("Empty content")
    else
      result.continue(content)
    end
  }

  # Step 2: Extract
  step ->(result) {
    facts = extractor.extract(result.value)
    result.continue(facts)
  }

  # Step 3: Validate facts
  step ->(result) {
    valid_facts = result.value.select(&:valid?)
    result.continue(valid_facts)
  }
end

# Execute
result = pipeline.call(SimpleFlow::Result.new(content))
```

## Error Handling

### Graceful Degradation

```ruby
results = facts.batch_extract(content_ids, extractor: :llm)

successful = results.select { |r| r[:error].nil? }
failed = results.reject { |r| r[:error].nil? }

puts "Successful: #{successful.count}"
puts "Failed: #{failed.count}"

# Retry failed items with a different extractor
if failed.any?
  retry_ids = failed.map { |r| r[:content_id] }
  retry_results = facts.batch_extract(retry_ids, extractor: :rule_based)
end
```

### Logging Errors

```ruby
results.each do |result|
  if result[:error]
    logger.error(
      "Extraction failed",
      content_id: result[:content_id],
      error: result[:error]
    )
  end
end
```

## Performance Considerations

### Optimal Batch Size

```ruby
# Process in batches of 10-50 for optimal performance
content_ids.each_slice(25) do |batch|
  results = facts.batch_extract(batch, parallel: true)
  process_results(results)
end
```

### Rate Limiting

For LLM extraction, add delays between batches:

```ruby
content_ids.each_slice(10) do |batch|
  results = facts.batch_extract(batch, extractor: :llm)
  process_results(results)
  sleep(2) # Rate limit
end
```

### Memory Management

```ruby
# Process results immediately to avoid memory buildup
content_ids.each_slice(50).with_index(1) do |batch, batch_number|
  results = facts.batch_extract(batch)

  results.each do |result|
    # Process and discard
    save_facts(result[:facts])
  end

  # Force garbage collection every 10 batches if needed
  GC.start if (batch_number % 10).zero?
end
```

## Monitoring

Track batch processing metrics:

```ruby
start_time = Time.now

results = facts.batch_extract(content_ids, parallel: true)

duration = Time.now - start_time
success_rate = results.count { |r| r[:error].nil? }.to_f / results.count

puts "Processed #{results.count} items in #{duration}s"
puts "Success rate: #{(success_rate * 100).round(1)}%"
puts "Items/second: #{(results.count / duration).round(2)}"
```

## Best Practices

### 1. Use Parallel for Large Batches

```ruby
# Sequential for small batches (< 5 items)
if content_ids.count < 5
  results = facts.batch_extract(content_ids, parallel: false)
else
  results = facts.batch_extract(content_ids, parallel: true)
end
```

### 2. Handle Partial Failures

```ruby
def process_batch(content_ids)
  results = facts.batch_extract(content_ids)

  {
    successful: results.select { |r| r[:error].nil? },
    failed: results.reject { |r| r[:error].nil? }
  }
end

batch_result = process_batch(content_ids)
retry_failed(batch_result[:failed]) if batch_result[:failed].any?
```

### 3. Log Progress

```ruby
total = content_ids.count
processed = 0

content_ids.each_slice(25) do |batch|
  results = facts.batch_extract(batch)
  processed += batch.count

  logger.info "Progress: #{processed}/#{total} (#{(processed.to_f / total * 100).round(1)}%)"
end
```

### 4. Use Appropriate Extractors

```ruby
# LLM for complex documents
complex_docs = contents.select { |c| c.raw_text.length > 1000 }
facts.batch_extract(complex_docs.map(&:id), extractor: :llm)

# Rule-based for simple, structured content
simple_docs = contents.select { |c| c.raw_text.length <= 1000 }
facts.batch_extract(simple_docs.map(&:id), extractor: :rule_based)
```
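
Taken together, these practices usually collapse into one small driver. A sketch under the same assumptions as the examples above (`facts` and `logger` are whatever your application already provides; the method name is illustrative):

```ruby
def extract_in_batches(facts, content_ids, logger:, batch_size: 25)
  failures  = []
  processed = 0

  content_ids.each_slice(batch_size) do |batch|
    results = facts.batch_extract(batch, extractor: :llm, parallel: true)
    failures.concat(results.select { |r| r[:error] })
    processed += batch.count
    logger.info "Progress: #{processed}/#{content_ids.count}"
  end

  # One retry pass with the cheaper extractor for anything that failed
  facts.batch_extract(failures.map { |r| r[:content_id] }, extractor: :rule_based) if failures.any?
end
```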

data/docs/guides/configuration.md
@@ -0,0 +1,243 @@

# Configuration

FactDb uses the `anyway_config` gem for flexible configuration via environment variables, YAML files, or Ruby code.

## Configuration Methods

### Environment Variables

All settings can be configured via environment variables with the `EVENT_CLOCK_` prefix:

```bash
export EVENT_CLOCK_DATABASE_URL="postgresql://localhost/fact_db"
export EVENT_CLOCK_DATABASE_POOL_SIZE=10
export EVENT_CLOCK_LLM_PROVIDER="openai"
export EVENT_CLOCK_LLM_MODEL="gpt-4o-mini"
export EVENT_CLOCK_LLM_API_KEY="sk-..."
export EVENT_CLOCK_FUZZY_MATCH_THRESHOLD=0.85
```

### YAML Configuration

Create `config/fact_db.yml`:

```yaml
# Database
database_url: postgresql://localhost/fact_db
database_pool_size: 10
database_timeout: 30000

# Embeddings
embedding_dimensions: 1536

# LLM
llm_provider: openai
llm_model: gpt-4o-mini
llm_api_key: <%= ENV['OPENAI_API_KEY'] %>

# Extraction
default_extractor: manual

# Entity Resolution
fuzzy_match_threshold: 0.85
auto_merge_threshold: 0.95

# Logging
log_level: info
```

### Ruby Block

```ruby
FactDb.configure do |config|
  # Database
  config.database_url = "postgresql://localhost/fact_db"
  config.database_pool_size = 10
  config.database_timeout = 30_000

  # Embeddings
  config.embedding_dimensions = 1536
  config.embedding_generator = ->(text) {
    # Your embedding generation logic
    OpenAI::Client.new.embeddings(input: text)
  }

  # LLM
  config.llm_provider = :openai
  config.llm_model = "gpt-4o-mini"
  config.llm_api_key = ENV['OPENAI_API_KEY']

  # Or provide a pre-configured client
  config.llm_client = FactDb::LLM::Adapter.new(
    provider: :anthropic,
    model: "claude-sonnet-4-20250514"
  )

  # Extraction
  config.default_extractor = :llm

  # Entity Resolution
  config.fuzzy_match_threshold = 0.85
  config.auto_merge_threshold = 0.95

  # Logging
  config.logger = Rails.logger
  config.log_level = :debug
end
```

## Configuration Options

### Database Settings

| Option | Type | Default | Description |
|--------|------|---------|-------------|
| `database_url` | String | nil | PostgreSQL connection URL (required) |
| `database_pool_size` | Integer | 5 | Connection pool size |
| `database_timeout` | Integer | 30000 | Query timeout in milliseconds |

### Embedding Settings

| Option | Type | Default | Description |
|--------|------|---------|-------------|
| `embedding_dimensions` | Integer | 1536 | Vector dimensions (match your model) |
| `embedding_generator` | Proc | nil | Custom embedding generation function |

### LLM Settings

| Option | Type | Default | Description |
|--------|------|---------|-------------|
| `llm_client` | Object | nil | Pre-configured LLM client |
| `llm_provider` | Symbol | nil | Provider name (:openai, :anthropic, etc.) |
| `llm_model` | String | varies | Model name |
| `llm_api_key` | String | nil | API key |

### Extraction Settings

| Option | Type | Default | Description |
|--------|------|---------|-------------|
| `default_extractor` | Symbol | :manual | Default extraction method |

### Resolution Settings

| Option | Type | Default | Description |
|--------|------|---------|-------------|
| `fuzzy_match_threshold` | Float | 0.85 | Minimum similarity for fuzzy matching |
| `auto_merge_threshold` | Float | 0.95 | Similarity threshold for auto-merge |

### Logging Settings

| Option | Type | Default | Description |
|--------|------|---------|-------------|
| `logger` | Logger | STDOUT | Logger instance |
| `log_level` | Symbol | :info | Log level |
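
These option names mirror the writers used in the Ruby block above, and `anyway_config` normally generates a matching reader for each setting, so effective values can be inspected at runtime. A quick sanity-check sketch (reader availability is an assumption here):

```ruby
config = FactDb.config

puts "Database:          #{config.database_url}"
puts "Pool size:         #{config.database_pool_size}"
puts "Default extractor: #{config.default_extractor}"
puts "Fuzzy threshold:   #{config.fuzzy_match_threshold}"
```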

## LLM Provider Configuration

### OpenAI

```ruby
FactDb.configure do |config|
  config.llm_provider = :openai
  config.llm_model = "gpt-4o-mini" # or "gpt-4o", "gpt-4-turbo"
  config.llm_api_key = ENV['OPENAI_API_KEY']
end
```

### Anthropic

```ruby
FactDb.configure do |config|
  config.llm_provider = :anthropic
  config.llm_model = "claude-sonnet-4-20250514"
  config.llm_api_key = ENV['ANTHROPIC_API_KEY']
end
```

### Google Gemini

```ruby
FactDb.configure do |config|
  config.llm_provider = :gemini
  config.llm_model = "gemini-2.0-flash"
  config.llm_api_key = ENV['GEMINI_API_KEY']
end
```

### Ollama (Local)

```ruby
FactDb.configure do |config|
  config.llm_provider = :ollama
  config.llm_model = "llama3.2"
  # No API key needed for local Ollama
end
```

### AWS Bedrock

```ruby
FactDb.configure do |config|
  config.llm_provider = :bedrock
  config.llm_model = "claude-sonnet-4"
  # Uses AWS credentials from environment
end
```

### OpenRouter

```ruby
FactDb.configure do |config|
  config.llm_provider = :openrouter
  config.llm_model = "anthropic/claude-sonnet-4"
  config.llm_api_key = ENV['OPENROUTER_API_KEY']
end
```
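
When the provider needs to vary per environment, the `llm_client` option shown earlier can be built from your own settings rather than hard-coded. A sketch (the `LLM_PROVIDER` and `LLM_MODEL` variable names are illustrative and not read by the gem):

```ruby
provider = (ENV['LLM_PROVIDER'] || 'openai').to_sym
model    = ENV['LLM_MODEL'] || 'gpt-4o-mini'

FactDb.configure do |config|
  config.llm_client = FactDb::LLM::Adapter.new(provider: provider, model: model)
end
```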

## Environment-Specific Configuration

Use YAML anchors for shared settings:

```yaml
# config/fact_db.yml
defaults: &defaults
  embedding_dimensions: 1536
  fuzzy_match_threshold: 0.85

development:
  <<: *defaults
  database_url: postgresql://localhost/fact_db_dev
  log_level: debug

test:
  <<: *defaults
  database_url: postgresql://localhost/fact_db_test
  log_level: warn

production:
  <<: *defaults
  database_url: <%= ENV['DATABASE_URL'] %>
  log_level: info
```

## Validation

Validate configuration at startup:

```ruby
FactDb.configure do |config|
  config.database_url = ENV['DATABASE_URL']
end

# Raises ConfigurationError if invalid
FactDb.config.validate!
```

## Reset Configuration

For testing, reset configuration between tests:

```ruby
# In test setup
FactDb.reset_configuration!
```
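
In an RSpec suite this typically lives in a `before` hook so each example starts from a known state. A minimal sketch (the database URL and extractor choice are placeholders for your own test settings):

```ruby
# spec/spec_helper.rb
RSpec.configure do |rspec|
  rspec.before(:each) do
    FactDb.reset_configuration!
    FactDb.configure do |config|
      config.database_url = ENV.fetch('DATABASE_URL', 'postgresql://localhost/fact_db_test')
      config.default_extractor = :manual
    end
  end
end
```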