fact_db 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.envrc +2 -0
- data/.yardopts +5 -0
- data/CHANGELOG.md +64 -0
- data/README.md +107 -6
- data/Rakefile +243 -10
- data/db/migrate/001_enable_extensions.rb +1 -0
- data/db/migrate/002_create_sources.rb +49 -0
- data/db/migrate/003_create_entities.rb +27 -15
- data/db/migrate/004_create_entity_aliases.rb +20 -7
- data/db/migrate/005_create_facts.rb +37 -21
- data/db/migrate/006_create_entity_mentions.rb +14 -6
- data/db/migrate/007_create_fact_sources.rb +16 -8
- data/docs/api/extractors/index.md +5 -5
- data/docs/api/extractors/llm.md +17 -17
- data/docs/api/extractors/rule-based.md +14 -14
- data/docs/api/facts.md +20 -20
- data/docs/api/index.md +4 -4
- data/docs/api/models/entity.md +21 -21
- data/docs/api/models/fact.md +15 -15
- data/docs/api/models/index.md +7 -7
- data/docs/api/models/{content.md → source.md} +29 -29
- data/docs/api/pipeline/extraction.md +25 -25
- data/docs/api/pipeline/index.md +1 -1
- data/docs/api/pipeline/resolution.md +4 -4
- data/docs/api/services/entity-service.md +20 -20
- data/docs/api/services/fact-service.md +12 -12
- data/docs/api/services/index.md +5 -5
- data/docs/api/services/{content-service.md → source-service.md} +27 -27
- data/docs/architecture/database-schema.md +46 -46
- data/docs/architecture/entity-resolution.md +6 -6
- data/docs/architecture/index.md +10 -10
- data/docs/architecture/temporal-facts.md +5 -5
- data/docs/architecture/three-layer-model.md +17 -17
- data/docs/concepts.md +6 -6
- data/docs/examples/basic-usage.md +20 -20
- data/docs/examples/hr-onboarding.md +17 -17
- data/docs/examples/index.md +4 -4
- data/docs/examples/news-analysis.md +23 -23
- data/docs/getting-started/database-setup.md +28 -20
- data/docs/getting-started/index.md +3 -3
- data/docs/getting-started/quick-start.md +33 -30
- data/docs/guides/batch-processing.md +26 -26
- data/docs/guides/configuration.md +158 -77
- data/docs/guides/entity-management.md +14 -14
- data/docs/guides/extracting-facts.md +28 -28
- data/docs/guides/ingesting-content.md +14 -14
- data/docs/guides/llm-integration.md +40 -32
- data/docs/guides/temporal-queries.md +11 -11
- data/docs/index.md +6 -2
- data/examples/.envrc +4 -0
- data/examples/.gitignore +1 -0
- data/examples/001_configuration.rb +312 -0
- data/examples/{basic_usage.rb → 010_basic_usage.rb} +47 -56
- data/examples/{entity_management.rb → 020_entity_management.rb} +57 -72
- data/examples/{temporal_queries.rb → 030_temporal_queries.rb} +39 -59
- data/examples/040_output_formats.rb +177 -0
- data/examples/{rule_based_extraction.rb → 050_rule_based_extraction.rb} +39 -45
- data/examples/060_fluent_temporal_api.rb +217 -0
- data/examples/070_introspection.rb +252 -0
- data/examples/{hr_system.rb → 080_hr_system.rb} +56 -75
- data/examples/090_ingest_demo.rb +515 -0
- data/examples/100_query_context.rb +668 -0
- data/examples/110_prove_it.rb +204 -0
- data/examples/120_dump_database.rb +358 -0
- data/examples/130_rag_feedback_loop.rb +858 -0
- data/examples/README.md +229 -15
- data/examples/data/lincoln_associates.md +201 -0
- data/examples/data/lincoln_biography.md +66 -0
- data/examples/data/lincoln_cabinet.md +243 -0
- data/examples/data/lincoln_family.md +163 -0
- data/examples/data/lincoln_military.md +241 -0
- data/examples/data/lincoln_todd_family.md +136 -0
- data/examples/ingest_reporter.rb +335 -0
- data/examples/utilities.rb +182 -0
- data/lib/fact_db/config/defaults.yml +254 -0
- data/lib/fact_db/config.rb +94 -35
- data/lib/fact_db/database.rb +98 -8
- data/lib/fact_db/extractors/base.rb +106 -21
- data/lib/fact_db/extractors/llm_extractor.rb +35 -63
- data/lib/fact_db/extractors/manual_extractor.rb +46 -6
- data/lib/fact_db/extractors/rule_based_extractor.rb +136 -25
- data/lib/fact_db/llm/adapter.rb +3 -3
- data/lib/fact_db/models/entity.rb +94 -22
- data/lib/fact_db/models/entity_alias.rb +41 -7
- data/lib/fact_db/models/entity_mention.rb +34 -1
- data/lib/fact_db/models/fact.rb +259 -28
- data/lib/fact_db/models/fact_source.rb +43 -9
- data/lib/fact_db/models/source.rb +113 -0
- data/lib/fact_db/pipeline/extraction_pipeline.rb +35 -35
- data/lib/fact_db/pipeline/resolution_pipeline.rb +5 -5
- data/lib/fact_db/query_result.rb +202 -0
- data/lib/fact_db/resolution/entity_resolver.rb +139 -39
- data/lib/fact_db/resolution/fact_resolver.rb +86 -14
- data/lib/fact_db/services/entity_service.rb +246 -37
- data/lib/fact_db/services/fact_service.rb +254 -17
- data/lib/fact_db/services/source_service.rb +164 -0
- data/lib/fact_db/temporal/query.rb +71 -7
- data/lib/fact_db/temporal/query_builder.rb +69 -0
- data/lib/fact_db/temporal/timeline.rb +102 -11
- data/lib/fact_db/transformers/base.rb +77 -0
- data/lib/fact_db/transformers/cypher_transformer.rb +185 -0
- data/lib/fact_db/transformers/json_transformer.rb +17 -0
- data/lib/fact_db/transformers/raw_transformer.rb +35 -0
- data/lib/fact_db/transformers/text_transformer.rb +114 -0
- data/lib/fact_db/transformers/triple_transformer.rb +138 -0
- data/lib/fact_db/validation/alias_filter.rb +185 -0
- data/lib/fact_db/version.rb +1 -1
- data/lib/fact_db.rb +281 -30
- data/mkdocs.yml +2 -2
- metadata +60 -16
- data/db/migrate/002_create_contents.rb +0 -44
- data/lib/fact_db/models/content.rb +0 -62
- data/lib/fact_db/services/content_service.rb +0 -93
|
@@ -1,48 +1,89 @@
|
|
|
1
1
|
# Configuration
|
|
2
2
|
|
|
3
|
-
FactDb uses the `anyway_config` gem for flexible configuration via environment variables, YAML files, or Ruby code.
|
|
3
|
+
FactDb uses the `anyway_config` gem for flexible configuration via environment variables, YAML files, or Ruby code. Configuration uses **nested sections** for better organization.
|
|
4
|
+
|
|
5
|
+
## Configuration Sources
|
|
6
|
+
|
|
7
|
+
Configuration is loaded from multiple sources (lowest to highest priority):
|
|
8
|
+
|
|
9
|
+
1. **Bundled defaults** - `lib/fact_db/config/defaults.yml` (ships with gem)
|
|
10
|
+
2. **XDG user config** - `~/.config/fact_db/fact_db.yml`
|
|
11
|
+
3. **Project config** - `./config/fact_db.yml`
|
|
12
|
+
4. **Local overrides** - `./config/fact_db.local.yml` (gitignored)
|
|
13
|
+
5. **Environment variables** - `FDB_*`
|
|
14
|
+
6. **Ruby configure block** - `FactDb.configure { |c| ... }`
|
|
15
|
+
|
|
16
|
+
## Configuration Access Pattern
|
|
17
|
+
|
|
18
|
+
FactDb uses nested configuration sections:
|
|
19
|
+
|
|
20
|
+
```ruby
|
|
21
|
+
# Nested access
|
|
22
|
+
FactDb.config.database.url
|
|
23
|
+
FactDb.config.database.pool_size
|
|
24
|
+
FactDb.config.llm.provider
|
|
25
|
+
FactDb.config.llm.model
|
|
26
|
+
FactDb.config.ranking.ts_rank_weight
|
|
27
|
+
```
|
|
4
28
|
|
|
5
29
|
## Configuration Methods
|
|
6
30
|
|
|
7
31
|
### Environment Variables
|
|
8
32
|
|
|
9
|
-
All settings
|
|
33
|
+
All settings use the `FDB_` prefix with double underscores for nested values:
|
|
10
34
|
|
|
11
35
|
```bash
|
|
12
|
-
|
|
13
|
-
export
|
|
14
|
-
export
|
|
15
|
-
export
|
|
16
|
-
|
|
17
|
-
|
|
36
|
+
# Database settings
|
|
37
|
+
export FDB_DATABASE__URL="postgresql://localhost/fact_db"
|
|
38
|
+
export FDB_DATABASE__POOL_SIZE=10
|
|
39
|
+
export FDB_DATABASE__TIMEOUT=30000
|
|
40
|
+
|
|
41
|
+
# LLM settings
|
|
42
|
+
export FDB_LLM__PROVIDER="openai"
|
|
43
|
+
export FDB_LLM__MODEL="gpt-4o-mini"
|
|
44
|
+
export FDB_LLM__API_KEY="sk-..."
|
|
45
|
+
|
|
46
|
+
# Top-level settings
|
|
47
|
+
export FDB_FUZZY_MATCH_THRESHOLD=0.85
|
|
48
|
+
export FDB_DEFAULT_EXTRACTOR="llm"
|
|
49
|
+
export FDB_LOG_LEVEL="debug"
|
|
18
50
|
```
|
|
19
51
|
|
|
20
52
|
### YAML Configuration
|
|
21
53
|
|
|
22
|
-
Create `config/fact_db.yml
|
|
54
|
+
Create `config/fact_db.yml` with nested sections:
|
|
23
55
|
|
|
24
56
|
```yaml
|
|
25
57
|
# Database
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
58
|
+
database:
|
|
59
|
+
url: postgresql://localhost/fact_db
|
|
60
|
+
pool_size: 10
|
|
61
|
+
timeout: 30000
|
|
29
62
|
|
|
30
63
|
# Embeddings
|
|
31
|
-
|
|
64
|
+
embedding:
|
|
65
|
+
dimensions: 1536
|
|
32
66
|
|
|
33
67
|
# LLM
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
68
|
+
llm:
|
|
69
|
+
provider: openai
|
|
70
|
+
model: gpt-4o-mini
|
|
71
|
+
api_key: <%= ENV['OPENAI_API_KEY'] %>
|
|
72
|
+
|
|
73
|
+
# Ranking weights (should sum to 1.0)
|
|
74
|
+
ranking:
|
|
75
|
+
ts_rank_weight: 0.25
|
|
76
|
+
vector_similarity_weight: 0.25
|
|
77
|
+
entity_mention_weight: 0.15
|
|
78
|
+
direct_answer_weight: 0.15
|
|
79
|
+
term_overlap_weight: 0.10
|
|
80
|
+
relationship_match_weight: 0.05
|
|
81
|
+
confidence_weight: 0.05
|
|
82
|
+
|
|
83
|
+
# Top-level settings
|
|
39
84
|
default_extractor: manual
|
|
40
|
-
|
|
41
|
-
# Entity Resolution
|
|
42
85
|
fuzzy_match_threshold: 0.85
|
|
43
86
|
auto_merge_threshold: 0.95
|
|
44
|
-
|
|
45
|
-
# Logging
|
|
46
87
|
log_level: info
|
|
47
88
|
```
|
|
48
89
|
|
|
@@ -51,21 +92,21 @@ log_level: info
|
|
|
51
92
|
```ruby
|
|
52
93
|
FactDb.configure do |config|
|
|
53
94
|
# Database
|
|
54
|
-
config.
|
|
55
|
-
config.
|
|
56
|
-
config.
|
|
95
|
+
config.database.url = "postgresql://localhost/fact_db"
|
|
96
|
+
config.database.pool_size = 10
|
|
97
|
+
config.database.timeout = 30_000
|
|
57
98
|
|
|
58
99
|
# Embeddings
|
|
59
|
-
config.
|
|
100
|
+
config.embedding.dimensions = 1536
|
|
60
101
|
config.embedding_generator = ->(text) {
|
|
61
102
|
# Your embedding generation logic
|
|
62
103
|
OpenAI::Client.new.embeddings(input: text)
|
|
63
104
|
}
|
|
64
105
|
|
|
65
|
-
# LLM
|
|
66
|
-
config.
|
|
67
|
-
config.
|
|
68
|
-
config.
|
|
106
|
+
# LLM (nested access)
|
|
107
|
+
config.llm.provider = :openai
|
|
108
|
+
config.llm.model = "gpt-4o-mini"
|
|
109
|
+
config.llm.api_key = ENV['OPENAI_API_KEY']
|
|
69
110
|
|
|
70
111
|
# Or provide a pre-configured client
|
|
71
112
|
config.llm_client = FactDb::LLM::Adapter.new(
|
|
@@ -73,10 +114,12 @@ FactDb.configure do |config|
|
|
|
73
114
|
model: "claude-sonnet-4-20250514"
|
|
74
115
|
)
|
|
75
116
|
|
|
76
|
-
#
|
|
77
|
-
config.
|
|
117
|
+
# Ranking weights
|
|
118
|
+
config.ranking.ts_rank_weight = 0.30
|
|
119
|
+
config.ranking.vector_similarity_weight = 0.25
|
|
78
120
|
|
|
79
|
-
#
|
|
121
|
+
# Top-level settings
|
|
122
|
+
config.default_extractor = :llm
|
|
80
123
|
config.fuzzy_match_threshold = 0.85
|
|
81
124
|
config.auto_merge_threshold = 0.95
|
|
82
125
|
|
|
@@ -90,46 +133,62 @@ end
|
|
|
90
133
|
|
|
91
134
|
### Database Settings
|
|
92
135
|
|
|
136
|
+
Access: `FactDb.config.database.*`
|
|
137
|
+
|
|
93
138
|
| Option | Type | Default | Description |
|
|
94
139
|
|--------|------|---------|-------------|
|
|
95
|
-
| `
|
|
96
|
-
| `
|
|
97
|
-
| `
|
|
140
|
+
| `url` | String | nil | PostgreSQL connection URL |
|
|
141
|
+
| `host` | String | localhost | Database host |
|
|
142
|
+
| `port` | Integer | 5432 | Database port |
|
|
143
|
+
| `name` | String | nil | Database name |
|
|
144
|
+
| `user` | String | nil | Database user |
|
|
145
|
+
| `password` | String | nil | Database password |
|
|
146
|
+
| `pool_size` | Integer | 5 | Connection pool size |
|
|
147
|
+
| `timeout` | Integer | 30000 | Query timeout in milliseconds |
|
|
98
148
|
|
|
99
149
|
### Embedding Settings
|
|
100
150
|
|
|
151
|
+
Access: `FactDb.config.embedding.*`
|
|
152
|
+
|
|
101
153
|
| Option | Type | Default | Description |
|
|
102
154
|
|--------|------|---------|-------------|
|
|
103
|
-
| `
|
|
104
|
-
| `
|
|
155
|
+
| `dimensions` | Integer | 1536 | Vector dimensions (match your model) |
|
|
156
|
+
| `generator` | Proc | nil | Custom embedding generation function |
|
|
105
157
|
|
|
106
158
|
### LLM Settings
|
|
107
159
|
|
|
160
|
+
Access: `FactDb.config.llm.*`
|
|
161
|
+
|
|
108
162
|
| Option | Type | Default | Description |
|
|
109
163
|
|--------|------|---------|-------------|
|
|
110
|
-
| `
|
|
111
|
-
| `
|
|
112
|
-
| `
|
|
113
|
-
| `
|
|
164
|
+
| `client` | Object | nil | Pre-configured LLM client |
|
|
165
|
+
| `provider` | Symbol | nil | Provider name (:openai, :anthropic, etc.) |
|
|
166
|
+
| `model` | String | varies | Model name |
|
|
167
|
+
| `api_key` | String | nil | API key |
|
|
168
|
+
|
|
169
|
+
### Ranking Settings
|
|
114
170
|
|
|
115
|
-
|
|
171
|
+
Access: `FactDb.config.ranking.*`
|
|
116
172
|
|
|
117
173
|
| Option | Type | Default | Description |
|
|
118
174
|
|--------|------|---------|-------------|
|
|
119
|
-
| `
|
|
175
|
+
| `ts_rank_weight` | Float | 0.25 | PostgreSQL full-text search weight |
|
|
176
|
+
| `vector_similarity_weight` | Float | 0.25 | Semantic similarity weight |
|
|
177
|
+
| `entity_mention_weight` | Float | 0.15 | Entity mentions weight |
|
|
178
|
+
| `direct_answer_weight` | Float | 0.15 | Direct answer pattern weight |
|
|
179
|
+
| `term_overlap_weight` | Float | 0.10 | Query word matches weight |
|
|
180
|
+
| `relationship_match_weight` | Float | 0.05 | Relationship words weight |
|
|
181
|
+
| `confidence_weight` | Float | 0.05 | Stored confidence score weight |
|
|
182
|
+
|
|
183
|
+
**Note:** Weights should sum to approximately 1.0.
|
|
120
184
|
|
|
121
|
-
###
|
|
185
|
+
### Top-Level Settings
|
|
122
186
|
|
|
123
187
|
| Option | Type | Default | Description |
|
|
124
188
|
|--------|------|---------|-------------|
|
|
189
|
+
| `default_extractor` | Symbol | :manual | Default extraction method |
|
|
125
190
|
| `fuzzy_match_threshold` | Float | 0.85 | Minimum similarity for fuzzy matching |
|
|
126
191
|
| `auto_merge_threshold` | Float | 0.95 | Similarity threshold for auto-merge |
|
|
127
|
-
|
|
128
|
-
### Logging Settings
|
|
129
|
-
|
|
130
|
-
| Option | Type | Default | Description |
|
|
131
|
-
|--------|------|---------|-------------|
|
|
132
|
-
| `logger` | Logger | STDOUT | Logger instance |
|
|
133
192
|
| `log_level` | Symbol | :info | Log level |
|
|
134
193
|
|
|
135
194
|
## LLM Provider Configuration
|
|
@@ -138,9 +197,9 @@ end
|
|
|
138
197
|
|
|
139
198
|
```ruby
|
|
140
199
|
FactDb.configure do |config|
|
|
141
|
-
config.
|
|
142
|
-
config.
|
|
143
|
-
config.
|
|
200
|
+
config.llm.provider = :openai
|
|
201
|
+
config.llm.model = "gpt-4o-mini" # or "gpt-4o", "gpt-4-turbo"
|
|
202
|
+
config.llm.api_key = ENV['OPENAI_API_KEY']
|
|
144
203
|
end
|
|
145
204
|
```
|
|
146
205
|
|
|
@@ -148,9 +207,9 @@ end
|
|
|
148
207
|
|
|
149
208
|
```ruby
|
|
150
209
|
FactDb.configure do |config|
|
|
151
|
-
config.
|
|
152
|
-
config.
|
|
153
|
-
config.
|
|
210
|
+
config.llm.provider = :anthropic
|
|
211
|
+
config.llm.model = "claude-sonnet-4-20250514"
|
|
212
|
+
config.llm.api_key = ENV['ANTHROPIC_API_KEY']
|
|
154
213
|
end
|
|
155
214
|
```
|
|
156
215
|
|
|
@@ -158,9 +217,9 @@ end
|
|
|
158
217
|
|
|
159
218
|
```ruby
|
|
160
219
|
FactDb.configure do |config|
|
|
161
|
-
config.
|
|
162
|
-
config.
|
|
163
|
-
config.
|
|
220
|
+
config.llm.provider = :gemini
|
|
221
|
+
config.llm.model = "gemini-2.0-flash"
|
|
222
|
+
config.llm.api_key = ENV['GEMINI_API_KEY']
|
|
164
223
|
end
|
|
165
224
|
```
|
|
166
225
|
|
|
@@ -168,8 +227,8 @@ end
|
|
|
168
227
|
|
|
169
228
|
```ruby
|
|
170
229
|
FactDb.configure do |config|
|
|
171
|
-
config.
|
|
172
|
-
config.
|
|
230
|
+
config.llm.provider = :ollama
|
|
231
|
+
config.llm.model = "llama3.2"
|
|
173
232
|
# No API key needed for local Ollama
|
|
174
233
|
end
|
|
175
234
|
```
|
|
@@ -178,8 +237,8 @@ end
|
|
|
178
237
|
|
|
179
238
|
```ruby
|
|
180
239
|
FactDb.configure do |config|
|
|
181
|
-
config.
|
|
182
|
-
config.
|
|
240
|
+
config.llm.provider = :bedrock
|
|
241
|
+
config.llm.model = "claude-sonnet-4"
|
|
183
242
|
# Uses AWS credentials from environment
|
|
184
243
|
end
|
|
185
244
|
```
|
|
@@ -188,45 +247,58 @@ end
|
|
|
188
247
|
|
|
189
248
|
```ruby
|
|
190
249
|
FactDb.configure do |config|
|
|
191
|
-
config.
|
|
192
|
-
config.
|
|
193
|
-
config.
|
|
250
|
+
config.llm.provider = :openrouter
|
|
251
|
+
config.llm.model = "anthropic/claude-sonnet-4"
|
|
252
|
+
config.llm.api_key = ENV['OPENROUTER_API_KEY']
|
|
194
253
|
end
|
|
195
254
|
```
|
|
196
255
|
|
|
256
|
+
## XDG User Configuration
|
|
257
|
+
|
|
258
|
+
FactDb supports XDG Base Directory Specification for user-level configuration:
|
|
259
|
+
|
|
260
|
+
- `~/.config/fact_db/fact_db.yml` (Linux/macOS)
|
|
261
|
+
- `~/Library/Application Support/fact_db/fact_db.yml` (macOS)
|
|
262
|
+
- `$XDG_CONFIG_HOME/fact_db/fact_db.yml` (if XDG_CONFIG_HOME is set)
|
|
263
|
+
|
|
264
|
+
This allows you to set personal defaults that apply across all projects.
|
|
265
|
+
|
|
197
266
|
## Environment-Specific Configuration
|
|
198
267
|
|
|
199
|
-
|
|
268
|
+
YAML configuration files (like the bundled defaults) support environment-specific overrides:
|
|
200
269
|
|
|
201
270
|
```yaml
|
|
202
271
|
# config/fact_db.yml
|
|
203
|
-
defaults:
|
|
204
|
-
|
|
272
|
+
defaults:
|
|
273
|
+
embedding:
|
|
274
|
+
dimensions: 1536
|
|
205
275
|
fuzzy_match_threshold: 0.85
|
|
206
276
|
|
|
207
277
|
development:
|
|
208
|
-
|
|
209
|
-
|
|
278
|
+
database:
|
|
279
|
+
name: fact_db_development
|
|
210
280
|
log_level: debug
|
|
211
281
|
|
|
212
282
|
test:
|
|
213
|
-
|
|
214
|
-
|
|
283
|
+
database:
|
|
284
|
+
name: fact_db_test
|
|
215
285
|
log_level: warn
|
|
216
286
|
|
|
217
287
|
production:
|
|
218
|
-
|
|
219
|
-
|
|
288
|
+
database:
|
|
289
|
+
pool_size: 25
|
|
220
290
|
log_level: info
|
|
221
291
|
```
|
|
222
292
|
|
|
293
|
+
Environment is detected from: `FDB_ENV` > `RAILS_ENV` > `RACK_ENV` > `'development'`
|
|
294
|
+
|
|
223
295
|
## Validation
|
|
224
296
|
|
|
225
297
|
Validate configuration at startup:
|
|
226
298
|
|
|
227
299
|
```ruby
|
|
228
300
|
FactDb.configure do |config|
|
|
229
|
-
config.
|
|
301
|
+
config.database.url = ENV['DATABASE_URL']
|
|
230
302
|
end
|
|
231
303
|
|
|
232
304
|
# Raises ConfigurationError if invalid
|
|
@@ -241,3 +313,12 @@ For testing, reset configuration between tests:
|
|
|
241
313
|
# In test setup
|
|
242
314
|
FactDb.reset_configuration!
|
|
243
315
|
```
|
|
316
|
+
|
|
317
|
+
## Environment Helpers
|
|
318
|
+
|
|
319
|
+
```ruby
|
|
320
|
+
FactDb.config.test? # true if the detected environment is 'test'
|
|
321
|
+
FactDb.config.development? # true if the detected environment is 'development'
|
|
322
|
+
FactDb.config.production? # true if the detected environment is 'production'
|
|
323
|
+
FactDb.config.environment # returns current environment string
|
|
324
|
+
```
|
|
@@ -85,7 +85,7 @@ facts.entity_service.add_alias(
|
|
|
85
85
|
|
|
86
86
|
```ruby
|
|
87
87
|
entity.entity_aliases.each do |alias_record|
|
|
88
|
-
puts "#{alias_record.
|
|
88
|
+
puts "#{alias_record.name} (#{alias_record.type})"
|
|
89
89
|
puts " Confidence: #{alias_record.confidence}"
|
|
90
90
|
end
|
|
91
91
|
```
|
|
@@ -142,7 +142,7 @@ results = facts.batch_resolve_entities(names)
|
|
|
142
142
|
results.each do |result|
|
|
143
143
|
status = result[:status] # :resolved, :not_found, :error
|
|
144
144
|
entity = result[:entity]
|
|
145
|
-
puts "#{result[:name]}: #{status} -> #{entity&.
|
|
145
|
+
puts "#{result[:name]}: #{status} -> #{entity&.name}"
|
|
146
146
|
end
|
|
147
147
|
```
|
|
148
148
|
|
|
@@ -157,13 +157,13 @@ facts.entity_service.merge(entity1.id, entity2.id)
|
|
|
157
157
|
# After merge:
|
|
158
158
|
entity2.reload
|
|
159
159
|
entity2.resolution_status # => "merged"
|
|
160
|
-
entity2.
|
|
160
|
+
entity2.canonical_id # => entity1.id
|
|
161
161
|
```
|
|
162
162
|
|
|
163
163
|
### What Happens on Merge
|
|
164
164
|
|
|
165
165
|
1. Entity2's status changes to "merged"
|
|
166
|
-
2. Entity2 points to entity1 via `
|
|
166
|
+
2. Entity2 points to entity1 via `canonical_id`
|
|
167
167
|
3. Entity2's aliases are copied to entity1
|
|
168
168
|
4. All facts mentioning entity2 now also reference entity1
|
|
169
169
|
|
|
@@ -184,7 +184,7 @@ end
|
|
|
184
184
|
```ruby
|
|
185
185
|
facts.entity_service.update(
|
|
186
186
|
entity.id,
|
|
187
|
-
|
|
187
|
+
name: "Paula M. Chen"
|
|
188
188
|
)
|
|
189
189
|
```
|
|
190
190
|
|
|
@@ -203,7 +203,7 @@ facts.entity_service.update(
|
|
|
203
203
|
# Reclassify entity type
|
|
204
204
|
facts.entity_service.update(
|
|
205
205
|
entity.id,
|
|
206
|
-
|
|
206
|
+
type: :organization
|
|
207
207
|
)
|
|
208
208
|
```
|
|
209
209
|
|
|
@@ -250,15 +250,15 @@ entities = facts.entity_service.search("Paula")
|
|
|
250
250
|
|
|
251
251
|
```ruby
|
|
252
252
|
people = FactDb::Models::Entity
|
|
253
|
-
.where(
|
|
253
|
+
.where(type: 'person')
|
|
254
254
|
.where.not(resolution_status: 'merged')
|
|
255
255
|
```
|
|
256
256
|
|
|
257
|
-
### Find Entities in
|
|
257
|
+
### Find Entities in Source
|
|
258
258
|
|
|
259
259
|
```ruby
|
|
260
|
-
# Find all entities mentioned in a
|
|
261
|
-
entities = facts.entity_service.
|
|
260
|
+
# Find all entities mentioned in a source
|
|
261
|
+
entities = facts.entity_service.in_source(source.id)
|
|
262
262
|
```
|
|
263
263
|
|
|
264
264
|
### Find Related Entities
|
|
@@ -327,9 +327,9 @@ unresolved = FactDb::Models::Entity
|
|
|
327
327
|
|
|
328
328
|
unresolved.each do |entity|
|
|
329
329
|
# Try to find duplicates
|
|
330
|
-
similar = facts.entity_service.search(entity.
|
|
330
|
+
similar = facts.entity_service.search(entity.name)
|
|
331
331
|
if similar.count > 1
|
|
332
|
-
puts "Potential duplicate: #{entity.
|
|
332
|
+
puts "Potential duplicate: #{entity.name}"
|
|
333
333
|
end
|
|
334
334
|
end
|
|
335
335
|
```
|
|
@@ -343,8 +343,8 @@ active_entities = FactDb::Models::Entity
|
|
|
343
343
|
|
|
344
344
|
# Or follow the merge chain
|
|
345
345
|
def canonical_entity(entity)
|
|
346
|
-
while entity.
|
|
347
|
-
entity = FactDb::Models::Entity.find(entity.
|
|
346
|
+
while entity.canonical_id
|
|
347
|
+
entity = FactDb::Models::Entity.find(entity.canonical_id)
|
|
348
348
|
end
|
|
349
349
|
entity
|
|
350
350
|
end
|
|
@@ -24,7 +24,7 @@ fact = facts.fact_service.create(
|
|
|
24
24
|
{ entity: microsoft, role: "organization", text: "Microsoft" }
|
|
25
25
|
],
|
|
26
26
|
sources: [
|
|
27
|
-
{
|
|
27
|
+
{ source: source, type: "primary", excerpt: "...accepted the offer..." }
|
|
28
28
|
]
|
|
29
29
|
)
|
|
30
30
|
```
|
|
@@ -36,19 +36,19 @@ Use AI to automatically extract facts:
|
|
|
36
36
|
```ruby
|
|
37
37
|
# Configure LLM
|
|
38
38
|
FactDb.configure do |config|
|
|
39
|
-
config.
|
|
40
|
-
config.
|
|
39
|
+
config.llm.provider = :openai
|
|
40
|
+
config.llm.api_key = ENV['OPENAI_API_KEY']
|
|
41
41
|
end
|
|
42
42
|
|
|
43
43
|
facts = FactDb.new
|
|
44
44
|
|
|
45
|
-
# Extract facts from
|
|
46
|
-
extracted = facts.extract_facts(
|
|
45
|
+
# Extract facts from source
|
|
46
|
+
extracted = facts.extract_facts(source.id, extractor: :llm)
|
|
47
47
|
|
|
48
48
|
extracted.each do |fact|
|
|
49
|
-
puts fact.
|
|
49
|
+
puts fact.text
|
|
50
50
|
puts " Valid from: #{fact.valid_at}"
|
|
51
|
-
puts " Entities: #{fact.entity_mentions.map(&:entity).map(&:
|
|
51
|
+
puts " Entities: #{fact.entity_mentions.map(&:entity).map(&:name)}"
|
|
52
52
|
end
|
|
53
53
|
```
|
|
54
54
|
|
|
@@ -57,7 +57,7 @@ end
|
|
|
57
57
|
Use regex patterns for structured content:
|
|
58
58
|
|
|
59
59
|
```ruby
|
|
60
|
-
extracted = facts.extract_facts(
|
|
60
|
+
extracted = facts.extract_facts(source.id, extractor: :rule_based)
|
|
61
61
|
```
|
|
62
62
|
|
|
63
63
|
The rule-based extractor includes patterns for:
|
|
@@ -76,7 +76,7 @@ FactDb.configure do |config|
|
|
|
76
76
|
end
|
|
77
77
|
|
|
78
78
|
# Uses configured default
|
|
79
|
-
extracted = facts.extract_facts(
|
|
79
|
+
extracted = facts.extract_facts(source.id)
|
|
80
80
|
```
|
|
81
81
|
|
|
82
82
|
## Fact Structure
|
|
@@ -85,8 +85,8 @@ Every extracted fact includes:
|
|
|
85
85
|
|
|
86
86
|
```ruby
|
|
87
87
|
fact = Models::Fact.new(
|
|
88
|
-
|
|
89
|
-
|
|
88
|
+
text: "Paula Chen is Principal Engineer at Microsoft",
|
|
89
|
+
digest: "sha256...", # For deduplication
|
|
90
90
|
valid_at: Time.parse("2024-01-10"),
|
|
91
91
|
invalid_at: nil, # nil = currently valid
|
|
92
92
|
status: "canonical", # canonical, superseded, corroborated, synthesized
|
|
@@ -127,7 +127,7 @@ Facts link to source content:
|
|
|
127
127
|
|
|
128
128
|
```ruby
|
|
129
129
|
fact.add_source(
|
|
130
|
-
|
|
130
|
+
source: email_source,
|
|
131
131
|
type: "primary",
|
|
132
132
|
excerpt: "Paula has accepted our offer to join as Principal Engineer...",
|
|
133
133
|
confidence: 0.95
|
|
@@ -147,16 +147,16 @@ fact.add_source(
|
|
|
147
147
|
Process multiple sources:
|
|
148
148
|
|
|
149
149
|
```ruby
|
|
150
|
-
|
|
150
|
+
source_ids = [source1.id, source2.id, source3.id]
|
|
151
151
|
|
|
152
152
|
# Sequential processing
|
|
153
|
-
results = facts.batch_extract(
|
|
153
|
+
results = facts.batch_extract(source_ids, parallel: false)
|
|
154
154
|
|
|
155
155
|
# Parallel processing (default)
|
|
156
|
-
results = facts.batch_extract(
|
|
156
|
+
results = facts.batch_extract(source_ids, parallel: true)
|
|
157
157
|
|
|
158
158
|
results.each do |result|
|
|
159
|
-
puts "
|
|
159
|
+
puts "Source #{result[:source_id]}:"
|
|
160
160
|
puts " Facts: #{result[:facts].count}"
|
|
161
161
|
puts " Error: #{result[:error]}" if result[:error]
|
|
162
162
|
end
|
|
@@ -168,11 +168,11 @@ Create custom extractors by extending the base class:
|
|
|
168
168
|
|
|
169
169
|
```ruby
|
|
170
170
|
class MyExtractor < FactDb::Extractors::Base
|
|
171
|
-
def extract(
|
|
171
|
+
def extract(source)
|
|
172
172
|
extracted = []
|
|
173
173
|
|
|
174
174
|
# Your extraction logic here
|
|
175
|
-
# Parse content
|
|
175
|
+
# Parse source.content
|
|
176
176
|
# Create fact records
|
|
177
177
|
|
|
178
178
|
extracted
|
|
@@ -180,8 +180,8 @@ class MyExtractor < FactDb::Extractors::Base
|
|
|
180
180
|
end
|
|
181
181
|
|
|
182
182
|
# Register and use
|
|
183
|
-
facts.fact_service.
|
|
184
|
-
|
|
183
|
+
facts.fact_service.extract_from_source(
|
|
184
|
+
source.id,
|
|
185
185
|
extractor: MyExtractor.new(config)
|
|
186
186
|
)
|
|
187
187
|
```
|
|
@@ -217,7 +217,7 @@ After extraction, you may want to:
|
|
|
217
217
|
### Resolve Entities
|
|
218
218
|
|
|
219
219
|
```ruby
|
|
220
|
-
extracted = facts.extract_facts(
|
|
220
|
+
extracted = facts.extract_facts(source.id, extractor: :llm)
|
|
221
221
|
|
|
222
222
|
extracted.each do |fact|
|
|
223
223
|
fact.entity_mentions.each do |mention|
|
|
@@ -239,8 +239,8 @@ conflicts = facts.fact_service.resolver.find_conflicts(
|
|
|
239
239
|
|
|
240
240
|
conflicts.each do |conflict|
|
|
241
241
|
puts "Conflict between:"
|
|
242
|
-
puts " #{conflict[:fact1].
|
|
243
|
-
puts " #{conflict[:fact2].
|
|
242
|
+
puts " #{conflict[:fact1].text}"
|
|
243
|
+
puts " #{conflict[:fact2].text}"
|
|
244
244
|
end
|
|
245
245
|
```
|
|
246
246
|
|
|
@@ -248,7 +248,7 @@ end
|
|
|
248
248
|
|
|
249
249
|
```ruby
|
|
250
250
|
# If multiple sources say the same thing
|
|
251
|
-
if fact1.
|
|
251
|
+
if fact1.text.similar_to?(fact2.text)
|
|
252
252
|
facts.fact_service.resolver.corroborate(fact1.id, fact2.id)
|
|
253
253
|
end
|
|
254
254
|
```
|
|
@@ -258,7 +258,7 @@ end
|
|
|
258
258
|
### 1. Review LLM Extractions
|
|
259
259
|
|
|
260
260
|
```ruby
|
|
261
|
-
extracted = facts.extract_facts(
|
|
261
|
+
extracted = facts.extract_facts(source.id, extractor: :llm)
|
|
262
262
|
|
|
263
263
|
extracted.select { |f| f.confidence < 0.8 }.each do |fact|
|
|
264
264
|
# Flag for human review
|
|
@@ -282,7 +282,7 @@ end
|
|
|
282
282
|
fact = facts.fact_service.create(
|
|
283
283
|
"Important fact",
|
|
284
284
|
valid_at: Date.today,
|
|
285
|
-
sources: [{
|
|
285
|
+
sources: [{ source: source_record, type: "primary" }]
|
|
286
286
|
)
|
|
287
287
|
```
|
|
288
288
|
|
|
@@ -290,10 +290,10 @@ fact = facts.fact_service.create(
|
|
|
290
290
|
|
|
291
291
|
```ruby
|
|
292
292
|
begin
|
|
293
|
-
extracted = facts.extract_facts(
|
|
293
|
+
extracted = facts.extract_facts(source.id, extractor: :llm)
|
|
294
294
|
rescue FactDb::ExtractionError => e
|
|
295
295
|
logger.error "Extraction failed: #{e.message}"
|
|
296
296
|
# Fall back to manual or rule-based
|
|
297
|
-
extracted = facts.extract_facts(
|
|
297
|
+
extracted = facts.extract_facts(source.id, extractor: :rule_based)
|
|
298
298
|
end
|
|
299
299
|
```
|