fact_db 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.envrc +1 -0
- data/CHANGELOG.md +48 -0
- data/COMMITS.md +196 -0
- data/README.md +102 -0
- data/Rakefile +41 -0
- data/db/migrate/001_enable_extensions.rb +7 -0
- data/db/migrate/002_create_contents.rb +44 -0
- data/db/migrate/003_create_entities.rb +36 -0
- data/db/migrate/004_create_entity_aliases.rb +18 -0
- data/db/migrate/005_create_facts.rb +65 -0
- data/db/migrate/006_create_entity_mentions.rb +18 -0
- data/db/migrate/007_create_fact_sources.rb +18 -0
- data/docs/api/extractors/index.md +71 -0
- data/docs/api/extractors/llm.md +162 -0
- data/docs/api/extractors/manual.md +92 -0
- data/docs/api/extractors/rule-based.md +165 -0
- data/docs/api/facts.md +300 -0
- data/docs/api/index.md +66 -0
- data/docs/api/models/content.md +165 -0
- data/docs/api/models/entity.md +202 -0
- data/docs/api/models/fact.md +270 -0
- data/docs/api/models/index.md +77 -0
- data/docs/api/pipeline/extraction.md +175 -0
- data/docs/api/pipeline/index.md +72 -0
- data/docs/api/pipeline/resolution.md +209 -0
- data/docs/api/services/content-service.md +166 -0
- data/docs/api/services/entity-service.md +202 -0
- data/docs/api/services/fact-service.md +223 -0
- data/docs/api/services/index.md +55 -0
- data/docs/architecture/database-schema.md +293 -0
- data/docs/architecture/entity-resolution.md +293 -0
- data/docs/architecture/index.md +149 -0
- data/docs/architecture/temporal-facts.md +268 -0
- data/docs/architecture/three-layer-model.md +242 -0
- data/docs/assets/css/custom.css +137 -0
- data/docs/assets/fact_db.jpg +0 -0
- data/docs/assets/images/fact_db.jpg +0 -0
- data/docs/concepts.md +183 -0
- data/docs/examples/basic-usage.md +235 -0
- data/docs/examples/hr-onboarding.md +312 -0
- data/docs/examples/index.md +64 -0
- data/docs/examples/news-analysis.md +288 -0
- data/docs/getting-started/database-setup.md +170 -0
- data/docs/getting-started/index.md +71 -0
- data/docs/getting-started/installation.md +98 -0
- data/docs/getting-started/quick-start.md +191 -0
- data/docs/guides/batch-processing.md +325 -0
- data/docs/guides/configuration.md +243 -0
- data/docs/guides/entity-management.md +364 -0
- data/docs/guides/extracting-facts.md +299 -0
- data/docs/guides/index.md +22 -0
- data/docs/guides/ingesting-content.md +252 -0
- data/docs/guides/llm-integration.md +299 -0
- data/docs/guides/temporal-queries.md +315 -0
- data/docs/index.md +121 -0
- data/examples/README.md +130 -0
- data/examples/basic_usage.rb +164 -0
- data/examples/entity_management.rb +216 -0
- data/examples/hr_system.rb +428 -0
- data/examples/rule_based_extraction.rb +258 -0
- data/examples/temporal_queries.rb +245 -0
- data/lib/fact_db/config.rb +71 -0
- data/lib/fact_db/database.rb +45 -0
- data/lib/fact_db/errors.rb +10 -0
- data/lib/fact_db/extractors/base.rb +117 -0
- data/lib/fact_db/extractors/llm_extractor.rb +179 -0
- data/lib/fact_db/extractors/manual_extractor.rb +53 -0
- data/lib/fact_db/extractors/rule_based_extractor.rb +228 -0
- data/lib/fact_db/llm/adapter.rb +109 -0
- data/lib/fact_db/models/content.rb +62 -0
- data/lib/fact_db/models/entity.rb +84 -0
- data/lib/fact_db/models/entity_alias.rb +26 -0
- data/lib/fact_db/models/entity_mention.rb +33 -0
- data/lib/fact_db/models/fact.rb +192 -0
- data/lib/fact_db/models/fact_source.rb +35 -0
- data/lib/fact_db/pipeline/extraction_pipeline.rb +146 -0
- data/lib/fact_db/pipeline/resolution_pipeline.rb +129 -0
- data/lib/fact_db/resolution/entity_resolver.rb +261 -0
- data/lib/fact_db/resolution/fact_resolver.rb +259 -0
- data/lib/fact_db/services/content_service.rb +93 -0
- data/lib/fact_db/services/entity_service.rb +150 -0
- data/lib/fact_db/services/fact_service.rb +193 -0
- data/lib/fact_db/temporal/query.rb +125 -0
- data/lib/fact_db/temporal/timeline.rb +134 -0
- data/lib/fact_db/version.rb +5 -0
- data/lib/fact_db.rb +141 -0
- data/mkdocs.yml +198 -0
- metadata +288 -0
|
@@ -0,0 +1,364 @@
|
|
|
1
|
+
# Entity Management
|
|
2
|
+
|
|
3
|
+
Entities represent real-world things mentioned in facts - people, organizations, places, and more.
|
|
4
|
+
|
|
5
|
+
## Creating Entities
|
|
6
|
+
|
|
7
|
+
### Basic Creation
|
|
8
|
+
|
|
9
|
+
```ruby
|
|
10
|
+
facts = FactDb.new
|
|
11
|
+
|
|
12
|
+
person = facts.entity_service.create(
|
|
13
|
+
"Paula Chen",
|
|
14
|
+
type: :person
|
|
15
|
+
)
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
### With Aliases
|
|
19
|
+
|
|
20
|
+
```ruby
|
|
21
|
+
person = facts.entity_service.create(
|
|
22
|
+
"Paula Chen",
|
|
23
|
+
type: :person,
|
|
24
|
+
aliases: ["Paula", "P. Chen", "Chen, Paula"]
|
|
25
|
+
)
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
### With Metadata
|
|
29
|
+
|
|
30
|
+
```ruby
|
|
31
|
+
person = facts.entity_service.create(
|
|
32
|
+
"Paula Chen",
|
|
33
|
+
type: :person,
|
|
34
|
+
aliases: ["Paula"],
|
|
35
|
+
metadata: {
|
|
36
|
+
employee_id: "E12345",
|
|
37
|
+
department: "Engineering",
|
|
38
|
+
start_date: "2024-01-10"
|
|
39
|
+
}
|
|
40
|
+
)
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Entity Types
|
|
44
|
+
|
|
45
|
+
| Type | Description | Examples |
|
|
46
|
+
|------|-------------|----------|
|
|
47
|
+
| `:person` | Individual people | Paula Chen, John Smith |
|
|
48
|
+
| `:organization` | Companies, teams | Microsoft, Platform Team |
|
|
49
|
+
| `:place` | Locations | San Francisco, Building A |
|
|
50
|
+
| `:product` | Products, services | Windows 11, Azure |
|
|
51
|
+
| `:event` | Named events | Q4 Earnings, Annual Review |
|
|
52
|
+
|
|
53
|
+
```ruby
|
|
54
|
+
# Custom types are also supported
|
|
55
|
+
entity = facts.entity_service.create(
|
|
56
|
+
"TPS Report",
|
|
57
|
+
type: :document_type
|
|
58
|
+
)
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Managing Aliases
|
|
62
|
+
|
|
63
|
+
### Add Alias
|
|
64
|
+
|
|
65
|
+
```ruby
|
|
66
|
+
facts.entity_service.add_alias(
|
|
67
|
+
entity.id,
|
|
68
|
+
"P. Chen",
|
|
69
|
+
type: :abbreviation,
|
|
70
|
+
confidence: 0.95
|
|
71
|
+
)
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### Alias Types
|
|
75
|
+
|
|
76
|
+
| Type | Description |
|
|
77
|
+
|------|-------------|
|
|
78
|
+
| `nickname` | Informal names |
|
|
79
|
+
| `abbreviation` | Shortened forms |
|
|
80
|
+
| `formal` | Formal/legal names |
|
|
81
|
+
| `maiden_name` | Previous names |
|
|
82
|
+
| `trading_name` | Business aliases |
|
|
83
|
+
|
|
84
|
+
### List Aliases
|
|
85
|
+
|
|
86
|
+
```ruby
|
|
87
|
+
entity.entity_aliases.each do |alias_record|
|
|
88
|
+
puts "#{alias_record.alias_text} (#{alias_record.alias_type})"
|
|
89
|
+
puts " Confidence: #{alias_record.confidence}"
|
|
90
|
+
end
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Remove Alias
|
|
94
|
+
|
|
95
|
+
```ruby
|
|
96
|
+
facts.entity_service.remove_alias(entity.id, "Old Name")
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## Entity Resolution
|
|
100
|
+
|
|
101
|
+
### Basic Resolution
|
|
102
|
+
|
|
103
|
+
```ruby
|
|
104
|
+
# Resolve a name to an entity
|
|
105
|
+
entity = facts.resolve_entity("Paula Chen")
|
|
106
|
+
|
|
107
|
+
# Returns existing entity or nil if not found
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Type-Constrained Resolution
|
|
111
|
+
|
|
112
|
+
```ruby
|
|
113
|
+
# Only match person entities
|
|
114
|
+
person = facts.resolve_entity("Paula", type: :person)
|
|
115
|
+
|
|
116
|
+
# Only match organizations
|
|
117
|
+
org = facts.resolve_entity("Microsoft", type: :organization)
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### Resolution Strategies
|
|
121
|
+
|
|
122
|
+
The resolver tries the following strategies in order:
|
|
123
|
+
|
|
124
|
+
1. **Exact match** on canonical name
|
|
125
|
+
2. **Alias match** on registered aliases
|
|
126
|
+
3. **Fuzzy match** using Levenshtein distance
|
|
127
|
+
|
|
128
|
+
```ruby
|
|
129
|
+
# Configure fuzzy matching
|
|
130
|
+
FactDb.configure do |config|
|
|
131
|
+
config.fuzzy_match_threshold = 0.85 # 85% similarity required
|
|
132
|
+
end
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
### Batch Resolution
|
|
136
|
+
|
|
137
|
+
```ruby
|
|
138
|
+
names = ["Paula Chen", "John Smith", "Microsoft", "Seattle"]
|
|
139
|
+
|
|
140
|
+
results = facts.batch_resolve_entities(names)
|
|
141
|
+
|
|
142
|
+
results.each do |result|
|
|
143
|
+
status = result[:status] # :resolved, :not_found, :error
|
|
144
|
+
entity = result[:entity]
|
|
145
|
+
puts "#{result[:name]}: #{status} -> #{entity&.canonical_name}"
|
|
146
|
+
end
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
## Merging Entities
|
|
150
|
+
|
|
151
|
+
When duplicate entities are discovered:
|
|
152
|
+
|
|
153
|
+
```ruby
|
|
154
|
+
# Merge entity2 into entity1 (entity1 is kept)
|
|
155
|
+
facts.entity_service.merge(entity1.id, entity2.id)
|
|
156
|
+
|
|
157
|
+
# After merge:
|
|
158
|
+
entity2.reload
|
|
159
|
+
entity2.resolution_status # => "merged"
|
|
160
|
+
entity2.merged_into_id # => entity1.id
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
### What Happens on Merge
|
|
164
|
+
|
|
165
|
+
1. Entity2's `resolution_status` changes to "merged"
|
|
166
|
+
2. Entity2 points to entity1 via `merged_into_id`
|
|
167
|
+
3. Entity2's aliases are copied to entity1
|
|
168
|
+
4. All facts mentioning entity2 now also reference entity1
|
|
169
|
+
|
|
170
|
+
### Auto-Merge
|
|
171
|
+
|
|
172
|
+
Configure automatic merging for high-confidence matches:
|
|
173
|
+
|
|
174
|
+
```ruby
|
|
175
|
+
FactDb.configure do |config|
|
|
176
|
+
config.auto_merge_threshold = 0.95 # Auto-merge at 95% similarity
|
|
177
|
+
end
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
## Updating Entities
|
|
181
|
+
|
|
182
|
+
### Update Canonical Name
|
|
183
|
+
|
|
184
|
+
```ruby
|
|
185
|
+
facts.entity_service.update(
|
|
186
|
+
entity.id,
|
|
187
|
+
canonical_name: "Paula M. Chen"
|
|
188
|
+
)
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
### Update Metadata
|
|
192
|
+
|
|
193
|
+
```ruby
|
|
194
|
+
facts.entity_service.update(
|
|
195
|
+
entity.id,
|
|
196
|
+
metadata: entity.metadata.merge(title: "Senior Principal Engineer")
|
|
197
|
+
)
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
### Change Type
|
|
201
|
+
|
|
202
|
+
```ruby
|
|
203
|
+
# Reclassify entity type
|
|
204
|
+
facts.entity_service.update(
|
|
205
|
+
entity.id,
|
|
206
|
+
entity_type: :organization
|
|
207
|
+
)
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
## Resolution Status
|
|
211
|
+
|
|
212
|
+
| Status | Description |
|
|
213
|
+
|--------|-------------|
|
|
214
|
+
| `unresolved` | Entity created but not confirmed |
|
|
215
|
+
| `resolved` | Entity identity confirmed |
|
|
216
|
+
| `merged` | Entity merged into another |
|
|
217
|
+
|
|
218
|
+
### Mark as Resolved
|
|
219
|
+
|
|
220
|
+
```ruby
|
|
221
|
+
facts.entity_service.update(
|
|
222
|
+
entity.id,
|
|
223
|
+
resolution_status: :resolved
|
|
224
|
+
)
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
### Find Unresolved
|
|
228
|
+
|
|
229
|
+
```ruby
|
|
230
|
+
unresolved = FactDb::Models::Entity
|
|
231
|
+
.where(resolution_status: 'unresolved')
|
|
232
|
+
.order(created_at: :desc)
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
## Querying Entities
|
|
236
|
+
|
|
237
|
+
### Find by ID
|
|
238
|
+
|
|
239
|
+
```ruby
|
|
240
|
+
entity = facts.entity_service.find(entity_id)
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
### Search by Name
|
|
244
|
+
|
|
245
|
+
```ruby
|
|
246
|
+
entities = facts.entity_service.search("Paula")
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
### Filter by Type
|
|
250
|
+
|
|
251
|
+
```ruby
|
|
252
|
+
people = FactDb::Models::Entity
|
|
253
|
+
.where(entity_type: 'person')
|
|
254
|
+
.where.not(resolution_status: 'merged')
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
### Find Entities in Content
|
|
258
|
+
|
|
259
|
+
```ruby
|
|
260
|
+
# Find all entities mentioned in a piece of content
|
|
261
|
+
entities = facts.entity_service.in_content(content.id)
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
### Find Related Entities
|
|
265
|
+
|
|
266
|
+
```ruby
|
|
267
|
+
# Entities mentioned in facts about Paula
|
|
268
|
+
related = facts.entity_service.related_to(paula.id)
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
## Semantic Search
|
|
272
|
+
|
|
273
|
+
Search entities by meaning:
|
|
274
|
+
|
|
275
|
+
```ruby
|
|
276
|
+
# Find entities similar to a description
|
|
277
|
+
similar = facts.entity_service.semantic_search(
|
|
278
|
+
"software engineering leadership",
|
|
279
|
+
type: :person,
|
|
280
|
+
limit: 10
|
|
281
|
+
)
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
## Best Practices
|
|
285
|
+
|
|
286
|
+
### 1. Use Comprehensive Aliases
|
|
287
|
+
|
|
288
|
+
```ruby
|
|
289
|
+
entity = facts.entity_service.create(
|
|
290
|
+
"International Business Machines Corporation",
|
|
291
|
+
type: :organization,
|
|
292
|
+
aliases: [
|
|
293
|
+
"IBM",
|
|
294
|
+
"Big Blue",
|
|
295
|
+
"International Business Machines",
|
|
296
|
+
"IBM Corp",
|
|
297
|
+
"IBM Corporation"
|
|
298
|
+
]
|
|
299
|
+
)
|
|
300
|
+
```
|
|
301
|
+
|
|
302
|
+
### 2. Store Relevant Metadata
|
|
303
|
+
|
|
304
|
+
```ruby
|
|
305
|
+
person = facts.entity_service.create(
|
|
306
|
+
"Paula Chen",
|
|
307
|
+
type: :person,
|
|
308
|
+
metadata: {
|
|
309
|
+
# Stable identifiers
|
|
310
|
+
employee_id: "E12345",
|
|
311
|
+
linkedin_url: "linkedin.com/in/paulachen",
|
|
312
|
+
|
|
313
|
+
# Useful context
|
|
314
|
+
department: "Engineering",
|
|
315
|
+
location: "San Francisco"
|
|
316
|
+
}
|
|
317
|
+
)
|
|
318
|
+
```
|
|
319
|
+
|
|
320
|
+
### 3. Review Unresolved Entities
|
|
321
|
+
|
|
322
|
+
```ruby
|
|
323
|
+
# Periodically review unresolved entities
|
|
324
|
+
unresolved = FactDb::Models::Entity
|
|
325
|
+
.where(resolution_status: 'unresolved')
|
|
326
|
+
.where('created_at < ?', 1.week.ago)
|
|
327
|
+
|
|
328
|
+
unresolved.each do |entity|
|
|
329
|
+
# Try to find duplicates
|
|
330
|
+
similar = facts.entity_service.search(entity.canonical_name)
|
|
331
|
+
if similar.count > 1
|
|
332
|
+
puts "Potential duplicate: #{entity.canonical_name}"
|
|
333
|
+
end
|
|
334
|
+
end
|
|
335
|
+
```
|
|
336
|
+
|
|
337
|
+
### 4. Handle Merged Entities
|
|
338
|
+
|
|
339
|
+
```ruby
|
|
340
|
+
# When querying, exclude merged entities
|
|
341
|
+
active_entities = FactDb::Models::Entity
|
|
342
|
+
.where.not(resolution_status: 'merged')
|
|
343
|
+
|
|
344
|
+
# Or follow the merge chain
|
|
345
|
+
def canonical_entity(entity)
|
|
346
|
+
while entity.merged_into_id
|
|
347
|
+
entity = FactDb::Models::Entity.find(entity.merged_into_id)
|
|
348
|
+
end
|
|
349
|
+
entity
|
|
350
|
+
end
|
|
351
|
+
```
|
|
352
|
+
|
|
353
|
+
### 5. Validate Entity Types
|
|
354
|
+
|
|
355
|
+
```ruby
|
|
356
|
+
VALID_TYPES = %i[person organization place product event].freeze
|
|
357
|
+
|
|
358
|
+
def create_entity(name, type:)
|
|
359
|
+
unless VALID_TYPES.include?(type.to_sym)
|
|
360
|
+
raise ArgumentError, "Invalid entity type: #{type}"
|
|
361
|
+
end
|
|
362
|
+
facts.entity_service.create(name, type: type)
|
|
363
|
+
end
|
|
364
|
+
```
|
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
# Extracting Facts
|
|
2
|
+
|
|
3
|
+
Facts are extracted from content using one of three methods: manual, LLM-powered, or rule-based.
|
|
4
|
+
|
|
5
|
+
## Extraction Methods
|
|
6
|
+
|
|
7
|
+
### Manual Extraction
|
|
8
|
+
|
|
9
|
+
Create facts directly via the API:
|
|
10
|
+
|
|
11
|
+
```ruby
|
|
12
|
+
facts = FactDb.new
|
|
13
|
+
|
|
14
|
+
# Create entities first
|
|
15
|
+
paula = facts.entity_service.create("Paula Chen", type: :person)
|
|
16
|
+
microsoft = facts.entity_service.create("Microsoft", type: :organization)
|
|
17
|
+
|
|
18
|
+
# Create fact with explicit links
|
|
19
|
+
fact = facts.fact_service.create(
|
|
20
|
+
"Paula Chen joined Microsoft as Principal Engineer",
|
|
21
|
+
valid_at: Date.parse("2024-01-10"),
|
|
22
|
+
mentions: [
|
|
23
|
+
{ entity: paula, role: "subject", text: "Paula Chen" },
|
|
24
|
+
{ entity: microsoft, role: "organization", text: "Microsoft" }
|
|
25
|
+
],
|
|
26
|
+
sources: [
|
|
27
|
+
{ content: content, type: "primary", excerpt: "...accepted the offer..." }
|
|
28
|
+
]
|
|
29
|
+
)
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
### LLM Extraction
|
|
33
|
+
|
|
34
|
+
Use AI to automatically extract facts:
|
|
35
|
+
|
|
36
|
+
```ruby
|
|
37
|
+
# Configure LLM
|
|
38
|
+
FactDb.configure do |config|
|
|
39
|
+
config.llm_provider = :openai
|
|
40
|
+
config.llm_api_key = ENV['OPENAI_API_KEY']
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
facts = FactDb.new
|
|
44
|
+
|
|
45
|
+
# Extract facts from content
|
|
46
|
+
extracted = facts.extract_facts(content.id, extractor: :llm)
|
|
47
|
+
|
|
48
|
+
extracted.each do |fact|
|
|
49
|
+
puts fact.fact_text
|
|
50
|
+
puts " Valid from: #{fact.valid_at}"
|
|
51
|
+
puts " Entities: #{fact.entity_mentions.map(&:entity).map(&:canonical_name)}"
|
|
52
|
+
end
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Rule-Based Extraction
|
|
56
|
+
|
|
57
|
+
Use regex patterns for structured content:
|
|
58
|
+
|
|
59
|
+
```ruby
|
|
60
|
+
extracted = facts.extract_facts(content.id, extractor: :rule_based)
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
The rule-based extractor includes patterns for:
|
|
64
|
+
|
|
65
|
+
- Dates and time references
|
|
66
|
+
- Employment events (joined, promoted, left)
|
|
67
|
+
- Title/role changes
|
|
68
|
+
- Location references
|
|
69
|
+
- Organizational relationships
|
|
70
|
+
|
|
71
|
+
## Setting Default Extractor
|
|
72
|
+
|
|
73
|
+
```ruby
|
|
74
|
+
FactDb.configure do |config|
|
|
75
|
+
config.default_extractor = :llm # or :manual, :rule_based
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
# Uses configured default
|
|
79
|
+
extracted = facts.extract_facts(content.id)
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Fact Structure
|
|
83
|
+
|
|
84
|
+
Every extracted fact includes:
|
|
85
|
+
|
|
86
|
+
```ruby
|
|
87
|
+
fact = Models::Fact.new(
|
|
88
|
+
fact_text: "Paula Chen is Principal Engineer at Microsoft",
|
|
89
|
+
fact_hash: "sha256...", # For deduplication
|
|
90
|
+
valid_at: Time.parse("2024-01-10"),
|
|
91
|
+
invalid_at: nil, # nil = currently valid
|
|
92
|
+
status: "canonical", # canonical, superseded, corroborated, synthesized
|
|
93
|
+
confidence: 0.95, # Extraction confidence
|
|
94
|
+
extraction_method: "llm", # manual, llm, rule_based
|
|
95
|
+
metadata: {} # Additional data
|
|
96
|
+
)
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## Entity Mentions
|
|
100
|
+
|
|
101
|
+
Facts link to entities via mentions:
|
|
102
|
+
|
|
103
|
+
```ruby
|
|
104
|
+
fact.add_mention(
|
|
105
|
+
entity: paula,
|
|
106
|
+
text: "Paula Chen", # How entity was mentioned
|
|
107
|
+
role: "subject", # Role in the fact
|
|
108
|
+
confidence: 0.95 # Resolution confidence
|
|
109
|
+
)
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
### Mention Roles
|
|
113
|
+
|
|
114
|
+
| Role | Description | Example |
|
|
115
|
+
|------|-------------|---------|
|
|
116
|
+
| `subject` | Primary actor | "Paula joined..." |
|
|
117
|
+
| `object` | Target | "...hired Paula" |
|
|
118
|
+
| `organization` | Company/team | "...at Microsoft" |
|
|
119
|
+
| `location` | Place | "...in Seattle" |
|
|
120
|
+
| `role` | Title/position | "...as Engineer" |
|
|
121
|
+
| `temporal` | Time reference | "...in Q4 2024" |
|
|
122
|
+
| `attribute` | Property | "...with 10 years experience" |
|
|
123
|
+
|
|
124
|
+
## Source Links
|
|
125
|
+
|
|
126
|
+
Facts link to source content:
|
|
127
|
+
|
|
128
|
+
```ruby
|
|
129
|
+
fact.add_source(
|
|
130
|
+
content: email_content,
|
|
131
|
+
type: "primary",
|
|
132
|
+
excerpt: "Paula has accepted our offer to join as Principal Engineer...",
|
|
133
|
+
confidence: 0.95
|
|
134
|
+
)
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### Source Types
|
|
138
|
+
|
|
139
|
+
| Type | Description |
|
|
140
|
+
|------|-------------|
|
|
141
|
+
| `primary` | Direct source of the fact |
|
|
142
|
+
| `supporting` | Confirms the fact |
|
|
143
|
+
| `contradicting` | Contradicts the fact |
|
|
144
|
+
|
|
145
|
+
## Batch Extraction
|
|
146
|
+
|
|
147
|
+
Process multiple content items:
|
|
148
|
+
|
|
149
|
+
```ruby
|
|
150
|
+
content_ids = [content1.id, content2.id, content3.id]
|
|
151
|
+
|
|
152
|
+
# Sequential processing
|
|
153
|
+
results = facts.batch_extract(content_ids, parallel: false)
|
|
154
|
+
|
|
155
|
+
# Parallel processing (default)
|
|
156
|
+
results = facts.batch_extract(content_ids, parallel: true)
|
|
157
|
+
|
|
158
|
+
results.each do |result|
|
|
159
|
+
puts "Content #{result[:content_id]}:"
|
|
160
|
+
puts " Facts: #{result[:facts].count}"
|
|
161
|
+
puts " Error: #{result[:error]}" if result[:error]
|
|
162
|
+
end
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
## Custom Extractors
|
|
166
|
+
|
|
167
|
+
Create custom extractors by extending the base class:
|
|
168
|
+
|
|
169
|
+
```ruby
|
|
170
|
+
class MyExtractor < FactDb::Extractors::Base
|
|
171
|
+
def extract(content)
|
|
172
|
+
extracted = []
|
|
173
|
+
|
|
174
|
+
# Your extraction logic here
|
|
175
|
+
# Parse content.raw_text
|
|
176
|
+
# Create fact records
|
|
177
|
+
|
|
178
|
+
extracted
|
|
179
|
+
end
|
|
180
|
+
end
|
|
181
|
+
|
|
182
|
+
# Use it by passing an instance as the extractor
|
|
183
|
+
facts.fact_service.extract_from_content(
|
|
184
|
+
content.id,
|
|
185
|
+
extractor: MyExtractor.new(config)
|
|
186
|
+
)
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
## Extraction Confidence
|
|
190
|
+
|
|
191
|
+
Track confidence levels:
|
|
192
|
+
|
|
193
|
+
```ruby
|
|
194
|
+
# High confidence - direct statement
|
|
195
|
+
fact = facts.fact_service.create(
|
|
196
|
+
"Paula is Principal Engineer",
|
|
197
|
+
confidence: 0.95
|
|
198
|
+
)
|
|
199
|
+
|
|
200
|
+
# Medium confidence - inferred
|
|
201
|
+
fact = facts.fact_service.create(
|
|
202
|
+
"Paula likely works in Engineering",
|
|
203
|
+
confidence: 0.7
|
|
204
|
+
)
|
|
205
|
+
|
|
206
|
+
# Low confidence - speculation
|
|
207
|
+
fact = facts.fact_service.create(
|
|
208
|
+
"Paula may be promoted soon",
|
|
209
|
+
confidence: 0.4
|
|
210
|
+
)
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
## Post-Extraction Processing
|
|
214
|
+
|
|
215
|
+
After extraction, you may want to:
|
|
216
|
+
|
|
217
|
+
### Resolve Entities
|
|
218
|
+
|
|
219
|
+
```ruby
|
|
220
|
+
extracted = facts.extract_facts(content.id, extractor: :llm)
|
|
221
|
+
|
|
222
|
+
extracted.each do |fact|
|
|
223
|
+
fact.entity_mentions.each do |mention|
|
|
224
|
+
if mention.entity.nil?
|
|
225
|
+
# Resolve unlinked mention
|
|
226
|
+
entity = facts.resolve_entity(mention.mention_text)
|
|
227
|
+
mention.update!(entity: entity) if entity
|
|
228
|
+
end
|
|
229
|
+
end
|
|
230
|
+
end
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
### Detect Conflicts
|
|
234
|
+
|
|
235
|
+
```ruby
|
|
236
|
+
conflicts = facts.fact_service.resolver.find_conflicts(
|
|
237
|
+
entity_id: paula.id
|
|
238
|
+
)
|
|
239
|
+
|
|
240
|
+
conflicts.each do |conflict|
|
|
241
|
+
puts "Conflict between:"
|
|
242
|
+
puts " #{conflict[:fact1].fact_text}"
|
|
243
|
+
puts " #{conflict[:fact2].fact_text}"
|
|
244
|
+
end
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
### Corroborate Facts
|
|
248
|
+
|
|
249
|
+
```ruby
|
|
250
|
+
# If multiple sources say the same thing
|
|
251
|
+
if fact1.fact_text.similar_to?(fact2.fact_text)
|
|
252
|
+
facts.fact_service.resolver.corroborate(fact1.id, fact2.id)
|
|
253
|
+
end
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
## Best Practices
|
|
257
|
+
|
|
258
|
+
### 1. Review LLM Extractions
|
|
259
|
+
|
|
260
|
+
```ruby
|
|
261
|
+
extracted = facts.extract_facts(content.id, extractor: :llm)
|
|
262
|
+
|
|
263
|
+
extracted.select { |f| f.confidence < 0.8 }.each do |fact|
|
|
264
|
+
# Flag for human review
|
|
265
|
+
fact.update!(metadata: fact.metadata.merge(needs_review: true))
|
|
266
|
+
end
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
### 2. Validate Temporal Information
|
|
270
|
+
|
|
271
|
+
```ruby
|
|
272
|
+
# Ensure valid_at is reasonable
|
|
273
|
+
if fact.valid_at > Time.current
|
|
274
|
+
logger.warn "Future date detected: #{fact.valid_at}"
|
|
275
|
+
end
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
### 3. Link Sources
|
|
279
|
+
|
|
280
|
+
```ruby
|
|
281
|
+
# Always link facts to their sources
|
|
282
|
+
fact = facts.fact_service.create(
|
|
283
|
+
"Important fact",
|
|
284
|
+
valid_at: Date.today,
|
|
285
|
+
sources: [{ content: source_content, type: "primary" }]
|
|
286
|
+
)
|
|
287
|
+
```
|
|
288
|
+
|
|
289
|
+
### 4. Handle Extraction Errors
|
|
290
|
+
|
|
291
|
+
```ruby
|
|
292
|
+
begin
|
|
293
|
+
extracted = facts.extract_facts(content.id, extractor: :llm)
|
|
294
|
+
rescue FactDb::ExtractionError => e
|
|
295
|
+
logger.error "Extraction failed: #{e.message}"
|
|
296
|
+
# Fall back to another extractor (rule-based here; manual is also an option)
|
|
297
|
+
extracted = facts.extract_facts(content.id, extractor: :rule_based)
|
|
298
|
+
end
|
|
299
|
+
```
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# Guides
|
|
2
|
+
|
|
3
|
+
Practical guides for using FactDb in your applications.
|
|
4
|
+
|
|
5
|
+
## Configuration
|
|
6
|
+
|
|
7
|
+
- [Configuration](configuration.md) - Configure database, LLM, and extraction settings
|
|
8
|
+
|
|
9
|
+
## Working with Data
|
|
10
|
+
|
|
11
|
+
- [Ingesting Content](ingesting-content.md) - Import documents, emails, and other content
|
|
12
|
+
- [Extracting Facts](extracting-facts.md) - Extract facts using different methods
|
|
13
|
+
- [LLM Integration](llm-integration.md) - Set up AI-powered extraction
|
|
14
|
+
- [Temporal Queries](temporal-queries.md) - Query facts across time
|
|
15
|
+
|
|
16
|
+
## Entity Management
|
|
17
|
+
|
|
18
|
+
- [Entity Management](entity-management.md) - Create, resolve, and merge entities
|
|
19
|
+
|
|
20
|
+
## Performance
|
|
21
|
+
|
|
22
|
+
- [Batch Processing](batch-processing.md) - Process content in parallel
|