fact_db 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.envrc +1 -0
- data/CHANGELOG.md +48 -0
- data/COMMITS.md +196 -0
- data/README.md +102 -0
- data/Rakefile +41 -0
- data/db/migrate/001_enable_extensions.rb +7 -0
- data/db/migrate/002_create_contents.rb +44 -0
- data/db/migrate/003_create_entities.rb +36 -0
- data/db/migrate/004_create_entity_aliases.rb +18 -0
- data/db/migrate/005_create_facts.rb +65 -0
- data/db/migrate/006_create_entity_mentions.rb +18 -0
- data/db/migrate/007_create_fact_sources.rb +18 -0
- data/docs/api/extractors/index.md +71 -0
- data/docs/api/extractors/llm.md +162 -0
- data/docs/api/extractors/manual.md +92 -0
- data/docs/api/extractors/rule-based.md +165 -0
- data/docs/api/facts.md +300 -0
- data/docs/api/index.md +66 -0
- data/docs/api/models/content.md +165 -0
- data/docs/api/models/entity.md +202 -0
- data/docs/api/models/fact.md +270 -0
- data/docs/api/models/index.md +77 -0
- data/docs/api/pipeline/extraction.md +175 -0
- data/docs/api/pipeline/index.md +72 -0
- data/docs/api/pipeline/resolution.md +209 -0
- data/docs/api/services/content-service.md +166 -0
- data/docs/api/services/entity-service.md +202 -0
- data/docs/api/services/fact-service.md +223 -0
- data/docs/api/services/index.md +55 -0
- data/docs/architecture/database-schema.md +293 -0
- data/docs/architecture/entity-resolution.md +293 -0
- data/docs/architecture/index.md +149 -0
- data/docs/architecture/temporal-facts.md +268 -0
- data/docs/architecture/three-layer-model.md +242 -0
- data/docs/assets/css/custom.css +137 -0
- data/docs/assets/fact_db.jpg +0 -0
- data/docs/assets/images/fact_db.jpg +0 -0
- data/docs/concepts.md +183 -0
- data/docs/examples/basic-usage.md +235 -0
- data/docs/examples/hr-onboarding.md +312 -0
- data/docs/examples/index.md +64 -0
- data/docs/examples/news-analysis.md +288 -0
- data/docs/getting-started/database-setup.md +170 -0
- data/docs/getting-started/index.md +71 -0
- data/docs/getting-started/installation.md +98 -0
- data/docs/getting-started/quick-start.md +191 -0
- data/docs/guides/batch-processing.md +325 -0
- data/docs/guides/configuration.md +243 -0
- data/docs/guides/entity-management.md +364 -0
- data/docs/guides/extracting-facts.md +299 -0
- data/docs/guides/index.md +22 -0
- data/docs/guides/ingesting-content.md +252 -0
- data/docs/guides/llm-integration.md +299 -0
- data/docs/guides/temporal-queries.md +315 -0
- data/docs/index.md +121 -0
- data/examples/README.md +130 -0
- data/examples/basic_usage.rb +164 -0
- data/examples/entity_management.rb +216 -0
- data/examples/hr_system.rb +428 -0
- data/examples/rule_based_extraction.rb +258 -0
- data/examples/temporal_queries.rb +245 -0
- data/lib/fact_db/config.rb +71 -0
- data/lib/fact_db/database.rb +45 -0
- data/lib/fact_db/errors.rb +10 -0
- data/lib/fact_db/extractors/base.rb +117 -0
- data/lib/fact_db/extractors/llm_extractor.rb +179 -0
- data/lib/fact_db/extractors/manual_extractor.rb +53 -0
- data/lib/fact_db/extractors/rule_based_extractor.rb +228 -0
- data/lib/fact_db/llm/adapter.rb +109 -0
- data/lib/fact_db/models/content.rb +62 -0
- data/lib/fact_db/models/entity.rb +84 -0
- data/lib/fact_db/models/entity_alias.rb +26 -0
- data/lib/fact_db/models/entity_mention.rb +33 -0
- data/lib/fact_db/models/fact.rb +192 -0
- data/lib/fact_db/models/fact_source.rb +35 -0
- data/lib/fact_db/pipeline/extraction_pipeline.rb +146 -0
- data/lib/fact_db/pipeline/resolution_pipeline.rb +129 -0
- data/lib/fact_db/resolution/entity_resolver.rb +261 -0
- data/lib/fact_db/resolution/fact_resolver.rb +259 -0
- data/lib/fact_db/services/content_service.rb +93 -0
- data/lib/fact_db/services/entity_service.rb +150 -0
- data/lib/fact_db/services/fact_service.rb +193 -0
- data/lib/fact_db/temporal/query.rb +125 -0
- data/lib/fact_db/temporal/timeline.rb +134 -0
- data/lib/fact_db/version.rb +5 -0
- data/lib/fact_db.rb +141 -0
- data/mkdocs.yml +198 -0
- metadata +288 -0
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
# Architecture
|
|
2
|
+
|
|
3
|
+
FactDb is designed around the Event Clock concept - a system for capturing organizational reasoning through temporal facts with full audit trails.
|
|
4
|
+
|
|
5
|
+
## Design Principles
|
|
6
|
+
|
|
7
|
+
### Immutable Content
|
|
8
|
+
|
|
9
|
+
Source content is never modified after ingestion. This ensures:
|
|
10
|
+
|
|
11
|
+
- Complete audit trails
|
|
12
|
+
- Reproducible fact extraction
|
|
13
|
+
- Historical accuracy
|
|
14
|
+
|
|
15
|
+
### Temporal First
|
|
16
|
+
|
|
17
|
+
Every fact has temporal bounds (`valid_at`, `invalid_at`). This enables:
|
|
18
|
+
|
|
19
|
+
- Point-in-time queries
|
|
20
|
+
- Change tracking
|
|
21
|
+
- Historical analysis
|
|
22
|
+
|
|
23
|
+
### Entity Resolution
|
|
24
|
+
|
|
25
|
+
Names and mentions are resolved to canonical entities:
|
|
26
|
+
|
|
27
|
+
- Reduces ambiguity
|
|
28
|
+
- Enables cross-referencing
|
|
29
|
+
- Supports alias matching
|
|
30
|
+
|
|
31
|
+
### Provenance
|
|
32
|
+
|
|
33
|
+
Every fact links back to source content:
|
|
34
|
+
|
|
35
|
+
- Verifiable assertions
|
|
36
|
+
- Confidence tracking
|
|
37
|
+
- Contradiction detection
|
|
38
|
+
|
|
39
|
+
## System Architecture
|
|
40
|
+
|
|
41
|
+
```mermaid
|
|
42
|
+
graph TB
|
|
43
|
+
subgraph Input
|
|
44
|
+
Email[Email]
|
|
45
|
+
Doc[Documents]
|
|
46
|
+
News[News]
|
|
47
|
+
API[APIs]
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
subgraph FactDb["FactDb Core"]
|
|
51
|
+
CS[ContentService]
|
|
52
|
+
ES[EntityService]
|
|
53
|
+
FS[FactService]
|
|
54
|
+
|
|
55
|
+
subgraph Extractors
|
|
56
|
+
ME[ManualExtractor]
|
|
57
|
+
LE[LLMExtractor]
|
|
58
|
+
RE[RuleBasedExtractor]
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
subgraph Resolution
|
|
62
|
+
ER[EntityResolver]
|
|
63
|
+
FR[FactResolver]
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
subgraph Pipeline
|
|
67
|
+
EP[ExtractionPipeline]
|
|
68
|
+
RP[ResolutionPipeline]
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
subgraph Storage["PostgreSQL + pgvector"]
|
|
73
|
+
Contents[(Contents)]
|
|
74
|
+
Entities[(Entities)]
|
|
75
|
+
Facts[(Facts)]
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
Email --> CS
|
|
79
|
+
Doc --> CS
|
|
80
|
+
News --> CS
|
|
81
|
+
API --> CS
|
|
82
|
+
|
|
83
|
+
CS --> Contents
|
|
84
|
+
ES --> Entities
|
|
85
|
+
FS --> Facts
|
|
86
|
+
|
|
87
|
+
Extractors --> FS
|
|
88
|
+
Resolution --> ES
|
|
89
|
+
Resolution --> FS
|
|
90
|
+
Pipeline --> Extractors
|
|
91
|
+
|
|
92
|
+
style Email fill:#1E40AF,stroke:#1E3A8A,color:#FFFFFF
|
|
93
|
+
style Doc fill:#1E40AF,stroke:#1E3A8A,color:#FFFFFF
|
|
94
|
+
style News fill:#1E40AF,stroke:#1E3A8A,color:#FFFFFF
|
|
95
|
+
style API fill:#1E40AF,stroke:#1E3A8A,color:#FFFFFF
|
|
96
|
+
style CS fill:#B45309,stroke:#92400E,color:#FFFFFF
|
|
97
|
+
style ES fill:#B45309,stroke:#92400E,color:#FFFFFF
|
|
98
|
+
style FS fill:#B45309,stroke:#92400E,color:#FFFFFF
|
|
99
|
+
style ME fill:#047857,stroke:#065F46,color:#FFFFFF
|
|
100
|
+
style LE fill:#047857,stroke:#065F46,color:#FFFFFF
|
|
101
|
+
style RE fill:#047857,stroke:#065F46,color:#FFFFFF
|
|
102
|
+
style ER fill:#C2410C,stroke:#9A3412,color:#FFFFFF
|
|
103
|
+
style FR fill:#C2410C,stroke:#9A3412,color:#FFFFFF
|
|
104
|
+
style EP fill:#7C3AED,stroke:#6D28D9,color:#FFFFFF
|
|
105
|
+
style RP fill:#7C3AED,stroke:#6D28D9,color:#FFFFFF
|
|
106
|
+
style Contents fill:#B91C1C,stroke:#991B1B,color:#FFFFFF
|
|
107
|
+
style Entities fill:#B91C1C,stroke:#991B1B,color:#FFFFFF
|
|
108
|
+
style Facts fill:#B91C1C,stroke:#991B1B,color:#FFFFFF
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## Components
|
|
112
|
+
|
|
113
|
+
### Services
|
|
114
|
+
|
|
115
|
+
- **ContentService** - Ingests and manages source content
|
|
116
|
+
- **EntityService** - Creates and resolves entities
|
|
117
|
+
- **FactService** - Extracts, creates, and queries facts
|
|
118
|
+
|
|
119
|
+
### Extractors
|
|
120
|
+
|
|
121
|
+
- **ManualExtractor** - API-driven fact creation
|
|
122
|
+
- **LLMExtractor** - AI-powered extraction using OpenAI, Anthropic, etc.
|
|
123
|
+
- **RuleBasedExtractor** - Pattern matching with regular expressions
|
|
124
|
+
|
|
125
|
+
### Resolution
|
|
126
|
+
|
|
127
|
+
- **EntityResolver** - Matches mentions to canonical entities
|
|
128
|
+
- **FactResolver** - Handles supersession, synthesis, and conflicts
|
|
129
|
+
|
|
130
|
+
### Pipeline
|
|
131
|
+
|
|
132
|
+
- **ExtractionPipeline** - Concurrent fact extraction using SimpleFlow
|
|
133
|
+
- **ResolutionPipeline** - Parallel entity resolution
|
|
134
|
+
|
|
135
|
+
## Data Flow
|
|
136
|
+
|
|
137
|
+
1. **Ingest** - Raw content enters the system
|
|
138
|
+
2. **Store** - Content is hashed, deduplicated, and stored
|
|
139
|
+
3. **Extract** - Facts are extracted via chosen method
|
|
140
|
+
4. **Resolve** - Entity mentions are resolved to canonical entities
|
|
141
|
+
5. **Link** - Facts are linked to sources and entities
|
|
142
|
+
6. **Query** - Temporal queries retrieve relevant facts
|
|
143
|
+
|
|
144
|
+
## Documentation
|
|
145
|
+
|
|
146
|
+
- [Three-Layer Model](three-layer-model.md) - Content, Entities, Facts
|
|
147
|
+
- [Temporal Facts](temporal-facts.md) - Validity periods and status
|
|
148
|
+
- [Entity Resolution](entity-resolution.md) - Resolving mentions
|
|
149
|
+
- [Database Schema](database-schema.md) - Table structure
|
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
# Temporal Facts
|
|
2
|
+
|
|
3
|
+
Temporal facts are the core innovation of FactDb - assertions with explicit validity periods that enable point-in-time queries.
|
|
4
|
+
|
|
5
|
+
## Validity Periods
|
|
6
|
+
|
|
7
|
+
Every fact has two timestamps:
|
|
8
|
+
|
|
9
|
+
```ruby
|
|
10
|
+
{
|
|
11
|
+
valid_at: "2024-01-10T00:00:00Z", # When fact became true
|
|
12
|
+
invalid_at: "2024-06-01T00:00:00Z" # When fact stopped being true (or nil)
|
|
13
|
+
}
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
### Currently Valid
|
|
17
|
+
|
|
18
|
+
Facts with `invalid_at: nil` are currently valid:
|
|
19
|
+
|
|
20
|
+
```ruby
|
|
21
|
+
# Paula is currently a Principal Engineer
|
|
22
|
+
{
|
|
23
|
+
fact_text: "Paula Chen is Principal Engineer",
|
|
24
|
+
valid_at: "2024-01-10",
|
|
25
|
+
invalid_at: nil
|
|
26
|
+
}
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
### Historical
|
|
30
|
+
|
|
31
|
+
Facts with both dates represent historical periods:
|
|
32
|
+
|
|
33
|
+
```ruby
|
|
34
|
+
# Paula was Senior Engineer before promotion
|
|
35
|
+
{
|
|
36
|
+
fact_text: "Paula Chen is Senior Engineer",
|
|
37
|
+
valid_at: "2022-03-15",
|
|
38
|
+
invalid_at: "2024-01-10"
|
|
39
|
+
}
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Temporal Queries
|
|
43
|
+
|
|
44
|
+
### Current Facts
|
|
45
|
+
|
|
46
|
+
```ruby
|
|
47
|
+
# Get facts valid right now
|
|
48
|
+
current = facts.query_facts(
|
|
49
|
+
entity: paula.id,
|
|
50
|
+
status: :canonical
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
# Shorthand
|
|
54
|
+
current = facts.current_facts_for(paula.id)
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Point-in-Time
|
|
58
|
+
|
|
59
|
+
```ruby
|
|
60
|
+
# What was true on a specific date?
|
|
61
|
+
queried = facts.facts_at(
|
|
62
|
+
Date.parse("2023-06-15"),
|
|
63
|
+
entity: paula.id
|
|
64
|
+
)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### Time Range
|
|
68
|
+
|
|
69
|
+
```ruby
|
|
70
|
+
# Facts active during a period
|
|
71
|
+
queried = facts.fact_service.query(
|
|
72
|
+
entity: paula.id,
|
|
73
|
+
from: Date.parse("2023-01-01"),
|
|
74
|
+
to: Date.parse("2023-12-31")
|
|
75
|
+
)
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## Fact Status
|
|
79
|
+
|
|
80
|
+
Facts move through different statuses as information evolves:
|
|
81
|
+
|
|
82
|
+
```mermaid
|
|
83
|
+
stateDiagram-v2
|
|
84
|
+
[*] --> canonical: Extracted
|
|
85
|
+
canonical --> corroborated: 2+ sources confirm
|
|
86
|
+
canonical --> superseded: New info replaces
|
|
87
|
+
corroborated --> superseded: Later replaced
|
|
88
|
+
canonical --> synthesized: Combined
|
|
89
|
+
superseded --> [*]
|
|
90
|
+
|
|
91
|
+
classDef blue fill:#1E40AF,stroke:#1E3A8A,color:#FFFFFF
|
|
92
|
+
classDef green fill:#047857,stroke:#065F46,color:#FFFFFF
|
|
93
|
+
classDef red fill:#B91C1C,stroke:#991B1B,color:#FFFFFF
|
|
94
|
+
classDef yellow fill:#B45309,stroke:#92400E,color:#FFFFFF
|
|
95
|
+
|
|
96
|
+
class canonical blue
|
|
97
|
+
class corroborated green
|
|
98
|
+
class superseded red
|
|
99
|
+
class synthesized yellow
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### Canonical
|
|
103
|
+
|
|
104
|
+
The current authoritative version of a fact:
|
|
105
|
+
|
|
106
|
+
```ruby
|
|
107
|
+
fact = facts.fact_service.create(
|
|
108
|
+
"Paula Chen is Principal Engineer",
|
|
109
|
+
valid_at: Date.parse("2024-01-10"),
|
|
110
|
+
mentions: [{ entity: paula, role: "subject" }]
|
|
111
|
+
)
|
|
112
|
+
# fact.status => "canonical"
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
### Superseded
|
|
116
|
+
|
|
117
|
+
When information changes, old facts are superseded:
|
|
118
|
+
|
|
119
|
+
```ruby
|
|
120
|
+
# Paula gets promoted
|
|
121
|
+
new_fact = facts.fact_service.resolver.supersede(
|
|
122
|
+
old_fact.id,
|
|
123
|
+
"Paula Chen is Senior Principal Engineer",
|
|
124
|
+
valid_at: Date.parse("2024-06-01")
|
|
125
|
+
)
|
|
126
|
+
|
|
127
|
+
# old_fact.status => "superseded"
|
|
128
|
+
# old_fact.invalid_at => "2024-06-01"
|
|
129
|
+
# old_fact.superseded_by_id => new_fact.id
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### Corroborated
|
|
133
|
+
|
|
134
|
+
Facts confirmed by multiple independent sources:
|
|
135
|
+
|
|
136
|
+
```ruby
|
|
137
|
+
# Two sources say the same thing
|
|
138
|
+
facts.fact_service.resolver.corroborate(fact.id, other_fact.id)
|
|
139
|
+
facts.fact_service.resolver.corroborate(fact.id, third_fact.id)
|
|
140
|
+
|
|
141
|
+
# After 2+ corroborations
|
|
142
|
+
fact.reload
|
|
143
|
+
# fact.status => "corroborated"
|
|
144
|
+
# fact.corroborated_by_ids => [other_fact.id, third_fact.id]
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
### Synthesized
|
|
148
|
+
|
|
149
|
+
Synthesized facts are derived by combining information from multiple existing facts:
|
|
150
|
+
|
|
151
|
+
```ruby
|
|
152
|
+
synthesized = facts.fact_service.resolver.synthesize(
|
|
153
|
+
[fact1.id, fact2.id, fact3.id],
|
|
154
|
+
"Paula worked at Microsoft from Jan 2024 to present, starting as Principal Engineer",
|
|
155
|
+
valid_at: Date.parse("2024-01-10")
|
|
156
|
+
)
|
|
157
|
+
|
|
158
|
+
# synthesized.status => "synthesized"
|
|
159
|
+
# synthesized.derived_from_ids => [fact1.id, fact2.id, fact3.id]
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
## Timelines
|
|
163
|
+
|
|
164
|
+
Build complete timelines for entities:
|
|
165
|
+
|
|
166
|
+
```ruby
|
|
167
|
+
timeline = facts.timeline_for(paula.id)
|
|
168
|
+
|
|
169
|
+
# Returns chronological list of facts
|
|
170
|
+
timeline.each do |entry|
|
|
171
|
+
puts "#{entry.valid_at}: #{entry.fact_text}"
|
|
172
|
+
puts " Until: #{entry.invalid_at || 'present'}"
|
|
173
|
+
end
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
### Timeline Example
|
|
177
|
+
|
|
178
|
+
```
|
|
179
|
+
2022-03-15: Paula Chen joined Company as Software Engineer
|
|
180
|
+
Until: 2023-01-10
|
|
181
|
+
|
|
182
|
+
2023-01-10: Paula Chen promoted to Senior Engineer
|
|
183
|
+
Until: 2024-01-10
|
|
184
|
+
|
|
185
|
+
2024-01-10: Paula Chen is Principal Engineer at Microsoft
|
|
186
|
+
Until: present
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
## Conflict Detection
|
|
190
|
+
|
|
191
|
+
FactDb can detect potentially conflicting facts:
|
|
192
|
+
|
|
193
|
+
```ruby
|
|
194
|
+
conflicts = facts.fact_service.resolver.find_conflicts(
|
|
195
|
+
entity_id: paula.id,
|
|
196
|
+
topic: "title"
|
|
197
|
+
)
|
|
198
|
+
|
|
199
|
+
conflicts.each do |conflict|
|
|
200
|
+
puts "Potential conflict:"
|
|
201
|
+
puts " Fact 1: #{conflict[:fact1].fact_text}"
|
|
202
|
+
puts " Fact 2: #{conflict[:fact2].fact_text}"
|
|
203
|
+
puts " Similarity: #{conflict[:similarity]}"
|
|
204
|
+
end
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
### Resolving Conflicts
|
|
208
|
+
|
|
209
|
+
```ruby
|
|
210
|
+
# Keep one fact, supersede the others
|
|
211
|
+
facts.fact_service.resolver.resolve_conflict(
|
|
212
|
+
keep_fact_id: correct_fact.id,
|
|
213
|
+
supersede_fact_ids: [wrong_fact.id],
|
|
214
|
+
reason: "Verified with HR records"
|
|
215
|
+
)
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
## Best Practices
|
|
219
|
+
|
|
220
|
+
### 1. Always Set valid_at
|
|
221
|
+
|
|
222
|
+
```ruby
|
|
223
|
+
# Good - explicit date
|
|
224
|
+
facts.fact_service.create(
|
|
225
|
+
"Paula joined the team",
|
|
226
|
+
valid_at: Date.parse("2024-01-10")
|
|
227
|
+
)
|
|
228
|
+
|
|
229
|
+
# Avoid - implicit current time
|
|
230
|
+
# valid_at defaults to Time.current if not provided
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
### 2. Supersede, Don't Delete
|
|
234
|
+
|
|
235
|
+
```ruby
|
|
236
|
+
# Good - supersede old fact
|
|
237
|
+
facts.fact_service.resolver.supersede(
|
|
238
|
+
old_fact.id,
|
|
239
|
+
"Updated information",
|
|
240
|
+
valid_at: Date.today
|
|
241
|
+
)
|
|
242
|
+
|
|
243
|
+
# Avoid - deleting facts
|
|
244
|
+
# old_fact.destroy # Loses audit trail
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
### 3. Track Confidence
|
|
248
|
+
|
|
249
|
+
```ruby
|
|
250
|
+
fact = facts.fact_service.create(
|
|
251
|
+
"Paula may be promoted soon",
|
|
252
|
+
valid_at: Date.today,
|
|
253
|
+
confidence: 0.6 # Lower confidence for speculation
|
|
254
|
+
)
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
### 4. Link Sources
|
|
258
|
+
|
|
259
|
+
```ruby
|
|
260
|
+
fact = facts.fact_service.create(
|
|
261
|
+
"Paula is Principal Engineer",
|
|
262
|
+
valid_at: Date.parse("2024-01-10"),
|
|
263
|
+
sources: [
|
|
264
|
+
{ content: email, type: "primary", excerpt: "...accepted the offer..." },
|
|
265
|
+
{ content: announcement, type: "supporting" }
|
|
266
|
+
]
|
|
267
|
+
)
|
|
268
|
+
```
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
# Three-Layer Model
|
|
2
|
+
|
|
3
|
+
FactDb organizes information into three distinct layers, each with specific responsibilities.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
```mermaid
|
|
8
|
+
graph TB
|
|
9
|
+
subgraph Layer1["Layer 1: Content"]
|
|
10
|
+
C1[Immutable Documents]
|
|
11
|
+
C2[Source Evidence]
|
|
12
|
+
C3[Captured Timestamps]
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
subgraph Layer2["Layer 2: Entities"]
|
|
16
|
+
E1[Canonical Names]
|
|
17
|
+
E2[Aliases]
|
|
18
|
+
E3[Types]
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
subgraph Layer3["Layer 3: Facts"]
|
|
22
|
+
F1[Temporal Assertions]
|
|
23
|
+
F2[Validity Periods]
|
|
24
|
+
F3[Status Tracking]
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
Layer1 --> Layer3
|
|
28
|
+
Layer2 --> Layer3
|
|
29
|
+
|
|
30
|
+
style C1 fill:#1E40AF,stroke:#1E3A8A,color:#FFFFFF
|
|
31
|
+
style C2 fill:#1E40AF,stroke:#1E3A8A,color:#FFFFFF
|
|
32
|
+
style C3 fill:#1E40AF,stroke:#1E3A8A,color:#FFFFFF
|
|
33
|
+
style E1 fill:#047857,stroke:#065F46,color:#FFFFFF
|
|
34
|
+
style E2 fill:#047857,stroke:#065F46,color:#FFFFFF
|
|
35
|
+
style E3 fill:#047857,stroke:#065F46,color:#FFFFFF
|
|
36
|
+
style F1 fill:#B91C1C,stroke:#991B1B,color:#FFFFFF
|
|
37
|
+
style F2 fill:#B91C1C,stroke:#991B1B,color:#FFFFFF
|
|
38
|
+
style F3 fill:#B91C1C,stroke:#991B1B,color:#FFFFFF
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Layer 1: Content
|
|
42
|
+
|
|
43
|
+
The content layer stores raw source material that serves as evidence for facts.
|
|
44
|
+
|
|
45
|
+
### Characteristics
|
|
46
|
+
|
|
47
|
+
| Property | Description |
|
|
48
|
+
|----------|-------------|
|
|
49
|
+
| Immutable | Content never changes after ingestion |
|
|
50
|
+
| Deduplicated | SHA256 hash prevents duplicate storage |
|
|
51
|
+
| Timestamped | `captured_at` records when content was obtained |
|
|
52
|
+
| Typed | Categories like email, document, article |
|
|
53
|
+
| Searchable | Full-text and semantic vector search |
|
|
54
|
+
|
|
55
|
+
### Content Types
|
|
56
|
+
|
|
57
|
+
```ruby
|
|
58
|
+
# Common content types
|
|
59
|
+
:email # Email messages
|
|
60
|
+
:document # General documents
|
|
61
|
+
:article # News articles
|
|
62
|
+
:transcript # Meeting transcripts
|
|
63
|
+
:report # Reports and analysis
|
|
64
|
+
:announcement # Official announcements
|
|
65
|
+
:social # Social media posts
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### Example
|
|
69
|
+
|
|
70
|
+
```ruby
|
|
71
|
+
content = facts.ingest(
|
|
72
|
+
"Paula Chen accepted the offer for Principal Engineer...",
|
|
73
|
+
type: :email,
|
|
74
|
+
title: "RE: Offer Letter - Paula Chen",
|
|
75
|
+
source_uri: "mailto:hr@company.com/12345",
|
|
76
|
+
captured_at: Time.current,
|
|
77
|
+
metadata: {
|
|
78
|
+
from: "hr@company.com",
|
|
79
|
+
to: "hiring@company.com",
|
|
80
|
+
subject: "RE: Offer Letter - Paula Chen"
|
|
81
|
+
}
|
|
82
|
+
)
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Layer 2: Entities
|
|
86
|
+
|
|
87
|
+
Entities represent real-world things mentioned in content.
|
|
88
|
+
|
|
89
|
+
### Entity Types
|
|
90
|
+
|
|
91
|
+
| Type | Description | Examples |
|
|
92
|
+
|------|-------------|----------|
|
|
93
|
+
| `person` | Individual people | Paula Chen, John Smith |
|
|
94
|
+
| `organization` | Companies, teams, groups | Microsoft, Platform Team |
|
|
95
|
+
| `place` | Locations | San Francisco, Building A |
|
|
96
|
+
| `product` | Products and services | Windows 11, Azure |
|
|
97
|
+
| `event` | Named events | Q4 Earnings, Annual Review |
|
|
98
|
+
|
|
99
|
+
### Resolution Status
|
|
100
|
+
|
|
101
|
+
```mermaid
|
|
102
|
+
stateDiagram-v2
|
|
103
|
+
[*] --> unresolved: Created
|
|
104
|
+
unresolved --> resolved: Confirmed identity
|
|
105
|
+
resolved --> merged: Duplicate found
|
|
106
|
+
merged --> [*]: Points to canonical
|
|
107
|
+
|
|
108
|
+
classDef blue fill:#1E40AF,stroke:#1E3A8A,color:#FFFFFF
|
|
109
|
+
classDef green fill:#047857,stroke:#065F46,color:#FFFFFF
|
|
110
|
+
classDef red fill:#B91C1C,stroke:#991B1B,color:#FFFFFF
|
|
111
|
+
|
|
112
|
+
class unresolved blue
|
|
113
|
+
class resolved green
|
|
114
|
+
class merged red
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
- **unresolved** - Entity created but not confirmed
|
|
118
|
+
- **resolved** - Entity identity confirmed
|
|
119
|
+
- **merged** - Entity merged into another (canonical) entity
|
|
120
|
+
|
|
121
|
+
### Aliases
|
|
122
|
+
|
|
123
|
+
Entities can have multiple aliases for flexible matching:
|
|
124
|
+
|
|
125
|
+
```ruby
|
|
126
|
+
entity = facts.entity_service.create(
|
|
127
|
+
"Paula Chen",
|
|
128
|
+
type: :person,
|
|
129
|
+
aliases: [
|
|
130
|
+
"Paula",
|
|
131
|
+
"P. Chen",
|
|
132
|
+
"Chen, Paula"
|
|
133
|
+
]
|
|
134
|
+
)
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
## Layer 3: Facts
|
|
138
|
+
|
|
139
|
+
Facts are temporal assertions about entities, extracted from content.
|
|
140
|
+
|
|
141
|
+
### Fact Structure
|
|
142
|
+
|
|
143
|
+
```ruby
|
|
144
|
+
fact = Models::Fact.new(
|
|
145
|
+
fact_text: "Paula Chen is Principal Engineer at Microsoft",
|
|
146
|
+
valid_at: Date.parse("2024-01-10"),
|
|
147
|
+
invalid_at: nil, # Still valid
|
|
148
|
+
status: "canonical",
|
|
149
|
+
confidence: 0.95,
|
|
150
|
+
extraction_method: "llm"
|
|
151
|
+
)
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
### Temporal Bounds
|
|
155
|
+
|
|
156
|
+
Every fact has:
|
|
157
|
+
|
|
158
|
+
- `valid_at` - When the fact became true (required)
|
|
159
|
+
- `invalid_at` - When the fact stopped being true (nil if current)
|
|
160
|
+
|
|
161
|
+
```ruby
|
|
162
|
+
# Currently valid fact
|
|
163
|
+
fact1 = { valid_at: "2024-01-10", invalid_at: nil }
|
|
164
|
+
|
|
165
|
+
# Historical fact
|
|
166
|
+
fact2 = { valid_at: "2023-01-01", invalid_at: "2024-01-09" }
|
|
167
|
+
|
|
168
|
+
# Point-in-time query
|
|
169
|
+
facts.facts_at(Date.parse("2023-06-15")) # Returns fact2
|
|
170
|
+
facts.facts_at(Date.parse("2024-02-01")) # Returns fact1
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
### Fact Status
|
|
174
|
+
|
|
175
|
+
| Status | Description |
|
|
176
|
+
|--------|-------------|
|
|
177
|
+
| `canonical` | Current authoritative version |
|
|
178
|
+
| `superseded` | Replaced by newer information |
|
|
179
|
+
| `corroborated` | Confirmed by multiple sources |
|
|
180
|
+
| `synthesized` | Derived from multiple facts |
|
|
181
|
+
|
|
182
|
+
### Relationships
|
|
183
|
+
|
|
184
|
+
Facts connect to both content and entities:
|
|
185
|
+
|
|
186
|
+
```mermaid
|
|
187
|
+
graph LR
|
|
188
|
+
C[Content] -->|fact_sources| F[Fact]
|
|
189
|
+
F -->|entity_mentions| E1[Entity 1]
|
|
190
|
+
F -->|entity_mentions| E2[Entity 2]
|
|
191
|
+
|
|
192
|
+
style C fill:#1E40AF,stroke:#1E3A8A,color:#FFFFFF
|
|
193
|
+
style F fill:#B91C1C,stroke:#991B1B,color:#FFFFFF
|
|
194
|
+
style E1 fill:#047857,stroke:#065F46,color:#FFFFFF
|
|
195
|
+
style E2 fill:#047857,stroke:#065F46,color:#FFFFFF
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
## Layer Interactions
|
|
199
|
+
|
|
200
|
+
### Content to Facts
|
|
201
|
+
|
|
202
|
+
Facts are extracted from content and maintain source links:
|
|
203
|
+
|
|
204
|
+
```ruby
|
|
205
|
+
# Extract facts from content
|
|
206
|
+
extracted = facts.extract_facts(content.id, extractor: :llm)
|
|
207
|
+
|
|
208
|
+
# Each fact links back to source
|
|
209
|
+
extracted.first.fact_sources.each do |source|
|
|
210
|
+
puts source.content.title
|
|
211
|
+
puts source.excerpt
|
|
212
|
+
end
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
### Entities to Facts
|
|
216
|
+
|
|
217
|
+
Facts mention entities with specific roles:
|
|
218
|
+
|
|
219
|
+
```ruby
|
|
220
|
+
fact.entity_mentions.each do |mention|
|
|
221
|
+
puts "#{mention.entity.canonical_name}: #{mention.mention_role}"
|
|
222
|
+
end
|
|
223
|
+
# Output:
|
|
224
|
+
# Paula Chen: subject
|
|
225
|
+
# Microsoft: organization
|
|
226
|
+
# Principal Engineer: role
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
### Cross-Layer Queries
|
|
230
|
+
|
|
231
|
+
Query across all layers:
|
|
232
|
+
|
|
233
|
+
```ruby
|
|
234
|
+
# Find all content about an entity
|
|
235
|
+
contents = facts.content_service.mentioning_entity(paula.id)
|
|
236
|
+
|
|
237
|
+
# Find all entities mentioned in content
|
|
238
|
+
entities = facts.entity_service.in_content(content.id)
|
|
239
|
+
|
|
240
|
+
# Find all facts from a specific source
|
|
241
|
+
source_facts = facts.fact_service.from_content(content.id)
|
|
242
|
+
```
|