fact_db 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. checksums.yaml +7 -0
  2. data/.envrc +1 -0
  3. data/CHANGELOG.md +48 -0
  4. data/COMMITS.md +196 -0
  5. data/README.md +102 -0
  6. data/Rakefile +41 -0
  7. data/db/migrate/001_enable_extensions.rb +7 -0
  8. data/db/migrate/002_create_contents.rb +44 -0
  9. data/db/migrate/003_create_entities.rb +36 -0
  10. data/db/migrate/004_create_entity_aliases.rb +18 -0
  11. data/db/migrate/005_create_facts.rb +65 -0
  12. data/db/migrate/006_create_entity_mentions.rb +18 -0
  13. data/db/migrate/007_create_fact_sources.rb +18 -0
  14. data/docs/api/extractors/index.md +71 -0
  15. data/docs/api/extractors/llm.md +162 -0
  16. data/docs/api/extractors/manual.md +92 -0
  17. data/docs/api/extractors/rule-based.md +165 -0
  18. data/docs/api/facts.md +300 -0
  19. data/docs/api/index.md +66 -0
  20. data/docs/api/models/content.md +165 -0
  21. data/docs/api/models/entity.md +202 -0
  22. data/docs/api/models/fact.md +270 -0
  23. data/docs/api/models/index.md +77 -0
  24. data/docs/api/pipeline/extraction.md +175 -0
  25. data/docs/api/pipeline/index.md +72 -0
  26. data/docs/api/pipeline/resolution.md +209 -0
  27. data/docs/api/services/content-service.md +166 -0
  28. data/docs/api/services/entity-service.md +202 -0
  29. data/docs/api/services/fact-service.md +223 -0
  30. data/docs/api/services/index.md +55 -0
  31. data/docs/architecture/database-schema.md +293 -0
  32. data/docs/architecture/entity-resolution.md +293 -0
  33. data/docs/architecture/index.md +149 -0
  34. data/docs/architecture/temporal-facts.md +268 -0
  35. data/docs/architecture/three-layer-model.md +242 -0
  36. data/docs/assets/css/custom.css +137 -0
  37. data/docs/assets/fact_db.jpg +0 -0
  38. data/docs/assets/images/fact_db.jpg +0 -0
  39. data/docs/concepts.md +183 -0
  40. data/docs/examples/basic-usage.md +235 -0
  41. data/docs/examples/hr-onboarding.md +312 -0
  42. data/docs/examples/index.md +64 -0
  43. data/docs/examples/news-analysis.md +288 -0
  44. data/docs/getting-started/database-setup.md +170 -0
  45. data/docs/getting-started/index.md +71 -0
  46. data/docs/getting-started/installation.md +98 -0
  47. data/docs/getting-started/quick-start.md +191 -0
  48. data/docs/guides/batch-processing.md +325 -0
  49. data/docs/guides/configuration.md +243 -0
  50. data/docs/guides/entity-management.md +364 -0
  51. data/docs/guides/extracting-facts.md +299 -0
  52. data/docs/guides/index.md +22 -0
  53. data/docs/guides/ingesting-content.md +252 -0
  54. data/docs/guides/llm-integration.md +299 -0
  55. data/docs/guides/temporal-queries.md +315 -0
  56. data/docs/index.md +121 -0
  57. data/examples/README.md +130 -0
  58. data/examples/basic_usage.rb +164 -0
  59. data/examples/entity_management.rb +216 -0
  60. data/examples/hr_system.rb +428 -0
  61. data/examples/rule_based_extraction.rb +258 -0
  62. data/examples/temporal_queries.rb +245 -0
  63. data/lib/fact_db/config.rb +71 -0
  64. data/lib/fact_db/database.rb +45 -0
  65. data/lib/fact_db/errors.rb +10 -0
  66. data/lib/fact_db/extractors/base.rb +117 -0
  67. data/lib/fact_db/extractors/llm_extractor.rb +179 -0
  68. data/lib/fact_db/extractors/manual_extractor.rb +53 -0
  69. data/lib/fact_db/extractors/rule_based_extractor.rb +228 -0
  70. data/lib/fact_db/llm/adapter.rb +109 -0
  71. data/lib/fact_db/models/content.rb +62 -0
  72. data/lib/fact_db/models/entity.rb +84 -0
  73. data/lib/fact_db/models/entity_alias.rb +26 -0
  74. data/lib/fact_db/models/entity_mention.rb +33 -0
  75. data/lib/fact_db/models/fact.rb +192 -0
  76. data/lib/fact_db/models/fact_source.rb +35 -0
  77. data/lib/fact_db/pipeline/extraction_pipeline.rb +146 -0
  78. data/lib/fact_db/pipeline/resolution_pipeline.rb +129 -0
  79. data/lib/fact_db/resolution/entity_resolver.rb +261 -0
  80. data/lib/fact_db/resolution/fact_resolver.rb +259 -0
  81. data/lib/fact_db/services/content_service.rb +93 -0
  82. data/lib/fact_db/services/entity_service.rb +150 -0
  83. data/lib/fact_db/services/fact_service.rb +193 -0
  84. data/lib/fact_db/temporal/query.rb +125 -0
  85. data/lib/fact_db/temporal/timeline.rb +134 -0
  86. data/lib/fact_db/version.rb +5 -0
  87. data/lib/fact_db.rb +141 -0
  88. data/mkdocs.yml +198 -0
  89. metadata +288 -0
@@ -0,0 +1,137 @@
1
+ /* FactDb Documentation Custom Styles */
2
+
3
+ /* Code blocks */
4
+ .highlight pre {
5
+ border-radius: 6px;
6
+ }
7
+
8
+ /* Tables */
9
+ .md-typeset table:not([class]) {
10
+ font-size: 0.85rem;
11
+ }
12
+
13
+ .md-typeset table:not([class]) th {
14
+ background-color: var(--md-primary-fg-color);
15
+ color: var(--md-primary-bg-color);
16
+ }
17
+
18
+ /* Admonitions */
19
+ .md-typeset .admonition {
20
+ border-radius: 6px;
21
+ }
22
+
23
+ /* Method signatures */
24
+ .md-typeset code {
25
+ border-radius: 4px;
26
+ }
27
+
28
+ /* Navigation */
29
+ .md-nav__link {
30
+ font-size: 0.85rem;
31
+ }
32
+
33
+ /* Mermaid diagrams */
34
+ .mermaid {
35
+ text-align: center;
36
+ margin: 1.5rem 0;
37
+ }
38
+
39
+ /* API reference styling */
40
+ .md-typeset h3 code {
41
+ background: transparent;
42
+ padding: 0;
43
+ }
44
+
45
+ /* Parameters table */
46
+ .md-typeset table:not([class]) td:first-child code {
47
+ white-space: nowrap;
48
+ }
49
+
50
+ /* Example blocks */
51
+ .md-typeset .example {
52
+ background-color: var(--md-code-bg-color);
53
+ border-left: 4px solid var(--md-accent-fg-color);
54
+ padding: 1rem;
55
+ margin: 1rem 0;
56
+ border-radius: 0 6px 6px 0;
57
+ }
58
+
59
+ /* Cards grid (for index pages) */
60
+ .grid.cards {
61
+ display: grid;
62
+ grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
63
+ gap: 1rem;
64
+ margin: 1rem 0;
65
+ }
66
+
67
+ /* Method dividers */
68
+ .md-typeset hr {
69
+ margin: 2rem 0;
70
+ border-top: 1px solid var(--md-default-fg-color--lightest);
71
+ }
72
+
73
+ /* Smaller font for long code blocks */
74
+ .md-typeset pre > code {
75
+ font-size: 0.8rem;
76
+ }
77
+
78
+ /* Timeline styling */
79
+ .timeline {
80
+ border-left: 2px solid var(--md-accent-fg-color);
81
+ padding-left: 1rem;
82
+ margin-left: 0.5rem;
83
+ }
84
+
85
+ .timeline-item {
86
+ position: relative;
87
+ padding-bottom: 1rem;
88
+ }
89
+
90
+ .timeline-item::before {
91
+ content: "";
92
+ position: absolute;
93
+ left: -1.4rem;
94
+ top: 0.5rem;
95
+ width: 10px;
96
+ height: 10px;
97
+ background: var(--md-accent-fg-color);
98
+ border-radius: 50%;
99
+ }
100
+
101
+ /* Status badges */
102
+ .status-canonical {
103
+ background: #4caf50;
104
+ color: white;
105
+ padding: 2px 8px;
106
+ border-radius: 4px;
107
+ font-size: 0.75rem;
108
+ }
109
+
110
+ .status-superseded {
111
+ background: #ff9800;
112
+ color: white;
113
+ padding: 2px 8px;
114
+ border-radius: 4px;
115
+ font-size: 0.75rem;
116
+ }
117
+
118
+ .status-corroborated {
119
+ background: #2196f3;
120
+ color: white;
121
+ padding: 2px 8px;
122
+ border-radius: 4px;
123
+ font-size: 0.75rem;
124
+ }
125
+
126
+ .status-synthesized {
127
+ background: #9c27b0;
128
+ color: white;
129
+ padding: 2px 8px;
130
+ border-radius: 4px;
131
+ font-size: 0.75rem;
132
+ }
133
+
134
+ /* Dark mode adjustments */
135
+ [data-md-color-scheme="slate"] .md-typeset table:not([class]) th {
136
+ background-color: var(--md-primary-fg-color--dark);
137
+ }
Binary file
Binary file
data/docs/concepts.md ADDED
@@ -0,0 +1,183 @@
1
+ # Core Concepts
2
+
3
+ FactDb is built around several key concepts that work together to provide temporal fact tracking with full provenance.
4
+
5
+ ## The Three-Layer Model
6
+
7
+ ### Content Layer
8
+
9
+ The content layer stores immutable source documents — emails, articles, reports, transcripts, or any other text that contains facts. Content is:
10
+
11
+ - **Immutable** - Once ingested, content never changes
12
+ - **Deduplicated** - Identified by content hash to prevent duplicates
13
+ - **Timestamped** - Records when the content was captured
14
+ - **Searchable** - Supports full-text and semantic search via embeddings
15
+
16
+ ### Entity Layer
17
+
18
+ Entities represent real-world things mentioned in content:
19
+
20
+ | Type | Examples |
21
+ |------|----------|
22
+ | `person` | Paula Chen, John Smith |
23
+ | `organization` | Microsoft, Acme Corp |
24
+ | `place` | San Francisco, Building A |
25
+ | `product` | Windows 11, iPhone |
26
+ | `event` | Q4 2024 Earnings Call |
27
+
28
+ Entities support:
29
+
30
+ - **Canonical Names** - The authoritative name for the entity
31
+ - **Aliases** - Alternative names and spellings
32
+ - **Resolution** - Matching mentions to entities via exact match, aliases, or fuzzy matching
33
+ - **Merging** - Combining duplicate entities when discovered
34
+
35
+ ### Fact Layer
36
+
37
+ Facts are temporal assertions about entities:
38
+
39
+ ```ruby
40
+ # A fact has:
41
+ # - fact_text: The assertion itself
42
+ # - valid_at: When the fact became true
43
+ # - invalid_at: When the fact stopped being true (nil if still valid)
44
+ # - status: canonical, superseded, corroborated, synthesized
45
+ # - entity_mentions: Links to entities mentioned in the fact
46
+ # - fact_sources: Links to source content
47
+ ```
48
+
49
+ ## Fact Lifecycle
50
+
51
+ Facts move through different statuses as information evolves:
52
+
53
+ ```mermaid
54
+ stateDiagram-v2
55
+ [*] --> canonical: New fact extracted
56
+ canonical --> corroborated: Multiple sources confirm
57
+ canonical --> superseded: New information replaces
58
+ canonical --> synthesized: Combined with other facts
59
+ superseded --> [*]: Archived
60
+ corroborated --> superseded: Later replaced
61
+
62
+ classDef blue fill:#1E40AF,stroke:#1E3A8A,color:#FFFFFF
63
+ classDef green fill:#047857,stroke:#065F46,color:#FFFFFF
64
+ classDef red fill:#B91C1C,stroke:#991B1B,color:#FFFFFF
65
+ classDef yellow fill:#B45309,stroke:#92400E,color:#FFFFFF
66
+
67
+ class canonical blue
68
+ class corroborated green
69
+ class superseded red
70
+ class synthesized yellow
71
+ ```
72
+
73
+ ### Canonical
74
+
75
+ A canonical fact is the current authoritative version:
76
+
77
+ ```ruby
78
+ fact = facts.fact_service.create(
79
+ "Paula is a Principal Engineer",
80
+ valid_at: Date.parse("2024-01-10"),
81
+ mentions: [{ entity: paula, role: "subject" }]
82
+ )
83
+ # fact.status => "canonical"
84
+ ```
85
+
86
+ ### Superseded
87
+
88
+ When information changes, the old fact is superseded:
89
+
90
+ ```ruby
91
+ # Paula gets promoted
92
+ new_fact = facts.fact_service.resolver.supersede(
93
+ fact.id,
94
+ "Paula is a Senior Principal Engineer",
95
+ valid_at: Date.parse("2024-06-01")
96
+ )
97
+ # old fact.status => "superseded"
98
+ # old fact.invalid_at => "2024-06-01"
99
+ # new_fact.status => "canonical"
100
+ ```
101
+
102
+ ### Corroborated
103
+
104
+ Facts confirmed by multiple sources gain higher confidence:
105
+
106
+ ```ruby
107
+ facts.fact_service.resolver.corroborate(fact.id, other_fact.id)
108
+ # After 2+ corroborations: fact.status => "corroborated"
109
+ ```
110
+
111
+ ### Synthesized
112
+
113
+ Synthesized facts are derived by combining information from multiple existing facts:
114
+
115
+ ```ruby
116
+ synthesized = facts.fact_service.resolver.synthesize(
117
+ [fact1.id, fact2.id, fact3.id],
118
+ "Paula worked at Microsoft from Jan 2024, promoted to Senior in June 2024",
119
+ valid_at: Date.parse("2024-01-10")
120
+ )
121
+ # synthesized.status => "synthesized"
122
+ # synthesized.derived_from_ids => [fact1.id, fact2.id, fact3.id]
123
+ ```
124
+
125
+ ## Temporal Queries
126
+
127
+ FactDb's core strength is the ability to query facts across time:
128
+
129
+ ```ruby
130
+ # What do we know now?
131
+ current_facts = facts.query_facts(entity: paula.id)
132
+
133
+ # What did we know on March 1st?
134
+ march_facts = facts.facts_at(Date.parse("2024-03-01"), entity: paula.id)
135
+
136
+ # What's the full timeline?
137
+ timeline = facts.timeline_for(paula.id, from: "2024-01-01", to: "2024-12-31")
138
+ ```
139
+
140
+ ## Entity Resolution
141
+
142
+ When extracting facts, mentions must be resolved to entities:
143
+
144
+ ```mermaid
145
+ graph LR
146
+ M1["'Paula'"] --> R{Resolver}
147
+ M2["'P. Chen'"] --> R
148
+ M3["'Paula Chen'"] --> R
149
+ R --> E[Paula Chen Entity]
150
+
151
+ style M1 fill:#1E40AF,stroke:#1E3A8A,color:#FFFFFF
152
+ style M2 fill:#1E40AF,stroke:#1E3A8A,color:#FFFFFF
153
+ style M3 fill:#1E40AF,stroke:#1E3A8A,color:#FFFFFF
154
+ style R fill:#B45309,stroke:#92400E,color:#FFFFFF
155
+ style E fill:#047857,stroke:#065F46,color:#FFFFFF
156
+ ```
157
+
158
+ Resolution strategies (in order):
159
+
160
+ 1. **Exact Match** - Direct canonical name match
161
+ 2. **Alias Match** - Match against registered aliases
162
+ 3. **Fuzzy Match** - Levenshtein distance for typos/variations
163
+ 4. **Create New** - If no match, optionally create new entity
164
+
165
+ ## Audit Trails
166
+
167
+ Every fact maintains links to the source content it was drawn from:
168
+
169
+ ```ruby
170
+ fact.fact_sources.each do |source|
171
+ puts "Source: #{source.content.title}"
172
+ puts "Type: #{source.source_type}" # primary, supporting, contradicting
173
+ puts "Excerpt: #{source.excerpt}"
174
+ puts "Confidence: #{source.confidence}"
175
+ end
176
+ ```
177
+
178
+ This enables:
179
+
180
+ - Tracing facts back to original documents
181
+ - Verifying information
182
+ - Understanding confidence levels
183
+ - Investigating contradictions
@@ -0,0 +1,235 @@
1
+ # Basic Usage
2
+
3
+ A simple introduction to FactDb's core functionality.
4
+
5
+ ## Setup
6
+
7
+ ```ruby
8
+ require 'fact_db'
9
+
10
+ # Configure
11
+ FactDb.configure do |config|
12
+ config.database_url = ENV['DATABASE_URL']
13
+ config.llm_provider = :openai
14
+ config.llm_api_key = ENV['OPENAI_API_KEY']
15
+ end
16
+
17
+ # Create facts instance
18
+ facts = FactDb.new
19
+ ```
20
+
21
+ ## Ingest Content
22
+
23
+ ```ruby
24
+ # Ingest an email
25
+ email = facts.ingest(
26
+ <<~TEXT,
27
+ Hi team,
28
+
29
+ I'm excited to announce that Paula Chen has accepted our offer
30
+ to join Microsoft as Principal Engineer starting January 10, 2024.
31
+
32
+ She'll be part of the Platform team reporting to Sarah Johnson.
33
+
34
+ Best,
35
+ HR
36
+ TEXT
37
+ type: :email,
38
+ title: "New Hire Announcement - Paula Chen",
39
+ captured_at: Time.current
40
+ )
41
+
42
+ puts "Ingested content ID: #{email.id}"
43
+ ```
44
+
45
+ ## Create Entities
46
+
47
+ ```ruby
48
+ # Create people
49
+ paula = facts.entity_service.create(
50
+ "Paula Chen",
51
+ type: :person,
52
+ aliases: ["Paula"]
53
+ )
54
+
55
+ sarah = facts.entity_service.create(
56
+ "Sarah Johnson",
57
+ type: :person,
58
+ aliases: ["Sarah"]
59
+ )
60
+
61
+ # Create organization
62
+ microsoft = facts.entity_service.create(
63
+ "Microsoft",
64
+ type: :organization,
65
+ aliases: ["MS", "MSFT"]
66
+ )
67
+
68
+ platform_team = facts.entity_service.create(
69
+ "Platform Team",
70
+ type: :organization
71
+ )
72
+
73
+ puts "Created entities: Paula, Sarah, Microsoft, Platform Team"
74
+ ```
75
+
76
+ ## Extract Facts Manually
77
+
78
+ ```ruby
79
+ # Create facts with explicit links
80
+ fact1 = facts.fact_service.create(
81
+ "Paula Chen joined Microsoft as Principal Engineer",
82
+ valid_at: Date.parse("2024-01-10"),
83
+ mentions: [
84
+ { entity: paula, role: "subject", text: "Paula Chen" },
85
+ { entity: microsoft, role: "organization", text: "Microsoft" }
86
+ ],
87
+ sources: [
88
+ { content: email, type: "primary" }
89
+ ]
90
+ )
91
+
92
+ fact2 = facts.fact_service.create(
93
+ "Paula Chen reports to Sarah Johnson",
94
+ valid_at: Date.parse("2024-01-10"),
95
+ mentions: [
96
+ { entity: paula, role: "subject", text: "Paula Chen" },
97
+ { entity: sarah, role: "object", text: "Sarah Johnson" }
98
+ ],
99
+ sources: [
100
+ { content: email, type: "primary" }
101
+ ]
102
+ )
103
+
104
+ fact3 = facts.fact_service.create(
105
+ "Paula Chen is on the Platform Team",
106
+ valid_at: Date.parse("2024-01-10"),
107
+ mentions: [
108
+ { entity: paula, role: "subject", text: "Paula Chen" },
109
+ { entity: platform_team, role: "organization", text: "Platform Team" }
110
+ ],
111
+ sources: [
112
+ { content: email, type: "primary" }
113
+ ]
114
+ )
115
+
116
+ puts "Created #{3} facts"
117
+ ```
118
+
119
+ ## Extract Facts with LLM
120
+
121
+ ```ruby
122
+ # Alternative: let LLM extract facts
123
+ extracted = facts.extract_facts(email.id, extractor: :llm)
124
+
125
+ puts "LLM extracted #{extracted.count} facts:"
126
+ extracted.each do |fact|
127
+ puts " - #{fact.fact_text}"
128
+ end
129
+ ```
130
+
131
+ ## Query Facts
132
+
133
+ ```ruby
134
+ # Current facts about Paula
135
+ puts "\nCurrent facts about Paula:"
136
+ facts.current_facts_for(paula.id).each do |fact|
137
+ puts " - #{fact.fact_text}"
138
+ end
139
+
140
+ # Facts about Microsoft
141
+ puts "\nFacts about Microsoft:"
142
+ facts.query_facts(entity: microsoft.id).each do |fact|
143
+ puts " - #{fact.fact_text}"
144
+ end
145
+ ```
146
+
147
+ ## Resolve Entity
148
+
149
+ ```ruby
150
+ # Resolve a name
151
+ resolved = facts.resolve_entity("Paula")
152
+ puts "\n'Paula' resolves to: #{resolved&.canonical_name}"
153
+
154
+ # Type-constrained resolution
155
+ person = facts.resolve_entity("Paula", type: :person)
156
+ puts "'Paula' as person: #{person&.canonical_name}"
157
+ ```
158
+
159
+ ## Update Facts (Supersession)
160
+
161
+ ```ruby
162
+ # Paula gets promoted
163
+ new_fact = facts.fact_service.resolver.supersede(
164
+ fact1.id,
165
+ "Paula Chen is Senior Principal Engineer at Microsoft",
166
+ valid_at: Date.parse("2024-06-01")
167
+ )
168
+
169
+ puts "\nSuperseded fact:"
170
+ puts " Old: #{fact1.reload.fact_text} (#{fact1.status})"
171
+ puts " New: #{new_fact.fact_text} (#{new_fact.status})"
172
+ ```
173
+
174
+ ## Timeline
175
+
176
+ ```ruby
177
+ # Build timeline
178
+ puts "\nPaula's timeline:"
179
+ facts.timeline_for(paula.id).each do |fact|
180
+ valid = fact.invalid_at ? "#{fact.valid_at} - #{fact.invalid_at}" : "#{fact.valid_at} - present"
181
+ puts " #{valid}: #{fact.fact_text}"
182
+ end
183
+ ```
184
+
185
+ ## Historical Query
186
+
187
+ ```ruby
188
+ # What did we know before promotion?
189
+ puts "\nFacts about Paula on March 1, 2024:"
190
+ facts.facts_at(Date.parse("2024-03-01"), entity: paula.id).each do |fact|
191
+ puts " - #{fact.fact_text}"
192
+ end
193
+
194
+ # What do we know after promotion?
195
+ puts "\nFacts about Paula on July 1, 2024:"
196
+ facts.facts_at(Date.parse("2024-07-01"), entity: paula.id).each do |fact|
197
+ puts " - #{fact.fact_text}"
198
+ end
199
+ ```
200
+
201
+ ## Complete Script
202
+
203
+ ```ruby
204
+ #!/usr/bin/env ruby
205
+ require 'fact_db'
206
+
207
+ # Setup
208
+ FactDb.configure do |config|
209
+ config.database_url = ENV['DATABASE_URL'] || 'postgresql://localhost/fact_db'
210
+ end
211
+
212
+ facts = FactDb.new
213
+
214
+ # Ingest
215
+ content = facts.ingest("Paula joined Microsoft on Jan 10, 2024", type: :note)
216
+
217
+ # Create entities
218
+ paula = facts.entity_service.create("Paula", type: :person)
219
+ microsoft = facts.entity_service.create("Microsoft", type: :organization)
220
+
221
+ # Create fact
222
+ fact = facts.fact_service.create(
223
+ "Paula joined Microsoft",
224
+ valid_at: Date.parse("2024-01-10"),
225
+ mentions: [
226
+ { entity: paula, role: "subject", text: "Paula" },
227
+ { entity: microsoft, role: "organization", text: "Microsoft" }
228
+ ],
229
+ sources: [{ content: content, type: "primary" }]
230
+ )
231
+
232
+ # Query
233
+ puts "Current facts about Paula:"
234
+ facts.current_facts_for(paula.id).each { |f| puts " - #{f.fact_text}" }
235
+ ```