fact_db 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. checksums.yaml +7 -0
  2. data/.envrc +1 -0
  3. data/CHANGELOG.md +48 -0
  4. data/COMMITS.md +196 -0
  5. data/README.md +102 -0
  6. data/Rakefile +41 -0
  7. data/db/migrate/001_enable_extensions.rb +7 -0
  8. data/db/migrate/002_create_contents.rb +44 -0
  9. data/db/migrate/003_create_entities.rb +36 -0
  10. data/db/migrate/004_create_entity_aliases.rb +18 -0
  11. data/db/migrate/005_create_facts.rb +65 -0
  12. data/db/migrate/006_create_entity_mentions.rb +18 -0
  13. data/db/migrate/007_create_fact_sources.rb +18 -0
  14. data/docs/api/extractors/index.md +71 -0
  15. data/docs/api/extractors/llm.md +162 -0
  16. data/docs/api/extractors/manual.md +92 -0
  17. data/docs/api/extractors/rule-based.md +165 -0
  18. data/docs/api/facts.md +300 -0
  19. data/docs/api/index.md +66 -0
  20. data/docs/api/models/content.md +165 -0
  21. data/docs/api/models/entity.md +202 -0
  22. data/docs/api/models/fact.md +270 -0
  23. data/docs/api/models/index.md +77 -0
  24. data/docs/api/pipeline/extraction.md +175 -0
  25. data/docs/api/pipeline/index.md +72 -0
  26. data/docs/api/pipeline/resolution.md +209 -0
  27. data/docs/api/services/content-service.md +166 -0
  28. data/docs/api/services/entity-service.md +202 -0
  29. data/docs/api/services/fact-service.md +223 -0
  30. data/docs/api/services/index.md +55 -0
  31. data/docs/architecture/database-schema.md +293 -0
  32. data/docs/architecture/entity-resolution.md +293 -0
  33. data/docs/architecture/index.md +149 -0
  34. data/docs/architecture/temporal-facts.md +268 -0
  35. data/docs/architecture/three-layer-model.md +242 -0
  36. data/docs/assets/css/custom.css +137 -0
  37. data/docs/assets/fact_db.jpg +0 -0
  38. data/docs/assets/images/fact_db.jpg +0 -0
  39. data/docs/concepts.md +183 -0
  40. data/docs/examples/basic-usage.md +235 -0
  41. data/docs/examples/hr-onboarding.md +312 -0
  42. data/docs/examples/index.md +64 -0
  43. data/docs/examples/news-analysis.md +288 -0
  44. data/docs/getting-started/database-setup.md +170 -0
  45. data/docs/getting-started/index.md +71 -0
  46. data/docs/getting-started/installation.md +98 -0
  47. data/docs/getting-started/quick-start.md +191 -0
  48. data/docs/guides/batch-processing.md +325 -0
  49. data/docs/guides/configuration.md +243 -0
  50. data/docs/guides/entity-management.md +364 -0
  51. data/docs/guides/extracting-facts.md +299 -0
  52. data/docs/guides/index.md +22 -0
  53. data/docs/guides/ingesting-content.md +252 -0
  54. data/docs/guides/llm-integration.md +299 -0
  55. data/docs/guides/temporal-queries.md +315 -0
  56. data/docs/index.md +121 -0
  57. data/examples/README.md +130 -0
  58. data/examples/basic_usage.rb +164 -0
  59. data/examples/entity_management.rb +216 -0
  60. data/examples/hr_system.rb +428 -0
  61. data/examples/rule_based_extraction.rb +258 -0
  62. data/examples/temporal_queries.rb +245 -0
  63. data/lib/fact_db/config.rb +71 -0
  64. data/lib/fact_db/database.rb +45 -0
  65. data/lib/fact_db/errors.rb +10 -0
  66. data/lib/fact_db/extractors/base.rb +117 -0
  67. data/lib/fact_db/extractors/llm_extractor.rb +179 -0
  68. data/lib/fact_db/extractors/manual_extractor.rb +53 -0
  69. data/lib/fact_db/extractors/rule_based_extractor.rb +228 -0
  70. data/lib/fact_db/llm/adapter.rb +109 -0
  71. data/lib/fact_db/models/content.rb +62 -0
  72. data/lib/fact_db/models/entity.rb +84 -0
  73. data/lib/fact_db/models/entity_alias.rb +26 -0
  74. data/lib/fact_db/models/entity_mention.rb +33 -0
  75. data/lib/fact_db/models/fact.rb +192 -0
  76. data/lib/fact_db/models/fact_source.rb +35 -0
  77. data/lib/fact_db/pipeline/extraction_pipeline.rb +146 -0
  78. data/lib/fact_db/pipeline/resolution_pipeline.rb +129 -0
  79. data/lib/fact_db/resolution/entity_resolver.rb +261 -0
  80. data/lib/fact_db/resolution/fact_resolver.rb +259 -0
  81. data/lib/fact_db/services/content_service.rb +93 -0
  82. data/lib/fact_db/services/entity_service.rb +150 -0
  83. data/lib/fact_db/services/fact_service.rb +193 -0
  84. data/lib/fact_db/temporal/query.rb +125 -0
  85. data/lib/fact_db/temporal/timeline.rb +134 -0
  86. data/lib/fact_db/version.rb +5 -0
  87. data/lib/fact_db.rb +141 -0
  88. data/mkdocs.yml +198 -0
  89. metadata +288 -0
@@ -0,0 +1,252 @@
1
+ # Ingesting Content
2
+
3
+ Content is the foundation of FactDb — immutable source documents from which facts are extracted.
4
+
5
+ ## Basic Ingestion
6
+
7
+ ```ruby
8
+ facts = FactDb.new
9
+
10
+ content = facts.ingest(
11
+ "Paula Chen joined Microsoft as Principal Engineer on January 10, 2024.",
12
+ type: :announcement
13
+ )
14
+ ```
15
+
16
+ ## Full Options
17
+
18
+ ```ruby
19
+ content = facts.ingest(
20
+ raw_text,
21
+ type: :email,
22
+ title: "RE: Offer Letter - Paula Chen",
23
+ source_uri: "mailto:hr@company.com/msg/12345",
24
+ captured_at: Time.parse("2024-01-08 10:30:00"),
25
+ metadata: {
26
+ from: "hr@company.com",
27
+ to: "hiring@company.com",
28
+ cc: ["manager@company.com"],
29
+ subject: "RE: Offer Letter - Paula Chen",
30
+ thread_id: "THR-12345"
31
+ }
32
+ )
33
+ ```
34
+
35
+ ## Content Types
36
+
37
+ Choose a type that best describes the source:
38
+
39
+ | Type | Use Case |
40
+ |------|----------|
41
+ | `:email` | Email messages |
42
+ | `:document` | General documents, PDFs |
43
+ | `:article` | News articles, blog posts |
44
+ | `:transcript` | Meeting transcripts, interviews |
45
+ | `:report` | Reports, analysis documents |
46
+ | `:announcement` | Official announcements |
47
+ | `:social` | Social media posts |
48
+ | `:form` | Structured forms, surveys |
49
+ | `:note` | Notes, memos |
50
+
51
+ ```ruby
52
+ # Custom types are also allowed
53
+ content = facts.ingest(text, type: :slack_message)
54
+ ```
55
+
56
+ ## Metadata
57
+
58
+ Store additional context in metadata:
59
+
60
+ ```ruby
61
+ # Email metadata
62
+ metadata: {
63
+ from: "sender@example.com",
64
+ to: "recipient@example.com",
65
+ subject: "Important Update",
66
+ message_id: "<abc123@mail.example.com>"
67
+ }
68
+
69
+ # Document metadata
70
+ metadata: {
71
+ author: "Jane Smith",
72
+ version: "2.1",
73
+ department: "Engineering",
74
+ classification: "internal"
75
+ }
76
+
77
+ # Article metadata
78
+ metadata: {
79
+ author: "John Doe",
80
+ publication: "Tech News",
81
+ url: "https://technews.com/article/123",
82
+ published_at: "2024-01-15T14:30:00Z"
83
+ }
84
+ ```
85
+
86
+ ## Deduplication
87
+
88
+ Content is automatically deduplicated by SHA-256 hash:
89
+
90
+ ```ruby
91
+ # First ingestion - creates new record
92
+ content1 = facts.ingest("Hello world", type: :note)
93
+
94
+ # Second ingestion - returns existing record
95
+ content2 = facts.ingest("Hello world", type: :note)
96
+
97
+ content1.id == content2.id # => true
98
+ ```
99
+
100
+ ## Timestamps
101
+
102
+ ### captured_at
103
+
104
+ When the content was captured/received (defaults to current time):
105
+
106
+ ```ruby
107
+ # Email received yesterday
108
+ content = facts.ingest(
109
+ email_body,
110
+ type: :email,
111
+ captured_at: Time.parse("2024-01-14 09:00:00")
112
+ )
113
+ ```
114
+
115
+ ### created_at
116
+
117
+ Automatically set when the record is created (system timestamp).
118
+
119
+ ## Batch Ingestion
120
+
121
+ For multiple documents:
122
+
123
+ ```ruby
124
+ documents = [
125
+ { text: "Doc 1 content", type: :document, title: "Doc 1" },
126
+ { text: "Doc 2 content", type: :document, title: "Doc 2" },
127
+ { text: "Doc 3 content", type: :document, title: "Doc 3" }
128
+ ]
129
+
130
+ contents = documents.map do |doc|
131
+ facts.ingest(doc[:text], type: doc[:type], title: doc[:title])
132
+ end
133
+ ```
134
+
135
+ ## Content Service
136
+
137
+ For advanced operations, use the content service directly:
138
+
139
+ ```ruby
140
+ # Create content
141
+ content = facts.content_service.create(
142
+ raw_text,
143
+ type: :document,
144
+ title: "Annual Report"
145
+ )
146
+
147
+ # Find by ID
148
+ content = facts.content_service.find(content_id)
149
+
150
+ # Find by hash
151
+ content = facts.content_service.find_by_hash(sha256_hash)
152
+
153
+ # Search by text
154
+ results = facts.content_service.search("quarterly earnings")
155
+
156
+ # Semantic search (requires embedding)
157
+ results = facts.content_service.semantic_search(
158
+ "financial performance",
159
+ limit: 10
160
+ )
161
+ ```
162
+
163
+ ## Embeddings
164
+
165
+ If you configure an embedding generator, content embeddings are created automatically:
166
+
167
+ ```ruby
168
+ FactDb.configure do |config|
169
+ config.embedding_generator = ->(text) {
170
+ # Your embedding logic
171
+ client.embeddings(input: text)
172
+ }
173
+ end
174
+
175
+ # Embeddings generated on ingest
176
+ content = facts.ingest(text, type: :document)
177
+ content.embedding # => [0.123, -0.456, ...]
178
+ ```
179
+
180
+ ## Source URIs
181
+
182
+ Track original locations with `source_uri`:
183
+
184
+ ```ruby
185
+ # Email
186
+ source_uri: "mailto:sender@example.com/msg/12345"
187
+
188
+ # Web page
189
+ source_uri: "https://example.com/articles/123"
190
+
191
+ # File
192
+ source_uri: "file:///path/to/document.pdf"
193
+
194
+ # Database record
195
+ source_uri: "db://crm/contacts/12345"
196
+
197
+ # API
198
+ source_uri: "api://salesforce/leads/ABC123"
199
+ ```
200
+
201
+ ## Best Practices
202
+
203
+ ### 1. Preserve Original Text
204
+
205
+ ```ruby
206
+ # Good - preserve original formatting
207
+ facts.ingest(original_email_body, type: :email)
208
+
209
+ # Avoid - don't pre-process
210
+ facts.ingest(cleaned_text.strip.downcase, type: :email)
211
+ ```
212
+
213
+ ### 2. Include Context in Metadata
214
+
215
+ ```ruby
216
+ content = facts.ingest(
217
+ transcript,
218
+ type: :transcript,
219
+ title: "Q4 2024 Earnings Call",
220
+ metadata: {
221
+ participants: ["CEO", "CFO", "Analysts"],
222
+ duration_minutes: 60,
223
+ recording_url: "https://..."
224
+ }
225
+ )
226
+ ```
227
+
228
+ ### 3. Use Consistent Types
229
+
230
+ ```ruby
231
+ # Define content types for your organization
232
+ module ContentTypes
233
+ EMAIL = :email
234
+ SLACK = :slack_message
235
+ MEETING = :meeting_transcript
236
+ # ...
237
+ end
238
+
239
+ facts.ingest(text, type: ContentTypes::EMAIL)
240
+ ```
241
+
242
+ ### 4. Track Source
243
+
244
+ ```ruby
245
+ # Always include source information for audit trails
246
+ content = facts.ingest(
247
+ text,
248
+ type: :document,
249
+ source_uri: "sharepoint://documents/annual-report-2024.pdf",
250
+ metadata: { uploaded_by: "jane@company.com" }
251
+ )
252
+ ```
@@ -0,0 +1,299 @@
1
+ # LLM Integration
2
+
3
+ FactDb integrates with multiple LLM providers via the `ruby_llm` gem for AI-powered fact extraction.
4
+
5
+ ## Setup
6
+
7
+ ### Install ruby_llm
8
+
9
+ Add to your Gemfile:
10
+
11
+ ```ruby
12
+ gem 'ruby_llm'
13
+ ```
14
+
15
+ ### Configure Provider
16
+
17
+ === "OpenAI"
18
+
19
+ ```ruby
20
+ FactDb.configure do |config|
21
+ config.llm_provider = :openai
22
+ config.llm_model = "gpt-4o-mini"
23
+ config.llm_api_key = ENV['OPENAI_API_KEY']
24
+ end
25
+ ```
26
+
27
+ === "Anthropic"
28
+
29
+ ```ruby
30
+ FactDb.configure do |config|
31
+ config.llm_provider = :anthropic
32
+ config.llm_model = "claude-sonnet-4-20250514"
33
+ config.llm_api_key = ENV['ANTHROPIC_API_KEY']
34
+ end
35
+ ```
36
+
37
+ === "Google Gemini"
38
+
39
+ ```ruby
40
+ FactDb.configure do |config|
41
+ config.llm_provider = :gemini
42
+ config.llm_model = "gemini-2.0-flash"
43
+ config.llm_api_key = ENV['GEMINI_API_KEY']
44
+ end
45
+ ```
46
+
47
+ === "Ollama (Local)"
48
+
49
+ ```ruby
50
+ FactDb.configure do |config|
51
+ config.llm_provider = :ollama
52
+ config.llm_model = "llama3.2"
53
+ end
54
+ ```
55
+
56
+ ## Supported Providers
57
+
58
+ | Provider | Models | Config Key |
59
+ |----------|--------|------------|
60
+ | OpenAI | gpt-4o, gpt-4o-mini, gpt-4-turbo | `OPENAI_API_KEY` |
61
+ | Anthropic | claude-sonnet-4, claude-3-haiku | `ANTHROPIC_API_KEY` |
62
+ | Google Gemini | gemini-2.0-flash, gemini-pro | `GEMINI_API_KEY` |
63
+ | Ollama | llama3.2, mistral, codellama | (local) |
64
+ | AWS Bedrock | claude-sonnet-4, titan | AWS credentials |
65
+ | OpenRouter | Various | `OPENROUTER_API_KEY` |
66
+
67
+ ## Default Models
68
+
69
+ If no model is specified, these defaults are used:
70
+
71
+ ```ruby
72
+ PROVIDER_DEFAULTS = {
73
+ openai: "gpt-4o-mini",
74
+ anthropic: "claude-sonnet-4-20250514",
75
+ gemini: "gemini-2.0-flash",
76
+ ollama: "llama3.2",
77
+ bedrock: "claude-sonnet-4",
78
+ openrouter: "anthropic/claude-sonnet-4"
79
+ }
80
+ ```
81
+
82
+ ## Using LLM Extraction
83
+
84
+ ```ruby
85
+ facts = FactDb.new
86
+
87
+ # Ingest content
88
+ content = facts.ingest(
89
+ "Paula Chen joined Microsoft as Principal Engineer on January 10, 2024. She previously worked at Google for 5 years.",
90
+ type: :announcement
91
+ )
92
+
93
+ # Extract facts using LLM
94
+ extracted = facts.extract_facts(content.id, extractor: :llm)
95
+
96
+ extracted.each do |fact|
97
+ puts "Fact: #{fact.fact_text}"
98
+ puts " Valid: #{fact.valid_at}"
99
+ puts " Confidence: #{fact.confidence}"
100
+ fact.entity_mentions.each do |m|
101
+ puts " Entity: #{m.entity.canonical_name} (#{m.mention_role})"
102
+ end
103
+ end
104
+ ```
105
+
106
+ ## Extraction Prompts
107
+
108
+ The LLM extractor uses carefully designed prompts to extract:
109
+
110
+ 1. **Facts** - Temporal assertions about entities
111
+ 2. **Entities** - People, organizations, places mentioned
112
+ 3. **Dates** - When facts became valid
113
+ 4. **Relationships** - How entities relate to facts
114
+
115
+ ### Example Prompt Structure
116
+
117
+ ```
118
+ Extract temporal facts from this content. For each fact:
119
+ 1. Identify the assertion (what is being stated)
120
+ 2. Identify entities mentioned (people, organizations, places)
121
+ 3. Determine when the fact became valid
122
+ 4. Assess confidence level
123
+
124
+ Content:
125
+ {content.raw_text}
126
+
127
+ Return JSON:
128
+ {
129
+ "facts": [
130
+ {
131
+ "text": "...",
132
+ "valid_at": "YYYY-MM-DD",
133
+ "entities": [
134
+ {"name": "...", "type": "person|organization|place", "role": "subject|object|..."}
135
+ ],
136
+ "confidence": 0.0-1.0
137
+ }
138
+ ]
139
+ }
140
+ ```
141
+
142
+ ## Custom LLM Client
143
+
144
+ Provide a pre-configured client:
145
+
146
+ ```ruby
147
+ # Create custom adapter
148
+ adapter = FactDb::LLM::Adapter.new(
149
+ provider: :openai,
150
+ model: "gpt-4o",
151
+ api_key: ENV['OPENAI_API_KEY']
152
+ )
153
+
154
+ FactDb.configure do |config|
155
+ config.llm_client = adapter
156
+ end
157
+ ```
158
+
159
+ ## Direct LLM Usage
160
+
161
+ Use the adapter directly:
162
+
163
+ ```ruby
164
+ adapter = FactDb::LLM::Adapter.new(
165
+ provider: :anthropic,
166
+ model: "claude-sonnet-4-20250514"
167
+ )
168
+
169
+ response = adapter.chat("Extract facts from: Paula joined Microsoft on Jan 10, 2024")
170
+ puts response
171
+ ```
172
+
173
+ ## Error Handling
174
+
175
+ ```ruby
176
+ begin
177
+ extracted = facts.extract_facts(content.id, extractor: :llm)
178
+ rescue FactDb::ConfigurationError => e
179
+ # LLM not configured or ruby_llm missing
180
+ puts "LLM Error: #{e.message}"
181
+ # Fall back to rule-based
182
+ extracted = facts.extract_facts(content.id, extractor: :rule_based)
183
+ rescue StandardError => e
184
+ # API error, rate limit, etc.
185
+ puts "Extraction failed: #{e.message}"
186
+ end
187
+ ```
188
+
189
+ ## Batch Processing with LLM
190
+
191
+ Process multiple documents efficiently:
192
+
193
+ ```ruby
194
+ content_ids = [content1.id, content2.id, content3.id]
195
+
196
+ # Parallel processing (uses simple_flow pipeline)
197
+ results = facts.batch_extract(content_ids, extractor: :llm, parallel: true)
198
+
199
+ results.each do |result|
200
+ if result[:error]
201
+ puts "Error for #{result[:content_id]}: #{result[:error]}"
202
+ else
203
+ puts "Extracted #{result[:facts].count} facts from #{result[:content_id]}"
204
+ end
205
+ end
206
+ ```
207
+
208
+ ## Cost Optimization
209
+
210
+ ### Use Appropriate Models
211
+
212
+ ```ruby
213
+ # For simple extractions, use smaller models
214
+ config.llm_model = "gpt-4o-mini" # Cheaper than gpt-4o
215
+
216
+ # For complex documents, use larger models
217
+ config.llm_model = "gpt-4o"
218
+ ```
219
+
220
+ ### Batch Processing
221
+
222
+ ```ruby
223
+ # Process in batches to reduce API calls
224
+ content_ids.each_slice(10) do |batch|
225
+ facts.batch_extract(batch, extractor: :llm)
226
+ sleep(1) # Rate limiting
227
+ end
228
+ ```
229
+
230
+ ### Local Models
231
+
232
+ ```ruby
233
+ # Use Ollama for development/testing
234
+ FactDb.configure do |config|
235
+ config.llm_provider = :ollama
236
+ config.llm_model = "llama3.2"
237
+ end
238
+ ```
239
+
240
+ ## Testing
241
+
242
+ Mock LLM responses in tests:
243
+
244
+ ```ruby
245
+ class MockLLMClient
246
+ def chat(prompt)
247
+ # Return predictable test data
248
+ '{"facts": [{"text": "Test fact", "valid_at": "2024-01-01", "entities": [], "confidence": 0.9}]}'
249
+ end
250
+ end
251
+
252
+ FactDb.configure do |config|
253
+ config.llm_client = MockLLMClient.new
254
+ end
255
+ ```
256
+
257
+ ## Best Practices
258
+
259
+ ### 1. Validate Extractions
260
+
261
+ ```ruby
262
+ extracted = facts.extract_facts(content.id, extractor: :llm)
263
+
264
+ extracted.each do |fact|
265
+ # Flag low-confidence extractions
266
+ if fact.confidence < 0.7
267
+ fact.update!(metadata: { needs_review: true })
268
+ end
269
+ end
270
+ ```
271
+
272
+ ### 2. Use Caching
273
+
274
+ ```ruby
275
+ # Cache LLM responses for repeated content
276
+ cache_key = "llm_extraction:#{content.content_hash}"
277
+ extracted = Rails.cache.fetch(cache_key) do
278
+ facts.extract_facts(content.id, extractor: :llm)
279
+ end
280
+ ```
281
+
282
+ ### 3. Handle Rate Limits
283
+
284
+ ```ruby
285
+ require 'retryable'
286
+
287
+ Retryable.retryable(tries: 3, sleep: 5) do
288
+ facts.extract_facts(content.id, extractor: :llm)
289
+ end
290
+ ```
291
+
292
+ ### 4. Monitor Usage
293
+
294
+ ```ruby
295
+ # Track extraction statistics
296
+ extracted = facts.extract_facts(content.id, extractor: :llm)
297
+ StatsD.increment('fact_db.llm_extractions')
298
+ StatsD.histogram('fact_db.facts_per_content', extracted.count)
299
+ ```