fact_db 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.envrc +1 -0
- data/CHANGELOG.md +48 -0
- data/COMMITS.md +196 -0
- data/README.md +102 -0
- data/Rakefile +41 -0
- data/db/migrate/001_enable_extensions.rb +7 -0
- data/db/migrate/002_create_contents.rb +44 -0
- data/db/migrate/003_create_entities.rb +36 -0
- data/db/migrate/004_create_entity_aliases.rb +18 -0
- data/db/migrate/005_create_facts.rb +65 -0
- data/db/migrate/006_create_entity_mentions.rb +18 -0
- data/db/migrate/007_create_fact_sources.rb +18 -0
- data/docs/api/extractors/index.md +71 -0
- data/docs/api/extractors/llm.md +162 -0
- data/docs/api/extractors/manual.md +92 -0
- data/docs/api/extractors/rule-based.md +165 -0
- data/docs/api/facts.md +300 -0
- data/docs/api/index.md +66 -0
- data/docs/api/models/content.md +165 -0
- data/docs/api/models/entity.md +202 -0
- data/docs/api/models/fact.md +270 -0
- data/docs/api/models/index.md +77 -0
- data/docs/api/pipeline/extraction.md +175 -0
- data/docs/api/pipeline/index.md +72 -0
- data/docs/api/pipeline/resolution.md +209 -0
- data/docs/api/services/content-service.md +166 -0
- data/docs/api/services/entity-service.md +202 -0
- data/docs/api/services/fact-service.md +223 -0
- data/docs/api/services/index.md +55 -0
- data/docs/architecture/database-schema.md +293 -0
- data/docs/architecture/entity-resolution.md +293 -0
- data/docs/architecture/index.md +149 -0
- data/docs/architecture/temporal-facts.md +268 -0
- data/docs/architecture/three-layer-model.md +242 -0
- data/docs/assets/css/custom.css +137 -0
- data/docs/assets/fact_db.jpg +0 -0
- data/docs/assets/images/fact_db.jpg +0 -0
- data/docs/concepts.md +183 -0
- data/docs/examples/basic-usage.md +235 -0
- data/docs/examples/hr-onboarding.md +312 -0
- data/docs/examples/index.md +64 -0
- data/docs/examples/news-analysis.md +288 -0
- data/docs/getting-started/database-setup.md +170 -0
- data/docs/getting-started/index.md +71 -0
- data/docs/getting-started/installation.md +98 -0
- data/docs/getting-started/quick-start.md +191 -0
- data/docs/guides/batch-processing.md +325 -0
- data/docs/guides/configuration.md +243 -0
- data/docs/guides/entity-management.md +364 -0
- data/docs/guides/extracting-facts.md +299 -0
- data/docs/guides/index.md +22 -0
- data/docs/guides/ingesting-content.md +252 -0
- data/docs/guides/llm-integration.md +299 -0
- data/docs/guides/temporal-queries.md +315 -0
- data/docs/index.md +121 -0
- data/examples/README.md +130 -0
- data/examples/basic_usage.rb +164 -0
- data/examples/entity_management.rb +216 -0
- data/examples/hr_system.rb +428 -0
- data/examples/rule_based_extraction.rb +258 -0
- data/examples/temporal_queries.rb +245 -0
- data/lib/fact_db/config.rb +71 -0
- data/lib/fact_db/database.rb +45 -0
- data/lib/fact_db/errors.rb +10 -0
- data/lib/fact_db/extractors/base.rb +117 -0
- data/lib/fact_db/extractors/llm_extractor.rb +179 -0
- data/lib/fact_db/extractors/manual_extractor.rb +53 -0
- data/lib/fact_db/extractors/rule_based_extractor.rb +228 -0
- data/lib/fact_db/llm/adapter.rb +109 -0
- data/lib/fact_db/models/content.rb +62 -0
- data/lib/fact_db/models/entity.rb +84 -0
- data/lib/fact_db/models/entity_alias.rb +26 -0
- data/lib/fact_db/models/entity_mention.rb +33 -0
- data/lib/fact_db/models/fact.rb +192 -0
- data/lib/fact_db/models/fact_source.rb +35 -0
- data/lib/fact_db/pipeline/extraction_pipeline.rb +146 -0
- data/lib/fact_db/pipeline/resolution_pipeline.rb +129 -0
- data/lib/fact_db/resolution/entity_resolver.rb +261 -0
- data/lib/fact_db/resolution/fact_resolver.rb +259 -0
- data/lib/fact_db/services/content_service.rb +93 -0
- data/lib/fact_db/services/entity_service.rb +150 -0
- data/lib/fact_db/services/fact_service.rb +193 -0
- data/lib/fact_db/temporal/query.rb +125 -0
- data/lib/fact_db/temporal/timeline.rb +134 -0
- data/lib/fact_db/version.rb +5 -0
- data/lib/fact_db.rb +141 -0
- data/mkdocs.yml +198 -0
- metadata +288 -0
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
# Ingesting Content
|
|
2
|
+
|
|
3
|
+
Content records are the foundation of FactDb — immutable source documents from which facts are extracted.
|
|
4
|
+
|
|
5
|
+
## Basic Ingestion
|
|
6
|
+
|
|
7
|
+
```ruby
|
|
8
|
+
facts = FactDb.new
|
|
9
|
+
|
|
10
|
+
content = facts.ingest(
|
|
11
|
+
"Paula Chen joined Microsoft as Principal Engineer on January 10, 2024.",
|
|
12
|
+
type: :announcement
|
|
13
|
+
)
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Full Options
|
|
17
|
+
|
|
18
|
+
```ruby
|
|
19
|
+
content = facts.ingest(
|
|
20
|
+
raw_text,
|
|
21
|
+
type: :email,
|
|
22
|
+
title: "RE: Offer Letter - Paula Chen",
|
|
23
|
+
source_uri: "mailto:hr@company.com/msg/12345",
|
|
24
|
+
captured_at: Time.parse("2024-01-08 10:30:00"),
|
|
25
|
+
metadata: {
|
|
26
|
+
from: "hr@company.com",
|
|
27
|
+
to: "hiring@company.com",
|
|
28
|
+
cc: ["manager@company.com"],
|
|
29
|
+
subject: "RE: Offer Letter - Paula Chen",
|
|
30
|
+
thread_id: "THR-12345"
|
|
31
|
+
}
|
|
32
|
+
)
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Content Types
|
|
36
|
+
|
|
37
|
+
Choose a type that best describes the source:
|
|
38
|
+
|
|
39
|
+
| Type | Use Case |
|
|
40
|
+
|------|----------|
|
|
41
|
+
| `:email` | Email messages |
|
|
42
|
+
| `:document` | General documents, PDFs |
|
|
43
|
+
| `:article` | News articles, blog posts |
|
|
44
|
+
| `:transcript` | Meeting transcripts, interviews |
|
|
45
|
+
| `:report` | Reports, analysis documents |
|
|
46
|
+
| `:announcement` | Official announcements |
|
|
47
|
+
| `:social` | Social media posts |
|
|
48
|
+
| `:form` | Structured forms, surveys |
|
|
49
|
+
| `:note` | Notes, memos |
|
|
50
|
+
|
|
51
|
+
```ruby
|
|
52
|
+
# Custom types are also allowed
|
|
53
|
+
content = facts.ingest(text, type: :slack_message)
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Metadata
|
|
57
|
+
|
|
58
|
+
Store additional context in metadata:
|
|
59
|
+
|
|
60
|
+
```ruby
|
|
61
|
+
# Email metadata
|
|
62
|
+
metadata: {
|
|
63
|
+
from: "sender@example.com",
|
|
64
|
+
to: "recipient@example.com",
|
|
65
|
+
subject: "Important Update",
|
|
66
|
+
message_id: "<abc123@mail.example.com>"
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
# Document metadata
|
|
70
|
+
metadata: {
|
|
71
|
+
author: "Jane Smith",
|
|
72
|
+
version: "2.1",
|
|
73
|
+
department: "Engineering",
|
|
74
|
+
classification: "internal"
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
# Article metadata
|
|
78
|
+
metadata: {
|
|
79
|
+
author: "John Doe",
|
|
80
|
+
publication: "Tech News",
|
|
81
|
+
url: "https://technews.com/article/123",
|
|
82
|
+
published_at: "2024-01-15T14:30:00Z"
|
|
83
|
+
}
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## Deduplication
|
|
87
|
+
|
|
88
|
+
Content is automatically deduplicated by its SHA-256 hash:
|
|
89
|
+
|
|
90
|
+
```ruby
|
|
91
|
+
# First ingestion - creates new record
|
|
92
|
+
content1 = facts.ingest("Hello world", type: :note)
|
|
93
|
+
|
|
94
|
+
# Second ingestion - returns existing record
|
|
95
|
+
content2 = facts.ingest("Hello world", type: :note)
|
|
96
|
+
|
|
97
|
+
content1.id == content2.id # => true
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## Timestamps
|
|
101
|
+
|
|
102
|
+
### captured_at
|
|
103
|
+
|
|
104
|
+
When the content was captured/received (defaults to current time):
|
|
105
|
+
|
|
106
|
+
```ruby
|
|
107
|
+
# Email received yesterday
|
|
108
|
+
content = facts.ingest(
|
|
109
|
+
email_body,
|
|
110
|
+
type: :email,
|
|
111
|
+
captured_at: Time.parse("2024-01-14 09:00:00")
|
|
112
|
+
)
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
### created_at
|
|
116
|
+
|
|
117
|
+
Automatically set when the record is created (system timestamp).
|
|
118
|
+
|
|
119
|
+
## Batch Ingestion
|
|
120
|
+
|
|
121
|
+
To ingest multiple documents at once:
|
|
122
|
+
|
|
123
|
+
```ruby
|
|
124
|
+
documents = [
|
|
125
|
+
{ text: "Doc 1 content", type: :document, title: "Doc 1" },
|
|
126
|
+
{ text: "Doc 2 content", type: :document, title: "Doc 2" },
|
|
127
|
+
{ text: "Doc 3 content", type: :document, title: "Doc 3" }
|
|
128
|
+
]
|
|
129
|
+
|
|
130
|
+
contents = documents.map do |doc|
|
|
131
|
+
facts.ingest(doc[:text], type: doc[:type], title: doc[:title])
|
|
132
|
+
end
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
## Content Service
|
|
136
|
+
|
|
137
|
+
For advanced operations, use the content service directly:
|
|
138
|
+
|
|
139
|
+
```ruby
|
|
140
|
+
# Create content
|
|
141
|
+
content = facts.content_service.create(
|
|
142
|
+
raw_text,
|
|
143
|
+
type: :document,
|
|
144
|
+
title: "Annual Report"
|
|
145
|
+
)
|
|
146
|
+
|
|
147
|
+
# Find by ID
|
|
148
|
+
content = facts.content_service.find(content_id)
|
|
149
|
+
|
|
150
|
+
# Find by hash
|
|
151
|
+
content = facts.content_service.find_by_hash(sha256_hash)
|
|
152
|
+
|
|
153
|
+
# Search by text
|
|
154
|
+
results = facts.content_service.search("quarterly earnings")
|
|
155
|
+
|
|
156
|
+
# Semantic search (requires embedding)
|
|
157
|
+
results = facts.content_service.semantic_search(
|
|
158
|
+
"financial performance",
|
|
159
|
+
limit: 10
|
|
160
|
+
)
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
## Embeddings
|
|
164
|
+
|
|
165
|
+
If you configure an embedding generator, content embeddings are created automatically:
|
|
166
|
+
|
|
167
|
+
```ruby
|
|
168
|
+
FactDb.configure do |config|
|
|
169
|
+
config.embedding_generator = ->(text) {
|
|
170
|
+
# Your embedding logic
|
|
171
|
+
client.embeddings(input: text)
|
|
172
|
+
}
|
|
173
|
+
end
|
|
174
|
+
|
|
175
|
+
# Embeddings generated on ingest
|
|
176
|
+
content = facts.ingest(text, type: :document)
|
|
177
|
+
content.embedding # => [0.123, -0.456, ...]
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
## Source URIs
|
|
181
|
+
|
|
182
|
+
Track original locations with source_uri:
|
|
183
|
+
|
|
184
|
+
```ruby
|
|
185
|
+
# Email
|
|
186
|
+
source_uri: "mailto:sender@example.com/msg/12345"
|
|
187
|
+
|
|
188
|
+
# Web page
|
|
189
|
+
source_uri: "https://example.com/articles/123"
|
|
190
|
+
|
|
191
|
+
# File
|
|
192
|
+
source_uri: "file:///path/to/document.pdf"
|
|
193
|
+
|
|
194
|
+
# Database record
|
|
195
|
+
source_uri: "db://crm/contacts/12345"
|
|
196
|
+
|
|
197
|
+
# API
|
|
198
|
+
source_uri: "api://salesforce/leads/ABC123"
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
## Best Practices
|
|
202
|
+
|
|
203
|
+
### 1. Preserve Original Text
|
|
204
|
+
|
|
205
|
+
```ruby
|
|
206
|
+
# Good - preserve original formatting
|
|
207
|
+
facts.ingest(original_email_body, type: :email)
|
|
208
|
+
|
|
209
|
+
# Avoid - don't pre-process
|
|
210
|
+
facts.ingest(cleaned_text.strip.downcase, type: :email)
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
### 2. Include Context in Metadata
|
|
214
|
+
|
|
215
|
+
```ruby
|
|
216
|
+
content = facts.ingest(
|
|
217
|
+
transcript,
|
|
218
|
+
type: :transcript,
|
|
219
|
+
title: "Q4 2024 Earnings Call",
|
|
220
|
+
metadata: {
|
|
221
|
+
participants: ["CEO", "CFO", "Analysts"],
|
|
222
|
+
duration_minutes: 60,
|
|
223
|
+
recording_url: "https://..."
|
|
224
|
+
}
|
|
225
|
+
)
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
### 3. Use Consistent Types
|
|
229
|
+
|
|
230
|
+
```ruby
|
|
231
|
+
# Define content types for your organization
|
|
232
|
+
module ContentTypes
|
|
233
|
+
EMAIL = :email
|
|
234
|
+
SLACK = :slack_message
|
|
235
|
+
MEETING = :meeting_transcript
|
|
236
|
+
# ...
|
|
237
|
+
end
|
|
238
|
+
|
|
239
|
+
facts.ingest(text, type: ContentTypes::EMAIL)
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
### 4. Track Source
|
|
243
|
+
|
|
244
|
+
```ruby
|
|
245
|
+
# Always include source information for audit trails
|
|
246
|
+
content = facts.ingest(
|
|
247
|
+
text,
|
|
248
|
+
type: :document,
|
|
249
|
+
source_uri: "sharepoint://documents/annual-report-2024.pdf",
|
|
250
|
+
metadata: { uploaded_by: "jane@company.com" }
|
|
251
|
+
)
|
|
252
|
+
```
|
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
# LLM Integration
|
|
2
|
+
|
|
3
|
+
FactDb integrates with multiple LLM providers via the `ruby_llm` gem for AI-powered fact extraction.
|
|
4
|
+
|
|
5
|
+
## Setup
|
|
6
|
+
|
|
7
|
+
### Install ruby_llm
|
|
8
|
+
|
|
9
|
+
Add to your Gemfile:
|
|
10
|
+
|
|
11
|
+
```ruby
|
|
12
|
+
gem 'ruby_llm'
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
### Configure Provider
|
|
16
|
+
|
|
17
|
+
=== "OpenAI"
|
|
18
|
+
|
|
19
|
+
```ruby
|
|
20
|
+
FactDb.configure do |config|
|
|
21
|
+
config.llm_provider = :openai
|
|
22
|
+
config.llm_model = "gpt-4o-mini"
|
|
23
|
+
config.llm_api_key = ENV['OPENAI_API_KEY']
|
|
24
|
+
end
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
=== "Anthropic"
|
|
28
|
+
|
|
29
|
+
```ruby
|
|
30
|
+
FactDb.configure do |config|
|
|
31
|
+
config.llm_provider = :anthropic
|
|
32
|
+
config.llm_model = "claude-sonnet-4-20250514"
|
|
33
|
+
config.llm_api_key = ENV['ANTHROPIC_API_KEY']
|
|
34
|
+
end
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
=== "Google Gemini"
|
|
38
|
+
|
|
39
|
+
```ruby
|
|
40
|
+
FactDb.configure do |config|
|
|
41
|
+
config.llm_provider = :gemini
|
|
42
|
+
config.llm_model = "gemini-2.0-flash"
|
|
43
|
+
config.llm_api_key = ENV['GEMINI_API_KEY']
|
|
44
|
+
end
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
=== "Ollama (Local)"
|
|
48
|
+
|
|
49
|
+
```ruby
|
|
50
|
+
FactDb.configure do |config|
|
|
51
|
+
config.llm_provider = :ollama
|
|
52
|
+
config.llm_model = "llama3.2"
|
|
53
|
+
end
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Supported Providers
|
|
57
|
+
|
|
58
|
+
| Provider | Models | Config Key |
|
|
59
|
+
|----------|--------|------------|
|
|
60
|
+
| OpenAI | gpt-4o, gpt-4o-mini, gpt-4-turbo | `OPENAI_API_KEY` |
|
|
61
|
+
| Anthropic | claude-sonnet-4, claude-3-haiku | `ANTHROPIC_API_KEY` |
|
|
62
|
+
| Google Gemini | gemini-2.0-flash, gemini-pro | `GEMINI_API_KEY` |
|
|
63
|
+
| Ollama | llama3.2, mistral, codellama | (local) |
|
|
64
|
+
| AWS Bedrock | claude-sonnet-4, titan | AWS credentials |
|
|
65
|
+
| OpenRouter | Various | `OPENROUTER_API_KEY` |
|
|
66
|
+
|
|
67
|
+
## Default Models
|
|
68
|
+
|
|
69
|
+
If no model is specified, these defaults are used:
|
|
70
|
+
|
|
71
|
+
```ruby
|
|
72
|
+
PROVIDER_DEFAULTS = {
|
|
73
|
+
openai: "gpt-4o-mini",
|
|
74
|
+
anthropic: "claude-sonnet-4-20250514",
|
|
75
|
+
gemini: "gemini-2.0-flash",
|
|
76
|
+
ollama: "llama3.2",
|
|
77
|
+
bedrock: "claude-sonnet-4",
|
|
78
|
+
openrouter: "anthropic/claude-sonnet-4"
|
|
79
|
+
}
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Using LLM Extraction
|
|
83
|
+
|
|
84
|
+
```ruby
|
|
85
|
+
facts = FactDb.new
|
|
86
|
+
|
|
87
|
+
# Ingest content
|
|
88
|
+
content = facts.ingest(
|
|
89
|
+
"Paula Chen joined Microsoft as Principal Engineer on January 10, 2024. She previously worked at Google for 5 years.",
|
|
90
|
+
type: :announcement
|
|
91
|
+
)
|
|
92
|
+
|
|
93
|
+
# Extract facts using LLM
|
|
94
|
+
extracted = facts.extract_facts(content.id, extractor: :llm)
|
|
95
|
+
|
|
96
|
+
extracted.each do |fact|
|
|
97
|
+
puts "Fact: #{fact.fact_text}"
|
|
98
|
+
puts " Valid: #{fact.valid_at}"
|
|
99
|
+
puts " Confidence: #{fact.confidence}"
|
|
100
|
+
fact.entity_mentions.each do |m|
|
|
101
|
+
puts " Entity: #{m.entity.canonical_name} (#{m.mention_role})"
|
|
102
|
+
end
|
|
103
|
+
end
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
## Extraction Prompts
|
|
107
|
+
|
|
108
|
+
The LLM extractor uses carefully designed prompts to extract:
|
|
109
|
+
|
|
110
|
+
1. **Facts** - Temporal assertions about entities
|
|
111
|
+
2. **Entities** - People, organizations, places mentioned
|
|
112
|
+
3. **Dates** - When facts became valid
|
|
113
|
+
4. **Relationships** - How entities relate to facts
|
|
114
|
+
|
|
115
|
+
### Example Prompt Structure
|
|
116
|
+
|
|
117
|
+
```
|
|
118
|
+
Extract temporal facts from this content. For each fact:
|
|
119
|
+
1. Identify the assertion (what is being stated)
|
|
120
|
+
2. Identify entities mentioned (people, organizations, places)
|
|
121
|
+
3. Determine when the fact became valid
|
|
122
|
+
4. Assess confidence level
|
|
123
|
+
|
|
124
|
+
Content:
|
|
125
|
+
{content.raw_text}
|
|
126
|
+
|
|
127
|
+
Return JSON:
|
|
128
|
+
{
|
|
129
|
+
"facts": [
|
|
130
|
+
{
|
|
131
|
+
"text": "...",
|
|
132
|
+
"valid_at": "YYYY-MM-DD",
|
|
133
|
+
"entities": [
|
|
134
|
+
{"name": "...", "type": "person|organization|place", "role": "subject|object|..."}
|
|
135
|
+
],
|
|
136
|
+
"confidence": 0.0-1.0
|
|
137
|
+
}
|
|
138
|
+
]
|
|
139
|
+
}
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## Custom LLM Client
|
|
143
|
+
|
|
144
|
+
Provide a pre-configured client:
|
|
145
|
+
|
|
146
|
+
```ruby
|
|
147
|
+
# Create custom adapter
|
|
148
|
+
adapter = FactDb::LLM::Adapter.new(
|
|
149
|
+
provider: :openai,
|
|
150
|
+
model: "gpt-4o",
|
|
151
|
+
api_key: ENV['OPENAI_API_KEY']
|
|
152
|
+
)
|
|
153
|
+
|
|
154
|
+
FactDb.configure do |config|
|
|
155
|
+
config.llm_client = adapter
|
|
156
|
+
end
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
## Direct LLM Usage
|
|
160
|
+
|
|
161
|
+
Use the adapter directly:
|
|
162
|
+
|
|
163
|
+
```ruby
|
|
164
|
+
adapter = FactDb::LLM::Adapter.new(
|
|
165
|
+
provider: :anthropic,
|
|
166
|
+
model: "claude-sonnet-4-20250514"
|
|
167
|
+
)
|
|
168
|
+
|
|
169
|
+
response = adapter.chat("Extract facts from: Paula joined Microsoft on Jan 10, 2024")
|
|
170
|
+
puts response
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
## Error Handling
|
|
174
|
+
|
|
175
|
+
```ruby
|
|
176
|
+
begin
|
|
177
|
+
extracted = facts.extract_facts(content.id, extractor: :llm)
|
|
178
|
+
rescue FactDb::ConfigurationError => e
|
|
179
|
+
# LLM not configured or ruby_llm missing
|
|
180
|
+
puts "LLM Error: #{e.message}"
|
|
181
|
+
# Fall back to rule-based
|
|
182
|
+
extracted = facts.extract_facts(content.id, extractor: :rule_based)
|
|
183
|
+
rescue StandardError => e
|
|
184
|
+
# API error, rate limit, etc.
|
|
185
|
+
puts "Extraction failed: #{e.message}"
|
|
186
|
+
end
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
## Batch Processing with LLM
|
|
190
|
+
|
|
191
|
+
Process multiple documents efficiently:
|
|
192
|
+
|
|
193
|
+
```ruby
|
|
194
|
+
content_ids = [content1.id, content2.id, content3.id]
|
|
195
|
+
|
|
196
|
+
# Parallel processing (uses simple_flow pipeline)
|
|
197
|
+
results = facts.batch_extract(content_ids, extractor: :llm, parallel: true)
|
|
198
|
+
|
|
199
|
+
results.each do |result|
|
|
200
|
+
if result[:error]
|
|
201
|
+
puts "Error for #{result[:content_id]}: #{result[:error]}"
|
|
202
|
+
else
|
|
203
|
+
puts "Extracted #{result[:facts].count} facts from #{result[:content_id]}"
|
|
204
|
+
end
|
|
205
|
+
end
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
## Cost Optimization
|
|
209
|
+
|
|
210
|
+
### Use Appropriate Models
|
|
211
|
+
|
|
212
|
+
```ruby
|
|
213
|
+
# For simple extractions, use smaller models
|
|
214
|
+
config.llm_model = "gpt-4o-mini" # Cheaper than gpt-4o
|
|
215
|
+
|
|
216
|
+
# For complex documents, use larger models
|
|
217
|
+
config.llm_model = "gpt-4o"
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
### Batch Processing
|
|
221
|
+
|
|
222
|
+
```ruby
|
|
223
|
+
# Process in batches to reduce API calls
|
|
224
|
+
content_ids.each_slice(10) do |batch|
|
|
225
|
+
facts.batch_extract(batch, extractor: :llm)
|
|
226
|
+
sleep(1) # Rate limiting
|
|
227
|
+
end
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
### Local Models
|
|
231
|
+
|
|
232
|
+
```ruby
|
|
233
|
+
# Use Ollama for development/testing
|
|
234
|
+
FactDb.configure do |config|
|
|
235
|
+
config.llm_provider = :ollama
|
|
236
|
+
config.llm_model = "llama3.2"
|
|
237
|
+
end
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
## Testing
|
|
241
|
+
|
|
242
|
+
Mock LLM responses in tests:
|
|
243
|
+
|
|
244
|
+
```ruby
|
|
245
|
+
class MockLLMClient
|
|
246
|
+
def chat(prompt)
|
|
247
|
+
# Return predictable test data
|
|
248
|
+
'{"facts": [{"text": "Test fact", "valid_at": "2024-01-01", "entities": [], "confidence": 0.9}]}'
|
|
249
|
+
end
|
|
250
|
+
end
|
|
251
|
+
|
|
252
|
+
FactDb.configure do |config|
|
|
253
|
+
config.llm_client = MockLLMClient.new
|
|
254
|
+
end
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
## Best Practices
|
|
258
|
+
|
|
259
|
+
### 1. Validate Extractions
|
|
260
|
+
|
|
261
|
+
```ruby
|
|
262
|
+
extracted = facts.extract_facts(content.id, extractor: :llm)
|
|
263
|
+
|
|
264
|
+
extracted.each do |fact|
|
|
265
|
+
# Flag low-confidence extractions
|
|
266
|
+
if fact.confidence < 0.7
|
|
267
|
+
fact.update!(metadata: { needs_review: true })
|
|
268
|
+
end
|
|
269
|
+
end
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
### 2. Use Caching
|
|
273
|
+
|
|
274
|
+
```ruby
|
|
275
|
+
# Cache LLM responses for repeated content
|
|
276
|
+
cache_key = "llm_extraction:#{content.content_hash}"
|
|
277
|
+
extracted = Rails.cache.fetch(cache_key) do
|
|
278
|
+
facts.extract_facts(content.id, extractor: :llm)
|
|
279
|
+
end
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
### 3. Handle Rate Limits
|
|
283
|
+
|
|
284
|
+
```ruby
|
|
285
|
+
require 'retryable'
|
|
286
|
+
|
|
287
|
+
Retryable.retryable(tries: 3, sleep: 5) do
|
|
288
|
+
facts.extract_facts(content.id, extractor: :llm)
|
|
289
|
+
end
|
|
290
|
+
```
|
|
291
|
+
|
|
292
|
+
### 4. Monitor Usage
|
|
293
|
+
|
|
294
|
+
```ruby
|
|
295
|
+
# Track extraction statistics
|
|
296
|
+
extracted = facts.extract_facts(content.id, extractor: :llm)
|
|
297
|
+
StatsD.increment('fact_db.llm_extractions')
|
|
298
|
+
StatsD.histogram('fact_db.facts_per_content', extracted.count)
|
|
299
|
+
```
|