fact_db 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.envrc +2 -0
- data/.yardopts +5 -0
- data/CHANGELOG.md +64 -0
- data/README.md +107 -6
- data/Rakefile +243 -10
- data/db/migrate/001_enable_extensions.rb +1 -0
- data/db/migrate/002_create_sources.rb +49 -0
- data/db/migrate/003_create_entities.rb +27 -15
- data/db/migrate/004_create_entity_aliases.rb +20 -7
- data/db/migrate/005_create_facts.rb +37 -21
- data/db/migrate/006_create_entity_mentions.rb +14 -6
- data/db/migrate/007_create_fact_sources.rb +16 -8
- data/docs/api/extractors/index.md +5 -5
- data/docs/api/extractors/llm.md +17 -17
- data/docs/api/extractors/rule-based.md +14 -14
- data/docs/api/facts.md +20 -20
- data/docs/api/index.md +4 -4
- data/docs/api/models/entity.md +21 -21
- data/docs/api/models/fact.md +15 -15
- data/docs/api/models/index.md +7 -7
- data/docs/api/models/{content.md → source.md} +29 -29
- data/docs/api/pipeline/extraction.md +25 -25
- data/docs/api/pipeline/index.md +1 -1
- data/docs/api/pipeline/resolution.md +4 -4
- data/docs/api/services/entity-service.md +20 -20
- data/docs/api/services/fact-service.md +12 -12
- data/docs/api/services/index.md +5 -5
- data/docs/api/services/{content-service.md → source-service.md} +27 -27
- data/docs/architecture/database-schema.md +46 -46
- data/docs/architecture/entity-resolution.md +6 -6
- data/docs/architecture/index.md +10 -10
- data/docs/architecture/temporal-facts.md +5 -5
- data/docs/architecture/three-layer-model.md +17 -17
- data/docs/concepts.md +6 -6
- data/docs/examples/basic-usage.md +20 -20
- data/docs/examples/hr-onboarding.md +17 -17
- data/docs/examples/index.md +4 -4
- data/docs/examples/news-analysis.md +23 -23
- data/docs/getting-started/database-setup.md +28 -20
- data/docs/getting-started/index.md +3 -3
- data/docs/getting-started/quick-start.md +33 -30
- data/docs/guides/batch-processing.md +26 -26
- data/docs/guides/configuration.md +158 -77
- data/docs/guides/entity-management.md +14 -14
- data/docs/guides/extracting-facts.md +28 -28
- data/docs/guides/ingesting-content.md +14 -14
- data/docs/guides/llm-integration.md +40 -32
- data/docs/guides/temporal-queries.md +11 -11
- data/docs/index.md +6 -2
- data/examples/.envrc +4 -0
- data/examples/.gitignore +1 -0
- data/examples/001_configuration.rb +312 -0
- data/examples/{basic_usage.rb → 010_basic_usage.rb} +47 -56
- data/examples/{entity_management.rb → 020_entity_management.rb} +57 -72
- data/examples/{temporal_queries.rb → 030_temporal_queries.rb} +39 -59
- data/examples/040_output_formats.rb +177 -0
- data/examples/{rule_based_extraction.rb → 050_rule_based_extraction.rb} +39 -45
- data/examples/060_fluent_temporal_api.rb +217 -0
- data/examples/070_introspection.rb +252 -0
- data/examples/{hr_system.rb → 080_hr_system.rb} +56 -75
- data/examples/090_ingest_demo.rb +515 -0
- data/examples/100_query_context.rb +668 -0
- data/examples/110_prove_it.rb +204 -0
- data/examples/120_dump_database.rb +358 -0
- data/examples/130_rag_feedback_loop.rb +858 -0
- data/examples/README.md +229 -15
- data/examples/data/lincoln_associates.md +201 -0
- data/examples/data/lincoln_biography.md +66 -0
- data/examples/data/lincoln_cabinet.md +243 -0
- data/examples/data/lincoln_family.md +163 -0
- data/examples/data/lincoln_military.md +241 -0
- data/examples/data/lincoln_todd_family.md +136 -0
- data/examples/ingest_reporter.rb +335 -0
- data/examples/utilities.rb +182 -0
- data/lib/fact_db/config/defaults.yml +254 -0
- data/lib/fact_db/config.rb +94 -35
- data/lib/fact_db/database.rb +98 -8
- data/lib/fact_db/extractors/base.rb +106 -21
- data/lib/fact_db/extractors/llm_extractor.rb +35 -63
- data/lib/fact_db/extractors/manual_extractor.rb +46 -6
- data/lib/fact_db/extractors/rule_based_extractor.rb +136 -25
- data/lib/fact_db/llm/adapter.rb +3 -3
- data/lib/fact_db/models/entity.rb +94 -22
- data/lib/fact_db/models/entity_alias.rb +41 -7
- data/lib/fact_db/models/entity_mention.rb +34 -1
- data/lib/fact_db/models/fact.rb +259 -28
- data/lib/fact_db/models/fact_source.rb +43 -9
- data/lib/fact_db/models/source.rb +113 -0
- data/lib/fact_db/pipeline/extraction_pipeline.rb +35 -35
- data/lib/fact_db/pipeline/resolution_pipeline.rb +5 -5
- data/lib/fact_db/query_result.rb +202 -0
- data/lib/fact_db/resolution/entity_resolver.rb +139 -39
- data/lib/fact_db/resolution/fact_resolver.rb +86 -14
- data/lib/fact_db/services/entity_service.rb +246 -37
- data/lib/fact_db/services/fact_service.rb +254 -17
- data/lib/fact_db/services/source_service.rb +164 -0
- data/lib/fact_db/temporal/query.rb +71 -7
- data/lib/fact_db/temporal/query_builder.rb +69 -0
- data/lib/fact_db/temporal/timeline.rb +102 -11
- data/lib/fact_db/transformers/base.rb +77 -0
- data/lib/fact_db/transformers/cypher_transformer.rb +185 -0
- data/lib/fact_db/transformers/json_transformer.rb +17 -0
- data/lib/fact_db/transformers/raw_transformer.rb +35 -0
- data/lib/fact_db/transformers/text_transformer.rb +114 -0
- data/lib/fact_db/transformers/triple_transformer.rb +138 -0
- data/lib/fact_db/validation/alias_filter.rb +185 -0
- data/lib/fact_db/version.rb +1 -1
- data/lib/fact_db.rb +281 -30
- data/mkdocs.yml +2 -2
- metadata +60 -16
- data/db/migrate/002_create_contents.rb +0 -44
- data/lib/fact_db/models/content.rb +0 -62
- data/lib/fact_db/services/content_service.rb +0 -93
|
@@ -24,7 +24,7 @@ FactDb provides migrations that create all necessary tables:
|
|
|
24
24
|
require 'fact_db'
|
|
25
25
|
|
|
26
26
|
FactDb.configure do |config|
|
|
27
|
-
config.
|
|
27
|
+
config.database.url = "postgresql://localhost/fact_db"
|
|
28
28
|
end
|
|
29
29
|
|
|
30
30
|
FactDb::Database.migrate!
|
|
@@ -34,19 +34,19 @@ FactDb::Database.migrate!
|
|
|
34
34
|
|
|
35
35
|
The migrations create six tables:
|
|
36
36
|
|
|
37
|
-
###
|
|
37
|
+
### sources
|
|
38
38
|
|
|
39
|
-
Stores immutable source
|
|
39
|
+
Stores immutable source content.
|
|
40
40
|
|
|
41
41
|
| Column | Type | Description |
|
|
42
42
|
|--------|------|-------------|
|
|
43
43
|
| id | bigint | Primary key |
|
|
44
44
|
| content_hash | string | SHA256 hash for deduplication |
|
|
45
|
-
|
|
|
46
|
-
|
|
|
45
|
+
| type | string | Type (email, document, article) |
|
|
46
|
+
| content | text | Original source content |
|
|
47
47
|
| title | string | Optional title |
|
|
48
48
|
| source_uri | string | Original location |
|
|
49
|
-
|
|
|
49
|
+
| metadata | jsonb | Additional metadata |
|
|
50
50
|
| embedding | vector(1536) | Semantic search vector |
|
|
51
51
|
| captured_at | timestamptz | When content was captured |
|
|
52
52
|
|
|
@@ -57,10 +57,10 @@ Stores resolved identities.
|
|
|
57
57
|
| Column | Type | Description |
|
|
58
58
|
|--------|------|-------------|
|
|
59
59
|
| id | bigint | Primary key |
|
|
60
|
-
|
|
|
61
|
-
|
|
|
60
|
+
| name | string | Authoritative name |
|
|
61
|
+
| type | string | person, organization, place, etc. |
|
|
62
62
|
| resolution_status | string | unresolved, resolved, merged |
|
|
63
|
-
|
|
|
63
|
+
| canonical_id | bigint | Points to canonical entity if merged |
|
|
64
64
|
| metadata | jsonb | Additional attributes |
|
|
65
65
|
| embedding | vector(1536) | Semantic search vector |
|
|
66
66
|
|
|
@@ -72,8 +72,8 @@ Stores alternative names for entities.
|
|
|
72
72
|
|--------|------|-------------|
|
|
73
73
|
| id | bigint | Primary key |
|
|
74
74
|
| entity_id | bigint | Foreign key to entities |
|
|
75
|
-
|
|
|
76
|
-
|
|
|
75
|
+
| name | string | Alternative name |
|
|
76
|
+
| type | string | nickname, abbreviation, etc. |
|
|
77
77
|
| confidence | float | Match confidence (0-1) |
|
|
78
78
|
|
|
79
79
|
### facts
|
|
@@ -83,8 +83,8 @@ Stores temporal assertions.
|
|
|
83
83
|
| Column | Type | Description |
|
|
84
84
|
|--------|------|-------------|
|
|
85
85
|
| id | bigint | Primary key |
|
|
86
|
-
|
|
|
87
|
-
|
|
|
86
|
+
| text | text | The assertion |
|
|
87
|
+
| digest | string | SHA256 digest for deduplication |
|
|
88
88
|
| valid_at | timestamptz | When fact became true |
|
|
89
89
|
| invalid_at | timestamptz | When fact stopped being true |
|
|
90
90
|
| status | string | canonical, superseded, corroborated, synthesized |
|
|
@@ -117,8 +117,8 @@ Links facts to source content.
|
|
|
117
117
|
|--------|------|-------------|
|
|
118
118
|
| id | bigint | Primary key |
|
|
119
119
|
| fact_id | bigint | Foreign key to facts |
|
|
120
|
-
|
|
|
121
|
-
|
|
|
120
|
+
| source_id | bigint | Foreign key to sources |
|
|
121
|
+
| kind | string | primary, supporting, corroborating |
|
|
122
122
|
| excerpt | text | Relevant text excerpt |
|
|
123
123
|
| confidence | float | Source confidence |
|
|
124
124
|
|
|
@@ -128,8 +128,8 @@ The migrations create indexes for:
|
|
|
128
128
|
|
|
129
129
|
- Content hash (unique)
|
|
130
130
|
- Content type
|
|
131
|
-
- Full-text search on
|
|
132
|
-
- Entity
|
|
131
|
+
- Full-text search on content
|
|
132
|
+
- Entity name
|
|
133
133
|
- Entity type
|
|
134
134
|
- Fact status
|
|
135
135
|
- Temporal range queries (valid_at, invalid_at)
|
|
@@ -158,12 +158,20 @@ Configure the connection pool for your workload:
|
|
|
158
158
|
|
|
159
159
|
```ruby
|
|
160
160
|
FactDb.configure do |config|
|
|
161
|
-
config.
|
|
162
|
-
config.
|
|
163
|
-
config.
|
|
161
|
+
config.database.url = ENV['DATABASE_URL']
|
|
162
|
+
config.database.pool_size = 10 # Default: 5
|
|
163
|
+
config.database.timeout = 60_000 # Default: 30000ms
|
|
164
164
|
end
|
|
165
165
|
```
|
|
166
166
|
|
|
167
|
+
Or via environment variables:
|
|
168
|
+
|
|
169
|
+
```bash
|
|
170
|
+
export FDB_DATABASE__URL="postgresql://localhost/fact_db"
|
|
171
|
+
export FDB_DATABASE__POOL_SIZE=10
|
|
172
|
+
export FDB_DATABASE__TIMEOUT=60000
|
|
173
|
+
```
|
|
174
|
+
|
|
167
175
|
## Next Steps
|
|
168
176
|
|
|
169
177
|
- [Quick Start](quick-start.md) - Start using FactDb
|
|
@@ -55,17 +55,17 @@ require 'fact_db'
|
|
|
55
55
|
|
|
56
56
|
# Configure
|
|
57
57
|
FactDb.configure do |config|
|
|
58
|
-
config.
|
|
58
|
+
config.database.url = ENV['DATABASE_URL']
|
|
59
59
|
end
|
|
60
60
|
|
|
61
61
|
# Create a facts instance
|
|
62
62
|
facts = FactDb.new
|
|
63
63
|
|
|
64
64
|
# Ingest content
|
|
65
|
-
|
|
65
|
+
source = facts.ingest("Important information...", kind: :document)
|
|
66
66
|
|
|
67
67
|
# Extract and query facts
|
|
68
|
-
extracted = facts.extract_facts(
|
|
68
|
+
extracted = facts.extract_facts(source.id)
|
|
69
69
|
```
|
|
70
70
|
|
|
71
71
|
Continue to the [Installation Guide](installation.md) to begin.
|
|
@@ -9,27 +9,30 @@ Create a configuration file or use environment variables:
|
|
|
9
9
|
=== "Environment Variables"
|
|
10
10
|
|
|
11
11
|
```bash
|
|
12
|
-
export
|
|
13
|
-
export
|
|
14
|
-
export
|
|
12
|
+
export FDB_DATABASE__URL="postgresql://localhost/fact_db"
|
|
13
|
+
export FDB_LLM__PROVIDER="openai"
|
|
14
|
+
export FDB_LLM__API_KEY="sk-..."
|
|
15
15
|
```
|
|
16
16
|
|
|
17
17
|
=== "YAML Config"
|
|
18
18
|
|
|
19
19
|
```yaml
|
|
20
20
|
# config/fact_db.yml
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
21
|
+
database:
|
|
22
|
+
url: postgresql://localhost/fact_db
|
|
23
|
+
|
|
24
|
+
llm:
|
|
25
|
+
provider: openai
|
|
26
|
+
api_key: <%= ENV['OPENAI_API_KEY'] %>
|
|
24
27
|
```
|
|
25
28
|
|
|
26
29
|
=== "Ruby Block"
|
|
27
30
|
|
|
28
31
|
```ruby
|
|
29
32
|
FactDb.configure do |config|
|
|
30
|
-
config.
|
|
31
|
-
config.
|
|
32
|
-
config.
|
|
33
|
+
config.database.url = "postgresql://localhost/fact_db"
|
|
34
|
+
config.llm.provider = :openai
|
|
35
|
+
config.llm.api_key = ENV['OPENAI_API_KEY']
|
|
33
36
|
end
|
|
34
37
|
```
|
|
35
38
|
|
|
@@ -41,7 +44,7 @@ Run the migrations:
|
|
|
41
44
|
require 'fact_db'
|
|
42
45
|
|
|
43
46
|
FactDb.configure do |config|
|
|
44
|
-
config.
|
|
47
|
+
config.database.url = ENV['DATABASE_URL']
|
|
45
48
|
end
|
|
46
49
|
|
|
47
50
|
# Run migrations
|
|
@@ -60,14 +63,14 @@ facts = FactDb.new
|
|
|
60
63
|
|
|
61
64
|
```ruby
|
|
62
65
|
# Ingest an email
|
|
63
|
-
|
|
66
|
+
source = facts.ingest(
|
|
64
67
|
"Hi team, Paula Chen has accepted our offer and will join as Principal Engineer starting January 10, 2024. She'll be reporting to Sarah in the Platform team.",
|
|
65
|
-
|
|
68
|
+
kind: :email,
|
|
66
69
|
title: "New Hire Announcement",
|
|
67
70
|
captured_at: Time.current
|
|
68
71
|
)
|
|
69
72
|
|
|
70
|
-
puts "Ingested
|
|
73
|
+
puts "Ingested source: #{source.id}"
|
|
71
74
|
```
|
|
72
75
|
|
|
73
76
|
## 5. Create Entities
|
|
@@ -76,19 +79,19 @@ puts "Ingested content: #{content.id}"
|
|
|
76
79
|
# Create entities for people and organizations
|
|
77
80
|
paula = facts.entity_service.create(
|
|
78
81
|
"Paula Chen",
|
|
79
|
-
|
|
82
|
+
kind: :person,
|
|
80
83
|
aliases: ["Paula", "P. Chen"]
|
|
81
84
|
)
|
|
82
85
|
|
|
83
86
|
sarah = facts.entity_service.create(
|
|
84
87
|
"Sarah Johnson",
|
|
85
|
-
|
|
88
|
+
kind: :person,
|
|
86
89
|
aliases: ["Sarah"]
|
|
87
90
|
)
|
|
88
91
|
|
|
89
92
|
platform_team = facts.entity_service.create(
|
|
90
93
|
"Platform Team",
|
|
91
|
-
|
|
94
|
+
kind: :organization
|
|
92
95
|
)
|
|
93
96
|
```
|
|
94
97
|
|
|
@@ -104,7 +107,7 @@ fact = facts.fact_service.create(
|
|
|
104
107
|
{ entity: paula, role: "subject", text: "Paula Chen" }
|
|
105
108
|
],
|
|
106
109
|
sources: [
|
|
107
|
-
{
|
|
110
|
+
{ source: source, type: "primary" }
|
|
108
111
|
]
|
|
109
112
|
)
|
|
110
113
|
```
|
|
@@ -113,10 +116,10 @@ fact = facts.fact_service.create(
|
|
|
113
116
|
|
|
114
117
|
```ruby
|
|
115
118
|
# Extract facts automatically using LLM
|
|
116
|
-
extracted = facts.extract_facts(
|
|
119
|
+
extracted = facts.extract_facts(source.id, extractor: :llm)
|
|
117
120
|
|
|
118
121
|
extracted.each do |fact|
|
|
119
|
-
puts "Extracted: #{fact.
|
|
122
|
+
puts "Extracted: #{fact.text}"
|
|
120
123
|
puts " Valid from: #{fact.valid_at}"
|
|
121
124
|
end
|
|
122
125
|
```
|
|
@@ -126,7 +129,7 @@ end
|
|
|
126
129
|
```ruby
|
|
127
130
|
# Get current facts about Paula
|
|
128
131
|
current = facts.current_facts_for(paula.id)
|
|
129
|
-
current.each { |f| puts f.
|
|
132
|
+
current.each { |f| puts f.text }
|
|
130
133
|
|
|
131
134
|
# Get facts valid at a specific date
|
|
132
135
|
historical = facts.facts_at(
|
|
@@ -144,7 +147,7 @@ team_facts = facts.query_facts(topic: "Platform Team")
|
|
|
144
147
|
timeline = facts.timeline_for(paula.id)
|
|
145
148
|
|
|
146
149
|
timeline.each do |entry|
|
|
147
|
-
puts "#{entry[:date]}: #{entry[:fact].
|
|
150
|
+
puts "#{entry[:date]}: #{entry[:fact].text}"
|
|
148
151
|
end
|
|
149
152
|
```
|
|
150
153
|
|
|
@@ -155,32 +158,32 @@ require 'fact_db'
|
|
|
155
158
|
|
|
156
159
|
# Configure
|
|
157
160
|
FactDb.configure do |config|
|
|
158
|
-
config.
|
|
159
|
-
config.
|
|
160
|
-
config.
|
|
161
|
+
config.database.url = ENV['DATABASE_URL']
|
|
162
|
+
config.llm.provider = :openai
|
|
163
|
+
config.llm.api_key = ENV['OPENAI_API_KEY']
|
|
161
164
|
end
|
|
162
165
|
|
|
163
166
|
# Create facts instance
|
|
164
167
|
facts = FactDb.new
|
|
165
168
|
|
|
166
169
|
# Ingest content
|
|
167
|
-
|
|
170
|
+
source = facts.ingest(
|
|
168
171
|
"Paula Chen joined Microsoft as Principal Engineer on January 10, 2024.",
|
|
169
|
-
|
|
172
|
+
kind: :announcement,
|
|
170
173
|
captured_at: Time.current
|
|
171
174
|
)
|
|
172
175
|
|
|
173
176
|
# Create entities
|
|
174
|
-
paula = facts.entity_service.create("Paula Chen",
|
|
175
|
-
microsoft = facts.entity_service.create("Microsoft",
|
|
177
|
+
paula = facts.entity_service.create("Paula Chen", kind: :person)
|
|
178
|
+
microsoft = facts.entity_service.create("Microsoft", kind: :organization)
|
|
176
179
|
|
|
177
180
|
# Extract facts via LLM
|
|
178
|
-
extracted = facts.extract_facts(
|
|
181
|
+
extracted = facts.extract_facts(source.id, extractor: :llm)
|
|
179
182
|
|
|
180
183
|
# Query
|
|
181
184
|
puts "Current facts about Paula:"
|
|
182
185
|
facts.current_facts_for(paula.id).each do |fact|
|
|
183
|
-
puts " - #{fact.
|
|
186
|
+
puts " - #{fact.text}"
|
|
184
187
|
end
|
|
185
188
|
```
|
|
186
189
|
|
|
@@ -20,10 +20,10 @@ Process content one at a time:
|
|
|
20
20
|
```ruby
|
|
21
21
|
facts = FactDb.new
|
|
22
22
|
|
|
23
|
-
|
|
23
|
+
source_ids = [source1.id, source2.id, source3.id]
|
|
24
24
|
|
|
25
25
|
results = facts.batch_extract(
|
|
26
|
-
|
|
26
|
+
source_ids,
|
|
27
27
|
extractor: :llm,
|
|
28
28
|
parallel: false
|
|
29
29
|
)
|
|
@@ -35,13 +35,13 @@ Process content concurrently (default):
|
|
|
35
35
|
|
|
36
36
|
```ruby
|
|
37
37
|
results = facts.batch_extract(
|
|
38
|
-
|
|
38
|
+
source_ids,
|
|
39
39
|
extractor: :llm,
|
|
40
40
|
parallel: true # default
|
|
41
41
|
)
|
|
42
42
|
|
|
43
43
|
results.each do |result|
|
|
44
|
-
puts "Content #{result[:
|
|
44
|
+
puts "Content #{result[:source_id]}:"
|
|
45
45
|
puts " Facts extracted: #{result[:facts].count}"
|
|
46
46
|
puts " Error: #{result[:error]}" if result[:error]
|
|
47
47
|
end
|
|
@@ -51,7 +51,7 @@ end
|
|
|
51
51
|
|
|
52
52
|
```ruby
|
|
53
53
|
result = {
|
|
54
|
-
|
|
54
|
+
source_id: 123,
|
|
55
55
|
facts: [<Fact>, <Fact>, ...], # Extracted facts
|
|
56
56
|
error: nil # Error message if failed
|
|
57
57
|
}
|
|
@@ -75,7 +75,7 @@ results = facts.batch_resolve_entities(names, type: nil)
|
|
|
75
75
|
results.each do |result|
|
|
76
76
|
case result[:status]
|
|
77
77
|
when :resolved
|
|
78
|
-
puts "#{result[:name]} -> #{result[:entity].
|
|
78
|
+
puts "#{result[:name]} -> #{result[:entity].name}"
|
|
79
79
|
when :not_found
|
|
80
80
|
puts "#{result[:name]} -> Not found"
|
|
81
81
|
when :error
|
|
@@ -104,9 +104,9 @@ results.each do |result|
|
|
|
104
104
|
if result[:conflict_count] > 0
|
|
105
105
|
puts "Entity #{result[:entity_id]} has #{result[:conflict_count]} conflicts:"
|
|
106
106
|
result[:conflicts].each do |conflict|
|
|
107
|
-
puts " #{conflict[:fact1].
|
|
107
|
+
puts " #{conflict[:fact1].text}"
|
|
108
108
|
puts " vs"
|
|
109
|
-
puts " #{conflict[:fact2].
|
|
109
|
+
puts " #{conflict[:fact2].text}"
|
|
110
110
|
puts " Similarity: #{conflict[:similarity]}"
|
|
111
111
|
end
|
|
112
112
|
end
|
|
@@ -153,7 +153,7 @@ pipeline = SimpleFlow::Pipeline.new do
|
|
|
153
153
|
# Step 1: Validate
|
|
154
154
|
step ->(result) {
|
|
155
155
|
content = result.value
|
|
156
|
-
if content.
|
|
156
|
+
if content.content.blank?
|
|
157
157
|
result.halt("Empty content")
|
|
158
158
|
else
|
|
159
159
|
result.continue(content)
|
|
@@ -182,7 +182,7 @@ result = pipeline.call(SimpleFlow::Result.new(content))
|
|
|
182
182
|
### Graceful Degradation
|
|
183
183
|
|
|
184
184
|
```ruby
|
|
185
|
-
results = facts.batch_extract(
|
|
185
|
+
results = facts.batch_extract(source_ids, extractor: :llm)
|
|
186
186
|
|
|
187
187
|
successful = results.select { |r| r[:error].nil? }
|
|
188
188
|
failed = results.reject { |r| r[:error].nil? }
|
|
@@ -192,7 +192,7 @@ puts "Failed: #{failed.count}"
|
|
|
192
192
|
|
|
193
193
|
# Retry failed items with different extractor
|
|
194
194
|
if failed.any?
|
|
195
|
-
retry_ids = failed.map { |r| r[:
|
|
195
|
+
retry_ids = failed.map { |r| r[:source_id] }
|
|
196
196
|
retry_results = facts.batch_extract(retry_ids, extractor: :rule_based)
|
|
197
197
|
end
|
|
198
198
|
```
|
|
@@ -204,7 +204,7 @@ results.each do |result|
|
|
|
204
204
|
if result[:error]
|
|
205
205
|
logger.error(
|
|
206
206
|
"Extraction failed",
|
|
207
|
-
|
|
207
|
+
source_id: result[:source_id],
|
|
208
208
|
error: result[:error]
|
|
209
209
|
)
|
|
210
210
|
end
|
|
@@ -217,7 +217,7 @@ end
|
|
|
217
217
|
|
|
218
218
|
```ruby
|
|
219
219
|
# Process in batches of 10-50 for optimal performance
|
|
220
|
-
|
|
220
|
+
source_ids.each_slice(25) do |batch|
|
|
221
221
|
results = facts.batch_extract(batch, parallel: true)
|
|
222
222
|
process_results(results)
|
|
223
223
|
end
|
|
@@ -228,7 +228,7 @@ end
|
|
|
228
228
|
For LLM extraction, add delays between batches:
|
|
229
229
|
|
|
230
230
|
```ruby
|
|
231
|
-
|
|
231
|
+
source_ids.each_slice(10) do |batch|
|
|
232
232
|
results = facts.batch_extract(batch, extractor: :llm)
|
|
233
233
|
process_results(results)
|
|
234
234
|
sleep(2) # Rate limit
|
|
@@ -239,7 +239,7 @@ end
|
|
|
239
239
|
|
|
240
240
|
```ruby
|
|
241
241
|
# Process results immediately to avoid memory buildup
|
|
242
|
-
|
|
242
|
+
source_ids.each_slice(50) do |batch|
|
|
243
243
|
results = facts.batch_extract(batch)
|
|
244
244
|
|
|
245
245
|
results.each do |result|
|
|
@@ -259,7 +259,7 @@ Track batch processing metrics:
|
|
|
259
259
|
```ruby
|
|
260
260
|
start_time = Time.now
|
|
261
261
|
|
|
262
|
-
results = facts.batch_extract(
|
|
262
|
+
results = facts.batch_extract(source_ids, parallel: true)
|
|
263
263
|
|
|
264
264
|
duration = Time.now - start_time
|
|
265
265
|
success_rate = results.count { |r| r[:error].nil? }.to_f / results.count
|
|
@@ -275,18 +275,18 @@ puts "Items/second: #{(results.count / duration).round(2)}"
|
|
|
275
275
|
|
|
276
276
|
```ruby
|
|
277
277
|
# Sequential for small batches (< 5 items)
|
|
278
|
-
if
|
|
279
|
-
results = facts.batch_extract(
|
|
278
|
+
if source_ids.count < 5
|
|
279
|
+
results = facts.batch_extract(source_ids, parallel: false)
|
|
280
280
|
else
|
|
281
|
-
results = facts.batch_extract(
|
|
281
|
+
results = facts.batch_extract(source_ids, parallel: true)
|
|
282
282
|
end
|
|
283
283
|
```
|
|
284
284
|
|
|
285
285
|
### 2. Handle Partial Failures
|
|
286
286
|
|
|
287
287
|
```ruby
|
|
288
|
-
def process_batch(
|
|
289
|
-
results = facts.batch_extract(
|
|
288
|
+
def process_batch(source_ids)
|
|
289
|
+
results = facts.batch_extract(source_ids)
|
|
290
290
|
|
|
291
291
|
{
|
|
292
292
|
successful: results.select { |r| r[:error].nil? },
|
|
@@ -294,17 +294,17 @@ def process_batch(content_ids)
|
|
|
294
294
|
}
|
|
295
295
|
end
|
|
296
296
|
|
|
297
|
-
batch_result = process_batch(
|
|
297
|
+
batch_result = process_batch(source_ids)
|
|
298
298
|
retry_failed(batch_result[:failed]) if batch_result[:failed].any?
|
|
299
299
|
```
|
|
300
300
|
|
|
301
301
|
### 3. Log Progress
|
|
302
302
|
|
|
303
303
|
```ruby
|
|
304
|
-
total =
|
|
304
|
+
total = source_ids.count
|
|
305
305
|
processed = 0
|
|
306
306
|
|
|
307
|
-
|
|
307
|
+
source_ids.each_slice(25) do |batch|
|
|
308
308
|
results = facts.batch_extract(batch)
|
|
309
309
|
processed += batch.count
|
|
310
310
|
|
|
@@ -316,10 +316,10 @@ end
|
|
|
316
316
|
|
|
317
317
|
```ruby
|
|
318
318
|
# LLM for complex documents
|
|
319
|
-
complex_docs =
|
|
319
|
+
complex_docs = sources.select { |s| s.content.length > 1000 }
|
|
320
320
|
facts.batch_extract(complex_docs.map(&:id), extractor: :llm)
|
|
321
321
|
|
|
322
322
|
# Rule-based for simple, structured content
|
|
323
|
-
simple_docs =
|
|
323
|
+
simple_docs = sources.select { |s| s.content.length <= 1000 }
|
|
324
324
|
facts.batch_extract(simple_docs.map(&:id), extractor: :rule_based)
|
|
325
325
|
```
|