fact_db 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.envrc +2 -0
- data/.yardopts +5 -0
- data/CHANGELOG.md +64 -0
- data/README.md +107 -6
- data/Rakefile +243 -10
- data/db/migrate/001_enable_extensions.rb +1 -0
- data/db/migrate/002_create_sources.rb +49 -0
- data/db/migrate/003_create_entities.rb +27 -15
- data/db/migrate/004_create_entity_aliases.rb +20 -7
- data/db/migrate/005_create_facts.rb +37 -21
- data/db/migrate/006_create_entity_mentions.rb +14 -6
- data/db/migrate/007_create_fact_sources.rb +16 -8
- data/docs/api/extractors/index.md +5 -5
- data/docs/api/extractors/llm.md +17 -17
- data/docs/api/extractors/rule-based.md +14 -14
- data/docs/api/facts.md +20 -20
- data/docs/api/index.md +4 -4
- data/docs/api/models/entity.md +21 -21
- data/docs/api/models/fact.md +15 -15
- data/docs/api/models/index.md +7 -7
- data/docs/api/models/{content.md → source.md} +29 -29
- data/docs/api/pipeline/extraction.md +25 -25
- data/docs/api/pipeline/index.md +1 -1
- data/docs/api/pipeline/resolution.md +4 -4
- data/docs/api/services/entity-service.md +20 -20
- data/docs/api/services/fact-service.md +12 -12
- data/docs/api/services/index.md +5 -5
- data/docs/api/services/{content-service.md → source-service.md} +27 -27
- data/docs/architecture/database-schema.md +46 -46
- data/docs/architecture/entity-resolution.md +6 -6
- data/docs/architecture/index.md +10 -10
- data/docs/architecture/temporal-facts.md +5 -5
- data/docs/architecture/three-layer-model.md +17 -17
- data/docs/concepts.md +6 -6
- data/docs/examples/basic-usage.md +20 -20
- data/docs/examples/hr-onboarding.md +17 -17
- data/docs/examples/index.md +4 -4
- data/docs/examples/news-analysis.md +23 -23
- data/docs/getting-started/database-setup.md +28 -20
- data/docs/getting-started/index.md +3 -3
- data/docs/getting-started/quick-start.md +33 -30
- data/docs/guides/batch-processing.md +26 -26
- data/docs/guides/configuration.md +158 -77
- data/docs/guides/entity-management.md +14 -14
- data/docs/guides/extracting-facts.md +28 -28
- data/docs/guides/ingesting-content.md +14 -14
- data/docs/guides/llm-integration.md +40 -32
- data/docs/guides/temporal-queries.md +11 -11
- data/docs/index.md +6 -2
- data/examples/.envrc +4 -0
- data/examples/.gitignore +1 -0
- data/examples/001_configuration.rb +312 -0
- data/examples/{basic_usage.rb → 010_basic_usage.rb} +47 -56
- data/examples/{entity_management.rb → 020_entity_management.rb} +57 -72
- data/examples/{temporal_queries.rb → 030_temporal_queries.rb} +39 -59
- data/examples/040_output_formats.rb +177 -0
- data/examples/{rule_based_extraction.rb → 050_rule_based_extraction.rb} +39 -45
- data/examples/060_fluent_temporal_api.rb +217 -0
- data/examples/070_introspection.rb +252 -0
- data/examples/{hr_system.rb → 080_hr_system.rb} +56 -75
- data/examples/090_ingest_demo.rb +515 -0
- data/examples/100_query_context.rb +668 -0
- data/examples/110_prove_it.rb +204 -0
- data/examples/120_dump_database.rb +358 -0
- data/examples/130_rag_feedback_loop.rb +858 -0
- data/examples/README.md +229 -15
- data/examples/data/lincoln_associates.md +201 -0
- data/examples/data/lincoln_biography.md +66 -0
- data/examples/data/lincoln_cabinet.md +243 -0
- data/examples/data/lincoln_family.md +163 -0
- data/examples/data/lincoln_military.md +241 -0
- data/examples/data/lincoln_todd_family.md +136 -0
- data/examples/ingest_reporter.rb +335 -0
- data/examples/utilities.rb +182 -0
- data/lib/fact_db/config/defaults.yml +254 -0
- data/lib/fact_db/config.rb +94 -35
- data/lib/fact_db/database.rb +98 -8
- data/lib/fact_db/extractors/base.rb +106 -21
- data/lib/fact_db/extractors/llm_extractor.rb +35 -63
- data/lib/fact_db/extractors/manual_extractor.rb +46 -6
- data/lib/fact_db/extractors/rule_based_extractor.rb +136 -25
- data/lib/fact_db/llm/adapter.rb +3 -3
- data/lib/fact_db/models/entity.rb +94 -22
- data/lib/fact_db/models/entity_alias.rb +41 -7
- data/lib/fact_db/models/entity_mention.rb +34 -1
- data/lib/fact_db/models/fact.rb +259 -28
- data/lib/fact_db/models/fact_source.rb +43 -9
- data/lib/fact_db/models/source.rb +113 -0
- data/lib/fact_db/pipeline/extraction_pipeline.rb +35 -35
- data/lib/fact_db/pipeline/resolution_pipeline.rb +5 -5
- data/lib/fact_db/query_result.rb +202 -0
- data/lib/fact_db/resolution/entity_resolver.rb +139 -39
- data/lib/fact_db/resolution/fact_resolver.rb +86 -14
- data/lib/fact_db/services/entity_service.rb +246 -37
- data/lib/fact_db/services/fact_service.rb +254 -17
- data/lib/fact_db/services/source_service.rb +164 -0
- data/lib/fact_db/temporal/query.rb +71 -7
- data/lib/fact_db/temporal/query_builder.rb +69 -0
- data/lib/fact_db/temporal/timeline.rb +102 -11
- data/lib/fact_db/transformers/base.rb +77 -0
- data/lib/fact_db/transformers/cypher_transformer.rb +185 -0
- data/lib/fact_db/transformers/json_transformer.rb +17 -0
- data/lib/fact_db/transformers/raw_transformer.rb +35 -0
- data/lib/fact_db/transformers/text_transformer.rb +114 -0
- data/lib/fact_db/transformers/triple_transformer.rb +138 -0
- data/lib/fact_db/validation/alias_filter.rb +185 -0
- data/lib/fact_db/version.rb +1 -1
- data/lib/fact_db.rb +281 -30
- data/mkdocs.yml +2 -2
- metadata +60 -16
- data/db/migrate/002_create_contents.rb +0 -44
- data/lib/fact_db/models/content.rb +0 -62
- data/lib/fact_db/services/content_service.rb +0 -93
data/docs/api/models/entity.md
CHANGED
|
@@ -6,8 +6,8 @@ Stores resolved identities (people, organizations, places, etc.).
|
|
|
6
6
|
|
|
7
7
|
```ruby
|
|
8
8
|
entity = FactDb::Models::Entity.new(
|
|
9
|
-
|
|
10
|
-
|
|
9
|
+
name: "Paula Chen",
|
|
10
|
+
kind: "person"
|
|
11
11
|
)
|
|
12
12
|
```
|
|
13
13
|
|
|
@@ -16,15 +16,15 @@ entity = FactDb::Models::Entity.new(
|
|
|
16
16
|
| Attribute | Type | Description |
|
|
17
17
|
|-----------|------|-------------|
|
|
18
18
|
| `id` | Integer | Primary key |
|
|
19
|
-
| `
|
|
20
|
-
| `
|
|
19
|
+
| `name` | String | Authoritative name |
|
|
20
|
+
| `kind` | String | Kind (person, organization, place, etc.) |
|
|
21
21
|
| `resolution_status` | String | Status (unresolved, resolved, merged) |
|
|
22
|
-
| `
|
|
22
|
+
| `canonical_id` | Integer | Points to canonical entity if merged |
|
|
23
23
|
| `metadata` | Hash | Additional attributes (JSONB) |
|
|
24
24
|
| `embedding` | Vector | Semantic search vector |
|
|
25
25
|
| `created_at` | DateTime | Record creation time |
|
|
26
26
|
|
|
27
|
-
## Entity
|
|
27
|
+
## Entity Kinds
|
|
28
28
|
|
|
29
29
|
- `person` - Individual people
|
|
30
30
|
- `organization` - Companies, teams, groups
|
|
@@ -52,7 +52,7 @@ belongs_to :merged_into, class_name: 'Entity', optional: true
|
|
|
52
52
|
### add_alias
|
|
53
53
|
|
|
54
54
|
```ruby
|
|
55
|
-
def add_alias(text,
|
|
55
|
+
def add_alias(text, kind: nil, confidence: 1.0)
|
|
56
56
|
```
|
|
57
57
|
|
|
58
58
|
Add an alias to the entity.
|
|
@@ -60,7 +60,7 @@ Add an alias to the entity.
|
|
|
60
60
|
**Example:**
|
|
61
61
|
|
|
62
62
|
```ruby
|
|
63
|
-
entity.add_alias("Paula",
|
|
63
|
+
entity.add_alias("Paula", kind: "nickname", confidence: 0.95)
|
|
64
64
|
```
|
|
65
65
|
|
|
66
66
|
### merged?
|
|
@@ -88,16 +88,16 @@ canonical = entity.canonical # Returns the canonical entity
|
|
|
88
88
|
|
|
89
89
|
## Scopes
|
|
90
90
|
|
|
91
|
-
###
|
|
91
|
+
### by_kind
|
|
92
92
|
|
|
93
93
|
```ruby
|
|
94
|
-
scope :
|
|
94
|
+
scope :by_kind, ->(k) { where(kind: k) }
|
|
95
95
|
```
|
|
96
96
|
|
|
97
|
-
Filter by entity
|
|
97
|
+
Filter by entity kind.
|
|
98
98
|
|
|
99
99
|
```ruby
|
|
100
|
-
Entity.
|
|
100
|
+
Entity.by_kind('person')
|
|
101
101
|
```
|
|
102
102
|
|
|
103
103
|
### active
|
|
@@ -124,7 +124,7 @@ Only resolved entities.
|
|
|
124
124
|
|
|
125
125
|
```ruby
|
|
126
126
|
scope :search_name, ->(query) {
|
|
127
|
-
where("
|
|
127
|
+
where("name ILIKE ?", "%#{query}%")
|
|
128
128
|
}
|
|
129
129
|
```
|
|
130
130
|
|
|
@@ -140,8 +140,8 @@ Entity.search_name("paula")
|
|
|
140
140
|
|
|
141
141
|
```ruby
|
|
142
142
|
entity = Entity.create!(
|
|
143
|
-
|
|
144
|
-
|
|
143
|
+
name: "Paula Chen",
|
|
144
|
+
kind: "person",
|
|
145
145
|
metadata: {
|
|
146
146
|
department: "Engineering",
|
|
147
147
|
employee_id: "E12345"
|
|
@@ -153,15 +153,15 @@ entity = Entity.create!(
|
|
|
153
153
|
|
|
154
154
|
```ruby
|
|
155
155
|
entity.add_alias("Paula")
|
|
156
|
-
entity.add_alias("P. Chen",
|
|
157
|
-
entity.add_alias("Chen, Paula",
|
|
156
|
+
entity.add_alias("P. Chen", kind: "abbreviation")
|
|
157
|
+
entity.add_alias("Chen, Paula", kind: "formal")
|
|
158
158
|
```
|
|
159
159
|
|
|
160
160
|
### Check Aliases
|
|
161
161
|
|
|
162
162
|
```ruby
|
|
163
163
|
entity.entity_aliases.each do |a|
|
|
164
|
-
puts "#{a.
|
|
164
|
+
puts "#{a.name} (#{a.kind})"
|
|
165
165
|
end
|
|
166
166
|
```
|
|
167
167
|
|
|
@@ -169,7 +169,7 @@ end
|
|
|
169
169
|
|
|
170
170
|
```ruby
|
|
171
171
|
entity.facts.each do |fact|
|
|
172
|
-
puts "#{fact.valid_at}: #{fact.
|
|
172
|
+
puts "#{fact.valid_at}: #{fact.text}"
|
|
173
173
|
end
|
|
174
174
|
```
|
|
175
175
|
|
|
@@ -192,11 +192,11 @@ similar = Entity
|
|
|
192
192
|
# entity2 will be merged into entity1
|
|
193
193
|
entity2.update!(
|
|
194
194
|
resolution_status: 'merged',
|
|
195
|
-
|
|
195
|
+
canonical_id: entity1.id
|
|
196
196
|
)
|
|
197
197
|
|
|
198
198
|
# Copy aliases
|
|
199
199
|
entity2.entity_aliases.each do |a|
|
|
200
|
-
entity1.add_alias(a.
|
|
200
|
+
entity1.add_alias(a.name, kind: a.kind)
|
|
201
201
|
end
|
|
202
202
|
```
|
data/docs/api/models/fact.md
CHANGED
|
@@ -6,7 +6,7 @@ Stores temporal assertions about entities.
|
|
|
6
6
|
|
|
7
7
|
```ruby
|
|
8
8
|
fact = FactDb::Models::Fact.new(
|
|
9
|
-
|
|
9
|
+
text: "Paula Chen is Principal Engineer",
|
|
10
10
|
valid_at: Date.parse("2024-01-10"),
|
|
11
11
|
status: "canonical"
|
|
12
12
|
)
|
|
@@ -17,8 +17,8 @@ fact = FactDb::Models::Fact.new(
|
|
|
17
17
|
| Attribute | Type | Description |
|
|
18
18
|
|-----------|------|-------------|
|
|
19
19
|
| `id` | Integer | Primary key |
|
|
20
|
-
| `
|
|
21
|
-
| `
|
|
20
|
+
| `text` | Text | The assertion |
|
|
21
|
+
| `digest` | String | SHA256 digest for deduplication |
|
|
22
22
|
| `valid_at` | DateTime | When fact became true |
|
|
23
23
|
| `invalid_at` | DateTime | When fact stopped being true (nil if current) |
|
|
24
24
|
| `status` | String | Status (canonical, superseded, corroborated, synthesized) |
|
|
@@ -72,7 +72,7 @@ fact.add_mention(
|
|
|
72
72
|
### add_source
|
|
73
73
|
|
|
74
74
|
```ruby
|
|
75
|
-
def add_source(
|
|
75
|
+
def add_source(source:, kind: "primary", excerpt: nil, confidence: 1.0)
|
|
76
76
|
```
|
|
77
77
|
|
|
78
78
|
Add a source content link.
|
|
@@ -81,8 +81,8 @@ Add a source content link.
|
|
|
81
81
|
|
|
82
82
|
```ruby
|
|
83
83
|
fact.add_source(
|
|
84
|
-
|
|
85
|
-
|
|
84
|
+
source: email,
|
|
85
|
+
kind: "primary",
|
|
86
86
|
excerpt: "...accepted the offer..."
|
|
87
87
|
)
|
|
88
88
|
```
|
|
@@ -169,7 +169,7 @@ Fact.mentioning_entity(paula.id)
|
|
|
169
169
|
|
|
170
170
|
```ruby
|
|
171
171
|
scope :search_text, ->(query) {
|
|
172
|
-
where("
|
|
172
|
+
where("text @@ plainto_tsquery(?)", query)
|
|
173
173
|
}
|
|
174
174
|
```
|
|
175
175
|
|
|
@@ -207,7 +207,7 @@ High confidence facts only.
|
|
|
207
207
|
|
|
208
208
|
```ruby
|
|
209
209
|
fact = Fact.create!(
|
|
210
|
-
|
|
210
|
+
text: "Paula Chen joined Microsoft as Principal Engineer",
|
|
211
211
|
valid_at: Date.parse("2024-01-10"),
|
|
212
212
|
status: "canonical",
|
|
213
213
|
extraction_method: "manual",
|
|
@@ -219,7 +219,7 @@ fact.add_mention(entity: paula, text: "Paula Chen", role: "subject")
|
|
|
219
219
|
fact.add_mention(entity: microsoft, text: "Microsoft", role: "organization")
|
|
220
220
|
|
|
221
221
|
# Add source
|
|
222
|
-
fact.add_source(
|
|
222
|
+
fact.add_source(source: announcement, kind: "primary")
|
|
223
223
|
```
|
|
224
224
|
|
|
225
225
|
### Query Facts
|
|
@@ -239,7 +239,7 @@ Fact.search_text("promoted")
|
|
|
239
239
|
|
|
240
240
|
```ruby
|
|
241
241
|
new_fact = Fact.create!(
|
|
242
|
-
|
|
242
|
+
text: "Paula Chen is Senior Principal Engineer",
|
|
243
243
|
valid_at: Date.parse("2024-06-01"),
|
|
244
244
|
status: "canonical"
|
|
245
245
|
)
|
|
@@ -254,10 +254,10 @@ old_fact.update!(
|
|
|
254
254
|
### Get Sources
|
|
255
255
|
|
|
256
256
|
```ruby
|
|
257
|
-
fact.fact_sources.each do |
|
|
258
|
-
puts "Source: #{source.
|
|
259
|
-
puts "
|
|
260
|
-
puts "Excerpt: #{
|
|
257
|
+
fact.fact_sources.each do |fact_source|
|
|
258
|
+
puts "Source: #{fact_source.source.title}"
|
|
259
|
+
puts "Kind: #{fact_source.kind}"
|
|
260
|
+
puts "Excerpt: #{fact_source.excerpt}"
|
|
261
261
|
end
|
|
262
262
|
```
|
|
263
263
|
|
|
@@ -265,6 +265,6 @@ end
|
|
|
265
265
|
|
|
266
266
|
```ruby
|
|
267
267
|
fact.entity_mentions.each do |mention|
|
|
268
|
-
puts "#{mention.entity.
|
|
268
|
+
puts "#{mention.entity.name} (#{mention.mention_role})"
|
|
269
269
|
end
|
|
270
270
|
```
|
data/docs/api/models/index.md
CHANGED
|
@@ -4,7 +4,7 @@ FactDb uses ActiveRecord models for data persistence.
|
|
|
4
4
|
|
|
5
5
|
## Core Models
|
|
6
6
|
|
|
7
|
-
- [
|
|
7
|
+
- [Source](source.md) - Immutable source content
|
|
8
8
|
- [Entity](entity.md) - Resolved identities with aliases
|
|
9
9
|
- [Fact](fact.md) - Temporal assertions
|
|
10
10
|
|
|
@@ -23,8 +23,8 @@ end
|
|
|
23
23
|
| Column | Type | Description |
|
|
24
24
|
|--------|------|-------------|
|
|
25
25
|
| entity_id | bigint | Parent entity |
|
|
26
|
-
|
|
|
27
|
-
|
|
|
26
|
+
| name | string | Alternative name |
|
|
27
|
+
| kind | string | Kind (nickname, abbreviation, etc.) |
|
|
28
28
|
| confidence | float | Match confidence |
|
|
29
29
|
|
|
30
30
|
### EntityMention
|
|
@@ -53,15 +53,15 @@ Links facts to source content.
|
|
|
53
53
|
```ruby
|
|
54
54
|
class FactSource < ActiveRecord::Base
|
|
55
55
|
belongs_to :fact
|
|
56
|
-
belongs_to :
|
|
56
|
+
belongs_to :source
|
|
57
57
|
end
|
|
58
58
|
```
|
|
59
59
|
|
|
60
60
|
| Column | Type | Description |
|
|
61
61
|
|--------|------|-------------|
|
|
62
62
|
| fact_id | bigint | Parent fact |
|
|
63
|
-
|
|
|
64
|
-
|
|
|
63
|
+
| source_id | bigint | Source content |
|
|
64
|
+
| kind | string | Kind (primary, supporting, corroborating) |
|
|
65
65
|
| excerpt | text | Relevant text excerpt |
|
|
66
66
|
| confidence | float | Source confidence |
|
|
67
67
|
|
|
@@ -69,7 +69,7 @@ end
|
|
|
69
69
|
|
|
70
70
|
```mermaid
|
|
71
71
|
erDiagram
|
|
72
|
-
|
|
72
|
+
Source ||--o{ FactSource : "sourced by"
|
|
73
73
|
Entity ||--o{ EntityAlias : "has"
|
|
74
74
|
Entity ||--o{ EntityMention : "mentioned in"
|
|
75
75
|
Fact ||--o{ EntityMention : "mentions"
|
|
@@ -1,13 +1,13 @@
|
|
|
1
|
-
#
|
|
1
|
+
# Source Model
|
|
2
2
|
|
|
3
|
-
Stores immutable source
|
|
3
|
+
Stores immutable source content from which facts are extracted.
|
|
4
4
|
|
|
5
|
-
## Class: `FactDb::Models::
|
|
5
|
+
## Class: `FactDb::Models::Source`
|
|
6
6
|
|
|
7
7
|
```ruby
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
8
|
+
source = FactDb::Models::Source.new(
|
|
9
|
+
content: "Document content...",
|
|
10
|
+
kind: "email",
|
|
11
11
|
captured_at: Time.current
|
|
12
12
|
)
|
|
13
13
|
```
|
|
@@ -18,11 +18,11 @@ content = FactDb::Models::Content.new(
|
|
|
18
18
|
|-----------|------|-------------|
|
|
19
19
|
| `id` | Integer | Primary key |
|
|
20
20
|
| `content_hash` | String | SHA256 hash for deduplication |
|
|
21
|
-
| `
|
|
22
|
-
| `
|
|
21
|
+
| `kind` | String | Kind (email, document, etc.) |
|
|
22
|
+
| `content` | Text | Original unmodified text content |
|
|
23
23
|
| `title` | String | Optional title |
|
|
24
24
|
| `source_uri` | String | Original location |
|
|
25
|
-
| `
|
|
25
|
+
| `metadata` | Hash | Additional metadata (JSONB) |
|
|
26
26
|
| `embedding` | Vector | Semantic search vector |
|
|
27
27
|
| `captured_at` | DateTime | When content was captured |
|
|
28
28
|
| `created_at` | DateTime | Record creation time |
|
|
@@ -49,7 +49,7 @@ before_create :generate_embedding
|
|
|
49
49
|
def compute_hash
|
|
50
50
|
```
|
|
51
51
|
|
|
52
|
-
Computes SHA256 hash of
|
|
52
|
+
Computes SHA256 hash of content for deduplication.
|
|
53
53
|
|
|
54
54
|
### generate_embedding
|
|
55
55
|
|
|
@@ -67,30 +67,30 @@ Generates embedding vector using configured generator.
|
|
|
67
67
|
def self.find_or_create_by_text(text, **attributes)
|
|
68
68
|
```
|
|
69
69
|
|
|
70
|
-
Find existing
|
|
70
|
+
Find existing source by hash or create new.
|
|
71
71
|
|
|
72
72
|
**Example:**
|
|
73
73
|
|
|
74
74
|
```ruby
|
|
75
|
-
|
|
75
|
+
source = Source.find_or_create_by_text(
|
|
76
76
|
"Document text",
|
|
77
|
-
|
|
77
|
+
kind: "document",
|
|
78
78
|
captured_at: Time.current
|
|
79
79
|
)
|
|
80
80
|
```
|
|
81
81
|
|
|
82
82
|
## Scopes
|
|
83
83
|
|
|
84
|
-
###
|
|
84
|
+
### by_kind
|
|
85
85
|
|
|
86
86
|
```ruby
|
|
87
|
-
scope :
|
|
87
|
+
scope :by_kind, ->(kind) { where(kind: kind) }
|
|
88
88
|
```
|
|
89
89
|
|
|
90
|
-
Filter by content
|
|
90
|
+
Filter by content kind.
|
|
91
91
|
|
|
92
92
|
```ruby
|
|
93
|
-
|
|
93
|
+
Source.by_kind('email')
|
|
94
94
|
```
|
|
95
95
|
|
|
96
96
|
### captured_between
|
|
@@ -104,35 +104,35 @@ scope :captured_between, ->(from, to) {
|
|
|
104
104
|
Filter by capture date range.
|
|
105
105
|
|
|
106
106
|
```ruby
|
|
107
|
-
|
|
107
|
+
Source.captured_between(1.week.ago, Time.current)
|
|
108
108
|
```
|
|
109
109
|
|
|
110
110
|
### search_text
|
|
111
111
|
|
|
112
112
|
```ruby
|
|
113
113
|
scope :search_text, ->(query) {
|
|
114
|
-
where("
|
|
114
|
+
where("content @@ plainto_tsquery(?)", query)
|
|
115
115
|
}
|
|
116
116
|
```
|
|
117
117
|
|
|
118
118
|
Full-text search.
|
|
119
119
|
|
|
120
120
|
```ruby
|
|
121
|
-
|
|
121
|
+
Source.search_text("quarterly earnings")
|
|
122
122
|
```
|
|
123
123
|
|
|
124
124
|
## Usage Examples
|
|
125
125
|
|
|
126
|
-
### Create
|
|
126
|
+
### Create Source
|
|
127
127
|
|
|
128
128
|
```ruby
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
129
|
+
source = Source.create!(
|
|
130
|
+
content: "Important document...",
|
|
131
|
+
kind: "document",
|
|
132
132
|
title: "Q4 Report",
|
|
133
133
|
source_uri: "https://example.com/report.pdf",
|
|
134
134
|
captured_at: Time.current,
|
|
135
|
-
|
|
135
|
+
metadata: {
|
|
136
136
|
author: "Jane Smith",
|
|
137
137
|
department: "Finance"
|
|
138
138
|
}
|
|
@@ -143,14 +143,14 @@ content = Content.create!(
|
|
|
143
143
|
|
|
144
144
|
```ruby
|
|
145
145
|
hash = Digest::SHA256.hexdigest("Document text")
|
|
146
|
-
|
|
146
|
+
source = Source.find_by(content_hash: hash)
|
|
147
147
|
```
|
|
148
148
|
|
|
149
149
|
### Get Related Facts
|
|
150
150
|
|
|
151
151
|
```ruby
|
|
152
|
-
|
|
153
|
-
puts fact.
|
|
152
|
+
source.facts.each do |fact|
|
|
153
|
+
puts fact.text
|
|
154
154
|
end
|
|
155
155
|
```
|
|
156
156
|
|
|
@@ -158,7 +158,7 @@ end
|
|
|
158
158
|
|
|
159
159
|
```ruby
|
|
160
160
|
# Requires embedding
|
|
161
|
-
similar =
|
|
161
|
+
similar = Source
|
|
162
162
|
.where.not(embedding: nil)
|
|
163
163
|
.order(Arel.sql("embedding <=> '#{query_embedding}'"))
|
|
164
164
|
.limit(10)
|
|
@@ -13,14 +13,14 @@ pipeline = FactDb::Pipeline::ExtractionPipeline.new(config)
|
|
|
13
13
|
### process
|
|
14
14
|
|
|
15
15
|
```ruby
|
|
16
|
-
def process(
|
|
16
|
+
def process(sources, extractor: config.default_extractor)
|
|
17
17
|
```
|
|
18
18
|
|
|
19
|
-
Process
|
|
19
|
+
Process source items sequentially.
|
|
20
20
|
|
|
21
21
|
**Parameters:**
|
|
22
22
|
|
|
23
|
-
- `
|
|
23
|
+
- `sources` (Array<Source>) - Source records
|
|
24
24
|
- `extractor` (Symbol) - Extraction method
|
|
25
25
|
|
|
26
26
|
**Returns:** `Array<Hash>`
|
|
@@ -28,8 +28,8 @@ Process content items sequentially.
|
|
|
28
28
|
**Example:**
|
|
29
29
|
|
|
30
30
|
```ruby
|
|
31
|
-
|
|
32
|
-
results = pipeline.process(
|
|
31
|
+
sources = Models::Source.where(id: [1, 2, 3])
|
|
32
|
+
results = pipeline.process(sources, extractor: :llm)
|
|
33
33
|
```
|
|
34
34
|
|
|
35
35
|
---
|
|
@@ -37,14 +37,14 @@ results = pipeline.process(contents, extractor: :llm)
|
|
|
37
37
|
### process_parallel
|
|
38
38
|
|
|
39
39
|
```ruby
|
|
40
|
-
def process_parallel(
|
|
40
|
+
def process_parallel(sources, extractor: config.default_extractor)
|
|
41
41
|
```
|
|
42
42
|
|
|
43
|
-
Process
|
|
43
|
+
Process source items concurrently.
|
|
44
44
|
|
|
45
45
|
**Parameters:**
|
|
46
46
|
|
|
47
|
-
- `
|
|
47
|
+
- `sources` (Array<Source>) - Source records
|
|
48
48
|
- `extractor` (Symbol) - Extraction method
|
|
49
49
|
|
|
50
50
|
**Returns:** `Array<Hash>`
|
|
@@ -52,10 +52,10 @@ Process content items concurrently.
|
|
|
52
52
|
**Example:**
|
|
53
53
|
|
|
54
54
|
```ruby
|
|
55
|
-
results = pipeline.process_parallel(
|
|
55
|
+
results = pipeline.process_parallel(sources, extractor: :llm)
|
|
56
56
|
|
|
57
57
|
results.each do |result|
|
|
58
|
-
puts "
|
|
58
|
+
puts "Source #{result[:source_id]}:"
|
|
59
59
|
puts " Facts: #{result[:facts].count}"
|
|
60
60
|
puts " Error: #{result[:error]}" if result[:error]
|
|
61
61
|
end
|
|
@@ -67,7 +67,7 @@ end
|
|
|
67
67
|
|
|
68
68
|
```mermaid
|
|
69
69
|
graph LR
|
|
70
|
-
A[
|
|
70
|
+
A[Source] --> B[Validate]
|
|
71
71
|
B --> C[Extract]
|
|
72
72
|
C --> D[Validate Facts]
|
|
73
73
|
D --> E[Results]
|
|
@@ -79,7 +79,7 @@ graph LR
|
|
|
79
79
|
style E fill:#B91C1C,stroke:#991B1B,color:#FFFFFF
|
|
80
80
|
```
|
|
81
81
|
|
|
82
|
-
1. **Validate** - Check
|
|
82
|
+
1. **Validate** - Check source is not empty
|
|
83
83
|
2. **Extract** - Run extractor
|
|
84
84
|
3. **Validate Facts** - Filter valid facts
|
|
85
85
|
4. **Results** - Return extracted facts
|
|
@@ -89,9 +89,9 @@ graph LR
|
|
|
89
89
|
```mermaid
|
|
90
90
|
graph TB
|
|
91
91
|
subgraph Parallel
|
|
92
|
-
A1[
|
|
93
|
-
A2[
|
|
94
|
-
A3[
|
|
92
|
+
A1[Source 1] --> E1[Extract 1]
|
|
93
|
+
A2[Source 2] --> E2[Extract 2]
|
|
94
|
+
A3[Source 3] --> E3[Extract 3]
|
|
95
95
|
end
|
|
96
96
|
E1 --> Aggregate
|
|
97
97
|
E2 --> Aggregate
|
|
@@ -110,22 +110,22 @@ graph TB
|
|
|
110
110
|
|
|
111
111
|
```ruby
|
|
112
112
|
{
|
|
113
|
-
|
|
113
|
+
source_id: 123,
|
|
114
114
|
facts: [<Fact>, <Fact>, ...], # Extracted facts
|
|
115
115
|
error: nil # Error message if failed
|
|
116
116
|
}
|
|
117
117
|
```
|
|
118
118
|
|
|
119
|
-
## Usage via
|
|
119
|
+
## Usage via FactDb
|
|
120
120
|
|
|
121
121
|
```ruby
|
|
122
122
|
facts = FactDb.new
|
|
123
123
|
|
|
124
124
|
# Sequential
|
|
125
|
-
results = facts.batch_extract(
|
|
125
|
+
results = facts.batch_extract(source_ids, parallel: false)
|
|
126
126
|
|
|
127
127
|
# Parallel (default)
|
|
128
|
-
results = facts.batch_extract(
|
|
128
|
+
results = facts.batch_extract(source_ids, parallel: true)
|
|
129
129
|
```
|
|
130
130
|
|
|
131
131
|
## Error Handling
|
|
@@ -133,13 +133,13 @@ results = facts.batch_extract(content_ids, parallel: true)
|
|
|
133
133
|
The pipeline catches errors per-item:
|
|
134
134
|
|
|
135
135
|
```ruby
|
|
136
|
-
results = pipeline.process_parallel(
|
|
136
|
+
results = pipeline.process_parallel(sources)
|
|
137
137
|
|
|
138
138
|
results.each do |result|
|
|
139
139
|
if result[:error]
|
|
140
|
-
logger.error "
|
|
140
|
+
logger.error "Source #{result[:source_id]}: #{result[:error]}"
|
|
141
141
|
else
|
|
142
|
-
logger.info "
|
|
142
|
+
logger.info "Source #{result[:source_id]}: #{result[:facts].count} facts"
|
|
143
143
|
end
|
|
144
144
|
end
|
|
145
145
|
```
|
|
@@ -151,12 +151,12 @@ end
|
|
|
151
151
|
Optimal batch size depends on:
|
|
152
152
|
|
|
153
153
|
- Extractor type (LLM has rate limits)
|
|
154
|
-
-
|
|
154
|
+
- Source length
|
|
155
155
|
- System resources
|
|
156
156
|
|
|
157
157
|
```ruby
|
|
158
158
|
# Process in optimal batches
|
|
159
|
-
|
|
159
|
+
sources.each_slice(25) do |batch|
|
|
160
160
|
results = pipeline.process_parallel(batch)
|
|
161
161
|
process_results(results)
|
|
162
162
|
end
|
|
@@ -167,7 +167,7 @@ end
|
|
|
167
167
|
For large batches, process and discard:
|
|
168
168
|
|
|
169
169
|
```ruby
|
|
170
|
-
|
|
170
|
+
sources.each_slice(50) do |batch|
|
|
171
171
|
results = pipeline.process_parallel(batch)
|
|
172
172
|
save_facts(results.flat_map { |r| r[:facts] })
|
|
173
173
|
# Results discarded after each batch
|
data/docs/api/pipeline/index.md
CHANGED
|
@@ -33,7 +33,7 @@ results = pipeline.resolve_entities(names)
|
|
|
33
33
|
|
|
34
34
|
results.each do |result|
|
|
35
35
|
puts "#{result[:name]}: #{result[:status]}"
|
|
36
|
-
puts " Entity: #{result[:entity]&.
|
|
36
|
+
puts " Entity: #{result[:entity]&.name}"
|
|
37
37
|
end
|
|
38
38
|
```
|
|
39
39
|
|
|
@@ -61,7 +61,7 @@ results = pipeline.detect_conflicts([paula.id, john.id])
|
|
|
61
61
|
results.each do |result|
|
|
62
62
|
puts "Entity #{result[:entity_id]}: #{result[:conflict_count]} conflicts"
|
|
63
63
|
result[:conflicts].each do |c|
|
|
64
|
-
puts " - #{c[:fact1].
|
|
64
|
+
puts " - #{c[:fact1].text} vs #{c[:fact2].text}"
|
|
65
65
|
end
|
|
66
66
|
end
|
|
67
67
|
```
|
|
@@ -153,7 +153,7 @@ results = facts.detect_fact_conflicts([entity1.id, entity2.id])
|
|
|
153
153
|
|
|
154
154
|
The pipeline uses the EntityResolver which tries:
|
|
155
155
|
|
|
156
|
-
1. **Exact match** on
|
|
156
|
+
1. **Exact match** on name
|
|
157
157
|
2. **Alias match** on registered aliases
|
|
158
158
|
3. **Fuzzy match** using Levenshtein distance
|
|
159
159
|
|
|
@@ -202,7 +202,7 @@ end
|
|
|
202
202
|
|
|
203
203
|
```ruby
|
|
204
204
|
# Load entities into memory first
|
|
205
|
-
Entity.where(
|
|
205
|
+
Entity.where(kind: 'person').to_a
|
|
206
206
|
|
|
207
207
|
# Then resolve
|
|
208
208
|
results = pipeline.resolve_entities(person_names, type: :person)
|