fact_db 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.envrc +2 -0
- data/.yardopts +5 -0
- data/CHANGELOG.md +64 -0
- data/README.md +107 -6
- data/Rakefile +243 -10
- data/db/migrate/001_enable_extensions.rb +1 -0
- data/db/migrate/002_create_sources.rb +49 -0
- data/db/migrate/003_create_entities.rb +27 -15
- data/db/migrate/004_create_entity_aliases.rb +20 -7
- data/db/migrate/005_create_facts.rb +37 -21
- data/db/migrate/006_create_entity_mentions.rb +14 -6
- data/db/migrate/007_create_fact_sources.rb +16 -8
- data/docs/api/extractors/index.md +5 -5
- data/docs/api/extractors/llm.md +17 -17
- data/docs/api/extractors/rule-based.md +14 -14
- data/docs/api/facts.md +20 -20
- data/docs/api/index.md +4 -4
- data/docs/api/models/entity.md +21 -21
- data/docs/api/models/fact.md +15 -15
- data/docs/api/models/index.md +7 -7
- data/docs/api/models/{content.md → source.md} +29 -29
- data/docs/api/pipeline/extraction.md +25 -25
- data/docs/api/pipeline/index.md +1 -1
- data/docs/api/pipeline/resolution.md +4 -4
- data/docs/api/services/entity-service.md +20 -20
- data/docs/api/services/fact-service.md +12 -12
- data/docs/api/services/index.md +5 -5
- data/docs/api/services/{content-service.md → source-service.md} +27 -27
- data/docs/architecture/database-schema.md +46 -46
- data/docs/architecture/entity-resolution.md +6 -6
- data/docs/architecture/index.md +10 -10
- data/docs/architecture/temporal-facts.md +5 -5
- data/docs/architecture/three-layer-model.md +17 -17
- data/docs/concepts.md +6 -6
- data/docs/examples/basic-usage.md +20 -20
- data/docs/examples/hr-onboarding.md +17 -17
- data/docs/examples/index.md +4 -4
- data/docs/examples/news-analysis.md +23 -23
- data/docs/getting-started/database-setup.md +28 -20
- data/docs/getting-started/index.md +3 -3
- data/docs/getting-started/quick-start.md +33 -30
- data/docs/guides/batch-processing.md +26 -26
- data/docs/guides/configuration.md +158 -77
- data/docs/guides/entity-management.md +14 -14
- data/docs/guides/extracting-facts.md +28 -28
- data/docs/guides/ingesting-content.md +14 -14
- data/docs/guides/llm-integration.md +40 -32
- data/docs/guides/temporal-queries.md +11 -11
- data/docs/index.md +6 -2
- data/examples/.envrc +4 -0
- data/examples/.gitignore +1 -0
- data/examples/001_configuration.rb +312 -0
- data/examples/{basic_usage.rb → 010_basic_usage.rb} +47 -56
- data/examples/{entity_management.rb → 020_entity_management.rb} +57 -72
- data/examples/{temporal_queries.rb → 030_temporal_queries.rb} +39 -59
- data/examples/040_output_formats.rb +177 -0
- data/examples/{rule_based_extraction.rb → 050_rule_based_extraction.rb} +39 -45
- data/examples/060_fluent_temporal_api.rb +217 -0
- data/examples/070_introspection.rb +252 -0
- data/examples/{hr_system.rb → 080_hr_system.rb} +56 -75
- data/examples/090_ingest_demo.rb +515 -0
- data/examples/100_query_context.rb +668 -0
- data/examples/110_prove_it.rb +204 -0
- data/examples/120_dump_database.rb +358 -0
- data/examples/130_rag_feedback_loop.rb +858 -0
- data/examples/README.md +229 -15
- data/examples/data/lincoln_associates.md +201 -0
- data/examples/data/lincoln_biography.md +66 -0
- data/examples/data/lincoln_cabinet.md +243 -0
- data/examples/data/lincoln_family.md +163 -0
- data/examples/data/lincoln_military.md +241 -0
- data/examples/data/lincoln_todd_family.md +136 -0
- data/examples/ingest_reporter.rb +335 -0
- data/examples/utilities.rb +182 -0
- data/lib/fact_db/config/defaults.yml +254 -0
- data/lib/fact_db/config.rb +94 -35
- data/lib/fact_db/database.rb +98 -8
- data/lib/fact_db/extractors/base.rb +106 -21
- data/lib/fact_db/extractors/llm_extractor.rb +35 -63
- data/lib/fact_db/extractors/manual_extractor.rb +46 -6
- data/lib/fact_db/extractors/rule_based_extractor.rb +136 -25
- data/lib/fact_db/llm/adapter.rb +3 -3
- data/lib/fact_db/models/entity.rb +94 -22
- data/lib/fact_db/models/entity_alias.rb +41 -7
- data/lib/fact_db/models/entity_mention.rb +34 -1
- data/lib/fact_db/models/fact.rb +259 -28
- data/lib/fact_db/models/fact_source.rb +43 -9
- data/lib/fact_db/models/source.rb +113 -0
- data/lib/fact_db/pipeline/extraction_pipeline.rb +35 -35
- data/lib/fact_db/pipeline/resolution_pipeline.rb +5 -5
- data/lib/fact_db/query_result.rb +202 -0
- data/lib/fact_db/resolution/entity_resolver.rb +139 -39
- data/lib/fact_db/resolution/fact_resolver.rb +86 -14
- data/lib/fact_db/services/entity_service.rb +246 -37
- data/lib/fact_db/services/fact_service.rb +254 -17
- data/lib/fact_db/services/source_service.rb +164 -0
- data/lib/fact_db/temporal/query.rb +71 -7
- data/lib/fact_db/temporal/query_builder.rb +69 -0
- data/lib/fact_db/temporal/timeline.rb +102 -11
- data/lib/fact_db/transformers/base.rb +77 -0
- data/lib/fact_db/transformers/cypher_transformer.rb +185 -0
- data/lib/fact_db/transformers/json_transformer.rb +17 -0
- data/lib/fact_db/transformers/raw_transformer.rb +35 -0
- data/lib/fact_db/transformers/text_transformer.rb +114 -0
- data/lib/fact_db/transformers/triple_transformer.rb +138 -0
- data/lib/fact_db/validation/alias_filter.rb +185 -0
- data/lib/fact_db/version.rb +1 -1
- data/lib/fact_db.rb +281 -30
- data/mkdocs.yml +2 -2
- metadata +60 -16
- data/db/migrate/002_create_contents.rb +0 -44
- data/lib/fact_db/models/content.rb +0 -62
- data/lib/fact_db/services/content_service.rb +0 -93
|
@@ -13,15 +13,15 @@ service = FactDb::Services::EntityService.new(config)
|
|
|
13
13
|
### create
|
|
14
14
|
|
|
15
15
|
```ruby
|
|
16
|
-
def create(
|
|
16
|
+
def create(name, kind:, aliases: [], metadata: {})
|
|
17
17
|
```
|
|
18
18
|
|
|
19
19
|
Create a new entity.
|
|
20
20
|
|
|
21
21
|
**Parameters:**
|
|
22
22
|
|
|
23
|
-
- `
|
|
24
|
-
- `
|
|
23
|
+
- `name` (String) - Authoritative name
|
|
24
|
+
- `kind` (Symbol) - Entity kind
|
|
25
25
|
- `aliases` (Array) - Alternative names
|
|
26
26
|
- `metadata` (Hash) - Additional attributes
|
|
27
27
|
|
|
@@ -32,7 +32,7 @@ Create a new entity.
|
|
|
32
32
|
```ruby
|
|
33
33
|
entity = service.create(
|
|
34
34
|
"Paula Chen",
|
|
35
|
-
|
|
35
|
+
kind: :person,
|
|
36
36
|
aliases: ["Paula", "P. Chen"],
|
|
37
37
|
metadata: { department: "Engineering" }
|
|
38
38
|
)
|
|
@@ -55,7 +55,7 @@ Find entity by ID.
|
|
|
55
55
|
### resolve
|
|
56
56
|
|
|
57
57
|
```ruby
|
|
58
|
-
def resolve(name,
|
|
58
|
+
def resolve(name, kind: nil)
|
|
59
59
|
```
|
|
60
60
|
|
|
61
61
|
Resolve a name to an entity using multiple strategies.
|
|
@@ -63,14 +63,14 @@ Resolve a name to an entity using multiple strategies.
|
|
|
63
63
|
**Parameters:**
|
|
64
64
|
|
|
65
65
|
- `name` (String) - Name to resolve
|
|
66
|
-
- `
|
|
66
|
+
- `kind` (Symbol) - Optional kind filter
|
|
67
67
|
|
|
68
68
|
**Returns:** `Models::Entity` or `nil`
|
|
69
69
|
|
|
70
70
|
**Example:**
|
|
71
71
|
|
|
72
72
|
```ruby
|
|
73
|
-
entity = service.resolve("Paula Chen",
|
|
73
|
+
entity = service.resolve("Paula Chen", kind: :person)
|
|
74
74
|
```
|
|
75
75
|
|
|
76
76
|
---
|
|
@@ -78,7 +78,7 @@ entity = service.resolve("Paula Chen", type: :person)
|
|
|
78
78
|
### add_alias
|
|
79
79
|
|
|
80
80
|
```ruby
|
|
81
|
-
def add_alias(entity_id,
|
|
81
|
+
def add_alias(entity_id, alias_name, kind: nil, confidence: 1.0)
|
|
82
82
|
```
|
|
83
83
|
|
|
84
84
|
Add an alias to an entity.
|
|
@@ -86,7 +86,7 @@ Add an alias to an entity.
|
|
|
86
86
|
**Example:**
|
|
87
87
|
|
|
88
88
|
```ruby
|
|
89
|
-
service.add_alias(entity.id, "P. Chen",
|
|
89
|
+
service.add_alias(entity.id, "P. Chen", kind: :abbreviation)
|
|
90
90
|
```
|
|
91
91
|
|
|
92
92
|
---
|
|
@@ -94,7 +94,7 @@ service.add_alias(entity.id, "P. Chen", type: :abbreviation)
|
|
|
94
94
|
### remove_alias
|
|
95
95
|
|
|
96
96
|
```ruby
|
|
97
|
-
def remove_alias(entity_id,
|
|
97
|
+
def remove_alias(entity_id, alias_name)
|
|
98
98
|
```
|
|
99
99
|
|
|
100
100
|
Remove an alias from an entity.
|
|
@@ -130,7 +130,7 @@ Update entity attributes.
|
|
|
130
130
|
```ruby
|
|
131
131
|
service.update(
|
|
132
132
|
entity.id,
|
|
133
|
-
|
|
133
|
+
name: "Paula M. Chen",
|
|
134
134
|
metadata: { title: "Senior Engineer" }
|
|
135
135
|
)
|
|
136
136
|
```
|
|
@@ -140,7 +140,7 @@ service.update(
|
|
|
140
140
|
### search
|
|
141
141
|
|
|
142
142
|
```ruby
|
|
143
|
-
def search(query,
|
|
143
|
+
def search(query, kind: nil, limit: 20)
|
|
144
144
|
```
|
|
145
145
|
|
|
146
146
|
Search entities by name.
|
|
@@ -148,32 +148,32 @@ Search entities by name.
|
|
|
148
148
|
**Parameters:**
|
|
149
149
|
|
|
150
150
|
- `query` (String) - Search query
|
|
151
|
-
- `
|
|
151
|
+
- `kind` (Symbol) - Optional kind filter
|
|
152
152
|
- `limit` (Integer) - Max results
|
|
153
153
|
|
|
154
154
|
**Returns:** `Array<Models::Entity>`
|
|
155
155
|
|
|
156
156
|
---
|
|
157
157
|
|
|
158
|
-
###
|
|
158
|
+
### by_kind
|
|
159
159
|
|
|
160
160
|
```ruby
|
|
161
|
-
def
|
|
161
|
+
def by_kind(kind)
|
|
162
162
|
```
|
|
163
163
|
|
|
164
|
-
Filter entities by
|
|
164
|
+
Filter entities by kind.
|
|
165
165
|
|
|
166
166
|
**Returns:** `ActiveRecord::Relation`
|
|
167
167
|
|
|
168
168
|
---
|
|
169
169
|
|
|
170
|
-
###
|
|
170
|
+
### in_source
|
|
171
171
|
|
|
172
172
|
```ruby
|
|
173
|
-
def
|
|
173
|
+
def in_source(source_id)
|
|
174
174
|
```
|
|
175
175
|
|
|
176
|
-
Find entities mentioned in a
|
|
176
|
+
Find entities mentioned in a source.
|
|
177
177
|
|
|
178
178
|
**Returns:** `Array<Models::Entity>`
|
|
179
179
|
|
|
@@ -194,7 +194,7 @@ Find entities that appear in facts with the given entity.
|
|
|
194
194
|
### semantic_search
|
|
195
195
|
|
|
196
196
|
```ruby
|
|
197
|
-
def semantic_search(query,
|
|
197
|
+
def semantic_search(query, kind: nil, limit: 10)
|
|
198
198
|
```
|
|
199
199
|
|
|
200
200
|
Semantic similarity search using embeddings.
|
|
@@ -19,14 +19,14 @@ service = FactDb::Services::FactService.new(config)
|
|
|
19
19
|
### create
|
|
20
20
|
|
|
21
21
|
```ruby
|
|
22
|
-
def create(
|
|
22
|
+
def create(text, valid_at:, invalid_at: nil, mentions: [], sources: [], confidence: 1.0, metadata: {})
|
|
23
23
|
```
|
|
24
24
|
|
|
25
25
|
Create a new fact.
|
|
26
26
|
|
|
27
27
|
**Parameters:**
|
|
28
28
|
|
|
29
|
-
- `
|
|
29
|
+
- `text` (String) - The assertion
|
|
30
30
|
- `valid_at` (Date/Time) - When fact became true
|
|
31
31
|
- `invalid_at` (Date/Time) - When fact stopped (optional)
|
|
32
32
|
- `mentions` (Array) - Entity mentions
|
|
@@ -46,7 +46,7 @@ fact = service.create(
|
|
|
46
46
|
{ entity: paula, role: "subject", text: "Paula Chen" }
|
|
47
47
|
],
|
|
48
48
|
sources: [
|
|
49
|
-
{
|
|
49
|
+
{ source: email, type: "primary" }
|
|
50
50
|
]
|
|
51
51
|
)
|
|
52
52
|
```
|
|
@@ -65,17 +65,17 @@ Find fact by ID.
|
|
|
65
65
|
|
|
66
66
|
---
|
|
67
67
|
|
|
68
|
-
###
|
|
68
|
+
### extract_from_source
|
|
69
69
|
|
|
70
70
|
```ruby
|
|
71
|
-
def
|
|
71
|
+
def extract_from_source(source_id, extractor: config.default_extractor)
|
|
72
72
|
```
|
|
73
73
|
|
|
74
|
-
Extract facts from
|
|
74
|
+
Extract facts from a source using the specified extractor.
|
|
75
75
|
|
|
76
76
|
**Parameters:**
|
|
77
77
|
|
|
78
|
-
- `
|
|
78
|
+
- `source_id` (Integer) - Source ID
|
|
79
79
|
- `extractor` (Symbol) - Extractor type (:manual, :llm, :rule_based)
|
|
80
80
|
|
|
81
81
|
**Returns:** `Array<Models::Fact>`
|
|
@@ -83,7 +83,7 @@ Extract facts from content using specified extractor.
|
|
|
83
83
|
**Example:**
|
|
84
84
|
|
|
85
85
|
```ruby
|
|
86
|
-
facts = service.
|
|
86
|
+
facts = service.extract_from_source(source.id, extractor: :llm)
|
|
87
87
|
```
|
|
88
88
|
|
|
89
89
|
---
|
|
@@ -142,19 +142,19 @@ Build a timeline for an entity.
|
|
|
142
142
|
```ruby
|
|
143
143
|
timeline = service.timeline(entity_id: paula.id)
|
|
144
144
|
timeline.each do |fact|
|
|
145
|
-
puts "#{fact.valid_at}: #{fact.
|
|
145
|
+
puts "#{fact.valid_at}: #{fact.text}"
|
|
146
146
|
end
|
|
147
147
|
```
|
|
148
148
|
|
|
149
149
|
---
|
|
150
150
|
|
|
151
|
-
###
|
|
151
|
+
### from_source
|
|
152
152
|
|
|
153
153
|
```ruby
|
|
154
|
-
def
|
|
154
|
+
def from_source(source_id)
|
|
155
155
|
```
|
|
156
156
|
|
|
157
|
-
Get facts sourced from specific
|
|
157
|
+
Get facts sourced from a specific source.
|
|
158
158
|
|
|
159
159
|
**Returns:** `Array<Models::Fact>`
|
|
160
160
|
|
data/docs/api/services/index.md
CHANGED
|
@@ -4,7 +4,7 @@ Services provide the business logic layer for FactDb operations.
|
|
|
4
4
|
|
|
5
5
|
## Available Services
|
|
6
6
|
|
|
7
|
-
- [
|
|
7
|
+
- [SourceService](source-service.md) - Ingest and manage source content
|
|
8
8
|
- [EntityService](entity-service.md) - Create and resolve entities
|
|
9
9
|
- [FactService](fact-service.md) - Extract and query facts
|
|
10
10
|
|
|
@@ -26,12 +26,12 @@ end
|
|
|
26
26
|
|
|
27
27
|
## Accessing Services
|
|
28
28
|
|
|
29
|
-
### Via
|
|
29
|
+
### Via FactDb
|
|
30
30
|
|
|
31
31
|
```ruby
|
|
32
32
|
facts = FactDb.new
|
|
33
33
|
|
|
34
|
-
facts.
|
|
34
|
+
facts.source_service.create(text, kind: :document)
|
|
35
35
|
facts.entity_service.create("Paula", kind: :person)
|
|
36
36
|
facts.fact_service.create("Fact text", valid_at: Date.today)
|
|
37
37
|
```
|
|
@@ -39,8 +39,8 @@ facts.fact_service.create("Fact text", valid_at: Date.today)
|
|
|
39
39
|
### Directly
|
|
40
40
|
|
|
41
41
|
```ruby
|
|
42
|
-
service = FactDb::Services::
|
|
43
|
-
|
|
42
|
+
service = FactDb::Services::SourceService.new(config)
|
|
43
|
+
source = service.create(text, kind: :document)
|
|
44
44
|
```
|
|
45
45
|
|
|
46
46
|
## Common Methods
|
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
#
|
|
1
|
+
# SourceService
|
|
2
2
|
|
|
3
3
|
Service for ingesting and managing source content.
|
|
4
4
|
|
|
5
|
-
## Class: `FactDb::Services::
|
|
5
|
+
## Class: `FactDb::Services::SourceService`
|
|
6
6
|
|
|
7
7
|
```ruby
|
|
8
|
-
service = FactDb::Services::
|
|
8
|
+
service = FactDb::Services::SourceService.new(config)
|
|
9
9
|
```
|
|
10
10
|
|
|
11
11
|
## Methods
|
|
@@ -13,28 +13,28 @@ service = FactDb::Services::ContentService.new(config)
|
|
|
13
13
|
### create
|
|
14
14
|
|
|
15
15
|
```ruby
|
|
16
|
-
def create(
|
|
16
|
+
def create(content, kind:, captured_at: Time.current, metadata: {}, title: nil, source_uri: nil)
|
|
17
17
|
```
|
|
18
18
|
|
|
19
|
-
Create new
|
|
19
|
+
Create new source with automatic deduplication.
|
|
20
20
|
|
|
21
21
|
**Parameters:**
|
|
22
22
|
|
|
23
|
-
- `
|
|
24
|
-
- `
|
|
23
|
+
- `content` (String) - Source text content
|
|
24
|
+
- `kind` (Symbol) - Content kind
|
|
25
25
|
- `captured_at` (Time) - Capture timestamp
|
|
26
26
|
- `metadata` (Hash) - Additional metadata
|
|
27
27
|
- `title` (String) - Optional title
|
|
28
28
|
- `source_uri` (String) - Original location
|
|
29
29
|
|
|
30
|
-
**Returns:** `Models::
|
|
30
|
+
**Returns:** `Models::Source`
|
|
31
31
|
|
|
32
32
|
**Example:**
|
|
33
33
|
|
|
34
34
|
```ruby
|
|
35
|
-
|
|
35
|
+
source = service.create(
|
|
36
36
|
"Email body text...",
|
|
37
|
-
|
|
37
|
+
kind: :email,
|
|
38
38
|
title: "RE: Important",
|
|
39
39
|
metadata: { from: "sender@example.com" }
|
|
40
40
|
)
|
|
@@ -48,9 +48,9 @@ content = service.create(
|
|
|
48
48
|
def find(id)
|
|
49
49
|
```
|
|
50
50
|
|
|
51
|
-
Find
|
|
51
|
+
Find source by ID.
|
|
52
52
|
|
|
53
|
-
**Returns:** `Models::
|
|
53
|
+
**Returns:** `Models::Source`
|
|
54
54
|
|
|
55
55
|
---
|
|
56
56
|
|
|
@@ -60,15 +60,15 @@ Find content by ID.
|
|
|
60
60
|
def find_by_hash(hash)
|
|
61
61
|
```
|
|
62
62
|
|
|
63
|
-
Find
|
|
63
|
+
Find source by SHA256 hash.
|
|
64
64
|
|
|
65
|
-
**Returns:** `Models::
|
|
65
|
+
**Returns:** `Models::Source` or `nil`
|
|
66
66
|
|
|
67
67
|
**Example:**
|
|
68
68
|
|
|
69
69
|
```ruby
|
|
70
70
|
hash = Digest::SHA256.hexdigest(text)
|
|
71
|
-
|
|
71
|
+
source = service.find_by_hash(hash)
|
|
72
72
|
```
|
|
73
73
|
|
|
74
74
|
---
|
|
@@ -79,14 +79,14 @@ content = service.find_by_hash(hash)
|
|
|
79
79
|
def search(query, limit: 20)
|
|
80
80
|
```
|
|
81
81
|
|
|
82
|
-
Full-text search
|
|
82
|
+
Full-text search sources.
|
|
83
83
|
|
|
84
84
|
**Parameters:**
|
|
85
85
|
|
|
86
86
|
- `query` (String) - Search query
|
|
87
87
|
- `limit` (Integer) - Max results
|
|
88
88
|
|
|
89
|
-
**Returns:** `Array<Models::
|
|
89
|
+
**Returns:** `Array<Models::Source>`
|
|
90
90
|
|
|
91
91
|
**Example:**
|
|
92
92
|
|
|
@@ -109,7 +109,7 @@ Semantic similarity search using embeddings.
|
|
|
109
109
|
- `query` (String) - Search query
|
|
110
110
|
- `limit` (Integer) - Max results
|
|
111
111
|
|
|
112
|
-
**Returns:** `Array<Models::
|
|
112
|
+
**Returns:** `Array<Models::Source>`
|
|
113
113
|
|
|
114
114
|
**Example:**
|
|
115
115
|
|
|
@@ -119,20 +119,20 @@ results = service.semantic_search("financial performance")
|
|
|
119
119
|
|
|
120
120
|
---
|
|
121
121
|
|
|
122
|
-
###
|
|
122
|
+
### by_kind
|
|
123
123
|
|
|
124
124
|
```ruby
|
|
125
|
-
def
|
|
125
|
+
def by_kind(kind)
|
|
126
126
|
```
|
|
127
127
|
|
|
128
|
-
Filter
|
|
128
|
+
Filter sources by kind.
|
|
129
129
|
|
|
130
130
|
**Returns:** `ActiveRecord::Relation`
|
|
131
131
|
|
|
132
132
|
**Example:**
|
|
133
133
|
|
|
134
134
|
```ruby
|
|
135
|
-
emails = service.
|
|
135
|
+
emails = service.by_kind(:email)
|
|
136
136
|
```
|
|
137
137
|
|
|
138
138
|
---
|
|
@@ -143,9 +143,9 @@ emails = service.by_type(:email)
|
|
|
143
143
|
def recent(limit: 20)
|
|
144
144
|
```
|
|
145
145
|
|
|
146
|
-
Get recently captured
|
|
146
|
+
Get recently captured sources.
|
|
147
147
|
|
|
148
|
-
**Returns:** `Array<Models::
|
|
148
|
+
**Returns:** `Array<Models::Source>`
|
|
149
149
|
|
|
150
150
|
---
|
|
151
151
|
|
|
@@ -155,12 +155,12 @@ Get recently captured content.
|
|
|
155
155
|
def mentioning_entity(entity_id)
|
|
156
156
|
```
|
|
157
157
|
|
|
158
|
-
Find
|
|
158
|
+
Find sources that mention an entity (via facts).
|
|
159
159
|
|
|
160
|
-
**Returns:** `Array<Models::
|
|
160
|
+
**Returns:** `Array<Models::Source>`
|
|
161
161
|
|
|
162
162
|
**Example:**
|
|
163
163
|
|
|
164
164
|
```ruby
|
|
165
|
-
|
|
165
|
+
paula_sources = service.mentioning_entity(paula.id)
|
|
166
166
|
```
|
|
@@ -6,21 +6,21 @@ FactDb uses PostgreSQL with the pgvector extension for semantic search capabilit
|
|
|
6
6
|
|
|
7
7
|
```mermaid
|
|
8
8
|
erDiagram
|
|
9
|
-
|
|
9
|
+
sources ||--o{ fact_sources : "sourced by"
|
|
10
10
|
entities ||--o{ entity_aliases : "has"
|
|
11
11
|
entities ||--o{ entity_mentions : "mentioned in"
|
|
12
12
|
facts ||--o{ entity_mentions : "mentions"
|
|
13
13
|
facts ||--o{ fact_sources : "sourced from"
|
|
14
14
|
facts ||--o| facts : "superseded by"
|
|
15
15
|
|
|
16
|
-
|
|
16
|
+
sources {
|
|
17
17
|
bigint id PK
|
|
18
18
|
string content_hash UK
|
|
19
|
-
string
|
|
20
|
-
text
|
|
19
|
+
string type
|
|
20
|
+
text content
|
|
21
21
|
string title
|
|
22
22
|
string source_uri
|
|
23
|
-
jsonb
|
|
23
|
+
jsonb metadata
|
|
24
24
|
vector embedding
|
|
25
25
|
timestamptz captured_at
|
|
26
26
|
timestamptz created_at
|
|
@@ -28,10 +28,10 @@ erDiagram
|
|
|
28
28
|
|
|
29
29
|
entities {
|
|
30
30
|
bigint id PK
|
|
31
|
-
string
|
|
32
|
-
string
|
|
31
|
+
string name
|
|
32
|
+
string type
|
|
33
33
|
string resolution_status
|
|
34
|
-
bigint
|
|
34
|
+
bigint canonical_id FK
|
|
35
35
|
jsonb metadata
|
|
36
36
|
vector embedding
|
|
37
37
|
timestamptz created_at
|
|
@@ -40,15 +40,15 @@ erDiagram
|
|
|
40
40
|
entity_aliases {
|
|
41
41
|
bigint id PK
|
|
42
42
|
bigint entity_id FK
|
|
43
|
-
string
|
|
44
|
-
string
|
|
43
|
+
string name
|
|
44
|
+
string type
|
|
45
45
|
float confidence
|
|
46
46
|
}
|
|
47
47
|
|
|
48
48
|
facts {
|
|
49
49
|
bigint id PK
|
|
50
|
-
text
|
|
51
|
-
string
|
|
50
|
+
text text
|
|
51
|
+
string digest
|
|
52
52
|
timestamptz valid_at
|
|
53
53
|
timestamptz invalid_at
|
|
54
54
|
string status
|
|
@@ -74,8 +74,8 @@ erDiagram
|
|
|
74
74
|
fact_sources {
|
|
75
75
|
bigint id PK
|
|
76
76
|
bigint fact_id FK
|
|
77
|
-
bigint
|
|
78
|
-
string
|
|
77
|
+
bigint source_id FK
|
|
78
|
+
string kind
|
|
79
79
|
text excerpt
|
|
80
80
|
float confidence
|
|
81
81
|
}
|
|
@@ -83,28 +83,28 @@ erDiagram
|
|
|
83
83
|
|
|
84
84
|
## Tables
|
|
85
85
|
|
|
86
|
-
###
|
|
86
|
+
### sources
|
|
87
87
|
|
|
88
|
-
Stores immutable source
|
|
88
|
+
Stores immutable source content.
|
|
89
89
|
|
|
90
90
|
```sql
|
|
91
|
-
CREATE TABLE
|
|
91
|
+
CREATE TABLE sources (
|
|
92
92
|
id BIGSERIAL PRIMARY KEY,
|
|
93
93
|
content_hash VARCHAR(64) NOT NULL UNIQUE,
|
|
94
|
-
|
|
95
|
-
|
|
94
|
+
type VARCHAR(50) NOT NULL,
|
|
95
|
+
content TEXT NOT NULL,
|
|
96
96
|
title VARCHAR(255),
|
|
97
97
|
source_uri TEXT,
|
|
98
|
-
|
|
98
|
+
metadata JSONB NOT NULL DEFAULT '{}',
|
|
99
99
|
embedding VECTOR(1536),
|
|
100
100
|
captured_at TIMESTAMPTZ NOT NULL,
|
|
101
101
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
|
102
102
|
);
|
|
103
103
|
|
|
104
|
-
CREATE INDEX
|
|
105
|
-
CREATE INDEX
|
|
106
|
-
CREATE INDEX
|
|
107
|
-
CREATE INDEX
|
|
104
|
+
CREATE INDEX idx_sources_type ON sources(type);
|
|
105
|
+
CREATE INDEX idx_sources_captured ON sources(captured_at);
|
|
106
|
+
CREATE INDEX idx_sources_text ON sources USING gin(to_tsvector('english', content));
|
|
107
|
+
CREATE INDEX idx_sources_embedding ON sources USING hnsw(embedding vector_cosine_ops);
|
|
108
108
|
```
|
|
109
109
|
|
|
110
110
|
### entities
|
|
@@ -114,17 +114,17 @@ Stores resolved identities.
|
|
|
114
114
|
```sql
|
|
115
115
|
CREATE TABLE entities (
|
|
116
116
|
id BIGSERIAL PRIMARY KEY,
|
|
117
|
-
|
|
118
|
-
|
|
117
|
+
name VARCHAR(255) NOT NULL,
|
|
118
|
+
type VARCHAR(50) NOT NULL,
|
|
119
119
|
resolution_status VARCHAR(20) NOT NULL DEFAULT 'unresolved',
|
|
120
|
-
|
|
120
|
+
canonical_id BIGINT REFERENCES entities(id),
|
|
121
121
|
metadata JSONB NOT NULL DEFAULT '{}',
|
|
122
122
|
embedding VECTOR(1536),
|
|
123
123
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
|
124
124
|
);
|
|
125
125
|
|
|
126
|
-
CREATE INDEX idx_entities_name ON entities(
|
|
127
|
-
CREATE INDEX idx_entities_type ON entities(
|
|
126
|
+
CREATE INDEX idx_entities_name ON entities(name);
|
|
127
|
+
CREATE INDEX idx_entities_type ON entities(type);
|
|
128
128
|
CREATE INDEX idx_entities_status ON entities(resolution_status);
|
|
129
129
|
CREATE INDEX idx_entities_embedding ON entities USING hnsw(embedding vector_cosine_ops);
|
|
130
130
|
```
|
|
@@ -137,14 +137,14 @@ Stores alternative names for entities.
|
|
|
137
137
|
CREATE TABLE entity_aliases (
|
|
138
138
|
id BIGSERIAL PRIMARY KEY,
|
|
139
139
|
entity_id BIGINT NOT NULL REFERENCES entities(id) ON DELETE CASCADE,
|
|
140
|
-
|
|
141
|
-
|
|
140
|
+
name VARCHAR(255) NOT NULL,
|
|
141
|
+
type VARCHAR(50),
|
|
142
142
|
confidence FLOAT DEFAULT 1.0
|
|
143
143
|
);
|
|
144
144
|
|
|
145
145
|
CREATE INDEX idx_aliases_entity ON entity_aliases(entity_id);
|
|
146
|
-
CREATE INDEX idx_aliases_text ON entity_aliases(
|
|
147
|
-
CREATE UNIQUE INDEX idx_aliases_unique ON entity_aliases(entity_id,
|
|
146
|
+
CREATE INDEX idx_aliases_text ON entity_aliases(name);
|
|
147
|
+
CREATE UNIQUE INDEX idx_aliases_unique ON entity_aliases(entity_id, name);
|
|
148
148
|
```
|
|
149
149
|
|
|
150
150
|
### facts
|
|
@@ -154,8 +154,8 @@ Stores temporal assertions.
|
|
|
154
154
|
```sql
|
|
155
155
|
CREATE TABLE facts (
|
|
156
156
|
id BIGSERIAL PRIMARY KEY,
|
|
157
|
-
|
|
158
|
-
|
|
157
|
+
text TEXT NOT NULL,
|
|
158
|
+
digest VARCHAR(64) NOT NULL,
|
|
159
159
|
valid_at TIMESTAMPTZ NOT NULL,
|
|
160
160
|
invalid_at TIMESTAMPTZ,
|
|
161
161
|
status VARCHAR(20) NOT NULL DEFAULT 'canonical',
|
|
@@ -174,7 +174,7 @@ CREATE INDEX idx_facts_valid ON facts(valid_at);
|
|
|
174
174
|
CREATE INDEX idx_facts_invalid ON facts(invalid_at);
|
|
175
175
|
CREATE INDEX idx_facts_temporal ON facts(valid_at, invalid_at);
|
|
176
176
|
CREATE INDEX idx_facts_method ON facts(extraction_method);
|
|
177
|
-
CREATE INDEX idx_facts_text ON facts USING gin(to_tsvector('english',
|
|
177
|
+
CREATE INDEX idx_facts_text ON facts USING gin(to_tsvector('english', text));
|
|
178
178
|
CREATE INDEX idx_facts_embedding ON facts USING hnsw(embedding vector_cosine_ops);
|
|
179
179
|
```
|
|
180
180
|
|
|
@@ -205,15 +205,15 @@ Links facts to source content.
|
|
|
205
205
|
CREATE TABLE fact_sources (
|
|
206
206
|
id BIGSERIAL PRIMARY KEY,
|
|
207
207
|
fact_id BIGINT NOT NULL REFERENCES facts(id) ON DELETE CASCADE,
|
|
208
|
-
|
|
209
|
-
|
|
208
|
+
source_id BIGINT NOT NULL REFERENCES sources(id),
|
|
209
|
+
kind VARCHAR(50) NOT NULL DEFAULT 'primary',
|
|
210
210
|
excerpt TEXT,
|
|
211
211
|
confidence FLOAT DEFAULT 1.0
|
|
212
212
|
);
|
|
213
213
|
|
|
214
|
-
CREATE INDEX
|
|
215
|
-
CREATE INDEX
|
|
216
|
-
CREATE INDEX
|
|
214
|
+
CREATE INDEX idx_fact_sources_fact ON fact_sources(fact_id);
|
|
215
|
+
CREATE INDEX idx_fact_sources_source ON fact_sources(source_id);
|
|
216
|
+
CREATE INDEX idx_fact_sources_kind ON fact_sources(kind);
|
|
217
217
|
```
|
|
218
218
|
|
|
219
219
|
## Vector Indexes
|
|
@@ -221,8 +221,8 @@ CREATE INDEX idx_sources_type ON fact_sources(source_type);
|
|
|
221
221
|
FactDb uses HNSW indexes for fast approximate nearest neighbor search:
|
|
222
222
|
|
|
223
223
|
```sql
|
|
224
|
-
--
|
|
225
|
-
CREATE INDEX
|
|
224
|
+
-- Sources semantic search
|
|
225
|
+
CREATE INDEX idx_sources_embedding ON sources
|
|
226
226
|
USING hnsw(embedding vector_cosine_ops)
|
|
227
227
|
WITH (m = 16, ef_construction = 64);
|
|
228
228
|
|
|
@@ -269,7 +269,7 @@ ORDER BY f.valid_at ASC;
|
|
|
269
269
|
|
|
270
270
|
```sql
|
|
271
271
|
SELECT *, embedding <=> '[...]' AS distance
|
|
272
|
-
FROM
|
|
272
|
+
FROM sources
|
|
273
273
|
ORDER BY embedding <=> '[...]'
|
|
274
274
|
LIMIT 10;
|
|
275
275
|
```
|
|
@@ -279,7 +279,7 @@ LIMIT 10;
|
|
|
279
279
|
### Vacuum and Analyze
|
|
280
280
|
|
|
281
281
|
```sql
|
|
282
|
-
VACUUM ANALYZE
|
|
282
|
+
VACUUM ANALYZE sources;
|
|
283
283
|
VACUUM ANALYZE entities;
|
|
284
284
|
VACUUM ANALYZE facts;
|
|
285
285
|
```
|
|
@@ -287,7 +287,7 @@ VACUUM ANALYZE facts;
|
|
|
287
287
|
### Reindex Vectors
|
|
288
288
|
|
|
289
289
|
```sql
|
|
290
|
-
REINDEX INDEX
|
|
290
|
+
REINDEX INDEX idx_sources_embedding;
|
|
291
291
|
REINDEX INDEX idx_entities_embedding;
|
|
292
292
|
REINDEX INDEX idx_facts_embedding;
|
|
293
293
|
```
|
|
@@ -33,7 +33,7 @@ Direct match against canonical names:
|
|
|
33
33
|
```ruby
|
|
34
34
|
# Looking for "Microsoft"
|
|
35
35
|
entity = facts.resolve_entity("Microsoft")
|
|
36
|
-
# Matches: Entity(
|
|
36
|
+
# Matches: Entity(name: "Microsoft")
|
|
37
37
|
```
|
|
38
38
|
|
|
39
39
|
### 2. Alias Match
|
|
@@ -130,7 +130,7 @@ facts.entity_service.add_alias(
|
|
|
130
130
|
|
|
131
131
|
```ruby
|
|
132
132
|
entity.entity_aliases.each do |alias_record|
|
|
133
|
-
puts "#{alias_record.
|
|
133
|
+
puts "#{alias_record.name} (#{alias_record.type})"
|
|
134
134
|
end
|
|
135
135
|
```
|
|
136
136
|
|
|
@@ -153,7 +153,7 @@ facts.entity_service.merge(
|
|
|
153
153
|
|
|
154
154
|
# After merge:
|
|
155
155
|
# - entity2.resolution_status => "merged"
|
|
156
|
-
# - entity2.
|
|
156
|
+
# - entity2.canonical_id => entity1.id
|
|
157
157
|
# - All facts mentioning entity2 now also reference entity1
|
|
158
158
|
```
|
|
159
159
|
|
|
@@ -193,11 +193,11 @@ fact = facts.fact_service.create(
|
|
|
193
193
|
The LLM extractor resolves mentions automatically:
|
|
194
194
|
|
|
195
195
|
```ruby
|
|
196
|
-
extracted = facts.extract_facts(
|
|
196
|
+
extracted = facts.extract_facts(source.id, extractor: :llm)
|
|
197
197
|
|
|
198
198
|
extracted.each do |fact|
|
|
199
199
|
fact.entity_mentions.each do |mention|
|
|
200
|
-
puts "Resolved '#{mention.mention_text}' to #{mention.entity.
|
|
200
|
+
puts "Resolved '#{mention.mention_text}' to #{mention.entity.name}"
|
|
201
201
|
puts " Role: #{mention.mention_role}"
|
|
202
202
|
puts " Confidence: #{mention.confidence}"
|
|
203
203
|
end
|
|
@@ -240,7 +240,7 @@ results = facts.batch_resolve_entities(names)
|
|
|
240
240
|
|
|
241
241
|
results.each do |result|
|
|
242
242
|
puts "#{result[:name]}: #{result[:status]}"
|
|
243
|
-
puts " Entity: #{result[:entity]&.
|
|
243
|
+
puts " Entity: #{result[:entity]&.name}"
|
|
244
244
|
end
|
|
245
245
|
```
|
|
246
246
|
|