fact_db 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.envrc +1 -0
- data/CHANGELOG.md +48 -0
- data/COMMITS.md +196 -0
- data/README.md +102 -0
- data/Rakefile +41 -0
- data/db/migrate/001_enable_extensions.rb +7 -0
- data/db/migrate/002_create_contents.rb +44 -0
- data/db/migrate/003_create_entities.rb +36 -0
- data/db/migrate/004_create_entity_aliases.rb +18 -0
- data/db/migrate/005_create_facts.rb +65 -0
- data/db/migrate/006_create_entity_mentions.rb +18 -0
- data/db/migrate/007_create_fact_sources.rb +18 -0
- data/docs/api/extractors/index.md +71 -0
- data/docs/api/extractors/llm.md +162 -0
- data/docs/api/extractors/manual.md +92 -0
- data/docs/api/extractors/rule-based.md +165 -0
- data/docs/api/facts.md +300 -0
- data/docs/api/index.md +66 -0
- data/docs/api/models/content.md +165 -0
- data/docs/api/models/entity.md +202 -0
- data/docs/api/models/fact.md +270 -0
- data/docs/api/models/index.md +77 -0
- data/docs/api/pipeline/extraction.md +175 -0
- data/docs/api/pipeline/index.md +72 -0
- data/docs/api/pipeline/resolution.md +209 -0
- data/docs/api/services/content-service.md +166 -0
- data/docs/api/services/entity-service.md +202 -0
- data/docs/api/services/fact-service.md +223 -0
- data/docs/api/services/index.md +55 -0
- data/docs/architecture/database-schema.md +293 -0
- data/docs/architecture/entity-resolution.md +293 -0
- data/docs/architecture/index.md +149 -0
- data/docs/architecture/temporal-facts.md +268 -0
- data/docs/architecture/three-layer-model.md +242 -0
- data/docs/assets/css/custom.css +137 -0
- data/docs/assets/fact_db.jpg +0 -0
- data/docs/assets/images/fact_db.jpg +0 -0
- data/docs/concepts.md +183 -0
- data/docs/examples/basic-usage.md +235 -0
- data/docs/examples/hr-onboarding.md +312 -0
- data/docs/examples/index.md +64 -0
- data/docs/examples/news-analysis.md +288 -0
- data/docs/getting-started/database-setup.md +170 -0
- data/docs/getting-started/index.md +71 -0
- data/docs/getting-started/installation.md +98 -0
- data/docs/getting-started/quick-start.md +191 -0
- data/docs/guides/batch-processing.md +325 -0
- data/docs/guides/configuration.md +243 -0
- data/docs/guides/entity-management.md +364 -0
- data/docs/guides/extracting-facts.md +299 -0
- data/docs/guides/index.md +22 -0
- data/docs/guides/ingesting-content.md +252 -0
- data/docs/guides/llm-integration.md +299 -0
- data/docs/guides/temporal-queries.md +315 -0
- data/docs/index.md +121 -0
- data/examples/README.md +130 -0
- data/examples/basic_usage.rb +164 -0
- data/examples/entity_management.rb +216 -0
- data/examples/hr_system.rb +428 -0
- data/examples/rule_based_extraction.rb +258 -0
- data/examples/temporal_queries.rb +245 -0
- data/lib/fact_db/config.rb +71 -0
- data/lib/fact_db/database.rb +45 -0
- data/lib/fact_db/errors.rb +10 -0
- data/lib/fact_db/extractors/base.rb +117 -0
- data/lib/fact_db/extractors/llm_extractor.rb +179 -0
- data/lib/fact_db/extractors/manual_extractor.rb +53 -0
- data/lib/fact_db/extractors/rule_based_extractor.rb +228 -0
- data/lib/fact_db/llm/adapter.rb +109 -0
- data/lib/fact_db/models/content.rb +62 -0
- data/lib/fact_db/models/entity.rb +84 -0
- data/lib/fact_db/models/entity_alias.rb +26 -0
- data/lib/fact_db/models/entity_mention.rb +33 -0
- data/lib/fact_db/models/fact.rb +192 -0
- data/lib/fact_db/models/fact_source.rb +35 -0
- data/lib/fact_db/pipeline/extraction_pipeline.rb +146 -0
- data/lib/fact_db/pipeline/resolution_pipeline.rb +129 -0
- data/lib/fact_db/resolution/entity_resolver.rb +261 -0
- data/lib/fact_db/resolution/fact_resolver.rb +259 -0
- data/lib/fact_db/services/content_service.rb +93 -0
- data/lib/fact_db/services/entity_service.rb +150 -0
- data/lib/fact_db/services/fact_service.rb +193 -0
- data/lib/fact_db/temporal/query.rb +125 -0
- data/lib/fact_db/temporal/timeline.rb +134 -0
- data/lib/fact_db/version.rb +5 -0
- data/lib/fact_db.rb +141 -0
- data/mkdocs.yml +198 -0
- metadata +288 -0
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# Models
|
|
2
|
+
|
|
3
|
+
FactDb uses ActiveRecord models for data persistence.
|
|
4
|
+
|
|
5
|
+
## Core Models
|
|
6
|
+
|
|
7
|
+
- [Content](content.md) - Immutable source documents
|
|
8
|
+
- [Entity](entity.md) - Resolved identities with aliases
|
|
9
|
+
- [Fact](fact.md) - Temporal assertions
|
|
10
|
+
|
|
11
|
+
## Supporting Models
|
|
12
|
+
|
|
13
|
+
### EntityAlias
|
|
14
|
+
|
|
15
|
+
Stores alternative names for entities.
|
|
16
|
+
|
|
17
|
+
```ruby
|
|
18
|
+
class EntityAlias < ActiveRecord::Base
|
|
19
|
+
belongs_to :entity
|
|
20
|
+
end
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
| Column | Type | Description |
|
|
24
|
+
|--------|------|-------------|
|
|
25
|
+
| entity_id | bigint | Parent entity |
|
|
26
|
+
| alias_text | string | Alternative name |
|
|
27
|
+
| alias_type | string | Type (nickname, abbreviation, etc.) |
|
|
28
|
+
| confidence | float | Match confidence |
|
|
29
|
+
|
|
30
|
+
### EntityMention
|
|
31
|
+
|
|
32
|
+
Links facts to mentioned entities.
|
|
33
|
+
|
|
34
|
+
```ruby
|
|
35
|
+
class EntityMention < ActiveRecord::Base
|
|
36
|
+
belongs_to :fact
|
|
37
|
+
belongs_to :entity
|
|
38
|
+
end
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
| Column | Type | Description |
|
|
42
|
+
|--------|------|-------------|
|
|
43
|
+
| fact_id | bigint | Parent fact |
|
|
44
|
+
| entity_id | bigint | Referenced entity |
|
|
45
|
+
| mention_text | string | How entity was mentioned |
|
|
46
|
+
| mention_role | string | Role (subject, object, etc.) |
|
|
47
|
+
| confidence | float | Resolution confidence |
|
|
48
|
+
|
|
49
|
+
### FactSource
|
|
50
|
+
|
|
51
|
+
Links facts to source content.
|
|
52
|
+
|
|
53
|
+
```ruby
|
|
54
|
+
class FactSource < ActiveRecord::Base
|
|
55
|
+
belongs_to :fact
|
|
56
|
+
belongs_to :content
|
|
57
|
+
end
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
| Column | Type | Description |
|
|
61
|
+
|--------|------|-------------|
|
|
62
|
+
| fact_id | bigint | Parent fact |
|
|
63
|
+
| content_id | bigint | Source content |
|
|
64
|
+
| source_type | string | Type (primary, supporting, contradicting) |
|
|
65
|
+
| excerpt | text | Relevant text excerpt |
|
|
66
|
+
| confidence | float | Source confidence |
|
|
67
|
+
|
|
68
|
+
## Model Relationships
|
|
69
|
+
|
|
70
|
+
```mermaid
|
|
71
|
+
erDiagram
|
|
72
|
+
Content ||--o{ FactSource : "sourced by"
|
|
73
|
+
Entity ||--o{ EntityAlias : "has"
|
|
74
|
+
Entity ||--o{ EntityMention : "mentioned in"
|
|
75
|
+
Fact ||--o{ EntityMention : "mentions"
|
|
76
|
+
Fact ||--o{ FactSource : "sourced from"
|
|
77
|
+
```
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
# ExtractionPipeline
|
|
2
|
+
|
|
3
|
+
Concurrent fact extraction from multiple content items.
|
|
4
|
+
|
|
5
|
+
## Class: `FactDb::Pipeline::ExtractionPipeline`
|
|
6
|
+
|
|
7
|
+
```ruby
|
|
8
|
+
pipeline = FactDb::Pipeline::ExtractionPipeline.new(config)
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Methods
|
|
12
|
+
|
|
13
|
+
### process
|
|
14
|
+
|
|
15
|
+
```ruby
|
|
16
|
+
def process(contents, extractor: config.default_extractor)
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
Process content items sequentially.
|
|
20
|
+
|
|
21
|
+
**Parameters:**
|
|
22
|
+
|
|
23
|
+
- `contents` (`Array<Content>`) - Content records
|
|
24
|
+
- `extractor` (Symbol) - Extraction method
|
|
25
|
+
|
|
26
|
+
**Returns:** `Array<Hash>`
|
|
27
|
+
|
|
28
|
+
**Example:**
|
|
29
|
+
|
|
30
|
+
```ruby
|
|
31
|
+
contents = Models::Content.where(id: [1, 2, 3])
|
|
32
|
+
results = pipeline.process(contents, extractor: :llm)
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
---
|
|
36
|
+
|
|
37
|
+
### process_parallel
|
|
38
|
+
|
|
39
|
+
```ruby
|
|
40
|
+
def process_parallel(contents, extractor: config.default_extractor)
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Process content items concurrently.
|
|
44
|
+
|
|
45
|
+
**Parameters:**
|
|
46
|
+
|
|
47
|
+
- `contents` (`Array<Content>`) - Content records
|
|
48
|
+
- `extractor` (Symbol) - Extraction method
|
|
49
|
+
|
|
50
|
+
**Returns:** `Array<Hash>`
|
|
51
|
+
|
|
52
|
+
**Example:**
|
|
53
|
+
|
|
54
|
+
```ruby
|
|
55
|
+
results = pipeline.process_parallel(contents, extractor: :llm)
|
|
56
|
+
|
|
57
|
+
results.each do |result|
|
|
58
|
+
puts "Content #{result[:content_id]}:"
|
|
59
|
+
puts " Facts: #{result[:facts].count}"
|
|
60
|
+
puts " Error: #{result[:error]}" if result[:error]
|
|
61
|
+
end
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Pipeline Steps
|
|
65
|
+
|
|
66
|
+
### Sequential Pipeline
|
|
67
|
+
|
|
68
|
+
```mermaid
|
|
69
|
+
graph LR
|
|
70
|
+
A[Content] --> B[Validate]
|
|
71
|
+
B --> C[Extract]
|
|
72
|
+
C --> D[Validate Facts]
|
|
73
|
+
D --> E[Results]
|
|
74
|
+
|
|
75
|
+
style A fill:#1E40AF,stroke:#1E3A8A,color:#FFFFFF
|
|
76
|
+
style B fill:#B45309,stroke:#92400E,color:#FFFFFF
|
|
77
|
+
style C fill:#047857,stroke:#065F46,color:#FFFFFF
|
|
78
|
+
style D fill:#B45309,stroke:#92400E,color:#FFFFFF
|
|
79
|
+
style E fill:#B91C1C,stroke:#991B1B,color:#FFFFFF
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
1. **Validate** - Check content is not empty
|
|
83
|
+
2. **Extract** - Run extractor
|
|
84
|
+
3. **Validate Facts** - Filter valid facts
|
|
85
|
+
4. **Results** - Return extracted facts
|
|
86
|
+
|
|
87
|
+
### Parallel Pipeline
|
|
88
|
+
|
|
89
|
+
```mermaid
|
|
90
|
+
graph TB
|
|
91
|
+
subgraph Parallel
|
|
92
|
+
A1[Content 1] --> E1[Extract 1]
|
|
93
|
+
A2[Content 2] --> E2[Extract 2]
|
|
94
|
+
A3[Content 3] --> E3[Extract 3]
|
|
95
|
+
end
|
|
96
|
+
E1 --> Aggregate
|
|
97
|
+
E2 --> Aggregate
|
|
98
|
+
E3 --> Aggregate
|
|
99
|
+
|
|
100
|
+
style A1 fill:#1E40AF,stroke:#1E3A8A,color:#FFFFFF
|
|
101
|
+
style A2 fill:#1E40AF,stroke:#1E3A8A,color:#FFFFFF
|
|
102
|
+
style A3 fill:#1E40AF,stroke:#1E3A8A,color:#FFFFFF
|
|
103
|
+
style E1 fill:#047857,stroke:#065F46,color:#FFFFFF
|
|
104
|
+
style E2 fill:#047857,stroke:#065F46,color:#FFFFFF
|
|
105
|
+
style E3 fill:#047857,stroke:#065F46,color:#FFFFFF
|
|
106
|
+
style Aggregate fill:#B91C1C,stroke:#991B1B,color:#FFFFFF
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Result Structure
|
|
110
|
+
|
|
111
|
+
```ruby
|
|
112
|
+
{
|
|
113
|
+
content_id: 123,
|
|
114
|
+
facts: [<Fact>, <Fact>, ...], # Extracted facts
|
|
115
|
+
error: nil # Error message if failed
|
|
116
|
+
}
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
## Usage via Facts
|
|
120
|
+
|
|
121
|
+
```ruby
|
|
122
|
+
facts = FactDb.new
|
|
123
|
+
|
|
124
|
+
# Sequential
|
|
125
|
+
results = facts.batch_extract(content_ids, parallel: false)
|
|
126
|
+
|
|
127
|
+
# Parallel (default)
|
|
128
|
+
results = facts.batch_extract(content_ids, parallel: true)
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
## Error Handling
|
|
132
|
+
|
|
133
|
+
The pipeline catches errors per item and reports them in each result's `:error` field:
|
|
134
|
+
|
|
135
|
+
```ruby
|
|
136
|
+
results = pipeline.process_parallel(contents)
|
|
137
|
+
|
|
138
|
+
results.each do |result|
|
|
139
|
+
if result[:error]
|
|
140
|
+
logger.error "Content #{result[:content_id]}: #{result[:error]}"
|
|
141
|
+
else
|
|
142
|
+
logger.info "Content #{result[:content_id]}: #{result[:facts].count} facts"
|
|
143
|
+
end
|
|
144
|
+
end
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
## Performance
|
|
148
|
+
|
|
149
|
+
### Batch Size
|
|
150
|
+
|
|
151
|
+
Optimal batch size depends on:
|
|
152
|
+
|
|
153
|
+
- Extractor type (LLM has rate limits)
|
|
154
|
+
- Content length
|
|
155
|
+
- System resources
|
|
156
|
+
|
|
157
|
+
```ruby
|
|
158
|
+
# Process in optimal batches
|
|
159
|
+
contents.each_slice(25) do |batch|
|
|
160
|
+
results = pipeline.process_parallel(batch)
|
|
161
|
+
process_results(results)
|
|
162
|
+
end
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
### Memory
|
|
166
|
+
|
|
167
|
+
For large inputs, process one batch at a time and let each batch's results be discarded before moving on to the next:
|
|
168
|
+
|
|
169
|
+
```ruby
|
|
170
|
+
contents.each_slice(50) do |batch|
|
|
171
|
+
results = pipeline.process_parallel(batch)
|
|
172
|
+
save_facts(results.flat_map { |r| r[:facts] })
|
|
173
|
+
# Results discarded after each batch
|
|
174
|
+
end
|
|
175
|
+
```
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# Pipeline
|
|
2
|
+
|
|
3
|
+
Pipelines provide concurrent processing for batch operations using SimpleFlow.
|
|
4
|
+
|
|
5
|
+
## Available Pipelines
|
|
6
|
+
|
|
7
|
+
- [ExtractionPipeline](extraction.md) - Concurrent fact extraction
|
|
8
|
+
- [ResolutionPipeline](resolution.md) - Parallel entity resolution
|
|
9
|
+
|
|
10
|
+
## SimpleFlow Integration
|
|
11
|
+
|
|
12
|
+
Pipelines are built on the `simple_flow` gem:
|
|
13
|
+
|
|
14
|
+
```ruby
|
|
15
|
+
require 'simple_flow'
|
|
16
|
+
|
|
17
|
+
pipeline = SimpleFlow::Pipeline.new do
|
|
18
|
+
step ->(result) { result.continue(transformed_value) }
|
|
19
|
+
step ->(result) { result.continue(more_transformation) }
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
result = pipeline.call(SimpleFlow::Result.new(initial_value))
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Pipeline Pattern
|
|
26
|
+
|
|
27
|
+
All pipelines follow a common structure:
|
|
28
|
+
|
|
29
|
+
```ruby
|
|
30
|
+
class SomePipeline
|
|
31
|
+
attr_reader :config
|
|
32
|
+
|
|
33
|
+
def initialize(config = FactDb.config)
|
|
34
|
+
@config = config
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
def process(items, **options)
|
|
38
|
+
# Sequential processing
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
def process_parallel(items, **options)
|
|
42
|
+
# Parallel processing
|
|
43
|
+
end
|
|
44
|
+
end
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Result Structure
|
|
48
|
+
|
|
49
|
+
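Pipeline results are hashes with a consistent shape. An extraction result, for example, looks like this (resolution results use keys such as `name`, `entity`, and `status` instead):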
|
|
50
|
+
|
|
51
|
+
```ruby
|
|
52
|
+
{
|
|
53
|
+
content_id: 123, # Item identifier
|
|
54
|
+
facts: [<Fact>, ...], # Extracted/resolved items
|
|
55
|
+
error: nil # Error message if failed
|
|
56
|
+
}
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## Error Handling
|
|
60
|
+
|
|
61
|
+
Pipelines handle errors gracefully:
|
|
62
|
+
|
|
63
|
+
```ruby
|
|
64
|
+
results = pipeline.process_parallel(items)
|
|
65
|
+
|
|
66
|
+
successful = results.select { |r| r[:error].nil? }
|
|
67
|
+
failed = results.reject { |r| r[:error].nil? }
|
|
68
|
+
|
|
69
|
+
failed.each do |result|
|
|
70
|
+
logger.error "Failed: #{result[:error]}"
|
|
71
|
+
end
|
|
72
|
+
```
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
# ResolutionPipeline
|
|
2
|
+
|
|
3
|
+
Parallel entity resolution and conflict detection.
|
|
4
|
+
|
|
5
|
+
## Class: `FactDb::Pipeline::ResolutionPipeline`
|
|
6
|
+
|
|
7
|
+
```ruby
|
|
8
|
+
pipeline = FactDb::Pipeline::ResolutionPipeline.new(config)
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Methods
|
|
12
|
+
|
|
13
|
+
### resolve_entities
|
|
14
|
+
|
|
15
|
+
```ruby
|
|
16
|
+
def resolve_entities(names, type: nil)
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
Resolve multiple entity names in parallel.
|
|
20
|
+
|
|
21
|
+
**Parameters:**
|
|
22
|
+
|
|
23
|
+
- `names` (`Array<String>`) - Names to resolve
|
|
24
|
+
- `type` (Symbol) - Optional entity type filter
|
|
25
|
+
|
|
26
|
+
**Returns:** `Array<Hash>`
|
|
27
|
+
|
|
28
|
+
**Example:**
|
|
29
|
+
|
|
30
|
+
```ruby
|
|
31
|
+
names = ["Paula Chen", "Microsoft", "Seattle"]
|
|
32
|
+
results = pipeline.resolve_entities(names)
|
|
33
|
+
|
|
34
|
+
results.each do |result|
|
|
35
|
+
puts "#{result[:name]}: #{result[:status]}"
|
|
36
|
+
puts " Entity: #{result[:entity]&.canonical_name}"
|
|
37
|
+
end
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
### detect_conflicts
|
|
43
|
+
|
|
44
|
+
```ruby
|
|
45
|
+
def detect_conflicts(entity_ids)
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Find fact conflicts for multiple entities in parallel.
|
|
49
|
+
|
|
50
|
+
**Parameters:**
|
|
51
|
+
|
|
52
|
+
- `entity_ids` (`Array<Integer>`) - Entity IDs to check
|
|
53
|
+
|
|
54
|
+
**Returns:** `Array<Hash>`
|
|
55
|
+
|
|
56
|
+
**Example:**
|
|
57
|
+
|
|
58
|
+
```ruby
|
|
59
|
+
results = pipeline.detect_conflicts([paula.id, john.id])
|
|
60
|
+
|
|
61
|
+
results.each do |result|
|
|
62
|
+
puts "Entity #{result[:entity_id]}: #{result[:conflict_count]} conflicts"
|
|
63
|
+
result[:conflicts].each do |c|
|
|
64
|
+
puts " - #{c[:fact1].fact_text} vs #{c[:fact2].fact_text}"
|
|
65
|
+
end
|
|
66
|
+
end
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Result Structures
|
|
70
|
+
|
|
71
|
+
### Resolution Result
|
|
72
|
+
|
|
73
|
+
```ruby
|
|
74
|
+
{
|
|
75
|
+
name: "Paula Chen",
|
|
76
|
+
entity: <Entity>, # Resolved entity or nil
|
|
77
|
+
status: :resolved, # :resolved, :not_found, :error
|
|
78
|
+
error: nil # Error message if failed
|
|
79
|
+
}
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### Conflict Detection Result
|
|
83
|
+
|
|
84
|
+
```ruby
|
|
85
|
+
{
|
|
86
|
+
entity_id: 123,
|
|
87
|
+
conflicts: [
|
|
88
|
+
{
|
|
89
|
+
fact1: <Fact>,
|
|
90
|
+
fact2: <Fact>,
|
|
91
|
+
similarity: 0.75
|
|
92
|
+
}
|
|
93
|
+
],
|
|
94
|
+
conflict_count: 1
|
|
95
|
+
}
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## Pipeline Steps
|
|
99
|
+
|
|
100
|
+
### Entity Resolution Pipeline
|
|
101
|
+
|
|
102
|
+
```mermaid
|
|
103
|
+
graph TB
|
|
104
|
+
subgraph Parallel
|
|
105
|
+
N1[Name 1] --> R1[Resolve 1]
|
|
106
|
+
N2[Name 2] --> R2[Resolve 2]
|
|
107
|
+
N3[Name 3] --> R3[Resolve 3]
|
|
108
|
+
end
|
|
109
|
+
R1 --> Aggregate
|
|
110
|
+
R2 --> Aggregate
|
|
111
|
+
R3 --> Aggregate
|
|
112
|
+
|
|
113
|
+
style N1 fill:#1E40AF,stroke:#1E3A8A,color:#FFFFFF
|
|
114
|
+
style N2 fill:#1E40AF,stroke:#1E3A8A,color:#FFFFFF
|
|
115
|
+
style N3 fill:#1E40AF,stroke:#1E3A8A,color:#FFFFFF
|
|
116
|
+
style R1 fill:#047857,stroke:#065F46,color:#FFFFFF
|
|
117
|
+
style R2 fill:#047857,stroke:#065F46,color:#FFFFFF
|
|
118
|
+
style R3 fill:#047857,stroke:#065F46,color:#FFFFFF
|
|
119
|
+
style Aggregate fill:#B91C1C,stroke:#991B1B,color:#FFFFFF
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### Conflict Detection Pipeline
|
|
123
|
+
|
|
124
|
+
```mermaid
|
|
125
|
+
graph TB
|
|
126
|
+
subgraph Parallel
|
|
127
|
+
E1[Entity 1] --> C1[Find Conflicts 1]
|
|
128
|
+
E2[Entity 2] --> C2[Find Conflicts 2]
|
|
129
|
+
end
|
|
130
|
+
C1 --> Aggregate
|
|
131
|
+
C2 --> Aggregate
|
|
132
|
+
|
|
133
|
+
style E1 fill:#047857,stroke:#065F46,color:#FFFFFF
|
|
134
|
+
style E2 fill:#047857,stroke:#065F46,color:#FFFFFF
|
|
135
|
+
style C1 fill:#B45309,stroke:#92400E,color:#FFFFFF
|
|
136
|
+
style C2 fill:#B45309,stroke:#92400E,color:#FFFFFF
|
|
137
|
+
style Aggregate fill:#B91C1C,stroke:#991B1B,color:#FFFFFF
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## Usage via Facts
|
|
141
|
+
|
|
142
|
+
```ruby
|
|
143
|
+
facts = FactDb.new
|
|
144
|
+
|
|
145
|
+
# Resolve entities
|
|
146
|
+
results = facts.batch_resolve_entities(["Paula", "Microsoft"])
|
|
147
|
+
|
|
148
|
+
# Detect conflicts
|
|
149
|
+
results = facts.detect_fact_conflicts([entity1.id, entity2.id])
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
## Resolution Strategies
|
|
153
|
+
|
|
154
|
+
The pipeline uses the `EntityResolver`, which tries:
|
|
155
|
+
|
|
156
|
+
1. **Exact match** on canonical name
|
|
157
|
+
2. **Alias match** on registered aliases
|
|
158
|
+
3. **Fuzzy match** using Levenshtein distance
|
|
159
|
+
|
|
160
|
+
```ruby
|
|
161
|
+
FactDb.configure do |config|
|
|
162
|
+
config.fuzzy_match_threshold = 0.85
|
|
163
|
+
end
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
## Error Handling
|
|
167
|
+
|
|
168
|
+
```ruby
|
|
169
|
+
results = pipeline.resolve_entities(names)
|
|
170
|
+
|
|
171
|
+
# Handle unresolved names
|
|
172
|
+
unresolved = results.select { |r| r[:status] == :not_found }
|
|
173
|
+
unresolved.each do |result|
|
|
174
|
+
# Optionally create new entities
|
|
175
|
+
entity = facts.entity_service.create(
|
|
176
|
+
result[:name],
|
|
177
|
+
type: :person,
|
|
178
|
+
metadata: { needs_review: true }
|
|
179
|
+
)
|
|
180
|
+
end
|
|
181
|
+
|
|
182
|
+
# Handle errors
|
|
183
|
+
errors = results.select { |r| r[:status] == :error }
|
|
184
|
+
errors.each do |result|
|
|
185
|
+
logger.error "Resolution failed for #{result[:name]}: #{result[:error]}"
|
|
186
|
+
end
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
## Performance Tips
|
|
190
|
+
|
|
191
|
+
### Batch Size
|
|
192
|
+
|
|
193
|
+
```ruby
|
|
194
|
+
# Process in batches for large name lists
|
|
195
|
+
names.each_slice(100) do |batch|
|
|
196
|
+
results = pipeline.resolve_entities(batch)
|
|
197
|
+
process_results(results)
|
|
198
|
+
end
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
### Pre-warm Cache
|
|
202
|
+
|
|
203
|
+
```ruby
|
|
204
|
+
# Load entities into memory first
|
|
205
|
+
Entity.where(entity_type: 'person').to_a
|
|
206
|
+
|
|
207
|
+
# Then resolve
|
|
208
|
+
results = pipeline.resolve_entities(person_names, type: :person)
|
|
209
|
+
```
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
# ContentService
|
|
2
|
+
|
|
3
|
+
Service for ingesting and managing source content.
|
|
4
|
+
|
|
5
|
+
## Class: `FactDb::Services::ContentService`
|
|
6
|
+
|
|
7
|
+
```ruby
|
|
8
|
+
service = FactDb::Services::ContentService.new(config)
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Methods
|
|
12
|
+
|
|
13
|
+
### create
|
|
14
|
+
|
|
15
|
+
```ruby
|
|
16
|
+
def create(raw_text, type:, captured_at: Time.current, metadata: {}, title: nil, source_uri: nil)
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
Create new content with automatic deduplication.
|
|
20
|
+
|
|
21
|
+
**Parameters:**
|
|
22
|
+
|
|
23
|
+
- `raw_text` (String) - Content text
|
|
24
|
+
- `type` (Symbol) - Content type
|
|
25
|
+
- `captured_at` (Time) - Capture timestamp
|
|
26
|
+
- `metadata` (Hash) - Additional metadata
|
|
27
|
+
- `title` (String) - Optional title
|
|
28
|
+
- `source_uri` (String) - Original location
|
|
29
|
+
|
|
30
|
+
**Returns:** `Models::Content`
|
|
31
|
+
|
|
32
|
+
**Example:**
|
|
33
|
+
|
|
34
|
+
```ruby
|
|
35
|
+
content = service.create(
|
|
36
|
+
"Email body text...",
|
|
37
|
+
type: :email,
|
|
38
|
+
title: "RE: Important",
|
|
39
|
+
metadata: { from: "sender@example.com" }
|
|
40
|
+
)
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
### find
|
|
46
|
+
|
|
47
|
+
```ruby
|
|
48
|
+
def find(id)
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Find content by ID.
|
|
52
|
+
|
|
53
|
+
**Returns:** `Models::Content`
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
### find_by_hash
|
|
58
|
+
|
|
59
|
+
```ruby
|
|
60
|
+
def find_by_hash(hash)
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
Find content by SHA256 hash.
|
|
64
|
+
|
|
65
|
+
**Returns:** `Models::Content` or `nil`
|
|
66
|
+
|
|
67
|
+
**Example:**
|
|
68
|
+
|
|
69
|
+
```ruby
|
|
70
|
+
hash = Digest::SHA256.hexdigest(text)
|
|
71
|
+
content = service.find_by_hash(hash)
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
### search
|
|
77
|
+
|
|
78
|
+
```ruby
|
|
79
|
+
def search(query, limit: 20)
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
Full-text search over content.
|
|
83
|
+
|
|
84
|
+
**Parameters:**
|
|
85
|
+
|
|
86
|
+
- `query` (String) - Search query
|
|
87
|
+
- `limit` (Integer) - Max results
|
|
88
|
+
|
|
89
|
+
**Returns:** `Array<Models::Content>`
|
|
90
|
+
|
|
91
|
+
**Example:**
|
|
92
|
+
|
|
93
|
+
```ruby
|
|
94
|
+
results = service.search("quarterly report", limit: 10)
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
### semantic_search
|
|
100
|
+
|
|
101
|
+
```ruby
|
|
102
|
+
def semantic_search(query, limit: 10)
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
Semantic similarity search using embeddings.
|
|
106
|
+
|
|
107
|
+
**Parameters:**
|
|
108
|
+
|
|
109
|
+
- `query` (String) - Search query
|
|
110
|
+
- `limit` (Integer) - Max results
|
|
111
|
+
|
|
112
|
+
**Returns:** `Array<Models::Content>`
|
|
113
|
+
|
|
114
|
+
**Example:**
|
|
115
|
+
|
|
116
|
+
```ruby
|
|
117
|
+
results = service.semantic_search("financial performance")
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
---
|
|
121
|
+
|
|
122
|
+
### by_type
|
|
123
|
+
|
|
124
|
+
```ruby
|
|
125
|
+
def by_type(type)
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
Filter content by type.
|
|
129
|
+
|
|
130
|
+
**Returns:** `ActiveRecord::Relation`
|
|
131
|
+
|
|
132
|
+
**Example:**
|
|
133
|
+
|
|
134
|
+
```ruby
|
|
135
|
+
emails = service.by_type(:email)
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
---
|
|
139
|
+
|
|
140
|
+
### recent
|
|
141
|
+
|
|
142
|
+
```ruby
|
|
143
|
+
def recent(limit: 20)
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
Get recently captured content.
|
|
147
|
+
|
|
148
|
+
**Returns:** `Array<Models::Content>`
|
|
149
|
+
|
|
150
|
+
---
|
|
151
|
+
|
|
152
|
+
### mentioning_entity
|
|
153
|
+
|
|
154
|
+
```ruby
|
|
155
|
+
def mentioning_entity(entity_id)
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
Find content that mentions an entity (via facts).
|
|
159
|
+
|
|
160
|
+
**Returns:** `Array<Models::Content>`
|
|
161
|
+
|
|
162
|
+
**Example:**
|
|
163
|
+
|
|
164
|
+
```ruby
|
|
165
|
+
paula_content = service.mentioning_entity(paula.id)
|
|
166
|
+
```
|