fact_db 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.envrc +2 -0
- data/.yardopts +5 -0
- data/CHANGELOG.md +64 -0
- data/README.md +107 -6
- data/Rakefile +243 -10
- data/db/migrate/001_enable_extensions.rb +1 -0
- data/db/migrate/002_create_sources.rb +49 -0
- data/db/migrate/003_create_entities.rb +27 -15
- data/db/migrate/004_create_entity_aliases.rb +20 -7
- data/db/migrate/005_create_facts.rb +37 -21
- data/db/migrate/006_create_entity_mentions.rb +14 -6
- data/db/migrate/007_create_fact_sources.rb +16 -8
- data/docs/api/extractors/index.md +5 -5
- data/docs/api/extractors/llm.md +17 -17
- data/docs/api/extractors/rule-based.md +14 -14
- data/docs/api/facts.md +20 -20
- data/docs/api/index.md +4 -4
- data/docs/api/models/entity.md +21 -21
- data/docs/api/models/fact.md +15 -15
- data/docs/api/models/index.md +7 -7
- data/docs/api/models/{content.md → source.md} +29 -29
- data/docs/api/pipeline/extraction.md +25 -25
- data/docs/api/pipeline/index.md +1 -1
- data/docs/api/pipeline/resolution.md +4 -4
- data/docs/api/services/entity-service.md +20 -20
- data/docs/api/services/fact-service.md +12 -12
- data/docs/api/services/index.md +5 -5
- data/docs/api/services/{content-service.md → source-service.md} +27 -27
- data/docs/architecture/database-schema.md +46 -46
- data/docs/architecture/entity-resolution.md +6 -6
- data/docs/architecture/index.md +10 -10
- data/docs/architecture/temporal-facts.md +5 -5
- data/docs/architecture/three-layer-model.md +17 -17
- data/docs/concepts.md +6 -6
- data/docs/examples/basic-usage.md +20 -20
- data/docs/examples/hr-onboarding.md +17 -17
- data/docs/examples/index.md +4 -4
- data/docs/examples/news-analysis.md +23 -23
- data/docs/getting-started/database-setup.md +28 -20
- data/docs/getting-started/index.md +3 -3
- data/docs/getting-started/quick-start.md +33 -30
- data/docs/guides/batch-processing.md +26 -26
- data/docs/guides/configuration.md +158 -77
- data/docs/guides/entity-management.md +14 -14
- data/docs/guides/extracting-facts.md +28 -28
- data/docs/guides/ingesting-content.md +14 -14
- data/docs/guides/llm-integration.md +40 -32
- data/docs/guides/temporal-queries.md +11 -11
- data/docs/index.md +6 -2
- data/examples/.envrc +4 -0
- data/examples/.gitignore +1 -0
- data/examples/001_configuration.rb +312 -0
- data/examples/{basic_usage.rb → 010_basic_usage.rb} +47 -56
- data/examples/{entity_management.rb → 020_entity_management.rb} +57 -72
- data/examples/{temporal_queries.rb → 030_temporal_queries.rb} +39 -59
- data/examples/040_output_formats.rb +177 -0
- data/examples/{rule_based_extraction.rb → 050_rule_based_extraction.rb} +39 -45
- data/examples/060_fluent_temporal_api.rb +217 -0
- data/examples/070_introspection.rb +252 -0
- data/examples/{hr_system.rb → 080_hr_system.rb} +56 -75
- data/examples/090_ingest_demo.rb +515 -0
- data/examples/100_query_context.rb +668 -0
- data/examples/110_prove_it.rb +204 -0
- data/examples/120_dump_database.rb +358 -0
- data/examples/130_rag_feedback_loop.rb +858 -0
- data/examples/README.md +229 -15
- data/examples/data/lincoln_associates.md +201 -0
- data/examples/data/lincoln_biography.md +66 -0
- data/examples/data/lincoln_cabinet.md +243 -0
- data/examples/data/lincoln_family.md +163 -0
- data/examples/data/lincoln_military.md +241 -0
- data/examples/data/lincoln_todd_family.md +136 -0
- data/examples/ingest_reporter.rb +335 -0
- data/examples/utilities.rb +182 -0
- data/lib/fact_db/config/defaults.yml +254 -0
- data/lib/fact_db/config.rb +94 -35
- data/lib/fact_db/database.rb +98 -8
- data/lib/fact_db/extractors/base.rb +106 -21
- data/lib/fact_db/extractors/llm_extractor.rb +35 -63
- data/lib/fact_db/extractors/manual_extractor.rb +46 -6
- data/lib/fact_db/extractors/rule_based_extractor.rb +136 -25
- data/lib/fact_db/llm/adapter.rb +3 -3
- data/lib/fact_db/models/entity.rb +94 -22
- data/lib/fact_db/models/entity_alias.rb +41 -7
- data/lib/fact_db/models/entity_mention.rb +34 -1
- data/lib/fact_db/models/fact.rb +259 -28
- data/lib/fact_db/models/fact_source.rb +43 -9
- data/lib/fact_db/models/source.rb +113 -0
- data/lib/fact_db/pipeline/extraction_pipeline.rb +35 -35
- data/lib/fact_db/pipeline/resolution_pipeline.rb +5 -5
- data/lib/fact_db/query_result.rb +202 -0
- data/lib/fact_db/resolution/entity_resolver.rb +139 -39
- data/lib/fact_db/resolution/fact_resolver.rb +86 -14
- data/lib/fact_db/services/entity_service.rb +246 -37
- data/lib/fact_db/services/fact_service.rb +254 -17
- data/lib/fact_db/services/source_service.rb +164 -0
- data/lib/fact_db/temporal/query.rb +71 -7
- data/lib/fact_db/temporal/query_builder.rb +69 -0
- data/lib/fact_db/temporal/timeline.rb +102 -11
- data/lib/fact_db/transformers/base.rb +77 -0
- data/lib/fact_db/transformers/cypher_transformer.rb +185 -0
- data/lib/fact_db/transformers/json_transformer.rb +17 -0
- data/lib/fact_db/transformers/raw_transformer.rb +35 -0
- data/lib/fact_db/transformers/text_transformer.rb +114 -0
- data/lib/fact_db/transformers/triple_transformer.rb +138 -0
- data/lib/fact_db/validation/alias_filter.rb +185 -0
- data/lib/fact_db/version.rb +1 -1
- data/lib/fact_db.rb +281 -30
- data/mkdocs.yml +2 -2
- metadata +60 -16
- data/db/migrate/002_create_contents.rb +0 -44
- data/lib/fact_db/models/content.rb +0 -62
- data/lib/fact_db/services/content_service.rb +0 -93
|
@@ -2,17 +2,30 @@
|
|
|
2
2
|
|
|
3
3
|
class CreateEntityAliases < ActiveRecord::Migration[7.0]
|
|
4
4
|
def change
|
|
5
|
-
create_table :fact_db_entity_aliases do |t|
|
|
6
|
-
t.references :entity, null: false, foreign_key: { to_table: :fact_db_entities, on_delete: :cascade }
|
|
7
|
-
|
|
8
|
-
t.string :
|
|
9
|
-
|
|
5
|
+
create_table :fact_db_entity_aliases, comment: "Alternative names and identifiers for entities enabling flexible matching" do |t|
|
|
6
|
+
t.references :entity, null: false, foreign_key: { to_table: :fact_db_entities, on_delete: :cascade },
|
|
7
|
+
comment: "The canonical entity this alias refers to"
|
|
8
|
+
t.string :name, null: false, limit: 500,
|
|
9
|
+
comment: "The alternative name, identifier, or reference text"
|
|
10
|
+
t.string :kind, limit: 50,
|
|
11
|
+
comment: "Classification of alias: name, nickname, email, handle, abbreviation, former_name"
|
|
12
|
+
t.float :confidence, default: 1.0,
|
|
13
|
+
comment: "Confidence score (0.0-1.0) that this alias correctly refers to the entity"
|
|
10
14
|
|
|
11
15
|
t.timestamps
|
|
12
16
|
end
|
|
13
17
|
|
|
14
|
-
add_index :fact_db_entity_aliases, :
|
|
15
|
-
add_index :fact_db_entity_aliases, [:entity_id, :
|
|
18
|
+
add_index :fact_db_entity_aliases, :name
|
|
19
|
+
add_index :fact_db_entity_aliases, [:entity_id, :name], unique: true,
|
|
16
20
|
name: "idx_unique_entity_alias"
|
|
21
|
+
|
|
22
|
+
# GIN trigram index on name for fuzzy alias matching
|
|
23
|
+
execute <<-SQL
|
|
24
|
+
CREATE INDEX idx_entity_aliases_name_trgm ON fact_db_entity_aliases
|
|
25
|
+
USING gin (name gin_trgm_ops);
|
|
26
|
+
SQL
|
|
27
|
+
|
|
28
|
+
execute "COMMENT ON COLUMN fact_db_entity_aliases.created_at IS 'When this alias association was created';"
|
|
29
|
+
execute "COMMENT ON COLUMN fact_db_entity_aliases.updated_at IS 'When this alias record was last modified';"
|
|
17
30
|
end
|
|
18
31
|
end
|
|
@@ -2,35 +2,42 @@
|
|
|
2
2
|
|
|
3
3
|
class CreateFacts < ActiveRecord::Migration[7.0]
|
|
4
4
|
def change
|
|
5
|
-
create_table :fact_db_facts do |t|
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
t.string :
|
|
5
|
+
create_table :fact_db_facts, comment: "Extracted factual assertions with temporal validity tracking (Event Clock pattern)" do |t|
|
|
6
|
+
t.text :text, null: false,
|
|
7
|
+
comment: "The factual assertion in natural language form"
|
|
8
|
+
t.string :digest, null: false, limit: 64,
|
|
9
|
+
comment: "SHA-256 hash of normalized text for deduplication"
|
|
9
10
|
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
t.timestamptz :invalid_at
|
|
11
|
+
t.timestamptz :valid_at, null: false,
|
|
12
|
+
comment: "When this fact became true (Event Clock valid_from)"
|
|
13
|
+
t.timestamptz :invalid_at,
|
|
14
|
+
comment: "When this fact ceased to be true; NULL means still valid (Event Clock valid_to)"
|
|
13
15
|
|
|
14
|
-
|
|
15
|
-
|
|
16
|
+
t.string :status, null: false, default: "canonical", limit: 20,
|
|
17
|
+
comment: "Fact lifecycle state: canonical, superseded, retracted, or disputed"
|
|
16
18
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
t.bigint :derived_from_ids, array: true, default: []
|
|
20
|
-
|
|
19
|
+
t.bigint :superseded_by_id,
|
|
20
|
+
comment: "Reference to newer fact that replaces this one"
|
|
21
|
+
t.bigint :derived_from_ids, array: true, default: [],
|
|
22
|
+
comment: "Array of fact IDs from which this fact was inferred or derived"
|
|
23
|
+
t.bigint :corroborated_by_ids, array: true, default: [],
|
|
24
|
+
comment: "Array of fact IDs that independently confirm this fact"
|
|
21
25
|
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
t.string :extraction_method, limit: 50
|
|
25
|
-
|
|
26
|
+
t.float :confidence, default: 1.0,
|
|
27
|
+
comment: "Confidence score (0.0-1.0) in the accuracy of this fact"
|
|
28
|
+
t.string :extraction_method, limit: 50,
|
|
29
|
+
comment: "How fact was extracted: manual, llm_extraction, rule_based, etc."
|
|
30
|
+
t.jsonb :metadata, null: false, default: {},
|
|
31
|
+
comment: "Additional structured data: extraction context, source details, tags"
|
|
26
32
|
|
|
27
|
-
|
|
28
|
-
|
|
33
|
+
t.vector :embedding, limit: 1536,
|
|
34
|
+
comment: "Vector embedding for semantic fact search and similarity matching"
|
|
29
35
|
|
|
30
36
|
t.timestamps
|
|
31
37
|
end
|
|
32
38
|
|
|
33
|
-
|
|
39
|
+
# Unique constraint on digest + valid_at allows same fact text at different times
|
|
40
|
+
add_index :fact_db_facts, [:digest, :valid_at], unique: true, name: "index_fact_db_facts_on_digest_valid_at"
|
|
34
41
|
add_index :fact_db_facts, :valid_at
|
|
35
42
|
add_index :fact_db_facts, :invalid_at
|
|
36
43
|
add_index :fact_db_facts, :status
|
|
@@ -53,7 +60,7 @@ class CreateFacts < ActiveRecord::Migration[7.0]
|
|
|
53
60
|
# Full-text search index
|
|
54
61
|
execute <<-SQL
|
|
55
62
|
CREATE INDEX idx_facts_fulltext ON fact_db_facts
|
|
56
|
-
USING gin(to_tsvector('english',
|
|
63
|
+
USING gin(to_tsvector('english', text));
|
|
57
64
|
SQL
|
|
58
65
|
|
|
59
66
|
# HNSW index for vector similarity search
|
|
@@ -61,5 +68,14 @@ class CreateFacts < ActiveRecord::Migration[7.0]
|
|
|
61
68
|
CREATE INDEX idx_facts_embedding ON fact_db_facts
|
|
62
69
|
USING hnsw (embedding vector_cosine_ops);
|
|
63
70
|
SQL
|
|
71
|
+
|
|
72
|
+
# GIN trigram index on text for fuzzy fact search
|
|
73
|
+
execute <<-SQL
|
|
74
|
+
CREATE INDEX idx_facts_text_trgm ON fact_db_facts
|
|
75
|
+
USING gin (text gin_trgm_ops);
|
|
76
|
+
SQL
|
|
77
|
+
|
|
78
|
+
execute "COMMENT ON COLUMN fact_db_facts.created_at IS 'When this fact was recorded in the database';"
|
|
79
|
+
execute "COMMENT ON COLUMN fact_db_facts.updated_at IS 'When this fact record was last modified';"
|
|
64
80
|
end
|
|
65
81
|
end
|
|
@@ -2,17 +2,25 @@
|
|
|
2
2
|
|
|
3
3
|
class CreateEntityMentions < ActiveRecord::Migration[7.0]
|
|
4
4
|
def change
|
|
5
|
-
create_table :fact_db_entity_mentions do |t|
|
|
6
|
-
t.references :fact, null: false, foreign_key: { to_table: :fact_db_facts, on_delete: :cascade }
|
|
7
|
-
|
|
8
|
-
t.
|
|
9
|
-
|
|
10
|
-
t.
|
|
5
|
+
create_table :fact_db_entity_mentions, comment: "Links entities to facts where they are mentioned, with role context" do |t|
|
|
6
|
+
t.references :fact, null: false, foreign_key: { to_table: :fact_db_facts, on_delete: :cascade },
|
|
7
|
+
comment: "The fact containing this entity mention"
|
|
8
|
+
t.references :entity, null: false, foreign_key: { to_table: :fact_db_entities, on_delete: :cascade },
|
|
9
|
+
comment: "The resolved entity being mentioned"
|
|
10
|
+
t.string :mention_text, null: false, limit: 500,
|
|
11
|
+
comment: "The exact text used to reference the entity in the fact"
|
|
12
|
+
t.string :mention_role, limit: 50,
|
|
13
|
+
comment: "Semantic role of entity in fact: subject, object, location, time, instrument, etc."
|
|
14
|
+
t.float :confidence, default: 1.0,
|
|
15
|
+
comment: "Confidence score (0.0-1.0) that mention correctly resolves to entity"
|
|
11
16
|
|
|
12
17
|
t.timestamps
|
|
13
18
|
end
|
|
14
19
|
|
|
15
20
|
add_index :fact_db_entity_mentions, [:fact_id, :entity_id, :mention_text],
|
|
16
21
|
unique: true, name: "idx_unique_fact_entity_mention"
|
|
22
|
+
|
|
23
|
+
execute "COMMENT ON COLUMN fact_db_entity_mentions.created_at IS 'When this mention link was created';"
|
|
24
|
+
execute "COMMENT ON COLUMN fact_db_entity_mentions.updated_at IS 'When this mention record was last modified';"
|
|
17
25
|
end
|
|
18
26
|
end
|
|
@@ -2,17 +2,25 @@
|
|
|
2
2
|
|
|
3
3
|
class CreateFactSources < ActiveRecord::Migration[7.0]
|
|
4
4
|
def change
|
|
5
|
-
create_table :fact_db_fact_sources do |t|
|
|
6
|
-
t.references :fact, null: false, foreign_key: { to_table: :fact_db_facts, on_delete: :cascade }
|
|
7
|
-
|
|
8
|
-
t.
|
|
9
|
-
|
|
10
|
-
t.
|
|
5
|
+
create_table :fact_db_fact_sources, comment: "Links facts to their source content for provenance tracking" do |t|
|
|
6
|
+
t.references :fact, null: false, foreign_key: { to_table: :fact_db_facts, on_delete: :cascade },
|
|
7
|
+
comment: "The fact derived from this source"
|
|
8
|
+
t.references :source, null: false, foreign_key: { to_table: :fact_db_sources, on_delete: :cascade },
|
|
9
|
+
comment: "The source content from which the fact was extracted"
|
|
10
|
+
t.string :kind, default: "primary", limit: 50,
|
|
11
|
+
comment: "Relationship type: primary (direct extraction), supporting, or corroborating"
|
|
12
|
+
t.text :excerpt,
|
|
13
|
+
comment: "The specific text passage within the content that supports this fact"
|
|
14
|
+
t.float :confidence, default: 1.0,
|
|
15
|
+
comment: "Confidence score (0.0-1.0) that this source supports the fact"
|
|
11
16
|
|
|
12
17
|
t.timestamps
|
|
13
18
|
end
|
|
14
19
|
|
|
15
|
-
add_index :fact_db_fact_sources, [:fact_id, :
|
|
16
|
-
name: "
|
|
20
|
+
add_index :fact_db_fact_sources, [:fact_id, :source_id], unique: true,
|
|
21
|
+
name: "idx_unique_fact_source"
|
|
22
|
+
|
|
23
|
+
execute "COMMENT ON COLUMN fact_db_fact_sources.created_at IS 'When this source link was established';"
|
|
24
|
+
execute "COMMENT ON COLUMN fact_db_fact_sources.updated_at IS 'When this source record was last modified';"
|
|
17
25
|
end
|
|
18
26
|
end
|
|
@@ -34,11 +34,11 @@ end
|
|
|
34
34
|
|
|
35
35
|
```ruby
|
|
36
36
|
class MyExtractor < FactDb::Extractors::Base
|
|
37
|
-
def extract(
|
|
37
|
+
def extract(source)
|
|
38
38
|
facts = []
|
|
39
39
|
|
|
40
40
|
# Your extraction logic
|
|
41
|
-
# Parse content
|
|
41
|
+
# Parse source.content
|
|
42
42
|
# Create fact records
|
|
43
43
|
|
|
44
44
|
facts
|
|
@@ -48,18 +48,18 @@ end
|
|
|
48
48
|
|
|
49
49
|
## Using Extractors
|
|
50
50
|
|
|
51
|
-
### Via
|
|
51
|
+
### Via FactDb
|
|
52
52
|
|
|
53
53
|
```ruby
|
|
54
54
|
facts = FactDb.new
|
|
55
|
-
extracted = facts.extract_facts(
|
|
55
|
+
extracted = facts.extract_facts(source.id, extractor: :llm)
|
|
56
56
|
```
|
|
57
57
|
|
|
58
58
|
### Directly
|
|
59
59
|
|
|
60
60
|
```ruby
|
|
61
61
|
extractor = FactDb::Extractors::LLMExtractor.new(config)
|
|
62
|
-
facts = extractor.extract(
|
|
62
|
+
facts = extractor.extract(source)
|
|
63
63
|
```
|
|
64
64
|
|
|
65
65
|
## Extractor Selection
|
data/docs/api/extractors/llm.md
CHANGED
|
@@ -17,9 +17,9 @@ extractor = FactDb::Extractors::LLMExtractor.new(config)
|
|
|
17
17
|
|
|
18
18
|
```ruby
|
|
19
19
|
FactDb.configure do |config|
|
|
20
|
-
config.
|
|
21
|
-
config.
|
|
22
|
-
config.
|
|
20
|
+
config.llm.provider = :openai
|
|
21
|
+
config.llm.model = "gpt-4o-mini"
|
|
22
|
+
config.llm.api_key = ENV['OPENAI_API_KEY']
|
|
23
23
|
end
|
|
24
24
|
```
|
|
25
25
|
|
|
@@ -35,7 +35,7 @@ Extract facts from content using LLM.
|
|
|
35
35
|
|
|
36
36
|
**Parameters:**
|
|
37
37
|
|
|
38
|
-
- `
|
|
38
|
+
- `source` (Models::Source) - Source to process
|
|
39
39
|
|
|
40
40
|
**Returns:** `Array<Models::Fact>`
|
|
41
41
|
|
|
@@ -43,10 +43,10 @@ Extract facts from content using LLM.
|
|
|
43
43
|
|
|
44
44
|
```ruby
|
|
45
45
|
extractor = LLMExtractor.new(config)
|
|
46
|
-
facts = extractor.extract(
|
|
46
|
+
facts = extractor.extract(source)
|
|
47
47
|
|
|
48
48
|
facts.each do |fact|
|
|
49
|
-
puts fact.
|
|
49
|
+
puts fact.text
|
|
50
50
|
puts " Valid: #{fact.valid_at}"
|
|
51
51
|
puts " Confidence: #{fact.confidence}"
|
|
52
52
|
end
|
|
@@ -73,7 +73,7 @@ Extract temporal facts from this content. For each fact:
|
|
|
73
73
|
4. Assess confidence level
|
|
74
74
|
|
|
75
75
|
Content:
|
|
76
|
-
{content
|
|
76
|
+
{source.content}
|
|
77
77
|
|
|
78
78
|
Return JSON:
|
|
79
79
|
{
|
|
@@ -94,12 +94,12 @@ Return JSON:
|
|
|
94
94
|
|
|
95
95
|
| Provider | Models | Config |
|
|
96
96
|
|----------|--------|--------|
|
|
97
|
-
| OpenAI | gpt-4o, gpt-4o-mini | `
|
|
98
|
-
| Anthropic | claude-sonnet-4, claude-3-haiku | `
|
|
99
|
-
| Google | gemini-2.0-flash | `
|
|
100
|
-
| Ollama | llama3.2, mistral | `
|
|
101
|
-
| AWS Bedrock | claude-sonnet-4 | `
|
|
102
|
-
| OpenRouter | Various | `
|
|
97
|
+
| OpenAI | gpt-4o, gpt-4o-mini | `llm.provider = :openai` |
|
|
98
|
+
| Anthropic | claude-sonnet-4, claude-3-haiku | `llm.provider = :anthropic` |
|
|
99
|
+
| Google | gemini-2.0-flash | `llm.provider = :gemini` |
|
|
100
|
+
| Ollama | llama3.2, mistral | `llm.provider = :ollama` |
|
|
101
|
+
| AWS Bedrock | claude-sonnet-4 | `llm.provider = :bedrock` |
|
|
102
|
+
| OpenRouter | Various | `llm.provider = :openrouter` |
|
|
103
103
|
|
|
104
104
|
## Error Handling
|
|
105
105
|
|
|
@@ -134,7 +134,7 @@ end
|
|
|
134
134
|
### 1. Validate Results
|
|
135
135
|
|
|
136
136
|
```ruby
|
|
137
|
-
facts = extractor.extract(
|
|
137
|
+
facts = extractor.extract(source)
|
|
138
138
|
facts.each do |fact|
|
|
139
139
|
if fact.confidence < 0.7
|
|
140
140
|
fact.update!(metadata: { needs_review: true })
|
|
@@ -145,9 +145,9 @@ end
|
|
|
145
145
|
### 2. Cache Responses
|
|
146
146
|
|
|
147
147
|
```ruby
|
|
148
|
-
cache_key = "llm:#{
|
|
148
|
+
cache_key = "llm:#{source.content_hash}"
|
|
149
149
|
facts = Rails.cache.fetch(cache_key) do
|
|
150
|
-
extractor.extract(
|
|
150
|
+
extractor.extract(source)
|
|
151
151
|
end
|
|
152
152
|
```
|
|
153
153
|
|
|
@@ -157,6 +157,6 @@ end
|
|
|
157
157
|
require 'retryable'
|
|
158
158
|
|
|
159
159
|
Retryable.retryable(tries: 3, sleep: lambda { |n| 2**n }) do
|
|
160
|
-
extractor.extract(
|
|
160
|
+
extractor.extract(source)
|
|
161
161
|
end
|
|
162
162
|
```
|
|
@@ -65,13 +65,13 @@ The extractor includes patterns for common fact types:
|
|
|
65
65
|
```ruby
|
|
66
66
|
extractor = RuleBasedExtractor.new(config)
|
|
67
67
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
68
|
+
source = Models::Source.create!(
|
|
69
|
+
content: "Paula Chen joined Microsoft on January 10, 2024 as Principal Engineer.",
|
|
70
|
+
type: "announcement",
|
|
71
71
|
captured_at: Time.current
|
|
72
72
|
)
|
|
73
73
|
|
|
74
|
-
facts = extractor.extract(
|
|
74
|
+
facts = extractor.extract(source)
|
|
75
75
|
# Returns facts about:
|
|
76
76
|
# - Paula joining Microsoft
|
|
77
77
|
# - Paula's title as Principal Engineer
|
|
@@ -99,20 +99,20 @@ class CustomRuleExtractor < FactDb::Extractors::RuleBasedExtractor
|
|
|
99
99
|
|
|
100
100
|
private
|
|
101
101
|
|
|
102
|
-
def extract_custom_patterns(
|
|
102
|
+
def extract_custom_patterns(source)
|
|
103
103
|
facts = []
|
|
104
104
|
CUSTOM_PATTERNS.each do |rule|
|
|
105
|
-
content.
|
|
106
|
-
facts << send(rule[:handler], match,
|
|
105
|
+
source.content.scan(rule[:pattern]) do |match|
|
|
106
|
+
facts << send(rule[:handler], match, source)
|
|
107
107
|
end
|
|
108
108
|
end
|
|
109
109
|
facts
|
|
110
110
|
end
|
|
111
111
|
|
|
112
|
-
def extract_revenue(match,
|
|
112
|
+
def extract_revenue(match, source)
|
|
113
113
|
Models::Fact.create!(
|
|
114
|
-
|
|
115
|
-
valid_at:
|
|
114
|
+
text: "Revenue of $#{match[:amount]}",
|
|
115
|
+
valid_at: source.captured_at,
|
|
116
116
|
extraction_method: "rule_based",
|
|
117
117
|
# ...
|
|
118
118
|
)
|
|
@@ -141,7 +141,7 @@ end
|
|
|
141
141
|
|
|
142
142
|
```ruby
|
|
143
143
|
# Use rule-based for structured content
|
|
144
|
-
if content.
|
|
144
|
+
if content.type == "form"
|
|
145
145
|
facts = rule_extractor.extract(content)
|
|
146
146
|
else
|
|
147
147
|
facts = llm_extractor.extract(content)
|
|
@@ -155,11 +155,11 @@ facts = extractor.extract(content)
|
|
|
155
155
|
facts.select { |f| f.confidence > 0.8 }
|
|
156
156
|
```
|
|
157
157
|
|
|
158
|
-
### 3. Log Unmatched
|
|
158
|
+
### 3. Log Unmatched Sources
|
|
159
159
|
|
|
160
160
|
```ruby
|
|
161
|
-
facts = extractor.extract(
|
|
161
|
+
facts = extractor.extract(source)
|
|
162
162
|
if facts.empty?
|
|
163
|
-
logger.info "No patterns matched for
|
|
163
|
+
logger.info "No patterns matched for source #{source.id}"
|
|
164
164
|
end
|
|
165
165
|
```
|
data/docs/api/facts.md
CHANGED
|
@@ -15,7 +15,7 @@ facts = FactDb::Facts.new(config: custom_config)
|
|
|
15
15
|
| Attribute | Type | Description |
|
|
16
16
|
|-----------|------|-------------|
|
|
17
17
|
| `config` | Config | Configuration instance |
|
|
18
|
-
| `
|
|
18
|
+
| `source_service` | SourceService | Service for source operations |
|
|
19
19
|
| `entity_service` | EntityService | Service for entity operations |
|
|
20
20
|
| `fact_service` | FactService | Service for fact operations |
|
|
21
21
|
| `extraction_pipeline` | ExtractionPipeline | Pipeline for batch extraction |
|
|
@@ -43,7 +43,7 @@ facts = FactDb.new
|
|
|
43
43
|
|
|
44
44
|
# Use custom configuration
|
|
45
45
|
config = FactDb::Config.new
|
|
46
|
-
config.
|
|
46
|
+
config.database.url = "postgresql://localhost/my_db"
|
|
47
47
|
facts = FactDb.new(config: config)
|
|
48
48
|
```
|
|
49
49
|
|
|
@@ -52,28 +52,28 @@ facts = FactDb.new(config: config)
|
|
|
52
52
|
### ingest
|
|
53
53
|
|
|
54
54
|
```ruby
|
|
55
|
-
def ingest(
|
|
55
|
+
def ingest(content, kind:, captured_at: Time.current, metadata: {}, title: nil, source_uri: nil)
|
|
56
56
|
```
|
|
57
57
|
|
|
58
58
|
Ingest raw content into the fact database.
|
|
59
59
|
|
|
60
60
|
**Parameters:**
|
|
61
61
|
|
|
62
|
-
- `
|
|
63
|
-
- `
|
|
62
|
+
- `content` (String) - The source text content
|
|
63
|
+
- `kind` (Symbol) - Content kind (:email, :document, :article, etc.)
|
|
64
64
|
- `captured_at` (Time, optional) - When content was captured
|
|
65
65
|
- `metadata` (Hash, optional) - Additional metadata
|
|
66
66
|
- `title` (String, optional) - Content title
|
|
67
67
|
- `source_uri` (String, optional) - Original location
|
|
68
68
|
|
|
69
|
-
**Returns:** `Models::
|
|
69
|
+
**Returns:** `Models::Source`
|
|
70
70
|
|
|
71
71
|
**Example:**
|
|
72
72
|
|
|
73
73
|
```ruby
|
|
74
|
-
|
|
74
|
+
source = facts.ingest(
|
|
75
75
|
"Paula joined Microsoft on Jan 10, 2024",
|
|
76
|
-
|
|
76
|
+
kind: :announcement,
|
|
77
77
|
title: "New Hire",
|
|
78
78
|
captured_at: Time.current
|
|
79
79
|
)
|
|
@@ -84,14 +84,14 @@ content = facts.ingest(
|
|
|
84
84
|
### extract_facts
|
|
85
85
|
|
|
86
86
|
```ruby
|
|
87
|
-
def extract_facts(
|
|
87
|
+
def extract_facts(source_id, extractor: @config.default_extractor)
|
|
88
88
|
```
|
|
89
89
|
|
|
90
90
|
Extract facts from content.
|
|
91
91
|
|
|
92
92
|
**Parameters:**
|
|
93
93
|
|
|
94
|
-
- `
|
|
94
|
+
- `source_id` (Integer) - Source ID
|
|
95
95
|
- `extractor` (Symbol, optional) - Extraction method (:manual, :llm, :rule_based)
|
|
96
96
|
|
|
97
97
|
**Returns:** `Array<Models::Fact>`
|
|
@@ -99,7 +99,7 @@ Extract facts from content.
|
|
|
99
99
|
**Example:**
|
|
100
100
|
|
|
101
101
|
```ruby
|
|
102
|
-
extracted = facts.extract_facts(
|
|
102
|
+
extracted = facts.extract_facts(source.id, extractor: :llm)
|
|
103
103
|
```
|
|
104
104
|
|
|
105
105
|
---
|
|
@@ -139,7 +139,7 @@ results = facts.query_facts(at: Date.parse("2023-06-15"))
|
|
|
139
139
|
### resolve_entity
|
|
140
140
|
|
|
141
141
|
```ruby
|
|
142
|
-
def resolve_entity(name,
|
|
142
|
+
def resolve_entity(name, kind: nil)
|
|
143
143
|
```
|
|
144
144
|
|
|
145
145
|
Resolve a name to an entity.
|
|
@@ -147,14 +147,14 @@ Resolve a name to an entity.
|
|
|
147
147
|
**Parameters:**
|
|
148
148
|
|
|
149
149
|
- `name` (String) - Name to resolve
|
|
150
|
-
- `
|
|
150
|
+
- `kind` (Symbol, optional) - Entity kind filter
|
|
151
151
|
|
|
152
152
|
**Returns:** `Models::Entity` or `nil`
|
|
153
153
|
|
|
154
154
|
**Example:**
|
|
155
155
|
|
|
156
156
|
```ruby
|
|
157
|
-
entity = facts.resolve_entity("Paula Chen",
|
|
157
|
+
entity = facts.resolve_entity("Paula Chen", kind: :person)
|
|
158
158
|
```
|
|
159
159
|
|
|
160
160
|
---
|
|
@@ -232,14 +232,14 @@ historical = facts.facts_at(Date.parse("2023-06-15"), entity: paula.id)
|
|
|
232
232
|
### batch_extract
|
|
233
233
|
|
|
234
234
|
```ruby
|
|
235
|
-
def batch_extract(
|
|
235
|
+
def batch_extract(source_ids, extractor: @config.default_extractor, parallel: true)
|
|
236
236
|
```
|
|
237
237
|
|
|
238
238
|
Batch extract facts from multiple content items.
|
|
239
239
|
|
|
240
240
|
**Parameters:**
|
|
241
241
|
|
|
242
|
-
- `
|
|
242
|
+
- `source_ids` (Array<Integer>) - Source IDs to process
|
|
243
243
|
- `extractor` (Symbol, optional) - Extraction method
|
|
244
244
|
- `parallel` (Boolean, optional) - Use parallel processing (default: true)
|
|
245
245
|
|
|
@@ -248,9 +248,9 @@ Batch extract facts from multiple content items.
|
|
|
248
248
|
**Example:**
|
|
249
249
|
|
|
250
250
|
```ruby
|
|
251
|
-
results = facts.batch_extract([
|
|
251
|
+
results = facts.batch_extract([s1.id, s2.id, s3.id], parallel: true)
|
|
252
252
|
results.each do |r|
|
|
253
|
-
puts "#{r[:
|
|
253
|
+
puts "#{r[:source_id]}: #{r[:facts].count} facts"
|
|
254
254
|
end
|
|
255
255
|
```
|
|
256
256
|
|
|
@@ -259,7 +259,7 @@ end
|
|
|
259
259
|
### batch_resolve_entities
|
|
260
260
|
|
|
261
261
|
```ruby
|
|
262
|
-
def batch_resolve_entities(names,
|
|
262
|
+
def batch_resolve_entities(names, kind: nil)
|
|
263
263
|
```
|
|
264
264
|
|
|
265
265
|
Batch resolve entity names.
|
|
@@ -267,7 +267,7 @@ Batch resolve entity names.
|
|
|
267
267
|
**Parameters:**
|
|
268
268
|
|
|
269
269
|
- `names` (Array<String>) - Names to resolve
|
|
270
|
-
- `
|
|
270
|
+
- `kind` (Symbol, optional) - Entity kind filter
|
|
271
271
|
|
|
272
272
|
**Returns:** `Array<Hash>` - Resolution results
|
|
273
273
|
|
data/docs/api/index.md
CHANGED
|
@@ -8,13 +8,13 @@ Complete API documentation for FactDb.
|
|
|
8
8
|
|
|
9
9
|
## Models
|
|
10
10
|
|
|
11
|
-
- [
|
|
11
|
+
- [Source](models/source.md) - Immutable source content
|
|
12
12
|
- [Entity](models/entity.md) - Resolved identities
|
|
13
13
|
- [Fact](models/fact.md) - Temporal assertions
|
|
14
14
|
|
|
15
15
|
## Services
|
|
16
16
|
|
|
17
|
-
- [
|
|
17
|
+
- [SourceService](services/source-service.md) - Ingest and manage sources
|
|
18
18
|
- [EntityService](services/entity-service.md) - Create and resolve entities
|
|
19
19
|
- [FactService](services/fact-service.md) - Extract and query facts
|
|
20
20
|
|
|
@@ -37,14 +37,14 @@ FactDb
|
|
|
37
37
|
├── Config # Configuration
|
|
38
38
|
├── Database # Database connection
|
|
39
39
|
├── Models
|
|
40
|
-
│ ├──
|
|
40
|
+
│ ├── Source
|
|
41
41
|
│ ├── Entity
|
|
42
42
|
│ ├── EntityAlias
|
|
43
43
|
│ ├── Fact
|
|
44
44
|
│ ├── EntityMention
|
|
45
45
|
│ └── FactSource
|
|
46
46
|
├── Services
|
|
47
|
-
│ ├──
|
|
47
|
+
│ ├── SourceService
|
|
48
48
|
│ ├── EntityService
|
|
49
49
|
│ └── FactService
|
|
50
50
|
├── Extractors
|