fact_db 0.0.3 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +39 -0
- data/Rakefile +10 -0
- data/db/migrate/008_add_persistent_tsvectors.rb +97 -0
- data/docs/architecture/database-schema.md +18 -2
- data/examples/001_configuration.rb +33 -0
- data/examples/100_query_context.rb +1 -1
- data/examples/130_rag_feedback_loop.rb +1 -1
- data/lib/fact_db/models/fact.rb +2 -2
- data/lib/fact_db/models/source.rb +2 -2
- data/lib/fact_db/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ac11ce032c5e56849e6910c26154add65d81d3aaaa349969e584c493633e8db9
|
|
4
|
+
data.tar.gz: c0ace2cd5605d4530fc3f2a16e91a2edb85dd5532ccc80bdf3d0a2248a0e2a27
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: e0ecf9c10caaa2e5836282bfac4b8de0b0f287948ca7f9de5413a8a00bb6d188e272b9a0f10d1832ca68806c9743ecdeac3939c304c6636c06d0b96847e33eb9
|
|
7
|
+
data.tar.gz: efd5e98fb240194a467bf6a8ddec659c754635290f54f99dab4c11779f627589fa659b8827feb9b0bd7e1c2d761e592e3c03fc30965d7a54095c32e05c48f310
|
data/CHANGELOG.md
CHANGED
|
@@ -8,6 +8,45 @@ All notable changes to this project will be documented in this file.
|
|
|
8
8
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
9
9
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
10
10
|
|
|
11
|
+
## [0.1.0] - Unreleased
|
|
12
|
+
|
|
13
|
+
### Added
|
|
14
|
+
|
|
15
|
+
- **Persistent TSVectors** - Precomputed `tsvector` columns for full-text search
|
|
16
|
+
- `text_vector` column on `fact_db_facts` replaces on-the-fly `to_tsvector()` computation
|
|
17
|
+
- `content_vector` column on `fact_db_sources` replaces on-the-fly `to_tsvector()` computation
|
|
18
|
+
- Database triggers automatically keep vectors in sync on INSERT/UPDATE
|
|
19
|
+
- GIN indexes created `CONCURRENTLY` for non-blocking deployment
|
|
20
|
+
- Migration backfills existing rows and drops redundant expression-based indexes
|
|
21
|
+
- **Configurable LLM Prompts** - Extraction prompts moved to configuration
|
|
22
|
+
- `config.prompts.fact_extraction` - Customizable fact extraction prompt template
|
|
23
|
+
- `config.prompts.entity_extraction` - Customizable entity extraction prompt template
|
|
24
|
+
- `config.prompts.rag_system` - Customizable RAG system prompt
|
|
25
|
+
- Override via config files, environment variables (`FDB_PROMPTS__*`), or programmatic configuration
|
|
26
|
+
- **Configuration Defaults File** - `lib/fact_db/config/defaults.yml` as single source of truth for all config schema and defaults
|
|
27
|
+
- **Configuration Example** - New `examples/001_configuration.rb` demonstrating all configuration methods
|
|
28
|
+
- **Ingest Reporter** - New `examples/ingest_reporter.rb` with structured reporting for markdown ingestion
|
|
29
|
+
|
|
30
|
+
### Changed
|
|
31
|
+
|
|
32
|
+
- **Full-text search scopes** now query persisted `tsvector` columns instead of computing them at query time
|
|
33
|
+
- `Fact.search_text` queries `text_vector` column directly
|
|
34
|
+
- `Source.search_text` queries `content_vector` column directly
|
|
35
|
+
- `ts_rank_cd()` calls in examples use persisted columns instead of recomputing
|
|
36
|
+
- **LLM Extractor** - Removed hardcoded prompt constants; prompts now loaded from configuration
|
|
37
|
+
- **Markdown ingestion** - Refactored with new reporter and reduced progress verbosity
|
|
38
|
+
- Version bump from 0.0.4 to 0.1.0
|
|
39
|
+
|
|
40
|
+
## [0.0.4] - 2026-01-12
|
|
41
|
+
|
|
42
|
+
### Added
|
|
43
|
+
|
|
44
|
+
- **LLM prompt configuration** - Prompts for fact/entity extraction are now configurable
|
|
45
|
+
|
|
46
|
+
### Changed
|
|
47
|
+
|
|
48
|
+
- Version bump from 0.0.3 to 0.0.4
|
|
49
|
+
|
|
11
50
|
## [0.0.3] - 2026-01-12
|
|
12
51
|
|
|
13
52
|
### Added
|
data/Rakefile
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require "bundler/gem_tasks"
|
|
4
|
+
require "fileutils"
|
|
4
5
|
require "rake/testtask"
|
|
5
6
|
|
|
6
7
|
Rake::TestTask.new(:test) do |t|
|
|
@@ -264,6 +265,15 @@ namespace :docs do
|
|
|
264
265
|
task :yard do
|
|
265
266
|
output_dir = File.expand_path("doc", __dir__)
|
|
266
267
|
system("yard", "doc") || abort("yard doc failed")
|
|
268
|
+
|
|
269
|
+
# Create symlink for README.md image path (docs/assets -> assets)
|
|
270
|
+
# README.md references docs/assets/fact_db.jpg which needs to resolve in YARD output
|
|
271
|
+
docs_dir = File.join(output_dir, "docs")
|
|
272
|
+
FileUtils.mkdir_p(docs_dir)
|
|
273
|
+
symlink_path = File.join(docs_dir, "assets")
|
|
274
|
+
FileUtils.rm_f(symlink_path)
|
|
275
|
+
FileUtils.ln_sf("../assets", symlink_path)
|
|
276
|
+
|
|
267
277
|
puts "YARD documentation built to #{output_dir}"
|
|
268
278
|
end
|
|
269
279
|
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
class AddPersistentTsvectors < ActiveRecord::Migration[7.0]
|
|
4
|
+
# Uses disable_ddl_transaction! so GIN indexes can be created CONCURRENTLY
|
|
5
|
+
# without holding an exclusive lock on the table.
|
|
6
|
+
disable_ddl_transaction!
|
|
7
|
+
|
|
8
|
+
def up
|
|
9
|
+
# --- fact_db_facts: persist tsvector for full-text search on text column ---
|
|
10
|
+
|
|
11
|
+
add_column :fact_db_facts, :text_vector, :tsvector,
|
|
12
|
+
comment: "Precomputed tsvector for full-text search on text"
|
|
13
|
+
|
|
14
|
+
execute <<-SQL
|
|
15
|
+
CREATE TRIGGER fact_db_facts_text_vector_update
|
|
16
|
+
BEFORE INSERT OR UPDATE ON fact_db_facts
|
|
17
|
+
FOR EACH ROW
|
|
18
|
+
EXECUTE FUNCTION tsvector_update_trigger(
|
|
19
|
+
text_vector,
|
|
20
|
+
'pg_catalog.english',
|
|
21
|
+
text
|
|
22
|
+
);
|
|
23
|
+
SQL
|
|
24
|
+
|
|
25
|
+
execute <<-SQL
|
|
26
|
+
CREATE INDEX CONCURRENTLY idx_facts_text_vector
|
|
27
|
+
ON fact_db_facts
|
|
28
|
+
USING GIN (text_vector);
|
|
29
|
+
SQL
|
|
30
|
+
|
|
31
|
+
# Backfill existing rows by touching them so the trigger fires
|
|
32
|
+
execute <<-SQL
|
|
33
|
+
UPDATE fact_db_facts SET text_vector = to_tsvector('english', COALESCE(text, ''));
|
|
34
|
+
SQL
|
|
35
|
+
|
|
36
|
+
# --- fact_db_sources: persist tsvector for full-text search on content column ---
|
|
37
|
+
|
|
38
|
+
add_column :fact_db_sources, :content_vector, :tsvector,
|
|
39
|
+
comment: "Precomputed tsvector for full-text search on content"
|
|
40
|
+
|
|
41
|
+
execute <<-SQL
|
|
42
|
+
CREATE TRIGGER fact_db_sources_content_vector_update
|
|
43
|
+
BEFORE INSERT OR UPDATE ON fact_db_sources
|
|
44
|
+
FOR EACH ROW
|
|
45
|
+
EXECUTE FUNCTION tsvector_update_trigger(
|
|
46
|
+
content_vector,
|
|
47
|
+
'pg_catalog.english',
|
|
48
|
+
content
|
|
49
|
+
);
|
|
50
|
+
SQL
|
|
51
|
+
|
|
52
|
+
execute <<-SQL
|
|
53
|
+
CREATE INDEX CONCURRENTLY idx_sources_content_vector
|
|
54
|
+
ON fact_db_sources
|
|
55
|
+
USING GIN (content_vector);
|
|
56
|
+
SQL
|
|
57
|
+
|
|
58
|
+
# Backfill existing rows
|
|
59
|
+
execute <<-SQL
|
|
60
|
+
UPDATE fact_db_sources SET content_vector = to_tsvector('english', COALESCE(content, ''));
|
|
61
|
+
SQL
|
|
62
|
+
|
|
63
|
+
# Drop the old expression-based GIN indexes (now redundant)
|
|
64
|
+
execute "DROP INDEX IF EXISTS idx_facts_fulltext;"
|
|
65
|
+
execute "DROP INDEX IF EXISTS idx_sources_fulltext;"
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
def down
|
|
69
|
+
# Restore the original expression-based GIN indexes
|
|
70
|
+
execute <<-SQL
|
|
71
|
+
CREATE INDEX CONCURRENTLY idx_facts_fulltext ON fact_db_facts
|
|
72
|
+
USING gin(to_tsvector('english', text));
|
|
73
|
+
SQL
|
|
74
|
+
|
|
75
|
+
execute <<-SQL
|
|
76
|
+
CREATE INDEX CONCURRENTLY idx_sources_fulltext ON fact_db_sources
|
|
77
|
+
USING gin(to_tsvector('english', content));
|
|
78
|
+
SQL
|
|
79
|
+
|
|
80
|
+
# Drop triggers
|
|
81
|
+
execute <<-SQL
|
|
82
|
+
DROP TRIGGER IF EXISTS fact_db_facts_text_vector_update ON fact_db_facts;
|
|
83
|
+
SQL
|
|
84
|
+
|
|
85
|
+
execute <<-SQL
|
|
86
|
+
DROP TRIGGER IF EXISTS fact_db_sources_content_vector_update ON fact_db_sources;
|
|
87
|
+
SQL
|
|
88
|
+
|
|
89
|
+
# Drop indexes
|
|
90
|
+
execute "DROP INDEX CONCURRENTLY IF EXISTS idx_facts_text_vector;"
|
|
91
|
+
execute "DROP INDEX CONCURRENTLY IF EXISTS idx_sources_content_vector;"
|
|
92
|
+
|
|
93
|
+
# Remove columns
|
|
94
|
+
remove_column :fact_db_facts, :text_vector
|
|
95
|
+
remove_column :fact_db_sources, :content_vector
|
|
96
|
+
end
|
|
97
|
+
end
|
|
@@ -18,6 +18,7 @@ erDiagram
|
|
|
18
18
|
string content_hash UK
|
|
19
19
|
string type
|
|
20
20
|
text content
|
|
21
|
+
tsvector content_vector
|
|
21
22
|
string title
|
|
22
23
|
string source_uri
|
|
23
24
|
jsonb metadata
|
|
@@ -48,6 +49,7 @@ erDiagram
|
|
|
48
49
|
facts {
|
|
49
50
|
bigint id PK
|
|
50
51
|
text text
|
|
52
|
+
tsvector text_vector
|
|
51
53
|
string digest
|
|
52
54
|
timestamptz valid_at
|
|
53
55
|
timestamptz invalid_at
|
|
@@ -93,6 +95,7 @@ CREATE TABLE sources (
|
|
|
93
95
|
content_hash VARCHAR(64) NOT NULL UNIQUE,
|
|
94
96
|
type VARCHAR(50) NOT NULL,
|
|
95
97
|
content TEXT NOT NULL,
|
|
98
|
+
content_vector TSVECTOR,
|
|
96
99
|
title VARCHAR(255),
|
|
97
100
|
source_uri TEXT,
|
|
98
101
|
metadata JSONB NOT NULL DEFAULT '{}',
|
|
@@ -103,8 +106,14 @@ CREATE TABLE sources (
|
|
|
103
106
|
|
|
104
107
|
CREATE INDEX idx_sources_type ON sources(type);
|
|
105
108
|
CREATE INDEX idx_sources_captured ON sources(captured_at);
|
|
106
|
-
CREATE INDEX idx_sources_fulltext ON sources USING gin(to_tsvector('english', content));
|
|
109
|
+
CREATE INDEX idx_sources_content_vector ON sources USING GIN(content_vector);
|
|
107
110
|
CREATE INDEX idx_sources_embedding ON sources USING hnsw(embedding vector_cosine_ops);
|
|
111
|
+
|
|
112
|
+
-- Trigger keeps content_vector in sync with content
|
|
113
|
+
CREATE TRIGGER sources_content_vector_update
|
|
114
|
+
BEFORE INSERT OR UPDATE ON sources
|
|
115
|
+
FOR EACH ROW
|
|
116
|
+
EXECUTE FUNCTION tsvector_update_trigger(content_vector, 'pg_catalog.english', content);
|
|
108
117
|
```
|
|
109
118
|
|
|
110
119
|
### entities
|
|
@@ -155,6 +164,7 @@ Stores temporal assertions.
|
|
|
155
164
|
CREATE TABLE facts (
|
|
156
165
|
id BIGSERIAL PRIMARY KEY,
|
|
157
166
|
text TEXT NOT NULL,
|
|
167
|
+
text_vector TSVECTOR,
|
|
158
168
|
digest VARCHAR(64) NOT NULL,
|
|
159
169
|
valid_at TIMESTAMPTZ NOT NULL,
|
|
160
170
|
invalid_at TIMESTAMPTZ,
|
|
@@ -174,8 +184,14 @@ CREATE INDEX idx_facts_valid ON facts(valid_at);
|
|
|
174
184
|
CREATE INDEX idx_facts_invalid ON facts(invalid_at);
|
|
175
185
|
CREATE INDEX idx_facts_temporal ON facts(valid_at, invalid_at);
|
|
176
186
|
CREATE INDEX idx_facts_method ON facts(extraction_method);
|
|
177
|
-
CREATE INDEX idx_facts_fulltext ON facts USING gin(to_tsvector('english', text));
|
|
187
|
+
CREATE INDEX idx_facts_text_vector ON facts USING GIN(text_vector);
|
|
178
188
|
CREATE INDEX idx_facts_embedding ON facts USING hnsw(embedding vector_cosine_ops);
|
|
189
|
+
|
|
190
|
+
-- Trigger keeps text_vector in sync with text
|
|
191
|
+
CREATE TRIGGER facts_text_vector_update
|
|
192
|
+
BEFORE INSERT OR UPDATE ON facts
|
|
193
|
+
FOR EACH ROW
|
|
194
|
+
EXECUTE FUNCTION tsvector_update_trigger(text_vector, 'pg_catalog.english', text);
|
|
179
195
|
```
|
|
180
196
|
|
|
181
197
|
### entity_mentions
|
|
@@ -116,6 +116,11 @@ puts <<~DATABASE_CONFIG
|
|
|
116
116
|
config.ranking.relationship_match_weight: #{config.ranking.relationship_match_weight}
|
|
117
117
|
config.ranking.confidence_weight: #{config.ranking.confidence_weight}
|
|
118
118
|
|
|
119
|
+
LLM Prompts (configurable templates for extraction):
|
|
120
|
+
config.prompts.fact_extraction: #{config.prompts.fact_extraction.lines.first.strip.inspect}...
|
|
121
|
+
config.prompts.entity_extraction: #{config.prompts.entity_extraction.lines.first.strip.inspect}...
|
|
122
|
+
config.prompts.rag_system: #{config.prompts.rag_system.lines.first.strip.inspect}...
|
|
123
|
+
|
|
119
124
|
General settings:
|
|
120
125
|
config.default_extractor: #{config.default_extractor.inspect}
|
|
121
126
|
config.fuzzy_match_threshold: #{config.fuzzy_match_threshold}
|
|
@@ -154,6 +159,11 @@ puts <<~ENV_VARS
|
|
|
154
159
|
export FDB_RANKING__TS_RANK_WEIGHT=0.30
|
|
155
160
|
export FDB_RANKING__VECTOR_SIMILARITY_WEIGHT=0.25
|
|
156
161
|
|
|
162
|
+
# LLM Prompts (multi-line values work with heredocs in shell)
|
|
163
|
+
export FDB_PROMPTS__FACT_EXTRACTION="Your custom prompt..."
|
|
164
|
+
export FDB_PROMPTS__ENTITY_EXTRACTION="Your custom prompt..."
|
|
165
|
+
export FDB_PROMPTS__RAG_SYSTEM="Your custom system prompt..."
|
|
166
|
+
|
|
157
167
|
# General settings
|
|
158
168
|
export FDB_DEFAULT_EXTRACTOR=llm
|
|
159
169
|
export FDB_FUZZY_MATCH_THRESHOLD=0.80
|
|
@@ -259,6 +269,25 @@ puts <<~CONFIG_FILES
|
|
|
259
269
|
|
|
260
270
|
database:
|
|
261
271
|
password: my_local_password
|
|
272
|
+
|
|
273
|
+
Custom Prompts (in any config file):
|
|
274
|
+
---
|
|
275
|
+
# Override LLM prompts for fact/entity extraction
|
|
276
|
+
# Prompts use %<text>s as placeholder for input text
|
|
277
|
+
prompts:
|
|
278
|
+
fact_extraction: |
|
|
279
|
+
Extract facts from the following text.
|
|
280
|
+
Text: %<text>s
|
|
281
|
+
Return JSON array of facts.
|
|
282
|
+
|
|
283
|
+
entity_extraction: |
|
|
284
|
+
Extract named entities from the text.
|
|
285
|
+
Text: %<text>s
|
|
286
|
+
Return JSON array of entities.
|
|
287
|
+
|
|
288
|
+
rag_system: |
|
|
289
|
+
You are a helpful assistant with access to a fact database.
|
|
290
|
+
Use the provided context to answer questions.
|
|
262
291
|
CONFIG_FILES
|
|
263
292
|
|
|
264
293
|
demo_section("Section 7: Database Configuration")
|
|
@@ -307,6 +336,10 @@ puts <<~REFERENCE
|
|
|
307
336
|
config.llm - LLM configuration
|
|
308
337
|
config.embedding - Embedding configuration
|
|
309
338
|
config.ranking - Ranking weights
|
|
339
|
+
config.prompts - LLM prompt templates
|
|
340
|
+
config.prompts.fact_extraction - Fact extraction prompt
|
|
341
|
+
config.prompts.entity_extraction - Entity extraction prompt
|
|
342
|
+
config.prompts.rag_system - RAG system prompt
|
|
310
343
|
REFERENCE
|
|
311
344
|
|
|
312
345
|
demo_footer("Configuration Demo Complete!")
|
|
@@ -307,7 +307,7 @@ class QueryContextGenerator
|
|
|
307
307
|
# Use ts_rank_cd (cover density) for better phrase matching
|
|
308
308
|
sql = <<~SQL
|
|
309
309
|
SELECT id,
|
|
310
|
-
ts_rank_cd(to_tsvector('english', text),
|
|
310
|
+
ts_rank_cd(text_vector,
|
|
311
311
|
plainto_tsquery('english', ?),
|
|
312
312
|
32) as rank
|
|
313
313
|
FROM fact_db_facts
|
data/lib/fact_db/models/fact.rb
CHANGED
|
@@ -133,11 +133,11 @@ module FactDb
|
|
|
133
133
|
}
|
|
134
134
|
|
|
135
135
|
# @!method search_text(query)
|
|
136
|
-
# Full-text search on fact text using
|
|
136
|
+
# Full-text search on fact text using persisted tsvector column
|
|
137
137
|
# @param query [String] the search query
|
|
138
138
|
# @return [ActiveRecord::Relation]
|
|
139
139
|
scope :search_text, lambda { |query|
|
|
140
|
-
where("to_tsvector('english', text) @@ plainto_tsquery('english', ?)", query)
|
|
140
|
+
where("text_vector @@ plainto_tsquery('english', ?)", query)
|
|
141
141
|
}
|
|
142
142
|
|
|
143
143
|
# @!method extracted_by(method)
|
|
@@ -59,11 +59,11 @@ module FactDb
|
|
|
59
59
|
scope :captured_before, ->(date) { where("captured_at <= ?", date) }
|
|
60
60
|
|
|
61
61
|
# @!method search_text(query)
|
|
62
|
-
# Full-text search on source content using
|
|
62
|
+
# Full-text search on source content using persisted tsvector column
|
|
63
63
|
# @param query [String] the search query
|
|
64
64
|
# @return [ActiveRecord::Relation]
|
|
65
65
|
scope :search_text, lambda { |query|
|
|
66
|
-
where("to_tsvector('english', content) @@ plainto_tsquery('english', ?)", query)
|
|
66
|
+
where("content_vector @@ plainto_tsquery('english', ?)", query)
|
|
67
67
|
}
|
|
68
68
|
|
|
69
69
|
# Finds sources by vector similarity using pgvector
|
data/lib/fact_db/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: fact_db
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.3
|
|
4
|
+
version: 0.1.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Dewayne VanHoozer
|
|
@@ -201,6 +201,7 @@ files:
|
|
|
201
201
|
- db/migrate/005_create_facts.rb
|
|
202
202
|
- db/migrate/006_create_entity_mentions.rb
|
|
203
203
|
- db/migrate/007_create_fact_sources.rb
|
|
204
|
+
- db/migrate/008_add_persistent_tsvectors.rb
|
|
204
205
|
- docs/api/extractors/index.md
|
|
205
206
|
- docs/api/extractors/llm.md
|
|
206
207
|
- docs/api/extractors/manual.md
|
|
@@ -326,7 +327,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
326
327
|
- !ruby/object:Gem::Version
|
|
327
328
|
version: '0'
|
|
328
329
|
requirements: []
|
|
329
|
-
rubygems_version: 4.0.
|
|
330
|
+
rubygems_version: 4.0.6
|
|
330
331
|
specification_version: 4
|
|
331
332
|
summary: Temporal fact tracking with entity resolution and audit trails
|
|
332
333
|
test_files: []
|