fact_db 0.0.3 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c9a22512c569e81df1cd3e7d216dc1356c043c8e454e631df14d3fadffd13b39
4
- data.tar.gz: 260f0183ffc6a7166d2111a953215c7835ab3fd47e213474d2eda66d2f8ea582
3
+ metadata.gz: ac11ce032c5e56849e6910c26154add65d81d3aaaa349969e584c493633e8db9
4
+ data.tar.gz: c0ace2cd5605d4530fc3f2a16e91a2edb85dd5532ccc80bdf3d0a2248a0e2a27
5
5
  SHA512:
6
- metadata.gz: 539a7bef88cb16f6f590d6227a5b0820347197d679cc187c9e34990381f5b6a550ed60b1733d829da1b5c0e6f67e3b0dc7bc0d9bfda606acad282914c7db9fbe
7
- data.tar.gz: 4c7e4b859af803c853cd2ce7d1a7aeba7c0d45f34934483d2caacd984eae29f6cf783803de19be071ca51f0b5df41b0e14bdf287a0d4815e42b3e5b2cedb7131
6
+ metadata.gz: e0ecf9c10caaa2e5836282bfac4b8de0b0f287948ca7f9de5413a8a00bb6d188e272b9a0f10d1832ca68806c9743ecdeac3939c304c6636c06d0b96847e33eb9
7
+ data.tar.gz: efd5e98fb240194a467bf6a8ddec659c754635290f54f99dab4c11779f627589fa659b8827feb9b0bd7e1c2d761e592e3c03fc30965d7a54095c32e05c48f310
data/CHANGELOG.md CHANGED
@@ -8,6 +8,45 @@ All notable changes to this project will be documented in this file.
8
8
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
9
9
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
10
10
 
11
+ ## [0.1.0] - Unreleased
12
+
13
+ ### Added
14
+
15
+ - **Persistent TSVectors** - Precomputed `tsvector` columns for full-text search
16
+ - `text_vector` column on `fact_db_facts` replaces on-the-fly `to_tsvector()` computation
17
+ - `content_vector` column on `fact_db_sources` replaces on-the-fly `to_tsvector()` computation
18
+ - Database triggers automatically keep vectors in sync on INSERT/UPDATE
19
+ - GIN indexes created `CONCURRENTLY` for non-blocking deployment
20
+ - Migration backfills existing rows and drops redundant expression-based indexes
21
+ - **Configurable LLM Prompts** - Extraction prompts moved to configuration
22
+ - `config.prompts.fact_extraction` - Customizable fact extraction prompt template
23
+ - `config.prompts.entity_extraction` - Customizable entity extraction prompt template
24
+ - `config.prompts.rag_system` - Customizable RAG system prompt
25
+ - Override via config files, environment variables (`FDB_PROMPTS__*`), or programmatic configuration
26
+ - **Configuration Defaults File** - `lib/fact_db/config/defaults.yml` as single source of truth for all config schema and defaults
27
+ - **Configuration Example** - New `examples/001_configuration.rb` demonstrating all configuration methods
28
+ - **Ingest Reporter** - New `examples/ingest_reporter.rb` with structured reporting for markdown ingestion
29
+
30
+ ### Changed
31
+
32
+ - **Full-text search scopes** now query persisted `tsvector` columns instead of computing them at query time
33
+ - `Fact.search_text` queries `text_vector` column directly
34
+ - `Source.search_text` queries `content_vector` column directly
35
+ - `ts_rank_cd()` calls in examples use persisted columns instead of recomputing
36
+ - **LLM Extractor** - Removed hardcoded prompt constants; prompts now loaded from configuration
37
+ - **Markdown ingestion** - Refactored with new reporter and reduced progress verbosity
38
+ - Version bump from 0.0.4 to 0.1.0
39
+
40
+ ## [0.0.4] - 2026-01-12
41
+
42
+ ### Added
43
+
44
+ - **LLM prompt configuration** - Prompts for fact/entity extraction are now configurable
45
+
46
+ ### Changed
47
+
48
+ - Version bump from 0.0.3 to 0.0.4
49
+
11
50
  ## [0.0.3] - 2026-01-12
12
51
 
13
52
  ### Added
data/Rakefile CHANGED
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "bundler/gem_tasks"
4
+ require "fileutils"
4
5
  require "rake/testtask"
5
6
 
6
7
  Rake::TestTask.new(:test) do |t|
@@ -264,6 +265,15 @@ namespace :docs do
264
265
  task :yard do
265
266
  output_dir = File.expand_path("doc", __dir__)
266
267
  system("yard", "doc") || abort("yard doc failed")
268
+
269
+ # Create symlink doc/docs/assets -> ../assets (i.e. doc/assets) for the README.md image path
270
+ # README.md references docs/assets/fact_db.jpg which needs to resolve in YARD output
271
+ docs_dir = File.join(output_dir, "docs")
272
+ FileUtils.mkdir_p(docs_dir)
273
+ symlink_path = File.join(docs_dir, "assets")
274
+ FileUtils.rm_f(symlink_path)
275
+ FileUtils.ln_sf("../assets", symlink_path)
276
+
267
277
  puts "YARD documentation built to #{output_dir}"
268
278
  end
269
279
 
@@ -0,0 +1,97 @@
1
+ # frozen_string_literal: true
2
+
3
+ class AddPersistentTsvectors < ActiveRecord::Migration[7.0]
4
+ # Uses disable_ddl_transaction! so GIN indexes can be created CONCURRENTLY
5
+ # without blocking writes to the table (CONCURRENTLY cannot run inside a transaction block).
6
+ disable_ddl_transaction!
7
+
8
+ def up
9
+ # --- fact_db_facts: persist tsvector for full-text search on text column ---
10
+
11
+ add_column :fact_db_facts, :text_vector, :tsvector,
12
+ comment: "Precomputed tsvector for full-text search on text"
13
+
14
+ execute <<-SQL
15
+ CREATE TRIGGER fact_db_facts_text_vector_update
16
+ BEFORE INSERT OR UPDATE ON fact_db_facts
17
+ FOR EACH ROW
18
+ EXECUTE FUNCTION tsvector_update_trigger(
19
+ text_vector,
20
+ 'pg_catalog.english',
21
+ text
22
+ );
23
+ SQL
24
+
25
+ execute <<-SQL
26
+ CREATE INDEX CONCURRENTLY idx_facts_text_vector
27
+ ON fact_db_facts
28
+ USING GIN (text_vector);
29
+ SQL
30
+
31
+ # Backfill existing rows: set the vector explicitly (the UPDATE also fires the trigger created above)
32
+ execute <<-SQL
33
+ UPDATE fact_db_facts SET text_vector = to_tsvector('english', COALESCE(text, ''));
34
+ SQL
35
+
36
+ # --- fact_db_sources: persist tsvector for full-text search on content column ---
37
+
38
+ add_column :fact_db_sources, :content_vector, :tsvector,
39
+ comment: "Precomputed tsvector for full-text search on content"
40
+
41
+ execute <<-SQL
42
+ CREATE TRIGGER fact_db_sources_content_vector_update
43
+ BEFORE INSERT OR UPDATE ON fact_db_sources
44
+ FOR EACH ROW
45
+ EXECUTE FUNCTION tsvector_update_trigger(
46
+ content_vector,
47
+ 'pg_catalog.english',
48
+ content
49
+ );
50
+ SQL
51
+
52
+ execute <<-SQL
53
+ CREATE INDEX CONCURRENTLY idx_sources_content_vector
54
+ ON fact_db_sources
55
+ USING GIN (content_vector);
56
+ SQL
57
+
58
+ # Backfill existing rows
59
+ execute <<-SQL
60
+ UPDATE fact_db_sources SET content_vector = to_tsvector('english', COALESCE(content, ''));
61
+ SQL
62
+
63
+ # Drop the old expression-based GIN indexes (now redundant)
64
+ execute "DROP INDEX IF EXISTS idx_facts_fulltext;"
65
+ execute "DROP INDEX IF EXISTS idx_sources_fulltext;"
66
+ end
67
+
68
+ def down
69
+ # Restore the original expression-based GIN indexes
70
+ execute <<-SQL
71
+ CREATE INDEX CONCURRENTLY idx_facts_fulltext ON fact_db_facts
72
+ USING gin(to_tsvector('english', text));
73
+ SQL
74
+
75
+ execute <<-SQL
76
+ CREATE INDEX CONCURRENTLY idx_sources_fulltext ON fact_db_sources
77
+ USING gin(to_tsvector('english', content));
78
+ SQL
79
+
80
+ # Drop triggers
81
+ execute <<-SQL
82
+ DROP TRIGGER IF EXISTS fact_db_facts_text_vector_update ON fact_db_facts;
83
+ SQL
84
+
85
+ execute <<-SQL
86
+ DROP TRIGGER IF EXISTS fact_db_sources_content_vector_update ON fact_db_sources;
87
+ SQL
88
+
89
+ # Drop indexes
90
+ execute "DROP INDEX CONCURRENTLY IF EXISTS idx_facts_text_vector;"
91
+ execute "DROP INDEX CONCURRENTLY IF EXISTS idx_sources_content_vector;"
92
+
93
+ # Remove columns
94
+ remove_column :fact_db_facts, :text_vector
95
+ remove_column :fact_db_sources, :content_vector
96
+ end
97
+ end
@@ -18,6 +18,7 @@ erDiagram
18
18
  string content_hash UK
19
19
  string type
20
20
  text content
21
+ tsvector content_vector
21
22
  string title
22
23
  string source_uri
23
24
  jsonb metadata
@@ -48,6 +49,7 @@ erDiagram
48
49
  facts {
49
50
  bigint id PK
50
51
  text text
52
+ tsvector text_vector
51
53
  string digest
52
54
  timestamptz valid_at
53
55
  timestamptz invalid_at
@@ -93,6 +95,7 @@ CREATE TABLE sources (
93
95
  content_hash VARCHAR(64) NOT NULL UNIQUE,
94
96
  type VARCHAR(50) NOT NULL,
95
97
  content TEXT NOT NULL,
98
+ content_vector TSVECTOR,
96
99
  title VARCHAR(255),
97
100
  source_uri TEXT,
98
101
  metadata JSONB NOT NULL DEFAULT '{}',
@@ -103,8 +106,14 @@ CREATE TABLE sources (
103
106
 
104
107
  CREATE INDEX idx_sources_type ON sources(type);
105
108
  CREATE INDEX idx_sources_captured ON sources(captured_at);
106
- CREATE INDEX idx_sources_text ON sources USING gin(to_tsvector('english', content));
109
+ CREATE INDEX idx_sources_content_vector ON sources USING GIN(content_vector);
107
110
  CREATE INDEX idx_sources_embedding ON sources USING hnsw(embedding vector_cosine_ops);
111
+
112
+ -- Trigger keeps content_vector in sync with content
113
+ CREATE TRIGGER sources_content_vector_update
114
+ BEFORE INSERT OR UPDATE ON sources
115
+ FOR EACH ROW
116
+ EXECUTE FUNCTION tsvector_update_trigger(content_vector, 'pg_catalog.english', content);
108
117
  ```
109
118
 
110
119
  ### entities
@@ -155,6 +164,7 @@ Stores temporal assertions.
155
164
  CREATE TABLE facts (
156
165
  id BIGSERIAL PRIMARY KEY,
157
166
  text TEXT NOT NULL,
167
+ text_vector TSVECTOR,
158
168
  digest VARCHAR(64) NOT NULL,
159
169
  valid_at TIMESTAMPTZ NOT NULL,
160
170
  invalid_at TIMESTAMPTZ,
@@ -174,8 +184,14 @@ CREATE INDEX idx_facts_valid ON facts(valid_at);
174
184
  CREATE INDEX idx_facts_invalid ON facts(invalid_at);
175
185
  CREATE INDEX idx_facts_temporal ON facts(valid_at, invalid_at);
176
186
  CREATE INDEX idx_facts_method ON facts(extraction_method);
177
- CREATE INDEX idx_facts_text ON facts USING gin(to_tsvector('english', text));
187
+ CREATE INDEX idx_facts_text_vector ON facts USING GIN(text_vector);
178
188
  CREATE INDEX idx_facts_embedding ON facts USING hnsw(embedding vector_cosine_ops);
189
+
190
+ -- Trigger keeps text_vector in sync with text
191
+ CREATE TRIGGER facts_text_vector_update
192
+ BEFORE INSERT OR UPDATE ON facts
193
+ FOR EACH ROW
194
+ EXECUTE FUNCTION tsvector_update_trigger(text_vector, 'pg_catalog.english', text);
179
195
  ```
180
196
 
181
197
  ### entity_mentions
@@ -116,6 +116,11 @@ puts <<~DATABASE_CONFIG
116
116
  config.ranking.relationship_match_weight: #{config.ranking.relationship_match_weight}
117
117
  config.ranking.confidence_weight: #{config.ranking.confidence_weight}
118
118
 
119
+ LLM Prompts (configurable templates for extraction):
120
+ config.prompts.fact_extraction: #{config.prompts.fact_extraction.lines.first.strip.inspect}...
121
+ config.prompts.entity_extraction: #{config.prompts.entity_extraction.lines.first.strip.inspect}...
122
+ config.prompts.rag_system: #{config.prompts.rag_system.lines.first.strip.inspect}...
123
+
119
124
  General settings:
120
125
  config.default_extractor: #{config.default_extractor.inspect}
121
126
  config.fuzzy_match_threshold: #{config.fuzzy_match_threshold}
@@ -154,6 +159,11 @@ puts <<~ENV_VARS
154
159
  export FDB_RANKING__TS_RANK_WEIGHT=0.30
155
160
  export FDB_RANKING__VECTOR_SIMILARITY_WEIGHT=0.25
156
161
 
162
+ # LLM Prompts (multi-line values work with heredocs in shell)
163
+ export FDB_PROMPTS__FACT_EXTRACTION="Your custom prompt..."
164
+ export FDB_PROMPTS__ENTITY_EXTRACTION="Your custom prompt..."
165
+ export FDB_PROMPTS__RAG_SYSTEM="Your custom system prompt..."
166
+
157
167
  # General settings
158
168
  export FDB_DEFAULT_EXTRACTOR=llm
159
169
  export FDB_FUZZY_MATCH_THRESHOLD=0.80
@@ -259,6 +269,25 @@ puts <<~CONFIG_FILES
259
269
 
260
270
  database:
261
271
  password: my_local_password
272
+
273
+ Custom Prompts (in any config file):
274
+ ---
275
+ # Override LLM prompts for fact/entity extraction
276
+ # Prompts use %<text>s as placeholder for input text
277
+ prompts:
278
+ fact_extraction: |
279
+ Extract facts from the following text.
280
+ Text: %<text>s
281
+ Return JSON array of facts.
282
+
283
+ entity_extraction: |
284
+ Extract named entities from the text.
285
+ Text: %<text>s
286
+ Return JSON array of entities.
287
+
288
+ rag_system: |
289
+ You are a helpful assistant with access to a fact database.
290
+ Use the provided context to answer questions.
262
291
  CONFIG_FILES
263
292
 
264
293
  demo_section("Section 7: Database Configuration")
@@ -307,6 +336,10 @@ puts <<~REFERENCE
307
336
  config.llm - LLM configuration
308
337
  config.embedding - Embedding configuration
309
338
  config.ranking - Ranking weights
339
+ config.prompts - LLM prompt templates
340
+ config.prompts.fact_extraction - Fact extraction prompt
341
+ config.prompts.entity_extraction - Entity extraction prompt
342
+ config.prompts.rag_system - RAG system prompt
310
343
  REFERENCE
311
344
 
312
345
  demo_footer("Configuration Demo Complete!")
@@ -307,7 +307,7 @@ class QueryContextGenerator
307
307
  # Use ts_rank_cd (cover density) for better phrase matching
308
308
  sql = <<~SQL
309
309
  SELECT id,
310
- ts_rank_cd(to_tsvector('english', text),
310
+ ts_rank_cd(text_vector,
311
311
  plainto_tsquery('english', ?),
312
312
  32) as rank
313
313
  FROM fact_db_facts
@@ -262,7 +262,7 @@ class RagFeedbackLoop
262
262
 
263
263
  sql = <<~SQL
264
264
  SELECT id,
265
- ts_rank_cd(to_tsvector('english', text),
265
+ ts_rank_cd(text_vector,
266
266
  plainto_tsquery('english', ?),
267
267
  32) as rank
268
268
  FROM fact_db_facts
@@ -133,11 +133,11 @@ module FactDb
133
133
  }
134
134
 
135
135
  # @!method search_text(query)
136
- # Full-text search on fact text using PostgreSQL tsvector
136
+ # Full-text search on fact text using persisted tsvector column
137
137
  # @param query [String] the search query
138
138
  # @return [ActiveRecord::Relation]
139
139
  scope :search_text, lambda { |query|
140
- where("to_tsvector('english', text) @@ plainto_tsquery('english', ?)", query)
140
+ where("text_vector @@ plainto_tsquery('english', ?)", query)
141
141
  }
142
142
 
143
143
  # @!method extracted_by(method)
@@ -59,11 +59,11 @@ module FactDb
59
59
  scope :captured_before, ->(date) { where("captured_at <= ?", date) }
60
60
 
61
61
  # @!method search_text(query)
62
- # Full-text search on source content using PostgreSQL tsvector
62
+ # Full-text search on source content using persisted tsvector column
63
63
  # @param query [String] the search query
64
64
  # @return [ActiveRecord::Relation]
65
65
  scope :search_text, lambda { |query|
66
- where("to_tsvector('english', content) @@ plainto_tsquery('english', ?)", query)
66
+ where("content_vector @@ plainto_tsquery('english', ?)", query)
67
67
  }
68
68
 
69
69
  # Finds sources by vector similarity using pgvector
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module FactDb
4
- VERSION = "0.0.3"
4
+ VERSION = "0.1.0"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fact_db
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dewayne VanHoozer
@@ -201,6 +201,7 @@ files:
201
201
  - db/migrate/005_create_facts.rb
202
202
  - db/migrate/006_create_entity_mentions.rb
203
203
  - db/migrate/007_create_fact_sources.rb
204
+ - db/migrate/008_add_persistent_tsvectors.rb
204
205
  - docs/api/extractors/index.md
205
206
  - docs/api/extractors/llm.md
206
207
  - docs/api/extractors/manual.md
@@ -326,7 +327,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
326
327
  - !ruby/object:Gem::Version
327
328
  version: '0'
328
329
  requirements: []
329
- rubygems_version: 4.0.3
330
+ rubygems_version: 4.0.6
330
331
  specification_version: 4
331
332
  summary: Temporal fact tracking with entity resolution and audit trails
332
333
  test_files: []