RubyGems - fact_db - Versions diffs - 0.0.3 → 0.1.0 - Mend

fact_db 0.0.3 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +39 -0
data/Rakefile +10 -0
data/db/migrate/008_add_persistent_tsvectors.rb +97 -0
data/docs/architecture/database-schema.md +18 -2
data/examples/001_configuration.rb +33 -0
data/examples/100_query_context.rb +1 -1
data/examples/130_rag_feedback_loop.rb +1 -1
data/lib/fact_db/models/fact.rb +2 -2
data/lib/fact_db/models/source.rb +2 -2
data/lib/fact_db/version.rb +1 -1
metadata +3 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: c9a22512c569e81df1cd3e7d216dc1356c043c8e454e631df14d3fadffd13b39
-  data.tar.gz: 260f0183ffc6a7166d2111a953215c7835ab3fd47e213474d2eda66d2f8ea582
+  metadata.gz: ac11ce032c5e56849e6910c26154add65d81d3aaaa349969e584c493633e8db9
+  data.tar.gz: c0ace2cd5605d4530fc3f2a16e91a2edb85dd5532ccc80bdf3d0a2248a0e2a27
 SHA512:
-  metadata.gz: 539a7bef88cb16f6f590d6227a5b0820347197d679cc187c9e34990381f5b6a550ed60b1733d829da1b5c0e6f67e3b0dc7bc0d9bfda606acad282914c7db9fbe
-  data.tar.gz: 4c7e4b859af803c853cd2ce7d1a7aeba7c0d45f34934483d2caacd984eae29f6cf783803de19be071ca51f0b5df41b0e14bdf287a0d4815e42b3e5b2cedb7131
+  metadata.gz: e0ecf9c10caaa2e5836282bfac4b8de0b0f287948ca7f9de5413a8a00bb6d188e272b9a0f10d1832ca68806c9743ecdeac3939c304c6636c06d0b96847e33eb9
+  data.tar.gz: efd5e98fb240194a467bf6a8ddec659c754635290f54f99dab4c11779f627589fa659b8827feb9b0bd7e1c2d761e592e3c03fc30965d7a54095c32e05c48f310

data/CHANGELOG.md CHANGED Viewed

@@ -8,6 +8,45 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [0.1.0] - Unreleased
+### Added
+- **Persistent TSVectors** - Precomputed `tsvector` columns for full-text search
+  - `text_vector` column on `fact_db_facts` replaces on-the-fly `to_tsvector()` computation
+  - `content_vector` column on `fact_db_sources` replaces on-the-fly `to_tsvector()` computation
+  - Database triggers automatically keep vectors in sync on INSERT/UPDATE
+  - GIN indexes created `CONCURRENTLY` for non-blocking deployment
+  - Migration backfills existing rows and drops redundant expression-based indexes
+- **Configurable LLM Prompts** - Extraction prompts moved to configuration
+  - `config.prompts.fact_extraction` - Customizable fact extraction prompt template
+  - `config.prompts.entity_extraction` - Customizable entity extraction prompt template
+  - `config.prompts.rag_system` - Customizable RAG system prompt
+  - Override via config files, environment variables (`FDB_PROMPTS__*`), or programmatic configuration
+- **Configuration Defaults File** - `lib/fact_db/config/defaults.yml` as single source of truth for all config schema and defaults
+- **Configuration Example** - New `examples/001_configuration.rb` demonstrating all configuration methods
+- **Ingest Reporter** - New `examples/ingest_reporter.rb` with structured reporting for markdown ingestion
+### Changed
+- **Full-text search scopes** now query persisted `tsvector` columns instead of computing them at query time
+  - `Fact.search_text` queries `text_vector` column directly
+  - `Source.search_text` queries `content_vector` column directly
+  - `ts_rank_cd()` calls in examples use persisted columns instead of recomputing
+- **LLM Extractor** - Removed hardcoded prompt constants; prompts now loaded from configuration
+- **Markdown ingestion** - Refactored with new reporter and reduced progress verbosity
+- Version bump from 0.0.4 to 0.1.0
+## [0.0.4] - 2026-01-12
+### Added
+- **LLM prompt configuration** - Prompts for fact/entity extraction are now configurable
+### Changed
+- Version bump from 0.0.3 to 0.0.4
 ## [0.0.3] - 2026-01-12
 ### Added

data/Rakefile CHANGED Viewed

@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 require "bundler/gem_tasks"
+require "fileutils"
 require "rake/testtask"
 Rake::TestTask.new(:test) do |t|
@@ -264,6 +265,15 @@ namespace :docs do
   task :yard do
     output_dir = File.expand_path("doc", __dir__)
     system("yard", "doc") || abort("yard doc failed")
+    # Create symlink for README.md image path (docs/assets -> assets)
+    # README.md references docs/assets/fact_db.jpg which needs to resolve in YARD output
+    docs_dir = File.join(output_dir, "docs")
+    FileUtils.mkdir_p(docs_dir)
+    symlink_path = File.join(docs_dir, "assets")
+    FileUtils.rm_f(symlink_path)
+    FileUtils.ln_sf("../assets", symlink_path)
     puts "YARD documentation built to #{output_dir}"
   end

data/db/migrate/008_add_persistent_tsvectors.rb ADDED Viewed

@@ -0,0 +1,97 @@
+# frozen_string_literal: true
+# Adds persisted, trigger-maintained tsvector columns for full-text search:
+# fact_db_facts.text_vector (built from text) and
+# fact_db_sources.content_vector (built from content), each with a GIN index.
+# These replace the expression-based GIN indexes idx_facts_fulltext and
+# idx_sources_fulltext.
+#
+# The migration runs outside a DDL transaction, so a failure part-way through
+# is NOT rolled back. Every step is therefore written to be safe to re-run.
+# Caveat: an interrupted CREATE INDEX CONCURRENTLY can leave an INVALID index
+# behind; IF NOT EXISTS will skip it, so drop such an index before retrying.
+class AddPersistentTsvectors < ActiveRecord::Migration[7.0]
+  # Uses disable_ddl_transaction! so GIN indexes can be created CONCURRENTLY
+  # without holding an exclusive lock on the table.
+  disable_ddl_transaction!
+  # Adds the vector columns, sync triggers and GIN indexes, backfills existing
+  # rows, then drops the old expression-based indexes.
+  def up
+    # --- fact_db_facts: persist tsvector for full-text search on text column ---
+    add_column :fact_db_facts, :text_vector, :tsvector,
+               comment: "Precomputed tsvector for full-text search on text",
+               if_not_exists: true
+    # CREATE TRIGGER has no IF NOT EXISTS form, so drop first to stay re-runnable.
+    execute "DROP TRIGGER IF EXISTS fact_db_facts_text_vector_update ON fact_db_facts;"
+    execute <<~SQL
+      CREATE TRIGGER fact_db_facts_text_vector_update
+        BEFORE INSERT OR UPDATE ON fact_db_facts
+        FOR EACH ROW
+        EXECUTE FUNCTION tsvector_update_trigger(
+          text_vector,
+          'pg_catalog.english',
+          text
+        );
+    SQL
+    # Backfill existing rows. The vector is set explicitly here; the trigger
+    # created above also fires on this UPDATE and recomputes the same value.
+    # This runs BEFORE the index is built so the GIN index is bulk-built once
+    # instead of being updated row by row during the backfill.
+    execute <<~SQL
+      UPDATE fact_db_facts SET text_vector = to_tsvector('english', COALESCE(text, ''));
+    SQL
+    execute <<~SQL
+      CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_facts_text_vector
+        ON fact_db_facts
+        USING GIN (text_vector);
+    SQL
+    # --- fact_db_sources: persist tsvector for full-text search on content column ---
+    add_column :fact_db_sources, :content_vector, :tsvector,
+               comment: "Precomputed tsvector for full-text search on content",
+               if_not_exists: true
+    execute "DROP TRIGGER IF EXISTS fact_db_sources_content_vector_update ON fact_db_sources;"
+    execute <<~SQL
+      CREATE TRIGGER fact_db_sources_content_vector_update
+        BEFORE INSERT OR UPDATE ON fact_db_sources
+        FOR EACH ROW
+        EXECUTE FUNCTION tsvector_update_trigger(
+          content_vector,
+          'pg_catalog.english',
+          content
+        );
+    SQL
+    # Backfill existing rows (before the index build, as for fact_db_facts)
+    execute <<~SQL
+      UPDATE fact_db_sources SET content_vector = to_tsvector('english', COALESCE(content, ''));
+    SQL
+    execute <<~SQL
+      CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_sources_content_vector
+        ON fact_db_sources
+        USING GIN (content_vector);
+    SQL
+    # Drop the old expression-based GIN indexes (now redundant).
+    # CONCURRENTLY matches the drops in #down and avoids blocking other sessions.
+    execute "DROP INDEX CONCURRENTLY IF EXISTS idx_facts_fulltext;"
+    execute "DROP INDEX CONCURRENTLY IF EXISTS idx_sources_fulltext;"
+  end
+  # Restores the expression-based indexes, then removes the triggers, the
+  # vector indexes and the vector columns. Safe to re-run after a failure.
+  def down
+    # Restore the original expression-based GIN indexes
+    execute <<~SQL
+      CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_facts_fulltext ON fact_db_facts
+        USING gin(to_tsvector('english', text));
+    SQL
+    execute <<~SQL
+      CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_sources_fulltext ON fact_db_sources
+        USING gin(to_tsvector('english', content));
+    SQL
+    # Drop triggers
+    execute <<~SQL
+      DROP TRIGGER IF EXISTS fact_db_facts_text_vector_update ON fact_db_facts;
+    SQL
+    execute <<~SQL
+      DROP TRIGGER IF EXISTS fact_db_sources_content_vector_update ON fact_db_sources;
+    SQL
+    # Drop indexes
+    execute "DROP INDEX CONCURRENTLY IF EXISTS idx_facts_text_vector;"
+    execute "DROP INDEX CONCURRENTLY IF EXISTS idx_sources_content_vector;"
+    # Remove columns
+    remove_column :fact_db_facts, :text_vector, if_exists: true
+    remove_column :fact_db_sources, :content_vector, if_exists: true
+  end
+end

data/docs/architecture/database-schema.md CHANGED Viewed

@@ -18,6 +18,7 @@ erDiagram
         string content_hash UK
         string type
         text content
+        tsvector content_vector
         string title
         string source_uri
         jsonb metadata
@@ -48,6 +49,7 @@ erDiagram
     facts {
         bigint id PK
         text text
+        tsvector text_vector
         string digest
         timestamptz valid_at
         timestamptz invalid_at
@@ -93,6 +95,7 @@ CREATE TABLE sources (
     content_hash VARCHAR(64) NOT NULL UNIQUE,
     type VARCHAR(50) NOT NULL,
     content TEXT NOT NULL,
+    content_vector TSVECTOR,
     title VARCHAR(255),
     source_uri TEXT,
     metadata JSONB NOT NULL DEFAULT '{}',
@@ -103,8 +106,14 @@ CREATE TABLE sources (
 CREATE INDEX idx_sources_type ON sources(type);
 CREATE INDEX idx_sources_captured ON sources(captured_at);
-CREATE INDEX idx_sources_text ON sources USING gin(to_tsvector('english', content));
+CREATE INDEX idx_sources_content_vector ON sources USING GIN(content_vector);
 CREATE INDEX idx_sources_embedding ON sources USING hnsw(embedding vector_cosine_ops);
+-- Trigger keeps content_vector in sync with content
+CREATE TRIGGER sources_content_vector_update
+    BEFORE INSERT OR UPDATE ON sources
+    FOR EACH ROW
+    EXECUTE FUNCTION tsvector_update_trigger(content_vector, 'pg_catalog.english', content);
 ```
 ### entities
@@ -155,6 +164,7 @@ Stores temporal assertions.
 CREATE TABLE facts (
     id BIGSERIAL PRIMARY KEY,
     text TEXT NOT NULL,
+    text_vector TSVECTOR,
     digest VARCHAR(64) NOT NULL,
     valid_at TIMESTAMPTZ NOT NULL,
     invalid_at TIMESTAMPTZ,
@@ -174,8 +184,14 @@ CREATE INDEX idx_facts_valid ON facts(valid_at);
 CREATE INDEX idx_facts_invalid ON facts(invalid_at);
 CREATE INDEX idx_facts_temporal ON facts(valid_at, invalid_at);
 CREATE INDEX idx_facts_method ON facts(extraction_method);
-CREATE INDEX idx_facts_text ON facts USING gin(to_tsvector('english', text));
+CREATE INDEX idx_facts_text_vector ON facts USING GIN(text_vector);
 CREATE INDEX idx_facts_embedding ON facts USING hnsw(embedding vector_cosine_ops);
+-- Trigger keeps text_vector in sync with text
+CREATE TRIGGER facts_text_vector_update
+    BEFORE INSERT OR UPDATE ON facts
+    FOR EACH ROW
+    EXECUTE FUNCTION tsvector_update_trigger(text_vector, 'pg_catalog.english', text);
 ```
 ### entity_mentions

data/examples/001_configuration.rb CHANGED Viewed

@@ -116,6 +116,11 @@ puts <<~DATABASE_CONFIG
     config.ranking.relationship_match_weight: #{config.ranking.relationship_match_weight}
     config.ranking.confidence_weight:         #{config.ranking.confidence_weight}
+  LLM Prompts (configurable templates for extraction):
+    config.prompts.fact_extraction:   #{config.prompts.fact_extraction.lines.first.strip.inspect}...
+    config.prompts.entity_extraction: #{config.prompts.entity_extraction.lines.first.strip.inspect}...
+    config.prompts.rag_system:        #{config.prompts.rag_system.lines.first.strip.inspect}...
   General settings:
     config.default_extractor:     #{config.default_extractor.inspect}
     config.fuzzy_match_threshold: #{config.fuzzy_match_threshold}
@@ -154,6 +159,11 @@ puts <<~ENV_VARS
     export FDB_RANKING__TS_RANK_WEIGHT=0.30
     export FDB_RANKING__VECTOR_SIMILARITY_WEIGHT=0.25
+    # LLM Prompts (multi-line values work with heredocs in shell)
+    export FDB_PROMPTS__FACT_EXTRACTION="Your custom prompt..."
+    export FDB_PROMPTS__ENTITY_EXTRACTION="Your custom prompt..."
+    export FDB_PROMPTS__RAG_SYSTEM="Your custom system prompt..."
     # General settings
     export FDB_DEFAULT_EXTRACTOR=llm
     export FDB_FUZZY_MATCH_THRESHOLD=0.80
@@ -259,6 +269,25 @@ puts <<~CONFIG_FILES
     database:
       password: my_local_password
+  Custom Prompts (in any config file):
+    ---
+    # Override LLM prompts for fact/entity extraction
+    # Prompts use %<text>s as placeholder for input text
+    prompts:
+      fact_extraction: |
+        Extract facts from the following text.
+        Text: %<text>s
+        Return JSON array of facts.
+      entity_extraction: |
+        Extract named entities from the text.
+        Text: %<text>s
+        Return JSON array of entities.
+      rag_system: |
+        You are a helpful assistant with access to a fact database.
+        Use the provided context to answer questions.
 CONFIG_FILES
 demo_section("Section 7: Database Configuration")
@@ -307,6 +336,10 @@ puts <<~REFERENCE
     config.llm                      - LLM configuration
     config.embedding                - Embedding configuration
     config.ranking                  - Ranking weights
+    config.prompts                  - LLM prompt templates
+    config.prompts.fact_extraction  - Fact extraction prompt
+    config.prompts.entity_extraction - Entity extraction prompt
+    config.prompts.rag_system       - RAG system prompt
 REFERENCE
 demo_footer("Configuration Demo Complete!")

data/examples/100_query_context.rb CHANGED Viewed

@@ -307,7 +307,7 @@ class QueryContextGenerator
     # Use ts_rank_cd (cover density) for better phrase matching
     sql = <<~SQL
       SELECT id,
-             ts_rank_cd(to_tsvector('english', text),
+             ts_rank_cd(text_vector,
                         plainto_tsquery('english', ?),
                         32) as rank
       FROM fact_db_facts

data/examples/130_rag_feedback_loop.rb CHANGED Viewed

@@ -262,7 +262,7 @@ class RagFeedbackLoop
     sql = <<~SQL
       SELECT id,
-             ts_rank_cd(to_tsvector('english', text),
+             ts_rank_cd(text_vector,
                         plainto_tsquery('english', ?),
                         32) as rank
       FROM fact_db_facts

data/lib/fact_db/models/fact.rb CHANGED Viewed

@@ -133,11 +133,11 @@ module FactDb
       }
       # @!method search_text(query)
-      #   Full-text search on fact text using PostgreSQL tsvector
+      #   Full-text search on fact text using persisted tsvector column
       #   @param query [String] the search query
       #   @return [ActiveRecord::Relation]
       scope :search_text, lambda { |query|
-        where("to_tsvector('english', text) @@ plainto_tsquery('english', ?)", query)
+        where("text_vector @@ plainto_tsquery('english', ?)", query)
       }
       # @!method extracted_by(method)

data/lib/fact_db/models/source.rb CHANGED Viewed

@@ -59,11 +59,11 @@ module FactDb
       scope :captured_before, ->(date) { where("captured_at <= ?", date) }
       # @!method search_text(query)
-      #   Full-text search on source content using PostgreSQL tsvector
+      #   Full-text search on source content using persisted tsvector column
       #   @param query [String] the search query
       #   @return [ActiveRecord::Relation]
       scope :search_text, lambda { |query|
-        where("to_tsvector('english', content) @@ plainto_tsquery('english', ?)", query)
+        where("content_vector @@ plainto_tsquery('english', ?)", query)
       }
       # Finds sources by vector similarity using pgvector

data/lib/fact_db/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module FactDb
-  VERSION = "0.0.3"
+  VERSION = "0.1.0"
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: fact_db
 version: !ruby/object:Gem::Version
-  version: 0.0.3
+  version: 0.1.0
 platform: ruby
 authors:
 - Dewayne VanHoozer
@@ -201,6 +201,7 @@ files:
 - db/migrate/005_create_facts.rb
 - db/migrate/006_create_entity_mentions.rb
 - db/migrate/007_create_fact_sources.rb
+- db/migrate/008_add_persistent_tsvectors.rb
 - docs/api/extractors/index.md
 - docs/api/extractors/llm.md
 - docs/api/extractors/manual.md
@@ -326,7 +327,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 4.0.3
+rubygems_version: 4.0.6
 specification_version: 4
 summary: Temporal fact tracking with entity resolution and audit trails
 test_files: []