claude_memory 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. checksums.yaml +4 -4
  2. data/.claude/.mind.mv2.o2N83S +0 -0
  3. data/.claude/CLAUDE.md +1 -0
  4. data/.claude/rules/claude_memory.generated.md +28 -9
  5. data/.claude/settings.local.json +9 -1
  6. data/.claude/skills/check-memory/SKILL.md +77 -0
  7. data/.claude/skills/improve/SKILL.md +532 -0
  8. data/.claude/skills/improve/feature-patterns.md +1221 -0
  9. data/.claude/skills/quality-update/SKILL.md +229 -0
  10. data/.claude/skills/quality-update/implementation-guide.md +346 -0
  11. data/.claude/skills/review-commit/SKILL.md +199 -0
  12. data/.claude/skills/review-for-quality/SKILL.md +154 -0
  13. data/.claude/skills/review-for-quality/expert-checklists.md +79 -0
  14. data/.claude/skills/setup-memory/SKILL.md +168 -0
  15. data/.claude/skills/study-repo/SKILL.md +307 -0
  16. data/.claude/skills/study-repo/analysis-template.md +323 -0
  17. data/.claude/skills/study-repo/focus-examples.md +327 -0
  18. data/CHANGELOG.md +133 -0
  19. data/CLAUDE.md +130 -11
  20. data/README.md +117 -10
  21. data/db/migrations/001_create_initial_schema.rb +117 -0
  22. data/db/migrations/002_add_project_scoping.rb +33 -0
  23. data/db/migrations/003_add_session_metadata.rb +42 -0
  24. data/db/migrations/004_add_fact_embeddings.rb +20 -0
  25. data/db/migrations/005_add_incremental_sync.rb +21 -0
  26. data/db/migrations/006_add_operation_tracking.rb +40 -0
  27. data/db/migrations/007_add_ingestion_metrics.rb +26 -0
  28. data/docs/.claude/mind.mv2.lock +0 -0
  29. data/docs/GETTING_STARTED.md +587 -0
  30. data/docs/RELEASE_NOTES_v0.2.0.md +0 -1
  31. data/docs/RUBY_COMMUNITY_POST_v0.2.0.md +0 -2
  32. data/docs/architecture.md +9 -8
  33. data/docs/auto_init_design.md +230 -0
  34. data/docs/improvements.md +557 -731
  35. data/docs/influence/.gitkeep +13 -0
  36. data/docs/influence/grepai.md +933 -0
  37. data/docs/influence/qmd.md +2195 -0
  38. data/docs/plugin.md +257 -11
  39. data/docs/quality_review.md +472 -1273
  40. data/docs/remaining_improvements.md +330 -0
  41. data/lefthook.yml +13 -0
  42. data/lib/claude_memory/commands/checks/claude_md_check.rb +41 -0
  43. data/lib/claude_memory/commands/checks/database_check.rb +120 -0
  44. data/lib/claude_memory/commands/checks/hooks_check.rb +112 -0
  45. data/lib/claude_memory/commands/checks/reporter.rb +110 -0
  46. data/lib/claude_memory/commands/checks/snapshot_check.rb +30 -0
  47. data/lib/claude_memory/commands/doctor_command.rb +12 -129
  48. data/lib/claude_memory/commands/help_command.rb +1 -0
  49. data/lib/claude_memory/commands/hook_command.rb +9 -2
  50. data/lib/claude_memory/commands/index_command.rb +169 -0
  51. data/lib/claude_memory/commands/ingest_command.rb +1 -1
  52. data/lib/claude_memory/commands/init_command.rb +5 -197
  53. data/lib/claude_memory/commands/initializers/database_ensurer.rb +30 -0
  54. data/lib/claude_memory/commands/initializers/global_initializer.rb +85 -0
  55. data/lib/claude_memory/commands/initializers/hooks_configurator.rb +156 -0
  56. data/lib/claude_memory/commands/initializers/mcp_configurator.rb +56 -0
  57. data/lib/claude_memory/commands/initializers/memory_instructions_writer.rb +135 -0
  58. data/lib/claude_memory/commands/initializers/project_initializer.rb +111 -0
  59. data/lib/claude_memory/commands/recover_command.rb +75 -0
  60. data/lib/claude_memory/commands/registry.rb +5 -1
  61. data/lib/claude_memory/commands/stats_command.rb +239 -0
  62. data/lib/claude_memory/commands/uninstall_command.rb +226 -0
  63. data/lib/claude_memory/core/batch_loader.rb +32 -0
  64. data/lib/claude_memory/core/concept_ranker.rb +73 -0
  65. data/lib/claude_memory/core/embedding_candidate_builder.rb +37 -0
  66. data/lib/claude_memory/core/fact_collector.rb +51 -0
  67. data/lib/claude_memory/core/fact_query_builder.rb +154 -0
  68. data/lib/claude_memory/core/fact_ranker.rb +113 -0
  69. data/lib/claude_memory/core/result_builder.rb +54 -0
  70. data/lib/claude_memory/core/result_sorter.rb +25 -0
  71. data/lib/claude_memory/core/scope_filter.rb +61 -0
  72. data/lib/claude_memory/core/text_builder.rb +29 -0
  73. data/lib/claude_memory/embeddings/generator.rb +161 -0
  74. data/lib/claude_memory/embeddings/similarity.rb +69 -0
  75. data/lib/claude_memory/hook/handler.rb +4 -3
  76. data/lib/claude_memory/index/lexical_fts.rb +7 -2
  77. data/lib/claude_memory/infrastructure/operation_tracker.rb +158 -0
  78. data/lib/claude_memory/infrastructure/schema_validator.rb +206 -0
  79. data/lib/claude_memory/ingest/content_sanitizer.rb +6 -7
  80. data/lib/claude_memory/ingest/ingester.rb +99 -15
  81. data/lib/claude_memory/ingest/metadata_extractor.rb +57 -0
  82. data/lib/claude_memory/ingest/tool_extractor.rb +71 -0
  83. data/lib/claude_memory/mcp/response_formatter.rb +331 -0
  84. data/lib/claude_memory/mcp/server.rb +19 -0
  85. data/lib/claude_memory/mcp/setup_status_analyzer.rb +73 -0
  86. data/lib/claude_memory/mcp/tool_definitions.rb +279 -0
  87. data/lib/claude_memory/mcp/tool_helpers.rb +80 -0
  88. data/lib/claude_memory/mcp/tools.rb +330 -320
  89. data/lib/claude_memory/recall/dual_query_template.rb +63 -0
  90. data/lib/claude_memory/recall.rb +304 -237
  91. data/lib/claude_memory/resolve/resolver.rb +52 -49
  92. data/lib/claude_memory/store/sqlite_store.rb +210 -144
  93. data/lib/claude_memory/store/store_manager.rb +6 -6
  94. data/lib/claude_memory/sweep/sweeper.rb +6 -0
  95. data/lib/claude_memory/version.rb +1 -1
  96. data/lib/claude_memory.rb +35 -3
  97. metadata +71 -11
  98. data/.claude/.mind.mv2.aLCUZd +0 -0
  99. data/.claude/memory.sqlite3 +0 -0
  100. data/.mcp.json +0 -11
  101. /data/docs/{feature_adoption_plan.md → plans/feature_adoption_plan.md} +0 -0
  102. /data/docs/{feature_adoption_plan_revised.md → plans/feature_adoption_plan_revised.md} +0 -0
  103. /data/docs/{plan.md → plans/plan.md} +0 -0
  104. /data/docs/{updated_plan.md → plans/updated_plan.md} +0 -0
@@ -0,0 +1,161 @@
1
# frozen_string_literal: true

require "digest"

module ClaudeMemory
  module Embeddings
    # Lightweight embedding generator built on a TF-IDF bag-of-words model
    # plus hashed unigram/bigram positional features.
    #
    # Produces unit-length 384-dimensional vectors suitable for cosine
    # similarity. Works without heavy ML dependencies; can later be swapped
    # for a transformer-based model behind the same interface.
    class Generator
      EMBEDDING_DIM = 384

      # Fixed vocabulary of technical/programming terms. Each word owns one
      # slot of the TF-IDF portion of the vector (duplicate words resolve to
      # their last index, matching the Hash construction in #initialize).
      VOCABULARY = %w[
        database framework library module class function method
        api rest graphql http request response server client
        authentication authorization token session cookie jwt
        user admin role permission access control security
        error exception handling validation sanitization
        test spec unit integration end-to-end e2e
        frontend backend fullstack ui ux component
        react vue angular svelte javascript typescript
        ruby python java go rust php elixir
        sql nosql postgresql mysql mongodb redis sqlite
        docker kubernetes container orchestration deployment
        git branch commit merge pull push repository
        configuration environment variable setting preference
        logger logging debug trace info warn error
        cache caching storage persistence state
        async await promise callback thread process
        route routing middleware handler controller
        model view template render component
        form input button submit validation
        dependency injection service factory singleton
        migration schema table column index constraint
        query filter sort pagination limit offset
        create read update delete crud operation
        json xml yaml csv format serialization
        encrypt decrypt hash salt cipher algorithm
        webhook event listener subscriber publisher
        job queue worker background task schedule
        metric monitoring performance optimization
        refactor cleanup technical debt improvement
      ].freeze

      def initialize
        @term_index = VOCABULARY.each_with_index.to_h
        @idf = build_idf_weights
      end

      # Generate a normalized embedding vector for +text+.
      #
      # @param text [String, nil] input text to embed
      # @return [Array<Float>] unit-length EMBEDDING_DIM vector (all zeros
      #   when the input is blank or contains no word characters)
      def generate(text)
        return zero_vector if text.nil? || text.empty?

        tokens = extract_tokens(text.downcase)
        return zero_vector if tokens.empty?

        combined = tfidf_vector(term_frequencies(tokens)) + positional_features(tokens)
        unit_normalize(fit_to_dimension(combined))
      end

      private

      # Simple tokenization: runs of word characters.
      def extract_tokens(text)
        text.scan(/\w+/)
      end

      # Term frequencies normalized by the most frequent token's count.
      def term_frequencies(tokens)
        counts = tokens.tally
        peak = counts.values.max.to_f
        counts.transform_values { |count| count / peak }
      end

      # Project normalized frequencies onto the fixed vocabulary slots,
      # weighting each in-vocabulary term by its IDF (1.0 when unknown).
      def tfidf_vector(frequencies)
        vector = Array.new(VOCABULARY.size, 0.0)
        frequencies.each do |term, tf|
          slot = @term_index[term]
          next if slot.nil?

          vector[slot] = tf * (@idf[term] || 1.0)
        end
        vector
      end

      # Heuristic IDF table: very common stop-words get a low weight, every
      # vocabulary term defaults to a higher weight.
      def build_idf_weights
        stop_words = %w[the is are was were be been being have has had do does did
                        for with from that this these those can could would should
                        will make get set add remove update delete create]
        weights = stop_words.to_h { |word| [word, 0.5] }
        VOCABULARY.each { |term| weights[term] ||= 2.0 }
        weights
      end

      # Hash unigrams (with coarse position buckets) and bigrams into the
      # feature slots left over after the vocabulary, then scale by the
      # largest bucket value so features stay in [0, 1].
      def positional_features(tokens)
        dim = EMBEDDING_DIM - VOCABULARY.size
        features = Array.new(dim, 0.0)

        tokens.each_with_index do |token, position|
          features[slot_for("#{token}_#{position % 10}", dim)] += 1.0
        end

        tokens.each_cons(2) do |left, right|
          features[slot_for("#{left}_#{right}", dim)] += 0.5
        end

        peak = features.max
        return features unless peak && peak.positive?

        features.map { |value| value / peak }
      end

      # Deterministic feature-hashing bucket (MD5 is used purely as a hash
      # function here, not for security).
      def slot_for(key, dim)
        Digest::MD5.hexdigest(key).to_i(16) % dim
      end

      # Defensive sizing: pad with zeros or truncate to EMBEDDING_DIM.
      def fit_to_dimension(vector)
        return vector.first(EMBEDDING_DIM) if vector.size > EMBEDDING_DIM

        vector + Array.new(EMBEDDING_DIM - vector.size, 0.0)
      end

      # Scale to unit length so cosine similarity reduces to a dot product.
      def unit_normalize(vector)
        norm = Math.sqrt(vector.sum { |value| value * value })
        norm.zero? ? vector : vector.map { |value| value / norm }
      end

      def zero_vector
        Array.new(EMBEDDING_DIM, 0.0)
      end
    end
  end
end
@@ -0,0 +1,69 @@
1
# frozen_string_literal: true

module ClaudeMemory
  module Embeddings
    # Cosine-similarity helpers for comparing embedding vectors.
    # All methods assume vectors are already normalized to unit length.
    class Similarity
      class << self
        # Cosine similarity between two unit-length vectors.
        # For normalized vectors this is simply the dot product, clamped to
        # [0, 1] to absorb floating-point error.
        #
        # @param vec_a [Array<Float>] first vector
        # @param vec_b [Array<Float>] second vector
        # @return [Float] similarity score between 0 and 1
        def cosine(vec_a, vec_b)
          return 0.0 if vec_a.nil? || vec_b.nil?
          return 0.0 if vec_a.empty? || vec_b.empty?

          dot = vec_a.each_with_index.sum { |component, i| component * vec_b[i] }
          dot.clamp(0.0, 1.0)
        end

        # Find the K candidates most similar to the query.
        #
        # @param query_vector [Array<Float>] query embedding
        # @param candidates [Array<Hash>] hashes carrying an :embedding key
        # @param k [Integer] number of top results to return
        # @return [Array<Hash>] up to K entries of {candidate:, similarity:},
        #   highest similarity first
        def top_k(query_vector, candidates, k)
          return [] if candidates.empty?

          candidates
            .map { |candidate| {candidate: candidate, similarity: cosine(query_vector, candidate[:embedding])} }
            .sort_by { |entry| -entry[:similarity] }
            .take(k)
        end

        # Average similarity of a query against several target vectors.
        # Useful for multi-concept queries.
        #
        # @param query_vector [Array<Float>] query embedding
        # @param target_vectors [Array<Array<Float>>] target embeddings
        # @return [Float] mean similarity (0.0 when there are no targets)
        def average_similarity(query_vector, target_vectors)
          return 0.0 if target_vectors.empty?

          scores = batch_similarities(query_vector, target_vectors)
          scores.sum / scores.size.to_f
        end

        # Similarities between one query and many candidates, preserving
        # candidate order.
        #
        # @param query_vector [Array<Float>] query embedding
        # @param candidate_vectors [Array<Array<Float>>] candidate embeddings
        # @return [Array<Float>] one score per candidate
        def batch_similarities(query_vector, candidate_vectors)
          candidate_vectors.map { |vec| cosine(query_vector, vec) }
        end
      end
    end
  end
end
@@ -9,13 +9,14 @@ module ClaudeMemory
9
9
 
10
10
  def initialize(store, env: ENV)
11
11
  @store = store
12
+ @config = Configuration.new(env)
12
13
  @env = env
13
14
  end
14
15
 
15
16
  def ingest(payload)
16
- session_id = payload["session_id"] || @env["CLAUDE_SESSION_ID"]
17
- transcript_path = payload["transcript_path"] || @env["CLAUDE_TRANSCRIPT_PATH"]
18
- project_path = payload["project_path"] || @env["CLAUDE_PROJECT_DIR"] || Dir.pwd
17
+ session_id = payload["session_id"] || @config.session_id
18
+ transcript_path = payload["transcript_path"] || @config.transcript_path
19
+ project_path = payload["project_path"] || @config.project_dir
19
20
 
20
21
  raise PayloadError, "Missing required field: session_id" if session_id.nil? || session_id.empty?
21
22
  raise PayloadError, "Missing required field: transcript_path" if transcript_path.nil? || transcript_path.empty?
@@ -6,10 +6,11 @@ module ClaudeMemory
6
6
  def initialize(store)
7
7
  @store = store
8
8
  @db = store.db
9
- ensure_fts_table!
9
+ @fts_table_ensured = false
10
10
  end
11
11
 
12
12
  def index_content_item(content_item_id, text)
13
+ ensure_fts_table!
13
14
  existing = @db[:content_fts].where(content_item_id: content_item_id).get(:content_item_id)
14
15
  return if existing
15
16
 
@@ -17,6 +18,7 @@ module ClaudeMemory
17
18
  end
18
19
 
19
20
  def search(query, limit: 20)
21
+ ensure_fts_table!
20
22
  return [] if query.nil? || query.strip.empty?
21
23
 
22
24
  if query.strip == "*"
@@ -48,10 +50,13 @@ module ClaudeMemory
48
50
  private
49
51
 
50
52
  def ensure_fts_table!
53
+ return if @fts_table_ensured
54
+
51
55
  @db.run(<<~SQL)
52
- CREATE VIRTUAL TABLE IF NOT EXISTS content_fts
56
+ CREATE VIRTUAL TABLE IF NOT EXISTS content_fts
53
57
  USING fts5(content_item_id UNINDEXED, text, tokenize='porter unicode61')
54
58
  SQL
59
+ @fts_table_ensured = true
55
60
  end
56
61
  end
57
62
  end
@@ -0,0 +1,158 @@
1
# frozen_string_literal: true

require "json"
require "time"

module ClaudeMemory
  module Infrastructure
    # Tracks long-running operations with checkpoints for resumability.
    # Detects stuck operations (running for more than 24h) and provides
    # recovery mechanisms.
    #
    # Checkpoint payloads are stored as JSON strings in
    # operation_progress.checkpoint_data; error details are recorded under
    # the "error" key whenever an operation fails or is reset.
    class OperationTracker
      STALE_THRESHOLD_SECONDS = 86400 # 24 hours

      # @param store [#db] storage object exposing a Sequel database
      def initialize(store)
        @store = store
      end

      # Start tracking a new operation. Any stale (>24h) running operations
      # with the same type/scope are first marked failed, so at most one
      # fresh run is "running" per type/scope.
      #
      # @return [Integer] the new operation's id
      def start_operation(operation_type:, scope:, total_items: nil, checkpoint_data: {})
        cleanup_stale_operations!(operation_type, scope)

        @store.db[:operation_progress].insert(
          operation_type: operation_type,
          scope: scope,
          status: "running",
          total_items: total_items,
          processed_items: 0,
          checkpoint_data: checkpoint_data.to_json,
          started_at: Time.now.utc.iso8601,
          completed_at: nil
        )
      end

      # Update progress, optionally replacing the checkpoint payload.
      def update_progress(operation_id, processed_items:, checkpoint_data: nil)
        updates = {processed_items: processed_items}
        updates[:checkpoint_data] = checkpoint_data.to_json if checkpoint_data
        @store.db[:operation_progress].where(id: operation_id).update(updates)
      end

      # Mark operation as completed.
      def complete_operation(operation_id)
        @store.db[:operation_progress].where(id: operation_id).update(
          status: "completed",
          completed_at: Time.now.utc.iso8601
        )
      end

      # Mark operation as failed, preserving the existing checkpoint and
      # recording the error message under the "error" key (string key, for
      # consistency with the recovery paths in mark_failed!).
      def fail_operation(operation_id, error_message)
        raw = @store.db[:operation_progress].where(id: operation_id).get(:checkpoint_data)
        checkpoint = raw ? JSON.parse(raw) : {}
        checkpoint["error"] = error_message

        @store.db[:operation_progress].where(id: operation_id).update(
          status: "failed",
          completed_at: Time.now.utc.iso8601,
          checkpoint_data: JSON.generate(checkpoint)
        )
      end

      # Checkpoint data for resuming the most recent non-stale (<24h)
      # running operation of the given type/scope.
      #
      # @return [Hash, nil] {operation_id:, checkpoint_data:,
      #   processed_items:, total_items:, started_at:} or nil when nothing
      #   is resumable
      def get_checkpoint(operation_type:, scope:)
        threshold_time = stale_threshold_iso8601

        op = @store.db[:operation_progress]
          .where(operation_type: operation_type, scope: scope, status: "running")
          .where { started_at >= threshold_time } # exclude stale operations
          .order(Sequel.desc(:started_at))
          .first

        return nil unless op

        checkpoint_data = op[:checkpoint_data] ? JSON.parse(op[:checkpoint_data], symbolize_names: true) : {}
        {
          operation_id: op[:id],
          checkpoint_data: checkpoint_data,
          processed_items: op[:processed_items] || 0,
          total_items: op[:total_items],
          started_at: op[:started_at]
        }
      end

      # All operations stuck in "running" status for more than 24h.
      def stuck_operations
        threshold_time = stale_threshold_iso8601

        @store.db[:operation_progress]
          .where(status: "running")
          .where { started_at < threshold_time }
          .all
      end

      # Reset stuck operations (optionally filtered by type/scope) to
      # "failed" status.
      #
      # @return [Integer] number of operations reset
      def reset_stuck_operations(operation_type: nil, scope: nil)
        dataset = @store.db[:operation_progress].where(status: "running")
        dataset = dataset.where(operation_type: operation_type) if operation_type
        dataset = dataset.where(scope: scope) if scope

        threshold_time = stale_threshold_iso8601
        # Fetch once and reuse: avoids a separate COUNT query that could
        # disagree with the rows actually updated.
        stuck = dataset.where { started_at < threshold_time }.all

        mark_failed!(stuck, "Reset by recover command - operation exceeded 24h timeout")
        stuck.size
      end

      private

      # Mark stale running operations of this type/scope as failed before a
      # new operation begins.
      def cleanup_stale_operations!(operation_type, scope)
        threshold_time = stale_threshold_iso8601

        stale = @store.db[:operation_progress]
          .where(operation_type: operation_type, scope: scope, status: "running")
          .where { started_at < threshold_time }
          .all

        mark_failed!(stale, "Automatically marked as failed - operation exceeded 24h timeout")
      end

      # Shared failure path: merge the error message into each operation's
      # checkpoint (updated in Ruby, since the JSON blob cannot be patched
      # in place via SQL here) and flip its status to failed.
      def mark_failed!(operations, error_message)
        now = Time.now.utc.iso8601

        operations.each do |op|
          checkpoint = op[:checkpoint_data] ? JSON.parse(op[:checkpoint_data]) : {}
          checkpoint["error"] = error_message

          @store.db[:operation_progress]
            .where(id: op[:id])
            .update(
              status: "failed",
              completed_at: now,
              checkpoint_data: JSON.generate(checkpoint)
            )
        end
      end

      # ISO8601 timestamp marking the staleness cutoff (now - 24h).
      def stale_threshold_iso8601
        (Time.now.utc - STALE_THRESHOLD_SECONDS).iso8601
      end
    end
  end
end
@@ -0,0 +1,206 @@
1
# frozen_string_literal: true

require "json"

module ClaudeMemory
  module Infrastructure
    # Validates database schema integrity and data consistency, recording
    # each validation result in the schema_health table.
    class SchemaValidator
      EXPECTED_TABLES = %i[
        meta content_items delta_cursors entities entity_aliases facts
        provenance fact_links conflicts tool_calls
        operation_progress schema_health
      ].freeze

      # The FTS table is created lazily, so its absence is not an issue.
      # (Documentation only — not checked by #validate.)
      OPTIONAL_TABLES = %i[content_fts].freeze

      CRITICAL_COLUMNS = {
        facts: %i[id subject_entity_id predicate status scope project_path embedding_json],
        content_items: %i[id source session_id text_hash ingested_at source_mtime],
        entities: %i[id type canonical_name slug],
        operation_progress: %i[id operation_type scope status started_at]
      }.freeze

      CRITICAL_INDEXES = %i[
        idx_facts_predicate idx_facts_subject idx_facts_status idx_facts_scope
        idx_facts_project idx_provenance_fact idx_content_items_session
        idx_operation_progress_type idx_operation_progress_status
      ].freeze

      # Expected embedding vector size; keep in sync with
      # Embeddings::Generator::EMBEDDING_DIM.
      EXPECTED_EMBEDDING_DIM = 384

      # Number of facts sampled when spot-checking embedding dimensions.
      EMBEDDING_SAMPLE_SIZE = 10

      # @param store [#db] storage object exposing a Sequel database
      def initialize(store)
        @store = store
      end

      # Run all structural and data-consistency checks, persist the result
      # to schema_health, and return a summary.
      #
      # @return [Hash] {valid: Boolean, issues: Array<Hash>} where each issue
      #   is {severity: "error"|"warning", message: String}; valid is false
      #   only when at least one error-severity issue was found
      def validate
        issues = []

        # Structural checks: tables, columns, indexes
        tables = @store.db.tables
        (EXPECTED_TABLES - tables).each do |table|
          issues << {severity: "error", message: "Missing table: #{table}"}
        end

        CRITICAL_COLUMNS.each do |table, columns|
          next unless tables.include?(table)

          existing_columns = @store.db.schema(table).map(&:first)
          (columns - existing_columns).each do |column|
            issues << {severity: "error", message: "Missing column #{table}.#{column}"}
          end
        end

        index_names = @store.db["SELECT name FROM sqlite_master WHERE type='index'"]
          .all.map { |r| r[:name] }
        (CRITICAL_INDEXES - index_names.map(&:to_sym)).each do |index|
          issues << {severity: "warning", message: "Missing index: #{index}"}
        end

        # Referential integrity
        check_orphaned_provenance(issues)
        check_orphaned_fact_links(issues)
        check_orphaned_tool_calls(issues)

        # Enum sanity
        check_invalid_fact_scopes(issues)
        check_invalid_fact_status(issues)
        check_invalid_operation_status(issues)

        # Stored embedding sanity
        check_embedding_dimensions(issues)

        record_health_check(issues)

        {
          valid: issues.none? { |i| i[:severity] == "error" },
          issues: issues
        }
      end

      private

      # Provenance rows whose fact no longer exists.
      def check_orphaned_provenance(issues)
        orphaned = @store.db[:provenance]
          .left_join(:facts, id: :fact_id)
          .where(Sequel[:facts][:id] => nil)
          .count

        if orphaned > 0
          issues << {severity: "error", message: "#{orphaned} orphaned provenance record(s) without corresponding facts"}
        end
      end

      # Fact links whose from- or to-fact no longer exists.
      # NOTE: a link orphaned on both sides is counted twice; the total is a
      # severity signal, not an exact row count.
      def check_orphaned_fact_links(issues)
        orphaned_from = @store.db[:fact_links]
          .left_join(:facts, id: :from_fact_id)
          .where(Sequel[:facts][:id] => nil)
          .count

        orphaned_to = @store.db[:fact_links]
          .left_join(Sequel[:facts].as(:to_facts), id: :to_fact_id)
          .where(Sequel[:to_facts][:id] => nil)
          .count

        total_orphaned = orphaned_from + orphaned_to
        if total_orphaned > 0
          issues << {severity: "error", message: "#{total_orphaned} orphaned fact_links record(s)"}
        end
      end

      # Tool calls whose content item no longer exists (warning only).
      def check_orphaned_tool_calls(issues)
        orphaned = @store.db[:tool_calls]
          .left_join(:content_items, id: :content_item_id)
          .where(Sequel[:content_items][:id] => nil)
          .count

        if orphaned > 0
          issues << {severity: "warning", message: "#{orphaned} orphaned tool_calls record(s) without corresponding content_items"}
        end
      end

      # Facts must be scoped either "global" or "project".
      def check_invalid_fact_scopes(issues)
        invalid = @store.facts
          .where(Sequel.~(scope: %w[global project]))
          .count

        if invalid > 0
          issues << {severity: "error", message: "#{invalid} fact(s) with invalid scope (must be 'global' or 'project')"}
        end
      end

      # Non-standard fact statuses are tolerated but flagged.
      def check_invalid_fact_status(issues)
        valid_statuses = %w[active superseded]
        invalid = @store.facts
          .where(Sequel.~(status: valid_statuses))
          .count

        if invalid > 0
          issues << {severity: "warning", message: "#{invalid} fact(s) with non-standard status"}
        end
      end

      # Operation rows must carry one of the known lifecycle statuses.
      def check_invalid_operation_status(issues)
        return unless @store.db.tables.include?(:operation_progress)

        valid_statuses = %w[running completed failed]
        invalid = @store.operation_progress
          .where(Sequel.~(status: valid_statuses))
          .count

        if invalid > 0
          issues << {severity: "error", message: "#{invalid} operation(s) with invalid status"}
        end
      end

      # Spot-check a small sample of stored embeddings for the expected
      # vector size; stops at the first bad row to keep the report short.
      def check_embedding_dimensions(issues)
        sample = @store.facts
          .where(Sequel.~(embedding_json: nil))
          .select(:id, :embedding_json)
          .limit(EMBEDDING_SAMPLE_SIZE)

        sample.each do |fact|
          begin
            embedding = JSON.parse(fact[:embedding_json])
          rescue JSON::ParserError
            # Identify the offending row instead of a generic column-level error.
            issues << {severity: "error", message: "Fact #{fact[:id]} has invalid JSON in embedding_json column"}
            break
          end

          if embedding.size != EXPECTED_EMBEDDING_DIM
            issues << {severity: "error", message: "Fact #{fact[:id]} has embedding with incorrect dimensions (#{embedding.size}, expected #{EXPECTED_EMBEDDING_DIM})"}
            break # only report the first occurrence
          end
        end
      end

      # Persist the validation outcome plus a per-table row-count snapshot.
      def record_health_check(issues)
        now = Time.now.utc.iso8601
        version = @store.schema_version

        table_counts = {}
        @store.db.tables.each do |table|
          table_counts[table.to_s] = @store.db[table].count
        end

        validation_status = if issues.any? { |i| i[:severity] == "error" }
          "corrupt"
        elsif issues.any?
          "degraded"
        else
          "healthy"
        end

        @store.schema_health.insert(
          checked_at: now,
          schema_version: version,
          validation_status: validation_status,
          issues_json: issues.to_json,
          table_counts_json: table_counts.to_json
        )
      end
    end
  end
end
@@ -2,22 +2,21 @@
2
2
 
3
3
  module ClaudeMemory
4
4
  module Ingest
5
+ # Strips privacy tags from transcript content before ingestion.
6
+ #
7
+ # Note: No tag count limit is enforced. The regex pattern /<tag>.*?<\/tag>/m
8
+ # is provably safe from ReDoS (non-greedy matching with clear delimiters).
9
+ # Performance is O(n) and excellent even with 1000+ tags (~0.6ms).
10
+ # Long Claude sessions legitimately accumulate many tags (100-200+).
5
11
  class ContentSanitizer
6
12
  SYSTEM_TAGS = ["claude-memory-context"].freeze
7
13
  USER_TAGS = ["private", "no-memory", "secret"].freeze
8
- MAX_TAG_COUNT = 100
9
14
 
10
15
  def self.strip_tags(text)
11
16
  tags = Pure.all_tags
12
- validate_tag_count!(text, tags)
13
17
  Pure.strip_tags(text, tags)
14
18
  end
15
19
 
16
- def self.validate_tag_count!(text, tags)
17
- count = Pure.count_tags(text, tags)
18
- raise Error, "Too many privacy tags (#{count}), possible ReDoS attack" if count > MAX_TAG_COUNT
19
- end
20
-
21
20
  module Pure
22
21
  def self.all_tags
23
22
  @all_tags ||= begin