RubyGems - htm - Versions diffs - 0.0.11 → 0.0.14 - Mend

htm 0.0.11 → 0.0.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (74) hide show

checksums.yaml +4 -4
data/.dictate.toml +46 -0
data/.envrc +2 -0
data/CHANGELOG.md +52 -2
data/README.md +79 -0
data/Rakefile +14 -2
data/bin/htm_mcp.rb +94 -0
data/config/database.yml +20 -13
data/db/migrate/00010_add_soft_delete_to_associations.rb +29 -0
data/db/migrate/00011_add_performance_indexes.rb +21 -0
data/db/migrate/00012_add_tags_trigram_index.rb +18 -0
data/db/migrate/00013_enable_lz4_compression.rb +43 -0
data/db/schema.sql +49 -92
data/docs/api/index.md +1 -1
data/docs/api/yard/HTM.md +2 -4
data/docs/architecture/index.md +1 -1
data/docs/development/index.md +1 -1
data/docs/getting-started/index.md +1 -1
data/docs/guides/index.md +1 -1
data/docs/images/telemetry-architecture.svg +153 -0
data/docs/telemetry.md +391 -0
data/examples/README.md +46 -1
data/examples/cli_app/README.md +1 -1
data/examples/cli_app/htm_cli.rb +1 -1
data/examples/sinatra_app/app.rb +1 -1
data/examples/telemetry/README.md +147 -0
data/examples/telemetry/SETUP_README.md +169 -0
data/examples/telemetry/demo.rb +498 -0
data/examples/telemetry/grafana/dashboards/htm-metrics.json +457 -0
data/lib/htm/configuration.rb +261 -70
data/lib/htm/database.rb +46 -22
data/lib/htm/embedding_service.rb +24 -14
data/lib/htm/errors.rb +15 -1
data/lib/htm/jobs/generate_embedding_job.rb +19 -0
data/lib/htm/jobs/generate_propositions_job.rb +103 -0
data/lib/htm/jobs/generate_tags_job.rb +24 -0
data/lib/htm/loaders/markdown_chunker.rb +79 -0
data/lib/htm/loaders/markdown_loader.rb +41 -15
data/lib/htm/long_term_memory/fulltext_search.rb +138 -0
data/lib/htm/long_term_memory/hybrid_search.rb +324 -0
data/lib/htm/long_term_memory/node_operations.rb +209 -0
data/lib/htm/long_term_memory/relevance_scorer.rb +355 -0
data/lib/htm/long_term_memory/robot_operations.rb +34 -0
data/lib/htm/long_term_memory/tag_operations.rb +428 -0
data/lib/htm/long_term_memory/vector_search.rb +109 -0
data/lib/htm/long_term_memory.rb +51 -1153
data/lib/htm/models/node.rb +35 -2
data/lib/htm/models/node_tag.rb +31 -0
data/lib/htm/models/robot_node.rb +31 -0
data/lib/htm/models/tag.rb +44 -0
data/lib/htm/proposition_service.rb +169 -0
data/lib/htm/query_cache.rb +214 -0
data/lib/htm/sql_builder.rb +178 -0
data/lib/htm/tag_service.rb +16 -6
data/lib/htm/tasks.rb +8 -2
data/lib/htm/telemetry.rb +224 -0
data/lib/htm/version.rb +1 -1
data/lib/htm.rb +64 -3
data/lib/tasks/doc.rake +1 -1
data/lib/tasks/htm.rake +259 -13
data/mkdocs.yml +96 -96
metadata +42 -16
data/.aigcm_msg +0 -1
data/.claude/settings.local.json +0 -95
data/CLAUDE.md +0 -603
data/examples/cli_app/temp.log +0 -93
data/lib/htm/loaders/paragraph_chunker.rb +0 -112
data/notes/ARCHITECTURE_REVIEW.md +0 -1167
data/notes/IMPLEMENTATION_SUMMARY.md +0 -606
data/notes/MULTI_FRAMEWORK_IMPLEMENTATION.md +0 -451
data/notes/next_steps.md +0 -100
data/notes/plan.md +0 -627
data/notes/tag_ontology_enhancement_ideas.md +0 -222
data/notes/timescaledb_removal_summary.md +0 -200

data/lib/htm/long_term_memory/hybrid_search.rb ADDED Viewed

@@ -0,0 +1,324 @@
+# frozen_string_literal: true
+class HTM
+  class LongTermMemory
+    # Hybrid search combining full-text and vector similarity
+    #
+    # Performs combined search using:
+    # 1. Full-text search for content matching
+    # 2. Tag matching for categorical relevance
+    # 3. Vector similarity for semantic ranking
+    #
+    # Nodes without embeddings are included with a default similarity score,
+    # allowing newly created nodes to appear immediately before background
+    # jobs complete their embedding generation.
+    #
+    # Results are cached for performance.
+    #
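+    # Security: Caller-supplied values (query text, tag names, limits, embeddings)
+    # are bound through parameterized placeholders. Filter fragments built by
+    # HTM::SqlBuilder are interpolated into the SQL text and must be made safe there.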
+    #
+    module HybridSearch
+      # Maximum results to prevent DoS via unbounded queries
+      MAX_HYBRID_LIMIT = 1000
+      MAX_PREFILTER_LIMIT = 5000
+      # Hybrid search (full-text + vector)
+      #
+      # @param timeframe [Range] Time range to search
+      # @param query [String] Search query
+      # @param limit [Integer] Maximum results (capped at MAX_HYBRID_LIMIT)
+      # @param embedding_service [Object] Service to generate embeddings
+      # @param prefilter_limit [Integer] Candidates to consider (default: 100, capped at MAX_PREFILTER_LIMIT)
+      # @param metadata [Hash] Filter by metadata fields (default: {})
+      # @return [Array<Hash>] Matching nodes
+      #
+      def search_hybrid(timeframe:, query:, limit:, embedding_service:, prefilter_limit: 100, metadata: {})
+        # Enforce limits to prevent DoS
+        safe_limit = [[limit.to_i, 1].max, MAX_HYBRID_LIMIT].min
+        safe_prefilter = [[prefilter_limit.to_i, 1].max, MAX_PREFILTER_LIMIT].min
+        start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+        result = @cache.fetch(:hybrid, timeframe, query, safe_limit, safe_prefilter, metadata) do
+          search_hybrid_uncached(
+            timeframe: timeframe,
+            query: query,
+            limit: safe_limit,
+            embedding_service: embedding_service,
+            prefilter_limit: safe_prefilter,
+            metadata: metadata
+          )
+        end
+        elapsed_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round
+        HTM::Telemetry.search_latency.record(elapsed_ms, attributes: { 'strategy' => 'hybrid' })
+        result
+      end
+      private
+      # Threshold for skipping tag extraction (as ratio of limit)
+      # If fulltext returns >= this ratio of requested results, skip expensive tag extraction
+      TAG_EXTRACTION_THRESHOLD = 0.5
+      # Uncached hybrid search
+      #
+      # Embeds the query client-side, then ranks candidates gathered from:
+      # 1. Full-text search for content matching
+      # 2. Tag matching for categorical relevance (only attempted when the
+      #    full-text match count is below limit * TAG_EXTRACTION_THRESHOLD)
+      # 3. Vector similarity for semantic ranking
+      #
+      # Every returned node has its access tracked via track_access.
+      #
+      # @param timeframe [nil, Range, Array<Range>] Time range(s) to search (nil = no filter)
+      # @param query [String] Search query
+      # @param limit [Integer] Maximum results
+      # @param embedding_service [Object] Service to generate query embedding
+      # @param prefilter_limit [Integer] Candidates to consider
+      # @param metadata [Hash] Filter by metadata fields (default: {})
+      # @return [Array<Hash>] Matching nodes with similarity and tag_boost scores;
+      #   empty when the embedding service returns an unusable embedding
+      #
+      def search_hybrid_uncached(timeframe:, query:, limit:, embedding_service:, prefilter_limit:, metadata: {})
+        # The query vector is produced client-side
+        query_embedding = embedding_service.embed(query)
+        # Bail out early when the service did not hand back a usable array
+        if !query_embedding.is_a?(Array) || query_embedding.none?
+          HTM.logger.error("Invalid embedding returned from embedding service")
+          return []
+        end
+        # Pad to the expected width, then turn into a validated SQL literal string
+        embedding_str = HTM::SqlBuilder.sanitize_embedding(
+          HTM::SqlBuilder.pad_embedding(query_embedding)
+        )
+        # Filter fragment for queries that alias the nodes table as "n" (CTE form)
+        aliased_filters = [
+          HTM::SqlBuilder.timeframe_condition(timeframe, table_alias: 'n'),
+          HTM::SqlBuilder.metadata_condition(metadata, table_alias: 'n')
+        ].select { |clause| clause }
+        additional_sql = aliased_filters.empty? ? "" : "AND #{aliased_filters.join(' AND ')}"
+        # Same filters for queries that reference the nodes table without an alias
+        bare_filters = [
+          HTM::SqlBuilder.timeframe_condition(timeframe),
+          HTM::SqlBuilder.metadata_condition(metadata)
+        ].select { |clause| clause }
+        additional_sql_bare = bare_filters.empty? ? "" : "AND #{bare_filters.join(' AND ')}"
+        # OPTIMIZATION: lazy tag extraction.
+        # Count full-text hits first; tag extraction (described upstream as an
+        # expensive LLM call) only runs when full-text alone looks insufficient.
+        fulltext_count = count_fulltext_matches(
+          query: query,
+          additional_sql_bare: additional_sql_bare,
+          limit: prefilter_limit
+        )
+        needs_tags = fulltext_count < (limit * TAG_EXTRACTION_THRESHOLD)
+        matching_tags = needs_tags ? find_query_matching_tags(query) : []
+        # NOTE: Both query variants give nodes without an embedding a default
+        # similarity of 0.5, so freshly created nodes can surface through
+        # full-text matching before background jobs have embedded them.
+        rows = if matching_tags.none?
+          search_hybrid_without_tags(
+            query: query,
+            embedding_str: embedding_str,
+            additional_sql_bare: additional_sql_bare,
+            prefilter_limit: prefilter_limit,
+            limit: limit
+          )
+        else
+          search_hybrid_with_tags(
+            query: query,
+            embedding_str: embedding_str,
+            matching_tags: matching_tags,
+            additional_sql: additional_sql,
+            prefilter_limit: prefilter_limit,
+            limit: limit
+          )
+        end
+        # Record an access for every node that is about to be returned
+        track_access(rows.map { |row| row['id'] })
+        rows.to_a
+      end
+      # Count fulltext matches quickly (for lazy tag extraction decision)
+      #
+      # @param query [String] Search query
+      # @param additional_sql_bare [String] Additional SQL conditions
+      # @param limit [Integer] Maximum to count up to
+      # @return [Integer] Number of fulltext matches (capped at limit)
+      #
+      def count_fulltext_matches(query:, additional_sql_bare:, limit:)
+        sql = <<~SQL
+          SELECT COUNT(*) FROM (
+            SELECT 1 FROM nodes
+            WHERE deleted_at IS NULL
+            AND to_tsvector('english', content) @@ plainto_tsquery('english', ?)
+            #{additional_sql_bare}
+            LIMIT ?
+          ) AS limited_count
+        SQL
+        result = ActiveRecord::Base.connection.select_value(
+          ActiveRecord::Base.sanitize_sql_array([sql, query, limit])
+        )
+        result.to_i
+      end
+      # Hybrid search with tag matching
+      #
+      # Gathers candidates from two sources (full-text matches and nodes that
+      # carry any of +matching_tags+), de-duplicates them with UNION, and scores
+      # each candidate as:
+      #
+      #   combined_score = similarity * 0.7 + tag_boost * 0.3
+      #
+      # where +similarity+ is 1 minus the +<=>+ vector distance to the query
+      # embedding (0.5 when the node has no embedding) and +tag_boost+ is the number
+      # of distinct matching tag names on the node divided by matching_tags.length.
+      #
+      # Uses parameterized queries and LEFT JOIN for efficient tag boosting.
+      #
+      # @param query [String] Search query
+      # @param embedding_str [String] Sanitized embedding string
+      # @param matching_tags [Array<String>] Tags matching the query (callers in this
+      #   file only pass a non-empty array)
+      # @param additional_sql [String] Additional SQL conditions (aliased with "n"),
+      #   interpolated as-is into both candidate CTEs
+      # @param prefilter_limit [Integer] Candidates to consider (applied to each
+      #   candidate CTE separately)
+      # @param limit [Integer] Maximum results
+      # @return [ActiveRecord::Result] Query results
+      #
+      def search_hybrid_with_tags(query:, embedding_str:, matching_tags:, additional_sql:, prefilter_limit:, limit:)
+        # Build tag placeholders for parameterized query
+        # (one "?" per tag; the same list is interpolated into two IN clauses)
+        tag_placeholders = matching_tags.map { '?' }.join(', ')
+        # Float so the tag_boost division in SQL is not integer division
+        tag_count = matching_tags.length.to_f
+        # Use parameterized query with proper placeholder binding
+        # LEFT JOIN against the pre-aggregated tag_counts CTE replaces a per-row
+        # correlated subquery.
+        # NOTE(review): the candidate CTEs use LIMIT without ORDER BY, so which rows
+        # are kept when more than prefilter_limit match is up to the database.
+        # NOTE(review): binding below is positional; this assumes additional_sql
+        # contains no "?" characters of its own - confirm against HTM::SqlBuilder.
+        sql = <<~SQL
+          WITH fulltext_candidates AS (
+            -- Nodes matching full-text search (with or without embeddings)
+            SELECT n.id, n.content, n.access_count, n.created_at, n.token_count, n.embedding
+            FROM nodes n
+            WHERE n.deleted_at IS NULL
+            AND to_tsvector('english', n.content) @@ plainto_tsquery('english', ?)
+            #{additional_sql}
+            LIMIT ?
+          ),
+          tag_candidates AS (
+            -- Nodes matching relevant tags (with or without embeddings)
+            SELECT n.id, n.content, n.access_count, n.created_at, n.token_count, n.embedding
+            FROM nodes n
+            JOIN node_tags nt ON nt.node_id = n.id
+            JOIN tags t ON t.id = nt.tag_id
+            WHERE n.deleted_at IS NULL
+            AND t.name IN (#{tag_placeholders})
+            #{additional_sql}
+            LIMIT ?
+          ),
+          all_candidates AS (
+            SELECT * FROM fulltext_candidates
+            UNION
+            SELECT * FROM tag_candidates
+          ),
+          tag_counts AS (
+            -- Pre-compute tag counts using JOIN instead of correlated subquery
+            SELECT nt.node_id, COUNT(DISTINCT t.name)::float AS matched_tags
+            FROM node_tags nt
+            JOIN tags t ON t.id = nt.tag_id
+            WHERE t.name IN (#{tag_placeholders})
+            GROUP BY nt.node_id
+          ),
+          scored AS (
+            SELECT
+              ac.id, ac.content, ac.access_count, ac.created_at, ac.token_count,
+              CASE
+                WHEN ac.embedding IS NOT NULL THEN 1 - (ac.embedding <=> ?::vector)
+                ELSE 0.5
+              END as similarity,
+              COALESCE(tc.matched_tags / ?, 0) as tag_boost
+            FROM all_candidates ac
+            LEFT JOIN tag_counts tc ON tc.node_id = ac.id
+          )
+          SELECT id, content, access_count, created_at, token_count,
+                 similarity, tag_boost,
+                 (similarity * 0.7 + tag_boost * 0.3) as combined_score
+          FROM scored
+          ORDER BY combined_score DESC
+          LIMIT ?
+        SQL
+        # Build parameter array: query, prefilter, tags (first IN), prefilter, tags (second IN), embedding, tag_count, limit
+        # The order here must mirror the order of the "?" placeholders in the SQL above.
+        params = [
+          query,
+          prefilter_limit,
+          *matching_tags,
+          prefilter_limit,
+          *matching_tags,
+          embedding_str,
+          tag_count,
+          limit
+        ]
+        ActiveRecord::Base.connection.select_all(
+          ActiveRecord::Base.sanitize_sql_array([sql, *params])
+        )
+      end
+      # Hybrid search without tag matching (fallback)
+      #
+      # @param query [String] Search query
+      # @param embedding_str [String] Sanitized embedding string
+      # @param additional_sql_bare [String] Additional SQL conditions (no alias)
+      # @param prefilter_limit [Integer] Candidates to consider
+      # @param limit [Integer] Maximum results
+      # @return [ActiveRecord::Result] Query results
+      #
+      def search_hybrid_without_tags(query:, embedding_str:, additional_sql_bare:, prefilter_limit:, limit:)
+        # No matching tags, fall back to standard hybrid (fulltext + vector)
+        # Include nodes without embeddings with a default similarity score
+        # Optimized: compute similarity once in CTE, reuse for combined_score
+        sql = <<~SQL
+          WITH candidates AS (
+            SELECT id, content, access_count, created_at, token_count, embedding
+            FROM nodes
+            WHERE deleted_at IS NULL
+            AND to_tsvector('english', content) @@ plainto_tsquery('english', ?)
+            #{additional_sql_bare}
+            LIMIT ?
+          ),
+          scored AS (
+            SELECT id, content, access_count, created_at, token_count,
+                   CASE
+                     WHEN embedding IS NOT NULL THEN 1 - (embedding <=> ?::vector)
+                     ELSE 0.5
+                   END as similarity
+            FROM candidates
+          )
+          SELECT id, content, access_count, created_at, token_count,
+                 similarity,
+                 0.0 as tag_boost,
+                 similarity as combined_score
+          FROM scored
+          ORDER BY combined_score DESC
+          LIMIT ?
+        SQL
+        ActiveRecord::Base.connection.select_all(
+          ActiveRecord::Base.sanitize_sql_array([
+            sql,
+            query,
+            prefilter_limit,
+            embedding_str,
+            limit
+          ])
+        )
+      end
+    end
+  end
+end

data/lib/htm/long_term_memory/node_operations.rb ADDED Viewed

@@ -0,0 +1,209 @@
+# frozen_string_literal: true
+class HTM
+  class LongTermMemory
+    # Node CRUD operations for LongTermMemory
+    #
+    # Handles creating, reading, updating, and deleting memory nodes with:
+    # - Content deduplication via SHA-256 hash
+    # - Soft delete restoration on duplicate content
+    # - Robot-node linking with remember tracking
+    # - Bulk access tracking
+    #
+    module NodeOperations
+      # Add a node to long-term memory (with deduplication)
+      #
+      # If content already exists (by content_hash), links the robot to the existing
+      # node and updates timestamps. Otherwise creates a new node.
+      #
+      # @param content [String] Conversation message/utterance
+      # @param token_count [Integer] Token count
+      # @param robot_id [Integer] Robot identifier
+      # @param embedding [Array<Float>, nil] Pre-generated embedding vector
+      # @param metadata [Hash] Flexible metadata for the node (default: {})
+      # @return [Hash] { node_id:, is_new:, robot_node: }
+      # @raise [ArgumentError] If metadata is not a Hash
+      #
+      def add(content:, token_count: 0, robot_id:, embedding: nil, metadata: {})
+        # Validate metadata parameter
+        unless metadata.is_a?(Hash)
+          raise ArgumentError, "metadata must be a Hash, got #{metadata.class}"
+        end
+        content_hash = HTM::Models::Node.generate_content_hash(content)
+        # Wrap in transaction to ensure data consistency
+        ActiveRecord::Base.transaction do
+          # Check for existing node with same content (including soft-deleted)
+          # This avoids unique constraint violations on content_hash
+          existing_node = HTM::Models::Node.with_deleted.find_by(content_hash: content_hash)
+          # If found but soft-deleted, restore it
+          if existing_node&.deleted?
+            existing_node.restore!
+            HTM.logger.info "Restored soft-deleted node #{existing_node.id} for content match"
+          end
+          if existing_node
+            # Link robot to existing node (or update if already linked)
+            robot_node = link_robot_to_node(robot_id: robot_id, node: existing_node)
+            # Update the node's updated_at timestamp
+            existing_node.touch
+            {
+              node_id: existing_node.id,
+              is_new: false,
+              robot_node: robot_node
+            }
+          else
+            # Prepare embedding if provided
+            embedding_str = nil
+            if embedding
+              # Use centralized padding and sanitization
+              padded_embedding = HTM::SqlBuilder.pad_embedding(embedding)
+              embedding_str = HTM::SqlBuilder.sanitize_embedding(padded_embedding)
+            end
+            # Create new node
+            node = HTM::Models::Node.create!(
+              content: content,
+              content_hash: content_hash,
+              token_count: token_count,
+              embedding: embedding_str,
+              metadata: metadata
+            )
+            # Link robot to new node
+            robot_node = link_robot_to_node(robot_id: robot_id, node: node)
+            # Selectively invalidate search-related cache entries only
+            # (preserves unrelated cached data like tag queries)
+            @cache&.invalidate_methods!(:search, :fulltext, :hybrid)
+            {
+              node_id: node.id,
+              is_new: true,
+              robot_node: robot_node
+            }
+          end
+        end
+      end
+      # Link a robot to a node (create or update robot_node record)
+      #
+      # @param robot_id [Integer] Robot ID
+      # @param node [HTM::Models::Node] Node to link
+      # @param working_memory [Boolean] Whether node is in working memory (default: false)
+      # @return [HTM::Models::RobotNode] The robot_node link record
+      #
+      def link_robot_to_node(robot_id:, node:, working_memory: false)
+        robot_node = HTM::Models::RobotNode.find_by(robot_id: robot_id, node_id: node.id)
+        if robot_node
+          # Existing link - record that robot remembered this again
+          robot_node.record_remember!
+          robot_node.update!(working_memory: working_memory) if working_memory
+        else
+          # New link
+          robot_node = HTM::Models::RobotNode.create!(
+            robot_id: robot_id,
+            node_id: node.id,
+            first_remembered_at: Time.current,
+            last_remembered_at: Time.current,
+            remember_count: 1,
+            working_memory: working_memory
+          )
+        end
+        robot_node
+      end
+      # Retrieve a node by ID
+      #
+      # Automatically tracks access by incrementing access_count and updating last_accessed.
+      # Uses a single UPDATE query instead of separate increment! and touch calls.
+      #
+      # @param node_id [Integer] Node database ID
+      # @return [Hash, nil] Node data or nil
+      #
+      def retrieve(node_id)
+        node = HTM::Models::Node.find_by(id: node_id)
+        return nil unless node
+        # Track access in a single UPDATE query (instead of separate increment! and touch)
+        node.update_columns(
+          access_count: node.access_count + 1,
+          last_accessed: Time.current
+        )
+        # Reload to get updated values
+        node.reload.attributes
+      end
+      # Update last_accessed timestamp
+      #
+      # @param node_id [Integer] Node database ID
+      # @return [void]
+      #
+      def update_last_accessed(node_id)
+        node = HTM::Models::Node.find_by(id: node_id)
+        node&.update(last_accessed: Time.current)
+      end
+      # Delete a node
+      #
+      # @param node_id [Integer] Node database ID
+      # @return [void]
+      #
+      def delete(node_id)
+        node = HTM::Models::Node.find_by(id: node_id)
+        node&.destroy
+        # Selectively invalidate search-related cache entries only
+        @cache&.invalidate_methods!(:search, :fulltext, :hybrid)
+      end
+      # Check if a node exists
+      #
+      # Thin delegation to HTM::Models::Node.exists?.
+      # NOTE(review): presumably subject to the model's default scope, so
+      # soft-deleted nodes may be reported as absent - confirm against the Node model.
+      #
+      # @param node_id [Integer] Node database ID
+      # @return [Boolean] True if node exists
+      #
+      def exists?(node_id)
+        HTM::Models::Node.exists?(node_id)
+      end
+      # Mark nodes as evicted from working memory
+      #
+      # Sets working_memory = false on the robot_nodes join table for the specified
+      # robot and node IDs.
+      #
+      # @param robot_id [Integer] Robot ID whose working memory is being evicted
+      # @param node_ids [Array<Integer>] Node IDs to mark as evicted
+      # @return [void]
+      #
+      def mark_evicted(robot_id:, node_ids:)
+        return if node_ids.empty?
+        HTM::Models::RobotNode
+          .where(robot_id: robot_id, node_id: node_ids)
+          .update_all(working_memory: false)
+      end
+      # Track access for multiple nodes (bulk operation)
+      #
+      # Updates access_count and last_accessed for all nodes in the array
+      #
+      # @param node_ids [Array<Integer>] Node IDs that were accessed
+      # @return [void]
+      #
+      def track_access(node_ids)
+        return if node_ids.empty?
+        # Atomic batch update
+        HTM::Models::Node.where(id: node_ids).update_all(
+          "access_count = access_count + 1, last_accessed = NOW()"
+        )
+      end
+    end
+  end
+end