htm 0.0.10 → 0.0.14

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. checksums.yaml +4 -4
  2. data/.dictate.toml +46 -0
  3. data/.envrc +2 -0
  4. data/CHANGELOG.md +86 -3
  5. data/README.md +86 -7
  6. data/Rakefile +14 -2
  7. data/bin/htm_mcp.rb +621 -0
  8. data/config/database.yml +20 -13
  9. data/db/migrate/00010_add_soft_delete_to_associations.rb +29 -0
  10. data/db/migrate/00011_add_performance_indexes.rb +21 -0
  11. data/db/migrate/00012_add_tags_trigram_index.rb +18 -0
  12. data/db/migrate/00013_enable_lz4_compression.rb +43 -0
  13. data/db/schema.sql +49 -92
  14. data/docs/api/index.md +1 -1
  15. data/docs/api/yard/HTM.md +2 -4
  16. data/docs/architecture/index.md +1 -1
  17. data/docs/development/index.md +1 -1
  18. data/docs/getting-started/index.md +1 -1
  19. data/docs/guides/index.md +1 -1
  20. data/docs/images/telemetry-architecture.svg +153 -0
  21. data/docs/telemetry.md +391 -0
  22. data/examples/README.md +171 -1
  23. data/examples/cli_app/README.md +1 -1
  24. data/examples/cli_app/htm_cli.rb +1 -1
  25. data/examples/mcp_client.rb +529 -0
  26. data/examples/sinatra_app/app.rb +1 -1
  27. data/examples/telemetry/README.md +147 -0
  28. data/examples/telemetry/SETUP_README.md +169 -0
  29. data/examples/telemetry/demo.rb +498 -0
  30. data/examples/telemetry/grafana/dashboards/htm-metrics.json +457 -0
  31. data/lib/htm/configuration.rb +261 -70
  32. data/lib/htm/database.rb +46 -22
  33. data/lib/htm/embedding_service.rb +24 -14
  34. data/lib/htm/errors.rb +15 -1
  35. data/lib/htm/jobs/generate_embedding_job.rb +19 -0
  36. data/lib/htm/jobs/generate_propositions_job.rb +103 -0
  37. data/lib/htm/jobs/generate_tags_job.rb +24 -0
  38. data/lib/htm/loaders/markdown_chunker.rb +79 -0
  39. data/lib/htm/loaders/markdown_loader.rb +41 -15
  40. data/lib/htm/long_term_memory/fulltext_search.rb +138 -0
  41. data/lib/htm/long_term_memory/hybrid_search.rb +324 -0
  42. data/lib/htm/long_term_memory/node_operations.rb +209 -0
  43. data/lib/htm/long_term_memory/relevance_scorer.rb +355 -0
  44. data/lib/htm/long_term_memory/robot_operations.rb +34 -0
  45. data/lib/htm/long_term_memory/tag_operations.rb +428 -0
  46. data/lib/htm/long_term_memory/vector_search.rb +109 -0
  47. data/lib/htm/long_term_memory.rb +51 -1153
  48. data/lib/htm/models/node.rb +35 -2
  49. data/lib/htm/models/node_tag.rb +31 -0
  50. data/lib/htm/models/robot_node.rb +31 -0
  51. data/lib/htm/models/tag.rb +44 -0
  52. data/lib/htm/proposition_service.rb +169 -0
  53. data/lib/htm/query_cache.rb +214 -0
  54. data/lib/htm/sql_builder.rb +178 -0
  55. data/lib/htm/tag_service.rb +16 -6
  56. data/lib/htm/tasks.rb +8 -2
  57. data/lib/htm/telemetry.rb +224 -0
  58. data/lib/htm/version.rb +1 -1
  59. data/lib/htm.rb +64 -3
  60. data/lib/tasks/doc.rake +1 -1
  61. data/lib/tasks/htm.rake +259 -13
  62. data/mkdocs.yml +96 -96
  63. metadata +75 -18
  64. data/.aigcm_msg +0 -1
  65. data/.claude/settings.local.json +0 -92
  66. data/CLAUDE.md +0 -603
  67. data/examples/cli_app/temp.log +0 -93
  68. data/lib/htm/loaders/paragraph_chunker.rb +0 -112
  69. data/notes/ARCHITECTURE_REVIEW.md +0 -1167
  70. data/notes/IMPLEMENTATION_SUMMARY.md +0 -606
  71. data/notes/MULTI_FRAMEWORK_IMPLEMENTATION.md +0 -451
  72. data/notes/next_steps.md +0 -100
  73. data/notes/plan.md +0 -627
  74. data/notes/tag_ontology_enhancement_ideas.md +0 -222
  75. data/notes/timescaledb_removal_summary.md +0 -200
data/lib/htm/long_term_memory/relevance_scorer.rb
@@ -0,0 +1,355 @@
+# frozen_string_literal: true
+
+class HTM
+  class LongTermMemory
+    # Relevance scoring for search results
+    #
+    # Combines multiple signals to calculate dynamic relevance:
+    # - Vector similarity (semantic match) - config.relevance_semantic_weight (default: 0.5)
+    # - Tag overlap (categorical match) - config.relevance_tag_weight (default: 0.3)
+    # - Recency (freshness) - config.relevance_recency_weight (default: 0.1)
+    # - Access frequency (popularity/utility) - config.relevance_access_weight (default: 0.1)
+    #
+    # Recency decay uses configurable half-life: config.relevance_recency_half_life_hours (default: 168 = 1 week)
+    #
+    # Also provides tag similarity calculations using hierarchical Jaccard.
+    #
+    module RelevanceScorer
+      # Default score when signal is unavailable
+      DEFAULT_NEUTRAL_SCORE = 0.5
+
+      # Access frequency normalization
+      ACCESS_SCORE_NORMALIZER = 10.0
+
+      # Final score scaling
+      RELEVANCE_SCALE = 10.0
+      RELEVANCE_MIN = 0.0
+      RELEVANCE_MAX = 10.0
+
+      # Configurable scoring weights (via HTM.configuration)
+      def weight_semantic
+        HTM.configuration.relevance_semantic_weight
+      end
+
+      def weight_tag
+        HTM.configuration.relevance_tag_weight
+      end
+
+      def weight_recency
+        HTM.configuration.relevance_recency_weight
+      end
+
+      def weight_access
+        HTM.configuration.relevance_access_weight
+      end
+
+      def recency_half_life_hours
+        HTM.configuration.relevance_recency_half_life_hours
+      end
+
+      # Calculate dynamic relevance score for a node given query context
+      #
+      # @param node [Hash] Node data with similarity, tags, created_at, access_count
+      # @param query_tags [Array<String>] Tags associated with the query
+      # @param vector_similarity [Float, nil] Pre-computed vector similarity (0-1)
+      # @param node_tags [Array<String>, nil] Pre-loaded tags for this node (avoids N+1 query)
+      # @return [Float] Composite relevance score (RELEVANCE_MIN to RELEVANCE_MAX)
+      #
+      def calculate_relevance(node:, query_tags: [], vector_similarity: nil, node_tags: nil)
+        # 1. Vector similarity (semantic match)
+        semantic_score = if vector_similarity
+                           vector_similarity
+                         elsif node['similarity']
+                           node['similarity'].to_f
+                         else
+                           DEFAULT_NEUTRAL_SCORE # Neutral if no embedding
+                         end
+
+        # 2. Tag overlap (categorical relevance)
+        # Use pre-loaded tags if provided, otherwise fetch (for backward compatibility)
+        node_tags ||= get_node_tags(node['id'])
+        tag_score = if query_tags.any? && node_tags.any?
+                      weighted_hierarchical_jaccard(query_tags, node_tags)
+                    else
+                      DEFAULT_NEUTRAL_SCORE # Neutral if no tags
+                    end
+
+        # 3. Recency (temporal relevance) - exponential decay with half-life
+        age_hours = (Time.now - Time.parse(node['created_at'].to_s)) / 3600.0
+        recency_score = Math.exp(-age_hours / recency_half_life_hours)
+
+        # 4. Access frequency (behavioral signal) - log-normalized
+        access_count = node['access_count'] || 0
+        access_score = Math.log(1 + access_count) / ACCESS_SCORE_NORMALIZER
+
+        # Weighted composite with final scaling
+        relevance = (
+          (semantic_score * weight_semantic) +
+          (tag_score * weight_tag) +
+          (recency_score * weight_recency) +
+          (access_score * weight_access)
+        ) * RELEVANCE_SCALE
+
+        relevance.clamp(RELEVANCE_MIN, RELEVANCE_MAX)
+      end
+
+      # Search with dynamic relevance scoring
+      #
+      # Returns nodes with calculated relevance scores based on query context
+      #
+      # @param timeframe [nil, Range, Array<Range>] Time range(s) to search (nil = no filter)
+      # @param query [String, nil] Search query
+      # @param query_tags [Array<String>] Tags to match
+      # @param limit [Integer] Maximum results
+      # @param embedding_service [Object, nil] Service to generate embeddings
+      # @param metadata [Hash] Filter by metadata fields (default: {})
+      # @return [Array<Hash>] Nodes with relevance scores
+      #
+      def search_with_relevance(timeframe:, query: nil, query_tags: [], limit: 20, embedding_service: nil, metadata: {})
+        # Get candidates from appropriate search method
+        candidates = if query && embedding_service
+                       # Vector search (returns hashes directly)
+                       search_uncached(timeframe: timeframe, query: query, limit: limit * 2, embedding_service: embedding_service, metadata: metadata)
+                     elsif query
+                       # Full-text search (returns hashes directly)
+                       search_fulltext_uncached(timeframe: timeframe, query: query, limit: limit * 2, metadata: metadata)
+                     else
+                       # Time-range only - use raw SQL to avoid ActiveRecord object instantiation
+                       # This is more efficient than .map(&:attributes) which creates intermediate objects
+                       fetch_candidates_by_timeframe(timeframe: timeframe, metadata: metadata, limit: limit * 2)
+                     end
+
+        # Batch load all tags for candidates (fixes N+1 query)
+        node_ids = candidates.map { |n| n['id'] }
+        tags_by_node = batch_load_node_tags(node_ids)
+
+        # Calculate relevance for each candidate, building final hash in-place
+        scored_nodes = candidates.map do |node|
+          node_tags = tags_by_node[node['id']] || []
+
+          relevance = calculate_relevance(
+            node: node,
+            query_tags: query_tags,
+            vector_similarity: node['similarity']&.to_f,
+            node_tags: node_tags
+          )
+
+          # Modify in-place to avoid creating new Hash
+          node['relevance'] = relevance
+          node['tags'] = node_tags
+          node
+        end
+
+        # Sort by relevance and return top K
+        scored_nodes
+          .sort_by { |n| -n['relevance'] }
+          .take(limit)
+      end
+
+      # Fetch candidates by timeframe using raw SQL (avoids ActiveRecord overhead)
+      #
+      # @param timeframe [nil, Range, Array<Range>] Time range(s) to search
+      # @param metadata [Hash] Filter by metadata fields
+      # @param limit [Integer] Maximum results
+      # @return [Array<Hash>] Candidate nodes as hashes
+      #
+      def fetch_candidates_by_timeframe(timeframe:, metadata:, limit:)
+        timeframe_condition = HTM::SqlBuilder.timeframe_condition(timeframe)
+        metadata_condition = HTM::SqlBuilder.metadata_condition(metadata)
+
+        conditions = ['deleted_at IS NULL']
+        conditions << timeframe_condition if timeframe_condition
+        conditions << metadata_condition if metadata_condition
+
+        sql = <<~SQL
+          SELECT id, content, access_count, created_at, token_count
+          FROM nodes
+          WHERE #{conditions.join(' AND ')}
+          ORDER BY created_at DESC
+          LIMIT ?
+        SQL
+
+        result = ActiveRecord::Base.connection.select_all(
+          ActiveRecord::Base.sanitize_sql_array([sql, limit])
+        )
+        result.to_a
+      end
+
+      # Search nodes by tags
+      #
+      # @param tags [Array<String>] Tags to search for
+      # @param match_all [Boolean] If true, match ALL tags; if false, match ANY tag
+      # @param timeframe [Range, nil] Optional time range filter
+      # @param limit [Integer] Maximum results
+      # @return [Array<Hash>] Matching nodes with relevance scores
+      #
+      def search_by_tags(tags:, match_all: false, timeframe: nil, limit: 20)
+        return [] if tags.empty?
+
+        # Build base query with specific columns to avoid loading unnecessary data
+        query = HTM::Models::Node
+          .select('nodes.id, nodes.content, nodes.access_count, nodes.created_at, nodes.token_count')
+          .joins(:tags)
+          .where(tags: { name: tags })
+          .distinct
+
+        # Apply timeframe filter if provided
+        query = query.where(created_at: timeframe) if timeframe
+
+        if match_all
+          # Match ALL tags (intersection)
+          query = query
+            .group('nodes.id')
+            .having('COUNT(DISTINCT tags.name) = ?', tags.size)
+        end
+
+        # Convert to hashes efficiently using pluck-style approach
+        # This avoids instantiating full ActiveRecord objects
+        nodes = query.limit(limit).map do |node|
+          {
+            'id' => node.id,
+            'content' => node.content,
+            'access_count' => node.access_count,
+            'created_at' => node.created_at,
+            'token_count' => node.token_count
+          }
+        end
+
+        # Batch load all tags for nodes (fixes N+1 query)
+        node_ids = nodes.map { |n| n['id'] }
+        tags_by_node = batch_load_node_tags(node_ids)
+
+        # Calculate relevance and enrich with tags (modify in-place)
+        nodes.map do |node|
+          node_tags = tags_by_node[node['id']] || []
+          relevance = calculate_relevance(
+            node: node,
+            query_tags: tags,
+            node_tags: node_tags
+          )
+
+          node['relevance'] = relevance
+          node['tags'] = node_tags
+          node
+        end.sort_by { |n| -n['relevance'] }
+      end
+
+      private
+
+      # Calculate Jaccard similarity between two sets
+      #
+      # @param set_a [Array] First set
+      # @param set_b [Array] Second set
+      # @return [Float] Jaccard similarity (0.0-1.0)
+      #
+      def jaccard_similarity(set_a, set_b)
+        return 0.0 if set_a.empty? && set_b.empty?
+        return 0.0 if set_a.empty? || set_b.empty?
+
+        intersection = (set_a & set_b).size
+        union = (set_a | set_b).size
+
+        intersection.to_f / union
+      end
+
+      # Calculate weighted hierarchical Jaccard similarity
+      #
+      # Compares hierarchical tags accounting for partial matches at different levels.
+      # Optimized to pre-compute tag hierarchies and use early termination.
+      #
+      # Performance: O(n*m) where n,m are tag counts, but with:
+      # - Pre-computed splits to avoid repeated String#split
+      # - Early termination when root categories don't match
+      # - Set-based exact match fast path
+      #
+      # @param set_a [Array<String>] First set of hierarchical tags
+      # @param set_b [Array<String>] Second set of hierarchical tags
+      # @return [Float] Weighted similarity (0.0-1.0)
+      #
+      def weighted_hierarchical_jaccard(set_a, set_b)
+        return 0.0 if set_a.empty? || set_b.empty?
+
+        # Fast path: check for exact matches first
+        exact_matches = (set_a & set_b).size
+        return 1.0 if exact_matches == set_a.size && exact_matches == set_b.size
+
+        # Pre-compute tag hierarchies to avoid repeated String#split
+        hierarchies_a = set_a.map { |tag| tag.split(':') }
+        hierarchies_b = set_b.map { |tag| tag.split(':') }
+
+        # Build root category index for early termination optimization
+        # Group tags by their root category for faster matching
+        roots_b = hierarchies_b.group_by(&:first)
+
+        total_weighted_similarity = 0.0
+        total_weights = 0.0
+
+        hierarchies_a.each do |parts_a|
+          root_a = parts_a.first
+
+          # Only compare with tags that share the same root category
+          matching_hierarchies = roots_b[root_a] || []
+
+          # Also include all hierarchies if no root match (for cross-category comparison)
+          candidates = matching_hierarchies.empty? ? hierarchies_b : matching_hierarchies
+
+          candidates.each do |parts_b|
+            similarity, weight = calculate_hierarchical_similarity_cached(parts_a, parts_b)
+            total_weighted_similarity += similarity * weight
+            total_weights += weight
+          end
+
+          # Add zero-similarity weight for non-matching root categories
+          (hierarchies_b.size - candidates.size).times do
+            # Non-matching roots contribute weight but zero similarity
+            total_weights += 0.5 # Average weight for non-matches
+          end
+        end
+
+        total_weights > 0 ? total_weighted_similarity / total_weights : 0.0
+      end

+      # Calculate similarity between two pre-split hierarchical tags
+      #
+      # Optimized version that takes pre-split arrays to avoid redundant splits.
+      #
+      # @param parts_a [Array<String>] First tag hierarchy (pre-split)
+      # @param parts_b [Array<String>] Second tag hierarchy (pre-split)
+      # @return [Array<Float, Float>] [similarity, weight] both in range 0.0-1.0
+      #
+      def calculate_hierarchical_similarity_cached(parts_a, parts_b)
+        # Calculate overlap at each level
+        max_depth = [parts_a.length, parts_b.length].max
+        min_depth = [parts_a.length, parts_b.length].min
+
+        # Count common levels from root
+        common_levels = 0
+        min_depth.times do |i|
+          break unless parts_a[i] == parts_b[i]
+          common_levels += 1
+        end
+
+        # Weight based on hierarchy depth (deeper = less weight)
+        depth_weight = 1.0 / max_depth
+
+        # Normalized similarity
+        similarity = common_levels.to_f / max_depth
+
+        [similarity, depth_weight]
+      end
+
+      # Calculate similarity between two hierarchical tags (string version)
+      #
+      # Compares tags level by level, returning both similarity and a weight
+      # based on hierarchy depth (higher levels = more weight).
+      #
+      # @param tag_a [String] First tag (e.g., "database:postgresql:extensions")
+      # @param tag_b [String] Second tag (e.g., "database:postgresql:queries")
+      # @return [Array<Float, Float>] [similarity, weight] both in range 0.0-1.0
+      #
+      def calculate_hierarchical_similarity(tag_a, tag_b)
+        calculate_hierarchical_similarity_cached(tag_a.split(':'), tag_b.split(':'))
+      end
+    end
+  end
+end
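
The composite score above is simply a weighted sum scaled to the 0-10 range. The standalone sketch below recomputes it with the documented default weights (0.5 semantic, 0.3 tag, 0.1 recency, 0.1 access) and the 168-hour half-life; the input values are illustrative and the snippet is not part of the gem.

# Illustrative recomputation of RelevanceScorer#calculate_relevance using the
# documented defaults. The input values below are made up for the example.
WEIGHTS = { semantic: 0.5, tag: 0.3, recency: 0.1, access: 0.1 }.freeze
HALF_LIFE_HOURS   = 168.0   # relevance_recency_half_life_hours default (1 week)
ACCESS_NORMALIZER = 10.0    # ACCESS_SCORE_NORMALIZER
SCALE             = 10.0    # RELEVANCE_SCALE

semantic_score = 0.82                                # vector similarity from the vector search
tag_score      = 2.0 / 3                             # hierarchical Jaccard of query vs. node tags
recency_score  = Math.exp(-24.0 / HALF_LIFE_HOURS)   # node created 24 hours ago, ~0.87
access_score   = Math.log(1 + 5) / ACCESS_NORMALIZER # node accessed 5 times, ~0.18

relevance = (semantic_score * WEIGHTS[:semantic] +
             tag_score      * WEIGHTS[:tag] +
             recency_score  * WEIGHTS[:recency] +
             access_score   * WEIGHTS[:access]) * SCALE

puts relevance.clamp(0.0, 10.0).round(2)  # => 7.15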
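
For the hierarchical tag similarity component, here is a quick worked pair using the tag strings from the doc comments above; the arithmetic mirrors calculate_hierarchical_similarity_cached.

# Worked example of the per-pair hierarchical similarity for one tag pair.
parts_a = 'database:postgresql:extensions'.split(':')
parts_b = 'database:postgresql:queries'.split(':')

max_depth     = [parts_a.length, parts_b.length].max                    # 3
common_levels = parts_a.zip(parts_b).take_while { |a, b| a == b }.size  # 2 ("database", "postgresql")

similarity   = common_levels.to_f / max_depth  # ~0.667
depth_weight = 1.0 / max_depth                 # ~0.333
# weighted_hierarchical_jaccard aggregates these pairwise values across the
# two tag sets: score = sum(similarity * weight) / sum(weight).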
data/lib/htm/long_term_memory/robot_operations.rb
@@ -0,0 +1,34 @@
+# frozen_string_literal: true
+
+class HTM
+  class LongTermMemory
+    # Robot registration and activity tracking
+    #
+    # Handles robot lifecycle management including:
+    # - Registration (find or create)
+    # - Activity timestamp updates
+    #
+    module RobotOperations
+      # Register a robot
+      #
+      # @param robot_name [String] Robot name
+      # @return [Integer] Robot ID
+      #
+      def register_robot(robot_name)
+        robot = HTM::Models::Robot.find_or_create_by(name: robot_name)
+        robot.update(last_active: Time.current)
+        robot.id
+      end
+
+      # Update robot activity timestamp
+      #
+      # @param robot_id [Integer] Robot identifier
+      # @return [void]
+      #
+      def update_robot_activity(robot_id)
+        robot = HTM::Models::Robot.find_by(id: robot_id)
+        robot&.update(last_active: Time.current)
+      end
+    end
+  end
+end
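
A minimal usage sketch for the module above, assuming it is mixed into HTM::LongTermMemory as the nesting suggests; the construction line and the robot name are illustrative, not the gem's documented setup.

# Hypothetical caller: RobotOperations methods become instance methods of the
# including class. The LongTermMemory construction here is an assumption.
ltm = HTM::LongTermMemory.new

robot_id = ltm.register_robot('doc-bot')  # idempotent: find_or_create_by(name:), then bump last_active
ltm.update_robot_activity(robot_id)       # safe with a stale id: find_by returns nil, &.update is skipped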